mindstudio-probe 8.3.2__py3-none-any.whl → 26.0.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (689) hide show
  1. {mindstudio_probe-8.3.2.dist-info → mindstudio_probe-26.0.0a1.dist-info}/METADATA +26 -14
  2. mindstudio_probe-26.0.0a1.dist-info/RECORD +498 -0
  3. {mindstudio_probe-8.3.2.dist-info → mindstudio_probe-26.0.0a1.dist-info}/WHEEL +1 -1
  4. mindstudio_probe-26.0.0a1.dist-info/entry_points.txt +5 -0
  5. mindstudio_probe-26.0.0a1.dist-info/licenses/LICENSE +124 -0
  6. mindstudio_probe-26.0.0a1.dist-info/top_level.txt +2 -0
  7. msprobe/__init__.py +12 -13
  8. msprobe/config.json +9 -31
  9. msprobe/core/__init__.py +12 -11
  10. msprobe/core/acc_check/acc_check_cli.py +145 -0
  11. msprobe/core/common/const.py +97 -38
  12. msprobe/core/common/db_manager.py +133 -12
  13. msprobe/core/common/decorator.py +12 -11
  14. msprobe/core/common/exceptions.py +12 -11
  15. msprobe/core/common/file_utils.py +101 -25
  16. msprobe/core/common/framework_adapter.py +36 -25
  17. msprobe/core/common/global_lock.py +12 -11
  18. msprobe/core/common/inplace_op_checker.py +12 -11
  19. msprobe/core/common/log.py +22 -11
  20. msprobe/core/common/megatron_utils.py +566 -11
  21. msprobe/core/common/parallel_state.py +12 -11
  22. msprobe/core/common/runtime.py +12 -11
  23. msprobe/core/common/utils.py +41 -41
  24. msprobe/core/compare/acc_compare.py +361 -104
  25. msprobe/core/compare/atb_data_compare.py +422 -0
  26. msprobe/core/compare/auto_compare.py +134 -0
  27. msprobe/core/compare/check.py +14 -17
  28. msprobe/core/compare/compare_cli.py +72 -149
  29. msprobe/core/compare/config.py +12 -13
  30. msprobe/core/compare/diff_analyze/first_diff_analyze.py +28 -15
  31. msprobe/core/compare/diff_analyze/ignore_op_list.yaml +3 -0
  32. msprobe/core/compare/find_first/analyzer.py +18 -18
  33. msprobe/core/compare/find_first/graph.py +12 -11
  34. msprobe/core/compare/find_first/utils.py +13 -12
  35. msprobe/core/compare/indicator_analysis/__init__.py +15 -0
  36. msprobe/core/compare/indicator_analysis/algorithm.py +363 -0
  37. msprobe/core/compare/indicator_analysis/api_data.py +141 -0
  38. msprobe/core/compare/indicator_analysis/calculator.py +181 -0
  39. msprobe/core/compare/indicator_analysis/utils.py +116 -0
  40. msprobe/core/compare/layer_mapping/__init__.py +12 -11
  41. msprobe/core/compare/layer_mapping/data_scope_parser.py +20 -11
  42. msprobe/core/compare/layer_mapping/layer_mapping.py +14 -13
  43. msprobe/core/compare/layer_mapping/postprocess_pass.py +13 -11
  44. msprobe/core/compare/merge_result/merge_result.py +12 -11
  45. msprobe/core/compare/merge_result/merge_result_cli.py +12 -11
  46. msprobe/core/compare/merge_result/utils.py +12 -11
  47. msprobe/core/compare/multiprocessing_compute.py +13 -14
  48. msprobe/core/compare/npy_compare.py +13 -11
  49. msprobe/core/compare/offline_data_compare.py +160 -0
  50. msprobe/core/compare/stats_diff_calc.py +39 -0
  51. msprobe/core/compare/torchair_acc_cmp.py +764 -0
  52. msprobe/core/compare/torchair_cmp_utils.py +338 -0
  53. msprobe/core/compare/utils.py +140 -49
  54. msprobe/core/config_check/__init__.py +12 -11
  55. msprobe/core/config_check/checkers/__init__.py +12 -11
  56. msprobe/core/config_check/checkers/base_checker.py +15 -14
  57. msprobe/core/config_check/checkers/dataset_checker.py +13 -12
  58. msprobe/core/config_check/checkers/env_args_checker.py +13 -12
  59. msprobe/core/config_check/checkers/hyperparameter_checker.py +16 -15
  60. msprobe/core/config_check/checkers/pip_checker.py +15 -15
  61. msprobe/core/config_check/checkers/random_checker.py +13 -12
  62. msprobe/core/config_check/checkers/weights_checker.py +14 -12
  63. msprobe/core/config_check/ckpt_compare/ckpt_comparator.py +13 -17
  64. msprobe/core/config_check/ckpt_compare/megatron_loader.py +13 -12
  65. msprobe/core/config_check/ckpt_compare/metrics.py +12 -11
  66. msprobe/core/config_check/config_check_cli.py +18 -17
  67. msprobe/core/config_check/config_checker.py +16 -14
  68. msprobe/core/config_check/resource/dependency.yaml +15 -12
  69. msprobe/core/config_check/resource/env.yaml +12 -11
  70. msprobe/core/config_check/utils/hyperparameter_parser.py +12 -11
  71. msprobe/core/config_check/utils/utils.py +12 -11
  72. msprobe/core/{data_dump → dump/api_dump}/api_registry.py +12 -11
  73. msprobe/core/{common_config.py → dump/common_config.py} +13 -24
  74. msprobe/core/dump/data_dump/data_collector.py +257 -0
  75. msprobe/core/{data_dump → dump/data_dump}/data_processor/base.py +45 -36
  76. msprobe/core/{data_dump → dump/data_dump}/data_processor/factory.py +33 -25
  77. msprobe/core/{data_dump → dump/data_dump}/data_processor/mindspore_processor.py +37 -113
  78. msprobe/core/{data_dump → dump/data_dump}/data_processor/pytorch_processor.py +364 -131
  79. msprobe/core/{data_dump → dump/data_dump}/json_writer.py +24 -31
  80. msprobe/core/{data_dump → dump/data_dump}/scope.py +12 -13
  81. msprobe/core/{debugger → dump/debugger}/precision_debugger.py +15 -23
  82. msprobe/core/dump/dump2db/db_utils.py +215 -0
  83. msprobe/core/dump/dump2db/dump2db.py +409 -0
  84. msprobe/core/{hook_manager.py → dump/hook_manager.py} +38 -87
  85. msprobe/core/dump/kernel_dump/kernel_config.py +34 -0
  86. msprobe/core/{service.py → dump/service.py} +43 -27
  87. msprobe/core/install_deps/install_deps.py +51 -0
  88. msprobe/core/monitor/anomaly_processor.py +13 -11
  89. msprobe/core/monitor/csv2db.py +73 -93
  90. msprobe/core/monitor/db_utils.py +140 -205
  91. msprobe/core/monitor/utils.py +18 -17
  92. msprobe/core/monitor_v2/__init__.py +20 -0
  93. msprobe/core/monitor_v2/base.py +83 -0
  94. msprobe/core/monitor_v2/cc.py +287 -0
  95. msprobe/core/monitor_v2/factory.py +81 -0
  96. msprobe/core/monitor_v2/module.py +201 -0
  97. msprobe/core/monitor_v2/optimizer.py +245 -0
  98. msprobe/core/monitor_v2/param.py +154 -0
  99. msprobe/core/monitor_v2/trainer.py +326 -0
  100. msprobe/core/monitor_v2/utils.py +122 -0
  101. msprobe/core/monitor_v2/weight_grad.py +419 -0
  102. msprobe/core/monitor_v2/writer.py +162 -0
  103. msprobe/core/overflow_check/abnormal_scene.py +12 -11
  104. msprobe/core/overflow_check/api_info.py +12 -11
  105. msprobe/core/overflow_check/checker.py +12 -11
  106. msprobe/core/overflow_check/filter.py +13 -11
  107. msprobe/core/overflow_check/level.py +12 -11
  108. msprobe/core/overflow_check/utils.py +12 -11
  109. msprobe/core/single_save/single_comparator.py +12 -11
  110. msprobe/core/single_save/single_saver.py +12 -11
  111. msprobe/infer/__init__.py +16 -0
  112. msprobe/infer/offline/__init__.py +16 -0
  113. msprobe/infer/offline/compare/__init__.py +16 -0
  114. msprobe/infer/offline/compare/msquickcmp/__init__.py +16 -0
  115. msprobe/infer/offline/compare/msquickcmp/adapter_cli/__init__.py +16 -0
  116. msprobe/infer/offline/compare/msquickcmp/adapter_cli/args_adapter.py +46 -0
  117. msprobe/infer/offline/compare/msquickcmp/atc/__init__.py +16 -0
  118. msprobe/infer/offline/compare/msquickcmp/atc/atc_utils.py +98 -0
  119. msprobe/infer/offline/compare/msquickcmp/cmp_process.py +328 -0
  120. msprobe/infer/offline/compare/msquickcmp/common/__init__.py +16 -0
  121. msprobe/infer/offline/compare/msquickcmp/common/args_check.py +112 -0
  122. msprobe/infer/offline/compare/msquickcmp/common/convert.py +74 -0
  123. msprobe/infer/offline/compare/msquickcmp/common/dump_data.py +121 -0
  124. msprobe/infer/offline/compare/msquickcmp/common/dynamic_argument_bean.py +39 -0
  125. msprobe/infer/offline/compare/msquickcmp/common/utils.py +669 -0
  126. msprobe/infer/offline/compare/msquickcmp/config.ini +6 -0
  127. msprobe/infer/offline/compare/msquickcmp/dump/__init__.py +16 -0
  128. msprobe/infer/offline/compare/msquickcmp/dump/args_adapter.py +50 -0
  129. msprobe/infer/offline/compare/msquickcmp/dump/dump_process.py +91 -0
  130. msprobe/infer/offline/compare/msquickcmp/install_aclruntime_aisbench.sh +180 -0
  131. msprobe/infer/offline/compare/msquickcmp/main.py +199 -0
  132. msprobe/infer/offline/compare/msquickcmp/net_compare/__init__.py +16 -0
  133. msprobe/infer/offline/compare/msquickcmp/net_compare/net_compare.py +277 -0
  134. msprobe/infer/offline/compare/msquickcmp/npu/__init__.py +16 -0
  135. msprobe/infer/offline/compare/msquickcmp/npu/npu_dump_data.py +558 -0
  136. msprobe/infer/offline/compare/msquickcmp/npu/om_parser.py +416 -0
  137. msprobe/infer/offline/compare/msquickcmp/onnx_model/__init__.py +16 -0
  138. msprobe/infer/offline/compare/msquickcmp/onnx_model/onnx_dump_data.py +374 -0
  139. msprobe/infer/utils/__init__.py +15 -0
  140. msprobe/infer/utils/acc_cmp.py +94 -0
  141. msprobe/infer/utils/check/__init__.py +37 -0
  142. msprobe/infer/utils/check/args_checker.py +35 -0
  143. msprobe/infer/utils/check/checker.py +227 -0
  144. msprobe/infer/utils/check/dict_checker.py +78 -0
  145. msprobe/infer/utils/check/func_wrapper.py +96 -0
  146. msprobe/infer/utils/check/list_checker.py +56 -0
  147. msprobe/infer/utils/check/number_checker.py +64 -0
  148. msprobe/infer/utils/check/obj_checker.py +41 -0
  149. msprobe/infer/utils/check/path_checker.py +249 -0
  150. msprobe/infer/utils/check/rule.py +126 -0
  151. msprobe/infer/utils/check/string_checker.py +66 -0
  152. msprobe/infer/utils/cmp_algorithm.py +261 -0
  153. msprobe/infer/utils/constants.py +112 -0
  154. msprobe/infer/utils/file_open_check.py +337 -0
  155. msprobe/infer/utils/util.py +177 -0
  156. msprobe/mindspore/__init__.py +14 -13
  157. msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +14 -13
  158. msprobe/mindspore/api_accuracy_checker/api_info.py +12 -11
  159. msprobe/mindspore/api_accuracy_checker/api_runner.py +12 -11
  160. msprobe/mindspore/api_accuracy_checker/base_compare_algorithm.py +12 -11
  161. msprobe/mindspore/api_accuracy_checker/bench_functions/flash_attention_score.py +12 -11
  162. msprobe/mindspore/api_accuracy_checker/bench_functions/fusion_operator.py +12 -11
  163. msprobe/mindspore/api_accuracy_checker/checker_support_api.yaml +12 -11
  164. msprobe/mindspore/api_accuracy_checker/cmd_parser.py +15 -14
  165. msprobe/mindspore/api_accuracy_checker/compute_element.py +12 -11
  166. msprobe/mindspore/api_accuracy_checker/data_manager.py +13 -11
  167. msprobe/mindspore/api_accuracy_checker/main.py +12 -11
  168. msprobe/mindspore/api_accuracy_checker/multi_api_accuracy_checker.py +14 -12
  169. msprobe/mindspore/api_accuracy_checker/multi_data_manager.py +13 -11
  170. msprobe/mindspore/api_accuracy_checker/torch_mindtorch_importer.py +12 -11
  171. msprobe/mindspore/api_accuracy_checker/type_mapping.py +12 -11
  172. msprobe/mindspore/api_accuracy_checker/utils.py +12 -11
  173. msprobe/mindspore/common/const.py +15 -74
  174. msprobe/mindspore/common/log.py +12 -11
  175. msprobe/mindspore/common/utils.py +30 -15
  176. msprobe/mindspore/compare/common_dir_compare.py +21 -23
  177. msprobe/mindspore/compare/distributed_compare.py +18 -16
  178. msprobe/mindspore/compare/ms_compare.py +14 -14
  179. msprobe/mindspore/compare/ms_graph_compare.py +26 -20
  180. msprobe/mindspore/compare/utils.py +14 -12
  181. msprobe/mindspore/{cell_processor.py → dump/cell_processor.py} +15 -14
  182. msprobe/mindspore/{debugger → dump/debugger}/debugger_config.py +12 -30
  183. msprobe/mindspore/{debugger → dump/debugger}/precision_debugger.py +43 -45
  184. msprobe/mindspore/dump/{cell_dump_process.py → dump_processor/cell_dump_process.py} +31 -17
  185. msprobe/mindspore/dump/{cell_dump_with_insert_gradient.py → dump_processor/cell_dump_with_insert_gradient.py} +18 -14
  186. msprobe/mindspore/dump/{dump_tool_factory.py → dump_processor/dump_tool_factory.py} +16 -15
  187. msprobe/mindspore/dump/{graph_mode_cell_dump.py → dump_processor/graph_mode_cell_dump.py} +16 -15
  188. msprobe/mindspore/dump/{graph_tensor_dump.py → dump_processor/graph_tensor_dump.py} +134 -133
  189. msprobe/mindspore/dump/{hook_cell → dump_processor/hook_cell}/api_register.py +15 -14
  190. msprobe/mindspore/dump/{hook_cell → dump_processor/hook_cell}/hook_cell.py +12 -11
  191. msprobe/mindspore/dump/{hook_cell → dump_processor/hook_cell}/ms_hook_manager.py +47 -20
  192. msprobe/mindspore/dump/{hook_cell → dump_processor/hook_cell}/primitive_hooks.py +14 -13
  193. msprobe/mindspore/dump/{hook_cell → dump_processor/hook_cell}/support_wrap_ops.yaml +13 -11
  194. msprobe/mindspore/dump/{jit_dump.py → dump_processor/jit_dump.py} +14 -13
  195. msprobe/mindspore/dump/{kernel_graph_dump.py → dump_processor/kernel_graph_dump.py} +13 -12
  196. msprobe/mindspore/dump/{kernel_kbyk_dump.py → dump_processor/kernel_kbyk_dump.py} +13 -12
  197. msprobe/mindspore/{exception_dump → dump/exception_dump}/exception_dump_tool_factory.py +14 -13
  198. msprobe/mindspore/{exception_dump → dump/exception_dump}/kernel_graph_exception_dump.py +13 -12
  199. msprobe/mindspore/{mindspore_service.py → dump/mindspore_service.py} +18 -17
  200. msprobe/mindspore/dump/mindtorch/__init__.py +19 -0
  201. msprobe/mindspore/dump/ms_config.py +105 -0
  202. msprobe/mindspore/{overflow_check → dump/overflow_check}/kernel_graph_overflow_check.py +13 -12
  203. msprobe/mindspore/{overflow_check → dump/overflow_check}/overflow_check_tool_factory.py +14 -13
  204. msprobe/mindspore/dump/task_handler_factory.py +43 -0
  205. msprobe/mindspore/monitor/common_func.py +12 -11
  206. msprobe/mindspore/monitor/data_writers.py +12 -11
  207. msprobe/mindspore/monitor/distributed/wrap_distributed.py +93 -39
  208. msprobe/mindspore/monitor/features.py +12 -11
  209. msprobe/mindspore/monitor/module_hook.py +19 -22
  210. msprobe/mindspore/monitor/optimizer_collect.py +29 -25
  211. msprobe/mindspore/monitor/utils.py +13 -11
  212. msprobe/msaccucmp/advisor/__init__.py +16 -0
  213. msprobe/msaccucmp/advisor/advisor_const.py +65 -0
  214. msprobe/msaccucmp/advisor/advisor_result.py +73 -0
  215. msprobe/msaccucmp/advisor/compare_advisor.py +99 -0
  216. msprobe/msaccucmp/advisor/input_advisor.py +66 -0
  217. msprobe/msaccucmp/advisor/node_advisor.py +68 -0
  218. msprobe/msaccucmp/advisor/overflow_advisor.py +58 -0
  219. msprobe/msaccucmp/algorithm_manager/__init__.py +16 -0
  220. msprobe/msaccucmp/algorithm_manager/algorithm_manager.py +464 -0
  221. msprobe/msaccucmp/algorithm_manager/algorithm_parameter.py +42 -0
  222. msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_AccumulatedRelativeError.py +46 -0
  223. msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_CosineSimilarity.py +58 -0
  224. msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_KullbackLeiblerDivergence.py +84 -0
  225. msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_MaxAbsoluteError.py +41 -0
  226. msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_MaxRelativeError.py +46 -0
  227. msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_MeanAbsoluteError.py +41 -0
  228. msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_MeanRelativeError.py +46 -0
  229. msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_RelativeEuclideanDistance.py +46 -0
  230. msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_RootMeanSquareError.py +40 -0
  231. msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_StandardDeviation.py +47 -0
  232. msprobe/msaccucmp/cmp_utils/__init__.py +16 -0
  233. msprobe/msaccucmp/cmp_utils/common.py +113 -0
  234. msprobe/msaccucmp/cmp_utils/constant/__init__.py +16 -0
  235. msprobe/msaccucmp/cmp_utils/constant/compare_error.py +81 -0
  236. msprobe/msaccucmp/cmp_utils/constant/const_manager.py +530 -0
  237. msprobe/msaccucmp/cmp_utils/file_utils.py +497 -0
  238. msprobe/msaccucmp/cmp_utils/log.py +257 -0
  239. msprobe/msaccucmp/cmp_utils/multi_process/__init__.py +16 -0
  240. msprobe/msaccucmp/cmp_utils/multi_process/multi_convert_process.py +140 -0
  241. msprobe/msaccucmp/cmp_utils/multi_process/progress.py +78 -0
  242. msprobe/msaccucmp/cmp_utils/path_check.py +274 -0
  243. msprobe/msaccucmp/cmp_utils/reg_manager.py +98 -0
  244. msprobe/msaccucmp/cmp_utils/tlv_parse.py +279 -0
  245. msprobe/msaccucmp/cmp_utils/utils.py +356 -0
  246. msprobe/msaccucmp/cmp_utils/utils_type.py +63 -0
  247. msprobe/msaccucmp/compare_vector.py +48 -0
  248. msprobe/msaccucmp/conversion/__init__.py +16 -0
  249. msprobe/msaccucmp/conversion/data_conversion.py +277 -0
  250. msprobe/msaccucmp/conversion/dtype_conversion.py +99 -0
  251. msprobe/msaccucmp/conversion/shape_format_conversion.py +477 -0
  252. msprobe/msaccucmp/conversion/tensor_conversion.py +369 -0
  253. msprobe/msaccucmp/dump_data_conversion.py +46 -0
  254. msprobe/msaccucmp/dump_parse/__init__.py +16 -0
  255. msprobe/msaccucmp/dump_parse/big_dump_data.py +317 -0
  256. msprobe/msaccucmp/dump_parse/dump.py +423 -0
  257. msprobe/msaccucmp/dump_parse/dump_data_object.py +322 -0
  258. msprobe/msaccucmp/dump_parse/dump_data_parser.py +436 -0
  259. msprobe/msaccucmp/dump_parse/dump_utils.py +246 -0
  260. msprobe/msaccucmp/dump_parse/ffts_parser.py +137 -0
  261. msprobe/msaccucmp/dump_parse/mapping.py +62 -0
  262. msprobe/msaccucmp/dump_parse/nano_dump_data.py +392 -0
  263. msprobe/msaccucmp/dump_parse/proto_dump_data.py +308 -0
  264. msprobe/msaccucmp/dump_parser.py +90 -0
  265. msprobe/msaccucmp/format_manager/__init__.py +16 -0
  266. msprobe/msaccucmp/format_manager/builtin_format_convert/convert_FRACTAL_NZ_to_NCHW.py +53 -0
  267. msprobe/msaccucmp/format_manager/builtin_format_convert/convert_FRACTAL_NZ_to_ND.py +52 -0
  268. msprobe/msaccucmp/format_manager/builtin_format_convert/convert_FRACTAL_NZ_to_NHWC.py +53 -0
  269. msprobe/msaccucmp/format_manager/builtin_format_convert/convert_FRACTAL_Z_to_HWCN.py +47 -0
  270. msprobe/msaccucmp/format_manager/builtin_format_convert/convert_FRACTAL_Z_to_NCHW.py +47 -0
  271. msprobe/msaccucmp/format_manager/builtin_format_convert/convert_HWCN_to_FRACTAL_Z.py +89 -0
  272. msprobe/msaccucmp/format_manager/builtin_format_convert/convert_HWCN_to_NCHW.py +37 -0
  273. msprobe/msaccucmp/format_manager/builtin_format_convert/convert_HWCN_to_NHWC.py +37 -0
  274. msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NC1HWC0_to_HWCN.py +43 -0
  275. msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NC1HWC0_to_NCHW.py +48 -0
  276. msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NC1HWC0_to_NHWC.py +43 -0
  277. msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NCHW_to_FRACTAL_Z.py +87 -0
  278. msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NCHW_to_NHWC.py +37 -0
  279. msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NDC1HWC0_to_NCDHW.py +48 -0
  280. msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NDC1HWC0_to_ND.py +44 -0
  281. msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NHWC_to_FRACTAL_Z.py +87 -0
  282. msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NHWC_to_HWCN.py +37 -0
  283. msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NHWC_to_NCHW.py +37 -0
  284. msprobe/msaccucmp/format_manager/format_manager.py +307 -0
  285. msprobe/msaccucmp/inplace_layer_process.py +186 -0
  286. msprobe/msaccucmp/msaccucmp.py +532 -0
  287. msprobe/msaccucmp/mscmp_advisor.py +128 -0
  288. msprobe/msaccucmp/overflow/__init__.py +16 -0
  289. msprobe/msaccucmp/overflow/overflow_analyse.py +305 -0
  290. msprobe/msaccucmp/overflow/overflow_detection.py +143 -0
  291. msprobe/msaccucmp/pytorch_cmp/__init__.py +16 -0
  292. msprobe/msaccucmp/pytorch_cmp/compare_pytorch.py +389 -0
  293. msprobe/msaccucmp/pytorch_cmp/hdf5_parser.py +377 -0
  294. msprobe/msaccucmp/pytorch_cmp/pytorch_dump_data.py +461 -0
  295. msprobe/msaccucmp/shape_conversion.py +41 -0
  296. msprobe/msaccucmp/vector_cmp/__init__.py +16 -0
  297. msprobe/msaccucmp/vector_cmp/batch_compare.py +197 -0
  298. msprobe/msaccucmp/vector_cmp/compare_detail/__init__.py +16 -0
  299. msprobe/msaccucmp/vector_cmp/compare_detail/compare_detail.py +245 -0
  300. msprobe/msaccucmp/vector_cmp/compare_detail/detail.py +182 -0
  301. msprobe/msaccucmp/vector_cmp/compare_detail/detail_writer.py +580 -0
  302. msprobe/msaccucmp/vector_cmp/fusion_manager/__init__.py +16 -0
  303. msprobe/msaccucmp/vector_cmp/fusion_manager/compare_fusion_op.py +588 -0
  304. msprobe/msaccucmp/vector_cmp/fusion_manager/compare_npu_vs_npu.py +339 -0
  305. msprobe/msaccucmp/vector_cmp/fusion_manager/compare_result.py +326 -0
  306. msprobe/msaccucmp/vector_cmp/fusion_manager/compare_rule.py +156 -0
  307. msprobe/msaccucmp/vector_cmp/fusion_manager/fusion_op.py +204 -0
  308. msprobe/msaccucmp/vector_cmp/fusion_manager/fusion_rule_parser.py +635 -0
  309. msprobe/msaccucmp/vector_cmp/fusion_manager/quant_filter.py +187 -0
  310. msprobe/msaccucmp/vector_cmp/range_manager/__init__.py +16 -0
  311. msprobe/msaccucmp/vector_cmp/range_manager/range_manager.py +100 -0
  312. msprobe/msaccucmp/vector_cmp/range_manager/range_mode.py +94 -0
  313. msprobe/msaccucmp/vector_cmp/range_manager/select_mode.py +86 -0
  314. msprobe/msaccucmp/vector_cmp/vector_comparison.py +535 -0
  315. msprobe/msprobe.py +101 -130
  316. msprobe/overflow_check/__init__.py +15 -0
  317. msprobe/{nan_analyze → overflow_check}/analyzer.py +38 -27
  318. msprobe/{nan_analyze → overflow_check}/graph.py +30 -27
  319. msprobe/{nan_analyze → overflow_check}/utils.py +15 -14
  320. msprobe/pytorch/__init__.py +20 -14
  321. msprobe/pytorch/aclgraph_dump/__init__.py +45 -0
  322. msprobe/pytorch/aclgraph_dump/_meta.py +26 -0
  323. msprobe/pytorch/api_accuracy_checker/{run_ut/run_ut.py → acc_check/acc_check.py} +50 -45
  324. msprobe/pytorch/api_accuracy_checker/{run_ut/run_ut_utils.py → acc_check/acc_check_utils.py} +201 -30
  325. msprobe/pytorch/api_accuracy_checker/{run_ut → acc_check}/data_generate.py +56 -16
  326. msprobe/pytorch/api_accuracy_checker/{run_ut/multi_run_ut.py → acc_check/multi_acc_check.py} +32 -47
  327. msprobe/pytorch/api_accuracy_checker/{run_ut → acc_check}/run_overflow_check.py +19 -18
  328. msprobe/pytorch/api_accuracy_checker/common/config.py +22 -20
  329. msprobe/pytorch/api_accuracy_checker/common/utils.py +72 -13
  330. msprobe/pytorch/api_accuracy_checker/compare/algorithm.py +41 -11
  331. msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +23 -14
  332. msprobe/pytorch/api_accuracy_checker/compare/compare.py +45 -32
  333. msprobe/pytorch/api_accuracy_checker/compare/compare_column.py +12 -11
  334. msprobe/pytorch/api_accuracy_checker/compare/compare_input.py +14 -12
  335. msprobe/pytorch/api_accuracy_checker/compare/compare_utils.py +14 -12
  336. msprobe/pytorch/api_accuracy_checker/precision_standard/absolute_threshold.py +12 -11
  337. msprobe/pytorch/api_accuracy_checker/precision_standard/accumulative_error_compare.py +12 -11
  338. msprobe/pytorch/api_accuracy_checker/precision_standard/base_standard.py +21 -19
  339. msprobe/pytorch/api_accuracy_checker/precision_standard/benchmark_compare.py +14 -13
  340. msprobe/pytorch/api_accuracy_checker/precision_standard/binary_consistency.py +12 -11
  341. msprobe/pytorch/api_accuracy_checker/precision_standard/standard_config.py +60 -11
  342. msprobe/pytorch/api_accuracy_checker/precision_standard/standard_register.py +27 -16
  343. msprobe/pytorch/api_accuracy_checker/precision_standard/thousandth_standard.py +13 -11
  344. msprobe/pytorch/api_accuracy_checker/precision_standard/ulp_compare.py +39 -18
  345. msprobe/pytorch/bench_functions/__init__.py +12 -11
  346. msprobe/pytorch/bench_functions/apply_adam.py +12 -11
  347. msprobe/pytorch/bench_functions/apply_adam_w.py +12 -11
  348. msprobe/pytorch/bench_functions/confusion_transpose.py +12 -11
  349. msprobe/pytorch/bench_functions/fast_gelu.py +12 -11
  350. msprobe/pytorch/bench_functions/group_norm_silu.py +12 -11
  351. msprobe/pytorch/bench_functions/layer_norm_eval.py +12 -11
  352. msprobe/pytorch/bench_functions/linear.py +12 -11
  353. msprobe/pytorch/bench_functions/matmul_backward.py +12 -11
  354. msprobe/pytorch/bench_functions/mish.py +12 -11
  355. msprobe/pytorch/bench_functions/moe_gating_top_k_softmax.py +12 -11
  356. msprobe/pytorch/bench_functions/npu_fusion_attention.py +12 -11
  357. msprobe/pytorch/bench_functions/rms_norm.py +12 -11
  358. msprobe/pytorch/bench_functions/rotary_mul.py +12 -11
  359. msprobe/pytorch/bench_functions/scaled_mask_softmax.py +12 -11
  360. msprobe/pytorch/bench_functions/sort_v2.py +12 -11
  361. msprobe/pytorch/bench_functions/swiglu.py +12 -11
  362. msprobe/pytorch/common/__init__.py +12 -11
  363. msprobe/pytorch/common/log.py +12 -11
  364. msprobe/pytorch/common/parse_json.py +12 -11
  365. msprobe/pytorch/common/utils.py +52 -19
  366. msprobe/pytorch/compare/distributed_compare.py +13 -13
  367. msprobe/pytorch/compare/match.py +12 -11
  368. msprobe/pytorch/compare/pt_compare.py +14 -20
  369. msprobe/pytorch/compare/pt_diff_analyze.py +12 -11
  370. msprobe/pytorch/compare/utils.py +12 -11
  371. msprobe/pytorch/{hook_module → dump/api_dump}/api_register.py +18 -16
  372. msprobe/pytorch/{hook_module → dump/api_dump}/hook_module.py +14 -13
  373. msprobe/pytorch/{hook_module → dump/api_dump}/pt_hook_manager.py +68 -23
  374. msprobe/pytorch/{hook_module → dump/api_dump}/register_optimizer_hook.py +13 -11
  375. msprobe/pytorch/{hook_module → dump/api_dump}/script_wrapper.py +17 -14
  376. msprobe/pytorch/{hook_module → dump/api_dump}/utils.py +12 -11
  377. msprobe/pytorch/{debugger → dump/debugger}/debugger_config.py +23 -38
  378. msprobe/pytorch/dump/debugger/precision_debugger.py +130 -0
  379. msprobe/pytorch/{function_factory.py → dump/function_factory.py} +12 -11
  380. msprobe/pytorch/dump/module_dump/hook_wrapper.py +17 -13
  381. msprobe/pytorch/dump/module_dump/module_dump.py +16 -15
  382. msprobe/pytorch/dump/module_dump/{module_processer.py → module_processor.py} +54 -42
  383. msprobe/pytorch/dump/pt_config.py +128 -0
  384. msprobe/pytorch/{pytorch_service.py → dump/pytorch_service.py} +22 -21
  385. msprobe/pytorch/monitor/csv2tb.py +13 -11
  386. msprobe/pytorch/monitor/data_writers.py +13 -11
  387. msprobe/pytorch/monitor/distributed/wrap_distributed.py +13 -11
  388. msprobe/pytorch/monitor/features.py +12 -11
  389. msprobe/pytorch/monitor/module_hook.py +67 -59
  390. msprobe/pytorch/monitor/module_metric.py +13 -11
  391. msprobe/pytorch/monitor/optimizer_collect.py +37 -35
  392. msprobe/pytorch/monitor/utils.py +13 -11
  393. msprobe/pytorch/monitor/visualizer.py +12 -11
  394. msprobe/pytorch/torchair_dump/__init__.py +17 -0
  395. msprobe/pytorch/torchair_dump/torchair_dump.py +114 -0
  396. msprobe/scripts/atb/config_example.json +10 -0
  397. msprobe/scripts/atb/load_atb_probe.sh +101 -0
  398. msprobe/scripts/atb/unload_atb_probe.sh +27 -0
  399. msprobe/scripts/build_msaccucmp.sh +186 -0
  400. msprobe/scripts/conf/help.info +6 -0
  401. msprobe/scripts/conf/version.info +3 -0
  402. msprobe/scripts/run_script/common.sh +538 -0
  403. msprobe/scripts/run_script/main_msaccucmp.sh +232 -0
  404. msprobe/visualization/__init__.py +12 -11
  405. msprobe/visualization/builder/__init__.py +12 -11
  406. msprobe/visualization/builder/graph_builder.py +45 -30
  407. msprobe/visualization/builder/graph_merger.py +53 -32
  408. msprobe/visualization/builder/msprobe_adapter.py +34 -44
  409. msprobe/visualization/compare/__init__.py +12 -11
  410. msprobe/visualization/compare/graph_comparator.py +63 -51
  411. msprobe/visualization/compare/mode_adapter.py +28 -113
  412. msprobe/visualization/db_utils.py +133 -22
  413. msprobe/visualization/graph/__init__.py +12 -11
  414. msprobe/visualization/graph/base_node.py +15 -27
  415. msprobe/visualization/graph/distributed_analyzer.py +97 -40
  416. msprobe/visualization/graph/graph.py +14 -16
  417. msprobe/visualization/graph/node_colors.py +34 -31
  418. msprobe/visualization/graph/node_op.py +12 -11
  419. msprobe/visualization/graph_service.py +580 -205
  420. msprobe/visualization/utils.py +278 -31
  421. tb_graph_ascend/secure_build.py +175 -0
  422. tb_graph_ascend/server/__init__.py +15 -0
  423. tb_graph_ascend/server/app/__init__.py +15 -0
  424. tb_graph_ascend/server/app/model/__init__.py +15 -0
  425. tb_graph_ascend/server/app/model/hierarchy.py +348 -0
  426. tb_graph_ascend/server/app/model/layout_hierarchy_model.py +69 -0
  427. tb_graph_ascend/server/app/model/match_nodes_model.py +573 -0
  428. tb_graph_ascend/server/app/repositories/__init__.py +15 -0
  429. tb_graph_ascend/server/app/repositories/graph_repo_base.py +32 -0
  430. tb_graph_ascend/server/app/repositories/graph_repo_db.py +879 -0
  431. tb_graph_ascend/server/app/repositories/graph_repo_vis.py +83 -0
  432. tb_graph_ascend/server/app/service/__init__.py +18 -0
  433. tb_graph_ascend/server/app/service/graph_service_base.py +158 -0
  434. tb_graph_ascend/server/app/service/graph_service_db.py +438 -0
  435. tb_graph_ascend/server/app/service/graph_service_factory.py +54 -0
  436. tb_graph_ascend/server/app/service/graph_service_vis.py +480 -0
  437. tb_graph_ascend/server/app/utils/__init__.py +15 -0
  438. tb_graph_ascend/server/app/utils/constant.py +80 -0
  439. tb_graph_ascend/server/app/utils/file_check_wrapper.py +46 -0
  440. tb_graph_ascend/server/app/utils/global_state.py +95 -0
  441. tb_graph_ascend/server/app/utils/graph_utils.py +661 -0
  442. tb_graph_ascend/server/app/utils/i18n.py +153 -0
  443. tb_graph_ascend/server/app/utils/request_method.py +46 -0
  444. tb_graph_ascend/server/app/views/__init__.py +15 -0
  445. tb_graph_ascend/server/app/views/graph_views.py +304 -0
  446. tb_graph_ascend/server/plugin.py +108 -0
  447. tb_graph_ascend/server/static/index.html +9250 -0
  448. tb_graph_ascend/server/static/index.js +21 -0
  449. tb_graph_ascend/setup.py +57 -0
  450. mindstudio_probe-8.3.2.dist-info/LICENSE +0 -201
  451. mindstudio_probe-8.3.2.dist-info/RECORD +0 -491
  452. mindstudio_probe-8.3.2.dist-info/entry_points.txt +0 -2
  453. mindstudio_probe-8.3.2.dist-info/top_level.txt +0 -1
  454. msprobe/CMakeLists.txt +0 -5
  455. msprobe/README.md +0 -203
  456. msprobe/core/advisor/advisor.py +0 -129
  457. msprobe/core/advisor/advisor_const.py +0 -58
  458. msprobe/core/advisor/advisor_result.py +0 -58
  459. msprobe/core/compare/find_first/data_processor.py +0 -35
  460. msprobe/core/compare/highlight.py +0 -390
  461. msprobe/core/data_dump/data_collector.py +0 -356
  462. msprobe/core/grad_probe/constant.py +0 -90
  463. msprobe/core/grad_probe/grad_compare.py +0 -187
  464. msprobe/core/grad_probe/utils.py +0 -105
  465. msprobe/core/kernel_dump/kernel_config.py +0 -33
  466. msprobe/docs/01.installation.md +0 -250
  467. msprobe/docs/02.config_introduction.md +0 -221
  468. msprobe/docs/03.config_examples.md +0 -281
  469. msprobe/docs/04.kernel_dump_PyTorch.md +0 -73
  470. msprobe/docs/05.data_dump_PyTorch.md +0 -518
  471. msprobe/docs/06.data_dump_MindSpore.md +0 -618
  472. msprobe/docs/07.accuracy_checker_PyTorch.md +0 -310
  473. msprobe/docs/09.accuracy_checker_MindSpore.md +0 -120
  474. msprobe/docs/10.accuracy_compare_PyTorch.md +0 -637
  475. msprobe/docs/11.accuracy_compare_MindSpore.md +0 -769
  476. msprobe/docs/12.overflow_check_PyTorch.md +0 -82
  477. msprobe/docs/13.overflow_check_MindSpore.md +0 -33
  478. msprobe/docs/14.data_parse_PyTorch.md +0 -282
  479. msprobe/docs/15.free_benchmarking_PyTorch.md +0 -169
  480. msprobe/docs/16.free_benchmarking_MindSpore.md +0 -159
  481. msprobe/docs/17.grad_probe.md +0 -205
  482. msprobe/docs/18.online_dispatch.md +0 -89
  483. msprobe/docs/19.monitor.md +0 -753
  484. msprobe/docs/20.monitor_performance_baseline.md +0 -52
  485. msprobe/docs/21.visualization_PyTorch.md +0 -519
  486. msprobe/docs/22.visualization_MindSpore.md +0 -515
  487. msprobe/docs/23.generate_operator_PyTorch.md +0 -107
  488. msprobe/docs/24.code_mapping_Mindspore.md +0 -29
  489. msprobe/docs/25.tool_function_introduction.md +0 -29
  490. msprobe/docs/26.data_dump_PyTorch_baseline.md +0 -48
  491. msprobe/docs/27.dump_json_instruction.md +0 -795
  492. msprobe/docs/28.debugger_save_instruction.md +0 -288
  493. msprobe/docs/28.kernel_dump_MindSpore.md +0 -69
  494. msprobe/docs/29.data_dump_MSAdapter.md +0 -235
  495. msprobe/docs/30.overflow_check_MSAdapter.md +0 -31
  496. msprobe/docs/31.config_check.md +0 -107
  497. msprobe/docs/32.ckpt_compare.md +0 -69
  498. msprobe/docs/33.generate_operator_MindSpore.md +0 -181
  499. msprobe/docs/34.RL_collect.md +0 -101
  500. msprobe/docs/35.nan_analyze.md +0 -73
  501. msprobe/docs/36.calculation_result_change.md +0 -75
  502. msprobe/docs/FAQ.md +0 -232
  503. msprobe/docs/S02.report_free_benchmarking_validation_performance_baseline.md +0 -146
  504. msprobe/docs/accuracy_checker_MindSpore/accuracy_checker_MindSpore_baseline.md +0 -14
  505. msprobe/docs/data_dump_MindSpore/data_dump_MindSpore_baseline.md +0 -33
  506. msprobe/docs/data_dump_MindSpore/dynamic_graph_quick_start_example.md +0 -217
  507. msprobe/docs/img/BLOOM-7B_1.png +0 -0
  508. msprobe/docs/img/BLOOM-7B_2.png +0 -0
  509. msprobe/docs/img/BLOOM-7B_3.png +0 -0
  510. msprobe/docs/img/BLOOM-7B_4.png +0 -0
  511. msprobe/docs/img/GPT-3_1.png +0 -0
  512. msprobe/docs/img/GPT-3_2.png +0 -0
  513. msprobe/docs/img/GPT-3_3.png +0 -0
  514. msprobe/docs/img/GPT-3_4.png +0 -0
  515. msprobe/docs/img/GPT-3_5.png +0 -0
  516. msprobe/docs/img/GPT-3_6.png +0 -0
  517. msprobe/docs/img/GPT-3_7.png +0 -0
  518. msprobe/docs/img/GPT-3_8.png +0 -0
  519. msprobe/docs/img/YOLOV5S_1.png +0 -0
  520. msprobe/docs/img/YOLOV5S_2.png +0 -0
  521. msprobe/docs/img/accuracy_checking_details.png +0 -0
  522. msprobe/docs/img/accuracy_checking_result.png +0 -0
  523. msprobe/docs/img/api_precision_compare_details.png +0 -0
  524. msprobe/docs/img/api_precision_compare_result.png +0 -0
  525. msprobe/docs/img/auto_analyze_log.png +0 -0
  526. msprobe/docs/img/compare_result.png +0 -0
  527. msprobe/docs/img/compare_result_pkl.png +0 -0
  528. msprobe/docs/img/compare_result_pkl_md5.png.png +0 -0
  529. msprobe/docs/img/cpu_info.png +0 -0
  530. msprobe/docs/img/free_benchmark.png +0 -0
  531. msprobe/docs/img/free_benchmark_framework.png +0 -0
  532. msprobe/docs/img/grad_probe_image-1.png +0 -0
  533. msprobe/docs/img/grad_probe_image-2.png +0 -0
  534. msprobe/docs/img/grad_probe_image-3.png +0 -0
  535. msprobe/docs/img/grad_probe_image-4.png +0 -0
  536. msprobe/docs/img/grad_probe_image.png +0 -0
  537. msprobe/docs/img/merge_result.png +0 -0
  538. msprobe/docs/img/module_compare.png +0 -0
  539. msprobe/docs/img/monitor/cpu_info.png +0 -0
  540. msprobe/docs/img/monitor/step_count_per_record.png +0 -0
  541. msprobe/docs/img/ms_dump.png +0 -0
  542. msprobe/docs/img/ms_layer.png +0 -0
  543. msprobe/docs/img/pt_dump.png +0 -0
  544. msprobe/docs/img/save_compare_result_sample.png +0 -0
  545. msprobe/docs/img/visualization/fuzzy_match_ms.png +0 -0
  546. msprobe/docs/img/visualization/fuzzy_match_pt.png +0 -0
  547. msprobe/docs/img/visualization/proxy.png +0 -0
  548. msprobe/docs/img/visualization/tensorboard_1.png +0 -0
  549. msprobe/docs/img/visualization/tensorboard_2.png +0 -0
  550. msprobe/docs/img/visualization/vis_browser_1.png +0 -0
  551. msprobe/docs/img/visualization/vis_browser_2.png +0 -0
  552. msprobe/docs/img/visualization/vis_match_info.png +0 -0
  553. msprobe/docs/img/visualization/vis_precision_info.png +0 -0
  554. msprobe/docs/img/visualization/vis_search_info.png +0 -0
  555. msprobe/docs/img/visualization/vis_show_info.png +0 -0
  556. msprobe/docs/img/visualization/vis_showcase.png +0 -0
  557. msprobe/docs/img/visualization/vis_unmatch_info.png +0 -0
  558. msprobe/docs/visualization/GPTModel.png +0 -0
  559. msprobe/docs/visualization/ParallelMLP.png +0 -0
  560. msprobe/docs/visualization/layer_mapping_example.md +0 -132
  561. msprobe/docs/visualization/mapping.png +0 -0
  562. msprobe/docs/visualization/mapping1.png +0 -0
  563. msprobe/docs/visualization/mindspeed_llamafactoary_img/1.png +0 -0
  564. msprobe/docs/visualization/mindspeed_llamafactoary_img/2.png +0 -0
  565. msprobe/docs/visualization/mindspeed_llamafactoary_img/3.png +0 -0
  566. msprobe/docs/visualization/mindspeed_llamafactoary_img/4.png +0 -0
  567. msprobe/docs/visualization/mindspeed_llamafactoary_img/5.png +0 -0
  568. msprobe/docs/visualization/mindspeed_llamafactoary_img/6.png +0 -0
  569. msprobe/docs/visualization/mindspeed_llamafactoary_img/7.png +0 -0
  570. msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory-qwen25vl.txt +0 -59
  571. msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory1.png +0 -0
  572. msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory2.png +0 -0
  573. msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed-mm-qwen25vl.txt +0 -80
  574. msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed1.png +0 -0
  575. msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed2.png +0 -0
  576. msprobe/docs/visualization/mindspeed_llamafactory_mapping.md +0 -330
  577. msprobe/docs/visualization/module_name.png +0 -0
  578. msprobe/docs/visualization/module_name1.png +0 -0
  579. msprobe/docs/visualization/no_mapping.png +0 -0
  580. msprobe/docs/visualization/no_mapping1.png +0 -0
  581. msprobe/docs/visualization/no_mapping_analyze.png +0 -0
  582. msprobe/docs/visualization/top_layer.png +0 -0
  583. msprobe/mindspore/api_accuracy_checker/generate_op_script/op_generator.py +0 -460
  584. msprobe/mindspore/api_accuracy_checker/generate_op_script/operator_replication.template +0 -2081
  585. msprobe/mindspore/code_mapping/bind.py +0 -283
  586. msprobe/mindspore/code_mapping/cmd_parser.py +0 -40
  587. msprobe/mindspore/code_mapping/graph.py +0 -49
  588. msprobe/mindspore/code_mapping/graph_parser.py +0 -211
  589. msprobe/mindspore/code_mapping/main.py +0 -24
  590. msprobe/mindspore/code_mapping/processor.py +0 -34
  591. msprobe/mindspore/dym_loader/hook_dynamic_loader.cpp +0 -111
  592. msprobe/mindspore/dym_loader/hook_dynamic_loader.h +0 -52
  593. msprobe/mindspore/free_benchmark/api_pynative_self_check.py +0 -257
  594. msprobe/mindspore/free_benchmark/common/config.py +0 -27
  595. msprobe/mindspore/free_benchmark/common/handler_params.py +0 -31
  596. msprobe/mindspore/free_benchmark/common/utils.py +0 -100
  597. msprobe/mindspore/free_benchmark/data/support_wrap_ops.yaml +0 -638
  598. msprobe/mindspore/free_benchmark/handler/base_handler.py +0 -105
  599. msprobe/mindspore/free_benchmark/handler/check_handler.py +0 -55
  600. msprobe/mindspore/free_benchmark/handler/fix_handler.py +0 -51
  601. msprobe/mindspore/free_benchmark/handler/handler_factory.py +0 -36
  602. msprobe/mindspore/free_benchmark/perturbation/add_noise.py +0 -82
  603. msprobe/mindspore/free_benchmark/perturbation/base_perturbation.py +0 -45
  604. msprobe/mindspore/free_benchmark/perturbation/bit_noise.py +0 -78
  605. msprobe/mindspore/free_benchmark/perturbation/exchange_value.py +0 -77
  606. msprobe/mindspore/free_benchmark/perturbation/improve_precision.py +0 -56
  607. msprobe/mindspore/free_benchmark/perturbation/no_change.py +0 -27
  608. msprobe/mindspore/free_benchmark/perturbation/perturbation_factory.py +0 -46
  609. msprobe/mindspore/free_benchmark/self_check_tool_factory.py +0 -51
  610. msprobe/mindspore/grad_probe/global_context.py +0 -127
  611. msprobe/mindspore/grad_probe/grad_analyzer.py +0 -260
  612. msprobe/mindspore/grad_probe/grad_monitor.py +0 -42
  613. msprobe/mindspore/grad_probe/grad_stat_csv.py +0 -161
  614. msprobe/mindspore/grad_probe/hook.py +0 -115
  615. msprobe/mindspore/grad_probe/utils.py +0 -43
  616. msprobe/mindspore/mindtorch/__init__.py +0 -18
  617. msprobe/mindspore/ms_config.py +0 -153
  618. msprobe/mindspore/task_handler_factory.py +0 -44
  619. msprobe/nan_analyze/__init__.py +0 -14
  620. msprobe/pytorch/api_accuracy_checker/generate_op_script/config_op.json +0 -9
  621. msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +0 -480
  622. msprobe/pytorch/api_accuracy_checker/generate_op_script/operator_replication.template +0 -567
  623. msprobe/pytorch/debugger/precision_debugger.py +0 -181
  624. msprobe/pytorch/free_benchmark/__init__.py +0 -23
  625. msprobe/pytorch/free_benchmark/common/constant.py +0 -85
  626. msprobe/pytorch/free_benchmark/common/counter.py +0 -87
  627. msprobe/pytorch/free_benchmark/common/enums.py +0 -80
  628. msprobe/pytorch/free_benchmark/common/params.py +0 -152
  629. msprobe/pytorch/free_benchmark/common/utils.py +0 -143
  630. msprobe/pytorch/free_benchmark/compare/grad_saver.py +0 -215
  631. msprobe/pytorch/free_benchmark/compare/single_benchmark.py +0 -121
  632. msprobe/pytorch/free_benchmark/main.py +0 -123
  633. msprobe/pytorch/free_benchmark/perturbed_layers/base_layer.py +0 -28
  634. msprobe/pytorch/free_benchmark/perturbed_layers/layer_factory.py +0 -56
  635. msprobe/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py +0 -107
  636. msprobe/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py +0 -121
  637. msprobe/pytorch/free_benchmark/perturbed_layers/npu/change_value.py +0 -89
  638. msprobe/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py +0 -87
  639. msprobe/pytorch/free_benchmark/perturbed_layers/npu/no_change.py +0 -43
  640. msprobe/pytorch/free_benchmark/perturbed_layers/npu/npu_base_layser.py +0 -60
  641. msprobe/pytorch/free_benchmark/perturbed_layers/run_cpu.py +0 -34
  642. msprobe/pytorch/free_benchmark/result_handlers/base_handler.py +0 -252
  643. msprobe/pytorch/free_benchmark/result_handlers/check_handler.py +0 -54
  644. msprobe/pytorch/free_benchmark/result_handlers/fix_handler.py +0 -40
  645. msprobe/pytorch/free_benchmark/result_handlers/handler_factory.py +0 -45
  646. msprobe/pytorch/free_benchmark/result_handlers/preheat_handler.py +0 -181
  647. msprobe/pytorch/grad_probe/__init__.py +0 -0
  648. msprobe/pytorch/grad_probe/grad_monitor.py +0 -108
  649. msprobe/pytorch/grad_probe/grad_stat_csv.py +0 -160
  650. msprobe/pytorch/hook_module/__init__.py +0 -16
  651. msprobe/pytorch/hook_module/wrap_aten.py +0 -111
  652. msprobe/pytorch/online_dispatch/__init__.py +0 -19
  653. msprobe/pytorch/online_dispatch/compare.py +0 -224
  654. msprobe/pytorch/online_dispatch/dispatch.py +0 -332
  655. msprobe/pytorch/online_dispatch/dump_compare.py +0 -179
  656. msprobe/pytorch/online_dispatch/single_compare.py +0 -412
  657. msprobe/pytorch/online_dispatch/torch_ops_config.yaml +0 -58
  658. msprobe/pytorch/online_dispatch/utils.py +0 -158
  659. msprobe/pytorch/parse_tool/__init__.py +0 -0
  660. msprobe/pytorch/parse_tool/cli.py +0 -31
  661. msprobe/pytorch/parse_tool/lib/__init__.py +0 -0
  662. msprobe/pytorch/parse_tool/lib/compare.py +0 -253
  663. msprobe/pytorch/parse_tool/lib/config.py +0 -50
  664. msprobe/pytorch/parse_tool/lib/file_desc.py +0 -45
  665. msprobe/pytorch/parse_tool/lib/interactive_cli.py +0 -97
  666. msprobe/pytorch/parse_tool/lib/parse_exception.py +0 -54
  667. msprobe/pytorch/parse_tool/lib/parse_tool.py +0 -161
  668. msprobe/pytorch/parse_tool/lib/utils.py +0 -299
  669. msprobe/pytorch/parse_tool/lib/visualization.py +0 -85
  670. msprobe/pytorch/pt_config.py +0 -299
  671. /msprobe/core/{grad_probe → dump}/__init__.py +0 -0
  672. /msprobe/{mindspore/code_mapping → core/dump/api_dump}/__init__.py +0 -0
  673. /msprobe/{mindspore/debugger → core/dump/data_dump}/__init__.py +0 -0
  674. /msprobe/{mindspore/exception_dump → core/dump/data_dump/data_processor}/__init__.py +0 -0
  675. /msprobe/{mindspore/free_benchmark → core/dump/debugger}/__init__.py +0 -0
  676. /msprobe/{mindspore/free_benchmark/common → core/dump/kernel_dump}/__init__.py +0 -0
  677. /msprobe/mindspore/{free_benchmark/handler → dump/debugger}/__init__.py +0 -0
  678. /msprobe/mindspore/{grad_probe → dump/dump_processor}/__init__.py +0 -0
  679. /msprobe/mindspore/{overflow_check → dump/exception_dump}/__init__.py +0 -0
  680. /msprobe/mindspore/{mindtorch → dump/mindtorch}/mindtorch_adaptor.py +0 -0
  681. /msprobe/{pytorch/api_accuracy_checker/run_ut → mindspore/dump/overflow_check}/__init__.py +0 -0
  682. /msprobe/{pytorch/debugger → mindspore/monitor}/__init__.py +0 -0
  683. /msprobe/{pytorch/free_benchmark/common → msaccucmp}/__init__.py +0 -0
  684. /msprobe/pytorch/api_accuracy_checker/{run_ut → acc_check}/.keep +0 -0
  685. /msprobe/pytorch/{free_benchmark/perturbed_layers → api_accuracy_checker/acc_check}/__init__.py +0 -0
  686. /msprobe/pytorch/api_accuracy_checker/{run_ut → acc_check}/torch_ut_setting.json +0 -0
  687. /msprobe/pytorch/{free_benchmark/perturbed_layers/npu → dump/api_dump}/__init__.py +0 -0
  688. /msprobe/pytorch/{hook_module → dump/api_dump}/support_wrap_ops.yaml +0 -0
  689. /msprobe/pytorch/{free_benchmark/result_handlers → dump/debugger}/__init__.py +0 -0
@@ -0,0 +1,409 @@
1
+ # -------------------------------------------------------------------------
2
+ # This file is part of the MindStudio project.
3
+ # Copyright (c) 2025 Huawei Technologies Co.,Ltd.
4
+ #
5
+ # MindStudio is licensed under Mulan PSL v2.
6
+ # You can use this software according to the terms and conditions of the Mulan PSL v2.
7
+ # You may obtain a copy of Mulan PSL v2 at:
8
+ #
9
+ # http://license.coscl.org.cn/MulanPSL2
10
+ #
11
+ # THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
12
+ # EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
13
+ # MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
14
+ # See the Mulan PSL v2 for more details.
15
+ # -------------------------------------------------------------------------
16
+
17
+ import os
18
+ import sys
19
+ from collections import defaultdict
20
+ from dataclasses import dataclass
21
+ from typing import Any, Dict, Optional
22
+
23
+ from tqdm import tqdm
24
+
25
+ from msprobe.core.common.const import Const, Data2DBConst
26
+ from msprobe.core.common.file_utils import (
27
+ check_file_or_directory_path,
28
+ create_directory,
29
+ load_json,
30
+ recursive_chmod,
31
+ remove_path,
32
+ )
33
+ from msprobe.core.common.log import logger
34
+ from msprobe.core.dump.dump2db.db_utils import DumpDB
35
+
36
+
37
def validate_micro_step(micro_step):
    """Check that ``micro_step`` is either unset or an integer in the allowed range.

    Args:
        micro_step: ``None`` (option not given) or the number of micro steps
            each step is split into.

    Raises:
        ValueError: if ``micro_step`` is not exactly of type ``int``, or lies
            outside ``[Data2DBConst.MIN_MICRO_STEP, Data2DBConst.MAX_MICRO_STEP]``.
    """
    if micro_step is None:
        return
    # Exact type check on purpose: ``bool`` is a subclass of ``int`` and
    # must be rejected as well.
    if type(micro_step) is not int:
        raise ValueError("Micro step must be an integer")
    if not Data2DBConst.MIN_MICRO_STEP <= micro_step <= Data2DBConst.MAX_MICRO_STEP:
        # The trailing space after "between" matters: adjacent string literals
        # are concatenated verbatim.
        raise ValueError(
            f"Micro step must be between "
            f"{Data2DBConst.MIN_MICRO_STEP} and {Data2DBConst.MAX_MICRO_STEP}"
        )
47
+
48
+
49
def load_mapping(mapping_path):
    """Load the optional JSON mapping file.

    Args:
        mapping_path: path to a JSON file, or any falsy / non-string value
            when no mapping was supplied.

    Returns:
        The result of ``load_json(mapping_path)`` for a non-empty string path,
        otherwise an empty dict.
    """
    # Guard clause: anything that is not a non-empty string means "no mapping".
    if not mapping_path or not isinstance(mapping_path, str):
        return {}
    return load_json(mapping_path)
54
+
55
+
56
@dataclass
class TensorProcessingParams:
    """Parameter bundle handed to the tensor-processing helpers."""

    # Payload of one entry under the "data" key of a dump.json file.
    tensor_data: Dict[str, Any]
    # Target name with the metric-type marker and trailing index removed.
    target_prefix: str
    # Stage index stored as part of the target cache key.
    vpp_stage: int
    # Micro-step index stored as part of the target cache key.
    micro_step: int
    # Step value written into each data row.
    step: int
    # Rank value written into each data row.
    rank: int
    # Identifier of the metric, as returned by the database layer.
    metric_id: int
    # Name of the metric type this entry was classified as.
    metric_type: Optional[str]
67
+
68
+
69
class DumpRecordBuilder:
    """Import tensor statistics from ``dump.json`` files into a ``DumpDB``.

    The data directory is scanned for ``step<N>/rank<M>/dump.json`` files.
    Every supported tensor found in them becomes one row holding rank, step,
    a target reference, the metric id and the ordered statistics.
    """

    def __init__(self, db: "DumpDB", data_dir, mapping, micro_step):
        """Store the collaborators used by the import.

        Args:
            db: database wrapper receiving targets and data rows.
            data_dir: directory that contains the ``step*`` sub-directories.
            mapping: dict of substring replacements applied to every key.
            micro_step: number of micro steps per step; a falsy value (None
                or 0) disables micro-step splitting.
        """
        self.db = db
        self.data_dir = data_dir
        self.mapping = mapping
        self.micro_step = micro_step if micro_step else None

    @staticmethod
    def extract_target_info(full_key):
        """Split a key into target prefix, vpp stage and micro step.

        Expected key format: ``Module.layerN.operation.M``. When the last
        ``Const.SEP``-separated part is numeric it is returned as the micro
        step and removed from the prefix.

        Returns:
            Tuple ``(target_prefix, vpp_stage, micro_step)``; ``vpp_stage`` is
            always 0 because vpp information is not extracted yet.
        """
        vpp_stage = 0  # vpp information is not extracted for now
        micro_step = 0
        target_prefix = full_key

        parts = full_key.split(Const.SEP)
        if parts and parts[-1].isdigit():
            micro_step = int(parts[-1])
            # Rebuild the prefix without the trailing numeric part.
            target_prefix = Const.SEP.join(parts[:-1])

        return target_prefix, vpp_stage, micro_step

    @staticmethod
    def parse_tensor_target(metric_type, tensor_type, tensor_idx):
        """Build the target suffix for a tensor from its kind and index.

        Returns:
            ``".input.<idx>"``, ``".output.<idx>"`` or ``".parameters.<idx>"``
            depending on the metric/tensor type, or ``""`` for any
            combination that is not handled.
        """
        if metric_type in [Data2DBConst.FORWARD, Data2DBConst.RECOMPUTE]:
            if tensor_type == Const.INPUT_ARGS:
                return f".input.{tensor_idx}"
            if tensor_type == Const.OUTPUT:
                return f".output.{tensor_idx}"
            if tensor_type == Const.PARAMS:
                return f".parameters.{tensor_idx}"
        elif metric_type == Data2DBConst.BACKWARD:
            if tensor_type == Const.INPUT:
                return f".input.{tensor_idx}"
            if tensor_type == Const.OUTPUT:
                return f".output.{tensor_idx}"
        return ""

    @staticmethod
    def process_tensor_value(value):
        """Convert one statistic into a finite float that can be stored.

        Returns:
            * ``sys.float_info.max`` for ``None`` and for ``+inf``;
            * ``-sys.float_info.max`` for ``-inf``;
            * ``float(value)`` for any other int/float;
            * ``None`` for every other type.
        """
        if value is None:
            return sys.float_info.max
        if value == float("inf"):
            # Subtracting 1 is a no-op at this magnitude, so the result is
            # identical to the ``None`` placeholder; kept for compatibility.
            return sys.float_info.max - 1
        if value == float("-inf"):
            # ``sys.float_info.min`` is the smallest *positive* normal float,
            # so ``min + 1`` evaluated to 1.0. The most negative finite float
            # is ``-sys.float_info.max``.
            return -sys.float_info.max
        if isinstance(value, (int, float)):
            return float(value)
        return None

    @staticmethod
    def _is_supported_tensor(tensor):
        """Return True when ``tensor`` is a dict with a supported type and dtype."""
        return (
            isinstance(tensor, dict)
            and tensor.get(Const.TYPE) in Data2DBConst.SUPPORT_TYPE
            and tensor.get(Const.DTYPE) in Data2DBConst.SUPPORT_DTYPE
        )

    @staticmethod
    def _first_supported_tensor(param_tensors):
        """Return the first element of a parameter tensor list if supported, else None."""
        if not (isinstance(param_tensors, list) and param_tensors):
            return None
        first = param_tensors[0]
        return first if DumpRecordBuilder._is_supported_tensor(first) else None

    def import_data(self):
        """Scan the data directory and import every ``dump.json`` found."""
        logger.info("Scanning data directory...")

        # Collect every step and rank present on disk.
        max_rank = 0
        min_step = float('inf')
        max_step = 0
        valid_ranks = defaultdict(list)
        step_paths = {}

        for step_dir in os.listdir(self.data_dir):
            if not step_dir.startswith('step'):
                continue
            try:
                step = int(step_dir[4:])  # numeric part after "step"
            except ValueError:
                continue
            step_path = os.path.join(self.data_dir, step_dir)
            if not os.path.isdir(step_path):
                # A plain file that happens to be named "step*" is not a step.
                continue

            if self.micro_step:
                min_step = min(min_step, step * self.micro_step)
                max_step = max(max_step, (step + 1) * self.micro_step)
            else:
                min_step = min(min_step, step)
                max_step = max(max_step, step)

            for rank_dir in os.listdir(step_path):
                if not rank_dir.startswith('rank'):
                    continue
                if rank_dir == "rank":
                    rank = 0
                else:
                    try:
                        rank = int(rank_dir[4:])  # numeric part after "rank"
                    except ValueError:
                        continue

                json_path = os.path.join(step_path, rank_dir, "dump.json")
                if os.path.exists(json_path):
                    valid_ranks[step].append((rank, json_path))
                    # NOTE(review): indentation was lost in the source view;
                    # max_rank is assumed to count only ranks with a dump.json.
                    max_rank = max(max_rank, rank)
                    # Keep the real directory so names such as "step01" work.
                    step_paths[step] = step_path

        if min_step == float('inf'):
            logger.warning(f"No valid step directories found in: {self.data_dir}, "
                           f"looking for directories starting with 'step' (e.g., step0, step1)")
            return

        # Publish the global statistics before inserting any row.
        global_stats = {
            "max_rank": max_rank,
            "min_step": min_step,
            "max_step": max_step
        }
        for metric_name in Data2DBConst.METRICS:
            global_stats[metric_name] = Data2DBConst.ORDERED_STAT
        self.db.init_global_stats_data(global_stats)

        for step in tqdm(sorted(valid_ranks.keys()), desc="Processing steps"):
            check_file_or_directory_path(step_paths[step], isdir=True)

            if not valid_ranks[step]:
                continue
            # Nested progress bar over the ranks of the current step.
            for rank, json_path in tqdm(valid_ranks[step], desc=f"Step {step} ranks", leave=False):
                self._process_dump_file(json_path, step, rank)

        # NOTE(review): indentation was lost in the source view; tag extraction
        # is assumed to run once after all steps were processed.
        self.db.extract_tags_from_processed_targets()

    def _determine_metric_type(self, full_key, tensor_data):
        """Classify a dump key and strip the metric-type marker from it.

        The user mapping is applied first (plain substring replacement), then
        the key is matched against the forward / backward / parameters-grad
        markers.

        Returns:
            Tuple ``(metric_type, full_key)``; ``metric_type`` is ``None``
            when the key matches no known pattern.
        """
        for key, value in self.mapping.items():
            full_key = full_key.replace(key, value)

        forward_mid = f"{Const.SEP}{Const.FORWARD}{Const.SEP}"
        forward_end = f"{Const.SEP}{Const.FORWARD}"
        backward_mid = f"{Const.SEP}{Const.BACKWARD}{Const.SEP}"
        backward_end = f"{Const.SEP}{Const.BACKWARD}"

        metric_type = None
        if forward_mid in full_key or full_key.endswith(forward_end):
            if tensor_data.get('is_recompute', False):
                metric_type = Data2DBConst.RECOMPUTE
            else:
                metric_type = Data2DBConst.FORWARD
            # NOTE(review): ``replace`` strips every occurrence, not only the
            # matched marker position.
            full_key = full_key.replace(forward_mid, Const.SEP)  # module
            full_key = full_key.replace(forward_end, "")  # api
        elif backward_mid in full_key or full_key.endswith(backward_end):
            metric_type = Data2DBConst.BACKWARD
            full_key = full_key.replace(backward_mid, Const.SEP)
            full_key = full_key.replace(backward_end, "")
        elif full_key.endswith(Const.PARAMS_GRAD):
            metric_type = Data2DBConst.PARAMETERS_GRAD
            full_key = full_key.replace(f"{Const.SEP}{Const.PARAMS_GRAD}", "")
        else:
            # fsdp: the marker is the second-to-last part of the key
            parts = full_key.split(Const.SEP)
            if len(parts) >= 3 and parts[-2] == Const.PARAMS_GRAD:
                metric_type = Data2DBConst.PARAMETERS_GRAD
                full_key = Const.SEP.join(parts[:-2])

        return metric_type, full_key

    def _add_tensor_data(self, tensor, target_name, tensor_params: "TensorProcessingParams", batch_data):
        """Append one data row for ``tensor`` to ``batch_data``."""
        cache_key = (target_name, tensor_params.vpp_stage, tensor_params.micro_step)

        # Registers the target in the cache if it is not there yet.
        cache_id_dict = self.db.cache_targets(cache_key, tensor_params.metric_id)

        # The third column still holds a temporary id holder (e.g. {"id": 0});
        # it has to be read again after the targets were inserted.
        row_data = [tensor_params.rank, tensor_params.step, cache_id_dict, tensor_params.metric_id]
        for stat in Data2DBConst.ORDERED_STAT:
            value = tensor.get(stat.capitalize(), None)
            row_data.append(DumpRecordBuilder.process_tensor_value(value))
        batch_data.append(row_data)

    def _add_indexed_tensors(self, tensor_type, metric_type, tensor_params, batch_data):
        """Add every supported tensor of the list stored under ``tensor_type``."""
        # ``or ()`` tolerates a missing key as well as an explicit null value.
        tensors = tensor_params.tensor_data.get(tensor_type) or ()
        for idx, tensor in enumerate(tensors):
            if not DumpRecordBuilder._is_supported_tensor(tensor):
                continue
            target_suffix = DumpRecordBuilder.parse_tensor_target(metric_type, tensor_type, idx)
            self._add_tensor_data(
                tensor, tensor_params.target_prefix + target_suffix, tensor_params, batch_data)

    def _process_forward_data(self, tensor_params: "TensorProcessingParams", batch_data):
        """Handle forward / recompute entries: input args, outputs, parameters."""
        self._add_indexed_tensors(Const.INPUT_ARGS, Data2DBConst.FORWARD, tensor_params, batch_data)
        self._add_indexed_tensors(Const.OUTPUT, Data2DBConst.FORWARD, tensor_params, batch_data)

        # Parameters are keyed by name; only the first tensor of each list is used.
        parameters = tensor_params.tensor_data.get(Const.PARAMS) or {}
        for param_name, param_tensors in parameters.items():
            tensor = DumpRecordBuilder._first_supported_tensor(param_tensors)
            if tensor is None:
                continue
            target_suffix = DumpRecordBuilder.parse_tensor_target(
                Data2DBConst.FORWARD, Const.PARAMS, param_name)
            self._add_tensor_data(
                tensor, tensor_params.target_prefix + target_suffix, tensor_params, batch_data)

    def _process_parameters_data(self, tensor_params: "TensorProcessingParams", batch_data):
        """Handle parameters-grad entries: one tensor list per parameter name."""
        for param_name, param_tensors in tensor_params.tensor_data.items():
            tensor = DumpRecordBuilder._first_supported_tensor(param_tensors)
            if tensor is None:
                continue
            target_name = tensor_params.target_prefix + f".{param_name}"
            self._add_tensor_data(tensor, target_name, tensor_params, batch_data)

    def _process_backward_data(self, tensor_params: "TensorProcessingParams", batch_data):
        """Handle backward entries: inputs and outputs."""
        self._add_indexed_tensors(Const.INPUT, Data2DBConst.BACKWARD, tensor_params, batch_data)
        self._add_indexed_tensors(Const.OUTPUT, Data2DBConst.BACKWARD, tensor_params, batch_data)

    def _process_dump_file(self, json_path, step, rank):
        """Import a single ``dump.json`` file for the given step and rank."""
        data = load_json(json_path)
        if not isinstance(data, dict) or not isinstance(data.get('data'), dict):
            return

        batch_data = []

        for ori_key, tensor_data in data['data'].items():
            if not isinstance(ori_key, str) or not isinstance(tensor_data, dict):
                continue
            metric_type, full_key = self._determine_metric_type(ori_key, tensor_data)
            if not metric_type:
                continue

            metric_id = self.db.get_metric_id(metric_type)
            target_prefix, vpp_stage, mstep = DumpRecordBuilder.extract_target_info(full_key)
            if self.micro_step:
                # Fold the micro step into the step axis; the target itself
                # is then stored with micro step 0.
                current_mstep = step * self.micro_step + mstep
                mstep = 0
            else:
                current_mstep = step

            tensor_params = TensorProcessingParams(
                tensor_data=tensor_data,
                target_prefix=target_prefix,
                vpp_stage=vpp_stage,
                micro_step=mstep,
                step=current_mstep,
                rank=rank,
                metric_id=metric_id,
                metric_type=metric_type
            )

            # Dispatch on the metric type.
            if metric_type in [Data2DBConst.FORWARD, Data2DBConst.RECOMPUTE]:
                self._process_forward_data(tensor_params, batch_data)
            elif metric_type == Data2DBConst.PARAMETERS_GRAD:
                self._process_parameters_data(tensor_params, batch_data)
            elif metric_type == Data2DBConst.BACKWARD:
                self._process_backward_data(tensor_params, batch_data)

            # One entry may append several rows, so the length can jump over
            # an exact multiple of BATCH_SIZE; flush as soon as it is reached.
            if len(batch_data) >= Data2DBConst.BATCH_SIZE:
                self.db.batch_insert_targets()
                self.db.batch_insert_data(batch_data)
                batch_data = []

        # Flush whatever is left.
        self.db.batch_insert_targets()
        self.db.batch_insert_data(batch_data)
377
+
378
+
379
def _data2db_service_parser(parser):
    """Register the command-line options of the data2db command on ``parser``."""
    # (flag, keyword arguments) pairs, registered in declaration order.
    option_specs = (
        ('--db', {
            'type': str,
            'required': True,
            'help': 'Path to SQLite database file',
        }),
        ('--data', {
            'type': str,
            'required': True,
            'help': 'Path to dump output directory',
        }),
        ('--mapping', {
            'type': str,
            'default': None,
            'help': 'Path to optional JSON mapping file',
        }),
        ('--micro_step', {
            'type': int,
            'default': None,
            'help': 'Specific micro step value to split data in one step (must be between 1 and 10000)',
        }),
    )
    for flag, options in option_specs:
        parser.add_argument(flag, **options)
388
+
389
+
390
def _data2db_command(args):
    """Entry point of the data2db command: import a dump directory into SQLite.

    Args:
        args: parsed arguments providing ``data``, ``db``, ``mapping`` and
            ``micro_step``.

    Raises:
        ValueError: if ``args.micro_step`` fails validation.
    """
    data_path = args.data
    check_file_or_directory_path(data_path, isdir=True, is_strict=True)

    # Validate and load every input before touching the output location, so
    # that a bad argument cannot delete an existing database.
    micro_step = args.micro_step
    validate_micro_step(micro_step)
    mapping = load_mapping(args.mapping)

    create_directory(args.db)
    db_path = os.path.join(args.db, "monitor_metrics.db")

    if os.path.exists(db_path):
        logger.warning(f"Existing path {db_path} will be overwritten")
        remove_path(db_path)

    db = DumpDB(db_path)
    builder = DumpRecordBuilder(
        db, data_path, mapping=mapping, micro_step=micro_step)
    builder.import_data()

    recursive_chmod(args.db)
@@ -1,17 +1,18 @@
1
- # Copyright (c) 2025, Huawei Technologies Co., Ltd.
2
- # All rights reserved.
1
+ # -------------------------------------------------------------------------
2
+ # This file is part of the MindStudio project.
3
+ # Copyright (c) 2025 Huawei Technologies Co.,Ltd.
3
4
  #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
5
+ # MindStudio is licensed under Mulan PSL v2.
6
+ # You can use this software according to the terms and conditions of the Mulan PSL v2.
7
+ # You may obtain a copy of Mulan PSL v2 at:
7
8
  #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
+ # http://license.coscl.org.cn/MulanPSL2
9
10
  #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
11
+ # THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
12
+ # EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
13
+ # MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
14
+ # See the Mulan PSL v2 for more details.
15
+ # -------------------------------------------------------------------------
15
16
 
16
17
  import gc
17
18
  import os
@@ -21,7 +22,7 @@ from collections import defaultdict
21
22
 
22
23
  from msprobe.core.common.runtime import Runtime
23
24
  from msprobe.core.common.utils import Const, ThreadSafe
24
- from msprobe.core.data_dump.data_processor.base import (ModuleBackwardInputsOutputs, ModuleForwardInputsOutputs)
25
+ from msprobe.core.dump.data_dump.data_processor.base import (ModuleBackwardInputsOutputs, ModuleForwardInputsOutputs)
25
26
 
26
27
 
27
28
  class HookSet:
@@ -45,20 +46,17 @@ class BaseHookManager(ABC):
45
46
  inner_api_count = defaultdict(int)
46
47
  hook_handle_dict = {}
47
48
  params_grad_info = {}
49
+ grad_hook_call = {}
48
50
 
49
51
  def __init__(self, data_collector, config):
50
52
  self.data_collector = data_collector
51
53
  self.config = config
54
+ self.current_pid = self._pid
52
55
 
53
56
  @property
54
57
  def _pid(self):
55
58
  return os.getpid()
56
59
 
57
- @property
58
- @abstractmethod
59
- def _is_recompute(self):
60
- pass
61
-
62
60
  @staticmethod
63
61
  def reset_status():
64
62
  BaseHookManager.inner_switch = defaultdict(bool)
@@ -82,6 +80,15 @@ class BaseHookManager(ABC):
82
80
  if hasattr(module, 'msprobe_input_kwargs') and tid in module.msprobe_input_kwargs:
83
81
  del module.msprobe_input_kwargs[tid]
84
82
 
83
@staticmethod
def _get_grad_hook_call_index(ori_name, param_name):
    """Return the call index recorded for ``ori_name`` in ``grad_hook_call``.

    The first call for ``ori_name`` stores ``[0, param_name]`` and returns 0.
    Later calls increment the counter only when ``param_name`` equals the
    stored parameter name, then return the current counter.
    """
    registry = BaseHookManager.grad_hook_call
    if ori_name in registry:
        record = registry[ori_name]
        if record[1] == param_name:
            record[0] += 1
        return record[0]
    # First time this name is seen: start counting at zero.
    registry[ori_name] = [0, param_name]
    return 0
91
+
85
92
  @staticmethod
86
93
  @abstractmethod
87
94
  def _no_grad_context():
@@ -103,7 +110,7 @@ class BaseHookManager(ABC):
103
110
  pass
104
111
 
105
112
  @abstractmethod
106
- def build_hook(self):
113
+ def build_hook(self, hook_type, name):
107
114
  pass
108
115
 
109
116
  @abstractmethod
@@ -115,7 +122,7 @@ class BaseHookManager(ABC):
115
122
  pass
116
123
 
117
124
  @abstractmethod
118
- def _register_backward_pre_hook(self, module, full_backward_name, output):
125
+ def _register_backward_pre_hook(self, module, full_backward_name, args, kwargs, output):
119
126
  pass
120
127
 
121
128
  @abstractmethod
@@ -126,44 +133,17 @@ class BaseHookManager(ABC):
126
133
  def _need_exchange(self, module):
127
134
  pass
128
135
 
136
+ @abstractmethod
129
137
  def _register_param_hook(self, name, module, params_dict):
130
- ori_name = name.rsplit(Const.SEP, 2)[0]
131
- grad_name = ori_name + Const.SEP + Const.PARAMS_GRAD
132
- # 首次执行前向hook时,添加params_grad_name属性,并注册参数hook
133
- setattr(module, 'params_grad_name', grad_name)
134
- # data_mode为forward时,不注册参数hook
135
- if not (Const.FORWARD in self.config.data_mode and Const.BACKWARD not in self.config.data_mode):
136
- for param_name, param in params_dict.items():
137
- if param.requires_grad:
138
- name = ori_name + Const.SEP + param_name
139
- old_handle = BaseHookManager.hook_handle_dict.get(name)
140
- if old_handle and hasattr(old_handle, "remove"):
141
- old_handle.remove()
142
- handle = param.register_hook(self._build_grad_hook(ori_name, param_name))
143
- BaseHookManager.hook_handle_dict[name] = handle
144
-
145
- def _init_params_grad_info(self, module, params_dict):
146
- '''
147
- 初始化参数梯度信息, 在前向hook结束后, 将参数梯度信息写入cache_data中用于占位
148
- '''
149
- if not params_dict:
150
- return
151
- if not (Const.FORWARD in self.config.data_mode and Const.BACKWARD not in self.config.data_mode):
152
- grad_name = module.params_grad_name if hasattr(module, 'params_grad_name') else None
153
- # 判断是否已经在cache_data中进行了占位, 若没有则先写入cache_data中
154
- if not BaseHookManager.params_grad_info.get(grad_name):
155
- data_info = {grad_name: {key: [None] for key, value in params_dict.items() if value.requires_grad}}
156
- # 当模块中的参数有requires_grad属性为True时,才会进行梯度计算,此时才需要占位
157
- if data_info.get(grad_name):
158
- # 将grad_name的data_info先写入cache_data中, 梯度计算后再更新
159
- self.data_collector.handle_data(grad_name, data_info,
160
- flush=self.data_collector.data_processor.is_terminated)
161
- self.data_collector.params_grad_record[grad_name] = True
162
- # 记录当前模块的参数梯度信息已占位
163
- BaseHookManager.params_grad_info[grad_name] = True
138
+ pass
139
+
140
def is_child_process(self):
    """Tell whether the running process differs from the recorded one.

    Compares ``self.current_pid`` (assigned from ``self._pid`` in
    ``__init__``) with a fresh read of ``self._pid``.

    Returns:
        True when the two process ids differ, False when they match.
    """
    recorded_pid = self.current_pid
    live_pid = self._pid
    return recorded_pid != live_pid
164
142
 
165
143
  def _should_execute_hook(self, hook_type, tid, is_forward=True):
166
144
  is_api_hook = hook_type == Const.API
145
+ if self.is_child_process():
146
+ return False
167
147
  if BaseHookManager.inner_switch[tid]:
168
148
  return False
169
149
  if not is_api_hook and not Runtime.is_running:
@@ -174,21 +154,6 @@ class BaseHookManager(ABC):
174
154
  return False
175
155
  return True
176
156
 
177
- def _build_grad_hook(self, ori_name, param_name):
178
- def hook_fn(grad):
179
- tid = threading.get_ident()
180
- if not self._should_execute_hook(Const.MODULE, tid):
181
- return
182
- with ThreadSafe():
183
- original_state = self.ensure_gc_enabled()
184
- BaseHookManager.inner_switch[tid] = True
185
- self.data_collector.params_data_collect(ori_name, param_name, self._pid, grad)
186
- BaseHookManager.inner_switch[tid] = False
187
- self.restore_gc_state(original_state)
188
- return
189
-
190
- return hook_fn
191
-
192
157
  def _build_forward_pre_hook(self, hook_type, api_name):
193
158
  def forward_pre_hook(module, args, kwargs=None):
194
159
  if hook_type == Const.MODULE:
@@ -220,8 +185,7 @@ class BaseHookManager(ABC):
220
185
  full_forward_name,
221
186
  module,
222
187
  self._pid,
223
- module_input_output,
224
- self._is_recompute
188
+ module_input_output
225
189
  )
226
190
  BaseHookManager.inner_switch[tid] = False
227
191
  self.restore_gc_state(original_state)
@@ -257,7 +221,7 @@ class BaseHookManager(ABC):
257
221
  if hook_type == Const.API:
258
222
  full_forward_name = api_name + str(self._get_count(api_name)) + Const.SEP + Const.FORWARD
259
223
  full_backward_name = api_name + str(self._get_count(api_name)) + Const.SEP + Const.BACKWARD
260
- output = self._register_backward_pre_hook(module, full_backward_name, output)
224
+ output = self._register_backward_pre_hook(module, full_backward_name, args, kwargs, output)
261
225
 
262
226
  with self._no_grad_context():
263
227
  if hook_type == Const.MODULE:
@@ -270,28 +234,19 @@ class BaseHookManager(ABC):
270
234
  api_name,
271
235
  module,
272
236
  self._pid,
273
- module_input_output,
274
- self._is_recompute
237
+ module_input_output
275
238
  )
276
- self._init_params_grad_info(module, params_dict)
277
239
  else:
278
240
  self.data_collector.update_api_or_module_name(full_forward_name)
279
241
  self.data_collector.forward_output_data_collect(
280
242
  full_forward_name,
281
243
  module,
282
244
  self._pid,
283
- module_input_output,
284
- self._is_recompute
245
+ module_input_output
285
246
  )
286
247
  self._add_count(api_name)
287
248
  BaseHookManager.inner_api_count[tid] -= 1
288
249
  self._clear_input_kwargs(module, tid)
289
-
290
- if self.data_collector.if_return_forward_new_output():
291
- forward_new_output = self.data_collector.get_forward_new_output()
292
- BaseHookManager.inner_switch[tid] = False
293
- return forward_new_output
294
-
295
250
  BaseHookManager.inner_switch[tid] = False
296
251
  self.restore_gc_state(original_state)
297
252
  return output
@@ -318,12 +273,8 @@ class BaseHookManager(ABC):
318
273
  full_name,
319
274
  module,
320
275
  self._pid,
321
- module_input_output,
322
- self._is_recompute
276
+ module_input_output
323
277
  )
324
- if hook_type == Const.MODULE:
325
- params_dict = self._get_params_dict(module)
326
- self.data_collector.params_data_collect_in_bw_hook(params_dict, full_name)
327
278
  BaseHookManager.inner_switch[tid] = False
328
279
  self.restore_gc_state(original_state)
329
280
 
@@ -0,0 +1,34 @@
1
+ # -------------------------------------------------------------------------
2
+ # This file is part of the MindStudio project.
3
+ # Copyright (c) 2025 Huawei Technologies Co.,Ltd.
4
+ #
5
+ # MindStudio is licensed under Mulan PSL v2.
6
+ # You can use this software according to the terms and conditions of the Mulan PSL v2.
7
+ # You may obtain a copy of Mulan PSL v2 at:
8
+ #
9
+ # http://license.coscl.org.cn/MulanPSL2
10
+ #
11
+ # THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
12
+ # EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
13
+ # MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
14
+ # See the Mulan PSL v2 for more details.
15
+ # -------------------------------------------------------------------------
16
+
17
+ import os
18
+
19
+ from msprobe.core.common.file_utils import save_json
20
+
21
+
22
def create_kernel_config_json(dump_path, cur_rank):
    """Write a kernel dump configuration JSON file under ``dump_path``.

    The file is named ``kernel_config.json`` when ``cur_rank`` is the empty
    string, and ``kernel_config_<cur_rank>.json`` otherwise.

    Args:
        dump_path: Directory the file is written to; the same value is stored
            under the ``dump_path`` key of the configuration.
        cur_rank: Value used as the file-name suffix, or ``''`` for no suffix.

    Returns:
        The path of the configuration file handed to ``save_json``.
    """
    if cur_rank == '':
        file_name = "kernel_config.json"
    else:
        file_name = f"kernel_config_{cur_rank}.json"
    target_path = os.path.join(dump_path, file_name)

    dump_settings = {
        "dump_list": [],
        "dump_path": dump_path,
        "dump_mode": "all",
        "dump_op_switch": "on",
    }
    save_json(target_path, {"dump": dump_settings}, indent=4)
    return target_path