mindstudio-probe 8.3.2__py3-none-any.whl → 26.0.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (689) hide show
  1. {mindstudio_probe-8.3.2.dist-info → mindstudio_probe-26.0.0a1.dist-info}/METADATA +26 -14
  2. mindstudio_probe-26.0.0a1.dist-info/RECORD +498 -0
  3. {mindstudio_probe-8.3.2.dist-info → mindstudio_probe-26.0.0a1.dist-info}/WHEEL +1 -1
  4. mindstudio_probe-26.0.0a1.dist-info/entry_points.txt +5 -0
  5. mindstudio_probe-26.0.0a1.dist-info/licenses/LICENSE +124 -0
  6. mindstudio_probe-26.0.0a1.dist-info/top_level.txt +2 -0
  7. msprobe/__init__.py +12 -13
  8. msprobe/config.json +9 -31
  9. msprobe/core/__init__.py +12 -11
  10. msprobe/core/acc_check/acc_check_cli.py +145 -0
  11. msprobe/core/common/const.py +97 -38
  12. msprobe/core/common/db_manager.py +133 -12
  13. msprobe/core/common/decorator.py +12 -11
  14. msprobe/core/common/exceptions.py +12 -11
  15. msprobe/core/common/file_utils.py +101 -25
  16. msprobe/core/common/framework_adapter.py +36 -25
  17. msprobe/core/common/global_lock.py +12 -11
  18. msprobe/core/common/inplace_op_checker.py +12 -11
  19. msprobe/core/common/log.py +22 -11
  20. msprobe/core/common/megatron_utils.py +566 -11
  21. msprobe/core/common/parallel_state.py +12 -11
  22. msprobe/core/common/runtime.py +12 -11
  23. msprobe/core/common/utils.py +41 -41
  24. msprobe/core/compare/acc_compare.py +361 -104
  25. msprobe/core/compare/atb_data_compare.py +422 -0
  26. msprobe/core/compare/auto_compare.py +134 -0
  27. msprobe/core/compare/check.py +14 -17
  28. msprobe/core/compare/compare_cli.py +72 -149
  29. msprobe/core/compare/config.py +12 -13
  30. msprobe/core/compare/diff_analyze/first_diff_analyze.py +28 -15
  31. msprobe/core/compare/diff_analyze/ignore_op_list.yaml +3 -0
  32. msprobe/core/compare/find_first/analyzer.py +18 -18
  33. msprobe/core/compare/find_first/graph.py +12 -11
  34. msprobe/core/compare/find_first/utils.py +13 -12
  35. msprobe/core/compare/indicator_analysis/__init__.py +15 -0
  36. msprobe/core/compare/indicator_analysis/algorithm.py +363 -0
  37. msprobe/core/compare/indicator_analysis/api_data.py +141 -0
  38. msprobe/core/compare/indicator_analysis/calculator.py +181 -0
  39. msprobe/core/compare/indicator_analysis/utils.py +116 -0
  40. msprobe/core/compare/layer_mapping/__init__.py +12 -11
  41. msprobe/core/compare/layer_mapping/data_scope_parser.py +20 -11
  42. msprobe/core/compare/layer_mapping/layer_mapping.py +14 -13
  43. msprobe/core/compare/layer_mapping/postprocess_pass.py +13 -11
  44. msprobe/core/compare/merge_result/merge_result.py +12 -11
  45. msprobe/core/compare/merge_result/merge_result_cli.py +12 -11
  46. msprobe/core/compare/merge_result/utils.py +12 -11
  47. msprobe/core/compare/multiprocessing_compute.py +13 -14
  48. msprobe/core/compare/npy_compare.py +13 -11
  49. msprobe/core/compare/offline_data_compare.py +160 -0
  50. msprobe/core/compare/stats_diff_calc.py +39 -0
  51. msprobe/core/compare/torchair_acc_cmp.py +764 -0
  52. msprobe/core/compare/torchair_cmp_utils.py +338 -0
  53. msprobe/core/compare/utils.py +140 -49
  54. msprobe/core/config_check/__init__.py +12 -11
  55. msprobe/core/config_check/checkers/__init__.py +12 -11
  56. msprobe/core/config_check/checkers/base_checker.py +15 -14
  57. msprobe/core/config_check/checkers/dataset_checker.py +13 -12
  58. msprobe/core/config_check/checkers/env_args_checker.py +13 -12
  59. msprobe/core/config_check/checkers/hyperparameter_checker.py +16 -15
  60. msprobe/core/config_check/checkers/pip_checker.py +15 -15
  61. msprobe/core/config_check/checkers/random_checker.py +13 -12
  62. msprobe/core/config_check/checkers/weights_checker.py +14 -12
  63. msprobe/core/config_check/ckpt_compare/ckpt_comparator.py +13 -17
  64. msprobe/core/config_check/ckpt_compare/megatron_loader.py +13 -12
  65. msprobe/core/config_check/ckpt_compare/metrics.py +12 -11
  66. msprobe/core/config_check/config_check_cli.py +18 -17
  67. msprobe/core/config_check/config_checker.py +16 -14
  68. msprobe/core/config_check/resource/dependency.yaml +15 -12
  69. msprobe/core/config_check/resource/env.yaml +12 -11
  70. msprobe/core/config_check/utils/hyperparameter_parser.py +12 -11
  71. msprobe/core/config_check/utils/utils.py +12 -11
  72. msprobe/core/{data_dump → dump/api_dump}/api_registry.py +12 -11
  73. msprobe/core/{common_config.py → dump/common_config.py} +13 -24
  74. msprobe/core/dump/data_dump/data_collector.py +257 -0
  75. msprobe/core/{data_dump → dump/data_dump}/data_processor/base.py +45 -36
  76. msprobe/core/{data_dump → dump/data_dump}/data_processor/factory.py +33 -25
  77. msprobe/core/{data_dump → dump/data_dump}/data_processor/mindspore_processor.py +37 -113
  78. msprobe/core/{data_dump → dump/data_dump}/data_processor/pytorch_processor.py +364 -131
  79. msprobe/core/{data_dump → dump/data_dump}/json_writer.py +24 -31
  80. msprobe/core/{data_dump → dump/data_dump}/scope.py +12 -13
  81. msprobe/core/{debugger → dump/debugger}/precision_debugger.py +15 -23
  82. msprobe/core/dump/dump2db/db_utils.py +215 -0
  83. msprobe/core/dump/dump2db/dump2db.py +409 -0
  84. msprobe/core/{hook_manager.py → dump/hook_manager.py} +38 -87
  85. msprobe/core/dump/kernel_dump/kernel_config.py +34 -0
  86. msprobe/core/{service.py → dump/service.py} +43 -27
  87. msprobe/core/install_deps/install_deps.py +51 -0
  88. msprobe/core/monitor/anomaly_processor.py +13 -11
  89. msprobe/core/monitor/csv2db.py +73 -93
  90. msprobe/core/monitor/db_utils.py +140 -205
  91. msprobe/core/monitor/utils.py +18 -17
  92. msprobe/core/monitor_v2/__init__.py +20 -0
  93. msprobe/core/monitor_v2/base.py +83 -0
  94. msprobe/core/monitor_v2/cc.py +287 -0
  95. msprobe/core/monitor_v2/factory.py +81 -0
  96. msprobe/core/monitor_v2/module.py +201 -0
  97. msprobe/core/monitor_v2/optimizer.py +245 -0
  98. msprobe/core/monitor_v2/param.py +154 -0
  99. msprobe/core/monitor_v2/trainer.py +326 -0
  100. msprobe/core/monitor_v2/utils.py +122 -0
  101. msprobe/core/monitor_v2/weight_grad.py +419 -0
  102. msprobe/core/monitor_v2/writer.py +162 -0
  103. msprobe/core/overflow_check/abnormal_scene.py +12 -11
  104. msprobe/core/overflow_check/api_info.py +12 -11
  105. msprobe/core/overflow_check/checker.py +12 -11
  106. msprobe/core/overflow_check/filter.py +13 -11
  107. msprobe/core/overflow_check/level.py +12 -11
  108. msprobe/core/overflow_check/utils.py +12 -11
  109. msprobe/core/single_save/single_comparator.py +12 -11
  110. msprobe/core/single_save/single_saver.py +12 -11
  111. msprobe/infer/__init__.py +16 -0
  112. msprobe/infer/offline/__init__.py +16 -0
  113. msprobe/infer/offline/compare/__init__.py +16 -0
  114. msprobe/infer/offline/compare/msquickcmp/__init__.py +16 -0
  115. msprobe/infer/offline/compare/msquickcmp/adapter_cli/__init__.py +16 -0
  116. msprobe/infer/offline/compare/msquickcmp/adapter_cli/args_adapter.py +46 -0
  117. msprobe/infer/offline/compare/msquickcmp/atc/__init__.py +16 -0
  118. msprobe/infer/offline/compare/msquickcmp/atc/atc_utils.py +98 -0
  119. msprobe/infer/offline/compare/msquickcmp/cmp_process.py +328 -0
  120. msprobe/infer/offline/compare/msquickcmp/common/__init__.py +16 -0
  121. msprobe/infer/offline/compare/msquickcmp/common/args_check.py +112 -0
  122. msprobe/infer/offline/compare/msquickcmp/common/convert.py +74 -0
  123. msprobe/infer/offline/compare/msquickcmp/common/dump_data.py +121 -0
  124. msprobe/infer/offline/compare/msquickcmp/common/dynamic_argument_bean.py +39 -0
  125. msprobe/infer/offline/compare/msquickcmp/common/utils.py +669 -0
  126. msprobe/infer/offline/compare/msquickcmp/config.ini +6 -0
  127. msprobe/infer/offline/compare/msquickcmp/dump/__init__.py +16 -0
  128. msprobe/infer/offline/compare/msquickcmp/dump/args_adapter.py +50 -0
  129. msprobe/infer/offline/compare/msquickcmp/dump/dump_process.py +91 -0
  130. msprobe/infer/offline/compare/msquickcmp/install_aclruntime_aisbench.sh +180 -0
  131. msprobe/infer/offline/compare/msquickcmp/main.py +199 -0
  132. msprobe/infer/offline/compare/msquickcmp/net_compare/__init__.py +16 -0
  133. msprobe/infer/offline/compare/msquickcmp/net_compare/net_compare.py +277 -0
  134. msprobe/infer/offline/compare/msquickcmp/npu/__init__.py +16 -0
  135. msprobe/infer/offline/compare/msquickcmp/npu/npu_dump_data.py +558 -0
  136. msprobe/infer/offline/compare/msquickcmp/npu/om_parser.py +416 -0
  137. msprobe/infer/offline/compare/msquickcmp/onnx_model/__init__.py +16 -0
  138. msprobe/infer/offline/compare/msquickcmp/onnx_model/onnx_dump_data.py +374 -0
  139. msprobe/infer/utils/__init__.py +15 -0
  140. msprobe/infer/utils/acc_cmp.py +94 -0
  141. msprobe/infer/utils/check/__init__.py +37 -0
  142. msprobe/infer/utils/check/args_checker.py +35 -0
  143. msprobe/infer/utils/check/checker.py +227 -0
  144. msprobe/infer/utils/check/dict_checker.py +78 -0
  145. msprobe/infer/utils/check/func_wrapper.py +96 -0
  146. msprobe/infer/utils/check/list_checker.py +56 -0
  147. msprobe/infer/utils/check/number_checker.py +64 -0
  148. msprobe/infer/utils/check/obj_checker.py +41 -0
  149. msprobe/infer/utils/check/path_checker.py +249 -0
  150. msprobe/infer/utils/check/rule.py +126 -0
  151. msprobe/infer/utils/check/string_checker.py +66 -0
  152. msprobe/infer/utils/cmp_algorithm.py +261 -0
  153. msprobe/infer/utils/constants.py +112 -0
  154. msprobe/infer/utils/file_open_check.py +337 -0
  155. msprobe/infer/utils/util.py +177 -0
  156. msprobe/mindspore/__init__.py +14 -13
  157. msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +14 -13
  158. msprobe/mindspore/api_accuracy_checker/api_info.py +12 -11
  159. msprobe/mindspore/api_accuracy_checker/api_runner.py +12 -11
  160. msprobe/mindspore/api_accuracy_checker/base_compare_algorithm.py +12 -11
  161. msprobe/mindspore/api_accuracy_checker/bench_functions/flash_attention_score.py +12 -11
  162. msprobe/mindspore/api_accuracy_checker/bench_functions/fusion_operator.py +12 -11
  163. msprobe/mindspore/api_accuracy_checker/checker_support_api.yaml +12 -11
  164. msprobe/mindspore/api_accuracy_checker/cmd_parser.py +15 -14
  165. msprobe/mindspore/api_accuracy_checker/compute_element.py +12 -11
  166. msprobe/mindspore/api_accuracy_checker/data_manager.py +13 -11
  167. msprobe/mindspore/api_accuracy_checker/main.py +12 -11
  168. msprobe/mindspore/api_accuracy_checker/multi_api_accuracy_checker.py +14 -12
  169. msprobe/mindspore/api_accuracy_checker/multi_data_manager.py +13 -11
  170. msprobe/mindspore/api_accuracy_checker/torch_mindtorch_importer.py +12 -11
  171. msprobe/mindspore/api_accuracy_checker/type_mapping.py +12 -11
  172. msprobe/mindspore/api_accuracy_checker/utils.py +12 -11
  173. msprobe/mindspore/common/const.py +15 -74
  174. msprobe/mindspore/common/log.py +12 -11
  175. msprobe/mindspore/common/utils.py +30 -15
  176. msprobe/mindspore/compare/common_dir_compare.py +21 -23
  177. msprobe/mindspore/compare/distributed_compare.py +18 -16
  178. msprobe/mindspore/compare/ms_compare.py +14 -14
  179. msprobe/mindspore/compare/ms_graph_compare.py +26 -20
  180. msprobe/mindspore/compare/utils.py +14 -12
  181. msprobe/mindspore/{cell_processor.py → dump/cell_processor.py} +15 -14
  182. msprobe/mindspore/{debugger → dump/debugger}/debugger_config.py +12 -30
  183. msprobe/mindspore/{debugger → dump/debugger}/precision_debugger.py +43 -45
  184. msprobe/mindspore/dump/{cell_dump_process.py → dump_processor/cell_dump_process.py} +31 -17
  185. msprobe/mindspore/dump/{cell_dump_with_insert_gradient.py → dump_processor/cell_dump_with_insert_gradient.py} +18 -14
  186. msprobe/mindspore/dump/{dump_tool_factory.py → dump_processor/dump_tool_factory.py} +16 -15
  187. msprobe/mindspore/dump/{graph_mode_cell_dump.py → dump_processor/graph_mode_cell_dump.py} +16 -15
  188. msprobe/mindspore/dump/{graph_tensor_dump.py → dump_processor/graph_tensor_dump.py} +134 -133
  189. msprobe/mindspore/dump/{hook_cell → dump_processor/hook_cell}/api_register.py +15 -14
  190. msprobe/mindspore/dump/{hook_cell → dump_processor/hook_cell}/hook_cell.py +12 -11
  191. msprobe/mindspore/dump/{hook_cell → dump_processor/hook_cell}/ms_hook_manager.py +47 -20
  192. msprobe/mindspore/dump/{hook_cell → dump_processor/hook_cell}/primitive_hooks.py +14 -13
  193. msprobe/mindspore/dump/{hook_cell → dump_processor/hook_cell}/support_wrap_ops.yaml +13 -11
  194. msprobe/mindspore/dump/{jit_dump.py → dump_processor/jit_dump.py} +14 -13
  195. msprobe/mindspore/dump/{kernel_graph_dump.py → dump_processor/kernel_graph_dump.py} +13 -12
  196. msprobe/mindspore/dump/{kernel_kbyk_dump.py → dump_processor/kernel_kbyk_dump.py} +13 -12
  197. msprobe/mindspore/{exception_dump → dump/exception_dump}/exception_dump_tool_factory.py +14 -13
  198. msprobe/mindspore/{exception_dump → dump/exception_dump}/kernel_graph_exception_dump.py +13 -12
  199. msprobe/mindspore/{mindspore_service.py → dump/mindspore_service.py} +18 -17
  200. msprobe/mindspore/dump/mindtorch/__init__.py +19 -0
  201. msprobe/mindspore/dump/ms_config.py +105 -0
  202. msprobe/mindspore/{overflow_check → dump/overflow_check}/kernel_graph_overflow_check.py +13 -12
  203. msprobe/mindspore/{overflow_check → dump/overflow_check}/overflow_check_tool_factory.py +14 -13
  204. msprobe/mindspore/dump/task_handler_factory.py +43 -0
  205. msprobe/mindspore/monitor/common_func.py +12 -11
  206. msprobe/mindspore/monitor/data_writers.py +12 -11
  207. msprobe/mindspore/monitor/distributed/wrap_distributed.py +93 -39
  208. msprobe/mindspore/monitor/features.py +12 -11
  209. msprobe/mindspore/monitor/module_hook.py +19 -22
  210. msprobe/mindspore/monitor/optimizer_collect.py +29 -25
  211. msprobe/mindspore/monitor/utils.py +13 -11
  212. msprobe/msaccucmp/advisor/__init__.py +16 -0
  213. msprobe/msaccucmp/advisor/advisor_const.py +65 -0
  214. msprobe/msaccucmp/advisor/advisor_result.py +73 -0
  215. msprobe/msaccucmp/advisor/compare_advisor.py +99 -0
  216. msprobe/msaccucmp/advisor/input_advisor.py +66 -0
  217. msprobe/msaccucmp/advisor/node_advisor.py +68 -0
  218. msprobe/msaccucmp/advisor/overflow_advisor.py +58 -0
  219. msprobe/msaccucmp/algorithm_manager/__init__.py +16 -0
  220. msprobe/msaccucmp/algorithm_manager/algorithm_manager.py +464 -0
  221. msprobe/msaccucmp/algorithm_manager/algorithm_parameter.py +42 -0
  222. msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_AccumulatedRelativeError.py +46 -0
  223. msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_CosineSimilarity.py +58 -0
  224. msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_KullbackLeiblerDivergence.py +84 -0
  225. msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_MaxAbsoluteError.py +41 -0
  226. msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_MaxRelativeError.py +46 -0
  227. msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_MeanAbsoluteError.py +41 -0
  228. msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_MeanRelativeError.py +46 -0
  229. msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_RelativeEuclideanDistance.py +46 -0
  230. msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_RootMeanSquareError.py +40 -0
  231. msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_StandardDeviation.py +47 -0
  232. msprobe/msaccucmp/cmp_utils/__init__.py +16 -0
  233. msprobe/msaccucmp/cmp_utils/common.py +113 -0
  234. msprobe/msaccucmp/cmp_utils/constant/__init__.py +16 -0
  235. msprobe/msaccucmp/cmp_utils/constant/compare_error.py +81 -0
  236. msprobe/msaccucmp/cmp_utils/constant/const_manager.py +530 -0
  237. msprobe/msaccucmp/cmp_utils/file_utils.py +497 -0
  238. msprobe/msaccucmp/cmp_utils/log.py +257 -0
  239. msprobe/msaccucmp/cmp_utils/multi_process/__init__.py +16 -0
  240. msprobe/msaccucmp/cmp_utils/multi_process/multi_convert_process.py +140 -0
  241. msprobe/msaccucmp/cmp_utils/multi_process/progress.py +78 -0
  242. msprobe/msaccucmp/cmp_utils/path_check.py +274 -0
  243. msprobe/msaccucmp/cmp_utils/reg_manager.py +98 -0
  244. msprobe/msaccucmp/cmp_utils/tlv_parse.py +279 -0
  245. msprobe/msaccucmp/cmp_utils/utils.py +356 -0
  246. msprobe/msaccucmp/cmp_utils/utils_type.py +63 -0
  247. msprobe/msaccucmp/compare_vector.py +48 -0
  248. msprobe/msaccucmp/conversion/__init__.py +16 -0
  249. msprobe/msaccucmp/conversion/data_conversion.py +277 -0
  250. msprobe/msaccucmp/conversion/dtype_conversion.py +99 -0
  251. msprobe/msaccucmp/conversion/shape_format_conversion.py +477 -0
  252. msprobe/msaccucmp/conversion/tensor_conversion.py +369 -0
  253. msprobe/msaccucmp/dump_data_conversion.py +46 -0
  254. msprobe/msaccucmp/dump_parse/__init__.py +16 -0
  255. msprobe/msaccucmp/dump_parse/big_dump_data.py +317 -0
  256. msprobe/msaccucmp/dump_parse/dump.py +423 -0
  257. msprobe/msaccucmp/dump_parse/dump_data_object.py +322 -0
  258. msprobe/msaccucmp/dump_parse/dump_data_parser.py +436 -0
  259. msprobe/msaccucmp/dump_parse/dump_utils.py +246 -0
  260. msprobe/msaccucmp/dump_parse/ffts_parser.py +137 -0
  261. msprobe/msaccucmp/dump_parse/mapping.py +62 -0
  262. msprobe/msaccucmp/dump_parse/nano_dump_data.py +392 -0
  263. msprobe/msaccucmp/dump_parse/proto_dump_data.py +308 -0
  264. msprobe/msaccucmp/dump_parser.py +90 -0
  265. msprobe/msaccucmp/format_manager/__init__.py +16 -0
  266. msprobe/msaccucmp/format_manager/builtin_format_convert/convert_FRACTAL_NZ_to_NCHW.py +53 -0
  267. msprobe/msaccucmp/format_manager/builtin_format_convert/convert_FRACTAL_NZ_to_ND.py +52 -0
  268. msprobe/msaccucmp/format_manager/builtin_format_convert/convert_FRACTAL_NZ_to_NHWC.py +53 -0
  269. msprobe/msaccucmp/format_manager/builtin_format_convert/convert_FRACTAL_Z_to_HWCN.py +47 -0
  270. msprobe/msaccucmp/format_manager/builtin_format_convert/convert_FRACTAL_Z_to_NCHW.py +47 -0
  271. msprobe/msaccucmp/format_manager/builtin_format_convert/convert_HWCN_to_FRACTAL_Z.py +89 -0
  272. msprobe/msaccucmp/format_manager/builtin_format_convert/convert_HWCN_to_NCHW.py +37 -0
  273. msprobe/msaccucmp/format_manager/builtin_format_convert/convert_HWCN_to_NHWC.py +37 -0
  274. msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NC1HWC0_to_HWCN.py +43 -0
  275. msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NC1HWC0_to_NCHW.py +48 -0
  276. msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NC1HWC0_to_NHWC.py +43 -0
  277. msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NCHW_to_FRACTAL_Z.py +87 -0
  278. msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NCHW_to_NHWC.py +37 -0
  279. msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NDC1HWC0_to_NCDHW.py +48 -0
  280. msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NDC1HWC0_to_ND.py +44 -0
  281. msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NHWC_to_FRACTAL_Z.py +87 -0
  282. msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NHWC_to_HWCN.py +37 -0
  283. msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NHWC_to_NCHW.py +37 -0
  284. msprobe/msaccucmp/format_manager/format_manager.py +307 -0
  285. msprobe/msaccucmp/inplace_layer_process.py +186 -0
  286. msprobe/msaccucmp/msaccucmp.py +532 -0
  287. msprobe/msaccucmp/mscmp_advisor.py +128 -0
  288. msprobe/msaccucmp/overflow/__init__.py +16 -0
  289. msprobe/msaccucmp/overflow/overflow_analyse.py +305 -0
  290. msprobe/msaccucmp/overflow/overflow_detection.py +143 -0
  291. msprobe/msaccucmp/pytorch_cmp/__init__.py +16 -0
  292. msprobe/msaccucmp/pytorch_cmp/compare_pytorch.py +389 -0
  293. msprobe/msaccucmp/pytorch_cmp/hdf5_parser.py +377 -0
  294. msprobe/msaccucmp/pytorch_cmp/pytorch_dump_data.py +461 -0
  295. msprobe/msaccucmp/shape_conversion.py +41 -0
  296. msprobe/msaccucmp/vector_cmp/__init__.py +16 -0
  297. msprobe/msaccucmp/vector_cmp/batch_compare.py +197 -0
  298. msprobe/msaccucmp/vector_cmp/compare_detail/__init__.py +16 -0
  299. msprobe/msaccucmp/vector_cmp/compare_detail/compare_detail.py +245 -0
  300. msprobe/msaccucmp/vector_cmp/compare_detail/detail.py +182 -0
  301. msprobe/msaccucmp/vector_cmp/compare_detail/detail_writer.py +580 -0
  302. msprobe/msaccucmp/vector_cmp/fusion_manager/__init__.py +16 -0
  303. msprobe/msaccucmp/vector_cmp/fusion_manager/compare_fusion_op.py +588 -0
  304. msprobe/msaccucmp/vector_cmp/fusion_manager/compare_npu_vs_npu.py +339 -0
  305. msprobe/msaccucmp/vector_cmp/fusion_manager/compare_result.py +326 -0
  306. msprobe/msaccucmp/vector_cmp/fusion_manager/compare_rule.py +156 -0
  307. msprobe/msaccucmp/vector_cmp/fusion_manager/fusion_op.py +204 -0
  308. msprobe/msaccucmp/vector_cmp/fusion_manager/fusion_rule_parser.py +635 -0
  309. msprobe/msaccucmp/vector_cmp/fusion_manager/quant_filter.py +187 -0
  310. msprobe/msaccucmp/vector_cmp/range_manager/__init__.py +16 -0
  311. msprobe/msaccucmp/vector_cmp/range_manager/range_manager.py +100 -0
  312. msprobe/msaccucmp/vector_cmp/range_manager/range_mode.py +94 -0
  313. msprobe/msaccucmp/vector_cmp/range_manager/select_mode.py +86 -0
  314. msprobe/msaccucmp/vector_cmp/vector_comparison.py +535 -0
  315. msprobe/msprobe.py +101 -130
  316. msprobe/overflow_check/__init__.py +15 -0
  317. msprobe/{nan_analyze → overflow_check}/analyzer.py +38 -27
  318. msprobe/{nan_analyze → overflow_check}/graph.py +30 -27
  319. msprobe/{nan_analyze → overflow_check}/utils.py +15 -14
  320. msprobe/pytorch/__init__.py +20 -14
  321. msprobe/pytorch/aclgraph_dump/__init__.py +45 -0
  322. msprobe/pytorch/aclgraph_dump/_meta.py +26 -0
  323. msprobe/pytorch/api_accuracy_checker/{run_ut/run_ut.py → acc_check/acc_check.py} +50 -45
  324. msprobe/pytorch/api_accuracy_checker/{run_ut/run_ut_utils.py → acc_check/acc_check_utils.py} +201 -30
  325. msprobe/pytorch/api_accuracy_checker/{run_ut → acc_check}/data_generate.py +56 -16
  326. msprobe/pytorch/api_accuracy_checker/{run_ut/multi_run_ut.py → acc_check/multi_acc_check.py} +32 -47
  327. msprobe/pytorch/api_accuracy_checker/{run_ut → acc_check}/run_overflow_check.py +19 -18
  328. msprobe/pytorch/api_accuracy_checker/common/config.py +22 -20
  329. msprobe/pytorch/api_accuracy_checker/common/utils.py +72 -13
  330. msprobe/pytorch/api_accuracy_checker/compare/algorithm.py +41 -11
  331. msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +23 -14
  332. msprobe/pytorch/api_accuracy_checker/compare/compare.py +45 -32
  333. msprobe/pytorch/api_accuracy_checker/compare/compare_column.py +12 -11
  334. msprobe/pytorch/api_accuracy_checker/compare/compare_input.py +14 -12
  335. msprobe/pytorch/api_accuracy_checker/compare/compare_utils.py +14 -12
  336. msprobe/pytorch/api_accuracy_checker/precision_standard/absolute_threshold.py +12 -11
  337. msprobe/pytorch/api_accuracy_checker/precision_standard/accumulative_error_compare.py +12 -11
  338. msprobe/pytorch/api_accuracy_checker/precision_standard/base_standard.py +21 -19
  339. msprobe/pytorch/api_accuracy_checker/precision_standard/benchmark_compare.py +14 -13
  340. msprobe/pytorch/api_accuracy_checker/precision_standard/binary_consistency.py +12 -11
  341. msprobe/pytorch/api_accuracy_checker/precision_standard/standard_config.py +60 -11
  342. msprobe/pytorch/api_accuracy_checker/precision_standard/standard_register.py +27 -16
  343. msprobe/pytorch/api_accuracy_checker/precision_standard/thousandth_standard.py +13 -11
  344. msprobe/pytorch/api_accuracy_checker/precision_standard/ulp_compare.py +39 -18
  345. msprobe/pytorch/bench_functions/__init__.py +12 -11
  346. msprobe/pytorch/bench_functions/apply_adam.py +12 -11
  347. msprobe/pytorch/bench_functions/apply_adam_w.py +12 -11
  348. msprobe/pytorch/bench_functions/confusion_transpose.py +12 -11
  349. msprobe/pytorch/bench_functions/fast_gelu.py +12 -11
  350. msprobe/pytorch/bench_functions/group_norm_silu.py +12 -11
  351. msprobe/pytorch/bench_functions/layer_norm_eval.py +12 -11
  352. msprobe/pytorch/bench_functions/linear.py +12 -11
  353. msprobe/pytorch/bench_functions/matmul_backward.py +12 -11
  354. msprobe/pytorch/bench_functions/mish.py +12 -11
  355. msprobe/pytorch/bench_functions/moe_gating_top_k_softmax.py +12 -11
  356. msprobe/pytorch/bench_functions/npu_fusion_attention.py +12 -11
  357. msprobe/pytorch/bench_functions/rms_norm.py +12 -11
  358. msprobe/pytorch/bench_functions/rotary_mul.py +12 -11
  359. msprobe/pytorch/bench_functions/scaled_mask_softmax.py +12 -11
  360. msprobe/pytorch/bench_functions/sort_v2.py +12 -11
  361. msprobe/pytorch/bench_functions/swiglu.py +12 -11
  362. msprobe/pytorch/common/__init__.py +12 -11
  363. msprobe/pytorch/common/log.py +12 -11
  364. msprobe/pytorch/common/parse_json.py +12 -11
  365. msprobe/pytorch/common/utils.py +52 -19
  366. msprobe/pytorch/compare/distributed_compare.py +13 -13
  367. msprobe/pytorch/compare/match.py +12 -11
  368. msprobe/pytorch/compare/pt_compare.py +14 -20
  369. msprobe/pytorch/compare/pt_diff_analyze.py +12 -11
  370. msprobe/pytorch/compare/utils.py +12 -11
  371. msprobe/pytorch/{hook_module → dump/api_dump}/api_register.py +18 -16
  372. msprobe/pytorch/{hook_module → dump/api_dump}/hook_module.py +14 -13
  373. msprobe/pytorch/{hook_module → dump/api_dump}/pt_hook_manager.py +68 -23
  374. msprobe/pytorch/{hook_module → dump/api_dump}/register_optimizer_hook.py +13 -11
  375. msprobe/pytorch/{hook_module → dump/api_dump}/script_wrapper.py +17 -14
  376. msprobe/pytorch/{hook_module → dump/api_dump}/utils.py +12 -11
  377. msprobe/pytorch/{debugger → dump/debugger}/debugger_config.py +23 -38
  378. msprobe/pytorch/dump/debugger/precision_debugger.py +130 -0
  379. msprobe/pytorch/{function_factory.py → dump/function_factory.py} +12 -11
  380. msprobe/pytorch/dump/module_dump/hook_wrapper.py +17 -13
  381. msprobe/pytorch/dump/module_dump/module_dump.py +16 -15
  382. msprobe/pytorch/dump/module_dump/{module_processer.py → module_processor.py} +54 -42
  383. msprobe/pytorch/dump/pt_config.py +128 -0
  384. msprobe/pytorch/{pytorch_service.py → dump/pytorch_service.py} +22 -21
  385. msprobe/pytorch/monitor/csv2tb.py +13 -11
  386. msprobe/pytorch/monitor/data_writers.py +13 -11
  387. msprobe/pytorch/monitor/distributed/wrap_distributed.py +13 -11
  388. msprobe/pytorch/monitor/features.py +12 -11
  389. msprobe/pytorch/monitor/module_hook.py +67 -59
  390. msprobe/pytorch/monitor/module_metric.py +13 -11
  391. msprobe/pytorch/monitor/optimizer_collect.py +37 -35
  392. msprobe/pytorch/monitor/utils.py +13 -11
  393. msprobe/pytorch/monitor/visualizer.py +12 -11
  394. msprobe/pytorch/torchair_dump/__init__.py +17 -0
  395. msprobe/pytorch/torchair_dump/torchair_dump.py +114 -0
  396. msprobe/scripts/atb/config_example.json +10 -0
  397. msprobe/scripts/atb/load_atb_probe.sh +101 -0
  398. msprobe/scripts/atb/unload_atb_probe.sh +27 -0
  399. msprobe/scripts/build_msaccucmp.sh +186 -0
  400. msprobe/scripts/conf/help.info +6 -0
  401. msprobe/scripts/conf/version.info +3 -0
  402. msprobe/scripts/run_script/common.sh +538 -0
  403. msprobe/scripts/run_script/main_msaccucmp.sh +232 -0
  404. msprobe/visualization/__init__.py +12 -11
  405. msprobe/visualization/builder/__init__.py +12 -11
  406. msprobe/visualization/builder/graph_builder.py +45 -30
  407. msprobe/visualization/builder/graph_merger.py +53 -32
  408. msprobe/visualization/builder/msprobe_adapter.py +34 -44
  409. msprobe/visualization/compare/__init__.py +12 -11
  410. msprobe/visualization/compare/graph_comparator.py +63 -51
  411. msprobe/visualization/compare/mode_adapter.py +28 -113
  412. msprobe/visualization/db_utils.py +133 -22
  413. msprobe/visualization/graph/__init__.py +12 -11
  414. msprobe/visualization/graph/base_node.py +15 -27
  415. msprobe/visualization/graph/distributed_analyzer.py +97 -40
  416. msprobe/visualization/graph/graph.py +14 -16
  417. msprobe/visualization/graph/node_colors.py +34 -31
  418. msprobe/visualization/graph/node_op.py +12 -11
  419. msprobe/visualization/graph_service.py +580 -205
  420. msprobe/visualization/utils.py +278 -31
  421. tb_graph_ascend/secure_build.py +175 -0
  422. tb_graph_ascend/server/__init__.py +15 -0
  423. tb_graph_ascend/server/app/__init__.py +15 -0
  424. tb_graph_ascend/server/app/model/__init__.py +15 -0
  425. tb_graph_ascend/server/app/model/hierarchy.py +348 -0
  426. tb_graph_ascend/server/app/model/layout_hierarchy_model.py +69 -0
  427. tb_graph_ascend/server/app/model/match_nodes_model.py +573 -0
  428. tb_graph_ascend/server/app/repositories/__init__.py +15 -0
  429. tb_graph_ascend/server/app/repositories/graph_repo_base.py +32 -0
  430. tb_graph_ascend/server/app/repositories/graph_repo_db.py +879 -0
  431. tb_graph_ascend/server/app/repositories/graph_repo_vis.py +83 -0
  432. tb_graph_ascend/server/app/service/__init__.py +18 -0
  433. tb_graph_ascend/server/app/service/graph_service_base.py +158 -0
  434. tb_graph_ascend/server/app/service/graph_service_db.py +438 -0
  435. tb_graph_ascend/server/app/service/graph_service_factory.py +54 -0
  436. tb_graph_ascend/server/app/service/graph_service_vis.py +480 -0
  437. tb_graph_ascend/server/app/utils/__init__.py +15 -0
  438. tb_graph_ascend/server/app/utils/constant.py +80 -0
  439. tb_graph_ascend/server/app/utils/file_check_wrapper.py +46 -0
  440. tb_graph_ascend/server/app/utils/global_state.py +95 -0
  441. tb_graph_ascend/server/app/utils/graph_utils.py +661 -0
  442. tb_graph_ascend/server/app/utils/i18n.py +153 -0
  443. tb_graph_ascend/server/app/utils/request_method.py +46 -0
  444. tb_graph_ascend/server/app/views/__init__.py +15 -0
  445. tb_graph_ascend/server/app/views/graph_views.py +304 -0
  446. tb_graph_ascend/server/plugin.py +108 -0
  447. tb_graph_ascend/server/static/index.html +9250 -0
  448. tb_graph_ascend/server/static/index.js +21 -0
  449. tb_graph_ascend/setup.py +57 -0
  450. mindstudio_probe-8.3.2.dist-info/LICENSE +0 -201
  451. mindstudio_probe-8.3.2.dist-info/RECORD +0 -491
  452. mindstudio_probe-8.3.2.dist-info/entry_points.txt +0 -2
  453. mindstudio_probe-8.3.2.dist-info/top_level.txt +0 -1
  454. msprobe/CMakeLists.txt +0 -5
  455. msprobe/README.md +0 -203
  456. msprobe/core/advisor/advisor.py +0 -129
  457. msprobe/core/advisor/advisor_const.py +0 -58
  458. msprobe/core/advisor/advisor_result.py +0 -58
  459. msprobe/core/compare/find_first/data_processor.py +0 -35
  460. msprobe/core/compare/highlight.py +0 -390
  461. msprobe/core/data_dump/data_collector.py +0 -356
  462. msprobe/core/grad_probe/constant.py +0 -90
  463. msprobe/core/grad_probe/grad_compare.py +0 -187
  464. msprobe/core/grad_probe/utils.py +0 -105
  465. msprobe/core/kernel_dump/kernel_config.py +0 -33
  466. msprobe/docs/01.installation.md +0 -250
  467. msprobe/docs/02.config_introduction.md +0 -221
  468. msprobe/docs/03.config_examples.md +0 -281
  469. msprobe/docs/04.kernel_dump_PyTorch.md +0 -73
  470. msprobe/docs/05.data_dump_PyTorch.md +0 -518
  471. msprobe/docs/06.data_dump_MindSpore.md +0 -618
  472. msprobe/docs/07.accuracy_checker_PyTorch.md +0 -310
  473. msprobe/docs/09.accuracy_checker_MindSpore.md +0 -120
  474. msprobe/docs/10.accuracy_compare_PyTorch.md +0 -637
  475. msprobe/docs/11.accuracy_compare_MindSpore.md +0 -769
  476. msprobe/docs/12.overflow_check_PyTorch.md +0 -82
  477. msprobe/docs/13.overflow_check_MindSpore.md +0 -33
  478. msprobe/docs/14.data_parse_PyTorch.md +0 -282
  479. msprobe/docs/15.free_benchmarking_PyTorch.md +0 -169
  480. msprobe/docs/16.free_benchmarking_MindSpore.md +0 -159
  481. msprobe/docs/17.grad_probe.md +0 -205
  482. msprobe/docs/18.online_dispatch.md +0 -89
  483. msprobe/docs/19.monitor.md +0 -753
  484. msprobe/docs/20.monitor_performance_baseline.md +0 -52
  485. msprobe/docs/21.visualization_PyTorch.md +0 -519
  486. msprobe/docs/22.visualization_MindSpore.md +0 -515
  487. msprobe/docs/23.generate_operator_PyTorch.md +0 -107
  488. msprobe/docs/24.code_mapping_Mindspore.md +0 -29
  489. msprobe/docs/25.tool_function_introduction.md +0 -29
  490. msprobe/docs/26.data_dump_PyTorch_baseline.md +0 -48
  491. msprobe/docs/27.dump_json_instruction.md +0 -795
  492. msprobe/docs/28.debugger_save_instruction.md +0 -288
  493. msprobe/docs/28.kernel_dump_MindSpore.md +0 -69
  494. msprobe/docs/29.data_dump_MSAdapter.md +0 -235
  495. msprobe/docs/30.overflow_check_MSAdapter.md +0 -31
  496. msprobe/docs/31.config_check.md +0 -107
  497. msprobe/docs/32.ckpt_compare.md +0 -69
  498. msprobe/docs/33.generate_operator_MindSpore.md +0 -181
  499. msprobe/docs/34.RL_collect.md +0 -101
  500. msprobe/docs/35.nan_analyze.md +0 -73
  501. msprobe/docs/36.calculation_result_change.md +0 -75
  502. msprobe/docs/FAQ.md +0 -232
  503. msprobe/docs/S02.report_free_benchmarking_validation_performance_baseline.md +0 -146
  504. msprobe/docs/accuracy_checker_MindSpore/accuracy_checker_MindSpore_baseline.md +0 -14
  505. msprobe/docs/data_dump_MindSpore/data_dump_MindSpore_baseline.md +0 -33
  506. msprobe/docs/data_dump_MindSpore/dynamic_graph_quick_start_example.md +0 -217
  507. msprobe/docs/img/BLOOM-7B_1.png +0 -0
  508. msprobe/docs/img/BLOOM-7B_2.png +0 -0
  509. msprobe/docs/img/BLOOM-7B_3.png +0 -0
  510. msprobe/docs/img/BLOOM-7B_4.png +0 -0
  511. msprobe/docs/img/GPT-3_1.png +0 -0
  512. msprobe/docs/img/GPT-3_2.png +0 -0
  513. msprobe/docs/img/GPT-3_3.png +0 -0
  514. msprobe/docs/img/GPT-3_4.png +0 -0
  515. msprobe/docs/img/GPT-3_5.png +0 -0
  516. msprobe/docs/img/GPT-3_6.png +0 -0
  517. msprobe/docs/img/GPT-3_7.png +0 -0
  518. msprobe/docs/img/GPT-3_8.png +0 -0
  519. msprobe/docs/img/YOLOV5S_1.png +0 -0
  520. msprobe/docs/img/YOLOV5S_2.png +0 -0
  521. msprobe/docs/img/accuracy_checking_details.png +0 -0
  522. msprobe/docs/img/accuracy_checking_result.png +0 -0
  523. msprobe/docs/img/api_precision_compare_details.png +0 -0
  524. msprobe/docs/img/api_precision_compare_result.png +0 -0
  525. msprobe/docs/img/auto_analyze_log.png +0 -0
  526. msprobe/docs/img/compare_result.png +0 -0
  527. msprobe/docs/img/compare_result_pkl.png +0 -0
  528. msprobe/docs/img/compare_result_pkl_md5.png.png +0 -0
  529. msprobe/docs/img/cpu_info.png +0 -0
  530. msprobe/docs/img/free_benchmark.png +0 -0
  531. msprobe/docs/img/free_benchmark_framework.png +0 -0
  532. msprobe/docs/img/grad_probe_image-1.png +0 -0
  533. msprobe/docs/img/grad_probe_image-2.png +0 -0
  534. msprobe/docs/img/grad_probe_image-3.png +0 -0
  535. msprobe/docs/img/grad_probe_image-4.png +0 -0
  536. msprobe/docs/img/grad_probe_image.png +0 -0
  537. msprobe/docs/img/merge_result.png +0 -0
  538. msprobe/docs/img/module_compare.png +0 -0
  539. msprobe/docs/img/monitor/cpu_info.png +0 -0
  540. msprobe/docs/img/monitor/step_count_per_record.png +0 -0
  541. msprobe/docs/img/ms_dump.png +0 -0
  542. msprobe/docs/img/ms_layer.png +0 -0
  543. msprobe/docs/img/pt_dump.png +0 -0
  544. msprobe/docs/img/save_compare_result_sample.png +0 -0
  545. msprobe/docs/img/visualization/fuzzy_match_ms.png +0 -0
  546. msprobe/docs/img/visualization/fuzzy_match_pt.png +0 -0
  547. msprobe/docs/img/visualization/proxy.png +0 -0
  548. msprobe/docs/img/visualization/tensorboard_1.png +0 -0
  549. msprobe/docs/img/visualization/tensorboard_2.png +0 -0
  550. msprobe/docs/img/visualization/vis_browser_1.png +0 -0
  551. msprobe/docs/img/visualization/vis_browser_2.png +0 -0
  552. msprobe/docs/img/visualization/vis_match_info.png +0 -0
  553. msprobe/docs/img/visualization/vis_precision_info.png +0 -0
  554. msprobe/docs/img/visualization/vis_search_info.png +0 -0
  555. msprobe/docs/img/visualization/vis_show_info.png +0 -0
  556. msprobe/docs/img/visualization/vis_showcase.png +0 -0
  557. msprobe/docs/img/visualization/vis_unmatch_info.png +0 -0
  558. msprobe/docs/visualization/GPTModel.png +0 -0
  559. msprobe/docs/visualization/ParallelMLP.png +0 -0
  560. msprobe/docs/visualization/layer_mapping_example.md +0 -132
  561. msprobe/docs/visualization/mapping.png +0 -0
  562. msprobe/docs/visualization/mapping1.png +0 -0
  563. msprobe/docs/visualization/mindspeed_llamafactoary_img/1.png +0 -0
  564. msprobe/docs/visualization/mindspeed_llamafactoary_img/2.png +0 -0
  565. msprobe/docs/visualization/mindspeed_llamafactoary_img/3.png +0 -0
  566. msprobe/docs/visualization/mindspeed_llamafactoary_img/4.png +0 -0
  567. msprobe/docs/visualization/mindspeed_llamafactoary_img/5.png +0 -0
  568. msprobe/docs/visualization/mindspeed_llamafactoary_img/6.png +0 -0
  569. msprobe/docs/visualization/mindspeed_llamafactoary_img/7.png +0 -0
  570. msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory-qwen25vl.txt +0 -59
  571. msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory1.png +0 -0
  572. msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory2.png +0 -0
  573. msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed-mm-qwen25vl.txt +0 -80
  574. msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed1.png +0 -0
  575. msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed2.png +0 -0
  576. msprobe/docs/visualization/mindspeed_llamafactory_mapping.md +0 -330
  577. msprobe/docs/visualization/module_name.png +0 -0
  578. msprobe/docs/visualization/module_name1.png +0 -0
  579. msprobe/docs/visualization/no_mapping.png +0 -0
  580. msprobe/docs/visualization/no_mapping1.png +0 -0
  581. msprobe/docs/visualization/no_mapping_analyze.png +0 -0
  582. msprobe/docs/visualization/top_layer.png +0 -0
  583. msprobe/mindspore/api_accuracy_checker/generate_op_script/op_generator.py +0 -460
  584. msprobe/mindspore/api_accuracy_checker/generate_op_script/operator_replication.template +0 -2081
  585. msprobe/mindspore/code_mapping/bind.py +0 -283
  586. msprobe/mindspore/code_mapping/cmd_parser.py +0 -40
  587. msprobe/mindspore/code_mapping/graph.py +0 -49
  588. msprobe/mindspore/code_mapping/graph_parser.py +0 -211
  589. msprobe/mindspore/code_mapping/main.py +0 -24
  590. msprobe/mindspore/code_mapping/processor.py +0 -34
  591. msprobe/mindspore/dym_loader/hook_dynamic_loader.cpp +0 -111
  592. msprobe/mindspore/dym_loader/hook_dynamic_loader.h +0 -52
  593. msprobe/mindspore/free_benchmark/api_pynative_self_check.py +0 -257
  594. msprobe/mindspore/free_benchmark/common/config.py +0 -27
  595. msprobe/mindspore/free_benchmark/common/handler_params.py +0 -31
  596. msprobe/mindspore/free_benchmark/common/utils.py +0 -100
  597. msprobe/mindspore/free_benchmark/data/support_wrap_ops.yaml +0 -638
  598. msprobe/mindspore/free_benchmark/handler/base_handler.py +0 -105
  599. msprobe/mindspore/free_benchmark/handler/check_handler.py +0 -55
  600. msprobe/mindspore/free_benchmark/handler/fix_handler.py +0 -51
  601. msprobe/mindspore/free_benchmark/handler/handler_factory.py +0 -36
  602. msprobe/mindspore/free_benchmark/perturbation/add_noise.py +0 -82
  603. msprobe/mindspore/free_benchmark/perturbation/base_perturbation.py +0 -45
  604. msprobe/mindspore/free_benchmark/perturbation/bit_noise.py +0 -78
  605. msprobe/mindspore/free_benchmark/perturbation/exchange_value.py +0 -77
  606. msprobe/mindspore/free_benchmark/perturbation/improve_precision.py +0 -56
  607. msprobe/mindspore/free_benchmark/perturbation/no_change.py +0 -27
  608. msprobe/mindspore/free_benchmark/perturbation/perturbation_factory.py +0 -46
  609. msprobe/mindspore/free_benchmark/self_check_tool_factory.py +0 -51
  610. msprobe/mindspore/grad_probe/global_context.py +0 -127
  611. msprobe/mindspore/grad_probe/grad_analyzer.py +0 -260
  612. msprobe/mindspore/grad_probe/grad_monitor.py +0 -42
  613. msprobe/mindspore/grad_probe/grad_stat_csv.py +0 -161
  614. msprobe/mindspore/grad_probe/hook.py +0 -115
  615. msprobe/mindspore/grad_probe/utils.py +0 -43
  616. msprobe/mindspore/mindtorch/__init__.py +0 -18
  617. msprobe/mindspore/ms_config.py +0 -153
  618. msprobe/mindspore/task_handler_factory.py +0 -44
  619. msprobe/nan_analyze/__init__.py +0 -14
  620. msprobe/pytorch/api_accuracy_checker/generate_op_script/config_op.json +0 -9
  621. msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +0 -480
  622. msprobe/pytorch/api_accuracy_checker/generate_op_script/operator_replication.template +0 -567
  623. msprobe/pytorch/debugger/precision_debugger.py +0 -181
  624. msprobe/pytorch/free_benchmark/__init__.py +0 -23
  625. msprobe/pytorch/free_benchmark/common/constant.py +0 -85
  626. msprobe/pytorch/free_benchmark/common/counter.py +0 -87
  627. msprobe/pytorch/free_benchmark/common/enums.py +0 -80
  628. msprobe/pytorch/free_benchmark/common/params.py +0 -152
  629. msprobe/pytorch/free_benchmark/common/utils.py +0 -143
  630. msprobe/pytorch/free_benchmark/compare/grad_saver.py +0 -215
  631. msprobe/pytorch/free_benchmark/compare/single_benchmark.py +0 -121
  632. msprobe/pytorch/free_benchmark/main.py +0 -123
  633. msprobe/pytorch/free_benchmark/perturbed_layers/base_layer.py +0 -28
  634. msprobe/pytorch/free_benchmark/perturbed_layers/layer_factory.py +0 -56
  635. msprobe/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py +0 -107
  636. msprobe/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py +0 -121
  637. msprobe/pytorch/free_benchmark/perturbed_layers/npu/change_value.py +0 -89
  638. msprobe/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py +0 -87
  639. msprobe/pytorch/free_benchmark/perturbed_layers/npu/no_change.py +0 -43
  640. msprobe/pytorch/free_benchmark/perturbed_layers/npu/npu_base_layser.py +0 -60
  641. msprobe/pytorch/free_benchmark/perturbed_layers/run_cpu.py +0 -34
  642. msprobe/pytorch/free_benchmark/result_handlers/base_handler.py +0 -252
  643. msprobe/pytorch/free_benchmark/result_handlers/check_handler.py +0 -54
  644. msprobe/pytorch/free_benchmark/result_handlers/fix_handler.py +0 -40
  645. msprobe/pytorch/free_benchmark/result_handlers/handler_factory.py +0 -45
  646. msprobe/pytorch/free_benchmark/result_handlers/preheat_handler.py +0 -181
  647. msprobe/pytorch/grad_probe/__init__.py +0 -0
  648. msprobe/pytorch/grad_probe/grad_monitor.py +0 -108
  649. msprobe/pytorch/grad_probe/grad_stat_csv.py +0 -160
  650. msprobe/pytorch/hook_module/__init__.py +0 -16
  651. msprobe/pytorch/hook_module/wrap_aten.py +0 -111
  652. msprobe/pytorch/online_dispatch/__init__.py +0 -19
  653. msprobe/pytorch/online_dispatch/compare.py +0 -224
  654. msprobe/pytorch/online_dispatch/dispatch.py +0 -332
  655. msprobe/pytorch/online_dispatch/dump_compare.py +0 -179
  656. msprobe/pytorch/online_dispatch/single_compare.py +0 -412
  657. msprobe/pytorch/online_dispatch/torch_ops_config.yaml +0 -58
  658. msprobe/pytorch/online_dispatch/utils.py +0 -158
  659. msprobe/pytorch/parse_tool/__init__.py +0 -0
  660. msprobe/pytorch/parse_tool/cli.py +0 -31
  661. msprobe/pytorch/parse_tool/lib/__init__.py +0 -0
  662. msprobe/pytorch/parse_tool/lib/compare.py +0 -253
  663. msprobe/pytorch/parse_tool/lib/config.py +0 -50
  664. msprobe/pytorch/parse_tool/lib/file_desc.py +0 -45
  665. msprobe/pytorch/parse_tool/lib/interactive_cli.py +0 -97
  666. msprobe/pytorch/parse_tool/lib/parse_exception.py +0 -54
  667. msprobe/pytorch/parse_tool/lib/parse_tool.py +0 -161
  668. msprobe/pytorch/parse_tool/lib/utils.py +0 -299
  669. msprobe/pytorch/parse_tool/lib/visualization.py +0 -85
  670. msprobe/pytorch/pt_config.py +0 -299
  671. /msprobe/core/{grad_probe → dump}/__init__.py +0 -0
  672. /msprobe/{mindspore/code_mapping → core/dump/api_dump}/__init__.py +0 -0
  673. /msprobe/{mindspore/debugger → core/dump/data_dump}/__init__.py +0 -0
  674. /msprobe/{mindspore/exception_dump → core/dump/data_dump/data_processor}/__init__.py +0 -0
  675. /msprobe/{mindspore/free_benchmark → core/dump/debugger}/__init__.py +0 -0
  676. /msprobe/{mindspore/free_benchmark/common → core/dump/kernel_dump}/__init__.py +0 -0
  677. /msprobe/mindspore/{free_benchmark/handler → dump/debugger}/__init__.py +0 -0
  678. /msprobe/mindspore/{grad_probe → dump/dump_processor}/__init__.py +0 -0
  679. /msprobe/mindspore/{overflow_check → dump/exception_dump}/__init__.py +0 -0
  680. /msprobe/mindspore/{mindtorch → dump/mindtorch}/mindtorch_adaptor.py +0 -0
  681. /msprobe/{pytorch/api_accuracy_checker/run_ut → mindspore/dump/overflow_check}/__init__.py +0 -0
  682. /msprobe/{pytorch/debugger → mindspore/monitor}/__init__.py +0 -0
  683. /msprobe/{pytorch/free_benchmark/common → msaccucmp}/__init__.py +0 -0
  684. /msprobe/pytorch/api_accuracy_checker/{run_ut → acc_check}/.keep +0 -0
  685. /msprobe/pytorch/{free_benchmark/perturbed_layers → api_accuracy_checker/acc_check}/__init__.py +0 -0
  686. /msprobe/pytorch/api_accuracy_checker/{run_ut → acc_check}/torch_ut_setting.json +0 -0
  687. /msprobe/pytorch/{free_benchmark/perturbed_layers/npu → dump/api_dump}/__init__.py +0 -0
  688. /msprobe/pytorch/{hook_module → dump/api_dump}/support_wrap_ops.yaml +0 -0
  689. /msprobe/pytorch/{free_benchmark/result_handlers → dump/debugger}/__init__.py +0 -0
@@ -1,356 +0,0 @@
1
- # Copyright (c) 2024-2025, Huawei Technologies Co., Ltd.
2
- # All rights reserved.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
- import atexit
17
- import os
18
- import threading
19
- import traceback
20
-
21
- from msprobe.core.data_dump.scope import ScopeFactory
22
- from msprobe.core.data_dump.json_writer import DataWriter
23
- from msprobe.core.common.log import logger
24
- from msprobe.core.common.const import Const
25
- from msprobe.core.data_dump.data_processor.factory import DataProcessorFactory
26
- from msprobe.core.common.megatron_utils import MegatronStepInfo, get_micro_step, is_megatron
27
-
28
-
29
- def build_data_collector(config):
30
- return DataCollector(config)
31
-
32
-
33
- class DataCollector:
34
- tasks_need_tensor_data = [Const.OVERFLOW_CHECK, Const.TENSOR, Const.FREE_BENCHMARK]
35
- level_without_construct = [Const.LEVEL_L1, Const.LEVEL_L2]
36
-
37
- def __init__(self, config):
38
- self.config = config
39
- self.data_writer = DataWriter()
40
- self.data_processor = DataProcessorFactory.create_processor(self.config, self.data_writer)
41
- self.module_processor = DataProcessorFactory.get_module_processor(self.config.framework)
42
- self.module_count = {}
43
- self.scope = ScopeFactory(self.config).build_scope()
44
- self.backward_module_names = {}
45
- self.params_grad_record = {}
46
- self.optimizer_status = ""
47
- self.optimizer_status_first_start = {Const.OPTIMIZER: True, Const.CLIP_GRAD: True}
48
- atexit.register(self.write_json_at_exit)
49
-
50
- @property
51
- def dump_data_dir(self):
52
- return self.data_writer.dump_tensor_data_dir
53
-
54
- @property
55
- def dump_file_path(self):
56
- return self.data_writer.dump_file_path
57
-
58
- @staticmethod
59
- def check_scope_and_pid(scope, name, pid):
60
- return (not scope or scope.check(name)) and pid == os.getpid()
61
-
62
- @staticmethod
63
- def set_is_recomputable(data_info, is_recompute):
64
- if data_info and len(data_info) == 1 and is_recompute is not None: # 正常情况下data_info的长度应改为1
65
- data_info[list(data_info.keys())[0]]["is_recompute"] = is_recompute
66
-
67
- def reset_status(self):
68
- self.optimizer_status = ""
69
- self.optimizer_status_first_start = {Const.OPTIMIZER: True, Const.CLIP_GRAD: True}
70
- self.data_writer.reset_cache()
71
- self.backward_module_names.clear()
72
-
73
- def if_return_forward_new_output(self):
74
- return self.data_processor.if_return_forward_new_output()
75
-
76
- def get_forward_new_output(self):
77
- return self.data_processor.get_forward_new_output()
78
-
79
- def update_api_or_module_name(self, api_or_module_name):
80
- self.data_processor.update_api_or_module_name(api_or_module_name)
81
-
82
- def write_json(self):
83
- self.data_writer.write_json()
84
-
85
- def write_json_at_exit(self):
86
- if self.config.async_dump and self.config.task == Const.TENSOR:
87
- self.data_processor.dump_async_data()
88
- self.data_writer.write_json()
89
-
90
- def update_data(self, name, data_info):
91
- msg = f"msprobe is collecting data on {name}."
92
- if self.config.task == Const.OVERFLOW_CHECK:
93
- if self.data_processor.has_overflow:
94
- msg += " Overflow detected."
95
- logger.warning(msg)
96
- self.data_writer.update_data(data_info)
97
- return
98
- logger.debug(msg)
99
- self.data_writer.update_data(data_info)
100
-
101
- def call_stack_collect(self, name):
102
- stack_info = self.data_processor.analyze_api_call_stack(name)
103
- self.data_writer.update_stack(name, stack_info)
104
-
105
- def forward_input_data_collect(self, name, module, pid, module_input_output, is_recompute=None):
106
- try:
107
-
108
- if self.config.task == Const.FREE_BENCHMARK:
109
- backward_name = name.replace(Const.FORWARD, Const.BACKWARD)
110
- if self.check_scope_and_pid(self.scope, backward_name, pid):
111
- self.data_processor.analyze_forward_input(backward_name, module, module_input_output)
112
- return
113
-
114
- if not self.check_scope_and_pid(self.scope, name, pid):
115
- return
116
-
117
- data_info = {}
118
- if self.config.task != Const.STRUCTURE:
119
- data_info = self.data_processor.analyze_forward_input(name, module, module_input_output)
120
- self.set_is_recomputable(data_info, is_recompute)
121
- if self.config.level == Const.LEVEL_L2:
122
- return
123
- self.call_stack_collect(name)
124
- self.handle_data(name, data_info, flush=self.data_processor.is_terminated)
125
-
126
- except Exception as e:
127
- # 取异常类名作为“类型”做去重
128
- error_type = type(e).__name__
129
- tb = traceback.format_exc()
130
- self.data_writer.write_error_log(
131
- f"[ERROR] forward_input_data_collect failed: name={name}, pid={pid}\n{tb}",
132
- error_type=error_type
133
- )
134
-
135
- def forward_output_data_collect(self, name, module, pid, module_input_output, is_recompute=None):
136
- try:
137
-
138
- self.update_construct(name)
139
- if not self.check_scope_and_pid(self.scope, name, pid):
140
- return
141
-
142
- data_info = {}
143
- if self.config.task != Const.STRUCTURE:
144
- data_info = self.data_processor.analyze_forward_output(name, module, module_input_output)
145
- self.set_is_recomputable(data_info, is_recompute)
146
- if self.config.level == Const.LEVEL_L2:
147
- return
148
- self.handle_data(name, data_info, flush=self.data_processor.is_terminated)
149
-
150
- except Exception as e:
151
- # 取异常类名作为“类型”做去重
152
- error_type = type(e).__name__
153
- tb = traceback.format_exc()
154
- self.data_writer.write_error_log(
155
- f"[ERROR] forward_output_data_collect failed: name={name}, pid={pid}\n{tb}",
156
- error_type=error_type
157
- )
158
-
159
- def forward_data_collect_only_tensor(self, name, module, pid, module_input_output):
160
- try:
161
- if not self.check_scope_and_pid(self.scope, name, pid):
162
- return
163
- self.data_processor.analyze_forward(name, module, module_input_output)
164
-
165
- except Exception as e:
166
- # 取异常类名作为“类型”做去重
167
- error_type = type(e).__name__
168
- tb = traceback.format_exc()
169
- self.data_writer.write_error_log(
170
- f"[ERROR] forward_data_collect_only_tensor failed: name={name}, pid={pid}\n{tb}",
171
- error_type=error_type
172
- )
173
-
174
- def forward_data_collect(self, name, module, pid, module_input_output, is_recompute=None):
175
- try:
176
-
177
- self.update_construct(name)
178
- if not self.check_scope_and_pid(self.scope, name, pid):
179
- return
180
- data_info = {}
181
- if self.config.task != Const.STRUCTURE:
182
- data_info = self.data_processor.analyze_forward(name, module, module_input_output)
183
- self.set_is_recomputable(data_info, is_recompute)
184
- self.call_stack_collect(name)
185
- self.handle_data(name, data_info, flush=self.data_processor.is_terminated)
186
-
187
- except Exception as e:
188
- error_type = type(e).__name__
189
- tb = traceback.format_exc()
190
- self.data_writer.write_error_log(
191
- f"[ERROR] forward_data_collect failed: name={name}, pid={pid}\n{tb}",
192
- error_type=error_type
193
- )
194
-
195
- def backward_data_collect_only_tensor(self, name, module, pid, module_input_output, is_recompute=None):
196
- try:
197
- if not self.check_scope_and_pid(self.scope, name, pid):
198
- return
199
- self.data_processor.analyze_backward(name, module, module_input_output)
200
-
201
- except Exception as e:
202
- error_type = type(e).__name__
203
- tb = traceback.format_exc()
204
- self.data_writer.write_error_log(
205
- f"[ERROR] backward_data_collect_only_tensor failed: name={name}, pid={pid}\n{tb}",
206
- error_type=error_type
207
- )
208
-
209
- def backward_data_collect(self, name, module, pid, module_input_output, is_recompute=None):
210
- try:
211
- self.update_construct(name)
212
- if not self.check_scope_and_pid(self.scope, name, pid):
213
- return
214
- data_info = {}
215
- if self.config.task != Const.STRUCTURE:
216
- data_info = self.data_processor.analyze_backward(name, module, module_input_output)
217
- if self.config.level == Const.LEVEL_L2:
218
- return
219
- if data_info and name.split(Const.SEP)[0] in Const.MODULE_PREFIX:
220
- module_name = name.rsplit(Const.SEP, 2)[0]
221
- self.backward_module_names[module_name] = True
222
- self.handle_data(name, data_info, flush=self.data_processor.is_terminated)
223
-
224
- except Exception as e:
225
- error_type = type(e).__name__
226
- tb = traceback.format_exc()
227
- self.data_writer.write_error_log(
228
- f"[ERROR] backward_data_collect failed: name={name}, pid={pid}\n{tb}",
229
- error_type=error_type
230
- )
231
-
232
- def backward_input_data_collect(self, name, module, pid, module_input_output, is_recompute=None):
233
- try:
234
- self.update_construct(name)
235
- if not self.check_scope_and_pid(self.scope, name, pid):
236
- return
237
- data_info = {}
238
- if self.config.task != Const.STRUCTURE:
239
- data_info = self.data_processor.analyze_backward_input(name, module, module_input_output)
240
- self.set_is_recomputable(data_info, is_recompute)
241
- self.handle_data(name, data_info)
242
-
243
- except Exception as e:
244
- error_type = type(e).__name__
245
- tb = traceback.format_exc()
246
- self.data_writer.write_error_log(
247
- f"[ERROR] backward_input_data_collect failed: name={name}, pid={pid}\n{tb}",
248
- error_type=error_type
249
- )
250
-
251
- def backward_output_data_collect(self, name, module, pid, module_input_output, is_recompute=None):
252
- try:
253
- self.update_construct(name)
254
- if not self.check_scope_and_pid(self.scope, name, pid):
255
- return
256
- data_info = {}
257
- if self.config.task != Const.STRUCTURE:
258
- data_info = self.data_processor.analyze_backward_output(name, module, module_input_output)
259
- self.set_is_recomputable(data_info, is_recompute)
260
- self.handle_data(name, data_info)
261
-
262
- except Exception as e:
263
- error_type = type(e).__name__
264
- tb = traceback.format_exc()
265
- self.data_writer.write_error_log(
266
- f"[ERROR] backward_output_data_collect failed: name={name}, pid={pid}\n{tb}",
267
- error_type=error_type
268
- )
269
-
270
- def update_construct(self, name):
271
- if self.config.level not in DataCollector.level_without_construct:
272
- if self.optimizer_status in [Const.OPTIMIZER, Const.CLIP_GRAD]:
273
- if self.optimizer_status_first_start[self.optimizer_status]:
274
- self.data_writer.update_construct(
275
- {self.optimizer_status: None if not is_megatron() else [None, get_micro_step()]})
276
- self.optimizer_status_first_start[self.optimizer_status] = False
277
- self.data_writer.update_construct(
278
- {name: self.optimizer_status if not is_megatron() else [self.optimizer_status, get_micro_step()]})
279
- else:
280
- if self.config.level == Const.LEVEL_MIX and \
281
- not (name.startswith(Const.MODULE) or name.startswith(Const.CELL)):
282
- self.data_writer.update_construct(
283
- {name: self.module_processor.api_parent_node.get(threading.get_ident())}
284
- )
285
- if MegatronStepInfo.is_megatron:
286
- micro_step_number = max(MegatronStepInfo.forward_micro_step, MegatronStepInfo.backward_micro_step)
287
- self.data_writer.update_construct({Const.MEGATRON_MICRO_STEP_NUMBER: micro_step_number})
288
-
289
- self.data_writer.update_construct(self.module_processor.module_node)
290
-
291
- def handle_data(self, name, data_info, flush=False):
292
- if data_info:
293
- self.update_data(name, data_info)
294
- if self.config.async_dump:
295
- return
296
- if not flush:
297
- self.data_writer.flush_data_periodically()
298
- else:
299
- self.write_json()
300
-
301
- def update_dump_paths(self, *args):
302
- self.data_writer.update_dump_paths(*args)
303
-
304
- def initialize_json_file(self, framework=Const.UNKNOWN_FRAMEWORK):
305
- self.data_writer.initialize_json_file(task=self.config.task, level=self.config.level, framework=framework)
306
-
307
- def update_iter(self, current_iter):
308
- self.data_processor.update_iter(current_iter)
309
-
310
- def params_data_collect(self, name, param_name, pid, data):
311
- grad_name = name + Const.SEP + Const.PARAMS_GRAD
312
- self.update_api_or_module_name(grad_name)
313
- if not self.check_scope_and_pid(self.scope, name, pid) and not self.backward_module_names.get(name):
314
- if self.data_writer.cache_data.get("data"):
315
- self.data_writer.cache_data.get("data").pop(grad_name, None)
316
- self.params_grad_record[grad_name] = False
317
- return
318
- data_info = self.data_processor.analyze_params(grad_name, param_name, data)
319
- self.handle_data(grad_name, data_info, flush=self.data_processor.is_terminated)
320
- self.params_grad_record[grad_name] = False
321
-
322
- def params_data_collect_in_bw_hook(self, params_dict, name):
323
- try:
324
- if not params_dict:
325
- return
326
- ori_name = name.rsplit(Const.SEP, 2)[0]
327
- for param_name, param in params_dict.items():
328
- grad_name = ori_name + Const.SEP + Const.PARAMS_GRAD
329
- self.update_api_or_module_name(grad_name)
330
- if self.params_grad_record.get(grad_name, False):
331
- grad = param.grad if hasattr(param, "grad") else None
332
- data_info = self.data_processor.analyze_params(grad_name, param_name, grad)
333
- self.handle_data(grad_name, data_info, flush=self.data_processor.is_terminated)
334
- except Exception as e:
335
- error_type = type(e).__name__
336
- tb = traceback.format_exc()
337
- self.data_writer.write_error_log(
338
- f"[ERROR] params_data_collect_in_bw_hook failed: "
339
- f"name={name}",
340
- error_type=error_type
341
- )
342
-
343
- def debug_data_collect_forward(self, variable, name_with_count):
344
- data_info = self.data_processor.analyze_debug_forward(variable, name_with_count)
345
- name_with_count_category = name_with_count + Const.SEP + Const.DEBUG
346
- self.data_writer.update_debug({name_with_count_category: data_info})
347
-
348
- def debug_data_collect_backward(self, variable, grad_name_with_count):
349
- # prepare all None nested data structure
350
- all_none_data_info = self.data_processor.analyze_element_to_all_none(variable)
351
- grad_name_with_count_category = grad_name_with_count + Const.SEP + Const.DEBUG
352
- self.data_writer.update_debug({grad_name_with_count_category: all_none_data_info})
353
-
354
- # register tensor backward hook
355
- self.data_processor.analyze_debug_backward(variable, grad_name_with_count_category,
356
- self.data_writer.cache_debug['data'])
@@ -1,90 +0,0 @@
1
- # Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
2
- # All rights reserved.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
- class GradConst:
17
-
18
- FRAMEWORKS = {"PyTorch", "MindSpore"}
19
- PYTORCH = "PyTorch"
20
- MindSpore = "MindSpore"
21
-
22
- GRAD_FILE_SUFFIX = {"npy", "pt"}
23
- NPY_SUFFIX = "npy"
24
- PT_SUFFIX = "pt"
25
-
26
- # for callback
27
- CURRENT_STEP = "current_step"
28
-
29
- PARAM_LIST = "param_list"
30
- RANK = "rank"
31
- STEP = "step"
32
- BOUNDS = "bounds"
33
- OUTPUT_PATH = "output_path"
34
- TIME_STAMP = "time_stamp"
35
-
36
- # level const
37
- LEVEL = "level"
38
- LEVEL0 = "L0"
39
- LEVEL1 = "L1"
40
- LEVEL2 = "L2"
41
- SUPPORTED_LEVEL = {"L0", "L1", "L2"}
42
-
43
- # numpy coding
44
- STEP_IDX = 0
45
- SHAPE_DIM_IDX = 4
46
- MAX_SIZE = 10 * 1024 * 1024 * 1024
47
-
48
- # direction suffix
49
- DIR_SUFFIX = "dir.npy"
50
-
51
- # bounds safety
52
- BOUNDS_MINIMUM = -2**63
53
- BOUNDS_MAXIMUM = 2**63 - 1
54
-
55
- # file safety
56
- DATA_DIR_AUTHORITY = 0o750
57
- DATA_FILE_AUTHORITY = 0o640
58
- DIRECTORY_LENGTH = 4096
59
- FILE_NAME_LENGTH = 255
60
- FILE_VALID_PATTERN = r"^[a-zA-Z0-9_.:/-]+$"
61
- PARAM_VALID_PATTERN = r"^[a-zA-Z0-9_.]+$"
62
- DIR = "dir"
63
- FILE = "file"
64
-
65
- STEP_FINISH = "step_finish"
66
-
67
- SUMMARY = "summary"
68
-
69
- # csv header entry
70
- MD5 = "MD5"
71
- DISTRIBUTION = "distribution"
72
- SHAPE = "shape"
73
- MAX = "max"
74
- MIN = "min"
75
- NORM = "norm"
76
-
77
- level_adp = {
78
- "L0": {
79
- "header": [GradConst.MD5, GradConst.MAX, GradConst.MIN, GradConst.NORM, GradConst.SHAPE],
80
- "have_grad_direction": False
81
- },
82
- "L1": {
83
- "header": [GradConst.MAX, GradConst.MIN, GradConst.NORM, GradConst.SHAPE],
84
- "have_grad_direction": True
85
- },
86
- "L2": {
87
- "header": [GradConst.DISTRIBUTION, GradConst.MAX, GradConst.MIN, GradConst.NORM, GradConst.SHAPE],
88
- "have_grad_direction": True
89
- },
90
- }
@@ -1,187 +0,0 @@
1
- # Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
2
- # All rights reserved.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
- import os
17
- from typing import List
18
-
19
- from tqdm import tqdm
20
- import matplotlib.pyplot as plt
21
-
22
- from msprobe.core.common.file_utils import create_directory, check_file_or_directory_path
23
- from msprobe.core.common.log import logger
24
- from msprobe.core.common.file_utils import remove_path, load_npy, write_csv, read_csv
25
- from msprobe.core.grad_probe.constant import GradConst
26
- from msprobe.core.grad_probe.utils import plt_savefig
27
-
28
-
29
- class GradComparator:
30
-
31
- @staticmethod
32
- def _get_grad_weight_order(path1, path2):
33
- for summary_file in os.listdir(path1):
34
- if not summary_file.endswith(".csv"):
35
- continue
36
- if not os.path.exists(os.path.join(path2, summary_file)):
37
- continue
38
- summary_csv = read_csv(os.path.join(path1, summary_file))
39
- return summary_csv["param_name"]
40
- raise RuntimeError("no matched grad_summary.csv for comparison, please dump data in same configuration")
41
-
42
- @staticmethod
43
- def _get_name_matched_grad_file(param_name, grad_files):
44
- for grad_file in grad_files:
45
- if param_name == grad_file[:grad_file.rfind('.')]:
46
- return grad_file
47
- raise RuntimeError("no matched grad_file for comparison, please dump data in same configuration")
48
-
49
- @classmethod
50
- def compare_distributed(cls, path1: str, path2: str, output_dir: str):
51
- check_file_or_directory_path(path1, isdir=True)
52
- check_file_or_directory_path(path2, isdir=True)
53
- ranks = cls._get_matched_dirs(path1, path2, "rank")
54
- logger.info(f"the following ranks will be compared: {ranks}")
55
- if not ranks:
56
- raise RuntimeError("no matched ranks for comparison, please dump data in same configuration")
57
- if not os.path.isdir(output_dir):
58
- create_directory(output_dir)
59
- for rank in tqdm(ranks, desc="rank"):
60
- logger.info(f"now comparing rank {rank}:")
61
- cls.compare(os.path.join(path1, f"rank{rank}"),
62
- os.path.join(path2, f"rank{rank}"),
63
- os.path.join(output_dir, f"rank{rank}"))
64
-
65
- @classmethod
66
- def compare(cls, path1: str, path2: str, output_dir: str):
67
- steps = cls._get_matched_dirs(path1, path2, "step")
68
- if not steps:
69
- raise RuntimeError("no matched steps for comparison, please dump data in same configuration")
70
- similarities = cls._calculate_separated_similarities(path1, path2, steps)
71
- if not os.path.isdir(output_dir):
72
- create_directory(output_dir)
73
- cls._save_similarities(similarities, steps, output_dir)
74
-
75
- @classmethod
76
- def _get_matched_dirs(cls, path1: str, path2: str, dir_prefix):
77
- check_file_or_directory_path(path1, isdir=True)
78
- check_file_or_directory_path(path2, isdir=True)
79
- dirs = []
80
- for dir_name in os.listdir(path1):
81
- index = dir_name.replace(dir_prefix, "", 1)
82
- if not dir_name.startswith(dir_prefix) or not index.isdigit():
83
- continue
84
-
85
- folder2 = os.path.join(path2, dir_name)
86
- if not os.path.isdir(folder2):
87
- continue
88
- dirs.append(int(index))
89
- dirs = sorted(dirs)
90
- return dirs
91
-
92
- @classmethod
93
- def _save_similarities(cls, similarities: List[float], steps: List[int], output_dir: str):
94
- if not similarities:
95
- raise ValueError(f"length of similarities is 0")
96
- result = [['step'] + [str(step) for step in steps]]
97
- for key, value in tqdm(similarities.items(), desc="save similarities (by param)"):
98
- if len(value) != len(steps):
99
- raise RuntimeError(f"similarities length of {key}:{len(value)} not equal steps:{len(steps)}")
100
- plt.plot(steps, value)
101
- plt.xlabel('steps')
102
- plt.ylabel('similarities')
103
- plt.title(f'{key}_similarities')
104
- picture_dir = os.path.join(output_dir, "similarities_picture")
105
- if not os.path.isdir(picture_dir):
106
- create_directory(picture_dir)
107
- fig_save_path = os.path.join(picture_dir, f"{key}_similarities.png")
108
-
109
- plt_savefig(fig_save_path)
110
- plt.close()
111
-
112
- result.append([key] + value)
113
- result_csv_path = os.path.join(output_dir, "similarities.csv")
114
- if os.path.exists(result_csv_path):
115
- logger.warning(f"{result_csv_path} will be deleted")
116
- remove_path(result_csv_path)
117
- write_csv(result, result_csv_path)
118
-
119
- @classmethod
120
- def _calculate_separated_similarities(cls, path1, path2, steps):
121
- similarities = {}
122
- logger.info(f"{len(steps)} steps will be compared")
123
- grad_weight_order = cls._get_grad_weight_order(path1, path2)
124
- for step in tqdm(steps, desc="calculate similarities (by step)"):
125
- grad_files = cls._get_matched_grad_files(path1, path2, step)
126
- same_count_summary = 0
127
- total_count_summary = 0
128
- for grad_name in grad_weight_order:
129
- grad_file = cls._get_name_matched_grad_file(grad_name, grad_files)
130
- grad1 = os.path.join(path1, f"step{step}", grad_file)
131
- grad2 = os.path.join(path2, f"step{step}", grad_file)
132
- same_count, total_count = cls._calculate_similarity(grad1, grad2)
133
- same_count_summary += same_count
134
- total_count_summary += total_count
135
- idx = grad_file.rfind(".")
136
- param_name = grad_file[:idx]
137
- if param_name not in similarities:
138
- similarities[param_name] = []
139
- if total_count == 0:
140
- similarities[param_name].append(0)
141
- else:
142
- similarities[param_name].append(same_count / total_count)
143
- if GradConst.SUMMARY not in similarities:
144
- similarities[GradConst.SUMMARY] = []
145
- if total_count_summary == 0:
146
- similarities[GradConst.SUMMARY].append(0)
147
- else:
148
- similarities[GradConst.SUMMARY].append(same_count_summary / total_count_summary)
149
- return similarities
150
-
151
- @classmethod
152
- def _get_matched_grad_files(cls, path1: str, path2: str, step: int):
153
- path1 = os.path.join(path1, f"step{step}")
154
- path2 = os.path.join(path2, f"step{step}")
155
- check_file_or_directory_path(path1, isdir=True)
156
- check_file_or_directory_path(path2, isdir=True)
157
- grad_files = []
158
- for grad_file in os.listdir(path1):
159
- splits = grad_file.split('.')
160
- if len(splits) < 1 or splits[-1] not in GradConst.GRAD_FILE_SUFFIX:
161
- continue
162
- folder2 = os.path.join(path2, grad_file)
163
- if not os.path.exists(folder2):
164
- continue
165
- grad_files.append(grad_file)
166
- return sorted(grad_files)
167
-
168
- @classmethod
169
- def _calculate_similarity(cls, grad_file1: str, grad_file2: str):
170
- npy1, npy2 = cls._load_grad_files(grad_file1, grad_file2)
171
- same_count = (npy1 == npy2).sum()
172
- total_count = npy1.size
173
- return same_count, total_count
174
-
175
- @classmethod
176
- def _load_grad_files(cls, grad_file1: str, grad_file2: str):
177
- grad1 = load_npy(grad_file1)
178
- grad2 = load_npy(grad_file2)
179
- if grad1.shape != grad2.shape:
180
- raise RuntimeError(f"tensor shape is not equal: {grad_file1}, {grad_file2}")
181
- if grad1.dtype != bool:
182
- raise TypeError(f"tensor type is not bool: {grad_file1}")
183
- if grad2.dtype != bool:
184
- raise TypeError(f"tensor type is not bool: {grad_file2}")
185
- return grad1, grad2
186
-
187
-