mindstudio-probe 1.0.1__py3-none-any.whl → 1.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (323) hide show
  1. {mindstudio_probe-1.0.1.dist-info → mindstudio_probe-1.0.4.dist-info}/LICENSE +201 -201
  2. {mindstudio_probe-1.0.1.dist-info → mindstudio_probe-1.0.4.dist-info}/METADATA +36 -30
  3. mindstudio_probe-1.0.4.dist-info/RECORD +276 -0
  4. {mindstudio_probe-1.0.1.dist-info → mindstudio_probe-1.0.4.dist-info}/WHEEL +1 -1
  5. {mindstudio_probe-1.0.1.dist-info → mindstudio_probe-1.0.4.dist-info}/entry_points.txt +1 -0
  6. msprobe/README.md +101 -182
  7. msprobe/__init__.py +1 -0
  8. msprobe/{config/config.json → config.json} +49 -27
  9. msprobe/core/__init__.py +0 -0
  10. msprobe/{pytorch → core}/advisor/advisor.py +124 -124
  11. msprobe/{pytorch → core}/advisor/advisor_const.py +59 -59
  12. msprobe/{pytorch → core}/advisor/advisor_result.py +58 -58
  13. msprobe/core/common/const.py +341 -241
  14. msprobe/core/common/exceptions.py +100 -88
  15. msprobe/core/common/{file_check.py → file_utils.py} +478 -265
  16. msprobe/core/common/log.py +76 -55
  17. msprobe/core/common/utils.py +385 -516
  18. msprobe/core/common_config.py +85 -58
  19. msprobe/core/compare/acc_compare.py +300 -0
  20. msprobe/core/compare/check.py +95 -0
  21. msprobe/core/compare/compare_cli.py +49 -0
  22. msprobe/core/compare/highlight.py +223 -0
  23. msprobe/core/compare/multiprocessing_compute.py +149 -0
  24. msprobe/{pytorch → core}/compare/npy_compare.py +295 -244
  25. msprobe/core/compare/utils.py +430 -0
  26. msprobe/core/data_dump/data_collector.py +154 -140
  27. msprobe/core/data_dump/data_processor/base.py +314 -245
  28. msprobe/core/data_dump/data_processor/factory.py +59 -61
  29. msprobe/core/data_dump/data_processor/mindspore_processor.py +186 -0
  30. msprobe/core/data_dump/data_processor/pytorch_processor.py +366 -346
  31. msprobe/core/data_dump/json_writer.py +96 -116
  32. msprobe/core/data_dump/scope.py +178 -178
  33. msprobe/core/grad_probe/__init__.py +0 -0
  34. msprobe/core/grad_probe/constant.py +71 -0
  35. msprobe/core/grad_probe/grad_compare.py +171 -0
  36. msprobe/core/grad_probe/utils.py +64 -0
  37. msprobe/docs/01.installation.md +89 -0
  38. msprobe/docs/02.config_introduction.md +165 -0
  39. msprobe/docs/03.config_examples.md +247 -0
  40. msprobe/docs/04.acl_config_examples.md +76 -0
  41. msprobe/docs/05.data_dump_PyTorch.md +198 -0
  42. msprobe/docs/06.data_dump_MindSpore.md +243 -0
  43. msprobe/docs/07.accuracy_checker_PyTorch.md +274 -0
  44. msprobe/docs/08.accuracy_checker_online_PyTorch.md +198 -0
  45. msprobe/docs/09.accuracy_checker_MindSpore.md +68 -0
  46. msprobe/docs/10.accuracy_compare_PyTorch.md +245 -0
  47. msprobe/docs/11.accuracy_compare_MindSpore.md +202 -0
  48. msprobe/docs/12.overflow_check_PyTorch.md +79 -0
  49. msprobe/docs/13.overflow_check_MindSpore.md +31 -0
  50. msprobe/{pytorch/doc/parse_tool.md → docs/14.data_parse_PyTorch.md} +283 -286
  51. msprobe/docs/15.free_benchmarking_PyTorch.md +164 -0
  52. msprobe/docs/17.grad_probe.md +207 -0
  53. msprobe/docs/FAQ_PyTorch.md +177 -0
  54. msprobe/docs/S02.report_free_benchmarking_validation_performance_baseline.md +146 -0
  55. msprobe/docs/img/free_benchmark_framework.png +0 -0
  56. msprobe/docs/img/grad_probe_image-1.png +0 -0
  57. msprobe/docs/img/grad_probe_image-2.png +0 -0
  58. msprobe/docs/img/grad_probe_image-3.png +0 -0
  59. msprobe/docs/img/grad_probe_image-4.png +0 -0
  60. msprobe/docs/img/grad_probe_image.png +0 -0
  61. msprobe/mindspore/__init__.py +1 -1
  62. msprobe/mindspore/api_accuracy_checker/__init__.py +0 -0
  63. msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +255 -0
  64. msprobe/mindspore/api_accuracy_checker/api_info.py +69 -0
  65. msprobe/mindspore/api_accuracy_checker/api_runner.py +156 -0
  66. msprobe/mindspore/api_accuracy_checker/base_compare_algorithm.py +197 -0
  67. msprobe/mindspore/api_accuracy_checker/cmd_parser.py +6 -0
  68. msprobe/mindspore/api_accuracy_checker/compute_element.py +239 -0
  69. msprobe/mindspore/api_accuracy_checker/main.py +9 -0
  70. msprobe/mindspore/api_accuracy_checker/type_mapping.py +114 -0
  71. msprobe/mindspore/api_accuracy_checker/utils.py +80 -0
  72. msprobe/mindspore/cell_processor.py +34 -0
  73. msprobe/mindspore/common/const.py +106 -0
  74. msprobe/mindspore/common/log.py +38 -0
  75. msprobe/mindspore/common/utils.py +81 -0
  76. msprobe/mindspore/compare/distributed_compare.py +75 -0
  77. msprobe/mindspore/compare/ms_compare.py +219 -0
  78. msprobe/mindspore/compare/ms_graph_compare.py +348 -0
  79. msprobe/mindspore/compare/ms_to_pt_api.yaml +399 -0
  80. msprobe/mindspore/debugger/debugger_config.py +66 -51
  81. msprobe/mindspore/debugger/precision_debugger.py +126 -32
  82. msprobe/mindspore/dump/dump_tool_factory.py +35 -38
  83. msprobe/mindspore/dump/hook_cell/api_registry.py +118 -0
  84. msprobe/mindspore/dump/hook_cell/hook_cell.py +55 -0
  85. msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml +922 -0
  86. msprobe/mindspore/dump/hook_cell/wrap_api.py +113 -0
  87. msprobe/mindspore/dump/jit_dump.py +72 -0
  88. msprobe/mindspore/dump/kernel_graph_dump.py +59 -60
  89. msprobe/mindspore/dump/kernel_kbyk_dump.py +64 -0
  90. msprobe/mindspore/free_benchmark/__init__.py +0 -0
  91. msprobe/mindspore/free_benchmark/api_pynative_self_check.py +116 -0
  92. msprobe/mindspore/free_benchmark/common/__init__.py +0 -0
  93. msprobe/mindspore/free_benchmark/common/config.py +12 -0
  94. msprobe/mindspore/free_benchmark/common/handler_params.py +17 -0
  95. msprobe/mindspore/free_benchmark/common/utils.py +71 -0
  96. msprobe/mindspore/free_benchmark/data/support_wrap_ops.yaml +842 -0
  97. msprobe/mindspore/free_benchmark/decorator/__init__.py +0 -0
  98. msprobe/mindspore/free_benchmark/decorator/dec_forward.py +43 -0
  99. msprobe/mindspore/free_benchmark/decorator/decorator_factory.py +107 -0
  100. msprobe/mindspore/free_benchmark/handler/__init__.py +0 -0
  101. msprobe/mindspore/free_benchmark/handler/base_handler.py +90 -0
  102. msprobe/mindspore/free_benchmark/handler/check_handler.py +41 -0
  103. msprobe/mindspore/free_benchmark/handler/fix_handler.py +36 -0
  104. msprobe/mindspore/free_benchmark/handler/handler_factory.py +21 -0
  105. msprobe/mindspore/free_benchmark/perturbation/add_noise.py +67 -0
  106. msprobe/mindspore/free_benchmark/perturbation/base_perturbation.py +21 -0
  107. msprobe/mindspore/free_benchmark/perturbation/bit_noise.py +63 -0
  108. msprobe/mindspore/free_benchmark/perturbation/exchange_value.py +51 -0
  109. msprobe/mindspore/free_benchmark/perturbation/improve_precision.py +35 -0
  110. msprobe/mindspore/free_benchmark/perturbation/no_change.py +12 -0
  111. msprobe/mindspore/free_benchmark/perturbation/perturbation_factory.py +29 -0
  112. msprobe/mindspore/free_benchmark/self_check_tool_factory.py +33 -0
  113. msprobe/mindspore/grad_probe/__init__.py +0 -0
  114. msprobe/mindspore/grad_probe/global_context.py +90 -0
  115. msprobe/mindspore/grad_probe/grad_analyzer.py +231 -0
  116. msprobe/mindspore/grad_probe/grad_monitor.py +27 -0
  117. msprobe/mindspore/grad_probe/grad_stat_csv.py +132 -0
  118. msprobe/mindspore/grad_probe/hook.py +94 -0
  119. msprobe/mindspore/grad_probe/utils.py +30 -0
  120. msprobe/mindspore/ms_config.py +128 -78
  121. msprobe/mindspore/overflow_check/kernel_graph_overflow_check.py +44 -45
  122. msprobe/mindspore/overflow_check/overflow_check_tool_factory.py +34 -32
  123. msprobe/mindspore/runtime.py +4 -0
  124. msprobe/mindspore/service.py +378 -0
  125. msprobe/mindspore/task_handler_factory.py +24 -21
  126. msprobe/msprobe.py +105 -67
  127. msprobe/pytorch/__init__.py +4 -4
  128. msprobe/pytorch/api_accuracy_checker/common/config.py +53 -50
  129. msprobe/pytorch/api_accuracy_checker/common/utils.py +214 -224
  130. msprobe/pytorch/api_accuracy_checker/compare/algorithm.py +213 -216
  131. msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +606 -545
  132. msprobe/pytorch/api_accuracy_checker/compare/api_precision_standard.yaml +132 -132
  133. msprobe/pytorch/api_accuracy_checker/compare/api_precision_threshold.yaml +390 -390
  134. msprobe/pytorch/api_accuracy_checker/compare/compare.py +386 -345
  135. msprobe/pytorch/api_accuracy_checker/compare/compare_column.py +73 -73
  136. msprobe/pytorch/api_accuracy_checker/compare/compare_utils.py +245 -248
  137. msprobe/pytorch/api_accuracy_checker/config.yaml +10 -4
  138. msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py +335 -328
  139. msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +200 -203
  140. msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +133 -127
  141. msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +592 -493
  142. msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py +70 -7
  143. msprobe/pytorch/api_accuracy_checker/run_ut/torch_ut_setting.json +7 -4
  144. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/__init__.py +0 -0
  145. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py +197 -0
  146. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +325 -0
  147. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +204 -0
  148. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +219 -0
  149. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/ssl_config.py +10 -0
  150. msprobe/pytorch/bench_functions/__init__.py +15 -0
  151. msprobe/pytorch/bench_functions/apply_adam_w.py +28 -0
  152. msprobe/pytorch/bench_functions/confusion_transpose.py +19 -0
  153. msprobe/pytorch/bench_functions/fast_gelu.py +55 -0
  154. msprobe/pytorch/bench_functions/layer_norm_eval.py +6 -0
  155. msprobe/pytorch/bench_functions/linear.py +12 -0
  156. msprobe/pytorch/bench_functions/matmul_backward.py +48 -0
  157. msprobe/pytorch/bench_functions/npu_fusion_attention.py +509 -0
  158. msprobe/pytorch/bench_functions/rms_norm.py +15 -0
  159. msprobe/pytorch/bench_functions/rotary_mul.py +52 -0
  160. msprobe/pytorch/bench_functions/scaled_mask_softmax.py +26 -0
  161. msprobe/pytorch/bench_functions/swiglu.py +55 -0
  162. msprobe/pytorch/common/__init__.py +2 -2
  163. msprobe/pytorch/common/compare_script.template +14 -14
  164. msprobe/pytorch/common/log.py +20 -31
  165. msprobe/pytorch/common/parse_json.py +39 -37
  166. msprobe/pytorch/common/utils.py +305 -224
  167. msprobe/pytorch/compare/distributed_compare.py +66 -111
  168. msprobe/pytorch/compare/mapping.yaml +607 -607
  169. msprobe/pytorch/compare/match.py +34 -36
  170. msprobe/pytorch/compare/pt_compare.py +50 -0
  171. msprobe/pytorch/debugger/debugger_config.py +95 -86
  172. msprobe/pytorch/debugger/precision_debugger.py +125 -95
  173. msprobe/pytorch/free_benchmark/__init__.py +8 -8
  174. msprobe/pytorch/free_benchmark/common/constant.py +70 -67
  175. msprobe/pytorch/free_benchmark/common/counter.py +71 -71
  176. msprobe/pytorch/free_benchmark/common/enums.py +37 -37
  177. msprobe/pytorch/free_benchmark/common/params.py +129 -129
  178. msprobe/pytorch/free_benchmark/common/utils.py +102 -98
  179. msprobe/pytorch/free_benchmark/compare/grad_saver.py +179 -183
  180. msprobe/pytorch/free_benchmark/compare/single_benchmark.py +104 -104
  181. msprobe/pytorch/free_benchmark/main.py +105 -102
  182. msprobe/pytorch/free_benchmark/perturbed_layers/base_layer.py +13 -13
  183. msprobe/pytorch/free_benchmark/perturbed_layers/layer_factory.py +41 -41
  184. msprobe/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py +90 -90
  185. msprobe/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py +104 -104
  186. msprobe/pytorch/free_benchmark/perturbed_layers/npu/change_value.py +63 -63
  187. msprobe/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py +68 -68
  188. msprobe/pytorch/free_benchmark/perturbed_layers/npu/no_change.py +28 -28
  189. msprobe/pytorch/free_benchmark/perturbed_layers/npu/npu_base_layser.py +45 -45
  190. msprobe/pytorch/free_benchmark/perturbed_layers/run_cpu.py +19 -19
  191. msprobe/pytorch/free_benchmark/result_handlers/base_handler.py +217 -203
  192. msprobe/pytorch/free_benchmark/result_handlers/check_handler.py +39 -39
  193. msprobe/pytorch/free_benchmark/result_handlers/fix_handler.py +23 -23
  194. msprobe/pytorch/free_benchmark/result_handlers/handler_factory.py +30 -31
  195. msprobe/pytorch/free_benchmark/result_handlers/preheat_handler.py +170 -170
  196. msprobe/pytorch/function_factory.py +76 -0
  197. msprobe/pytorch/functional/dump_module.py +39 -39
  198. msprobe/pytorch/grad_probe/__init__.py +0 -0
  199. msprobe/pytorch/grad_probe/grad_monitor.py +91 -0
  200. msprobe/pytorch/grad_probe/grad_stat_csv.py +129 -0
  201. msprobe/pytorch/hook_module/api_registry.py +161 -161
  202. msprobe/pytorch/hook_module/hook_module.py +120 -109
  203. msprobe/pytorch/hook_module/support_wrap_ops.yaml +1879 -1876
  204. msprobe/pytorch/hook_module/utils.py +30 -29
  205. msprobe/pytorch/hook_module/wrap_aten.py +110 -100
  206. msprobe/pytorch/hook_module/wrap_distributed.py +78 -75
  207. msprobe/pytorch/hook_module/wrap_functional.py +105 -108
  208. msprobe/pytorch/hook_module/wrap_npu_custom.py +93 -73
  209. msprobe/pytorch/hook_module/wrap_tensor.py +71 -72
  210. msprobe/pytorch/hook_module/wrap_torch.py +86 -88
  211. msprobe/pytorch/hook_module/wrap_vf.py +62 -64
  212. msprobe/pytorch/module_processer.py +138 -98
  213. msprobe/pytorch/online_dispatch/__init__.py +20 -20
  214. msprobe/pytorch/online_dispatch/compare.py +236 -236
  215. msprobe/pytorch/online_dispatch/dispatch.py +271 -273
  216. msprobe/pytorch/online_dispatch/dump_compare.py +155 -186
  217. msprobe/pytorch/online_dispatch/single_compare.py +391 -391
  218. msprobe/pytorch/online_dispatch/torch_ops_config.yaml +49 -49
  219. msprobe/pytorch/online_dispatch/utils.py +130 -187
  220. msprobe/pytorch/parse.py +4 -4
  221. msprobe/pytorch/parse_tool/cli.py +32 -32
  222. msprobe/pytorch/parse_tool/lib/compare.py +260 -259
  223. msprobe/pytorch/parse_tool/lib/config.py +52 -51
  224. msprobe/pytorch/parse_tool/lib/file_desc.py +31 -31
  225. msprobe/pytorch/parse_tool/lib/interactive_cli.py +102 -102
  226. msprobe/pytorch/parse_tool/lib/parse_exception.py +54 -54
  227. msprobe/pytorch/parse_tool/lib/parse_tool.py +158 -158
  228. msprobe/pytorch/parse_tool/lib/utils.py +316 -367
  229. msprobe/pytorch/parse_tool/lib/visualization.py +85 -90
  230. msprobe/pytorch/pt_config.py +188 -93
  231. msprobe/pytorch/service.py +246 -167
  232. mindstudio_probe-1.0.1.dist-info/RECORD +0 -228
  233. msprobe/config/README.md +0 -397
  234. msprobe/mindspore/doc/dump.md +0 -65
  235. msprobe/mindspore/dump/api_kbk_dump.py +0 -55
  236. msprobe/pytorch/compare/acc_compare.py +0 -1024
  237. msprobe/pytorch/compare/highlight.py +0 -100
  238. msprobe/pytorch/doc/FAQ.md +0 -193
  239. msprobe/pytorch/doc/api_accuracy_checker.md +0 -269
  240. msprobe/pytorch/doc/atat/321/207/342/226/223/342/225/233/321/205/342/225/221/320/266/321/205/342/225/226/320/265/321/205/320/225/342/225/226/321/206/320/245/342/226/221/321/206/320/235/320/276dump/321/206/320/260/320/227/321/205/320/227/320/226/321/206/320/220/320/267/321/210/320/223/342/225/234/321/205/320/257/342/225/221/321/207/342/225/221/342/224/220/321/206/320/232/320/265/321/205/320/241/320/232.md +0 -182
  241. msprobe/pytorch/doc/dump.md +0 -207
  242. msprobe/pytorch/doc/ptdbg_ascend_compare.md +0 -176
  243. msprobe/pytorch/doc/ptdbg_ascend_overview.md +0 -68
  244. msprobe/pytorch/doc/ptdbg_ascend_quickstart.md +0 -381
  245. msprobe/pytorch/doc/run_overflow_check.md +0 -25
  246. msprobe/pytorch/doc//321/205/320/254/320/270/321/207/342/225/221/342/224/220/321/207/342/226/223/342/225/233/321/205/342/225/221/320/266/321/206/320/277/320/244/321/205/320/277/342/225/243.md +0 -90
  247. msprobe/test/core_ut/common/test_utils.py +0 -345
  248. msprobe/test/core_ut/data_dump/test_data_collector.py +0 -47
  249. msprobe/test/core_ut/data_dump/test_json_writer.py +0 -183
  250. msprobe/test/core_ut/data_dump/test_scope.py +0 -151
  251. msprobe/test/core_ut/test_common_config.py +0 -152
  252. msprobe/test/core_ut/test_file_check.py +0 -218
  253. msprobe/test/core_ut/test_log.py +0 -109
  254. msprobe/test/mindspore_ut/test_api_kbk_dump.py +0 -51
  255. msprobe/test/mindspore_ut/test_debugger_config.py +0 -42
  256. msprobe/test/mindspore_ut/test_dump_tool_factory.py +0 -51
  257. msprobe/test/mindspore_ut/test_kernel_graph_dump.py +0 -66
  258. msprobe/test/mindspore_ut/test_kernel_graph_overflow_check.py +0 -63
  259. msprobe/test/mindspore_ut/test_ms_config.py +0 -69
  260. msprobe/test/mindspore_ut/test_overflow_check_tool_factory.py +0 -51
  261. msprobe/test/mindspore_ut/test_precision_debugger.py +0 -56
  262. msprobe/test/mindspore_ut/test_task_handler_factory.py +0 -58
  263. msprobe/test/pytorch_ut/advisor/test_advisor.py +0 -83
  264. msprobe/test/pytorch_ut/api_accuracy_checker/common/test_common_utils.py +0 -108
  265. msprobe/test/pytorch_ut/api_accuracy_checker/common/test_config.py +0 -39
  266. msprobe/test/pytorch_ut/api_accuracy_checker/compare/test_algorithm.py +0 -112
  267. msprobe/test/pytorch_ut/api_accuracy_checker/compare/test_api_precision_compare.py +0 -77
  268. msprobe/test/pytorch_ut/api_accuracy_checker/compare/test_compare.py +0 -125
  269. msprobe/test/pytorch_ut/api_accuracy_checker/compare/test_compare_column.py +0 -10
  270. msprobe/test/pytorch_ut/api_accuracy_checker/compare/test_compare_utils.py +0 -43
  271. msprobe/test/pytorch_ut/api_accuracy_checker/run_ut/dump.json +0 -179
  272. msprobe/test/pytorch_ut/api_accuracy_checker/run_ut/forward.json +0 -63
  273. msprobe/test/pytorch_ut/api_accuracy_checker/run_ut/test_data_generate.py +0 -99
  274. msprobe/test/pytorch_ut/api_accuracy_checker/run_ut/test_multi_run_ut.py +0 -115
  275. msprobe/test/pytorch_ut/api_accuracy_checker/run_ut/test_run_ut.py +0 -72
  276. msprobe/test/pytorch_ut/compare/test_acc_compare.py +0 -17
  277. msprobe/test/pytorch_ut/free_benchmark/perturbed_layers/test_perturbed_layser.py +0 -105
  278. msprobe/test/pytorch_ut/free_benchmark/result_handlers/test_result_handler.py +0 -121
  279. msprobe/test/pytorch_ut/free_benchmark/test_main.py +0 -101
  280. msprobe/test/pytorch_ut/functional/test_dump_module.py +0 -15
  281. msprobe/test/pytorch_ut/hook_module/test_api_registry.py +0 -130
  282. msprobe/test/pytorch_ut/hook_module/test_hook_module.py +0 -42
  283. msprobe/test/pytorch_ut/hook_module/test_wrap_aten.py +0 -65
  284. msprobe/test/pytorch_ut/hook_module/test_wrap_distributed.py +0 -35
  285. msprobe/test/pytorch_ut/hook_module/test_wrap_functional.py +0 -20
  286. msprobe/test/pytorch_ut/hook_module/test_wrap_tensor.py +0 -35
  287. msprobe/test/pytorch_ut/hook_module/test_wrap_torch.py +0 -43
  288. msprobe/test/pytorch_ut/hook_module/test_wrap_vf.py +0 -11
  289. msprobe/test/pytorch_ut/test_pt_config.py +0 -69
  290. msprobe/test/pytorch_ut/test_service.py +0 -59
  291. msprobe/test/resources/advisor.txt +0 -3
  292. msprobe/test/resources/compare_result_20230703104808.csv +0 -9
  293. msprobe/test/resources/compare_result_without_accuracy.csv +0 -9
  294. msprobe/test/resources/config.yaml +0 -3
  295. msprobe/test/resources/npu_test.pkl +0 -8
  296. msprobe/test/run_test.sh +0 -30
  297. msprobe/test/run_ut.py +0 -58
  298. msprobe/test/test_module_processer.py +0 -64
  299. {mindstudio_probe-1.0.1.dist-info → mindstudio_probe-1.0.4.dist-info}/top_level.txt +0 -0
  300. /msprobe/{pytorch/doc → docs}/img/BLOOM-7B_1.png +0 -0
  301. /msprobe/{pytorch/doc → docs}/img/BLOOM-7B_2.png +0 -0
  302. /msprobe/{pytorch/doc → docs}/img/BLOOM-7B_3.png +0 -0
  303. /msprobe/{pytorch/doc → docs}/img/BLOOM-7B_4.png +0 -0
  304. /msprobe/{pytorch/doc → docs}/img/GPT-3_1.png +0 -0
  305. /msprobe/{pytorch/doc → docs}/img/GPT-3_2.png +0 -0
  306. /msprobe/{pytorch/doc → docs}/img/GPT-3_3.png +0 -0
  307. /msprobe/{pytorch/doc → docs}/img/GPT-3_4.png +0 -0
  308. /msprobe/{pytorch/doc → docs}/img/GPT-3_5.png +0 -0
  309. /msprobe/{pytorch/doc → docs}/img/GPT-3_6.png +0 -0
  310. /msprobe/{pytorch/doc → docs}/img/GPT-3_7.png +0 -0
  311. /msprobe/{pytorch/doc → docs}/img/GPT-3_8.png +0 -0
  312. /msprobe/{pytorch/doc → docs}/img/YOLOV5S_1.png +0 -0
  313. /msprobe/{pytorch/doc → docs}/img/YOLOV5S_2.png +0 -0
  314. /msprobe/{pytorch/doc → docs}/img/accuracy_checking_details.png +0 -0
  315. /msprobe/{pytorch/doc → docs}/img/accuracy_checking_result.png +0 -0
  316. /msprobe/{pytorch/doc → docs}/img/api_precision_compare_details.png +0 -0
  317. /msprobe/{pytorch/doc → docs}/img/api_precision_compare_result.png +0 -0
  318. /msprobe/{pytorch/doc → docs}/img/auto_analyze_log.png +0 -0
  319. /msprobe/{pytorch/doc → docs}/img/compare_result_pkl.png +0 -0
  320. /msprobe/{pytorch/doc → docs}/img/compare_result_pkl_md5.png.png +0 -0
  321. /msprobe/{pytorch/doc → docs}/img/cpu_info.png +0 -0
  322. /msprobe/{config → docs}/img/free_benchmark.png +0 -0
  323. /msprobe/{pytorch/doc → docs}/img/module_compare.png +0 -0
@@ -1,167 +1,246 @@
1
- import functools
2
- import os
3
- from pathlib import Path
4
-
5
- from msprobe.pytorch.common.log import logger
6
- from msprobe.core.common.file_check import FileChecker, check_path_before_create
7
- from msprobe.core.common.const import Const, FileCheckConst
8
- from msprobe.core.common.exceptions import DistributedNotInitializedError, MsaccException
9
- from msprobe.core.data_dump.data_collector import build_data_collector
10
- from msprobe.core.data_dump.scope import BaseScope
11
- from msprobe.core.data_dump.data_processor.base import ModuleForwardInputsOutputs, ModuleBackwardInputsOutputs
12
- from msprobe.pytorch.common.utils import get_rank_if_initialized
13
- from msprobe.pytorch.module_processer import ModuleProcesser
14
- from msprobe.pytorch.hook_module import remove_dropout
15
- from msprobe.pytorch.hook_module.api_registry import api_register
16
-
17
-
18
- class Service:
19
- def __init__(self, config):
20
- self.model = None
21
- self.config = config
22
- self.data_collector = build_data_collector(config)
23
- self.module_processor = ModuleProcesser(self.data_collector.scope)
24
- self.switch = False
25
- self.current_iter = 0
26
- self.first_start = True
27
- self.current_rank = None
28
- self.dump_iter_dir = None
29
-
30
- def build_hook(self, module_type, name):
31
- def pre_hook(api_or_module_name, module, args, kwargs):
32
- if module_type == BaseScope.Module_Type_Module:
33
- api_or_module_name = module.mindstudio_reserved_name
34
- self.data_collector.visit_and_clear_overflow_status(api_or_module_name)
35
-
36
- if not self.switch:
37
- return args, kwargs
38
- if self.data_collector:
39
- module_input_output = ModuleForwardInputsOutputs(args=args, kwargs=kwargs, output=None)
40
- self.data_collector.pre_forward_data_collect(api_or_module_name, module, pid, module_input_output)
41
- return args, kwargs
42
-
43
- def forward_hook(api_or_module_name, module, args, kwargs, output):
44
- if module_type == BaseScope.Module_Type_Module:
45
- api_or_module_name = module.mindstudio_reserved_name
46
- self.data_collector.visit_and_clear_overflow_status(api_or_module_name)
47
-
48
- if not self.switch:
49
- return None
50
- if self.data_collector:
51
- module_input_output = ModuleForwardInputsOutputs(args=args, kwargs=kwargs, output=output)
52
- self.data_collector.forward_data_collect(api_or_module_name, module, pid, module_input_output)
53
- if self.data_collector.if_return_forward_new_output():
54
- return self.data_collector.get_forward_new_output()
55
- return output
56
-
57
- def backward_hook(api_or_module_name, module, grad_input, grad_output):
58
- if module_type == BaseScope.Module_Type_Module:
59
- api_or_module_name = module.mindstudio_reserved_name
60
- self.data_collector.visit_and_clear_overflow_status(api_or_module_name)
61
-
62
- if not self.switch:
63
- return
64
- if self.data_collector:
65
- module_input_output = ModuleBackwardInputsOutputs(grad_input=grad_input, grad_output=grad_output)
66
- self.data_collector.backward_data_collect(api_or_module_name, module, pid, module_input_output)
67
-
68
- pid = os.getpid()
69
- forward_name_template = name + Const.FORWARD
70
- backward_name_template = name + Const.BACKWARD
71
- pre_forward_hook = functools.partial(pre_hook, forward_name_template)
72
- forward_hook = functools.partial(forward_hook, forward_name_template)
73
- backward_hook = functools.partial(backward_hook, backward_name_template)
74
- return pre_forward_hook, forward_hook, backward_hook
75
-
76
- def step(self):
77
- self.current_iter += 1
78
- self.data_collector.update_iter(self.current_iter)
79
-
80
- def start(self, model):
81
- self.model = model
82
- if self.config.step and self.current_iter > max(self.config.step):
83
- self.stop()
84
- raise Exception("msprobe: exit after iteration {}".format(max(self.config.step)))
85
- if self.config.step and self.current_iter not in self.config.step:
86
- return
87
- if self.first_start:
88
- try:
89
- self.current_rank = get_rank_if_initialized()
90
- except DistributedNotInitializedError:
91
- self.current_rank = None
92
-
93
- if self.config.rank and self.current_rank not in self.config.rank:
94
- return
95
- self.register_hook_new()
96
- self.first_start = False
97
- self.switch = True
98
- logger.info_on_rank_0(f"Dump switch is turned on at step {self.current_iter}. ")
99
- if self.config.level != "L2":
100
- self.create_dirs()
101
- logger.info_on_rank_0(f"Dump data will be saved in {self.dump_iter_dir}.")
102
-
103
- def stop(self):
104
- if self.config.level == "L2":
105
- return
106
- if self.config.step and self.current_iter not in self.config.step:
107
- return
108
- if self.config.rank and self.current_rank not in self.config.rank:
109
- return
110
- self.switch = False
111
- self.data_collector.write_json()
112
-
113
- def create_dirs(self):
114
- check_path_before_create(self.config.dump_path)
115
- if not os.path.exists(self.config.dump_path):
116
- Path(self.config.dump_path).mkdir(mode=0o750, exist_ok=True)
117
- file_check = FileChecker(self.config.dump_path, FileCheckConst.DIR)
118
- file_check.common_check()
119
- self.dump_iter_dir = os.path.join(self.config.dump_path, f"step{self.current_iter}")
120
- cur_rank = self.current_rank if self.current_rank is not None else ''
121
- dump_dir = os.path.join(self.dump_iter_dir, f"rank{cur_rank}")
122
- if not os.path.exists(dump_dir):
123
- Path(dump_dir).mkdir(mode=0o750, parents=True, exist_ok=True)
124
- if self.config.task in self.data_collector.tasks_need_tensor_data:
125
- dump_data_dir = os.path.join(dump_dir, "dump_tensor_data")
126
- Path(dump_data_dir).mkdir(mode=0o750, exist_ok=True)
127
- else:
128
- dump_data_dir = None
129
-
130
- dump_file_path = os.path.join(dump_dir, "dump.json")
131
- stack_file_path = os.path.join(dump_dir, "stack.json")
132
- construct_file_path = os.path.join(dump_dir, "construct.json")
133
- free_benchmark_file_path = os.path.join(self.config.dump_path, "free_benchmark.csv")
134
- self.data_collector.update_dump_paths(
135
- dump_file_path, stack_file_path, construct_file_path, dump_data_dir, free_benchmark_file_path)
136
-
137
- def register_hook_new(self):
138
- logger.info_on_rank_0("The {} hook function is successfully mounted to the model.".format(self.config.task))
139
- if self.config.level in ["L0", "mix"]:
140
- if self.model is None:
141
- logger.error_log_with_exp("The model is None.", MsaccException.INVALID_PARAM_ERROR)
142
- logger.info_on_rank_0("The init dump mode is enabled, and the module dump function will not be available")
143
- for name, module in self.model.named_modules():
144
- if module == self.model:
145
- continue
146
- prefix = BaseScope.Module_Type_Module + Const.SEP + name + Const.SEP + \
147
- module.__class__.__name__ + Const.SEP
148
-
149
- pre_forward_hook, forward_hook, backward_hook = self.build_hook(BaseScope.Module_Type_Module, prefix)
150
- module.register_forward_hook(forward_hook, with_kwargs=True)
151
- module.register_full_backward_hook(backward_hook)
152
-
153
- module.register_forward_pre_hook(
154
- self.module_processor.node_hook(prefix + Const.FORWARD, Const.START))
155
- module.register_forward_hook(
156
- self.module_processor.node_hook(prefix + Const.FORWARD, Const.STOP))
157
- module.register_full_backward_pre_hook(
158
- self.module_processor.node_hook(prefix + Const.BACKWARD, Const.START))
159
- module.register_full_backward_hook(
160
- self.module_processor.node_hook(prefix + Const.BACKWARD, Const.STOP))
161
-
162
- if self.config.level in ["mix", "L1", "L2"]:
163
- api_register.initialize_hook(functools.partial(self.build_hook, BaseScope.Module_Type_API))
164
- api_register.api_modularity()
165
-
166
- if Const.STATISTICS == self.config.task or Const.TENSOR == self.config.task:
167
- remove_dropout()
1
+ import functools
2
+ import os
3
+
4
+ from collections import namedtuple
5
+ import torch
6
+ from msprobe.core.common.const import Const
7
+ from msprobe.core.common.exceptions import DistributedNotInitializedError, MsprobeException
8
+ from msprobe.core.common.file_utils import create_directory
9
+ from msprobe.core.data_dump.data_collector import build_data_collector
10
+ from msprobe.core.data_dump.data_processor.base import ModuleForwardInputsOutputs, ModuleBackwardInputsOutputs
11
+ from msprobe.core.data_dump.scope import BaseScope
12
+ from msprobe.pytorch.common.log import logger
13
+ from msprobe.pytorch.common.utils import get_rank_if_initialized
14
+ from msprobe.pytorch.hook_module import remove_dropout
15
+ from msprobe.pytorch.hook_module.api_registry import api_register
16
+ from msprobe.pytorch.hook_module.hook_module import HOOKModule
17
+ from msprobe.pytorch.module_processer import ModuleProcesser
18
+ from msprobe.pytorch.api_accuracy_checker.common.utils import ApiData
19
+ torch_version_above_or_equal_2 = torch.__version__.split('+')[0] >= '2.0'
20
+
21
+ HookFn = namedtuple('hookFn', ['pre_hook', 'forward_hook', 'backward_hook', 'forward_hook_torch_version_below_2'])
22
+
23
+
24
+ class Service:
25
+ def __init__(self, config):
26
+ self.model = None
27
+ self.config = config
28
+ self.data_collector = build_data_collector(config)
29
+ self.module_processor = ModuleProcesser(self.data_collector.scope)
30
+ self.switch = False
31
+ self.current_iter = 0
32
+ self.first_start = True
33
+ self.current_rank = None
34
+ self.dump_iter_dir = None
35
+ self.attl = None
36
+
37
+ @staticmethod
38
+ def forward_backward_dump_end():
39
+ logger.info_on_rank_0("Data needed ends here.")
40
+ api_register.api_originality()
41
+
42
def build_hook(self, module_type, name):
    """Create the hook callables for one module or API.

    Args:
        module_type: Scope type of the hooked object. When it equals
            ``BaseScope.Module_Type_Module`` each hook uses the module's
            ``mindstudio_reserved_name`` instead of the bound name template.
        name: Name prefix. ``Const.FORWARD`` / ``Const.BACKWARD`` are appended
            to build the name templates, and ``name[:-1]`` is used as the
            ``ApiData`` name in online-run-ut mode.

    Returns:
        HookFn: ``(pre_hook, forward_hook, backward_hook,
        forward_hook_torch_version_below_2)``, each with its name template
        already bound as the first argument via ``functools.partial``.
    """
    pid = os.getpid()

    def resolve_name(bound_name, module):
        # Modules report their reserved name to the collector; APIs keep the bound template.
        if module_type != BaseScope.Module_Type_Module:
            return bound_name
        reserved_name = module.mindstudio_reserved_name
        self.data_collector.update_api_or_module_name(reserved_name)
        return reserved_name

    def out_of_scope(hook_name):
        # Truthy only when a scope is configured and it rejects this name.
        scope = self.data_collector.scope
        return scope and not scope.check(hook_name)

    def pre_hook(api_or_module_name, module, args, kwargs):
        hook_name = resolve_name(api_or_module_name, module)

        if not self.switch:
            return args, kwargs
        if self.config.online_run_ut:
            return None, None
        if self.data_collector:
            record = ModuleForwardInputsOutputs(args=args, kwargs=kwargs, output=None)
            self.data_collector.pre_forward_data_collect(hook_name, module, pid, record)
        return args, kwargs

    def forward_hook(api_or_module_name, module, args, kwargs, output):
        hook_name = resolve_name(api_or_module_name, module)

        if not self.switch:
            return None

        if self.config.online_run_ut:
            if not out_of_scope(hook_name):
                payload = ApiData(name[:-1], args, kwargs, output, self.current_iter, self.current_rank)
                self.attl_send(payload)
            return None

        if self.data_collector:
            record = ModuleForwardInputsOutputs(args=args, kwargs=kwargs, output=output)
            self.data_collector.forward_data_collect(hook_name, module, pid, record)
            if self.data_collector.if_return_forward_new_output():
                return self.data_collector.get_forward_new_output()
        return output

    def forward_hook_torch_version_below_2(api_or_module_name, module, args, output):
        # This hook variant receives no kwargs; forward an empty dict instead.
        return forward_hook(api_or_module_name, module, args, {}, output)

    def backward_hook(api_or_module_name, module, grad_input, grad_output):
        hook_name = resolve_name(api_or_module_name, module)

        if not self.switch:
            return

        if self.config.online_run_ut:
            if not out_of_scope(hook_name):
                payload = ApiData(name[:-1], grad_input, {}, grad_output, self.current_iter, self.current_rank)
                self.attl_send(payload)
            return

        if self.data_collector:
            # The grad_input received here is actually the output of the backward pass and
            # grad_output is its input, so the two are swapped when building the record.
            record = ModuleBackwardInputsOutputs(grad_input=grad_output, grad_output=grad_input)
            self.data_collector.backward_data_collect(hook_name, module, pid, record)

    forward_template = name + Const.FORWARD
    backward_template = name + Const.BACKWARD
    return HookFn(
        functools.partial(pre_hook, forward_template),
        functools.partial(forward_hook, forward_template),
        functools.partial(backward_hook, backward_template),
        functools.partial(forward_hook_torch_version_below_2, forward_template),
    )
110
+
111
def step(self):
    """Advance the iteration counter, notify the data collector and reset module stats."""
    next_iter = self.current_iter + 1
    self.current_iter = next_iter
    self.data_collector.update_iter(next_iter)

    # Both classes expose reset_module_stats(); reset them in the original order.
    for stats_owner in (ModuleProcesser, HOOKModule):
        stats_owner.reset_module_stats()
117
+
118
def start(self, model, api_origin=False):
    """Turn the dump switch on for the current iteration.

    Args:
        model: Model to hook; stored on ``self.model``.
        api_origin: When true, ``api_register.api_modularity()`` is called
            before the switch is turned on.

    Raises:
        Exception: When ``config.step`` is set and the current iteration is
            past its largest entry. ``stop()`` (and ``attl_stop()`` in
            online-run-ut mode) is invoked first.
    """
    self.model = model

    steps = self.config.step
    if steps:
        last_step = max(steps)
        if self.current_iter > last_step:
            if self.config.online_run_ut:
                # send stop signal if online_run_ut
                self.attl_stop()
            self.stop()
            raise Exception("msprobe: exit after iteration {}".format(last_step))
        if self.current_iter not in steps:
            return

    if self.first_start:
        try:
            rank = get_rank_if_initialized()
        except DistributedNotInitializedError:
            rank = None
        self.current_rank = rank
        self.attl_init()

        # Ranks not listed in config.rank return before any hook is registered.
        ranks = self.config.rank
        if ranks and self.current_rank not in ranks:
            return
        self.register_hook_new()
        self.first_start = False

    if api_origin:
        api_register.api_modularity()

    self.switch = True
    logger.info_on_rank_0(f"Dump switch is turned on at step {self.current_iter}. ")

    writes_to_disk = self.config.level != "L2" and not self.config.online_run_ut
    if writes_to_disk:
        self.create_dirs()
        logger.info_on_rank_0(f"Dump data will be saved in {self.dump_iter_dir}.")
146
+
147
def stop(self):
    """Turn the dump switch off and, outside online-run-ut mode, write the collected JSON.

    Does nothing when the level is ``"L2"``, when the current iteration is not
    in a non-empty ``config.step``, or when the current rank is not in a
    non-empty ``config.rank``.
    """
    cfg = self.config
    skip = (
        cfg.level == "L2"
        or (cfg.step and self.current_iter not in cfg.step)
        or (cfg.rank and self.current_rank not in cfg.rank)
    )
    if skip:
        return

    self.switch = False
    if not cfg.online_run_ut:
        self.data_collector.write_json()
158
+
159
def create_dirs(self):
    """Create the dump directories for the current step and rank and register the output paths.

    Layout under ``config.dump_path``: ``step<iter>/rank<rank>/`` holding
    ``dump.json``, ``stack.json`` and ``construct.json``, plus a
    ``dump_tensor_data`` sub-directory when the configured task is listed in
    ``data_collector.tasks_need_tensor_data``. ``free_benchmark.csv`` sits
    directly under ``config.dump_path``.
    """
    root = self.config.dump_path
    create_directory(root)

    self.dump_iter_dir = os.path.join(root, f"step{self.current_iter}")
    # An unknown rank yields a directory named plain "rank".
    rank_tag = '' if self.current_rank is None else self.current_rank
    rank_dir = os.path.join(self.dump_iter_dir, f"rank{rank_tag}")
    create_directory(rank_dir)

    tensor_dir = None
    if self.config.task in self.data_collector.tasks_need_tensor_data:
        tensor_dir = os.path.join(rank_dir, "dump_tensor_data")
        create_directory(tensor_dir)

    self.data_collector.update_dump_paths(
        os.path.join(rank_dir, "dump.json"),
        os.path.join(rank_dir, "stack.json"),
        os.path.join(rank_dir, "construct.json"),
        tensor_dir,
        os.path.join(root, "free_benchmark.csv"))
177
+
178
def register_hook_new(self):
    """Register dump hooks according to ``config.level`` and ``config.task``.

    * Levels ``"L0"`` / ``"mix"``: hook every sub-module of ``self.model``
      (the model itself is skipped).
    * Levels ``"mix"`` / ``"L1"`` / ``"L2"``: initialise the API hooks through
      ``api_register``.
    * Tasks ``Const.STATISTICS`` / ``Const.TENSOR``: call ``remove_dropout()``.

    Hooks are registered in a fixed order per module; the order below is kept
    exactly as it was because it determines the order in which they fire.
    """
    logger.info_on_rank_0("The {} hook function is successfully mounted to the model.".format(self.config.task))

    level = self.config.level
    if level in ("L0", "mix"):
        if self.model is None:
            # NOTE(review): error_log_with_exp presumably raises -- confirm in the logger.
            logger.error_log_with_exp("The model is None.", MsprobeException.INVALID_PARAM_ERROR)
        logger.info_on_rank_0("The init dump mode is enabled, and the module dump function will not be available")

        for name, module in self.model.named_modules():
            if module == self.model:
                continue

            prefix = Const.SEP.join(
                (BaseScope.Module_Type_Module, name, module.__class__.__name__)) + Const.SEP
            forward_tag = prefix + Const.FORWARD
            backward_tag = prefix + Const.BACKWARD

            _pre_hook, fwd_hook, bwd_hook, fwd_hook_below_2 = \
                self.build_hook(BaseScope.Module_Type_Module, prefix)

            # Data-collecting hooks.
            if torch_version_above_or_equal_2:
                module.register_forward_hook(fwd_hook, with_kwargs=True)
            else:
                module.register_full_backward_hook(
                    self.module_processor.node_hook(backward_tag, Const.STOP))
                module.register_forward_hook(fwd_hook_below_2)
            module.register_full_backward_hook(bwd_hook)

            # Start/stop node hooks from the module processor.
            module.register_forward_pre_hook(
                self.module_processor.node_hook(forward_tag, Const.START))
            module.register_forward_hook(
                self.module_processor.node_hook(forward_tag, Const.STOP))
            if torch_version_above_or_equal_2:
                module.register_full_backward_pre_hook(
                    self.module_processor.node_hook(backward_tag, Const.START))
                module.register_full_backward_hook(
                    self.module_processor.node_hook(backward_tag, Const.STOP))

    if level in ("mix", "L1", "L2"):
        api_hook_builder = functools.partial(self.build_hook, BaseScope.Module_Type_API)
        api_register.initialize_hook(api_hook_builder)
        api_register.api_modularity()

    task = self.config.task
    if Const.STATISTICS == task or Const.TENSOR == task:
        remove_dropout()
216
+
217
def attl_init(self):
    """Create the ATTL transport when online-run-ut mode is enabled.

    Does nothing otherwise. When ``config.nfs_path`` is set, a ``"start"``
    marker is uploaded right after the transport is created.
    """
    if not self.config.online_run_ut:
        return

    # Imported here so the transport layer is only loaded in online-run-ut mode.
    from msprobe.pytorch.api_accuracy_checker.tensor_transport_layer.attl import ATTLConfig, ATTL

    cfg = self.config
    transport_config = ATTLConfig(
        is_benchmark_device=False,
        connect_ip=cfg.host,
        connect_port=cfg.port,
        nfs_path=cfg.nfs_path,
        tls_path=cfg.tls_path,
    )
    # An empty rank list means every rank dumps.
    need_dump = len(cfg.rank) == 0 or self.current_rank in cfg.rank
    self.attl = ATTL('npu', transport_config, need_dump=need_dump)

    if cfg.nfs_path:
        self.attl.upload("start")
229
+
230
def attl_send(self, api_data):
    """Forward one ``api_data`` record through the ATTL transport.

    Records whose API type (the first of the three ``Const.SEP``-separated
    name parts) is ``Const.DISTRIBUTED`` are logged and skipped. Otherwise the
    record is uploaded when ``config.nfs_path`` is set and sent directly when
    it is not.

    Args:
        api_data: Record exposing a ``name`` made of exactly three
            ``Const.SEP``-separated parts.
    """
    logger.info(f"tools is dumping api: {api_data.name}, rank: {self.current_rank}")

    # Unpacking enforces the three-part name shape.
    api_type, _second, _third = api_data.name.split(Const.SEP)
    if api_type == Const.DISTRIBUTED:
        logger.info(f"api {api_data.name} is not supported, skip")
        return

    deliver = self.attl.upload if self.config.nfs_path else self.attl.send
    deliver(api_data)
240
+
241
def attl_stop(self):
    """Tell the ATTL peer that this process is done.

    When ``config.nfs_path`` is set an ``"end"`` marker is uploaded. Otherwise
    a STOP signal is sent through the socket manager, if there is one.

    ``self.attl`` stays ``None`` until ``attl_init()`` has run, and ``start()``
    can call this method before that happens. In that case there is nothing
    to notify, so return quietly rather than raising ``AttributeError`` and
    masking the caller's own exit exception.
    """
    transport = self.attl
    if transport is None:
        return

    if self.config.nfs_path:
        transport.upload("end")
    elif transport.socket_manager is not None:
        logger.info(f"pid: {os.getpid()} finished, start send STOP signal.")
        transport.socket_manager.send_stop_signal()