mindstudio-probe 1.0.1__py3-none-any.whl → 1.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (323) hide show
  1. {mindstudio_probe-1.0.1.dist-info → mindstudio_probe-1.0.4.dist-info}/LICENSE +201 -201
  2. {mindstudio_probe-1.0.1.dist-info → mindstudio_probe-1.0.4.dist-info}/METADATA +36 -30
  3. mindstudio_probe-1.0.4.dist-info/RECORD +276 -0
  4. {mindstudio_probe-1.0.1.dist-info → mindstudio_probe-1.0.4.dist-info}/WHEEL +1 -1
  5. {mindstudio_probe-1.0.1.dist-info → mindstudio_probe-1.0.4.dist-info}/entry_points.txt +1 -0
  6. msprobe/README.md +101 -182
  7. msprobe/__init__.py +1 -0
  8. msprobe/{config/config.json → config.json} +49 -27
  9. msprobe/core/__init__.py +0 -0
  10. msprobe/{pytorch → core}/advisor/advisor.py +124 -124
  11. msprobe/{pytorch → core}/advisor/advisor_const.py +59 -59
  12. msprobe/{pytorch → core}/advisor/advisor_result.py +58 -58
  13. msprobe/core/common/const.py +341 -241
  14. msprobe/core/common/exceptions.py +100 -88
  15. msprobe/core/common/{file_check.py → file_utils.py} +478 -265
  16. msprobe/core/common/log.py +76 -55
  17. msprobe/core/common/utils.py +385 -516
  18. msprobe/core/common_config.py +85 -58
  19. msprobe/core/compare/acc_compare.py +300 -0
  20. msprobe/core/compare/check.py +95 -0
  21. msprobe/core/compare/compare_cli.py +49 -0
  22. msprobe/core/compare/highlight.py +223 -0
  23. msprobe/core/compare/multiprocessing_compute.py +149 -0
  24. msprobe/{pytorch → core}/compare/npy_compare.py +295 -244
  25. msprobe/core/compare/utils.py +430 -0
  26. msprobe/core/data_dump/data_collector.py +154 -140
  27. msprobe/core/data_dump/data_processor/base.py +314 -245
  28. msprobe/core/data_dump/data_processor/factory.py +59 -61
  29. msprobe/core/data_dump/data_processor/mindspore_processor.py +186 -0
  30. msprobe/core/data_dump/data_processor/pytorch_processor.py +366 -346
  31. msprobe/core/data_dump/json_writer.py +96 -116
  32. msprobe/core/data_dump/scope.py +178 -178
  33. msprobe/core/grad_probe/__init__.py +0 -0
  34. msprobe/core/grad_probe/constant.py +71 -0
  35. msprobe/core/grad_probe/grad_compare.py +171 -0
  36. msprobe/core/grad_probe/utils.py +64 -0
  37. msprobe/docs/01.installation.md +89 -0
  38. msprobe/docs/02.config_introduction.md +165 -0
  39. msprobe/docs/03.config_examples.md +247 -0
  40. msprobe/docs/04.acl_config_examples.md +76 -0
  41. msprobe/docs/05.data_dump_PyTorch.md +198 -0
  42. msprobe/docs/06.data_dump_MindSpore.md +243 -0
  43. msprobe/docs/07.accuracy_checker_PyTorch.md +274 -0
  44. msprobe/docs/08.accuracy_checker_online_PyTorch.md +198 -0
  45. msprobe/docs/09.accuracy_checker_MindSpore.md +68 -0
  46. msprobe/docs/10.accuracy_compare_PyTorch.md +245 -0
  47. msprobe/docs/11.accuracy_compare_MindSpore.md +202 -0
  48. msprobe/docs/12.overflow_check_PyTorch.md +79 -0
  49. msprobe/docs/13.overflow_check_MindSpore.md +31 -0
  50. msprobe/{pytorch/doc/parse_tool.md → docs/14.data_parse_PyTorch.md} +283 -286
  51. msprobe/docs/15.free_benchmarking_PyTorch.md +164 -0
  52. msprobe/docs/17.grad_probe.md +207 -0
  53. msprobe/docs/FAQ_PyTorch.md +177 -0
  54. msprobe/docs/S02.report_free_benchmarking_validation_performance_baseline.md +146 -0
  55. msprobe/docs/img/free_benchmark_framework.png +0 -0
  56. msprobe/docs/img/grad_probe_image-1.png +0 -0
  57. msprobe/docs/img/grad_probe_image-2.png +0 -0
  58. msprobe/docs/img/grad_probe_image-3.png +0 -0
  59. msprobe/docs/img/grad_probe_image-4.png +0 -0
  60. msprobe/docs/img/grad_probe_image.png +0 -0
  61. msprobe/mindspore/__init__.py +1 -1
  62. msprobe/mindspore/api_accuracy_checker/__init__.py +0 -0
  63. msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +255 -0
  64. msprobe/mindspore/api_accuracy_checker/api_info.py +69 -0
  65. msprobe/mindspore/api_accuracy_checker/api_runner.py +156 -0
  66. msprobe/mindspore/api_accuracy_checker/base_compare_algorithm.py +197 -0
  67. msprobe/mindspore/api_accuracy_checker/cmd_parser.py +6 -0
  68. msprobe/mindspore/api_accuracy_checker/compute_element.py +239 -0
  69. msprobe/mindspore/api_accuracy_checker/main.py +9 -0
  70. msprobe/mindspore/api_accuracy_checker/type_mapping.py +114 -0
  71. msprobe/mindspore/api_accuracy_checker/utils.py +80 -0
  72. msprobe/mindspore/cell_processor.py +34 -0
  73. msprobe/mindspore/common/const.py +106 -0
  74. msprobe/mindspore/common/log.py +38 -0
  75. msprobe/mindspore/common/utils.py +81 -0
  76. msprobe/mindspore/compare/distributed_compare.py +75 -0
  77. msprobe/mindspore/compare/ms_compare.py +219 -0
  78. msprobe/mindspore/compare/ms_graph_compare.py +348 -0
  79. msprobe/mindspore/compare/ms_to_pt_api.yaml +399 -0
  80. msprobe/mindspore/debugger/debugger_config.py +66 -51
  81. msprobe/mindspore/debugger/precision_debugger.py +126 -32
  82. msprobe/mindspore/dump/dump_tool_factory.py +35 -38
  83. msprobe/mindspore/dump/hook_cell/api_registry.py +118 -0
  84. msprobe/mindspore/dump/hook_cell/hook_cell.py +55 -0
  85. msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml +922 -0
  86. msprobe/mindspore/dump/hook_cell/wrap_api.py +113 -0
  87. msprobe/mindspore/dump/jit_dump.py +72 -0
  88. msprobe/mindspore/dump/kernel_graph_dump.py +59 -60
  89. msprobe/mindspore/dump/kernel_kbyk_dump.py +64 -0
  90. msprobe/mindspore/free_benchmark/__init__.py +0 -0
  91. msprobe/mindspore/free_benchmark/api_pynative_self_check.py +116 -0
  92. msprobe/mindspore/free_benchmark/common/__init__.py +0 -0
  93. msprobe/mindspore/free_benchmark/common/config.py +12 -0
  94. msprobe/mindspore/free_benchmark/common/handler_params.py +17 -0
  95. msprobe/mindspore/free_benchmark/common/utils.py +71 -0
  96. msprobe/mindspore/free_benchmark/data/support_wrap_ops.yaml +842 -0
  97. msprobe/mindspore/free_benchmark/decorator/__init__.py +0 -0
  98. msprobe/mindspore/free_benchmark/decorator/dec_forward.py +43 -0
  99. msprobe/mindspore/free_benchmark/decorator/decorator_factory.py +107 -0
  100. msprobe/mindspore/free_benchmark/handler/__init__.py +0 -0
  101. msprobe/mindspore/free_benchmark/handler/base_handler.py +90 -0
  102. msprobe/mindspore/free_benchmark/handler/check_handler.py +41 -0
  103. msprobe/mindspore/free_benchmark/handler/fix_handler.py +36 -0
  104. msprobe/mindspore/free_benchmark/handler/handler_factory.py +21 -0
  105. msprobe/mindspore/free_benchmark/perturbation/add_noise.py +67 -0
  106. msprobe/mindspore/free_benchmark/perturbation/base_perturbation.py +21 -0
  107. msprobe/mindspore/free_benchmark/perturbation/bit_noise.py +63 -0
  108. msprobe/mindspore/free_benchmark/perturbation/exchange_value.py +51 -0
  109. msprobe/mindspore/free_benchmark/perturbation/improve_precision.py +35 -0
  110. msprobe/mindspore/free_benchmark/perturbation/no_change.py +12 -0
  111. msprobe/mindspore/free_benchmark/perturbation/perturbation_factory.py +29 -0
  112. msprobe/mindspore/free_benchmark/self_check_tool_factory.py +33 -0
  113. msprobe/mindspore/grad_probe/__init__.py +0 -0
  114. msprobe/mindspore/grad_probe/global_context.py +90 -0
  115. msprobe/mindspore/grad_probe/grad_analyzer.py +231 -0
  116. msprobe/mindspore/grad_probe/grad_monitor.py +27 -0
  117. msprobe/mindspore/grad_probe/grad_stat_csv.py +132 -0
  118. msprobe/mindspore/grad_probe/hook.py +94 -0
  119. msprobe/mindspore/grad_probe/utils.py +30 -0
  120. msprobe/mindspore/ms_config.py +128 -78
  121. msprobe/mindspore/overflow_check/kernel_graph_overflow_check.py +44 -45
  122. msprobe/mindspore/overflow_check/overflow_check_tool_factory.py +34 -32
  123. msprobe/mindspore/runtime.py +4 -0
  124. msprobe/mindspore/service.py +378 -0
  125. msprobe/mindspore/task_handler_factory.py +24 -21
  126. msprobe/msprobe.py +105 -67
  127. msprobe/pytorch/__init__.py +4 -4
  128. msprobe/pytorch/api_accuracy_checker/common/config.py +53 -50
  129. msprobe/pytorch/api_accuracy_checker/common/utils.py +214 -224
  130. msprobe/pytorch/api_accuracy_checker/compare/algorithm.py +213 -216
  131. msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +606 -545
  132. msprobe/pytorch/api_accuracy_checker/compare/api_precision_standard.yaml +132 -132
  133. msprobe/pytorch/api_accuracy_checker/compare/api_precision_threshold.yaml +390 -390
  134. msprobe/pytorch/api_accuracy_checker/compare/compare.py +386 -345
  135. msprobe/pytorch/api_accuracy_checker/compare/compare_column.py +73 -73
  136. msprobe/pytorch/api_accuracy_checker/compare/compare_utils.py +245 -248
  137. msprobe/pytorch/api_accuracy_checker/config.yaml +10 -4
  138. msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py +335 -328
  139. msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +200 -203
  140. msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +133 -127
  141. msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +592 -493
  142. msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py +70 -7
  143. msprobe/pytorch/api_accuracy_checker/run_ut/torch_ut_setting.json +7 -4
  144. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/__init__.py +0 -0
  145. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py +197 -0
  146. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +325 -0
  147. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +204 -0
  148. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +219 -0
  149. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/ssl_config.py +10 -0
  150. msprobe/pytorch/bench_functions/__init__.py +15 -0
  151. msprobe/pytorch/bench_functions/apply_adam_w.py +28 -0
  152. msprobe/pytorch/bench_functions/confusion_transpose.py +19 -0
  153. msprobe/pytorch/bench_functions/fast_gelu.py +55 -0
  154. msprobe/pytorch/bench_functions/layer_norm_eval.py +6 -0
  155. msprobe/pytorch/bench_functions/linear.py +12 -0
  156. msprobe/pytorch/bench_functions/matmul_backward.py +48 -0
  157. msprobe/pytorch/bench_functions/npu_fusion_attention.py +509 -0
  158. msprobe/pytorch/bench_functions/rms_norm.py +15 -0
  159. msprobe/pytorch/bench_functions/rotary_mul.py +52 -0
  160. msprobe/pytorch/bench_functions/scaled_mask_softmax.py +26 -0
  161. msprobe/pytorch/bench_functions/swiglu.py +55 -0
  162. msprobe/pytorch/common/__init__.py +2 -2
  163. msprobe/pytorch/common/compare_script.template +14 -14
  164. msprobe/pytorch/common/log.py +20 -31
  165. msprobe/pytorch/common/parse_json.py +39 -37
  166. msprobe/pytorch/common/utils.py +305 -224
  167. msprobe/pytorch/compare/distributed_compare.py +66 -111
  168. msprobe/pytorch/compare/mapping.yaml +607 -607
  169. msprobe/pytorch/compare/match.py +34 -36
  170. msprobe/pytorch/compare/pt_compare.py +50 -0
  171. msprobe/pytorch/debugger/debugger_config.py +95 -86
  172. msprobe/pytorch/debugger/precision_debugger.py +125 -95
  173. msprobe/pytorch/free_benchmark/__init__.py +8 -8
  174. msprobe/pytorch/free_benchmark/common/constant.py +70 -67
  175. msprobe/pytorch/free_benchmark/common/counter.py +71 -71
  176. msprobe/pytorch/free_benchmark/common/enums.py +37 -37
  177. msprobe/pytorch/free_benchmark/common/params.py +129 -129
  178. msprobe/pytorch/free_benchmark/common/utils.py +102 -98
  179. msprobe/pytorch/free_benchmark/compare/grad_saver.py +179 -183
  180. msprobe/pytorch/free_benchmark/compare/single_benchmark.py +104 -104
  181. msprobe/pytorch/free_benchmark/main.py +105 -102
  182. msprobe/pytorch/free_benchmark/perturbed_layers/base_layer.py +13 -13
  183. msprobe/pytorch/free_benchmark/perturbed_layers/layer_factory.py +41 -41
  184. msprobe/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py +90 -90
  185. msprobe/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py +104 -104
  186. msprobe/pytorch/free_benchmark/perturbed_layers/npu/change_value.py +63 -63
  187. msprobe/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py +68 -68
  188. msprobe/pytorch/free_benchmark/perturbed_layers/npu/no_change.py +28 -28
  189. msprobe/pytorch/free_benchmark/perturbed_layers/npu/npu_base_layser.py +45 -45
  190. msprobe/pytorch/free_benchmark/perturbed_layers/run_cpu.py +19 -19
  191. msprobe/pytorch/free_benchmark/result_handlers/base_handler.py +217 -203
  192. msprobe/pytorch/free_benchmark/result_handlers/check_handler.py +39 -39
  193. msprobe/pytorch/free_benchmark/result_handlers/fix_handler.py +23 -23
  194. msprobe/pytorch/free_benchmark/result_handlers/handler_factory.py +30 -31
  195. msprobe/pytorch/free_benchmark/result_handlers/preheat_handler.py +170 -170
  196. msprobe/pytorch/function_factory.py +76 -0
  197. msprobe/pytorch/functional/dump_module.py +39 -39
  198. msprobe/pytorch/grad_probe/__init__.py +0 -0
  199. msprobe/pytorch/grad_probe/grad_monitor.py +91 -0
  200. msprobe/pytorch/grad_probe/grad_stat_csv.py +129 -0
  201. msprobe/pytorch/hook_module/api_registry.py +161 -161
  202. msprobe/pytorch/hook_module/hook_module.py +120 -109
  203. msprobe/pytorch/hook_module/support_wrap_ops.yaml +1879 -1876
  204. msprobe/pytorch/hook_module/utils.py +30 -29
  205. msprobe/pytorch/hook_module/wrap_aten.py +110 -100
  206. msprobe/pytorch/hook_module/wrap_distributed.py +78 -75
  207. msprobe/pytorch/hook_module/wrap_functional.py +105 -108
  208. msprobe/pytorch/hook_module/wrap_npu_custom.py +93 -73
  209. msprobe/pytorch/hook_module/wrap_tensor.py +71 -72
  210. msprobe/pytorch/hook_module/wrap_torch.py +86 -88
  211. msprobe/pytorch/hook_module/wrap_vf.py +62 -64
  212. msprobe/pytorch/module_processer.py +138 -98
  213. msprobe/pytorch/online_dispatch/__init__.py +20 -20
  214. msprobe/pytorch/online_dispatch/compare.py +236 -236
  215. msprobe/pytorch/online_dispatch/dispatch.py +271 -273
  216. msprobe/pytorch/online_dispatch/dump_compare.py +155 -186
  217. msprobe/pytorch/online_dispatch/single_compare.py +391 -391
  218. msprobe/pytorch/online_dispatch/torch_ops_config.yaml +49 -49
  219. msprobe/pytorch/online_dispatch/utils.py +130 -187
  220. msprobe/pytorch/parse.py +4 -4
  221. msprobe/pytorch/parse_tool/cli.py +32 -32
  222. msprobe/pytorch/parse_tool/lib/compare.py +260 -259
  223. msprobe/pytorch/parse_tool/lib/config.py +52 -51
  224. msprobe/pytorch/parse_tool/lib/file_desc.py +31 -31
  225. msprobe/pytorch/parse_tool/lib/interactive_cli.py +102 -102
  226. msprobe/pytorch/parse_tool/lib/parse_exception.py +54 -54
  227. msprobe/pytorch/parse_tool/lib/parse_tool.py +158 -158
  228. msprobe/pytorch/parse_tool/lib/utils.py +316 -367
  229. msprobe/pytorch/parse_tool/lib/visualization.py +85 -90
  230. msprobe/pytorch/pt_config.py +188 -93
  231. msprobe/pytorch/service.py +246 -167
  232. mindstudio_probe-1.0.1.dist-info/RECORD +0 -228
  233. msprobe/config/README.md +0 -397
  234. msprobe/mindspore/doc/dump.md +0 -65
  235. msprobe/mindspore/dump/api_kbk_dump.py +0 -55
  236. msprobe/pytorch/compare/acc_compare.py +0 -1024
  237. msprobe/pytorch/compare/highlight.py +0 -100
  238. msprobe/pytorch/doc/FAQ.md +0 -193
  239. msprobe/pytorch/doc/api_accuracy_checker.md +0 -269
  240. msprobe/pytorch/doc/atat/321/207/342/226/223/342/225/233/321/205/342/225/221/320/266/321/205/342/225/226/320/265/321/205/320/225/342/225/226/321/206/320/245/342/226/221/321/206/320/235/320/276dump/321/206/320/260/320/227/321/205/320/227/320/226/321/206/320/220/320/267/321/210/320/223/342/225/234/321/205/320/257/342/225/221/321/207/342/225/221/342/224/220/321/206/320/232/320/265/321/205/320/241/320/232.md +0 -182
  241. msprobe/pytorch/doc/dump.md +0 -207
  242. msprobe/pytorch/doc/ptdbg_ascend_compare.md +0 -176
  243. msprobe/pytorch/doc/ptdbg_ascend_overview.md +0 -68
  244. msprobe/pytorch/doc/ptdbg_ascend_quickstart.md +0 -381
  245. msprobe/pytorch/doc/run_overflow_check.md +0 -25
  246. msprobe/pytorch/doc//321/205/320/254/320/270/321/207/342/225/221/342/224/220/321/207/342/226/223/342/225/233/321/205/342/225/221/320/266/321/206/320/277/320/244/321/205/320/277/342/225/243.md +0 -90
  247. msprobe/test/core_ut/common/test_utils.py +0 -345
  248. msprobe/test/core_ut/data_dump/test_data_collector.py +0 -47
  249. msprobe/test/core_ut/data_dump/test_json_writer.py +0 -183
  250. msprobe/test/core_ut/data_dump/test_scope.py +0 -151
  251. msprobe/test/core_ut/test_common_config.py +0 -152
  252. msprobe/test/core_ut/test_file_check.py +0 -218
  253. msprobe/test/core_ut/test_log.py +0 -109
  254. msprobe/test/mindspore_ut/test_api_kbk_dump.py +0 -51
  255. msprobe/test/mindspore_ut/test_debugger_config.py +0 -42
  256. msprobe/test/mindspore_ut/test_dump_tool_factory.py +0 -51
  257. msprobe/test/mindspore_ut/test_kernel_graph_dump.py +0 -66
  258. msprobe/test/mindspore_ut/test_kernel_graph_overflow_check.py +0 -63
  259. msprobe/test/mindspore_ut/test_ms_config.py +0 -69
  260. msprobe/test/mindspore_ut/test_overflow_check_tool_factory.py +0 -51
  261. msprobe/test/mindspore_ut/test_precision_debugger.py +0 -56
  262. msprobe/test/mindspore_ut/test_task_handler_factory.py +0 -58
  263. msprobe/test/pytorch_ut/advisor/test_advisor.py +0 -83
  264. msprobe/test/pytorch_ut/api_accuracy_checker/common/test_common_utils.py +0 -108
  265. msprobe/test/pytorch_ut/api_accuracy_checker/common/test_config.py +0 -39
  266. msprobe/test/pytorch_ut/api_accuracy_checker/compare/test_algorithm.py +0 -112
  267. msprobe/test/pytorch_ut/api_accuracy_checker/compare/test_api_precision_compare.py +0 -77
  268. msprobe/test/pytorch_ut/api_accuracy_checker/compare/test_compare.py +0 -125
  269. msprobe/test/pytorch_ut/api_accuracy_checker/compare/test_compare_column.py +0 -10
  270. msprobe/test/pytorch_ut/api_accuracy_checker/compare/test_compare_utils.py +0 -43
  271. msprobe/test/pytorch_ut/api_accuracy_checker/run_ut/dump.json +0 -179
  272. msprobe/test/pytorch_ut/api_accuracy_checker/run_ut/forward.json +0 -63
  273. msprobe/test/pytorch_ut/api_accuracy_checker/run_ut/test_data_generate.py +0 -99
  274. msprobe/test/pytorch_ut/api_accuracy_checker/run_ut/test_multi_run_ut.py +0 -115
  275. msprobe/test/pytorch_ut/api_accuracy_checker/run_ut/test_run_ut.py +0 -72
  276. msprobe/test/pytorch_ut/compare/test_acc_compare.py +0 -17
  277. msprobe/test/pytorch_ut/free_benchmark/perturbed_layers/test_perturbed_layser.py +0 -105
  278. msprobe/test/pytorch_ut/free_benchmark/result_handlers/test_result_handler.py +0 -121
  279. msprobe/test/pytorch_ut/free_benchmark/test_main.py +0 -101
  280. msprobe/test/pytorch_ut/functional/test_dump_module.py +0 -15
  281. msprobe/test/pytorch_ut/hook_module/test_api_registry.py +0 -130
  282. msprobe/test/pytorch_ut/hook_module/test_hook_module.py +0 -42
  283. msprobe/test/pytorch_ut/hook_module/test_wrap_aten.py +0 -65
  284. msprobe/test/pytorch_ut/hook_module/test_wrap_distributed.py +0 -35
  285. msprobe/test/pytorch_ut/hook_module/test_wrap_functional.py +0 -20
  286. msprobe/test/pytorch_ut/hook_module/test_wrap_tensor.py +0 -35
  287. msprobe/test/pytorch_ut/hook_module/test_wrap_torch.py +0 -43
  288. msprobe/test/pytorch_ut/hook_module/test_wrap_vf.py +0 -11
  289. msprobe/test/pytorch_ut/test_pt_config.py +0 -69
  290. msprobe/test/pytorch_ut/test_service.py +0 -59
  291. msprobe/test/resources/advisor.txt +0 -3
  292. msprobe/test/resources/compare_result_20230703104808.csv +0 -9
  293. msprobe/test/resources/compare_result_without_accuracy.csv +0 -9
  294. msprobe/test/resources/config.yaml +0 -3
  295. msprobe/test/resources/npu_test.pkl +0 -8
  296. msprobe/test/run_test.sh +0 -30
  297. msprobe/test/run_ut.py +0 -58
  298. msprobe/test/test_module_processer.py +0 -64
  299. {mindstudio_probe-1.0.1.dist-info → mindstudio_probe-1.0.4.dist-info}/top_level.txt +0 -0
  300. /msprobe/{pytorch/doc → docs}/img/BLOOM-7B_1.png +0 -0
  301. /msprobe/{pytorch/doc → docs}/img/BLOOM-7B_2.png +0 -0
  302. /msprobe/{pytorch/doc → docs}/img/BLOOM-7B_3.png +0 -0
  303. /msprobe/{pytorch/doc → docs}/img/BLOOM-7B_4.png +0 -0
  304. /msprobe/{pytorch/doc → docs}/img/GPT-3_1.png +0 -0
  305. /msprobe/{pytorch/doc → docs}/img/GPT-3_2.png +0 -0
  306. /msprobe/{pytorch/doc → docs}/img/GPT-3_3.png +0 -0
  307. /msprobe/{pytorch/doc → docs}/img/GPT-3_4.png +0 -0
  308. /msprobe/{pytorch/doc → docs}/img/GPT-3_5.png +0 -0
  309. /msprobe/{pytorch/doc → docs}/img/GPT-3_6.png +0 -0
  310. /msprobe/{pytorch/doc → docs}/img/GPT-3_7.png +0 -0
  311. /msprobe/{pytorch/doc → docs}/img/GPT-3_8.png +0 -0
  312. /msprobe/{pytorch/doc → docs}/img/YOLOV5S_1.png +0 -0
  313. /msprobe/{pytorch/doc → docs}/img/YOLOV5S_2.png +0 -0
  314. /msprobe/{pytorch/doc → docs}/img/accuracy_checking_details.png +0 -0
  315. /msprobe/{pytorch/doc → docs}/img/accuracy_checking_result.png +0 -0
  316. /msprobe/{pytorch/doc → docs}/img/api_precision_compare_details.png +0 -0
  317. /msprobe/{pytorch/doc → docs}/img/api_precision_compare_result.png +0 -0
  318. /msprobe/{pytorch/doc → docs}/img/auto_analyze_log.png +0 -0
  319. /msprobe/{pytorch/doc → docs}/img/compare_result_pkl.png +0 -0
  320. /msprobe/{pytorch/doc → docs}/img/compare_result_pkl_md5.png.png +0 -0
  321. /msprobe/{pytorch/doc → docs}/img/cpu_info.png +0 -0
  322. /msprobe/{config → docs}/img/free_benchmark.png +0 -0
  323. /msprobe/{pytorch/doc → docs}/img/module_compare.png +0 -0
@@ -1,39 +1,39 @@
1
- import torch.nn as nn
2
- from msprobe.pytorch.common.log import logger
3
- from msprobe.core.common.const import Const
4
- from msprobe.pytorch.hook_module.api_registry import api_register
5
- from msprobe.pytorch.debugger.precision_debugger import PrecisionDebugger
6
- from msprobe.core.common.exceptions import MsaccException
7
- from msprobe.core.data_dump.scope import BaseScope
8
-
9
- module_count = {}
10
-
11
-
12
- def module_dump(module, dump_name):
13
- if not isinstance(module, nn.Module):
14
- logger.error("The parameter:module in module_dump is not a Module subclass.")
15
- raise MsaccException(MsaccException.INVALID_PARAM_ERROR)
16
- if not isinstance(dump_name, str):
17
- logger.error("The parameter:dump_name in module_dump is not a str type.")
18
- raise MsaccException(MsaccException.INVALID_PARAM_ERROR)
19
- api_register.api_originality()
20
- if dump_name not in module_count:
21
- module_count[dump_name] = 0
22
- else:
23
- module_count[dump_name] += 1
24
- dump_name = dump_name + Const.SEP + str(module_count.get(dump_name)) + Const.SEP
25
-
26
- pdg = PrecisionDebugger()
27
- _, forward_hook, backward_hook = pdg.service.build_hook(BaseScope.Module_Type_Module, dump_name)
28
- module.register_forward_hook(forward_hook, with_kwargs=True)
29
- module.register_full_backward_hook(backward_hook)
30
-
31
- module.register_forward_pre_hook(pdg.service.module_processor.node_hook(dump_name + Const.FORWARD, Const.START))
32
- module.register_forward_hook(pdg.service.module_processor.node_hook(dump_name + Const.FORWARD, Const.STOP))
33
- module.register_full_backward_pre_hook(
34
- pdg.service.module_processor.node_hook(dump_name + Const.BACKWARD, Const.START))
35
- module.register_full_backward_hook(pdg.service.module_processor.node_hook(dump_name + Const.BACKWARD, Const.STOP))
36
-
37
-
38
- def module_dump_end():
39
- api_register.api_modularity()
1
+ import torch.nn as nn
2
+ from msprobe.pytorch.common.log import logger
3
+ from msprobe.core.common.const import Const
4
+ from msprobe.pytorch.hook_module.api_registry import api_register
5
+ from msprobe.pytorch.debugger.precision_debugger import PrecisionDebugger
6
+ from msprobe.core.common.exceptions import MsprobeException
7
+ from msprobe.core.data_dump.scope import BaseScope
8
+
9
+ module_count = {}
10
+
11
+
12
+ def module_dump(module, dump_name):
13
+ if not isinstance(module, nn.Module):
14
+ logger.error("The parameter:module in module_dump is not a Module subclass.")
15
+ raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR)
16
+ if not isinstance(dump_name, str):
17
+ logger.error("The parameter:dump_name in module_dump is not a str type.")
18
+ raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR)
19
+ api_register.api_originality()
20
+ if dump_name not in module_count:
21
+ module_count[dump_name] = 0
22
+ else:
23
+ module_count[dump_name] += 1
24
+ dump_name = dump_name + Const.SEP + str(module_count.get(dump_name)) + Const.SEP
25
+
26
+ pdg = PrecisionDebugger()
27
+ _, forward_hook, backward_hook, _ = pdg.service.build_hook(BaseScope.Module_Type_Module, dump_name)
28
+ module.register_forward_hook(forward_hook, with_kwargs=True)
29
+ module.register_full_backward_hook(backward_hook)
30
+
31
+ module.register_forward_pre_hook(pdg.service.module_processor.node_hook(dump_name + Const.FORWARD, Const.START))
32
+ module.register_forward_hook(pdg.service.module_processor.node_hook(dump_name + Const.FORWARD, Const.STOP))
33
+ module.register_full_backward_pre_hook(
34
+ pdg.service.module_processor.node_hook(dump_name + Const.BACKWARD, Const.START))
35
+ module.register_full_backward_hook(pdg.service.module_processor.node_hook(dump_name + Const.BACKWARD, Const.STOP))
36
+
37
+
38
+ def module_dump_end():
39
+ api_register.api_modularity()
File without changes
@@ -0,0 +1,91 @@
1
+ import os
2
+ from collections import defaultdict
3
+
4
+ import torch
5
+ if int(torch.__version__.split('.')[0]) >= 2:
6
+ from torch.optim.optimizer import register_optimizer_step_pre_hook
7
+ from msprobe.pytorch.grad_probe.grad_stat_csv import GradStatCsv
8
+ from msprobe.core.grad_probe.utils import check_numeral_list_ascend, data_in_list_target
9
+ from msprobe.core.grad_probe.constant import level_adp
10
+ from msprobe.pytorch.common.log import logger
11
+ from msprobe.core.common.file_utils import remove_path, save_npy, write_csv, create_directory
12
+ from msprobe.pytorch.common.utils import get_rank_id, print_rank_0
13
+
14
+
15
+ class GradientMonitor:
16
+
17
+ def __init__(self, common_config, task_config):
18
+ level = task_config.grad_level
19
+ if level not in level_adp:
20
+ raise Exception(f"level is valid, not in {level_adp.keys()}")
21
+ self._level_adp = level_adp[level]
22
+ self._param_list = task_config.param_list
23
+ self._target_ranks = common_config.rank
24
+ logger.info(f"target rank {self._target_ranks}")
25
+ self._target_step = common_config.step
26
+ logger.info(f"target step {self._target_step}")
27
+ self._bounds = task_config.bounds
28
+ check_numeral_list_ascend(self._bounds)
29
+ self._output_path = common_config.dump_path
30
+ if not os.path.exists(self._output_path):
31
+ create_directory(self._output_path)
32
+ else:
33
+ logger.warning(f"the file in {self._output_path} will be recoverd")
34
+ self._step = -1
35
+ self._param2name = defaultdict(str)
36
+
37
+ @property
38
+ def output_path(self):
39
+ return self._output_path
40
+
41
+ @staticmethod
42
+ def save_grad_direction(param_name, grad, save_path):
43
+ if not os.path.exists(save_path):
44
+ create_directory(save_path)
45
+ param_grad = grad.clone().detach()
46
+ is_positive = param_grad > 0
47
+ save_filepath = os.path.join(save_path, f"{param_name}.npy")
48
+ save_npy(is_positive.cpu().numpy(), save_filepath)
49
+
50
+ def monitor(self, model):
51
+ print_rank_0("> parameter names:")
52
+ for name, param in model.named_parameters():
53
+ self._param2name[param] = name
54
+ print_rank_0(f"\t{name}")
55
+ setattr(self, "_rank", get_rank_id())
56
+ if torch.distributed.is_initialized() and not data_in_list_target(getattr(self, "_rank"), self._target_ranks):
57
+ return
58
+ self._hook_optimizer()
59
+
60
+ def _hook_optimizer(self):
61
+ def optimizer_pre_step_hook(optimizer, args, kargs):
62
+ self._step += 1
63
+ logger.info(f"grad_probe: optimizer step {self._step}")
64
+ if not data_in_list_target(self._step, self._target_step):
65
+ return
66
+ output_lines = []
67
+ for param, param_name in self._param2name.items():
68
+ if not data_in_list_target(param_name, self._param_list):
69
+ continue
70
+ grad = param.main_grad if hasattr(param, "main_grad") else param.grad
71
+ if grad is None:
72
+ logger.info(f"grad is None: {param_name}")
73
+ continue
74
+ grad_info = GradStatCsv.generate_csv_line(param_name, self._level_adp, grad, self._bounds)
75
+ output_lines.append(grad_info)
76
+ if self._level_adp["have_grad_direction"]:
77
+ GradientMonitor.save_grad_direction(param_name, grad,
78
+ f'{self._output_path}/rank{self._rank}/step{self._step}')
79
+ output_dirpath = os.path.join(self._output_path, f"rank{getattr(self, '_rank')}")
80
+ if not os.path.isdir(output_dirpath):
81
+ create_directory(output_dirpath)
82
+ output_path = os.path.join(output_dirpath, f"grad_summary_{self._step}.csv")
83
+ if os.path.exists(output_path):
84
+ logger.warning(f"{output_path} will be recoverd")
85
+ remove_path(output_path)
86
+ header_result = GradStatCsv.generate_csv_header(self._level_adp, self._bounds)
87
+ output_lines.insert(0, header_result)
88
+ write_csv(output_lines, output_path)
89
+ logger.info(f"write grad data to {output_path}")
90
+ if int(torch.__version__.split('.')[0]) >= 2:
91
+ register_optimizer_step_pre_hook(optimizer_pre_step_hook)
@@ -0,0 +1,129 @@
1
+ from abc import ABC, abstractmethod
2
+ from collections import namedtuple
3
+ import hashlib
4
+ import torch
5
+ from msprobe.core.grad_probe.constant import GradConst
6
+
7
+ CSV_header_input = namedtuple("CSV_header_input", ["bounds"])
8
+ CSV_content_input = namedtuple("CSV_content_input", ["grad", "bounds"])
9
+
10
+
11
+ class GradStatCsv:
12
+ csv = {}
13
+
14
+ @staticmethod
15
+ def generate_csv_header(level, bounds):
16
+ header = ["param_name"]
17
+ for key in level["header"]:
18
+ csv_header_input = CSV_header_input(bounds=bounds)
19
+ header.extend(GradStatCsv.csv[key].generate_csv_header(csv_header_input))
20
+ return header
21
+
22
+ @staticmethod
23
+ def generate_csv_line(param_name, level, grad, bounds):
24
+ line = [param_name]
25
+ for key in level["header"]:
26
+ csv_content_input = CSV_content_input(grad=grad, bounds=bounds)
27
+ line.extend(GradStatCsv.csv[key].generate_csv_content(csv_content_input))
28
+ return line
29
+
30
+
31
+ def register_csv_item(key, cls=None):
32
+ if cls is None:
33
+ # 无参数时,返回装饰器函数
34
+ return lambda cls: register_csv_item(key, cls)
35
+ GradStatCsv.csv[key] = cls
36
+ return cls
37
+
38
+
39
+ class CsvItem(ABC):
40
+ @abstractmethod
41
+ def generate_csv_header(csv_header_input):
42
+ pass
43
+
44
+ @abstractmethod
45
+ def generate_csv_content(csv_content_input):
46
+ pass
47
+
48
+
49
+ @register_csv_item(GradConst.MD5)
50
+ class CSV_md5(CsvItem):
51
+ def generate_csv_header(csv_header_input):
52
+ return ["MD5"]
53
+
54
+ def generate_csv_content(csv_content_input):
55
+ grad = csv_content_input.grad
56
+ tensor_bytes = grad.cpu().detach().float().numpy().tobytes()
57
+ md5_hash = hashlib.md5(tensor_bytes)
58
+ return [md5_hash.hexdigest()]
59
+
60
+
61
+ @register_csv_item(GradConst.DISTRIBUTION)
62
+ class CSV_distribution(CsvItem):
63
+ def generate_csv_header(csv_header_input):
64
+ bounds = csv_header_input.bounds
65
+ intervals = []
66
+ if bounds:
67
+ intervals.append(f"(-inf, {bounds[0]}]")
68
+ for i in range(1, len(bounds)):
69
+ intervals.append(f"({bounds[i-1]}, {bounds[i]}]")
70
+ if intervals:
71
+ intervals.append(f"({bounds[-1]}, inf)")
72
+ intervals.append("=0")
73
+
74
+ return intervals
75
+
76
+ def generate_csv_content(csv_content_input):
77
+ grad = csv_content_input.grad
78
+ bounds = csv_content_input.bounds
79
+ grad = grad.cpu().detach()
80
+ if grad.dtype == torch.bfloat16:
81
+ grad = grad.to(torch.float32)
82
+ element_num = grad.numel()
83
+ grad_equal_0_num = (grad == 0).sum().item()
84
+ bound = torch.Tensor(bounds)
85
+ bucketsize_result = torch.bucketize(grad, bound)
86
+ interval_nums = [(bucketsize_result == i).sum().item() for i in range(len(bound) + 1)]
87
+ interval_nums.append(grad_equal_0_num)
88
+ return_list = [x / element_num if element_num != 0 else 0 for x in interval_nums]
89
+ return return_list
90
+
91
+
92
+ @register_csv_item(GradConst.MAX)
93
+ class CSV_max(CsvItem):
94
+ def generate_csv_header(csv_header_input):
95
+ return ["max"]
96
+
97
+ def generate_csv_content(csv_content_input):
98
+ grad = csv_content_input.grad
99
+ return [torch.max(grad).cpu().detach().float().numpy().tolist()]
100
+
101
+
102
+ @register_csv_item(GradConst.MIN)
103
+ class CSV_max(CsvItem):
104
+ def generate_csv_header(csv_header_input):
105
+ return ["min"]
106
+
107
+ def generate_csv_content(csv_content_input):
108
+ grad = csv_content_input.grad
109
+ return [torch.min(grad).cpu().detach().float().numpy().tolist()]
110
+
111
+
112
+ @register_csv_item(GradConst.NORM)
113
+ class CSV_max(CsvItem):
114
+ def generate_csv_header(csv_header_input):
115
+ return ["norm"]
116
+
117
+ def generate_csv_content(csv_content_input):
118
+ grad = csv_content_input.grad
119
+ return [torch.norm(grad).cpu().detach().float().numpy().tolist()]
120
+
121
+
122
+ @register_csv_item(GradConst.SHAPE)
123
+ class CSV_shape(CsvItem):
124
+ def generate_csv_header(csv_header_input):
125
+ return ["shape"]
126
+
127
+ def generate_csv_content(csv_content_input):
128
+ grad = csv_content_input.grad
129
+ return [list(grad.shape)]
@@ -1,161 +1,161 @@
1
- #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
- """
4
- # Copyright (C) 2022-2023. Huawei Technologies Co., Ltd. All rights reserved.
5
- # Licensed under the Apache License, Version 2.0 (the "License");
6
- # you may not use this file except in compliance with the License.
7
- # You may obtain a copy of the License at
8
- #
9
- # http://www.apache.org/licenses/LICENSE-2.0
10
- #
11
- # Unless required by applicable law or agreed to in writing, software
12
- # distributed under the License is distributed on an "AS IS" BASIS,
13
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
- # See the License for the specific language governing permissions and
15
- # limitations under the License.
16
- """
17
-
18
- import torch
19
- import torch.distributed as dist
20
-
21
- from msprobe.pytorch.hook_module import wrap_torch, wrap_functional, wrap_tensor, wrap_vf, wrap_distributed, wrap_aten
22
- from msprobe.pytorch.hook_module.wrap_aten import get_aten_ops
23
- from msprobe.pytorch.hook_module.wrap_distributed import get_distributed_ops
24
- from msprobe.pytorch.hook_module.wrap_functional import get_functional_ops
25
- from msprobe.pytorch.hook_module.wrap_tensor import get_tensor_ops
26
- from msprobe.pytorch.hook_module.wrap_torch import get_torch_ops
27
- from msprobe.pytorch.hook_module.wrap_vf import get_vf_ops
28
- from msprobe.pytorch.common.utils import torch_without_guard_version, npu_distributed_api, is_gpu
29
- from msprobe.core.common.const import Const
30
-
31
# Flag used below to decide whether torch.ops.aten is hooked.  Anything after
# a "+" in the version string is dropped before comparing.
# NOTE(review): this is a lexicographic string comparison, not a numeric one.
torch_version_above_2 = '2.0' < torch.__version__.partition('+')[0]
32
-
33
- if not is_gpu:
34
- import torch_npu
35
- from . import wrap_npu_custom
36
- from .wrap_npu_custom import get_npu_ops
37
-
38
-
39
class ApiRegistry:
    """Book-keeping for original and hook-wrapped API attributes.

    For every API group handled here (``torch.Tensor``, ``torch``,
    ``torch.nn.functional``, ``torch.distributed``, ``torch.ops.aten``,
    ``torch._VF`` and, when ``is_gpu`` is false, ``torch_npu``) two
    dictionaries are kept:

    * ``<group>_ori_attr``  maps an API name to the attribute found on the
      group when :meth:`initialize_hook` ran;
    * ``<group>_hook_attr`` maps the same API name to the attribute taken
      from the matching ``HOOK*`` wrapper class.

    :meth:`api_modularity` installs the hook attributes on the groups and
    :meth:`api_originality` puts the stored originals back.
    """

    def __init__(self):
        self.tensor_ori_attr = {}
        self.torch_ori_attr = {}
        self.functional_ori_attr = {}
        self.distributed_ori_attr = {}
        self.npu_distributed_ori_attr = {}
        self.vf_ori_attr = {}
        self.aten_ori_attr = {}
        self.torch_npu_ori_attr = {}

        self.tensor_hook_attr = {}
        self.torch_hook_attr = {}
        self.functional_hook_attr = {}
        self.distributed_hook_attr = {}
        self.npu_distributed_hook_attr = {}
        self.vf_hook_attr = {}
        self.aten_hook_attr = {}
        self.torch_npu_hook_attr = {}

    @staticmethod
    def store_ori_attr(ori_api_group, api_list, api_ori_attr):
        """Record the current attribute of every API in ``api_list``.

        Args:
            ori_api_group: object (module or class) the APIs live on.
            api_list: iterable of API names; a name may contain a dot, in
                which case the part before the last dot names a sub-module
                of ``ori_api_group``.
            api_ori_attr: dictionary filled in place, keyed by API name.

        Raises:
            AttributeError: if a sub-module or API is missing on the group.
        """
        for api in api_list:
            if '.' in api:
                sub_module_name, sub_op = api.rsplit('.', 1)
                sub_module = getattr(ori_api_group, sub_module_name)
                api_ori_attr[api] = getattr(sub_module, sub_op)
            else:
                api_ori_attr[api] = getattr(ori_api_group, api)

    @staticmethod
    def set_api_attr(api_group, attr_dict):
        """Assign every ``name -> attribute`` pair of ``attr_dict`` on ``api_group``.

        Dotted names are resolved like in :meth:`store_ori_attr`, except
        that an entry whose sub-module does not exist is skipped silently.
        """
        for api, api_attr in attr_dict.items():
            if '.' in api:
                sub_module_name, sub_op = api.rsplit('.', 1)
                sub_module = getattr(api_group, sub_module_name, None)
                if sub_module is not None:
                    setattr(sub_module, sub_op, api_attr)
            else:
                setattr(api_group, api, api_attr)

    @staticmethod
    def _collect_hook_attr(hook_class):
        """Return ``{api_name: attribute}`` for the prefixed attributes of ``hook_class``.

        Only attributes whose name starts with ``Const.ATTR_NAME_PREFIX`` are
        taken; the prefix is stripped to obtain the API name.  The strip
        length is derived from the prefix itself rather than hard-coded.
        """
        prefix = Const.ATTR_NAME_PREFIX
        prefix_len = len(prefix)
        return {
            attr_name[prefix_len:]: getattr(hook_class, attr_name)
            for attr_name in dir(hook_class)
            if attr_name.startswith(prefix)
        }

    def _install(self, kind):
        """Install the ``kind`` (``"hook"`` or ``"ori"``) tables on every API group."""

        def table(group):
            return getattr(self, f"{group}_{kind}_attr")

        self.set_api_attr(torch.Tensor, table("tensor"))
        self.set_api_attr(torch, table("torch"))
        self.set_api_attr(torch.nn.functional, table("functional"))
        # The distributed table is applied to both the package and its
        # distributed_c10d sub-module.
        self.set_api_attr(dist, table("distributed"))
        self.set_api_attr(dist.distributed_c10d, table("distributed"))
        if not is_gpu and not torch_without_guard_version:
            self.set_api_attr(torch_npu.distributed, table("npu_distributed"))
            self.set_api_attr(torch_npu.distributed.distributed_c10d, table("npu_distributed"))
        if torch_version_above_2:
            self.set_api_attr(torch.ops.aten, table("aten"))
        self.set_api_attr(torch._VF, table("vf"))
        if not is_gpu:
            self.set_api_attr(torch_npu, table("torch_npu"))

    def api_modularity(self):
        """Replace the APIs of every group with their hook-wrapped versions."""
        self._install("hook")

    def api_originality(self):
        """Restore the attributes recorded by :meth:`initialize_hook`."""
        self._install("ori")

    def initialize_hook(self, hook):
        """Record the original APIs and collect their wrapped counterparts.

        For each group the order is: store the originals, call the group's
        ``wrap_*_ops_and_bind(hook)`` helper, then copy the prefixed
        attributes of the group's ``HOOK*`` class into the hook table.
        Nothing is installed on the groups here; that is done by
        :meth:`api_modularity`.
        """
        self.store_ori_attr(torch.Tensor, get_tensor_ops(), self.tensor_ori_attr)
        wrap_tensor.wrap_tensor_ops_and_bind(hook)
        self.tensor_hook_attr.update(self._collect_hook_attr(wrap_tensor.HOOKTensor))

        self.store_ori_attr(torch, get_torch_ops(), self.torch_ori_attr)
        wrap_torch.wrap_torch_ops_and_bind(hook)
        self.torch_hook_attr.update(self._collect_hook_attr(wrap_torch.HOOKTorchOP))

        self.store_ori_attr(torch.nn.functional, get_functional_ops(), self.functional_ori_attr)
        wrap_functional.wrap_functional_ops_and_bind(hook)
        self.functional_hook_attr.update(self._collect_hook_attr(wrap_functional.HOOKFunctionalOP))

        self.store_ori_attr(dist, get_distributed_ops(), self.distributed_ori_attr)
        wrap_distributed.wrap_distributed_ops_and_bind(hook)
        npu_distributed_enabled = not is_gpu and not torch_without_guard_version
        if npu_distributed_enabled:
            self.store_ori_attr(torch_npu.distributed, npu_distributed_api, self.npu_distributed_ori_attr)
        distributed_hooks = self._collect_hook_attr(wrap_distributed.HOOKDistributedOP)
        self.distributed_hook_attr.update(distributed_hooks)
        if npu_distributed_enabled:
            # Only the APIs listed in npu_distributed_api are mirrored.
            self.npu_distributed_hook_attr.update(
                (name, attr) for name, attr in distributed_hooks.items()
                if name in npu_distributed_api
            )

        if torch_version_above_2:
            self.store_ori_attr(torch.ops.aten, get_aten_ops(), self.aten_ori_attr)
            wrap_aten.wrap_aten_ops_and_bind(hook)
            self.aten_hook_attr.update(self._collect_hook_attr(wrap_aten.HOOKAtenOP))

        self.store_ori_attr(torch._VF, get_vf_ops(), self.vf_ori_attr)
        wrap_vf.wrap_vf_ops_and_bind(hook)
        self.vf_hook_attr.update(self._collect_hook_attr(wrap_vf.HOOKVfOP))

        if not is_gpu:
            self.store_ori_attr(torch_npu, get_npu_ops(), self.torch_npu_ori_attr)
            wrap_npu_custom.wrap_npu_ops_and_bind(hook)
            self.torch_npu_hook_attr.update(self._collect_hook_attr(wrap_npu_custom.HOOKNpuOP))


# Module-level singleton shared by importers of this module.
api_register = ApiRegistry()
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ # Copyright (C) 2022-2023. Huawei Technologies Co., Ltd. All rights reserved.
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """
17
+
18
+ import torch
19
+ import torch.distributed as dist
20
+
21
+ from msprobe.pytorch.hook_module import wrap_torch, wrap_functional, wrap_tensor, wrap_vf, wrap_distributed, wrap_aten
22
+ from msprobe.pytorch.hook_module.wrap_aten import get_aten_ops
23
+ from msprobe.pytorch.hook_module.wrap_distributed import get_distributed_ops
24
+ from msprobe.pytorch.hook_module.wrap_functional import get_functional_ops
25
+ from msprobe.pytorch.hook_module.wrap_tensor import get_tensor_ops
26
+ from msprobe.pytorch.hook_module.wrap_torch import get_torch_ops
27
+ from msprobe.pytorch.hook_module.wrap_vf import get_vf_ops
28
+ from msprobe.pytorch.common.utils import torch_without_guard_version, npu_distributed_api, is_gpu
29
+ from msprobe.core.common.const import Const
30
+
31
# Flag used below to decide whether torch.ops.aten is hooked.  Anything after
# a "+" in the version string is dropped before comparing.
# NOTE(review): this is a lexicographic string comparison, not a numeric one.
torch_version_above_2 = '2.0' < torch.__version__.partition('+')[0]
32
+
33
+ if not is_gpu:
34
+ import torch_npu
35
+ from . import wrap_npu_custom
36
+ from .wrap_npu_custom import get_npu_ops
37
+
38
+
39
class ApiRegistry:
    """Book-keeping for original and hook-wrapped API attributes.

    For every API group handled here (``torch.Tensor``, ``torch``,
    ``torch.nn.functional``, ``torch.distributed``, ``torch.ops.aten``,
    ``torch._VF`` and, when ``is_gpu`` is false, ``torch_npu``) two
    dictionaries are kept:

    * ``<group>_ori_attr``  maps an API name to the attribute found on the
      group when :meth:`initialize_hook` ran;
    * ``<group>_hook_attr`` maps the same API name to the attribute taken
      from the matching ``HOOK*`` wrapper class.

    :meth:`api_modularity` installs the hook attributes on the groups and
    :meth:`api_originality` puts the stored originals back.
    """

    def __init__(self):
        self.tensor_ori_attr = {}
        self.torch_ori_attr = {}
        self.functional_ori_attr = {}
        self.distributed_ori_attr = {}
        self.npu_distributed_ori_attr = {}
        self.vf_ori_attr = {}
        self.aten_ori_attr = {}
        self.torch_npu_ori_attr = {}

        self.tensor_hook_attr = {}
        self.torch_hook_attr = {}
        self.functional_hook_attr = {}
        self.distributed_hook_attr = {}
        self.npu_distributed_hook_attr = {}
        self.vf_hook_attr = {}
        self.aten_hook_attr = {}
        self.torch_npu_hook_attr = {}

    @staticmethod
    def store_ori_attr(ori_api_group, api_list, api_ori_attr):
        """Record the current attribute of every API in ``api_list``.

        Args:
            ori_api_group: object (module or class) the APIs live on.
            api_list: iterable of API names; a name may contain a dot, in
                which case the part before the last dot names a sub-module
                of ``ori_api_group``.
            api_ori_attr: dictionary filled in place, keyed by API name.

        Raises:
            AttributeError: if a sub-module or API is missing on the group.
        """
        for api in api_list:
            if '.' in api:
                sub_module_name, sub_op = api.rsplit('.', 1)
                sub_module = getattr(ori_api_group, sub_module_name)
                api_ori_attr[api] = getattr(sub_module, sub_op)
            else:
                api_ori_attr[api] = getattr(ori_api_group, api)

    @staticmethod
    def set_api_attr(api_group, attr_dict):
        """Assign every ``name -> attribute`` pair of ``attr_dict`` on ``api_group``.

        Dotted names are resolved like in :meth:`store_ori_attr`, except
        that an entry whose sub-module does not exist is skipped silently.
        """
        for api, api_attr in attr_dict.items():
            if '.' in api:
                sub_module_name, sub_op = api.rsplit('.', 1)
                sub_module = getattr(api_group, sub_module_name, None)
                if sub_module is not None:
                    setattr(sub_module, sub_op, api_attr)
            else:
                setattr(api_group, api, api_attr)

    @staticmethod
    def _collect_hook_attr(hook_class):
        """Return ``{api_name: attribute}`` for the prefixed attributes of ``hook_class``.

        Only attributes whose name starts with ``Const.ATTR_NAME_PREFIX`` are
        taken; the prefix is stripped to obtain the API name.  The strip
        length is derived from the prefix itself rather than hard-coded.
        """
        prefix = Const.ATTR_NAME_PREFIX
        prefix_len = len(prefix)
        return {
            attr_name[prefix_len:]: getattr(hook_class, attr_name)
            for attr_name in dir(hook_class)
            if attr_name.startswith(prefix)
        }

    def _install(self, kind):
        """Install the ``kind`` (``"hook"`` or ``"ori"``) tables on every API group."""

        def table(group):
            return getattr(self, f"{group}_{kind}_attr")

        self.set_api_attr(torch.Tensor, table("tensor"))
        self.set_api_attr(torch, table("torch"))
        self.set_api_attr(torch.nn.functional, table("functional"))
        # The distributed table is applied to both the package and its
        # distributed_c10d sub-module.
        self.set_api_attr(dist, table("distributed"))
        self.set_api_attr(dist.distributed_c10d, table("distributed"))
        if not is_gpu and not torch_without_guard_version:
            self.set_api_attr(torch_npu.distributed, table("npu_distributed"))
            self.set_api_attr(torch_npu.distributed.distributed_c10d, table("npu_distributed"))
        if torch_version_above_2:
            self.set_api_attr(torch.ops.aten, table("aten"))
        self.set_api_attr(torch._VF, table("vf"))
        if not is_gpu:
            self.set_api_attr(torch_npu, table("torch_npu"))

    def api_modularity(self):
        """Replace the APIs of every group with their hook-wrapped versions."""
        self._install("hook")

    def api_originality(self):
        """Restore the attributes recorded by :meth:`initialize_hook`."""
        self._install("ori")

    def initialize_hook(self, hook):
        """Record the original APIs and collect their wrapped counterparts.

        For each group the order is: store the originals, call the group's
        ``wrap_*_ops_and_bind(hook)`` helper, then copy the prefixed
        attributes of the group's ``HOOK*`` class into the hook table.
        Nothing is installed on the groups here; that is done by
        :meth:`api_modularity`.
        """
        self.store_ori_attr(torch.Tensor, get_tensor_ops(), self.tensor_ori_attr)
        wrap_tensor.wrap_tensor_ops_and_bind(hook)
        self.tensor_hook_attr.update(self._collect_hook_attr(wrap_tensor.HOOKTensor))

        self.store_ori_attr(torch, get_torch_ops(), self.torch_ori_attr)
        wrap_torch.wrap_torch_ops_and_bind(hook)
        self.torch_hook_attr.update(self._collect_hook_attr(wrap_torch.HOOKTorchOP))

        self.store_ori_attr(torch.nn.functional, get_functional_ops(), self.functional_ori_attr)
        wrap_functional.wrap_functional_ops_and_bind(hook)
        self.functional_hook_attr.update(self._collect_hook_attr(wrap_functional.HOOKFunctionalOP))

        self.store_ori_attr(dist, get_distributed_ops(), self.distributed_ori_attr)
        wrap_distributed.wrap_distributed_ops_and_bind(hook)
        npu_distributed_enabled = not is_gpu and not torch_without_guard_version
        if npu_distributed_enabled:
            self.store_ori_attr(torch_npu.distributed, npu_distributed_api, self.npu_distributed_ori_attr)
        distributed_hooks = self._collect_hook_attr(wrap_distributed.HOOKDistributedOP)
        self.distributed_hook_attr.update(distributed_hooks)
        if npu_distributed_enabled:
            # Only the APIs listed in npu_distributed_api are mirrored.
            self.npu_distributed_hook_attr.update(
                (name, attr) for name, attr in distributed_hooks.items()
                if name in npu_distributed_api
            )

        if torch_version_above_2:
            self.store_ori_attr(torch.ops.aten, get_aten_ops(), self.aten_ori_attr)
            wrap_aten.wrap_aten_ops_and_bind(hook)
            self.aten_hook_attr.update(self._collect_hook_attr(wrap_aten.HOOKAtenOP))

        self.store_ori_attr(torch._VF, get_vf_ops(), self.vf_ori_attr)
        wrap_vf.wrap_vf_ops_and_bind(hook)
        self.vf_hook_attr.update(self._collect_hook_attr(wrap_vf.HOOKVfOP))

        if not is_gpu:
            self.store_ori_attr(torch_npu, get_npu_ops(), self.torch_npu_ori_attr)
            wrap_npu_custom.wrap_npu_ops_and_bind(hook)
            self.torch_npu_hook_attr.update(self._collect_hook_attr(wrap_npu_custom.HOOKNpuOP))


# Module-level singleton shared by importers of this module.
api_register = ApiRegistry()