mindstudio-probe 1.0.3__py3-none-any.whl → 1.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (262)
  1. {mindstudio_probe-1.0.3.dist-info → mindstudio_probe-1.0.4.dist-info}/LICENSE +201 -201
  2. {mindstudio_probe-1.0.3.dist-info → mindstudio_probe-1.0.4.dist-info}/METADATA +36 -34
  3. mindstudio_probe-1.0.4.dist-info/RECORD +276 -0
  4. {mindstudio_probe-1.0.3.dist-info → mindstudio_probe-1.0.4.dist-info}/WHEEL +1 -1
  5. {mindstudio_probe-1.0.3.dist-info → mindstudio_probe-1.0.4.dist-info}/entry_points.txt +1 -0
  6. msprobe/README.md +101 -237
  7. msprobe/{config/config.json → config.json} +49 -49
  8. msprobe/core/advisor/advisor.py +124 -124
  9. msprobe/core/advisor/advisor_const.py +59 -59
  10. msprobe/core/advisor/advisor_result.py +58 -58
  11. msprobe/core/common/const.py +341 -318
  12. msprobe/core/common/exceptions.py +99 -99
  13. msprobe/core/common/{file_check.py → file_utils.py} +478 -283
  14. msprobe/core/common/log.py +76 -69
  15. msprobe/core/common/utils.py +385 -616
  16. msprobe/core/common_config.py +85 -71
  17. msprobe/core/compare/acc_compare.py +299 -298
  18. msprobe/core/compare/check.py +95 -95
  19. msprobe/core/compare/compare_cli.py +49 -49
  20. msprobe/core/compare/highlight.py +223 -222
  21. msprobe/core/compare/multiprocessing_compute.py +149 -149
  22. msprobe/core/compare/npy_compare.py +295 -295
  23. msprobe/core/compare/utils.py +430 -429
  24. msprobe/core/data_dump/data_collector.py +154 -144
  25. msprobe/core/data_dump/data_processor/base.py +314 -293
  26. msprobe/core/data_dump/data_processor/factory.py +59 -59
  27. msprobe/core/data_dump/data_processor/mindspore_processor.py +186 -198
  28. msprobe/core/data_dump/data_processor/pytorch_processor.py +366 -389
  29. msprobe/core/data_dump/json_writer.py +96 -116
  30. msprobe/core/data_dump/scope.py +178 -178
  31. msprobe/core/grad_probe/constant.py +70 -70
  32. msprobe/core/grad_probe/grad_compare.py +171 -175
  33. msprobe/core/grad_probe/utils.py +64 -52
  34. msprobe/docs/01.installation.md +89 -0
  35. msprobe/docs/02.config_introduction.md +165 -0
  36. msprobe/docs/03.config_examples.md +247 -0
  37. msprobe/docs/04.acl_config_examples.md +76 -0
  38. msprobe/docs/05.data_dump_PyTorch.md +198 -0
  39. msprobe/docs/06.data_dump_MindSpore.md +243 -0
  40. msprobe/docs/07.accuracy_checker_PyTorch.md +274 -0
  41. msprobe/docs/08.accuracy_checker_online_PyTorch.md +198 -0
  42. msprobe/docs/09.accuracy_checker_MindSpore.md +68 -0
  43. msprobe/docs/10.accuracy_compare_PyTorch.md +245 -0
  44. msprobe/docs/11.accuracy_compare_MindSpore.md +202 -0
  45. msprobe/docs/12.overflow_check_PyTorch.md +79 -0
  46. msprobe/docs/13.overflow_check_MindSpore.md +31 -0
  47. msprobe/{pytorch/doc/parse_tool.md → docs/14.data_parse_PyTorch.md} +283 -286
  48. msprobe/docs/15.free_benchmarking_PyTorch.md +164 -0
  49. msprobe/{doc/grad_probe/grad_probe.md → docs/17.grad_probe.md} +207 -207
  50. msprobe/docs/FAQ_PyTorch.md +177 -0
  51. msprobe/docs/S02.report_free_benchmarking_validation_performance_baseline.md +146 -0
  52. msprobe/docs/img/free_benchmark_framework.png +0 -0
  53. msprobe/mindspore/__init__.py +1 -1
  54. msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +254 -245
  55. msprobe/mindspore/api_accuracy_checker/api_info.py +69 -69
  56. msprobe/mindspore/api_accuracy_checker/api_runner.py +155 -151
  57. msprobe/mindspore/api_accuracy_checker/base_compare_algorithm.py +196 -196
  58. msprobe/mindspore/api_accuracy_checker/cmd_parser.py +6 -0
  59. msprobe/mindspore/api_accuracy_checker/compute_element.py +238 -223
  60. msprobe/mindspore/api_accuracy_checker/main.py +8 -15
  61. msprobe/mindspore/api_accuracy_checker/type_mapping.py +113 -113
  62. msprobe/mindspore/api_accuracy_checker/utils.py +79 -62
  63. msprobe/mindspore/cell_processor.py +34 -34
  64. msprobe/mindspore/common/const.py +106 -87
  65. msprobe/mindspore/common/log.py +37 -37
  66. msprobe/mindspore/common/utils.py +81 -57
  67. msprobe/mindspore/compare/distributed_compare.py +75 -75
  68. msprobe/mindspore/compare/ms_compare.py +219 -117
  69. msprobe/mindspore/compare/ms_graph_compare.py +348 -317
  70. msprobe/mindspore/compare/ms_to_pt_api.yaml +399 -399
  71. msprobe/mindspore/debugger/debugger_config.py +66 -74
  72. msprobe/mindspore/debugger/precision_debugger.py +126 -107
  73. msprobe/mindspore/dump/dump_tool_factory.py +35 -35
  74. msprobe/mindspore/dump/hook_cell/api_registry.py +118 -104
  75. msprobe/mindspore/dump/hook_cell/hook_cell.py +55 -53
  76. msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml +922 -925
  77. msprobe/mindspore/dump/hook_cell/wrap_api.py +113 -0
  78. msprobe/mindspore/dump/jit_dump.py +72 -56
  79. msprobe/mindspore/dump/kernel_graph_dump.py +59 -60
  80. msprobe/mindspore/dump/kernel_kbyk_dump.py +64 -65
  81. msprobe/mindspore/free_benchmark/api_pynative_self_check.py +116 -116
  82. msprobe/mindspore/free_benchmark/common/config.py +12 -12
  83. msprobe/mindspore/free_benchmark/common/handler_params.py +17 -17
  84. msprobe/mindspore/free_benchmark/common/utils.py +71 -71
  85. msprobe/mindspore/free_benchmark/data/support_wrap_ops.yaml +842 -842
  86. msprobe/mindspore/free_benchmark/decorator/dec_forward.py +43 -42
  87. msprobe/mindspore/free_benchmark/decorator/decorator_factory.py +107 -107
  88. msprobe/mindspore/free_benchmark/handler/base_handler.py +90 -90
  89. msprobe/mindspore/free_benchmark/handler/check_handler.py +41 -41
  90. msprobe/mindspore/free_benchmark/handler/fix_handler.py +36 -36
  91. msprobe/mindspore/free_benchmark/handler/handler_factory.py +21 -21
  92. msprobe/mindspore/free_benchmark/perturbation/add_noise.py +67 -67
  93. msprobe/mindspore/free_benchmark/perturbation/base_perturbation.py +21 -21
  94. msprobe/mindspore/free_benchmark/perturbation/bit_noise.py +63 -63
  95. msprobe/mindspore/free_benchmark/perturbation/exchange_value.py +51 -0
  96. msprobe/mindspore/free_benchmark/perturbation/improve_precision.py +35 -34
  97. msprobe/mindspore/free_benchmark/perturbation/no_change.py +12 -12
  98. msprobe/mindspore/free_benchmark/perturbation/perturbation_factory.py +29 -27
  99. msprobe/mindspore/free_benchmark/self_check_tool_factory.py +33 -33
  100. msprobe/mindspore/grad_probe/global_context.py +90 -91
  101. msprobe/mindspore/grad_probe/grad_analyzer.py +231 -231
  102. msprobe/mindspore/grad_probe/grad_monitor.py +27 -27
  103. msprobe/mindspore/grad_probe/grad_stat_csv.py +131 -131
  104. msprobe/mindspore/grad_probe/hook.py +94 -92
  105. msprobe/mindspore/grad_probe/utils.py +29 -28
  106. msprobe/mindspore/ms_config.py +128 -126
  107. msprobe/mindspore/overflow_check/kernel_graph_overflow_check.py +44 -45
  108. msprobe/mindspore/overflow_check/overflow_check_tool_factory.py +34 -34
  109. msprobe/mindspore/runtime.py +4 -4
  110. msprobe/mindspore/service.py +378 -354
  111. msprobe/mindspore/task_handler_factory.py +24 -24
  112. msprobe/msprobe.py +105 -107
  113. msprobe/pytorch/__init__.py +3 -3
  114. msprobe/pytorch/api_accuracy_checker/common/config.py +53 -55
  115. msprobe/pytorch/api_accuracy_checker/common/utils.py +214 -165
  116. msprobe/pytorch/api_accuracy_checker/compare/algorithm.py +213 -213
  117. msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +606 -581
  118. msprobe/pytorch/api_accuracy_checker/compare/api_precision_standard.yaml +132 -132
  119. msprobe/pytorch/api_accuracy_checker/compare/api_precision_threshold.yaml +390 -390
  120. msprobe/pytorch/api_accuracy_checker/compare/compare.py +386 -381
  121. msprobe/pytorch/api_accuracy_checker/compare/compare_column.py +73 -73
  122. msprobe/pytorch/api_accuracy_checker/compare/compare_utils.py +245 -244
  123. msprobe/pytorch/api_accuracy_checker/config.yaml +10 -10
  124. msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py +335 -332
  125. msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +200 -199
  126. msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +133 -134
  127. msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +592 -581
  128. msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py +70 -74
  129. msprobe/pytorch/api_accuracy_checker/run_ut/torch_ut_setting.json +7 -4
  130. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py +197 -202
  131. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +325 -324
  132. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +204 -204
  133. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +219 -218
  134. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/ssl_config.py +10 -10
  135. msprobe/pytorch/bench_functions/__init__.py +15 -15
  136. msprobe/pytorch/bench_functions/apply_adam_w.py +28 -28
  137. msprobe/pytorch/bench_functions/confusion_transpose.py +19 -19
  138. msprobe/pytorch/bench_functions/fast_gelu.py +55 -55
  139. msprobe/pytorch/bench_functions/layer_norm_eval.py +6 -6
  140. msprobe/pytorch/bench_functions/linear.py +12 -12
  141. msprobe/pytorch/bench_functions/matmul_backward.py +48 -48
  142. msprobe/pytorch/bench_functions/npu_fusion_attention.py +509 -421
  143. msprobe/pytorch/bench_functions/rms_norm.py +15 -15
  144. msprobe/pytorch/bench_functions/rotary_mul.py +52 -52
  145. msprobe/pytorch/bench_functions/scaled_mask_softmax.py +26 -26
  146. msprobe/pytorch/bench_functions/swiglu.py +55 -55
  147. msprobe/pytorch/common/__init__.py +2 -2
  148. msprobe/pytorch/common/compare_script.template +14 -14
  149. msprobe/pytorch/common/log.py +20 -31
  150. msprobe/pytorch/common/parse_json.py +39 -39
  151. msprobe/pytorch/common/utils.py +305 -300
  152. msprobe/pytorch/compare/distributed_compare.py +66 -66
  153. msprobe/pytorch/compare/mapping.yaml +607 -607
  154. msprobe/pytorch/compare/match.py +34 -33
  155. msprobe/pytorch/compare/pt_compare.py +50 -40
  156. msprobe/pytorch/debugger/debugger_config.py +95 -95
  157. msprobe/pytorch/debugger/precision_debugger.py +125 -125
  158. msprobe/pytorch/free_benchmark/__init__.py +8 -8
  159. msprobe/pytorch/free_benchmark/common/constant.py +70 -70
  160. msprobe/pytorch/free_benchmark/common/counter.py +71 -71
  161. msprobe/pytorch/free_benchmark/common/enums.py +37 -37
  162. msprobe/pytorch/free_benchmark/common/params.py +129 -129
  163. msprobe/pytorch/free_benchmark/common/utils.py +102 -102
  164. msprobe/pytorch/free_benchmark/compare/grad_saver.py +179 -179
  165. msprobe/pytorch/free_benchmark/compare/single_benchmark.py +104 -104
  166. msprobe/pytorch/free_benchmark/main.py +105 -105
  167. msprobe/pytorch/free_benchmark/perturbed_layers/base_layer.py +13 -13
  168. msprobe/pytorch/free_benchmark/perturbed_layers/layer_factory.py +41 -41
  169. msprobe/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py +90 -90
  170. msprobe/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py +104 -104
  171. msprobe/pytorch/free_benchmark/perturbed_layers/npu/change_value.py +63 -63
  172. msprobe/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py +68 -68
  173. msprobe/pytorch/free_benchmark/perturbed_layers/npu/no_change.py +28 -28
  174. msprobe/pytorch/free_benchmark/perturbed_layers/npu/npu_base_layser.py +45 -45
  175. msprobe/pytorch/free_benchmark/perturbed_layers/run_cpu.py +19 -19
  176. msprobe/pytorch/free_benchmark/result_handlers/base_handler.py +217 -217
  177. msprobe/pytorch/free_benchmark/result_handlers/check_handler.py +39 -39
  178. msprobe/pytorch/free_benchmark/result_handlers/fix_handler.py +23 -23
  179. msprobe/pytorch/free_benchmark/result_handlers/handler_factory.py +30 -30
  180. msprobe/pytorch/free_benchmark/result_handlers/preheat_handler.py +170 -170
  181. msprobe/pytorch/function_factory.py +76 -75
  182. msprobe/pytorch/functional/dump_module.py +39 -39
  183. msprobe/pytorch/grad_probe/grad_monitor.py +91 -90
  184. msprobe/pytorch/grad_probe/grad_stat_csv.py +128 -128
  185. msprobe/pytorch/hook_module/api_registry.py +161 -161
  186. msprobe/pytorch/hook_module/hook_module.py +120 -120
  187. msprobe/pytorch/hook_module/support_wrap_ops.yaml +1879 -1877
  188. msprobe/pytorch/hook_module/utils.py +30 -29
  189. msprobe/pytorch/hook_module/wrap_aten.py +110 -110
  190. msprobe/pytorch/hook_module/wrap_distributed.py +78 -78
  191. msprobe/pytorch/hook_module/wrap_functional.py +105 -105
  192. msprobe/pytorch/hook_module/wrap_npu_custom.py +93 -84
  193. msprobe/pytorch/hook_module/wrap_tensor.py +71 -71
  194. msprobe/pytorch/hook_module/wrap_torch.py +86 -86
  195. msprobe/pytorch/hook_module/wrap_vf.py +62 -62
  196. msprobe/pytorch/module_processer.py +138 -138
  197. msprobe/pytorch/online_dispatch/__init__.py +20 -20
  198. msprobe/pytorch/online_dispatch/compare.py +236 -236
  199. msprobe/pytorch/online_dispatch/dispatch.py +271 -271
  200. msprobe/pytorch/online_dispatch/dump_compare.py +155 -156
  201. msprobe/pytorch/online_dispatch/single_compare.py +391 -391
  202. msprobe/pytorch/online_dispatch/torch_ops_config.yaml +49 -49
  203. msprobe/pytorch/online_dispatch/utils.py +130 -146
  204. msprobe/pytorch/parse.py +4 -4
  205. msprobe/pytorch/parse_tool/cli.py +32 -32
  206. msprobe/pytorch/parse_tool/lib/compare.py +260 -271
  207. msprobe/pytorch/parse_tool/lib/config.py +52 -52
  208. msprobe/pytorch/parse_tool/lib/file_desc.py +31 -31
  209. msprobe/pytorch/parse_tool/lib/interactive_cli.py +102 -102
  210. msprobe/pytorch/parse_tool/lib/parse_exception.py +54 -54
  211. msprobe/pytorch/parse_tool/lib/parse_tool.py +158 -158
  212. msprobe/pytorch/parse_tool/lib/utils.py +316 -321
  213. msprobe/pytorch/parse_tool/lib/visualization.py +85 -91
  214. msprobe/pytorch/pt_config.py +188 -187
  215. msprobe/pytorch/service.py +246 -252
  216. mindstudio_probe-1.0.3.dist-info/RECORD +0 -272
  217. msprobe/config/README.md +0 -539
  218. msprobe/mindspore/doc/compare.md +0 -58
  219. msprobe/mindspore/doc/dump.md +0 -217
  220. msprobe/mindspore/dump/hook_cell/wrap_functional.py +0 -91
  221. msprobe/mindspore/dump/hook_cell/wrap_tensor.py +0 -63
  222. msprobe/pytorch/doc/FAQ.md +0 -193
  223. msprobe/pytorch/doc/api_accuracy_checker.md +0 -313
  224. msprobe/pytorch/doc/api_accuracy_checker_online.md +0 -187
  225. msprobe/pytorch/doc/dump.md +0 -260
  226. msprobe/pytorch/doc/msprobe/321/207/342/226/223/342/225/233/321/205/342/225/221/320/266/321/205/342/225/226/320/265/321/205/320/225/342/225/226/321/206/320/245/342/226/221/321/206/320/235/320/276dump/321/206/320/260/320/227/321/205/320/227/320/226/321/206/320/220/320/267/321/210/320/223/342/225/234/321/205/320/257/342/225/221/321/207/342/225/221/342/224/220/321/206/320/232/320/265/321/205/320/241/320/232.md +0 -182
  227. msprobe/pytorch/doc/ptdbg_ascend_compare.md +0 -240
  228. msprobe/pytorch/doc/ptdbg_ascend_overview.md +0 -68
  229. msprobe/pytorch/doc/ptdbg_ascend_quickstart.md +0 -381
  230. msprobe/pytorch/doc/run_overflow_check.md +0 -25
  231. msprobe/pytorch/doc/在线精度比对.md +0 -90
  232. msprobe/pytorch/doc/无标杆工具场景验证和性能基线报告.md +0 -151
  233. {mindstudio_probe-1.0.3.dist-info → mindstudio_probe-1.0.4.dist-info}/top_level.txt +0 -0
  234. /msprobe/{pytorch/doc → docs}/img/BLOOM-7B_1.png +0 -0
  235. /msprobe/{pytorch/doc → docs}/img/BLOOM-7B_2.png +0 -0
  236. /msprobe/{pytorch/doc → docs}/img/BLOOM-7B_3.png +0 -0
  237. /msprobe/{pytorch/doc → docs}/img/BLOOM-7B_4.png +0 -0
  238. /msprobe/{pytorch/doc → docs}/img/GPT-3_1.png +0 -0
  239. /msprobe/{pytorch/doc → docs}/img/GPT-3_2.png +0 -0
  240. /msprobe/{pytorch/doc → docs}/img/GPT-3_3.png +0 -0
  241. /msprobe/{pytorch/doc → docs}/img/GPT-3_4.png +0 -0
  242. /msprobe/{pytorch/doc → docs}/img/GPT-3_5.png +0 -0
  243. /msprobe/{pytorch/doc → docs}/img/GPT-3_6.png +0 -0
  244. /msprobe/{pytorch/doc → docs}/img/GPT-3_7.png +0 -0
  245. /msprobe/{pytorch/doc → docs}/img/GPT-3_8.png +0 -0
  246. /msprobe/{pytorch/doc → docs}/img/YOLOV5S_1.png +0 -0
  247. /msprobe/{pytorch/doc → docs}/img/YOLOV5S_2.png +0 -0
  248. /msprobe/{pytorch/doc → docs}/img/accuracy_checking_details.png +0 -0
  249. /msprobe/{pytorch/doc → docs}/img/accuracy_checking_result.png +0 -0
  250. /msprobe/{pytorch/doc → docs}/img/api_precision_compare_details.png +0 -0
  251. /msprobe/{pytorch/doc → docs}/img/api_precision_compare_result.png +0 -0
  252. /msprobe/{pytorch/doc → docs}/img/auto_analyze_log.png +0 -0
  253. /msprobe/{pytorch/doc → docs}/img/compare_result_pkl.png +0 -0
  254. /msprobe/{pytorch/doc → docs}/img/compare_result_pkl_md5.png.png +0 -0
  255. /msprobe/{pytorch/doc → docs}/img/cpu_info.png +0 -0
  256. /msprobe/{config → docs}/img/free_benchmark.png +0 -0
  257. /msprobe/{doc/grad_probe/img/image-1.png → docs/img/grad_probe_image-1.png} +0 -0
  258. /msprobe/{doc/grad_probe/img/image-2.png → docs/img/grad_probe_image-2.png} +0 -0
  259. /msprobe/{doc/grad_probe/img/image-3.png → docs/img/grad_probe_image-3.png} +0 -0
  260. /msprobe/{doc/grad_probe/img/image-4.png → docs/img/grad_probe_image-4.png} +0 -0
  261. /msprobe/{doc/grad_probe/img/image.png → docs/img/grad_probe_image.png} +0 -0
  262. /msprobe/{pytorch/doc → docs}/img/module_compare.png +0 -0
@@ -1,71 +1,71 @@
1
-
2
- class GradConst:
3
-
4
- FRAMEWORKS = {"PyTorch", "MindSpore"}
5
- PYTORCH = "PyTorch"
6
- MindSpore = "MindSpore"
7
-
8
- GRAD_FILE_SUFFIX = {"npy", "pt"}
9
- NPY_SUFFIX = "npy"
10
- PT_SUFFIX = "pt"
11
-
12
- # for callback
13
- CURRENT_STEP = "current_step"
14
-
15
- PARAM_LIST = "param_list"
16
- RANK = "rank"
17
- STEP = "step"
18
- BOUNDS = "bounds"
19
- OUTPUT_PATH = "output_path"
20
-
21
- # level const
22
- LEVEL = "level"
23
- LEVEL0 = "L0"
24
- LEVEL1 = "L1"
25
- LEVEL2 = "L2"
26
- SUPPORTED_LEVEL = {"L0", "L1", "L2"}
27
-
28
- # numpy coding
29
- STEP_IDX = 0
30
- SHAPE_DIM_IDX = 4
31
- MAX_SIZE = 10 * 1024 * 1024 * 1024
32
-
33
- # direction suffix
34
- DIR_SUFFIX = "dir.npy"
35
-
36
- # file safty
37
- DATA_DIR_AUTHORITY = 0o750
38
- DATA_FILE_AUTHORITY = 0o640
39
- DIRECTORY_LENGTH = 4096
40
- FILE_NAME_LENGTH = 255
41
- FILE_VALID_PATTERN = r"^[a-zA-Z0-9_.:/-]+$"
42
- PARAM_VALID_PATTERN = r"^[a-zA-Z0-9_.]+$"
43
- DIR = "dir"
44
- FILE = "file"
45
-
46
- STEP_FINISH = "step_finish"
47
-
48
- SUMMARY = "summary"
49
-
50
- # csv header entry
51
- MD5 = "MD5"
52
- DISTRIBUTION = "distribution"
53
- SHAPE = "shape"
54
- MAX = "max"
55
- MIN = "min"
56
- NORM = "norm"
57
-
58
- level_adp = {
59
- "L0": {
60
- "header": [GradConst.MD5, GradConst.MAX, GradConst.MIN, GradConst.NORM, GradConst.SHAPE],
61
- "have_grad_direction": False
62
- },
63
- "L1": {
64
- "header": [GradConst.MAX, GradConst.MIN, GradConst.NORM, GradConst.SHAPE],
65
- "have_grad_direction": True
66
- },
67
- "L2": {
68
- "header": [GradConst.DISTRIBUTION, GradConst.MAX, GradConst.MIN, GradConst.NORM, GradConst.SHAPE],
69
- "have_grad_direction": True
70
- },
1
+
2
+ class GradConst:
3
+
4
+ FRAMEWORKS = {"PyTorch", "MindSpore"}
5
+ PYTORCH = "PyTorch"
6
+ MindSpore = "MindSpore"
7
+
8
+ GRAD_FILE_SUFFIX = {"npy", "pt"}
9
+ NPY_SUFFIX = "npy"
10
+ PT_SUFFIX = "pt"
11
+
12
+ # for callback
13
+ CURRENT_STEP = "current_step"
14
+
15
+ PARAM_LIST = "param_list"
16
+ RANK = "rank"
17
+ STEP = "step"
18
+ BOUNDS = "bounds"
19
+ OUTPUT_PATH = "output_path"
20
+
21
+ # level const
22
+ LEVEL = "level"
23
+ LEVEL0 = "L0"
24
+ LEVEL1 = "L1"
25
+ LEVEL2 = "L2"
26
+ SUPPORTED_LEVEL = {"L0", "L1", "L2"}
27
+
28
+ # numpy coding
29
+ STEP_IDX = 0
30
+ SHAPE_DIM_IDX = 4
31
+ MAX_SIZE = 10 * 1024 * 1024 * 1024
32
+
33
+ # direction suffix
34
+ DIR_SUFFIX = "dir.npy"
35
+
36
+ # file safty
37
+ DATA_DIR_AUTHORITY = 0o750
38
+ DATA_FILE_AUTHORITY = 0o640
39
+ DIRECTORY_LENGTH = 4096
40
+ FILE_NAME_LENGTH = 255
41
+ FILE_VALID_PATTERN = r"^[a-zA-Z0-9_.:/-]+$"
42
+ PARAM_VALID_PATTERN = r"^[a-zA-Z0-9_.]+$"
43
+ DIR = "dir"
44
+ FILE = "file"
45
+
46
+ STEP_FINISH = "step_finish"
47
+
48
+ SUMMARY = "summary"
49
+
50
+ # csv header entry
51
+ MD5 = "MD5"
52
+ DISTRIBUTION = "distribution"
53
+ SHAPE = "shape"
54
+ MAX = "max"
55
+ MIN = "min"
56
+ NORM = "norm"
57
+
58
+ level_adp = {
59
+ "L0": {
60
+ "header": [GradConst.MD5, GradConst.MAX, GradConst.MIN, GradConst.NORM, GradConst.SHAPE],
61
+ "have_grad_direction": False
62
+ },
63
+ "L1": {
64
+ "header": [GradConst.MAX, GradConst.MIN, GradConst.NORM, GradConst.SHAPE],
65
+ "have_grad_direction": True
66
+ },
67
+ "L2": {
68
+ "header": [GradConst.DISTRIBUTION, GradConst.MAX, GradConst.MIN, GradConst.NORM, GradConst.SHAPE],
69
+ "have_grad_direction": True
70
+ },
71
71
  }
@@ -1,175 +1,171 @@
1
- import os
2
- from typing import List
3
-
4
- from tqdm import tqdm
5
- import pandas as pd
6
- import matplotlib.pyplot as plt
7
-
8
- from msprobe.core.common.utils import check_file_or_directory_path, check_path_before_create
9
- from msprobe.core.common.file_check import create_directory
10
- from msprobe.core.common.log import logger
11
- from msprobe.core.common.utils import remove_path, write_csv, load_npy
12
- from msprobe.core.grad_probe.constant import GradConst
13
-
14
-
15
- class GradComparator:
16
-
17
- @staticmethod
18
- def _get_grad_weight_order(path1, path2):
19
- for summary_file in os.listdir(path1):
20
- if not summary_file.endswith(".csv"):
21
- continue
22
- if not os.path.exists(os.path.join(path2, summary_file)):
23
- continue
24
- summary_csv = pd.read_csv(os.path.join(path1, summary_file))
25
- return summary_csv["param_name"]
26
- raise RuntimeError("no matched grad_summary.csv for comparison, please dump data in same configuration")
27
-
28
- @staticmethod
29
- def _get_name_matched_grad_file(param_name, grad_files):
30
- for grad_file in grad_files:
31
- if param_name == grad_file[:grad_file.rfind('.')]:
32
- return grad_file
33
- raise RuntimeError("no matched grad_file for comparison, please dump data in same configuration")
34
-
35
- @classmethod
36
- def compare_distributed(cls, path1: str, path2: str, output_dir: str):
37
- ranks = cls._get_matched_dirs(path1, path2, "rank")
38
- logger.info(f"the following ranks will be compared: {ranks}")
39
- if not ranks:
40
- raise RuntimeError("no matched ranks for comparison, please dump data in same configuration")
41
- if not os.path.isdir(output_dir):
42
- create_directory(output_dir)
43
- for rank in tqdm(ranks, desc="rank"):
44
- logger.info(f"now comparing rank {rank}:")
45
- cls.compare(os.path.join(path1, f"rank{rank}"),
46
- os.path.join(path2, f"rank{rank}"),
47
- os.path.join(output_dir, f"rank{rank}"))
48
-
49
- @classmethod
50
- def compare(cls, path1: str, path2: str, output_dir: str):
51
- steps = cls._get_matched_dirs(path1, path2, "step")
52
- if not steps:
53
- raise RuntimeError("no matched steps for comparison, please dump data in same configuration")
54
- similarities = cls._calculate_separated_similarities(path1, path2, steps)
55
- if not os.path.isdir(output_dir):
56
- create_directory(output_dir)
57
- cls._save_similarities(similarities, steps, output_dir)
58
-
59
- @classmethod
60
- def _get_matched_dirs(cls, path1: str, path2: str, dir_prefix):
61
- check_file_or_directory_path(path1, isdir=True)
62
- check_file_or_directory_path(path2, isdir=True)
63
- dirs = []
64
- for dir_name in os.listdir(path1):
65
- index = dir_name.replace(dir_prefix, "", 1)
66
- if not dir_name.startswith(dir_prefix) or not index.isdigit():
67
- continue
68
-
69
- folder2 = os.path.join(path2, dir_name)
70
- if not os.path.isdir(folder2):
71
- continue
72
- dirs.append(int(index))
73
- dirs = sorted(dirs)
74
- return dirs
75
-
76
- @classmethod
77
- def _save_similarities(cls, similarities: List[float], steps: List[int], output_dir: str):
78
- if not similarities:
79
- raise ValueError(f"length of similarities is 0")
80
- result = [['step'] + [str(step) for step in steps]]
81
- for key, value in tqdm(similarities.items(), desc="save similarities (by param)"):
82
- if len(value) != len(steps):
83
- raise RuntimeError(f"similarities length of {key}:{len(value)} not equal steps:{len(steps)}")
84
- plt.plot(steps, value)
85
- plt.xlabel('steps')
86
- plt.ylabel('similarities')
87
- plt.title(f'{key}_similarities')
88
- picture_dir = os.path.join(output_dir, "similarities_picture")
89
- if not os.path.isdir(picture_dir):
90
- create_directory(picture_dir)
91
- fig_save_path = os.path.join(picture_dir, f"{key}_similarities.png")
92
-
93
- check_path_before_create(fig_save_path)
94
- try:
95
- plt.savefig(fig_save_path)
96
- except Exception as e:
97
- raise RuntimeError(f"save plt figure {fig_save_path} failed") from e
98
- plt.close()
99
-
100
- result.append([key] + value)
101
- result_csv_path = os.path.join(output_dir, "similarities.csv")
102
- if os.path.exists(result_csv_path):
103
- logger.warning(f"{result_csv_path} will be recoverd")
104
- remove_path(result_csv_path)
105
- write_csv(result, result_csv_path)
106
-
107
- @classmethod
108
- def _calculate_separated_similarities(cls, path1, path2, steps):
109
- similarities = {}
110
- logger.info(f"{len(steps)} steps will be compared")
111
- grad_weight_order = cls._get_grad_weight_order(path1, path2)
112
- for step in tqdm(steps, desc="culculate similarities (by step)"):
113
- grad_files = cls._get_matched_grad_files(path1, path2, step)
114
- same_count_summary = 0
115
- total_count_summary = 0
116
- for grad_name in grad_weight_order:
117
- grad_file = cls._get_name_matched_grad_file(grad_name, grad_files)
118
- grad1 = os.path.join(path1, f"step{step}", grad_file)
119
- grad2 = os.path.join(path2, f"step{step}", grad_file)
120
- same_count, total_count = cls._calculate_similarity(grad1, grad2)
121
- same_count_summary += same_count
122
- total_count_summary += total_count
123
- idx = grad_file.rfind(".")
124
- param_name = grad_file[:idx]
125
- if param_name not in similarities:
126
- similarities[param_name] = []
127
- if total_count == 0:
128
- similarities[param_name].append(0)
129
- else:
130
- similarities[param_name].append(same_count / total_count)
131
- if GradConst.SUMMARY not in similarities:
132
- similarities[GradConst.SUMMARY] = []
133
- if total_count_summary == 0:
134
- similarities[GradConst.SUMMARY].append(0)
135
- else:
136
- similarities[GradConst.SUMMARY].append(same_count_summary / total_count_summary)
137
- return similarities
138
-
139
- @classmethod
140
- def _get_matched_grad_files(cls, path1: str, path2: str, step: int):
141
- path1 = os.path.join(path1, f"step{step}")
142
- path2 = os.path.join(path2, f"step{step}")
143
- check_file_or_directory_path(path1, isdir=True)
144
- check_file_or_directory_path(path2, isdir=True)
145
- grad_files = []
146
- for grad_file in os.listdir(path1):
147
- splits = grad_file.split('.')
148
- if len(splits) < 1 or splits[-1] not in GradConst.GRAD_FILE_SUFFIX:
149
- continue
150
- folder2 = os.path.join(path2, grad_file)
151
- if not os.path.exists(folder2):
152
- continue
153
- grad_files.append(grad_file)
154
- return sorted(grad_files)
155
-
156
- @classmethod
157
- def _calculate_similarity(cls, grad_file1: str, grad_file2: str):
158
- npy1, npy2 = cls._load_grad_files(grad_file1, grad_file2)
159
- same_count = (npy1 == npy2).sum()
160
- total_count = npy1.size
161
- return same_count, total_count
162
-
163
- @classmethod
164
- def _load_grad_files(cls, grad_file1: str, grad_file2: str):
165
- grad1 = load_npy(grad_file1)
166
- grad2 = load_npy(grad_file2)
167
- if grad1.shape != grad2.shape:
168
- raise RuntimeError(f"tensor shape is not equal: {grad_file1}, {grad_file2}")
169
- if grad1.dtype != bool:
170
- raise TypeError(f"tensor type is not bool: {grad_file1}")
171
- if grad2.dtype != bool:
172
- raise TypeError(f"tensor type is not bool: {grad_file2}")
173
- return grad1, grad2
174
-
175
-
1
+ import os
2
+ from typing import List
3
+
4
+ from tqdm import tqdm
5
+ import pandas as pd
6
+ import matplotlib.pyplot as plt
7
+
8
+ from msprobe.core.common.file_utils import create_directory, check_path_before_create, check_file_or_directory_path
9
+ from msprobe.core.common.log import logger
10
+ from msprobe.core.common.file_utils import remove_path, load_npy, write_csv
11
+ from msprobe.core.grad_probe.constant import GradConst
12
+ from msprobe.core.grad_probe.utils import plt_savefig
13
+
14
+
15
+ class GradComparator:
16
+
17
+ @staticmethod
18
+ def _get_grad_weight_order(path1, path2):
19
+ for summary_file in os.listdir(path1):
20
+ if not summary_file.endswith(".csv"):
21
+ continue
22
+ if not os.path.exists(os.path.join(path2, summary_file)):
23
+ continue
24
+ summary_csv = pd.read_csv(os.path.join(path1, summary_file))
25
+ return summary_csv["param_name"]
26
+ raise RuntimeError("no matched grad_summary.csv for comparison, please dump data in same configuration")
27
+
28
+ @staticmethod
29
+ def _get_name_matched_grad_file(param_name, grad_files):
30
+ for grad_file in grad_files:
31
+ if param_name == grad_file[:grad_file.rfind('.')]:
32
+ return grad_file
33
+ raise RuntimeError("no matched grad_file for comparison, please dump data in same configuration")
34
+
35
+ @classmethod
36
+ def compare_distributed(cls, path1: str, path2: str, output_dir: str):
37
+ ranks = cls._get_matched_dirs(path1, path2, "rank")
38
+ logger.info(f"the following ranks will be compared: {ranks}")
39
+ if not ranks:
40
+ raise RuntimeError("no matched ranks for comparison, please dump data in same configuration")
41
+ if not os.path.isdir(output_dir):
42
+ create_directory(output_dir)
43
+ for rank in tqdm(ranks, desc="rank"):
44
+ logger.info(f"now comparing rank {rank}:")
45
+ cls.compare(os.path.join(path1, f"rank{rank}"),
46
+ os.path.join(path2, f"rank{rank}"),
47
+ os.path.join(output_dir, f"rank{rank}"))
48
+
49
+ @classmethod
50
+ def compare(cls, path1: str, path2: str, output_dir: str):
51
+ steps = cls._get_matched_dirs(path1, path2, "step")
52
+ if not steps:
53
+ raise RuntimeError("no matched steps for comparison, please dump data in same configuration")
54
+ similarities = cls._calculate_separated_similarities(path1, path2, steps)
55
+ if not os.path.isdir(output_dir):
56
+ create_directory(output_dir)
57
+ cls._save_similarities(similarities, steps, output_dir)
58
+
59
+ @classmethod
60
+ def _get_matched_dirs(cls, path1: str, path2: str, dir_prefix):
61
+ check_file_or_directory_path(path1, isdir=True)
62
+ check_file_or_directory_path(path2, isdir=True)
63
+ dirs = []
64
+ for dir_name in os.listdir(path1):
65
+ index = dir_name.replace(dir_prefix, "", 1)
66
+ if not dir_name.startswith(dir_prefix) or not index.isdigit():
67
+ continue
68
+
69
+ folder2 = os.path.join(path2, dir_name)
70
+ if not os.path.isdir(folder2):
71
+ continue
72
+ dirs.append(int(index))
73
+ dirs = sorted(dirs)
74
+ return dirs
75
+
76
+ @classmethod
77
+ def _save_similarities(cls, similarities: List[float], steps: List[int], output_dir: str):
78
+ if not similarities:
79
+ raise ValueError(f"length of similarities is 0")
80
+ result = [['step'] + [str(step) for step in steps]]
81
+ for key, value in tqdm(similarities.items(), desc="save similarities (by param)"):
82
+ if len(value) != len(steps):
83
+ raise RuntimeError(f"similarities length of {key}:{len(value)} not equal steps:{len(steps)}")
84
+ plt.plot(steps, value)
85
+ plt.xlabel('steps')
86
+ plt.ylabel('similarities')
87
+ plt.title(f'{key}_similarities')
88
+ picture_dir = os.path.join(output_dir, "similarities_picture")
89
+ if not os.path.isdir(picture_dir):
90
+ create_directory(picture_dir)
91
+ fig_save_path = os.path.join(picture_dir, f"{key}_similarities.png")
92
+
93
+ plt_savefig(fig_save_path)
94
+ plt.close()
95
+
96
+ result.append([key] + value)
97
+ result_csv_path = os.path.join(output_dir, "similarities.csv")
98
+ if os.path.exists(result_csv_path):
99
+ logger.warning(f"{result_csv_path} will be recoverd")
100
+ remove_path(result_csv_path)
101
+ write_csv(result, result_csv_path)
102
+
103
+ @classmethod
104
+ def _calculate_separated_similarities(cls, path1, path2, steps):
105
+ similarities = {}
106
+ logger.info(f"{len(steps)} steps will be compared")
107
+ grad_weight_order = cls._get_grad_weight_order(path1, path2)
108
+ for step in tqdm(steps, desc="culculate similarities (by step)"):
109
+ grad_files = cls._get_matched_grad_files(path1, path2, step)
110
+ same_count_summary = 0
111
+ total_count_summary = 0
112
+ for grad_name in grad_weight_order:
113
+ grad_file = cls._get_name_matched_grad_file(grad_name, grad_files)
114
+ grad1 = os.path.join(path1, f"step{step}", grad_file)
115
+ grad2 = os.path.join(path2, f"step{step}", grad_file)
116
+ same_count, total_count = cls._calculate_similarity(grad1, grad2)
117
+ same_count_summary += same_count
118
+ total_count_summary += total_count
119
+ idx = grad_file.rfind(".")
120
+ param_name = grad_file[:idx]
121
+ if param_name not in similarities:
122
+ similarities[param_name] = []
123
+ if total_count == 0:
124
+ similarities[param_name].append(0)
125
+ else:
126
+ similarities[param_name].append(same_count / total_count)
127
+ if GradConst.SUMMARY not in similarities:
128
+ similarities[GradConst.SUMMARY] = []
129
+ if total_count_summary == 0:
130
+ similarities[GradConst.SUMMARY].append(0)
131
+ else:
132
+ similarities[GradConst.SUMMARY].append(same_count_summary / total_count_summary)
133
+ return similarities
134
+
135
+ @classmethod
136
+ def _get_matched_grad_files(cls, path1: str, path2: str, step: int):
137
+ path1 = os.path.join(path1, f"step{step}")
138
+ path2 = os.path.join(path2, f"step{step}")
139
+ check_file_or_directory_path(path1, isdir=True)
140
+ check_file_or_directory_path(path2, isdir=True)
141
+ grad_files = []
142
+ for grad_file in os.listdir(path1):
143
+ splits = grad_file.split('.')
144
+ if len(splits) < 1 or splits[-1] not in GradConst.GRAD_FILE_SUFFIX:
145
+ continue
146
+ folder2 = os.path.join(path2, grad_file)
147
+ if not os.path.exists(folder2):
148
+ continue
149
+ grad_files.append(grad_file)
150
+ return sorted(grad_files)
151
+
152
+ @classmethod
153
+ def _calculate_similarity(cls, grad_file1: str, grad_file2: str):
154
+ npy1, npy2 = cls._load_grad_files(grad_file1, grad_file2)
155
+ same_count = (npy1 == npy2).sum()
156
+ total_count = npy1.size
157
+ return same_count, total_count
158
+
159
+ @classmethod
160
+ def _load_grad_files(cls, grad_file1: str, grad_file2: str):
161
+ grad1 = load_npy(grad_file1)
162
+ grad2 = load_npy(grad_file2)
163
+ if grad1.shape != grad2.shape:
164
+ raise RuntimeError(f"tensor shape is not equal: {grad_file1}, {grad_file2}")
165
+ if grad1.dtype != bool:
166
+ raise TypeError(f"tensor type is not bool: {grad_file1}")
167
+ if grad2.dtype != bool:
168
+ raise TypeError(f"tensor type is not bool: {grad_file2}")
169
+ return grad1, grad2
170
+
171
+
@@ -1,52 +1,64 @@
1
- import re
2
- from msprobe.core.grad_probe.constant import GradConst
3
- from msprobe.core.common.log import logger
4
- from msprobe.core.common.utils import write_csv
5
-
6
- def data_in_list_target(data, lst):
7
- return not lst or len(lst) == 0 or data in lst
8
-
9
-
10
- def check_numeral_list_ascend(lst):
11
- if any(not isinstance(item, (int, float)) for item in lst):
12
- raise Exception("The input list should only contain numbers")
13
- if lst != sorted(lst):
14
- raise Exception("The input list should be ascending")
15
-
16
-
17
- def check_param(param_name):
18
- if not re.match(GradConst.PARAM_VALID_PATTERN, param_name):
19
- raise RuntimeError("The parameter name contains special characters.")
20
-
21
-
22
- def check_str(string, variable_name):
23
- if not isinstance(string, str):
24
- raise ValueError(f'The variable: "{variable_name}" is not a string.')
25
-
26
-
27
- class ListCache(list):
28
- threshold = 1000
29
-
30
- def __init__(self, *args):
31
- super().__init__(*args)
32
- self._output_file = None
33
-
34
- def __del__(self):
35
- self.flush()
36
-
37
- def flush(self):
38
- if len(self) == 0:
39
- return
40
- if not self._output_file:
41
- logger.warning("dumpfile path is not setted")
42
- write_csv(self, self._output_file)
43
- logger.info(f"write {len(self)} items to {self._output_file}.")
44
- self.clear()
45
-
46
- def append(self, data):
47
- list.append(self, data)
48
- if len(self) >= ListCache.threshold:
49
- self.flush()
50
-
51
- def set_output_file(self, output_file):
52
- self._output_file = output_file
1
+ import re
2
+ from msprobe.core.grad_probe.constant import GradConst
3
+ from msprobe.core.common.log import logger
4
+ from msprobe.core.common.file_utils import write_csv, check_path_before_create, change_mode
5
+ from msprobe.core.common.const import FileCheckConst
6
+ import matplotlib.pyplot as plt
7
+
8
+
9
+ def data_in_list_target(data, lst):
10
+ return not lst or len(lst) == 0 or data in lst
11
+
12
+
13
+ def check_numeral_list_ascend(lst):
14
+ if any(not isinstance(item, (int, float)) for item in lst):
15
+ raise Exception("The input list should only contain numbers")
16
+ if lst != sorted(lst):
17
+ raise Exception("The input list should be ascending")
18
+
19
+
20
+ def check_param(param_name):
21
+ if not re.match(GradConst.PARAM_VALID_PATTERN, param_name):
22
+ raise RuntimeError("The parameter name contains special characters.")
23
+
24
+
25
+ def check_str(string, variable_name):
26
+ if not isinstance(string, str):
27
+ raise ValueError(f'The variable: "{variable_name}" is not a string.')
28
+
29
+
30
+ class ListCache(list):
31
+ threshold = 1000
32
+
33
+ def __init__(self, *args):
34
+ super().__init__(*args)
35
+ self._output_file = None
36
+
37
+ def __del__(self):
38
+ self.flush()
39
+
40
+ def flush(self):
41
+ if len(self) == 0:
42
+ return
43
+ if not self._output_file:
44
+ logger.warning("dumpfile path is not setted")
45
+ write_csv(self, self._output_file)
46
+ logger.info(f"write {len(self)} items to {self._output_file}.")
47
+ self.clear()
48
+
49
+ def append(self, data):
50
+ list.append(self, data)
51
+ if len(self) >= ListCache.threshold:
52
+ self.flush()
53
+
54
+ def set_output_file(self, output_file):
55
+ self._output_file = output_file
56
+
57
+
58
+ def plt_savefig(fig_save_path):
59
+ check_path_before_create(fig_save_path)
60
+ try:
61
+ plt.savefig(fig_save_path)
62
+ except Exception as e:
63
+ raise RuntimeError(f"save plt figure {fig_save_path} failed") from e
64
+ change_mode(fig_save_path, FileCheckConst.DATA_FILE_AUTHORITY)