mindstudio-probe 1.1.1__py3-none-any.whl → 1.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (197)
  1. {mindstudio_probe-1.1.1.dist-info → mindstudio_probe-1.2.1.dist-info}/METADATA +3 -2
  2. {mindstudio_probe-1.1.1.dist-info → mindstudio_probe-1.2.1.dist-info}/RECORD +196 -141
  3. msprobe/CMakeLists.txt +5 -0
  4. msprobe/README.md +14 -19
  5. msprobe/config.json +1 -0
  6. msprobe/core/common/const.py +155 -6
  7. msprobe/core/common/exceptions.py +3 -1
  8. msprobe/core/common/file_utils.py +33 -7
  9. msprobe/core/common/inplace_ops.yaml +3 -0
  10. msprobe/core/common/utils.py +28 -14
  11. msprobe/core/common_config.py +6 -0
  12. msprobe/core/compare/acc_compare.py +139 -128
  13. msprobe/core/compare/check.py +31 -29
  14. msprobe/core/compare/compare_cli.py +17 -16
  15. msprobe/core/compare/highlight.py +186 -99
  16. msprobe/core/compare/layer_mapping/data_scope_parser.py +18 -7
  17. msprobe/core/compare/layer_mapping/layer_mapping.py +21 -14
  18. msprobe/core/compare/layer_mapping/postprocess_pass.py +4 -3
  19. msprobe/core/compare/merge_result/merge_result.py +380 -0
  20. msprobe/core/compare/merge_result/merge_result_cli.py +31 -0
  21. msprobe/core/compare/multiprocessing_compute.py +2 -2
  22. msprobe/core/compare/npy_compare.py +109 -147
  23. msprobe/core/compare/utils.py +189 -69
  24. msprobe/core/data_dump/data_collector.py +51 -21
  25. msprobe/core/data_dump/data_processor/base.py +38 -20
  26. msprobe/core/data_dump/data_processor/factory.py +5 -3
  27. msprobe/core/data_dump/data_processor/mindspore_processor.py +154 -20
  28. msprobe/core/data_dump/data_processor/pytorch_processor.py +118 -58
  29. msprobe/core/data_dump/json_writer.py +29 -1
  30. msprobe/core/data_dump/scope.py +19 -18
  31. msprobe/core/overflow_check/abnormal_scene.py +9 -5
  32. msprobe/core/overflow_check/checker.py +1 -1
  33. msprobe/core/overflow_check/utils.py +1 -1
  34. msprobe/docs/01.installation.md +96 -17
  35. msprobe/docs/02.config_introduction.md +5 -5
  36. msprobe/docs/05.data_dump_PyTorch.md +91 -61
  37. msprobe/docs/06.data_dump_MindSpore.md +57 -19
  38. msprobe/docs/07.accuracy_checker_PyTorch.md +18 -18
  39. msprobe/docs/09.accuracy_checker_MindSpore.md +4 -4
  40. msprobe/docs/10.accuracy_compare_PyTorch.md +99 -41
  41. msprobe/docs/11.accuracy_compare_MindSpore.md +249 -48
  42. msprobe/docs/12.overflow_check_PyTorch.md +1 -1
  43. msprobe/docs/19.monitor.md +120 -27
  44. msprobe/docs/21.visualization_PyTorch.md +115 -35
  45. msprobe/docs/22.visualization_MindSpore.md +138 -41
  46. msprobe/docs/23.generate_operator_PyTorch.md +107 -0
  47. msprobe/docs/24.code_mapping_Mindspore.md +28 -0
  48. msprobe/docs/{23.tool_function_introduction.md → 25.tool_function_introduction.md} +1 -0
  49. msprobe/docs/26.data_dump_PyTorch_baseline.md +37 -0
  50. msprobe/docs/27.dump_json_instruction.md +521 -0
  51. msprobe/docs/FAQ.md +26 -2
  52. msprobe/docs/accuracy_checker_MindSpore/accuracy_checker_MindSpore_baseline.md +14 -0
  53. msprobe/docs/data_dump_MindSpore/data_dump_MindSpore_baseline.md +22 -0
  54. msprobe/docs/img/merge_result.png +0 -0
  55. msprobe/docs/img/visualization/fuzzy_match_ms.png +0 -0
  56. msprobe/docs/img/visualization/fuzzy_match_pt.png +0 -0
  57. msprobe/docs/img/visualization/tensorboard_1.png +0 -0
  58. msprobe/docs/img/visualization/tensorboard_2.png +0 -0
  59. msprobe/docs/img/visualization/vis_browser_1.png +0 -0
  60. msprobe/docs/img/visualization/vis_browser_2.png +0 -0
  61. msprobe/docs/img/visualization/vis_precision_info.png +0 -0
  62. msprobe/docs/img/visualization/vis_search_info.png +0 -0
  63. msprobe/docs/img/visualization/vis_show_info.png +0 -0
  64. msprobe/docs/img/visualization/vis_showcase.png +0 -0
  65. msprobe/docs/img/visualization/vis_unmatch_info.png +0 -0
  66. msprobe/docs/visualization/GPTModel.png +0 -0
  67. msprobe/docs/visualization/ParallelMLP.png +0 -0
  68. msprobe/docs/visualization/layer_mapping_example.md +132 -0
  69. msprobe/docs/visualization/mapping.png +0 -0
  70. msprobe/docs/visualization/mapping1.png +0 -0
  71. msprobe/docs/visualization/module_name.png +0 -0
  72. msprobe/docs/visualization/module_name1.png +0 -0
  73. msprobe/docs/visualization/no_mapping.png +0 -0
  74. msprobe/docs/visualization/no_mapping1.png +0 -0
  75. msprobe/docs/visualization/no_mapping_analyze.png +0 -0
  76. msprobe/docs/visualization/top_layer.png +0 -0
  77. msprobe/mindspore/__init__.py +10 -0
  78. msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +57 -25
  79. msprobe/mindspore/api_accuracy_checker/cmd_parser.py +2 -1
  80. msprobe/mindspore/api_accuracy_checker/compute_element.py +5 -7
  81. msprobe/mindspore/api_accuracy_checker/data_manager.py +37 -0
  82. msprobe/mindspore/api_accuracy_checker/main.py +1 -0
  83. msprobe/mindspore/api_accuracy_checker/multi_api_accuracy_checker.py +12 -6
  84. msprobe/mindspore/api_accuracy_checker/multi_data_manager.py +3 -1
  85. msprobe/mindspore/code_mapping/bind.py +264 -0
  86. msprobe/mindspore/code_mapping/cmd_parser.py +40 -0
  87. msprobe/mindspore/code_mapping/graph.py +49 -0
  88. msprobe/mindspore/code_mapping/graph_parser.py +226 -0
  89. msprobe/mindspore/code_mapping/main.py +24 -0
  90. msprobe/mindspore/code_mapping/processor.py +34 -0
  91. msprobe/mindspore/common/const.py +3 -1
  92. msprobe/mindspore/common/utils.py +50 -5
  93. msprobe/mindspore/compare/distributed_compare.py +0 -2
  94. msprobe/mindspore/compare/ms_compare.py +105 -63
  95. msprobe/mindspore/compare/ms_graph_compare.py +14 -5
  96. msprobe/mindspore/debugger/debugger_config.py +3 -0
  97. msprobe/mindspore/debugger/precision_debugger.py +81 -12
  98. msprobe/mindspore/dump/hook_cell/api_registry.py +83 -16
  99. msprobe/mindspore/dump/hook_cell/hook_cell.py +60 -38
  100. msprobe/mindspore/dump/hook_cell/primitive_hooks.py +33 -15
  101. msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml +11 -1
  102. msprobe/mindspore/dump/hook_cell/wrap_api.py +92 -1
  103. msprobe/mindspore/dump/kernel_dump/kernel_config.py +33 -0
  104. msprobe/mindspore/dump/kernel_graph_dump.py +7 -0
  105. msprobe/mindspore/free_benchmark/api_pynative_self_check.py +13 -4
  106. msprobe/mindspore/free_benchmark/perturbation/bit_noise.py +2 -2
  107. msprobe/mindspore/grad_probe/grad_analyzer.py +24 -12
  108. msprobe/mindspore/grad_probe/hook.py +13 -4
  109. msprobe/mindspore/mindtorch/__init__.py +18 -0
  110. msprobe/mindspore/mindtorch/mindtorch_adaptor.py +255 -0
  111. msprobe/mindspore/ms_config.py +5 -1
  112. msprobe/mindspore/overflow_check/kernel_graph_overflow_check.py +7 -0
  113. msprobe/mindspore/service.py +267 -101
  114. msprobe/msprobe.py +24 -3
  115. msprobe/pytorch/__init__.py +7 -6
  116. msprobe/pytorch/api_accuracy_checker/common/utils.py +31 -16
  117. msprobe/pytorch/api_accuracy_checker/compare/algorithm.py +41 -8
  118. msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +100 -267
  119. msprobe/pytorch/api_accuracy_checker/compare/api_precision_standard.yaml +4 -1
  120. msprobe/pytorch/api_accuracy_checker/compare/compare.py +69 -68
  121. msprobe/pytorch/api_accuracy_checker/compare/compare_column.py +54 -0
  122. msprobe/pytorch/api_accuracy_checker/compare/compare_input.py +51 -0
  123. msprobe/pytorch/api_accuracy_checker/compare/compare_utils.py +2 -4
  124. msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +54 -30
  125. msprobe/pytorch/api_accuracy_checker/precision_standard/absolute_threshold.py +106 -0
  126. msprobe/pytorch/api_accuracy_checker/precision_standard/accumulative_error_compare.py +107 -0
  127. msprobe/pytorch/api_accuracy_checker/precision_standard/base_standard.py +151 -0
  128. msprobe/pytorch/api_accuracy_checker/precision_standard/benchmark_compare.py +226 -0
  129. msprobe/pytorch/api_accuracy_checker/precision_standard/binary_consistency.py +68 -0
  130. msprobe/pytorch/api_accuracy_checker/precision_standard/standard_config.py +218 -0
  131. msprobe/pytorch/api_accuracy_checker/precision_standard/standard_register.py +104 -0
  132. msprobe/pytorch/api_accuracy_checker/precision_standard/thousandth_standard.py +63 -0
  133. msprobe/pytorch/api_accuracy_checker/precision_standard/ulp_compare.py +200 -0
  134. msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py +57 -1
  135. msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +2 -1
  136. msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +42 -14
  137. msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +64 -19
  138. msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py +34 -4
  139. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +5 -3
  140. msprobe/pytorch/bench_functions/npu_fusion_attention.py +42 -10
  141. msprobe/pytorch/common/parse_json.py +2 -1
  142. msprobe/pytorch/common/utils.py +45 -2
  143. msprobe/pytorch/compare/distributed_compare.py +17 -29
  144. msprobe/pytorch/compare/pt_compare.py +40 -20
  145. msprobe/pytorch/debugger/debugger_config.py +27 -12
  146. msprobe/pytorch/debugger/precision_debugger.py +42 -12
  147. msprobe/pytorch/dump/module_dump/__init__.py +0 -0
  148. msprobe/pytorch/dump/module_dump/module_dump.py +86 -0
  149. msprobe/pytorch/{module_processer.py → dump/module_dump/module_processer.py} +80 -6
  150. msprobe/pytorch/free_benchmark/common/params.py +2 -1
  151. msprobe/pytorch/free_benchmark/common/utils.py +3 -0
  152. msprobe/pytorch/free_benchmark/compare/grad_saver.py +0 -2
  153. msprobe/pytorch/free_benchmark/result_handlers/base_handler.py +31 -47
  154. msprobe/pytorch/free_benchmark/result_handlers/preheat_handler.py +0 -4
  155. msprobe/pytorch/hook_module/__init__.py +1 -1
  156. msprobe/pytorch/hook_module/hook_module.py +14 -11
  157. msprobe/pytorch/hook_module/register_optimizer_hook.py +59 -0
  158. msprobe/pytorch/hook_module/support_wrap_ops.yaml +34 -0
  159. msprobe/pytorch/hook_module/wrap_distributed.py +6 -8
  160. msprobe/pytorch/hook_module/wrap_functional.py +0 -40
  161. msprobe/pytorch/monitor/anomaly_analyse.py +1 -1
  162. msprobe/pytorch/monitor/anomaly_detect.py +107 -22
  163. msprobe/pytorch/monitor/csv2tb.py +166 -0
  164. msprobe/pytorch/monitor/distributed/wrap_distributed.py +25 -14
  165. msprobe/pytorch/monitor/features.py +3 -3
  166. msprobe/pytorch/monitor/module_hook.py +483 -277
  167. msprobe/pytorch/monitor/module_metric.py +27 -48
  168. msprobe/pytorch/monitor/module_spec_verifier.py +3 -1
  169. msprobe/pytorch/monitor/optimizer_collect.py +52 -14
  170. msprobe/pytorch/monitor/unittest/test_monitor.py +24 -9
  171. msprobe/pytorch/monitor/utils.py +77 -6
  172. msprobe/pytorch/online_dispatch/dispatch.py +8 -2
  173. msprobe/pytorch/parse_tool/lib/compare.py +10 -10
  174. msprobe/pytorch/parse_tool/lib/config.py +5 -7
  175. msprobe/pytorch/parse_tool/lib/file_desc.py +15 -1
  176. msprobe/pytorch/parse_tool/lib/interactive_cli.py +10 -10
  177. msprobe/pytorch/parse_tool/lib/parse_exception.py +7 -7
  178. msprobe/pytorch/parse_tool/lib/parse_tool.py +11 -10
  179. msprobe/pytorch/parse_tool/lib/utils.py +18 -19
  180. msprobe/pytorch/parse_tool/lib/visualization.py +9 -10
  181. msprobe/pytorch/service.py +176 -106
  182. msprobe/visualization/builder/graph_builder.py +62 -5
  183. msprobe/visualization/builder/msprobe_adapter.py +24 -2
  184. msprobe/visualization/compare/graph_comparator.py +64 -14
  185. msprobe/visualization/compare/mode_adapter.py +1 -15
  186. msprobe/visualization/graph/base_node.py +12 -17
  187. msprobe/visualization/graph/distributed_analyzer.py +318 -0
  188. msprobe/visualization/graph/graph.py +9 -0
  189. msprobe/visualization/graph_service.py +97 -23
  190. msprobe/visualization/utils.py +14 -29
  191. msprobe/pytorch/functional/module_dump.py +0 -84
  192. {mindstudio_probe-1.1.1.dist-info → mindstudio_probe-1.2.1.dist-info}/LICENSE +0 -0
  193. {mindstudio_probe-1.1.1.dist-info → mindstudio_probe-1.2.1.dist-info}/WHEEL +0 -0
  194. {mindstudio_probe-1.1.1.dist-info → mindstudio_probe-1.2.1.dist-info}/entry_points.txt +0 -0
  195. {mindstudio_probe-1.1.1.dist-info → mindstudio_probe-1.2.1.dist-info}/top_level.txt +0 -0
  196. /msprobe/docs/{data_dump_Mindspore → data_dump_MindSpore}/dynamic_graph_quick_start_example.md +0 -0
  197. /msprobe/{pytorch/functional → mindspore/code_mapping}/__init__.py +0 -0
msprobe/docs/27.dump_json_instruction.md ADDED
@@ -0,0 +1,521 @@
+ # dump.json file description and examples
+
+ ## 1. dump.json file description (PyTorch)
+
+ ### 1.1 L0 level
+ An L0-level dump.json file contains the forward and backward inputs and outputs of each module, together with the module's parameters and parameter gradients. Taking PyTorch's Conv2d module as an example, the module is invoked in the network as:
+ `output = torch.nn.Conv2d(64, 128, 5, padding=2, bias=True)(input)`
+
+ The dump.json file contains the following fields:
+
+ 1. `Module.conv2.Conv2d.forward.0` holds the module's forward data: input_args are the positional input arguments, input_kwargs are the keyword input arguments, output is the module's output, and parameters are the module's parameters, including the weight and bias.
+ 2. `Module.conv2.Conv2d.parameters_grad` holds the module's parameter gradients, i.e. the gradients of the weight and bias.
+ 3. `Module.conv2.Conv2d.backward.0` holds the module's backward data: input is the gradient flowing into the backward pass (the gradient of the forward output), and output is the gradient produced by the backward pass (the gradient of the forward input).
+
+ ```json
+ {
+     "task": "tensor",
+     "level": "L0",
+     "framework": "pytorch",
+     "dump_data_dir": "/dump/path",
+     "data": {
+         "Module.conv2.Conv2d.forward.0": {
+             "input_args": [
+                 {
+                     "type": "torch.Tensor",
+                     "dtype": "torch.float32",
+                     "shape": [8, 16, 14, 14],
+                     "Max": 1.638758659362793,
+                     "Min": 0.0,
+                     "Mean": 0.2544615864753723,
+                     "Norm": 70.50277709960938,
+                     "requires_grad": true,
+                     "data_name": "Module.conv2.Conv2d.forward.0.input.0.pt"
+                 }
+             ],
+             "input_kwargs": {},
+             "output": [
+                 {
+                     "type": "torch.Tensor",
+                     "dtype": "torch.float32",
+                     "shape": [8, 32, 10, 10],
+                     "Max": 1.6815717220306396,
+                     "Min": -1.5120246410369873,
+                     "Mean": -0.025344856083393097,
+                     "Norm": 149.65576171875,
+                     "requires_grad": true,
+                     "data_name": "Module.conv2.Conv2d.forward.0.output.0.pt"
+                 }
+             ],
+             "parameters": {
+                 "weight": {
+                     "type": "torch.Tensor",
+                     "dtype": "torch.float32",
+                     "shape": [32, 16, 5, 5],
+                     "Max": 0.05992485210299492,
+                     "Min": -0.05999220535159111,
+                     "Mean": -0.0006165213999338448,
+                     "Norm": 3.421217441558838,
+                     "requires_grad": true,
+                     "data_name": "Module.conv2.Conv2d.forward.0.parameters.weight.pt"
+                 },
+                 "bias": {
+                     "type": "torch.Tensor",
+                     "dtype": "torch.float32",
+                     "shape": [32],
+                     "Max": 0.05744686722755432,
+                     "Min": -0.04894155263900757,
+                     "Mean": 0.006410328671336174,
+                     "Norm": 0.17263513803482056,
+                     "requires_grad": true,
+                     "data_name": "Module.conv2.Conv2d.forward.0.parameters.bias.pt"
+                 }
+             }
+         },
+         "Module.conv2.Conv2d.parameters_grad": {
+             "weight": [
+                 {
+                     "type": "torch.Tensor",
+                     "dtype": "torch.float32",
+                     "shape": [32, 16, 5, 5],
+                     "Max": 0.018550323322415352,
+                     "Min": -0.008627401664853096,
+                     "Mean": 0.0006675920449197292,
+                     "Norm": 0.26084786653518677,
+                     "requires_grad": false,
+                     "data_name": "Module.conv2.Conv2d.parameters_grad.weight.pt"
+                 }
+             ],
+             "bias": [
+                 {
+                     "type": "torch.Tensor",
+                     "dtype": "torch.float32",
+                     "shape": [32],
+                     "Max": 0.014914230443537235,
+                     "Min": -0.006656786892563105,
+                     "Mean": 0.002657240955159068,
+                     "Norm": 0.029451673850417137,
+                     "requires_grad": false,
+                     "data_name": "Module.conv2.Conv2d.parameters_grad.bias.pt"
+                 }
+             ]
+         },
+         "Module.conv2.Conv2d.backward.0": {
+             "input": [
+                 {
+                     "type": "torch.Tensor",
+                     "dtype": "torch.float32",
+                     "shape": [8, 32, 10, 10],
+                     "Max": 0.0015069986693561077,
+                     "Min": -0.001139344065450132,
+                     "Mean": 3.3215508210560074e-06,
+                     "Norm": 0.020567523315548897,
+                     "requires_grad": false,
+                     "data_name": "Module.conv2.Conv2d.backward.0.input.0.pt"
+                 }
+             ],
+             "output": [
+                 {
+                     "type": "torch.Tensor",
+                     "dtype": "torch.float32",
+                     "shape": [8, 16, 14, 14],
+                     "Max": 0.0007466732058674097,
+                     "Min": -0.00044813455315306783,
+                     "Mean": 6.814070275140693e-06,
+                     "Norm": 0.01474067009985447,
+                     "requires_grad": false,
+                     "data_name": "Module.conv2.Conv2d.backward.0.output.0.pt"
+                 }
+             ]
+         }
+     }
+ }
+ ```
+
+ ### 1.2 L1 level
+ An L1-level dump.json file contains the forward and backward inputs and outputs of each API. Taking PyTorch's relu function as an example, the API is invoked in the network as:
+ `output = torch.nn.functional.relu(input)`
+
+ The dump.json file contains the following fields:
+ 1. `Functional.relu.0.forward` holds the API's forward data: input_args are the positional input arguments, input_kwargs are the keyword input arguments, and output is the API's output.
+ 2. `Functional.relu.0.backward` holds the API's backward data: input is the gradient flowing into the backward pass (the gradient of the forward output), and output is the gradient produced by the backward pass (the gradient of the forward input).
+
+ ```json
+ {
+     "task": "tensor",
+     "level": "L1",
+     "framework": "pytorch",
+     "dump_data_dir": "/dump/path",
+     "data": {
+         "Functional.relu.0.forward": {
+             "input_args": [
+                 {
+                     "type": "torch.Tensor",
+                     "dtype": "torch.float32",
+                     "shape": [32, 16, 28, 28],
+                     "Max": 1.3864083290100098,
+                     "Min": -1.3364859819412231,
+                     "Mean": 0.03711778670549393,
+                     "Norm": 236.20692443847656,
+                     "requires_grad": true,
+                     "data_name": "Functional.relu.0.forward.input.0.pt"
+                 }
+             ],
+             "input_kwargs": {},
+             "output": [
+                 {
+                     "type": "torch.Tensor",
+                     "dtype": "torch.float32",
+                     "shape": [32, 16, 28, 28],
+                     "Max": 1.3864083290100098,
+                     "Min": 0.0,
+                     "Mean": 0.16849493980407715,
+                     "Norm": 175.23345947265625,
+                     "requires_grad": true,
+                     "data_name": "Functional.relu.0.forward.output.0.pt"
+                 }
+             ]
+         },
+         "Functional.relu.0.backward": {
+             "input": [
+                 {
+                     "type": "torch.Tensor",
+                     "dtype": "torch.float32",
+                     "shape": [32, 16, 28, 28],
+                     "Max": 0.0001815402356442064,
+                     "Min": -0.00013352684618439525,
+                     "Mean": 0.00011915402356442064,
+                     "Norm": 0.007598237134516239,
+                     "requires_grad": false,
+                     "data_name": "Functional.relu.0.backward.input.0.pt"
+                 }
+             ],
+             "output": [
+                 {
+                     "type": "torch.Tensor",
+                     "dtype": "torch.float32",
+                     "shape": [32, 16, 28, 28],
+                     "Max": 0.0001815402356442064,
+                     "Min": -0.00012117840378778055,
+                     "Mean": 2.0098118724831693e-08,
+                     "Norm": 0.006532244384288788,
+                     "requires_grad": false,
+                     "data_name": "Functional.relu.0.backward.output.0.pt"
+                 }
+             ]
+         }
+     }
+ }
+ ```
+
+ ### 1.3 mix level
+
+ A mix-level dump.json file contains both the L0 and L1 dump data; the file format is the same as in the examples above.
+
+ ## 2. dump.json file examples (MindSpore)
+
+ ### 2.1 L0 level
+
+ An L0-level dump.json file contains the forward and backward inputs and outputs of each module, together with the module's parameters and parameter gradients.
+ Taking MindSpore's Conv2d module as an example, the module call used for the dump.json file is:
+ `output = mindspore.nn.Conv2d(64, 128, 5, pad_mode='same', has_bias=True)(input)`
+
+ The dump.json file contains the following fields:
+ 1. `Cell.conv2.Conv2d.forward.0` holds the module's forward data: input_args are the positional input arguments, input_kwargs are the keyword input arguments, output is the module's output, and parameters are the module's parameters, including the weight and bias.
+ 2. `Cell.conv2.Conv2d.parameters_grad` holds the module's parameter gradients, i.e. the gradients of the weight and bias.
+ 3. `Cell.conv2.Conv2d.backward.0` holds the module's backward data: input is the gradient flowing into the backward pass (the gradient of the forward output), and output is the gradient produced by the backward pass (the gradient of the forward input).
+
+ ```json
+ {
+     "task": "tensor",
+     "level": "L0",
+     "framework": "mindspore",
+     "dump_data_dir": "/dump/path",
+     "data": {
+         "Cell.conv2.Conv2d.forward.0": {
+             "input_args": [
+                 {
+                     "type": "mindspore.Tensor",
+                     "dtype": "Float32",
+                     "shape": [8, 16, 14, 14],
+                     "Max": 1.638758659362793,
+                     "Min": 0.0,
+                     "Mean": 0.2544615864753723,
+                     "Norm": 70.50277709960938,
+                     "data_name": "Cell.conv2.Conv2d.forward.0.input.0.npy"
+                 }
+             ],
+             "input_kwargs": {},
+             "output": [
+                 {
+                     "type": "mindspore.Tensor",
+                     "dtype": "Float32",
+                     "shape": [8, 32, 10, 10],
+                     "Max": 1.6815717220306396,
+                     "Min": -1.5120246410369873,
+                     "Mean": -0.025344856083393097,
+                     "Norm": 149.65576171875,
+                     "data_name": "Cell.conv2.Conv2d.forward.0.output.0.npy"
+                 }
+             ],
+             "parameters": {
+                 "weight": {
+                     "type": "mindspore.Tensor",
+                     "dtype": "Float32",
+                     "shape": [32, 16, 5, 5],
+                     "Max": 0.05992485210299492,
+                     "Min": -0.05999220535159111,
+                     "Mean": -0.0006165213999338448,
+                     "Norm": 3.421217441558838,
+                     "data_name": "Cell.conv2.Conv2d.forward.0.parameters.weight.npy"
+                 },
+                 "bias": {
+                     "type": "mindspore.Tensor",
+                     "dtype": "Float32",
+                     "shape": [32],
+                     "Max": 0.05744686722755432,
+                     "Min": -0.04894155263900757,
+                     "Mean": 0.006410328671336174,
+                     "Norm": 0.17263513803482056,
+                     "data_name": "Cell.conv2.Conv2d.forward.0.parameters.bias.npy"
+                 }
+             }
+         },
+         "Cell.conv2.Conv2d.parameters_grad": {
+             "weight": [
+                 {
+                     "type": "mindspore.Tensor",
+                     "dtype": "Float32",
+                     "shape": [32, 16, 5, 5],
+                     "Max": 0.018550323322415352,
+                     "Min": -0.008627401664853096,
+                     "Mean": 0.0006675920449197292,
+                     "Norm": 0.26084786653518677,
+                     "data_name": "Cell.conv2.Conv2d.parameters_grad.weight.npy"
+                 }
+             ],
+             "bias": [
+                 {
+                     "type": "mindspore.Tensor",
+                     "dtype": "Float32",
+                     "shape": [32],
+                     "Max": 0.014914230443537235,
+                     "Min": -0.006656786892563105,
+                     "Mean": 0.002657240955159068,
+                     "Norm": 0.029451673850417137,
+                     "data_name": "Cell.conv2.Conv2d.parameters_grad.bias.npy"
+                 }
+             ]
+         },
+         "Cell.conv2.Conv2d.backward.0": {
+             "input": [
+                 {
+                     "type": "mindspore.Tensor",
+                     "dtype": "Float32",
+                     "shape": [8, 32, 10, 10],
+                     "Max": 0.0015069986693561077,
+                     "Min": -0.001139344065450132,
+                     "Mean": 3.3215508210560074e-06,
+                     "Norm": 0.020567523315548897,
+                     "data_name": "Cell.conv2.Conv2d.backward.0.input.0.npy"
+                 }
+             ],
+             "output": [
+                 {
+                     "type": "mindspore.Tensor",
+                     "dtype": "Float32",
+                     "shape": [8, 16, 14, 14],
+                     "Max": 0.0007466732058674097,
+                     "Min": -0.00044813455315306783,
+                     "Mean": 6.814070275140693e-06,
+                     "Norm": 0.01474067009985447,
+                     "data_name": "Cell.conv2.Conv2d.backward.0.output.0.npy"
+                 }
+             ]
+         }
+     }
+ }
+ ```
+
+ ### 2.2 L1 level
+ An L1-level dump.json file contains the forward and backward inputs and outputs of each API. Taking MindSpore's relu function as an example, the API is invoked in the network as:
+ `output = mindspore.ops.relu(input)`
+
+ The dump.json file contains the following fields:
+ 1. `Functional.relu.0.forward` holds the API's forward data: input_args are the positional input arguments, input_kwargs are the keyword input arguments, and output is the API's output.
+ 2. `Functional.relu.0.backward` holds the API's backward data: input is the gradient flowing into the backward pass (the gradient of the forward output), and output is the gradient produced by the backward pass (the gradient of the forward input).
+
+ ```json
+ {
+     "task": "tensor",
+     "level": "L1",
+     "framework": "mindspore",
+     "dump_data_dir": "/dump/path",
+     "data": {
+         "Functional.relu.0.forward": {
+             "input_args": [
+                 {
+                     "type": "mindspore.Tensor",
+                     "dtype": "Float32",
+                     "shape": [32, 16, 28, 28],
+                     "Max": 1.3864083290100098,
+                     "Min": -1.3364859819412231,
+                     "Mean": 0.03711778670549393,
+                     "Norm": 236.20692443847656,
+                     "data_name": "Functional.relu.0.forward.input.0.npy"
+                 }
+             ],
+             "input_kwargs": {},
+             "output": [
+                 {
+                     "type": "mindspore.Tensor",
+                     "dtype": "Float32",
+                     "shape": [32, 16, 28, 28],
+                     "Max": 1.3864083290100098,
+                     "Min": 0.0,
+                     "Mean": 0.16849493980407715,
+                     "Norm": 175.23345947265625,
+                     "data_name": "Functional.relu.0.forward.output.0.npy"
+                 }
+             ]
+         },
+         "Functional.relu.0.backward": {
+             "input": [
+                 {
+                     "type": "mindspore.Tensor",
+                     "dtype": "Float32",
+                     "shape": [32, 16, 28, 28],
+                     "Max": 0.0001815402356442064,
+                     "Min": -0.00013352684618439525,
+                     "Mean": 0.00011915402356442064,
+                     "Norm": 0.007598237134516239,
+                     "data_name": "Functional.relu.0.backward.input.0.npy"
+                 }
+             ],
+             "output": [
+                 {
+                     "type": "mindspore.Tensor",
+                     "dtype": "Float32",
+                     "shape": [32, 16, 28, 28],
+                     "Max": 0.0001815402356442064,
+                     "Min": -0.00012117840378778055,
+                     "Mean": 2.0098118724831693e-08,
+                     "Norm": 0.006532244384288788,
+                     "data_name": "Functional.relu.0.backward.output.0.npy"
+                 }
+             ]
+         }
+     }
+ }
+ ```
+
+ ### 2.3 mix level
+ A mix-level dump.json file contains both the L0 and L1 dump data; the file format is the same as in the examples above.
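The dump.json layout documented above is plain JSON, so it can be inspected without msprobe itself. The following minimal Python sketch is not part of msprobe; it only assumes the `data`/statistics layout shown in the examples above, and the path in the usage comment is hypothetical. It walks a dump.json file and prints the summary statistics recorded for every dumped item:

```python
import json

def summarize_dump(dump_json_path):
    """Print the Max/Min/Mean/Norm statistics recorded for each item in a dump.json file."""
    with open(dump_json_path, "r", encoding="utf-8") as f:
        dump = json.load(f)

    def walk(node, prefix):
        # A leaf tensor descriptor carries the statistics shown in the examples above.
        if isinstance(node, dict) and "Max" in node:
            print(f"{prefix}: shape={node.get('shape')}, Max={node['Max']}, "
                  f"Min={node.get('Min')}, Mean={node.get('Mean')}, Norm={node.get('Norm')}")
        elif isinstance(node, dict):
            for key, value in node.items():
                walk(value, f"{prefix}.{key}")
        elif isinstance(node, list):
            for index, value in enumerate(node):
                walk(value, f"{prefix}.{index}")

    for item_name, record in dump.get("data", {}).items():
        walk(record, item_name)

# Hypothetical usage; the actual path depends on your dump configuration:
# summarize_dump("./dump_path/step0/rank0/dump.json")
```

Recursing over dicts and lists covers both the module records (L0) and the API records (L1) without hard-coding either schema.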
msprobe/docs/FAQ.md CHANGED
@@ -13,6 +13,29 @@
  2. If a namedtuple is returned as the output of an nn.Module, the tool dumps each of its fields, but the recorded output type becomes tuple. Why?
  - This comes from the PyTorch framework itself: when a backward hook is registered on a module, namedtuple outputs are converted to plain tuples.
 
+ 3. An API is listed in the dump support list support_wrap_ops.yaml, but no data was dumped for it. Why?
+ - First, confirm that the API call is inside the collection range, i.e. between the **start** and **stop** calls.
+ - Second, the tool patches APIs only when it is invoked, which is what allows their data to be dumped. If an API is called through a direct import, its address is already fixed at that point,
+ so the tool can no longer patch it and its data cannot be dumped. In the following example, relu will not be dumped:
+ ```python
+ import torch
+ from torch import relu  # the address of relu is already fixed here and cannot be changed
+
+ from msprobe.pytorch import PrecisionDebugger
+
+ debugger = PrecisionDebugger(dump_path="./dump_data")
+ x = torch.randn(10)
+ debugger.start()  # APIs under torch are patched here, but the directly imported relu can no longer be patched
+ x = relu(x)
+ debugger.stop()
+ ```
+ In this scenario, to collect the relu data, simply change `relu(x)` to `torch.relu(x)`.
+
+ 4. With L0 dump enabled, some modules' data is not collected. Why?
+ - Check whether the log contains a message like `The {module_name} has registered deprecated register_backward_hook`.
+ This message means the module has a register_backward_hook attached, an interface deprecated by PyTorch that conflicts with the register_full_backward_hook interface used by the tool, so the tool skips backward data collection for that module.
+ - If you want data for every module to be collected, replace the register_backward_hook calls in the model with the register_full_backward_pre_hook or register_full_backward_hook interfaces recommended by PyTorch.
+
 
  # 2 Accuracy pre-check (PyTorch)
 
  1. During dump and run_ut, does the pre-check tool need jit compilation (jit_compile) to be enabled or disabled consistently?
@@ -183,9 +206,10 @@ def npu_forward_fused_softmax(self, input_, mask):
 
  Answer: Comment out `- __getitem__` under `Tensor:` in the tool file `mstt/debug/accuracy_tools/msprobe/pytorch/hook_module/support_wrap_ops.yaml`; the tool will then skip collecting that API. If that API at a critical location does need to be collected, you can also comment out the type check that raises the error, based on the error stack.
 
- 11. After adding the msprobe tool, F.gelu raises a ValueError such as `activation_func must be F.gelu`, and collecting Megatron data reports `ValueError(Only support fusion of gelu and swiglu)`.
+ 11. After enabling msprobe data collection, the model fails with errors such as `activation_func must be F.gelu` or `ValueError(Only support fusion of gelu and swiglu)`.
 
- Answer: This class of problem occurs because the tool wraps torch operators, so checks on operator names fail. Comment out `-gelu` or `-silu` in `mstt/debug/accuracy_tools/msprobe/pytorch/hook_module/support_wrap_ops.yaml` and the tool will skip collecting that API. If a critical API does need to be collected, you can also comment out the type check that raises the error, based on the error stack.
+ Answer: This class of error is common with acceleration libraries and model repositories such as Megatron/MindSpeed/ModelLink. The tool wraps torch APIs (their type and address change), but some APIs have their type and address fixed before the tool is enabled, so the tool can no longer wrap them; when the acceleration library then type-checks such an API, it compares the original, unwrapped API against the tool-wrapped one, and the check fails.
+ There are three workarounds: (1) instantiate PrecisionDebugger at the beginning of the file, right after the imports, so that every API gets wrapped; (2) comment out `-gelu` or `-silu` in `mstt/debug/accuracy_tools/msprobe/pytorch/hook_module/support_wrap_ops.yaml`, so the tool skips collecting that API; (3) comment out the type check that raises the error, based on the error stack.
 
  12. After adding the msprobe tool, errors related to the AsStrided operator or to compilation are raised, such as `Failed to compile Op [AsStrided]`.
 
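The answer to question 4 above recommends moving from the deprecated `register_backward_hook` to `register_full_backward_hook` so that the tool's backward collection is not skipped. The sketch below illustrates that switch on a toy model; the model and the `log_grad` hook are illustrative assumptions, not part of msprobe:

```python
import torch
import torch.nn as nn

def log_grad(module, grad_input, grad_output):
    # Same (module, grad_input, grad_output) signature as the deprecated hook.
    print(f"{module.__class__.__name__}: grad_output norm = {grad_output[0].norm().item():.6f}")

model = nn.Sequential(nn.Linear(8, 8), nn.ReLU(), nn.Linear(8, 1))

# Deprecated form that conflicts with the register_full_backward_hook used by msprobe:
# model[0].register_backward_hook(log_grad)

# Recommended replacement:
model[0].register_full_backward_hook(log_grad)

loss = model(torch.randn(4, 8)).sum()
loss.backward()
```

Both hook variants receive `(module, grad_input, grad_output)`, so in most cases the migration is a one-line change.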
msprobe/docs/accuracy_checker_MindSpore/accuracy_checker_MindSpore_baseline.md ADDED
@@ -0,0 +1,14 @@
+ # Accuracy pre-check baselines for the MindSpore scenario
+
+ ## Reference runtime baseline for accuracy pre-check in "multi_run_ut" mode
+
+ This baseline gives reference runtimes for accuracy pre-checks run in "multi_run_ut" mode under the MindSpore framework. It measures how the runtime for a 38B language model changes with the number of cards.
+
+ ### 38B language model
+
+ | Cards | Total time (minutes) | Notes |
+ | ----- | -------------------- | -------------------- |
+ | 1 | 21.0 | single-card baseline |
+ | 2 | 11.5 | two-card baseline |
+ | 4 | 6.7 | four-card baseline |
+ | 8 | 3.5 | eight-card baseline |
msprobe/docs/data_dump_MindSpore/data_dump_MindSpore_baseline.md ADDED
@@ -0,0 +1,22 @@
+ # Accuracy data dump baselines for the MindSpore scenario
+
+ ## Reference dump-volume baseline for the "tensor" mode
+
+ This baseline gives reference dump volumes for data collected in "tensor" mode under the MindSpore framework. It measures how the dump volume for a 38B language model changes with the collection mode, the global_batch_size, and the number of cards (1 card vs. 8 cards).
+
+ ### 38B language model
+
+ <table>
+ <tr><th>Collection mode</th><th>global_batch_size</th><th>1 card</th><th>8 cards</th></tr>
+ <tr><td rowspan="3">L0</td><td>1</td><td>262 GB</td><td>2.1 TB</td></tr>
+ <tr><td>2</td><td>480 GB</td><td>3.8 TB</td></tr>
+ <tr><td>3</td><td>928 GB</td><td>7.4 TB</td></tr>
+ <tr><td rowspan="3">L1</td><td>1</td><td>2.1 TB</td><td>17.1 TB</td></tr>
+ <tr><td>2</td><td>2.8 TB</td><td>22.7 TB</td></tr>
+ <tr><td>3</td><td>4.2 TB</td><td>34.3 TB</td></tr>
+ <tr><td rowspan="3">mix</td><td>1</td><td>2.4 TB</td><td>19.2 TB</td></tr>
+ <tr><td>2</td><td>3.3 TB</td><td>26.6 TB</td></tr>
+ <tr><td>3</td><td>5.1 TB</td><td>41.4 TB</td></tr>
+ </table>