mindstudio-probe 1.2.2__py3-none-any.whl → 8.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/METADATA +4 -3
- {mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/RECORD +243 -191
- msprobe/README.md +57 -21
- msprobe/core/__init__.py +17 -0
- msprobe/core/common/const.py +224 -82
- msprobe/core/common/decorator.py +50 -0
- msprobe/core/common/exceptions.py +5 -3
- msprobe/core/common/file_utils.py +274 -40
- msprobe/core/common/framework_adapter.py +169 -0
- msprobe/core/common/global_lock.py +86 -0
- msprobe/core/common/runtime.py +25 -0
- msprobe/core/common/utils.py +148 -72
- msprobe/core/common_config.py +7 -0
- msprobe/core/compare/acc_compare.py +640 -462
- msprobe/core/compare/check.py +36 -107
- msprobe/core/compare/compare_cli.py +4 -0
- msprobe/core/compare/config.py +72 -0
- msprobe/core/compare/highlight.py +217 -215
- msprobe/core/compare/layer_mapping/layer_mapping.py +4 -1
- msprobe/core/compare/merge_result/merge_result.py +12 -6
- msprobe/core/compare/multiprocessing_compute.py +227 -107
- msprobe/core/compare/npy_compare.py +32 -16
- msprobe/core/compare/utils.py +218 -244
- msprobe/{mindspore/runtime.py → core/config_check/__init__.py} +2 -4
- msprobe/{pytorch/dump/kernel_dump/kernel_config.py → core/config_check/checkers/__init__.py} +8 -16
- msprobe/core/config_check/checkers/base_checker.py +60 -0
- msprobe/core/config_check/checkers/dataset_checker.py +138 -0
- msprobe/core/config_check/checkers/env_args_checker.py +96 -0
- msprobe/core/config_check/checkers/hyperparameter_checker.py +170 -0
- msprobe/core/config_check/checkers/pip_checker.py +90 -0
- msprobe/core/config_check/checkers/random_checker.py +367 -0
- msprobe/core/config_check/checkers/weights_checker.py +147 -0
- msprobe/core/config_check/ckpt_compare/ckpt_comparator.py +74 -0
- msprobe/core/config_check/ckpt_compare/megatron_loader.py +302 -0
- msprobe/core/config_check/ckpt_compare/metrics.py +83 -0
- msprobe/core/config_check/ckpt_compare/name_mapping.yaml +12 -0
- msprobe/core/config_check/config_check_cli.py +51 -0
- msprobe/core/config_check/config_checker.py +100 -0
- msprobe/{pytorch/parse.py → core/config_check/resource/dependency.yaml} +7 -4
- msprobe/core/config_check/resource/env.yaml +57 -0
- msprobe/core/config_check/resource/hyperparameter.yaml +21 -0
- msprobe/core/config_check/utils/hyperparameter_parser.py +115 -0
- msprobe/core/config_check/utils/utils.py +107 -0
- msprobe/core/data_dump/api_registry.py +239 -0
- msprobe/core/data_dump/data_collector.py +36 -9
- msprobe/core/data_dump/data_processor/base.py +74 -53
- msprobe/core/data_dump/data_processor/mindspore_processor.py +119 -78
- msprobe/core/data_dump/data_processor/pytorch_processor.py +134 -96
- msprobe/core/data_dump/json_writer.py +146 -57
- msprobe/core/debugger/precision_debugger.py +143 -0
- msprobe/core/grad_probe/constant.py +2 -1
- msprobe/core/grad_probe/grad_compare.py +2 -2
- msprobe/core/grad_probe/utils.py +1 -1
- msprobe/core/hook_manager.py +242 -0
- msprobe/core/monitor/anomaly_processor.py +384 -0
- msprobe/core/overflow_check/abnormal_scene.py +2 -0
- msprobe/core/service.py +356 -0
- msprobe/core/single_save/__init__.py +0 -0
- msprobe/core/single_save/single_comparator.py +243 -0
- msprobe/core/single_save/single_saver.py +157 -0
- msprobe/docs/01.installation.md +6 -5
- msprobe/docs/02.config_introduction.md +89 -30
- msprobe/docs/03.config_examples.md +1 -0
- msprobe/docs/04.kernel_dump_PyTorch.md +1 -1
- msprobe/docs/05.data_dump_PyTorch.md +184 -50
- msprobe/docs/06.data_dump_MindSpore.md +193 -28
- msprobe/docs/07.accuracy_checker_PyTorch.md +13 -3
- msprobe/docs/08.accuracy_checker_online_PyTorch.md +72 -10
- msprobe/docs/09.accuracy_checker_MindSpore.md +19 -7
- msprobe/docs/10.accuracy_compare_PyTorch.md +266 -102
- msprobe/docs/11.accuracy_compare_MindSpore.md +117 -43
- msprobe/docs/12.overflow_check_PyTorch.md +5 -3
- msprobe/docs/13.overflow_check_MindSpore.md +6 -4
- msprobe/docs/14.data_parse_PyTorch.md +4 -10
- msprobe/docs/17.grad_probe.md +2 -1
- msprobe/docs/18.online_dispatch.md +3 -3
- msprobe/docs/19.monitor.md +211 -103
- msprobe/docs/21.visualization_PyTorch.md +100 -28
- msprobe/docs/22.visualization_MindSpore.md +103 -31
- msprobe/docs/23.generate_operator_PyTorch.md +9 -9
- msprobe/docs/25.tool_function_introduction.md +23 -22
- msprobe/docs/26.data_dump_PyTorch_baseline.md +14 -3
- msprobe/docs/27.dump_json_instruction.md +278 -8
- msprobe/docs/28.debugger_save_instruction.md +111 -20
- msprobe/docs/28.kernel_dump_MindSpore.md +1 -1
- msprobe/docs/29.data_dump_MSAdapter.md +229 -0
- msprobe/docs/30.overflow_check_MSAdapter.md +31 -0
- msprobe/docs/31.config_check.md +95 -0
- msprobe/docs/32.ckpt_compare.md +69 -0
- msprobe/docs/33.generate_operator_MindSpore.md +190 -0
- msprobe/docs/34.RL_collect.md +92 -0
- msprobe/docs/35.nan_analyze.md +72 -0
- msprobe/docs/FAQ.md +3 -11
- msprobe/docs/data_dump_MindSpore/data_dump_MindSpore_baseline.md +12 -1
- msprobe/docs/data_dump_MindSpore/dynamic_graph_quick_start_example.md +3 -1
- msprobe/docs/img/compare_result.png +0 -0
- msprobe/docs/img/merge_result.png +0 -0
- msprobe/docs/img/save_compare_result_sample.png +0 -0
- msprobe/docs/img/visualization/proxy.png +0 -0
- msprobe/docs/img/visualization/vis_browser_1.png +0 -0
- msprobe/docs/img/visualization/vis_match_info.png +0 -0
- msprobe/docs/img/visualization/vis_precision_info.png +0 -0
- msprobe/docs/img/visualization/vis_search_info.png +0 -0
- msprobe/docs/img/visualization/vis_show_info.png +0 -0
- msprobe/docs/img/visualization/vis_showcase.png +0 -0
- msprobe/docs/img/visualization/vis_unmatch_info.png +0 -0
- msprobe/mindspore/__init__.py +3 -3
- msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +151 -55
- msprobe/mindspore/api_accuracy_checker/api_runner.py +25 -11
- msprobe/mindspore/api_accuracy_checker/base_compare_algorithm.py +2 -1
- msprobe/mindspore/api_accuracy_checker/bench_functions/flash_attention_score.py +580 -0
- msprobe/mindspore/api_accuracy_checker/bench_functions/fusion_operator.py +41 -0
- msprobe/mindspore/api_accuracy_checker/cmd_parser.py +4 -0
- msprobe/mindspore/api_accuracy_checker/data_manager.py +4 -3
- msprobe/mindspore/api_accuracy_checker/generate_op_script/config_op.json +9 -0
- msprobe/mindspore/api_accuracy_checker/generate_op_script/op_generator.py +451 -0
- msprobe/mindspore/api_accuracy_checker/generate_op_script/operator_replication.template +2081 -0
- msprobe/mindspore/api_accuracy_checker/multi_api_accuracy_checker.py +11 -1
- msprobe/mindspore/api_accuracy_checker/torch_mindtorch_importer.py +2 -1
- msprobe/mindspore/cell_processor.py +204 -33
- msprobe/mindspore/code_mapping/graph_parser.py +4 -21
- msprobe/mindspore/common/const.py +73 -2
- msprobe/mindspore/common/utils.py +157 -29
- msprobe/mindspore/compare/common_dir_compare.py +382 -0
- msprobe/mindspore/compare/distributed_compare.py +2 -26
- msprobe/mindspore/compare/ms_compare.py +18 -398
- msprobe/mindspore/compare/ms_graph_compare.py +20 -10
- msprobe/mindspore/compare/utils.py +37 -0
- msprobe/mindspore/debugger/debugger_config.py +59 -7
- msprobe/mindspore/debugger/precision_debugger.py +83 -90
- msprobe/mindspore/dump/cell_dump_process.py +902 -0
- msprobe/mindspore/dump/cell_dump_with_insert_gradient.py +889 -0
- msprobe/mindspore/dump/dump_tool_factory.py +18 -8
- msprobe/mindspore/dump/graph_mode_cell_dump.py +139 -0
- msprobe/mindspore/dump/graph_tensor_dump.py +123 -0
- msprobe/mindspore/dump/hook_cell/api_register.py +176 -0
- msprobe/mindspore/dump/hook_cell/hook_cell.py +22 -12
- msprobe/mindspore/dump/hook_cell/ms_hook_manager.py +88 -0
- msprobe/mindspore/dump/hook_cell/primitive_hooks.py +8 -2
- msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml +42 -26
- msprobe/mindspore/dump/jit_dump.py +35 -27
- msprobe/mindspore/dump/kernel_kbyk_dump.py +6 -3
- msprobe/mindspore/dym_loader/hook_dynamic_loader.cpp +110 -0
- msprobe/mindspore/dym_loader/hook_dynamic_loader.h +15 -16
- msprobe/mindspore/free_benchmark/api_pynative_self_check.py +22 -12
- msprobe/mindspore/free_benchmark/common/utils.py +1 -1
- msprobe/mindspore/free_benchmark/perturbation/perturbation_factory.py +4 -2
- msprobe/mindspore/free_benchmark/self_check_tool_factory.py +6 -3
- msprobe/mindspore/grad_probe/global_context.py +9 -2
- msprobe/mindspore/grad_probe/grad_analyzer.py +2 -1
- msprobe/mindspore/grad_probe/grad_stat_csv.py +3 -2
- msprobe/mindspore/grad_probe/hook.py +2 -4
- msprobe/mindspore/mindspore_service.py +111 -0
- msprobe/mindspore/monitor/common_func.py +52 -0
- msprobe/mindspore/monitor/data_writers.py +237 -0
- msprobe/mindspore/monitor/distributed/wrap_distributed.py +1 -1
- msprobe/mindspore/monitor/features.py +13 -1
- msprobe/mindspore/monitor/module_hook.py +568 -444
- msprobe/mindspore/monitor/optimizer_collect.py +331 -0
- msprobe/mindspore/monitor/utils.py +71 -9
- msprobe/mindspore/ms_config.py +16 -15
- msprobe/mindspore/overflow_check/overflow_check_tool_factory.py +5 -3
- msprobe/mindspore/task_handler_factory.py +5 -2
- msprobe/msprobe.py +19 -0
- msprobe/nan_analyze/__init__.py +14 -0
- msprobe/nan_analyze/analyzer.py +255 -0
- msprobe/nan_analyze/graph.py +189 -0
- msprobe/nan_analyze/utils.py +211 -0
- msprobe/pytorch/api_accuracy_checker/common/config.py +2 -2
- msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +3 -6
- msprobe/pytorch/api_accuracy_checker/compare/compare.py +36 -34
- msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +15 -13
- msprobe/pytorch/api_accuracy_checker/generate_op_script/operator_replication.template +206 -4
- msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +9 -9
- msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +6 -5
- msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +31 -9
- msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py +28 -20
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py +3 -1
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +29 -13
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +12 -2
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +45 -31
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/utils.py +154 -0
- msprobe/pytorch/attl_manager.py +65 -0
- msprobe/pytorch/bench_functions/moe_gating_top_k_softmax.py +6 -0
- msprobe/pytorch/bench_functions/npu_fusion_attention.py +27 -0
- msprobe/pytorch/common/utils.py +53 -19
- msprobe/pytorch/compare/distributed_compare.py +4 -36
- msprobe/pytorch/compare/pt_compare.py +13 -84
- msprobe/pytorch/compare/utils.py +47 -0
- msprobe/pytorch/debugger/debugger_config.py +34 -17
- msprobe/pytorch/debugger/precision_debugger.py +50 -96
- msprobe/pytorch/dump/module_dump/hook_wrapper.py +93 -0
- msprobe/pytorch/dump/module_dump/module_dump.py +15 -61
- msprobe/pytorch/dump/module_dump/module_processer.py +150 -114
- msprobe/pytorch/free_benchmark/common/utils.py +1 -1
- msprobe/pytorch/free_benchmark/compare/single_benchmark.py +1 -1
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py +3 -3
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py +3 -3
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/change_value.py +1 -1
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py +1 -1
- msprobe/pytorch/free_benchmark/result_handlers/check_handler.py +1 -1
- msprobe/pytorch/function_factory.py +1 -1
- msprobe/pytorch/grad_probe/grad_monitor.py +2 -2
- msprobe/pytorch/grad_probe/grad_stat_csv.py +3 -2
- msprobe/pytorch/hook_module/api_register.py +155 -0
- msprobe/pytorch/hook_module/hook_module.py +18 -22
- msprobe/pytorch/hook_module/jit_script_wrapper.py +33 -0
- msprobe/pytorch/hook_module/pt_hook_manager.py +68 -0
- msprobe/pytorch/hook_module/register_optimizer_hook.py +2 -1
- msprobe/pytorch/hook_module/support_wrap_ops.yaml +193 -75
- msprobe/pytorch/hook_module/utils.py +28 -2
- msprobe/pytorch/monitor/csv2tb.py +14 -4
- msprobe/pytorch/monitor/data_writers.py +259 -0
- msprobe/pytorch/monitor/distributed/wrap_distributed.py +8 -2
- msprobe/pytorch/monitor/module_hook.py +336 -241
- msprobe/pytorch/monitor/module_metric.py +17 -0
- msprobe/pytorch/monitor/optimizer_collect.py +244 -224
- msprobe/pytorch/monitor/utils.py +84 -4
- msprobe/pytorch/online_dispatch/compare.py +0 -2
- msprobe/pytorch/online_dispatch/dispatch.py +13 -2
- msprobe/pytorch/online_dispatch/dump_compare.py +8 -2
- msprobe/pytorch/online_dispatch/utils.py +3 -0
- msprobe/pytorch/parse_tool/lib/interactive_cli.py +1 -6
- msprobe/pytorch/parse_tool/lib/utils.py +5 -4
- msprobe/pytorch/pt_config.py +16 -11
- msprobe/pytorch/pytorch_service.py +70 -0
- msprobe/visualization/builder/graph_builder.py +69 -10
- msprobe/visualization/builder/msprobe_adapter.py +24 -12
- msprobe/visualization/compare/graph_comparator.py +63 -51
- msprobe/visualization/compare/mode_adapter.py +22 -20
- msprobe/visualization/graph/base_node.py +11 -4
- msprobe/visualization/graph/distributed_analyzer.py +1 -10
- msprobe/visualization/graph/graph.py +2 -13
- msprobe/visualization/graph/node_op.py +1 -2
- msprobe/visualization/graph_service.py +251 -104
- msprobe/visualization/utils.py +26 -44
- msprobe/mindspore/dump/hook_cell/api_registry.py +0 -207
- msprobe/mindspore/dump/hook_cell/wrap_api.py +0 -212
- msprobe/mindspore/dym_loader/hook_dynamic_loader.cc +0 -140
- msprobe/mindspore/monitor/anomaly_detect.py +0 -404
- msprobe/mindspore/monitor/module_spec_verifier.py +0 -94
- msprobe/mindspore/service.py +0 -543
- msprobe/pytorch/hook_module/api_registry.py +0 -166
- msprobe/pytorch/hook_module/wrap_distributed.py +0 -79
- msprobe/pytorch/hook_module/wrap_functional.py +0 -66
- msprobe/pytorch/hook_module/wrap_npu_custom.py +0 -85
- msprobe/pytorch/hook_module/wrap_tensor.py +0 -69
- msprobe/pytorch/hook_module/wrap_torch.py +0 -84
- msprobe/pytorch/hook_module/wrap_vf.py +0 -60
- msprobe/pytorch/monitor/anomaly_analyse.py +0 -201
- msprobe/pytorch/monitor/anomaly_detect.py +0 -410
- msprobe/pytorch/monitor/module_spec_verifier.py +0 -95
- msprobe/pytorch/monitor/unittest/test_monitor.py +0 -160
- msprobe/pytorch/service.py +0 -470
- {mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/LICENSE +0 -0
- {mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/WHEEL +0 -0
- {mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/entry_points.txt +0 -0
- {mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/top_level.txt +0 -0
- /msprobe/{mindspore → core}/compare/ms_to_pt_api.yaml +0 -0
- /msprobe/{mindspore/dump → core}/kernel_dump/kernel_config.py +0 -0
- /msprobe/{pytorch/monitor/unittest → core/monitor}/__init__.py +0 -0
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
# dump.json文件说明及示例
|
|
2
2
|
|
|
3
|
-
## 1. dump.json
|
|
3
|
+
## 1. PyTorch 场景下的 dump.json 文件
|
|
4
4
|
|
|
5
|
-
### 1.1 L0级别
|
|
5
|
+
### 1.1 L0 级别
|
|
6
6
|
L0级别的dump.json文件包括模块的前反向的输入输出,以及模块的参数和参数梯度。以PyTorch的Conv2d模块为例,网络中模块调用代码为:
|
|
7
7
|
`output = self.conv2(input) # self.conv2 = torch.nn.Conv2d(64, 128, 5, padding=2, bias=True)`
|
|
8
8
|
|
|
@@ -168,7 +168,7 @@ dump.json文件中包含以下数据名称:
|
|
|
168
168
|
}
|
|
169
169
|
```
|
|
170
170
|
|
|
171
|
-
### 1.2 L1级别
|
|
171
|
+
### 1.2 L1 级别
|
|
172
172
|
L1级别的dump.json文件包括API的前反向的输入输出。以PyTorch的relu函数为例,网络中API调用代码为:
|
|
173
173
|
`output = torch.nn.functional.relu(input)`
|
|
174
174
|
|
|
@@ -264,13 +264,13 @@ dump.json文件中包含以下数据名称:
|
|
|
264
264
|
}
|
|
265
265
|
```
|
|
266
266
|
|
|
267
|
-
### 1.3 mix级别
|
|
267
|
+
### 1.3 mix 级别
|
|
268
268
|
|
|
269
269
|
mix级别的dump.json文件同时包括L0和L1级别的dump数据,文件格式与上述示例相同。
|
|
270
270
|
|
|
271
|
-
## 2. dump.json
|
|
271
|
+
## 2. MindSpore 场景下的 dump.json 文件
|
|
272
272
|
|
|
273
|
-
### 2.1 L0级别
|
|
273
|
+
### 2.1 L0 级别
|
|
274
274
|
|
|
275
275
|
L0级别的dump.json文件包括模块的前反向的输入输出,以及模块的参数和参数梯度。
|
|
276
276
|
以MindSpore的Conv2d模块为例,dump.json文件中使用的模块调用代码为:
|
|
@@ -429,7 +429,7 @@ dump.json文件中包含以下数据名称:
|
|
|
429
429
|
}
|
|
430
430
|
```
|
|
431
431
|
|
|
432
|
-
### 2.2 L1级别
|
|
432
|
+
### 2.2 L1 级别
|
|
433
433
|
L1级别的dump.json文件包括API的前反向的输入输出,以MindSpore的relu函数为例,网络中API调用代码为:
|
|
434
434
|
`output = mindspore.ops.relu(input)`
|
|
435
435
|
|
|
@@ -521,5 +521,275 @@ L1级别的dump.json文件包括API的前反向的输入输出,以MindSpore的
|
|
|
521
521
|
}
|
|
522
522
|
```
|
|
523
523
|
|
|
524
|
-
### 2.3 mix级别
|
|
524
|
+
### 2.3 mix 级别
|
|
525
|
+
|
|
525
526
|
mix级别的dump.json文件同时包括L0和L1级别的dump数据,文件格式与上述示例相同。
|
|
527
|
+
|
|
528
|
+
## 3. MSAdapter 场景下的 dump.json 文件
|
|
529
|
+
|
|
530
|
+
### 3.1 L0 级别
|
|
531
|
+
|
|
532
|
+
L0 级别的 dump.json 文件包括模块的前反向的输入输出,以及模块的参数和参数梯度。以 Conv2d 模块为例,网络中模块调用代码为:
|
|
533
|
+
`output = self.conv2(input) # self.conv2 = torch.nn.Conv2d(64, 128, 5, padding=2, bias=True)`
|
|
534
|
+
|
|
535
|
+
dump.json文件中包含以下数据名称:
|
|
536
|
+
|
|
537
|
+
- `Module.conv2.Conv2d.forward.0`:模块的前向数据,其中input_args为模块的输入数据(位置参数),input_kwargs为模块的输入数据(关键字参数),output为模块的输出数据,parameters为模块的参数数据,包括权重(weight)和偏置(bias)。
|
|
538
|
+
- `Module.conv2.Conv2d.parameters_grad`:模块的参数梯度数据,包括权重(weight)和偏置(bias)的梯度。
|
|
539
|
+
- `Module.conv2.Conv2d.backward.0`:模块的反向数据,其中input为模块反向的输入梯度(对应前向输出的梯度),output为模块的反向输出梯度(对应前向输入的梯度)。
|
|
540
|
+
|
|
541
|
+
**说明**:当dump时传入的model参数为List[torch.nn.Module]或Tuple[torch.nn.Module]时,模块级数据的命名中包含该模块在列表中的索引index,命名格式为`{Module}.{index}.*`,*表示以上三种模块级数据的命名格式,例如:`Module.0.conv1.Conv2d.forward.0`。
|
|
542
|
+
|
|
543
|
+
```json
|
|
544
|
+
{
|
|
545
|
+
"task": "tensor",
|
|
546
|
+
"level": "L0",
|
|
547
|
+
"framework": "mindtorch",
|
|
548
|
+
"dump_data_dir": "/dump/path",
|
|
549
|
+
"data": {
|
|
550
|
+
"Module.conv2.Conv2d.forward.0": {
|
|
551
|
+
"input_args": [
|
|
552
|
+
{
|
|
553
|
+
"type": "mindspore.Tensor",
|
|
554
|
+
"dtype": "Float32",
|
|
555
|
+
"shape": [
|
|
556
|
+
8,
|
|
557
|
+
16,
|
|
558
|
+
14,
|
|
559
|
+
14
|
|
560
|
+
],
|
|
561
|
+
"Max": 1.638758659362793,
|
|
562
|
+
"Min": 0.0,
|
|
563
|
+
"Mean": 0.2544615864753723,
|
|
564
|
+
"Norm": 70.50277709960938,
|
|
565
|
+
"requires_grad": true,
|
|
566
|
+
"data_name": "Module.conv2.Conv2d.forward.0.input.0.npy"
|
|
567
|
+
}
|
|
568
|
+
],
|
|
569
|
+
"input_kwargs": {},
|
|
570
|
+
"output": [
|
|
571
|
+
{
|
|
572
|
+
"type": "mindspore.Tensor",
|
|
573
|
+
"dtype": "Float32",
|
|
574
|
+
"shape": [
|
|
575
|
+
8,
|
|
576
|
+
32,
|
|
577
|
+
10,
|
|
578
|
+
10
|
|
579
|
+
],
|
|
580
|
+
"Max": 1.6815717220306396,
|
|
581
|
+
"Min": -1.5120246410369873,
|
|
582
|
+
"Mean": -0.025344856083393097,
|
|
583
|
+
"Norm": 149.65576171875,
|
|
584
|
+
"requires_grad": true,
|
|
585
|
+
"data_name": "Module.conv2.Conv2d.forward.0.output.0.npy"
|
|
586
|
+
}
|
|
587
|
+
],
|
|
588
|
+
"parameters": {
|
|
589
|
+
"weight": {
|
|
590
|
+
"type": "mindspore.Tensor",
|
|
591
|
+
"dtype": "Float32",
|
|
592
|
+
"shape": [
|
|
593
|
+
32,
|
|
594
|
+
16,
|
|
595
|
+
5,
|
|
596
|
+
5
|
|
597
|
+
],
|
|
598
|
+
"Max": 0.05992485210299492,
|
|
599
|
+
"Min": -0.05999220535159111,
|
|
600
|
+
"Mean": -0.0006165213999338448,
|
|
601
|
+
"Norm": 3.421217441558838,
|
|
602
|
+
"requires_grad": true,
|
|
603
|
+
"data_name": "Module.conv2.Conv2d.forward.0.parameters.weight.npy"
|
|
604
|
+
},
|
|
605
|
+
"bias": {
|
|
606
|
+
"type": "mindspore.Tensor",
|
|
607
|
+
"dtype": "Float32",
|
|
608
|
+
"shape": [
|
|
609
|
+
32
|
|
610
|
+
],
|
|
611
|
+
"Max": 0.05744686722755432,
|
|
612
|
+
"Min": -0.04894155263900757,
|
|
613
|
+
"Mean": 0.006410328671336174,
|
|
614
|
+
"Norm": 0.17263513803482056,
|
|
615
|
+
"requires_grad": true,
|
|
616
|
+
"data_name": "Module.conv2.Conv2d.forward.0.parameters.bias.npy"
|
|
617
|
+
}
|
|
618
|
+
}
|
|
619
|
+
},
|
|
620
|
+
"Module.conv2.Conv2d.parameters_grad": {
|
|
621
|
+
"weight": [
|
|
622
|
+
{
|
|
623
|
+
"type": "mindspore.Tensor",
|
|
624
|
+
"dtype": "Float32",
|
|
625
|
+
"shape": [
|
|
626
|
+
32,
|
|
627
|
+
16,
|
|
628
|
+
5,
|
|
629
|
+
5
|
|
630
|
+
],
|
|
631
|
+
"Max": 0.018550323322415352,
|
|
632
|
+
"Min": -0.008627401664853096,
|
|
633
|
+
"Mean": 0.0006675920449197292,
|
|
634
|
+
"Norm": 0.26084786653518677,
|
|
635
|
+
"requires_grad": false,
|
|
636
|
+
"data_name": "Module.conv2.Conv2d.parameters_grad.weight.npy"
|
|
637
|
+
}
|
|
638
|
+
],
|
|
639
|
+
"bias": [
|
|
640
|
+
{
|
|
641
|
+
"type": "mindspore.Tensor",
|
|
642
|
+
"dtype": "Float32",
|
|
643
|
+
"shape": [
|
|
644
|
+
32
|
|
645
|
+
],
|
|
646
|
+
"Max": 0.014914230443537235,
|
|
647
|
+
"Min": -0.006656786892563105,
|
|
648
|
+
"Mean": 0.002657240955159068,
|
|
649
|
+
"Norm": 0.029451673850417137,
|
|
650
|
+
"requires_grad": false,
|
|
651
|
+
"data_name": "Module.conv2.Conv2d.parameters_grad.bias.npy"
|
|
652
|
+
}
|
|
653
|
+
]
|
|
654
|
+
},
|
|
655
|
+
"Module.conv2.Conv2d.backward.0": {
|
|
656
|
+
"input": [
|
|
657
|
+
{
|
|
658
|
+
"type": "mindspore.Tensor",
|
|
659
|
+
"dtype": "Float32",
|
|
660
|
+
"shape": [
|
|
661
|
+
8,
|
|
662
|
+
32,
|
|
663
|
+
10,
|
|
664
|
+
10
|
|
665
|
+
],
|
|
666
|
+
"Max": 0.0015069986693561077,
|
|
667
|
+
"Min": -0.001139344065450132,
|
|
668
|
+
"Mean": 3.3215508210560074e-06,
|
|
669
|
+
"Norm": 0.020567523315548897,
|
|
670
|
+
"requires_grad": false,
|
|
671
|
+
"data_name": "Module.conv2.Conv2d.backward.0.input.0.npy"
|
|
672
|
+
}
|
|
673
|
+
],
|
|
674
|
+
"output": [
|
|
675
|
+
{
|
|
676
|
+
"type": "mindspore.Tensor",
|
|
677
|
+
"dtype": "Float32",
|
|
678
|
+
"shape": [
|
|
679
|
+
8,
|
|
680
|
+
16,
|
|
681
|
+
14,
|
|
682
|
+
14
|
|
683
|
+
],
|
|
684
|
+
"Max": 0.0007466732058674097,
|
|
685
|
+
"Min": -0.00044813455315306783,
|
|
686
|
+
"Mean": 6.814070275140693e-06,
|
|
687
|
+
"Norm": 0.01474067009985447,
|
|
688
|
+
"requires_grad": false,
|
|
689
|
+
"data_name": "Module.conv2.Conv2d.backward.0.output.0.npy"
|
|
690
|
+
}
|
|
691
|
+
]
|
|
692
|
+
}
|
|
693
|
+
}
|
|
694
|
+
}
|
|
695
|
+
```
|
|
696
|
+
|
|
697
|
+
### 3.2 L1 级别
|
|
698
|
+
L1级别的dump.json文件包括API的前反向的输入输出。以 relu API 为例,网络中 API 调用代码为:
|
|
699
|
+
`output = torch.nn.functional.relu(input)`
|
|
700
|
+
|
|
701
|
+
dump.json文件中包含以下数据名称:
|
|
702
|
+
- `Functional.relu.0.forward`:API的前向数据,其中input_args为API的输入数据(位置参数),input_kwargs为API的输入数据(关键字参数),output为API的输出数据。
|
|
703
|
+
- `Functional.relu.0.backward`:API的反向数据,其中input为API的反向输入梯度(对应前向输出的梯度),output为API的反向输出梯度(对应前向输入的梯度)。
|
|
704
|
+
|
|
705
|
+
```json
|
|
706
|
+
{
|
|
707
|
+
"task": "tensor",
|
|
708
|
+
"level": "L1",
|
|
709
|
+
"framework": "mindtorch",
|
|
710
|
+
"dump_data_dir":"/dump/path",
|
|
711
|
+
"data": {
|
|
712
|
+
"Functional.relu.0.forward": {
|
|
713
|
+
"input_args": [
|
|
714
|
+
{
|
|
715
|
+
"type": "mindspore.Tensor",
|
|
716
|
+
"dtype": "Float32",
|
|
717
|
+
"shape": [
|
|
718
|
+
32,
|
|
719
|
+
16,
|
|
720
|
+
28,
|
|
721
|
+
28
|
|
722
|
+
],
|
|
723
|
+
"Max": 1.3864083290100098,
|
|
724
|
+
"Min": -1.3364859819412231,
|
|
725
|
+
"Mean": 0.03711778670549393,
|
|
726
|
+
"Norm": 236.20692443847656,
|
|
727
|
+
"requires_grad": true,
|
|
728
|
+
"data_name": "Functional.relu.0.forward.input.0.npy"
|
|
729
|
+
}
|
|
730
|
+
],
|
|
731
|
+
"input_kwargs": {},
|
|
732
|
+
"output": [
|
|
733
|
+
{
|
|
734
|
+
"type": "mindspore.Tensor",
|
|
735
|
+
"dtype": "Float32",
|
|
736
|
+
"shape": [
|
|
737
|
+
32,
|
|
738
|
+
16,
|
|
739
|
+
28,
|
|
740
|
+
28
|
|
741
|
+
],
|
|
742
|
+
"Max": 1.3864083290100098,
|
|
743
|
+
"Min": 0.0,
|
|
744
|
+
"Mean": 0.16849493980407715,
|
|
745
|
+
"Norm": 175.23345947265625,
|
|
746
|
+
"requires_grad": true,
|
|
747
|
+
"data_name": "Functional.relu.0.forward.output.0.npy"
|
|
748
|
+
}
|
|
749
|
+
]
|
|
750
|
+
},
|
|
751
|
+
"Functional.relu.0.backward": {
|
|
752
|
+
"input": [
|
|
753
|
+
{
|
|
754
|
+
"type": "mindspore.Tensor",
|
|
755
|
+
"dtype": "Float32",
|
|
756
|
+
"shape": [
|
|
757
|
+
32,
|
|
758
|
+
16,
|
|
759
|
+
28,
|
|
760
|
+
28
|
|
761
|
+
],
|
|
762
|
+
"Max": 0.0001815402356442064,
|
|
763
|
+
"Min": -0.00013352684618439525,
|
|
764
|
+
"Mean": 0.00011915402356442064,
|
|
765
|
+
"Norm": 0.007598237134516239,
|
|
766
|
+
"requires_grad": false,
|
|
767
|
+
"data_name": "Functional.relu.0.backward.input.0.npy"
|
|
768
|
+
}
|
|
769
|
+
],
|
|
770
|
+
"output": [
|
|
771
|
+
{
|
|
772
|
+
"type": "mindspore.Tensor",
|
|
773
|
+
"dtype": "Float32",
|
|
774
|
+
"shape": [
|
|
775
|
+
32,
|
|
776
|
+
16,
|
|
777
|
+
28,
|
|
778
|
+
28
|
|
779
|
+
],
|
|
780
|
+
"Max": 0.0001815402356442064,
|
|
781
|
+
"Min": -0.00012117840378778055,
|
|
782
|
+
"Mean": 2.0098118724831693e-08,
|
|
783
|
+
"Norm": 0.006532244384288788,
|
|
784
|
+
"requires_grad": false,
|
|
785
|
+
"data_name": "Functional.relu.0.backward.output.0.npy"
|
|
786
|
+
}
|
|
787
|
+
]
|
|
788
|
+
}
|
|
789
|
+
}
|
|
790
|
+
}
|
|
791
|
+
```
|
|
792
|
+
|
|
793
|
+
### 3.3 mix 级别
|
|
794
|
+
|
|
795
|
+
mix级别的dump.json文件同时包括L0和L1级别的dump数据,文件格式与上述示例相同。
|
|
@@ -1,28 +1,33 @@
|
|
|
1
|
-
# 单点保存工具
|
|
1
|
+
# 单点保存工具
|
|
2
2
|
|
|
3
3
|
## 简介
|
|
4
|
-
L0, L1, mix
|
|
4
|
+
L0, L1, mix级别的dump能力存在盲区,网络中的非API或module的输入输出不会被批量dump下来。单点保存提供类似np.save和print的功能和使用体验,可以保存指定的变量。同时针对大模型场景进行了增强,具备以下特性:
|
|
5
5
|
- 可保存变量的反向梯度结果。
|
|
6
6
|
- 能直接保存嵌套结构数据(如 list、dict),无需手动遍历。
|
|
7
|
-
- 自动分
|
|
7
|
+
- 自动分 Rank 保存。
|
|
8
|
+
- 可分 Step 保存数据。
|
|
8
9
|
- 多次调用时会自动计数。
|
|
9
|
-
-
|
|
10
|
+
- 可配置保存统计值(MindSpore静态图暂不支持)或者张量。
|
|
11
|
+
- 支持异步保存。
|
|
10
12
|
|
|
11
13
|
## 支持场景
|
|
12
|
-
仅支持 PyTorch 与 MindSpore 的动态图场景。
|
|
13
14
|
|
|
14
|
-
##
|
|
15
|
+
## 动态图场景(PyTorch&MindSpore)
|
|
15
16
|
|
|
16
|
-
###
|
|
17
|
+
### 使能方式
|
|
17
18
|
|
|
18
|
-
|
|
19
|
+
#### 配置文件说明
|
|
20
|
+
|
|
21
|
+
通用配置 (细节详见[通用配置说明](./02.config_introduction.md#11-通用配置) ):
|
|
19
22
|
|
|
20
23
|
| 参数 | 解释 | 是否必选 |
|
|
21
24
|
| -------- |-------------------------------------------| -------- |
|
|
22
25
|
| task | dump 的任务类型,str 类型。 单点保存场景仅支持传入"statistics", "tensor"。 | 是 |
|
|
23
26
|
| level | dump 级别,str 类型,根据不同级别采集不同数据。单点保存场景传入"debug"。 | 是 |
|
|
24
|
-
| dump_path | 设置 dump 数据目录路径,str
|
|
25
|
-
| rank | 指定对某张卡上的数据进行采集,list[Union[int, str]]
|
|
27
|
+
| dump_path | 设置 dump 数据目录路径,str 类型。 | 是 |
|
|
28
|
+
| rank | 指定对某张卡上的数据进行采集,list[Union[int, str]] 类型。 | 否 |
|
|
29
|
+
| step | 指定采集某个 Step 的数据,list[Union[int, str]] 类型。 | 否 |
|
|
30
|
+
| async_dump | 异步 dump 开关,bool 类型。 | 否 |
|
|
26
31
|
|
|
27
32
|
"statistics" 任务子配置项:
|
|
28
33
|
| 参数 | 解释 | 是否必选 |
|
|
@@ -31,19 +36,21 @@ L0, L1, mix dump存在盲区,网络中的非api/module的输入输出不会被
|
|
|
31
36
|
|
|
32
37
|
"tensor" 任务无子配置项。
|
|
33
38
|
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
调用PrecisionDebugger.save,传入需要保存的变量,指定变量名称以及是否需要保存反向数据。接口入参说明详见[pytorch单点保存接口](./05.data_dump_PyTorch.md#19-save),[mindspore单点保存接口](./06.data_dump_MindSpore.md#615-save)
|
|
39
|
+
#### 接口调用说明
|
|
37
40
|
|
|
38
|
-
|
|
41
|
+
调用PrecisionDebugger.save,传入需要保存的变量,指定变量名称以及是否需要保存反向数据。接口入参说明详见[PyTorch单点保存接口](./05.data_dump_PyTorch.md#19-save),[MindSpore单点保存接口](./06.data_dump_MindSpore.md#615-save)。
|
|
39
42
|
|
|
43
|
+
#### 实例
|
|
44
|
+
(以PyTorch场景为例,MindSpore场景只需要从msprobe.mindspore模块导包即可)
|
|
40
45
|
配置文件
|
|
41
46
|
```json
|
|
42
47
|
{
|
|
43
48
|
"task": "statistics",
|
|
44
49
|
"dump_path": "./dump_path",
|
|
45
50
|
"rank": [],
|
|
51
|
+
"step": [],
|
|
46
52
|
"level": "debug",
|
|
53
|
+
"async_dump": false,
|
|
47
54
|
"statistics": {
|
|
48
55
|
"summary_mode": "statistics"
|
|
49
56
|
}
|
|
@@ -53,7 +60,7 @@ L0, L1, mix dump存在盲区,网络中的非api/module的输入输出不会被
|
|
|
53
60
|
初始化
|
|
54
61
|
```python
|
|
55
62
|
# 训练启动py脚本
|
|
56
|
-
from
|
|
63
|
+
from msprobe.pytorch import PrecisionDebugger
|
|
57
64
|
debugger = PrecisionDebugger("./config.json")
|
|
58
65
|
for data, label in data_loader:
|
|
59
66
|
# 执行模型训练
|
|
@@ -64,7 +71,7 @@ for data, label in data_loader:
|
|
|
64
71
|
初始化(无配置文件)
|
|
65
72
|
```python
|
|
66
73
|
# 训练启动py脚本
|
|
67
|
-
from
|
|
74
|
+
from msprobe.pytorch import PrecisionDebugger
|
|
68
75
|
debugger = PrecisionDebugger(dump_path="dump_path", level="debug")
|
|
69
76
|
for data, label in data_loader:
|
|
70
77
|
# 执行模型训练
|
|
@@ -75,20 +82,104 @@ for data, label in data_loader:
|
|
|
75
82
|
调用保存接口
|
|
76
83
|
```python
|
|
77
84
|
# 训练过程中被调用py文件
|
|
78
|
-
from
|
|
85
|
+
from msprobe.pytorch import PrecisionDebugger
|
|
79
86
|
dict_variable = {"key1": "value1", "key2": [1, 2]}
|
|
80
87
|
PrecisionDebugger.save(dict_variable, "dict_variable", save_backward=False)
|
|
81
88
|
|
|
82
89
|
```
|
|
83
90
|
|
|
91
|
+
## 静态图场景(MindSpore)
|
|
92
|
+
|
|
93
|
+
### 使能方式
|
|
94
|
+
|
|
95
|
+
#### 接口调用说明
|
|
96
|
+
工具提供两个对外接口`save`和`save_grad`,分别用于保存训练中的tensor以及tensor对应的反向数据。
|
|
97
|
+
| 接口名称 | 入参 | device | MindSpore版本 |备注 |
|
|
98
|
+
| ------- | ------ | -------------- | --------------|--------------------------------------------------- |
|
|
99
|
+
| save | save_dir, name, data | Ascend | >= 2.6.0 | (主流场景)图模式下只支持Ascend,pynative下支持Ascend/GPU/CPU。 |
|
|
100
|
+
| save_grad | save_dir, name, data | Ascend | >= 2.6.0 | (主流场景)图模式下只支持Ascend,pynative下支持Ascend/GPU/CPU。 |
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
----
|
|
104
|
+
> 函数原型:
|
|
105
|
+
`save(save_dir:str, name:str, data)`
|
|
106
|
+
- save_dir:表示要保存的目录。
|
|
107
|
+
- name :表示要保存的文件标志名称。
|
|
108
|
+
- data :表示数据入参,可以是`mindspore.Tensor`或者是`List`,`Tuple`,`Dict`等嵌套结构。
|
|
109
|
+
|
|
110
|
+
> 函数原型:
|
|
111
|
+
`save_grad(save_dir:str, name:str, data)`
|
|
112
|
+
- save_dir:表示要保存的目录。
|
|
113
|
+
- name :表示要保存的文件标志名称。
|
|
114
|
+
- data :表示数据入参,**只能**是`mindspore.Tensor`。
|
|
115
|
+
|
|
116
|
+
#### 实例
|
|
117
|
+
|
|
118
|
+
- save接口使用:
|
|
119
|
+
|
|
120
|
+
```python
|
|
121
|
+
# save api usage
|
|
122
|
+
# **first import**
|
|
123
|
+
from msprobe.mindspore import save
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
class Net(nn.Cell):
|
|
127
|
+
def construct(self, x, y, z):
|
|
128
|
+
# **use save api**
|
|
129
|
+
save("./test_dump", 'x', x)
|
|
130
|
+
return x * y * z
|
|
131
|
+
|
|
132
|
+
x = Tensor([1, 2], ms.float32)
|
|
133
|
+
y = Tensor([-2, 3], ms.float32)
|
|
134
|
+
z = Tensor([0, 3], ms.float32)
|
|
135
|
+
net = Net()
|
|
136
|
+
output = grad(net, grad_position=(1, 2))(x, y, z)
|
|
137
|
+
time.sleep(1)
|
|
138
|
+
|
|
139
|
+
# this will then generate **./test_dump/step0/rank0/x_float32_0.npy**
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
- save_grad接口使用:
|
|
143
|
+
|
|
144
|
+
```python
|
|
145
|
+
# save_grad usage
|
|
146
|
+
# **first import**
|
|
147
|
+
from msprobe.mindspore import save_grad
|
|
148
|
+
class Net(nn.Cell):
|
|
149
|
+
def construct(self, x, y, z):
|
|
150
|
+
# **use save_grad api** the return value of save_grad must be assigned back to the original tensor
|
|
151
|
+
z = save_grad("./test_dump", 'z', z)
|
|
152
|
+
return x * y * z
|
|
153
|
+
|
|
154
|
+
x = Tensor([1, 2], ms.float32)
|
|
155
|
+
y = Tensor([-2, 3], ms.float32)
|
|
156
|
+
z = Tensor([0, 3], ms.float32)
|
|
157
|
+
net = Net()
|
|
158
|
+
output = grad(net, grad_position=(1, 2))(x, y, z)
|
|
159
|
+
time.sleep(1)
|
|
160
|
+
|
|
161
|
+
# this will then generate **./test_dump/step0/rank0/z_grad_float32_0.npy**
|
|
162
|
+
```
|
|
163
|
+
**注意**:save_grad需要将返回值回传给原tensor,此操作不会有精度影响,只会传递原值。
|
|
164
|
+
|
|
165
|
+
|
|
84
166
|
## 输出结果
|
|
167
|
+
### 动态图场景(PyTorch&MindSpore)
|
|
85
168
|
* **"task" 配置为 "statistics" 场景** :在 dump 目录下会生成包含变量统计值信息的 `debug.json` 文件。
|
|
86
|
-
|
|
169
|
+
`debug.json` 中统计值的key命名格式为 `{variable_name}{grad_flag}.{count}.debug`。
|
|
170
|
+
* **"task" 配置为 "tensor" 场景** :除了在 dump 目录下生成包含变量统计值信息的 `debug.json` 文件外,还会在 dump 子目录 `dump_tensor_data` 中保存张量二进制文件,文件名称格式为 `{variable_name}{grad_flag}.{count}.debug.{indexes}.{file_suffix}`。
|
|
87
171
|
|
|
88
172
|
- variable_name: 传入save接口的变量名称。
|
|
89
173
|
- grad_flag: 反向数据标识,反向数据为"_grad",正向数据为""。
|
|
90
174
|
- count: 调用计数,多次以相同变量名称调用时的计数。
|
|
91
|
-
- indexes: 索引,在保存嵌套结构数据时的索引。例如:嵌套结构为`{"key1": "value1", "key2": ["value2", "value3"]}`,"value2"的索引为"key2.0"
|
|
92
|
-
- file_suffix:文件后缀,
|
|
175
|
+
- indexes: 索引,在保存嵌套结构数据时的索引。例如:嵌套结构为`{"key1": "value1", "key2": ["value2", "value3"]}`,"value2"的索引为"key2.0"。
|
|
176
|
+
- file_suffix:文件后缀,PyTorch场景为"pt",MindSpore场景为"npy"。
|
|
177
|
+
|
|
178
|
+
### 静态图场景(MindSpore)
|
|
179
|
+
在指定目录`save_dir`下生成`{step}/{rank}`目录,目录下生成指定`{name}`的npy文件,如果是save_grad接口调用,则会生成`{name}_grad`的npy文件。
|
|
180
|
+
|
|
181
|
+
如`save("./test_dump", 'x', x)` -> `./test_dump/step0/rank0/x_float32_0.npy`。
|
|
182
|
+
|
|
183
|
+
或如`z = save_grad("./test_dump", 'z', z)` -> `./test_dump/step0/rank0/z_grad_float32_0.npy`。
|
|
93
184
|
|
|
94
185
|
|