PyPI - mindstudio-probe - Versions diffs - 1.0.4__py3-none-any.whl → 1.1.1__py3-none-any.whl - Mend

mindstudio-probe 1.0.4py3-none-any.whl → 1.1.1py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (278) hide show

{mindstudio_probe-1.0.4.dist-info → mindstudio_probe-1.1.1.dist-info}/METADATA +5 -5
mindstudio_probe-1.1.1.dist-info/RECORD +341 -0
{mindstudio_probe-1.0.4.dist-info → mindstudio_probe-1.1.1.dist-info}/WHEEL +1 -1
{mindstudio_probe-1.0.4.dist-info → mindstudio_probe-1.1.1.dist-info}/entry_points.txt +0 -1
msprobe/README.md +84 -18
msprobe/__init__.py +16 -1
msprobe/config.json +1 -5
msprobe/core/advisor/advisor.py +16 -11
msprobe/core/advisor/advisor_const.py +6 -7
msprobe/core/advisor/advisor_result.py +12 -12
msprobe/core/common/const.py +164 -3
msprobe/core/common/exceptions.py +26 -4
msprobe/core/common/file_utils.py +196 -27
msprobe/core/common/inplace_op_checker.py +53 -0
msprobe/core/common/inplace_ops.yaml +251 -0
msprobe/core/common/log.py +46 -18
msprobe/core/common/utils.py +308 -209
msprobe/core/common_config.py +60 -38
msprobe/core/compare/acc_compare.py +332 -94
msprobe/core/compare/check.py +104 -22
msprobe/core/compare/compare_cli.py +42 -5
msprobe/core/compare/highlight.py +162 -57
msprobe/core/compare/layer_mapping/__init__.py +19 -0
msprobe/core/compare/layer_mapping/data_scope_parser.py +235 -0
msprobe/core/compare/layer_mapping/layer_mapping.py +242 -0
msprobe/core/compare/layer_mapping/postprocess_pass.py +94 -0
msprobe/core/compare/multiprocessing_compute.py +33 -8
msprobe/core/compare/npy_compare.py +73 -29
msprobe/core/compare/utils.py +306 -247
msprobe/core/data_dump/data_collector.py +44 -43
msprobe/core/data_dump/data_processor/base.py +88 -35
msprobe/core/data_dump/data_processor/factory.py +20 -3
msprobe/core/data_dump/data_processor/mindspore_processor.py +14 -8
msprobe/core/data_dump/data_processor/pytorch_processor.py +180 -66
msprobe/core/data_dump/json_writer.py +63 -42
msprobe/core/data_dump/scope.py +143 -48
msprobe/core/grad_probe/constant.py +31 -13
msprobe/core/grad_probe/grad_compare.py +20 -4
msprobe/core/grad_probe/utils.py +44 -3
msprobe/core/overflow_check/abnormal_scene.py +185 -0
msprobe/core/overflow_check/api_info.py +55 -0
msprobe/core/overflow_check/checker.py +138 -0
msprobe/core/overflow_check/filter.py +157 -0
msprobe/core/overflow_check/ignore_rules.yaml +55 -0
msprobe/core/overflow_check/level.py +22 -0
msprobe/core/overflow_check/utils.py +28 -0
msprobe/docs/01.installation.md +29 -9
msprobe/docs/02.config_introduction.md +83 -84
msprobe/docs/03.config_examples.md +3 -20
msprobe/docs/04.kernel_dump_PyTorch.md +73 -0
msprobe/docs/05.data_dump_PyTorch.md +143 -13
msprobe/docs/06.data_dump_MindSpore.md +197 -88
msprobe/docs/07.accuracy_checker_PyTorch.md +69 -46
msprobe/docs/08.accuracy_checker_online_PyTorch.md +52 -17
msprobe/docs/09.accuracy_checker_MindSpore.md +51 -15
msprobe/docs/10.accuracy_compare_PyTorch.md +187 -99
msprobe/docs/11.accuracy_compare_MindSpore.md +253 -31
msprobe/docs/12.overflow_check_PyTorch.md +1 -1
msprobe/docs/13.overflow_check_MindSpore.md +6 -6
msprobe/docs/15.free_benchmarking_PyTorch.md +60 -55
msprobe/docs/16.free_benchmarking_MindSpore.md +159 -0
msprobe/docs/17.grad_probe.md +19 -22
msprobe/docs/18.online_dispatch.md +89 -0
msprobe/docs/19.monitor.md +468 -0
msprobe/docs/20.monitor_performance_baseline.md +52 -0
msprobe/docs/21.visualization_PyTorch.md +386 -0
msprobe/docs/22.visualization_MindSpore.md +384 -0
msprobe/docs/23.tool_function_introduction.md +28 -0
msprobe/docs/{FAQ_PyTorch.md → FAQ.md} +25 -10
msprobe/docs/data_dump_Mindspore/dynamic_graph_quick_start_example.md +211 -0
msprobe/docs/img/compare_result.png +0 -0
msprobe/docs/img/monitor/cpu_info.png +0 -0
msprobe/docs/img/ms_dump.png +0 -0
msprobe/docs/img/ms_layer.png +0 -0
msprobe/docs/img/pt_dump.png +0 -0
msprobe/mindspore/__init__.py +16 -0
msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +130 -138
msprobe/mindspore/api_accuracy_checker/api_info.py +27 -5
msprobe/mindspore/api_accuracy_checker/api_runner.py +43 -18
msprobe/mindspore/api_accuracy_checker/base_compare_algorithm.py +21 -7
msprobe/mindspore/api_accuracy_checker/checker_support_api.yaml +77 -0
msprobe/mindspore/api_accuracy_checker/cmd_parser.py +63 -1
msprobe/mindspore/api_accuracy_checker/compute_element.py +59 -24
msprobe/mindspore/api_accuracy_checker/data_manager.py +264 -0
msprobe/mindspore/api_accuracy_checker/main.py +27 -3
msprobe/mindspore/api_accuracy_checker/multi_api_accuracy_checker.py +206 -0
msprobe/mindspore/api_accuracy_checker/multi_data_manager.py +58 -0
msprobe/mindspore/api_accuracy_checker/type_mapping.py +22 -5
msprobe/mindspore/api_accuracy_checker/utils.py +34 -17
msprobe/mindspore/cell_processor.py +58 -13
msprobe/mindspore/common/const.py +35 -13
msprobe/mindspore/common/log.py +5 -9
msprobe/mindspore/common/utils.py +60 -5
msprobe/mindspore/compare/distributed_compare.py +15 -28
msprobe/mindspore/compare/ms_compare.py +319 -158
msprobe/mindspore/compare/ms_graph_compare.py +99 -49
msprobe/mindspore/debugger/debugger_config.py +20 -14
msprobe/mindspore/debugger/precision_debugger.py +43 -13
msprobe/mindspore/dump/dump_tool_factory.py +18 -1
msprobe/mindspore/dump/hook_cell/api_registry.py +23 -3
msprobe/mindspore/dump/hook_cell/primitive_hooks.py +203 -0
msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml +107 -10
msprobe/mindspore/dump/hook_cell/wrap_api.py +21 -13
msprobe/mindspore/dump/jit_dump.py +56 -20
msprobe/mindspore/dump/kernel_graph_dump.py +19 -5
msprobe/mindspore/dump/kernel_kbyk_dump.py +19 -6
msprobe/mindspore/dym_loader/hook_dynamic_loader.cc +140 -0
msprobe/mindspore/dym_loader/hook_dynamic_loader.h +53 -0
msprobe/mindspore/free_benchmark/api_pynative_self_check.py +162 -41
msprobe/mindspore/free_benchmark/common/config.py +15 -0
msprobe/mindspore/free_benchmark/common/handler_params.py +15 -1
msprobe/mindspore/free_benchmark/common/utils.py +37 -8
msprobe/mindspore/free_benchmark/data/support_wrap_ops.yaml +0 -204
msprobe/mindspore/free_benchmark/handler/base_handler.py +20 -5
msprobe/mindspore/free_benchmark/handler/check_handler.py +21 -7
msprobe/mindspore/free_benchmark/handler/fix_handler.py +18 -3
msprobe/mindspore/free_benchmark/handler/handler_factory.py +21 -6
msprobe/mindspore/free_benchmark/perturbation/add_noise.py +23 -8
msprobe/mindspore/free_benchmark/perturbation/base_perturbation.py +29 -5
msprobe/mindspore/free_benchmark/perturbation/bit_noise.py +25 -10
msprobe/mindspore/free_benchmark/perturbation/exchange_value.py +45 -19
msprobe/mindspore/free_benchmark/perturbation/improve_precision.py +29 -8
msprobe/mindspore/free_benchmark/perturbation/no_change.py +16 -1
msprobe/mindspore/free_benchmark/perturbation/perturbation_factory.py +22 -7
msprobe/mindspore/free_benchmark/self_check_tool_factory.py +17 -2
msprobe/mindspore/grad_probe/global_context.py +44 -14
msprobe/mindspore/grad_probe/grad_analyzer.py +27 -13
msprobe/mindspore/grad_probe/grad_monitor.py +16 -1
msprobe/mindspore/grad_probe/grad_stat_csv.py +33 -5
msprobe/mindspore/grad_probe/hook.py +24 -10
msprobe/mindspore/grad_probe/utils.py +18 -5
msprobe/mindspore/ms_config.py +22 -15
msprobe/mindspore/overflow_check/kernel_graph_overflow_check.py +20 -6
msprobe/mindspore/overflow_check/overflow_check_tool_factory.py +15 -0
msprobe/mindspore/runtime.py +15 -0
msprobe/mindspore/service.py +75 -150
msprobe/mindspore/task_handler_factory.py +15 -0
msprobe/msprobe.py +24 -7
msprobe/pytorch/__init__.py +23 -3
msprobe/pytorch/api_accuracy_checker/common/config.py +81 -2
msprobe/pytorch/api_accuracy_checker/common/utils.py +53 -21
msprobe/pytorch/api_accuracy_checker/compare/algorithm.py +19 -2
msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +50 -25
msprobe/pytorch/api_accuracy_checker/compare/compare.py +51 -21
msprobe/pytorch/api_accuracy_checker/compare/compare_column.py +23 -6
msprobe/pytorch/api_accuracy_checker/compare/compare_utils.py +28 -8
msprobe/pytorch/api_accuracy_checker/config.yaml +1 -1
msprobe/pytorch/api_accuracy_checker/generate_op_script/config_op.json +9 -0
msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +454 -0
msprobe/pytorch/api_accuracy_checker/generate_op_script/operator_replication.template +365 -0
msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py +73 -33
msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +44 -18
msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +32 -11
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +122 -172
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py +158 -4
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py +30 -24
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +68 -31
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +27 -4
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/dump_dispatch.py +115 -0
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +26 -9
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/torch_ops_config.yaml +63 -0
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/utils.py +44 -0
msprobe/pytorch/bench_functions/__init__.py +18 -3
msprobe/pytorch/bench_functions/apply_adam_w.py +15 -0
msprobe/pytorch/bench_functions/confusion_transpose.py +20 -1
msprobe/pytorch/bench_functions/fast_gelu.py +15 -0
msprobe/pytorch/bench_functions/layer_norm_eval.py +15 -0
msprobe/pytorch/bench_functions/linear.py +15 -0
msprobe/pytorch/bench_functions/matmul_backward.py +33 -6
msprobe/pytorch/bench_functions/npu_fusion_attention.py +280 -157
msprobe/pytorch/bench_functions/rms_norm.py +15 -0
msprobe/pytorch/bench_functions/rotary_mul.py +32 -9
msprobe/pytorch/bench_functions/scaled_mask_softmax.py +15 -0
msprobe/pytorch/bench_functions/swiglu.py +29 -6
msprobe/pytorch/common/__init__.py +15 -0
msprobe/pytorch/common/log.py +18 -6
msprobe/pytorch/common/parse_json.py +31 -16
msprobe/pytorch/common/utils.py +96 -40
msprobe/pytorch/compare/distributed_compare.py +13 -14
msprobe/pytorch/compare/match.py +15 -0
msprobe/pytorch/compare/pt_compare.py +44 -10
msprobe/pytorch/debugger/debugger_config.py +69 -52
msprobe/pytorch/debugger/precision_debugger.py +72 -24
msprobe/pytorch/dump/kernel_dump/kernel_config.py +33 -0
msprobe/pytorch/free_benchmark/__init__.py +20 -5
msprobe/pytorch/free_benchmark/common/constant.py +15 -0
msprobe/pytorch/free_benchmark/common/counter.py +15 -0
msprobe/pytorch/free_benchmark/common/enums.py +43 -0
msprobe/pytorch/free_benchmark/common/params.py +23 -1
msprobe/pytorch/free_benchmark/common/utils.py +43 -5
msprobe/pytorch/free_benchmark/compare/grad_saver.py +47 -9
msprobe/pytorch/free_benchmark/compare/single_benchmark.py +17 -0
msprobe/pytorch/free_benchmark/main.py +19 -4
msprobe/pytorch/free_benchmark/perturbed_layers/base_layer.py +15 -0
msprobe/pytorch/free_benchmark/perturbed_layers/layer_factory.py +19 -4
msprobe/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py +18 -1
msprobe/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py +21 -4
msprobe/pytorch/free_benchmark/perturbed_layers/npu/change_value.py +28 -2
msprobe/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py +19 -0
msprobe/pytorch/free_benchmark/perturbed_layers/npu/no_change.py +15 -0
msprobe/pytorch/free_benchmark/perturbed_layers/npu/npu_base_layser.py +15 -0
msprobe/pytorch/free_benchmark/perturbed_layers/run_cpu.py +15 -0
msprobe/pytorch/free_benchmark/result_handlers/base_handler.py +65 -16
msprobe/pytorch/free_benchmark/result_handlers/check_handler.py +15 -0
msprobe/pytorch/free_benchmark/result_handlers/fix_handler.py +21 -5
msprobe/pytorch/free_benchmark/result_handlers/handler_factory.py +15 -0
msprobe/pytorch/free_benchmark/result_handlers/preheat_handler.py +19 -4
msprobe/pytorch/function_factory.py +17 -2
msprobe/pytorch/functional/module_dump.py +84 -0
msprobe/pytorch/grad_probe/grad_monitor.py +23 -6
msprobe/pytorch/grad_probe/grad_stat_csv.py +40 -10
msprobe/pytorch/hook_module/__init__.py +16 -1
msprobe/pytorch/hook_module/api_registry.py +13 -8
msprobe/pytorch/hook_module/hook_module.py +17 -19
msprobe/pytorch/hook_module/support_wrap_ops.yaml +1 -0
msprobe/pytorch/hook_module/utils.py +4 -6
msprobe/pytorch/hook_module/wrap_aten.py +12 -11
msprobe/pytorch/hook_module/wrap_distributed.py +6 -7
msprobe/pytorch/hook_module/wrap_functional.py +21 -20
msprobe/pytorch/hook_module/wrap_npu_custom.py +9 -17
msprobe/pytorch/hook_module/wrap_tensor.py +4 -6
msprobe/pytorch/hook_module/wrap_torch.py +4 -6
msprobe/pytorch/hook_module/wrap_vf.py +4 -6
msprobe/pytorch/module_processer.py +18 -6
msprobe/pytorch/monitor/anomaly_analyse.py +201 -0
msprobe/pytorch/monitor/anomaly_detect.py +340 -0
msprobe/pytorch/monitor/distributed/distributed_ops.yaml +19 -0
msprobe/pytorch/monitor/distributed/stack_blacklist.yaml +5 -0
msprobe/pytorch/monitor/distributed/wrap_distributed.py +272 -0
msprobe/pytorch/monitor/features.py +108 -0
msprobe/pytorch/monitor/module_hook.py +870 -0
msprobe/pytorch/monitor/module_metric.py +193 -0
msprobe/pytorch/monitor/module_spec_verifier.py +93 -0
msprobe/pytorch/monitor/optimizer_collect.py +295 -0
msprobe/pytorch/monitor/unittest/__init__.py +0 -0
msprobe/pytorch/monitor/unittest/test_monitor.py +145 -0
msprobe/pytorch/monitor/utils.py +250 -0
msprobe/pytorch/monitor/visualizer.py +59 -0
msprobe/pytorch/online_dispatch/__init__.py +2 -3
msprobe/pytorch/online_dispatch/compare.py +38 -48
msprobe/pytorch/online_dispatch/dispatch.py +50 -25
msprobe/pytorch/online_dispatch/dump_compare.py +21 -9
msprobe/pytorch/online_dispatch/single_compare.py +60 -39
msprobe/pytorch/online_dispatch/torch_ops_config.yaml +9 -1
msprobe/pytorch/online_dispatch/utils.py +48 -23
msprobe/pytorch/parse.py +15 -0
msprobe/pytorch/parse_tool/cli.py +5 -6
msprobe/pytorch/parse_tool/lib/compare.py +19 -26
msprobe/pytorch/parse_tool/lib/config.py +1 -1
msprobe/pytorch/parse_tool/lib/parse_tool.py +4 -2
msprobe/pytorch/parse_tool/lib/utils.py +40 -55
msprobe/pytorch/parse_tool/lib/visualization.py +3 -1
msprobe/pytorch/pt_config.py +192 -40
msprobe/pytorch/service.py +110 -35
msprobe/visualization/__init__.py +14 -0
msprobe/visualization/builder/__init__.py +14 -0
msprobe/visualization/builder/graph_builder.py +165 -0
msprobe/visualization/builder/msprobe_adapter.py +205 -0
msprobe/visualization/compare/__init__.py +14 -0
msprobe/visualization/compare/graph_comparator.py +130 -0
msprobe/visualization/compare/mode_adapter.py +211 -0
msprobe/visualization/graph/__init__.py +14 -0
msprobe/visualization/graph/base_node.py +124 -0
msprobe/visualization/graph/graph.py +200 -0
msprobe/visualization/graph/node_colors.py +95 -0
msprobe/visualization/graph/node_op.py +39 -0
msprobe/visualization/graph_service.py +214 -0
msprobe/visualization/utils.py +232 -0
mindstudio_probe-1.0.4.dist-info/RECORD +0 -276
msprobe/docs/04.acl_config_examples.md +0 -76
msprobe/mindspore/free_benchmark/decorator/dec_forward.py +0 -43
msprobe/mindspore/free_benchmark/decorator/decorator_factory.py +0 -107
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/ssl_config.py +0 -10
msprobe/pytorch/functional/dump_module.py +0 -39
{mindstudio_probe-1.0.4.dist-info → mindstudio_probe-1.1.1.dist-info}/LICENSE +0 -0
{mindstudio_probe-1.0.4.dist-info → mindstudio_probe-1.1.1.dist-info}/top_level.txt +0 -0
/msprobe/{mindspore/free_benchmark/decorator → pytorch/monitor}/__init__.py +0 -0
/msprobe/pytorch/{functional/data_processor.py → monitor/distributed/__init__.py} +0 -0

msprobe/docs/22.visualization_MindSpore.md ADDED Viewed

@@ -0,0 +1,384 @@
+# MindSpore 场景的分级可视化构图比对
+分级可视化工具将msprobe工具dump的精度数据进行解析，还原模型图结构，实现模型各个层级的精度数据比对，方便用户理解模型结构、分析精度问题。
+工具支持MindSpore版本：2.4.0
+## 1.依赖安装
+分级可视化工具依赖**msprobe工具**和**tensorboard**。
+### 1.1 安装msprobe工具
+现阶段分级可视化工具还未集成在已发布的msprobe工具中，需要从源码安装，请参考从源码安装章节。
+[msprobe工具安装](https://gitee.com/ascend/mstt/blob/master/debug/accuracy_tools/msprobe/docs/01.installation.md)
+### 1.2 安装tb_graph_ascend
+**请安装tb_graph_ascend，否则无法解析构图结果。**
+[tb_graph_ascend下载](https://mindstudio-sample.obs.cn-north-4.myhuaweicloud.com/GRAPH_ASCEND/tb_graph_ascend-0.1.0-py3-none-any.whl)
+下载后使用``pip3 install``安装该whl包即可。
+## 2.模型结构数据采集
+[MindSpore场景的精度数据采集](https://gitee.com/ascend/mstt/blob/master/debug/accuracy_tools/msprobe/docs/06.data_dump_MindSpore.md)
+**仅支持动态图场景，需要选择level为L0（cell信息）或者mix（cell信息+api信息），才能采集到模型结构数据，即采集结果件construct.json内容不为空**。
+## 3.生成图结构文件
+### 3.1 构图命令行说明
+**命令示例如下**：
+```
+msprobe -f mindspore graph -i ./compare.json -o ./output
+```
+**命令行参数说明**：
+| 参数名               | 说明                                                                                                                                                            | 是否必选 |
+|-------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------| -------- |
+| -i 或 --input_path   | 指定比对文件，str 类型。                                                                                                                                                | 是       |
+| -o 或 --output_path  | 配置比对结果文件存盘目录，str 类型。文件名称基于时间戳自动生成，格式为：`compare_{timestamp}.vis`。                                                                                              | 是       |
+| -lm 或 --layer_mapping| 跨框架比对，MindSpore和PyTorch的比对场景。配置该参数时表示开启跨框架Layer层的比对功能，指定模型代码中的Layer层后，可以识别对应dump数据中的模块或API。需要指定自定义映射文件*.yaml。自定义映射文件的格式请参见[自定义映射文件（Layer）](#71-自定义映射文件layer)。 | 否    |
+**比对文件说明**：
+以在当前目录创建 ./compare.json 为例。
+```
+{
+"npu_path": "./npu_dump",
+"bench_path": "./bench_dump",
+"is_print_compare_log": true
+}
+```
+**比对文件参数说明**：
+| 参数名               | 说明                                                                                                    | 是否必选 |
+|-------------------|-------------------------------------------------------------------------------------------------------|------|
+| npu_path   | 指定待调试侧比对路径，str类型。工具根据路径格式自动进行单rank比对、多rank批量比对或多step批量比对，具体格式参考3.2 图构建和比对。           | 是    |
+| bench_path  | 指定标杆侧比对路径，str类型。单图构建场景可以不配置 | 否    |
+| is_print_compare_log  | 配置是否开启单个算子的日志打屏。可取值 true 或 false，默认为 true。关闭后则只输出常规日志，bool 类型。                                        | 否    |
+### 3.2 图构建和比对
+**如果只是想查看一个模型的结构，请选择单图构建**；
+**如果想比较两个模型的结构差异和精度数据差异，请选择双图比对**。
+#### 3.2.1 单图构建
+展示模型结构、精度数据、堆栈信息。
+**1. 准备比对文件**：
+以在当前目录创建 ./compare.json 为例。
+```
+{
+"npu_path": "./npu_dump",
+"is_print_compare_log": true
+}
+```
+npu_path格式：必须包含dump.json、stack.json和construct.json，且construct.json不能为空。如果construct.json为空，请检查dump的level参数是否已选择L0或者mix。
+```
+├── npu_path
+│   ├── dump_tensor_data（配置dump的task参数选择tensor时存在）
+|   |    ├── MintFunctional.relu.0.backward.input.0.npy
+|   |    ├── Mint.abs.0.forward.input.0.npy
+|   |    ...
+|   |    └── Cell.relu.ReLU.forward.0.input.0.npy
+|   ├── dump.json         # 数据信息
+|   ├── stack.json        # 调用栈信息
+|   └── construct.json    # 分层分级结构，level为L1时，construct.json内容为空
+```
+**2. 执行命令**：
+```
+msprobe -f mindspore graph -i ./compare.json -o ./output
+```
+#### 3.2.2 双图比对
+展示模型结构、结构差异、精度数据和精度比对指标、精度是否疑似有问题（精度比对指标差异越大颜色越深）。
+当前比对支持三种类型的dump数据，分级可视化工具比对时会自动判断：
+1.统计信息：仅dump了API和Module的输入输出数据统计信息，占用磁盘空间小；
+2.真实数据：不仅dump了API和Module的输入输出数据统计信息，还将tensor进行存盘，占用磁盘空间大，但比对更加准确；
+3.md5：dump了API和Module的输入输出数据统计信息和md5信息。
+dump类型如何配置见[数据采集配置文件介绍](https://gitee.com/ascend/mstt/blob/master/debug/accuracy_tools/msprobe/docs/02.config_introduction.md)
+**1. 准备比对文件**：
+以在当前目录创建 ./compare.json 为例。
+```
+{
+"npu_path": "./npu_dump",
+"bench_path": "./bench_dump",
+"is_print_compare_log": true
+}
+```
+npu_path或bench_path格式：必须包含dump.json、stack.json和construct.json，且construct.json不能为空。如果construct.json为空，请检查dump的level参数是否已选择L0或者mix。
+```
+├── npu_path或bench_path
+│   ├── dump_tensor_data（配置dump的task参数选择tensor时存在）
+|   |    ├── MintFunctional.relu.0.backward.input.0.npy
+|   |    ├── Mint.abs.0.forward.input.0.npy
+|   |    ...
+|   |    └── Cell.relu.ReLU.forward.0.input.0.npy
+|   ├── dump.json         # 数据信息
+|   ├── stack.json        # 调用栈信息
+|   └── construct.json    # 分层分级结构，level为L1时，construct.json内容为空
+```
+**2. 执行命令**：
+```
+msprobe -f mindspore graph -i ./compare.json -o ./output
+```
+比对完成后将在**output**下生成一个**vis后缀文件**。
+#### 3.2.3 批量构建或比对
+##### 3.2.3.1 多rank批量构建或比对
+批量构建或比对一个step下的所有rank的数据
+**1. 准备比对文件**：
+以在当前目录创建 ./compare.json 为例。
+```
+{
+"npu_path": "./npu_dump",
+"bench_path": "./bench_dump", # 只进行图构建可不配置
+"is_print_compare_log": true
+}
+```
+npu_path或bench_path格式：必须只包含rank+数字格式的文件夹，且每个rank文件夹中必须包含dump.json、stack.json和construct.json，且construct.json不能为空。如果construct.json为空，请检查dump的level参数是否已选择L0或者mix。
+进行批量图比对时，npu_path和bench_path中包含的rank+数字格式的文件夹必须数量一致且能够一一对应。
+```
+├── npu_path或bench_path
+|   ├── rank0
+|   │   ├── dump_tensor_data（仅配置dump的task参数选择tensor时存在）
+|   |   |    ├── MintFunctional.relu.0.backward.input.0.npy
+|   |   |    ├── Mint.abs.0.forward.input.0.npy
+|   |   |    ...
+|   |   |    └── Cell.relu.ReLU.forward.0.input.0.npy
+|   |   ├── dump.json         # 数据信息
+|   |   ├── stack.json        # 算子调用栈信息
+|   |   └── construct.json    # 分层分级结构，level为L1时，construct.json内容为空
+|   ├── rank1
+|   |   ├── dump_tensor_data
+|   |   |   └── ...
+|   |   ├── dump.json
+|   |   ├── stack.json
+|   |   └── construct.json
+|   ├── ...
+|   |
+|   └── rankn
+```
+**2. 执行命令**：
+```
+msprobe -f mindspore graph -i ./compare.json -o ./output
+```
+比对完成后将在**output**下生成n个**vis后缀文件**。
+图构建：
+```
+├── build_rank0_{timestamp}.vis
+├── build_rank1_{timestamp}.vis
+├── build_rank2_{timestamp}.vis
+├── build_rank3_{timestamp}.vis
+├── ...
+├── build_rankn_{timestamp}.vis
+```
+图比对：
+```
+├── compare_rank0_{timestamp}.vis
+├── compare_rank1_{timestamp}.vis
+├── compare_rank2_{timestamp}.vis
+├── compare_rank3_{timestamp}.vis
+├── ...
+├── compare_rankn_{timestamp}.vis
+```
+##### 3.2.3.2 多step批量构建或比对
+批量构建或比对多个step下的所有rank的数据
+**1. 准备比对文件**：
+以在当前目录创建 ./compare.json 为例。
+```
+{
+"npu_path": "./npu_dump",
+"bench_path": "./bench_dump", # 只进行图构建可不配置
+"is_print_compare_log": true
+}
+```
+npu_path或bench_path格式：必须只包含step+数字格式的文件夹，且每个step文件夹中必须只包含rank+数字格式的文件夹，每个rank文件夹中必须包含dump.json、stack.json和construct.json，且construct.json不能为空。如果construct.json为空，请检查dump的level参数是否已选择L0或者mix。
+进行批量图比对时，npu_path和bench_path中包含的step+数字格式的文件夹必须数量一致且能够一一对应，每个step文件夹中包含的rank+数字格式的文件夹必须数量一致且能够一一对应。
+```
+├── npu_path或bench_path
+│   ├── step0
+│   |   ├── rank0
+│   |   │   ├── dump_tensor_data（仅配置dump的task参数选择tensor时存在）
+|   |   |   |    ├── MintFunctional.relu.0.backward.input.0.npy
+|   |   |   |    ├── Mint.abs.0.forward.input.0.npy
+|   |   |   |    ...
+|   |   |   |    └── Cell.relu.ReLU.forward.0.input.0.npy
+│   |   |   ├── dump.json             # 数据信息
+│   |   |   ├── stack.json            # 调用栈信息
+│   |   |   └── construct.json        # 分层分级结构，level为L1时，construct.json内容为空
+│   |   ├── rank1
+|   |   |   ├── dump_tensor_data
+|   |   |   |   └── ...
+│   |   |   ├── dump.json
+│   |   |   ├── stack.json
+|   |   |   └── construct.json
+│   |   ├── ...
+│   |   |
+|   |   └── rankn
+│   ├── step1
+│   |   ├── ...
+│   ├── step2
+```
+**2. 执行命令**：
+```
+msprobe -f mindspore graph -i ./compare.json -o ./output
+```
+比对完成后将在**output**下生成若干个**vis后缀文件**。
+图构建：
+```
+├── build_step0_rank0_{timestamp}.vis
+├── build_step0_rank1_{timestamp}.vis
+├── build_step0_rank2_{timestamp}.vis
+├── build_step0_rank3_{timestamp}.vis
+├── build_step1_rank0_{timestamp}.vis
+├── build_step1_rank1_{timestamp}.vis
+├── build_step1_rank2_{timestamp}.vis
+├── build_step1_rank3_{timestamp}.vis
+├── ...
+├── build_stepn_rankn_{timestamp}.vis
+```
+图比对：
+```
+├── compare_step0_rank0_{timestamp}.vis
+├── compare_step0_rank1_{timestamp}.vis
+├── compare_step0_rank2_{timestamp}.vis
+├── compare_step0_rank3_{timestamp}.vis
+├── compare_step1_rank0_{timestamp}.vis
+├── compare_step1_rank1_{timestamp}.vis
+├── compare_step1_rank2_{timestamp}.vis
+├── compare_step1_rank3_{timestamp}.vis
+├── ...
+├── compare_stepn_rankn_{timestamp}.vis
+```
+## 4.启动tensorboard
+将生成vis文件的路径**out_path**传入--logdir
+```
+tensorboard --logdir out_path --bind_all --port [可选，端口号]
+```
+启动后会打印日志。
+``TensorBoard 2.15.1 at http://localhost.localdomain:6008/#graphs (Press CTRL+C to quit)``
+localhost.localdomain是机器地址，6008是端口号。
+**注意，localhost.localdomain需要替换为真实的服务器地址，例如真实的服务器地址为10.123.45.78，则需要在浏览器窗口输入http://10.123.45.78:6008/#graphs**
+**如果链接打不开(例如服务器无法直连需要挂vpn才能连接的场景)，可以尝试使用vscode连接服务器，在vscode终端输入：**
+```
+tensorboard --logdir out_path
+```
+按住CTRL键并点击终端中打印的链接即可。
+## 5.浏览器查看
+推荐使用谷歌浏览器，在浏览器中输入机器地址+端口号回车，出现TensorBoard页面，右上方选择GRAPHS_ASCEND即可展示模型结构图。
+节点需要双击打开。
+键盘WS可放大缩小，AD可左右移动，鼠标滚轮可上下移动。
+## 6.图比对说明
+### 颜色
+颜色越深，精度比对差异越大，越可疑，具体信息可见浏览器页面左下角颜色图例。
+### 疑似有精度问题判定
+#### 真实数据模式
+节点中所有输入的最小双千分之一指标与所有输出的最小双千分之一指标的差值，反映了双千分之一指标的下降情况，**值越大精度差距越大，颜色标记越深**。
+``One Thousandth Err Ratio（双千分之一）精度指标：Tensor中的元素逐个与对应的标杆数据对比，相对误差小于千分之一的元素个数占总元素个数的比例，比例越接近1越好``
+#### 统计信息模式
+节点中输出的统计量相对误差，**值越大精度差距越大，颜色标记越深**。
+``相对误差：abs((npu统计值 - bench统计值) / bench统计值)``
+其中小值不使用相对误差来判断精度差异，而是使用**绝对误差**来判断精度差异
+**判定为小值的阈值：**
+   - float32：1e-6
+   - float16：1e-3
+   - bfloat16：1e-3
+**小值域的绝对误差阈值：**
+   - float32：1e-6
+   - float16：1e-3
+   - bfloat16：1e-3
+#### md5模式
+节点中任意输入输出的md5值不同。
+## 7.附录
+### 7.1 自定义映射文件（Layer）
+文件名格式：\*.yaml，\*为文件名，可自定义。
+文件内容示例：
+```yaml
+ParallelAttention:                 # Layer层名称
+  qkv_proj: query_key_value        # 冒号左侧为MindSpore框架模型代码中嵌套的Layer层名称，冒号右侧为PyTorch框架模型代码中嵌套的Layer层名称
+  out_proj: dense
+ParallelTransformerLayer:
+  attention: self_attention
+Embedding:
+  dropout: embedding_dropout
+ParallelMLP:
+  mapping: dense_h_to_4h
+  projection: dense_4h_to_h
+PipelineCell:
+  model: module
+Cell:
+  network_with_loss: module
+layers:                           # 手动映射MindSpore与PyTorch模型代码中的Layer层序号
+  '5': '0'
+  '6': '1'
+  '7': '2'
+  '8': '3'
+  '9': '4'
+```
+Layer层名称需要从模型代码中获取。
+yaml文件中只需配置MindSpore与PyTorch模型代码中功能一致但名称不同的Layer层，名称相同的Layer层会被自动识别并映射。
+模型代码示例：
+![ms_dump](./img/ms_layer.png)

msprobe/docs/23.tool_function_introduction.md ADDED Viewed

@@ -0,0 +1,28 @@
+# msprobe 工具功能模块简介、适用场景和当前版本局限性
+## 1 PyTorch框架
+| 功能名（英文）                                                                            | 简介                                                            | 适用场景/优势                                                                  | 当前版本局限性                                                                                                         |
+|------------------------------------------------------------------------------------|---------------------------------------------------------------|--------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------|
+| [数据采集<br>（dump）](./05.data_dump_PyTorch.md)                                        | 采集模型训练过程中的API或Module层级的前反向输入输出数据，包括层次关系、统计值信息、真实数据和调用栈等。      | 1、将模型中训练的API或Module的前反向输入输出数据保存下来分析<br> 2、模型出现溢出时，可用于查看哪些API或Module出现了溢出 | 1、API级数据采集仅支持白名单列表上的API<br>2、工具会做一些同步操作，引入工具可能会导致一些同步问题消失<br>3、当前对inplace操作API或Module的支持度有限<br>4、暂不支持参数及参数梯度的采集 |
+| [离线预检<br>（api_accuracy_checker）](./07.accuracy_checker_PyTorch.md)                 | 为网络中每个API创建用例，检验其精度，并根据不同比对算法综合判定API在NPU上的精度是否达标，快速找出精度差异API。 | 1、对模型中所有的API做精度初步排查<br>2、精度排查不受模型累计误差影响                                  | 1、依赖GPU环境<br>2、不支持通信算子<br>3、仅支持部分融合算子                                                                           |
+| [整网比对<br>（compare）](./10.accuracy_compare_PyTorch.md)                              | 计算模型整网NPU和标杆设备的精度误差指标，标记精度异常API或Module，助力快速定位精度问题根因。          | 1、整网比对定位精度可疑算子                                                           | 1、由于使用整网dump数据，定位的可疑算子受累计误差影响<br>2、当模型规模较大时，比对所需时间较长                                                            |
+| [在线预检<br>（online_api_accuracy_checker）](./08.accuracy_checker_online_PyTorch.md)   | 通过TCP通信或共享存储空间的方式，进行在线精度预检，解决离线预检大数据量落盘、传输困难痛点。               | 1、使用离线预检，数据量较大落盘困难或传输耗时长时，可通过在线预检进行精度排查                                  | 1、依赖GPU环境，NPU和GPU能够通信<br>2、重计算模式下，不支持反向aten算子预检                                                                 |
+| [溢出检查<br>（overflow_checker）](./12.overflow_check_PyTorch.md)                       | 检测模型计算过程的输入输出，并在溢出时落盘数据，助力用户快速定位溢出位置。                         | 1、当模型出现溢出时，用于快速定位最先溢出的API或Module<br>2、相比数据采集，性能更优，磁盘压力更小                 | 1、局限性同数据采集                                                                                                      |
+| [数据解析<br>（parse_tool）](./14.data_parse_PyTorch.md)                                 | 交互式界面处理解析kernel层级dump数据，便于查看分析。                               | 1、比对kernel层级dump数据的一致性                                                   | 1、仅限于NPU                                                                                                        |
+| [无标杆比对<br>（free_benchmark）](./15.free_benchmarking_PyTorch.md)                     | 不依赖标杆数据，通过对算子输入增加微小扰动，计算扰动后输出与原始输出的相对误差，识别有精度风险算子。            | 1、无标杆数据场景下的算子精度排查<br>2、对个别算子进行升精度、“to cpu”等操作，以验证其对模型loss的影响             | 1、由于需要拷贝输入进行二次执行，所以在遇到大张量的输入时容易发生显存OOM的问题, 特别是反向比对过程。建议结合白名单使用<br>2、比对会延长训练时间，整网比对可能会造成严重的耗时膨胀，建议结合白名单使用        |
+| [梯度状态监测<br>（grad_probe）](./17.grad_probe.md)                                       | 可导出模型权重梯度数据并对比相似度，助力确认训练过程精度问题step和反向中的异常。                    | 1、需要分析梯度数据时<br>2、需要定位发生问题的step时                                          | 暂无                                                                                                              |
+| [在线精度比对<br>（online_dispatch）](./18.online_dispatch.md)                             | 训练过程中直接完成NPU和CPU的精度比对并输出比对结果。                                 | 1、执行一次就可获取NPU和CPU分别执行后的精度比对结果                                            | 暂无                                                                                                              |
+| [训练状态监控<br>（monitor）](./19.monitor.md)                                             | 收集模型训练过程中的激活值、梯度和优化器状态，助力分析计算、通信、优化器各部分异常情况。                  | 1、通过监控模块级统计量指标，快速定位异常模块位置，如loss出现nan                                     | 1、仅支持模块级别统计量指标分析<br>2、仅支持megatron、deepspeed框架<br>3、少量增加时间和显存膨胀                                                  |
+| [可视化比对<br>（visualization） ](./21.visualization_PyTorch.md)                         | 解析dump的精度数据，还原模型图结构，比对各层级精度数据，助力理解模型结构、分析精度问题。                | 1、整网精度比对定位可疑算子，通过浏览器展示比对结果，支持快速搜索到可疑算子<br>2、支持查看模型层级结果，比对模型层级结构差异        | 1、由于使用整网dump数据，定位的可疑算子受累计误差影响<br>2、当模型规模较大时，比对所需时间较长                                                            |
+## 2 MindSpore框架
+| 功能名（英文）                                                              | 简介                                                                | 适用场景/优势                                                                      | 当前版本局限性                                                                                                                                           |
+|----------------------------------------------------------------------|-------------------------------------------------------------------|------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------|
+| [数据采集<br>（dump）](./06.data_dump_MindSpore.md)                        | 采集模型训练过程中的API或Cell层级的前反向输入输出数据，包括层次关系、统计值信息、真实数据和调用栈等。            | 1、将模型中训练的API或Cell的前反向输入输出数据保存下来分析<br> 2、模型出现溢出时，可用于查看哪些API或Cell出现了溢出         | 1、API级数据采集仅支持白名单列表上的API<br>2、当前对inplace操作API或Cell的支持度有限<br>3、暂不支持参数及参数梯度的采集                                                                       |
+| [离线预检<br>（api_accuracy_checker）](./09.accuracy_checker_MindSpore.md) | 为网络中每个API创建用例，检验其精度，并根据不同比对算法综合判定API在NPU上的精度是否达标，快速找出精度差异API。     | 1、对模型中所有的API做精度初步排查<br>2、精度排查不受模型累计误差影响                                      | 1、仅针对MindSpore.mint API                                                                                                                           |
+| [整网比对<br>（compare）](./11.accuracy_compare_MindSpore.md)              | NPU精度数据与标杆数据的比对，支持MindSpore框架内和与PyTorch跨框架的比对，助力快速定位精度异常API或Cell。 | 1、MindSpore同框架静态图比对<br>2、MindSpore同框架动态图比对<br>3、MindSpore vs PyTorch跨框架动态图比对 | 1、部分PyTorch的API关联不到MindSpore，需要手动配置映射关系                                                                                                           |
+| [溢出检查<br>（overflow_checker）](./13.overflow_check_MindSpore.md)       | 检测模型计算过程的输入输出，并在溢出时落盘数据，助力用户快速定位溢出位置。                             | 1、当模型出现溢出时，可用于定位最先溢出的API或Cell或kernel<br>2、相比数据采集，性能更优，磁盘压力更小                 | 1、除具有与数据采集功能相同的局限性外，动态图场景下，不支持 Primitive 和 Jit 类 API 的检测<br>2、动态图场景下，仅支持检测API或Cell级别溢出<br>3、静态图场景下，仅支持检测kernel级别溢出                                |
+| [无标杆比对<br>（free_benchmark）](./16.free_benchmarking_MindSpore.md)     | 不依赖标杆数据，通过对算子输入增加微小扰动，计算扰动后输出与原始输出的相对误差，识别有精度风险算子。                | 1、无标杆数据场景下的算子精度排查<br>2、对个别算子进行升精度修复，验证其对模型loss的影响                            | 1、仅支持动态图场景<br>2、由于需要拷贝输入进行二次执行，所以在遇到大张量的输入时容易发生显存OOM的问题, 特别是反向比对过程。建议结合白名单使用<br>3、比对会延长训练时间，整网比对可能会造成严重的耗时膨胀，建议结合白名单使用<br>4、不支持“to cpu”操作，不支持预热功能 |
+| [可视化比对<br>（visualization） ](./22.visualization_MindSpore.md)         | 解析dump的精度数据，还原模型图结构，比对各层级精度数据，助力理解模型结构、分析精度问题。                    | 1、整网精度比对定位可疑算子，通过浏览器展示比对结果，支持快速搜索到可疑算子<br>2、支持查看模型层级结果，比对模型层级结构差异            | 1、由于使用整网dump数据，定位的可疑算子受累计误差影响<br>2、当模型规模较大时，比对所需时间较长                                                                                              |

msprobe/docs/{FAQ_PyTorch.md → FAQ.md} RENAMED Viewed

@@ -1,4 +1,19 @@
-# 1 精度预检工具
+# 1 数据采集
+1. dump.json中API或Module统计信息里出现null或None值的原因是什么？
+   dump.json里出现null或None值的可能原因较多，常见的场景有：
+   - 输入或者输出参数本身是一个None值。
+   - 输入参数或输出参数类型当前工具不支持，会有日志打印提醒。
+   - 输入或者输出tensor的dtype为bool时，Mean和Norm等字段为null。
+2. 如果存在namedtuple类型的数据作为nn.Module的输出，工具会将各字段数据dump下来，但是输出数据类型会被转成tuple，原因是什么？
+   - 这是由于PyTorch框架自身在注册module的backward hook时，会将namedtuple类型转成tuple类型。
+# 2 精度预检(PyTorch)
 1. 预检工具在 dump 和 run_ut 的过程中，是否需要同时开启或关闭 jit 编译（jit_compile）？
@@ -52,20 +67,20 @@
    | `__matmul__`    | 矩阵乘法         |
    | `__mod__`       | %                |
    | `__mul__`       | *                |
-   | `__nonzero__`   | 同`__bool__`     |
+   | `__nonzero__`   | 同 `__bool__`     |
    | `__or__`        | \|               |
    | `__radd__`      | +（反向）        |
    | `__rmul__`      | *（反向）        |
    | `__rshift__`    | >>               |
    | `__sub__`       | -                |
-   | `__truediv__`   | 同`__div__`      |
+   | `__truediv__`   | 同 `__div__`      |
    | `__xor__`       | ^                |
-# 2 精度比对工具
+# 3 精度比对(PyTorch)
-## 2.1 工具使用
+## 3.1 工具使用
-### 2.1.1 dump 指定融合算子
+### 3.1.1 dump 指定融合算子
 数据采集当前支持融合算子的输入输出，需要在 `mstt/debug/accuracy_tools/msprobe/pytorch/hook_module/support_wrap_ops.yaml` 中添加，比如以下代码段调用的 softmax 融合算子。
@@ -83,7 +98,7 @@ def npu_forward_fused_softmax(self, input_, mask):
 （npu_scaled_masked_softmax 融合算子工具已支持 dump，本例仅供参考）。
-## 2.2 常见问题
+## 3.2 常见问题
 1. 在同一个目录多次执行 dump 会冲突吗？
@@ -97,7 +112,7 @@ def npu_forward_fused_softmax(self, input_, mask):
     答：torch 版本和硬件差异属于正常情况。
-## 2.3 异常情况
+## 3.3 异常情况
 1. HCCL 报错： error code: EI0006。
@@ -168,9 +183,9 @@ def npu_forward_fused_softmax(self, input_, mask):
     答：注释工具目录 `mstt/debug/accuracy_tools/msprobe/pytorch/hook_module/support_wrap_ops.yaml` 文件中 `Tensor: ` 下的 `- __getitem__`，工具会跳过采集该 API。如果是需要采集关键位置 API 也可以考虑根据报错堆栈信息注释引发报错的类型检查。
-11. 添加 msprobe 工具后 F.gelu 触发 ValueError 报错：`activation_func must be F.gelu`等。
+11. 添加 msprobe 工具后 F.gelu 触发 ValueError 报错：`activation_func must be F.gelu` 等，以及采集 Megatron 数据时报错：`ValueError(Only support fusion of gelu and swiglu)`。
-    答：注释工具目录 `mstt/debug/accuracy_tools/msprobe/pytorch/hook_module/support_wrap_ops.yaml` 文件中 `functional: ` 下的 `-gelu`，工具会跳过采集该 API。如果需要采集关键位置 api 也可以考虑根据报错堆栈信息注释引发报错的类型检查。
+    答：这一类问题是因为工具本身封装了 torch 算子，所以校验算子名时会报错。注释 `mstt/debug/accuracy_tools/msprobe/pytorch/hook_module/support_wrap_ops.yaml` 文件中的 `- gelu` 或者 `- silu`，工具会跳过采集该 API。如果需要采集关键位置 API 也可以考虑根据报错堆栈信息注释引发报错的类型检查。
 12. 添加 msprobe 工具后触发与 AsStrided 算子相关、或者编译相关的报错，如：`Failed to compile Op [AsStrided]`。

mindstudio-probe 1.0.4__py3-none-any.whl → 1.1.1__py3-none-any.whl

mindstudio-probe 1.0.4py3-none-any.whl → 1.1.1py3-none-any.whl