PyPI - mindstudio-probe - Versions diffs - 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl - Mend

mindstudio-probe 1.0.1-py3-none-any.whl → 1.0.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (249) hide show

{mindstudio_probe-1.0.1.dist-info → mindstudio_probe-1.0.3.dist-info}/METADATA +5 -1
mindstudio_probe-1.0.3.dist-info/RECORD +272 -0
msprobe/README.md +78 -23
msprobe/__init__.py +1 -0
msprobe/config/README.md +182 -40
msprobe/config/config.json +22 -0
msprobe/core/__init__.py +0 -0
msprobe/{pytorch → core}/advisor/advisor.py +3 -3
msprobe/{pytorch → core}/advisor/advisor_result.py +2 -2
msprobe/core/common/const.py +82 -5
msprobe/core/common/exceptions.py +30 -18
msprobe/core/common/file_check.py +19 -1
msprobe/core/common/log.py +15 -1
msprobe/core/common/utils.py +130 -30
msprobe/core/common_config.py +32 -19
msprobe/core/compare/acc_compare.py +299 -0
msprobe/core/compare/check.py +95 -0
msprobe/core/compare/compare_cli.py +49 -0
msprobe/core/compare/highlight.py +222 -0
msprobe/core/compare/multiprocessing_compute.py +149 -0
msprobe/{pytorch → core}/compare/npy_compare.py +55 -4
msprobe/core/compare/utils.py +429 -0
msprobe/core/data_dump/data_collector.py +39 -35
msprobe/core/data_dump/data_processor/base.py +85 -37
msprobe/core/data_dump/data_processor/factory.py +5 -7
msprobe/core/data_dump/data_processor/mindspore_processor.py +198 -0
msprobe/core/data_dump/data_processor/pytorch_processor.py +94 -51
msprobe/core/data_dump/json_writer.py +11 -11
msprobe/core/grad_probe/__init__.py +0 -0
msprobe/core/grad_probe/constant.py +71 -0
msprobe/core/grad_probe/grad_compare.py +175 -0
msprobe/core/grad_probe/utils.py +52 -0
msprobe/doc/grad_probe/grad_probe.md +207 -0
msprobe/doc/grad_probe/img/image-1.png +0 -0
msprobe/doc/grad_probe/img/image-2.png +0 -0
msprobe/doc/grad_probe/img/image-3.png +0 -0
msprobe/doc/grad_probe/img/image-4.png +0 -0
msprobe/doc/grad_probe/img/image.png +0 -0
msprobe/mindspore/api_accuracy_checker/__init__.py +0 -0
msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +246 -0
msprobe/mindspore/api_accuracy_checker/api_info.py +69 -0
msprobe/mindspore/api_accuracy_checker/api_runner.py +152 -0
msprobe/mindspore/api_accuracy_checker/base_compare_algorithm.py +197 -0
msprobe/mindspore/api_accuracy_checker/compute_element.py +224 -0
msprobe/mindspore/api_accuracy_checker/main.py +16 -0
msprobe/mindspore/api_accuracy_checker/type_mapping.py +114 -0
msprobe/mindspore/api_accuracy_checker/utils.py +63 -0
msprobe/mindspore/cell_processor.py +34 -0
msprobe/mindspore/common/const.py +87 -0
msprobe/mindspore/common/log.py +38 -0
msprobe/mindspore/common/utils.py +57 -0
msprobe/mindspore/compare/distributed_compare.py +75 -0
msprobe/mindspore/compare/ms_compare.py +117 -0
msprobe/mindspore/compare/ms_graph_compare.py +317 -0
msprobe/mindspore/compare/ms_to_pt_api.yaml +399 -0
msprobe/mindspore/debugger/debugger_config.py +38 -15
msprobe/mindspore/debugger/precision_debugger.py +79 -4
msprobe/mindspore/doc/compare.md +58 -0
msprobe/mindspore/doc/dump.md +158 -6
msprobe/mindspore/dump/dump_tool_factory.py +19 -22
msprobe/mindspore/dump/hook_cell/api_registry.py +104 -0
msprobe/mindspore/dump/hook_cell/hook_cell.py +53 -0
msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml +925 -0
msprobe/mindspore/dump/hook_cell/wrap_functional.py +91 -0
msprobe/mindspore/dump/hook_cell/wrap_tensor.py +63 -0
msprobe/mindspore/dump/jit_dump.py +56 -0
msprobe/mindspore/dump/kernel_kbyk_dump.py +65 -0
msprobe/mindspore/free_benchmark/__init__.py +0 -0
msprobe/mindspore/free_benchmark/api_pynative_self_check.py +116 -0
msprobe/mindspore/free_benchmark/common/__init__.py +0 -0
msprobe/mindspore/free_benchmark/common/config.py +12 -0
msprobe/mindspore/free_benchmark/common/handler_params.py +17 -0
msprobe/mindspore/free_benchmark/common/utils.py +71 -0
msprobe/mindspore/free_benchmark/data/support_wrap_ops.yaml +842 -0
msprobe/mindspore/free_benchmark/decorator/__init__.py +0 -0
msprobe/mindspore/free_benchmark/decorator/dec_forward.py +42 -0
msprobe/mindspore/free_benchmark/decorator/decorator_factory.py +107 -0
msprobe/mindspore/free_benchmark/handler/__init__.py +0 -0
msprobe/mindspore/free_benchmark/handler/base_handler.py +90 -0
msprobe/mindspore/free_benchmark/handler/check_handler.py +41 -0
msprobe/mindspore/free_benchmark/handler/fix_handler.py +36 -0
msprobe/mindspore/free_benchmark/handler/handler_factory.py +21 -0
msprobe/mindspore/free_benchmark/perturbation/add_noise.py +67 -0
msprobe/mindspore/free_benchmark/perturbation/base_perturbation.py +21 -0
msprobe/mindspore/free_benchmark/perturbation/bit_noise.py +63 -0
msprobe/mindspore/free_benchmark/perturbation/improve_precision.py +34 -0
msprobe/mindspore/free_benchmark/perturbation/no_change.py +12 -0
msprobe/mindspore/free_benchmark/perturbation/perturbation_factory.py +27 -0
msprobe/mindspore/free_benchmark/self_check_tool_factory.py +33 -0
msprobe/mindspore/grad_probe/__init__.py +0 -0
msprobe/mindspore/grad_probe/global_context.py +91 -0
msprobe/mindspore/grad_probe/grad_analyzer.py +231 -0
msprobe/mindspore/grad_probe/grad_monitor.py +27 -0
msprobe/mindspore/grad_probe/grad_stat_csv.py +132 -0
msprobe/mindspore/grad_probe/hook.py +92 -0
msprobe/mindspore/grad_probe/utils.py +29 -0
msprobe/mindspore/ms_config.py +63 -15
msprobe/mindspore/overflow_check/overflow_check_tool_factory.py +17 -15
msprobe/mindspore/runtime.py +4 -0
msprobe/mindspore/service.py +354 -0
msprobe/mindspore/task_handler_factory.py +7 -4
msprobe/msprobe.py +66 -26
msprobe/pytorch/__init__.py +1 -1
msprobe/pytorch/api_accuracy_checker/common/config.py +21 -16
msprobe/pytorch/api_accuracy_checker/common/utils.py +1 -60
msprobe/pytorch/api_accuracy_checker/compare/algorithm.py +2 -5
msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +46 -10
msprobe/pytorch/api_accuracy_checker/compare/compare.py +84 -48
msprobe/pytorch/api_accuracy_checker/compare/compare_utils.py +8 -12
msprobe/pytorch/api_accuracy_checker/config.yaml +7 -1
msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py +15 -11
msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +11 -15
msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +16 -9
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +193 -105
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py +68 -1
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/__init__.py +0 -0
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py +202 -0
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +324 -0
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +204 -0
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +218 -0
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/ssl_config.py +10 -0
msprobe/pytorch/bench_functions/__init__.py +15 -0
msprobe/pytorch/bench_functions/apply_adam_w.py +28 -0
msprobe/pytorch/bench_functions/confusion_transpose.py +19 -0
msprobe/pytorch/bench_functions/fast_gelu.py +55 -0
msprobe/pytorch/bench_functions/layer_norm_eval.py +6 -0
msprobe/pytorch/bench_functions/linear.py +12 -0
msprobe/pytorch/bench_functions/matmul_backward.py +48 -0
msprobe/pytorch/bench_functions/npu_fusion_attention.py +421 -0
msprobe/pytorch/bench_functions/rms_norm.py +15 -0
msprobe/pytorch/bench_functions/rotary_mul.py +52 -0
msprobe/pytorch/bench_functions/scaled_mask_softmax.py +26 -0
msprobe/pytorch/bench_functions/swiglu.py +55 -0
msprobe/pytorch/common/parse_json.py +3 -1
msprobe/pytorch/common/utils.py +83 -7
msprobe/pytorch/compare/distributed_compare.py +19 -64
msprobe/pytorch/compare/match.py +3 -6
msprobe/pytorch/compare/pt_compare.py +40 -0
msprobe/pytorch/debugger/debugger_config.py +11 -2
msprobe/pytorch/debugger/precision_debugger.py +34 -4
msprobe/pytorch/doc/api_accuracy_checker.md +57 -13
msprobe/pytorch/doc/api_accuracy_checker_online.md +187 -0
msprobe/pytorch/doc/dump.md +73 -20
msprobe/pytorch/doc/ptdbg_ascend_compare.md +75 -11
msprobe/pytorch/doc/ptdbg_ascend_quickstart.md +3 -3
msprobe/pytorch/doc/run_overflow_check.md +1 -1
msprobe/pytorch/doc/无标杆工具场景验证和性能基线报告.md +151 -0
msprobe/pytorch/free_benchmark/common/constant.py +3 -0
msprobe/pytorch/free_benchmark/common/utils.py +4 -0
msprobe/pytorch/free_benchmark/compare/grad_saver.py +22 -26
msprobe/pytorch/free_benchmark/main.py +7 -4
msprobe/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py +1 -1
msprobe/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py +1 -1
msprobe/pytorch/free_benchmark/perturbed_layers/npu/change_value.py +1 -1
msprobe/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py +3 -3
msprobe/pytorch/free_benchmark/perturbed_layers/npu/no_change.py +1 -1
msprobe/pytorch/free_benchmark/perturbed_layers/run_cpu.py +1 -1
msprobe/pytorch/free_benchmark/result_handlers/base_handler.py +43 -29
msprobe/pytorch/free_benchmark/result_handlers/handler_factory.py +0 -1
msprobe/pytorch/function_factory.py +75 -0
msprobe/pytorch/functional/dump_module.py +4 -4
msprobe/pytorch/grad_probe/__init__.py +0 -0
msprobe/pytorch/grad_probe/grad_monitor.py +90 -0
msprobe/pytorch/grad_probe/grad_stat_csv.py +129 -0
msprobe/pytorch/hook_module/hook_module.py +14 -3
msprobe/pytorch/hook_module/support_wrap_ops.yaml +2 -1
msprobe/pytorch/hook_module/utils.py +9 -9
msprobe/pytorch/hook_module/wrap_aten.py +20 -10
msprobe/pytorch/hook_module/wrap_distributed.py +10 -7
msprobe/pytorch/hook_module/wrap_functional.py +4 -7
msprobe/pytorch/hook_module/wrap_npu_custom.py +21 -10
msprobe/pytorch/hook_module/wrap_tensor.py +5 -6
msprobe/pytorch/hook_module/wrap_torch.py +5 -7
msprobe/pytorch/hook_module/wrap_vf.py +6 -8
msprobe/pytorch/module_processer.py +53 -13
msprobe/pytorch/online_dispatch/compare.py +4 -4
msprobe/pytorch/online_dispatch/dispatch.py +39 -41
msprobe/pytorch/online_dispatch/dump_compare.py +17 -47
msprobe/pytorch/online_dispatch/single_compare.py +5 -5
msprobe/pytorch/online_dispatch/utils.py +2 -43
msprobe/pytorch/parse_tool/lib/compare.py +31 -19
msprobe/pytorch/parse_tool/lib/config.py +2 -1
msprobe/pytorch/parse_tool/lib/parse_tool.py +4 -4
msprobe/pytorch/parse_tool/lib/utils.py +34 -80
msprobe/pytorch/parse_tool/lib/visualization.py +4 -3
msprobe/pytorch/pt_config.py +100 -6
msprobe/pytorch/service.py +104 -19
mindstudio_probe-1.0.1.dist-info/RECORD +0 -228
msprobe/mindspore/dump/api_kbk_dump.py +0 -55
msprobe/pytorch/compare/acc_compare.py +0 -1024
msprobe/pytorch/compare/highlight.py +0 -100
msprobe/test/core_ut/common/test_utils.py +0 -345
msprobe/test/core_ut/data_dump/test_data_collector.py +0 -47
msprobe/test/core_ut/data_dump/test_json_writer.py +0 -183
msprobe/test/core_ut/data_dump/test_scope.py +0 -151
msprobe/test/core_ut/test_common_config.py +0 -152
msprobe/test/core_ut/test_file_check.py +0 -218
msprobe/test/core_ut/test_log.py +0 -109
msprobe/test/mindspore_ut/test_api_kbk_dump.py +0 -51
msprobe/test/mindspore_ut/test_debugger_config.py +0 -42
msprobe/test/mindspore_ut/test_dump_tool_factory.py +0 -51
msprobe/test/mindspore_ut/test_kernel_graph_dump.py +0 -66
msprobe/test/mindspore_ut/test_kernel_graph_overflow_check.py +0 -63
msprobe/test/mindspore_ut/test_ms_config.py +0 -69
msprobe/test/mindspore_ut/test_overflow_check_tool_factory.py +0 -51
msprobe/test/mindspore_ut/test_precision_debugger.py +0 -56
msprobe/test/mindspore_ut/test_task_handler_factory.py +0 -58
msprobe/test/pytorch_ut/advisor/test_advisor.py +0 -83
msprobe/test/pytorch_ut/api_accuracy_checker/common/test_common_utils.py +0 -108
msprobe/test/pytorch_ut/api_accuracy_checker/common/test_config.py +0 -39
msprobe/test/pytorch_ut/api_accuracy_checker/compare/test_algorithm.py +0 -112
msprobe/test/pytorch_ut/api_accuracy_checker/compare/test_api_precision_compare.py +0 -77
msprobe/test/pytorch_ut/api_accuracy_checker/compare/test_compare.py +0 -125
msprobe/test/pytorch_ut/api_accuracy_checker/compare/test_compare_column.py +0 -10
msprobe/test/pytorch_ut/api_accuracy_checker/compare/test_compare_utils.py +0 -43
msprobe/test/pytorch_ut/api_accuracy_checker/run_ut/dump.json +0 -179
msprobe/test/pytorch_ut/api_accuracy_checker/run_ut/forward.json +0 -63
msprobe/test/pytorch_ut/api_accuracy_checker/run_ut/test_data_generate.py +0 -99
msprobe/test/pytorch_ut/api_accuracy_checker/run_ut/test_multi_run_ut.py +0 -115
msprobe/test/pytorch_ut/api_accuracy_checker/run_ut/test_run_ut.py +0 -72
msprobe/test/pytorch_ut/compare/test_acc_compare.py +0 -17
msprobe/test/pytorch_ut/free_benchmark/perturbed_layers/test_perturbed_layser.py +0 -105
msprobe/test/pytorch_ut/free_benchmark/result_handlers/test_result_handler.py +0 -121
msprobe/test/pytorch_ut/free_benchmark/test_main.py +0 -101
msprobe/test/pytorch_ut/functional/test_dump_module.py +0 -15
msprobe/test/pytorch_ut/hook_module/test_api_registry.py +0 -130
msprobe/test/pytorch_ut/hook_module/test_hook_module.py +0 -42
msprobe/test/pytorch_ut/hook_module/test_wrap_aten.py +0 -65
msprobe/test/pytorch_ut/hook_module/test_wrap_distributed.py +0 -35
msprobe/test/pytorch_ut/hook_module/test_wrap_functional.py +0 -20
msprobe/test/pytorch_ut/hook_module/test_wrap_tensor.py +0 -35
msprobe/test/pytorch_ut/hook_module/test_wrap_torch.py +0 -43
msprobe/test/pytorch_ut/hook_module/test_wrap_vf.py +0 -11
msprobe/test/pytorch_ut/test_pt_config.py +0 -69
msprobe/test/pytorch_ut/test_service.py +0 -59
msprobe/test/resources/advisor.txt +0 -3
msprobe/test/resources/compare_result_20230703104808.csv +0 -9
msprobe/test/resources/compare_result_without_accuracy.csv +0 -9
msprobe/test/resources/config.yaml +0 -3
msprobe/test/resources/npu_test.pkl +0 -8
msprobe/test/run_test.sh +0 -30
msprobe/test/run_ut.py +0 -58
msprobe/test/test_module_processer.py +0 -64
{mindstudio_probe-1.0.1.dist-info → mindstudio_probe-1.0.3.dist-info}/LICENSE +0 -0
{mindstudio_probe-1.0.1.dist-info → mindstudio_probe-1.0.3.dist-info}/WHEEL +0 -0
{mindstudio_probe-1.0.1.dist-info → mindstudio_probe-1.0.3.dist-info}/entry_points.txt +0 -0
{mindstudio_probe-1.0.1.dist-info → mindstudio_probe-1.0.3.dist-info}/top_level.txt +0 -0
/msprobe/{pytorch → core}/advisor/advisor_const.py +0 -0
/msprobe/pytorch/doc/{atat → msprobe}精度工具数据dump标准性能基线报告.md +0 -0

msprobe/pytorch/doc/无标杆工具场景验证和性能基线报告.md ADDED Viewed

@@ -0,0 +1,151 @@
+# 无标杆工具场景验证和性能基线报告
+## 环境信息
+NPU：Atlas A2 训练系列产品
+CPU：
+![输入图片说明](img/cpu_info.png)
+Torch：2.1.0
+CANN：8.0.T5
+除上述环境信息影响性能外，API的数量、种类以及Shape都会对性能产生影响，因此本次选取不同场景网络和不同算子进行测试。
+## 模型信息和性能基线
+大模型在使用msprobe工具dump数据时，建议先简化模型层数，减少dump数据量。
+以下场景的性能基线测试数据均为多次测试后取平均值，因此实际运行时性能数据可能会根据环境状态稍有浮动。
+### LLaMA2-7B
+NUM_LAYER：1，1卡，主要数据类型：FLOAT16，模型来源: ascend/ModelLink
+其中，softmax算子为FLOAT32，输入输出均为2G大小，为模型最大显存开销的API。
+在该模型下、对无标杆工具处理模式、插装范围、扰动方式组合下性能和显存基线进行覆盖。
+性能基线报告
+其中耗时为训练10步，去除第一步耗时所得的平均每步耗时。
+| 处理模式                     | 前/反向                 | 算子范围                 | 扰动方式 | 耗时（s） | 显存峰值（GB） | 耗时膨胀倍数 | 显存峰值膨胀倍数 | 备注                           |
+|--------------------------------|-----------------------------------|-----------------|----------|--------------------------------|--------------------------------|--------------------------------|--------------------------------|--------------------------------|
+| /        | /       | /           | /                 | 0.24      | 13.69          | 1            | 1                | 混精模式基线                   |
+| check    | 前      | ["softmax"] | improve_precision | 0.26      | 13.69          | 1.08         | 1             | softmax本身为高精度，跳过      |
+| check    | 前      | ["softmax"] | add_noise         | 0.54      | 19.17          | 2.25         | 1.40             |                                |
+| check    | 前      | ["softmax"] | bit_noise         | 0.56      | 19.17          | 2.33         | 1.40             |                                |
+| check    | 前      | ["softmax"] | change_value      | 0.48      | 14.9           | 2            | 1.09             |                                |
+| check    | 前      | ["softmax"] | no_change         | 0.47      | 14.9           | 1.96         | 1.09             |                                |
+| check | 前 | ["softmax"] | to_cpu | 26.45 | 22.67 | 110.21 | 1.66 | 不建议整网 |
+| check    | 前      | ["matmul"]  | improve_precision | 0.57      | 13.69          | 2.38       | 1                |                                |
+| check    | 前      | ["matmul"]  | change_value      | 0.48      | 13.69          | 2            | 1                |                                |
+| check | 前 | ["matmul"] | to_cpu | 78.43 | 19.20 | 326.79 | 1.40 | 不建议整网 |
+| check | 前 | [] | improve_precision | 3.45 | 18.79 | 14.37 | 1.37 | |
+| check | 前 | [] | add_noise | 4.67 | 19.17 | 19.46 | 1.40 | |
+| check | 前 | [] | bit_noise | 16.99 | 19.17 | 70.79 | 1.40 | |
+| check | 前 | [] | no_change | 3.22 | 14.90 | 13.42 | 1.09 | |
+| check    | 反      | ["softmax"] | improve_precision | 6.23      | 25.69          | 25.96        | 1.88             | 不建议整网                     |
+| check    | 反      | ["softmax"] | change_value      | 22.76     | 25.69          | 94.83        | 1.88             | 不建议整网                     |
+| check | 反 | ["softmax"] | to_cpu | 141.71 | 26.19 | 590.46 | 1.91 | 不建议整网 |
+| fix      | 前      | ["softmax"] | to_cpu            | 9.70      | 16.67          | 40.42        | 1.22             | 不支持整网、不支持反向         |
+| fix      | 前      | ["softmax"] | improve_precision | 0.26      | 14.67          | 1.08         | 1.07             | 不支持整网、不支持反向         |
+| 预热     | 前      | []          | improve_precision | 155.07 | 24.79 | 646.13 | 1.81 | 低精度模型基线、只测预热的迭代 |
+| 预热     | 反      | []          | improve_precision | 72.29 | 22.01 | 301.21 | 1.61 | 低精度模型基线、只测预热的迭代，grad_output为高精度的算子跳过 |
+### Aquila2-7B
+NUM_LAYER：1，1卡，主要数据类型：FLOAT16，模型来源: ascend/ModelLink
+| 处理模式                     | 前/反向                 | 算子范围                 | 扰动方式 | 耗时（s） | 显存峰值（GB） | 耗时膨胀倍数 | 显存峰值膨胀倍数 | 备注                           |
+|--------------------------------|-----------------------------------|-----------------|----------|--------------------------------|--------------------------------|--------------------------------|--------------------------------|--------------------------------|
+| /        | /       | /           | /                 | 0.17 | 13.66 | 1            | 1                | 混精模式基线                   |
+| check    | 前      | []          | improve_precision | 1.57 | 14.24 | 9.24 | 1.04 |                                |
+| check    | 反      | []          | add_noise         | 21.05 | 14.19 | 123.82 | 1.04 |                                |
+| fix      | 前      | []          | improve_precision | 0.95 | 15.55 | 5.59 | 1.14 |                                |
+### Baichuan2-7B
+NUM_LAYER：1，1卡，主要数据类型：FLOAT16，模型来源: ascend/ModelLink
+| 处理模式                     | 前/反向                 | 算子范围                 | 扰动方式 | 耗时（s） | 显存峰值（GB） | 耗时膨胀倍数 | 显存峰值膨胀倍数 | 备注                           |
+|--------------------------------|-----------------------------------|-----------------|----------|--------------------------------|--------------------------------|--------------------------------|--------------------------------|--------------------------------|
+| /        | /       | /           | /                 | 0.26 | 12.12 | 1            | 1                | 混精模式基线                   |
+| check    | 前      | []          | improve_precision | 1.02 | 12.27 | 3.92 | 1.01 |                                |
+| check    | 反      | []          | add_noise         | 11.15 | 12.67 | 42.88 | 1.05 |                                |
+| fix      | 前      | []          | improve_precision | 0.95 | 12.82 | 3.65 | 1.06 |                                |
+### Bloom-7B
+NUM_LAYER：1，1卡，主要数据类型：FLOAT16，模型来源: ascend/ModelLink
+| 处理模式                     | 前/反向                 | 算子范围                 | 扰动方式 | 耗时（s） | 显存峰值（GB） | 耗时膨胀倍数 | 显存峰值膨胀倍数 | 备注                           |
+|--------------------------------|-----------------------------------|-----------------|----------|--------------------------------|--------------------------------|--------------------------------|--------------------------------|--------------------------------|
+| /        | /       | /           | /                 | 0.14 | 9.51 | 1            | 1                | 混精模式基线                   |
+| check    | 前      | []          | improve_precision | 1.64 | 11.58 | 11.71 | 1.22 |                                |
+| check    | 反      | []          | add_noise         | 17.15 | 9.51 | 122.5 | 1 |                                |
+| fix      | 前      | []          | improve_precision | 0.87 | 10.62 | 6.21 | 1.12 |                                |
+### InternLM-7B
+NUM_LAYER：1，1卡，主要数据类型：FLOAT16，模型来源: ascend/ModelLink
+| 处理模式                     | 前/反向                 | 算子范围                 | 扰动方式 | 耗时（s） | 显存峰值（GB） | 耗时膨胀倍数 | 显存峰值膨胀倍数 | 备注                           |
+|--------------------------------|-----------------------------------|-----------------|----------|--------------------------------|--------------------------------|--------------------------------|--------------------------------|--------------------------------|
+| /        | /       | /           | /                 | 0.13 | 10.76 | 1            | 1                | 混精模式基线                   |
+| check    | 前      | []          | improve_precision | 1.19 | 11.68 | 9.15 | 1.09 |                                |
+| check    | 反      | []          | add_noise         | 11.69 | 10.89 | 89.92 | 1.01 |                                |
+| fix      | 前      | []          | improve_precision | 0.75 | 11.68 | 5.77 | 1.09 |                                |
+### Qwen-7B
+NUM_LAYER：1，1卡，主要数据类型：FLOAT16，模型来源: ascend/ModelLink
+| 处理模式                     | 前/反向                 | 算子范围                 | 扰动方式 | 耗时（s） | 显存峰值（GB） | 耗时膨胀倍数 | 显存峰值膨胀倍数 | 备注                           |
+|--------------------------------|-----------------------------------|-----------------|----------|--------------------------------|--------------------------------|--------------------------------|--------------------------------|--------------------------------|
+| /        | /       | /           | /                 | 0.28 | 18.41 | 1            | 1                | 混精模式基线                   |
+| check    | 前      | []          | improve_precision | 2.34 | 23.18 | 8.36 | 1.26 |                                |
+| check    | 反      | []          | add_noise         | 22.07 | 19.47 | 78.82 | 1.06 |                                |
+| fix      | 前      | []          | improve_precision | 1.31 | 21.11 | 4.68 | 1.15 |                                |
+### Gemma-7B
+NUM_LAYER：1，1卡，主要数据类型：FLOAT16，模型来源: ascend/ModelLink
+| 处理模式                     | 前/反向                 | 算子范围                 | 扰动方式 | 耗时（s） | 显存峰值（GB） | 耗时膨胀倍数 | 显存峰值膨胀倍数 | 备注                           |
+|--------------------------------|-----------------------------------|-----------------|----------|--------------------------------|--------------------------------|--------------------------------|--------------------------------|--------------------------------|
+| /        | /       | /           | /                 | 0.15 | 11.06 | 1            | 1                | 混精模式基线                   |
+| check    | 前      | []          | improve_precision | 1.49 | 13.17 | 9.93 | 1.19 |                                |
+| check    | 反      | []          | add_noise         | 16.69 | 11.06 | 111.27 | 1 |                                |
+| fix      | 前      | []          | improve_precision | 0.87 | 12.25 | 5.8 | 1.11 |                                |
+### ResNet50-Cifar
+1卡，主要数据类型：FLOAT16，模型来源: ascend/ModelZoo-PyTorch。
+主要算子为conv2d，每个step有51个, 因此对conv2d进行检测。
+CV模型、依赖mmcv实现（如果不修改mmcv代码、工具无法获取step信息和反向信息）。
+| 处理模式                     | 前/反向                 | 算子范围                 | 扰动方式 | 耗时（s） | 显存峰值（GB） | 耗时膨胀倍数 | 显存峰值膨胀倍数 | 备注                           |
+|--------------------------------|-----------------------------------|-----------------|----------|--------------------------------|--------------------------------|--------------------------------|--------------------------------|--------------------------------|
+| /        | /       | /           | /                 | 0.09      |  7.63         | 1            | 1                | 基线                   |
+| check    | 前      | ["conv2d"]  | improve_precision | 0.889      | 7.94          |  9.81       |  1.04            |                                |
+| fix      | 前      | ["conv2d"]  | improve_precision | 0.328      | 7.47          |  3.64       |  0.91            |                                |
+| fix      | 前      | ["conv2d"]  | to_cpu            | 12.23      | 7.47          |  135.88     |  0.91              |                                |
+### OpenSora1.0
+4卡，主要数据类型：FLOAT16，模型来源: ascend/ModelZoo-PyTorch
+每张卡每个step中linear算子个数为257个，FA算子个数为83（FA算子反向无效）。
+| 处理模式                     | 前/反向                 | 算子范围                 | 扰动方式 | 耗时（s） | 显存峰值（GB） | 耗时膨胀倍数 | 显存峰值膨胀倍数 | 备注                           |
+|--------------------------------|-----------------------------------|-----------------|----------|--------------------------------|--------------------------------|--------------------------------|--------------------------------|--------------------------------|
+| /        | /       | /           | /                 | 0.99 | 17.61 | 1            | 1                | 混精模式基线                   |
+| check    | 前      | ["linear","npu_fusion_attention"] | improve_precision | 3.88 | 17.61 | 3.92 | 1 |                                |
+| check | 前 | ["linear","npu_fusion_attention"] | add_noise | 3.46 | 17.61 | 3.49 | 1 | |
+| check | 反 | ["linear"] | improve_precision | 12.61 | 17.61 | 12.74 | 1 | |
+| check    | 反      | ["linear"]  | add_noise         | 9.8 | 17.61 | 9.90 | 1 |                                |
+| fix      | 前      | ["linear"] | to_cpu            | 18.83 | 17.61 | 19.02 | 1 |                                |
+| fix | 前 | ["linear"] | improve_precision | 2.83 | 17.61 | 2.86 | 1 | |

msprobe/pytorch/free_benchmark/common/constant.py CHANGED Viewed

@@ -52,6 +52,7 @@ class ThresholdConfig:
     DTYPE_PER_THD = {
         torch.float16: 1.002,
+        torch.bfloat16: 1.004,
         torch.float32: 1.0002,
     }
     BENCHMARK_THD_DICT = {
@@ -60,6 +61,8 @@ class ThresholdConfig:
         torch.bfloat16: BenchmarkThd(2**-8, 1.0, 2**-8, 1e-4),
     }
+    TENSOR_SPLIT_MAX_CHUNK = 128
 class PreheatConfig:
     IF_PREHEAT = "if_preheat"

msprobe/pytorch/free_benchmark/common/utils.py CHANGED Viewed

@@ -96,3 +96,7 @@ class TorchC:
     add = torch._C._VariableFunctionsClass.add
     bitwise_xor = torch._C._VariableFunctionsClass.bitwise_xor
     clone = torch._C._VariableFunctionsClass.clone
+    clamp = torch._C._VariableFunctionsClass.clamp
+    tensor_split = torch._C._VariableFunctionsClass.tensor_split
+    stack = torch._C._VariableFunctionsClass.stack
+    reshape = torch._C._VariableFunctionsClass.reshape

msprobe/pytorch/free_benchmark/compare/grad_saver.py CHANGED Viewed

@@ -2,7 +2,7 @@ import torch
 from msprobe.core.common.exceptions import FreeBenchmarkException
 from msprobe.pytorch.free_benchmark import logger
 from msprobe.pytorch.free_benchmark.common.constant import CommonField
-from msprobe.pytorch.free_benchmark.common.params import DataParams, HandlerParams
+from msprobe.pytorch.free_benchmark.common.params import DataParams, HandlerParams, data_pre_deal
 from msprobe.pytorch.free_benchmark.perturbed_layers.layer_factory import LayerFactory
 from msprobe.pytorch.free_benchmark.result_handlers.handler_factory import (
     FuzzHandlerFactory,
@@ -16,7 +16,6 @@ class GradSaver:
         self.handler_params = handler_params
         self.api_name = handler_params.api_name
         self.origin_func = origin_func
-        self.data_params = DataParams()
         self.is_compare = True
         self.kwargs = dict()
         self.perturbed_grad_input = tuple()
@@ -61,28 +60,25 @@ class GradSaver:
                 _index += 1
     def compare_grad_results(self, handler, origin_grad, perturbed_grad, index):
-        # TODO get dtype?
-        self.data_params.original_result = origin_grad
-        self.data_params.perturbed_result = perturbed_grad
-        self.data_params.grad_unequal_flag = False
-        self.data_params.valid_input_index = index
+        data_params = DataParams()
+        data_params.original_result = origin_grad
+        data_params.perturbed_result = perturbed_grad
+        data_params.grad_unequal_flag = False
+        data_params.valid_input_index = index
         try:
-            handler.handle(self.data_params)
-            if not self.data_params.is_consistent:
+            handler.handle(data_params)
+            if not data_params.is_consistent:
                 self.is_compare = False
-                self.data_params.grad_unequal_flag = True
-                self.data_params.is_consistent = True
-                self.data_params.perturbed_result = self.perturbed_grad_input
-                self.data_params.original_result = self.origin_grad_input
-                handler.handle(self.data_params)
+                data_params.grad_unequal_flag = True
+                data_params.is_consistent = True
+                data_params.perturbed_result = self.perturbed_grad_input
+                data_params.original_result = self.origin_grad_input
+                handler.handle(data_params)
         except Exception as e:
             logger.warning_on_rank_0(
                 f"[msprobe] Free benchmark: compare two vjp failed: api:{self.handler_params.api_name}."
                 f"{e}"
             )
-        # 在扰动前后输出对比后释放输出的引用
-        self.data_params.perturbed_result = None
-        self.data_params.original_result = None
     def check_grad_input(self, origin_grad, new_grad_index):
         if self.perturbed_grad_input is None:
@@ -164,20 +160,20 @@ class GradSaver:
         return grad_input
     def calculate_perturbed_grad_input(self, grad_output, need_grad_tensors, inner_args):
-        self.data_params.args = [need_grad_tensors, grad_output, inner_args]
-        self.data_params.kwargs = {}
-        self.data_params.valid_input_index = 0
-        self.data_params.origin_func = self.get_grad_input_from_vjp
+        data_params = data_pre_deal(
+            self.handler_params.api_name,
+            self.get_grad_input_from_vjp,
+            [need_grad_tensors, grad_output, inner_args],
+            {}
+        )
         layer = LayerFactory.create(
             self.handler_params.api_name,
             self.handler_params.fuzz_device,
             self.handler_params.pert_mode,
         )
-        layer.handle(self.data_params)
-        # 在计算扰动输出之后，释放输入的引用
-        self.data_params.args = None
+        layer.handle(data_params)
         # 确定扰动成功后，才会暂存
-        if self.data_params.perturbed_result:
+        if data_params.perturbed_result:
             self.perturbed_grad_input = tuple(
-                [x.cpu() for x in self.data_params.perturbed_result]
+                [x.cpu() for x in data_params.perturbed_result]
             )

msprobe/pytorch/free_benchmark/main.py CHANGED Viewed

@@ -10,7 +10,10 @@ from msprobe.pytorch.free_benchmark.common.enums import (
     HandlerType,
     PerturbationMode,
 )
-from msprobe.pytorch.free_benchmark.common.params import data_pre_deal, make_handler_params
+from msprobe.pytorch.free_benchmark.common.params import (
+    data_pre_deal,
+    make_handler_params,
+)
 from msprobe.pytorch.free_benchmark.compare.grad_saver import GradSaver
 from msprobe.pytorch.free_benchmark.perturbed_layers.layer_factory import LayerFactory
 from msprobe.pytorch.free_benchmark.result_handlers.handler_factory import (
@@ -70,9 +73,9 @@ class FreeBenchmarkCheck(ABC):
         layer.handle(data_params)
         handler_params = make_handler_params(name, self.config, self.current_iter)
         handler = FuzzHandlerFactory.create(handler_params)
-        handler.handle(data_params)
-        return data_params.perturbed_result, handler.get_unequal_rows()
+        perturbed_output = handler.handle(data_params)
+        return perturbed_output, handler.get_unequal_rows()
     def backward(self, name, module, grad_output):
         if not self.config.fuzz_stage == Const.BACKWARD:

msprobe/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py CHANGED Viewed

@@ -32,7 +32,7 @@ class AddNoiseLayer(NpuBaseLayer):
             return type(tensor_obj)([self.add_noise(value) for value in tensor_obj])
         return tensor_obj
-    def handle(self, params: DataParams) -> torch.Any:
+    def handle(self, params: DataParams):
         """
         对输入添加扰动并返回
         """

msprobe/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py CHANGED Viewed

@@ -48,7 +48,7 @@ class BitNoiseLayer(NpuBaseLayer):
             return type(tensor_obj)([self.add_bit_noise(value) for value in tensor_obj])
         return tensor_obj
-    def handle(self, params: DataParams) -> torch.Any:
+    def handle(self, params: DataParams):
         """
         对输入添加扰动并返回
         """

msprobe/pytorch/free_benchmark/perturbed_layers/npu/change_value.py CHANGED Viewed

@@ -39,7 +39,7 @@ class ChangeValueLayer(NpuBaseLayer):
             return type(tensor_obj)([self.change_value(value) for value in tensor_obj])
         return tensor_obj
-    def handle(self, params: DataParams) -> torch.Any:
+    def handle(self, params: DataParams):
         """
         对输入添加扰动并返回
         """

msprobe/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py CHANGED Viewed

@@ -17,7 +17,7 @@ class ImprovePrecisionLayer(NpuBaseLayer):
             and torch.is_floating_point(tensor_obj)
             and tensor_obj.dtype not in [torch.float32, torch.float64]
         ):
-            self._set_improve_valus(tensor_obj)
+            self._set_improve_values(tensor_obj)
             tensor_obj = self._change_dtype(tensor_obj)
             self.is_added = True
             return tensor_obj
@@ -32,7 +32,7 @@ class ImprovePrecisionLayer(NpuBaseLayer):
             )
         return tensor_obj
-    def handle(self, params: DataParams) -> torch.Any:
+    def handle(self, params: DataParams):
         logger.info_on_rank_0(
             f"[msprobe] Free benchmark: Perturbation is "
             f"{PerturbationMode.IMPROVE_PRECISION} of {self.api_name}."
@@ -50,7 +50,7 @@ class ImprovePrecisionLayer(NpuBaseLayer):
         params.perturbed_result = params.origin_func(*new_args, **new_kwargs)
         return params.perturbed_result
-    def _set_improve_valus(self, inputs):
+    def _set_improve_values(self, inputs):
         if inputs.dtype in [torch.float16, torch.bfloat16]:
             self.perturbed_value = torch.float32

msprobe/pytorch/free_benchmark/perturbed_layers/npu/no_change.py CHANGED Viewed

@@ -16,7 +16,7 @@ class NoChangeLayer(NpuBaseLayer):
         self.is_added = True
         return tensor_obj
-    def handle(self, params: DataParams) -> torch.Any:
+    def handle(self, params: DataParams):
         """
         对输入添加扰动并返回
         """

msprobe/pytorch/free_benchmark/perturbed_layers/run_cpu.py CHANGED Viewed

@@ -8,7 +8,7 @@ from msprobe.pytorch.free_benchmark.perturbed_layers.base_layer import BaseLayer
 class CpuLayer(BaseLayer):
-    def handle(self, params: DataParams) -> torch.Any:
+    def handle(self, params: DataParams):
         logger.info_on_rank_0(
             f"[msprobe] Free benchmark: Perturbation is to_cpu of {self.api_name}."

msprobe/pytorch/free_benchmark/result_handlers/base_handler.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import math
 from abc import ABC, abstractmethod
 from typing import Any, Optional, Tuple
+import numpy as np
 import torch
 from msprobe.core.common.const import Const
@@ -34,15 +35,36 @@ class FuzzHandler(ABC):
             origin_ouput = origin_ouput.values
             perturbed_output = perturbed_output.values
         if hasattr(perturbed_output, "dtype"):
-            abs_tol = ThresholdConfig.ABS_TOL_VALUE_DICT.get(perturbed_output.dtype)
+            abs_tol = ThresholdConfig.ABS_TOL_VALUE_DICT.get(perturbed_output.dtype, FuzzThreshold.F32_THD)
         else:
-            abs_tol = FuzzThreshold.F32_THD.value
+            abs_tol = FuzzThreshold.F32_THD
         return (
             origin_ouput.to(perturbed_output.dtype).to(perturbed_output.device),
             perturbed_output,
             abs_tol,
         )
+    @staticmethod
+    def tensor_split_for_error_calculate(origin_output, perturbed_output):
+        """
+        对将投入误差值计算的扰动前后输出张量进行分块
+        :param origin_output: 原始输出
+        :param perturbed_output: 扰动后输出
+        :return origin_output_chunks: 切块后原始输出列表
+        :return perturbed_output_chunks: 切块后扰动后输出列表
+        """
+        single_output_mem = origin_output.element_size() * origin_output.nelement() / Const.ONE_MB
+        if single_output_mem == 0 or origin_output.ndim == 0:
+            return [origin_output], [perturbed_output]
+        # 张量大小和批数之间的关系：chunks_exp=math.log(M,2)-4, chunks=2**chunks_exp (M为对比张量数据大小[Mb])
+        chunks_exp = int(math.log(single_output_mem, 2)) - 4
+        chunks = 2 ** chunks_exp
+        chunks = max(chunks, 1)
+        chunks = min(chunks, ThresholdConfig.TENSOR_SPLIT_MAX_CHUNK)
+        origin_output_chunks = TorchC.tensor_split(TorchC.reshape(origin_output, (-1,)), chunks)
+        perturbed_output_chunks = TorchC.tensor_split(TorchC.reshape(perturbed_output, (-1,)), chunks)
+        return origin_output_chunks, perturbed_output_chunks
     @staticmethod
     def convert_overflow_ratio_to_consistent(ratio):
         if math.isnan(ratio) or math.isinf(ratio):
@@ -61,36 +83,28 @@ class FuzzHandler(ABC):
             self, origin_output, perturbed_output, norm_type, abs_tol
     ):
         if norm_type == NormType.ENDLESS_NORM:
-            return self.get_endless_norm(origin_output, perturbed_output, abs_tol)
+            return self.calculate_error(origin_output, perturbed_output, abs_tol)
         return ThresholdConfig.COMP_CONSISTENT
-    def get_endless_norm(self, origin_output, perturbed_output, abs_tol):
-        ratio_tensor1 = TorchC.where(
-            TorchC.gt(TorchC.abs(perturbed_output), abs_tol),
-            TorchC.div(
-                TorchC.abs(origin_output),
-                TorchC.add(TorchC.abs(perturbed_output), abs_tol),
-            ),
-            1,
-        )
-        ratio_tensor2 = TorchC.where(
-            TorchC.gt(TorchC.abs(origin_output), abs_tol),
-            TorchC.div(
-                TorchC.abs(perturbed_output),
-                TorchC.add(TorchC.abs(origin_output), abs_tol),
-            ),
-            1,
-        )
+    def calculate_error(self, origin_output, perturbed_output, abs_tol):
+        origin_output_chunks, perturbed_output_chunks = self.tensor_split_for_error_calculate(origin_output, perturbed_output)
+        norm1 = -np.inf
+        norm2 = -np.inf
+        norm3 = np.inf
+        for i, chunk_origin in enumerate(origin_output_chunks):
+            if chunk_origin.nelement() == 0:
+                break
+            chunk_perturbed = perturbed_output_chunks[i]
+            ratio_tensor1 = TorchC.where(TorchC.abs(chunk_perturbed) > abs_tol,
+                                         TorchC.div(TorchC.clamp(chunk_origin, min=abs_tol), TorchC.clamp(chunk_perturbed, min=abs_tol)), 1)
+            ratio_tensor2 = TorchC.where(TorchC.abs(chunk_origin) > abs_tol,
+                                         TorchC.div(TorchC.clamp(chunk_perturbed, min=abs_tol), TorchC.clamp(chunk_origin, min=abs_tol)), 1)
+            norm_values = TorchC.stack([TorchC.max(ratio_tensor1), TorchC.max(ratio_tensor2)])
+            max_ratio1, max_ratio2 = norm_values.tolist()
+            norm1 = max(norm1, self.convert_overflow_ratio_to_consistent(max_ratio1))
+            norm2 = max(norm2, self.convert_overflow_ratio_to_consistent(max_ratio2))
+            norm3 = min(norm3, self.convert_overflow_ratio_to_consistent(max_ratio1))
-        norm1 = self.convert_overflow_ratio_to_consistent(
-            TorchC.max(ratio_tensor1).item()
-        )
-        norm2 = self.convert_overflow_ratio_to_consistent(
-            TorchC.max(ratio_tensor2).item()
-        )
-        norm3 = self.convert_overflow_ratio_to_consistent(
-            TorchC.min(ratio_tensor1).item()
-        )
         if norm3 < 0:
             ratio = ThresholdConfig.SYMBOL_FLIPPING
         else:

msprobe/pytorch/free_benchmark/result_handlers/handler_factory.py CHANGED Viewed

@@ -22,7 +22,6 @@ class FuzzHandlerFactory:
             handler = FuzzHandlerFactory.result_handlers.get(params.handler_type)
         else:
             handler = FuzzHandlerFactory.result_handlers.get(HandlerType.PREHEAT)
-            # TODO
         if not handler:
             raise FreeBenchmarkException(
                 FreeBenchmarkException.UnsupportedType,

msprobe/pytorch/function_factory.py ADDED Viewed

@@ -0,0 +1,75 @@
+from msprobe.pytorch.common.utils import logger
+from msprobe.pytorch.bench_functions.apply_adam_w import npu_apply_adam_w
+from msprobe.pytorch.bench_functions.confusion_transpose import npu_confusion_transpose, \
+    npu_confusion_transpose_backward
+from msprobe.pytorch.bench_functions.fast_gelu import fast_gelu, npu_fast_gelu_backward
+from msprobe.pytorch.bench_functions.layer_norm_eval import npu_layer_norm_eval
+from msprobe.pytorch.bench_functions.linear import npu_linear, npu_linear_backward
+from msprobe.pytorch.bench_functions.matmul_backward import matmul_backward
+from msprobe.pytorch.bench_functions.npu_fusion_attention import npu_fusion_attention, npu_fusion_attention_grad
+from msprobe.pytorch.bench_functions.rms_norm import npu_rms_norm, npu_rms_norm_backward
+from msprobe.pytorch.bench_functions.rotary_mul import npu_rotary_mul, npu_rotary_mul_backward
+from msprobe.pytorch.bench_functions.scaled_mask_softmax import npu_scaled_masked_softmax, \
+    npu_scaled_masked_softmax_backward
+from msprobe.pytorch.bench_functions.swiglu import npu_swiglu, npu_swiglu_backward, swish_grad, swish
+class Register(dict):
+    def __init__(self, *args, **kwargs):
+        super(Register, self).__init__(*args, **kwargs)
+        self._dict = {}
+    def __call__(self, target_func_list):
+        for target in target_func_list:
+            self.register(target)
+        return
+    def __setitem__(self, key, value):
+        self._dict[key] = value
+    def __getitem__(self, key):
+        return self._dict[key]
+    def __contains__(self, key):
+        return key in self._dict
+    def __str__(self):
+        return str(self._dict)
+    def keys(self):
+        return self._dict.keys()
+    def values(self):
+        return self._dict.values()
+    def items(self):
+        return self._dict.items()
+    def register(self, target):
+        def add_register_item(key, value):
+            if key in self._dict:
+                logger.warning(f"{value.__name__} has been registered before, so we will overriden it.")
+            self[key] = value
+            return value
+        if callable(target):
+            return add_register_item(target.__name__, target)
+        else:
+            raise Exception(f"The func {target} is not callable.")
+# register for npu custom bench functions
+npu_custom_functions = Register()
+npu_custom_functions([
+    npu_apply_adam_w, npu_confusion_transpose, fast_gelu, npu_layer_norm_eval, npu_linear, npu_fusion_attention,
+    npu_rms_norm, npu_rotary_mul, npu_scaled_masked_softmax, npu_swiglu
+])
+# register for npu custom backward bench functions
+npu_custom_grad_functions = Register()
+npu_custom_grad_functions([
+    npu_confusion_transpose_backward, npu_fast_gelu_backward, npu_linear_backward, matmul_backward,
+    npu_fusion_attention_grad, npu_rms_norm_backward, npu_rotary_mul_backward, npu_scaled_masked_softmax_backward,
+    npu_swiglu_backward
+])

msprobe/pytorch/functional/dump_module.py CHANGED Viewed

@@ -3,7 +3,7 @@ from msprobe.pytorch.common.log import logger
 from msprobe.core.common.const import Const
 from msprobe.pytorch.hook_module.api_registry import api_register
 from msprobe.pytorch.debugger.precision_debugger import PrecisionDebugger
-from msprobe.core.common.exceptions import MsaccException
+from msprobe.core.common.exceptions import MsprobeException
 from msprobe.core.data_dump.scope import BaseScope
 module_count = {}
@@ -12,10 +12,10 @@ module_count = {}
 def module_dump(module, dump_name):
     if not isinstance(module, nn.Module):
         logger.error("The parameter:module in module_dump is not a Module subclass.")
-        raise MsaccException(MsaccException.INVALID_PARAM_ERROR)
+        raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR)
     if not isinstance(dump_name, str):
         logger.error("The parameter:dump_name in module_dump is not a str type.")
-        raise MsaccException(MsaccException.INVALID_PARAM_ERROR)
+        raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR)
     api_register.api_originality()
     if dump_name not in module_count:
         module_count[dump_name] = 0
@@ -24,7 +24,7 @@ def module_dump(module, dump_name):
     dump_name = dump_name + Const.SEP + str(module_count.get(dump_name)) + Const.SEP
     pdg = PrecisionDebugger()
-    _, forward_hook, backward_hook = pdg.service.build_hook(BaseScope.Module_Type_Module, dump_name)
+    _, forward_hook, backward_hook, _ = pdg.service.build_hook(BaseScope.Module_Type_Module, dump_name)
     module.register_forward_hook(forward_hook, with_kwargs=True)
     module.register_full_backward_hook(backward_hook)

msprobe/pytorch/grad_probe/__init__.py ADDED Viewed

File without changes

mindstudio-probe 1.0.1-py3-none-any.whl → 1.0.3-py3-none-any.whl

mindstudio-probe 1.0.1-py3-none-any.whl → 1.0.3-py3-none-any.whl