mindstudio-probe 1.0.1__py3-none-any.whl → 1.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (323)
  1. {mindstudio_probe-1.0.1.dist-info → mindstudio_probe-1.0.4.dist-info}/LICENSE +201 -201
  2. {mindstudio_probe-1.0.1.dist-info → mindstudio_probe-1.0.4.dist-info}/METADATA +36 -30
  3. mindstudio_probe-1.0.4.dist-info/RECORD +276 -0
  4. {mindstudio_probe-1.0.1.dist-info → mindstudio_probe-1.0.4.dist-info}/WHEEL +1 -1
  5. {mindstudio_probe-1.0.1.dist-info → mindstudio_probe-1.0.4.dist-info}/entry_points.txt +1 -0
  6. msprobe/README.md +101 -182
  7. msprobe/__init__.py +1 -0
  8. msprobe/{config/config.json → config.json} +49 -27
  9. msprobe/core/__init__.py +0 -0
  10. msprobe/{pytorch → core}/advisor/advisor.py +124 -124
  11. msprobe/{pytorch → core}/advisor/advisor_const.py +59 -59
  12. msprobe/{pytorch → core}/advisor/advisor_result.py +58 -58
  13. msprobe/core/common/const.py +341 -241
  14. msprobe/core/common/exceptions.py +100 -88
  15. msprobe/core/common/{file_check.py → file_utils.py} +478 -265
  16. msprobe/core/common/log.py +76 -55
  17. msprobe/core/common/utils.py +385 -516
  18. msprobe/core/common_config.py +85 -58
  19. msprobe/core/compare/acc_compare.py +300 -0
  20. msprobe/core/compare/check.py +95 -0
  21. msprobe/core/compare/compare_cli.py +49 -0
  22. msprobe/core/compare/highlight.py +223 -0
  23. msprobe/core/compare/multiprocessing_compute.py +149 -0
  24. msprobe/{pytorch → core}/compare/npy_compare.py +295 -244
  25. msprobe/core/compare/utils.py +430 -0
  26. msprobe/core/data_dump/data_collector.py +154 -140
  27. msprobe/core/data_dump/data_processor/base.py +314 -245
  28. msprobe/core/data_dump/data_processor/factory.py +59 -61
  29. msprobe/core/data_dump/data_processor/mindspore_processor.py +186 -0
  30. msprobe/core/data_dump/data_processor/pytorch_processor.py +366 -346
  31. msprobe/core/data_dump/json_writer.py +96 -116
  32. msprobe/core/data_dump/scope.py +178 -178
  33. msprobe/core/grad_probe/__init__.py +0 -0
  34. msprobe/core/grad_probe/constant.py +71 -0
  35. msprobe/core/grad_probe/grad_compare.py +171 -0
  36. msprobe/core/grad_probe/utils.py +64 -0
  37. msprobe/docs/01.installation.md +89 -0
  38. msprobe/docs/02.config_introduction.md +165 -0
  39. msprobe/docs/03.config_examples.md +247 -0
  40. msprobe/docs/04.acl_config_examples.md +76 -0
  41. msprobe/docs/05.data_dump_PyTorch.md +198 -0
  42. msprobe/docs/06.data_dump_MindSpore.md +243 -0
  43. msprobe/docs/07.accuracy_checker_PyTorch.md +274 -0
  44. msprobe/docs/08.accuracy_checker_online_PyTorch.md +198 -0
  45. msprobe/docs/09.accuracy_checker_MindSpore.md +68 -0
  46. msprobe/docs/10.accuracy_compare_PyTorch.md +245 -0
  47. msprobe/docs/11.accuracy_compare_MindSpore.md +202 -0
  48. msprobe/docs/12.overflow_check_PyTorch.md +79 -0
  49. msprobe/docs/13.overflow_check_MindSpore.md +31 -0
  50. msprobe/{pytorch/doc/parse_tool.md → docs/14.data_parse_PyTorch.md} +283 -286
  51. msprobe/docs/15.free_benchmarking_PyTorch.md +164 -0
  52. msprobe/docs/17.grad_probe.md +207 -0
  53. msprobe/docs/FAQ_PyTorch.md +177 -0
  54. msprobe/docs/S02.report_free_benchmarking_validation_performance_baseline.md +146 -0
  55. msprobe/docs/img/free_benchmark_framework.png +0 -0
  56. msprobe/docs/img/grad_probe_image-1.png +0 -0
  57. msprobe/docs/img/grad_probe_image-2.png +0 -0
  58. msprobe/docs/img/grad_probe_image-3.png +0 -0
  59. msprobe/docs/img/grad_probe_image-4.png +0 -0
  60. msprobe/docs/img/grad_probe_image.png +0 -0
  61. msprobe/mindspore/__init__.py +1 -1
  62. msprobe/mindspore/api_accuracy_checker/__init__.py +0 -0
  63. msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +255 -0
  64. msprobe/mindspore/api_accuracy_checker/api_info.py +69 -0
  65. msprobe/mindspore/api_accuracy_checker/api_runner.py +156 -0
  66. msprobe/mindspore/api_accuracy_checker/base_compare_algorithm.py +197 -0
  67. msprobe/mindspore/api_accuracy_checker/cmd_parser.py +6 -0
  68. msprobe/mindspore/api_accuracy_checker/compute_element.py +239 -0
  69. msprobe/mindspore/api_accuracy_checker/main.py +9 -0
  70. msprobe/mindspore/api_accuracy_checker/type_mapping.py +114 -0
  71. msprobe/mindspore/api_accuracy_checker/utils.py +80 -0
  72. msprobe/mindspore/cell_processor.py +34 -0
  73. msprobe/mindspore/common/const.py +106 -0
  74. msprobe/mindspore/common/log.py +38 -0
  75. msprobe/mindspore/common/utils.py +81 -0
  76. msprobe/mindspore/compare/distributed_compare.py +75 -0
  77. msprobe/mindspore/compare/ms_compare.py +219 -0
  78. msprobe/mindspore/compare/ms_graph_compare.py +348 -0
  79. msprobe/mindspore/compare/ms_to_pt_api.yaml +399 -0
  80. msprobe/mindspore/debugger/debugger_config.py +66 -51
  81. msprobe/mindspore/debugger/precision_debugger.py +126 -32
  82. msprobe/mindspore/dump/dump_tool_factory.py +35 -38
  83. msprobe/mindspore/dump/hook_cell/api_registry.py +118 -0
  84. msprobe/mindspore/dump/hook_cell/hook_cell.py +55 -0
  85. msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml +922 -0
  86. msprobe/mindspore/dump/hook_cell/wrap_api.py +113 -0
  87. msprobe/mindspore/dump/jit_dump.py +72 -0
  88. msprobe/mindspore/dump/kernel_graph_dump.py +59 -60
  89. msprobe/mindspore/dump/kernel_kbyk_dump.py +64 -0
  90. msprobe/mindspore/free_benchmark/__init__.py +0 -0
  91. msprobe/mindspore/free_benchmark/api_pynative_self_check.py +116 -0
  92. msprobe/mindspore/free_benchmark/common/__init__.py +0 -0
  93. msprobe/mindspore/free_benchmark/common/config.py +12 -0
  94. msprobe/mindspore/free_benchmark/common/handler_params.py +17 -0
  95. msprobe/mindspore/free_benchmark/common/utils.py +71 -0
  96. msprobe/mindspore/free_benchmark/data/support_wrap_ops.yaml +842 -0
  97. msprobe/mindspore/free_benchmark/decorator/__init__.py +0 -0
  98. msprobe/mindspore/free_benchmark/decorator/dec_forward.py +43 -0
  99. msprobe/mindspore/free_benchmark/decorator/decorator_factory.py +107 -0
  100. msprobe/mindspore/free_benchmark/handler/__init__.py +0 -0
  101. msprobe/mindspore/free_benchmark/handler/base_handler.py +90 -0
  102. msprobe/mindspore/free_benchmark/handler/check_handler.py +41 -0
  103. msprobe/mindspore/free_benchmark/handler/fix_handler.py +36 -0
  104. msprobe/mindspore/free_benchmark/handler/handler_factory.py +21 -0
  105. msprobe/mindspore/free_benchmark/perturbation/add_noise.py +67 -0
  106. msprobe/mindspore/free_benchmark/perturbation/base_perturbation.py +21 -0
  107. msprobe/mindspore/free_benchmark/perturbation/bit_noise.py +63 -0
  108. msprobe/mindspore/free_benchmark/perturbation/exchange_value.py +51 -0
  109. msprobe/mindspore/free_benchmark/perturbation/improve_precision.py +35 -0
  110. msprobe/mindspore/free_benchmark/perturbation/no_change.py +12 -0
  111. msprobe/mindspore/free_benchmark/perturbation/perturbation_factory.py +29 -0
  112. msprobe/mindspore/free_benchmark/self_check_tool_factory.py +33 -0
  113. msprobe/mindspore/grad_probe/__init__.py +0 -0
  114. msprobe/mindspore/grad_probe/global_context.py +90 -0
  115. msprobe/mindspore/grad_probe/grad_analyzer.py +231 -0
  116. msprobe/mindspore/grad_probe/grad_monitor.py +27 -0
  117. msprobe/mindspore/grad_probe/grad_stat_csv.py +132 -0
  118. msprobe/mindspore/grad_probe/hook.py +94 -0
  119. msprobe/mindspore/grad_probe/utils.py +30 -0
  120. msprobe/mindspore/ms_config.py +128 -78
  121. msprobe/mindspore/overflow_check/kernel_graph_overflow_check.py +44 -45
  122. msprobe/mindspore/overflow_check/overflow_check_tool_factory.py +34 -32
  123. msprobe/mindspore/runtime.py +4 -0
  124. msprobe/mindspore/service.py +378 -0
  125. msprobe/mindspore/task_handler_factory.py +24 -21
  126. msprobe/msprobe.py +105 -67
  127. msprobe/pytorch/__init__.py +4 -4
  128. msprobe/pytorch/api_accuracy_checker/common/config.py +53 -50
  129. msprobe/pytorch/api_accuracy_checker/common/utils.py +214 -224
  130. msprobe/pytorch/api_accuracy_checker/compare/algorithm.py +213 -216
  131. msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +606 -545
  132. msprobe/pytorch/api_accuracy_checker/compare/api_precision_standard.yaml +132 -132
  133. msprobe/pytorch/api_accuracy_checker/compare/api_precision_threshold.yaml +390 -390
  134. msprobe/pytorch/api_accuracy_checker/compare/compare.py +386 -345
  135. msprobe/pytorch/api_accuracy_checker/compare/compare_column.py +73 -73
  136. msprobe/pytorch/api_accuracy_checker/compare/compare_utils.py +245 -248
  137. msprobe/pytorch/api_accuracy_checker/config.yaml +10 -4
  138. msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py +335 -328
  139. msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +200 -203
  140. msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +133 -127
  141. msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +592 -493
  142. msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py +70 -7
  143. msprobe/pytorch/api_accuracy_checker/run_ut/torch_ut_setting.json +7 -4
  144. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/__init__.py +0 -0
  145. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py +197 -0
  146. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +325 -0
  147. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +204 -0
  148. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +219 -0
  149. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/ssl_config.py +10 -0
  150. msprobe/pytorch/bench_functions/__init__.py +15 -0
  151. msprobe/pytorch/bench_functions/apply_adam_w.py +28 -0
  152. msprobe/pytorch/bench_functions/confusion_transpose.py +19 -0
  153. msprobe/pytorch/bench_functions/fast_gelu.py +55 -0
  154. msprobe/pytorch/bench_functions/layer_norm_eval.py +6 -0
  155. msprobe/pytorch/bench_functions/linear.py +12 -0
  156. msprobe/pytorch/bench_functions/matmul_backward.py +48 -0
  157. msprobe/pytorch/bench_functions/npu_fusion_attention.py +509 -0
  158. msprobe/pytorch/bench_functions/rms_norm.py +15 -0
  159. msprobe/pytorch/bench_functions/rotary_mul.py +52 -0
  160. msprobe/pytorch/bench_functions/scaled_mask_softmax.py +26 -0
  161. msprobe/pytorch/bench_functions/swiglu.py +55 -0
  162. msprobe/pytorch/common/__init__.py +2 -2
  163. msprobe/pytorch/common/compare_script.template +14 -14
  164. msprobe/pytorch/common/log.py +20 -31
  165. msprobe/pytorch/common/parse_json.py +39 -37
  166. msprobe/pytorch/common/utils.py +305 -224
  167. msprobe/pytorch/compare/distributed_compare.py +66 -111
  168. msprobe/pytorch/compare/mapping.yaml +607 -607
  169. msprobe/pytorch/compare/match.py +34 -36
  170. msprobe/pytorch/compare/pt_compare.py +50 -0
  171. msprobe/pytorch/debugger/debugger_config.py +95 -86
  172. msprobe/pytorch/debugger/precision_debugger.py +125 -95
  173. msprobe/pytorch/free_benchmark/__init__.py +8 -8
  174. msprobe/pytorch/free_benchmark/common/constant.py +70 -67
  175. msprobe/pytorch/free_benchmark/common/counter.py +71 -71
  176. msprobe/pytorch/free_benchmark/common/enums.py +37 -37
  177. msprobe/pytorch/free_benchmark/common/params.py +129 -129
  178. msprobe/pytorch/free_benchmark/common/utils.py +102 -98
  179. msprobe/pytorch/free_benchmark/compare/grad_saver.py +179 -183
  180. msprobe/pytorch/free_benchmark/compare/single_benchmark.py +104 -104
  181. msprobe/pytorch/free_benchmark/main.py +105 -102
  182. msprobe/pytorch/free_benchmark/perturbed_layers/base_layer.py +13 -13
  183. msprobe/pytorch/free_benchmark/perturbed_layers/layer_factory.py +41 -41
  184. msprobe/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py +90 -90
  185. msprobe/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py +104 -104
  186. msprobe/pytorch/free_benchmark/perturbed_layers/npu/change_value.py +63 -63
  187. msprobe/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py +68 -68
  188. msprobe/pytorch/free_benchmark/perturbed_layers/npu/no_change.py +28 -28
  189. msprobe/pytorch/free_benchmark/perturbed_layers/npu/npu_base_layser.py +45 -45
  190. msprobe/pytorch/free_benchmark/perturbed_layers/run_cpu.py +19 -19
  191. msprobe/pytorch/free_benchmark/result_handlers/base_handler.py +217 -203
  192. msprobe/pytorch/free_benchmark/result_handlers/check_handler.py +39 -39
  193. msprobe/pytorch/free_benchmark/result_handlers/fix_handler.py +23 -23
  194. msprobe/pytorch/free_benchmark/result_handlers/handler_factory.py +30 -31
  195. msprobe/pytorch/free_benchmark/result_handlers/preheat_handler.py +170 -170
  196. msprobe/pytorch/function_factory.py +76 -0
  197. msprobe/pytorch/functional/dump_module.py +39 -39
  198. msprobe/pytorch/grad_probe/__init__.py +0 -0
  199. msprobe/pytorch/grad_probe/grad_monitor.py +91 -0
  200. msprobe/pytorch/grad_probe/grad_stat_csv.py +129 -0
  201. msprobe/pytorch/hook_module/api_registry.py +161 -161
  202. msprobe/pytorch/hook_module/hook_module.py +120 -109
  203. msprobe/pytorch/hook_module/support_wrap_ops.yaml +1879 -1876
  204. msprobe/pytorch/hook_module/utils.py +30 -29
  205. msprobe/pytorch/hook_module/wrap_aten.py +110 -100
  206. msprobe/pytorch/hook_module/wrap_distributed.py +78 -75
  207. msprobe/pytorch/hook_module/wrap_functional.py +105 -108
  208. msprobe/pytorch/hook_module/wrap_npu_custom.py +93 -73
  209. msprobe/pytorch/hook_module/wrap_tensor.py +71 -72
  210. msprobe/pytorch/hook_module/wrap_torch.py +86 -88
  211. msprobe/pytorch/hook_module/wrap_vf.py +62 -64
  212. msprobe/pytorch/module_processer.py +138 -98
  213. msprobe/pytorch/online_dispatch/__init__.py +20 -20
  214. msprobe/pytorch/online_dispatch/compare.py +236 -236
  215. msprobe/pytorch/online_dispatch/dispatch.py +271 -273
  216. msprobe/pytorch/online_dispatch/dump_compare.py +155 -186
  217. msprobe/pytorch/online_dispatch/single_compare.py +391 -391
  218. msprobe/pytorch/online_dispatch/torch_ops_config.yaml +49 -49
  219. msprobe/pytorch/online_dispatch/utils.py +130 -187
  220. msprobe/pytorch/parse.py +4 -4
  221. msprobe/pytorch/parse_tool/cli.py +32 -32
  222. msprobe/pytorch/parse_tool/lib/compare.py +260 -259
  223. msprobe/pytorch/parse_tool/lib/config.py +52 -51
  224. msprobe/pytorch/parse_tool/lib/file_desc.py +31 -31
  225. msprobe/pytorch/parse_tool/lib/interactive_cli.py +102 -102
  226. msprobe/pytorch/parse_tool/lib/parse_exception.py +54 -54
  227. msprobe/pytorch/parse_tool/lib/parse_tool.py +158 -158
  228. msprobe/pytorch/parse_tool/lib/utils.py +316 -367
  229. msprobe/pytorch/parse_tool/lib/visualization.py +85 -90
  230. msprobe/pytorch/pt_config.py +188 -93
  231. msprobe/pytorch/service.py +246 -167
  232. mindstudio_probe-1.0.1.dist-info/RECORD +0 -228
  233. msprobe/config/README.md +0 -397
  234. msprobe/mindspore/doc/dump.md +0 -65
  235. msprobe/mindspore/dump/api_kbk_dump.py +0 -55
  236. msprobe/pytorch/compare/acc_compare.py +0 -1024
  237. msprobe/pytorch/compare/highlight.py +0 -100
  238. msprobe/pytorch/doc/FAQ.md +0 -193
  239. msprobe/pytorch/doc/api_accuracy_checker.md +0 -269
  240. msprobe/pytorch/doc/atat/321/207/342/226/223/342/225/233/321/205/342/225/221/320/266/321/205/342/225/226/320/265/321/205/320/225/342/225/226/321/206/320/245/342/226/221/321/206/320/235/320/276dump/321/206/320/260/320/227/321/205/320/227/320/226/321/206/320/220/320/267/321/210/320/223/342/225/234/321/205/320/257/342/225/221/321/207/342/225/221/342/224/220/321/206/320/232/320/265/321/205/320/241/320/232.md +0 -182
  241. msprobe/pytorch/doc/dump.md +0 -207
  242. msprobe/pytorch/doc/ptdbg_ascend_compare.md +0 -176
  243. msprobe/pytorch/doc/ptdbg_ascend_overview.md +0 -68
  244. msprobe/pytorch/doc/ptdbg_ascend_quickstart.md +0 -381
  245. msprobe/pytorch/doc/run_overflow_check.md +0 -25
  246. msprobe/pytorch/doc//321/205/320/254/320/270/321/207/342/225/221/342/224/220/321/207/342/226/223/342/225/233/321/205/342/225/221/320/266/321/206/320/277/320/244/321/205/320/277/342/225/243.md +0 -90
  247. msprobe/test/core_ut/common/test_utils.py +0 -345
  248. msprobe/test/core_ut/data_dump/test_data_collector.py +0 -47
  249. msprobe/test/core_ut/data_dump/test_json_writer.py +0 -183
  250. msprobe/test/core_ut/data_dump/test_scope.py +0 -151
  251. msprobe/test/core_ut/test_common_config.py +0 -152
  252. msprobe/test/core_ut/test_file_check.py +0 -218
  253. msprobe/test/core_ut/test_log.py +0 -109
  254. msprobe/test/mindspore_ut/test_api_kbk_dump.py +0 -51
  255. msprobe/test/mindspore_ut/test_debugger_config.py +0 -42
  256. msprobe/test/mindspore_ut/test_dump_tool_factory.py +0 -51
  257. msprobe/test/mindspore_ut/test_kernel_graph_dump.py +0 -66
  258. msprobe/test/mindspore_ut/test_kernel_graph_overflow_check.py +0 -63
  259. msprobe/test/mindspore_ut/test_ms_config.py +0 -69
  260. msprobe/test/mindspore_ut/test_overflow_check_tool_factory.py +0 -51
  261. msprobe/test/mindspore_ut/test_precision_debugger.py +0 -56
  262. msprobe/test/mindspore_ut/test_task_handler_factory.py +0 -58
  263. msprobe/test/pytorch_ut/advisor/test_advisor.py +0 -83
  264. msprobe/test/pytorch_ut/api_accuracy_checker/common/test_common_utils.py +0 -108
  265. msprobe/test/pytorch_ut/api_accuracy_checker/common/test_config.py +0 -39
  266. msprobe/test/pytorch_ut/api_accuracy_checker/compare/test_algorithm.py +0 -112
  267. msprobe/test/pytorch_ut/api_accuracy_checker/compare/test_api_precision_compare.py +0 -77
  268. msprobe/test/pytorch_ut/api_accuracy_checker/compare/test_compare.py +0 -125
  269. msprobe/test/pytorch_ut/api_accuracy_checker/compare/test_compare_column.py +0 -10
  270. msprobe/test/pytorch_ut/api_accuracy_checker/compare/test_compare_utils.py +0 -43
  271. msprobe/test/pytorch_ut/api_accuracy_checker/run_ut/dump.json +0 -179
  272. msprobe/test/pytorch_ut/api_accuracy_checker/run_ut/forward.json +0 -63
  273. msprobe/test/pytorch_ut/api_accuracy_checker/run_ut/test_data_generate.py +0 -99
  274. msprobe/test/pytorch_ut/api_accuracy_checker/run_ut/test_multi_run_ut.py +0 -115
  275. msprobe/test/pytorch_ut/api_accuracy_checker/run_ut/test_run_ut.py +0 -72
  276. msprobe/test/pytorch_ut/compare/test_acc_compare.py +0 -17
  277. msprobe/test/pytorch_ut/free_benchmark/perturbed_layers/test_perturbed_layser.py +0 -105
  278. msprobe/test/pytorch_ut/free_benchmark/result_handlers/test_result_handler.py +0 -121
  279. msprobe/test/pytorch_ut/free_benchmark/test_main.py +0 -101
  280. msprobe/test/pytorch_ut/functional/test_dump_module.py +0 -15
  281. msprobe/test/pytorch_ut/hook_module/test_api_registry.py +0 -130
  282. msprobe/test/pytorch_ut/hook_module/test_hook_module.py +0 -42
  283. msprobe/test/pytorch_ut/hook_module/test_wrap_aten.py +0 -65
  284. msprobe/test/pytorch_ut/hook_module/test_wrap_distributed.py +0 -35
  285. msprobe/test/pytorch_ut/hook_module/test_wrap_functional.py +0 -20
  286. msprobe/test/pytorch_ut/hook_module/test_wrap_tensor.py +0 -35
  287. msprobe/test/pytorch_ut/hook_module/test_wrap_torch.py +0 -43
  288. msprobe/test/pytorch_ut/hook_module/test_wrap_vf.py +0 -11
  289. msprobe/test/pytorch_ut/test_pt_config.py +0 -69
  290. msprobe/test/pytorch_ut/test_service.py +0 -59
  291. msprobe/test/resources/advisor.txt +0 -3
  292. msprobe/test/resources/compare_result_20230703104808.csv +0 -9
  293. msprobe/test/resources/compare_result_without_accuracy.csv +0 -9
  294. msprobe/test/resources/config.yaml +0 -3
  295. msprobe/test/resources/npu_test.pkl +0 -8
  296. msprobe/test/run_test.sh +0 -30
  297. msprobe/test/run_ut.py +0 -58
  298. msprobe/test/test_module_processer.py +0 -64
  299. {mindstudio_probe-1.0.1.dist-info → mindstudio_probe-1.0.4.dist-info}/top_level.txt +0 -0
  300. /msprobe/{pytorch/doc → docs}/img/BLOOM-7B_1.png +0 -0
  301. /msprobe/{pytorch/doc → docs}/img/BLOOM-7B_2.png +0 -0
  302. /msprobe/{pytorch/doc → docs}/img/BLOOM-7B_3.png +0 -0
  303. /msprobe/{pytorch/doc → docs}/img/BLOOM-7B_4.png +0 -0
  304. /msprobe/{pytorch/doc → docs}/img/GPT-3_1.png +0 -0
  305. /msprobe/{pytorch/doc → docs}/img/GPT-3_2.png +0 -0
  306. /msprobe/{pytorch/doc → docs}/img/GPT-3_3.png +0 -0
  307. /msprobe/{pytorch/doc → docs}/img/GPT-3_4.png +0 -0
  308. /msprobe/{pytorch/doc → docs}/img/GPT-3_5.png +0 -0
  309. /msprobe/{pytorch/doc → docs}/img/GPT-3_6.png +0 -0
  310. /msprobe/{pytorch/doc → docs}/img/GPT-3_7.png +0 -0
  311. /msprobe/{pytorch/doc → docs}/img/GPT-3_8.png +0 -0
  312. /msprobe/{pytorch/doc → docs}/img/YOLOV5S_1.png +0 -0
  313. /msprobe/{pytorch/doc → docs}/img/YOLOV5S_2.png +0 -0
  314. /msprobe/{pytorch/doc → docs}/img/accuracy_checking_details.png +0 -0
  315. /msprobe/{pytorch/doc → docs}/img/accuracy_checking_result.png +0 -0
  316. /msprobe/{pytorch/doc → docs}/img/api_precision_compare_details.png +0 -0
  317. /msprobe/{pytorch/doc → docs}/img/api_precision_compare_result.png +0 -0
  318. /msprobe/{pytorch/doc → docs}/img/auto_analyze_log.png +0 -0
  319. /msprobe/{pytorch/doc → docs}/img/compare_result_pkl.png +0 -0
  320. /msprobe/{pytorch/doc → docs}/img/compare_result_pkl_md5.png.png +0 -0
  321. /msprobe/{pytorch/doc → docs}/img/cpu_info.png +0 -0
  322. /msprobe/{config → docs}/img/free_benchmark.png +0 -0
  323. /msprobe/{pytorch/doc → docs}/img/module_compare.png +0 -0
msprobe/pytorch/bench_functions/npu_fusion_attention.py
@@ -0,0 +1,509 @@
+ import torch
+ import numpy as np
+ from einops import rearrange
+ try:
+     import torch_npu
+ except ImportError:
+     is_gpu = True
+     try:
+         # flash_attn is the third-party FlashAttention library for GPU
+         from flash_attn import flash_attn_func
+     except ImportError:
+         # If this is a CPU-only UT environment, do nothing
+         pass
+ else:
+     is_gpu = False
+
+
+ from msprobe.pytorch.common.utils import logger
+ from msprobe.core.common.const import Const, CompareConst
+
+ gtype = torch.float64  # On ARM hosts float64 is required; on x86 float32 is enough (float64 also works). ARM is slow, so x86 is recommended for s=8k cases
+ softmax_build_mode = "QKV"  # "MAX_SUM"
+
+ """
+ # Forward function signature comparison
+ Golden implementation: fusion_attention_forward: q, k, v, drop_mask, atten_mask, pse, scale, keep_prob
+ Fused operator: npu_fusion_attention_forward: query, key, value, head_num, input_layout, *, pse=None, padding_mask=None,
+                                               atten_mask=None, scale=1.0, keep_prob=1.0, pre_tockens=2147483647,
+                                               next_tockens=2147483647, inner_precise=0, prefix=None, sparse_mode=0,
+                                               gen_mask_parallel=True, sync=False
+
+ # Backward function signature comparison
+ Golden implementation: fusion_attention_backward: dx, q, k, v, softmax_res, drop_mask, pse, scale, keep_prob
+ Fused operator: npu_fusion_attention_backward: query, key, value, dy, head_num, input_layout, *, pse=None, padding_mask=None,
+                                                atten_mask=None, softmax_max=None, softmax_sum=None, softmax_in=None,
+                                                attention_in=None, scale_value=1.0, keep_prob=1.0, pre_tockens=2147483647,
+                                                next_tockens=2147483647, inner_precise=0, seed=0, offset=0,
+                                                numels=0, prefix=None, sparse_mode=0, gen_mask_parallel=True, sync=False
+ """
+
+
+ def softmax_forward(x):
+     x_max = torch.max(x, dim=-1, keepdims=True)[0]
+     x_sub = x.sub(x_max)
+     y = torch.exp(x_sub)
+     x_sum = y.sum(dim=-1, keepdims=True)
+     res = y.div(x_sum)
+     return res, x_max, x_sum
+
+
+ def softmax_grad(dp, softmax_res):
+     muls = dp * softmax_res
+     muls_r = muls.sum(dim=-1, keepdims=True)
+     sub_r = dp - muls_r
+     res = sub_r * softmax_res
+     return res
+
+
+ def broadcast_kv(num_heads, num_kv_heads, kv_tensor, dtype):
+     if num_kv_heads == 0 or num_kv_heads > num_heads:
+         raise ValueError(f"num_kv_heads must be non-zero and no bigger than num_heads.")
+
+     factor = num_heads // num_kv_heads
+     kv_shape = kv_tensor.shape
+     B = kv_shape[0]
+     S = kv_shape[2]
+     D = kv_shape[3]
+     kv_res = torch.zeros([B, num_heads, S, D]).to(dtype)
+     for i in range(num_heads):
+         j = i // factor
+         kv_res[:, i:i + 1, :, :] = kv_tensor[:, j:j + 1, :, :]
+     return kv_res
+
+
+ def calculate_qk(q, k, atten_mask, pse, scale):
+     if pse is None or len(pse.shape) == 0:
+         qk = torch.matmul(q, k.permute(0, 1, 3, 2)).mul(scale)
+     else:
+         qk = (torch.matmul(q, k.permute(0, 1, 3, 2)) + pse).mul(scale)
+     if atten_mask is None or len(atten_mask.shape) == 0:
+         return qk
+     else:
+         qk = qk + atten_mask.bool() * (-40000.0)  # -10000
+         return qk
+
+
+ def fusion_attention_forward(q, k, v, drop_mask, atten_mask, pse, scale, keep_prob):
+     qk = calculate_qk(q, k, atten_mask, pse, scale)
+     softmax_res, softmax_max, softmax_sum = softmax_forward(qk)
+     if drop_mask is None or len(drop_mask.shape) == 0:
+         drop_res = softmax_res
+     else:
+         drop_res = softmax_res * drop_mask * (1.0 / keep_prob)
+     y = torch.matmul(drop_res, v)
+     return y, softmax_max, softmax_sum
+
+
+ def fusion_attention_backward(dx, q, k, v, softmax_res, drop_mask, pse, scale, keep_prob):
+     dp = torch.matmul(dx, v.permute(0, 1, 3, 2))
+     if drop_mask is None or len(drop_mask.shape) == 0:
+         drop_res = softmax_res.permute(0, 1, 3, 2)
+         dp_drop = dp
+     else:
+         drop_res = softmax_res.mul(drop_mask).mul(1.0 / keep_prob).permute(0, 1, 3, 2)
+         dp_drop = dp * drop_mask * (1.0 / keep_prob)
+     dv = torch.matmul(drop_res, dx)
+     softmax_grad_res = (softmax_grad(dp_drop, softmax_res) * scale)
+     dq = torch.matmul(softmax_grad_res, k)
+     dk = torch.matmul(softmax_grad_res.permute(0, 1, 3, 2), q)
+     return dq, dk, dv
+
+
+ def parse_bsnd_args(query, key, head_num, input_layout):
+     supported_input_layout = ["BSH", "SBH", "BSND", "BNSD", "TND"]
+     B, S1, S2, N1, N2, D, H1, H2 = None, None, None, head_num, None, None, None, None
+
+     if not isinstance(input_layout, str) or input_layout not in supported_input_layout:
+         raise ValueError(f"Invalid input_layout arg which must be one of {supported_input_layout}.")
+
+     if input_layout == "TND":
+         raise ValueError(f"input_layout {input_layout} is not supported for now.")
+     try:
+         if input_layout == "BSH":
+             B, S1, H1 = query.shape
+             _, S2, H2 = key.shape
+             D = H1 // N1
+             N2 = H2 // D
+         elif input_layout == "SBH":
+             S1, B, H1 = query.shape
+             S2, _, H2 = key.shape
+             D = H1 // N1
+             N2 = H2 // D
+         elif input_layout == "BSND":
+             B, S1, N1, D = query.shape
+             _, S2, N2, _ = key.shape
+             H1 = N1 * D
+             H2 = N2 * D
+         elif input_layout == "BNSD":
+             B, N1, S1, D = query.shape
+             _, N2, S2, _ = key.shape
+             H1 = N1 * D
+             H2 = N2 * D
+     except Exception as e:
+         raise ValueError(f"query.shape: {query.shape}, key.shape: {key.shape}, parse_bsnd_args error: {e}") from e
+
+     if D == 0:
+         raise ValueError(f"Value D must be non-zero.")
+     DTYPE = query.dtype
+     return B, S1, S2, N1, N2, D, H1, H2, DTYPE
+
+
+ def convert_from_bnsd(_input, input_layout):
+     if input_layout == "BSH":
+         # (B,N,S,D)=>(B,S,N*D)
+         out = rearrange(_input, 'b n s d -> b s (n d)').contiguous()
+     elif input_layout == "SBH":
+         # (B,N,S,D)=>(S,B,N*D)
+         out = rearrange(_input, 'b n s d -> s b (n d)').contiguous()
+     elif input_layout == "BSND":
+         # (B,N,S,D)=>(B,S,N,D)
+         out = rearrange(_input, 'b n s d -> b s n d').contiguous()
+     elif input_layout == "TND":
+         raise ValueError(f"input_layout {input_layout} is not supported for now.")
+     else:
+         out = _input
+     return out
+
+
+ def convert_to_bnsd(_input, n, input_layout):
+     # The default "BNSD" layout needs no processing
+     if input_layout == "BSH":
+         # (B,S,N*D)=>(B,N,S,D)
+         out = rearrange(_input, 'b s (n d) -> b n s d', n=n)
+     elif input_layout == "SBH":
+         # (S,B,N*D)=>(B,N,S,D)
+         out = rearrange(_input, 's b (n d) -> b n s d', n=n)
+     elif input_layout == "BSND":
+         # (B,S,N,D)=>(B,N,S,D)
+         out = rearrange(_input, 'b s n d -> b n s d', n=n)
+     elif input_layout == "TND":
+         raise ValueError(f"input_layout {input_layout} is not supported for now.")
+     else:
+         out = _input
+     if out.dim() != 4:
+         raise ValueError(f"convert qkv format failed with input_layout {input_layout}.")
+     return out.to(gtype)
+
+
+ def generate_atten_mask(sparse_mode, atten_mask, B, N1, S1, S2, pre_tocken, next_tocken, dtype):
+     """
+     # When sparse_mode is 2, 3 or 4, the small-op-to-fused-op path applies this optimization; to go back, it must be decomposed into the original basic implementation
+     ===> atten_mask = torch.from_numpy(np.triu(np.ones([2048, 2048]), k=1)).to(dtype)
+     """
+     shape = [S1, S2]
+
+     if atten_mask is not None:
+         # If FA's input already contains atten_mask, it can be treated as the already-converted mask matrix; there are three special (sparse) scenarios that must be reverted
+         if sparse_mode == 2 or sparse_mode == 3 or sparse_mode == 4:
+             logger.info(f"S1: {S1}, S2:{S2}, atten_mask.shape:{atten_mask.shape}, atten_mask.dtype:{atten_mask.dtype}")
+
+             if atten_mask.dim() == 2 and atten_mask.shape[0] == 2048 and atten_mask.shape[1] == 2048:
+                 if atten_mask.equal(torch.from_numpy(np.triu(np.ones([2048, 2048]), k=1)).to(atten_mask.dtype)):
+                     if sparse_mode == 2:
+                         atten_mask = torch.from_numpy(np.triu(np.ones(shape), k=1))
+                     elif sparse_mode == 3:
+                         atten_mask = torch.from_numpy(np.triu(np.ones(shape), k=S2 - S1 + 1))
+                     elif sparse_mode == 4:
+                         atten_mask_u = torch.from_numpy(np.triu(np.ones(shape), k=next_tocken + 1))
+                         atten_mask_l = torch.from_numpy(np.tril(np.ones(shape), k=-pre_tocken - 1))
+                         atten_mask = atten_mask_u + atten_mask_l
+                     logger.debug(f"Reverse-converted atten_mask {atten_mask.shape}")
+                     return atten_mask.to(dtype)
+
+         return atten_mask.to(dtype)
+
+     if atten_mask is not None:
+         if atten_mask.dim() == 2:
+             if atten_mask.shape[0] != S1 or atten_mask.shape[1] != S2:
+                 raise ValueError(f"Invalid atten_mask shape `SS` {atten_mask.shape}")
+             shape = [S1, S2]
+         elif atten_mask.dim() == 4:
+             if atten_mask.shape[1] == 1:
+                 shape = [B, 1, S1, S2] if B != 1 else [1, 1, S1, S2]
+             else:
+                 shape = [B, N1, S1, S2] if B != 1 else [1, N1, S1, S2]
+
+     if sparse_mode == 0:
+         atten_mask_u = torch.from_numpy(np.triu(np.ones(shape), k=next_tocken + 1))
+         atten_mask_l = torch.from_numpy(np.tril(np.ones(shape), k=-pre_tocken - 1))
+         atten_mask = atten_mask_u + atten_mask_l
+     elif sparse_mode == 1:  # no sparse
+         atten_mask = torch.from_numpy(np.zeros(shape))
+     elif sparse_mode == 2:
+         atten_mask = torch.from_numpy(np.triu(np.ones(shape), k=1))
+     elif sparse_mode == 3:
+         atten_mask = torch.from_numpy(np.triu(np.ones(shape), k=S2 - S1 + 1))
+     elif sparse_mode == 4:
+         atten_mask_u = torch.from_numpy(np.triu(np.ones(shape), k=next_tocken + 1))
+         atten_mask_l = torch.from_numpy(np.tril(np.ones(shape), k=-pre_tocken - 1))
+         atten_mask = atten_mask_u + atten_mask_l
+     # Note: sparse_mode=5 cannot occur here; that mode requires atten_mask to be passed in with a BNSS or B1SS layout,
+     # so FA's input can already be assumed to be the correct atten_mask
+     return atten_mask.to(dtype)
+
+
+ def generate_kv(key, value, N1, N2):
+     # Adaptation for unequal N (by cdy)
+     if not (N1 == N2):
+         k_new = broadcast_kv(N1, N2, key, key.dtype)
+         v_new = broadcast_kv(N1, N2, value, value.dtype)
+     else:
+         k_new = key
+         v_new = value
+     return k_new, v_new
+
+
+ def rebuid_softmax_by_qkv(q, k, atten_mask, pse, scale):
+     """
+     attention = softmax(QK^T/sqrt(d))V
+     softmax(x_i) = e^(x_i - x_max) / sum(e^(x_i - x_max))
+     """
+     logger.info("Using QKV to rebuild original softmax")
+     qk = calculate_qk(q, k, atten_mask, pse, scale)
+     softmax_res, x_max, x_sum = softmax_forward(qk)
+     return softmax_res
+
+
+ def rebuild_softmax_by_max_sum(q, k, atten_mask, pse, scale, softmax_max, softmax_sum):
+     """
+     attention = softmax(QK^T/sqrt(d))V
+     softmax(x_i) = e^(x_i - x_max_i) / x_sum_i
+     """
+     logger.info("Using softmax_max and softmax_sum to rebuild original softmax")
+     qk = calculate_qk(q, k, atten_mask, pse, scale)
+     if softmax_max.shape[-1] == 0:
+         raise ValueError(f"softmax_max.shape[-1] must be non-zero, softmax_max.shape: {softmax_max.shape}")
+     repeat_dim = qk.shape[-1] // softmax_max.shape[-1]
+     softmax_res = torch.exp(qk.sub(softmax_max.repeat(1, 1, 1, repeat_dim))).div(
+         softmax_sum.repeat(1, 1, 1, repeat_dim))
+     return softmax_res
+
+
+ def get_head_num(*args, **kwargs):
+     if kwargs.get("head_num", None):
+         head_num = kwargs.get("head_num")
+     elif len(args) >= 4:
+         head_num = args[3]
+     else:
+         raise ValueError(f"Unsupported npu_fusion_attention args {args}.")
+     return head_num
+
+
+ def get_input_layout(*args, **kwargs):
+     if kwargs.get("input_layout", None):
+         input_layout = kwargs.get("input_layout")
+     elif len(args) >= 5:
+         input_layout = args[4]
+     else:
+         raise ValueError(f"Unsupported npu_fusion_attention args {args}.")
+     return input_layout
+
+
+ def npu_fusion_attention_forward_patch(*args, **kwargs):
+     # query, key, value, head_num, input_layout
+     head_num = get_head_num(*args, **kwargs)
+     input_layout = get_input_layout(*args, **kwargs)
+
+     B, S1, S2, N1, N2, D, H1, H2, DTYPE = parse_bsnd_args(args[0], args[1], head_num, input_layout)
+     if N1 == N2 and S1 == S2:
+         logger.debug(f"running case : BNSD = {B}_{N1}_{S1}_{D}, sparse = {kwargs.get('sparse_mode', 0)}")
+     else:
+         logger.debug(f"running case: BNSD = {B}_{N1}({N2})_{S1}({S2})_{D}, sparse = {kwargs.get('sparse_mode', 0)}")
+     if not (N1 % N2 == 0 and N1 >= N2):
+         raise ValueError(f"N1 and N2 do not match, please check: N1 = {N1}, N2 = {N2}.")
+
+     dims_kwargs = {"B": B, "S1": S1, "S2": S2, "N1": N1, "N2": N2,
+                    "D": D, "H1": H1, "H2": H2, "DTYPE": DTYPE}
+
+     new_kwargs = {"keep_prob": 1,
+                   "scale": kwargs.get("scale", 1 / (D ** 0.5)),
+                   "sparse_mode": kwargs.get("sparse_mode", 0),
+                   "prefix": kwargs.get("prefix"),
+                   "pre_tockens": kwargs.get("pre_tockens", 2147483647),
+                   "next_tockens": kwargs.get("next_tockens", 2147483647),
+                   "pse": kwargs.get("pse"),
+                   "padding_mask": kwargs.get("padding_mask"),
+                   "atten_mask": kwargs.get("atten_mask")}
+
+     return args, dims_kwargs, new_kwargs
+
+
+ def npu_fusion_attention_backward_patch(*args, **kwargs):
+     if len(args) != 6:
+         raise ValueError(f"Unsupported npu_fusion_attention_grad args {args}.")
+
+     B, S1, S2, N1, N2, D, H1, H2, DTYPE = parse_bsnd_args(args[0], args[1], args[4], args[5])
+     if N1 == N2 and S1 == S2:
+         logger.info(f"running case : BNSD = {B}_{N1}_{S1}_{D}, sparse = {kwargs.get('sparse_mode', 0)}")
+     else:
+         logger.info(f"running case: BNSD = {B}_{N1}({N2})_{S1}({S2})_{D}, sparse = {kwargs.get('sparse_mode', 0)}")
+     if not (N1 % N2 == 0 and N1 >= N2):
+         raise ValueError(f"N1 and N2 do not match, please check: N1 = {N1}, N2 = {N2}.")
+
+     dims_kwargs = {"B": B, "S1": S1, "S2": S2, "N1": N1, "N2": N2,
+                    "D": D, "H1": H1, "H2": H2, "DTYPE": DTYPE}
+
+     new_kwargs = {"keep_prob": 1,
+                   "scale_value": kwargs.get("scale_value", 1 / (D ** 0.5)),
+                   "sparse_mode": kwargs.get("sparse_mode", 0),
+                   "prefix": kwargs.get("prefix"),
+                   "pre_tockens": kwargs.get("pre_tockens", 2147483647),
+                   "next_tockens": kwargs.get("next_tockens", 2147483647),
+                   "pse": kwargs.get("pse"),
+                   "padding_mask": kwargs.get("padding_mask"),
+                   "softmax_max": kwargs.get("softmax_max"),
+                   "softmax_sum": kwargs.get("softmax_sum"),
+                   "softmax_in": kwargs.get("softmax_in"),
+                   "attention_in": kwargs.get("attention_in"),
+                   "seed": kwargs.get("seed", 0),
+                   "offset": kwargs.get("offset", 0),
+                   "numels": kwargs.get("numels", 0),
+                   "atten_mask": kwargs.get("atten_mask")}
+
+     return args, dims_kwargs, new_kwargs
+
+
+ def npu_fusion_attention(*args, **kwargs):
+     new_args, dims_kwargs, new_kwargs = npu_fusion_attention_forward_patch(*args, **kwargs)
+     query, key, value = new_args[0], new_args[1], new_args[2]
+     input_layout = get_input_layout(*args, **kwargs)
+     N1 = dims_kwargs.get("N1")
+     N2 = dims_kwargs.get("N2")
+     S1 = dims_kwargs.get("S1")
+     S2 = dims_kwargs.get("S2")
+     B = dims_kwargs.get("B")
+     DTYPE = dims_kwargs.get("DTYPE")
+     atten_mask = new_kwargs.get("atten_mask")
+     keep_prob = new_kwargs.get("keep_prob")
+     sparse_mode = new_kwargs.get("sparse_mode")
+     pre_tockens = new_kwargs.get("pre_tockens")
+     next_tockens = new_kwargs.get("next_tockens")
+     pse = new_kwargs.get("pse")
+     scale = new_kwargs.get("scale")
+
+     atten_mask = generate_atten_mask(sparse_mode, atten_mask, B, N1, S1, S2, pre_tockens, next_tockens, DTYPE)
+     query = convert_to_bnsd(query, N1, input_layout)
+     key = convert_to_bnsd(key, N2, input_layout)
+     value = convert_to_bnsd(value, N2, input_layout)
+     k_new, v_new = generate_kv(key, value, N1, N2)
+     out_golden, softmax_max, softmax_sum = fusion_attention_forward(q=query, k=k_new, v=v_new,
+                                                                     drop_mask=None, atten_mask=atten_mask,
+                                                                     pse=pse, scale=scale,
+                                                                     keep_prob=keep_prob)
+     if out_golden.dim() == 5:
+         out_golden = out_golden.reshape(out_golden.size(0), out_golden.size(1) * out_golden.size(2), out_golden.size(3),
+                                         out_golden.size(4))
+     out_golden = convert_from_bnsd(out_golden, input_layout)
+
+     return out_golden.cpu(), softmax_max.repeat(1, 1, 1, 8).cpu(), softmax_sum.repeat(1, 1, 1, 8).cpu()
+
+
+ def npu_fusion_attention_grad(*args, **kwargs):
+     # dx, q, k, v, softmax_res, drop_mask, pse, scale, keep_prob
+     new_args, dims_kwargs, new_kwargs = npu_fusion_attention_backward_patch(*args, **kwargs)
+     query, key, value, dx, input_layout = new_args[0], new_args[1], new_args[2], new_args[3], new_args[5]
+     N1 = dims_kwargs.get("N1")
+     N2 = dims_kwargs.get("N2")
+     S1 = dims_kwargs.get("S1")
+     S2 = dims_kwargs.get("S2")
+     B = dims_kwargs.get("B")
+     D = dims_kwargs.get("D")
+     DTYPE = dims_kwargs.get("DTYPE")
+     atten_mask = new_kwargs.get("atten_mask")
+     keep_prob = new_kwargs.get("keep_prob")
+     sparse_mode = new_kwargs.get("sparse_mode")
+     pre_tockens = new_kwargs.get("pre_tockens")
+     next_tockens = new_kwargs.get("next_tockens")
+     pse = new_kwargs.get("pse")
+     softmax_max = new_kwargs.get("softmax_max")
+     softmax_sum = new_kwargs.get("softmax_sum")
+     scale_value = new_kwargs.get("scale_value")
+
+     atten_mask = generate_atten_mask(sparse_mode, atten_mask, B, N1, S1, S2, pre_tockens, next_tockens, DTYPE)
+     query = convert_to_bnsd(query, N1, input_layout)
+     dx = convert_to_bnsd(dx, N1, input_layout)
+     key = convert_to_bnsd(key, N2, input_layout)
+     value = convert_to_bnsd(value, N2, input_layout)
+     k_new, v_new = generate_kv(key, value, N1, N2)
+
+     if softmax_build_mode == "QKV":
+         softmax_res = rebuid_softmax_by_qkv(query, k_new, atten_mask, pse, scale_value)
+     else:
+         softmax_res = rebuild_softmax_by_max_sum(query, k_new, atten_mask, pse, scale_value, softmax_max, softmax_sum)
+
+     dq, dk, dv = fusion_attention_backward(dx, query, k_new, v_new, softmax_res, None, pse, scale_value, keep_prob)
+
+     # Adaptation for unequal N (by cdy)
+     if not (N1 == N2):
+         if N2 == 0:
+             raise ValueError("dims_kwargs.N2 must be non-zero.")
+         G = int(N1 / N2)
+         dk = torch.sum(dk.reshape(B, N2, G, S2, D), dim=2, keepdim=True).reshape(B, N2, S2, D)
+         dv = torch.sum(dv.reshape(B, N2, G, S2, D), dim=2, keepdim=True).reshape(B, N2, S2, D)
+
+     if dq.dim() == 5:
+         dq = dq.reshape(dq.size(0), dq.size(1) * dq.size(2), dq.size(3), dq.size(4))
+     if dk.dim() == 5:
+         dk = dk.reshape(dk.size(0), dk.size(1) * dk.size(2), dk.size(3), dk.size(4))
+     if dv.dim() == 5:
+         dv = dv.reshape(dv.size(0), dv.size(1) * dv.size(2), dv.size(3), dv.size(4))
+
+     dq = convert_from_bnsd(dq, input_layout)
+     dk = convert_from_bnsd(dk, input_layout)
+     dv = convert_from_bnsd(dv, input_layout)
+
+     return dq.cpu(), dk.cpu(), dv.cpu()
+
+
+ def is_attention_off_due_to_mask(atten_mask_dtype):
+     return not atten_mask_dtype
+
+
+ def is_attention_off_in_sparse_mode_4(sparse_mode, next_tockens, pre_tockens, S1):
+     return sparse_mode == 4 and (next_tockens != 0 or pre_tockens < S1)
+
+
+ def is_attention_off_in_sparse_mode_0(sparse_mode, pre_tockens, next_tockens, S1, S2):
+     return sparse_mode == 0 and pre_tockens >= S1 and next_tockens >= S2
+
+
+ def gpu_fusion_attention(*args, **kwargs):
+     deterministic = False
+     new_args, dims_kwargs, new_kwargs = npu_fusion_attention_forward_patch(*args, **kwargs)
+     query, key, value = new_args[0], new_args[1], new_args[2]
+     keep_prob = new_kwargs.get("keep_prob", 1.0)
+     scale = new_kwargs.get("scale")
+     N1 = dims_kwargs.get("N1")
+     N2 = dims_kwargs.get("N2")
+     S1 = dims_kwargs.get("S1")
+     S2 = dims_kwargs.get("S2")
+     B = dims_kwargs.get("B")
+     pse = new_kwargs.get("pse")
+     sparse_mode = new_kwargs.get("sparse_mode")
+     pre_tockens = new_kwargs.get("pre_tockens")
+     next_tockens = new_kwargs.get("next_tockens")
+     attn_mask = new_kwargs.get("atten_mask")
+     atten_mask_dtype = attn_mask.dtype if new_kwargs.get("atten_mask") is not None else None
+     pre_tockens = min(CompareConst.MAX_TOKENS, pre_tockens)
+     next_tockens = min(CompareConst.MAX_TOKENS, next_tockens)
+     atten_off = (is_attention_off_due_to_mask(atten_mask_dtype) or
+                  is_attention_off_in_sparse_mode_4(sparse_mode, next_tockens, pre_tockens, S1) or
+                  is_attention_off_in_sparse_mode_0(sparse_mode, pre_tockens, next_tockens, S1, S2))
+     causal_switch = not atten_off
+     if sparse_mode == CompareConst.SPECIAL_SPARSE_MOED:
+         window_left = pre_tockens
+         window_right = next_tockens
+     else:
+         pre_tockens = next_tockens = CompareConst.MAX_TOKENS
+         window_left = pre_tockens - S1 + S2
+         window_right = next_tockens + S1 - S2
+
+     if pse is not None:
+         alibi_slopes = torch.rand(B, N1, dtype=torch.float32) * 0.3
+     else:
+         alibi_slopes = None
+
+     out = flash_attn_func(query, key, value, dropout_p=(1 - keep_prob), softmax_scale=scale, causal=causal_switch,
+                           window_size=(window_left, window_right), alibi_slopes=alibi_slopes, deterministic=deterministic)
+     return out, Const.NONE, Const.NONE
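
The golden path above reduces npu_fusion_attention to plain BNSD attention math, softmax(QK^T * scale)V, with the softmax row max and sum returned alongside the output. A minimal, self-contained sketch (not part of the package; assumes PyTorch 2.0+ for scaled_dot_product_attention) showing that this math agrees with PyTorch's reference attention on a tiny unmasked case:

import torch
import torch.nn.functional as F

# Toy BNSD tensors; float64 mirrors the gtype used by the golden implementation.
torch.manual_seed(0)
B, N, S, D = 1, 2, 8, 16
q, k, v = (torch.randn(B, N, S, D, dtype=torch.float64) for _ in range(3))
scale = 1 / (D ** 0.5)

# Golden math: softmax(Q K^T * scale) V, as in fusion_attention_forward with no mask or dropout.
qk = torch.matmul(q, k.transpose(-1, -2)) * scale
golden = torch.matmul(torch.softmax(qk, dim=-1), v)

# PyTorch's own reference attention (default scale is 1/sqrt(D)).
reference = F.scaled_dot_product_attention(q, k, v)
print(torch.allclose(golden, reference, atol=1e-10))  # True
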
msprobe/pytorch/bench_functions/rms_norm.py
@@ -0,0 +1,15 @@
+ import torch
+
+
+ def npu_rms_norm(x, gamma, epsilon=1e-5):
+     rstd = torch.rsqrt(torch.mean(torch.pow(x, 2), axis=-1, keepdim=True) + epsilon)
+     res = x * rstd * gamma
+     return res, rstd.float()
+
+
+ def npu_rms_norm_backward(grad, x, gamma, rstd):
+     mean_gy = (grad * x * gamma * rstd).mean(dim=-1, keepdim=True)
+     grad_x = (grad * gamma - x * rstd * mean_gy) * rstd
+     grad_gamma = x * grad * rstd
+     return grad_x.cpu(), grad_gamma.cpu()
+
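
The backward formulas above can be cross-checked against autograd. A standalone sketch (not part of the diff), using a gamma with the same shape as x so no reduction over broadcast dimensions is involved:

import torch

torch.manual_seed(0)
eps = 1e-5
x = torch.randn(2, 3, 8, dtype=torch.float64, requires_grad=True)
gamma = torch.randn(2, 3, 8, dtype=torch.float64, requires_grad=True)
grad_out = torch.randn(2, 3, 8, dtype=torch.float64)

# Forward, same math as npu_rms_norm.
rstd = torch.rsqrt(torch.mean(x * x, dim=-1, keepdim=True) + eps)
y = x * rstd * gamma
y.backward(grad_out)

# Closed-form gradients, same math as npu_rms_norm_backward.
mean_gy = (grad_out * x * gamma * rstd).mean(dim=-1, keepdim=True)
grad_x = (grad_out * gamma - x * rstd * mean_gy) * rstd
grad_gamma = x * grad_out * rstd

print(torch.allclose(grad_x, x.grad))          # True
print(torch.allclose(grad_gamma, gamma.grad))  # True
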
msprobe/pytorch/bench_functions/rotary_mul.py
@@ -0,0 +1,52 @@
+ import torch
+
+
+ def npu_rotary_mul(x, r1, r2):
+     x1, x2 = torch.chunk(x, 2, -1)
+     x_new = torch.cat((-x2, x1), dim=-1)
+     output = r1 * x + r2 * x_new
+     return output
+
+
+ def npu_rotary_mul_backward(dy_tensor, x, r1, r2):
+     x.requires_grad = True
+     r1.requires_grad = True
+     r2.requires_grad = True
+     # golden
+     x1, x2 = torch.chunk(x, 2, -1)
+     x_new = torch.cat((-x2, x1), dim=-1)
+     golden_tensor = r1 * x + r2 * x_new
+     golden_tensor.backward(dy_tensor)
+     r1_shape = r1.shape
+     r1_grad = torch.zeros(r1_shape).type(torch.float32)
+     r2_grad = torch.zeros(r1_shape).type(torch.float32)
+     x1, x2 = torch.chunk(x.float(), 2, -1)
+     x_new2 = torch.cat((-x2, x1), dim=-1)
+     x_shape = x.shape
+     h = x.float()
+     grad = dy_tensor.float()
+     condition_1 = (((r1_shape[0] == 1 and x_shape[0] != 1) or (r1_shape[0] == 1 and x_shape[0] == 1)) and
+                    ((r1_shape[2] == 1 and x_shape[2] != 1) or (r1_shape[2] == 1 and x_shape[2] == 1)) and
+                    (r1_shape[1] == x_shape[1]) and (r1_shape[3] == x_shape[3]))
+     condition_2 = (((r1_shape[0] == 1 and x_shape[0] != 1) or (r1_shape[0] == 1 and x_shape[0] == 1)) and
+                    ((r1_shape[1] == 1 and x_shape[1] != 1) or (r1_shape[1] == 1 and x_shape[1] == 1)) and
+                    (r1_shape[2] == x_shape[2]) and (r1_shape[3] == x_shape[3]))
+     condition_3 = (((r1_shape[2] == 1 and x_shape[2] != 1) or (r1_shape[2] == 1 and x_shape[2] == 1)) and
+                    ((r1_shape[1] == 1 and x_shape[1] != 1) or (r1_shape[1] == 1 and x_shape[1] == 1)) and
+                    (r1_shape[0] == x_shape[0]) and (r1_shape[3] == x_shape[3]))
+     if condition_1:
+         for i in range(x_shape[0]):
+             for j in range(x_shape[2]):
+                 r2_grad[0, :, 0, :] += (x_new2[i, :, j, :] * grad[i, :, j, :])
+                 r1_grad[0, :, 0, :] += (h[i, :, j, :] * grad[i, :, j, :])
+     elif condition_2:
+         for i in range(x_shape[0]):
+             for j in range(x_shape[1]):
+                 r2_grad[0, 0, :, :] += (x_new2[i, j, :, :] * grad[i, j, :, :])
+                 r1_grad[0, 0, :, :] += (h[i, j, :, :] * grad[i, j, :, :])
+     elif condition_3:
+         for i in range(x_shape[1]):
+             for j in range(x_shape[2]):
+                 r2_grad[:, 0, 0, :] += (x_new2[:, i, j, :] * grad[:, i, j, :])
+                 r1_grad[:, 0, 0, :] += (h[:, i, j, :] * grad[:, i, j, :])
+     return x.grad.cpu(), r1_grad.cpu(), r2_grad.cpu()
msprobe/pytorch/bench_functions/scaled_mask_softmax.py
@@ -0,0 +1,26 @@
+ import torch
+
+
+ def npu_scaled_masked_softmax(x, mask, scale, fixed_triu_mask):
+     if fixed_triu_mask:
+         mask = (torch.triu(torch.ones(mask.shape), diagonal=1)).bool().to(mask.device)
+     dtype = x.dtype
+     x = (x * scale).masked_fill(mask, value=-10000)
+     x = x - torch.max(x, dim=-1, keepdims=True)[0]
+     x = torch.exp(x.float())
+     y = torch.div(x, torch.sum(x, dim=-1, keepdims=True))
+     return y.to(dtype)
+
+
+ def npu_scaled_masked_softmax_backward(y_grad, y, mask, scale, fixed_triu_mask):
+     if fixed_triu_mask:
+         mask = (torch.triu(torch.ones(mask.shape), diagonal=1)).bool().to(mask.device)
+     dtype = y_grad.dtype
+     y_grad = y_grad.float()
+     y = y.float()
+     x_grad = y_grad * y
+     x_grad = y_grad - torch.sum(x_grad, dim=-1, keepdims=True)
+     x_grad = x_grad * y
+     x_grad = x_grad * scale
+     x_grad = x_grad.masked_fill(mask, value=0)
+     return x_grad.to(dtype).cpu()
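
With fixed_triu_mask disabled, the golden forward above is just a softmax over the scaled and masked scores. A hypothetical usage sketch; the import path is assumed from the file listing (msprobe/pytorch/bench_functions/scaled_mask_softmax.py) and requires the wheel to be installed:

import torch

from msprobe.pytorch.bench_functions.scaled_mask_softmax import npu_scaled_masked_softmax  # assumed path

torch.manual_seed(0)
x = torch.randn(2, 4, 8, 8)
mask = torch.rand(2, 4, 8, 8) > 0.7  # boolean mask, True means masked out
scale = 0.125

golden = npu_scaled_masked_softmax(x, mask, scale, fixed_triu_mask=False)
reference = torch.softmax((x * scale).masked_fill(mask, -10000), dim=-1)
print(torch.allclose(golden, reference, atol=1e-6))  # True
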
msprobe/pytorch/bench_functions/swiglu.py
@@ -0,0 +1,55 @@
+ import torch
+
+
+ def npu_swiglu(x, dim=-1):
+     tensor_dtype = x.dtype
+
+     inTensors = torch.chunk(x, 2, dim=dim)
+     if tensor_dtype == torch.float32:
+         tensor_scalar = torch.sigmoid(torch.mul(inTensors[0], 1.0))
+         output_data = torch.mul(torch.mul(tensor_scalar, inTensors[0]), inTensors[1])
+     else:
+         tensor_self_float = inTensors[0].type(torch.float)
+         tensor_other_float = inTensors[1].type(torch.float)
+         tensor_out_float = torch.nn.functional.silu(tensor_self_float).type(tensor_dtype).type(
+             torch.float32) * tensor_other_float
+         output_data = tensor_out_float.type(tensor_dtype)
+     return output_data
+
+
+ def npu_swiglu_backward(grad, x, dim=-1):
+     tensor_dtype = grad.dtype
+     in_tensors = torch.chunk(x, 2, dim=dim)
+     tensor_grad_out = grad
+
+     if tensor_dtype == torch.float16:
+         tensor_out1 = torch.mul(
+             torch.mul(in_tensors[1].type(torch.float32), swish_grad(1, in_tensors[0].type(torch.float32))),
+             tensor_grad_out.type(torch.float32)).type(torch.float16)
+         tensor_out2 = torch.mul(tensor_grad_out.type(torch.float32),
+                                 swish(1, in_tensors[0].type(torch.float32))).type(torch.float16)
+         output = torch.cat((tensor_out1, tensor_out2), dim)
+     elif tensor_dtype == torch.bfloat16:
+         tensor_self_float = in_tensors[0].type(torch.float)
+         tensor_other_float = in_tensors[1].type(torch.float)
+         tensor_gradout_float = tensor_grad_out.type(torch.float)
+
+         tensor_out1 = torch.mul(tensor_gradout_float, swish_grad(1.0, tensor_self_float)).type(torch.bfloat16).type(
+             torch.float32) * tensor_other_float
+         tensor_out2 = swish(1.0, tensor_self_float).type(torch.bfloat16).type(torch.float32) * tensor_gradout_float
+         tensor_out_float = torch.cat((tensor_out1, tensor_out2), dim=dim)
+         output = tensor_out_float.type(torch.bfloat16)
+     else:
+         tensor_out1 = torch.mul(torch.mul(in_tensors[1], swish_grad(1.0, in_tensors[0])), tensor_grad_out)
+         tensor_out2 = torch.mul(tensor_grad_out, swish(1.0, in_tensors[0]))
+         output = torch.cat((tensor_out1, tensor_out2), dim)
+     return output.cpu()
+
+
+ def swish_grad(beta, x):
+     return torch.sigmoid(beta * x) + x * (1 - torch.sigmoid(beta * x)) * torch.sigmoid(beta * x) * beta
+
+
+ def swish(beta, x):
+     return x * torch.sigmoid(beta * x)
+
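
For float32 inputs, npu_swiglu above computes silu(a) * b for the two halves a, b of the last-dimension split. A hypothetical usage sketch; the import path is assumed from the file listing and requires the wheel to be installed:

import torch
import torch.nn.functional as F

from msprobe.pytorch.bench_functions.swiglu import npu_swiglu  # assumed path

torch.manual_seed(0)
x = torch.randn(4, 16, dtype=torch.float32)
a, b = torch.chunk(x, 2, dim=-1)

print(torch.allclose(npu_swiglu(x, dim=-1), F.silu(a) * b, atol=1e-6))  # True
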
msprobe/pytorch/common/__init__.py
@@ -1,2 +1,2 @@
- from .parse_json import parse_json_info_forward_backward
- from .utils import seed_all
+ from .parse_json import parse_json_info_forward_backward
+ from .utils import seed_all