mindstudio-probe 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (249) hide show
  1. {mindstudio_probe-1.0.1.dist-info → mindstudio_probe-1.0.3.dist-info}/METADATA +5 -1
  2. mindstudio_probe-1.0.3.dist-info/RECORD +272 -0
  3. msprobe/README.md +78 -23
  4. msprobe/__init__.py +1 -0
  5. msprobe/config/README.md +182 -40
  6. msprobe/config/config.json +22 -0
  7. msprobe/core/__init__.py +0 -0
  8. msprobe/{pytorch → core}/advisor/advisor.py +3 -3
  9. msprobe/{pytorch → core}/advisor/advisor_result.py +2 -2
  10. msprobe/core/common/const.py +82 -5
  11. msprobe/core/common/exceptions.py +30 -18
  12. msprobe/core/common/file_check.py +19 -1
  13. msprobe/core/common/log.py +15 -1
  14. msprobe/core/common/utils.py +130 -30
  15. msprobe/core/common_config.py +32 -19
  16. msprobe/core/compare/acc_compare.py +299 -0
  17. msprobe/core/compare/check.py +95 -0
  18. msprobe/core/compare/compare_cli.py +49 -0
  19. msprobe/core/compare/highlight.py +222 -0
  20. msprobe/core/compare/multiprocessing_compute.py +149 -0
  21. msprobe/{pytorch → core}/compare/npy_compare.py +55 -4
  22. msprobe/core/compare/utils.py +429 -0
  23. msprobe/core/data_dump/data_collector.py +39 -35
  24. msprobe/core/data_dump/data_processor/base.py +85 -37
  25. msprobe/core/data_dump/data_processor/factory.py +5 -7
  26. msprobe/core/data_dump/data_processor/mindspore_processor.py +198 -0
  27. msprobe/core/data_dump/data_processor/pytorch_processor.py +94 -51
  28. msprobe/core/data_dump/json_writer.py +11 -11
  29. msprobe/core/grad_probe/__init__.py +0 -0
  30. msprobe/core/grad_probe/constant.py +71 -0
  31. msprobe/core/grad_probe/grad_compare.py +175 -0
  32. msprobe/core/grad_probe/utils.py +52 -0
  33. msprobe/doc/grad_probe/grad_probe.md +207 -0
  34. msprobe/doc/grad_probe/img/image-1.png +0 -0
  35. msprobe/doc/grad_probe/img/image-2.png +0 -0
  36. msprobe/doc/grad_probe/img/image-3.png +0 -0
  37. msprobe/doc/grad_probe/img/image-4.png +0 -0
  38. msprobe/doc/grad_probe/img/image.png +0 -0
  39. msprobe/mindspore/api_accuracy_checker/__init__.py +0 -0
  40. msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +246 -0
  41. msprobe/mindspore/api_accuracy_checker/api_info.py +69 -0
  42. msprobe/mindspore/api_accuracy_checker/api_runner.py +152 -0
  43. msprobe/mindspore/api_accuracy_checker/base_compare_algorithm.py +197 -0
  44. msprobe/mindspore/api_accuracy_checker/compute_element.py +224 -0
  45. msprobe/mindspore/api_accuracy_checker/main.py +16 -0
  46. msprobe/mindspore/api_accuracy_checker/type_mapping.py +114 -0
  47. msprobe/mindspore/api_accuracy_checker/utils.py +63 -0
  48. msprobe/mindspore/cell_processor.py +34 -0
  49. msprobe/mindspore/common/const.py +87 -0
  50. msprobe/mindspore/common/log.py +38 -0
  51. msprobe/mindspore/common/utils.py +57 -0
  52. msprobe/mindspore/compare/distributed_compare.py +75 -0
  53. msprobe/mindspore/compare/ms_compare.py +117 -0
  54. msprobe/mindspore/compare/ms_graph_compare.py +317 -0
  55. msprobe/mindspore/compare/ms_to_pt_api.yaml +399 -0
  56. msprobe/mindspore/debugger/debugger_config.py +38 -15
  57. msprobe/mindspore/debugger/precision_debugger.py +79 -4
  58. msprobe/mindspore/doc/compare.md +58 -0
  59. msprobe/mindspore/doc/dump.md +158 -6
  60. msprobe/mindspore/dump/dump_tool_factory.py +19 -22
  61. msprobe/mindspore/dump/hook_cell/api_registry.py +104 -0
  62. msprobe/mindspore/dump/hook_cell/hook_cell.py +53 -0
  63. msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml +925 -0
  64. msprobe/mindspore/dump/hook_cell/wrap_functional.py +91 -0
  65. msprobe/mindspore/dump/hook_cell/wrap_tensor.py +63 -0
  66. msprobe/mindspore/dump/jit_dump.py +56 -0
  67. msprobe/mindspore/dump/kernel_kbyk_dump.py +65 -0
  68. msprobe/mindspore/free_benchmark/__init__.py +0 -0
  69. msprobe/mindspore/free_benchmark/api_pynative_self_check.py +116 -0
  70. msprobe/mindspore/free_benchmark/common/__init__.py +0 -0
  71. msprobe/mindspore/free_benchmark/common/config.py +12 -0
  72. msprobe/mindspore/free_benchmark/common/handler_params.py +17 -0
  73. msprobe/mindspore/free_benchmark/common/utils.py +71 -0
  74. msprobe/mindspore/free_benchmark/data/support_wrap_ops.yaml +842 -0
  75. msprobe/mindspore/free_benchmark/decorator/__init__.py +0 -0
  76. msprobe/mindspore/free_benchmark/decorator/dec_forward.py +42 -0
  77. msprobe/mindspore/free_benchmark/decorator/decorator_factory.py +107 -0
  78. msprobe/mindspore/free_benchmark/handler/__init__.py +0 -0
  79. msprobe/mindspore/free_benchmark/handler/base_handler.py +90 -0
  80. msprobe/mindspore/free_benchmark/handler/check_handler.py +41 -0
  81. msprobe/mindspore/free_benchmark/handler/fix_handler.py +36 -0
  82. msprobe/mindspore/free_benchmark/handler/handler_factory.py +21 -0
  83. msprobe/mindspore/free_benchmark/perturbation/add_noise.py +67 -0
  84. msprobe/mindspore/free_benchmark/perturbation/base_perturbation.py +21 -0
  85. msprobe/mindspore/free_benchmark/perturbation/bit_noise.py +63 -0
  86. msprobe/mindspore/free_benchmark/perturbation/improve_precision.py +34 -0
  87. msprobe/mindspore/free_benchmark/perturbation/no_change.py +12 -0
  88. msprobe/mindspore/free_benchmark/perturbation/perturbation_factory.py +27 -0
  89. msprobe/mindspore/free_benchmark/self_check_tool_factory.py +33 -0
  90. msprobe/mindspore/grad_probe/__init__.py +0 -0
  91. msprobe/mindspore/grad_probe/global_context.py +91 -0
  92. msprobe/mindspore/grad_probe/grad_analyzer.py +231 -0
  93. msprobe/mindspore/grad_probe/grad_monitor.py +27 -0
  94. msprobe/mindspore/grad_probe/grad_stat_csv.py +132 -0
  95. msprobe/mindspore/grad_probe/hook.py +92 -0
  96. msprobe/mindspore/grad_probe/utils.py +29 -0
  97. msprobe/mindspore/ms_config.py +63 -15
  98. msprobe/mindspore/overflow_check/overflow_check_tool_factory.py +17 -15
  99. msprobe/mindspore/runtime.py +4 -0
  100. msprobe/mindspore/service.py +354 -0
  101. msprobe/mindspore/task_handler_factory.py +7 -4
  102. msprobe/msprobe.py +66 -26
  103. msprobe/pytorch/__init__.py +1 -1
  104. msprobe/pytorch/api_accuracy_checker/common/config.py +21 -16
  105. msprobe/pytorch/api_accuracy_checker/common/utils.py +1 -60
  106. msprobe/pytorch/api_accuracy_checker/compare/algorithm.py +2 -5
  107. msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +46 -10
  108. msprobe/pytorch/api_accuracy_checker/compare/compare.py +84 -48
  109. msprobe/pytorch/api_accuracy_checker/compare/compare_utils.py +8 -12
  110. msprobe/pytorch/api_accuracy_checker/config.yaml +7 -1
  111. msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py +15 -11
  112. msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +11 -15
  113. msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +16 -9
  114. msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +193 -105
  115. msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py +68 -1
  116. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/__init__.py +0 -0
  117. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py +202 -0
  118. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +324 -0
  119. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +204 -0
  120. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +218 -0
  121. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/ssl_config.py +10 -0
  122. msprobe/pytorch/bench_functions/__init__.py +15 -0
  123. msprobe/pytorch/bench_functions/apply_adam_w.py +28 -0
  124. msprobe/pytorch/bench_functions/confusion_transpose.py +19 -0
  125. msprobe/pytorch/bench_functions/fast_gelu.py +55 -0
  126. msprobe/pytorch/bench_functions/layer_norm_eval.py +6 -0
  127. msprobe/pytorch/bench_functions/linear.py +12 -0
  128. msprobe/pytorch/bench_functions/matmul_backward.py +48 -0
  129. msprobe/pytorch/bench_functions/npu_fusion_attention.py +421 -0
  130. msprobe/pytorch/bench_functions/rms_norm.py +15 -0
  131. msprobe/pytorch/bench_functions/rotary_mul.py +52 -0
  132. msprobe/pytorch/bench_functions/scaled_mask_softmax.py +26 -0
  133. msprobe/pytorch/bench_functions/swiglu.py +55 -0
  134. msprobe/pytorch/common/parse_json.py +3 -1
  135. msprobe/pytorch/common/utils.py +83 -7
  136. msprobe/pytorch/compare/distributed_compare.py +19 -64
  137. msprobe/pytorch/compare/match.py +3 -6
  138. msprobe/pytorch/compare/pt_compare.py +40 -0
  139. msprobe/pytorch/debugger/debugger_config.py +11 -2
  140. msprobe/pytorch/debugger/precision_debugger.py +34 -4
  141. msprobe/pytorch/doc/api_accuracy_checker.md +57 -13
  142. msprobe/pytorch/doc/api_accuracy_checker_online.md +187 -0
  143. msprobe/pytorch/doc/dump.md +73 -20
  144. msprobe/pytorch/doc/ptdbg_ascend_compare.md +75 -11
  145. msprobe/pytorch/doc/ptdbg_ascend_quickstart.md +3 -3
  146. msprobe/pytorch/doc/run_overflow_check.md +1 -1
  147. msprobe/pytorch/doc//321/206/320/247/320/260/321/206/320/260/320/227/321/206/320/255/320/226/321/205/342/225/226/320/265/321/205/320/225/342/225/226/321/205/320/254/342/225/221/321/206/320/251/320/277/321/211/320/272/320/234/321/210/320/277/320/221/321/205/320/242/320/234/321/206/320/220/320/267/321/210/320/223/342/225/234/321/205/320/257/342/225/221/321/207/342/225/221/342/224/220/321/206/320/232/320/265/321/205/320/241/320/232.md +151 -0
  148. msprobe/pytorch/free_benchmark/common/constant.py +3 -0
  149. msprobe/pytorch/free_benchmark/common/utils.py +4 -0
  150. msprobe/pytorch/free_benchmark/compare/grad_saver.py +22 -26
  151. msprobe/pytorch/free_benchmark/main.py +7 -4
  152. msprobe/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py +1 -1
  153. msprobe/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py +1 -1
  154. msprobe/pytorch/free_benchmark/perturbed_layers/npu/change_value.py +1 -1
  155. msprobe/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py +3 -3
  156. msprobe/pytorch/free_benchmark/perturbed_layers/npu/no_change.py +1 -1
  157. msprobe/pytorch/free_benchmark/perturbed_layers/run_cpu.py +1 -1
  158. msprobe/pytorch/free_benchmark/result_handlers/base_handler.py +43 -29
  159. msprobe/pytorch/free_benchmark/result_handlers/handler_factory.py +0 -1
  160. msprobe/pytorch/function_factory.py +75 -0
  161. msprobe/pytorch/functional/dump_module.py +4 -4
  162. msprobe/pytorch/grad_probe/__init__.py +0 -0
  163. msprobe/pytorch/grad_probe/grad_monitor.py +90 -0
  164. msprobe/pytorch/grad_probe/grad_stat_csv.py +129 -0
  165. msprobe/pytorch/hook_module/hook_module.py +14 -3
  166. msprobe/pytorch/hook_module/support_wrap_ops.yaml +2 -1
  167. msprobe/pytorch/hook_module/utils.py +9 -9
  168. msprobe/pytorch/hook_module/wrap_aten.py +20 -10
  169. msprobe/pytorch/hook_module/wrap_distributed.py +10 -7
  170. msprobe/pytorch/hook_module/wrap_functional.py +4 -7
  171. msprobe/pytorch/hook_module/wrap_npu_custom.py +21 -10
  172. msprobe/pytorch/hook_module/wrap_tensor.py +5 -6
  173. msprobe/pytorch/hook_module/wrap_torch.py +5 -7
  174. msprobe/pytorch/hook_module/wrap_vf.py +6 -8
  175. msprobe/pytorch/module_processer.py +53 -13
  176. msprobe/pytorch/online_dispatch/compare.py +4 -4
  177. msprobe/pytorch/online_dispatch/dispatch.py +39 -41
  178. msprobe/pytorch/online_dispatch/dump_compare.py +17 -47
  179. msprobe/pytorch/online_dispatch/single_compare.py +5 -5
  180. msprobe/pytorch/online_dispatch/utils.py +2 -43
  181. msprobe/pytorch/parse_tool/lib/compare.py +31 -19
  182. msprobe/pytorch/parse_tool/lib/config.py +2 -1
  183. msprobe/pytorch/parse_tool/lib/parse_tool.py +4 -4
  184. msprobe/pytorch/parse_tool/lib/utils.py +34 -80
  185. msprobe/pytorch/parse_tool/lib/visualization.py +4 -3
  186. msprobe/pytorch/pt_config.py +100 -6
  187. msprobe/pytorch/service.py +104 -19
  188. mindstudio_probe-1.0.1.dist-info/RECORD +0 -228
  189. msprobe/mindspore/dump/api_kbk_dump.py +0 -55
  190. msprobe/pytorch/compare/acc_compare.py +0 -1024
  191. msprobe/pytorch/compare/highlight.py +0 -100
  192. msprobe/test/core_ut/common/test_utils.py +0 -345
  193. msprobe/test/core_ut/data_dump/test_data_collector.py +0 -47
  194. msprobe/test/core_ut/data_dump/test_json_writer.py +0 -183
  195. msprobe/test/core_ut/data_dump/test_scope.py +0 -151
  196. msprobe/test/core_ut/test_common_config.py +0 -152
  197. msprobe/test/core_ut/test_file_check.py +0 -218
  198. msprobe/test/core_ut/test_log.py +0 -109
  199. msprobe/test/mindspore_ut/test_api_kbk_dump.py +0 -51
  200. msprobe/test/mindspore_ut/test_debugger_config.py +0 -42
  201. msprobe/test/mindspore_ut/test_dump_tool_factory.py +0 -51
  202. msprobe/test/mindspore_ut/test_kernel_graph_dump.py +0 -66
  203. msprobe/test/mindspore_ut/test_kernel_graph_overflow_check.py +0 -63
  204. msprobe/test/mindspore_ut/test_ms_config.py +0 -69
  205. msprobe/test/mindspore_ut/test_overflow_check_tool_factory.py +0 -51
  206. msprobe/test/mindspore_ut/test_precision_debugger.py +0 -56
  207. msprobe/test/mindspore_ut/test_task_handler_factory.py +0 -58
  208. msprobe/test/pytorch_ut/advisor/test_advisor.py +0 -83
  209. msprobe/test/pytorch_ut/api_accuracy_checker/common/test_common_utils.py +0 -108
  210. msprobe/test/pytorch_ut/api_accuracy_checker/common/test_config.py +0 -39
  211. msprobe/test/pytorch_ut/api_accuracy_checker/compare/test_algorithm.py +0 -112
  212. msprobe/test/pytorch_ut/api_accuracy_checker/compare/test_api_precision_compare.py +0 -77
  213. msprobe/test/pytorch_ut/api_accuracy_checker/compare/test_compare.py +0 -125
  214. msprobe/test/pytorch_ut/api_accuracy_checker/compare/test_compare_column.py +0 -10
  215. msprobe/test/pytorch_ut/api_accuracy_checker/compare/test_compare_utils.py +0 -43
  216. msprobe/test/pytorch_ut/api_accuracy_checker/run_ut/dump.json +0 -179
  217. msprobe/test/pytorch_ut/api_accuracy_checker/run_ut/forward.json +0 -63
  218. msprobe/test/pytorch_ut/api_accuracy_checker/run_ut/test_data_generate.py +0 -99
  219. msprobe/test/pytorch_ut/api_accuracy_checker/run_ut/test_multi_run_ut.py +0 -115
  220. msprobe/test/pytorch_ut/api_accuracy_checker/run_ut/test_run_ut.py +0 -72
  221. msprobe/test/pytorch_ut/compare/test_acc_compare.py +0 -17
  222. msprobe/test/pytorch_ut/free_benchmark/perturbed_layers/test_perturbed_layser.py +0 -105
  223. msprobe/test/pytorch_ut/free_benchmark/result_handlers/test_result_handler.py +0 -121
  224. msprobe/test/pytorch_ut/free_benchmark/test_main.py +0 -101
  225. msprobe/test/pytorch_ut/functional/test_dump_module.py +0 -15
  226. msprobe/test/pytorch_ut/hook_module/test_api_registry.py +0 -130
  227. msprobe/test/pytorch_ut/hook_module/test_hook_module.py +0 -42
  228. msprobe/test/pytorch_ut/hook_module/test_wrap_aten.py +0 -65
  229. msprobe/test/pytorch_ut/hook_module/test_wrap_distributed.py +0 -35
  230. msprobe/test/pytorch_ut/hook_module/test_wrap_functional.py +0 -20
  231. msprobe/test/pytorch_ut/hook_module/test_wrap_tensor.py +0 -35
  232. msprobe/test/pytorch_ut/hook_module/test_wrap_torch.py +0 -43
  233. msprobe/test/pytorch_ut/hook_module/test_wrap_vf.py +0 -11
  234. msprobe/test/pytorch_ut/test_pt_config.py +0 -69
  235. msprobe/test/pytorch_ut/test_service.py +0 -59
  236. msprobe/test/resources/advisor.txt +0 -3
  237. msprobe/test/resources/compare_result_20230703104808.csv +0 -9
  238. msprobe/test/resources/compare_result_without_accuracy.csv +0 -9
  239. msprobe/test/resources/config.yaml +0 -3
  240. msprobe/test/resources/npu_test.pkl +0 -8
  241. msprobe/test/run_test.sh +0 -30
  242. msprobe/test/run_ut.py +0 -58
  243. msprobe/test/test_module_processer.py +0 -64
  244. {mindstudio_probe-1.0.1.dist-info → mindstudio_probe-1.0.3.dist-info}/LICENSE +0 -0
  245. {mindstudio_probe-1.0.1.dist-info → mindstudio_probe-1.0.3.dist-info}/WHEEL +0 -0
  246. {mindstudio_probe-1.0.1.dist-info → mindstudio_probe-1.0.3.dist-info}/entry_points.txt +0 -0
  247. {mindstudio_probe-1.0.1.dist-info → mindstudio_probe-1.0.3.dist-info}/top_level.txt +0 -0
  248. /msprobe/{pytorch → core}/advisor/advisor_const.py +0 -0
  249. /msprobe/pytorch/doc/{atat → msprobe}/321/207/342/226/223/342/225/233/321/205/342/225/221/320/266/321/205/342/225/226/320/265/321/205/320/225/342/225/226/321/206/320/245/342/226/221/321/206/320/235/320/276dump/321/206/320/260/320/227/321/205/320/227/320/226/321/206/320/220/320/267/321/210/320/223/342/225/234/321/205/320/257/342/225/221/321/207/342/225/221/342/224/220/321/206/320/232/320/265/321/205/320/241/320/232.md" +0 -0
@@ -0,0 +1,421 @@
1
+ import torch
2
+ import numpy as np
3
+ from einops import rearrange
4
+
5
+ from msprobe.pytorch.common.utils import logger
6
+
7
+ gtype = torch.float64 # arm host必须选择float64,x86环境选择float32即可,64也行。arm计算很慢,s=8k的场景建议使用x86
8
+ softmax_build_mode = "QKV" # "MAX_SUM"
9
+
10
+ """
11
+ # 前向函数声明对比
12
+ 标杆实现:fusion_attention_forward: q, k, v, drop_mask, atten_mask, pse, scale, keep_prob
13
+ 融合算子:npu_fusion_attention_forward: query, key, value, head_num, input_layout, *, pse=None, padding_mask=None,
14
+ atten_mask=None, scale=1.0, keep_prob=1.0, pre_tockens=2147483647,
15
+ next_tockens=2147483647, inner_precise=0, prefix=None, sparse_mode=0,
16
+ gen_mask_parallel=True, sync=False
17
+
18
+ # 反向函数声明对比
19
+ 标杆实现:fusion_attention_backward: dx, q, k, v, softmax_res, drop_mask, pse, scale, keep_prob
20
+ 融合算子:npu_fusion_attention_backward: query, key, value, dy, head_num, input_layout, *, pse=None, padding_mask=None,
21
+ atten_mask=None, softmax_max=None, softmax_sum=None, softmax_in=None,
22
+ attention_in=None, scale_value=1.0, keep_prob=1.0, pre_tockens=2147483647,
23
+ next_tockens=2147483647, inner_precise=0, seed=0, offset=0,
24
+ numels=0, prefix=None, sparse_mode=0, gen_mask_parallel=True, sync=False
25
+ """
26
+
27
+
28
def softmax_forward(x):
    """Numerically stable softmax along the last axis.

    Returns (softmax_result, row_max, row_sum) so the max/sum statistics
    can be reused later to rebuild the softmax.
    """
    row_max = torch.max(x, dim=-1, keepdims=True)[0]
    shifted = torch.exp(x.sub(row_max))
    row_sum = shifted.sum(dim=-1, keepdims=True)
    return shifted.div(row_sum), row_max, row_sum
35
+
36
+
37
def softmax_grad(dp, softmax_res):
    """Backward of softmax: (dp - sum(dp * y, -1)) * y, with y = softmax_res."""
    weighted = dp * softmax_res
    row_total = weighted.sum(dim=-1, keepdims=True)
    return (dp - row_total) * softmax_res
43
+
44
+
45
def broadcast_kv(num_heads, num_kv_heads, kv_tensor, dtype):
    """Broadcast a grouped KV tensor (B, num_kv_heads, S, D) up to
    (B, num_heads, S, D) by repeating each KV head num_heads // num_kv_heads
    times, so grouped-query attention can be computed with the full head count.

    Raises:
        ValueError: if num_kv_heads is zero, exceeds num_heads, or does not
            evenly divide num_heads.
    """
    # Bug fix: the original guard raised when `num_kv_heads < num_heads`,
    # which is exactly the valid grouped-KV case this function exists for
    # (callers only invoke it when N1 != N2 and have enforced N1 >= N2).
    # The invalid inputs are: zero, more KV heads than query heads, or a
    # non-divisible head ratio.
    if num_kv_heads == 0 or num_kv_heads > num_heads or num_heads % num_kv_heads != 0:
        raise ValueError("num_kv_heads must be non-zero, not exceed num_heads, and evenly divide num_heads.")

    factor = num_heads // num_kv_heads
    kv_shape = kv_tensor.shape
    B = kv_shape[0]
    S = kv_shape[2]
    D = kv_shape[3]
    kv_res = torch.zeros([B, num_heads, S, D]).to(dtype)
    for i in range(num_heads):
        j = i // factor  # source KV head for broadcast head i
        kv_res[:, i:i + 1, :, :] = kv_tensor[:, j:j + 1, :, :]
    return kv_res
59
+
60
+
61
def calculate_qk(q, k, atten_mask, pse, scale):
    """Scaled Q·K^T with optional positional bias (pse) and additive mask.

    A non-empty atten_mask adds a large negative value (-40000.0) to the
    masked positions so they vanish after softmax.
    """
    k_t = k.permute(0, 1, 3, 2)
    if pse is None or len(pse.shape) == 0:
        qk = torch.matmul(q, k_t).mul(scale)
    else:
        qk = (torch.matmul(q, k_t) + pse).mul(scale)
    if atten_mask is None or len(atten_mask.shape) == 0:
        return qk
    return qk + atten_mask.bool() * (-40000.0)  # -10000
71
+
72
+
73
def fusion_attention_forward(q, k, v, drop_mask, atten_mask, pse, scale, keep_prob):
    """Reference (golden) forward of fused attention.

    Returns (attention_out, softmax_max, softmax_sum); the statistics are
    the per-row max/sum captured during the stable softmax.
    """
    scores = calculate_qk(q, k, atten_mask, pse, scale)
    softmax_res, softmax_max, softmax_sum = softmax_forward(scores)
    if drop_mask is None or len(drop_mask.shape) == 0:
        dropped = softmax_res
    else:
        # Inverted-dropout scaling keeps the expectation unchanged.
        dropped = softmax_res * drop_mask * (1.0 / keep_prob)
    return torch.matmul(dropped, v), softmax_max, softmax_sum
82
+
83
+
84
def fusion_attention_backward(dx, q, k, v, softmax_res, drop_mask, pse, scale, keep_prob):
    """Reference (golden) backward of fused attention; returns (dq, dk, dv)."""
    dp = torch.matmul(dx, v.permute(0, 1, 3, 2))
    if drop_mask is None or len(drop_mask.shape) == 0:
        drop_res = softmax_res.permute(0, 1, 3, 2)
        dp_drop = dp
    else:
        # Re-apply inverted dropout on both the saved softmax and its grad.
        drop_res = softmax_res.mul(drop_mask).mul(1.0 / keep_prob).permute(0, 1, 3, 2)
        dp_drop = dp * drop_mask * (1.0 / keep_prob)
    dv = torch.matmul(drop_res, dx)
    scaled_grad = softmax_grad(dp_drop, softmax_res) * scale
    dq = torch.matmul(scaled_grad, k)
    dk = torch.matmul(scaled_grad.permute(0, 1, 3, 2), q)
    return dq, dk, dv
97
+
98
+
99
def parse_bsnd_args(query, key, head_num, input_layout):
    """Infer attention dimensions from the query/key shapes for a layout.

    Returns (B, S1, S2, N1, N2, D, H1, H2, DTYPE) where S1/N1/H1 describe
    the query and S2/N2/H2 the key; D is the per-head dim.

    Raises ValueError for an unknown layout, the unsupported "TND" layout,
    shapes that do not unpack, or a zero head dimension.
    """
    supported_input_layout = ["BSH", "SBH", "BSND", "BNSD", "TND"]
    if not isinstance(input_layout, str) or input_layout not in supported_input_layout:
        raise ValueError(f"Invalid input_layout arg which must be one of {supported_input_layout}.")
    if input_layout == "TND":
        raise ValueError(f"input_layout {input_layout} does not supported for now.")

    N1 = head_num
    try:
        if input_layout == "BSH":
            B, S1, H1 = query.shape
            _, S2, H2 = key.shape
            D = H1 // N1
            N2 = H2 // D
        elif input_layout == "SBH":
            S1, B, H1 = query.shape
            S2, _, H2 = key.shape
            D = H1 // N1
            N2 = H2 // D
        elif input_layout == "BSND":
            B, S1, N1, D = query.shape
            _, S2, N2, _ = key.shape
            H1 = N1 * D
            H2 = N2 * D
        else:  # "BNSD"
            B, N1, S1, D = query.shape
            _, N2, S2, _ = key.shape
            H1 = N1 * D
            H2 = N2 * D
    except Exception as e:
        raise ValueError(f"query.shape: {query.shape}, key.shape: {key.shape}, parse_bsnd_args error: {e}") from e

    if D == 0:
        raise ValueError(f"Value D must be non-zero.")
    return B, S1, S2, N1, N2, D, H1, H2, query.dtype
136
+
137
+
138
def convert_from_bnsd(_input, input_layout):
    """Convert a (B, N, S, D) tensor back into the requested layout.

    "BNSD" (and any other unmatched layout) is returned unchanged; "TND"
    is rejected as unsupported.
    """
    if input_layout == "BSH":
        # (B,N,S,D) => (B,S,N*D)
        return rearrange(_input, 'b n s d -> b s (n d)').contiguous()
    if input_layout == "SBH":
        # (B,N,S,D) => (S,B,N*D)
        return rearrange(_input, 'b n s d -> s b (n d)').contiguous()
    if input_layout == "BSND":
        # (B,N,S,D) => (B,S,N,D)
        return rearrange(_input, 'b n s d -> b s n d').contiguous()
    if input_layout == "TND":
        raise ValueError(f"input_layout {input_layout} does not supported for now.")
    return _input
153
+
154
+
155
def convert_to_bnsd(_input, n, input_layout):
    """Convert q/k/v from `input_layout` into (B, N, S, D) and cast to the
    module-level golden dtype `gtype`.

    Raises ValueError for the unsupported "TND" layout or when the result
    is not 4-dimensional.
    """
    # The default "BNSD" layout needs no rearrangement.
    if input_layout == "BSH":
        # (B,S,N*D)=>(B,N,S,D)
        out = rearrange(_input, 'b s (n d) -> b n s d', n=n)
    elif input_layout == "SBH":
        # (S,B,N*D)=>(B,N,S,D)
        out = rearrange(_input, 's b (n d) -> b n s d', n=n)
    elif input_layout == "BSND":
        # (B,S,N,D)=>(B,N,S,D)
        out = rearrange(_input, 'b s n d -> b n s d', n=n)
    elif input_layout == "TND":
        raise ValueError(f"input_layout {input_layout} does not supported for now.")
    else:
        out = _input
    if out.dim() != 4:
        raise ValueError(f"convert qkv format failed with input_layout {input_layout}.")
    # Golden computation runs in high precision (gtype, module-level setting).
    return out.to(gtype)
173
+
174
+
175
def generate_atten_mask(sparse_mode, atten_mask, B, N1, S1, S2, pre_tocken, next_tocken, dtype):
    """
    Reconstruct the dense attention mask the golden implementation needs.

    When sparse_mode is 2/3/4 the fused operator replaces the mask with a
    fixed 2048x2048 upper-triangular matrix as an optimization; that has to
    be reversed here back to the basic dense form:
    ===> atten_mask = torch.from_numpy(np.triu(np.ones([2048, 2048]), k=1)).to(dtype)

    When no mask is supplied, one is synthesized from sparse_mode and the
    pre/next token windows.
    """
    shape = [S1, S2]

    if atten_mask is not None:
        # When FA already received an atten_mask it is taken to be the
        # converted matrix; the three sparse cases (2/3/4) must be reversed.
        if sparse_mode == 2 or sparse_mode == 3 or sparse_mode == 4:
            logger.info(f"S1: {S1}, S2:{S2}, atten_mask.shape:{atten_mask.shape}, atten_mask.dtype:{atten_mask.dtype}")

            if atten_mask.dim() == 2 and atten_mask.shape[0] == 2048 and atten_mask.shape[1] == 2048:
                if atten_mask.equal(torch.from_numpy(np.triu(np.ones([2048, 2048]), k=1)).to(atten_mask.dtype)):
                    if sparse_mode == 2:
                        atten_mask = torch.from_numpy(np.triu(np.ones(shape), k=1))
                    elif sparse_mode == 3:
                        atten_mask = torch.from_numpy(np.triu(np.ones(shape), k=S2 - S1 + 1))
                    elif sparse_mode == 4:
                        atten_mask_u = torch.from_numpy(np.triu(np.ones(shape), k=next_tocken + 1))
                        atten_mask_l = torch.from_numpy(np.tril(np.ones(shape), k=-pre_tocken - 1))
                        atten_mask = atten_mask_u + atten_mask_l
                    logger.debug(f"反向转换atten_mask {atten_mask.shape}")
                    return atten_mask.to(dtype)

        return atten_mask.to(dtype)

    if atten_mask is not None:
        # NOTE(review): this branch appears unreachable — the block above
        # always returns when atten_mask is not None; verify the intended
        # nesting against upstream before relying on it.
        if atten_mask.dim() == 2:
            if atten_mask.shape[0] != S1 or atten_mask.shape[1] != S2:
                raise ValueError(f"Invalid atten_mask shape `SS` {atten_mask.shape}")
            shape = [S1, S2]
        elif atten_mask.dim() == 4:
            if atten_mask.shape[1] == 1:
                shape = [B, 1, S1, S2] if B != 1 else [1, 1, S1, S2]
            else:
                shape = [B, N1, S1, S2] if B != 1 else [1, N1, S1, S2]

    # No mask supplied: synthesize one from sparse_mode and the token windows.
    if sparse_mode == 0:
        atten_mask_u = torch.from_numpy(np.triu(np.ones(shape), k=next_tocken + 1))
        atten_mask_l = torch.from_numpy(np.tril(np.ones(shape), k=-pre_tocken - 1))
        atten_mask = atten_mask_u + atten_mask_l
    elif sparse_mode == 1:  # no sparse
        atten_mask = torch.from_numpy(np.zeros(shape))
    elif sparse_mode == 2:
        atten_mask = torch.from_numpy(np.triu(np.ones(shape), k=1))
    elif sparse_mode == 3:
        atten_mask = torch.from_numpy(np.triu(np.ones(shape), k=S2 - S1 + 1))
    elif sparse_mode == 4:
        atten_mask_u = torch.from_numpy(np.triu(np.ones(shape), k=next_tocken + 1))
        atten_mask_l = torch.from_numpy(np.tril(np.ones(shape), k=-pre_tocken - 1))
        atten_mask = atten_mask_u + atten_mask_l
    # Note: sparse_mode=5 cannot occur here — it requires atten_mask to be
    # provided (BNSS or B1SS), so FA's input mask is already correct and is
    # returned as-is by the branch above.
    return atten_mask.to(dtype)
230
+
231
+
232
def generate_kv(key, value, N1, N2):
    """Expand grouped KV heads (N2) to match the query head count (N1).

    When the head counts already match, the inputs are returned untouched.
    """
    if N1 == N2:
        return key, value
    expanded_key = broadcast_kv(N1, N2, key, key.dtype)
    expanded_value = broadcast_kv(N1, N2, value, value.dtype)
    return expanded_key, expanded_value
241
+
242
+
243
def rebuid_softmax_by_qkv(q, k, atten_mask, pse, scale):
    """Rebuild the forward softmax result directly from Q and K.

    attention = softmax(QK^T/sqrt(d))V
    softmax(x_i) = e^(x_i - x_max) / sum(e^(x_i - x_max))

    Note: the misspelled name ("rebuid") is kept for interface compatibility.
    """
    logger.info("Using QKV to rebuild original softmax")
    scores = calculate_qk(q, k, atten_mask, pse, scale)
    rebuilt, _, _ = softmax_forward(scores)
    return rebuilt
252
+
253
+
254
def rebuild_softmax_by_max_sum(q, k, atten_mask, pse, scale, softmax_max, softmax_sum):
    """Rebuild the forward softmax from the saved row max/sum statistics.

    attention = softmax(QK^T/sqrt(d))V
    softmax(x_i) = e^(x_i - x_max_i) / x_sum_i
    """
    logger.info("Using softmax_max and softmax_sum to rebuild original softmax")
    qk = calculate_qk(q, k, atten_mask, pse, scale)
    if softmax_max.shape[-1] == 0:
        raise ValueError(f"softmax_max.shape[-1] must be non-zero, softmax_max.shape: {softmax_max.shape}")
    repeat_dim = qk.shape[-1] // softmax_max.shape[-1]
    stats_max = softmax_max.repeat(1, 1, 1, repeat_dim)
    stats_sum = softmax_sum.repeat(1, 1, 1, repeat_dim)
    return torch.exp(qk.sub(stats_max)).div(stats_sum)
267
+
268
+
269
def npu_fusion_attention_forward_patch(*args, **kwargs):
    """Validate npu_fusion_attention positional args and normalize kwargs.

    Expects args = (query, key, value, head_num, input_layout). Returns
    (args, dims_kwargs, new_kwargs): the parsed dimensions and the keyword
    arguments filled with the operator's defaults.
    """
    # query, key, value, head_num, input_layout
    if len(args) != 5:
        raise ValueError(f"Unsupported npu_fusion_attention args {args}.")

    B, S1, S2, N1, N2, D, H1, H2, DTYPE = parse_bsnd_args(args[0], args[1], args[3], args[4])
    if N1 == N2 and S1 == S2:
        logger.debug(f"running case : BNSD = {B}_{N1}_{S1}_{D}, sparse = {kwargs.get('sparse_mode', 0)}")
    else:
        logger.debug(f"running case: BNSD = {B}_{N1}({N2})_{S1}({S2})_{D}, sparse = {kwargs.get('sparse_mode', 0)}")
    if N1 % N2 != 0 or N1 < N2:
        raise ValueError(f"N1与N2不匹配,请检查: N1 = {N1}, N2 = {N2}.")

    dim_names = ("B", "S1", "S2", "N1", "N2", "D", "H1", "H2", "DTYPE")
    dims_kwargs = dict(zip(dim_names, (B, S1, S2, N1, N2, D, H1, H2, DTYPE)))

    new_kwargs = {
        "keep_prob": 1,
        "scale": kwargs.get("scale", 1 / (D ** 0.5)),
        "sparse_mode": kwargs.get("sparse_mode", 0),
        "prefix": kwargs.get("prefix"),
        "pre_tockens": kwargs.get("pre_tockens", 2147483647),
        "next_tockens": kwargs.get("next_tockens", 2147483647),
        "pse": kwargs.get("pse"),
        "padding_mask": kwargs.get("padding_mask"),
        "atten_mask": kwargs.get("atten_mask"),
    }

    return args, dims_kwargs, new_kwargs
296
+
297
+
298
def npu_fusion_attention_backward_patch(*args, **kwargs):
    """Validate npu_fusion_attention_grad positional args and normalize kwargs.

    Expects args = (query, key, value, dy, head_num, input_layout). Returns
    (args, dims_kwargs, new_kwargs): the parsed dimensions and the keyword
    arguments filled with the operator's defaults.
    """
    if len(args) != 6:
        raise ValueError(f"Unsupported npu_fusion_attention_grad args {args}.")

    B, S1, S2, N1, N2, D, H1, H2, DTYPE = parse_bsnd_args(args[0], args[1], args[4], args[5])
    if N1 == N2 and S1 == S2:
        logger.info(f"running case : BNSD = {B}_{N1}_{S1}_{D}, sparse = {kwargs.get('sparse_mode', 0)}")
    else:
        logger.info(f"running case: BNSD = {B}_{N1}({N2})_{S1}({S2})_{D}, sparse = {kwargs.get('sparse_mode', 0)}")
    if N1 % N2 != 0 or N1 < N2:
        raise ValueError(f"N1与N2不匹配,请检查: N1 = {N1}, N2 = {N2}.")

    dim_names = ("B", "S1", "S2", "N1", "N2", "D", "H1", "H2", "DTYPE")
    dims_kwargs = dict(zip(dim_names, (B, S1, S2, N1, N2, D, H1, H2, DTYPE)))

    new_kwargs = {
        "keep_prob": 1,
        "scale_value": kwargs.get("scale_value", 1 / (D ** 0.5)),
        "sparse_mode": kwargs.get("sparse_mode", 0),
        "prefix": kwargs.get("prefix"),
        "pre_tockens": kwargs.get("pre_tockens", 2147483647),
        "next_tockens": kwargs.get("next_tockens", 2147483647),
        "pse": kwargs.get("pse"),
        "padding_mask": kwargs.get("padding_mask"),
        "softmax_max": kwargs.get("softmax_max"),
        "softmax_sum": kwargs.get("softmax_sum"),
        "softmax_in": kwargs.get("softmax_in"),
        "attention_in": kwargs.get("attention_in"),
        "seed": kwargs.get("seed", 0),
        "offset": kwargs.get("offset", 0),
        "numels": kwargs.get("numels", 0),
        "atten_mask": kwargs.get("atten_mask"),
    }

    return args, dims_kwargs, new_kwargs
331
+
332
+
333
def npu_fusion_attention(*args, **kwargs):
    """CPU golden implementation of the npu_fusion_attention forward.

    Parses args/kwargs via npu_fusion_attention_forward_patch, rebuilds the
    dense attention mask, normalizes q/k/v to (B, N, S, D), and runs the
    reference attention. Returns (attention_out, softmax_max, softmax_sum)
    on CPU; the max/sum statistics are repeated x8 along the last axis
    (presumably to match the NPU output layout — TODO confirm).
    """
    new_args, dims_kwargs, new_kwargs = npu_fusion_attention_forward_patch(*args, **kwargs)
    query, key, value, input_layout = new_args[0], new_args[1], new_args[2], new_args[4]
    N1 = dims_kwargs.get("N1")
    N2 = dims_kwargs.get("N2")
    S1 = dims_kwargs.get("S1")
    S2 = dims_kwargs.get("S2")
    B = dims_kwargs.get("B")
    DTYPE = dims_kwargs.get("DTYPE")
    atten_mask = new_kwargs.get("atten_mask")
    keep_prob = new_kwargs.get("keep_prob")
    sparse_mode = new_kwargs.get("sparse_mode")
    pre_tockens = new_kwargs.get("pre_tockens")
    next_tockens = new_kwargs.get("next_tockens")
    pse = new_kwargs.get("pse")
    scale = new_kwargs.get("scale")

    # Rebuild the dense mask and bring q/k/v into the canonical BNSD layout.
    atten_mask = generate_atten_mask(sparse_mode, atten_mask, B, N1, S1, S2, pre_tockens, next_tockens, DTYPE)
    query = convert_to_bnsd(query, N1, input_layout)
    key = convert_to_bnsd(key, N2, input_layout)
    value = convert_to_bnsd(value, N2, input_layout)
    # Broadcast grouped KV heads (N2) up to the query head count (N1).
    k_new, v_new = generate_kv(key, value, N1, N2)
    out_golden, softmax_max, softmax_sum = fusion_attention_forward(q=query, k=k_new, v=v_new,
                                                                    drop_mask=None, atten_mask=atten_mask,
                                                                    pse=pse, scale=scale,
                                                                    keep_prob=keep_prob)
    # Collapse a 5-D result back to 4-D by merging the two head-like axes.
    if out_golden.dim() == 5:
        out_golden = out_golden.reshape(out_golden.size(0), out_golden.size(1) * out_golden.size(2), out_golden.size(3),
                                        out_golden.size(4))
    out_golden = convert_from_bnsd(out_golden, input_layout)

    return out_golden.cpu(), softmax_max.repeat(1, 1, 1, 8).cpu(), softmax_sum.repeat(1, 1, 1, 8).cpu()
365
+
366
+
367
def npu_fusion_attention_grad(*args, **kwargs):
    """CPU golden implementation of the npu_fusion_attention backward.

    Expects positional args (query, key, value, dx, head_num, input_layout).
    Rebuilds the dense mask and the forward softmax, runs the reference
    backward, and returns (dq, dk, dv) on CPU in the caller's input_layout.
    """
    # dx, q, k, v, softmax_res, drop_mask, pse, scale, keep_prob
    new_args, dims_kwargs, new_kwargs = npu_fusion_attention_backward_patch(*args, **kwargs)
    query, key, value, dx, input_layout = new_args[0], new_args[1], new_args[2], new_args[3], new_args[5]
    N1 = dims_kwargs.get("N1")
    N2 = dims_kwargs.get("N2")
    S1 = dims_kwargs.get("S1")
    S2 = dims_kwargs.get("S2")
    B = dims_kwargs.get("B")
    D = dims_kwargs.get("D")
    DTYPE = dims_kwargs.get("DTYPE")
    atten_mask = new_kwargs.get("atten_mask")
    keep_prob = new_kwargs.get("keep_prob")
    sparse_mode = new_kwargs.get("sparse_mode")
    pre_tockens = new_kwargs.get("pre_tockens")
    next_tockens = new_kwargs.get("next_tockens")
    pse = new_kwargs.get("pse")
    softmax_max = new_kwargs.get("softmax_max")
    softmax_sum = new_kwargs.get("softmax_sum")
    scale_value = new_kwargs.get("scale_value")

    # Rebuild the dense mask and bring all tensors into the BNSD layout.
    atten_mask = generate_atten_mask(sparse_mode, atten_mask, B, N1, S1, S2, pre_tockens, next_tockens, DTYPE)
    query = convert_to_bnsd(query, N1, input_layout)
    dx = convert_to_bnsd(dx, N1, input_layout)
    key = convert_to_bnsd(key, N2, input_layout)
    value = convert_to_bnsd(value, N2, input_layout)
    k_new, v_new = generate_kv(key, value, N1, N2)

    # Rebuild the forward softmax either from Q/K directly or from the saved
    # max/sum statistics, depending on the module-level softmax_build_mode.
    if softmax_build_mode == "QKV":
        softmax_res = rebuid_softmax_by_qkv(query, k_new, atten_mask, pse, scale_value)
    else:
        softmax_res = rebuild_softmax_by_max_sum(query, k_new, atten_mask, pse, scale_value, softmax_max, softmax_sum)

    dq, dk, dv = fusion_attention_backward(dx, query, k_new, v_new, softmax_res, None, pse, scale_value, keep_prob)

    # Grouped heads (N1 != N2): sum the gradients of the broadcast KV heads
    # back onto the original N2 heads.
    if not (N1 == N2):
        if N2 == 0:
            raise ValueError("dims_kwargs.N2 must be non-zero.")
        G = int(N1 / N2)
        dk = torch.sum(dk.reshape(B, N2, G, S2, D), dim=2, keepdim=True).reshape(B, N2, S2, D)
        dv = torch.sum(dv.reshape(B, N2, G, S2, D), dim=2, keepdim=True).reshape(B, N2, S2, D)

    # Collapse any 5-D gradients back to 4-D by merging the head-like axes.
    if dq.dim() == 5:
        dq = dq.reshape(dq.size(0), dq.size(1) * dq.size(2), dq.size(3), dq.size(4))
    if dk.dim() == 5:
        dk = dk.reshape(dk.size(0), dk.size(1) * dk.size(2), dk.size(3), dk.size(4))
    if dv.dim() == 5:
        dv = dv.reshape(dv.size(0), dv.size(1) * dv.size(2), dv.size(3), dv.size(4))

    dq = convert_from_bnsd(dq, input_layout)
    dk = convert_from_bnsd(dk, input_layout)
    dv = convert_from_bnsd(dv, input_layout)

    return dq.cpu(), dk.cpu(), dv.cpu()
@@ -0,0 +1,15 @@
1
+ import torch
2
+
3
+
4
def npu_rms_norm(x, gamma, epsilon=1e-5):
    """CPU golden implementation of npu_rms_norm.

    Normalizes `x` by the reciprocal root-mean-square over the last axis and
    scales the result by `gamma`.

    Args:
        x: input tensor.
        gamma: scale tensor, broadcastable against `x`.
        epsilon: small constant added to the mean of squares for stability.

    Returns:
        Tuple `(result, rstd)` on CPU; `rstd` is cast to float32.
    """
    mean_square = torch.mean(torch.pow(x, 2), dim=-1, keepdim=True)
    rstd = torch.rsqrt(mean_square + epsilon)
    normalized = x * rstd * gamma
    return normalized.cpu(), rstd.float().cpu()
8
+
9
+
10
def npu_rms_norm_backward(grad, x, gamma, rstd):
    """CPU golden backward for npu_rms_norm.

    Args:
        grad: upstream gradient w.r.t. the forward output.
        x: forward input tensor.
        gamma: forward scale tensor.
        rstd: reciprocal RMS saved from the forward pass.

    Returns:
        Tuple `(grad_x, grad_gamma)` on CPU.

    NOTE(review): grad_gamma is returned elementwise (no reduction over
    batch dims) — presumably the caller reduces it; confirm against usage.
    """
    # Mean of grad * normalized-x over the last axis (projection term).
    proj = (grad * x * gamma * rstd).mean(dim=-1, keepdim=True)
    dx = (grad * gamma - x * rstd * proj) * rstd
    dgamma = x * grad * rstd
    return dx.cpu(), dgamma.cpu()
15
+
@@ -0,0 +1,52 @@
1
+ import torch
2
+
3
+
4
def npu_rotary_mul(x, r1, r2):
    """CPU golden implementation of npu_rotary_mul (rotary embedding).

    Splits `x` into two halves along the last axis, builds the rotated
    tensor (-x2, x1), and combines: r1 * x + r2 * rotated.

    Returns the result on CPU.
    """
    first_half, second_half = torch.chunk(x, 2, -1)
    rotated = torch.cat((-second_half, first_half), dim=-1)
    result = r1 * x + r2 * rotated
    return result.cpu()
9
+
10
+
11
def npu_rotary_mul_backward(dy_tensor, x, r1, r2):
    """CPU golden backward for npu_rotary_mul.

    Returns (dx, dr1, dr2) on CPU. dx is obtained via autograd on the
    recomputed forward formula; dr1/dr2 are accumulated manually by summing
    the per-element products over the axes on which r1/r2 have size 1
    (their broadcast axes).

    NOTE(review): mutates requires_grad on the caller's x, r1 and r2.
    NOTE(review): if none of the three broadcast layouts below matches
    (e.g. r1 has the same full shape as x), r1_grad/r2_grad are returned
    as zeros — confirm whether that case can occur for valid inputs.
    """
    x.requires_grad = True
    r1.requires_grad = True
    r2.requires_grad = True
    # golden: recompute the forward pass and let autograd produce x.grad
    x1, x2 = torch.chunk(x, 2, -1)
    x_new = torch.cat((-x2, x1), dim=-1)
    golden_tensor = r1 * x + r2 * x_new
    golden_tensor.backward(dy_tensor)
    r1_shape = r1.shape
    r1_grad = torch.zeros(r1_shape).type(torch.float32)
    r2_grad = torch.zeros(r1_shape).type(torch.float32)
    # fp32 rotated copy of x, used when accumulating the r2 gradient
    x1, x2 = torch.chunk(x.float(), 2, -1)
    x_new2 = torch.cat((-x2, x1), dim=-1)
    x_shape = x.shape
    h = x.float()
    grad = dy_tensor.float()
    # condition_1: r1 has size 1 on dims 0 and 2, matches x on dims 1 and 3.
    # NOTE(review): each `(A and B) or (A and C)` pair reduces to just
    # `r1_shape[d] == 1`; kept verbatim here.
    condition_1 = (((r1_shape[0] == 1 and x_shape[0] != 1) or (r1_shape[0] == 1 and x_shape[0] == 1)) and
                   ((r1_shape[2] == 1 and x_shape[2] != 1) or (r1_shape[2] == 1 and x_shape[2] == 1)) and
                   (r1_shape[1] == x_shape[1]) and (r1_shape[3] == x_shape[3]))
    # condition_2: r1 has size 1 on dims 0 and 1, matches x on dims 2 and 3
    condition_2 = (((r1_shape[0] == 1 and x_shape[0] != 1) or (r1_shape[0] == 1 and x_shape[0] == 1)) and
                   ((r1_shape[1] == 1 and x_shape[1] != 1) or (r1_shape[1] == 1 and x_shape[1] == 1)) and
                   (r1_shape[2] == x_shape[2]) and (r1_shape[3] == x_shape[3]))
    # condition_3: r1 has size 1 on dims 1 and 2, matches x on dims 0 and 3
    condition_3 = (((r1_shape[2] == 1 and x_shape[2] != 1) or (r1_shape[2] == 1 and x_shape[2] == 1)) and
                   ((r1_shape[1] == 1 and x_shape[1] != 1) or (r1_shape[1] == 1 and x_shape[1] == 1)) and
                   (r1_shape[0] == x_shape[0]) and (r1_shape[3] == x_shape[3]))
    if condition_1:
        # sum the elementwise products over the broadcast axes 0 and 2
        for i in range(x_shape[0]):
            for j in range(x_shape[2]):
                r2_grad[0, :, 0, :] += (x_new2[i, :, j, :] * grad[i, :, j, :])
                r1_grad[0, :, 0, :] += (h[i, :, j, :] * grad[i, :, j, :])
    elif condition_2:
        # sum the elementwise products over the broadcast axes 0 and 1
        for i in range(x_shape[0]):
            for j in range(x_shape[1]):
                r2_grad[0, 0, :, :] += (x_new2[i, j, :, :] * grad[i, j, :, :])
                r1_grad[0, 0, :, :] += (h[i, j, :, :] * grad[i, j, :, :])
    elif condition_3:
        # sum the elementwise products over the broadcast axes 1 and 2
        for i in range(x_shape[1]):
            for j in range(x_shape[2]):
                r2_grad[:, 0, 0, :] += (x_new2[:, i, j, :] * grad[:, i, j, :])
                r1_grad[:, 0, 0, :] += (h[:, i, j, :] * grad[:, i, j, :])
    return x.grad.cpu(), r1_grad.cpu(), r2_grad.cpu()
@@ -0,0 +1,26 @@
1
+ import torch
2
+
3
+
4
def npu_scaled_masked_softmax(x, mask, scale, fixed_triu_mask):
    """CPU golden implementation of npu_scaled_masked_softmax.

    Scales `x`, fills masked positions with a large negative value, and
    applies a numerically stable softmax over the last axis in fp32.

    Args:
        x: input logits tensor.
        mask: boolean mask; True positions are suppressed. When
            fixed_triu_mask is set, only its shape/device are used.
        scale: scalar multiplier applied to `x` before masking.
        fixed_triu_mask: if True, replace `mask` with a strictly
            upper-triangular mask of the same shape.

    Returns:
        Softmax result cast back to x's dtype, on CPU.
    """
    if fixed_triu_mask:
        # Bug fix: torch.triu takes `diagonal=`, not numpy's `k=`;
        # `k=1` raised TypeError whenever fixed_triu_mask was True.
        mask = (torch.triu(torch.ones(mask.shape), diagonal=1)).bool().to(mask.device)
    dtype = x.dtype
    x = (x * scale).masked_fill(mask, value=-10000)
    # Subtract the row max for numerical stability before exponentiating.
    x = x - torch.max(x, dim=-1, keepdims=True)[0]
    x = torch.exp(x.float())
    y = torch.div(x, torch.sum(x, dim=-1, keepdims=True))
    return y.to(dtype).cpu()
13
+
14
+
15
def npu_scaled_masked_softmax_backward(y_grad, y, mask, scale, fixed_triu_mask):
    """CPU golden backward for npu_scaled_masked_softmax.

    Computes the softmax backward pass in fp32:
        x_grad = (y_grad - sum(y_grad * y, dim=-1)) * y * scale
    then zeroes the masked positions.

    Args:
        y_grad: upstream gradient w.r.t. the softmax output.
        y: forward softmax output.
        mask: boolean mask; True positions get zero gradient. When
            fixed_triu_mask is set, only its shape/device are used.
        scale: scalar multiplier used in the forward pass.
        fixed_triu_mask: if True, rebuild the strictly upper-triangular mask.

    Returns:
        Gradient w.r.t. x, cast back to y_grad's dtype, on CPU.
    """
    if fixed_triu_mask:
        # Bug fix: torch.triu takes `diagonal=`, not numpy's `k=`;
        # `k=1` raised TypeError whenever fixed_triu_mask was True.
        mask = (torch.triu(torch.ones(mask.shape), diagonal=1)).bool().to(mask.device)
    dtype = y_grad.dtype
    y_grad = y_grad.float()
    y = y.float()
    x_grad = y_grad * y
    x_grad = y_grad - torch.sum(x_grad, dim=-1, keepdims=True)
    x_grad = x_grad * y
    x_grad = x_grad * scale
    x_grad = x_grad.masked_fill(mask, value=0)
    return x_grad.to(dtype).cpu()
@@ -0,0 +1,55 @@
1
+ import torch
2
+
3
+
4
def npu_swiglu(x, dim=-1):
    """CPU golden implementation of npu_swiglu.

    Splits `x` into (gate, value) halves along `dim` and returns
    SiLU(gate) * value.

    Args:
        x: input tensor; its size along `dim` must be even.
        dim: axis along which `x` is split.

    Returns:
        Result in x's dtype, on CPU.
    """
    dtype = x.dtype
    gate, value = torch.chunk(x, 2, dim=dim)
    if dtype == torch.float32:
        # silu(gate) * value, computed directly in fp32
        out = torch.sigmoid(gate * 1.0) * gate * value
    else:
        # Lower-precision dtypes: compute in fp32 and round the SiLU result
        # back through the input dtype first — presumably to mirror the NPU
        # kernel's intermediate rounding (TODO confirm).
        gate_f = gate.type(torch.float)
        value_f = value.type(torch.float)
        product = torch.nn.functional.silu(gate_f).type(dtype).type(torch.float32) * value_f
        out = product.type(dtype)
    return out.cpu()
18
+
19
+
20
def npu_swiglu_backward(grad, x, dim=-1):
    """CPU golden backward for npu_swiglu.

    Splits `x` into (gate, value) halves along `dim` and returns the
    concatenation of:
        d_gate  = value * swish'(gate) * grad
        d_value = grad * swish(gate)
    fp16/bf16 inputs are computed in fp32 with intermediate rounding back
    through the input dtype — presumably to mirror the NPU kernel's
    precision behavior (TODO confirm).
    """
    dtype = grad.dtype
    gate, value = torch.chunk(x, 2, dim=dim)

    if dtype == torch.float16:
        d_gate = torch.mul(
            torch.mul(value.type(torch.float32), swish_grad(1, gate.type(torch.float32))),
            grad.type(torch.float32)).type(torch.float16)
        d_value = torch.mul(grad.type(torch.float32),
                            swish(1, gate.type(torch.float32))).type(torch.float16)
        output = torch.cat((d_gate, d_value), dim)
    elif dtype == torch.bfloat16:
        gate_f = gate.type(torch.float)
        value_f = value.type(torch.float)
        grad_f = grad.type(torch.float)

        d_gate = torch.mul(grad_f, swish_grad(1.0, gate_f)).type(torch.bfloat16).type(
            torch.float32) * value_f
        d_value = swish(1.0, gate_f).type(torch.bfloat16).type(torch.float32) * grad_f
        combined = torch.cat((d_gate, d_value), dim=dim)
        output = combined.type(torch.bfloat16)
    else:
        d_gate = torch.mul(torch.mul(value, swish_grad(1.0, gate)), grad)
        d_value = torch.mul(grad, swish(1.0, gate))
        output = torch.cat((d_gate, d_value), dim)
    return output.cpu()
47
+
48
+
49
def swish_grad(beta, x):
    """Derivative of swish(beta, x) w.r.t. x: s + x * beta * s * (1 - s), where s = sigmoid(beta * x)."""
    s = torch.sigmoid(beta * x)
    return s + x * (1 - s) * s * beta
51
+
52
+
53
def swish(beta, x):
    """Swish activation: x * sigmoid(beta * x)."""
    return torch.sigmoid(beta * x) * x
55
+
@@ -1,5 +1,7 @@
1
1
  import json
2
+
2
3
  from msprobe.core.common.exceptions import ParseJsonException
4
+ from msprobe.core.common.file_check import FileOpen
3
5
 
4
6
 
5
7
  def parse_json_info_forward_backward(json_path):
@@ -11,7 +13,7 @@ def parse_json_info_forward_backward(json_path):
11
13
  api_name = '.'.join(name_struct[:-1])
12
14
  return api_name
13
15
 
14
- with open(json_path, 'r') as f:
16
+ with FileOpen(json_path, 'r') as f:
15
17
  dump_json = json.load(f)
16
18
 
17
19
  real_data_path = dump_json.get("dump_data_dir")