mindstudio-probe 1.3.0__py3-none-any.whl → 8.1.1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (213)
  1. {mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/METADATA +4 -2
  2. {mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/RECORD +204 -152
  3. msprobe/README.md +32 -1
  4. msprobe/core/__init__.py +17 -0
  5. msprobe/core/common/const.py +120 -21
  6. msprobe/core/common/exceptions.py +2 -2
  7. msprobe/core/common/file_utils.py +279 -50
  8. msprobe/core/common/framework_adapter.py +169 -0
  9. msprobe/core/common/global_lock.py +86 -0
  10. msprobe/core/common/runtime.py +25 -0
  11. msprobe/core/common/utils.py +136 -45
  12. msprobe/core/common_config.py +7 -0
  13. msprobe/core/compare/acc_compare.py +646 -428
  14. msprobe/core/compare/check.py +36 -103
  15. msprobe/core/compare/compare_cli.py +4 -0
  16. msprobe/core/compare/config.py +72 -0
  17. msprobe/core/compare/highlight.py +215 -215
  18. msprobe/core/compare/layer_mapping/layer_mapping.py +2 -0
  19. msprobe/core/compare/merge_result/merge_result.py +4 -4
  20. msprobe/core/compare/multiprocessing_compute.py +223 -110
  21. msprobe/core/compare/npy_compare.py +2 -4
  22. msprobe/core/compare/utils.py +214 -244
  23. msprobe/core/config_check/__init__.py +17 -0
  24. msprobe/{pytorch/dump/kernel_dump/kernel_config.py → core/config_check/checkers/__init__.py} +8 -16
  25. msprobe/core/config_check/checkers/base_checker.py +60 -0
  26. msprobe/core/config_check/checkers/dataset_checker.py +138 -0
  27. msprobe/core/config_check/checkers/env_args_checker.py +96 -0
  28. msprobe/core/config_check/checkers/hyperparameter_checker.py +170 -0
  29. msprobe/core/config_check/checkers/pip_checker.py +90 -0
  30. msprobe/core/config_check/checkers/random_checker.py +367 -0
  31. msprobe/core/config_check/checkers/weights_checker.py +147 -0
  32. msprobe/core/config_check/ckpt_compare/ckpt_comparator.py +74 -0
  33. msprobe/core/config_check/ckpt_compare/megatron_loader.py +302 -0
  34. msprobe/core/config_check/ckpt_compare/metrics.py +83 -0
  35. msprobe/core/config_check/ckpt_compare/name_mapping.yaml +12 -0
  36. msprobe/core/config_check/config_check_cli.py +51 -0
  37. msprobe/core/config_check/config_checker.py +100 -0
  38. msprobe/{mindspore/runtime.py → core/config_check/resource/dependency.yaml} +7 -4
  39. msprobe/core/config_check/resource/env.yaml +57 -0
  40. msprobe/core/config_check/resource/hyperparameter.yaml +21 -0
  41. msprobe/core/config_check/utils/hyperparameter_parser.py +115 -0
  42. msprobe/core/config_check/utils/utils.py +107 -0
  43. msprobe/core/data_dump/api_registry.py +67 -4
  44. msprobe/core/data_dump/data_collector.py +170 -89
  45. msprobe/core/data_dump/data_processor/base.py +72 -51
  46. msprobe/core/data_dump/data_processor/mindspore_processor.py +109 -55
  47. msprobe/core/data_dump/data_processor/pytorch_processor.py +90 -82
  48. msprobe/core/data_dump/json_writer.py +143 -27
  49. msprobe/core/debugger/precision_debugger.py +144 -0
  50. msprobe/core/grad_probe/constant.py +1 -1
  51. msprobe/core/grad_probe/grad_compare.py +1 -1
  52. msprobe/core/grad_probe/utils.py +1 -1
  53. msprobe/core/hook_manager.py +242 -0
  54. msprobe/core/monitor/anomaly_processor.py +384 -0
  55. msprobe/core/service.py +357 -0
  56. msprobe/core/single_save/__init__.py +0 -0
  57. msprobe/core/single_save/single_comparator.py +243 -0
  58. msprobe/core/single_save/single_saver.py +146 -0
  59. msprobe/docs/01.installation.md +6 -5
  60. msprobe/docs/02.config_introduction.md +79 -22
  61. msprobe/docs/03.config_examples.md +1 -0
  62. msprobe/docs/04.kernel_dump_PyTorch.md +1 -1
  63. msprobe/docs/05.data_dump_PyTorch.md +118 -49
  64. msprobe/docs/06.data_dump_MindSpore.md +167 -20
  65. msprobe/docs/07.accuracy_checker_PyTorch.md +2 -2
  66. msprobe/docs/08.accuracy_checker_online_PyTorch.md +69 -9
  67. msprobe/docs/09.accuracy_checker_MindSpore.md +18 -6
  68. msprobe/docs/10.accuracy_compare_PyTorch.md +212 -74
  69. msprobe/docs/11.accuracy_compare_MindSpore.md +87 -37
  70. msprobe/docs/12.overflow_check_PyTorch.md +2 -2
  71. msprobe/docs/13.overflow_check_MindSpore.md +2 -2
  72. msprobe/docs/14.data_parse_PyTorch.md +3 -3
  73. msprobe/docs/17.grad_probe.md +2 -1
  74. msprobe/docs/18.online_dispatch.md +2 -2
  75. msprobe/docs/19.monitor.md +90 -44
  76. msprobe/docs/21.visualization_PyTorch.md +68 -15
  77. msprobe/docs/22.visualization_MindSpore.md +71 -18
  78. msprobe/docs/25.tool_function_introduction.md +23 -22
  79. msprobe/docs/26.data_dump_PyTorch_baseline.md +14 -3
  80. msprobe/docs/27.dump_json_instruction.md +1 -1
  81. msprobe/docs/28.debugger_save_instruction.md +111 -20
  82. msprobe/docs/29.data_dump_MSAdapter.md +2 -2
  83. msprobe/docs/30.overflow_check_MSAdapter.md +2 -2
  84. msprobe/docs/31.config_check.md +95 -0
  85. msprobe/docs/32.ckpt_compare.md +69 -0
  86. msprobe/docs/33.generate_operator_MindSpore.md +181 -0
  87. msprobe/docs/34.RL_collect.md +92 -0
  88. msprobe/docs/35.nan_analyze.md +72 -0
  89. msprobe/docs/data_dump_MindSpore/data_dump_MindSpore_baseline.md +12 -1
  90. msprobe/docs/data_dump_MindSpore/dynamic_graph_quick_start_example.md +3 -1
  91. msprobe/docs/img/compare_result.png +0 -0
  92. msprobe/docs/img/save_compare_result_sample.png +0 -0
  93. msprobe/docs/img/visualization/proxy.png +0 -0
  94. msprobe/mindspore/__init__.py +1 -2
  95. msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +150 -58
  96. msprobe/mindspore/api_accuracy_checker/api_runner.py +7 -3
  97. msprobe/mindspore/api_accuracy_checker/bench_functions/flash_attention_score.py +47 -69
  98. msprobe/mindspore/api_accuracy_checker/cmd_parser.py +4 -0
  99. msprobe/mindspore/api_accuracy_checker/compute_element.py +0 -1
  100. msprobe/mindspore/api_accuracy_checker/data_manager.py +2 -2
  101. msprobe/mindspore/api_accuracy_checker/generate_op_script/op_generator.py +460 -0
  102. msprobe/mindspore/api_accuracy_checker/generate_op_script/operator_replication.template +2081 -0
  103. msprobe/mindspore/api_accuracy_checker/multi_api_accuracy_checker.py +9 -0
  104. msprobe/mindspore/api_accuracy_checker/torch_mindtorch_importer.py +2 -1
  105. msprobe/mindspore/cell_processor.py +204 -33
  106. msprobe/mindspore/code_mapping/graph_parser.py +4 -21
  107. msprobe/mindspore/common/const.py +17 -7
  108. msprobe/mindspore/common/utils.py +128 -11
  109. msprobe/mindspore/compare/common_dir_compare.py +382 -0
  110. msprobe/mindspore/compare/distributed_compare.py +2 -26
  111. msprobe/mindspore/compare/ms_compare.py +17 -405
  112. msprobe/mindspore/compare/ms_graph_compare.py +14 -5
  113. msprobe/mindspore/compare/utils.py +37 -0
  114. msprobe/mindspore/debugger/debugger_config.py +53 -3
  115. msprobe/mindspore/debugger/precision_debugger.py +72 -91
  116. msprobe/mindspore/dump/cell_dump_process.py +877 -0
  117. msprobe/mindspore/dump/cell_dump_with_insert_gradient.py +864 -0
  118. msprobe/mindspore/dump/dump_tool_factory.py +13 -5
  119. msprobe/mindspore/dump/graph_mode_cell_dump.py +139 -0
  120. msprobe/mindspore/dump/graph_tensor_dump.py +123 -0
  121. msprobe/mindspore/dump/hook_cell/api_register.py +40 -6
  122. msprobe/mindspore/dump/hook_cell/hook_cell.py +18 -7
  123. msprobe/mindspore/dump/hook_cell/ms_hook_manager.py +88 -0
  124. msprobe/mindspore/dump/hook_cell/primitive_hooks.py +8 -2
  125. msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml +18 -0
  126. msprobe/mindspore/dump/jit_dump.py +21 -18
  127. msprobe/mindspore/dump/kernel_kbyk_dump.py +6 -3
  128. msprobe/mindspore/dym_loader/hook_dynamic_loader.cpp +110 -0
  129. msprobe/mindspore/dym_loader/hook_dynamic_loader.h +15 -15
  130. msprobe/mindspore/free_benchmark/api_pynative_self_check.py +12 -6
  131. msprobe/mindspore/free_benchmark/common/utils.py +1 -1
  132. msprobe/mindspore/grad_probe/global_context.py +7 -2
  133. msprobe/mindspore/grad_probe/grad_stat_csv.py +3 -2
  134. msprobe/mindspore/mindspore_service.py +114 -0
  135. msprobe/mindspore/monitor/common_func.py +52 -0
  136. msprobe/mindspore/monitor/data_writers.py +237 -0
  137. msprobe/mindspore/monitor/features.py +20 -7
  138. msprobe/mindspore/monitor/module_hook.py +281 -209
  139. msprobe/mindspore/monitor/optimizer_collect.py +334 -0
  140. msprobe/mindspore/monitor/utils.py +25 -5
  141. msprobe/mindspore/ms_config.py +16 -15
  142. msprobe/mindspore/task_handler_factory.py +5 -2
  143. msprobe/msprobe.py +19 -0
  144. msprobe/nan_analyze/__init__.py +14 -0
  145. msprobe/nan_analyze/analyzer.py +255 -0
  146. msprobe/nan_analyze/graph.py +189 -0
  147. msprobe/nan_analyze/utils.py +211 -0
  148. msprobe/pytorch/api_accuracy_checker/common/config.py +2 -2
  149. msprobe/pytorch/api_accuracy_checker/compare/compare.py +36 -34
  150. msprobe/pytorch/api_accuracy_checker/compare/compare_utils.py +20 -20
  151. msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +4 -7
  152. msprobe/pytorch/api_accuracy_checker/generate_op_script/operator_replication.template +204 -2
  153. msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +12 -11
  154. msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +1 -0
  155. msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +8 -5
  156. msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py +2 -3
  157. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +29 -13
  158. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +12 -2
  159. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +45 -31
  160. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/utils.py +156 -0
  161. msprobe/pytorch/attl_manager.py +65 -0
  162. msprobe/pytorch/bench_functions/npu_fusion_attention.py +27 -0
  163. msprobe/pytorch/common/utils.py +26 -14
  164. msprobe/pytorch/compare/distributed_compare.py +4 -36
  165. msprobe/pytorch/compare/pt_compare.py +13 -84
  166. msprobe/pytorch/compare/utils.py +47 -0
  167. msprobe/pytorch/debugger/debugger_config.py +34 -17
  168. msprobe/pytorch/debugger/precision_debugger.py +66 -118
  169. msprobe/pytorch/dump/module_dump/hook_wrapper.py +93 -0
  170. msprobe/pytorch/dump/module_dump/module_dump.py +11 -58
  171. msprobe/pytorch/dump/module_dump/module_processer.py +143 -113
  172. msprobe/pytorch/grad_probe/grad_stat_csv.py +3 -2
  173. msprobe/pytorch/hook_module/api_register.py +29 -5
  174. msprobe/pytorch/hook_module/hook_module.py +9 -18
  175. msprobe/pytorch/hook_module/jit_script_wrapper.py +33 -0
  176. msprobe/pytorch/hook_module/pt_hook_manager.py +68 -0
  177. msprobe/pytorch/hook_module/support_wrap_ops.yaml +22 -1
  178. msprobe/pytorch/hook_module/utils.py +28 -2
  179. msprobe/pytorch/monitor/csv2tb.py +6 -2
  180. msprobe/pytorch/monitor/data_writers.py +259 -0
  181. msprobe/pytorch/monitor/module_hook.py +227 -158
  182. msprobe/pytorch/monitor/module_metric.py +14 -0
  183. msprobe/pytorch/monitor/optimizer_collect.py +242 -270
  184. msprobe/pytorch/monitor/utils.py +16 -3
  185. msprobe/pytorch/online_dispatch/dispatch.py +4 -2
  186. msprobe/pytorch/online_dispatch/dump_compare.py +5 -2
  187. msprobe/pytorch/parse_tool/lib/utils.py +3 -3
  188. msprobe/pytorch/pt_config.py +8 -7
  189. msprobe/pytorch/pytorch_service.py +73 -0
  190. msprobe/visualization/builder/graph_builder.py +33 -13
  191. msprobe/visualization/builder/msprobe_adapter.py +24 -11
  192. msprobe/visualization/compare/graph_comparator.py +53 -45
  193. msprobe/visualization/compare/mode_adapter.py +31 -1
  194. msprobe/visualization/graph/base_node.py +3 -3
  195. msprobe/visualization/graph/graph.py +2 -2
  196. msprobe/visualization/graph_service.py +250 -103
  197. msprobe/visualization/utils.py +27 -11
  198. msprobe/mindspore/dym_loader/hook_dynamic_loader.cc +0 -106
  199. msprobe/mindspore/monitor/anomaly_detect.py +0 -404
  200. msprobe/mindspore/monitor/module_spec_verifier.py +0 -94
  201. msprobe/mindspore/service.py +0 -549
  202. msprobe/pytorch/monitor/anomaly_analyse.py +0 -201
  203. msprobe/pytorch/monitor/anomaly_detect.py +0 -410
  204. msprobe/pytorch/monitor/module_spec_verifier.py +0 -95
  205. msprobe/pytorch/monitor/unittest/test_monitor.py +0 -160
  206. msprobe/pytorch/service.py +0 -473
  207. {mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/LICENSE +0 -0
  208. {mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/WHEEL +0 -0
  209. {mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/entry_points.txt +0 -0
  210. {mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/top_level.txt +0 -0
  211. /msprobe/{mindspore → core}/compare/ms_to_pt_api.yaml +0 -0
  212. /msprobe/{mindspore/dump → core}/kernel_dump/kernel_config.py +0 -0
  213. /msprobe/{pytorch/monitor/unittest → core/monitor}/__init__.py +0 -0
@@ -14,8 +14,10 @@
  # limitations under the License.

  import os
+ from dataclasses import dataclass
+ from typing import Any, Optional
  from tqdm import tqdm
-
+ import numpy as np
  from msprobe.core.common.const import Const, CompareConst
  from msprobe.core.common.file_utils import FileOpen, create_directory, write_csv, load_json, load_yaml
  from msprobe.core.common.utils import add_time_as_suffix
@@ -28,6 +30,9 @@ from msprobe.mindspore.api_accuracy_checker.utils import (check_and_get_from_jso
  from msprobe.mindspore.common.const import MsCompareConst
  from msprobe.mindspore.common.log import logger
  from msprobe.mindspore.api_accuracy_checker import torch_mindtorch_importer
+ from msprobe.core.data_dump.data_collector import build_data_collector
+ from msprobe.core.common.utils import Const, print_tools_ends_info, DumpPathAggregation
+ from msprobe.core.data_dump.data_processor.base import ModuleForwardInputsOutputs, ModuleBackwardInputsOutputs

  cur_path = os.path.dirname(os.path.realpath(__file__))
  yaml_path = os.path.join(cur_path, MsCompareConst.SUPPORTED_API_LIST_FILE)
@@ -59,13 +64,129 @@ class ProcessResultPacket:
  self.err_msg = err_msg


+ @dataclass
+ class Config:
+ execution_mode: str
+ dump_path: str
+ task: str
+ level: str
+ scope: Optional[Any]
+ list: Optional[Any]
+ framework: str
+ data_mode: str
+ file_format: str
+ dump_tensor_data_dir: str
+ async_dump: bool
+ summary_mode: Optional[Any] = None
+
+
  class ApiAccuracyChecker:
  def __init__(self, args):
  self.api_infos = dict()
  self.data_manager = DataManager(args.out_path, args.result_csv_path) # instantiate DataManager at initialization
+ self.save_error_data = args.save_error_data
+ if self.save_error_data:
+ config, dump_path_aggregation = self.init_save_error_data(args)
+ self.data_collector = build_data_collector(config)
+ self.data_collector.update_dump_paths(dump_path_aggregation)

  @staticmethod
- def run_and_compare_helper(api_info, api_name_str, api_input_aggregation, forward_or_backward):
+ def init_save_error_data(args):
+ config = Config(
+ execution_mode="pynative",
+ dump_path=f"{args.out_path}",
+ dump_tensor_data_dir=f"{args.out_path}",
+ task="tensor", # task type; emulates saving tensor data
+ level="L1", # level
+ scope=None, # scope (None)
+ list=None, # API list (None)
+ framework=Const.MS_FRAMEWORK, # framework type
+ data_mode="all",
+ file_format="npy",
+ async_dump=False
+ )
+
+ dump_dir = f"{args.out_path}"
+ dump_data_dir = os.path.join(dump_dir, "error_data")
+ create_directory(dump_data_dir)
+ dump_path_aggregation = DumpPathAggregation()
+ dump_path_aggregation.dump_file_path = os.path.join(dump_dir, "dump.json")
+ dump_path_aggregation.stack_file_path = os.path.join(dump_dir, "stack.json")
+ dump_path_aggregation.dump_error_info_path = os.path.join(dump_dir, "dump_error_info.log")
+ dump_path_aggregation.dump_tensor_data_dir = dump_data_dir
+ return config, dump_path_aggregation
+
+ @staticmethod
+ def prepare_api_input_aggregation(api_info, forward_or_backward=Const.FORWARD):
+ """
+ Args:
+ api_info: ApiInfo
+ forward_or_backward: str
+ Returns:
+ ApiInputAggregation
+ """
+ forward_inputs = api_info.get_compute_element_list(Const.FORWARD, Const.INPUT)
+ kwargs = api_info.get_kwargs()
+ if forward_or_backward == Const.FORWARD:
+ gradient_inputs = None
+ else:
+ gradient_inputs = api_info.get_compute_element_list(Const.BACKWARD, Const.INPUT)
+ return ApiInputAggregation(forward_inputs, kwargs, gradient_inputs)
+
+ @staticmethod
+ def is_api_checkable(api_name_str):
+ '''
+ Args:
+ api_name_str: str, e.g. "MintFunctional.relu.0.forward", key in data field of api_info.json
+ Returns:
+ is_checkable: bool
+ Description:
+ tell whether this api is checkable based on the key in "data" dict in api_info.json
+ '''
+ api_name_str_list = api_name_str.split(Const.SEP)
+ if len(api_name_str_list) < MsCompareConst.API_NAME_STR_LENGTH:
+ return False
+ api_type_str = api_name_str_list[0]
+ real_api_str = Const.SEP.join(api_name_str_list[1:-2])
+ api_list = load_yaml(yaml_path)
+ supported_tensor_api_list = api_list.get(MsCompareConst.SUPPORTED_TENSOR_LIST_KEY)
+ supported_fusion_api_list = MsCompareConst.SUPPORTED_FUSION_LIST
+ if api_type_str in (MsCompareConst.MINT, MsCompareConst.MINT_FUNCTIONAL) \
+ and global_context.get_framework() == Const.MS_FRAMEWORK:
+ return True
+ if api_type_str in MsCompareConst.MT_VALID_API_TYPES \
+ and global_context.get_framework() == Const.MT_FRAMEWORK:
+ return True
+ if api_type_str == MsCompareConst.TENSOR_API and real_api_str in supported_tensor_api_list \
+ and global_context.get_framework() == Const.MS_FRAMEWORK:
+ return True
+ if api_type_str == MsCompareConst.FUNCTIONAL_API and real_api_str in supported_fusion_api_list \
+ and global_context.get_framework() == Const.MS_FRAMEWORK:
+ return True
+ return False
+
+ def post_forward_hook(self, api_or_module_name, primitive_instance, args, kwargs, output):
+ self.data_collector.update_api_or_module_name(api_or_module_name)
+ module_input_output = ModuleForwardInputsOutputs(args=args, kwargs=kwargs, output=output)
+ self.data_collector.forward_data_collect_only_tensor(
+ api_or_module_name,
+ primitive_instance,
+ os.getpid(),
+ module_input_output
+ )
+
+ def backward_hook(self, api_or_module_name, module, grad_input, grad_output):
+ self.data_collector.update_api_or_module_name(api_or_module_name)
+
+ module_input_output = ModuleBackwardInputsOutputs(grad_input=grad_output, grad_output=grad_input)
+ self.data_collector.backward_data_collect_only_tensor(
+ api_or_module_name,
+ module,
+ os.getpid(),
+ module_input_output
+ )
+
+ def run_and_compare_helper(self, api_info, api_name_str, api_input_aggregation, forward_or_backward):
  """
  Args:
  api_info: ApiInfo
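Note: a minimal sketch, outside the diff, of what the name parsing in is_api_checkable does to the documented example key, assuming Const.SEP is "." (neither that constant's value nor MsCompareConst.API_NAME_STR_LENGTH is shown in this hunk):

    parts = "MintFunctional.relu.0.forward".split(".")  # ["MintFunctional", "relu", "0", "forward"]
    api_type_str = parts[0]                              # "MintFunctional" -> matched against MINT / MINT_FUNCTIONAL etc.
    real_api_str = ".".join(parts[1:-2])                 # "relu" -> looked up in the supported tensor/fusion lists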
@@ -83,13 +204,22 @@ class ApiAccuracyChecker:
  """
  # get output
  if global_context.get_is_constructed():
- # constructed situation, need use constructed input to run mindspore api getting tested_output
- tested_outputs = api_runner(api_input_aggregation, api_name_str,
- forward_or_backward, global_context.get_framework())
+ if forward_or_backward == Const.FORWARD:
+ tested_outputs, inputs, kwargs, forward_result_tuple = api_runner(api_input_aggregation, api_name_str,
+ forward_or_backward,
+ global_context.get_framework())
+ elif forward_or_backward == Const.BACKWARD:
+ tested_outputs, gradient_inputs, backward_result_tuple = api_runner(api_input_aggregation, api_name_str,
+ forward_or_backward,
+ global_context.get_framework())
+ else:
+ tested_outputs = api_runner(api_input_aggregation, api_name_str,
+ forward_or_backward, global_context.get_framework())
  else:
  tested_outputs = api_info.get_compute_element_list(forward_or_backward, Const.OUTPUT)

  bench_outputs = api_runner(api_input_aggregation, api_name_str, forward_or_backward, Const.PT_FRAMEWORK)
+
  tested_outputs = trim_output_compute_element_list(tested_outputs, forward_or_backward)
  bench_outputs = trim_output_compute_element_list(bench_outputs, forward_or_backward)
  if len(tested_outputs) != len(bench_outputs):
@@ -114,64 +244,26 @@ class ApiAccuracyChecker:
  compare_result_dict.get(CompareConst.MAX_ABS_ERR).pass_status == CompareConst.PASS:
  status = CompareConst.PASS
  err_msg = ""
+
  else:
  status = CompareConst.ERROR
  err_msg = (compare_result_dict.get(CompareConst.COSINE).err_msg +
  compare_result_dict.get(CompareConst.MAX_ABS_ERR).err_msg)
+ if forward_or_backward == Const.FORWARD and self.save_error_data \
+ and global_context.get_is_constructed():
+ api_name_str_backward = f"{api_name_str}{Const.SEP}{Const.FORWARD}"
+ self.post_forward_hook(api_name_str_backward, None, inputs, kwargs, forward_result_tuple)
+
+ if forward_or_backward == Const.BACKWARD and self.save_error_data \
+ and global_context.get_is_constructed():
+ api_name_str_backward = f"{api_name_str}{Const.SEP}{Const.BACKWARD}"
+ self.backward_hook(api_name_str_backward, None, gradient_inputs, backward_result_tuple)
+
  basic_info_status = \
  BasicInfoAndStatus(api_name_with_slot, bench_dtype, tested_dtype, shape, status, err_msg)
  output_list.append(tuple([api_name_str, forward_or_backward, basic_info_status, compare_result_dict]))
  return output_list

- @staticmethod
- def prepare_api_input_aggregation(api_info, forward_or_backward=Const.FORWARD):
- """
- Args:
- api_info: ApiInfo
- forward_or_backward: str
- Returns:
- ApiInputAggregation
- """
- forward_inputs = api_info.get_compute_element_list(Const.FORWARD, Const.INPUT)
- kwargs = api_info.get_kwargs()
- if forward_or_backward == Const.FORWARD:
- gradient_inputs = None
- else:
- gradient_inputs = api_info.get_compute_element_list(Const.BACKWARD, Const.INPUT)
- return ApiInputAggregation(forward_inputs, kwargs, gradient_inputs)
-
- @staticmethod
- def is_api_checkable(api_name_str):
- '''
- Args:
- api_name_str: str, e.g. "MintFunctional.relu.0.forward", key in data field of api_info.json
- Returns:
- is_checkable: bool
- Description:
- tell whether this api is checkable based on the key in "data" dict in api_info.json
- '''
- api_name_str_list = api_name_str.split(Const.SEP)
- if len(api_name_str_list) < MsCompareConst.API_NAME_STR_LENGTH:
- return False
- api_type_str = api_name_str_list[0]
- real_api_str = Const.SEP.join(api_name_str_list[1:-2])
- api_list = load_yaml(yaml_path)
- supported_tensor_api_list = api_list.get(MsCompareConst.SUPPORTED_TENSOR_LIST_KEY)
- supported_fusion_api_list = MsCompareConst.SUPPORTED_FUSION_LIST
- if api_type_str in (MsCompareConst.MINT, MsCompareConst.MINT_FUNCTIONAL) \
- and global_context.get_framework() == Const.MS_FRAMEWORK:
- return True
- if api_type_str in MsCompareConst.MT_VALID_API_TYPES \
- and global_context.get_framework() == Const.MT_FRAMEWORK:
- return True
- if api_type_str == MsCompareConst.TENSOR_API and real_api_str in supported_tensor_api_list \
- and global_context.get_framework() == Const.MS_FRAMEWORK:
- return True
- if api_type_str == MsCompareConst.FUNCTIONAL_API and real_api_str in supported_fusion_api_list \
- and global_context.get_framework() == Const.MS_FRAMEWORK:
- return True
- return False
-
  def parse(self, api_info_path):

  api_info_dict = load_json(api_info_path)
@@ -183,9 +275,9 @@ class ApiAccuracyChecker:
  MsCompareConst.TENSOR_TASK))
  try:
  framework = check_and_get_from_json_dict(api_info_dict, MsCompareConst.FRAMEWORK,
- "framework field in api_info.json", accepted_type=str,
- accepted_value=(Const.MS_FRAMEWORK,
- Const.MT_FRAMEWORK))
+ "framework field in api_info.json", accepted_type=str,
+ accepted_value=(Const.MS_FRAMEWORK,
+ Const.MT_FRAMEWORK))
  except Exception as e:
  framework = Const.MS_FRAMEWORK
  logger.warning(f"JSON parsing error in framework field: {e}")
@@ -301,4 +393,4 @@ class ApiAccuracyChecker:
  elif process_result_packet.process_status == MsCompareConst.ProcessStatus.EXCEPTION_SKIP:
  self.data_manager.record_exception_skip(api_name_str, Const.BACKWARD, process_result_packet.err_msg)

- self.data_manager.save_results(api_name_str)
+ self.data_manager.save_results(api_name_str)
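Note: based on the paths assembled in init_save_error_data above, running with -save_error_data should roughly leave the following layout under the chosen out_path (an illustration inferred from this diff, not output copied from the tool):

    <out_path>/
        dump.json              # dump_file_path
        stack.json             # stack_file_path
        dump_error_info.log    # dump_error_info_path
        error_data/            # dump_tensor_data_dir, npy tensors for failing APIs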
@@ -13,6 +13,8 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.

+ import os
+ import numpy as np
  import mindspore
  from mindspore import ops
  from msprobe.core.common.const import Const
@@ -38,7 +40,6 @@ else:
  import torch


-
  class ApiInputAggregation:
  def __init__(self, inputs, kwargs, gradient_inputs) -> None:
  """
@@ -148,13 +149,13 @@ class ApiRunner:
  Args:
  api_type_str: str, Union["MintFunctional", "Mint", "Tensor", "Functional"]
  api_sub_name: str, e.g. "relu"
- api_platform: str: Union["mindpore", "pytorch"]
+ api_platform: str: Union["mindspore", "pytorch"]

  Return:
  api_instance: function object

  Description:
- get mindspore.mint/torch api fucntion
+ get mindspore.mint/torch api function
  mindspore.mint.{api_sub_name} <--> torch.{api_sub_name}
  mindspore.mint.nn.functional.{api_sub_name} <--> torch.nn.functional.{api_sub_name}
  """
@@ -189,6 +190,8 @@ class ApiRunner:
  forward_result = api_instance(*inputs, **kwargs) # can be single tensor or tuple
  forward_result_tuple = convert_to_tuple(forward_result)
  res_compute_element_list = [ComputeElement(parameter=api_res) for api_res in forward_result_tuple]
+ if api_platform == Const.MS_FRAMEWORK or api_platform == Const.MT_FRAMEWORK:
+ return res_compute_element_list, inputs, kwargs, forward_result_tuple
  else:
  if gradient_inputs is None:
  err_msg = f"ApiRunner.run_api failed: run backward api but gradient_inputs is missing"
@@ -206,6 +209,7 @@ class ApiRunner:
  backward_result = grad_func(*inputs, gradient_inputs) # can be single tensor or tuple
  backward_result_tuple = convert_to_tuple(backward_result)
  res_compute_element_list = [ComputeElement(parameter=api_res) for api_res in backward_result_tuple]
+ return res_compute_element_list, gradient_inputs, backward_result_tuple
  else:
  # set requires_grad
  requires_grad_index = []
@@ -52,8 +52,14 @@ def softmax_grad(dp, softmax_res):


  def broadcast_kv(num_heads, num_kv_heads, kv_tensor, dtype):
+ # check dimensions
+ if kv_tensor.dim() != 4:
+ raise ValueError(f"broadcast_kv: kv_tensor must be 4-dimensional (B, N_kv, S, D), but got {kv_tensor.shape}")
  if num_kv_heads == 0 or num_kv_heads > num_heads:
- raise ValueError(f"num_kv_heads must be non-zero and bigger than num_heads.")
+ raise ValueError("broadcast_kv: num_kv_heads must be greater than 0 and must not exceed num_heads")
+ if num_heads % num_kv_heads != 0:
+ raise ValueError(f"broadcast_kv: num_heads({num_heads}) must be divisible by num_kv_heads({num_kv_heads}).")
+

  factor = num_heads // num_kv_heads
  kv_shape = kv_tensor.shape
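Note: a quick worked example of the grouped-KV broadcast these checks protect; the expansion itself happens in the rest of broadcast_kv, which this hunk does not show:

    factor = 32 // 8   # num_heads = 32, num_kv_heads = 8 -> each KV head serves 4 query heads
    # (B, 8, S, D) is then expanded along the head axis to (B, 32, S, D)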
@@ -68,6 +74,13 @@ def broadcast_kv(num_heads, num_kv_heads, kv_tensor, dtype):


  def calculate_qk(q, k, attn_mask, pse, scalar_value):
+ # basic shape checks
+ if q.dim() < 4 or k.dim() < 4:
+ raise ValueError(f"calculate_qk: q and k must have at least 4 dimensions, q={q.dim()}, k={k.dim()}")
+ # check head_dim consistency
+ if q.size(-1) != k.size(-1):
+ raise ValueError(f"calculate_qk: q.head_dim({q.size(-1)}) != k.head_dim({k.size(-1)})")
+
  if k.dim() != 4:
  raise ValueError(f"k tensor dimension must be 4, but got {k.dim()} dimensions (shape: {k.shape})")

@@ -95,6 +108,10 @@ def fusion_attention_forward(forward_params):
95
108
  scalar_value = forward_params.scalar_value
96
109
  keep_prob = forward_params.keep_prob
97
110
 
111
+ # 拦截 keep_prob 为 0 的情况,防止除零
112
+ if keep_prob == 0:
113
+ raise ValueError("fusion_attention_forward: keep_prob 不能为 0,避免除零错误。")
114
+
98
115
  qk = calculate_qk(q, k, attn_mask, pse, scalar_value)
99
116
  softmax_res, softmax_max, softmax_sum = softmax_forward(qk)
100
117
  if drop_mask is None or len(drop_mask.shape) == 0:
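Note: why the guard matters, as a minimal sketch. The dropout branch sits outside this hunk, so the exact expression is an assumption, but inverted dropout rescales by 1 / keep_prob along the lines of:

    drop_res = softmax_res * drop_mask * (1.0 / keep_prob)   # keep_prob == 0 would divide by zero here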
@@ -115,6 +132,11 @@ def fusion_attention_backward(backward_params):
  pse = backward_params.pse
  scalar_value = backward_params.scalar_value
  keep_prob = backward_params.keep_prob
+
+ # intercept keep_prob == 0 to prevent division by zero
+ if keep_prob == 0:
+ raise ValueError("fusion_attention_backward: keep_prob must not be 0, to avoid division by zero.")
+
  dp = torch.matmul(dx, v.permute(0, 1, 3, 2))
  if drop_mask is None or len(drop_mask.shape) == 0:
  drop_res = softmax_res.permute(0, 1, 3, 2)
@@ -138,34 +160,45 @@ def parse_bsnd_args(query, key, head_num, input_layout):

  if input_layout == "TND":
  raise ValueError(f"input_layout {input_layout} does not supported for now.")
+
+ # guard against head_num being 0
+ if n1 == 0:
+ raise ValueError("parse_bsnd_args: head_num (n1) must not be 0, to avoid division by zero.")
+
  try:
  if input_layout == "BSH":
  b, s1, h1 = query.shape
  _, s2, h2 = key.shape
  d = h1 // n1
+ # intercept the case where d is 0
+ if d == 0:
+ raise ValueError("parse_bsnd_args: the computed head_dim d must not be 0.")
  n2 = h2 // d
  elif input_layout == "SBH":
  s1, b, h1 = query.shape
  s2, _, h2 = key.shape
  d = h1 // n1
+ if d == 0:
+ raise ValueError("parse_bsnd_args: the computed head_dim d must not be 0.")
  n2 = h2 // d
  elif input_layout == "BSND":
  b, s1, n1, d = query.shape
  _, s2, n2, _ = key.shape
+ if d == 0:
+ raise ValueError("parse_bsnd_args: head_dim d must not be 0.")
  h1 = n1 * d
  h2 = n2 * d
  elif input_layout == "BNSD":
  b, n1, s1, d = query.shape
  _, n2, s2, _ = key.shape
+ if d == 0:
+ raise ValueError("parse_bsnd_args: head_dim d must not be 0.")
  h1 = n1 * d
  h2 = n2 * d
  except Exception as e:
  raise ValueError(f"query.shape: {query.shape}, key.shape: {key.shape}, parse_bsnd_args error: {e}") from e

- if d == 0:
- raise ValueError(f"Value d must be non-zero.")
- _dtype = query.dtype
- ret = (b, s1, s2, n1, n2, d, h1, h2, _dtype)
+ ret = (b, s1, s2, n1, n2, d, h1, h2, query.dtype)
  return ret


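Note: a worked example of the layout arithmetic these guards protect, using made-up shapes. For input_layout "BSH" with query.shape = (2, 1024, 4096), key.shape = (2, 1024, 1024) and head_num n1 = 32:

    d = h1 // n1    # 4096 // 32 = 128
    n2 = h2 // d    # 1024 // 128 = 8

If head_num exceeded h1, integer division would give d = 0 and n2 = h2 // d would raise a ZeroDivisionError without the new checks.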
@@ -230,67 +263,6 @@ def convert_to_bnsd(_input, n, input_layout):
  return out.to(GTYPE)


- def convert_from_bsnd(_input, input_layout):
- """
- transform qkv from bsnd to input_layout.
- B: batch_size
- S: sequence_length
- N: num_heads
- D: head_dim
- Args:
- _input (torch.Tensor): tensor of shape (B,S,N,D)
- input_layout (str): "BSH" or "SBH" or "BSND" or "BNSD" or "TND"
- Returns:
- tensor of shape (B,N,S,D) or (B,S,N,D) or (S,B,H) or (B,S,H)
- """
- if input_layout == "BSH":
- # (B,S,N,D)=>(B,S,N*D)
- out = rearrange(_input, 'b s n d -> b s (n d)').contiguous()
- elif input_layout == "SBH":
- # (B,S,N,D)=>(S,B,N*D)
- out = rearrange(_input, 'b s n d -> s b (n d)').contiguous()
- elif input_layout == "BNSD":
- # (B,S,N,D)=>(B,N,S,D)
- out = rearrange(_input, 'b s n d -> b n s d').contiguous()
- elif input_layout == "TND":
- raise ValueError(f"input_layout {input_layout} does not supported for now.")
- else:
- out = _input
- return out
-
-
- def convert_to_bsnd(_input, n, input_layout):
- """
- transform qkv from input_layout to bsnd.
- B: batch_size
- S: sequence_length
- N: num_heads
- D: head_dim
- Args:
- _input (torch.Tensor): tensor of shape (B,N,S,D) or (B,S,N,D) or (S,B,H) or (B,S,H)
- n (int): num_heads
- input_layout (str):"BSH" or "SBH" or "BSND" or "BNSD" or "TND"
- Returns:
- tensor of shape (B,S,N,D)
- """
- if input_layout == "BSH":
- # (B,S,N*D)=>(B,S,N,D)
- out = rearrange(_input, 'b s (n d) -> b s n d', n=n)
- elif input_layout == "SBH":
- # (S,B,N*D)=>(B,S,N,D)
- out = rearrange(_input, 's b (n d) -> b s n d', n=n)
- elif input_layout == "BNSD":
- # (B,N,S,D)=>(B,S,N,D)
- out = rearrange(_input, 'b n s d -> b s n d', n=n)
- elif input_layout == "TND":
- raise ValueError(f"input_layout {input_layout} does not supported for now.")
- else:
- out = _input
- if out.dim() != 4:
- raise ValueError(f"convert qkv format failed with input_layout {input_layout}.")
- return out
-
-
  def generate_attn_mask(*args):
  """
  # When sparse_mode is 2, 3 or 4, the elementary-op to fused-op path applies this optimization; going the other way, it has to be decomposed back into the original basic implementation
@@ -417,17 +389,20 @@ def get_input_layout(*args, **kwargs):

  def npu_fusion_attention_forward_patch(*args, **kwargs):
  if len(args) < 2:
- raise RuntimeError("npu_fusion_attention_forward_patch: length of args should greater than or equal to 2.")
+ raise RuntimeError("npu_fusion_attention_forward_patch: length of args should be greater than or equal to 2.")

  # query, key, value, head_num, input_layout
  head_num = get_head_num(*args, **kwargs)
  input_layout = get_input_layout(*args, **kwargs)

  b, s1, s2, n1, n2, d, h1, h2, dtype = parse_bsnd_args(args[0], args[1], head_num, input_layout)
+ # d has already been verified as non-zero in parse_bsnd_args
  if n1 == n2 and s1 == s2:
  logger.debug(f"running case : BNSD = {b}_{n1}_{s1}_{d}, sparse = {kwargs.get('sparse_mode', 0)}")
  else:
  logger.debug(f"running case: BNSD = {b}_{n1}({n2})_{s1}({s2})_{d}, sparse = {kwargs.get('sparse_mode', 0)}")
+ if n2 == 0:
+ raise ValueError("n2 must not be 0, to avoid division by zero.")
  if not (n1 % n2 == 0 and n1 >= n2):
  raise ValueError(f"N1 and N2 do not match, please check: n1 = {n1}, n2 = {n2}.")

@@ -436,7 +411,7 @@ def npu_fusion_attention_forward_patch(*args, **kwargs):
  "d": d, "h1": h1, "h2": h2, "dtype": dtype
  }
  new_kwargs = {
- "keep_prob": 1,
+ "keep_prob": 1, # note: a keep_prob of 0 passed in from outside is also caught in fusion_attention_forward
  "scalar_value": kwargs.get("scalar_value", 1 / (d ** 0.5)),
  "sparse_mode": kwargs.get("sparse_mode", 0),
  "prefix": kwargs.get("prefix"),
@@ -455,10 +430,13 @@ def npu_fusion_attention_backward_patch(*args, **kwargs):
  raise ValueError(f"Unsupported npu_fusion_attention_grad args {args}.")

  b, s1, s2, n1, n2, d, h1, h2, dtype = parse_bsnd_args(args[0], args[1], args[4], args[5])
+ # d has already been verified as non-zero in parse_bsnd_args
  if n1 == n2 and s1 == s2:
  logger.info(f"running case : bnsd = {b}_{n1}_{s1}_{d}, sparse = {kwargs.get('sparse_mode', 0)}")
  else:
  logger.info(f"running case: bnsd = {b}_{n1}({n2})_{s1}({s2})_{d}, sparse = {kwargs.get('sparse_mode', 0)}")
+ if n2 == 0:
+ raise ValueError("n2 must not be 0, to avoid division by zero.")
  if not (n1 % n2 == 0 and n1 >= n2):
  raise ValueError(f"N1 and N2 do not match, please check: n1 = {n1}, n2 = {n2}.")

@@ -468,7 +446,7 @@ def npu_fusion_attention_backward_patch(*args, **kwargs):
  }

  new_kwargs = {
- "keep_prob": 1,
+ "keep_prob": 1, # as above, fusion_attention_backward intercepts a keep_prob of 0
  "scalar_value_value": kwargs.get("scalar_value_value", 1 / (d ** 0.5)),
  "sparse_mode": kwargs.get("sparse_mode", 0),
  "prefix": kwargs.get("prefix"),
@@ -39,6 +39,8 @@ def add_api_accuracy_checker_argument(parser):
  help="<optional> The ut task result out path.")
  parser.add_argument("-csv_path", "--result_csv_path", dest="result_csv_path", default="", type=str, required=False,
  help="<optional> the exit csv for continue")
+ parser.add_argument('-save_error_data', dest="save_error_data", action="store_true",
+ help="<optional> Save compare failed api output.", required=False)


  def multi_add_api_accuracy_checker_argument(parser):
@@ -49,6 +51,8 @@ def multi_add_api_accuracy_checker_argument(parser):
  help="<optional> The ut task result out path.")
  parser.add_argument("-csv_path", "--result_csv_path", dest="result_csv_path", default="", type=str, required=False,
  help="<optional> the exit csv for continue")
+ parser.add_argument('-save_error_data', dest="save_error_data", action="store_true",
+ help="<optional> Save compare failed api output.", required=False)
  # the arguments below are multi-threading parameters
  parser.add_argument("-d", "--device", dest="device_id", nargs='+', type=int,
  help="<optional> set device id to run ut, must be unique and in range 0-7",
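Note: a minimal, self-contained sketch of what the new switch does; it re-creates only the one argument shown above, not the tool's full parser:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('-save_error_data', dest="save_error_data", action="store_true",
                        help="<optional> Save compare failed api output.", required=False)

    args = parser.parse_args(["-save_error_data"])
    print(args.save_error_data)  # True; ApiAccuracyChecker.__init__ then wires up the error-data collector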
@@ -17,7 +17,6 @@ import os

  import mindspore
  import numpy as np
- import torch
  from mindspore._c_expression import typing
  from msprobe.core.common.const import Const
  from msprobe.core.common.exceptions import ApiAccuracyCheckerException
@@ -188,7 +188,7 @@ class DataManager:

  def record_exception_skip(self, api_name, forward_or_backward, err_msg):
  '''
- record exception_skip infomation into self.record_exception_skip.
+ record exception_skip information into self.record_exception_skip.
  self.record_exception_skip: dict{str: dict{"forward": str/None, "backward": str/None}}
  string in key is api_name, string in value is err_msg
  '''
@@ -270,7 +270,7 @@ class DataManager:
  entry.backward_pass_status,
  overall_err_msg
  ]
- # change row if this api has excption_skip infomation
+ # change row if this api has exception_skip information
  if api_name in self.results_exception_skip:
  if self.results_exception_skip[api_name][Const.FORWARD] is not None:
  row[1] = CompareConst.SKIP