PyPI - mindstudio-probe - Versions diffs - 1.2.2__py3-none-any.whl → 8.1.0__py3-none-any.whl - Mend

mindstudio-probe 1.2.2-py3-none-any.whl → 8.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (261) hide show

{mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/METADATA +4 -3
{mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/RECORD +243 -191
msprobe/README.md +57 -21
msprobe/core/__init__.py +17 -0
msprobe/core/common/const.py +224 -82
msprobe/core/common/decorator.py +50 -0
msprobe/core/common/exceptions.py +5 -3
msprobe/core/common/file_utils.py +274 -40
msprobe/core/common/framework_adapter.py +169 -0
msprobe/core/common/global_lock.py +86 -0
msprobe/core/common/runtime.py +25 -0
msprobe/core/common/utils.py +148 -72
msprobe/core/common_config.py +7 -0
msprobe/core/compare/acc_compare.py +640 -462
msprobe/core/compare/check.py +36 -107
msprobe/core/compare/compare_cli.py +4 -0
msprobe/core/compare/config.py +72 -0
msprobe/core/compare/highlight.py +217 -215
msprobe/core/compare/layer_mapping/layer_mapping.py +4 -1
msprobe/core/compare/merge_result/merge_result.py +12 -6
msprobe/core/compare/multiprocessing_compute.py +227 -107
msprobe/core/compare/npy_compare.py +32 -16
msprobe/core/compare/utils.py +218 -244
msprobe/{mindspore/runtime.py → core/config_check/__init__.py} +2 -4
msprobe/{pytorch/dump/kernel_dump/kernel_config.py → core/config_check/checkers/__init__.py} +8 -16
msprobe/core/config_check/checkers/base_checker.py +60 -0
msprobe/core/config_check/checkers/dataset_checker.py +138 -0
msprobe/core/config_check/checkers/env_args_checker.py +96 -0
msprobe/core/config_check/checkers/hyperparameter_checker.py +170 -0
msprobe/core/config_check/checkers/pip_checker.py +90 -0
msprobe/core/config_check/checkers/random_checker.py +367 -0
msprobe/core/config_check/checkers/weights_checker.py +147 -0
msprobe/core/config_check/ckpt_compare/ckpt_comparator.py +74 -0
msprobe/core/config_check/ckpt_compare/megatron_loader.py +302 -0
msprobe/core/config_check/ckpt_compare/metrics.py +83 -0
msprobe/core/config_check/ckpt_compare/name_mapping.yaml +12 -0
msprobe/core/config_check/config_check_cli.py +51 -0
msprobe/core/config_check/config_checker.py +100 -0
msprobe/{pytorch/parse.py → core/config_check/resource/dependency.yaml} +7 -4
msprobe/core/config_check/resource/env.yaml +57 -0
msprobe/core/config_check/resource/hyperparameter.yaml +21 -0
msprobe/core/config_check/utils/hyperparameter_parser.py +115 -0
msprobe/core/config_check/utils/utils.py +107 -0
msprobe/core/data_dump/api_registry.py +239 -0
msprobe/core/data_dump/data_collector.py +36 -9
msprobe/core/data_dump/data_processor/base.py +74 -53
msprobe/core/data_dump/data_processor/mindspore_processor.py +119 -78
msprobe/core/data_dump/data_processor/pytorch_processor.py +134 -96
msprobe/core/data_dump/json_writer.py +146 -57
msprobe/core/debugger/precision_debugger.py +143 -0
msprobe/core/grad_probe/constant.py +2 -1
msprobe/core/grad_probe/grad_compare.py +2 -2
msprobe/core/grad_probe/utils.py +1 -1
msprobe/core/hook_manager.py +242 -0
msprobe/core/monitor/anomaly_processor.py +384 -0
msprobe/core/overflow_check/abnormal_scene.py +2 -0
msprobe/core/service.py +356 -0
msprobe/core/single_save/__init__.py +0 -0
msprobe/core/single_save/single_comparator.py +243 -0
msprobe/core/single_save/single_saver.py +157 -0
msprobe/docs/01.installation.md +6 -5
msprobe/docs/02.config_introduction.md +89 -30
msprobe/docs/03.config_examples.md +1 -0
msprobe/docs/04.kernel_dump_PyTorch.md +1 -1
msprobe/docs/05.data_dump_PyTorch.md +184 -50
msprobe/docs/06.data_dump_MindSpore.md +193 -28
msprobe/docs/07.accuracy_checker_PyTorch.md +13 -3
msprobe/docs/08.accuracy_checker_online_PyTorch.md +72 -10
msprobe/docs/09.accuracy_checker_MindSpore.md +19 -7
msprobe/docs/10.accuracy_compare_PyTorch.md +266 -102
msprobe/docs/11.accuracy_compare_MindSpore.md +117 -43
msprobe/docs/12.overflow_check_PyTorch.md +5 -3
msprobe/docs/13.overflow_check_MindSpore.md +6 -4
msprobe/docs/14.data_parse_PyTorch.md +4 -10
msprobe/docs/17.grad_probe.md +2 -1
msprobe/docs/18.online_dispatch.md +3 -3
msprobe/docs/19.monitor.md +211 -103
msprobe/docs/21.visualization_PyTorch.md +100 -28
msprobe/docs/22.visualization_MindSpore.md +103 -31
msprobe/docs/23.generate_operator_PyTorch.md +9 -9
msprobe/docs/25.tool_function_introduction.md +23 -22
msprobe/docs/26.data_dump_PyTorch_baseline.md +14 -3
msprobe/docs/27.dump_json_instruction.md +278 -8
msprobe/docs/28.debugger_save_instruction.md +111 -20
msprobe/docs/28.kernel_dump_MindSpore.md +1 -1
msprobe/docs/29.data_dump_MSAdapter.md +229 -0
msprobe/docs/30.overflow_check_MSAdapter.md +31 -0
msprobe/docs/31.config_check.md +95 -0
msprobe/docs/32.ckpt_compare.md +69 -0
msprobe/docs/33.generate_operator_MindSpore.md +190 -0
msprobe/docs/34.RL_collect.md +92 -0
msprobe/docs/35.nan_analyze.md +72 -0
msprobe/docs/FAQ.md +3 -11
msprobe/docs/data_dump_MindSpore/data_dump_MindSpore_baseline.md +12 -1
msprobe/docs/data_dump_MindSpore/dynamic_graph_quick_start_example.md +3 -1
msprobe/docs/img/compare_result.png +0 -0
msprobe/docs/img/merge_result.png +0 -0
msprobe/docs/img/save_compare_result_sample.png +0 -0
msprobe/docs/img/visualization/proxy.png +0 -0
msprobe/docs/img/visualization/vis_browser_1.png +0 -0
msprobe/docs/img/visualization/vis_match_info.png +0 -0
msprobe/docs/img/visualization/vis_precision_info.png +0 -0
msprobe/docs/img/visualization/vis_search_info.png +0 -0
msprobe/docs/img/visualization/vis_show_info.png +0 -0
msprobe/docs/img/visualization/vis_showcase.png +0 -0
msprobe/docs/img/visualization/vis_unmatch_info.png +0 -0
msprobe/mindspore/__init__.py +3 -3
msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +151 -55
msprobe/mindspore/api_accuracy_checker/api_runner.py +25 -11
msprobe/mindspore/api_accuracy_checker/base_compare_algorithm.py +2 -1
msprobe/mindspore/api_accuracy_checker/bench_functions/flash_attention_score.py +580 -0
msprobe/mindspore/api_accuracy_checker/bench_functions/fusion_operator.py +41 -0
msprobe/mindspore/api_accuracy_checker/cmd_parser.py +4 -0
msprobe/mindspore/api_accuracy_checker/data_manager.py +4 -3
msprobe/mindspore/api_accuracy_checker/generate_op_script/config_op.json +9 -0
msprobe/mindspore/api_accuracy_checker/generate_op_script/op_generator.py +451 -0
msprobe/mindspore/api_accuracy_checker/generate_op_script/operator_replication.template +2081 -0
msprobe/mindspore/api_accuracy_checker/multi_api_accuracy_checker.py +11 -1
msprobe/mindspore/api_accuracy_checker/torch_mindtorch_importer.py +2 -1
msprobe/mindspore/cell_processor.py +204 -33
msprobe/mindspore/code_mapping/graph_parser.py +4 -21
msprobe/mindspore/common/const.py +73 -2
msprobe/mindspore/common/utils.py +157 -29
msprobe/mindspore/compare/common_dir_compare.py +382 -0
msprobe/mindspore/compare/distributed_compare.py +2 -26
msprobe/mindspore/compare/ms_compare.py +18 -398
msprobe/mindspore/compare/ms_graph_compare.py +20 -10
msprobe/mindspore/compare/utils.py +37 -0
msprobe/mindspore/debugger/debugger_config.py +59 -7
msprobe/mindspore/debugger/precision_debugger.py +83 -90
msprobe/mindspore/dump/cell_dump_process.py +902 -0
msprobe/mindspore/dump/cell_dump_with_insert_gradient.py +889 -0
msprobe/mindspore/dump/dump_tool_factory.py +18 -8
msprobe/mindspore/dump/graph_mode_cell_dump.py +139 -0
msprobe/mindspore/dump/graph_tensor_dump.py +123 -0
msprobe/mindspore/dump/hook_cell/api_register.py +176 -0
msprobe/mindspore/dump/hook_cell/hook_cell.py +22 -12
msprobe/mindspore/dump/hook_cell/ms_hook_manager.py +88 -0
msprobe/mindspore/dump/hook_cell/primitive_hooks.py +8 -2
msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml +42 -26
msprobe/mindspore/dump/jit_dump.py +35 -27
msprobe/mindspore/dump/kernel_kbyk_dump.py +6 -3
msprobe/mindspore/dym_loader/hook_dynamic_loader.cpp +110 -0
msprobe/mindspore/dym_loader/hook_dynamic_loader.h +15 -16
msprobe/mindspore/free_benchmark/api_pynative_self_check.py +22 -12
msprobe/mindspore/free_benchmark/common/utils.py +1 -1
msprobe/mindspore/free_benchmark/perturbation/perturbation_factory.py +4 -2
msprobe/mindspore/free_benchmark/self_check_tool_factory.py +6 -3
msprobe/mindspore/grad_probe/global_context.py +9 -2
msprobe/mindspore/grad_probe/grad_analyzer.py +2 -1
msprobe/mindspore/grad_probe/grad_stat_csv.py +3 -2
msprobe/mindspore/grad_probe/hook.py +2 -4
msprobe/mindspore/mindspore_service.py +111 -0
msprobe/mindspore/monitor/common_func.py +52 -0
msprobe/mindspore/monitor/data_writers.py +237 -0
msprobe/mindspore/monitor/distributed/wrap_distributed.py +1 -1
msprobe/mindspore/monitor/features.py +13 -1
msprobe/mindspore/monitor/module_hook.py +568 -444
msprobe/mindspore/monitor/optimizer_collect.py +331 -0
msprobe/mindspore/monitor/utils.py +71 -9
msprobe/mindspore/ms_config.py +16 -15
msprobe/mindspore/overflow_check/overflow_check_tool_factory.py +5 -3
msprobe/mindspore/task_handler_factory.py +5 -2
msprobe/msprobe.py +19 -0
msprobe/nan_analyze/__init__.py +14 -0
msprobe/nan_analyze/analyzer.py +255 -0
msprobe/nan_analyze/graph.py +189 -0
msprobe/nan_analyze/utils.py +211 -0
msprobe/pytorch/api_accuracy_checker/common/config.py +2 -2
msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +3 -6
msprobe/pytorch/api_accuracy_checker/compare/compare.py +36 -34
msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +15 -13
msprobe/pytorch/api_accuracy_checker/generate_op_script/operator_replication.template +206 -4
msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +9 -9
msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +6 -5
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +31 -9
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py +28 -20
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py +3 -1
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +29 -13
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +12 -2
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +45 -31
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/utils.py +154 -0
msprobe/pytorch/attl_manager.py +65 -0
msprobe/pytorch/bench_functions/moe_gating_top_k_softmax.py +6 -0
msprobe/pytorch/bench_functions/npu_fusion_attention.py +27 -0
msprobe/pytorch/common/utils.py +53 -19
msprobe/pytorch/compare/distributed_compare.py +4 -36
msprobe/pytorch/compare/pt_compare.py +13 -84
msprobe/pytorch/compare/utils.py +47 -0
msprobe/pytorch/debugger/debugger_config.py +34 -17
msprobe/pytorch/debugger/precision_debugger.py +50 -96
msprobe/pytorch/dump/module_dump/hook_wrapper.py +93 -0
msprobe/pytorch/dump/module_dump/module_dump.py +15 -61
msprobe/pytorch/dump/module_dump/module_processer.py +150 -114
msprobe/pytorch/free_benchmark/common/utils.py +1 -1
msprobe/pytorch/free_benchmark/compare/single_benchmark.py +1 -1
msprobe/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py +3 -3
msprobe/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py +3 -3
msprobe/pytorch/free_benchmark/perturbed_layers/npu/change_value.py +1 -1
msprobe/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py +1 -1
msprobe/pytorch/free_benchmark/result_handlers/check_handler.py +1 -1
msprobe/pytorch/function_factory.py +1 -1
msprobe/pytorch/grad_probe/grad_monitor.py +2 -2
msprobe/pytorch/grad_probe/grad_stat_csv.py +3 -2
msprobe/pytorch/hook_module/api_register.py +155 -0
msprobe/pytorch/hook_module/hook_module.py +18 -22
msprobe/pytorch/hook_module/jit_script_wrapper.py +33 -0
msprobe/pytorch/hook_module/pt_hook_manager.py +68 -0
msprobe/pytorch/hook_module/register_optimizer_hook.py +2 -1
msprobe/pytorch/hook_module/support_wrap_ops.yaml +193 -75
msprobe/pytorch/hook_module/utils.py +28 -2
msprobe/pytorch/monitor/csv2tb.py +14 -4
msprobe/pytorch/monitor/data_writers.py +259 -0
msprobe/pytorch/monitor/distributed/wrap_distributed.py +8 -2
msprobe/pytorch/monitor/module_hook.py +336 -241
msprobe/pytorch/monitor/module_metric.py +17 -0
msprobe/pytorch/monitor/optimizer_collect.py +244 -224
msprobe/pytorch/monitor/utils.py +84 -4
msprobe/pytorch/online_dispatch/compare.py +0 -2
msprobe/pytorch/online_dispatch/dispatch.py +13 -2
msprobe/pytorch/online_dispatch/dump_compare.py +8 -2
msprobe/pytorch/online_dispatch/utils.py +3 -0
msprobe/pytorch/parse_tool/lib/interactive_cli.py +1 -6
msprobe/pytorch/parse_tool/lib/utils.py +5 -4
msprobe/pytorch/pt_config.py +16 -11
msprobe/pytorch/pytorch_service.py +70 -0
msprobe/visualization/builder/graph_builder.py +69 -10
msprobe/visualization/builder/msprobe_adapter.py +24 -12
msprobe/visualization/compare/graph_comparator.py +63 -51
msprobe/visualization/compare/mode_adapter.py +22 -20
msprobe/visualization/graph/base_node.py +11 -4
msprobe/visualization/graph/distributed_analyzer.py +1 -10
msprobe/visualization/graph/graph.py +2 -13
msprobe/visualization/graph/node_op.py +1 -2
msprobe/visualization/graph_service.py +251 -104
msprobe/visualization/utils.py +26 -44
msprobe/mindspore/dump/hook_cell/api_registry.py +0 -207
msprobe/mindspore/dump/hook_cell/wrap_api.py +0 -212
msprobe/mindspore/dym_loader/hook_dynamic_loader.cc +0 -140
msprobe/mindspore/monitor/anomaly_detect.py +0 -404
msprobe/mindspore/monitor/module_spec_verifier.py +0 -94
msprobe/mindspore/service.py +0 -543
msprobe/pytorch/hook_module/api_registry.py +0 -166
msprobe/pytorch/hook_module/wrap_distributed.py +0 -79
msprobe/pytorch/hook_module/wrap_functional.py +0 -66
msprobe/pytorch/hook_module/wrap_npu_custom.py +0 -85
msprobe/pytorch/hook_module/wrap_tensor.py +0 -69
msprobe/pytorch/hook_module/wrap_torch.py +0 -84
msprobe/pytorch/hook_module/wrap_vf.py +0 -60
msprobe/pytorch/monitor/anomaly_analyse.py +0 -201
msprobe/pytorch/monitor/anomaly_detect.py +0 -410
msprobe/pytorch/monitor/module_spec_verifier.py +0 -95
msprobe/pytorch/monitor/unittest/test_monitor.py +0 -160
msprobe/pytorch/service.py +0 -470
{mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/LICENSE +0 -0
{mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/WHEEL +0 -0
{mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/entry_points.txt +0 -0
{mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/top_level.txt +0 -0
/msprobe/{mindspore → core}/compare/ms_to_pt_api.yaml +0 -0
/msprobe/{mindspore/dump → core}/kernel_dump/kernel_config.py +0 -0
/msprobe/{pytorch/monitor/unittest → core/monitor}/__init__.py +0 -0

msprobe/core/compare/utils.py CHANGED Viewed

@@ -20,33 +20,45 @@ import zlib
 from dataclasses import dataclass
 import numpy as np
+import pandas as pd
 from msprobe.core.common.const import Const, CompareConst, FileCheckConst
 from msprobe.core.common.utils import CompareException, check_regex_prefix_format_valid, logger, safe_get_value
 from msprobe.core.common.file_utils import check_file_or_directory_path
+json_file_mapping = {
+    Const.DUMP_JSON_FILE: "dump.json",
+    Const.DEBUG_JSON_FILE: "debug.json",
+    Const.STACK_JSON_FILE: "stack.json"
+}
-def extract_json(dirname, stack_json=False):
+def extract_json(dirname, json_file_type):
     json_path = ''
     for filename in os.listdir(dirname):
-        target_file_name = 'stack.json' if stack_json else 'dump.json'
+        target_file_name = json_file_mapping.get(json_file_type)
+        if target_file_name is None:
+            logger.error(f'extract_json failed, invalid json_file_type: {json_file_type}.')
+            raise CompareException(CompareException.INVALID_KEY_ERROR)
         if filename == target_file_name:
             json_path = os.path.join(dirname, filename)
             break
     # Provide robustness on invalid directory inputs
     if not json_path:
-        if stack_json:
+        if json_file_type == Const.STACK_JSON_FILE:
             logger.warning(f'stack.json is not found in dump dir {dirname}.')
-        else:
+        elif json_file_type == Const.DUMP_JSON_FILE:
             logger.error(f'dump.json is not found in dump dir {dirname}.')
-            raise CompareException(CompareException.NO_DUMP_FILE_ERROR)
+        elif json_file_type == Const.DEBUG_JSON_FILE:
+            logger.warning(f'debug.json is not found in dump dir {dirname}.')
     return json_path
 def set_stack_json_path(input_param):
     npu_data_dir = os.path.dirname(input_param.get("npu_json_path"))
-    stack_path = extract_json(npu_data_dir, stack_json=True)
+    stack_path = extract_json(npu_data_dir, json_file_type=Const.STACK_JSON_FILE)
     input_param["stack_json_path"] = stack_path if stack_path else None
     return bool(stack_path)
@@ -81,24 +93,9 @@ def check_and_return_dir_contents(dump_dir, prefix):
     return contents
-def rename_api(npu_name, process):
-    """
-    原api： {api_type}.{api_name}.{API调用次数}.{前向反向}.{input/output}.{参数序号}
-    rename后： {api_type}.{api_name}.{input/output}.{参数序号}
-    """
-    npu_split = npu_name.split(process)
-    try:
-        torch_func_index, in_out = npu_split[0], npu_split[1]
-    except IndexError as error:
-        logger.error(f'{npu_name} can not be split with {process}, please check!')
-        raise CompareException(CompareException.INDEX_OUT_OF_BOUNDS_ERROR) from error
-    torch_func_split = torch_func_index.rsplit(Const.SEP, 2)
-    torch_func = str(torch_func_split[0]) + str(in_out)
-    return torch_func
 def read_op(op_data, op_name):
-    if Const.PARAMS_GRAD in op_name.split(Const.SEP):
+    split_name = op_name.split(Const.SEP)
+    if Const.DEBUG in split_name or Const.PARAMS_GRAD in split_name:
         op_parsed_list = op_item_parse(op_data, op_name)
     else:
         op_parsed_list = []
@@ -191,35 +188,152 @@ def gen_op_item(op_data, op_name):
     return op_item
-def resolve_api_special_parameters(data_dict, full_op_name, item_list):
+@dataclass
+class ApiItemInfo:
+    name: str
+    struct: tuple
+    stack_info: list
+def merge_tensor(tensor_list, dump_mode):
+    keys = [
+        CompareConst.OP_NAME,
+        CompareConst.INPUT_STRUCT,
+        CompareConst.KWARGS_STRUCT,
+        CompareConst.OUTPUT_STRUCT,
+        CompareConst.PARAMS_STRUCT,
+        CompareConst.PARAMS_GRAD_STRUCT,
+        CompareConst.DEBUG_STRUCT,
+        Const.SUMMARY,
+        Const.STACK_INFO
+    ]
+    op_dict = {key: [] for key in keys}
+    if dump_mode == Const.ALL:
+        op_dict["data_name"] = []
+    for tensor in tensor_list:
+        # A dict(len=2) with 'full_op_name' and 'full_info' is added to the tensor only if self.stack_mode is True
+        if len(tensor) == 2:
+            op_dict[Const.STACK_INFO].append(tensor['full_info'])
+            break
+        op_dict[CompareConst.OP_NAME].append(tensor['full_op_name'])
+        _, state = get_name_and_state(tensor['full_op_name'])
+        struct_key = CompareConst.STATE_TO_STRUCT_MAPPING.get(state)
+        if not struct_key:
+            continue
+        if dump_mode == Const.MD5:
+            op_dict.get(struct_key).append((tensor[Const.DTYPE], tensor[Const.SHAPE], tensor[Const.MD5]))
+        else:
+            op_dict.get(struct_key).append((tensor[Const.DTYPE], tensor[Const.SHAPE]))
+        op_dict[Const.SUMMARY].append([tensor[Const.MAX], tensor[Const.MIN], tensor[Const.MEAN], tensor[Const.NORM]])
+        if dump_mode == Const.ALL:
+            op_dict["data_name"].append(tensor['data_name'])
+    if not op_dict[CompareConst.KWARGS_STRUCT]:
+        del op_dict[CompareConst.KWARGS_STRUCT]
+    return op_dict if op_dict[CompareConst.OP_NAME] else {}
+def print_compare_ends_info():
+    total_len = len(CompareConst.COMPARE_ENDS_SUCCESSFULLY) + Const.FILL_CHAR_NUMS
+    logger.info('*' * total_len)
+    logger.info(f"*{CompareConst.COMPARE_ENDS_SUCCESSFULLY.center(total_len - 2)}*")
+    logger.info('*' * total_len)
+def table_value_is_valid(value: str) -> bool:
+    if not isinstance(value, str):
+        return True
+    try:
+        # -1.00 or +1.00 should be considered as digit numbers
+        float(value)
+    except ValueError:
+        # otherwise, they will be considered as formular injections
+        return not bool(re.compile(FileCheckConst.CSV_BLACK_LIST).search(value))
+    return True
+def get_name_and_state(name):
     """
-    Function Description:
-        解析下面格式的数据, 是api参数的一种特殊格式
-        {
-         "last_hidden_state": {
-          "type": "torch.Tensor",
-          "dtype": "torch.bfloat16",
-          ...
-         },
-         "loss": {
-          "type": "torch.Tensor",
-          "dtype": "torch.float32",
-          ...
-         }
-        }
-    Parameter:
-        data_dict: 字典格式的数据
-        full_op_name: 参数的全名字符串
-        item_list: 参数信息集合
+    Get api/module name and state
+    example:
+    name = 'conv2d.forward.1.input.0'
+    return: ('conv2d.forward.1.', 'input')
+    name = 'Functional.pad.0.backward.output.0'
+    return: ('Functional.pad.0.backward.', 'output')
+    name = 'x_tensor.0.debug.{index}'
+    return: ('x_tensor.0.', 'debug')
+    state type: input, output, kwargs, parameters, parameters_grad, debug
     """
-    for key, value in data_dict.items():
-        if isinstance(value, dict):
-            parsed_item = value
-            parts = full_op_name.split(Const.SEP)
-            parts.insert(-1, key)
-            full_op_name_new = ".".join(parts)
-            parsed_item['full_op_name'] = full_op_name_new
-            item_list.append(parsed_item)
+    if not isinstance(name, str):
+        logger.error(f'Invalid name: {name}, type should be string, please check.')
+        raise CompareException(CompareException.INVALID_API_NAME_ERROR)
+    if Const.DEBUG in name.split(Const.SEP):
+        return name.split(Const.DEBUG)[0], Const.DEBUG
+    if Const.PARAMS_GRAD in name.split(Const.SEP):
+        return name.split(Const.PARAMS_GRAD)[0], Const.PARAMS_GRAD
+    split = re.split(Const.REGEX_FORWARD_BACKWARD, name)
+    if len(split) < 3:
+        logger.error(f'Invalid name string: {name}, can not be split by forward/backward, please check.')
+        raise CompareException(CompareException.INVALID_API_NAME_ERROR)
+    api = f'{split[0]}.{split[1]}.'
+    state_str = split[2]
+    match = re.match(r'^(\d+\.)?(input|output|kwargs|parameters)\..+$', state_str)
+    if not match:
+        raise CompareException(f'Invalid name string: {name}')
+    if match.group(1):
+        api = f'{api}{match.group(1)}'
+    state = match.group(2)
+    return api, state
+def reorder_op_name_list(op_name_list):
+    if not op_name_list:
+        return op_name_list
+    parameters = []
+    output = []
+    parameters_grad = []
+    others = []
+    for x in op_name_list:
+        state = get_name_and_state(x)[1]
+        if state == Const.PARAMS:
+            parameters.append(x)
+        elif state == Const.OUTPUT:
+            output.append(x)
+        elif state == Const.PARAMS_GRAD:
+            parameters_grad.append(x)
+        else:
+            others.append(x)
+    # 合并others, parameters, 和output，确保parameters排在output前面
+    op_name_reorder = others + parameters + output + parameters_grad
+    return op_name_reorder
+def reorder_op_x_list(op_name_list, summary_list, data_name_list):
+    """对op_name, summary, data_name重新排序，把parameters放到input后output前，data_name由于统计量比对时，为None，单独处理"""
+    if not op_name_list or not summary_list:
+        return op_name_list, summary_list, data_name_list
+    index_map = {name: index for index, name in enumerate(op_name_list)}
+    op_name_reorder = reorder_op_name_list(op_name_list)
+    summary_reorder = [summary_list[index_map.get(name)] for name in op_name_reorder]
+    if data_name_list:
+        data_name_reorder = [data_name_list[index_map.get(name)] for name in op_name_reorder]
+    else:
+        data_name_reorder = data_name_list
+    return op_name_reorder, summary_reorder, data_name_reorder
 def process_summary_data(summary_data):
@@ -285,9 +399,9 @@ def result_item_init(n_info, b_info, dump_mode):
             md5_compare_result = CompareConst.PASS if n_info.struct[2] == b_info.struct[2] else CompareConst.DIFF
             result_item.extend([n_info.struct[2], b_info.struct[2], md5_compare_result])
         elif dump_mode == Const.SUMMARY:
-            result_item.extend([" "] * 8)
+            result_item.extend([" "] * 8)  # 8个统计量数据情况的比对指标
         else:
-            result_item.extend([" "] * 5)
+            result_item.extend([" "] * 6)  # 6个真实数据情况的比对指标
     else:
         err_msg = "index out of bounds error will occur in result_item_init, please check!\n" \
                   f"npu_info_struct is {n_info.struct}\n" \
@@ -321,8 +435,8 @@ def get_accuracy(result, n_dict, b_dict, dump_mode):
         has_stack = npu_stack_info and bench_stack_info
         if dump_mode == Const.ALL:
-            npu_data_name = n_dict.get("data_name", None)
-            bench_data_name = b_dict.get("data_name", None)
+            npu_data_name_list = n_dict.get("data_name", None)
+            bench_data_name_list = b_dict.get("data_name", None)
         for index in range(min_len):
             n_name = safe_get_value(n_dict, n_start + index, "n_dict", key="op_name")
@@ -353,7 +467,9 @@ def get_accuracy(result, n_dict, b_dict, dump_mode):
             result_item.append(err_msg)
             result_item = stack_column_process(result_item, has_stack, index, key, npu_stack_info)
             if dump_mode == Const.ALL:
-                result_item.append(safe_get_value(npu_data_name, n_start + index, "npu_data_name"))
+                npu_data_name = safe_get_value(npu_data_name_list, n_start + index, "npu_data_name_list")
+                bench_data_name = safe_get_value(bench_data_name_list, b_start + index, "bench_data_name_list")
+                result_item.append([npu_data_name, bench_data_name])
             result.append(result_item)
@@ -371,7 +487,7 @@ def get_accuracy(result, n_dict, b_dict, dump_mode):
                         continue
                     result_item = [
                         n_name, CompareConst.NAN, n_struct[0], CompareConst.NAN, n_struct[1], CompareConst.NAN,
-                        " ", " ", " ", " ", " "
+                        " ", " ", " ", " ", " ", " "
                     ]
                     summary_data = n_dict.get(CompareConst.SUMMARY)[n_start + index]
                     result_item.extend(summary_data)
@@ -388,7 +504,8 @@ def get_accuracy(result, n_dict, b_dict, dump_mode):
                 result_item.append(err_msg)
                 result_item = stack_column_process(result_item, has_stack, index, key, npu_stack_info)
                 if dump_mode == Const.ALL:
-                    result_item.append(safe_get_value(npu_data_name, n_start + index, "npu_data_name"))
+                    npu_data_name = safe_get_value(npu_data_name_list, n_start + index, "npu_data_name_list")
+                    result_item.append([npu_data_name, "-1"])
                 result.append(result_item)
@@ -404,197 +521,23 @@ def get_accuracy(result, n_dict, b_dict, dump_mode):
                       CompareConst.PARAMS_GRAD_STRUCT)
-def append_stack_info(result_item, npu_stack_info, index):
-    """添加堆栈信息到 result_item"""
-    if npu_stack_info and index == 0:
-        result_item.extend(npu_stack_info)
-    else:
-        result_item.append(CompareConst.NONE)
+def make_result_table(result, dump_mode, stack_mode):
+    header = CompareConst.HEAD_OF_COMPARE_MODE[dump_mode][:]
-def get_un_match_accuracy(result, n_dict, dump_mode):
-    npu_stack_info = n_dict.get("stack_info", None)
-    bench_name, bench_type, bench_shape = CompareConst.N_A, CompareConst.N_A, CompareConst.N_A
-    struct_to_index_mapping = {
-        CompareConst.INPUT_STRUCT: 0,
-        CompareConst.OUTPUT_STRUCT: 0,
-        CompareConst.PARAMS_STRUCT: 0,
-        CompareConst.PARAMS_GRAD_STRUCT: 0
-    }
-    op_name_list = n_dict.get(CompareConst.OP_NAME)
-    summary_list = n_dict.get(Const.SUMMARY)
-    data_name_list = n_dict.get('data_name')
-    op_name_reorder, summary_reorder, _ = reorder_op_x_list(op_name_list,
-                                                            summary_list,
-                                                            data_name_list)
-    for index, n_name in enumerate(op_name_reorder):
-        _, state = get_name_and_state(n_name)
-        struct_key = CompareConst.STATE_TO_STRUCT_MAPPING.get(state)
-        if not struct_key:
-            continue
-        n_struct = safe_get_value(n_dict, struct_to_index_mapping.get(struct_key), "n_dict", key=struct_key)
-        struct_to_index_mapping[struct_key] += 1
-        try:
-            result_item = [n_name, bench_name, n_struct[0], bench_type, n_struct[1], bench_shape]
-        except IndexError as e:
-            err_msg = "index out of bounds error occurs, please check!\n" \
-                      f"op_name of n_dict is {n_dict['op_name']}\n" \
-                      f"input_struct of n_dict is {n_dict[CompareConst.INPUT_STRUCT]}\n" \
-                      f"output_struct of n_dict is {n_dict[CompareConst.OUTPUT_STRUCT]}"
-            logger.error(err_msg)
-            raise CompareException(CompareException.INDEX_OUT_OF_BOUNDS_ERROR) from e
-        if dump_mode == Const.MD5:
-            result_item.extend([CompareConst.N_A] * 3)
-            append_stack_info(result_item, npu_stack_info, index)
-            result.append(result_item)
-            continue
-        if dump_mode == Const.SUMMARY:
-            result_item.extend([CompareConst.N_A] * 8)
+    if stack_mode:
+        header.append(CompareConst.STACK)
         if dump_mode == Const.ALL:
-            result_item.extend([CompareConst.N_A] * 5)
-        npu_summary_data = safe_get_value(summary_reorder, index, "summary_reorder")
-        bench_summary_data = [CompareConst.N_A] * 4
-        result_item.extend(npu_summary_data)
-        result_item.extend(bench_summary_data)
-        err_msg = CompareConst.NO_BENCH
-        accuracy_check_res = CompareConst.N_A
-        result_item.append(accuracy_check_res)
-        result_item.append(err_msg)
-        append_stack_info(result_item, npu_stack_info, index)
-        if dump_mode == Const.ALL and result_item[1] == CompareConst.N_A:
-            result_item.extend(["-1"])
-        result.append(result_item)
-def merge_tensor(tensor_list, dump_mode):
-    op_dict = {}
-    op_dict["op_name"] = []
-    op_dict[CompareConst.INPUT_STRUCT] = []
-    op_dict[CompareConst.KWARGS_STRUCT] = []
-    op_dict[CompareConst.OUTPUT_STRUCT] = []
-    op_dict[CompareConst.PARAMS_STRUCT] = []
-    op_dict[CompareConst.PARAMS_GRAD_STRUCT] = []
-    op_dict[Const.SUMMARY] = []
-    op_dict["stack_info"] = []
-    if dump_mode == Const.ALL:
-        op_dict["data_name"] = []
-    for tensor in tensor_list:
-        # A dict(len=2) with 'full_op_name' and 'full_info' is added to the tensor only if self.stack_mode is True
-        if len(tensor) == 2:
-            op_dict['stack_info'].append(tensor['full_info'])
-            break
-        op_dict["op_name"].append(tensor['full_op_name'])
-        _, state = get_name_and_state(tensor['full_op_name'])
-        struct_key = CompareConst.STATE_TO_STRUCT_MAPPING.get(state)
-        if not struct_key:
-            continue
-        if dump_mode == Const.MD5:
-            op_dict.get(struct_key).append((tensor[Const.DTYPE], tensor[Const.SHAPE], tensor[Const.MD5]))
-        else:
-            op_dict.get(struct_key).append((tensor[Const.DTYPE], tensor[Const.SHAPE]))
-        op_dict[Const.SUMMARY].append([tensor[Const.MAX], tensor[Const.MIN], tensor[Const.MEAN], tensor[Const.NORM]])
+            header.append(CompareConst.DATA_NAME)
+    else:
         if dump_mode == Const.ALL:
-            op_dict["data_name"].append(tensor['data_name'])
-    if not op_dict[CompareConst.KWARGS_STRUCT]:
-        del op_dict[CompareConst.KWARGS_STRUCT]
-    return op_dict if op_dict["op_name"] else {}
-def print_compare_ends_info():
-    total_len = len(CompareConst.COMPARE_ENDS_SUCCESSFULLY) + Const.FILL_CHAR_NUMS
-    logger.info('*' * total_len)
-    logger.info(f"*{CompareConst.COMPARE_ENDS_SUCCESSFULLY.center(total_len - 2)}*")
-    logger.info('*' * total_len)
-def table_value_is_valid(value: str) -> bool:
-    if not isinstance(value, str):
-        return True
-    try:
-        # -1.00 or +1.00 should be consdiered as digit numbers
-        float(value)
-    except ValueError:
-        # otherwise, they will be considered as formular injections
-        return not bool(re.compile(FileCheckConst.CSV_BLACK_LIST).search(value))
-    return True
-def get_name_and_state(name):
-    """
-    Get api/module name and state
-    example:
-    name = 'conv2d.forward.1.input.0'
-    return: ('conv2d.forward.1.', 'input')
-    name = 'Functional.pad.0.backward.output.0'
-    return: ('Functional.pad.0.backward.', 'output')
-    state type: input, output, kwargs, parameters, parameters_grad
-    """
-    if Const.PARAMS_GRAD in name.split(Const.SEP):
-        return name.split(Const.PARAMS_GRAD)[0], Const.PARAMS_GRAD
-    split = re.split(Const.REGEX_FORWARD_BACKWARD, name)
-    api = f'{split[0]}.{split[1]}.'
-    state_str = split[2]
-    match = re.match(r'^(\d+\.)?(input|output|kwargs|parameters)\..+$', state_str)
-    if not match:
-        raise CompareException(f'Invalid name string: {name}')
-    if match.group(1):
-        api = f'{api}{match.group(1)}'
-    state = match.group(2)
-    return api, state
-def reorder_op_name_list(op_name_list):
-    if not op_name_list:
-        return op_name_list
-    parameters = []
-    output = []
-    parameters_grad = []
-    others = []
-    for x in op_name_list:
-        state = get_name_and_state(x)[1]
-        if state == Const.PARAMS:
-            parameters.append(x)
-        elif state == Const.OUTPUT:
-            output.append(x)
-        elif state == Const.PARAMS_GRAD:
-            parameters_grad.append(x)
+            for row in result:
+                del row[-2]  # 输出结果不要堆栈信息时，删除中间结果result中的stack info，真实数据时为倒数第2列
+            header.append(CompareConst.DATA_NAME)
         else:
-            others.append(x)
-    # 合并others, parameters, 和output，确保parameters排在output前面
-    op_name_reorder = others + parameters + output + parameters_grad
-    return op_name_reorder
-def reorder_op_x_list(op_name_list, summary_list, data_name_list):
-    """对op_name, summary, data_name重新排序，把parameters放到input后output前，data_name由于统计量比对时，为None，单独处理"""
-    if not op_name_list or not summary_list:
-        return op_name_list, summary_list, data_name_list
-    index_map = {name: index for index, name in enumerate(op_name_list)}
-    op_name_reorder = reorder_op_name_list(op_name_list)
-    summary_reorder = [summary_list[index_map.get(name)] for name in op_name_reorder]
-    if data_name_list:
-        data_name_reorder = [data_name_list[index_map.get(name)] for name in op_name_reorder]
-    else:
-        data_name_reorder = data_name_list
-    return op_name_reorder, summary_reorder, data_name_reorder
+            for row in result:
+                del row[-1]  # 输出结果不要堆栈信息时，删除中间结果result中的stack info，非真实数据时为倒数第1列
+    result_df = pd.DataFrame(result, columns=header, dtype='object')
+    return result_df
 def _compare_parser(parser):
@@ -617,3 +560,34 @@ def _compare_parser(parser):
                         help="<optional> The data mapping file path.", required=False)
     parser.add_argument("-lm", "--layer_mapping", dest="layer_mapping", type=str, nargs='?', const=True,
                         help="<optional> The layer mapping file path.", required=False)
+def compare_distributed_inner(npu_dump_dir, bench_dump_dir, output_path, compare_func, **kwargs):
+    """Compare two multi-rank dump directories, one rank pair at a time.
+
+    The entries returned by ``check_and_return_dir_contents(<dir>, 'rank')`` for
+    both dump directories are sorted and paired by position. For every pair,
+    each supported json file type is located on both sides with
+    ``extract_json`` and, when both files are found, handed to ``compare_func``.
+
+    Args:
+        npu_dump_dir: Dump directory of the NPU run.
+        bench_dump_dir: Dump directory of the benchmark run.
+        output_path: Output location, forwarded unchanged to ``compare_func``.
+        compare_func: Callable invoked as
+            ``compare_func(input_param=..., output_path=..., suffix=..., **kwargs)``.
+        **kwargs: Extra options forwarded to ``compare_func``. A truthy
+            ``suffix`` is rejected; ``is_print_compare_log`` (default ``True``)
+            is also copied into ``input_param``.
+
+    Raises:
+        CompareException: ``INVALID_PARAM_ERROR`` when a truthy ``suffix`` is
+            passed, ``INVALID_PATH_ERROR`` when the two runs do not contain the
+            same number of rank entries.
+    """
+    if kwargs.get('suffix'):
+        logger.error("Argument 'suffix' is not supported for compare_distributed.")
+        raise CompareException(CompareException.INVALID_PARAM_ERROR)
+    # A falsy 'suffix' (None, '') passes the check above. It must not be forwarded,
+    # otherwise it collides with the per-rank suffix given explicitly below and the
+    # call fails with "got multiple values for keyword argument 'suffix'".
+    forwarded_kwargs = {key: value for key, value in kwargs.items() if key != 'suffix'}
+    is_print_compare_log = kwargs.get('is_print_compare_log', True)
+    # get the ranks and match by order
+    npu_ranks = sorted(check_and_return_dir_contents(npu_dump_dir, 'rank'))
+    bench_ranks = sorted(check_and_return_dir_contents(bench_dump_dir, 'rank'))
+    if len(npu_ranks) != len(bench_ranks):
+        logger.error('The number of ranks in the two runs are different. '
+                     'Unable to match the ranks. Please use another folder to compare '
+                     'or use compare() api and manually match the ranks.')
+        raise CompareException(CompareException.INVALID_PATH_ERROR)
+    for nr, br in zip(npu_ranks, bench_ranks):
+        npu_data_dir = os.path.join(npu_dump_dir, nr)
+        bench_data_dir = os.path.join(bench_dump_dir, br)
+        for file_type in [Const.DUMP_JSON_FILE, Const.DEBUG_JSON_FILE]:
+            npu_path = extract_json(npu_data_dir, file_type)
+            bench_path = extract_json(bench_data_dir, file_type)
+            # An empty string from extract_json is treated as "file not found";
+            # the file type is skipped unless both sides have it.
+            if npu_path == "" or bench_path == "":
+                logger.debug(f'Did not find paired {file_type} in {npu_data_dir} and {bench_data_dir},'
+                             ' skip comparing.')
+                continue
+            dump_result_param = {
+                'npu_json_path': npu_path,
+                'bench_json_path': bench_path,
+                'is_print_compare_log': is_print_compare_log
+            }
+            # The NPU-side rank entry name is used as the output suffix.
+            compare_func(input_param=dump_result_param, output_path=output_path, suffix=f'_{nr}',
+                         **forwarded_kwargs)

msprobe/{mindspore/runtime.py → core/config_check/__init__.py} RENAMED Viewed

@@ -13,7 +13,5 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-class Runtime:
-    step_count: int = 0
-    rank_id: int = -1
-    is_running: bool = False
+import msprobe.core.config_check.checkers
+from msprobe.core.config_check.config_checker import ConfigChecker

msprobe/{pytorch/dump/kernel_dump/kernel_config.py → core/config_check/checkers/__init__.py} RENAMED Viewed

@@ -13,21 +13,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import os
+__all__ = ['BaseChecker', 'apply_patches']
-from msprobe.core.common.file_utils import save_json
+import msprobe.core.config_check.checkers.env_args_checker
+import msprobe.core.config_check.checkers.pip_checker
+import msprobe.core.config_check.checkers.dataset_checker
+import msprobe.core.config_check.checkers.weights_checker
+import msprobe.core.config_check.checkers.hyperparameter_checker
+import msprobe.core.config_check.checkers.random_checker
-def create_kernel_config_json(dump_path, cur_rank):
-    kernel_config_name = "kernel_config.json" if cur_rank == '' else f"kernel_config_{cur_rank}.json"
-    kernel_config_path = os.path.join(dump_path, kernel_config_name)
-    config_info = {
-        "dump": {
-            "dump_list": [],
-            "dump_path": dump_path,
-            "dump_mode": "all",
-            "dump_op_switch": "on"
-        }
-    }
-    save_json(kernel_config_path, config_info, indent=4)
-    return kernel_config_path
+from msprobe.core.config_check.checkers.base_checker import BaseChecker

msprobe/core/config_check/checkers/base_checker.py ADDED Viewed

@@ -0,0 +1,60 @@
+# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+from msprobe.core.common.framework_adapter import FmkAdp
+from msprobe.core.common.const import FileCheckConst
+class PackInput:
+    """Bundle of arguments handed to a checker's ``pack`` step.
+
+    Args:
+        output_zip_path: Destination path; must be a string ending with
+            ``FileCheckConst.ZIP_SUFFIX``.
+        model: A model object or a list of them. When a non-empty list is
+            given, only its first element is kept.
+        shell_path: Stored as-is; it is not validated here.
+
+    Raises:
+        Exception: If the (truthy) model is rejected by
+            ``FmkAdp.is_nn_module`` or the zip path is invalid.
+    """
+    def __init__(self, output_zip_path, model, shell_path):
+        # A non-empty list is reduced to its first element; anything else
+        # (including an empty list) is stored unchanged.
+        if isinstance(model, list) and len(model) > 0:
+            model = model[0]
+        self.model = model
+        self.output_zip_path = output_zip_path
+        self.shell_path = shell_path
+        self.check_input_params()
+    def check_input_params(self):
+        """Validate ``self.model`` and ``self.output_zip_path``."""
+        candidate = self.model
+        # Falsy models (None, empty list, ...) are not type-checked.
+        if candidate and not FmkAdp.is_nn_module(candidate):
+            raise Exception(f"model is not torch.nn.Module/mindspore.nn.Cell or module list.")
+        zip_path = self.output_zip_path
+        is_zip_path = isinstance(zip_path, str) and zip_path.endswith(FileCheckConst.ZIP_SUFFIX)
+        if not is_zip_path:
+            raise Exception(f"output zip path must be a string and ends with '.zip'")
+class BaseChecker:
+    """Common interface for config checkers.
+
+    The hook methods defined here do nothing and return ``None``; concrete
+    checkers are presumably expected to override them (no subclass is
+    visible in this file).
+    """
+    # NOTE(review): the meaning of `input_needed` and `multi_rank` is not visible
+    # in this file; confirm against the code that reads them.
+    input_needed = None
+    # File or directory name appended to both directories in `compare_ex`.
+    target_name_in_zip = None
+    multi_rank = False
+    @staticmethod
+    def pack(pack_input):
+        """Packing hook; the base implementation is a no-op."""
+        return None
+    @staticmethod
+    def compare(bench_dir, cmp_dir, output_path, fmk):
+        """Comparison hook; the base implementation is a no-op."""
+        return None
+    @staticmethod
+    def apply_patches(fmk):
+        """Patching hook; the base implementation is a no-op."""
+        return None
+    @classmethod
+    def compare_ex(cls, bench_dir, cmp_dir, output_path, fmk):
+        """Run ``cls.compare`` only when the target exists on both sides.
+
+        Returns:
+            The result of ``cls.compare(bench_dir, cmp_dir, output_path, fmk)``,
+            or ``(None, None, None)`` when ``target_name_in_zip`` is missing
+            under ``bench_dir`` or ``cmp_dir``.
+        """
+        bench_target = os.path.join(bench_dir, cls.target_name_in_zip)
+        cmp_target = os.path.join(cmp_dir, cls.target_name_in_zip)
+        if os.path.exists(bench_target) and os.path.exists(cmp_target):
+            return cls.compare(bench_dir, cmp_dir, output_path, fmk)
+        return None, None, None

mindstudio-probe 1.2.2__py3-none-any.whl → 8.1.0__py3-none-any.whl

mindstudio-probe 1.2.2py3-none-any.whl → 8.1.0py3-none-any.whl