mindstudio-probe 8.1.2__py3-none-any.whl → 8.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (181)
  1. {mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/METADATA +2 -2
  2. {mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/RECORD +172 -147
  3. msprobe/README.md +6 -6
  4. msprobe/core/common/const.py +98 -41
  5. msprobe/core/common/db_manager.py +256 -0
  6. msprobe/core/common/file_utils.py +28 -5
  7. msprobe/core/common/log.py +7 -0
  8. msprobe/core/common/megatron_utils.py +59 -0
  9. msprobe/core/common/parallel_state.py +193 -0
  10. msprobe/core/common/utils.py +20 -13
  11. msprobe/core/common_config.py +5 -0
  12. msprobe/core/compare/acc_compare.py +140 -93
  13. msprobe/core/compare/check.py +13 -0
  14. msprobe/core/compare/compare_cli.py +64 -6
  15. msprobe/core/compare/config.py +10 -8
  16. msprobe/core/compare/diff_analyze/diff_analyze_threshold.yaml +14 -0
  17. msprobe/core/compare/diff_analyze/first_diff_analyze.py +135 -0
  18. msprobe/core/compare/diff_analyze/ignore_op_list.yaml +3 -0
  19. msprobe/core/compare/find_first/__init__.py +0 -0
  20. msprobe/core/compare/find_first/analyzer.py +282 -0
  21. msprobe/core/compare/find_first/data_processor.py +35 -0
  22. msprobe/core/compare/find_first/graph.py +188 -0
  23. msprobe/core/compare/find_first/utils.py +189 -0
  24. msprobe/core/compare/highlight.py +74 -101
  25. msprobe/core/compare/layer_mapping/layer_mapping.py +14 -9
  26. msprobe/core/compare/merge_result/merge_result.py +2 -2
  27. msprobe/core/compare/multiprocessing_compute.py +45 -28
  28. msprobe/core/compare/npy_compare.py +7 -10
  29. msprobe/core/compare/utils.py +338 -130
  30. msprobe/core/config_check/checkers/dataset_checker.py +2 -1
  31. msprobe/core/config_check/checkers/env_args_checker.py +5 -5
  32. msprobe/core/config_check/checkers/hyperparameter_checker.py +30 -10
  33. msprobe/core/config_check/checkers/pip_checker.py +4 -3
  34. msprobe/core/config_check/checkers/random_checker.py +3 -3
  35. msprobe/core/config_check/checkers/weights_checker.py +2 -1
  36. msprobe/core/config_check/ckpt_compare/megatron_loader.py +2 -0
  37. msprobe/core/config_check/resource/hyperparameter.yaml +11 -1
  38. msprobe/core/config_check/utils/hyperparameter_parser.py +7 -3
  39. msprobe/core/config_check/utils/utils.py +10 -0
  40. msprobe/core/data_dump/api_registry.py +49 -30
  41. msprobe/core/data_dump/data_collector.py +71 -29
  42. msprobe/core/data_dump/data_processor/base.py +2 -0
  43. msprobe/core/data_dump/data_processor/mindspore_processor.py +47 -53
  44. msprobe/core/data_dump/data_processor/pytorch_processor.py +227 -93
  45. msprobe/core/data_dump/json_writer.py +81 -7
  46. msprobe/core/data_dump/scope.py +4 -6
  47. msprobe/core/hook_manager.py +129 -70
  48. msprobe/core/monitor/csv2db.py +361 -0
  49. msprobe/core/monitor/db_utils.py +278 -0
  50. msprobe/core/monitor/utils.py +35 -1
  51. msprobe/core/service.py +31 -39
  52. msprobe/core/single_save/single_comparator.py +16 -3
  53. msprobe/docs/01.installation.md +51 -19
  54. msprobe/docs/02.config_introduction.md +16 -20
  55. msprobe/docs/03.config_examples.md +26 -0
  56. msprobe/docs/04.kernel_dump_PyTorch.md +1 -1
  57. msprobe/docs/05.data_dump_PyTorch.md +6 -2
  58. msprobe/docs/06.data_dump_MindSpore.md +44 -7
  59. msprobe/docs/07.accuracy_checker_PyTorch.md +1 -1
  60. msprobe/docs/10.accuracy_compare_PyTorch.md +124 -44
  61. msprobe/docs/11.accuracy_compare_MindSpore.md +75 -7
  62. msprobe/docs/14.data_parse_PyTorch.md +1 -1
  63. msprobe/docs/19.monitor.md +94 -7
  64. msprobe/docs/21.visualization_PyTorch.md +71 -101
  65. msprobe/docs/22.visualization_MindSpore.md +69 -119
  66. msprobe/docs/23.generate_operator_PyTorch.md +1 -1
  67. msprobe/docs/25.tool_function_introduction.md +0 -1
  68. msprobe/docs/26.data_dump_PyTorch_baseline.md +7 -7
  69. msprobe/docs/28.debugger_save_instruction.md +184 -81
  70. msprobe/docs/29.data_dump_MSAdapter.md +6 -0
  71. msprobe/docs/31.config_check.md +4 -2
  72. msprobe/docs/36.calculation_result_change.md +75 -0
  73. msprobe/docs/FAQ.md +22 -1
  74. msprobe/docs/data_dump_MindSpore/dynamic_graph_quick_start_example.md +6 -2
  75. msprobe/docs/img/compare_result.png +0 -0
  76. msprobe/docs/img/visualization/vis_browser_1.png +0 -0
  77. msprobe/docs/img/visualization/vis_match_info.png +0 -0
  78. msprobe/docs/img/visualization/vis_precision_info.png +0 -0
  79. msprobe/docs/img/visualization/vis_search_info.png +0 -0
  80. msprobe/docs/img/visualization/vis_show_info.png +0 -0
  81. msprobe/docs/img/visualization/vis_showcase.png +0 -0
  82. msprobe/docs/img/visualization/vis_unmatch_info.png +0 -0
  83. msprobe/docs/visualization/mindspeed_llamafactoary_img/1.png +0 -0
  84. msprobe/docs/visualization/mindspeed_llamafactoary_img/2.png +0 -0
  85. msprobe/docs/visualization/mindspeed_llamafactoary_img/3.png +0 -0
  86. msprobe/docs/visualization/mindspeed_llamafactoary_img/4.png +0 -0
  87. msprobe/docs/visualization/mindspeed_llamafactoary_img/5.png +0 -0
  88. msprobe/docs/visualization/mindspeed_llamafactoary_img/6.png +0 -0
  89. msprobe/docs/visualization/mindspeed_llamafactoary_img/7.png +0 -0
  90. msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory-qwen25vl.txt +59 -0
  91. msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory1.png +0 -0
  92. msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory2.png +0 -0
  93. msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed-mm-qwen25vl.txt +80 -0
  94. msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed1.png +0 -0
  95. msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed2.png +0 -0
  96. msprobe/docs/visualization/mindspeed_llamafactory_mapping.md +330 -0
  97. msprobe/mindspore/__init__.py +1 -1
  98. msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +1 -1
  99. msprobe/mindspore/api_accuracy_checker/api_runner.py +9 -6
  100. msprobe/mindspore/api_accuracy_checker/compute_element.py +18 -12
  101. msprobe/mindspore/cell_processor.py +64 -25
  102. msprobe/mindspore/common/utils.py +51 -7
  103. msprobe/mindspore/compare/common_dir_compare.py +45 -37
  104. msprobe/mindspore/compare/ms_compare.py +10 -2
  105. msprobe/mindspore/compare/ms_graph_compare.py +47 -52
  106. msprobe/mindspore/debugger/debugger_config.py +18 -7
  107. msprobe/mindspore/debugger/precision_debugger.py +16 -12
  108. msprobe/mindspore/dump/cell_dump_process.py +130 -68
  109. msprobe/mindspore/dump/cell_dump_with_insert_gradient.py +10 -2
  110. msprobe/mindspore/dump/graph_mode_cell_dump.py +35 -9
  111. msprobe/mindspore/dump/graph_tensor_dump.py +11 -0
  112. msprobe/mindspore/dump/hook_cell/api_register.py +19 -20
  113. msprobe/mindspore/dump/hook_cell/hook_cell.py +12 -34
  114. msprobe/mindspore/dump/hook_cell/ms_hook_manager.py +142 -21
  115. msprobe/mindspore/dump/kernel_kbyk_dump.py +24 -0
  116. msprobe/mindspore/exception_dump/__init__.py +0 -0
  117. msprobe/mindspore/exception_dump/exception_dump_tool_factory.py +51 -0
  118. msprobe/mindspore/exception_dump/kernel_graph_exception_dump.py +57 -0
  119. msprobe/mindspore/free_benchmark/api_pynative_self_check.py +5 -4
  120. msprobe/mindspore/mindspore_service.py +2 -2
  121. msprobe/mindspore/mindtorch/mindtorch_adaptor.py +12 -7
  122. msprobe/mindspore/monitor/features.py +82 -0
  123. msprobe/mindspore/monitor/module_hook.py +168 -10
  124. msprobe/mindspore/monitor/utils.py +27 -1
  125. msprobe/mindspore/ms_config.py +12 -4
  126. msprobe/mindspore/overflow_check/overflow_check_tool_factory.py +1 -1
  127. msprobe/mindspore/task_handler_factory.py +3 -1
  128. msprobe/nan_analyze/graph.py +1 -1
  129. msprobe/pytorch/api_accuracy_checker/common/config.py +3 -36
  130. msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +0 -24
  131. msprobe/pytorch/api_accuracy_checker/compare/compare.py +2 -12
  132. msprobe/pytorch/api_accuracy_checker/config.yaml +1 -6
  133. msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +2 -2
  134. msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +12 -132
  135. msprobe/pytorch/common/utils.py +1 -21
  136. msprobe/pytorch/compare/pt_compare.py +10 -2
  137. msprobe/pytorch/{hook_module/jit_script_wrapper.py → compare/pt_diff_analyze.py} +3 -15
  138. msprobe/pytorch/compare/utils.py +2 -1
  139. msprobe/pytorch/debugger/debugger_config.py +18 -23
  140. msprobe/pytorch/dump/module_dump/hook_wrapper.py +10 -7
  141. msprobe/pytorch/dump/module_dump/module_processer.py +41 -19
  142. msprobe/pytorch/free_benchmark/main.py +7 -4
  143. msprobe/pytorch/hook_module/api_register.py +62 -24
  144. msprobe/pytorch/hook_module/hook_module.py +9 -29
  145. msprobe/pytorch/hook_module/pt_hook_manager.py +84 -15
  146. msprobe/pytorch/hook_module/script_wrapper.py +140 -0
  147. msprobe/pytorch/hook_module/support_wrap_ops.yaml +6 -0
  148. msprobe/pytorch/monitor/csv2tb.py +1 -1
  149. msprobe/pytorch/monitor/features.py +94 -0
  150. msprobe/pytorch/monitor/module_hook.py +221 -81
  151. msprobe/pytorch/monitor/module_metric.py +27 -1
  152. msprobe/pytorch/monitor/optimizer_collect.py +109 -4
  153. msprobe/pytorch/online_dispatch/dispatch.py +42 -24
  154. msprobe/pytorch/online_dispatch/dump_compare.py +1 -1
  155. msprobe/pytorch/parse_tool/lib/visualization.py +0 -1
  156. msprobe/pytorch/pt_config.py +2 -51
  157. msprobe/pytorch/pytorch_service.py +7 -14
  158. msprobe/visualization/builder/graph_builder.py +192 -63
  159. msprobe/visualization/builder/graph_merger.py +986 -0
  160. msprobe/visualization/builder/msprobe_adapter.py +17 -15
  161. msprobe/visualization/compare/graph_comparator.py +26 -16
  162. msprobe/visualization/db_utils.py +252 -0
  163. msprobe/visualization/graph/base_node.py +2 -22
  164. msprobe/visualization/graph/distributed_analyzer.py +12 -12
  165. msprobe/visualization/graph/graph.py +44 -16
  166. msprobe/visualization/graph_service.py +143 -59
  167. msprobe/visualization/utils.py +103 -4
  168. msprobe/docs/08.accuracy_checker_online_PyTorch.md +0 -295
  169. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py +0 -205
  170. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +0 -378
  171. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +0 -239
  172. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/dump_dispatch.py +0 -115
  173. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +0 -250
  174. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/torch_ops_config.yaml +0 -63
  175. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/utils.py +0 -198
  176. msprobe/pytorch/attl_manager.py +0 -65
  177. {mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/LICENSE +0 -0
  178. {mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/WHEEL +0 -0
  179. {mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/entry_points.txt +0 -0
  180. {mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/top_level.txt +0 -0
  181. /msprobe/{pytorch/api_accuracy_checker/tensor_transport_layer → core/compare/diff_analyze}/__init__.py +0 -0
@@ -22,6 +22,7 @@ from msprobe.core.config_check.config_checker import register_checker_item, regi
22
22
  from msprobe.core.config_check.utils.utils import config_checking_print, get_tensor_features
23
23
  from msprobe.core.common.decorator import recursion_depth_decorator
24
24
  from msprobe.core.common.framework_adapter import FmkAdp
25
+ from msprobe.core.common.const import Const
25
26
 
26
27
 
27
28
  @recursion_depth_decorator("config_check: process_obj")
@@ -134,5 +135,5 @@ class DatasetChecker(BaseChecker):
134
135
  cmp_dataset_pack_path = os.path.join(cmp_dir, DatasetChecker.target_name_in_zip)
135
136
 
136
137
  df = compare_dataset(bench_dataset_pack_path, cmp_dataset_pack_path)
137
- pass_check = False not in df['equal'].values
138
+ pass_check = Const.CONFIG_CHECK_PASS if False not in df['equal'].values else Const.CONFIG_CHECK_ERROR
138
139
  return DatasetChecker.target_name_in_zip, pass_check, df
@@ -21,7 +21,7 @@ import pandas as pd
21
21
  from msprobe.core.common.file_utils import load_json, load_yaml, create_file_with_content, create_file_in_zip
22
22
  from msprobe.core.config_check.checkers.base_checker import BaseChecker
23
23
  from msprobe.core.config_check.config_checker import register_checker_item
24
- from msprobe.core.config_check.utils.utils import config_checking_print
24
+ from msprobe.core.config_check.utils.utils import config_checking_print, process_pass_check
25
25
  from msprobe.core.common.const import Const
26
26
 
27
27
 
@@ -59,17 +59,17 @@ def compare_env_data(npu_path, bench_path):
59
59
  cmp_env_name = cmp_env["name"]
60
60
  cmp_value = cmp_data.get(cmp_env_name, value[cmp_type]["default_value"])
61
61
  if not bench_env:
62
- data.append(["only cmp has this env", cmp_env["name"], "", cmp_value, "warning"])
62
+ data.append(["only cmp has this env", cmp_env["name"], "", cmp_value, Const.CONFIG_CHECK_WARNING])
63
63
  continue
64
64
  bench_env_name = bench_env["name"]
65
65
  bench_value = bench_data.get(bench_env_name, value[bench_type]["default_value"])
66
66
  if cmp_value != bench_value:
67
- data.append([bench_env_name, cmp_env_name, bench_value, cmp_value, "error"])
67
+ data.append([bench_env_name, cmp_env_name, bench_value, cmp_value, Const.CONFIG_CHECK_ERROR])
68
68
  else:
69
69
  bench_env_name = bench_env["name"]
70
70
  bench_value = bench_data.get(bench_env_name) if bench_data.get(bench_env_name) else value[bench_type][
71
71
  "default_value"]
72
- data.append([bench_env_name, "only bench has this env", bench_value, "", "warning"])
72
+ data.append([bench_env_name, "only bench has this env", bench_value, "", Const.CONFIG_CHECK_WARNING])
73
73
  df = pd.DataFrame(data, columns=EnvArgsChecker.result_header)
74
74
  return df
75
75
 
@@ -92,5 +92,5 @@ class EnvArgsChecker(BaseChecker):
92
92
  bench_env_data = os.path.join(bench_dir, EnvArgsChecker.target_name_in_zip)
93
93
  cmp_env_data = os.path.join(cmp_dir, EnvArgsChecker.target_name_in_zip)
94
94
  df = compare_env_data(bench_env_data, cmp_env_data)
95
- pass_check = "error" not in df['level'].values
95
+ pass_check = process_pass_check(df['level'].values)
96
96
  return EnvArgsChecker.target_name_in_zip, pass_check, df
@@ -23,7 +23,7 @@ import pandas as pd
23
23
  from msprobe.core.common.utils import check_extern_input_list
24
24
  from msprobe.core.config_check.checkers.base_checker import BaseChecker
25
25
  from msprobe.core.config_check.config_checker import register_checker_item
26
- from msprobe.core.config_check.utils.utils import compare_dict, config_checking_print, update_dict
26
+ from msprobe.core.config_check.utils.utils import compare_dict, config_checking_print, update_dict, process_pass_check
27
27
  from msprobe.core.config_check.utils.hyperparameter_parser import ParserFactory
28
28
  from msprobe.core.common.file_utils import (check_file_or_directory_path, create_file_in_zip, load_json,
29
29
  load_yaml)
@@ -36,6 +36,20 @@ parameter_name_mapping = load_yaml(os.path.realpath(hyperparameters_path))
36
36
  hyperparameters_dict = {}
37
37
 
38
38
 
39
+ def refine_json_keys(json_dcit):
40
+ new_dict = {}
41
+ for key in json_dcit.keys():
42
+ new_key = key.split(Const.SEP)[-1].replace("-", "_")
43
+ new_dict[new_key] = key
44
+ return new_dict
45
+
46
+
47
+ def to_str_if_number(value):
48
+ if isinstance(value, (int, float)):
49
+ return str(value)
50
+ return value
51
+
52
+
39
53
  @register_checker_item("hyperparameter")
40
54
  class HyperparameterChecker(BaseChecker):
41
55
  target_name_in_zip = "hyperparameters"
@@ -86,29 +100,35 @@ class HyperparameterChecker(BaseChecker):
86
100
  all_diffs.extend(
87
101
  HyperparameterChecker.compare_param(bench_hyperparameters, cmp_hyperparameters, file_name))
88
102
  df = pd.DataFrame(all_diffs, columns=HyperparameterChecker.result_header)
89
- pass_check = "error" not in df["level"].values
103
+ pass_check = process_pass_check(df["level"].values)
90
104
  return HyperparameterChecker.target_name_in_zip, pass_check, df
91
105
 
92
106
  @staticmethod
93
107
  def compare_param(bench_params, cmp_params, file_name):
94
108
  all_diffs = []
95
- bench_param_names = bench_params.keys()
96
- for bench_param_name in bench_param_names:
109
+ bench_params_refined = refine_json_keys(bench_params)
110
+ cmp_params_refined = refine_json_keys(cmp_params)
111
+
112
+ for bench_param_name in bench_params_refined.keys():
97
113
  matched_cmp_param_name, matched_with = HyperparameterChecker._fuzzy_match_parameter(bench_param_name,
98
- cmp_params)
99
- bench_param_value = bench_params[bench_param_name]
114
+ cmp_params_refined)
115
+ matched_cmp_param_name = cmp_params_refined.get(matched_cmp_param_name)
116
+ bench_param_name = bench_params_refined.get(bench_param_name)
117
+ bench_param_value = to_str_if_number(bench_params[bench_param_name])
100
118
  if matched_cmp_param_name:
101
- cmp_param_value = cmp_params[matched_cmp_param_name]
119
+ cmp_param_value = to_str_if_number(cmp_params[matched_cmp_param_name])
102
120
  if bench_param_value != cmp_param_value:
103
121
  all_diffs.append(
104
122
  [file_name, bench_param_name, matched_cmp_param_name, bench_param_value, cmp_param_value,
105
- matched_with, "error"])
123
+ matched_with, Const.CONFIG_CHECK_ERROR])
106
124
  del cmp_params[matched_cmp_param_name]
107
125
  else:
108
126
  all_diffs.append(
109
- [file_name, bench_param_name, "Only in benchmark", bench_param_value, "", "", "warning"])
127
+ [file_name, bench_param_name, "Only in benchmark", bench_param_value, "", "",
128
+ Const.CONFIG_CHECK_WARNING])
110
129
  for cmp_param_name, cmp_param_value in cmp_params.items():
111
- all_diffs.append([file_name, "Only in comparison", cmp_param_name, "", cmp_param_value, "", "warning"])
130
+ all_diffs.append(
131
+ [file_name, "Only in comparison", cmp_param_name, "", cmp_param_value, "", Const.CONFIG_CHECK_WARNING])
112
132
  all_diffs.sort()
113
133
  return all_diffs
114
134
 
@@ -23,8 +23,9 @@ except ImportError:
23
23
  from msprobe.core.common.file_utils import load_yaml, create_file_in_zip
24
24
  from msprobe.core.config_check.checkers.base_checker import BaseChecker
25
25
  from msprobe.core.config_check.config_checker import register_checker_item
26
- from msprobe.core.config_check.utils.utils import config_checking_print
26
+ from msprobe.core.config_check.utils.utils import config_checking_print, process_pass_check
27
27
  from msprobe.core.common.file_utils import FileOpen, save_excel
28
+ from msprobe.core.common.const import Const
28
29
 
29
30
  dirpath = os.path.dirname(__file__)
30
31
  depend_path = os.path.join(dirpath, "../resource/dependency.yaml")
@@ -62,7 +63,7 @@ def compare_pip_data(bench_pip_path, cmp_pip_path, fmk):
62
63
  if bench_version != cmp_version:
63
64
  data.append([package, bench_version if bench_version else 'None',
64
65
  cmp_version if cmp_version else 'None',
65
- "error"])
66
+ Const.CONFIG_CHECK_ERROR])
66
67
 
67
68
  df = pd.DataFrame(data, columns=PipPackageChecker.result_header)
68
69
  return df
@@ -86,5 +87,5 @@ class PipPackageChecker(BaseChecker):
86
87
  bench_pip_path = os.path.join(bench_dir, PipPackageChecker.target_name_in_zip)
87
88
  cmp_pip_path = os.path.join(cmp_dir, PipPackageChecker.target_name_in_zip)
88
89
  df = compare_pip_data(bench_pip_path, cmp_pip_path, fmk)
89
- pass_check = "error" not in df['level'].values
90
+ pass_check = process_pass_check(df['level'].values)
90
91
  return PipPackageChecker.target_name_in_zip, pass_check, df
@@ -280,9 +280,9 @@ def mindspore_patchs():
280
280
  import mindspore
281
281
 
282
282
  mindspore_ops_patches = {
283
- 'rand': mindspore.ops.uniform,
283
+ 'rand': mindspore.ops.rand,
284
284
  'randint': mindspore.ops.randint,
285
- 'randn': mindspore.ops.normal
285
+ 'randn': mindspore.ops.randn
286
286
  }
287
287
  for name, func in mindspore_ops_patches.items():
288
288
  setattr(mindspore.ops, name, track_random_call(func, f"mindspore.ops.{name}"))
@@ -331,7 +331,7 @@ class RandomChecker(BaseChecker):
331
331
  cmp_stats_path = os.path.join(cmp_dir, RandomChecker.target_name_in_zip)
332
332
 
333
333
  df = compare_random_calls(bench_stats_path, cmp_stats_path)
334
- pass_check = False not in df['check_result'].values
334
+ pass_check = Const.CONFIG_CHECK_PASS if False not in df['check_result'].values else Const.CONFIG_CHECK_ERROR
335
335
 
336
336
  return RandomChecker.target_name_in_zip, pass_check, df
337
337
 
@@ -22,6 +22,7 @@ from msprobe.core.config_check.checkers.base_checker import BaseChecker
22
22
  from msprobe.core.config_check.config_checker import register_checker_item, register_pre_forward_fun_list
23
23
  from msprobe.core.config_check.utils.utils import config_checking_print, get_tensor_features
24
24
  from msprobe.core.common.framework_adapter import FmkAdp
25
+ from msprobe.core.common.const import Const
25
26
 
26
27
 
27
28
  def collect_weights_data(model):
@@ -143,5 +144,5 @@ class WeightsChecker(BaseChecker):
143
144
  bench_weight_pack_path = os.path.join(bench_dir, WeightsChecker.target_name_in_zip)
144
145
  cmp_weight_pack_path = os.path.join(cmp_dir, WeightsChecker.target_name_in_zip)
145
146
  df = compare_weight(bench_weight_pack_path, cmp_weight_pack_path)
146
- pass_check = False not in df['equal'].values
147
+ pass_check = Const.CONFIG_CHECK_PASS if False not in df['equal'].values else Const.CONFIG_CHECK_ERROR
147
148
  return WeightsChecker.target_name_in_zip, pass_check, df
@@ -138,6 +138,8 @@ def _consolidate_tp_weights(weights: Dict) -> Dict:
138
138
  def _parse_num_layers_per_stage(tp_partition):
139
139
  match = [re.findall(LAYER_IDX_PATTERN, key) for key in tp_partition.keys()]
140
140
  layer_idx = [int(i[0]) for i in match if i]
141
+ if not layer_idx:
142
+ return 1
141
143
  num_layers_per_pipeline_stage = max(layer_idx) + 1
142
144
 
143
145
  return num_layers_per_pipeline_stage
@@ -18,4 +18,14 @@ weight_decay:
18
18
 
19
19
  dropout_rate:
20
20
  - dropout
21
- - drop_rate
21
+ - drop_rate
22
+
23
+ compute_dtype:
24
+ - bf16
25
+ - fp32
26
+
27
+ residual_dtype:
28
+ - fp32_residual_connection
29
+
30
+ softmax_compute_dtype:
31
+ - attention_softmax_in_fp32
@@ -96,9 +96,13 @@ class YamlParser(Parser):
96
96
  new_prefix = prefix + Const.SEP + key if prefix else key
97
97
  self.recursive_parse_parameters(value, new_prefix)
98
98
  elif isinstance(parameters, list):
99
- for value in parameters:
100
- self.recursive_parse_parameters(value, prefix)
101
- elif isinstance(parameters, (int, str, bool)):
99
+ if all(isinstance(x, (int, float, str, bool, list))for x in parameters):
100
+ self.hyperparameters.update({prefix: parameters})
101
+ else:
102
+ for idx, value in enumerate(parameters):
103
+ new_prefix = prefix + Const.SEP + str(idx) if prefix else str(idx)
104
+ self.recursive_parse_parameters(value, new_prefix)
105
+ elif isinstance(parameters, (int, float, str, bool)):
102
106
  self.hyperparameters.update({prefix: parameters})
103
107
 
104
108
 
@@ -19,6 +19,7 @@ import hashlib
19
19
 
20
20
  from msprobe.core.common.framework_adapter import FmkAdp
21
21
  from msprobe.core.common.log import logger
22
+ from msprobe.core.common.const import Const
22
23
 
23
24
 
24
25
  def merge_keys(dir_0, dir_1):
@@ -105,3 +106,12 @@ def update_dict(ori_dict, new_dict):
105
106
  ori_dict[key] = {"description": "duplicate_value", "values": [ori_dict[key], new_dict[key]]}
106
107
  else:
107
108
  ori_dict[key] = value
109
+
110
+
111
+ def process_pass_check(data):
112
+ if Const.CONFIG_CHECK_ERROR in data:
113
+ return Const.CONFIG_CHECK_ERROR
114
+ elif Const.CONFIG_CHECK_WARNING in data:
115
+ return Const.CONFIG_CHECK_WARNING
116
+ else:
117
+ return Const.CONFIG_CHECK_PASS
@@ -35,7 +35,7 @@ class ApiWrapper:
35
35
  def __init__(
36
36
  self, api_types: Dict[str, Dict[str, Any]],
37
37
  api_list_paths: Union[str, List[str], Tuple[str]],
38
- backlist: Union[List[str], Tuple[str]] = None
38
+ blacklist: Union[List[str], Tuple[str]] = None
39
39
  ):
40
40
  self.api_types = api_types
41
41
  if not isinstance(api_list_paths, (list, tuple)):
@@ -44,7 +44,7 @@ class ApiWrapper:
44
44
  raise RuntimeError("The number of api_list_paths must be equal to the number of frameworks in 'api_types', "
45
45
  "when api_list_paths is a list or tuple.")
46
46
  self.api_list_paths = api_list_paths
47
- self.backlist = backlist if backlist else []
47
+ self.blacklist = blacklist if blacklist else []
48
48
  self.api_names = self._get_api_names()
49
49
  self.wrapped_api_functions = dict()
50
50
 
@@ -80,6 +80,26 @@ class ApiWrapper:
80
80
 
81
81
  return True, args, kwargs
82
82
 
83
+ def wrap_api_func(self, api_name, api_func, prefix, hook_build_func, api_template):
84
+ api_instance = api_template(api_name, api_func, prefix, hook_build_func)
85
+
86
+ def api_function(*args, **kwargs):
87
+ api_name_with_prefix = prefix + Const.SEP + str(api_name.split(Const.SEP)[-1])
88
+ enable_wrap, args, kwargs = self.deal_with_self_kwargs(api_name_with_prefix, api_func, args, kwargs)
89
+ if not enable_wrap:
90
+ logger.warning(f'Cannot collect precision data of {api_name_with_prefix}. '
91
+ 'It may be fixed by passing the value of "self" '
92
+ 'as a positional argument instead of a keyword argument. ')
93
+ return api_func(*args, **kwargs)
94
+ return api_instance(*args, **kwargs)
95
+
96
+ for attr_name in Const.API_ATTR_LIST:
97
+ if hasattr(api_func, attr_name):
98
+ attr_value = getattr(api_func, attr_name)
99
+ setattr(api_function, attr_name, attr_value)
100
+
101
+ return api_function
102
+
83
103
  def wrap_api(
84
104
  self, api_templates, hook_build_func: Optional[Callable]
85
105
  ):
@@ -100,23 +120,17 @@ class ApiWrapper:
100
120
  api_template = api_templates[index]
101
121
  index += 1
102
122
  for api_name in self.api_names.get(framework, {}).get(api_type, []):
103
- ori_api = _get_attr(api_modules[0], api_name)
123
+ ori_api = None
124
+ for module in api_modules[0]:
125
+ ori_api = ori_api or _get_attr(module, api_name)
104
126
  if callable(ori_api):
105
- def wrap_api_func(api_name, api_func, prefix, hook_build_func, api_template):
106
- def api_function(*args, **kwargs):
107
- api_name_with_prefix = prefix + Const.SEP + str(api_name.split(Const.SEP)[-1])
108
- enable_wrap, args, kwargs = self.deal_with_self_kwargs(api_name_with_prefix,
109
- api_func, args, kwargs)
110
- if not enable_wrap:
111
- logger.warning(f'Cannot collect precision data of {api_name_with_prefix}. '
112
- 'It may be fixed by passing the value of "self" '
113
- 'as a positional argument instead of a keyword argument. ')
114
- return api_func(*args, **kwargs)
115
- return api_template(api_name, api_func, prefix, hook_build_func)(*args, **kwargs)
116
- api_function.__name__ = api_name
117
- return api_function
118
- wrapped_functions[api_name] = wrap_api_func(api_name, ori_api, name_prefix,
119
- hook_build_func, api_template)
127
+ wrapped_functions[api_name] = self.wrap_api_func(
128
+ api_name,
129
+ ori_api,
130
+ name_prefix,
131
+ hook_build_func,
132
+ api_template
133
+ )
120
134
  wrapped_functions_in_framework[api_type] = wrapped_functions
121
135
  self.wrapped_api_functions[framework] = wrapped_functions_in_framework
122
136
  return self.wrapped_api_functions
@@ -132,15 +146,17 @@ class ApiWrapper:
132
146
  api_from_file = api_list.get(key_in_file, [])
133
147
  names = set()
134
148
  for api_name in api_from_file:
135
- if f'{key_in_file}.{api_name}' in self.backlist:
149
+ if f'{key_in_file}.{api_name}' in self.blacklist:
136
150
  continue
137
151
  target_attr = api_name
138
- target_module = api_modules[0]
139
- if Const.SEP in api_name:
140
- sub_module_name, target_attr = api_name.rsplit(Const.SEP, 1)
141
- target_module = getattr(api_modules[0], sub_module_name, None)
142
- if target_module and target_attr in dir(target_module):
143
- names.add(api_name)
152
+ for module in api_modules[0]:
153
+ if Const.SEP in api_name:
154
+ sub_module_name, target_attr = api_name.rsplit(Const.SEP, 1)
155
+ target_module = getattr(module, sub_module_name, None)
156
+ else:
157
+ target_module = module
158
+ if target_module and target_attr in dir(target_module):
159
+ names.add(api_name)
144
160
  valid_names[api_type] = names
145
161
  api_names[framework] = valid_names
146
162
 
@@ -152,7 +168,7 @@ class ApiRegistry:
152
168
  Base class for api registry.
153
169
  """
154
170
 
155
- def __init__(self, api_types, inner_used_api, supported_api_list_path, api_templates, backlist=None):
171
+ def __init__(self, api_types, inner_used_api, supported_api_list_path, api_templates, blacklist=None):
156
172
  self.ori_api_attr = dict()
157
173
  self.wrapped_api_attr = dict()
158
174
  self.inner_used_ori_attr = dict()
@@ -161,13 +177,16 @@ class ApiRegistry:
161
177
  self.inner_used_api = inner_used_api
162
178
  self.supported_api_list_path = supported_api_list_path
163
179
  self.api_templates = api_templates
164
- self.backlist = backlist if backlist else []
180
+ self.blacklist = blacklist if blacklist else []
165
181
  self.all_api_registered = False
166
182
 
167
183
  @staticmethod
168
- def store_ori_attr(ori_api_group, api_list, api_ori_attr):
184
+ def store_ori_attr(ori_api_groups, api_list, api_ori_attr):
169
185
  for api in api_list:
170
- api_ori_attr[api] = _get_attr(ori_api_group, api)
186
+ ori_api = None
187
+ for ori_api_group in ori_api_groups:
188
+ ori_api = ori_api or _get_attr(ori_api_group, api)
189
+ api_ori_attr[api] = ori_api
171
190
 
172
191
  @staticmethod
173
192
  def set_api_attr(api_group, attr_dict):
@@ -217,7 +236,7 @@ class ApiRegistry:
217
236
  self.set_api_attr(self.inner_used_api.get(api_type)[0], self.inner_used_ori_attr.get(api_type, {}))
218
237
 
219
238
  def initialize_hook(self, hook_build_func):
220
- api_wrapper = ApiWrapper(self.api_types, self.supported_api_list_path, self.backlist)
239
+ api_wrapper = ApiWrapper(self.api_types, self.supported_api_list_path, self.blacklist)
221
240
  wrapped_api_functions = api_wrapper.wrap_api(self.api_templates, hook_build_func)
222
241
 
223
242
  for framework, api_types in self.api_types.items():
@@ -23,6 +23,7 @@ from msprobe.core.data_dump.json_writer import DataWriter
23
23
  from msprobe.core.common.log import logger
24
24
  from msprobe.core.common.const import Const
25
25
  from msprobe.core.data_dump.data_processor.factory import DataProcessorFactory
26
+ from msprobe.core.common.megatron_utils import MegatronStepInfo, get_micro_step, is_megatron
26
27
 
27
28
 
28
29
  def build_data_collector(config):
@@ -41,6 +42,7 @@ class DataCollector:
41
42
  self.module_count = {}
42
43
  self.scope = ScopeFactory(self.config).build_scope()
43
44
  self.backward_module_names = {}
45
+ self.params_grad_record = {}
44
46
  self.optimizer_status = ""
45
47
  self.optimizer_status_first_start = {Const.OPTIMIZER: True, Const.CLIP_GRAD: True}
46
48
  atexit.register(self.write_json_at_exit)
@@ -118,12 +120,16 @@ class DataCollector:
118
120
  self.set_is_recomputable(data_info, is_recompute)
119
121
  if self.config.level == Const.LEVEL_L2:
120
122
  return
123
+ self.call_stack_collect(name)
121
124
  self.handle_data(name, data_info, flush=self.data_processor.is_terminated)
122
125
 
123
- except Exception:
126
+ except Exception as e:
127
+ # 取异常类名作为“类型”做去重
128
+ error_type = type(e).__name__
124
129
  tb = traceback.format_exc()
125
130
  self.data_writer.write_error_log(
126
- f"[ERROR] forward_input_data_collect failed: name={name}, pid={pid}\n{tb}"
131
+ f"[ERROR] forward_input_data_collect failed: name={name}, pid={pid}\n{tb}",
132
+ error_type=error_type
127
133
  )
128
134
 
129
135
  def forward_output_data_collect(self, name, module, pid, module_input_output, is_recompute=None):
@@ -139,13 +145,15 @@ class DataCollector:
139
145
  self.set_is_recomputable(data_info, is_recompute)
140
146
  if self.config.level == Const.LEVEL_L2:
141
147
  return
142
- self.call_stack_collect(name)
143
148
  self.handle_data(name, data_info, flush=self.data_processor.is_terminated)
144
149
 
145
- except Exception:
150
+ except Exception as e:
151
+ # 取异常类名作为“类型”做去重
152
+ error_type = type(e).__name__
146
153
  tb = traceback.format_exc()
147
154
  self.data_writer.write_error_log(
148
- f"[ERROR] forward_output_data_collect failed: name={name}, pid={pid}\n{tb}"
155
+ f"[ERROR] forward_output_data_collect failed: name={name}, pid={pid}\n{tb}",
156
+ error_type=error_type
149
157
  )
150
158
 
151
159
  def forward_data_collect_only_tensor(self, name, module, pid, module_input_output):
@@ -154,10 +162,13 @@ class DataCollector:
154
162
  return
155
163
  self.data_processor.analyze_forward(name, module, module_input_output)
156
164
 
157
- except Exception:
165
+ except Exception as e:
166
+ # 取异常类名作为“类型”做去重
167
+ error_type = type(e).__name__
158
168
  tb = traceback.format_exc()
159
169
  self.data_writer.write_error_log(
160
- f"[ERROR] forward_data_collect_only_tensor failed: name={name}, pid={pid}\n{tb}"
170
+ f"[ERROR] forward_data_collect_only_tensor failed: name={name}, pid={pid}\n{tb}",
171
+ error_type=error_type
161
172
  )
162
173
 
163
174
  def forward_data_collect(self, name, module, pid, module_input_output, is_recompute=None):
@@ -173,10 +184,12 @@ class DataCollector:
173
184
  self.call_stack_collect(name)
174
185
  self.handle_data(name, data_info, flush=self.data_processor.is_terminated)
175
186
 
176
- except Exception:
187
+ except Exception as e:
188
+ error_type = type(e).__name__
177
189
  tb = traceback.format_exc()
178
190
  self.data_writer.write_error_log(
179
- f"[ERROR] forward_data_collect failed: name={name}, pid={pid}\n{tb}"
191
+ f"[ERROR] forward_data_collect failed: name={name}, pid={pid}\n{tb}",
192
+ error_type=error_type
180
193
  )
181
194
 
182
195
  def backward_data_collect_only_tensor(self, name, module, pid, module_input_output, is_recompute=None):
@@ -185,10 +198,12 @@ class DataCollector:
185
198
  return
186
199
  self.data_processor.analyze_backward(name, module, module_input_output)
187
200
 
188
- except Exception:
201
+ except Exception as e:
202
+ error_type = type(e).__name__
189
203
  tb = traceback.format_exc()
190
204
  self.data_writer.write_error_log(
191
- f"[ERROR] backward_data_collect_only_tensor failed: name={name}, pid={pid}\n{tb}"
205
+ f"[ERROR] backward_data_collect_only_tensor failed: name={name}, pid={pid}\n{tb}",
206
+ error_type=error_type
192
207
  )
193
208
 
194
209
  def backward_data_collect(self, name, module, pid, module_input_output, is_recompute=None):
@@ -206,10 +221,12 @@ class DataCollector:
206
221
  self.backward_module_names[module_name] = True
207
222
  self.handle_data(name, data_info, flush=self.data_processor.is_terminated)
208
223
 
209
- except Exception:
224
+ except Exception as e:
225
+ error_type = type(e).__name__
210
226
  tb = traceback.format_exc()
211
227
  self.data_writer.write_error_log(
212
- f"[ERROR] backward_data_collect failed: name={name}, pid={pid}\n{tb}"
228
+ f"[ERROR] backward_data_collect failed: name={name}, pid={pid}\n{tb}",
229
+ error_type=error_type
213
230
  )
214
231
 
215
232
  def backward_input_data_collect(self, name, module, pid, module_input_output, is_recompute=None):
@@ -223,10 +240,12 @@ class DataCollector:
223
240
  self.set_is_recomputable(data_info, is_recompute)
224
241
  self.handle_data(name, data_info)
225
242
 
226
- except Exception:
243
+ except Exception as e:
244
+ error_type = type(e).__name__
227
245
  tb = traceback.format_exc()
228
246
  self.data_writer.write_error_log(
229
- f"[ERROR] backward_input_data_collect failed: name={name}, pid={pid}\n{tb}"
247
+ f"[ERROR] backward_input_data_collect failed: name={name}, pid={pid}\n{tb}",
248
+ error_type=error_type
230
249
  )
231
250
 
232
251
  def backward_output_data_collect(self, name, module, pid, module_input_output, is_recompute=None):
@@ -240,25 +259,32 @@ class DataCollector:
240
259
  self.set_is_recomputable(data_info, is_recompute)
241
260
  self.handle_data(name, data_info)
242
261
 
243
- except Exception:
262
+ except Exception as e:
263
+ error_type = type(e).__name__
244
264
  tb = traceback.format_exc()
245
265
  self.data_writer.write_error_log(
246
- f"[ERROR] backward_output_data_collect failed: name={name}, pid={pid}\n{tb}"
266
+ f"[ERROR] backward_output_data_collect failed: name={name}, pid={pid}\n{tb}",
267
+ error_type=error_type
247
268
  )
248
269
 
249
270
  def update_construct(self, name):
250
271
  if self.config.level not in DataCollector.level_without_construct:
251
272
  if self.optimizer_status in [Const.OPTIMIZER, Const.CLIP_GRAD]:
252
273
  if self.optimizer_status_first_start[self.optimizer_status]:
253
- self.data_writer.update_construct({self.optimizer_status: None})
274
+ self.data_writer.update_construct(
275
+ {self.optimizer_status: None if not is_megatron() else [None, get_micro_step()]})
254
276
  self.optimizer_status_first_start[self.optimizer_status] = False
255
- self.data_writer.update_construct({name: self.optimizer_status})
277
+ self.data_writer.update_construct(
278
+ {name: self.optimizer_status if not is_megatron() else [self.optimizer_status, get_micro_step()]})
256
279
  else:
257
280
  if self.config.level == Const.LEVEL_MIX and \
258
281
  not (name.startswith(Const.MODULE) or name.startswith(Const.CELL)):
259
282
  self.data_writer.update_construct(
260
283
  {name: self.module_processor.api_parent_node.get(threading.get_ident())}
261
284
  )
285
+ if MegatronStepInfo.is_megatron:
286
+ micro_step_number = max(MegatronStepInfo.forward_micro_step, MegatronStepInfo.backward_micro_step)
287
+ self.data_writer.update_construct({Const.MEGATRON_MICRO_STEP_NUMBER: micro_step_number})
262
288
 
263
289
  self.data_writer.update_construct(self.module_processor.module_node)
264
290
 
@@ -282,20 +308,36 @@ class DataCollector:
282
308
  self.data_processor.update_iter(current_iter)
283
309
 
284
310
  def params_data_collect(self, name, param_name, pid, data):
311
+ grad_name = name + Const.SEP + Const.PARAMS_GRAD
312
+ self.update_api_or_module_name(grad_name)
313
+ if not self.check_scope_and_pid(self.scope, name, pid) and not self.backward_module_names.get(name):
314
+ if self.data_writer.cache_data.get("data"):
315
+ self.data_writer.cache_data.get("data").pop(grad_name, None)
316
+ self.params_grad_record[grad_name] = False
317
+ return
318
+ data_info = self.data_processor.analyze_params(grad_name, param_name, data)
319
+ self.handle_data(grad_name, data_info, flush=self.data_processor.is_terminated)
320
+ self.params_grad_record[grad_name] = False
321
+
322
+ def params_data_collect_in_bw_hook(self, params_dict, name):
285
323
  try:
286
- grad_name = name + Const.SEP + Const.PARAMS_GRAD
287
- self.update_api_or_module_name(grad_name)
288
- if not self.check_scope_and_pid(self.scope, name, pid) and not self.backward_module_names.get(name):
289
- if self.data_writer.cache_data.get("data"):
290
- self.data_writer.cache_data.get("data").pop(grad_name, None)
324
+ if not params_dict:
291
325
  return
292
- data_info = self.data_processor.analyze_params(grad_name, param_name, data)
293
- self.handle_data(grad_name, data_info, flush=self.data_processor.is_terminated)
294
- except Exception:
326
+ ori_name = name.rsplit(Const.SEP, 2)[0]
327
+ for param_name, param in params_dict.items():
328
+ grad_name = ori_name + Const.SEP + Const.PARAMS_GRAD
329
+ self.update_api_or_module_name(grad_name)
330
+ if self.params_grad_record.get(grad_name, False):
331
+ grad = param.grad if hasattr(param, "grad") else None
332
+ data_info = self.data_processor.analyze_params(grad_name, param_name, grad)
333
+ self.handle_data(grad_name, data_info, flush=self.data_processor.is_terminated)
334
+ except Exception as e:
335
+ error_type = type(e).__name__
295
336
  tb = traceback.format_exc()
296
337
  self.data_writer.write_error_log(
297
- f"[ERROR] params_data_collect failed: "
298
- f"name={name}, param_name={param_name}, pid={pid}\n{tb}"
338
+ f"[ERROR] params_data_collect_in_bw_hook failed: "
339
+ f"name={name}",
340
+ error_type=error_type
299
341
  )
300
342
 
301
343
  def debug_data_collect_forward(self, variable, name_with_count):
@@ -94,6 +94,8 @@ class BaseDataProcessor:
94
94
  def __init__(self, config, data_writer):
95
95
  self.data_writer = data_writer
96
96
  self.config = config
97
+ if self.data_writer is not None:
98
+ self.data_writer.config = config
97
99
  self.api_info_struct = {}
98
100
  self.stack_info_struct = {}
99
101
  self.current_api_or_module_name = None