PyPI - mindstudio-probe - Versions diffs - 1.2.2__py3-none-any.whl → 8.1.0__py3-none-any.whl - Mend

mindstudio-probe 1.2.2__py3-none-any.whl → 8.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (261)

{mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/METADATA +4 -3
{mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/RECORD +243 -191
msprobe/README.md +57 -21
msprobe/core/__init__.py +17 -0
msprobe/core/common/const.py +224 -82
msprobe/core/common/decorator.py +50 -0
msprobe/core/common/exceptions.py +5 -3
msprobe/core/common/file_utils.py +274 -40
msprobe/core/common/framework_adapter.py +169 -0
msprobe/core/common/global_lock.py +86 -0
msprobe/core/common/runtime.py +25 -0
msprobe/core/common/utils.py +148 -72
msprobe/core/common_config.py +7 -0
msprobe/core/compare/acc_compare.py +640 -462
msprobe/core/compare/check.py +36 -107
msprobe/core/compare/compare_cli.py +4 -0
msprobe/core/compare/config.py +72 -0
msprobe/core/compare/highlight.py +217 -215
msprobe/core/compare/layer_mapping/layer_mapping.py +4 -1
msprobe/core/compare/merge_result/merge_result.py +12 -6
msprobe/core/compare/multiprocessing_compute.py +227 -107
msprobe/core/compare/npy_compare.py +32 -16
msprobe/core/compare/utils.py +218 -244
msprobe/{mindspore/runtime.py → core/config_check/__init__.py} +2 -4
msprobe/{pytorch/dump/kernel_dump/kernel_config.py → core/config_check/checkers/__init__.py} +8 -16
msprobe/core/config_check/checkers/base_checker.py +60 -0
msprobe/core/config_check/checkers/dataset_checker.py +138 -0
msprobe/core/config_check/checkers/env_args_checker.py +96 -0
msprobe/core/config_check/checkers/hyperparameter_checker.py +170 -0
msprobe/core/config_check/checkers/pip_checker.py +90 -0
msprobe/core/config_check/checkers/random_checker.py +367 -0
msprobe/core/config_check/checkers/weights_checker.py +147 -0
msprobe/core/config_check/ckpt_compare/ckpt_comparator.py +74 -0
msprobe/core/config_check/ckpt_compare/megatron_loader.py +302 -0
msprobe/core/config_check/ckpt_compare/metrics.py +83 -0
msprobe/core/config_check/ckpt_compare/name_mapping.yaml +12 -0
msprobe/core/config_check/config_check_cli.py +51 -0
msprobe/core/config_check/config_checker.py +100 -0
msprobe/{pytorch/parse.py → core/config_check/resource/dependency.yaml} +7 -4
msprobe/core/config_check/resource/env.yaml +57 -0
msprobe/core/config_check/resource/hyperparameter.yaml +21 -0
msprobe/core/config_check/utils/hyperparameter_parser.py +115 -0
msprobe/core/config_check/utils/utils.py +107 -0
msprobe/core/data_dump/api_registry.py +239 -0
msprobe/core/data_dump/data_collector.py +36 -9
msprobe/core/data_dump/data_processor/base.py +74 -53
msprobe/core/data_dump/data_processor/mindspore_processor.py +119 -78
msprobe/core/data_dump/data_processor/pytorch_processor.py +134 -96
msprobe/core/data_dump/json_writer.py +146 -57
msprobe/core/debugger/precision_debugger.py +143 -0
msprobe/core/grad_probe/constant.py +2 -1
msprobe/core/grad_probe/grad_compare.py +2 -2
msprobe/core/grad_probe/utils.py +1 -1
msprobe/core/hook_manager.py +242 -0
msprobe/core/monitor/anomaly_processor.py +384 -0
msprobe/core/overflow_check/abnormal_scene.py +2 -0
msprobe/core/service.py +356 -0
msprobe/core/single_save/__init__.py +0 -0
msprobe/core/single_save/single_comparator.py +243 -0
msprobe/core/single_save/single_saver.py +157 -0
msprobe/docs/01.installation.md +6 -5
msprobe/docs/02.config_introduction.md +89 -30
msprobe/docs/03.config_examples.md +1 -0
msprobe/docs/04.kernel_dump_PyTorch.md +1 -1
msprobe/docs/05.data_dump_PyTorch.md +184 -50
msprobe/docs/06.data_dump_MindSpore.md +193 -28
msprobe/docs/07.accuracy_checker_PyTorch.md +13 -3
msprobe/docs/08.accuracy_checker_online_PyTorch.md +72 -10
msprobe/docs/09.accuracy_checker_MindSpore.md +19 -7
msprobe/docs/10.accuracy_compare_PyTorch.md +266 -102
msprobe/docs/11.accuracy_compare_MindSpore.md +117 -43
msprobe/docs/12.overflow_check_PyTorch.md +5 -3
msprobe/docs/13.overflow_check_MindSpore.md +6 -4
msprobe/docs/14.data_parse_PyTorch.md +4 -10
msprobe/docs/17.grad_probe.md +2 -1
msprobe/docs/18.online_dispatch.md +3 -3
msprobe/docs/19.monitor.md +211 -103
msprobe/docs/21.visualization_PyTorch.md +100 -28
msprobe/docs/22.visualization_MindSpore.md +103 -31
msprobe/docs/23.generate_operator_PyTorch.md +9 -9
msprobe/docs/25.tool_function_introduction.md +23 -22
msprobe/docs/26.data_dump_PyTorch_baseline.md +14 -3
msprobe/docs/27.dump_json_instruction.md +278 -8
msprobe/docs/28.debugger_save_instruction.md +111 -20
msprobe/docs/28.kernel_dump_MindSpore.md +1 -1
msprobe/docs/29.data_dump_MSAdapter.md +229 -0
msprobe/docs/30.overflow_check_MSAdapter.md +31 -0
msprobe/docs/31.config_check.md +95 -0
msprobe/docs/32.ckpt_compare.md +69 -0
msprobe/docs/33.generate_operator_MindSpore.md +190 -0
msprobe/docs/34.RL_collect.md +92 -0
msprobe/docs/35.nan_analyze.md +72 -0
msprobe/docs/FAQ.md +3 -11
msprobe/docs/data_dump_MindSpore/data_dump_MindSpore_baseline.md +12 -1
msprobe/docs/data_dump_MindSpore/dynamic_graph_quick_start_example.md +3 -1
msprobe/docs/img/compare_result.png +0 -0
msprobe/docs/img/merge_result.png +0 -0
msprobe/docs/img/save_compare_result_sample.png +0 -0
msprobe/docs/img/visualization/proxy.png +0 -0
msprobe/docs/img/visualization/vis_browser_1.png +0 -0
msprobe/docs/img/visualization/vis_match_info.png +0 -0
msprobe/docs/img/visualization/vis_precision_info.png +0 -0
msprobe/docs/img/visualization/vis_search_info.png +0 -0
msprobe/docs/img/visualization/vis_show_info.png +0 -0
msprobe/docs/img/visualization/vis_showcase.png +0 -0
msprobe/docs/img/visualization/vis_unmatch_info.png +0 -0
msprobe/mindspore/__init__.py +3 -3
msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +151 -55
msprobe/mindspore/api_accuracy_checker/api_runner.py +25 -11
msprobe/mindspore/api_accuracy_checker/base_compare_algorithm.py +2 -1
msprobe/mindspore/api_accuracy_checker/bench_functions/flash_attention_score.py +580 -0
msprobe/mindspore/api_accuracy_checker/bench_functions/fusion_operator.py +41 -0
msprobe/mindspore/api_accuracy_checker/cmd_parser.py +4 -0
msprobe/mindspore/api_accuracy_checker/data_manager.py +4 -3
msprobe/mindspore/api_accuracy_checker/generate_op_script/config_op.json +9 -0
msprobe/mindspore/api_accuracy_checker/generate_op_script/op_generator.py +451 -0
msprobe/mindspore/api_accuracy_checker/generate_op_script/operator_replication.template +2081 -0
msprobe/mindspore/api_accuracy_checker/multi_api_accuracy_checker.py +11 -1
msprobe/mindspore/api_accuracy_checker/torch_mindtorch_importer.py +2 -1
msprobe/mindspore/cell_processor.py +204 -33
msprobe/mindspore/code_mapping/graph_parser.py +4 -21
msprobe/mindspore/common/const.py +73 -2
msprobe/mindspore/common/utils.py +157 -29
msprobe/mindspore/compare/common_dir_compare.py +382 -0
msprobe/mindspore/compare/distributed_compare.py +2 -26
msprobe/mindspore/compare/ms_compare.py +18 -398
msprobe/mindspore/compare/ms_graph_compare.py +20 -10
msprobe/mindspore/compare/utils.py +37 -0
msprobe/mindspore/debugger/debugger_config.py +59 -7
msprobe/mindspore/debugger/precision_debugger.py +83 -90
msprobe/mindspore/dump/cell_dump_process.py +902 -0
msprobe/mindspore/dump/cell_dump_with_insert_gradient.py +889 -0
msprobe/mindspore/dump/dump_tool_factory.py +18 -8
msprobe/mindspore/dump/graph_mode_cell_dump.py +139 -0
msprobe/mindspore/dump/graph_tensor_dump.py +123 -0
msprobe/mindspore/dump/hook_cell/api_register.py +176 -0
msprobe/mindspore/dump/hook_cell/hook_cell.py +22 -12
msprobe/mindspore/dump/hook_cell/ms_hook_manager.py +88 -0
msprobe/mindspore/dump/hook_cell/primitive_hooks.py +8 -2
msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml +42 -26
msprobe/mindspore/dump/jit_dump.py +35 -27
msprobe/mindspore/dump/kernel_kbyk_dump.py +6 -3
msprobe/mindspore/dym_loader/hook_dynamic_loader.cpp +110 -0
msprobe/mindspore/dym_loader/hook_dynamic_loader.h +15 -16
msprobe/mindspore/free_benchmark/api_pynative_self_check.py +22 -12
msprobe/mindspore/free_benchmark/common/utils.py +1 -1
msprobe/mindspore/free_benchmark/perturbation/perturbation_factory.py +4 -2
msprobe/mindspore/free_benchmark/self_check_tool_factory.py +6 -3
msprobe/mindspore/grad_probe/global_context.py +9 -2
msprobe/mindspore/grad_probe/grad_analyzer.py +2 -1
msprobe/mindspore/grad_probe/grad_stat_csv.py +3 -2
msprobe/mindspore/grad_probe/hook.py +2 -4
msprobe/mindspore/mindspore_service.py +111 -0
msprobe/mindspore/monitor/common_func.py +52 -0
msprobe/mindspore/monitor/data_writers.py +237 -0
msprobe/mindspore/monitor/distributed/wrap_distributed.py +1 -1
msprobe/mindspore/monitor/features.py +13 -1
msprobe/mindspore/monitor/module_hook.py +568 -444
msprobe/mindspore/monitor/optimizer_collect.py +331 -0
msprobe/mindspore/monitor/utils.py +71 -9
msprobe/mindspore/ms_config.py +16 -15
msprobe/mindspore/overflow_check/overflow_check_tool_factory.py +5 -3
msprobe/mindspore/task_handler_factory.py +5 -2
msprobe/msprobe.py +19 -0
msprobe/nan_analyze/__init__.py +14 -0
msprobe/nan_analyze/analyzer.py +255 -0
msprobe/nan_analyze/graph.py +189 -0
msprobe/nan_analyze/utils.py +211 -0
msprobe/pytorch/api_accuracy_checker/common/config.py +2 -2
msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +3 -6
msprobe/pytorch/api_accuracy_checker/compare/compare.py +36 -34
msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +15 -13
msprobe/pytorch/api_accuracy_checker/generate_op_script/operator_replication.template +206 -4
msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +9 -9
msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +6 -5
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +31 -9
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py +28 -20
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py +3 -1
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +29 -13
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +12 -2
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +45 -31
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/utils.py +154 -0
msprobe/pytorch/attl_manager.py +65 -0
msprobe/pytorch/bench_functions/moe_gating_top_k_softmax.py +6 -0
msprobe/pytorch/bench_functions/npu_fusion_attention.py +27 -0
msprobe/pytorch/common/utils.py +53 -19
msprobe/pytorch/compare/distributed_compare.py +4 -36
msprobe/pytorch/compare/pt_compare.py +13 -84
msprobe/pytorch/compare/utils.py +47 -0
msprobe/pytorch/debugger/debugger_config.py +34 -17
msprobe/pytorch/debugger/precision_debugger.py +50 -96
msprobe/pytorch/dump/module_dump/hook_wrapper.py +93 -0
msprobe/pytorch/dump/module_dump/module_dump.py +15 -61
msprobe/pytorch/dump/module_dump/module_processer.py +150 -114
msprobe/pytorch/free_benchmark/common/utils.py +1 -1
msprobe/pytorch/free_benchmark/compare/single_benchmark.py +1 -1
msprobe/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py +3 -3
msprobe/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py +3 -3
msprobe/pytorch/free_benchmark/perturbed_layers/npu/change_value.py +1 -1
msprobe/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py +1 -1
msprobe/pytorch/free_benchmark/result_handlers/check_handler.py +1 -1
msprobe/pytorch/function_factory.py +1 -1
msprobe/pytorch/grad_probe/grad_monitor.py +2 -2
msprobe/pytorch/grad_probe/grad_stat_csv.py +3 -2
msprobe/pytorch/hook_module/api_register.py +155 -0
msprobe/pytorch/hook_module/hook_module.py +18 -22
msprobe/pytorch/hook_module/jit_script_wrapper.py +33 -0
msprobe/pytorch/hook_module/pt_hook_manager.py +68 -0
msprobe/pytorch/hook_module/register_optimizer_hook.py +2 -1
msprobe/pytorch/hook_module/support_wrap_ops.yaml +193 -75
msprobe/pytorch/hook_module/utils.py +28 -2
msprobe/pytorch/monitor/csv2tb.py +14 -4
msprobe/pytorch/monitor/data_writers.py +259 -0
msprobe/pytorch/monitor/distributed/wrap_distributed.py +8 -2
msprobe/pytorch/monitor/module_hook.py +336 -241
msprobe/pytorch/monitor/module_metric.py +17 -0
msprobe/pytorch/monitor/optimizer_collect.py +244 -224
msprobe/pytorch/monitor/utils.py +84 -4
msprobe/pytorch/online_dispatch/compare.py +0 -2
msprobe/pytorch/online_dispatch/dispatch.py +13 -2
msprobe/pytorch/online_dispatch/dump_compare.py +8 -2
msprobe/pytorch/online_dispatch/utils.py +3 -0
msprobe/pytorch/parse_tool/lib/interactive_cli.py +1 -6
msprobe/pytorch/parse_tool/lib/utils.py +5 -4
msprobe/pytorch/pt_config.py +16 -11
msprobe/pytorch/pytorch_service.py +70 -0
msprobe/visualization/builder/graph_builder.py +69 -10
msprobe/visualization/builder/msprobe_adapter.py +24 -12
msprobe/visualization/compare/graph_comparator.py +63 -51
msprobe/visualization/compare/mode_adapter.py +22 -20
msprobe/visualization/graph/base_node.py +11 -4
msprobe/visualization/graph/distributed_analyzer.py +1 -10
msprobe/visualization/graph/graph.py +2 -13
msprobe/visualization/graph/node_op.py +1 -2
msprobe/visualization/graph_service.py +251 -104
msprobe/visualization/utils.py +26 -44
msprobe/mindspore/dump/hook_cell/api_registry.py +0 -207
msprobe/mindspore/dump/hook_cell/wrap_api.py +0 -212
msprobe/mindspore/dym_loader/hook_dynamic_loader.cc +0 -140
msprobe/mindspore/monitor/anomaly_detect.py +0 -404
msprobe/mindspore/monitor/module_spec_verifier.py +0 -94
msprobe/mindspore/service.py +0 -543
msprobe/pytorch/hook_module/api_registry.py +0 -166
msprobe/pytorch/hook_module/wrap_distributed.py +0 -79
msprobe/pytorch/hook_module/wrap_functional.py +0 -66
msprobe/pytorch/hook_module/wrap_npu_custom.py +0 -85
msprobe/pytorch/hook_module/wrap_tensor.py +0 -69
msprobe/pytorch/hook_module/wrap_torch.py +0 -84
msprobe/pytorch/hook_module/wrap_vf.py +0 -60
msprobe/pytorch/monitor/anomaly_analyse.py +0 -201
msprobe/pytorch/monitor/anomaly_detect.py +0 -410
msprobe/pytorch/monitor/module_spec_verifier.py +0 -95
msprobe/pytorch/monitor/unittest/test_monitor.py +0 -160
msprobe/pytorch/service.py +0 -470
{mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/LICENSE +0 -0
{mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/WHEEL +0 -0
{mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/entry_points.txt +0 -0
{mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/top_level.txt +0 -0
/msprobe/{mindspore → core}/compare/ms_to_pt_api.yaml +0 -0
/msprobe/{mindspore/dump → core}/kernel_dump/kernel_config.py +0 -0
/msprobe/{pytorch/monitor/unittest → core/monitor}/__init__.py +0 -0

msprobe/pytorch/monitor/utils.py CHANGED Viewed

@@ -22,10 +22,10 @@ import re
 import torch
-from msprobe.core.common.const import MonitorConst, Const
+from msprobe.core.common.const import MonitorConst
 from msprobe.pytorch.common.log import logger
 from msprobe.core.common.utils import is_int
-from msprobe.core.common.file_utils import check_file_or_directory_path
+from msprobe.core.common.file_utils import check_file_or_directory_path, recursive_chmod
 device = "cpu"
@@ -43,7 +43,6 @@ DIRECTORY_MAX_LENGTH = 4096
 beijing_tz = timezone(timedelta(hours=8))
 MVResult = namedtuple('MVResult', ("exp_avg", "exp_avg_sq", "update", "ratio"))
-MVGradResult = namedtuple('MVGradResult', ("exp_avg", "exp_avg_sq", "update", "ratio", "grad"))
 class MsgConst:
@@ -102,9 +101,23 @@ def validate_ops(ops):
         default_op = MonitorConst.OP_LIST[0]
         valid_ops.append(default_op)
         logger.info_on_rank_0(f"There is no valid ops, default op {default_op} is used")
+    # 增加默认shape和dtype参数
+    if "shape" not in valid_ops:
+        valid_ops.append("shape")
+    if "dtype" not in valid_ops:
+        valid_ops.append("dtype")
     return valid_ops
+def validate_ndigits(ndigits):
+    if not ndigits:
+        return
+    if not is_int(ndigits) or ndigits <= 0:
+        raise ValueError(f"ndigits({ndigits}) is not a positive integer, current is: {ndigits}.")
+    if ndigits > MonitorConst.MAX_NDIGITS:
+        raise ValueError(f"The maximum supported ndigits is {MonitorConst.MAX_NDIGITS}, current value: {ndigits}.")
 def validate_ranks(ranks):
     if not isinstance(ranks, list):
         raise TypeError("module_ranks should be a list")
@@ -190,7 +203,7 @@ def validate_alert(alert):
             args = rule.get("args")
             if args and isinstance(args, dict):
                 threshold = args.get("threshold")
-                if not isinstance(threshold, float) or threshold < 0:
+                if not isinstance(threshold, (float, int)) or threshold < 0:
                     raise TypeError('threshold must be float and not less than 0')
     dump = alert.get('dump')
     if dump and not isinstance(dump, bool):
@@ -206,9 +219,24 @@ def validate_step_count_per_record(step_count_per_record):
         raise ValueError("step_count_per_record must smaller than 1e6")
+def validate_dynamic_on(dynamic_on):
+    if not isinstance(dynamic_on, bool):
+        raise TypeError('dynamic_on should be a bool')
+def validate_monitor_mbs_grad(monitor_mbs_grad):
+    if not isinstance(monitor_mbs_grad, bool):
+        logger.warning(f'monitor_mbs_grad should be a bool, actual value is {monitor_mbs_grad}.')
+        return False
+    return monitor_mbs_grad
 def validate_config(config):
     config['ops'] = validate_ops(config.get('ops', []))
+    ndigits = config.get('ndigits')
+    validate_ndigits(ndigits)
     eps = config.get('eps', 1e-8)
     if not isinstance(eps, float):
         raise TypeError("eps should be a float")
@@ -246,9 +274,22 @@ def validate_config(config):
     step_count_per_record = config.get('step_count_per_record', 1)
     validate_step_count_per_record(step_count_per_record)
+    config["start_step"] = validate_int_arg(config.get("start_step"), "start_step",
+                                            MonitorConst.DEFAULT_START_STEP, MonitorConst.DEFAULT_START_STEP)
+    config["collect_times"] = validate_int_arg(config.get("collect_times"), "collect_times",
+                                               MonitorConst.DEFAULT_MIN_COLLECT_TIMES,
+                                               MonitorConst.DEFAULT_MAX_COLLECT_TIMES)
+    config["step_interval"] = validate_int_arg(config.get("step_interval"), "step_interval",
+                                               MonitorConst.DEFAULT_STEP_INTERVAL, MonitorConst.DEFAULT_STEP_INTERVAL)
     squash_name = config.get('squash_name', True)
     validate_squash_name(squash_name)
+    config["monitor_mbs_grad"] = validate_monitor_mbs_grad(config.get('monitor_mbs_grad', False))
+    dynamic_on = config.get('dynamic_on', False)
+    validate_dynamic_on(dynamic_on)
     if not targets:
         if xy_distribution:
             config["all_xy"] = True
@@ -257,6 +298,8 @@ def validate_config(config):
 def time_str2time_digit(time_str):
     time_format = '%b%d_%H-%M-%S'
+    if not isinstance(time_str, str):
+        raise TypeError(f"time_str:{time_str} should be a str")
     try:
         time_digit = datetime.strptime(time_str, time_format)
     except Exception as e:
@@ -284,3 +327,40 @@ def get_target_output_dir(monitor_path, time_start, time_end):
         if start_ok and end_ok:
             result[rank] = os.path.join(monitor_path, dirname)
     return result
+def chmod_tensorboard_dir(path):
+    """
+        format配置为tensorboard时，需要补充文件权限设置
+    """
+    try:
+        recursive_chmod(path)
+    except Exception as e:
+        logger.warning(f"chmod tensorboard dir wrong because {e}, not updated, please check!!!")
+def validate_set_monitor(grad_acc_steps, start_iteration):
+    """
+    validate parameters of set_monitor.
+    """
+    grad_acc_steps = validate_int_arg(grad_acc_steps, "grad_acc_steps",
+                                      MonitorConst.DEFAULT_GRAD_ACC_STEPS, MonitorConst.DEFAULT_GRAD_ACC_STEPS)
+    start_iteration = validate_int_arg(start_iteration, "start_iteration",
+                                       MonitorConst.DEFAULT_START_ITERATION, MonitorConst.DEFAULT_START_ITERATION)
+    return grad_acc_steps, start_iteration
+def validate_int_arg(value, name, minimum, default_value):
+    """Validate int args, if any exception occurs, use the default value."""
+    if value is None:
+        return default_value
+    try:
+        if not is_int(value):
+            raise TypeError(f"{name} must be int")
+        if value < minimum:
+            raise ValueError(f"{name} must greater than {minimum}")
+    except Exception as e:
+        value = default_value
+        logger.warning(f"Validate {name} failed, {e}, replaced with default value {value}.")
+    return value

msprobe/pytorch/online_dispatch/compare.py CHANGED Viewed

@@ -125,8 +125,6 @@ class Saver:
     def write_summary_csv(self, test_result):
         test_rows = []
-        if self.stack_info:
-            test_rows[0].append(self.COLUMN_STACK_INFO)
         check_op_str_pattern_valid(test_result.api_name)
         df_row = [test_result.api_name, test_result.is_fwd_success, test_result.is_bwd_success]

msprobe/pytorch/online_dispatch/dispatch.py CHANGED Viewed

@@ -16,6 +16,7 @@
 import json
 import os
 import time
+import multiprocessing
 from multiprocessing import Pool
 import torch
@@ -52,6 +53,7 @@ class PtdbgDispatch(TorchDispatchMode):
             return
         if dump_path is None:
             logger.error("Please set dump_path when dump_mode is config!")
+            raise DispatchException("Please set dump_path when dump_mode is config!")
         check_file_or_directory_path(dump_path, True)
         self.device_id = torch_npu._C._npu_getDevice()
@@ -85,6 +87,11 @@ class PtdbgDispatch(TorchDispatchMode):
         self.get_ops(yaml_path)
         self.lock = None
+        max_process_num = max(int((multiprocessing.cpu_count() + 1) // Const.CPU_QUARTER), 1)
+        if process_num > max_process_num:
+            logger.error(f"process_num should be less than or equal to {max_process_num}, but got {process_num}!")
+            raise DispatchException(f'process_num should be less than or equal to {max_process_num}, '
+                                    f'but got {process_num}!')
         if process_num > 0:
             self.pool = Pool(process_num)
         if debug:
@@ -115,6 +122,8 @@ class PtdbgDispatch(TorchDispatchMode):
                 if len(json_line_data) == 0:
                     break
                 msg = json.loads(json_line_data)
+                if len(msg) < 2:
+                    raise ValueError("JSON data does not contain enough elements. Expected at least 2 elements.")
                 self.all_summary[msg[0]] = msg[1]
             fp_handle.close()
@@ -199,8 +208,10 @@ class PtdbgDispatch(TorchDispatchMode):
             dispatch_workflow(run_param, data_info)
         else:
             self.lock.acquire()
-            self.all_summary.append([])
-            self.lock.release()
+            try:
+                self.all_summary.append([])
+            finally:
+                self.lock.release()
             run_param.process_flag = True
             if self.check_fun(func, run_param):
                 data_info = DisPatchDataInfo(cpu_args, cpu_kwargs, self.all_summary, None, npu_out_cpu, cpu_out,

msprobe/pytorch/online_dispatch/dump_compare.py CHANGED Viewed

@@ -19,6 +19,8 @@ import os
 from datetime import datetime, timezone
 import torch
+from msprobe.core.common.const import Const
+from msprobe.core.common.decorator import recursion_depth_decorator
 from msprobe.core.common.file_utils import FileOpen, save_npy, save_json
 from msprobe.pytorch.common.log import logger
@@ -91,6 +93,7 @@ def support_basic_type(data):
     return False
+@recursion_depth_decorator("dump_data")
 def dump_data(data, prefix, dump_path):
     if isinstance(data, (tuple, list)) and data:
         for i, item in enumerate(data):
@@ -107,8 +110,11 @@ def dump_data(data, prefix, dump_path):
 def save_temp_summary(api_index, single_api_summary, path, lock):
     summary_path = os.path.join(path, f'summary.json')
     lock.acquire()
-    data = [api_index, single_api_summary]
-    save_json(summary_path, data, mode='a')
+    try:
+        data = [api_index, single_api_summary]
+        save_json(summary_path, data, mode='a')
+    finally:
+        lock.release()
 def dispatch_workflow(run_param: DispatchRunParam, data_info: DisPatchDataInfo):

msprobe/pytorch/online_dispatch/utils.py CHANGED Viewed

@@ -27,8 +27,10 @@ else:
     pta_cpu_device = torch.device("cpu")
 from msprobe.core.common.const import CompareConst
+from msprobe.core.common.decorator import recursion_depth_decorator
 from msprobe.pytorch.common.log import logger
 cpu_device = torch._C.device("cpu")
 COLOR_RED = '\033[31m'
 COLOR_GREEN = '\033[32m'
@@ -85,6 +87,7 @@ def get_callstack():
     return callstack
+@recursion_depth_decorator("data_to_cpu")
 def data_to_cpu(data, deep, data_cpu):
     global cpu_device
     list_cpu = []

msprobe/pytorch/parse_tool/lib/interactive_cli.py CHANGED Viewed

@@ -45,12 +45,7 @@ class InteractiveCli(cmd.Cmd):
     @catch_exception
     def default(self, line=""):
-        self.util.execute_command(line)
-        return False
-    @catch_exception
-    def do_run(self, line=""):
-        self.util.execute_command(line)
+        self.stdout.write("Command invalid, Only support command start with cad/vc/dc/pk/cn/pt\n")
     @catch_exception
     def do_vc(self, line=""):

msprobe/pytorch/parse_tool/lib/utils.py CHANGED Viewed

@@ -13,12 +13,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import hashlib
 import os
 import re
 import subprocess
 import sys
 import time
+import zlib
 from collections import namedtuple
 import numpy as np
@@ -114,11 +114,12 @@ class Util:
     @staticmethod
     def get_md5_for_numpy(obj):
         np_bytes = obj.tobytes()
-        md5_hash = hashlib.md5(np_bytes)
-        return md5_hash.hexdigest()
+        md5_crc = zlib.crc32(np_bytes)
+        return f"{md5_crc:08x}"
     @staticmethod
     def deal_with_dir_or_file_inconsistency(output_path):
+        logger.warning(f"Trying to delete {output_path}")
         remove_path(output_path)
         raise ParseException("Inconsistent directory structure or file.")
@@ -264,7 +265,7 @@ class Util:
             match = re_pattern.match(name)
             if not match:
                 continue
-            if extern_pattern != '' and re_pattern.match(extern_pattern) and not re.match(extern_pattern, name):
+            if extern_pattern != '' and re_pattern.match(extern_pattern) and not name.startswith(extern_pattern):
                 continue
             file_list[name] = gen_info_func(name, match, file["root"])
         return file_list

msprobe/pytorch/pt_config.py CHANGED Viewed

@@ -16,9 +16,9 @@
 import os
 import re
-from msprobe.core.common.const import Const
+from msprobe.core.common.const import Const, FileCheckConst
 from msprobe.core.common.exceptions import MsprobeException
-from msprobe.core.common.file_utils import FileOpen, load_json, check_file_or_directory_path, check_crt_valid
+from msprobe.core.common.file_utils import FileOpen, load_json, check_file_or_directory_path, FileChecker
 from msprobe.core.common.log import logger
 from msprobe.core.common.utils import is_int
 from msprobe.core.common_config import BaseConfig, CommonConfig
@@ -42,6 +42,7 @@ class TensorConfig(BaseConfig):
         self.tls_path = json_config.get("tls_path", "./")
         self.online_run_ut_recompute = json_config.get("online_run_ut_recompute", False)
         self.check_config()
+        self._check_summary_mode()
         self._check_file_format()
         if self.online_run_ut:
             self._check_online_run_ut()
@@ -65,7 +66,10 @@ class TensorConfig(BaseConfig):
             check_file_or_directory_path(self.tls_path, isdir=True)
             check_file_or_directory_path(os.path.join(self.tls_path, "client.key"))
             check_file_or_directory_path(os.path.join(self.tls_path, "client.crt"))
-            check_crt_valid(os.path.join(self.tls_path, "client.crt"))
+            check_file_or_directory_path(os.path.join(self.tls_path, "ca.crt"))
+            crl_path = os.path.join(self.tls_path, "crl.pem")
+            if os.path.exists(crl_path):
+                check_file_or_directory_path(crl_path)
         if not isinstance(self.host, str) or not re.match(Const.ipv4_pattern, self.host):
             raise Exception(f"host: {self.host} is invalid.")
@@ -80,9 +84,8 @@ class StatisticsConfig(BaseConfig):
         self.check_config()
         self._check_summary_mode()
-    def _check_summary_mode(self):
-        if self.summary_mode and self.summary_mode not in ["statistics", "md5"]:
-            raise Exception("summary_mode is invalid")
+        self.tensor_list = json_config.get("tensor_list", [])
+        self._check_str_list_config(self.tensor_list, "tensor_list")
 class OverflowCheckConfig(BaseConfig):
@@ -95,6 +98,8 @@ class OverflowCheckConfig(BaseConfig):
     def check_overflow_config(self):
         if self.overflow_nums is not None and not is_int(self.overflow_nums):
             raise Exception("overflow_num is invalid")
+        if self.overflow_nums is not None and self.overflow_nums != -1 and self.overflow_nums <= 0:
+            raise Exception("overflow_nums should be -1 or positive integer")
         if self.check_mode is not None and self.check_mode not in ["all", "aicore", "atomic"]:
             raise Exception("check_mode is invalid")
@@ -148,7 +153,7 @@ class FreeBenchmarkCheckConfig(BaseConfig):
                 self.pert_mode in PytorchFreeBenchmarkConst.CPU_MODE_LIST
         ):
             msg = (
-                f"You neet to and can only set fuzz_device as {DeviceType.CPU} "
+                f"You need to and can only set fuzz_device as {DeviceType.CPU} "
                 f"when pert_mode in {PytorchFreeBenchmarkConst.CPU_MODE_LIST}"
             )
             logger.error_log_with_exp(
@@ -271,13 +276,13 @@ class RunUTConfig(BaseConfig):
     @classmethod
     def check_nfs_path_config(cls, nfs_path):
-        if nfs_path and not os.path.exists(nfs_path):
-            raise Exception("nfs_path: %s does not exist" % nfs_path)
+        if nfs_path:
+            FileChecker(nfs_path, FileCheckConst.DIR, FileCheckConst.READ_ABLE).common_check()
     @classmethod
     def check_tls_path_config(cls, tls_path):
-        if tls_path and not os.path.exists(tls_path):
-            raise Exception("tls_path: %s does not exist" % tls_path)
+        if tls_path:
+            FileChecker(tls_path, FileCheckConst.DIR, FileCheckConst.READ_ABLE).common_check()
     def check_run_ut_config(self):
         RunUTConfig.check_filter_list_config(Const.WHITE_LIST, self.white_list)

msprobe/pytorch/pytorch_service.py ADDED Viewed

@@ -0,0 +1,70 @@
+# Copyright (c) 2025, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from msprobe.core.common.utils import Const
+from msprobe.core.service import BaseService
+from msprobe.pytorch.attl_manager import ATTLManager
+from msprobe.pytorch.common.log import logger
+from msprobe.pytorch.common.utils import get_rank_if_initialized, torch_version_above_or_equal_2
+from msprobe.pytorch.dump.module_dump.module_processer import ModuleProcesser
+from msprobe.pytorch.hook_module.api_register import get_api_register, ApiTemplate
+from msprobe.pytorch.hook_module.hook_module import HOOKModule
+from msprobe.pytorch.hook_module.jit_script_wrapper import wrap_jit_script_func
+from msprobe.pytorch.hook_module.pt_hook_manager import PytorchHookManager
+from msprobe.pytorch.hook_module.register_optimizer_hook import register_optimizer_hook
+if torch_version_above_or_equal_2:
+    from msprobe.pytorch.api_accuracy_checker.tensor_transport_layer.dump_dispatch import run_ut_dispatch
+class PytorchService(BaseService):
+    """
+    PyTorch implementation of BaseService.
+
+    Supplies the PyTorch-specific pieces (framework type, rank lookup, hook managers and
+    registration steps) through the underscore-prefixed methods below.
+    NOTE(review): these are presumably hook points invoked by BaseService; confirm against
+    msprobe.core.service.
+    """
+    @property
+    def _get_framework_type(self):
+        """Return the framework identifier for this service, Const.PT_FRAMEWORK."""
+        return Const.PT_FRAMEWORK
+    @staticmethod
+    def _get_current_rank():
+        """Return whatever get_rank_if_initialized() reports for the current process."""
+        return get_rank_if_initialized()
+    def _init_specific_components(self):
+        """
+        Create the PyTorch-specific collaborators and store them on the instance.
+
+        Reads self.data_collector and self.config, which are not set in this class
+        (presumably prepared by BaseService before this method runs; TODO confirm).
+        """
+        self.logger = logger
+        self.api_register = get_api_register()
+        self.module_processor = ModuleProcesser(self.data_collector.scope)
+        # attl_manager must exist before hook_manager, which receives it as an argument.
+        self.attl_manager = ATTLManager(self.config)
+        self.hook_manager = PytorchHookManager(self.data_collector, self.config, self.attl_manager)
+        # The ApiTemplate class itself is stored, not an instance.
+        self.api_template = ApiTemplate
+    def _register_hook(self):
+        """Initialise ATTL and, when self._is_mix_level is truthy, register the optimizer hook."""
+        self.attl_manager.attl_init()
+        if self._is_mix_level:
+            register_optimizer_hook(self.data_collector)
+    def _register_api_hook(self):
+        """Run the base-class API hook registration, then call wrap_jit_script_func()."""
+        super()._register_api_hook()
+        wrap_jit_script_func()
+    def _register_module_hook(self):
+        """Enable module dump, register module hooks on self.model and log the result."""
+        # Class-level flag: set on ModuleProcesser itself, not on self.module_processor.
+        ModuleProcesser.enable_module_dump = True
+        self.module_processor.register_module_hook(self.model, self.build_hook)
+        self.logger.info(f"The module {self.config.task} hook function is successfully mounted to the model.")
+    def _run_ut_dispatch(self, status):
+        """
+        Forward `status` to run_ut_dispatch together with the ATTL handle and the
+        online_run_ut_recompute setting.
+
+        Does nothing when torch_version_above_or_equal_2 is falsy; run_ut_dispatch is only
+        imported at module level under that same condition.
+        """
+        if torch_version_above_or_equal_2:
+            run_ut_dispatch(self.attl_manager.attl, status, self.config.online_run_ut_recompute)
+    def _reset_status(self):
+        """Run the base-class reset, then reset module stats on ModuleProcesser and HOOKModule."""
+        super()._reset_status()
+        ModuleProcesser.reset_module_stats()
+        HOOKModule.reset_module_stats()

msprobe/visualization/builder/graph_builder.py CHANGED Viewed

@@ -14,21 +14,23 @@
 # limitations under the License.
 import re
+from dataclasses import dataclass
 from msprobe.core.common.const import Const
-from msprobe.core.common.file_utils import load_json
+from msprobe.core.common.file_utils import load_json, save_json
+from msprobe.core.common.utils import load_stack_json
 from msprobe.visualization.builder.msprobe_adapter import get_input_output
 from msprobe.visualization.builder.msprobe_adapter import op_patterns
 from msprobe.visualization.graph.graph import Graph
 from msprobe.visualization.graph.node_op import NodeOp
-from msprobe.visualization.utils import save_json_file, GraphConst
+from msprobe.visualization.utils import GraphConst
 class GraphBuilder:
     backward_pattern = re.compile(r"(\.backward\.)(\d+)$")
     forward_pattern = re.compile(r"(\.forward\.)(\d+)$")
-    # Matches text that starts with an uppercase letter, followed by any letters, and ends with Template(
-    template_pattern = re.compile(r'\b[A-Z][a-zA-Z]*Template\(')
+    # Matches text that starts with an uppercase letter, followed by any letters, and ends with Template(, or text containing api_template(
+    template_pattern = re.compile(r'\b([A-Z][a-zA-Z]*Template|api_template)\(')
     @staticmethod
     def build(construct_path, data_path, stack_path, model_name='DefaultModel', complete_stack=False):
@@ -44,13 +46,14 @@ class GraphBuilder:
         """
         construct_dict = load_json(construct_path)
         dump_dict = load_json(data_path)
-        stack_dict = load_json(stack_path)
+        stack_dict = load_stack_json(stack_path)
         if not complete_stack:
             GraphBuilder._simplify_stack(stack_dict)
         data_dict = dump_dict.get(GraphConst.DATA_KEY, {})
         graph = Graph(model_name, data_path=dump_dict.get('dump_data_dir', ''), dump_data=data_dict)
         GraphBuilder._init_nodes(graph, construct_dict, data_dict, stack_dict)
         GraphBuilder._collect_apis_between_modules(graph)
+        GraphBuilder._add_parameters_grad(graph, data_dict)
         return graph
     @staticmethod
@@ -60,10 +63,10 @@ class GraphBuilder:
         """
         result = {}
         if config.graph_b:
-            result[GraphConst.JSON_NPU_KEY] = config.graph_n.to_dict()
-            result[GraphConst.JSON_BENCH_KEY] = config.graph_b.to_dict()
+            result[GraphConst.JSON_NPU_KEY] = config.graph_n.to_dict(config.compare_mode)
+            result[GraphConst.JSON_BENCH_KEY] = config.graph_b.to_dict(config.compare_mode)
         else:
-            result = config.graph_n.to_dict()
+            result = config.graph_n.to_dict(config.compare_mode)
         if config.tool_tip:
             result[GraphConst.JSON_TIP_KEY] = config.tool_tip
         if config.node_colors:
@@ -73,7 +76,7 @@ class GraphBuilder:
         if config.task:
             result[GraphConst.JSON_TASK_KEY] = config.task
         result[GraphConst.OVERFLOW_CHECK] = config.overflow_check
-        save_json_file(filename, result)
+        save_json(filename, result, indent=4)
     @staticmethod
     def _simplify_stack(stack_dict):
@@ -186,6 +189,8 @@ class GraphBuilder:
         # Data format: "output": [[{param1}, {param2}, ...]]
         if GraphBuilder._is_valid_batch_p2p_output(param_list):
             for param in param_list[0]:
+                if not isinstance(param, dict):
+                    continue
                 info = {GraphConst.OP: param.get(GraphConst.OP), GraphConst.PEER: param.get(GraphConst.PEER),
                         GraphConst.GROUP_ID: param.get(GraphConst.GROUP_ID)}
                 node.batch_p2p_info.append(info)
@@ -235,10 +240,46 @@ class GraphBuilder:
         graph.root.subnodes = output
+    @staticmethod
+    def _add_parameters_grad(graph, data_dict):
+        """
+        将parameters_grad信息添加到graph中，
+        对应模块的parameters_grad节点添加到对应模块的最后一次backward节点（backward计数最大）内作为子节点
+        例如，graph有节点Module.a.backward.0, Module.a.backward.1, Module.a.backward.2
+        则Module.a.parameters_grad添加在Module.a.backward.2内作为子节点
+        """
+        prefixes = []
+        suffix = Const.SEP + Const.PARAMS_GRAD
+        for node_id in data_dict.keys():
+            if node_id not in graph.node_map and node_id.endswith(suffix):
+                prefixes.append(node_id.replace(suffix, ''))
+        max_info = {prefix: 0 for prefix in prefixes}
+        for key in graph.node_map.keys():
+            parts = key.split(Const.SEP)
+            if len(parts) > 2 and parts[-2] == Const.BACKWARD:
+                num = int(parts[-1])
+                prefix = Const.SEP.join(parts[:-2])
+                if prefix in max_info and num > max_info[prefix]:
+                    max_info[prefix] = num
+        for prefix, num in max_info.items():
+            node_id = prefix + Const.SEP + Const.BACKWARD + Const.SEP + str(num)
+            node = graph.get_node(node_id)
+            if node:
+                parameters_grad_node_id = graph.add_node(NodeOp.module, prefix + suffix, up_node=node)
+                # 添加输入输出数据
+                node_data = data_dict.get(parameters_grad_node_id, {})
+                input_data, output_data = get_input_output(node_data, parameters_grad_node_id)
+                # 更新数据
+                graph.get_node(parameters_grad_node_id).set_input_output(input_data, output_data)
 class GraphExportConfig:
     def __init__(self, graph_n, graph_b=None, tool_tip=None, node_colors=None, micro_steps=None, task='',
-                 overflow_check=False):
+                 overflow_check=False, compare_mode=None):
         self.graph_n = graph_n
         self.graph_b = graph_b
         self.tool_tip = tool_tip
@@ -246,3 +287,21 @@ class GraphExportConfig:
         self.micro_steps = micro_steps
         self.task = task
         self.overflow_check = overflow_check
+        self.compare_mode = compare_mode
+@dataclass
+class GraphInfo:
+    """Bundle a Graph with three file paths named like the path arguments of GraphBuilder.build."""
+    # Graph object this record refers to.
+    graph: Graph
+    # NOTE(review): presumably the construct_path given to GraphBuilder.build; confirm against callers.
+    construct_path: str
+    # NOTE(review): presumably the data_path (dump JSON) given to GraphBuilder.build; confirm.
+    data_path: str
+    # NOTE(review): presumably the stack_path given to GraphBuilder.build; confirm.
+    stack_path: str
+@dataclass
+class BuildGraphTaskInfo:
+    """
+    Record grouping two GraphInfo objects with two rank labels and a time string.
+
+    NOTE(review): presumably describes one graph-build/compare task; confirm against callers.
+    """
+    # GraphInfo for the "n" side. Presumably NPU, mirroring graph_n -> JSON_NPU_KEY in
+    # GraphBuilder; confirm.
+    graph_info_n: GraphInfo
+    # GraphInfo for the "b" side. Presumably benchmark, mirroring graph_b -> JSON_BENCH_KEY in
+    # GraphBuilder; confirm.
+    graph_info_b: GraphInfo
+    # Rank label associated with the NPU side, stored as a string.
+    npu_rank: str
+    # Rank label associated with the bench side, stored as a string.
+    bench_rank: str
+    # Time string; its format is not visible here. TODO confirm with the producer.
+    time_str: str

mindstudio-probe 1.2.2__py3-none-any.whl → 8.1.0__py3-none-any.whl

mindstudio-probe 1.2.2py3-none-any.whl → 8.1.0py3-none-any.whl