PyPI - mindstudio-probe - Versions diffs - 1.2.2__py3-none-any.whl → 1.3.0__py3-none-any.whl - Mend

mindstudio-probe 1.2.2__py3-none-any.whl → 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (153)

{mindstudio_probe-1.2.2.dist-info → mindstudio_probe-1.3.0.dist-info}/METADATA +3 -3
{mindstudio_probe-1.2.2.dist-info → mindstudio_probe-1.3.0.dist-info}/RECORD +143 -144
msprobe/README.md +25 -20
msprobe/core/common/const.py +110 -66
msprobe/core/common/decorator.py +50 -0
msprobe/core/common/exceptions.py +3 -1
msprobe/core/common/file_utils.py +25 -2
msprobe/core/common/utils.py +30 -34
msprobe/core/compare/acc_compare.py +43 -74
msprobe/core/compare/check.py +2 -6
msprobe/core/compare/highlight.py +2 -0
msprobe/core/compare/layer_mapping/layer_mapping.py +2 -1
msprobe/core/compare/merge_result/merge_result.py +8 -2
msprobe/core/compare/multiprocessing_compute.py +19 -12
msprobe/core/compare/npy_compare.py +30 -12
msprobe/core/compare/utils.py +20 -10
msprobe/core/data_dump/api_registry.py +176 -0
msprobe/core/data_dump/data_processor/base.py +2 -2
msprobe/core/data_dump/data_processor/mindspore_processor.py +19 -32
msprobe/core/data_dump/data_processor/pytorch_processor.py +45 -15
msprobe/core/data_dump/json_writer.py +38 -35
msprobe/core/grad_probe/constant.py +1 -0
msprobe/core/grad_probe/grad_compare.py +1 -1
msprobe/core/overflow_check/abnormal_scene.py +2 -0
msprobe/docs/01.installation.md +2 -1
msprobe/docs/02.config_introduction.md +17 -15
msprobe/docs/05.data_dump_PyTorch.md +70 -2
msprobe/docs/06.data_dump_MindSpore.md +33 -12
msprobe/docs/07.accuracy_checker_PyTorch.md +11 -1
msprobe/docs/08.accuracy_checker_online_PyTorch.md +3 -1
msprobe/docs/09.accuracy_checker_MindSpore.md +1 -1
msprobe/docs/10.accuracy_compare_PyTorch.md +59 -33
msprobe/docs/11.accuracy_compare_MindSpore.md +40 -16
msprobe/docs/12.overflow_check_PyTorch.md +3 -1
msprobe/docs/13.overflow_check_MindSpore.md +4 -2
msprobe/docs/14.data_parse_PyTorch.md +1 -7
msprobe/docs/18.online_dispatch.md +1 -1
msprobe/docs/19.monitor.md +124 -62
msprobe/docs/21.visualization_PyTorch.md +32 -13
msprobe/docs/22.visualization_MindSpore.md +32 -13
msprobe/docs/23.generate_operator_PyTorch.md +9 -9
msprobe/docs/27.dump_json_instruction.md +278 -8
msprobe/docs/28.kernel_dump_MindSpore.md +1 -1
msprobe/docs/29.data_dump_MSAdapter.md +229 -0
msprobe/docs/30.overflow_check_MSAdapter.md +31 -0
msprobe/docs/FAQ.md +3 -11
msprobe/docs/img/compare_result.png +0 -0
msprobe/docs/img/merge_result.png +0 -0
msprobe/docs/img/visualization/vis_browser_1.png +0 -0
msprobe/docs/img/visualization/vis_match_info.png +0 -0
msprobe/docs/img/visualization/vis_precision_info.png +0 -0
msprobe/docs/img/visualization/vis_search_info.png +0 -0
msprobe/docs/img/visualization/vis_show_info.png +0 -0
msprobe/docs/img/visualization/vis_showcase.png +0 -0
msprobe/docs/img/visualization/vis_unmatch_info.png +0 -0
msprobe/mindspore/__init__.py +4 -3
msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +6 -1
msprobe/mindspore/api_accuracy_checker/api_runner.py +19 -9
msprobe/mindspore/api_accuracy_checker/base_compare_algorithm.py +2 -1
msprobe/mindspore/api_accuracy_checker/bench_functions/flash_attention_score.py +602 -0
msprobe/mindspore/api_accuracy_checker/bench_functions/fusion_operator.py +41 -0
msprobe/mindspore/api_accuracy_checker/data_manager.py +2 -1
msprobe/mindspore/api_accuracy_checker/multi_api_accuracy_checker.py +2 -1
msprobe/mindspore/api_accuracy_checker/torch_mindtorch_importer.py +2 -1
msprobe/mindspore/common/const.py +61 -0
msprobe/mindspore/common/utils.py +31 -19
msprobe/mindspore/compare/ms_compare.py +27 -19
msprobe/mindspore/compare/ms_graph_compare.py +6 -5
msprobe/mindspore/debugger/debugger_config.py +6 -4
msprobe/mindspore/debugger/precision_debugger.py +22 -10
msprobe/mindspore/dump/dump_tool_factory.py +5 -3
msprobe/mindspore/dump/hook_cell/api_register.py +142 -0
msprobe/mindspore/dump/hook_cell/hook_cell.py +9 -10
msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml +24 -26
msprobe/mindspore/dump/jit_dump.py +14 -9
msprobe/mindspore/dym_loader/hook_dynamic_loader.cc +22 -56
msprobe/mindspore/dym_loader/hook_dynamic_loader.h +0 -1
msprobe/mindspore/free_benchmark/api_pynative_self_check.py +10 -6
msprobe/mindspore/free_benchmark/perturbation/perturbation_factory.py +4 -2
msprobe/mindspore/free_benchmark/self_check_tool_factory.py +6 -3
msprobe/mindspore/grad_probe/global_context.py +2 -0
msprobe/mindspore/grad_probe/grad_analyzer.py +2 -1
msprobe/mindspore/grad_probe/hook.py +2 -4
msprobe/mindspore/monitor/distributed/wrap_distributed.py +1 -1
msprobe/mindspore/monitor/module_hook.py +354 -302
msprobe/mindspore/monitor/utils.py +46 -4
msprobe/mindspore/overflow_check/overflow_check_tool_factory.py +5 -3
msprobe/mindspore/service.py +23 -17
msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +3 -6
msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +11 -6
msprobe/pytorch/api_accuracy_checker/generate_op_script/operator_replication.template +2 -2
msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +4 -5
msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +5 -5
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +25 -6
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py +28 -19
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py +3 -1
msprobe/pytorch/bench_functions/moe_gating_top_k_softmax.py +6 -0
msprobe/pytorch/common/utils.py +29 -7
msprobe/pytorch/debugger/precision_debugger.py +10 -1
msprobe/pytorch/dump/module_dump/module_dump.py +4 -3
msprobe/pytorch/dump/module_dump/module_processer.py +12 -6
msprobe/pytorch/free_benchmark/common/utils.py +1 -1
msprobe/pytorch/free_benchmark/compare/single_benchmark.py +1 -1
msprobe/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py +3 -3
msprobe/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py +3 -3
msprobe/pytorch/free_benchmark/perturbed_layers/npu/change_value.py +1 -1
msprobe/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py +1 -1
msprobe/pytorch/free_benchmark/result_handlers/check_handler.py +1 -1
msprobe/pytorch/function_factory.py +1 -1
msprobe/pytorch/grad_probe/grad_monitor.py +2 -2
msprobe/pytorch/hook_module/api_register.py +131 -0
msprobe/pytorch/hook_module/hook_module.py +19 -14
msprobe/pytorch/hook_module/register_optimizer_hook.py +2 -1
msprobe/pytorch/hook_module/support_wrap_ops.yaml +172 -75
msprobe/pytorch/monitor/csv2tb.py +8 -2
msprobe/pytorch/monitor/distributed/wrap_distributed.py +8 -2
msprobe/pytorch/monitor/module_hook.py +131 -105
msprobe/pytorch/monitor/module_metric.py +3 -0
msprobe/pytorch/monitor/optimizer_collect.py +55 -4
msprobe/pytorch/monitor/unittest/test_monitor.py +1 -1
msprobe/pytorch/monitor/utils.py +68 -1
msprobe/pytorch/online_dispatch/compare.py +0 -2
msprobe/pytorch/online_dispatch/dispatch.py +9 -0
msprobe/pytorch/online_dispatch/dump_compare.py +3 -0
msprobe/pytorch/online_dispatch/utils.py +3 -0
msprobe/pytorch/parse_tool/lib/interactive_cli.py +1 -6
msprobe/pytorch/parse_tool/lib/utils.py +2 -1
msprobe/pytorch/pt_config.py +11 -7
msprobe/pytorch/service.py +11 -8
msprobe/visualization/builder/graph_builder.py +44 -5
msprobe/visualization/builder/msprobe_adapter.py +0 -1
msprobe/visualization/compare/graph_comparator.py +42 -38
msprobe/visualization/compare/mode_adapter.py +0 -19
msprobe/visualization/graph/base_node.py +8 -1
msprobe/visualization/graph/distributed_analyzer.py +1 -10
msprobe/visualization/graph/graph.py +0 -11
msprobe/visualization/graph/node_op.py +1 -2
msprobe/visualization/graph_service.py +1 -1
msprobe/visualization/utils.py +2 -33
msprobe/mindspore/dump/hook_cell/api_registry.py +0 -207
msprobe/mindspore/dump/hook_cell/wrap_api.py +0 -212
msprobe/pytorch/hook_module/api_registry.py +0 -166
msprobe/pytorch/hook_module/wrap_distributed.py +0 -79
msprobe/pytorch/hook_module/wrap_functional.py +0 -66
msprobe/pytorch/hook_module/wrap_npu_custom.py +0 -85
msprobe/pytorch/hook_module/wrap_tensor.py +0 -69
msprobe/pytorch/hook_module/wrap_torch.py +0 -84
msprobe/pytorch/hook_module/wrap_vf.py +0 -60
msprobe/pytorch/parse.py +0 -19
{mindstudio_probe-1.2.2.dist-info → mindstudio_probe-1.3.0.dist-info}/LICENSE +0 -0
{mindstudio_probe-1.2.2.dist-info → mindstudio_probe-1.3.0.dist-info}/WHEEL +0 -0
{mindstudio_probe-1.2.2.dist-info → mindstudio_probe-1.3.0.dist-info}/entry_points.txt +0 -0
{mindstudio_probe-1.2.2.dist-info → mindstudio_probe-1.3.0.dist-info}/top_level.txt +0 -0

msprobe/pytorch/monitor/utils.py CHANGED Viewed

@@ -25,7 +25,7 @@ import torch
 from msprobe.core.common.const import MonitorConst, Const
 from msprobe.pytorch.common.log import logger
 from msprobe.core.common.utils import is_int
-from msprobe.core.common.file_utils import check_file_or_directory_path
+from msprobe.core.common.file_utils import check_file_or_directory_path, recursive_chmod
 device = "cpu"
@@ -105,6 +105,15 @@ def validate_ops(ops):
     return valid_ops
+def validate_ndigits(ndigits):
+    if not ndigits:
+        return
+    if not is_int(ndigits) or ndigits <= 0:
+        raise ValueError(f"ndigits({ndigits}) is not a positive integer, current is: {ndigits}.")
+    if ndigits > MonitorConst.MAX_NDIGITS:
+        raise ValueError(f"The maximum supported ndigits is {MonitorConst.MAX_NDIGITS}, current value: {ndigits}.")
 def validate_ranks(ranks):
     if not isinstance(ranks, list):
         raise TypeError("module_ranks should be a list")
@@ -206,9 +215,17 @@ def validate_step_count_per_record(step_count_per_record):
         raise ValueError("step_count_per_record must smaller than 1e6")
+def validate_dynamic_on(dynamic_on):
+    if not isinstance(dynamic_on, bool):
+        raise TypeError('dynamic_on should be a bool')
 def validate_config(config):
     config['ops'] = validate_ops(config.get('ops', []))
+    ndigits = config.get('ndigits')
+    validate_ndigits(ndigits)
     eps = config.get('eps', 1e-8)
     if not isinstance(eps, float):
         raise TypeError("eps should be a float")
@@ -246,9 +263,20 @@ def validate_config(config):
     step_count_per_record = config.get('step_count_per_record', 1)
     validate_step_count_per_record(step_count_per_record)
+    config["start_step"] = validate_int_arg(config.get("start_step"), "start_step",
+                                            MonitorConst.DEFAULT_START_STEP, MonitorConst.DEFAULT_START_STEP)
+    config["collect_times"] = validate_int_arg(config.get("collect_times"), "collect_times",
+                                               MonitorConst.DEFAULT_MIN_COLLECT_TIMES,
+                                               MonitorConst.DEFAULT_MAX_COLLECT_TIMES)
+    config["step_interval"] = validate_int_arg(config.get("step_interval"), "step_interval",
+                                               MonitorConst.DEFAULT_STEP_INTERVAL, MonitorConst.DEFAULT_STEP_INTERVAL)
     squash_name = config.get('squash_name', True)
     validate_squash_name(squash_name)
+    dynamic_on = config.get('dynamic_on', False)
+    validate_dynamic_on(dynamic_on)
     if not targets:
         if xy_distribution:
             config["all_xy"] = True
@@ -257,6 +285,8 @@ def validate_config(config):
 def time_str2time_digit(time_str):
     time_format = '%b%d_%H-%M-%S'
+    if not isinstance(time_str, str):
+        raise TypeError(f"time_str:{time_str} should be a str")
     try:
         time_digit = datetime.strptime(time_str, time_format)
     except Exception as e:
@@ -284,3 +314,40 @@ def get_target_output_dir(monitor_path, time_start, time_end):
         if start_ok and end_ok:
             result[rank] = os.path.join(monitor_path, dirname)
     return result
+def chmod_tensorboard_dir(path):
+    """
+        format配置为tensorboard时，需要补充文件权限设置
+    """
+    try:
+        recursive_chmod(path)
+    except Exception as e:
+        logger.warning(f"chmod tensorboard dir wrong because {e}, not updated, please check!!!")
+def validate_set_monitor(grad_acc_steps, start_iteration):
+    """
+    validate parameters of set_monitor.
+    """
+    grad_acc_steps = validate_int_arg(grad_acc_steps, "grad_acc_steps",
+                                      MonitorConst.DEFAULT_GRAD_ACC_STEPS, MonitorConst.DEFAULT_GRAD_ACC_STEPS)
+    start_iteration = validate_int_arg(start_iteration, "start_iteration",
+                                       MonitorConst.DEFAULT_START_ITERATION, MonitorConst.DEFAULT_START_ITERATION)
+    return grad_acc_steps, start_iteration
+def validate_int_arg(value, name, minimum, default_value):
+    """Validate int args, if any exception occurs, use the default value."""
+    if value is None:
+        return default_value
+    try:
+        if not is_int(value):
+            raise TypeError(f"{name} must be int")
+        if value < minimum:
+            raise ValueError(f"{name} must greater than {minimum}")
+    except Exception as e:
+        value = default_value
+        logger.warning(f"Validate {name} failed, {e}, replaced with default value {value}.")
+    return value

msprobe/pytorch/online_dispatch/compare.py CHANGED Viewed

@@ -125,8 +125,6 @@ class Saver:
     def write_summary_csv(self, test_result):
         test_rows = []
-        if self.stack_info:
-            test_rows[0].append(self.COLUMN_STACK_INFO)
         check_op_str_pattern_valid(test_result.api_name)
         df_row = [test_result.api_name, test_result.is_fwd_success, test_result.is_bwd_success]

msprobe/pytorch/online_dispatch/dispatch.py CHANGED Viewed

@@ -16,6 +16,7 @@
 import json
 import os
 import time
+import multiprocessing
 from multiprocessing import Pool
 import torch
@@ -52,6 +53,7 @@ class PtdbgDispatch(TorchDispatchMode):
             return
         if dump_path is None:
             logger.error("Please set dump_path when dump_mode is config!")
+            raise DispatchException("Please set dump_path when dump_mode is config!")
         check_file_or_directory_path(dump_path, True)
         self.device_id = torch_npu._C._npu_getDevice()
@@ -85,6 +87,11 @@ class PtdbgDispatch(TorchDispatchMode):
         self.get_ops(yaml_path)
         self.lock = None
+        max_process_num = max(int((multiprocessing.cpu_count() + 1) // Const.CPU_QUARTER), 1)
+        if process_num > max_process_num:
+            logger.error(f"process_num should be less than or equal to {max_process_num}, but got {process_num}!")
+            raise DispatchException(f'process_num should be less than or equal to {max_process_num}, '
+                                    f'but got {process_num}!')
         if process_num > 0:
             self.pool = Pool(process_num)
         if debug:
@@ -115,6 +122,8 @@ class PtdbgDispatch(TorchDispatchMode):
                 if len(json_line_data) == 0:
                     break
                 msg = json.loads(json_line_data)
+                if len(msg) < 2:
+                    raise ValueError("JSON data does not contain enough elements. Expected at least 2 elements.")
                 self.all_summary[msg[0]] = msg[1]
             fp_handle.close()

msprobe/pytorch/online_dispatch/dump_compare.py CHANGED Viewed

@@ -19,6 +19,8 @@ import os
 from datetime import datetime, timezone
 import torch
+from msprobe.core.common.const import Const
+from msprobe.core.common.decorator import recursion_depth_decorator
 from msprobe.core.common.file_utils import FileOpen, save_npy, save_json
 from msprobe.pytorch.common.log import logger
@@ -91,6 +93,7 @@ def support_basic_type(data):
     return False
+@recursion_depth_decorator("dump_data")
 def dump_data(data, prefix, dump_path):
     if isinstance(data, (tuple, list)) and data:
         for i, item in enumerate(data):

msprobe/pytorch/online_dispatch/utils.py CHANGED Viewed

@@ -27,8 +27,10 @@ else:
     pta_cpu_device = torch.device("cpu")
 from msprobe.core.common.const import CompareConst
+from msprobe.core.common.decorator import recursion_depth_decorator
 from msprobe.pytorch.common.log import logger
 cpu_device = torch._C.device("cpu")
 COLOR_RED = '\033[31m'
 COLOR_GREEN = '\033[32m'
@@ -85,6 +87,7 @@ def get_callstack():
     return callstack
+@recursion_depth_decorator("data_to_cpu")
 def data_to_cpu(data, deep, data_cpu):
     global cpu_device
     list_cpu = []

msprobe/pytorch/parse_tool/lib/interactive_cli.py CHANGED Viewed

@@ -45,12 +45,7 @@ class InteractiveCli(cmd.Cmd):
     @catch_exception
     def default(self, line=""):
-        self.util.execute_command(line)
-        return False
-    @catch_exception
-    def do_run(self, line=""):
-        self.util.execute_command(line)
+        self.stdout.write("Command invalid, Only support command start with cad/vc/dc/pk/cn/pt\n")
     @catch_exception
     def do_vc(self, line=""):

msprobe/pytorch/parse_tool/lib/utils.py CHANGED Viewed

@@ -119,6 +119,7 @@ class Util:
     @staticmethod
     def deal_with_dir_or_file_inconsistency(output_path):
+        logger.warning(f"Trying to delete {output_path}")
         remove_path(output_path)
         raise ParseException("Inconsistent directory structure or file.")
@@ -264,7 +265,7 @@ class Util:
             match = re_pattern.match(name)
             if not match:
                 continue
-            if extern_pattern != '' and re_pattern.match(extern_pattern) and not re.match(extern_pattern, name):
+            if extern_pattern != '' and re_pattern.match(extern_pattern) and not name.startswith(extern_pattern):
                 continue
             file_list[name] = gen_info_func(name, match, file["root"])
         return file_list

msprobe/pytorch/pt_config.py CHANGED Viewed

@@ -16,9 +16,10 @@
 import os
 import re
-from msprobe.core.common.const import Const
+from msprobe.core.common.const import Const, FileCheckConst
 from msprobe.core.common.exceptions import MsprobeException
-from msprobe.core.common.file_utils import FileOpen, load_json, check_file_or_directory_path, check_crt_valid
+from msprobe.core.common.file_utils import FileOpen, load_json, check_file_or_directory_path, check_crt_valid, \
+    FileChecker
 from msprobe.core.common.log import logger
 from msprobe.core.common.utils import is_int
 from msprobe.core.common_config import BaseConfig, CommonConfig
@@ -66,6 +67,7 @@ class TensorConfig(BaseConfig):
             check_file_or_directory_path(os.path.join(self.tls_path, "client.key"))
             check_file_or_directory_path(os.path.join(self.tls_path, "client.crt"))
             check_crt_valid(os.path.join(self.tls_path, "client.crt"))
+            check_crt_valid(os.path.join(self.tls_path, "client.key"), True)
         if not isinstance(self.host, str) or not re.match(Const.ipv4_pattern, self.host):
             raise Exception(f"host: {self.host} is invalid.")
@@ -95,6 +97,8 @@ class OverflowCheckConfig(BaseConfig):
     def check_overflow_config(self):
         if self.overflow_nums is not None and not is_int(self.overflow_nums):
             raise Exception("overflow_num is invalid")
+        if self.overflow_nums is not None and self.overflow_nums != -1 and self.overflow_nums <= 0:
+            raise Exception("overflow_nums should be -1 or positive integer")
         if self.check_mode is not None and self.check_mode not in ["all", "aicore", "atomic"]:
             raise Exception("check_mode is invalid")
@@ -148,7 +152,7 @@ class FreeBenchmarkCheckConfig(BaseConfig):
                 self.pert_mode in PytorchFreeBenchmarkConst.CPU_MODE_LIST
         ):
             msg = (
-                f"You neet to and can only set fuzz_device as {DeviceType.CPU} "
+                f"You need to and can only set fuzz_device as {DeviceType.CPU} "
                 f"when pert_mode in {PytorchFreeBenchmarkConst.CPU_MODE_LIST}"
             )
             logger.error_log_with_exp(
@@ -271,13 +275,13 @@ class RunUTConfig(BaseConfig):
     @classmethod
     def check_nfs_path_config(cls, nfs_path):
-        if nfs_path and not os.path.exists(nfs_path):
-            raise Exception("nfs_path: %s does not exist" % nfs_path)
+        if nfs_path:
+            FileChecker(nfs_path, FileCheckConst.DIR, FileCheckConst.READ_ABLE).common_check()
     @classmethod
     def check_tls_path_config(cls, tls_path):
-        if tls_path and not os.path.exists(tls_path):
-            raise Exception("tls_path: %s does not exist" % tls_path)
+        if tls_path:
+            FileChecker(tls_path, FileCheckConst.DIR, FileCheckConst.READ_ABLE).common_check()
     def check_run_ut_config(self):
         RunUTConfig.check_filter_list_config(Const.WHITE_LIST, self.white_list)

msprobe/pytorch/service.py CHANGED Viewed

@@ -30,7 +30,7 @@ from msprobe.pytorch.common.log import logger
 from msprobe.pytorch.common.utils import get_rank_if_initialized, is_recomputation
 from msprobe.pytorch.dump.kernel_dump.kernel_config import create_kernel_config_json
 from msprobe.pytorch.dump.module_dump.module_processer import ModuleProcesser
-from msprobe.pytorch.hook_module.api_registry import api_register
+from msprobe.pytorch.hook_module.api_register import get_api_register
 from msprobe.pytorch.hook_module.hook_module import HOOKModule
 from msprobe.pytorch.hook_module.register_optimizer_hook import register_optimizer_hook
@@ -50,6 +50,8 @@ class Service:
         self.switch = False
         self.inner_switch = False
         self.current_iter = 0
+        self.loop = 0
+        self.init_step = 0
         self.first_start = True
         self.current_rank = None
         self.dump_iter_dir = None
@@ -58,6 +60,7 @@ class Service:
         self.params_grad_info = {}
         self.hook_handle_dict = {}
         # 提前注册，确保注册尽可能多的API hook
+        self.api_register = get_api_register()
         self.register_api_hook()
         self.init_for_debug_level()
@@ -246,6 +249,8 @@ class Service:
         return HookFn(pre_forward_hook_fn, forward_hook_fn, backward_hook_fn, forward_hook_torch_version_below_2_fn)
     def start(self, model):
+        self.current_iter = self.loop + self.init_step
+        self.data_collector.update_iter(self.current_iter)
         if self.config.level == Const.LEVEL_DEBUG:
             return
         if self.need_stop_service():
@@ -304,8 +309,7 @@ class Service:
             if self.config.task == Const.TENSOR:
                 self.data_collector.data_processor.dump_async_data()
         self.data_collector.write_json()
-        self.current_iter += 1
-        self.data_collector.update_iter(self.current_iter)
+        self.loop += 1
         self.reset_status()
     def need_stop_service(self):
@@ -370,11 +374,10 @@ class Service:
     def register_api_hook(self):
         if self.config.level in [Const.LEVEL_MIX, Const.LEVEL_L1, Const.LEVEL_L2]:
             logger.info_on_rank_0(f"The api {self.config.task} hook function is successfully mounted to the model.")
-            api_register.initialize_hook(
-                functools.partial(self.build_hook, BaseScope.Module_Type_API),
-                self.config.online_run_ut
+            self.api_register.initialize_hook(
+                functools.partial(self.build_hook, BaseScope.Module_Type_API)
             )
-            api_register.api_modularity()
+            self.api_register.register_all_api()
     def register_module_hook(self):
         if self.config.level in [Const.LEVEL_L0, Const.LEVEL_MIX]:
@@ -409,7 +412,7 @@ class Service:
         if self.config.nfs_path:
             self.attl.upload("end")
         elif self.attl.socket_manager is not None:
-            logger.info(f"pid: {os.getpid()} finished, start send STOP signal.")
+            logger.info(f"pid: {os.getpid()} finished, start sends STOP signal.")
             self.attl.socket_manager.send_stop_signal()
     def reset_status(self):

msprobe/visualization/builder/graph_builder.py CHANGED Viewed

@@ -16,19 +16,19 @@
 import re
 from msprobe.core.common.const import Const
-from msprobe.core.common.file_utils import load_json
+from msprobe.core.common.file_utils import load_json, save_json
 from msprobe.visualization.builder.msprobe_adapter import get_input_output
 from msprobe.visualization.builder.msprobe_adapter import op_patterns
 from msprobe.visualization.graph.graph import Graph
 from msprobe.visualization.graph.node_op import NodeOp
-from msprobe.visualization.utils import save_json_file, GraphConst
+from msprobe.visualization.utils import GraphConst
 class GraphBuilder:
     backward_pattern = re.compile(r"(\.backward\.)(\d+)$")
     forward_pattern = re.compile(r"(\.forward\.)(\d+)$")
-    # 匹配以大写字母开头，后接任意字母，并以Template(结尾
-    template_pattern = re.compile(r'\b[A-Z][a-zA-Z]*Template\(')
+    # 匹配以大写字母开头，后接任意字母，并以Template(结尾，或包含api_template(的字符串
+    template_pattern = re.compile(r'\b([A-Z][a-zA-Z]*Template|api_template)\(')
     @staticmethod
     def build(construct_path, data_path, stack_path, model_name='DefaultModel', complete_stack=False):
@@ -51,6 +51,7 @@ class GraphBuilder:
         graph = Graph(model_name, data_path=dump_dict.get('dump_data_dir', ''), dump_data=data_dict)
         GraphBuilder._init_nodes(graph, construct_dict, data_dict, stack_dict)
         GraphBuilder._collect_apis_between_modules(graph)
+        GraphBuilder._add_parameters_grad(graph, data_dict)
         return graph
     @staticmethod
@@ -73,7 +74,7 @@ class GraphBuilder:
         if config.task:
             result[GraphConst.JSON_TASK_KEY] = config.task
         result[GraphConst.OVERFLOW_CHECK] = config.overflow_check
-        save_json_file(filename, result)
+        save_json(filename, result, indent=4)
     @staticmethod
     def _simplify_stack(stack_dict):
@@ -235,6 +236,44 @@ class GraphBuilder:
         graph.root.subnodes = output
+    @staticmethod
+    def _add_parameters_grad(graph, data_dict):
+        """
+        将parameters_grad信息添加到graph中，
+        对应模块的parameters_grad节点添加到对应模块的最后一次backward节点（backward计数最大）内作为子节点
+        例如，graph有节点Module.a.backward.0, Module.a.backward.1, Module.a.backward.2
+        则Module.a.parameters_grad添加在Module.a.backward.2内作为子节点
+        """
+        prefixes = []
+        suffix = Const.SEP + Const.PARAMS_GRAD
+        for node_id in data_dict.keys():
+            if node_id not in graph.node_map and node_id.endswith(suffix):
+                prefixes.append(node_id.replace(suffix, ''))
+        max_info = {prefix: 0 for prefix in prefixes}
+        for key in graph.node_map.keys():
+            for prefix in prefixes:
+                # 构建正则表达式，匹配以 "backward.数字" 结尾的键
+                pattern = re.compile(r'^' + re.escape(prefix) + r'\.backward\.(\d+)$')
+                match = pattern.match(key)
+                if match:
+                    num = int(match.group(1))
+                    if num > max_info[prefix]:
+                        max_info[prefix] = num
+        for prefix, num in max_info.items():
+            node_id = prefix + Const.SEP + Const.BACKWARD + Const.SEP + str(num)
+            node = graph.get_node(node_id)
+            if node:
+                parameters_grad_node_id = graph.add_node(NodeOp.module, prefix + suffix, up_node=node)
+                # 添加输入输出数据
+                node_data = data_dict.get(parameters_grad_node_id, {})
+                input_data, output_data = get_input_output(node_data, parameters_grad_node_id)
+                # 更新数据
+                graph.get_node(parameters_grad_node_id).set_input_output(input_data, output_data)
 class GraphExportConfig:
     def __init__(self, graph_n, graph_b=None, tool_tip=None, node_colors=None, micro_steps=None, task='',

msprobe/visualization/builder/msprobe_adapter.py CHANGED Viewed

@@ -13,7 +13,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import re
-import math
 from msprobe.core.compare.acc_compare import read_op, merge_tensor, get_accuracy
 from msprobe.core.common.utils import set_dump_path, get_dump_mode
 from msprobe.visualization.utils import GraphConst

msprobe/visualization/compare/graph_comparator.py CHANGED Viewed

@@ -17,12 +17,14 @@ import re
 from msprobe.visualization.builder.msprobe_adapter import compare_node, get_compare_mode, run_real_data
 from msprobe.visualization.utils import GraphConst, load_json_file, load_data_json_file, get_csv_df
 from msprobe.visualization.graph.graph import Graph, NodeOp
-from msprobe.visualization.graph.node_colors import NodeColors
 from msprobe.visualization.compare.mode_adapter import ModeAdapter
 from msprobe.core.common.const import Const
+from msprobe.core.common.decorator import recursion_depth_decorator
 class GraphComparator:
+    MAX_DEPTH = 1000
     def __init__(self, graphs, dump_path_param, args, mapping_dict=None):
         self.graph_n = graphs[0]
         self.graph_b = graphs[1]
@@ -41,7 +43,7 @@ class GraphComparator:
         else:
             self._compare_nodes(self.graph_n.root)
         self._postcompare()
     def add_compare_result_to_node(self, node, compare_result_list):
         """
         将比对结果添加到节点的输入输出数据中
@@ -66,43 +68,8 @@ class GraphComparator:
             self.ma.parse_result(node, [compare_in_dict, compare_out_dict]))
         node.data[GraphConst.JSON_INDEX_KEY] = precision_index
         node.data.update(other_dict)
-    def _parse_param(self, dump_path_param, output_path):
-        self.dump_path_param = dump_path_param
-        self.output_path = output_path
-        compare_mode = get_compare_mode(self.dump_path_param)
-        self.ma = ModeAdapter(compare_mode)
-        self.data_n_dict = load_data_json_file(dump_path_param.get('npu_json_path'))
-        self.data_b_dict = load_data_json_file(dump_path_param.get('bench_json_path'))
-        self.stack_json_data = load_json_file(dump_path_param.get('stack_json_path'))
-    def _postcompare(self):
-        self._handle_api_collection_index()
-        if not self.ma.compare_mode == GraphConst.REAL_DATA_COMPARE:
-            return
-        df = get_csv_df(True, self.ma.csv_data, self.ma.compare_mode)
-        df = run_real_data(self.dump_path_param, df, self.framework, True if self.mapping_dict else False)
-        compare_data_dict = {row[0]: row.tolist() for _, row in df.iterrows()}
-        for node in self.ma.compare_nodes:
-            precision_index, _ = self.ma.parse_result(node, [compare_data_dict])
-            node.data[GraphConst.JSON_INDEX_KEY] = precision_index
-    def _handle_api_collection_index(self):
-        """
-        api集合的指标, md5模式使用集合中所有api最小的指标，statistics和tensor模式使用集合中所有api最大的指标
-        md5模式下指标为0代表最差，statistics和tensor模式下指标为1代表最差
-        """
-        for node in self.graph_n.root.subnodes:
-            if node.op == NodeOp.api_collection:
-                precision_index = GraphConst.MAX_INDEX_KEY if self.ma.compare_mode == GraphConst.MD5_COMPARE \
-                    else GraphConst.MIN_INDEX_KEY
-                for api in node.subnodes:
-                    precision_index = min(precision_index,
-                                          api.data.get(GraphConst.JSON_INDEX_KEY, GraphConst.MAX_INDEX_KEY)) \
-                        if self.ma.compare_mode == GraphConst.MD5_COMPARE \
-                        else max(precision_index, api.data.get(GraphConst.JSON_INDEX_KEY, GraphConst.MIN_INDEX_KEY))
-                node.data[GraphConst.JSON_INDEX_KEY] = precision_index
+    @recursion_depth_decorator('GraphComparator._compare_nodes', max_depth=MAX_DEPTH)
     def _compare_nodes(self, node_n):
         """
         递归遍历NPU树中的节点，如果在Bench中找到具有相同名称的节点，检查他们的祖先和参数信息，检查一致则及逆行精度数据对比
@@ -126,6 +93,7 @@ class GraphComparator:
         for subnode in node_n.subnodes:
             self._compare_nodes(subnode)
+    @recursion_depth_decorator('GraphComparator._compare_nodes_fuzzy', max_depth=MAX_DEPTH)
     def _compare_nodes_fuzzy(self, node_n):
         if node_n.op != NodeOp.function_api:
             # 模块经过模糊匹配
@@ -146,6 +114,42 @@ class GraphComparator:
         for sub_node in node_n.subnodes:
             self._compare_nodes_fuzzy(sub_node)
+    def _parse_param(self, dump_path_param, output_path):
+        self.dump_path_param = dump_path_param
+        self.output_path = output_path
+        compare_mode = get_compare_mode(self.dump_path_param)
+        self.ma = ModeAdapter(compare_mode)
+        self.data_n_dict = load_data_json_file(dump_path_param.get('npu_json_path'))
+        self.data_b_dict = load_data_json_file(dump_path_param.get('bench_json_path'))
+        self.stack_json_data = load_json_file(dump_path_param.get('stack_json_path'))
+    def _postcompare(self):
+        self._handle_api_collection_index()
+        if not self.ma.compare_mode == GraphConst.REAL_DATA_COMPARE:
+            return
+        df = get_csv_df(True, self.ma.csv_data, self.ma.compare_mode)
+        df = run_real_data(self.dump_path_param, df, self.framework, True if self.mapping_dict else False)
+        compare_data_dict = {row[0]: row.tolist() for _, row in df.iterrows()}
+        for node in self.ma.compare_nodes:
+            precision_index, _ = self.ma.parse_result(node, [compare_data_dict])
+            node.data[GraphConst.JSON_INDEX_KEY] = precision_index
+    def _handle_api_collection_index(self):
+        """
+        api集合的指标, md5模式使用集合中所有api最小的指标，statistics和tensor模式使用集合中所有api最大的指标
+        md5模式下指标为0代表最差，statistics和tensor模式下指标为1代表最差
+        """
+        for node in self.graph_n.root.subnodes:
+            if node.op == NodeOp.api_collection:
+                precision_index = GraphConst.MAX_INDEX_KEY if self.ma.compare_mode == GraphConst.MD5_COMPARE \
+                    else GraphConst.MIN_INDEX_KEY
+                for api in node.subnodes:
+                    precision_index = min(precision_index,
+                                          api.data.get(GraphConst.JSON_INDEX_KEY, GraphConst.MAX_INDEX_KEY)) \
+                        if self.ma.compare_mode == GraphConst.MD5_COMPARE \
+                        else max(precision_index, api.data.get(GraphConst.JSON_INDEX_KEY, GraphConst.MIN_INDEX_KEY))
+                node.data[GraphConst.JSON_INDEX_KEY] = precision_index
     def _get_and_add_result(self, node_n, node_b):
         compare_result_list = compare_node([node_n.id, node_b.id],
                                            [self.data_n_dict, self.data_b_dict],

msprobe/visualization/compare/mode_adapter.py CHANGED Viewed

@@ -14,7 +14,6 @@
 # limitations under the License.
 import json
-import math
 from msprobe.core.common.const import CompareConst, Const
 from msprobe.visualization.utils import ToolTip, GraphConst, str2float
@@ -157,24 +156,6 @@ class ModeAdapter:
             return
         self.csv_data.extend(compare_result_list)
-    def add_error_key(self, node_data):
-        """
-        根据不同的模式进行提供不同错误信息
-        """
-        for key, value in node_data.items():
-            if not isinstance(value, dict):
-                continue
-            if self.compare_mode == GraphConst.SUMMARY_COMPARE:
-                message = [CompareConst.MAX_RELATIVE_ERR, CompareConst.MIN_RELATIVE_ERR,
-                           CompareConst.MEAN_RELATIVE_ERR, CompareConst.NORM_RELATIVE_ERR]
-            elif self.compare_mode == GraphConst.REAL_DATA_COMPARE:
-                message = [CompareConst.ONE_THOUSANDTH_ERR_RATIO, CompareConst.FIVE_THOUSANDTHS_ERR_RATIO]
-            else:
-                # 输出件优化
-                message = []
-            value[GraphConst.ERROR_KEY] = message
-            node_data[key] = value
     def get_tool_tip(self):
         """
         用于前端展示字段的具体含义

msprobe/visualization/graph/base_node.py CHANGED Viewed

@@ -12,10 +12,11 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from msprobe.core.overflow_check.level import OverflowLevel
-from msprobe.visualization.graph.node_op import NodeOp
 from msprobe.visualization.utils import GraphConst
 from msprobe.visualization.builder.msprobe_adapter import format_node_data, compare_data, compare_data_fuzzy
+from msprobe.core.common.log import logger
 class BaseNode:
@@ -114,7 +115,13 @@ class BaseNode:
         """
         ancestors = []
         current_node = self.upnode
+        seen_nodes = set()
         while current_node:
+            if current_node.id in seen_nodes:
+                logger.warning(f'Detected a cycle in the node structure and cannot get node ancestors, '
+                               f'current node is {current_node.id}.')
+                return []
+            seen_nodes.add(current_node.id)
             ancestors.append(current_node.id)
             current_node = current_node.upnode
         return list(reversed(ancestors))

msprobe/visualization/graph/distributed_analyzer.py CHANGED Viewed

@@ -107,15 +107,6 @@ class DistributedAnalyzer:
             return None, None
         return group_ranks, group_id
-    @staticmethod
-    def _get_batch_group_info(node, rank):
-        for data in node.input_data.values():
-            group_id = data.get('group_id')
-            if group_id is not None:
-                return group_id
-        logger.warning(f'The group_id of node {node.id} does not exist, {CANNOT_MATCH}{rank}')
-        return None
     def distributed_match(self):
         for rank, graph in self.graphs.items():
             nodes = graph.node_map
@@ -377,7 +368,7 @@ class DistributedAnalyzer:
                 target_api_name = self.config.get(api_name)[0]
                 target_rank = int(id_info[1].replace(Const.RANK, ''))
             except Exception as e:
-                logger.warning(f'Failed to parsing batch p2p parameter with error info: {e}.')
+                logger.warning(f'Failed to parse batch p2p parameter with error info: {e}.')
                 continue
             target_node = self._get_target_node(rank, unique_group_id, api_name, target_rank, target_api_name)
             if not target_node:

mindstudio-probe 1.2.2__py3-none-any.whl → 1.3.0__py3-none-any.whl

mindstudio-probe 1.2.2__py3-none-any.whl → 1.3.0__py3-none-any.whl