PyPI - mindstudio-probe - Versions diffs - 1.2.1__py3-none-any.whl → 1.3.0__py3-none-any.whl - Mend

mindstudio-probe 1.2.1-py3-none-any.whl → 1.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (177)

{mindstudio_probe-1.2.1.dist-info → mindstudio_probe-1.3.0.dist-info}/METADATA +3 -3
{mindstudio_probe-1.2.1.dist-info → mindstudio_probe-1.3.0.dist-info}/RECORD +168 -150
msprobe/README.md +27 -22
msprobe/core/common/const.py +129 -60
msprobe/core/common/decorator.py +50 -0
msprobe/core/common/exceptions.py +3 -1
msprobe/core/common/file_utils.py +25 -2
msprobe/core/common/inplace_ops.yaml +1 -0
msprobe/core/common/utils.py +43 -33
msprobe/core/compare/acc_compare.py +43 -74
msprobe/core/compare/check.py +2 -6
msprobe/core/compare/highlight.py +2 -0
msprobe/core/compare/layer_mapping/data_scope_parser.py +1 -1
msprobe/core/compare/layer_mapping/layer_mapping.py +2 -1
msprobe/core/compare/merge_result/merge_result.py +16 -9
msprobe/core/compare/merge_result/utils.py +81 -0
msprobe/core/compare/multiprocessing_compute.py +19 -12
msprobe/core/compare/npy_compare.py +30 -12
msprobe/core/compare/utils.py +30 -10
msprobe/core/data_dump/api_registry.py +176 -0
msprobe/core/data_dump/data_collector.py +58 -13
msprobe/core/data_dump/data_processor/base.py +94 -10
msprobe/core/data_dump/data_processor/factory.py +3 -0
msprobe/core/data_dump/data_processor/mindspore_processor.py +33 -33
msprobe/core/data_dump/data_processor/pytorch_processor.py +99 -18
msprobe/core/data_dump/json_writer.py +61 -40
msprobe/core/grad_probe/constant.py +1 -0
msprobe/core/grad_probe/grad_compare.py +1 -1
msprobe/core/overflow_check/abnormal_scene.py +2 -0
msprobe/docs/01.installation.md +27 -1
msprobe/docs/02.config_introduction.md +27 -23
msprobe/docs/03.config_examples.md +24 -0
msprobe/docs/05.data_dump_PyTorch.md +103 -16
msprobe/docs/06.data_dump_MindSpore.md +76 -32
msprobe/docs/07.accuracy_checker_PyTorch.md +11 -1
msprobe/docs/08.accuracy_checker_online_PyTorch.md +3 -1
msprobe/docs/09.accuracy_checker_MindSpore.md +5 -3
msprobe/docs/10.accuracy_compare_PyTorch.md +59 -33
msprobe/docs/11.accuracy_compare_MindSpore.md +40 -16
msprobe/docs/12.overflow_check_PyTorch.md +3 -1
msprobe/docs/13.overflow_check_MindSpore.md +4 -2
msprobe/docs/14.data_parse_PyTorch.md +1 -7
msprobe/docs/18.online_dispatch.md +1 -1
msprobe/docs/19.monitor.md +332 -273
msprobe/docs/21.visualization_PyTorch.md +42 -13
msprobe/docs/22.visualization_MindSpore.md +43 -13
msprobe/docs/23.generate_operator_PyTorch.md +9 -9
msprobe/docs/27.dump_json_instruction.md +301 -27
msprobe/docs/28.debugger_save_instruction.md +94 -0
msprobe/docs/28.kernel_dump_MindSpore.md +69 -0
msprobe/docs/29.data_dump_MSAdapter.md +229 -0
msprobe/docs/30.overflow_check_MSAdapter.md +31 -0
msprobe/docs/FAQ.md +3 -11
msprobe/docs/img/compare_result.png +0 -0
msprobe/docs/img/merge_result.png +0 -0
msprobe/docs/img/monitor/step_count_per_record.png +0 -0
msprobe/docs/img/visualization/vis_browser_1.png +0 -0
msprobe/docs/img/visualization/vis_match_info.png +0 -0
msprobe/docs/img/visualization/vis_precision_info.png +0 -0
msprobe/docs/img/visualization/vis_search_info.png +0 -0
msprobe/docs/img/visualization/vis_show_info.png +0 -0
msprobe/docs/img/visualization/vis_showcase.png +0 -0
msprobe/docs/img/visualization/vis_unmatch_info.png +0 -0
msprobe/mindspore/__init__.py +4 -2
msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +32 -7
msprobe/mindspore/api_accuracy_checker/api_runner.py +70 -22
msprobe/mindspore/api_accuracy_checker/base_compare_algorithm.py +2 -1
msprobe/mindspore/api_accuracy_checker/bench_functions/flash_attention_score.py +602 -0
msprobe/mindspore/api_accuracy_checker/bench_functions/fusion_operator.py +41 -0
msprobe/mindspore/api_accuracy_checker/compute_element.py +47 -1
msprobe/mindspore/api_accuracy_checker/data_manager.py +2 -1
msprobe/mindspore/api_accuracy_checker/multi_api_accuracy_checker.py +2 -1
msprobe/mindspore/api_accuracy_checker/torch_mindtorch_importer.py +130 -0
msprobe/mindspore/api_accuracy_checker/type_mapping.py +24 -1
msprobe/mindspore/api_accuracy_checker/utils.py +6 -1
msprobe/mindspore/common/const.py +61 -0
msprobe/mindspore/common/utils.py +48 -18
msprobe/mindspore/compare/ms_compare.py +27 -19
msprobe/mindspore/compare/ms_graph_compare.py +6 -5
msprobe/mindspore/debugger/debugger_config.py +31 -6
msprobe/mindspore/debugger/precision_debugger.py +45 -14
msprobe/mindspore/dump/dump_tool_factory.py +5 -3
msprobe/mindspore/dump/hook_cell/api_register.py +142 -0
msprobe/mindspore/dump/hook_cell/hook_cell.py +9 -10
msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml +24 -26
msprobe/mindspore/dump/jit_dump.py +21 -15
msprobe/mindspore/dym_loader/hook_dynamic_loader.cc +22 -56
msprobe/mindspore/dym_loader/hook_dynamic_loader.h +0 -1
msprobe/mindspore/free_benchmark/api_pynative_self_check.py +10 -6
msprobe/mindspore/free_benchmark/perturbation/perturbation_factory.py +4 -2
msprobe/mindspore/free_benchmark/self_check_tool_factory.py +6 -3
msprobe/mindspore/grad_probe/global_context.py +2 -0
msprobe/mindspore/grad_probe/grad_analyzer.py +2 -1
msprobe/mindspore/grad_probe/hook.py +2 -4
msprobe/mindspore/monitor/anomaly_detect.py +404 -0
msprobe/mindspore/monitor/distributed/__init__.py +0 -0
msprobe/mindspore/monitor/distributed/distributed_ops.yaml +15 -0
msprobe/mindspore/monitor/distributed/stack_blacklist.yaml +5 -0
msprobe/mindspore/monitor/distributed/wrap_distributed.py +300 -0
msprobe/mindspore/monitor/features.py +63 -0
msprobe/mindspore/monitor/module_hook.py +873 -0
msprobe/mindspore/monitor/module_spec_verifier.py +94 -0
msprobe/mindspore/monitor/utils.py +309 -0
msprobe/mindspore/ms_config.py +8 -2
msprobe/mindspore/overflow_check/overflow_check_tool_factory.py +5 -3
msprobe/mindspore/service.py +114 -34
msprobe/pytorch/__init__.py +0 -1
msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +3 -6
msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +12 -7
msprobe/pytorch/api_accuracy_checker/generate_op_script/operator_replication.template +2 -2
msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +4 -5
msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +5 -5
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +25 -6
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py +28 -19
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py +3 -1
msprobe/pytorch/bench_functions/apply_adam.py +215 -0
msprobe/pytorch/bench_functions/group_norm_silu.py +27 -0
msprobe/pytorch/{parse.py → bench_functions/mish.py} +6 -4
msprobe/pytorch/bench_functions/moe_gating_top_k_softmax.py +50 -0
msprobe/pytorch/bench_functions/sort_v2.py +21 -0
msprobe/pytorch/common/utils.py +97 -4
msprobe/pytorch/debugger/debugger_config.py +19 -9
msprobe/pytorch/debugger/precision_debugger.py +24 -1
msprobe/pytorch/dump/module_dump/module_dump.py +4 -3
msprobe/pytorch/dump/module_dump/module_processer.py +21 -35
msprobe/pytorch/free_benchmark/common/utils.py +1 -1
msprobe/pytorch/free_benchmark/compare/single_benchmark.py +1 -1
msprobe/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py +3 -3
msprobe/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py +3 -3
msprobe/pytorch/free_benchmark/perturbed_layers/npu/change_value.py +1 -1
msprobe/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py +1 -1
msprobe/pytorch/free_benchmark/result_handlers/check_handler.py +1 -1
msprobe/pytorch/function_factory.py +8 -2
msprobe/pytorch/grad_probe/grad_monitor.py +2 -2
msprobe/pytorch/hook_module/api_register.py +131 -0
msprobe/pytorch/hook_module/hook_module.py +19 -14
msprobe/pytorch/hook_module/register_optimizer_hook.py +2 -1
msprobe/pytorch/hook_module/support_wrap_ops.yaml +173 -75
msprobe/pytorch/monitor/anomaly_detect.py +14 -29
msprobe/pytorch/monitor/csv2tb.py +18 -14
msprobe/pytorch/monitor/distributed/wrap_distributed.py +8 -2
msprobe/pytorch/monitor/module_hook.py +238 -193
msprobe/pytorch/monitor/module_metric.py +9 -6
msprobe/pytorch/monitor/optimizer_collect.py +100 -67
msprobe/pytorch/monitor/unittest/test_monitor.py +1 -1
msprobe/pytorch/monitor/utils.py +76 -44
msprobe/pytorch/online_dispatch/compare.py +0 -2
msprobe/pytorch/online_dispatch/dispatch.py +9 -0
msprobe/pytorch/online_dispatch/dump_compare.py +3 -0
msprobe/pytorch/online_dispatch/utils.py +3 -0
msprobe/pytorch/parse_tool/lib/interactive_cli.py +1 -6
msprobe/pytorch/parse_tool/lib/utils.py +2 -1
msprobe/pytorch/pt_config.py +30 -29
msprobe/pytorch/service.py +114 -32
msprobe/visualization/builder/graph_builder.py +75 -10
msprobe/visualization/builder/msprobe_adapter.py +7 -6
msprobe/visualization/compare/graph_comparator.py +42 -38
msprobe/visualization/compare/mode_adapter.py +0 -19
msprobe/visualization/graph/base_node.py +11 -3
msprobe/visualization/graph/distributed_analyzer.py +71 -3
msprobe/visualization/graph/graph.py +0 -11
msprobe/visualization/graph/node_op.py +4 -3
msprobe/visualization/graph_service.py +4 -5
msprobe/visualization/utils.py +12 -35
msprobe/mindspore/dump/hook_cell/api_registry.py +0 -205
msprobe/mindspore/dump/hook_cell/wrap_api.py +0 -212
msprobe/pytorch/hook_module/api_registry.py +0 -166
msprobe/pytorch/hook_module/wrap_distributed.py +0 -75
msprobe/pytorch/hook_module/wrap_functional.py +0 -66
msprobe/pytorch/hook_module/wrap_npu_custom.py +0 -85
msprobe/pytorch/hook_module/wrap_tensor.py +0 -69
msprobe/pytorch/hook_module/wrap_torch.py +0 -84
msprobe/pytorch/hook_module/wrap_vf.py +0 -60
{mindstudio_probe-1.2.1.dist-info → mindstudio_probe-1.3.0.dist-info}/LICENSE +0 -0
{mindstudio_probe-1.2.1.dist-info → mindstudio_probe-1.3.0.dist-info}/WHEEL +0 -0
{mindstudio_probe-1.2.1.dist-info → mindstudio_probe-1.3.0.dist-info}/entry_points.txt +0 -0
{mindstudio_probe-1.2.1.dist-info → mindstudio_probe-1.3.0.dist-info}/top_level.txt +0 -0

msprobe/pytorch/pt_config.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
+# Copyright (c) 2024-2025, Huawei Technologies Co., Ltd.
 # All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0  (the "License");
@@ -16,9 +16,10 @@
 import os
 import re
-from msprobe.core.common.const import Const
+from msprobe.core.common.const import Const, FileCheckConst
 from msprobe.core.common.exceptions import MsprobeException
-from msprobe.core.common.file_utils import FileOpen, load_json, check_file_or_directory_path, check_crt_valid
+from msprobe.core.common.file_utils import FileOpen, load_json, check_file_or_directory_path, check_crt_valid, \
+    FileChecker
 from msprobe.core.common.log import logger
 from msprobe.core.common.utils import is_int
 from msprobe.core.common_config import BaseConfig, CommonConfig
@@ -66,6 +67,7 @@ class TensorConfig(BaseConfig):
             check_file_or_directory_path(os.path.join(self.tls_path, "client.key"))
             check_file_or_directory_path(os.path.join(self.tls_path, "client.crt"))
             check_crt_valid(os.path.join(self.tls_path, "client.crt"))
+            check_crt_valid(os.path.join(self.tls_path, "client.key"), True)
         if not isinstance(self.host, str) or not re.match(Const.ipv4_pattern, self.host):
             raise Exception(f"host: {self.host} is invalid.")
@@ -95,6 +97,8 @@ class OverflowCheckConfig(BaseConfig):
     def check_overflow_config(self):
         if self.overflow_nums is not None and not is_int(self.overflow_nums):
             raise Exception("overflow_num is invalid")
+        if self.overflow_nums is not None and self.overflow_nums != -1 and self.overflow_nums <= 0:
+            raise Exception("overflow_nums should be -1 or positive integer")
         if self.check_mode is not None and self.check_mode not in ["all", "aicore", "atomic"]:
             raise Exception("check_mode is invalid")
@@ -148,7 +152,7 @@ class FreeBenchmarkCheckConfig(BaseConfig):
                 self.pert_mode in PytorchFreeBenchmarkConst.CPU_MODE_LIST
         ):
             msg = (
-                f"You neet to and can only set fuzz_device as {DeviceType.CPU} "
+                f"You need to and can only set fuzz_device as {DeviceType.CPU} "
                 f"when pert_mode in {PytorchFreeBenchmarkConst.CPU_MODE_LIST}"
             )
             logger.error_log_with_exp(
@@ -271,13 +275,13 @@ class RunUTConfig(BaseConfig):
     @classmethod
     def check_nfs_path_config(cls, nfs_path):
-        if nfs_path and not os.path.exists(nfs_path):
-            raise Exception("nfs_path: %s does not exist" % nfs_path)
+        if nfs_path:
+            FileChecker(nfs_path, FileCheckConst.DIR, FileCheckConst.READ_ABLE).common_check()
     @classmethod
     def check_tls_path_config(cls, tls_path):
-        if tls_path and not os.path.exists(tls_path):
-            raise Exception("tls_path: %s does not exist" % tls_path)
+        if tls_path:
+            FileChecker(tls_path, FileCheckConst.DIR, FileCheckConst.READ_ABLE).common_check()
     def check_run_ut_config(self):
         RunUTConfig.check_filter_list_config(Const.WHITE_LIST, self.white_list)
@@ -303,28 +307,25 @@ class GradToolConfig(BaseConfig):
         check_bounds(self.bounds)
+class StructureConfig(BaseConfig):
+    def __init__(self, json_config):
+        super().__init__(json_config)
+TaskDict = {
+    Const.TENSOR: TensorConfig,
+    Const.STATISTICS: StatisticsConfig,
+    Const.OVERFLOW_CHECK: OverflowCheckConfig,
+    Const.FREE_BENCHMARK: FreeBenchmarkCheckConfig,
+    Const.RUN_UT: RunUTConfig,
+    Const.GRAD_PROBE: GradToolConfig,
+    Const.STRUCTURE: StructureConfig
+}
 def parse_task_config(task, json_config):
-    default_dic = {}
-    if task == Const.TENSOR:
-        config_dic = json_config.get(Const.TENSOR, default_dic)
-        return TensorConfig(config_dic)
-    elif task == Const.STATISTICS:
-        config_dic = json_config.get(Const.STATISTICS, default_dic)
-        return StatisticsConfig(config_dic)
-    elif task == Const.OVERFLOW_CHECK:
-        config_dic = json_config.get(Const.OVERFLOW_CHECK, default_dic)
-        return OverflowCheckConfig(config_dic)
-    elif task == Const.FREE_BENCHMARK:
-        config_dic = json_config.get(Const.FREE_BENCHMARK, default_dic)
-        return FreeBenchmarkCheckConfig(config_dic)
-    elif task == Const.RUN_UT:
-        config_dic = json_config.get(Const.RUN_UT, default_dic)
-        return RunUTConfig(config_dic)
-    elif task == Const.GRAD_PROBE:
-        config_dic = json_config.get(Const.GRAD_PROBE, default_dic)
-        return GradToolConfig(config_dic)
-    else:
-        return StatisticsConfig(default_dic)
+    task_map = json_config.get(task, dict())
+    return TaskDict.get(task)(task_map)
 def parse_json_config(json_file_path, task):

msprobe/pytorch/service.py CHANGED Viewed

@@ -15,22 +15,22 @@
 import functools
 import os
-from collections import namedtuple
+from collections import namedtuple, defaultdict
 import torch
 from msprobe.core.common.const import Const
 from msprobe.core.common.exceptions import DistributedNotInitializedError
 from msprobe.core.common.file_utils import create_directory
-from msprobe.core.common.utils import print_tools_ends_info
+from msprobe.core.common.utils import print_tools_ends_info, DumpPathAggregation
 from msprobe.core.data_dump.data_collector import build_data_collector
 from msprobe.core.data_dump.data_processor.base import ModuleForwardInputsOutputs, ModuleBackwardInputsOutputs
 from msprobe.core.data_dump.scope import BaseScope
 from msprobe.pytorch.api_accuracy_checker.common.utils import ApiData
 from msprobe.pytorch.common.log import logger
-from msprobe.pytorch.common.utils import get_rank_if_initialized
+from msprobe.pytorch.common.utils import get_rank_if_initialized, is_recomputation
 from msprobe.pytorch.dump.kernel_dump.kernel_config import create_kernel_config_json
 from msprobe.pytorch.dump.module_dump.module_processer import ModuleProcesser
-from msprobe.pytorch.hook_module.api_registry import api_register
+from msprobe.pytorch.hook_module.api_register import get_api_register
 from msprobe.pytorch.hook_module.hook_module import HOOKModule
 from msprobe.pytorch.hook_module.register_optimizer_hook import register_optimizer_hook
@@ -50,19 +50,25 @@ class Service:
         self.switch = False
         self.inner_switch = False
         self.current_iter = 0
+        self.loop = 0
+        self.init_step = 0
         self.first_start = True
         self.current_rank = None
         self.dump_iter_dir = None
         self.should_stop_service = False
         self.attl = None
         self.params_grad_info = {}
+        self.hook_handle_dict = {}
         # 提前注册，确保注册尽可能多的API hook
+        self.api_register = get_api_register()
         self.register_api_hook()
+        self.init_for_debug_level()
     def build_hook(self, module_type, name):
         def pre_hook(api_or_module_name, module, args, kwargs):
             if not self.should_execute_hook(module_type, module, True):
                 return args, kwargs
+            is_recompute = is_recomputation()
             self.inner_switch = True
             if module_type == BaseScope.Module_Type_Module:
@@ -77,7 +83,13 @@ class Service:
                 return None, None
             if self.data_collector:
                 module_input_output = ModuleForwardInputsOutputs(args=args, kwargs=kwargs, output=None)
-                self.data_collector.forward_input_data_collect(api_or_module_name, module, pid, module_input_output)
+                self.data_collector.forward_input_data_collect(
+                    api_or_module_name,
+                    module,
+                    pid,
+                    module_input_output,
+                    is_recompute
+                )
             self.inner_switch = False
             return args, kwargs
@@ -101,7 +113,12 @@ class Service:
             if not (Const.FORWARD in self.config.data_mode and Const.BACKWARD not in self.config.data_mode):
                 for param_name, param in params_dict.items():
                     if param.requires_grad:
-                        param.register_hook(grad_hook(module, ori_name, param_name))
+                        name = ori_name + Const.SEP + param_name
+                        old_handle = self.hook_handle_dict.get(name)
+                        if old_handle and hasattr(old_handle, "remove"):
+                            old_handle.remove()
+                        handle = param.register_hook(grad_hook(module, ori_name, param_name))
+                        self.hook_handle_dict[name] = handle
         def init_params_grad_info(module, params_dict):
             '''
@@ -125,6 +142,7 @@ class Service:
         def forward_hook(api_or_module_name, module, args, kwargs, output):
             if not self.should_execute_hook(module_type, module, True):
                 return None
+            is_recompute = is_recomputation()
             self.inner_switch = True
             if self.config.online_run_ut:
@@ -147,10 +165,15 @@ class Service:
             if module_type == BaseScope.Module_Type_Module:
                 api_or_module_name = module.mindstudio_reserved_name[-1]
                 self.data_collector.update_api_or_module_name(api_or_module_name)
-                params_dict = {key.split(Const.SEP)[-1]: value for key, value in module.named_parameters(recurse=False)}
-                setattr(module_input_output, Const.PARAMS, params_dict)
+                params_dict = {}
+                if self.config.task != Const.STRUCTURE:
+                    params_dict = {
+                        key.split(Const.SEP)[-1]: value
+                        for key, value in module.named_parameters(recurse=False)
+                    }
+                    setattr(module_input_output, Const.PARAMS, params_dict)
                 # 判断是否需要注册参数hook
-                if not hasattr(module, 'params_grad_name') and params_dict:
+                if params_dict:
                     ori_name = api_or_module_name.rsplit(Const.SEP, 2)[0]
                     grad_name = ori_name + Const.SEP + Const.PARAMS_GRAD
                     # 首次执行前向hook时，添加params_grad_name属性，并注册参数hook
@@ -160,7 +183,8 @@ class Service:
                     api_or_module_name,
                     module,
                     pid,
-                    module_input_output
+                    module_input_output,
+                    is_recompute
                 )
                 init_params_grad_info(module, params_dict)
             else:
@@ -169,7 +193,8 @@ class Service:
                     api_or_module_name,
                     module,
                     pid,
-                    module_input_output
+                    module_input_output,
+                    is_recompute
                 )
             if self.data_collector.if_return_forward_new_output():
@@ -185,6 +210,7 @@ class Service:
         def backward_hook(api_or_module_name, module, grad_input, grad_output):
             if not self.should_execute_hook(module_type, module, False):
                 return
+            is_recompute = is_recomputation()
             self.inner_switch = True
             if module_type == BaseScope.Module_Type_Module:
@@ -198,7 +224,13 @@ class Service:
             if self.data_collector:
                 # 此处获取到的grad_input实际为反向过程的输出数据，grad_output为反向过程的输入数据，因此传入时调换顺序
                 module_input_output = ModuleBackwardInputsOutputs(grad_input=grad_output, grad_output=grad_input)
-                self.data_collector.backward_data_collect(api_or_module_name, module, pid, module_input_output)
+                self.data_collector.backward_data_collect(
+                    api_or_module_name,
+                    module,
+                    pid,
+                    module_input_output,
+                    is_recompute
+                )
             self.inner_switch = False
         pid = os.getpid()
@@ -217,6 +249,10 @@ class Service:
         return HookFn(pre_forward_hook_fn, forward_hook_fn, backward_hook_fn, forward_hook_torch_version_below_2_fn)
     def start(self, model):
+        self.current_iter = self.loop + self.init_step
+        self.data_collector.update_iter(self.current_iter)
+        if self.config.level == Const.LEVEL_DEBUG:
+            return
         if self.need_stop_service():
             return
@@ -231,6 +267,8 @@ class Service:
             if self.config.rank and self.current_rank not in self.config.rank:
                 return
             self.register_module_hook()
+            if self.config.level == Const.LEVEL_MIX:
+                register_optimizer_hook(self.data_collector)
             self.first_start = False
         if self.config.online_run_ut and torch_version_above_or_equal_2:
             run_ut_dispatch(self.attl, True, self.config.online_run_ut_recompute)
@@ -241,6 +279,8 @@ class Service:
             logger.info_on_rank_0(f"Dump data will be saved in {self.dump_iter_dir}.")
     def stop(self):
+        if self.config.level == Const.LEVEL_DEBUG:
+            return
         if self.should_stop_service:
             return
         if self.config.step and self.current_iter not in self.config.step:
@@ -255,18 +295,21 @@ class Service:
             return
         if self.config.async_dump:
             self.data_collector.fill_stack_tensor_data()
-            self.data_collector.data_processor.dump_async_data()
+            if self.config.task == Const.TENSOR:
+                self.data_collector.data_processor.dump_async_data()
         self.data_collector.write_json()
     def step(self):
+        if self.config.level == Const.LEVEL_DEBUG:
+            return
         if self.should_stop_service:
             return
         if self.config.async_dump:
             self.data_collector.fill_stack_tensor_data()
-            self.data_collector.data_processor.dump_async_data()
+            if self.config.task == Const.TENSOR:
+                self.data_collector.data_processor.dump_async_data()
         self.data_collector.write_json()
-        self.current_iter += 1
-        self.data_collector.update_iter(self.current_iter)
+        self.loop += 1
         self.reset_status()
     def need_stop_service(self):
@@ -319,26 +362,22 @@ class Service:
         else:
             dump_data_dir = None
-        dump_file_path = os.path.join(dump_dir, "dump.json")
-        stack_file_path = os.path.join(dump_dir, "stack.json")
-        construct_file_path = os.path.join(dump_dir, "construct.json")
-        free_benchmark_file_path = os.path.join(self.config.dump_path, "free_benchmark.csv")
-        self.data_collector.update_dump_paths(
-            dump_file_path, stack_file_path, construct_file_path, dump_data_dir, free_benchmark_file_path
-        )
+        dump_path_aggregation = DumpPathAggregation()
+        dump_path_aggregation.dump_file_path = os.path.join(dump_dir, "dump.json")
+        dump_path_aggregation.stack_file_path = os.path.join(dump_dir, "stack.json")
+        dump_path_aggregation.construct_file_path = os.path.join(dump_dir, "construct.json")
+        dump_path_aggregation.dump_tensor_data_dir = dump_data_dir
+        dump_path_aggregation.free_benchmark_file_path = os.path.join(dump_dir, "free_benchmark.csv")
+        self.data_collector.update_dump_paths(dump_path_aggregation)
         self.data_collector.initialize_json_file(framework=Const.PT_FRAMEWORK)
     def register_api_hook(self):
         if self.config.level in [Const.LEVEL_MIX, Const.LEVEL_L1, Const.LEVEL_L2]:
             logger.info_on_rank_0(f"The api {self.config.task} hook function is successfully mounted to the model.")
-            api_register.initialize_hook(
-                functools.partial(self.build_hook, BaseScope.Module_Type_API),
-                self.config.online_run_ut
+            self.api_register.initialize_hook(
+                functools.partial(self.build_hook, BaseScope.Module_Type_API)
             )
-            api_register.api_modularity()
-        if self.config.level == Const.LEVEL_MIX:
-            register_optimizer_hook(self.data_collector)
+            self.api_register.register_all_api()
     def register_module_hook(self):
         if self.config.level in [Const.LEVEL_L0, Const.LEVEL_MIX]:
@@ -373,13 +412,13 @@ class Service:
         if self.config.nfs_path:
             self.attl.upload("end")
         elif self.attl.socket_manager is not None:
-            logger.info(f"pid: {os.getpid()} finished, start send STOP signal.")
+            logger.info(f"pid: {os.getpid()} finished, start sends STOP signal.")
             self.attl.socket_manager.send_stop_signal()
     def reset_status(self):
         ModuleProcesser.reset_module_stats()
         HOOKModule.reset_module_stats()
-        self.data_collector.data_writer.reset_cache()
+        self.data_collector.reset_status()
         self.params_grad_info.clear()
         if self.config.level == Const.LEVEL_L2:
@@ -389,3 +428,46 @@ class Service:
             return
         if self.config.rank and self.current_rank not in self.config.rank:
             return
+    def init_for_debug_level(self):
+        if not (self.config.level == Const.LEVEL_DEBUG and self.config.task in [Const.TENSOR, Const.STATISTICS]):
+            return
+        try:
+            self.current_rank = get_rank_if_initialized()
+        except DistributedNotInitializedError:
+            self.current_rank = None
+        # dir: dump_path -- rank{} -- debug.json
+        self.dump_iter_dir = self.config.dump_path
+        cur_rank = self.current_rank if self.current_rank is not None else ''
+        dump_dir = os.path.join(self.dump_iter_dir, f"rank{cur_rank}")
+        create_directory(dump_dir)
+        if self.config.task in self.data_collector.tasks_need_tensor_data:
+            dump_data_dir = os.path.join(dump_dir, "dump_tensor_data")
+            create_directory(dump_data_dir)
+        else:
+            dump_data_dir = None
+        dump_path_aggregation = DumpPathAggregation()
+        dump_path_aggregation.dump_tensor_data_dir = dump_data_dir
+        dump_path_aggregation.debug_file_path = os.path.join(dump_dir, "debug.json")
+        self.data_collector.update_dump_paths(dump_path_aggregation)
+        self.data_collector.initialize_json_file(framework=Const.PT_FRAMEWORK)
+        self.debug_variable_counter = defaultdict(int)
+    def save(self, variable, name, save_backward):
+        if self.config.level != Const.LEVEL_DEBUG:
+            return
+        count = self.debug_variable_counter[name]
+        self.debug_variable_counter[name] += 1
+        name_with_count = f"{name}.{count}"
+        grad_name_with_count = f"{name}_grad.{count}"
+        # forward save
+        self.data_collector.debug_data_collect_forward(variable, name_with_count)
+        # backward save
+        if save_backward:
+            self.data_collector.debug_data_collect_backward(variable, grad_name_with_count)

msprobe/visualization/builder/graph_builder.py CHANGED Viewed

@@ -16,18 +16,19 @@
 import re
 from msprobe.core.common.const import Const
-from msprobe.core.common.file_utils import load_json
+from msprobe.core.common.file_utils import load_json, save_json
 from msprobe.visualization.builder.msprobe_adapter import get_input_output
 from msprobe.visualization.builder.msprobe_adapter import op_patterns
 from msprobe.visualization.graph.graph import Graph
 from msprobe.visualization.graph.node_op import NodeOp
-from msprobe.visualization.utils import save_json_file, GraphConst
+from msprobe.visualization.utils import GraphConst
 class GraphBuilder:
     backward_pattern = re.compile(r"(\.backward\.)(\d+)$")
-    # 匹配以大写字母开头，后接任意字母，并以Template(结尾
-    template_pattern = re.compile(r'\b[A-Z][a-zA-Z]*Template\(')
+    forward_pattern = re.compile(r"(\.forward\.)(\d+)$")
+    # 匹配以大写字母开头，后接任意字母，并以Template(结尾，或包含api_template(的字符串
+    template_pattern = re.compile(r'\b([A-Z][a-zA-Z]*Template|api_template)\(')
     @staticmethod
     def build(construct_path, data_path, stack_path, model_name='DefaultModel', complete_stack=False):
@@ -50,6 +51,7 @@ class GraphBuilder:
         graph = Graph(model_name, data_path=dump_dict.get('dump_data_dir', ''), dump_data=data_dict)
         GraphBuilder._init_nodes(graph, construct_dict, data_dict, stack_dict)
         GraphBuilder._collect_apis_between_modules(graph)
+        GraphBuilder._add_parameters_grad(graph, data_dict)
         return graph
     @staticmethod
@@ -72,7 +74,7 @@ class GraphBuilder:
         if config.task:
             result[GraphConst.JSON_TASK_KEY] = config.task
         result[GraphConst.OVERFLOW_CHECK] = config.overflow_check
-        save_json_file(filename, result)
+        save_json(filename, result, indent=4)
     @staticmethod
     def _simplify_stack(stack_dict):
@@ -113,12 +115,17 @@ class GraphBuilder:
         如果backward节点的父级节点是null，则尝试从同名的forward节点寻找父级节点
         """
         # 匹配以.backward.后跟一个或多个数字结尾的模式
-        backward_pattern = r"(\.backward\.)(\d+)$"
-        forward_pattern = r"(\.forward\.)(\d+)$"
-        if re.search(backward_pattern, subnode_id) and not upnode_id:
-            forward_upnode_id = construct_dict.get(re.sub(backward_pattern, r".forward.\2", subnode_id))
+        if GraphBuilder.backward_pattern.search(subnode_id) and not upnode_id:
+            forward_upnode_id = construct_dict.get(GraphBuilder.backward_pattern.sub(r".forward.\2", subnode_id))
             if forward_upnode_id:
-                new_upnode_id = re.sub(forward_pattern, r".backward.\2", forward_upnode_id)
+                new_upnode_id = GraphBuilder.forward_pattern.sub(r".backward.\2", forward_upnode_id)
+                if new_upnode_id in construct_dict:
+                    return new_upnode_id
+        # 匹配以.backward结尾的节点
+        if subnode_id.endswith(Const.SEP + Const.BACKWARD) and not upnode_id:
+            forward_upnode_id = construct_dict.get(subnode_id.replace(Const.BACKWARD, Const.FORWARD))
+            if forward_upnode_id:
+                new_upnode_id = forward_upnode_id.replace(Const.FORWARD, Const.BACKWARD)
                 if new_upnode_id in construct_dict:
                     return new_upnode_id
         return upnode_id
@@ -148,6 +155,8 @@ class GraphBuilder:
             input_data, output_data = get_input_output(node_data, node.id)
             # 更新数据
             node.set_input_output(input_data, output_data)
+            if GraphConst.BATCH_P2P in name:
+                GraphBuilder._extract_batch_p2p_info(node, node_data)
             # 反向节点使用对应前向节点的堆栈信息
             # 模块命名举例：Module.module.module.GPTModel.backward.0; API命名举例：Tensor.permute.1.backward
             if (not node_stack_info and
@@ -164,6 +173,24 @@ class GraphBuilder:
         node.add_upnode(upnode)
         return node
+    @staticmethod
+    def _is_valid_batch_p2p_output(param_list):
+        if not isinstance(param_list, list) or not param_list:
+            return False
+        if not isinstance(param_list[0], list) or not param_list[0]:
+            return False
+        return True
+    @staticmethod
+    def _extract_batch_p2p_info(node, node_data):
+        param_list = node_data.get(Const.OUTPUT, [])
+        # 数据格式："output": [[{param1}, {param2}, ...]]
+        if GraphBuilder._is_valid_batch_p2p_output(param_list):
+            for param in param_list[0]:
+                info = {GraphConst.OP: param.get(GraphConst.OP), GraphConst.PEER: param.get(GraphConst.PEER),
+                        GraphConst.GROUP_ID: param.get(GraphConst.GROUP_ID)}
+                node.batch_p2p_info.append(info)
     @staticmethod
     def _collect_apis_between_modules(graph):
         """
@@ -209,6 +236,44 @@ class GraphBuilder:
         graph.root.subnodes = output
+    @staticmethod
+    def _add_parameters_grad(graph, data_dict):
+        """
+        将parameters_grad信息添加到graph中，
+        对应模块的parameters_grad节点添加到对应模块的最后一次backward节点（backward计数最大）内作为子节点
+        例如，graph有节点Module.a.backward.0, Module.a.backward.1, Module.a.backward.2
+        则Module.a.parameters_grad添加在Module.a.backward.2内作为子节点
+        """
+        prefixes = []
+        suffix = Const.SEP + Const.PARAMS_GRAD
+        for node_id in data_dict.keys():
+            if node_id not in graph.node_map and node_id.endswith(suffix):
+                prefixes.append(node_id.replace(suffix, ''))
+        max_info = {prefix: 0 for prefix in prefixes}
+        for key in graph.node_map.keys():
+            for prefix in prefixes:
+                # 构建正则表达式，匹配以 "backward.数字" 结尾的键
+                pattern = re.compile(r'^' + re.escape(prefix) + r'\.backward\.(\d+)$')
+                match = pattern.match(key)
+                if match:
+                    num = int(match.group(1))
+                    if num > max_info[prefix]:
+                        max_info[prefix] = num
+        for prefix, num in max_info.items():
+            node_id = prefix + Const.SEP + Const.BACKWARD + Const.SEP + str(num)
+            node = graph.get_node(node_id)
+            if node:
+                parameters_grad_node_id = graph.add_node(NodeOp.module, prefix + suffix, up_node=node)
+                # 添加输入输出数据
+                node_data = data_dict.get(parameters_grad_node_id, {})
+                input_data, output_data = get_input_output(node_data, parameters_grad_node_id)
+                # 更新数据
+                graph.get_node(parameters_grad_node_id).set_input_output(input_data, output_data)
 class GraphExportConfig:
     def __init__(self, graph_n, graph_b=None, tool_tip=None, node_colors=None, micro_steps=None, task='',

msprobe/visualization/builder/msprobe_adapter.py CHANGED Viewed

@@ -13,7 +13,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import re
-import math
 from msprobe.core.compare.acc_compare import read_op, merge_tensor, get_accuracy
 from msprobe.core.common.utils import set_dump_path, get_dump_mode
 from msprobe.visualization.utils import GraphConst
@@ -23,7 +22,7 @@ from msprobe.core.compare.acc_compare import ModeConfig
 # 用于将节点名字解析成对应的NodeOp的规则
 op_patterns = [
     # NodeOp.module
-    r'^(Module.|Cell.)',
+    r'^(Module.|Cell.|optimizer|clip_grad)',
     # NodeOp.function_api
     r'^(Tensor.|Torch.|Functional.|NPU.|VF.|Distributed.|Aten.|Mint.|Primitive.|Jit.|MintFunctional.)'
 ]
@@ -57,8 +56,8 @@ def run_real_data(dump_path_param, csv_path, framework, is_cross_frame=False):
         from msprobe.pytorch.compare.pt_compare import PTComparator
         return PTComparator(mode_config).do_multi_process(dump_path_param, csv_path)
     else:
-        from msprobe.mindspore.compare.ms_compare import MSComparator
-        ms_comparator = MSComparator(mode_config)
+        from msprobe.mindspore.compare.ms_compare import MSComparator, MappingConfig
+        ms_comparator = MSComparator(mode_config, MappingConfig())
         ms_comparator.cross_frame = is_cross_frame
         return ms_comparator.do_multi_process(dump_path_param, csv_path)
@@ -120,11 +119,13 @@ def compare_data_fuzzy(data_dict_list1, data_dict_list2):
     return True
-def format_node_data(data_dict):
+def format_node_data(data_dict, node_id=None):
     """
-    批量进行节点数据的输出
+    删除节点数据中不需要展示的字段
     """
     del_list = ['requires_grad', 'full_op_name']
+    if node_id and GraphConst.BATCH_P2P in node_id:
+        del_list.extend(['op', 'peer', 'tag', 'group_id'])
     for _, value in data_dict.items():
         if not isinstance(value, dict):
             continue

mindstudio-probe 1.2.1__py3-none-any.whl → 1.3.0__py3-none-any.whl

mindstudio-probe 1.2.1py3-none-any.whl → 1.3.0py3-none-any.whl