PyPI - mindstudio-probe - Versions diffs - 1.3.0__py3-none-any.whl → 8.1.1__py3-none-any.whl - Mend

mindstudio-probe 1.3.0-py3-none-any.whl → 8.1.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (213) hide show

{mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/METADATA +4 -2
{mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/RECORD +204 -152
msprobe/README.md +32 -1
msprobe/core/__init__.py +17 -0
msprobe/core/common/const.py +120 -21
msprobe/core/common/exceptions.py +2 -2
msprobe/core/common/file_utils.py +279 -50
msprobe/core/common/framework_adapter.py +169 -0
msprobe/core/common/global_lock.py +86 -0
msprobe/core/common/runtime.py +25 -0
msprobe/core/common/utils.py +136 -45
msprobe/core/common_config.py +7 -0
msprobe/core/compare/acc_compare.py +646 -428
msprobe/core/compare/check.py +36 -103
msprobe/core/compare/compare_cli.py +4 -0
msprobe/core/compare/config.py +72 -0
msprobe/core/compare/highlight.py +215 -215
msprobe/core/compare/layer_mapping/layer_mapping.py +2 -0
msprobe/core/compare/merge_result/merge_result.py +4 -4
msprobe/core/compare/multiprocessing_compute.py +223 -110
msprobe/core/compare/npy_compare.py +2 -4
msprobe/core/compare/utils.py +214 -244
msprobe/core/config_check/__init__.py +17 -0
msprobe/{pytorch/dump/kernel_dump/kernel_config.py → core/config_check/checkers/__init__.py} +8 -16
msprobe/core/config_check/checkers/base_checker.py +60 -0
msprobe/core/config_check/checkers/dataset_checker.py +138 -0
msprobe/core/config_check/checkers/env_args_checker.py +96 -0
msprobe/core/config_check/checkers/hyperparameter_checker.py +170 -0
msprobe/core/config_check/checkers/pip_checker.py +90 -0
msprobe/core/config_check/checkers/random_checker.py +367 -0
msprobe/core/config_check/checkers/weights_checker.py +147 -0
msprobe/core/config_check/ckpt_compare/ckpt_comparator.py +74 -0
msprobe/core/config_check/ckpt_compare/megatron_loader.py +302 -0
msprobe/core/config_check/ckpt_compare/metrics.py +83 -0
msprobe/core/config_check/ckpt_compare/name_mapping.yaml +12 -0
msprobe/core/config_check/config_check_cli.py +51 -0
msprobe/core/config_check/config_checker.py +100 -0
msprobe/{mindspore/runtime.py → core/config_check/resource/dependency.yaml} +7 -4
msprobe/core/config_check/resource/env.yaml +57 -0
msprobe/core/config_check/resource/hyperparameter.yaml +21 -0
msprobe/core/config_check/utils/hyperparameter_parser.py +115 -0
msprobe/core/config_check/utils/utils.py +107 -0
msprobe/core/data_dump/api_registry.py +67 -4
msprobe/core/data_dump/data_collector.py +170 -89
msprobe/core/data_dump/data_processor/base.py +72 -51
msprobe/core/data_dump/data_processor/mindspore_processor.py +109 -55
msprobe/core/data_dump/data_processor/pytorch_processor.py +90 -82
msprobe/core/data_dump/json_writer.py +143 -27
msprobe/core/debugger/precision_debugger.py +144 -0
msprobe/core/grad_probe/constant.py +1 -1
msprobe/core/grad_probe/grad_compare.py +1 -1
msprobe/core/grad_probe/utils.py +1 -1
msprobe/core/hook_manager.py +242 -0
msprobe/core/monitor/anomaly_processor.py +384 -0
msprobe/core/service.py +357 -0
msprobe/core/single_save/__init__.py +0 -0
msprobe/core/single_save/single_comparator.py +243 -0
msprobe/core/single_save/single_saver.py +146 -0
msprobe/docs/01.installation.md +6 -5
msprobe/docs/02.config_introduction.md +79 -22
msprobe/docs/03.config_examples.md +1 -0
msprobe/docs/04.kernel_dump_PyTorch.md +1 -1
msprobe/docs/05.data_dump_PyTorch.md +118 -49
msprobe/docs/06.data_dump_MindSpore.md +167 -20
msprobe/docs/07.accuracy_checker_PyTorch.md +2 -2
msprobe/docs/08.accuracy_checker_online_PyTorch.md +69 -9
msprobe/docs/09.accuracy_checker_MindSpore.md +18 -6
msprobe/docs/10.accuracy_compare_PyTorch.md +212 -74
msprobe/docs/11.accuracy_compare_MindSpore.md +87 -37
msprobe/docs/12.overflow_check_PyTorch.md +2 -2
msprobe/docs/13.overflow_check_MindSpore.md +2 -2
msprobe/docs/14.data_parse_PyTorch.md +3 -3
msprobe/docs/17.grad_probe.md +2 -1
msprobe/docs/18.online_dispatch.md +2 -2
msprobe/docs/19.monitor.md +90 -44
msprobe/docs/21.visualization_PyTorch.md +68 -15
msprobe/docs/22.visualization_MindSpore.md +71 -18
msprobe/docs/25.tool_function_introduction.md +23 -22
msprobe/docs/26.data_dump_PyTorch_baseline.md +14 -3
msprobe/docs/27.dump_json_instruction.md +1 -1
msprobe/docs/28.debugger_save_instruction.md +111 -20
msprobe/docs/29.data_dump_MSAdapter.md +2 -2
msprobe/docs/30.overflow_check_MSAdapter.md +2 -2
msprobe/docs/31.config_check.md +95 -0
msprobe/docs/32.ckpt_compare.md +69 -0
msprobe/docs/33.generate_operator_MindSpore.md +181 -0
msprobe/docs/34.RL_collect.md +92 -0
msprobe/docs/35.nan_analyze.md +72 -0
msprobe/docs/data_dump_MindSpore/data_dump_MindSpore_baseline.md +12 -1
msprobe/docs/data_dump_MindSpore/dynamic_graph_quick_start_example.md +3 -1
msprobe/docs/img/compare_result.png +0 -0
msprobe/docs/img/save_compare_result_sample.png +0 -0
msprobe/docs/img/visualization/proxy.png +0 -0
msprobe/mindspore/__init__.py +1 -2
msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +150 -58
msprobe/mindspore/api_accuracy_checker/api_runner.py +7 -3
msprobe/mindspore/api_accuracy_checker/bench_functions/flash_attention_score.py +47 -69
msprobe/mindspore/api_accuracy_checker/cmd_parser.py +4 -0
msprobe/mindspore/api_accuracy_checker/compute_element.py +0 -1
msprobe/mindspore/api_accuracy_checker/data_manager.py +2 -2
msprobe/mindspore/api_accuracy_checker/generate_op_script/op_generator.py +460 -0
msprobe/mindspore/api_accuracy_checker/generate_op_script/operator_replication.template +2081 -0
msprobe/mindspore/api_accuracy_checker/multi_api_accuracy_checker.py +9 -0
msprobe/mindspore/api_accuracy_checker/torch_mindtorch_importer.py +2 -1
msprobe/mindspore/cell_processor.py +204 -33
msprobe/mindspore/code_mapping/graph_parser.py +4 -21
msprobe/mindspore/common/const.py +17 -7
msprobe/mindspore/common/utils.py +128 -11
msprobe/mindspore/compare/common_dir_compare.py +382 -0
msprobe/mindspore/compare/distributed_compare.py +2 -26
msprobe/mindspore/compare/ms_compare.py +17 -405
msprobe/mindspore/compare/ms_graph_compare.py +14 -5
msprobe/mindspore/compare/utils.py +37 -0
msprobe/mindspore/debugger/debugger_config.py +53 -3
msprobe/mindspore/debugger/precision_debugger.py +72 -91
msprobe/mindspore/dump/cell_dump_process.py +877 -0
msprobe/mindspore/dump/cell_dump_with_insert_gradient.py +864 -0
msprobe/mindspore/dump/dump_tool_factory.py +13 -5
msprobe/mindspore/dump/graph_mode_cell_dump.py +139 -0
msprobe/mindspore/dump/graph_tensor_dump.py +123 -0
msprobe/mindspore/dump/hook_cell/api_register.py +40 -6
msprobe/mindspore/dump/hook_cell/hook_cell.py +18 -7
msprobe/mindspore/dump/hook_cell/ms_hook_manager.py +88 -0
msprobe/mindspore/dump/hook_cell/primitive_hooks.py +8 -2
msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml +18 -0
msprobe/mindspore/dump/jit_dump.py +21 -18
msprobe/mindspore/dump/kernel_kbyk_dump.py +6 -3
msprobe/mindspore/dym_loader/hook_dynamic_loader.cpp +110 -0
msprobe/mindspore/dym_loader/hook_dynamic_loader.h +15 -15
msprobe/mindspore/free_benchmark/api_pynative_self_check.py +12 -6
msprobe/mindspore/free_benchmark/common/utils.py +1 -1
msprobe/mindspore/grad_probe/global_context.py +7 -2
msprobe/mindspore/grad_probe/grad_stat_csv.py +3 -2
msprobe/mindspore/mindspore_service.py +114 -0
msprobe/mindspore/monitor/common_func.py +52 -0
msprobe/mindspore/monitor/data_writers.py +237 -0
msprobe/mindspore/monitor/features.py +20 -7
msprobe/mindspore/monitor/module_hook.py +281 -209
msprobe/mindspore/monitor/optimizer_collect.py +334 -0
msprobe/mindspore/monitor/utils.py +25 -5
msprobe/mindspore/ms_config.py +16 -15
msprobe/mindspore/task_handler_factory.py +5 -2
msprobe/msprobe.py +19 -0
msprobe/nan_analyze/__init__.py +14 -0
msprobe/nan_analyze/analyzer.py +255 -0
msprobe/nan_analyze/graph.py +189 -0
msprobe/nan_analyze/utils.py +211 -0
msprobe/pytorch/api_accuracy_checker/common/config.py +2 -2
msprobe/pytorch/api_accuracy_checker/compare/compare.py +36 -34
msprobe/pytorch/api_accuracy_checker/compare/compare_utils.py +20 -20
msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +4 -7
msprobe/pytorch/api_accuracy_checker/generate_op_script/operator_replication.template +204 -2
msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +12 -11
msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +1 -0
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +8 -5
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py +2 -3
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +29 -13
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +12 -2
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +45 -31
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/utils.py +156 -0
msprobe/pytorch/attl_manager.py +65 -0
msprobe/pytorch/bench_functions/npu_fusion_attention.py +27 -0
msprobe/pytorch/common/utils.py +26 -14
msprobe/pytorch/compare/distributed_compare.py +4 -36
msprobe/pytorch/compare/pt_compare.py +13 -84
msprobe/pytorch/compare/utils.py +47 -0
msprobe/pytorch/debugger/debugger_config.py +34 -17
msprobe/pytorch/debugger/precision_debugger.py +66 -118
msprobe/pytorch/dump/module_dump/hook_wrapper.py +93 -0
msprobe/pytorch/dump/module_dump/module_dump.py +11 -58
msprobe/pytorch/dump/module_dump/module_processer.py +143 -113
msprobe/pytorch/grad_probe/grad_stat_csv.py +3 -2
msprobe/pytorch/hook_module/api_register.py +29 -5
msprobe/pytorch/hook_module/hook_module.py +9 -18
msprobe/pytorch/hook_module/jit_script_wrapper.py +33 -0
msprobe/pytorch/hook_module/pt_hook_manager.py +68 -0
msprobe/pytorch/hook_module/support_wrap_ops.yaml +22 -1
msprobe/pytorch/hook_module/utils.py +28 -2
msprobe/pytorch/monitor/csv2tb.py +6 -2
msprobe/pytorch/monitor/data_writers.py +259 -0
msprobe/pytorch/monitor/module_hook.py +227 -158
msprobe/pytorch/monitor/module_metric.py +14 -0
msprobe/pytorch/monitor/optimizer_collect.py +242 -270
msprobe/pytorch/monitor/utils.py +16 -3
msprobe/pytorch/online_dispatch/dispatch.py +4 -2
msprobe/pytorch/online_dispatch/dump_compare.py +5 -2
msprobe/pytorch/parse_tool/lib/utils.py +3 -3
msprobe/pytorch/pt_config.py +8 -7
msprobe/pytorch/pytorch_service.py +73 -0
msprobe/visualization/builder/graph_builder.py +33 -13
msprobe/visualization/builder/msprobe_adapter.py +24 -11
msprobe/visualization/compare/graph_comparator.py +53 -45
msprobe/visualization/compare/mode_adapter.py +31 -1
msprobe/visualization/graph/base_node.py +3 -3
msprobe/visualization/graph/graph.py +2 -2
msprobe/visualization/graph_service.py +250 -103
msprobe/visualization/utils.py +27 -11
msprobe/mindspore/dym_loader/hook_dynamic_loader.cc +0 -106
msprobe/mindspore/monitor/anomaly_detect.py +0 -404
msprobe/mindspore/monitor/module_spec_verifier.py +0 -94
msprobe/mindspore/service.py +0 -549
msprobe/pytorch/monitor/anomaly_analyse.py +0 -201
msprobe/pytorch/monitor/anomaly_detect.py +0 -410
msprobe/pytorch/monitor/module_spec_verifier.py +0 -95
msprobe/pytorch/monitor/unittest/test_monitor.py +0 -160
msprobe/pytorch/service.py +0 -473
{mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/LICENSE +0 -0
{mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/WHEEL +0 -0
{mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/entry_points.txt +0 -0
{mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/top_level.txt +0 -0
/msprobe/{mindspore → core}/compare/ms_to_pt_api.yaml +0 -0
/msprobe/{mindspore/dump → core}/kernel_dump/kernel_config.py +0 -0
/msprobe/{pytorch/monitor/unittest → core/monitor}/__init__.py +0 -0

msprobe/pytorch/monitor/utils.py CHANGED Viewed

@@ -22,7 +22,7 @@ import re
 import torch
-from msprobe.core.common.const import MonitorConst, Const
+from msprobe.core.common.const import MonitorConst
 from msprobe.pytorch.common.log import logger
 from msprobe.core.common.utils import is_int
 from msprobe.core.common.file_utils import check_file_or_directory_path, recursive_chmod
@@ -43,7 +43,6 @@ DIRECTORY_MAX_LENGTH = 4096
 beijing_tz = timezone(timedelta(hours=8))
 MVResult = namedtuple('MVResult', ("exp_avg", "exp_avg_sq", "update", "ratio"))
-MVGradResult = namedtuple('MVGradResult', ("exp_avg", "exp_avg_sq", "update", "ratio", "grad"))
 class MsgConst:
@@ -102,6 +101,11 @@ def validate_ops(ops):
         default_op = MonitorConst.OP_LIST[0]
         valid_ops.append(default_op)
         logger.info_on_rank_0(f"There is no valid ops, default op {default_op} is used")
+    # 增加默认shape和dtype参数
+    if "shape" not in valid_ops:
+        valid_ops.append("shape")
+    if "dtype" not in valid_ops:
+        valid_ops.append("dtype")
     return valid_ops
@@ -199,7 +203,7 @@ def validate_alert(alert):
             args = rule.get("args")
             if args and isinstance(args, dict):
                 threshold = args.get("threshold")
-                if not isinstance(threshold, float) or threshold < 0:
+                if not isinstance(threshold, (float, int)) or threshold < 0:
                     raise TypeError('threshold must be float and not less than 0')
     dump = alert.get('dump')
     if dump and not isinstance(dump, bool):
@@ -220,6 +224,13 @@ def validate_dynamic_on(dynamic_on):
         raise TypeError('dynamic_on should be a bool')
+def validate_monitor_mbs_grad(monitor_mbs_grad):
+    if not isinstance(monitor_mbs_grad, bool):
+        logger.warning(f'monitor_mbs_grad should be a bool, actual value is {monitor_mbs_grad}.')
+        return False
+    return monitor_mbs_grad
 def validate_config(config):
     config['ops'] = validate_ops(config.get('ops', []))
@@ -274,6 +285,8 @@ def validate_config(config):
     squash_name = config.get('squash_name', True)
     validate_squash_name(squash_name)
+    config["monitor_mbs_grad"] = validate_monitor_mbs_grad(config.get('monitor_mbs_grad', False))
     dynamic_on = config.get('dynamic_on', False)
     validate_dynamic_on(dynamic_on)

msprobe/pytorch/online_dispatch/dispatch.py CHANGED Viewed

@@ -208,8 +208,10 @@ class PtdbgDispatch(TorchDispatchMode):
             dispatch_workflow(run_param, data_info)
         else:
             self.lock.acquire()
-            self.all_summary.append([])
-            self.lock.release()
+            try:
+                self.all_summary.append([])
+            finally:
+                self.lock.release()
             run_param.process_flag = True
             if self.check_fun(func, run_param):
                 data_info = DisPatchDataInfo(cpu_args, cpu_kwargs, self.all_summary, None, npu_out_cpu, cpu_out,

msprobe/pytorch/online_dispatch/dump_compare.py CHANGED Viewed

@@ -110,8 +110,11 @@ def dump_data(data, prefix, dump_path):
 def save_temp_summary(api_index, single_api_summary, path, lock):
     summary_path = os.path.join(path, f'summary.json')
     lock.acquire()
-    data = [api_index, single_api_summary]
-    save_json(summary_path, data, mode='a')
+    try:
+        data = [api_index, single_api_summary]
+        save_json(summary_path, data, mode='a')
+    finally:
+        lock.release()
 def dispatch_workflow(run_param: DispatchRunParam, data_info: DisPatchDataInfo):

msprobe/pytorch/parse_tool/lib/utils.py CHANGED Viewed

@@ -13,12 +13,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import hashlib
 import os
 import re
 import subprocess
 import sys
 import time
+import zlib
 from collections import namedtuple
 import numpy as np
@@ -114,8 +114,8 @@ class Util:
     @staticmethod
     def get_md5_for_numpy(obj):
         np_bytes = obj.tobytes()
-        md5_hash = hashlib.md5(np_bytes)
-        return md5_hash.hexdigest()
+        md5_crc = zlib.crc32(np_bytes)
+        return f"{md5_crc:08x}"
     @staticmethod
     def deal_with_dir_or_file_inconsistency(output_path):

msprobe/pytorch/pt_config.py CHANGED Viewed

@@ -18,8 +18,7 @@ import re
 from msprobe.core.common.const import Const, FileCheckConst
 from msprobe.core.common.exceptions import MsprobeException
-from msprobe.core.common.file_utils import FileOpen, load_json, check_file_or_directory_path, check_crt_valid, \
-    FileChecker
+from msprobe.core.common.file_utils import FileOpen, load_json, check_file_or_directory_path, FileChecker
 from msprobe.core.common.log import logger
 from msprobe.core.common.utils import is_int
 from msprobe.core.common_config import BaseConfig, CommonConfig
@@ -43,6 +42,7 @@ class TensorConfig(BaseConfig):
         self.tls_path = json_config.get("tls_path", "./")
         self.online_run_ut_recompute = json_config.get("online_run_ut_recompute", False)
         self.check_config()
+        self._check_summary_mode()
         self._check_file_format()
         if self.online_run_ut:
             self._check_online_run_ut()
@@ -66,8 +66,10 @@ class TensorConfig(BaseConfig):
             check_file_or_directory_path(self.tls_path, isdir=True)
             check_file_or_directory_path(os.path.join(self.tls_path, "client.key"))
             check_file_or_directory_path(os.path.join(self.tls_path, "client.crt"))
-            check_crt_valid(os.path.join(self.tls_path, "client.crt"))
-            check_crt_valid(os.path.join(self.tls_path, "client.key"), True)
+            check_file_or_directory_path(os.path.join(self.tls_path, "ca.crt"))
+            crl_path = os.path.join(self.tls_path, "crl.pem")
+            if os.path.exists(crl_path):
+                check_file_or_directory_path(crl_path)
         if not isinstance(self.host, str) or not re.match(Const.ipv4_pattern, self.host):
             raise Exception(f"host: {self.host} is invalid.")
@@ -82,9 +84,8 @@ class StatisticsConfig(BaseConfig):
         self.check_config()
         self._check_summary_mode()
-    def _check_summary_mode(self):
-        if self.summary_mode and self.summary_mode not in ["statistics", "md5"]:
-            raise Exception("summary_mode is invalid")
+        self.tensor_list = json_config.get("tensor_list", [])
+        self._check_str_list_config(self.tensor_list, "tensor_list")
 class OverflowCheckConfig(BaseConfig):

msprobe/pytorch/pytorch_service.py ADDED Viewed

@@ -0,0 +1,73 @@
+# Copyright (c) 2025, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from msprobe.core.common.utils import Const
+from msprobe.core.service import BaseService
+from msprobe.pytorch.attl_manager import ATTLManager
+from msprobe.pytorch.common.log import logger
+from msprobe.pytorch.common.utils import get_rank_if_initialized, torch_version_above_or_equal_2
+from msprobe.pytorch.dump.module_dump.module_processer import ModuleProcesser
+from msprobe.pytorch.hook_module.api_register import get_api_register, ApiTemplate
+from msprobe.pytorch.hook_module.hook_module import HOOKModule
+from msprobe.pytorch.hook_module.jit_script_wrapper import wrap_jit_script_func
+from msprobe.pytorch.hook_module.pt_hook_manager import PytorchHookManager
+from msprobe.pytorch.hook_module.register_optimizer_hook import register_optimizer_hook
+if torch_version_above_or_equal_2:
+    from msprobe.pytorch.api_accuracy_checker.tensor_transport_layer.dump_dispatch import run_ut_dispatch
+class PytorchService(BaseService):
+    @property
+    def _get_framework_type(self):
+        return Const.PT_FRAMEWORK
+    @staticmethod
+    def _get_current_rank():
+        return get_rank_if_initialized()
+    def reset_status(self):
+        self._reset_status()
+    def _init_specific_components(self):
+        self.logger = logger
+        self.api_register = get_api_register()
+        self.module_processor = ModuleProcesser(self.data_collector.scope)
+        self.attl_manager = ATTLManager(self.config)
+        self.hook_manager = PytorchHookManager(self.data_collector, self.config, self.attl_manager)
+        self.api_template = ApiTemplate
+    def _register_hook(self):
+        self.attl_manager.attl_init()
+        if self._is_mix_level:
+            register_optimizer_hook(self.data_collector)
+    def _register_api_hook(self):
+        super()._register_api_hook()
+        wrap_jit_script_func()
+    def _register_module_hook(self):
+        ModuleProcesser.enable_module_dump = True
+        self.module_processor.register_module_hook(self.model, self.build_hook)
+        self.logger.info(f"The module {self.config.task} hook function is successfully mounted to the model.")
+    def _run_ut_dispatch(self, status):
+        if torch_version_above_or_equal_2:
+            run_ut_dispatch(self.attl_manager.attl, status, self.config.online_run_ut_recompute)
+    def _reset_status(self):
+        super()._reset_status()
+        ModuleProcesser.reset_module_stats()
+        HOOKModule.reset_module_stats()

msprobe/visualization/builder/graph_builder.py CHANGED Viewed

@@ -14,9 +14,11 @@
 # limitations under the License.
 import re
+from dataclasses import dataclass
 from msprobe.core.common.const import Const
 from msprobe.core.common.file_utils import load_json, save_json
+from msprobe.core.common.utils import load_stack_json
 from msprobe.visualization.builder.msprobe_adapter import get_input_output
 from msprobe.visualization.builder.msprobe_adapter import op_patterns
 from msprobe.visualization.graph.graph import Graph
@@ -44,7 +46,7 @@ class GraphBuilder:
         """
         construct_dict = load_json(construct_path)
         dump_dict = load_json(data_path)
-        stack_dict = load_json(stack_path)
+        stack_dict = load_stack_json(stack_path)
         if not complete_stack:
             GraphBuilder._simplify_stack(stack_dict)
         data_dict = dump_dict.get(GraphConst.DATA_KEY, {})
@@ -61,10 +63,10 @@ class GraphBuilder:
         """
         result = {}
         if config.graph_b:
-            result[GraphConst.JSON_NPU_KEY] = config.graph_n.to_dict()
-            result[GraphConst.JSON_BENCH_KEY] = config.graph_b.to_dict()
+            result[GraphConst.JSON_NPU_KEY] = config.graph_n.to_dict(config.compare_mode)
+            result[GraphConst.JSON_BENCH_KEY] = config.graph_b.to_dict(config.compare_mode)
         else:
-            result = config.graph_n.to_dict()
+            result = config.graph_n.to_dict(config.compare_mode)
         if config.tool_tip:
             result[GraphConst.JSON_TIP_KEY] = config.tool_tip
         if config.node_colors:
@@ -187,6 +189,8 @@ class GraphBuilder:
         # 数据格式："output": [[{param1}, {param2}, ...]]
         if GraphBuilder._is_valid_batch_p2p_output(param_list):
             for param in param_list[0]:
+                if not isinstance(param, dict):
+                    continue
                 info = {GraphConst.OP: param.get(GraphConst.OP), GraphConst.PEER: param.get(GraphConst.PEER),
                         GraphConst.GROUP_ID: param.get(GraphConst.GROUP_ID)}
                 node.batch_p2p_info.append(info)
@@ -254,14 +258,12 @@ class GraphBuilder:
         max_info = {prefix: 0 for prefix in prefixes}
         for key in graph.node_map.keys():
-            for prefix in prefixes:
-                # 构建正则表达式，匹配以 "backward.数字" 结尾的键
-                pattern = re.compile(r'^' + re.escape(prefix) + r'\.backward\.(\d+)$')
-                match = pattern.match(key)
-                if match:
-                    num = int(match.group(1))
-                    if num > max_info[prefix]:
-                        max_info[prefix] = num
+            parts = key.split(Const.SEP)
+            if len(parts) > 2 and parts[-2] == Const.BACKWARD:
+                num = int(parts[-1])
+                prefix = Const.SEP.join(parts[:-2])
+                if prefix in max_info and num > max_info[prefix]:
+                    max_info[prefix] = num
         for prefix, num in max_info.items():
             node_id = prefix + Const.SEP + Const.BACKWARD + Const.SEP + str(num)
@@ -277,7 +279,7 @@ class GraphBuilder:
 class GraphExportConfig:
     def __init__(self, graph_n, graph_b=None, tool_tip=None, node_colors=None, micro_steps=None, task='',
-                 overflow_check=False):
+                 overflow_check=False, compare_mode=None):
         self.graph_n = graph_n
         self.graph_b = graph_b
         self.tool_tip = tool_tip
@@ -285,3 +287,21 @@ class GraphExportConfig:
         self.micro_steps = micro_steps
         self.task = task
         self.overflow_check = overflow_check
+        self.compare_mode = compare_mode
+@dataclass
+class GraphInfo:
+    graph: Graph
+    construct_path: str
+    data_path: str
+    stack_path: str
+@dataclass
+class BuildGraphTaskInfo:
+    graph_info_n: GraphInfo
+    graph_info_b: GraphInfo
+    npu_rank: str
+    bench_rank: str
+    time_str: str

msprobe/visualization/builder/msprobe_adapter.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
+# Copyright (c) 2024-2025, Huawei Technologies Co., Ltd.
 # All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0  (the "License");
@@ -12,12 +12,16 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import re
-from msprobe.core.compare.acc_compare import read_op, merge_tensor, get_accuracy
+from msprobe.core.compare.acc_compare import ModeConfig
+from msprobe.core.compare.multiprocessing_compute import CompareRealData
+from msprobe.core.compare.utils import read_op, merge_tensor, get_accuracy, make_result_table
 from msprobe.core.common.utils import set_dump_path, get_dump_mode
 from msprobe.visualization.utils import GraphConst
 from msprobe.core.common.const import Const
-from msprobe.core.compare.acc_compare import ModeConfig
 # 用于将节点名字解析成对应的NodeOp的规则
 op_patterns = [
@@ -53,13 +57,11 @@ def run_real_data(dump_path_param, csv_path, framework, is_cross_frame=False):
     mode_config = ModeConfig(stack_mode=False, auto_analyze=True, fuzzy_match=False, dump_mode=Const.ALL)
     if framework == Const.PT_FRAMEWORK:
-        from msprobe.pytorch.compare.pt_compare import PTComparator
-        return PTComparator(mode_config).do_multi_process(dump_path_param, csv_path)
+        from msprobe.pytorch.compare.pt_compare import read_real_data
+        return CompareRealData(read_real_data, mode_config, is_cross_frame).do_multi_process(dump_path_param, csv_path)
     else:
-        from msprobe.mindspore.compare.ms_compare import MSComparator, MappingConfig
-        ms_comparator = MSComparator(mode_config, MappingConfig())
-        ms_comparator.cross_frame = is_cross_frame
-        return ms_comparator.do_multi_process(dump_path_param, csv_path)
+        from msprobe.mindspore.compare.ms_compare import read_real_data
+        return CompareRealData(read_real_data, mode_config, is_cross_frame).do_multi_process(dump_path_param, csv_path)
 def get_input_output(node_data, node_id):
@@ -119,11 +121,13 @@ def compare_data_fuzzy(data_dict_list1, data_dict_list2):
     return True
-def format_node_data(data_dict, node_id=None):
+def format_node_data(data_dict, node_id=None, compare_mode=None):
     """
     删除节点数据中不需要展示的字段
     """
     del_list = ['requires_grad', 'full_op_name']
+    if GraphConst.MD5_COMPARE != compare_mode:
+        del_list.append(Const.MD5)
     if node_id and GraphConst.BATCH_P2P in node_id:
         del_list.extend(['op', 'peer', 'tag', 'group_id'])
     for _, value in data_dict.items():
@@ -171,7 +175,7 @@ def _format_decimal_string(s):
     """
     使用正则表达式匹配包含数字、小数点和可选的百分号的字符串
     """
-    pattern = re.compile(r'\d{1,20}\.\d{1,20}%?')
+    pattern = re.compile(r'^\d{1,20}\.\d{1,20}%?$')
     matches = pattern.findall(s)
     for match in matches:
         is_percent = match.endswith('%')
@@ -226,3 +230,12 @@ def _format_data(data_dict):
     if all_null:
         data_dict.clear()
         data_dict[GraphConst.VALUE] = GraphConst.NULL
+def get_csv_df(stack_mode, csv_data, compare_mode):
+    """
+    调用acc接口写入csv
+    """
+    dump_mode = GraphConst.GRAPHCOMPARE_MODE_TO_DUMP_MODE_TO_MAPPING.get(compare_mode)
+    return make_result_table(csv_data, dump_mode, stack_mode)

msprobe/visualization/compare/graph_comparator.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# Copyright (c) 2024, Huawei Technologies Co., Ltd.
+# Copyright (c) 2024-2025, Huawei Technologies Co., Ltd.
 # All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0  (the "License");
@@ -14,8 +14,8 @@
 # limitations under the License.
 import re
-from msprobe.visualization.builder.msprobe_adapter import compare_node, get_compare_mode, run_real_data
-from msprobe.visualization.utils import GraphConst, load_json_file, load_data_json_file, get_csv_df
+from msprobe.visualization.builder.msprobe_adapter import compare_node, get_compare_mode, run_real_data, get_csv_df
+from msprobe.visualization.utils import GraphConst, load_json_file, load_data_json_file
 from msprobe.visualization.graph.graph import Graph, NodeOp
 from msprobe.visualization.compare.mode_adapter import ModeAdapter
 from msprobe.core.common.const import Const
@@ -25,14 +25,16 @@ from msprobe.core.common.decorator import recursion_depth_decorator
 class GraphComparator:
     MAX_DEPTH = 1000
-    def __init__(self, graphs, dump_path_param, args, mapping_dict=None):
+    def __init__(self, graphs, dump_path_param, args, is_cross_framework, mapping_dict=None):
         self.graph_n = graphs[0]
         self.graph_b = graphs[1]
         self._parse_param(dump_path_param, args.output_path)
         self.framework = args.framework
+        self.layer_mapping = args.layer_mapping
         self.mapping_dict = mapping_dict
         self.fuzzy_match = args.fuzzy_match
         self.pattern = re.compile(r'\.\d+\.')
+        self.is_cross_framework = is_cross_framework
     def compare(self):
         """
@@ -69,50 +71,56 @@ class GraphComparator:
         node.data[GraphConst.JSON_INDEX_KEY] = precision_index
         node.data.update(other_dict)
-    @recursion_depth_decorator('GraphComparator._compare_nodes', max_depth=MAX_DEPTH)
-    def _compare_nodes(self, node_n):
+    def _compare_nodes(self, node_root):
         """
-        Recursively traverse the nodes of the NPU tree; if a node with the same name is found in Bench, check their ancestors and parameter information, and compare precision data when they are consistent
+        Traverse the nodes of the NPU tree; if a node with the same name is found in Bench, check their ancestors and parameter information, and compare precision data when they are consistent
         Pre-order traversal is used here: when a node is compared its predecessors have already been matched, which provides important information for later fuzzy matching
         """
-        if self.mapping_dict:
-            node_b, ancestors_n, ancestors_b = Graph.mapping_match(node_n, self.graph_b, self.mapping_dict)
-            if node_b:
-                ancestors_n.append(node_n.id)
-                ancestors_b.append(node_b.id)
-                node_n.matched_node_link = ancestors_b
-                node_b.matched_node_link = ancestors_n
-        else:
-            node_b, ancestors = Graph.match(self.graph_n, node_n, self.graph_b)
-            if node_b:
-                ancestors.append(node_b.id)
-                node_n.add_link(node_b, ancestors)
-        if node_b:
-            # Real-data comparison only yields basic information without precision metrics; the multi-process compare interface must be called
-            self._get_and_add_result(node_n, node_b)
-        for subnode in node_n.subnodes:
-            self._compare_nodes(subnode)
-    @recursion_depth_decorator('GraphComparator._compare_nodes_fuzzy', max_depth=MAX_DEPTH)
-    def _compare_nodes_fuzzy(self, node_n):
-        if node_n.op != NodeOp.function_api:
-            # Modules go through fuzzy matching
-            node_b, ancestors_n, ancestors_b = Graph.fuzzy_match(node_n, self.graph_b.node_map.get(node_n.id))
+        def compare_single_node(node_n):
+            if self.layer_mapping:
+                node_b, ancestors_n, ancestors_b = Graph.mapping_match(node_n, self.graph_b, self.mapping_dict)
+                if node_b:
+                    ancestors_n.append(node_n.id)
+                    ancestors_b.append(node_b.id)
+                    node_n.matched_node_link = ancestors_b
+                    node_b.matched_node_link = ancestors_n
+            else:
+                node_b, ancestors = Graph.match(self.graph_n, node_n, self.graph_b)
+                if node_b:
+                    ancestors.append(node_b.id)
+                    node_n.add_link(node_b, ancestors)
             if node_b:
-                self._process_matched_nodes(node_n, node_b, ancestors_n, ancestors_b)
-                # For all APIs in the two matched modules, ignore the dump call count and match by identical name + call order within the module
-                recount_result_n = self._recount_api_node(node_n)
-                recount_result_b = self._recount_api_node(node_b)
-                for recount_node_id, node_id_n in recount_result_n.items():
-                    api_node_n = self.graph_n.node_map.get(node_id_n)
-                    if not api_node_n:
-                        continue
-                    api_node_b, ancestors_n, ancestors_b = Graph.fuzzy_match(
-                        api_node_n, self.graph_b.node_map.get(recount_result_b.get(recount_node_id)))
-                    if api_node_b:
-                        self._process_matched_nodes(api_node_n, api_node_b, ancestors_n, ancestors_b)
-        for sub_node in node_n.subnodes:
-            self._compare_nodes_fuzzy(sub_node)
+                # Real-data comparison only yields basic information without precision metrics; the multi-process compare interface must be called
+                self._get_and_add_result(node_n, node_b)
+            node_list.extend(node_n.subnodes)
+        # NOTE(review): subnodes are appended at the tail and nodes are taken with pop(0), so nodes are
+        # visited level by level (FIFO) rather than in the pre-order the docstring describes; a node is
+        # still visited before any of its subnodes. list.pop(0) shifts the remaining elements on each call.
+        node_list = [node_root]
+        while node_list:
+            compare_single_node(node_list.pop(0))
+    def _compare_nodes_fuzzy(self, node_root):
+        # Visits node_root and its descendants through a FIFO work list. Each node whose op is not
+        # NodeOp.function_api is fuzzy-matched against the graph_b node that has the same id.
+        def compare_single_nodes_fuzzy(node_n):
+            if node_n.op != NodeOp.function_api:
+                # Modules go through fuzzy matching
+                node_b, ancestors_n, ancestors_b = Graph.fuzzy_match(node_n, self.graph_b.node_map.get(node_n.id))
+                if node_b:
+                    self._process_matched_nodes(node_n, node_b, ancestors_n, ancestors_b)
+                    # For all APIs in the two matched modules, ignore the dump call count and match by identical name + call order within the module
+                    recount_result_n = self._recount_api_node(node_n)
+                    recount_result_b = self._recount_api_node(node_b)
+                    for recount_node_id, node_id_n in recount_result_n.items():
+                        api_node_n = self.graph_n.node_map.get(node_id_n)
+                        if not api_node_n:
+                            continue
+                        api_node_b, ancestors_n, ancestors_b = Graph.fuzzy_match(
+                            api_node_n, self.graph_b.node_map.get(recount_result_b.get(recount_node_id)))
+                        if api_node_b:
+                            self._process_matched_nodes(api_node_n, api_node_b, ancestors_n, ancestors_b)
+            node_list.extend(node_n.subnodes)
+        node_list = [node_root]
+        while node_list:
+            compare_single_nodes_fuzzy(node_list.pop(0))
     def _parse_param(self, dump_path_param, output_path):
         self.dump_path_param = dump_path_param
@@ -128,7 +136,7 @@ class GraphComparator:
         if not self.ma.compare_mode == GraphConst.REAL_DATA_COMPARE:
             return
         df = get_csv_df(True, self.ma.csv_data, self.ma.compare_mode)
-        df = run_real_data(self.dump_path_param, df, self.framework, True if self.mapping_dict else False)
+        df = run_real_data(self.dump_path_param, df, self.framework, self.is_cross_framework)
         compare_data_dict = {row[0]: row.tolist() for _, row in df.iterrows()}
         for node in self.ma.compare_nodes:
             precision_index, _ = self.ma.parse_result(node, [compare_data_dict])

msprobe/visualization/compare/mode_adapter.py CHANGED Viewed

@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import math
 import json
 from msprobe.core.common.const import CompareConst, Const
 from msprobe.visualization.utils import ToolTip, GraphConst, str2float
@@ -24,6 +25,12 @@ class ModeAdapter:
         self.csv_data = []
         self.compare_nodes = []
+    @staticmethod
+    def _is_invalid(value):
+        """Return True only when value is a float that is NaN or infinite; any non-float value returns False."""
+        if not isinstance(value, float):
+            return False
+        return math.isnan(value) or math.isinf(value)
     @staticmethod
     def _add_md5_compare_data(node_data, compare_data_dict):
         precision_index = GraphConst.MAX_INDEX_KEY
@@ -48,6 +55,8 @@ class ModeAdapter:
         for key, value in node_data.items():
             if not isinstance(value, dict):
                 continue
+            if value.get(Const.MAX) is None:
+                continue
             compare_data = compare_data_dict.get(key)
             if compare_data:
                 headers = CompareConst.COMPARE_RESULT_HEADER
@@ -66,9 +75,13 @@ class ModeAdapter:
                 if thousandth is not None:
                     numbers.append(thousandth)
                 node_data[key] = value
+            if ModeAdapter._is_invalid(value.get(Const.MAX)) or ModeAdapter._is_invalid(value.get(Const.MIN)):
+                numbers.append(CompareConst.N_A)
         # 双千指标都是None的异常情况
         if not numbers:
             min_thousandth = None
+        elif CompareConst.N_A in numbers:
+            min_thousandth = CompareConst.N_A
         else:
             min_thousandth = min(numbers + [min_thousandth])
         return min_thousandth
@@ -80,6 +93,8 @@ class ModeAdapter:
         for key, data_info in node_data.items():
             if not isinstance(data_info, dict):
                 continue
+            if data_info.get(Const.MAX) is None:
+                continue
             compare_data = compare_data_dict.get(key)
             if compare_data:
                 # 对应比对结果csv的列
@@ -91,6 +106,8 @@ class ModeAdapter:
                     relative_err = str2float(data_info.get(item))
                     max_relative_err = max(max_relative_err, relative_err)
                 node_data[key] = data_info
+            if ModeAdapter._is_invalid(data_info.get(Const.MAX)) or ModeAdapter._is_invalid(data_info.get(Const.MIN)):
+                max_relative_err = GraphConst.MAX_INDEX_KEY
         max_relative_err = 1 if max_relative_err > 1 else max_relative_err
         return max_relative_err
@@ -132,7 +149,11 @@ class ModeAdapter:
             ModeAdapter._check_list_len(compare_data_dict_list, 1)
             min_thousandth_in = ModeAdapter._add_real_compare_data(node.input_data, compare_data_dict_list[0])
             min_thousandth_out = ModeAdapter._add_real_compare_data(node.output_data, compare_data_dict_list[0])
-            if min_thousandth_in is not None and min_thousandth_out is not None:
+            if CompareConst.N_A == min_thousandth_out:
+                change_percentage = GraphConst.MAX_INDEX_KEY
+            elif CompareConst.N_A == min_thousandth_in:
+                change_percentage = GraphConst.MIN_INDEX_KEY
+            elif min_thousandth_in is not None and min_thousandth_out is not None:
                 change_percentage = min_thousandth_in - min_thousandth_out
             else:
                 change_percentage = GraphConst.MIN_INDEX_KEY
@@ -140,6 +161,7 @@ class ModeAdapter:
                 else change_percentage
             precision_index = GraphConst.MAX_INDEX_KEY \
                 if change_percentage > GraphConst.MAX_INDEX_KEY else change_percentage
+        precision_index = self._ignore_precision_index(node.id, precision_index)
         return precision_index, other_dict
     def prepare_real_data(self, node):
@@ -176,3 +198,11 @@ class ModeAdapter:
                 CompareConst.MAX_ABS_ERR: ToolTip.MAX_ABS_ERR,
                 CompareConst.MAX_RELATIVE_ERR: ToolTip.MAX_RELATIVE_ERR}
         return json.dumps(tips)
+    def _ignore_precision_index(self, node_id, precision_index):
+        """
+        Override precision_index for nodes whose second Const.SEP-separated id segment is listed in
+        GraphConst.IGNORE_PRECISION_INDEX: MAX_INDEX_KEY in MD5 compare mode, MIN_INDEX_KEY otherwise.
+        Ids with fewer than two segments, and ids whose segment is not listed, keep precision_index.
+        """
+        node_id_split = node_id.split(Const.SEP)
+        if len(node_id_split) < 2:
+            return precision_index
+        # NOTE(review): node_id is split a second time here; node_id_split[1] already holds this segment.
+        if node_id.split(Const.SEP)[1] in GraphConst.IGNORE_PRECISION_INDEX:
+            return GraphConst.MAX_INDEX_KEY if self.compare_mode == GraphConst.MD5_COMPARE else GraphConst.MIN_INDEX_KEY
+        return precision_index

msprobe/visualization/graph/base_node.py CHANGED Viewed

@@ -87,15 +87,15 @@ class BaseNode:
         self.matched_node_link = ancestors
         node.matched_node_link = ancestors
-    def to_dict(self):
+    def to_dict(self, compare_mode=None):
         """
         输出数据
         """
         result = {
             'id': self.id,
             'node_type': self.op.value,
-            'output_data': format_node_data(self.output_data, self.id),
-            'input_data': format_node_data(self.input_data, self.id),
+            'output_data': format_node_data(self.output_data, self.id, compare_mode),
+            'input_data': format_node_data(self.input_data, self.id, compare_mode),
             'upnode': self.upnode.id if self.upnode else 'None',
             'subnodes': [node.id for node in self.subnodes],
             'matched_node_link': self.matched_node_link,

mindstudio-probe 1.3.0__py3-none-any.whl → 8.1.1__py3-none-any.whl

mindstudio-probe 1.3.0py3-none-any.whl → 8.1.1py3-none-any.whl