mindstudio-probe 8.3.0__py3-none-any.whl → 8.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. {mindstudio_probe-8.3.0.dist-info → mindstudio_probe-8.3.2.dist-info}/METADATA +1 -1
  2. {mindstudio_probe-8.3.0.dist-info → mindstudio_probe-8.3.2.dist-info}/RECORD +44 -54
  3. msprobe/README.md +8 -5
  4. msprobe/core/common/const.py +17 -3
  5. msprobe/core/common/file_utils.py +64 -13
  6. msprobe/core/common/framework_adapter.py +10 -1
  7. msprobe/core/common/utils.py +17 -0
  8. msprobe/core/compare/utils.py +26 -6
  9. msprobe/core/config_check/ckpt_compare/ckpt_comparator.py +6 -1
  10. msprobe/core/hook_manager.py +2 -16
  11. msprobe/core/service.py +5 -16
  12. msprobe/docs/01.installation.md +2 -0
  13. msprobe/docs/02.config_introduction.md +0 -13
  14. msprobe/docs/05.data_dump_PyTorch.md +1 -1
  15. msprobe/docs/07.accuracy_checker_PyTorch.md +13 -13
  16. msprobe/docs/10.accuracy_compare_PyTorch.md +6 -6
  17. msprobe/docs/14.data_parse_PyTorch.md +2 -0
  18. msprobe/docs/19.monitor.md +4 -4
  19. msprobe/docs/21.visualization_PyTorch.md +1 -1
  20. msprobe/docs/25.tool_function_introduction.md +0 -1
  21. msprobe/docs/32.ckpt_compare.md +5 -5
  22. msprobe/mindspore/monitor/module_hook.py +17 -20
  23. msprobe/pytorch/api_accuracy_checker/common/config.py +3 -36
  24. msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +0 -24
  25. msprobe/pytorch/api_accuracy_checker/compare/compare.py +2 -12
  26. msprobe/pytorch/api_accuracy_checker/config.yaml +1 -6
  27. msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +34 -5
  28. msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +12 -132
  29. msprobe/pytorch/common/utils.py +0 -70
  30. msprobe/pytorch/debugger/debugger_config.py +0 -10
  31. msprobe/pytorch/dump/module_dump/module_processer.py +18 -3
  32. msprobe/pytorch/hook_module/api_register.py +14 -3
  33. msprobe/pytorch/monitor/module_hook.py +16 -34
  34. msprobe/pytorch/pt_config.py +2 -51
  35. msprobe/pytorch/pytorch_service.py +10 -14
  36. msprobe/visualization/builder/graph_builder.py +2 -2
  37. msprobe/visualization/builder/graph_merger.py +13 -0
  38. msprobe/visualization/db_utils.py +42 -18
  39. msprobe/visualization/graph/graph.py +13 -9
  40. msprobe/visualization/graph_service.py +20 -10
  41. msprobe/docs/08.accuracy_checker_online_PyTorch.md +0 -295
  42. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/__init__.py +0 -0
  43. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py +0 -205
  44. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +0 -378
  45. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +0 -239
  46. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/dump_dispatch.py +0 -115
  47. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +0 -250
  48. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/torch_ops_config.yaml +0 -63
  49. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/utils.py +0 -198
  50. msprobe/pytorch/attl_manager.py +0 -65
  51. {mindstudio_probe-8.3.0.dist-info → mindstudio_probe-8.3.2.dist-info}/LICENSE +0 -0
  52. {mindstudio_probe-8.3.0.dist-info → mindstudio_probe-8.3.2.dist-info}/WHEEL +0 -0
  53. {mindstudio_probe-8.3.0.dist-info → mindstudio_probe-8.3.2.dist-info}/entry_points.txt +0 -0
  54. {mindstudio_probe-8.3.0.dist-info → mindstudio_probe-8.3.2.dist-info}/top_level.txt +0 -0
msprobe/pytorch/pt_config.py
@@ -35,48 +35,15 @@ from msprobe.pytorch.hook_module.utils import get_ops
 class TensorConfig(BaseConfig):
     def __init__(self, json_config):
         super().__init__(json_config)
-        self.online_run_ut = json_config.get("online_run_ut", False)
-        self.nfs_path = json_config.get("nfs_path", "")
-        self.host = json_config.get("host", "")
-        self.port = json_config.get("port", -1)
-        self.tls_path = json_config.get("tls_path", "./")
-        self.online_run_ut_recompute = json_config.get("online_run_ut_recompute", False)
         self.check_config()
         self._check_summary_mode()
         self._check_file_format()
-        if self.online_run_ut:
-            self._check_online_run_ut()
+
 
     def _check_file_format(self):
         if self.file_format is not None and self.file_format not in ["npy", "bin"]:
             raise Exception("file_format is invalid")
 
-    def _check_online_run_ut(self):
-        if not isinstance(self.online_run_ut, bool):
-            raise Exception(f"online_run_ut: {self.online_run_ut} is invalid.")
-
-        if not isinstance(self.online_run_ut_recompute, bool):
-            raise Exception(f"online_run_ut_recompute: {self.online_run_ut_recompute} is invalid.")
-
-        if self.nfs_path:
-            check_file_or_directory_path(self.nfs_path, isdir=True)
-            return
-
-        if self.tls_path:
-            check_file_or_directory_path(self.tls_path, isdir=True)
-            check_file_or_directory_path(os.path.join(self.tls_path, "client.key"))
-            check_file_or_directory_path(os.path.join(self.tls_path, "client.crt"))
-            check_file_or_directory_path(os.path.join(self.tls_path, "ca.crt"))
-            crl_path = os.path.join(self.tls_path, "crl.pem")
-            if os.path.exists(crl_path):
-                check_file_or_directory_path(crl_path)
-
-        if not isinstance(self.host, str) or not re.match(Const.ipv4_pattern, self.host):
-            raise Exception(f"host: {self.host} is invalid.")
-
-        if not isinstance(self.port, int) or not (0 < self.port <= 65535):
-            raise Exception(f"port: {self.port} is invalid, port range 0-65535.")
-
 
 class StatisticsConfig(BaseConfig):
     def __init__(self, json_config):
@@ -257,12 +224,7 @@ class RunUTConfig(BaseConfig):
         self.white_list = json_config.get("white_list", Const.DEFAULT_LIST)
         self.black_list = json_config.get("black_list", Const.DEFAULT_LIST)
         self.error_data_path = json_config.get("error_data_path", Const.DEFAULT_PATH)
-        self.is_online = json_config.get("is_online", False)
-        self.nfs_path = json_config.get("nfs_path", "")
-        self.host = json_config.get("host", "")
-        self.port = json_config.get("port", -1)
-        self.rank_list = json_config.get("rank_list", Const.DEFAULT_LIST)
-        self.tls_path = json_config.get("tls_path", "./")
+
         self.check_run_ut_config()
 
     @classmethod
@@ -280,22 +242,11 @@ class RunUTConfig(BaseConfig):
         if not os.path.exists(error_data_path):
             raise Exception("error_data_path: %s does not exist" % error_data_path)
 
-    @classmethod
-    def check_nfs_path_config(cls, nfs_path):
-        if nfs_path:
-            FileChecker(nfs_path, FileCheckConst.DIR, FileCheckConst.READ_ABLE).common_check()
-
-    @classmethod
-    def check_tls_path_config(cls, tls_path):
-        if tls_path:
-            FileChecker(tls_path, FileCheckConst.DIR, FileCheckConst.READ_ABLE).common_check()
 
     def check_run_ut_config(self):
         RunUTConfig.check_filter_list_config(Const.WHITE_LIST, self.white_list)
         RunUTConfig.check_filter_list_config(Const.BLACK_LIST, self.black_list)
         RunUTConfig.check_error_data_path_config(self.error_data_path)
-        RunUTConfig.check_nfs_path_config(self.nfs_path)
-        RunUTConfig.check_tls_path_config(self.tls_path)
 
 
 class GradToolConfig(BaseConfig):
msprobe/pytorch/pytorch_service.py
@@ -15,19 +15,20 @@
 
 from msprobe.core.common.utils import Const
 from msprobe.core.service import BaseService
-from msprobe.pytorch.attl_manager import ATTLManager
 from msprobe.pytorch.common.log import logger
-from msprobe.pytorch.common.utils import get_rank_if_initialized, torch_version_above_or_equal_2
+from msprobe.pytorch.common.utils import get_rank_if_initialized
 from msprobe.pytorch.dump.module_dump.module_processer import ModuleProcesser
-from msprobe.pytorch.hook_module.api_register import get_api_register, ApiTemplate, redirect_wait
+from msprobe.pytorch.hook_module.api_register import (
+    get_api_register,
+    ApiTemplate,
+    redirect_wait,
+    reset_dist_collect_func
+)
 from msprobe.pytorch.hook_module.hook_module import HOOKModule
 from msprobe.pytorch.hook_module.pt_hook_manager import PytorchHookManager
 from msprobe.pytorch.hook_module.register_optimizer_hook import register_optimizer_hook
 from msprobe.pytorch.hook_module.script_wrapper import wrap_script_func, preprocess_func
 
-if torch_version_above_or_equal_2:
-    from msprobe.pytorch.api_accuracy_checker.tensor_transport_layer.dump_dispatch import run_ut_dispatch
-
 
 class PytorchService(BaseService):
     @property
@@ -37,7 +38,7 @@ class PytorchService(BaseService):
     @staticmethod
     def _get_current_rank():
         return get_rank_if_initialized()
-
+
     def reset_status(self):
         self._reset_status()
 
@@ -45,12 +46,10 @@ class PytorchService(BaseService):
         self.logger = logger
         self.api_register = get_api_register()
         self.module_processor = ModuleProcesser(self.data_collector.scope)
-        self.attl_manager = ATTLManager(self.config)
-        self.hook_manager = PytorchHookManager(self.data_collector, self.config, self.attl_manager)
+        self.hook_manager = PytorchHookManager(self.data_collector, self.config)
         self.api_template = ApiTemplate
 
     def _register_hook(self):
-        self.attl_manager.attl_init()
         if self._is_mix_level:
             register_optimizer_hook(self.data_collector)
 
@@ -65,11 +64,8 @@ class PytorchService(BaseService):
         self.module_processor.register_module_hook(self.model, self.build_hook)
         self.logger.info(f"The module {self.config.task} hook function is successfully mounted to the model.")
 
-    def _run_ut_dispatch(self, status):
-        if torch_version_above_or_equal_2:
-            run_ut_dispatch(self.attl_manager.attl, status, self.config.online_run_ut_recompute)
-
     def _reset_status(self):
         super()._reset_status()
         ModuleProcesser.reset_module_stats()
         HOOKModule.reset_module_stats()
+        reset_dist_collect_func()
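Note: reset_dist_collect_func joins the other reset calls here, presumably because distributed-op collection state lives at module level in api_register and must be cleared explicitly between debugger sessions. A hedged sketch of that reset idiom (the names below are illustrative, not msprobe internals):

```python
# Illustrative module-level collection state plus an explicit reset hook,
# mirroring the reset_module_stats / reset_dist_collect_func idiom.
_collected_ops = []

def collect(op_name):
    _collected_ops.append(op_name)

def reset_collect():
    # Called from the service's _reset_status so a new session starts clean.
    _collected_ops.clear()

collect("dist.all_reduce")
reset_collect()
assert not _collected_ops
```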
msprobe/visualization/builder/graph_builder.py
@@ -298,8 +298,8 @@ class GraphBuilder:
         no_recompute_map = GraphBuilder._get_no_recompute_map(graph, id_prefixes)
         if not no_recompute_map:
             return
-        # Deep-copy the non-recompute node map for the backward pass
-        no_recompute_ids_b = copy.deepcopy(no_recompute_map)
+        # Copy the non-recompute node map for the backward pass
+        no_recompute_ids_b = {node_id: list(node_list) for node_id, node_list in no_recompute_map.items()}
 
         del_indexes = []
         for node_id, id_prefix in recompute_map.items():
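Note: swapping copy.deepcopy for a dict comprehension is safe here because each value is a flat list; a one-level copy already decouples the two maps without walking nested objects. A minimal sketch of the difference (the map contents are illustrative, not taken from a real graph):

```python
# A one-level copy is enough when values are flat lists of items
# that are never mutated in place.
no_recompute_map = {"Module.layer1.0": ["node_a", "node_b"]}

copied = {k: list(v) for k, v in no_recompute_map.items()}
copied["Module.layer1.0"].append("node_c")

print(no_recompute_map["Module.layer1.0"])  # ['node_a', 'node_b'] -- original untouched
```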
msprobe/visualization/builder/graph_merger.py
@@ -146,6 +146,7 @@ class BaseGraphMerger:
                              GraphConst.APIS_BETWEEN_MODULES_ALL_RANKS,
                              id_accumulation=True)
         all_collection_node = main_graph_result.graph.get_node(all_collection_node_id)
+        all_collection_node.upnode = main_graph_result.graph.root
         new_main_root_sub_nodes.append(all_collection_node)
         # Apis_Between_Modules.0 --> Apis_Between_Modules_Rank0.0
         origin_main_node_id = main_node.id
@@ -377,6 +378,12 @@ class PPMerger(BaseGraphMerger):
             logger.info('Unable to get pp groups based on Distributed Api (batch_isend_irecv, send, or isend), '
                         'generate pp groups using parallel param "rank_size", "tp" and "pp".')
             _, pp_groups = self.get_default_groups()
+        elif len(pp_groups[0]) != self.parallel_param.pp:
+            logger.warning(f'Based on Distributed Api (batch_isend_irecv, send, or isend), '
+                           f'the resulting pp groups={pp_groups}, '
+                           f'its length is not equal to the parallel param "pp"({self.parallel_param.pp}) you defined, '
+                           f'generate pp groups using parallel param "rank_size", "tp" and "pp".')
+            _, pp_groups = self.get_default_groups()
         logger.info(f'{self.log_prefix} All pp groups is {pp_groups}.')
         return pp_groups
 
@@ -657,6 +664,12 @@ class TPMerger(BaseGraphMerger):
             logger.info('Unable to get tp groups based on Distributed Api (reduce_scatter or all_reduce), '
                         'generate tp groups using parallel param "rank_size", "tp" and "pp".')
             tp_groups, _ = self.get_default_groups()
+        elif len(tp_groups[0]) != self.parallel_param.tp:
+            logger.warning(f'Based on Distributed Api (reduce_scatter or all_reduce), '
+                           f'the resulting tp groups={tp_groups}, '
+                           f'its length is not equal to the parallel param "tp"({self.parallel_param.tp}) you defined, '
+                           f'generate tp groups using parallel param "rank_size", "tp" and "pp".')
+            tp_groups, _ = self.get_default_groups()
         logger.info(f'{self.log_prefix} All tp groups is {tp_groups}.')
         return tp_groups
 
msprobe/visualization/db_utils.py
@@ -17,6 +17,7 @@ import os
 import sqlite3
 import json
 import re
+import time
 from msprobe.core.common.log import logger
 from msprobe.core.common.file_utils import change_mode, check_path_before_create, FileChecker
 from msprobe.core.common.const import FileCheckConst
@@ -133,33 +134,56 @@ def create_insert_sql_from_dict(table_name, columns_dict, ignore_insert=False):
 
 
 def to_db(db_path, create_table_sql, insert_sql, data, db_insert_size=1000):
+    max_retries = 10
+    initial_delay = 0.1
     if not os.path.exists(db_path):
         check_path_before_create(db_path)
     else:
         FileChecker(db_path, FileCheckConst.FILE, FileCheckConst.READ_WRITE_ABLE,
                     FileCheckConst.DB_SUFFIX).common_check()
-    try:
-        conn = sqlite3.connect(db_path)
-    except sqlite3.Error as e:
-        logger.error(f"Unable to create database connection: {e}")
-        raise RuntimeError("Unable to create database connection") from e
 
-    try:
-        cursor = conn.cursor()
-        cursor.execute(create_table_sql)
-        if len(data) == 1:
-            cursor.execute(insert_sql, data[0])
-            conn.commit()
-        else:
+    retry_count = 0
+    current_delay = initial_delay
+
+    while retry_count <= max_retries:
+        conn = None
+        try:
+            conn = sqlite3.connect(db_path, timeout=30)
+            cursor = conn.cursor()
+            # Enable WAL mode to improve multi-process read/write concurrency
+            cursor.execute("PRAGMA journal_mode=WAL")
+            cursor.execute("PRAGMA synchronous=NORMAL")
+            cursor.execute(create_table_sql)
             for i in range(0, len(data), db_insert_size):
                 batch = data[i:i + db_insert_size]
                 cursor.executemany(insert_sql, batch)
-        conn.commit()
-    except sqlite3.Error as e:
-        logger.error(f"An sqlite3 error occurred: {e}")
-        raise RuntimeError("An sqlite3 error occurred") from e
-    finally:
-        conn.close()
+            conn.commit()
+            return
+        except sqlite3.OperationalError as e:
+            if "database is locked" in str(e).lower():
+                retry_count += 1
+                if retry_count > max_retries:
+                    logger.error(f"Database lock conflict retry attempts exhausted ({max_retries}): {e}")
+                    raise RuntimeError(f"DB lock retry exhausted: {e}") from e
+
+                logger.warning(
+                    f"DB lock conflict (retry {retry_count}/{max_retries}), wait {current_delay:.2f}s : {e}"
+                )
+                time.sleep(current_delay)
+                current_delay *= 2
+                continue
+
+            logger.error(f"An sqlite3 error occurred: {e}")
+            raise e
+        except sqlite3.Error as e:
+            logger.error(f"An sqlite3 error occurred: {e}")
+            raise e
+        except Exception as e:
+            logger.error(f"An unknown error occurred: {e}")
+            raise e
+        finally:
+            if conn:
+                conn.close()
 
 
 def add_table_index(db_path):
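Note: the rewritten to_db retries only the one transient failure mode ("database is locked") with exponential backoff, and WAL mode lets concurrent readers proceed while a single writer commits. The same pattern in standalone form (the table name and data below are illustrative):

```python
import sqlite3
import time

def insert_with_retry(db_path, rows, max_retries=10, delay=0.1):
    # Retry only on lock contention; re-raise any other sqlite3 error.
    for attempt in range(max_retries + 1):
        conn = None
        try:
            conn = sqlite3.connect(db_path, timeout=30)
            conn.execute("PRAGMA journal_mode=WAL")    # readers no longer block the writer
            conn.execute("PRAGMA synchronous=NORMAL")  # fewer fsyncs; safe with WAL
            conn.execute("CREATE TABLE IF NOT EXISTS t (k TEXT, v REAL)")
            conn.executemany("INSERT INTO t VALUES (?, ?)", rows)
            conn.commit()
            return
        except sqlite3.OperationalError as e:
            if "database is locked" not in str(e).lower() or attempt == max_retries:
                raise
            time.sleep(delay * (2 ** attempt))         # exponential backoff
        finally:
            if conn:
                conn.close()

insert_with_retry("demo.db", [("loss", 0.12), ("grad_norm", 1.5)])
```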
msprobe/visualization/graph/graph.py
@@ -126,21 +126,25 @@ class Graph:
 
     def get_sorted_nodes(self):
         """
-        Obtain a sorted node list via a depth-first traversal of the graph
+        Obtain a sorted node list via a depth-first traversal of the graph, implemented with an explicit stack to avoid exceeding the recursion depth
        """
         visited = set()
         order = []
+        stack = [(self.root, False)]
 
-        @recursion_depth_decorator('msprobe.visualization.graph.graph.Graph.get_nodes_order.visit', max_depth=500)
-        def visit(node):
+        while stack:
+            node, processed = stack.pop()
             if node.id in visited:
-                return
-            visited.add(node.id)
-            for sub_node in node.subnodes:
-                visit(sub_node)
-            order.append(node)
+                continue
+            if processed:
+                visited.add(node.id)
+                order.append(node)
+            else:
+                stack.append((node, True))
+                for sub_node in reversed(node.subnodes):
+                    if sub_node.id not in visited:
+                        stack.append((sub_node, False))
 
-        visit(self.root)
         return order
 
     def add_node(self, node_op, node_id, up_node=None, id_accumulation=False):
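Note: the (node, processed) two-phase stack is a standard way to get post-order without recursion: a node is pushed once unprocessed, re-pushed as processed, and emitted only on the second visit, after its children. A minimal standalone version of the pattern on a toy tree:

```python
# Two-phase explicit-stack post-order traversal, same shape as get_sorted_nodes.
class Node:
    def __init__(self, id, subnodes=()):
        self.id, self.subnodes = id, list(subnodes)

def post_order(root):
    visited, order, stack = set(), [], [(root, False)]
    while stack:
        node, processed = stack.pop()
        if node.id in visited:
            continue
        if processed:                            # second visit: children already emitted
            visited.add(node.id)
            order.append(node.id)
        else:                                    # first visit: re-push, then children
            stack.append((node, True))
            for sub in reversed(node.subnodes):  # reversed keeps left-to-right order
                stack.append((sub, False))
    return order

tree = Node("root", [Node("a", [Node("a1")]), Node("b")])
print(post_order(tree))  # ['a1', 'a', 'b', 'root']
```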
msprobe/visualization/graph_service.py
@@ -242,11 +242,15 @@ def _compare_graph_ranks(input_param, args, step=None):
 def _get_compare_graph_results(input_param, serializable_args, step, pool, err_call):
     dump_rank_n = input_param.get('npu_path')
     dump_rank_b = input_param.get('bench_path')
-    npu_ranks = sorted(check_and_return_dir_contents(dump_rank_n, Const.RANK))
-    bench_ranks = sorted(check_and_return_dir_contents(dump_rank_b, Const.RANK))
+    npu_ranks = sort_rank_number_strings(check_and_return_dir_contents(dump_rank_n, Const.RANK))
+    bench_ranks = sort_rank_number_strings(check_and_return_dir_contents(dump_rank_b, Const.RANK))
     if npu_ranks != bench_ranks:
-        logger.error('The number of ranks in the two runs are different. Unable to match the ranks.')
-        raise CompareException(CompareException.INVALID_PATH_ERROR)
+        intersection_ranks = sort_rank_number_strings(list(set(npu_ranks) & set(bench_ranks)))
+        if not intersection_ranks:
+            logger.error('The ranks in the two runs are completely different. Unable to match the ranks.')
+            raise CompareException(CompareException.INVALID_PATH_ERROR)
+        npu_ranks = intersection_ranks
+        bench_ranks = intersection_ranks
     compare_graph_results = []
     if is_real_data_compare(input_param, npu_ranks, bench_ranks):
         mp_task_dict = {}
@@ -282,12 +286,16 @@ def _compare_graph_steps(input_param, args):
     dump_step_n = input_param.get('npu_path')
     dump_step_b = input_param.get('bench_path')
 
-    npu_steps = sorted(check_and_return_dir_contents(dump_step_n, Const.STEP))
-    bench_steps = sorted(check_and_return_dir_contents(dump_step_b, Const.STEP))
+    npu_steps = check_and_return_dir_contents(dump_step_n, Const.STEP)
+    bench_steps = check_and_return_dir_contents(dump_step_b, Const.STEP)
 
     if npu_steps != bench_steps:
-        logger.error('The number of steps in the two runs is different. Unable to match the steps.')
-        raise CompareException(CompareException.INVALID_PATH_ERROR)
+        intersection_steps = sort_rank_number_strings(list(set(npu_steps) & set(bench_steps)))
+
+        if not intersection_steps:
+            logger.error('The steps in the two runs are completely different. Unable to match the steps.')
+            raise CompareException(CompareException.INVALID_PATH_ERROR)
+        npu_steps = intersection_steps
 
     args.step_list = sorted([get_step_or_rank_int(step) for step in npu_steps])
 
@@ -355,8 +363,10 @@ def _build_graph_steps(dump_steps_path, args):
         _build_graph_ranks(dump_ranks_path, args, step)
 
 
-def _compare_and_export_graph(graph_task_info, input_param, args):
+def _compare_and_export_graph(graph_task_info, input_param, args, step=None):
     result = _run_graph_compare(graph_task_info, input_param, args)
+    if step is not None:
+        result.step = get_step_or_rank_int(step)
     return _export_compare_graph_result(args, result)
 
 
@@ -413,7 +423,7 @@ def _compare_graph_ranks_parallel(input_param, args, step=None):
                            _build_graph_info(os.path.join(bench_path, f'rank{graph_b.root.rank}'), args, graph_b),
                            f'rank{graph_n.root.rank}', f'rank{graph_b.root.rank}', current_time)
         export_res_task_list.append(pool.apply_async(_compare_and_export_graph,
-                                                     args=(graph_task_info, input_param, serializable_args),
+                                                     args=(graph_task_info, input_param, serializable_args, step),
                                                      error_callback=err_call))
     export_res_list = [res.get() for res in export_res_task_list]
     if any(export_res_list):
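Note: threading step through _compare_and_export_graph lets each pooled task tag its own result, since workers launched via apply_async receive no ambient context from the caller. The pattern in miniature (function and field names are illustrative):

```python
from multiprocessing import Pool

def work(payload, step=None):
    result = {"payload": payload}
    if step is not None:  # tag the result inside the worker itself
        result["step"] = step
    return result

if __name__ == "__main__":
    with Pool(2) as pool:
        tasks = [pool.apply_async(work, args=(f"graph{i}", 3)) for i in range(2)]
        print([t.get() for t in tasks])
        # [{'payload': 'graph0', 'step': 3}, {'payload': 'graph1', 'step': 3}]
```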