mindstudio-probe 8.2.0__py3-none-any.whl → 8.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. {mindstudio_probe-8.2.0.dist-info → mindstudio_probe-8.3.0.dist-info}/METADATA +2 -2
  2. {mindstudio_probe-8.2.0.dist-info → mindstudio_probe-8.3.0.dist-info}/RECORD +90 -79
  3. msprobe/README.md +7 -5
  4. msprobe/core/common/const.py +6 -0
  5. msprobe/core/common/db_manager.py +35 -4
  6. msprobe/core/common/file_utils.py +105 -27
  7. msprobe/core/common/framework_adapter.py +7 -6
  8. msprobe/core/common/megatron_utils.py +59 -0
  9. msprobe/core/common/utils.py +14 -3
  10. msprobe/core/compare/find_first/analyzer.py +8 -7
  11. msprobe/core/compare/find_first/graph.py +11 -3
  12. msprobe/core/compare/find_first/utils.py +2 -1
  13. msprobe/core/compare/highlight.py +13 -6
  14. msprobe/core/compare/multiprocessing_compute.py +17 -10
  15. msprobe/core/compare/utils.py +14 -5
  16. msprobe/core/data_dump/data_collector.py +18 -21
  17. msprobe/core/data_dump/data_processor/pytorch_processor.py +43 -20
  18. msprobe/core/data_dump/json_writer.py +18 -8
  19. msprobe/core/data_dump/scope.py +4 -6
  20. msprobe/core/hook_manager.py +37 -3
  21. msprobe/core/service.py +18 -5
  22. msprobe/core/single_save/single_comparator.py +16 -3
  23. msprobe/docs/01.installation.md +7 -5
  24. msprobe/docs/02.config_introduction.md +14 -1
  25. msprobe/docs/04.kernel_dump_PyTorch.md +1 -1
  26. msprobe/docs/06.data_dump_MindSpore.md +1 -1
  27. msprobe/docs/08.accuracy_checker_online_PyTorch.md +295 -0
  28. msprobe/docs/10.accuracy_compare_PyTorch.md +46 -5
  29. msprobe/docs/14.data_parse_PyTorch.md +1 -1
  30. msprobe/docs/15.free_benchmarking_PyTorch.md +1 -1
  31. msprobe/docs/19.monitor.md +2 -0
  32. msprobe/docs/21.visualization_PyTorch.md +15 -80
  33. msprobe/docs/22.visualization_MindSpore.md +20 -104
  34. msprobe/docs/23.generate_operator_PyTorch.md +1 -1
  35. msprobe/docs/25.tool_function_introduction.md +1 -0
  36. msprobe/docs/26.data_dump_PyTorch_baseline.md +7 -7
  37. msprobe/docs/img/visualization/vis_browser_1.png +0 -0
  38. msprobe/docs/img/visualization/vis_match_info.png +0 -0
  39. msprobe/docs/img/visualization/vis_precision_info.png +0 -0
  40. msprobe/docs/img/visualization/vis_search_info.png +0 -0
  41. msprobe/docs/img/visualization/vis_show_info.png +0 -0
  42. msprobe/docs/img/visualization/vis_showcase.png +0 -0
  43. msprobe/docs/img/visualization/vis_unmatch_info.png +0 -0
  44. msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +1 -1
  45. msprobe/mindspore/api_accuracy_checker/generate_op_script/op_generator.py +1 -1
  46. msprobe/mindspore/cell_processor.py +33 -5
  47. msprobe/mindspore/compare/common_dir_compare.py +22 -26
  48. msprobe/mindspore/compare/utils.py +1 -2
  49. msprobe/mindspore/debugger/precision_debugger.py +1 -1
  50. msprobe/mindspore/dump/cell_dump_process.py +73 -62
  51. msprobe/mindspore/dump/graph_mode_cell_dump.py +21 -10
  52. msprobe/mindspore/dump/hook_cell/ms_hook_manager.py +2 -0
  53. msprobe/msprobe.py +6 -4
  54. msprobe/pytorch/api_accuracy_checker/common/config.py +36 -3
  55. msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +24 -0
  56. msprobe/pytorch/api_accuracy_checker/compare/compare.py +12 -2
  57. msprobe/pytorch/api_accuracy_checker/config.yaml +6 -1
  58. msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +1 -1
  59. msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +132 -12
  60. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/__init__.py +0 -0
  61. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py +205 -0
  62. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +378 -0
  63. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +239 -0
  64. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/dump_dispatch.py +115 -0
  65. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +250 -0
  66. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/torch_ops_config.yaml +63 -0
  67. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/utils.py +198 -0
  68. msprobe/pytorch/attl_manager.py +65 -0
  69. msprobe/pytorch/common/utils.py +22 -2
  70. msprobe/pytorch/compare/utils.py +3 -3
  71. msprobe/pytorch/debugger/debugger_config.py +10 -0
  72. msprobe/pytorch/dump/module_dump/hook_wrapper.py +34 -7
  73. msprobe/pytorch/dump/module_dump/module_processer.py +23 -10
  74. msprobe/pytorch/hook_module/api_register.py +6 -1
  75. msprobe/pytorch/monitor/module_hook.py +28 -9
  76. msprobe/pytorch/online_dispatch/dispatch.py +42 -24
  77. msprobe/pytorch/pt_config.py +57 -2
  78. msprobe/pytorch/pytorch_service.py +11 -2
  79. msprobe/visualization/builder/graph_builder.py +170 -64
  80. msprobe/visualization/builder/graph_merger.py +0 -1
  81. msprobe/visualization/builder/msprobe_adapter.py +1 -1
  82. msprobe/visualization/db_utils.py +25 -2
  83. msprobe/visualization/graph/base_node.py +0 -24
  84. msprobe/visualization/graph/graph.py +5 -14
  85. msprobe/visualization/graph_service.py +29 -53
  86. msprobe/visualization/utils.py +11 -1
  87. {mindstudio_probe-8.2.0.dist-info → mindstudio_probe-8.3.0.dist-info}/LICENSE +0 -0
  88. {mindstudio_probe-8.2.0.dist-info → mindstudio_probe-8.3.0.dist-info}/WHEEL +0 -0
  89. {mindstudio_probe-8.2.0.dist-info → mindstudio_probe-8.3.0.dist-info}/entry_points.txt +0 -0
  90. {mindstudio_probe-8.2.0.dist-info → mindstudio_probe-8.3.0.dist-info}/top_level.txt +0 -0
@@ -26,8 +26,7 @@ def read_npy_data(dir_path, file_name):
         return None
 
     data_path = os.path.join(dir_path, file_name)
-    path_checker = FileChecker(data_path, FileCheckConst.FILE, FileCheckConst.READ_ABLE,
-                               FileCheckConst.NUMPY_SUFFIX, False)
+    path_checker = FileChecker(data_path, FileCheckConst.FILE, FileCheckConst.READ_ABLE, FileCheckConst.NUMPY_SUFFIX)
     data_path = path_checker.common_check()
     data_value = load_npy(data_path)
     return data_value
@@ -182,7 +182,7 @@ class PrecisionDebugger(BasePrecisionDebugger):
         with ThreadSafe():
             instance.service.step()
         if is_graph_mode_cell_dump_allowed(instance.config):
-            GraphModeCellDump.step()
+            GraphModeCellDump.step(instance.config.dump_path, instance.config.step, instance.config.task)
         if enable_dynamic_kbyk_dump and instance.config.level_ori == Const.LEVEL_L2:
             _dump_step(1)
         if cls._is_kernel_dump() and _msprobe_c:
@@ -46,9 +46,11 @@ KEY_FORWARD = CoreConst.FORWARD
 KEY_BACKWARD = CoreConst.BACKWARD
 KEY_INPUT = CoreConst.INPUT
 KEY_OUTPUT = CoreConst.OUTPUT
-KEY_DUMP_TENSOR_DATA = "dump_tensor_data_"
+KEY_DUMP_TENSOR_DATA = "dump_tensor_data/"
 KEY_STATISTIC_CSV = "statistic.csv"
 KEY_TD_FLAG = "td_flag"
+# Timeout (in seconds) for detecting that dumped files have landed on disk
+TIMEOUT = 600
 td = ops.TensorDump()
 if (ms.__version__ >= "2.5.0"):
     td_in = ops.TensorDump("in")
@@ -574,28 +576,33 @@ def generate_stack_info(path):
     logger.info(f"Stack data saved to {json_path}")
 
 
-def is_download_finished(directory, interval=3):
+def is_download_finished(directory, save_flag):
     """
     Check whether data in the given directory has finished downloading.
     :param directory: path of the directory to check
-    :param interval: check interval in seconds, default 3
+    :param save_flag: name prefix of the flag file written once the data has been flushed to disk
     :return: True if the data has finished downloading, otherwise False
     """
+    # Sleep briefly to avoid frequent disk I/O reads
+    time.sleep(0.5)
+    logger.info("Waiting for download...")
     # Check that the directory exists
     if not os.path.exists(directory):
         logger.warning(f"The specified directory {directory} does not exist.")
         return False
-    initial_modification_time = os.path.getmtime(directory)
-    time.sleep(interval)
-    current_modification_time = os.path.getmtime(directory)
-    # Compare the initial and current modification times
-    if current_modification_time > initial_modification_time:
-        return False
-    else:
-        return True
+
+    # Iterate over all entries in the directory
+    for entry_path in os.listdir(directory):
+        if entry_path.startswith(save_flag):
+            return True
+
+    return False
+
 
+def process_step(dump_path, flag_path, step, step_list):
+    if step not in step_list:
+        return
 
-def process(dump_path):
     if not os.path.exists(dump_path):
         logger.warning('No grap cell data is dumped.')
     create_directory(dump_path)
@@ -606,32 +613,38 @@ def process(dump_path):
     if rank_id is not None:
         rank_dir = CoreConst.RANK + str(rank_id)
 
-    step_dir_list = os.listdir(dump_path)
-    for step_dir in step_dir_list:
-        step_path = os.path.join(dump_path, step_dir)
-        rank_path = os.path.join(step_path, rank_dir)
-        npy_path = os.path.join(rank_path, CoreConst.DUMP_TENSOR_DATA)
-        while True:
-            is_finished = is_download_finished(npy_path)
-            if not is_finished:
-                logger.info("There is data being downloaded in the specified directory, continue checking...")
-            else:
-                logger.info("There is no data being downloaded in the specified directory, Stop checking.")
-                break
-        logger.info("==========Start processing data that has already been stored on the disk!==========")
-        rename_filename(path=npy_path)
-        generate_construct(npy_path)
-        generate_dump_info(npy_path)
-        generate_stack_info(npy_path)
-        # Single-card scenario: the rank directory is simply named "rank"
-        if rank_id is None:
-            new_rank_path = os.path.join(step_path, CoreConst.RANK)
-            try:
-                move_directory(rank_path, new_rank_path)
-                logger.info(f"Directory was successfully renamed to: {new_rank_path}")
-            except Exception as e:
-                logger.warning(f"Failed to renamed to {new_rank_path}: {e}")
-        logger.info("==========JSON file generation completed!==========")
+    step_dir = CoreConst.STEP + str(step)
+
+    step_path = os.path.join(dump_path, step_dir)
+    rank_path = os.path.join(step_path, rank_dir)
+    npy_path = os.path.join(rank_path, CoreConst.DUMP_TENSOR_DATA)
+    save_finish_flag = f"step_{step}"
+    start_time = time.time()
+    while True:
+        is_finished = is_download_finished(flag_path, save_finish_flag)
+        if not is_finished:
+            logger.info("There is data being downloaded in the specified directory, continue checking...")
+        else:
+            logger.info("There is no data being downloaded in the specified directory, Stop checking.")
+            break
+        elapsed_time = time.time() - start_time
+        if elapsed_time > TIMEOUT:
+            logger.error(f"Check timed out after {TIMEOUT} seconds. Exiting.")
+            return
+    logger.info(f"==========Start processing step_{step}'s data that has already been stored on the disk!==========")
+    rename_filename(path=npy_path)
+    generate_construct(npy_path)
+    generate_dump_info(npy_path)
+    generate_stack_info(npy_path)
+    # Single-card scenario: the rank directory is simply named "rank"
+    if rank_id is None:
+        new_rank_path = os.path.join(step_path, CoreConst.RANK)
+        try:
+            move_directory(rank_path, new_rank_path)
+            logger.info(f"Directory was successfully renamed to: {new_rank_path}")
+        except Exception as e:
+            logger.warning(f"Failed to renamed to {new_rank_path}: {e}")
+    logger.info(f"==========Step_{step}'s JSON file generation completed!==========")
 
 
 # Remove the trailing comma at the end of each line of the csv file
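The hunks above replace the old mtime-based wait in is_download_finished with a flag-file check bounded by the new TIMEOUT constant. A minimal, self-contained Python sketch of that polling pattern (hypothetical helper name and paths, not code from the package):

    import os
    import time

    TIMEOUT = 600  # seconds, mirroring the constant added in cell_dump_process.py

    def wait_for_step_flag(flag_path, step):
        """Poll flag_path until an entry named 'step_<step>*' appears, or give up after TIMEOUT."""
        save_finish_flag = f"step_{step}"
        start_time = time.time()
        while True:
            time.sleep(0.5)  # short pause so the loop does not hammer the disk with listdir calls
            if os.path.isdir(flag_path) and any(
                    entry.startswith(save_finish_flag) for entry in os.listdir(flag_path)):
                return True
            if time.time() - start_time > TIMEOUT:
                return False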
@@ -689,7 +702,10 @@ def merge_file(dump_path, rank_dir, file_dict):
                     " and the index is out of bounds.")
 
 
-def process_statistics(dump_path):
+def process_statistics_step(dump_path, step, step_list):
+    if step_list and step not in step_list:
+        return
+
     if not os.path.exists(dump_path):
         logger.warning('No grap cell data is dumped.')
     create_directory(dump_path)
@@ -723,25 +739,24 @@
 
     rank_dir = rank_dir_kbk.replace(CoreConst.REPLACEMENT_CHARACTER, '')
     dir_list = os.listdir(dump_path)
-    step_dir_list = [d for d in dir_list if d.startswith(CoreConst.STEP)]
-    for step_dir in step_dir_list:
-        step_path = os.path.join(dump_path, step_dir)
-        rank_path = os.path.join(step_path, rank_dir)
-        csv_path = os.path.join(rank_path, KEY_STATISTIC_CSV)
-        logger.info("==========Start processing data csv!==========")
-        generate_construct(csv_path)
-        generate_dump_info(csv_path)
-        generate_stack_info(csv_path)
-        remove_path(rank_path_kbk)
-        # Single-card scenario: the rank directory is simply named "rank"
-        if rank_id is None:
-            new_rank_path = os.path.join(step_path, CoreConst.RANK)
-            try:
-                move_directory(rank_path, new_rank_path)
-                logger.info(f"Directory was successfully renamed to: {new_rank_path}")
-            except Exception as e:
-                logger.warning(f"Failed to renamed to {new_rank_path}: {e}")
-        logger.info("==========JSON file generation completed!==========")
+    step_dir = CoreConst.STEP + str(step)
+    step_path = os.path.join(dump_path, step_dir)
+    rank_path = os.path.join(step_path, rank_dir)
+    csv_path = os.path.join(rank_path, KEY_STATISTIC_CSV)
+    logger.info("==========Start processing data csv!==========")
+    generate_construct(csv_path)
+    generate_dump_info(csv_path)
+    generate_stack_info(csv_path)
+    remove_path(rank_path_kbk)
+    # Single-card scenario: the rank directory is simply named "rank"
+    if rank_id is None:
+        new_rank_path = os.path.join(step_path, CoreConst.RANK)
+        try:
+            move_directory(rank_path, new_rank_path)
+            logger.info(f"Directory was successfully renamed to: {new_rank_path}")
+        except Exception as e:
+            logger.warning(f"Failed to renamed to {new_rank_path}: {e}")
+    logger.info("==========JSON file generation completed!==========")
 
 
 def get_yaml_keys(yaml_data):
@@ -922,7 +937,3 @@ def start(config: CellDumpConfig):
         cell.data_mode = data_mode
 
     logger.info("==========The cell_dump_process_start phase is Finished!==========")
-    if dump_task == CoreConst.TENSOR:
-        atexit.register(process, dump_path=dump_path)
-    if dump_task == CoreConst.STATISTICS:
-        atexit.register(process_statistics, dump_path=dump_path)
@@ -14,7 +14,8 @@
 # limitations under the License.
 
 import os
-
+import glob
+import tempfile
 import mindspore as ms
 from mindspore import hal, ops, Tensor
 from mindspore.ops.primitive import _run_op
@@ -28,6 +29,7 @@ import msprobe.mindspore.dump.cell_dump_process as cellDumperWithDumpGradient
 import msprobe.mindspore.dump.cell_dump_with_insert_gradient as cellDumperWithInsertGradient
 
 tensordump_flag = True
+DEFAULT_RANK_DIR = "rank0"
 try:
     from mindspore._c_expression import _tensordump_set_step
 except ImportError:
@@ -41,8 +43,6 @@ except ImportError:
 
 
 class GraphModeCellDump:
-    task = CoreConst.STATISTICS
-
     def __init__(self, config: DebuggerConfig, model, strict=True):
         self.net = model
         self.white_list = []
@@ -55,29 +55,40 @@ class GraphModeCellDump:
         self.list = config.list
         self.data_mode = config.data_mode
         self.file_format = config.file_format
-        GraphModeCellDump.task = config.task
         self.summary_mode = config.summary_mode
+        self.task = config.task
         self.check_config(strict)
         self.set_step()
 
     @staticmethod
-    def step():
+    def step(dump_path, step_list, task):
         # Update the TensorDump step
-        if GraphModeCellDump.task == CoreConst.TENSOR:
+        if task == CoreConst.TENSOR:
             hal.synchronize()
             temp_tensor = ms.Tensor([1], dtype=ms.float32)
-            step_flag = "<tensordump-update-step>"
-            _run_op(ops.TensorDump(), "TensorDump", (step_flag, temp_tensor))
-            ops.tensordump(step_flag, temp_tensor)
+            rank_id = os.environ.get('RANK_ID')
+            rank_dir = DEFAULT_RANK_DIR
+
+            if rank_id is not None:
+                rank_dir = CoreConst.RANK + str(rank_id)
+
+            with tempfile.TemporaryDirectory(dir=dump_path, prefix=rank_dir) as temp_dir:
+                save_file_flag = f"{temp_dir}/step_{Runtime.step_count}"
+                _run_op(ops.TensorDump(), "TensorDump", (save_file_flag, temp_tensor))
+                step_flag = "<tensordump-update-step>"
+                _run_op(ops.TensorDump(), "TensorDump", (step_flag, temp_tensor))
+                ops.tensordump(step_flag, temp_tensor)
+                cellDumperWithDumpGradient.process_step(dump_path, temp_dir, Runtime.step_count, step_list)
 
         # Update the step count of the static-graph KBK dump
-        if GraphModeCellDump.task == CoreConst.STATISTICS:
+        if task == CoreConst.STATISTICS:
             if not graph_step_flag:
                 raise Exception(
                     "Importing _dump_step failed, "
                     "please use the latest version package of MindSpore."
                 )
             _dump_step(1)
+            cellDumperWithDumpGradient.process_statistics_step(dump_path, Runtime.step_count, step_list)
 
     def check_config(self, strict):
         if not self.net:
@@ -203,10 +203,12 @@ class MindsporeHookManager(BaseHookManager):
             return
 
         with ThreadSafe():
+            original_state = self.ensure_gc_enabled()
             BaseHookManager.inner_switch[tid] = True
             module_input = ModuleBackwardInputs(grad_input=grad_input)
            self.data_collector.update_api_or_module_name(full_name)
            self.data_collector.backward_input_data_collect(full_name, module, self._pid, module_input)
            BaseHookManager.inner_switch[tid] = False
+            self.restore_gc_state(original_state)
 
         return backward_pre_hook
msprobe/msprobe.py CHANGED
@@ -14,16 +14,16 @@
 # limitations under the License.
 
 import argparse
-import sys
 import importlib.util
+import sys
 
 from msprobe.core.common.const import Const
+from msprobe.core.common.file_utils import root_privilege_warning
 from msprobe.core.common.log import logger
-from msprobe.core.compare.utils import _compare_parser
 from msprobe.core.compare.compare_cli import compare_cli
 from msprobe.core.compare.merge_result.merge_result_cli import _merge_result_parser, merge_result_cli
-from msprobe.core.config_check.config_check_cli import _config_checking_parser, \
-    _run_config_checking_command
+from msprobe.core.compare.utils import _compare_parser
+from msprobe.core.config_check.config_check_cli import _config_checking_parser, _run_config_checking_command
 
 
 def is_module_available(module_name):
@@ -64,6 +64,8 @@ def main():
     if len(sys.argv) < 4:
         parser.print_help()
         sys.exit(0)
+
+    root_privilege_warning()
     framework_args = parser.parse_args(sys.argv[1:3])
     if framework_args.framework == Const.PT_FRAMEWORK:
         from msprobe.pytorch.api_accuracy_checker.run_ut.run_ut import _run_ut_parser, run_ut_command
@@ -24,7 +24,8 @@ from msprobe.pytorch.pt_config import RunUTConfig
 
 RunUtConfig = namedtuple('RunUtConfig', ['forward_content', 'backward_content', 'result_csv_path', 'details_csv_path',
                                          'save_error_data', 'is_continue_run_ut', 'real_data_path', 'white_list',
-                                         'black_list', 'error_data_path'])
+                                         'black_list', 'error_data_path', 'online_config'])
+OnlineConfig = namedtuple('OnlineConfig', ['is_online', 'nfs_path', 'host', 'port', 'rank_list', 'tls_path'])
 
 
 class Config:
@@ -45,7 +46,13 @@ class Config:
             'white_list': list,
             'black_list': list,
             'error_data_path': str,
-            'precision': int
+            'precision': int,
+            'is_online': bool,
+            'nfs_path': str,
+            'host': str,
+            'port': int,
+            'rank_list': list,
+            'tls_path': str
         }
         if key not in validators:
             raise ValueError(f"{key} must be one of {validators.keys()}")
@@ -61,6 +68,10 @@ class Config:
             RunUTConfig.check_filter_list_config(key, value)
         if key == 'error_data_path':
             RunUTConfig.check_error_data_path_config(value)
+        if key == 'nfs_path':
+            RunUTConfig.check_nfs_path_config(value)
+        if key == 'tls_path':
+            RunUTConfig.check_tls_path_config(value)
         return value
 
 
@@ -74,6 +85,12 @@ class CheckerConfig:
         self.white_list = msCheckerConfig.white_list
         self.black_list = msCheckerConfig.black_list
         self.error_data_path = msCheckerConfig.error_data_path
+        self.is_online = msCheckerConfig.is_online
+        self.nfs_path = msCheckerConfig.nfs_path
+        self.host = msCheckerConfig.host
+        self.port = msCheckerConfig.port
+        self.rank_list = msCheckerConfig.rank_list
+        self.tls_path = msCheckerConfig.tls_path
 
         if task_config:
             self.load_config(task_config)
@@ -82,7 +99,22 @@
         self.white_list = task_config.white_list
         self.black_list = task_config.black_list
         self.error_data_path = task_config.error_data_path
+        self.is_online = task_config.is_online
+        self.nfs_path = task_config.nfs_path
+        self.host = task_config.host
+        self.port = task_config.port
+        self.rank_list = task_config.rank_list
+        self.tls_path = task_config.tls_path
 
+    def get_online_config(self):
+        return OnlineConfig(
+            is_online=self.is_online,
+            nfs_path=self.nfs_path,
+            host=self.host,
+            port=self.port,
+            rank_list=self.rank_list,
+            tls_path=self.tls_path
+        )
 
     def get_run_ut_config(self, **config_params):
         return RunUtConfig(
@@ -95,5 +127,6 @@
             real_data_path=config_params.get('real_data_path'),
             white_list=self.white_list.copy() if self.white_list else [],
             black_list=self.black_list.copy() if self.black_list else [],
-            error_data_path=config_params.get('error_data_path')
+            error_data_path=config_params.get('error_data_path'),
+            online_config=self.get_online_config()
         )
@@ -117,6 +117,30 @@ def api_precision_compare(config):
     change_mode(config.details_csv_path, FileCheckConst.DATA_FILE_AUTHORITY)
 
 
+def online_api_precision_compare(online_config):
+    rank = online_config.rank
+    result_csv_path = os.path.join(Const.DEFAULT_PATH, online_config.result_csv_path).replace(
+        "_rank*.csv", f"_rank{rank}.csv")
+    details_csv_path = os.path.join(Const.DEFAULT_PATH, online_config.details_csv_path).replace(
+        "_rank*.csv", f"_rank{rank}.csv")
+    detail_csv_title = [ApiPrecisionCompareColumn.get_detail_csv_title()]
+    result_csv_title = [ApiPrecisionCompareColumn.get_result_csv_title()]
+    if not os.path.exists(result_csv_path):
+        write_csv(result_csv_title, result_csv_path)
+    if not os.path.exists(details_csv_path):
+        write_csv(detail_csv_title, details_csv_path)
+    config = CompareConfig("", "", result_csv_path, details_csv_path)
+    try:
+        npu_data, gpu_data = online_config.npu_data, online_config.gpu_data
+        check_csv_columns(npu_data.columns, "npu_csv")
+        check_csv_columns(gpu_data.columns, "gpu_csv")
+        analyse_csv(npu_data, gpu_data, config)
+    except Exception as err:
+        logger.error(f"Online api precision compare Error: {str(err)}")
+    change_mode(result_csv_path, FileCheckConst.DATA_FILE_AUTHORITY)
+    change_mode(details_csv_path, FileCheckConst.DATA_FILE_AUTHORITY)
+
+
 def analyse_csv(npu_data, gpu_data, config):
     forward_status, backward_status = [], []
     last_api_name, last_api_dtype, last_api_full_name = None, None, None
@@ -66,6 +66,13 @@ class Comparator:
         self.save_path_list = [result_csv_path]
         self.detail_save_path_list = [details_csv_path]
 
+        if config and config.online_config.is_online:
+            self.save_path_str = result_csv_path.replace(".csv", "_rank{}.csv")
+            self.detail_save_path_str = details_csv_path.replace(".csv", "_rank{}.csv")
+            self.save_path_list = [self.save_path_str.format(rank) for rank in config.online_config.rank_list]
+            self.detail_save_path_list = \
+                [self.detail_save_path_str.format(rank) for rank in config.online_config.rank_list]
+
         self.registry = self._register_compare_func()
 
         if not is_continue_run_ut:
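The online branch above simply pre-computes one result CSV and one detail CSV per rank. A small illustration of the "_rank{}" templating, with made-up file names and ranks:

    # Made-up base name and ranks, purely to show the per-rank expansion.
    result_csv_path = "accuracy_checking_result.csv"
    rank_list = [0, 1, 2]

    save_path_str = result_csv_path.replace(".csv", "_rank{}.csv")
    save_path_list = [save_path_str.format(rank) for rank in rank_list]
    # ['accuracy_checking_result_rank0.csv', 'accuracy_checking_result_rank1.csv',
    #  'accuracy_checking_result_rank2.csv']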
@@ -238,8 +245,9 @@
         self.write_detail_csv(args)
 
 
-    def compare_output(self, full_api_name, data_info):
+    def compare_output(self, full_api_name, data_info, is_online=False):
         """Get compare result and write to result and detail csv.
+        is_online: bool, default False. True: called by online api precision compare, only compare without write to csv.
         """
         _, api_name = extract_basic_api_segments(full_api_name)
         if not api_name:
@@ -272,7 +280,9 @@
                                              fwd_compare_alg_results,
                                              bwd_compare_alg_results,
                                              data_info.rank)
-
+        if is_online:
+            # get run_ut compare detail
+            return self._get_run_ut_detail(result_info)
         self.record_results(result_info)
         return fwd_success_status == CompareConst.PASS, bwd_success_status == CompareConst.PASS \
             or bwd_success_status == CompareConst.SPACE
@@ -2,4 +2,9 @@ white_list: []
 black_list: []
 error_data_path: './'
 precision: 14
-
+is_online: False
+nfs_path: ""
+host: ""
+port: -1
+rank_list: [0]
+tls_path: "./"
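These new config.yaml keys back the OnlineConfig namedtuple added in common/config.py above. A hedged sketch of how the values line up; the "enabled" values below are illustrative placeholders, not recommended settings:

    from collections import namedtuple

    # Same field list as the OnlineConfig namedtuple introduced in this release.
    OnlineConfig = namedtuple('OnlineConfig', ['is_online', 'nfs_path', 'host', 'port', 'rank_list', 'tls_path'])

    online_config = OnlineConfig(
        is_online=True,       # config.yaml default is False
        nfs_path="",          # assumption: left empty when not dumping to a shared filesystem
        host="127.0.0.1",     # placeholder address
        port=8123,            # placeholder port (config.yaml default is -1)
        rank_list=[0, 1],     # ranks that get per-rank result CSVs (see the Comparator change above)
        tls_path="./",
    )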
@@ -47,7 +47,7 @@ API_INFO = 2
 FOUR_SEGMENT = 4
 FIVE_SEGMENT = 5
 DATA_NAME = "data_name"
-API_MAX_LENGTH = 30
+API_MAX_LENGTH = 300
 PROPAGATION_LIST = [Const.FORWARD, Const.BACKWARD]
 DATAMODE_LIST = ["random_data", "real_data"]
 ITER_MAX_TIMES = 1000