PyPI - mindstudio-probe - Versions diffs - 8.1.1__py3-none-any.whl → 8.1.2__py3-none-any.whl - Mend

mindstudio-probe 8.1.1__py3-none-any.whl → 8.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (95)

{mindstudio_probe-8.1.1.dist-info → mindstudio_probe-8.1.2.dist-info}/METADATA +1 -1
{mindstudio_probe-8.1.1.dist-info → mindstudio_probe-8.1.2.dist-info}/RECORD +95 -94
msprobe/core/common/const.py +3 -0
msprobe/core/common/file_utils.py +45 -5
msprobe/core/common/utils.py +117 -13
msprobe/core/common_config.py +15 -1
msprobe/core/compare/acc_compare.py +21 -9
msprobe/core/compare/compare_cli.py +10 -2
msprobe/core/compare/merge_result/merge_result.py +1 -1
msprobe/core/compare/utils.py +8 -2
msprobe/core/config_check/checkers/base_checker.py +2 -0
msprobe/core/config_check/checkers/hyperparameter_checker.py +5 -4
msprobe/core/config_check/ckpt_compare/ckpt_comparator.py +4 -1
msprobe/core/config_check/config_check_cli.py +1 -1
msprobe/core/config_check/config_checker.py +1 -2
msprobe/core/data_dump/data_collector.py +4 -1
msprobe/core/data_dump/data_processor/mindspore_processor.py +23 -1
msprobe/core/data_dump/data_processor/pytorch_processor.py +3 -25
msprobe/core/debugger/precision_debugger.py +13 -8
msprobe/core/hook_manager.py +112 -82
msprobe/core/monitor/utils.py +338 -0
msprobe/core/service.py +2 -1
msprobe/core/single_save/single_comparator.py +5 -3
msprobe/docs/01.installation.md +1 -0
msprobe/docs/05.data_dump_PyTorch.md +4 -4
msprobe/docs/07.accuracy_checker_PyTorch.md +14 -11
msprobe/docs/09.accuracy_checker_MindSpore.md +13 -11
msprobe/docs/10.accuracy_compare_PyTorch.md +3 -1
msprobe/docs/11.accuracy_compare_MindSpore.md +4 -2
msprobe/docs/12.overflow_check_PyTorch.md +3 -2
msprobe/docs/13.overflow_check_MindSpore.md +1 -1
msprobe/docs/14.data_parse_PyTorch.md +35 -32
msprobe/docs/21.visualization_PyTorch.md +9 -8
msprobe/docs/22.visualization_MindSpore.md +1 -0
msprobe/docs/23.generate_operator_PyTorch.md +1 -1
msprobe/docs/24.code_mapping_Mindspore.md +6 -5
msprobe/docs/31.config_check.md +15 -5
msprobe/docs/33.generate_operator_MindSpore.md +2 -2
msprobe/docs/34.RL_collect.md +18 -9
msprobe/docs/35.nan_analyze.md +4 -3
msprobe/docs/FAQ.md +3 -0
msprobe/docs/img/ms_layer.png +0 -0
msprobe/mindspore/api_accuracy_checker/api_runner.py +29 -1
msprobe/mindspore/cell_processor.py +35 -14
msprobe/mindspore/code_mapping/bind.py +23 -4
msprobe/mindspore/code_mapping/graph_parser.py +6 -4
msprobe/mindspore/common/utils.py +3 -0
msprobe/mindspore/compare/common_dir_compare.py +32 -12
msprobe/mindspore/compare/ms_graph_compare.py +7 -2
msprobe/mindspore/compare/utils.py +9 -1
msprobe/mindspore/debugger/debugger_config.py +13 -11
msprobe/mindspore/debugger/precision_debugger.py +67 -45
msprobe/mindspore/dump/dump_tool_factory.py +2 -0
msprobe/mindspore/dump/hook_cell/hook_cell.py +14 -9
msprobe/mindspore/dump/hook_cell/ms_hook_manager.py +12 -7
msprobe/mindspore/dump/hook_cell/primitive_hooks.py +27 -13
msprobe/mindspore/dump/jit_dump.py +6 -3
msprobe/mindspore/dump/kernel_kbyk_dump.py +13 -6
msprobe/mindspore/dym_loader/hook_dynamic_loader.cpp +6 -5
msprobe/mindspore/dym_loader/hook_dynamic_loader.h +2 -2
msprobe/mindspore/grad_probe/grad_analyzer.py +2 -0
msprobe/mindspore/mindspore_service.py +2 -2
msprobe/mindspore/monitor/common_func.py +1 -1
msprobe/mindspore/monitor/module_hook.py +3 -3
msprobe/mindspore/monitor/utils.py +0 -252
msprobe/mindspore/ms_config.py +0 -1
msprobe/mindspore/overflow_check/overflow_check_tool_factory.py +1 -1
msprobe/nan_analyze/graph.py +4 -0
msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +15 -6
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +1 -1
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +1 -1
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/utils.py +2 -4
msprobe/pytorch/common/utils.py +0 -16
msprobe/pytorch/compare/pt_compare.py +5 -0
msprobe/pytorch/debugger/debugger_config.py +12 -5
msprobe/pytorch/debugger/precision_debugger.py +8 -1
msprobe/pytorch/dump/module_dump/hook_wrapper.py +1 -3
msprobe/pytorch/dump/module_dump/module_processer.py +44 -13
msprobe/pytorch/free_benchmark/result_handlers/base_handler.py +2 -0
msprobe/pytorch/hook_module/hook_module.py +9 -9
msprobe/pytorch/hook_module/pt_hook_manager.py +7 -7
msprobe/pytorch/monitor/csv2tb.py +3 -10
msprobe/pytorch/monitor/features.py +5 -0
msprobe/pytorch/monitor/module_hook.py +6 -7
msprobe/pytorch/monitor/module_metric.py +0 -3
msprobe/pytorch/monitor/optimizer_collect.py +1 -1
msprobe/pytorch/monitor/utils.py +1 -317
msprobe/pytorch/online_dispatch/dispatch.py +1 -1
msprobe/pytorch/online_dispatch/dump_compare.py +7 -1
msprobe/pytorch/parse_tool/lib/utils.py +2 -4
msprobe/visualization/graph_service.py +1 -1
{mindstudio_probe-8.1.1.dist-info → mindstudio_probe-8.1.2.dist-info}/LICENSE +0 -0
{mindstudio_probe-8.1.1.dist-info → mindstudio_probe-8.1.2.dist-info}/WHEEL +0 -0
{mindstudio_probe-8.1.1.dist-info → mindstudio_probe-8.1.2.dist-info}/entry_points.txt +0 -0
{mindstudio_probe-8.1.1.dist-info → mindstudio_probe-8.1.2.dist-info}/top_level.txt +0 -0

msprobe/core/common/utils.py CHANGED Viewed

@@ -14,21 +14,22 @@
 # limitations under the License.
 import collections
+import functools
+import inspect
 import os
 import re
-import subprocess
+import threading
 import time
-import inspect
+from collections import OrderedDict
 from datetime import datetime, timezone
 import numpy as np
-from msprobe.core.common.file_utils import (FileOpen, check_file_or_directory_path, load_json)
 from msprobe.core.common.const import Const, CompareConst
-from msprobe.core.common.log import logger
-from msprobe.core.common.exceptions import MsprobeException
 from msprobe.core.common.decorator import recursion_depth_decorator
+from msprobe.core.common.exceptions import MsprobeException
+from msprobe.core.common.file_utils import (FileOpen, check_file_or_directory_path, load_json)
+from msprobe.core.common.log import logger
 device = collections.namedtuple('device', ['type', 'index'])
 prefixes = ['api_stack', 'list', 'range', 'acl']
@@ -112,6 +113,82 @@ class DumpException(MsprobeBaseException):
         return f"Dump Error Code {self.code}: {self.error_info}"
+class ThreadSafe:
+    """
+    线程安全控制工具类，提供三种使用方式：
+    1.上下文管理器：with ThreadSafe()
+    2.主动加锁与释放锁：ThreadSafe.acquire()/ThreadSafe.release()
+    3.方法装饰器：@ThreadSafe.synchronized
+    """
+    _lock = threading.RLock()
+    def __enter__(self):
+        self.__class__._lock.acquire()
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.__class__._lock.release()
+    @classmethod
+    def acquire(cls):
+        cls._lock.acquire()
+    @classmethod
+    def release(cls):
+        cls._lock.release()
+    @classmethod
+    def synchronized(cls, func):
+        @functools.wraps(func)
+        def wrapper(*args, **kwargs):
+            with cls._lock:
+                return func(*args, **kwargs)
+        return wrapper
+class ModuleQueue:
+    def __init__(self):
+        self.queue = OrderedDict()
+    def add_name(self, name):
+        self.queue[name] = True
+    def remove_name(self, name):
+        if name in self.queue:
+            del self.queue[name]
+    def find_last(self, name):
+        """
+        在队列中找到当前 Module/Cell 的父节点名称并返回，若找不到则返回None
+        Args:
+            name: 需要寻找父节点的 Module/Cell 的名称
+        Returns:
+            返回父节点名称，找不到则返回None
+        Examples:
+            父节点名称格式: Module.module1.module1.forward.0
+            子节点名称格式: Module.module1.module2.Module2.forward.0
+            匹配关系: Module/Cell 的名称总能被点(.)分割符分成5个部分及以上，子节点截断后4个点和父节点截断后3个点的前缀名称是匹配的
+        """
+        child_parts = name.split('.')
+        if len(child_parts) < 5:
+            return None
+        child_name_prefix = '.'.join(child_parts[:-4])
+        if child_name_prefix in Const.MODULE_PREFIX:
+            return None
+        for parent_name in reversed(self.queue):
+            parent_parts = parent_name.split('.')
+            if len(parent_parts) < 5:
+                return None
+            parent_name_prefix = '.'.join(parent_parts[:-3])
+            if parent_name_prefix == child_name_prefix:
+                return parent_name
+        return None
 def is_json_file(file_path):
     if isinstance(file_path, str) and file_path.lower().endswith('.json'):
         return True
@@ -156,9 +233,10 @@ def check_compare_param(input_param, output_path, dump_mode, stack_mode):
 def check_configuration_param(stack_mode=False, auto_analyze=True, fuzzy_match=False, is_print_compare_log=True):
     arg_list = [stack_mode, auto_analyze, fuzzy_match, is_print_compare_log]
-    for arg in arg_list:
+    arg_names = ['stack_mode', 'auto_analyze', 'fuzzy_match', 'is_print_compare_log']
+    for arg, name in zip(arg_list, arg_names):
         if not isinstance(arg, bool):
-            logger.error(f"Invalid input parameter, {arg} which should be only bool type.")
+            logger.error(f"Invalid input parameter, {name} which should be only bool type.")
             raise CompareException(CompareException.INVALID_PARAM_ERROR)
@@ -282,9 +360,9 @@ def set_dump_path(input_param):
     npu_path = input_param.get("npu_json_path", None)
     bench_path = input_param.get("bench_json_path", None)
     dump_json_path_valid = npu_path is not None and npu_path.endswith("dump.json") and \
-        bench_path is not None and bench_path.endswith("dump.json")
+                           bench_path is not None and bench_path.endswith("dump.json")
     debug_json_path_valid = npu_path is not None and npu_path.endswith("debug.json") and \
-        bench_path is not None and bench_path.endswith("debug.json")
+                            bench_path is not None and bench_path.endswith("debug.json")
     if not dump_json_path_valid and not debug_json_path_valid:
         logger.error(f"Please check the json path is valid and ensure that neither npu_path nor bench_path is None.")
         raise CompareException(CompareException.INVALID_PATH_ERROR)
@@ -457,10 +535,10 @@ def get_real_step_or_rank(step_or_rank_input, obj):
 def check_init_step(step):
     if not is_int(step):
         raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR,
-                        f"{step} must be an integer")
+                               f"{step} must be an integer")
     if not step >= 0:
         raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR,
-                f"{step} must be greater than or equal to 0")
+                               f"{step} must be greater than or equal to 0")
 def check_token_range(token_range):
@@ -568,14 +646,25 @@ def replace_last_occurrence(text, old, new):
 def load_stack_json(stack_path):
     stack_dict = load_json(stack_path)
+    if not isinstance(stack_dict, dict):
+        raise MsprobeException(
+            MsprobeException.INVALID_PARAM_ERROR,
+            "The format of the stack.json is incorrect, the outermost layer of stack.json should be a dict type."
+        )
     if not stack_dict.get(Const.NEW_STACK_FLAG):
         return stack_dict
     new_stack_dict = {}
     for stack_info in stack_dict.values():
-        if len(stack_info) != 2:
+        if not isinstance(stack_info, list) or len(stack_info) != 2:
             continue
         api_list, stack_str = stack_info
+        if not isinstance(api_list, list):
+            continue
         for api_name in api_list:
             new_stack_dict.update({api_name: stack_str})
     return new_stack_dict
@@ -597,3 +686,18 @@ def analyze_api_call_stack(name):
     else:
         stack_str.append(Const.WITHOUT_CALL_STACK)
     return "".join(stack_str)
+def check_extern_input_list(input_list):
+    if not isinstance(input_list, list):
+        raise Exception("input is not a list")
+    if len(input_list) > Const.EXTERN_INPUT_LIST_MAX_LEN:
+        raise Exception(f"input list exceed max length {Const.EXTERN_INPUT_LIST_MAX_LEN}")
+def check_process_num(process_num):
+    if not is_int(process_num) or process_num <= 0:
+        raise ValueError(f"process_num({process_num}) is not a positive integer")
+    if process_num > Const.MAX_PROCESS_NUM:
+        raise ValueError(f"The maximum supported process_num is {Const.MAX_PROCESS_NUM}, current value: {process_num}.")

msprobe/core/common_config.py CHANGED Viewed

@@ -13,7 +13,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from msprobe.core.common.const import Const, FileCheckConst
+import re
+from msprobe.core.common.const import Const
 from msprobe.core.common.log import logger
 from msprobe.core.common.exceptions import MsprobeException
 from msprobe.core.common.utils import get_real_step_or_rank
@@ -67,6 +69,7 @@ class BaseConfig:
         self.if_preheat = json_config.get("if_preheat")
         self.preheat_step = json_config.get("preheat_step")
         self.max_sample = json_config.get("max_sample")
+        self.is_regex_valid = True
     @staticmethod
     def _check_str_list_config(config_item, config_name):
@@ -83,6 +86,7 @@ class BaseConfig:
         self._check_str_list_config(self.scope, "scope")
         self._check_str_list_config(self.list, "list")
         self._check_data_mode()
+        self._check_regex_in_list()
     def _check_data_mode(self):
         if self.data_mode is not None:
@@ -118,3 +122,13 @@ class BaseConfig:
                         f"summary_mode is invalid, summary_mode is not in {Const.SUMMARY_MODE}.",
                         MsprobeException(MsprobeException.INVALID_PARAM_ERROR)
                     )
+    def _check_regex_in_list(self):
+        if self.list:
+            for name in self.list:
+                if name.startswith('name-regex(') and name.endswith(')'):
+                    try:
+                        re.compile(name[len('name-regex('):-1])
+                    except re.error:
+                        self.is_regex_valid = False
+                        break

msprobe/core/compare/acc_compare.py CHANGED Viewed

@@ -31,7 +31,7 @@ from msprobe.core.common.utils import CompareException, add_time_with_xlsx, chec
     set_dump_path, get_dump_mode, check_compare_param, check_configuration_param, load_stack_json, get_file_type
 from msprobe.core.compare.check import check_dump_json_str, check_stack_json_str, cross_dtype_mapping
 from msprobe.core.compare.utils import merge_tensor, print_compare_ends_info, read_op, \
-    reorder_op_x_list, set_stack_json_path
+    reorder_op_x_list, set_stack_json_path, check_api_info_len
 from msprobe.core.compare.config import ModeConfig, MappingConfig, MappingDict
 from msprobe.core.compare.multiprocessing_compute import CompareRealData
 from msprobe.core.compare.highlight import HighLight
@@ -211,25 +211,37 @@ class ParseData:
             for index, op_name in enumerate(op_name_reorder):
                 result[CompareConst.OP_NAME].append(op_name)
                 if (CompareConst.INPUT_PATTERN in op_name) or (CompareConst.KWARGS_PATTERN in op_name):
-                    struct = merge_list[CompareConst.INPUT_STRUCT].pop(0)
+                    info_list = merge_list[CompareConst.INPUT_STRUCT]
                 elif CompareConst.OUTPUT_PATTERN in op_name:
-                    struct = merge_list[CompareConst.OUTPUT_STRUCT].pop(0)
+                    info_list = merge_list[CompareConst.OUTPUT_STRUCT]
                 elif CompareConst.PARAMS_PATTERN in op_name:
-                    struct = merge_list[CompareConst.PARAMS_STRUCT].pop(0)
+                    info_list = merge_list[CompareConst.PARAMS_STRUCT]
                 elif CompareConst.PARAMS_GRAD_PATTERN in op_name:
-                    struct = merge_list[CompareConst.PARAMS_GRAD_STRUCT].pop(0)
+                    info_list = merge_list[CompareConst.PARAMS_GRAD_STRUCT]
                 else:
-                    struct = merge_list[CompareConst.DEBUG_STRUCT].pop(0)
+                    info_list = merge_list[CompareConst.DEBUG_STRUCT]
+                check_api_info_len(op_name, info_list, 1)
+                struct = info_list.pop(0)
+                check_api_info_len(op_name, struct, 2)
                 result[Const.DTYPE].append(struct[0])
                 result[Const.SHAPE].append(struct[1])
                 if self.mode_config.dump_mode == Const.MD5:
+                    check_api_info_len(op_name, struct, 3)
                     result[Const.MD5].append(struct[2])
+                check_api_info_len(op_name, summary_reorder, 1)
                 result[Const.SUMMARY].append(summary_reorder.pop(0))
-                result[Const.STACK_INFO].append(
-                    merge_list[Const.STACK_INFO][0] if index == 0 and self.mode_config.stack_mode else None)
+                if index == 0 and self.mode_config.stack_mode:
+                    check_api_info_len(op_name, merge_list[Const.STACK_INFO], 1)
+                    result[Const.STACK_INFO].append(merge_list[Const.STACK_INFO][0])
+                else:
+                    result[Const.STACK_INFO].append(None)
                 if self.mode_config.dump_mode == Const.ALL:
+                    check_api_info_len(op_name, data_name_reorder, 1)
                     result['data_name'].append(data_name_reorder.pop(0))
             progress_bar.update(1)
         progress_bar.close()
         return pd.DataFrame(result)

msprobe/core/compare/compare_cli.py CHANGED Viewed

@@ -14,7 +14,7 @@
 # limitations under the License.
 import json
-from msprobe.core.common.file_utils import check_file_type, load_json
+from msprobe.core.common.file_utils import check_file_type, load_json, check_file_or_directory_path
 from msprobe.core.common.const import FileCheckConst, Const
 from msprobe.core.common.utils import CompareException
 from msprobe.core.common.log import logger
@@ -22,6 +22,9 @@ from msprobe.core.common.log import logger
 def compare_cli(args):
     input_param = load_json(args.input_path)
+    if not isinstance(input_param, dict):
+        logger.error("input_param should be dict, please check!")
+        raise CompareException(CompareException.INVALID_OBJECT_TYPE_ERROR)
     npu_path = input_param.get("npu_path", None)
     bench_path = input_param.get("bench_path", None)
     if not npu_path:
@@ -47,6 +50,8 @@ def compare_cli(args):
     }
     if check_file_type(npu_path) == FileCheckConst.FILE and check_file_type(bench_path) == FileCheckConst.FILE:
+        check_file_or_directory_path(npu_path)
+        check_file_or_directory_path(bench_path)
         input_param["npu_json_path"] = input_param.pop("npu_path")
         input_param["bench_json_path"] = input_param.pop("bench_path")
         if "stack_path" not in input_param:
@@ -68,6 +73,8 @@ def compare_cli(args):
             }
             ms_compare(input_param, args.output_path, **kwargs)
     elif check_file_type(npu_path) == FileCheckConst.DIR and check_file_type(bench_path) == FileCheckConst.DIR:
+        check_file_or_directory_path(npu_path, isdir=True)
+        check_file_or_directory_path(bench_path, isdir=True)
         kwargs = {
             **common_kwargs,
             "stack_mode": args.stack_mode,
@@ -79,7 +86,8 @@ def compare_cli(args):
         if input_param.get("rank_id") is not None:
             ms_graph_compare(input_param, args.output_path)
             return
-        if input_param.get('common', False):
+        common = input_param.get("common", False)
+        if isinstance(common, bool) and common:
             common_dir_compare(input_param, args.output_path)
             return
         if frame_name == Const.PT_FRAMEWORK:

msprobe/core/compare/merge_result/merge_result.py CHANGED Viewed

@@ -196,7 +196,7 @@ def result_process(compare_result_path_list, api_list):
         compare_index_dict = {}
         result_df = read_xlsx(compare_result_path)
-        rank_pattern = r"compare_result_rank(\d+)-rank"
+        rank_pattern = r"compare_result_rank(\d+)"
         rank_num = int(re.search(rank_pattern, os.path.basename(compare_result_path)).group(1))
         logger.info(f"Parsing rank{rank_num} compare result...")
         if not result_df.empty:

msprobe/core/compare/utils.py CHANGED Viewed

@@ -238,6 +238,12 @@ def merge_tensor(tensor_list, dump_mode):
     return op_dict if op_dict[CompareConst.OP_NAME] else {}
+def check_api_info_len(op_name, info_list, len_require):
+    if len(info_list) < len_require:
+        logger.error(f'Index out of bounds error, please check info of api: {op_name}.')
+        raise CompareException(CompareException.INDEX_OUT_OF_BOUNDS_ERROR)
 def print_compare_ends_info():
     total_len = len(CompareConst.COMPARE_ENDS_SUCCESSFULLY) + Const.FILL_CHAR_NUMS
     logger.info('*' * total_len)
@@ -509,8 +515,8 @@ def get_accuracy(result, n_dict, b_dict, dump_mode):
                 result.append(result_item)
-    n_num, n_num_input, n_num_output, n_num_params, n_num_params_grad = count_struct(n_dict)
-    b_num, b_num_input, b_num_output, b_num_params, b_num_params_grad = count_struct(b_dict)
+    _, n_num_input, n_num_output, n_num_params, n_num_params_grad = count_struct(n_dict)
+    _, b_num_input, b_num_output, b_num_params, b_num_params_grad = count_struct(b_dict)
     get_accuracy_core(0, n_num_input, 0, b_num_input, CompareConst.INPUT_STRUCT)
     get_accuracy_core(n_num_input + n_num_output, n_num_params, b_num_input + b_num_output, b_num_params,

msprobe/core/config_check/checkers/base_checker.py CHANGED Viewed

@@ -15,6 +15,7 @@
 import os
+from msprobe.core.common.file_utils import check_path_pattern_valid
 from msprobe.core.common.framework_adapter import FmkAdp
 from msprobe.core.common.const import FileCheckConst
@@ -32,6 +33,7 @@ class PackInput:
             raise Exception(f"model is not torch.nn.Module/mindspore.nn.Cell or module list.")
         if not isinstance(self.output_zip_path, str) or not self.output_zip_path.endswith(FileCheckConst.ZIP_SUFFIX):
             raise Exception(f"output zip path must be a string and ends with '.zip'")
+        check_path_pattern_valid(self.output_zip_path)
 class BaseChecker:

msprobe/core/config_check/checkers/hyperparameter_checker.py CHANGED Viewed

@@ -20,12 +20,13 @@ from difflib import SequenceMatcher
 from typing import Union, List, Dict, Any
 import pandas as pd
+from msprobe.core.common.utils import check_extern_input_list
 from msprobe.core.config_check.checkers.base_checker import BaseChecker
 from msprobe.core.config_check.config_checker import register_checker_item
 from msprobe.core.config_check.utils.utils import compare_dict, config_checking_print, update_dict
 from msprobe.core.config_check.utils.hyperparameter_parser import ParserFactory
-from msprobe.core.common.file_utils import (os_walk_for_files, create_file_in_zip, load_json, create_file_with_list,
-                                            FileOpen, load_yaml)
+from msprobe.core.common.file_utils import (check_file_or_directory_path, create_file_in_zip, load_json,
+                                            load_yaml)
 from msprobe.core.common.const import Const
@@ -47,13 +48,13 @@ class HyperparameterChecker(BaseChecker):
         output_zip_path = pack_input.output_zip_path
         if shell_path:
-            if not isinstance(shell_path, list):
-                raise TypeError("shell_path should be a list of file paths.")
+            check_extern_input_list(shell_path)
             hyperparameters = {}
             parser_factory = ParserFactory()
             for script_path in shell_path:
                 if os.path.isfile(script_path):
+                    check_file_or_directory_path(script_path)
                     parser = parser_factory.get_parser(os.path.splitext(script_path)[1])
                     update_dict(hyperparameters, parser.run(os.path.realpath(script_path)))
                 else:

msprobe/core/config_check/ckpt_compare/ckpt_comparator.py CHANGED Viewed

@@ -16,7 +16,8 @@
 from typing import Dict
 from tqdm import tqdm
-from msprobe.core.common.file_utils import save_json, check_path_before_create, check_path_not_exists
+from msprobe.core.common.file_utils import save_json, check_path_before_create, check_path_not_exists, \
+    check_file_or_directory_path
 from msprobe.core.common.log import logger
 from msprobe.core.config_check.ckpt_compare.megatron_loader import load_megatron_weights
 from msprobe.core.config_check.ckpt_compare.metrics import METRIC_FUNC
@@ -44,6 +45,8 @@ def compare_checkpoints(ckpt_path1, ckpt_path2, output_path) -> Dict:
     """
     # Load both checkpoints
+    check_file_or_directory_path(ckpt_path1, isdir=True)
+    check_file_or_directory_path(ckpt_path2, isdir=True)
     check_path_before_create(output_path)
     check_path_not_exists(output_path)
     weights1 = load_megatron_weights(ckpt_path1)

msprobe/core/config_check/config_check_cli.py CHANGED Viewed

@@ -29,7 +29,7 @@ def compare(bench_zip_path, cmp_zip_path, output_path, framework):
 def _config_checking_parser(parser):
     parser.add_argument('-d', '--dump', nargs='*', help='Collect the train config into a zip file')
     parser.add_argument('-c', '--compare', nargs=2, help='Compare two zip files or checkpoints')
-    parser.add_argument('-o', '--output', help='output path, default is current directory')
+    parser.add_argument('-o', '--output', help='output path, default is ./config_check_result')
 def _run_config_checking_command(args):

msprobe/core/config_check/config_checker.py CHANGED Viewed

@@ -43,8 +43,7 @@ class ConfigChecker:
     @staticmethod
     def compare(bench_zip_path, cmp_zip_path, output_path, fmk=Const.PT_FRAMEWORK):
-        if os.path.exists(output_path):
-            shutil.rmtree(output_path)
+        create_directory(output_path)
         bench_dir = os.path.join(output_path, "bench")
         cmp_dir = os.path.join(output_path, "cmp")
         extract_zip(bench_zip_path, bench_dir)

msprobe/core/data_dump/data_collector.py CHANGED Viewed

@@ -15,6 +15,7 @@
 import atexit
 import os
+import threading
 import traceback
 from msprobe.core.data_dump.scope import ScopeFactory
@@ -255,7 +256,9 @@ class DataCollector:
             else:
                 if self.config.level == Const.LEVEL_MIX and \
                   not (name.startswith(Const.MODULE) or name.startswith(Const.CELL)):
-                    self.data_writer.update_construct({name: self.module_processor.api_parent_node})
+                    self.data_writer.update_construct(
+                        {name: self.module_processor.api_parent_node.get(threading.get_ident())}
+                    )
             self.data_writer.update_construct(self.module_processor.module_node)

msprobe/core/data_dump/data_processor/mindspore_processor.py CHANGED Viewed

@@ -28,6 +28,7 @@ from msprobe.core.common.file_utils import path_len_exceeds_limit
 from msprobe.mindspore.common.utils import convert_bf16_to_fp32, save_tensor_as_npy
 from msprobe.mindspore.common.log import logger
 from msprobe.mindspore.dump.hook_cell.api_register import get_api_register
+from msprobe.mindspore.common.utils import is_mindtorch
 has_adump = True
 try:
@@ -35,9 +36,15 @@ try:
 except ImportError:
     has_adump = False
+if is_mindtorch():
+    from torch import distributed as dist
 class MindsporeDataProcessor(BaseDataProcessor):
-    mindspore_special_type = tuple([ms.Tensor, Number, distributed.P2POp])
+    if is_mindtorch():
+        mindspore_special_type = tuple([ms.Tensor, Number, distributed.P2POp, dist.ProcessGroup])
+    else:
+        mindspore_special_type = tuple([ms.Tensor, Number, distributed.P2POp])
     def __init__(self, config, data_writer):
         super().__init__(config, data_writer)
@@ -114,6 +121,19 @@ class MindsporeDataProcessor(BaseDataProcessor):
         group_ranks_hash = zlib.crc32(str(group_ranks).encode('utf-8'))
         return f"{group_ranks_hash:08x}"
+    @staticmethod
+    def _analyze_process_group(arg):
+        group_info = {"type": "mindspore.ProcessGroup"}
+        try:
+            group_ranks = dist.get_process_group_ranks(arg)
+            group_info.update({"group_ranks": group_ranks})
+            group_ranks_hash = zlib.crc32(str(group_ranks).encode('utf-8'))
+            group_id = f"{group_ranks_hash:08x}"
+            group_info.update({"group_id": group_id})
+        except Exception as e:
+            logger.warning(f"Failed to get process group ranks info with error info: {e}.")
+        return group_info
     @classmethod
     def get_special_types(cls):
         return super().get_special_types() + cls.mindspore_special_type
@@ -149,6 +169,8 @@ class MindsporeDataProcessor(BaseDataProcessor):
             (np.ndarray, lambda e: self._analyze_ndarray(e, suffix_str)),
             (distributed.P2POp, lambda e: self._analyze_p2pop(e, suffix_str))
         ]
+        if is_mindtorch():
+            type_analyzer.append((dist.ProcessGroup, self._analyze_process_group))
         for type_key, analyze_fn in type_analyzer:
             if isinstance(element, type_key):
                 return analyze_fn(element)

msprobe/core/data_dump/data_processor/pytorch_processor.py CHANGED Viewed

@@ -30,7 +30,7 @@ from msprobe.core.common.utils import convert_tuple
 from msprobe.core.common.decorator import recursion_depth_decorator
 from msprobe.core.data_dump.data_processor.base import BaseDataProcessor, ModuleBackwardInputsOutputs, \
     ModuleForwardInputsOutputs, TensorStatInfo
-from msprobe.pytorch.common.utils import Const as PtConst, save_pt, is_hifloat8_tensor, is_float8_tensor
+from msprobe.pytorch.common.utils import save_pt
 from msprobe.pytorch.free_benchmark import FreeBenchmarkCheck, UnequalRow
 is_gpu = False
@@ -181,7 +181,7 @@ class PytorchDataProcessor(BaseDataProcessor):
     @staticmethod
     def _analyze_torch_size(arg):
-        return {"type": "torch.Size", "value": list(arg)}
+        return {"type": "torch.Size", "value": [int(x) for x in list(arg)]}
     @staticmethod
     def _analyze_memory_format(arg):
@@ -210,18 +210,6 @@ class PytorchDataProcessor(BaseDataProcessor):
             logger.warning(f"Failed to get value of torch.distributed.ReduceOp with error info: {e}.")
         return {"type": "torch.distributed.ReduceOp", "value": op_type}
-    @staticmethod
-    def _cast_to_float_if_fp8(tensor):
-        dtype = str(tensor.dtype)
-        if is_float8_tensor(tensor):
-            dtype = PtConst.HIFLOAT8_TYPE if is_hifloat8_tensor(tensor) else dtype
-            logger.debug(
-                f"The {dtype} tensor analyzing/saving is unsupported in dump function."
-                f"Casting to float for processing."
-            )
-            tensor = tensor.float()
-        return tensor, dtype
     @classmethod
     def get_special_types(cls):
         return super().get_special_types() + cls.pytorch_special_type
@@ -268,11 +256,10 @@ class PytorchDataProcessor(BaseDataProcessor):
         return p2pop_info
     def _analyze_tensor(self, tensor, suffix):
-        tensor, dtype = self._cast_to_float_if_fp8(tensor)
         tensor_stat = self.get_stat_info(tensor, self.config.async_dump)
         tensor_json = {}
         tensor_json.update({'type': 'torch.Tensor'})
-        tensor_json.update({'dtype': dtype})
+        tensor_json.update({'dtype': str(tensor.dtype)})
         tensor_json.update({"shape": tensor.shape})
         stat_values = [
@@ -295,7 +282,6 @@ class PytorchDataProcessor(BaseDataProcessor):
         dump_data_name, file_path = self.get_save_file_path(suffix)
         single_arg = PytorchDataProcessor._analyze_tensor(self, tensor, suffix)
         single_arg.update({"data_name": dump_data_name})
-        tensor, _ = self._cast_to_float_if_fp8(tensor)
         if self.config.async_dump:
             self._async_dump_cache[file_path] = tensor.clone().detach()
         else:
@@ -396,7 +382,6 @@ class OverflowCheckDataProcessor(PytorchDataProcessor):
             self._analyze_maybe_overflow_flag()
         if self.has_overflow:
             for file_path, tensor in self.cached_tensors_and_file_paths.items():
-                tensor, _ = self._cast_to_float_if_fp8(tensor)
                 save_pt(tensor.clone().contiguous().detach(), file_path)
             self.real_overflow_nums += 1
             if self.overflow_nums != -1 and self.real_overflow_nums >= self.overflow_nums:
@@ -588,11 +573,6 @@ class KernelDumpDataProcessor(PytorchDataProcessor):
     )
     def clone_and_detach_tensor(self, input_params):
         if isinstance(input_params, torch.Tensor):
-            if is_float8_tensor(input_params):
-                raise MsprobeException(
-                    MsprobeException.UNSUPPORTED_TYPE_ERROR,
-                    f"L2 backward dump does not support float8 type."
-                )
             if input_params.requires_grad:
                 return input_params.clone().detach().requires_grad_()
             return input_params.clone()
@@ -607,8 +587,6 @@ class KernelDumpDataProcessor(PytorchDataProcessor):
     def analyze_single_element(self, element, suffix_stack):
         if isinstance(element, torch.Tensor):
-            if is_float8_tensor(element):
-                return {}
             if not self.is_found_output_tensor:
                 if element.requires_grad:
                     self.forward_output_tensor = element

mindstudio-probe 8.1.1__py3-none-any.whl → 8.1.2__py3-none-any.whl

mindstudio-probe 8.1.1__py3-none-any.whl → 8.1.2__py3-none-any.whl