mindstudio-probe 8.1.0__py3-none-any.whl → 8.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. {mindstudio_probe-8.1.0.dist-info → mindstudio_probe-8.1.1.dist-info}/METADATA +3 -2
  2. {mindstudio_probe-8.1.0.dist-info → mindstudio_probe-8.1.1.dist-info}/RECORD +46 -47
  3. msprobe/core/common/const.py +1 -0
  4. msprobe/core/common/file_utils.py +36 -18
  5. msprobe/core/common/utils.py +19 -8
  6. msprobe/core/compare/acc_compare.py +14 -5
  7. msprobe/core/compare/utils.py +7 -1
  8. msprobe/core/data_dump/data_collector.py +144 -90
  9. msprobe/core/data_dump/json_writer.py +31 -1
  10. msprobe/core/debugger/precision_debugger.py +19 -18
  11. msprobe/core/service.py +1 -0
  12. msprobe/core/single_save/single_comparator.py +25 -25
  13. msprobe/core/single_save/single_saver.py +5 -16
  14. msprobe/docs/01.installation.md +1 -0
  15. msprobe/docs/05.data_dump_PyTorch.md +3 -0
  16. msprobe/docs/06.data_dump_MindSpore.md +3 -0
  17. msprobe/docs/08.accuracy_checker_online_PyTorch.md +2 -2
  18. msprobe/docs/25.tool_function_introduction.md +19 -19
  19. msprobe/docs/33.generate_operator_MindSpore.md +10 -19
  20. msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +1 -0
  21. msprobe/mindspore/api_accuracy_checker/compute_element.py +0 -1
  22. msprobe/mindspore/api_accuracy_checker/generate_op_script/op_generator.py +10 -1
  23. msprobe/mindspore/api_accuracy_checker/torch_mindtorch_importer.py +2 -1
  24. msprobe/mindspore/common/utils.py +1 -0
  25. msprobe/mindspore/debugger/precision_debugger.py +4 -4
  26. msprobe/mindspore/dump/cell_dump_process.py +13 -38
  27. msprobe/mindspore/dump/cell_dump_with_insert_gradient.py +1 -26
  28. msprobe/mindspore/dump/hook_cell/api_register.py +3 -3
  29. msprobe/mindspore/dym_loader/hook_dynamic_loader.cpp +4 -4
  30. msprobe/mindspore/mindspore_service.py +3 -0
  31. msprobe/mindspore/monitor/features.py +10 -9
  32. msprobe/mindspore/monitor/optimizer_collect.py +4 -1
  33. msprobe/pytorch/api_accuracy_checker/compare/compare_utils.py +20 -20
  34. msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +7 -7
  35. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/utils.py +2 -0
  36. msprobe/pytorch/common/utils.py +1 -1
  37. msprobe/pytorch/debugger/precision_debugger.py +28 -25
  38. msprobe/pytorch/hook_module/api_register.py +3 -3
  39. msprobe/pytorch/monitor/optimizer_collect.py +4 -1
  40. msprobe/pytorch/pytorch_service.py +3 -0
  41. msprobe/visualization/compare/mode_adapter.py +9 -0
  42. msprobe/visualization/utils.py +3 -0
  43. msprobe/mindspore/api_accuracy_checker/generate_op_script/config_op.json +0 -9
  44. {mindstudio_probe-8.1.0.dist-info → mindstudio_probe-8.1.1.dist-info}/LICENSE +0 -0
  45. {mindstudio_probe-8.1.0.dist-info → mindstudio_probe-8.1.1.dist-info}/WHEEL +0 -0
  46. {mindstudio_probe-8.1.0.dist-info → mindstudio_probe-8.1.1.dist-info}/entry_points.txt +0 -0
  47. {mindstudio_probe-8.1.0.dist-info → mindstudio_probe-8.1.1.dist-info}/top_level.txt +0 -0
msprobe/core/data_dump/data_collector.py CHANGED
@@ -15,6 +15,7 @@
 
 import atexit
 import os
+import traceback
 
 from msprobe.core.data_dump.scope import ScopeFactory
 from msprobe.core.data_dump.json_writer import DataWriter
@@ -99,100 +100,150 @@ class DataCollector:
         self.data_writer.update_stack(name, stack_info)
 
     def forward_input_data_collect(self, name, module, pid, module_input_output, is_recompute=None):
-        if self.config.task == Const.FREE_BENCHMARK:
-            backward_name = name.replace(Const.FORWARD, Const.BACKWARD)
-            if self.check_scope_and_pid(self.scope, backward_name, pid):
-                self.data_processor.analyze_forward_input(backward_name, module, module_input_output)
-            return
-
-        if not self.check_scope_and_pid(self.scope, name, pid):
-            return
-
-        data_info = {}
-        if self.config.task != Const.STRUCTURE:
-            data_info = self.data_processor.analyze_forward_input(name, module, module_input_output)
-            self.set_is_recomputable(data_info, is_recompute)
-        if self.config.level == Const.LEVEL_L2:
-            return
-        self.handle_data(name, data_info, flush=self.data_processor.is_terminated)
+        try:
+
+            if self.config.task == Const.FREE_BENCHMARK:
+                backward_name = name.replace(Const.FORWARD, Const.BACKWARD)
+                if self.check_scope_and_pid(self.scope, backward_name, pid):
+                    self.data_processor.analyze_forward_input(backward_name, module, module_input_output)
+                return
+
+            if not self.check_scope_and_pid(self.scope, name, pid):
+                return
+
+            data_info = {}
+            if self.config.task != Const.STRUCTURE:
+                data_info = self.data_processor.analyze_forward_input(name, module, module_input_output)
+                self.set_is_recomputable(data_info, is_recompute)
+            if self.config.level == Const.LEVEL_L2:
+                return
+            self.handle_data(name, data_info, flush=self.data_processor.is_terminated)
+
+        except Exception:
+            tb = traceback.format_exc()
+            self.data_writer.write_error_log(
+                f"[ERROR] forward_input_data_collect failed: name={name}, pid={pid}\n{tb}"
+            )
 
     def forward_output_data_collect(self, name, module, pid, module_input_output, is_recompute=None):
-        self.update_construct(name)
-        if not self.check_scope_and_pid(self.scope, name, pid):
-            return
-
-        data_info = {}
-        if self.config.task != Const.STRUCTURE:
-            data_info = self.data_processor.analyze_forward_output(name, module, module_input_output)
-            self.set_is_recomputable(data_info, is_recompute)
-        if self.config.level == Const.LEVEL_L2:
-            return
-        self.call_stack_collect(name)
-        self.handle_data(name, data_info, flush=self.data_processor.is_terminated)
+        try:
+
+            self.update_construct(name)
+            if not self.check_scope_and_pid(self.scope, name, pid):
+                return
+
+            data_info = {}
+            if self.config.task != Const.STRUCTURE:
+                data_info = self.data_processor.analyze_forward_output(name, module, module_input_output)
+                self.set_is_recomputable(data_info, is_recompute)
+            if self.config.level == Const.LEVEL_L2:
+                return
+            self.call_stack_collect(name)
+            self.handle_data(name, data_info, flush=self.data_processor.is_terminated)
+
+        except Exception:
+            tb = traceback.format_exc()
+            self.data_writer.write_error_log(
+                f"[ERROR] forward_output_data_collect failed: name={name}, pid={pid}\n{tb}"
+            )
 
     def forward_data_collect_only_tensor(self, name, module, pid, module_input_output):
-        if not self.check_scope_and_pid(self.scope, name, pid):
-            return
-
-        self.data_processor.analyze_forward(name, module, module_input_output)
+        try:
+            if not self.check_scope_and_pid(self.scope, name, pid):
+                return
+            self.data_processor.analyze_forward(name, module, module_input_output)
 
+        except Exception:
+            tb = traceback.format_exc()
+            self.data_writer.write_error_log(
+                f"[ERROR] forward_data_collect_only_tensor failed: name={name}, pid={pid}\n{tb}"
+            )
 
     def forward_data_collect(self, name, module, pid, module_input_output, is_recompute=None):
-        self.update_construct(name)
-        if not self.check_scope_and_pid(self.scope, name, pid):
-            return
-
-        data_info = {}
-        if self.config.task != Const.STRUCTURE:
-            data_info = self.data_processor.analyze_forward(name, module, module_input_output)
-            self.set_is_recomputable(data_info, is_recompute)
-        self.call_stack_collect(name)
-        self.handle_data(name, data_info, flush=self.data_processor.is_terminated)
+        try:
+
+            self.update_construct(name)
+            if not self.check_scope_and_pid(self.scope, name, pid):
+                return
+            data_info = {}
+            if self.config.task != Const.STRUCTURE:
+                data_info = self.data_processor.analyze_forward(name, module, module_input_output)
+                self.set_is_recomputable(data_info, is_recompute)
+            self.call_stack_collect(name)
+            self.handle_data(name, data_info, flush=self.data_processor.is_terminated)
+
+        except Exception:
+            tb = traceback.format_exc()
+            self.data_writer.write_error_log(
+                f"[ERROR] forward_data_collect failed: name={name}, pid={pid}\n{tb}"
+            )
 
     def backward_data_collect_only_tensor(self, name, module, pid, module_input_output, is_recompute=None):
-        if not self.check_scope_and_pid(self.scope, name, pid):
-            return
+        try:
+            if not self.check_scope_and_pid(self.scope, name, pid):
+                return
+            self.data_processor.analyze_backward(name, module, module_input_output)
 
-        self.data_processor.analyze_backward(name, module, module_input_output)
+        except Exception:
+            tb = traceback.format_exc()
+            self.data_writer.write_error_log(
+                f"[ERROR] backward_data_collect_only_tensor failed: name={name}, pid={pid}\n{tb}"
+            )
 
     def backward_data_collect(self, name, module, pid, module_input_output, is_recompute=None):
-        self.update_construct(name)
-        if not self.check_scope_and_pid(self.scope, name, pid):
-            return
-
-        data_info = {}
-        if self.config.task != Const.STRUCTURE:
-            data_info = self.data_processor.analyze_backward(name, module, module_input_output)
-        if self.config.level == Const.LEVEL_L2:
-            return
-        # Get the name of the module that ran the backward pass
-        if data_info and name.split(Const.SEP)[0] in Const.MODULE_PREFIX:
-            module_name = name.rsplit(Const.SEP, 2)[0]
-            # Add the module name to the backward-module set, used during gradient collection to decide whether gradients need to be collected
-            self.backward_module_names[module_name] = True
-        self.handle_data(name, data_info, flush=self.data_processor.is_terminated)
+        try:
+            self.update_construct(name)
+            if not self.check_scope_and_pid(self.scope, name, pid):
+                return
+            data_info = {}
+            if self.config.task != Const.STRUCTURE:
+                data_info = self.data_processor.analyze_backward(name, module, module_input_output)
+            if self.config.level == Const.LEVEL_L2:
+                return
+            if data_info and name.split(Const.SEP)[0] in Const.MODULE_PREFIX:
+                module_name = name.rsplit(Const.SEP, 2)[0]
+                self.backward_module_names[module_name] = True
+            self.handle_data(name, data_info, flush=self.data_processor.is_terminated)
+
+        except Exception:
+            tb = traceback.format_exc()
+            self.data_writer.write_error_log(
+                f"[ERROR] backward_data_collect failed: name={name}, pid={pid}\n{tb}"
+            )
 
     def backward_input_data_collect(self, name, module, pid, module_input_output, is_recompute=None):
-        self.update_construct(name)
-        if not self.check_scope_and_pid(self.scope, name, pid):
-            return
-
-        data_info = {}
-        if self.config.task != Const.STRUCTURE:
-            data_info = self.data_processor.analyze_backward_input(name, module, module_input_output)
-            self.set_is_recomputable(data_info, is_recompute)
-        self.handle_data(name, data_info)
+        try:
+            self.update_construct(name)
+            if not self.check_scope_and_pid(self.scope, name, pid):
+                return
+            data_info = {}
+            if self.config.task != Const.STRUCTURE:
+                data_info = self.data_processor.analyze_backward_input(name, module, module_input_output)
+                self.set_is_recomputable(data_info, is_recompute)
+            self.handle_data(name, data_info)
+
+        except Exception:
+            tb = traceback.format_exc()
+            self.data_writer.write_error_log(
+                f"[ERROR] backward_input_data_collect failed: name={name}, pid={pid}\n{tb}"
+            )
 
     def backward_output_data_collect(self, name, module, pid, module_input_output, is_recompute=None):
-        self.update_construct(name)
-        if not self.check_scope_and_pid(self.scope, name, pid):
-            return
-
-        data_info = {}
-        if self.config.task != Const.STRUCTURE:
-            data_info = self.data_processor.analyze_backward_output(name, module, module_input_output)
-            self.set_is_recomputable(data_info, is_recompute)
-        self.handle_data(name, data_info)
+        try:
+            self.update_construct(name)
+            if not self.check_scope_and_pid(self.scope, name, pid):
+                return
+            data_info = {}
+            if self.config.task != Const.STRUCTURE:
+                data_info = self.data_processor.analyze_backward_output(name, module, module_input_output)
+                self.set_is_recomputable(data_info, is_recompute)
+            self.handle_data(name, data_info)
+
+        except Exception:
+            tb = traceback.format_exc()
+            self.data_writer.write_error_log(
+                f"[ERROR] backward_output_data_collect failed: name={name}, pid={pid}\n{tb}"
+            )
 
     def update_construct(self, name):
         if self.config.level not in DataCollector.level_without_construct:
@@ -228,20 +279,23 @@
         self.data_processor.update_iter(current_iter)
 
     def params_data_collect(self, name, param_name, pid, data):
-        grad_name = name + Const.SEP + Const.PARAMS_GRAD
-        self.update_api_or_module_name(grad_name)
-        # Check scope and pid, and whether this name has gone through a backward pass
-        if not self.check_scope_and_pid(self.scope, name, pid) and not self.backward_module_names.get(name):
-            # If there was no backward pass, the placeholder grad data written earlier has to be removed
-            if self.data_writer.cache_data.get("data"):
-                self.data_writer.cache_data.get("data").pop(grad_name, None)
-            return
-        data_info = self.data_processor.analyze_params(grad_name, param_name, data)
-        self.handle_data(grad_name, data_info, flush=self.data_processor.is_terminated)
-
+        try:
+            grad_name = name + Const.SEP + Const.PARAMS_GRAD
+            self.update_api_or_module_name(grad_name)
+            if not self.check_scope_and_pid(self.scope, name, pid) and not self.backward_module_names.get(name):
+                if self.data_writer.cache_data.get("data"):
+                    self.data_writer.cache_data.get("data").pop(grad_name, None)
+                return
+            data_info = self.data_processor.analyze_params(grad_name, param_name, data)
+            self.handle_data(grad_name, data_info, flush=self.data_processor.is_terminated)
+        except Exception:
+            tb = traceback.format_exc()
+            self.data_writer.write_error_log(
+                f"[ERROR] params_data_collect failed: "
+                f"name={name}, param_name={param_name}, pid={pid}\n{tb}"
+            )
 
     def debug_data_collect_forward(self, variable, name_with_count):
-
         data_info = self.data_processor.analyze_debug_forward(variable, name_with_count)
         name_with_count_category = name_with_count + Const.SEP + Const.DEBUG
         self.data_writer.update_debug({name_with_count_category: data_info})
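
In 8.1.1 every collector entry point above repeats the same wrapper: run the original body in a `try`, and on any exception format the traceback and hand it to `DataWriter.write_error_log`, so a dump failure no longer interrupts the training run. A decorator could express that pattern once; the sketch below is a hypothetical refactoring (the name `log_collect_errors` is not part of msprobe) and only assumes the instance exposes `self.data_writer.write_error_log`:

```python
import functools
import traceback


def log_collect_errors(func):
    """Run a collect method; log any exception instead of raising it."""
    @functools.wraps(func)
    def wrapper(self, name, *args, **kwargs):
        try:
            return func(self, name, *args, **kwargs)
        except Exception:
            tb = traceback.format_exc()
            self.data_writer.write_error_log(
                f"[ERROR] {func.__name__} failed: name={name}\n{tb}"
            )
            return None
    return wrapper


# Hypothetical usage on one of the methods above:
# @log_collect_errors
# def forward_data_collect(self, name, module, pid, module_input_output, is_recompute=None):
#     ...
```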
msprobe/core/data_dump/json_writer.py CHANGED
@@ -17,9 +17,11 @@ import csv
 import os
 import copy
 import threading
+import traceback
+from datetime import datetime, timezone, timedelta
 
 from msprobe.core.common.const import Const, FileCheckConst
-from msprobe.core.common.file_utils import change_mode, FileOpen, save_json, load_json
+from msprobe.core.common.file_utils import change_mode, FileOpen, save_json, load_json, check_path_before_create
 from msprobe.core.common.log import logger
 from msprobe.core.common.decorator import recursion_depth_decorator
 
@@ -35,6 +37,7 @@ class DataWriter:
         self.free_benchmark_file_path = None
         self.dump_tensor_data_dir = None
         self.debug_file_path = None
+        self.dump_error_info_path = None
        self.flush_size = 1000
         self.larger_flush_size = 20000
         self.cache_data = {}
@@ -42,6 +45,7 @@
         self.cache_construct = {}
         self.cache_debug = {}
         self.stat_stack_list = []
+        self._error_log_initialized = False
 
     @staticmethod
     def write_data_to_csv(result: list, result_header: tuple, file_path: str):
@@ -128,6 +132,7 @@
         self.dump_tensor_data_dir = dump_path_aggregation.dump_tensor_data_dir
         self.free_benchmark_file_path = dump_path_aggregation.free_benchmark_file_path
         self.debug_file_path = dump_path_aggregation.debug_file_path
+        self.dump_error_info_path = dump_path_aggregation.dump_error_info_path
 
     def flush_data_periodically(self):
         dump_data = self.cache_data.get(Const.DATA)
@@ -142,6 +147,31 @@
         if length % threshold == 0:
             self.write_json()
 
+    def write_error_log(self, message: str):
+        """
+        Write an error log entry:
+        - The first call opens the file in 'w' mode to truncate it; all later calls append in 'a' mode
+        - A timestamp is added
+        - The current call stack is written after the message (to make the log's origin easy to trace)
+        """
+        try:
+            mode = "w" if not self._error_log_initialized else "a"
+            self._error_log_initialized = True
+
+            check_path_before_create(self.dump_error_info_path)
+
+            with FileOpen(self.dump_error_info_path, mode) as f:
+                cst_timezone = timezone(timedelta(hours=8), name="CST")
+                timestamp = datetime.now(cst_timezone).strftime("%Y-%m-%d %H:%M:%S %z")
+                f.write(f"[{timestamp}] {message}\n")
+                f.write("Call stack (most recent call last):\n")
+
+                f.write("".join(traceback.format_stack()[:-1]))  # drop this frame itself
+                f.write("\n")
+        except Exception as e:
+            # If even writing the log fails, fall back to a logger warning
+            logger.warning(f"[FallbackError] Failed to write error log: {e}")
+
     def update_data(self, new_data):
         with lock:
             if not isinstance(new_data, dict) or len(new_data.keys()) != 1:
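
Taken on its own, `write_error_log` implements first-call-truncate, then-append semantics: the first call in a process opens `dump_error_info.log` with mode `'w'`, every later call appends, and each entry carries a UTC+8 ("CST") timestamp. A minimal standalone sketch of that logic, using plain `open()` in place of msprobe's `FileOpen` wrapper and omitting the call-stack dump:

```python
from datetime import datetime, timezone, timedelta

_initialized = False  # module-level stand-in for DataWriter._error_log_initialized


def write_error_log(path: str, message: str) -> None:
    """First call truncates the log file; every later call appends."""
    global _initialized
    mode = "w" if not _initialized else "a"
    _initialized = True
    cst = timezone(timedelta(hours=8), name="CST")
    stamp = datetime.now(cst).strftime("%Y-%m-%d %H:%M:%S %z")
    with open(path, mode) as f:
        f.write(f"[{stamp}] {message}\n")


write_error_log("dump_error_info.log", "first entry")   # truncates any old log
write_error_log("dump_error_info.log", "second entry")  # appends
```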
msprobe/core/debugger/precision_debugger.py CHANGED
@@ -12,6 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 import os
 
 from msprobe.core.common.const import Const, FileCheckConst, MsgConst
@@ -46,18 +47,14 @@ class BasePrecisionDebugger:
         if self.initialized:
             return
         self.initialized = True
-        self.check_input_params(config_path, task, dump_path, level)
-        self.common_config, self.task_config = self.parse_config_path(config_path, task)
+        self._check_input_params(config_path, task, dump_path, level)
+        self.common_config, self.task_config = self._parse_config_path(config_path, task)
         self.task = self.common_config.task
         if step is not None:
             self.common_config.step = get_real_step_or_rank(step, Const.STEP)
 
     @staticmethod
-    def get_task_config(task, json_config):
-        raise NotImplementedError("Subclass must implment get_task_config")
-
-    @staticmethod
-    def check_input_params(config_path, task, dump_path, level):
+    def _check_input_params(config_path, task, dump_path, level):
         if not config_path:
             config_path = os.path.join(os.path.dirname(__file__), "../../config.json")
         if config_path is not None:
@@ -81,14 +78,9 @@
             raise MsprobeException(
                 MsprobeException.INVALID_PARAM_ERROR, f"level must be one of {Const.LEVEL_LIST}")
 
-    @classmethod
-    def get_instance(cls):
-        instance = cls._instance
-        if not instance:
-            raise Exception(MsgConst.NOT_CREATED_INSTANCE)
-        if instance.task in BasePrecisionDebugger.tasks_not_need_debugger:
-            instance = None
-        return instance
+    @staticmethod
+    def _get_task_config(task, json_config):
+        raise NotImplementedError("Subclass must implement _get_task_config")
 
     @classmethod
     def forward_backward_dump_end(cls):
@@ -129,15 +121,24 @@
             raise Exception(MsgConst.NOT_CREATED_INSTANCE)
         instance.service.restore_custom_api(module, api)
 
-    def parse_config_path(self, json_file_path, task):
+    @classmethod
+    def _get_instance(cls):
+        instance = cls._instance
+        if not instance:
+            raise Exception(MsgConst.NOT_CREATED_INSTANCE)
+        if instance.task in BasePrecisionDebugger.tasks_not_need_debugger:
+            instance = None
+        return instance
+
+    def _parse_config_path(self, json_file_path, task):
         if not json_file_path:
             json_file_path = os.path.join(os.path.dirname(__file__), "../../config.json")
         json_config = load_json(json_file_path)
         common_config = CommonConfig(json_config)
         if task:
-            task_config = self.get_task_config(task, json_config)
+            task_config = self._get_task_config(task, json_config)
         else:
             if not common_config.task:
                 common_config.task = Const.STATISTICS
-            task_config = self.get_task_config(common_config.task, json_config)
+            task_config = self._get_task_config(common_config.task, json_config)
         return common_config, task_config
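
The net effect in this file is a rename of the overridable hooks (`get_task_config` → `_get_task_config`, `check_input_params` → `_check_input_params`, `get_instance` → `_get_instance`, `parse_config_path` → `_parse_config_path`), so framework-specific debuggers must now override the underscore-prefixed names. A hedged sketch of a conforming subclass (the class name and the config lookup are illustrative, and it assumes msprobe is installed):

```python
from msprobe.core.debugger.precision_debugger import BasePrecisionDebugger


class DemoPrecisionDebugger(BasePrecisionDebugger):
    @staticmethod
    def _get_task_config(task, json_config):
        # Illustrative only: return the task-specific sub-config,
        # falling back to an empty dict when the key is absent.
        return json_config.get(task, {})
```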
msprobe/core/service.py CHANGED
@@ -331,6 +331,7 @@ class BaseService(ABC):
         dump_path_aggregation.dump_file_path = os.path.join(dump_dir, "dump.json")
         dump_path_aggregation.stack_file_path = os.path.join(dump_dir, "stack.json")
         dump_path_aggregation.construct_file_path = os.path.join(dump_dir, "construct.json")
+        dump_path_aggregation.dump_error_info_path = os.path.join(dump_dir, "dump_error_info.log")
         dump_path_aggregation.dump_tensor_data_dir = dump_data_dir
         dump_path_aggregation.debug_file_path = os.path.join(dump_dir, "debug.json")
         dump_path_aggregation.free_benchmark_file_path = os.path.join(dump_dir, "free_benchmark.csv")
msprobe/core/single_save/single_comparator.py CHANGED
@@ -181,32 +181,32 @@ class SingleComparator:
 
     @classmethod
     def compare_single_tag(cls, tag, array_paths1, array_paths2, output_dir):
-        try:
-            data = []
-            paths1 = array_paths1.get(tag, [])
-            paths2 = array_paths2.get(tag, [])
-            path_dict1 = {(step, rank, micro_step, array_id): path for step, rank, micro_step, array_id, path in paths1}
-            path_dict2 = {(step, rank, micro_step, array_id): path for step, rank, micro_step, array_id, path in paths2}
-            common_keys = set(path_dict1.keys()) & set(path_dict2.keys())
-            for key in common_keys:
-                try:
-                    array1 = np.load(path_dict1[key])
-                    array2 = np.load(path_dict2[key])
-                    result = cls.compare_arrays(array1, array2)
-                    step, rank, micro_step, array_id = key
-                    data.append([
-                        step, rank, micro_step, array_id,
-                        list(array1.shape), list(array2.shape),
-                        result.same_percentage,
-                        result.first_mismatch_index,
-                        result.max_abs_error,
-                        result.max_relative_error,
-                        result.percentage_within_thousandth,
-                        result.percentage_within_hundredth
-                    ])
-                except Exception as e:
-                    logger.error(f"Error comparing {path_dict1[key]} and {path_dict2[key]}: {e}")
+        data = []
+        paths1 = array_paths1.get(tag, [])
+        paths2 = array_paths2.get(tag, [])
+        path_dict1 = {(step, rank, micro_step, array_id): path for step, rank, micro_step, array_id, path in paths1}
+        path_dict2 = {(step, rank, micro_step, array_id): path for step, rank, micro_step, array_id, path in paths2}
+        common_keys = set(path_dict1.keys()) & set(path_dict2.keys())
+        for key in common_keys:
+            try:
+                array1 = np.load(path_dict1[key])
+                array2 = np.load(path_dict2[key])
+                result = cls.compare_arrays(array1, array2)
+                step, rank, micro_step, array_id = key
+                data.append([
+                    step, rank, micro_step, array_id,
+                    list(array1.shape), list(array2.shape),
+                    result.same_percentage,
+                    result.first_mismatch_index,
+                    result.max_abs_error,
+                    result.max_relative_error,
+                    result.percentage_within_thousandth,
+                    result.percentage_within_hundredth
+                ])
+            except Exception as e:
+                logger.error(f"Error comparing {path_dict1[key]} and {path_dict2[key]}: {e}")
 
+        try:
             df = pd.DataFrame(data, columns=SingleComparator.result_header)
             df = df.sort_values(by=['step', 'rank', 'micro_step', 'id'])
             # Build the full output file path
msprobe/core/single_save/single_saver.py CHANGED
@@ -14,6 +14,7 @@
 # limitations under the License.
 
 import os
+from collections import defaultdict
 
 from msprobe.core.common.file_utils import create_directory, save_json
 from msprobe.core.common.const import Const
@@ -36,7 +37,7 @@ class SingleSave:
             cls._instance.dump_path = dump_path
             cls._instance.rank = FmkAdp.get_rank_id()
             cls._instance.step_count = 0
-            cls._instance.cache_dict = {}
+            cls._instance.tag_count = defaultdict(int)
         return cls._instance
 
     @staticmethod
@@ -109,13 +110,7 @@
     @classmethod
     def step(cls):
         instance = cls._instance
-        for key, value in instance.cache_dict.items():
-            if not value["have_micro_batch"]:
-                cls.save_ex({key: value["data"][0]})
-            else:
-                for i, data in enumerate(value["data"]):
-                    cls.save_ex({key: data}, micro_batch=i)
-        instance.cache_dict = {}
+        instance.tag_count = defaultdict(int)
         instance.step_count += 1
 
     @classmethod
@@ -127,14 +122,8 @@
                 "Skip current save process.")
             return
         for key, value in data.items():
-            if key not in instance.cache_dict:
-                instance.cache_dict[key] = {
-                    "have_micro_batch": False,
-                    "data": [value]
-                }
-            else:
-                instance.cache_dict[key]["have_micro_batch"] = True
-                instance.cache_dict[key]["data"].append(value)
+            cls.save_ex({key: value}, micro_batch=instance.tag_count[key])
+            instance.tag_count[key] += 1
 
     @classmethod
     def _analyze_list_tuple_data(cls, data, data_name=None, save_dir=None):
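
This rewrite removes the `cache_dict` buffering entirely: `save` no longer accumulates values until `step()` replays them, it writes each tag immediately and derives the `micro_batch` index from a per-step `defaultdict(int)` counter that `step()` resets. A standalone sketch of the counting behavior (here `save_ex` is a stand-in that only prints what msprobe would write to disk):

```python
from collections import defaultdict

tag_count = defaultdict(int)  # per-step occurrence counter, keyed by tag


def save_ex(data, micro_batch=0):
    print(f"save tag={list(data)[0]} micro_batch={micro_batch}")


def save(data):
    for key, value in data.items():
        save_ex({key: value}, micro_batch=tag_count[key])
        tag_count[key] += 1


def step():
    tag_count.clear()  # msprobe re-binds a fresh defaultdict(int) instead


save({"loss": 0.5})  # -> micro_batch=0
save({"loss": 0.4})  # -> micro_batch=1 (same tag, same step)
step()
save({"loss": 0.3})  # -> micro_batch=0 again after step()
```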
msprobe/docs/01.installation.md CHANGED
@@ -16,6 +16,7 @@ pip install mindstudio-probe
 
 |Version|Release Date|Supported PyTorch Versions|Supported MindSpore Versions|Download Link|Checksum|
 |:--:|:--:|:--:|:--:|:--:|:--:|
+|8.1.0|2025.6.14|1.11/2.0/2.1/2.2|2.4.0/2.5.0/2.6.0|[mindstudio_probe-8.1.0-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/msprobe/8.1/mindstudio_probe-8.1.0-py3-none-any.whl)|d10c0a57d073bbe7c681042a11e93a0eaaaf5aa45e1cec997142ce2593d77afd|
 |8.0.0|2025.5.07|1.11/2.0/2.1/2.2|2.4.0/2.5.0/2.6.0|[mindstudio_probe-8.0.0-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/msprobe/8.0/mindstudio_probe-8.0.0-py3-none-any.whl)|6810eade7ae99e3b24657d5cab251119882decd791aa76a7aeeb94dea767daec|
 |1.3.0|2025.4.17|1.11/2.0/2.1/2.2|2.4.0/2.5.0/2.6.0|[mindstudio_probe-1.3.0-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/msprobe/1.3/mindstudio_probe-1.3.0-py3-none-any.whl)|85dbc5518b5c23d29c67d7b85d662517d0318352f372891f8d91e73e71b439c3|
 |1.2.2|2025.3.03|1.11/2.0/2.1/2.2|2.4.0|[mindstudio_probe-1.2.2-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/msprobe/1.2/mindstudio_probe-1.2.2-py3-none-any.whl)|961411bb460d327ea51d6ca4d0c8e8c5565f07c0852d7b8592b781ca35b87212|
msprobe/docs/05.data_dump_PyTorch.md CHANGED
@@ -471,12 +471,14 @@ debugger.step()
 | | | |      # When the model argument passed at dump time is List[torch.nn.Module] or Tuple[torch.nn.Module], module-level data names include the module's index in the list, i.e. {Module}.{index}.*, where * stands for the three module-level naming formats above, e.g. Module.0.conv1.Conv2d.forward.0.input.0.pt.
 │ | | ├── dump.json
 │ | | ├── stack.json
+│ | | ├── dump_error_info.log
 │ | | └── construct.json
 │ | ├── rank1
 | | | ├── dump_tensor_data
 | | | | └── ...
 │ | | ├── dump.json
 │ | | ├── stack.json
+│ | | ├── dump_error_info.log
 | | | └── construct.json
 │ | ├── ...
 │ | |
@@ -488,6 +490,7 @@
 * `rank`: device ID; each card's data is saved in the corresponding `rank{ID}` directory. In non-distributed scenarios there is no rank ID and the directory is simply named rank.
 * `dump_tensor_data`: holds the collected tensor data.
 * `dump.json`: stores statistics for the forward/backward data of APIs or Modules, including the API or Module name of each dumped item and its dtype, shape, max, min, mean and L2norm (L2 norm, square root) statistics, plus CRC-32 values when summary_mode="md5" is configured. See the [dump.json file description](./27.dump_json_instruction.md#1-PyTorch场景下的dump.json文件) for details.
+* `dump_error_info.log`: created only when the dump tool reports an error; records the dump error logs.
 * `stack.json`: call-stack information for the APIs/Modules.
 * `construct.json`: the hierarchical structure; when level is L1, construct.json is empty.
 
msprobe/docs/06.data_dump_MindSpore.md CHANGED
@@ -496,12 +496,14 @@ An example dump result directory structure is as follows:
 | | | |      # When the model argument passed at dump time is List[mindspore.nn.Cell] or Tuple[mindspore.nn.Cell], module-level data names include the module's index in the list, i.e. {Cell}.{index}.*, where * stands for the three module-level naming formats above, e.g. Cell.0.relu.ReLU.forward.0.input.0.npy.
 │ | | ├── dump.json
 │ | | ├── stack.json
+│ | | ├── dump_error_info.log
 │ | | └── construct.json
 │ | ├── rank1
 | | | ├── dump_tensor_data
 | | | | └── ...
 │ | | ├── dump.json
 │ | | ├── stack.json
+│ | | ├── dump_error_info.log
 | | | └── construct.json
 │ | ├── ...
 │ | |
@@ -514,6 +516,7 @@
 * `rank`: device ID; each card's data is saved in the corresponding `rank{ID}` directory. In non-distributed scenarios there is no rank ID and the directory is simply named rank.
 * `dump_tensor_data`: holds the collected tensor data.
 * `dump.json`: stores statistics for the forward/backward data of APIs or Cells, including the API or Cell name of each dumped item and its dtype, shape, max, min, mean and L2norm (L2 norm, square root) statistics, plus CRC-32 values when summary_mode="md5" is configured. See the [dump.json file description](./27.dump_json_instruction.md#2-mindspore-场景下的-dumpjson-文件) for details.
+* `dump_error_info.log`: created only when the dump tool reports an error; records the dump error logs.
 * `stack.json`: call-stack information for the APIs/Cells.
 * `construct.json`: the hierarchical structure; when level is L1, construct.json is empty.
 
msprobe/docs/08.accuracy_checker_online_PyTorch.md CHANGED
@@ -88,7 +88,7 @@ extendedKeyUsage = serverAuth
 EOF
 )
 
-# Generate the server key pair; server_password
+# Generate the server key pair; server_password is the private-key passphrase, shown for demonstration only and should be replaced before use
 openssl genrsa -aes256 -passout pass:server_password -out server.key 3072
 # Generate a certificate signing request from the server key pair
 openssl req -new -key server.key -passin pass:server_password -subj "/CN=*example.com/O=Test, Inc./C=CN/ST=Zhejiang/L=Hangzhou" -out server.csr
@@ -115,7 +115,7 @@ default_ca = CA_default
 database = ./index.txt
 default_md = sha256
 
-# Revoke the certificate client.crt
+# Revoke the certificate client.crt; ca_password is the CA private-key passphrase and must match the one used when the CA was created
 openssl ca -revoke client.crt -config ca.cnf -cert ca.crt -keyfile ca.key -passin pass:ca_password
 # Generate the CRL file
 openssl ca -gencrl -config ca.cnf -cert ca.crt -keyfile ca.key -passin pass:ca_password -out crl.pem -crldays 30