mindstudio-probe 8.2.0__py3-none-any.whl → 8.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mindstudio_probe-8.2.0.dist-info → mindstudio_probe-8.2.1.dist-info}/METADATA +2 -2
- {mindstudio_probe-8.2.0.dist-info → mindstudio_probe-8.2.1.dist-info}/RECORD +63 -61
- msprobe/README.md +4 -4
- msprobe/core/common/const.py +6 -0
- msprobe/core/common/db_manager.py +35 -4
- msprobe/core/common/file_utils.py +28 -5
- msprobe/core/common/megatron_utils.py +59 -0
- msprobe/core/common/utils.py +14 -3
- msprobe/core/compare/diff_analyze/first_diff_analyze.py +16 -4
- msprobe/core/compare/diff_analyze/ignore_op_list.yaml +3 -0
- msprobe/core/compare/find_first/analyzer.py +8 -7
- msprobe/core/compare/find_first/graph.py +11 -3
- msprobe/core/compare/find_first/utils.py +3 -2
- msprobe/core/compare/highlight.py +13 -6
- msprobe/core/compare/multiprocessing_compute.py +17 -10
- msprobe/core/compare/utils.py +14 -5
- msprobe/core/data_dump/data_collector.py +18 -21
- msprobe/core/data_dump/data_processor/pytorch_processor.py +43 -20
- msprobe/core/data_dump/json_writer.py +18 -8
- msprobe/core/data_dump/scope.py +4 -6
- msprobe/core/hook_manager.py +21 -0
- msprobe/core/service.py +2 -0
- msprobe/core/single_save/single_comparator.py +16 -3
- msprobe/docs/01.installation.md +7 -5
- msprobe/docs/04.kernel_dump_PyTorch.md +1 -1
- msprobe/docs/06.data_dump_MindSpore.md +1 -1
- msprobe/docs/10.accuracy_compare_PyTorch.md +46 -5
- msprobe/docs/14.data_parse_PyTorch.md +1 -1
- msprobe/docs/19.monitor.md +2 -0
- msprobe/docs/21.visualization_PyTorch.md +15 -80
- msprobe/docs/22.visualization_MindSpore.md +20 -104
- msprobe/docs/23.generate_operator_PyTorch.md +1 -1
- msprobe/docs/26.data_dump_PyTorch_baseline.md +7 -7
- msprobe/docs/img/visualization/vis_browser_1.png +0 -0
- msprobe/docs/img/visualization/vis_match_info.png +0 -0
- msprobe/docs/img/visualization/vis_precision_info.png +0 -0
- msprobe/docs/img/visualization/vis_search_info.png +0 -0
- msprobe/docs/img/visualization/vis_show_info.png +0 -0
- msprobe/docs/img/visualization/vis_showcase.png +0 -0
- msprobe/docs/img/visualization/vis_unmatch_info.png +0 -0
- msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +1 -1
- msprobe/mindspore/cell_processor.py +33 -5
- msprobe/mindspore/compare/common_dir_compare.py +22 -26
- msprobe/mindspore/debugger/precision_debugger.py +1 -1
- msprobe/mindspore/dump/cell_dump_process.py +73 -62
- msprobe/mindspore/dump/graph_mode_cell_dump.py +21 -10
- msprobe/mindspore/dump/hook_cell/ms_hook_manager.py +2 -0
- msprobe/pytorch/compare/utils.py +2 -1
- msprobe/pytorch/dump/module_dump/hook_wrapper.py +10 -7
- msprobe/pytorch/dump/module_dump/module_processer.py +15 -8
- msprobe/pytorch/monitor/module_hook.py +28 -9
- msprobe/pytorch/online_dispatch/dispatch.py +42 -24
- msprobe/visualization/builder/graph_builder.py +169 -64
- msprobe/visualization/builder/graph_merger.py +0 -1
- msprobe/visualization/builder/msprobe_adapter.py +1 -1
- msprobe/visualization/db_utils.py +25 -2
- msprobe/visualization/graph/base_node.py +0 -24
- msprobe/visualization/graph/graph.py +5 -14
- msprobe/visualization/graph_service.py +29 -53
- {mindstudio_probe-8.2.0.dist-info → mindstudio_probe-8.2.1.dist-info}/LICENSE +0 -0
- {mindstudio_probe-8.2.0.dist-info → mindstudio_probe-8.2.1.dist-info}/WHEEL +0 -0
- {mindstudio_probe-8.2.0.dist-info → mindstudio_probe-8.2.1.dist-info}/entry_points.txt +0 -0
- {mindstudio_probe-8.2.0.dist-info → mindstudio_probe-8.2.1.dist-info}/top_level.txt +0 -0
msprobe/mindspore/dump/graph_mode_cell_dump.py
CHANGED

@@ -14,7 +14,8 @@
 # limitations under the License.
 
 import os
-
+import glob
+import tempfile
 import mindspore as ms
 from mindspore import hal, ops, Tensor
 from mindspore.ops.primitive import _run_op
@@ -28,6 +29,7 @@ import msprobe.mindspore.dump.cell_dump_process as cellDumperWithDumpGradient
 import msprobe.mindspore.dump.cell_dump_with_insert_gradient as cellDumperWithInsertGradient
 
 tensordump_flag = True
+DEFAULT_RANK_DIR = "rank0"
 try:
     from mindspore._c_expression import _tensordump_set_step
 except ImportError:
@@ -41,8 +43,6 @@ except ImportError:
 
 
 class GraphModeCellDump:
-    task = CoreConst.STATISTICS
-
     def __init__(self, config: DebuggerConfig, model, strict=True):
         self.net = model
         self.white_list = []
@@ -55,29 +55,40 @@ class GraphModeCellDump:
         self.list = config.list
         self.data_mode = config.data_mode
         self.file_format = config.file_format
-        GraphModeCellDump.task = config.task
         self.summary_mode = config.summary_mode
+        self.task = config.task
         self.check_config(strict)
         self.set_step()
 
     @staticmethod
-    def step():
+    def step(dump_path, step_list, task):
         # Update the TensorDump step
-        if
+        if task == CoreConst.TENSOR:
             hal.synchronize()
             temp_tensor = ms.Tensor([1], dtype=ms.float32)
-
-
-
+            rank_id = os.environ.get('RANK_ID')
+            rank_dir = DEFAULT_RANK_DIR
+
+            if rank_id is not None:
+                rank_dir = CoreConst.RANK + str(rank_id)
+
+            with tempfile.TemporaryDirectory(dir=dump_path, prefix=rank_dir) as temp_dir:
+                save_file_flag = f"{temp_dir}/step_{Runtime.step_count}"
+                _run_op(ops.TensorDump(), "TensorDump", (save_file_flag, temp_tensor))
+                step_flag = "<tensordump-update-step>"
+                _run_op(ops.TensorDump(), "TensorDump", (step_flag, temp_tensor))
+                ops.tensordump(step_flag, temp_tensor)
+                cellDumperWithDumpGradient.process_step(dump_path, temp_dir, Runtime.step_count, step_list)
 
         # Update the step count for static-graph (KBK) dump
-        if
+        if task == CoreConst.STATISTICS:
             if not graph_step_flag:
                 raise Exception(
                     "Importing _dump_step failed, "
                     "please use the latest version package of MindSpore."
                 )
             _dump_step(1)
+            cellDumperWithDumpGradient.process_statistics_step(dump_path, Runtime.step_count, step_list)
 
     def check_config(self, strict):
         if not self.net:
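Note: the reworked step() publishes its per-step marker through a throwaway scratch directory. Below is a minimal sketch of the tempfile.TemporaryDirectory(dir=..., prefix=...) pattern it relies on; the dump path is hypothetical and a plain marker file stands in for the TensorDump marker tensor.

    import os
    import tempfile

    dump_path = "./dump"  # hypothetical dump root, for illustration only
    os.makedirs(dump_path, exist_ok=True)
    rank_dir = "rank" + os.environ.get("RANK_ID", "0")

    # TemporaryDirectory(dir=..., prefix=...) yields a uniquely named scratch
    # directory such as ./dump/rank0abc123 and deletes it on exit, so markers
    # written inside never collide across ranks or steps.
    with tempfile.TemporaryDirectory(dir=dump_path, prefix=rank_dir) as temp_dir:
        marker = os.path.join(temp_dir, "step_3")  # stands in for save_file_flag
        open(marker, "w").close()
        print("marker lives at", marker)
    # the scratch directory and its marker are gone at this point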
msprobe/mindspore/dump/hook_cell/ms_hook_manager.py
CHANGED

@@ -203,10 +203,12 @@ class MindsporeHookManager(BaseHookManager):
                 return
 
             with ThreadSafe():
+                original_state = self.ensure_gc_enabled()
                 BaseHookManager.inner_switch[tid] = True
                 module_input = ModuleBackwardInputs(grad_input=grad_input)
                 self.data_collector.update_api_or_module_name(full_name)
                 self.data_collector.backward_input_data_collect(full_name, module, self._pid, module_input)
                 BaseHookManager.inner_switch[tid] = False
+                self.restore_gc_state(original_state)
 
         return backward_pre_hook
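Note: the two added lines bracket backward-input collection with a garbage-collector save/restore. The helper bodies live on BaseHookManager and are not shown in this diff; the sketch below is only the save/restore pattern their names imply, assuming they wrap Python's gc module.

    import gc

    def ensure_gc_enabled():
        # assumed semantics: force the collector on, remembering the prior state
        original_state = gc.isenabled()
        if not original_state:
            gc.enable()
        return original_state

    def restore_gc_state(original_state):
        # put the collector back the way the caller found it
        if not original_state:
            gc.disable()

    state = ensure_gc_enabled()
    try:
        pass  # ... backward input data collection happens here ...
    finally:
        restore_gc_state(state)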
msprobe/pytorch/compare/utils.py
CHANGED
@@ -35,7 +35,8 @@ def read_pt_data(dir_path, file_name):
         data_value = load_pt(data_path, to_cpu=True).detach()
     except RuntimeError as e:
         # Catch exceptions raised inside load_pt
-
+        data_path_file_name = os.path.basename(data_path)
+        logger.error(f"Failed to load the .pt file at {data_path_file_name}.")
         raise CompareException(CompareException.INVALID_FILE_ERROR) from e
     except AttributeError as e:
         # Catch exceptions raised by the detach call
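Note: the new error path logs only the file's basename, which identifies the offending dump file without echoing the full directory layout into the log. A small illustration (the path is hypothetical):

    import os

    data_path = "/workspace/dump/rank0/Functional.linear.0.forward.input.0.pt"  # hypothetical
    print(f"Failed to load the .pt file at {os.path.basename(data_path)}.")
    # -> Failed to load the .pt file at Functional.linear.0.forward.input.0.pt.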
msprobe/pytorch/dump/module_dump/hook_wrapper.py
CHANGED

@@ -24,8 +24,11 @@ from msprobe.pytorch.common.log import logger
 
 
 def wrap_setup_backward_hook(func):
-    def requires_clone(tensor):
-
+    def requires_clone(tensor, need_check_leaf=False):
+        need_clone = isinstance(tensor, torch.Tensor) and tensor.requires_grad and torch.is_grad_enabled()
+        if need_check_leaf:
+            need_clone &= tensor.grad_fn is not None
+        return need_clone
 
     @recursion_depth_decorator("Dump: wrap_setup_backward_hook.parse_tensor", max_depth=Const.DUMP_MAX_DEPTH)
     def parse_tensor(item, tensor_list):
@@ -39,20 +42,20 @@ def wrap_setup_backward_hook(func):
             parse_tensor(value, tensor_list)
 
     @recursion_depth_decorator("Dump: wrap_setup_backward_hook.rebuild_args", max_depth=Const.DUMP_MAX_DEPTH)
-    def rebuild_args(item, tensor_iter):
-        if requires_clone(item):
+    def rebuild_args(item, tensor_iter, need_check_leaf=False):
+        if requires_clone(item, need_check_leaf):
             result = next(tensor_iter)
             if hasattr(result, "_base") and result._base is not None:
                 if torch._C._autograd._get_creation_meta(result) != torch._C._autograd.CreationMeta(0):
                     torch._C._autograd._set_creation_meta(result, torch._C._autograd.CreationMeta(0))
-            return result
+            return result
         if isinstance(item, list):
             for index, value in enumerate(item):
-                item[index] = rebuild_args(value, tensor_iter)
+                item[index] = rebuild_args(value, tensor_iter, need_check_leaf=True)
             return item
         if isinstance(item, dict):
             for key, value in item.items():
-                item[key] = rebuild_args(value, tensor_iter)
+                item[key] = rebuild_args(value, tensor_iter, need_check_leaf=True)
             return item
         if isinstance(item, tuple):
             if hasattr(item, '_fields'):
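Note: need_check_leaf matters because leaf tensors have no grad_fn, so tensors sitting inside lists and dicts are now cloned only when they are non-leaf. A standalone restatement of the added predicate showing the distinction:

    import torch

    def requires_clone(tensor, need_check_leaf=False):
        # clone only grad-tracking tensors; with need_check_leaf, additionally
        # require a grad_fn, which leaf tensors never have
        need_clone = isinstance(tensor, torch.Tensor) and tensor.requires_grad and torch.is_grad_enabled()
        if need_check_leaf:
            need_clone &= tensor.grad_fn is not None
        return need_clone

    leaf = torch.ones(2, requires_grad=True)
    non_leaf = leaf * 2
    print(requires_clone(leaf, need_check_leaf=True))      # False: a leaf has no grad_fn
    print(requires_clone(non_leaf, need_check_leaf=True))  # True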
msprobe/pytorch/dump/module_dump/module_processer.py
CHANGED

@@ -23,13 +23,15 @@ from torch.utils.hooks import BackwardHook, RemovableHandle
 from msprobe.core.common.const import Const
 from msprobe.core.common.runtime import Runtime
 from msprobe.core.common.utils import ModuleQueue, ThreadSafe
+from msprobe.core.common.megatron_utils import wrap_megatron_step, get_micro_step, is_megatron
 from msprobe.core.data_dump.scope import BaseScope, ModuleRangeScope, MixRangeScope
 from msprobe.pytorch.common.log import logger
 from msprobe.pytorch.common.utils import is_torch_nn_module, register_forward_pre_hook
 from msprobe.pytorch.dump.module_dump.hook_wrapper import wrap_setup_input_output_hook
 
 torch_version_above_or_equal_2 = torch.__version__.split('+')[0] >= '2.0'
-
+torch_version_above_or_equal_21 = torch.__version__.split('+')[0] >= '2.1'
+if torch_version_above_or_equal_21:
     from torch.utils.checkpoint import _StopRecomputationError
 
 
@@ -61,7 +63,7 @@ def wrap_forward_with_hook_safety(module):
             hook_fn = list(module._forward_hooks.values())[0]
             hook_fn(module, args, kwargs, exception_output)
             raise e
-    if
+    if torch_version_above_or_equal_21:
         module.forward = wrapped_forward
 
 
@@ -82,6 +84,8 @@ class ModuleProcesser:
             from megatron.core.pipeline_parallel import schedules
             origin_func_id = id(schedules.deallocate_output_tensor)
             schedules.deallocate_output_tensor = wrap_megatron_deallocate(schedules.deallocate_output_tensor)
+            schedules.forward_step = wrap_megatron_step(schedules.forward_step)
+            schedules.backward_step = wrap_megatron_step(schedules.backward_step, is_forward=False)
             for module in list(sys.modules.values()):
                 if module.__name__ == 'schedules':
                     continue
@@ -258,14 +262,16 @@ class ModuleProcesser:
                 ModuleProcesser.module_stack[tid] = []
 
             if self.module_stack[tid]:
-                ModuleProcesser.module_node[full_name] = self.module_stack[tid][-1]
+                ModuleProcesser.module_node[full_name] = self.module_stack[tid][-1] if not is_megatron() \
+                    else [self.module_stack[tid][-1], get_micro_step()]
             else:
                 parent_name = ModuleProcesser.module_queue.find_last(full_name)
-                ModuleProcesser.module_node[full_name] = parent_name
+                ModuleProcesser.module_node[full_name] = parent_name if not is_megatron() \
+                    else [parent_name, get_micro_step()]
 
             ModuleProcesser.module_queue.add_name(full_name)
             ModuleProcesser.module_stack[tid].append(full_name)
-            ModuleProcesser.api_parent_node[tid] = full_name
+            ModuleProcesser.api_parent_node[tid] = full_name if not is_megatron() else [full_name, get_micro_step()]
             if self.scope:
                 self.scope.begin_module(full_name)
 
@@ -273,14 +279,15 @@ class ModuleProcesser:
         tid = threading.get_ident()
         if torch_version_above_or_equal_2 or is_forward:
             ModuleProcesser.module_queue.remove_name(full_name)
-            ModuleProcesser.api_parent_node[tid] = None
+            ModuleProcesser.api_parent_node[tid] = None if not is_megatron() else [None, get_micro_step()]
             if self.module_stack.get(tid):
                 ModuleProcesser.module_stack[tid].pop()
             if self.module_stack.get(tid):
-                ModuleProcesser.api_parent_node[tid] = ModuleProcesser.module_stack[tid][-1]
+                ModuleProcesser.api_parent_node[tid] = ModuleProcesser.module_stack[tid][-1] if not is_megatron() \
+                    else [ModuleProcesser.module_stack[tid][-1], get_micro_step()]
             if self.scope:
                 self.scope.end_module(full_name)
         else:
             if self.scope:
                 self.scope.begin_module(full_name)
-            ModuleProcesser.api_parent_node[tid] = full_name
+            ModuleProcesser.api_parent_node[tid] = full_name if not is_megatron() else [full_name, get_micro_step()]
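Note: megatron_utils.py is new in this release (+59 lines) but its body does not appear in this diff. The sketch below is only the counter interface the call sites imply; the implementation shown is assumed, not the package's.

    _micro_step = 0
    _megatron_active = False

    def is_megatron():
        return _megatron_active

    def get_micro_step():
        return _micro_step

    def wrap_megatron_step(step_func, is_forward=True):
        # wraps megatron's schedules.forward_step / backward_step so module
        # records can be tagged with the current pipeline micro-batch
        def wrapper(*args, **kwargs):
            global _micro_step, _megatron_active
            _megatron_active = True
            result = step_func(*args, **kwargs)
            if is_forward:
                _micro_step += 1  # assumed: advance once per forward micro-batch
            return result
        return wrapper

With such a counter in place, the bookkeeping above degrades gracefully: outside Megatron it stores plain parent names, inside Megatron it stores [name, micro_step] pairs.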
msprobe/pytorch/monitor/module_hook.py
CHANGED

@@ -19,12 +19,14 @@ import importlib
 from collections import defaultdict
 from datetime import datetime
 from functools import partial
+from itertools import cycle
 
 import pytz
 import torch
 import torch.distributed as dist
 import pandas as pd
 from torch.utils.hooks import BackwardHook
+from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
 
 from msprobe.core.common.const import MonitorConst, Const
 from msprobe.core.common.file_utils import load_json, save_json, make_dir
@@ -229,6 +231,8 @@ class TrainerMon:
         self.duplicate_param = {}
         self.name2tag = {}
         self.param_name_call_id = {}
+        self.flat_prefix_names = []
+        self.flat_prefix_reverse_iter = None
         self.call_id = 0
         self.module_struct = defaultdict(dict)
         self.grad_accs = []
@@ -945,13 +949,20 @@ class TrainerMon:
         return False
 
     def _register_chunk(self, model_chunk, prefix):
+        if isinstance(model_chunk, FSDP):
+            if not model_chunk._use_orig_params:
+                raise ValueError("Only Support fsdp1 with use_orig_params=True")
+            self.fsdp_wrapped_module = True
         for (param_name, param) in model_chunk.named_parameters():
             if not param.requires_grad:
                 continue
-            if not self.fsdp_wrapped_module and param_name.startswith("_fsdp_wrapped_module"):
-                self.fsdp_wrapped_module = True
             if not self.fsdp2_wrapped_module and param.__class__.__name__ == "DTensor":
                 self.fsdp2_wrapped_module = True
+            if self.fsdp_wrapped_module:  # FSDP1: record every flat-weight prefix, unrestricted by targets, for later unflattening
+                flat_prefix_name, _ = param_name.rsplit(MonitorConst.FSDP_FLAT_SEP, 1)
+                if flat_prefix_name not in self.flat_prefix_names:
+                    self.flat_prefix_names.append(flat_prefix_name)
+
             if self._is_target_param(param_name, param, prefix):
                 name = prefix + squash_param_name(param_name, self.squash_name)
                 if name in self.param2name.values():
@@ -975,6 +986,8 @@ class TrainerMon:
                     k: get_summary_writer_tag_name(name, k, self.rank)
                     for k in keywords
                 }
+        if self.fsdp_wrapped_module:
+            self.flat_prefix_reverse_iter = cycle(reversed(self.flat_prefix_names))  # post_backward_hook fires in reverse order
 
     def _register_param_name(self):
         for vpp_stage, model_chunk in enumerate(self.model):
@@ -1224,17 +1237,22 @@ class TrainerMon:
         In each forward phase, FSDP re-registers hooks on AccumulateGrad, so hooks registered by the monitor tool cannot take effect;
         therefore _post_backward_hook is patched to collect gradients after backward and before reduce_scatter.
         """
+
         def patch_post_backward_hook(_post_backward_hook):
             def wrapper(state, handle, *unused):
                 grad_dict = {}
-
-
-
-
+                local_names = handle.flat_param._fqns
+                offsets = handle._get_flat_param_offsets()
+                shapes = handle.flat_param._shapes
+                flat_prefix = next(self.flat_prefix_reverse_iter)
+                for local_name, (start, end), local_shape in zip(local_names, offsets, shapes):
+                    grad_clip = handle.flat_param.grad[start:end + 1]
+                    grad = grad_clip.reshape(local_shape)
+                    total_name = f"{flat_prefix}{MonitorConst.FSDP_FLAT_SEP}{local_name}"
+                    if total_name not in self.origin2squash:
+                        logger.warning(f"{total_name} not in model.named_parameters(), skip.")
                         continue
-
-                    offset += limit
-                    tag = self.name2tag.get(name, {}).get(MonitorConst.PRE_GRAD)
+                    tag = self.name2tag.get(self.origin2squash[total_name], {}).get(MonitorConst.PRE_GRAD)
                     if tag is None:
                         continue
                     grad_dict[tag] = grad
@@ -1242,6 +1260,7 @@ class TrainerMon:
                 get_metrics(self.ops, grad_dict, self.eps, self.grad_context.pre)
                 out = _post_backward_hook(state, handle, *unused)
                 return out
+
             return wrapper
 
         logger.info("Patch fsdp _post_backward_hook, collect pre_grad metrics.")
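Note: the patched hook recovers per-parameter gradients from FSDP1's flat gradient buffer by slicing and reshaping. A sketch of that arithmetic with plain tensors (the names, offsets, and shapes below are made up; in the real code they come from handle.flat_param):

    import torch

    flat_grad = torch.arange(12.0)      # stand-in for handle.flat_param.grad
    names = ["layer1.weight", "layer2.weight"]
    offsets = [(0, 5), (6, 11)]         # inclusive ranges, hence grad[start:end + 1]
    shapes = [(2, 3), (3, 2)]

    for name, (start, end), shape in zip(names, offsets, shapes):
        grad = flat_grad[start:end + 1].reshape(shape)
        print(name, tuple(grad.shape))  # layer1.weight (2, 3) / layer2.weight (3, 2)

The cycle(reversed(...)) iterator complements this: post-backward hooks fire once per flat parameter in reverse module order on every backward pass, so a cycling reversed sequence realigns each invocation with its prefix step after step.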
msprobe/pytorch/online_dispatch/dispatch.py
CHANGED

@@ -17,7 +17,7 @@ import json
 import os
 import time
 import multiprocessing
-from multiprocessing import Pool
+from multiprocessing import Pool, Lock
 
 import torch
 from torch.utils._python_dispatch import TorchDispatchMode
@@ -39,6 +39,7 @@ from msprobe.pytorch.online_dispatch.utils import get_callstack, data_to_cpu, ge
 from msprobe.pytorch.online_dispatch.compare import Comparator
 from msprobe.core.common.utils import check_str_param, safe_get_value
 
+child_global_lock = None
 current_time = time.strftime("%Y%m%d%H%M%S")
 RESULT_FILE_NAME = "accuracy_checking_result_" + current_time + ".csv"
 DETAILS_FILE_NAME = "accuracy_checking_details_" + current_time + ".csv"
@@ -86,14 +87,14 @@ class PtdbgDispatch(TorchDispatchMode):
         yaml_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "torch_ops_config.yaml")
         self.get_ops(yaml_path)
 
-        self.lock = None
+        self.lock = Lock() if process_num > 0 else None
         max_process_num = max(int((multiprocessing.cpu_count() + 1) // Const.CPU_QUARTER), 1)
         if process_num > max_process_num:
             logger.error(f"process_num should be less than or equal to {max_process_num}, but got {process_num}!")
             raise DispatchException(f'process_num should be less than or equal to {max_process_num}, '
                                     f'but got {process_num}!')
         if process_num > 0:
-            self.pool = Pool(process_num)
+            self.pool = Pool(process_num, initializer=self._init_child_process, initargs=(self.lock,))
         if debug:
             logger.info(f'Main pid:{os.getpid()} device:{self.device_id} dump_list:{self.dump_api_list} '
                         f'dump_mode:{self.dump_mode} cpu_path[{self.root_cpu_path}], npu_path[{self.root_npu_path}], '
@@ -114,18 +115,17 @@ class PtdbgDispatch(TorchDispatchMode):
             logger.error("Please check train log, An exception may have occurred!")
             return
         check_file_or_directory_path(summary_path, False)
-
-
-
-
-
-
-
-
-
-
-
-        fp_handle.close()
+        with FileOpen(summary_path, "r") as fp_handle:
+            while True:
+                json_line_data = fp_handle.readline()
+                if json_line_data == '\n':
+                    continue
+                if len(json_line_data) == 0:
+                    break
+                msg = json.loads(json_line_data)
+                if len(msg) < 2:
+                    raise ValueError("JSON data does not contain enough elements. Expected at least 2 elements.")
+                self.all_summary[msg[0]] = msg[1]
 
         if self.debug_flag:
             input_num = 0
@@ -163,11 +163,16 @@ class PtdbgDispatch(TorchDispatchMode):
 
         call_stack = get_callstack()
         self.call_stack_list.append(call_stack)
-
-        if
-
-
-        self.single_api_index_dict
+
+        self.lock.acquire() if self.process_num > 0 else None
+        try:
+            self.api_index += 1
+            if aten_api not in self.single_api_index_dict:
+                self.single_api_index_dict[aten_api] = 1
+            else:
+                self.single_api_index_dict[aten_api] += 1
+        finally:
+            self.lock.release() if self.process_num > 0 else None
 
         run_param = self.get_run_param(aten_api, func.__name__, aten_api_overload_name)
 
@@ -180,7 +185,7 @@ class PtdbgDispatch(TorchDispatchMode):
         cpu_kwargs = []
         data_to_cpu(args, 0, cpu_args)
         data_to_cpu(kwargs, 0, cpu_kwargs)
-
+
         cpu_args = safe_get_value(cpu_args, 0, "cpu_args")
         cpu_kwargs = safe_get_value(cpu_kwargs, 0, "cpu_kwargs")
 
@@ -194,7 +199,12 @@ class PtdbgDispatch(TorchDispatchMode):
         try:
             cpu_out = func(*cpu_args, **cpu_kwargs)
         except RuntimeError as e:
-            self.
+            self.lock.acquire() if self.process_num > 0 else None
+            try:
+                self.api_index -= 1
+                self.single_api_index_dict[aten_api] -= 1
+            finally:
+                self.lock.release() if self.process_num > 0 else None
             logger.warning(f"RuntimeError: {e}")
             logger.warning(f"This aten_api {aten_api} does not support running on cpu, so skip it.")
             return npu_out
@@ -215,7 +225,7 @@ class PtdbgDispatch(TorchDispatchMode):
         run_param.process_flag = True
         if self.check_fun(func, run_param):
             data_info = DisPatchDataInfo(cpu_args, cpu_kwargs, self.all_summary, None, npu_out_cpu, cpu_out,
-
+                                         child_global_lock)
             self.pool.apply_async(func=dispatch_multiprocess, args=(run_param, data_info),
                                   error_callback=error_call)
         else:
@@ -233,12 +243,20 @@ class PtdbgDispatch(TorchDispatchMode):
             return True
         return False
 
+    @staticmethod
+    def _init_child_process(lock):
+        global child_global_lock
+        child_global_lock = lock
+
     def get_dir_name(self, tag):
         # guarantee file uniqueness
         time.sleep(1)
-
+        # Time format: year-month-day-hour-minute-second plus milliseconds
+        time_now = time.strftime("%Y%m%d%H%M%S%f", time.localtime(time.time()))[:-3]  # keep 3 millisecond digits
+
        if tag is None or not isinstance(tag, str):
             logger.warning('There is not tag or the type of tag is not string.')
+            # Directory name format: msprobe_rank{device_id}_{millisecond timestamp}
             dir_name = f'msprobe_rank{self.device_id}_{time_now}'
         else:
             dir_name = f'msprobe_{tag}_rank{self.device_id}_{time_now}'