PyPI - mindstudio-probe - Versions diffs - 8.2.0__py3-none-any.whl → 8.3.0__py3-none-any.whl - Mend

mindstudio-probe 8.2.0-py3-none-any.whl → 8.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (90) hide show

{mindstudio_probe-8.2.0.dist-info → mindstudio_probe-8.3.0.dist-info}/METADATA +2 -2
{mindstudio_probe-8.2.0.dist-info → mindstudio_probe-8.3.0.dist-info}/RECORD +90 -79
msprobe/README.md +7 -5
msprobe/core/common/const.py +6 -0
msprobe/core/common/db_manager.py +35 -4
msprobe/core/common/file_utils.py +105 -27
msprobe/core/common/framework_adapter.py +7 -6
msprobe/core/common/megatron_utils.py +59 -0
msprobe/core/common/utils.py +14 -3
msprobe/core/compare/find_first/analyzer.py +8 -7
msprobe/core/compare/find_first/graph.py +11 -3
msprobe/core/compare/find_first/utils.py +2 -1
msprobe/core/compare/highlight.py +13 -6
msprobe/core/compare/multiprocessing_compute.py +17 -10
msprobe/core/compare/utils.py +14 -5
msprobe/core/data_dump/data_collector.py +18 -21
msprobe/core/data_dump/data_processor/pytorch_processor.py +43 -20
msprobe/core/data_dump/json_writer.py +18 -8
msprobe/core/data_dump/scope.py +4 -6
msprobe/core/hook_manager.py +37 -3
msprobe/core/service.py +18 -5
msprobe/core/single_save/single_comparator.py +16 -3
msprobe/docs/01.installation.md +7 -5
msprobe/docs/02.config_introduction.md +14 -1
msprobe/docs/04.kernel_dump_PyTorch.md +1 -1
msprobe/docs/06.data_dump_MindSpore.md +1 -1
msprobe/docs/08.accuracy_checker_online_PyTorch.md +295 -0
msprobe/docs/10.accuracy_compare_PyTorch.md +46 -5
msprobe/docs/14.data_parse_PyTorch.md +1 -1
msprobe/docs/15.free_benchmarking_PyTorch.md +1 -1
msprobe/docs/19.monitor.md +2 -0
msprobe/docs/21.visualization_PyTorch.md +15 -80
msprobe/docs/22.visualization_MindSpore.md +20 -104
msprobe/docs/23.generate_operator_PyTorch.md +1 -1
msprobe/docs/25.tool_function_introduction.md +1 -0
msprobe/docs/26.data_dump_PyTorch_baseline.md +7 -7
msprobe/docs/img/visualization/vis_browser_1.png +0 -0
msprobe/docs/img/visualization/vis_match_info.png +0 -0
msprobe/docs/img/visualization/vis_precision_info.png +0 -0
msprobe/docs/img/visualization/vis_search_info.png +0 -0
msprobe/docs/img/visualization/vis_show_info.png +0 -0
msprobe/docs/img/visualization/vis_showcase.png +0 -0
msprobe/docs/img/visualization/vis_unmatch_info.png +0 -0
msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +1 -1
msprobe/mindspore/api_accuracy_checker/generate_op_script/op_generator.py +1 -1
msprobe/mindspore/cell_processor.py +33 -5
msprobe/mindspore/compare/common_dir_compare.py +22 -26
msprobe/mindspore/compare/utils.py +1 -2
msprobe/mindspore/debugger/precision_debugger.py +1 -1
msprobe/mindspore/dump/cell_dump_process.py +73 -62
msprobe/mindspore/dump/graph_mode_cell_dump.py +21 -10
msprobe/mindspore/dump/hook_cell/ms_hook_manager.py +2 -0
msprobe/msprobe.py +6 -4
msprobe/pytorch/api_accuracy_checker/common/config.py +36 -3
msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +24 -0
msprobe/pytorch/api_accuracy_checker/compare/compare.py +12 -2
msprobe/pytorch/api_accuracy_checker/config.yaml +6 -1
msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +1 -1
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +132 -12
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/__init__.py +0 -0
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py +205 -0
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +378 -0
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +239 -0
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/dump_dispatch.py +115 -0
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +250 -0
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/torch_ops_config.yaml +63 -0
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/utils.py +198 -0
msprobe/pytorch/attl_manager.py +65 -0
msprobe/pytorch/common/utils.py +22 -2
msprobe/pytorch/compare/utils.py +3 -3
msprobe/pytorch/debugger/debugger_config.py +10 -0
msprobe/pytorch/dump/module_dump/hook_wrapper.py +34 -7
msprobe/pytorch/dump/module_dump/module_processer.py +23 -10
msprobe/pytorch/hook_module/api_register.py +6 -1
msprobe/pytorch/monitor/module_hook.py +28 -9
msprobe/pytorch/online_dispatch/dispatch.py +42 -24
msprobe/pytorch/pt_config.py +57 -2
msprobe/pytorch/pytorch_service.py +11 -2
msprobe/visualization/builder/graph_builder.py +170 -64
msprobe/visualization/builder/graph_merger.py +0 -1
msprobe/visualization/builder/msprobe_adapter.py +1 -1
msprobe/visualization/db_utils.py +25 -2
msprobe/visualization/graph/base_node.py +0 -24
msprobe/visualization/graph/graph.py +5 -14
msprobe/visualization/graph_service.py +29 -53
msprobe/visualization/utils.py +11 -1
{mindstudio_probe-8.2.0.dist-info → mindstudio_probe-8.3.0.dist-info}/LICENSE +0 -0
{mindstudio_probe-8.2.0.dist-info → mindstudio_probe-8.3.0.dist-info}/WHEEL +0 -0
{mindstudio_probe-8.2.0.dist-info → mindstudio_probe-8.3.0.dist-info}/entry_points.txt +0 -0
{mindstudio_probe-8.2.0.dist-info → mindstudio_probe-8.3.0.dist-info}/top_level.txt +0 -0

msprobe/core/data_dump/data_collector.py CHANGED Viewed

@@ -23,6 +23,7 @@ from msprobe.core.data_dump.json_writer import DataWriter
 from msprobe.core.common.log import logger
 from msprobe.core.common.const import Const
 from msprobe.core.data_dump.data_processor.factory import DataProcessorFactory
+from msprobe.core.common.megatron_utils import MegatronStepInfo, get_micro_step, is_megatron
 def build_data_collector(config):
@@ -270,15 +271,20 @@ class DataCollector:
         if self.config.level not in DataCollector.level_without_construct:
             if self.optimizer_status in [Const.OPTIMIZER, Const.CLIP_GRAD]:
                 if self.optimizer_status_first_start[self.optimizer_status]:
-                    self.data_writer.update_construct({self.optimizer_status: None})
+                    self.data_writer.update_construct(
+                        {self.optimizer_status: None if not is_megatron() else [None, get_micro_step()]})
                     self.optimizer_status_first_start[self.optimizer_status] = False
-                self.data_writer.update_construct({name: self.optimizer_status})
+                self.data_writer.update_construct(
+                    {name: self.optimizer_status if not is_megatron() else [self.optimizer_status, get_micro_step()]})
             else:
                 if self.config.level == Const.LEVEL_MIX and \
                   not (name.startswith(Const.MODULE) or name.startswith(Const.CELL)):
                     self.data_writer.update_construct(
                         {name: self.module_processor.api_parent_node.get(threading.get_ident())}
                     )
+            if MegatronStepInfo.is_megatron:
+                micro_step_number = max(MegatronStepInfo.forward_micro_step, MegatronStepInfo.backward_micro_step)
+                self.data_writer.update_construct({Const.MEGATRON_MICRO_STEP_NUMBER: micro_step_number})
             self.data_writer.update_construct(self.module_processor.module_node)
@@ -302,25 +308,16 @@ class DataCollector:
         self.data_processor.update_iter(current_iter)
     def params_data_collect(self, name, param_name, pid, data):
-        try:
-            grad_name = name + Const.SEP + Const.PARAMS_GRAD
-            self.update_api_or_module_name(grad_name)
-            if not self.check_scope_and_pid(self.scope, name, pid) and not self.backward_module_names.get(name):
-                if self.data_writer.cache_data.get("data"):
-                    self.data_writer.cache_data.get("data").pop(grad_name, None)
-                    self.params_grad_record[grad_name] = False
-                return
-            data_info = self.data_processor.analyze_params(grad_name, param_name, data)
-            self.handle_data(grad_name, data_info, flush=self.data_processor.is_terminated)
-            self.params_grad_record[grad_name] = False
-        except Exception as e:
-            error_type = type(e).__name__
-            tb = traceback.format_exc()
-            self.data_writer.write_error_log(
-                f"[ERROR] params_data_collect failed: "
-                f"name={name}, param_name={param_name}, pid={pid}\n{tb}",
-                error_type=error_type
-            )
+        grad_name = name + Const.SEP + Const.PARAMS_GRAD
+        self.update_api_or_module_name(grad_name)
+        if not self.check_scope_and_pid(self.scope, name, pid) and not self.backward_module_names.get(name):
+            if self.data_writer.cache_data.get("data"):
+                self.data_writer.cache_data.get("data").pop(grad_name, None)
+                self.params_grad_record[grad_name] = False
+            return
+        data_info = self.data_processor.analyze_params(grad_name, param_name, data)
+        self.handle_data(grad_name, data_info, flush=self.data_processor.is_terminated)
+        self.params_grad_record[grad_name] = False
     def params_data_collect_in_bw_hook(self, params_dict, name):
         try:

msprobe/core/data_dump/data_processor/pytorch_processor.py CHANGED Viewed

@@ -13,13 +13,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import ctypes
 import os
 import zlib
-import ctypes
 from collections.abc import Iterable
+from concurrent.futures import ThreadPoolExecutor
 from dataclasses import asdict
 from typing import List
-from concurrent.futures import ThreadPoolExecutor
 import numpy as np
 import torch
@@ -29,7 +29,6 @@ from torch.distributed.distributed_c10d import _get_default_group
 from msprobe.core.common.const import Const
 from msprobe.core.common.decorator import recursion_depth_decorator
 from msprobe.core.common.exceptions import MsprobeException
-from msprobe.core.common.file_utils import path_len_exceeds_limit
 from msprobe.core.common.log import logger
 from msprobe.core.common.utils import convert_tuple, is_int
 from msprobe.core.data_dump.data_processor.base import BaseDataProcessor, ModuleBackwardInputsOutputs, \
@@ -48,15 +47,28 @@ class TensorHandler:
     def __init__(self):
         self.has_dtensor = hasattr(dist, "tensor") and hasattr(dist.tensor, "DTensor")
         self.has_fake_tensor = hasattr(torch, "_subclasses") and hasattr(torch._subclasses, "fake_tensor")
+        self.has_async_collective_tensor = hasattr(dist, "_functional_collectives") and \
+                                           hasattr(dist._functional_collectives, "AsyncCollectiveTensor")
+    @staticmethod
+    def free_tensor(tensor, tensor_name):
+        try:
+            tensor.untyped_storage().resize_(0)
+        except Exception as e:
+            logger.warning(f"Failed to free tensor: {tensor_name}, the detail info: {e}.")
     def is_dtensor(self, tensor):
-        return self.has_dtensor and isinstance(tensor, torch.distributed.tensor.DTensor)
+        return self.has_dtensor and isinstance(tensor, dist.tensor.DTensor)
     def is_fake_tensor(self, tensor):
         return self.has_fake_tensor and isinstance(tensor, torch._subclasses.fake_tensor.FakeTensor)
+    def is_async_collective_tensor(self, tensor):
+        return self.has_async_collective_tensor and \
+            isinstance(tensor, dist._functional_collectives.AsyncCollectiveTensor)
     def is_empty_data(self, tensor):
-        return tensor.is_meta or self.is_fake_tensor(tensor)
+        return tensor.is_meta or self.is_fake_tensor(tensor) or self.is_async_collective_tensor(tensor)
     def convert_common_tensor(self, tensor):
         if self.is_dtensor(tensor):
@@ -71,6 +83,8 @@ class TensorHandler:
             return Const.DTENSOR_TYPE
         if self.is_fake_tensor(tensor):
             return Const.FAKE_TENSOR_TYPE
+        if self.is_async_collective_tensor(tensor):
+            return Const.AC_TENSOR_TYPE
         return Const.TENSOR_TYPE
     def get_dtensor_info(self, tensor):
@@ -94,6 +108,18 @@ class TensorHandler:
         dtensor_info.update({"placements": placements})
         return dtensor_info
+    def save_tensor(self, tensor, file_path):
+        common_tensor = self.convert_common_tensor(tensor)
+        if self.is_empty_data(common_tensor):
+            logger.debug(f"Saving fake tensor or meta tensor is not supported, the current tensor is {file_path}.")
+            return
+        if common_tensor.untyped_storage().data_ptr() == 0:
+            logger.debug(f"Saving null-pointer tensor is not supported, the current tensor is {file_path}.")
+            return
+        saved_tensor = common_tensor.clone().contiguous().detach()
+        save_pt(saved_tensor, file_path)
+        self.free_tensor(saved_tensor, file_path)
 class PytorchDataProcessor(BaseDataProcessor):
     pytorch_special_type = (
@@ -288,7 +314,7 @@ class PytorchDataProcessor(BaseDataProcessor):
     def dump_async_data(self):
         for file_path, tensor in self._async_dump_cache.items():
-            save_pt(tensor.contiguous(), file_path)
+            self.tensor_handler.save_tensor(tensor, file_path)
         self._async_dump_cache.clear()
     def analyze_single_element(self, element, suffix_stack):
@@ -385,24 +411,24 @@ class PytorchDataProcessor(BaseDataProcessor):
     def _analyze_and_save_tensor(self, tensor, suffix):
         dump_data_name, file_path = self.get_save_file_path(suffix)
         single_arg = PytorchDataProcessor._analyze_tensor(self, tensor, suffix)
-        if self.tensor_handler.is_empty_data(tensor) or tensor.untyped_storage().data_ptr() == 0:
-            logger.debug(
-                "Collecting real data of fake tensor or meta tensor is not supported or data_ptr is 0, "
-                f"the current api/module name is {self.current_api_or_module_name}."
-            )
+        common_tensor = self.tensor_handler.convert_common_tensor(tensor)
+        if self.tensor_handler.is_empty_data(common_tensor):
+            logger.debug(f"Saving fake tensor or meta tensor is not supported, the current tensor is {file_path}.")
+            return single_arg
+        if common_tensor.untyped_storage().data_ptr() == 0:
+            logger.debug(f"Saving null-pointer tensor is not supported, the current tensor is {file_path}.")
             return single_arg
         single_arg.update({"data_name": dump_data_name})
         if self.config.async_dump:
-            self._async_dump_cache[file_path] = tensor.clone().detach()
+            self._async_dump_cache[file_path] = common_tensor.clone().detach()
         else:
-            saved_tensor = tensor.clone().contiguous().detach()
-            save_pt(saved_tensor, file_path)
+            self.tensor_handler.save_tensor(common_tensor, file_path)
         return single_arg
     def _analyze_and_save_ndarray(self, ndarray, suffix):
         dump_data_name, file_path = self.get_save_file_path(suffix)
-        save_pt(torch.tensor(ndarray), file_path)
+        self.tensor_handler.save_tensor(torch.tensor(ndarray), file_path)
         ndarray_json = PytorchDataProcessor._analyze_ndarray(ndarray, suffix)
         ndarray_json.update({"data_name": dump_data_name})
         return ndarray_json
@@ -493,7 +519,7 @@ class OverflowCheckDataProcessor(PytorchDataProcessor):
             self._analyze_maybe_overflow_flag()
         if self.has_overflow:
             for file_path, tensor in self.cached_tensors_and_file_paths.items():
-                save_pt(tensor.clone().contiguous().detach(), file_path)
+                self.tensor_handler.save_tensor(tensor, file_path)
             self.real_overflow_nums += 1
             if self.overflow_nums != -1 and self.real_overflow_nums >= self.overflow_nums:
                 logger.info(f"[{Const.TOOL_NAME}] Reached the preset overflow times, "
@@ -538,10 +564,7 @@ class OverflowCheckDataProcessor(PytorchDataProcessor):
     def _analyze_tensor(self, tensor, suffix):
         dump_data_name, file_path = self.get_save_file_path(suffix)
-        if not path_len_exceeds_limit(file_path):
-            self.cached_tensors_and_file_paths.update({file_path: tensor})
-        else:
-            logger.warning(f'The file path {file_path} length exceeds limit.')
+        self.cached_tensors_and_file_paths.update({file_path: tensor})
         single_arg = super()._analyze_tensor(tensor, suffix)
         single_arg.update({"data_name": dump_data_name})
         if not self.has_overflow and self.support_inf_nan:

msprobe/core/data_dump/json_writer.py CHANGED Viewed

@@ -13,18 +13,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import concurrent
+import copy
 import csv
 import os
-import copy
 import threading
 import traceback
 from datetime import datetime, timezone, timedelta
-import concurrent
 from msprobe.core.common.const import Const, FileCheckConst
-from msprobe.core.common.file_utils import change_mode, FileOpen, save_json, load_json, check_path_before_create
-from msprobe.core.common.log import logger
 from msprobe.core.common.decorator import recursion_depth_decorator
+from msprobe.core.common.file_utils import change_mode, FileOpen, save_json, check_path_before_create
+from msprobe.core.common.log import logger
 lock = threading.Lock()
@@ -40,6 +40,7 @@ class DataWriter:
         self.debug_file_path = None
         self.dump_error_info_path = None
         self.flush_size = 1000
+        self.md5_flush_size = 5000
         self.larger_flush_size = 20000
         self.cache_data = {}
         self.cache_stack = {}
@@ -49,6 +50,7 @@ class DataWriter:
         self._error_log_initialized = False
         self._cache_logged_error_types = set()
         self.crc32_stack_list = []
+        self.data_updated = False
     @staticmethod
     def write_data_to_csv(result: list, result_header: tuple, file_path: str):
@@ -60,7 +62,7 @@ class DataWriter:
             spawn_writer = csv.writer(csv_file)
             if not is_exists:
                 spawn_writer.writerow(result_header)
-            spawn_writer.writerows([result,])
+            spawn_writer.writerows([result, ])
         is_new_file = not is_exists
         if is_new_file:
             change_mode(file_path, FileCheckConst.DATA_FILE_AUTHORITY)
@@ -190,7 +192,7 @@ class DataWriter:
         summary_mode = getattr(cfg, "summary_mode", None)
         if summary_mode == Const.MD5:
-            threshold = self.flush_size
+            threshold = self.md5_flush_size
         else:
             threshold = self.flush_size if length < self.larger_flush_size else self.larger_flush_size
@@ -238,6 +240,7 @@ class DataWriter:
                 logger.warning(f"The dump data({dump_data}) should be a dict.")
                 return
+            self.data_updated = True
             key = next(iter(new_data.keys()))
             if key in dump_data:
                 dump_data.get(key).update(new_data.get(key))
@@ -246,6 +249,7 @@ class DataWriter:
     def update_stack(self, name, stack_data):
         with lock:
+            self.data_updated = True
             api_list = self.cache_stack.get(stack_data)
             if api_list is None:
                 self.cache_stack.update({stack_data: [name]})
@@ -254,10 +258,12 @@ class DataWriter:
     def update_construct(self, new_data):
         with lock:
+            self.data_updated = True
             self.cache_construct.update(new_data)
     def update_debug(self, new_data):
         with lock:
+            self.data_updated = True
             self.cache_debug['data'].update(new_data)
     def write_data_json(self, file_path):
@@ -324,17 +330,21 @@ class DataWriter:
             stat_result = self.flush_stat_stack()
             # 遍历 cache_data，将占位符替换为最终统计值
             if stat_result:
+                self.data_updated = True
                 self._replace_stat_placeholders(self.cache_data, stat_result)
                 if self.cache_debug:
                     self._replace_stat_placeholders(self.cache_debug, stat_result)
-            # 2) 再 flush CRC32
             crc32_result = self.flush_crc32_stack()
             if crc32_result:
+                self.data_updated = True
                 self._replace_crc32_placeholders(self.cache_data, crc32_result)
                 if self.cache_debug:
                     self._replace_crc32_placeholders(self.cache_debug, crc32_result)
+            if not self.data_updated:
+                return
             if self.cache_data:
                 self.write_data_json(self.dump_file_path)
             if self.cache_stack:
@@ -343,4 +353,4 @@ class DataWriter:
                 self.write_construct_info_json(self.construct_file_path)
             if self.cache_debug:
                 self.write_debug_info_json(self.debug_file_path)
+            self.data_updated = False

msprobe/core/data_dump/scope.py CHANGED Viewed

@@ -69,8 +69,7 @@ class BaseScope(ABC):
         self.scope = scope
         self.api_list = api_list
-    @staticmethod
-    def rectify_args(scope, api_list):
+    def rectify_args(self, scope, api_list):
         if not isinstance(api_list, list):
             raise ScopeException(ScopeException.InvalidApiStr,
                                  f"api_list参数须配置为列表，实际类型为{type(api_list)}.")
@@ -104,12 +103,11 @@ class BaseScope(ABC):
 class ListScope(BaseScope):
-    @staticmethod
-    def rectify_args(scope, api_list):
+    def rectify_args(self, scope, api_list):
         if scope and api_list:
             raise ScopeException(ScopeException.ArgConflict,
                                  f"scope和api_list不可以同时配置，实际配置为scope={scope}, api_list={api_list}.")
-        return super(ListScope, ListScope).rectify_args(scope, api_list)
+        return super().rectify_args(scope, api_list)
     def check(self, name):
         if not self.scope or name in self.scope:
@@ -147,7 +145,7 @@ class RangeScope(BaseScope, ABC):
                                      f"scope参数格式错误，要求格式为api或模块完整命名，实际为{name}.")
     def rectify_args(self, scope, api_list):
-        scope, api_list = super(RangeScope, RangeScope).rectify_args(scope, api_list)
+        scope, api_list = super().rectify_args(scope, api_list)
         if scope and len(scope) != 2:
             raise ScopeException(ScopeException.InvalidScope,
                                  f"scope参数指定区间断点，须传入长度为2的列表，实际长度为{len(scope)}.")

msprobe/core/hook_manager.py CHANGED Viewed

@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import gc
 import os
 import threading
 from abc import ABC, abstractmethod
@@ -45,9 +46,10 @@ class BaseHookManager(ABC):
     hook_handle_dict = {}
     params_grad_info = {}
-    def __init__(self, data_collector, config):
+    def __init__(self, data_collector, config, attl_manager=None):
         self.data_collector = data_collector
         self.config = config
+        self.attl_manager = attl_manager
     @property
     def _pid(self):
@@ -62,9 +64,20 @@ class BaseHookManager(ABC):
     def reset_status():
         BaseHookManager.inner_switch = defaultdict(bool)
         BaseHookManager.inner_api_count = defaultdict(int)
-        BaseHookManager.hook_handle_dict.clear()
         BaseHookManager.params_grad_info.clear()
+    @staticmethod
+    def ensure_gc_enabled():
+        is_gc_disabled = not gc.isenabled()
+        if is_gc_disabled:
+            gc.enable()
+        return is_gc_disabled
+    @staticmethod
+    def restore_gc_state(original_state):
+        if original_state:
+            gc.disable()
     @staticmethod
     def _clear_input_kwargs(module, tid):
         if hasattr(module, 'msprobe_input_kwargs') and tid in module.msprobe_input_kwargs:
@@ -168,9 +181,11 @@ class BaseHookManager(ABC):
             if not self._should_execute_hook(Const.MODULE, tid):
                 return
             with ThreadSafe():
+                original_state = self.ensure_gc_enabled()
                 BaseHookManager.inner_switch[tid] = True
                 self.data_collector.params_data_collect(ori_name, param_name, self._pid, grad)
                 BaseHookManager.inner_switch[tid] = False
+                self.restore_gc_state(original_state)
             return
         return hook_fn
@@ -185,6 +200,7 @@ class BaseHookManager(ABC):
                 return None
             with ThreadSafe():
+                original_state = self.ensure_gc_enabled()
                 self._register_forward_hook(module, api_name)
                 BaseHookManager.inner_api_count[tid] += 1
                 if BaseHookManager.inner_api_count[tid] != 1:
@@ -200,6 +216,10 @@ class BaseHookManager(ABC):
                 args = self._register_backward_hook(module, full_backward_name, args)
                 with self._no_grad_context():
+                    if getattr(self.config, "online_run_ut", False):
+                        BaseHookManager.inner_switch[tid] = False
+                        ThreadSafe.release()
+                        return
                     self.data_collector.update_api_or_module_name(full_forward_name)
                     self.data_collector.forward_input_data_collect(
                         full_forward_name,
@@ -209,6 +229,7 @@ class BaseHookManager(ABC):
                         self._is_recompute
                     )
                 BaseHookManager.inner_switch[tid] = False
+                self.restore_gc_state(original_state)
                 return args
         return forward_pre_hook
@@ -221,6 +242,7 @@ class BaseHookManager(ABC):
                 return None
             with ThreadSafe():
+                original_state = self.ensure_gc_enabled()
                 if hook_type == Const.API:
                     if BaseHookManager.inner_api_count[tid] != 1:
                         if BaseHookManager.inner_api_count[tid] > 1:
@@ -243,6 +265,13 @@ class BaseHookManager(ABC):
                     output = self._register_backward_pre_hook(module, full_backward_name, output)
                 with self._no_grad_context():
+                    if getattr(self.config, "online_run_ut", False):
+                        if self.data_collector.scope and not self.data_collector.scope.check(full_name):
+                            return None
+                        if self.attl_manager:
+                            self.attl_manager.attl_send(full_name, args, kwargs, output)
+                        BaseHookManager.inner_switch[tid] = False
+                        return None
                     if hook_type == Const.MODULE:
                         params_dict = self._get_params_dict(module)
                         setattr(module_input_output, Const.PARAMS, params_dict)
@@ -276,6 +305,7 @@ class BaseHookManager(ABC):
                         return forward_new_output
                 BaseHookManager.inner_switch[tid] = False
+                self.restore_gc_state(original_state)
                 return output
         return forward_hook
@@ -287,9 +317,12 @@ class BaseHookManager(ABC):
                 return
             with ThreadSafe():
+                original_state = self.ensure_gc_enabled()
                 BaseHookManager.inner_switch[tid] = True
                 self.data_collector.update_api_or_module_name(full_name)
+                if getattr(self.config, "online_run_ut", False):
+                    BaseHookManager.inner_switch[tid] = False
+                    return
                 need_exchange = self._need_exchange(module) if hook_type == Const.MODULE else True
                 if need_exchange:
                     module_input_output = ModuleBackwardInputsOutputs(grad_input=grad_output, grad_output=grad_input)
@@ -306,5 +339,6 @@ class BaseHookManager(ABC):
                     params_dict = self._get_params_dict(module)
                     self.data_collector.params_data_collect_in_bw_hook(params_dict, full_name)
                 BaseHookManager.inner_switch[tid] = False
+                self.restore_gc_state(original_state)
         return backward_hook

msprobe/core/service.py CHANGED Viewed

@@ -26,6 +26,7 @@ from msprobe.core.common.utils import Const, print_tools_ends_info, DumpPathAggr
 from msprobe.core.data_dump.api_registry import ApiRegistry
 from msprobe.core.data_dump.data_collector import build_data_collector
 from msprobe.core.kernel_dump.kernel_config import create_kernel_config_json
+from msprobe.core.common.megatron_utils import MegatronStepInfo
 class BaseService(ABC):
@@ -34,6 +35,7 @@ class BaseService(ABC):
         self.config.level = getattr(config, 'level_ori', config.level)  # 兼容MindSpore配置
         self.model = None
         self.data_collector = build_data_collector(self.config)
+        self.attl_manager = None
         self.current_iter = 0
         self.loop = 0
         self.init_step = 0
@@ -89,6 +91,10 @@ class BaseService(ABC):
             self.config.task in self.data_collector.tasks_need_tensor_data or
             (self.config.task == Const.STATISTICS and self.config.tensor_list)
         )
+    @property
+    def _is_online_run_ut(self):
+        return getattr(self.config, "online_run_ut", False)
     @property
     @abstractmethod
@@ -140,9 +146,11 @@ class BaseService(ABC):
             self.primitive_switch = True
             self._change_jit_switch(True)
             self.logger.info(f"Dump switch is turned on at step {self.current_iter}. ")
-        self.create_dirs()
-        self.logger.info(f"Dump data will be saved in {self.dump_iter_dir}.")
+        if self._is_online_run_ut:
+            self._run_ut_dispatch(True)
+        else:
+            self.create_dirs()
+            self.logger.info(f"Dump data will be saved in {self.dump_iter_dir}.")
     def stop(self):
         """通用stop模板"""
@@ -157,7 +165,8 @@ class BaseService(ABC):
         self._change_jit_switch(False)
         if self._is_l2_level:
             return
+        if self._is_online_run_ut:
+            self._run_ut_dispatch(False)
         self._process_async_dump()
         self.data_collector.write_json()
@@ -170,6 +179,7 @@ class BaseService(ABC):
         self.currrent_step_first_debug_save = True
         self.loop += 1
         self._reset_status()
+        MegatronStepInfo.reset()
     def save(self, variable, name, save_backward):
         '''
@@ -256,6 +266,8 @@ class BaseService(ABC):
         end_service = self.config.step and self.current_iter > max(self.config.step) or \
             self.data_collector and self.data_collector.data_processor.is_terminated
         if end_service:
+            if self._is_online_run_ut and self.attl_manager:
+                self.attl_manager.attl_stop()
             self.primitive_switch = False
             self._change_jit_switch(False)
             Runtime.is_running = False
@@ -298,7 +310,8 @@ class BaseService(ABC):
         if root_model and isinstance(root_model, list):
             root_model = root_model[0]
             self.logger.warning("Infer model can only input one to support token_range, choose the first one.")
+        if self._is_online_run_ut:
+            return
         root_model.register_forward_pre_hook(infer_hook)
     def _create_l2_dirs(self, cur_rank):

msprobe/core/single_save/single_comparator.py CHANGED Viewed

@@ -14,6 +14,7 @@
 # limitations under the License.
 import os
+import re
 import multiprocessing
 from dataclasses import dataclass
@@ -70,6 +71,9 @@ class SingleComparator:
         比较两个NumPy数组，计算最大绝对误差、最大相对误差和相同元素的百分比
         """
         # 计算每个维度上的最小尺寸
+        if array1.ndim != array2.ndim:
+            array1 = array1.flatten()
+            array2 = array2.flatten()
         min_shape = [min(s1, s2) for s1, s2 in zip(array1.shape, array2.shape)]
         # 截取数组到相同的形状
         sliced_array1 = array1[tuple(slice(0, s) for s in min_shape)]
@@ -176,9 +180,18 @@ class SingleComparator:
                 continue
             for step, step_path in cls.get_steps(tag_path):
                 for rank, rank_path in cls.get_ranks(step_path):
-                    for micro_step, micro_step_path in cls.get_micro_steps(rank_path):
-                        for array_id, array_path in cls.get_arrays(micro_step_path):
-                            array_paths.setdefault(tag, []).append((step, rank, micro_step, array_id, array_path))
+                    for item in os.listdir(rank_path):
+                        next_path = os.path.join(rank_path, item)
+                        if re.match(r"micro_step(\d+)", item):
+                            micro_step = re.match(r"micro_step(\d+)", item).group(1)
+                            for array_id, array_path in cls.get_arrays(next_path):
+                                array_paths.setdefault(tag, []).append(
+                                    (step, rank, int(micro_step), array_id, array_path))
+                        elif re.match(r"\w{1,100}_(\d{1,100})\.npy", item):
+                            array_id = re.match(r"\w{1,100}_(\d{1,100})\.npy", item).group(1)
+                            array_paths.setdefault(tag, []).append((step, rank, 0, int(array_id), next_path))
+                        else:
+                            array_paths.setdefault(tag, []).append((step, rank, 0, 0, next_path))
         return array_paths
     @classmethod

msprobe/docs/01.installation.md CHANGED Viewed

@@ -16,6 +16,7 @@ pip install mindstudio-probe
 |  版本   |    发布日期    |支持 PyTorch 版本|支持 MindSpore 版本|                                                                下载链接                                                                |校验码|
 |:-----:|:----------:|:--:|:--:|:----------------------------------------------------------------------------------------------------------------------------------:|:--:|
+| 8.2.0 | 2025.9.03  |1.11/2.0/2.1/2.2|2.4.0/2.5.0/2.6.0| [mindstudio_probe-8.2.0-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/msprobe/8.2/mindstudio_probe-8.2.0-py3-none-any.whl) |bbc1577d76754adf987069308177d3e0a04e36de9c7f22e75c34cf4ad0ce1af2|
 | 8.1.2 | 2025.8.01  |1.11/2.0/2.1/2.2|2.4.0/2.5.0/2.6.0| [mindstudio_probe-8.1.2-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/msprobe/8.1/mindstudio_probe-8.1.2-py3-none-any.whl) |ff07bb81fddd3b8f3096d119ca1481bde8fdb24f10644def5250caad727448ab|
 | 8.1.1 | 2025.6.20  |1.11/2.0/2.1/2.2|2.4.0/2.5.0/2.6.0| [mindstudio_probe-8.1.1-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/msprobe/8.1/mindstudio_probe-8.1.1-py3-none-any.whl) |2aad10a243575544d7feef552caf4d06aa93028488ebd0bbc9aa350379da859d|
 | 8.1.0 | 2025.6.14  |1.11/2.0/2.1/2.2|2.4.0/2.5.0/2.6.0| [mindstudio_probe-8.1.0-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/msprobe/8.1/mindstudio_probe-8.1.0-py3-none-any.whl) |d10c0a57d073bbe7c681042a11e93a0eaaaf5aa45e1cec997142ce2593d77afd|
@@ -45,12 +46,12 @@ pip install ./mindstudio_probe-{version}-py3-none-any.whl # 安装whl包
 ## 3 从源码安装
 ```shell
-git clone https://gitee.com/ascend/mstt.git
+git clone https://gitcode.com/Ascend/mstt.git
 cd mstt/debug/accuracy_tools
 pip install setuptools wheel
-python setup.py bdist_wheel [--include-mod=[adump]]
+python setup.py bdist_wheel [--include-mod=[adump]] [--no-check]
 cd ./dist
 pip install ./mindstudio_probe*.whl
 ```
@@ -58,6 +59,7 @@ pip install ./mindstudio_probe*.whl
 |参数|说明|是否必选|
 |--|--|:--:|
 |--include-mod|指定可选模块，可取值`adump`，表示在编whl包时加入adump模块。默认未配置该参数，表示编基础包。<br>&#8226; adump模块用于MindSpore静态图场景L2级别的dump。<br>&#8226; 仅MindSpore 2.5.0及以上版本支持adump模块。<br>&#8226; 若使用源码安装，编译环境需支持GCC 7.5或以上版本，和CMake 3.14或以上版本。<br>&#8226; 生成的whl包仅限编译时使用的python版本和处理器架构可用。|否|
+|--no-check|指定可选模块`adump`后，会下载所依赖的三方库包，下载过程会进行证书校验。--no-check可以跳过证书校验。|否|
 # 特性变更说明
@@ -212,7 +214,7 @@ pip show mindstudio-probe
 Name: mindstudio-probe
 Version: 1.0.x
 Summary: Pytorch Ascend Probe Utils
-Home-page: https://gitee.com/ascend/mstt/tree/master/debug/accuracy_tools/msprobe
+Home-page: https://gitcode.com/Ascend/mstt/tree/master/debug/accuracy_tools/msprobe
 Author: Ascend Team
 Author-email: pmail_mindstudio@huawei.com
 License: Apache License 2.0
@@ -225,7 +227,7 @@ Required-by:
 ## 1 安装 CANN 包
-1.1 根据 CPU 架构和 NPU 型号选择 toolkit 和 kernel，可以参考 [CANN 软件安装指南](https://gitee.com/link?target=https%3A%2F%2Fwww.hiascend.com%2Fdocument%2Fdetail%2Fzh%2Fcanncommercial%2F700%2Fenvdeployment%2Finstg%2Finstg_0001.html)和[昇腾社区](https://www.hiascend.cn/developer/download/community/result?module=cann)。
+1.1 根据 CPU 架构和 NPU 型号选择 toolkit 和 kernel，可以参考 [CANN 软件安装指南](https://www.hiascend.com/document/detail/zh/canncommercial/700/envdeployment/instg/instg_0001.html)和[昇腾社区](https://www.hiascend.cn/developer/download/community/result?module=cann)。
 1.2 运行示例
 ```bash
@@ -239,7 +241,7 @@ source {cann_path}/ascend-toolkit/set_env.sh
 ```
 ## 2 安装 PyTorch_NPU
-链接：[https://gitee.com/ascend/pytorch](https://gitee.com/ascend/pytorch)。
+链接：[https://gitcode.com/Ascend/pytorch](https://gitcode.com/Ascend/pytorch)。
 ## 3 安装 MindSpeed LLM

mindstudio-probe 8.2.0__py3-none-any.whl → 8.3.0__py3-none-any.whl

mindstudio-probe 8.2.0__py3-none-any.whl → 8.3.0__py3-none-any.whl