PyPI - mindstudio-probe - Versions diffs - 8.1.2__py3-none-any.whl → 8.2.1__py3-none-any.whl - Mend

mindstudio-probe 8.1.2__py3-none-any.whl → 8.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (181) hide show

{mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/METADATA +2 -2
{mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/RECORD +172 -147
msprobe/README.md +6 -6
msprobe/core/common/const.py +98 -41
msprobe/core/common/db_manager.py +256 -0
msprobe/core/common/file_utils.py +28 -5
msprobe/core/common/log.py +7 -0
msprobe/core/common/megatron_utils.py +59 -0
msprobe/core/common/parallel_state.py +193 -0
msprobe/core/common/utils.py +20 -13
msprobe/core/common_config.py +5 -0
msprobe/core/compare/acc_compare.py +140 -93
msprobe/core/compare/check.py +13 -0
msprobe/core/compare/compare_cli.py +64 -6
msprobe/core/compare/config.py +10 -8
msprobe/core/compare/diff_analyze/diff_analyze_threshold.yaml +14 -0
msprobe/core/compare/diff_analyze/first_diff_analyze.py +135 -0
msprobe/core/compare/diff_analyze/ignore_op_list.yaml +3 -0
msprobe/core/compare/find_first/__init__.py +0 -0
msprobe/core/compare/find_first/analyzer.py +282 -0
msprobe/core/compare/find_first/data_processor.py +35 -0
msprobe/core/compare/find_first/graph.py +188 -0
msprobe/core/compare/find_first/utils.py +189 -0
msprobe/core/compare/highlight.py +74 -101
msprobe/core/compare/layer_mapping/layer_mapping.py +14 -9
msprobe/core/compare/merge_result/merge_result.py +2 -2
msprobe/core/compare/multiprocessing_compute.py +45 -28
msprobe/core/compare/npy_compare.py +7 -10
msprobe/core/compare/utils.py +338 -130
msprobe/core/config_check/checkers/dataset_checker.py +2 -1
msprobe/core/config_check/checkers/env_args_checker.py +5 -5
msprobe/core/config_check/checkers/hyperparameter_checker.py +30 -10
msprobe/core/config_check/checkers/pip_checker.py +4 -3
msprobe/core/config_check/checkers/random_checker.py +3 -3
msprobe/core/config_check/checkers/weights_checker.py +2 -1
msprobe/core/config_check/ckpt_compare/megatron_loader.py +2 -0
msprobe/core/config_check/resource/hyperparameter.yaml +11 -1
msprobe/core/config_check/utils/hyperparameter_parser.py +7 -3
msprobe/core/config_check/utils/utils.py +10 -0
msprobe/core/data_dump/api_registry.py +49 -30
msprobe/core/data_dump/data_collector.py +71 -29
msprobe/core/data_dump/data_processor/base.py +2 -0
msprobe/core/data_dump/data_processor/mindspore_processor.py +47 -53
msprobe/core/data_dump/data_processor/pytorch_processor.py +227 -93
msprobe/core/data_dump/json_writer.py +81 -7
msprobe/core/data_dump/scope.py +4 -6
msprobe/core/hook_manager.py +129 -70
msprobe/core/monitor/csv2db.py +361 -0
msprobe/core/monitor/db_utils.py +278 -0
msprobe/core/monitor/utils.py +35 -1
msprobe/core/service.py +31 -39
msprobe/core/single_save/single_comparator.py +16 -3
msprobe/docs/01.installation.md +51 -19
msprobe/docs/02.config_introduction.md +16 -20
msprobe/docs/03.config_examples.md +26 -0
msprobe/docs/04.kernel_dump_PyTorch.md +1 -1
msprobe/docs/05.data_dump_PyTorch.md +6 -2
msprobe/docs/06.data_dump_MindSpore.md +44 -7
msprobe/docs/07.accuracy_checker_PyTorch.md +1 -1
msprobe/docs/10.accuracy_compare_PyTorch.md +124 -44
msprobe/docs/11.accuracy_compare_MindSpore.md +75 -7
msprobe/docs/14.data_parse_PyTorch.md +1 -1
msprobe/docs/19.monitor.md +94 -7
msprobe/docs/21.visualization_PyTorch.md +71 -101
msprobe/docs/22.visualization_MindSpore.md +69 -119
msprobe/docs/23.generate_operator_PyTorch.md +1 -1
msprobe/docs/25.tool_function_introduction.md +0 -1
msprobe/docs/26.data_dump_PyTorch_baseline.md +7 -7
msprobe/docs/28.debugger_save_instruction.md +184 -81
msprobe/docs/29.data_dump_MSAdapter.md +6 -0
msprobe/docs/31.config_check.md +4 -2
msprobe/docs/36.calculation_result_change.md +75 -0
msprobe/docs/FAQ.md +22 -1
msprobe/docs/data_dump_MindSpore/dynamic_graph_quick_start_example.md +6 -2
msprobe/docs/img/compare_result.png +0 -0
msprobe/docs/img/visualization/vis_browser_1.png +0 -0
msprobe/docs/img/visualization/vis_match_info.png +0 -0
msprobe/docs/img/visualization/vis_precision_info.png +0 -0
msprobe/docs/img/visualization/vis_search_info.png +0 -0
msprobe/docs/img/visualization/vis_show_info.png +0 -0
msprobe/docs/img/visualization/vis_showcase.png +0 -0
msprobe/docs/img/visualization/vis_unmatch_info.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/1.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/2.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/3.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/4.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/5.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/6.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/7.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory-qwen25vl.txt +59 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory1.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory2.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed-mm-qwen25vl.txt +80 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed1.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed2.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactory_mapping.md +330 -0
msprobe/mindspore/__init__.py +1 -1
msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +1 -1
msprobe/mindspore/api_accuracy_checker/api_runner.py +9 -6
msprobe/mindspore/api_accuracy_checker/compute_element.py +18 -12
msprobe/mindspore/cell_processor.py +64 -25
msprobe/mindspore/common/utils.py +51 -7
msprobe/mindspore/compare/common_dir_compare.py +45 -37
msprobe/mindspore/compare/ms_compare.py +10 -2
msprobe/mindspore/compare/ms_graph_compare.py +47 -52
msprobe/mindspore/debugger/debugger_config.py +18 -7
msprobe/mindspore/debugger/precision_debugger.py +16 -12
msprobe/mindspore/dump/cell_dump_process.py +130 -68
msprobe/mindspore/dump/cell_dump_with_insert_gradient.py +10 -2
msprobe/mindspore/dump/graph_mode_cell_dump.py +35 -9
msprobe/mindspore/dump/graph_tensor_dump.py +11 -0
msprobe/mindspore/dump/hook_cell/api_register.py +19 -20
msprobe/mindspore/dump/hook_cell/hook_cell.py +12 -34
msprobe/mindspore/dump/hook_cell/ms_hook_manager.py +142 -21
msprobe/mindspore/dump/kernel_kbyk_dump.py +24 -0
msprobe/mindspore/exception_dump/__init__.py +0 -0
msprobe/mindspore/exception_dump/exception_dump_tool_factory.py +51 -0
msprobe/mindspore/exception_dump/kernel_graph_exception_dump.py +57 -0
msprobe/mindspore/free_benchmark/api_pynative_self_check.py +5 -4
msprobe/mindspore/mindspore_service.py +2 -2
msprobe/mindspore/mindtorch/mindtorch_adaptor.py +12 -7
msprobe/mindspore/monitor/features.py +82 -0
msprobe/mindspore/monitor/module_hook.py +168 -10
msprobe/mindspore/monitor/utils.py +27 -1
msprobe/mindspore/ms_config.py +12 -4
msprobe/mindspore/overflow_check/overflow_check_tool_factory.py +1 -1
msprobe/mindspore/task_handler_factory.py +3 -1
msprobe/nan_analyze/graph.py +1 -1
msprobe/pytorch/api_accuracy_checker/common/config.py +3 -36
msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +0 -24
msprobe/pytorch/api_accuracy_checker/compare/compare.py +2 -12
msprobe/pytorch/api_accuracy_checker/config.yaml +1 -6
msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +2 -2
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +12 -132
msprobe/pytorch/common/utils.py +1 -21
msprobe/pytorch/compare/pt_compare.py +10 -2
msprobe/pytorch/{hook_module/jit_script_wrapper.py → compare/pt_diff_analyze.py} +3 -15
msprobe/pytorch/compare/utils.py +2 -1
msprobe/pytorch/debugger/debugger_config.py +18 -23
msprobe/pytorch/dump/module_dump/hook_wrapper.py +10 -7
msprobe/pytorch/dump/module_dump/module_processer.py +41 -19
msprobe/pytorch/free_benchmark/main.py +7 -4
msprobe/pytorch/hook_module/api_register.py +62 -24
msprobe/pytorch/hook_module/hook_module.py +9 -29
msprobe/pytorch/hook_module/pt_hook_manager.py +84 -15
msprobe/pytorch/hook_module/script_wrapper.py +140 -0
msprobe/pytorch/hook_module/support_wrap_ops.yaml +6 -0
msprobe/pytorch/monitor/csv2tb.py +1 -1
msprobe/pytorch/monitor/features.py +94 -0
msprobe/pytorch/monitor/module_hook.py +221 -81
msprobe/pytorch/monitor/module_metric.py +27 -1
msprobe/pytorch/monitor/optimizer_collect.py +109 -4
msprobe/pytorch/online_dispatch/dispatch.py +42 -24
msprobe/pytorch/online_dispatch/dump_compare.py +1 -1
msprobe/pytorch/parse_tool/lib/visualization.py +0 -1
msprobe/pytorch/pt_config.py +2 -51
msprobe/pytorch/pytorch_service.py +7 -14
msprobe/visualization/builder/graph_builder.py +192 -63
msprobe/visualization/builder/graph_merger.py +986 -0
msprobe/visualization/builder/msprobe_adapter.py +17 -15
msprobe/visualization/compare/graph_comparator.py +26 -16
msprobe/visualization/db_utils.py +252 -0
msprobe/visualization/graph/base_node.py +2 -22
msprobe/visualization/graph/distributed_analyzer.py +12 -12
msprobe/visualization/graph/graph.py +44 -16
msprobe/visualization/graph_service.py +143 -59
msprobe/visualization/utils.py +103 -4
msprobe/docs/08.accuracy_checker_online_PyTorch.md +0 -295
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py +0 -205
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +0 -378
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +0 -239
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/dump_dispatch.py +0 -115
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +0 -250
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/torch_ops_config.yaml +0 -63
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/utils.py +0 -198
msprobe/pytorch/attl_manager.py +0 -65
{mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/LICENSE +0 -0
{mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/WHEEL +0 -0
{mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/entry_points.txt +0 -0
{mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/top_level.txt +0 -0
/msprobe/{pytorch/api_accuracy_checker/tensor_transport_layer → core/compare/diff_analyze}/__init__.py +0 -0

msprobe/core/data_dump/json_writer.py CHANGED Viewed

@@ -13,17 +13,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import concurrent
+import copy
 import csv
 import os
-import copy
 import threading
 import traceback
 from datetime import datetime, timezone, timedelta
 from msprobe.core.common.const import Const, FileCheckConst
-from msprobe.core.common.file_utils import change_mode, FileOpen, save_json, load_json, check_path_before_create
-from msprobe.core.common.log import logger
 from msprobe.core.common.decorator import recursion_depth_decorator
+from msprobe.core.common.file_utils import change_mode, FileOpen, save_json, check_path_before_create
+from msprobe.core.common.log import logger
 lock = threading.Lock()
@@ -39,6 +40,7 @@ class DataWriter:
         self.debug_file_path = None
         self.dump_error_info_path = None
         self.flush_size = 1000
+        self.md5_flush_size = 5000
         self.larger_flush_size = 20000
         self.cache_data = {}
         self.cache_stack = {}
@@ -46,6 +48,9 @@ class DataWriter:
         self.cache_debug = {}
         self.stat_stack_list = []
         self._error_log_initialized = False
+        self._cache_logged_error_types = set()
+        self.crc32_stack_list = []
+        self.data_updated = False
     @staticmethod
     def write_data_to_csv(result: list, result_header: tuple, file_path: str):
@@ -57,11 +62,31 @@ class DataWriter:
             spawn_writer = csv.writer(csv_file)
             if not is_exists:
                 spawn_writer.writerow(result_header)
-            spawn_writer.writerows([result,])
+            spawn_writer.writerows([result, ])
         is_new_file = not is_exists
         if is_new_file:
             change_mode(file_path, FileCheckConst.DATA_FILE_AUTHORITY)
+    @recursion_depth_decorator("JsonWriter: DataWriter._replace_crc32_placeholders")
+    def _replace_crc32_placeholders(self, data, crc32_results):
+        """
+        遍历 JSON 结构，将所有 md5_index 占位符替换成真实的 CRC32
+        """
+        if isinstance(data, dict):
+            for k, v in list(data.items()):
+                if k == Const.MD5_INDEX and isinstance(v, int):
+                    idx = v
+                    # 防越界
+                    crc = crc32_results[idx] if idx < len(crc32_results) else None
+                    # 删除占位符，改成真实字段
+                    del data[k]
+                    data[Const.MD5] = crc
+                else:
+                    self._replace_crc32_placeholders(v, crc32_results)
+        elif isinstance(data, (list, tuple)):
+            for item in data:
+                self._replace_crc32_placeholders(item, crc32_results)
     @recursion_depth_decorator("JsonWriter: DataWriter._replace_stat_placeholders")
     def _replace_stat_placeholders(self, data, stat_result):
         if isinstance(data, dict):
@@ -107,6 +132,25 @@ class DataWriter:
         self.cache_stack = {}
         self.cache_construct = {}
         self.cache_debug = {}
+        self._cache_logged_error_types = set()
+    def append_crc32_to_buffer(self, future: concurrent.futures.Future) -> int:
+        """
+        把一个计算 CRC32 的 Future 放入队列，返回占位符索引
+        """
+        idx = len(self.crc32_stack_list)
+        self.crc32_stack_list.append(future)
+        return idx
+    def flush_crc32_stack(self):
+        """
+        等待所有 CRC32 计算完成，返回结果列表
+        """
+        if not self.crc32_stack_list:
+            return []
+        results = [f.result() for f in self.crc32_stack_list]
+        self.crc32_stack_list = []
+        return results
     def initialize_json_file(self, **kwargs):
         if kwargs["level"] == Const.LEVEL_DEBUG and not self.cache_debug:
@@ -142,18 +186,32 @@ class DataWriter:
         length = len(dump_data)
-        threshold = self.flush_size if length < self.larger_flush_size else self.larger_flush_size
+        # 1) 先取到 config（如果没有，就拿 None）
+        cfg = getattr(self, "config", None)
+        # 2) 再取 summary_mode（如果 cfg 是 None 或者没 summary_mode，就拿 None）
+        summary_mode = getattr(cfg, "summary_mode", None)
+        if summary_mode == Const.MD5:
+            threshold = self.md5_flush_size
+        else:
+            threshold = self.flush_size if length < self.larger_flush_size else self.larger_flush_size
         if length % threshold == 0:
             self.write_json()
-    def write_error_log(self, message: str):
+    def write_error_log(self, message: str, error_type: str):
         """
         写错误日志：
           - 第一次调用时以 'w' 模式清空文件，之后都用 'a' 模式追加
           - 添加时间戳
           - 在 message 后写入当前的调用栈（方便追踪日志来源）
         """
+        # 如果同类型错误已经记录过，跳过
+        if error_type in self._cache_logged_error_types:
+            return
+        # 否则添加到已记录集合，并继续写日志
+        self._cache_logged_error_types.add(error_type)
         try:
             mode = "w" if not self._error_log_initialized else "a"
             self._error_log_initialized = True
@@ -182,6 +240,7 @@ class DataWriter:
                 logger.warning(f"The dump data({dump_data}) should be a dict.")
                 return
+            self.data_updated = True
             key = next(iter(new_data.keys()))
             if key in dump_data:
                 dump_data.get(key).update(new_data.get(key))
@@ -190,6 +249,7 @@ class DataWriter:
     def update_stack(self, name, stack_data):
         with lock:
+            self.data_updated = True
             api_list = self.cache_stack.get(stack_data)
             if api_list is None:
                 self.cache_stack.update({stack_data: [name]})
@@ -198,10 +258,12 @@ class DataWriter:
     def update_construct(self, new_data):
         with lock:
+            self.data_updated = True
             self.cache_construct.update(new_data)
     def update_debug(self, new_data):
         with lock:
+            self.data_updated = True
             self.cache_debug['data'].update(new_data)
     def write_data_json(self, file_path):
@@ -268,9 +330,21 @@ class DataWriter:
             stat_result = self.flush_stat_stack()
             # 遍历 cache_data，将占位符替换为最终统计值
             if stat_result:
+                self.data_updated = True
                 self._replace_stat_placeholders(self.cache_data, stat_result)
                 if self.cache_debug:
                     self._replace_stat_placeholders(self.cache_debug, stat_result)
+            crc32_result = self.flush_crc32_stack()
+            if crc32_result:
+                self.data_updated = True
+                self._replace_crc32_placeholders(self.cache_data, crc32_result)
+                if self.cache_debug:
+                    self._replace_crc32_placeholders(self.cache_debug, crc32_result)
+            if not self.data_updated:
+                return
             if self.cache_data:
                 self.write_data_json(self.dump_file_path)
             if self.cache_stack:
@@ -279,4 +353,4 @@ class DataWriter:
                 self.write_construct_info_json(self.construct_file_path)
             if self.cache_debug:
                 self.write_debug_info_json(self.debug_file_path)
+            self.data_updated = False

msprobe/core/data_dump/scope.py CHANGED Viewed

@@ -69,8 +69,7 @@ class BaseScope(ABC):
         self.scope = scope
         self.api_list = api_list
-    @staticmethod
-    def rectify_args(scope, api_list):
+    def rectify_args(self, scope, api_list):
         if not isinstance(api_list, list):
             raise ScopeException(ScopeException.InvalidApiStr,
                                  f"api_list参数须配置为列表，实际类型为{type(api_list)}.")
@@ -104,12 +103,11 @@ class BaseScope(ABC):
 class ListScope(BaseScope):
-    @staticmethod
-    def rectify_args(scope, api_list):
+    def rectify_args(self, scope, api_list):
         if scope and api_list:
             raise ScopeException(ScopeException.ArgConflict,
                                  f"scope和api_list不可以同时配置，实际配置为scope={scope}, api_list={api_list}.")
-        return super(ListScope, ListScope).rectify_args(scope, api_list)
+        return super().rectify_args(scope, api_list)
     def check(self, name):
         if not self.scope or name in self.scope:
@@ -147,7 +145,7 @@ class RangeScope(BaseScope, ABC):
                                      f"scope参数格式错误，要求格式为api或模块完整命名，实际为{name}.")
     def rectify_args(self, scope, api_list):
-        scope, api_list = super(RangeScope, RangeScope).rectify_args(scope, api_list)
+        scope, api_list = super().rectify_args(scope, api_list)
         if scope and len(scope) != 2:
             raise ScopeException(ScopeException.InvalidScope,
                                  f"scope参数指定区间断点，须传入长度为2的列表，实际长度为{len(scope)}.")

msprobe/core/hook_manager.py CHANGED Viewed

@@ -13,34 +13,42 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import gc
 import os
 import threading
 from abc import ABC, abstractmethod
 from collections import defaultdict
-from msprobe.core.common.log import logger
 from msprobe.core.common.runtime import Runtime
 from msprobe.core.common.utils import Const, ThreadSafe
 from msprobe.core.data_dump.data_processor.base import (ModuleBackwardInputsOutputs, ModuleForwardInputsOutputs)
 class HookSet:
-    def __init__(self, forward_hook=None, forward_pre_hook=None, backward_hook=None, backward_pre_hook=None):
-        self.forward_hook = forward_hook
+    def __init__(
+            self,
+            forward_pre_hook=None,
+            forward_hook=None,
+            backward_pre_hook=None,
+            backward_hook=None,
+            distributed_forward_hook=None
+    ):
         self.forward_pre_hook = forward_pre_hook
-        self.backward_hook = backward_hook
+        self.forward_hook = forward_hook
         self.backward_pre_hook = backward_pre_hook
+        self.backward_hook = backward_hook
+        self.distributed_forward_hook = distributed_forward_hook
 class BaseHookManager(ABC):
     inner_switch = defaultdict(bool)
+    inner_api_count = defaultdict(int)
     hook_handle_dict = {}
     params_grad_info = {}
-    def __init__(self, data_collector, config, attl_manager=None):
+    def __init__(self, data_collector, config):
         self.data_collector = data_collector
         self.config = config
-        self.attl_manager = attl_manager
     @property
     def _pid(self):
@@ -51,6 +59,30 @@ class BaseHookManager(ABC):
     def _is_recompute(self):
         pass
+    @staticmethod
+    def reset_status():
+        BaseHookManager.inner_switch = defaultdict(bool)
+        BaseHookManager.inner_api_count = defaultdict(int)
+        BaseHookManager.hook_handle_dict.clear()
+        BaseHookManager.params_grad_info.clear()
+    @staticmethod
+    def ensure_gc_enabled():
+        is_gc_disabled = not gc.isenabled()
+        if is_gc_disabled:
+            gc.enable()
+        return is_gc_disabled
+    @staticmethod
+    def restore_gc_state(original_state):
+        if original_state:
+            gc.disable()
+    @staticmethod
+    def _clear_input_kwargs(module, tid):
+        if hasattr(module, 'msprobe_input_kwargs') and tid in module.msprobe_input_kwargs:
+            del module.msprobe_input_kwargs[tid]
     @staticmethod
     @abstractmethod
     def _no_grad_context():
@@ -63,18 +95,30 @@ class BaseHookManager(ABC):
     @staticmethod
     @abstractmethod
-    def _process_kwargs_and_output(module, hook_type, kwargs_or_output, output_or_kwargs):
+    def _get_count(name):
         pass
     @staticmethod
-    def _clear_input_kwargs(module):
-        if hasattr(module, 'msprobe_input_kwargs'):
-            del module.msprobe_input_kwargs
+    @abstractmethod
+    def _process_kwargs_and_output(module, tid, hook_type, kwargs_or_output, output_or_kwargs):
+        pass
     @abstractmethod
     def build_hook(self):
         pass
+    @abstractmethod
+    def _register_forward_hook(self, module, api_name):
+        pass
+    @abstractmethod
+    def _register_backward_hook(self, module, full_backward_name, args):
+        pass
+    @abstractmethod
+    def _register_backward_pre_hook(self, module, full_backward_name, output):
+        pass
     @abstractmethod
     def _get_params_dict(self, module):
         pass
@@ -96,7 +140,7 @@ class BaseHookManager(ABC):
                     old_handle = BaseHookManager.hook_handle_dict.get(name)
                     if old_handle and hasattr(old_handle, "remove"):
                         old_handle.remove()
-                    handle = param.register_hook(self._build_grad_hook(module, ori_name, param_name))
+                    handle = param.register_hook(self._build_grad_hook(ori_name, param_name))
                     BaseHookManager.hook_handle_dict[name] = handle
     def _init_params_grad_info(self, module, params_dict):
@@ -115,108 +159,116 @@ class BaseHookManager(ABC):
                     # 将grad_name的data_info先写入cache_data中, 梯度计算后再更新
                     self.data_collector.handle_data(grad_name, data_info,
                                                     flush=self.data_collector.data_processor.is_terminated)
+                    self.data_collector.params_grad_record[grad_name] = True
                 # 记录当前模块的参数梯度信息已占位
                 BaseHookManager.params_grad_info[grad_name] = True
-    def _should_execute_hook(self, hook_type, module, is_forward, tid):
-        is_module_hook = hook_type == Const.MODULE
-        if is_module_hook and not Runtime.is_running:
-            return False
-        elif not is_module_hook and is_forward and not Runtime.is_running:
+    def _should_execute_hook(self, hook_type, tid, is_forward=True):
+        is_api_hook = hook_type == Const.API
+        if BaseHookManager.inner_switch[tid]:
             return False
-        elif not is_module_hook and not is_forward and not module.forward_data_collected:
+        if not is_api_hook and not Runtime.is_running:
             return False
-        if BaseHookManager.inner_switch[tid]:
+        elif is_api_hook and is_forward and not Runtime.is_running:
             return False
         if not self.data_collector or self.data_collector.data_processor.is_terminated:
             return False
         return True
-    def _build_grad_hook(self, module, ori_name, param_name):
+    def _build_grad_hook(self, ori_name, param_name):
         def hook_fn(grad):
             tid = threading.get_ident()
-            if not self._should_execute_hook(Const.MODULE, module, False, tid):
+            if not self._should_execute_hook(Const.MODULE, tid):
                 return
             with ThreadSafe():
+                original_state = self.ensure_gc_enabled()
                 BaseHookManager.inner_switch[tid] = True
                 self.data_collector.params_data_collect(ori_name, param_name, self._pid, grad)
                 BaseHookManager.inner_switch[tid] = False
+                self.restore_gc_state(original_state)
             return
         return hook_fn
-    def _build_forward_pre_hook(self, hook_type, full_name, api_name):
+    def _build_forward_pre_hook(self, hook_type, api_name):
         def forward_pre_hook(module, args, kwargs=None):
-            """
-            为确保多线程场景下 L1 级别数据采集的正确性，每个封装后的 API 的 init 方法和 forward_pre_hook 需要确保在一个线程内连续完成，
-            因此在 API 的 init 方法执行 ThreadSafe.acquire() 加锁操作，
-            并且在 API 的 forward_pre_hook 方法执行 ThreadSafe.release() 释放锁操作。
-            """
             if hook_type == Const.MODULE:
-                return
+                return None
             tid = threading.get_ident()
-            if not self._should_execute_hook(hook_type, module, True, tid):
-                ThreadSafe.release()
-                return
+            if not self._should_execute_hook(hook_type, tid):
+                return None
-            module.forward_data_collected = True
-            self._add_count(api_name)
-            if getattr(self.config, "online_run_ut", False):
-                ThreadSafe.release()
-                return
+            with ThreadSafe():
+                original_state = self.ensure_gc_enabled()
+                self._register_forward_hook(module, api_name)
+                BaseHookManager.inner_api_count[tid] += 1
+                if BaseHookManager.inner_api_count[tid] != 1:
+                    return None
+                full_forward_name = api_name + str(self._get_count(api_name)) + Const.SEP + Const.FORWARD
+                full_backward_name = api_name + str(self._get_count(api_name)) + Const.SEP + Const.BACKWARD
+                module.full_forward_name = full_forward_name
+                if kwargs is None:
+                    kwargs = module.msprobe_input_kwargs.get(tid, {}) if hasattr(module, 'msprobe_input_kwargs') else {}
+                BaseHookManager.inner_switch[tid] = True
+                module_input_output = ModuleForwardInputsOutputs(args=args, kwargs=kwargs, output=None)
-            BaseHookManager.inner_switch[tid] = True
-            if kwargs is None:
-                kwargs = module.msprobe_input_kwargs if hasattr(module, 'msprobe_input_kwargs') else {}
-            try:
+                args = self._register_backward_hook(module, full_backward_name, args)
                 with self._no_grad_context():
-                    module_input_output = ModuleForwardInputsOutputs(args=args, kwargs=kwargs, output=None)
-                    self.data_collector.update_api_or_module_name(full_name)
+                    self.data_collector.update_api_or_module_name(full_forward_name)
                     self.data_collector.forward_input_data_collect(
-                        full_name,
+                        full_forward_name,
                         module,
                         self._pid,
                         module_input_output,
                         self._is_recompute
                     )
-            except Exception as e:
-                logger.error(f"The forward pre hook execution of the {full_name} API failed.")
-                raise e
-            finally:
                 BaseHookManager.inner_switch[tid] = False
-                ThreadSafe.release()
+                self.restore_gc_state(original_state)
+                return args
         return forward_pre_hook
-    def _build_forward_hook(self, hook_type, full_name):
+    def _build_forward_hook(self, hook_type, api_name):
         def forward_hook(module, args, kwargs_or_output, output_or_kwargs=None):
             tid = threading.get_ident()
-            if not self._should_execute_hook(hook_type, module, True, tid):
-                self._clear_input_kwargs(module)
+            if not self._should_execute_hook(hook_type, tid):
+                self._clear_input_kwargs(module, tid)
                 return None
             with ThreadSafe():
-                kwargs, output = self._process_kwargs_and_output(module, hook_type, kwargs_or_output, output_or_kwargs)
+                original_state = self.ensure_gc_enabled()
+                if hook_type == Const.API:
+                    if BaseHookManager.inner_api_count[tid] != 1:
+                        if BaseHookManager.inner_api_count[tid] > 1:
+                            BaseHookManager.inner_api_count[tid] -= 1
+                        self._clear_input_kwargs(module, tid)
+                        return None
+                kwargs, output = self._process_kwargs_and_output(
+                    module,
+                    tid,
+                    hook_type,
+                    kwargs_or_output,
+                    output_or_kwargs
+                )
                 BaseHookManager.inner_switch[tid] = True
-                self.data_collector.update_api_or_module_name(full_name)
                 module_input_output = ModuleForwardInputsOutputs(args=args, kwargs=kwargs, output=output)
+                if hook_type == Const.API:
+                    full_forward_name = api_name + str(self._get_count(api_name)) + Const.SEP + Const.FORWARD
+                    full_backward_name = api_name + str(self._get_count(api_name)) + Const.SEP + Const.BACKWARD
+                    output = self._register_backward_pre_hook(module, full_backward_name, output)
                 with self._no_grad_context():
-                    if getattr(self.config, "online_run_ut", False):
-                        if self.data_collector.scope and not self.data_collector.scope.check(full_name):
-                            return None
-                        if self.attl_manager:
-                            self.attl_manager.attl_send(full_name, args, kwargs, output)
-                        BaseHookManager.inner_switch[tid] = False
-                        return None
                     if hook_type == Const.MODULE:
                         params_dict = self._get_params_dict(module)
                         setattr(module_input_output, Const.PARAMS, params_dict)
                         if params_dict:
-                            self._register_param_hook(full_name, module, params_dict)
-                        self.data_collector.update_api_or_module_name(full_name)
+                            self._register_param_hook(api_name, module, params_dict)
+                        self.data_collector.update_api_or_module_name(api_name)
                         self.data_collector.forward_data_collect(
-                            full_name,
+                            api_name,
                             module,
                             self._pid,
                             module_input_output,
@@ -224,37 +276,40 @@ class BaseHookManager(ABC):
                         )
                         self._init_params_grad_info(module, params_dict)
                     else:
+                        self.data_collector.update_api_or_module_name(full_forward_name)
                         self.data_collector.forward_output_data_collect(
-                            full_name,
+                            full_forward_name,
                             module,
                             self._pid,
                             module_input_output,
                             self._is_recompute
                         )
-                    self._clear_input_kwargs(module)
+                        self._add_count(api_name)
+                        BaseHookManager.inner_api_count[tid] -= 1
+                    self._clear_input_kwargs(module, tid)
                     if self.data_collector.if_return_forward_new_output():
                         forward_new_output = self.data_collector.get_forward_new_output()
                         BaseHookManager.inner_switch[tid] = False
                         return forward_new_output
-                    BaseHookManager.inner_switch[tid] = False
-                    return output
+                BaseHookManager.inner_switch[tid] = False
+                self.restore_gc_state(original_state)
+                return output
         return forward_hook
     def _build_backward_hook(self, hook_type, full_name):
         def backward_hook(module, grad_input, grad_output):
             tid = threading.get_ident()
-            if not self._should_execute_hook(hook_type, module, False, tid):
+            if not self._should_execute_hook(hook_type, tid, is_forward=False):
                 return
             with ThreadSafe():
+                original_state = self.ensure_gc_enabled()
                 BaseHookManager.inner_switch[tid] = True
                 self.data_collector.update_api_or_module_name(full_name)
-                if getattr(self.config, "online_run_ut", False):
-                    BaseHookManager.inner_switch[tid] = False
-                    return
                 need_exchange = self._need_exchange(module) if hook_type == Const.MODULE else True
                 if need_exchange:
                     module_input_output = ModuleBackwardInputsOutputs(grad_input=grad_output, grad_output=grad_input)
@@ -267,6 +322,10 @@ class BaseHookManager(ABC):
                     module_input_output,
                     self._is_recompute
                 )
+                if hook_type == Const.MODULE:
+                    params_dict = self._get_params_dict(module)
+                    self.data_collector.params_data_collect_in_bw_hook(params_dict, full_name)
                 BaseHookManager.inner_switch[tid] = False
+                self.restore_gc_state(original_state)
         return backward_hook

mindstudio-probe 8.1.2__py3-none-any.whl → 8.2.1__py3-none-any.whl

mindstudio-probe 8.1.2__py3-none-any.whl → 8.2.1__py3-none-any.whl