mindstudio-probe 8.1.2__py3-none-any.whl → 8.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/METADATA +2 -2
- {mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/RECORD +172 -147
- msprobe/README.md +6 -6
- msprobe/core/common/const.py +98 -41
- msprobe/core/common/db_manager.py +256 -0
- msprobe/core/common/file_utils.py +28 -5
- msprobe/core/common/log.py +7 -0
- msprobe/core/common/megatron_utils.py +59 -0
- msprobe/core/common/parallel_state.py +193 -0
- msprobe/core/common/utils.py +20 -13
- msprobe/core/common_config.py +5 -0
- msprobe/core/compare/acc_compare.py +140 -93
- msprobe/core/compare/check.py +13 -0
- msprobe/core/compare/compare_cli.py +64 -6
- msprobe/core/compare/config.py +10 -8
- msprobe/core/compare/diff_analyze/diff_analyze_threshold.yaml +14 -0
- msprobe/core/compare/diff_analyze/first_diff_analyze.py +135 -0
- msprobe/core/compare/diff_analyze/ignore_op_list.yaml +3 -0
- msprobe/core/compare/find_first/__init__.py +0 -0
- msprobe/core/compare/find_first/analyzer.py +282 -0
- msprobe/core/compare/find_first/data_processor.py +35 -0
- msprobe/core/compare/find_first/graph.py +188 -0
- msprobe/core/compare/find_first/utils.py +189 -0
- msprobe/core/compare/highlight.py +74 -101
- msprobe/core/compare/layer_mapping/layer_mapping.py +14 -9
- msprobe/core/compare/merge_result/merge_result.py +2 -2
- msprobe/core/compare/multiprocessing_compute.py +45 -28
- msprobe/core/compare/npy_compare.py +7 -10
- msprobe/core/compare/utils.py +338 -130
- msprobe/core/config_check/checkers/dataset_checker.py +2 -1
- msprobe/core/config_check/checkers/env_args_checker.py +5 -5
- msprobe/core/config_check/checkers/hyperparameter_checker.py +30 -10
- msprobe/core/config_check/checkers/pip_checker.py +4 -3
- msprobe/core/config_check/checkers/random_checker.py +3 -3
- msprobe/core/config_check/checkers/weights_checker.py +2 -1
- msprobe/core/config_check/ckpt_compare/megatron_loader.py +2 -0
- msprobe/core/config_check/resource/hyperparameter.yaml +11 -1
- msprobe/core/config_check/utils/hyperparameter_parser.py +7 -3
- msprobe/core/config_check/utils/utils.py +10 -0
- msprobe/core/data_dump/api_registry.py +49 -30
- msprobe/core/data_dump/data_collector.py +71 -29
- msprobe/core/data_dump/data_processor/base.py +2 -0
- msprobe/core/data_dump/data_processor/mindspore_processor.py +47 -53
- msprobe/core/data_dump/data_processor/pytorch_processor.py +227 -93
- msprobe/core/data_dump/json_writer.py +81 -7
- msprobe/core/data_dump/scope.py +4 -6
- msprobe/core/hook_manager.py +129 -70
- msprobe/core/monitor/csv2db.py +361 -0
- msprobe/core/monitor/db_utils.py +278 -0
- msprobe/core/monitor/utils.py +35 -1
- msprobe/core/service.py +31 -39
- msprobe/core/single_save/single_comparator.py +16 -3
- msprobe/docs/01.installation.md +51 -19
- msprobe/docs/02.config_introduction.md +16 -20
- msprobe/docs/03.config_examples.md +26 -0
- msprobe/docs/04.kernel_dump_PyTorch.md +1 -1
- msprobe/docs/05.data_dump_PyTorch.md +6 -2
- msprobe/docs/06.data_dump_MindSpore.md +44 -7
- msprobe/docs/07.accuracy_checker_PyTorch.md +1 -1
- msprobe/docs/10.accuracy_compare_PyTorch.md +124 -44
- msprobe/docs/11.accuracy_compare_MindSpore.md +75 -7
- msprobe/docs/14.data_parse_PyTorch.md +1 -1
- msprobe/docs/19.monitor.md +94 -7
- msprobe/docs/21.visualization_PyTorch.md +71 -101
- msprobe/docs/22.visualization_MindSpore.md +69 -119
- msprobe/docs/23.generate_operator_PyTorch.md +1 -1
- msprobe/docs/25.tool_function_introduction.md +0 -1
- msprobe/docs/26.data_dump_PyTorch_baseline.md +7 -7
- msprobe/docs/28.debugger_save_instruction.md +184 -81
- msprobe/docs/29.data_dump_MSAdapter.md +6 -0
- msprobe/docs/31.config_check.md +4 -2
- msprobe/docs/36.calculation_result_change.md +75 -0
- msprobe/docs/FAQ.md +22 -1
- msprobe/docs/data_dump_MindSpore/dynamic_graph_quick_start_example.md +6 -2
- msprobe/docs/img/compare_result.png +0 -0
- msprobe/docs/img/visualization/vis_browser_1.png +0 -0
- msprobe/docs/img/visualization/vis_match_info.png +0 -0
- msprobe/docs/img/visualization/vis_precision_info.png +0 -0
- msprobe/docs/img/visualization/vis_search_info.png +0 -0
- msprobe/docs/img/visualization/vis_show_info.png +0 -0
- msprobe/docs/img/visualization/vis_showcase.png +0 -0
- msprobe/docs/img/visualization/vis_unmatch_info.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/1.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/2.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/3.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/4.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/5.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/6.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/7.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory-qwen25vl.txt +59 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory1.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory2.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed-mm-qwen25vl.txt +80 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed1.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed2.png +0 -0
- msprobe/docs/visualization/mindspeed_llamafactory_mapping.md +330 -0
- msprobe/mindspore/__init__.py +1 -1
- msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +1 -1
- msprobe/mindspore/api_accuracy_checker/api_runner.py +9 -6
- msprobe/mindspore/api_accuracy_checker/compute_element.py +18 -12
- msprobe/mindspore/cell_processor.py +64 -25
- msprobe/mindspore/common/utils.py +51 -7
- msprobe/mindspore/compare/common_dir_compare.py +45 -37
- msprobe/mindspore/compare/ms_compare.py +10 -2
- msprobe/mindspore/compare/ms_graph_compare.py +47 -52
- msprobe/mindspore/debugger/debugger_config.py +18 -7
- msprobe/mindspore/debugger/precision_debugger.py +16 -12
- msprobe/mindspore/dump/cell_dump_process.py +130 -68
- msprobe/mindspore/dump/cell_dump_with_insert_gradient.py +10 -2
- msprobe/mindspore/dump/graph_mode_cell_dump.py +35 -9
- msprobe/mindspore/dump/graph_tensor_dump.py +11 -0
- msprobe/mindspore/dump/hook_cell/api_register.py +19 -20
- msprobe/mindspore/dump/hook_cell/hook_cell.py +12 -34
- msprobe/mindspore/dump/hook_cell/ms_hook_manager.py +142 -21
- msprobe/mindspore/dump/kernel_kbyk_dump.py +24 -0
- msprobe/mindspore/exception_dump/__init__.py +0 -0
- msprobe/mindspore/exception_dump/exception_dump_tool_factory.py +51 -0
- msprobe/mindspore/exception_dump/kernel_graph_exception_dump.py +57 -0
- msprobe/mindspore/free_benchmark/api_pynative_self_check.py +5 -4
- msprobe/mindspore/mindspore_service.py +2 -2
- msprobe/mindspore/mindtorch/mindtorch_adaptor.py +12 -7
- msprobe/mindspore/monitor/features.py +82 -0
- msprobe/mindspore/monitor/module_hook.py +168 -10
- msprobe/mindspore/monitor/utils.py +27 -1
- msprobe/mindspore/ms_config.py +12 -4
- msprobe/mindspore/overflow_check/overflow_check_tool_factory.py +1 -1
- msprobe/mindspore/task_handler_factory.py +3 -1
- msprobe/nan_analyze/graph.py +1 -1
- msprobe/pytorch/api_accuracy_checker/common/config.py +3 -36
- msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +0 -24
- msprobe/pytorch/api_accuracy_checker/compare/compare.py +2 -12
- msprobe/pytorch/api_accuracy_checker/config.yaml +1 -6
- msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +2 -2
- msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +12 -132
- msprobe/pytorch/common/utils.py +1 -21
- msprobe/pytorch/compare/pt_compare.py +10 -2
- msprobe/pytorch/{hook_module/jit_script_wrapper.py → compare/pt_diff_analyze.py} +3 -15
- msprobe/pytorch/compare/utils.py +2 -1
- msprobe/pytorch/debugger/debugger_config.py +18 -23
- msprobe/pytorch/dump/module_dump/hook_wrapper.py +10 -7
- msprobe/pytorch/dump/module_dump/module_processer.py +41 -19
- msprobe/pytorch/free_benchmark/main.py +7 -4
- msprobe/pytorch/hook_module/api_register.py +62 -24
- msprobe/pytorch/hook_module/hook_module.py +9 -29
- msprobe/pytorch/hook_module/pt_hook_manager.py +84 -15
- msprobe/pytorch/hook_module/script_wrapper.py +140 -0
- msprobe/pytorch/hook_module/support_wrap_ops.yaml +6 -0
- msprobe/pytorch/monitor/csv2tb.py +1 -1
- msprobe/pytorch/monitor/features.py +94 -0
- msprobe/pytorch/monitor/module_hook.py +221 -81
- msprobe/pytorch/monitor/module_metric.py +27 -1
- msprobe/pytorch/monitor/optimizer_collect.py +109 -4
- msprobe/pytorch/online_dispatch/dispatch.py +42 -24
- msprobe/pytorch/online_dispatch/dump_compare.py +1 -1
- msprobe/pytorch/parse_tool/lib/visualization.py +0 -1
- msprobe/pytorch/pt_config.py +2 -51
- msprobe/pytorch/pytorch_service.py +7 -14
- msprobe/visualization/builder/graph_builder.py +192 -63
- msprobe/visualization/builder/graph_merger.py +986 -0
- msprobe/visualization/builder/msprobe_adapter.py +17 -15
- msprobe/visualization/compare/graph_comparator.py +26 -16
- msprobe/visualization/db_utils.py +252 -0
- msprobe/visualization/graph/base_node.py +2 -22
- msprobe/visualization/graph/distributed_analyzer.py +12 -12
- msprobe/visualization/graph/graph.py +44 -16
- msprobe/visualization/graph_service.py +143 -59
- msprobe/visualization/utils.py +103 -4
- msprobe/docs/08.accuracy_checker_online_PyTorch.md +0 -295
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py +0 -205
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +0 -378
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +0 -239
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/dump_dispatch.py +0 -115
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +0 -250
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/torch_ops_config.yaml +0 -63
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/utils.py +0 -198
- msprobe/pytorch/attl_manager.py +0 -65
- {mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/LICENSE +0 -0
- {mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/WHEEL +0 -0
- {mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/entry_points.txt +0 -0
- {mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/top_level.txt +0 -0
- /msprobe/{pytorch/api_accuracy_checker/tensor_transport_layer → core/compare/diff_analyze}/__init__.py +0 -0

msprobe/pytorch/hook_module/api_register.py
@@ -14,22 +14,25 @@
 # limitations under the License.

 import functools
-import os
 import inspect
+import os

 import torch
 import torch.distributed as dist

 from msprobe.core.common.const import Const
+from msprobe.core.common.file_utils import load_yaml
 from msprobe.core.data_dump.api_registry import ApiRegistry
 from msprobe.pytorch.common.log import logger
 from msprobe.pytorch.common.utils import (
-    torch_without_guard_version,
+    torch_without_guard_version,
+    is_gpu,
+    torch_device_guard,
+    parameter_adapter
 )
 from msprobe.pytorch.function_factory import npu_custom_functions
 from msprobe.pytorch.hook_module.hook_module import HOOKModule
 from msprobe.pytorch.hook_module.utils import dynamic_import_op
-from msprobe.core.common.file_utils import load_yaml

 try:
     import mindspeed.ops
@@ -38,42 +41,46 @@ except ImportError:
 else:
     mindspeed_enable = True

-
 torch_version_above_2 = torch.__version__.split('+')[0] > '2.0'

 _inner_used_api = {}
 _supported_api_list_path = (os.path.join(os.path.dirname(os.path.realpath(__file__)), Const.SUPPORT_API_FILE_NAME),)
 _cuda_func_mapping = {"npu_fusion_attention": "gpu_fusion_attention"}
+dist_data_collect_func = {}
+dist_batch_data_collect_func = []

 _api_types = {
     Const.PT_FRAMEWORK: {
-        Const.PT_API_TYPE_FUNCTIONAL: (torch.nn.functional, (torch.nn.functional,)),
-        Const.PT_API_TYPE_TENSOR: (torch.Tensor, (torch.Tensor,)),
-        Const.PT_API_TYPE_TORCH: (torch, (torch,)),
-        Const.PT_API_TYPE_VF: (torch._C._VariableFunctionsClass, (torch._VF,)),
-        Const.PT_API_TYPE_DIST: (dist, (dist, dist.distributed_c10d))
+        Const.PT_API_TYPE_FUNCTIONAL: ((torch.nn.functional,), (torch.nn.functional,)),
+        Const.PT_API_TYPE_TENSOR: ((torch.Tensor,), (torch.Tensor,)),
+        Const.PT_API_TYPE_TORCH: ((torch,), (torch,)),
+        Const.PT_API_TYPE_VF: ((torch._C._VariableFunctionsClass,), (torch._VF,)),
+        Const.PT_API_TYPE_DIST: ((dist,), (dist, dist.distributed_c10d))
     }
 }
 if not is_gpu:
     import torch_npu
+
     if torch_without_guard_version:
         _api_types.get(Const.PT_FRAMEWORK).update(
             {
-                Const.PT_API_TYPE_NPU: (torch.ops.npu, (torch_npu, torch.ops.npu))
+                Const.PT_API_TYPE_NPU: ((torch.ops.npu, torch_npu), (torch_npu, torch.ops.npu)),
             }
         )
     else:
         _api_types.get(Const.PT_FRAMEWORK).update(
-            {Const.PT_API_TYPE_NPU: (torch_npu._C._VariableFunctionsClass, (torch_npu,))}
+            {Const.PT_API_TYPE_NPU: ((torch_npu._C._VariableFunctionsClass,), (torch_npu,))}
         )
     _api_types.get(Const.PT_FRAMEWORK).update(
         {
-            Const.PT_API_TYPE_NPU_DIST: (
-
+            Const.PT_API_TYPE_NPU_DIST: (
+                (torch_npu.distributed,),
+                (torch_npu.distributed, torch_npu.distributed.distributed_c10d)
+            )
         }
     )
 if mindspeed_enable:
-    _api_types.get(Const.PT_FRAMEWORK).update({Const.PT_API_TYPE_MINDSPEED: (mindspeed.ops, (mindspeed.ops,))})
+    _api_types.get(Const.PT_FRAMEWORK).update({Const.PT_API_TYPE_MINDSPEED: ((mindspeed.ops,), (mindspeed.ops,))})
     mindspeed_op_list = load_yaml(_supported_api_list_path[0]).get(Const.PT_API_TYPE_MINDSPEED)
     mindspeed_op_file_list = [op.split(Const.SEP)[0] + Const.PY_SUFFIX for op in mindspeed_op_list]
     dynamic_import_op(mindspeed.ops, mindspeed_op_file_list)
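The restructured `_api_types` table is worth a note: each entry now pairs a tuple of candidate namespaces (from which the original functions appear to be looked up) with the tuple of modules whose attributes get wrapped, instead of a single namespace, which is what lets entries such as `PT_API_TYPE_NPU` list both `torch.ops.npu` and `torch_npu`. A minimal sketch of that lookup idea, assuming this reading of the tuple layout; the helper below is hypothetical, not msprobe code:

```python
# Illustrative only: resolve an op from whichever candidate namespace defines it.
def resolve_original(api_name, lookup_namespaces):
    """Return the first attribute named api_name found across the candidate namespaces."""
    for ns in lookup_namespaces:            # e.g. (torch.ops.npu, torch_npu) after this change
        func = getattr(ns, api_name, None)
        if func is not None:
            return func
    raise AttributeError(f"{api_name} not found in any candidate namespace")
```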
@@ -94,16 +101,48 @@ def dist_module_forward(module, *args, **kwargs):
         use_async_op_flag = False
         logger.warning(f"fail to get dist api's func signature because {e}, no wait")

-
-
-
-
-
-
-
+    def create_async_callback_func(catch_func):
+        full_name = module.full_forward_name if hasattr(module, "full_forward_name") else None
+
+        def store_data():
+            catch_func(module, full_name, args, kwargs, handle)
+
+        return store_data
+
+    if use_async_op_flag or module.api_name in ['isend', 'irecv']:
+        dist_data_collect_func[handle] = create_async_callback_func(module.distributed_forward_hook)
+    if module.api_name == 'batch_isend_irecv':
+        dist_batch_data_collect_func.append([handle, create_async_callback_func(module.distributed_forward_hook)])
     return handle


+def redirect_wait():
+    if hasattr(dist, "Work"):
+        from torch.distributed import Work
+    else:
+        from torch._C._distributed_c10d import Work
+    origin_wait = Work.wait
+
+    def wrapped_wait(work):
+        def wrapped_wait(*args, **kwargs):
+            origin_wait(*args, **kwargs)
+            if args[0] in dist_data_collect_func:
+                store_func = dist_data_collect_func.pop(args[0])
+                store_func()
+                return
+            for value in dist_batch_data_collect_func:
+                if args[0] in value[0]:
+                    value[0].remove(args[0])
+                    if len(value[0]) == 0:
+                        store_func = value[1]
+                        store_func()
+                    return
+
+        return wrapped_wait
+
+    Work.wait = wrapped_wait(Work)
+
+
 def npu_module_forward(module, *args, **kwargs):
     if not module.need_hook:
         if module.api_name not in npu_custom_functions:
@@ -125,15 +164,14 @@ forward_methods = {
 class ApiTemplate(HOOKModule):
     def __init__(self, api_name, api_func, prefix, hook_build_func, need_hook=True, device=Const.CPU_LOWERCASE):
         self.api_name = api_name
-        self.api_func = api_func
         self.prefix = prefix
         self.prefix_api_name = prefix + Const.SEP + str(api_name.split(Const.SEP)[-1]) + Const.SEP
         self.need_hook = need_hook
         self.device = device
+        self.op_is_distributed = prefix == Const.DIST_API_TYPE_PREFIX
         if self.need_hook:
             super().__init__(hook_build_func)
-
-            self.op_is_distributed = True
+        self.api_func = api_func

     @torch_device_guard
     def forward(self, *args, **kwargs):
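Taken together, the api_register.py changes defer data collection for asynchronous collectives: `isend`/`irecv` and `batch_isend_irecv` return handles whose results are not available until `wait()` is called, so the forward hook stores a callback keyed by the handle and `redirect_wait` wraps `Work.wait` to fire it. A self-contained sketch of that pattern, using toy stand-ins rather than `torch.distributed.Work` or msprobe's hooks:

```python
# Toy illustration of the deferred-collection pattern; not msprobe code.
pending_callbacks = {}

class FakeWork:
    """Stands in for an async communication handle."""
    def wait(self):
        pass

_original_wait = FakeWork.wait

def _wrapped_wait(self, *args, **kwargs):
    _original_wait(self, *args, **kwargs)
    callback = pending_callbacks.pop(self, None)
    if callback is not None:
        callback()          # collect the op's data only once the result is ready

FakeWork.wait = _wrapped_wait

work = FakeWork()
pending_callbacks[work] = lambda: print("dump isend/irecv data now")
work.wait()                 # prints: dump isend/irecv data now
```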

msprobe/pytorch/hook_module/hook_module.py
@@ -14,50 +14,30 @@
 # limitations under the License.

 import functools
-import threading
 from collections import defaultdict

 import torch
 import torch.nn as nn
 import torch.utils.hooks as full_hooks

-from msprobe.
-from msprobe.core.common.utils import ThreadSafe
-from msprobe.pytorch.common.utils import register_forward_pre_hook, register_forward_hook
+from msprobe.pytorch.common.utils import register_forward_pre_hook


 class HOOKModule(nn.Module):
     module_count = defaultdict(int)
-    inner_stop_hook = defaultdict(bool)

     def __init__(self, hook_build_func) -> None:
         super(HOOKModule, self).__init__()
-        self.
-        self.
-
-
-
-
-
-        if not Runtime.is_running:
-            return
-        prefix = self.prefix_api_name if hasattr(self, "prefix_api_name") else ""
-        ThreadSafe.acquire()
-        if callable(hook_build_func):
-            hook_set = hook_build_func(prefix)
-            register_forward_pre_hook(self, hook_set.forward_pre_hook)
-            register_forward_hook(self, hook_set.forward_hook)
-            self.register_backward_hook(hook_set.backward_hook)
+        prefix = self.prefix_api_name if hasattr(self, "prefix_api_name") else ""
+        op_is_distributed = self.op_is_distributed if hasattr(self, "op_is_distributed") else False
+        if callable(hook_build_func):
+            hook_set = hook_build_func(prefix)
+            register_forward_pre_hook(self, hook_set.forward_pre_hook)
+            if op_is_distributed:
+                self.distributed_forward_hook = hook_set.distributed_forward_hook

     def __call__(self, *args, **kwargs):
-
-        if not self.stop_hook:
-            HOOKModule.inner_stop_hook[self.tid] = True
-            changed = True
-        result = self._call_func(*args, **kwargs)
-        if changed:
-            HOOKModule.inner_stop_hook[self.tid] = False
-        return result
+        return self._call_func(*args, **kwargs)

     @staticmethod
     def reset_module_stats():
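After this change `HOOKModule.__init__` only registers a forward pre-hook (plus an optional distributed forward hook) and `__call__` delegates straight to `_call_func`; the thread-local stop-hook bookkeeping is gone from this class. For background, the sketch below shows the standard PyTorch forward-pre-hook mechanism this relies on; it is a toy example, not msprobe code:

```python
# A forward pre-hook runs before Module.forward and receives (module, positional inputs).
import torch
import torch.nn as nn

def trace_pre_hook(module, inputs):
    print(f"about to run {module.__class__.__name__} with {len(inputs)} positional input(s)")

layer = nn.Linear(4, 2)
layer.register_forward_pre_hook(trace_pre_hook)
layer(torch.randn(1, 4))   # prints the trace line, then runs the normal forward
```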

msprobe/pytorch/hook_module/pt_hook_manager.py
@@ -13,13 +13,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-
+import functools
+import threading
 from contextlib import nullcontext

+import torch
+
 from msprobe.core.common.const import Const
-from msprobe.core.common.
+from msprobe.core.common.runtime import Runtime
+from msprobe.core.common.utils import replace_last_occurrence, ThreadSafe
+from msprobe.core.data_dump.data_processor.base import (ModuleForwardInputsOutputs)
 from msprobe.core.hook_manager import BaseHookManager, HookSet
-from msprobe.pytorch.common.utils import is_recomputation, torch_version_above_or_equal_2
+from msprobe.pytorch.common.utils import is_recomputation, torch_version_above_or_equal_2, register_forward_hook
 from msprobe.pytorch.hook_module.hook_module import HOOKModule


@@ -37,23 +42,65 @@ class PytorchHookManager(BaseHookManager):
         HOOKModule.add_module_count(name)

     @staticmethod
-    def
-
-
+    def _get_count(name):
+        return HOOKModule.get_module_count(name)
+
+    @staticmethod
+    def _process_kwargs_and_output(module, tid, hook_type, kwargs_or_output, output_or_kwargs):
+        if hook_type == Const.API:
+            kwargs = kwargs_or_output
+            output = output_or_kwargs
+        else:
+            kwargs = kwargs_or_output if torch_version_above_or_equal_2 else {}
+            output = output_or_kwargs if torch_version_above_or_equal_2 else kwargs_or_output
         return kwargs, output

     def build_hook(self, hook_type, name):
         if hook_type == Const.API:
-
+            hook_set = HookSet(
+                forward_pre_hook=self._build_forward_pre_hook(hook_type, name),
+                distributed_forward_hook=self._build_distributed_forward_hook()
+            )
         else:
-
-
-
-
-
-
-
-
+            full_backward_name = replace_last_occurrence(name, Const.FORWARD, Const.BACKWARD)
+            hook_set = HookSet(
+                forward_hook=self._build_forward_hook(hook_type, name),
+                backward_hook=self._build_backward_hook(hook_type, full_backward_name)
+            )
+        return hook_set
+
+    def _register_forward_hook(self, module, api_name):
+        if not hasattr(module, 'msprobe_forward_hook'):
+            register_forward_hook(module, self._build_forward_hook(Const.API, api_name))
+            setattr(module, 'msprobe_forward_hook', True)
+
+    def _register_backward_hook(self, module, full_backward_name, args):
+        pass
+
+    def _register_backward_pre_hook(self, module, full_backward_name, output):
+        var = output
+        while not isinstance(var, torch.Tensor):
+            if isinstance(var, dict):
+                var = next((v for v in var.values() if isinstance(v, torch.Tensor)))
+            elif isinstance(var, (list, tuple)):
+                if var:
+                    var = var[0]
+                else:
+                    return output
+            else:
+                return output
+
+        if not (var.requires_grad and torch.is_grad_enabled()):
+            return output
+
+        grad_fn = var.grad_fn
+        if grad_fn is not None:
+            backward_hook = self._build_backward_hook(Const.API, full_backward_name)
+            wrapper = functools.partial(backward_hook, module)
+            functools.update_wrapper(wrapper, backward_hook)
+            grad_fn.register_hook(wrapper)
+
+        return output

     def _need_exchange(self, module):
         return True
@@ -66,3 +113,25 @@ class PytorchHookManager(BaseHookManager):
             for key, value in module.named_parameters(recurse=False)
         }
         return params_dict
+
+    def _build_distributed_forward_hook(self):
+        def distributed_forward_hook(module, full_name, args, kwargs, output):
+            if not full_name or not Runtime.is_running:
+                return
+
+            tid = threading.get_ident()
+            with ThreadSafe():
+                BaseHookManager.inner_switch[tid] = True
+                self.data_collector.update_api_or_module_name(full_name)
+                module_input_output = ModuleForwardInputsOutputs(args=args, kwargs=kwargs, output=output)
+                with self._no_grad_context():
+                    self.data_collector.forward_output_data_collect(
+                        full_name,
+                        module,
+                        self._pid,
+                        module_input_output,
+                        self._is_recompute
+                    )
+                BaseHookManager.inner_switch[tid] = False
+
+        return distributed_forward_hook
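`_register_backward_pre_hook` above walks the forward output until it finds a tensor that requires grad and attaches a hook to its `grad_fn`, so a backward-time data dump fires when autograd reaches that op. The sketch below shows the general autograd-hook idea using a hook on the tensor itself for brevity (a toy example; the manager hooks the output's `grad_fn` node rather than the tensor):

```python
import torch

x = torch.randn(3, requires_grad=True)
out = (x * 2).sum()

def backward_callback(grad):
    # runs while autograd is unwinding, i.e. the point where backward data can be dumped
    print("backward reached this op, grad norm =", grad.norm().item())
    return grad

out.register_hook(backward_callback)  # attach the callback to the op's output
out.backward()
```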

msprobe/pytorch/hook_module/script_wrapper.py (new file)
@@ -0,0 +1,140 @@
+# Copyright (c) 2025-2025, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import functools
+import importlib
+import types
+
+import torch
+
+from msprobe.core.common.log import logger
+from msprobe.pytorch.common.utils import torch_version_above_or_equal_2
+from msprobe.pytorch.hook_module.api_register import get_api_register
+
+if torch_version_above_or_equal_2:
+    from torch._dynamo.convert_frame import convert_frame as _orig_convert_frame, Hooks
+
+
+def wrap_jit_script_func():
+    def patched_script(*args, **kwargs):
+        all_api_registered = api_register.all_api_registered
+        if all_api_registered:
+            api_register.restore_all_api()
+        result = original_script(*args, **kwargs)
+        if all_api_registered:
+            api_register.register_all_api()
+        return result
+
+    original_script = torch.jit.script
+    api_register = get_api_register()
+    torch.jit.script = patched_script
+
+
+def wrap_compile_script_func():
+    def _patched_convert_frame(compiler_fn, hooks):
+        """
+        Restore the original APIs before invoking the _convert_frame produced by
+        the original convert_frame, and re-register all APIs after the call returns.
+        """
+        # Get the original inner _convert_frame
+        inner_convert = _orig_convert_frame(compiler_fn, hooks)
+
+        def _wrapped(frame: types.FrameType, cache_size: int, hooks: Hooks, frame_state):
+            reg = get_api_register()
+            # Restore the original APIs before entering
+            reg.restore_all_api()
+            try:
+                result = inner_convert(frame, cache_size, hooks, frame_state)
+            except Exception:
+                # Make sure the APIs are re-registered even on exception
+                reg.register_all_api()
+                raise
+            # Re-register after a normal return
+            reg.register_all_api()
+            return result
+
+        # Keep the original attributes for compatibility
+        _wrapped._torchdynamo_orig_callable = compiler_fn  # type: ignore[attr-defined]
+        _wrapped._clone_with_backend = lambda backend: _patched_convert_frame(backend,
+                                                                              hooks)  # type: ignore[attr-defined]
+        return _wrapped
+
+    import torch._dynamo.convert_frame as _cf_mod
+    _cf_mod.convert_frame = _patched_convert_frame
+
+
+def patch_dynamo_compile():
+    cf = importlib.import_module("torch._dynamo.convert_frame")
+    if not hasattr(cf, "_compile"):
+        logger.warning("No found torch._dynamo.convert_frame._compile")
+
+    original = cf._compile
+    if getattr(original, "__msprobe_patched__", False):
+        return
+
+    @functools.wraps(original)
+    def wrapped(*args, **kwargs):
+        result = None
+        try:
+            reg = get_api_register()
+            reg.restore_all_api()
+        except Exception as e:
+            logger.warning(f"[msprobe] Pre restore_all_api failed: {e}")
+            return result
+
+        try:
+            result = original(*args, **kwargs)
+        except Exception:
+            logger.warning("[msprobe] _compile execution failed (returning None)")
+            result = None
+        finally:
+            try:
+                reg = get_api_register()
+                reg.register_all_api()  # switch back to registering the hooks
+            except Exception as e:
+                logger.warning(f"[msprobe] Post register_all_api failed: {e}")
+        return result
+    wrapped.__msprobe_patched__ = True
+    wrapped.__msprobe_original__ = original
+    cf._compile = wrapped
+
+
+def unpatch_dynamo_compile() -> bool:
+    # Reserved interface for removing the patch
+    cf = importlib.import_module("torch._dynamo.convert_frame")
+    current = getattr(cf, "_compile", None)
+    if current is None:
+        return False
+    original = getattr(current, "__msprobe_original__", None)
+    if original is None:
+        return False
+    cf._compile = original
+    return True
+
+
+def preprocess_func():
+    try:
+        from torch.utils._device import _device_constructors
+        _device_constructors()
+    except ImportError:
+        pass
+    except Exception as e:
+        logger.warning(f"Failed to execute _device_constructors. Error Details: {str(e)}")
+
+
+def wrap_script_func():
+    wrap_jit_script_func()
+    if torch_version_above_or_equal_2:
+        patch_dynamo_compile()
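The new script_wrapper.py temporarily removes the msprobe wrappers while `torch.jit.script` or Dynamo's `_compile` runs and re-installs them afterwards, so scripting and compilation see the original, unwrapped APIs. A generic sketch of that "unhook, call, rehook" pattern with toy stand-ins (msprobe's real registry object comes from `get_api_register()`; the classes and names below are illustrative only):

```python
import functools

class ToyRegistry:
    def restore_all_api(self):
        print("hooks removed")
    def register_all_api(self):
        print("hooks re-installed")

def guard_with_registry(func, registry):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        registry.restore_all_api()        # let the compiler see the original, unwrapped APIs
        try:
            return func(*args, **kwargs)
        finally:
            registry.register_all_api()   # re-enable dumping afterwards, even on error
    return wrapper

compile_fn = guard_with_registry(lambda src: f"compiled({src})", ToyRegistry())
print(compile_fn("model"))
```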

msprobe/pytorch/hook_module/support_wrap_ops.yaml
@@ -1260,6 +1260,12 @@ torch_npu:
   - npu_scatter_nd_update
   - npu_prefetch
   - npu_dynamic_block_quant
+  - npu_add_rms_norm
+  - _npu_flash_attention
+  - _npu_rotary_embedding
+  - _npu_reshape_and_cache
+  - _npu_paged_attention
+  - npu_moe_gating_top_k

 aten:
   - signbit

msprobe/pytorch/monitor/csv2tb.py
@@ -79,7 +79,7 @@ def write_step(output_dirpath, parse_step_result, rank, data_type):
         for op, value in ops.items():
             tag = f"{vpp_name}/{op}"
             writer.add_scalar(tag, value, step)
-    writer.
+    writer.close()


 @recursion_depth_decorator("update_dict", max_depth=50)

msprobe/pytorch/monitor/features.py
@@ -111,3 +111,97 @@ def cal_histc(tensor_cal, bins_total, min_val, max_val):
 @torch.no_grad()
 def get_nans(t):
     return torch.isnan(t).sum()
+
+
+def check_tensor_dim(tensor, n):
+    """Check whether the tensor has at least n dimensions.
+    """
+    if not isinstance(tensor, torch.Tensor):
+        raise TypeError(
+            f"Input must be a PyTorch tensor. Got {type(tensor)} instead. "
+            f"Consider using torch.tensor() for conversion."
+        )
+
+    if tensor.dim() < n:
+        raise ValueError(
+            f"Tensor must have at least {n} dimensions. "
+            f"Got shape: {tuple(tensor.shape)} with {tensor.dim()} dims."
+        )
+
+
+@torch.no_grad()
+def max_eigenvalue(input_tensor: torch.Tensor, num_iterations=3):
+    input_tensor = input_tensor.float()
+    try:
+        check_tensor_dim(input_tensor, 2)
+    except (TypeError, ValueError) as e:
+        logger.warning(f"Calculate max eigenvalue failed: {e}")
+        return torch.tensor(0)
+    in_features = input_tensor.shape[1]
+    u_tensor = torch.randn(in_features).to(input_tensor.device)
+    u_norm = u_tensor.norm()
+    if u_norm.item() == 0:
+        return torch.tensor(0)
+    u_tensor = u_tensor / u_tensor.norm()
+    input_seq = torch.matmul(input_tensor.T, input_tensor)
+    for _ in range(num_iterations):
+        v_tensor = torch.matmul(input_seq, u_tensor)
+        spectral_norm = torch.matmul(v_tensor.T, u_tensor)
+        v_norm = v_tensor.norm()
+        if v_norm > 0:
+            u_tensor = v_tensor / v_norm
+        else:
+            spectral_norm = torch.tensor(0)
+            break
+    return spectral_norm.sqrt()
+
+
+@torch.no_grad()
+def cal_entropy(qk_tensor, mask=None):
+    try:
+        check_tensor_dim(qk_tensor, 2)
+    except (TypeError, ValueError) as e:
+        logger.warning(f"Calculate max eigenvalue failed: {e}")
+        return torch.tensor(0), torch.tensor(0)
+    if mask is None:
+        mask = torch.tril(torch.ones(qk_tensor.shape[1], qk_tensor.shape[1])).to(
+            qk_tensor.device)
+    qk_tensor = qk_tensor - torch.amax(qk_tensor, dim=1, keepdim=True)
+    qk_tensor = qk_tensor.masked_fill(mask == 0, float('-inf'))
+    softmax_qkt = torch.nn.functional.softmax(qk_tensor.float(), dim=1)
+    # take the per-row maximum of the softmaxed QK matrix
+    softmax_max = torch.mean(torch.amax(softmax_qkt, dim=1))
+    entropy = torch.mean(-torch.nansum(softmax_qkt *
+                                       torch.log(softmax_qkt), dim=1))
+    return entropy, softmax_max
+
+
+@torch.no_grad()
+def cal_qkt(q_h, k_h, order="s,b,h,d"):
+    # q_h shape is [s, b, h, d]
+    try:
+        check_tensor_dim(q_h, 4)
+        check_tensor_dim(k_h, 4)
+    except (TypeError, ValueError) as e:
+        logger.warning(f"Calculate qk tensor failed: {e}")
+        return torch.tensor(0)
+
+    if order == "s,b,h,d":
+        qkt = torch.matmul(
+            q_h[:, 0, 0, :], k_h[:, 0, 0, :].t()) / q_h.shape[-1] ** 0.5
+    elif order == "b,s,h,d":
+        qkt = torch.matmul(
+            q_h[0, :, 0, :], k_h[0, :, 0, :].t()) / q_h.shape[-1] ** 0.5
+    else:
+        logger.warning("Calculate qk tensor failed: Order unsupported.")
+        qkt = torch.tensor(0)
+    return qkt
+
+
+@torch.no_grad()
+def cal_stable_rank(weight: torch.Tensor):
+    eig = max_eigenvalue(weight)
+    if eig == torch.tensor(0):
+        return torch.tensor(0), torch.tensor(0)
+    f_norm = torch.norm(weight, p="fro")
+    return f_norm / eig, eig