PyPI - mindstudio-probe - Versions diffs - 1.1.1__py3-none-any.whl → 1.2.1__py3-none-any.whl - Mend

mindstudio-probe 1.1.1 (py3-none-any.whl) → 1.2.1 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (197)

{mindstudio_probe-1.1.1.dist-info → mindstudio_probe-1.2.1.dist-info}/METADATA +3 -2
{mindstudio_probe-1.1.1.dist-info → mindstudio_probe-1.2.1.dist-info}/RECORD +196 -141
msprobe/CMakeLists.txt +5 -0
msprobe/README.md +14 -19
msprobe/config.json +1 -0
msprobe/core/common/const.py +155 -6
msprobe/core/common/exceptions.py +3 -1
msprobe/core/common/file_utils.py +33 -7
msprobe/core/common/inplace_ops.yaml +3 -0
msprobe/core/common/utils.py +28 -14
msprobe/core/common_config.py +6 -0
msprobe/core/compare/acc_compare.py +139 -128
msprobe/core/compare/check.py +31 -29
msprobe/core/compare/compare_cli.py +17 -16
msprobe/core/compare/highlight.py +186 -99
msprobe/core/compare/layer_mapping/data_scope_parser.py +18 -7
msprobe/core/compare/layer_mapping/layer_mapping.py +21 -14
msprobe/core/compare/layer_mapping/postprocess_pass.py +4 -3
msprobe/core/compare/merge_result/merge_result.py +380 -0
msprobe/core/compare/merge_result/merge_result_cli.py +31 -0
msprobe/core/compare/multiprocessing_compute.py +2 -2
msprobe/core/compare/npy_compare.py +109 -147
msprobe/core/compare/utils.py +189 -69
msprobe/core/data_dump/data_collector.py +51 -21
msprobe/core/data_dump/data_processor/base.py +38 -20
msprobe/core/data_dump/data_processor/factory.py +5 -3
msprobe/core/data_dump/data_processor/mindspore_processor.py +154 -20
msprobe/core/data_dump/data_processor/pytorch_processor.py +118 -58
msprobe/core/data_dump/json_writer.py +29 -1
msprobe/core/data_dump/scope.py +19 -18
msprobe/core/overflow_check/abnormal_scene.py +9 -5
msprobe/core/overflow_check/checker.py +1 -1
msprobe/core/overflow_check/utils.py +1 -1
msprobe/docs/01.installation.md +96 -17
msprobe/docs/02.config_introduction.md +5 -5
msprobe/docs/05.data_dump_PyTorch.md +91 -61
msprobe/docs/06.data_dump_MindSpore.md +57 -19
msprobe/docs/07.accuracy_checker_PyTorch.md +18 -18
msprobe/docs/09.accuracy_checker_MindSpore.md +4 -4
msprobe/docs/10.accuracy_compare_PyTorch.md +99 -41
msprobe/docs/11.accuracy_compare_MindSpore.md +249 -48
msprobe/docs/12.overflow_check_PyTorch.md +1 -1
msprobe/docs/19.monitor.md +120 -27
msprobe/docs/21.visualization_PyTorch.md +115 -35
msprobe/docs/22.visualization_MindSpore.md +138 -41
msprobe/docs/23.generate_operator_PyTorch.md +107 -0
msprobe/docs/24.code_mapping_Mindspore.md +28 -0
msprobe/docs/{23.tool_function_introduction.md → 25.tool_function_introduction.md} +1 -0
msprobe/docs/26.data_dump_PyTorch_baseline.md +37 -0
msprobe/docs/27.dump_json_instruction.md +521 -0
msprobe/docs/FAQ.md +26 -2
msprobe/docs/accuracy_checker_MindSpore/accuracy_checker_MindSpore_baseline.md +14 -0
msprobe/docs/data_dump_MindSpore/data_dump_MindSpore_baseline.md +22 -0
msprobe/docs/img/merge_result.png +0 -0
msprobe/docs/img/visualization/fuzzy_match_ms.png +0 -0
msprobe/docs/img/visualization/fuzzy_match_pt.png +0 -0
msprobe/docs/img/visualization/tensorboard_1.png +0 -0
msprobe/docs/img/visualization/tensorboard_2.png +0 -0
msprobe/docs/img/visualization/vis_browser_1.png +0 -0
msprobe/docs/img/visualization/vis_browser_2.png +0 -0
msprobe/docs/img/visualization/vis_precision_info.png +0 -0
msprobe/docs/img/visualization/vis_search_info.png +0 -0
msprobe/docs/img/visualization/vis_show_info.png +0 -0
msprobe/docs/img/visualization/vis_showcase.png +0 -0
msprobe/docs/img/visualization/vis_unmatch_info.png +0 -0
msprobe/docs/visualization/GPTModel.png +0 -0
msprobe/docs/visualization/ParallelMLP.png +0 -0
msprobe/docs/visualization/layer_mapping_example.md +132 -0
msprobe/docs/visualization/mapping.png +0 -0
msprobe/docs/visualization/mapping1.png +0 -0
msprobe/docs/visualization/module_name.png +0 -0
msprobe/docs/visualization/module_name1.png +0 -0
msprobe/docs/visualization/no_mapping.png +0 -0
msprobe/docs/visualization/no_mapping1.png +0 -0
msprobe/docs/visualization/no_mapping_analyze.png +0 -0
msprobe/docs/visualization/top_layer.png +0 -0
msprobe/mindspore/__init__.py +10 -0
msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +57 -25
msprobe/mindspore/api_accuracy_checker/cmd_parser.py +2 -1
msprobe/mindspore/api_accuracy_checker/compute_element.py +5 -7
msprobe/mindspore/api_accuracy_checker/data_manager.py +37 -0
msprobe/mindspore/api_accuracy_checker/main.py +1 -0
msprobe/mindspore/api_accuracy_checker/multi_api_accuracy_checker.py +12 -6
msprobe/mindspore/api_accuracy_checker/multi_data_manager.py +3 -1
msprobe/mindspore/code_mapping/bind.py +264 -0
msprobe/mindspore/code_mapping/cmd_parser.py +40 -0
msprobe/mindspore/code_mapping/graph.py +49 -0
msprobe/mindspore/code_mapping/graph_parser.py +226 -0
msprobe/mindspore/code_mapping/main.py +24 -0
msprobe/mindspore/code_mapping/processor.py +34 -0
msprobe/mindspore/common/const.py +3 -1
msprobe/mindspore/common/utils.py +50 -5
msprobe/mindspore/compare/distributed_compare.py +0 -2
msprobe/mindspore/compare/ms_compare.py +105 -63
msprobe/mindspore/compare/ms_graph_compare.py +14 -5
msprobe/mindspore/debugger/debugger_config.py +3 -0
msprobe/mindspore/debugger/precision_debugger.py +81 -12
msprobe/mindspore/dump/hook_cell/api_registry.py +83 -16
msprobe/mindspore/dump/hook_cell/hook_cell.py +60 -38
msprobe/mindspore/dump/hook_cell/primitive_hooks.py +33 -15
msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml +11 -1
msprobe/mindspore/dump/hook_cell/wrap_api.py +92 -1
msprobe/mindspore/dump/kernel_dump/kernel_config.py +33 -0
msprobe/mindspore/dump/kernel_graph_dump.py +7 -0
msprobe/mindspore/free_benchmark/api_pynative_self_check.py +13 -4
msprobe/mindspore/free_benchmark/perturbation/bit_noise.py +2 -2
msprobe/mindspore/grad_probe/grad_analyzer.py +24 -12
msprobe/mindspore/grad_probe/hook.py +13 -4
msprobe/mindspore/mindtorch/__init__.py +18 -0
msprobe/mindspore/mindtorch/mindtorch_adaptor.py +255 -0
msprobe/mindspore/ms_config.py +5 -1
msprobe/mindspore/overflow_check/kernel_graph_overflow_check.py +7 -0
msprobe/mindspore/service.py +267 -101
msprobe/msprobe.py +24 -3
msprobe/pytorch/__init__.py +7 -6
msprobe/pytorch/api_accuracy_checker/common/utils.py +31 -16
msprobe/pytorch/api_accuracy_checker/compare/algorithm.py +41 -8
msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +100 -267
msprobe/pytorch/api_accuracy_checker/compare/api_precision_standard.yaml +4 -1
msprobe/pytorch/api_accuracy_checker/compare/compare.py +69 -68
msprobe/pytorch/api_accuracy_checker/compare/compare_column.py +54 -0
msprobe/pytorch/api_accuracy_checker/compare/compare_input.py +51 -0
msprobe/pytorch/api_accuracy_checker/compare/compare_utils.py +2 -4
msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +54 -30
msprobe/pytorch/api_accuracy_checker/precision_standard/absolute_threshold.py +106 -0
msprobe/pytorch/api_accuracy_checker/precision_standard/accumulative_error_compare.py +107 -0
msprobe/pytorch/api_accuracy_checker/precision_standard/base_standard.py +151 -0
msprobe/pytorch/api_accuracy_checker/precision_standard/benchmark_compare.py +226 -0
msprobe/pytorch/api_accuracy_checker/precision_standard/binary_consistency.py +68 -0
msprobe/pytorch/api_accuracy_checker/precision_standard/standard_config.py +218 -0
msprobe/pytorch/api_accuracy_checker/precision_standard/standard_register.py +104 -0
msprobe/pytorch/api_accuracy_checker/precision_standard/thousandth_standard.py +63 -0
msprobe/pytorch/api_accuracy_checker/precision_standard/ulp_compare.py +200 -0
msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py +57 -1
msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +2 -1
msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +42 -14
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +64 -19
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py +34 -4
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +5 -3
msprobe/pytorch/bench_functions/npu_fusion_attention.py +42 -10
msprobe/pytorch/common/parse_json.py +2 -1
msprobe/pytorch/common/utils.py +45 -2
msprobe/pytorch/compare/distributed_compare.py +17 -29
msprobe/pytorch/compare/pt_compare.py +40 -20
msprobe/pytorch/debugger/debugger_config.py +27 -12
msprobe/pytorch/debugger/precision_debugger.py +42 -12
msprobe/pytorch/dump/module_dump/__init__.py +0 -0
msprobe/pytorch/dump/module_dump/module_dump.py +86 -0
msprobe/pytorch/{module_processer.py → dump/module_dump/module_processer.py} +80 -6
msprobe/pytorch/free_benchmark/common/params.py +2 -1
msprobe/pytorch/free_benchmark/common/utils.py +3 -0
msprobe/pytorch/free_benchmark/compare/grad_saver.py +0 -2
msprobe/pytorch/free_benchmark/result_handlers/base_handler.py +31 -47
msprobe/pytorch/free_benchmark/result_handlers/preheat_handler.py +0 -4
msprobe/pytorch/hook_module/__init__.py +1 -1
msprobe/pytorch/hook_module/hook_module.py +14 -11
msprobe/pytorch/hook_module/register_optimizer_hook.py +59 -0
msprobe/pytorch/hook_module/support_wrap_ops.yaml +34 -0
msprobe/pytorch/hook_module/wrap_distributed.py +6 -8
msprobe/pytorch/hook_module/wrap_functional.py +0 -40
msprobe/pytorch/monitor/anomaly_analyse.py +1 -1
msprobe/pytorch/monitor/anomaly_detect.py +107 -22
msprobe/pytorch/monitor/csv2tb.py +166 -0
msprobe/pytorch/monitor/distributed/wrap_distributed.py +25 -14
msprobe/pytorch/monitor/features.py +3 -3
msprobe/pytorch/monitor/module_hook.py +483 -277
msprobe/pytorch/monitor/module_metric.py +27 -48
msprobe/pytorch/monitor/module_spec_verifier.py +3 -1
msprobe/pytorch/monitor/optimizer_collect.py +52 -14
msprobe/pytorch/monitor/unittest/test_monitor.py +24 -9
msprobe/pytorch/monitor/utils.py +77 -6
msprobe/pytorch/online_dispatch/dispatch.py +8 -2
msprobe/pytorch/parse_tool/lib/compare.py +10 -10
msprobe/pytorch/parse_tool/lib/config.py +5 -7
msprobe/pytorch/parse_tool/lib/file_desc.py +15 -1
msprobe/pytorch/parse_tool/lib/interactive_cli.py +10 -10
msprobe/pytorch/parse_tool/lib/parse_exception.py +7 -7
msprobe/pytorch/parse_tool/lib/parse_tool.py +11 -10
msprobe/pytorch/parse_tool/lib/utils.py +18 -19
msprobe/pytorch/parse_tool/lib/visualization.py +9 -10
msprobe/pytorch/service.py +176 -106
msprobe/visualization/builder/graph_builder.py +62 -5
msprobe/visualization/builder/msprobe_adapter.py +24 -2
msprobe/visualization/compare/graph_comparator.py +64 -14
msprobe/visualization/compare/mode_adapter.py +1 -15
msprobe/visualization/graph/base_node.py +12 -17
msprobe/visualization/graph/distributed_analyzer.py +318 -0
msprobe/visualization/graph/graph.py +9 -0
msprobe/visualization/graph_service.py +97 -23
msprobe/visualization/utils.py +14 -29
msprobe/pytorch/functional/module_dump.py +0 -84
{mindstudio_probe-1.1.1.dist-info → mindstudio_probe-1.2.1.dist-info}/LICENSE +0 -0
{mindstudio_probe-1.1.1.dist-info → mindstudio_probe-1.2.1.dist-info}/WHEEL +0 -0
{mindstudio_probe-1.1.1.dist-info → mindstudio_probe-1.2.1.dist-info}/entry_points.txt +0 -0
{mindstudio_probe-1.1.1.dist-info → mindstudio_probe-1.2.1.dist-info}/top_level.txt +0 -0
/msprobe/docs/{data_dump_Mindspore → data_dump_MindSpore}/dynamic_graph_quick_start_example.md +0 -0
/msprobe/{pytorch/functional → mindspore/code_mapping}/__init__.py +0 -0

msprobe/pytorch/free_benchmark/result_handlers/base_handler.py CHANGED Viewed

@@ -89,12 +89,6 @@ class FuzzHandler(ABC):
         )
         return origin_output_chunks, perturbed_output_chunks
-    @staticmethod
-    def convert_overflow_ratio_to_consistent(ratio):
-        if math.isnan(ratio) or math.isinf(ratio):
-            return ThresholdConfig.COMP_CONSISTENT
-        return ratio
     @abstractmethod
     def get_threshold(self, dtype):
         pass
@@ -107,10 +101,10 @@ class FuzzHandler(ABC):
         self, origin_output, perturbed_output, norm_type, abs_tol
     ):
         if norm_type == NormType.ENDLESS_NORM:
-            return self.calculate_error(origin_output, perturbed_output, abs_tol)
+            return self.calculate_max_ratio(origin_output, perturbed_output, abs_tol)
         return ThresholdConfig.COMP_CONSISTENT
-    def calculate_error(self, origin_output, perturbed_output, abs_tol):
+    def calculate_max_ratio(self, origin_output, perturbed_output, abs_tol):
         origin_output_chunks, perturbed_output_chunks = (
             self.tensor_split_for_error_calculate(origin_output, perturbed_output)
         )
@@ -122,42 +116,30 @@ class FuzzHandler(ABC):
             raise FreeBenchmarkException(
                 FreeBenchmarkException.OutputIndexError, err_msg
             )
-        norm1 = -np.inf
-        norm2 = -np.inf
-        norm3 = np.inf
+        max_ratio = ThresholdConfig.COMP_CONSISTENT
         for i, chunk_origin in enumerate(origin_output_chunks):
             if chunk_origin.nelement() == 0:
                 break
             chunk_perturbed = perturbed_output_chunks[i]
-            ratio_tensor1 = TorchC.where(
-                TorchC.abs(chunk_perturbed) > abs_tol,
-                TorchC.div(
-                    TorchC.clamp(chunk_origin, min=abs_tol),
-                    TorchC.clamp(chunk_perturbed, min=abs_tol),
-                ),
-                1,
-            )
-            ratio_tensor2 = TorchC.where(
-                TorchC.abs(chunk_origin) > abs_tol,
-                TorchC.div(
-                    TorchC.clamp(chunk_perturbed, min=abs_tol),
-                    TorchC.clamp(chunk_origin, min=abs_tol),
-                ),
-                1,
+            # 如果乘积最小值 < 极小值乘积的负值，认为存在非极小值符号相反的情况
+            if TorchC.lt(
+                TorchC.min(TorchC.mul(chunk_origin, chunk_perturbed)), -(abs_tol**2)
+            ):
+                return ThresholdConfig.SYMBOL_FLIPPING
+            # 求A/B B/A的比值前，将值限制在大于极小值范围内
+            clamp_origin = TorchC.clamp(TorchC.abs(chunk_origin), min=abs_tol)
+            clamp_perturbed = TorchC.clamp(TorchC.abs(chunk_perturbed), min=abs_tol)
+            # 对于计算结果为nan的情况，认为两者没有差异
+            ratio_tensor = TorchC.nan_to_num(
+                TorchC.div(clamp_origin, clamp_perturbed),
+                nan=ThresholdConfig.COMP_CONSISTENT,
             )
-            norm_values = TorchC.stack(
-                [TorchC.max(ratio_tensor1), TorchC.max(ratio_tensor2)]
-            )
-            max_ratio1, max_ratio2 = norm_values.tolist()
-            norm1 = max(norm1, self.convert_overflow_ratio_to_consistent(max_ratio1))
-            norm2 = max(norm2, self.convert_overflow_ratio_to_consistent(max_ratio2))
-            norm3 = min(norm3, self.convert_overflow_ratio_to_consistent(max_ratio1))
-        if norm3 < 0:
-            ratio = ThresholdConfig.SYMBOL_FLIPPING
-        else:
-            ratio = max(norm1, norm2)
-        return ratio
+            # 求A/B 和 B/A比值最大值，其中 B/A的最大值为 A/B的最小值的倒数
+            min_ratio, max_ratio = TorchC.stack([*TorchC.aminmax(ratio_tensor)]).tolist()
+            min_ratio_reciprocal = np.inf if min_ratio == 0 else 1 / min_ratio
+            max_ratio = max(max_ratio, min_ratio_reciprocal)
+        return max_ratio
     def ratio_calculate(self, origin_output, perturbed_output, norm_type) -> float:
         try:
@@ -220,10 +202,12 @@ class FuzzHandler(ABC):
                 )
                 npu_consistent = is_consistent
                 max_fuzz_ratio = (
-                    max_fuzz_ratio if ratio is None else max(max_fuzz_ratio, ratio)
+                    max_fuzz_ratio
+                    if not isinstance(ratio, (int, float))
+                    else max(max_fuzz_ratio, ratio)
                 )
-                data_params.is_consistent = is_consistent and data_params.is_consistent
-                if not is_consistent and data_params.grad_unequal_flag:
+                data_params.is_consistent = is_consistent
+                if not is_consistent:
                     self.unequal_rows.append(
                         make_unequal_row(data_params, self.params, ratio=ratio)
                     )
@@ -235,12 +219,12 @@ class FuzzHandler(ABC):
                     )
                     npu_consistent = npu_consistent and is_consistent
                     max_fuzz_ratio = (
-                        max_fuzz_ratio if ratio is None else max(max_fuzz_ratio, ratio)
-                    )
-                    data_params.is_consistent = (
-                        is_consistent and data_params.is_consistent
+                        max_fuzz_ratio
+                        if not isinstance(ratio, (int, float))
+                        else max(max_fuzz_ratio, ratio)
                     )
-                    if not is_consistent and data_params.grad_unequal_flag:
+                    data_params.is_consistent = is_consistent
+                    if not is_consistent:
                         self.unequal_rows.append(
                             make_unequal_row(
                                 data_params, self.params, ratio=ratio, index=index_

msprobe/pytorch/free_benchmark/result_handlers/preheat_handler.py CHANGED Viewed

@@ -75,10 +75,6 @@ class PreheatHandler(FuzzHandler):
         if self.params.preheat_config.get("preheat_step") <= self.params.step:
             return data_params.original_result
-        if not data_params.grad_unequal_flag:
-            data_params.grad_unequal_flag = True
-            data_params.is_consistent = False
-            return data_params.original_result
         preheat_counter.add_api_called_time(self.pure_name)
         if not self._is_take_a_sample():

msprobe/pytorch/hook_module/__init__.py CHANGED Viewed

@@ -13,4 +13,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from .wrap_functional import remove_dropout
+from msprobe.pytorch.common.utils import remove_dropout

msprobe/pytorch/hook_module/hook_module.py CHANGED Viewed

@@ -15,17 +15,17 @@
 import functools
 import threading
+from collections import defaultdict
 import torch
 import torch.nn as nn
 import torch.utils.hooks as full_hooks
-from msprobe.core.common.const import Const
 torch_version_above_or_equal_2 = torch.__version__.split('+')[0] >= '2.0'
 class HOOKModule(nn.Module):
-    module_count = {}
+    module_count = defaultdict(int)
     inner_stop_hook = {}
     def __init__(self, build_hook) -> None:
@@ -41,12 +41,7 @@ class HOOKModule(nn.Module):
             if hasattr(self, "prefix_op_name_"):
                 self.prefix = self.prefix_op_name_
-            if self.prefix not in HOOKModule.module_count:
-                HOOKModule.module_count[self.prefix] = 1
-                self.prefix += '0' + Const.SEP
-            else:
-                HOOKModule.module_count[self.prefix] += 1
-                self.prefix = self.prefix + str(HOOKModule.module_count[self.prefix] - 1) + Const.SEP
+            self.forward_data_collected = False
             forward_pre_hook, forward_hook, backward_hook, _ = build_hook(self.prefix)
             if torch_version_above_or_equal_2:
                 self.register_forward_pre_hook(forward_pre_hook, with_kwargs=True)
@@ -66,9 +61,17 @@ class HOOKModule(nn.Module):
             HOOKModule.inner_stop_hook[self.current_thread] = False
         return result
-    @classmethod
-    def reset_module_stats(cls):
-        cls.module_count = {}
+    @staticmethod
+    def reset_module_stats():
+        HOOKModule.module_count = defaultdict(int)
+    @staticmethod
+    def add_module_count(name):
+        HOOKModule.module_count[name] += 1
+    @staticmethod
+    def get_module_count(name):
+        return HOOKModule.module_count[name]
     def _call_func(self, *args, **kwargs):
         full_backward_hooks, non_full_backward_hooks = [], []

msprobe/pytorch/hook_module/register_optimizer_hook.py ADDED Viewed

@@ -0,0 +1,59 @@
+# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+from msprobe.core.common.const import Const
+from msprobe.pytorch.common.log import logger
+torch_version_above_or_equal_2 = torch.__version__.split('+')[0] >= '2.0'
+if torch_version_above_or_equal_2:
+    from torch.optim.optimizer import register_optimizer_step_pre_hook, register_optimizer_step_post_hook
+def register_optimizer_hook(data_collector):
+    def optimizer_pre_step_hook(optimizer, args, kwargs):
+        data_collector.optimizer_status = Const.OPTIMIZER
+    def optimizer_post_step_hook(optimizer, args, kwargs):
+        data_collector.optimizer_status = Const.END_PREFIX + Const.OPTIMIZER
+    def patch_clip_grad(func):
+        def wrapper(*args, **kwargs):
+            data_collector.optimizer_status = Const.CLIP_GRAD
+            func(*args, **kwargs)
+            data_collector.optimizer_status = Const.END_PREFIX + Const.CLIP_GRAD
+        return wrapper
+    if torch_version_above_or_equal_2:
+        register_optimizer_step_pre_hook(optimizer_pre_step_hook)
+        register_optimizer_step_post_hook(optimizer_post_step_hook)
+    else:
+        logger.info_on_rank_0("Pytorch version is below 2.0, cannot register optimizer hook.")
+    try:
+        torch.nn.utils.clip_grad_norm_ = patch_clip_grad(torch.nn.utils.clip_grad_norm_)
+        torch.nn.utils.clip_grad_norm = patch_clip_grad(torch.nn.utils.clip_grad_norm)
+        torch.nn.utils.clip_grad_value_ = patch_clip_grad(torch.nn.utils.clip_grad_value_)
+    except Exception as e:
+        logger.info_on_rank_0("Cannot patch clip grad function. detail:%s" % str(e))
+    try:
+        from megatron.core.optimizer import MegatronOptimizer
+        MegatronOptimizer.clip_grad_norm = patch_clip_grad(MegatronOptimizer.clip_grad_norm)
+    except ImportError:
+        pass
+    except Exception as e:
+        logger.info_on_rank_0("Cannot patch megatron clip grad function. detail:%s" % str(e))

msprobe/pytorch/hook_module/support_wrap_ops.yaml CHANGED Viewed

@@ -138,6 +138,10 @@ functional:
   - fold
   - multi_head_attention_forward
   - scaled_dot_product_attention
+  - lp_pool3d
+  - dropout1d
+  - mish
+  - huber_loss
 tensor:
   - __add__
@@ -172,6 +176,7 @@ tensor:
   - __sub__
   - __truediv__
   - __xor__
+  - __pow__
   - abs
   - abs_
   - absolute
@@ -557,6 +562,27 @@ tensor:
   - view_as
   - xlogy
   - xlogy_
+  - split
+  - stft
+  - nan_to_num
+  - dsplit
+  - orgqr
+  - bitwise_left_shift_
+  - arctan2
+  - histogram
+  - q_zero_point
+  - adjoint
+  - ormqr
+  - bitwise_right_shift_
+  - nanquantile
+  - lu
+  - quantile
+  - arctan2_
+  - qr
+  - diagonal_scatter
+  - corrcoef
+  - vsplit
+  - aminmax
 torch:
   - linalg.norm
@@ -1131,6 +1157,14 @@ torch_npu:
   - npu_lstm
   - npu_apply_adam
   - npu_apply_adam_w
+  - npu_anti_quant
+  - npu_grouped_matmu
+  - npu_quant_scatter
+  - npu_group_norm_silu
+  - npu_format_cast
+  - npu_moe_finalize_routing
+  - npu_moe_gating_top_k_softmax
+  - npu_trans_quant_param
 aten:
   - signbit

msprobe/pytorch/hook_module/wrap_distributed.py CHANGED Viewed

@@ -21,7 +21,6 @@ from msprobe.pytorch.hook_module.hook_module import HOOKModule
 from msprobe.pytorch.common.utils import torch_device_guard
 from msprobe.core.common.const import Const
 from msprobe.core.common.file_utils import load_yaml
-from msprobe.core.common.inplace_op_checker import InplaceOpChecker
 cur_path = os.path.dirname(os.path.realpath(__file__))
@@ -49,17 +48,16 @@ class DistributedOPTemplate(HOOKModule):
         self.op_name_ = op_name
         self.prefix_op_name_ = "Distributed" + Const.SEP + str(op_name) + Const.SEP
         super().__init__(build_hook)
-        if not self.stop_hook and InplaceOpChecker.check(self.op_name_, InplaceOpChecker.OP_DISTRIBUTED):
-            self.op_is_inplace = True
+        if not self.stop_hook:
+            self.op_is_distributed = True
     @torch_device_guard
     def forward(self, *args, **kwargs):
+        handle = distributed_func.get(self.op_name_)(*args, **kwargs)
         if kwargs.get("async_op") or self.op_name_ in ["isend", "irecv"]:
-            handle = distributed_func.get(self.op_name_)(*args, **kwargs)
-            handle.wait()
-            return handle
-        else:
-            return distributed_func.get(self.op_name_)(*args, **kwargs)
+            if handle and hasattr(handle, 'wait'):
+                handle.wait()
+        return handle
 def wrap_distributed_op(op_name, hook):

msprobe/pytorch/hook_module/wrap_functional.py CHANGED Viewed

@@ -23,46 +23,6 @@ from msprobe.pytorch.common.log import logger
 from msprobe.core.common.file_utils import load_yaml
-def remove_dropout():
-    if torch.__version__ > "1.8":
-        logger.info_on_rank_0("For precision comparison, the probability p in the dropout method is set to 0.")
-        import torch.nn.functional as F
-        from torch import _VF
-        from torch.overrides import has_torch_function_unary, handle_torch_function
-        def function_dropout(input_tensor: torch.Tensor, p: float = 0.5, training: bool = True,
-                             inplace: bool = False) -> torch.Tensor:
-            if has_torch_function_unary(input_tensor):
-                return handle_torch_function(
-                    function_dropout, (input_tensor,), input_tensor, p=0., training=training, inplace=inplace)
-            if p < 0.0 or p > 1.0:
-                raise ValueError("dropout probability has to be between 0 and 1, " "but got {}".format(p))
-            return _VF.dropout_(input_tensor, 0., training) if inplace else _VF.dropout(input_tensor, 0., training)
-        def function_dropout2d(input_tensor: torch.Tensor, p: float = 0.5, training: bool = True,
-                               inplace: bool = False) -> torch.Tensor:
-            if has_torch_function_unary(input_tensor):
-                return handle_torch_function(
-                    function_dropout2d, (input_tensor,), input_tensor, p=0., training=training, inplace=inplace)
-            if p < 0.0 or p > 1.0:
-                raise ValueError("dropout probability has to be between 0 and 1, " "but got {}".format(p))
-            return _VF.feature_dropout_(input_tensor, 0., training) if inplace else _VF.feature_dropout(input_tensor,
-                                                                                                        0., training)
-        def function_dropout3d(input_tensor: torch.Tensor, p: float = 0.5, training: bool = True,
-                               inplace: bool = False) -> torch.Tensor:
-            if has_torch_function_unary(input_tensor):
-                return handle_torch_function(
-                    function_dropout3d, (input_tensor,), input_tensor, p=0., training=training, inplace=inplace)
-            if p < 0.0 or p > 1.0:
-                raise ValueError("dropout probability has to be between 0 and 1, " "but got {}".format(p))
-            return _VF.feature_dropout_(input_tensor, 0., training) if inplace else _VF.feature_dropout(input_tensor,
-                                                                                                        0., training)
-        F.dropout = function_dropout
-        F.dropout2d = function_dropout2d
-        F.dropout3d = function_dropout3d
 cur_path = os.path.dirname(os.path.realpath(__file__))
 yaml_path = os.path.join(cur_path, "support_wrap_ops.yaml")

msprobe/pytorch/monitor/anomaly_analyse.py CHANGED Viewed

@@ -19,7 +19,7 @@ import argparse
 import ast
 import heapq
-from msprobe.core.common.log import logger
+from msprobe.pytorch.common.log import logger
 from msprobe.core.common.const import MonitorConst
 from msprobe.core.common.file_utils import check_path_before_create, save_json, create_directory, remove_path, \
     check_file_or_directory_path, load_json

msprobe/pytorch/monitor/anomaly_detect.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
+# Copyright (c) 2024-2025, Huawei Technologies Co., Ltd.
 # All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0  (the "License");
@@ -12,21 +12,22 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import itertools
 import os
-import sys
 import statistics as st
+import sys
 from abc import ABC
+from collections import defaultdict
 from dataclasses import dataclass, field
 from typing import List
-from collections import defaultdict
 import pandas as pd
+import torch
 from torch.utils.tensorboard import SummaryWriter
-from msprobe.core.common.log import logger
-from msprobe.core.common.file_utils import change_mode, create_directory, write_df_to_csv
 from msprobe.core.common.const import FileCheckConst, MonitorConst
+from msprobe.core.common.file_utils import change_mode, create_directory, write_df_to_csv
+from msprobe.pytorch.common.log import logger
 class ScanRule(ABC):
@@ -134,7 +135,7 @@ class AnomalyDataFactory(ABC):
             raise ValueError("tag must be a tuple with length 2")
         tag_name = tag[0]
         param_name = tag_name.split('/')[0]
-        call_id = self.name2callid.get(param_name, -1)
+        call_id = self.name2callid.get(tag_name, -1)
         if MonitorConst.VPP_SEP in param_name:
             vpp_stage = int(param_name.split(MonitorConst.VPP_SEP)[0])
         else:
@@ -153,6 +154,24 @@ class AnomalyDataFactory(ABC):
         )
+class TrainStage:
+    DEFAULT_STAGE = -1
+    FORWARD_STAGE = 0
+    BACKWARD_STAGE = 1
+    OPTIMIZER_STAGE = 2
+FORWARD_KEY = [MonitorConst.ACTV_IN, MonitorConst.ACTV_OUT]
+BACKWARD_KEY = [MonitorConst.ACTVGRAD_IN, MonitorConst.ACTVGRAD_OUT,
+                MonitorConst.PRE_GRAD, MonitorConst.POST_GRAD, MonitorConst.ACC_GRAD]
+OPTIMIZER_KEY = [MonitorConst.EXP_AVG, MonitorConst.EFXP_AVG_SQ]
+TRAIN_STAGE = {
+    **{key_: TrainStage.FORWARD_STAGE for key_ in FORWARD_KEY},
+    **{key_: TrainStage.BACKWARD_STAGE for key_ in BACKWARD_KEY},
+    **{key_: TrainStage.OPTIMIZER_STAGE for key_ in OPTIMIZER_KEY}
+}
 @dataclass(eq=True)
 class GradAnomalyData:
     rank: int = 0
@@ -166,25 +185,48 @@ class GradAnomalyData:
     group_mates: list = field(default=None, compare=False)
     def __lt__(self, other):
+        """
+        自定义比较函数，用于确定 GradAnomalyData 实例之间的顺序。
+        比较规则为：
+            step 和 micro_step 值越小优先级越高；
+            vpp 和 pp 在前向阶段值越小优先级越高，在非前向阶段值越大优先级越高；
+            call_id 值越小优先级越高。
+        """
         if not isinstance(other, GradAnomalyData):
             return NotImplemented
-        if self.step != other.step:
-            return self.step < other.step
-        if self.micro_step != other.micro_step:
-            return self.micro_step < other.micro_step
-        if self.vpp_stage != other.vpp_stage:
-            return self.vpp_stage > other.vpp_stage
-        if self.pp_stage != other.pp_stage:
-            return self.pp_stage > other.pp_stage
-        if self.call_id != other.call_id:
-            return self.call_id < other.call_id
-        return False
+        self_train_stage = self.get_train_stage(self.tag_name)
+        other_train_stage = self.get_train_stage(other.tag_name)
+        def vpp_pp_comparator(anomaly):
+            """
+            Determine the priority rule for vpp and pp based on train stage
+            Forward stage prefers smaller vpp and pp
+            Other stages prefer larger vpp and pp
+            """
+            if self_train_stage == TrainStage.FORWARD_STAGE:
+                return anomaly.vpp_stage, anomaly.pp_stage
+            else:
+                return -anomaly.vpp_stage, -anomaly.pp_stage
+        self_cmp = [self.step, self.micro_step, self_train_stage, *vpp_pp_comparator(self), self.call_id]
+        other_cmp = [other.step, other.micro_step, other_train_stage, *vpp_pp_comparator(other), other.call_id]
+        return self_cmp < other_cmp
     def __le__(self, other):
         if not isinstance(other, GradAnomalyData):
             return NotImplemented
         return self == other or self < other
+    @staticmethod
+    def get_train_stage(tag_name):
+        """
+        :param tag_name: "0:fc2_0/rank0/input", "0:fc1.weight/rank0/post_grad", "0:fc2.weight/rank0/efxp_avg_sq"
+        :return: int, if forward return 0; if backward return 1; if optimizer return 2
+        """
+        key_ = tag_name.split("/")[-1]
+        return TRAIN_STAGE.get(key_, TrainStage.DEFAULT_STAGE)
     def to_dict(self):
         return self.__dict__
@@ -198,7 +240,6 @@ class WriterInput:
     path: str
     ad_rules: list
     job_id: str
-    anomaly_inform: bool = False
     anomaly_factory: AnomalyDataFactory = None
     ndigits: int = 6
     step_count_per_record: int = 1
@@ -209,7 +250,6 @@ class BaseWriterWithAD:
         self.tag2scalars = {}
         self.ad_rules = writer_input.ad_rules
         self.job_id = writer_input.job_id
-        self.anomaly_inform = writer_input.anomaly_inform
         self.anomaly_factory = writer_input.anomaly_factory
         self.anomalies = []
         self.ndigits = writer_input.ndigits
@@ -242,6 +282,27 @@ class BaseWriterWithAD:
             if self.anomaly_factory:
                 self.anomalies.append(self.anomaly_factory.create(tag, exception_message, global_step))
+    def write_metrics(self, ops, metric_value, step, prefix=''):
+        """
+        Record every tensor in ``metric_value`` through ``self.add_scalar``.
+        :param ops: iterable of op names; combined with the keys of ``metric_value``
+            to build ``(key, op)`` tags
+        :param metric_value: mapping whose values are themselves mappings of tensors
+        :param step: step value passed through to ``add_scalar``
+        :param prefix: accepted for interface compatibility; not read here
+        """
+        if not metric_value:
+            return
+        tags = list(itertools.product(metric_value.keys(), ops))
+        tensors = [tensor for op2tensor in metric_value.values() for tensor in op2tensor.values()]
+        if not tensors:
+            return
+        # NOTE(review): tags and tensors are paired by position, which presumably
+        # relies on each inner mapping holding one tensor per op, in ``ops`` order.
+        slice_size = MonitorConst.SLICE_SIZE
+        with torch.no_grad():
+            # Stack and move to CPU one slice at a time instead of tensor by tensor.
+            for begin in range(0, len(tensors), slice_size):
+                end = begin + slice_size
+                metric_list = torch.stack(tensors[begin:end]).cpu()
+                for tag, metric in zip(tags[begin:end], metric_list):
+                    self.add_scalar(tag, metric, step)
     def _ad(self, scalar_value, history):
         """Return ``AnomalyScanner.scan`` of ``history`` against ``self.ad_rules``, with ``scalar_value`` as ``cur``."""
         rules = self.ad_rules
         return AnomalyScanner.scan(rules, history, cur=scalar_value)
@@ -291,7 +352,7 @@ class CSVWriterWithAD(BaseWriterWithAD):
         """
         if len(self.context_dict) == 0:
             return
         ster_start, step_end = self.get_step_interval(step)
         filepath = os.path.join(self.log_dir, f'{prefix}_{ster_start}-{step_end}.csv')
         if not os.path.exists(filepath):
@@ -304,7 +365,7 @@ class CSVWriterWithAD(BaseWriterWithAD):
                 new_data.append([name] + [step] + metric_value)
             else:
                 new_data.append(name.split(MonitorConst.VPP_SEP) + [step] + metric_value)
-        new_data = pd.DataFrame(new_data).round(self.ndigits)
+        new_data = pd.DataFrame(new_data).round(self.ndigits).fillna("nan")
         write_df_to_csv(new_data, filepath, mode='a+', header=False)
         self.context_dict = defaultdict(list)
@@ -317,6 +378,30 @@ class CSVWriterWithAD(BaseWriterWithAD):
         name = tag[0].split('/')[0]
         self.context_dict[name].append(scalar_value.item())
+    def write_metrics(self, ops, metric_value, step, prefix=''):
+        """
+        Record the metrics via the base class, then write them out with ``write_csv``.
+        :param ops: iterable of op names used to build the CSV column names
+        :param metric_value: mapping keyed by module/param name; its first key is
+            checked for MonitorConst.VPP_SEP to decide on a "vpp_stage" column
+        :param step: step value passed to the base class and to ``write_csv``
+        :param prefix: "actv" / "actv_grad" select module-style headers; any other
+            value selects param-style headers. Also passed to ``write_csv``.
+        """
+        # The base implementation is called with an empty prefix, as before.
+        super().write_metrics(ops, metric_value, step, prefix='')
+        # Build the csv header for this batch.
+        # Forward norms use input.ops_ and output.ops_ columns; backward ones use
+        # input_grad.ops_ and output_grad.ops_.
+        io_names_by_prefix = {
+            "actv": [MonitorConst.ACTV_IN, MonitorConst.ACTV_OUT],
+            "actv_grad": [MonitorConst.ACTVGRAD_IN, MonitorConst.ACTVGRAD_OUT],
+        }
+        input_and_output = io_names_by_prefix.get(prefix)
+        if input_and_output is None:
+            csv_header = ["param_name", "step", *ops]
+        else:
+            columns = [MonitorConst.DOT.join(pair) for pair in itertools.product(input_and_output, ops)]
+            csv_header = ["module_name", "step", *columns]
+        names = list(metric_value.keys())
+        if names and MonitorConst.VPP_SEP in names[0]:
+            csv_header = ["vpp_stage", *csv_header]
+        # The header is only set for the duration of this write.
+        self.header = csv_header
+        self.write_csv(prefix, step)
+        self.header = []
     def close(self):
         """Do nothing; this writer has no close-time work."""
         return None

mindstudio-probe 1.1.1__py3-none-any.whl → 1.2.1__py3-none-any.whl

mindstudio-probe 1.1.1py3-none-any.whl → 1.2.1py3-none-any.whl