mindstudio-probe 1.1.1__py3-none-any.whl → 1.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mindstudio_probe-1.1.1.dist-info → mindstudio_probe-1.2.2.dist-info}/METADATA +3 -2
- mindstudio_probe-1.2.2.dist-info/RECORD +415 -0
- msprobe/CMakeLists.txt +5 -0
- msprobe/README.md +16 -21
- msprobe/config.json +1 -0
- msprobe/core/common/const.py +185 -11
- msprobe/core/common/exceptions.py +3 -1
- msprobe/core/common/file_utils.py +33 -7
- msprobe/core/common/inplace_ops.yaml +4 -0
- msprobe/core/common/utils.py +42 -14
- msprobe/core/common_config.py +6 -0
- msprobe/core/compare/acc_compare.py +139 -128
- msprobe/core/compare/check.py +31 -29
- msprobe/core/compare/compare_cli.py +17 -16
- msprobe/core/compare/highlight.py +186 -99
- msprobe/core/compare/layer_mapping/data_scope_parser.py +19 -8
- msprobe/core/compare/layer_mapping/layer_mapping.py +21 -14
- msprobe/core/compare/layer_mapping/postprocess_pass.py +4 -3
- msprobe/core/compare/merge_result/merge_result.py +381 -0
- msprobe/core/compare/merge_result/merge_result_cli.py +31 -0
- msprobe/core/compare/merge_result/utils.py +81 -0
- msprobe/core/compare/multiprocessing_compute.py +2 -2
- msprobe/core/compare/npy_compare.py +109 -147
- msprobe/core/compare/utils.py +199 -69
- msprobe/core/data_dump/data_collector.py +100 -25
- msprobe/core/data_dump/data_processor/base.py +130 -28
- msprobe/core/data_dump/data_processor/factory.py +8 -3
- msprobe/core/data_dump/data_processor/mindspore_processor.py +170 -23
- msprobe/core/data_dump/data_processor/pytorch_processor.py +175 -64
- msprobe/core/data_dump/json_writer.py +54 -8
- msprobe/core/data_dump/scope.py +19 -18
- msprobe/core/overflow_check/abnormal_scene.py +9 -5
- msprobe/core/overflow_check/checker.py +1 -1
- msprobe/core/overflow_check/utils.py +1 -1
- msprobe/docs/01.installation.md +121 -17
- msprobe/docs/02.config_introduction.md +18 -16
- msprobe/docs/03.config_examples.md +24 -0
- msprobe/docs/05.data_dump_PyTorch.md +107 -58
- msprobe/docs/06.data_dump_MindSpore.md +95 -34
- msprobe/docs/07.accuracy_checker_PyTorch.md +18 -18
- msprobe/docs/09.accuracy_checker_MindSpore.md +8 -6
- msprobe/docs/10.accuracy_compare_PyTorch.md +99 -41
- msprobe/docs/11.accuracy_compare_MindSpore.md +249 -48
- msprobe/docs/12.overflow_check_PyTorch.md +1 -1
- msprobe/docs/19.monitor.md +310 -220
- msprobe/docs/21.visualization_PyTorch.md +125 -35
- msprobe/docs/22.visualization_MindSpore.md +149 -41
- msprobe/docs/23.generate_operator_PyTorch.md +107 -0
- msprobe/docs/24.code_mapping_Mindspore.md +28 -0
- msprobe/docs/{23.tool_function_introduction.md → 25.tool_function_introduction.md} +1 -0
- msprobe/docs/26.data_dump_PyTorch_baseline.md +37 -0
- msprobe/docs/27.dump_json_instruction.md +525 -0
- msprobe/docs/28.debugger_save_instruction.md +94 -0
- msprobe/docs/28.kernel_dump_MindSpore.md +69 -0
- msprobe/docs/FAQ.md +26 -2
- msprobe/docs/accuracy_checker_MindSpore/accuracy_checker_MindSpore_baseline.md +14 -0
- msprobe/docs/data_dump_MindSpore/data_dump_MindSpore_baseline.md +22 -0
- msprobe/docs/img/merge_result.png +0 -0
- msprobe/docs/img/monitor/step_count_per_record.png +0 -0
- msprobe/docs/img/visualization/fuzzy_match_ms.png +0 -0
- msprobe/docs/img/visualization/fuzzy_match_pt.png +0 -0
- msprobe/docs/img/visualization/tensorboard_1.png +0 -0
- msprobe/docs/img/visualization/tensorboard_2.png +0 -0
- msprobe/docs/img/visualization/vis_browser_1.png +0 -0
- msprobe/docs/img/visualization/vis_browser_2.png +0 -0
- msprobe/docs/img/visualization/vis_precision_info.png +0 -0
- msprobe/docs/img/visualization/vis_search_info.png +0 -0
- msprobe/docs/img/visualization/vis_show_info.png +0 -0
- msprobe/docs/img/visualization/vis_showcase.png +0 -0
- msprobe/docs/img/visualization/vis_unmatch_info.png +0 -0
- msprobe/docs/visualization/GPTModel.png +0 -0
- msprobe/docs/visualization/ParallelMLP.png +0 -0
- msprobe/docs/visualization/layer_mapping_example.md +132 -0
- msprobe/docs/visualization/mapping.png +0 -0
- msprobe/docs/visualization/mapping1.png +0 -0
- msprobe/docs/visualization/module_name.png +0 -0
- msprobe/docs/visualization/module_name1.png +0 -0
- msprobe/docs/visualization/no_mapping.png +0 -0
- msprobe/docs/visualization/no_mapping1.png +0 -0
- msprobe/docs/visualization/no_mapping_analyze.png +0 -0
- msprobe/docs/visualization/top_layer.png +0 -0
- msprobe/mindspore/__init__.py +11 -0
- msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +80 -28
- msprobe/mindspore/api_accuracy_checker/api_runner.py +54 -16
- msprobe/mindspore/api_accuracy_checker/cmd_parser.py +2 -1
- msprobe/mindspore/api_accuracy_checker/compute_element.py +52 -8
- msprobe/mindspore/api_accuracy_checker/data_manager.py +37 -0
- msprobe/mindspore/api_accuracy_checker/main.py +1 -0
- msprobe/mindspore/api_accuracy_checker/multi_api_accuracy_checker.py +12 -6
- msprobe/mindspore/api_accuracy_checker/multi_data_manager.py +3 -1
- msprobe/mindspore/api_accuracy_checker/torch_mindtorch_importer.py +129 -0
- msprobe/mindspore/api_accuracy_checker/type_mapping.py +24 -1
- msprobe/mindspore/api_accuracy_checker/utils.py +6 -1
- msprobe/mindspore/code_mapping/bind.py +264 -0
- msprobe/mindspore/code_mapping/cmd_parser.py +40 -0
- msprobe/mindspore/code_mapping/graph.py +49 -0
- msprobe/mindspore/code_mapping/graph_parser.py +226 -0
- msprobe/mindspore/code_mapping/main.py +24 -0
- msprobe/mindspore/code_mapping/processor.py +34 -0
- msprobe/mindspore/common/const.py +3 -1
- msprobe/mindspore/common/utils.py +68 -5
- msprobe/mindspore/compare/distributed_compare.py +0 -2
- msprobe/mindspore/compare/ms_compare.py +105 -63
- msprobe/mindspore/compare/ms_graph_compare.py +14 -5
- msprobe/mindspore/debugger/debugger_config.py +28 -2
- msprobe/mindspore/debugger/precision_debugger.py +100 -12
- msprobe/mindspore/dump/hook_cell/api_registry.py +85 -16
- msprobe/mindspore/dump/hook_cell/hook_cell.py +60 -38
- msprobe/mindspore/dump/hook_cell/primitive_hooks.py +33 -15
- msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml +11 -1
- msprobe/mindspore/dump/hook_cell/wrap_api.py +92 -1
- msprobe/mindspore/dump/jit_dump.py +7 -6
- msprobe/mindspore/dump/kernel_dump/kernel_config.py +33 -0
- msprobe/mindspore/dump/kernel_graph_dump.py +7 -0
- msprobe/mindspore/free_benchmark/api_pynative_self_check.py +13 -4
- msprobe/mindspore/free_benchmark/perturbation/bit_noise.py +2 -2
- msprobe/mindspore/grad_probe/grad_analyzer.py +24 -12
- msprobe/mindspore/grad_probe/hook.py +13 -4
- msprobe/mindspore/mindtorch/__init__.py +18 -0
- msprobe/mindspore/mindtorch/mindtorch_adaptor.py +255 -0
- msprobe/mindspore/monitor/anomaly_detect.py +404 -0
- msprobe/mindspore/monitor/distributed/__init__.py +0 -0
- msprobe/mindspore/monitor/distributed/distributed_ops.yaml +15 -0
- msprobe/mindspore/monitor/distributed/stack_blacklist.yaml +5 -0
- msprobe/mindspore/monitor/distributed/wrap_distributed.py +300 -0
- msprobe/mindspore/monitor/features.py +63 -0
- msprobe/mindspore/monitor/module_hook.py +821 -0
- msprobe/mindspore/monitor/module_spec_verifier.py +94 -0
- msprobe/mindspore/monitor/utils.py +267 -0
- msprobe/mindspore/ms_config.py +13 -3
- msprobe/mindspore/overflow_check/kernel_graph_overflow_check.py +7 -0
- msprobe/mindspore/service.py +347 -107
- msprobe/msprobe.py +24 -3
- msprobe/pytorch/__init__.py +7 -7
- msprobe/pytorch/api_accuracy_checker/common/utils.py +31 -16
- msprobe/pytorch/api_accuracy_checker/compare/algorithm.py +41 -8
- msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +100 -267
- msprobe/pytorch/api_accuracy_checker/compare/api_precision_standard.yaml +4 -1
- msprobe/pytorch/api_accuracy_checker/compare/compare.py +69 -68
- msprobe/pytorch/api_accuracy_checker/compare/compare_column.py +54 -0
- msprobe/pytorch/api_accuracy_checker/compare/compare_input.py +51 -0
- msprobe/pytorch/api_accuracy_checker/compare/compare_utils.py +2 -4
- msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +55 -31
- msprobe/pytorch/api_accuracy_checker/precision_standard/absolute_threshold.py +106 -0
- msprobe/pytorch/api_accuracy_checker/precision_standard/accumulative_error_compare.py +107 -0
- msprobe/pytorch/api_accuracy_checker/precision_standard/base_standard.py +151 -0
- msprobe/pytorch/api_accuracy_checker/precision_standard/benchmark_compare.py +226 -0
- msprobe/pytorch/api_accuracy_checker/precision_standard/binary_consistency.py +68 -0
- msprobe/pytorch/api_accuracy_checker/precision_standard/standard_config.py +218 -0
- msprobe/pytorch/api_accuracy_checker/precision_standard/standard_register.py +104 -0
- msprobe/pytorch/api_accuracy_checker/precision_standard/thousandth_standard.py +63 -0
- msprobe/pytorch/api_accuracy_checker/precision_standard/ulp_compare.py +200 -0
- msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py +57 -1
- msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +2 -1
- msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +42 -14
- msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +64 -19
- msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py +34 -4
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +5 -3
- msprobe/pytorch/bench_functions/apply_adam.py +215 -0
- msprobe/pytorch/bench_functions/group_norm_silu.py +27 -0
- msprobe/pytorch/bench_functions/mish.py +21 -0
- msprobe/pytorch/bench_functions/moe_gating_top_k_softmax.py +44 -0
- msprobe/pytorch/bench_functions/npu_fusion_attention.py +42 -10
- msprobe/pytorch/bench_functions/sort_v2.py +21 -0
- msprobe/pytorch/common/parse_json.py +2 -1
- msprobe/pytorch/common/utils.py +116 -2
- msprobe/pytorch/compare/distributed_compare.py +17 -29
- msprobe/pytorch/compare/pt_compare.py +40 -20
- msprobe/pytorch/debugger/debugger_config.py +42 -17
- msprobe/pytorch/debugger/precision_debugger.py +56 -12
- msprobe/pytorch/dump/module_dump/__init__.py +0 -0
- msprobe/pytorch/dump/module_dump/module_dump.py +86 -0
- msprobe/pytorch/dump/module_dump/module_processer.py +204 -0
- msprobe/pytorch/free_benchmark/common/params.py +2 -1
- msprobe/pytorch/free_benchmark/common/utils.py +3 -0
- msprobe/pytorch/free_benchmark/compare/grad_saver.py +0 -2
- msprobe/pytorch/free_benchmark/result_handlers/base_handler.py +31 -47
- msprobe/pytorch/free_benchmark/result_handlers/preheat_handler.py +0 -4
- msprobe/pytorch/function_factory.py +7 -1
- msprobe/pytorch/hook_module/__init__.py +1 -1
- msprobe/pytorch/hook_module/hook_module.py +14 -11
- msprobe/pytorch/hook_module/register_optimizer_hook.py +59 -0
- msprobe/pytorch/hook_module/support_wrap_ops.yaml +36 -1
- msprobe/pytorch/hook_module/wrap_distributed.py +10 -8
- msprobe/pytorch/hook_module/wrap_functional.py +0 -40
- msprobe/pytorch/monitor/anomaly_analyse.py +1 -1
- msprobe/pytorch/monitor/anomaly_detect.py +98 -28
- msprobe/pytorch/monitor/csv2tb.py +164 -0
- msprobe/pytorch/monitor/distributed/wrap_distributed.py +25 -14
- msprobe/pytorch/monitor/features.py +3 -3
- msprobe/pytorch/monitor/module_hook.py +543 -318
- msprobe/pytorch/monitor/module_metric.py +27 -48
- msprobe/pytorch/monitor/module_spec_verifier.py +3 -1
- msprobe/pytorch/monitor/optimizer_collect.py +76 -56
- msprobe/pytorch/monitor/unittest/test_monitor.py +24 -9
- msprobe/pytorch/monitor/utils.py +84 -48
- msprobe/pytorch/online_dispatch/dispatch.py +8 -2
- msprobe/pytorch/parse_tool/lib/compare.py +10 -10
- msprobe/pytorch/parse_tool/lib/config.py +5 -7
- msprobe/pytorch/parse_tool/lib/file_desc.py +15 -1
- msprobe/pytorch/parse_tool/lib/interactive_cli.py +10 -10
- msprobe/pytorch/parse_tool/lib/parse_exception.py +7 -7
- msprobe/pytorch/parse_tool/lib/parse_tool.py +11 -10
- msprobe/pytorch/parse_tool/lib/utils.py +18 -19
- msprobe/pytorch/parse_tool/lib/visualization.py +9 -10
- msprobe/pytorch/pt_config.py +19 -22
- msprobe/pytorch/service.py +264 -115
- msprobe/visualization/builder/graph_builder.py +93 -10
- msprobe/visualization/builder/msprobe_adapter.py +30 -6
- msprobe/visualization/compare/graph_comparator.py +64 -14
- msprobe/visualization/compare/mode_adapter.py +1 -15
- msprobe/visualization/graph/base_node.py +15 -19
- msprobe/visualization/graph/distributed_analyzer.py +395 -0
- msprobe/visualization/graph/graph.py +9 -0
- msprobe/visualization/graph/node_op.py +4 -2
- msprobe/visualization/graph_service.py +100 -27
- msprobe/visualization/utils.py +24 -31
- mindstudio_probe-1.1.1.dist-info/RECORD +0 -341
- msprobe/pytorch/functional/module_dump.py +0 -84
- msprobe/pytorch/module_processer.py +0 -150
- {mindstudio_probe-1.1.1.dist-info → mindstudio_probe-1.2.2.dist-info}/LICENSE +0 -0
- {mindstudio_probe-1.1.1.dist-info → mindstudio_probe-1.2.2.dist-info}/WHEEL +0 -0
- {mindstudio_probe-1.1.1.dist-info → mindstudio_probe-1.2.2.dist-info}/entry_points.txt +0 -0
- {mindstudio_probe-1.1.1.dist-info → mindstudio_probe-1.2.2.dist-info}/top_level.txt +0 -0
- /msprobe/docs/{data_dump_Mindspore → data_dump_MindSpore}/dynamic_graph_quick_start_example.md +0 -0
- /msprobe/{pytorch/functional → mindspore/code_mapping}/__init__.py +0 -0
|
@@ -15,17 +15,17 @@
|
|
|
15
15
|
|
|
16
16
|
import functools
|
|
17
17
|
import threading
|
|
18
|
+
from collections import defaultdict
|
|
18
19
|
|
|
19
20
|
import torch
|
|
20
21
|
import torch.nn as nn
|
|
21
22
|
import torch.utils.hooks as full_hooks
|
|
22
23
|
|
|
23
|
-
from msprobe.core.common.const import Const
|
|
24
24
|
torch_version_above_or_equal_2 = torch.__version__.split('+')[0] >= '2.0'
|
|
25
25
|
|
|
26
26
|
|
|
27
27
|
class HOOKModule(nn.Module):
|
|
28
|
-
module_count =
|
|
28
|
+
module_count = defaultdict(int)
|
|
29
29
|
inner_stop_hook = {}
|
|
30
30
|
|
|
31
31
|
def __init__(self, build_hook) -> None:
|
|
@@ -41,12 +41,7 @@ class HOOKModule(nn.Module):
|
|
|
41
41
|
if hasattr(self, "prefix_op_name_"):
|
|
42
42
|
self.prefix = self.prefix_op_name_
|
|
43
43
|
|
|
44
|
-
|
|
45
|
-
HOOKModule.module_count[self.prefix] = 1
|
|
46
|
-
self.prefix += '0' + Const.SEP
|
|
47
|
-
else:
|
|
48
|
-
HOOKModule.module_count[self.prefix] += 1
|
|
49
|
-
self.prefix = self.prefix + str(HOOKModule.module_count[self.prefix] - 1) + Const.SEP
|
|
44
|
+
self.forward_data_collected = False
|
|
50
45
|
forward_pre_hook, forward_hook, backward_hook, _ = build_hook(self.prefix)
|
|
51
46
|
if torch_version_above_or_equal_2:
|
|
52
47
|
self.register_forward_pre_hook(forward_pre_hook, with_kwargs=True)
|
|
@@ -66,9 +61,17 @@ class HOOKModule(nn.Module):
|
|
|
66
61
|
HOOKModule.inner_stop_hook[self.current_thread] = False
|
|
67
62
|
return result
|
|
68
63
|
|
|
69
|
-
@
|
|
70
|
-
def reset_module_stats(
|
|
71
|
-
|
|
64
|
+
@staticmethod
|
|
65
|
+
def reset_module_stats():
|
|
66
|
+
HOOKModule.module_count = defaultdict(int)
|
|
67
|
+
|
|
68
|
+
@staticmethod
|
|
69
|
+
def add_module_count(name):
|
|
70
|
+
HOOKModule.module_count[name] += 1
|
|
71
|
+
|
|
72
|
+
@staticmethod
|
|
73
|
+
def get_module_count(name):
|
|
74
|
+
return HOOKModule.module_count[name]
|
|
72
75
|
|
|
73
76
|
def _call_func(self, *args, **kwargs):
|
|
74
77
|
full_backward_hooks, non_full_backward_hooks = [], []
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
|
|
16
|
+
import torch
|
|
17
|
+
from msprobe.core.common.const import Const
|
|
18
|
+
from msprobe.pytorch.common.log import logger
|
|
19
|
+
|
|
20
|
+
torch_version_above_or_equal_2 = torch.__version__.split('+')[0] >= '2.0'
|
|
21
|
+
if torch_version_above_or_equal_2:
|
|
22
|
+
from torch.optim.optimizer import register_optimizer_step_pre_hook, register_optimizer_step_post_hook
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def register_optimizer_hook(data_collector):
|
|
26
|
+
def optimizer_pre_step_hook(optimizer, args, kwargs):
|
|
27
|
+
data_collector.optimizer_status = Const.OPTIMIZER
|
|
28
|
+
|
|
29
|
+
def optimizer_post_step_hook(optimizer, args, kwargs):
|
|
30
|
+
data_collector.optimizer_status = Const.END_PREFIX + Const.OPTIMIZER
|
|
31
|
+
|
|
32
|
+
def patch_clip_grad(func):
|
|
33
|
+
def wrapper(*args, **kwargs):
|
|
34
|
+
data_collector.optimizer_status = Const.CLIP_GRAD
|
|
35
|
+
func(*args, **kwargs)
|
|
36
|
+
data_collector.optimizer_status = Const.END_PREFIX + Const.CLIP_GRAD
|
|
37
|
+
|
|
38
|
+
return wrapper
|
|
39
|
+
|
|
40
|
+
if torch_version_above_or_equal_2:
|
|
41
|
+
register_optimizer_step_pre_hook(optimizer_pre_step_hook)
|
|
42
|
+
register_optimizer_step_post_hook(optimizer_post_step_hook)
|
|
43
|
+
else:
|
|
44
|
+
logger.info_on_rank_0("Pytorch version is below 2.0, cannot register optimizer hook.")
|
|
45
|
+
|
|
46
|
+
try:
|
|
47
|
+
torch.nn.utils.clip_grad_norm_ = patch_clip_grad(torch.nn.utils.clip_grad_norm_)
|
|
48
|
+
torch.nn.utils.clip_grad_norm = patch_clip_grad(torch.nn.utils.clip_grad_norm)
|
|
49
|
+
torch.nn.utils.clip_grad_value_ = patch_clip_grad(torch.nn.utils.clip_grad_value_)
|
|
50
|
+
except Exception as e:
|
|
51
|
+
logger.info_on_rank_0("Cannot patch clip grad function. detail:%s" % str(e))
|
|
52
|
+
|
|
53
|
+
try:
|
|
54
|
+
from megatron.core.optimizer import MegatronOptimizer
|
|
55
|
+
MegatronOptimizer.clip_grad_norm = patch_clip_grad(MegatronOptimizer.clip_grad_norm)
|
|
56
|
+
except ImportError:
|
|
57
|
+
pass
|
|
58
|
+
except Exception as e:
|
|
59
|
+
logger.info_on_rank_0("Cannot patch megatron clip grad function. detail:%s" % str(e))
|
|
@@ -138,6 +138,10 @@ functional:
|
|
|
138
138
|
- fold
|
|
139
139
|
- multi_head_attention_forward
|
|
140
140
|
- scaled_dot_product_attention
|
|
141
|
+
- lp_pool3d
|
|
142
|
+
- dropout1d
|
|
143
|
+
- mish
|
|
144
|
+
- huber_loss
|
|
141
145
|
|
|
142
146
|
tensor:
|
|
143
147
|
- __add__
|
|
@@ -172,6 +176,7 @@ tensor:
|
|
|
172
176
|
- __sub__
|
|
173
177
|
- __truediv__
|
|
174
178
|
- __xor__
|
|
179
|
+
- __pow__
|
|
175
180
|
- abs
|
|
176
181
|
- abs_
|
|
177
182
|
- absolute
|
|
@@ -557,6 +562,27 @@ tensor:
|
|
|
557
562
|
- view_as
|
|
558
563
|
- xlogy
|
|
559
564
|
- xlogy_
|
|
565
|
+
- split
|
|
566
|
+
- stft
|
|
567
|
+
- nan_to_num
|
|
568
|
+
- dsplit
|
|
569
|
+
- orgqr
|
|
570
|
+
- bitwise_left_shift_
|
|
571
|
+
- arctan2
|
|
572
|
+
- histogram
|
|
573
|
+
- q_zero_point
|
|
574
|
+
- adjoint
|
|
575
|
+
- ormqr
|
|
576
|
+
- bitwise_right_shift_
|
|
577
|
+
- nanquantile
|
|
578
|
+
- lu
|
|
579
|
+
- quantile
|
|
580
|
+
- arctan2_
|
|
581
|
+
- qr
|
|
582
|
+
- diagonal_scatter
|
|
583
|
+
- corrcoef
|
|
584
|
+
- vsplit
|
|
585
|
+
- aminmax
|
|
560
586
|
|
|
561
587
|
torch:
|
|
562
588
|
- linalg.norm
|
|
@@ -1131,6 +1157,14 @@ torch_npu:
|
|
|
1131
1157
|
- npu_lstm
|
|
1132
1158
|
- npu_apply_adam
|
|
1133
1159
|
- npu_apply_adam_w
|
|
1160
|
+
- npu_anti_quant
|
|
1161
|
+
- npu_grouped_matmu
|
|
1162
|
+
- npu_quant_scatter
|
|
1163
|
+
- npu_group_norm_silu
|
|
1164
|
+
- npu_format_cast
|
|
1165
|
+
- npu_moe_finalize_routing
|
|
1166
|
+
- npu_moe_gating_top_k_softmax
|
|
1167
|
+
- npu_trans_quant_param
|
|
1134
1168
|
|
|
1135
1169
|
aten:
|
|
1136
1170
|
- signbit
|
|
@@ -1877,4 +1911,5 @@ distributed:
|
|
|
1877
1911
|
- all_to_all_single
|
|
1878
1912
|
- all_to_all
|
|
1879
1913
|
- all_gather_into_tensor
|
|
1880
|
-
- reduce_scatter_tensor
|
|
1914
|
+
- reduce_scatter_tensor
|
|
1915
|
+
- batch_isend_irecv
|
|
@@ -21,7 +21,6 @@ from msprobe.pytorch.hook_module.hook_module import HOOKModule
|
|
|
21
21
|
from msprobe.pytorch.common.utils import torch_device_guard
|
|
22
22
|
from msprobe.core.common.const import Const
|
|
23
23
|
from msprobe.core.common.file_utils import load_yaml
|
|
24
|
-
from msprobe.core.common.inplace_op_checker import InplaceOpChecker
|
|
25
24
|
|
|
26
25
|
|
|
27
26
|
cur_path = os.path.dirname(os.path.realpath(__file__))
|
|
@@ -49,17 +48,20 @@ class DistributedOPTemplate(HOOKModule):
|
|
|
49
48
|
self.op_name_ = op_name
|
|
50
49
|
self.prefix_op_name_ = "Distributed" + Const.SEP + str(op_name) + Const.SEP
|
|
51
50
|
super().__init__(build_hook)
|
|
52
|
-
if not self.stop_hook
|
|
53
|
-
self.
|
|
51
|
+
if not self.stop_hook:
|
|
52
|
+
self.op_is_distributed = True
|
|
54
53
|
|
|
55
54
|
@torch_device_guard
|
|
56
55
|
def forward(self, *args, **kwargs):
|
|
56
|
+
handle = distributed_func.get(self.op_name_)(*args, **kwargs)
|
|
57
57
|
if kwargs.get("async_op") or self.op_name_ in ["isend", "irecv"]:
|
|
58
|
-
handle
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
58
|
+
if handle and hasattr(handle, 'wait'):
|
|
59
|
+
handle.wait()
|
|
60
|
+
if self.op_name_ == "batch_isend_irecv":
|
|
61
|
+
if isinstance(handle, list):
|
|
62
|
+
for req in handle:
|
|
63
|
+
req.wait()
|
|
64
|
+
return handle
|
|
63
65
|
|
|
64
66
|
|
|
65
67
|
def wrap_distributed_op(op_name, hook):
|
|
@@ -23,46 +23,6 @@ from msprobe.pytorch.common.log import logger
|
|
|
23
23
|
from msprobe.core.common.file_utils import load_yaml
|
|
24
24
|
|
|
25
25
|
|
|
26
|
-
def remove_dropout():
|
|
27
|
-
if torch.__version__ > "1.8":
|
|
28
|
-
logger.info_on_rank_0("For precision comparison, the probability p in the dropout method is set to 0.")
|
|
29
|
-
import torch.nn.functional as F
|
|
30
|
-
from torch import _VF
|
|
31
|
-
from torch.overrides import has_torch_function_unary, handle_torch_function
|
|
32
|
-
|
|
33
|
-
def function_dropout(input_tensor: torch.Tensor, p: float = 0.5, training: bool = True,
|
|
34
|
-
inplace: bool = False) -> torch.Tensor:
|
|
35
|
-
if has_torch_function_unary(input_tensor):
|
|
36
|
-
return handle_torch_function(
|
|
37
|
-
function_dropout, (input_tensor,), input_tensor, p=0., training=training, inplace=inplace)
|
|
38
|
-
if p < 0.0 or p > 1.0:
|
|
39
|
-
raise ValueError("dropout probability has to be between 0 and 1, " "but got {}".format(p))
|
|
40
|
-
return _VF.dropout_(input_tensor, 0., training) if inplace else _VF.dropout(input_tensor, 0., training)
|
|
41
|
-
|
|
42
|
-
def function_dropout2d(input_tensor: torch.Tensor, p: float = 0.5, training: bool = True,
|
|
43
|
-
inplace: bool = False) -> torch.Tensor:
|
|
44
|
-
if has_torch_function_unary(input_tensor):
|
|
45
|
-
return handle_torch_function(
|
|
46
|
-
function_dropout2d, (input_tensor,), input_tensor, p=0., training=training, inplace=inplace)
|
|
47
|
-
if p < 0.0 or p > 1.0:
|
|
48
|
-
raise ValueError("dropout probability has to be between 0 and 1, " "but got {}".format(p))
|
|
49
|
-
return _VF.feature_dropout_(input_tensor, 0., training) if inplace else _VF.feature_dropout(input_tensor,
|
|
50
|
-
0., training)
|
|
51
|
-
|
|
52
|
-
def function_dropout3d(input_tensor: torch.Tensor, p: float = 0.5, training: bool = True,
|
|
53
|
-
inplace: bool = False) -> torch.Tensor:
|
|
54
|
-
if has_torch_function_unary(input_tensor):
|
|
55
|
-
return handle_torch_function(
|
|
56
|
-
function_dropout3d, (input_tensor,), input_tensor, p=0., training=training, inplace=inplace)
|
|
57
|
-
if p < 0.0 or p > 1.0:
|
|
58
|
-
raise ValueError("dropout probability has to be between 0 and 1, " "but got {}".format(p))
|
|
59
|
-
return _VF.feature_dropout_(input_tensor, 0., training) if inplace else _VF.feature_dropout(input_tensor,
|
|
60
|
-
0., training)
|
|
61
|
-
|
|
62
|
-
F.dropout = function_dropout
|
|
63
|
-
F.dropout2d = function_dropout2d
|
|
64
|
-
F.dropout3d = function_dropout3d
|
|
65
|
-
|
|
66
26
|
cur_path = os.path.dirname(os.path.realpath(__file__))
|
|
67
27
|
yaml_path = os.path.join(cur_path, "support_wrap_ops.yaml")
|
|
68
28
|
|
|
@@ -19,7 +19,7 @@ import argparse
|
|
|
19
19
|
import ast
|
|
20
20
|
import heapq
|
|
21
21
|
|
|
22
|
-
from msprobe.
|
|
22
|
+
from msprobe.pytorch.common.log import logger
|
|
23
23
|
from msprobe.core.common.const import MonitorConst
|
|
24
24
|
from msprobe.core.common.file_utils import check_path_before_create, save_json, create_directory, remove_path, \
|
|
25
25
|
check_file_or_directory_path, load_json
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# Copyright (c) 2024-
|
|
1
|
+
# Copyright (c) 2024-2025, Huawei Technologies Co., Ltd.
|
|
2
2
|
# All rights reserved.
|
|
3
3
|
#
|
|
4
4
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
@@ -12,21 +12,22 @@
|
|
|
12
12
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
13
|
# See the License for the specific language governing permissions and
|
|
14
14
|
# limitations under the License.
|
|
15
|
-
|
|
15
|
+
import itertools
|
|
16
16
|
import os
|
|
17
|
-
import sys
|
|
18
17
|
import statistics as st
|
|
18
|
+
import sys
|
|
19
19
|
from abc import ABC
|
|
20
|
+
from collections import defaultdict
|
|
20
21
|
from dataclasses import dataclass, field
|
|
21
22
|
from typing import List
|
|
22
|
-
from collections import defaultdict
|
|
23
23
|
|
|
24
24
|
import pandas as pd
|
|
25
|
+
import torch
|
|
25
26
|
from torch.utils.tensorboard import SummaryWriter
|
|
26
27
|
|
|
27
|
-
from msprobe.core.common.log import logger
|
|
28
|
-
from msprobe.core.common.file_utils import change_mode, create_directory, write_df_to_csv
|
|
29
28
|
from msprobe.core.common.const import FileCheckConst, MonitorConst
|
|
29
|
+
from msprobe.core.common.file_utils import change_mode, create_directory, write_df_to_csv
|
|
30
|
+
from msprobe.pytorch.common.log import logger
|
|
30
31
|
|
|
31
32
|
|
|
32
33
|
class ScanRule(ABC):
|
|
@@ -134,9 +135,9 @@ class AnomalyDataFactory(ABC):
|
|
|
134
135
|
raise ValueError("tag must be a tuple with length 2")
|
|
135
136
|
tag_name = tag[0]
|
|
136
137
|
param_name = tag_name.split('/')[0]
|
|
137
|
-
call_id = self.name2callid.get(
|
|
138
|
-
if MonitorConst.
|
|
139
|
-
vpp_stage = int(param_name.split(MonitorConst.
|
|
138
|
+
call_id = self.name2callid.get(tag_name, -1)
|
|
139
|
+
if MonitorConst.NAME_SEP in param_name:
|
|
140
|
+
vpp_stage = int(param_name.split(MonitorConst.NAME_SEP)[0])
|
|
140
141
|
else:
|
|
141
142
|
vpp_stage = 0
|
|
142
143
|
|
|
@@ -153,6 +154,24 @@ class AnomalyDataFactory(ABC):
|
|
|
153
154
|
)
|
|
154
155
|
|
|
155
156
|
|
|
157
|
+
class TrainStage:
|
|
158
|
+
DEFAULT_STAGE = -1
|
|
159
|
+
FORWARD_STAGE = 0
|
|
160
|
+
BACKWARD_STAGE = 1
|
|
161
|
+
OPTIMIZER_STAGE = 2
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
FORWARD_KEY = [MonitorConst.ACTV]
|
|
165
|
+
BACKWARD_KEY = [MonitorConst.ACTVGRAD, MonitorConst.PRE_GRAD,
|
|
166
|
+
MonitorConst.POST_GRAD, MonitorConst.ACC_GRAD]
|
|
167
|
+
OPTIMIZER_KEY = [MonitorConst.EXP_AVG, MonitorConst.EXP_AVG_SQ]
|
|
168
|
+
TRAIN_STAGE = {
|
|
169
|
+
**{key_: TrainStage.FORWARD_STAGE for key_ in FORWARD_KEY},
|
|
170
|
+
**{key_: TrainStage.BACKWARD_STAGE for key_ in BACKWARD_KEY},
|
|
171
|
+
**{key_: TrainStage.OPTIMIZER_STAGE for key_ in OPTIMIZER_KEY}
|
|
172
|
+
}
|
|
173
|
+
|
|
174
|
+
|
|
156
175
|
@dataclass(eq=True)
|
|
157
176
|
class GradAnomalyData:
|
|
158
177
|
rank: int = 0
|
|
@@ -166,25 +185,48 @@ class GradAnomalyData:
|
|
|
166
185
|
group_mates: list = field(default=None, compare=False)
|
|
167
186
|
|
|
168
187
|
def __lt__(self, other):
|
|
188
|
+
"""
|
|
189
|
+
自定义比较函数,用于确定 GradAnomalyData 实例之间的顺序。
|
|
190
|
+
比较规则为:
|
|
191
|
+
step 和 micro_step 值越小优先级越高;
|
|
192
|
+
vpp 和 pp 在前向阶段值越小优先级越高,在非前向阶段值越大优先级越高;
|
|
193
|
+
call_id 值越小优先级越高。
|
|
194
|
+
"""
|
|
169
195
|
if not isinstance(other, GradAnomalyData):
|
|
170
196
|
return NotImplemented
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
197
|
+
|
|
198
|
+
self_train_stage = self.get_train_stage(self.tag_name)
|
|
199
|
+
other_train_stage = self.get_train_stage(other.tag_name)
|
|
200
|
+
|
|
201
|
+
def vpp_pp_comparator(anomaly):
|
|
202
|
+
"""
|
|
203
|
+
Determine the priority rule for vpp and pp based on train stage
|
|
204
|
+
Forward stage prefers smaller vpp and pp
|
|
205
|
+
Other stages prefer larger vpp and pp
|
|
206
|
+
"""
|
|
207
|
+
if self_train_stage == TrainStage.FORWARD_STAGE:
|
|
208
|
+
return anomaly.vpp_stage, anomaly.pp_stage
|
|
209
|
+
else:
|
|
210
|
+
return -anomaly.vpp_stage, -anomaly.pp_stage
|
|
211
|
+
|
|
212
|
+
self_cmp = [self.step, self.micro_step, self_train_stage, *vpp_pp_comparator(self), self.call_id]
|
|
213
|
+
other_cmp = [other.step, other.micro_step, other_train_stage, *vpp_pp_comparator(other), other.call_id]
|
|
214
|
+
return self_cmp < other_cmp
|
|
182
215
|
|
|
183
216
|
def __le__(self, other):
|
|
184
217
|
if not isinstance(other, GradAnomalyData):
|
|
185
218
|
return NotImplemented
|
|
186
219
|
return self == other or self < other
|
|
187
220
|
|
|
221
|
+
@staticmethod
|
|
222
|
+
def get_train_stage(tag_name):
|
|
223
|
+
"""
|
|
224
|
+
:param tag_name: "0:fc2.input:0/rank0/actv", "0:fc1.weight/rank0/post_grad", "0:fc2.weight/rank0/exp_avg_sq"
|
|
225
|
+
:return: int, if forward return 0; if backward return 1; if optimizer return 2
|
|
226
|
+
"""
|
|
227
|
+
key_ = tag_name.split("/")[-1]
|
|
228
|
+
return TRAIN_STAGE.get(key_, TrainStage.DEFAULT_STAGE)
|
|
229
|
+
|
|
188
230
|
def to_dict(self):
|
|
189
231
|
return self.__dict__
|
|
190
232
|
|
|
@@ -198,7 +240,6 @@ class WriterInput:
|
|
|
198
240
|
path: str
|
|
199
241
|
ad_rules: list
|
|
200
242
|
job_id: str
|
|
201
|
-
anomaly_inform: bool = False
|
|
202
243
|
anomaly_factory: AnomalyDataFactory = None
|
|
203
244
|
ndigits: int = 6
|
|
204
245
|
step_count_per_record: int = 1
|
|
@@ -209,7 +250,6 @@ class BaseWriterWithAD:
|
|
|
209
250
|
self.tag2scalars = {}
|
|
210
251
|
self.ad_rules = writer_input.ad_rules
|
|
211
252
|
self.job_id = writer_input.job_id
|
|
212
|
-
self.anomaly_inform = writer_input.anomaly_inform
|
|
213
253
|
self.anomaly_factory = writer_input.anomaly_factory
|
|
214
254
|
self.anomalies = []
|
|
215
255
|
self.ndigits = writer_input.ndigits
|
|
@@ -242,6 +282,27 @@ class BaseWriterWithAD:
|
|
|
242
282
|
if self.anomaly_factory:
|
|
243
283
|
self.anomalies.append(self.anomaly_factory.create(tag, exception_message, global_step))
|
|
244
284
|
|
|
285
|
+
def write_metrics(self, ops, metric_value, step, prefix=''):
|
|
286
|
+
if not metric_value:
|
|
287
|
+
return
|
|
288
|
+
tensors = []
|
|
289
|
+
tags = list(itertools.product(metric_value.keys(), ops))
|
|
290
|
+
for op2tensor in metric_value.values():
|
|
291
|
+
tensors.extend(op2tensor.values())
|
|
292
|
+
if not tensors:
|
|
293
|
+
return
|
|
294
|
+
|
|
295
|
+
n_slices = len(tensors) // MonitorConst.SLICE_SIZE
|
|
296
|
+
with torch.no_grad():
|
|
297
|
+
for i in range(n_slices + 1):
|
|
298
|
+
begin = i * MonitorConst.SLICE_SIZE
|
|
299
|
+
end = (i+1) * MonitorConst.SLICE_SIZE
|
|
300
|
+
if begin == len(tensors):
|
|
301
|
+
continue
|
|
302
|
+
metric_list = torch.stack(tensors[begin:end]).cpu()
|
|
303
|
+
for tag, metric in zip(tags[begin:end], metric_list):
|
|
304
|
+
self.add_scalar(tag, metric, step)
|
|
305
|
+
|
|
245
306
|
def _ad(self, scalar_value, history):
|
|
246
307
|
return AnomalyScanner.scan(self.ad_rules, history, cur=scalar_value)
|
|
247
308
|
|
|
@@ -291,7 +352,7 @@ class CSVWriterWithAD(BaseWriterWithAD):
|
|
|
291
352
|
"""
|
|
292
353
|
if len(self.context_dict) == 0:
|
|
293
354
|
return
|
|
294
|
-
|
|
355
|
+
|
|
295
356
|
ster_start, step_end = self.get_step_interval(step)
|
|
296
357
|
filepath = os.path.join(self.log_dir, f'{prefix}_{ster_start}-{step_end}.csv')
|
|
297
358
|
if not os.path.exists(filepath):
|
|
@@ -300,11 +361,11 @@ class CSVWriterWithAD(BaseWriterWithAD):
|
|
|
300
361
|
|
|
301
362
|
new_data = []
|
|
302
363
|
for name, metric_value in self.context_dict.items():
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
new_data = pd.DataFrame(new_data).round(self.ndigits)
|
|
364
|
+
new_line = name.split(MonitorConst.NAME_SEP) + metric_value
|
|
365
|
+
new_line.insert(2, step)
|
|
366
|
+
new_data.append(new_line)
|
|
367
|
+
|
|
368
|
+
new_data = pd.DataFrame(new_data).round(self.ndigits).fillna("nan")
|
|
308
369
|
write_df_to_csv(new_data, filepath, mode='a+', header=False)
|
|
309
370
|
self.context_dict = defaultdict(list)
|
|
310
371
|
|
|
@@ -317,6 +378,15 @@ class CSVWriterWithAD(BaseWriterWithAD):
|
|
|
317
378
|
name = tag[0].split('/')[0]
|
|
318
379
|
self.context_dict[name].append(scalar_value.item())
|
|
319
380
|
|
|
381
|
+
def write_metrics(self, ops, metric_value, step, prefix=''):
    """Write metrics, then flush the accumulated context to a CSV file.

    Args:
        ops: list of op/statistic names appended to the CSV header.
        metric_value: nested dict of {metric_name: {op: tensor}}.
        step: current training step.
        prefix: metric category; selects the header layout and the CSV
            file name stem.
    """
    # Fix: forward the caller's prefix instead of hard-coding prefix=''.
    super().write_metrics(ops, metric_value, step, prefix)

    # Activation metrics carry extra name columns (micro-step etc.).
    if prefix in [MonitorConst.ACTV, MonitorConst.ACTVGRAD]:
        self.header = MonitorConst.CSV_HEADER_XY + ops
    else:
        self.header = MonitorConst.CSV_HEADER + ops
    self.write_csv(prefix, step)
|
|
389
|
+
|
|
320
390
|
def close(self):
|
|
321
391
|
pass
|
|
322
392
|
|
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
# Copyright (c) 2024-2025, Huawei Technologies Co., Ltd.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
import datetime
|
|
16
|
+
import os
|
|
17
|
+
import re
|
|
18
|
+
from multiprocessing import Process
|
|
19
|
+
|
|
20
|
+
import pytz
|
|
21
|
+
from torch.utils.tensorboard import SummaryWriter
|
|
22
|
+
from tqdm import tqdm
|
|
23
|
+
|
|
24
|
+
from msprobe.core.common.const import MonitorConst
|
|
25
|
+
from msprobe.core.common.file_utils import read_csv, create_directory, remove_path
|
|
26
|
+
from msprobe.core.common.utils import is_int
|
|
27
|
+
from msprobe.pytorch.common.log import logger
|
|
28
|
+
from msprobe.pytorch.monitor.utils import get_target_output_dir
|
|
29
|
+
|
|
30
|
+
all_data_type_list = ["actv", "actv_grad", "exp_avg", "exp_avg_sq", "grad_unreduced", "grad_reduced", "param"]
|
|
31
|
+
CSV_FILE_SUFFIX = r"_\d+-\d+\.csv"
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def parse_step_line(line, ops):
    """Extract the tag name, step and per-op values from one CSV row.

    Returns:
        tuple: (vpp_name, step, {op: value}) where vpp_name encodes the
        vpp stage, module name and (when present) micro step.
    """
    vp_id = line["vpp_stage"]
    step = line["step"]
    module_name = line[MonitorConst.HEADER_NAME]
    vpp_name = f"vp{vp_id}:{module_name}"
    # Micro-step granularity is optional; append it only when the column exists.
    if 'micro_step' in line:
        vpp_name = f'{vpp_name}{MonitorConst.NAME_SEP}micro{line["micro_step"]}'
    ops_result = {op: line[op] for op in ops}
    return vpp_name, step, ops_result
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def parse_step_fn(filepath):
    """Parse one monitor CSV file into {vpp_name: {step: {op: value}}}.

    Raises:
        Exception: if the same (vpp_name, step) pair occurs twice in the file.
    """
    frame = read_csv(filepath)
    # Only columns that are known op/statistic names are collected.
    ops = [column for column in frame.keys() if column in MonitorConst.OP_LIST]

    result = {}
    for _, row in frame.iterrows():
        vpp_name, step, ops_result = parse_step_line(row, ops)
        per_name = result.setdefault(vpp_name, {})
        if step in per_name:
            raise Exception(f"duplicated step({step})")
        per_name[step] = ops_result
    return result
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def write_step(output_dirpath, parse_step_result, rank, data_type):
    """Write parsed step results into a TensorBoard event directory.

    Any pre-existing ``rank{rank}/{data_type}`` directory is removed first,
    then one scalar is emitted per (tag, step) pair with steps ascending.

    Args:
        output_dirpath: root output directory.
        parse_step_result: {vpp_name: {step: {op: value}}} as produced by
            ``parse_step_fn``.
        rank: rank id used in the output sub-path.
        data_type: metric category used in the output sub-path.
    """
    tb_output_path = os.path.join(output_dirpath, f"rank{rank}", data_type)
    if os.path.exists(tb_output_path):
        remove_path(tb_output_path)
        logger.warning(f"existing path {tb_output_path} will be recovered")
    writer = SummaryWriter(tb_output_path)
    try:
        for vpp_name, step_data_dict in parse_step_result.items():
            # Ascending step order so TensorBoard renders a clean curve.
            for step, ops in sorted(step_data_dict.items(), key=lambda item: item[0]):
                for op, value in ops.items():
                    writer.add_scalar(f"{vpp_name}/{op}", value, step)
    finally:
        # Fix: the original never closed the writer, so buffered events
        # could be lost when the worker process exits.
        writer.close()
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def update_dict(dict1, dict2):
    """Recursively merge *dict2* into *dict1* in place and return *dict1*.

    Nested dicts are merged key by key; a key present in both sides whose
    values are not both dicts is an error.

    Raises:
        Exception: on a duplicate leaf key, or (wrapped) when a nested
        merge fails.
    """
    for key, incoming in dict2.items():
        if key not in dict1:
            dict1[key] = incoming
            continue
        existing = dict1[key]
        if not (isinstance(existing, dict) and isinstance(incoming, dict)):
            raise Exception(f"duplicate key: {key}")
        try:
            update_dict(existing, incoming)
        except Exception as e:
            # Wrap so the failing path is visible at every nesting level.
            raise Exception(f"Error updating nested dict failed at key '{key}': {e}") from e
    return dict1
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def csv2tb_by_step_work(target_output_dirs, output_dirpath, data_type_list):
    """Worker: convert each rank directory's monitor CSVs to TensorBoard events.

    For every (rank dir, data type) pair, all matching CSV files are parsed,
    merged, and written out. A parse failure aborts the current data type for
    the current rank (remaining files are skipped) but processing continues.
    """
    for entry in tqdm(target_output_dirs):
        rank = entry["rank"]
        dirpath = entry["path"]
        for data_type in data_type_list:
            merged = {}
            pattern = f"{data_type}{CSV_FILE_SUFFIX}"
            for filename in os.listdir(dirpath):
                if not re.match(pattern, filename):
                    continue
                filepath = os.path.join(dirpath, filename)
                try:
                    parsed = parse_step_fn(filepath)
                except Exception as e:
                    logger.error(f"csv2tensorboard parse {filepath} failed \n {e}")
                    break
                merged = update_dict(merged, parsed)
            if merged:
                write_step(output_dirpath, merged, rank, data_type)
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def check_process_num(process_num):
    """Validate that *process_num* is a positive integer.

    Raises:
        ValueError: when the value is not an int or is not positive.
    """
    if not (is_int(process_num) and process_num > 0):
        raise ValueError(f"process_num({process_num}) is not a positive integer")
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def check_data_type_list(data_type_list):
    """Validate *data_type_list*; ``None`` means "use every supported type".

    Raises:
        ValueError: if the value is not a list, or contains an unsupported
        data type.
    """
    if data_type_list is None:
        # Fix: corrected the "defualt" typo in the log message.
        logger.info(f"data_type_list is None, use default all_data_type_list: {all_data_type_list}")
        return
    if not isinstance(data_type_list, list):
        raise ValueError(f"data_type_list({data_type_list}) is not a list")
    for data_type in data_type_list:
        if data_type not in all_data_type_list:
            raise ValueError(f"data type({data_type}) is not supported, supported data type: {all_data_type_list}")
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def csv2tensorboard_by_step(
        monitor_path,
        time_start=None,
        time_end=None,
        process_num=1,
        data_type_list=None,
        output_dirpath=None
):
    """Convert monitor CSV output to TensorBoard event files, in parallel.

    Args:
        monitor_path: root directory holding per-rank monitor output.
        time_start: optional lower bound used to select output dirs.
        time_end: optional upper bound used to select output dirs.
        process_num: number of worker processes (positive integer).
        data_type_list: subset of ``all_data_type_list``, or None for all.
        output_dirpath: destination directory; defaults to a timestamped
            sub-directory of *monitor_path*.
    """
    check_process_num(process_num)
    check_data_type_list(data_type_list)
    target_output_dirs = get_target_output_dir(monitor_path, time_start, time_end)
    target_output_dirs = [{"rank": rank, "path": path} for rank, path in target_output_dirs.items()]
    if output_dirpath is None:
        local_tz = pytz.timezone("Asia/Shanghai")  # adjust to the desired target timezone
        cur_time = datetime.datetime.now(local_tz).strftime("%b%d_%H-%M-%S")
        output_dirpath = os.path.join(monitor_path, f"{cur_time}-csv2tensorboard_by_step")
    create_directory(output_dirpath)

    task_num = len(target_output_dirs)
    target_data_type = data_type_list if data_type_list else all_data_type_list

    # Fix: balanced task partitioning. The old `task_num // process_num`
    # scheme dumped the whole remainder on the last process — and when
    # process_num > task_num, every process but the last got an empty slice,
    # serializing all the work. divmod spreads the remainder one task each
    # across the first `extra` processes.
    base, extra = divmod(task_num, process_num)

    processes = []
    task_start_id = 0
    for pro_id in range(process_num):
        chunk = base + (1 if pro_id < extra else 0)
        task_dirs = target_output_dirs[task_start_id: task_start_id + chunk]
        task_start_id += chunk

        p = Process(target=csv2tb_by_step_work, args=(task_dirs, output_dirpath, target_data_type))
        processes.append(p)
        p.start()
    for p in processes:
        p.join()
    logger.info(f"output has been saved to: {output_dirpath}")
|