PyPI - mindstudio-probe - Versions diffs - 1.1.1__py3-none-any.whl → 1.2.2__py3-none-any.whl - Mend

mindstudio-probe: mindstudio_probe-1.1.1-py3-none-any.whl → mindstudio_probe-1.2.2-py3-none-any.whl

This diff shows the differences between the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (226)

{mindstudio_probe-1.1.1.dist-info → mindstudio_probe-1.2.2.dist-info}/METADATA +3 -2
mindstudio_probe-1.2.2.dist-info/RECORD +415 -0
msprobe/CMakeLists.txt +5 -0
msprobe/README.md +16 -21
msprobe/config.json +1 -0
msprobe/core/common/const.py +185 -11
msprobe/core/common/exceptions.py +3 -1
msprobe/core/common/file_utils.py +33 -7
msprobe/core/common/inplace_ops.yaml +4 -0
msprobe/core/common/utils.py +42 -14
msprobe/core/common_config.py +6 -0
msprobe/core/compare/acc_compare.py +139 -128
msprobe/core/compare/check.py +31 -29
msprobe/core/compare/compare_cli.py +17 -16
msprobe/core/compare/highlight.py +186 -99
msprobe/core/compare/layer_mapping/data_scope_parser.py +19 -8
msprobe/core/compare/layer_mapping/layer_mapping.py +21 -14
msprobe/core/compare/layer_mapping/postprocess_pass.py +4 -3
msprobe/core/compare/merge_result/merge_result.py +381 -0
msprobe/core/compare/merge_result/merge_result_cli.py +31 -0
msprobe/core/compare/merge_result/utils.py +81 -0
msprobe/core/compare/multiprocessing_compute.py +2 -2
msprobe/core/compare/npy_compare.py +109 -147
msprobe/core/compare/utils.py +199 -69
msprobe/core/data_dump/data_collector.py +100 -25
msprobe/core/data_dump/data_processor/base.py +130 -28
msprobe/core/data_dump/data_processor/factory.py +8 -3
msprobe/core/data_dump/data_processor/mindspore_processor.py +170 -23
msprobe/core/data_dump/data_processor/pytorch_processor.py +175 -64
msprobe/core/data_dump/json_writer.py +54 -8
msprobe/core/data_dump/scope.py +19 -18
msprobe/core/overflow_check/abnormal_scene.py +9 -5
msprobe/core/overflow_check/checker.py +1 -1
msprobe/core/overflow_check/utils.py +1 -1
msprobe/docs/01.installation.md +121 -17
msprobe/docs/02.config_introduction.md +18 -16
msprobe/docs/03.config_examples.md +24 -0
msprobe/docs/05.data_dump_PyTorch.md +107 -58
msprobe/docs/06.data_dump_MindSpore.md +95 -34
msprobe/docs/07.accuracy_checker_PyTorch.md +18 -18
msprobe/docs/09.accuracy_checker_MindSpore.md +8 -6
msprobe/docs/10.accuracy_compare_PyTorch.md +99 -41
msprobe/docs/11.accuracy_compare_MindSpore.md +249 -48
msprobe/docs/12.overflow_check_PyTorch.md +1 -1
msprobe/docs/19.monitor.md +310 -220
msprobe/docs/21.visualization_PyTorch.md +125 -35
msprobe/docs/22.visualization_MindSpore.md +149 -41
msprobe/docs/23.generate_operator_PyTorch.md +107 -0
msprobe/docs/24.code_mapping_Mindspore.md +28 -0
msprobe/docs/{23.tool_function_introduction.md → 25.tool_function_introduction.md} +1 -0
msprobe/docs/26.data_dump_PyTorch_baseline.md +37 -0
msprobe/docs/27.dump_json_instruction.md +525 -0
msprobe/docs/28.debugger_save_instruction.md +94 -0
msprobe/docs/28.kernel_dump_MindSpore.md +69 -0
msprobe/docs/FAQ.md +26 -2
msprobe/docs/accuracy_checker_MindSpore/accuracy_checker_MindSpore_baseline.md +14 -0
msprobe/docs/data_dump_MindSpore/data_dump_MindSpore_baseline.md +22 -0
msprobe/docs/img/merge_result.png +0 -0
msprobe/docs/img/monitor/step_count_per_record.png +0 -0
msprobe/docs/img/visualization/fuzzy_match_ms.png +0 -0
msprobe/docs/img/visualization/fuzzy_match_pt.png +0 -0
msprobe/docs/img/visualization/tensorboard_1.png +0 -0
msprobe/docs/img/visualization/tensorboard_2.png +0 -0
msprobe/docs/img/visualization/vis_browser_1.png +0 -0
msprobe/docs/img/visualization/vis_browser_2.png +0 -0
msprobe/docs/img/visualization/vis_precision_info.png +0 -0
msprobe/docs/img/visualization/vis_search_info.png +0 -0
msprobe/docs/img/visualization/vis_show_info.png +0 -0
msprobe/docs/img/visualization/vis_showcase.png +0 -0
msprobe/docs/img/visualization/vis_unmatch_info.png +0 -0
msprobe/docs/visualization/GPTModel.png +0 -0
msprobe/docs/visualization/ParallelMLP.png +0 -0
msprobe/docs/visualization/layer_mapping_example.md +132 -0
msprobe/docs/visualization/mapping.png +0 -0
msprobe/docs/visualization/mapping1.png +0 -0
msprobe/docs/visualization/module_name.png +0 -0
msprobe/docs/visualization/module_name1.png +0 -0
msprobe/docs/visualization/no_mapping.png +0 -0
msprobe/docs/visualization/no_mapping1.png +0 -0
msprobe/docs/visualization/no_mapping_analyze.png +0 -0
msprobe/docs/visualization/top_layer.png +0 -0
msprobe/mindspore/__init__.py +11 -0
msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +80 -28
msprobe/mindspore/api_accuracy_checker/api_runner.py +54 -16
msprobe/mindspore/api_accuracy_checker/cmd_parser.py +2 -1
msprobe/mindspore/api_accuracy_checker/compute_element.py +52 -8
msprobe/mindspore/api_accuracy_checker/data_manager.py +37 -0
msprobe/mindspore/api_accuracy_checker/main.py +1 -0
msprobe/mindspore/api_accuracy_checker/multi_api_accuracy_checker.py +12 -6
msprobe/mindspore/api_accuracy_checker/multi_data_manager.py +3 -1
msprobe/mindspore/api_accuracy_checker/torch_mindtorch_importer.py +129 -0
msprobe/mindspore/api_accuracy_checker/type_mapping.py +24 -1
msprobe/mindspore/api_accuracy_checker/utils.py +6 -1
msprobe/mindspore/code_mapping/bind.py +264 -0
msprobe/mindspore/code_mapping/cmd_parser.py +40 -0
msprobe/mindspore/code_mapping/graph.py +49 -0
msprobe/mindspore/code_mapping/graph_parser.py +226 -0
msprobe/mindspore/code_mapping/main.py +24 -0
msprobe/mindspore/code_mapping/processor.py +34 -0
msprobe/mindspore/common/const.py +3 -1
msprobe/mindspore/common/utils.py +68 -5
msprobe/mindspore/compare/distributed_compare.py +0 -2
msprobe/mindspore/compare/ms_compare.py +105 -63
msprobe/mindspore/compare/ms_graph_compare.py +14 -5
msprobe/mindspore/debugger/debugger_config.py +28 -2
msprobe/mindspore/debugger/precision_debugger.py +100 -12
msprobe/mindspore/dump/hook_cell/api_registry.py +85 -16
msprobe/mindspore/dump/hook_cell/hook_cell.py +60 -38
msprobe/mindspore/dump/hook_cell/primitive_hooks.py +33 -15
msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml +11 -1
msprobe/mindspore/dump/hook_cell/wrap_api.py +92 -1
msprobe/mindspore/dump/jit_dump.py +7 -6
msprobe/mindspore/dump/kernel_dump/kernel_config.py +33 -0
msprobe/mindspore/dump/kernel_graph_dump.py +7 -0
msprobe/mindspore/free_benchmark/api_pynative_self_check.py +13 -4
msprobe/mindspore/free_benchmark/perturbation/bit_noise.py +2 -2
msprobe/mindspore/grad_probe/grad_analyzer.py +24 -12
msprobe/mindspore/grad_probe/hook.py +13 -4
msprobe/mindspore/mindtorch/__init__.py +18 -0
msprobe/mindspore/mindtorch/mindtorch_adaptor.py +255 -0
msprobe/mindspore/monitor/anomaly_detect.py +404 -0
msprobe/mindspore/monitor/distributed/__init__.py +0 -0
msprobe/mindspore/monitor/distributed/distributed_ops.yaml +15 -0
msprobe/mindspore/monitor/distributed/stack_blacklist.yaml +5 -0
msprobe/mindspore/monitor/distributed/wrap_distributed.py +300 -0
msprobe/mindspore/monitor/features.py +63 -0
msprobe/mindspore/monitor/module_hook.py +821 -0
msprobe/mindspore/monitor/module_spec_verifier.py +94 -0
msprobe/mindspore/monitor/utils.py +267 -0
msprobe/mindspore/ms_config.py +13 -3
msprobe/mindspore/overflow_check/kernel_graph_overflow_check.py +7 -0
msprobe/mindspore/service.py +347 -107
msprobe/msprobe.py +24 -3
msprobe/pytorch/__init__.py +7 -7
msprobe/pytorch/api_accuracy_checker/common/utils.py +31 -16
msprobe/pytorch/api_accuracy_checker/compare/algorithm.py +41 -8
msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +100 -267
msprobe/pytorch/api_accuracy_checker/compare/api_precision_standard.yaml +4 -1
msprobe/pytorch/api_accuracy_checker/compare/compare.py +69 -68
msprobe/pytorch/api_accuracy_checker/compare/compare_column.py +54 -0
msprobe/pytorch/api_accuracy_checker/compare/compare_input.py +51 -0
msprobe/pytorch/api_accuracy_checker/compare/compare_utils.py +2 -4
msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +55 -31
msprobe/pytorch/api_accuracy_checker/precision_standard/absolute_threshold.py +106 -0
msprobe/pytorch/api_accuracy_checker/precision_standard/accumulative_error_compare.py +107 -0
msprobe/pytorch/api_accuracy_checker/precision_standard/base_standard.py +151 -0
msprobe/pytorch/api_accuracy_checker/precision_standard/benchmark_compare.py +226 -0
msprobe/pytorch/api_accuracy_checker/precision_standard/binary_consistency.py +68 -0
msprobe/pytorch/api_accuracy_checker/precision_standard/standard_config.py +218 -0
msprobe/pytorch/api_accuracy_checker/precision_standard/standard_register.py +104 -0
msprobe/pytorch/api_accuracy_checker/precision_standard/thousandth_standard.py +63 -0
msprobe/pytorch/api_accuracy_checker/precision_standard/ulp_compare.py +200 -0
msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py +57 -1
msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +2 -1
msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +42 -14
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +64 -19
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py +34 -4
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +5 -3
msprobe/pytorch/bench_functions/apply_adam.py +215 -0
msprobe/pytorch/bench_functions/group_norm_silu.py +27 -0
msprobe/pytorch/bench_functions/mish.py +21 -0
msprobe/pytorch/bench_functions/moe_gating_top_k_softmax.py +44 -0
msprobe/pytorch/bench_functions/npu_fusion_attention.py +42 -10
msprobe/pytorch/bench_functions/sort_v2.py +21 -0
msprobe/pytorch/common/parse_json.py +2 -1
msprobe/pytorch/common/utils.py +116 -2
msprobe/pytorch/compare/distributed_compare.py +17 -29
msprobe/pytorch/compare/pt_compare.py +40 -20
msprobe/pytorch/debugger/debugger_config.py +42 -17
msprobe/pytorch/debugger/precision_debugger.py +56 -12
msprobe/pytorch/dump/module_dump/__init__.py +0 -0
msprobe/pytorch/dump/module_dump/module_dump.py +86 -0
msprobe/pytorch/dump/module_dump/module_processer.py +204 -0
msprobe/pytorch/free_benchmark/common/params.py +2 -1
msprobe/pytorch/free_benchmark/common/utils.py +3 -0
msprobe/pytorch/free_benchmark/compare/grad_saver.py +0 -2
msprobe/pytorch/free_benchmark/result_handlers/base_handler.py +31 -47
msprobe/pytorch/free_benchmark/result_handlers/preheat_handler.py +0 -4
msprobe/pytorch/function_factory.py +7 -1
msprobe/pytorch/hook_module/__init__.py +1 -1
msprobe/pytorch/hook_module/hook_module.py +14 -11
msprobe/pytorch/hook_module/register_optimizer_hook.py +59 -0
msprobe/pytorch/hook_module/support_wrap_ops.yaml +36 -1
msprobe/pytorch/hook_module/wrap_distributed.py +10 -8
msprobe/pytorch/hook_module/wrap_functional.py +0 -40
msprobe/pytorch/monitor/anomaly_analyse.py +1 -1
msprobe/pytorch/monitor/anomaly_detect.py +98 -28
msprobe/pytorch/monitor/csv2tb.py +164 -0
msprobe/pytorch/monitor/distributed/wrap_distributed.py +25 -14
msprobe/pytorch/monitor/features.py +3 -3
msprobe/pytorch/monitor/module_hook.py +543 -318
msprobe/pytorch/monitor/module_metric.py +27 -48
msprobe/pytorch/monitor/module_spec_verifier.py +3 -1
msprobe/pytorch/monitor/optimizer_collect.py +76 -56
msprobe/pytorch/monitor/unittest/test_monitor.py +24 -9
msprobe/pytorch/monitor/utils.py +84 -48
msprobe/pytorch/online_dispatch/dispatch.py +8 -2
msprobe/pytorch/parse_tool/lib/compare.py +10 -10
msprobe/pytorch/parse_tool/lib/config.py +5 -7
msprobe/pytorch/parse_tool/lib/file_desc.py +15 -1
msprobe/pytorch/parse_tool/lib/interactive_cli.py +10 -10
msprobe/pytorch/parse_tool/lib/parse_exception.py +7 -7
msprobe/pytorch/parse_tool/lib/parse_tool.py +11 -10
msprobe/pytorch/parse_tool/lib/utils.py +18 -19
msprobe/pytorch/parse_tool/lib/visualization.py +9 -10
msprobe/pytorch/pt_config.py +19 -22
msprobe/pytorch/service.py +264 -115
msprobe/visualization/builder/graph_builder.py +93 -10
msprobe/visualization/builder/msprobe_adapter.py +30 -6
msprobe/visualization/compare/graph_comparator.py +64 -14
msprobe/visualization/compare/mode_adapter.py +1 -15
msprobe/visualization/graph/base_node.py +15 -19
msprobe/visualization/graph/distributed_analyzer.py +395 -0
msprobe/visualization/graph/graph.py +9 -0
msprobe/visualization/graph/node_op.py +4 -2
msprobe/visualization/graph_service.py +100 -27
msprobe/visualization/utils.py +24 -31
mindstudio_probe-1.1.1.dist-info/RECORD +0 -341
msprobe/pytorch/functional/module_dump.py +0 -84
msprobe/pytorch/module_processer.py +0 -150
{mindstudio_probe-1.1.1.dist-info → mindstudio_probe-1.2.2.dist-info}/LICENSE +0 -0
{mindstudio_probe-1.1.1.dist-info → mindstudio_probe-1.2.2.dist-info}/WHEEL +0 -0
{mindstudio_probe-1.1.1.dist-info → mindstudio_probe-1.2.2.dist-info}/entry_points.txt +0 -0
{mindstudio_probe-1.1.1.dist-info → mindstudio_probe-1.2.2.dist-info}/top_level.txt +0 -0
/msprobe/docs/{data_dump_Mindspore → data_dump_MindSpore}/dynamic_graph_quick_start_example.md +0 -0
/msprobe/{pytorch/functional → mindspore/code_mapping}/__init__.py +0 -0

msprobe/core/data_dump/data_processor/pytorch_processor.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
+# Copyright (c) 2024-2025, Huawei Technologies Co., Ltd.
 # All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0  (the "License");
@@ -21,6 +21,7 @@ from typing import List
 import numpy as np
 import torch
 from torch import distributed as dist
+from torch.distributed.distributed_c10d import _get_default_group
 from msprobe.core.common.const import Const
 from msprobe.core.common.file_utils import path_len_exceeds_limit
@@ -40,7 +41,16 @@ except ImportError:
 class PytorchDataProcessor(BaseDataProcessor):
-    pytorch_special_type = (torch.device, torch.dtype, torch.Size, torch.Tensor, torch.memory_format, dist.ProcessGroup)
+    pytorch_special_type = (
+        torch.device,
+        torch.dtype,
+        torch.Size,
+        torch.Tensor,
+        torch.memory_format,
+        dist.ProcessGroup,
+        dist.P2POp,
+        dist.ReduceOp
+    )
     memory_format = {
         torch.contiguous_format: "contiguous_format",
         torch.channels_last: "channels_last",
@@ -54,6 +64,7 @@ class PytorchDataProcessor(BaseDataProcessor):
             "device": self.analyze_device_in_kwargs,
             "dtype": self.analyze_dtype_in_kwargs
         }
+        self._async_dump_cache = {}
     @staticmethod
     def get_md5_for_tensor(x):
@@ -82,49 +93,80 @@ class PytorchDataProcessor(BaseDataProcessor):
         return {"type": "torch.dtype", "value": str(element)}
     @staticmethod
-    def get_stat_info(data):
+    def get_stat_info_async(data):
         tensor_stat = TensorStatInfo()
-        if data.is_meta:
-            return tensor_stat
-        data_clone = data.detach()
-        if data_clone.numel() == 0:
+        if torch.is_complex(data):
+            logger.warning("Async dump do not support complex data!")
             return tensor_stat
-        elif data_clone.dtype == torch.bool:
-            tensor_stat.max = torch._C._VariableFunctionsClass.any(data_clone).item()
-            tensor_stat.min = torch._C._VariableFunctionsClass.all(data_clone).item()
-        elif not data_clone.shape:
-            tensor_stat.max = tensor_stat.min = tensor_stat.mean = tensor_stat.norm = data_clone.item()
-        elif torch.is_complex(data_clone):
-            data_np = data_clone.cpu().numpy()
+        elif data.dtype == torch.bool:
+            tensor_stat.stack_tensor_stat = (["Max", "Min"], torch.stack(
+                [torch.any(data), torch.all(data)]))
+        elif not data.shape:
+            tensor_stat.stack_tensor_stat = (["Max", "Min", "Mean", "Norm"], torch.stack([data, data, data, data]))
+        else:
+            if not data.is_floating_point() or data.dtype == torch.float64:
+                data = data.float()
+            tensor_stat.stack_tensor_stat = (["Max", "Min", "Mean", "Norm"], torch.stack([
+                torch.max(data),
+                torch.min(data),
+                torch.mean(data),
+                torch.norm(data)
+            ]))
+        return tensor_stat
+    @staticmethod
+    def get_stat_info_sync(data):
+        tensor_stat = TensorStatInfo()
+        if torch.is_complex(data):
+            data_np = data.cpu().numpy()
             data_abs = np.abs(data_np)
             tensor_stat.max = np.max(data_abs).item()
             tensor_stat.min = np.min(data_abs).item()
             tensor_stat.mean = np.mean(data_abs).item()
+        elif data.dtype == torch.bool:
+            tensor_stat.max = torch.any(data).item()
+            tensor_stat.min = torch.all(data).item()
+        elif not data.shape:
+            tensor_stat.max = tensor_stat.min = tensor_stat.mean = tensor_stat.norm = data.item()
         else:
-            if not data_clone.is_floating_point() or data_clone.dtype == torch.float64:
-                data_clone = data_clone.float()
-            tensor_stat.max = torch._C._VariableFunctionsClass.max(data_clone).item()
-            tensor_stat.min = torch._C._VariableFunctionsClass.min(data_clone).item()
-            tensor_stat.mean = torch._C._VariableFunctionsClass.mean(data_clone).item()
-            tensor_stat.norm = torch._C._VariableFunctionsClass.norm(data_clone).item()
+            if not data.is_floating_point() or data.dtype == torch.float64:
+                data = data.float()
+            tensor_stat.max = torch.max(data).item()
+            tensor_stat.min = torch.min(data).item()
+            tensor_stat.mean = torch.mean(data).item()
+            tensor_stat.norm = torch.norm(data).item()
         return tensor_stat
+    @staticmethod
+    def get_stat_info(data, async_dump=False):
+        tensor_stat = TensorStatInfo()
+        if data.is_meta:
+            return tensor_stat
+        data_clone = data.detach()
+        if data_clone.numel() == 0:
+            return tensor_stat
+        else:
+            if data_clone.device.type == Const.CPU_LOWERCASE or not async_dump:
+                return PytorchDataProcessor.get_stat_info_sync(data_clone)
+            else:
+                return PytorchDataProcessor.get_stat_info_async(data_clone)
     @staticmethod
     def handle_tensor_extremum_nan_inf(tensor, operator):
         data_clone = tensor.detach()
-        data_nan = torch._C._VariableFunctionsClass.isnan(data_clone)
-        if int(torch._C._VariableFunctionsClass.sum(data_nan)) == data_clone.numel():
+        data_nan = torch.isnan(data_clone)
+        if int(torch.sum(data_nan)) == data_clone.numel():
             return float('nan')
-        finite_mask = torch._C._VariableFunctionsClass.isfinite(data_clone)
-        if int(torch._C._VariableFunctionsClass.sum(finite_mask)) > 0:
-            finite_values = getattr(torch._C._TensorBase, "__getitem__")(data_clone, finite_mask)
-            return torch._C._VariableFunctionsClass.max(finite_values).item() if operator == 'max' else \
-                torch._C._VariableFunctionsClass.min(finite_values).item()
+        finite_mask = torch.isfinite(data_clone)
+        if int(torch.sum(finite_mask)) > 0:
+            finite_values = data_clone[finite_mask]
+            return torch.max(finite_values).item() if operator == 'max' else \
+                torch.min(finite_values).item()
         else:
-            data_no_nan = getattr(torch._C._TensorBase, "__getitem__")(data_clone, ~data_nan)
-            return torch._C._VariableFunctionsClass.max(data_no_nan).item() if operator == 'max' else \
-                torch._C._VariableFunctionsClass.min(data_no_nan).item()
+            data_no_nan = data_clone[~data_nan]
+            return torch.max(data_no_nan).item() if operator == 'max' else \
+                torch.min(data_no_nan).item()
     @staticmethod
     def process_group_hash(arg):
@@ -132,6 +174,15 @@ class PytorchDataProcessor(BaseDataProcessor):
         group_ranks_hash = hashlib.md5(str(group_ranks).encode('utf-8')).hexdigest()
         return group_ranks_hash
+    @staticmethod
+    def is_distributed_op(module):
+        return getattr(module, "op_is_distributed", False)
+    @staticmethod
+    def is_hookable_element(element):
+        return (hasattr(element, "register_hook") and callable(element.register_hook)) and \
+            (hasattr(element, "requires_grad") and element.requires_grad)
     @staticmethod
     def _analyze_torch_size(arg):
         return {"type": "torch.Size", "value": list(arg)}
@@ -140,7 +191,6 @@ class PytorchDataProcessor(BaseDataProcessor):
     def _analyze_memory_format(arg):
         # 获取内存格式
         format_type = PytorchDataProcessor.memory_format.get(arg)
         return {"type": "torch.memory_format", "format": format_type}
     @staticmethod
@@ -152,9 +202,18 @@ class PytorchDataProcessor(BaseDataProcessor):
             group_id = PytorchDataProcessor.process_group_hash(arg)
             group_info.update({"group_id": group_id})
         except Exception as e:
-            logger.warning(f"Failed to get process group(id: {group_id}) ranks info with error info: {e}.")
+            logger.warning(f"Failed to get process group ranks info with error info: {e}.")
         return group_info
+    @staticmethod
+    def _analyze_reduce_op(arg):
+        op_type = None
+        try:
+            op_type = str(arg)
+        except Exception as e:
+            logger.warning(f"Failed to get value of torch.distributed.ReduceOp with error info: {e}.")
+        return {"type": "torch.distributed.ReduceOp", "value": op_type}
     @classmethod
     def get_special_types(cls):
         return super().get_special_types() + cls.pytorch_special_type
@@ -168,35 +227,65 @@ class PytorchDataProcessor(BaseDataProcessor):
             return self._analyze_memory_format(element)
         if isinstance(element, dist.ProcessGroup):
             return self._analyze_process_group(element)
+        if isinstance(element, dist.P2POp):
+            return self._analyze_p2pop(element)
+        if isinstance(element, dist.ReduceOp):
+            return self._analyze_reduce_op(element)
         converted_numpy, numpy_type = self._convert_numpy_to_builtin(element)
         if converted_numpy is not element:
-            return self._analyze_numpy(converted_numpy, numpy_type)
+            return {"type": numpy_type, "value": converted_numpy}
         if isinstance(element, torch.Tensor):
-            return self._analyze_tensor(element, Const.SEP.join(suffix_stack))
+            return self._analyze_tensor(element, Const.SEP.join([str(suffix) for suffix in suffix_stack]))
+        if isinstance(element, np.ndarray):
+            return self._analyze_numpy(element, Const.SEP.join([str(suffix) for suffix in suffix_stack]))
         if isinstance(element, (bool, int, float, str, slice, type(Ellipsis))):
             return self._analyze_builtin(element)
         return {}
+    def analyze_forward_output(self, name, module, module_input_output: ModuleForwardInputsOutputs):
+        if self.is_distributed_op(module):
+            module_input_output.update_output_with_args_and_kwargs()
+        return super().analyze_forward_output(name, module, module_input_output)
+    def _analyze_p2pop(self, arg):
+        p2pop_info = {"class_type": "torch.distributed.P2POp"}
+        try:
+            tensor_info = self._analyze_tensor(arg.tensor, [])
+            p2pop_info.update({"tensor": tensor_info})
+            p2pop_info.update({"op": arg.op.__name__})
+            p2pop_info.update({"peer": arg.peer})
+            p2pop_info.update({"tag": arg.tag})
+            group_id = PytorchDataProcessor.process_group_hash(
+                arg.group) if arg.group else PytorchDataProcessor.process_group_hash(_get_default_group())
+            p2pop_info.update({"group_id": group_id})
+        except Exception as e:
+            logger.warning(f"Failed to parse the P2POp content with error info: {e}.")
+        return p2pop_info
     def _analyze_tensor(self, tensor, suffix):
-        tensor_stat = self.get_stat_info(tensor)
+        tensor_stat = self.get_stat_info(tensor, self.config.async_dump)
         tensor_json = {}
         tensor_json.update({'type': 'torch.Tensor'})
         tensor_json.update({'dtype': str(tensor.dtype)})
         tensor_json.update({"shape": tensor.shape})
-        tensor_json.update({"Max": tensor_stat.max})
-        tensor_json.update({"Min": tensor_stat.min})
-        tensor_json.update({"Mean": tensor_stat.mean})
-        tensor_json.update({"Norm": tensor_stat.norm})
-        tensor_json.update({"requires_grad": tensor.requires_grad})
-        if tensor_stat.max is not None:
-            if np.isinf(tensor_stat.max) or np.isnan(tensor_stat.max):
-                tensor_json['Max_except_inf_nan'] = self.handle_tensor_extremum_nan_inf(tensor, "max")
-        if tensor_stat.min is not None:
-            if np.isinf(tensor_stat.min) or np.isnan(tensor_stat.min):
-                tensor_json['Min_except_inf_nan'] = self.handle_tensor_extremum_nan_inf(tensor, "min")
-        if self.config.summary_mode == Const.MD5:
+        if tensor_stat.stack_tensor_stat is None:
+            tensor_json.update({"Max": tensor_stat.max})
+            tensor_json.update({"Min": tensor_stat.min})
+            tensor_json.update({"Mean": tensor_stat.mean})
+            tensor_json.update({"Norm": tensor_stat.norm})
+            tensor_json.update({"requires_grad": tensor.requires_grad})
+            if tensor_stat.max is not None:
+                if np.isinf(tensor_stat.max) or np.isnan(tensor_stat.max):
+                    tensor_json['Max_except_inf_nan'] = self.handle_tensor_extremum_nan_inf(tensor, "max")
+            if tensor_stat.min is not None:
+                if np.isinf(tensor_stat.min) or np.isnan(tensor_stat.min):
+                    tensor_json['Min_except_inf_nan'] = self.handle_tensor_extremum_nan_inf(tensor, "min")
+        else:
+            tensor_json.update({"requires_grad": tensor.requires_grad})
+            tensor_json.update({"tensor_stat": tensor_stat.stack_tensor_stat})
+        if self.config.summary_mode == Const.MD5 and not self.config.async_dump:
             tensor_md5 = self.get_md5_for_tensor(tensor)
             tensor_json.update({Const.MD5: tensor_md5})
         return tensor_json
@@ -207,13 +296,28 @@ class StatisticsDataProcessor(PytorchDataProcessor):
 class TensorDataProcessor(PytorchDataProcessor):
+    def dump_async_data(self):
+        for file_path, tensor in self._async_dump_cache.items():
+            save_pt(tensor.contiguous(), file_path)
+        self._async_dump_cache.clear()
     def _analyze_tensor(self, tensor, suffix):
         dump_data_name, file_path = self.get_save_file_path(suffix)
-        saved_tensor = tensor.clone().contiguous().detach()
-        save_pt(saved_tensor, file_path)
         single_arg = super()._analyze_tensor(tensor, suffix)
         single_arg.update({"data_name": dump_data_name})
+        if self.config.async_dump:
+            self._async_dump_cache[file_path] = tensor.clone().detach()
+        else:
+            saved_tensor = tensor.clone().contiguous().detach()
+            save_pt(saved_tensor, file_path)
         return single_arg
+    def _analyze_numpy(self, ndarray, suffix):
+        dump_data_name, file_path = self.get_save_file_path(suffix)
+        save_pt(torch.tensor(ndarray), file_path)
+        ndarray_json = super()._analyze_numpy(ndarray, suffix)
+        ndarray_json.update({"data_name": dump_data_name})
+        return ndarray_json
 class OverflowCheckDataProcessor(PytorchDataProcessor):
@@ -223,7 +327,7 @@ class OverflowCheckDataProcessor(PytorchDataProcessor):
         super().__init__(config, data_writer)
         self.has_overflow = False
         self.support_inf_nan = None
-        self.cached_inplace_api_info = {}
+        self.cached_api_info = {}
         self.cached_tensors_and_file_paths = {}
         self.bits_for_overflow = 8
         self.real_overflow_nums = 0
@@ -237,21 +341,21 @@ class OverflowCheckDataProcessor(PytorchDataProcessor):
             return True
         return False
-    def analyze_pre_forward_inplace(self, name, module_input_output: ModuleForwardInputsOutputs):
+    def analyze_forward_input(self, name, module, module_input_output: ModuleForwardInputsOutputs):
         self.has_overflow = False
         self._is_support_inf_nan()
-        self.cached_inplace_api_info = super().analyze_pre_forward_inplace(name, module_input_output)
+        self.cached_api_info = super().analyze_forward_input(name, module, module_input_output)
         return None
-    def analyze_forward_inplace(self, name, module_input_output: ModuleForwardInputsOutputs):
+    def analyze_forward_output(self, name, module, module_input_output: ModuleForwardInputsOutputs):
         self._is_support_inf_nan()
-        api_info_struct = super().analyze_forward_inplace(name, module_input_output)
-        if name in self.cached_inplace_api_info and name in api_info_struct:
-            self.cached_inplace_api_info[name].update(api_info_struct[name])
+        api_info_struct = super().analyze_forward_output(name, module, module_input_output)
+        if name in self.cached_api_info and name in api_info_struct:
+            self.cached_api_info[name].update(api_info_struct[name])
         elif name in api_info_struct:
-            self.cached_inplace_api_info = api_info_struct
+            self.cached_api_info = api_info_struct
         self.handle_overflow()
-        return self.cached_inplace_api_info if self.has_overflow else None
+        return self.cached_api_info if self.has_overflow else None
     def analyze_forward(self, name, module, module_input_output: ModuleForwardInputsOutputs):
         self.has_overflow = False
@@ -267,6 +371,13 @@ class OverflowCheckDataProcessor(PytorchDataProcessor):
         self.handle_overflow()
         return api_info_struct if self.has_overflow else None
+    def analyze_params(self, name, param_name, grad):
+        self.has_overflow = False
+        self._is_support_inf_nan()
+        api_info_struct = super().analyze_params(name, param_name, grad)
+        self.handle_overflow()
+        return api_info_struct if self.has_overflow else None
     def handle_overflow(self):
         if not self.support_inf_nan:
             self._analyze_maybe_overflow_flag()
@@ -340,10 +451,10 @@ class FreeBenchmarkDataProcessor(PytorchDataProcessor):
             )
         return
-    def analyze_pre_forward(self, name, module, module_input_output: ModuleForwardInputsOutputs):
+    def analyze_forward_input(self, name, module, module_input_output: ModuleForwardInputsOutputs):
         self.checker.pre_forward(name, module, self, module_input_output.args, module_input_output.kwargs)
-    def analyze_forward(self, name, module, module_input_output: ModuleForwardInputsOutputs):
+    def analyze_forward_output(self, name, module, module_input_output: ModuleForwardInputsOutputs):
         new_output, unequal_rows = self.checker.forward(
             name,
             module,
@@ -388,7 +499,7 @@ class KernelDumpDataProcessor(PytorchDataProcessor):
     def _print_unsupported_log(api_name):
         logger.warning(f"The kernel dump does not support the {api_name} API.")
-    def analyze_pre_forward(self, name, module, module_input_output):
+    def analyze_forward_input(self, name, module, module_input_output):
         if not self.enable_kernel_dump:
             return
         if is_gpu:
@@ -413,7 +524,7 @@ class KernelDumpDataProcessor(PytorchDataProcessor):
             return
         self.start_kernel_dump(self.config.kernel_config_path)
-    def analyze_forward(self, name, module, module_input_output):
+    def analyze_forward_output(self, name, module, module_input_output):
         if not self.enable_kernel_dump:
             return
         if self.config.is_backward_kernel_dump:

msprobe/core/data_dump/json_writer.py CHANGED Viewed

@@ -15,10 +15,13 @@
 import csv
 import os
+import copy
+import numpy as np
 from msprobe.core.common.const import Const, FileCheckConst
-from msprobe.core.common.file_utils import change_mode, FileOpen, save_json
+from msprobe.core.common.file_utils import change_mode, FileOpen, save_json, load_json
 from msprobe.core.common.log import logger
+from msprobe.core.common.exceptions import MsprobeException
 class DataWriter:
@@ -29,10 +32,12 @@ class DataWriter:
         self.construct_file_path = None
         self.free_benchmark_file_path = None
         self.dump_tensor_data_dir = None
+        self.debug_file_path = None
         self.flush_size = 1000
         self.cache_data = {}
         self.cache_stack = {}
         self.cache_construct = {}
+        self.cache_debug = {}
     @staticmethod
     def write_data_to_csv(result: list, result_header: tuple, file_path: str):
@@ -55,6 +60,13 @@ class DataWriter:
         self.cache_construct = {}
     def initialize_json_file(self, **kwargs):
+        if self.debug_file_path and not self.cache_debug:
+            # debug level case only create debug.json
+            debug_dict = copy.deepcopy(kwargs)
+            debug_dict.update({"dump_data_dir": self.dump_tensor_data_dir, Const.DATA: {}})
+            self.cache_debug = debug_dict
+            save_json(self.debug_file_path, self.cache_debug, indent=1)
+            return
         if not self.cache_data:
             kwargs.update({"dump_data_dir": self.dump_tensor_data_dir, Const.DATA: {}})
             self.cache_data = kwargs
@@ -64,13 +76,13 @@ class DataWriter:
         if not self.cache_construct:
             save_json(self.construct_file_path, self.cache_construct, indent=1)
-    def update_dump_paths(self, dump_file_path, stack_file_path, construct_file_path, dump_data_dir,
-                          free_benchmark_file_path):
-        self.dump_file_path = dump_file_path
-        self.stack_file_path = stack_file_path
-        self.construct_file_path = construct_file_path
-        self.dump_tensor_data_dir = dump_data_dir
-        self.free_benchmark_file_path = free_benchmark_file_path
+    def update_dump_paths(self, dump_path_aggregation):
+        self.dump_file_path = dump_path_aggregation.dump_file_path
+        self.stack_file_path = dump_path_aggregation.stack_file_path
+        self.construct_file_path = dump_path_aggregation.construct_file_path
+        self.dump_tensor_data_dir = dump_path_aggregation.dump_tensor_data_dir
+        self.free_benchmark_file_path = dump_path_aggregation.free_benchmark_file_path
+        self.debug_file_path = dump_path_aggregation.debug_file_path
     def flush_data_periodically(self):
         dump_data = self.cache_data.get(Const.DATA)
@@ -98,6 +110,9 @@ class DataWriter:
     def update_construct(self, new_data):
         self.cache_construct.update(new_data)
+    def update_debug(self, new_data):
+        self.cache_debug['data'].update(new_data)
     def write_data_json(self, file_path):
         logger.info(f"dump.json is at {os.path.dirname(os.path.dirname(file_path))}. ")
         save_json(file_path, self.cache_data, indent=1)
@@ -108,6 +123,9 @@ class DataWriter:
     def write_construct_info_json(self, file_path):
         save_json(file_path, self.cache_construct, indent=1)
+    def write_debug_info_json(self, file_path):
+        save_json(file_path, self.cache_debug, indent=1)
     def write_json(self):
         if self.cache_data:
             self.write_data_json(self.dump_file_path)
@@ -115,3 +133,31 @@ class DataWriter:
             self.write_stack_info_json(self.stack_file_path)
         if self.cache_construct:
             self.write_construct_info_json(self.construct_file_path)
+        if self.cache_debug:
+            self.write_debug_info_json(self.debug_file_path)
+    def fill_stack_tensor_data(self):
+        self.process_stat_data_recursive(self.cache_data)
+    def process_stat_data_recursive(self, data, depth=0):
+        if depth > Const.MAX_DEPTH:
+            logger.error(f"The maximum depth of recursive process stat data, {Const.MAX_DEPTH} is reached.")
+            raise MsprobeException(MsprobeException.RECURSION_LIMIT_ERROR)
+        if isinstance(data, dict):
+            if "tensor_stat" in data.keys():
+                tensor_stat = data["tensor_stat"]
+                if len(tensor_stat) != Const.TENSOR_STAT_LEN or len(tensor_stat[0]) != len(tensor_stat[1]):
+                    logger.warning("Some bad data in async dump")
+                else:
+                    tensor_stat_index, tensor_stat_data = tensor_stat[0], tensor_stat[1]
+                    if hasattr(tensor_stat_data, "device") and tensor_stat_data.device != Const.CPU_LOWERCASE:
+                        tensor_stat_data = tensor_stat_data.cpu()
+                    for index, stat in zip(tensor_stat_index, tensor_stat_data):
+                        data.update({index: stat.item()})
+                del data["tensor_stat"]
+            else:
+                for key in data.keys():
+                    self.process_stat_data_recursive(data[key], depth + 1)
+        elif isinstance(data, (list, tuple)):
+            for i in data:
+                self.process_stat_data_recursive(i, depth + 1)

msprobe/core/data_dump/scope.py CHANGED Viewed

@@ -45,7 +45,7 @@ class ScopeFactory:
         if self.level == Const.LEVEL_MIX:
             return mix_range_scope
         if not self.scope:
             return api_range_scope
         if api_range_scope.is_valid and module_range_scope.is_valid:
@@ -73,21 +73,21 @@ class BaseScope(ABC):
     def rectify_args(scope, api_list):
         if not isinstance(api_list, list):
             raise ScopeException(ScopeException.InvalidApiStr,
-                f"api_list参数须配置为列表，实际类型为{type(api_list)}.")
+                                 f"api_list参数须配置为列表，实际类型为{type(api_list)}.")
         for api in api_list:
             if not isinstance(api, str):
                 raise ScopeException(ScopeException.InvalidApiStr,
-                    f"api_list中的元素须配置为字符串，实际类型为{type(api)}.")
+                                     f"api_list中的元素须配置为字符串，实际类型为{type(api)}.")
         if isinstance(scope, str):
             scope = [scope]
             return scope, api_list
         if not isinstance(scope, list):
             raise ScopeException(ScopeException.InvalidScope,
-                f"scope参数须配置为字符串或列表，实际类型为{type(scope)}.")
+                                 f"scope参数须配置为字符串或列表，实际类型为{type(scope)}.")
         for s in scope:
             if not isinstance(s, str):
                 raise ScopeException(ScopeException.InvalidScope,
-                f"scope列表元素要求类型为字符串，实际类型为{type(s)}.")
+                                     f"scope列表元素要求类型为字符串，实际类型为{type(s)}.")
         return scope, api_list
     @abstractmethod
@@ -108,7 +108,7 @@ class ListScope(BaseScope):
     def rectify_args(scope, api_list):
         if scope and api_list:
             raise ScopeException(ScopeException.ArgConflict,
-                f"scope和api_list不可以同时配置，实际配置为scope={scope}, api_list={api_list}.")
+                                 f"scope和api_list不可以同时配置，实际配置为scope={scope}, api_list={api_list}.")
         return super(ListScope, ListScope).rectify_args(scope, api_list)
     def check(self, name):
@@ -123,6 +123,7 @@ class RangeScope(BaseScope, ABC):
         super().__init__(*args)
         self.in_scope = False
         self.in_list = False
+        self.start_name_set = set()
         self.is_valid = self.check_scope_is_valid()
     def check_name_pattern(self, name):
@@ -133,23 +134,23 @@ class RangeScope(BaseScope, ABC):
         if self.level == Const.LEVEL_L1:
             if not re.match(api_pattern, name):
                 raise ScopeException(ScopeException.InvalidScope,
-                                    f"scope参数格式错误，要求格式为api完整命名，实际为{name}.")
+                                     f"scope参数格式错误，要求格式为api完整命名，实际为{name}.")
         if self.level == Const.LEVEL_L0:
             if not re.match(module_pattern, name):
                 raise ScopeException(ScopeException.InvalidScope,
-                                    f"scope参数格式错误，要求格式为模块完整命名，实际为{name}.")
+                                     f"scope参数格式错误，要求格式为模块完整命名，实际为{name}.")
         if self.level == Const.LEVEL_MIX:
             if not re.match(api_pattern, name) and not re.match(module_pattern, name):
                 raise ScopeException(ScopeException.InvalidScope,
-                                    f"scope参数格式错误，要求格式为api或模块完整命名，实际为{name}.")
+                                     f"scope参数格式错误，要求格式为api或模块完整命名，实际为{name}.")
     def rectify_args(self, scope, api_list):
         scope, api_list = super(RangeScope, RangeScope).rectify_args(scope, api_list)
         if scope and len(scope) != 2:
             raise ScopeException(ScopeException.InvalidScope,
-                f"scope参数指定区间断点，须传入长度为2的列表，实际长度为{len(scope)}.")
+                                 f"scope参数指定区间断点，须传入长度为2的列表，实际长度为{len(scope)}.")
         for name in scope:
             self.check_name_pattern(name)
         return scope, api_list
@@ -229,30 +230,31 @@ class ModuleRangeScope(RangeScope):
 class MixRangeScope(RangeScope):
     def check_scope_is_valid(self):
         return True if self.scope else False
     def begin_module(self, module_name):
         if self.scope and module_name == self.scope[0]:
             self.in_scope = True
         for name in self.api_list:
             if name in module_name:
                 self.in_list = True
+                self.start_name_set.add(module_name)  # 记录每一个开启in_list的module_name
     def end_module(self, module_name):
         if self.scope and module_name == self.scope[1]:
             self.in_scope = False
-        for name in self.api_list:
-            if name in module_name:
-                self.in_list = False
+        self.start_name_set.discard(module_name)  # 从集合中删除每一个module_name
+        if not self.start_name_set:  # 如果集合为空，说明当前module_name是最后一个开启in_list的module_name
+            self.in_list = False  # 关闭in_list
     def check_api_list(self, api_name):
         if not self.api_list:
             return True
         for name in self.api_list:
             if name in api_name:
                 return True
         return False
     def check(self, name):
         """
         dump时调用的接口，根据scope和api_list判断是否需要dump
@@ -270,4 +272,3 @@ class MixRangeScope(RangeScope):
         if self.scope and name == self.scope[1]:
             self.in_scope = False
         return result

msprobe/core/overflow_check/abnormal_scene.py CHANGED Viewed

@@ -37,7 +37,11 @@ class AnomalyScene:
     @staticmethod
     def _has_anomaly(data: Union[Dict, Any]) -> bool:
         """检查张量是否包含异常值"""
-        return has_nan_inf(data)
+        if isinstance(data, dict):
+            return has_nan_inf(data)
+        elif isinstance(data, list):
+            return any(AnomalyScene._has_anomaly(x) for x in data)
+        return False
     def get_details(self) -> Dict:
         """获取异常详情"""
@@ -70,14 +74,14 @@ class InputOutputAnomalyScene(AnomalyScene):
     def has_input_anomaly(self) -> bool:
         """检查输入是否有异常（包括args和kwargs）"""
         # args
-        args_anomaly = any(self._has_anomaly(x) for x in self.api_data.input_args if isinstance(x, dict))
+        args_anomaly = any(self._has_anomaly(x) for x in self.api_data.input_args)
         # kwargs
-        kwargs_anomaly = any(self._has_anomaly(x) for x in self.api_data.input_kwargs.values() if isinstance(x, dict))
+        kwargs_anomaly = any(self._has_anomaly(x) for x in self.api_data.input_kwargs.values())
         return args_anomaly or kwargs_anomaly
     def has_output_anomaly(self) -> bool:
         """检查输出是否有异常"""
-        return any(self._has_anomaly(x) for x in self.api_data.output_data if isinstance(x, dict))
+        return any(self._has_anomaly(x) for x in self.api_data.output_data)
     def matches(self) -> bool:
         """判断是否匹配该场景"""
@@ -121,7 +125,7 @@ class NumericalMutationScene(AnomalyScene):
     """
         检查数值突变，统计输入args、kwargs中norm值，同时统计输出的norm最大值，计算差异，大于 threshold 则认为是异常情况
     """
-    def __init__(self, api_info: APIInfo, threshold: float = 100000.0):
+    def __init__(self, api_info: APIInfo, threshold: float = 100.0):
         super().__init__(api_info)
         self.threshold = threshold

mindstudio-probe 1.1.1__py3-none-any.whl → 1.2.2__py3-none-any.whl

mindstudio-probe 1.1.1__py3-none-any.whl → 1.2.2__py3-none-any.whl