PyPI - mindstudio-probe - Versions diffs - 1.1.0__py3-none-any.whl → 1.2.1__py3-none-any.whl - Mend

mindstudio-probe 1.1.0-py3-none-any.whl → 1.2.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (299)

{mindstudio_probe-1.1.0.dist-info → mindstudio_probe-1.2.1.dist-info}/METADATA +7 -6
mindstudio_probe-1.2.1.dist-info/RECORD +396 -0
{mindstudio_probe-1.1.0.dist-info → mindstudio_probe-1.2.1.dist-info}/WHEEL +1 -1
{mindstudio_probe-1.1.0.dist-info → mindstudio_probe-1.2.1.dist-info}/entry_points.txt +0 -1
msprobe/CMakeLists.txt +5 -0
msprobe/README.md +51 -20
msprobe/config.json +2 -3
msprobe/core/advisor/advisor.py +8 -3
msprobe/core/common/const.py +264 -15
msprobe/core/common/exceptions.py +27 -3
msprobe/core/common/file_utils.py +176 -26
msprobe/core/common/inplace_op_checker.py +15 -0
msprobe/core/common/inplace_ops.yaml +3 -0
msprobe/core/common/log.py +27 -9
msprobe/core/common/utils.py +204 -77
msprobe/core/common_config.py +49 -14
msprobe/core/compare/acc_compare.py +274 -198
msprobe/core/compare/check.py +32 -33
msprobe/core/compare/compare_cli.py +32 -14
msprobe/core/compare/highlight.py +283 -127
msprobe/core/compare/layer_mapping/__init__.py +19 -0
msprobe/core/compare/layer_mapping/data_scope_parser.py +246 -0
msprobe/core/compare/layer_mapping/layer_mapping.py +249 -0
msprobe/core/compare/layer_mapping/postprocess_pass.py +95 -0
msprobe/core/compare/merge_result/merge_result.py +380 -0
msprobe/core/compare/merge_result/merge_result_cli.py +31 -0
msprobe/core/compare/multiprocessing_compute.py +2 -2
msprobe/core/compare/npy_compare.py +135 -144
msprobe/core/compare/utils.py +419 -274
msprobe/core/data_dump/data_collector.py +60 -28
msprobe/core/data_dump/data_processor/base.py +84 -36
msprobe/core/data_dump/data_processor/factory.py +5 -3
msprobe/core/data_dump/data_processor/mindspore_processor.py +152 -18
msprobe/core/data_dump/data_processor/pytorch_processor.py +267 -110
msprobe/core/data_dump/json_writer.py +29 -1
msprobe/core/data_dump/scope.py +119 -39
msprobe/core/grad_probe/constant.py +27 -13
msprobe/core/grad_probe/grad_compare.py +18 -1
msprobe/core/grad_probe/utils.py +30 -2
msprobe/core/overflow_check/abnormal_scene.py +189 -0
msprobe/core/overflow_check/api_info.py +55 -0
msprobe/core/overflow_check/checker.py +138 -0
msprobe/core/overflow_check/filter.py +157 -0
msprobe/core/overflow_check/ignore_rules.yaml +55 -0
msprobe/core/overflow_check/level.py +22 -0
msprobe/core/overflow_check/utils.py +28 -0
msprobe/docs/01.installation.md +96 -7
msprobe/docs/02.config_introduction.md +50 -23
msprobe/docs/03.config_examples.md +2 -9
msprobe/docs/04.kernel_dump_PyTorch.md +73 -0
msprobe/docs/05.data_dump_PyTorch.md +93 -61
msprobe/docs/06.data_dump_MindSpore.md +200 -95
msprobe/docs/07.accuracy_checker_PyTorch.md +28 -28
msprobe/docs/08.accuracy_checker_online_PyTorch.md +1 -6
msprobe/docs/09.accuracy_checker_MindSpore.md +44 -8
msprobe/docs/10.accuracy_compare_PyTorch.md +114 -50
msprobe/docs/11.accuracy_compare_MindSpore.md +340 -48
msprobe/docs/12.overflow_check_PyTorch.md +2 -2
msprobe/docs/13.overflow_check_MindSpore.md +6 -6
msprobe/docs/15.free_benchmarking_PyTorch.md +4 -5
msprobe/docs/16.free_benchmarking_MindSpore.md +56 -37
msprobe/docs/17.grad_probe.md +5 -6
msprobe/docs/19.monitor.md +561 -0
msprobe/docs/20.monitor_performance_baseline.md +52 -0
msprobe/docs/21.visualization_PyTorch.md +466 -0
msprobe/docs/22.visualization_MindSpore.md +481 -0
msprobe/docs/23.generate_operator_PyTorch.md +107 -0
msprobe/docs/24.code_mapping_Mindspore.md +28 -0
msprobe/docs/25.tool_function_introduction.md +29 -0
msprobe/docs/26.data_dump_PyTorch_baseline.md +37 -0
msprobe/docs/27.dump_json_instruction.md +521 -0
msprobe/docs/FAQ.md +29 -2
msprobe/docs/accuracy_checker_MindSpore/accuracy_checker_MindSpore_baseline.md +14 -0
msprobe/docs/data_dump_MindSpore/data_dump_MindSpore_baseline.md +22 -0
msprobe/docs/data_dump_MindSpore/dynamic_graph_quick_start_example.md +211 -0
msprobe/docs/img/compare_result.png +0 -0
msprobe/docs/img/merge_result.png +0 -0
msprobe/docs/img/monitor/cpu_info.png +0 -0
msprobe/docs/img/visualization/fuzzy_match_ms.png +0 -0
msprobe/docs/img/visualization/fuzzy_match_pt.png +0 -0
msprobe/docs/img/visualization/tensorboard_1.png +0 -0
msprobe/docs/img/visualization/tensorboard_2.png +0 -0
msprobe/docs/img/visualization/vis_browser_1.png +0 -0
msprobe/docs/img/visualization/vis_browser_2.png +0 -0
msprobe/docs/img/visualization/vis_precision_info.png +0 -0
msprobe/docs/img/visualization/vis_search_info.png +0 -0
msprobe/docs/img/visualization/vis_show_info.png +0 -0
msprobe/docs/img/visualization/vis_showcase.png +0 -0
msprobe/docs/img/visualization/vis_unmatch_info.png +0 -0
msprobe/docs/visualization/GPTModel.png +0 -0
msprobe/docs/visualization/ParallelMLP.png +0 -0
msprobe/docs/visualization/layer_mapping_example.md +132 -0
msprobe/docs/visualization/mapping.png +0 -0
msprobe/docs/visualization/mapping1.png +0 -0
msprobe/docs/visualization/module_name.png +0 -0
msprobe/docs/visualization/module_name1.png +0 -0
msprobe/docs/visualization/no_mapping.png +0 -0
msprobe/docs/visualization/no_mapping1.png +0 -0
msprobe/docs/visualization/no_mapping_analyze.png +0 -0
msprobe/docs/visualization/top_layer.png +0 -0
msprobe/mindspore/__init__.py +25 -0
msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +151 -151
msprobe/mindspore/api_accuracy_checker/api_info.py +21 -6
msprobe/mindspore/api_accuracy_checker/api_runner.py +43 -18
msprobe/mindspore/api_accuracy_checker/base_compare_algorithm.py +21 -7
msprobe/mindspore/api_accuracy_checker/checker_support_api.yaml +77 -0
msprobe/mindspore/api_accuracy_checker/cmd_parser.py +64 -1
msprobe/mindspore/api_accuracy_checker/compute_element.py +64 -31
msprobe/mindspore/api_accuracy_checker/data_manager.py +301 -0
msprobe/mindspore/api_accuracy_checker/main.py +28 -3
msprobe/mindspore/api_accuracy_checker/multi_api_accuracy_checker.py +212 -0
msprobe/mindspore/api_accuracy_checker/multi_data_manager.py +60 -0
msprobe/mindspore/api_accuracy_checker/type_mapping.py +22 -5
msprobe/mindspore/api_accuracy_checker/utils.py +34 -17
msprobe/mindspore/cell_processor.py +33 -12
msprobe/mindspore/code_mapping/bind.py +264 -0
msprobe/mindspore/code_mapping/cmd_parser.py +40 -0
msprobe/mindspore/code_mapping/graph.py +49 -0
msprobe/mindspore/code_mapping/graph_parser.py +226 -0
msprobe/mindspore/code_mapping/main.py +24 -0
msprobe/mindspore/code_mapping/processor.py +34 -0
msprobe/mindspore/common/const.py +35 -13
msprobe/mindspore/common/log.py +5 -9
msprobe/mindspore/common/utils.py +88 -4
msprobe/mindspore/compare/distributed_compare.py +22 -24
msprobe/mindspore/compare/ms_compare.py +333 -268
msprobe/mindspore/compare/ms_graph_compare.py +95 -52
msprobe/mindspore/debugger/debugger_config.py +7 -1
msprobe/mindspore/debugger/precision_debugger.py +87 -12
msprobe/mindspore/dump/dump_tool_factory.py +3 -1
msprobe/mindspore/dump/hook_cell/api_registry.py +95 -18
msprobe/mindspore/dump/hook_cell/hook_cell.py +60 -38
msprobe/mindspore/dump/hook_cell/primitive_hooks.py +45 -30
msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml +36 -1
msprobe/mindspore/dump/hook_cell/wrap_api.py +92 -1
msprobe/mindspore/dump/jit_dump.py +17 -5
msprobe/mindspore/dump/kernel_dump/kernel_config.py +33 -0
msprobe/mindspore/dump/kernel_graph_dump.py +9 -4
msprobe/mindspore/dump/kernel_kbyk_dump.py +2 -4
msprobe/mindspore/dym_loader/hook_dynamic_loader.cc +140 -0
msprobe/mindspore/dym_loader/hook_dynamic_loader.h +53 -0
msprobe/mindspore/free_benchmark/api_pynative_self_check.py +156 -41
msprobe/mindspore/free_benchmark/common/handler_params.py +1 -2
msprobe/mindspore/free_benchmark/common/utils.py +19 -4
msprobe/mindspore/free_benchmark/data/support_wrap_ops.yaml +0 -204
msprobe/mindspore/free_benchmark/handler/base_handler.py +3 -3
msprobe/mindspore/free_benchmark/handler/check_handler.py +4 -5
msprobe/mindspore/free_benchmark/handler/fix_handler.py +4 -4
msprobe/mindspore/free_benchmark/handler/handler_factory.py +4 -4
msprobe/mindspore/free_benchmark/perturbation/add_noise.py +2 -2
msprobe/mindspore/free_benchmark/perturbation/base_perturbation.py +15 -6
msprobe/mindspore/free_benchmark/perturbation/bit_noise.py +2 -2
msprobe/mindspore/free_benchmark/perturbation/exchange_value.py +2 -2
msprobe/mindspore/free_benchmark/perturbation/improve_precision.py +13 -6
msprobe/mindspore/free_benchmark/perturbation/perturbation_factory.py +2 -2
msprobe/mindspore/free_benchmark/self_check_tool_factory.py +2 -2
msprobe/mindspore/grad_probe/global_context.py +28 -8
msprobe/mindspore/grad_probe/grad_analyzer.py +50 -24
msprobe/mindspore/grad_probe/grad_monitor.py +16 -1
msprobe/mindspore/grad_probe/grad_stat_csv.py +33 -5
msprobe/mindspore/grad_probe/hook.py +35 -12
msprobe/mindspore/grad_probe/utils.py +18 -5
msprobe/mindspore/mindtorch/__init__.py +18 -0
msprobe/mindspore/mindtorch/mindtorch_adaptor.py +255 -0
msprobe/mindspore/ms_config.py +27 -16
msprobe/mindspore/overflow_check/kernel_graph_overflow_check.py +9 -4
msprobe/mindspore/runtime.py +15 -0
msprobe/mindspore/service.py +285 -113
msprobe/mindspore/task_handler_factory.py +15 -0
msprobe/msprobe.py +48 -10
msprobe/pytorch/__init__.py +8 -6
msprobe/pytorch/api_accuracy_checker/common/config.py +62 -0
msprobe/pytorch/api_accuracy_checker/common/utils.py +31 -16
msprobe/pytorch/api_accuracy_checker/compare/algorithm.py +41 -8
msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +103 -271
msprobe/pytorch/api_accuracy_checker/compare/api_precision_standard.yaml +4 -1
msprobe/pytorch/api_accuracy_checker/compare/compare.py +69 -68
msprobe/pytorch/api_accuracy_checker/compare/compare_column.py +54 -0
msprobe/pytorch/api_accuracy_checker/compare/compare_input.py +51 -0
msprobe/pytorch/api_accuracy_checker/compare/compare_utils.py +2 -4
msprobe/pytorch/api_accuracy_checker/generate_op_script/config_op.json +9 -0
msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +478 -0
msprobe/pytorch/api_accuracy_checker/generate_op_script/operator_replication.template +365 -0
msprobe/pytorch/api_accuracy_checker/precision_standard/absolute_threshold.py +106 -0
msprobe/pytorch/api_accuracy_checker/precision_standard/accumulative_error_compare.py +107 -0
msprobe/pytorch/api_accuracy_checker/precision_standard/base_standard.py +151 -0
msprobe/pytorch/api_accuracy_checker/precision_standard/benchmark_compare.py +226 -0
msprobe/pytorch/api_accuracy_checker/precision_standard/binary_consistency.py +68 -0
msprobe/pytorch/api_accuracy_checker/precision_standard/standard_config.py +218 -0
msprobe/pytorch/api_accuracy_checker/precision_standard/standard_register.py +104 -0
msprobe/pytorch/api_accuracy_checker/precision_standard/thousandth_standard.py +63 -0
msprobe/pytorch/api_accuracy_checker/precision_standard/ulp_compare.py +200 -0
msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py +63 -2
msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +21 -15
msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +54 -22
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +140 -71
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py +49 -8
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py +9 -24
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +4 -12
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +5 -3
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/dump_dispatch.py +9 -4
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +3 -11
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/utils.py +2 -2
msprobe/pytorch/bench_functions/confusion_transpose.py +5 -1
msprobe/pytorch/bench_functions/matmul_backward.py +12 -0
msprobe/pytorch/bench_functions/npu_fusion_attention.py +142 -16
msprobe/pytorch/bench_functions/rotary_mul.py +4 -0
msprobe/pytorch/bench_functions/swiglu.py +10 -2
msprobe/pytorch/common/parse_json.py +7 -6
msprobe/pytorch/common/utils.py +101 -7
msprobe/pytorch/compare/distributed_compare.py +17 -30
msprobe/pytorch/compare/pt_compare.py +44 -22
msprobe/pytorch/debugger/debugger_config.py +46 -27
msprobe/pytorch/debugger/precision_debugger.py +42 -12
msprobe/pytorch/dump/kernel_dump/kernel_config.py +33 -0
msprobe/pytorch/dump/module_dump/module_dump.py +86 -0
msprobe/pytorch/{module_processer.py → dump/module_dump/module_processer.py} +81 -10
msprobe/pytorch/free_benchmark/common/constant.py +15 -0
msprobe/pytorch/free_benchmark/common/counter.py +15 -0
msprobe/pytorch/free_benchmark/common/enums.py +15 -0
msprobe/pytorch/free_benchmark/common/params.py +10 -2
msprobe/pytorch/free_benchmark/common/utils.py +29 -4
msprobe/pytorch/free_benchmark/compare/grad_saver.py +20 -5
msprobe/pytorch/free_benchmark/compare/single_benchmark.py +2 -0
msprobe/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py +3 -1
msprobe/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py +6 -4
msprobe/pytorch/free_benchmark/perturbed_layers/npu/change_value.py +2 -0
msprobe/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py +4 -0
msprobe/pytorch/free_benchmark/result_handlers/base_handler.py +41 -47
msprobe/pytorch/free_benchmark/result_handlers/fix_handler.py +6 -5
msprobe/pytorch/free_benchmark/result_handlers/preheat_handler.py +0 -4
msprobe/pytorch/grad_probe/grad_monitor.py +23 -6
msprobe/pytorch/grad_probe/grad_stat_csv.py +40 -10
msprobe/pytorch/hook_module/__init__.py +1 -1
msprobe/pytorch/hook_module/hook_module.py +14 -11
msprobe/pytorch/hook_module/register_optimizer_hook.py +59 -0
msprobe/pytorch/hook_module/support_wrap_ops.yaml +35 -0
msprobe/pytorch/hook_module/wrap_distributed.py +6 -8
msprobe/pytorch/hook_module/wrap_functional.py +0 -38
msprobe/pytorch/monitor/__init__.py +0 -0
msprobe/pytorch/monitor/anomaly_analyse.py +201 -0
msprobe/pytorch/monitor/anomaly_detect.py +425 -0
msprobe/pytorch/monitor/csv2tb.py +166 -0
msprobe/pytorch/monitor/distributed/__init__.py +0 -0
msprobe/pytorch/monitor/distributed/distributed_ops.yaml +19 -0
msprobe/pytorch/monitor/distributed/stack_blacklist.yaml +5 -0
msprobe/pytorch/monitor/distributed/wrap_distributed.py +283 -0
msprobe/pytorch/monitor/features.py +108 -0
msprobe/pytorch/monitor/module_hook.py +1076 -0
msprobe/pytorch/monitor/module_metric.py +172 -0
msprobe/pytorch/monitor/module_spec_verifier.py +95 -0
msprobe/pytorch/monitor/optimizer_collect.py +333 -0
msprobe/pytorch/monitor/unittest/__init__.py +0 -0
msprobe/pytorch/monitor/unittest/test_monitor.py +160 -0
msprobe/pytorch/monitor/utils.py +321 -0
msprobe/pytorch/monitor/visualizer.py +59 -0
msprobe/pytorch/online_dispatch/__init__.py +2 -3
msprobe/pytorch/online_dispatch/compare.py +29 -38
msprobe/pytorch/online_dispatch/dispatch.py +58 -27
msprobe/pytorch/online_dispatch/dump_compare.py +21 -9
msprobe/pytorch/online_dispatch/single_compare.py +53 -32
msprobe/pytorch/online_dispatch/torch_ops_config.yaml +1 -1
msprobe/pytorch/online_dispatch/utils.py +49 -21
msprobe/pytorch/parse_tool/lib/compare.py +21 -27
msprobe/pytorch/parse_tool/lib/config.py +6 -8
msprobe/pytorch/parse_tool/lib/file_desc.py +15 -1
msprobe/pytorch/parse_tool/lib/interactive_cli.py +10 -10
msprobe/pytorch/parse_tool/lib/parse_exception.py +7 -7
msprobe/pytorch/parse_tool/lib/parse_tool.py +12 -12
msprobe/pytorch/parse_tool/lib/utils.py +33 -53
msprobe/pytorch/parse_tool/lib/visualization.py +11 -10
msprobe/pytorch/pt_config.py +31 -8
msprobe/pytorch/service.py +188 -108
msprobe/visualization/__init__.py +14 -0
msprobe/visualization/builder/__init__.py +14 -0
msprobe/visualization/builder/graph_builder.py +222 -0
msprobe/visualization/builder/msprobe_adapter.py +227 -0
msprobe/visualization/compare/__init__.py +14 -0
msprobe/visualization/compare/graph_comparator.py +180 -0
msprobe/visualization/compare/mode_adapter.py +197 -0
msprobe/visualization/graph/__init__.py +14 -0
msprobe/visualization/graph/base_node.py +119 -0
msprobe/visualization/graph/distributed_analyzer.py +318 -0
msprobe/visualization/graph/graph.py +209 -0
msprobe/visualization/graph/node_colors.py +95 -0
msprobe/visualization/graph/node_op.py +39 -0
msprobe/visualization/graph_service.py +288 -0
msprobe/visualization/utils.py +217 -0
mindstudio_probe-1.1.0.dist-info/RECORD +0 -287
msprobe/docs/04.acl_config_examples.md +0 -78
msprobe/mindspore/compare/layer_mapping.py +0 -146
msprobe/mindspore/compare/modify_mapping.py +0 -107
msprobe/mindspore/free_benchmark/decorator/dec_forward.py +0 -57
msprobe/mindspore/free_benchmark/decorator/decorator_factory.py +0 -122
msprobe/pytorch/functional/module_dump.py +0 -84
{mindstudio_probe-1.1.0.dist-info → mindstudio_probe-1.2.1.dist-info}/LICENSE +0 -0
{mindstudio_probe-1.1.0.dist-info → mindstudio_probe-1.2.1.dist-info}/top_level.txt +0 -0
/msprobe/mindspore/{free_benchmark/decorator → code_mapping}/__init__.py +0 -0
/msprobe/pytorch/{functional → dump/module_dump}/__init__.py +0 -0

msprobe/core/data_dump/data_processor/pytorch_processor.py CHANGED

@@ -13,19 +13,24 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import hashlib
 import zlib
 from dataclasses import asdict
 from typing import List
 import numpy as np
 import torch
+from torch import distributed as dist
 from msprobe.core.common.const import Const
 from msprobe.core.common.file_utils import path_len_exceeds_limit
 from msprobe.core.common.log import logger
+from msprobe.core.common.utils import convert_tuple
 from msprobe.core.data_dump.data_processor.base import BaseDataProcessor, ModuleBackwardInputsOutputs, \
     ModuleForwardInputsOutputs, TensorStatInfo
 from msprobe.pytorch.common.utils import save_pt, load_pt
 from msprobe.pytorch.free_benchmark import FreeBenchmarkCheck, UnequalRow
+from msprobe.core.common.utils import recursion_depth_decorator
 is_gpu = False
 try:
@@ -35,7 +40,13 @@ except ImportError:
 class PytorchDataProcessor(BaseDataProcessor):
-    pytorch_special_type = (torch.device, torch.dtype, torch.Size, torch.Tensor)
+    pytorch_special_type = (torch.device, torch.dtype, torch.Size, torch.Tensor, torch.memory_format, dist.ProcessGroup)
+    memory_format = {
+        torch.contiguous_format: "contiguous_format",
+        torch.channels_last: "channels_last",
+        torch.channels_last_3d: "channels_last_3d",
+        torch.preserve_format: "preserve_format"
+    }
     def __init__(self, config, data_writer):
         super().__init__(config, data_writer)
@@ -43,6 +54,7 @@ class PytorchDataProcessor(BaseDataProcessor):
             "device": self.analyze_device_in_kwargs,
             "dtype": self.analyze_dtype_in_kwargs
         }
+        self._async_dump_cache = {}
     @staticmethod
     def get_md5_for_tensor(x):
@@ -71,53 +83,114 @@ class PytorchDataProcessor(BaseDataProcessor):
         return {"type": "torch.dtype", "value": str(element)}
     @staticmethod
-    def get_stat_info(data):
+    def get_stat_info_async(data):
         tensor_stat = TensorStatInfo()
-        if data.is_meta:
-            return tensor_stat
-        data_clone = data.detach()
-        if data_clone.numel() == 0:
+        if torch.is_complex(data):
+            logger.warning("Async dump do not support complex data!")
             return tensor_stat
-        elif data_clone.dtype == torch.bool:
-            tensor_stat.max = True in data_clone
-            tensor_stat.min = False not in data_clone
-        elif not data_clone.shape:
-            tensor_stat.max = tensor_stat.min = tensor_stat.mean = tensor_stat.norm = data_clone.item()
-        elif torch.is_complex(data_clone):
-            data_np = data_clone.cpu().numpy()
+        elif data.dtype == torch.bool:
+            tensor_stat.stack_tensor_stat = (["Max", "Min"], torch.stack(
+                [torch.any(data), torch.all(data)]))
+        elif not data.shape:
+            tensor_stat.stack_tensor_stat = (["Max", "Min", "Mean", "Norm"], torch.stack([data, data, data, data]))
+        else:
+            if not data.is_floating_point() or data.dtype == torch.float64:
+                data = data.float()
+            tensor_stat.stack_tensor_stat = (["Max", "Min", "Mean", "Norm"], torch.stack([
+                torch.max(data),
+                torch.min(data),
+                torch.mean(data),
+                torch.norm(data)
+            ]))
+        return tensor_stat
+    @staticmethod
+    def get_stat_info_sync(data):
+        tensor_stat = TensorStatInfo()
+        if torch.is_complex(data):
+            data_np = data.cpu().numpy()
             data_abs = np.abs(data_np)
             tensor_stat.max = np.max(data_abs).item()
             tensor_stat.min = np.min(data_abs).item()
             tensor_stat.mean = np.mean(data_abs).item()
+        elif data.dtype == torch.bool:
+            tensor_stat.max = torch.any(data).item()
+            tensor_stat.min = torch.all(data).item()
+        elif not data.shape:
+            tensor_stat.max = tensor_stat.min = tensor_stat.mean = tensor_stat.norm = data.item()
         else:
-            if not data_clone.is_floating_point() or data_clone.dtype == torch.float64:
-                data_clone = data_clone.float()
-            tensor_stat.max = torch._C._VariableFunctionsClass.max(data_clone).item()
-            tensor_stat.min = torch._C._VariableFunctionsClass.min(data_clone).item()
-            tensor_stat.mean = torch._C._VariableFunctionsClass.mean(data_clone).item()
-            tensor_stat.norm = torch._C._VariableFunctionsClass.norm(data_clone).item()
+            if not data.is_floating_point() or data.dtype == torch.float64:
+                data = data.float()
+            tensor_stat.max = torch.max(data).item()
+            tensor_stat.min = torch.min(data).item()
+            tensor_stat.mean = torch.mean(data).item()
+            tensor_stat.norm = torch.norm(data).item()
         return tensor_stat
+    @staticmethod
+    def get_stat_info(data, async_dump=False):
+        tensor_stat = TensorStatInfo()
+        if data.is_meta:
+            return tensor_stat
+        data_clone = data.detach()
+        if data_clone.numel() == 0:
+            return tensor_stat
+        else:
+            if data_clone.device.type == Const.CPU_LOWERCASE or not async_dump:
+                return PytorchDataProcessor.get_stat_info_sync(data_clone)
+            else:
+                return PytorchDataProcessor.get_stat_info_async(data_clone)
     @staticmethod
     def handle_tensor_extremum_nan_inf(tensor, operator):
         data_clone = tensor.detach()
-        data_nan = torch._C._VariableFunctionsClass.isnan(data_clone)
-        if int(torch._C._VariableFunctionsClass.sum(data_nan)) == data_clone.numel():
+        data_nan = torch.isnan(data_clone)
+        if int(torch.sum(data_nan)) == data_clone.numel():
             return float('nan')
-        finite_mask = torch._C._VariableFunctionsClass.isfinite(data_clone)
-        if int(torch._C._VariableFunctionsClass.sum(finite_mask)) > 0:
+        finite_mask = torch.isfinite(data_clone)
+        if int(torch.sum(finite_mask)) > 0:
             finite_values = data_clone[finite_mask]
-            return torch._C._VariableFunctionsClass.max(finite_values).item() if operator == 'max' else \
-                torch._C._VariableFunctionsClass.min(finite_values).item()
+            return torch.max(finite_values).item() if operator == 'max' else \
+                torch.min(finite_values).item()
         else:
             data_no_nan = data_clone[~data_nan]
-            return torch._C._VariableFunctionsClass.max(data_no_nan).item() if operator == 'max' else \
-                torch._C._VariableFunctionsClass.min(data_no_nan).item()
+            return torch.max(data_no_nan).item() if operator == 'max' else \
+                torch.min(data_no_nan).item()
+    @staticmethod
+    def process_group_hash(arg):
+        group_ranks = dist.get_process_group_ranks(arg)
+        group_ranks_hash = hashlib.md5(str(group_ranks).encode('utf-8')).hexdigest()
+        return group_ranks_hash
+    @staticmethod
+    def is_distributed_op(module):
+        return getattr(module, "op_is_distributed", False)
     @staticmethod
     def _analyze_torch_size(arg):
         return {"type": "torch.Size", "value": list(arg)}
+    @staticmethod
+    def _analyze_memory_format(arg):
+        # 获取内存格式
+        format_type = PytorchDataProcessor.memory_format.get(arg)
+        return {"type": "torch.memory_format", "format": format_type}
+    @staticmethod
+    def _analyze_process_group(arg):
+        group_info = {"type": "torch.ProcessGroup"}
+        try:
+            group_ranks = dist.get_process_group_ranks(arg)
+            group_info.update({"group_ranks": group_ranks})
+            group_id = PytorchDataProcessor.process_group_hash(arg)
+            group_info.update({"group_id": group_id})
+        except Exception as e:
+            logger.warning(f"Failed to get process group(id: {group_id}) ranks info with error info: {e}.")
+        return group_info
     @classmethod
     def get_special_types(cls):
         return super().get_special_types() + cls.pytorch_special_type
@@ -127,6 +200,10 @@ class PytorchDataProcessor(BaseDataProcessor):
             return self.torch_object_key[suffix_stack[-1]](element)
         if isinstance(element, torch.Size):
             return self._analyze_torch_size(element)
+        if isinstance(element, torch.memory_format):
+            return self._analyze_memory_format(element)
+        if isinstance(element, dist.ProcessGroup):
+            return self._analyze_process_group(element)
         converted_numpy, numpy_type = self._convert_numpy_to_builtin(element)
         if converted_numpy is not element:
             return self._analyze_numpy(converted_numpy, numpy_type)
@@ -136,26 +213,35 @@ class PytorchDataProcessor(BaseDataProcessor):
             return self._analyze_builtin(element)
         return {}
+    def analyze_forward_output(self, name, module, module_input_output: ModuleForwardInputsOutputs):
+        if self.is_distributed_op(module):
+            module_input_output.update_output_with_args_and_kwargs()
+        return super().analyze_forward_output(name, module, module_input_output)
     def _analyze_tensor(self, tensor, suffix):
-        tensor_stat = self.get_stat_info(tensor)
+        tensor_stat = self.get_stat_info(tensor, self.config.async_dump)
         tensor_json = {}
         tensor_json.update({'type': 'torch.Tensor'})
         tensor_json.update({'dtype': str(tensor.dtype)})
         tensor_json.update({"shape": tensor.shape})
-        tensor_json.update({"Max": tensor_stat.max})
-        tensor_json.update({"Min": tensor_stat.min})
-        tensor_json.update({"Mean": tensor_stat.mean})
-        tensor_json.update({"Norm": tensor_stat.norm})
-        tensor_json.update({"requires_grad": tensor.requires_grad})
-        if tensor_stat.max is not None:
-            if np.isinf(tensor_stat.max) or np.isnan(tensor_stat.max):
-                tensor_json['Max_except_inf_nan'] = self.handle_tensor_extremum_nan_inf(tensor, "max")
-        if tensor_stat.min is not None:
-            if np.isinf(tensor_stat.min) or np.isnan(tensor_stat.min):
-                tensor_json['Min_except_inf_nan'] = self.handle_tensor_extremum_nan_inf(tensor, "min")
-        if self.config.summary_mode == Const.MD5:
+        if tensor_stat.stack_tensor_stat is None:
+            tensor_json.update({"Max": tensor_stat.max})
+            tensor_json.update({"Min": tensor_stat.min})
+            tensor_json.update({"Mean": tensor_stat.mean})
+            tensor_json.update({"Norm": tensor_stat.norm})
+            tensor_json.update({"requires_grad": tensor.requires_grad})
+            if tensor_stat.max is not None:
+                if np.isinf(tensor_stat.max) or np.isnan(tensor_stat.max):
+                    tensor_json['Max_except_inf_nan'] = self.handle_tensor_extremum_nan_inf(tensor, "max")
+            if tensor_stat.min is not None:
+                if np.isinf(tensor_stat.min) or np.isnan(tensor_stat.min):
+                    tensor_json['Min_except_inf_nan'] = self.handle_tensor_extremum_nan_inf(tensor, "min")
+        else:
+            tensor_json.update({"requires_grad": tensor.requires_grad})
+            tensor_json.update({"tensor_stat": tensor_stat.stack_tensor_stat})
+        if self.config.summary_mode == Const.MD5 and not self.config.async_dump:
             tensor_md5 = self.get_md5_for_tensor(tensor)
             tensor_json.update({Const.MD5: tensor_md5})
         return tensor_json
@@ -166,12 +252,20 @@ class StatisticsDataProcessor(PytorchDataProcessor):
 class TensorDataProcessor(PytorchDataProcessor):
+    def dump_async_data(self):
+        for file_path, tensor in self._async_dump_cache.items():
+            save_pt(tensor.contiguous(), file_path)
+        self._async_dump_cache.clear()
     def _analyze_tensor(self, tensor, suffix):
         dump_data_name, file_path = self.get_save_file_path(suffix)
-        saved_tensor = tensor.clone().contiguous().detach()
-        save_pt(saved_tensor, file_path)
         single_arg = super()._analyze_tensor(tensor, suffix)
         single_arg.update({"data_name": dump_data_name})
+        if self.config.async_dump:
+            self._async_dump_cache[file_path] = tensor.clone().detach()
+        else:
+            saved_tensor = tensor.clone().contiguous().detach()
+            save_pt(saved_tensor, file_path)
         return single_arg
@@ -182,7 +276,7 @@ class OverflowCheckDataProcessor(PytorchDataProcessor):
         super().__init__(config, data_writer)
         self.has_overflow = False
         self.support_inf_nan = None
-        self.cached_inplace_api_info = {}
+        self.cached_api_info = {}
         self.cached_tensors_and_file_paths = {}
         self.bits_for_overflow = 8
         self.real_overflow_nums = 0
@@ -196,21 +290,21 @@ class OverflowCheckDataProcessor(PytorchDataProcessor):
             return True
         return False
-    def analyze_pre_forward_inplace(self, name, module_input_output: ModuleForwardInputsOutputs):
+    def analyze_forward_input(self, name, module, module_input_output: ModuleForwardInputsOutputs):
         self.has_overflow = False
         self._is_support_inf_nan()
-        self.cached_inplace_api_info = super().analyze_pre_forward_inplace(name, module_input_output)
+        self.cached_api_info = super().analyze_forward_input(name, module, module_input_output)
         return None
-    def analyze_forward_inplace(self, name, module_input_output: ModuleForwardInputsOutputs):
+    def analyze_forward_output(self, name, module, module_input_output: ModuleForwardInputsOutputs):
         self._is_support_inf_nan()
-        api_info_struct = super().analyze_forward_inplace(name, module_input_output)
-        if name in self.cached_inplace_api_info and name in api_info_struct:
-            self.cached_inplace_api_info[name].update(api_info_struct[name])
+        api_info_struct = super().analyze_forward_output(name, module, module_input_output)
+        if name in self.cached_api_info and name in api_info_struct:
+            self.cached_api_info[name].update(api_info_struct[name])
         elif name in api_info_struct:
-            self.cached_inplace_api_info = api_info_struct
+            self.cached_api_info = api_info_struct
         self.handle_overflow()
-        return self.cached_inplace_api_info if self.has_overflow else None
+        return self.cached_api_info if self.has_overflow else None
     def analyze_forward(self, name, module, module_input_output: ModuleForwardInputsOutputs):
         self.has_overflow = False
@@ -225,6 +319,13 @@ class OverflowCheckDataProcessor(PytorchDataProcessor):
         api_info_struct = super().analyze_backward(name, module, module_input_output)
         self.handle_overflow()
         return api_info_struct if self.has_overflow else None
+    def analyze_params(self, name, param_name, grad):
+        """Collect parameter-gradient info and return it only when overflow was flagged.
+
+        Clears ``has_overflow``, calls ``_is_support_inf_nan()``, delegates the
+        collection to the parent class and then runs ``handle_overflow()``.
+        Returns the parent's result if ``has_overflow`` is set, otherwise None.
+        """
+        self.has_overflow = False
+        self._is_support_inf_nan()
+        collected = super().analyze_params(name, param_name, grad)
+        self.handle_overflow()
+        if self.has_overflow:
+            return collected
+        return None
     def handle_overflow(self):
         if not self.support_inf_nan:
@@ -299,10 +400,10 @@ class FreeBenchmarkDataProcessor(PytorchDataProcessor):
             )
         return
-    def analyze_pre_forward(self, name, module, module_input_output: ModuleForwardInputsOutputs):
+    def analyze_forward_input(self, name, module, module_input_output: ModuleForwardInputsOutputs):
+        """Pass the call's positional and keyword arguments to ``self.checker.pre_forward``."""
         self.checker.pre_forward(name, module, self, module_input_output.args, module_input_output.kwargs)
-    def analyze_forward(self, name, module, module_input_output: ModuleForwardInputsOutputs):
+    def analyze_forward_output(self, name, module, module_input_output: ModuleForwardInputsOutputs):
         new_output, unequal_rows = self.checker.forward(
             name,
             module,
@@ -320,64 +421,120 @@ class FreeBenchmarkDataProcessor(PytorchDataProcessor):
 class KernelDumpDataProcessor(PytorchDataProcessor):
-    forward_init_status = False
-    multi_output_apis = ["_sort_", "npu_flash_attention"]
     def __init__(self, config, data_writer):
+        """Set up the processor with kernel dump enabled and nothing captured yet."""
         super().__init__(config, data_writer)
+        # Switch that the analyze_* hooks clear once a dump was attempted or ruled out.
+        self.enable_kernel_dump = True
+        # Flags set by analyze_single_element when it captures a tensor.
+        self.is_found_output_tensor = False
+        self.is_found_grad_input_tensor = False
+        # Cloned forward inputs, filled in analyze_forward_input for the backward dump.
+        self.forward_args = None
+        self.forward_kwargs = None
+        # Tensors captured by analyze_single_element and used in analyze_backward.
+        self.forward_output_tensor = None
+        self.grad_input_tensor = None
+    @staticmethod
+    def start_kernel_dump(config_path):
+        """Call ``init_dump()`` then ``set_dump(config_path)`` on ``torch_npu.npu``.
+
+        The pair is bracketed by ``torch_npu.npu.synchronize()`` calls.
+        """
+        torch_npu.npu.synchronize()
+        torch_npu.npu.init_dump()
+        torch_npu.npu.set_dump(config_path)
+        torch_npu.npu.synchronize()
+    @staticmethod
+    def stop_kernel_dump():
+        """Call ``torch_npu.npu.finalize_dump()`` between two ``synchronize()`` calls."""
+        torch_npu.npu.synchronize()
+        torch_npu.npu.finalize_dump()
+        torch_npu.npu.synchronize()
+    @staticmethod
+    def _print_unsupported_log(api_name):
+        """Log a warning saying that kernel dump does not support ``api_name``."""
+        message = f"The kernel dump does not support the {api_name} API."
+        logger.warning(message)
+    def analyze_forward_input(self, name, module, module_input_output):
+        """Start a forward kernel dump, or prepare the state for a backward one.
+
+        Does nothing when kernel dump is disabled. When ``is_gpu`` is truthy the
+        dump is disabled with a warning. In forward mode the dump is started
+        here. In backward mode (``config.is_backward_kernel_dump``) the inputs
+        are cloned, ``module.forward`` is run on the clones and the result is
+        fed to ``analyze_element``; kernel dump is disabled when that call
+        raises or when ``is_found_output_tensor`` is still unset afterwards.
+        """
+        if not self.enable_kernel_dump:
+            return
+
+        if is_gpu:
+            logger.warning("The current environment is not a complete NPU environment, and kernel dump cannot be used.")
+            self.enable_kernel_dump = False
+            return
+
+        if not self.config.is_backward_kernel_dump:
+            # Forward mode: dumping starts now; analyze_forward_output stops it.
+            self.start_kernel_dump(self.config.kernel_config_path)
+            return
+
+        # Backward mode: run forward on cloned inputs and analyze its output.
+        self.forward_args = self.clone_and_detach_tensor(module_input_output.args)
+        self.forward_kwargs = self.clone_and_detach_tensor(module_input_output.kwargs)
+        try:
+            replayed_output = module.forward(*self.forward_args, **self.forward_kwargs)
+        except Exception:
+            self._print_unsupported_log(name)
+            self.enable_kernel_dump = False
+            return
+
+        self.analyze_element(convert_tuple(replayed_output))
+        if not self.is_found_output_tensor:
+            self._print_unsupported_log(name)
+            self.enable_kernel_dump = False
+    def analyze_forward_output(self, name, module, module_input_output):
+        """Stop a forward kernel dump and disable further dumping.
+
+        Returns immediately when kernel dump is disabled or when the processor
+        is configured for backward kernel dump.
+        """
+        if not self.enable_kernel_dump or self.config.is_backward_kernel_dump:
+            return
+        self.enable_kernel_dump = False
+        self.stop_kernel_dump()
+        logger.info(f"The kernel data of {name} is dumped successfully.")
+    def analyze_backward(self, name, module, module_input_output):
+        """Run a backward pass on the stored forward output while dumping kernels.
+
+        Does nothing when kernel dump is disabled. Otherwise disables it, feeds
+        ``module_input_output.grad_input`` to ``analyze_element`` and, if
+        ``is_found_grad_input_tensor`` is set afterwards, calls ``backward`` on
+        ``forward_output_tensor`` between ``start_kernel_dump`` and
+        ``stop_kernel_dump``.
+        """
+        if not self.enable_kernel_dump:
+            return
+        # Cleared before any work, so every exit path below leaves dumping off.
+        self.enable_kernel_dump = False
+        self.analyze_element(module_input_output.grad_input)
+        if not self.is_found_grad_input_tensor:
+            self._print_unsupported_log(name)
+            return
+        self.start_kernel_dump(self.config.kernel_config_path)
+        try:
+            self.forward_output_tensor.backward(self.grad_input_tensor, retain_graph=True)
+        except Exception:
+            # The dump is already started here, so it is stopped before returning.
+            self._print_unsupported_log(name)
+            self.stop_kernel_dump()
+            return
-    def analyze_forward(self, name, module, module_input_output):
-        if self.config.is_forward_acl_dump:
-            self.forward_acl_dump(name, module, module_input_output)
+        self.stop_kernel_dump()
+        logger.info(f"The kernel data of {name} is dumped successfully.")
+    @recursion_depth_decorator("KernelDump: KernelDumpDataProcessor.clone_and_detach_tensor")
+    def clone_and_detach_tensor(self, input_params):
+        """Return a copy of ``input_params`` with every tensor cloned.
+
+        A tensor that requires grad is cloned, detached and marked with
+        ``requires_grad_()`` again; any other tensor is only cloned. Tuples,
+        lists and dicts are rebuilt with their items processed recursively.
+        Any other value is returned unchanged.
+        """
+        if isinstance(input_params, torch.Tensor):
+            if input_params.requires_grad:
+                return input_params.clone().detach().requires_grad_()
+            return input_params.clone()
+        elif isinstance(input_params, tuple):
+            return tuple(self.clone_and_detach_tensor(x) for x in input_params)
+        elif isinstance(input_params, list):
+            return list(self.clone_and_detach_tensor(x) for x in input_params)
+        elif isinstance(input_params, dict):
+            return {k: self.clone_and_detach_tensor(v) for k, v in input_params.items()}
         else:
-            self.dump_mode_backward_acl_dump(name, module, module_input_output)
-    def forward_acl_dump(self, name, module, module_input_output):
-        if not KernelDumpDataProcessor.forward_init_status:
-            KernelDumpDataProcessor.forward_init_status = True
-            torch_npu.npu.synchronize()
-            torch_npu.npu.init_dump()
-            torch_npu.npu.set_dump(self.config.acl_config)
-            torch_npu.npu.synchronize()
-            if self.op_need_trigger(name):
-                module.forward(*module_input_output.args, **module_input_output.kwargs).cpu()
-            else:
-                module.forward(*module_input_output.args, **module_input_output.kwargs)
-            torch_npu.npu.synchronize()
-            torch_npu.npu.finalize_dump()
-            torch_npu.npu.synchronize()
-        KernelDumpDataProcessor.forward_init_status = False
-        logger.info("Dump %s op file." % name)
-    def acl_backward_dump_status(self, output, grad, module_name):
-        if isinstance(output, torch.Tensor):
-            output.backward(grad, retain_graph=True)
-            return True
+            return input_params
-        for api_name in KernelDumpDataProcessor.multi_output_apis:
-            if api_name in module_name:
-                output[0].backward(grad, retain_graph=True)
-                return True
-        return False
+    def analyze_single_element(self, element, suffix_stack):
+        """Capture the forward output tensor first, then the grad input tensor.
+
+        Until an output tensor has been found, only a tensor with
+        ``requires_grad`` set is stored (as ``forward_output_tensor``). After
+        that, the first tensor seen is cloned into ``grad_input_tensor``.
+        ``suffix_stack`` is not used. Always returns an empty dict.
+        """
+        if not isinstance(element, torch.Tensor):
+            return {}
+
+        if not self.is_found_output_tensor:
+            # Still searching for the output: tensors without grad are ignored.
+            if element.requires_grad:
+                self.forward_output_tensor = element
+                self.is_found_output_tensor = True
+        elif not self.is_found_grad_input_tensor:
+            self.grad_input_tensor = element.clone()
+            self.is_found_grad_input_tensor = True
+        return {}
-    def dump_mode_backward_acl_dump(self, name, module, module_input_output):
-        grad_path = self.config.backward_input.get(name)
-        if not KernelDumpDataProcessor.forward_init_status:
-            KernelDumpDataProcessor.forward_init_status = True
-            output = module.forward(*module_input_output.args, **module_input_output.kwargs)
-            pt = load_pt(grad_path)
-            grad = pt.to("npu").requires_grad_()
-            torch_npu.npu.init_dump()
-            torch_npu.npu.set_dump(self.config.acl_config)
-            torch_npu.npu.synchronize()
-            if not self.acl_backward_dump_status(output, grad, name):
-                logger.warning("The output of {} is not of tensor type and cannot be automatically derived. "
-                               "you can manually construct a single API backward case for ACL dump.".format(
-                    name))
-            torch_npu.npu.synchronize()
-            torch_npu.npu.finalize_dump()
-        KernelDumpDataProcessor.forward_init_status = False
-        logger.info("Dump %s op file." % name)
-    def op_need_trigger(self, module_name):
-        return 'Tensor.__getitem__.' in module_name
+    def reset_status(self):
+        """Re-enable kernel dump and clear every captured flag and tensor."""
+        self.enable_kernel_dump = True
+        self.is_found_output_tensor = self.is_found_grad_input_tensor = False
+        self.forward_args = self.forward_kwargs = None
+        self.forward_output_tensor = self.grad_input_tensor = None

msprobe/core/data_dump/json_writer.py CHANGED Viewed

@@ -15,10 +15,12 @@
 import csv
 import os
+import numpy as np
 from msprobe.core.common.const import Const, FileCheckConst
-from msprobe.core.common.file_utils import change_mode, FileOpen, save_json
+from msprobe.core.common.file_utils import change_mode, FileOpen, save_json, load_json
 from msprobe.core.common.log import logger
+from msprobe.core.common.exceptions import MsprobeException
 class DataWriter:
@@ -115,3 +117,29 @@ class DataWriter:
             self.write_stack_info_json(self.stack_file_path)
         if self.cache_construct:
             self.write_construct_info_json(self.construct_file_path)
+    def fill_stack_tensor_data(self):
+        """Run ``process_stat_data_recursive`` over ``self.cache_data``."""
+        self.process_stat_data_recursive(self.cache_data)
+    def process_stat_data_recursive(self, data, depth=0):
+        """Expand every ``"tensor_stat"`` entry found in ``data`` into plain keys.
+
+        Dicts, lists and tuples are walked recursively. In a dict that has a
+        ``"tensor_stat"`` key, element 0 of that value supplies the key names
+        and element 1 the matching values; each name is written into the dict
+        with ``value.item()``. The ``"tensor_stat"`` key is then removed,
+        including when the entry was malformed (a warning is logged instead).
+
+        Args:
+            data: Nested structure that is rewritten in place.
+            depth: Current recursion depth; incremented on each nested call.
+
+        Raises:
+            MsprobeException: When ``depth`` exceeds ``Const.MAX_DEPTH``.
+        """
+        if depth > Const.MAX_DEPTH:
+            logger.error(f"The maximum depth of recursive process stat data, {Const.MAX_DEPTH} is reached.")
+            raise MsprobeException(MsprobeException.RECURSION_LIMIT_ERROR)
+        if isinstance(data, dict):
+            if "tensor_stat" in data:
+                tensor_stat = data["tensor_stat"]
+                if len(tensor_stat) != Const.TENSOR_STAT_LEN or len(tensor_stat[0]) != len(tensor_stat[1]):
+                    logger.warning("Some bad data in async dump")
+                else:
+                    tensor_stat_index, tensor_stat_data = tensor_stat[0], tensor_stat[1]
+                    if hasattr(tensor_stat_data, "device") and tensor_stat_data.device != Const.CPU_LOWERCASE:
+                        tensor_stat_data = tensor_stat_data.cpu()
+                    for index, stat in zip(tensor_stat_index, tensor_stat_data):
+                        # Plain item assignment: ``{index, value}`` would be a set
+                        # literal, which dict.update() cannot consume as key/value.
+                        data[index] = stat.item()
+                del data["tensor_stat"]
+            else:
+                # Only nested containers are mutated, so iterating values() is safe.
+                for value in data.values():
+                    self.process_stat_data_recursive(value, depth + 1)
+        elif isinstance(data, (list, tuple)):
+            for item in data:
+                self.process_stat_data_recursive(item, depth + 1)

mindstudio-probe 1.1.0__py3-none-any.whl → 1.2.1__py3-none-any.whl

mindstudio-probe 1.1.0py3-none-any.whl → 1.2.1py3-none-any.whl