mindstudio-probe 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mindstudio_probe-1.0.1.dist-info/LICENSE +201 -0
- mindstudio_probe-1.0.1.dist-info/METADATA +30 -0
- mindstudio_probe-1.0.1.dist-info/RECORD +228 -0
- mindstudio_probe-1.0.1.dist-info/WHEEL +5 -0
- mindstudio_probe-1.0.1.dist-info/entry_points.txt +2 -0
- mindstudio_probe-1.0.1.dist-info/top_level.txt +1 -0
- msprobe/README.md +182 -0
- msprobe/__init__.py +0 -0
- msprobe/config/README.md +397 -0
- msprobe/config/config.json +28 -0
- msprobe/config/img/free_benchmark.png +0 -0
- msprobe/core/common/const.py +241 -0
- msprobe/core/common/exceptions.py +88 -0
- msprobe/core/common/file_check.py +265 -0
- msprobe/core/common/log.py +55 -0
- msprobe/core/common/utils.py +516 -0
- msprobe/core/common_config.py +58 -0
- msprobe/core/data_dump/data_collector.py +140 -0
- msprobe/core/data_dump/data_processor/base.py +245 -0
- msprobe/core/data_dump/data_processor/factory.py +61 -0
- msprobe/core/data_dump/data_processor/pytorch_processor.py +346 -0
- msprobe/core/data_dump/json_writer.py +116 -0
- msprobe/core/data_dump/scope.py +178 -0
- msprobe/mindspore/__init__.py +1 -0
- msprobe/mindspore/debugger/__init__.py +0 -0
- msprobe/mindspore/debugger/debugger_config.py +51 -0
- msprobe/mindspore/debugger/precision_debugger.py +32 -0
- msprobe/mindspore/doc/dump.md +65 -0
- msprobe/mindspore/dump/__init__.py +0 -0
- msprobe/mindspore/dump/api_kbk_dump.py +55 -0
- msprobe/mindspore/dump/dump_tool_factory.py +38 -0
- msprobe/mindspore/dump/kernel_graph_dump.py +60 -0
- msprobe/mindspore/ms_config.py +78 -0
- msprobe/mindspore/overflow_check/__init__.py +0 -0
- msprobe/mindspore/overflow_check/kernel_graph_overflow_check.py +45 -0
- msprobe/mindspore/overflow_check/overflow_check_tool_factory.py +32 -0
- msprobe/mindspore/task_handler_factory.py +21 -0
- msprobe/msprobe.py +67 -0
- msprobe/pytorch/__init__.py +4 -0
- msprobe/pytorch/advisor/advisor.py +124 -0
- msprobe/pytorch/advisor/advisor_const.py +59 -0
- msprobe/pytorch/advisor/advisor_result.py +58 -0
- msprobe/pytorch/api_accuracy_checker/.keep +0 -0
- msprobe/pytorch/api_accuracy_checker/__init__.py +0 -0
- msprobe/pytorch/api_accuracy_checker/common/.keep +0 -0
- msprobe/pytorch/api_accuracy_checker/common/__init__.py +0 -0
- msprobe/pytorch/api_accuracy_checker/common/config.py +50 -0
- msprobe/pytorch/api_accuracy_checker/common/utils.py +224 -0
- msprobe/pytorch/api_accuracy_checker/compare/__init__.py +0 -0
- msprobe/pytorch/api_accuracy_checker/compare/algorithm.py +216 -0
- msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +545 -0
- msprobe/pytorch/api_accuracy_checker/compare/api_precision_standard.yaml +133 -0
- msprobe/pytorch/api_accuracy_checker/compare/api_precision_threshold.yaml +390 -0
- msprobe/pytorch/api_accuracy_checker/compare/compare.py +345 -0
- msprobe/pytorch/api_accuracy_checker/compare/compare_column.py +74 -0
- msprobe/pytorch/api_accuracy_checker/compare/compare_utils.py +249 -0
- msprobe/pytorch/api_accuracy_checker/config.yaml +4 -0
- msprobe/pytorch/api_accuracy_checker/run_ut/.keep +0 -0
- msprobe/pytorch/api_accuracy_checker/run_ut/__init__.py +0 -0
- msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py +328 -0
- msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +203 -0
- msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +127 -0
- msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +493 -0
- msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py +7 -0
- msprobe/pytorch/api_accuracy_checker/run_ut/torch_ut_setting.json +5 -0
- msprobe/pytorch/common/__init__.py +2 -0
- msprobe/pytorch/common/compare_script.template +14 -0
- msprobe/pytorch/common/log.py +32 -0
- msprobe/pytorch/common/parse_json.py +37 -0
- msprobe/pytorch/common/utils.py +224 -0
- msprobe/pytorch/compare/acc_compare.py +1024 -0
- msprobe/pytorch/compare/distributed_compare.py +111 -0
- msprobe/pytorch/compare/highlight.py +100 -0
- msprobe/pytorch/compare/mapping.yaml +607 -0
- msprobe/pytorch/compare/match.py +36 -0
- msprobe/pytorch/compare/npy_compare.py +244 -0
- msprobe/pytorch/debugger/__init__.py +0 -0
- msprobe/pytorch/debugger/debugger_config.py +86 -0
- msprobe/pytorch/debugger/precision_debugger.py +95 -0
- msprobe/pytorch/doc/FAQ.md +193 -0
- msprobe/pytorch/doc/api_accuracy_checker.md +269 -0
- msprobe/pytorch/doc/atat/321/207/342/226/223/342/225/233/321/205/342/225/221/320/266/321/205/342/225/226/320/265/321/205/320/225/342/225/226/321/206/320/245/342/226/221/321/206/320/235/320/276dump/321/206/320/260/320/227/321/205/320/227/320/226/321/206/320/220/320/267/321/210/320/223/342/225/234/321/205/320/257/342/225/221/321/207/342/225/221/342/224/220/321/206/320/232/320/265/321/205/320/241/320/232.md +182 -0
- msprobe/pytorch/doc/dump.md +207 -0
- msprobe/pytorch/doc/img/BLOOM-7B_1.png +0 -0
- msprobe/pytorch/doc/img/BLOOM-7B_2.png +0 -0
- msprobe/pytorch/doc/img/BLOOM-7B_3.png +0 -0
- msprobe/pytorch/doc/img/BLOOM-7B_4.png +0 -0
- msprobe/pytorch/doc/img/GPT-3_1.png +0 -0
- msprobe/pytorch/doc/img/GPT-3_2.png +0 -0
- msprobe/pytorch/doc/img/GPT-3_3.png +0 -0
- msprobe/pytorch/doc/img/GPT-3_4.png +0 -0
- msprobe/pytorch/doc/img/GPT-3_5.png +0 -0
- msprobe/pytorch/doc/img/GPT-3_6.png +0 -0
- msprobe/pytorch/doc/img/GPT-3_7.png +0 -0
- msprobe/pytorch/doc/img/GPT-3_8.png +0 -0
- msprobe/pytorch/doc/img/YOLOV5S_1.png +0 -0
- msprobe/pytorch/doc/img/YOLOV5S_2.png +0 -0
- msprobe/pytorch/doc/img/accuracy_checking_details.png +0 -0
- msprobe/pytorch/doc/img/accuracy_checking_result.png +0 -0
- msprobe/pytorch/doc/img/api_precision_compare_details.png +0 -0
- msprobe/pytorch/doc/img/api_precision_compare_result.png +0 -0
- msprobe/pytorch/doc/img/auto_analyze_log.png +0 -0
- msprobe/pytorch/doc/img/compare_result_pkl.png +0 -0
- msprobe/pytorch/doc/img/compare_result_pkl_md5.png.png +0 -0
- msprobe/pytorch/doc/img/cpu_info.png +0 -0
- msprobe/pytorch/doc/img/module_compare.png +0 -0
- msprobe/pytorch/doc/parse_tool.md +286 -0
- msprobe/pytorch/doc/ptdbg_ascend_compare.md +176 -0
- msprobe/pytorch/doc/ptdbg_ascend_overview.md +68 -0
- msprobe/pytorch/doc/ptdbg_ascend_quickstart.md +381 -0
- msprobe/pytorch/doc/run_overflow_check.md +25 -0
- msprobe/pytorch/doc//321/205/320/254/320/270/321/207/342/225/221/342/224/220/321/207/342/226/223/342/225/233/321/205/342/225/221/320/266/321/206/320/277/320/244/321/205/320/277/342/225/243.md +90 -0
- msprobe/pytorch/free_benchmark/__init__.py +8 -0
- msprobe/pytorch/free_benchmark/common/__init__.py +0 -0
- msprobe/pytorch/free_benchmark/common/constant.py +67 -0
- msprobe/pytorch/free_benchmark/common/counter.py +72 -0
- msprobe/pytorch/free_benchmark/common/enums.py +37 -0
- msprobe/pytorch/free_benchmark/common/params.py +129 -0
- msprobe/pytorch/free_benchmark/common/utils.py +98 -0
- msprobe/pytorch/free_benchmark/compare/grad_saver.py +183 -0
- msprobe/pytorch/free_benchmark/compare/single_benchmark.py +104 -0
- msprobe/pytorch/free_benchmark/main.py +102 -0
- msprobe/pytorch/free_benchmark/perturbed_layers/__init__.py +0 -0
- msprobe/pytorch/free_benchmark/perturbed_layers/base_layer.py +13 -0
- msprobe/pytorch/free_benchmark/perturbed_layers/layer_factory.py +41 -0
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/__init__.py +0 -0
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py +90 -0
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py +104 -0
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/change_value.py +63 -0
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py +68 -0
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/no_change.py +28 -0
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/npu_base_layser.py +45 -0
- msprobe/pytorch/free_benchmark/perturbed_layers/run_cpu.py +19 -0
- msprobe/pytorch/free_benchmark/result_handlers/__init__.py +0 -0
- msprobe/pytorch/free_benchmark/result_handlers/base_handler.py +203 -0
- msprobe/pytorch/free_benchmark/result_handlers/check_handler.py +39 -0
- msprobe/pytorch/free_benchmark/result_handlers/fix_handler.py +24 -0
- msprobe/pytorch/free_benchmark/result_handlers/handler_factory.py +31 -0
- msprobe/pytorch/free_benchmark/result_handlers/preheat_handler.py +170 -0
- msprobe/pytorch/functional/__init__.py +0 -0
- msprobe/pytorch/functional/data_processor.py +0 -0
- msprobe/pytorch/functional/dump_module.py +39 -0
- msprobe/pytorch/hook_module/__init__.py +1 -0
- msprobe/pytorch/hook_module/api_registry.py +161 -0
- msprobe/pytorch/hook_module/hook_module.py +109 -0
- msprobe/pytorch/hook_module/support_wrap_ops.yaml +1876 -0
- msprobe/pytorch/hook_module/utils.py +29 -0
- msprobe/pytorch/hook_module/wrap_aten.py +100 -0
- msprobe/pytorch/hook_module/wrap_distributed.py +75 -0
- msprobe/pytorch/hook_module/wrap_functional.py +108 -0
- msprobe/pytorch/hook_module/wrap_npu_custom.py +73 -0
- msprobe/pytorch/hook_module/wrap_tensor.py +72 -0
- msprobe/pytorch/hook_module/wrap_torch.py +88 -0
- msprobe/pytorch/hook_module/wrap_vf.py +64 -0
- msprobe/pytorch/module_processer.py +98 -0
- msprobe/pytorch/online_dispatch/__init__.py +20 -0
- msprobe/pytorch/online_dispatch/compare.py +236 -0
- msprobe/pytorch/online_dispatch/dispatch.py +274 -0
- msprobe/pytorch/online_dispatch/dump_compare.py +186 -0
- msprobe/pytorch/online_dispatch/single_compare.py +391 -0
- msprobe/pytorch/online_dispatch/torch_ops_config.yaml +50 -0
- msprobe/pytorch/online_dispatch/utils.py +187 -0
- msprobe/pytorch/parse.py +4 -0
- msprobe/pytorch/parse_tool/__init__.py +0 -0
- msprobe/pytorch/parse_tool/cli.py +32 -0
- msprobe/pytorch/parse_tool/lib/__init__.py +0 -0
- msprobe/pytorch/parse_tool/lib/compare.py +259 -0
- msprobe/pytorch/parse_tool/lib/config.py +51 -0
- msprobe/pytorch/parse_tool/lib/file_desc.py +31 -0
- msprobe/pytorch/parse_tool/lib/interactive_cli.py +102 -0
- msprobe/pytorch/parse_tool/lib/parse_exception.py +54 -0
- msprobe/pytorch/parse_tool/lib/parse_tool.py +158 -0
- msprobe/pytorch/parse_tool/lib/utils.py +367 -0
- msprobe/pytorch/parse_tool/lib/visualization.py +90 -0
- msprobe/pytorch/pt_config.py +93 -0
- msprobe/pytorch/service.py +167 -0
- msprobe/test/core_ut/common/test_utils.py +345 -0
- msprobe/test/core_ut/data_dump/test_data_collector.py +47 -0
- msprobe/test/core_ut/data_dump/test_json_writer.py +183 -0
- msprobe/test/core_ut/data_dump/test_scope.py +151 -0
- msprobe/test/core_ut/test_common_config.py +152 -0
- msprobe/test/core_ut/test_file_check.py +218 -0
- msprobe/test/core_ut/test_log.py +109 -0
- msprobe/test/mindspore_ut/test_api_kbk_dump.py +51 -0
- msprobe/test/mindspore_ut/test_debugger_config.py +42 -0
- msprobe/test/mindspore_ut/test_dump_tool_factory.py +51 -0
- msprobe/test/mindspore_ut/test_kernel_graph_dump.py +66 -0
- msprobe/test/mindspore_ut/test_kernel_graph_overflow_check.py +63 -0
- msprobe/test/mindspore_ut/test_ms_config.py +69 -0
- msprobe/test/mindspore_ut/test_overflow_check_tool_factory.py +51 -0
- msprobe/test/mindspore_ut/test_precision_debugger.py +56 -0
- msprobe/test/mindspore_ut/test_task_handler_factory.py +58 -0
- msprobe/test/pytorch_ut/advisor/test_advisor.py +83 -0
- msprobe/test/pytorch_ut/api_accuracy_checker/common/test_common_utils.py +108 -0
- msprobe/test/pytorch_ut/api_accuracy_checker/common/test_config.py +39 -0
- msprobe/test/pytorch_ut/api_accuracy_checker/compare/test_algorithm.py +112 -0
- msprobe/test/pytorch_ut/api_accuracy_checker/compare/test_api_precision_compare.py +77 -0
- msprobe/test/pytorch_ut/api_accuracy_checker/compare/test_compare.py +125 -0
- msprobe/test/pytorch_ut/api_accuracy_checker/compare/test_compare_column.py +10 -0
- msprobe/test/pytorch_ut/api_accuracy_checker/compare/test_compare_utils.py +43 -0
- msprobe/test/pytorch_ut/api_accuracy_checker/run_ut/dump.json +179 -0
- msprobe/test/pytorch_ut/api_accuracy_checker/run_ut/forward.json +63 -0
- msprobe/test/pytorch_ut/api_accuracy_checker/run_ut/test_data_generate.py +99 -0
- msprobe/test/pytorch_ut/api_accuracy_checker/run_ut/test_multi_run_ut.py +115 -0
- msprobe/test/pytorch_ut/api_accuracy_checker/run_ut/test_run_ut.py +72 -0
- msprobe/test/pytorch_ut/compare/test_acc_compare.py +17 -0
- msprobe/test/pytorch_ut/free_benchmark/perturbed_layers/test_perturbed_layser.py +105 -0
- msprobe/test/pytorch_ut/free_benchmark/result_handlers/test_result_handler.py +121 -0
- msprobe/test/pytorch_ut/free_benchmark/test_main.py +101 -0
- msprobe/test/pytorch_ut/functional/test_dump_module.py +15 -0
- msprobe/test/pytorch_ut/hook_module/test_api_registry.py +130 -0
- msprobe/test/pytorch_ut/hook_module/test_hook_module.py +42 -0
- msprobe/test/pytorch_ut/hook_module/test_wrap_aten.py +65 -0
- msprobe/test/pytorch_ut/hook_module/test_wrap_distributed.py +35 -0
- msprobe/test/pytorch_ut/hook_module/test_wrap_functional.py +20 -0
- msprobe/test/pytorch_ut/hook_module/test_wrap_tensor.py +35 -0
- msprobe/test/pytorch_ut/hook_module/test_wrap_torch.py +43 -0
- msprobe/test/pytorch_ut/hook_module/test_wrap_vf.py +11 -0
- msprobe/test/pytorch_ut/test_pt_config.py +69 -0
- msprobe/test/pytorch_ut/test_service.py +59 -0
- msprobe/test/resources/advisor.txt +3 -0
- msprobe/test/resources/compare_result_20230703104808.csv +9 -0
- msprobe/test/resources/compare_result_without_accuracy.csv +9 -0
- msprobe/test/resources/config.yaml +3 -0
- msprobe/test/resources/npu_test.pkl +8 -0
- msprobe/test/run_test.sh +30 -0
- msprobe/test/run_ut.py +58 -0
- msprobe/test/test_module_processer.py +64 -0
|
@@ -0,0 +1,346 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import zlib
|
|
3
|
+
from dataclasses import asdict
|
|
4
|
+
from typing import List
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
import torch
|
|
8
|
+
from msprobe.core.common.exceptions import MsaccException
|
|
9
|
+
from msprobe.core.common.file_check import path_len_exceeds_limit, change_mode
|
|
10
|
+
from msprobe.core.common.log import logger
|
|
11
|
+
from msprobe.core.common.const import Const, OverflowConst, FileCheckConst
|
|
12
|
+
from msprobe.core.data_dump.data_processor.base import BaseDataProcessor, ModuleBackwardInputsOutputs, \
|
|
13
|
+
ModuleForwardInputsOutputs, TensorStatInfo
|
|
14
|
+
from msprobe.pytorch.free_benchmark import FreeBenchmarkCheck, UnequalRow
|
|
15
|
+
|
|
16
|
+
try:
|
|
17
|
+
import torch_npu
|
|
18
|
+
except ImportError:
|
|
19
|
+
pass
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class PytorchDataProcessor(BaseDataProcessor):
    """Serialize PyTorch API arguments/outputs into JSON-friendly dicts.

    For tensors the processor records dtype, shape and summary statistics
    (Max/Min/Mean/Norm) plus, in "md5" summary mode, a CRC32 digest of the
    raw tensor bytes.
    """

    # torch-specific types that get dedicated handling on top of the base
    # processor's builtin/numpy support
    pytorch_special_type = (torch.device, torch.dtype, torch.Size, torch.Tensor)

    def __init__(self, config, data_writer):
        super().__init__(config, data_writer)
        # kwargs dispatched by keyword name rather than by value type
        self.torch_object_key = {
            "device": self.analyze_device_in_kwargs,
            "dtype": self.analyze_dtype_in_kwargs
        }

    @staticmethod
    def get_md5_for_tensor(x):
        """Return an 8-hex-digit digest of the tensor's raw bytes.

        NOTE: despite the name this is a CRC32 (zlib) checksum, not MD5;
        the dump JSON key is "md5" for historical reasons.
        """
        if x.dtype == torch.bfloat16:
            # numpy cannot represent bfloat16, so widen to float32 first
            x = x.float()
        tensor_bytes = x.cpu().detach().numpy().tobytes()
        crc32_hash = zlib.crc32(tensor_bytes)
        return f"{crc32_hash:08x}"

    @staticmethod
    def analyze_device_in_kwargs(element):
        """Describe a ``device=`` kwarg (torch.device object or plain string)."""
        single_arg = {}
        single_arg.update({'type': "torch.device"})
        if not isinstance(element, str):
            # NOTE(review): torch.device always exposes .index (possibly None),
            # so this branch may emit e.g. "cpu:None" for index-less devices —
            # confirm that is the intended dump format.
            if hasattr(element, "index"):
                device_value = element.type + ":" + str(element.index)
            else:
                device_value = element.type
            single_arg.update({"value": device_value})
        else:
            single_arg.update({"value": element})
        return single_arg

    @staticmethod
    def analyze_dtype_in_kwargs(element):
        """Describe a ``dtype=`` kwarg as its string form, e.g. "torch.float32"."""
        return {"type": "torch.dtype", "value": str(element)}

    @staticmethod
    def get_stat_info(data):
        """Compute max/min/mean/norm statistics for a tensor.

        Meta and empty tensors yield an all-None TensorStatInfo; bool tensors
        get only max/min; 0-dim tensors use their scalar item for all fields.
        """
        tensor_stat = TensorStatInfo()
        if data.is_meta:
            # meta tensors carry no storage — nothing to measure
            return tensor_stat
        data_clone = data.detach()
        if data_clone.numel() == 0:
            return tensor_stat
        elif data_clone.dtype == torch.bool:
            # "max" == any element True, "min" == all elements True
            tensor_stat.max = True in data_clone
            tensor_stat.min = False not in data_clone
        elif not data_clone.shape:
            tensor_stat.max = tensor_stat.min = tensor_stat.mean = tensor_stat.norm = data_clone.item()
        else:
            if not data_clone.is_floating_point() or data_clone.dtype == torch.float64:
                # mean/norm need a float dtype; note float64 is downcast to float32
                data_clone = data_clone.float()
            # torch._C._VariableFunctionsClass is used instead of the public
            # torch.* functions — presumably to bypass this tool's own hooks
            # on the public torch API; confirm before changing.
            tensor_stat.max = torch._C._VariableFunctionsClass.max(data_clone).item()
            tensor_stat.min = torch._C._VariableFunctionsClass.min(data_clone).item()
            tensor_stat.mean = torch._C._VariableFunctionsClass.mean(data_clone).item()
            tensor_stat.norm = torch._C._VariableFunctionsClass.norm(data_clone).item()
        return tensor_stat

    @staticmethod
    def _analyze_torch_size(arg):
        """Describe a torch.Size as a plain list of ints."""
        return {"type": "torch.Size", "value": list(arg)}

    @classmethod
    def get_special_types(cls):
        """Extend the base processor's special types with the torch ones."""
        return super().get_special_types() + cls.pytorch_special_type

    def analyze_single_element(self, element, suffix_stack):
        """Serialize one element encountered during recursive traversal.

        Dispatch order: kwarg-name handlers ("device"/"dtype"), torch.Size,
        numpy scalars, tensors, plain builtins; anything else maps to None.
        """
        if suffix_stack and suffix_stack[-1] in self.torch_object_key:
            return self.torch_object_key[suffix_stack[-1]](element)
        if isinstance(element, torch.Size):
            return self._analyze_torch_size(element)
        converted_numpy, numpy_type = self._convert_numpy_to_builtin(element)
        if converted_numpy is not element:
            # element was a numpy scalar and has been converted to a builtin
            return self._analyze_numpy(converted_numpy, numpy_type)
        if isinstance(element, torch.Tensor):
            return self._analyze_tensor(element, Const.SEP.join(suffix_stack))
        if isinstance(element, (bool, int, float, str, slice)):
            return self._analyze_builtin(element)
        return None

    def analyze_element(self, element):
        """Recursively transform a (possibly nested) argument structure."""
        return self.recursive_apply_transform(element, self.analyze_single_element)

    def _analyze_tensor(self, tensor, suffix):
        """Build the JSON description of a tensor (stats + optional CRC32)."""
        tensor_stat = self.get_stat_info(tensor)
        tensor_json = {}
        tensor_json.update({'type': 'torch.Tensor'})
        tensor_json.update({'dtype': str(tensor.dtype)})
        tensor_json.update({"shape": tensor.shape})
        tensor_json.update({"Max": tensor_stat.max})
        tensor_json.update({"Min": tensor_stat.min})
        tensor_json.update({"Mean": tensor_stat.mean})
        tensor_json.update({"Norm": tensor_stat.norm})
        tensor_json.update({"requires_grad": tensor.requires_grad})
        if self.config.summary_mode == "md5":
            # "md5" key kept for compatibility; value is actually CRC32
            tensor_md5 = self.get_md5_for_tensor(tensor)
            tensor_json.update({"md5": tensor_md5})
        return tensor_json
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
class StatisticsDataProcessor(PytorchDataProcessor):
    """Statistics-mode processor.

    The base class already records exactly the statistics this mode needs,
    so nothing is overridden.
    """
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
class TensorDataProcessor(PytorchDataProcessor):
    """Tensor-mode processor: additionally persists every analyzed tensor."""

    def _analyze_tensor(self, tensor, suffix):
        """Save the tensor to its dump file, then return the base statistics
        dict extended with the dump file's name."""
        dump_data_name, file_path = self.get_save_file_path(suffix)
        if path_len_exceeds_limit(file_path):
            logger.warning(f'The file path {file_path} length exceeds limit.')
        else:
            torch.save(tensor, file_path)
            change_mode(file_path, FileCheckConst.DATA_FILE_AUTHORITY)
        tensor_info = super()._analyze_tensor(tensor, suffix)
        tensor_info["data_name"] = dump_data_name
        return tensor_info
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
class OverflowCheckDataProcessor(PytorchDataProcessor):
    """Processor that dumps tensors only when an overflow (inf/nan values or
    the NPU overflow flag) is detected in an API's inputs/outputs.

    Tensors are staged in memory during analysis and written to disk only if
    the surrounding forward/backward call turns out to have overflowed.
    """

    __slots__ = ["cached_tensors_and_file_paths"]

    def __init__(self, config, data_writer):
        super().__init__(config, data_writer)
        # file_path -> tensor, staged until we know whether to persist them
        self.cached_tensors_and_file_paths = {}
        self.real_overflow_dump_times = 0
        # -1 means "no limit on the number of overflow dumps"
        self.overflow_nums = config.overflow_num
        self.bits_for_overflow = 8

    @staticmethod
    def overflow_debug_mode_enable():
        """Return True when the overflow debug-mode env switch is turned on."""
        overflow_mode = os.getenv(OverflowConst.OVERFLOW_DEBUG_MODE_ENABLE, Const.ENV_DISABLE)
        return overflow_mode == Const.ENV_ENABLE

    @staticmethod
    def handle_tensor_extremum_nan_inf(data_clone, operator):
        """Return the max/min of the tensor ignoring inf/nan where possible.

        All-nan tensors yield nan; otherwise finite values are preferred and,
        failing that, the non-nan values (i.e. +/-inf) are used.
        """
        data_nan = torch._C._VariableFunctionsClass.isnan(data_clone)
        if int(torch._C._VariableFunctionsClass.sum(data_nan)) == data_clone.numel():
            return float('nan')
        finite_mask = torch._C._VariableFunctionsClass.isfinite(data_clone)
        if int(torch._C._VariableFunctionsClass.sum(finite_mask)) > 0:
            finite_values = data_clone[finite_mask]
            return torch._C._VariableFunctionsClass.max(finite_values).item() if operator == 'max' else \
                torch._C._VariableFunctionsClass.min(finite_values).item()
        else:
            data_no_nan = data_clone[~data_nan]
            return torch._C._VariableFunctionsClass.max(data_no_nan).item() if operator == 'max' else \
                torch._C._VariableFunctionsClass.min(data_no_nan).item()

    def analyze_forward(self, name, module, module_input_output: ModuleForwardInputsOutputs):
        """Run the base forward analysis but keep the result only on overflow."""
        self.has_overflow = False
        api_info_struct = super().analyze_forward(name, module, module_input_output)
        self.maybe_save_overflow_data_and_check_overflow_times()
        return api_info_struct if self.has_overflow else None

    def analyze_backward(self, name, module, module_input_output: ModuleBackwardInputsOutputs):
        """Run the base backward analysis but keep the result only on overflow."""
        self.has_overflow = False
        api_info_struct = super().analyze_backward(name, module, module_input_output)
        self.maybe_save_overflow_data_and_check_overflow_times()
        return api_info_struct if self.has_overflow else None

    def maybe_save_overflow_data_and_check_overflow_times(self):
        """Persist staged tensors if an overflow occurred, then clear the stage."""
        if self.has_overflow:
            for file_path, tensor in self.cached_tensors_and_file_paths.items():
                torch.save(tensor, file_path)
                change_mode(file_path, FileCheckConst.DATA_FILE_AUTHORITY)
            self.inc_and_check_overflow_times()
        self.cached_tensors_and_file_paths = {}

    def inc_and_check_overflow_times(self):
        """Count one overflow dump and stop once the configured limit is hit.

        Raises:
            MsaccException: when overflow_nums is not -1 (unlimited) and the
                number of dumps has reached it.
        """
        self.real_overflow_dump_times += 1
        if self.overflow_nums == -1:
            return
        if self.real_overflow_dump_times >= self.overflow_nums:
            raise MsaccException(MsaccException.OVERFLOW_NUMS_ERROR, str(self.real_overflow_dump_times))

    def check_overflow_npu(self):
        """Query the NPU overflow flag (debug-mode register or torch_npu API).

        Bug fix: this previously called the misspelled
        ``self.overflow_debug_mode_enalbe()``, which raised AttributeError
        whenever the NPU overflow-flag path ran.
        """
        if self.overflow_debug_mode_enable():
            float_status = torch.zeros(self.bits_for_overflow).npu()
            result = torch_npu.npu_get_float_status(float_status, OverflowConst.OVERFLOW_DEBUG_MODE)
            if result.cpu()[0] != 0:
                return True
            else:
                return False
        else:
            return torch_npu._C._check_overflow_npu()

    def clear_overflow_npu(self):
        """Reset the NPU overflow flag so later checks start clean."""
        if self.overflow_debug_mode_enable():
            float_status = torch.zeros(self.bits_for_overflow).npu()
            torch_npu.npu_clear_float_status(float_status, OverflowConst.OVERFLOW_DEBUG_MODE)
        else:
            torch_npu._C._clear_overflow_npu()

    def _analyze_maybe_overflow_tensor(self, tensor_json, tensor):
        """Set self.has_overflow from the tensor's stats or the NPU flag.

        On inf/nan-capable NPUs, overflow is inferred from the already-computed
        Max/Min statistics; otherwise the hardware overflow flag is consulted.
        """
        data_clone = tensor.detach()
        if hasattr(torch_npu._C, '_npu_is_support_inf_nan') and torch_npu._C._npu_is_support_inf_nan():
            if tensor_json['Max'] is None:
                # stats unavailable (e.g. meta/empty tensor) — nothing to check
                return
            if np.isinf(tensor_json['Max']) or np.isnan(tensor_json['Max']):
                tensor_json['Max_except_inf_nan'] = self.handle_tensor_extremum_nan_inf(data_clone, "max")
                self.has_overflow = True
            if np.isinf(tensor_json['Min']) or np.isnan(tensor_json['Min']):
                tensor_json['Min_except_inf_nan'] = self.handle_tensor_extremum_nan_inf(data_clone, "min")
                self.has_overflow = True
        else:
            self.has_overflow = self.check_overflow_npu()
            if self.has_overflow:
                self.clear_overflow_npu()

    def _analyze_tensor(self, tensor, suffix):
        """Stage the tensor for a possible overflow dump and return its stats."""
        dump_data_name, file_path = self.get_save_file_path(suffix)
        if not path_len_exceeds_limit(file_path):
            self.cached_tensors_and_file_paths.update({file_path: tensor})
        else:
            logger.warning(f'The file path {file_path} length exceeds limit.')
        single_arg = super()._analyze_tensor(tensor, suffix)
        self._analyze_maybe_overflow_tensor(single_arg, tensor)
        single_arg.update({"data_name": dump_data_name})
        return single_arg
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
class FreeBenchmarkDataProcessor(PytorchDataProcessor):
    """Processor that delegates perturbation-based accuracy checking to
    FreeBenchmarkCheck and records comparison rows that come out unequal."""

    def __init__(self, config, data_writer):
        super().__init__(config, data_writer)
        self.checker = FreeBenchmarkCheck(config=config)
        self._return_forward_new_output = None
        self._forward_new_output = None

    def update_iter(self, current_iter):
        """Propagate the new iteration index to the checker as well."""
        super().update_iter(current_iter)
        self.checker.update_iter(current_iter)

    def update_unequal_rows(self, unequal_rows: List[UnequalRow]):
        """Append every unequal-comparison row to the free-benchmark CSV."""
        if not unequal_rows:
            return
        csv_path = self.data_writer.free_benchmark_file_path
        for row in unequal_rows:
            row_dict = asdict(row)
            self.data_writer.write_data_to_csv(row_dict.values(), row_dict.keys(), csv_path)

    def analyze_pre_forward(self, name, module, module_input_output: ModuleForwardInputsOutputs):
        """Let the checker see the original inputs before the real forward runs."""
        self.checker.pre_forward(name, module, self, module_input_output.args, module_input_output.kwargs)

    def analyze_forward(self, name, module, module_input_output: ModuleForwardInputsOutputs):
        """Run the perturbed forward; when fixing is enabled, stash the
        replacement output for the service layer to return instead."""
        new_output, unequal_rows = self.checker.forward(
            name,
            module,
            module_input_output.args,
            module_input_output.kwargs,
            module_input_output.output,
        )
        self.update_unequal_rows(unequal_rows)
        if self.checker.if_fix():
            self._return_forward_new_output = True
            self._forward_new_output = new_output

    def analyze_backward(self, name, module, module_input_output: ModuleBackwardInputsOutputs):
        """Forward the gradient outputs to the checker's backward pass."""
        self.checker.backward(name, module, module_input_output.grad_output)
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
class KernelDumpDataProcessor(PytorchDataProcessor):
    """Processor that re-executes a single API with ACL kernel dump enabled.

    Relies on torch_npu's init_dump/set_dump/finalize_dump session cycle; the
    class-level flag guards against re-entrant dumping while a session is
    already in progress.
    """

    # True while an ACL dump session is active (class-wide re-entrancy guard)
    forward_init_status = False
    # APIs whose forward returns a tuple; backward is driven via output[0]
    multi_output_apis = ["_sort_", "npu_flash_attention"]

    def __init__(self, config, data_writer):
        super().__init__(config, data_writer)

    def analyze_forward(self, name, module, module_input_output):
        """Dispatch to forward-mode or backward-mode ACL dump per config."""
        if self.config.is_forward_acl_dump:
            self.forward_acl_dump(name, module, module_input_output)
        else:
            self.dump_mode_backward_acl_dump(name, module, module_input_output)

    def forward_acl_dump(self, name, module, module_input_output):
        """Re-run the API's forward inside an ACL kernel dump session."""
        if not KernelDumpDataProcessor.forward_init_status:
            KernelDumpDataProcessor.forward_init_status = True
            torch_npu.npu.synchronize()
            torch_npu.npu.init_dump()
            torch_npu.npu.set_dump(self.config.acl_config)
            torch_npu.npu.synchronize()
            if self.op_need_trigger(name):
                # .cpu() presumably forces device execution so the kernel
                # actually runs before the dump session ends — confirm
                module.forward(*module_input_output.args, **module_input_output.kwargs).cpu()
            else:
                module.forward(*module_input_output.args, **module_input_output.kwargs)
            torch_npu.npu.synchronize()
            torch_npu.npu.finalize_dump()
            torch_npu.npu.synchronize()
        KernelDumpDataProcessor.forward_init_status = False
        logger.info("Dump %s op file." % name)

    def acl_backward_dump_status(self, output, grad, module_name):
        """Drive backward on the API output; return False if not derivable.

        Tensor outputs are differentiated directly; known multi-output APIs
        are differentiated through their first output element.
        """
        if isinstance(output, torch.Tensor):
            output.backward(grad, retain_graph=True)
            return True

        for api_name in KernelDumpDataProcessor.multi_output_apis:
            if api_name in module_name:
                output[0].backward(grad, retain_graph=True)
                return True
        return False

    def dump_mode_backward_acl_dump(self, name, module, module_input_output):
        """Replay forward, then backward with a pre-saved grad, under ACL dump.

        The gradient tensor is loaded from the path configured per-API in
        config.backward_input.
        """
        grad_path = self.config.backward_input.get(name)
        if not KernelDumpDataProcessor.forward_init_status:
            KernelDumpDataProcessor.forward_init_status = True
            output = module.forward(*module_input_output.args, **module_input_output.kwargs)
            # NOTE(review): torch.load unpickles the file at grad_path; the
            # path comes from user config — treat as trusted input only.
            grad = torch.load(grad_path).to("npu").requires_grad_()
            torch_npu.npu.init_dump()
            torch_npu.npu.set_dump(self.config.acl_config)
            torch_npu.npu.synchronize()
            if not self.acl_backward_dump_status(output, grad, name):
                logger.warning("The output of {} is not of tensor type and cannot be automatically derived. "
                               "you can manually construct a single API backward case for ACL dump.".format(
                                   name))
            torch_npu.npu.synchronize()
            torch_npu.npu.finalize_dump()
        KernelDumpDataProcessor.forward_init_status = False
        logger.info("Dump %s op file." % name)

    def op_need_trigger(self, module_name):
        """Whether the API needs an explicit .cpu() nudge to trigger execution."""
        return 'Tensor.__getitem__.' in module_name
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import csv
|
|
3
|
+
import fcntl
|
|
4
|
+
import json
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from msprobe.core.common.file_check import change_mode
|
|
8
|
+
from msprobe.core.common.log import logger
|
|
9
|
+
from msprobe.core.common.const import Const, FileCheckConst
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class DataWriter:
|
|
13
|
+
|
|
14
|
+
def __init__(self, init_json=None) -> None:
|
|
15
|
+
self.dump_count = 0
|
|
16
|
+
self.init_json = init_json
|
|
17
|
+
self.dump_file_path = None # os.path.join(dump_dir, DataWriter.dump_json_name)
|
|
18
|
+
self.stack_file_path = None # os.path.join(dump_dir, DataWriter.stack_json_name)
|
|
19
|
+
self.construct_file_path = None # os.path.join(dump_dir, DataWriter.construct_json_name)
|
|
20
|
+
self.free_benchmark_file_path = None
|
|
21
|
+
self.dump_tensor_data_dir = None
|
|
22
|
+
self.buffer_size = 1000
|
|
23
|
+
self.cache_data = {Const.DATA: {}}
|
|
24
|
+
self.cache_stack = {}
|
|
25
|
+
self.cache_construct = {}
|
|
26
|
+
|
|
27
|
+
    @staticmethod
    def write_data_to_csv(result: list, result_header: tuple, file_path: str):
        """Append one row to a CSV file, writing the header on first creation.

        Args:
            result: row values to write; falsy input is skipped entirely.
            result_header: column names, written only when the file is new.
            file_path: destination CSV path.
        """
        if not result:
            return
        is_exists = os.path.exists(file_path)
        append = "a+" if is_exists else "w+"
        # os.open with explicit mode keeps the file's permissions restricted.
        # NOTE(review): the fdopen mode must stay compatible with the project's
        # Const.WRITE_FLAGS — confirm the flags provide append semantics.
        with os.fdopen(
            os.open(file_path, Const.WRITE_FLAGS, FileCheckConst.DATA_FILE_AUTHORITY), append, newline=""
        ) as csv_file:
            spawn_writer = csv.writer(csv_file)
            if not is_exists:
                spawn_writer.writerow(result_header)
            spawn_writer.writerows([result,])
|
|
40
|
+
|
|
41
|
+
def initialize_json_file(self, **kwargs):
|
|
42
|
+
kwargs.update({"dump_data_dir": self.dump_tensor_data_dir, Const.DATA: {}})
|
|
43
|
+
with os.fdopen(
|
|
44
|
+
os.open(self.dump_file_path, Const.OVERWRITE_FLAGS, FileCheckConst.DATA_FILE_AUTHORITY), 'w'
|
|
45
|
+
) as f:
|
|
46
|
+
json.dump(kwargs, f)
|
|
47
|
+
|
|
48
|
+
if os.path.exists(self.stack_file_path):
|
|
49
|
+
os.remove(self.stack_file_path)
|
|
50
|
+
Path(self.stack_file_path).touch()
|
|
51
|
+
change_mode(self.stack_file_path, FileCheckConst.DATA_FILE_AUTHORITY)
|
|
52
|
+
|
|
53
|
+
if os.path.exists(self.construct_file_path):
|
|
54
|
+
os.remove(self.construct_file_path)
|
|
55
|
+
Path(self.construct_file_path).touch()
|
|
56
|
+
change_mode(self.construct_file_path, FileCheckConst.DATA_FILE_AUTHORITY)
|
|
57
|
+
|
|
58
|
+
def update_dump_paths(self, dump_file_path, stack_file_path, construct_file_path, dump_data_dir,
|
|
59
|
+
free_benchmark_file_path):
|
|
60
|
+
self.dump_file_path = dump_file_path
|
|
61
|
+
self.stack_file_path = stack_file_path
|
|
62
|
+
self.construct_file_path = construct_file_path
|
|
63
|
+
self.dump_tensor_data_dir = dump_data_dir
|
|
64
|
+
self.free_benchmark_file_path = free_benchmark_file_path
|
|
65
|
+
|
|
66
|
+
def update_data(self, new_data):
|
|
67
|
+
key = next(iter(new_data.keys())) # assert len(new_data.keys()) == 1
|
|
68
|
+
if key in self.cache_data[Const.DATA]:
|
|
69
|
+
self.cache_data[Const.DATA][key].update(new_data[key])
|
|
70
|
+
else:
|
|
71
|
+
self.cache_data[Const.DATA].update(new_data)
|
|
72
|
+
|
|
73
|
+
def flush_data_when_buffer_is_full(self):
|
|
74
|
+
if len(self.cache_data[Const.DATA]) >= self.buffer_size:
|
|
75
|
+
self.write_data_json(self.dump_file_path)
|
|
76
|
+
|
|
77
|
+
def update_stack(self, new_data):
|
|
78
|
+
self.cache_stack.update(new_data)
|
|
79
|
+
|
|
80
|
+
def update_construct(self, new_data):
|
|
81
|
+
self.cache_construct.update(new_data)
|
|
82
|
+
|
|
83
|
+
def write_data_json(self, file_path):
|
|
84
|
+
logger.info(f"dump.json is at {os.path.dirname(os.path.dirname(file_path))}. ")
|
|
85
|
+
if Path(file_path).exists() and os.path.getsize(file_path) > 0:
|
|
86
|
+
with open(file_path, "r+") as f:
|
|
87
|
+
fcntl.flock(f, fcntl.LOCK_EX)
|
|
88
|
+
data_to_write = json.load(f)
|
|
89
|
+
fcntl.flock(f, fcntl.LOCK_UN)
|
|
90
|
+
else:
|
|
91
|
+
self.init_json['data_path'] = self.dump_tensor_data_dir
|
|
92
|
+
data_to_write = self.init_json
|
|
93
|
+
data_to_write[Const.DATA].update(self.cache_data[Const.DATA])
|
|
94
|
+
with open(file_path, 'w+') as f:
|
|
95
|
+
fcntl.flock(f, fcntl.LOCK_EX)
|
|
96
|
+
json.dump(data_to_write, f, indent=1)
|
|
97
|
+
fcntl.flock(f, fcntl.LOCK_UN)
|
|
98
|
+
|
|
99
|
+
self.cache_data[Const.DATA].clear()
|
|
100
|
+
|
|
101
|
+
def write_stack_info_json(self, file_path):
|
|
102
|
+
with open(file_path, 'w+') as f:
|
|
103
|
+
fcntl.flock(f, fcntl.LOCK_EX)
|
|
104
|
+
json.dump(self.cache_stack, f, indent=1)
|
|
105
|
+
fcntl.flock(f, fcntl.LOCK_UN)
|
|
106
|
+
|
|
107
|
+
def write_construct_info_json(self, file_path):
|
|
108
|
+
with open(file_path, 'w+') as f:
|
|
109
|
+
fcntl.flock(f, fcntl.LOCK_EX)
|
|
110
|
+
json.dump(self.cache_construct, f, indent=1)
|
|
111
|
+
fcntl.flock(f, fcntl.LOCK_UN)
|
|
112
|
+
|
|
113
|
+
def write_json(self):
|
|
114
|
+
self.write_data_json(self.dump_file_path)
|
|
115
|
+
self.write_stack_info_json(self.stack_file_path)
|
|
116
|
+
self.write_construct_info_json(self.construct_file_path)
|
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from msprobe.core.common.exceptions import ScopeException
|
|
3
|
+
from msprobe.core.common.const import Const
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def build_scope(scope_class, scope=None, api_list=None):
    """Build a scope object from user configuration.

    Returns None when neither scope nor api_list is configured. When a
    concrete scope_class is given it is instantiated directly; otherwise the
    scope kind is inferred from the scope names.
    """
    if not (scope or api_list):
        return None
    scope = scope if scope is not None else []
    api_list = api_list if api_list is not None else []
    if not scope_class:
        return build_range_scope_according_to_scope_name(scope, api_list)
    return scope_class(scope, api_list)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def build_range_scope_according_to_scope_name(scope, api_list):
    """Pick the range-scope flavour (API vs Module) implied by the scope names.

    Raises ScopeException when the names are ambiguous (both flavours valid)
    or match neither flavour.
    """
    api_range_scope = APIRangeScope(scope, api_list)
    module_range_scope = ModuleRangeScope(scope, api_list)
    # Without a scope argument either flavour behaves identically.
    if not scope:
        return api_range_scope
    api_ok = api_range_scope.is_valid
    module_ok = module_range_scope.is_valid
    if api_ok and module_ok:
        raise ScopeException(ScopeException.InvalidScope, f"scope={scope}.")
    if api_ok:
        return api_range_scope
    if module_ok:
        return module_range_scope
    raise ScopeException(ScopeException.InvalidScope, f"scope={scope}")
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class BaseScope(ABC):
    """Abstract base for dump scopes: validates configuration and provides
    the shared api_list substring filter."""

    Module_Type_Module = "Module"
    Module_Type_API = "api"

    def __init__(self, scope, api_list):
        scope, api_list = self.rectify_args(scope, api_list)
        self.scope = scope
        self.api_list = api_list

    @staticmethod
    def rectify_args(scope, api_list):
        """Validate and normalize (scope, api_list); raises ScopeException on
        invalid types. A bare string scope is promoted to a one-element list."""
        if not isinstance(api_list, list):
            raise ScopeException(ScopeException.InvalidApiStr,
                                 f"api_list参数须配置为列表,实际类型为{type(api_list)}.")
        for item in api_list:
            if isinstance(item, str):
                continue
            raise ScopeException(ScopeException.InvalidApiStr,
                                 f"api_list中的元素须配置为字符串,实际类型为{type(item)}.")
        # A single string needs no further element checks.
        if isinstance(scope, str):
            return [scope], api_list
        if not isinstance(scope, list):
            raise ScopeException(ScopeException.InvalidScope,
                                 f"scope参数须配置为字符串或列表,实际类型为{type(scope)}.")
        for entry in scope:
            if not isinstance(entry, str):
                raise ScopeException(ScopeException.InvalidScope,
                                     f"scope列表元素要求类型为字符串,实际类型为{type(entry)}.")
        return scope, api_list

    @abstractmethod
    def check(self, name):
        pass

    def check_api_list(self, api_name):
        """True when api_name matches the api_list filter (empty list matches all)."""
        if not self.api_list:
            return True
        return any(token in api_name for token in self.api_list)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
class ListScope(BaseScope):
    """Scope matching an explicit list of names, or an api_list filter —
    never both."""

    @staticmethod
    def rectify_args(scope, api_list):
        # scope and api_list are mutually exclusive for a list scope.
        if scope and api_list:
            raise ScopeException(ScopeException.ArgConflict,
                                 f"scope和api_list不可以同时配置,实际配置为scope={scope}, api_list={api_list}.")
        return super(ListScope, ListScope).rectify_args(scope, api_list)

    def check(self, module_name):
        """True when module_name is selected by this scope."""
        if self.scope and module_name not in self.scope:
            return False
        return self.check_api_list(module_name)
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
class RangeScope(BaseScope, ABC):
    """Scope defined by a [start, end] interval of names; subclasses decide
    whether the endpoints are APIs or Modules."""

    def __init__(self, *args):
        super().__init__(*args)
        # Tracks whether execution is currently inside the interval.
        self.in_scope = False
        self.is_valid = self.check_scope_is_valid()

    @staticmethod
    def rectify_args(scope, api_list):
        """Normalize the interval: a single endpoint becomes [x, x]; more than
        two endpoints is rejected."""
        scope, api_list = super(RangeScope, RangeScope).rectify_args(scope, api_list)
        if isinstance(scope, list):
            size = len(scope)
            if size == 1:
                scope.append(scope[0])
            elif size > 2:
                raise ScopeException(ScopeException.InvalidScope,
                                     f"scope参数指定区间断点,须传入长度为1或2的列表,实际长度为{len(scope)}.")

        return scope, api_list

    @abstractmethod
    def check_scope_is_valid(self):
        pass

    def begin_module(self, module_name):
        # Hook point for module-driven ranges; no-op by default.
        pass

    def end_module(self, module_name):
        # Hook point for module-driven ranges; no-op by default.
        pass
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
class APIRangeScope(RangeScope):
    """Range scope whose interval endpoints are API names."""

    def check_scope_is_valid(self):
        # Valid only when neither endpoint names a Module.
        if not self.scope:
            return True
        start_prefix = self.scope[0].split(Const.SEP)[0]
        if start_prefix == BaseScope.Module_Type_Module:
            return False
        stop_prefix = self.scope[1].split(Const.SEP)[0]
        return stop_prefix != BaseScope.Module_Type_Module

    def check(self, api_name):
        """Stateful interval check: entering scope[0] opens the interval,
        leaving scope[1] closes it; the endpoints themselves are included."""
        if self.scope and api_name == self.scope[0]:
            self.in_scope = True

        # Inside the interval (or with no interval at all) defer to api_list.
        result = self.check_api_list(api_name) if (not self.scope or self.in_scope) else False

        if self.scope and api_name == self.scope[1]:
            self.in_scope = False
        return result
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
class ModuleRangeScope(RangeScope):
    """Range scope whose interval endpoints are Modules.

    Unlike an API, a module contains nested sub-structures that must also be
    dumped, so the interval is controlled precisely via a pre-hook and a
    full-backward-hook: those hooks call begin_module / end_module.
    """

    def check_scope_is_valid(self):
        # Valid only when both endpoints name Modules.
        if not self.scope:
            return True
        start_is_module = self.scope[0].split(Const.SEP)[0] == BaseScope.Module_Type_Module
        stop_is_module = self.scope[1].split(Const.SEP)[0] == BaseScope.Module_Type_Module
        return start_is_module and stop_is_module

    def begin_module(self, module_name):
        # Entering the start module opens the interval.
        if self.scope and module_name == self.scope[0]:
            self.in_scope = True

    def end_module(self, module_name):
        # Leaving the stop module closes the interval.
        if self.scope and module_name == self.scope[1]:
            self.in_scope = False

    def check(self, module_name):
        """True inside the interval (or with no interval), subject to api_list."""
        return self.check_api_list(module_name) if (not self.scope or self.in_scope) else False
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from msprobe.mindspore.debugger.precision_debugger import PrecisionDebugger
|
|
File without changes
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
import os
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class DebuggerConfig:
    """Validated dump configuration for the MindSpore precision debugger.

    Merges the common config (path, task, rank, step, level) with the
    task-specific config (list, data_mode, file_format, check_mode), filling
    defaults and validating values in check().
    """

    # Maps user-facing level names to the internal dump granularity.
    convert_map = {
        "L0": "cell",
        "L1": "api",
        "L2": 'kernel'
    }

    def __init__(self, common_config, task_config):
        self.dump_path = common_config.dump_path
        self.task = common_config.task
        self.rank = [] if not common_config.rank else common_config.rank
        self.step = [] if not common_config.step else common_config.step
        if not common_config.level:
            common_config.level = "L1"
        # Use .get() so an unknown level leaves self.level as None and check()
        # raises the intended "level must be L0, L1 or L2" message instead of
        # a bare KeyError escaping from here.
        self.level = DebuggerConfig.convert_map.get(common_config.level)
        self.list = [] if not task_config.list else task_config.list
        self.data_mode = [] if not task_config.data_mode else task_config.data_mode
        self.file_format = task_config.file_format
        self.check_mode = task_config.check_mode

        self.check()

    def check(self):
        """Fill defaults and validate the configuration.

        Raises:
            Exception: empty/relative dump path or invalid level.
            ValueError: invalid rank or step entries.
        """
        if not self.dump_path:
            raise Exception("Dump path is empty.")
        if not os.path.isabs(self.dump_path):
            raise Exception("Dump path must be absolute path.")
        if not self.task:
            self.task = "statistics"
        if not self.level:
            raise Exception("level must be L0, L1 or L2")
        if not self.file_format:
            self.file_format = "npy"
        if not self.check_mode:
            self.check_mode = "all"
        self._check_rank()
        self._check_step()
        return True

    def _check_rank(self):
        # Rank ids are non-negative integers (rank 0 is valid).
        for rank_id in self.rank:
            if not isinstance(rank_id, int) or rank_id < 0:
                raise ValueError(f"rank {self.rank} must be a list of non-negative integers.")

    def _check_step(self):
        # Step values must be integers.
        for s in self.step:
            if not isinstance(s, int):
                raise ValueError(f"step element {s} should be int")
|