PyPI - mindstudio-probe - Versions diffs - 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl - Mend

mindstudio-probe 1.0.1 (py3-none-any.whl) → 1.0.3 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (249) hide show

{mindstudio_probe-1.0.1.dist-info → mindstudio_probe-1.0.3.dist-info}/METADATA +5 -1
mindstudio_probe-1.0.3.dist-info/RECORD +272 -0
msprobe/README.md +78 -23
msprobe/__init__.py +1 -0
msprobe/config/README.md +182 -40
msprobe/config/config.json +22 -0
msprobe/core/__init__.py +0 -0
msprobe/{pytorch → core}/advisor/advisor.py +3 -3
msprobe/{pytorch → core}/advisor/advisor_result.py +2 -2
msprobe/core/common/const.py +82 -5
msprobe/core/common/exceptions.py +30 -18
msprobe/core/common/file_check.py +19 -1
msprobe/core/common/log.py +15 -1
msprobe/core/common/utils.py +130 -30
msprobe/core/common_config.py +32 -19
msprobe/core/compare/acc_compare.py +299 -0
msprobe/core/compare/check.py +95 -0
msprobe/core/compare/compare_cli.py +49 -0
msprobe/core/compare/highlight.py +222 -0
msprobe/core/compare/multiprocessing_compute.py +149 -0
msprobe/{pytorch → core}/compare/npy_compare.py +55 -4
msprobe/core/compare/utils.py +429 -0
msprobe/core/data_dump/data_collector.py +39 -35
msprobe/core/data_dump/data_processor/base.py +85 -37
msprobe/core/data_dump/data_processor/factory.py +5 -7
msprobe/core/data_dump/data_processor/mindspore_processor.py +198 -0
msprobe/core/data_dump/data_processor/pytorch_processor.py +94 -51
msprobe/core/data_dump/json_writer.py +11 -11
msprobe/core/grad_probe/__init__.py +0 -0
msprobe/core/grad_probe/constant.py +71 -0
msprobe/core/grad_probe/grad_compare.py +175 -0
msprobe/core/grad_probe/utils.py +52 -0
msprobe/doc/grad_probe/grad_probe.md +207 -0
msprobe/doc/grad_probe/img/image-1.png +0 -0
msprobe/doc/grad_probe/img/image-2.png +0 -0
msprobe/doc/grad_probe/img/image-3.png +0 -0
msprobe/doc/grad_probe/img/image-4.png +0 -0
msprobe/doc/grad_probe/img/image.png +0 -0
msprobe/mindspore/api_accuracy_checker/__init__.py +0 -0
msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +246 -0
msprobe/mindspore/api_accuracy_checker/api_info.py +69 -0
msprobe/mindspore/api_accuracy_checker/api_runner.py +152 -0
msprobe/mindspore/api_accuracy_checker/base_compare_algorithm.py +197 -0
msprobe/mindspore/api_accuracy_checker/compute_element.py +224 -0
msprobe/mindspore/api_accuracy_checker/main.py +16 -0
msprobe/mindspore/api_accuracy_checker/type_mapping.py +114 -0
msprobe/mindspore/api_accuracy_checker/utils.py +63 -0
msprobe/mindspore/cell_processor.py +34 -0
msprobe/mindspore/common/const.py +87 -0
msprobe/mindspore/common/log.py +38 -0
msprobe/mindspore/common/utils.py +57 -0
msprobe/mindspore/compare/distributed_compare.py +75 -0
msprobe/mindspore/compare/ms_compare.py +117 -0
msprobe/mindspore/compare/ms_graph_compare.py +317 -0
msprobe/mindspore/compare/ms_to_pt_api.yaml +399 -0
msprobe/mindspore/debugger/debugger_config.py +38 -15
msprobe/mindspore/debugger/precision_debugger.py +79 -4
msprobe/mindspore/doc/compare.md +58 -0
msprobe/mindspore/doc/dump.md +158 -6
msprobe/mindspore/dump/dump_tool_factory.py +19 -22
msprobe/mindspore/dump/hook_cell/api_registry.py +104 -0
msprobe/mindspore/dump/hook_cell/hook_cell.py +53 -0
msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml +925 -0
msprobe/mindspore/dump/hook_cell/wrap_functional.py +91 -0
msprobe/mindspore/dump/hook_cell/wrap_tensor.py +63 -0
msprobe/mindspore/dump/jit_dump.py +56 -0
msprobe/mindspore/dump/kernel_kbyk_dump.py +65 -0
msprobe/mindspore/free_benchmark/__init__.py +0 -0
msprobe/mindspore/free_benchmark/api_pynative_self_check.py +116 -0
msprobe/mindspore/free_benchmark/common/__init__.py +0 -0
msprobe/mindspore/free_benchmark/common/config.py +12 -0
msprobe/mindspore/free_benchmark/common/handler_params.py +17 -0
msprobe/mindspore/free_benchmark/common/utils.py +71 -0
msprobe/mindspore/free_benchmark/data/support_wrap_ops.yaml +842 -0
msprobe/mindspore/free_benchmark/decorator/__init__.py +0 -0
msprobe/mindspore/free_benchmark/decorator/dec_forward.py +42 -0
msprobe/mindspore/free_benchmark/decorator/decorator_factory.py +107 -0
msprobe/mindspore/free_benchmark/handler/__init__.py +0 -0
msprobe/mindspore/free_benchmark/handler/base_handler.py +90 -0
msprobe/mindspore/free_benchmark/handler/check_handler.py +41 -0
msprobe/mindspore/free_benchmark/handler/fix_handler.py +36 -0
msprobe/mindspore/free_benchmark/handler/handler_factory.py +21 -0
msprobe/mindspore/free_benchmark/perturbation/add_noise.py +67 -0
msprobe/mindspore/free_benchmark/perturbation/base_perturbation.py +21 -0
msprobe/mindspore/free_benchmark/perturbation/bit_noise.py +63 -0
msprobe/mindspore/free_benchmark/perturbation/improve_precision.py +34 -0
msprobe/mindspore/free_benchmark/perturbation/no_change.py +12 -0
msprobe/mindspore/free_benchmark/perturbation/perturbation_factory.py +27 -0
msprobe/mindspore/free_benchmark/self_check_tool_factory.py +33 -0
msprobe/mindspore/grad_probe/__init__.py +0 -0
msprobe/mindspore/grad_probe/global_context.py +91 -0
msprobe/mindspore/grad_probe/grad_analyzer.py +231 -0
msprobe/mindspore/grad_probe/grad_monitor.py +27 -0
msprobe/mindspore/grad_probe/grad_stat_csv.py +132 -0
msprobe/mindspore/grad_probe/hook.py +92 -0
msprobe/mindspore/grad_probe/utils.py +29 -0
msprobe/mindspore/ms_config.py +63 -15
msprobe/mindspore/overflow_check/overflow_check_tool_factory.py +17 -15
msprobe/mindspore/runtime.py +4 -0
msprobe/mindspore/service.py +354 -0
msprobe/mindspore/task_handler_factory.py +7 -4
msprobe/msprobe.py +66 -26
msprobe/pytorch/__init__.py +1 -1
msprobe/pytorch/api_accuracy_checker/common/config.py +21 -16
msprobe/pytorch/api_accuracy_checker/common/utils.py +1 -60
msprobe/pytorch/api_accuracy_checker/compare/algorithm.py +2 -5
msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +46 -10
msprobe/pytorch/api_accuracy_checker/compare/compare.py +84 -48
msprobe/pytorch/api_accuracy_checker/compare/compare_utils.py +8 -12
msprobe/pytorch/api_accuracy_checker/config.yaml +7 -1
msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py +15 -11
msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +11 -15
msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +16 -9
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +193 -105
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py +68 -1
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/__init__.py +0 -0
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py +202 -0
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +324 -0
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +204 -0
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +218 -0
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/ssl_config.py +10 -0
msprobe/pytorch/bench_functions/__init__.py +15 -0
msprobe/pytorch/bench_functions/apply_adam_w.py +28 -0
msprobe/pytorch/bench_functions/confusion_transpose.py +19 -0
msprobe/pytorch/bench_functions/fast_gelu.py +55 -0
msprobe/pytorch/bench_functions/layer_norm_eval.py +6 -0
msprobe/pytorch/bench_functions/linear.py +12 -0
msprobe/pytorch/bench_functions/matmul_backward.py +48 -0
msprobe/pytorch/bench_functions/npu_fusion_attention.py +421 -0
msprobe/pytorch/bench_functions/rms_norm.py +15 -0
msprobe/pytorch/bench_functions/rotary_mul.py +52 -0
msprobe/pytorch/bench_functions/scaled_mask_softmax.py +26 -0
msprobe/pytorch/bench_functions/swiglu.py +55 -0
msprobe/pytorch/common/parse_json.py +3 -1
msprobe/pytorch/common/utils.py +83 -7
msprobe/pytorch/compare/distributed_compare.py +19 -64
msprobe/pytorch/compare/match.py +3 -6
msprobe/pytorch/compare/pt_compare.py +40 -0
msprobe/pytorch/debugger/debugger_config.py +11 -2
msprobe/pytorch/debugger/precision_debugger.py +34 -4
msprobe/pytorch/doc/api_accuracy_checker.md +57 -13
msprobe/pytorch/doc/api_accuracy_checker_online.md +187 -0
msprobe/pytorch/doc/dump.md +73 -20
msprobe/pytorch/doc/ptdbg_ascend_compare.md +75 -11
msprobe/pytorch/doc/ptdbg_ascend_quickstart.md +3 -3
msprobe/pytorch/doc/run_overflow_check.md +1 -1
msprobe/pytorch/doc/无标杆工具场景验证和性能基线报告.md +151 -0
msprobe/pytorch/free_benchmark/common/constant.py +3 -0
msprobe/pytorch/free_benchmark/common/utils.py +4 -0
msprobe/pytorch/free_benchmark/compare/grad_saver.py +22 -26
msprobe/pytorch/free_benchmark/main.py +7 -4
msprobe/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py +1 -1
msprobe/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py +1 -1
msprobe/pytorch/free_benchmark/perturbed_layers/npu/change_value.py +1 -1
msprobe/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py +3 -3
msprobe/pytorch/free_benchmark/perturbed_layers/npu/no_change.py +1 -1
msprobe/pytorch/free_benchmark/perturbed_layers/run_cpu.py +1 -1
msprobe/pytorch/free_benchmark/result_handlers/base_handler.py +43 -29
msprobe/pytorch/free_benchmark/result_handlers/handler_factory.py +0 -1
msprobe/pytorch/function_factory.py +75 -0
msprobe/pytorch/functional/dump_module.py +4 -4
msprobe/pytorch/grad_probe/__init__.py +0 -0
msprobe/pytorch/grad_probe/grad_monitor.py +90 -0
msprobe/pytorch/grad_probe/grad_stat_csv.py +129 -0
msprobe/pytorch/hook_module/hook_module.py +14 -3
msprobe/pytorch/hook_module/support_wrap_ops.yaml +2 -1
msprobe/pytorch/hook_module/utils.py +9 -9
msprobe/pytorch/hook_module/wrap_aten.py +20 -10
msprobe/pytorch/hook_module/wrap_distributed.py +10 -7
msprobe/pytorch/hook_module/wrap_functional.py +4 -7
msprobe/pytorch/hook_module/wrap_npu_custom.py +21 -10
msprobe/pytorch/hook_module/wrap_tensor.py +5 -6
msprobe/pytorch/hook_module/wrap_torch.py +5 -7
msprobe/pytorch/hook_module/wrap_vf.py +6 -8
msprobe/pytorch/module_processer.py +53 -13
msprobe/pytorch/online_dispatch/compare.py +4 -4
msprobe/pytorch/online_dispatch/dispatch.py +39 -41
msprobe/pytorch/online_dispatch/dump_compare.py +17 -47
msprobe/pytorch/online_dispatch/single_compare.py +5 -5
msprobe/pytorch/online_dispatch/utils.py +2 -43
msprobe/pytorch/parse_tool/lib/compare.py +31 -19
msprobe/pytorch/parse_tool/lib/config.py +2 -1
msprobe/pytorch/parse_tool/lib/parse_tool.py +4 -4
msprobe/pytorch/parse_tool/lib/utils.py +34 -80
msprobe/pytorch/parse_tool/lib/visualization.py +4 -3
msprobe/pytorch/pt_config.py +100 -6
msprobe/pytorch/service.py +104 -19
mindstudio_probe-1.0.1.dist-info/RECORD +0 -228
msprobe/mindspore/dump/api_kbk_dump.py +0 -55
msprobe/pytorch/compare/acc_compare.py +0 -1024
msprobe/pytorch/compare/highlight.py +0 -100
msprobe/test/core_ut/common/test_utils.py +0 -345
msprobe/test/core_ut/data_dump/test_data_collector.py +0 -47
msprobe/test/core_ut/data_dump/test_json_writer.py +0 -183
msprobe/test/core_ut/data_dump/test_scope.py +0 -151
msprobe/test/core_ut/test_common_config.py +0 -152
msprobe/test/core_ut/test_file_check.py +0 -218
msprobe/test/core_ut/test_log.py +0 -109
msprobe/test/mindspore_ut/test_api_kbk_dump.py +0 -51
msprobe/test/mindspore_ut/test_debugger_config.py +0 -42
msprobe/test/mindspore_ut/test_dump_tool_factory.py +0 -51
msprobe/test/mindspore_ut/test_kernel_graph_dump.py +0 -66
msprobe/test/mindspore_ut/test_kernel_graph_overflow_check.py +0 -63
msprobe/test/mindspore_ut/test_ms_config.py +0 -69
msprobe/test/mindspore_ut/test_overflow_check_tool_factory.py +0 -51
msprobe/test/mindspore_ut/test_precision_debugger.py +0 -56
msprobe/test/mindspore_ut/test_task_handler_factory.py +0 -58
msprobe/test/pytorch_ut/advisor/test_advisor.py +0 -83
msprobe/test/pytorch_ut/api_accuracy_checker/common/test_common_utils.py +0 -108
msprobe/test/pytorch_ut/api_accuracy_checker/common/test_config.py +0 -39
msprobe/test/pytorch_ut/api_accuracy_checker/compare/test_algorithm.py +0 -112
msprobe/test/pytorch_ut/api_accuracy_checker/compare/test_api_precision_compare.py +0 -77
msprobe/test/pytorch_ut/api_accuracy_checker/compare/test_compare.py +0 -125
msprobe/test/pytorch_ut/api_accuracy_checker/compare/test_compare_column.py +0 -10
msprobe/test/pytorch_ut/api_accuracy_checker/compare/test_compare_utils.py +0 -43
msprobe/test/pytorch_ut/api_accuracy_checker/run_ut/dump.json +0 -179
msprobe/test/pytorch_ut/api_accuracy_checker/run_ut/forward.json +0 -63
msprobe/test/pytorch_ut/api_accuracy_checker/run_ut/test_data_generate.py +0 -99
msprobe/test/pytorch_ut/api_accuracy_checker/run_ut/test_multi_run_ut.py +0 -115
msprobe/test/pytorch_ut/api_accuracy_checker/run_ut/test_run_ut.py +0 -72
msprobe/test/pytorch_ut/compare/test_acc_compare.py +0 -17
msprobe/test/pytorch_ut/free_benchmark/perturbed_layers/test_perturbed_layser.py +0 -105
msprobe/test/pytorch_ut/free_benchmark/result_handlers/test_result_handler.py +0 -121
msprobe/test/pytorch_ut/free_benchmark/test_main.py +0 -101
msprobe/test/pytorch_ut/functional/test_dump_module.py +0 -15
msprobe/test/pytorch_ut/hook_module/test_api_registry.py +0 -130
msprobe/test/pytorch_ut/hook_module/test_hook_module.py +0 -42
msprobe/test/pytorch_ut/hook_module/test_wrap_aten.py +0 -65
msprobe/test/pytorch_ut/hook_module/test_wrap_distributed.py +0 -35
msprobe/test/pytorch_ut/hook_module/test_wrap_functional.py +0 -20
msprobe/test/pytorch_ut/hook_module/test_wrap_tensor.py +0 -35
msprobe/test/pytorch_ut/hook_module/test_wrap_torch.py +0 -43
msprobe/test/pytorch_ut/hook_module/test_wrap_vf.py +0 -11
msprobe/test/pytorch_ut/test_pt_config.py +0 -69
msprobe/test/pytorch_ut/test_service.py +0 -59
msprobe/test/resources/advisor.txt +0 -3
msprobe/test/resources/compare_result_20230703104808.csv +0 -9
msprobe/test/resources/compare_result_without_accuracy.csv +0 -9
msprobe/test/resources/config.yaml +0 -3
msprobe/test/resources/npu_test.pkl +0 -8
msprobe/test/run_test.sh +0 -30
msprobe/test/run_ut.py +0 -58
msprobe/test/test_module_processer.py +0 -64
{mindstudio_probe-1.0.1.dist-info → mindstudio_probe-1.0.3.dist-info}/LICENSE +0 -0
{mindstudio_probe-1.0.1.dist-info → mindstudio_probe-1.0.3.dist-info}/WHEEL +0 -0
{mindstudio_probe-1.0.1.dist-info → mindstudio_probe-1.0.3.dist-info}/entry_points.txt +0 -0
{mindstudio_probe-1.0.1.dist-info → mindstudio_probe-1.0.3.dist-info}/top_level.txt +0 -0
/msprobe/{pytorch → core}/advisor/advisor_const.py +0 -0
/msprobe/pytorch/doc/{atat → msprobe}精度工具数据dump标准性能基线报告.md +0 -0

msprobe/core/data_dump/data_processor/pytorch_processor.py CHANGED Viewed

@@ -1,3 +1,4 @@
+import copy
 import os
 import zlib
 from dataclasses import asdict
@@ -5,18 +6,20 @@ from typing import List
 import numpy as np
 import torch
-from msprobe.core.common.exceptions import MsaccException
 from msprobe.core.common.file_check import path_len_exceeds_limit, change_mode
 from msprobe.core.common.log import logger
 from msprobe.core.common.const import Const, OverflowConst, FileCheckConst
 from msprobe.core.data_dump.data_processor.base import BaseDataProcessor, ModuleBackwardInputsOutputs, \
     ModuleForwardInputsOutputs, TensorStatInfo
 from msprobe.pytorch.free_benchmark import FreeBenchmarkCheck, UnequalRow
+from msprobe.pytorch.common.utils import save_pt
 try:
     import torch_npu
+    is_gpu = False
 except ImportError:
-    pass
+    is_gpu = True
 class PytorchDataProcessor(BaseDataProcessor):
@@ -68,6 +71,12 @@ class PytorchDataProcessor(BaseDataProcessor):
             tensor_stat.min = False not in data_clone
         elif not data_clone.shape:
             tensor_stat.max = tensor_stat.min = tensor_stat.mean = tensor_stat.norm = data_clone.item()
+        elif torch.is_complex(data_clone):
+            data_np = data_clone.cpu().numpy()
+            data_abs = np.abs(data_np)
+            tensor_stat.max = np.max(data_abs).item()
+            tensor_stat.min = np.min(data_abs).item()
+            tensor_stat.mean = np.mean(data_abs).item()
         else:
             if not data_clone.is_floating_point() or data_clone.dtype == torch.float64:
                 data_clone = data_clone.float()
@@ -76,7 +85,39 @@ class PytorchDataProcessor(BaseDataProcessor):
             tensor_stat.mean = torch._C._VariableFunctionsClass.mean(data_clone).item()
             tensor_stat.norm = torch._C._VariableFunctionsClass.norm(data_clone).item()
         return tensor_stat
+    @staticmethod
+    def handle_tensor_extremum_nan_inf(tensor, operator):
+        data_clone = tensor.detach()
+        data_nan = torch._C._VariableFunctionsClass.isnan(data_clone)
+        if int(torch._C._VariableFunctionsClass.sum(data_nan)) == data_clone.numel():
+            return float('nan')
+        finite_mask = torch._C._VariableFunctionsClass.isfinite(data_clone)
+        if int(torch._C._VariableFunctionsClass.sum(finite_mask)) > 0:
+            finite_values = data_clone[finite_mask]
+            return torch._C._VariableFunctionsClass.max(finite_values).item() if operator == 'max' else \
+                torch._C._VariableFunctionsClass.min(finite_values).item()
+        else:
+            data_no_nan = data_clone[~data_nan]
+            return torch._C._VariableFunctionsClass.max(data_no_nan).item() if operator == 'max' else \
+                torch._C._VariableFunctionsClass.min(data_no_nan).item()
+    @staticmethod
+    def _analyze_builtin(arg):
+        single_arg = {}
+        if isinstance(arg, slice):
+            single_arg.update({"type": "slice"})
+            # slice参数中可能存在tensor类型，json序列化，需要转换为python数值类型
+            values = [
+                value if not isinstance(value, torch.Tensor) else value.item()
+                for value in [arg.start, arg.stop, arg.step]
+            ]
+            single_arg.update({"value": values})
+        else:
+            single_arg.update({"type": type(arg).__name__})
+            single_arg.update({"value": arg})
+        return single_arg
     @staticmethod
     def _analyze_torch_size(arg):
         return {"type": "torch.Size", "value": list(arg)}
@@ -97,10 +138,7 @@ class PytorchDataProcessor(BaseDataProcessor):
             return self._analyze_tensor(element, Const.SEP.join(suffix_stack))
         if isinstance(element, (bool, int, float, str, slice)):
             return self._analyze_builtin(element)
-        return None
-    def analyze_element(self, element):
-        return self.recursive_apply_transform(element, self.analyze_single_element)
+        return {}
     def _analyze_tensor(self, tensor, suffix):
         tensor_stat = self.get_stat_info(tensor)
@@ -113,9 +151,17 @@ class PytorchDataProcessor(BaseDataProcessor):
         tensor_json.update({"Mean": tensor_stat.mean})
         tensor_json.update({"Norm": tensor_stat.norm})
         tensor_json.update({"requires_grad": tensor.requires_grad})
-        if self.config.summary_mode == "md5":
+        if tensor_stat.max is not None:
+            if np.isinf(tensor_stat.max) or np.isnan(tensor_stat.max):
+                tensor_json['Max_except_inf_nan'] = self.handle_tensor_extremum_nan_inf(tensor, "max")
+        if tensor_stat.min is not None:
+            if np.isinf(tensor_stat.min) or np.isnan(tensor_stat.min):
+                tensor_json['Min_except_inf_nan'] = self.handle_tensor_extremum_nan_inf(tensor, "min")
+        if self.config.summary_mode == Const.MD5:
             tensor_md5 = self.get_md5_for_tensor(tensor)
-            tensor_json.update({"md5": tensor_md5})
+            tensor_json.update({Const.MD5: tensor_md5})
         return tensor_json
@@ -126,11 +172,8 @@ class StatisticsDataProcessor(PytorchDataProcessor):
 class TensorDataProcessor(PytorchDataProcessor):
     def _analyze_tensor(self, tensor, suffix):
         dump_data_name, file_path = self.get_save_file_path(suffix)
-        if not path_len_exceeds_limit(file_path):
-            torch.save(tensor, file_path)
-            change_mode(file_path, FileCheckConst.DATA_FILE_AUTHORITY)
-        else:
-            logger.warning(f'The file path {file_path} length exceeds limit.')
+        saved_tensor = tensor.contiguous().detach()
+        save_pt(saved_tensor, file_path)
         single_arg = super()._analyze_tensor(tensor, suffix)
         single_arg.update({"data_name": dump_data_name})
         return single_arg
@@ -142,29 +185,36 @@ class OverflowCheckDataProcessor(PytorchDataProcessor):
     def __init__(self, config, data_writer):
         super().__init__(config, data_writer)
         self.cached_tensors_and_file_paths = {}
-        self.real_overflow_dump_times = 0
-        self.overflow_nums = config.overflow_num
         self.bits_for_overflow = 8
+        self.real_overflow_nums = 0
+        self.overflow_nums = config.overflow_nums
+        self.forward_inplace_inputs = None
+    @property
+    def is_terminated(self):
+        if self.overflow_nums == -1:
+            return False
+        if self.real_overflow_nums >= self.overflow_nums:
+            logger.info(f"[msprobe] 超过预设溢出次数 当前溢出次数: {self.real_overflow_nums}")
+            return True
+        return False
     @staticmethod
     def overflow_debug_mode_enable():
         overflow_mode = os.getenv(OverflowConst.OVERFLOW_DEBUG_MODE_ENABLE, Const.ENV_DISABLE)
         return overflow_mode == Const.ENV_ENABLE
-    @staticmethod
-    def handle_tensor_extremum_nan_inf(data_clone, operator):
-        data_nan = torch._C._VariableFunctionsClass.isnan(data_clone)
-        if int(torch._C._VariableFunctionsClass.sum(data_nan)) == data_clone.numel():
-            return float('nan')
-        finite_mask = torch._C._VariableFunctionsClass.isfinite(data_clone)
-        if int(torch._C._VariableFunctionsClass.sum(finite_mask)) > 0:
-            finite_values = data_clone[finite_mask]
-            return torch._C._VariableFunctionsClass.max(finite_values).item() if operator == 'max' else \
-                torch._C._VariableFunctionsClass.min(finite_values).item()
-        else:
-            data_no_nan = data_clone[~data_nan]
-            return torch._C._VariableFunctionsClass.max(data_no_nan).item() if operator == 'max' else \
-                torch._C._VariableFunctionsClass.min(data_no_nan).item()
+    def analyze_pre_forward_inplace(self, name, module_input_output: ModuleForwardInputsOutputs):
+        self.forward_inplace_inputs = copy.deepcopy(module_input_output)
+        return None
+    def analyze_forward_inplace(self, name, module_input_output: ModuleForwardInputsOutputs):
+        module_input_output.output = module_input_output.concat_args_and_kwargs()
+        module_input_output.args = self.forward_inplace_inputs.args
+        module_input_output.kwargs = self.forward_inplace_inputs.kwargs
+        # release memory used by forward inputs
+        self.forward_inplace_inputs = None
+        return self.analyze_forward(name, None, module_input_output)
     def analyze_forward(self, name, module, module_input_output: ModuleForwardInputsOutputs):
         self.has_overflow = False
@@ -181,20 +231,12 @@ class OverflowCheckDataProcessor(PytorchDataProcessor):
     def maybe_save_overflow_data_and_check_overflow_times(self):
         if self.has_overflow:
             for file_path, tensor in self.cached_tensors_and_file_paths.items():
-                torch.save(tensor, file_path)
-                change_mode(file_path, FileCheckConst.DATA_FILE_AUTHORITY)
-            self.inc_and_check_overflow_times()
+                save_pt(tensor, file_path)
+            self.real_overflow_nums += 1
         self.cached_tensors_and_file_paths = {}
-    def inc_and_check_overflow_times(self):
-        self.real_overflow_dump_times += 1
-        if self.overflow_nums == -1:
-            return
-        if self.real_overflow_dump_times >= self.overflow_nums:
-            raise MsaccException(MsaccException.OVERFLOW_NUMS_ERROR, str(self.real_overflow_dump_times))
     def check_overflow_npu(self):
-        if self.overflow_debug_mode_enalbe():
+        if self.overflow_debug_mode_enable():
             float_status = torch.zeros(self.bits_for_overflow).npu()
             result = torch_npu.npu_get_float_status(float_status, OverflowConst.OVERFLOW_DEBUG_MODE)
             if result.cpu()[0] != 0:
@@ -211,21 +253,22 @@ class OverflowCheckDataProcessor(PytorchDataProcessor):
         else:
             torch_npu._C._clear_overflow_npu()
-    def _analyze_maybe_overflow_tensor(self, tensor_json, tensor):
-        data_clone = tensor.detach()
-        if hasattr(torch_npu._C, '_npu_is_support_inf_nan') and torch_npu._C._npu_is_support_inf_nan():
+    def _analyze_maybe_overflow_tensor(self, tensor_json):
+        if is_gpu or (hasattr(torch_npu._C, '_npu_is_support_inf_nan') and torch_npu._C._npu_is_support_inf_nan()):
             if tensor_json['Max'] is None:
                 return
             if np.isinf(tensor_json['Max']) or np.isnan(tensor_json['Max']):
-                tensor_json['Max_except_inf_nan'] = self.handle_tensor_extremum_nan_inf(data_clone, "max")
                 self.has_overflow = True
             if np.isinf(tensor_json['Min']) or np.isnan(tensor_json['Min']):
-                tensor_json['Min_except_inf_nan'] = self.handle_tensor_extremum_nan_inf(data_clone, "min")
                 self.has_overflow = True
         else:
-            self.has_overflow = self.check_overflow_npu()
-            if self.has_overflow:
-                self.clear_overflow_npu()
+            try:
+                self.has_overflow = self.check_overflow_npu()
+                if self.has_overflow:
+                    self.clear_overflow_npu()
+            except Exception as e:
+                logger.error(f"Overflow check failed, the current environment may be abnormal.")
+                raise RuntimeError(f"overflow check failed") from e
     def _analyze_tensor(self, tensor, suffix):
         dump_data_name, file_path = self.get_save_file_path(suffix)
@@ -234,7 +277,7 @@ class OverflowCheckDataProcessor(PytorchDataProcessor):
         else:
             logger.warning(f'The file path {file_path} length exceeds limit.')
         single_arg = super()._analyze_tensor(tensor, suffix)
-        self._analyze_maybe_overflow_tensor(single_arg, tensor)
+        self._analyze_maybe_overflow_tensor(single_arg)
         single_arg.update({"data_name": dump_data_name})
         return single_arg
@@ -280,7 +323,7 @@ class FreeBenchmarkDataProcessor(PytorchDataProcessor):
             self._forward_new_output = new_output
     def analyze_backward(self, name, module, module_input_output: ModuleBackwardInputsOutputs):
-        self.checker.backward(name, module, module_input_output.grad_output)
+        self.checker.backward(name, module, module_input_output.grad_input)
 class KernelDumpDataProcessor(PytorchDataProcessor):

msprobe/core/data_dump/json_writer.py CHANGED Viewed

@@ -4,7 +4,7 @@ import fcntl
 import json
 from pathlib import Path
-from msprobe.core.common.file_check import change_mode
+from msprobe.core.common.file_check import change_mode, FileOpen
 from msprobe.core.common.log import logger
 from msprobe.core.common.const import Const, FileCheckConst
@@ -30,20 +30,20 @@ class DataWriter:
             return
         is_exists = os.path.exists(file_path)
         append = "a+" if is_exists else "w+"
-        with os.fdopen(
-            os.open(file_path, Const.WRITE_FLAGS, FileCheckConst.DATA_FILE_AUTHORITY), append, newline=""
-        ) as csv_file:
+        with FileOpen(file_path, append) as csv_file:
             spawn_writer = csv.writer(csv_file)
             if not is_exists:
                 spawn_writer.writerow(result_header)
             spawn_writer.writerows([result,])
+        is_new_file = not is_exists
+        if is_new_file:
+            change_mode(file_path, FileCheckConst.DATA_FILE_AUTHORITY)
     def initialize_json_file(self, **kwargs):
         kwargs.update({"dump_data_dir": self.dump_tensor_data_dir, Const.DATA: {}})
-        with os.fdopen(
-            os.open(self.dump_file_path, Const.OVERWRITE_FLAGS, FileCheckConst.DATA_FILE_AUTHORITY), 'w'
-        ) as f:
+        with FileOpen(self.dump_file_path, 'w') as f:
             json.dump(kwargs, f)
+        change_mode(self.dump_file_path, FileCheckConst.DATA_FILE_AUTHORITY)
         if os.path.exists(self.stack_file_path):
             os.remove(self.stack_file_path)
@@ -83,7 +83,7 @@ class DataWriter:
     def write_data_json(self, file_path):
         logger.info(f"dump.json is at {os.path.dirname(os.path.dirname(file_path))}. ")
         if Path(file_path).exists() and os.path.getsize(file_path) > 0:
-            with open(file_path, "r+") as f:
+            with FileOpen(file_path, "r+") as f:
                 fcntl.flock(f, fcntl.LOCK_EX)
                 data_to_write = json.load(f)
                 fcntl.flock(f, fcntl.LOCK_UN)
@@ -91,7 +91,7 @@ class DataWriter:
             self.init_json['data_path'] = self.dump_tensor_data_dir
             data_to_write = self.init_json
         data_to_write[Const.DATA].update(self.cache_data[Const.DATA])
-        with open(file_path, 'w+') as f:
+        with FileOpen(file_path, 'w+') as f:
             fcntl.flock(f, fcntl.LOCK_EX)
             json.dump(data_to_write, f, indent=1)
             fcntl.flock(f, fcntl.LOCK_UN)
@@ -99,13 +99,13 @@ class DataWriter:
         self.cache_data[Const.DATA].clear()
     def write_stack_info_json(self, file_path):
-        with open(file_path, 'w+') as f:
+        with FileOpen(file_path, 'w+') as f:
             fcntl.flock(f, fcntl.LOCK_EX)
             json.dump(self.cache_stack, f, indent=1)
             fcntl.flock(f, fcntl.LOCK_UN)
     def write_construct_info_json(self, file_path):
-        with open(file_path, 'w+') as f:
+        with FileOpen(file_path, 'w+') as f:
             fcntl.flock(f, fcntl.LOCK_EX)
             json.dump(self.cache_construct, f, indent=1)
             fcntl.flock(f, fcntl.LOCK_UN)

msprobe/core/grad_probe/__init__.py ADDED Viewed

File without changes

msprobe/core/grad_probe/constant.py ADDED Viewed

@@ -0,0 +1,71 @@
+class GradConst:
+    FRAMEWORKS = {"PyTorch", "MindSpore"}
+    PYTORCH = "PyTorch"
+    MindSpore = "MindSpore"
+    GRAD_FILE_SUFFIX = {"npy", "pt"}
+    NPY_SUFFIX = "npy"
+    PT_SUFFIX = "pt"
+    # for callback
+    CURRENT_STEP = "current_step"
+    PARAM_LIST = "param_list"
+    RANK = "rank"
+    STEP = "step"
+    BOUNDS = "bounds"
+    OUTPUT_PATH = "output_path"
+    # level const
+    LEVEL = "level"
+    LEVEL0 = "L0"
+    LEVEL1 = "L1"
+    LEVEL2 = "L2"
+    SUPPORTED_LEVEL = {"L0", "L1", "L2"}
+    # numpy coding
+    STEP_IDX = 0
+    SHAPE_DIM_IDX = 4
+    MAX_SIZE = 10 * 1024 * 1024 * 1024
+    # direction suffix
+    DIR_SUFFIX = "dir.npy"
+    # file safty
+    DATA_DIR_AUTHORITY = 0o750
+    DATA_FILE_AUTHORITY = 0o640
+    DIRECTORY_LENGTH = 4096
+    FILE_NAME_LENGTH = 255
+    FILE_VALID_PATTERN = r"^[a-zA-Z0-9_.:/-]+$"
+    PARAM_VALID_PATTERN = r"^[a-zA-Z0-9_.]+$"
+    DIR = "dir"
+    FILE = "file"
+    STEP_FINISH = "step_finish"
+    SUMMARY = "summary"
+    # csv header entry
+    MD5 = "MD5"
+    DISTRIBUTION = "distribution"
+    SHAPE = "shape"
+    MAX = "max"
+    MIN = "min"
+    NORM = "norm"
+level_adp = {
+        "L0": {
+            "header": [GradConst.MD5, GradConst.MAX, GradConst.MIN, GradConst.NORM, GradConst.SHAPE],
+            "have_grad_direction": False
+        },
+        "L1": {
+            "header": [GradConst.MAX, GradConst.MIN, GradConst.NORM, GradConst.SHAPE],
+            "have_grad_direction": True
+        },
+        "L2": {
+            "header": [GradConst.DISTRIBUTION, GradConst.MAX, GradConst.MIN, GradConst.NORM, GradConst.SHAPE],
+            "have_grad_direction": True
+        },
+    }

msprobe/core/grad_probe/grad_compare.py ADDED Viewed

@@ -0,0 +1,175 @@
+import os
+from typing import List
+from tqdm import tqdm
+import pandas as pd
+import matplotlib.pyplot as plt
+from msprobe.core.common.utils import check_file_or_directory_path, check_path_before_create
+from msprobe.core.common.file_check import create_directory
+from msprobe.core.common.log import logger
+from msprobe.core.common.utils import remove_path, write_csv, load_npy
+from msprobe.core.grad_probe.constant import GradConst
+class GradComparator:
+    @staticmethod
+    def _get_grad_weight_order(path1, path2):
+        for summary_file in os.listdir(path1):
+            if not summary_file.endswith(".csv"):
+                continue
+            if not os.path.exists(os.path.join(path2, summary_file)):
+                continue
+            summary_csv = pd.read_csv(os.path.join(path1, summary_file))
+            return summary_csv["param_name"]
+        raise RuntimeError("no matched grad_summary.csv for comparison, please dump data in same configuration")
+    @staticmethod
+    def _get_name_matched_grad_file(param_name, grad_files):
+        for grad_file in grad_files:
+            if param_name == grad_file[:grad_file.rfind('.')]:
+                return grad_file
+        raise RuntimeError("no matched grad_file for comparison, please dump data in same configuration")
+    @classmethod
+    def compare_distributed(cls, path1: str, path2: str, output_dir: str):
+        ranks = cls._get_matched_dirs(path1, path2, "rank")
+        logger.info(f"the following ranks will be compared: {ranks}")
+        if not ranks:
+            raise RuntimeError("no matched ranks for comparison, please dump data in same configuration")
+        if not os.path.isdir(output_dir):
+            create_directory(output_dir)
+        for rank in tqdm(ranks, desc="rank"):
+            logger.info(f"now comparing rank {rank}:")
+            cls.compare(os.path.join(path1, f"rank{rank}"),
+                        os.path.join(path2, f"rank{rank}"),
+                        os.path.join(output_dir, f"rank{rank}"))
+    @classmethod
+    def compare(cls, path1: str, path2: str, output_dir: str):
+        steps = cls._get_matched_dirs(path1, path2, "step")
+        if not steps:
+            raise RuntimeError("no matched steps for comparison, please dump data in same configuration")
+        similarities = cls._calculate_separated_similarities(path1, path2, steps)
+        if not os.path.isdir(output_dir):
+            create_directory(output_dir)
+        cls._save_similarities(similarities, steps, output_dir)
+    @classmethod
+    def _get_matched_dirs(cls, path1: str, path2: str, dir_prefix):
+        check_file_or_directory_path(path1, isdir=True)
+        check_file_or_directory_path(path2, isdir=True)
+        dirs = []
+        for dir_name in os.listdir(path1):
+            index = dir_name.replace(dir_prefix, "", 1)
+            if not dir_name.startswith(dir_prefix) or not index.isdigit():
+                continue
+            folder2 = os.path.join(path2, dir_name)
+            if not os.path.isdir(folder2):
+                continue
+            dirs.append(int(index))
+        dirs = sorted(dirs)
+        return dirs
+    @classmethod
+    def _save_similarities(cls, similarities: List[float], steps: List[int], output_dir: str):
+        if not similarities:
+            raise ValueError(f"length of similarities is 0")
+        result = [['step'] + [str(step) for step in steps]]
+        for key, value in tqdm(similarities.items(), desc="save similarities (by param)"):
+            if len(value) != len(steps):
+                raise RuntimeError(f"similarities length of {key}:{len(value)} not equal steps:{len(steps)}")
+            plt.plot(steps, value)
+            plt.xlabel('steps')
+            plt.ylabel('similarities')
+            plt.title(f'{key}_similarities')
+            picture_dir = os.path.join(output_dir, "similarities_picture")
+            if not os.path.isdir(picture_dir):
+                create_directory(picture_dir)
+            fig_save_path = os.path.join(picture_dir, f"{key}_similarities.png")
+            check_path_before_create(fig_save_path)
+            try:
+                plt.savefig(fig_save_path)
+            except Exception as e:
+                raise RuntimeError(f"save plt figure {fig_save_path} failed") from e
+            plt.close()
+            result.append([key] + value)
+        result_csv_path = os.path.join(output_dir, "similarities.csv")
+        if os.path.exists(result_csv_path):
+            logger.warning(f"{result_csv_path} will be recoverd")
+            remove_path(result_csv_path)
+        write_csv(result, result_csv_path)
+    @classmethod
+    def _calculate_separated_similarities(cls, path1, path2, steps):
+        similarities = {}
+        logger.info(f"{len(steps)} steps will be compared")
+        grad_weight_order = cls._get_grad_weight_order(path1, path2)
+        for step in tqdm(steps, desc="culculate similarities (by step)"):
+            grad_files = cls._get_matched_grad_files(path1, path2, step)
+            same_count_summary = 0
+            total_count_summary = 0
+            for grad_name in grad_weight_order:
+                grad_file = cls._get_name_matched_grad_file(grad_name, grad_files)
+                grad1 = os.path.join(path1, f"step{step}", grad_file)
+                grad2 = os.path.join(path2, f"step{step}", grad_file)
+                same_count, total_count = cls._calculate_similarity(grad1, grad2)
+                same_count_summary += same_count
+                total_count_summary += total_count
+                idx = grad_file.rfind(".")
+                param_name = grad_file[:idx]
+                if param_name not in similarities:
+                    similarities[param_name] = []
+                if total_count == 0:
+                    similarities[param_name].append(0)
+                else:
+                    similarities[param_name].append(same_count / total_count)
+            if GradConst.SUMMARY not in similarities:
+                similarities[GradConst.SUMMARY] = []
+            if total_count_summary == 0:
+                similarities[GradConst.SUMMARY].append(0)
+            else:
+                similarities[GradConst.SUMMARY].append(same_count_summary / total_count_summary)
+        return similarities
+    @classmethod
+    def _get_matched_grad_files(cls, path1: str, path2: str, step: int):
+        path1 = os.path.join(path1, f"step{step}")
+        path2 = os.path.join(path2, f"step{step}")
+        check_file_or_directory_path(path1, isdir=True)
+        check_file_or_directory_path(path2, isdir=True)
+        grad_files = []
+        for grad_file in os.listdir(path1):
+            splits = grad_file.split('.')
+            if len(splits) < 1 or splits[-1] not in GradConst.GRAD_FILE_SUFFIX:
+                continue
+            folder2 = os.path.join(path2, grad_file)
+            if not os.path.exists(folder2):
+                continue
+            grad_files.append(grad_file)
+        return sorted(grad_files)
+    @classmethod
+    def _calculate_similarity(cls, grad_file1: str, grad_file2: str):
+        npy1, npy2 = cls._load_grad_files(grad_file1, grad_file2)
+        same_count = (npy1 == npy2).sum()
+        total_count = npy1.size
+        return same_count, total_count
+    @classmethod
+    def _load_grad_files(cls, grad_file1: str, grad_file2: str):
+        grad1 = load_npy(grad_file1)
+        grad2 = load_npy(grad_file2)
+        if grad1.shape != grad2.shape:
+            raise RuntimeError(f"tensor shape is not equal: {grad_file1}, {grad_file2}")
+        if grad1.dtype != bool:
+            raise TypeError(f"tensor type is not bool: {grad_file1}")
+        if grad2.dtype != bool:
+            raise TypeError(f"tensor type is not bool: {grad_file2}")
+        return grad1, grad2

msprobe/core/grad_probe/utils.py ADDED Viewed

@@ -0,0 +1,52 @@
+import re
+from msprobe.core.grad_probe.constant import GradConst
+from msprobe.core.common.log import logger
+from msprobe.core.common.utils import write_csv
+def data_in_list_target(data, lst):
+    return not lst or len(lst) == 0 or data in lst
+def check_numeral_list_ascend(lst):
+    if any(not isinstance(item, (int, float)) for item in lst):
+        raise Exception("The input list should only contain numbers")
+    if lst != sorted(lst):
+        raise Exception("The input list should be ascending")
+def check_param(param_name):
+    if not re.match(GradConst.PARAM_VALID_PATTERN, param_name):
+        raise RuntimeError("The parameter name contains special characters.")
+def check_str(string, variable_name):
+    if not isinstance(string, str):
+        raise ValueError(f'The variable: "{variable_name}" is not a string.')
+class ListCache(list):
+    threshold = 1000
+    def __init__(self, *args):
+        super().__init__(*args)
+        self._output_file = None
+    def __del__(self):
+        self.flush()
+    def flush(self):
+        if len(self) == 0:
+            return
+        if not self._output_file:
+            logger.warning("dumpfile path is not setted")
+        write_csv(self, self._output_file)
+        logger.info(f"write {len(self)} items to {self._output_file}.")
+        self.clear()
+    def append(self, data):
+        list.append(self, data)
+        if len(self) >= ListCache.threshold:
+            self.flush()
+    def set_output_file(self, output_file):
+        self._output_file = output_file

mindstudio-probe 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl

mindstudio-probe 1.0.1py3-none-any.whl → 1.0.3py3-none-any.whl