PyPI - mindstudio-probe - Versions diffs - 1.0.4__py3-none-any.whl → 1.1.0__py3-none-any.whl - Mend

mindstudio-probe: mindstudio_probe-1.0.4-py3-none-any.whl → mindstudio_probe-1.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (194)

{mindstudio_probe-1.0.4.dist-info → mindstudio_probe-1.1.0.dist-info}/METADATA +1 -1
mindstudio_probe-1.1.0.dist-info/RECORD +287 -0
msprobe/README.md +46 -16
msprobe/__init__.py +16 -1
msprobe/config.json +0 -2
msprobe/core/advisor/advisor.py +8 -8
msprobe/core/advisor/advisor_const.py +6 -7
msprobe/core/advisor/advisor_result.py +12 -12
msprobe/core/common/const.py +64 -3
msprobe/core/common/exceptions.py +2 -2
msprobe/core/common/file_utils.py +54 -9
msprobe/core/common/inplace_op_checker.py +38 -0
msprobe/core/common/inplace_ops.yaml +251 -0
msprobe/core/common/log.py +21 -11
msprobe/core/common/utils.py +153 -167
msprobe/core/common_config.py +18 -25
msprobe/core/compare/acc_compare.py +209 -36
msprobe/core/compare/check.py +102 -17
msprobe/core/compare/compare_cli.py +21 -1
msprobe/core/compare/highlight.py +41 -5
msprobe/core/compare/multiprocessing_compute.py +33 -8
msprobe/core/compare/npy_compare.py +21 -6
msprobe/core/compare/utils.py +82 -48
msprobe/core/data_dump/data_collector.py +31 -32
msprobe/core/data_dump/data_processor/base.py +45 -22
msprobe/core/data_dump/data_processor/factory.py +20 -3
msprobe/core/data_dump/data_processor/mindspore_processor.py +11 -5
msprobe/core/data_dump/data_processor/pytorch_processor.py +24 -7
msprobe/core/data_dump/json_writer.py +63 -42
msprobe/core/data_dump/scope.py +32 -16
msprobe/core/grad_probe/constant.py +4 -0
msprobe/core/grad_probe/grad_compare.py +2 -3
msprobe/core/grad_probe/utils.py +16 -3
msprobe/docs/01.installation.md +19 -9
msprobe/docs/02.config_introduction.md +52 -80
msprobe/docs/03.config_examples.md +3 -13
msprobe/docs/04.acl_config_examples.md +11 -9
msprobe/docs/05.data_dump_PyTorch.md +140 -12
msprobe/docs/06.data_dump_MindSpore.md +47 -5
msprobe/docs/07.accuracy_checker_PyTorch.md +57 -34
msprobe/docs/08.accuracy_checker_online_PyTorch.md +51 -11
msprobe/docs/09.accuracy_checker_MindSpore.md +8 -8
msprobe/docs/10.accuracy_compare_PyTorch.md +181 -99
msprobe/docs/11.accuracy_compare_MindSpore.md +162 -31
msprobe/docs/13.overflow_check_MindSpore.md +1 -1
msprobe/docs/15.free_benchmarking_PyTorch.md +59 -53
msprobe/docs/16.free_benchmarking_MindSpore.md +140 -0
msprobe/docs/17.grad_probe.md +14 -16
msprobe/docs/18.online_dispatch.md +89 -0
msprobe/docs/{FAQ_PyTorch.md → FAQ.md} +22 -10
msprobe/docs/img/ms_dump.png +0 -0
msprobe/docs/img/ms_layer.png +0 -0
msprobe/docs/img/pt_dump.png +0 -0
msprobe/mindspore/__init__.py +1 -0
msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +35 -11
msprobe/mindspore/api_accuracy_checker/api_info.py +7 -0
msprobe/mindspore/cell_processor.py +27 -3
msprobe/mindspore/common/const.py +2 -0
msprobe/mindspore/common/utils.py +18 -2
msprobe/mindspore/compare/distributed_compare.py +9 -22
msprobe/mindspore/compare/layer_mapping.py +146 -0
msprobe/mindspore/compare/modify_mapping.py +107 -0
msprobe/mindspore/compare/ms_compare.py +173 -35
msprobe/mindspore/compare/ms_graph_compare.py +27 -11
msprobe/mindspore/debugger/debugger_config.py +16 -13
msprobe/mindspore/debugger/precision_debugger.py +37 -13
msprobe/mindspore/dump/dump_tool_factory.py +16 -1
msprobe/mindspore/dump/hook_cell/api_registry.py +11 -1
msprobe/mindspore/dump/hook_cell/primitive_hooks.py +206 -0
msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml +82 -10
msprobe/mindspore/dump/hook_cell/wrap_api.py +21 -13
msprobe/mindspore/dump/jit_dump.py +41 -17
msprobe/mindspore/dump/kernel_graph_dump.py +19 -3
msprobe/mindspore/dump/kernel_kbyk_dump.py +19 -4
msprobe/mindspore/free_benchmark/api_pynative_self_check.py +19 -4
msprobe/mindspore/free_benchmark/common/config.py +15 -0
msprobe/mindspore/free_benchmark/common/handler_params.py +15 -0
msprobe/mindspore/free_benchmark/common/utils.py +19 -5
msprobe/mindspore/free_benchmark/decorator/dec_forward.py +16 -2
msprobe/mindspore/free_benchmark/decorator/decorator_factory.py +18 -3
msprobe/mindspore/free_benchmark/handler/base_handler.py +18 -3
msprobe/mindspore/free_benchmark/handler/check_handler.py +18 -3
msprobe/mindspore/free_benchmark/handler/fix_handler.py +15 -0
msprobe/mindspore/free_benchmark/handler/handler_factory.py +18 -3
msprobe/mindspore/free_benchmark/perturbation/add_noise.py +22 -7
msprobe/mindspore/free_benchmark/perturbation/base_perturbation.py +15 -0
msprobe/mindspore/free_benchmark/perturbation/bit_noise.py +22 -7
msprobe/mindspore/free_benchmark/perturbation/exchange_value.py +44 -18
msprobe/mindspore/free_benchmark/perturbation/improve_precision.py +18 -4
msprobe/mindspore/free_benchmark/perturbation/no_change.py +16 -1
msprobe/mindspore/free_benchmark/perturbation/perturbation_factory.py +20 -5
msprobe/mindspore/free_benchmark/self_check_tool_factory.py +15 -0
msprobe/mindspore/grad_probe/global_context.py +18 -8
msprobe/mindspore/overflow_check/kernel_graph_overflow_check.py +20 -4
msprobe/mindspore/overflow_check/overflow_check_tool_factory.py +15 -0
msprobe/mindspore/service.py +42 -123
msprobe/pytorch/__init__.py +20 -1
msprobe/pytorch/api_accuracy_checker/common/config.py +19 -2
msprobe/pytorch/api_accuracy_checker/common/utils.py +53 -21
msprobe/pytorch/api_accuracy_checker/compare/algorithm.py +19 -2
msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +47 -21
msprobe/pytorch/api_accuracy_checker/compare/compare.py +51 -21
msprobe/pytorch/api_accuracy_checker/compare/compare_column.py +23 -6
msprobe/pytorch/api_accuracy_checker/compare/compare_utils.py +28 -8
msprobe/pytorch/api_accuracy_checker/config.yaml +1 -1
msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py +67 -32
msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +26 -5
msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +19 -2
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +51 -125
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py +146 -3
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py +21 -0
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +78 -33
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +27 -4
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/dump_dispatch.py +110 -0
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +36 -11
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/torch_ops_config.yaml +63 -0
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/utils.py +44 -0
msprobe/pytorch/bench_functions/__init__.py +18 -3
msprobe/pytorch/bench_functions/apply_adam_w.py +15 -0
msprobe/pytorch/bench_functions/confusion_transpose.py +15 -0
msprobe/pytorch/bench_functions/fast_gelu.py +15 -0
msprobe/pytorch/bench_functions/layer_norm_eval.py +15 -0
msprobe/pytorch/bench_functions/linear.py +15 -0
msprobe/pytorch/bench_functions/matmul_backward.py +21 -6
msprobe/pytorch/bench_functions/npu_fusion_attention.py +180 -151
msprobe/pytorch/bench_functions/rms_norm.py +15 -0
msprobe/pytorch/bench_functions/rotary_mul.py +28 -9
msprobe/pytorch/bench_functions/scaled_mask_softmax.py +15 -0
msprobe/pytorch/bench_functions/swiglu.py +20 -5
msprobe/pytorch/common/__init__.py +15 -0
msprobe/pytorch/common/log.py +18 -6
msprobe/pytorch/common/parse_json.py +26 -11
msprobe/pytorch/common/utils.py +40 -35
msprobe/pytorch/compare/distributed_compare.py +11 -11
msprobe/pytorch/compare/match.py +15 -0
msprobe/pytorch/compare/pt_compare.py +38 -6
msprobe/pytorch/debugger/debugger_config.py +52 -39
msprobe/pytorch/debugger/precision_debugger.py +72 -24
msprobe/pytorch/free_benchmark/__init__.py +20 -5
msprobe/pytorch/free_benchmark/common/enums.py +28 -0
msprobe/pytorch/free_benchmark/common/params.py +15 -0
msprobe/pytorch/free_benchmark/common/utils.py +17 -1
msprobe/pytorch/free_benchmark/compare/grad_saver.py +28 -7
msprobe/pytorch/free_benchmark/compare/single_benchmark.py +15 -0
msprobe/pytorch/free_benchmark/main.py +19 -4
msprobe/pytorch/free_benchmark/perturbed_layers/base_layer.py +15 -0
msprobe/pytorch/free_benchmark/perturbed_layers/layer_factory.py +19 -4
msprobe/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py +15 -0
msprobe/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py +15 -0
msprobe/pytorch/free_benchmark/perturbed_layers/npu/change_value.py +26 -2
msprobe/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py +15 -0
msprobe/pytorch/free_benchmark/perturbed_layers/npu/no_change.py +15 -0
msprobe/pytorch/free_benchmark/perturbed_layers/npu/npu_base_layser.py +15 -0
msprobe/pytorch/free_benchmark/perturbed_layers/run_cpu.py +15 -0
msprobe/pytorch/free_benchmark/result_handlers/base_handler.py +55 -16
msprobe/pytorch/free_benchmark/result_handlers/check_handler.py +15 -0
msprobe/pytorch/free_benchmark/result_handlers/fix_handler.py +15 -0
msprobe/pytorch/free_benchmark/result_handlers/handler_factory.py +15 -0
msprobe/pytorch/free_benchmark/result_handlers/preheat_handler.py +19 -4
msprobe/pytorch/function_factory.py +17 -2
msprobe/pytorch/functional/module_dump.py +84 -0
msprobe/pytorch/grad_probe/grad_stat_csv.py +2 -2
msprobe/pytorch/hook_module/__init__.py +16 -1
msprobe/pytorch/hook_module/api_registry.py +13 -8
msprobe/pytorch/hook_module/hook_module.py +17 -19
msprobe/pytorch/hook_module/utils.py +4 -6
msprobe/pytorch/hook_module/wrap_aten.py +12 -11
msprobe/pytorch/hook_module/wrap_distributed.py +6 -7
msprobe/pytorch/hook_module/wrap_functional.py +10 -11
msprobe/pytorch/hook_module/wrap_npu_custom.py +9 -17
msprobe/pytorch/hook_module/wrap_tensor.py +4 -6
msprobe/pytorch/hook_module/wrap_torch.py +4 -6
msprobe/pytorch/hook_module/wrap_vf.py +4 -6
msprobe/pytorch/module_processer.py +17 -2
msprobe/pytorch/online_dispatch/compare.py +11 -12
msprobe/pytorch/online_dispatch/single_compare.py +7 -7
msprobe/pytorch/online_dispatch/torch_ops_config.yaml +8 -0
msprobe/pytorch/online_dispatch/utils.py +1 -4
msprobe/pytorch/parse.py +15 -0
msprobe/pytorch/parse_tool/cli.py +5 -6
msprobe/pytorch/parse_tool/lib/compare.py +9 -10
msprobe/pytorch/parse_tool/lib/parse_tool.py +3 -0
msprobe/pytorch/parse_tool/lib/utils.py +28 -24
msprobe/pytorch/parse_tool/lib/visualization.py +1 -1
msprobe/pytorch/pt_config.py +167 -38
msprobe/pytorch/service.py +97 -32
mindstudio_probe-1.0.4.dist-info/RECORD +0 -276
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/ssl_config.py +0 -10
msprobe/pytorch/functional/data_processor.py +0 -0
msprobe/pytorch/functional/dump_module.py +0 -39
{mindstudio_probe-1.0.4.dist-info → mindstudio_probe-1.1.0.dist-info}/LICENSE +0 -0
{mindstudio_probe-1.0.4.dist-info → mindstudio_probe-1.1.0.dist-info}/WHEEL +0 -0
{mindstudio_probe-1.0.4.dist-info → mindstudio_probe-1.1.0.dist-info}/entry_points.txt +0 -0
{mindstudio_probe-1.0.4.dist-info → mindstudio_probe-1.1.0.dist-info}/top_level.txt +0 -0

msprobe/core/data_dump/data_processor/factory.py CHANGED Viewed

@@ -1,3 +1,18 @@
+# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 from msprobe.core.common.const import Const
@@ -34,14 +49,14 @@ class DataProcessorFactory:
     @classmethod
     def register_processors(cls, framework):
         if framework == Const.PT_FRAMEWORK:
-            from .pytorch_processor import (
+            from msprobe.core.data_dump.data_processor.pytorch_processor import (
                 StatisticsDataProcessor as PytorchStatisticsDataProcessor,
                 TensorDataProcessor as PytorchTensorDataProcessor,
                 OverflowCheckDataProcessor as PytorchOverflowCheckDataProcessor,
                 FreeBenchmarkDataProcessor as PytorchFreeBenchmarkDataProcessor,
                 KernelDumpDataProcessor as PytorchKernelDumpDataProcessor
             )
-            from ....pytorch.module_processer import ModuleProcesser
+            from msprobe.pytorch.module_processer import ModuleProcesser
             cls.register_processor(Const.PT_FRAMEWORK, Const.STATISTICS, PytorchStatisticsDataProcessor)
             cls.register_processor(Const.PT_FRAMEWORK, Const.TENSOR, PytorchTensorDataProcessor)
             cls.register_processor(Const.PT_FRAMEWORK, Const.OVERFLOW_CHECK, PytorchOverflowCheckDataProcessor)
@@ -49,11 +64,13 @@ class DataProcessorFactory:
             cls.register_processor(Const.PT_FRAMEWORK, Const.KERNEL_DUMP, PytorchKernelDumpDataProcessor)
             cls.register_module_processor(Const.PT_FRAMEWORK, ModuleProcesser)
         elif framework == Const.MS_FRAMEWORK:
-            from .mindspore_processor import (
+            from msprobe.core.data_dump.data_processor.mindspore_processor import (
                 StatisticsDataProcessor as MindsporeStatisticsDataProcessor,
                 TensorDataProcessor as MindsporeTensorDataProcessor,
                 OverflowCheckDataProcessor as MindsporeOverflowCheckDataProcessor
             )
+            from msprobe.mindspore.cell_processor import CellProcessor
             cls.register_processor(Const.MS_FRAMEWORK, Const.STATISTICS, MindsporeStatisticsDataProcessor)
             cls.register_processor(Const.MS_FRAMEWORK, Const.TENSOR, MindsporeTensorDataProcessor)
             cls.register_processor(Const.MS_FRAMEWORK, Const.OVERFLOW_CHECK, MindsporeOverflowCheckDataProcessor)
+            cls.register_module_processor(Const.MS_FRAMEWORK, CellProcessor)

msprobe/core/data_dump/data_processor/mindspore_processor.py CHANGED Viewed

@@ -17,6 +17,7 @@ import zlib
 import mindspore as ms
 from mindspore import mint, ops
+from mindspore._c_expression.typing import Number
 import numpy as np
 from msprobe.core.common.const import Const
@@ -29,7 +30,7 @@ from msprobe.mindspore.dump.hook_cell.api_registry import api_register
 class MindsporeDataProcessor(BaseDataProcessor):
-    mindspore_special_type = tuple([ms.Tensor])
+    mindspore_special_type = tuple([ms.Tensor, Number])
     def __init__(self, config, data_writer):
         super().__init__(config, data_writer)
@@ -69,13 +70,16 @@ class MindsporeDataProcessor(BaseDataProcessor):
             tensor_stat.mean = np.mean(data_abs).item()
             tensor_stat.norm = np.linalg.norm(data_abs).item()
         else:
-            if data.dtype == ms.bfloat16 or not ops.is_floating_point(data):
+            if not ops.is_floating_point(data):
                 data = data.to(ms.float32)
             api_register.norm_inner_op_set_ori_func()
             get_max_value = api_register.mint_ops_ori_attr.get("max", mint.max)
             get_min_value = api_register.mint_ops_ori_attr.get("min", mint.min)
             get_mean_value = api_register.mint_ops_ori_attr.get("mean", mint.mean)
-            get_norm_value = api_register.functional_ori_attr.get("norm", ops.norm)
+            if hasattr(mint, "norm"):
+                get_norm_value = api_register.mint_ops_ori_attr.get("norm", mint.norm)
+            else:
+                get_norm_value = api_register.functional_ori_attr.get("norm", ops.norm)
             tensor_stat.max = get_max_value(data).item()
             tensor_stat.min = get_min_value(data).item()
             tensor_stat.mean = get_mean_value(data).item()
@@ -90,9 +94,10 @@ class MindsporeDataProcessor(BaseDataProcessor):
         converted_numpy, numpy_type = self._convert_numpy_to_builtin(element)
         if converted_numpy is not element:
             return self._analyze_numpy(converted_numpy, numpy_type)
+        if isinstance(element, Number):
+            return self.analyze_dtype_in_kwargs(element)
         if isinstance(element, ms.Tensor):
             return self._analyze_tensor(element, Const.SEP.join(suffix_stack))
         if isinstance(element, (bool, int, float, str, slice, type(Ellipsis))):
             return self._analyze_builtin(element)
         return {}
@@ -163,7 +168,8 @@ class OverflowCheckDataProcessor(MindsporeDataProcessor):
                 save_tensor_as_npy(tensor, file_path)
             self.real_overflow_nums += 1
             if self.overflow_nums != -1 and self.real_overflow_nums >= self.overflow_nums:
-                logger.info(f"[{Const.TOOL_NAME}] 超过预设溢出次数 当前溢出次数: {self.real_overflow_nums}")
+                logger.info(f"[{Const.TOOL_NAME}] Reached the preset overflow times, "
+                            f"current overflow times: {self.real_overflow_nums}.")
         self.cached_tensors_and_file_paths = {}
     def _analyze_maybe_overflow_tensor(self, tensor_json):

msprobe/core/data_dump/data_processor/pytorch_processor.py CHANGED Viewed

@@ -1,20 +1,35 @@
+# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import zlib
 from dataclasses import asdict
 from typing import List
 import numpy as np
 import torch
-from msprobe.core.common.file_utils import path_len_exceeds_limit, change_mode
+from msprobe.core.common.const import Const
+from msprobe.core.common.file_utils import path_len_exceeds_limit
 from msprobe.core.common.log import logger
-from msprobe.core.common.const import Const, OverflowConst, FileCheckConst
 from msprobe.core.data_dump.data_processor.base import BaseDataProcessor, ModuleBackwardInputsOutputs, \
     ModuleForwardInputsOutputs, TensorStatInfo
-from msprobe.pytorch.free_benchmark import FreeBenchmarkCheck, UnequalRow
 from msprobe.pytorch.common.utils import save_pt, load_pt
+from msprobe.pytorch.free_benchmark import FreeBenchmarkCheck, UnequalRow
+is_gpu = False
 try:
     import torch_npu
-    is_gpu = False
 except ImportError:
     is_gpu = True
@@ -153,7 +168,7 @@ class StatisticsDataProcessor(PytorchDataProcessor):
 class TensorDataProcessor(PytorchDataProcessor):
     def _analyze_tensor(self, tensor, suffix):
         dump_data_name, file_path = self.get_save_file_path(suffix)
-        saved_tensor = tensor.contiguous().detach()
+        saved_tensor = tensor.clone().contiguous().detach()
         save_pt(saved_tensor, file_path)
         single_arg = super()._analyze_tensor(tensor, suffix)
         single_arg.update({"data_name": dump_data_name})
@@ -178,7 +193,6 @@ class OverflowCheckDataProcessor(PytorchDataProcessor):
         if self.overflow_nums == -1:
             return False
         if self.real_overflow_nums >= self.overflow_nums:
-            logger.info(f"[msprobe] 超过预设溢出次数 当前溢出次数: {self.real_overflow_nums}")
             return True
         return False
@@ -219,6 +233,9 @@ class OverflowCheckDataProcessor(PytorchDataProcessor):
             for file_path, tensor in self.cached_tensors_and_file_paths.items():
                 save_pt(tensor, file_path)
             self.real_overflow_nums += 1
+            if self.overflow_nums != -1 and self.real_overflow_nums >= self.overflow_nums:
+                logger.info(f"[{Const.TOOL_NAME}] Reached the preset overflow times, "
+                            f"current overflow times: {self.real_overflow_nums}.")
         self.cached_tensors_and_file_paths = {}
     def _is_support_inf_nan(self):
@@ -243,7 +260,7 @@ class OverflowCheckDataProcessor(PytorchDataProcessor):
         if tensor_json['Max'] is None or tensor_json['Min'] is None:
             return
         self.has_overflow = np.isinf(tensor_json['Max']) or np.isnan(tensor_json['Max']) or \
-            np.isinf(tensor_json['Min']) or np.isnan(tensor_json['Min'])
+                            np.isinf(tensor_json['Min']) or np.isnan(tensor_json['Min'])
     def _analyze_tensor(self, tensor, suffix):
         dump_data_name, file_path = self.get_save_file_path(suffix)

msprobe/core/data_dump/json_writer.py CHANGED Viewed

@@ -1,24 +1,36 @@
-import os
+# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import csv
+import os
-from msprobe.core.common.file_utils import change_mode, FileOpen
-from msprobe.core.common.log import logger
 from msprobe.core.common.const import Const, FileCheckConst
-from msprobe.core.common.file_utils import remove_path, load_json, save_json
+from msprobe.core.common.file_utils import change_mode, FileOpen, save_json
+from msprobe.core.common.log import logger
 class DataWriter:
-    def __init__(self, init_json=None) -> None:
-        self.dump_count = 0
-        self.init_json = init_json
-        self.dump_file_path = None  # os.path.join(dump_dir, DataWriter.dump_json_name)
-        self.stack_file_path = None  # os.path.join(dump_dir, DataWriter.stack_json_name)
-        self.construct_file_path = None  # os.path.join(dump_dir, DataWriter.construct_json_name)
+    def __init__(self) -> None:
+        self.dump_file_path = None
+        self.stack_file_path = None
+        self.construct_file_path = None
         self.free_benchmark_file_path = None
         self.dump_tensor_data_dir = None
-        self.buffer_size = 1000
-        self.cache_data = {Const.DATA: {}}
+        self.flush_size = 1000
+        self.cache_data = {}
         self.cache_stack = {}
         self.cache_construct = {}
@@ -37,18 +49,22 @@ class DataWriter:
         if is_new_file:
             change_mode(file_path, FileCheckConst.DATA_FILE_AUTHORITY)
-    def initialize_json_file(self, **kwargs):
-        kwargs.update({"dump_data_dir": self.dump_tensor_data_dir, Const.DATA: {}})
-        save_json(self.dump_file_path, kwargs)
-        empty_dict = {}
-        remove_path(self.stack_file_path)
-        save_json(self.stack_file_path, empty_dict)
-        remove_path(self.construct_file_path)
-        save_json(self.construct_file_path, empty_dict)
+    def reset_cache(self):
+        self.cache_data = {}
+        self.cache_stack = {}
+        self.cache_construct = {}
-    def update_dump_paths(self, dump_file_path, stack_file_path, construct_file_path, dump_data_dir,
+    def initialize_json_file(self, **kwargs):
+        if not self.cache_data:
+            kwargs.update({"dump_data_dir": self.dump_tensor_data_dir, Const.DATA: {}})
+            self.cache_data = kwargs
+            save_json(self.dump_file_path, self.cache_data, indent=1)
+        if not self.cache_stack:
+            save_json(self.stack_file_path, self.cache_stack, indent=1)
+        if not self.cache_construct:
+            save_json(self.construct_file_path, self.cache_construct, indent=1)
+    def update_dump_paths(self, dump_file_path, stack_file_path, construct_file_path, dump_data_dir,
                           free_benchmark_file_path):
         self.dump_file_path = dump_file_path
         self.stack_file_path = stack_file_path
@@ -56,16 +72,25 @@ class DataWriter:
         self.dump_tensor_data_dir = dump_data_dir
         self.free_benchmark_file_path = free_benchmark_file_path
+    def flush_data_periodically(self):
+        dump_data = self.cache_data.get(Const.DATA)
+        if dump_data and isinstance(dump_data, dict) and len(dump_data) % self.flush_size == 0:
+            self.write_json()
     def update_data(self, new_data):
-        key = next(iter(new_data.keys()))  # assert len(new_data.keys()) == 1
-        if key in self.cache_data[Const.DATA]:
-            self.cache_data[Const.DATA][key].update(new_data[key])
-        else:
-            self.cache_data[Const.DATA].update(new_data)
+        if not isinstance(new_data, dict) or len(new_data.keys()) != 1:
+            logger.warning(f"The data info({new_data}) should be a dict with only one outer key.")
+            return
+        dump_data = self.cache_data.get(Const.DATA)
+        if not isinstance(dump_data, dict):
+            logger.warning(f"The dump data({dump_data}) should be a dict.")
+            return
-    def flush_data_when_buffer_is_full(self):
-        if len(self.cache_data[Const.DATA]) >= self.buffer_size:
-            self.write_data_json(self.dump_file_path)
+        key = next(iter(new_data.keys()))
+        if key in dump_data:
+            dump_data.get(key).update(new_data.get(key))
+        else:
+            dump_data.update(new_data)
     def update_stack(self, new_data):
         self.cache_stack.update(new_data)
@@ -75,14 +100,7 @@ class DataWriter:
     def write_data_json(self, file_path):
         logger.info(f"dump.json is at {os.path.dirname(os.path.dirname(file_path))}. ")
-        if os.path.exists(file_path) and os.path.getsize(file_path) > 0:
-            data_to_write = load_json(file_path)
-        else:
-            self.init_json['data_path'] = self.dump_tensor_data_dir
-            data_to_write = self.init_json
-        data_to_write[Const.DATA].update(self.cache_data[Const.DATA])
-        save_json(file_path, data_to_write, indent=1)
-        self.cache_data[Const.DATA].clear()
+        save_json(file_path, self.cache_data, indent=1)
     def write_stack_info_json(self, file_path):
         save_json(file_path, self.cache_stack, indent=1)
@@ -91,6 +109,9 @@ class DataWriter:
         save_json(file_path, self.cache_construct, indent=1)
     def write_json(self):
-        self.write_data_json(self.dump_file_path)
-        self.write_stack_info_json(self.stack_file_path)
-        self.write_construct_info_json(self.construct_file_path)
+        if self.cache_data:
+            self.write_data_json(self.dump_file_path)
+        if self.cache_stack:
+            self.write_stack_info_json(self.stack_file_path)
+        if self.cache_construct:
+            self.write_construct_info_json(self.construct_file_path)

msprobe/core/data_dump/scope.py CHANGED Viewed

@@ -1,6 +1,22 @@
+# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 from abc import ABC, abstractmethod
-from msprobe.core.common.exceptions import ScopeException
 from msprobe.core.common.const import Const
+from msprobe.core.common.exceptions import ScopeException
 def build_scope(scope_class, scope=None, api_list=None):
@@ -33,6 +49,7 @@ def build_range_scope_according_to_scope_name(scope, api_list):
 class BaseScope(ABC):
     Module_Type_Module = "Module"
     Module_Type_API = "api"
+    module_type = ["Module", "Cell"]
     def __init__(self, scope, api_list):
         scope, api_list = self.rectify_args(scope, api_list)
@@ -81,9 +98,9 @@ class ListScope(BaseScope):
                 f"scope和api_list不可以同时配置，实际配置为scope={scope}, api_list={api_list}.")
         return super(ListScope, ListScope).rectify_args(scope, api_list)
-    def check(self, module_name):
-        if not self.scope or module_name in self.scope:
-            return self.check_api_list(module_name)
+    def check(self, name):
+        if not self.scope or name in self.scope:
+            return self.check_api_list(name)
         return False
@@ -94,7 +111,6 @@ class RangeScope(BaseScope, ABC):
         self.in_scope = False
         self.is_valid = self.check_scope_is_valid()
     @staticmethod
     def rectify_args(scope, api_list):
         scope, api_list = super(RangeScope, RangeScope).rectify_args(scope, api_list)
@@ -104,7 +120,6 @@ class RangeScope(BaseScope, ABC):
             elif len(scope) > 2:
                 raise ScopeException(ScopeException.InvalidScope,
                     f"scope参数指定区间断点，须传入长度为1或2的列表，实际长度为{len(scope)}.")
         return scope, api_list
     @abstractmethod
@@ -123,23 +138,23 @@ class APIRangeScope(RangeScope):
         if not self.scope:
             return True
         scope_start_type = self.scope[0].split(Const.SEP)[0]
-        if scope_start_type == BaseScope.Module_Type_Module:
+        if scope_start_type in BaseScope.module_type:
             return False
         scope_stop_type = self.scope[1].split(Const.SEP)[0]
-        if scope_stop_type == BaseScope.Module_Type_Module:
+        if scope_stop_type in BaseScope.module_type:
             return False
         return True
-    def check(self, api_name):
-        if self.scope and api_name == self.scope[0]:
+    def check(self, name):
+        if self.scope and name == self.scope[0]:
             self.in_scope = True
         if not self.scope or self.in_scope:
-            result = self.check_api_list(api_name)
+            result = self.check_api_list(name)
         else:
             result = False
-        if self.scope and api_name == self.scope[1]:
+        if self.scope and name == self.scope[1]:
             self.in_scope = False
         return result
@@ -150,13 +165,14 @@ class ModuleRangeScope(RangeScope):
         需要用pre_hook和full_backward_hook来精确控制module的开始和结束，
         在这些hook触发时调用begin_module和end_module做区间控制
     """
     def check_scope_is_valid(self):
         if not self.scope:
             return True
         scope_start_type = self.scope[0].split(Const.SEP)[0]
         scope_stop_type = self.scope[1].split(Const.SEP)[0]
-        if scope_start_type == BaseScope.Module_Type_Module and \
-                scope_stop_type == BaseScope.Module_Type_Module:
+        if scope_start_type in BaseScope.module_type and \
+                scope_stop_type in BaseScope.module_type:
             return True
         return False
@@ -172,7 +188,7 @@ class ModuleRangeScope(RangeScope):
         if module_name == self.scope[1]:
             self.in_scope = False
-    def check(self, module_name):
+    def check(self, name):
         if not self.scope or self.in_scope:
-            return self.check_api_list(module_name)
+            return self.check_api_list(name)
         return False

msprobe/core/grad_probe/constant.py CHANGED Viewed

@@ -33,6 +33,10 @@ class GradConst:
     # direction suffix
     DIR_SUFFIX = "dir.npy"
+    # bounds safety
+    BOUNDS_MINIMUM = -2**63
+    BOUNDS_MAXIMUM = 2**63 - 1
     # file safty
     DATA_DIR_AUTHORITY = 0o750
     DATA_FILE_AUTHORITY = 0o640

msprobe/core/grad_probe/grad_compare.py CHANGED Viewed

@@ -2,12 +2,11 @@ import os
 from typing import List
 from tqdm import tqdm
-import pandas as pd
 import matplotlib.pyplot as plt
 from msprobe.core.common.file_utils import create_directory, check_path_before_create, check_file_or_directory_path
 from msprobe.core.common.log import logger
-from msprobe.core.common.file_utils import remove_path, load_npy, write_csv
+from msprobe.core.common.file_utils import remove_path, load_npy, write_csv, read_csv
 from msprobe.core.grad_probe.constant import GradConst
 from msprobe.core.grad_probe.utils import plt_savefig
@@ -21,7 +20,7 @@ class GradComparator:
                 continue
             if not os.path.exists(os.path.join(path2, summary_file)):
                 continue
-            summary_csv = pd.read_csv(os.path.join(path1, summary_file))
+            summary_csv = read_csv(os.path.join(path1, summary_file))
             return summary_csv["param_name"]
         raise RuntimeError("no matched grad_summary.csv for comparison, please dump data in same configuration")

msprobe/core/grad_probe/utils.py CHANGED Viewed

@@ -20,12 +20,25 @@ def check_numeral_list_ascend(lst):
 def check_param(param_name):
     if not re.match(GradConst.PARAM_VALID_PATTERN, param_name):
         raise RuntimeError("The parameter name contains special characters.")
 def check_str(string, variable_name):
     if not isinstance(string, str):
         raise ValueError(f'The variable: "{variable_name}" is not a string.')
+def check_bounds_element(bound):
+    return GradConst.BOUNDS_MINIMUM <= bound and bound <= GradConst.BOUNDS_MAXIMUM
+def check_bounds(bounds):
+    prev = GradConst.BOUNDS_MINIMUM - 1
+    for element in bounds:
+        if not isinstance(element, (int, float)):
+            raise Exception("bounds element is not int or float")
+        if not check_bounds_element(element):
+            raise Exception("bounds element is out of int64 range")
+        if prev >= element:
+            raise Exception("bounds list is not ascending")
+        prev = element
 class ListCache(list):
     threshold = 1000
@@ -50,7 +63,7 @@ class ListCache(list):
         list.append(self, data)
         if len(self) >= ListCache.threshold:
             self.flush()
     def set_output_file(self, output_file):
         self._output_file = output_file

msprobe/docs/01.installation.md CHANGED Viewed

@@ -3,19 +3,20 @@
 推荐使用 [miniconda](https://docs.anaconda.com/miniconda/) 管理环境依赖。
 ```bash
-conda create -n msprobe python=3.8
+conda create -n msprobe python
 conda activate msprobe
 ```
-## 1. 从 PyPI 安装
+## 1 从 PyPI 安装
 ```shell
-pip install mindstudio-probe[==版本号]
+pip install mindstudio-probe
 ```
-## 2. 下载 whl 包安装
+## 2 下载 whl 包安装
 |版本|发布日期|支持 PyTorch 版本|支持 MindSpore 版本|下载链接|校验码|
 |:--:|:--:|:--:|:--:|:--:|:--:|
+|1.0.4|2024.09.09|1.11/2.0/2.1/2.2|2.4.0|[mindstudio_probe-1.0.4-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/msprobe/1.0/mindstudio_probe-1.0.4-py3-none-any.whl)|4e1909566a71a855b356597750c20ee43d964a22b2c2b02ac08312a5def75fd6|
 | 1.0.3 | 2024.08.23 | 1.11/2.0/2.1/2.2 | 2.4.0 | [mindstudio_probe-1.0.3-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/msprobe/1.0/mindstudio_probe-1.0.3-py3-none-any.whl) | 7060cc141a5b98ef770cd9220995d299393f32a61938261e632c7e8b5160bef2 |
 | 1.0.2 | 2024.08.09 | 1.11/2.0/2.1/2.2 | 2.4.0 | [mindstudio_probe-1.0.2-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/msprobe/1.0/mindstudio_probe-1.0.2-py3-none-any.whl) | e4a980e5d98c426ce5ce9842520d9bc031d3b3de621c74b3d59414cc6e238e0e |
 | 1.0.1 | 2024.07.25 | 2.0/2.1/2.2 | 2.4.0 | [mindstudio_probe-1.0.1-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/msprobe/1.0/mindstudio_probe-1.0.1-py3-none-any.whl) | b699e224e4d4e3bcf9412c54fa858a1ee370f0d7a2bc69cb3f1273ac14a6dc82 |
@@ -31,7 +32,7 @@ pip install ./mindstudio_probe-{version}-py3-none-any.whl # 安装whl包
 若覆盖安装，请在命令行末尾添加 `--force-reinstall` 参数。
-## 3. 从源码安装
+## 3 从源码安装
 ```shell
 git clone https://gitee.com/ascend/mstt.git
@@ -40,9 +41,18 @@ cd mstt/debug/accuracy_tools
 pip install setuptools wheel
 python setup.py bdist_wheel
-pip install ./dist/mindstudio_probe*.whl
+cd ./dist
+pip install ./mindstudio_probe*.whl
 ```
+# 历史版本特性
+<table>
+    <tr><th>版本</th><th>特性</th></tr>
+    <tr><td rowspan="2">1.0.3</td><td>【精度预检】<br>1. 落盘数据小；<br>2. 支持随机生成模式和真实数据模式；<br>3. 单 API 测试，排除整网中的累积误差问题。</td></tr>
+    <tr><td>【梯度检测】<br>1. 使用便捷，无需在训练流程里插入代码。<br>2. 可以精准定位问题出现的 step。</td></tr>
+</table>
 # 查看 msprobe 工具信息
 ```bash
@@ -59,7 +69,7 @@ Home-page: https://gitee.com/ascend/mstt/tree/master/debug/accuracy_tools/msprob
 Author: Ascend Team
 Author-email: pmail_mindstudio@huawei.com
 License: Apache License 2.0
-Location: /home/xxx/miniconda3/envs/xxx/lib/python3.8/site-packages/mindstudio_probe-1.0.0-py3.8.egg
+Location: /home/xxx/miniconda3/envs/xxx/lib/python3.x/site-packages/mindstudio_probe-1.0.x-py3.x.egg
 Requires: einops, matplotlib, numpy, openpyxl, pandas, pyOpenSSL, pyyaml, rich, tqdm, twisted, wheel
 Required-by:
 ```
@@ -68,11 +78,11 @@ Required-by:
 ## 1 安装 CANN 包
-1.1 根据 CPU 架构和 NPU 型号选择 toolkit 和 kernal 包，可以参考 [CANN 软件安装指南](https://gitee.com/link?target=https%3A%2F%2Fwww.hiascend.com%2Fdocument%2Fdetail%2Fzh%2Fcanncommercial%2F700%2Fenvdeployment%2Finstg%2Finstg_0001.html)和[昇腾社区](https://www.hiascend.cn/developer/download/community/result?module=cann)。
+1.1 根据 CPU 架构和 NPU 型号选择 toolkit 和 kernel，可以参考 [CANN 软件安装指南](https://gitee.com/link?target=https%3A%2F%2Fwww.hiascend.com%2Fdocument%2Fdetail%2Fzh%2Fcanncommercial%2F700%2Fenvdeployment%2Finstg%2Finstg_0001.html)和[昇腾社区](https://www.hiascend.cn/developer/download/community/result?module=cann)。
 1.2 运行示例
 ```bash
-Ascend-cann-toolkit_x.x.x_linux-aarch64.run --full --install-path={cann_path}
+Ascend-cann-toolkit_x.x.x_linux-xxxx.run --full --install-path={cann_path}
 Ascend-cann-kernels_x.x.x_linux.run --install --install-path={cann_path}
 ```

mindstudio-probe 1.0.4__py3-none-any.whl → 1.1.0__py3-none-any.whl

mindstudio-probe 1.0.4py3-none-any.whl → 1.1.0py3-none-any.whl