mindstudio-probe 1.0.4 → 1.1.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (194)
  1. {mindstudio_probe-1.0.4.dist-info → mindstudio_probe-1.1.0.dist-info}/METADATA +1 -1
  2. mindstudio_probe-1.1.0.dist-info/RECORD +287 -0
  3. msprobe/README.md +46 -16
  4. msprobe/__init__.py +16 -1
  5. msprobe/config.json +0 -2
  6. msprobe/core/advisor/advisor.py +8 -8
  7. msprobe/core/advisor/advisor_const.py +6 -7
  8. msprobe/core/advisor/advisor_result.py +12 -12
  9. msprobe/core/common/const.py +64 -3
  10. msprobe/core/common/exceptions.py +2 -2
  11. msprobe/core/common/file_utils.py +54 -9
  12. msprobe/core/common/inplace_op_checker.py +38 -0
  13. msprobe/core/common/inplace_ops.yaml +251 -0
  14. msprobe/core/common/log.py +21 -11
  15. msprobe/core/common/utils.py +153 -167
  16. msprobe/core/common_config.py +18 -25
  17. msprobe/core/compare/acc_compare.py +209 -36
  18. msprobe/core/compare/check.py +102 -17
  19. msprobe/core/compare/compare_cli.py +21 -1
  20. msprobe/core/compare/highlight.py +41 -5
  21. msprobe/core/compare/multiprocessing_compute.py +33 -8
  22. msprobe/core/compare/npy_compare.py +21 -6
  23. msprobe/core/compare/utils.py +82 -48
  24. msprobe/core/data_dump/data_collector.py +31 -32
  25. msprobe/core/data_dump/data_processor/base.py +45 -22
  26. msprobe/core/data_dump/data_processor/factory.py +20 -3
  27. msprobe/core/data_dump/data_processor/mindspore_processor.py +11 -5
  28. msprobe/core/data_dump/data_processor/pytorch_processor.py +24 -7
  29. msprobe/core/data_dump/json_writer.py +63 -42
  30. msprobe/core/data_dump/scope.py +32 -16
  31. msprobe/core/grad_probe/constant.py +4 -0
  32. msprobe/core/grad_probe/grad_compare.py +2 -3
  33. msprobe/core/grad_probe/utils.py +16 -3
  34. msprobe/docs/01.installation.md +19 -9
  35. msprobe/docs/02.config_introduction.md +52 -80
  36. msprobe/docs/03.config_examples.md +3 -13
  37. msprobe/docs/04.acl_config_examples.md +11 -9
  38. msprobe/docs/05.data_dump_PyTorch.md +140 -12
  39. msprobe/docs/06.data_dump_MindSpore.md +47 -5
  40. msprobe/docs/07.accuracy_checker_PyTorch.md +57 -34
  41. msprobe/docs/08.accuracy_checker_online_PyTorch.md +51 -11
  42. msprobe/docs/09.accuracy_checker_MindSpore.md +8 -8
  43. msprobe/docs/10.accuracy_compare_PyTorch.md +181 -99
  44. msprobe/docs/11.accuracy_compare_MindSpore.md +162 -31
  45. msprobe/docs/13.overflow_check_MindSpore.md +1 -1
  46. msprobe/docs/15.free_benchmarking_PyTorch.md +59 -53
  47. msprobe/docs/16.free_benchmarking_MindSpore.md +140 -0
  48. msprobe/docs/17.grad_probe.md +14 -16
  49. msprobe/docs/18.online_dispatch.md +89 -0
  50. msprobe/docs/{FAQ_PyTorch.md → FAQ.md} +22 -10
  51. msprobe/docs/img/ms_dump.png +0 -0
  52. msprobe/docs/img/ms_layer.png +0 -0
  53. msprobe/docs/img/pt_dump.png +0 -0
  54. msprobe/mindspore/__init__.py +1 -0
  55. msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +35 -11
  56. msprobe/mindspore/api_accuracy_checker/api_info.py +7 -0
  57. msprobe/mindspore/cell_processor.py +27 -3
  58. msprobe/mindspore/common/const.py +2 -0
  59. msprobe/mindspore/common/utils.py +18 -2
  60. msprobe/mindspore/compare/distributed_compare.py +9 -22
  61. msprobe/mindspore/compare/layer_mapping.py +146 -0
  62. msprobe/mindspore/compare/modify_mapping.py +107 -0
  63. msprobe/mindspore/compare/ms_compare.py +173 -35
  64. msprobe/mindspore/compare/ms_graph_compare.py +27 -11
  65. msprobe/mindspore/debugger/debugger_config.py +16 -13
  66. msprobe/mindspore/debugger/precision_debugger.py +37 -13
  67. msprobe/mindspore/dump/dump_tool_factory.py +16 -1
  68. msprobe/mindspore/dump/hook_cell/api_registry.py +11 -1
  69. msprobe/mindspore/dump/hook_cell/primitive_hooks.py +206 -0
  70. msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml +82 -10
  71. msprobe/mindspore/dump/hook_cell/wrap_api.py +21 -13
  72. msprobe/mindspore/dump/jit_dump.py +41 -17
  73. msprobe/mindspore/dump/kernel_graph_dump.py +19 -3
  74. msprobe/mindspore/dump/kernel_kbyk_dump.py +19 -4
  75. msprobe/mindspore/free_benchmark/api_pynative_self_check.py +19 -4
  76. msprobe/mindspore/free_benchmark/common/config.py +15 -0
  77. msprobe/mindspore/free_benchmark/common/handler_params.py +15 -0
  78. msprobe/mindspore/free_benchmark/common/utils.py +19 -5
  79. msprobe/mindspore/free_benchmark/decorator/dec_forward.py +16 -2
  80. msprobe/mindspore/free_benchmark/decorator/decorator_factory.py +18 -3
  81. msprobe/mindspore/free_benchmark/handler/base_handler.py +18 -3
  82. msprobe/mindspore/free_benchmark/handler/check_handler.py +18 -3
  83. msprobe/mindspore/free_benchmark/handler/fix_handler.py +15 -0
  84. msprobe/mindspore/free_benchmark/handler/handler_factory.py +18 -3
  85. msprobe/mindspore/free_benchmark/perturbation/add_noise.py +22 -7
  86. msprobe/mindspore/free_benchmark/perturbation/base_perturbation.py +15 -0
  87. msprobe/mindspore/free_benchmark/perturbation/bit_noise.py +22 -7
  88. msprobe/mindspore/free_benchmark/perturbation/exchange_value.py +44 -18
  89. msprobe/mindspore/free_benchmark/perturbation/improve_precision.py +18 -4
  90. msprobe/mindspore/free_benchmark/perturbation/no_change.py +16 -1
  91. msprobe/mindspore/free_benchmark/perturbation/perturbation_factory.py +20 -5
  92. msprobe/mindspore/free_benchmark/self_check_tool_factory.py +15 -0
  93. msprobe/mindspore/grad_probe/global_context.py +18 -8
  94. msprobe/mindspore/overflow_check/kernel_graph_overflow_check.py +20 -4
  95. msprobe/mindspore/overflow_check/overflow_check_tool_factory.py +15 -0
  96. msprobe/mindspore/service.py +42 -123
  97. msprobe/pytorch/__init__.py +20 -1
  98. msprobe/pytorch/api_accuracy_checker/common/config.py +19 -2
  99. msprobe/pytorch/api_accuracy_checker/common/utils.py +53 -21
  100. msprobe/pytorch/api_accuracy_checker/compare/algorithm.py +19 -2
  101. msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +47 -21
  102. msprobe/pytorch/api_accuracy_checker/compare/compare.py +51 -21
  103. msprobe/pytorch/api_accuracy_checker/compare/compare_column.py +23 -6
  104. msprobe/pytorch/api_accuracy_checker/compare/compare_utils.py +28 -8
  105. msprobe/pytorch/api_accuracy_checker/config.yaml +1 -1
  106. msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py +67 -32
  107. msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +26 -5
  108. msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +19 -2
  109. msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +51 -125
  110. msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py +146 -3
  111. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py +21 -0
  112. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +78 -33
  113. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +27 -4
  114. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/dump_dispatch.py +110 -0
  115. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +36 -11
  116. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/torch_ops_config.yaml +63 -0
  117. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/utils.py +44 -0
  118. msprobe/pytorch/bench_functions/__init__.py +18 -3
  119. msprobe/pytorch/bench_functions/apply_adam_w.py +15 -0
  120. msprobe/pytorch/bench_functions/confusion_transpose.py +15 -0
  121. msprobe/pytorch/bench_functions/fast_gelu.py +15 -0
  122. msprobe/pytorch/bench_functions/layer_norm_eval.py +15 -0
  123. msprobe/pytorch/bench_functions/linear.py +15 -0
  124. msprobe/pytorch/bench_functions/matmul_backward.py +21 -6
  125. msprobe/pytorch/bench_functions/npu_fusion_attention.py +180 -151
  126. msprobe/pytorch/bench_functions/rms_norm.py +15 -0
  127. msprobe/pytorch/bench_functions/rotary_mul.py +28 -9
  128. msprobe/pytorch/bench_functions/scaled_mask_softmax.py +15 -0
  129. msprobe/pytorch/bench_functions/swiglu.py +20 -5
  130. msprobe/pytorch/common/__init__.py +15 -0
  131. msprobe/pytorch/common/log.py +18 -6
  132. msprobe/pytorch/common/parse_json.py +26 -11
  133. msprobe/pytorch/common/utils.py +40 -35
  134. msprobe/pytorch/compare/distributed_compare.py +11 -11
  135. msprobe/pytorch/compare/match.py +15 -0
  136. msprobe/pytorch/compare/pt_compare.py +38 -6
  137. msprobe/pytorch/debugger/debugger_config.py +52 -39
  138. msprobe/pytorch/debugger/precision_debugger.py +72 -24
  139. msprobe/pytorch/free_benchmark/__init__.py +20 -5
  140. msprobe/pytorch/free_benchmark/common/enums.py +28 -0
  141. msprobe/pytorch/free_benchmark/common/params.py +15 -0
  142. msprobe/pytorch/free_benchmark/common/utils.py +17 -1
  143. msprobe/pytorch/free_benchmark/compare/grad_saver.py +28 -7
  144. msprobe/pytorch/free_benchmark/compare/single_benchmark.py +15 -0
  145. msprobe/pytorch/free_benchmark/main.py +19 -4
  146. msprobe/pytorch/free_benchmark/perturbed_layers/base_layer.py +15 -0
  147. msprobe/pytorch/free_benchmark/perturbed_layers/layer_factory.py +19 -4
  148. msprobe/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py +15 -0
  149. msprobe/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py +15 -0
  150. msprobe/pytorch/free_benchmark/perturbed_layers/npu/change_value.py +26 -2
  151. msprobe/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py +15 -0
  152. msprobe/pytorch/free_benchmark/perturbed_layers/npu/no_change.py +15 -0
  153. msprobe/pytorch/free_benchmark/perturbed_layers/npu/npu_base_layser.py +15 -0
  154. msprobe/pytorch/free_benchmark/perturbed_layers/run_cpu.py +15 -0
  155. msprobe/pytorch/free_benchmark/result_handlers/base_handler.py +55 -16
  156. msprobe/pytorch/free_benchmark/result_handlers/check_handler.py +15 -0
  157. msprobe/pytorch/free_benchmark/result_handlers/fix_handler.py +15 -0
  158. msprobe/pytorch/free_benchmark/result_handlers/handler_factory.py +15 -0
  159. msprobe/pytorch/free_benchmark/result_handlers/preheat_handler.py +19 -4
  160. msprobe/pytorch/function_factory.py +17 -2
  161. msprobe/pytorch/functional/module_dump.py +84 -0
  162. msprobe/pytorch/grad_probe/grad_stat_csv.py +2 -2
  163. msprobe/pytorch/hook_module/__init__.py +16 -1
  164. msprobe/pytorch/hook_module/api_registry.py +13 -8
  165. msprobe/pytorch/hook_module/hook_module.py +17 -19
  166. msprobe/pytorch/hook_module/utils.py +4 -6
  167. msprobe/pytorch/hook_module/wrap_aten.py +12 -11
  168. msprobe/pytorch/hook_module/wrap_distributed.py +6 -7
  169. msprobe/pytorch/hook_module/wrap_functional.py +10 -11
  170. msprobe/pytorch/hook_module/wrap_npu_custom.py +9 -17
  171. msprobe/pytorch/hook_module/wrap_tensor.py +4 -6
  172. msprobe/pytorch/hook_module/wrap_torch.py +4 -6
  173. msprobe/pytorch/hook_module/wrap_vf.py +4 -6
  174. msprobe/pytorch/module_processer.py +17 -2
  175. msprobe/pytorch/online_dispatch/compare.py +11 -12
  176. msprobe/pytorch/online_dispatch/single_compare.py +7 -7
  177. msprobe/pytorch/online_dispatch/torch_ops_config.yaml +8 -0
  178. msprobe/pytorch/online_dispatch/utils.py +1 -4
  179. msprobe/pytorch/parse.py +15 -0
  180. msprobe/pytorch/parse_tool/cli.py +5 -6
  181. msprobe/pytorch/parse_tool/lib/compare.py +9 -10
  182. msprobe/pytorch/parse_tool/lib/parse_tool.py +3 -0
  183. msprobe/pytorch/parse_tool/lib/utils.py +28 -24
  184. msprobe/pytorch/parse_tool/lib/visualization.py +1 -1
  185. msprobe/pytorch/pt_config.py +167 -38
  186. msprobe/pytorch/service.py +97 -32
  187. mindstudio_probe-1.0.4.dist-info/RECORD +0 -276
  188. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/ssl_config.py +0 -10
  189. msprobe/pytorch/functional/data_processor.py +0 -0
  190. msprobe/pytorch/functional/dump_module.py +0 -39
  191. {mindstudio_probe-1.0.4.dist-info → mindstudio_probe-1.1.0.dist-info}/LICENSE +0 -0
  192. {mindstudio_probe-1.0.4.dist-info → mindstudio_probe-1.1.0.dist-info}/WHEEL +0 -0
  193. {mindstudio_probe-1.0.4.dist-info → mindstudio_probe-1.1.0.dist-info}/entry_points.txt +0 -0
  194. {mindstudio_probe-1.0.4.dist-info → mindstudio_probe-1.1.0.dist-info}/top_level.txt +0 -0
msprobe/pytorch/pt_config.py

@@ -1,12 +1,33 @@
-import json
+# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import os

-from msprobe.core.common_config import CommonConfig, BaseConfig
-from msprobe.core.common.file_utils import FileOpen
 from msprobe.core.common.const import Const
-from msprobe.pytorch.hook_module.utils import get_ops
+from msprobe.core.common.exceptions import MsprobeException
+from msprobe.core.common.file_utils import FileOpen, load_json
+from msprobe.core.common.log import logger
+from msprobe.core.common_config import BaseConfig, CommonConfig
 from msprobe.core.grad_probe.constant import level_adp
-from msprobe.core.grad_probe.utils import check_numeral_list_ascend
+from msprobe.core.grad_probe.utils import check_bounds
+from msprobe.pytorch.free_benchmark.common.enums import (
+    DeviceType,
+    HandlerType,
+    PytorchFreeBenchmarkConst,
+)
+from msprobe.pytorch.hook_module.utils import get_ops


 class TensorConfig(BaseConfig):
@@ -16,7 +37,7 @@ class TensorConfig(BaseConfig):
         self.nfs_path = json_config.get("nfs_path", "")
         self.host = json_config.get("host", "")
         self.port = json_config.get("port", -1)
-        self.tls_path = json_config.get("tls_path", "")
+        self.tls_path = json_config.get("tls_path", "./")
         self.check_config()
         self._check_file_format()
         self._check_tls_path_config()
@@ -26,13 +47,8 @@ class TensorConfig(BaseConfig):
             raise Exception("file_format is invalid")

     def _check_tls_path_config(self):
-        if self.tls_path:
-            if not os.path.exists(self.tls_path):
-                raise Exception("tls_path: %s does not exist" % self.tls_path)
-            if not os.path.exists(os.path.join(self.tls_path, "client.key")):
-                raise Exception("tls_path does not contain client.key")
-            if not os.path.exists(os.path.join(self.tls_path, "client.crt")):
-                raise Exception("tls_path does not contain client.crt")
+        if self.tls_path and not os.path.exists(self.tls_path):
+            raise Exception("tls_path: %s does not exist" % self.tls_path)


 class StatisticsConfig(BaseConfig):
@@ -61,23 +77,142 @@ class OverflowCheckConfig(BaseConfig):


 class FreeBenchmarkCheckConfig(BaseConfig):
+
     def __init__(self, json_config):
         super().__init__(json_config)
-        self.fuzz_device = json_config.get("fuzz_device")
-        self.pert_mode = json_config.get("pert_mode")
-        self.handler_type = json_config.get("handler_type")
-        self.fuzz_level = json_config.get("fuzz_level")
-        self.fuzz_stage = json_config.get("fuzz_stage")
-        self.if_preheat = json_config.get("if_preheat")
-        self.preheat_step = json_config.get("preheat_step")
-        self.max_sample = json_config.get("max_sample")
+        self.fuzz_device = json_config.get("fuzz_device", PytorchFreeBenchmarkConst.DEFAULT_DEVICE)
+        self.pert_mode = json_config.get("pert_mode", PytorchFreeBenchmarkConst.DEFAULT_MODE)
+        self.handler_type = json_config.get("handler_type", PytorchFreeBenchmarkConst.DEFAULT_HANDLER)
+        self.fuzz_level = json_config.get("fuzz_level", PytorchFreeBenchmarkConst.DEFAULT_FUZZ_LEVEL)
+        self.fuzz_stage = json_config.get("fuzz_stage", PytorchFreeBenchmarkConst.DEFAULT_FUZZ_STAGE)
+        self.if_preheat = json_config.get("if_preheat", False)
+        self.preheat_step = json_config.get("preheat_step", PytorchFreeBenchmarkConst.DEFAULT_PREHEAT_STEP)
+        self.max_sample = json_config.get("max_sample", PytorchFreeBenchmarkConst.DEFAULT_PREHEAT_STEP)
         self.check_freebenchmark_config()

     def check_freebenchmark_config(self):
-        if self.if_preheat and self.handler_type == "fix":
-            raise Exception("Preheating is not supported in fix handler type")
-        if self.preheat_step and self.preheat_step == 0:
-            raise Exception("preheat_step cannot be 0")
+        self._check_pert_mode()
+        self._check_fuzz_device()
+        self._check_handler_type()
+        self._check_fuzz_stage()
+        self._check_fuzz_level()
+        self._check_if_preheat()
+        if self.handler_type == HandlerType.FIX:
+            self._check_fix_config()
+        if self.if_preheat:
+            self._check_preheat_config()
+
+    def _check_pert_mode(self):
+        if self.pert_mode not in PytorchFreeBenchmarkConst.PERTURBATION_MODE_LIST:
+            msg = (
+                f"pert_mode is invalid, it should be one of"
+                f" {PytorchFreeBenchmarkConst.PERTURBATION_MODE_LIST}"
+            )
+            logger.error_log_with_exp(
+                msg, MsprobeException(MsprobeException.INVALID_PARAM_ERROR, msg)
+            )
+
+    def _check_fuzz_device(self):
+        if self.fuzz_device not in PytorchFreeBenchmarkConst.DEVICE_LIST:
+            msg = (
+                f"fuzz_device is invalid, it should be one of"
+                f" {PytorchFreeBenchmarkConst.DEVICE_LIST}"
+            )
+            logger.error_log_with_exp(
+                msg, MsprobeException(MsprobeException.INVALID_PARAM_ERROR, msg)
+            )
+        if (self.fuzz_device == DeviceType.CPU) ^ (
+            self.pert_mode in PytorchFreeBenchmarkConst.CPU_MODE_LIST
+        ):
+            msg = (
+                f"You neet to and can only set fuzz_device as {DeviceType.CPU} "
+                f"when pert_mode in {PytorchFreeBenchmarkConst.CPU_MODE_LIST}"
+            )
+            logger.error_log_with_exp(
+                msg, MsprobeException(MsprobeException.INVALID_PARAM_ERROR, msg)
+            )
+
+    def _check_handler_type(self):
+        if self.handler_type not in PytorchFreeBenchmarkConst.HANDLER_LIST:
+            msg = (
+                f"handler_type is invalid, it should be one of"
+                f" {PytorchFreeBenchmarkConst.HANDLER_LIST}"
+            )
+            logger.error_log_with_exp(
+                msg, MsprobeException(MsprobeException.INVALID_PARAM_ERROR, msg)
+            )
+
+    def _check_fuzz_stage(self):
+        if self.fuzz_stage not in PytorchFreeBenchmarkConst.FUZZ_STAGE_LIST:
+            msg = (
+                f"fuzz_stage is invalid, it should be one of"
+                f" {PytorchFreeBenchmarkConst.FUZZ_STAGE_LIST}"
+            )
+            logger.error_log_with_exp(
+                msg, MsprobeException(MsprobeException.INVALID_PARAM_ERROR, msg)
+            )
+
+    def _check_fuzz_level(self):
+        if self.fuzz_level not in PytorchFreeBenchmarkConst.FUZZ_LEVEL_LIST:
+            msg = (
+                f"fuzz_level is invalid, it should be one of"
+                f" {PytorchFreeBenchmarkConst.FUZZ_LEVEL_LIST}"
+            )
+            logger.error_log_with_exp(
+                msg, MsprobeException(MsprobeException.INVALID_PARAM_ERROR, msg)
+            )
+
+    def _check_if_preheat(self):
+        if not isinstance(self.if_preheat, bool):
+            msg = "if_preheat is invalid, it should be a boolean"
+            logger.error_log_with_exp(
+                msg, MsprobeException(MsprobeException.INVALID_PARAM_ERROR, msg)
+            )
+
+    def _check_preheat_config(self):
+        if not isinstance(self.preheat_step, int):
+            msg = "preheat_step is invalid, it should be an integer"
+            logger.error_log_with_exp(
+                msg, MsprobeException(MsprobeException.INVALID_PARAM_ERROR, msg)
+            )
+        if self.preheat_step <= 0:
+            msg = "preheat_step must be greater than 0"
+            logger.error_log_with_exp(
+                msg, MsprobeException(MsprobeException.INVALID_PARAM_ERROR, msg)
+            )
+        if not isinstance(self.max_sample, int):
+            msg = "max_sample is invalid, it should be an integer"
+            logger.error_log_with_exp(
+                msg, MsprobeException(MsprobeException.INVALID_PARAM_ERROR, msg)
+            )
+        if self.max_sample <= 0:
+            msg = "max_sample must be greater than 0"
+            logger.error_log_with_exp(
+                msg, MsprobeException(MsprobeException.INVALID_PARAM_ERROR, msg)
+            )
+
+    def _check_fix_config(self):
+        if self.if_preheat:
+            msg = f"Preheating is not supported for {HandlerType.FIX} handler type"
+            logger.error_log_with_exp(
+                msg, MsprobeException(MsprobeException.INVALID_PARAM_ERROR, msg)
+            )
+        if self.fuzz_stage not in PytorchFreeBenchmarkConst.FIX_STAGE_LIST:
+            msg = (
+                f"The fuzz_stage when opening {HandlerType.FIX} handler must be one of "
+                f"{PytorchFreeBenchmarkConst.FIX_STAGE_LIST}"
+            )
+            logger.error_log_with_exp(
+                msg, MsprobeException(MsprobeException.INVALID_PARAM_ERROR, msg)
+            )
+        if self.pert_mode not in PytorchFreeBenchmarkConst.FIX_MODE_LIST:
+            msg = (
+                f"The pert_mode when opening {HandlerType.FIX} handler must be one of "
+                f"{PytorchFreeBenchmarkConst.FIX_MODE_LIST}"
+            )
+            logger.error_log_with_exp(
+                msg, MsprobeException(MsprobeException.INVALID_PARAM_ERROR, msg)
+            )


 class RunUTConfig(BaseConfig):
@@ -93,7 +228,7 @@ class RunUTConfig(BaseConfig):
         self.host = json_config.get("host", "")
         self.port = json_config.get("port", -1)
         self.rank_list = json_config.get("rank_list", Const.DEFAULT_LIST)
-        self.tls_path = json_config.get("tls_path", "")
+        self.tls_path = json_config.get("tls_path", "./")
         self.check_run_ut_config()

     @classmethod
@@ -118,13 +253,8 @@ class RunUTConfig(BaseConfig):

     @classmethod
     def check_tls_path_config(cls, tls_path):
-        if tls_path:
-            if not os.path.exists(tls_path):
-                raise Exception("tls_path: %s does not exist" % tls_path)
-            if not os.path.exists(os.path.join(tls_path, "server.key")):
-                raise Exception("tls_path does not contain server.key")
-            if not os.path.exists(os.path.join(tls_path, "server.crt")):
-                raise Exception("tls_path does not contain server.crt")
+        if tls_path and not os.path.exists(tls_path):
+            raise Exception("tls_path: %s does not exist" % tls_path)

     def check_run_ut_config(self):
         RunUTConfig.check_filter_list_config(Const.WHITE_LIST, self.white_list)
@@ -141,13 +271,13 @@ class GradToolConfig(BaseConfig):
         self.param_list = json_config.get("param_list", [])
         self.bounds = json_config.get("bounds", [-1, 0, 1])
         self._check_config()
-
+
     def _check_config(self):
         if self.grad_level not in level_adp.keys():
             raise Exception(f"grad_level must be one of {level_adp.keys()}")
         if not isinstance(self.param_list, list):
             raise Exception(f"param_list must be a list")
-        check_numeral_list_ascend(self.bounds)
+        check_bounds(self.bounds)


 def parse_task_config(task, json_config):
@@ -178,10 +308,9 @@ def parse_json_config(json_file_path, task):
     if not json_file_path:
         config_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
         json_file_path = os.path.join(config_dir, "config.json")
-    with FileOpen(json_file_path, 'r') as file:
-        json_config = json.load(file)
+    json_config = load_json(json_file_path)
     common_config = CommonConfig(json_config)
-    if task and task in Const.TASK_LIST:
+    if task:
         task_config = parse_task_config(task, json_config)
     else:
         task_config = parse_task_config(common_config.task, json_config)
msprobe/pytorch/service.py

@@ -1,3 +1,18 @@
+# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import functools
 import os

@@ -6,6 +21,7 @@ import torch
 from msprobe.core.common.const import Const
 from msprobe.core.common.exceptions import DistributedNotInitializedError, MsprobeException
 from msprobe.core.common.file_utils import create_directory
+from msprobe.core.common.utils import print_tools_ends_info
 from msprobe.core.data_dump.data_collector import build_data_collector
 from msprobe.core.data_dump.data_processor.base import ModuleForwardInputsOutputs, ModuleBackwardInputsOutputs
 from msprobe.core.data_dump.scope import BaseScope
@@ -16,7 +32,10 @@ from msprobe.pytorch.hook_module.api_registry import api_register
 from msprobe.pytorch.hook_module.hook_module import HOOKModule
 from msprobe.pytorch.module_processer import ModuleProcesser
 from msprobe.pytorch.api_accuracy_checker.common.utils import ApiData
+
 torch_version_above_or_equal_2 = torch.__version__.split('+')[0] >= '2.0'
+if torch_version_above_or_equal_2:
+    from msprobe.pytorch.api_accuracy_checker.tensor_transport_layer.dump_dispatch import run_ut_dispatch

 HookFn = namedtuple('hookFn', ['pre_hook', 'forward_hook', 'backward_hook', 'forward_hook_torch_version_below_2'])

@@ -32,6 +51,7 @@ class Service:
         self.first_start = True
         self.current_rank = None
         self.dump_iter_dir = None
+        self.should_stop_service = False
         self.attl = None

     @staticmethod
@@ -39,14 +59,29 @@
         logger.info_on_rank_0("Data needed ends here.")
         api_register.api_originality()

+    @staticmethod
+    def is_registered_backward_hook(module):
+        if hasattr(module, '_backward_hooks') and \
+                len(module._backward_hooks) > 0 and \
+                module._is_full_backward_hook is False:
+            return True
+        return False
+
+    def check_register_full_backward_hook(self, module):
+        if self.is_registered_backward_hook(module):
+            module._backward_hooks.clear()
+            module._is_full_backward_hook = None
+            logger.warning("Found deprecated backward hooks. Removing them and switching to full backward hooks.")
+
     def build_hook(self, module_type, name):
         def pre_hook(api_or_module_name, module, args, kwargs):
+            if not self.should_execute_hook():
+                return args, kwargs
+
             if module_type == BaseScope.Module_Type_Module:
                 api_or_module_name = module.mindstudio_reserved_name
             self.data_collector.update_api_or_module_name(api_or_module_name)

-            if not self.switch:
-                return args, kwargs
             if self.config.online_run_ut:
                 return None, None
             if self.data_collector:
@@ -55,13 +90,13 @@ class Service:
                 return args, kwargs

         def forward_hook(api_or_module_name, module, args, kwargs, output):
+            if not self.should_execute_hook():
+                return None
+
             if module_type == BaseScope.Module_Type_Module:
                 api_or_module_name = module.mindstudio_reserved_name
             self.data_collector.update_api_or_module_name(api_or_module_name)

-            if not self.switch:
-                return None
-
             if self.config.online_run_ut:
                 if self.data_collector.scope and not self.data_collector.scope.check(api_or_module_name):
                     return None
@@ -80,18 +115,14 @@ class Service:
                 return forward_hook(api_or_module_name, module, args, {}, output)

         def backward_hook(api_or_module_name, module, grad_input, grad_output):
+            if not self.should_execute_hook():
+                return
+
             if module_type == BaseScope.Module_Type_Module:
                 api_or_module_name = module.mindstudio_reserved_name
             self.data_collector.update_api_or_module_name(api_or_module_name)

-            if not self.switch:
-                return
-
             if self.config.online_run_ut:
-                if self.data_collector.scope and not self.data_collector.scope.check(api_or_module_name):
-                    return
-                api_data = ApiData(name[:-1], grad_input, {}, grad_output, self.current_iter, self.current_rank)
-                self.attl_send(api_data)
                 return

             if self.data_collector:
@@ -105,26 +136,15 @@ class Service:
         pre_forward_hook_fn = functools.partial(pre_hook, forward_name_template)
         forward_hook_fn = functools.partial(forward_hook, forward_name_template)
         backward_hook_fn = functools.partial(backward_hook, backward_name_template)
-        forward_hook_torch_version_below_2_fn = functools.partial(forward_hook_torch_version_below_2, forward_name_template)
+        forward_hook_torch_version_below_2_fn = functools.partial(forward_hook_torch_version_below_2,
+                                                                  forward_name_template)
         return HookFn(pre_forward_hook_fn, forward_hook_fn, backward_hook_fn, forward_hook_torch_version_below_2_fn)

-    def step(self):
-        self.current_iter += 1
-        self.data_collector.update_iter(self.current_iter)
-
-        ModuleProcesser.reset_module_stats()
-        HOOKModule.reset_module_stats()
-
     def start(self, model, api_origin=False):
-        self.model = model
-        if self.config.step and self.current_iter > max(self.config.step):
-            if self.config.online_run_ut:
-                # send stop signal if online_run_ut
-                self.attl_stop()
-            self.stop()
-            raise Exception("msprobe: exit after iteration {}".format(max(self.config.step)))
-        if self.config.step and self.current_iter not in self.config.step:
+        if self.need_stop_service():
             return
+
+        self.model = model
         if self.first_start:
             try:
                 self.current_rank = get_rank_if_initialized()
@@ -138,6 +158,8 @@ class Service:
         self.first_start = False
         if api_origin:
             api_register.api_modularity()
+        if self.config.online_run_ut and torch_version_above_or_equal_2:
+            run_ut_dispatch(self.attl, True)
         self.switch = True
         logger.info_on_rank_0(f"Dump switch is turned on at step {self.current_iter}. ")
         if self.config.level != "L2" and not self.config.online_run_ut:
@@ -145,6 +167,8 @@ class Service:
             logger.info_on_rank_0(f"Dump data will be saved in {self.dump_iter_dir}.")

     def stop(self):
+        if self.should_stop_service:
+            return
         if self.config.level == "L2":
             return
         if self.config.step and self.current_iter not in self.config.step:
@@ -152,10 +176,47 @@ class Service:
         if self.config.rank and self.current_rank not in self.config.rank:
             return
         self.switch = False
-        if self.config.online_run_ut:
+        if self.config.online_run_ut and torch_version_above_or_equal_2:
+            run_ut_dispatch(self.attl, False)
             return
         self.data_collector.write_json()

+    def step(self):
+        if self.should_stop_service:
+            return
+        self.current_iter += 1
+        self.data_collector.update_iter(self.current_iter)
+
+        ModuleProcesser.reset_module_stats()
+        HOOKModule.reset_module_stats()
+        self.data_collector.data_writer.reset_cache()
+
+    def need_stop_service(self):
+        if self.should_stop_service:
+            return True
+        end_service = self.config.step and self.current_iter > max(self.config.step) or \
+            self.data_collector and self.data_collector.data_processor.is_terminated
+        if end_service:
+            if self.config.online_run_ut:
+                # send stop signal if online_run_ut
+                self.attl_stop()
+            if self.config.level in [Const.LEVEL_L1, Const.LEVEL_L2, Const.LEVEL_MIX]:
+                api_register.api_originality()
+            self.switch = False
+            self.should_stop_service = True
+            print_tools_ends_info()
+            return True
+        if self.config.step and self.current_iter not in self.config.step:
+            return True
+        return False
+
+    def should_execute_hook(self):
+        if not self.switch:
+            return False
+        if self.data_collector and self.data_collector.data_processor.is_terminated:
+            return False
+        return True
+
     def create_dirs(self):
         create_directory(self.config.dump_path)
         self.dump_iter_dir = os.path.join(self.config.dump_path, f"step{self.current_iter}")
@@ -187,14 +248,16 @@ class Service:
             prefix = BaseScope.Module_Type_Module + Const.SEP + name + Const.SEP + \
                      module.__class__.__name__ + Const.SEP

-            pre_forward_hook, forward_hook, backward_hook, forward_hook_torch_version_below_2 \
-                = self.build_hook(BaseScope.Module_Type_Module, prefix)
+            pre_forward_hook, forward_hook, backward_hook, forward_hook_torch_version_below_2 = self.build_hook(
+                BaseScope.Module_Type_Module, prefix)
             if torch_version_above_or_equal_2:
                 module.register_forward_hook(forward_hook, with_kwargs=True)
             else:
+                self.check_register_full_backward_hook(module)
                 module.register_full_backward_hook(
                     self.module_processor.node_hook(prefix + Const.BACKWARD, Const.STOP))
                 module.register_forward_hook(forward_hook_torch_version_below_2)
+                self.check_register_full_backward_hook(module)
                 module.register_full_backward_hook(backward_hook)

             module.register_forward_pre_hook(
@@ -204,11 +267,13 @@ class Service:
             if torch_version_above_or_equal_2:
                 module.register_full_backward_pre_hook(
                     self.module_processor.node_hook(prefix + Const.BACKWARD, Const.START))
+                self.check_register_full_backward_hook(module)
                 module.register_full_backward_hook(
                     self.module_processor.node_hook(prefix + Const.BACKWARD, Const.STOP))

         if self.config.level in ["mix", "L1", "L2"]:
-            api_register.initialize_hook(functools.partial(self.build_hook, BaseScope.Module_Type_API))
+            api_register.initialize_hook(functools.partial(self.build_hook, BaseScope.Module_Type_API),
+                                         self.config.online_run_ut)
             api_register.api_modularity()

         if Const.STATISTICS == self.config.task or Const.TENSOR == self.config.task: