mindstudio-probe 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (249)
  1. {mindstudio_probe-1.0.1.dist-info → mindstudio_probe-1.0.3.dist-info}/METADATA +5 -1
  2. mindstudio_probe-1.0.3.dist-info/RECORD +272 -0
  3. msprobe/README.md +78 -23
  4. msprobe/__init__.py +1 -0
  5. msprobe/config/README.md +182 -40
  6. msprobe/config/config.json +22 -0
  7. msprobe/core/__init__.py +0 -0
  8. msprobe/{pytorch → core}/advisor/advisor.py +3 -3
  9. msprobe/{pytorch → core}/advisor/advisor_result.py +2 -2
  10. msprobe/core/common/const.py +82 -5
  11. msprobe/core/common/exceptions.py +30 -18
  12. msprobe/core/common/file_check.py +19 -1
  13. msprobe/core/common/log.py +15 -1
  14. msprobe/core/common/utils.py +130 -30
  15. msprobe/core/common_config.py +32 -19
  16. msprobe/core/compare/acc_compare.py +299 -0
  17. msprobe/core/compare/check.py +95 -0
  18. msprobe/core/compare/compare_cli.py +49 -0
  19. msprobe/core/compare/highlight.py +222 -0
  20. msprobe/core/compare/multiprocessing_compute.py +149 -0
  21. msprobe/{pytorch → core}/compare/npy_compare.py +55 -4
  22. msprobe/core/compare/utils.py +429 -0
  23. msprobe/core/data_dump/data_collector.py +39 -35
  24. msprobe/core/data_dump/data_processor/base.py +85 -37
  25. msprobe/core/data_dump/data_processor/factory.py +5 -7
  26. msprobe/core/data_dump/data_processor/mindspore_processor.py +198 -0
  27. msprobe/core/data_dump/data_processor/pytorch_processor.py +94 -51
  28. msprobe/core/data_dump/json_writer.py +11 -11
  29. msprobe/core/grad_probe/__init__.py +0 -0
  30. msprobe/core/grad_probe/constant.py +71 -0
  31. msprobe/core/grad_probe/grad_compare.py +175 -0
  32. msprobe/core/grad_probe/utils.py +52 -0
  33. msprobe/doc/grad_probe/grad_probe.md +207 -0
  34. msprobe/doc/grad_probe/img/image-1.png +0 -0
  35. msprobe/doc/grad_probe/img/image-2.png +0 -0
  36. msprobe/doc/grad_probe/img/image-3.png +0 -0
  37. msprobe/doc/grad_probe/img/image-4.png +0 -0
  38. msprobe/doc/grad_probe/img/image.png +0 -0
  39. msprobe/mindspore/api_accuracy_checker/__init__.py +0 -0
  40. msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +246 -0
  41. msprobe/mindspore/api_accuracy_checker/api_info.py +69 -0
  42. msprobe/mindspore/api_accuracy_checker/api_runner.py +152 -0
  43. msprobe/mindspore/api_accuracy_checker/base_compare_algorithm.py +197 -0
  44. msprobe/mindspore/api_accuracy_checker/compute_element.py +224 -0
  45. msprobe/mindspore/api_accuracy_checker/main.py +16 -0
  46. msprobe/mindspore/api_accuracy_checker/type_mapping.py +114 -0
  47. msprobe/mindspore/api_accuracy_checker/utils.py +63 -0
  48. msprobe/mindspore/cell_processor.py +34 -0
  49. msprobe/mindspore/common/const.py +87 -0
  50. msprobe/mindspore/common/log.py +38 -0
  51. msprobe/mindspore/common/utils.py +57 -0
  52. msprobe/mindspore/compare/distributed_compare.py +75 -0
  53. msprobe/mindspore/compare/ms_compare.py +117 -0
  54. msprobe/mindspore/compare/ms_graph_compare.py +317 -0
  55. msprobe/mindspore/compare/ms_to_pt_api.yaml +399 -0
  56. msprobe/mindspore/debugger/debugger_config.py +38 -15
  57. msprobe/mindspore/debugger/precision_debugger.py +79 -4
  58. msprobe/mindspore/doc/compare.md +58 -0
  59. msprobe/mindspore/doc/dump.md +158 -6
  60. msprobe/mindspore/dump/dump_tool_factory.py +19 -22
  61. msprobe/mindspore/dump/hook_cell/api_registry.py +104 -0
  62. msprobe/mindspore/dump/hook_cell/hook_cell.py +53 -0
  63. msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml +925 -0
  64. msprobe/mindspore/dump/hook_cell/wrap_functional.py +91 -0
  65. msprobe/mindspore/dump/hook_cell/wrap_tensor.py +63 -0
  66. msprobe/mindspore/dump/jit_dump.py +56 -0
  67. msprobe/mindspore/dump/kernel_kbyk_dump.py +65 -0
  68. msprobe/mindspore/free_benchmark/__init__.py +0 -0
  69. msprobe/mindspore/free_benchmark/api_pynative_self_check.py +116 -0
  70. msprobe/mindspore/free_benchmark/common/__init__.py +0 -0
  71. msprobe/mindspore/free_benchmark/common/config.py +12 -0
  72. msprobe/mindspore/free_benchmark/common/handler_params.py +17 -0
  73. msprobe/mindspore/free_benchmark/common/utils.py +71 -0
  74. msprobe/mindspore/free_benchmark/data/support_wrap_ops.yaml +842 -0
  75. msprobe/mindspore/free_benchmark/decorator/__init__.py +0 -0
  76. msprobe/mindspore/free_benchmark/decorator/dec_forward.py +42 -0
  77. msprobe/mindspore/free_benchmark/decorator/decorator_factory.py +107 -0
  78. msprobe/mindspore/free_benchmark/handler/__init__.py +0 -0
  79. msprobe/mindspore/free_benchmark/handler/base_handler.py +90 -0
  80. msprobe/mindspore/free_benchmark/handler/check_handler.py +41 -0
  81. msprobe/mindspore/free_benchmark/handler/fix_handler.py +36 -0
  82. msprobe/mindspore/free_benchmark/handler/handler_factory.py +21 -0
  83. msprobe/mindspore/free_benchmark/perturbation/add_noise.py +67 -0
  84. msprobe/mindspore/free_benchmark/perturbation/base_perturbation.py +21 -0
  85. msprobe/mindspore/free_benchmark/perturbation/bit_noise.py +63 -0
  86. msprobe/mindspore/free_benchmark/perturbation/improve_precision.py +34 -0
  87. msprobe/mindspore/free_benchmark/perturbation/no_change.py +12 -0
  88. msprobe/mindspore/free_benchmark/perturbation/perturbation_factory.py +27 -0
  89. msprobe/mindspore/free_benchmark/self_check_tool_factory.py +33 -0
  90. msprobe/mindspore/grad_probe/__init__.py +0 -0
  91. msprobe/mindspore/grad_probe/global_context.py +91 -0
  92. msprobe/mindspore/grad_probe/grad_analyzer.py +231 -0
  93. msprobe/mindspore/grad_probe/grad_monitor.py +27 -0
  94. msprobe/mindspore/grad_probe/grad_stat_csv.py +132 -0
  95. msprobe/mindspore/grad_probe/hook.py +92 -0
  96. msprobe/mindspore/grad_probe/utils.py +29 -0
  97. msprobe/mindspore/ms_config.py +63 -15
  98. msprobe/mindspore/overflow_check/overflow_check_tool_factory.py +17 -15
  99. msprobe/mindspore/runtime.py +4 -0
  100. msprobe/mindspore/service.py +354 -0
  101. msprobe/mindspore/task_handler_factory.py +7 -4
  102. msprobe/msprobe.py +66 -26
  103. msprobe/pytorch/__init__.py +1 -1
  104. msprobe/pytorch/api_accuracy_checker/common/config.py +21 -16
  105. msprobe/pytorch/api_accuracy_checker/common/utils.py +1 -60
  106. msprobe/pytorch/api_accuracy_checker/compare/algorithm.py +2 -5
  107. msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +46 -10
  108. msprobe/pytorch/api_accuracy_checker/compare/compare.py +84 -48
  109. msprobe/pytorch/api_accuracy_checker/compare/compare_utils.py +8 -12
  110. msprobe/pytorch/api_accuracy_checker/config.yaml +7 -1
  111. msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py +15 -11
  112. msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +11 -15
  113. msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +16 -9
  114. msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +193 -105
  115. msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py +68 -1
  116. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/__init__.py +0 -0
  117. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py +202 -0
  118. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +324 -0
  119. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +204 -0
  120. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +218 -0
  121. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/ssl_config.py +10 -0
  122. msprobe/pytorch/bench_functions/__init__.py +15 -0
  123. msprobe/pytorch/bench_functions/apply_adam_w.py +28 -0
  124. msprobe/pytorch/bench_functions/confusion_transpose.py +19 -0
  125. msprobe/pytorch/bench_functions/fast_gelu.py +55 -0
  126. msprobe/pytorch/bench_functions/layer_norm_eval.py +6 -0
  127. msprobe/pytorch/bench_functions/linear.py +12 -0
  128. msprobe/pytorch/bench_functions/matmul_backward.py +48 -0
  129. msprobe/pytorch/bench_functions/npu_fusion_attention.py +421 -0
  130. msprobe/pytorch/bench_functions/rms_norm.py +15 -0
  131. msprobe/pytorch/bench_functions/rotary_mul.py +52 -0
  132. msprobe/pytorch/bench_functions/scaled_mask_softmax.py +26 -0
  133. msprobe/pytorch/bench_functions/swiglu.py +55 -0
  134. msprobe/pytorch/common/parse_json.py +3 -1
  135. msprobe/pytorch/common/utils.py +83 -7
  136. msprobe/pytorch/compare/distributed_compare.py +19 -64
  137. msprobe/pytorch/compare/match.py +3 -6
  138. msprobe/pytorch/compare/pt_compare.py +40 -0
  139. msprobe/pytorch/debugger/debugger_config.py +11 -2
  140. msprobe/pytorch/debugger/precision_debugger.py +34 -4
  141. msprobe/pytorch/doc/api_accuracy_checker.md +57 -13
  142. msprobe/pytorch/doc/api_accuracy_checker_online.md +187 -0
  143. msprobe/pytorch/doc/dump.md +73 -20
  144. msprobe/pytorch/doc/ptdbg_ascend_compare.md +75 -11
  145. msprobe/pytorch/doc/ptdbg_ascend_quickstart.md +3 -3
  146. msprobe/pytorch/doc/run_overflow_check.md +1 -1
  147. msprobe/pytorch/doc//321/206/320/247/320/260/321/206/320/260/320/227/321/206/320/255/320/226/321/205/342/225/226/320/265/321/205/320/225/342/225/226/321/205/320/254/342/225/221/321/206/320/251/320/277/321/211/320/272/320/234/321/210/320/277/320/221/321/205/320/242/320/234/321/206/320/220/320/267/321/210/320/223/342/225/234/321/205/320/257/342/225/221/321/207/342/225/221/342/224/220/321/206/320/232/320/265/321/205/320/241/320/232.md +151 -0
  148. msprobe/pytorch/free_benchmark/common/constant.py +3 -0
  149. msprobe/pytorch/free_benchmark/common/utils.py +4 -0
  150. msprobe/pytorch/free_benchmark/compare/grad_saver.py +22 -26
  151. msprobe/pytorch/free_benchmark/main.py +7 -4
  152. msprobe/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py +1 -1
  153. msprobe/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py +1 -1
  154. msprobe/pytorch/free_benchmark/perturbed_layers/npu/change_value.py +1 -1
  155. msprobe/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py +3 -3
  156. msprobe/pytorch/free_benchmark/perturbed_layers/npu/no_change.py +1 -1
  157. msprobe/pytorch/free_benchmark/perturbed_layers/run_cpu.py +1 -1
  158. msprobe/pytorch/free_benchmark/result_handlers/base_handler.py +43 -29
  159. msprobe/pytorch/free_benchmark/result_handlers/handler_factory.py +0 -1
  160. msprobe/pytorch/function_factory.py +75 -0
  161. msprobe/pytorch/functional/dump_module.py +4 -4
  162. msprobe/pytorch/grad_probe/__init__.py +0 -0
  163. msprobe/pytorch/grad_probe/grad_monitor.py +90 -0
  164. msprobe/pytorch/grad_probe/grad_stat_csv.py +129 -0
  165. msprobe/pytorch/hook_module/hook_module.py +14 -3
  166. msprobe/pytorch/hook_module/support_wrap_ops.yaml +2 -1
  167. msprobe/pytorch/hook_module/utils.py +9 -9
  168. msprobe/pytorch/hook_module/wrap_aten.py +20 -10
  169. msprobe/pytorch/hook_module/wrap_distributed.py +10 -7
  170. msprobe/pytorch/hook_module/wrap_functional.py +4 -7
  171. msprobe/pytorch/hook_module/wrap_npu_custom.py +21 -10
  172. msprobe/pytorch/hook_module/wrap_tensor.py +5 -6
  173. msprobe/pytorch/hook_module/wrap_torch.py +5 -7
  174. msprobe/pytorch/hook_module/wrap_vf.py +6 -8
  175. msprobe/pytorch/module_processer.py +53 -13
  176. msprobe/pytorch/online_dispatch/compare.py +4 -4
  177. msprobe/pytorch/online_dispatch/dispatch.py +39 -41
  178. msprobe/pytorch/online_dispatch/dump_compare.py +17 -47
  179. msprobe/pytorch/online_dispatch/single_compare.py +5 -5
  180. msprobe/pytorch/online_dispatch/utils.py +2 -43
  181. msprobe/pytorch/parse_tool/lib/compare.py +31 -19
  182. msprobe/pytorch/parse_tool/lib/config.py +2 -1
  183. msprobe/pytorch/parse_tool/lib/parse_tool.py +4 -4
  184. msprobe/pytorch/parse_tool/lib/utils.py +34 -80
  185. msprobe/pytorch/parse_tool/lib/visualization.py +4 -3
  186. msprobe/pytorch/pt_config.py +100 -6
  187. msprobe/pytorch/service.py +104 -19
  188. mindstudio_probe-1.0.1.dist-info/RECORD +0 -228
  189. msprobe/mindspore/dump/api_kbk_dump.py +0 -55
  190. msprobe/pytorch/compare/acc_compare.py +0 -1024
  191. msprobe/pytorch/compare/highlight.py +0 -100
  192. msprobe/test/core_ut/common/test_utils.py +0 -345
  193. msprobe/test/core_ut/data_dump/test_data_collector.py +0 -47
  194. msprobe/test/core_ut/data_dump/test_json_writer.py +0 -183
  195. msprobe/test/core_ut/data_dump/test_scope.py +0 -151
  196. msprobe/test/core_ut/test_common_config.py +0 -152
  197. msprobe/test/core_ut/test_file_check.py +0 -218
  198. msprobe/test/core_ut/test_log.py +0 -109
  199. msprobe/test/mindspore_ut/test_api_kbk_dump.py +0 -51
  200. msprobe/test/mindspore_ut/test_debugger_config.py +0 -42
  201. msprobe/test/mindspore_ut/test_dump_tool_factory.py +0 -51
  202. msprobe/test/mindspore_ut/test_kernel_graph_dump.py +0 -66
  203. msprobe/test/mindspore_ut/test_kernel_graph_overflow_check.py +0 -63
  204. msprobe/test/mindspore_ut/test_ms_config.py +0 -69
  205. msprobe/test/mindspore_ut/test_overflow_check_tool_factory.py +0 -51
  206. msprobe/test/mindspore_ut/test_precision_debugger.py +0 -56
  207. msprobe/test/mindspore_ut/test_task_handler_factory.py +0 -58
  208. msprobe/test/pytorch_ut/advisor/test_advisor.py +0 -83
  209. msprobe/test/pytorch_ut/api_accuracy_checker/common/test_common_utils.py +0 -108
  210. msprobe/test/pytorch_ut/api_accuracy_checker/common/test_config.py +0 -39
  211. msprobe/test/pytorch_ut/api_accuracy_checker/compare/test_algorithm.py +0 -112
  212. msprobe/test/pytorch_ut/api_accuracy_checker/compare/test_api_precision_compare.py +0 -77
  213. msprobe/test/pytorch_ut/api_accuracy_checker/compare/test_compare.py +0 -125
  214. msprobe/test/pytorch_ut/api_accuracy_checker/compare/test_compare_column.py +0 -10
  215. msprobe/test/pytorch_ut/api_accuracy_checker/compare/test_compare_utils.py +0 -43
  216. msprobe/test/pytorch_ut/api_accuracy_checker/run_ut/dump.json +0 -179
  217. msprobe/test/pytorch_ut/api_accuracy_checker/run_ut/forward.json +0 -63
  218. msprobe/test/pytorch_ut/api_accuracy_checker/run_ut/test_data_generate.py +0 -99
  219. msprobe/test/pytorch_ut/api_accuracy_checker/run_ut/test_multi_run_ut.py +0 -115
  220. msprobe/test/pytorch_ut/api_accuracy_checker/run_ut/test_run_ut.py +0 -72
  221. msprobe/test/pytorch_ut/compare/test_acc_compare.py +0 -17
  222. msprobe/test/pytorch_ut/free_benchmark/perturbed_layers/test_perturbed_layser.py +0 -105
  223. msprobe/test/pytorch_ut/free_benchmark/result_handlers/test_result_handler.py +0 -121
  224. msprobe/test/pytorch_ut/free_benchmark/test_main.py +0 -101
  225. msprobe/test/pytorch_ut/functional/test_dump_module.py +0 -15
  226. msprobe/test/pytorch_ut/hook_module/test_api_registry.py +0 -130
  227. msprobe/test/pytorch_ut/hook_module/test_hook_module.py +0 -42
  228. msprobe/test/pytorch_ut/hook_module/test_wrap_aten.py +0 -65
  229. msprobe/test/pytorch_ut/hook_module/test_wrap_distributed.py +0 -35
  230. msprobe/test/pytorch_ut/hook_module/test_wrap_functional.py +0 -20
  231. msprobe/test/pytorch_ut/hook_module/test_wrap_tensor.py +0 -35
  232. msprobe/test/pytorch_ut/hook_module/test_wrap_torch.py +0 -43
  233. msprobe/test/pytorch_ut/hook_module/test_wrap_vf.py +0 -11
  234. msprobe/test/pytorch_ut/test_pt_config.py +0 -69
  235. msprobe/test/pytorch_ut/test_service.py +0 -59
  236. msprobe/test/resources/advisor.txt +0 -3
  237. msprobe/test/resources/compare_result_20230703104808.csv +0 -9
  238. msprobe/test/resources/compare_result_without_accuracy.csv +0 -9
  239. msprobe/test/resources/config.yaml +0 -3
  240. msprobe/test/resources/npu_test.pkl +0 -8
  241. msprobe/test/run_test.sh +0 -30
  242. msprobe/test/run_ut.py +0 -58
  243. msprobe/test/test_module_processer.py +0 -64
  244. {mindstudio_probe-1.0.1.dist-info → mindstudio_probe-1.0.3.dist-info}/LICENSE +0 -0
  245. {mindstudio_probe-1.0.1.dist-info → mindstudio_probe-1.0.3.dist-info}/WHEEL +0 -0
  246. {mindstudio_probe-1.0.1.dist-info → mindstudio_probe-1.0.3.dist-info}/entry_points.txt +0 -0
  247. {mindstudio_probe-1.0.1.dist-info → mindstudio_probe-1.0.3.dist-info}/top_level.txt +0 -0
  248. /msprobe/{pytorch → core}/advisor/advisor_const.py +0 -0
  249. /msprobe/pytorch/doc/{atat → msprobe}/321/207/342/226/223/342/225/233/321/205/342/225/221/320/266/321/205/342/225/226/320/265/321/205/320/225/342/225/226/321/206/320/245/342/226/221/321/206/320/235/320/276dump/321/206/320/260/320/227/321/205/320/227/320/226/321/206/320/220/320/267/321/210/320/223/342/225/234/321/205/320/257/342/225/221/321/207/342/225/221/342/224/220/321/206/320/232/320/265/321/205/320/241/320/232.md" +0 -0
@@ -1,6 +1,12 @@
1
1
  import json
2
+
2
3
  from msprobe.core.common_config import CommonConfig, BaseConfig
3
4
  from msprobe.core.common.file_check import FileOpen
5
+ from msprobe.core.common.const import Const
6
+ from msprobe.mindspore.common.const import FreeBenchmarkConst
7
+ from msprobe.mindspore.common.log import logger
8
+ from msprobe.core.grad_probe.constant import level_adp
9
+ from msprobe.core.grad_probe.utils import check_numeral_list_ascend
4
10
 
5
11
 
6
12
  class TensorConfig(BaseConfig):
@@ -31,39 +37,81 @@ class StatisticsConfig(BaseConfig):
31
37
  if self.data_mode is not None and len(self.data_mode) > 0:
32
38
  if len(self.data_mode) > 1 or self.data_mode[0] not in ["all", "input", "output"]:
33
39
  raise Exception("data_mode must be all, input or output")
40
+ if self.summary_mode and self.summary_mode not in ["statistics", "md5"]:
41
+ raise Exception("summary_mode is invalid")
34
42
 
35
43
 
36
- class OverflowCheck(BaseConfig):
44
+ class OverflowCheckConfig(BaseConfig):
37
45
  def __init__(self, json_config):
38
46
  super().__init__(json_config)
39
- self.file_format = None
40
- self.check_mode = json_config.get("check_mode")
47
+ self.data_mode = ["all"]
41
48
  self._check_config()
42
49
 
43
50
  def _check_config(self):
44
- if self.data_mode is not None and len(self.data_mode) > 0:
45
- if len(self.data_mode) > 1 or self.data_mode[0] not in ["all", "input", "output"]:
46
- raise Exception("data_mode must be all, input or output")
51
+ if self.overflow_nums is not None and not isinstance(self.overflow_nums, int):
52
+ raise Exception("overflow_nums is invalid, it should be an integer")
53
+ if self.overflow_nums is not None and self.overflow_nums != -1 and self.overflow_nums <= 0:
54
+ raise Exception("overflow_nums should be -1 or positive integer")
47
55
  if self.check_mode and self.check_mode not in ["all", "aicore", "atomic"]:
48
56
  raise Exception("check_mode is invalid")
49
57
 
50
58
 
59
+ class FreeBenchmarkConfig(BaseConfig):
60
+ def __init__(self, task_config):
61
+ super().__init__(task_config)
62
+ self._check_config()
63
+
64
+ def _check_config(self):
65
+ if self.fuzz_device and self.fuzz_device not in FreeBenchmarkConst.DEVICE_LIST:
66
+ raise Exception("fuzz_device must be npu or empty")
67
+ if self.pert_mode and self.pert_mode not in FreeBenchmarkConst.PERT_TYPE_LIST:
68
+ raise Exception("pert_mode must be improve_precision, add_noise, bit_noise, no_change or empty")
69
+ if self.handler_type and self.handler_type not in FreeBenchmarkConst.HANDLER_TYPE_LIST:
70
+ raise Exception("handler_type must be check, fix or empty")
71
+ if self.fuzz_level and self.fuzz_level not in FreeBenchmarkConst.DUMP_LEVEL_LIST:
72
+ raise Exception("fuzz_level must be L1 or empty")
73
+ if self.fuzz_stage and self.fuzz_stage not in FreeBenchmarkConst.STAGE_LIST:
74
+ raise Exception("fuzz_stage must be forward or empty")
75
+ if self.if_preheat or self.preheat_step or self.max_sample:
76
+ logger.warning("'if_preheat', 'preheat_step' and 'max_sample' settings "
77
+ "are not supported for mindspore free benchmark task.")
78
+
79
+
80
+ class GradProbeConfig(BaseConfig):
81
+ def __init__(self, json_config):
82
+ super().__init__(json_config)
83
+ self.grad_level = json_config.get("grad_level", "L1")
84
+ self.param_list = json_config.get("param_list", [])
85
+ self.bounds = json_config.get("bounds", [])
86
+
87
+ def _check_config(self):
88
+ if self.grad_level not in level_adp.keys():
89
+ raise Exception(f"grad_level must be one of {level_adp.keys()}")
90
+ if not isinstance(self.param_list, list):
91
+ raise Exception(f"param_list must be a list")
92
+ check_numeral_list_ascend(self.bounds)
93
+
94
+
95
+ TaskDict = {
96
+ Const.TENSOR: TensorConfig,
97
+ Const.STATISTICS: StatisticsConfig,
98
+ Const.OVERFLOW_CHECK: OverflowCheckConfig,
99
+ Const.FREE_BENCHMARK: FreeBenchmarkConfig,
100
+ Const.GRAD_PROBE: GradProbeConfig,
101
+ }
102
+
103
+
51
104
  def parse_common_config(json_config):
52
105
  return CommonConfig(json_config)
53
106
 
54
107
 
55
108
  def parse_task_config(task, json_config):
56
- task_map = json_config[task]
109
+ task_map = json_config.get(task)
57
110
  if not task_map:
58
111
  task_map = dict()
59
- if task == "tensor":
60
- return TensorConfig(task_map)
61
- elif task == "statistics":
62
- return StatisticsConfig(task_map)
63
- elif task == "overflow_check":
64
- return OverflowCheck(task_map)
65
- else:
112
+ if task not in TaskDict:
66
113
  raise Exception("task is invalid.")
114
+ return TaskDict.get(task)(task_map)
67
115
 
68
116
 
69
117
  def parse_json_config(json_file_path):
@@ -73,6 +121,6 @@ def parse_json_config(json_file_path):
73
121
  json_config = json.load(file)
74
122
  common_config = parse_common_config(json_config)
75
123
  if not common_config.task:
76
- common_config.task = "statistics"
124
+ common_config.task = Const.STATISTICS
77
125
  task_config = parse_task_config(common_config.task, json_config)
78
126
  return common_config, task_config
@@ -1,23 +1,24 @@
1
+ from msprobe.mindspore.common.const import Const
1
2
  from msprobe.mindspore.debugger.debugger_config import DebuggerConfig
2
3
  from msprobe.mindspore.overflow_check.kernel_graph_overflow_check import KernelGraphOverflowCheck
3
4
 
4
5
 
5
6
  class OverflowCheckToolFactory:
6
7
  tools = {
7
- "cell": {
8
- "kbk": None,
9
- "graph": None,
10
- "pynative": None
8
+ Const.CELL: {
9
+ Const.GRAPH_KBYK_MODE: None,
10
+ Const.GRAPH_GE_MODE: None,
11
+ Const.PYNATIVE_MODE: None
11
12
  },
12
- "api": {
13
- "kbk": None,
14
- "graph": None,
15
- "pynative": None
13
+ Const.API: {
14
+ Const.GRAPH_KBYK_MODE: None,
15
+ Const.GRAPH_GE_MODE: None,
16
+ Const.PYNATIVE_MODE: None
16
17
  },
17
- "kernel": {
18
- "kbk": None,
19
- "graph": KernelGraphOverflowCheck,
20
- "pynative": None
18
+ Const.KERNEL: {
19
+ Const.GRAPH_KBYK_MODE: None,
20
+ Const.GRAPH_GE_MODE: KernelGraphOverflowCheck,
21
+ Const.PYNATIVE_MODE: None
21
22
  }
22
23
  }
23
24
 
@@ -25,8 +26,9 @@ class OverflowCheckToolFactory:
25
26
  def create(config: DebuggerConfig):
26
27
  tool = OverflowCheckToolFactory.tools.get(config.level)
27
28
  if not tool:
28
- raise Exception("valid level is needed.")
29
- tool = tool.get("graph")
29
+ raise Exception("Valid level is needed.")
30
+ tool = tool.get(config.execution_mode)
30
31
  if not tool:
31
- raise Exception("Overflow check in not supported in this mode.")
32
+ raise Exception(f"Overflow check is not supported in {config.execution_mode} mode "
33
+ f"when level is {config.level}.")
32
34
  return tool(config)
@@ -0,0 +1,4 @@
1
+ class Runtime:
2
+ step_count: int = 0
3
+ rank_id: int = -1
4
+ is_running: bool = False
@@ -0,0 +1,354 @@
1
+ # Copyright 2024 Huawei Technologies Co., Ltd
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # ============================================================================
15
+
16
+ import os
17
+ import copy
18
+ from pathlib import Path
19
+ import functools
20
+ from collections import defaultdict
21
+
22
+ import mindspore as ms
23
+ from mindspore.common.tensor import Tensor
24
+ from mindspore import ops
25
+ from mindspore import nn
26
+ try:
27
+ from mindspore.common._pijit_context import PIJitCaptureContext
28
+ pijit_label = True
29
+ except ImportError:
30
+ pijit_label = False
31
+
32
+
33
+ from msprobe.core.data_dump.data_collector import build_data_collector
34
+ from msprobe.core.data_dump.scope import BaseScope
35
+ from msprobe.mindspore.common.utils import get_rank_if_initialized
36
+ from msprobe.core.common.file_check import FileChecker, FileCheckConst, check_path_before_create
37
+ from msprobe.mindspore.common.log import logger
38
+ from msprobe.core.common.utils import Const
39
+ from msprobe.core.common.exceptions import DistributedNotInitializedError
40
+ from msprobe.mindspore.dump.hook_cell.api_registry import api_register
41
+ from msprobe.core.data_dump.data_processor.base import ModuleBackwardInputsOutputs, ModuleForwardInputsOutputs, \
42
+ ModuleBackwardInputs, ModuleBackwardOutputs
43
+ from msprobe.core.common.exceptions import MsprobeException
44
+ from msprobe.mindspore.dump.hook_cell.hook_cell import HOOKCell
45
+ from msprobe.mindspore.cell_processor import CellProcessor
46
+ from msprobe.mindspore.dump.jit_dump import JitDump
47
+
48
+
49
+ class Service:
50
+ def __init__(self, config):
51
+ self.model = None
52
+ self.config = copy.deepcopy(config)
53
+ self.config.level = self.config.level_ori
54
+ self.data_collector = build_data_collector(self.config)
55
+ self.cell_processor = CellProcessor(self.data_collector.scope)
56
+ self.switch = False
57
+ self.current_iter = 0
58
+ self.first_start = True
59
+ self.current_rank = None
60
+ self.primitive_counters = {}
61
+ self.dump_iter_dir = None
62
+ self.start_call = False
63
+ self.check_level_valid()
64
+
65
+ @staticmethod
66
+ def check_model_valid(model):
67
+ if not model or isinstance(model, nn.Cell):
68
+ return model
69
+ raise MsprobeException(
70
+ MsprobeException.INVALID_PARAM_ERROR, "model 参数必须是 mindspore.nn.Cell 类型。"
71
+ )
72
+
73
+ def check_level_valid(self):
74
+ if self.config.level == "L2":
75
+ raise MsprobeException(
76
+ MsprobeException.INVALID_PARAM_ERROR, "L2 level dump function is currently not supported."
77
+ )
78
+
79
+ def build_hook(self, target_type, name):
80
+ def forward_hook(api_or_cell_name, cell, input, output):
81
+ if target_type == BaseScope.Module_Type_Module:
82
+ api_or_cell_name = cell.mindstudio_reserved_name
83
+ self.data_collector.visit_and_clear_overflow_status(api_or_cell_name)
84
+ if not self.switch:
85
+ return None
86
+ if self.data_collector:
87
+ if target_type == BaseScope.Module_Type_Module:
88
+ module_input_output = ModuleForwardInputsOutputs(args=input, kwargs={}, output=output)
89
+ else:
90
+ module_input_output = ModuleForwardInputsOutputs(args=input, kwargs=cell.input_kwargs, output=output)
91
+ self.data_collector.forward_data_collect(api_or_cell_name, cell, pid, module_input_output)
92
+ if self.data_collector.if_return_forward_new_output():
93
+ return self.data_collector.get_forward_new_output()
94
+ if target_type == BaseScope.Module_Type_API:
95
+ del cell.input_kwargs
96
+ return output
97
+
98
+ def backward_hook(api_or_cell_name, cell, grad_input, grad_output):
99
+ if target_type == BaseScope.Module_Type_Module:
100
+ api_or_cell_name = cell.mindstudio_reserved_name
101
+ self.data_collector.visit_and_clear_overflow_status(api_or_cell_name)
102
+ if not self.switch:
103
+ return
104
+ if self.data_collector:
105
+ # 框架最新接口变更,grad_input和grad_output的含义发生了变化,与torch含义保持一致,因此此处调换顺序传入
106
+ module_input_output = ModuleBackwardInputsOutputs(grad_input=grad_output, grad_output=grad_input)
107
+ self.data_collector.backward_data_collect(api_or_cell_name, cell, pid, module_input_output)
108
+
109
+ pid = os.getpid()
110
+ forward_name_template = name + Const.FORWARD
111
+ backward_name_template = name + Const.BACKWARD
112
+ forward_hook = functools.partial(forward_hook, forward_name_template)
113
+ backward_hook = functools.partial(backward_hook, backward_name_template)
114
+
115
+ def wrap_forward_hook(cell, input, output):
116
+ return forward_hook(cell, input, output)
117
+
118
+ def wrap_backward_hook(cell, grad_input, grad_output):
119
+ return backward_hook(cell, grad_input, grad_output)
120
+
121
+ return wrap_forward_hook, wrap_backward_hook
122
+
123
+ def wrap_primitive(self, origin_func, primitive_name):
124
+ service_instance = self
125
+
126
+ def create_backward_hook(captured_grads, num_tensors, updated_primitive_name, hook_type):
127
+ def backward_hook(grad):
128
+ captured_grads.append(grad)
129
+ backward_primitive_name = f"{updated_primitive_name}.{Const.BACKWARD}"
130
+ try:
131
+ if len(captured_grads) == num_tensors and hook_type == Const.INPUT:
132
+ service_instance.data_collector.visit_and_clear_overflow_status(backward_primitive_name)
133
+ new_module_input_output = ModuleBackwardOutputs(grad_output=tuple(captured_grads))
134
+ service_instance.data_collector.backward_output_data_collect(
135
+ backward_primitive_name, service_instance, os.getpid(), new_module_input_output
136
+ )
137
+ captured_grads.clear()
138
+ elif len(captured_grads) == num_tensors and hook_type == Const.OUTPUT:
139
+ service_instance.data_collector.visit_and_clear_overflow_status(backward_primitive_name)
140
+ new_module_input_output = ModuleBackwardInputs(grad_input=tuple(captured_grads))
141
+ service_instance.data_collector.backward_input_data_collect(
142
+ backward_primitive_name, service_instance, os.getpid(), new_module_input_output
143
+ )
144
+ captured_grads.clear()
145
+
146
+ except Exception as exception:
147
+ raise Exception(f"This is a primitive op {hook_type}_backward dump error: {exception},"
148
+ f" updated_primitive_name: {updated_primitive_name}") from exception
149
+
150
+ return backward_hook
151
+
152
+ def hook_primitive_inputs(args, captured_grads_input, updated_primitive_name):
153
+ hooked_inputs = []
154
+ num_tensors = sum(isinstance(arg, Tensor) for arg in args)
155
+ input_backward_hook = create_backward_hook(captured_grads_input, num_tensors, updated_primitive_name,
156
+ Const.INPUT)
157
+ for _, arg in enumerate(args):
158
+ if isinstance(arg, Tensor):
159
+ arg_hooked = ops.HookBackward(input_backward_hook)(arg)
160
+ hooked_inputs.append(arg_hooked)
161
+ else:
162
+ hooked_inputs.append(arg)
163
+ return hooked_inputs
164
+
165
+ def hook_primitive_outputs(out, captured_grads_output, updated_primitive_name):
166
+ if isinstance(out, tuple):
167
+ num_output_tensors = sum(isinstance(tensor, Tensor) for tensor in out)
168
+ else:
169
+ num_output_tensors = 1
170
+ output_backward_hook = create_backward_hook(captured_grads_output, num_output_tensors,
171
+ updated_primitive_name, Const.OUTPUT)
172
+
173
+ if isinstance(out, Tensor):
174
+ return ops.HookBackward(output_backward_hook)(out)
175
+ elif isinstance(out, tuple):
176
+ hooked_outputs = []
177
+ for tensor in out:
178
+ if isinstance(tensor, Tensor):
179
+ hooked_outputs.append(ops.HookBackward(output_backward_hook)(tensor))
180
+ else:
181
+ hooked_outputs.append(tensor)
182
+ return tuple(hooked_outputs)
183
+ return out
184
+
185
+ def wrapped_primitive_call(instance_self, *args, **kwargs):
186
+ service_instance.update_primitive_counters(primitive_name)
187
+ current_count = service_instance.primitive_counters.get(primitive_name, 0)
188
+ updated_primitive_name = f"{Const.PRIMITIVE_PREFIX}.{primitive_name}.{current_count}"
189
+
190
+ if not service_instance.switch:
191
+ return origin_func(*args, **kwargs)
192
+
193
+ captured_grads_input, captured_grads_output = [], []
194
+
195
+ try:
196
+ hooked_inputs = hook_primitive_inputs(args, captured_grads_input, updated_primitive_name)
197
+ except Exception as exception:
198
+ raise Exception("This is a primitive op dump error during input hooking: {},"
199
+ " primitive_name: {}".format(exception, primitive_name)) from exception
200
+
201
+ try:
202
+ out = origin_func(*hooked_inputs, **kwargs)
203
+ except Exception as exception:
204
+ raise Exception("This is a primitive op dump error during function call: {},"
205
+ " primitive_name: {}".format(exception, primitive_name)) from exception
206
+
207
+ forward_primitive_name = f"{updated_primitive_name}.{Const.FORWARD}"
208
+ service_instance.data_collector.visit_and_clear_overflow_status(forward_primitive_name)
209
+ if service_instance.data_collector:
210
+ module_input_output = ModuleForwardInputsOutputs(args=hooked_inputs, kwargs=kwargs, output=out)
211
+ try:
212
+ service_instance.data_collector.forward_data_collect(forward_primitive_name, instance_self,
213
+ os.getpid(), module_input_output)
214
+ except Exception as exception:
215
+ raise Exception("This is a primitive op dump error during forward data collection: {},"
216
+ " primitive_name: {}".format(exception, primitive_name)) from exception
217
+
218
+ if service_instance.data_collector.if_return_forward_new_output():
219
+ out = service_instance.data_collector.get_forward_new_output()
220
+
221
+ try:
222
+ out = hook_primitive_outputs(out, captured_grads_output, updated_primitive_name)
223
+ except Exception as exception:
224
+ raise Exception("This is a primitive op dump error during output hooking: {},"
225
+ " primitive_name: {}".format(exception, primitive_name)) from exception
226
+
227
+ return out
228
+
229
+ return wrapped_primitive_call
230
+
231
+ def update_primitive_counters(self, primitive_name):
232
+ if primitive_name not in self.primitive_counters:
233
+ self.primitive_counters[primitive_name] = 0
234
+ else:
235
+ self.primitive_counters[primitive_name] += 1
236
+
237
+ def register_hooks(self):
238
+ primitive_set = set()
239
+ for _, cell in self.model.cells_and_names():
240
+ for pname, primitive in cell._primitives.items():
241
+ primitive_set.add((pname, primitive))
242
+
243
+ for pname, primitive in primitive_set:
244
+ NewPrimitive = type('NewPrimitive', (primitive.__class__,),
245
+ {'__call__': self.wrap_primitive(primitive.__call__, pname)})
246
+ primitive.__class__ = NewPrimitive
247
+
248
    def step(self):
        """Advance to the next iteration and reset all per-step dump state."""
        self.current_iter += 1
        # Keep the collector's notion of the current iteration in sync.
        self.data_collector.update_iter(self.current_iter)
        # Reset invocation counters so dumped names restart from index 0.
        HOOKCell.cell_count = defaultdict(int)
        CellProcessor.cell_count = {}
        self.primitive_counters.clear()
254
+
255
    def start(self, model=None):
        """Turn the dump switch on for the current iteration.

        Args:
            model: optional network passed through check_model_valid and stored
                on the service; required later for L0-level cell hooking.

        Raises:
            Exception: once current_iter has passed the largest configured step,
                to deliberately end the run after the requested dumps.
        """
        self.model = self.check_model_valid(model)
        self.start_call = True
        logger.info("msprobe: debugger.start() is set successfully")
        # Past the last configured step: flush via stop() and abort on purpose.
        if self.config.step and self.current_iter > max(self.config.step):
            self.stop()
            raise Exception("msprobe: exit after iteration {}".format(max(self.config.step)))
        # Step filtering: do nothing on steps the user did not ask for.
        if self.config.step and self.current_iter not in self.config.step:
            return
        if self.first_start:
            try:
                self.current_rank = get_rank_if_initialized()
            except DistributedNotInitializedError:
                # Non-distributed run: rank stays None (empty rank dir suffix).
                self.current_rank = None

            # Rank filtering: hooks are only registered on requested ranks.
            if self.config.rank and self.current_rank not in self.config.rank:
                return
            self.register_hook_new()
            self.first_start = False
        self.switch = True
        logger.info(f"Dump switch is turned on at step {self.current_iter}. ")
        self.create_dirs()
        logger.info(f"Dump data will be saved in {self.dump_iter_dir}.")
        if self.config.level == "L1":
            JitDump.set_config(self.config)
            JitDump.set_data_collector(self.data_collector)
            # NOTE(review): monkey-patches MindSpore private internals to route
            # jit execution through JitDump — tied to specific MS versions.
            ms.common.api._MindsporeFunctionExecutor = JitDump
            ms.common.api._PyNativeExecutor.grad = JitDump.grad
            if pijit_label:
                # Neutralize PIJit graph capture so dumping sees pynative calls.
                PIJitCaptureContext.__enter__ = self.empty
                PIJitCaptureContext.__exit__ = self.empty
286
+
287
    def stop(self):
        """Turn the dump switch off and flush collected data to JSON.

        Raises:
            Exception: if start() was not called in the current scope.
        """
        # NOTE(review): this success message is logged before the start_call
        # check, so it is emitted even when stop() raises just below — confirm
        # whether that ordering is intentional.
        logger.info("msprobe: debugger.stop() is set successfully. "
                    "Please set debugger.start() to turn on the dump switch again. ")
        if not self.start_call:
            logger.error("msprobe: debugger.start() is not set in the current scope.")
            raise Exception("debugger.start() is not set in the current scope.")
        # Mirror start(): skip filtered-out steps/ranks without touching state.
        if self.config.step and self.current_iter not in self.config.step:
            return
        if self.config.rank and self.current_rank not in self.config.rank:
            return
        self.switch = False
        self.start_call = False
        self.data_collector.write_json()
300
+
301
+ def create_dirs(self):
302
+ check_path_before_create(self.config.dump_path)
303
+ if not os.path.exists(self.config.dump_path):
304
+ Path(self.config.dump_path).mkdir(mode=0o750, exist_ok=True)
305
+ file_check = FileChecker(self.config.dump_path, FileCheckConst.DIR)
306
+ file_check.common_check()
307
+ self.dump_iter_dir = os.path.join(self.config.dump_path, f"step{self.current_iter}")
308
+ cur_rank = self.current_rank if self.current_rank is not None else ''
309
+ dump_dir = os.path.join(self.dump_iter_dir, f"rank{cur_rank}")
310
+ if not os.path.exists(dump_dir):
311
+ Path(dump_dir).mkdir(mode=0o750, parents=True, exist_ok=True)
312
+ if self.config.task in self.data_collector.tasks_need_tensor_data:
313
+ dump_data_dir = os.path.join(dump_dir, "dump_tensor_data")
314
+ Path(dump_data_dir).mkdir(mode=0o750, exist_ok=True)
315
+ else:
316
+ dump_data_dir = None
317
+
318
+ dump_file_path = os.path.join(dump_dir, "dump.json")
319
+ stack_file_path = os.path.join(dump_dir, "stack.json")
320
+ construct_file_path = os.path.join(dump_dir, "construct.json")
321
+ self.data_collector.update_dump_paths(
322
+ dump_file_path, stack_file_path, construct_file_path, dump_data_dir, None)
323
+
324
    def empty(self, *args, **kwargs):
        """Deliberate no-op; assigned over PIJitCaptureContext.__enter__/__exit__
        in start() to disable PIJit graph capture while dumping."""
        pass
326
+
327
    def register_hook_new(self):
        """Mount dump hooks onto the model according to the configured level.

        L1: patch the functional API surface via api_register, and patch
        primitives when a model is available.  L0: register forward/backward
        hooks on every sub-cell of the model (model required).

        Raises:
            MsprobeException: if level is L0 and no model was provided.
        """
        logger.info("The {} hook function is successfully mounted to the model.".format(self.config.task))
        if self.config.level == "L1":
            api_register.initialize_hook(functools.partial(self.build_hook, BaseScope.Module_Type_API))
            api_register.api_set_hook_func()
            if self.model:
                self.register_hooks()

        if self.config.level == "L0":
            if not self.model:
                raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR, "The current level is L0, the model cannot be None")
            for name, cell in self.model.cells_and_names():
                # Skip the root cell itself; only sub-cells are hooked.
                if cell == self.model:
                    continue
                # Dump-name prefix: "Cell.<name>.<ClassName>."
                prefix = 'Cell' + Const.SEP + name + Const.SEP + \
                         cell.__class__.__name__ + Const.SEP
                forward_hook, backward_hook = self.build_hook(BaseScope.Module_Type_Module, prefix)
                cell.register_forward_hook(forward_hook)
                cell.register_backward_hook(backward_hook)

                # Additional start/stop node hooks feed the construct-graph
                # bookkeeping in cell_processor.
                cell.register_forward_pre_hook(
                    self.cell_processor.node_hook(prefix + Const.FORWARD, Const.START))
                cell.register_forward_hook(
                    self.cell_processor.node_hook(prefix + Const.FORWARD, Const.STOP))
                cell.register_backward_pre_hook(
                    self.cell_processor.node_hook(prefix + Const.BACKWARD, Const.START))
                cell.register_backward_hook(
                    self.cell_processor.node_hook(prefix + Const.BACKWARD, Const.STOP))
@@ -1,20 +1,23 @@
1
+ from msprobe.core.common.const import Const
1
2
  from msprobe.mindspore.debugger.debugger_config import DebuggerConfig
2
3
  from msprobe.mindspore.dump.dump_tool_factory import DumpToolFactory
3
4
  from msprobe.mindspore.overflow_check.overflow_check_tool_factory import OverflowCheckToolFactory
5
+ from msprobe.mindspore.free_benchmark.self_check_tool_factory import SelfCheckToolFactory
4
6
 
5
7
 
6
8
  class TaskHandlerFactory:
7
9
  tasks = {
8
- "tensor": DumpToolFactory,
9
- "statistics": DumpToolFactory,
10
- "overflow_check": OverflowCheckToolFactory
10
+ Const.TENSOR: DumpToolFactory,
11
+ Const.STATISTICS: DumpToolFactory,
12
+ Const.OVERFLOW_CHECK: OverflowCheckToolFactory,
13
+ Const.FREE_BENCHMARK: SelfCheckToolFactory
11
14
  }
12
15
 
13
16
  @staticmethod
14
17
  def create(config: DebuggerConfig):
15
18
  task = TaskHandlerFactory.tasks.get(config.task)
16
19
  if not task:
17
- raise Exception("valid task is needed.")
20
+ raise Exception("Valid task is needed.")
18
21
  handler = task.create(config)
19
22
  if not handler:
20
23
  raise Exception("Can not find task handler")