PyPI - mindstudio-probe - Versions diffs - 1.0.1__py3-none-any.whl - Mend

mindstudio-probe 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (228) hide show

mindstudio_probe-1.0.1.dist-info/LICENSE +201 -0
mindstudio_probe-1.0.1.dist-info/METADATA +30 -0
mindstudio_probe-1.0.1.dist-info/RECORD +228 -0
mindstudio_probe-1.0.1.dist-info/WHEEL +5 -0
mindstudio_probe-1.0.1.dist-info/entry_points.txt +2 -0
mindstudio_probe-1.0.1.dist-info/top_level.txt +1 -0
msprobe/README.md +182 -0
msprobe/__init__.py +0 -0
msprobe/config/README.md +397 -0
msprobe/config/config.json +28 -0
msprobe/config/img/free_benchmark.png +0 -0
msprobe/core/common/const.py +241 -0
msprobe/core/common/exceptions.py +88 -0
msprobe/core/common/file_check.py +265 -0
msprobe/core/common/log.py +55 -0
msprobe/core/common/utils.py +516 -0
msprobe/core/common_config.py +58 -0
msprobe/core/data_dump/data_collector.py +140 -0
msprobe/core/data_dump/data_processor/base.py +245 -0
msprobe/core/data_dump/data_processor/factory.py +61 -0
msprobe/core/data_dump/data_processor/pytorch_processor.py +346 -0
msprobe/core/data_dump/json_writer.py +116 -0
msprobe/core/data_dump/scope.py +178 -0
msprobe/mindspore/__init__.py +1 -0
msprobe/mindspore/debugger/__init__.py +0 -0
msprobe/mindspore/debugger/debugger_config.py +51 -0
msprobe/mindspore/debugger/precision_debugger.py +32 -0
msprobe/mindspore/doc/dump.md +65 -0
msprobe/mindspore/dump/__init__.py +0 -0
msprobe/mindspore/dump/api_kbk_dump.py +55 -0
msprobe/mindspore/dump/dump_tool_factory.py +38 -0
msprobe/mindspore/dump/kernel_graph_dump.py +60 -0
msprobe/mindspore/ms_config.py +78 -0
msprobe/mindspore/overflow_check/__init__.py +0 -0
msprobe/mindspore/overflow_check/kernel_graph_overflow_check.py +45 -0
msprobe/mindspore/overflow_check/overflow_check_tool_factory.py +32 -0
msprobe/mindspore/task_handler_factory.py +21 -0
msprobe/msprobe.py +67 -0
msprobe/pytorch/__init__.py +4 -0
msprobe/pytorch/advisor/advisor.py +124 -0
msprobe/pytorch/advisor/advisor_const.py +59 -0
msprobe/pytorch/advisor/advisor_result.py +58 -0
msprobe/pytorch/api_accuracy_checker/.keep +0 -0
msprobe/pytorch/api_accuracy_checker/__init__.py +0 -0
msprobe/pytorch/api_accuracy_checker/common/.keep +0 -0
msprobe/pytorch/api_accuracy_checker/common/__init__.py +0 -0
msprobe/pytorch/api_accuracy_checker/common/config.py +50 -0
msprobe/pytorch/api_accuracy_checker/common/utils.py +224 -0
msprobe/pytorch/api_accuracy_checker/compare/__init__.py +0 -0
msprobe/pytorch/api_accuracy_checker/compare/algorithm.py +216 -0
msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +545 -0
msprobe/pytorch/api_accuracy_checker/compare/api_precision_standard.yaml +133 -0
msprobe/pytorch/api_accuracy_checker/compare/api_precision_threshold.yaml +390 -0
msprobe/pytorch/api_accuracy_checker/compare/compare.py +345 -0
msprobe/pytorch/api_accuracy_checker/compare/compare_column.py +74 -0
msprobe/pytorch/api_accuracy_checker/compare/compare_utils.py +249 -0
msprobe/pytorch/api_accuracy_checker/config.yaml +4 -0
msprobe/pytorch/api_accuracy_checker/run_ut/.keep +0 -0
msprobe/pytorch/api_accuracy_checker/run_ut/__init__.py +0 -0
msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py +328 -0
msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +203 -0
msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +127 -0
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +493 -0
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py +7 -0
msprobe/pytorch/api_accuracy_checker/run_ut/torch_ut_setting.json +5 -0
msprobe/pytorch/common/__init__.py +2 -0
msprobe/pytorch/common/compare_script.template +14 -0
msprobe/pytorch/common/log.py +32 -0
msprobe/pytorch/common/parse_json.py +37 -0
msprobe/pytorch/common/utils.py +224 -0
msprobe/pytorch/compare/acc_compare.py +1024 -0
msprobe/pytorch/compare/distributed_compare.py +111 -0
msprobe/pytorch/compare/highlight.py +100 -0
msprobe/pytorch/compare/mapping.yaml +607 -0
msprobe/pytorch/compare/match.py +36 -0
msprobe/pytorch/compare/npy_compare.py +244 -0
msprobe/pytorch/debugger/__init__.py +0 -0
msprobe/pytorch/debugger/debugger_config.py +86 -0
msprobe/pytorch/debugger/precision_debugger.py +95 -0
msprobe/pytorch/doc/FAQ.md +193 -0
msprobe/pytorch/doc/api_accuracy_checker.md +269 -0
msprobe/pytorch/doc/atat/321/207/342/226/223/342/225/233/321/205/342/225/221/320/266/321/205/342/225/226/320/265/321/205/320/225/342/225/226/321/206/320/245/342/226/221/321/206/320/235/320/276dump/321/206/320/260/320/227/321/205/320/227/320/226/321/206/320/220/320/267/321/210/320/223/342/225/234/321/205/320/257/342/225/221/321/207/342/225/221/342/224/220/321/206/320/232/320/265/321/205/320/241/320/232.md +182 -0
msprobe/pytorch/doc/dump.md +207 -0
msprobe/pytorch/doc/img/BLOOM-7B_1.png +0 -0
msprobe/pytorch/doc/img/BLOOM-7B_2.png +0 -0
msprobe/pytorch/doc/img/BLOOM-7B_3.png +0 -0
msprobe/pytorch/doc/img/BLOOM-7B_4.png +0 -0
msprobe/pytorch/doc/img/GPT-3_1.png +0 -0
msprobe/pytorch/doc/img/GPT-3_2.png +0 -0
msprobe/pytorch/doc/img/GPT-3_3.png +0 -0
msprobe/pytorch/doc/img/GPT-3_4.png +0 -0
msprobe/pytorch/doc/img/GPT-3_5.png +0 -0
msprobe/pytorch/doc/img/GPT-3_6.png +0 -0
msprobe/pytorch/doc/img/GPT-3_7.png +0 -0
msprobe/pytorch/doc/img/GPT-3_8.png +0 -0
msprobe/pytorch/doc/img/YOLOV5S_1.png +0 -0
msprobe/pytorch/doc/img/YOLOV5S_2.png +0 -0
msprobe/pytorch/doc/img/accuracy_checking_details.png +0 -0
msprobe/pytorch/doc/img/accuracy_checking_result.png +0 -0
msprobe/pytorch/doc/img/api_precision_compare_details.png +0 -0
msprobe/pytorch/doc/img/api_precision_compare_result.png +0 -0
msprobe/pytorch/doc/img/auto_analyze_log.png +0 -0
msprobe/pytorch/doc/img/compare_result_pkl.png +0 -0
msprobe/pytorch/doc/img/compare_result_pkl_md5.png.png +0 -0
msprobe/pytorch/doc/img/cpu_info.png +0 -0
msprobe/pytorch/doc/img/module_compare.png +0 -0
msprobe/pytorch/doc/parse_tool.md +286 -0
msprobe/pytorch/doc/ptdbg_ascend_compare.md +176 -0
msprobe/pytorch/doc/ptdbg_ascend_overview.md +68 -0
msprobe/pytorch/doc/ptdbg_ascend_quickstart.md +381 -0
msprobe/pytorch/doc/run_overflow_check.md +25 -0
msprobe/pytorch/doc//321/205/320/254/320/270/321/207/342/225/221/342/224/220/321/207/342/226/223/342/225/233/321/205/342/225/221/320/266/321/206/320/277/320/244/321/205/320/277/342/225/243.md +90 -0
msprobe/pytorch/free_benchmark/__init__.py +8 -0
msprobe/pytorch/free_benchmark/common/__init__.py +0 -0
msprobe/pytorch/free_benchmark/common/constant.py +67 -0
msprobe/pytorch/free_benchmark/common/counter.py +72 -0
msprobe/pytorch/free_benchmark/common/enums.py +37 -0
msprobe/pytorch/free_benchmark/common/params.py +129 -0
msprobe/pytorch/free_benchmark/common/utils.py +98 -0
msprobe/pytorch/free_benchmark/compare/grad_saver.py +183 -0
msprobe/pytorch/free_benchmark/compare/single_benchmark.py +104 -0
msprobe/pytorch/free_benchmark/main.py +102 -0
msprobe/pytorch/free_benchmark/perturbed_layers/__init__.py +0 -0
msprobe/pytorch/free_benchmark/perturbed_layers/base_layer.py +13 -0
msprobe/pytorch/free_benchmark/perturbed_layers/layer_factory.py +41 -0
msprobe/pytorch/free_benchmark/perturbed_layers/npu/__init__.py +0 -0
msprobe/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py +90 -0
msprobe/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py +104 -0
msprobe/pytorch/free_benchmark/perturbed_layers/npu/change_value.py +63 -0
msprobe/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py +68 -0
msprobe/pytorch/free_benchmark/perturbed_layers/npu/no_change.py +28 -0
msprobe/pytorch/free_benchmark/perturbed_layers/npu/npu_base_layser.py +45 -0
msprobe/pytorch/free_benchmark/perturbed_layers/run_cpu.py +19 -0
msprobe/pytorch/free_benchmark/result_handlers/__init__.py +0 -0
msprobe/pytorch/free_benchmark/result_handlers/base_handler.py +203 -0
msprobe/pytorch/free_benchmark/result_handlers/check_handler.py +39 -0
msprobe/pytorch/free_benchmark/result_handlers/fix_handler.py +24 -0
msprobe/pytorch/free_benchmark/result_handlers/handler_factory.py +31 -0
msprobe/pytorch/free_benchmark/result_handlers/preheat_handler.py +170 -0
msprobe/pytorch/functional/__init__.py +0 -0
msprobe/pytorch/functional/data_processor.py +0 -0
msprobe/pytorch/functional/dump_module.py +39 -0
msprobe/pytorch/hook_module/__init__.py +1 -0
msprobe/pytorch/hook_module/api_registry.py +161 -0
msprobe/pytorch/hook_module/hook_module.py +109 -0
msprobe/pytorch/hook_module/support_wrap_ops.yaml +1876 -0
msprobe/pytorch/hook_module/utils.py +29 -0
msprobe/pytorch/hook_module/wrap_aten.py +100 -0
msprobe/pytorch/hook_module/wrap_distributed.py +75 -0
msprobe/pytorch/hook_module/wrap_functional.py +108 -0
msprobe/pytorch/hook_module/wrap_npu_custom.py +73 -0
msprobe/pytorch/hook_module/wrap_tensor.py +72 -0
msprobe/pytorch/hook_module/wrap_torch.py +88 -0
msprobe/pytorch/hook_module/wrap_vf.py +64 -0
msprobe/pytorch/module_processer.py +98 -0
msprobe/pytorch/online_dispatch/__init__.py +20 -0
msprobe/pytorch/online_dispatch/compare.py +236 -0
msprobe/pytorch/online_dispatch/dispatch.py +274 -0
msprobe/pytorch/online_dispatch/dump_compare.py +186 -0
msprobe/pytorch/online_dispatch/single_compare.py +391 -0
msprobe/pytorch/online_dispatch/torch_ops_config.yaml +50 -0
msprobe/pytorch/online_dispatch/utils.py +187 -0
msprobe/pytorch/parse.py +4 -0
msprobe/pytorch/parse_tool/__init__.py +0 -0
msprobe/pytorch/parse_tool/cli.py +32 -0
msprobe/pytorch/parse_tool/lib/__init__.py +0 -0
msprobe/pytorch/parse_tool/lib/compare.py +259 -0
msprobe/pytorch/parse_tool/lib/config.py +51 -0
msprobe/pytorch/parse_tool/lib/file_desc.py +31 -0
msprobe/pytorch/parse_tool/lib/interactive_cli.py +102 -0
msprobe/pytorch/parse_tool/lib/parse_exception.py +54 -0
msprobe/pytorch/parse_tool/lib/parse_tool.py +158 -0
msprobe/pytorch/parse_tool/lib/utils.py +367 -0
msprobe/pytorch/parse_tool/lib/visualization.py +90 -0
msprobe/pytorch/pt_config.py +93 -0
msprobe/pytorch/service.py +167 -0
msprobe/test/core_ut/common/test_utils.py +345 -0
msprobe/test/core_ut/data_dump/test_data_collector.py +47 -0
msprobe/test/core_ut/data_dump/test_json_writer.py +183 -0
msprobe/test/core_ut/data_dump/test_scope.py +151 -0
msprobe/test/core_ut/test_common_config.py +152 -0
msprobe/test/core_ut/test_file_check.py +218 -0
msprobe/test/core_ut/test_log.py +109 -0
msprobe/test/mindspore_ut/test_api_kbk_dump.py +51 -0
msprobe/test/mindspore_ut/test_debugger_config.py +42 -0
msprobe/test/mindspore_ut/test_dump_tool_factory.py +51 -0
msprobe/test/mindspore_ut/test_kernel_graph_dump.py +66 -0
msprobe/test/mindspore_ut/test_kernel_graph_overflow_check.py +63 -0
msprobe/test/mindspore_ut/test_ms_config.py +69 -0
msprobe/test/mindspore_ut/test_overflow_check_tool_factory.py +51 -0
msprobe/test/mindspore_ut/test_precision_debugger.py +56 -0
msprobe/test/mindspore_ut/test_task_handler_factory.py +58 -0
msprobe/test/pytorch_ut/advisor/test_advisor.py +83 -0
msprobe/test/pytorch_ut/api_accuracy_checker/common/test_common_utils.py +108 -0
msprobe/test/pytorch_ut/api_accuracy_checker/common/test_config.py +39 -0
msprobe/test/pytorch_ut/api_accuracy_checker/compare/test_algorithm.py +112 -0
msprobe/test/pytorch_ut/api_accuracy_checker/compare/test_api_precision_compare.py +77 -0
msprobe/test/pytorch_ut/api_accuracy_checker/compare/test_compare.py +125 -0
msprobe/test/pytorch_ut/api_accuracy_checker/compare/test_compare_column.py +10 -0
msprobe/test/pytorch_ut/api_accuracy_checker/compare/test_compare_utils.py +43 -0
msprobe/test/pytorch_ut/api_accuracy_checker/run_ut/dump.json +179 -0
msprobe/test/pytorch_ut/api_accuracy_checker/run_ut/forward.json +63 -0
msprobe/test/pytorch_ut/api_accuracy_checker/run_ut/test_data_generate.py +99 -0
msprobe/test/pytorch_ut/api_accuracy_checker/run_ut/test_multi_run_ut.py +115 -0
msprobe/test/pytorch_ut/api_accuracy_checker/run_ut/test_run_ut.py +72 -0
msprobe/test/pytorch_ut/compare/test_acc_compare.py +17 -0
msprobe/test/pytorch_ut/free_benchmark/perturbed_layers/test_perturbed_layser.py +105 -0
msprobe/test/pytorch_ut/free_benchmark/result_handlers/test_result_handler.py +121 -0
msprobe/test/pytorch_ut/free_benchmark/test_main.py +101 -0
msprobe/test/pytorch_ut/functional/test_dump_module.py +15 -0
msprobe/test/pytorch_ut/hook_module/test_api_registry.py +130 -0
msprobe/test/pytorch_ut/hook_module/test_hook_module.py +42 -0
msprobe/test/pytorch_ut/hook_module/test_wrap_aten.py +65 -0
msprobe/test/pytorch_ut/hook_module/test_wrap_distributed.py +35 -0
msprobe/test/pytorch_ut/hook_module/test_wrap_functional.py +20 -0
msprobe/test/pytorch_ut/hook_module/test_wrap_tensor.py +35 -0
msprobe/test/pytorch_ut/hook_module/test_wrap_torch.py +43 -0
msprobe/test/pytorch_ut/hook_module/test_wrap_vf.py +11 -0
msprobe/test/pytorch_ut/test_pt_config.py +69 -0
msprobe/test/pytorch_ut/test_service.py +59 -0
msprobe/test/resources/advisor.txt +3 -0
msprobe/test/resources/compare_result_20230703104808.csv +9 -0
msprobe/test/resources/compare_result_without_accuracy.csv +9 -0
msprobe/test/resources/config.yaml +3 -0
msprobe/test/resources/npu_test.pkl +8 -0
msprobe/test/run_test.sh +30 -0
msprobe/test/run_ut.py +58 -0
msprobe/test/test_module_processer.py +64 -0

msprobe/mindspore/debugger/precision_debugger.py ADDED Viewed

@@ -0,0 +1,32 @@
+import os
+from msprobe.mindspore.ms_config import parse_json_config
+from msprobe.mindspore.debugger.debugger_config import DebuggerConfig
+from msprobe.mindspore.task_handler_factory import TaskHandlerFactory
+class PrecisionDebugger:
+    _instance = None
+    def __new__(cls, config_path=None):
+        if not cls._instance:
+            cls._instance = super().__new__(cls)
+            cls._instance.initialized = False
+            cls._instance.config = None
+        return cls._instance
+    def __init__(self, config_path=None):
+        if self.initialized:
+            return
+        if not config_path:
+            config_path = os.path.join(os.path.dirname(__file__), "../../config/config.json")
+        common_config, task_config = parse_json_config(config_path)
+        self.config = DebuggerConfig(common_config, task_config)
+        self.initialized = True
+    @classmethod
+    def start(cls, target=None):
+        instance = cls._instance
+        if not instance:
+            raise Exception("No instance of PrecisionDebugger found.")
+        handler = TaskHandlerFactory.create(instance.config)
+        handler.handle()

msprobe/mindspore/doc/dump.md ADDED Viewed

@@ -0,0 +1,65 @@
+# **精度数据采集**
+msprobe工具主要通过在训练脚本内添加dump接口并启动训练的方式来采集精度数据。
+执行dump操作需要安装msprobe工具。详见《[MindStudio精度调试工具](../../README.md)》的“工具安装”章节。
+## dump接口介绍
+### PrecisionDebugger
+**功能说明**
+通过加载dump配置文件的方式来确定dump操作的详细配置。
+可以在from msprobe.mindspore import PrecisionDebugger和模型初始化之间的任意位置添加该接口。
+**原型**
+```Python
+PrecisionDebugger(config_path=None)
+```
+**参数说明**
+| 参数名      | 说明                                                         | 是否必选 |
+| ----------- | ------------------------------------------------------------ | -------- |
+| config_path | 指定dump配置文件路径，String类型。参数示例："./config.json"。未配置该路径时，默认使用[config.json](../../config)文件的默认配置。config.json文件可以配置更多参数，若需要进行更多场景的精度数据dump，建议配置[config.json](../../config/config.json)文件。 | 否       |
+### start函数
+**功能说明**
+启动函数。
+**原型**
+```Python
+debugger.start()
+```
+该函数为类函数，可以使用debugger.start()也可以使用PrecisionDebugger.start()。
+## 示例代码
+```Python
+from msprobe.mindspore import PrecisionDebugger
+debugger = PrecisionDebugger(config_path="./config.json")
+# 请勿将以上初始化流程插入到循环代码中
+# 下面代码也可以用PrecisionDebugger.start()
+debugger.start()
+...
+```
+## dump结果文件介绍
+训练结束后，工具将dump的数据保存在dump_path参数指定的目录下。
+- level为L1时
+  dump结果目录请参见MindSpore官网中的《[同步Dump数据对象目录](https://www.mindspore.cn/tutorials/experts/zh-CN/r2.3.0rc2/debug/dump.html#%E5%90%8C%E6%AD%A5dump%E6%95%B0%E6%8D%AE%E5%AF%B9%E8%B1%A1%E7%9B%AE%E5%BD%95)》。
+- level为L2时
+  dump结果目录请参见MindSpore官网中的《[异步Dump数据对象目录](https://www.mindspore.cn/tutorials/experts/zh-CN/r2.3.0rc2/debug/dump.html#%E5%BC%82%E6%AD%A5dump%E6%95%B0%E6%8D%AE%E5%AF%B9%E8%B1%A1%E7%9B%AE%E5%BD%95)》。

msprobe/mindspore/dump/__init__.py ADDED Viewed

File without changes

msprobe/mindspore/dump/api_kbk_dump.py ADDED Viewed

@@ -0,0 +1,55 @@
+import os
+import json
+from msprobe.core.common.utils import make_dump_path_if_not_exists
+from msprobe.mindspore.debugger.debugger_config import DebuggerConfig
+from msprobe.core.common.log import logger
+from msprobe.core.common.file_check import FileOpen
+class ApiKbkDump:
+    def __init__(self, config: DebuggerConfig):
+        self.dump_json = dict()
+        self.dump_json["common_dump_settings"] = dict()
+        self.dump_json["common_dump_settings"]["dump_mode"] = 0
+        self.dump_json["common_dump_settings"]["path"] = ""
+        self.dump_json["common_dump_settings"]["net_name"] = "Net"
+        self.dump_json["common_dump_settings"]["iteration"] = "all"
+        self.dump_json["common_dump_settings"]["saved_data"] = "statistic"
+        self.dump_json["common_dump_settings"]["input_output"] = 0
+        self.dump_json["common_dump_settings"]["kernels"] = []
+        self.dump_json["common_dump_settings"]["support_device"] = [0,1,2,3,4,5,6,7]
+        self.dump_json["e2e_dump_settings"] = dict()
+        self.dump_json["e2e_dump_settings"]["enable"] = True
+        self.dump_json["e2e_dump_settings"]["trans_flag"] = True
+        if len(config.list) > 0:
+            self.dump_json["common_dump_settings"]["dump_mode"] = 1
+            self.dump_json["common_dump_settings"]["kernels"] = config.list
+        self.dump_json["common_dump_settings"]["path"] = config.dump_path
+        if len(config.step) > 0:
+            step_str = ""
+            for s in config.step:
+                step_str += (str(s) + '|')
+            self.dump_json["common_dump_settings"]["iteration"] = step_str[:-1]
+        if len(config.rank) > 0:
+            self.dump_json["common_dump_settings"]["support_device"] = config.rank
+        if config.task == "tensor":
+            self.dump_json["common_dump_settings"]["saved_data"] = "tensor"
+        if len(config.data_mode) == 1:
+            if config.data_mode[0] == "input":
+                self.dump_json["common_dump_settings"]["input_output"] = 1
+            if config.data_mode[0] == "output":
+                self.dump_json["common_dump_settings"]["input_output"] = 2
+    def handle(self):
+        json_path = self.dump_json["common_dump_settings"]["path"]
+        make_dump_path_if_not_exists(json_path)
+        json_path = os.path.join(json_path, "api_kbk_dump.json")
+        with FileOpen(json_path, 'w') as f:
+            json.dump(self.dump_json, f)
+        logger.info(json_path + " has been created.")
+        os.environ["GRAPH_OP_RUN"] = "1"
+        os.environ["MINDSPORE_DUMP_CONFIG"] = json_path
+        if "MS_ACL_DUMP_CFG_PATH" in os.environ:
+            del os.environ["MS_ACL_DUMP_CFG_PATH"]

msprobe/mindspore/dump/dump_tool_factory.py ADDED Viewed

@@ -0,0 +1,38 @@
+from msprobe.mindspore.debugger.debugger_config import DebuggerConfig
+from msprobe.mindspore.dump.api_kbk_dump import ApiKbkDump
+from msprobe.mindspore.dump.kernel_graph_dump import KernelGraphDump
+class DumpToolFactory:
+    tools = {
+        "cell": {
+            "kbk": None,
+            "graph": None,
+            "pynative": None
+        },
+        "api": {
+            "kbk": ApiKbkDump,
+            "graph": None,
+            "pynative": None
+        },
+        "kernel": {
+            "kbk": None,
+            "graph": KernelGraphDump,
+            "pynative": None
+        }
+    }
+    @staticmethod
+    def create(config: DebuggerConfig):
+        tool = DumpToolFactory.tools.get(config.level)
+        if not tool:
+            raise Exception("valid level is needed.")
+        if config.level == "api":
+            tool = tool.get("kbk")
+        elif config.level == "kernel":
+            tool = tool.get("graph")
+        elif config.level == "cell":
+            raise Exception("Cell dump in not supported now.")
+        if not tool:
+            raise Exception("Data dump in not supported in this mode.")
+        return tool(config)

msprobe/mindspore/dump/kernel_graph_dump.py ADDED Viewed

@@ -0,0 +1,60 @@
+import os
+import json
+from msprobe.core.common.utils import make_dump_path_if_not_exists
+from msprobe.mindspore.debugger.debugger_config import DebuggerConfig
+from msprobe.core.common.log import logger
+from msprobe.core.common.file_check import FileOpen
+class KernelGraphDump:
+    def __init__(self, config: DebuggerConfig):
+        self.dump_json = dict()
+        self.dump_json["common_dump_settings"] = dict()
+        self.dump_json["common_dump_settings"]["dump_mode"] = 0
+        self.dump_json["common_dump_settings"]["path"] = ""
+        self.dump_json["common_dump_settings"]["net_name"] = "Net"
+        self.dump_json["common_dump_settings"]["iteration"] = "all"
+        self.dump_json["common_dump_settings"]["saved_data"] = "statistic"
+        self.dump_json["common_dump_settings"]["input_output"] = 0
+        self.dump_json["common_dump_settings"]["kernels"] = []
+        self.dump_json["common_dump_settings"]["support_device"] = [0, 1, 2, 3, 4, 5, 6, 7]
+        self.dump_json["common_dump_settings"]["op_debug_mode"] = 0
+        self.dump_json["common_dump_settings"]["file_format"] = "npy"
+        if len(config.list) > 0:
+            self.dump_json["common_dump_settings"]["dump_mode"] = 1
+            self.dump_json["common_dump_settings"]["kernels"] = config.list
+        self.dump_json["common_dump_settings"]["path"] = config.dump_path
+        if len(config.step) > 0:
+            step_str = ""
+            for s in config.step:
+                step_str += (str(s) + '|')
+            self.dump_json["common_dump_settings"]["iteration"] = step_str[:-1]
+        if len(config.rank) > 0:
+            self.dump_json["common_dump_settings"]["support_device"] = config.rank
+        if config.task == "tensor":
+            self.dump_json["common_dump_settings"]["saved_data"] = "tensor"
+            self.dump_json["common_dump_settings"]["file_format"] = config.file_format
+        if len(config.data_mode) == 1:
+            if config.data_mode[0] == "input":
+                self.dump_json["common_dump_settings"]["input_output"] = 1
+            if config.data_mode[0] == "output":
+                self.dump_json["common_dump_settings"]["input_output"] = 2
+    def handle(self):
+        if os.getenv("GRAPH_OP_RUN") == "1":
+            raise Exception("Must run in graph mode, not kbk mode")
+        json_path = self.dump_json["common_dump_settings"]["path"]
+        make_dump_path_if_not_exists(json_path)
+        json_path = os.path.join(json_path, "kernel_graph_dump.json")
+        with FileOpen(json_path, 'w') as f:
+            json.dump(self.dump_json, f)
+        logger.info(json_path + " has been created.")
+        os.environ["MINDSPORE_DUMP_CONFIG"] = json_path
+        if self.dump_json["common_dump_settings"]["dump_mode"] == 0:
+            if self.dump_json["common_dump_settings"]["iteration"] != "all" or \
+               len(self.dump_json["common_dump_settings"]["kernels"]) == 0:
+                os.environ["MS_ACL_DUMP_CFG_PATH"] = json_path
+        else:
+            if "MS_ACL_DUMP_CFG_PATH" in os.environ:
+                del os.environ["MS_ACL_DUMP_CFG_PATH"]

msprobe/mindspore/ms_config.py ADDED Viewed

@@ -0,0 +1,78 @@
+import json
+from msprobe.core.common_config import CommonConfig, BaseConfig
+from msprobe.core.common.file_check import FileOpen
+class TensorConfig(BaseConfig):
+    def __init__(self, json_config):
+        super().__init__(json_config)
+        self.check_mode = None
+        self.file_format = json_config.get("file_format")
+        self.check_config()
+        self._check_config()
+    def _check_config(self):
+        if self.data_mode is not None and len(self.data_mode) > 0:
+            if len(self.data_mode) > 1 or self.data_mode[0] not in ["all", "input", "output"]:
+                raise Exception("data_mode must be all, input or output")
+        if self.file_format and self.file_format not in ["npy", "bin"]:
+            raise Exception("file_format is invalid")
+class StatisticsConfig(BaseConfig):
+    def __init__(self, json_config):
+        super().__init__(json_config)
+        self.file_format = None
+        self.check_mode = None
+        self.check_config()
+        self._check_config()
+    def _check_config(self):
+        if self.data_mode is not None and len(self.data_mode) > 0:
+            if len(self.data_mode) > 1 or self.data_mode[0] not in ["all", "input", "output"]:
+                raise Exception("data_mode must be all, input or output")
+class OverflowCheck(BaseConfig):
+    def __init__(self, json_config):
+        super().__init__(json_config)
+        self.file_format = None
+        self.check_mode = json_config.get("check_mode")
+        self._check_config()
+    def _check_config(self):
+        if self.data_mode is not None and len(self.data_mode) > 0:
+            if len(self.data_mode) > 1 or self.data_mode[0] not in ["all", "input", "output"]:
+                raise Exception("data_mode must be all, input or output")
+        if self.check_mode and self.check_mode not in ["all", "aicore", "atomic"]:
+            raise Exception("check_mode is invalid")
+def parse_common_config(json_config):
+    return CommonConfig(json_config)
+def parse_task_config(task, json_config):
+    task_map = json_config[task]
+    if not task_map:
+        task_map = dict()
+    if task == "tensor":
+        return TensorConfig(task_map)
+    elif task == "statistics":
+        return StatisticsConfig(task_map)
+    elif task == "overflow_check":
+        return OverflowCheck(task_map)
+    else:
+        raise Exception("task is invalid.")
+def parse_json_config(json_file_path):
+    if not json_file_path:
+        raise Exception("json file path is None")
+    with FileOpen(json_file_path, 'r') as file:
+        json_config = json.load(file)
+    common_config = parse_common_config(json_config)
+    if not common_config.task:
+        common_config.task = "statistics"
+    task_config = parse_task_config(common_config.task, json_config)
+    return common_config, task_config

msprobe/mindspore/overflow_check/__init__.py ADDED Viewed

File without changes

msprobe/mindspore/overflow_check/kernel_graph_overflow_check.py ADDED Viewed

@@ -0,0 +1,45 @@
+import os
+import json
+from msprobe.core.common.utils import make_dump_path_if_not_exists
+from msprobe.mindspore.debugger.debugger_config import DebuggerConfig
+from msprobe.core.common.log import logger
+from msprobe.core.common.file_check import FileOpen
+class KernelGraphOverflowCheck:
+    def __init__(self, config: DebuggerConfig):
+        self.dump_json = dict()
+        self.dump_json["common_dump_settings"] = dict()
+        self.dump_json["common_dump_settings"]["dump_mode"] = 0
+        self.dump_json["common_dump_settings"]["path"] = ""
+        self.dump_json["common_dump_settings"]["net_name"] = "Net"
+        self.dump_json["common_dump_settings"]["iteration"] = "all"
+        self.dump_json["common_dump_settings"]["saved_data"] = "full"
+        self.dump_json["common_dump_settings"]["input_output"] = 0
+        self.dump_json["common_dump_settings"]["kernels"] = []
+        self.dump_json["common_dump_settings"]["support_device"] = [0,1,2,3,4,5,6,7]
+        self.dump_json["common_dump_settings"]["op_debug_mode"] = 3
+        self.dump_json["common_dump_settings"]["file_format"] = "npy"
+        self.dump_json["common_dump_settings"]["path"] = config.dump_path
+        if len(config.step) > 0:
+            logger.warning("Step would change to all in this task.")
+        if len(config.rank) > 0:
+            self.dump_json["common_dump_settings"]["support_device"] = config.rank
+        if config.check_mode == "aicore":
+            self.dump_json["common_dump_settings"]["op_debug_mode"] = 1
+        elif config.check_mode == "atomic":
+            self.dump_json["common_dump_settings"]["op_debug_mode"] = 2
+    def handle(self):
+        if os.getenv("GRAPH_OP_RUN") == "1":
+            raise Exception("Must run in graph mode, not kbk mode")
+        json_path = self.dump_json["common_dump_settings"]["path"]
+        make_dump_path_if_not_exists(json_path)
+        json_path = os.path.join(json_path, "kernel_graph_overflow_check.json")
+        with FileOpen(json_path, 'w') as f:
+            json.dump(self.dump_json, f)
+        logger.info(json_path + " has been created.")
+        os.environ["MINDSPORE_DUMP_CONFIG"] = json_path
+        if "MS_ACL_DUMP_CFG_PATH" in os.environ:
+            del os.environ["MS_ACL_DUMP_CFG_PATH"]

msprobe/mindspore/overflow_check/overflow_check_tool_factory.py ADDED Viewed

@@ -0,0 +1,32 @@
+from msprobe.mindspore.debugger.debugger_config import DebuggerConfig
+from msprobe.mindspore.overflow_check.kernel_graph_overflow_check import KernelGraphOverflowCheck
+class OverflowCheckToolFactory:
+    tools = {
+        "cell": {
+            "kbk": None,
+            "graph": None,
+            "pynative": None
+        },
+        "api": {
+            "kbk": None,
+            "graph": None,
+            "pynative": None
+        },
+        "kernel": {
+            "kbk": None,
+            "graph": KernelGraphOverflowCheck,
+            "pynative": None
+        }
+    }
+    @staticmethod
+    def create(config: DebuggerConfig):
+        tool = OverflowCheckToolFactory.tools.get(config.level)
+        if not tool:
+            raise Exception("valid level is needed.")
+        tool = tool.get("graph")
+        if not tool:
+            raise Exception("Overflow check in not supported in this mode.")
+        return tool(config)

msprobe/mindspore/task_handler_factory.py ADDED Viewed

@@ -0,0 +1,21 @@
+from msprobe.mindspore.debugger.debugger_config import DebuggerConfig
+from msprobe.mindspore.dump.dump_tool_factory import DumpToolFactory
+from msprobe.mindspore.overflow_check.overflow_check_tool_factory import OverflowCheckToolFactory
+class TaskHandlerFactory:
+    tasks = {
+        "tensor": DumpToolFactory,
+        "statistics": DumpToolFactory,
+        "overflow_check": OverflowCheckToolFactory
+    }
+    @staticmethod
+    def create(config: DebuggerConfig):
+        task = TaskHandlerFactory.tasks.get(config.task)
+        if not task:
+            raise Exception("valid task is needed.")
+        handler = task.create(config)
+        if not handler:
+            raise Exception("Can not find task handler")
+        return handler

msprobe/msprobe.py ADDED Viewed

@@ -0,0 +1,67 @@
+# Copyright (c) 2024, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import sys
+from msprobe.pytorch.api_accuracy_checker.run_ut.run_ut import _run_ut_parser, run_ut_command
+from msprobe.pytorch.parse_tool.cli import parse as cli_parse
+from msprobe.pytorch.api_accuracy_checker.run_ut.multi_run_ut import prepare_config, run_parallel_ut
+from msprobe.pytorch.api_accuracy_checker.compare.api_precision_compare import _api_precision_compare_parser, \
+    _api_precision_compare_command
+from msprobe.pytorch.api_accuracy_checker.run_ut.run_overflow_check import _run_overflow_check_parser, \
+    _run_overflow_check_command
+def main():
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        description="msprobe(mindstudio probe), [Powered by MindStudio].\n"
+                    "Providing one-site accuracy difference debugging toolkit for training on Ascend Devices.\n"
+                    f"For any issue, refer README.md first",
+    )
+    parser.set_defaults(print_help=parser.print_help)
+    parser.add_argument('-f', '--framework', required=True, choices=['pytorch'],
+                        help='Deep learning framework.')
+    subparsers = parser.add_subparsers()
+    subparsers.add_parser('parse')
+    run_ut_cmd_parser = subparsers.add_parser('run_ut')
+    multi_run_ut_cmd_parser = subparsers.add_parser('multi_run_ut')
+    api_precision_compare_cmd_parser = subparsers.add_parser('api_precision_compare')
+    run_overflow_check_cmd_parser = subparsers.add_parser('run_overflow_check')
+    _run_ut_parser(run_ut_cmd_parser)
+    _run_ut_parser(multi_run_ut_cmd_parser)
+    multi_run_ut_cmd_parser.add_argument('-n', '--num_splits', type=int, choices=range(1, 65), default=8,
+                                         help='Number of splits for parallel processing. Range: 1-64')
+    _api_precision_compare_parser(api_precision_compare_cmd_parser)
+    _run_overflow_check_parser(run_overflow_check_cmd_parser)
+    if len(sys.argv) == 1:
+        parser.print_help()
+        sys.exit(0)
+    args = parser.parse_args(sys.argv[1:])
+    if sys.argv[3] == "run_ut":
+        run_ut_command(args)
+    elif sys.argv[3] == "parse":
+        cli_parse()
+    elif sys.argv[3] == "multi_run_ut":
+        config = prepare_config(args)
+        run_parallel_ut(config)
+    elif sys.argv[3] == "api_precision_compare":
+        _api_precision_compare_command(args)
+    elif sys.argv[3] == "run_overflow_check":
+        _run_overflow_check_command(args)
+if __name__ == "__main__":  # run the CLI only when executed as a script, not on import
+    main()

msprobe/pytorch/__init__.py ADDED Viewed

@@ -0,0 +1,4 @@
+from .debugger.precision_debugger import PrecisionDebugger
+from .common.utils import seed_all
+from .compare.acc_compare import compare
+from .compare.distributed_compare import compare_distributed

msprobe/pytorch/advisor/advisor.py ADDED Viewed

@@ -0,0 +1,124 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+# Copyright (C) 2022-2024. Huawei Technologies Co., Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+import os
+from msprobe.pytorch.advisor.advisor_result import AdvisorResult
+from msprobe.pytorch.advisor.advisor_const import AdvisorConst
+from msprobe.pytorch.common.log import logger
+from msprobe.core.common.utils import CompareException
+from msprobe.core.common.file_check import FileChecker
+from msprobe.core.common.const import Const, CompareConst, FileCheckConst
+class Advisor:
+    """
+    Class for generate advisor
+    """
+    def __init__(self, input_data, out_path=""):
+        self.input_data = input_data
+        self.out_path = os.path.realpath(out_path)
+        self.file_type = None
+    @staticmethod
+    def deterministic_advisor(message, node_name):
+        for api_name in AdvisorConst.NEED_DETERMINISTIC_API:
+            if api_name in node_name:
+                return AdvisorConst.DETERMINISTIC_SUGGEST
+        return message
+    @staticmethod
+    def batch_norm_advisor(message, node_name):
+        if AdvisorConst.FUNC_BATCH_NORM in node_name and AdvisorConst.FORWARD_INPUT_1 in node_name:
+            message = AdvisorConst.BATCH_NORM_SUGGEST
+        return message
+    def analyze_unmatched(self, analyze_data):
+        if self.file_type == Const.ALL:
+            accuracy_unmatched = analyze_data[
+                analyze_data[CompareConst.ACCURACY] == CompareConst.ACCURACY_CHECK_UNMATCH]
+        else:
+            accuracy_unmatched = analyze_data[(analyze_data[CompareConst.NPU_SHAPE] == CompareConst.NAN) |
+                                              (analyze_data[CompareConst.BENCH_SHAPE] == CompareConst.NAN)]
+        num_unmatch = len(accuracy_unmatched)
+        if num_unmatch != 0:
+            for i in range(len(accuracy_unmatched)):
+                item = accuracy_unmatched.iloc[i]
+                logger.warning("The tensor name matches but the shape or dtype does not match: {}"
+                            .format(item[CompareConst.NPU_NAME]))
+    def gen_advisor_result(self, pd_data):
+        first_failing_data = pd_data.iloc[0]
+        node_name = first_failing_data[CompareConst.NPU_NAME]
+        index = first_failing_data['index']
+        message = self.gen_advisor_message(node_name)
+        logger.warning("Find %s accuracy not reached, the line is %s" % (node_name, index))
+        result = AdvisorResult(node_name, index, message)
+        return result
+    def gen_advisor_message(self, node_name):
+        if AdvisorConst.FORWARD in node_name:
+            if AdvisorConst.INPUT in node_name:
+                message = AdvisorConst.FORWARD_INPUT_SUGGEST
+            else:
+                message = AdvisorConst.FORWARD_OUTPUT_SUGGEST
+                message = self.deterministic_advisor(message, node_name)
+        else:
+            if AdvisorConst.INPUT in node_name:
+                message = AdvisorConst.BACKWARD_INPUT_SUGGEST
+            else:
+                message = AdvisorConst.BACKWARD_OUTPUT_SUGGEST
+                message = self.deterministic_advisor(message, node_name)
+        message = self.batch_norm_advisor(message, node_name)
+        return message
+    def analysis(self):
+        self._check_path_vaild()
+        analyze_data = self._parse_input_data()
+        logger.info("Start analyzing the comparison result: %s" % self.file_type)
+        self.analyze_unmatched(analyze_data)
+        if self.file_type == Const.ALL:
+            failing_data = analyze_data[analyze_data[CompareConst.ACCURACY] == CompareConst.ACCURACY_CHECK_NO]
+        elif self.file_type == Const.MD5:
+            failing_data = analyze_data[analyze_data[CompareConst.RESULT] == CompareConst.DIFF]
+        elif self.file_type == Const.SUMMARY:
+            failing_data = analyze_data[analyze_data[CompareConst.RESULT] == CompareConst.WARNING]
+        if failing_data.empty:
+            logger.info("All data from api input/output accuracy reached")
+            result = AdvisorResult(AdvisorConst.NO_ERROR_API, AdvisorConst.NO_ERROR_API, AdvisorConst.NO_ERR_SUGGEST)
+        else:
+            result = self.gen_advisor_result(failing_data)
+        message_list = result.print_advisor_log()
+        result.gen_summary_file(self.out_path, message_list)
+    def _parse_input_data(self):
+        data_columns = self.input_data.columns.values
+        if {CompareConst.ACCURACY, CompareConst.NPU_NAME}.issubset(data_columns):
+            self.file_type = Const.ALL
+        elif {CompareConst.RESULT, CompareConst.NPU_MD5}.issubset(data_columns):
+            self.file_type = Const.MD5
+        elif {CompareConst.MAX_DIFF, CompareConst.RESULT}.issubset(data_columns):
+            self.file_type = Const.SUMMARY
+        else:
+            logger.error('Compare result does not meet the required conditions.')
+            raise CompareException(CompareException.INVALID_DATA_ERROR)
+        df = self.input_data.reset_index()
+        return df
+    def _check_path_vaild(self):
+        out_path_checker = FileChecker(self.out_path, FileCheckConst.DIR, FileCheckConst.WRITE_ABLE)
+        out_path_checker.common_check()