PyPI - mindstudio-probe - Versions diffs - 1.2.2__py3-none-any.whl → 8.1.0__py3-none-any.whl - Mend

mindstudio-probe 1.2.2 (py3-none-any.whl) → 8.1.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (261) hide show

{mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/METADATA +4 -3
{mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/RECORD +243 -191
msprobe/README.md +57 -21
msprobe/core/__init__.py +17 -0
msprobe/core/common/const.py +224 -82
msprobe/core/common/decorator.py +50 -0
msprobe/core/common/exceptions.py +5 -3
msprobe/core/common/file_utils.py +274 -40
msprobe/core/common/framework_adapter.py +169 -0
msprobe/core/common/global_lock.py +86 -0
msprobe/core/common/runtime.py +25 -0
msprobe/core/common/utils.py +148 -72
msprobe/core/common_config.py +7 -0
msprobe/core/compare/acc_compare.py +640 -462
msprobe/core/compare/check.py +36 -107
msprobe/core/compare/compare_cli.py +4 -0
msprobe/core/compare/config.py +72 -0
msprobe/core/compare/highlight.py +217 -215
msprobe/core/compare/layer_mapping/layer_mapping.py +4 -1
msprobe/core/compare/merge_result/merge_result.py +12 -6
msprobe/core/compare/multiprocessing_compute.py +227 -107
msprobe/core/compare/npy_compare.py +32 -16
msprobe/core/compare/utils.py +218 -244
msprobe/{mindspore/runtime.py → core/config_check/__init__.py} +2 -4
msprobe/{pytorch/dump/kernel_dump/kernel_config.py → core/config_check/checkers/__init__.py} +8 -16
msprobe/core/config_check/checkers/base_checker.py +60 -0
msprobe/core/config_check/checkers/dataset_checker.py +138 -0
msprobe/core/config_check/checkers/env_args_checker.py +96 -0
msprobe/core/config_check/checkers/hyperparameter_checker.py +170 -0
msprobe/core/config_check/checkers/pip_checker.py +90 -0
msprobe/core/config_check/checkers/random_checker.py +367 -0
msprobe/core/config_check/checkers/weights_checker.py +147 -0
msprobe/core/config_check/ckpt_compare/ckpt_comparator.py +74 -0
msprobe/core/config_check/ckpt_compare/megatron_loader.py +302 -0
msprobe/core/config_check/ckpt_compare/metrics.py +83 -0
msprobe/core/config_check/ckpt_compare/name_mapping.yaml +12 -0
msprobe/core/config_check/config_check_cli.py +51 -0
msprobe/core/config_check/config_checker.py +100 -0
msprobe/{pytorch/parse.py → core/config_check/resource/dependency.yaml} +7 -4
msprobe/core/config_check/resource/env.yaml +57 -0
msprobe/core/config_check/resource/hyperparameter.yaml +21 -0
msprobe/core/config_check/utils/hyperparameter_parser.py +115 -0
msprobe/core/config_check/utils/utils.py +107 -0
msprobe/core/data_dump/api_registry.py +239 -0
msprobe/core/data_dump/data_collector.py +36 -9
msprobe/core/data_dump/data_processor/base.py +74 -53
msprobe/core/data_dump/data_processor/mindspore_processor.py +119 -78
msprobe/core/data_dump/data_processor/pytorch_processor.py +134 -96
msprobe/core/data_dump/json_writer.py +146 -57
msprobe/core/debugger/precision_debugger.py +143 -0
msprobe/core/grad_probe/constant.py +2 -1
msprobe/core/grad_probe/grad_compare.py +2 -2
msprobe/core/grad_probe/utils.py +1 -1
msprobe/core/hook_manager.py +242 -0
msprobe/core/monitor/anomaly_processor.py +384 -0
msprobe/core/overflow_check/abnormal_scene.py +2 -0
msprobe/core/service.py +356 -0
msprobe/core/single_save/__init__.py +0 -0
msprobe/core/single_save/single_comparator.py +243 -0
msprobe/core/single_save/single_saver.py +157 -0
msprobe/docs/01.installation.md +6 -5
msprobe/docs/02.config_introduction.md +89 -30
msprobe/docs/03.config_examples.md +1 -0
msprobe/docs/04.kernel_dump_PyTorch.md +1 -1
msprobe/docs/05.data_dump_PyTorch.md +184 -50
msprobe/docs/06.data_dump_MindSpore.md +193 -28
msprobe/docs/07.accuracy_checker_PyTorch.md +13 -3
msprobe/docs/08.accuracy_checker_online_PyTorch.md +72 -10
msprobe/docs/09.accuracy_checker_MindSpore.md +19 -7
msprobe/docs/10.accuracy_compare_PyTorch.md +266 -102
msprobe/docs/11.accuracy_compare_MindSpore.md +117 -43
msprobe/docs/12.overflow_check_PyTorch.md +5 -3
msprobe/docs/13.overflow_check_MindSpore.md +6 -4
msprobe/docs/14.data_parse_PyTorch.md +4 -10
msprobe/docs/17.grad_probe.md +2 -1
msprobe/docs/18.online_dispatch.md +3 -3
msprobe/docs/19.monitor.md +211 -103
msprobe/docs/21.visualization_PyTorch.md +100 -28
msprobe/docs/22.visualization_MindSpore.md +103 -31
msprobe/docs/23.generate_operator_PyTorch.md +9 -9
msprobe/docs/25.tool_function_introduction.md +23 -22
msprobe/docs/26.data_dump_PyTorch_baseline.md +14 -3
msprobe/docs/27.dump_json_instruction.md +278 -8
msprobe/docs/28.debugger_save_instruction.md +111 -20
msprobe/docs/28.kernel_dump_MindSpore.md +1 -1
msprobe/docs/29.data_dump_MSAdapter.md +229 -0
msprobe/docs/30.overflow_check_MSAdapter.md +31 -0
msprobe/docs/31.config_check.md +95 -0
msprobe/docs/32.ckpt_compare.md +69 -0
msprobe/docs/33.generate_operator_MindSpore.md +190 -0
msprobe/docs/34.RL_collect.md +92 -0
msprobe/docs/35.nan_analyze.md +72 -0
msprobe/docs/FAQ.md +3 -11
msprobe/docs/data_dump_MindSpore/data_dump_MindSpore_baseline.md +12 -1
msprobe/docs/data_dump_MindSpore/dynamic_graph_quick_start_example.md +3 -1
msprobe/docs/img/compare_result.png +0 -0
msprobe/docs/img/merge_result.png +0 -0
msprobe/docs/img/save_compare_result_sample.png +0 -0
msprobe/docs/img/visualization/proxy.png +0 -0
msprobe/docs/img/visualization/vis_browser_1.png +0 -0
msprobe/docs/img/visualization/vis_match_info.png +0 -0
msprobe/docs/img/visualization/vis_precision_info.png +0 -0
msprobe/docs/img/visualization/vis_search_info.png +0 -0
msprobe/docs/img/visualization/vis_show_info.png +0 -0
msprobe/docs/img/visualization/vis_showcase.png +0 -0
msprobe/docs/img/visualization/vis_unmatch_info.png +0 -0
msprobe/mindspore/__init__.py +3 -3
msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +151 -55
msprobe/mindspore/api_accuracy_checker/api_runner.py +25 -11
msprobe/mindspore/api_accuracy_checker/base_compare_algorithm.py +2 -1
msprobe/mindspore/api_accuracy_checker/bench_functions/flash_attention_score.py +580 -0
msprobe/mindspore/api_accuracy_checker/bench_functions/fusion_operator.py +41 -0
msprobe/mindspore/api_accuracy_checker/cmd_parser.py +4 -0
msprobe/mindspore/api_accuracy_checker/data_manager.py +4 -3
msprobe/mindspore/api_accuracy_checker/generate_op_script/config_op.json +9 -0
msprobe/mindspore/api_accuracy_checker/generate_op_script/op_generator.py +451 -0
msprobe/mindspore/api_accuracy_checker/generate_op_script/operator_replication.template +2081 -0
msprobe/mindspore/api_accuracy_checker/multi_api_accuracy_checker.py +11 -1
msprobe/mindspore/api_accuracy_checker/torch_mindtorch_importer.py +2 -1
msprobe/mindspore/cell_processor.py +204 -33
msprobe/mindspore/code_mapping/graph_parser.py +4 -21
msprobe/mindspore/common/const.py +73 -2
msprobe/mindspore/common/utils.py +157 -29
msprobe/mindspore/compare/common_dir_compare.py +382 -0
msprobe/mindspore/compare/distributed_compare.py +2 -26
msprobe/mindspore/compare/ms_compare.py +18 -398
msprobe/mindspore/compare/ms_graph_compare.py +20 -10
msprobe/mindspore/compare/utils.py +37 -0
msprobe/mindspore/debugger/debugger_config.py +59 -7
msprobe/mindspore/debugger/precision_debugger.py +83 -90
msprobe/mindspore/dump/cell_dump_process.py +902 -0
msprobe/mindspore/dump/cell_dump_with_insert_gradient.py +889 -0
msprobe/mindspore/dump/dump_tool_factory.py +18 -8
msprobe/mindspore/dump/graph_mode_cell_dump.py +139 -0
msprobe/mindspore/dump/graph_tensor_dump.py +123 -0
msprobe/mindspore/dump/hook_cell/api_register.py +176 -0
msprobe/mindspore/dump/hook_cell/hook_cell.py +22 -12
msprobe/mindspore/dump/hook_cell/ms_hook_manager.py +88 -0
msprobe/mindspore/dump/hook_cell/primitive_hooks.py +8 -2
msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml +42 -26
msprobe/mindspore/dump/jit_dump.py +35 -27
msprobe/mindspore/dump/kernel_kbyk_dump.py +6 -3
msprobe/mindspore/dym_loader/hook_dynamic_loader.cpp +110 -0
msprobe/mindspore/dym_loader/hook_dynamic_loader.h +15 -16
msprobe/mindspore/free_benchmark/api_pynative_self_check.py +22 -12
msprobe/mindspore/free_benchmark/common/utils.py +1 -1
msprobe/mindspore/free_benchmark/perturbation/perturbation_factory.py +4 -2
msprobe/mindspore/free_benchmark/self_check_tool_factory.py +6 -3
msprobe/mindspore/grad_probe/global_context.py +9 -2
msprobe/mindspore/grad_probe/grad_analyzer.py +2 -1
msprobe/mindspore/grad_probe/grad_stat_csv.py +3 -2
msprobe/mindspore/grad_probe/hook.py +2 -4
msprobe/mindspore/mindspore_service.py +111 -0
msprobe/mindspore/monitor/common_func.py +52 -0
msprobe/mindspore/monitor/data_writers.py +237 -0
msprobe/mindspore/monitor/distributed/wrap_distributed.py +1 -1
msprobe/mindspore/monitor/features.py +13 -1
msprobe/mindspore/monitor/module_hook.py +568 -444
msprobe/mindspore/monitor/optimizer_collect.py +331 -0
msprobe/mindspore/monitor/utils.py +71 -9
msprobe/mindspore/ms_config.py +16 -15
msprobe/mindspore/overflow_check/overflow_check_tool_factory.py +5 -3
msprobe/mindspore/task_handler_factory.py +5 -2
msprobe/msprobe.py +19 -0
msprobe/nan_analyze/__init__.py +14 -0
msprobe/nan_analyze/analyzer.py +255 -0
msprobe/nan_analyze/graph.py +189 -0
msprobe/nan_analyze/utils.py +211 -0
msprobe/pytorch/api_accuracy_checker/common/config.py +2 -2
msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +3 -6
msprobe/pytorch/api_accuracy_checker/compare/compare.py +36 -34
msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +15 -13
msprobe/pytorch/api_accuracy_checker/generate_op_script/operator_replication.template +206 -4
msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +9 -9
msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +6 -5
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +31 -9
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py +28 -20
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py +3 -1
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +29 -13
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +12 -2
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +45 -31
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/utils.py +154 -0
msprobe/pytorch/attl_manager.py +65 -0
msprobe/pytorch/bench_functions/moe_gating_top_k_softmax.py +6 -0
msprobe/pytorch/bench_functions/npu_fusion_attention.py +27 -0
msprobe/pytorch/common/utils.py +53 -19
msprobe/pytorch/compare/distributed_compare.py +4 -36
msprobe/pytorch/compare/pt_compare.py +13 -84
msprobe/pytorch/compare/utils.py +47 -0
msprobe/pytorch/debugger/debugger_config.py +34 -17
msprobe/pytorch/debugger/precision_debugger.py +50 -96
msprobe/pytorch/dump/module_dump/hook_wrapper.py +93 -0
msprobe/pytorch/dump/module_dump/module_dump.py +15 -61
msprobe/pytorch/dump/module_dump/module_processer.py +150 -114
msprobe/pytorch/free_benchmark/common/utils.py +1 -1
msprobe/pytorch/free_benchmark/compare/single_benchmark.py +1 -1
msprobe/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py +3 -3
msprobe/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py +3 -3
msprobe/pytorch/free_benchmark/perturbed_layers/npu/change_value.py +1 -1
msprobe/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py +1 -1
msprobe/pytorch/free_benchmark/result_handlers/check_handler.py +1 -1
msprobe/pytorch/function_factory.py +1 -1
msprobe/pytorch/grad_probe/grad_monitor.py +2 -2
msprobe/pytorch/grad_probe/grad_stat_csv.py +3 -2
msprobe/pytorch/hook_module/api_register.py +155 -0
msprobe/pytorch/hook_module/hook_module.py +18 -22
msprobe/pytorch/hook_module/jit_script_wrapper.py +33 -0
msprobe/pytorch/hook_module/pt_hook_manager.py +68 -0
msprobe/pytorch/hook_module/register_optimizer_hook.py +2 -1
msprobe/pytorch/hook_module/support_wrap_ops.yaml +193 -75
msprobe/pytorch/hook_module/utils.py +28 -2
msprobe/pytorch/monitor/csv2tb.py +14 -4
msprobe/pytorch/monitor/data_writers.py +259 -0
msprobe/pytorch/monitor/distributed/wrap_distributed.py +8 -2
msprobe/pytorch/monitor/module_hook.py +336 -241
msprobe/pytorch/monitor/module_metric.py +17 -0
msprobe/pytorch/monitor/optimizer_collect.py +244 -224
msprobe/pytorch/monitor/utils.py +84 -4
msprobe/pytorch/online_dispatch/compare.py +0 -2
msprobe/pytorch/online_dispatch/dispatch.py +13 -2
msprobe/pytorch/online_dispatch/dump_compare.py +8 -2
msprobe/pytorch/online_dispatch/utils.py +3 -0
msprobe/pytorch/parse_tool/lib/interactive_cli.py +1 -6
msprobe/pytorch/parse_tool/lib/utils.py +5 -4
msprobe/pytorch/pt_config.py +16 -11
msprobe/pytorch/pytorch_service.py +70 -0
msprobe/visualization/builder/graph_builder.py +69 -10
msprobe/visualization/builder/msprobe_adapter.py +24 -12
msprobe/visualization/compare/graph_comparator.py +63 -51
msprobe/visualization/compare/mode_adapter.py +22 -20
msprobe/visualization/graph/base_node.py +11 -4
msprobe/visualization/graph/distributed_analyzer.py +1 -10
msprobe/visualization/graph/graph.py +2 -13
msprobe/visualization/graph/node_op.py +1 -2
msprobe/visualization/graph_service.py +251 -104
msprobe/visualization/utils.py +26 -44
msprobe/mindspore/dump/hook_cell/api_registry.py +0 -207
msprobe/mindspore/dump/hook_cell/wrap_api.py +0 -212
msprobe/mindspore/dym_loader/hook_dynamic_loader.cc +0 -140
msprobe/mindspore/monitor/anomaly_detect.py +0 -404
msprobe/mindspore/monitor/module_spec_verifier.py +0 -94
msprobe/mindspore/service.py +0 -543
msprobe/pytorch/hook_module/api_registry.py +0 -166
msprobe/pytorch/hook_module/wrap_distributed.py +0 -79
msprobe/pytorch/hook_module/wrap_functional.py +0 -66
msprobe/pytorch/hook_module/wrap_npu_custom.py +0 -85
msprobe/pytorch/hook_module/wrap_tensor.py +0 -69
msprobe/pytorch/hook_module/wrap_torch.py +0 -84
msprobe/pytorch/hook_module/wrap_vf.py +0 -60
msprobe/pytorch/monitor/anomaly_analyse.py +0 -201
msprobe/pytorch/monitor/anomaly_detect.py +0 -410
msprobe/pytorch/monitor/module_spec_verifier.py +0 -95
msprobe/pytorch/monitor/unittest/test_monitor.py +0 -160
msprobe/pytorch/service.py +0 -470
{mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/LICENSE +0 -0
{mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/WHEEL +0 -0
{mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/entry_points.txt +0 -0
{mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/top_level.txt +0 -0
/msprobe/{mindspore → core}/compare/ms_to_pt_api.yaml +0 -0
/msprobe/{mindspore/dump → core}/kernel_dump/kernel_config.py +0 -0
/msprobe/{pytorch/monitor/unittest → core/monitor}/__init__.py +0 -0

msprobe/core/config_check/utils/hyperparameter_parser.py ADDED Viewed

@@ -0,0 +1,115 @@
+# Copyright (c) 2025-2025, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import re
+from abc import ABC, abstractmethod
+from msprobe.core.config_check.utils.utils import config_checking_print
+from msprobe.core.common.file_utils import FileOpen, load_yaml
+from msprobe.core.common.const import Const, FileCheckConst
+class Parser(ABC):
+    @abstractmethod
+    def parse(self, file_path: str) -> dict:
+        pass
+    def run(self, file_path: str) -> dict:
+        """
+            统一对外调用接口
+        :param file_path: 需解析的文件路径
+        :return:
+        """
+        try:
+            result = self.parse(file_path)
+        except Exception as exc:
+            config_checking_print(f"{self.__class__} parsing error, skip file path: {file_path}, error: {exc}")
+            result = {}
+        return result
+class ShellParser(Parser):
+    def parse(self, file_path: str) -> dict:
+        """
+        Extracts arguments from bash script used to run a model training.
+        """
+        hyperparameters = {}
+        script_content_list = []
+        with FileOpen(file_path, 'r') as file:
+            for line in file:
+                stripped_line = line.lstrip()
+                if not stripped_line.startswith('#'):
+                    line = line.split('#')[0].rstrip() + '\n'
+                    if line.strip():
+                        script_content_list.append(line)
+        script_content = ''.join(script_content_list)
+        command_line = re.search(r'msrun\s[^|]*|torchrun\s[^|]*|python\d? -m torch.distributed.launch\s[^|]*',
+                                 script_content,
+                                 re.DOTALL)
+        if command_line:
+            command_line = command_line.group()
+            blocks = re.findall(r'([a-zA-Z0-9_]{1,20}_ARGS)="(.*?)"', script_content, re.DOTALL)
+            block_contents = {}
+            for block_name, block_content in blocks:
+                block_content = block_content.replace('\n', ' ')
+                block_contents[block_name] = block_content
+                command_line = command_line.replace(f"${block_name}", block_content)
+            matches = re.findall(r'--([\w-]+)(?:\s+([^\s\\]+))?', command_line)
+            for match in matches:
+                key, value = match
+                args_key = re.match(r'\$\{?(\w+)}?', value)
+                if args_key:
+                    env_vars = re.findall(rf'{args_key.group(1)}=\s*(.+)', script_content)
+                    if env_vars:
+                        value = env_vars[-1]
+                hyperparameters[key] = value if value else True
+        return hyperparameters
+class YamlParser(Parser):
+    hyperparameters = {}
+    def parse(self, file_path: str) -> dict:
+        ori_hyper = load_yaml(file_path)
+        self.recursive_parse_parameters(ori_hyper, "")
+        return self.hyperparameters
+    def recursive_parse_parameters(self, parameters, prefix):
+        if isinstance(parameters, dict):
+            for key, value in parameters.items():
+                new_prefix = prefix + Const.SEP + key if prefix else key
+                self.recursive_parse_parameters(value, new_prefix)
+        elif isinstance(parameters, list):
+            for value in parameters:
+                self.recursive_parse_parameters(value, prefix)
+        elif isinstance(parameters, (int, str, bool)):
+            self.hyperparameters.update({prefix: parameters})
+class ParserFactory:
+    __ParserDict = {
+        FileCheckConst.SHELL_SUFFIX: ShellParser(),
+        FileCheckConst.YAML_SUFFIX: YamlParser()
+    }
+    def get_parser(self, file_type: str) -> Parser:
+        parser = self.__ParserDict.get(file_type, None)
+        if not parser:
+            raise ValueError(f'Invalid parser type: {file_type}')
+        return parser

msprobe/core/config_check/utils/utils.py ADDED Viewed

@@ -0,0 +1,107 @@
+# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import re
+import hashlib
+from msprobe.core.common.framework_adapter import FmkAdp
+from msprobe.core.common.log import logger
+def merge_keys(dir_0, dir_1):
+    output_list = list(dir_0.keys())
+    output_list.extend(list(dir_1.keys()))
+    return set(output_list)
+def compare_dict(bench_dict, cmp_dict):
+    result = []
+    for key in set(bench_dict.keys()) | set(cmp_dict.keys()):
+        if key in bench_dict and key in cmp_dict:
+            if bench_dict[key] != cmp_dict[key]:
+                result.append(f"{key}: {bench_dict[key]} -> {cmp_dict[key]}")
+        elif key in bench_dict:
+            result.append(f"{key}: [deleted] -> {bench_dict[key]}")
+        else:
+            result.append(f"{key}: [added] -> {cmp_dict[key]}")
+    return result
+def config_checking_print(msg):
+    logger.info(f"[config checking log] {msg}")
+def tensor_to_hash(tensor):
+    """Compute the hash value of a tensor"""
+    tensor_bytes = tensor.clone().detach().cpu().numpy().tobytes()
+    return bytes_hash(tensor_bytes)
+def get_tensor_features(tensor):
+    features = {
+        "max": FmkAdp.tensor_max(tensor),
+        "min": FmkAdp.tensor_min(tensor),
+        "mean": FmkAdp.tensor_mean(tensor),
+        "norm": FmkAdp.tensor_norm(tensor),
+    }
+    return features
+def compare_dicts(dict1, dict2, path=''):
+    deleted = []
+    added = []
+    changed = []
+    result = {}
+    for key in dict1:
+        if key not in dict2:
+            deleted.append(f"[Deleted]: {path + key}")
+            result[key] = "[deleted]"
+        else:
+            if isinstance(dict1[key], dict) and isinstance(dict2[key], dict):
+                sub_deleted, sub_added, sub_changed, sub_result = compare_dicts(
+                    dict1[key], dict2[key], path + key + '/')
+                deleted.extend(sub_deleted)
+                added.extend(sub_added)
+                changed.extend(sub_changed)
+                if sub_result:
+                    result[key] = sub_result
+            elif dict1[key] != dict2[key]:
+                changed.append(f"[Changed]: {path + key} : {dict1[key]} -> {dict2[key]}")
+                result[key] = f"[changed]: {dict1[key]} -> {dict2[key]}"
+    for key in dict2:
+        if key not in dict1:
+            added.append(f"[Added]: {path + key}")
+            result[key] = "[added]"
+    return deleted, added, changed, result
+def bytes_hash(obj: bytes):
+    hex_dig = hashlib.sha256(obj).hexdigest()
+    short_hash = int(hex_dig, 16) % (2 ** 16)
+    return short_hash
+def update_dict(ori_dict, new_dict):
+    for key, value in new_dict.items():
+        if key in ori_dict and ori_dict[key] != value:
+            if "values" in ori_dict.keys():
+                ori_dict[key]["values"].append(new_dict[key])
+            else:
+                ori_dict[key] = {"description": "duplicate_value", "values": [ori_dict[key], new_dict[key]]}
+        else:
+            ori_dict[key] = value

msprobe/core/data_dump/api_registry.py ADDED Viewed

@@ -0,0 +1,239 @@
+# Copyright (c) 2025-2025, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import inspect
+from typing import Dict, Any, Optional, Callable, Union, List, Tuple
+from msprobe.core.common.const import Const
+from msprobe.core.common.file_utils import load_yaml
+from msprobe.core.common.log import logger
+def _get_attr(module, attr_name):
+    if Const.SEP in attr_name:
+        sub_module_name, sub_attr = attr_name.rsplit(Const.SEP, 1)
+        sub_module = getattr(module, sub_module_name, None)
+        attr = getattr(sub_module, sub_attr, None)
+    else:
+        attr = getattr(module, attr_name, None)
+    return attr
+class ApiWrapper:
+    """
+    Builds hook-enabled wrapper functions for the APIs named in the supported-API yaml files.
+    """
+    def __init__(
+        self, api_types: Dict[str, Dict[str, Any]],
+        api_list_paths: Union[str, List[str], Tuple[str]],
+        backlist: Union[List[str], Tuple[str]] = None
+    ):
+        """
+        :param api_types: mapping framework -> api type -> api modules; element 0 of the api
+            modules entry is the object the original APIs are looked up on
+        :param api_list_paths: a single yaml path used for every framework, or a list/tuple with
+            one path per framework (same order as ``api_types``)
+        :param backlist: entries in "<yaml key>.<api name>" form that must not be wrapped
+        :raises RuntimeError: when a list/tuple of paths does not have one entry per framework
+        """
+        self.api_types = api_types
+        if not isinstance(api_list_paths, (list, tuple)):
+            # A single path is reused for every framework.
+            api_list_paths = [api_list_paths] * len(self.api_types)
+        elif len(api_list_paths) != len(self.api_types):
+            raise RuntimeError("The number of api_list_paths must be equal to the number of frameworks in 'api_types', "
+                               "when api_list_paths is a list or tuple.")
+        self.api_list_paths = api_list_paths
+        self.backlist = backlist if backlist else []
+        self.api_names = self._get_api_names()
+        self.wrapped_api_functions = dict()
+    @staticmethod
+    def deal_with_self_kwargs(api_name, api_func, args, kwargs):
+        """
+        Move a ``self`` passed as keyword argument (and the parameters before it) into ``args``.
+        :param api_name: prefixed api name, used to look up a fallback in Const.API_WITH_SELF_ARG
+        :param api_func: the original api whose signature is inspected
+        :param args: positional arguments of the call
+        :param kwargs: keyword arguments of the call (entries moved to ``args`` are popped)
+        :return: tuple (enable_wrap, args, kwargs); enable_wrap is False when the signature is
+            unavailable or ``self`` is a keyword-only parameter
+        """
+        if kwargs and 'self' in kwargs:
+            func_params = None
+            try:
+                func_params = inspect.signature(api_func).parameters
+            except Exception:
+                # No introspectable signature: fall back to the entry registered for this api name.
+                if api_name in Const.API_WITH_SELF_ARG:
+                    func_params = inspect.signature(Const.API_WITH_SELF_ARG.get(api_name)).parameters
+            if func_params is None:
+                return False, args, kwargs
+            for name, param in func_params.items():
+                if name == 'self' and param.kind == inspect.Parameter.KEYWORD_ONLY:
+                    return False, args, kwargs
+            args_ = list(args)
+            names_and_values = []
+            self_index = 0
+            # Collect (name, default) for every parameter up to and including 'self'.
+            # NOTE(review): if the signature has no 'self' parameter, self_index stays 0 - confirm
+            # that case cannot occur for the wrapped APIs.
+            for i, item in enumerate(func_params.items()):
+                names_and_values.append((item[0], item[1].default))
+                if item[0] == 'self':
+                    self_index = i
+                    break
+            # Fill the positional slots not covered by ``args`` from kwargs, else from the default.
+            for i in range(len(args), self_index + 1):
+                if names_and_values[i][0] in kwargs:
+                    args_.append(kwargs.pop(names_and_values[i][0]))
+                else:
+                    args_.append(names_and_values[i][1])
+            args = tuple(args_)
+        return True, args, kwargs
+    def wrap_api(
+        self, api_templates, hook_build_func: Optional[Callable]
+    ):
+        """
+        Create a wrapper function for every valid api name.
+        :param api_templates: one template used for all api types, or a list/tuple with one
+            template per api type (counted over all frameworks, in iteration order)
+        :param hook_build_func: passed through to the template together with the api
+        :return: mapping framework -> api type -> api name -> wrapper function
+        :raises RuntimeError: when a list/tuple of templates does not have one entry per api type
+        """
+        api_types_num = sum([len(v) for v in self.api_types.values()])
+        if not isinstance(api_templates, (list, tuple)):
+            api_templates = [api_templates] * api_types_num
+        elif len(api_templates) != api_types_num:
+            raise RuntimeError("The number of api_templates must be equal to the number of api_types, "
+                               "when api_templates is a list or tuple.")
+        self.wrapped_api_functions.clear()
+        # Running position in api_templates; advanced once per (framework, api type) pair.
+        index = 0
+        for framework, api_types in self.api_types.items():
+            wrapped_functions_in_framework = dict()
+            for api_type, api_modules in api_types.items():
+                wrapped_functions = dict()
+                name_prefix = Const.API_DATA_PREFIX.get(framework, {}).get(api_type, "API")
+                api_template = api_templates[index]
+                index += 1
+                for api_name in self.api_names.get(framework, {}).get(api_type, []):
+                    ori_api = _get_attr(api_modules[0], api_name)
+                    if callable(ori_api):
+                        # Factory function: each wrapper gets its own api_name/api_func/prefix
+                        # as parameters instead of reading the loop variables.
+                        def wrap_api_func(api_name, api_func, prefix, hook_build_func, api_template):
+                            def api_function(*args, **kwargs):
+                                api_name_with_prefix = prefix + Const.SEP + str(api_name.split(Const.SEP)[-1])
+                                enable_wrap, args, kwargs = self.deal_with_self_kwargs(api_name_with_prefix,
+                                                                                       api_func, args, kwargs)
+                                if not enable_wrap:
+                                    # Call the original api without the template when 'self' cannot be handled.
+                                    logger.warning(f'Cannot collect precision data of {api_name_with_prefix}. '
+                                                   'It may be fixed by passing the value of "self" '
+                                                   'as a positional argument instead of a keyword argument. ')
+                                    return api_func(*args, **kwargs)
+                                return api_template(api_name, api_func, prefix, hook_build_func)(*args, **kwargs)
+                            api_function.__name__ = api_name
+                            return api_function
+                        wrapped_functions[api_name] = wrap_api_func(api_name, ori_api, name_prefix,
+                                                                    hook_build_func, api_template)
+                wrapped_functions_in_framework[api_type] = wrapped_functions
+            self.wrapped_api_functions[framework] = wrapped_functions_in_framework
+        return self.wrapped_api_functions
+    def _get_api_names(self):
+        """
+        Read the supported-API yaml of each framework and keep the names that can be wrapped.
+        :return: mapping framework -> api type -> set of api names that are not in the backlist
+            and exist on the target module
+        """
+        api_names = dict()
+        for index, framework in enumerate(self.api_types.keys()):
+            api_list = load_yaml(self.api_list_paths[index])
+            valid_names = dict()
+            for api_type, api_modules in self.api_types.get(framework, {}).items():
+                key_in_file = Const.SUPPORT_API_DICT_KEY_MAP.get(framework, {}).get(api_type)
+                api_from_file = api_list.get(key_in_file, [])
+                names = set()
+                for api_name in api_from_file:
+                    if f'{key_in_file}.{api_name}' in self.backlist:
+                        continue
+                    target_attr = api_name
+                    target_module = api_modules[0]
+                    # For a separated name, check the attribute on the sub-module named by the left part.
+                    if Const.SEP in api_name:
+                        sub_module_name, target_attr = api_name.rsplit(Const.SEP, 1)
+                        target_module = getattr(api_modules[0], sub_module_name, None)
+                    if target_module and target_attr in dir(target_module):
+                        names.add(api_name)
+                valid_names[api_type] = names
+            api_names[framework] = valid_names
+        return api_names
+class ApiRegistry:
+    """
+    Base class for api registry.
+    Keeps the original and the wrapped attribute of every supported api and switches the
+    target modules between the two with setattr.
+    """
+    def __init__(self, api_types, inner_used_api, supported_api_list_path, api_templates, backlist=None):
+        """
+        :param api_types: mapping framework -> api type -> api modules; element 0 is the object the
+            original APIs are read from, element 1 is iterated as the modules to patch
+        :param inner_used_api: mapping api type -> sequence whose element 0 is the object to patch
+            and whose remaining elements are api names
+        :param supported_api_list_path: yaml path(s) forwarded to ApiWrapper
+        :param api_templates: template(s) forwarded to ApiWrapper.wrap_api
+        :param backlist: api names excluded from wrapping, forwarded to ApiWrapper
+        """
+        # Keys of the first two dicts are "<framework><SEP><api type>".
+        self.ori_api_attr = dict()
+        self.wrapped_api_attr = dict()
+        self.inner_used_ori_attr = dict()
+        self.inner_used_wrapped_attr = dict()
+        self.api_types = api_types
+        self.inner_used_api = inner_used_api
+        self.supported_api_list_path = supported_api_list_path
+        self.api_templates = api_templates
+        self.backlist = backlist if backlist else []
+        self.all_api_registered = False
+    @staticmethod
+    def store_ori_attr(ori_api_group, api_list, api_ori_attr):
+        """Store the current attribute of every api in ``api_list`` into ``api_ori_attr``."""
+        for api in api_list:
+            api_ori_attr[api] = _get_attr(ori_api_group, api)
+    @staticmethod
+    def set_api_attr(api_group, attr_dict):
+        """
+        Assign every ``name -> attribute`` of ``attr_dict`` onto ``api_group``.
+        A name containing Const.SEP is set on the sub-module named by the part before the last
+        separator; it is skipped when that sub-module does not exist.
+        """
+        for api, api_attr in attr_dict.items():
+            if Const.SEP in api:
+                sub_module_name, sub_op = api.rsplit(Const.SEP, 1)
+                sub_module = getattr(api_group, sub_module_name, None)
+                if sub_module is not None:
+                    setattr(sub_module, sub_op, api_attr)
+            else:
+                setattr(api_group, api, api_attr)
+    @staticmethod
+    def register_custom_api(module, api_name, api_prefix, hook_build_func, api_template):
+        """Replace ``module.<api_name>`` with a wrapper that calls it through ``api_template``."""
+        def wrap_api_func(api_name, api_func, prefix, hook_build_func, api_template):
+            def api_function(*args, **kwargs):
+                return api_template(api_name, api_func, prefix, hook_build_func)(*args, **kwargs)
+            api_function.__name__ = api_name
+            return api_function
+        setattr(module, api_name,
+                wrap_api_func(api_name, getattr(module, api_name), api_prefix, hook_build_func, api_template))
+    def register_all_api(self):
+        """Install the wrapped attributes on every module to patch and mark the APIs as registered."""
+        self.all_api_registered = True
+        for framework, api_types in self.api_types.items():
+            for api_type, api_modules in api_types.items():
+                api_type_with_framework = framework + Const.SEP + api_type
+                for module in api_modules[1]:
+                    self.set_api_attr(module, self.wrapped_api_attr.get(api_type_with_framework, {}))
+    def register_inner_used_api(self):
+        """Install the wrapped attributes of the inner-used APIs on their target object."""
+        for api_type in self.inner_used_api.keys():
+            self.set_api_attr(self.inner_used_api.get(api_type)[0], self.inner_used_wrapped_attr.get(api_type, {}))
+    def restore_all_api(self):
+        """Put the stored original attributes back on every patched module and clear the registered flag."""
+        self.all_api_registered = False
+        for framework, api_types in self.api_types.items():
+            for api_type, api_modules in api_types.items():
+                api_type_with_framework = framework + Const.SEP + api_type
+                for module in api_modules[1]:
+                    self.set_api_attr(module, self.ori_api_attr.get(api_type_with_framework, {}))
+    def restore_inner_used_api(self):
+        """Put the stored original attributes of the inner-used APIs back on their target object."""
+        for api_type in self.inner_used_api.keys():
+            self.set_api_attr(self.inner_used_api.get(api_type)[0], self.inner_used_ori_attr.get(api_type, {}))
+    def initialize_hook(self, hook_build_func):
+        """
+        Build the wrapped APIs and fill the original/wrapped attribute tables.
+        Nothing is patched here; the register_* methods install the attributes.
+        :param hook_build_func: forwarded to ApiWrapper.wrap_api
+        """
+        api_wrapper = ApiWrapper(self.api_types, self.supported_api_list_path, self.backlist)
+        wrapped_api_functions = api_wrapper.wrap_api(self.api_templates, hook_build_func)
+        for framework, api_types in self.api_types.items():
+            for api_type, api_modules in api_types.items():
+                ori_attr = dict()
+                self.store_ori_attr(api_modules[0], api_wrapper.api_names.get(framework).get(api_type), ori_attr)
+                api_type_with_framework = framework + Const.SEP + api_type
+                self.ori_api_attr[api_type_with_framework] = ori_attr
+                self.wrapped_api_attr[api_type_with_framework] = wrapped_api_functions.get(framework).get(api_type)
+        # Inner-used APIs reuse the entries collected above.
+        # NOTE(review): the keys of inner_used_api are looked up in ori_api_attr, so they are
+        # presumably in "<framework><SEP><api type>" form - confirm against the callers.
+        for inner_used_api_type, inner_used_api_list in self.inner_used_api.items():
+            ori_attr = dict()
+            wrapped_attr = dict()
+            for api_name in inner_used_api_list[1:]:
+                if self.ori_api_attr.get(inner_used_api_type, {}).get(api_name):
+                    ori_attr[api_name] = self.ori_api_attr.get(inner_used_api_type).get(api_name)
+                    wrapped_attr[api_name] = self.wrapped_api_attr.get(inner_used_api_type).get(api_name)
+            self.inner_used_ori_attr[inner_used_api_type] = ori_attr
+            self.inner_used_wrapped_attr[inner_used_api_type] = wrapped_attr

msprobe/core/data_dump/data_collector.py CHANGED Viewed

@@ -41,7 +41,7 @@ class DataCollector:
         self.backward_module_names = {}
         self.optimizer_status = ""
         self.optimizer_status_first_start = {Const.OPTIMIZER: True, Const.CLIP_GRAD: True}
-        atexit.register(self.write_json)
+        atexit.register(self.write_json_at_exit)
     @property
     def dump_data_dir(self):
@@ -78,6 +78,11 @@ class DataCollector:
     def write_json(self):
         self.data_writer.write_json()
+    def write_json_at_exit(self):
+        if self.config.async_dump and self.config.task == Const.TENSOR:
+            self.data_processor.dump_async_data()
+        self.data_writer.write_json()
     def update_data(self, name, data_info):
         msg = f"msprobe is collecting data on {name}."
         if self.config.task == Const.OVERFLOW_CHECK:
@@ -89,6 +94,10 @@ class DataCollector:
         logger.debug(msg)
         self.data_writer.update_data(data_info)
+    def call_stack_collect(self, name):
+        stack_info = self.data_processor.analyze_api_call_stack(name)
+        self.data_writer.update_stack(name, stack_info)
     def forward_input_data_collect(self, name, module, pid, module_input_output, is_recompute=None):
         if self.config.task == Const.FREE_BENCHMARK:
             backward_name = name.replace(Const.FORWARD, Const.BACKWARD)
@@ -118,9 +127,16 @@ class DataCollector:
         self.set_is_recomputable(data_info, is_recompute)
         if self.config.level == Const.LEVEL_L2:
             return
-        self.data_writer.update_stack(self.data_processor.analyze_api_call_stack(name))
+        self.call_stack_collect(name)
         self.handle_data(name, data_info, flush=self.data_processor.is_terminated)
+    def forward_data_collect_only_tensor(self, name, module, pid, module_input_output):
+        if not self.check_scope_and_pid(self.scope, name, pid):
+            return
+        self.data_processor.analyze_forward(name, module, module_input_output)
     def forward_data_collect(self, name, module, pid, module_input_output, is_recompute=None):
         self.update_construct(name)
         if not self.check_scope_and_pid(self.scope, name, pid):
@@ -130,9 +146,15 @@ class DataCollector:
         if self.config.task != Const.STRUCTURE:
             data_info = self.data_processor.analyze_forward(name, module, module_input_output)
         self.set_is_recomputable(data_info, is_recompute)
-        self.data_writer.update_stack(self.data_processor.analyze_api_call_stack(name))
+        self.call_stack_collect(name)
         self.handle_data(name, data_info, flush=self.data_processor.is_terminated)
+    def backward_data_collect_only_tensor(self, name, module, pid, module_input_output, is_recompute=None):
+        if not self.check_scope_and_pid(self.scope, name, pid):
+            return
+        self.data_processor.analyze_backward(name, module, module_input_output)
     def backward_data_collect(self, name, module, pid, module_input_output, is_recompute=None):
         self.update_construct(name)
         if not self.check_scope_and_pid(self.scope, name, pid):
@@ -180,7 +202,10 @@ class DataCollector:
                     self.optimizer_status_first_start[self.optimizer_status] = False
                 self.data_writer.update_construct({name: self.optimizer_status})
             else:
-                self.data_writer.update_construct({name: self.module_processor.api_parent_node})
+                if self.config.level == Const.LEVEL_MIX and \
+                  not (name.startswith(Const.MODULE) or name.startswith(Const.CELL)):
+                    self.data_writer.update_construct({name: self.module_processor.api_parent_node})
             self.data_writer.update_construct(self.module_processor.module_node)
     def handle_data(self, name, data_info, flush=False):
@@ -204,6 +229,7 @@ class DataCollector:
     def params_data_collect(self, name, param_name, pid, data):
         grad_name = name + Const.SEP + Const.PARAMS_GRAD
+        self.update_api_or_module_name(grad_name)
         # 校验scope和pid，以及当前name是否有过反向计算
         if not self.check_scope_and_pid(self.scope, name, pid) and not self.backward_module_names.get(name):
             # 如果没有反向计算，则需要清除之前占位写入的grad数据
@@ -213,18 +239,19 @@ class DataCollector:
         data_info = self.data_processor.analyze_params(grad_name, param_name, data)
         self.handle_data(grad_name, data_info, flush=self.data_processor.is_terminated)
-    def fill_stack_tensor_data(self):
-        self.data_writer.fill_stack_tensor_data()
     def debug_data_collect_forward(self, variable, name_with_count):
         data_info = self.data_processor.analyze_debug_forward(variable, name_with_count)
-        self.data_writer.update_debug({name_with_count: data_info})
+        name_with_count_category = name_with_count + Const.SEP + Const.DEBUG
+        self.data_writer.update_debug({name_with_count_category: data_info})
     def debug_data_collect_backward(self, variable, grad_name_with_count):
         # prepare all None nested data structure
         all_none_data_info = self.data_processor.analyze_element_to_all_none(variable)
-        self.data_writer.update_debug({grad_name_with_count: all_none_data_info})
+        grad_name_with_count_category = grad_name_with_count + Const.SEP + Const.DEBUG
+        self.data_writer.update_debug({grad_name_with_count_category: all_none_data_info})
         # register tensor backward hook
-        self.data_processor.analyze_debug_backward(variable, grad_name_with_count, self.data_writer.cache_debug['data'])
+        self.data_processor.analyze_debug_backward(variable, grad_name_with_count_category,
+                                                   self.data_writer.cache_debug['data'])

mindstudio-probe 1.2.2__py3-none-any.whl → 8.1.0__py3-none-any.whl

mindstudio-probe 1.2.2__py3-none-any.whl → 8.1.0__py3-none-any.whl