PyPI - mindstudio-probe - Versions diffs - 1.1.0__py3-none-any.whl → 1.2.1__py3-none-any.whl - Mend

mindstudio-probe 1.1.0-py3-none-any.whl → 1.2.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (299) hide show

{mindstudio_probe-1.1.0.dist-info → mindstudio_probe-1.2.1.dist-info}/METADATA +7 -6
mindstudio_probe-1.2.1.dist-info/RECORD +396 -0
{mindstudio_probe-1.1.0.dist-info → mindstudio_probe-1.2.1.dist-info}/WHEEL +1 -1
{mindstudio_probe-1.1.0.dist-info → mindstudio_probe-1.2.1.dist-info}/entry_points.txt +0 -1
msprobe/CMakeLists.txt +5 -0
msprobe/README.md +51 -20
msprobe/config.json +2 -3
msprobe/core/advisor/advisor.py +8 -3
msprobe/core/common/const.py +264 -15
msprobe/core/common/exceptions.py +27 -3
msprobe/core/common/file_utils.py +176 -26
msprobe/core/common/inplace_op_checker.py +15 -0
msprobe/core/common/inplace_ops.yaml +3 -0
msprobe/core/common/log.py +27 -9
msprobe/core/common/utils.py +204 -77
msprobe/core/common_config.py +49 -14
msprobe/core/compare/acc_compare.py +274 -198
msprobe/core/compare/check.py +32 -33
msprobe/core/compare/compare_cli.py +32 -14
msprobe/core/compare/highlight.py +283 -127
msprobe/core/compare/layer_mapping/__init__.py +19 -0
msprobe/core/compare/layer_mapping/data_scope_parser.py +246 -0
msprobe/core/compare/layer_mapping/layer_mapping.py +249 -0
msprobe/core/compare/layer_mapping/postprocess_pass.py +95 -0
msprobe/core/compare/merge_result/merge_result.py +380 -0
msprobe/core/compare/merge_result/merge_result_cli.py +31 -0
msprobe/core/compare/multiprocessing_compute.py +2 -2
msprobe/core/compare/npy_compare.py +135 -144
msprobe/core/compare/utils.py +419 -274
msprobe/core/data_dump/data_collector.py +60 -28
msprobe/core/data_dump/data_processor/base.py +84 -36
msprobe/core/data_dump/data_processor/factory.py +5 -3
msprobe/core/data_dump/data_processor/mindspore_processor.py +152 -18
msprobe/core/data_dump/data_processor/pytorch_processor.py +267 -110
msprobe/core/data_dump/json_writer.py +29 -1
msprobe/core/data_dump/scope.py +119 -39
msprobe/core/grad_probe/constant.py +27 -13
msprobe/core/grad_probe/grad_compare.py +18 -1
msprobe/core/grad_probe/utils.py +30 -2
msprobe/core/overflow_check/abnormal_scene.py +189 -0
msprobe/core/overflow_check/api_info.py +55 -0
msprobe/core/overflow_check/checker.py +138 -0
msprobe/core/overflow_check/filter.py +157 -0
msprobe/core/overflow_check/ignore_rules.yaml +55 -0
msprobe/core/overflow_check/level.py +22 -0
msprobe/core/overflow_check/utils.py +28 -0
msprobe/docs/01.installation.md +96 -7
msprobe/docs/02.config_introduction.md +50 -23
msprobe/docs/03.config_examples.md +2 -9
msprobe/docs/04.kernel_dump_PyTorch.md +73 -0
msprobe/docs/05.data_dump_PyTorch.md +93 -61
msprobe/docs/06.data_dump_MindSpore.md +200 -95
msprobe/docs/07.accuracy_checker_PyTorch.md +28 -28
msprobe/docs/08.accuracy_checker_online_PyTorch.md +1 -6
msprobe/docs/09.accuracy_checker_MindSpore.md +44 -8
msprobe/docs/10.accuracy_compare_PyTorch.md +114 -50
msprobe/docs/11.accuracy_compare_MindSpore.md +340 -48
msprobe/docs/12.overflow_check_PyTorch.md +2 -2
msprobe/docs/13.overflow_check_MindSpore.md +6 -6
msprobe/docs/15.free_benchmarking_PyTorch.md +4 -5
msprobe/docs/16.free_benchmarking_MindSpore.md +56 -37
msprobe/docs/17.grad_probe.md +5 -6
msprobe/docs/19.monitor.md +561 -0
msprobe/docs/20.monitor_performance_baseline.md +52 -0
msprobe/docs/21.visualization_PyTorch.md +466 -0
msprobe/docs/22.visualization_MindSpore.md +481 -0
msprobe/docs/23.generate_operator_PyTorch.md +107 -0
msprobe/docs/24.code_mapping_Mindspore.md +28 -0
msprobe/docs/25.tool_function_introduction.md +29 -0
msprobe/docs/26.data_dump_PyTorch_baseline.md +37 -0
msprobe/docs/27.dump_json_instruction.md +521 -0
msprobe/docs/FAQ.md +29 -2
msprobe/docs/accuracy_checker_MindSpore/accuracy_checker_MindSpore_baseline.md +14 -0
msprobe/docs/data_dump_MindSpore/data_dump_MindSpore_baseline.md +22 -0
msprobe/docs/data_dump_MindSpore/dynamic_graph_quick_start_example.md +211 -0
msprobe/docs/img/compare_result.png +0 -0
msprobe/docs/img/merge_result.png +0 -0
msprobe/docs/img/monitor/cpu_info.png +0 -0
msprobe/docs/img/visualization/fuzzy_match_ms.png +0 -0
msprobe/docs/img/visualization/fuzzy_match_pt.png +0 -0
msprobe/docs/img/visualization/tensorboard_1.png +0 -0
msprobe/docs/img/visualization/tensorboard_2.png +0 -0
msprobe/docs/img/visualization/vis_browser_1.png +0 -0
msprobe/docs/img/visualization/vis_browser_2.png +0 -0
msprobe/docs/img/visualization/vis_precision_info.png +0 -0
msprobe/docs/img/visualization/vis_search_info.png +0 -0
msprobe/docs/img/visualization/vis_show_info.png +0 -0
msprobe/docs/img/visualization/vis_showcase.png +0 -0
msprobe/docs/img/visualization/vis_unmatch_info.png +0 -0
msprobe/docs/visualization/GPTModel.png +0 -0
msprobe/docs/visualization/ParallelMLP.png +0 -0
msprobe/docs/visualization/layer_mapping_example.md +132 -0
msprobe/docs/visualization/mapping.png +0 -0
msprobe/docs/visualization/mapping1.png +0 -0
msprobe/docs/visualization/module_name.png +0 -0
msprobe/docs/visualization/module_name1.png +0 -0
msprobe/docs/visualization/no_mapping.png +0 -0
msprobe/docs/visualization/no_mapping1.png +0 -0
msprobe/docs/visualization/no_mapping_analyze.png +0 -0
msprobe/docs/visualization/top_layer.png +0 -0
msprobe/mindspore/__init__.py +25 -0
msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +151 -151
msprobe/mindspore/api_accuracy_checker/api_info.py +21 -6
msprobe/mindspore/api_accuracy_checker/api_runner.py +43 -18
msprobe/mindspore/api_accuracy_checker/base_compare_algorithm.py +21 -7
msprobe/mindspore/api_accuracy_checker/checker_support_api.yaml +77 -0
msprobe/mindspore/api_accuracy_checker/cmd_parser.py +64 -1
msprobe/mindspore/api_accuracy_checker/compute_element.py +64 -31
msprobe/mindspore/api_accuracy_checker/data_manager.py +301 -0
msprobe/mindspore/api_accuracy_checker/main.py +28 -3
msprobe/mindspore/api_accuracy_checker/multi_api_accuracy_checker.py +212 -0
msprobe/mindspore/api_accuracy_checker/multi_data_manager.py +60 -0
msprobe/mindspore/api_accuracy_checker/type_mapping.py +22 -5
msprobe/mindspore/api_accuracy_checker/utils.py +34 -17
msprobe/mindspore/cell_processor.py +33 -12
msprobe/mindspore/code_mapping/bind.py +264 -0
msprobe/mindspore/code_mapping/cmd_parser.py +40 -0
msprobe/mindspore/code_mapping/graph.py +49 -0
msprobe/mindspore/code_mapping/graph_parser.py +226 -0
msprobe/mindspore/code_mapping/main.py +24 -0
msprobe/mindspore/code_mapping/processor.py +34 -0
msprobe/mindspore/common/const.py +35 -13
msprobe/mindspore/common/log.py +5 -9
msprobe/mindspore/common/utils.py +88 -4
msprobe/mindspore/compare/distributed_compare.py +22 -24
msprobe/mindspore/compare/ms_compare.py +333 -268
msprobe/mindspore/compare/ms_graph_compare.py +95 -52
msprobe/mindspore/debugger/debugger_config.py +7 -1
msprobe/mindspore/debugger/precision_debugger.py +87 -12
msprobe/mindspore/dump/dump_tool_factory.py +3 -1
msprobe/mindspore/dump/hook_cell/api_registry.py +95 -18
msprobe/mindspore/dump/hook_cell/hook_cell.py +60 -38
msprobe/mindspore/dump/hook_cell/primitive_hooks.py +45 -30
msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml +36 -1
msprobe/mindspore/dump/hook_cell/wrap_api.py +92 -1
msprobe/mindspore/dump/jit_dump.py +17 -5
msprobe/mindspore/dump/kernel_dump/kernel_config.py +33 -0
msprobe/mindspore/dump/kernel_graph_dump.py +9 -4
msprobe/mindspore/dump/kernel_kbyk_dump.py +2 -4
msprobe/mindspore/dym_loader/hook_dynamic_loader.cc +140 -0
msprobe/mindspore/dym_loader/hook_dynamic_loader.h +53 -0
msprobe/mindspore/free_benchmark/api_pynative_self_check.py +156 -41
msprobe/mindspore/free_benchmark/common/handler_params.py +1 -2
msprobe/mindspore/free_benchmark/common/utils.py +19 -4
msprobe/mindspore/free_benchmark/data/support_wrap_ops.yaml +0 -204
msprobe/mindspore/free_benchmark/handler/base_handler.py +3 -3
msprobe/mindspore/free_benchmark/handler/check_handler.py +4 -5
msprobe/mindspore/free_benchmark/handler/fix_handler.py +4 -4
msprobe/mindspore/free_benchmark/handler/handler_factory.py +4 -4
msprobe/mindspore/free_benchmark/perturbation/add_noise.py +2 -2
msprobe/mindspore/free_benchmark/perturbation/base_perturbation.py +15 -6
msprobe/mindspore/free_benchmark/perturbation/bit_noise.py +2 -2
msprobe/mindspore/free_benchmark/perturbation/exchange_value.py +2 -2
msprobe/mindspore/free_benchmark/perturbation/improve_precision.py +13 -6
msprobe/mindspore/free_benchmark/perturbation/perturbation_factory.py +2 -2
msprobe/mindspore/free_benchmark/self_check_tool_factory.py +2 -2
msprobe/mindspore/grad_probe/global_context.py +28 -8
msprobe/mindspore/grad_probe/grad_analyzer.py +50 -24
msprobe/mindspore/grad_probe/grad_monitor.py +16 -1
msprobe/mindspore/grad_probe/grad_stat_csv.py +33 -5
msprobe/mindspore/grad_probe/hook.py +35 -12
msprobe/mindspore/grad_probe/utils.py +18 -5
msprobe/mindspore/mindtorch/__init__.py +18 -0
msprobe/mindspore/mindtorch/mindtorch_adaptor.py +255 -0
msprobe/mindspore/ms_config.py +27 -16
msprobe/mindspore/overflow_check/kernel_graph_overflow_check.py +9 -4
msprobe/mindspore/runtime.py +15 -0
msprobe/mindspore/service.py +285 -113
msprobe/mindspore/task_handler_factory.py +15 -0
msprobe/msprobe.py +48 -10
msprobe/pytorch/__init__.py +8 -6
msprobe/pytorch/api_accuracy_checker/common/config.py +62 -0
msprobe/pytorch/api_accuracy_checker/common/utils.py +31 -16
msprobe/pytorch/api_accuracy_checker/compare/algorithm.py +41 -8
msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +103 -271
msprobe/pytorch/api_accuracy_checker/compare/api_precision_standard.yaml +4 -1
msprobe/pytorch/api_accuracy_checker/compare/compare.py +69 -68
msprobe/pytorch/api_accuracy_checker/compare/compare_column.py +54 -0
msprobe/pytorch/api_accuracy_checker/compare/compare_input.py +51 -0
msprobe/pytorch/api_accuracy_checker/compare/compare_utils.py +2 -4
msprobe/pytorch/api_accuracy_checker/generate_op_script/config_op.json +9 -0
msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +478 -0
msprobe/pytorch/api_accuracy_checker/generate_op_script/operator_replication.template +365 -0
msprobe/pytorch/api_accuracy_checker/precision_standard/absolute_threshold.py +106 -0
msprobe/pytorch/api_accuracy_checker/precision_standard/accumulative_error_compare.py +107 -0
msprobe/pytorch/api_accuracy_checker/precision_standard/base_standard.py +151 -0
msprobe/pytorch/api_accuracy_checker/precision_standard/benchmark_compare.py +226 -0
msprobe/pytorch/api_accuracy_checker/precision_standard/binary_consistency.py +68 -0
msprobe/pytorch/api_accuracy_checker/precision_standard/standard_config.py +218 -0
msprobe/pytorch/api_accuracy_checker/precision_standard/standard_register.py +104 -0
msprobe/pytorch/api_accuracy_checker/precision_standard/thousandth_standard.py +63 -0
msprobe/pytorch/api_accuracy_checker/precision_standard/ulp_compare.py +200 -0
msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py +63 -2
msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +21 -15
msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +54 -22
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +140 -71
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py +49 -8
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py +9 -24
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +4 -12
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +5 -3
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/dump_dispatch.py +9 -4
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +3 -11
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/utils.py +2 -2
msprobe/pytorch/bench_functions/confusion_transpose.py +5 -1
msprobe/pytorch/bench_functions/matmul_backward.py +12 -0
msprobe/pytorch/bench_functions/npu_fusion_attention.py +142 -16
msprobe/pytorch/bench_functions/rotary_mul.py +4 -0
msprobe/pytorch/bench_functions/swiglu.py +10 -2
msprobe/pytorch/common/parse_json.py +7 -6
msprobe/pytorch/common/utils.py +101 -7
msprobe/pytorch/compare/distributed_compare.py +17 -30
msprobe/pytorch/compare/pt_compare.py +44 -22
msprobe/pytorch/debugger/debugger_config.py +46 -27
msprobe/pytorch/debugger/precision_debugger.py +42 -12
msprobe/pytorch/dump/kernel_dump/kernel_config.py +33 -0
msprobe/pytorch/dump/module_dump/module_dump.py +86 -0
msprobe/pytorch/{module_processer.py → dump/module_dump/module_processer.py} +81 -10
msprobe/pytorch/free_benchmark/common/constant.py +15 -0
msprobe/pytorch/free_benchmark/common/counter.py +15 -0
msprobe/pytorch/free_benchmark/common/enums.py +15 -0
msprobe/pytorch/free_benchmark/common/params.py +10 -2
msprobe/pytorch/free_benchmark/common/utils.py +29 -4
msprobe/pytorch/free_benchmark/compare/grad_saver.py +20 -5
msprobe/pytorch/free_benchmark/compare/single_benchmark.py +2 -0
msprobe/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py +3 -1
msprobe/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py +6 -4
msprobe/pytorch/free_benchmark/perturbed_layers/npu/change_value.py +2 -0
msprobe/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py +4 -0
msprobe/pytorch/free_benchmark/result_handlers/base_handler.py +41 -47
msprobe/pytorch/free_benchmark/result_handlers/fix_handler.py +6 -5
msprobe/pytorch/free_benchmark/result_handlers/preheat_handler.py +0 -4
msprobe/pytorch/grad_probe/grad_monitor.py +23 -6
msprobe/pytorch/grad_probe/grad_stat_csv.py +40 -10
msprobe/pytorch/hook_module/__init__.py +1 -1
msprobe/pytorch/hook_module/hook_module.py +14 -11
msprobe/pytorch/hook_module/register_optimizer_hook.py +59 -0
msprobe/pytorch/hook_module/support_wrap_ops.yaml +35 -0
msprobe/pytorch/hook_module/wrap_distributed.py +6 -8
msprobe/pytorch/hook_module/wrap_functional.py +0 -38
msprobe/pytorch/monitor/__init__.py +0 -0
msprobe/pytorch/monitor/anomaly_analyse.py +201 -0
msprobe/pytorch/monitor/anomaly_detect.py +425 -0
msprobe/pytorch/monitor/csv2tb.py +166 -0
msprobe/pytorch/monitor/distributed/__init__.py +0 -0
msprobe/pytorch/monitor/distributed/distributed_ops.yaml +19 -0
msprobe/pytorch/monitor/distributed/stack_blacklist.yaml +5 -0
msprobe/pytorch/monitor/distributed/wrap_distributed.py +283 -0
msprobe/pytorch/monitor/features.py +108 -0
msprobe/pytorch/monitor/module_hook.py +1076 -0
msprobe/pytorch/monitor/module_metric.py +172 -0
msprobe/pytorch/monitor/module_spec_verifier.py +95 -0
msprobe/pytorch/monitor/optimizer_collect.py +333 -0
msprobe/pytorch/monitor/unittest/__init__.py +0 -0
msprobe/pytorch/monitor/unittest/test_monitor.py +160 -0
msprobe/pytorch/monitor/utils.py +321 -0
msprobe/pytorch/monitor/visualizer.py +59 -0
msprobe/pytorch/online_dispatch/__init__.py +2 -3
msprobe/pytorch/online_dispatch/compare.py +29 -38
msprobe/pytorch/online_dispatch/dispatch.py +58 -27
msprobe/pytorch/online_dispatch/dump_compare.py +21 -9
msprobe/pytorch/online_dispatch/single_compare.py +53 -32
msprobe/pytorch/online_dispatch/torch_ops_config.yaml +1 -1
msprobe/pytorch/online_dispatch/utils.py +49 -21
msprobe/pytorch/parse_tool/lib/compare.py +21 -27
msprobe/pytorch/parse_tool/lib/config.py +6 -8
msprobe/pytorch/parse_tool/lib/file_desc.py +15 -1
msprobe/pytorch/parse_tool/lib/interactive_cli.py +10 -10
msprobe/pytorch/parse_tool/lib/parse_exception.py +7 -7
msprobe/pytorch/parse_tool/lib/parse_tool.py +12 -12
msprobe/pytorch/parse_tool/lib/utils.py +33 -53
msprobe/pytorch/parse_tool/lib/visualization.py +11 -10
msprobe/pytorch/pt_config.py +31 -8
msprobe/pytorch/service.py +188 -108
msprobe/visualization/__init__.py +14 -0
msprobe/visualization/builder/__init__.py +14 -0
msprobe/visualization/builder/graph_builder.py +222 -0
msprobe/visualization/builder/msprobe_adapter.py +227 -0
msprobe/visualization/compare/__init__.py +14 -0
msprobe/visualization/compare/graph_comparator.py +180 -0
msprobe/visualization/compare/mode_adapter.py +197 -0
msprobe/visualization/graph/__init__.py +14 -0
msprobe/visualization/graph/base_node.py +119 -0
msprobe/visualization/graph/distributed_analyzer.py +318 -0
msprobe/visualization/graph/graph.py +209 -0
msprobe/visualization/graph/node_colors.py +95 -0
msprobe/visualization/graph/node_op.py +39 -0
msprobe/visualization/graph_service.py +288 -0
msprobe/visualization/utils.py +217 -0
mindstudio_probe-1.1.0.dist-info/RECORD +0 -287
msprobe/docs/04.acl_config_examples.md +0 -78
msprobe/mindspore/compare/layer_mapping.py +0 -146
msprobe/mindspore/compare/modify_mapping.py +0 -107
msprobe/mindspore/free_benchmark/decorator/dec_forward.py +0 -57
msprobe/mindspore/free_benchmark/decorator/decorator_factory.py +0 -122
msprobe/pytorch/functional/module_dump.py +0 -84
{mindstudio_probe-1.1.0.dist-info → mindstudio_probe-1.2.1.dist-info}/LICENSE +0 -0
{mindstudio_probe-1.1.0.dist-info → mindstudio_probe-1.2.1.dist-info}/top_level.txt +0 -0
/msprobe/mindspore/{free_benchmark/decorator → code_mapping}/__init__.py +0 -0
/msprobe/pytorch/{functional → dump/module_dump}/__init__.py +0 -0

msprobe/core/overflow_check/checker.py ADDED Viewed

@@ -0,0 +1,138 @@
+# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Dict, List, Optional, Any
+from msprobe.core.common.const import Const
+from msprobe.core.overflow_check.abnormal_scene import InputAnomalyOutputNormalScene, InputAnomalyOutputAnomalyScene, \
+    InputNormalOutputAnomalyScene, NumericalMutationScene, AnomalyScene
+from msprobe.core.overflow_check.api_info import APIInfo
+from msprobe.core.overflow_check.filter import IgnoreFilter
+from msprobe.core.overflow_check.level import OverflowLevel
+class StatisticsFields:
+    """Constants naming the fields of the statistics dictionary."""
+    CRITICAL_APIS = 'critical_apis'
+    HIGH_PRIORITY_APIS = 'high_priority_apis'
+    MEDIUM_PRIORITY_APIS = 'medium_priority_apis'
+    ANOMALY_DETAILS = 'anomaly_details'
+    # All fields
+    ALL_FIELDS = [CRITICAL_APIS, HIGH_PRIORITY_APIS, MEDIUM_PRIORITY_APIS, ANOMALY_DETAILS]
+class AnomalyDetector:
+    """异常检测器"""
+    def __init__(self, dump_data: Dict):
+        """
+            初始化检测器，并保存dump_data
+        Args:
+            dump_data: 数据格式如下
+                {
+                    "api/module": {statistics}
+                }
+        """
+        self.dump_data = dump_data
+        self.ignore_filter = IgnoreFilter()
+        self.scene_types = [
+            InputNormalOutputAnomalyScene,      # 输入正常，输出异常
+            InputAnomalyOutputAnomalyScene,     # 输入异常，输出异常
+            InputAnomalyOutputNormalScene,      # 输入异常，输出正常
+            NumericalMutationScene              # 输出较输入值突变
+        ]
+        self.anomaly_scenes: Dict[str, AnomalyScene] = dict()
+    @staticmethod
+    def _create_api_info(api_name: str, data: Dict) -> APIInfo:
+        """从原始数据创建APIInfo实例"""
+        return APIInfo(
+            api_name=api_name,
+            input_args=data.get(Const.INPUT_ARGS, data.get(Const.INPUT, [])),
+            input_kwargs=data.get(Const.INPUT_KWARGS, {}),
+            output_data=data.get(Const.OUTPUT, [])
+        )
+    def get_statistics(self) -> Dict[str, List]:
+        """获取统计信息
+        使用StatisticsFields类统一管理字段名称，避免硬编码
+        Returns:
+            Dict[str, List]: 包含各优先级API列表和异常详情的字典
+        """
+        stats = {field: [] for field in StatisticsFields.ALL_FIELDS}
+        # 定义rank到结果key的映射关系
+        rank_to_key = {
+            OverflowLevel.CRITICAL: StatisticsFields.CRITICAL_APIS,
+            OverflowLevel.HIGH: StatisticsFields.HIGH_PRIORITY_APIS,
+            OverflowLevel.MEDIUM: StatisticsFields.MEDIUM_PRIORITY_APIS
+        }
+        for scene in self.anomaly_scenes.values():
+            stats[StatisticsFields.ANOMALY_DETAILS].append(scene.get_details())
+            # 根据rank分类API
+            key = rank_to_key.get(scene.rank, None)
+            if not key:
+                stats[key].append(scene.api_name)
+        return stats
+    def analyze(self):
+        """
+            按照异常场景对调用数据进行分析
+        Returns:
+            返回类本身，若不进行过滤，则仅调用analyze即可
+        """
+        # 遍历data item
+        for api_name, data in self.dump_data.items():
+            api_info = self._create_api_info(api_name, data)
+            # 每种都进行检测，可能涉及多种命中，原则如下：
+            #   - 就高原则
+            #   - 优先原则，数据异常放最后检测
+            for scene_type in self.scene_types:
+                scene = scene_type(api_info)
+                if hasattr(scene, 'matches') and scene.matches():
+                    self.anomaly_scenes[api_name] = scene
+                    break  # 直接跳过，就高原则
+        return self
+    def filter(self):
+        """
+            对误检数据进行过滤
+        Returns:
+            检查checker自身，方便链式调用
+        """
+        result = dict()
+        for api_name, scene in self.anomaly_scenes.items():
+            if self.ignore_filter.apply_filter(scene.api_data):
+                continue
+            result[api_name] = scene
+        self.anomaly_scenes = result
+        return self
+    def overflow_result(self) -> Dict[str, AnomalyScene]:
+        return self.anomaly_scenes
+    def has_overflow(self, api_name: str) -> bool:
+        return api_name in self.anomaly_scenes.keys()
+    def get_overflow_level(self, api_name: str) -> Optional[Any]:
+        scene = self.anomaly_scenes.get(api_name, None)
+        return scene.rank if scene else None

msprobe/core/overflow_check/filter.py ADDED Viewed

@@ -0,0 +1,157 @@
+# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os.path
+from dataclasses import dataclass, field
+from typing import Set
+from msprobe.core.common.file_utils import load_yaml
+from msprobe.core.overflow_check.api_info import APIInfo
+from msprobe.core.overflow_check.utils import has_nan_inf
+# Directory containing this module; the default rule file path is built from it.
+cur_path = os.path.dirname(os.path.realpath(__file__))
+class IgnoreFilter:
+    """Loads ignore rules from a YAML file and decides whether an API call should be filtered."""
+    def __init__(self, rule_path=os.path.join(cur_path, './ignore_rules.yaml')):
+        # Maps rule api_name -> Rule; filled by _load_rules.
+        self.rules = dict()
+        self._load_rules(rule_path)
+    def has_api_rule(self, api_name: str) -> bool:
+        """Return True if a rule is registered under ``api_name``."""
+        return api_name in self.rules.keys()
+    def apply_filter(self, api_info: APIInfo) -> bool:
+        """
+            Apply the filtering rules and report whether the call needs to be filtered.
+        Args:
+            api_info: information about the API call
+        Returns:
+            True if the call is a false positive and needs to be filtered, False otherwise
+        """
+        torch_api = api_info.torch_api_name
+        # No rule for this API: never filtered.
+        if not self.has_api_rule(torch_api):
+            return False
+        rule = self.rules.get(torch_api)
+        if not rule.match(api_info):
+            return False
+        return True
+    def _load_rules(self, rule_file_path):
+        """Fill ``self.rules`` from the ``ignore_nan_inf`` list of the YAML file at ``rule_file_path``."""
+        # Rules already present: nothing to load.
+        if self.rules and len(self.rules):
+            return
+        data = load_yaml(rule_file_path)
+        self.rules = dict()
+        for rule_item in data.get('ignore_nan_inf', []):
+            rule = Rule(
+                api_name=rule_item.get('api_name', ''),
+                desc=rule_item.get('description', ''),
+                input_ignore=rule_item.get('input_ignore', []),
+                output_ignore=rule_item.get('output_ignore', [])
+            )
+            # Skip rules that fail validation.
+            if not rule.verify_field():
+                continue
+            # The first rule seen for an api_name is kept; later duplicates are skipped.
+            if self.has_api_rule(rule.api_name):
+                continue
+            self.rules[rule.api_name] = rule
+class Rule:
+    """One ignore rule: an API name plus the input/output positions whose NaN/Inf may be ignored."""
+    def __init__(self, api_name, desc='', input_ignore=None, output_ignore=None):
+        self.api_name = api_name
+        self.desc = desc
+        self.input_ignore = IgnoreItem()
+        self.output_ignore = IgnoreItem()
+        self._init_ignore(input_ignore, output_ignore)
+    def __repr__(self):
+        return (f'Rule(api_name={self.api_name}, desc={self.desc}, input_ignore={self.input_ignore}, output_ignore='
+                f'{self.output_ignore})')
+    def verify_field(self):
+        """Return True if the rule has a non-empty api_name and at least one ignore entry."""
+        if self.api_name == '':
+            return False
+        # A rule with no input or output ignore entries is invalid
+        if not (len(self.input_ignore.index) + len(self.input_ignore.name) + len(self.output_ignore.index)):
+            return False
+        return True
+    def match(self, api_info: APIInfo) -> bool:
+        """
+            Check whether the API information matches this rule.
+        Returns:
+            bool: True if the api_info matches this rule, False otherwise
+        """
+        # First check that the API name matches
+        api_name = api_info.torch_api_name
+        if api_name != self.api_name:
+            return False
+        # Check NaN/Inf in the positional inputs (only when the rule lists input indices)
+        if self.input_ignore.index and len(api_info.input_args):
+            for idx, arg in enumerate(api_info.input_args):
+                # A NaN/Inf at a position that is not ignored means the rule does not match.
+                if has_nan_inf(arg) and not self.input_ignore.has_index(idx):
+                    return False
+        # Check NaN/Inf in the input kwargs (only when the rule lists input names)
+        if self.input_ignore.name and len(api_info.input_kwargs):
+            for name, value in api_info.input_kwargs.items():
+                if has_nan_inf(value) and not self.input_ignore.has_name(name):
+                    return False
+        # Check NaN/Inf in the outputs (only when the rule lists output indices)
+        if self.output_ignore.index and len(api_info.output_data):
+            for idx, out in enumerate(api_info.output_data):
+                if has_nan_inf(out) and not self.output_ignore.has_index(idx):
+                    return False
+        return True
+    def _init_ignore(self, input_ignore=None, output_ignore=None):
+        """Initialise the ignore items."""
+        if input_ignore is None:
+            input_ignore = []
+        if output_ignore is None:
+            output_ignore = []
+        # Process the input ignore rules
+        for item in input_ignore:
+            if 'index' in item:
+                self.input_ignore.add_index(item['index'])
+            if 'name' in item:
+                self.input_ignore.add_name(item['name'])
+        # Process the output ignore rules (only 'index' entries are read)
+        for item in output_ignore:
+            if 'index' in item:
+                self.output_ignore.add_index(item['index'])
+@dataclass
+class IgnoreItem:
+    """Stores the indices and names that should be ignored."""
+    index: Set[int] = field(default_factory=set)
+    name: Set[str] = field(default_factory=set)
+    def add_index(self, idx: int):
+        """Add ``idx`` to the set of ignored indices."""
+        self.index.add(idx)
+    def add_name(self, name: str):
+        """Add ``name`` to the set of ignored names."""
+        self.name.add(name)
+    def has_index(self, idx: int) -> bool:
+        """Return True if ``idx`` is an ignored index."""
+        return idx in self.index
+    def has_name(self, name: str) -> bool:
+        """Return True if ``name`` is an ignored name."""
+        return name in self.name

msprobe/core/overflow_check/ignore_rules.yaml ADDED Viewed

@@ -0,0 +1,55 @@
+ignore_nan_inf:
+  # Create an uninitialized memory
+  - api_name: "torch.empty"
+    description: "Creates a tensor with uninitialized data. The values may contain NaN or Inf because the memory is not cleared or set to zero."
+    output_ignore:
+      - index: 0
+  - api_name: "torch.empty_like"
+    description: "Creates an uninitialized tensor with the same size, dtype, and device as the input tensor. The values may contain NaN or Inf due to uninitialized memory."
+    output_ignore:
+      - index: 0
+  - api_name: "torch.empty_strided"
+    description: "Creates a tensor with uninitialized data using specified strides. NaN or Inf may be present due to uninitialized memory."
+    output_ignore:
+      - index: 0
+  # Distributed func
+  - api_name: "distributed.recv"
+    description: "Receives a tensor from another process. The input tensor may contain uninitialized data before the recv call, but it will be overwritten with received data."
+    input_ignore:
+      - index: 0  # tensor (the input buffer, which may be uninitialized before receiving)
+      - name: tensor
+  - api_name: "distributed.all_gather"
+    description: "Gathers tensors from all processes and distributes them to each process. The tensors in tensor_list may contain uninitialized data before the all_gather call, but they will be overwritten with collected data from all processes."
+    input_ignore:
+      - index: 0  # tensor_list (the input list of tensors, which may contain uninitialized data before the all_gather call)
+  - api_name: "distributed.reduce_scatter"
+    description: "Combines reduction and scatter operations. The output tensor may contain uninitialized data before the reduce_scatter call, but it will be overwritten with the reduced and scattered data from all processes."
+    input_ignore:
+      - index: 0
+      - name: output
+  - api_name: "distributed._reduce_scatter_base"
+    description: "Performs a combined reduction and scatter operation using a single input tensor. The output tensor may contain uninitialized data before the _reduce_scatter_base call, but it will be overwritten with the reduced and scattered data."
+    input_ignore:
+      - index: 0
+  - api_name: "distributed.all_gather_into_tensor"
+    description: "Gathers tensors from all processes into a single output tensor. The output tensor may contain uninitialized data before the all_gather_into_tensor call, but it will be overwritten with collected data from all processes."
+    input_ignore:
+      - index: 0
+  - api_name: "distributed.reduce_scatter_tensor"
+    description: "Performs a reduction operation across all processes and scatters the result into the output tensor. The output tensor may contain uninitialized data before the reduce_scatter_tensor call, but it will be overwritten with the reduced and scattered data."
+    input_ignore:
+      - index: 0
+  # Tensor inplace func
+  - api_name: "tensor.masked_fill_"
+    description: "Inplace fill tensor with given value by filtered mask"
+    input_ignore:
+      - index: 0

msprobe/core/overflow_check/level.py ADDED Viewed

@@ -0,0 +1,22 @@
+# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from enum import Enum
+class OverflowLevel(Enum):
+    """Enumeration of overflow levels: medium, high and critical."""
+    MEDIUM = "medium"
+    HIGH = "high"
+    CRITICAL = "critical"

msprobe/core/overflow_check/utils.py ADDED Viewed

@@ -0,0 +1,28 @@
+# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Any
+# Keys of a dict that has_nan_inf inspects.
+CHECK_FIELDS = ['Max', 'Min', 'Mean']
+# Lowercase string forms that has_nan_inf treats as an overflow value.
+OVERFLOW_VALUES = ['inf', '-inf', 'nan']
+def has_nan_inf(value: Any) -> bool:
+    """Check whether the value contains NaN or Inf.
+
+    Only a dict is inspected, and only its top-level entries whose key is
+    listed in CHECK_FIELDS; nested containers are not searched.
+
+    Args:
+        value: Object to check. Anything that is not a dict yields False.
+
+    Returns:
+        True if any CHECK_FIELDS entry, converted with str() and lowercased,
+        equals one of OVERFLOW_VALUES; otherwise False.
+    """
+    if isinstance(value, dict):
+        for k, v in value.items():
+            # Compare the lowercased text form so that the match does not depend on
+            # the type of v or on the letter case of its text.
+            if k in CHECK_FIELDS and str(v).lower() in OVERFLOW_VALUES:
+                return True
+    return False

msprobe/docs/01.installation.md CHANGED Viewed

@@ -16,6 +16,9 @@ pip install mindstudio-probe
 |版本|发布日期|支持 PyTorch 版本|支持 MindSpore 版本|下载链接|校验码|
 |:--:|:--:|:--:|:--:|:--:|:--:|
+|1.2.0|2025.1.13|1.11/2.0/2.1/2.2|2.4.0|[mindstudio_probe-1.2.0-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/msprobe/1.2/mindstudio_probe-1.2.0-py3-none-any.whl)|1e3aeea1706112f6ee52fd1165037936bb209138f0b9ec42ea21e2c1c8942cdc|
+|1.1.1|2024.12.09|1.11/2.0/2.1/2.2|2.4.0|[mindstudio_probe-1.1.1-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/msprobe/1.1/mindstudio_probe-1.1.1-py3-none-any.whl)|577b597555dc155b76ba1a62d575c3546004644e140a456c3ba0824d46283735|
+|1.1.0|2024.10.14|1.11/2.0/2.1/2.2|2.4.0|[mindstudio_probe-1.1.0-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/msprobe/1.1/mindstudio_probe-1.1.0-py3-none-any.whl)|83a5a9b7c65a357639f8c9636d88c693b4cf0eb590d4f8f5cb56395ba69b1f6d|
 |1.0.4|2024.09.09|1.11/2.0/2.1/2.2|2.4.0|[mindstudio_probe-1.0.4-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/msprobe/1.0/mindstudio_probe-1.0.4-py3-none-any.whl)|4e1909566a71a855b356597750c20ee43d964a22b2c2b02ac08312a5def75fd6|
 | 1.0.3 | 2024.08.23 | 1.11/2.0/2.1/2.2 | 2.4.0 | [mindstudio_probe-1.0.3-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/msprobe/1.0/mindstudio_probe-1.0.3-py3-none-any.whl) | 7060cc141a5b98ef770cd9220995d299393f32a61938261e632c7e8b5160bef2 |
 | 1.0.2 | 2024.08.09 | 1.11/2.0/2.1/2.2 | 2.4.0 | [mindstudio_probe-1.0.2-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/msprobe/1.0/mindstudio_probe-1.0.2-py3-none-any.whl) | e4a980e5d98c426ce5ce9842520d9bc031d3b3de621c74b3d59414cc6e238e0e |
@@ -40,18 +43,104 @@ cd mstt/debug/accuracy_tools
 pip install setuptools wheel
-python setup.py bdist_wheel
+python setup.py bdist_wheel [--include-mod=[adump]]
 cd ./dist
 pip install ./mindstudio_probe*.whl
 ```
-# 历史版本特性
+|参数|说明|是否必选|
+|--|--|:--:|
+|--include-mod|指定可选模块，可取值`adump`，表示在编whl包时加入adump模块。默认未配置该参数，表示编基础包。<br>&#8226; adump模块用于MindSpore静态图场景L2级别的dump。<br>&#8226; 仅MindSpore 2.5.0及以上版本支持adump模块。<br>&#8226; 若使用源码安装，编译环境需支持GCC 7或以上版本，和CMAKE 3.14或以上版本。<br>&#8226; 生成的whl包仅限编译时使用的python版本和处理器架构可用。|否|
-<table>
-    <tr><th>版本</th><th>特性</th></tr>
-    <tr><td rowspan="2">1.0.3</td><td>【精度预检】</br>1. 落盘数据小；</br>2. 支持随机生成模式和真实数据模式；</br>3. 单 API 测试，排除整网中的累计误差问题。</td></tr>
-    <tr><td>【梯度检测】</br>1. 使用便捷，无需在训练流程里插入代码。</br>2. 可以精准定位问题出现的 step。</td></tr>
-</table>
+# 特性变更说明
+## 1.1.1
+【数据采集】
+- dump 支持 processgroup、namedtuple、slice 等数据类型
+- MindSpore 动态图 dump 能力增强，支持 mix 模式 dump、控制 dropout 失效、支持控制区间正反向数据 dump
+【精度预检】
+- PyTorch 场景新增单算子 API 自动生成脚本
+- MindSpore 动态图场景新增支持 multi_run_ut 多线程预检
+- MindSpore 场景新增支持断点续检
+【精度比对】
+- 新增 MindSpore 跨框架比对能力，支持 MindSpore 与 PyTorch 跨框架比对
+- 支持异常比对结果数据自动颜色标注
+【无标杆比对】
+- MindSpore 动态图场景支持反向过程的无标杆比对
+【训练状态监控】
+- 新增支持通信聚合前梯度信息监控
+【分级可视化构图比对】
+- 新增分级可视化构图比对工具，支持单数据构图、溢出检测、双数据比对构图、同时支持传入映射文件，支持跨框架或同框架比对
+## 1.1.0
+【总体】
+- 训练精度一体化工具 atat 统一更名为 msprobe
+- msprobe 支持日志分级功能
+【数据采集】
+- 增加 L1 dump 接口，支持在指定区间内进行正反向 dump 功能
+- 新增 MindSpore 函数式接口的通信 API dump 功能
+【精度预检】
+- 支持配置 blacklist 黑名单字段
+- 补充了支持的融合算子列表
+【精度比对】
+- 支持 data mapping 和 layer mapping 的比对功能
+【梯度工具】
+- 增加了梯度工具中关于 JIT 限制的说明
+## 1.0.4
+【数据采集】
+- 支持在 config.json 中传入 step 范围配置
+- 优化了 MindSpore 场景下的 step 机制，step 结束后训练继续运行
+【精度预检】
+- 在 PyTorch 场景下，支持部分 NPU 融合算子精度预检
+【精度比对】
+- 解决了在 MindSpore 场景下需要安装 PyTorch 的问题
+【无标杆比对】
+- 补充了 PyTorch 场景的性能基线报告
+- 支持 MindSpore 场景下的 change_value 扰动模式
+## 1.0.3
+【精度预检】
+- 落盘数据缩减
+- 支持随机生成模式和真实数据模式
+- 单 API 测试，排除整网中的累计误差问题
+【梯度检测】
+- 使用便捷，无需在训练流程里插入代码
+- 可以精准定位问题出现的 step
 # 查看 msprobe 工具信息

mindstudio-probe 1.1.0__py3-none-any.whl → 1.2.1__py3-none-any.whl

mindstudio-probe 1.1.0py3-none-any.whl → 1.2.1py3-none-any.whl