mindstudio-probe 1.2.2__py3-none-any.whl → 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (153)
  1. {mindstudio_probe-1.2.2.dist-info → mindstudio_probe-1.3.0.dist-info}/METADATA +3 -3
  2. {mindstudio_probe-1.2.2.dist-info → mindstudio_probe-1.3.0.dist-info}/RECORD +143 -144
  3. msprobe/README.md +25 -20
  4. msprobe/core/common/const.py +110 -66
  5. msprobe/core/common/decorator.py +50 -0
  6. msprobe/core/common/exceptions.py +3 -1
  7. msprobe/core/common/file_utils.py +25 -2
  8. msprobe/core/common/utils.py +30 -34
  9. msprobe/core/compare/acc_compare.py +43 -74
  10. msprobe/core/compare/check.py +2 -6
  11. msprobe/core/compare/highlight.py +2 -0
  12. msprobe/core/compare/layer_mapping/layer_mapping.py +2 -1
  13. msprobe/core/compare/merge_result/merge_result.py +8 -2
  14. msprobe/core/compare/multiprocessing_compute.py +19 -12
  15. msprobe/core/compare/npy_compare.py +30 -12
  16. msprobe/core/compare/utils.py +20 -10
  17. msprobe/core/data_dump/api_registry.py +176 -0
  18. msprobe/core/data_dump/data_processor/base.py +2 -2
  19. msprobe/core/data_dump/data_processor/mindspore_processor.py +19 -32
  20. msprobe/core/data_dump/data_processor/pytorch_processor.py +45 -15
  21. msprobe/core/data_dump/json_writer.py +38 -35
  22. msprobe/core/grad_probe/constant.py +1 -0
  23. msprobe/core/grad_probe/grad_compare.py +1 -1
  24. msprobe/core/overflow_check/abnormal_scene.py +2 -0
  25. msprobe/docs/01.installation.md +2 -1
  26. msprobe/docs/02.config_introduction.md +17 -15
  27. msprobe/docs/05.data_dump_PyTorch.md +70 -2
  28. msprobe/docs/06.data_dump_MindSpore.md +33 -12
  29. msprobe/docs/07.accuracy_checker_PyTorch.md +11 -1
  30. msprobe/docs/08.accuracy_checker_online_PyTorch.md +3 -1
  31. msprobe/docs/09.accuracy_checker_MindSpore.md +1 -1
  32. msprobe/docs/10.accuracy_compare_PyTorch.md +59 -33
  33. msprobe/docs/11.accuracy_compare_MindSpore.md +40 -16
  34. msprobe/docs/12.overflow_check_PyTorch.md +3 -1
  35. msprobe/docs/13.overflow_check_MindSpore.md +4 -2
  36. msprobe/docs/14.data_parse_PyTorch.md +1 -7
  37. msprobe/docs/18.online_dispatch.md +1 -1
  38. msprobe/docs/19.monitor.md +124 -62
  39. msprobe/docs/21.visualization_PyTorch.md +32 -13
  40. msprobe/docs/22.visualization_MindSpore.md +32 -13
  41. msprobe/docs/23.generate_operator_PyTorch.md +9 -9
  42. msprobe/docs/27.dump_json_instruction.md +278 -8
  43. msprobe/docs/28.kernel_dump_MindSpore.md +1 -1
  44. msprobe/docs/29.data_dump_MSAdapter.md +229 -0
  45. msprobe/docs/30.overflow_check_MSAdapter.md +31 -0
  46. msprobe/docs/FAQ.md +3 -11
  47. msprobe/docs/img/compare_result.png +0 -0
  48. msprobe/docs/img/merge_result.png +0 -0
  49. msprobe/docs/img/visualization/vis_browser_1.png +0 -0
  50. msprobe/docs/img/visualization/vis_match_info.png +0 -0
  51. msprobe/docs/img/visualization/vis_precision_info.png +0 -0
  52. msprobe/docs/img/visualization/vis_search_info.png +0 -0
  53. msprobe/docs/img/visualization/vis_show_info.png +0 -0
  54. msprobe/docs/img/visualization/vis_showcase.png +0 -0
  55. msprobe/docs/img/visualization/vis_unmatch_info.png +0 -0
  56. msprobe/mindspore/__init__.py +4 -3
  57. msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +6 -1
  58. msprobe/mindspore/api_accuracy_checker/api_runner.py +19 -9
  59. msprobe/mindspore/api_accuracy_checker/base_compare_algorithm.py +2 -1
  60. msprobe/mindspore/api_accuracy_checker/bench_functions/flash_attention_score.py +602 -0
  61. msprobe/mindspore/api_accuracy_checker/bench_functions/fusion_operator.py +41 -0
  62. msprobe/mindspore/api_accuracy_checker/data_manager.py +2 -1
  63. msprobe/mindspore/api_accuracy_checker/multi_api_accuracy_checker.py +2 -1
  64. msprobe/mindspore/api_accuracy_checker/torch_mindtorch_importer.py +2 -1
  65. msprobe/mindspore/common/const.py +61 -0
  66. msprobe/mindspore/common/utils.py +31 -19
  67. msprobe/mindspore/compare/ms_compare.py +27 -19
  68. msprobe/mindspore/compare/ms_graph_compare.py +6 -5
  69. msprobe/mindspore/debugger/debugger_config.py +6 -4
  70. msprobe/mindspore/debugger/precision_debugger.py +22 -10
  71. msprobe/mindspore/dump/dump_tool_factory.py +5 -3
  72. msprobe/mindspore/dump/hook_cell/api_register.py +142 -0
  73. msprobe/mindspore/dump/hook_cell/hook_cell.py +9 -10
  74. msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml +24 -26
  75. msprobe/mindspore/dump/jit_dump.py +14 -9
  76. msprobe/mindspore/dym_loader/hook_dynamic_loader.cc +22 -56
  77. msprobe/mindspore/dym_loader/hook_dynamic_loader.h +0 -1
  78. msprobe/mindspore/free_benchmark/api_pynative_self_check.py +10 -6
  79. msprobe/mindspore/free_benchmark/perturbation/perturbation_factory.py +4 -2
  80. msprobe/mindspore/free_benchmark/self_check_tool_factory.py +6 -3
  81. msprobe/mindspore/grad_probe/global_context.py +2 -0
  82. msprobe/mindspore/grad_probe/grad_analyzer.py +2 -1
  83. msprobe/mindspore/grad_probe/hook.py +2 -4
  84. msprobe/mindspore/monitor/distributed/wrap_distributed.py +1 -1
  85. msprobe/mindspore/monitor/module_hook.py +354 -302
  86. msprobe/mindspore/monitor/utils.py +46 -4
  87. msprobe/mindspore/overflow_check/overflow_check_tool_factory.py +5 -3
  88. msprobe/mindspore/service.py +23 -17
  89. msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +3 -6
  90. msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +11 -6
  91. msprobe/pytorch/api_accuracy_checker/generate_op_script/operator_replication.template +2 -2
  92. msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +4 -5
  93. msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +5 -5
  94. msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +25 -6
  95. msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py +28 -19
  96. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py +3 -1
  97. msprobe/pytorch/bench_functions/moe_gating_top_k_softmax.py +6 -0
  98. msprobe/pytorch/common/utils.py +29 -7
  99. msprobe/pytorch/debugger/precision_debugger.py +10 -1
  100. msprobe/pytorch/dump/module_dump/module_dump.py +4 -3
  101. msprobe/pytorch/dump/module_dump/module_processer.py +12 -6
  102. msprobe/pytorch/free_benchmark/common/utils.py +1 -1
  103. msprobe/pytorch/free_benchmark/compare/single_benchmark.py +1 -1
  104. msprobe/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py +3 -3
  105. msprobe/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py +3 -3
  106. msprobe/pytorch/free_benchmark/perturbed_layers/npu/change_value.py +1 -1
  107. msprobe/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py +1 -1
  108. msprobe/pytorch/free_benchmark/result_handlers/check_handler.py +1 -1
  109. msprobe/pytorch/function_factory.py +1 -1
  110. msprobe/pytorch/grad_probe/grad_monitor.py +2 -2
  111. msprobe/pytorch/hook_module/api_register.py +131 -0
  112. msprobe/pytorch/hook_module/hook_module.py +19 -14
  113. msprobe/pytorch/hook_module/register_optimizer_hook.py +2 -1
  114. msprobe/pytorch/hook_module/support_wrap_ops.yaml +172 -75
  115. msprobe/pytorch/monitor/csv2tb.py +8 -2
  116. msprobe/pytorch/monitor/distributed/wrap_distributed.py +8 -2
  117. msprobe/pytorch/monitor/module_hook.py +131 -105
  118. msprobe/pytorch/monitor/module_metric.py +3 -0
  119. msprobe/pytorch/monitor/optimizer_collect.py +55 -4
  120. msprobe/pytorch/monitor/unittest/test_monitor.py +1 -1
  121. msprobe/pytorch/monitor/utils.py +68 -1
  122. msprobe/pytorch/online_dispatch/compare.py +0 -2
  123. msprobe/pytorch/online_dispatch/dispatch.py +9 -0
  124. msprobe/pytorch/online_dispatch/dump_compare.py +3 -0
  125. msprobe/pytorch/online_dispatch/utils.py +3 -0
  126. msprobe/pytorch/parse_tool/lib/interactive_cli.py +1 -6
  127. msprobe/pytorch/parse_tool/lib/utils.py +2 -1
  128. msprobe/pytorch/pt_config.py +11 -7
  129. msprobe/pytorch/service.py +11 -8
  130. msprobe/visualization/builder/graph_builder.py +44 -5
  131. msprobe/visualization/builder/msprobe_adapter.py +0 -1
  132. msprobe/visualization/compare/graph_comparator.py +42 -38
  133. msprobe/visualization/compare/mode_adapter.py +0 -19
  134. msprobe/visualization/graph/base_node.py +8 -1
  135. msprobe/visualization/graph/distributed_analyzer.py +1 -10
  136. msprobe/visualization/graph/graph.py +0 -11
  137. msprobe/visualization/graph/node_op.py +1 -2
  138. msprobe/visualization/graph_service.py +1 -1
  139. msprobe/visualization/utils.py +2 -33
  140. msprobe/mindspore/dump/hook_cell/api_registry.py +0 -207
  141. msprobe/mindspore/dump/hook_cell/wrap_api.py +0 -212
  142. msprobe/pytorch/hook_module/api_registry.py +0 -166
  143. msprobe/pytorch/hook_module/wrap_distributed.py +0 -79
  144. msprobe/pytorch/hook_module/wrap_functional.py +0 -66
  145. msprobe/pytorch/hook_module/wrap_npu_custom.py +0 -85
  146. msprobe/pytorch/hook_module/wrap_tensor.py +0 -69
  147. msprobe/pytorch/hook_module/wrap_torch.py +0 -84
  148. msprobe/pytorch/hook_module/wrap_vf.py +0 -60
  149. msprobe/pytorch/parse.py +0 -19
  150. {mindstudio_probe-1.2.2.dist-info → mindstudio_probe-1.3.0.dist-info}/LICENSE +0 -0
  151. {mindstudio_probe-1.2.2.dist-info → mindstudio_probe-1.3.0.dist-info}/WHEEL +0 -0
  152. {mindstudio_probe-1.2.2.dist-info → mindstudio_probe-1.3.0.dist-info}/entry_points.txt +0 -0
  153. {mindstudio_probe-1.2.2.dist-info → mindstudio_probe-1.3.0.dist-info}/top_level.txt +0 -0
msprobe/mindspore/monitor/utils.py

@@ -12,13 +12,16 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+import os
+import re
+from datetime import datetime
 from mindspore import dtype as mstype, Tensor
 
 from msprobe.mindspore.monitor.features import FUNC_MAP
 from msprobe.core.common.const import MonitorConst
 from msprobe.core.common.utils import is_int
 from msprobe.core.common.log import logger
+from msprobe.core.common.file_utils import check_file_or_directory_path
 
 
 def get_single_metrics(op_list, tag, tensor, output=None):
@@ -95,8 +98,8 @@ def validate_ranks(ranks):
     if not isinstance(ranks, list):
         raise TypeError("module_ranks should be a list")
     for rank in ranks:
-        if not isinstance(rank, str):
-            raise TypeError(f"element in module_ranks should be a str, get {type(rank)}")
+        if not isinstance(rank, int):
+            raise TypeError(f"element in module_ranks should be a int, get {type(rank)}")
 
 
 def validate_targets(targets):
@@ -209,6 +212,11 @@ def validate_collect_times(collect_times):
         raise ValueError("collect_times must greater than 1")
 
 
+def validate_dynamic_on(dynamic_on):
+    if not isinstance(dynamic_on, bool):
+        raise TypeError('dynamic_on should be a bool')
+
+
 def validate_config(config):
     config['ops'] = validate_ops(config.get('ops', []))
 
@@ -255,9 +263,12 @@ def validate_config(config):
     step_interval = config.get('step_interval', 1)
     validate_step_interval(step_interval)
 
-    collect_times = config.get('collect_times', 1e8)
+    collect_times = config.get('collect_times', int(1e8))
     validate_collect_times(collect_times)
 
+    dynamic_on = config.get('dynamic_on', False)
+    validate_dynamic_on(dynamic_on)
+
     if not targets:
         if xy_distribution:
             config["all_xy"] = True
@@ -265,3 +276,34 @@ def validate_config(config):
             config["is_select"] = False
         else:
             config["is_select"] = True
+
+
+def time_str2time_digit(time_str):
+    time_format = '%b%d_%H-%M-%S'
+    try:
+        time_digit = datetime.strptime(time_str, time_format)
+    except Exception as e:
+        raise RuntimeError(f"illegal timestamp: {time_str}, timestamp should be prefix \
+            of existing output dirpath, like 'Dec03_21-34-40'.") from e
+    return time_digit
+
+
+def get_target_output_dir(monitor_path, time_start, time_end):
+    check_file_or_directory_path(monitor_path, isdir=True)
+    time_start = time_str2time_digit(time_start) if time_start is not None else time_start
+    time_end = time_str2time_digit(time_end) if time_end is not None else time_end
+    if time_start and time_end and time_start > time_end:
+        raise ValueError(f"time_start({time_start}) greater than time_end({time_end})")
+    result = {}
+    for dirname in os.listdir(monitor_path):
+        match = re.match(MonitorConst.OUTPUT_DIR_PATTERN, dirname)
+        if not match:
+            continue
+        time_tag = match.group(1)
+        rank = match.group(2)
+        target_time = time_str2time_digit(time_tag)
+        start_ok = time_start is None or target_time >= time_start
+        end_ok = time_end is None or target_time <= time_end
+        if start_ok and end_ok:
+            result[rank] = os.path.join(monitor_path, dirname)
+    return result
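The new get_target_output_dir helper maps each rank to its monitor output directory, keeping only directories whose time tag falls inside the requested window. A minimal usage sketch; the import path follows the files-changed list above and the monitor_path value is purely illustrative:

# Hypothetical usage of the helper added above.
from msprobe.mindspore.monitor.utils import get_target_output_dir

# Keep outputs whose time tag (format '%b%d_%H-%M-%S') lies inside the window.
target_dirs = get_target_output_dir("./monitor_output", "Dec03_21-00-00", "Dec03_22-00-00")
for rank, dirpath in target_dirs.items():
    print(rank, dirpath)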
msprobe/mindspore/overflow_check/overflow_check_tool_factory.py

@@ -1,4 +1,4 @@
-# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
+# Copyright (c) 2024-2025, Huawei Technologies Co., Ltd.
 # All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from msprobe.core.common.log import logger
 from msprobe.mindspore.common.const import Const
 from msprobe.mindspore.debugger.debugger_config import DebuggerConfig
 from msprobe.mindspore.overflow_check.kernel_graph_overflow_check import KernelGraphOverflowCheck
@@ -44,6 +45,7 @@ class OverflowCheckToolFactory:
             raise Exception("Valid level is needed.")
         tool = tool.get(config.execution_mode)
         if not tool:
-            raise Exception(f"Overflow check is not supported in {config.execution_mode} mode "
-                            f"when level is {config.level}.")
+            logger.error(f"Overflow check is not supported in {config.execution_mode} mode "
+                         f"when level is {config.level}.")
+            raise ValueError
         return tool(config)
msprobe/mindspore/service.py

@@ -41,7 +41,7 @@ from msprobe.mindspore.cell_processor import CellProcessor
 from msprobe.mindspore.common.log import logger
 from msprobe.mindspore.common.utils import (get_rank_if_initialized, clean_input_kwargs,
                                             is_mindtorch, register_backward_hook_functions)
-from msprobe.mindspore.dump.hook_cell.api_registry import api_register
+from msprobe.mindspore.dump.hook_cell.api_register import get_api_register
 from msprobe.mindspore.dump.hook_cell.primitive_hooks import PrimitiveHookService
 from msprobe.mindspore.dump.jit_dump import JitDump
 from msprobe.mindspore.dump.hook_cell.hook_cell import HOOKCell
@@ -63,6 +63,8 @@ class Service:
         self.inner_switch = False
         self.primitive_switch = False
         self.current_iter = 0
+        self.loop = 0
+        self.init_step = 0
         self.first_start = True
         self.current_rank = None
         self.dump_iter_dir = None
@@ -71,6 +73,7 @@ class Service:
         self.params_grad_info = {}
         self.hook_handle_dict = {}
         # 提前注册,确保注册尽可能多的API hook
+        self.api_register = get_api_register()
         self.register_api_hook()
         self.init_for_debug_level()
 
@@ -276,11 +279,24 @@ class Service:
         if self.config.task == Const.TENSOR:
             self.data_collector.data_processor.dump_async_data()
         self.data_collector.write_json()
-        self.current_iter += 1
-        self.data_collector.update_iter(self.current_iter)
+        self.loop += 1
         self.reset_status()
 
     def start(self, model=None):
+        if self.current_iter == 0:
+            if self.config.level in [Const.LEVEL_MIX, Const.LEVEL_L1]:
+                JitDump.set_config(self.config)
+                JitDump.set_data_collector(self.data_collector)
+                if hasattr(ms.common.api, "_MindsporeFunctionExecutor"):
+                    ms.common.api._MindsporeFunctionExecutor = JitDump
+                else:
+                    ms.common.api._JitExecutor = JitDump
+                ms.common.api._PyNativeExecutor.grad = JitDump.grad
+                if pijit_label:
+                    PIJitCaptureContext.__enter__ = self.empty
+                    PIJitCaptureContext.__exit__ = self.empty
+        self.current_iter = self.loop + self.init_step
+        self.data_collector.update_iter(self.current_iter)
         if self.config.level == Const.LEVEL_DEBUG:
             return
         self.start_call = True
@@ -293,6 +309,7 @@ class Service:
             print_tools_ends_info()
             return
         if self.config.step and self.current_iter not in self.config.step:
+            JitDump.jit_dump_switch = False
             return
         self.model = self.check_model_valid(model)
 
@@ -308,20 +325,9 @@ class Service:
                 return
             self.register_primitive_hook()
             self.register_cell_hook()
-            if self.config.level in [Const.LEVEL_MIX, Const.LEVEL_L1]:
-                JitDump.set_config(self.config)
-                JitDump.set_data_collector(self.data_collector)
-                if hasattr(ms.common.api, "_MindsporeFunctionExecutor"):
-                    ms.common.api._MindsporeFunctionExecutor = JitDump
-                else:
-                    ms.common.api._JitExecutor = JitDump
-                ms.common.api._PyNativeExecutor.grad = JitDump.grad
-                if pijit_label:
-                    PIJitCaptureContext.__enter__ = self.empty
-                    PIJitCaptureContext.__exit__ = self.empty
             self.first_start = False
 
-        api_register.api_set_hook_func()
+        self.api_register.register_all_api()
         self.switch = True
         self.primitive_switch = True
         logger.info(f"Dump switch is turned on at step {self.current_iter}. ")
@@ -410,8 +416,8 @@ class Service:
     def register_api_hook(self):
        if self.config.level in [Const.LEVEL_MIX, Const.LEVEL_L1, Const.LEVEL_L2]:
            logger.info(f"The api {self.config.task} hook function is successfully mounted to the model.")
-            api_register.initialize_hook(functools.partial(self.build_hook, BaseScope.Module_Type_API))
-            api_register.api_set_hook_func()
+            self.api_register.initialize_hook(functools.partial(self.build_hook, BaseScope.Module_Type_API))
+            self.api_register.register_all_api()
 
     def get_cells_and_names(self):
         cells_and_names_with_index = {}
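These hunks, together with the deleted hook_cell/api_registry.py and wrap_api.py entries in the file list, replace the module-level api_register singleton with a get_api_register() factory and rename the registration methods. A rough sketch of the new call pattern inferred from the diff; build_hook below is a stand-in for Service.build_hook, not the real method:

# Sketch only: fetch the registry, install a hook factory, then wrap the supported APIs.
import functools

from msprobe.mindspore.dump.hook_cell.api_register import get_api_register


def build_hook(scope_type, api_name):
    # placeholder for Service.build_hook(BaseScope.Module_Type_API, api_name)
    return None


api_register = get_api_register()
api_register.initialize_hook(functools.partial(build_hook, "api"))
api_register.register_all_api()  # formerly api_register.api_set_hook_func()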
msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py

@@ -40,7 +40,7 @@ from msprobe.pytorch.api_accuracy_checker.run_ut.run_ut_utils import get_validat
 from msprobe.pytorch.api_accuracy_checker.common.utils import extract_detailed_api_segments, extract_basic_api_segments
 from msprobe.core.common.file_utils import FileChecker, change_mode, create_directory
 from msprobe.pytorch.common.log import logger
-from msprobe.core.common.utils import CompareException
+from msprobe.core.common.utils import CompareException, check_op_str_pattern_valid
 from msprobe.core.common.const import Const, CompareConst, FileCheckConst
 
 CompareConfig = namedtuple('CompareConfig', ['npu_csv_path', 'gpu_csv_path', 'result_csv_path', 'details_csv_path'])
@@ -151,6 +151,7 @@ def analyse_csv(npu_data, gpu_data, config):
         message = ''
         compare_column = ApiPrecisionOutputColumn()
         full_api_name_with_direction_status = row_npu[ApiPrecisionCompareColumn.API_NAME]
+        check_op_str_pattern_valid(full_api_name_with_direction_status)
         row_gpu = gpu_data[gpu_data[ApiPrecisionCompareColumn.API_NAME] == full_api_name_with_direction_status]
         api_name, api_full_name, direction_status = extract_detailed_api_segments(full_api_name_with_direction_status)
         if not api_full_name:
@@ -430,6 +431,7 @@ def _api_precision_compare(parser=None):
     _api_precision_compare_parser(parser)
     args = parser.parse_args(sys.argv[1:])
     _api_precision_compare_command(args)
+    logger.info("Compare task completed.")
 
 
 def _api_precision_compare_command(args):
@@ -457,8 +459,3 @@ def _api_precision_compare_parser(parser):
     parser.add_argument("-o", "--out_path", dest="out_path", default="", type=str,
                         help="<optional> The api precision compare task result out path.",
                         required=False)
-
-
-if __name__ == '__main__':
-    _api_precision_compare()
-    logger.info("Compare task completed.")
msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py

@@ -28,10 +28,10 @@ from msprobe.pytorch.api_accuracy_checker.compare.compare_utils import binary_st
     ulp_standard_api, thousandth_standard_api
 from msprobe.core.common.file_utils import FileOpen, load_json, save_json
 from msprobe.core.common.utils import check_file_or_directory_path, check_op_str_pattern_valid, is_int
-from msprobe.core.common.const import Const, MonitorConst, MsgConst
+from msprobe.core.common.const import Const, MonitorConst, MsgConst, FileCheckConst
 from msprobe.core.common.log import logger
-from msprobe.core.common.file_utils import make_dir
-from msprobe.core.common.utils import recursion_depth_decorator
+from msprobe.core.common.file_utils import make_dir, change_mode
+from msprobe.core.common.decorator import recursion_depth_decorator
 
 TENSOR_DATA_LIST = ["torch.Tensor", "torch.nn.parameter.Parameter"]
 TORCH_BOOL_TYPE = ["torch.bool"]
@@ -50,6 +50,7 @@ DATA_NAME = "data_name"
 API_MAX_LENGTH = 30
 PROPAGATION_LIST = [Const.FORWARD, Const.BACKWARD]
 DATAMODE_LIST = ["random_data", "real_data"]
+ITER_MAX_TIMES = 1000
 
 
 class APIInfo:
@@ -97,6 +98,8 @@ class CommonConfig:
         iter_t = self.iter_times
         if iter_t <= 0:
             raise ValueError("iter_times should be an integer bigger than zero!")
+        if iter_t > ITER_MAX_TIMES:
+            raise ValueError("iter_times should not be greater than 1000!")
 
         json_file = self.extract_api_path
         propagation = self.propagation
@@ -117,7 +120,7 @@ class CommonConfig:
 
         # Retrieve the first API name and dictionary
         forward_item = next(iter(json_content.items()), None)
-        if not forward_item or not isinstance(forward_item[1], dict):
+        if not forward_item or not isinstance(forward_item[1], dict) or not forward_item[1]:
             raise ValueError(f'Invalid forward API data in json_content!')
 
         # if propagation is backward, ensure json file contains forward and backward info
@@ -127,7 +130,7 @@ class CommonConfig:
         # if propagation is backward, ensure it has valid data
         if propagation == Const.BACKWARD:
             backward_item = list(json_content.items())[1]
-            if not isinstance(backward_item[1], dict):
+            if not isinstance(backward_item[1], dict) or not backward_item[1]:
                 raise ValueError(f'Invalid backward API data in json_content!')
 
         return json_content
@@ -169,7 +172,7 @@ class APIExtractor:
                 value = self.load_real_data_path(value, real_data_path)
                 new_data[key] = value
         if not new_data:
-            logger.error(f"Error: The api '{self.api_name}' does not exist in the file.")
+            logger.warning(f"Warning: The api '{self.api_name}' does not exist in the file.")
         else:
             save_json(self.output_file, new_data, indent=4)
             logger.info(
@@ -183,6 +186,7 @@ class APIExtractor:
                 self.update_data_name(v, dump_data_dir)
         return value
 
+    @recursion_depth_decorator("OpGenerator: APIExtractor.update_data_name")
     def update_data_name(self, data, dump_data_dir):
         if isinstance(data, list):
             for item in data:
@@ -467,6 +471,7 @@ def _run_operator_generate_commond(cmd_args):
             fout.write(code_template.format(**internal_settings))
     except OSError:
         logger.error(f"Failed to open file. Please check file {template_path} or {operator_script_path}.")
+    change_mode(operator_script_path, FileCheckConst.DATA_FILE_AUTHORITY)
 
     logger.info(f"Generate operator script successfully and the name is {operator_script_path}.")
 
msprobe/pytorch/api_accuracy_checker/generate_op_script/operator_replication.template

@@ -37,9 +37,9 @@ def load_pt(pt_path, to_cpu=False):
     pt_path = os.path.realpath(pt_path)
     try:
         if to_cpu:
-            pt = torch.load(pt_path, map_location=torch.device("cpu"))
+            pt = torch.load(pt_path, map_location=torch.device("cpu"), weights_only=True)
         else:
-            pt = torch.load(pt_path)
+            pt = torch.load(pt_path, weights_only=True)
     except Exception as e:
         raise RuntimeError(f"load pt file {{pt_path}} failed") from e
     return pt
msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py

@@ -50,6 +50,9 @@ def split_json_file(input_file, num_splits, filter_api):
         backward_data[f"{data_name}.backward"] = backward_data.pop(data_name)
 
     input_data = load_json(input_file)
+    if "dump_data_dir" not in input_data.keys():
+        logger.error("Invalid input file, 'dump_data_dir' field is missing")
+        raise CompareException("Invalid input file, 'dump_data_dir' field is missing")
     if input_data.get("data") is None:
         logger.error("Invalid input file, 'data' field is missing")
         raise CompareException("Invalid input file, 'data' field is missing")
@@ -97,7 +100,7 @@ def run_parallel_ut(config):
     processes = []
     device_id_cycle = cycle(config.device_id)
     if config.save_error_data_flag:
-        logger.info("UT task error datas will be saved")
+        logger.info("UT task error data will be saved")
     logger.info(f"Starting parallel UT with {config.num_splits} processes")
     progress_bar = tqdm(total=config.total_items, desc="Total items", unit="items")
 
@@ -221,7 +224,3 @@ def main():
     args = parser.parse_args()
     config = prepare_config(args)
     run_parallel_ut(config)
-
-
-if __name__ == '__main__':
-    main()
msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py

@@ -34,8 +34,10 @@ from msprobe.pytorch.api_accuracy_checker.run_ut.run_ut_utils import exec_api, i
 from msprobe.core.common.file_utils import check_link, FileChecker
 from msprobe.pytorch.api_accuracy_checker.common.utils import extract_basic_api_segments
 from msprobe.core.common.const import FileCheckConst, Const
+from msprobe.core.common.utils import check_op_str_pattern_valid
 from msprobe.pytorch.common.log import logger
 from msprobe.pytorch.common.parse_json import parse_json_info_forward_backward
+from msprobe.core.common.decorator import recursion_depth_decorator
 
 
 def check_tensor_overflow(x):
@@ -75,6 +77,7 @@ def check_data_overflow(x, device):
         return torch_npu.npu.utils.npu_check_overflow(x)
 
 
+@recursion_depth_decorator("is_bool_output")
 def is_bool_output(x):
     if isinstance(x, (tuple, list)):
         if not x:
@@ -91,6 +94,7 @@ def run_overflow_check(forward_file):
     dump_path = os.path.dirname(forward_file)
     real_data_path = os.path.join(dump_path, Const.DUMP_TENSOR_DATA)
     for api_full_name, api_info_dict in tqdm(forward_content.items()):
+        check_op_str_pattern_valid(api_full_name)
         if is_unsupported_api(api_full_name, is_overflow_check=True):
             continue
         try:
@@ -161,6 +165,7 @@ def _run_overflow_check(parser=None):
     _run_overflow_check_parser(parser)
     args = parser.parse_args(sys.argv[1:])
     _run_overflow_check_command(args)
+    logger.info("UT task completed.")
 
 
 def _run_overflow_check_command(args):
@@ -175,8 +180,3 @@ def _run_overflow_check_command(args):
         logger.error(f"Set NPU device id failed. device id is: {args.device_id}")
         raise NotImplementedError from error
     run_overflow_check(api_info)
-
-
-if __name__ == '__main__':
-    _run_overflow_check()
-    logger.info("UT task completed.")
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py

@@ -49,7 +49,7 @@ from msprobe.core.common.file_utils import FileChecker, change_mode, \
 from msprobe.pytorch.common.log import logger
 from msprobe.pytorch.pt_config import parse_json_config
 from msprobe.core.common.const import Const, FileCheckConst, CompareConst
-from msprobe.core.common.utils import safe_get_value, CompareException
+from msprobe.core.common.utils import safe_get_value, CompareException, is_int, check_op_str_pattern_valid
 from msprobe.pytorch.common.utils import seed_all
 from msprobe.pytorch.api_accuracy_checker.tensor_transport_layer.attl import ATTL, ATTLConfig, move2device_exec
 from msprobe.pytorch.api_accuracy_checker.tensor_transport_layer.device_dispatch import ConsumerDispatcher
@@ -65,6 +65,7 @@ DETAILS_FILE_NAME = "accuracy_checking_details_" + current_time + ".csv"
 
 not_backward_list = ['repeat_interleave']
 unsupported_backward_list = ['masked_select']
+unsupported_api_list = ["to"]
 
 
 tqdm_params = {
@@ -83,6 +84,9 @@ tqdm_params = {
 }
 
 
+seed_all()
+
+
 def run_ut(config):
     logger.info("start UT test")
     if config.online_config.is_online:
@@ -93,7 +97,7 @@ def run_ut(config):
     logger.info(f"UT task details will be saved in {config.details_csv_path}")
 
     if config.save_error_data:
-        logger.info(f"UT task error_datas will be saved in {config.error_data_path}")
+        logger.info(f"UT task error_data will be saved in {config.error_data_path}")
     compare = Comparator(config.result_csv_path, config.details_csv_path, config.is_continue_run_ut, config=config)
 
     if config.online_config.is_online:
@@ -117,6 +121,7 @@ def run_ut(config):
 def run_api_offline(config, compare, api_name_set):
     err_column = CompareColumn()
     for _, (api_full_name, api_info_dict) in enumerate(tqdm(config.forward_content.items(), **tqdm_params)):
+        check_op_str_pattern_valid(api_full_name)
         if api_full_name in api_name_set:
             continue
         if is_unsupported_api(api_full_name):
@@ -218,6 +223,7 @@ def blacklist_and_whitelist_filter(api_name, black_list, white_list):
     If api is both in black_list and black_list, black_list first.
     return: False for exec api, True for not exec
     """
+    black_list.extend(unsupported_api_list)
     if black_list and api_name in black_list:
         return True
     if white_list and api_name not in white_list:
@@ -317,7 +323,8 @@ def run_torch_api_online(api_full_name, api_data, backward_content):
     if kwargs.get("device"):
         del kwargs["device"]
 
-    device_out = exec_api(api_type, api_name, Const.CUDA_LOWERCASE, args, kwargs)
+    device_exec_params = ExecParams(api_type, api_name, current_device, args, kwargs, False, None)
+    device_out = exec_api(device_exec_params)
     device_out = move2device_exec(device_out, "cpu")
     return UtDataInfo(None, None, out, device_out, None, in_fwd_data_list, None, rank=api_data.rank)
 
@@ -344,6 +351,9 @@ def need_to_backward(grad_index, out):
 
 def run_backward(args, grad, grad_index, out):
     if grad_index is not None:
+        if not is_int(grad_index):
+            logger.error(f"{grad_index} dtype is not int")
+            raise TypeError(f"{grad_index} dtype is not int")
         if grad_index >= len(out):
             logger.error(f"Run backward error when grad_index is {grad_index}")
             raise IndexError(f"Run backward error when grad_index is {grad_index}")
@@ -430,6 +440,7 @@ def preprocess_forward_content(forward_content):
     arg_cache = {}
 
     for key, value in forward_content.items():
+        check_op_str_pattern_valid(key)
         base_key = key.rsplit(Const.SEP, 1)[0]
 
         if key not in arg_cache:
@@ -469,6 +480,7 @@ def _run_ut(parser=None):
     _run_ut_parser(parser)
     args = parser.parse_args(sys.argv[1:])
     run_ut_command(args)
+
 
 
 def checked_online_config(online_config):
@@ -492,6 +504,7 @@ def checked_online_config(online_config):
     check_file_or_directory_path(os.path.join(online_config.tls_path, "server.key"))
     check_file_or_directory_path(os.path.join(online_config.tls_path, "server.crt"))
     check_crt_valid(os.path.join(online_config.tls_path, "server.crt"))
+    check_crt_valid(os.path.join(online_config.tls_path, "server.key"), True)
 
     # host and port
     if not isinstance(online_config.host, str) or not re.match(Const.ipv4_pattern, online_config.host):
@@ -561,7 +574,14 @@ def run_ut_command(args):
     error_data_path = checker_config.error_data_path
     if save_error_data:
         if args.result_csv_path:
-            time_info = result_csv_path.split('.')[0].split('_')[-1]
+            parts_by_dot = result_csv_path.split(Const.SEP)
+            if len(parts_by_dot) < 2 or not parts_by_dot[0]:
+                raise ValueError("result_csv_path does not contain a valid file name with an extension.")
+            file_name_part = parts_by_dot[0]
+            parts_by_underscore = file_name_part.split(Const.REPLACEMENT_CHARACTER)
+            if len(parts_by_underscore) < 2:
+                raise ValueError("File name part does not contain enough '_' separated segments.")
+            time_info = parts_by_underscore[-1]
         global UT_ERROR_DATA_DIR
         UT_ERROR_DATA_DIR = 'ut_error_data' + time_info
         error_data_path = initialize_save_error_data(error_data_path)
@@ -579,9 +599,8 @@ def run_ut_command(args):
     }
     run_ut_config = checker_config.get_run_ut_config(**config_params)
     run_ut(run_ut_config)
+    logger.info("UT task completed.")
 
 
 if __name__ == '__main__':
-    seed_all()
     _run_ut()
-    logger.info("UT task completed.")
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py

@@ -1,9 +1,7 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
+# Copyright (c) 2024-2025, Huawei Technologies Co., Ltd.
 # All rights reserved.
 #
-# Licensed under the Apache License, Version 2.0 (the "License");
+# Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
@@ -18,8 +16,8 @@
 import os
 from collections import namedtuple
 import re
-import torch
 
+import torch
 try:
     import torch_npu
 except ImportError:
@@ -33,11 +31,9 @@ from msprobe.core.common.const import FileCheckConst, Const, CompareConst
 from msprobe.core.common.file_utils import FileChecker
 from msprobe.core.common.log import logger
 from msprobe.core.common.utils import CompareException
+from msprobe.pytorch.hook_module.api_register import ApiTemplate, get_api_register
 from msprobe.pytorch.hook_module.wrap_aten import AtenOPTemplate
-from msprobe.pytorch.hook_module.wrap_functional import FunctionalOPTemplate
-from msprobe.pytorch.hook_module.wrap_npu_custom import NpuOPTemplate
-from msprobe.pytorch.hook_module.wrap_tensor import TensorOPTemplate
-from msprobe.pytorch.hook_module.wrap_torch import TorchOPTemplate
+
 
 hf_32_standard_api = ["conv1d", "conv2d"]
 not_detach_set = {'resize_', 'resize_as_', 'set_', 'transpose_', 't_', 'squeeze_', 'unsqueeze_'}
@@ -108,17 +104,30 @@ def exec_api(exec_params):
     kwargs = exec_params.kwargs
     is_autocast = exec_params.is_autocast
     autocast_dtype = exec_params.autocast_dtype
-
-    if api_type == "Functional":
-        torch_api = FunctionalOPTemplate(api_name, str, False)
-    if api_type == "Tensor":
-        torch_api = TensorOPTemplate(api_name, str, False)
-    if api_type == "Torch":
-        torch_api = TorchOPTemplate(api_name, str, False)
-    if api_type == "Aten":
+    out = None
+
+    prefix_map = Const.API_DATA_PREFIX.get(Const.PT_FRAMEWORK, {})
+    if not prefix_map or api_type not in prefix_map.values() or \
+            api_type not in (
+                Const.FUNCTIONAL_API_TYPE_PREFIX,
+                Const.TENSOR_API_TYPE_PREFIX,
+                Const.TORCH_API_TYPE_PREFIX,
+                Const.ATEN_API_TYPE_PREFIX,
+                Const.NPU_API_TYPE_PREFIX
+            ):
+        return out
+
+    if api_type == Const.ATEN_API_TYPE_PREFIX:
         torch_api = AtenOPTemplate(api_name, None, False)
-    if api_type == "NPU":
-        torch_api = NpuOPTemplate(api_name, None, False, device)
+    else:
+        api_register = get_api_register()
+        api_register.initialize_hook(None)
+        api_func_type = list(prefix_map.keys())[list(prefix_map.values()).index(api_type)]
+        api_func = api_register.ori_api_attr.get(Const.PT_FRAMEWORK + Const.SEP + api_func_type, {}).get(api_name)
+        if api_func is None:
+            return out
+
+        torch_api = ApiTemplate(api_name, api_func, api_type, None, need_hook=False, device=device)
     if is_autocast:
         with autocast(dtype=autocast_dtype):
             out = torch_api.forward(*args, **kwargs)
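run_ut.py now builds a single ExecParams bundle and hands it to exec_api, which resolves the original callable through the API register instead of the removed per-type OP templates. A hedged sketch of a call, assuming ExecParams is defined next to exec_api in run_ut_utils and that a plain CPU run needs no autocast; shapes and the device string are illustrative:

# Field order follows the construction shown in the run_ut.py hunk:
# (api_type, api_name, device, args, kwargs, is_autocast, autocast_dtype).
import torch

from msprobe.core.common.const import Const
from msprobe.pytorch.api_accuracy_checker.run_ut.run_ut_utils import ExecParams, exec_api

args = (torch.randn(4, 4),)
params = ExecParams(Const.FUNCTIONAL_API_TYPE_PREFIX, "relu", "cpu", args, {}, False, None)
out = exec_api(params)  # returns None when the API type or name cannot be resolved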
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py

@@ -27,6 +27,7 @@ from msprobe.pytorch.api_accuracy_checker.tensor_transport_layer.client import T
 from msprobe.pytorch.api_accuracy_checker.tensor_transport_layer.server import TCPServer
 from msprobe.core.common.file_utils import remove_path
 from msprobe.pytorch.common.utils import logger, save_api_data, load_api_data, save_pkl, load_pkl
+from msprobe.core.common.decorator import recursion_depth_decorator
 
 BufferType = Union[ApiData, Dict[str, Any], str]  # Union[Tensor, Tuple[Optional[Tensor]]]
 
@@ -168,11 +169,12 @@ class ATTL:
         return buffer
 
 
+@recursion_depth_decorator("move2device_exec")
 def move2device_exec(obj, device):
     if isinstance(obj, (tuple, list)):
         data_list = [move2device_exec(val, device) for val in obj]
         return data_list if isinstance(obj, list) else tuple(data_list)
-    if isinstance(obj, dict):
+    if isinstance(obj, dict):
         return {key: move2device_exec(val, device) for key, val in obj.items()}
     elif isinstance(obj, torch.Tensor):
         obj = obj.detach()
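Several hunks in this release apply the decorator that moved into the new msprobe.core.common.decorator module, tagging recursive helpers by name. A small hedged sketch of the usage pattern; the depth-limiting behaviour is inferred from the decorator's name and is not documented in this diff:

from msprobe.core.common.decorator import recursion_depth_decorator


@recursion_depth_decorator("example.flatten")
def flatten(nested):
    # Walk arbitrarily nested lists/tuples; the decorator is assumed to guard
    # against runaway recursion on malformed, self-referencing inputs.
    if isinstance(nested, (list, tuple)):
        return [leaf for item in nested for leaf in flatten(item)]
    return [nested]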
msprobe/pytorch/bench_functions/moe_gating_top_k_softmax.py

@@ -29,6 +29,8 @@ def softmax_func(x, axis=None):
 
 def npu_moe_gating_top_k_softmax(x, finished_optional, k):
     input_dtype = x.dtype
+    if x.dim() < 1:
+        raise ValueError("Input x must have at least 1 dimensions.")
     num_expert = x.shape[-1]
     softmax = softmax_func(x, -1)
     softmax = softmax.to(input_dtype)
@@ -36,9 +38,13 @@ def npu_moe_gating_top_k_softmax(x, finished_optional, k):
     expert_idx = expert_idx[:, :k]
     y = torch.gather(softmax, index=expert_idx, dim=-1)
     if finished_optional is not None:
+        if finished_optional.dim() < 1:
+            raise ValueError("Finished_optional must have at least 1 dimensions.")
         finished_optional = finished_optional.view(finished_optional.shape[0], 1)
         finished_optional = finished_optional.expand(-1, k)
         expert_idx = torch.where(finished_optional, num_expert, expert_idx)
+    if y.dim() < 2:
+        raise ValueError("Variable y must have at least 2 dimensions.")
     row_idx = torch.arange(y.shape[0] * y.shape[1]).reshape(y.shape[1], y.shape[0]).t()
 
     return y, expert_idx, row_idx
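The added dimension checks guard the top-k gather against degenerate inputs. A small sketch of the expected call, with shapes chosen only for illustration (8 tokens routed over 16 experts, top-2 gating):

import torch

from msprobe.pytorch.bench_functions.moe_gating_top_k_softmax import npu_moe_gating_top_k_softmax

x = torch.randn(8, 16)                       # (num_tokens, num_experts) router logits
finished = torch.zeros(8, dtype=torch.bool)  # finished tokens have their expert index masked to num_experts
y, expert_idx, row_idx = npu_moe_gating_top_k_softmax(x, finished, 2)
print(y.shape, expert_idx.shape, row_idx.shape)  # each is (8, 2)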