PyPI - mindstudio-probe - Versions diffs - 1.2.1__py3-none-any.whl → 1.3.0__py3-none-any.whl - Mend

mindstudio-probe 1.2.1-py3-none-any.whl → 1.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (177) hide show

{mindstudio_probe-1.2.1.dist-info → mindstudio_probe-1.3.0.dist-info}/METADATA +3 -3
{mindstudio_probe-1.2.1.dist-info → mindstudio_probe-1.3.0.dist-info}/RECORD +168 -150
msprobe/README.md +27 -22
msprobe/core/common/const.py +129 -60
msprobe/core/common/decorator.py +50 -0
msprobe/core/common/exceptions.py +3 -1
msprobe/core/common/file_utils.py +25 -2
msprobe/core/common/inplace_ops.yaml +1 -0
msprobe/core/common/utils.py +43 -33
msprobe/core/compare/acc_compare.py +43 -74
msprobe/core/compare/check.py +2 -6
msprobe/core/compare/highlight.py +2 -0
msprobe/core/compare/layer_mapping/data_scope_parser.py +1 -1
msprobe/core/compare/layer_mapping/layer_mapping.py +2 -1
msprobe/core/compare/merge_result/merge_result.py +16 -9
msprobe/core/compare/merge_result/utils.py +81 -0
msprobe/core/compare/multiprocessing_compute.py +19 -12
msprobe/core/compare/npy_compare.py +30 -12
msprobe/core/compare/utils.py +30 -10
msprobe/core/data_dump/api_registry.py +176 -0
msprobe/core/data_dump/data_collector.py +58 -13
msprobe/core/data_dump/data_processor/base.py +94 -10
msprobe/core/data_dump/data_processor/factory.py +3 -0
msprobe/core/data_dump/data_processor/mindspore_processor.py +33 -33
msprobe/core/data_dump/data_processor/pytorch_processor.py +99 -18
msprobe/core/data_dump/json_writer.py +61 -40
msprobe/core/grad_probe/constant.py +1 -0
msprobe/core/grad_probe/grad_compare.py +1 -1
msprobe/core/overflow_check/abnormal_scene.py +2 -0
msprobe/docs/01.installation.md +27 -1
msprobe/docs/02.config_introduction.md +27 -23
msprobe/docs/03.config_examples.md +24 -0
msprobe/docs/05.data_dump_PyTorch.md +103 -16
msprobe/docs/06.data_dump_MindSpore.md +76 -32
msprobe/docs/07.accuracy_checker_PyTorch.md +11 -1
msprobe/docs/08.accuracy_checker_online_PyTorch.md +3 -1
msprobe/docs/09.accuracy_checker_MindSpore.md +5 -3
msprobe/docs/10.accuracy_compare_PyTorch.md +59 -33
msprobe/docs/11.accuracy_compare_MindSpore.md +40 -16
msprobe/docs/12.overflow_check_PyTorch.md +3 -1
msprobe/docs/13.overflow_check_MindSpore.md +4 -2
msprobe/docs/14.data_parse_PyTorch.md +1 -7
msprobe/docs/18.online_dispatch.md +1 -1
msprobe/docs/19.monitor.md +332 -273
msprobe/docs/21.visualization_PyTorch.md +42 -13
msprobe/docs/22.visualization_MindSpore.md +43 -13
msprobe/docs/23.generate_operator_PyTorch.md +9 -9
msprobe/docs/27.dump_json_instruction.md +301 -27
msprobe/docs/28.debugger_save_instruction.md +94 -0
msprobe/docs/28.kernel_dump_MindSpore.md +69 -0
msprobe/docs/29.data_dump_MSAdapter.md +229 -0
msprobe/docs/30.overflow_check_MSAdapter.md +31 -0
msprobe/docs/FAQ.md +3 -11
msprobe/docs/img/compare_result.png +0 -0
msprobe/docs/img/merge_result.png +0 -0
msprobe/docs/img/monitor/step_count_per_record.png +0 -0
msprobe/docs/img/visualization/vis_browser_1.png +0 -0
msprobe/docs/img/visualization/vis_match_info.png +0 -0
msprobe/docs/img/visualization/vis_precision_info.png +0 -0
msprobe/docs/img/visualization/vis_search_info.png +0 -0
msprobe/docs/img/visualization/vis_show_info.png +0 -0
msprobe/docs/img/visualization/vis_showcase.png +0 -0
msprobe/docs/img/visualization/vis_unmatch_info.png +0 -0
msprobe/mindspore/__init__.py +4 -2
msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +32 -7
msprobe/mindspore/api_accuracy_checker/api_runner.py +70 -22
msprobe/mindspore/api_accuracy_checker/base_compare_algorithm.py +2 -1
msprobe/mindspore/api_accuracy_checker/bench_functions/flash_attention_score.py +602 -0
msprobe/mindspore/api_accuracy_checker/bench_functions/fusion_operator.py +41 -0
msprobe/mindspore/api_accuracy_checker/compute_element.py +47 -1
msprobe/mindspore/api_accuracy_checker/data_manager.py +2 -1
msprobe/mindspore/api_accuracy_checker/multi_api_accuracy_checker.py +2 -1
msprobe/mindspore/api_accuracy_checker/torch_mindtorch_importer.py +130 -0
msprobe/mindspore/api_accuracy_checker/type_mapping.py +24 -1
msprobe/mindspore/api_accuracy_checker/utils.py +6 -1
msprobe/mindspore/common/const.py +61 -0
msprobe/mindspore/common/utils.py +48 -18
msprobe/mindspore/compare/ms_compare.py +27 -19
msprobe/mindspore/compare/ms_graph_compare.py +6 -5
msprobe/mindspore/debugger/debugger_config.py +31 -6
msprobe/mindspore/debugger/precision_debugger.py +45 -14
msprobe/mindspore/dump/dump_tool_factory.py +5 -3
msprobe/mindspore/dump/hook_cell/api_register.py +142 -0
msprobe/mindspore/dump/hook_cell/hook_cell.py +9 -10
msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml +24 -26
msprobe/mindspore/dump/jit_dump.py +21 -15
msprobe/mindspore/dym_loader/hook_dynamic_loader.cc +22 -56
msprobe/mindspore/dym_loader/hook_dynamic_loader.h +0 -1
msprobe/mindspore/free_benchmark/api_pynative_self_check.py +10 -6
msprobe/mindspore/free_benchmark/perturbation/perturbation_factory.py +4 -2
msprobe/mindspore/free_benchmark/self_check_tool_factory.py +6 -3
msprobe/mindspore/grad_probe/global_context.py +2 -0
msprobe/mindspore/grad_probe/grad_analyzer.py +2 -1
msprobe/mindspore/grad_probe/hook.py +2 -4
msprobe/mindspore/monitor/anomaly_detect.py +404 -0
msprobe/mindspore/monitor/distributed/__init__.py +0 -0
msprobe/mindspore/monitor/distributed/distributed_ops.yaml +15 -0
msprobe/mindspore/monitor/distributed/stack_blacklist.yaml +5 -0
msprobe/mindspore/monitor/distributed/wrap_distributed.py +300 -0
msprobe/mindspore/monitor/features.py +63 -0
msprobe/mindspore/monitor/module_hook.py +873 -0
msprobe/mindspore/monitor/module_spec_verifier.py +94 -0
msprobe/mindspore/monitor/utils.py +309 -0
msprobe/mindspore/ms_config.py +8 -2
msprobe/mindspore/overflow_check/overflow_check_tool_factory.py +5 -3
msprobe/mindspore/service.py +114 -34
msprobe/pytorch/__init__.py +0 -1
msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +3 -6
msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +12 -7
msprobe/pytorch/api_accuracy_checker/generate_op_script/operator_replication.template +2 -2
msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +4 -5
msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +5 -5
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +25 -6
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py +28 -19
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py +3 -1
msprobe/pytorch/bench_functions/apply_adam.py +215 -0
msprobe/pytorch/bench_functions/group_norm_silu.py +27 -0
msprobe/pytorch/{parse.py → bench_functions/mish.py} +6 -4
msprobe/pytorch/bench_functions/moe_gating_top_k_softmax.py +50 -0
msprobe/pytorch/bench_functions/sort_v2.py +21 -0
msprobe/pytorch/common/utils.py +97 -4
msprobe/pytorch/debugger/debugger_config.py +19 -9
msprobe/pytorch/debugger/precision_debugger.py +24 -1
msprobe/pytorch/dump/module_dump/module_dump.py +4 -3
msprobe/pytorch/dump/module_dump/module_processer.py +21 -35
msprobe/pytorch/free_benchmark/common/utils.py +1 -1
msprobe/pytorch/free_benchmark/compare/single_benchmark.py +1 -1
msprobe/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py +3 -3
msprobe/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py +3 -3
msprobe/pytorch/free_benchmark/perturbed_layers/npu/change_value.py +1 -1
msprobe/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py +1 -1
msprobe/pytorch/free_benchmark/result_handlers/check_handler.py +1 -1
msprobe/pytorch/function_factory.py +8 -2
msprobe/pytorch/grad_probe/grad_monitor.py +2 -2
msprobe/pytorch/hook_module/api_register.py +131 -0
msprobe/pytorch/hook_module/hook_module.py +19 -14
msprobe/pytorch/hook_module/register_optimizer_hook.py +2 -1
msprobe/pytorch/hook_module/support_wrap_ops.yaml +173 -75
msprobe/pytorch/monitor/anomaly_detect.py +14 -29
msprobe/pytorch/monitor/csv2tb.py +18 -14
msprobe/pytorch/monitor/distributed/wrap_distributed.py +8 -2
msprobe/pytorch/monitor/module_hook.py +238 -193
msprobe/pytorch/monitor/module_metric.py +9 -6
msprobe/pytorch/monitor/optimizer_collect.py +100 -67
msprobe/pytorch/monitor/unittest/test_monitor.py +1 -1
msprobe/pytorch/monitor/utils.py +76 -44
msprobe/pytorch/online_dispatch/compare.py +0 -2
msprobe/pytorch/online_dispatch/dispatch.py +9 -0
msprobe/pytorch/online_dispatch/dump_compare.py +3 -0
msprobe/pytorch/online_dispatch/utils.py +3 -0
msprobe/pytorch/parse_tool/lib/interactive_cli.py +1 -6
msprobe/pytorch/parse_tool/lib/utils.py +2 -1
msprobe/pytorch/pt_config.py +30 -29
msprobe/pytorch/service.py +114 -32
msprobe/visualization/builder/graph_builder.py +75 -10
msprobe/visualization/builder/msprobe_adapter.py +7 -6
msprobe/visualization/compare/graph_comparator.py +42 -38
msprobe/visualization/compare/mode_adapter.py +0 -19
msprobe/visualization/graph/base_node.py +11 -3
msprobe/visualization/graph/distributed_analyzer.py +71 -3
msprobe/visualization/graph/graph.py +0 -11
msprobe/visualization/graph/node_op.py +4 -3
msprobe/visualization/graph_service.py +4 -5
msprobe/visualization/utils.py +12 -35
msprobe/mindspore/dump/hook_cell/api_registry.py +0 -205
msprobe/mindspore/dump/hook_cell/wrap_api.py +0 -212
msprobe/pytorch/hook_module/api_registry.py +0 -166
msprobe/pytorch/hook_module/wrap_distributed.py +0 -75
msprobe/pytorch/hook_module/wrap_functional.py +0 -66
msprobe/pytorch/hook_module/wrap_npu_custom.py +0 -85
msprobe/pytorch/hook_module/wrap_tensor.py +0 -69
msprobe/pytorch/hook_module/wrap_torch.py +0 -84
msprobe/pytorch/hook_module/wrap_vf.py +0 -60
{mindstudio_probe-1.2.1.dist-info → mindstudio_probe-1.3.0.dist-info}/LICENSE +0 -0
{mindstudio_probe-1.2.1.dist-info → mindstudio_probe-1.3.0.dist-info}/WHEEL +0 -0
{mindstudio_probe-1.2.1.dist-info → mindstudio_probe-1.3.0.dist-info}/entry_points.txt +0 -0
{mindstudio_probe-1.2.1.dist-info → mindstudio_probe-1.3.0.dist-info}/top_level.txt +0 -0

msprobe/mindspore/service.py CHANGED Viewed

@@ -22,6 +22,7 @@ import mindspore as ms
 from mindspore import nn
 from mindspore.common.api import _no_grad
 from mindspore.ops.primitive import Primitive
 try:
     from mindspore.common._pijit_context import PIJitCaptureContext
 except ImportError:
@@ -31,7 +32,7 @@ else:
 from msprobe.core.common.exceptions import DistributedNotInitializedError, MsprobeException
 from msprobe.core.common.file_utils import create_directory
-from msprobe.core.common.utils import Const, print_tools_ends_info
+from msprobe.core.common.utils import Const, print_tools_ends_info, DumpPathAggregation
 from msprobe.core.data_dump.data_collector import build_data_collector
 from msprobe.core.data_dump.data_processor.base import (ModuleBackwardInputsOutputs, ModuleForwardInputsOutputs,
                                                         ModuleBackwardInputs)
@@ -40,7 +41,7 @@ from msprobe.mindspore.cell_processor import CellProcessor
 from msprobe.mindspore.common.log import logger
 from msprobe.mindspore.common.utils import (get_rank_if_initialized, clean_input_kwargs,
                                             is_mindtorch, register_backward_hook_functions)
-from msprobe.mindspore.dump.hook_cell.api_registry import api_register
+from msprobe.mindspore.dump.hook_cell.api_register import get_api_register
 from msprobe.mindspore.dump.hook_cell.primitive_hooks import PrimitiveHookService
 from msprobe.mindspore.dump.jit_dump import JitDump
 from msprobe.mindspore.dump.hook_cell.hook_cell import HOOKCell
@@ -62,14 +63,19 @@ class Service:
         self.inner_switch = False
         self.primitive_switch = False
         self.current_iter = 0
+        self.loop = 0
+        self.init_step = 0
         self.first_start = True
         self.current_rank = None
         self.dump_iter_dir = None
         self.start_call = False
         self.should_stop_service = False
         self.params_grad_info = {}
+        self.hook_handle_dict = {}
         # 提前注册，确保注册尽可能多的API hook
+        self.api_register = get_api_register()
         self.register_api_hook()
+        self.init_for_debug_level()
     @staticmethod
     def check_model_valid(models):
@@ -138,7 +144,12 @@ class Service:
             if not (Const.FORWARD in self.config.data_mode and Const.BACKWARD not in self.config.data_mode):
                 for param_name, param in params_dict.items():
                     if param.requires_grad:
-                        param.register_hook(grad_hook(cell, ori_name, param_name))
+                        name = ori_name + Const.SEP + param_name
+                        old_handle = self.hook_handle_dict.get(name)
+                        if old_handle and hasattr(old_handle, "remove"):
+                            old_handle.remove()
+                        handle = param.register_hook(grad_hook(cell, ori_name, param_name))
+                        self.hook_handle_dict[name] = handle
         def init_params_grad_info(cell, params_dict):
             '''
@@ -168,11 +179,15 @@ class Service:
                 module_input_output = self.prepare_module_input_output(target_type, cell, input_data, output)
                 if target_type == BaseScope.Module_Type_Module:
                     api_or_cell_name = self.cell_processor.set_and_get_reserved_name(cell, api_or_cell_name)
-                    params_dict = {key.split(Const.SEP)[-1]: value for key, value in cell.parameters_dict(
-                        recurse=False).items()}
-                    setattr(module_input_output, Const.PARAMS, params_dict)
+                    params_dict = {}
+                    if self.config.task != Const.STRUCTURE:
+                        params_dict = {
+                            key.split(Const.SEP)[-1]: value
+                            for key, value in cell.parameters_dict(recurse=False).items()
+                        }
+                        setattr(module_input_output, Const.PARAMS, params_dict)
                     # 判断是否需要注册参数hook
-                    if not hasattr(cell, 'params_grad_name') and params_dict:
+                    if params_dict:
                         ori_name = api_or_cell_name.rsplit(Const.SEP, 2)[0]
                         grad_name = ori_name + Const.SEP + Const.PARAMS_GRAD
                         # 首次执行前向hook时，添加params_grad_name属性，并注册参数hook
@@ -257,15 +272,33 @@ class Service:
             self.primitive_counters[primitive_name] += 1
     def step(self):
+        if self.config.level == Const.LEVEL_DEBUG:
+            return
         if self.config.async_dump:
             self.data_collector.fill_stack_tensor_data()
-            self.data_collector.data_processor.dump_async_data()
+            if self.config.task == Const.TENSOR:
+                self.data_collector.data_processor.dump_async_data()
         self.data_collector.write_json()
-        self.current_iter += 1
-        self.data_collector.update_iter(self.current_iter)
+        self.loop += 1
         self.reset_status()
     def start(self, model=None):
+        if self.current_iter == 0:
+            if self.config.level in [Const.LEVEL_MIX, Const.LEVEL_L1]:
+                JitDump.set_config(self.config)
+                JitDump.set_data_collector(self.data_collector)
+                if hasattr(ms.common.api, "_MindsporeFunctionExecutor"):
+                    ms.common.api._MindsporeFunctionExecutor = JitDump
+                else:
+                    ms.common.api._JitExecutor = JitDump
+                ms.common.api._PyNativeExecutor.grad = JitDump.grad
+                if pijit_label:
+                    PIJitCaptureContext.__enter__ = self.empty
+                    PIJitCaptureContext.__exit__ = self.empty
+        self.current_iter = self.loop + self.init_step
+        self.data_collector.update_iter(self.current_iter)
+        if self.config.level == Const.LEVEL_DEBUG:
+            return
         self.start_call = True
         if self.should_stop_service:
             return
@@ -276,6 +309,7 @@ class Service:
             print_tools_ends_info()
             return
         if self.config.step and self.current_iter not in self.config.step:
+            JitDump.jit_dump_switch = False
             return
         self.model = self.check_model_valid(model)
@@ -291,17 +325,9 @@ class Service:
                 return
             self.register_primitive_hook()
             self.register_cell_hook()
-            if self.config.level in [Const.LEVEL_MIX, Const.LEVEL_L1]:
-                JitDump.set_config(self.config)
-                JitDump.set_data_collector(self.data_collector)
-                ms.common.api._MindsporeFunctionExecutor = JitDump
-                ms.common.api._PyNativeExecutor.grad = JitDump.grad
-                if pijit_label:
-                    PIJitCaptureContext.__enter__ = self.empty
-                    PIJitCaptureContext.__exit__ = self.empty
             self.first_start = False
-        api_register.api_set_hook_func()
+        self.api_register.register_all_api()
         self.switch = True
         self.primitive_switch = True
         logger.info(f"Dump switch is turned on at step {self.current_iter}. ")
@@ -310,6 +336,8 @@ class Service:
         JitDump.jit_dump_switch = True
     def stop(self):
+        if self.config.level == Const.LEVEL_DEBUG:
+            return
         if self.should_stop_service:
             return
         logger.info(f"{Const.TOOL_NAME}: debugger.stop() is set successfully. "
@@ -326,7 +354,8 @@ class Service:
         self.start_call = False
         if self.config.async_dump:
             self.data_collector.fill_stack_tensor_data()
-            self.data_collector.data_processor.dump_async_data()
+            if self.config.task == Const.TENSOR:
+                self.data_collector.data_processor.dump_async_data()
         self.data_collector.write_json()
         JitDump.jit_dump_switch = False
@@ -370,12 +399,13 @@ class Service:
         else:
             dump_data_dir = None
-        dump_file_path = os.path.join(dump_dir, "dump.json")
-        stack_file_path = os.path.join(dump_dir, "stack.json")
-        construct_file_path = os.path.join(dump_dir, "construct.json")
-        self.data_collector.update_dump_paths(
-            dump_file_path, stack_file_path, construct_file_path, dump_data_dir, None
-        )
+        dump_path_aggregation = DumpPathAggregation()
+        dump_path_aggregation.dump_file_path = os.path.join(dump_dir, "dump.json")
+        dump_path_aggregation.stack_file_path = os.path.join(dump_dir, "stack.json")
+        dump_path_aggregation.construct_file_path = os.path.join(dump_dir, "construct.json")
+        dump_path_aggregation.dump_tensor_data_dir = dump_data_dir
+        self.data_collector.update_dump_paths(dump_path_aggregation)
         self.data_collector.initialize_json_file(
             framework=Const.MT_FRAMEWORK if is_mindtorch() else Const.MS_FRAMEWORK
         )
@@ -386,21 +416,21 @@ class Service:
     def register_api_hook(self):
         if self.config.level in [Const.LEVEL_MIX, Const.LEVEL_L1, Const.LEVEL_L2]:
             logger.info(f"The api {self.config.task} hook function is successfully mounted to the model.")
-            api_register.initialize_hook(functools.partial(self.build_hook, BaseScope.Module_Type_API))
-            api_register.api_set_hook_func()
+            self.api_register.initialize_hook(functools.partial(self.build_hook, BaseScope.Module_Type_API))
+            self.api_register.register_all_api()
     def get_cells_and_names(self):
         cells_and_names_with_index = {}
         def get_cell_or_module(model):
             return model.named_modules() if is_mindtorch() else model.cells_and_names()
         if isinstance(self.model, (list, tuple)):
             for index, model in enumerate(self.model):
                 cells_and_names_with_index[str(index)] = get_cell_or_module(model)
         else:
             cells_and_names_with_index["-1"] = get_cell_or_module(self.model)
-        return cells_and_names_with_index
+        return cells_and_names_with_index
     def register_primitive_hook(self):
         if self.config.level not in [Const.LEVEL_MIX, Const.LEVEL_L1]:
@@ -430,7 +460,7 @@ class Service:
             if not self.model:
                 raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR,
                                        f"The current level is {self.config.level}, the model cannot be None")
-            model_type = Const.MODULE if is_mindtorch() else Const.CELL
+            model_type = Const.MODULE if is_mindtorch() else Const.CELL
             cells_and_names_with_index = self.get_cells_and_names()
             for index, cells_and_names in cells_and_names_with_index.items():
@@ -439,7 +469,7 @@ class Service:
                     if cell == model:
                         continue
                     cell_index = (index + Const.SEP) if index != "-1" else ""
-                    prefix = (model_type + Const.SEP + cell_index + name +
+                    prefix = (model_type + Const.SEP + cell_index + name +
                               Const.SEP + cell.__class__.__name__ + Const.SEP)
                     _, forward_hook, backward_hook, _ = self.build_hook(BaseScope.Module_Type_Module, prefix)
                     cell.register_forward_hook(forward_hook)
@@ -456,10 +486,9 @@ class Service:
     def reset_status(self):
         self.primitive_hook_service.primitive_counters.clear()
-        self.data_collector.data_writer.reset_cache()
+        self.data_collector.reset_status()
         JitDump.jit_count = defaultdict(int)
         self.params_grad_info.clear()
         if self.config.level == Const.LEVEL_L2:
             self.data_collector.data_processor.reset_status()
             return
@@ -467,3 +496,54 @@ class Service:
             return
         if self.config.rank and self.current_rank not in self.config.rank:
             return
+    def init_for_debug_level(self):
+        if not (self.config.level == Const.LEVEL_DEBUG and self.config.task in [Const.TENSOR, Const.STATISTICS]):
+            return
+        try:
+            self.current_rank = get_rank_if_initialized()
+        except DistributedNotInitializedError:
+            self.current_rank = None
+        # dir: dump_path -- rank{} -- debug.json
+        self.dump_iter_dir = self.config.dump_path
+        cur_rank = self.current_rank if self.current_rank is not None else ''
+        dump_dir = os.path.join(self.dump_iter_dir, f"rank{cur_rank}")
+        create_directory(dump_dir)
+        if self.config.task in self.data_collector.tasks_need_tensor_data:
+            dump_data_dir = os.path.join(dump_dir, "dump_tensor_data")
+            create_directory(dump_data_dir)
+        else:
+            dump_data_dir = None
+        dump_path_aggregation = DumpPathAggregation()
+        dump_path_aggregation.dump_tensor_data_dir = dump_data_dir
+        dump_path_aggregation.debug_file_path = os.path.join(dump_dir, "debug.json")
+        self.data_collector.update_dump_paths(dump_path_aggregation)
+        self.data_collector.initialize_json_file(
+            framework=Const.MT_FRAMEWORK if is_mindtorch() else Const.MS_FRAMEWORK
+        )
+        self.debug_variable_counter = defaultdict(int)
+    def save(self, variable, name, save_backward):
+        '''
+        Args:
+            variable: Union[List[variable], dict{str: variable}, mindspore.tensor, str, float, int]
+            name: str
+            save_backward: boolean
+        Return:
+            void
+        '''
+        if self.config.level != Const.LEVEL_DEBUG:
+            return
+        count = self.debug_variable_counter[name]
+        self.debug_variable_counter[name] += 1
+        name_with_count = f"{name}.{count}"
+        grad_name_with_count = f"{name}_grad.{count}"
+        # forward save
+        self.data_collector.debug_data_collect_forward(variable, name_with_count)
+        # backward save
+        if save_backward:
+            self.data_collector.debug_data_collect_backward(variable, grad_name_with_count)

msprobe/pytorch/__init__.py CHANGED Viewed

@@ -13,7 +13,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import torch
 from .compare.distributed_compare import compare_distributed
 from .compare.pt_compare import compare

msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py CHANGED Viewed

@@ -40,7 +40,7 @@ from msprobe.pytorch.api_accuracy_checker.run_ut.run_ut_utils import get_validat
 from msprobe.pytorch.api_accuracy_checker.common.utils import extract_detailed_api_segments, extract_basic_api_segments
 from msprobe.core.common.file_utils import FileChecker, change_mode, create_directory
 from msprobe.pytorch.common.log import logger
-from msprobe.core.common.utils import CompareException
+from msprobe.core.common.utils import CompareException, check_op_str_pattern_valid
 from msprobe.core.common.const import Const, CompareConst, FileCheckConst
 CompareConfig = namedtuple('CompareConfig', ['npu_csv_path', 'gpu_csv_path', 'result_csv_path', 'details_csv_path'])
@@ -151,6 +151,7 @@ def analyse_csv(npu_data, gpu_data, config):
         message = ''
         compare_column = ApiPrecisionOutputColumn()
         full_api_name_with_direction_status = row_npu[ApiPrecisionCompareColumn.API_NAME]
+        check_op_str_pattern_valid(full_api_name_with_direction_status)
         row_gpu = gpu_data[gpu_data[ApiPrecisionCompareColumn.API_NAME] == full_api_name_with_direction_status]
         api_name, api_full_name, direction_status = extract_detailed_api_segments(full_api_name_with_direction_status)
         if not api_full_name:
@@ -430,6 +431,7 @@ def _api_precision_compare(parser=None):
     _api_precision_compare_parser(parser)
     args = parser.parse_args(sys.argv[1:])
     _api_precision_compare_command(args)
+    logger.info("Compare task completed.")
 def _api_precision_compare_command(args):
@@ -457,8 +459,3 @@ def _api_precision_compare_parser(parser):
     parser.add_argument("-o", "--out_path", dest="out_path", default="", type=str,
                         help="<optional> The api precision compare task result out path.",
                         required=False)
-if __name__ == '__main__':
-    _api_precision_compare()
-    logger.info("Compare task completed.")

msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py CHANGED Viewed

@@ -28,10 +28,10 @@ from msprobe.pytorch.api_accuracy_checker.compare.compare_utils import binary_st
 ulp_standard_api, thousandth_standard_api
 from msprobe.core.common.file_utils import FileOpen, load_json, save_json
 from msprobe.core.common.utils import check_file_or_directory_path, check_op_str_pattern_valid, is_int
-from msprobe.core.common.const import Const, MonitorConst, MsgConst
+from msprobe.core.common.const import Const, MonitorConst, MsgConst, FileCheckConst
 from msprobe.core.common.log import logger
-from msprobe.core.common.file_utils import make_dir
-from msprobe.core.common.utils import recursion_depth_decorator
+from msprobe.core.common.file_utils import make_dir, change_mode
+from msprobe.core.common.decorator import recursion_depth_decorator
 TENSOR_DATA_LIST = ["torch.Tensor", "torch.nn.parameter.Parameter"]
 TORCH_BOOL_TYPE = ["torch.bool"]
@@ -50,6 +50,7 @@ DATA_NAME = "data_name"
 API_MAX_LENGTH = 30
 PROPAGATION_LIST = [Const.FORWARD, Const.BACKWARD]
 DATAMODE_LIST = ["random_data", "real_data"]
+ITER_MAX_TIMES = 1000
 class APIInfo:
@@ -97,6 +98,8 @@ class CommonConfig:
         iter_t = self.iter_times
         if iter_t <= 0:
             raise ValueError("iter_times should be an integer bigger than zero!")
+        if iter_t > ITER_MAX_TIMES:
+            raise ValueError("iter_times should not be greater than 1000!")
         json_file = self.extract_api_path
         propagation = self.propagation
@@ -117,7 +120,7 @@ class CommonConfig:
         # Retrieve the first API name and dictionary
         forward_item = next(iter(json_content.items()), None)
-        if not forward_item or not isinstance(forward_item[1], dict):
+        if not forward_item or not isinstance(forward_item[1], dict) or not forward_item[1]:
             raise ValueError(f'Invalid forward API data in json_content!')
         # if propagation is backward, ensure json file contains forward and backward info
@@ -127,7 +130,7 @@ class CommonConfig:
         # if propagation is backward, ensure it has valid data
         if propagation == Const.BACKWARD:
             backward_item = list(json_content.items())[1]
-            if not isinstance(backward_item[1], dict):
+            if not isinstance(backward_item[1], dict) or not backward_item[1]:
                 raise ValueError(f'Invalid backward API data in json_content!')
         return json_content
@@ -169,7 +172,7 @@ class APIExtractor:
                     value = self.load_real_data_path(value, real_data_path)
                 new_data[key] = value
         if not new_data:
-            logger.error(f"Error: The api '{self.api_name}' does not exist in the file.")
+            logger.warning(f"Warning: The api '{self.api_name}' does not exist in the file.")
         else:
             save_json(self.output_file, new_data, indent=4)
             logger.info(
@@ -183,6 +186,7 @@ class APIExtractor:
                     self.update_data_name(v, dump_data_dir)
         return value
+    @recursion_depth_decorator("OpGenerator: APIExtractor.update_data_name")
     def update_data_name(self, data, dump_data_dir):
         if isinstance(data, list):
             for item in data:
@@ -399,7 +403,7 @@ class OperatorScriptGenerator:
     def generate_kwargs_dict(self, kwargs_info, flag_device):
         kwargs_dict_generator = ""
         for key, value in kwargs_info.items():
-            kwargs_dict_generator += '"' + key + '"' + MonitorConst.VPP_SEP
+            kwargs_dict_generator += '"' + key + '"' + MonitorConst.NAME_SEP
             if flag_device:
                 kwargs_dict_generator += self.recursive_kwargs_dict(value, flag_device=True) + Const.COMMA
             else:
@@ -467,6 +471,7 @@ def _run_operator_generate_commond(cmd_args):
             fout.write(code_template.format(**internal_settings))
     except OSError:
         logger.error(f"Failed to open file. Please check file {template_path} or {operator_script_path}.")
+    change_mode(operator_script_path, FileCheckConst.DATA_FILE_AUTHORITY)
     logger.info(f"Generate operator script successfully and the name is {operator_script_path}.")

msprobe/pytorch/api_accuracy_checker/generate_op_script/operator_replication.template CHANGED Viewed

@@ -37,9 +37,9 @@ def load_pt(pt_path, to_cpu=False):
     pt_path = os.path.realpath(pt_path)
     try:
         if to_cpu:
-            pt = torch.load(pt_path, map_location=torch.device("cpu"))
+            pt = torch.load(pt_path, map_location=torch.device("cpu"), weights_only=True)
         else:
-            pt = torch.load(pt_path)
+            pt = torch.load(pt_path, weights_only=True)
     except Exception as e:
         raise RuntimeError(f"load pt file {{pt_path}} failed") from e
     return pt

msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py CHANGED Viewed

@@ -50,6 +50,9 @@ def split_json_file(input_file, num_splits, filter_api):
         backward_data[f"{data_name}.backward"] = backward_data.pop(data_name)
     input_data = load_json(input_file)
+    if "dump_data_dir" not in input_data.keys():
+        logger.error("Invalid input file, 'dump_data_dir' field is missing")
+        raise CompareException("Invalid input file, 'dump_data_dir' field is missing")
     if input_data.get("data") is None:
         logger.error("Invalid input file, 'data' field is missing")
         raise CompareException("Invalid input file, 'data' field is missing")
@@ -97,7 +100,7 @@ def run_parallel_ut(config):
     processes = []
     device_id_cycle = cycle(config.device_id)
     if config.save_error_data_flag:
-        logger.info("UT task error datas will be saved")
+        logger.info("UT task error data will be saved")
     logger.info(f"Starting parallel UT with {config.num_splits} processes")
     progress_bar = tqdm(total=config.total_items, desc="Total items", unit="items")
@@ -221,7 +224,3 @@ def main():
     args = parser.parse_args()
     config = prepare_config(args)
     run_parallel_ut(config)
-if __name__ == '__main__':
-    main()

msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py CHANGED Viewed

@@ -34,8 +34,10 @@ from msprobe.pytorch.api_accuracy_checker.run_ut.run_ut_utils import exec_api, i
 from msprobe.core.common.file_utils import check_link, FileChecker
 from msprobe.pytorch.api_accuracy_checker.common.utils import extract_basic_api_segments
 from msprobe.core.common.const import FileCheckConst, Const
+from msprobe.core.common.utils import check_op_str_pattern_valid
 from msprobe.pytorch.common.log import logger
 from msprobe.pytorch.common.parse_json import parse_json_info_forward_backward
+from msprobe.core.common.decorator import recursion_depth_decorator
 def check_tensor_overflow(x):
@@ -75,6 +77,7 @@ def check_data_overflow(x, device):
             return torch_npu.npu.utils.npu_check_overflow(x)
+@recursion_depth_decorator("is_bool_output")
 def is_bool_output(x):
     if isinstance(x, (tuple, list)):
         if not x:
@@ -91,6 +94,7 @@ def run_overflow_check(forward_file):
         dump_path = os.path.dirname(forward_file)
         real_data_path = os.path.join(dump_path, Const.DUMP_TENSOR_DATA)
     for api_full_name, api_info_dict in tqdm(forward_content.items()):
+        check_op_str_pattern_valid(api_full_name)
         if is_unsupported_api(api_full_name, is_overflow_check=True):
             continue
         try:
@@ -161,6 +165,7 @@ def _run_overflow_check(parser=None):
     _run_overflow_check_parser(parser)
     args = parser.parse_args(sys.argv[1:])
     _run_overflow_check_command(args)
+    logger.info("UT task completed.")
 def _run_overflow_check_command(args):
@@ -175,8 +180,3 @@ def _run_overflow_check_command(args):
         logger.error(f"Set NPU device id failed. device id is: {args.device_id}")
         raise NotImplementedError from error
     run_overflow_check(api_info)
-if __name__ == '__main__':
-    _run_overflow_check()
-    logger.info("UT task completed.")

msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py CHANGED Viewed

@@ -49,7 +49,7 @@ from msprobe.core.common.file_utils import FileChecker, change_mode, \
 from msprobe.pytorch.common.log import logger
 from msprobe.pytorch.pt_config import parse_json_config
 from msprobe.core.common.const import Const, FileCheckConst, CompareConst
-from msprobe.core.common.utils import safe_get_value, CompareException
+from msprobe.core.common.utils import safe_get_value, CompareException, is_int, check_op_str_pattern_valid
 from msprobe.pytorch.common.utils import seed_all
 from msprobe.pytorch.api_accuracy_checker.tensor_transport_layer.attl import ATTL, ATTLConfig, move2device_exec
 from msprobe.pytorch.api_accuracy_checker.tensor_transport_layer.device_dispatch import ConsumerDispatcher
@@ -65,6 +65,7 @@ DETAILS_FILE_NAME = "accuracy_checking_details_" + current_time + ".csv"
 not_backward_list = ['repeat_interleave']
 unsupported_backward_list = ['masked_select']
+unsupported_api_list = ["to"]
 tqdm_params = {
@@ -83,6 +84,9 @@ tqdm_params = {
 }
+seed_all()
 def run_ut(config):
     logger.info("start UT test")
     if config.online_config.is_online:
@@ -93,7 +97,7 @@ def run_ut(config):
         logger.info(f"UT task details will be saved in {config.details_csv_path}")
     if config.save_error_data:
-        logger.info(f"UT task error_datas will be saved in {config.error_data_path}")
+        logger.info(f"UT task error_data will be saved in {config.error_data_path}")
     compare = Comparator(config.result_csv_path, config.details_csv_path, config.is_continue_run_ut, config=config)
     if config.online_config.is_online:
@@ -117,6 +121,7 @@ def run_ut(config):
 def run_api_offline(config, compare, api_name_set):
     err_column = CompareColumn()
     for _, (api_full_name, api_info_dict) in enumerate(tqdm(config.forward_content.items(), **tqdm_params)):
+        check_op_str_pattern_valid(api_full_name)
         if api_full_name in api_name_set:
             continue
         if is_unsupported_api(api_full_name):
@@ -218,6 +223,7 @@ def blacklist_and_whitelist_filter(api_name, black_list, white_list):
     If api is both in black_list and white_list, black_list first.
     return: False for exec api, True for not exec
     """
+    black_list.extend(unsupported_api_list)
     if black_list and api_name in black_list:
         return True
     if white_list and api_name not in white_list:
@@ -317,7 +323,8 @@ def run_torch_api_online(api_full_name, api_data, backward_content):
     if kwargs.get("device"):
         del kwargs["device"]
-    device_out = exec_api(api_type, api_name, Const.CUDA_LOWERCASE, args, kwargs)
+    device_exec_params = ExecParams(api_type, api_name, current_device, args, kwargs, False, None)
+    device_out = exec_api(device_exec_params)
     device_out = move2device_exec(device_out, "cpu")
     return UtDataInfo(None, None, out, device_out, None, in_fwd_data_list, None, rank=api_data.rank)
@@ -344,6 +351,9 @@ def need_to_backward(grad_index, out):
 def run_backward(args, grad, grad_index, out):
     if grad_index is not None:
+        if not is_int(grad_index):
+            logger.error(f"{grad_index} dtype is not int")
+            raise TypeError(f"{grad_index} dtype is not int")
         if grad_index >= len(out):
             logger.error(f"Run backward error when grad_index is {grad_index}")
             raise IndexError(f"Run backward error when grad_index is {grad_index}")
@@ -430,6 +440,7 @@ def preprocess_forward_content(forward_content):
     arg_cache = {}
     for key, value in forward_content.items():
+        check_op_str_pattern_valid(key)
         base_key = key.rsplit(Const.SEP, 1)[0]
         if key not in arg_cache:
@@ -469,6 +480,7 @@ def _run_ut(parser=None):
     _run_ut_parser(parser)
     args = parser.parse_args(sys.argv[1:])
     run_ut_command(args)
 def checked_online_config(online_config):
@@ -492,6 +504,7 @@ def checked_online_config(online_config):
         check_file_or_directory_path(os.path.join(online_config.tls_path, "server.key"))
         check_file_or_directory_path(os.path.join(online_config.tls_path, "server.crt"))
         check_crt_valid(os.path.join(online_config.tls_path, "server.crt"))
+        check_crt_valid(os.path.join(online_config.tls_path, "server.key"), True)
     # host and port
     if not isinstance(online_config.host, str) or not re.match(Const.ipv4_pattern, online_config.host):
@@ -561,7 +574,14 @@ def run_ut_command(args):
     error_data_path = checker_config.error_data_path
     if save_error_data:
         if args.result_csv_path:
-            time_info = result_csv_path.split('.')[0].split('_')[-1]
+            parts_by_dot = result_csv_path.split(Const.SEP)
+            if len(parts_by_dot) < 2 or not parts_by_dot[0]:
+                raise ValueError("result_csv_path does not contain a valid file name with an extension.")
+            file_name_part = parts_by_dot[0]
+            parts_by_underscore = file_name_part.split(Const.REPLACEMENT_CHARACTER)
+            if len(parts_by_underscore) < 2:
+                raise ValueError("File name part does not contain enough '_' separated segments.")
+            time_info = parts_by_underscore[-1]
             global UT_ERROR_DATA_DIR
             UT_ERROR_DATA_DIR = 'ut_error_data' + time_info
         error_data_path = initialize_save_error_data(error_data_path)
@@ -579,9 +599,8 @@ def run_ut_command(args):
     }
     run_ut_config = checker_config.get_run_ut_config(**config_params)
     run_ut(run_ut_config)
+    logger.info("UT task completed.")
 if __name__ == '__main__':
-    seed_all()
     _run_ut()
-    logger.info("UT task completed.")

mindstudio-probe 1.2.1__py3-none-any.whl → 1.3.0__py3-none-any.whl

mindstudio-probe 1.2.1__py3-none-any.whl → 1.3.0__py3-none-any.whl