PyPI - mindstudio-probe - Versions diffs - 8.1.2__py3-none-any.whl → 8.2.1__py3-none-any.whl - Mend

mindstudio-probe 8.1.2__py3-none-any.whl → 8.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (181)

{mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/METADATA +2 -2
{mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/RECORD +172 -147
msprobe/README.md +6 -6
msprobe/core/common/const.py +98 -41
msprobe/core/common/db_manager.py +256 -0
msprobe/core/common/file_utils.py +28 -5
msprobe/core/common/log.py +7 -0
msprobe/core/common/megatron_utils.py +59 -0
msprobe/core/common/parallel_state.py +193 -0
msprobe/core/common/utils.py +20 -13
msprobe/core/common_config.py +5 -0
msprobe/core/compare/acc_compare.py +140 -93
msprobe/core/compare/check.py +13 -0
msprobe/core/compare/compare_cli.py +64 -6
msprobe/core/compare/config.py +10 -8
msprobe/core/compare/diff_analyze/diff_analyze_threshold.yaml +14 -0
msprobe/core/compare/diff_analyze/first_diff_analyze.py +135 -0
msprobe/core/compare/diff_analyze/ignore_op_list.yaml +3 -0
msprobe/core/compare/find_first/__init__.py +0 -0
msprobe/core/compare/find_first/analyzer.py +282 -0
msprobe/core/compare/find_first/data_processor.py +35 -0
msprobe/core/compare/find_first/graph.py +188 -0
msprobe/core/compare/find_first/utils.py +189 -0
msprobe/core/compare/highlight.py +74 -101
msprobe/core/compare/layer_mapping/layer_mapping.py +14 -9
msprobe/core/compare/merge_result/merge_result.py +2 -2
msprobe/core/compare/multiprocessing_compute.py +45 -28
msprobe/core/compare/npy_compare.py +7 -10
msprobe/core/compare/utils.py +338 -130
msprobe/core/config_check/checkers/dataset_checker.py +2 -1
msprobe/core/config_check/checkers/env_args_checker.py +5 -5
msprobe/core/config_check/checkers/hyperparameter_checker.py +30 -10
msprobe/core/config_check/checkers/pip_checker.py +4 -3
msprobe/core/config_check/checkers/random_checker.py +3 -3
msprobe/core/config_check/checkers/weights_checker.py +2 -1
msprobe/core/config_check/ckpt_compare/megatron_loader.py +2 -0
msprobe/core/config_check/resource/hyperparameter.yaml +11 -1
msprobe/core/config_check/utils/hyperparameter_parser.py +7 -3
msprobe/core/config_check/utils/utils.py +10 -0
msprobe/core/data_dump/api_registry.py +49 -30
msprobe/core/data_dump/data_collector.py +71 -29
msprobe/core/data_dump/data_processor/base.py +2 -0
msprobe/core/data_dump/data_processor/mindspore_processor.py +47 -53
msprobe/core/data_dump/data_processor/pytorch_processor.py +227 -93
msprobe/core/data_dump/json_writer.py +81 -7
msprobe/core/data_dump/scope.py +4 -6
msprobe/core/hook_manager.py +129 -70
msprobe/core/monitor/csv2db.py +361 -0
msprobe/core/monitor/db_utils.py +278 -0
msprobe/core/monitor/utils.py +35 -1
msprobe/core/service.py +31 -39
msprobe/core/single_save/single_comparator.py +16 -3
msprobe/docs/01.installation.md +51 -19
msprobe/docs/02.config_introduction.md +16 -20
msprobe/docs/03.config_examples.md +26 -0
msprobe/docs/04.kernel_dump_PyTorch.md +1 -1
msprobe/docs/05.data_dump_PyTorch.md +6 -2
msprobe/docs/06.data_dump_MindSpore.md +44 -7
msprobe/docs/07.accuracy_checker_PyTorch.md +1 -1
msprobe/docs/10.accuracy_compare_PyTorch.md +124 -44
msprobe/docs/11.accuracy_compare_MindSpore.md +75 -7
msprobe/docs/14.data_parse_PyTorch.md +1 -1
msprobe/docs/19.monitor.md +94 -7
msprobe/docs/21.visualization_PyTorch.md +71 -101
msprobe/docs/22.visualization_MindSpore.md +69 -119
msprobe/docs/23.generate_operator_PyTorch.md +1 -1
msprobe/docs/25.tool_function_introduction.md +0 -1
msprobe/docs/26.data_dump_PyTorch_baseline.md +7 -7
msprobe/docs/28.debugger_save_instruction.md +184 -81
msprobe/docs/29.data_dump_MSAdapter.md +6 -0
msprobe/docs/31.config_check.md +4 -2
msprobe/docs/36.calculation_result_change.md +75 -0
msprobe/docs/FAQ.md +22 -1
msprobe/docs/data_dump_MindSpore/dynamic_graph_quick_start_example.md +6 -2
msprobe/docs/img/compare_result.png +0 -0
msprobe/docs/img/visualization/vis_browser_1.png +0 -0
msprobe/docs/img/visualization/vis_match_info.png +0 -0
msprobe/docs/img/visualization/vis_precision_info.png +0 -0
msprobe/docs/img/visualization/vis_search_info.png +0 -0
msprobe/docs/img/visualization/vis_show_info.png +0 -0
msprobe/docs/img/visualization/vis_showcase.png +0 -0
msprobe/docs/img/visualization/vis_unmatch_info.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/1.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/2.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/3.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/4.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/5.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/6.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/7.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory-qwen25vl.txt +59 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory1.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory2.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed-mm-qwen25vl.txt +80 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed1.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed2.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactory_mapping.md +330 -0
msprobe/mindspore/__init__.py +1 -1
msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +1 -1
msprobe/mindspore/api_accuracy_checker/api_runner.py +9 -6
msprobe/mindspore/api_accuracy_checker/compute_element.py +18 -12
msprobe/mindspore/cell_processor.py +64 -25
msprobe/mindspore/common/utils.py +51 -7
msprobe/mindspore/compare/common_dir_compare.py +45 -37
msprobe/mindspore/compare/ms_compare.py +10 -2
msprobe/mindspore/compare/ms_graph_compare.py +47 -52
msprobe/mindspore/debugger/debugger_config.py +18 -7
msprobe/mindspore/debugger/precision_debugger.py +16 -12
msprobe/mindspore/dump/cell_dump_process.py +130 -68
msprobe/mindspore/dump/cell_dump_with_insert_gradient.py +10 -2
msprobe/mindspore/dump/graph_mode_cell_dump.py +35 -9
msprobe/mindspore/dump/graph_tensor_dump.py +11 -0
msprobe/mindspore/dump/hook_cell/api_register.py +19 -20
msprobe/mindspore/dump/hook_cell/hook_cell.py +12 -34
msprobe/mindspore/dump/hook_cell/ms_hook_manager.py +142 -21
msprobe/mindspore/dump/kernel_kbyk_dump.py +24 -0
msprobe/mindspore/exception_dump/__init__.py +0 -0
msprobe/mindspore/exception_dump/exception_dump_tool_factory.py +51 -0
msprobe/mindspore/exception_dump/kernel_graph_exception_dump.py +57 -0
msprobe/mindspore/free_benchmark/api_pynative_self_check.py +5 -4
msprobe/mindspore/mindspore_service.py +2 -2
msprobe/mindspore/mindtorch/mindtorch_adaptor.py +12 -7
msprobe/mindspore/monitor/features.py +82 -0
msprobe/mindspore/monitor/module_hook.py +168 -10
msprobe/mindspore/monitor/utils.py +27 -1
msprobe/mindspore/ms_config.py +12 -4
msprobe/mindspore/overflow_check/overflow_check_tool_factory.py +1 -1
msprobe/mindspore/task_handler_factory.py +3 -1
msprobe/nan_analyze/graph.py +1 -1
msprobe/pytorch/api_accuracy_checker/common/config.py +3 -36
msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +0 -24
msprobe/pytorch/api_accuracy_checker/compare/compare.py +2 -12
msprobe/pytorch/api_accuracy_checker/config.yaml +1 -6
msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +2 -2
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +12 -132
msprobe/pytorch/common/utils.py +1 -21
msprobe/pytorch/compare/pt_compare.py +10 -2
msprobe/pytorch/{hook_module/jit_script_wrapper.py → compare/pt_diff_analyze.py} +3 -15
msprobe/pytorch/compare/utils.py +2 -1
msprobe/pytorch/debugger/debugger_config.py +18 -23
msprobe/pytorch/dump/module_dump/hook_wrapper.py +10 -7
msprobe/pytorch/dump/module_dump/module_processer.py +41 -19
msprobe/pytorch/free_benchmark/main.py +7 -4
msprobe/pytorch/hook_module/api_register.py +62 -24
msprobe/pytorch/hook_module/hook_module.py +9 -29
msprobe/pytorch/hook_module/pt_hook_manager.py +84 -15
msprobe/pytorch/hook_module/script_wrapper.py +140 -0
msprobe/pytorch/hook_module/support_wrap_ops.yaml +6 -0
msprobe/pytorch/monitor/csv2tb.py +1 -1
msprobe/pytorch/monitor/features.py +94 -0
msprobe/pytorch/monitor/module_hook.py +221 -81
msprobe/pytorch/monitor/module_metric.py +27 -1
msprobe/pytorch/monitor/optimizer_collect.py +109 -4
msprobe/pytorch/online_dispatch/dispatch.py +42 -24
msprobe/pytorch/online_dispatch/dump_compare.py +1 -1
msprobe/pytorch/parse_tool/lib/visualization.py +0 -1
msprobe/pytorch/pt_config.py +2 -51
msprobe/pytorch/pytorch_service.py +7 -14
msprobe/visualization/builder/graph_builder.py +192 -63
msprobe/visualization/builder/graph_merger.py +986 -0
msprobe/visualization/builder/msprobe_adapter.py +17 -15
msprobe/visualization/compare/graph_comparator.py +26 -16
msprobe/visualization/db_utils.py +252 -0
msprobe/visualization/graph/base_node.py +2 -22
msprobe/visualization/graph/distributed_analyzer.py +12 -12
msprobe/visualization/graph/graph.py +44 -16
msprobe/visualization/graph_service.py +143 -59
msprobe/visualization/utils.py +103 -4
msprobe/docs/08.accuracy_checker_online_PyTorch.md +0 -295
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py +0 -205
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +0 -378
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +0 -239
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/dump_dispatch.py +0 -115
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +0 -250
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/torch_ops_config.yaml +0 -63
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/utils.py +0 -198
msprobe/pytorch/attl_manager.py +0 -65
{mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/LICENSE +0 -0
{mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/WHEEL +0 -0
{mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/entry_points.txt +0 -0
{mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/top_level.txt +0 -0
/msprobe/{pytorch/api_accuracy_checker/tensor_transport_layer → core/compare/diff_analyze}/__init__.py +0 -0

msprobe/mindspore/cell_processor.py CHANGED Viewed

@@ -25,13 +25,15 @@ from msprobe.core.common.exceptions import MsprobeException
 from msprobe.core.common.runtime import Runtime
 from msprobe.core.common.utils import ModuleQueue, ThreadSafe
 from msprobe.core.data_dump.scope import ModuleRangeScope, MixRangeScope, BaseScope
+from msprobe.core.common.megatron_utils import wrap_megatron_step, get_micro_step, is_megatron
 from msprobe.mindspore.common.const import Const as MsConst
 from msprobe.mindspore.common.log import logger
 from msprobe.mindspore.common.utils import (
     is_mindtorch,
     get_cells_and_names_with_index,
     has_kwargs_in_forward_hook,
-    is_graph_mode_cell_dump_allowed
+    is_graph_mode_cell_dump_allowed,
+    is_backward_hook_output_a_view
 )
 from msprobe.mindspore.debugger.debugger_config import DebuggerConfig
 from msprobe.mindspore.dump.graph_mode_cell_dump import GraphModeCellDump
@@ -46,6 +48,28 @@ def get_cell_construct(construct):
     return _construct
+def patch_schedules_step():
+    try:
+        from mindspeed.mindspore.core.pipeline_parallel import schedules
+        schedules.forward_step = wrap_megatron_step(schedules.forward_step)
+        schedules.backward_step = wrap_megatron_step(schedules.backward_step, is_forward=False)
+        logger.info_on_rank_0("Patch mindspeed.mindspore method success.")
+    except ImportError:
+        logger.info_on_rank_0("No mindspeed.mindspore find.")
+    except Exception as e:
+        logger.info_on_rank_0(f"Patch mindspeed.mindspore method failed, detail:{str(e)}")
+    try:
+        from megatron.core.pipeline_parallel import schedules
+        schedules.forward_step = wrap_megatron_step(schedules.forward_step)
+        schedules.backward_step = wrap_megatron_step(schedules.backward_step, is_forward=False)
+        logger.info_on_rank_0("Patch megatron method success.")
+    except ImportError:
+        logger.info_on_rank_0("No megatron find.")
+    except Exception as e:
+        logger.info_on_rank_0(f"Patch megatron method failed, detail:{str(e)}")
 class CellProcessor:
     cell_queue = ModuleQueue()
     cell_count = {}
@@ -83,6 +107,8 @@ class CellProcessor:
             raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR,
                                    'The model cannot be None, when level is "L0" or "mix"')
+        patch_schedules_step()
         is_registered = False
         model_type = Const.MODULE if is_mindtorch() else Const.CELL
         cells_with_index_in_pynative_mode, cells_with_index_in_graph_mode = get_cells_and_names_with_index(models)
@@ -116,19 +142,23 @@ class CellProcessor:
             cells_and_names_in_graph_mode = []
             for index, cells_and_names in cells_with_index_in_graph_mode.items():
                 model = models if index == "-1" else models[int(index)]
-                for name, cell in cells_and_names:
+                for name, cell, parent_cell in cells_and_names:
                     if cell == model:
                         continue
                     cell_index = (index + Const.SEP) if index != "-1" else ""
-                    cells_and_names_in_graph_mode.append((f'{cell_index}{name}', cell))
+                    cells_and_names_in_graph_mode.append((f'{cell_index}{name}', cell, parent_cell))
             if cells_and_names_in_graph_mode:
                 Runtime.run_mode = MsConst.PYNATIVE_GRAPH_MODE
                 GraphModeCellDump(config, cells_and_names_in_graph_mode, strict=False).handle()
     def build_cell_hook(self, cell_name, build_data_hook):
         @ThreadSafe.synchronized
         def forward_pre_hook(cell, args):
+            if not Runtime.is_running:
+                return args
             index = CellProcessor.set_and_get_calls_number(cell_name)
             full_forward_name = f'{cell_name}{Const.FORWARD}{Const.SEP}{index}'
             full_backward_name = f'{cell_name}{Const.BACKWARD}{Const.SEP}{index}'
@@ -174,7 +204,7 @@ class CellProcessor:
                 bw_hook.register_backward_hook()
                 CellProcessor.cell_bw_hook_kernels[full_forward_name] = bw_hook
-                args = bw_hook(*args)
+                args = bw_hook(args) if is_backward_hook_output_a_view() else bw_hook(*args)
             return args
@@ -199,12 +229,15 @@ class CellProcessor:
                     logger.warning("For backward hooks to be called,"
                                    " cell output should be a Tensor or a tuple of Tensors"
                                    f" but received {type(outputs)}")
-                if isinstance(outputs, tuple):
-                    new_outputs = bw_hook(*outputs)
-                else:
+                if is_backward_hook_output_a_view():
                     new_outputs = bw_hook(outputs)
-                if isinstance(outputs, tuple) and len(outputs) == 1:
-                    new_outputs = (new_outputs,)
+                else:
+                    if isinstance(outputs, tuple):
+                        new_outputs = bw_hook(*outputs)
+                    else:
+                        new_outputs = bw_hook(outputs)
+                    if isinstance(outputs, tuple) and len(outputs) == 1:
+                        new_outputs = (new_outputs,)
                 outputs = new_outputs
             def get_backward_pre_hook(full_backward_name, backward_data_hook):
@@ -227,18 +260,21 @@ class CellProcessor:
                                                  self.cell_backward_pre_hook[-1])
             bw_pre_hook.register_backward_pre_hook()
-            if isinstance(outputs, tuple):
-                result = bw_pre_hook(*outputs)
-            else:
+            if is_backward_hook_output_a_view():
                 result = bw_pre_hook(outputs)
-            if isinstance(outputs, tuple):
-                if len(outputs) == 1:
-                    result = (result,)
-                if len(result) != len(outputs):
-                    raise TypeError(
-                        f"The backward pre hook return value size is {len(result)} "
-                        f"not equal to output size {len(outputs)}"
-                    )
+            else:
+                if isinstance(outputs, tuple):
+                    result = bw_pre_hook(*outputs)
+                else:
+                    result = bw_pre_hook(outputs)
+                if isinstance(outputs, tuple):
+                    if len(outputs) == 1:
+                        result = (result,)
+                    if len(result) != len(outputs):
+                        raise TypeError(
+                            f"The backward pre hook return value size is {len(result)} "
+                            f"not equal to output size {len(outputs)}"
+                        )
             return result
         return forward_pre_hook
@@ -249,23 +285,26 @@ class CellProcessor:
             CellProcessor.cell_stack[tid] = []
         if self.cell_stack[tid]:
-            CellProcessor.module_node[full_name] = self.cell_stack[tid][-1]
+            CellProcessor.module_node[full_name] = self.cell_stack[tid][-1] if not is_megatron() \
+                else [self.cell_stack[tid][-1], get_micro_step()]
         else:
             parent_name = CellProcessor.cell_queue.find_last(full_name)
-            CellProcessor.module_node[full_name] = parent_name
+            CellProcessor.module_node[full_name] = parent_name if not is_megatron() else [parent_name, get_micro_step()]
         CellProcessor.cell_queue.add_name(full_name)
         CellProcessor.cell_stack[tid].append(full_name)
-        CellProcessor.api_parent_node[tid] = full_name
+        CellProcessor.api_parent_node[tid] = full_name if not is_megatron() else [full_name, get_micro_step()]
         if self.scope:
             self.scope.begin_module(full_name)
     def set_construct_info_in_hook(self, full_name):
         tid = threading.get_ident()
-        CellProcessor.api_parent_node[tid] = None
+        CellProcessor.cell_queue.remove_name(full_name)
+        CellProcessor.api_parent_node[tid] = None if not is_megatron() else [None, get_micro_step()]
         if self.cell_stack.get(tid):
             CellProcessor.cell_stack[tid].pop()
         if self.cell_stack.get(tid):
-            CellProcessor.api_parent_node[tid] = CellProcessor.cell_stack[tid][-1]
+            CellProcessor.api_parent_node[tid] = CellProcessor.cell_stack[tid][-1] if not is_megatron() \
+                else [CellProcessor.cell_stack[tid][-1], get_micro_step()]
         if self.scope:
             self.scope.end_module(full_name)

msprobe/mindspore/common/utils.py CHANGED Viewed

@@ -16,6 +16,7 @@
 import inspect
 import os
 import random
+import sys
 import types
 import mindspore as ms
@@ -41,6 +42,7 @@ else:
 mindtorch_check_result = None
 register_backward_hook_functions = {}
 kwargs_exist_in_forward_hook = None
+is_output_of_backward_hook_a_view = None
 class MsprobeStep(ms.train.Callback):
@@ -129,7 +131,7 @@ def list_lowest_level_directories(root_dir):
     return lowest_level_dirs
-def seed_all(seed=1234, mode=False, rm_dropout=True):
+def seed_all(seed=1234, mode=False, rm_dropout=False):
     check_seed_all(seed, mode, rm_dropout)
     os.environ['PYTHONHASHSEED'] = str(seed)
     ms.set_seed(seed)
@@ -179,6 +181,8 @@ def is_mindtorch():
     global mindtorch_check_result
     if mindtorch_check_result is None:
         mindtorch_check_result = False
+        if 'torch' not in sys.modules:
+            return mindtorch_check_result
         try:
             import torch
         except ImportError:
@@ -254,14 +258,14 @@ def is_decorated_by_jit(func):
 @recursion_depth_decorator('msprobe.mindspore.common.utils.get_cells_and_names')
-def get_cells_and_names(model, cells_set=None, name_prefix=''):
+def get_cells_and_names(model, cells_set=None, name_prefix='', parent_cell=None):
     cells_set = cells_set if cells_set else set()
     if model in cells_set:
         return
     cells_set.add(model)
     jit_decorated = is_decorated_by_jit(model.construct)
-    yield name_prefix, model, jit_decorated
+    yield name_prefix, model, jit_decorated, parent_cell
     if jit_decorated:
         return
@@ -271,9 +275,9 @@ def get_cells_and_names(model, cells_set=None, name_prefix=''):
             cells_name_prefix = f'{name_prefix}{Const.SEP}{name}' if name_prefix else name
             jit_decorated = is_decorated_by_jit(model.construct)
             if jit_decorated:
-                yield cells_name_prefix, cell, jit_decorated
+                yield cells_name_prefix, cell, jit_decorated, model
             else:
-                for ele in get_cells_and_names(cell, cells_set, cells_name_prefix):
+                for ele in get_cells_and_names(cell, cells_set, cells_name_prefix, model):
                     yield ele
@@ -284,9 +288,9 @@ def get_cells_and_names_with_index(models):
     def distinguish_cells(cells):
         cells_in_pynative_mode = []
         cells_in_graph_mode = []
-        for name, cell, jit_decorated in cells:
+        for name, cell, jit_decorated, parent_cell in cells:
             if jit_decorated:
-                cells_in_graph_mode.append((name, cell))
+                cells_in_graph_mode.append((name, cell, parent_cell))
             else:
                 cells_in_pynative_mode.append((name, cell))
         return cells_in_pynative_mode, cells_in_graph_mode
@@ -329,3 +333,43 @@ def has_kwargs_in_forward_hook():
         return kwargs_exist_in_forward_hook
     return kwargs_exist_in_forward_hook
+def is_backward_hook_output_a_view():
+    global is_output_of_backward_hook_a_view
+    if is_output_of_backward_hook_a_view is None:
+        is_output_of_backward_hook_a_view = False
+        if getattr(ms, '__version__', '2.4.0') < '2.7.0':
+            return is_output_of_backward_hook_a_view
+        try:
+            from mindspore.ops.operations import _inner_ops as inner
+            call_func = getattr(inner.CellBackwardHook, '__call__')
+            func_params = inspect.signature(call_func).parameters
+        except Exception:
+            return is_output_of_backward_hook_a_view
+        if 'args' in func_params and func_params['args'].kind == inspect.Parameter.POSITIONAL_OR_KEYWORD:
+            is_output_of_backward_hook_a_view = True
+    return is_output_of_backward_hook_a_view
+def wrap_backward_hook_call_func(call_func):
+    if not is_backward_hook_output_a_view():
+        return call_func
+    from mindspore.common.api import _pynative_executor as executor
+    from mindspore._c_expression import CreationType
+    def new_call(self, args):
+        outputs = call_func(self, args)
+        if isinstance(outputs, ms.Tensor):
+            executor.set_creation_type(outputs, CreationType.DEFAULT)
+        elif isinstance(outputs, tuple):
+            for item in outputs:
+                if isinstance(item, ms.Tensor):
+                    executor.set_creation_type(item, CreationType.DEFAULT)
+        return outputs
+    new_call.__name__ = '__call__'
+    return new_call

msprobe/mindspore/compare/common_dir_compare.py CHANGED Viewed

@@ -154,21 +154,34 @@ def find_npy_files(directory):
             dirs.clear()
         for file in files:
             if file.endswith(".npy"):
-                # 分割文件名并去掉最后两个元素
-                file_name = file.split('_')
-                if len(file_name) < 2:
+                # 正确移除文件扩展名
+                base_name = os.path.splitext(file)
+                if not base_name or len(base_name) < 1:
+                    logger.warning("Invalid file encountered.")
                     continue
-                key = '_'.join(file_name[:-2])
-                # 文件的完整路径
-                value = os.path.join(root, file)
-                # 添加到字典中
-                if not npy_files_dict.get(key):
-                    npy_files_dict[key] = []
-                npy_files_dict[key].append(value)
+                file_name = base_name[0]
+                logger.info(f"Generating file info for file: {file}")
+                # 使用一致的分割逻辑
+                file_ele = file_name.split('_')
+                if len(file_ele) < 2:
+                    continue
+                key = '_'.join(file_ele[:-2])
+                if key:
+                    # 文件的完整路径
+                    value = os.path.join(root, file)
+                    # 添加到字典中
+                    if key not in npy_files_dict:
+                        npy_files_dict[key] = []
+                    npy_files_dict[key].append(value)
     return npy_files_dict
 def generate_map_dict(npu_file_dict, bench_file_dict, name_map_dict=None):
+    result_dict = {}
     for k, npu_file_list in npu_file_dict.items():
         bench_file_list = bench_file_dict.get(k)
         if not bench_file_list and k in name_map_dict:
@@ -176,7 +189,6 @@ def generate_map_dict(npu_file_dict, bench_file_dict, name_map_dict=None):
         bench_length = len(bench_file_list)
         if not (bench_file_list and bench_length):
             continue
-        result_dict = {}
         for i, npu_file in enumerate(npu_file_list):
             if i >= bench_length:
                 break
@@ -200,14 +212,14 @@ def do_multi_process(func, map_dict):
         df_chunks = [result_df]
         process_num = 1
     logger.info(f"Using {process_num} processes with chunk size {df_chunk_size}")
     # 分割字典
     map_chunks = split_dict(map_dict, df_chunk_size)
     # 创建结果列表和进程池
     results = []
     pool = multiprocessing.Pool(process_num)
     progress_bar = tqdm(total=len(result_df), desc="API/Module Item Compare Process", unit="row", ncols=100)
     def update_progress(size, progress_lock, extra_param=None):
@@ -216,34 +228,30 @@ def do_multi_process(func, map_dict):
     def err_call(args):
         logger.error('multiprocess compare failed! Reason: {}'.format(args))
-        try:
-            pool.close()
-        except OSError as e:
-            logger.error(f'pool terminate failed: {str(e)}')
     results = []
+    # 提交任务到进程池
+    for process_idx, (df_chunk, map_chunk) in enumerate(zip(df_chunks, map_chunks)):
+        start_idx = df_chunk_size * process_idx
+        result = pool.apply_async(
+            func,
+            args=(df_chunk, start_idx, map_chunk, lock),
+            error_callback=err_call,
+            callback=partial(update_progress, len(map_chunk), lock)
+        )
+        results.append(result)
+    pool.close()
     try:
-        # 提交任务到进程池
-        for process_idx, (df_chunk, map_chunk) in enumerate(zip(df_chunks, map_chunks)):
-            start_idx = df_chunk_size * process_idx
-            result = pool.apply_async(
-                func,
-                args=(df_chunk, start_idx, map_chunk, lock),
-                error_callback=err_call,
-                callback=partial(update_progress, len(map_chunk), lock)
-            )
-            results.append(result)
-        final_results = [r.get() for r in results]
-        # 等待所有任务完成
-        pool.close()
-        pool.join()
-        return pd.concat(final_results, ignore_index=True)
+        final_results = [r.get(timeout=3600) for r in results]
     except Exception as e:
-        logger.error(f"\nMain process error: {str(e)}")
+        logger.error(f"Task failed with exception: {e}")
         pool.terminate()
         return pd.DataFrame({})
-    finally:
-        pool.close()
+    # 等待所有任务完成
+    pool.join()
+    return pd.concat(final_results, ignore_index=True)
 def initialize_result_df(total_size):

msprobe/mindspore/compare/ms_compare.py CHANGED Viewed

@@ -35,8 +35,16 @@ def ms_compare(input_param, output_path, **kwargs):
         config.data_mapping = generate_data_mapping_by_layer_mapping(input_param, config.layer_mapping, output_path)
     is_cross_framework = check_cross_framework(input_param.get('bench_json_path'))
-    mode_config = ModeConfig(config.stack_mode, config.auto_analyze, config.fuzzy_match,
-                             config.dump_mode, config.compared_file_type)
+    config_dict = {
+        'stack_mode': config.stack_mode,
+        'auto_analyze': config.auto_analyze,
+        'fuzzy_match': config.fuzzy_match,
+        'highlight': config.highlight,
+        'dump_mode': config.dump_mode,
+        'compared_file_type': config.compared_file_type
+    }
+    mode_config = ModeConfig(**config_dict)
     mapping_config = MappingConfig(config.cell_mapping, config.api_mapping, config.data_mapping)
     ms_comparator = Comparator(read_real_data, mode_config, mapping_config, is_cross_framework)
     ms_comparator.compare_core(input_param, output_path, suffix=config.suffix)

msprobe/mindspore/compare/ms_graph_compare.py CHANGED Viewed

@@ -34,10 +34,11 @@ class RowData:
         self.basic_data = copy.deepcopy(CompareConst.MS_GRAPH_BASE)
         self.npy_data = copy.deepcopy(CompareConst.MS_GRAPH_NPY)
         self.statistic_data = copy.deepcopy(CompareConst.MS_GRAPH_STATISTIC)
+        self.csv = copy.deepcopy(CompareConst.MS_GRAPH_CSV)
         if mode == GraphMode.NPY_MODE:
             self.data = {**self.basic_data, **self.npy_data}
         else:
-            self.data = {**self.basic_data, **self.statistic_data}
+            self.data = {**self.basic_data, **self.statistic_data, **self.csv}
     def __call__(self):
         return self.data
@@ -80,8 +81,8 @@ def statistic_data_read(statistic_file_list, statistic_file_path):
     data_list = []
     statistic_data_list = []
     header_index = {
-        'Data Type': None, 'Shape': None, 'Max Value': None,
-        'Min Value': None, 'Avg Value': None, 'L2Norm Value': None
+        'Data Type': None, 'Shape': None,
+        'Max Value': None, 'Min Value': None, 'Avg Value': None, 'L2Norm Value': None
     }
     for statistic_file in statistic_file_list:
         content = read_csv(statistic_file, as_pd=False)
@@ -107,7 +108,7 @@ def statistic_data_read(statistic_file_list, statistic_file_path):
             logger.error(f'Dump file {statistic_file_path} has been modified into incorrect format!')
             raise CompareException(f'Dump file {statistic_file_path} has been modified into incorrect format!')
         compare_key = f"{data[1]}.{data[2]}.{data[5]}.{data[6]}"  # OpName, TaskId, IO, Slot
-        op_name = f"{compare_key} {statistic_file_path}"
+        op_name = f"{compare_key}"
         timestamp = int(data[4])
         result_data = [op_name, compare_key, timestamp]
         for key in header_index.keys():
@@ -115,6 +116,8 @@ def statistic_data_read(statistic_file_list, statistic_file_path):
                 result_data.append(np.nan)
             else:
                 result_data.append(data[header_index[key]])
+        csv_file = f"{statistic_file_path}"
+        result_data.append(csv_file)
         data_list.append(result_data)
     return data_list
@@ -230,6 +233,17 @@ class GraphMSComparator:
                 result[f'{prefix} min'] = np.float32(rows[f'{prefix} min'])
                 result[f'{prefix} mean'] = np.float32(rows[f'{prefix} mean'])
                 result[f'{prefix} l2norm'] = np.float32(rows[f'{prefix} l2norm'])
+                result[f'{prefix} CSV File'] = rows[f'{prefix} CSV File']
+            def calculate_relative_error(numerator, denominator):
+                """Calculates relative error, handling division by zero and NaN."""
+                if denominator != 0:
+                    result = numerator / denominator
+                    if not np.isnan(result):
+                        return str(abs(result * 100)) + "%"
+                    else:
+                        return CompareConst.NAN
+                return CompareConst.N_A
             # 使用示例
             update_result_dict(result_dict, row, 'NPU')
@@ -237,34 +251,26 @@ class GraphMSComparator:
             error_flag, error_message = statistics_data_check(result_dict)
             result_dict[CompareConst.ERROR_MESSAGE] += error_message
             if not error_flag:
-                result_dict[CompareConst.MAX_DIFF] = np.abs(
-                    result_dict[CompareConst.NPU_MAX] - result_dict[CompareConst.BENCH_MAX])
-                result_dict[CompareConst.MIN_DIFF] = np.abs(
-                    result_dict[CompareConst.NPU_MIN] - result_dict[CompareConst.BENCH_MIN])
-                result_dict[CompareConst.MEAN_DIFF] = np.abs(
-                    result_dict[CompareConst.NPU_MEAN] - result_dict[CompareConst.BENCH_MEAN])
-                result_dict[CompareConst.NORM_DIFF] = np.abs(
-                    result_dict[CompareConst.NPU_NORM] - result_dict[CompareConst.BENCH_NORM])
-                result_dict[CompareConst.MAX_RELATIVE_ERR] = result_dict[CompareConst.MAX_DIFF] / result_dict[
-                    CompareConst.BENCH_MAX] if result_dict[CompareConst.BENCH_MAX] > 0 else 0
-                if not np.isnan(result_dict[CompareConst.MAX_RELATIVE_ERR]):
-                    result_dict[CompareConst.MAX_RELATIVE_ERR] = str(
-                        result_dict[CompareConst.MAX_RELATIVE_ERR] * 100) + "%"
-                result_dict[CompareConst.MIN_RELATIVE_ERR] = result_dict[CompareConst.MIN_DIFF] / result_dict[
-                    CompareConst.BENCH_MIN] if result_dict[CompareConst.BENCH_MIN] > 0 else 0
-                if not np.isnan(result_dict[CompareConst.MIN_RELATIVE_ERR]):
-                    result_dict[CompareConst.MIN_RELATIVE_ERR] = \
-                        str(result_dict[CompareConst.MIN_RELATIVE_ERR] * 100) + "%"
-                result_dict[CompareConst.MEAN_RELATIVE_ERR] = result_dict[CompareConst.MEAN_DIFF] / result_dict[
-                    CompareConst.BENCH_MEAN] if result_dict[CompareConst.BENCH_MEAN] > 0 else 0
-                if not np.isnan(result_dict[CompareConst.MEAN_RELATIVE_ERR]):
-                    result_dict[CompareConst.MEAN_RELATIVE_ERR] = str(
-                        result_dict[CompareConst.MEAN_RELATIVE_ERR] * 100) + "%"
-                result_dict[CompareConst.NORM_RELATIVE_ERR] = result_dict[CompareConst.NORM_DIFF] / result_dict[
-                    CompareConst.BENCH_NORM] if result_dict[CompareConst.BENCH_NORM] > 0 else 0
-                if not np.isnan(result_dict[CompareConst.NORM_RELATIVE_ERR]):
-                    result_dict[CompareConst.NORM_RELATIVE_ERR] = str(
-                        result_dict[CompareConst.NORM_RELATIVE_ERR] * 100) + "%"
+                metrics = [
+                    (CompareConst.MAX_DIFF, CompareConst.NPU_MAX, CompareConst.BENCH_MAX),
+                    (CompareConst.MIN_DIFF, CompareConst.NPU_MIN, CompareConst.BENCH_MIN),
+                    (CompareConst.MEAN_DIFF, CompareConst.NPU_MEAN, CompareConst.BENCH_MEAN),
+                    (CompareConst.NORM_DIFF, CompareConst.NPU_NORM, CompareConst.BENCH_NORM),
+                ]
+                relative_error_metrics = [
+                    (CompareConst.MAX_RELATIVE_ERR, CompareConst.MAX_DIFF, CompareConst.BENCH_MAX),
+                    (CompareConst.MIN_RELATIVE_ERR, CompareConst.MIN_DIFF, CompareConst.BENCH_MIN),
+                    (CompareConst.MEAN_RELATIVE_ERR, CompareConst.MEAN_DIFF, CompareConst.BENCH_MEAN),
+                    (CompareConst.NORM_RELATIVE_ERR, CompareConst.NORM_DIFF, CompareConst.BENCH_NORM),
+                ]
+                for diff_metric, npu_metric, bench_metric in metrics:
+                    result_dict[diff_metric] = result_dict[npu_metric] - result_dict[bench_metric]
+                for rel_metric, diff_metric, bench_metric in relative_error_metrics:
+                    result_dict[rel_metric] = calculate_relative_error(result_dict[diff_metric],
+                                                                       result_dict[bench_metric])
                 magnitude_diff = result_dict[CompareConst.MAX_DIFF] / (
                         max(result_dict[CompareConst.NPU_MAX], result_dict[CompareConst.BENCH_MAX]) + 1e-10)
                 if np.isnan(result_dict[CompareConst.NPU_MAX]) and np.isnan(result_dict[CompareConst.BENCH_MAX]):
@@ -296,20 +302,8 @@ class GraphMSComparator:
             compare_result_df = self.do_multi_process(compare_result_df, mode)
             compare_result_name = add_time_with_xlsx(f"compare_result_{str(rank_id)}_{str(step_id)}")
             compare_result_path = os.path.join(os.path.realpath(self.output_path), f"{compare_result_name}")
-            self.to_excel(compare_result_df, compare_result_path)
-            logger.info(f"Compare rank: {rank_id} step: {step_id} finish. Compare result: {compare_result_path}.")
-    def to_excel(self, compare_result_df: pd.DataFrame, compare_result_path: str, slice_num=0, need_slice=False) -> int:
-        size = len(compare_result_df)
-        # sheet size cannot be larger than 1048576
-        if size < CompareConst.MAX_EXCEL_LENGTH:
-            compare_result_path = compare_result_path.replace('.xlsx', f'_slice_{slice_num}.xlsx') if \
-                need_slice else compare_result_path
             save_excel(compare_result_path, compare_result_df)
-            return slice_num + 1
-        else:
-            slice_num = self.to_excel(compare_result_df.iloc[0: size // 2], compare_result_path, slice_num, True)
-            return self.to_excel(compare_result_df.iloc[size // 2:], compare_result_path, slice_num, True)
+            logger.info(f"Compare rank: {rank_id} step: {step_id} finish. Compare result: {compare_result_path}.")
     def compare_process(self, rank_id, step_id):
         # generate data_path
@@ -331,7 +325,7 @@ class GraphMSComparator:
             bench_data_list.extend(data_list)
         if npu_mode == GraphMode.ERROR_MODE or bench_mode == GraphMode.ERROR_MODE:
-            logger.warning(f"Data_path {npu_data_path} or {bench_data_path} is not exist.")
+            logger.warning(f"Data path: npu_data_path or bench_data_path does not exist.")
             return [], ''
         if npu_mode != bench_mode:
             logger.error(f"NPU mode {npu_mode} not equal to MATCH mode {bench_mode}.")
@@ -344,14 +338,15 @@ class GraphMSComparator:
             npu_data_df = pd.DataFrame(npu_data_list,
                                        columns=[CompareConst.NPU_NAME, 'Compare Key', 'TimeStamp',
                                                 CompareConst.NPU_DTYPE, CompareConst.NPU_SHAPE,
-                                                CompareConst.NPU_MAX, CompareConst.NPU_MIN, CompareConst.NPU_MEAN,
-                                                CompareConst.NPU_NORM])
+                                                CompareConst.NPU_MAX, CompareConst.NPU_MIN,
+                                                CompareConst.NPU_MEAN, CompareConst.NPU_NORM,
+                                                CompareConst.NPU_CSV_FILE])
             bench_data_df = pd.DataFrame(bench_data_list,
                                          columns=[CompareConst.BENCH_NAME, 'Compare Key', 'TimeStamp',
-                                                  CompareConst.BENCH_DTYPE,
-                                                  CompareConst.BENCH_SHAPE, CompareConst.BENCH_MAX,
-                                                  CompareConst.BENCH_MIN, CompareConst.BENCH_MEAN,
-                                                  CompareConst.BENCH_NORM])
+                                                  CompareConst.BENCH_DTYPE, CompareConst.BENCH_SHAPE,
+                                                  CompareConst.BENCH_MAX, CompareConst.BENCH_MIN,
+                                                  CompareConst.BENCH_MEAN, CompareConst.BENCH_NORM,
+                                                  CompareConst.BENCH_CSV_FILE])
             npu_float_type = [CompareConst.NPU_MAX, CompareConst.NPU_MIN, CompareConst.NPU_MEAN, CompareConst.NPU_NORM]
             npu_float_data_df = npu_data_df[npu_float_type].astype(str)

msprobe/mindspore/debugger/debugger_config.py CHANGED Viewed

@@ -49,8 +49,9 @@ class DebuggerConfig:
         self.summary_mode = task_config.summary_mode
         self.stat_cal_mode = task_config.stat_cal_mode if hasattr(task_config, 'stat_cal_mode') else None
         self.device_stat_precision_mode = task_config.device_stat_precision_mode \
-                                          if hasattr(task_config, 'device_stat_precision_mode') else None
+            if hasattr(task_config, 'device_stat_precision_mode') else None
         self.async_dump = common_config.async_dump if common_config.async_dump else False
+        self.precision = common_config.precision if common_config.precision else Const.DUMP_PRECISION_LOW
         self.check()
         self._check_statistics_config(task_config)
         create_directory(self.dump_path)
@@ -115,18 +116,28 @@ class DebuggerConfig:
             self.check_mode = "all"
         if not isinstance(self.async_dump, bool):
             raise Exception("The parameters async_dump should be bool.")
-        if self.async_dump and self.task == Const.TENSOR:
-            if self.level_ori == Const.LEVEL_DEBUG:
-                self.list = [] # async_dump + debug level case ignore list
-            if not self.list and self.level_ori != Const.LEVEL_DEBUG:
-                raise Exception("The parameters async_dump is true in tensor task,"
-                                " the parameters list cannot be empty.")
         if self.task == Const.STRUCTURE and self.level_ori not in [Const.LEVEL_L0, Const.LEVEL_MIX]:
             logger.warning_on_rank_0(
                 f"When the task is set to structure, the level should be one of {[Const.LEVEL_L0, Const.LEVEL_MIX]}. "
                 f"If not, the default level is {Const.LEVEL_MIX}."
             )
             self.level_ori = Const.LEVEL_MIX
+        if self.async_dump:
+            if self.task == Const.TENSOR:
+                if self.level_ori == Const.LEVEL_DEBUG:
+                    self.list = []  # async_dump + debug level case ignore list
+                if not self.list and self.level_ori != Const.LEVEL_DEBUG:
+                    raise MsprobeException(
+                        MsprobeException.INVALID_PARAM_ERROR,
+                        "The parameters async_dump is true in tensor task, the parameters list cannot be empty."
+                    )
+            is_unsupported_mode = self.summary_mode == Const.MD5 or \
+                                  isinstance(self.summary_mode, list) and Const.MD5 in self.summary_mode
+            if is_unsupported_mode:
+                raise MsprobeException(
+                    MsprobeException.INVALID_PARAM_ERROR,
+                    f"The parameters async_dump is true, the parameters summary_mode cannot be/contain md5."
+                )
         return True
     def check_config_with_l2(self, is_graph_config):

mindstudio-probe 8.1.2__py3-none-any.whl → 8.2.1__py3-none-any.whl

mindstudio-probe 8.1.2__py3-none-any.whl → 8.2.1__py3-none-any.whl