PyPI - mindstudio-probe - Versions diffs - 8.1.2__py3-none-any.whl → 8.2.1__py3-none-any.whl - Mend

mindstudio-probe 8.1.2-py3-none-any.whl → 8.2.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (181)

{mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/METADATA +2 -2
{mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/RECORD +172 -147
msprobe/README.md +6 -6
msprobe/core/common/const.py +98 -41
msprobe/core/common/db_manager.py +256 -0
msprobe/core/common/file_utils.py +28 -5
msprobe/core/common/log.py +7 -0
msprobe/core/common/megatron_utils.py +59 -0
msprobe/core/common/parallel_state.py +193 -0
msprobe/core/common/utils.py +20 -13
msprobe/core/common_config.py +5 -0
msprobe/core/compare/acc_compare.py +140 -93
msprobe/core/compare/check.py +13 -0
msprobe/core/compare/compare_cli.py +64 -6
msprobe/core/compare/config.py +10 -8
msprobe/core/compare/diff_analyze/diff_analyze_threshold.yaml +14 -0
msprobe/core/compare/diff_analyze/first_diff_analyze.py +135 -0
msprobe/core/compare/diff_analyze/ignore_op_list.yaml +3 -0
msprobe/core/compare/find_first/__init__.py +0 -0
msprobe/core/compare/find_first/analyzer.py +282 -0
msprobe/core/compare/find_first/data_processor.py +35 -0
msprobe/core/compare/find_first/graph.py +188 -0
msprobe/core/compare/find_first/utils.py +189 -0
msprobe/core/compare/highlight.py +74 -101
msprobe/core/compare/layer_mapping/layer_mapping.py +14 -9
msprobe/core/compare/merge_result/merge_result.py +2 -2
msprobe/core/compare/multiprocessing_compute.py +45 -28
msprobe/core/compare/npy_compare.py +7 -10
msprobe/core/compare/utils.py +338 -130
msprobe/core/config_check/checkers/dataset_checker.py +2 -1
msprobe/core/config_check/checkers/env_args_checker.py +5 -5
msprobe/core/config_check/checkers/hyperparameter_checker.py +30 -10
msprobe/core/config_check/checkers/pip_checker.py +4 -3
msprobe/core/config_check/checkers/random_checker.py +3 -3
msprobe/core/config_check/checkers/weights_checker.py +2 -1
msprobe/core/config_check/ckpt_compare/megatron_loader.py +2 -0
msprobe/core/config_check/resource/hyperparameter.yaml +11 -1
msprobe/core/config_check/utils/hyperparameter_parser.py +7 -3
msprobe/core/config_check/utils/utils.py +10 -0
msprobe/core/data_dump/api_registry.py +49 -30
msprobe/core/data_dump/data_collector.py +71 -29
msprobe/core/data_dump/data_processor/base.py +2 -0
msprobe/core/data_dump/data_processor/mindspore_processor.py +47 -53
msprobe/core/data_dump/data_processor/pytorch_processor.py +227 -93
msprobe/core/data_dump/json_writer.py +81 -7
msprobe/core/data_dump/scope.py +4 -6
msprobe/core/hook_manager.py +129 -70
msprobe/core/monitor/csv2db.py +361 -0
msprobe/core/monitor/db_utils.py +278 -0
msprobe/core/monitor/utils.py +35 -1
msprobe/core/service.py +31 -39
msprobe/core/single_save/single_comparator.py +16 -3
msprobe/docs/01.installation.md +51 -19
msprobe/docs/02.config_introduction.md +16 -20
msprobe/docs/03.config_examples.md +26 -0
msprobe/docs/04.kernel_dump_PyTorch.md +1 -1
msprobe/docs/05.data_dump_PyTorch.md +6 -2
msprobe/docs/06.data_dump_MindSpore.md +44 -7
msprobe/docs/07.accuracy_checker_PyTorch.md +1 -1
msprobe/docs/10.accuracy_compare_PyTorch.md +124 -44
msprobe/docs/11.accuracy_compare_MindSpore.md +75 -7
msprobe/docs/14.data_parse_PyTorch.md +1 -1
msprobe/docs/19.monitor.md +94 -7
msprobe/docs/21.visualization_PyTorch.md +71 -101
msprobe/docs/22.visualization_MindSpore.md +69 -119
msprobe/docs/23.generate_operator_PyTorch.md +1 -1
msprobe/docs/25.tool_function_introduction.md +0 -1
msprobe/docs/26.data_dump_PyTorch_baseline.md +7 -7
msprobe/docs/28.debugger_save_instruction.md +184 -81
msprobe/docs/29.data_dump_MSAdapter.md +6 -0
msprobe/docs/31.config_check.md +4 -2
msprobe/docs/36.calculation_result_change.md +75 -0
msprobe/docs/FAQ.md +22 -1
msprobe/docs/data_dump_MindSpore/dynamic_graph_quick_start_example.md +6 -2
msprobe/docs/img/compare_result.png +0 -0
msprobe/docs/img/visualization/vis_browser_1.png +0 -0
msprobe/docs/img/visualization/vis_match_info.png +0 -0
msprobe/docs/img/visualization/vis_precision_info.png +0 -0
msprobe/docs/img/visualization/vis_search_info.png +0 -0
msprobe/docs/img/visualization/vis_show_info.png +0 -0
msprobe/docs/img/visualization/vis_showcase.png +0 -0
msprobe/docs/img/visualization/vis_unmatch_info.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/1.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/2.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/3.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/4.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/5.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/6.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/7.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory-qwen25vl.txt +59 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory1.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory2.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed-mm-qwen25vl.txt +80 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed1.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed2.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactory_mapping.md +330 -0
msprobe/mindspore/__init__.py +1 -1
msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +1 -1
msprobe/mindspore/api_accuracy_checker/api_runner.py +9 -6
msprobe/mindspore/api_accuracy_checker/compute_element.py +18 -12
msprobe/mindspore/cell_processor.py +64 -25
msprobe/mindspore/common/utils.py +51 -7
msprobe/mindspore/compare/common_dir_compare.py +45 -37
msprobe/mindspore/compare/ms_compare.py +10 -2
msprobe/mindspore/compare/ms_graph_compare.py +47 -52
msprobe/mindspore/debugger/debugger_config.py +18 -7
msprobe/mindspore/debugger/precision_debugger.py +16 -12
msprobe/mindspore/dump/cell_dump_process.py +130 -68
msprobe/mindspore/dump/cell_dump_with_insert_gradient.py +10 -2
msprobe/mindspore/dump/graph_mode_cell_dump.py +35 -9
msprobe/mindspore/dump/graph_tensor_dump.py +11 -0
msprobe/mindspore/dump/hook_cell/api_register.py +19 -20
msprobe/mindspore/dump/hook_cell/hook_cell.py +12 -34
msprobe/mindspore/dump/hook_cell/ms_hook_manager.py +142 -21
msprobe/mindspore/dump/kernel_kbyk_dump.py +24 -0
msprobe/mindspore/exception_dump/__init__.py +0 -0
msprobe/mindspore/exception_dump/exception_dump_tool_factory.py +51 -0
msprobe/mindspore/exception_dump/kernel_graph_exception_dump.py +57 -0
msprobe/mindspore/free_benchmark/api_pynative_self_check.py +5 -4
msprobe/mindspore/mindspore_service.py +2 -2
msprobe/mindspore/mindtorch/mindtorch_adaptor.py +12 -7
msprobe/mindspore/monitor/features.py +82 -0
msprobe/mindspore/monitor/module_hook.py +168 -10
msprobe/mindspore/monitor/utils.py +27 -1
msprobe/mindspore/ms_config.py +12 -4
msprobe/mindspore/overflow_check/overflow_check_tool_factory.py +1 -1
msprobe/mindspore/task_handler_factory.py +3 -1
msprobe/nan_analyze/graph.py +1 -1
msprobe/pytorch/api_accuracy_checker/common/config.py +3 -36
msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +0 -24
msprobe/pytorch/api_accuracy_checker/compare/compare.py +2 -12
msprobe/pytorch/api_accuracy_checker/config.yaml +1 -6
msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +2 -2
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +12 -132
msprobe/pytorch/common/utils.py +1 -21
msprobe/pytorch/compare/pt_compare.py +10 -2
msprobe/pytorch/{hook_module/jit_script_wrapper.py → compare/pt_diff_analyze.py} +3 -15
msprobe/pytorch/compare/utils.py +2 -1
msprobe/pytorch/debugger/debugger_config.py +18 -23
msprobe/pytorch/dump/module_dump/hook_wrapper.py +10 -7
msprobe/pytorch/dump/module_dump/module_processer.py +41 -19
msprobe/pytorch/free_benchmark/main.py +7 -4
msprobe/pytorch/hook_module/api_register.py +62 -24
msprobe/pytorch/hook_module/hook_module.py +9 -29
msprobe/pytorch/hook_module/pt_hook_manager.py +84 -15
msprobe/pytorch/hook_module/script_wrapper.py +140 -0
msprobe/pytorch/hook_module/support_wrap_ops.yaml +6 -0
msprobe/pytorch/monitor/csv2tb.py +1 -1
msprobe/pytorch/monitor/features.py +94 -0
msprobe/pytorch/monitor/module_hook.py +221 -81
msprobe/pytorch/monitor/module_metric.py +27 -1
msprobe/pytorch/monitor/optimizer_collect.py +109 -4
msprobe/pytorch/online_dispatch/dispatch.py +42 -24
msprobe/pytorch/online_dispatch/dump_compare.py +1 -1
msprobe/pytorch/parse_tool/lib/visualization.py +0 -1
msprobe/pytorch/pt_config.py +2 -51
msprobe/pytorch/pytorch_service.py +7 -14
msprobe/visualization/builder/graph_builder.py +192 -63
msprobe/visualization/builder/graph_merger.py +986 -0
msprobe/visualization/builder/msprobe_adapter.py +17 -15
msprobe/visualization/compare/graph_comparator.py +26 -16
msprobe/visualization/db_utils.py +252 -0
msprobe/visualization/graph/base_node.py +2 -22
msprobe/visualization/graph/distributed_analyzer.py +12 -12
msprobe/visualization/graph/graph.py +44 -16
msprobe/visualization/graph_service.py +143 -59
msprobe/visualization/utils.py +103 -4
msprobe/docs/08.accuracy_checker_online_PyTorch.md +0 -295
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py +0 -205
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +0 -378
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +0 -239
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/dump_dispatch.py +0 -115
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +0 -250
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/torch_ops_config.yaml +0 -63
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/utils.py +0 -198
msprobe/pytorch/attl_manager.py +0 -65
{mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/LICENSE +0 -0
{mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/WHEEL +0 -0
{mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/entry_points.txt +0 -0
{mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/top_level.txt +0 -0
/msprobe/{pytorch/api_accuracy_checker/tensor_transport_layer → core/compare/diff_analyze}/__init__.py +0 -0

msprobe/mindspore/debugger/precision_debugger.py CHANGED Viewed

@@ -17,6 +17,7 @@ import os
 from collections import defaultdict, namedtuple
 import mindspore as ms
+from mindspore.ops.operations import _inner_ops as inner
 from mindspore._c_expression import MSContext
 from msprobe.core.common.const import Const, MsgConst
@@ -28,7 +29,8 @@ from msprobe.mindspore.common.const import Const as MsConst
 from msprobe.mindspore.common.utils import (
     set_register_backward_hook_functions,
     check_save_param,
-    is_graph_mode_cell_dump_allowed
+    is_graph_mode_cell_dump_allowed,
+    wrap_backward_hook_call_func
 )
 from msprobe.mindspore.debugger.debugger_config import DebuggerConfig
 from msprobe.mindspore.dump.graph_mode_cell_dump import GraphModeCellDump
@@ -41,6 +43,7 @@ from msprobe.mindspore.task_handler_factory import TaskHandlerFactory
 try:
     from mindspore._c_expression import _dump_start, _dump_stop, _dump_step, _set_init_iter, _dump_set_dynamic
+    import mindspore as ms
 except ImportError:
     enable_dynamic_kbyk_dump = False
 else:
@@ -80,6 +83,9 @@ class PrecisionDebugger(BasePrecisionDebugger):
         if self._is_kernel_dump() and not self.task_config.is_regex_valid:
             raise ValueError('Illegal regular expressions exist in the list.')
+        setattr(inner.CellBackwardHook, '__call__',
+                wrap_backward_hook_call_func(getattr(inner.CellBackwardHook, '__call__')))
         if self._is_kernel_dump() and _msprobe_c:
             os.environ["MS_HOOK_ENABLE"] = "on"
             _msprobe_c._PrecisionDebugger(framework="MindSpore", config_path=config_path)
@@ -90,7 +96,7 @@ class PrecisionDebugger(BasePrecisionDebugger):
         Runtime.step_count = 0
         Runtime.is_running = False
-        if enable_dynamic_kbyk_dump:
+        if enable_dynamic_kbyk_dump and self.config.level_ori == Const.LEVEL_L2:
             _dump_set_dynamic()
     @staticmethod
@@ -160,7 +166,8 @@ class PrecisionDebugger(BasePrecisionDebugger):
                 instance.service.stop()
         else:
             Runtime.is_running = False
-        if enable_dynamic_kbyk_dump:
+        if enable_dynamic_kbyk_dump and instance.config.level_ori == Const.LEVEL_L2:
+            ms.runtime.synchronize()
             _dump_stop()
         if cls._is_kernel_dump() and _msprobe_c:
             _msprobe_c._PrecisionDebugger().stop()
@@ -175,8 +182,8 @@ class PrecisionDebugger(BasePrecisionDebugger):
             with ThreadSafe():
                 instance.service.step()
         if is_graph_mode_cell_dump_allowed(instance.config):
-            GraphModeCellDump.step()
-        if enable_dynamic_kbyk_dump:
+            GraphModeCellDump.step(instance.config.dump_path, instance.config.step, instance.config.task)
+        if enable_dynamic_kbyk_dump and instance.config.level_ori == Const.LEVEL_L2:
             _dump_step(1)
         if cls._is_kernel_dump() and _msprobe_c:
             _msprobe_c._PrecisionDebugger().step()
@@ -207,12 +214,9 @@ class PrecisionDebugger(BasePrecisionDebugger):
             check_save_param(variable, name, save_backward)
         except ValueError:
             return
-        instance.config.execution_mode = cls._get_execution_mode()
-        if cls._need_service():
-            if not instance.service:
-                instance.service = MindsporeService(instance.config)
-            instance.service.save(variable, name, save_backward)
+        if not instance.service:
+            instance.service = MindsporeService(instance.config)
+        instance.service.save(variable, name, save_backward)
     @classmethod
     def _need_service(cls):
@@ -220,7 +224,7 @@ class PrecisionDebugger(BasePrecisionDebugger):
         if not instance:
             raise Exception(MsgConst.NOT_CREATED_INSTANCE)
         if instance.config.level_ori == Const.LEVEL_L2:
-            return False
+            return not instance._is_graph_dump(instance.config)
         if instance.config.execution_mode != MsConst.PYNATIVE_MODE:
             return False
         else:

msprobe/mindspore/dump/cell_dump_process.py CHANGED Viewed

@@ -38,15 +38,19 @@ DEFAULT_RANK_DIR = "rank0"
 KEY_LAYERS = "layers"
 construct = {}
 cell_list = []
+free_cells = {}
+parent_cell_types = {}
 KEY_SIDE_EFFECT = "side_effect_io"
 KEY_TOPLAYER = "TopLayer"
 KEY_FORWARD = CoreConst.FORWARD
 KEY_BACKWARD = CoreConst.BACKWARD
 KEY_INPUT = CoreConst.INPUT
 KEY_OUTPUT = CoreConst.OUTPUT
-KEY_DUMP_TENSOR_DATA = "dump_tensor_data_"
+KEY_DUMP_TENSOR_DATA = "dump_tensor_data/"
 KEY_STATISTIC_CSV = "statistic.csv"
 KEY_TD_FLAG = "td_flag"
+# 设置落盘文件检测超时时间
+TIMEOUT = 600
 td = ops.TensorDump()
 if (ms.__version__ >= "2.5.0"):
     td_in = ops.TensorDump("in")
@@ -219,8 +223,16 @@ def cell_construct_wrapper(func, self):
 def sort_filenames(path):
     filenames = os.listdir(path)
     id_pattern = re.compile(rf'{CoreConst.REPLACEMENT_CHARACTER}(\d+){CoreConst.NUMPY_SUFFIX}$')
-    filenames.sort(key=lambda x: int(id_pattern.findall(x)[0]))
-    return filenames
+    # 只保留能提取到数字id的文件，避免数组越界
+    valid_files = []
+    for filename in filenames:
+        match = id_pattern.findall(filename)
+        if match and match[0].isdigit():
+            valid_files.append(filename)
+        else:
+            logger.warning(f"File {filename} does not match the expected pattern and will be ignored.")
+    valid_files.sort(key=lambda x: int(id_pattern.findall(x)[0]))
+    return valid_files
 def rename_filename(path="", data_df=None):
@@ -294,7 +306,24 @@ def check_relation(cell_name, parent_cell_name):
     return False
+def get_parent_cell_name(child_cell_name):
+    parent_cell_name = ''
+    last_dot_index = child_cell_name.rfind(CoreConst.SEP)
+    if last_dot_index == -1:
+        return parent_cell_name
+    layers_pattern = rf"{CoreConst.SEP}{KEY_LAYERS}{CoreConst.SEP}\d+$"
+    if re.search(layers_pattern, child_cell_name):
+        parent_cell_name = re.sub(layers_pattern, '', child_cell_name)
+    else:
+        parent_cell_name = child_cell_name[:last_dot_index]
+    return parent_cell_name
 def get_construct(cell_list_input):
+    global free_cells, parent_cell_types
     for cell in cell_list_input:
         cell_name = get_cell_name(cell)
         cell_data_mode = get_data_mode(cell)
@@ -308,7 +337,20 @@ def get_construct(cell_list_input):
                 found_flag = True
                 break
         if not found_flag:
-            construct.update({cell: None})
+            cell_name_with_mode = f'{cell_name}{CoreConst.SEP}{cell_data_mode}'
+            if cell_name_with_mode in free_cells:
+                construct.update({cell: free_cells.get(cell_name_with_mode)})
+                continue
+            parent_cell = None
+            parent_cell_name = get_parent_cell_name(cell_name)
+            if parent_cell_name and cell_name in parent_cell_types:
+                parent_cell = CoreConst.SEP.join([CoreConst.CELL, parent_cell_name, parent_cell_types.get(cell_name)])
+                second_last_dot_index = cell.rfind(CoreConst.SEP, 0, cell.rfind(CoreConst.SEP))
+                parent_cell = f'{parent_cell}{cell[second_last_dot_index:]}'
+                free_cells[cell_name_with_mode] = parent_cell
+            construct.update({cell: parent_cell})
 def generate_construct(path):
@@ -462,7 +504,7 @@ def process_csv(path):
             if col_name in columns:
                 value = convert_special_values(row[col_name])
                 tensor_json[json_key] = value
         if io_key == KEY_INPUT:
             data_info.append([op_name, CoreConst.INPUT_ARGS, tensor_json])
         elif io_key == KEY_OUTPUT:
@@ -534,59 +576,75 @@ def generate_stack_info(path):
     logger.info(f"Stack data saved to {json_path}")
-def is_download_finished(directory, interval=3):
+def is_download_finished(directory, save_flag):
     """
     判断指定目录在一段时间后是否有数据被下载完成
     :param directory: 指定目录的路径
-    :param interval: 检查的时间间隔（秒），默认为 3 秒
+    :param save_flag: 数据落盘完成后的标志文件
     :return: 如有数据被下载完成返回 True，否则返回 False
     """
+    # 设定一定的延迟间隔，避免频繁进行磁盘的io读取操作
+    time.sleep(0.5)
+    logger.info("Waiting for download...")
     # 检查目录是否存在
     if not os.path.exists(directory):
         logger.warning(f"The specified directory {directory} does not exist.")
         return False
-    initial_modification_time = os.path.getmtime(directory)
-    time.sleep(interval)
-    current_modification_time = os.path.getmtime(directory)
-    # 比较初始和当前修改时间
-    if current_modification_time > initial_modification_time:
-        return False
-    else:
-        return True
+    # 遍历当前目录中的所有条目
+    for entry_path in os.listdir(directory):
+        if entry_path.startswith(save_flag):
+            return True
+    return False
+def process_step(dump_path, flag_path, step, step_list):
+    if step not in step_list:
+        return
+    if not os.path.exists(dump_path):
+        logger.warning('No grap cell data is dumped.')
+        create_directory(dump_path)
+        return
-def process(dump_path):
     rank_id = os.environ.get('RANK_ID')
     rank_dir = DEFAULT_RANK_DIR
     if rank_id is not None:
         rank_dir = CoreConst.RANK + str(rank_id)
-    step_dir_list = os.listdir(dump_path)
-    for step_dir in step_dir_list:
-        step_path = os.path.join(dump_path, step_dir)
-        rank_path = os.path.join(step_path, rank_dir)
-        npy_path = os.path.join(rank_path, CoreConst.DUMP_TENSOR_DATA)
-        while True:
-            is_finished = is_download_finished(npy_path)
-            if not is_finished:
-                logger.info("There is data being downloaded in the specified directory, continue checking...")
-            else:
-                logger.info("There is no data being downloaded in the specified directory, Stop checking.")
-                break
-        logger.info("==========Start processing data that has already been stored on the disk!==========")
-        rename_filename(path=npy_path)
-        generate_construct(npy_path)
-        generate_dump_info(npy_path)
-        generate_stack_info(npy_path)
-        # 单卡场景，rank目录名称为rank
-        if rank_id is None:
-            new_rank_path = os.path.join(step_path, CoreConst.RANK)
-            try:
-                move_directory(rank_path, new_rank_path)
-                logger.info(f"Directory was successfully renamed to: {new_rank_path}")
-            except Exception as e:
-                logger.warning(f"Failed to renamed to {new_rank_path}: {e}")
-        logger.info("==========JSON file generation completed!==========")
+    step_dir = CoreConst.STEP + str(step)
+    step_path = os.path.join(dump_path, step_dir)
+    rank_path = os.path.join(step_path, rank_dir)
+    npy_path = os.path.join(rank_path, CoreConst.DUMP_TENSOR_DATA)
+    save_finish_flag = f"step_{step}"
+    start_time = time.time()
+    while True:
+        is_finished = is_download_finished(flag_path, save_finish_flag)
+        if not is_finished:
+            logger.info("There is data being downloaded in the specified directory, continue checking...")
+        else:
+            logger.info("There is no data being downloaded in the specified directory, Stop checking.")
+            break
+        elapsed_time = time.time() - start_time
+        if elapsed_time > TIMEOUT:
+            logger.error(f"Check timed out after {TIMEOUT} seconds. Exiting.")
+            return
+    logger.info(f"==========Start processing step_{step}'s data that has already been stored on the disk!==========")
+    rename_filename(path=npy_path)
+    generate_construct(npy_path)
+    generate_dump_info(npy_path)
+    generate_stack_info(npy_path)
+    # 单卡场景，rank目录名称为rank
+    if rank_id is None:
+        new_rank_path = os.path.join(step_path, CoreConst.RANK)
+        try:
+            move_directory(rank_path, new_rank_path)
+            logger.info(f"Directory was successfully renamed to: {new_rank_path}")
+        except Exception as e:
+            logger.warning(f"Failed to renamed to {new_rank_path}: {e}")
+    logger.info(f"==========Step_{step}'s JSON file generation completed!==========")
 # 删除csv文件中每行数据最后面的逗号
@@ -644,7 +702,15 @@ def merge_file(dump_path, rank_dir, file_dict):
                          " and the index is out of bounds.")
-def process_statistics(dump_path):
+def process_statistics_step(dump_path, step, step_list):
+    if step_list and step not in step_list:
+        return
+    if not os.path.exists(dump_path):
+        logger.warning('No grap cell data is dumped.')
+        create_directory(dump_path)
+        return
     rank_id = os.environ.get('RANK_ID')
     rank_dir_kbk = "rank_0"
     if rank_id is not None:
@@ -673,25 +739,24 @@ def process_statistics(dump_path):
     rank_dir = rank_dir_kbk.replace(CoreConst.REPLACEMENT_CHARACTER, '')
     dir_list = os.listdir(dump_path)
-    step_dir_list = [d for d in dir_list if d.startswith(CoreConst.STEP)]
-    for step_dir in step_dir_list:
-        step_path = os.path.join(dump_path, step_dir)
-        rank_path = os.path.join(step_path, rank_dir)
-        csv_path = os.path.join(rank_path, KEY_STATISTIC_CSV)
-        logger.info("==========Start processing data csv!==========")
-        generate_construct(csv_path)
-        generate_dump_info(csv_path)
-        generate_stack_info(csv_path)
-        remove_path(rank_path_kbk)
-        # 单卡场景，rank目录名称为rank
-        if rank_id is None:
-            new_rank_path = os.path.join(step_path, CoreConst.RANK)
-            try:
-                move_directory(rank_path, new_rank_path)
-                logger.info(f"Directory was successfully renamed to: {new_rank_path}")
-            except Exception as e:
-                logger.warning(f"Failed to renamed to {new_rank_path}: {e}")
-        logger.info("==========JSON file generation completed!==========")
+    step_dir = CoreConst.STEP + str(step)
+    step_path = os.path.join(dump_path, step_dir)
+    rank_path = os.path.join(step_path, rank_dir)
+    csv_path = os.path.join(rank_path, KEY_STATISTIC_CSV)
+    logger.info("==========Start processing data csv!==========")
+    generate_construct(csv_path)
+    generate_dump_info(csv_path)
+    generate_stack_info(csv_path)
+    remove_path(rank_path_kbk)
+    # 单卡场景，rank目录名称为rank
+    if rank_id is None:
+        new_rank_path = os.path.join(step_path, CoreConst.RANK)
+        try:
+            move_directory(rank_path, new_rank_path)
+            logger.info(f"Directory was successfully renamed to: {new_rank_path}")
+        except Exception as e:
+            logger.warning(f"Failed to renamed to {new_rank_path}: {e}")
+    logger.info("==========JSON file generation completed!==========")
 def get_yaml_keys(yaml_data):
@@ -786,7 +851,7 @@ def create_kbyk_json(dump_path, summary_mode, step):
 def start(config: CellDumpConfig):
-    global dump_task
+    global dump_task, parent_cell_types
     dump_task = config.task
     net = config.net
     dump_path = config.dump_path
@@ -814,7 +879,7 @@ def start(config: CellDumpConfig):
         return
     if isinstance(net, nn.Cell):
-        net = (('', net),)
+        net = (('', net, None),)
     td_config_path = ""
     try:
@@ -837,6 +902,7 @@ def start(config: CellDumpConfig):
     black_list = ["grad_reducer", ""]
     for name_and_model in net:
+        parent_cell_types[name_and_model[0]] = name_and_model[2].__class__.__name__
         for name, cell in name_and_model[1].cells_and_names(name_prefix=name_and_model[0]):
             class_name = cell.__class__.__name__
             # 跳过黑名单cell
@@ -871,7 +937,3 @@ def start(config: CellDumpConfig):
             cell.data_mode = data_mode
     logger.info("==========The cell_dump_process_start phase is Finished!==========")
-    if dump_task == CoreConst.TENSOR:
-        atexit.register(process, dump_path=dump_path)
-    if dump_task == CoreConst.STATISTICS:
-        atexit.register(process_statistics, dump_path=dump_path)

msprobe/mindspore/dump/cell_dump_with_insert_gradient.py CHANGED Viewed

@@ -197,8 +197,16 @@ def cell_construct_wrapper(func, self):
 def sort_filenames(path):
     filenames = os.listdir(path)
     id_pattern = re.compile(rf'{CoreConst.REPLACEMENT_CHARACTER}(\d+){CoreConst.NUMPY_SUFFIX}$')
-    filenames.sort(key=lambda x: int(id_pattern.findall(x)[0]))
-    return filenames
+    # 只保留能提取到数字id的文件，避免数组越界
+    valid_files = []
+    for filename in filenames:
+        match = id_pattern.findall(filename)
+        if match and match[0].isdigit():
+            valid_files.append(filename)
+        else:
+            logger.warning(f"File {filename} does not match the expected pattern and will be ignored.")
+    valid_files.sort(key=lambda x: int(id_pattern.findall(x)[0]))
+    return valid_files
 def rename_filename(path="", data_df=None):

msprobe/mindspore/dump/graph_mode_cell_dump.py CHANGED Viewed

@@ -14,7 +14,8 @@
 # limitations under the License.
 import os
+import glob
+import tempfile
 import mindspore as ms
 from mindspore import hal, ops, Tensor
 from mindspore.ops.primitive import _run_op
@@ -28,15 +29,20 @@ import msprobe.mindspore.dump.cell_dump_process as cellDumperWithDumpGradient
 import msprobe.mindspore.dump.cell_dump_with_insert_gradient as cellDumperWithInsertGradient
 tensordump_flag = True
+DEFAULT_RANK_DIR = "rank0"
 try:
     from mindspore._c_expression import _tensordump_set_step
 except ImportError:
     tensordump_flag = False
+graph_step_flag = True
+try:
+    from mindspore._c_expression import _dump_step
+except ImportError:
+    graph_step_flag = False
-class GraphModeCellDump:
-    task = CoreConst.STATISTICS
+class GraphModeCellDump:
     def __init__(self, config: DebuggerConfig, model, strict=True):
         self.net = model
         self.white_list = []
@@ -49,20 +55,40 @@ class GraphModeCellDump:
         self.list = config.list
         self.data_mode = config.data_mode
         self.file_format = config.file_format
-        GraphModeCellDump.task = config.task
         self.summary_mode = config.summary_mode
+        self.task = config.task
         self.check_config(strict)
         self.set_step()
     @staticmethod
-    def step():
+    def step(dump_path, step_list, task):
         # 更新TensorDump Step
-        if GraphModeCellDump.task == CoreConst.TENSOR:
+        if task == CoreConst.TENSOR:
             hal.synchronize()
             temp_tensor = ms.Tensor([1], dtype=ms.float32)
-            step_flag = "<tensordump-update-step>"
-            _run_op(ops.TensorDump(), "TensorDump", (step_flag, temp_tensor))
-            ops.tensordump(step_flag, temp_tensor)
+            rank_id = os.environ.get('RANK_ID')
+            rank_dir = DEFAULT_RANK_DIR
+            if rank_id is not None:
+                rank_dir = CoreConst.RANK + str(rank_id)
+            with tempfile.TemporaryDirectory(dir=dump_path, prefix=rank_dir) as temp_dir:
+                save_file_flag = f"{temp_dir}/step_{Runtime.step_count}"
+                _run_op(ops.TensorDump(), "TensorDump", (save_file_flag, temp_tensor))
+                step_flag = "<tensordump-update-step>"
+                _run_op(ops.TensorDump(), "TensorDump", (step_flag, temp_tensor))
+                ops.tensordump(step_flag, temp_tensor)
+                cellDumperWithDumpGradient.process_step(dump_path, temp_dir, Runtime.step_count, step_list)
+        # 更新静态图KBK dump的step数
+        if task == CoreConst.STATISTICS:
+            if not graph_step_flag:
+                raise Exception(
+                    "Importing _dump_step failed, "
+                    "please use the latest version package of MindSpore."
+                )
+            _dump_step(1)
+            cellDumperWithDumpGradient.process_statistics_step(dump_path, Runtime.step_count, step_list)
     def check_config(self, strict):
         if not self.net:

msprobe/mindspore/dump/graph_tensor_dump.py CHANGED Viewed

@@ -16,6 +16,8 @@
 import os
 from collections import OrderedDict
 import mindspore as ms
+from mindspore import hal, ops, Tensor
+from mindspore.ops.primitive import _run_op
 def _iterate_items(data):
@@ -121,3 +123,12 @@ def save_grad(save_dir, name, data):
     dump_dir = generate_dump_dir(save_dir)
     suffix_name = name + '_grad'
     return _SaveGradCell(dump_dir, suffix_name)(data)
+def step():
+    hal.synchronize()
+    temp_tensor = Tensor([1], dtype=ms.float32)
+    step_flag = "<tensordump-update-step>"
+    _run_op(ops.TensorDump(), "TensorDump", (step_flag, temp_tensor))
+    ops.tensordump(step_flag, temp_tensor)
+    hal.synchronize()

msprobe/mindspore/dump/hook_cell/api_register.py CHANGED Viewed

@@ -40,36 +40,36 @@ cur_path = os.path.dirname(os.path.realpath(__file__))
 if not is_mindtorch():
     _api_types = {
         Const.MS_FRAMEWORK: {
-            Const.MS_API_TYPE_OPS: (ops, (ops,)),
-            Const.MS_API_TYPE_TENSOR: (Tensor, (Tensor,)),
-            Const.MS_API_TYPE_MINT: (mint, (mint,)),
-            Const.MS_API_TYPE_MINT_FUNC: (functional, (functional,)),
-            Const.MS_API_TYPE_COM: (comm_func, (comm_func,)),
-            Const.MS_API_TYPE_MINT_DIST: (distributed, (distributed,))
+            Const.MS_API_TYPE_OPS: ((ops,), (ops,)),
+            Const.MS_API_TYPE_TENSOR: ((Tensor,), (Tensor,)),
+            Const.MS_API_TYPE_MINT: ((mint,), (mint,)),
+            Const.MS_API_TYPE_MINT_FUNC: ((functional,), (functional,)),
+            Const.MS_API_TYPE_COM: ((comm_func,), (comm_func,)),
+            Const.MS_API_TYPE_MINT_DIST: ((distributed,), (distributed,))
         }
     }
     if stub_tensor_existed:
         _api_types.get(Const.MS_FRAMEWORK).update(
-            {Const.MS_API_TYPE_STUB_TENSOR: (StubTensor, (StubTensor,))}
+            {Const.MS_API_TYPE_STUB_TENSOR: ((StubTensor,), (StubTensor,))}
         )
     _supported_api_list_path = (os.path.join(cur_path, MsConst.SUPPORTED_API_LIST_FILE),)
-    _backlist = []
+    _blacklist = []
 else:
     import torch
     import torch_npu
     _api_types = {
         Const.MT_FRAMEWORK: {
-            Const.PT_API_TYPE_FUNCTIONAL: (torch.nn.functional, (torch.nn.functional,)),
-            Const.PT_API_TYPE_TENSOR: (torch.Tensor, (torch.Tensor,)),
-            Const.PT_API_TYPE_TORCH: (torch, (torch,)),
-            Const.PT_API_TYPE_NPU: (torch_npu, (torch_npu,)),
-            Const.PT_API_TYPE_DIST: (torch.distributed, (torch.distributed, torch.distributed.distributed_c10d))
+            Const.PT_API_TYPE_FUNCTIONAL: ((torch.nn.functional,), (torch.nn.functional,)),
+            Const.PT_API_TYPE_TENSOR: ((torch.Tensor,), (torch.Tensor,)),
+            Const.PT_API_TYPE_TORCH: ((torch,), (torch,)),
+            Const.PT_API_TYPE_NPU: ((torch_npu,), (torch_npu,)),
+            Const.PT_API_TYPE_DIST: ((torch.distributed,), (torch.distributed, torch.distributed.distributed_c10d))
         }
     }
     _supported_api_list_path = (os.path.join(cur_path, '../../../pytorch/hook_module',
                                              MsConst.SUPPORTED_API_LIST_FILE),)
-    _backlist = [f'{Const.PT_API_TYPE_TENSOR}.__setitem__']
+    _blacklist = []
 _inner_used_api = {
     Const.MS_FRAMEWORK + Const.SEP + Const.MS_API_TYPE_OPS: (
@@ -87,12 +87,11 @@ _inner_used_api = {
 class ApiTemplate(HOOKCell):
     def __init__(self, api_name, api_func, prefix, hook_build_func):
         self.api_name = api_name
-        self.api_func = api_func
         self.prefix_api_name = prefix + Const.SEP + str(api_name.split(Const.SEP)[-1]) + Const.SEP
-        super().__init__(hook_build_func)
         distributed_prefix = Const.DIST_API_TYPE_PREFIX if is_mindtorch() else Const.MINT_DIST_API_TYPE_PREFIX
-        if prefix == distributed_prefix:
-            self.op_is_distributed = True
+        self.op_is_distributed = prefix == distributed_prefix
+        super().__init__(hook_build_func)
+        self.api_func = api_func
     @staticmethod
     def async_to_sync(output):
@@ -161,7 +160,7 @@ def get_api_register(return_new=False):
             _inner_used_api,
             _supported_api_list_path,
             ApiTemplate,
-            _backlist
+            _blacklist
         )
     global api_register
@@ -171,6 +170,6 @@ def get_api_register(return_new=False):
             _inner_used_api,
             _supported_api_list_path,
             ApiTemplate,
-            _backlist
+            _blacklist
         )
     return api_register

msprobe/mindspore/dump/hook_cell/hook_cell.py CHANGED Viewed

@@ -19,8 +19,6 @@ from collections import defaultdict
 import mindspore as ms
 from mindspore import nn
-from msprobe.core.common.runtime import Runtime
-from msprobe.core.common.utils import ThreadSafe
 from msprobe.mindspore.common.utils import is_mindtorch, register_backward_hook_functions
 ms_version = ms.__version__
@@ -37,48 +35,28 @@ def get_cell_count(name):
 def __init__(self, hook_build_func) -> None:
     super(HOOKCell, self).__init__()
     self.msprobe_input_kwargs = {}
-    self.tid = threading.get_ident()
-    self.stop_hook = HOOKCell.inner_stop_hook.get(self.tid, False)
-    if not self.stop_hook:
-        self.forward_data_collected = False
-        if not Runtime.is_running:
-            return
-        prefix = self.prefix_api_name if hasattr(self, "prefix_api_name") else ""
-        ThreadSafe.acquire()
-        if callable(hook_build_func):
-            hook_set = hook_build_func(prefix)
-            if ms_version < "2.6.0" and not is_mindtorch():
-                getattr(self, "_forward_pre_hook", {})[id(self)] = hook_set.forward_pre_hook
+    prefix = self.prefix_api_name if hasattr(self, "prefix_api_name") else ""
+    if callable(hook_build_func):
+        hook_set = hook_build_func(prefix)
+        if ms_version < "2.6.0" and not is_mindtorch():
+            getattr(self, "_forward_pre_hook", {})[id(self)] = hook_set.forward_pre_hook
+            if hook_set.forward_hook:
                 getattr(self, "_forward_hook", {})[id(self)] = hook_set.forward_hook
-            else:
-                self.register_forward_pre_hook(hook_set.forward_pre_hook)
+        else:
+            self.register_forward_pre_hook(hook_set.forward_pre_hook)
+            if hook_set.forward_hook:
                 self.register_forward_hook(hook_set.forward_hook)
-            register_backward_hook_functions["full"](self, hook_set.backward_hook)
-            register_backward_hook_functions["pre"](self, hook_set.backward_pre_hook)
-# 重载call，加全局标志。
 def __call__(self, *args, **kwargs):
-    changed = False
-    if not self.stop_hook:
-        HOOKCell.inner_stop_hook[self.tid] = True
-        changed = True
-    try:
-        self.msprobe_input_kwargs = kwargs
-        out = super(HOOKCell, self).__call__(*args, **kwargs)
-    except Exception as e:
-        raise e
-    finally:
-        if changed:
-            HOOKCell.inner_stop_hook[self.tid] = False
+    tid = threading.get_ident()
+    self.msprobe_input_kwargs[tid] = kwargs
+    out = super(HOOKCell, self).__call__(*args, **kwargs)
     return out
 hook_cell_dict = {
     "cell_count": defaultdict(int),
-    "inner_stop_hook": defaultdict(bool),
     "add_cell_count": staticmethod(add_cell_count),
     "get_cell_count": staticmethod(get_cell_count),
     "__init__": __init__,

mindstudio-probe 8.1.2__py3-none-any.whl → 8.2.1__py3-none-any.whl

mindstudio-probe 8.1.2__py3-none-any.whl → 8.2.1__py3-none-any.whl