mindstudio-probe 8.3.0__py3-none-any.whl → 8.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. {mindstudio_probe-8.3.0.dist-info → mindstudio_probe-8.3.2.dist-info}/METADATA +1 -1
  2. {mindstudio_probe-8.3.0.dist-info → mindstudio_probe-8.3.2.dist-info}/RECORD +44 -54
  3. msprobe/README.md +8 -5
  4. msprobe/core/common/const.py +17 -3
  5. msprobe/core/common/file_utils.py +64 -13
  6. msprobe/core/common/framework_adapter.py +10 -1
  7. msprobe/core/common/utils.py +17 -0
  8. msprobe/core/compare/utils.py +26 -6
  9. msprobe/core/config_check/ckpt_compare/ckpt_comparator.py +6 -1
  10. msprobe/core/hook_manager.py +2 -16
  11. msprobe/core/service.py +5 -16
  12. msprobe/docs/01.installation.md +2 -0
  13. msprobe/docs/02.config_introduction.md +0 -13
  14. msprobe/docs/05.data_dump_PyTorch.md +1 -1
  15. msprobe/docs/07.accuracy_checker_PyTorch.md +13 -13
  16. msprobe/docs/10.accuracy_compare_PyTorch.md +6 -6
  17. msprobe/docs/14.data_parse_PyTorch.md +2 -0
  18. msprobe/docs/19.monitor.md +4 -4
  19. msprobe/docs/21.visualization_PyTorch.md +1 -1
  20. msprobe/docs/25.tool_function_introduction.md +0 -1
  21. msprobe/docs/32.ckpt_compare.md +5 -5
  22. msprobe/mindspore/monitor/module_hook.py +17 -20
  23. msprobe/pytorch/api_accuracy_checker/common/config.py +3 -36
  24. msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +0 -24
  25. msprobe/pytorch/api_accuracy_checker/compare/compare.py +2 -12
  26. msprobe/pytorch/api_accuracy_checker/config.yaml +1 -6
  27. msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +34 -5
  28. msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +12 -132
  29. msprobe/pytorch/common/utils.py +0 -70
  30. msprobe/pytorch/debugger/debugger_config.py +0 -10
  31. msprobe/pytorch/dump/module_dump/module_processer.py +18 -3
  32. msprobe/pytorch/hook_module/api_register.py +14 -3
  33. msprobe/pytorch/monitor/module_hook.py +16 -34
  34. msprobe/pytorch/pt_config.py +2 -51
  35. msprobe/pytorch/pytorch_service.py +10 -14
  36. msprobe/visualization/builder/graph_builder.py +2 -2
  37. msprobe/visualization/builder/graph_merger.py +13 -0
  38. msprobe/visualization/db_utils.py +42 -18
  39. msprobe/visualization/graph/graph.py +13 -9
  40. msprobe/visualization/graph_service.py +20 -10
  41. msprobe/docs/08.accuracy_checker_online_PyTorch.md +0 -295
  42. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/__init__.py +0 -0
  43. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py +0 -205
  44. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +0 -378
  45. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +0 -239
  46. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/dump_dispatch.py +0 -115
  47. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +0 -250
  48. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/torch_ops_config.yaml +0 -63
  49. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/utils.py +0 -198
  50. msprobe/pytorch/attl_manager.py +0 -65
  51. {mindstudio_probe-8.3.0.dist-info → mindstudio_probe-8.3.2.dist-info}/LICENSE +0 -0
  52. {mindstudio_probe-8.3.0.dist-info → mindstudio_probe-8.3.2.dist-info}/WHEEL +0 -0
  53. {mindstudio_probe-8.3.0.dist-info → mindstudio_probe-8.3.2.dist-info}/entry_points.txt +0 -0
  54. {mindstudio_probe-8.3.0.dist-info → mindstudio_probe-8.3.2.dist-info}/top_level.txt +0 -0
msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py

@@ -39,7 +39,12 @@ from msprobe.core.common.const import FileCheckConst, Const
  from msprobe.core.common.utils import CompareException


- def split_json_file(input_file, num_splits, filter_api):
+ def split_json_file(input_file, num_splits, filter_api, device_id):
+     max_processes = len(device_id) * 8
+     if num_splits > max_processes:
+         logger.warning(f"A device supports a maximum of 8 processes. "
+                        f"The total number of processes exceeds the limit, and it is set to {max_processes}.")
+         num_splits = max_processes
      forward_data, backward_data, real_data_path = parse_json_info_forward_backward(input_file)
      input_dir = os.path.dirname(os.path.abspath(input_file))
      if filter_api:
@@ -88,7 +93,7 @@ def split_json_file(input_file, num_splits, filter_api):
                  logger.error(f"File not found or could not be deleted: {file}")
          msg = 'ERROR: Split json file failed, please check the input file and try again.'
          raise CompareException(CompareException.PARSE_FILE_ERROR, msg) from e
-     return split_files, total_items
+     return split_files, total_items, num_splits


  def signal_handler(signum, frame):
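Taken together, the two hunks above change the contract of split_json_file: the caller now passes the device list, the requested split count is capped at eight processes per device, and the (possibly reduced) count is returned so prepare_config can propagate it back to args.num_splits. A minimal standalone sketch of just the capping rule (the helper name cap_num_splits is hypothetical; device_id is a list of device indices, as in the diff):

    def cap_num_splits(num_splits, device_id):
        # each device supports at most 8 concurrent run_ut processes
        max_processes = len(device_id) * 8
        return min(num_splits, max_processes)

    assert cap_num_splits(20, [0, 1]) == 16   # 2 devices -> capped at 16
    assert cap_num_splits(10, [0, 1]) == 10   # under the limit -> unchanged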
@@ -127,7 +132,8 @@ def run_parallel_ut(config):
      def read_process_output(process):
          try:
              while True:
-                 if process.poll() is not None:
+                 # The child's stdout stream is separate from the process state itself, so check both: a non-None poll() means the child has exited, and a stdout of None means the stream is gone.
+                 if process.poll() is not None or process.stdout is None:
                      break
                  output = process.stdout.readline()
                  if output == '':
@@ -175,12 +181,17 @@ def run_parallel_ut(config):

      try:
          for process in processes:
-             process.communicate(timeout=None)
+             process.wait()  # unlike the original communicate(), wait() only blocks; it does not also capture stdout and stderr
      except KeyboardInterrupt:
          logger.warning("Interrupted by user, terminating processes and cleaning up...")
      except Exception as e:
          logger.error(f"An unexpected error occurred: {e}")
      finally:
+         # Refresh the progress bar one last time, so progress is not left stale when child processes exit before buffered writes land.
+         if wait_for_file_write_complete(config.result_csv_path):
+             result_file = read_csv(config.result_csv_path)
+             completed_items = len(result_file)
+             progress_bar.update(completed_items - progress_bar.n)
          if progress_bar.n < config.total_items:
              logger.warning("The UT task has not been completed. The parameter '-csv_path' along with the path to " \
                             "the result CSV file will be utilized to resume the UT task.")
@@ -195,6 +206,22 @@ def run_parallel_ut(config):
          logger.error(f"An unexpected error occurred: {e}")


+ def wait_for_file_write_complete(file_path, timeout=3600):
+     last_size = 0
+     start_time = time.time()  # record the start time
+     while True:
+         current_size = os.path.getsize(file_path)
+         # check whether the file size has stopped changing
+         if current_size == last_size:
+             return True  # the file write is complete, return True
+         last_size = current_size
+         # check for timeout
+         if time.time() - start_time > timeout:
+             logger.error("Writing the result csv file timed out.")
+             return False  # timed out, return False
+         time.sleep(0.1)  # brief delay between polls
+
+
  def prepare_config(args):
      api_info_file_checker = FileChecker(file_path=args.api_info_file, path_type=FileCheckConst.FILE,
                                          ability=FileCheckConst.READ_ABLE, file_type=FileCheckConst.JSON_SUFFIX)
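Note the settle-check semantics of the new helper: it returns True on the first poll whose size equals the previous poll's (and since last_size starts at 0, a still-empty file passes immediately), so it detects that writes have paused for ~0.1 s rather than proving completion. A hedged usage sketch matching the call site above:

    # settle-check the result CSV before the final progress-bar refresh
    if wait_for_file_write_complete(config.result_csv_path, timeout=60):
        completed_items = len(read_csv(config.result_csv_path))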
@@ -203,7 +230,9 @@ def prepare_config(args):
      create_directory(out_path)
      out_path_checker = FileChecker(out_path, FileCheckConst.DIR, ability=FileCheckConst.WRITE_ABLE)
      out_path = out_path_checker.common_check()
-     split_files, total_items = split_json_file(api_info, args.num_splits, args.filter_api)
+     split_files, total_items, modified_num_splits = split_json_file(api_info, args.num_splits,
+                                                                     args.filter_api, args.device_id)
+     args.num_splits = modified_num_splits
      config_path = args.config_path if args.config_path else None
      if config_path:
          config_path_checker = FileChecker(config_path, FileCheckConst.FILE,
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py

@@ -51,8 +51,6 @@ from msprobe.pytorch.pt_config import parse_json_config
  from msprobe.core.common.const import Const, FileCheckConst, CompareConst
  from msprobe.core.common.utils import safe_get_value, CompareException, is_int, check_op_str_pattern_valid
  from msprobe.pytorch.common.utils import seed_all
- from msprobe.pytorch.api_accuracy_checker.tensor_transport_layer.attl import ATTL, ATTLConfig, move2device_exec
- from msprobe.pytorch.api_accuracy_checker.tensor_transport_layer.device_dispatch import ConsumerDispatcher
  from msprobe.pytorch.api_accuracy_checker.run_ut.run_ut_utils import generate_cpu_params, generate_device_params, \
      ExecParams

@@ -90,27 +88,22 @@ seed_all()

  def run_ut(config):
      logger.info("start UT test")
-     if config.online_config.is_online:
-         logger.info(f"UT task result will be saved in {config.result_csv_path}".replace(".csv", "_rank*.csv"))
-         logger.info(f"UT task details will be saved in {config.details_csv_path}".replace(".csv", "_rank*.csv"))
-     else:
-         logger.info(f"UT task result will be saved in {config.result_csv_path}")
-         logger.info(f"UT task details will be saved in {config.details_csv_path}")
+
+     logger.info(f"UT task result will be saved in {config.result_csv_path}")
+     logger.info(f"UT task details will be saved in {config.details_csv_path}")

      if config.save_error_data:
          logger.info(f"UT task error_data will be saved in {config.error_data_path}")
      compare = Comparator(config.result_csv_path, config.details_csv_path, config.is_continue_run_ut, config=config)

-     if config.online_config.is_online:
-         run_api_online(config, compare)
-     else:
-         csv_df = read_csv(config.result_csv_path)
-         try:
-             api_name_set = {row[0] for row in csv_df.itertuples(index=False, name=None)}
-         except IndexError:
-             logger.error(f"Read {config.result_csv_path} error, api_name_set is empty.")
-             api_name_set = set()
-         run_api_offline(config, compare, api_name_set)
+
+     csv_df = read_csv(config.result_csv_path)
+     try:
+         api_name_set = {row[0] for row in csv_df.itertuples(index=False, name=None)}
+     except IndexError:
+         logger.error(f"Read {config.result_csv_path} error, api_name_set is empty.")
+         api_name_set = set()
+     run_api_offline(config, compare, api_name_set)
      for result_csv_path, details_csv_path in zip(compare.save_path_list, compare.detail_save_path_list):
          change_mode(result_csv_path, FileCheckConst.DATA_FILE_AUTHORITY)
          change_mode(details_csv_path, FileCheckConst.DATA_FILE_AUTHORITY)
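With the online branch gone, run_ut now always resumes from the result CSV: the first column of each existing row becomes the set of API names to skip. A hedged sketch of that resume-set construction using pandas (a plausible backend for the read_csv helper; the path is hypothetical):

    import pandas as pd

    csv_df = pd.read_csv("accuracy_checking_result.csv")
    api_name_set = {row[0] for row in csv_df.itertuples(index=False, name=None)}
    # run_api_offline(config, compare, api_name_set) then skips these names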
@@ -164,60 +157,6 @@ def run_api_offline(config, compare, api_name_set):
          gc.collect()


- def run_api_online(config, compare):
-     attl = init_attl(config.online_config)
-     dispatcher = ConsumerDispatcher(compare=compare)
-     dispatcher.start(handle_func=run_torch_api_online, config=config)
-
-     def tcp_communication_flow():
-         while True:
-             api_data = attl.recv()
-             if api_data == 'STOP_':
-                 continue
-             if api_data == 'KILL_':
-                 time.sleep(1)
-                 logger.info("========== STOP signal received ==========")
-                 dispatcher.stop()
-                 attl.stop_serve()
-                 time.sleep(1)
-                 break
-             if not isinstance(api_data, ApiData):
-                 continue
-             api_full_name = api_data.name
-             _, api_name = extract_basic_api_segments(api_full_name)
-             if blacklist_and_whitelist_filter(api_name, config.black_list, config.white_list):
-                 continue
-             if api_data.rank in config.online_config.rank_list:
-                 dispatcher.update_consume_queue(api_data)
-
-     def shared_storage_communication_flow():
-         flag_num = -1
-         while True:
-             api_data = attl.download()
-             if api_data == "start":
-                 if flag_num == -1:
-                     flag_num += 1
-                 flag_num += 1
-             if api_data == "end":
-                 flag_num -= 1
-             if flag_num == 0:
-                 dispatcher.stop()
-                 break
-             if not isinstance(api_data, ApiData):
-                 continue
-             api_full_name = api_data.name
-             _, api_name = extract_basic_api_segments(api_full_name)
-             if blacklist_and_whitelist_filter(api_name, config.black_list, config.white_list):
-                 continue
-             if api_data.rank in config.online_config.rank_list:
-                 dispatcher.update_consume_queue(api_data)
-
-     if config.online_config.nfs_path:
-         shared_storage_communication_flow()
-     else:
-         tcp_communication_flow()
-
-
  def blacklist_and_whitelist_filter(api_name, black_list, white_list):
      """
      run api(api_name) if api_name not in black_list and in white_list.
@@ -315,21 +254,6 @@ def run_torch_api(api_full_name, real_data_path, backward_content, api_info_dict):
      return UtDataInfo(bench_grad_out, device_grad_out, device_out, out, bench_grad, in_fwd_data_list, backward_message)


- def run_torch_api_online(api_full_name, api_data, backward_content):
-     in_fwd_data_list = []
-     api_type, api_name = extract_basic_api_segments(api_full_name)
-     args, kwargs, out = api_data.args, api_data.kwargs, api_data.result
-     in_fwd_data_list.append(args)
-     in_fwd_data_list.append(kwargs)
-     if kwargs.get("device"):
-         del kwargs["device"]
-
-     device_exec_params = ExecParams(api_type, api_name, current_device, args, kwargs, False, None)
-     device_out = exec_api(device_exec_params)
-     device_out = move2device_exec(device_out, "cpu")
-     return UtDataInfo(None, None, out, device_out, None, in_fwd_data_list, None, rank=api_data.rank)
-
-
  def check_need_grad(api_info_dict):
      need_grad = True
      if api_info_dict.get(Const.INPUT_KWARGS) and "out" in api_info_dict.get(Const.INPUT_KWARGS):
@@ -389,16 +313,6 @@ def initialize_save_error_data(error_data_path):
      return error_data_path


- def init_attl(config):
-     """config: OnlineConfig"""
-     attl = ATTL('gpu', ATTLConfig(is_benchmark_device=True,
-                                   connect_ip=config.host,
-                                   connect_port=config.port,
-                                   nfs_path=config.nfs_path,
-                                   tls_path=config.tls_path))
-     return attl
-
-
  def _run_ut_parser(parser):
      parser.add_argument("-api_info", "--api_info_file", dest="api_info_file", default="", type=str,
                          help="<Optional> The api param tool result file: generate from api param tool, "
@@ -481,38 +395,6 @@ def _run_ut(parser=None):
      _run_ut_parser(parser)
      args = parser.parse_args(sys.argv[1:])
      run_ut_command(args)
-
-
- def checked_online_config(online_config):
-     if not online_config.is_online:
-         return
-     if not isinstance(online_config.is_online, bool):
-         raise ValueError("is_online must be bool type")
-     # rank_list
-     if not isinstance(online_config.rank_list, list):
-         raise ValueError("rank_list must be a list")
-     if online_config.rank_list and not all(isinstance(rank, int) for rank in online_config.rank_list):
-         raise ValueError("All elements in rank_list must be integers")
-
-     # nfs_path
-     if online_config.nfs_path:
-         check_file_or_directory_path(online_config.nfs_path, isdir=True)
-         return
-     # tls_path
-     if online_config.tls_path:
-         check_file_or_directory_path(online_config.tls_path, isdir=True)
-         check_file_or_directory_path(os.path.join(online_config.tls_path, "server.key"))
-         check_file_or_directory_path(os.path.join(online_config.tls_path, "server.crt"))
-         check_file_or_directory_path(os.path.join(online_config.tls_path, "ca.crt"))
-         crl_path = os.path.join(online_config.tls_path, "crl.pem")
-         if os.path.exists(crl_path):
-             check_file_or_directory_path(crl_path)
-
-     # host and port
-     if not isinstance(online_config.host, str) or not re.match(Const.ipv4_pattern, online_config.host):
-         raise Exception(f"host: {online_config.host} is invalid.")
-     if not isinstance(online_config.port, int) or not (0 < online_config.port <= 65535):
-         raise Exception(f"port: {online_config.port} is invalid, port range 0-65535.")


  def run_ut_command(args):
@@ -525,7 +407,7 @@ def run_ut_command(args):
      else:
          checker_config = CheckerConfig()

-     if not checker_config.is_online and not args.api_info_file:
+     if not args.api_info_file:
          logger.error("Please provide api_info_file for offline run ut.")
          raise Exception("Please provide api_info_file for offline run ut.")

@@ -588,8 +470,6 @@ def run_ut_command(args):
      global UT_ERROR_DATA_DIR
      UT_ERROR_DATA_DIR = 'ut_error_data' + time_info
      error_data_path = initialize_save_error_data(error_data_path)
-     online_config = checker_config.get_online_config()
-     checked_online_config(online_config)
      config_params = {
          'forward_content': forward_content,
          'backward_content': backward_content,
msprobe/pytorch/common/utils.py

@@ -337,56 +337,6 @@ def save_pt(tensor, filepath):
      change_mode(filepath, FileCheckConst.DATA_FILE_AUTHORITY)


- class TypeCheckingUnpickler(pickle.Unpickler):
-     """
-     This class is a subclass of pickle.Unpickler, which is used to unpickle pickled objects.
-     It overrides the find_class method to add type checking functionality.
-     """
-     allowed_types = [
-         "str",
-         "ApiData",
-         "OrderedDict",
-         "_rebuild_tensor_v2",  # from torch.utils
-         "_load_from_bytes"  # from torch.storage
-     ]
-
-     def find_class(self, module, name):
-         """
-         Method to find the class of the object to be unpickled.
-         Throws pickle.UnpicklingError if the object type is not in the allowed types list.
-         """
-         if name in self.allowed_types:
-             return super().find_class(module, name)
-         raise pickle.UnpicklingError("Unsupported object type: {}.{}".format(module, name))
-
-
- def save_pkl(tensor, filepath):
-     """Save an ApiData or str object with pickle"""
-     check_path_before_create(filepath)
-     filepath = os.path.realpath(filepath)
-     try:
-         with FileOpen(filepath, 'wb') as f:
-             pickle.dump(tensor, f)
-     except Exception as e:
-         logger.error("Save pt file failed, please check the possible error causes: "
-                      "1. out of disk space or disk error, "
-                      "2. no permission to write files, etc.")
-         raise RuntimeError(f"save pt file {filepath} failed") from e
-     change_mode(filepath, FileCheckConst.DATA_FILE_AUTHORITY)
-
-
- def load_pkl(pt_path):
-     """Load an ApiData or str object with pickle for accuracy_checker_online"""
-     check_file_or_directory_path(pt_path)
-     pt_path = os.path.realpath(pt_path)
-     try:
-         with FileOpen(pt_path, 'rb') as f:
-             pt = TypeCheckingUnpickler(f).load()
-     except Exception as e:
-         raise RuntimeError(f"load pt file {pt_path} failed: {e}") from e
-     return pt
-
-
  def is_recomputation():
      """Check if the current operation is in the re-computation phase.

@@ -471,23 +421,3 @@ def register_forward_hook(module, forward_hook):
          module.register_forward_hook(forward_hook, with_kwargs=True)
      else:
          module.register_forward_hook(forward_hook)
-
-
- def save_api_data(api_data):
-     """Save data to an io stream"""
-     try:
-         io_buff = io.BytesIO()
-         torch.save(api_data, io_buff)
-     except Exception as e:
-         raise RuntimeError("save api_data to io_buff failed") from e
-     return io_buff
-
-
- def load_api_data(api_data_bytes):
-     """Load data from a bytes stream"""
-     try:
-         buffer = io.BytesIO(api_data_bytes)
-         buffer = torch.load(buffer, map_location="cpu")
-     except Exception as e:
-         raise RuntimeError("load api_data from bytes failed") from e
-     return buffer
msprobe/pytorch/debugger/debugger_config.py

@@ -48,16 +48,6 @@ class DebuggerConfig:
              "max_sample": task_config.max_sample
          }

-         self.online_run_ut = False
-         if self.task == Const.TENSOR:
-             # dump api tensor and collaborate with online run_ut
-             self.online_run_ut = task_config.online_run_ut if task_config.online_run_ut else False
-             self.nfs_path = task_config.nfs_path if task_config.nfs_path else ""
-             self.tls_path = task_config.tls_path if task_config.tls_path else ""
-             self.host = task_config.host if task_config.host else ""
-             self.port = task_config.port if task_config.port else -1
-             self.online_run_ut_recompute = task_config.online_run_ut_recompute \
-                 if isinstance(task_config.online_run_ut_recompute, bool) else False

          self.check()
          self._check_statistics_config(task_config)
msprobe/pytorch/dump/module_dump/module_processer.py

@@ -63,9 +63,11 @@ def wrap_forward_with_hook_safety(module):
          except _StopRecomputationError as e:
              exception_output = None
              if len(module._forward_hooks.values()) > 0:
-                 # msprobe's forward_hook appears first, so execute only msprobe's forward_hook
-                 hook_fn = list(module._forward_hooks.values())[0]
-                 hook_fn(module, args, kwargs, exception_output)
+                 # execute only msprobe's forward_hook; its name always contains 'ModuleProcesser.'
+                 for hook_fn in module._forward_hooks.values():
+                     if 'ModuleProcesser' in str(hook_fn):
+                         hook_fn(module, args, kwargs, exception_output)
+                         break
              raise e

  if torch_version_above_or_equal_21:
@@ -152,7 +154,13 @@ class ModuleProcesser:
          modules_and_names_with_index = self.get_modules_and_names(models, recursive, module_names)
          for index, modules_and_names in modules_and_names_with_index.items():
              model = models if index == "-1" else models[int(index)]
+
+             model_list = []
              for name, module in modules_and_names:
+                 model_list.append((name, module))
+
+             is_verl = "verl" in sys.modules
+             for idx, (name, module) in enumerate(model_list):
                  if recursive and module == model:
                      continue
                  if not is_torch_nn_module(module):
@@ -163,6 +171,13 @@ class ModuleProcesser:
                      continue
                  if module.__class__.__name__ == "FullyShardedDataParallel":
                      continue
+
+                 # skip the first and last layers in the verl scenario
+                 if is_verl and (idx == 1 or idx == len(model_list) - 1):
+                     logger.warning(f"The module {name} is the first or last layer in verl scenario, "
+                                    f"the data dump for this module will be skipped.")
+                     continue
+
                  setattr(module, 'msprobe_hook', True)
                  module_index = (index + Const.SEP) if index != "-1" else ""
                  prefix_name = f'{BaseScope.Module_Type_Module}{Const.SEP}{module_index}{name}{Const.SEP}' + \
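Two details worth noting in the hunk above: module index 0 is normally the root model itself (already skipped by the earlier `module == model` check), which is presumably why the guard tests `idx == 1` rather than `idx == 0`; and the verl scenario is detected purely from import state, checked once per registration pass. A standalone sketch of that detection (the function name is hypothetical):

    import sys

    def in_verl_scenario() -> bool:
        # True if the verl package has been imported anywhere in this process
        return "verl" in sys.modules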
msprobe/pytorch/hook_module/api_register.py

@@ -22,6 +22,7 @@ import torch.distributed as dist

  from msprobe.core.common.const import Const
  from msprobe.core.common.file_utils import load_yaml
+ from msprobe.core.common.runtime import Runtime
  from msprobe.core.data_dump.api_registry import ApiRegistry
  from msprobe.pytorch.common.log import logger
  from msprobe.pytorch.common.utils import (
@@ -91,6 +92,12 @@ _inner_used_api = {
  }


+ def reset_dist_collect_func():
+     global dist_data_collect_func, dist_batch_data_collect_func
+     dist_data_collect_func.clear()
+     dist_batch_data_collect_func.clear()
+
+
  @parameter_adapter
  def tensor_module_forward(module, *args, **kwargs):
      return module.api_func(*args, **kwargs)
@@ -114,9 +121,9 @@ def dist_module_forward(module, *args, **kwargs):

          return store_data

-     if use_async_op_flag or module.api_name in ['isend', 'irecv']:
+     if Runtime.is_running and (use_async_op_flag or module.api_name in ['isend', 'irecv']):
          dist_data_collect_func[handle] = create_async_callback_func(module.distributed_forward_hook)
-     if module.api_name == 'batch_isend_irecv':
+     if Runtime.is_running and module.api_name == 'batch_isend_irecv':
          dist_batch_data_collect_func.append([handle, create_async_callback_func(module.distributed_forward_hook)])
      return handle

@@ -135,13 +142,17 @@ def redirect_wait():
              store_func = dist_data_collect_func.pop(args[0])
              store_func()
              return
+         remove_value = None
          for value in dist_batch_data_collect_func:
              if args[0] in value[0]:
                  value[0].remove(args[0])
                  if len(value[0]) == 0:
                      store_func = value[1]
                      store_func()
-                     return
+                     remove_value = value
+                     break
+         if remove_value:
+             dist_batch_data_collect_func.remove(remove_value)

      return wrapped_wait

msprobe/pytorch/monitor/module_hook.py

@@ -48,12 +48,10 @@ from msprobe.pytorch.monitor.module_metric import get_metrics, get_summary_writer_tag_name
  from msprobe.pytorch.monitor.optimizer_collect import OptimizerMonFactory
  from msprobe.pytorch.monitor.visualizer import HeatmapVisualizer

-
  torch_version_above_or_equal_2 = torch.__version__.split('+')[0] >= '2.0'
  if not torch_version_above_or_equal_2:
      raise ValueError("monitor require torch>=2.0")

-
  FORMAT_MAPPING = {
      MonitorConst.TENSORBOARD: SummaryWriterWithAD,
      MonitorConst.CSV: CSVWriterWithAD,
@@ -150,15 +148,11 @@ class GradContext:
      def __init__(self) -> None:
          self.pre = {}
          self.post = {}
-         self.acc_metric = {}
-         self.acc = {}
          self.actv = {}

      def reset(self):
          self.pre.clear()
          self.post.clear()
-         self.acc_metric.clear()
-         self.acc.clear()
          self.actv.clear()

@@ -510,18 +504,8 @@ class TrainerMon:
          if not self.wg_distribution:
              return {}, {}

-         if self.weight_hooked:
-             get_metrics(self.ops, self.grad_context.acc, self.eps, self.grad_context.acc_metric)
-
          get_metrics(self.ops, post_grad_dict, self.eps, self.grad_context.post)
-         reduced_grad = self.grad_context.post
-
-         if self.weight_hooked:
-             unreduced_grad = self.grad_context.acc_metric
-         else:
-             unreduced_grad = self.grad_context.pre
-
-         return reduced_grad, unreduced_grad
+         return self.grad_context.post, self.grad_context.pre

      def generate_xy_metrics(self):
          actv = {}
@@ -529,7 +513,6 @@ class TrainerMon:
          actv.update(fwd_context.actv)

          actv_grad = self.grad_context.actv
-
          return actv, actv_grad

      def reload_xy(self, xy_distribution=False):
@@ -607,11 +590,8 @@ class TrainerMon:
          if not self.wg_distribution:
              return

-         if self.weight_hooked:
-             self.summary_writer.write_metrics(self.ops, self.grad_context.acc_metric, step, 'grad_unreduced',
-                                               use_micro_step=self.monitor_mbs_grad)
-         else:
-             self.summary_writer.write_metrics(self.ops, self.grad_context.pre, step, 'grad_unreduced')
+         self.summary_writer.write_metrics(self.ops, self.grad_context.pre, step, 'grad_unreduced',
+                                           use_micro_step=self.monitor_mbs_grad)
          self.summary_writer.write_metrics(self.ops, self.grad_context.post, step, 'grad_reduced')

      def hook_optimizer(self, optimizer):
@@ -732,9 +712,9 @@ class TrainerMon:
          # static monitoring can save at step 0; dynamic cannot, because dynamic mode by design starts on the step after a reset, so self.monitoring is still False at step 0
          if self.monitoring:
              module_rank_valid = not self.module_rank_list or (
-                 dist.is_initialized() and dist.get_rank() in self.module_rank_list)
+                     dist.is_initialized() and dist.get_rank() in self.module_rank_list)
              step_condition = (context.step >= self.start_step and (
-                 context.step - self.start_step) % self.step_interval == 0)
+                     context.step - self.start_step) % self.step_interval == 0)
              if module_rank_valid and step_condition:
                  self.has_collect_times += 1

@@ -791,6 +771,7 @@ class TrainerMon:
                  hook(optimizer, args, kwargs)
              step_final_hook(optimizer, args, kwargs)
              return out
+
          return wrapper

      optimizer.__class__.step = patch_step(optimizer.__class__.step, optimizer)
@@ -1013,11 +994,11 @@ class TrainerMon:
                  vpp_stage + module_name,
              ]:
                  if pattern in l2_targets:
-                         return pattern
+                     return pattern
          elif hook_name in ["linear_hook"]:
              return vpp_stage + squash_param_name(module_name, self.squash_name)
          return ""
-
+
      def _hook_module(self, target_names, l2_target_names, module: torch.nn.Module, vpp_stage=''):
          if '_modules' not in module.__dict__:
              # nothing to hook
@@ -1151,7 +1132,7 @@ class TrainerMon:
                  context.micro_step = 0
              context.step += 1
              return
-
+
          def stack_hook(module, args, kwargs, module_output, name):
              if module not in self.module_fwd_hook_context_by_module:
                  self.module_fwd_hook_context_by_module[module] = ModuleHookContext(name)
@@ -1221,7 +1202,7 @@ class TrainerMon:
          if self.monitor_mbs_grad:
              self._hook_weights()
              return
-
+
          self.optimizer_mon.patch_grad_sync(self)

          if self.enable_megatron or self.enable_deepspeed:
@@ -1281,6 +1262,7 @@ class TrainerMon:
              get_metrics(self.ops, grad_dict, self.eps, self.grad_context.pre)
              out = foreach_reduce(fsdp_params, unsharded_grads, *unused)
              return out
+
          return wrapper

      logger.info("Patch fsdp2 foreach_reduce, collect pre_grad metrics.")
@@ -1294,10 +1276,9 @@ class TrainerMon:
          """
          Walk each parameter's gradient-accumulation function (grad_acc) and attach a hook, so that after all of the parameter's gradients are computed, the pre-aggregation gradient data is collected.
          """
-         context = self.grad_context

          @torch.no_grad
-         def param_hook(*args, context_dict, param, name):
+         def param_hook(*args, param, name):
              key = name
              if self.monitor_mbs_grad:
                  key += f'{MonitorConst.NAME_SEP}{param.micro_step}'
@@ -1305,14 +1286,15 @@ class TrainerMon:
              key = get_summary_writer_tag_name(key, 'acc_grad', self.rank)
              self.register_param_call_id("param_hook", key)
              param.micro_step += 1
-
+             grad_dict = {}
              if self.monitor_mbs_grad or (param.micro_step == self.micro_batch_number):
                  if self.params_have_main_grad:
                      grad = param.main_grad
                  else:
                      grad = param.grad
-                 context_dict[key] = grad.clone()
+                 grad_dict[key] = grad.clone()

+             get_metrics(self.ops, grad_dict, self.eps, self.grad_context.pre)
              if param.micro_step == self.micro_batch_number:
                  param.micro_step = 0

@@ -1322,7 +1304,7 @@ class TrainerMon:
              param_tmp = param.expand_as(param)
              grad_acc = param_tmp.grad_fn.next_functions[0][0]
              handle = grad_acc.register_hook(
-                 partial(param_hook, context_dict=context.acc, param=param, name=name))
+                 partial(param_hook, param=param, name=name))
              self.grad_accs.append(grad_acc)
              self.handles['wgrads'].append(handle)
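For context, param_hook is attached to each parameter's gradient-accumulation node, a common trick for observing a grad after it has been accumulated but before any communication, as used in the diff. A minimal standalone sketch of that registration (hook name hypothetical; relies on the same autograd internals as the code above):

    import torch

    p = torch.nn.Parameter(torch.ones(3))

    # expand_as creates a non-leaf view whose grad_fn points back at the
    # AccumulateGrad node of the original parameter
    grad_acc = p.expand_as(p).grad_fn.next_functions[0][0]

    def on_grad_ready(*unused):
        print("accumulated grad:", p.grad)

    handle = grad_acc.register_hook(on_grad_ready)
    (p * 2).sum().backward()   # fires on_grad_ready after accumulation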