mindstudio-probe 8.1.2__py3-none-any.whl → 8.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (181)
  1. {mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/METADATA +2 -2
  2. {mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/RECORD +172 -147
  3. msprobe/README.md +6 -6
  4. msprobe/core/common/const.py +98 -41
  5. msprobe/core/common/db_manager.py +256 -0
  6. msprobe/core/common/file_utils.py +28 -5
  7. msprobe/core/common/log.py +7 -0
  8. msprobe/core/common/megatron_utils.py +59 -0
  9. msprobe/core/common/parallel_state.py +193 -0
  10. msprobe/core/common/utils.py +20 -13
  11. msprobe/core/common_config.py +5 -0
  12. msprobe/core/compare/acc_compare.py +140 -93
  13. msprobe/core/compare/check.py +13 -0
  14. msprobe/core/compare/compare_cli.py +64 -6
  15. msprobe/core/compare/config.py +10 -8
  16. msprobe/core/compare/diff_analyze/diff_analyze_threshold.yaml +14 -0
  17. msprobe/core/compare/diff_analyze/first_diff_analyze.py +135 -0
  18. msprobe/core/compare/diff_analyze/ignore_op_list.yaml +3 -0
  19. msprobe/core/compare/find_first/__init__.py +0 -0
  20. msprobe/core/compare/find_first/analyzer.py +282 -0
  21. msprobe/core/compare/find_first/data_processor.py +35 -0
  22. msprobe/core/compare/find_first/graph.py +188 -0
  23. msprobe/core/compare/find_first/utils.py +189 -0
  24. msprobe/core/compare/highlight.py +74 -101
  25. msprobe/core/compare/layer_mapping/layer_mapping.py +14 -9
  26. msprobe/core/compare/merge_result/merge_result.py +2 -2
  27. msprobe/core/compare/multiprocessing_compute.py +45 -28
  28. msprobe/core/compare/npy_compare.py +7 -10
  29. msprobe/core/compare/utils.py +338 -130
  30. msprobe/core/config_check/checkers/dataset_checker.py +2 -1
  31. msprobe/core/config_check/checkers/env_args_checker.py +5 -5
  32. msprobe/core/config_check/checkers/hyperparameter_checker.py +30 -10
  33. msprobe/core/config_check/checkers/pip_checker.py +4 -3
  34. msprobe/core/config_check/checkers/random_checker.py +3 -3
  35. msprobe/core/config_check/checkers/weights_checker.py +2 -1
  36. msprobe/core/config_check/ckpt_compare/megatron_loader.py +2 -0
  37. msprobe/core/config_check/resource/hyperparameter.yaml +11 -1
  38. msprobe/core/config_check/utils/hyperparameter_parser.py +7 -3
  39. msprobe/core/config_check/utils/utils.py +10 -0
  40. msprobe/core/data_dump/api_registry.py +49 -30
  41. msprobe/core/data_dump/data_collector.py +71 -29
  42. msprobe/core/data_dump/data_processor/base.py +2 -0
  43. msprobe/core/data_dump/data_processor/mindspore_processor.py +47 -53
  44. msprobe/core/data_dump/data_processor/pytorch_processor.py +227 -93
  45. msprobe/core/data_dump/json_writer.py +81 -7
  46. msprobe/core/data_dump/scope.py +4 -6
  47. msprobe/core/hook_manager.py +129 -70
  48. msprobe/core/monitor/csv2db.py +361 -0
  49. msprobe/core/monitor/db_utils.py +278 -0
  50. msprobe/core/monitor/utils.py +35 -1
  51. msprobe/core/service.py +31 -39
  52. msprobe/core/single_save/single_comparator.py +16 -3
  53. msprobe/docs/01.installation.md +51 -19
  54. msprobe/docs/02.config_introduction.md +16 -20
  55. msprobe/docs/03.config_examples.md +26 -0
  56. msprobe/docs/04.kernel_dump_PyTorch.md +1 -1
  57. msprobe/docs/05.data_dump_PyTorch.md +6 -2
  58. msprobe/docs/06.data_dump_MindSpore.md +44 -7
  59. msprobe/docs/07.accuracy_checker_PyTorch.md +1 -1
  60. msprobe/docs/10.accuracy_compare_PyTorch.md +124 -44
  61. msprobe/docs/11.accuracy_compare_MindSpore.md +75 -7
  62. msprobe/docs/14.data_parse_PyTorch.md +1 -1
  63. msprobe/docs/19.monitor.md +94 -7
  64. msprobe/docs/21.visualization_PyTorch.md +71 -101
  65. msprobe/docs/22.visualization_MindSpore.md +69 -119
  66. msprobe/docs/23.generate_operator_PyTorch.md +1 -1
  67. msprobe/docs/25.tool_function_introduction.md +0 -1
  68. msprobe/docs/26.data_dump_PyTorch_baseline.md +7 -7
  69. msprobe/docs/28.debugger_save_instruction.md +184 -81
  70. msprobe/docs/29.data_dump_MSAdapter.md +6 -0
  71. msprobe/docs/31.config_check.md +4 -2
  72. msprobe/docs/36.calculation_result_change.md +75 -0
  73. msprobe/docs/FAQ.md +22 -1
  74. msprobe/docs/data_dump_MindSpore/dynamic_graph_quick_start_example.md +6 -2
  75. msprobe/docs/img/compare_result.png +0 -0
  76. msprobe/docs/img/visualization/vis_browser_1.png +0 -0
  77. msprobe/docs/img/visualization/vis_match_info.png +0 -0
  78. msprobe/docs/img/visualization/vis_precision_info.png +0 -0
  79. msprobe/docs/img/visualization/vis_search_info.png +0 -0
  80. msprobe/docs/img/visualization/vis_show_info.png +0 -0
  81. msprobe/docs/img/visualization/vis_showcase.png +0 -0
  82. msprobe/docs/img/visualization/vis_unmatch_info.png +0 -0
  83. msprobe/docs/visualization/mindspeed_llamafactoary_img/1.png +0 -0
  84. msprobe/docs/visualization/mindspeed_llamafactoary_img/2.png +0 -0
  85. msprobe/docs/visualization/mindspeed_llamafactoary_img/3.png +0 -0
  86. msprobe/docs/visualization/mindspeed_llamafactoary_img/4.png +0 -0
  87. msprobe/docs/visualization/mindspeed_llamafactoary_img/5.png +0 -0
  88. msprobe/docs/visualization/mindspeed_llamafactoary_img/6.png +0 -0
  89. msprobe/docs/visualization/mindspeed_llamafactoary_img/7.png +0 -0
  90. msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory-qwen25vl.txt +59 -0
  91. msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory1.png +0 -0
  92. msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory2.png +0 -0
  93. msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed-mm-qwen25vl.txt +80 -0
  94. msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed1.png +0 -0
  95. msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed2.png +0 -0
  96. msprobe/docs/visualization/mindspeed_llamafactory_mapping.md +330 -0
  97. msprobe/mindspore/__init__.py +1 -1
  98. msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +1 -1
  99. msprobe/mindspore/api_accuracy_checker/api_runner.py +9 -6
  100. msprobe/mindspore/api_accuracy_checker/compute_element.py +18 -12
  101. msprobe/mindspore/cell_processor.py +64 -25
  102. msprobe/mindspore/common/utils.py +51 -7
  103. msprobe/mindspore/compare/common_dir_compare.py +45 -37
  104. msprobe/mindspore/compare/ms_compare.py +10 -2
  105. msprobe/mindspore/compare/ms_graph_compare.py +47 -52
  106. msprobe/mindspore/debugger/debugger_config.py +18 -7
  107. msprobe/mindspore/debugger/precision_debugger.py +16 -12
  108. msprobe/mindspore/dump/cell_dump_process.py +130 -68
  109. msprobe/mindspore/dump/cell_dump_with_insert_gradient.py +10 -2
  110. msprobe/mindspore/dump/graph_mode_cell_dump.py +35 -9
  111. msprobe/mindspore/dump/graph_tensor_dump.py +11 -0
  112. msprobe/mindspore/dump/hook_cell/api_register.py +19 -20
  113. msprobe/mindspore/dump/hook_cell/hook_cell.py +12 -34
  114. msprobe/mindspore/dump/hook_cell/ms_hook_manager.py +142 -21
  115. msprobe/mindspore/dump/kernel_kbyk_dump.py +24 -0
  116. msprobe/mindspore/exception_dump/__init__.py +0 -0
  117. msprobe/mindspore/exception_dump/exception_dump_tool_factory.py +51 -0
  118. msprobe/mindspore/exception_dump/kernel_graph_exception_dump.py +57 -0
  119. msprobe/mindspore/free_benchmark/api_pynative_self_check.py +5 -4
  120. msprobe/mindspore/mindspore_service.py +2 -2
  121. msprobe/mindspore/mindtorch/mindtorch_adaptor.py +12 -7
  122. msprobe/mindspore/monitor/features.py +82 -0
  123. msprobe/mindspore/monitor/module_hook.py +168 -10
  124. msprobe/mindspore/monitor/utils.py +27 -1
  125. msprobe/mindspore/ms_config.py +12 -4
  126. msprobe/mindspore/overflow_check/overflow_check_tool_factory.py +1 -1
  127. msprobe/mindspore/task_handler_factory.py +3 -1
  128. msprobe/nan_analyze/graph.py +1 -1
  129. msprobe/pytorch/api_accuracy_checker/common/config.py +3 -36
  130. msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +0 -24
  131. msprobe/pytorch/api_accuracy_checker/compare/compare.py +2 -12
  132. msprobe/pytorch/api_accuracy_checker/config.yaml +1 -6
  133. msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +2 -2
  134. msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +12 -132
  135. msprobe/pytorch/common/utils.py +1 -21
  136. msprobe/pytorch/compare/pt_compare.py +10 -2
  137. msprobe/pytorch/{hook_module/jit_script_wrapper.py → compare/pt_diff_analyze.py} +3 -15
  138. msprobe/pytorch/compare/utils.py +2 -1
  139. msprobe/pytorch/debugger/debugger_config.py +18 -23
  140. msprobe/pytorch/dump/module_dump/hook_wrapper.py +10 -7
  141. msprobe/pytorch/dump/module_dump/module_processer.py +41 -19
  142. msprobe/pytorch/free_benchmark/main.py +7 -4
  143. msprobe/pytorch/hook_module/api_register.py +62 -24
  144. msprobe/pytorch/hook_module/hook_module.py +9 -29
  145. msprobe/pytorch/hook_module/pt_hook_manager.py +84 -15
  146. msprobe/pytorch/hook_module/script_wrapper.py +140 -0
  147. msprobe/pytorch/hook_module/support_wrap_ops.yaml +6 -0
  148. msprobe/pytorch/monitor/csv2tb.py +1 -1
  149. msprobe/pytorch/monitor/features.py +94 -0
  150. msprobe/pytorch/monitor/module_hook.py +221 -81
  151. msprobe/pytorch/monitor/module_metric.py +27 -1
  152. msprobe/pytorch/monitor/optimizer_collect.py +109 -4
  153. msprobe/pytorch/online_dispatch/dispatch.py +42 -24
  154. msprobe/pytorch/online_dispatch/dump_compare.py +1 -1
  155. msprobe/pytorch/parse_tool/lib/visualization.py +0 -1
  156. msprobe/pytorch/pt_config.py +2 -51
  157. msprobe/pytorch/pytorch_service.py +7 -14
  158. msprobe/visualization/builder/graph_builder.py +192 -63
  159. msprobe/visualization/builder/graph_merger.py +986 -0
  160. msprobe/visualization/builder/msprobe_adapter.py +17 -15
  161. msprobe/visualization/compare/graph_comparator.py +26 -16
  162. msprobe/visualization/db_utils.py +252 -0
  163. msprobe/visualization/graph/base_node.py +2 -22
  164. msprobe/visualization/graph/distributed_analyzer.py +12 -12
  165. msprobe/visualization/graph/graph.py +44 -16
  166. msprobe/visualization/graph_service.py +143 -59
  167. msprobe/visualization/utils.py +103 -4
  168. msprobe/docs/08.accuracy_checker_online_PyTorch.md +0 -295
  169. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py +0 -205
  170. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +0 -378
  171. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +0 -239
  172. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/dump_dispatch.py +0 -115
  173. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +0 -250
  174. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/torch_ops_config.yaml +0 -63
  175. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/utils.py +0 -198
  176. msprobe/pytorch/attl_manager.py +0 -65
  177. {mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/LICENSE +0 -0
  178. {mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/WHEEL +0 -0
  179. {mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/entry_points.txt +0 -0
  180. {mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/top_level.txt +0 -0
  181. /msprobe/{pytorch/api_accuracy_checker/tensor_transport_layer → core/compare/diff_analyze}/__init__.py +0 -0
msprobe/pytorch/monitor/module_hook.py
@@ -15,18 +15,21 @@
  import json
  import os
  import uuid
+ import importlib
  from collections import defaultdict
  from datetime import datetime
  from functools import partial
+ from itertools import cycle

  import pytz
  import torch
  import torch.distributed as dist
  import pandas as pd
  from torch.utils.hooks import BackwardHook
+ from torch.distributed.fsdp import FullyShardedDataParallel as FSDP

  from msprobe.core.common.const import MonitorConst, Const
- from msprobe.core.common.file_utils import load_json, save_json
+ from msprobe.core.common.file_utils import load_json, save_json, make_dir
  from msprobe.core.common.decorator import recursion_depth_decorator
  from msprobe.core.monitor.anomaly_processor import AnomalyScanner, AnomalyDataFactory, AnomalyDataWriter
  from msprobe.core.common.file_utils import write_df_to_csv
@@ -39,9 +42,9 @@ from msprobe.pytorch.monitor.utils import get_param_struct
  from msprobe.pytorch.monitor.data_writers import SummaryWriterWithAD, CSVWriterWithAD, BaseWriterWithAD, WriterInput
  from msprobe.pytorch.monitor.distributed.wrap_distributed import api_register, create_hooks, op_aggregate, \
      get_process_group
- from msprobe.pytorch.monitor.features import get_sign_matches
+ from msprobe.pytorch.monitor.features import get_sign_matches, cal_qkt
  from msprobe.pytorch.monitor.module_metric import get_metrics, get_summary_writer_tag_name, \
-     TensorMetrics, squash_param_name
+     TensorMetrics, squash_param_name, get_entropy_metric, get_sr_metric
  from msprobe.pytorch.monitor.optimizer_collect import OptimizerMonFactory
  from msprobe.pytorch.monitor.visualizer import HeatmapVisualizer

@@ -56,6 +59,7 @@ FORMAT_MAPPING = {
      MonitorConst.CSV: CSVWriterWithAD,
      MonitorConst.API: BaseWriterWithAD
  }
+ start_step = 0


  def param_is_not_tensor_parallel_duplicate(param, tp_group):
@@ -82,7 +86,17 @@ class ModuleHookContext:
          self.actvgrad.clear()


- start_step = 0
+ class FeatureHookContext:
+     def __init__(self, module_name):
+         self.step = 0
+         self.micro_step = 0
+         self.attention_feature = {}
+         self.linear_feature = {}
+         self.module_name = module_name
+
+     def reset(self):
+         self.attention_feature.clear()
+         self.linear_feature.clear()


  class OptimizerContext:
@@ -159,8 +173,8 @@ class TrainerMon:
          self.params_have_main_grad = params_have_main_grad
          self.update_heatmap_visualizer = defaultdict(HeatmapVisualizer)
          self.ratio_heatmap_visualizer = defaultdict(HeatmapVisualizer)
-         self.origin_start_grad_sync = None
          self.fsdp_post_backward_hook = None
+         self.fsdp2_foreach_reduce = None
          self.config_timestamp = 0  # the timestamp is validated later; the first monitoring run does not need to touch it just to refresh the config file, monitoring can be switched on directly via dynamic_on
          self.config = load_json(config_file_path)
          validate_config(self.config)
@@ -195,7 +209,9 @@
          self.dp_group = None
          self.tp_group = None
          self.enable_megatron = False
+         self.enable_deepspeed = False
          self.fsdp_wrapped_module = False
+         self.fsdp2_wrapped_module = False
          self.micro_batch_number = 1
          self.optimizer_mon = None
          self.optimizer_trans = None
@@ -203,6 +219,7 @@
          # TYPE3: variables that are reset when the config is updated mid-training or the monitoring state changes
          self.module_fwd_hook_context_by_module = defaultdict(ModuleHookContext)
          self.module_bwd_hook_context_by_module = defaultdict(ModuleHookContext)
+         self.feature_hook_context_by_module = defaultdict(FeatureHookContext)
          self.optimizer_context = defaultdict(OptimizerContext)
          self.cc_context = defaultdict(CommunicationContext)
          self.grad_context = GradContext()
@@ -210,9 +227,12 @@
          self.param2name = defaultdict(str)
          self.name2indices = defaultdict()
          self.name2param = {}
+         self.origin2squash = {}
          self.duplicate_param = {}
          self.name2tag = {}
          self.param_name_call_id = {}
+         self.flat_prefix_names = []
+         self.flat_prefix_reverse_iter = None
          self.call_id = 0
          self.module_struct = defaultdict(dict)
          self.grad_accs = []
@@ -270,6 +290,18 @@
              cc_tensor.reset()
          return metrics

+     @staticmethod
+     def get_linear_hook_target(module):
+         if isinstance(module, torch.nn.Embedding):
+             return ''
+         if hasattr(module, "num_embeddings") or hasattr(module, "vocab_start_index"):
+             return ''
+         for weight_name in ["weight", "wg"]:
+             if hasattr(module, weight_name) and isinstance(getattr(module, weight_name), torch.Tensor):
+                 if getattr(module, weight_name).dim() == 2:
+                     return weight_name
+         return ''
+
      def set_config(self):
          logger.info(f"current config: {self.config}")
          self.start_step = self.config.get("start_step", 0)
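The new `get_linear_hook_target` helper is what `extract_linear_sr_hook` later uses to decide whether a module carries a 2-D weight worth monitoring. A minimal sketch of its behaviour on standard `torch.nn` modules, assuming mindstudio-probe 8.2.1 is installed (the module choices below are illustrative, not from the package):

```python
import torch
from msprobe.pytorch.monitor.module_hook import TrainerMon

# The helper returns the name of a 2-D weight attribute ("weight" or "wg"),
# and '' for embeddings or anything without a 2-D weight.
print(TrainerMon.get_linear_hook_target(torch.nn.Linear(16, 32)))      # "weight" (2-D weight)
print(TrainerMon.get_linear_hook_target(torch.nn.Embedding(100, 32)))  # ""       (embeddings are excluded)
print(TrainerMon.get_linear_hook_target(torch.nn.LayerNorm(32)))       # ""       (weight is 1-D)
```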
@@ -294,6 +326,8 @@
          self.cc_distribution = self.config.get("cc_distribution", {})
          self.stack_info = self.config.get('stack_info', False)
          self.monitor_mbs_grad = self.config.get('monitor_mbs_grad', False)
+         self.recording_l2_features = self.config.get("recording_l2_features", False)
+         self.sa_order = self.config.get("sa_order", "s,b,h,d")

          if not self.cc_distribution.get('enable', False):
              self.cc_log_only = False
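A hedged sketch of how the new monitor config keys might look. Only the key names (`recording_l2_features`, `sa_order`, `l2_targets`, `start_step`) come from this diff; the shape of `l2_targets` is inferred from how `_hook_module` indexes it by hook name, and every value below is illustrative rather than taken from the package documentation:

```python
# Illustrative monitor-config fragment (loaded via load_json in TrainerMon).
# Key names are from this diff; value formats are assumptions.
example_config = {
    "start_step": 0,
    "recording_l2_features": True,   # enables the attention/linear L2 feature hooks
    "sa_order": "s,b,h,d",           # layout passed to cal_qkt for the q/k inputs
    "l2_targets": {                  # hook name -> module names, matched in _is_recording_module
        "attention_hook": ["layers.0.self_attention"],
        "linear_hook": ["layers.0.mlp"],
    },
}
```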
@@ -352,6 +386,8 @@
              logger.info_on_rank_0("> momentum and variance of adam is not monitored. ")
          if not self.wg_distribution:
              logger.info_on_rank_0("> weight grad of specified module is not monitored. ")
+         if not self.recording_l2_features:
+             logger.info_on_rank_0("> l2 features of specified module is not monitored. ")
          if not self.mg_direction:
              logger.info_on_rank_0('> grad and momentum direction will not be compared.')
          if not self.cc_distribution.get('enable', False):
@@ -533,6 +569,24 @@
          if self.grad_context.actv:
              self.summary_writer.write_metrics(self.ops, self.grad_context.actv, step, MonitorConst.ACTVGRAD)

+     def write_metrics_if_not_empty(self, features, metrics, step, hook_name):
+         if not features or len(features) == 0:
+             return
+         use_micro_step = hook_name not in ["linear_hook"]
+         self.summary_writer.write_metrics(metrics, features, step, hook_name, use_micro_step=use_micro_step)
+         features.clear()
+
+     def write_features_tb(self, step):
+         if not self.recording_l2_features:
+             return
+         for context in self.feature_hook_context_by_module.values():
+             num_features = len(context.attention_feature) + len(context.linear_feature)
+             if num_features == 0:
+                 continue
+             self.write_metrics_if_not_empty(context.attention_feature, ["entropy", "softmax_max"],
+                                             step, "attention_hook")
+             self.write_metrics_if_not_empty(context.linear_feature, ["sr", "kernel_norm"], step, "linear_hook")
+
      def write_param_tb(self, opt_context):
          if not self.param_distribution:
              return
@@ -687,6 +741,7 @@
          if self.anomaly_data_factory:
              self.anomaly_data_factory.set_call_id(self.param_name_call_id)
          self.write_xy_tb(context.step)
+         self.write_features_tb(context.step)
          self.write_grad_tb(context.step)
          self.write_mv_tb(context)
          self.write_param_tb(context)
@@ -756,7 +811,8 @@
              vpp_stage = f'{vpp_stage}{MonitorConst.NAME_SEP}'
              targets = [x for x, _ in model_chunk.named_modules()] if self.print_struct else self.config[
                  'targets'].keys()
-             hooked_count += self._hook_module(targets, model_chunk, vpp_stage)
+             l2_target_names = self.config.get('l2_targets', '')
+             hooked_count += self._hook_module(targets, l2_target_names, model_chunk, vpp_stage)

          logger.info_on_rank_0(f"> {hooked_count} modules are monitored.")

@@ -797,6 +853,9 @@
          for handle in self.handles['xy']:
              handle.remove()
          self.handles['xy'].clear()
+         for handle in self.handles['L2_features']:
+             handle.remove()
+         self.handles['L2_features'].clear()
          # clear the corresponding context caches
          for _, fwd_context in self.module_fwd_hook_context_by_module.items():
              fwd_context.reset()
@@ -804,22 +863,14 @@
              bwd_context.reset()
          self.grad_context.reset()  # both weight grads and activation grads are kept here

-         if self.origin_start_grad_sync:  # megatron
-             try:
-                 from megatron.core.distributed.param_and_grad_buffer import Bucket
-                 Bucket.start_grad_sync = self.origin_start_grad_sync
-                 logger.info("remove Bucket start_grad_sync")
-             except ImportError:
-                 pass
-             try:
-                 from megatron.core.distributed.param_and_grad_buffer import _ParamAndGradBucketGroup
-                 _ParamAndGradBucketGroup.start_grad_sync = self.origin_start_grad_sync
-                 logger.info("remove _ParamAndGradBucketGroup start_grad_sync")
-             except ImportError:
-                 pass
-         elif self.fsdp_post_backward_hook:  # fsdp
+         self.optimizer_mon.restore_grad_sync(self)
+         if self.fsdp_post_backward_hook:  # fsdp
              torch.distributed.fsdp._runtime_utils._post_backward_hook = self.fsdp_post_backward_hook
              logger.info("remove patch_post_backward_hook in fsdp.")
+         if self.fsdp2_foreach_reduce:  # fsdp2
+             torch.distributed.fsdp._fully_shard._fsdp_collectives.foreach_reduce = self.fsdp2_foreach_reduce
+             importlib.reload(torch.distributed.fsdp._fully_shard._fsdp_param_group)
+             logger.info("remove patch_foreach_reduce_hook in fsdp2.")
          else:  # not megatron and not fsdp
              for handle in self.handles['wgrads']:
                  handle.remove()
@@ -881,14 +932,11 @@
          logger.info(msg)

      def _save_module_struct(self):
-         save_module_struct = (not dist.is_initialized()
-                               or (self.module_rank_list and dist.get_rank() == min(self.module_rank_list))
-                               or (not self.module_rank_list and dist.get_rank() == 0))
-
-         if save_module_struct:
-             module_struct_file = os.path.realpath(os.path.join(get_output_base_dir(), 'module_struct.json'))
-             save_json(module_struct_file, self.module_struct, indent=2)
-             logger.info(f"> save module struct to {module_struct_file}")
+         output_dir = os.path.join(get_output_base_dir(), 'module_struct', f'rank{self.rank}')
+         make_dir(output_dir)
+         module_struct_file = os.path.realpath(os.path.join(output_dir, 'module_struct.json'))
+         save_json(module_struct_file, self.module_struct, indent=2)
+         logger.info(f"> save module struct to {module_struct_file}")
          self.struct_printed = True

      def _is_target_param(self, param_name, param, prefix):
@@ -896,23 +944,32 @@
          squash_name = prefix + squash_param_name(param_name, self.squash_name)
          for target in self.config['targets'].keys():
              if param_name.startswith(target) or squash_name.startswith(target) or name.startswith(target):
-                 setattr(param, "zero_out_wgrad", True)
                  return True

          return False

      def _register_chunk(self, model_chunk, prefix):
+         if isinstance(model_chunk, FSDP):
+             if not model_chunk._use_orig_params:
+                 raise ValueError("Only Support fsdp1 with use_orig_params=True")
+             self.fsdp_wrapped_module = True
          for (param_name, param) in model_chunk.named_parameters():
              if not param.requires_grad:
                  continue
-             if not self.fsdp_wrapped_module and param_name.startswith("_fsdp_wrapped_module"):
-                 self.fsdp_wrapped_module = True
+             if not self.fsdp2_wrapped_module and param.__class__.__name__ == "DTensor":
+                 self.fsdp2_wrapped_module = True
+             if self.fsdp_wrapped_module:  # FSDP1: record every flat-weight prefix name, not limited by targets, so the flat param can be unpacked later
+                 flat_prefix_name, _ = param_name.rsplit(MonitorConst.FSDP_FLAT_SEP, 1)
+                 if flat_prefix_name not in self.flat_prefix_names:
+                     self.flat_prefix_names.append(flat_prefix_name)
+
              if self._is_target_param(param_name, param, prefix):
                  name = prefix + squash_param_name(param_name, self.squash_name)
                  if name in self.param2name.values():
                      name = prefix + param_name
                  self.param2name[param] = name
                  self.name2param[name] = param
+                 self.origin2squash[param_name] = name

                  if self.tp_group and not param_is_not_tensor_parallel_duplicate(param, self.tp_group):
                      self.duplicate_param[name] = True
@@ -929,6 +986,8 @@
                  k: get_summary_writer_tag_name(name, k, self.rank)
                  for k in keywords
              }
+         if self.fsdp_wrapped_module:
+             self.flat_prefix_reverse_iter = cycle(reversed(self.flat_prefix_names))  # post_backward_hook is invoked in reverse order

      def _register_param_name(self):
          for vpp_stage, model_chunk in enumerate(self.model):
@@ -946,7 +1005,20 @@
                  return pattern
          return ""

-     def _hook_module(self, target_names, module: torch.nn.Module, vpp_stage=''):
+     def _is_recording_module(self, module_name, l2_targets, vpp_stage, hook_name):
+
+         if len(l2_targets) > 0:
+             for pattern in [
+                 vpp_stage + squash_param_name(module_name, self.squash_name),
+                 vpp_stage + module_name,
+             ]:
+                 if pattern in l2_targets:
+                     return pattern
+         elif hook_name in ["linear_hook"]:
+             return vpp_stage + squash_param_name(module_name, self.squash_name)
+         return ""
+
+     def _hook_module(self, target_names, l2_target_names, module: torch.nn.Module, vpp_stage=''):
          if '_modules' not in module.__dict__:
              # nothing to hook
              return 0
@@ -1025,6 +1097,61 @@
                  context.micro_step = 0
              return

+         def extract_attention_feature_hook(module, module_input, module_output, name):
+             if is_recomputation() or not module.training:
+                 return
+
+             if module not in self.feature_hook_context_by_module:
+                 self.feature_hook_context_by_module[module] = FeatureHookContext(name)
+             context: FeatureHookContext = self.feature_hook_context_by_module[module]
+             tbtag_tensor_map = {}
+             if len(module_input) < 2:
+                 logger.warning(
+                     f"Length of module_input in attention hook ({name}) is {len(module_input)}, "
+                     "expected >= 2. Skipping feature extraction for this module."
+                 )
+                 return
+             q_h = module_input[0]
+             k_h = module_input[1]
+             qkt = cal_qkt(q_h, k_h, order=self.sa_order)
+             tbtag_tensor_map.update(
+                 self.build_tbtag_tensor_map(f'{context.module_name}.attention',
+                                             f'{MonitorConst.NAME_SEP}{context.micro_step}', 'qkt', qkt)
+             )
+             get_entropy_metric(tbtag_tensor_map, context.attention_feature)
+
+             context.micro_step += 1
+             if context.micro_step == self.micro_batch_number:
+                 context.micro_step = 0
+                 context.step += 1
+             return
+
+         def extract_linear_sr_hook(module, module_input, module_output, name):
+             if is_recomputation() or not module.training:
+                 return
+             weight_name = self.get_linear_hook_target(module)
+             if weight_name == '':
+                 return
+
+             if module not in self.feature_hook_context_by_module:
+                 self.feature_hook_context_by_module[module] = FeatureHookContext(name)
+             context: FeatureHookContext = self.feature_hook_context_by_module[module]
+
+             if context.micro_step == (self.micro_batch_number - 1):
+                 tbtag_tensor_map = {}
+                 value = getattr(module, weight_name).data
+                 tbtag_tensor_map.update(
+                     self.build_tbtag_tensor_map(f'{context.module_name}.linear',
+                                                 '', 'sr', value)
+                 )
+                 get_sr_metric(tbtag_tensor_map, context.linear_feature)
+
+             context.micro_step += 1
+             if context.micro_step == self.micro_batch_number:
+                 context.micro_step = 0
+                 context.step += 1
+             return
+
          def stack_hook(module, args, kwargs, module_output, name):
              if module not in self.module_fwd_hook_context_by_module:
                  self.module_fwd_hook_context_by_module[module] = ModuleHookContext(name)
@@ -1056,34 +1183,29 @@
                  self.module_bwd_hook_context_by_module[submodule] = ModuleHookContext(name)
                  logger.info_on_rank_0(f"> {name} is monitored successfully")
                  hooked_count += 1
-         return hooked_count
-
-     def _patch_grad_sync(self):
-         def patch_sync(sync_grad_func):
-             def wrapper(bucket):
-                 grad_dict = {}
-                 # Megatron between core_r0.6.0 and core_r0.8.0, this bucket is Bucket.
-                 # When megatron is core_r0.9.0, this bucket is _ParamAndGradBucketGroup.
-                 # In megatron version core_r0.9.0, func start_grad_sync from Bucket moved to _ParamAndGradBucketGroup.
-                 bucket_params_id_list = [id(params) for params in bucket.params]
-                 for param, name in self.param2name.items():
-                     if id(param) not in bucket_params_id_list:
-                         continue
-                     grad = param.main_grad if self.params_have_main_grad else param.grad
-                     if grad is None:
-                         logger.warning(f"grad is None: {name}, maybe something wrong happened.")
-                         continue
-                     tag = self.name2tag.get(name, {}).get(MonitorConst.PRE_GRAD)
-                     if tag is None:
+         if not self.print_struct and self.recording_l2_features:
+             for module_name, submodule in module.named_modules():
+                 func_map = {
+                     "attention_hook": extract_attention_feature_hook,
+                     "linear_hook": extract_linear_sr_hook,
+                 }
+                 for hook_name in func_map.keys():
+                     if hook_name not in l2_target_names:
                          continue
-                     grad_dict[tag] = grad
-                     self.register_param_call_id("sync_grad_func", tag)
-                 get_metrics(self.ops, grad_dict, self.eps, self.grad_context.pre)
-                 out = sync_grad_func(bucket)
-                 return out
+                     temp_names = l2_target_names[hook_name]
+                     name = self._is_recording_module(module_name, temp_names, vpp_stage, hook_name)
+                     if name:
+                         handle = submodule.register_forward_hook(partial(func_map[hook_name], name=name))
+                         print_feature_name = hook_name.split('_')[0]
+                         logger.info_on_rank_0(
+                             f'> {print_feature_name} features of {name} is monitored successfully')
+                         self.handles["L2_features"].append(handle)
+                         hooked_count += 1
+                     continue

-             return wrapper
+         return hooked_count

+     def _patch_grad_sync(self):
          if not self.wg_distribution:
              return
          if self.fsdp_wrapped_module:
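A self-contained illustration (not msprobe code) of the registration pattern used above: `functools.partial` pins the extra `name` argument so the callable still matches PyTorch's standard forward-hook signature `hook(module, input, output)`.

```python
import torch
from functools import partial

def feature_hook(module, module_input, module_output, name):
    # Toy hook; real hooks above compute qkt entropy or weight stable rank.
    print(f"{name}: output norm = {module_output.norm().item():.4f}")

layer = torch.nn.Linear(8, 8)
handle = layer.register_forward_hook(partial(feature_hook, name="toy.linear"))
layer(torch.randn(2, 8))
handle.remove()  # TrainerMon keeps such handles in self.handles["L2_features"] and removes them the same way
```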
@@ -1091,27 +1213,18 @@
              self._patch_fsdp_post_backward_hook()
              return

+         if self.fsdp2_wrapped_module:
+             # patch fsdp2 _fully_shard._fsdp_collectives.foreach_reduce
+             self._patch_fsdp2_foreach_reduce()
+             return
+
          if self.monitor_mbs_grad:
              self._hook_weights()
              return
-         try:
-             from megatron.core.distributed.param_and_grad_buffer import Bucket
-             self.origin_start_grad_sync = Bucket.start_grad_sync
-             Bucket.start_grad_sync = patch_sync(Bucket.start_grad_sync)
-             self.enable_megatron = True
-             logger.info("megatron version is >= core_r0.6.0 <= core_r0.8.0")
-         except ImportError:
-             self.enable_megatron = False
+
+         self.optimizer_mon.patch_grad_sync(self)

-         try:
-             from megatron.core.distributed.param_and_grad_buffer import _ParamAndGradBucketGroup
-             self.origin_start_grad_sync = _ParamAndGradBucketGroup.start_grad_sync
-             _ParamAndGradBucketGroup.start_grad_sync = patch_sync(_ParamAndGradBucketGroup.start_grad_sync)
-             self.enable_megatron = True
-             logger.info("megatron version is > core_r0.8.0 <= core_r0.9.0")
-         except ImportError:
-             self.enable_megatron = False | self.enable_megatron
-         if self.enable_megatron:
+         if self.enable_megatron or self.enable_deepspeed:
              return

          # default hook weights
@@ -1124,17 +1237,22 @@
          In every forward phase, FSDP re-registers hooks on AccumulateGrad, so hooks registered by the monitor tool cannot take effect;
          therefore _post_backward_hook is patched to collect gradients after backward and before reduce_scatter.
          """
+
          def patch_post_backward_hook(_post_backward_hook):
              def wrapper(state, handle, *unused):
                  grad_dict = {}
-                 offset = 0
-                 for param, name in self.param2name.items():
-                     limit = param.numel()
-                     if not limit:
+                 local_names = handle.flat_param._fqns
+                 offsets = handle._get_flat_param_offsets()
+                 shapes = handle.flat_param._shapes
+                 flat_prefix = next(self.flat_prefix_reverse_iter)
+                 for local_name, (start, end), local_shape in zip(local_names, offsets, shapes):
+                     grad_clip = handle.flat_param.grad[start:end + 1]
+                     grad = grad_clip.reshape(local_shape)
+                     total_name = f"{flat_prefix}{MonitorConst.FSDP_FLAT_SEP}{local_name}"
+                     if total_name not in self.origin2squash:
+                         logger.warning(f"{total_name} not in model.named_parameters(), skip.")
                          continue
-                     grad = handle.flat_param.grad[offset:offset + limit]
-                     offset += limit
-                     tag = self.name2tag.get(name, {}).get(MonitorConst.PRE_GRAD)
+                     tag = self.name2tag.get(self.origin2squash[total_name], {}).get(MonitorConst.PRE_GRAD)
                      if tag is None:
                          continue
                      grad_dict[tag] = grad
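The patched hook above walks FSDP1's flat parameter using its recorded fully-qualified names, inclusive offsets, and shapes. A toy analogue of that slice-and-reshape step (plain tensors, no FSDP; all names, offsets, and shapes below are invented for illustration):

```python
import torch

flat_grad = torch.arange(12, dtype=torch.float32)   # stand-in for handle.flat_param.grad
local_names = ["layer.weight", "layer.bias_like"]   # stand-in for flat_param._fqns
offsets = [(0, 5), (6, 11)]                          # stand-in for handle._get_flat_param_offsets(), inclusive (start, end)
shapes = [torch.Size([2, 3]), torch.Size([3, 2])]    # stand-in for flat_param._shapes

for local_name, (start, end), local_shape in zip(local_names, offsets, shapes):
    # slice the flat buffer as grad[start:end + 1], then restore the parameter's shape
    grad = flat_grad[start:end + 1].reshape(local_shape)
    print(local_name, tuple(grad.shape))
```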
@@ -1150,6 +1268,28 @@
          torch.distributed.fsdp._runtime_utils._post_backward_hook = \
              patch_post_backward_hook(torch.distributed.fsdp._runtime_utils._post_backward_hook)

+     def _patch_fsdp2_foreach_reduce(self):
+         def patch_foreach_reduce(foreach_reduce):
+             def wrapper(fsdp_params, unsharded_grads, *unused):
+                 grad_dict = {}
+                 for param, grad in zip(fsdp_params, unsharded_grads):
+                     tag = self.name2tag.get(self.origin2squash[param._param_fqn], {}).get(MonitorConst.PRE_GRAD)
+                     if tag is None:
+                         continue
+                     grad_dict[tag] = grad
+                     self.register_param_call_id("foreach_reduce", tag)
+                 get_metrics(self.ops, grad_dict, self.eps, self.grad_context.pre)
+                 out = foreach_reduce(fsdp_params, unsharded_grads, *unused)
+                 return out
+             return wrapper
+
+         logger.info("Patch fsdp2 foreach_reduce, collect pre_grad metrics.")
+         import torch.distributed.fsdp._fully_shard._fsdp_param_group as _fsdp_param_group
+         import torch.distributed.fsdp._fully_shard._fsdp_collectives as _fsdp_collectives
+         self.fsdp2_foreach_reduce = _fsdp_collectives.foreach_reduce
+         _fsdp_collectives.foreach_reduce = patch_foreach_reduce(_fsdp_collectives.foreach_reduce)
+         importlib.reload(_fsdp_param_group)  # key step: torch binds foreach_reduce at import time, so without this reload the patch would not take effect
+
      def _hook_weights(self):
          """
          Iterate over each parameter's gradient accumulation function (grad_acc) and attach hooks, so that once all of the parameter's gradients are computed, the gradient data can be collected before communication aggregation.
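A generic, self-contained illustration of why `_patch_fsdp2_foreach_reduce` has to call `importlib.reload()` on the consumer module: a module that does `from provider import func` binds the name at import time, so patching `provider.func` afterwards is invisible to it until it is reloaded. The toy `provider`/`consumer` modules below are invented for illustration and have nothing to do with torch or msprobe.

```python
import importlib
import os
import sys
import tempfile

tmp = tempfile.mkdtemp()
with open(os.path.join(tmp, "provider.py"), "w") as f:
    f.write("def func():\n    return 'original'\n")
with open(os.path.join(tmp, "consumer.py"), "w") as f:
    f.write("from provider import func\n\n\ndef call():\n    return func()\n")
sys.path.insert(0, tmp)

import provider
import consumer

provider.func = lambda: "patched"
print(consumer.call())        # 'original' -- consumer still holds the binding made at import time
importlib.reload(consumer)    # re-executes the from-import against the patched provider
print(consumer.call())        # 'patched'
```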
msprobe/pytorch/monitor/module_metric.py
@@ -17,6 +17,7 @@ import re
  import torch

  from msprobe.pytorch.monitor.features import get_max, get_min, get_zeros, get_nans, get_norm, get_mean
+ from msprobe.pytorch.monitor.features import cal_entropy, cal_stable_rank
  from msprobe.pytorch.monitor.utils import get_nan_tensor


@@ -31,7 +32,8 @@ def squash_param_name(param_name, enable=True):
      if not enable:
          return param_name
      name = ''
-     for pattern in ['layers?\.(.*)', 'embeddings?\.(.*)', 'final.*', 'output.*', 'norm.*']:
+     for pattern in ['^.*\.(layers?\..*)', '^.*\.(embeddings?\..*)', '^.*\.(final.*)', '^.*\.(output.*)',
+                     '^.*\.(norm.*)']:
          match = re.findall(pattern, param_name)
          if match:
              name += match[0]
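A small check of the regex change, using a made-up Megatron-style parameter name (only the `re.findall` behaviour is shown; the parameter name is illustrative, not from the package):

```python
import re

param_name = "module.module.decoder.layers.0.mlp.linear_fc1.weight"

print(re.findall(r'layers?\.(.*)', param_name))
# ['0.mlp.linear_fc1.weight']  -> old pattern drops the "layers." prefix

print(re.findall(r'^.*\.(layers?\..*)', param_name))
# ['layers.0.mlp.linear_fc1.weight']  -> new anchored pattern keeps it
```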
@@ -184,3 +186,27 @@
          fun_metric = config_metric_registry.get(metric_name)
          out_dict[tag][metric_name] = fun_metric.get_metric(tensor, eps)
      return out_dict
+
+
+ def get_sr_metric(tag2tensor, out_dict=None):
+     if out_dict is None:
+         out_dict = {}
+     for tag, tensor in tag2tensor.items():
+         if "sr" not in tag:
+             continue
+         if tag not in out_dict:
+             out_dict[tag] = {}
+         sr, eig = cal_stable_rank(tensor)
+         out_dict[tag]['sr'] = sr
+         out_dict[tag]['kernel_norm'] = eig
+
+
+ def get_entropy_metric(tag2tensor, out_dict=None):
+     if out_dict is None:
+         out_dict = {}
+     for tag, tensor in tag2tensor.items():
+         if tag not in out_dict:
+             out_dict[tag] = {}
+         entropy, softmax_max = cal_entropy(tensor)
+         out_dict[tag]['entropy'] = entropy
+         out_dict[tag]['softmax_max'] = softmax_max
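`cal_stable_rank` and `cal_entropy` are imported from `msprobe/pytorch/monitor/features.py`, which this release extends but which is not shown in this diff. A hedged sketch of what those helpers plausibly compute, using the standard definitions of stable rank and softmax-row entropy; these formulas are assumptions, not the package's actual implementation:

```python
import torch

def cal_stable_rank_sketch(weight: torch.Tensor):
    """Assumed: stable rank ||W||_F^2 / sigma_max^2 plus the spectral norm of a 2-D weight."""
    sigma_max = torch.linalg.matrix_norm(weight.float(), ord=2)     # largest singular value
    fro = torch.linalg.matrix_norm(weight.float(), ord="fro")
    return (fro ** 2) / (sigma_max ** 2), sigma_max

def cal_entropy_sketch(qkt: torch.Tensor):
    """Assumed: mean row entropy and mean max probability of softmax(q @ k^T)."""
    probs = torch.softmax(qkt.float(), dim=-1)
    entropy = -(probs * torch.log(probs.clamp_min(1e-12))).sum(dim=-1).mean()
    return entropy, probs.max(dim=-1).values.mean()

print(cal_stable_rank_sketch(torch.randn(128, 64)))
print(cal_entropy_sketch(torch.randn(4, 16, 16)))
```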