mindstudio-probe 1.2.2__py3-none-any.whl → 8.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
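To reproduce a comparison like this locally, the sketch below downloads both wheels and diffs one shared file. This is not part of the registry tooling; it assumes both versions of mindstudio-probe can be fetched from your configured package index, and the target path is one of the files listed in this diff.

```python
# Hedged sketch: fetch both mindstudio-probe wheels and diff one file they share.
# Assumes "pip download" can reach these exact versions on your package index.
import subprocess, zipfile, difflib, pathlib

def fetch(version: str, dest: pathlib.Path) -> pathlib.Path:
    dest.mkdir(parents=True, exist_ok=True)
    subprocess.run(
        ["pip", "download", f"mindstudio-probe=={version}", "--no-deps", "--dest", str(dest)],
        check=True,
    )
    return next(dest.glob("*.whl"))

def file_lines(whl: pathlib.Path, name: str) -> list[str]:
    with zipfile.ZipFile(whl) as zf:
        return zf.read(name).decode("utf-8", errors="replace").splitlines()

old_whl = fetch("1.2.2", pathlib.Path("old"))
new_whl = fetch("8.1.0", pathlib.Path("new"))

target = "msprobe/pytorch/monitor/module_hook.py"  # file #215 in the list below
for line in difflib.unified_diff(
        file_lines(old_whl, target), file_lines(new_whl, target),
        fromfile=f"1.2.2/{target}", tofile=f"8.1.0/{target}", lineterm=""):
    print(line)
```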
Files changed (261)
  1. {mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/METADATA +4 -3
  2. {mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/RECORD +243 -191
  3. msprobe/README.md +57 -21
  4. msprobe/core/__init__.py +17 -0
  5. msprobe/core/common/const.py +224 -82
  6. msprobe/core/common/decorator.py +50 -0
  7. msprobe/core/common/exceptions.py +5 -3
  8. msprobe/core/common/file_utils.py +274 -40
  9. msprobe/core/common/framework_adapter.py +169 -0
  10. msprobe/core/common/global_lock.py +86 -0
  11. msprobe/core/common/runtime.py +25 -0
  12. msprobe/core/common/utils.py +148 -72
  13. msprobe/core/common_config.py +7 -0
  14. msprobe/core/compare/acc_compare.py +640 -462
  15. msprobe/core/compare/check.py +36 -107
  16. msprobe/core/compare/compare_cli.py +4 -0
  17. msprobe/core/compare/config.py +72 -0
  18. msprobe/core/compare/highlight.py +217 -215
  19. msprobe/core/compare/layer_mapping/layer_mapping.py +4 -1
  20. msprobe/core/compare/merge_result/merge_result.py +12 -6
  21. msprobe/core/compare/multiprocessing_compute.py +227 -107
  22. msprobe/core/compare/npy_compare.py +32 -16
  23. msprobe/core/compare/utils.py +218 -244
  24. msprobe/{mindspore/runtime.py → core/config_check/__init__.py} +2 -4
  25. msprobe/{pytorch/dump/kernel_dump/kernel_config.py → core/config_check/checkers/__init__.py} +8 -16
  26. msprobe/core/config_check/checkers/base_checker.py +60 -0
  27. msprobe/core/config_check/checkers/dataset_checker.py +138 -0
  28. msprobe/core/config_check/checkers/env_args_checker.py +96 -0
  29. msprobe/core/config_check/checkers/hyperparameter_checker.py +170 -0
  30. msprobe/core/config_check/checkers/pip_checker.py +90 -0
  31. msprobe/core/config_check/checkers/random_checker.py +367 -0
  32. msprobe/core/config_check/checkers/weights_checker.py +147 -0
  33. msprobe/core/config_check/ckpt_compare/ckpt_comparator.py +74 -0
  34. msprobe/core/config_check/ckpt_compare/megatron_loader.py +302 -0
  35. msprobe/core/config_check/ckpt_compare/metrics.py +83 -0
  36. msprobe/core/config_check/ckpt_compare/name_mapping.yaml +12 -0
  37. msprobe/core/config_check/config_check_cli.py +51 -0
  38. msprobe/core/config_check/config_checker.py +100 -0
  39. msprobe/{pytorch/parse.py → core/config_check/resource/dependency.yaml} +7 -4
  40. msprobe/core/config_check/resource/env.yaml +57 -0
  41. msprobe/core/config_check/resource/hyperparameter.yaml +21 -0
  42. msprobe/core/config_check/utils/hyperparameter_parser.py +115 -0
  43. msprobe/core/config_check/utils/utils.py +107 -0
  44. msprobe/core/data_dump/api_registry.py +239 -0
  45. msprobe/core/data_dump/data_collector.py +36 -9
  46. msprobe/core/data_dump/data_processor/base.py +74 -53
  47. msprobe/core/data_dump/data_processor/mindspore_processor.py +119 -78
  48. msprobe/core/data_dump/data_processor/pytorch_processor.py +134 -96
  49. msprobe/core/data_dump/json_writer.py +146 -57
  50. msprobe/core/debugger/precision_debugger.py +143 -0
  51. msprobe/core/grad_probe/constant.py +2 -1
  52. msprobe/core/grad_probe/grad_compare.py +2 -2
  53. msprobe/core/grad_probe/utils.py +1 -1
  54. msprobe/core/hook_manager.py +242 -0
  55. msprobe/core/monitor/anomaly_processor.py +384 -0
  56. msprobe/core/overflow_check/abnormal_scene.py +2 -0
  57. msprobe/core/service.py +356 -0
  58. msprobe/core/single_save/__init__.py +0 -0
  59. msprobe/core/single_save/single_comparator.py +243 -0
  60. msprobe/core/single_save/single_saver.py +157 -0
  61. msprobe/docs/01.installation.md +6 -5
  62. msprobe/docs/02.config_introduction.md +89 -30
  63. msprobe/docs/03.config_examples.md +1 -0
  64. msprobe/docs/04.kernel_dump_PyTorch.md +1 -1
  65. msprobe/docs/05.data_dump_PyTorch.md +184 -50
  66. msprobe/docs/06.data_dump_MindSpore.md +193 -28
  67. msprobe/docs/07.accuracy_checker_PyTorch.md +13 -3
  68. msprobe/docs/08.accuracy_checker_online_PyTorch.md +72 -10
  69. msprobe/docs/09.accuracy_checker_MindSpore.md +19 -7
  70. msprobe/docs/10.accuracy_compare_PyTorch.md +266 -102
  71. msprobe/docs/11.accuracy_compare_MindSpore.md +117 -43
  72. msprobe/docs/12.overflow_check_PyTorch.md +5 -3
  73. msprobe/docs/13.overflow_check_MindSpore.md +6 -4
  74. msprobe/docs/14.data_parse_PyTorch.md +4 -10
  75. msprobe/docs/17.grad_probe.md +2 -1
  76. msprobe/docs/18.online_dispatch.md +3 -3
  77. msprobe/docs/19.monitor.md +211 -103
  78. msprobe/docs/21.visualization_PyTorch.md +100 -28
  79. msprobe/docs/22.visualization_MindSpore.md +103 -31
  80. msprobe/docs/23.generate_operator_PyTorch.md +9 -9
  81. msprobe/docs/25.tool_function_introduction.md +23 -22
  82. msprobe/docs/26.data_dump_PyTorch_baseline.md +14 -3
  83. msprobe/docs/27.dump_json_instruction.md +278 -8
  84. msprobe/docs/28.debugger_save_instruction.md +111 -20
  85. msprobe/docs/28.kernel_dump_MindSpore.md +1 -1
  86. msprobe/docs/29.data_dump_MSAdapter.md +229 -0
  87. msprobe/docs/30.overflow_check_MSAdapter.md +31 -0
  88. msprobe/docs/31.config_check.md +95 -0
  89. msprobe/docs/32.ckpt_compare.md +69 -0
  90. msprobe/docs/33.generate_operator_MindSpore.md +190 -0
  91. msprobe/docs/34.RL_collect.md +92 -0
  92. msprobe/docs/35.nan_analyze.md +72 -0
  93. msprobe/docs/FAQ.md +3 -11
  94. msprobe/docs/data_dump_MindSpore/data_dump_MindSpore_baseline.md +12 -1
  95. msprobe/docs/data_dump_MindSpore/dynamic_graph_quick_start_example.md +3 -1
  96. msprobe/docs/img/compare_result.png +0 -0
  97. msprobe/docs/img/merge_result.png +0 -0
  98. msprobe/docs/img/save_compare_result_sample.png +0 -0
  99. msprobe/docs/img/visualization/proxy.png +0 -0
  100. msprobe/docs/img/visualization/vis_browser_1.png +0 -0
  101. msprobe/docs/img/visualization/vis_match_info.png +0 -0
  102. msprobe/docs/img/visualization/vis_precision_info.png +0 -0
  103. msprobe/docs/img/visualization/vis_search_info.png +0 -0
  104. msprobe/docs/img/visualization/vis_show_info.png +0 -0
  105. msprobe/docs/img/visualization/vis_showcase.png +0 -0
  106. msprobe/docs/img/visualization/vis_unmatch_info.png +0 -0
  107. msprobe/mindspore/__init__.py +3 -3
  108. msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +151 -55
  109. msprobe/mindspore/api_accuracy_checker/api_runner.py +25 -11
  110. msprobe/mindspore/api_accuracy_checker/base_compare_algorithm.py +2 -1
  111. msprobe/mindspore/api_accuracy_checker/bench_functions/flash_attention_score.py +580 -0
  112. msprobe/mindspore/api_accuracy_checker/bench_functions/fusion_operator.py +41 -0
  113. msprobe/mindspore/api_accuracy_checker/cmd_parser.py +4 -0
  114. msprobe/mindspore/api_accuracy_checker/data_manager.py +4 -3
  115. msprobe/mindspore/api_accuracy_checker/generate_op_script/config_op.json +9 -0
  116. msprobe/mindspore/api_accuracy_checker/generate_op_script/op_generator.py +451 -0
  117. msprobe/mindspore/api_accuracy_checker/generate_op_script/operator_replication.template +2081 -0
  118. msprobe/mindspore/api_accuracy_checker/multi_api_accuracy_checker.py +11 -1
  119. msprobe/mindspore/api_accuracy_checker/torch_mindtorch_importer.py +2 -1
  120. msprobe/mindspore/cell_processor.py +204 -33
  121. msprobe/mindspore/code_mapping/graph_parser.py +4 -21
  122. msprobe/mindspore/common/const.py +73 -2
  123. msprobe/mindspore/common/utils.py +157 -29
  124. msprobe/mindspore/compare/common_dir_compare.py +382 -0
  125. msprobe/mindspore/compare/distributed_compare.py +2 -26
  126. msprobe/mindspore/compare/ms_compare.py +18 -398
  127. msprobe/mindspore/compare/ms_graph_compare.py +20 -10
  128. msprobe/mindspore/compare/utils.py +37 -0
  129. msprobe/mindspore/debugger/debugger_config.py +59 -7
  130. msprobe/mindspore/debugger/precision_debugger.py +83 -90
  131. msprobe/mindspore/dump/cell_dump_process.py +902 -0
  132. msprobe/mindspore/dump/cell_dump_with_insert_gradient.py +889 -0
  133. msprobe/mindspore/dump/dump_tool_factory.py +18 -8
  134. msprobe/mindspore/dump/graph_mode_cell_dump.py +139 -0
  135. msprobe/mindspore/dump/graph_tensor_dump.py +123 -0
  136. msprobe/mindspore/dump/hook_cell/api_register.py +176 -0
  137. msprobe/mindspore/dump/hook_cell/hook_cell.py +22 -12
  138. msprobe/mindspore/dump/hook_cell/ms_hook_manager.py +88 -0
  139. msprobe/mindspore/dump/hook_cell/primitive_hooks.py +8 -2
  140. msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml +42 -26
  141. msprobe/mindspore/dump/jit_dump.py +35 -27
  142. msprobe/mindspore/dump/kernel_kbyk_dump.py +6 -3
  143. msprobe/mindspore/dym_loader/hook_dynamic_loader.cpp +110 -0
  144. msprobe/mindspore/dym_loader/hook_dynamic_loader.h +15 -16
  145. msprobe/mindspore/free_benchmark/api_pynative_self_check.py +22 -12
  146. msprobe/mindspore/free_benchmark/common/utils.py +1 -1
  147. msprobe/mindspore/free_benchmark/perturbation/perturbation_factory.py +4 -2
  148. msprobe/mindspore/free_benchmark/self_check_tool_factory.py +6 -3
  149. msprobe/mindspore/grad_probe/global_context.py +9 -2
  150. msprobe/mindspore/grad_probe/grad_analyzer.py +2 -1
  151. msprobe/mindspore/grad_probe/grad_stat_csv.py +3 -2
  152. msprobe/mindspore/grad_probe/hook.py +2 -4
  153. msprobe/mindspore/mindspore_service.py +111 -0
  154. msprobe/mindspore/monitor/common_func.py +52 -0
  155. msprobe/mindspore/monitor/data_writers.py +237 -0
  156. msprobe/mindspore/monitor/distributed/wrap_distributed.py +1 -1
  157. msprobe/mindspore/monitor/features.py +13 -1
  158. msprobe/mindspore/monitor/module_hook.py +568 -444
  159. msprobe/mindspore/monitor/optimizer_collect.py +331 -0
  160. msprobe/mindspore/monitor/utils.py +71 -9
  161. msprobe/mindspore/ms_config.py +16 -15
  162. msprobe/mindspore/overflow_check/overflow_check_tool_factory.py +5 -3
  163. msprobe/mindspore/task_handler_factory.py +5 -2
  164. msprobe/msprobe.py +19 -0
  165. msprobe/nan_analyze/__init__.py +14 -0
  166. msprobe/nan_analyze/analyzer.py +255 -0
  167. msprobe/nan_analyze/graph.py +189 -0
  168. msprobe/nan_analyze/utils.py +211 -0
  169. msprobe/pytorch/api_accuracy_checker/common/config.py +2 -2
  170. msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +3 -6
  171. msprobe/pytorch/api_accuracy_checker/compare/compare.py +36 -34
  172. msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +15 -13
  173. msprobe/pytorch/api_accuracy_checker/generate_op_script/operator_replication.template +206 -4
  174. msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +9 -9
  175. msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +6 -5
  176. msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +31 -9
  177. msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py +28 -20
  178. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py +3 -1
  179. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +29 -13
  180. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +12 -2
  181. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +45 -31
  182. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/utils.py +154 -0
  183. msprobe/pytorch/attl_manager.py +65 -0
  184. msprobe/pytorch/bench_functions/moe_gating_top_k_softmax.py +6 -0
  185. msprobe/pytorch/bench_functions/npu_fusion_attention.py +27 -0
  186. msprobe/pytorch/common/utils.py +53 -19
  187. msprobe/pytorch/compare/distributed_compare.py +4 -36
  188. msprobe/pytorch/compare/pt_compare.py +13 -84
  189. msprobe/pytorch/compare/utils.py +47 -0
  190. msprobe/pytorch/debugger/debugger_config.py +34 -17
  191. msprobe/pytorch/debugger/precision_debugger.py +50 -96
  192. msprobe/pytorch/dump/module_dump/hook_wrapper.py +93 -0
  193. msprobe/pytorch/dump/module_dump/module_dump.py +15 -61
  194. msprobe/pytorch/dump/module_dump/module_processer.py +150 -114
  195. msprobe/pytorch/free_benchmark/common/utils.py +1 -1
  196. msprobe/pytorch/free_benchmark/compare/single_benchmark.py +1 -1
  197. msprobe/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py +3 -3
  198. msprobe/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py +3 -3
  199. msprobe/pytorch/free_benchmark/perturbed_layers/npu/change_value.py +1 -1
  200. msprobe/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py +1 -1
  201. msprobe/pytorch/free_benchmark/result_handlers/check_handler.py +1 -1
  202. msprobe/pytorch/function_factory.py +1 -1
  203. msprobe/pytorch/grad_probe/grad_monitor.py +2 -2
  204. msprobe/pytorch/grad_probe/grad_stat_csv.py +3 -2
  205. msprobe/pytorch/hook_module/api_register.py +155 -0
  206. msprobe/pytorch/hook_module/hook_module.py +18 -22
  207. msprobe/pytorch/hook_module/jit_script_wrapper.py +33 -0
  208. msprobe/pytorch/hook_module/pt_hook_manager.py +68 -0
  209. msprobe/pytorch/hook_module/register_optimizer_hook.py +2 -1
  210. msprobe/pytorch/hook_module/support_wrap_ops.yaml +193 -75
  211. msprobe/pytorch/hook_module/utils.py +28 -2
  212. msprobe/pytorch/monitor/csv2tb.py +14 -4
  213. msprobe/pytorch/monitor/data_writers.py +259 -0
  214. msprobe/pytorch/monitor/distributed/wrap_distributed.py +8 -2
  215. msprobe/pytorch/monitor/module_hook.py +336 -241
  216. msprobe/pytorch/monitor/module_metric.py +17 -0
  217. msprobe/pytorch/monitor/optimizer_collect.py +244 -224
  218. msprobe/pytorch/monitor/utils.py +84 -4
  219. msprobe/pytorch/online_dispatch/compare.py +0 -2
  220. msprobe/pytorch/online_dispatch/dispatch.py +13 -2
  221. msprobe/pytorch/online_dispatch/dump_compare.py +8 -2
  222. msprobe/pytorch/online_dispatch/utils.py +3 -0
  223. msprobe/pytorch/parse_tool/lib/interactive_cli.py +1 -6
  224. msprobe/pytorch/parse_tool/lib/utils.py +5 -4
  225. msprobe/pytorch/pt_config.py +16 -11
  226. msprobe/pytorch/pytorch_service.py +70 -0
  227. msprobe/visualization/builder/graph_builder.py +69 -10
  228. msprobe/visualization/builder/msprobe_adapter.py +24 -12
  229. msprobe/visualization/compare/graph_comparator.py +63 -51
  230. msprobe/visualization/compare/mode_adapter.py +22 -20
  231. msprobe/visualization/graph/base_node.py +11 -4
  232. msprobe/visualization/graph/distributed_analyzer.py +1 -10
  233. msprobe/visualization/graph/graph.py +2 -13
  234. msprobe/visualization/graph/node_op.py +1 -2
  235. msprobe/visualization/graph_service.py +251 -104
  236. msprobe/visualization/utils.py +26 -44
  237. msprobe/mindspore/dump/hook_cell/api_registry.py +0 -207
  238. msprobe/mindspore/dump/hook_cell/wrap_api.py +0 -212
  239. msprobe/mindspore/dym_loader/hook_dynamic_loader.cc +0 -140
  240. msprobe/mindspore/monitor/anomaly_detect.py +0 -404
  241. msprobe/mindspore/monitor/module_spec_verifier.py +0 -94
  242. msprobe/mindspore/service.py +0 -543
  243. msprobe/pytorch/hook_module/api_registry.py +0 -166
  244. msprobe/pytorch/hook_module/wrap_distributed.py +0 -79
  245. msprobe/pytorch/hook_module/wrap_functional.py +0 -66
  246. msprobe/pytorch/hook_module/wrap_npu_custom.py +0 -85
  247. msprobe/pytorch/hook_module/wrap_tensor.py +0 -69
  248. msprobe/pytorch/hook_module/wrap_torch.py +0 -84
  249. msprobe/pytorch/hook_module/wrap_vf.py +0 -60
  250. msprobe/pytorch/monitor/anomaly_analyse.py +0 -201
  251. msprobe/pytorch/monitor/anomaly_detect.py +0 -410
  252. msprobe/pytorch/monitor/module_spec_verifier.py +0 -95
  253. msprobe/pytorch/monitor/unittest/test_monitor.py +0 -160
  254. msprobe/pytorch/service.py +0 -470
  255. {mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/LICENSE +0 -0
  256. {mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/WHEEL +0 -0
  257. {mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/entry_points.txt +0 -0
  258. {mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/top_level.txt +0 -0
  259. /msprobe/{mindspore → core}/compare/ms_to_pt_api.yaml +0 -0
  260. /msprobe/{mindspore/dump → core}/kernel_dump/kernel_config.py +0 -0
  261. /msprobe/{pytorch/monitor/unittest → core/monitor}/__init__.py +0 -0
msprobe/pytorch/monitor/module_hook.py +336 -241

@@ -22,26 +22,29 @@ from functools import partial
  import pytz
  import torch
  import torch.distributed as dist
+ import pandas as pd
  from torch.utils.hooks import BackwardHook

  from msprobe.core.common.const import MonitorConst, Const
  from msprobe.core.common.file_utils import load_json, save_json
+ from msprobe.core.common.decorator import recursion_depth_decorator
+ from msprobe.core.monitor.anomaly_processor import AnomalyScanner, AnomalyDataFactory, AnomalyDataWriter
+ from msprobe.core.common.file_utils import write_df_to_csv
+ from msprobe.core.common.utils import analyze_api_call_stack
  from msprobe.pytorch.common.log import logger
- from msprobe.pytorch.common.utils import is_recomputation
- from msprobe.pytorch.monitor.anomaly_analyse import AnomalyDataWriter
- from msprobe.pytorch.monitor.anomaly_detect import AnomalyScanner, SummaryWriterWithAD, AnomalyDataFactory, \
- CSVWriterWithAD, BaseWriterWithAD, WriterInput
+ from msprobe.pytorch.common.utils import is_recomputation, is_float8_tensor
+ from msprobe.pytorch.monitor.data_writers import SummaryWriterWithAD, CSVWriterWithAD, BaseWriterWithAD, WriterInput
  from msprobe.pytorch.monitor.distributed.wrap_distributed import api_register, create_hooks, op_aggregate, \
  get_process_group
  from msprobe.pytorch.monitor.features import get_sign_matches
  from msprobe.pytorch.monitor.module_metric import get_metrics, get_summary_writer_tag_name, \
  TensorMetrics, squash_param_name
- from msprobe.pytorch.monitor.module_spec_verifier import validate_config_spec
  from msprobe.pytorch.monitor.optimizer_collect import OptimizerMonFactory
  from msprobe.pytorch.monitor.utils import get_param_struct, validate_config, validate_ops, \
- get_output_base_dir, get_target_output_dir
+ get_output_base_dir, get_target_output_dir, chmod_tensorboard_dir, validate_set_monitor
  from msprobe.pytorch.monitor.visualizer import HeatmapVisualizer

+
  torch_version_above_or_equal_2 = torch.__version__.split('+')[0] >= '2.0'
  if not torch_version_above_or_equal_2:
  raise ValueError("monitor require torch>=2.0")
@@ -71,36 +74,7 @@ class ModuleHookContext:
  self.actvgrad = []
  self.module_name = module_name
  self.struct = {}
- self.format_by_arg = {}
- self.verified = False
- self.focused_in_col = 0
- self.focused_out_col = 0
-
- def set_format_by_arg(self, key_name: str, target_config: dict):
- """ Configure format_by_arg according to the monitored targets:
- 1) module_name is configured as a monitored target in targets
- 2) module_name is not configured in targets, and all_xy enables full monitoring
- 3) module_name is not configured in targets, and all_xy does not enable full monitoring
-
- :param key_name: str, one of [input, output, input_grad, output_grad]
- :param target_config: target obj in config json.
- :return:
- """
- cared = target_config.get(self.module_name, self.struct)
- if key_name in cared:
- target_module_config = cared[key_name]
- if isinstance(target_module_config, dict):
- # current cared is self.struct, monitor all data for module_name
- self.format_by_arg[key_name] = target_module_config.get('config')
- elif isinstance(target_module_config, str):
- # current cared is target_config[self.module_name]
- self.format_by_arg[key_name] = target_module_config
- else:
- logger.warning_on_rank_0(f"target module config error, result maybe empty."
- f"module_name: {self.module_name}, key_name: {key_name}")
- self.format_by_arg[key_name] = None
- else:
- self.format_by_arg[key_name] = self.struct.get(key_name).get('config')
+ self.stack = ""

  def reset(self):
  self.actv.clear()
@@ -176,15 +150,16 @@ class GradContext:
  class TrainerMon:
  tensor_metrics = TensorMetrics()

- def __init__(self, config_file_path, process_group=None, params_have_main_grad=True) -> None:
+ # Keep the original opt_ty parameter for compatibility with versions earlier than msprobe 1.2.2
+ def __init__(self, config_file_path, process_group=None, params_have_main_grad=True, opt_ty=None) -> None:
  # TYPE1: variables initialized only here; they are not reset when the config changes during training
  self.config_file_path = config_file_path
  self.process_group = get_process_group(process_group)
  self.params_have_main_grad = params_have_main_grad
  self.update_heatmap_visualizer = defaultdict(HeatmapVisualizer)
  self.ratio_heatmap_visualizer = defaultdict(HeatmapVisualizer)
- self.origin_step_func = None
  self.origin_start_grad_sync = None
+ self.fsdp_post_backward_hook = None
  self.config_timestamp = 0 # the timestamp is validated later; the first monitoring run need not touch the config file just to refresh it, monitoring can be enabled directly via the dynamic_on switch
  self.config = load_json(config_file_path)
  validate_config(self.config)
@@ -219,9 +194,10 @@ class TrainerMon:
  self.dp_group = None
  self.tp_group = None
  self.enable_megatron = False
+ self.fsdp_wrapped_module = False
  self.micro_batch_number = 1
- self.optimizer_class = None
  self.optimizer_mon = None
+ self.optimizer_trans = None

  # TYPE3: variables that are reset when the config is updated during training or when the monitoring state changes
  self.module_fwd_hook_context_by_module = defaultdict(ModuleHookContext)
@@ -231,7 +207,6 @@ class TrainerMon:
  self.grad_context = GradContext()
  self.handles = defaultdict(list)
  self.param2name = defaultdict(str)
- self.name2index = defaultdict()
  self.name2indices = defaultdict()
  self.name2param = {}
  self.duplicate_param = {}
@@ -244,6 +219,8 @@ class TrainerMon:
  self.optimizer_hooked = False
  self.param_registered = False
  self.struct_printed = False
+ self.pre_step_hooks = []
+ self.post_step_hooks = []

  # distinguish dynamic vs. static monitoring
  self.dynamic_enable = os.getenv("DYNAMIC_MONITOR", 'False').lower() == 'true'
@@ -314,6 +291,8 @@ class TrainerMon:
  self.param_distribution = self.config.get("param_distribution", False)
  self.mg_direction = self.config.get('mg_direction', False)
  self.cc_distribution = self.config.get("cc_distribution", {})
+ self.stack_info = self.config.get('stack_info', False)
+ self.monitor_mbs_grad = self.config.get('monitor_mbs_grad', False)

  if not self.cc_distribution.get('enable', False):
  self.cc_log_only = False
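The two new switches read in the hunk above (`stack_info` and `monitor_mbs_grad`) are plain booleans in the monitor config JSON passed to `TrainerMon`. A hedged sketch of such a config written from Python follows; only keys that appear in this diff are shown, and the authoritative schema is in msprobe/docs/19.monitor.md.

```python
# Hedged sketch of a monitor config for TrainerMon(config_file_path).
# Key names below are the ones read in this diff; "targets"/"format" values are
# illustrative placeholders, not a definitive schema (see docs/19.monitor.md).
import json

monitor_config = {
    "targets": {"module.layers.0": {}},   # example module name to monitor
    "format": "csv",                      # unsupported formats fall back to csv
    "param_distribution": True,
    "mg_direction": False,
    "cc_distribution": {"enable": False},
    "stack_info": True,                   # new: dump a stack_info.csv of module call stacks
    "monitor_mbs_grad": True,             # new: collect per-micro-batch gradients
}

with open("monitor_config.json", "w", encoding="utf-8") as f:
    json.dump(monitor_config, f, indent=2)
```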
@@ -322,8 +301,6 @@ class TrainerMon:
  self.cc_log_only = self.cc_distribution.get('cc_log_only', False)
  self.cc_logged_stack = defaultdict(set)
  self.cc_pre_hook = self.cc_distribution.get('cc_pre_hook', False)
- self.handles['cc'] = api_register.initialize_hook(*create_hooks(context=self.cc_context, monitor=self))
- api_register.redirect_api()

  self.common_info()

@@ -336,11 +313,11 @@ class TrainerMon:

  # initialize the writer and create the output directory
  if self.format not in FORMAT_MAPPING:
- logger.error(f"Unsupported format: {self.format}, use default format: {MonitorConst.CSV}")
+ logger.warning(f"Unsupported format: {self.format}, use default format: {MonitorConst.CSV}")
  self.format = MonitorConst.CSV

  if self.ur_distribution and self.format != 'tensorboard':
- logger.error("can only set ur_distribution when format is 'tensorboard', cancel ur_distribution")
+ logger.warning("can only set ur_distribution when format is 'tensorboard', cancel ur_distribution")
  self.ur_distribution = False

  writer = FORMAT_MAPPING[self.format]
@@ -363,19 +340,6 @@ class TrainerMon:
  self.rank)
  self.anomaly_data_writer.init_detected_json()

- def adhoc_check(self, target_tensor: torch.tensor, module_name: str, tensor_name: str, rank_list, ops_list):
- rank = None
- if dist.is_initialized():
- rank = dist.get_rank()
- if (rank not in rank_list) and len(rank_list) != 0:
- return
- self.tensor_metrics.stat_insert(target_tensor, ops_list, module_name, tensor_name, rank)
-
- def build_tbtag_tensor_map(self, module_name, tag, tensor):
- key = get_summary_writer_tag_name(module_name, tag, self.rank)
- self._register_param_call_id("_hook_module", key)
- return {key: tensor}
-
  def common_info(self):
  if not self.xy_distribution:
  logger.info_on_rank_0("> module input/output input_grad/output_grad is not monitored. ")
@@ -392,101 +356,38 @@ class TrainerMon:
  if not self.cc_distribution.get('enable', False):
  logger.info_on_rank_0("> cc operator is not monitored.")

- def hook_modules(self):
- if self.module_rank_list and (self.rank not in self.module_rank_list):
- return
-
- targets = self.config['targets']
- module_in_all_stage = [key for key in targets.keys() if MonitorConst.NAME_SEP not in key]
- for key in module_in_all_stage:
- struct = targets.pop(key)
- targets.update({f'{vpp_stage}{MonitorConst.NAME_SEP}{key}': struct for vpp_stage in range(len(self.model))})
-
- hooked_count = 0
- for vpp_stage, model_chunk in enumerate(self.model):
- vpp_stage = f'{vpp_stage}{MonitorConst.NAME_SEP}'
- targets = [x for x, _ in model_chunk.named_modules()] if self.print_struct else self.config[
- 'targets'].keys()
- hooked_count += self._hook_module(targets, model_chunk, vpp_stage)
-
- logger.info_on_rank_0(f"> {hooked_count} modules are monitored.")
-
- def clone_if_tensor(args):
- if isinstance(args, tuple):
- return tuple([clone_if_tensor(arg) for arg in args])
- elif isinstance(args, torch.Tensor):
- return args.clone()
- else:
- return args
-
- @torch.no_grad
- def wrap_hook_setup(setup):
- def wrapped_setup(*args, **kwargs):
- args = setup(*args, **kwargs)
- args = clone_if_tensor(args)
- return args
-
- return wrapped_setup
-
- BackwardHook.setup_output_hook = wrap_hook_setup(BackwardHook.setup_output_hook)
-
- return
-
- def generate_param_metrics(self, opt_context):
- if not self.param_distribution:
- return
- get_metrics(self.ops, self.name2param, self.eps, opt_context.param_metric)
-
- def generate_mv_metrics(self, opt_context):
- if not self.mv_distribution:
- return
- opt_context.exp_avg_metric = {}
- opt_context.exp_avg_sq_metric = {}
- m_tag_tensor_map = self.generate_param_map(MonitorConst.EXP_AVG, opt_context.param_exp_avg)
- v_tag_tensor_map = self.generate_param_map(MonitorConst.EXP_AVG_SQ, opt_context.param_exp_avg_sq)
- get_metrics(self.ops, m_tag_tensor_map, self.eps, opt_context.exp_avg_metric)
- get_metrics(self.ops, v_tag_tensor_map, self.eps, opt_context.exp_avg_sq_metric)
-
- def generate_wgrad_metrics(self):
- if not self.wg_distribution:
- return {}, {}
-
- if self.weight_hooked:
- get_metrics(self.ops, self.grad_context.acc, self.eps, self.grad_context.acc_metric)
-
- grad_dict = {}
- for param, name in self.param2name.items():
- if self.duplicate_param.get(name, False):
- continue
- grad = param.main_grad if self.params_have_main_grad else param.grad
- if grad is None:
- logger.warning(f"grad is None: {name}, maybe something wrong happened.")
- continue
- tag = self.name2tag.get(name, {}).get(MonitorConst.POST_GRAD)
- self._register_param_call_id("hook_optimizer", tag)
- grad_dict[tag] = grad
+ # Keep the original interface for compatibility with versions earlier than msprobe 1.2.2
+ def monitor_gnorm_with_ad(self, model, optimizer=None, grad_acc_steps=1, tp_group=None, dp_group=None,
+ start_iteration=0):
+ if optimizer is None:
+ optimizer = getattr(self, "optimizer_trans", None) # older versions may pass None; fall back to the optimizer stored by set_wrapped_optimizer
+ if optimizer is None:
+ logger.error("monitor_gnorm_with_ad: please set_wrapped_optimizer before it or input optimizer!=None")
+ return
+ self.set_monitor(model, optimizer, grad_acc_steps, tp_group, dp_group, start_iteration)

- get_metrics(self.ops, grad_dict, self.eps, self.grad_context.post)
- unreduced_grad = self.grad_context.acc_metric if self.weight_hooked else self.grad_context.pre
- return self.grad_context.post, unreduced_grad
+ # Keep the original interface for compatibility with versions earlier than msprobe 1.2.2
+ def set_wrapped_optimizer(self, optimizer):
+ self.optimizer_trans = optimizer

  def set_monitor(
  self,
  model,
+ optimizer,
  grad_acc_steps=1,
- optimizer=None,
  tp_group=None,
  dp_group=None,
  start_iteration=0
  ):
  """External interface"""
+ grad_acc_steps, start_iteration = validate_set_monitor(grad_acc_steps, start_iteration)
  global start_step
  start_step = start_iteration
  logger.info(f'grad acc steps {grad_acc_steps}')
  self.micro_batch_number = grad_acc_steps
  self.dp_group = dp_group
  self.tp_group = tp_group
- self.optimizer_mon, self.optimizer_class = OptimizerMonFactory.create_optimizer_mon(optimizer)
+ self.optimizer_mon = OptimizerMonFactory.create_optimizer_mon(optimizer)
  self.hook_step_final(optimizer)
  if not isinstance(model, list):
  model = [model]
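As the hunk above shows, `optimizer` is now a required positional argument of `set_monitor`, while `monitor_gnorm_with_ad` and `set_wrapped_optimizer` remain only as compatibility shims for pre-1.2.2 call sites. A minimal usage sketch under the new signature follows; the config path, model and optimizer are placeholders, not msprobe sample code.

```python
# Hedged sketch of wiring TrainerMon into a training loop with the new
# set_monitor(model, optimizer, ...) signature shown in this diff.
import torch
from msprobe.pytorch.monitor.module_hook import TrainerMon

model = torch.nn.Linear(8, 8)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

# "monitor_config.json" is a placeholder monitor config; plain torch optimizers
# have no main_grad, hence params_have_main_grad=False here.
monitor = TrainerMon("monitor_config.json", params_have_main_grad=False)
monitor.set_monitor(model, optimizer, grad_acc_steps=1, start_iteration=0)

# Legacy call sites can still use the pre-1.2.2 sequence:
#   monitor.set_wrapped_optimizer(optimizer)
#   monitor.monitor_gnorm_with_ad(model, grad_acc_steps=1)

for _ in range(3):
    model(torch.randn(4, 8)).sum().backward()
    optimizer.step()      # the pre/post step hooks patched in by TrainerMon run here
    optimizer.zero_grad()
```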
@@ -502,18 +403,89 @@ class TrainerMon:
  self.hook_optimizer(optimizer)
  self._patch_grad_sync()
  self.hook_modules()
+ if self.cc_distribution.get('enable', False):
+ self.handles['cc'] = api_register.initialize_hook(*create_hooks(context=self.cc_context, monitor=self))
+ api_register.redirect_api()
  self.monitoring = True

+ def adhoc_check(self, target_tensor: torch.tensor, module_name: str, tensor_name: str, rank_list, ops_list):
+ rank = None
+ if dist.is_initialized():
+ rank = dist.get_rank()
+ if (rank not in rank_list) and len(rank_list) != 0:
+ return
+ self.tensor_metrics.stat_insert(target_tensor, ops_list, module_name, tensor_name, rank)
+
+ def build_tbtag_tensor_map(self, module_name, suffix, tag, tensor):
+ """
+ :param module_name: str of module name
+ :param suffix:
+ :param tag:
+ :param tensor: torch.tensor or tuple/list of torch.tensor
+ :return: tensor_map
+ """
+ tensor_map = {}
+ if isinstance(tensor, torch.Tensor):
+ tensor = [tensor]
+ if isinstance(tensor, tuple) or isinstance(tensor, list):
+ if len(tensor) == 1:
+ key = get_summary_writer_tag_name(module_name + suffix, tag, self.rank)
+ self.register_param_call_id("_hook_module", key)
+ tensor_map[key] = tensor[0]
+ else:
+ for i, tensor_i in enumerate(tensor):
+ key = get_summary_writer_tag_name(module_name + f"_{i}" + suffix, tag, self.rank)
+ self.register_param_call_id("_hook_module", key)
+ tensor_map[key] = tensor_i
+ return tensor_map
+
  def generate_param_map(self, tag, param_tensor):
  metrics = {}
  for name in self.param2name.values():
  key = get_summary_writer_tag_name(name, tag, self.rank)
- self._register_param_call_id("optimizer_pre_step_hook", key)
+ self.register_param_call_id("optimizer_pre_step_hook", key)
  if name not in param_tensor or param_tensor[name] is None:
  continue
  metrics[key] = param_tensor[name]
  return metrics

+ def generate_param_metrics(self, opt_context, stage=MonitorConst.PRE_PARAM):
+ if not self.param_distribution:
+ return
+ tag2param = {
+ self.name2tag.get(name, {}).get(stage): param
+ for name, param in self.name2param.items()
+ if param.numel() != 0
+ }
+ get_metrics(self.ops, tag2param, self.eps, opt_context.param_metric)
+
+ def generate_mv_metrics(self, opt_context):
+ if not self.mv_distribution:
+ return
+ opt_context.exp_avg_metric = {}
+ opt_context.exp_avg_sq_metric = {}
+ m_tag_tensor_map = self.generate_param_map(MonitorConst.EXP_AVG, opt_context.param_exp_avg)
+ v_tag_tensor_map = self.generate_param_map(MonitorConst.EXP_AVG_SQ, opt_context.param_exp_avg_sq)
+ get_metrics(self.ops, m_tag_tensor_map, self.eps, opt_context.exp_avg_metric)
+ get_metrics(self.ops, v_tag_tensor_map, self.eps, opt_context.exp_avg_sq_metric)
+
+ def generate_wgrad_metrics(self, post_grad_dict):
+ if not self.wg_distribution:
+ return {}, {}
+
+ if self.weight_hooked:
+ get_metrics(self.ops, self.grad_context.acc, self.eps, self.grad_context.acc_metric)
+
+ get_metrics(self.ops, post_grad_dict, self.eps, self.grad_context.post)
+ reduced_grad = self.grad_context.post
+
+ if self.weight_hooked:
+ unreduced_grad = self.grad_context.acc_metric
+ else:
+ unreduced_grad = self.grad_context.pre
+
+ return reduced_grad, unreduced_grad
+
  def generate_xy_metrics(self):
  actv = {}
  for fwd_context in self.module_fwd_hook_context_by_module.values():
@@ -538,6 +510,17 @@ class TrainerMon:
  def write_adhoc_check(self, step):
  self.tensor_metrics.flush(self.summary_writer)

+ def write_stack_info(self):
+ stack_data = []
+ header = ["module_name", "stack_info"]
+ stack_data.append(header)
+ for _, fwd_context in self.module_fwd_hook_context_by_module.items():
+ stack_data.append([fwd_context.module_name, fwd_context.stack])
+ filepath = os.path.join(self.tensorboard_dir, f'stack_info.csv')
+ if not os.path.exists(filepath):
+ data_frame = pd.DataFrame(columns=stack_data)
+ write_df_to_csv(data_frame, filepath)
+
  def write_xy_tb(self, step):
  if not self.xy_distribution:
  return
@@ -552,27 +535,31 @@ class TrainerMon:
  def write_param_tb(self, opt_context):
  if not self.param_distribution:
  return
- self.summary_writer.write_metrics(self.ops, opt_context.param_metric, opt_context.step, MonitorConst.PARAM)
+ param_metrics = {k: v for k, v in opt_context.param_metric.items() if MonitorConst.PRE_PARAM in k}
+ updated_param_metrics = {k: v for k, v in opt_context.param_metric.items() if MonitorConst.POST_PARAM in k}
+ self.summary_writer.write_metrics(self.ops, param_metrics, opt_context.step, MonitorConst.PRE_PARAM)
+ self.summary_writer.write_metrics(self.ops, updated_param_metrics, opt_context.step, MonitorConst.POST_PARAM)

  def write_mv_tb(self, opt_context):
  if not self.mv_distribution:
  return
- self.summary_writer.write_metrics(self.ops, opt_context.exp_avg_metric,
+ self.summary_writer.write_metrics(self.ops, opt_context.exp_avg_metric,
  opt_context.step, MonitorConst.EXP_AVG)
- self.summary_writer.write_metrics(self.ops, opt_context.exp_avg_sq_metric,
+ self.summary_writer.write_metrics(self.ops, opt_context.exp_avg_sq_metric,
  opt_context.step, MonitorConst.EXP_AVG_SQ)

  def write_grad_tb(self, step):
  if not self.wg_distribution:
  return

- if self.enable_megatron:
- self.summary_writer.write_metrics(self.ops, self.grad_context.pre, step, 'grad_unreduced')
+ if self.weight_hooked:
+ self.summary_writer.write_metrics(self.ops, self.grad_context.acc_metric, step, 'grad_unreduced',
+ use_micro_step=self.monitor_mbs_grad)
  else:
- self.summary_writer.write_metrics(self.ops, self.grad_context.acc_metric, step, 'grad_unreduced')
+ self.summary_writer.write_metrics(self.ops, self.grad_context.pre, step, 'grad_unreduced')
  self.summary_writer.write_metrics(self.ops, self.grad_context.post, step, 'grad_reduced')

- def hook_optimizer(self, optimizer=None):
+ def hook_optimizer(self, optimizer):
  # in DDP by default use params_have_main_grad
  def optimizer_pre_step_hook(optimizer, args, kwargs):
  context = self.optimizer_context[optimizer]
@@ -591,21 +578,23 @@ class TrainerMon:
  # skip generate metrics
  if context.step < self.start_step or (context.step - self.start_step) % self.step_interval != 0:
  return
- if MonitorConst.DEEPSPEED_ZERO_OPT_FILTER in self.optimizer_class: # use deepspeed with zero1/2/3
- if not self.name2indices:
- self.name2indices = self.optimizer_mon.get_param_index(self.param2name, self.name2index, optimizer)
- mv_result = self.optimizer_mon.fetch_mv(self, optimizer, self.param2name, self.name2indices)
- self.param2name = mv_result.grad
- else:
- mv_result = self.optimizer_mon.fetch_mv(self, optimizer, self.param2name)
- context.param_exp_avg = mv_result.exp_avg
- context.param_exp_avg_sq = mv_result.exp_avg_sq
- context.param_adam_update = mv_result.update
- context.param_adam_ratio = mv_result.ratio

- self.generate_wgrad_metrics()
+ grad_dict = {}
+ if self.wg_distribution:
+ grad_dict = self.optimizer_mon.fetch_grad(self, self.param2name)
+
+ mv_result = None
+ if self.mv_distribution or self.ur_distribution or self.mg_direction:
+ mv_result = self.optimizer_mon.fetch_mv(self, self.param2name)
+ if mv_result:
+ context.param_exp_avg = mv_result.exp_avg
+ context.param_exp_avg_sq = mv_result.exp_avg_sq
+ context.param_adam_update = mv_result.update
+ context.param_adam_ratio = mv_result.ratio
+
+ self.generate_wgrad_metrics(grad_dict)
  self.generate_mv_metrics(context)
- self.generate_param_metrics(context)
+ self.generate_param_metrics(context, MonitorConst.PRE_PARAM)

  tbtag_tensor_map = {}
  if self.mg_direction:
@@ -633,18 +622,15 @@ class TrainerMon:
  context.metric_dict = metric_dict
  return

- def patch_step(func, optimizer):
- def wrapper(*args, **kwargs):
- optimizer_pre_step_hook(optimizer, args, kwargs)
- out = func(*args, **kwargs)
- return out
-
- return wrapper
+ def optimizer_post_step_hook(optimizer, args, kwargs):
+ context = self.optimizer_context[optimizer]
+ self.generate_param_metrics(context, MonitorConst.POST_PARAM)

  if self.optimizer_hooked:
  return

- optimizer.__class__.step = patch_step(optimizer.__class__.step, optimizer)
+ self.pre_step_hooks.append(optimizer_pre_step_hook)
+ self.post_step_hooks.append(optimizer_post_step_hook)

  self.optimizer_hooked = True
  return
@@ -674,6 +660,7 @@ class TrainerMon:
  validate_config(config)
  self.config = config
  self.set_config()
+ self.start_step = context.step # when dynamically starting/stopping, ignore the original start_step and always start from the next step
  logger.warning(f"config is updated at step{context.step - 1}, "
  f"will start new hook at step{context.step}.")
  except Exception as e:
@@ -703,6 +690,12 @@ class TrainerMon:
  self.write_mv_tb(context)
  self.write_param_tb(context)
  self.write_adhoc_check(context.step)
+ if self.stack_info:
+ self.write_stack_info()
+ self.stack_info = False
+ for handle in self.handles["stack"]:
+ handle.remove()
+ self.handles["stack"].clear()

  if self.ur_distribution:
  for param_name, _ in context.param_adam_update.items():
@@ -721,6 +714,9 @@ class TrainerMon:
  if self.anomaly_data_factory:
  self.anomaly_data_writer.write_detected_json(self.summary_writer.get_anomalies())
  self.summary_writer.clear_anomalies()
+
+ if self.format == MonitorConst.TENSORBOARD:
+ chmod_tensorboard_dir(self.tensorboard_dir)
  self.call_id = 0
  self.param_name_call_id.clear()

@@ -732,16 +728,69 @@ class TrainerMon:

  def patch_step(func, optimizer):
  def wrapper(*args, **kwargs):
+ for hook in self.pre_step_hooks:
+ hook(optimizer, args, kwargs)
  out = func(*args, **kwargs)
+ for hook in self.post_step_hooks:
+ hook(optimizer, args, kwargs)
  step_final_hook(optimizer, args, kwargs)
  return out
  return wrapper

  optimizer.__class__.step = patch_step(optimizer.__class__.step, optimizer)
- self.origin_step_func = optimizer.__class__.step
+ return
+
+ def hook_modules(self):
+ if self.module_rank_list and (self.rank not in self.module_rank_list):
+ return

+ targets = self.config['targets']
+ module_in_all_stage = [key for key in targets.keys() if MonitorConst.NAME_SEP not in key]
+ for key in module_in_all_stage:
+ struct = targets.pop(key)
+ targets.update({f'{vpp_stage}{MonitorConst.NAME_SEP}{key}': struct for vpp_stage in range(len(self.model))})
+
+ hooked_count = 0
+ for vpp_stage, model_chunk in enumerate(self.model):
+ vpp_stage = f'{vpp_stage}{MonitorConst.NAME_SEP}'
+ targets = [x for x, _ in model_chunk.named_modules()] if self.print_struct else self.config[
+ 'targets'].keys()
+ hooked_count += self._hook_module(targets, model_chunk, vpp_stage)
+
+ logger.info_on_rank_0(f"> {hooked_count} modules are monitored.")
+
+ @recursion_depth_decorator('msprobe.pytorch.monitor.clone_if_tensor')
+ def clone_if_tensor(args):
+ if isinstance(args, tuple):
+ return tuple([clone_if_tensor(arg) for arg in args])
+ elif isinstance(args, torch.Tensor) and not is_float8_tensor(args):
+ return args.clone()
+ else:
+ return args
+
+ @torch.no_grad
+ def wrap_hook_setup(setup):
+ def wrapped_setup(*args, **kwargs):
+ args = setup(*args, **kwargs)
+ args = clone_if_tensor(args)
+ return args
+
+ return wrapped_setup
+
+ BackwardHook.setup_input_hook = wrap_hook_setup(BackwardHook.setup_input_hook)
+ BackwardHook.setup_output_hook = wrap_hook_setup(BackwardHook.setup_output_hook)
  return

+ def register_param_call_id(self, hook_name: str, key: str):
+ """
+ :param hook_name:
+ :param key: str, '0:relu_0/output_grad'
+ :return:
+ """
+ logger.debug(f"{hook_name} {key}: {self.call_id}")
+ self.param_name_call_id[key] = self.call_id
+ self.call_id += 1
+
  def _remove_all_hooks(self, optimizer):
  # clear hook handles
  for handle in self.handles['xy']:
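The replacement of the old per-optimizer `patch_step` with shared `pre_step_hooks`/`post_step_hooks` lists (see the hunk above) boils down to a wrapper around `Optimizer.step` on the class. The following standalone sketch, independent of msprobe, illustrates that pattern only.

```python
# Standalone sketch of the pre/post step hook pattern used by patch_step above.
# Not msprobe code; it only shows wrapping Optimizer.step at the class level.
import torch

pre_step_hooks, post_step_hooks = [], []

def patch_step(func):
    def wrapper(self, *args, **kwargs):
        for hook in pre_step_hooks:
            hook(self, args, kwargs)       # runs before the real step
        out = func(self, *args, **kwargs)
        for hook in post_step_hooks:
            hook(self, args, kwargs)       # runs after parameters are updated
        return out
    return wrapper

model = torch.nn.Linear(4, 4)
opt = torch.optim.SGD(model.parameters(), lr=0.1)
opt.__class__.step = patch_step(opt.__class__.step)

pre_step_hooks.append(lambda o, a, k: print("before step"))
post_step_hooks.append(lambda o, a, k: print("after step"))

model(torch.randn(2, 4)).sum().backward()
opt.step()   # prints "before step", then "after step"
```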
@@ -767,14 +816,18 @@ class TrainerMon:
  logger.info("remove _ParamAndGradBucketGroup start_grad_sync")
  except ImportError:
  pass
- else: # not megatron
+ elif self.fsdp_post_backward_hook: # fsdp
+ torch.distributed.fsdp._runtime_utils._post_backward_hook = self.fsdp_post_backward_hook
+ logger.info("remove patch_post_backward_hook in fsdp.")
+ else: # not megatron and not fsdp
  for handle in self.handles['wgrads']:
  handle.remove()
  self.handles['wgrads'].clear()
  self.weight_hooked = False

  if self.optimizer_hooked:
- optimizer.__class__.step = self.origin_step_func
+ self.pre_step_hooks.clear()
+ self.post_step_hooks.clear()

  for _, context in self.optimizer_context.items():
  context.reset()
@@ -783,12 +836,12 @@ class TrainerMon:
  for handle in self.handles['cc']:
  handle.remove()
  self.handles['cc'].clear()
+ api_register.restore_api()
  for _, context in self.cc_context.items():
  context.reset()

  # clear node caches
  self.param2name.clear()
- self.name2index.clear()
  self.name2indices.clear()
  self.name2param.clear()
  self.duplicate_param.clear()
@@ -848,27 +901,33 @@ class TrainerMon:
  return False

  def _register_chunk(self, model_chunk, prefix):
- index = 0
  for (param_name, param) in model_chunk.named_parameters():
  if not param.requires_grad:
  continue
+ if not self.fsdp_wrapped_module and param_name.startswith("_fsdp_wrapped_module"):
+ self.fsdp_wrapped_module = True
  if self._is_target_param(param_name, param, prefix):
  name = prefix + squash_param_name(param_name, self.squash_name)
  if name in self.param2name.values():
  name = prefix + param_name
  self.param2name[param] = name
  self.name2param[name] = param
- self.name2index[name] = index

  if self.tp_group and not param_is_not_tensor_parallel_duplicate(param, self.tp_group):
  self.duplicate_param[name] = True
  if self.dp_group and param_is_data_parallel_duplicate(self.dp_group):
  self.duplicate_param[name] = True
+
+ keywords = [
+ MonitorConst.PRE_GRAD,
+ MonitorConst.POST_GRAD,
+ MonitorConst.PRE_PARAM,
+ MonitorConst.POST_PARAM
+ ]
  self.name2tag[name] = {
- MonitorConst.PRE_GRAD: get_summary_writer_tag_name(name, MonitorConst.PRE_GRAD, self.rank),
- MonitorConst.POST_GRAD: get_summary_writer_tag_name(name, MonitorConst.POST_GRAD, self.rank)
+ k: get_summary_writer_tag_name(name, k, self.rank)
+ for k in keywords
  }
- index += 1

  def _register_param_name(self):
  for vpp_stage, model_chunk in enumerate(self.model):
@@ -891,11 +950,17 @@ class TrainerMon:
  # nothing to hook
  return 0

- def fwd_hook_fun(module, module_input, module_output, name):
+ def fwd_hook_fun(module, args, kwargs, module_output, name):
  if not module.training or is_recomputation():
  # 1 only monitor training stage.
  # 2 when open recompute, skip recomputed forward stage.
  return
+
+ module_input = [tensor for tensor in args if torch.is_tensor(tensor)]
+ if kwargs:
+ kwargs_tensors = [tensor for tensor in kwargs.values() if torch.is_tensor(tensor)]
+ module_input.extend(kwargs_tensors)
+
  if module not in self.module_fwd_hook_context_by_module:
  self.module_fwd_hook_context_by_module[module] = ModuleHookContext(name)
  context: ModuleHookContext = self.module_fwd_hook_context_by_module[module]
@@ -904,34 +969,20 @@ class TrainerMon:
  Const.INPUT: get_param_struct(module_input),
  Const.OUTPUT: get_param_struct(module_output)
  }
+
  if self.print_struct:
  self.module_struct[context.module_name].update(context.struct)
  return
- if not context.format_by_arg:
- context.set_format_by_arg(Const.INPUT, self.config['targets'])
- context.set_format_by_arg(Const.OUTPUT, self.config['targets'])
- if not context.format_by_arg:
- return
- if not context.verified:
- context.focused_in_col = validate_config_spec(context.format_by_arg[Const.INPUT],
- module_input, context.module_name,
- Const.INPUT)
- context.focused_out_col = validate_config_spec(context.format_by_arg[Const.OUTPUT],
- module_output, context.module_name,
- Const.OUTPUT)
- context.verified = True
- # expect output be tensor type
+
  tbtag_tensor_map = {}
- cared_input = module_input if context.focused_in_col is None else module_input[context.focused_in_col]
  tbtag_tensor_map.update(
  self.build_tbtag_tensor_map(
- f'{context.module_name}.{Const.INPUT}{MonitorConst.NAME_SEP}{context.micro_step}',
- MonitorConst.ACTV, cared_input))
- cared_output = module_output if context.focused_out_col is None else module_output[context.focused_out_col]
+ f'{context.module_name}.{Const.INPUT}', f'{MonitorConst.NAME_SEP}{context.micro_step}',
+ MonitorConst.ACTV, module_input))
  tbtag_tensor_map.update(
  self.build_tbtag_tensor_map(
- f'{context.module_name}.{Const.OUTPUT}{MonitorConst.NAME_SEP}{context.micro_step}',
- MonitorConst.ACTV, cared_output))
+ f'{context.module_name}.{Const.OUTPUT}', f'{MonitorConst.NAME_SEP}{context.micro_step}',
+ MonitorConst.ACTV, module_output))

  get_metrics(self.ops, tbtag_tensor_map, self.eps, context.actv)
  context.micro_step += 1
@@ -949,31 +1000,17 @@ class TrainerMon:
  if self.print_struct:
  self.module_struct[context.module_name].update(context.struct)
  return
- if not context.format_by_arg:
- context.set_format_by_arg(MonitorConst.INPUT_GRAD, self.config['targets'])
- context.set_format_by_arg(MonitorConst.OUTPUT_GRAD, self.config['targets'])
- if not context.format_by_arg:
- return
- if not context.verified:
- context.focused_in_col = validate_config_spec(
- context.format_by_arg[MonitorConst.INPUT_GRAD],
- input_grad, context.module_name, MonitorConst.INPUT_GRAD)
- context.focused_out_col = validate_config_spec(
- context.format_by_arg[MonitorConst.OUTPUT_GRAD],
- output_grad, context.module_name, MonitorConst.OUTPUT_GRAD)
- context.verified = True

  tbtag_tensor_map = {}
- cared_input_grad = input_grad if context.focused_in_col is None else input_grad[context.focused_in_col]
  tbtag_tensor_map.update(
  self.build_tbtag_tensor_map(
- f'{context.module_name}.{Const.INPUT}{MonitorConst.NAME_SEP}{context.micro_step}',
- MonitorConst.ACTV, cared_input_grad))
- cared_output_grad = output_grad if context.focused_out_col is None else output_grad[context.focused_out_col]
+ f'{context.module_name}.{Const.INPUT}', f'{MonitorConst.NAME_SEP}{context.micro_step}',
+ MonitorConst.ACTVGRAD, input_grad))
+
  tbtag_tensor_map.update(
  self.build_tbtag_tensor_map(
- f'{context.module_name}.{Const.OUTPUT}{MonitorConst.NAME_SEP}{context.micro_step}',
- MonitorConst.ACTV, cared_output_grad))
+ f'{context.module_name}.{Const.OUTPUT}', f'{MonitorConst.NAME_SEP}{context.micro_step}',
+ MonitorConst.ACTVGRAD, output_grad))

  if context.micro_step == 0 and context.actvgrad:
  logger.warning(f"actvgrad context of {context.module_name} is not empty when first micro_step, "
@@ -987,17 +1024,30 @@ class TrainerMon:
  context.micro_step = 0
  return

+ def stack_hook(module, args, kwargs, module_output, name):
+ if module not in self.module_fwd_hook_context_by_module:
+ self.module_fwd_hook_context_by_module[module] = ModuleHookContext(name)
+ context: ModuleHookContext = self.module_fwd_hook_context_by_module[module]
+ context.stack = analyze_api_call_stack(name)
+ return
+
  if self.backward_only and self.forward_only:
  logger.warning('not enable backward_only and forward_only simultaneously')

  hooked_count = 0
- if self.xy_distribution or self.print_struct:
- for module_name, submodule in module.named_modules():
- name = self._is_target_module(module_name, target_names, vpp_stage)
- if not name:
- continue
+ for module_name, submodule in module.named_modules():
+ if self.stack_info:
+ name = vpp_stage + squash_param_name(module_name, self.squash_name)
+ handle = submodule.register_forward_hook(partial(stack_hook, name=name), with_kwargs=True)
+ self.handles['stack'].append(handle)
+ name = self._is_target_module(module_name, target_names, vpp_stage)
+ if not name:
+ continue
+ if submodule.__class__.__name__ == "FullyShardedDataParallel":
+ continue
+ if self.xy_distribution or self.print_struct:
  if not self.backward_only:
- handle = submodule.register_forward_hook(partial(fwd_hook_fun, name=name))
+ handle = submodule.register_forward_hook(partial(fwd_hook_fun, name=name), with_kwargs=True)
  self.handles['xy'].append(handle)
  if not self.forward_only and not self.has_register_backward_hook(name, submodule):
  handle = submodule.register_full_backward_hook(bwd_hook_fun)
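Both `fwd_hook_fun` and the new `stack_hook` in the hunk above are registered with `with_kwargs=True`, a torch>=2.0 option under which the forward hook receives `(module, args, kwargs, output)` instead of `(module, input, output)`. A minimal standalone sketch (not msprobe code):

```python
# Standalone sketch of register_forward_hook(..., with_kwargs=True) as used above.
# Requires torch >= 2.0, which this module already enforces at import time.
import torch

def fwd_hook(module, args, kwargs, output, name="linear"):
    # Positional and keyword tensor inputs are both visible to the hook.
    tensors = [t for t in args if torch.is_tensor(t)]
    tensors += [t for t in kwargs.values() if torch.is_tensor(t)]
    print(name, len(tensors), tuple(output.shape))

layer = torch.nn.Linear(4, 2)
handle = layer.register_forward_hook(fwd_hook, with_kwargs=True)
layer(torch.randn(3, 4))   # prints: linear 1 (3, 2)
handle.remove()
```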
@@ -1026,7 +1076,7 @@ class TrainerMon:
  if tag is None:
  continue
  grad_dict[tag] = grad
- self._register_param_call_id("sync_grad_func", tag)
+ self.register_param_call_id("sync_grad_func", tag)
  get_metrics(self.ops, grad_dict, self.eps, self.grad_context.pre)
  out = sync_grad_func(bucket)
  return out
@@ -1035,7 +1085,14 @@ class TrainerMon:

  if not self.wg_distribution:
  return
+ if self.fsdp_wrapped_module:
+ # patch fsdp _runtime_utils._post_backward_hook
+ self._patch_fsdp_post_backward_hook()
+ return

+ if self.monitor_mbs_grad:
+ self._hook_weights()
+ return
  try:
  from megatron.core.distributed.param_and_grad_buffer import Bucket
  self.origin_start_grad_sync = Bucket.start_grad_sync
@@ -1052,44 +1109,82 @@ class TrainerMon:
  self.enable_megatron = True
  logger.info("megatron version is > core_r0.8.0 <= core_r0.9.0")
  except ImportError:
- self.enable_megatron = False
+ self.enable_megatron = False | self.enable_megatron
+ if self.enable_megatron:
+ return

- if not self.enable_megatron:
- self._hook_weights()
+ # default hook weights
+ self._hook_weights()
+
+ def _patch_fsdp_post_backward_hook(self):
+ """
+ The FSDP runtime drives the whole forward/backward compute-and-communication flow by overriding nn.Module.forward and defining the corresponding logic.
+ Registering a hook on the AccumulateGrad object lets it run right after the gradient is computed in backward, collecting the accumulated gradient before the reduce_scatter (i.e. before communication aggregation).
+ However, FSDP re-registers its hooks on AccumulateGrad in every forward stage, so hooks registered by the monitor tool do not take effect;
+ therefore _post_backward_hook is patched to collect gradients after backward and before reduce_scatter.
+ """
+ def patch_post_backward_hook(_post_backward_hook):
+ def wrapper(state, handle, *unused):
+ grad_dict = {}
+ offset = 0
+ for param, name in self.param2name.items():
+ limit = param.numel()
+ if not limit:
+ continue
+ grad = handle.flat_param.grad[offset:offset + limit]
+ offset += limit
+ tag = self.name2tag.get(name, {}).get(MonitorConst.PRE_GRAD)
+ if tag is None:
+ continue
+ grad_dict[tag] = grad
+ self.register_param_call_id("_post_backward_hook", tag)
+ get_metrics(self.ops, grad_dict, self.eps, self.grad_context.pre)
+ out = _post_backward_hook(state, handle, *unused)
+ return out
+
+ return wrapper
+
+ logger.info("Patch fsdp _post_backward_hook, collect pre_grad metrics.")
+ self.fsdp_post_backward_hook = torch.distributed.fsdp._runtime_utils._post_backward_hook
+ torch.distributed.fsdp._runtime_utils._post_backward_hook = \
+ patch_post_backward_hook(torch.distributed.fsdp._runtime_utils._post_backward_hook)

  def _hook_weights(self):
+ """
+ Walk each parameter's gradient-accumulation function (grad_acc) and attach a hook, so that once all gradients of the parameter have been computed, the gradient data is collected before communication aggregation.
+ """
  context = self.grad_context

  @torch.no_grad
- def param_hook(*args, context_dict, param, key, name):
+ def param_hook(*args, context_dict, param, name):
+ key = name
+ if self.monitor_mbs_grad:
+ key += f'{MonitorConst.NAME_SEP}{param.micro_step}'
+
+ key = get_summary_writer_tag_name(key, 'acc_grad', self.rank)
+ self.register_param_call_id("param_hook", key)
  param.micro_step += 1
- self._register_param_call_id("param_hook", key)
- if param.micro_step == self.micro_batch_number:
- param.micro_step = 0
+
+ if self.monitor_mbs_grad or (param.micro_step == self.micro_batch_number):
  if self.params_have_main_grad:
- context_dict[key] = param.main_grad.clone()
+ grad = param.main_grad
  else:
- context_dict[key] = param.grad.clone()
+ grad = param.grad
+ if is_float8_tensor(grad):
+ grad = grad.float()
+ context_dict[key] = grad.clone()
+
+ if param.micro_step == self.micro_batch_number:
+ param.micro_step = 0

  logger.info("hooking weights.")
  for param, name in self.param2name.items():
- key = get_summary_writer_tag_name(name, 'acc_grad', self.rank)
  setattr(param, 'micro_step', 0)
  param_tmp = param.expand_as(param)
  grad_acc = param_tmp.grad_fn.next_functions[0][0]
  handle = grad_acc.register_hook(
- partial(param_hook, context_dict=context.acc, param=param, key=key, name=name))
+ partial(param_hook, context_dict=context.acc, param=param, name=name))
  self.grad_accs.append(grad_acc)
  self.handles['wgrads'].append(handle)

  self.weight_hooked = True
-
- def _register_param_call_id(self, hook_name: str, key: str):
- """
- :param hook_name:
- :param key: str, '0:relu_0/output_grad'
- :return:
- """
- logger.debug(f"{hook_name} {key}: {self.call_id}")
- self.param_name_call_id[key] = self.call_id
- self.call_id += 1
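The (translated) docstring of `_hook_weights` above describes hooking each parameter's gradient-accumulation node so the unreduced gradient can be captured after backward. The standalone sketch below illustrates just that mechanism; it is not msprobe code and omits main_grad, micro-step and float8 handling.

```python
# Standalone sketch of hooking a parameter's AccumulateGrad node, the mechanism
# _hook_weights relies on (simplified: no main_grad / micro-step / float8 logic).
import torch
from functools import partial

captured = {}

def param_hook(*unused, param, name):
    # Fires once the gradient for `param` has been accumulated in this backward.
    captured[name] = param.grad.clone()

model = torch.nn.Linear(4, 2)
handles, grad_accs = [], []
for name, param in model.named_parameters():
    param_tmp = param.expand_as(param)                 # view whose graph leads to AccumulateGrad
    grad_acc = param_tmp.grad_fn.next_functions[0][0]  # the AccumulateGrad node
    handles.append(grad_acc.register_hook(partial(param_hook, param=param, name=name)))
    grad_accs.append(grad_acc)                         # keep the nodes alive, as the monitor does

model(torch.randn(3, 4)).sum().backward()
print(sorted(captured))   # ['bias', 'weight']
for h in handles:
    h.remove()
```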