mindstudio-probe 1.3.0__py3-none-any.whl → 8.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (213)
  1. {mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/METADATA +4 -2
  2. {mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/RECORD +204 -152
  3. msprobe/README.md +32 -1
  4. msprobe/core/__init__.py +17 -0
  5. msprobe/core/common/const.py +120 -21
  6. msprobe/core/common/exceptions.py +2 -2
  7. msprobe/core/common/file_utils.py +279 -50
  8. msprobe/core/common/framework_adapter.py +169 -0
  9. msprobe/core/common/global_lock.py +86 -0
  10. msprobe/core/common/runtime.py +25 -0
  11. msprobe/core/common/utils.py +136 -45
  12. msprobe/core/common_config.py +7 -0
  13. msprobe/core/compare/acc_compare.py +646 -428
  14. msprobe/core/compare/check.py +36 -103
  15. msprobe/core/compare/compare_cli.py +4 -0
  16. msprobe/core/compare/config.py +72 -0
  17. msprobe/core/compare/highlight.py +215 -215
  18. msprobe/core/compare/layer_mapping/layer_mapping.py +2 -0
  19. msprobe/core/compare/merge_result/merge_result.py +4 -4
  20. msprobe/core/compare/multiprocessing_compute.py +223 -110
  21. msprobe/core/compare/npy_compare.py +2 -4
  22. msprobe/core/compare/utils.py +214 -244
  23. msprobe/core/config_check/__init__.py +17 -0
  24. msprobe/{pytorch/dump/kernel_dump/kernel_config.py → core/config_check/checkers/__init__.py} +8 -16
  25. msprobe/core/config_check/checkers/base_checker.py +60 -0
  26. msprobe/core/config_check/checkers/dataset_checker.py +138 -0
  27. msprobe/core/config_check/checkers/env_args_checker.py +96 -0
  28. msprobe/core/config_check/checkers/hyperparameter_checker.py +170 -0
  29. msprobe/core/config_check/checkers/pip_checker.py +90 -0
  30. msprobe/core/config_check/checkers/random_checker.py +367 -0
  31. msprobe/core/config_check/checkers/weights_checker.py +147 -0
  32. msprobe/core/config_check/ckpt_compare/ckpt_comparator.py +74 -0
  33. msprobe/core/config_check/ckpt_compare/megatron_loader.py +302 -0
  34. msprobe/core/config_check/ckpt_compare/metrics.py +83 -0
  35. msprobe/core/config_check/ckpt_compare/name_mapping.yaml +12 -0
  36. msprobe/core/config_check/config_check_cli.py +51 -0
  37. msprobe/core/config_check/config_checker.py +100 -0
  38. msprobe/{mindspore/runtime.py → core/config_check/resource/dependency.yaml} +7 -4
  39. msprobe/core/config_check/resource/env.yaml +57 -0
  40. msprobe/core/config_check/resource/hyperparameter.yaml +21 -0
  41. msprobe/core/config_check/utils/hyperparameter_parser.py +115 -0
  42. msprobe/core/config_check/utils/utils.py +107 -0
  43. msprobe/core/data_dump/api_registry.py +67 -4
  44. msprobe/core/data_dump/data_collector.py +170 -89
  45. msprobe/core/data_dump/data_processor/base.py +72 -51
  46. msprobe/core/data_dump/data_processor/mindspore_processor.py +109 -55
  47. msprobe/core/data_dump/data_processor/pytorch_processor.py +90 -82
  48. msprobe/core/data_dump/json_writer.py +143 -27
  49. msprobe/core/debugger/precision_debugger.py +144 -0
  50. msprobe/core/grad_probe/constant.py +1 -1
  51. msprobe/core/grad_probe/grad_compare.py +1 -1
  52. msprobe/core/grad_probe/utils.py +1 -1
  53. msprobe/core/hook_manager.py +242 -0
  54. msprobe/core/monitor/anomaly_processor.py +384 -0
  55. msprobe/core/service.py +357 -0
  56. msprobe/core/single_save/__init__.py +0 -0
  57. msprobe/core/single_save/single_comparator.py +243 -0
  58. msprobe/core/single_save/single_saver.py +146 -0
  59. msprobe/docs/01.installation.md +6 -5
  60. msprobe/docs/02.config_introduction.md +79 -22
  61. msprobe/docs/03.config_examples.md +1 -0
  62. msprobe/docs/04.kernel_dump_PyTorch.md +1 -1
  63. msprobe/docs/05.data_dump_PyTorch.md +118 -49
  64. msprobe/docs/06.data_dump_MindSpore.md +167 -20
  65. msprobe/docs/07.accuracy_checker_PyTorch.md +2 -2
  66. msprobe/docs/08.accuracy_checker_online_PyTorch.md +69 -9
  67. msprobe/docs/09.accuracy_checker_MindSpore.md +18 -6
  68. msprobe/docs/10.accuracy_compare_PyTorch.md +212 -74
  69. msprobe/docs/11.accuracy_compare_MindSpore.md +87 -37
  70. msprobe/docs/12.overflow_check_PyTorch.md +2 -2
  71. msprobe/docs/13.overflow_check_MindSpore.md +2 -2
  72. msprobe/docs/14.data_parse_PyTorch.md +3 -3
  73. msprobe/docs/17.grad_probe.md +2 -1
  74. msprobe/docs/18.online_dispatch.md +2 -2
  75. msprobe/docs/19.monitor.md +90 -44
  76. msprobe/docs/21.visualization_PyTorch.md +68 -15
  77. msprobe/docs/22.visualization_MindSpore.md +71 -18
  78. msprobe/docs/25.tool_function_introduction.md +23 -22
  79. msprobe/docs/26.data_dump_PyTorch_baseline.md +14 -3
  80. msprobe/docs/27.dump_json_instruction.md +1 -1
  81. msprobe/docs/28.debugger_save_instruction.md +111 -20
  82. msprobe/docs/29.data_dump_MSAdapter.md +2 -2
  83. msprobe/docs/30.overflow_check_MSAdapter.md +2 -2
  84. msprobe/docs/31.config_check.md +95 -0
  85. msprobe/docs/32.ckpt_compare.md +69 -0
  86. msprobe/docs/33.generate_operator_MindSpore.md +181 -0
  87. msprobe/docs/34.RL_collect.md +92 -0
  88. msprobe/docs/35.nan_analyze.md +72 -0
  89. msprobe/docs/data_dump_MindSpore/data_dump_MindSpore_baseline.md +12 -1
  90. msprobe/docs/data_dump_MindSpore/dynamic_graph_quick_start_example.md +3 -1
  91. msprobe/docs/img/compare_result.png +0 -0
  92. msprobe/docs/img/save_compare_result_sample.png +0 -0
  93. msprobe/docs/img/visualization/proxy.png +0 -0
  94. msprobe/mindspore/__init__.py +1 -2
  95. msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +150 -58
  96. msprobe/mindspore/api_accuracy_checker/api_runner.py +7 -3
  97. msprobe/mindspore/api_accuracy_checker/bench_functions/flash_attention_score.py +47 -69
  98. msprobe/mindspore/api_accuracy_checker/cmd_parser.py +4 -0
  99. msprobe/mindspore/api_accuracy_checker/compute_element.py +0 -1
  100. msprobe/mindspore/api_accuracy_checker/data_manager.py +2 -2
  101. msprobe/mindspore/api_accuracy_checker/generate_op_script/op_generator.py +460 -0
  102. msprobe/mindspore/api_accuracy_checker/generate_op_script/operator_replication.template +2081 -0
  103. msprobe/mindspore/api_accuracy_checker/multi_api_accuracy_checker.py +9 -0
  104. msprobe/mindspore/api_accuracy_checker/torch_mindtorch_importer.py +2 -1
  105. msprobe/mindspore/cell_processor.py +204 -33
  106. msprobe/mindspore/code_mapping/graph_parser.py +4 -21
  107. msprobe/mindspore/common/const.py +17 -7
  108. msprobe/mindspore/common/utils.py +128 -11
  109. msprobe/mindspore/compare/common_dir_compare.py +382 -0
  110. msprobe/mindspore/compare/distributed_compare.py +2 -26
  111. msprobe/mindspore/compare/ms_compare.py +17 -405
  112. msprobe/mindspore/compare/ms_graph_compare.py +14 -5
  113. msprobe/mindspore/compare/utils.py +37 -0
  114. msprobe/mindspore/debugger/debugger_config.py +53 -3
  115. msprobe/mindspore/debugger/precision_debugger.py +72 -91
  116. msprobe/mindspore/dump/cell_dump_process.py +877 -0
  117. msprobe/mindspore/dump/cell_dump_with_insert_gradient.py +864 -0
  118. msprobe/mindspore/dump/dump_tool_factory.py +13 -5
  119. msprobe/mindspore/dump/graph_mode_cell_dump.py +139 -0
  120. msprobe/mindspore/dump/graph_tensor_dump.py +123 -0
  121. msprobe/mindspore/dump/hook_cell/api_register.py +40 -6
  122. msprobe/mindspore/dump/hook_cell/hook_cell.py +18 -7
  123. msprobe/mindspore/dump/hook_cell/ms_hook_manager.py +88 -0
  124. msprobe/mindspore/dump/hook_cell/primitive_hooks.py +8 -2
  125. msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml +18 -0
  126. msprobe/mindspore/dump/jit_dump.py +21 -18
  127. msprobe/mindspore/dump/kernel_kbyk_dump.py +6 -3
  128. msprobe/mindspore/dym_loader/hook_dynamic_loader.cpp +110 -0
  129. msprobe/mindspore/dym_loader/hook_dynamic_loader.h +15 -15
  130. msprobe/mindspore/free_benchmark/api_pynative_self_check.py +12 -6
  131. msprobe/mindspore/free_benchmark/common/utils.py +1 -1
  132. msprobe/mindspore/grad_probe/global_context.py +7 -2
  133. msprobe/mindspore/grad_probe/grad_stat_csv.py +3 -2
  134. msprobe/mindspore/mindspore_service.py +114 -0
  135. msprobe/mindspore/monitor/common_func.py +52 -0
  136. msprobe/mindspore/monitor/data_writers.py +237 -0
  137. msprobe/mindspore/monitor/features.py +20 -7
  138. msprobe/mindspore/monitor/module_hook.py +281 -209
  139. msprobe/mindspore/monitor/optimizer_collect.py +334 -0
  140. msprobe/mindspore/monitor/utils.py +25 -5
  141. msprobe/mindspore/ms_config.py +16 -15
  142. msprobe/mindspore/task_handler_factory.py +5 -2
  143. msprobe/msprobe.py +19 -0
  144. msprobe/nan_analyze/__init__.py +14 -0
  145. msprobe/nan_analyze/analyzer.py +255 -0
  146. msprobe/nan_analyze/graph.py +189 -0
  147. msprobe/nan_analyze/utils.py +211 -0
  148. msprobe/pytorch/api_accuracy_checker/common/config.py +2 -2
  149. msprobe/pytorch/api_accuracy_checker/compare/compare.py +36 -34
  150. msprobe/pytorch/api_accuracy_checker/compare/compare_utils.py +20 -20
  151. msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +4 -7
  152. msprobe/pytorch/api_accuracy_checker/generate_op_script/operator_replication.template +204 -2
  153. msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +12 -11
  154. msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +1 -0
  155. msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +8 -5
  156. msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py +2 -3
  157. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +29 -13
  158. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +12 -2
  159. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +45 -31
  160. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/utils.py +156 -0
  161. msprobe/pytorch/attl_manager.py +65 -0
  162. msprobe/pytorch/bench_functions/npu_fusion_attention.py +27 -0
  163. msprobe/pytorch/common/utils.py +26 -14
  164. msprobe/pytorch/compare/distributed_compare.py +4 -36
  165. msprobe/pytorch/compare/pt_compare.py +13 -84
  166. msprobe/pytorch/compare/utils.py +47 -0
  167. msprobe/pytorch/debugger/debugger_config.py +34 -17
  168. msprobe/pytorch/debugger/precision_debugger.py +66 -118
  169. msprobe/pytorch/dump/module_dump/hook_wrapper.py +93 -0
  170. msprobe/pytorch/dump/module_dump/module_dump.py +11 -58
  171. msprobe/pytorch/dump/module_dump/module_processer.py +143 -113
  172. msprobe/pytorch/grad_probe/grad_stat_csv.py +3 -2
  173. msprobe/pytorch/hook_module/api_register.py +29 -5
  174. msprobe/pytorch/hook_module/hook_module.py +9 -18
  175. msprobe/pytorch/hook_module/jit_script_wrapper.py +33 -0
  176. msprobe/pytorch/hook_module/pt_hook_manager.py +68 -0
  177. msprobe/pytorch/hook_module/support_wrap_ops.yaml +22 -1
  178. msprobe/pytorch/hook_module/utils.py +28 -2
  179. msprobe/pytorch/monitor/csv2tb.py +6 -2
  180. msprobe/pytorch/monitor/data_writers.py +259 -0
  181. msprobe/pytorch/monitor/module_hook.py +227 -158
  182. msprobe/pytorch/monitor/module_metric.py +14 -0
  183. msprobe/pytorch/monitor/optimizer_collect.py +242 -270
  184. msprobe/pytorch/monitor/utils.py +16 -3
  185. msprobe/pytorch/online_dispatch/dispatch.py +4 -2
  186. msprobe/pytorch/online_dispatch/dump_compare.py +5 -2
  187. msprobe/pytorch/parse_tool/lib/utils.py +3 -3
  188. msprobe/pytorch/pt_config.py +8 -7
  189. msprobe/pytorch/pytorch_service.py +73 -0
  190. msprobe/visualization/builder/graph_builder.py +33 -13
  191. msprobe/visualization/builder/msprobe_adapter.py +24 -11
  192. msprobe/visualization/compare/graph_comparator.py +53 -45
  193. msprobe/visualization/compare/mode_adapter.py +31 -1
  194. msprobe/visualization/graph/base_node.py +3 -3
  195. msprobe/visualization/graph/graph.py +2 -2
  196. msprobe/visualization/graph_service.py +250 -103
  197. msprobe/visualization/utils.py +27 -11
  198. msprobe/mindspore/dym_loader/hook_dynamic_loader.cc +0 -106
  199. msprobe/mindspore/monitor/anomaly_detect.py +0 -404
  200. msprobe/mindspore/monitor/module_spec_verifier.py +0 -94
  201. msprobe/mindspore/service.py +0 -549
  202. msprobe/pytorch/monitor/anomaly_analyse.py +0 -201
  203. msprobe/pytorch/monitor/anomaly_detect.py +0 -410
  204. msprobe/pytorch/monitor/module_spec_verifier.py +0 -95
  205. msprobe/pytorch/monitor/unittest/test_monitor.py +0 -160
  206. msprobe/pytorch/service.py +0 -473
  207. {mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/LICENSE +0 -0
  208. {mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/WHEEL +0 -0
  209. {mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/entry_points.txt +0 -0
  210. {mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/top_level.txt +0 -0
  211. /msprobe/{mindspore → core}/compare/ms_to_pt_api.yaml +0 -0
  212. /msprobe/{mindspore/dump → core}/kernel_dump/kernel_config.py +0 -0
  213. /msprobe/{pytorch/monitor/unittest → core/monitor}/__init__.py +0 -0
msprobe/pytorch/monitor/module_hook.py
@@ -22,27 +22,29 @@ from functools import partial
 import pytz
 import torch
 import torch.distributed as dist
+import pandas as pd
 from torch.utils.hooks import BackwardHook
 
 from msprobe.core.common.const import MonitorConst, Const
 from msprobe.core.common.file_utils import load_json, save_json
 from msprobe.core.common.decorator import recursion_depth_decorator
+from msprobe.core.monitor.anomaly_processor import AnomalyScanner, AnomalyDataFactory, AnomalyDataWriter
+from msprobe.core.common.file_utils import write_df_to_csv
+from msprobe.core.common.utils import analyze_api_call_stack
 from msprobe.pytorch.common.log import logger
 from msprobe.pytorch.common.utils import is_recomputation, is_float8_tensor
-from msprobe.pytorch.monitor.anomaly_analyse import AnomalyDataWriter
-from msprobe.pytorch.monitor.anomaly_detect import AnomalyScanner, SummaryWriterWithAD, AnomalyDataFactory, \
-    CSVWriterWithAD, BaseWriterWithAD, WriterInput
+from msprobe.pytorch.monitor.data_writers import SummaryWriterWithAD, CSVWriterWithAD, BaseWriterWithAD, WriterInput
 from msprobe.pytorch.monitor.distributed.wrap_distributed import api_register, create_hooks, op_aggregate, \
     get_process_group
 from msprobe.pytorch.monitor.features import get_sign_matches
 from msprobe.pytorch.monitor.module_metric import get_metrics, get_summary_writer_tag_name, \
     TensorMetrics, squash_param_name
-from msprobe.pytorch.monitor.module_spec_verifier import validate_config_spec
 from msprobe.pytorch.monitor.optimizer_collect import OptimizerMonFactory
 from msprobe.pytorch.monitor.utils import get_param_struct, validate_config, validate_ops, \
     get_output_base_dir, get_target_output_dir, chmod_tensorboard_dir, validate_set_monitor
 from msprobe.pytorch.monitor.visualizer import HeatmapVisualizer
 
+
 torch_version_above_or_equal_2 = torch.__version__.split('+')[0] >= '2.0'
 if not torch_version_above_or_equal_2:
     raise ValueError("monitor require torch>=2.0")
@@ -72,36 +74,7 @@ class ModuleHookContext:
         self.actvgrad = []
         self.module_name = module_name
         self.struct = {}
-        self.format_by_arg = {}
-        self.verified = False
-        self.focused_in_col = 0
-        self.focused_out_col = 0
-
-    def set_format_by_arg(self, key_name: str, target_config: dict):
-        """Configure format_by_arg according to the monitored targets:
-        1) module_name is configured as a monitored target in targets
-        2) module_name is not configured in targets, and all_xy full monitoring is enabled
-        3) module_name is not configured in targets, and all_xy full monitoring is not enabled
-
-        :param key_name: str, one of [input, output, input_grad, output_grad]
-        :param target_config: target obj in config json.
-        :return:
-        """
-        cared = target_config.get(self.module_name, self.struct)
-        if key_name in cared:
-            target_module_config = cared[key_name]
-            if isinstance(target_module_config, dict):
-                # current cared is self.struct, monitor all data for module_name
-                self.format_by_arg[key_name] = target_module_config.get('config')
-            elif isinstance(target_module_config, str):
-                # current cared is target_config[self.module_name]
-                self.format_by_arg[key_name] = target_module_config
-            else:
-                logger.warning_on_rank_0(f"target module config error, result maybe empty."
-                                         f"module_name: {self.module_name}, key_name: {key_name}")
-                self.format_by_arg[key_name] = None
-        else:
-            self.format_by_arg[key_name] = self.struct.get(key_name).get('config')
+        self.stack = ""
 
     def reset(self):
         self.actv.clear()
@@ -185,8 +158,8 @@ class TrainerMon:
         self.params_have_main_grad = params_have_main_grad
         self.update_heatmap_visualizer = defaultdict(HeatmapVisualizer)
         self.ratio_heatmap_visualizer = defaultdict(HeatmapVisualizer)
-        self.origin_step_func = None
         self.origin_start_grad_sync = None
+        self.fsdp_post_backward_hook = None
         self.config_timestamp = 0  # the timestamp is validated later; the first run need not touch the config file just to refresh it, monitoring can be enabled directly via the dynamic_on switch
         self.config = load_json(config_file_path)
         validate_config(self.config)
@@ -221,8 +194,8 @@ class TrainerMon:
         self.dp_group = None
         self.tp_group = None
         self.enable_megatron = False
+        self.fsdp_wrapped_module = False
         self.micro_batch_number = 1
-        self.optimizer_class = None
         self.optimizer_mon = None
         self.optimizer_trans = None
 
@@ -234,7 +207,6 @@
         self.grad_context = GradContext()
         self.handles = defaultdict(list)
         self.param2name = defaultdict(str)
-        self.name2index = defaultdict()
         self.name2indices = defaultdict()
         self.name2param = {}
         self.duplicate_param = {}
@@ -247,6 +219,8 @@
         self.optimizer_hooked = False
         self.param_registered = False
         self.struct_printed = False
+        self.pre_step_hooks = []
+        self.post_step_hooks = []
 
         # distinguish dynamic and static monitoring
         self.dynamic_enable = os.getenv("DYNAMIC_MONITOR", 'False').lower() == 'true'
@@ -317,6 +291,8 @@
         self.param_distribution = self.config.get("param_distribution", False)
         self.mg_direction = self.config.get('mg_direction', False)
         self.cc_distribution = self.config.get("cc_distribution", {})
+        self.stack_info = self.config.get('stack_info', False)
+        self.monitor_mbs_grad = self.config.get('monitor_mbs_grad', False)
 
         if not self.cc_distribution.get('enable', False):
             self.cc_log_only = False
@@ -411,7 +387,7 @@
         self.micro_batch_number = grad_acc_steps
         self.dp_group = dp_group
         self.tp_group = tp_group
-        self.optimizer_mon, self.optimizer_class = OptimizerMonFactory.create_optimizer_mon(optimizer)
+        self.optimizer_mon = OptimizerMonFactory.create_optimizer_mon(optimizer)
         self.hook_step_final(optimizer)
         if not isinstance(model, list):
             model = [model]
@@ -440,25 +416,48 @@
             return
         self.tensor_metrics.stat_insert(target_tensor, ops_list, module_name, tensor_name, rank)
 
-    def build_tbtag_tensor_map(self, module_name, tag, tensor):
-        key = get_summary_writer_tag_name(module_name, tag, self.rank)
-        self._register_param_call_id("_hook_module", key)
-        return {key: tensor}
+    def build_tbtag_tensor_map(self, module_name, suffix, tag, tensor):
+        """
+        :param module_name: str of module name
+        :param suffix:
+        :param tag:
+        :param tensor: torch.tensor or tuple/list of torch.tensor
+        :return: tensor_map
+        """
+        tensor_map = {}
+        if isinstance(tensor, torch.Tensor):
+            tensor = [tensor]
+        if isinstance(tensor, tuple) or isinstance(tensor, list):
+            if len(tensor) == 1:
+                key = get_summary_writer_tag_name(module_name + suffix, tag, self.rank)
+                self.register_param_call_id("_hook_module", key)
+                tensor_map[key] = tensor[0]
+            else:
+                for i, tensor_i in enumerate(tensor):
+                    key = get_summary_writer_tag_name(module_name + f"_{i}" + suffix, tag, self.rank)
+                    self.register_param_call_id("_hook_module", key)
+                    tensor_map[key] = tensor_i
+        return tensor_map
 
     def generate_param_map(self, tag, param_tensor):
         metrics = {}
         for name in self.param2name.values():
             key = get_summary_writer_tag_name(name, tag, self.rank)
-            self._register_param_call_id("optimizer_pre_step_hook", key)
+            self.register_param_call_id("optimizer_pre_step_hook", key)
             if name not in param_tensor or param_tensor[name] is None:
                 continue
            metrics[key] = param_tensor[name]
         return metrics
 
-    def generate_param_metrics(self, opt_context):
+    def generate_param_metrics(self, opt_context, stage=MonitorConst.PRE_PARAM):
         if not self.param_distribution:
             return
-        get_metrics(self.ops, self.name2param, self.eps, opt_context.param_metric)
+        tag2param = {
+            self.name2tag.get(name, {}).get(stage): param
+            for name, param in self.name2param.items()
+            if param.numel() != 0
+        }
+        get_metrics(self.ops, tag2param, self.eps, opt_context.param_metric)
 
     def generate_mv_metrics(self, opt_context):
         if not self.mv_distribution:
@@ -470,28 +469,22 @@
         get_metrics(self.ops, m_tag_tensor_map, self.eps, opt_context.exp_avg_metric)
         get_metrics(self.ops, v_tag_tensor_map, self.eps, opt_context.exp_avg_sq_metric)
 
-    def generate_wgrad_metrics(self):
+    def generate_wgrad_metrics(self, post_grad_dict):
         if not self.wg_distribution:
             return {}, {}
 
         if self.weight_hooked:
             get_metrics(self.ops, self.grad_context.acc, self.eps, self.grad_context.acc_metric)
 
-        grad_dict = {}
-        for param, name in self.param2name.items():
-            if self.duplicate_param.get(name, False):
-                continue
-            grad = param.main_grad if self.params_have_main_grad else param.grad
-            if grad is None:
-                logger.warning(f"grad is None: {name}, maybe something wrong happened.")
-                continue
-            tag = self.name2tag.get(name, {}).get(MonitorConst.POST_GRAD)
-            self._register_param_call_id("hook_optimizer", tag)
-            grad_dict[tag] = grad
+        get_metrics(self.ops, post_grad_dict, self.eps, self.grad_context.post)
+        reduced_grad = self.grad_context.post
+
+        if self.weight_hooked:
+            unreduced_grad = self.grad_context.acc_metric
+        else:
+            unreduced_grad = self.grad_context.pre
 
-        get_metrics(self.ops, grad_dict, self.eps, self.grad_context.post)
-        unreduced_grad = self.grad_context.acc_metric if self.weight_hooked else self.grad_context.pre
-        return self.grad_context.post, unreduced_grad
+        return reduced_grad, unreduced_grad
 
     def generate_xy_metrics(self):
         actv = {}
@@ -517,6 +510,17 @@
     def write_adhoc_check(self, step):
         self.tensor_metrics.flush(self.summary_writer)
 
+    def write_stack_info(self):
+        stack_data = []
+        header = ["module_name", "stack_info"]
+        stack_data.append(header)
+        for _, fwd_context in self.module_fwd_hook_context_by_module.items():
+            stack_data.append([fwd_context.module_name, fwd_context.stack])
+        filepath = os.path.join(self.tensorboard_dir, f'stack_info.csv')
+        if not os.path.exists(filepath):
+            data_frame = pd.DataFrame(columns=stack_data)
+            write_df_to_csv(data_frame, filepath)
+
     def write_xy_tb(self, step):
         if not self.xy_distribution:
             return
@@ -531,7 +535,10 @@
     def write_param_tb(self, opt_context):
         if not self.param_distribution:
             return
-        self.summary_writer.write_metrics(self.ops, opt_context.param_metric, opt_context.step, MonitorConst.PARAM)
+        param_metrics = {k: v for k, v in opt_context.param_metric.items() if MonitorConst.PRE_PARAM in k}
+        updated_param_metrics = {k: v for k, v in opt_context.param_metric.items() if MonitorConst.POST_PARAM in k}
+        self.summary_writer.write_metrics(self.ops, param_metrics, opt_context.step, MonitorConst.PRE_PARAM)
+        self.summary_writer.write_metrics(self.ops, updated_param_metrics, opt_context.step, MonitorConst.POST_PARAM)
 
     def write_mv_tb(self, opt_context):
         if not self.mv_distribution:
@@ -545,10 +552,11 @@
         if not self.wg_distribution:
             return
 
-        if self.enable_megatron:
-            self.summary_writer.write_metrics(self.ops, self.grad_context.pre, step, 'grad_unreduced')
+        if self.weight_hooked:
+            self.summary_writer.write_metrics(self.ops, self.grad_context.acc_metric, step, 'grad_unreduced',
+                                              use_micro_step=self.monitor_mbs_grad)
         else:
-            self.summary_writer.write_metrics(self.ops, self.grad_context.acc_metric, step, 'grad_unreduced')
+            self.summary_writer.write_metrics(self.ops, self.grad_context.pre, step, 'grad_unreduced')
         self.summary_writer.write_metrics(self.ops, self.grad_context.post, step, 'grad_reduced')
 
     def hook_optimizer(self, optimizer):
@@ -570,21 +578,23 @@
             # skip generate metrics
             if context.step < self.start_step or (context.step - self.start_step) % self.step_interval != 0:
                 return
-            if MonitorConst.DEEPSPEED_ZERO_OPT_FILTER in self.optimizer_class:  # use deepspeed with zero1/2/3
-                if not self.name2indices:
-                    self.name2indices = self.optimizer_mon.get_param_index(self.param2name, self.name2index, optimizer)
-                mv_result = self.optimizer_mon.fetch_mv(self, optimizer, self.param2name, self.name2indices)
-                self.param2name = mv_result.grad
-            else:
-                mv_result = self.optimizer_mon.fetch_mv(self, optimizer, self.param2name)
-            context.param_exp_avg = mv_result.exp_avg
-            context.param_exp_avg_sq = mv_result.exp_avg_sq
-            context.param_adam_update = mv_result.update
-            context.param_adam_ratio = mv_result.ratio
 
-            self.generate_wgrad_metrics()
+            grad_dict = {}
+            if self.wg_distribution:
+                grad_dict = self.optimizer_mon.fetch_grad(self, self.param2name)
+
+            mv_result = None
+            if self.mv_distribution or self.ur_distribution or self.mg_direction:
+                mv_result = self.optimizer_mon.fetch_mv(self, self.param2name)
+            if mv_result:
+                context.param_exp_avg = mv_result.exp_avg
+                context.param_exp_avg_sq = mv_result.exp_avg_sq
+                context.param_adam_update = mv_result.update
+                context.param_adam_ratio = mv_result.ratio
+
+            self.generate_wgrad_metrics(grad_dict)
             self.generate_mv_metrics(context)
-            self.generate_param_metrics(context)
+            self.generate_param_metrics(context, MonitorConst.PRE_PARAM)
 
             tbtag_tensor_map = {}
             if self.mg_direction:
@@ -612,17 +622,15 @@
             context.metric_dict = metric_dict
             return
 
-        def patch_step(func, optimizer):
-            def wrapper(*args, **kwargs):
-                optimizer_pre_step_hook(optimizer, args, kwargs)
-                out = func(*args, **kwargs)
-                return out
-            return wrapper
+        def optimizer_post_step_hook(optimizer, args, kwargs):
+            context = self.optimizer_context[optimizer]
+            self.generate_param_metrics(context, MonitorConst.POST_PARAM)
 
         if self.optimizer_hooked:
             return
 
-        optimizer.__class__.step = patch_step(optimizer.__class__.step, optimizer)
+        self.pre_step_hooks.append(optimizer_pre_step_hook)
+        self.post_step_hooks.append(optimizer_post_step_hook)
 
         self.optimizer_hooked = True
         return
@@ -682,6 +690,12 @@
             self.write_mv_tb(context)
             self.write_param_tb(context)
             self.write_adhoc_check(context.step)
+            if self.stack_info:
+                self.write_stack_info()
+                self.stack_info = False
+                for handle in self.handles["stack"]:
+                    handle.remove()
+                self.handles["stack"].clear()
 
             if self.ur_distribution:
                 for param_name, _ in context.param_adam_update.items():
@@ -714,13 +728,16 @@
 
         def patch_step(func, optimizer):
             def wrapper(*args, **kwargs):
+                for hook in self.pre_step_hooks:
+                    hook(optimizer, args, kwargs)
                 out = func(*args, **kwargs)
+                for hook in self.post_step_hooks:
+                    hook(optimizer, args, kwargs)
                 step_final_hook(optimizer, args, kwargs)
                 return out
             return wrapper
 
         optimizer.__class__.step = patch_step(optimizer.__class__.step, optimizer)
-        self.origin_step_func = optimizer.__class__.step
         return
 
     def hook_modules(self):
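Note on the hunks above: the per-call monkey patching of optimizer.step is replaced by two hook lists (pre_step_hooks, post_step_hooks) that a single patch_step wrapper dispatches around the original step. A minimal, self-contained sketch of that dispatch pattern (standalone example, not msprobe code; the hook functions are made up):

import torch

# Sketch only: dispatch registered pre/post hooks around optimizer.step()
# by wrapping the class method once (this patches every instance of the class).
pre_step_hooks, post_step_hooks = [], []

def patch_step(func):
    def wrapper(optimizer, *args, **kwargs):
        for hook in pre_step_hooks:
            hook(optimizer, args, kwargs)   # runs before the real step()
        out = func(optimizer, *args, **kwargs)
        for hook in post_step_hooks:
            hook(optimizer, args, kwargs)   # runs after parameters were updated
        return out
    return wrapper

param = torch.nn.Parameter(torch.randn(2))
opt = torch.optim.SGD([param], lr=0.1)
opt.__class__.step = patch_step(opt.__class__.step)
pre_step_hooks.append(lambda o, a, k: print("pre-step hook"))
post_step_hooks.append(lambda o, a, k: print("post-step hook"))
param.grad = torch.ones(2)
opt.step()   # prints "pre-step hook", then "post-step hook"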
@@ -764,6 +781,16 @@
         BackwardHook.setup_output_hook = wrap_hook_setup(BackwardHook.setup_output_hook)
         return
 
+    def register_param_call_id(self, hook_name: str, key: str):
+        """
+        :param hook_name:
+        :param key: str, '0:relu_0/output_grad'
+        :return:
+        """
+        logger.debug(f"{hook_name} {key}: {self.call_id}")
+        self.param_name_call_id[key] = self.call_id
+        self.call_id += 1
+
     def _remove_all_hooks(self, optimizer):
         # clear hook handles
         for handle in self.handles['xy']:
@@ -789,14 +816,18 @@
                 logger.info("remove _ParamAndGradBucketGroup start_grad_sync")
             except ImportError:
                 pass
-        else:  # not megatron
+        elif self.fsdp_post_backward_hook:  # fsdp
+            torch.distributed.fsdp._runtime_utils._post_backward_hook = self.fsdp_post_backward_hook
+            logger.info("remove patch_post_backward_hook in fsdp.")
+        else:  # not megatron and not fsdp
             for handle in self.handles['wgrads']:
                 handle.remove()
             self.handles['wgrads'].clear()
             self.weight_hooked = False
 
         if self.optimizer_hooked:
-            optimizer.__class__.step = self.origin_step_func
+            self.pre_step_hooks.clear()
+            self.post_step_hooks.clear()
 
         for _, context in self.optimizer_context.items():
             context.reset()
@@ -811,7 +842,6 @@
 
         # clear node caches
         self.param2name.clear()
-        self.name2index.clear()
         self.name2indices.clear()
         self.name2param.clear()
         self.duplicate_param.clear()
@@ -871,27 +901,33 @@
         return False
 
     def _register_chunk(self, model_chunk, prefix):
-        index = 0
         for (param_name, param) in model_chunk.named_parameters():
             if not param.requires_grad:
                 continue
+            if not self.fsdp_wrapped_module and param_name.startswith("_fsdp_wrapped_module"):
+                self.fsdp_wrapped_module = True
             if self._is_target_param(param_name, param, prefix):
                 name = prefix + squash_param_name(param_name, self.squash_name)
                 if name in self.param2name.values():
                     name = prefix + param_name
                 self.param2name[param] = name
                 self.name2param[name] = param
-                self.name2index[name] = index
 
                 if self.tp_group and not param_is_not_tensor_parallel_duplicate(param, self.tp_group):
                     self.duplicate_param[name] = True
                 if self.dp_group and param_is_data_parallel_duplicate(self.dp_group):
                     self.duplicate_param[name] = True
+
+                keywords = [
+                    MonitorConst.PRE_GRAD,
+                    MonitorConst.POST_GRAD,
+                    MonitorConst.PRE_PARAM,
+                    MonitorConst.POST_PARAM
+                ]
                 self.name2tag[name] = {
-                    MonitorConst.PRE_GRAD: get_summary_writer_tag_name(name, MonitorConst.PRE_GRAD, self.rank),
-                    MonitorConst.POST_GRAD: get_summary_writer_tag_name(name, MonitorConst.POST_GRAD, self.rank)
+                    k: get_summary_writer_tag_name(name, k, self.rank)
+                    for k in keywords
                 }
-            index += 1
 
     def _register_param_name(self):
         for vpp_stage, model_chunk in enumerate(self.model):
@@ -914,11 +950,17 @@
             # nothing to hook
             return 0
 
-        def fwd_hook_fun(module, module_input, module_output, name):
+        def fwd_hook_fun(module, args, kwargs, module_output, name):
             if not module.training or is_recomputation():
                 # 1 only monitor training stage.
                 # 2 when open recompute, skip recomputed forward stage.
                 return
+
+            module_input = [tensor for tensor in args if torch.is_tensor(tensor)]
+            if kwargs:
+                kwargs_tensors = [tensor for tensor in kwargs.values() if torch.is_tensor(tensor)]
+                module_input.extend(kwargs_tensors)
+
             if module not in self.module_fwd_hook_context_by_module:
                 self.module_fwd_hook_context_by_module[module] = ModuleHookContext(name)
             context: ModuleHookContext = self.module_fwd_hook_context_by_module[module]
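Note on the hunk above: the forward hook now takes the (module, args, kwargs, output) signature, which PyTorch passes when a hook is registered with register_forward_hook(..., with_kwargs=True) (available since torch 2.0, which this module already requires); that is what lets tensors passed as keyword arguments be monitored too. A small standalone illustration of the API (the example module and names are made up):

import torch
import torch.nn as nn

# Sketch only: with with_kwargs=True the hook also receives keyword arguments,
# so tensors passed as kwargs can be collected alongside positional ones.
def fwd_hook(module, args, kwargs, output):
    tensors = [t for t in args if torch.is_tensor(t)]
    tensors += [t for t in kwargs.values() if torch.is_tensor(t)]
    print(f"{module.__class__.__name__}: saw {len(tensors)} input tensor(s)")

class Scale(nn.Module):
    def forward(self, x, scale=None):
        return x * scale if scale is not None else x

m = Scale()
m.register_forward_hook(fwd_hook, with_kwargs=True)
m(torch.ones(2), scale=torch.tensor(3.0))   # hook reports 2 input tensors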
@@ -927,34 +969,20 @@
                 Const.INPUT: get_param_struct(module_input),
                 Const.OUTPUT: get_param_struct(module_output)
             }
+
             if self.print_struct:
                 self.module_struct[context.module_name].update(context.struct)
                 return
-            if not context.format_by_arg:
-                context.set_format_by_arg(Const.INPUT, self.config['targets'])
-                context.set_format_by_arg(Const.OUTPUT, self.config['targets'])
-            if not context.format_by_arg:
-                return
-            if not context.verified:
-                context.focused_in_col = validate_config_spec(context.format_by_arg[Const.INPUT],
-                                                              module_input, context.module_name,
-                                                              Const.INPUT)
-                context.focused_out_col = validate_config_spec(context.format_by_arg[Const.OUTPUT],
-                                                               module_output, context.module_name,
-                                                               Const.OUTPUT)
-                context.verified = True
-            # expect output be tensor type
+
             tbtag_tensor_map = {}
-            cared_input = module_input if context.focused_in_col is None else module_input[context.focused_in_col]
             tbtag_tensor_map.update(
                 self.build_tbtag_tensor_map(
-                    f'{context.module_name}.{Const.INPUT}{MonitorConst.NAME_SEP}{context.micro_step}',
-                    MonitorConst.ACTV, cared_input))
-            cared_output = module_output if context.focused_out_col is None else module_output[context.focused_out_col]
+                    f'{context.module_name}.{Const.INPUT}', f'{MonitorConst.NAME_SEP}{context.micro_step}',
+                    MonitorConst.ACTV, module_input))
             tbtag_tensor_map.update(
                 self.build_tbtag_tensor_map(
-                    f'{context.module_name}.{Const.OUTPUT}{MonitorConst.NAME_SEP}{context.micro_step}',
-                    MonitorConst.ACTV, cared_output))
+                    f'{context.module_name}.{Const.OUTPUT}', f'{MonitorConst.NAME_SEP}{context.micro_step}',
+                    MonitorConst.ACTV, module_output))
 
             get_metrics(self.ops, tbtag_tensor_map, self.eps, context.actv)
             context.micro_step += 1
@@ -972,31 +1000,17 @@
             if self.print_struct:
                 self.module_struct[context.module_name].update(context.struct)
                 return
-            if not context.format_by_arg:
-                context.set_format_by_arg(MonitorConst.INPUT_GRAD, self.config['targets'])
-                context.set_format_by_arg(MonitorConst.OUTPUT_GRAD, self.config['targets'])
-            if not context.format_by_arg:
-                return
-            if not context.verified:
-                context.focused_in_col = validate_config_spec(
-                    context.format_by_arg[MonitorConst.INPUT_GRAD],
-                    input_grad, context.module_name, MonitorConst.INPUT_GRAD)
-                context.focused_out_col = validate_config_spec(
-                    context.format_by_arg[MonitorConst.OUTPUT_GRAD],
-                    output_grad, context.module_name, MonitorConst.OUTPUT_GRAD)
-                context.verified = True
 
             tbtag_tensor_map = {}
-            cared_input_grad = input_grad if context.focused_in_col is None else input_grad[context.focused_in_col]
             tbtag_tensor_map.update(
                 self.build_tbtag_tensor_map(
-                    f'{context.module_name}.{Const.INPUT}{MonitorConst.NAME_SEP}{context.micro_step}',
-                    MonitorConst.ACTV, cared_input_grad))
-            cared_output_grad = output_grad if context.focused_out_col is None else output_grad[context.focused_out_col]
+                    f'{context.module_name}.{Const.INPUT}', f'{MonitorConst.NAME_SEP}{context.micro_step}',
+                    MonitorConst.ACTVGRAD, input_grad))
+
             tbtag_tensor_map.update(
                 self.build_tbtag_tensor_map(
-                    f'{context.module_name}.{Const.OUTPUT}{MonitorConst.NAME_SEP}{context.micro_step}',
-                    MonitorConst.ACTV, cared_output_grad))
+                    f'{context.module_name}.{Const.OUTPUT}', f'{MonitorConst.NAME_SEP}{context.micro_step}',
+                    MonitorConst.ACTVGRAD, output_grad))
 
             if context.micro_step == 0 and context.actvgrad:
                 logger.warning(f"actvgrad context of {context.module_name} is not empty when first micro_step, "
@@ -1010,17 +1024,30 @@
                 context.micro_step = 0
             return
 
+        def stack_hook(module, args, kwargs, module_output, name):
+            if module not in self.module_fwd_hook_context_by_module:
+                self.module_fwd_hook_context_by_module[module] = ModuleHookContext(name)
+            context: ModuleHookContext = self.module_fwd_hook_context_by_module[module]
+            context.stack = analyze_api_call_stack(name)
+            return
+
         if self.backward_only and self.forward_only:
             logger.warning('not enable backward_only and forward_only simultaneously')
 
         hooked_count = 0
-        if self.xy_distribution or self.print_struct:
-            for module_name, submodule in module.named_modules():
-                name = self._is_target_module(module_name, target_names, vpp_stage)
-                if not name:
-                    continue
+        for module_name, submodule in module.named_modules():
+            if self.stack_info:
+                name = vpp_stage + squash_param_name(module_name, self.squash_name)
+                handle = submodule.register_forward_hook(partial(stack_hook, name=name), with_kwargs=True)
+                self.handles['stack'].append(handle)
+            name = self._is_target_module(module_name, target_names, vpp_stage)
+            if not name:
+                continue
+            if submodule.__class__.__name__ == "FullyShardedDataParallel":
+                continue
+            if self.xy_distribution or self.print_struct:
                 if not self.backward_only:
-                    handle = submodule.register_forward_hook(partial(fwd_hook_fun, name=name))
+                    handle = submodule.register_forward_hook(partial(fwd_hook_fun, name=name), with_kwargs=True)
                     self.handles['xy'].append(handle)
                 if not self.forward_only and not self.has_register_backward_hook(name, submodule):
                     handle = submodule.register_full_backward_hook(bwd_hook_fun)
@@ -1049,7 +1076,7 @@
                     if tag is None:
                         continue
                     grad_dict[tag] = grad
-                    self._register_param_call_id("sync_grad_func", tag)
+                    self.register_param_call_id("sync_grad_func", tag)
                 get_metrics(self.ops, grad_dict, self.eps, self.grad_context.pre)
                 out = sync_grad_func(bucket)
                 return out
@@ -1058,7 +1085,14 @@
 
         if not self.wg_distribution:
             return
+        if self.fsdp_wrapped_module:
+            # patch fsdp _runtime_utils._post_backward_hook
+            self._patch_fsdp_post_backward_hook()
+            return
 
+        if self.monitor_mbs_grad:
+            self._hook_weights()
+            return
         try:
             from megatron.core.distributed.param_and_grad_buffer import Bucket
             self.origin_start_grad_sync = Bucket.start_grad_sync
@@ -1076,19 +1110,62 @@
             logger.info("megatron version is > core_r0.8.0 <= core_r0.9.0")
         except ImportError:
             self.enable_megatron = False | self.enable_megatron
+        if self.enable_megatron:
+            return
 
-        if not self.enable_megatron:
-            self._hook_weights()
+        # default hook weights
+        self._hook_weights()
+
+    def _patch_fsdp_post_backward_hook(self):
+        """
+        The FSDP runtime drives the whole forward/backward compute and communication flow by overriding nn.Module's forward.
+        A hook registered on the AccumulateGrad object runs right after the grad is computed in backward, so the accumulated gradient can be collected before the reduce_scatter communication.
+        However, FSDP re-registers its hooks on AccumulateGrad in every forward pass, so hooks registered by the monitor tool never take effect.
+        Therefore _post_backward_hook is patched to collect gradients after backward and before reduce_scatter.
+        """
+        def patch_post_backward_hook(_post_backward_hook):
+            def wrapper(state, handle, *unused):
+                grad_dict = {}
+                offset = 0
+                for param, name in self.param2name.items():
+                    limit = param.numel()
+                    if not limit:
+                        continue
+                    grad = handle.flat_param.grad[offset:offset + limit]
+                    offset += limit
+                    tag = self.name2tag.get(name, {}).get(MonitorConst.PRE_GRAD)
+                    if tag is None:
+                        continue
+                    grad_dict[tag] = grad
+                    self.register_param_call_id("_post_backward_hook", tag)
+                get_metrics(self.ops, grad_dict, self.eps, self.grad_context.pre)
+                out = _post_backward_hook(state, handle, *unused)
+                return out
+
+            return wrapper
+
+        logger.info("Patch fsdp _post_backward_hook, collect pre_grad metrics.")
+        self.fsdp_post_backward_hook = torch.distributed.fsdp._runtime_utils._post_backward_hook
+        torch.distributed.fsdp._runtime_utils._post_backward_hook = \
+            patch_post_backward_hook(torch.distributed.fsdp._runtime_utils._post_backward_hook)
 
     def _hook_weights(self):
+        """
+        Walk each parameter's gradient-accumulation function (grad_acc) and register a hook so that, once all of the parameter's gradients have been computed, the pre-aggregation gradient can be collected.
+        """
         context = self.grad_context
 
         @torch.no_grad
-        def param_hook(*args, context_dict, param, key, name):
+        def param_hook(*args, context_dict, param, name):
+            key = name
+            if self.monitor_mbs_grad:
+                key += f'{MonitorConst.NAME_SEP}{param.micro_step}'
+
+            key = get_summary_writer_tag_name(key, 'acc_grad', self.rank)
+            self.register_param_call_id("param_hook", key)
             param.micro_step += 1
-            self._register_param_call_id("param_hook", key)
-            if param.micro_step == self.micro_batch_number:
-                param.micro_step = 0
+
+            if self.monitor_mbs_grad or (param.micro_step == self.micro_batch_number):
                 if self.params_have_main_grad:
                     grad = param.main_grad
                 else:
@@ -1097,25 +1174,17 @@
                     grad = grad.float()
                 context_dict[key] = grad.clone()
 
+            if param.micro_step == self.micro_batch_number:
+                param.micro_step = 0
+
         logger.info("hooking weights.")
         for param, name in self.param2name.items():
-            key = get_summary_writer_tag_name(name, 'acc_grad', self.rank)
             setattr(param, 'micro_step', 0)
             param_tmp = param.expand_as(param)
            grad_acc = param_tmp.grad_fn.next_functions[0][0]
             handle = grad_acc.register_hook(
-                partial(param_hook, context_dict=context.acc, param=param, key=key, name=name))
+                partial(param_hook, context_dict=context.acc, param=param, name=name))
             self.grad_accs.append(grad_acc)
             self.handles['wgrads'].append(handle)
 
         self.weight_hooked = True
-
-    def _register_param_call_id(self, hook_name: str, key: str):
-        """
-        :param hook_name:
-        :param key: str, '0:relu_0/output_grad'
-        :return:
-        """
-        logger.debug(f"{hook_name} {key}: {self.call_id}")
-        self.param_name_call_id[key] = self.call_id
-        self.call_id += 1
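Note on the final two hunks: _hook_weights relies on a standard autograd trick, param.expand_as(param).grad_fn.next_functions[0][0] yields the parameter's AccumulateGrad node, and a hook registered on that node fires once the parameter's gradient for the current backward pass has been accumulated, i.e. before any later gradient synchronization. A minimal standalone sketch of the pattern (not msprobe code; the tensor and callback names are made up):

import torch

# Sketch only: hook a parameter's AccumulateGrad node so a callback runs
# each time its gradient has been accumulated during backward.
w = torch.nn.Parameter(torch.randn(3))

# expand_as builds a tiny graph whose next node is w's AccumulateGrad
grad_acc = w.expand_as(w).grad_fn.next_functions[0][0]

def on_grad_accumulated(*unused):
    # w.grad is already populated here, before any framework-level
    # gradient reduction that would run later in the step
    if w.grad is not None:
        print("accumulated grad norm:", w.grad.norm().item())

handle = grad_acc.register_hook(on_grad_accumulated)
(w * 2).sum().backward()   # triggers on_grad_accumulated once
handle.remove()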