mindstudio-probe 1.2.1__py3-none-any.whl → 1.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mindstudio_probe-1.2.1.dist-info → mindstudio_probe-1.2.2.dist-info}/METADATA +1 -1
- {mindstudio_probe-1.2.1.dist-info → mindstudio_probe-1.2.2.dist-info}/RECORD +85 -66
- msprobe/README.md +2 -2
- msprobe/core/common/const.py +34 -9
- msprobe/core/common/inplace_ops.yaml +1 -0
- msprobe/core/common/utils.py +14 -0
- msprobe/core/compare/layer_mapping/data_scope_parser.py +1 -1
- msprobe/core/compare/merge_result/merge_result.py +8 -7
- msprobe/core/compare/merge_result/utils.py +81 -0
- msprobe/core/compare/utils.py +10 -0
- msprobe/core/data_dump/data_collector.py +58 -13
- msprobe/core/data_dump/data_processor/base.py +92 -8
- msprobe/core/data_dump/data_processor/factory.py +3 -0
- msprobe/core/data_dump/data_processor/mindspore_processor.py +17 -4
- msprobe/core/data_dump/data_processor/pytorch_processor.py +58 -7
- msprobe/core/data_dump/json_writer.py +26 -8
- msprobe/docs/01.installation.md +25 -0
- msprobe/docs/02.config_introduction.md +14 -12
- msprobe/docs/03.config_examples.md +24 -0
- msprobe/docs/05.data_dump_PyTorch.md +34 -15
- msprobe/docs/06.data_dump_MindSpore.md +45 -22
- msprobe/docs/09.accuracy_checker_MindSpore.md +4 -2
- msprobe/docs/19.monitor.md +257 -260
- msprobe/docs/21.visualization_PyTorch.md +10 -0
- msprobe/docs/22.visualization_MindSpore.md +11 -0
- msprobe/docs/27.dump_json_instruction.md +24 -20
- msprobe/docs/28.debugger_save_instruction.md +94 -0
- msprobe/docs/28.kernel_dump_MindSpore.md +69 -0
- msprobe/docs/img/monitor/step_count_per_record.png +0 -0
- msprobe/mindspore/__init__.py +1 -0
- msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +26 -6
- msprobe/mindspore/api_accuracy_checker/api_runner.py +54 -16
- msprobe/mindspore/api_accuracy_checker/compute_element.py +47 -1
- msprobe/mindspore/api_accuracy_checker/torch_mindtorch_importer.py +129 -0
- msprobe/mindspore/api_accuracy_checker/type_mapping.py +24 -1
- msprobe/mindspore/api_accuracy_checker/utils.py +6 -1
- msprobe/mindspore/common/utils.py +20 -2
- msprobe/mindspore/debugger/debugger_config.py +25 -2
- msprobe/mindspore/debugger/precision_debugger.py +25 -6
- msprobe/mindspore/dump/hook_cell/api_registry.py +2 -0
- msprobe/mindspore/dump/jit_dump.py +7 -6
- msprobe/mindspore/monitor/anomaly_detect.py +404 -0
- msprobe/mindspore/monitor/distributed/__init__.py +0 -0
- msprobe/mindspore/monitor/distributed/distributed_ops.yaml +15 -0
- msprobe/mindspore/monitor/distributed/stack_blacklist.yaml +5 -0
- msprobe/mindspore/monitor/distributed/wrap_distributed.py +300 -0
- msprobe/mindspore/monitor/features.py +63 -0
- msprobe/mindspore/monitor/module_hook.py +821 -0
- msprobe/mindspore/monitor/module_spec_verifier.py +94 -0
- msprobe/mindspore/monitor/utils.py +267 -0
- msprobe/mindspore/ms_config.py +8 -2
- msprobe/mindspore/service.py +95 -21
- msprobe/pytorch/__init__.py +0 -1
- msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +1 -1
- msprobe/pytorch/bench_functions/apply_adam.py +215 -0
- msprobe/pytorch/bench_functions/group_norm_silu.py +27 -0
- msprobe/pytorch/bench_functions/mish.py +21 -0
- msprobe/pytorch/bench_functions/moe_gating_top_k_softmax.py +44 -0
- msprobe/pytorch/bench_functions/sort_v2.py +21 -0
- msprobe/pytorch/common/utils.py +71 -0
- msprobe/pytorch/debugger/debugger_config.py +19 -9
- msprobe/pytorch/debugger/precision_debugger.py +14 -0
- msprobe/pytorch/dump/module_dump/module_processer.py +10 -30
- msprobe/pytorch/function_factory.py +7 -1
- msprobe/pytorch/hook_module/support_wrap_ops.yaml +2 -1
- msprobe/pytorch/hook_module/wrap_distributed.py +4 -0
- msprobe/pytorch/monitor/anomaly_detect.py +14 -29
- msprobe/pytorch/monitor/csv2tb.py +10 -12
- msprobe/pytorch/monitor/module_hook.py +123 -104
- msprobe/pytorch/monitor/module_metric.py +6 -6
- msprobe/pytorch/monitor/optimizer_collect.py +45 -63
- msprobe/pytorch/monitor/utils.py +8 -43
- msprobe/pytorch/pt_config.py +19 -22
- msprobe/pytorch/service.py +103 -24
- msprobe/visualization/builder/graph_builder.py +31 -5
- msprobe/visualization/builder/msprobe_adapter.py +7 -5
- msprobe/visualization/graph/base_node.py +3 -2
- msprobe/visualization/graph/distributed_analyzer.py +80 -3
- msprobe/visualization/graph/node_op.py +4 -2
- msprobe/visualization/graph_service.py +3 -4
- msprobe/visualization/utils.py +10 -2
- {mindstudio_probe-1.2.1.dist-info → mindstudio_probe-1.2.2.dist-info}/LICENSE +0 -0
- {mindstudio_probe-1.2.1.dist-info → mindstudio_probe-1.2.2.dist-info}/WHEEL +0 -0
- {mindstudio_probe-1.2.1.dist-info → mindstudio_probe-1.2.2.dist-info}/entry_points.txt +0 -0
- {mindstudio_probe-1.2.1.dist-info → mindstudio_probe-1.2.2.dist-info}/top_level.txt +0 -0
msprobe/pytorch/monitor/module_metric.py CHANGED

@@ -17,7 +17,7 @@ import re
 import torch

 from msprobe.pytorch.monitor.features import get_max, get_min, get_zeros, get_nans, get_norm, get_mean
-from msprobe.pytorch.monitor.utils import
+from msprobe.pytorch.monitor.utils import get_nan_tensor


 def get_summary_writer_tag_name(module_or_param_name: str, tag: str, rank):

@@ -147,13 +147,13 @@ def get_metrics(ops, tag2tensor, eps, out_dict=None):
     """
     :param ops: ["op1", "op2"]
     :param tag2tensor: {
-        '0:
-        '0:
+        '0:fc.input:0/actv': torch.randn([3, 4]),
+        '0:fc.output:0/actv': torch.randn([3, 3])
     }
     :param eps: float 1e-8
     :param out_dict:{
-        '0:
-        '0:
+        '0:fc.input:0/actv': {"op1": op1(torch.randn([3, 4])), "op2": op2(torch.randn([3, 4]))}
+        '0:fc.output:0/actv': {"op1": op1(torch.randn([3, 3])), "op2": op2(torch.randn([3, 3]))}
     }
     :return: out_dict
     """

@@ -164,7 +164,7 @@ def get_metrics(ops, tag2tensor, eps, out_dict=None):
         out_dict[tag] = {}
         if not torch.is_tensor(tensor):
             # Non-tensor in/output filled with nan.
-            out_dict[tag].update({metric_name:
+            out_dict[tag].update({metric_name: get_nan_tensor() for metric_name in ops})
             continue
         for metric_name in ops:
             fun_metric = config_metric_registry.get(metric_name)
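The change above routes every non-tensor input/output through a shared NaN placeholder (`get_nan_tensor()`) instead of building one per metric. A minimal call-shape sketch following the updated docstring; the op names "min" and "max" and the non-tensor entry are assumptions for illustration, not verified against the monitor's registered metric names:

```python
import torch

from msprobe.pytorch.monitor.module_metric import get_metrics  # msprobe 1.2.2

# Tag layout copied from the updated docstring; "min"/"max" are assumed op names.
tag2tensor = {
    '0:fc.input:0/actv': torch.randn([3, 4]),
    '0:fc.output:0/actv': torch.randn([3, 3]),
    '0:fc.output:1/actv': "not a tensor",  # per the diff, every op for this tag is filled with NaN
}

out = get_metrics(["min", "max"], tag2tensor, eps=1e-8)
```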
msprobe/pytorch/monitor/optimizer_collect.py CHANGED

@@ -23,16 +23,10 @@ from msprobe.pytorch.monitor.utils import MVResult, MVGradResult


 class OptimizerMon(object):
-    wrapped_optimizer = None
-
     def __init__(self) -> None:
         self.fp16_to_fp32_param = {}
         self.is_stage3 = False

-    @classmethod
-    def set_wrapped_optimizer(cls, wrapped_optimizer):
-        cls.wrapped_optimizer = wrapped_optimizer
-
     def fetch_mv(self, monitor, torch_opt, params2name):
         pass

@@ -82,7 +76,6 @@ class OptimizerMon(object):
         ratio_dict = defaultdict()
         param2name = defaultdict()
         fp32_partitioned_groups_flat_grad = defaultdict()
-        mix_prec_opt = OptimizerMon.wrapped_optimizer
         partition_id = dist.get_rank()

         def get_flatten_grad(self, optimizer, group_idx):

@@ -101,7 +94,7 @@ class OptimizerMon(object):
                 return fp32_partitioned_groups_flat[group_idx].grad

         for group_idx in range(len(fp32_partitioned_groups_flat)):
-            fp32_partitioned_groups_flat_grad[group_idx] = get_flatten_grad(self,
+            fp32_partitioned_groups_flat_grad[group_idx] = get_flatten_grad(self, torch_opt, group_idx)

         for name in params2name.values():
             start_idx, end_idx, group_idx, group_with_rank = name2indices[name]

@@ -110,9 +103,9 @@ class OptimizerMon(object):
             fp32_param = fp32_partitioned_groups_flat[group_idx][start_idx: end_idx]
             fp32_param.grad = fp32_partitioned_groups_flat_grad[group_idx][start_idx: end_idx]
             param2name[fp32_param] = name
-            if not
+            if not torch_opt.state:
                 continue
-            state_param = list(
+            state_param = list(torch_opt.state.values())[group_idx]
             exp_avg = state_param.get("exp_avg", None)
             exp_avg_sq = state_param.get("exp_avg_sq", None)
             if exp_avg is None or exp_avg_sq is None:

@@ -150,36 +143,33 @@ class MixPrecisionOptimizerMon(OptimizerMon):
    混合精度训练通过适当降低某些计算的精度来加速训练过程并减少内存消耗。
    """

-    def map_fp16_tp_fp32_param(self,
-        for fp16_group, fp32_group in zip(
+    def map_fp16_tp_fp32_param(self, torch_opt):
+        for fp16_group, fp32_group in zip(torch_opt.float16_groups, torch_opt.fp32_from_float16_groups):
             for fp16_param, fp32_param in zip(fp16_group, fp32_group):
                 self.fp16_to_fp32_param[fp16_param] = fp32_param

     def fetch_mv(self, monitor, torch_opt, params2name):
-
-
-        if not self.fp16_to_fp32_param and mix_prec_opt is not None:
-            self.map_fp16_tp_fp32_param(mix_prec_opt)
+        if not self.fp16_to_fp32_param and torch_opt is not None:
+            self.map_fp16_tp_fp32_param(torch_opt)

         return self._fetch_mv_in_adam(monitor, torch_opt, params2name)


 class MegatronDistributedOptimizerMon(OptimizerMon):
-    def map_fp16_tp_fp32_param(self,
-        if not (hasattr(
-                hasattr(
+    def map_fp16_tp_fp32_param(self, torch_opt):
+        if not (hasattr(torch_opt, "model_float16_groups") and
+                hasattr(torch_opt, "shard_fp32_from_float16_groups")):
             raise Exception(
                 "megatron distributed optimizer should have model_float16_groups and shard_fp32_from_float16_groups, "
                 "if not, please check megatron-lm version")
-        for fp16_group, shard_fp32_group in zip(
-
+        for fp16_group, shard_fp32_group in zip(torch_opt.model_float16_groups,
+                                                torch_opt.shard_fp32_from_float16_groups):
             for fp16_param, shard_fp32_param in zip(fp16_group, shard_fp32_group):
                 self.fp16_to_fp32_param[fp16_param] = shard_fp32_param

     def fetch_mv(self, monitor, torch_opt, params2name):
-
-
-        self.map_fp16_tp_fp32_param(mix_prec_opt)
+        if not self.fp16_to_fp32_param and torch_opt is not None:
+            self.map_fp16_tp_fp32_param(torch_opt)

         return self._fetch_mv_in_adam(monitor, torch_opt, params2name)

@@ -191,30 +181,26 @@ class MegatronFP32OptimizerMon(OptimizerMon):

 class MegatronChainedDistributedOptimizerMon(MegatronDistributedOptimizerMon):
     def fetch_mv(self, monitor, torch_opt, params2name):
-
-
-        if not self.fp16_to_fp32_param and mix_prec_opt is not None:
-            for opt in mix_prec_opt.chained_optimizers:
+        if not self.fp16_to_fp32_param and torch_opt is not None:
+            for opt in torch_opt.chained_optimizers:
                 self.map_fp16_tp_fp32_param(opt)

         if not isinstance(torch_opt, torch.optim.Optimizer):
             torch_opt.state = {}
-            for opt in
+            for opt in torch_opt.chained_optimizers:
                 torch_opt.state.update(opt.optimizer.state)
         return self._fetch_mv_in_adam(monitor, torch_opt, params2name)


 class MegatronChainedMixPrecisionOptimizerMon(MixPrecisionOptimizerMon):
     def fetch_mv(self, monitor, torch_opt, params2name):
-
-
-        if not self.fp16_to_fp32_param and mix_prec_opt is not None:
-            for opt in mix_prec_opt.chained_optimizers:
+        if not self.fp16_to_fp32_param and torch_opt is not None:
+            for opt in torch_opt.chained_optimizers:
                 self.map_fp16_tp_fp32_param(opt)

         if not isinstance(torch_opt, torch.optim.Optimizer):
             torch_opt.state = {}
-            for opt in
+            for opt in torch_opt.chained_optimizers:
                 torch_opt.state.update(opt.optimizer.state)
         return self._fetch_mv_in_adam(monitor, torch_opt, params2name)

@@ -225,9 +211,8 @@ class DeepSpeedZeroOptimizerStage0Mon(OptimizerMon):


 class DeepSpeedZeroOptimizerStage3Mon(OptimizerMon):
-    def get_param_index(self, params2name, name2index):
-
-        fp16_groups = mix_prec_opt.fp16_partitioned_groups
+    def get_param_index(self, params2name, name2index, torch_opt):
+        fp16_groups = torch_opt.fp16_partitioned_groups
         name2indices = defaultdict()
         index_length = defaultdict()
         index = 0

@@ -246,13 +231,11 @@ class DeepSpeedZeroOptimizerStage3Mon(OptimizerMon):

     def fetch_mv(self, monitor, torch_opt, params2name, name2indices=None):
         self.is_stage3 = True
-
-        fp32_partitioned_groups_flat = mix_prec_opt.fp32_partitioned_groups_flat
+        fp32_partitioned_groups_flat = torch_opt.fp32_partitioned_groups_flat
         return self._fetch_mv_grad_in_adam(monitor, torch_opt, params2name, name2indices, fp32_partitioned_groups_flat)


 class DeepSpeedZeroOptimizerStage1or2Mon(OptimizerMon):
-
     @staticmethod
     def get_group_index(fp32_length, world_size, index):
         for i in range(len(fp32_length) - 1):

@@ -265,12 +248,11 @@ class DeepSpeedZeroOptimizerStage1or2Mon(OptimizerMon):
                 return sub_interval_start, min(sub_index, world_size - 1)
         return fp32_length[-1], 0

-    def get_param_index(self, params2name, name2index):
-
-        padding = mix_prec_opt.groups_padding
+    def get_param_index(self, params2name, name2index, torch_opt):
+        padding = torch_opt.groups_padding
         world_size = dist.get_world_size()
         fp32_length = [0]
-        for fp32_group_index, single_partition_of_fp32_group in enumerate(
+        for fp32_group_index, single_partition_of_fp32_group in enumerate(torch_opt.single_partition_of_fp32_groups):
             fp32_length.append(len(single_partition_of_fp32_group) * world_size + fp32_length[fp32_group_index])

         bf16_groups = []

@@ -278,7 +260,7 @@ class DeepSpeedZeroOptimizerStage1or2Mon(OptimizerMon):
         index_length = defaultdict()
         index = 0
         idx = 0
-        for group_idx, bf16_group in enumerate(
+        for group_idx, bf16_group in enumerate(torch_opt.bit16_groups):
             bf16_groups.extend(bf16_group)
             for param in bf16_group:
                 param_length = len(param.flatten())

@@ -286,7 +268,7 @@ class DeepSpeedZeroOptimizerStage1or2Mon(OptimizerMon):
                 index_length[idx] = (index, index + param_length, group_idx, group_index, group_with_rank)
                 index += param_length
                 idx += 1
-        group_length = len(bf16_groups) / len(
+        group_length = len(bf16_groups) / len(torch_opt.bit16_groups)
         for _, name in params2name.items():
             name_index = name2index[name]
             start_idx, end_idx, group_idx, group_index, group_with_rank = index_length[name_index]

@@ -300,8 +282,7 @@ class DeepSpeedZeroOptimizerStage1or2Mon(OptimizerMon):
         return name2indices

     def fetch_mv(self, monitor, torch_opt, params2name, name2indices=None):
-
-        fp32_partitioned_groups_flat = mix_prec_opt.single_partition_of_fp32_groups
+        fp32_partitioned_groups_flat = torch_opt.single_partition_of_fp32_groups
         return self._fetch_mv_grad_in_adam(monitor, torch_opt, params2name, name2indices, fp32_partitioned_groups_flat)

@@ -312,22 +293,23 @@ class DummyOptimizerMon(OptimizerMon):

 class OptimizerMonFactory:
     _optimizer_mon_map = {
-        "
-        "
-        "
-        "
-        "
-        "
-        "
+        "FP32Optimizer": MegatronFP32OptimizerMon,
+        "Float16OptimizerWithFloat16Params": MixPrecisionOptimizerMon,
+        "DistributedOptimizer": MegatronDistributedOptimizerMon,
+        "ChainedDistributedOptimizer": MegatronChainedDistributedOptimizerMon,
+        "ChainedFloat16OptimizerWithFloat16Params": MegatronChainedMixPrecisionOptimizerMon,
+        "BF16_Optimizer": DeepSpeedZeroOptimizerStage0Mon,
+        "DeepSpeedZeroOptimizer": DeepSpeedZeroOptimizerStage1or2Mon,
         "DeepSpeedZeroOptimizer_Stage3": DeepSpeedZeroOptimizerStage3Mon,
-        "
+        "Adam": DummyOptimizerMon
     }

     @staticmethod
-    def create_optimizer_mon(
-
-
-
-
-
-
+    def create_optimizer_mon(optimizer):
+        # auto replace opt_ty
+        optimizer_class = optimizer.__class__.__name__
+        if optimizer_class == "ChainedOptimizer":
+            optimizer_class = "Chained" + optimizer.chained_optimizers[0].__class__.__name__
+
+        optimizer_mon_class = OptimizerMonFactory._optimizer_mon_map.get(optimizer_class, DummyOptimizerMon)
+        return optimizer_mon_class(), optimizer_class
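With this change the factory resolves the monitor class from the optimizer's own class name, so callers no longer pre-register a wrapped optimizer via the removed `set_wrapped_optimizer`. A hedged usage sketch, assuming msprobe 1.2.2 is installed and `torch_opt` is the training optimizer (for example a Megatron or DeepSpeed wrapper); the helper function is illustrative, not part of the package:

```python
from msprobe.pytorch.monitor.optimizer_collect import OptimizerMonFactory


def build_optimizer_monitor(torch_opt):
    # Resolution is by class name: "DistributedOptimizer" -> MegatronDistributedOptimizerMon,
    # "DeepSpeedZeroOptimizer" -> DeepSpeedZeroOptimizerStage1or2Mon, and so on.
    # A ChainedOptimizer is keyed by "Chained" + the class name of its first inner optimizer,
    # and anything unknown falls back to DummyOptimizerMon.
    optimizer_mon, optimizer_class = OptimizerMonFactory.create_optimizer_mon(torch_opt)
    return optimizer_mon, optimizer_class
```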
msprobe/pytorch/monitor/utils.py CHANGED

@@ -36,7 +36,7 @@ except ImportError:
 if torch.cuda.is_available():
     device = "cuda"

-NAN_TENSOR_ON_DEVICE =
+NAN_TENSOR_ON_DEVICE = None
 FILE_MAX_SIZE = 10 * 1024 * 1024 * 1024
 FILE_NAME_MAX_LENGTH = 255
 DIRECTORY_MAX_LENGTH = 4096

@@ -57,6 +57,13 @@ def get_output_base_dir():
     return os.getenv(MonitorConst.MONITOR_OUTPUT_DIR, MonitorConst.DEFAULT_MONITOR_OUTPUT_DIR)


+def get_nan_tensor():
+    global NAN_TENSOR_ON_DEVICE
+    if not NAN_TENSOR_ON_DEVICE:
+        NAN_TENSOR_ON_DEVICE = torch.tensor(torch.nan, device=device)
+    return NAN_TENSOR_ON_DEVICE
+
+
 def filter_special_chars(func):
     @wraps(func)
     def func_level(msg):

@@ -82,48 +89,6 @@ def get_param_struct(param):
     return res


-def is_recomputation():
-    """Check if the current operation is in the re-computation phase.
-
-    This function inspects the current call stack to indicate whether the current operation is in the
-    re-computation phase. We use a blacklist mechanism, now supported megatron and mindspeed framework.
-    megatron: The 'backward' function is called by the 'torch/autograd/function.py' file.
-    mindspeed: The 'checkpoint_function_backward' function is called by the 'torch/autograd/function.py'
-    file or the custom module(use CheckpointWithoutOutput) with the 'backward' function is executed within the
-    'torch/_tensor.py' file.
-
-    Returns:
-        bool: True if in the re-computation phase, False otherwise.
-    """
-    backward_function_indices = []
-    call_stack = inspect.stack()
-
-    # Identify the function 'backward' is being executed within the 'torch/_tensor.py' file.
-    for frame_info in call_stack:
-        if frame_info.function == Const.BACKWARD and frame_info.filename.endswith('torch/_tensor.py'):
-            del call_stack
-            return True
-
-    # Identify indices in the call stack where the specific function is being executed
-    for idx, frame_info in enumerate(call_stack):
-        if frame_info.function == Const.BACKWARD or frame_info.function == 'checkpoint_function_backward':
-            backward_function_indices.append(idx)
-
-    # Check if the execution is within 'torch/autograd/function.py' file
-    for idx in backward_function_indices:
-        # The Megatron and MindSpeed L0&L1 scenes
-        if idx + 1 < len(call_stack) and call_stack[idx + 1].filename.endswith('torch/autograd/function.py'):
-            del call_stack
-            return True
-        # The latest MindSpeed L2 and ModelLink scenes
-        if idx + 2 < len(call_stack) and call_stack[idx + 2].filename.endswith('torch/autograd/function.py'):
-            del call_stack
-            return True
-
-    del call_stack
-    return False
-
-
 def validate_ops(ops):
     if not isinstance(ops, list):
         raise TypeError("ops should be a list")
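The utils change replaces an eagerly created module-level NaN tensor with one that is built lazily and cached on first use. A standalone restatement of that pattern (a sketch, not the package code; the device selection and the `is None` check here are simplifications of what the diff shows):

```python
import torch

_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
_NAN_TENSOR = None  # created on first use so importing the module never touches the device


def get_nan_tensor():
    global _NAN_TENSOR
    if _NAN_TENSOR is None:  # the package checks truthiness; `is None` is used here for clarity
        _NAN_TENSOR = torch.tensor(torch.nan, device=_DEVICE)
    return _NAN_TENSOR
```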
msprobe/pytorch/pt_config.py CHANGED

@@ -1,4 +1,4 @@
-# Copyright (c) 2024-
+# Copyright (c) 2024-2025, Huawei Technologies Co., Ltd.
 # All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");

@@ -303,28 +303,25 @@ class GradToolConfig(BaseConfig):
         check_bounds(self.bounds)


+class StructureConfig(BaseConfig):
+    def __init__(self, json_config):
+        super().__init__(json_config)
+
+
+TaskDict = {
+    Const.TENSOR: TensorConfig,
+    Const.STATISTICS: StatisticsConfig,
+    Const.OVERFLOW_CHECK: OverflowCheckConfig,
+    Const.FREE_BENCHMARK: FreeBenchmarkCheckConfig,
+    Const.RUN_UT: RunUTConfig,
+    Const.GRAD_PROBE: GradToolConfig,
+    Const.STRUCTURE: StructureConfig
+}
+
+
 def parse_task_config(task, json_config):
-
-
-        config_dic = json_config.get(Const.TENSOR, default_dic)
-        return TensorConfig(config_dic)
-    elif task == Const.STATISTICS:
-        config_dic = json_config.get(Const.STATISTICS, default_dic)
-        return StatisticsConfig(config_dic)
-    elif task == Const.OVERFLOW_CHECK:
-        config_dic = json_config.get(Const.OVERFLOW_CHECK, default_dic)
-        return OverflowCheckConfig(config_dic)
-    elif task == Const.FREE_BENCHMARK:
-        config_dic = json_config.get(Const.FREE_BENCHMARK, default_dic)
-        return FreeBenchmarkCheckConfig(config_dic)
-    elif task == Const.RUN_UT:
-        config_dic = json_config.get(Const.RUN_UT, default_dic)
-        return RunUTConfig(config_dic)
-    elif task == Const.GRAD_PROBE:
-        config_dic = json_config.get(Const.GRAD_PROBE, default_dic)
-        return GradToolConfig(config_dic)
-    else:
-        return StatisticsConfig(default_dic)
+    task_map = json_config.get(task, dict())
+    return TaskDict.get(task)(task_map)


 def parse_json_config(json_file_path, task):
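`parse_task_config` above drops the if/elif chain in favour of a lookup table keyed by task name, and adds a `StructureConfig` entry for the new structure task. A self-contained sketch of the same table-driven dispatch, using hypothetical stand-in classes and task keys rather than the package's `Const.*` constants and `BaseConfig` subclasses:

```python
class _StatisticsCfg(dict):
    """Stand-in for a statistics task config class (illustrative only)."""


class _TensorCfg(dict):
    """Stand-in for a tensor task config class (illustrative only)."""


# Task name -> config class, mirroring the TaskDict idea in the diff.
_TASK_DICT = {"statistics": _StatisticsCfg, "tensor": _TensorCfg}


def parse_task_config(task, json_config):
    task_map = json_config.get(task, dict())  # per-task sub-config, may be empty
    # The sketch adds a fallback; the package version expects `task` to be a known key.
    return _TASK_DICT.get(task, _StatisticsCfg)(task_map)


print(type(parse_task_config("tensor", {"tensor": {"scope": []}})).__name__)  # _TensorCfg
```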
msprobe/pytorch/service.py CHANGED

@@ -15,19 +15,19 @@

 import functools
 import os
-from collections import namedtuple
+from collections import namedtuple, defaultdict

 import torch
 from msprobe.core.common.const import Const
 from msprobe.core.common.exceptions import DistributedNotInitializedError
 from msprobe.core.common.file_utils import create_directory
-from msprobe.core.common.utils import print_tools_ends_info
+from msprobe.core.common.utils import print_tools_ends_info, DumpPathAggregation
 from msprobe.core.data_dump.data_collector import build_data_collector
 from msprobe.core.data_dump.data_processor.base import ModuleForwardInputsOutputs, ModuleBackwardInputsOutputs
 from msprobe.core.data_dump.scope import BaseScope
 from msprobe.pytorch.api_accuracy_checker.common.utils import ApiData
 from msprobe.pytorch.common.log import logger
-from msprobe.pytorch.common.utils import get_rank_if_initialized
+from msprobe.pytorch.common.utils import get_rank_if_initialized, is_recomputation
 from msprobe.pytorch.dump.kernel_dump.kernel_config import create_kernel_config_json
 from msprobe.pytorch.dump.module_dump.module_processer import ModuleProcesser
 from msprobe.pytorch.hook_module.api_registry import api_register

@@ -56,13 +56,16 @@ class Service:
         self.should_stop_service = False
         self.attl = None
         self.params_grad_info = {}
+        self.hook_handle_dict = {}
         # 提前注册,确保注册尽可能多的API hook
         self.register_api_hook()
+        self.init_for_debug_level()

     def build_hook(self, module_type, name):
         def pre_hook(api_or_module_name, module, args, kwargs):
             if not self.should_execute_hook(module_type, module, True):
                 return args, kwargs
+            is_recompute = is_recomputation()

             self.inner_switch = True
             if module_type == BaseScope.Module_Type_Module:

@@ -77,7 +80,13 @@ class Service:
                 return None, None
             if self.data_collector:
                 module_input_output = ModuleForwardInputsOutputs(args=args, kwargs=kwargs, output=None)
-                self.data_collector.forward_input_data_collect(
+                self.data_collector.forward_input_data_collect(
+                    api_or_module_name,
+                    module,
+                    pid,
+                    module_input_output,
+                    is_recompute
+                )

             self.inner_switch = False
             return args, kwargs

@@ -101,7 +110,12 @@ class Service:
             if not (Const.FORWARD in self.config.data_mode and Const.BACKWARD not in self.config.data_mode):
                 for param_name, param in params_dict.items():
                     if param.requires_grad:
-
+                        name = ori_name + Const.SEP + param_name
+                        old_handle = self.hook_handle_dict.get(name)
+                        if old_handle and hasattr(old_handle, "remove"):
+                            old_handle.remove()
+                        handle = param.register_hook(grad_hook(module, ori_name, param_name))
+                        self.hook_handle_dict[name] = handle

         def init_params_grad_info(module, params_dict):
             '''

@@ -125,6 +139,7 @@ class Service:
         def forward_hook(api_or_module_name, module, args, kwargs, output):
             if not self.should_execute_hook(module_type, module, True):
                 return None
+            is_recompute = is_recomputation()

             self.inner_switch = True
             if self.config.online_run_ut:

@@ -147,10 +162,15 @@ class Service:
             if module_type == BaseScope.Module_Type_Module:
                 api_or_module_name = module.mindstudio_reserved_name[-1]
                 self.data_collector.update_api_or_module_name(api_or_module_name)
-                params_dict = {
-
+                params_dict = {}
+                if self.config.task != Const.STRUCTURE:
+                    params_dict = {
+                        key.split(Const.SEP)[-1]: value
+                        for key, value in module.named_parameters(recurse=False)
+                    }
+                setattr(module_input_output, Const.PARAMS, params_dict)
                 # 判断是否需要注册参数hook
-                if
+                if params_dict:
                     ori_name = api_or_module_name.rsplit(Const.SEP, 2)[0]
                     grad_name = ori_name + Const.SEP + Const.PARAMS_GRAD
                     # 首次执行前向hook时,添加params_grad_name属性,并注册参数hook

@@ -160,7 +180,8 @@ class Service:
                         api_or_module_name,
                         module,
                         pid,
-                        module_input_output
+                        module_input_output,
+                        is_recompute
                     )
                     init_params_grad_info(module, params_dict)
                 else:

@@ -169,7 +190,8 @@ class Service:
                         api_or_module_name,
                         module,
                         pid,
-                        module_input_output
+                        module_input_output,
+                        is_recompute
                     )

                 if self.data_collector.if_return_forward_new_output():

@@ -185,6 +207,7 @@ class Service:
         def backward_hook(api_or_module_name, module, grad_input, grad_output):
             if not self.should_execute_hook(module_type, module, False):
                 return
+            is_recompute = is_recomputation()

             self.inner_switch = True
             if module_type == BaseScope.Module_Type_Module:

@@ -198,7 +221,13 @@ class Service:
             if self.data_collector:
                 # 此处获取到的grad_input实际为反向过程的输出数据,grad_output为反向过程的输入数据,因此传入时调换顺序
                 module_input_output = ModuleBackwardInputsOutputs(grad_input=grad_output, grad_output=grad_input)
-                self.data_collector.backward_data_collect(
+                self.data_collector.backward_data_collect(
+                    api_or_module_name,
+                    module,
+                    pid,
+                    module_input_output,
+                    is_recompute
+                )
             self.inner_switch = False

         pid = os.getpid()

@@ -217,6 +246,8 @@ class Service:
         return HookFn(pre_forward_hook_fn, forward_hook_fn, backward_hook_fn, forward_hook_torch_version_below_2_fn)

     def start(self, model):
+        if self.config.level == Const.LEVEL_DEBUG:
+            return
         if self.need_stop_service():
             return


@@ -231,6 +262,8 @@ class Service:
             if self.config.rank and self.current_rank not in self.config.rank:
                 return
             self.register_module_hook()
+            if self.config.level == Const.LEVEL_MIX:
+                register_optimizer_hook(self.data_collector)
             self.first_start = False
         if self.config.online_run_ut and torch_version_above_or_equal_2:
             run_ut_dispatch(self.attl, True, self.config.online_run_ut_recompute)

@@ -241,6 +274,8 @@ class Service:
         logger.info_on_rank_0(f"Dump data will be saved in {self.dump_iter_dir}.")

     def stop(self):
+        if self.config.level == Const.LEVEL_DEBUG:
+            return
         if self.should_stop_service:
             return
         if self.config.step and self.current_iter not in self.config.step:

@@ -255,15 +290,19 @@ class Service:
             return
         if self.config.async_dump:
             self.data_collector.fill_stack_tensor_data()
-            self.
+            if self.config.task == Const.TENSOR:
+                self.data_collector.data_processor.dump_async_data()
         self.data_collector.write_json()

     def step(self):
+        if self.config.level == Const.LEVEL_DEBUG:
+            return
         if self.should_stop_service:
             return
         if self.config.async_dump:
             self.data_collector.fill_stack_tensor_data()
-            self.
+            if self.config.task == Const.TENSOR:
+                self.data_collector.data_processor.dump_async_data()
         self.data_collector.write_json()
         self.current_iter += 1
         self.data_collector.update_iter(self.current_iter)

@@ -319,13 +358,13 @@ class Service:
         else:
            dump_data_dir = None

-
-
-
-
-
-
-        )
+        dump_path_aggregation = DumpPathAggregation()
+        dump_path_aggregation.dump_file_path = os.path.join(dump_dir, "dump.json")
+        dump_path_aggregation.stack_file_path = os.path.join(dump_dir, "stack.json")
+        dump_path_aggregation.construct_file_path = os.path.join(dump_dir, "construct.json")
+        dump_path_aggregation.dump_tensor_data_dir = dump_data_dir
+        dump_path_aggregation.free_benchmark_file_path = os.path.join(dump_dir, "free_benchmark.csv")
+        self.data_collector.update_dump_paths(dump_path_aggregation)
         self.data_collector.initialize_json_file(framework=Const.PT_FRAMEWORK)

     def register_api_hook(self):

@@ -337,9 +376,6 @@ class Service:
             )
             api_register.api_modularity()

-        if self.config.level == Const.LEVEL_MIX:
-            register_optimizer_hook(self.data_collector)
-
     def register_module_hook(self):
         if self.config.level in [Const.LEVEL_L0, Const.LEVEL_MIX]:
             logger.info_on_rank_0(f"The module {self.config.task} hook function is successfully mounted to the model.")

@@ -379,7 +415,7 @@ class Service:
     def reset_status(self):
         ModuleProcesser.reset_module_stats()
         HOOKModule.reset_module_stats()
-        self.data_collector.
+        self.data_collector.reset_status()
         self.params_grad_info.clear()

         if self.config.level == Const.LEVEL_L2:

@@ -389,3 +425,46 @@ class Service:
             return
         if self.config.rank and self.current_rank not in self.config.rank:
             return
+
+    def init_for_debug_level(self):
+        if not (self.config.level == Const.LEVEL_DEBUG and self.config.task in [Const.TENSOR, Const.STATISTICS]):
+            return
+        try:
+            self.current_rank = get_rank_if_initialized()
+        except DistributedNotInitializedError:
+            self.current_rank = None
+
+        # dir: dump_path -- rank{} -- debug.json
+        self.dump_iter_dir = self.config.dump_path
+        cur_rank = self.current_rank if self.current_rank is not None else ''
+        dump_dir = os.path.join(self.dump_iter_dir, f"rank{cur_rank}")
+        create_directory(dump_dir)
+        if self.config.task in self.data_collector.tasks_need_tensor_data:
+            dump_data_dir = os.path.join(dump_dir, "dump_tensor_data")
+            create_directory(dump_data_dir)
+        else:
+            dump_data_dir = None
+
+        dump_path_aggregation = DumpPathAggregation()
+        dump_path_aggregation.dump_tensor_data_dir = dump_data_dir
+        dump_path_aggregation.debug_file_path = os.path.join(dump_dir, "debug.json")
+        self.data_collector.update_dump_paths(dump_path_aggregation)
+        self.data_collector.initialize_json_file(framework=Const.PT_FRAMEWORK)
+
+        self.debug_variable_counter = defaultdict(int)
+
+    def save(self, variable, name, save_backward):
+        if self.config.level != Const.LEVEL_DEBUG:
+            return
+        count = self.debug_variable_counter[name]
+        self.debug_variable_counter[name] += 1
+
+        name_with_count = f"{name}.{count}"
+        grad_name_with_count = f"{name}_grad.{count}"
+
+        # forward save
+        self.data_collector.debug_data_collect_forward(variable, name_with_count)
+
+        # backward save
+        if save_backward:
+            self.data_collector.debug_data_collect_backward(variable, grad_name_with_count)