mindstudio-probe 8.2.1__py3-none-any.whl → 8.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mindstudio_probe-8.2.1.dist-info → mindstudio_probe-8.3.1.dist-info}/METADATA +1 -1
- {mindstudio_probe-8.2.1.dist-info → mindstudio_probe-8.3.1.dist-info}/RECORD +39 -40
- msprobe/README.md +7 -2
- msprobe/core/common/const.py +17 -3
- msprobe/core/common/file_utils.py +138 -32
- msprobe/core/common/framework_adapter.py +16 -6
- msprobe/core/common/utils.py +17 -0
- msprobe/core/compare/diff_analyze/first_diff_analyze.py +4 -16
- msprobe/core/compare/find_first/utils.py +1 -1
- msprobe/core/config_check/ckpt_compare/ckpt_comparator.py +6 -1
- msprobe/core/hook_manager.py +0 -1
- msprobe/docs/01.installation.md +2 -0
- msprobe/docs/02.config_introduction.md +1 -1
- msprobe/docs/14.data_parse_PyTorch.md +2 -0
- msprobe/docs/15.free_benchmarking_PyTorch.md +1 -1
- msprobe/docs/21.visualization_PyTorch.md +1 -1
- msprobe/docs/26.data_dump_PyTorch_baseline.md +3 -3
- msprobe/docs/32.ckpt_compare.md +5 -5
- msprobe/mindspore/api_accuracy_checker/generate_op_script/op_generator.py +1 -1
- msprobe/mindspore/compare/utils.py +1 -2
- msprobe/mindspore/monitor/module_hook.py +17 -20
- msprobe/msprobe.py +6 -4
- msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +1 -1
- msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +34 -5
- msprobe/pytorch/common/utils.py +2 -52
- msprobe/pytorch/compare/utils.py +1 -2
- msprobe/pytorch/dump/module_dump/hook_wrapper.py +24 -0
- msprobe/pytorch/dump/module_dump/module_processer.py +27 -6
- msprobe/pytorch/hook_module/api_register.py +11 -2
- msprobe/pytorch/monitor/module_hook.py +16 -34
- msprobe/pytorch/pt_config.py +6 -0
- msprobe/visualization/builder/graph_builder.py +3 -2
- msprobe/visualization/builder/graph_merger.py +13 -0
- msprobe/visualization/graph/graph.py +13 -9
- msprobe/visualization/utils.py +11 -1
- msprobe/core/compare/diff_analyze/ignore_op_list.yaml +0 -3
- {mindstudio_probe-8.2.1.dist-info → mindstudio_probe-8.3.1.dist-info}/LICENSE +0 -0
- {mindstudio_probe-8.2.1.dist-info → mindstudio_probe-8.3.1.dist-info}/WHEEL +0 -0
- {mindstudio_probe-8.2.1.dist-info → mindstudio_probe-8.3.1.dist-info}/entry_points.txt +0 -0
- {mindstudio_probe-8.2.1.dist-info → mindstudio_probe-8.3.1.dist-info}/top_level.txt +0 -0
msprobe/pytorch/dump/module_dump/module_processer.py
CHANGED

```diff
@@ -13,21 +13,25 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import threading
 import sys
+import threading
 from collections import OrderedDict
 
 import torch
 from torch.utils.hooks import BackwardHook, RemovableHandle
 
 from msprobe.core.common.const import Const
+from msprobe.core.common.megatron_utils import wrap_megatron_step, get_micro_step, is_megatron
 from msprobe.core.common.runtime import Runtime
 from msprobe.core.common.utils import ModuleQueue, ThreadSafe
-from msprobe.core.common.megatron_utils import wrap_megatron_step, get_micro_step, is_megatron
 from msprobe.core.data_dump.scope import BaseScope, ModuleRangeScope, MixRangeScope
 from msprobe.pytorch.common.log import logger
 from msprobe.pytorch.common.utils import is_torch_nn_module, register_forward_pre_hook
-from msprobe.pytorch.dump.module_dump.hook_wrapper import
+from msprobe.pytorch.dump.module_dump.hook_wrapper import (
+    wrap_setup_input_output_hook,
+    wrap_backward_hook_function_apply
+)
+
 
 torch_version_above_or_equal_2 = torch.__version__.split('+')[0] >= '2.0'
 torch_version_above_or_equal_21 = torch.__version__.split('+')[0] >= '2.1'
@@ -59,10 +63,13 @@ def wrap_forward_with_hook_safety(module):
         except _StopRecomputationError as e:
             exception_output = None
             if len(module._forward_hooks.values()) > 0:
-                # msprobe's forward_hook
-                hook_fn
-
+                # Only run msprobe's forward_hook; its name always contains 'ModuleProcesser.'
+                for hook_fn in module._forward_hooks.values():
+                    if 'ModuleProcesser' in str(hook_fn):
+                        hook_fn(module, args, kwargs, exception_output)
+                        break
             raise e
+
     if torch_version_above_or_equal_21:
         module.forward = wrapped_forward
 
@@ -80,6 +87,7 @@ class ModuleProcesser:
     def __init__(self, scope):
        self.scope = scope if isinstance(scope, (ModuleRangeScope, MixRangeScope)) else None
         wrap_setup_input_output_hook()
+        wrap_backward_hook_function_apply()
         try:
             from megatron.core.pipeline_parallel import schedules
             origin_func_id = id(schedules.deallocate_output_tensor)
@@ -146,7 +154,13 @@ class ModuleProcesser:
         modules_and_names_with_index = self.get_modules_and_names(models, recursive, module_names)
         for index, modules_and_names in modules_and_names_with_index.items():
             model = models if index == "-1" else models[int(index)]
+
+            model_list = []
             for name, module in modules_and_names:
+                model_list.append((name, module))
+
+            is_verl = "verl" in sys.modules
+            for idx, (name, module) in enumerate(model_list):
                 if recursive and module == model:
                     continue
                 if not is_torch_nn_module(module):
@@ -157,6 +171,13 @@ class ModuleProcesser:
                     continue
                 if module.__class__.__name__ == "FullyShardedDataParallel":
                     continue
+
+                # Skip the first and last layers in the verl scenario
+                if is_verl and (idx == 1 or idx == len(model_list) - 1):
+                    logger.warning(f"The module {name} is the first or last layer in verl scenario, "
+                                   f"the data dump for this module will be skipped.")
+                    continue
+
                 setattr(module, 'msprobe_hook', True)
                 module_index = (index + Const.SEP) if index != "-1" else ""
                 prefix_name = f'{BaseScope.Module_Type_Module}{Const.SEP}{module_index}{name}{Const.SEP}' + \
```
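Note on the hunk above: when `_StopRecomputationError` is caught, only msprobe's own forward hook is replayed, identified by matching 'ModuleProcesser' against the hook's string representation. A minimal standalone sketch of that filtering idea follows; the `ModuleProcesser` class and the dummy tensors are hypothetical stand-ins, not msprobe's actual implementation.

```python
import torch
import torch.nn as nn


class ModuleProcesser:  # hypothetical stand-in; only the qualified name matters here
    @staticmethod
    def forward_hook(module, args, kwargs, output):
        print(f"msprobe-style hook fired for {module.__class__.__name__}")


def replay_msprobe_hooks(module, args, kwargs, output):
    # str() of a function contains its __qualname__, e.g.
    # "<function ModuleProcesser.forward_hook at 0x...>", which the filter matches.
    for hook_fn in module._forward_hooks.values():
        if 'ModuleProcesser' in str(hook_fn):
            hook_fn(module, args, kwargs, output)
            break


layer = nn.Linear(4, 4)
layer.register_forward_hook(ModuleProcesser.forward_hook, with_kwargs=True)  # torch >= 2.0
replay_msprobe_hooks(layer, (torch.randn(2, 4),), {}, None)
```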
msprobe/pytorch/hook_module/api_register.py
CHANGED

```diff
@@ -43,7 +43,6 @@ else:
 
 torch_version_above_2 = torch.__version__.split('+')[0] > '2.0'
 
-_inner_used_api = {}
 _supported_api_list_path = (os.path.join(os.path.dirname(os.path.realpath(__file__)), Const.SUPPORT_API_FILE_NAME),)
 _cuda_func_mapping = {"npu_fusion_attention": "gpu_fusion_attention"}
 dist_data_collect_func = {}
@@ -85,6 +84,12 @@ if not is_gpu:
         mindspeed_op_file_list = [op.split(Const.SEP)[0] + Const.PY_SUFFIX for op in mindspeed_op_list]
         dynamic_import_op(mindspeed.ops, mindspeed_op_file_list)
 
+_inner_used_api = {
+    Const.PT_FRAMEWORK + Const.SEP + Const.PT_API_TYPE_TENSOR: (
+        torch.Tensor, "view_as"
+    )
+}
+
 
 @parameter_adapter
 def tensor_module_forward(module, *args, **kwargs):
@@ -130,13 +135,17 @@ def redirect_wait():
             store_func = dist_data_collect_func.pop(args[0])
             store_func()
             return
+        remove_value = None
         for value in dist_batch_data_collect_func:
             if args[0] in value[0]:
                 value[0].remove(args[0])
                 if len(value[0]) == 0:
                     store_func = value[1]
                     store_func()
-
+                    remove_value = value
+                break
+        if remove_value:
+            dist_batch_data_collect_func.remove(remove_value)
 
     return wrapped_wait
 
```
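The `redirect_wait` change above stops removing entries from `dist_batch_data_collect_func` while iterating over it; the finished entry is remembered and removed after the loop. A small, hypothetical sketch of the same "mark during iteration, remove afterwards" pattern (the names and data below are illustrative, not msprobe's):

```python
# Each entry pairs a set of pending request ids with a completion callback.
pending = [
    ({"req-1", "req-2"}, lambda: print("group A finished")),
    ({"req-3"}, lambda: print("group B finished")),
]


def complete(request_id):
    finished_entry = None
    for entry in pending:
        ids, callback = entry
        if request_id in ids:
            ids.remove(request_id)
            if not ids:              # last outstanding request in this group
                callback()
                finished_entry = entry
            break
    if finished_entry:               # removing outside the loop keeps the iteration safe
        pending.remove(finished_entry)


complete("req-3")  # prints "group B finished" and drops that entry
```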
msprobe/pytorch/monitor/module_hook.py
CHANGED

```diff
@@ -48,12 +48,10 @@ from msprobe.pytorch.monitor.module_metric import get_metrics, get_summary_write
 from msprobe.pytorch.monitor.optimizer_collect import OptimizerMonFactory
 from msprobe.pytorch.monitor.visualizer import HeatmapVisualizer
 
-
 torch_version_above_or_equal_2 = torch.__version__.split('+')[0] >= '2.0'
 if not torch_version_above_or_equal_2:
     raise ValueError("monitor require torch>=2.0")
 
-
 FORMAT_MAPPING = {
     MonitorConst.TENSORBOARD: SummaryWriterWithAD,
     MonitorConst.CSV: CSVWriterWithAD,
@@ -150,15 +148,11 @@ class GradContext:
     def __init__(self) -> None:
         self.pre = {}
         self.post = {}
-        self.acc_metric = {}
-        self.acc = {}
         self.actv = {}
 
     def reset(self):
         self.pre.clear()
         self.post.clear()
-        self.acc_metric.clear()
-        self.acc.clear()
         self.actv.clear()
 
 
@@ -510,18 +504,8 @@ class TrainerMon:
         if not self.wg_distribution:
             return {}, {}
 
-        if self.weight_hooked:
-            get_metrics(self.ops, self.grad_context.acc, self.eps, self.grad_context.acc_metric)
-
         get_metrics(self.ops, post_grad_dict, self.eps, self.grad_context.post)
-
-
-        if self.weight_hooked:
-            unreduced_grad = self.grad_context.acc_metric
-        else:
-            unreduced_grad = self.grad_context.pre
-
-        return reduced_grad, unreduced_grad
+        return self.grad_context.post, self.grad_context.pre
 
     def generate_xy_metrics(self):
         actv = {}
@@ -529,7 +513,6 @@ class TrainerMon:
             actv.update(fwd_context.actv)
 
         actv_grad = self.grad_context.actv
-
         return actv, actv_grad
 
     def reload_xy(self, xy_distribution=False):
@@ -607,11 +590,8 @@ class TrainerMon:
         if not self.wg_distribution:
             return
 
-
-
-                                              use_micro_step=self.monitor_mbs_grad)
-        else:
-            self.summary_writer.write_metrics(self.ops, self.grad_context.pre, step, 'grad_unreduced')
+        self.summary_writer.write_metrics(self.ops, self.grad_context.pre, step, 'grad_unreduced',
+                                          use_micro_step=self.monitor_mbs_grad)
         self.summary_writer.write_metrics(self.ops, self.grad_context.post, step, 'grad_reduced')
 
     def hook_optimizer(self, optimizer):
@@ -732,9 +712,9 @@ class TrainerMon:
         # Static monitoring can already save at step 0; dynamic monitoring cannot, because it is designed to start on the step after a reset, so self.monitoring is still False at step 0
         if self.monitoring:
             module_rank_valid = not self.module_rank_list or (
-
+                dist.is_initialized() and dist.get_rank() in self.module_rank_list)
             step_condition = (context.step >= self.start_step and (
-
+                context.step - self.start_step) % self.step_interval == 0)
             if module_rank_valid and step_condition:
                 self.has_collect_times += 1
 
@@ -791,6 +771,7 @@ class TrainerMon:
                 hook(optimizer, args, kwargs)
                 step_final_hook(optimizer, args, kwargs)
                 return out
+
             return wrapper
 
         optimizer.__class__.step = patch_step(optimizer.__class__.step, optimizer)
@@ -1013,11 +994,11 @@ class TrainerMon:
                 vpp_stage + module_name,
             ]:
                 if pattern in l2_targets:
-                    return pattern
+                    return pattern
         elif hook_name in ["linear_hook"]:
             return vpp_stage + squash_param_name(module_name, self.squash_name)
         return ""
-
+
     def _hook_module(self, target_names, l2_target_names, module: torch.nn.Module, vpp_stage=''):
         if '_modules' not in module.__dict__:
             # nothing to hook
@@ -1151,7 +1132,7 @@ class TrainerMon:
             context.micro_step = 0
             context.step += 1
             return
-
+
         def stack_hook(module, args, kwargs, module_output, name):
             if module not in self.module_fwd_hook_context_by_module:
                 self.module_fwd_hook_context_by_module[module] = ModuleHookContext(name)
@@ -1221,7 +1202,7 @@ class TrainerMon:
         if self.monitor_mbs_grad:
             self._hook_weights()
             return
-
+
         self.optimizer_mon.patch_grad_sync(self)
 
         if self.enable_megatron or self.enable_deepspeed:
@@ -1281,6 +1262,7 @@ class TrainerMon:
                 get_metrics(self.ops, grad_dict, self.eps, self.grad_context.pre)
                 out = foreach_reduce(fsdp_params, unsharded_grads, *unused)
                 return out
+
            return wrapper
 
         logger.info("Patch fsdp2 foreach_reduce, collect pre_grad metrics.")
@@ -1294,10 +1276,9 @@ class TrainerMon:
         """
         Walk each parameter's gradient-accumulation function (grad_acc) and attach a hook, so that once all gradients of the parameter have been computed, the gradients can be collected before communication aggregation.
         """
-        context = self.grad_context
 
         @torch.no_grad
-        def param_hook(*args,
+        def param_hook(*args, param, name):
             key = name
             if self.monitor_mbs_grad:
                 key += f'{MonitorConst.NAME_SEP}{param.micro_step}'
@@ -1305,14 +1286,15 @@ class TrainerMon:
             key = get_summary_writer_tag_name(key, 'acc_grad', self.rank)
             self.register_param_call_id("param_hook", key)
             param.micro_step += 1
-
+            grad_dict = {}
             if self.monitor_mbs_grad or (param.micro_step == self.micro_batch_number):
                 if self.params_have_main_grad:
                     grad = param.main_grad
                 else:
                     grad = param.grad
-
+                grad_dict[key] = grad.clone()
 
+            get_metrics(self.ops, grad_dict, self.eps, self.grad_context.pre)
             if param.micro_step == self.micro_batch_number:
                 param.micro_step = 0
 
@@ -1322,7 +1304,7 @@ class TrainerMon:
             param_tmp = param.expand_as(param)
             grad_acc = param_tmp.grad_fn.next_functions[0][0]
             handle = grad_acc.register_hook(
-                partial(param_hook,
+                partial(param_hook, param=param, name=name))
             self.grad_accs.append(grad_acc)
             self.handles['wgrads'].append(handle)
 
```
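The `_hook_weights` hunks above bind `param` and `name` into `param_hook` with `functools.partial` and register it on the parameter's gradient-accumulation node. Below is a self-contained sketch of that mechanism, outside of TrainerMon; the `expand_as(...).grad_fn.next_functions[0][0]` walk is the same trick the diff uses, while the surrounding code is only illustrative.

```python
from functools import partial

import torch


def param_hook(*unused, param, name):
    # Runs when the autograd engine processes this parameter's AccumulateGrad node.
    if param.grad is not None:
        print(f"{name}: grad norm = {param.grad.norm().item():.4f}")


weight = torch.nn.Parameter(torch.randn(3, 3))
param_tmp = weight.expand_as(weight)               # gives us a grad_fn to walk from
grad_acc = param_tmp.grad_fn.next_functions[0][0]  # the parameter's AccumulateGrad node
handle = grad_acc.register_hook(partial(param_hook, param=weight, name="weight"))

(weight * 2).sum().backward()                      # hook fires once the grad is accumulated
handle.remove()
```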
msprobe/pytorch/pt_config.py
CHANGED

```diff
@@ -80,6 +80,7 @@ class FreeBenchmarkCheckConfig(BaseConfig):
         self.handler_type = json_config.get("handler_type", PytorchFreeBenchmarkConst.DEFAULT_HANDLER)
         self.fuzz_level = json_config.get("fuzz_level", PytorchFreeBenchmarkConst.DEFAULT_FUZZ_LEVEL)
         self.fuzz_stage = json_config.get("fuzz_stage", PytorchFreeBenchmarkConst.DEFAULT_FUZZ_STAGE)
+        self.list = json_config.get("list")
         self.if_preheat = json_config.get("if_preheat", False)
         self.preheat_step = json_config.get("preheat_step", PytorchFreeBenchmarkConst.DEFAULT_PREHEAT_STEP)
         self.max_sample = json_config.get("max_sample", PytorchFreeBenchmarkConst.DEFAULT_PREHEAT_STEP)
@@ -146,6 +147,11 @@ class FreeBenchmarkCheckConfig(BaseConfig):
             logger.error_log_with_exp(
                 msg, MsprobeException(MsprobeException.INVALID_PARAM_ERROR, msg)
             )
+        if self.fuzz_stage == Const.BACKWARD and not self.list:
+            raise MsprobeException(
+                MsprobeException.INVALID_PARAM_ERROR,
+                f"When fuzz_stage is set to {Const.BACKWARD}, the parameters list must not be empty."
+            )
 
     def _check_fuzz_level(self):
         if self.fuzz_level not in PytorchFreeBenchmarkConst.FUZZ_LEVEL_LIST:
```
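The added check ties `fuzz_stage == backward` to a non-empty `list`. A standalone sketch of the rule follows; the plain dict and `ValueError` are illustrative only, whereas msprobe raises `MsprobeException` as shown in the hunk.

```python
def check_backward_needs_list(json_config):
    # Mirrors the added validation: backward fuzzing requires an explicit api list.
    fuzz_stage = json_config.get("fuzz_stage")
    api_list = json_config.get("list")
    if fuzz_stage == "backward" and not api_list:
        raise ValueError('When fuzz_stage is "backward", "list" must not be empty.')


check_backward_needs_list({"fuzz_stage": "backward", "list": ["torch.matmul"]})  # passes
# check_backward_needs_list({"fuzz_stage": "backward"})                          # would raise
```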
msprobe/visualization/builder/graph_builder.py
CHANGED

```diff
@@ -74,6 +74,7 @@ class GraphBuilder:
         config.graph_b.data_source = GraphConst.JSON_BENCH_KEY
         config.graph_b.step = config.step
         config.graph_b.rank = config.rank
+        config.graph_b.compare_mode = config.compare_mode
         node_to_db(config.graph_b, filename)
         config_to_db(config, filename)
 
@@ -297,8 +298,8 @@ class GraphBuilder:
         no_recompute_map = GraphBuilder._get_no_recompute_map(graph, id_prefixes)
         if not no_recompute_map:
             return
-        #
-        no_recompute_ids_b =
+        # Copy the dict of non-recompute nodes for backward mode
+        no_recompute_ids_b = {node_id: list(node_list) for node_id, node_list in no_recompute_map.items()}
 
         del_indexes = []
         for node_id, id_prefix in recompute_map.items():
```
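The rewritten copy builds `no_recompute_ids_b` with a fresh `list()` per value instead of sharing the inner lists. The snippet below (with made-up data) shows why a plain `dict()` copy would not be enough.

```python
original = {"Module.layer1": ["api_a", "api_b"]}

shared = dict(original)                                  # shallow copy: inner list is shared
independent = {k: list(v) for k, v in original.items()}  # per-value copy, as in the diff

shared["Module.layer1"].append("api_c")
assert original["Module.layer1"] == ["api_a", "api_b", "api_c"]  # mutation leaked back
assert independent["Module.layer1"] == ["api_a", "api_b"]        # stays isolated
```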
msprobe/visualization/builder/graph_merger.py
CHANGED

```diff
@@ -146,6 +146,7 @@ class BaseGraphMerger:
                 GraphConst.APIS_BETWEEN_MODULES_ALL_RANKS,
                 id_accumulation=True)
         all_collection_node = main_graph_result.graph.get_node(all_collection_node_id)
+        all_collection_node.upnode = main_graph_result.graph.root
         new_main_root_sub_nodes.append(all_collection_node)
         # Apis_Between_Modules.0 --> Apis_Between_Modules_Rank0.0
         origin_main_node_id = main_node.id
@@ -377,6 +378,12 @@ class PPMerger(BaseGraphMerger):
             logger.info('Unable to get pp groups based on Distributed Api (batch_isend_irecv, send, or isend), '
                         'generate pp groups using parallel param "rank_size", "tp" and "pp".')
             _, pp_groups = self.get_default_groups()
+        elif len(pp_groups[0]) != self.parallel_param.pp:
+            logger.warning(f'Based on Distributed Api (atch_isend_irecv, send, or isend), '
+                           f'the resulting pp groups={pp_groups}, '
+                           f'its length is not equal to the parallel param "pp"({self.parallel_param.pp}) you defined, '
+                           f'generate pp groups using parallel param "rank_size", "tp" and "pp".')
+            _, pp_groups = self.get_default_groups()
         logger.info(f'{self.log_prefix} All pp groups is {pp_groups}.')
         return pp_groups
 
@@ -657,6 +664,12 @@ class TPMerger(BaseGraphMerger):
             logger.info('Unable to get tp groups based on Distributed Api (reduce_scatter or all_reduce), '
                         'generate tp groups using parallel param "rank_size", "tp" and "pp".')
             tp_groups, _ = self.get_default_groups()
+        elif len(tp_groups[0]) != self.parallel_param.tp:
+            logger.warning(f'Based on Distributed Api (reduce_scatter or all_reduce), '
+                           f'the resulting tp groups={tp_groups}, '
+                           f'its length is not equal to the parallel param "tp"({self.parallel_param.tp}) you defined, '
+                           f'generate tp groups using parallel param "rank_size", "tp" and "pp".')
+            tp_groups, _ = self.get_default_groups()
         logger.info(f'{self.log_prefix} All tp groups is {tp_groups}.')
         return tp_groups
 
```
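Both mergers now fall back to `get_default_groups()` when the group size recovered from the Distributed APIs does not match the user-supplied `pp` or `tp`. `get_default_groups` itself is not shown in this diff; the sketch below only illustrates one plausible way such default groups could be derived from `rank_size`, `tp` and `pp`, so that the first group's length matches the checks above.

```python
def default_groups(rank_size, tp, pp):
    # Hypothetical derivation: contiguous tp groups, strided pp groups.
    ranks = list(range(rank_size))
    tp_groups = [ranks[i:i + tp] for i in range(0, rank_size, tp)]
    stride = rank_size // pp
    pp_groups = [ranks[i::stride] for i in range(stride)]
    return tp_groups, pp_groups


tp_groups, pp_groups = default_groups(rank_size=8, tp=2, pp=2)
print(tp_groups)  # [[0, 1], [2, 3], [4, 5], [6, 7]] -> len(tp_groups[0]) == tp
print(pp_groups)  # [[0, 4], [1, 5], [2, 6], [3, 7]] -> len(pp_groups[0]) == pp
```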
msprobe/visualization/graph/graph.py
CHANGED

```diff
@@ -126,21 +126,25 @@ class Graph:
 
     def get_sorted_nodes(self):
         """
-        Traverse the graph depth-first to obtain the sorted nodes
+        Traverse the graph depth-first to obtain the sorted node list; implemented with an explicit stack to avoid exceeding the recursion depth
         """
         visited = set()
         order = []
+        stack = [(self.root, False)]
 
-
-
+        while stack:
+            node, processed = stack.pop()
             if node.id in visited:
-
-
-
-
-
+                continue
+            if processed:
+                visited.add(node.id)
+                order.append(node)
+            else:
+                stack.append((node, True))
+                for sub_node in reversed(node.subnodes):
+                    if sub_node.id not in visited:
+                        stack.append((sub_node, False))
 
-        visit(self.root)
         return order
 
     def add_node(self, node_op, node_id, up_node=None, id_accumulation=False):
```
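`get_sorted_nodes` now uses an explicit stack instead of a recursive helper, so very deep graphs no longer hit Python's recursion limit. A self-contained version of the same post-order traversal is sketched below; the `Node` class is a minimal hypothetical stand-in for msprobe's graph nodes.

```python
class Node:
    def __init__(self, node_id, subnodes=None):
        self.id = node_id
        self.subnodes = subnodes or []


def sorted_nodes(root):
    visited, order, stack = set(), [], [(root, False)]
    while stack:
        node, processed = stack.pop()
        if node.id in visited:
            continue
        if processed:                            # children already handled: emit the node
            visited.add(node.id)
            order.append(node)
        else:
            stack.append((node, True))           # revisit after the children
            for sub in reversed(node.subnodes):  # reversed keeps left-to-right order
                if sub.id not in visited:
                    stack.append((sub, False))
    return [n.id for n in order]


root = Node("root", [Node("a", [Node("a.0")]), Node("b")])
print(sorted_nodes(root))  # ['a.0', 'a', 'b', 'root']
```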
msprobe/visualization/utils.py
CHANGED

```diff
@@ -152,7 +152,8 @@ def load_parallel_param(input_param):
 
 
 def validate_parallel_param(parallel_param, dump_path, log_prefix='[NPU]'):
-
+    pattern = re.compile(r'^[a-z\-]+$')
+    params = [parallel_param.tp, parallel_param.pp, parallel_param.rank_size, parallel_param.vpp]
     ranks = check_and_return_dir_contents(dump_path, Const.RANK)
     if len(ranks) != parallel_param.rank_size:
         logger.error(f'{log_prefix} The parallel param "rank_size" error, '
@@ -161,6 +162,12 @@ def validate_parallel_param(parallel_param, dump_path, log_prefix='[NPU]'):
     if any(x is None for x in params):
         logger.error(f'{log_prefix} The parallel params "tp/pp/rank_size" must not be null!')
         raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR)
+    if any(isinstance(x, bool) for x in params):
+        logger.error(f'{log_prefix} The parallel params "tp/pp/vpp/rank_size" must not be bool!')
+        raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR)
+    if any(not isinstance(x, int) for x in params):
+        logger.error(f'{log_prefix} The parallel params "tp/pp/vpp/rank_size" must be int!')
+        raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR)
     if any(x <= 0 for x in params):
         logger.error(f'{log_prefix} The parallel params "tp/pp/vpp/rank_size" must be greater than 0!')
         raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR)
@@ -185,6 +192,9 @@ def validate_parallel_param(parallel_param, dump_path, log_prefix='[NPU]'):
     if not isinstance(parallel_param.order, str):
         logger.error(f'{log_prefix} The parallel params "order" must be of string type!')
         raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR)
+    if not pattern.match(parallel_param.order):
+        logger.error(f'{log_prefix} The parallel params "order" must consist only of lowercase letters and "-"!')
+        raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR)
 
 
 class ParallelParam:
```
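The new type checks reject booleans before checking for `int`. That ordering matters because `bool` is a subclass of `int` in Python, so a bare `isinstance(x, int)` test would happily accept `True`:

```python
params = {"tp": 2, "pp": True, "rank_size": 8, "vpp": 1}

assert isinstance(True, int)  # the pitfall: bool passes the int check

bools = [k for k, v in params.items() if isinstance(v, bool)]
non_ints = [k for k, v in params.items() if not isinstance(v, int)]
print(bools, non_ints)  # ['pp'] [] -- only the explicit bool check catches the bad value
```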