mindstudio-probe 1.2.1__py3-none-any.whl → 1.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. {mindstudio_probe-1.2.1.dist-info → mindstudio_probe-1.2.2.dist-info}/METADATA +1 -1
  2. {mindstudio_probe-1.2.1.dist-info → mindstudio_probe-1.2.2.dist-info}/RECORD +85 -66
  3. msprobe/README.md +2 -2
  4. msprobe/core/common/const.py +34 -9
  5. msprobe/core/common/inplace_ops.yaml +1 -0
  6. msprobe/core/common/utils.py +14 -0
  7. msprobe/core/compare/layer_mapping/data_scope_parser.py +1 -1
  8. msprobe/core/compare/merge_result/merge_result.py +8 -7
  9. msprobe/core/compare/merge_result/utils.py +81 -0
  10. msprobe/core/compare/utils.py +10 -0
  11. msprobe/core/data_dump/data_collector.py +58 -13
  12. msprobe/core/data_dump/data_processor/base.py +92 -8
  13. msprobe/core/data_dump/data_processor/factory.py +3 -0
  14. msprobe/core/data_dump/data_processor/mindspore_processor.py +17 -4
  15. msprobe/core/data_dump/data_processor/pytorch_processor.py +58 -7
  16. msprobe/core/data_dump/json_writer.py +26 -8
  17. msprobe/docs/01.installation.md +25 -0
  18. msprobe/docs/02.config_introduction.md +14 -12
  19. msprobe/docs/03.config_examples.md +24 -0
  20. msprobe/docs/05.data_dump_PyTorch.md +34 -15
  21. msprobe/docs/06.data_dump_MindSpore.md +45 -22
  22. msprobe/docs/09.accuracy_checker_MindSpore.md +4 -2
  23. msprobe/docs/19.monitor.md +257 -260
  24. msprobe/docs/21.visualization_PyTorch.md +10 -0
  25. msprobe/docs/22.visualization_MindSpore.md +11 -0
  26. msprobe/docs/27.dump_json_instruction.md +24 -20
  27. msprobe/docs/28.debugger_save_instruction.md +94 -0
  28. msprobe/docs/28.kernel_dump_MindSpore.md +69 -0
  29. msprobe/docs/img/monitor/step_count_per_record.png +0 -0
  30. msprobe/mindspore/__init__.py +1 -0
  31. msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +26 -6
  32. msprobe/mindspore/api_accuracy_checker/api_runner.py +54 -16
  33. msprobe/mindspore/api_accuracy_checker/compute_element.py +47 -1
  34. msprobe/mindspore/api_accuracy_checker/torch_mindtorch_importer.py +129 -0
  35. msprobe/mindspore/api_accuracy_checker/type_mapping.py +24 -1
  36. msprobe/mindspore/api_accuracy_checker/utils.py +6 -1
  37. msprobe/mindspore/common/utils.py +20 -2
  38. msprobe/mindspore/debugger/debugger_config.py +25 -2
  39. msprobe/mindspore/debugger/precision_debugger.py +25 -6
  40. msprobe/mindspore/dump/hook_cell/api_registry.py +2 -0
  41. msprobe/mindspore/dump/jit_dump.py +7 -6
  42. msprobe/mindspore/monitor/anomaly_detect.py +404 -0
  43. msprobe/mindspore/monitor/distributed/__init__.py +0 -0
  44. msprobe/mindspore/monitor/distributed/distributed_ops.yaml +15 -0
  45. msprobe/mindspore/monitor/distributed/stack_blacklist.yaml +5 -0
  46. msprobe/mindspore/monitor/distributed/wrap_distributed.py +300 -0
  47. msprobe/mindspore/monitor/features.py +63 -0
  48. msprobe/mindspore/monitor/module_hook.py +821 -0
  49. msprobe/mindspore/monitor/module_spec_verifier.py +94 -0
  50. msprobe/mindspore/monitor/utils.py +267 -0
  51. msprobe/mindspore/ms_config.py +8 -2
  52. msprobe/mindspore/service.py +95 -21
  53. msprobe/pytorch/__init__.py +0 -1
  54. msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +1 -1
  55. msprobe/pytorch/bench_functions/apply_adam.py +215 -0
  56. msprobe/pytorch/bench_functions/group_norm_silu.py +27 -0
  57. msprobe/pytorch/bench_functions/mish.py +21 -0
  58. msprobe/pytorch/bench_functions/moe_gating_top_k_softmax.py +44 -0
  59. msprobe/pytorch/bench_functions/sort_v2.py +21 -0
  60. msprobe/pytorch/common/utils.py +71 -0
  61. msprobe/pytorch/debugger/debugger_config.py +19 -9
  62. msprobe/pytorch/debugger/precision_debugger.py +14 -0
  63. msprobe/pytorch/dump/module_dump/module_processer.py +10 -30
  64. msprobe/pytorch/function_factory.py +7 -1
  65. msprobe/pytorch/hook_module/support_wrap_ops.yaml +2 -1
  66. msprobe/pytorch/hook_module/wrap_distributed.py +4 -0
  67. msprobe/pytorch/monitor/anomaly_detect.py +14 -29
  68. msprobe/pytorch/monitor/csv2tb.py +10 -12
  69. msprobe/pytorch/monitor/module_hook.py +123 -104
  70. msprobe/pytorch/monitor/module_metric.py +6 -6
  71. msprobe/pytorch/monitor/optimizer_collect.py +45 -63
  72. msprobe/pytorch/monitor/utils.py +8 -43
  73. msprobe/pytorch/pt_config.py +19 -22
  74. msprobe/pytorch/service.py +103 -24
  75. msprobe/visualization/builder/graph_builder.py +31 -5
  76. msprobe/visualization/builder/msprobe_adapter.py +7 -5
  77. msprobe/visualization/graph/base_node.py +3 -2
  78. msprobe/visualization/graph/distributed_analyzer.py +80 -3
  79. msprobe/visualization/graph/node_op.py +4 -2
  80. msprobe/visualization/graph_service.py +3 -4
  81. msprobe/visualization/utils.py +10 -2
  82. {mindstudio_probe-1.2.1.dist-info → mindstudio_probe-1.2.2.dist-info}/LICENSE +0 -0
  83. {mindstudio_probe-1.2.1.dist-info → mindstudio_probe-1.2.2.dist-info}/WHEEL +0 -0
  84. {mindstudio_probe-1.2.1.dist-info → mindstudio_probe-1.2.2.dist-info}/entry_points.txt +0 -0
  85. {mindstudio_probe-1.2.1.dist-info → mindstudio_probe-1.2.2.dist-info}/top_level.txt +0 -0
@@ -170,6 +170,16 @@ def gen_op_item(op_data, op_name):
     elif op_item.get('type') == 'slice':
         op_item['dtype'] = op_data.get('type')
         op_item['shape'] = str(np.shape(np.array(op_data.get('value'))))
+    elif op_item.get('type') == 'ellipsis':
+        op_item['dtype'] = op_data.get('type')
+        op_item['shape'] = '[]'
+        for i in params:
+            op_item[i] = op_data.get('value')
+    elif op_item.get('type') == 'torch.ProcessGroup':
+        op_item['dtype'] = op_data.get('type')
+        op_item['shape'] = '[]'
+        for i in params:
+            op_item[i] = str(op_data.get('group_ranks'))
     else:
         op_item['dtype'] = str(type(op_data.get('value')))
         op_item['shape'] = '[]'
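The two new branches record `ellipsis` and `torch.ProcessGroup` arguments as comparable rows. A rough sketch of the resulting entry, assuming `params` holds the statistics columns (for example "Max", "Min", "Mean", "Norm"; that list is not shown in this hunk):

    # Illustrative sketch only, not package code; `params` is an assumption.
    params = ("Max", "Min", "Mean", "Norm")
    op_data = {"type": "torch.ProcessGroup", "group_ranks": [0, 1, 2, 3]}
    op_item = {"type": op_data.get("type"), "dtype": op_data.get("type"), "shape": "[]"}
    for i in params:
        op_item[i] = str(op_data.get("group_ranks"))
    # op_item == {'type': 'torch.ProcessGroup', 'dtype': 'torch.ProcessGroup', 'shape': '[]',
    #             'Max': '[0, 1, 2, 3]', 'Min': '[0, 1, 2, 3]',
    #             'Mean': '[0, 1, 2, 3]', 'Norm': '[0, 1, 2, 3]'}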
@@ -1,4 +1,4 @@
-# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
+# Copyright (c) 2024-2025, Huawei Technologies Co., Ltd.
 # All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -40,6 +40,7 @@ class DataCollector:
         self.scope = ScopeFactory(self.config).build_scope()
         self.backward_module_names = {}
         self.optimizer_status = ""
+        self.optimizer_status_first_start = {Const.OPTIMIZER: True, Const.CLIP_GRAD: True}
         atexit.register(self.write_json)

     @property
@@ -54,6 +55,17 @@ class DataCollector:
     def check_scope_and_pid(scope, name, pid):
         return (not scope or scope.check(name)) and pid == os.getpid()

+    @staticmethod
+    def set_is_recomputable(data_info, is_recompute):
+        if data_info and len(data_info) == 1 and is_recompute is not None:  # normally data_info holds exactly one entry
+            data_info[list(data_info.keys())[0]]["is_recompute"] = is_recompute
+
+    def reset_status(self):
+        self.optimizer_status = ""
+        self.optimizer_status_first_start = {Const.OPTIMIZER: True, Const.CLIP_GRAD: True}
+        self.data_writer.reset_cache()
+        self.backward_module_names.clear()
+
     def if_return_forward_new_output(self):
         return self.data_processor.if_return_forward_new_output()

@@ -77,7 +89,7 @@ class DataCollector:
         logger.debug(msg)
         self.data_writer.update_data(data_info)

-    def forward_input_data_collect(self, name, module, pid, module_input_output):
+    def forward_input_data_collect(self, name, module, pid, module_input_output, is_recompute=None):
         if self.config.task == Const.FREE_BENCHMARK:
             backward_name = name.replace(Const.FORWARD, Const.BACKWARD)
             if self.check_scope_and_pid(self.scope, backward_name, pid):
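The `is_recompute` flag added to these signatures is applied by `set_is_recomputable` from the previous hunk. A minimal standalone sketch of its effect (the key name is illustrative):

    # Standalone sketch mirroring set_is_recomputable, not package code.
    data_info = {"Module.layer1.forward.0": {"input_args": [], "output": []}}
    is_recompute = True
    if data_info and len(data_info) == 1 and is_recompute is not None:
        data_info[list(data_info.keys())[0]]["is_recompute"] = is_recompute
    # data_info["Module.layer1.forward.0"]["is_recompute"] is now True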
@@ -87,37 +99,48 @@ class DataCollector:
         if not self.check_scope_and_pid(self.scope, name, pid):
             return

-        data_info = self.data_processor.analyze_forward_input(name, module, module_input_output)
+        data_info = {}
+        if self.config.task != Const.STRUCTURE:
+            data_info = self.data_processor.analyze_forward_input(name, module, module_input_output)
+            self.set_is_recomputable(data_info, is_recompute)
         if self.config.level == Const.LEVEL_L2:
             return
         self.handle_data(name, data_info, flush=self.data_processor.is_terminated)

-    def forward_output_data_collect(self, name, module, pid, module_input_output):
+    def forward_output_data_collect(self, name, module, pid, module_input_output, is_recompute=None):
         self.update_construct(name)
         if not self.check_scope_and_pid(self.scope, name, pid):
             return

-        data_info = self.data_processor.analyze_forward_output(name, module, module_input_output)
+        data_info = {}
+        if self.config.task != Const.STRUCTURE:
+            data_info = self.data_processor.analyze_forward_output(name, module, module_input_output)
+            self.set_is_recomputable(data_info, is_recompute)
         if self.config.level == Const.LEVEL_L2:
             return
         self.data_writer.update_stack(self.data_processor.analyze_api_call_stack(name))
         self.handle_data(name, data_info, flush=self.data_processor.is_terminated)

-    def forward_data_collect(self, name, module, pid, module_input_output):
+    def forward_data_collect(self, name, module, pid, module_input_output, is_recompute=None):
         self.update_construct(name)
         if not self.check_scope_and_pid(self.scope, name, pid):
             return

-        data_info = self.data_processor.analyze_forward(name, module, module_input_output)
+        data_info = {}
+        if self.config.task != Const.STRUCTURE:
+            data_info = self.data_processor.analyze_forward(name, module, module_input_output)
+            self.set_is_recomputable(data_info, is_recompute)
         self.data_writer.update_stack(self.data_processor.analyze_api_call_stack(name))
         self.handle_data(name, data_info, flush=self.data_processor.is_terminated)

-    def backward_data_collect(self, name, module, pid, module_input_output):
+    def backward_data_collect(self, name, module, pid, module_input_output, is_recompute=None):
         self.update_construct(name)
         if not self.check_scope_and_pid(self.scope, name, pid):
             return

-        data_info = self.data_processor.analyze_backward(name, module, module_input_output)
+        data_info = {}
+        if self.config.task != Const.STRUCTURE:
+            data_info = self.data_processor.analyze_backward(name, module, module_input_output)
         if self.config.level == Const.LEVEL_L2:
             return
         # get the name of the module that executes backward
@@ -127,25 +150,34 @@ class DataCollector:
             self.backward_module_names[module_name] = True
         self.handle_data(name, data_info, flush=self.data_processor.is_terminated)

-    def backward_input_data_collect(self, name, module, pid, module_input_output):
+    def backward_input_data_collect(self, name, module, pid, module_input_output, is_recompute=None):
         self.update_construct(name)
         if not self.check_scope_and_pid(self.scope, name, pid):
             return

-        data_info = self.data_processor.analyze_backward_input(name, module, module_input_output)
+        data_info = {}
+        if self.config.task != Const.STRUCTURE:
+            data_info = self.data_processor.analyze_backward_input(name, module, module_input_output)
+            self.set_is_recomputable(data_info, is_recompute)
         self.handle_data(name, data_info)

-    def backward_output_data_collect(self, name, module, pid, module_input_output):
+    def backward_output_data_collect(self, name, module, pid, module_input_output, is_recompute=None):
         self.update_construct(name)
         if not self.check_scope_and_pid(self.scope, name, pid):
             return

-        data_info = self.data_processor.analyze_backward_output(name, module, module_input_output)
+        data_info = {}
+        if self.config.task != Const.STRUCTURE:
+            data_info = self.data_processor.analyze_backward_output(name, module, module_input_output)
+            self.set_is_recomputable(data_info, is_recompute)
         self.handle_data(name, data_info)

     def update_construct(self, name):
         if self.config.level not in DataCollector.level_without_construct:
             if self.optimizer_status in [Const.OPTIMIZER, Const.CLIP_GRAD]:
+                if self.optimizer_status_first_start[self.optimizer_status]:
+                    self.data_writer.update_construct({self.optimizer_status: None})
+                    self.optimizer_status_first_start[self.optimizer_status] = False
                 self.data_writer.update_construct({name: self.optimizer_status})
             else:
                 self.data_writer.update_construct({name: self.module_processor.api_parent_node})
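With the first-start flag, the construct cache records one top-level entry per optimizer phase before the usual name-to-status entries. A rough sketch of the resulting mapping, assuming Const.OPTIMIZER is the string "optimizer" and using illustrative node names:

    # Illustrative only; the "optimizer" constant and node names are assumptions.
    cache_construct = {}
    cache_construct.update({"optimizer": None})                        # written once, on first start
    cache_construct.update({"Optimizer.step.forward.0": "optimizer"})
    cache_construct.update({"Optimizer.step.forward.1": "optimizer"})
    # {'optimizer': None, 'Optimizer.step.forward.0': 'optimizer', 'Optimizer.step.forward.1': 'optimizer'}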
@@ -183,3 +215,16 @@ class DataCollector:

     def fill_stack_tensor_data(self):
         self.data_writer.fill_stack_tensor_data()
+
+    def debug_data_collect_forward(self, variable, name_with_count):
+
+        data_info = self.data_processor.analyze_debug_forward(variable, name_with_count)
+        self.data_writer.update_debug({name_with_count: data_info})
+
+    def debug_data_collect_backward(self, variable, grad_name_with_count):
+        # prepare all None nested data structure
+        all_none_data_info = self.data_processor.analyze_element_to_all_none(variable)
+        self.data_writer.update_debug({grad_name_with_count: all_none_data_info})
+
+        # register tensor backward hook
+        self.data_processor.analyze_debug_backward(variable, grad_name_with_count, self.data_writer.cache_debug['data'])
@@ -17,6 +17,9 @@ import inspect
 import os
 from dataclasses import dataclass, is_dataclass
 from typing import Tuple, Dict, Optional, Any
+from functools import partial
+import copy
+from typing import Union

 import numpy as np

@@ -87,7 +90,7 @@ class TensorStatInfo:
 class BaseDataProcessor:
     _recursive_key_stack = []
     special_type = (
-        np.integer, np.floating, np.bool_, np.complexfloating, np.str_, np.byte, np.unicode_,
+        np.integer, np.floating, np.bool_, np.complexfloating, np.str_, np.byte, np.unicode_, np.ndarray,
         bool, int, float, str, slice,
         type(Ellipsis)
    )
@@ -143,6 +146,37 @@ class BaseDataProcessor:
         else:
             return data

+    @staticmethod
+    def set_value_into_nested_structure(data_structure, indexes, value):
+        '''
+        Args:
+            data_structure: nested data structure
+            indexes: List
+            value: value to be set
+        '''
+        if not indexes:
+            raise ValueError("set_value_into_nested_structure failed: "
+                             "indexes need to be non empty when set value to nested data structure")
+        current_level = data_structure
+        for i, index in enumerate(indexes):
+            valid_for_list = isinstance(current_level, list) and isinstance(index, int) and len(current_level) > index
+            valid_for_dict = isinstance(current_level, dict) and index in current_level
+            is_last = i == len(indexes) - 1
+            if valid_for_dict or valid_for_list:
+                if is_last:
+                    try:
+                        current_level[index] = value
+                    except Exception as e:
+                        raise IndexError("set_value_into_nested_structure failed: passed indexes wrong") from e
+                else:
+                    try:
+                        current_level = current_level[index]
+                    except Exception as e:
+                        raise IndexError("set_value_into_nested_structure failed: passed indexes wrong") from e
+            else:
+                raise ValueError("set_value_into_nested_structure failed: "
+                                 "invalid data_structure type or invalid index")
+
     @staticmethod
     def _convert_numpy_to_builtin(arg):
         type_mapping = {
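A small usage sketch of the new helper (the import path matches the one added in the factory diff later in this changeset; the nested structure and indexes are illustrative):

    from msprobe.core.data_dump.data_processor.base import BaseDataProcessor

    nested = {"output": [None, {"grad": None}]}
    BaseDataProcessor.set_value_into_nested_structure(nested, ["output", 1, "grad"], 3.14)
    # nested == {'output': [None, {'grad': 3.14}]}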
@@ -183,8 +217,22 @@ class BaseDataProcessor:
         return single_arg

     @staticmethod
-    def _analyze_numpy(value, numpy_type):
-        return {"type": numpy_type, "value": value}
+    def _analyze_numpy(ndarray, numpy_type):
+        ndarray_json = {}
+        ndarray_json.update({'type': 'numpy.ndarray'})
+        ndarray_json.update({'dtype': str(ndarray.dtype)})
+        ndarray_json.update({'shape': ndarray.shape})
+        if ndarray.size > 0:
+            ndarray_json.update({"Max": np.max(ndarray).item()})
+            ndarray_json.update({"Min": np.min(ndarray).item()})
+            ndarray_json.update({"Mean": np.mean(ndarray).item()})
+            ndarray_json.update({"Norm": np.linalg.norm(ndarray).item()})
+        else:
+            ndarray_json.update({"Max": None})
+            ndarray_json.update({"Min": None})
+            ndarray_json.update({"Mean": None})
+            ndarray_json.update({"Norm": None})
+        return ndarray_json

     @staticmethod
     def _get_allowed_data_mode(data_mode):
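`_analyze_numpy` now records statistics instead of a raw value. A quick check of what it produces for a small array (calling the static method directly, purely for illustration):

    import numpy as np
    from msprobe.core.data_dump.data_processor.base import BaseDataProcessor

    arr = np.array([[1.0, 2.0], [3.0, 4.0]])
    print(BaseDataProcessor._analyze_numpy(arr, "numpy.ndarray"))
    # {'type': 'numpy.ndarray', 'dtype': 'float64', 'shape': (2, 2),
    #  'Max': 4.0, 'Min': 1.0, 'Mean': 2.5, 'Norm': 5.477225575051661}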
@@ -203,7 +251,7 @@ class BaseDataProcessor:
         return cls.special_type

     @classmethod
-    def recursive_apply_transform(cls, args, transform, depth=0):
+    def recursive_apply_transform(cls, args, transform, depth=0) -> Union[dict, list, None]:
         if depth > Const.MAX_DEPTH:
             logger.error(f"The maximum depth of recursive transform, {Const.MAX_DEPTH} is reached.")
             raise CompareException(CompareException.RECURSION_LIMIT_ERROR)
@@ -220,7 +268,7 @@ class BaseDataProcessor:
             return cls.apply_transform_dict(args_dict, transform, depth)
         elif isinstance(args, (list, tuple)):
             result_list = cls.apply_transform_list(args, transform, depth)
-            return type(args)(result_list)
+            return result_list
         elif isinstance(args, dict):
             return cls.apply_transform_dict(args, transform, depth)
         elif args is not None:
@@ -228,12 +276,12 @@ class BaseDataProcessor:
             return None
         else:
             return None
-
+
     @classmethod
     def apply_transform_dict(cls, args, transform, depth):
         result_dict = {}
         for k, arg in args.items():
-            cls._recursive_key_stack.append(str(k))
+            cls._recursive_key_stack.append(k)
             result_dict[k] = cls.recursive_apply_transform(arg, transform, depth=depth + 1)
             cls._recursive_key_stack.pop()
         return result_dict
@@ -242,11 +290,21 @@ class BaseDataProcessor:
     def apply_transform_list(cls, args, transform, depth):
         result_list = []
         for i, arg in enumerate(args):
-            cls._recursive_key_stack.append(str(i))
+            cls._recursive_key_stack.append(i)
             result_list.append(cls.recursive_apply_transform(arg, transform, depth=depth + 1))
             cls._recursive_key_stack.pop()
         return result_list

+    @classmethod
+    def register_hook_single_element(cls, element, suffix_stack, hook_fn):
+        if cls.is_hookable_element(element):
+            indexes = copy.deepcopy(suffix_stack)
+            wrap_hook_fn = partial(hook_fn, indexes=indexes)
+
+            def real_hook_fn(grad):
+                return wrap_hook_fn(grad)
+            element.register_hook(real_hook_fn)
+
     def if_return_forward_new_output(self):
         return self._return_forward_new_output

@@ -383,3 +441,29 @@ class BaseDataProcessor:
                           suffix + file_format)
         file_path = os.path.join(self.data_writer.dump_tensor_data_dir, dump_data_name)
         return dump_data_name, file_path
+
+    def analyze_element_to_all_none(self, element):
+        return self.recursive_apply_transform(element, lambda element, stack: None)
+
+    def analyze_debug_forward(self, variable, name_with_count):
+        self.current_api_or_module_name = name_with_count
+        self.api_data_category = Const.TENSOR
+        # these two attributes are used to construct tensor file name {name_with_count}.tensor.{indexes}.npy/pt
+        data_info = self.analyze_element(variable)
+        return data_info
+
+    def analyze_debug_backward(self, variable, grad_name_with_count, nested_data_structure):
+        def hook_fn(grad, indexes):
+            suffix = Const.SEP.join([str(index) for index in indexes])
+            self.save_name = grad_name_with_count + Const.SEP + Const.TENSOR + Const.SEP + suffix
+            grad_data_info = self.analyze_element(grad)
+            self.save_name = None
+            full_index = [grad_name_with_count] + indexes
+            try:
+                self.set_value_into_nested_structure(nested_data_structure, full_index, grad_data_info)
+            except (ValueError, IndexError) as e:
+                logger.warning(f"error occured while recording statistics of {grad_name_with_count} variable, "
+                               f"skip current recording, detailed infomation: {e}")
+            return grad
+        wrap_register_hook_single_element = partial(self.register_hook_single_element, hook_fn=hook_fn)
+        self.recursive_apply_transform(variable, wrap_register_hook_single_element)
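`register_hook_single_element` and `analyze_debug_backward` pair functools.partial with a captured index path so each leaf tensor's hook knows where to write its gradient statistics. A standalone sketch of that pattern (assumes PyTorch is available; the names and the simple Max/Min statistics are illustrative):

    import copy
    from functools import partial

    import torch

    nested = {"w": torch.ones(2, requires_grad=True)}
    stats = {"grad_x.0": {"w": None}}  # all-None mirror, filled in by the hook

    def hook_fn(grad, indexes):
        target = stats
        for idx in indexes[:-1]:
            target = target[idx]
        target[indexes[-1]] = {"Max": grad.max().item(), "Min": grad.min().item()}
        return grad

    wrap_hook_fn = partial(hook_fn, indexes=copy.deepcopy(["grad_x.0", "w"]))
    nested["w"].register_hook(lambda grad: wrap_hook_fn(grad))

    (nested["w"] * 3).sum().backward()
    # stats == {'grad_x.0': {'w': {'Max': 3.0, 'Min': 3.0}}}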
@@ -14,6 +14,7 @@
 # limitations under the License.

 from msprobe.core.common.const import Const
+from msprobe.core.data_dump.data_processor.base import BaseDataProcessor


 class DataProcessorFactory:
@@ -62,6 +63,7 @@ class DataProcessorFactory:
             cls.register_processor(Const.PT_FRAMEWORK, Const.OVERFLOW_CHECK, PytorchOverflowCheckDataProcessor)
             cls.register_processor(Const.PT_FRAMEWORK, Const.FREE_BENCHMARK, PytorchFreeBenchmarkDataProcessor)
             cls.register_processor(Const.PT_FRAMEWORK, Const.KERNEL_DUMP, PytorchKernelDumpDataProcessor)
+            cls.register_processor(Const.PT_FRAMEWORK, Const.STRUCTURE, BaseDataProcessor)
             cls.register_module_processor(Const.PT_FRAMEWORK, ModuleProcesser)
         elif framework == Const.MS_FRAMEWORK:
             from msprobe.core.data_dump.data_processor.mindspore_processor import (
@@ -75,4 +77,5 @@ class DataProcessorFactory:
             cls.register_processor(Const.MS_FRAMEWORK, Const.TENSOR, MindsporeTensorDataProcessor)
             cls.register_processor(Const.MS_FRAMEWORK, Const.OVERFLOW_CHECK, MindsporeOverflowCheckDataProcessor)
             cls.register_processor(Const.MS_FRAMEWORK, Const.KERNEL_DUMP, MindsporeKernelDumpDataProcessor)
+            cls.register_processor(Const.MS_FRAMEWORK, Const.STRUCTURE, BaseDataProcessor)
             cls.register_module_processor(Const.MS_FRAMEWORK, CellProcessor)
@@ -23,7 +23,7 @@ import numpy as np
 from msprobe.core.common.const import Const
 from msprobe.core.data_dump.data_processor.base import (BaseDataProcessor, TensorStatInfo,
                                                         ModuleForwardInputsOutputs, ModuleBackwardInputsOutputs)
-from msprobe.core.common.file_utils import path_len_exceeds_limit
+from msprobe.core.common.file_utils import path_len_exceeds_limit, save_npy
 from msprobe.mindspore.common.utils import convert_bf16_to_fp32, save_tensor_as_npy
 from msprobe.mindspore.common.log import logger
 from msprobe.mindspore.dump.hook_cell.api_registry import api_register
@@ -116,6 +116,10 @@ class MindsporeDataProcessor(BaseDataProcessor):
         api_register.norm_inner_op_set_hook_func()
         return tensor_stat

+    @staticmethod
+    def is_hookable_element(element):
+        return hasattr(element, "register_hook") and callable(element.register_hook)
+
     @classmethod
     def get_special_types(cls):
         return super().get_special_types() + cls.mindspore_special_type
@@ -136,11 +140,13 @@ class MindsporeDataProcessor(BaseDataProcessor):

         converted_numpy, numpy_type = self._convert_numpy_to_builtin(element)
         if converted_numpy is not element:
-            return self._analyze_numpy(converted_numpy, numpy_type)
+            return {"type": numpy_type, "value": converted_numpy}
         if isinstance(element, Number):
             return self.analyze_dtype_in_kwargs(element)
         if isinstance(element, ms.Tensor):
-            return self._analyze_tensor(element, Const.SEP.join(suffix_stack))
+            return self._analyze_tensor(element, Const.SEP.join([str(suffix) for suffix in suffix_stack]))
+        if isinstance(element, np.ndarray):
+            return self._analyze_numpy(element, Const.SEP.join([str(suffix) for suffix in suffix_stack]))
         if isinstance(element, (bool, int, float, str, slice, type(Ellipsis))):
             return self._analyze_builtin(element)
         return {}
@@ -185,6 +191,13 @@ class TensorDataProcessor(MindsporeDataProcessor):
         else:
             save_tensor_as_npy(tensor, file_path)
         return single_arg
+
+    def _analyze_numpy(self, ndarray, suffix):
+        dump_data_name, file_path = self.get_save_file_path(suffix)
+        save_npy(ndarray, file_path)
+        ndarray_json = super()._analyze_numpy(ndarray, suffix)
+        ndarray_json.update({"data_name": dump_data_name})
+        return ndarray_json


 class OverflowCheckDataProcessor(MindsporeDataProcessor):
@@ -231,7 +244,7 @@ class OverflowCheckDataProcessor(MindsporeDataProcessor):
         api_info_struct = super().analyze_backward(name, module, module_input_output)
         self.maybe_save_overflow_data()
         return api_info_struct if self.has_overflow else None
-
+
     def analyze_params(self, name, param_name, grad):
         self.has_overflow = False
         api_info_struct = super().analyze_params(name, param_name, grad)
@@ -1,4 +1,4 @@
-# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
+# Copyright (c) 2024-2025, Huawei Technologies Co., Ltd.
 # All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -21,6 +21,7 @@ from typing import List
 import numpy as np
 import torch
 from torch import distributed as dist
+from torch.distributed.distributed_c10d import _get_default_group

 from msprobe.core.common.const import Const
 from msprobe.core.common.file_utils import path_len_exceeds_limit
@@ -40,7 +41,16 @@ except ImportError:


 class PytorchDataProcessor(BaseDataProcessor):
-    pytorch_special_type = (torch.device, torch.dtype, torch.Size, torch.Tensor, torch.memory_format, dist.ProcessGroup)
+    pytorch_special_type = (
+        torch.device,
+        torch.dtype,
+        torch.Size,
+        torch.Tensor,
+        torch.memory_format,
+        dist.ProcessGroup,
+        dist.P2POp,
+        dist.ReduceOp
+    )
     memory_format = {
         torch.contiguous_format: "contiguous_format",
         torch.channels_last: "channels_last",
@@ -168,6 +178,11 @@ class PytorchDataProcessor(BaseDataProcessor):
     def is_distributed_op(module):
         return getattr(module, "op_is_distributed", False)

+    @staticmethod
+    def is_hookable_element(element):
+        return (hasattr(element, "register_hook") and callable(element.register_hook)) and \
+               (hasattr(element, "requires_grad") and element.requires_grad)
+
     @staticmethod
     def _analyze_torch_size(arg):
         return {"type": "torch.Size", "value": list(arg)}
@@ -176,7 +191,6 @@ class PytorchDataProcessor(BaseDataProcessor):
     def _analyze_memory_format(arg):
         # get the memory format
         format_type = PytorchDataProcessor.memory_format.get(arg)
-
         return {"type": "torch.memory_format", "format": format_type}

     @staticmethod
@@ -188,9 +202,18 @@ class PytorchDataProcessor(BaseDataProcessor):
             group_id = PytorchDataProcessor.process_group_hash(arg)
             group_info.update({"group_id": group_id})
         except Exception as e:
-            logger.warning(f"Failed to get process group(id: {group_id}) ranks info with error info: {e}.")
+            logger.warning(f"Failed to get process group ranks info with error info: {e}.")
         return group_info

+    @staticmethod
+    def _analyze_reduce_op(arg):
+        op_type = None
+        try:
+            op_type = str(arg)
+        except Exception as e:
+            logger.warning(f"Failed to get value of torch.distributed.ReduceOp with error info: {e}.")
+        return {"type": "torch.distributed.ReduceOp", "value": op_type}
+
     @classmethod
     def get_special_types(cls):
         return super().get_special_types() + cls.pytorch_special_type
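The new `_analyze_reduce_op` helper records `str(arg)` for a `torch.distributed.ReduceOp`; the exact string depends on the installed torch version. A minimal sketch of the entry it produces:

    import torch.distributed as dist

    # Sketch only: the recorded value is whatever str() yields for the op.
    entry = {"type": "torch.distributed.ReduceOp", "value": str(dist.ReduceOp.SUM)}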
@@ -204,11 +227,17 @@ class PytorchDataProcessor(BaseDataProcessor):
             return self._analyze_memory_format(element)
         if isinstance(element, dist.ProcessGroup):
             return self._analyze_process_group(element)
+        if isinstance(element, dist.P2POp):
+            return self._analyze_p2pop(element)
+        if isinstance(element, dist.ReduceOp):
+            return self._analyze_reduce_op(element)
         converted_numpy, numpy_type = self._convert_numpy_to_builtin(element)
         if converted_numpy is not element:
-            return self._analyze_numpy(converted_numpy, numpy_type)
+            return {"type": numpy_type, "value": converted_numpy}
         if isinstance(element, torch.Tensor):
-            return self._analyze_tensor(element, Const.SEP.join(suffix_stack))
+            return self._analyze_tensor(element, Const.SEP.join([str(suffix) for suffix in suffix_stack]))
+        if isinstance(element, np.ndarray):
+            return self._analyze_numpy(element, Const.SEP.join([str(suffix) for suffix in suffix_stack]))
         if isinstance(element, (bool, int, float, str, slice, type(Ellipsis))):
             return self._analyze_builtin(element)
         return {}
@@ -218,6 +247,21 @@ class PytorchDataProcessor(BaseDataProcessor):
         module_input_output.update_output_with_args_and_kwargs()
         return super().analyze_forward_output(name, module, module_input_output)

+    def _analyze_p2pop(self, arg):
+        p2pop_info = {"class_type": "torch.distributed.P2POp"}
+        try:
+            tensor_info = self._analyze_tensor(arg.tensor, [])
+            p2pop_info.update({"tensor": tensor_info})
+            p2pop_info.update({"op": arg.op.__name__})
+            p2pop_info.update({"peer": arg.peer})
+            p2pop_info.update({"tag": arg.tag})
+            group_id = PytorchDataProcessor.process_group_hash(
+                arg.group) if arg.group else PytorchDataProcessor.process_group_hash(_get_default_group())
+            p2pop_info.update({"group_id": group_id})
+        except Exception as e:
+            logger.warning(f"Failed to parse the P2POp content with error info: {e}.")
+        return p2pop_info
+
     def _analyze_tensor(self, tensor, suffix):
         tensor_stat = self.get_stat_info(tensor, self.config.async_dump)
         tensor_json = {}
@@ -267,6 +311,13 @@ class TensorDataProcessor(PytorchDataProcessor):
             saved_tensor = tensor.clone().contiguous().detach()
             save_pt(saved_tensor, file_path)
         return single_arg
+
+    def _analyze_numpy(self, ndarray, suffix):
+        dump_data_name, file_path = self.get_save_file_path(suffix)
+        save_pt(torch.tensor(ndarray), file_path)
+        ndarray_json = super()._analyze_numpy(ndarray, suffix)
+        ndarray_json.update({"data_name": dump_data_name})
+        return ndarray_json


 class OverflowCheckDataProcessor(PytorchDataProcessor):
@@ -319,7 +370,7 @@ class OverflowCheckDataProcessor(PytorchDataProcessor):
         api_info_struct = super().analyze_backward(name, module, module_input_output)
         self.handle_overflow()
         return api_info_struct if self.has_overflow else None
-
+
     def analyze_params(self, name, param_name, grad):
         self.has_overflow = False
         self._is_support_inf_nan()
@@ -15,6 +15,7 @@

 import csv
 import os
+import copy
 import numpy as np

 from msprobe.core.common.const import Const, FileCheckConst
@@ -31,10 +32,12 @@ class DataWriter:
         self.construct_file_path = None
         self.free_benchmark_file_path = None
         self.dump_tensor_data_dir = None
+        self.debug_file_path = None
         self.flush_size = 1000
         self.cache_data = {}
         self.cache_stack = {}
         self.cache_construct = {}
+        self.cache_debug = {}

     @staticmethod
     def write_data_to_csv(result: list, result_header: tuple, file_path: str):
@@ -57,6 +60,13 @@ class DataWriter:
         self.cache_construct = {}

     def initialize_json_file(self, **kwargs):
+        if self.debug_file_path and not self.cache_debug:
+            # debug level case only create debug.json
+            debug_dict = copy.deepcopy(kwargs)
+            debug_dict.update({"dump_data_dir": self.dump_tensor_data_dir, Const.DATA: {}})
+            self.cache_debug = debug_dict
+            save_json(self.debug_file_path, self.cache_debug, indent=1)
+            return
         if not self.cache_data:
             kwargs.update({"dump_data_dir": self.dump_tensor_data_dir, Const.DATA: {}})
             self.cache_data = kwargs
@@ -66,13 +76,13 @@ class DataWriter:
         if not self.cache_construct:
             save_json(self.construct_file_path, self.cache_construct, indent=1)

-    def update_dump_paths(self, dump_file_path, stack_file_path, construct_file_path, dump_data_dir,
-                          free_benchmark_file_path):
-        self.dump_file_path = dump_file_path
-        self.stack_file_path = stack_file_path
-        self.construct_file_path = construct_file_path
-        self.dump_tensor_data_dir = dump_data_dir
-        self.free_benchmark_file_path = free_benchmark_file_path
+    def update_dump_paths(self, dump_path_aggregation):
+        self.dump_file_path = dump_path_aggregation.dump_file_path
+        self.stack_file_path = dump_path_aggregation.stack_file_path
+        self.construct_file_path = dump_path_aggregation.construct_file_path
+        self.dump_tensor_data_dir = dump_path_aggregation.dump_tensor_data_dir
+        self.free_benchmark_file_path = dump_path_aggregation.free_benchmark_file_path
+        self.debug_file_path = dump_path_aggregation.debug_file_path

     def flush_data_periodically(self):
         dump_data = self.cache_data.get(Const.DATA)
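`update_dump_paths` now takes a single aggregation object instead of five positional paths. The concrete class behind `dump_path_aggregation` is not part of this diff; a minimal stand-in with the attributes read above could look like this:

    # Hypothetical stand-in for illustration; the real aggregation class lives elsewhere in msprobe.
    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class DumpPathAggregation:
        dump_file_path: Optional[str] = None
        stack_file_path: Optional[str] = None
        construct_file_path: Optional[str] = None
        dump_tensor_data_dir: Optional[str] = None
        free_benchmark_file_path: Optional[str] = None
        debug_file_path: Optional[str] = None

    # writer.update_dump_paths(DumpPathAggregation(dump_file_path="./dump.json",
    #                                              debug_file_path="./debug.json"))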
@@ -100,6 +110,9 @@ class DataWriter:
     def update_construct(self, new_data):
         self.cache_construct.update(new_data)

+    def update_debug(self, new_data):
+        self.cache_debug['data'].update(new_data)
+
     def write_data_json(self, file_path):
         logger.info(f"dump.json is at {os.path.dirname(os.path.dirname(file_path))}. ")
         save_json(file_path, self.cache_data, indent=1)
@@ -110,6 +123,9 @@ class DataWriter:
     def write_construct_info_json(self, file_path):
         save_json(file_path, self.cache_construct, indent=1)

+    def write_debug_info_json(self, file_path):
+        save_json(file_path, self.cache_debug, indent=1)
+
     def write_json(self):
         if self.cache_data:
             self.write_data_json(self.dump_file_path)
@@ -117,6 +133,8 @@ class DataWriter:
             self.write_stack_info_json(self.stack_file_path)
         if self.cache_construct:
             self.write_construct_info_json(self.construct_file_path)
+        if self.cache_debug:
+            self.write_debug_info_json(self.debug_file_path)

     def fill_stack_tensor_data(self):
         self.process_stat_data_recursive(self.cache_data)
@@ -135,7 +153,7 @@ class DataWriter:
                 if hasattr(tensor_stat_data, "device") and tensor_stat_data.device != Const.CPU_LOWERCASE:
                     tensor_stat_data = tensor_stat_data.cpu()
                 for index, stat in zip(tensor_stat_index, tensor_stat_data):
-                    data.update({index, stat.item()})
+                    data.update({index: stat.item()})
                 del data["tensor_stat"]
             else:
                 for key in data.keys():
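The one-character change above fixes a real bug: `{index, stat.item()}` builds a set, and `dict.update()` with a set of two scalars raises instead of assigning the statistic. For example:

    data = {}
    index, stat = "Max", 3.0
    # data.update({index, stat})   # set literal: raises, nothing is assigned
    data.update({index: stat})     # dict literal: data == {'Max': 3.0}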