PyPI - mindstudio-probe - Versions diffs - 1.3.0__py3-none-any.whl → 8.1.1__py3-none-any.whl - Mend

mindstudio-probe 1.3.0py3-none-any.whl → 8.1.1py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (213) hide show

{mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/METADATA +4 -2
{mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/RECORD +204 -152
msprobe/README.md +32 -1
msprobe/core/__init__.py +17 -0
msprobe/core/common/const.py +120 -21
msprobe/core/common/exceptions.py +2 -2
msprobe/core/common/file_utils.py +279 -50
msprobe/core/common/framework_adapter.py +169 -0
msprobe/core/common/global_lock.py +86 -0
msprobe/core/common/runtime.py +25 -0
msprobe/core/common/utils.py +136 -45
msprobe/core/common_config.py +7 -0
msprobe/core/compare/acc_compare.py +646 -428
msprobe/core/compare/check.py +36 -103
msprobe/core/compare/compare_cli.py +4 -0
msprobe/core/compare/config.py +72 -0
msprobe/core/compare/highlight.py +215 -215
msprobe/core/compare/layer_mapping/layer_mapping.py +2 -0
msprobe/core/compare/merge_result/merge_result.py +4 -4
msprobe/core/compare/multiprocessing_compute.py +223 -110
msprobe/core/compare/npy_compare.py +2 -4
msprobe/core/compare/utils.py +214 -244
msprobe/core/config_check/__init__.py +17 -0
msprobe/{pytorch/dump/kernel_dump/kernel_config.py → core/config_check/checkers/__init__.py} +8 -16
msprobe/core/config_check/checkers/base_checker.py +60 -0
msprobe/core/config_check/checkers/dataset_checker.py +138 -0
msprobe/core/config_check/checkers/env_args_checker.py +96 -0
msprobe/core/config_check/checkers/hyperparameter_checker.py +170 -0
msprobe/core/config_check/checkers/pip_checker.py +90 -0
msprobe/core/config_check/checkers/random_checker.py +367 -0
msprobe/core/config_check/checkers/weights_checker.py +147 -0
msprobe/core/config_check/ckpt_compare/ckpt_comparator.py +74 -0
msprobe/core/config_check/ckpt_compare/megatron_loader.py +302 -0
msprobe/core/config_check/ckpt_compare/metrics.py +83 -0
msprobe/core/config_check/ckpt_compare/name_mapping.yaml +12 -0
msprobe/core/config_check/config_check_cli.py +51 -0
msprobe/core/config_check/config_checker.py +100 -0
msprobe/{mindspore/runtime.py → core/config_check/resource/dependency.yaml} +7 -4
msprobe/core/config_check/resource/env.yaml +57 -0
msprobe/core/config_check/resource/hyperparameter.yaml +21 -0
msprobe/core/config_check/utils/hyperparameter_parser.py +115 -0
msprobe/core/config_check/utils/utils.py +107 -0
msprobe/core/data_dump/api_registry.py +67 -4
msprobe/core/data_dump/data_collector.py +170 -89
msprobe/core/data_dump/data_processor/base.py +72 -51
msprobe/core/data_dump/data_processor/mindspore_processor.py +109 -55
msprobe/core/data_dump/data_processor/pytorch_processor.py +90 -82
msprobe/core/data_dump/json_writer.py +143 -27
msprobe/core/debugger/precision_debugger.py +144 -0
msprobe/core/grad_probe/constant.py +1 -1
msprobe/core/grad_probe/grad_compare.py +1 -1
msprobe/core/grad_probe/utils.py +1 -1
msprobe/core/hook_manager.py +242 -0
msprobe/core/monitor/anomaly_processor.py +384 -0
msprobe/core/service.py +357 -0
msprobe/core/single_save/__init__.py +0 -0
msprobe/core/single_save/single_comparator.py +243 -0
msprobe/core/single_save/single_saver.py +146 -0
msprobe/docs/01.installation.md +6 -5
msprobe/docs/02.config_introduction.md +79 -22
msprobe/docs/03.config_examples.md +1 -0
msprobe/docs/04.kernel_dump_PyTorch.md +1 -1
msprobe/docs/05.data_dump_PyTorch.md +118 -49
msprobe/docs/06.data_dump_MindSpore.md +167 -20
msprobe/docs/07.accuracy_checker_PyTorch.md +2 -2
msprobe/docs/08.accuracy_checker_online_PyTorch.md +69 -9
msprobe/docs/09.accuracy_checker_MindSpore.md +18 -6
msprobe/docs/10.accuracy_compare_PyTorch.md +212 -74
msprobe/docs/11.accuracy_compare_MindSpore.md +87 -37
msprobe/docs/12.overflow_check_PyTorch.md +2 -2
msprobe/docs/13.overflow_check_MindSpore.md +2 -2
msprobe/docs/14.data_parse_PyTorch.md +3 -3
msprobe/docs/17.grad_probe.md +2 -1
msprobe/docs/18.online_dispatch.md +2 -2
msprobe/docs/19.monitor.md +90 -44
msprobe/docs/21.visualization_PyTorch.md +68 -15
msprobe/docs/22.visualization_MindSpore.md +71 -18
msprobe/docs/25.tool_function_introduction.md +23 -22
msprobe/docs/26.data_dump_PyTorch_baseline.md +14 -3
msprobe/docs/27.dump_json_instruction.md +1 -1
msprobe/docs/28.debugger_save_instruction.md +111 -20
msprobe/docs/29.data_dump_MSAdapter.md +2 -2
msprobe/docs/30.overflow_check_MSAdapter.md +2 -2
msprobe/docs/31.config_check.md +95 -0
msprobe/docs/32.ckpt_compare.md +69 -0
msprobe/docs/33.generate_operator_MindSpore.md +181 -0
msprobe/docs/34.RL_collect.md +92 -0
msprobe/docs/35.nan_analyze.md +72 -0
msprobe/docs/data_dump_MindSpore/data_dump_MindSpore_baseline.md +12 -1
msprobe/docs/data_dump_MindSpore/dynamic_graph_quick_start_example.md +3 -1
msprobe/docs/img/compare_result.png +0 -0
msprobe/docs/img/save_compare_result_sample.png +0 -0
msprobe/docs/img/visualization/proxy.png +0 -0
msprobe/mindspore/__init__.py +1 -2
msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +150 -58
msprobe/mindspore/api_accuracy_checker/api_runner.py +7 -3
msprobe/mindspore/api_accuracy_checker/bench_functions/flash_attention_score.py +47 -69
msprobe/mindspore/api_accuracy_checker/cmd_parser.py +4 -0
msprobe/mindspore/api_accuracy_checker/compute_element.py +0 -1
msprobe/mindspore/api_accuracy_checker/data_manager.py +2 -2
msprobe/mindspore/api_accuracy_checker/generate_op_script/op_generator.py +460 -0
msprobe/mindspore/api_accuracy_checker/generate_op_script/operator_replication.template +2081 -0
msprobe/mindspore/api_accuracy_checker/multi_api_accuracy_checker.py +9 -0
msprobe/mindspore/api_accuracy_checker/torch_mindtorch_importer.py +2 -1
msprobe/mindspore/cell_processor.py +204 -33
msprobe/mindspore/code_mapping/graph_parser.py +4 -21
msprobe/mindspore/common/const.py +17 -7
msprobe/mindspore/common/utils.py +128 -11
msprobe/mindspore/compare/common_dir_compare.py +382 -0
msprobe/mindspore/compare/distributed_compare.py +2 -26
msprobe/mindspore/compare/ms_compare.py +17 -405
msprobe/mindspore/compare/ms_graph_compare.py +14 -5
msprobe/mindspore/compare/utils.py +37 -0
msprobe/mindspore/debugger/debugger_config.py +53 -3
msprobe/mindspore/debugger/precision_debugger.py +72 -91
msprobe/mindspore/dump/cell_dump_process.py +877 -0
msprobe/mindspore/dump/cell_dump_with_insert_gradient.py +864 -0
msprobe/mindspore/dump/dump_tool_factory.py +13 -5
msprobe/mindspore/dump/graph_mode_cell_dump.py +139 -0
msprobe/mindspore/dump/graph_tensor_dump.py +123 -0
msprobe/mindspore/dump/hook_cell/api_register.py +40 -6
msprobe/mindspore/dump/hook_cell/hook_cell.py +18 -7
msprobe/mindspore/dump/hook_cell/ms_hook_manager.py +88 -0
msprobe/mindspore/dump/hook_cell/primitive_hooks.py +8 -2
msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml +18 -0
msprobe/mindspore/dump/jit_dump.py +21 -18
msprobe/mindspore/dump/kernel_kbyk_dump.py +6 -3
msprobe/mindspore/dym_loader/hook_dynamic_loader.cpp +110 -0
msprobe/mindspore/dym_loader/hook_dynamic_loader.h +15 -15
msprobe/mindspore/free_benchmark/api_pynative_self_check.py +12 -6
msprobe/mindspore/free_benchmark/common/utils.py +1 -1
msprobe/mindspore/grad_probe/global_context.py +7 -2
msprobe/mindspore/grad_probe/grad_stat_csv.py +3 -2
msprobe/mindspore/mindspore_service.py +114 -0
msprobe/mindspore/monitor/common_func.py +52 -0
msprobe/mindspore/monitor/data_writers.py +237 -0
msprobe/mindspore/monitor/features.py +20 -7
msprobe/mindspore/monitor/module_hook.py +281 -209
msprobe/mindspore/monitor/optimizer_collect.py +334 -0
msprobe/mindspore/monitor/utils.py +25 -5
msprobe/mindspore/ms_config.py +16 -15
msprobe/mindspore/task_handler_factory.py +5 -2
msprobe/msprobe.py +19 -0
msprobe/nan_analyze/__init__.py +14 -0
msprobe/nan_analyze/analyzer.py +255 -0
msprobe/nan_analyze/graph.py +189 -0
msprobe/nan_analyze/utils.py +211 -0
msprobe/pytorch/api_accuracy_checker/common/config.py +2 -2
msprobe/pytorch/api_accuracy_checker/compare/compare.py +36 -34
msprobe/pytorch/api_accuracy_checker/compare/compare_utils.py +20 -20
msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +4 -7
msprobe/pytorch/api_accuracy_checker/generate_op_script/operator_replication.template +204 -2
msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +12 -11
msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +1 -0
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +8 -5
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py +2 -3
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +29 -13
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +12 -2
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +45 -31
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/utils.py +156 -0
msprobe/pytorch/attl_manager.py +65 -0
msprobe/pytorch/bench_functions/npu_fusion_attention.py +27 -0
msprobe/pytorch/common/utils.py +26 -14
msprobe/pytorch/compare/distributed_compare.py +4 -36
msprobe/pytorch/compare/pt_compare.py +13 -84
msprobe/pytorch/compare/utils.py +47 -0
msprobe/pytorch/debugger/debugger_config.py +34 -17
msprobe/pytorch/debugger/precision_debugger.py +66 -118
msprobe/pytorch/dump/module_dump/hook_wrapper.py +93 -0
msprobe/pytorch/dump/module_dump/module_dump.py +11 -58
msprobe/pytorch/dump/module_dump/module_processer.py +143 -113
msprobe/pytorch/grad_probe/grad_stat_csv.py +3 -2
msprobe/pytorch/hook_module/api_register.py +29 -5
msprobe/pytorch/hook_module/hook_module.py +9 -18
msprobe/pytorch/hook_module/jit_script_wrapper.py +33 -0
msprobe/pytorch/hook_module/pt_hook_manager.py +68 -0
msprobe/pytorch/hook_module/support_wrap_ops.yaml +22 -1
msprobe/pytorch/hook_module/utils.py +28 -2
msprobe/pytorch/monitor/csv2tb.py +6 -2
msprobe/pytorch/monitor/data_writers.py +259 -0
msprobe/pytorch/monitor/module_hook.py +227 -158
msprobe/pytorch/monitor/module_metric.py +14 -0
msprobe/pytorch/monitor/optimizer_collect.py +242 -270
msprobe/pytorch/monitor/utils.py +16 -3
msprobe/pytorch/online_dispatch/dispatch.py +4 -2
msprobe/pytorch/online_dispatch/dump_compare.py +5 -2
msprobe/pytorch/parse_tool/lib/utils.py +3 -3
msprobe/pytorch/pt_config.py +8 -7
msprobe/pytorch/pytorch_service.py +73 -0
msprobe/visualization/builder/graph_builder.py +33 -13
msprobe/visualization/builder/msprobe_adapter.py +24 -11
msprobe/visualization/compare/graph_comparator.py +53 -45
msprobe/visualization/compare/mode_adapter.py +31 -1
msprobe/visualization/graph/base_node.py +3 -3
msprobe/visualization/graph/graph.py +2 -2
msprobe/visualization/graph_service.py +250 -103
msprobe/visualization/utils.py +27 -11
msprobe/mindspore/dym_loader/hook_dynamic_loader.cc +0 -106
msprobe/mindspore/monitor/anomaly_detect.py +0 -404
msprobe/mindspore/monitor/module_spec_verifier.py +0 -94
msprobe/mindspore/service.py +0 -549
msprobe/pytorch/monitor/anomaly_analyse.py +0 -201
msprobe/pytorch/monitor/anomaly_detect.py +0 -410
msprobe/pytorch/monitor/module_spec_verifier.py +0 -95
msprobe/pytorch/monitor/unittest/test_monitor.py +0 -160
msprobe/pytorch/service.py +0 -473
{mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/LICENSE +0 -0
{mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/WHEEL +0 -0
{mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/entry_points.txt +0 -0
{mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/top_level.txt +0 -0
/msprobe/{mindspore → core}/compare/ms_to_pt_api.yaml +0 -0
/msprobe/{mindspore/dump → core}/kernel_dump/kernel_config.py +0 -0
/msprobe/{pytorch/monitor/unittest → core/monitor}/__init__.py +0 -0

msprobe/core/data_dump/data_processor/mindspore_processor.py CHANGED Viewed

@@ -17,13 +17,14 @@ import zlib
 import mindspore as ms
 from mindspore import mint, ops, hal
+from mindspore.mint import distributed
 from mindspore._c_expression.typing import Number
 import numpy as np
 from msprobe.core.common.const import Const
 from msprobe.core.data_dump.data_processor.base import (BaseDataProcessor, TensorStatInfo,
                                                         ModuleForwardInputsOutputs, ModuleBackwardInputsOutputs)
-from msprobe.core.common.file_utils import path_len_exceeds_limit, save_npy
+from msprobe.core.common.file_utils import path_len_exceeds_limit
 from msprobe.mindspore.common.utils import convert_bf16_to_fp32, save_tensor_as_npy
 from msprobe.mindspore.common.log import logger
 from msprobe.mindspore.dump.hook_cell.api_register import get_api_register
@@ -36,7 +37,7 @@ except ImportError:
 class MindsporeDataProcessor(BaseDataProcessor):
-    mindspore_special_type = tuple([ms.Tensor, Number])
+    mindspore_special_type = tuple([ms.Tensor, Number, distributed.P2POp])
     def __init__(self, config, data_writer):
         super().__init__(config, data_writer)
@@ -65,7 +66,7 @@ class MindsporeDataProcessor(BaseDataProcessor):
             tensor_stat.max = np.max(data_np).item()
             tensor_stat.min = np.min(data_np).item()
         elif not data.shape:
-            tensor_stat.max = tensor_stat.min = tensor_stat.mean = tensor_stat.norm = data.item()
+            tensor_stat.max = tensor_stat.min = tensor_stat.mean = tensor_stat.norm = data
         elif data.dtype == ms.complex64 or data.dtype == ms.complex128:
             data_abs = np.abs(data.asnumpy())
             tensor_stat.max = np.max(data_abs).item()
@@ -76,38 +77,52 @@ class MindsporeDataProcessor(BaseDataProcessor):
             if not ops.is_floating_point(data) or data.dtype == ms.float64:
                 data = data.to(ms.float32)
             get_norm_value = mint.norm if hasattr(mint, "norm") else ops.norm
-            tensor_stat.max = mint.max(data).item()
-            tensor_stat.min = mint.min(data).item()
-            tensor_stat.mean = mint.mean(data).item()
-            tensor_stat.norm = get_norm_value(data).item()
+            tensor_stat.max = mint.max(data)
+            tensor_stat.min = mint.min(data)
+            tensor_stat.mean = mint.mean(data)
+            tensor_stat.norm = get_norm_value(data)
         return tensor_stat
     @staticmethod
     def get_stat_info_async(data):
         tensor_stat = TensorStatInfo()
-        if data.dtype == ms.complex64 or data.dtype == ms.complex128:
+        if data.dtype == ms.bool_:
+            tensor_stat.max = mint.any(data)
+            tensor_stat.min = mint.all(data)
+        elif not data.shape:
+            tensor_stat.max = tensor_stat.min = tensor_stat.mean = tensor_stat.norm = data
+        elif data.dtype == ms.complex64 or data.dtype == ms.complex128:
             logger.warning("Async dump do not support complex data!")
             return tensor_stat
-        elif data.dtype == ms.bool_:
-            tensor_stat.stack_tensor_stat = (["Max", "Min"], ops.stack([data.any(), data.all()]))
-        elif not data.shape:
-            tensor_stat.stack_tensor_stat = (["Max", "Min", "Mean", "Norm"], ops.stack([data, data, data, data]))
         else:
             if not ops.is_floating_point(data) or data.dtype == ms.float64:
                 data = data.to(ms.float32)
             get_norm_value = mint.norm if hasattr(mint, "norm") else ops.norm
-            tensor_stat.stack_tensor_stat = (["Max", "Min", "Mean", "Norm"], ops.stack(
-                [mint.max(data), mint.min(data), mint.mean(data), get_norm_value(data)]))
+            tensor_stat.max = mint.max(data)
+            tensor_stat.min = mint.min(data)
+            tensor_stat.mean = mint.mean(data)
+            tensor_stat.norm = get_norm_value(data)
         return tensor_stat
     @staticmethod
     def is_hookable_element(element):
         return hasattr(element, "register_hook") and callable(element.register_hook)
+    @staticmethod
+    def process_group_hash(arg):
+        group_ranks = distributed.get_process_group_ranks(arg)
+        group_ranks_hash = zlib.crc32(str(group_ranks).encode('utf-8'))
+        return f"{group_ranks_hash:08x}"
     @classmethod
     def get_special_types(cls):
         return super().get_special_types() + cls.mindspore_special_type
+    def dump_async_data(self):
+        for file_path, tensor in self._async_dump_cache.items():
+            save_tensor_as_npy(tensor, file_path)
+        self._async_dump_cache.clear()
     def get_stat_info(self, data):
         self.api_register.restore_inner_used_api()
         tensor_stat = TensorStatInfo()
@@ -125,19 +140,34 @@ class MindsporeDataProcessor(BaseDataProcessor):
         if suffix_stack and suffix_stack[-1] in self.mindspore_object_key:
             return self.mindspore_object_key[suffix_stack[-1]](element)
-        converted_numpy, numpy_type = self._convert_numpy_to_builtin(element)
-        if converted_numpy is not element:
-            return {"type": numpy_type, "value": converted_numpy}
-        if isinstance(element, Number):
-            return self.analyze_dtype_in_kwargs(element)
-        if isinstance(element, ms.Tensor):
-            return self._analyze_tensor(element, Const.SEP.join([str(suffix) for suffix in suffix_stack]))
-        if isinstance(element, np.ndarray):
-            return self._analyze_numpy(element, Const.SEP.join([str(suffix) for suffix in suffix_stack]))
-        if isinstance(element, (bool, int, float, str, slice, type(Ellipsis))):
-            return self._analyze_builtin(element)
+        suffix_str = Const.SEP.join(str(s) for s in suffix_stack)
+        type_analyzer = [
+            (MindsporeDataProcessor.builtin_type, self._analyze_builtin),
+            (ms.Tensor, lambda e: self._analyze_tensor(e, suffix_str)),
+            (Number, self.analyze_dtype_in_kwargs),
+            (MindsporeDataProcessor.np_type[:-1], self._analyze_numpy),
+            (np.ndarray, lambda e: self._analyze_ndarray(e, suffix_str)),
+            (distributed.P2POp, lambda e: self._analyze_p2pop(e, suffix_str))
+        ]
+        for type_key, analyze_fn in type_analyzer:
+            if isinstance(element, type_key):
+                return analyze_fn(element)
         return {}
+    def _analyze_p2pop(self, arg, suffix):
+        p2pop_info = {"class_type": "mindspore.mint.distributed.P2POp"}
+        try:
+            tensor_info = self._analyze_tensor(arg.tensor, suffix)
+            p2pop_info.update({"tensor": tensor_info})
+            p2pop_info.update({"op": arg.op})
+            p2pop_info.update({"peer": arg.peer})
+            p2pop_info.update({"tag": arg.tag})
+            group_id = self.process_group_hash(arg.group) if arg.group else None
+            p2pop_info.update({"group_id": group_id})
+        except Exception as e:
+            logger.warning(f"Failed to parse the P2POp content with error info: {e}.")
+        return p2pop_info
     def _analyze_tensor(self, tensor, suffix):
         tensor_stat = self.get_stat_info(tensor)
         tensor_json = {
@@ -146,32 +176,26 @@ class MindsporeDataProcessor(BaseDataProcessor):
             'shape': tensor.shape
         }
-        if tensor_stat.stack_tensor_stat is None:
-            tensor_json.update({'Max': self.transfer_type(tensor_stat.max)})
-            tensor_json.update({'Min': self.transfer_type(tensor_stat.min)})
-            tensor_json.update({'Mean': self.transfer_type(tensor_stat.mean)})
-            tensor_json.update({'Norm': self.transfer_type(tensor_stat.norm)})
-        else:
-            tensor_json.update({'tensor_stat': tensor_stat.stack_tensor_stat})
+        # 将统计值存入全局 buffer，并返回占位索引
+        stat_values = [
+            tensor_stat.max,
+            tensor_stat.min,
+            tensor_stat.mean,
+            tensor_stat.norm
+        ]
+        placeholder_index = self.data_writer.append_stat_to_buffer(stat_values)
+        tensor_json.update({Const.TENSOR_STAT_INDEX: placeholder_index})
         if self.config.summary_mode == Const.MD5 and not self.config.async_dump:
             tensor_md5 = self.get_md5_for_tensor(tensor)
             tensor_json.update({Const.MD5: tensor_md5})
         return tensor_json
-class StatisticsDataProcessor(MindsporeDataProcessor):
-    pass
-class TensorDataProcessor(MindsporeDataProcessor):
-    def dump_async_data(self):
-        for file_path, tensor in self._async_dump_cache.items():
-            save_tensor_as_npy(tensor, file_path)
-        self._async_dump_cache.clear()
-    def _analyze_tensor(self, tensor, suffix):
+    def _analyze_and_save_tensor(self, tensor, suffix):
         dump_data_name, file_path = self.get_save_file_path(suffix)
-        single_arg = super()._analyze_tensor(tensor, suffix)
+        single_arg = MindsporeDataProcessor._analyze_tensor(self, tensor, suffix)
         single_arg.update({"data_name": dump_data_name})
         if self.config.async_dump:
             self._async_dump_cache[file_path] = tensor.copy()
@@ -179,12 +203,27 @@ class TensorDataProcessor(MindsporeDataProcessor):
             save_tensor_as_npy(tensor, file_path)
         return single_arg
-    def _analyze_numpy(self, ndarray, suffix):
-        dump_data_name, file_path = self.get_save_file_path(suffix)
-        save_npy(ndarray, file_path)
-        ndarray_json = super()._analyze_numpy(ndarray, suffix)
-        ndarray_json.update({"data_name": dump_data_name})
-        return ndarray_json
+class StatisticsDataProcessor(MindsporeDataProcessor):
+    def _analyze_tensor(self, tensor, suffix):
+        if any(item in self.current_api_or_module_name for item in self.config.tensor_list):
+            return self._analyze_and_save_tensor(tensor, suffix)
+        else:
+            return super()._analyze_tensor(tensor, suffix)
+    def _analyze_ndarray(self, ndarray, suffix):
+        if any(item in self.current_api_or_module_name for item in self.config.tensor_list):
+            return self._analyze_and_save_ndarray(ndarray, suffix)
+        else:
+            return super()._analyze_ndarray(ndarray, suffix)
+class TensorDataProcessor(MindsporeDataProcessor):
+    def _analyze_tensor(self, tensor, suffix):
+        return self._analyze_and_save_tensor(tensor, suffix)
+    def _analyze_ndarray(self, ndarray, suffix):
+        return self._analyze_and_save_ndarray(ndarray, suffix)
 class OverflowCheckDataProcessor(MindsporeDataProcessor):
@@ -231,7 +270,7 @@ class OverflowCheckDataProcessor(MindsporeDataProcessor):
         api_info_struct = super().analyze_backward(name, module, module_input_output)
         self.maybe_save_overflow_data()
         return api_info_struct if self.has_overflow else None
     def analyze_params(self, name, param_name, grad):
         self.has_overflow = False
         api_info_struct = super().analyze_params(name, param_name, grad)
@@ -249,11 +288,26 @@ class OverflowCheckDataProcessor(MindsporeDataProcessor):
         self.cached_tensors_and_file_paths = {}
     def _analyze_maybe_overflow_tensor(self, tensor_json):
-        if tensor_json['Max'] is None:
+        tensor_stat_index = tensor_json.get(Const.TENSOR_STAT_INDEX)
+        if tensor_stat_index is None:
+            logger.warning("tensor_stat_index does not exist in tensor_json.")
             return
-        if np.isinf(tensor_json['Max']) or np.isnan(tensor_json['Max']):
+        max_tensor = self.data_writer.get_buffer_values_max(tensor_stat_index)
+        min_tensor = self.data_writer.get_buffer_values_min(tensor_stat_index)
+        if max_tensor is None or min_tensor is None:
+            return
+        def check_inf_nan(value):
+            # Use .item() if it's a tensor-like structure
+            if hasattr(value, "item"):
+                value = value.item()
+            return np.isinf(value) or np.isnan(value)
+        if check_inf_nan(max_tensor):
             self.has_overflow = True
-        if np.isinf(tensor_json['Min']) or np.isnan(tensor_json['Min']):
+            return
+        if check_inf_nan(min_tensor):
             self.has_overflow = True
     def _analyze_tensor(self, tensor, suffix):

msprobe/core/data_dump/data_processor/pytorch_processor.py CHANGED Viewed

@@ -13,7 +13,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import hashlib
 import zlib
 from dataclasses import asdict
 from typing import List
@@ -102,19 +101,17 @@ class PytorchDataProcessor(BaseDataProcessor):
             logger.warning("Async dump do not support complex data!")
             return tensor_stat
         elif data.dtype == torch.bool:
-            tensor_stat.stack_tensor_stat = (["Max", "Min"], torch.stack(
-                [torch.any(data), torch.all(data)]))
+            tensor_stat.max = torch.any(data)
+            tensor_stat.min = torch.all(data)
         elif not data.shape:
-            tensor_stat.stack_tensor_stat = (["Max", "Min", "Mean", "Norm"], torch.stack([data, data, data, data]))
+            tensor_stat.max = tensor_stat.min = tensor_stat.mean = tensor_stat.norm = data
         else:
-            if not data.is_floating_point() or data.dtype == torch.float64:
+            if data.dtype == torch.float64 or not data.is_floating_point():
                 data = data.float()
-            tensor_stat.stack_tensor_stat = (["Max", "Min", "Mean", "Norm"], torch.stack([
-                torch.max(data),
-                torch.min(data),
-                torch.mean(data),
-                torch.norm(data)
-            ]))
+            tensor_stat.max = torch.max(data)
+            tensor_stat.min = torch.min(data)
+            tensor_stat.mean = torch.mean(data)
+            tensor_stat.norm = torch.norm(data)
         return tensor_stat
     @staticmethod
@@ -127,17 +124,17 @@ class PytorchDataProcessor(BaseDataProcessor):
             tensor_stat.min = np.min(data_abs).item()
             tensor_stat.mean = np.mean(data_abs).item()
         elif data.dtype == torch.bool:
-            tensor_stat.max = torch.any(data).item()
-            tensor_stat.min = torch.all(data).item()
+            tensor_stat.max = torch.any(data)
+            tensor_stat.min = torch.all(data)
         elif not data.shape:
-            tensor_stat.max = tensor_stat.min = tensor_stat.mean = tensor_stat.norm = data.item()
+            tensor_stat.max = tensor_stat.min = tensor_stat.mean = tensor_stat.norm = data
         else:
-            if not data.is_floating_point() or data.dtype == torch.float64:
+            if data.dtype == torch.float64 or not data.is_floating_point():
                 data = data.float()
-            tensor_stat.max = torch.max(data).item()
-            tensor_stat.min = torch.min(data).item()
-            tensor_stat.mean = torch.mean(data).item()
-            tensor_stat.norm = torch.norm(data).item()
+            tensor_stat.max = torch.max(data)
+            tensor_stat.min = torch.min(data)
+            tensor_stat.mean = torch.mean(data)
+            tensor_stat.norm = torch.norm(data)
         return tensor_stat
     @staticmethod
@@ -174,12 +171,8 @@ class PytorchDataProcessor(BaseDataProcessor):
     @staticmethod
     def process_group_hash(arg):
         group_ranks = dist.get_process_group_ranks(arg)
-        group_ranks_hash = hashlib.md5(str(group_ranks).encode('utf-8')).hexdigest()
-        return group_ranks_hash
-    @staticmethod
-    def is_distributed_op(module):
-        return getattr(module, "op_is_distributed", False)
+        group_ranks_hash = zlib.crc32(str(group_ranks).encode('utf-8'))
+        return f"{group_ranks_hash:08x}"
     @staticmethod
     def is_hookable_element(element):
@@ -233,34 +226,31 @@ class PytorchDataProcessor(BaseDataProcessor):
     def get_special_types(cls):
         return super().get_special_types() + cls.pytorch_special_type
+    def dump_async_data(self):
+        for file_path, tensor in self._async_dump_cache.items():
+            save_pt(tensor.contiguous(), file_path)
+        self._async_dump_cache.clear()
     def analyze_single_element(self, element, suffix_stack):
         if suffix_stack and suffix_stack[-1] in self.torch_object_key:
             return self.torch_object_key[suffix_stack[-1]](element)
-        if isinstance(element, torch.Size):
-            return self._analyze_torch_size(element)
-        if isinstance(element, torch.memory_format):
-            return self._analyze_memory_format(element)
-        if isinstance(element, dist.ProcessGroup):
-            return self._analyze_process_group(element)
-        if isinstance(element, dist.P2POp):
-            return self._analyze_p2pop(element, Const.SEP.join([str(suffix) for suffix in suffix_stack]))
-        if isinstance(element, dist.ReduceOp):
-            return self._analyze_reduce_op(element)
-        converted_numpy, numpy_type = self._convert_numpy_to_builtin(element)
-        if converted_numpy is not element:
-            return {"type": numpy_type, "value": converted_numpy}
-        if isinstance(element, torch.Tensor):
-            return self._analyze_tensor(element, Const.SEP.join([str(suffix) for suffix in suffix_stack]))
-        if isinstance(element, np.ndarray):
-            return self._analyze_numpy(element, Const.SEP.join([str(suffix) for suffix in suffix_stack]))
-        if isinstance(element, (bool, int, float, str, slice, type(Ellipsis))):
-            return self._analyze_builtin(element)
-        return {}
-    def analyze_forward_output(self, name, module, module_input_output: ModuleForwardInputsOutputs):
-        if self.is_distributed_op(module):
-            module_input_output.update_output_with_args_and_kwargs()
-        return super().analyze_forward_output(name, module, module_input_output)
+        suffix_str = Const.SEP.join(str(s) for s in suffix_stack)
+        type_analyzer = [
+            (PytorchDataProcessor.builtin_type, self._analyze_builtin),
+            (torch.Size, self._analyze_torch_size),
+            (torch.Tensor, lambda e: self._analyze_tensor(e, suffix_str)),
+            (torch.memory_format, self._analyze_memory_format),
+            (dist.ProcessGroup, self._analyze_process_group),
+            (dist.P2POp, lambda e: self._analyze_p2pop(e, suffix_str)),
+            (dist.ReduceOp, self._analyze_reduce_op),
+            (PytorchDataProcessor.np_type[:-1], self._analyze_numpy),
+            (np.ndarray, lambda e: self._analyze_ndarray(e, suffix_str)),
+        ]
+        for type_key, analyze_fn in type_analyzer:
+            if isinstance(element, type_key):
+                return analyze_fn(element)
+        return {}
     def _analyze_p2pop(self, arg, suffix):
         p2pop_info = {"class_type": "torch.distributed.P2POp"}
@@ -284,42 +274,26 @@ class PytorchDataProcessor(BaseDataProcessor):
         tensor_json.update({'type': 'torch.Tensor'})
         tensor_json.update({'dtype': dtype})
         tensor_json.update({"shape": tensor.shape})
-        if tensor_stat.stack_tensor_stat is None:
-            tensor_json.update({"Max": tensor_stat.max})
-            tensor_json.update({"Min": tensor_stat.min})
-            tensor_json.update({"Mean": tensor_stat.mean})
-            tensor_json.update({"Norm": tensor_stat.norm})
-            tensor_json.update({"requires_grad": tensor.requires_grad})
-            if tensor_stat.max is not None:
-                if np.isinf(tensor_stat.max) or np.isnan(tensor_stat.max):
-                    tensor_json['Max_except_inf_nan'] = self.handle_tensor_extremum_nan_inf(tensor, "max")
-            if tensor_stat.min is not None:
-                if np.isinf(tensor_stat.min) or np.isnan(tensor_stat.min):
-                    tensor_json['Min_except_inf_nan'] = self.handle_tensor_extremum_nan_inf(tensor, "min")
-        else:
-            tensor_json.update({"requires_grad": tensor.requires_grad})
-            tensor_json.update({"tensor_stat": tensor_stat.stack_tensor_stat})
+        stat_values = [
+            tensor_stat.max,
+            tensor_stat.min,
+            tensor_stat.mean,
+            tensor_stat.norm
+        ]
+        placeholder_index = self.data_writer.append_stat_to_buffer(stat_values)
+        tensor_json.update({Const.TENSOR_STAT_INDEX: placeholder_index})
+        tensor_json.update({"requires_grad": tensor.requires_grad})
         if self.config.summary_mode == Const.MD5 and not self.config.async_dump:
             tensor_md5 = self.get_md5_for_tensor(tensor)
             tensor_json.update({Const.MD5: tensor_md5})
         return tensor_json
-class StatisticsDataProcessor(PytorchDataProcessor):
-    pass
-class TensorDataProcessor(PytorchDataProcessor):
-    def dump_async_data(self):
-        for file_path, tensor in self._async_dump_cache.items():
-            save_pt(tensor.contiguous(), file_path)
-        self._async_dump_cache.clear()
-    def _analyze_tensor(self, tensor, suffix):
+    def _analyze_and_save_tensor(self, tensor, suffix):
         dump_data_name, file_path = self.get_save_file_path(suffix)
-        single_arg = super()._analyze_tensor(tensor, suffix)
+        single_arg = PytorchDataProcessor._analyze_tensor(self, tensor, suffix)
         single_arg.update({"data_name": dump_data_name})
         tensor, _ = self._cast_to_float_if_fp8(tensor)
         if self.config.async_dump:
@@ -329,14 +303,36 @@ class TensorDataProcessor(PytorchDataProcessor):
             save_pt(saved_tensor, file_path)
         return single_arg
-    def _analyze_numpy(self, ndarray, suffix):
+    def _analyze_and_save_ndarray(self, ndarray, suffix):
         dump_data_name, file_path = self.get_save_file_path(suffix)
         save_pt(torch.tensor(ndarray), file_path)
-        ndarray_json = super()._analyze_numpy(ndarray, suffix)
+        ndarray_json = PytorchDataProcessor._analyze_ndarray(ndarray, suffix)
         ndarray_json.update({"data_name": dump_data_name})
         return ndarray_json
+class StatisticsDataProcessor(PytorchDataProcessor):
+    def _analyze_tensor(self, tensor, suffix):
+        if any(item in self.current_api_or_module_name for item in self.config.tensor_list):
+            return self._analyze_and_save_tensor(tensor, suffix)
+        else:
+            return super()._analyze_tensor(tensor, suffix)
+    def _analyze_ndarray(self, ndarray, suffix):
+        if any(item in self.current_api_or_module_name for item in self.config.tensor_list):
+            return self._analyze_and_save_ndarray(ndarray, suffix)
+        else:
+            return super()._analyze_ndarray(ndarray, suffix)
+class TensorDataProcessor(PytorchDataProcessor):
+    def _analyze_tensor(self, tensor, suffix):
+        return self._analyze_and_save_tensor(tensor, suffix)
+    def _analyze_ndarray(self, ndarray, suffix):
+        return self._analyze_and_save_ndarray(ndarray, suffix)
 class OverflowCheckDataProcessor(PytorchDataProcessor):
     __slots__ = ["cached_tensors_and_file_paths"]
@@ -427,10 +423,22 @@ class OverflowCheckDataProcessor(PytorchDataProcessor):
             raise RuntimeError(f"overflow check failed") from e
     def _analyze_maybe_overflow_tensor(self, tensor_json):
-        if tensor_json['Max'] is None or tensor_json['Min'] is None:
+        tensor_stat_index = tensor_json.get(Const.TENSOR_STAT_INDEX)
+        if tensor_stat_index is None:
+            logger.warning("tensor_stat_index does not exist in tensor_json.")
             return
-        self.has_overflow = np.isinf(tensor_json['Max']) or np.isnan(tensor_json['Max']) or \
-                            np.isinf(tensor_json['Min']) or np.isnan(tensor_json['Min'])
+        max_tensor = self.data_writer.get_buffer_values_max(tensor_stat_index)
+        min_tensor = self.data_writer.get_buffer_values_min(tensor_stat_index)
+        if max_tensor is None or min_tensor is None:
+            return
+        if torch.isinf(max_tensor) or torch.isnan(max_tensor):
+            self.has_overflow = True
+            return
+        if torch.isinf(min_tensor) or torch.isnan(min_tensor):
+            self.has_overflow = True
     def _analyze_tensor(self, tensor, suffix):
         dump_data_name, file_path = self.get_save_file_path(suffix)

mindstudio-probe 1.3.0__py3-none-any.whl → 8.1.1__py3-none-any.whl

mindstudio-probe 1.3.0py3-none-any.whl → 8.1.1py3-none-any.whl