PyPI - mindstudio-probe - Versions diffs - 8.2.0__py3-none-any.whl → 8.2.1__py3-none-any.whl - Mend

mindstudio-probe 8.2.0__py3-none-any.whl → 8.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (63) hide show

{mindstudio_probe-8.2.0.dist-info → mindstudio_probe-8.2.1.dist-info}/METADATA +2 -2
{mindstudio_probe-8.2.0.dist-info → mindstudio_probe-8.2.1.dist-info}/RECORD +63 -61
msprobe/README.md +4 -4
msprobe/core/common/const.py +6 -0
msprobe/core/common/db_manager.py +35 -4
msprobe/core/common/file_utils.py +28 -5
msprobe/core/common/megatron_utils.py +59 -0
msprobe/core/common/utils.py +14 -3
msprobe/core/compare/diff_analyze/first_diff_analyze.py +16 -4
msprobe/core/compare/diff_analyze/ignore_op_list.yaml +3 -0
msprobe/core/compare/find_first/analyzer.py +8 -7
msprobe/core/compare/find_first/graph.py +11 -3
msprobe/core/compare/find_first/utils.py +3 -2
msprobe/core/compare/highlight.py +13 -6
msprobe/core/compare/multiprocessing_compute.py +17 -10
msprobe/core/compare/utils.py +14 -5
msprobe/core/data_dump/data_collector.py +18 -21
msprobe/core/data_dump/data_processor/pytorch_processor.py +43 -20
msprobe/core/data_dump/json_writer.py +18 -8
msprobe/core/data_dump/scope.py +4 -6
msprobe/core/hook_manager.py +21 -0
msprobe/core/service.py +2 -0
msprobe/core/single_save/single_comparator.py +16 -3
msprobe/docs/01.installation.md +7 -5
msprobe/docs/04.kernel_dump_PyTorch.md +1 -1
msprobe/docs/06.data_dump_MindSpore.md +1 -1
msprobe/docs/10.accuracy_compare_PyTorch.md +46 -5
msprobe/docs/14.data_parse_PyTorch.md +1 -1
msprobe/docs/19.monitor.md +2 -0
msprobe/docs/21.visualization_PyTorch.md +15 -80
msprobe/docs/22.visualization_MindSpore.md +20 -104
msprobe/docs/23.generate_operator_PyTorch.md +1 -1
msprobe/docs/26.data_dump_PyTorch_baseline.md +7 -7
msprobe/docs/img/visualization/vis_browser_1.png +0 -0
msprobe/docs/img/visualization/vis_match_info.png +0 -0
msprobe/docs/img/visualization/vis_precision_info.png +0 -0
msprobe/docs/img/visualization/vis_search_info.png +0 -0
msprobe/docs/img/visualization/vis_show_info.png +0 -0
msprobe/docs/img/visualization/vis_showcase.png +0 -0
msprobe/docs/img/visualization/vis_unmatch_info.png +0 -0
msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +1 -1
msprobe/mindspore/cell_processor.py +33 -5
msprobe/mindspore/compare/common_dir_compare.py +22 -26
msprobe/mindspore/debugger/precision_debugger.py +1 -1
msprobe/mindspore/dump/cell_dump_process.py +73 -62
msprobe/mindspore/dump/graph_mode_cell_dump.py +21 -10
msprobe/mindspore/dump/hook_cell/ms_hook_manager.py +2 -0
msprobe/pytorch/compare/utils.py +2 -1
msprobe/pytorch/dump/module_dump/hook_wrapper.py +10 -7
msprobe/pytorch/dump/module_dump/module_processer.py +15 -8
msprobe/pytorch/monitor/module_hook.py +28 -9
msprobe/pytorch/online_dispatch/dispatch.py +42 -24
msprobe/visualization/builder/graph_builder.py +169 -64
msprobe/visualization/builder/graph_merger.py +0 -1
msprobe/visualization/builder/msprobe_adapter.py +1 -1
msprobe/visualization/db_utils.py +25 -2
msprobe/visualization/graph/base_node.py +0 -24
msprobe/visualization/graph/graph.py +5 -14
msprobe/visualization/graph_service.py +29 -53
{mindstudio_probe-8.2.0.dist-info → mindstudio_probe-8.2.1.dist-info}/LICENSE +0 -0
{mindstudio_probe-8.2.0.dist-info → mindstudio_probe-8.2.1.dist-info}/WHEEL +0 -0
{mindstudio_probe-8.2.0.dist-info → mindstudio_probe-8.2.1.dist-info}/entry_points.txt +0 -0
{mindstudio_probe-8.2.0.dist-info → mindstudio_probe-8.2.1.dist-info}/top_level.txt +0 -0

msprobe/core/compare/diff_analyze/first_diff_analyze.py CHANGED Viewed

@@ -26,6 +26,8 @@ from msprobe.core.compare.utils import gen_api_batches
 cur_dir = os.path.dirname(os.path.realpath(__file__))
 diff_threshold_yaml_path = os.path.join(cur_dir, 'diff_analyze_threshold.yaml')
+ignore_op_list_yaml_path = os.path.join(cur_dir, 'ignore_op_list.yaml')
+ignore_list = load_yaml(ignore_op_list_yaml_path)
 thresholds = load_yaml(diff_threshold_yaml_path)
 cmp_metrics = thresholds.get('compare_metrics')
@@ -51,7 +53,7 @@ class FirstDiffAnalyze:
                 return True
         return False
-    def single_api_check(self, result_slice, header):
+    def single_api_check(self, result_slice, header, api_name=None):
         """
         单个api差异检查
@@ -65,14 +67,18 @@ class FirstDiffAnalyze:
         }
         column_indices = {name: idx for idx, name in enumerate(header)}
+        output_idx = -1
         for line in result_slice:
             op_item = {
                 column_name: line[column_indices[column_name]]
                 for column_name in header
             }
             single_check_result['op_items'].append(op_item)
+            if op_item['state'] != 'output':
+                continue
+            output_idx += 1
+            if output_idx in ignore_list.get(api_name, []):
+                continue
             # set is_same
             if self.mode_config.dump_mode == Const.MD5:
                 if line[column_indices[CompareConst.RESULT]] == CompareConst.DIFF:
@@ -117,7 +123,13 @@ class FirstDiffAnalyze:
         with tqdm(total=len(api_batches), desc=bar_desc_add_rank, unit="api/module", ncols=100) as progress_bar:
             for api_batch in api_batches:
                 result_slice = result[api_batch.start: api_batch.params_grad_end_index]
-                check_result[api_batch.api_name] = self.single_api_check(result_slice, header)
+                api_compo = api_batch.api_name.split('.')
+                # suppose name is Tensor.MatMul.0.forward
+                if len(api_compo) < 4:
+                    continue
+                # get MatMul as api_name
+                api_name = api_compo[-3]
+                check_result[api_batch.api_name] = self.single_api_check(result_slice, header, api_name)
                 progress_bar.update(1)
         return check_result

msprobe/core/compare/diff_analyze/ignore_op_list.yaml ADDED Viewed

@@ -0,0 +1,3 @@
+npu_fusion_attention:
+  - 4
+  - 5

msprobe/core/compare/find_first/analyzer.py CHANGED Viewed

@@ -47,7 +47,6 @@ class DiffAnalyzer:
             analyze_func()
             if self._diff_nodes:
                 self._gen_analyze_info()
-                self._post_process()
                 return
         logger.info('Cannot find any diff node, no need to generate analyze file.')
@@ -56,12 +55,6 @@ class DiffAnalyzer:
         self._resolve_input_path(self._output_path)
         logger.info("Pre Process completed.")
-    def _post_process(self):
-        for rank_path in self._paths.values():
-            dump_path = rank_path.dump_path
-            logger.debug(f"Remove {dump_path} success")
-        logger.info("Post Process completed.")
     """
     这里需要生成stack，但是直接用dict中自带就行，在op_items.NPU_Stack_Info中
     """
@@ -105,6 +98,8 @@ class DiffAnalyzer:
                 logger.warning(f'Rank {path.rank} has no dump data!')
                 continue
             for op_name, op_data in dump_data.items():
+                if is_ignore_op(op_name):
+                    continue
                 if is_communication_op(op_name):
                     self._first_comm_nodes[path.rank] = op_name
                     break
@@ -131,10 +126,16 @@ class DiffAnalyzer:
         for rank, nodes in list(self._rank_comm_nodes_dict.items())[:-1]:
             searched_ranks.add(rank)
             seen_nodes = set()
+            last_node = None
             for cur_node in nodes.values():
+                is_overflow = last_node and hasattr(last_node, 'layer') and hasattr(cur_node, 'layer') and \
+                last_node.layer >= cur_node.layer
+                if is_overflow:
+                    cur_node.layer = last_node.layer + 1
                 conn_info = cur_node.find_connected_nodes()
                 if not conn_info.get('ranks'):
                     conn_info['ranks'] = self._rank_comm_nodes_dict.keys()
+                last_node = cur_node
                 if not self._find_connection(conn_info, cur_node, searched_ranks, seen_nodes):
                     logger.debug(f'Cannot find connected communication node for "{cur_node.node_id}".')

msprobe/core/compare/find_first/graph.py CHANGED Viewed

@@ -52,19 +52,25 @@ class DataNode:
         metrics = {}
         for cmp_data in self.op_data:
             name = cmp_data.get(CompareConst.NPU_NAME)
+            # 构建度量指标字典
+            metrics = {}
             if CompareConst.NPU_MAX in cmp_data:
                 metrics = {CompareConst.NPU_MAX: cmp_data.get(CompareConst.NPU_MAX),
                         CompareConst.NPU_MIN: cmp_data.get(CompareConst.NPU_MIN),
                         CompareConst.NPU_MEAN: cmp_data.get(CompareConst.NPU_MEAN),
                         CompareConst.NPU_NORM: cmp_data.get(CompareConst.NPU_NORM)}
             elif CompareConst.NPU_MD5 in cmp_data:
-                metrics = {CompareConst.NPU_MD5: cmp_data.get(CompareConst.NPU_MD5)}
+                metrics[CompareConst.NPU_MD5] = cmp_data.get(CompareConst.NPU_MD5)
+            if CompareConst.NPU_P2POP_PEER in cmp_data:
+                metrics[CompareConst.NPU_P2POP_PEER] = cmp_data.get(CompareConst.NPU_P2POP_PEER)
             if cmp_data.get(CompareConst.STACK) != CompareConst.N_A and not self.stack:
                 self.stack = cmp_data.get(CompareConst.STACK)
-            if Const.INPUT in name:
+            if cmp_data.get('state') == "input":
                 self.inputs[name] = metrics
-            elif Const.OUTPUT in name:
+            elif cmp_data.get('state') == "output":
                 self.outputs[name] = metrics
     def gen_node_info(self, path: RankPath):
@@ -161,6 +167,8 @@ class CommunicationNode:
                 if val and val.startswith('[') and val.endswith(']'):
                     val = [int(part) for part in val.strip('[]').split(',')]
                     ranks.update(val)
+            elif v.get(CompareConst.NPU_P2POP_PEER) != "None":
+                ranks.add(v.get(CompareConst.NPU_P2POP_PEER))
         return {'ranks': ranks, 'api': f'Distributed.{tar_api}',
                 'type': DiffAnalyseConst.OPPOSITE_DIR.get(self.type, DiffAnalyseConst.LINK)}

msprobe/core/compare/find_first/utils.py CHANGED Viewed

@@ -120,7 +120,8 @@ def is_communication_op(op_name):
 def is_ignore_op(op_name):
     ignore_keywords = [
         'Torch.empty',
-        'Torch.fill'
+        'Torch.fill',
+        'Tensor.__setitem__'
     ]
     return any(keyword in op_name for keyword in ignore_keywords)
@@ -181,7 +182,7 @@ def analyze_diff_in_group(nodes_group):
     input_diff_nodes = list(filter(lambda node: node.is_diff, src_list))
     # 如果有异常回溯计算节点找到异常来源
     # 使用cpu模拟节点进行计算，查看结果是否有问题。需要对所有计算节点录入/映射，暂不实现。
-    get_compute_ops_from_comm_nodes(input_diff_nodes)
+    get_compute_ops_from_comm_nodes(nodes_group)
     # 筛选入参没问题但出参有问题的通信节点
     output_diff_nodes = list(filter(lambda node: node.data.is_diff, nodes_group))
     get_comm_ops(output_diff_nodes)

msprobe/core/compare/highlight.py CHANGED Viewed

@@ -26,7 +26,7 @@ from tqdm import tqdm
 from msprobe.core.common.const import CompareConst, Const
 from msprobe.core.common.file_utils import save_workbook
 from msprobe.core.common.log import logger
-from msprobe.core.common.utils import get_header_index
+from msprobe.core.common.utils import get_header_index, CompareException
 from msprobe.core.compare.utils import table_value_is_valid, gen_api_batches
 from msprobe.core.compare.config import ModeConfig
@@ -359,18 +359,25 @@ class HighLight:
         def err_call(args):
             logger.error("Multiprocessing malicious value check failed! Reason: {}".format(args))
-            try:
-                pool.close()
-            except OSError:
-                logger.error("Pool terminate failed")
         result_df_columns = result_df.columns.tolist()
         for column in result_df_columns:
             self.value_check(column)
+        async_results = []
         for df_chunk in chunks:
-            pool.apply_async(func, args=(df_chunk, result_df_columns,), error_callback=err_call)
+            result = pool.apply_async(func, args=(df_chunk, result_df_columns,), error_callback=err_call)
+            async_results.append(result)
         pool.close()
+        for ar in async_results:
+            try:
+                ar.get(timeout=3600)
+            except Exception as e:
+                logger.error(f"Task failed with exception: {e}")
+                pool.terminate()
+                raise CompareException(CompareException.MULTIPROCESS_ERROR) from e
         pool.join()
     def df_malicious_value_check(self, result_df):

msprobe/core/compare/multiprocessing_compute.py CHANGED Viewed

@@ -52,16 +52,20 @@ def _ms_graph_handle_multi_process(func, result_df, mode):
     def err_call(args):
         logger.error('multiprocess compare failed! Reason: {}'.format(args))
-        try:
-            pool.close()
-        except OSError as e:
-            logger.error(f'pool terminate failed: {str(e)}')
     for df_chunk in df_chunks:
         result = pool.apply_async(func, args=(df_chunk, mode), error_callback=err_call)
         results.append(result)
-    final_results = [r.get() for r in results]
     pool.close()
+    try:
+        final_results = [r.get(timeout=3600) for r in results]
+    except Exception as e:
+        logger.error(f"Task failed with exception: {e}")
+        pool.terminate()
+        raise CompareException(CompareException.MULTIPROCESS_ERROR) from e
     pool.join()
     return pd.concat(final_results, ignore_index=True)
@@ -277,10 +281,6 @@ class CompareRealData:
         def err_call(args):
             logger.error('multiprocess compare failed! Reason: {}'.format(args))
-            try:
-                pool.close()
-            except OSError:
-                logger.error("pool terminate failed")
         progress_bar = tqdm(total=len(result_df), desc="API/Module Item Compare Process", unit="row", ncols=100)
@@ -298,7 +298,14 @@ class CompareRealData:
                                       )
             results.append(result)
-        final_results = [r.get() for r in results]
         pool.close()
+        try:
+            final_results = [r.get(timeout=3600) for r in results]
+        except Exception as e:
+            logger.error(f"Task failed with exception: {e}")
+            pool.terminate()
+            raise CompareException(CompareException.MULTIPROCESS_ERROR) from e
         pool.join()
         return pd.concat(final_results, ignore_index=True)

msprobe/core/compare/utils.py CHANGED Viewed

@@ -695,10 +695,6 @@ def get_sorted_ranks(npu_dump_dir, bench_dump_dir):
 def multi_statistics_compare(func, func_args):
     def err_call(args):
         logger.error(f'Multiprocess statistics compare failed! Reason: {args}')
-        try:
-            pool.close()
-        except OSError:
-            logger.error("Pool terminate failed")
     compare_func, input_param_nr_list, output_path, kwargs = func_args
@@ -715,9 +711,22 @@ def multi_statistics_compare(func, func_args):
             chunks[i].append(input_param_nr_list[param_num - remainder + i])
     pool = multiprocessing.Pool(process_num)
+    async_results = []
     for chunk in chunks:
-        pool.apply_async(func, args=(compare_func, chunk, output_path, kwargs), error_callback=err_call)
+        result = pool.apply_async(func, args=(compare_func, chunk, output_path, kwargs), error_callback=err_call)
+        async_results.append(result)
     pool.close()
+    for ar in async_results:
+        try:
+            ar.get(timeout=3600)
+        except Exception as e:
+            logger.error(f"Task failed with exception: {e}")
+            pool.terminate()
+            raise CompareException(CompareException.MULTIPROCESS_ERROR) from e
     pool.join()

msprobe/core/data_dump/data_collector.py CHANGED Viewed

@@ -23,6 +23,7 @@ from msprobe.core.data_dump.json_writer import DataWriter
 from msprobe.core.common.log import logger
 from msprobe.core.common.const import Const
 from msprobe.core.data_dump.data_processor.factory import DataProcessorFactory
+from msprobe.core.common.megatron_utils import MegatronStepInfo, get_micro_step, is_megatron
 def build_data_collector(config):
@@ -270,15 +271,20 @@ class DataCollector:
         if self.config.level not in DataCollector.level_without_construct:
             if self.optimizer_status in [Const.OPTIMIZER, Const.CLIP_GRAD]:
                 if self.optimizer_status_first_start[self.optimizer_status]:
-                    self.data_writer.update_construct({self.optimizer_status: None})
+                    self.data_writer.update_construct(
+                        {self.optimizer_status: None if not is_megatron() else [None, get_micro_step()]})
                     self.optimizer_status_first_start[self.optimizer_status] = False
-                self.data_writer.update_construct({name: self.optimizer_status})
+                self.data_writer.update_construct(
+                    {name: self.optimizer_status if not is_megatron() else [self.optimizer_status, get_micro_step()]})
             else:
                 if self.config.level == Const.LEVEL_MIX and \
                   not (name.startswith(Const.MODULE) or name.startswith(Const.CELL)):
                     self.data_writer.update_construct(
                         {name: self.module_processor.api_parent_node.get(threading.get_ident())}
                     )
+            if MegatronStepInfo.is_megatron:
+                micro_step_number = max(MegatronStepInfo.forward_micro_step, MegatronStepInfo.backward_micro_step)
+                self.data_writer.update_construct({Const.MEGATRON_MICRO_STEP_NUMBER: micro_step_number})
             self.data_writer.update_construct(self.module_processor.module_node)
@@ -302,25 +308,16 @@ class DataCollector:
         self.data_processor.update_iter(current_iter)
     def params_data_collect(self, name, param_name, pid, data):
-        try:
-            grad_name = name + Const.SEP + Const.PARAMS_GRAD
-            self.update_api_or_module_name(grad_name)
-            if not self.check_scope_and_pid(self.scope, name, pid) and not self.backward_module_names.get(name):
-                if self.data_writer.cache_data.get("data"):
-                    self.data_writer.cache_data.get("data").pop(grad_name, None)
-                    self.params_grad_record[grad_name] = False
-                return
-            data_info = self.data_processor.analyze_params(grad_name, param_name, data)
-            self.handle_data(grad_name, data_info, flush=self.data_processor.is_terminated)
-            self.params_grad_record[grad_name] = False
-        except Exception as e:
-            error_type = type(e).__name__
-            tb = traceback.format_exc()
-            self.data_writer.write_error_log(
-                f"[ERROR] params_data_collect failed: "
-                f"name={name}, param_name={param_name}, pid={pid}\n{tb}",
-                error_type=error_type
-            )
+        grad_name = name + Const.SEP + Const.PARAMS_GRAD
+        self.update_api_or_module_name(grad_name)
+        if not self.check_scope_and_pid(self.scope, name, pid) and not self.backward_module_names.get(name):
+            if self.data_writer.cache_data.get("data"):
+                self.data_writer.cache_data.get("data").pop(grad_name, None)
+                self.params_grad_record[grad_name] = False
+            return
+        data_info = self.data_processor.analyze_params(grad_name, param_name, data)
+        self.handle_data(grad_name, data_info, flush=self.data_processor.is_terminated)
+        self.params_grad_record[grad_name] = False
     def params_data_collect_in_bw_hook(self, params_dict, name):
         try:

msprobe/core/data_dump/data_processor/pytorch_processor.py CHANGED Viewed

@@ -13,13 +13,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import ctypes
 import os
 import zlib
-import ctypes
 from collections.abc import Iterable
+from concurrent.futures import ThreadPoolExecutor
 from dataclasses import asdict
 from typing import List
-from concurrent.futures import ThreadPoolExecutor
 import numpy as np
 import torch
@@ -29,7 +29,6 @@ from torch.distributed.distributed_c10d import _get_default_group
 from msprobe.core.common.const import Const
 from msprobe.core.common.decorator import recursion_depth_decorator
 from msprobe.core.common.exceptions import MsprobeException
-from msprobe.core.common.file_utils import path_len_exceeds_limit
 from msprobe.core.common.log import logger
 from msprobe.core.common.utils import convert_tuple, is_int
 from msprobe.core.data_dump.data_processor.base import BaseDataProcessor, ModuleBackwardInputsOutputs, \
@@ -48,15 +47,28 @@ class TensorHandler:
     def __init__(self):
         self.has_dtensor = hasattr(dist, "tensor") and hasattr(dist.tensor, "DTensor")
         self.has_fake_tensor = hasattr(torch, "_subclasses") and hasattr(torch._subclasses, "fake_tensor")
+        self.has_async_collective_tensor = hasattr(dist, "_functional_collectives") and \
+                                           hasattr(dist._functional_collectives, "AsyncCollectiveTensor")
+    @staticmethod
+    def free_tensor(tensor, tensor_name):
+        try:
+            tensor.untyped_storage().resize_(0)
+        except Exception as e:
+            logger.warning(f"Failed to free tensor: {tensor_name}, the detail info: {e}.")
     def is_dtensor(self, tensor):
-        return self.has_dtensor and isinstance(tensor, torch.distributed.tensor.DTensor)
+        return self.has_dtensor and isinstance(tensor, dist.tensor.DTensor)
     def is_fake_tensor(self, tensor):
         return self.has_fake_tensor and isinstance(tensor, torch._subclasses.fake_tensor.FakeTensor)
+    def is_async_collective_tensor(self, tensor):
+        return self.has_async_collective_tensor and \
+            isinstance(tensor, dist._functional_collectives.AsyncCollectiveTensor)
     def is_empty_data(self, tensor):
-        return tensor.is_meta or self.is_fake_tensor(tensor)
+        return tensor.is_meta or self.is_fake_tensor(tensor) or self.is_async_collective_tensor(tensor)
     def convert_common_tensor(self, tensor):
         if self.is_dtensor(tensor):
@@ -71,6 +83,8 @@ class TensorHandler:
             return Const.DTENSOR_TYPE
         if self.is_fake_tensor(tensor):
             return Const.FAKE_TENSOR_TYPE
+        if self.is_async_collective_tensor(tensor):
+            return Const.AC_TENSOR_TYPE
         return Const.TENSOR_TYPE
     def get_dtensor_info(self, tensor):
@@ -94,6 +108,18 @@ class TensorHandler:
         dtensor_info.update({"placements": placements})
         return dtensor_info
+    def save_tensor(self, tensor, file_path):
+        common_tensor = self.convert_common_tensor(tensor)
+        if self.is_empty_data(common_tensor):
+            logger.debug(f"Saving fake tensor or meta tensor is not supported, the current tensor is {file_path}.")
+            return
+        if common_tensor.untyped_storage().data_ptr() == 0:
+            logger.debug(f"Saving null-pointer tensor is not supported, the current tensor is {file_path}.")
+            return
+        saved_tensor = common_tensor.clone().contiguous().detach()
+        save_pt(saved_tensor, file_path)
+        self.free_tensor(saved_tensor, file_path)
 class PytorchDataProcessor(BaseDataProcessor):
     pytorch_special_type = (
@@ -288,7 +314,7 @@ class PytorchDataProcessor(BaseDataProcessor):
     def dump_async_data(self):
         for file_path, tensor in self._async_dump_cache.items():
-            save_pt(tensor.contiguous(), file_path)
+            self.tensor_handler.save_tensor(tensor, file_path)
         self._async_dump_cache.clear()
     def analyze_single_element(self, element, suffix_stack):
@@ -385,24 +411,24 @@ class PytorchDataProcessor(BaseDataProcessor):
     def _analyze_and_save_tensor(self, tensor, suffix):
         dump_data_name, file_path = self.get_save_file_path(suffix)
         single_arg = PytorchDataProcessor._analyze_tensor(self, tensor, suffix)
-        if self.tensor_handler.is_empty_data(tensor) or tensor.untyped_storage().data_ptr() == 0:
-            logger.debug(
-                "Collecting real data of fake tensor or meta tensor is not supported or data_ptr is 0, "
-                f"the current api/module name is {self.current_api_or_module_name}."
-            )
+        common_tensor = self.tensor_handler.convert_common_tensor(tensor)
+        if self.tensor_handler.is_empty_data(common_tensor):
+            logger.debug(f"Saving fake tensor or meta tensor is not supported, the current tensor is {file_path}.")
+            return single_arg
+        if common_tensor.untyped_storage().data_ptr() == 0:
+            logger.debug(f"Saving null-pointer tensor is not supported, the current tensor is {file_path}.")
             return single_arg
         single_arg.update({"data_name": dump_data_name})
         if self.config.async_dump:
-            self._async_dump_cache[file_path] = tensor.clone().detach()
+            self._async_dump_cache[file_path] = common_tensor.clone().detach()
         else:
-            saved_tensor = tensor.clone().contiguous().detach()
-            save_pt(saved_tensor, file_path)
+            self.tensor_handler.save_tensor(common_tensor, file_path)
         return single_arg
     def _analyze_and_save_ndarray(self, ndarray, suffix):
         dump_data_name, file_path = self.get_save_file_path(suffix)
-        save_pt(torch.tensor(ndarray), file_path)
+        self.tensor_handler.save_tensor(torch.tensor(ndarray), file_path)
         ndarray_json = PytorchDataProcessor._analyze_ndarray(ndarray, suffix)
         ndarray_json.update({"data_name": dump_data_name})
         return ndarray_json
@@ -493,7 +519,7 @@ class OverflowCheckDataProcessor(PytorchDataProcessor):
             self._analyze_maybe_overflow_flag()
         if self.has_overflow:
             for file_path, tensor in self.cached_tensors_and_file_paths.items():
-                save_pt(tensor.clone().contiguous().detach(), file_path)
+                self.tensor_handler.save_tensor(tensor, file_path)
             self.real_overflow_nums += 1
             if self.overflow_nums != -1 and self.real_overflow_nums >= self.overflow_nums:
                 logger.info(f"[{Const.TOOL_NAME}] Reached the preset overflow times, "
@@ -538,10 +564,7 @@ class OverflowCheckDataProcessor(PytorchDataProcessor):
     def _analyze_tensor(self, tensor, suffix):
         dump_data_name, file_path = self.get_save_file_path(suffix)
-        if not path_len_exceeds_limit(file_path):
-            self.cached_tensors_and_file_paths.update({file_path: tensor})
-        else:
-            logger.warning(f'The file path {file_path} length exceeds limit.')
+        self.cached_tensors_and_file_paths.update({file_path: tensor})
         single_arg = super()._analyze_tensor(tensor, suffix)
         single_arg.update({"data_name": dump_data_name})
         if not self.has_overflow and self.support_inf_nan:

msprobe/core/data_dump/json_writer.py CHANGED Viewed

@@ -13,18 +13,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import concurrent
+import copy
 import csv
 import os
-import copy
 import threading
 import traceback
 from datetime import datetime, timezone, timedelta
-import concurrent
 from msprobe.core.common.const import Const, FileCheckConst
-from msprobe.core.common.file_utils import change_mode, FileOpen, save_json, load_json, check_path_before_create
-from msprobe.core.common.log import logger
 from msprobe.core.common.decorator import recursion_depth_decorator
+from msprobe.core.common.file_utils import change_mode, FileOpen, save_json, check_path_before_create
+from msprobe.core.common.log import logger
 lock = threading.Lock()
@@ -40,6 +40,7 @@ class DataWriter:
         self.debug_file_path = None
         self.dump_error_info_path = None
         self.flush_size = 1000
+        self.md5_flush_size = 5000
         self.larger_flush_size = 20000
         self.cache_data = {}
         self.cache_stack = {}
@@ -49,6 +50,7 @@ class DataWriter:
         self._error_log_initialized = False
         self._cache_logged_error_types = set()
         self.crc32_stack_list = []
+        self.data_updated = False
     @staticmethod
     def write_data_to_csv(result: list, result_header: tuple, file_path: str):
@@ -60,7 +62,7 @@ class DataWriter:
             spawn_writer = csv.writer(csv_file)
             if not is_exists:
                 spawn_writer.writerow(result_header)
-            spawn_writer.writerows([result,])
+            spawn_writer.writerows([result, ])
         is_new_file = not is_exists
         if is_new_file:
             change_mode(file_path, FileCheckConst.DATA_FILE_AUTHORITY)
@@ -190,7 +192,7 @@ class DataWriter:
         summary_mode = getattr(cfg, "summary_mode", None)
         if summary_mode == Const.MD5:
-            threshold = self.flush_size
+            threshold = self.md5_flush_size
         else:
             threshold = self.flush_size if length < self.larger_flush_size else self.larger_flush_size
@@ -238,6 +240,7 @@ class DataWriter:
                 logger.warning(f"The dump data({dump_data}) should be a dict.")
                 return
+            self.data_updated = True
             key = next(iter(new_data.keys()))
             if key in dump_data:
                 dump_data.get(key).update(new_data.get(key))
@@ -246,6 +249,7 @@ class DataWriter:
     def update_stack(self, name, stack_data):
         with lock:
+            self.data_updated = True
             api_list = self.cache_stack.get(stack_data)
             if api_list is None:
                 self.cache_stack.update({stack_data: [name]})
@@ -254,10 +258,12 @@ class DataWriter:
     def update_construct(self, new_data):
         with lock:
+            self.data_updated = True
             self.cache_construct.update(new_data)
     def update_debug(self, new_data):
         with lock:
+            self.data_updated = True
             self.cache_debug['data'].update(new_data)
     def write_data_json(self, file_path):
@@ -324,17 +330,21 @@ class DataWriter:
             stat_result = self.flush_stat_stack()
             # 遍历 cache_data，将占位符替换为最终统计值
             if stat_result:
+                self.data_updated = True
                 self._replace_stat_placeholders(self.cache_data, stat_result)
                 if self.cache_debug:
                     self._replace_stat_placeholders(self.cache_debug, stat_result)
-            # 2) 再 flush CRC32
             crc32_result = self.flush_crc32_stack()
             if crc32_result:
+                self.data_updated = True
                 self._replace_crc32_placeholders(self.cache_data, crc32_result)
                 if self.cache_debug:
                     self._replace_crc32_placeholders(self.cache_debug, crc32_result)
+            if not self.data_updated:
+                return
             if self.cache_data:
                 self.write_data_json(self.dump_file_path)
             if self.cache_stack:
@@ -343,4 +353,4 @@ class DataWriter:
                 self.write_construct_info_json(self.construct_file_path)
             if self.cache_debug:
                 self.write_debug_info_json(self.debug_file_path)
+            self.data_updated = False

msprobe/core/data_dump/scope.py CHANGED Viewed

@@ -69,8 +69,7 @@ class BaseScope(ABC):
         self.scope = scope
         self.api_list = api_list
-    @staticmethod
-    def rectify_args(scope, api_list):
+    def rectify_args(self, scope, api_list):
         if not isinstance(api_list, list):
             raise ScopeException(ScopeException.InvalidApiStr,
                                  f"api_list参数须配置为列表，实际类型为{type(api_list)}.")
@@ -104,12 +103,11 @@ class BaseScope(ABC):
 class ListScope(BaseScope):
-    @staticmethod
-    def rectify_args(scope, api_list):
+    def rectify_args(self, scope, api_list):
         if scope and api_list:
             raise ScopeException(ScopeException.ArgConflict,
                                  f"scope和api_list不可以同时配置，实际配置为scope={scope}, api_list={api_list}.")
-        return super(ListScope, ListScope).rectify_args(scope, api_list)
+        return super().rectify_args(scope, api_list)
     def check(self, name):
         if not self.scope or name in self.scope:
@@ -147,7 +145,7 @@ class RangeScope(BaseScope, ABC):
                                      f"scope参数格式错误，要求格式为api或模块完整命名，实际为{name}.")
     def rectify_args(self, scope, api_list):
-        scope, api_list = super(RangeScope, RangeScope).rectify_args(scope, api_list)
+        scope, api_list = super().rectify_args(scope, api_list)
         if scope and len(scope) != 2:
             raise ScopeException(ScopeException.InvalidScope,
                                  f"scope参数指定区间断点，须传入长度为2的列表，实际长度为{len(scope)}.")

mindstudio-probe 8.2.0__py3-none-any.whl → 8.2.1__py3-none-any.whl

mindstudio-probe 8.2.0__py3-none-any.whl → 8.2.1__py3-none-any.whl