PyPI - mindstudio-probe - Versions diffs - 8.1.2__py3-none-any.whl → 8.2.1__py3-none-any.whl - Mend

mindstudio-probe 8.1.2 (py3-none-any.whl) → 8.2.1 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (181)

{mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/METADATA +2 -2
{mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/RECORD +172 -147
msprobe/README.md +6 -6
msprobe/core/common/const.py +98 -41
msprobe/core/common/db_manager.py +256 -0
msprobe/core/common/file_utils.py +28 -5
msprobe/core/common/log.py +7 -0
msprobe/core/common/megatron_utils.py +59 -0
msprobe/core/common/parallel_state.py +193 -0
msprobe/core/common/utils.py +20 -13
msprobe/core/common_config.py +5 -0
msprobe/core/compare/acc_compare.py +140 -93
msprobe/core/compare/check.py +13 -0
msprobe/core/compare/compare_cli.py +64 -6
msprobe/core/compare/config.py +10 -8
msprobe/core/compare/diff_analyze/diff_analyze_threshold.yaml +14 -0
msprobe/core/compare/diff_analyze/first_diff_analyze.py +135 -0
msprobe/core/compare/diff_analyze/ignore_op_list.yaml +3 -0
msprobe/core/compare/find_first/__init__.py +0 -0
msprobe/core/compare/find_first/analyzer.py +282 -0
msprobe/core/compare/find_first/data_processor.py +35 -0
msprobe/core/compare/find_first/graph.py +188 -0
msprobe/core/compare/find_first/utils.py +189 -0
msprobe/core/compare/highlight.py +74 -101
msprobe/core/compare/layer_mapping/layer_mapping.py +14 -9
msprobe/core/compare/merge_result/merge_result.py +2 -2
msprobe/core/compare/multiprocessing_compute.py +45 -28
msprobe/core/compare/npy_compare.py +7 -10
msprobe/core/compare/utils.py +338 -130
msprobe/core/config_check/checkers/dataset_checker.py +2 -1
msprobe/core/config_check/checkers/env_args_checker.py +5 -5
msprobe/core/config_check/checkers/hyperparameter_checker.py +30 -10
msprobe/core/config_check/checkers/pip_checker.py +4 -3
msprobe/core/config_check/checkers/random_checker.py +3 -3
msprobe/core/config_check/checkers/weights_checker.py +2 -1
msprobe/core/config_check/ckpt_compare/megatron_loader.py +2 -0
msprobe/core/config_check/resource/hyperparameter.yaml +11 -1
msprobe/core/config_check/utils/hyperparameter_parser.py +7 -3
msprobe/core/config_check/utils/utils.py +10 -0
msprobe/core/data_dump/api_registry.py +49 -30
msprobe/core/data_dump/data_collector.py +71 -29
msprobe/core/data_dump/data_processor/base.py +2 -0
msprobe/core/data_dump/data_processor/mindspore_processor.py +47 -53
msprobe/core/data_dump/data_processor/pytorch_processor.py +227 -93
msprobe/core/data_dump/json_writer.py +81 -7
msprobe/core/data_dump/scope.py +4 -6
msprobe/core/hook_manager.py +129 -70
msprobe/core/monitor/csv2db.py +361 -0
msprobe/core/monitor/db_utils.py +278 -0
msprobe/core/monitor/utils.py +35 -1
msprobe/core/service.py +31 -39
msprobe/core/single_save/single_comparator.py +16 -3
msprobe/docs/01.installation.md +51 -19
msprobe/docs/02.config_introduction.md +16 -20
msprobe/docs/03.config_examples.md +26 -0
msprobe/docs/04.kernel_dump_PyTorch.md +1 -1
msprobe/docs/05.data_dump_PyTorch.md +6 -2
msprobe/docs/06.data_dump_MindSpore.md +44 -7
msprobe/docs/07.accuracy_checker_PyTorch.md +1 -1
msprobe/docs/10.accuracy_compare_PyTorch.md +124 -44
msprobe/docs/11.accuracy_compare_MindSpore.md +75 -7
msprobe/docs/14.data_parse_PyTorch.md +1 -1
msprobe/docs/19.monitor.md +94 -7
msprobe/docs/21.visualization_PyTorch.md +71 -101
msprobe/docs/22.visualization_MindSpore.md +69 -119
msprobe/docs/23.generate_operator_PyTorch.md +1 -1
msprobe/docs/25.tool_function_introduction.md +0 -1
msprobe/docs/26.data_dump_PyTorch_baseline.md +7 -7
msprobe/docs/28.debugger_save_instruction.md +184 -81
msprobe/docs/29.data_dump_MSAdapter.md +6 -0
msprobe/docs/31.config_check.md +4 -2
msprobe/docs/36.calculation_result_change.md +75 -0
msprobe/docs/FAQ.md +22 -1
msprobe/docs/data_dump_MindSpore/dynamic_graph_quick_start_example.md +6 -2
msprobe/docs/img/compare_result.png +0 -0
msprobe/docs/img/visualization/vis_browser_1.png +0 -0
msprobe/docs/img/visualization/vis_match_info.png +0 -0
msprobe/docs/img/visualization/vis_precision_info.png +0 -0
msprobe/docs/img/visualization/vis_search_info.png +0 -0
msprobe/docs/img/visualization/vis_show_info.png +0 -0
msprobe/docs/img/visualization/vis_showcase.png +0 -0
msprobe/docs/img/visualization/vis_unmatch_info.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/1.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/2.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/3.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/4.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/5.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/6.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/7.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory-qwen25vl.txt +59 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory1.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory2.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed-mm-qwen25vl.txt +80 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed1.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed2.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactory_mapping.md +330 -0
msprobe/mindspore/__init__.py +1 -1
msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +1 -1
msprobe/mindspore/api_accuracy_checker/api_runner.py +9 -6
msprobe/mindspore/api_accuracy_checker/compute_element.py +18 -12
msprobe/mindspore/cell_processor.py +64 -25
msprobe/mindspore/common/utils.py +51 -7
msprobe/mindspore/compare/common_dir_compare.py +45 -37
msprobe/mindspore/compare/ms_compare.py +10 -2
msprobe/mindspore/compare/ms_graph_compare.py +47 -52
msprobe/mindspore/debugger/debugger_config.py +18 -7
msprobe/mindspore/debugger/precision_debugger.py +16 -12
msprobe/mindspore/dump/cell_dump_process.py +130 -68
msprobe/mindspore/dump/cell_dump_with_insert_gradient.py +10 -2
msprobe/mindspore/dump/graph_mode_cell_dump.py +35 -9
msprobe/mindspore/dump/graph_tensor_dump.py +11 -0
msprobe/mindspore/dump/hook_cell/api_register.py +19 -20
msprobe/mindspore/dump/hook_cell/hook_cell.py +12 -34
msprobe/mindspore/dump/hook_cell/ms_hook_manager.py +142 -21
msprobe/mindspore/dump/kernel_kbyk_dump.py +24 -0
msprobe/mindspore/exception_dump/__init__.py +0 -0
msprobe/mindspore/exception_dump/exception_dump_tool_factory.py +51 -0
msprobe/mindspore/exception_dump/kernel_graph_exception_dump.py +57 -0
msprobe/mindspore/free_benchmark/api_pynative_self_check.py +5 -4
msprobe/mindspore/mindspore_service.py +2 -2
msprobe/mindspore/mindtorch/mindtorch_adaptor.py +12 -7
msprobe/mindspore/monitor/features.py +82 -0
msprobe/mindspore/monitor/module_hook.py +168 -10
msprobe/mindspore/monitor/utils.py +27 -1
msprobe/mindspore/ms_config.py +12 -4
msprobe/mindspore/overflow_check/overflow_check_tool_factory.py +1 -1
msprobe/mindspore/task_handler_factory.py +3 -1
msprobe/nan_analyze/graph.py +1 -1
msprobe/pytorch/api_accuracy_checker/common/config.py +3 -36
msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +0 -24
msprobe/pytorch/api_accuracy_checker/compare/compare.py +2 -12
msprobe/pytorch/api_accuracy_checker/config.yaml +1 -6
msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +2 -2
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +12 -132
msprobe/pytorch/common/utils.py +1 -21
msprobe/pytorch/compare/pt_compare.py +10 -2
msprobe/pytorch/{hook_module/jit_script_wrapper.py → compare/pt_diff_analyze.py} +3 -15
msprobe/pytorch/compare/utils.py +2 -1
msprobe/pytorch/debugger/debugger_config.py +18 -23
msprobe/pytorch/dump/module_dump/hook_wrapper.py +10 -7
msprobe/pytorch/dump/module_dump/module_processer.py +41 -19
msprobe/pytorch/free_benchmark/main.py +7 -4
msprobe/pytorch/hook_module/api_register.py +62 -24
msprobe/pytorch/hook_module/hook_module.py +9 -29
msprobe/pytorch/hook_module/pt_hook_manager.py +84 -15
msprobe/pytorch/hook_module/script_wrapper.py +140 -0
msprobe/pytorch/hook_module/support_wrap_ops.yaml +6 -0
msprobe/pytorch/monitor/csv2tb.py +1 -1
msprobe/pytorch/monitor/features.py +94 -0
msprobe/pytorch/monitor/module_hook.py +221 -81
msprobe/pytorch/monitor/module_metric.py +27 -1
msprobe/pytorch/monitor/optimizer_collect.py +109 -4
msprobe/pytorch/online_dispatch/dispatch.py +42 -24
msprobe/pytorch/online_dispatch/dump_compare.py +1 -1
msprobe/pytorch/parse_tool/lib/visualization.py +0 -1
msprobe/pytorch/pt_config.py +2 -51
msprobe/pytorch/pytorch_service.py +7 -14
msprobe/visualization/builder/graph_builder.py +192 -63
msprobe/visualization/builder/graph_merger.py +986 -0
msprobe/visualization/builder/msprobe_adapter.py +17 -15
msprobe/visualization/compare/graph_comparator.py +26 -16
msprobe/visualization/db_utils.py +252 -0
msprobe/visualization/graph/base_node.py +2 -22
msprobe/visualization/graph/distributed_analyzer.py +12 -12
msprobe/visualization/graph/graph.py +44 -16
msprobe/visualization/graph_service.py +143 -59
msprobe/visualization/utils.py +103 -4
msprobe/docs/08.accuracy_checker_online_PyTorch.md +0 -295
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py +0 -205
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +0 -378
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +0 -239
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/dump_dispatch.py +0 -115
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +0 -250
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/torch_ops_config.yaml +0 -63
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/utils.py +0 -198
msprobe/pytorch/attl_manager.py +0 -65
{mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/LICENSE +0 -0
{mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/WHEEL +0 -0
{mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/entry_points.txt +0 -0
{mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/top_level.txt +0 -0
msprobe/{pytorch/api_accuracy_checker/tensor_transport_layer → core/compare/diff_analyze}/__init__.py +0 -0

msprobe/mindspore/monitor/module_hook.py CHANGED Viewed

@@ -13,11 +13,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from gzip import FEXTRA
 import os
 import re
 import uuid
 from collections import defaultdict
 from datetime import datetime
+from functools import partial
 import pytz
 import pandas as pd
@@ -27,16 +29,18 @@ from mindspore import nn, _no_grad
 from msprobe.core.common.log import logger
 from msprobe.core.common.const import MonitorConst, Const
-from msprobe.core.common.file_utils import load_json, save_json
+from msprobe.core.common.file_utils import load_json, save_json, make_dir
 from msprobe.core.monitor.utils import validate_config, get_output_base_dir, get_target_output_dir
 from msprobe.core.monitor.anomaly_processor import AnomalyScanner, AnomalyDataFactory, AnomalyDataWriter
 from msprobe.mindspore.common.utils import is_mindtorch
-from msprobe.mindspore.monitor.common_func import is_valid_instance, get_parameters, get_submodules, get_rank
+from msprobe.mindspore.monitor.common_func import is_valid_instance, get_parameters, get_submodules, get_rank, \
+    comm_is_initialized
 from msprobe.mindspore.monitor.utils import get_summary_writer_tag_name, step_accumulates_one, is_skip_step, \
-    get_metrics
+    get_metrics, get_entropy_metric, get_sr_metric
 from msprobe.mindspore.monitor.optimizer_collect import OptimizerMonFactory
 from msprobe.mindspore.monitor.data_writers import CSVWriterWithAD, BaseWriterWithAD, WriterInput
 from msprobe.mindspore.monitor.distributed.wrap_distributed import api_register, create_hooks, op_aggregate
+from msprobe.mindspore.monitor.features import cal_qkt
 from msprobe.core.common.file_utils import write_df_to_csv
 from msprobe.core.common.utils import analyze_api_call_stack
@@ -76,13 +80,24 @@ def param_is_data_parallel_duplicate(dp_group):
 def squash_param_name(param_name):
-    for pattern in ['layers?\.(.*)', 'embeddings?\.(.*)', 'final.*', 'output.*', 'norm.*']:
+    for pattern in ['^.*\.(layers?\..*)', '^.*\.(embeddings?\..*)', '^.*\.(final.*)', '^.*\.(output.*)',
+                    '^.*\.(norm.*)']:
         match = re.findall(pattern, param_name)
         if match:
             return match[0]
     return param_name
+def is_recording_module(module_name, l2_targets, vpp_stage):
+    if len(l2_targets) > 0:
+        for pattern in [vpp_stage + squash_param_name(module_name), vpp_stage + module_name]:
+            if pattern in l2_targets:
+                return pattern
+        return ""
+    else:
+        raise NotImplementedError("If monitering l2_features, the targets should be set specifically.")
 # Used For Module Forward & Backward Collect
 class ModuleHookContext:
     def __init__(self, module_name) -> None:
@@ -99,6 +114,19 @@ class ModuleHookContext:
         self.actvgrad.clear()
+class FeatureHookContext:
+    def __init__(self, module_name):
+        self.step = 0
+        self.micro_step = 0
+        self.attention_feature = {}
+        self.linear_feature = {}
+        self.module_name = module_name
+    def reset(self):
+        self.attention_feature.clear()
+        self.linear_feature.clear()
 start_step = 0
@@ -211,6 +239,7 @@ class TrainerMon:
         # TYPE3: 会随着训练中途config配置更新或监控状态改变而重置的变量
         self.module_fwd_hook_context_by_module = defaultdict(ModuleHookContext)
         self.module_bwd_hook_context_by_module = defaultdict(ModuleHookContext)
+        self.feature_hook_context_by_module = defaultdict(FeatureHookContext)
         self.optimizer_context = defaultdict(OptimizerContext)
         self.cc_context = defaultdict(CommunicationContext)
         self.grad_context = GradContext()
@@ -244,6 +273,18 @@ class TrainerMon:
             if self.collect_times > 0:
                 self.monitoring = True
+    @staticmethod
+    def get_linear_hook_target(module):
+        if isinstance(module, nn.Embedding):
+            return ''
+        if hasattr(module, "num_embeddings") or hasattr(module, "vocab_start_index"):
+            return ''
+        for weight_name in ["weight", "wg"]:
+            if hasattr(module, weight_name) and isinstance(getattr(module, weight_name), Tensor):
+                if getattr(module, weight_name).dim() == 2:
+                    return weight_name
+        return ''
     def set_config(self):
         self.start_step = self.config.get("start_step", 0)
         self.collect_times = self.config.get("collect_times", 100000000)  # 默认大值, 目的是一直采集
@@ -268,6 +309,9 @@ class TrainerMon:
         self.cc_distribution = self.config.get("cc_distribution", {})  # communication ops
         self.stack_info = self.config.get('stack_info', False)
         self.monitor_mbs_grad = self.config.get('monitor_mbs_grad', False)
+        self.recording_l2_features = self.config.get('recording_l2_features', False)
+        self.sa_order = self.config.get('sa_order', "s,b,h,d")
         if not self.cc_distribution.get('enable', False):
             self.cc_log_only = False
@@ -320,6 +364,8 @@ class TrainerMon:
             logger.info("> momentum and variance of adam is not monitored. ")
         if not self.wg_distribution:
             logger.info("> weight grad of specified module is not monitored. ")
+        if not self.recording_l2_features:
+            logger.info("> l2 features of specified module is not monitored. ")
         if not self.mg_direction:
             logger.info('> grad and momentum direction will not be compared.')
         if not self.cc_distribution.get('enable', False):
@@ -367,6 +413,7 @@ class TrainerMon:
                     self.write_grad_tb(context.step)
                     self.write_mv_tb(context)
                     self.write_param_tb(context)
+                    self.write_features_tb(context.step)
                     if self.stack_info:
                         self.write_stack_info()
                         self.stack_info = False
@@ -391,7 +438,6 @@ class TrainerMon:
             context.step += 1
             self.dynamic_monitor(optimizer)
         def patch_step(func, optimizer):
             def wrapper(*args, **kwargs):
                 for hook in self.pre_step_hooks:
@@ -472,12 +518,18 @@ class TrainerMon:
                 continue
             vpp_stage = f'{vpp_stage}{MonitorConst.NAME_SEP}'
             targets = [x for x, _ in get_submodules(model_chunk)] if self.print_struct else self.targets.keys()
-            hooked_count += self._hook_module(targets, model_chunk, vpp_stage)
+            l2_target_names = self.config.get('l2_targets', {})
+            hooked_count += self._hook_module(targets, l2_target_names, model_chunk, vpp_stage)
         logger.info(f"> {hooked_count} modules are monitored.")
     def hook_optimizer(self, optimizer):
         def optimizer_pre_step_hook(opt, *args, **kwargs):
             context = self.optimizer_context[opt]
+            if (self.print_struct and not all(value == {} for value in self.module_struct.values())
+                    and not self.struct_printed):
+                self._save_module_struct()
+                if not self.cc_log_only:
+                    raise Exception("exit after first monitor step when print model struct")
             if is_skip_step(context.step, self.start_step, self.step_interval, self.has_collect_times,
                             self.collect_times):
                 return
@@ -623,6 +675,25 @@ class TrainerMon:
                                           use_micro_step=self.monitor_mbs_grad)
         self.summary_writer.write_metrics(self.ops, self.grad_context.post, step, 'grad_reduced')
+    def write_metrics_if_not_empty(self, features, metrics, step, hook_name):
+        if not features or len(features) == 0:
+            return
+        use_micro_step = hook_name not in ["linear_hook"]
+        self.summary_writer.write_metrics(metrics, features, step, hook_name, use_micro_step=use_micro_step)
+        features.clear()
+    def write_features_tb(self, step):
+        if not self.recording_l2_features:
+            return
+        for context in self.feature_hook_context_by_module.values():
+            num_features = len(context.attention_feature) + len(context.linear_feature)
+            if num_features == 0:
+                continue
+            self.write_metrics_if_not_empty(context.attention_feature, ["entropy", "softmax"], step,
+                                            "attention_hook")
+            self.write_metrics_if_not_empty(context.linear_feature, ["sr", "kernel_norm"], step,
+                                            "linear_hook")
     def is_target_rank(self):
         if self.module_rank_list and (self.rank not in self.module_rank_list):
             return False
@@ -695,7 +766,15 @@ class TrainerMon:
                 }
                 index += 1
-    def _hook_module(self, target_names, module, vpp_stage=''):
+    def _save_module_struct(self):
+        output_dir = os.path.join(get_output_base_dir(), 'module_struct', f'rank{self.rank}')
+        make_dir(output_dir)
+        module_struct_file = os.path.realpath(os.path.join(output_dir, 'module_struct.json'))
+        save_json(module_struct_file, self.module_struct, indent=2)
+        logger.info(f"> save module struct to {module_struct_file}")
+        self.struct_printed = True
+    def _hook_module(self, target_names, l2_target_names, module, vpp_stage=''):
         if not is_valid_instance(module):
             # nothing to hook
             return 0
@@ -785,7 +864,8 @@ class TrainerMon:
             return
         def fwd_hook_register(module, fwd_hook_fun, name):
-            if mindspore.__version__ >= '2.6.0':
+            from packaging import version
+            if version.parse(mindspore.__version__) >= version.parse('2.6.0'):
                 def wrapper(module, args, kwargs, module_output):
                     return fwd_hook_fun(module, args, kwargs, module_output, name)
                 return module.register_forward_hook(wrapper, with_kwargs=True)
@@ -795,6 +875,61 @@ class TrainerMon:
                     return fwd_hook_fun(module, args, None, module_output, name)
                 return module.register_forward_hook(wrapper)
+        def extract_attention_feature_hook(module, args, kwargs, module_output, name):
+            module_input = [tensor for tensor in args if isinstance(tensor, Tensor)]
+            if kwargs:
+                kwargs_tensors = [tensor for tensor in kwargs.values() if isinstance(tensor, Tensor)]
+                module_input.extend(kwargs_tensors)
+            if module not in self.feature_hook_context_by_module:
+                self.feature_hook_context_by_module[module] = FeatureHookContext(name)
+            context: FeatureHookContext = self.feature_hook_context_by_module[module]
+            tbtag_tensor_map = {}
+            if len(module_input) < 2:
+                logger.warning(
+                    "Calculate attention feature failed, the length of module_input in attention hook's module should "
+                    "be greater than or equal to 2.")
+            q_h = module_input[0]
+            k_h = module_input[1]
+            qkt = cal_qkt(q_h, k_h, order=self.sa_order)
+            tbtag_tensor_map.update(
+                self.build_tbtag_tensor_map(
+                    f'{context.module_name}.attention', f'{MonitorConst.NAME_SEP}{context.micro_step}',
+                    'qkt', qkt))
+            get_entropy_metric(tbtag_tensor_map, context.attention_feature)
+            context.micro_step += 1
+            if context.micro_step == self.micro_batch_number:
+                context.micro_step = 0
+                context.step += 1
+            return
+        def extract_linear_sr_hook(module, args, kwargs, module_output, name):
+            weight_name = self.get_linear_hook_target(module)
+            if weight_name == "":
+                return
+            if module not in self.feature_hook_context_by_module:
+                self.feature_hook_context_by_module[module] = FeatureHookContext(name)
+            context: FeatureHookContext = self.feature_hook_context_by_module[module]
+            if context.micro_step == self.micro_batch_number - 1:
+                tbtag_tensor_map = {}
+                value = module.weight.data
+                tbtag_tensor_map.update(
+                    self.build_tbtag_tensor_map(
+                        f'{context.module_name}.linear', f'{MonitorConst.NAME_SEP}{context.micro_step}',
+                        'sr', value))
+                get_sr_metric(tbtag_tensor_map, context.linear_feature)
+            context.micro_step += 1
+            if context.micro_step == self.micro_batch_number:
+                context.micro_step = 0
+                context.step += 1
+            return
         def stack_hook(module, args, kwargs, module_output, name):
             if module not in self.module_fwd_hook_context_by_module:
                 self.module_fwd_hook_context_by_module[module] = ModuleHookContext(name)
@@ -824,6 +959,24 @@ class TrainerMon:
                     self.module_bwd_hook_context_by_module[submodule] = ModuleHookContext(name)
                 logger.info(f"> {name} is monitored successfully")
                 hooked_count += 1
+        if not self.print_struct and self.recording_l2_features:
+            for module_name, submodule in get_submodules(module):
+                func_map = {
+                    "attention_hook": extract_attention_feature_hook,
+                    "linear_hook": extract_linear_sr_hook
+                }
+                for hook in func_map.keys():
+                    if hook in l2_target_names:
+                        temp_names = l2_target_names[hook]
+                        name = is_recording_module(module_name, temp_names, vpp_stage)
+                        if name:
+                            handle = fwd_hook_register(submodule, func_map[hook], name=name)
+                            print_feature_name = hook.split('_')[0]
+                            logger.info_on_rank_0(
+                                f'> {print_feature_name} features of {name} is monitored successfully')
+                            self.handles["L2_features"].append(handle)
+                            hooked_count += 1
         return hooked_count
     def _patch_grad_sync(self):
@@ -889,11 +1042,16 @@ class TrainerMon:
         for handle in self.handles['xy']:
             handle.remove()
         self.handles['xy'].clear()
+        for handle in self.handles['L2_features']:
+            handle.remove()
+        self.handles['L2_features'].clear()
         # 清空对应context缓存
-        for _, fwd_context in self.module_fwd_hook_context_by_module.items():
+        for fwd_context in self.module_fwd_hook_context_by_module.values():
             fwd_context.reset()
-        for _, bwd_context in self.module_bwd_hook_context_by_module.items():
+        for bwd_context in self.module_bwd_hook_context_by_module.values():
             bwd_context.reset()
+        for feature_context in self.feature_hook_context_by_module.values():
+            feature_context.reset()
         self.grad_context.reset()  # 权重梯度和激活值梯度都在这
         for handle in self.handles['wgrads']:

msprobe/mindspore/monitor/utils.py CHANGED Viewed

@@ -14,7 +14,7 @@
 # limitations under the License.
 from mindspore import dtype as mstype, Tensor
-from msprobe.mindspore.monitor.features import FUNC_MAP
+from msprobe.mindspore.monitor.features import FUNC_MAP, cal_entropy, cal_stable_rank
 def get_single_metrics(op_list, tag, tensor, eps=1e-8, output=None):
@@ -75,3 +75,29 @@ def is_skip_step(step, start_step, step_interval, has_collect_times=0, collect_t
     :return: whether skip or not, bool
     """
     return step < start_step or (step - start_step) % step_interval != 0 or has_collect_times >= collect_times
+def get_entropy_metric(tag2tensor, out_dict=None):
+    if out_dict is None:
+        out_dict = {}
+    for tag, tensor in tag2tensor.items():
+        if tag not in out_dict:
+            out_dict[tag] = {}
+        entropy, softmax = cal_entropy(tensor)
+        out_dict[tag]["entropy"] = entropy
+        out_dict[tag]["softmax"] = softmax
+    return out_dict
+def get_sr_metric(tag2tensor, out_dict=None):
+    if out_dict is None:
+        out_dict = {}
+    for tag, tensor in tag2tensor.items():
+        if "sr" not in tag:
+            continue
+        if tag not in out_dict:
+            out_dict[tag] = {}
+        sr, eig = cal_stable_rank(tensor)
+        out_dict[tag]["sr"] = sr
+        out_dict[tag]["eig"] = eig
+    return out_dict

msprobe/mindspore/ms_config.py CHANGED Viewed

@@ -57,11 +57,12 @@ class StatisticsConfig(BaseConfig):
             raise Exception("Config param [precision] is invalid, expected from [\"high\", \"low\"]")
     def _check_summary_mode(self):
-        muti_opt = ["md5", "max", "min", "mean", "l2norm"]
+        muti_opt = ["max", "min", "mean", "count", "negative zero count", "positive zero count", "nan count",
+                    "negative inf count", "positive inf count", "zero count", "l2norm", "hash", "md5"]
         if isinstance(self.summary_mode, str) and self.summary_mode not in Const.SUMMARY_MODE:
-            raise Exception("summary_mode is invalid")
+            raise Exception("summary_mode is an invalid string")
         if isinstance(self.summary_mode, list) and not all(opt in muti_opt for opt in self.summary_mode):
-            raise Exception("summary_mode is invalid")
+            raise Exception("summary_mode contains invalid option(s)")
 class OverflowCheckConfig(BaseConfig):
@@ -79,6 +80,12 @@ class OverflowCheckConfig(BaseConfig):
             raise Exception("check_mode is invalid")
+class ExceptionDumpConfig(BaseConfig):
+    def __init__(self, json_config):
+        super().__init__(json_config)
+        self.data_mode = ["all"]
 class FreeBenchmarkConfig(BaseConfig):
     def __init__(self, task_config):
         super().__init__(task_config)
@@ -128,7 +135,8 @@ TaskDict = {
     Const.OVERFLOW_CHECK: OverflowCheckConfig,
     Const.FREE_BENCHMARK: FreeBenchmarkConfig,
     Const.GRAD_PROBE: GradProbeConfig,
-    Const.STRUCTURE: StructureConfig
+    Const.STRUCTURE: StructureConfig,
+    Const.EXCEPTION_DUMP: ExceptionDumpConfig
 }

msprobe/mindspore/overflow_check/overflow_check_tool_factory.py CHANGED Viewed

@@ -32,7 +32,7 @@ class OverflowCheckToolFactory:
             Const.PYNATIVE_MODE: None
         },
         Const.KERNEL: {
-            Const.GRAPH_KBYK_MODE: None,
+            Const.GRAPH_KBYK_MODE: KernelGraphOverflowCheck,
             Const.GRAPH_GE_MODE: KernelGraphOverflowCheck,
             Const.PYNATIVE_MODE: None
         }

msprobe/mindspore/task_handler_factory.py CHANGED Viewed

@@ -18,6 +18,7 @@ from msprobe.mindspore.debugger.debugger_config import DebuggerConfig
 from msprobe.mindspore.dump.dump_tool_factory import DumpToolFactory
 from msprobe.mindspore.overflow_check.overflow_check_tool_factory import OverflowCheckToolFactory
 from msprobe.mindspore.free_benchmark.self_check_tool_factory import SelfCheckToolFactory
+from msprobe.mindspore.exception_dump.exception_dump_tool_factory import ExceptionDumpToolFactory
 class TaskHandlerFactory:
@@ -25,7 +26,8 @@ class TaskHandlerFactory:
         Const.TENSOR: DumpToolFactory,
         Const.STATISTICS: DumpToolFactory,
         Const.OVERFLOW_CHECK: OverflowCheckToolFactory,
-        Const.FREE_BENCHMARK: SelfCheckToolFactory
+        Const.FREE_BENCHMARK: SelfCheckToolFactory,
+        Const.EXCEPTION_DUMP: ExceptionDumpToolFactory
     }
     @staticmethod

msprobe/nan_analyze/graph.py CHANGED Viewed

@@ -16,8 +16,8 @@
 from dataclasses import dataclass
 from msprobe.core.common.const import Const
 from msprobe.core.common.log import logger
-from msprobe.core.common.exceptions import MsprobeException
 from msprobe.nan_analyze.utils import FileCache, RankPath, is_ignore_op, check_item_anomaly, NanAnalyseConst
+from msprobe.core.common.exceptions import MsprobeException
 @dataclass

msprobe/pytorch/api_accuracy_checker/common/config.py CHANGED Viewed

@@ -24,8 +24,7 @@ from msprobe.pytorch.pt_config import RunUTConfig
 RunUtConfig = namedtuple('RunUtConfig', ['forward_content', 'backward_content', 'result_csv_path', 'details_csv_path',
                                          'save_error_data', 'is_continue_run_ut', 'real_data_path', 'white_list',
-                                         'black_list', 'error_data_path', 'online_config'])
-OnlineConfig = namedtuple('OnlineConfig', ['is_online', 'nfs_path', 'host', 'port', 'rank_list', 'tls_path'])
+                                         'black_list', 'error_data_path'])
 class Config:
@@ -46,13 +45,7 @@ class Config:
             'white_list': list,
             'black_list': list,
             'error_data_path': str,
-            'precision': int,
-            'is_online': bool,
-            'nfs_path': str,
-            'host': str,
-            'port': int,
-            'rank_list': list,
-            'tls_path': str
+            'precision': int
         }
         if key not in validators:
             raise ValueError(f"{key} must be one of {validators.keys()}")
@@ -68,10 +61,6 @@ class Config:
             RunUTConfig.check_filter_list_config(key, value)
         if key == 'error_data_path':
             RunUTConfig.check_error_data_path_config(value)
-        if key == 'nfs_path':
-            RunUTConfig.check_nfs_path_config(value)
-        if key == 'tls_path':
-            RunUTConfig.check_tls_path_config(value)
         return value
@@ -85,12 +74,6 @@ class CheckerConfig:
         self.white_list = msCheckerConfig.white_list
         self.black_list = msCheckerConfig.black_list
         self.error_data_path = msCheckerConfig.error_data_path
-        self.is_online = msCheckerConfig.is_online
-        self.nfs_path = msCheckerConfig.nfs_path
-        self.host = msCheckerConfig.host
-        self.port = msCheckerConfig.port
-        self.rank_list = msCheckerConfig.rank_list
-        self.tls_path = msCheckerConfig.tls_path
         if task_config:
             self.load_config(task_config)
@@ -99,22 +82,7 @@ class CheckerConfig:
         self.white_list = task_config.white_list
         self.black_list = task_config.black_list
         self.error_data_path = task_config.error_data_path
-        self.is_online = task_config.is_online
-        self.nfs_path = task_config.nfs_path
-        self.host = task_config.host
-        self.port = task_config.port
-        self.rank_list = task_config.rank_list
-        self.tls_path = task_config.tls_path
-    def get_online_config(self):
-        return OnlineConfig(
-            is_online=self.is_online,
-            nfs_path=self.nfs_path,
-            host=self.host,
-            port=self.port,
-            rank_list=self.rank_list,
-            tls_path=self.tls_path
-        )
     def get_run_ut_config(self, **config_params):
         return RunUtConfig(
@@ -127,6 +95,5 @@ class CheckerConfig:
             real_data_path=config_params.get('real_data_path'),
             white_list=self.white_list.copy() if self.white_list else [],
             black_list=self.black_list.copy() if self.black_list else [],
-            error_data_path=config_params.get('error_data_path'),
-            online_config=self.get_online_config()
+            error_data_path=config_params.get('error_data_path')
         )

msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py CHANGED Viewed

@@ -117,30 +117,6 @@ def api_precision_compare(config):
     change_mode(config.details_csv_path, FileCheckConst.DATA_FILE_AUTHORITY)
-def online_api_precision_compare(online_config):
-    rank = online_config.rank
-    result_csv_path = os.path.join(Const.DEFAULT_PATH, online_config.result_csv_path).replace(
-                    "_rank*.csv", f"_rank{rank}.csv")
-    details_csv_path = os.path.join(Const.DEFAULT_PATH, online_config.details_csv_path).replace(
-                    "_rank*.csv", f"_rank{rank}.csv")
-    detail_csv_title = [ApiPrecisionCompareColumn.get_detail_csv_title()]
-    result_csv_title = [ApiPrecisionCompareColumn.get_result_csv_title()]
-    if not os.path.exists(result_csv_path):
-        write_csv(result_csv_title, result_csv_path)
-    if not os.path.exists(details_csv_path):
-        write_csv(detail_csv_title, details_csv_path)
-    config = CompareConfig("", "", result_csv_path, details_csv_path)
-    try:
-        npu_data, gpu_data = online_config.npu_data, online_config.gpu_data
-        check_csv_columns(npu_data.columns, "npu_csv")
-        check_csv_columns(gpu_data.columns, "gpu_csv")
-        analyse_csv(npu_data, gpu_data, config)
-    except Exception as err:
-        logger.error(f"Online api precision compare Error: {str(err)}")
-    change_mode(result_csv_path, FileCheckConst.DATA_FILE_AUTHORITY)
-    change_mode(details_csv_path, FileCheckConst.DATA_FILE_AUTHORITY)
 def analyse_csv(npu_data, gpu_data, config):
     forward_status, backward_status = [], []
     last_api_name, last_api_dtype, last_api_full_name = None, None, None

msprobe/pytorch/api_accuracy_checker/compare/compare.py CHANGED Viewed

@@ -66,13 +66,6 @@ class Comparator:
         self.save_path_list = [result_csv_path]
         self.detail_save_path_list = [details_csv_path]
-        if config and config.online_config.is_online:
-            self.save_path_str = result_csv_path.replace(".csv", "_rank{}.csv")
-            self.detail_save_path_str = details_csv_path.replace(".csv", "_rank{}.csv")
-            self.save_path_list = [self.save_path_str.format(rank) for rank in config.online_config.rank_list]
-            self.detail_save_path_list = \
-                [self.detail_save_path_str.format(rank) for rank in config.online_config.rank_list]
         self.registry = self._register_compare_func()
         if not is_continue_run_ut:
@@ -245,9 +238,8 @@ class Comparator:
         self.write_detail_csv(args)
-    def compare_output(self, full_api_name, data_info, is_online=False):
+    def compare_output(self, full_api_name, data_info):
         """Get compare result and write to result and detail csv.
-        is_online: bool, default False. True: called by online api precision compare, only compare without write to csv.
         """
         _, api_name = extract_basic_api_segments(full_api_name)
         if not api_name:
@@ -280,9 +272,7 @@ class Comparator:
                                  fwd_compare_alg_results,
                                  bwd_compare_alg_results,
                                  data_info.rank)
-        if is_online:
-            # get run_ut compare detail
-            return self._get_run_ut_detail(result_info)
         self.record_results(result_info)
         return fwd_success_status == CompareConst.PASS, bwd_success_status == CompareConst.PASS \
                or bwd_success_status == CompareConst.SPACE

msprobe/pytorch/api_accuracy_checker/config.yaml CHANGED Viewed

@@ -2,9 +2,4 @@ white_list: []
 black_list: []
 error_data_path: './'
 precision: 14
-is_online: False
-nfs_path: ""
-host: ""
-port: -1
-rank_list: [0]
-tls_path: "./"

msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py CHANGED Viewed

@@ -84,8 +84,8 @@ def split_json_file(input_file, num_splits, filter_api):
             for file in split_files:
                 try:
                     remove_path(file)
-                except FileNotFoundError:
-                    logger.error(f"File not found and could not be deleted: {file}")
+                except Exception:
+                    logger.error(f"File not found or could not be deleted: {file}")
             msg = 'ERROR: Split json file failed, please check the input file and try again.'
             raise CompareException(CompareException.PARSE_FILE_ERROR, msg) from e
     return split_files, total_items

mindstudio-probe 8.1.2__py3-none-any.whl → 8.2.1__py3-none-any.whl

mindstudio-probe 8.1.2__py3-none-any.whl → 8.2.1__py3-none-any.whl