PyPI - mindstudio-probe - Versions diffs - 1.3.0__py3-none-any.whl → 8.1.1__py3-none-any.whl - Mend

mindstudio-probe 1.3.0 (py3-none-any.whl) → 8.1.1 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (213)

{mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/METADATA +4 -2
{mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/RECORD +204 -152
msprobe/README.md +32 -1
msprobe/core/__init__.py +17 -0
msprobe/core/common/const.py +120 -21
msprobe/core/common/exceptions.py +2 -2
msprobe/core/common/file_utils.py +279 -50
msprobe/core/common/framework_adapter.py +169 -0
msprobe/core/common/global_lock.py +86 -0
msprobe/core/common/runtime.py +25 -0
msprobe/core/common/utils.py +136 -45
msprobe/core/common_config.py +7 -0
msprobe/core/compare/acc_compare.py +646 -428
msprobe/core/compare/check.py +36 -103
msprobe/core/compare/compare_cli.py +4 -0
msprobe/core/compare/config.py +72 -0
msprobe/core/compare/highlight.py +215 -215
msprobe/core/compare/layer_mapping/layer_mapping.py +2 -0
msprobe/core/compare/merge_result/merge_result.py +4 -4
msprobe/core/compare/multiprocessing_compute.py +223 -110
msprobe/core/compare/npy_compare.py +2 -4
msprobe/core/compare/utils.py +214 -244
msprobe/core/config_check/__init__.py +17 -0
msprobe/{pytorch/dump/kernel_dump/kernel_config.py → core/config_check/checkers/__init__.py} +8 -16
msprobe/core/config_check/checkers/base_checker.py +60 -0
msprobe/core/config_check/checkers/dataset_checker.py +138 -0
msprobe/core/config_check/checkers/env_args_checker.py +96 -0
msprobe/core/config_check/checkers/hyperparameter_checker.py +170 -0
msprobe/core/config_check/checkers/pip_checker.py +90 -0
msprobe/core/config_check/checkers/random_checker.py +367 -0
msprobe/core/config_check/checkers/weights_checker.py +147 -0
msprobe/core/config_check/ckpt_compare/ckpt_comparator.py +74 -0
msprobe/core/config_check/ckpt_compare/megatron_loader.py +302 -0
msprobe/core/config_check/ckpt_compare/metrics.py +83 -0
msprobe/core/config_check/ckpt_compare/name_mapping.yaml +12 -0
msprobe/core/config_check/config_check_cli.py +51 -0
msprobe/core/config_check/config_checker.py +100 -0
msprobe/{mindspore/runtime.py → core/config_check/resource/dependency.yaml} +7 -4
msprobe/core/config_check/resource/env.yaml +57 -0
msprobe/core/config_check/resource/hyperparameter.yaml +21 -0
msprobe/core/config_check/utils/hyperparameter_parser.py +115 -0
msprobe/core/config_check/utils/utils.py +107 -0
msprobe/core/data_dump/api_registry.py +67 -4
msprobe/core/data_dump/data_collector.py +170 -89
msprobe/core/data_dump/data_processor/base.py +72 -51
msprobe/core/data_dump/data_processor/mindspore_processor.py +109 -55
msprobe/core/data_dump/data_processor/pytorch_processor.py +90 -82
msprobe/core/data_dump/json_writer.py +143 -27
msprobe/core/debugger/precision_debugger.py +144 -0
msprobe/core/grad_probe/constant.py +1 -1
msprobe/core/grad_probe/grad_compare.py +1 -1
msprobe/core/grad_probe/utils.py +1 -1
msprobe/core/hook_manager.py +242 -0
msprobe/core/monitor/anomaly_processor.py +384 -0
msprobe/core/service.py +357 -0
msprobe/core/single_save/__init__.py +0 -0
msprobe/core/single_save/single_comparator.py +243 -0
msprobe/core/single_save/single_saver.py +146 -0
msprobe/docs/01.installation.md +6 -5
msprobe/docs/02.config_introduction.md +79 -22
msprobe/docs/03.config_examples.md +1 -0
msprobe/docs/04.kernel_dump_PyTorch.md +1 -1
msprobe/docs/05.data_dump_PyTorch.md +118 -49
msprobe/docs/06.data_dump_MindSpore.md +167 -20
msprobe/docs/07.accuracy_checker_PyTorch.md +2 -2
msprobe/docs/08.accuracy_checker_online_PyTorch.md +69 -9
msprobe/docs/09.accuracy_checker_MindSpore.md +18 -6
msprobe/docs/10.accuracy_compare_PyTorch.md +212 -74
msprobe/docs/11.accuracy_compare_MindSpore.md +87 -37
msprobe/docs/12.overflow_check_PyTorch.md +2 -2
msprobe/docs/13.overflow_check_MindSpore.md +2 -2
msprobe/docs/14.data_parse_PyTorch.md +3 -3
msprobe/docs/17.grad_probe.md +2 -1
msprobe/docs/18.online_dispatch.md +2 -2
msprobe/docs/19.monitor.md +90 -44
msprobe/docs/21.visualization_PyTorch.md +68 -15
msprobe/docs/22.visualization_MindSpore.md +71 -18
msprobe/docs/25.tool_function_introduction.md +23 -22
msprobe/docs/26.data_dump_PyTorch_baseline.md +14 -3
msprobe/docs/27.dump_json_instruction.md +1 -1
msprobe/docs/28.debugger_save_instruction.md +111 -20
msprobe/docs/29.data_dump_MSAdapter.md +2 -2
msprobe/docs/30.overflow_check_MSAdapter.md +2 -2
msprobe/docs/31.config_check.md +95 -0
msprobe/docs/32.ckpt_compare.md +69 -0
msprobe/docs/33.generate_operator_MindSpore.md +181 -0
msprobe/docs/34.RL_collect.md +92 -0
msprobe/docs/35.nan_analyze.md +72 -0
msprobe/docs/data_dump_MindSpore/data_dump_MindSpore_baseline.md +12 -1
msprobe/docs/data_dump_MindSpore/dynamic_graph_quick_start_example.md +3 -1
msprobe/docs/img/compare_result.png +0 -0
msprobe/docs/img/save_compare_result_sample.png +0 -0
msprobe/docs/img/visualization/proxy.png +0 -0
msprobe/mindspore/__init__.py +1 -2
msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +150 -58
msprobe/mindspore/api_accuracy_checker/api_runner.py +7 -3
msprobe/mindspore/api_accuracy_checker/bench_functions/flash_attention_score.py +47 -69
msprobe/mindspore/api_accuracy_checker/cmd_parser.py +4 -0
msprobe/mindspore/api_accuracy_checker/compute_element.py +0 -1
msprobe/mindspore/api_accuracy_checker/data_manager.py +2 -2
msprobe/mindspore/api_accuracy_checker/generate_op_script/op_generator.py +460 -0
msprobe/mindspore/api_accuracy_checker/generate_op_script/operator_replication.template +2081 -0
msprobe/mindspore/api_accuracy_checker/multi_api_accuracy_checker.py +9 -0
msprobe/mindspore/api_accuracy_checker/torch_mindtorch_importer.py +2 -1
msprobe/mindspore/cell_processor.py +204 -33
msprobe/mindspore/code_mapping/graph_parser.py +4 -21
msprobe/mindspore/common/const.py +17 -7
msprobe/mindspore/common/utils.py +128 -11
msprobe/mindspore/compare/common_dir_compare.py +382 -0
msprobe/mindspore/compare/distributed_compare.py +2 -26
msprobe/mindspore/compare/ms_compare.py +17 -405
msprobe/mindspore/compare/ms_graph_compare.py +14 -5
msprobe/mindspore/compare/utils.py +37 -0
msprobe/mindspore/debugger/debugger_config.py +53 -3
msprobe/mindspore/debugger/precision_debugger.py +72 -91
msprobe/mindspore/dump/cell_dump_process.py +877 -0
msprobe/mindspore/dump/cell_dump_with_insert_gradient.py +864 -0
msprobe/mindspore/dump/dump_tool_factory.py +13 -5
msprobe/mindspore/dump/graph_mode_cell_dump.py +139 -0
msprobe/mindspore/dump/graph_tensor_dump.py +123 -0
msprobe/mindspore/dump/hook_cell/api_register.py +40 -6
msprobe/mindspore/dump/hook_cell/hook_cell.py +18 -7
msprobe/mindspore/dump/hook_cell/ms_hook_manager.py +88 -0
msprobe/mindspore/dump/hook_cell/primitive_hooks.py +8 -2
msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml +18 -0
msprobe/mindspore/dump/jit_dump.py +21 -18
msprobe/mindspore/dump/kernel_kbyk_dump.py +6 -3
msprobe/mindspore/dym_loader/hook_dynamic_loader.cpp +110 -0
msprobe/mindspore/dym_loader/hook_dynamic_loader.h +15 -15
msprobe/mindspore/free_benchmark/api_pynative_self_check.py +12 -6
msprobe/mindspore/free_benchmark/common/utils.py +1 -1
msprobe/mindspore/grad_probe/global_context.py +7 -2
msprobe/mindspore/grad_probe/grad_stat_csv.py +3 -2
msprobe/mindspore/mindspore_service.py +114 -0
msprobe/mindspore/monitor/common_func.py +52 -0
msprobe/mindspore/monitor/data_writers.py +237 -0
msprobe/mindspore/monitor/features.py +20 -7
msprobe/mindspore/monitor/module_hook.py +281 -209
msprobe/mindspore/monitor/optimizer_collect.py +334 -0
msprobe/mindspore/monitor/utils.py +25 -5
msprobe/mindspore/ms_config.py +16 -15
msprobe/mindspore/task_handler_factory.py +5 -2
msprobe/msprobe.py +19 -0
msprobe/nan_analyze/__init__.py +14 -0
msprobe/nan_analyze/analyzer.py +255 -0
msprobe/nan_analyze/graph.py +189 -0
msprobe/nan_analyze/utils.py +211 -0
msprobe/pytorch/api_accuracy_checker/common/config.py +2 -2
msprobe/pytorch/api_accuracy_checker/compare/compare.py +36 -34
msprobe/pytorch/api_accuracy_checker/compare/compare_utils.py +20 -20
msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +4 -7
msprobe/pytorch/api_accuracy_checker/generate_op_script/operator_replication.template +204 -2
msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +12 -11
msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +1 -0
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +8 -5
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py +2 -3
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +29 -13
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +12 -2
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +45 -31
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/utils.py +156 -0
msprobe/pytorch/attl_manager.py +65 -0
msprobe/pytorch/bench_functions/npu_fusion_attention.py +27 -0
msprobe/pytorch/common/utils.py +26 -14
msprobe/pytorch/compare/distributed_compare.py +4 -36
msprobe/pytorch/compare/pt_compare.py +13 -84
msprobe/pytorch/compare/utils.py +47 -0
msprobe/pytorch/debugger/debugger_config.py +34 -17
msprobe/pytorch/debugger/precision_debugger.py +66 -118
msprobe/pytorch/dump/module_dump/hook_wrapper.py +93 -0
msprobe/pytorch/dump/module_dump/module_dump.py +11 -58
msprobe/pytorch/dump/module_dump/module_processer.py +143 -113
msprobe/pytorch/grad_probe/grad_stat_csv.py +3 -2
msprobe/pytorch/hook_module/api_register.py +29 -5
msprobe/pytorch/hook_module/hook_module.py +9 -18
msprobe/pytorch/hook_module/jit_script_wrapper.py +33 -0
msprobe/pytorch/hook_module/pt_hook_manager.py +68 -0
msprobe/pytorch/hook_module/support_wrap_ops.yaml +22 -1
msprobe/pytorch/hook_module/utils.py +28 -2
msprobe/pytorch/monitor/csv2tb.py +6 -2
msprobe/pytorch/monitor/data_writers.py +259 -0
msprobe/pytorch/monitor/module_hook.py +227 -158
msprobe/pytorch/monitor/module_metric.py +14 -0
msprobe/pytorch/monitor/optimizer_collect.py +242 -270
msprobe/pytorch/monitor/utils.py +16 -3
msprobe/pytorch/online_dispatch/dispatch.py +4 -2
msprobe/pytorch/online_dispatch/dump_compare.py +5 -2
msprobe/pytorch/parse_tool/lib/utils.py +3 -3
msprobe/pytorch/pt_config.py +8 -7
msprobe/pytorch/pytorch_service.py +73 -0
msprobe/visualization/builder/graph_builder.py +33 -13
msprobe/visualization/builder/msprobe_adapter.py +24 -11
msprobe/visualization/compare/graph_comparator.py +53 -45
msprobe/visualization/compare/mode_adapter.py +31 -1
msprobe/visualization/graph/base_node.py +3 -3
msprobe/visualization/graph/graph.py +2 -2
msprobe/visualization/graph_service.py +250 -103
msprobe/visualization/utils.py +27 -11
msprobe/mindspore/dym_loader/hook_dynamic_loader.cc +0 -106
msprobe/mindspore/monitor/anomaly_detect.py +0 -404
msprobe/mindspore/monitor/module_spec_verifier.py +0 -94
msprobe/mindspore/service.py +0 -549
msprobe/pytorch/monitor/anomaly_analyse.py +0 -201
msprobe/pytorch/monitor/anomaly_detect.py +0 -410
msprobe/pytorch/monitor/module_spec_verifier.py +0 -95
msprobe/pytorch/monitor/unittest/test_monitor.py +0 -160
msprobe/pytorch/service.py +0 -473
{mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/LICENSE +0 -0
{mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/WHEEL +0 -0
{mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/entry_points.txt +0 -0
{mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/top_level.txt +0 -0
/msprobe/{mindspore → core}/compare/ms_to_pt_api.yaml +0 -0
/msprobe/{mindspore/dump → core}/kernel_dump/kernel_config.py +0 -0
/msprobe/{pytorch/monitor/unittest → core/monitor}/__init__.py +0 -0

msprobe/core/data_dump/data_collector.py CHANGED Viewed

@@ -15,6 +15,7 @@
 import atexit
 import os
+import traceback
 from msprobe.core.data_dump.scope import ScopeFactory
 from msprobe.core.data_dump.json_writer import DataWriter
@@ -41,7 +42,7 @@ class DataCollector:
         self.backward_module_names = {}
         self.optimizer_status = ""
         self.optimizer_status_first_start = {Const.OPTIMIZER: True, Const.CLIP_GRAD: True}
-        atexit.register(self.write_json)
+        atexit.register(self.write_json_at_exit)
     @property
     def dump_data_dir(self):
@@ -78,6 +79,11 @@ class DataCollector:
     def write_json(self):
         self.data_writer.write_json()
+    def write_json_at_exit(self):
+        if self.config.async_dump and self.config.task == Const.TENSOR:
+            self.data_processor.dump_async_data()
+        self.data_writer.write_json()
     def update_data(self, name, data_info):
         msg = f"msprobe is collecting data on {name}."
         if self.config.task == Const.OVERFLOW_CHECK:
@@ -89,88 +95,155 @@ class DataCollector:
         logger.debug(msg)
         self.data_writer.update_data(data_info)
-    def forward_input_data_collect(self, name, module, pid, module_input_output, is_recompute=None):
-        if self.config.task == Const.FREE_BENCHMARK:
-            backward_name = name.replace(Const.FORWARD, Const.BACKWARD)
-            if self.check_scope_and_pid(self.scope, backward_name, pid):
-                self.data_processor.analyze_forward_input(backward_name, module, module_input_output)
-            return
-        if not self.check_scope_and_pid(self.scope, name, pid):
-            return
+    def call_stack_collect(self, name):
+        stack_info = self.data_processor.analyze_api_call_stack(name)
+        self.data_writer.update_stack(name, stack_info)
-        data_info = {}
-        if self.config.task != Const.STRUCTURE:
-            data_info = self.data_processor.analyze_forward_input(name, module, module_input_output)
-        self.set_is_recomputable(data_info, is_recompute)
-        if self.config.level == Const.LEVEL_L2:
-            return
-        self.handle_data(name, data_info, flush=self.data_processor.is_terminated)
+    def forward_input_data_collect(self, name, module, pid, module_input_output, is_recompute=None):
+        try:
+            if self.config.task == Const.FREE_BENCHMARK:
+                backward_name = name.replace(Const.FORWARD, Const.BACKWARD)
+                if self.check_scope_and_pid(self.scope, backward_name, pid):
+                    self.data_processor.analyze_forward_input(backward_name, module, module_input_output)
+                return
+            if not self.check_scope_and_pid(self.scope, name, pid):
+                return
+            data_info = {}
+            if self.config.task != Const.STRUCTURE:
+                data_info = self.data_processor.analyze_forward_input(name, module, module_input_output)
+            self.set_is_recomputable(data_info, is_recompute)
+            if self.config.level == Const.LEVEL_L2:
+                return
+            self.handle_data(name, data_info, flush=self.data_processor.is_terminated)
+        except Exception:
+            tb = traceback.format_exc()
+            self.data_writer.write_error_log(
+                f"[ERROR] forward_input_data_collect failed: name={name}, pid={pid}\n{tb}"
+            )
     def forward_output_data_collect(self, name, module, pid, module_input_output, is_recompute=None):
-        self.update_construct(name)
-        if not self.check_scope_and_pid(self.scope, name, pid):
-            return
-        data_info = {}
-        if self.config.task != Const.STRUCTURE:
-            data_info = self.data_processor.analyze_forward_output(name, module, module_input_output)
-        self.set_is_recomputable(data_info, is_recompute)
-        if self.config.level == Const.LEVEL_L2:
-            return
-        self.data_writer.update_stack(self.data_processor.analyze_api_call_stack(name))
-        self.handle_data(name, data_info, flush=self.data_processor.is_terminated)
+        try:
+            self.update_construct(name)
+            if not self.check_scope_and_pid(self.scope, name, pid):
+                return
+            data_info = {}
+            if self.config.task != Const.STRUCTURE:
+                data_info = self.data_processor.analyze_forward_output(name, module, module_input_output)
+            self.set_is_recomputable(data_info, is_recompute)
+            if self.config.level == Const.LEVEL_L2:
+                return
+            self.call_stack_collect(name)
+            self.handle_data(name, data_info, flush=self.data_processor.is_terminated)
+        except Exception:
+            tb = traceback.format_exc()
+            self.data_writer.write_error_log(
+                f"[ERROR] forward_output_data_collect failed: name={name}, pid={pid}\n{tb}"
+            )
+    def forward_data_collect_only_tensor(self, name, module, pid, module_input_output):
+        try:
+            if not self.check_scope_and_pid(self.scope, name, pid):
+                return
+            self.data_processor.analyze_forward(name, module, module_input_output)
+        except Exception:
+            tb = traceback.format_exc()
+            self.data_writer.write_error_log(
+                f"[ERROR] forward_data_collect_only_tensor failed: name={name}, pid={pid}\n{tb}"
+            )
     def forward_data_collect(self, name, module, pid, module_input_output, is_recompute=None):
-        self.update_construct(name)
-        if not self.check_scope_and_pid(self.scope, name, pid):
-            return
-        data_info = {}
-        if self.config.task != Const.STRUCTURE:
-            data_info = self.data_processor.analyze_forward(name, module, module_input_output)
-        self.set_is_recomputable(data_info, is_recompute)
-        self.data_writer.update_stack(self.data_processor.analyze_api_call_stack(name))
-        self.handle_data(name, data_info, flush=self.data_processor.is_terminated)
+        try:
+            self.update_construct(name)
+            if not self.check_scope_and_pid(self.scope, name, pid):
+                return
+            data_info = {}
+            if self.config.task != Const.STRUCTURE:
+                data_info = self.data_processor.analyze_forward(name, module, module_input_output)
+            self.set_is_recomputable(data_info, is_recompute)
+            self.call_stack_collect(name)
+            self.handle_data(name, data_info, flush=self.data_processor.is_terminated)
+        except Exception:
+            tb = traceback.format_exc()
+            self.data_writer.write_error_log(
+                f"[ERROR] forward_data_collect failed: name={name}, pid={pid}\n{tb}"
+            )
+    def backward_data_collect_only_tensor(self, name, module, pid, module_input_output, is_recompute=None):
+        try:
+            if not self.check_scope_and_pid(self.scope, name, pid):
+                return
+            self.data_processor.analyze_backward(name, module, module_input_output)
+        except Exception:
+            tb = traceback.format_exc()
+            self.data_writer.write_error_log(
+                f"[ERROR] backward_data_collect_only_tensor failed: name={name}, pid={pid}\n{tb}"
+            )
     def backward_data_collect(self, name, module, pid, module_input_output, is_recompute=None):
-        self.update_construct(name)
-        if not self.check_scope_and_pid(self.scope, name, pid):
-            return
-        data_info = {}
-        if self.config.task != Const.STRUCTURE:
-            data_info = self.data_processor.analyze_backward(name, module, module_input_output)
-        if self.config.level == Const.LEVEL_L2:
-            return
-        # 获取执行反向的模块名称
-        if data_info and name.split(Const.SEP)[0] in Const.MODULE_PREFIX:
-            module_name = name.rsplit(Const.SEP, 2)[0]
-            # 将模块名称加入到反向模块名称集合中，用于梯度收集时判断是否需要收集梯度
-            self.backward_module_names[module_name] = True
-        self.handle_data(name, data_info, flush=self.data_processor.is_terminated)
+        try:
+            self.update_construct(name)
+            if not self.check_scope_and_pid(self.scope, name, pid):
+                return
+            data_info = {}
+            if self.config.task != Const.STRUCTURE:
+                data_info = self.data_processor.analyze_backward(name, module, module_input_output)
+            if self.config.level == Const.LEVEL_L2:
+                return
+            if data_info and name.split(Const.SEP)[0] in Const.MODULE_PREFIX:
+                module_name = name.rsplit(Const.SEP, 2)[0]
+                self.backward_module_names[module_name] = True
+            self.handle_data(name, data_info, flush=self.data_processor.is_terminated)
+        except Exception:
+            tb = traceback.format_exc()
+            self.data_writer.write_error_log(
+                f"[ERROR] backward_data_collect failed: name={name}, pid={pid}\n{tb}"
+            )
     def backward_input_data_collect(self, name, module, pid, module_input_output, is_recompute=None):
-        self.update_construct(name)
-        if not self.check_scope_and_pid(self.scope, name, pid):
-            return
-        data_info = {}
-        if self.config.task != Const.STRUCTURE:
-            data_info = self.data_processor.analyze_backward_input(name, module, module_input_output)
-        self.set_is_recomputable(data_info, is_recompute)
-        self.handle_data(name, data_info)
+        try:
+            self.update_construct(name)
+            if not self.check_scope_and_pid(self.scope, name, pid):
+                return
+            data_info = {}
+            if self.config.task != Const.STRUCTURE:
+                data_info = self.data_processor.analyze_backward_input(name, module, module_input_output)
+            self.set_is_recomputable(data_info, is_recompute)
+            self.handle_data(name, data_info)
+        except Exception:
+            tb = traceback.format_exc()
+            self.data_writer.write_error_log(
+                f"[ERROR] backward_input_data_collect failed: name={name}, pid={pid}\n{tb}"
+            )
     def backward_output_data_collect(self, name, module, pid, module_input_output, is_recompute=None):
-        self.update_construct(name)
-        if not self.check_scope_and_pid(self.scope, name, pid):
-            return
-        data_info = {}
-        if self.config.task != Const.STRUCTURE:
-            data_info = self.data_processor.analyze_backward_output(name, module, module_input_output)
-        self.set_is_recomputable(data_info, is_recompute)
-        self.handle_data(name, data_info)
+        try:
+            self.update_construct(name)
+            if not self.check_scope_and_pid(self.scope, name, pid):
+                return
+            data_info = {}
+            if self.config.task != Const.STRUCTURE:
+                data_info = self.data_processor.analyze_backward_output(name, module, module_input_output)
+            self.set_is_recomputable(data_info, is_recompute)
+            self.handle_data(name, data_info)
+        except Exception:
+            tb = traceback.format_exc()
+            self.data_writer.write_error_log(
+                f"[ERROR] backward_output_data_collect failed: name={name}, pid={pid}\n{tb}"
+            )
     def update_construct(self, name):
         if self.config.level not in DataCollector.level_without_construct:
@@ -180,7 +253,10 @@ class DataCollector:
                     self.optimizer_status_first_start[self.optimizer_status] = False
                 self.data_writer.update_construct({name: self.optimizer_status})
             else:
-                self.data_writer.update_construct({name: self.module_processor.api_parent_node})
+                if self.config.level == Const.LEVEL_MIX and \
+                  not (name.startswith(Const.MODULE) or name.startswith(Const.CELL)):
+                    self.data_writer.update_construct({name: self.module_processor.api_parent_node})
             self.data_writer.update_construct(self.module_processor.module_node)
     def handle_data(self, name, data_info, flush=False):
@@ -203,28 +279,33 @@ class DataCollector:
         self.data_processor.update_iter(current_iter)
     def params_data_collect(self, name, param_name, pid, data):
-        grad_name = name + Const.SEP + Const.PARAMS_GRAD
-        # 校验scope和pid，以及当前name是否有过反向计算
-        if not self.check_scope_and_pid(self.scope, name, pid) and not self.backward_module_names.get(name):
-            # 如果没有反向计算，则需要清除之前占位写入的grad数据
-            if self.data_writer.cache_data.get("data"):
-                self.data_writer.cache_data.get("data").pop(grad_name, None)
-            return
-        data_info = self.data_processor.analyze_params(grad_name, param_name, data)
-        self.handle_data(grad_name, data_info, flush=self.data_processor.is_terminated)
-    def fill_stack_tensor_data(self):
-        self.data_writer.fill_stack_tensor_data()
+        try:
+            grad_name = name + Const.SEP + Const.PARAMS_GRAD
+            self.update_api_or_module_name(grad_name)
+            if not self.check_scope_and_pid(self.scope, name, pid) and not self.backward_module_names.get(name):
+                if self.data_writer.cache_data.get("data"):
+                    self.data_writer.cache_data.get("data").pop(grad_name, None)
+                return
+            data_info = self.data_processor.analyze_params(grad_name, param_name, data)
+            self.handle_data(grad_name, data_info, flush=self.data_processor.is_terminated)
+        except Exception:
+            tb = traceback.format_exc()
+            self.data_writer.write_error_log(
+                f"[ERROR] params_data_collect failed: "
+                f"name={name}, param_name={param_name}, pid={pid}\n{tb}"
+            )
     def debug_data_collect_forward(self, variable, name_with_count):
         data_info = self.data_processor.analyze_debug_forward(variable, name_with_count)
-        self.data_writer.update_debug({name_with_count: data_info})
+        name_with_count_category = name_with_count + Const.SEP + Const.DEBUG
+        self.data_writer.update_debug({name_with_count_category: data_info})
     def debug_data_collect_backward(self, variable, grad_name_with_count):
         # prepare all None nested data structure
         all_none_data_info = self.data_processor.analyze_element_to_all_none(variable)
-        self.data_writer.update_debug({grad_name_with_count: all_none_data_info})
+        grad_name_with_count_category = grad_name_with_count + Const.SEP + Const.DEBUG
+        self.data_writer.update_debug({grad_name_with_count_category: all_none_data_info})
         # register tensor backward hook
-        self.data_processor.analyze_debug_backward(variable, grad_name_with_count, self.data_writer.cache_debug['data'])
+        self.data_processor.analyze_debug_backward(variable, grad_name_with_count_category,
+                                                   self.data_writer.cache_debug['data'])

msprobe/core/data_dump/data_processor/base.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
+# Copyright (c) 2024-2025, Huawei Technologies Co., Ltd.
 # All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -13,17 +13,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import copy
 import inspect
 import os
 from dataclasses import dataclass, is_dataclass
-from typing import Tuple, Dict, Optional, Any
 from functools import partial
-import copy
-from typing import Union
+from typing import Tuple, Dict, Optional, Any, Union
 import numpy as np
 from msprobe.core.common.const import Const
+from msprobe.core.common.file_utils import save_npy
 from msprobe.core.common.log import logger
 from msprobe.core.common.utils import convert_tuple, CompareException
@@ -79,21 +79,17 @@ class ModuleBackwardOutputs:
 class TensorStatInfo:
-    def __init__(self, max_val=None, min_val=None, mean_val=None, norm_val=None, stack_tensor_stat=None):
+    def __init__(self, max_val=None, min_val=None, mean_val=None, norm_val=None):
         self.max = max_val
         self.min = min_val
         self.mean = mean_val
         self.norm = norm_val
-        self.stack_tensor_stat = stack_tensor_stat
 class BaseDataProcessor:
     _recursive_key_stack = []
-    special_type = (
-        np.integer, np.floating, np.bool_, np.complexfloating, np.str_, np.byte, np.unicode_, np.ndarray,
-        bool, int, float, str, slice,
-        type(Ellipsis)
-    )
+    builtin_type = (bool, int, float, str, slice, type(Ellipsis))
+    np_type = (np.integer, np.floating, np.bool_, np.complexfloating, np.str_, np.byte, np.unicode_, np.ndarray)
     def __init__(self, config, data_writer):
         self.data_writer = data_writer
@@ -120,7 +116,10 @@ class BaseDataProcessor:
     @staticmethod
     def analyze_api_call_stack(name):
         try:
-            api_stack = inspect.stack()[5:]
+            if name.startswith("Primitive"):
+                api_stack = inspect.stack()[4:]
+            else:
+                api_stack = inspect.stack()[5:]
         except Exception as e:
             logger.warning(f"The call stack of <{name}> failed to retrieve, {e}.")
             api_stack = None
@@ -129,12 +128,14 @@ class BaseDataProcessor:
             for (_, path, line, func, code, _) in api_stack:
                 if not code:
                     continue
+                if any(filter_path in path for filter_path in Const.STACK_FILTER_KEYWORDS) and \
+                        Const.CALL_STACK_FLAG not in path:
+                    continue
                 stack_line = f"File {path}, line {str(line)}, in {func}, \n {code[0].strip()}"
                 stack_str.append(stack_line)
         else:
             stack_str.append(Const.WITHOUT_CALL_STACK)
-        stack_info_struct = {name: stack_str}
-        return stack_info_struct
+        return tuple(stack_str)
     @staticmethod
     def transfer_type(data):
@@ -178,20 +179,8 @@ class BaseDataProcessor:
                                  "invalid data_structure type or invalid index")
     @staticmethod
-    def _convert_numpy_to_builtin(arg):
-        type_mapping = {
-            np.integer: int,
-            np.floating: float,
-            np.bool_: bool,
-            np.complexfloating: complex,
-            np.str_: str,
-            np.byte: bytes,
-            np.unicode_: str
-        }
-        for numpy_type, builtin_type in type_mapping.items():
-            if isinstance(arg, numpy_type):
-                return builtin_type(arg), type(arg).__name__
-        return arg, ''
+    def is_distributed_op(module):
+        return getattr(module, "op_is_distributed", False)
     @staticmethod
     def _analyze_builtin(arg):
@@ -217,21 +206,40 @@ class BaseDataProcessor:
         return single_arg
     @staticmethod
-    def _analyze_numpy(ndarray, numpy_type):
+    def _analyze_numpy(arg):
+        return {"type": type(arg).__name__, "value": arg.item()}
+    @staticmethod
+    def _analyze_ndarray(ndarray, _):
         ndarray_json = {}
         ndarray_json.update({'type': 'numpy.ndarray'})
         ndarray_json.update({'dtype': str(ndarray.dtype)})
         ndarray_json.update({'shape': ndarray.shape})
-        if ndarray.size > 0:
-            ndarray_json.update({"Max": np.max(ndarray).item()})
-            ndarray_json.update({"Min": np.min(ndarray).item()})
-            ndarray_json.update({"Mean": np.mean(ndarray).item()})
-            ndarray_json.update({"Norm": np.linalg.norm(ndarray).item()})
-        else:
-            ndarray_json.update({"Max": None})
-            ndarray_json.update({"Min": None})
-            ndarray_json.update({"Mean": None})
-            ndarray_json.update({"Norm": None})
+        # 先初始化默认值
+        stats = {
+            "Max": None,
+            "Min": None,
+            "Mean": None,
+            "Norm": None
+        }
+        try:
+            # 只有非空时才尝试计算
+            if ndarray.size > 0:
+                stats = {
+                    "Max": np.max(ndarray).item(),
+                    "Min": np.min(ndarray).item(),
+                    "Mean": np.mean(ndarray).item(),
+                    "Norm": np.linalg.norm(ndarray).item()
+                }
+        except Exception as e:
+            # 决定打印内容或切片
+            logger.warning(f"Error analyzing ndarray stats: {e}")
+        # 最后一次性更新
+        ndarray_json.update(stats)
         return ndarray_json
     @staticmethod
@@ -248,7 +256,7 @@ class BaseDataProcessor:
     @classmethod
     def get_special_types(cls):
-        return cls.special_type
+        return cls.builtin_type + cls.np_type
     @classmethod
     def recursive_apply_transform(cls, args, transform, depth=0) -> Union[dict, list, None]:
@@ -303,6 +311,7 @@ class BaseDataProcessor:
             def real_hook_fn(grad):
                 return wrap_hook_fn(grad)
             element.register_hook(real_hook_fn)
     def if_return_forward_new_output(self):
@@ -350,6 +359,8 @@ class BaseDataProcessor:
         return api_info_struct
     def analyze_forward_output(self, name, module, module_input_output: ModuleForwardInputsOutputs):
+        if self.is_distributed_op(module):
+            module_input_output.update_output_with_args_and_kwargs()
         api_info_struct = {}
         # check whether data_mode contains forward or input
         if self.is_dump_for_data_mode(Const.FORWARD, Const.OUTPUT):
@@ -427,6 +438,7 @@ class BaseDataProcessor:
         api_info_struct = {}
         self.save_name = name + Const.SEP + param_name
         data_info = self.analyze_element(grad)
+        self.save_name = None
         grad_info_dict = {param_name: [data_info]}
         api_info_struct[name] = grad_info_dict
         return api_info_struct
@@ -435,10 +447,10 @@ class BaseDataProcessor:
         file_format = Const.PT_SUFFIX if self.config.framework == Const.PT_FRAMEWORK else Const.NUMPY_SUFFIX
         if self.save_name is not None:
             dump_data_name = (self.save_name + file_format)
-            self.save_name = None
         else:
-            dump_data_name = (self.current_api_or_module_name + Const.SEP + self.api_data_category + Const.SEP +
-                              suffix + file_format)
+            suffix_with_seq = (Const.SEP + suffix) if suffix else ""
+            dump_data_name = (self.current_api_or_module_name + Const.SEP + self.api_data_category + suffix_with_seq +
+                              file_format)
         file_path = os.path.join(self.data_writer.dump_tensor_data_dir, dump_data_name)
         return dump_data_name, file_path
@@ -447,23 +459,32 @@ class BaseDataProcessor:
     def analyze_debug_forward(self, variable, name_with_count):
         self.current_api_or_module_name = name_with_count
-        self.api_data_category = Const.TENSOR
-        # these two attributes are used to construct tensor file name {name_with_count}.tensor.{indexes}.npy/pt
+        self.api_data_category = Const.DEBUG
+        # these two attributes are used to construct tensor file name {name_with_count}.debug.{indexes}.npy/pt
         data_info = self.analyze_element(variable)
         return data_info
-    def analyze_debug_backward(self, variable, grad_name_with_count, nested_data_structure):
+    def analyze_debug_backward(self, variable, grad_name_with_count_category, nested_data_structure):
         def hook_fn(grad, indexes):
             suffix = Const.SEP.join([str(index) for index in indexes])
-            self.save_name = grad_name_with_count + Const.SEP + Const.TENSOR + Const.SEP + suffix
+            suffix_with_sep = (Const.SEP + suffix) if suffix else ""
+            self.save_name = grad_name_with_count_category + suffix_with_sep
             grad_data_info = self.analyze_element(grad)
             self.save_name = None
-            full_index = [grad_name_with_count] + indexes
+            full_index = [grad_name_with_count_category] + indexes
             try:
                 self.set_value_into_nested_structure(nested_data_structure, full_index, grad_data_info)
             except (ValueError, IndexError) as e:
-                logger.warning(f"error occured while recording statistics of {grad_name_with_count} variable, "
-                               f"skip current recording, detailed infomation: {e}")
+                logger.warning(f"error occurred while recording statistics of {grad_name_with_count_category} variable,"
+                               f"skip current recording, detailed information: {e}")
             return grad
         wrap_register_hook_single_element = partial(self.register_hook_single_element, hook_fn=hook_fn)
-        self.recursive_apply_transform(variable, wrap_register_hook_single_element)
+        self.recursive_apply_transform(variable, wrap_register_hook_single_element)
+    def _analyze_and_save_ndarray(self, ndarray, suffix):
+        dump_data_name, file_path = self.get_save_file_path(suffix)
+        save_npy(ndarray, file_path)
+        ndarray_json = BaseDataProcessor._analyze_ndarray(ndarray, suffix)
+        ndarray_json.update({"data_name": dump_data_name})
+        return ndarray_json

mindstudio-probe 1.3.0__py3-none-any.whl → 8.1.1__py3-none-any.whl

mindstudio-probe 1.3.0py3-none-any.whl → 8.1.1py3-none-any.whl