mindstudio-probe 1.2.2__py3-none-any.whl → 8.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (261) hide show
  1. {mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/METADATA +4 -3
  2. {mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/RECORD +243 -191
  3. msprobe/README.md +57 -21
  4. msprobe/core/__init__.py +17 -0
  5. msprobe/core/common/const.py +224 -82
  6. msprobe/core/common/decorator.py +50 -0
  7. msprobe/core/common/exceptions.py +5 -3
  8. msprobe/core/common/file_utils.py +274 -40
  9. msprobe/core/common/framework_adapter.py +169 -0
  10. msprobe/core/common/global_lock.py +86 -0
  11. msprobe/core/common/runtime.py +25 -0
  12. msprobe/core/common/utils.py +148 -72
  13. msprobe/core/common_config.py +7 -0
  14. msprobe/core/compare/acc_compare.py +640 -462
  15. msprobe/core/compare/check.py +36 -107
  16. msprobe/core/compare/compare_cli.py +4 -0
  17. msprobe/core/compare/config.py +72 -0
  18. msprobe/core/compare/highlight.py +217 -215
  19. msprobe/core/compare/layer_mapping/layer_mapping.py +4 -1
  20. msprobe/core/compare/merge_result/merge_result.py +12 -6
  21. msprobe/core/compare/multiprocessing_compute.py +227 -107
  22. msprobe/core/compare/npy_compare.py +32 -16
  23. msprobe/core/compare/utils.py +218 -244
  24. msprobe/{mindspore/runtime.py → core/config_check/__init__.py} +2 -4
  25. msprobe/{pytorch/dump/kernel_dump/kernel_config.py → core/config_check/checkers/__init__.py} +8 -16
  26. msprobe/core/config_check/checkers/base_checker.py +60 -0
  27. msprobe/core/config_check/checkers/dataset_checker.py +138 -0
  28. msprobe/core/config_check/checkers/env_args_checker.py +96 -0
  29. msprobe/core/config_check/checkers/hyperparameter_checker.py +170 -0
  30. msprobe/core/config_check/checkers/pip_checker.py +90 -0
  31. msprobe/core/config_check/checkers/random_checker.py +367 -0
  32. msprobe/core/config_check/checkers/weights_checker.py +147 -0
  33. msprobe/core/config_check/ckpt_compare/ckpt_comparator.py +74 -0
  34. msprobe/core/config_check/ckpt_compare/megatron_loader.py +302 -0
  35. msprobe/core/config_check/ckpt_compare/metrics.py +83 -0
  36. msprobe/core/config_check/ckpt_compare/name_mapping.yaml +12 -0
  37. msprobe/core/config_check/config_check_cli.py +51 -0
  38. msprobe/core/config_check/config_checker.py +100 -0
  39. msprobe/{pytorch/parse.py → core/config_check/resource/dependency.yaml} +7 -4
  40. msprobe/core/config_check/resource/env.yaml +57 -0
  41. msprobe/core/config_check/resource/hyperparameter.yaml +21 -0
  42. msprobe/core/config_check/utils/hyperparameter_parser.py +115 -0
  43. msprobe/core/config_check/utils/utils.py +107 -0
  44. msprobe/core/data_dump/api_registry.py +239 -0
  45. msprobe/core/data_dump/data_collector.py +36 -9
  46. msprobe/core/data_dump/data_processor/base.py +74 -53
  47. msprobe/core/data_dump/data_processor/mindspore_processor.py +119 -78
  48. msprobe/core/data_dump/data_processor/pytorch_processor.py +134 -96
  49. msprobe/core/data_dump/json_writer.py +146 -57
  50. msprobe/core/debugger/precision_debugger.py +143 -0
  51. msprobe/core/grad_probe/constant.py +2 -1
  52. msprobe/core/grad_probe/grad_compare.py +2 -2
  53. msprobe/core/grad_probe/utils.py +1 -1
  54. msprobe/core/hook_manager.py +242 -0
  55. msprobe/core/monitor/anomaly_processor.py +384 -0
  56. msprobe/core/overflow_check/abnormal_scene.py +2 -0
  57. msprobe/core/service.py +356 -0
  58. msprobe/core/single_save/__init__.py +0 -0
  59. msprobe/core/single_save/single_comparator.py +243 -0
  60. msprobe/core/single_save/single_saver.py +157 -0
  61. msprobe/docs/01.installation.md +6 -5
  62. msprobe/docs/02.config_introduction.md +89 -30
  63. msprobe/docs/03.config_examples.md +1 -0
  64. msprobe/docs/04.kernel_dump_PyTorch.md +1 -1
  65. msprobe/docs/05.data_dump_PyTorch.md +184 -50
  66. msprobe/docs/06.data_dump_MindSpore.md +193 -28
  67. msprobe/docs/07.accuracy_checker_PyTorch.md +13 -3
  68. msprobe/docs/08.accuracy_checker_online_PyTorch.md +72 -10
  69. msprobe/docs/09.accuracy_checker_MindSpore.md +19 -7
  70. msprobe/docs/10.accuracy_compare_PyTorch.md +266 -102
  71. msprobe/docs/11.accuracy_compare_MindSpore.md +117 -43
  72. msprobe/docs/12.overflow_check_PyTorch.md +5 -3
  73. msprobe/docs/13.overflow_check_MindSpore.md +6 -4
  74. msprobe/docs/14.data_parse_PyTorch.md +4 -10
  75. msprobe/docs/17.grad_probe.md +2 -1
  76. msprobe/docs/18.online_dispatch.md +3 -3
  77. msprobe/docs/19.monitor.md +211 -103
  78. msprobe/docs/21.visualization_PyTorch.md +100 -28
  79. msprobe/docs/22.visualization_MindSpore.md +103 -31
  80. msprobe/docs/23.generate_operator_PyTorch.md +9 -9
  81. msprobe/docs/25.tool_function_introduction.md +23 -22
  82. msprobe/docs/26.data_dump_PyTorch_baseline.md +14 -3
  83. msprobe/docs/27.dump_json_instruction.md +278 -8
  84. msprobe/docs/28.debugger_save_instruction.md +111 -20
  85. msprobe/docs/28.kernel_dump_MindSpore.md +1 -1
  86. msprobe/docs/29.data_dump_MSAdapter.md +229 -0
  87. msprobe/docs/30.overflow_check_MSAdapter.md +31 -0
  88. msprobe/docs/31.config_check.md +95 -0
  89. msprobe/docs/32.ckpt_compare.md +69 -0
  90. msprobe/docs/33.generate_operator_MindSpore.md +190 -0
  91. msprobe/docs/34.RL_collect.md +92 -0
  92. msprobe/docs/35.nan_analyze.md +72 -0
  93. msprobe/docs/FAQ.md +3 -11
  94. msprobe/docs/data_dump_MindSpore/data_dump_MindSpore_baseline.md +12 -1
  95. msprobe/docs/data_dump_MindSpore/dynamic_graph_quick_start_example.md +3 -1
  96. msprobe/docs/img/compare_result.png +0 -0
  97. msprobe/docs/img/merge_result.png +0 -0
  98. msprobe/docs/img/save_compare_result_sample.png +0 -0
  99. msprobe/docs/img/visualization/proxy.png +0 -0
  100. msprobe/docs/img/visualization/vis_browser_1.png +0 -0
  101. msprobe/docs/img/visualization/vis_match_info.png +0 -0
  102. msprobe/docs/img/visualization/vis_precision_info.png +0 -0
  103. msprobe/docs/img/visualization/vis_search_info.png +0 -0
  104. msprobe/docs/img/visualization/vis_show_info.png +0 -0
  105. msprobe/docs/img/visualization/vis_showcase.png +0 -0
  106. msprobe/docs/img/visualization/vis_unmatch_info.png +0 -0
  107. msprobe/mindspore/__init__.py +3 -3
  108. msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +151 -55
  109. msprobe/mindspore/api_accuracy_checker/api_runner.py +25 -11
  110. msprobe/mindspore/api_accuracy_checker/base_compare_algorithm.py +2 -1
  111. msprobe/mindspore/api_accuracy_checker/bench_functions/flash_attention_score.py +580 -0
  112. msprobe/mindspore/api_accuracy_checker/bench_functions/fusion_operator.py +41 -0
  113. msprobe/mindspore/api_accuracy_checker/cmd_parser.py +4 -0
  114. msprobe/mindspore/api_accuracy_checker/data_manager.py +4 -3
  115. msprobe/mindspore/api_accuracy_checker/generate_op_script/config_op.json +9 -0
  116. msprobe/mindspore/api_accuracy_checker/generate_op_script/op_generator.py +451 -0
  117. msprobe/mindspore/api_accuracy_checker/generate_op_script/operator_replication.template +2081 -0
  118. msprobe/mindspore/api_accuracy_checker/multi_api_accuracy_checker.py +11 -1
  119. msprobe/mindspore/api_accuracy_checker/torch_mindtorch_importer.py +2 -1
  120. msprobe/mindspore/cell_processor.py +204 -33
  121. msprobe/mindspore/code_mapping/graph_parser.py +4 -21
  122. msprobe/mindspore/common/const.py +73 -2
  123. msprobe/mindspore/common/utils.py +157 -29
  124. msprobe/mindspore/compare/common_dir_compare.py +382 -0
  125. msprobe/mindspore/compare/distributed_compare.py +2 -26
  126. msprobe/mindspore/compare/ms_compare.py +18 -398
  127. msprobe/mindspore/compare/ms_graph_compare.py +20 -10
  128. msprobe/mindspore/compare/utils.py +37 -0
  129. msprobe/mindspore/debugger/debugger_config.py +59 -7
  130. msprobe/mindspore/debugger/precision_debugger.py +83 -90
  131. msprobe/mindspore/dump/cell_dump_process.py +902 -0
  132. msprobe/mindspore/dump/cell_dump_with_insert_gradient.py +889 -0
  133. msprobe/mindspore/dump/dump_tool_factory.py +18 -8
  134. msprobe/mindspore/dump/graph_mode_cell_dump.py +139 -0
  135. msprobe/mindspore/dump/graph_tensor_dump.py +123 -0
  136. msprobe/mindspore/dump/hook_cell/api_register.py +176 -0
  137. msprobe/mindspore/dump/hook_cell/hook_cell.py +22 -12
  138. msprobe/mindspore/dump/hook_cell/ms_hook_manager.py +88 -0
  139. msprobe/mindspore/dump/hook_cell/primitive_hooks.py +8 -2
  140. msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml +42 -26
  141. msprobe/mindspore/dump/jit_dump.py +35 -27
  142. msprobe/mindspore/dump/kernel_kbyk_dump.py +6 -3
  143. msprobe/mindspore/dym_loader/hook_dynamic_loader.cpp +110 -0
  144. msprobe/mindspore/dym_loader/hook_dynamic_loader.h +15 -16
  145. msprobe/mindspore/free_benchmark/api_pynative_self_check.py +22 -12
  146. msprobe/mindspore/free_benchmark/common/utils.py +1 -1
  147. msprobe/mindspore/free_benchmark/perturbation/perturbation_factory.py +4 -2
  148. msprobe/mindspore/free_benchmark/self_check_tool_factory.py +6 -3
  149. msprobe/mindspore/grad_probe/global_context.py +9 -2
  150. msprobe/mindspore/grad_probe/grad_analyzer.py +2 -1
  151. msprobe/mindspore/grad_probe/grad_stat_csv.py +3 -2
  152. msprobe/mindspore/grad_probe/hook.py +2 -4
  153. msprobe/mindspore/mindspore_service.py +111 -0
  154. msprobe/mindspore/monitor/common_func.py +52 -0
  155. msprobe/mindspore/monitor/data_writers.py +237 -0
  156. msprobe/mindspore/monitor/distributed/wrap_distributed.py +1 -1
  157. msprobe/mindspore/monitor/features.py +13 -1
  158. msprobe/mindspore/monitor/module_hook.py +568 -444
  159. msprobe/mindspore/monitor/optimizer_collect.py +331 -0
  160. msprobe/mindspore/monitor/utils.py +71 -9
  161. msprobe/mindspore/ms_config.py +16 -15
  162. msprobe/mindspore/overflow_check/overflow_check_tool_factory.py +5 -3
  163. msprobe/mindspore/task_handler_factory.py +5 -2
  164. msprobe/msprobe.py +19 -0
  165. msprobe/nan_analyze/__init__.py +14 -0
  166. msprobe/nan_analyze/analyzer.py +255 -0
  167. msprobe/nan_analyze/graph.py +189 -0
  168. msprobe/nan_analyze/utils.py +211 -0
  169. msprobe/pytorch/api_accuracy_checker/common/config.py +2 -2
  170. msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +3 -6
  171. msprobe/pytorch/api_accuracy_checker/compare/compare.py +36 -34
  172. msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +15 -13
  173. msprobe/pytorch/api_accuracy_checker/generate_op_script/operator_replication.template +206 -4
  174. msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +9 -9
  175. msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +6 -5
  176. msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +31 -9
  177. msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py +28 -20
  178. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py +3 -1
  179. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +29 -13
  180. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +12 -2
  181. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +45 -31
  182. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/utils.py +154 -0
  183. msprobe/pytorch/attl_manager.py +65 -0
  184. msprobe/pytorch/bench_functions/moe_gating_top_k_softmax.py +6 -0
  185. msprobe/pytorch/bench_functions/npu_fusion_attention.py +27 -0
  186. msprobe/pytorch/common/utils.py +53 -19
  187. msprobe/pytorch/compare/distributed_compare.py +4 -36
  188. msprobe/pytorch/compare/pt_compare.py +13 -84
  189. msprobe/pytorch/compare/utils.py +47 -0
  190. msprobe/pytorch/debugger/debugger_config.py +34 -17
  191. msprobe/pytorch/debugger/precision_debugger.py +50 -96
  192. msprobe/pytorch/dump/module_dump/hook_wrapper.py +93 -0
  193. msprobe/pytorch/dump/module_dump/module_dump.py +15 -61
  194. msprobe/pytorch/dump/module_dump/module_processer.py +150 -114
  195. msprobe/pytorch/free_benchmark/common/utils.py +1 -1
  196. msprobe/pytorch/free_benchmark/compare/single_benchmark.py +1 -1
  197. msprobe/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py +3 -3
  198. msprobe/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py +3 -3
  199. msprobe/pytorch/free_benchmark/perturbed_layers/npu/change_value.py +1 -1
  200. msprobe/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py +1 -1
  201. msprobe/pytorch/free_benchmark/result_handlers/check_handler.py +1 -1
  202. msprobe/pytorch/function_factory.py +1 -1
  203. msprobe/pytorch/grad_probe/grad_monitor.py +2 -2
  204. msprobe/pytorch/grad_probe/grad_stat_csv.py +3 -2
  205. msprobe/pytorch/hook_module/api_register.py +155 -0
  206. msprobe/pytorch/hook_module/hook_module.py +18 -22
  207. msprobe/pytorch/hook_module/jit_script_wrapper.py +33 -0
  208. msprobe/pytorch/hook_module/pt_hook_manager.py +68 -0
  209. msprobe/pytorch/hook_module/register_optimizer_hook.py +2 -1
  210. msprobe/pytorch/hook_module/support_wrap_ops.yaml +193 -75
  211. msprobe/pytorch/hook_module/utils.py +28 -2
  212. msprobe/pytorch/monitor/csv2tb.py +14 -4
  213. msprobe/pytorch/monitor/data_writers.py +259 -0
  214. msprobe/pytorch/monitor/distributed/wrap_distributed.py +8 -2
  215. msprobe/pytorch/monitor/module_hook.py +336 -241
  216. msprobe/pytorch/monitor/module_metric.py +17 -0
  217. msprobe/pytorch/monitor/optimizer_collect.py +244 -224
  218. msprobe/pytorch/monitor/utils.py +84 -4
  219. msprobe/pytorch/online_dispatch/compare.py +0 -2
  220. msprobe/pytorch/online_dispatch/dispatch.py +13 -2
  221. msprobe/pytorch/online_dispatch/dump_compare.py +8 -2
  222. msprobe/pytorch/online_dispatch/utils.py +3 -0
  223. msprobe/pytorch/parse_tool/lib/interactive_cli.py +1 -6
  224. msprobe/pytorch/parse_tool/lib/utils.py +5 -4
  225. msprobe/pytorch/pt_config.py +16 -11
  226. msprobe/pytorch/pytorch_service.py +70 -0
  227. msprobe/visualization/builder/graph_builder.py +69 -10
  228. msprobe/visualization/builder/msprobe_adapter.py +24 -12
  229. msprobe/visualization/compare/graph_comparator.py +63 -51
  230. msprobe/visualization/compare/mode_adapter.py +22 -20
  231. msprobe/visualization/graph/base_node.py +11 -4
  232. msprobe/visualization/graph/distributed_analyzer.py +1 -10
  233. msprobe/visualization/graph/graph.py +2 -13
  234. msprobe/visualization/graph/node_op.py +1 -2
  235. msprobe/visualization/graph_service.py +251 -104
  236. msprobe/visualization/utils.py +26 -44
  237. msprobe/mindspore/dump/hook_cell/api_registry.py +0 -207
  238. msprobe/mindspore/dump/hook_cell/wrap_api.py +0 -212
  239. msprobe/mindspore/dym_loader/hook_dynamic_loader.cc +0 -140
  240. msprobe/mindspore/monitor/anomaly_detect.py +0 -404
  241. msprobe/mindspore/monitor/module_spec_verifier.py +0 -94
  242. msprobe/mindspore/service.py +0 -543
  243. msprobe/pytorch/hook_module/api_registry.py +0 -166
  244. msprobe/pytorch/hook_module/wrap_distributed.py +0 -79
  245. msprobe/pytorch/hook_module/wrap_functional.py +0 -66
  246. msprobe/pytorch/hook_module/wrap_npu_custom.py +0 -85
  247. msprobe/pytorch/hook_module/wrap_tensor.py +0 -69
  248. msprobe/pytorch/hook_module/wrap_torch.py +0 -84
  249. msprobe/pytorch/hook_module/wrap_vf.py +0 -60
  250. msprobe/pytorch/monitor/anomaly_analyse.py +0 -201
  251. msprobe/pytorch/monitor/anomaly_detect.py +0 -410
  252. msprobe/pytorch/monitor/module_spec_verifier.py +0 -95
  253. msprobe/pytorch/monitor/unittest/test_monitor.py +0 -160
  254. msprobe/pytorch/service.py +0 -470
  255. {mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/LICENSE +0 -0
  256. {mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/WHEEL +0 -0
  257. {mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/entry_points.txt +0 -0
  258. {mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/top_level.txt +0 -0
  259. /msprobe/{mindspore → core}/compare/ms_to_pt_api.yaml +0 -0
  260. /msprobe/{mindspore/dump → core}/kernel_dump/kernel_config.py +0 -0
  261. /msprobe/{pytorch/monitor/unittest → core/monitor}/__init__.py +0 -0
@@ -0,0 +1,65 @@
1
+ # Copyright (c) 2025, Huawei Technologies Co., Ltd.
2
+ # All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import os
17
+ from msprobe.core.common.runtime import Runtime
18
+ from msprobe.core.common.utils import Const
19
+ from msprobe.pytorch.api_accuracy_checker.common.utils import ApiData
20
+ from msprobe.pytorch.common.log import logger
21
+
22
+
23
+ class ATTLManager:
24
+ def __init__(self, config):
25
+ self.config = config
26
+ self.attl = None
27
+
28
+ def attl_init(self):
29
+ if self.config.online_run_ut:
30
+ from msprobe.pytorch.api_accuracy_checker.tensor_transport_layer.attl import ATTLConfig, ATTL
31
+ attl_config = ATTLConfig(is_benchmark_device=False,
32
+ connect_ip=self.config.host,
33
+ connect_port=self.config.port,
34
+ nfs_path=self.config.nfs_path,
35
+ tls_path=self.config.tls_path)
36
+ need_dump = len(self.config.rank) == 0 or Runtime.current_rank in self.config.rank
37
+ self.attl = ATTL('npu', attl_config, need_dump=need_dump)
38
+ if self.config.nfs_path:
39
+ self.attl.upload("start")
40
+
41
+ def attl_send(self, name, args, kwargs, output):
42
+ api_data = ApiData(
43
+ name[:-len(Const.FORWARD_NAME_SUFFIX)],
44
+ args,
45
+ kwargs,
46
+ output,
47
+ Runtime.current_iter,
48
+ Runtime.current_rank
49
+ )
50
+ logger.info(f"tools is dumping api: {api_data.name}, rank: {Runtime.current_rank}")
51
+ api_type, _, _ = api_data.name.split(Const.SEP)
52
+ if api_type in [Const.DISTRIBUTED]:
53
+ logger.info(f"api {api_data.name} is not supported, skip")
54
+ return
55
+ if self.config.nfs_path:
56
+ self.attl.upload(api_data)
57
+ else:
58
+ self.attl.send(api_data)
59
+
60
+ def attl_stop(self):
61
+ if self.config.nfs_path:
62
+ self.attl.upload("end")
63
+ elif self.attl.socket_manager is not None:
64
+ logger.info(f"pid: {os.getpid()} finished, start sends STOP signal.")
65
+ self.attl.socket_manager.send_stop_signal()
@@ -29,6 +29,8 @@ def softmax_func(x, axis=None):
29
29
 
30
30
  def npu_moe_gating_top_k_softmax(x, finished_optional, k):
31
31
  input_dtype = x.dtype
32
+ if x.dim() < 1:
33
+ raise ValueError("Input x must have at least 1 dimensions.")
32
34
  num_expert = x.shape[-1]
33
35
  softmax = softmax_func(x, -1)
34
36
  softmax = softmax.to(input_dtype)
@@ -36,9 +38,13 @@ def npu_moe_gating_top_k_softmax(x, finished_optional, k):
36
38
  expert_idx = expert_idx[:, :k]
37
39
  y = torch.gather(softmax, index=expert_idx, dim=-1)
38
40
  if finished_optional is not None:
41
+ if finished_optional.dim() < 1:
42
+ raise ValueError("Finished_optional must have at least 1 dimensions.")
39
43
  finished_optional = finished_optional.view(finished_optional.shape[0], 1)
40
44
  finished_optional = finished_optional.expand(-1, k)
41
45
  expert_idx = torch.where(finished_optional, num_expert, expert_idx)
46
+ if y.dim() < 2:
47
+ raise ValueError("Variable y must have at least 2 dimensions.")
42
48
  row_idx = torch.arange(y.shape[0] * y.shape[1]).reshape(y.shape[1], y.shape[0]).t()
43
49
 
44
50
  return y, expert_idx, row_idx
@@ -117,6 +117,12 @@ def fusion_attention_forward(forward_params):
117
117
  pse = forward_params.pse
118
118
  scale = forward_params.scale
119
119
  keep_prob = forward_params.keep_prob
120
+
121
+ # 除零风险拦截:keep_prob 为 0 时会导致除零错误
122
+ if keep_prob == 0:
123
+ raise ValueError("fusion_attention_forward: keep_prob cannot be zero to avoid division by zero.")
124
+
125
+
120
126
  qk = calculate_qk(q, k, atten_mask, pse, scale)
121
127
  softmax_res, softmax_max, softmax_sum = softmax_forward(qk)
122
128
  if drop_mask is None or len(drop_mask.shape) == 0:
@@ -137,6 +143,11 @@ def fusion_attention_backward(backward_params):
137
143
  pse = backward_params.pse
138
144
  scale = backward_params.scale
139
145
  keep_prob = backward_params.keep_prob
146
+
147
+ # 除零风险拦截:keep_prob 为 0 时会导致除零错误
148
+ if keep_prob == 0:
149
+ raise ValueError("fusion_attention_backward: keep_prob cannot be zero to avoid division by zero.")
150
+
140
151
  dp = torch.matmul(dx, v.permute(0, 1, 3, 2))
141
152
  if drop_mask is None or len(drop_mask.shape) == 0:
142
153
  drop_res = softmax_res.permute(0, 1, 3, 2)
@@ -164,23 +175,35 @@ def parse_bsnd_args(query, key, head_num, input_layout):
164
175
  if input_layout == "BSH":
165
176
  b, s1, h1 = query.shape
166
177
  _, s2, h2 = key.shape
178
+ if n1 == 0:
179
+ raise ValueError("parse_bsnd_args: head_num (n1) cannot be zero to avoid division by zero.")
167
180
  d = h1 // n1
181
+ if d == 0:
182
+ raise ValueError("parse_bsnd_args: computed head dimension (d) is zero, division by zero risk.")
168
183
  n2 = h2 // d
169
184
  elif input_layout == "SBH":
170
185
  s1, b, h1 = query.shape
171
186
  s2, _, h2 = key.shape
187
+ if n1 == 0:
188
+ raise ValueError("parse_bsnd_args: head_num (n1) cannot be zero to avoid division by zero.")
172
189
  d = h1 // n1
190
+ if d == 0:
191
+ raise ValueError("parse_bsnd_args: computed head dimension (d) is zero, division by zero risk.")
173
192
  n2 = h2 // d
174
193
  elif input_layout == "BSND":
175
194
  b, s1, n1, d = query.shape
176
195
  _, s2, n2, _ = key.shape
177
196
  h1 = n1 * d
178
197
  h2 = n2 * d
198
+ if d == 0:
199
+ raise ValueError("parse_bsnd_args: head dimension (d) is zero, division by zero risk.")
179
200
  elif input_layout == "BNSD":
180
201
  b, n1, s1, d = query.shape
181
202
  _, n2, s2, _ = key.shape
182
203
  h1 = n1 * d
183
204
  h2 = n2 * d
205
+ if d == 0:
206
+ raise ValueError("parse_bsnd_args: head dimension (d) is zero, division by zero risk.")
184
207
  except Exception as e:
185
208
  raise ValueError(f"query.shape: {query.shape}, key.shape: {key.shape}, parse_bsnd_args error: {e}") from e
186
209
 
@@ -446,6 +469,8 @@ def npu_fusion_attention_forward_patch(*args, **kwargs):
446
469
  input_layout = get_input_layout(*args, **kwargs)
447
470
 
448
471
  b, s1, s2, n1, n2, d, h1, h2, dtype = parse_bsnd_args(args[0], args[1], head_num, input_layout)
472
+ if d == 0:
473
+ raise ValueError("npu_fusion_attention_forward_patch: head dimension (d) is zero, division by zero risk.")
449
474
  if n1 == n2 and s1 == s2:
450
475
  logger.debug(f"running case : BNSD = {b}_{n1}_{s1}_{d}, sparse = {kwargs.get('sparse_mode', 0)}")
451
476
  else:
@@ -478,6 +503,8 @@ def npu_fusion_attention_backward_patch(*args, **kwargs):
478
503
  raise ValueError(f"Unsupported npu_fusion_attention_grad args {args}.")
479
504
 
480
505
  b, s1, s2, n1, n2, d, h1, h2, dtype = parse_bsnd_args(args[0], args[1], args[4], args[5])
506
+ if d == 0:
507
+ raise ValueError("npu_fusion_attention_backward_patch: head dimension (d) is zero, division by zero risk.")
481
508
  if n1 == n2 and s1 == s2:
482
509
  logger.info(f"running case : bnsd = {b}_{n1}_{s1}_{d}, sparse = {kwargs.get('sparse_mode', 0)}")
483
510
  else:
@@ -1,4 +1,4 @@
1
- # Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
1
+ # Copyright (c) 2024-2025, Huawei Technologies Co., Ltd.
2
2
  # All rights reserved.
3
3
  #
4
4
  # Licensed under the Apache License, Version 2.0 (the "License");
@@ -24,11 +24,12 @@ from functools import wraps
24
24
  import numpy as np
25
25
  import torch
26
26
  import torch.distributed as dist
27
+
27
28
  from msprobe.core.common.exceptions import DistributedNotInitializedError
28
29
  from msprobe.core.common.file_utils import (FileCheckConst, change_mode,
29
30
  check_file_or_directory_path, check_path_before_create, FileOpen)
30
31
  from msprobe.core.common.log import logger
31
- from msprobe.core.common.utils import check_seed_all
32
+ from msprobe.core.common.utils import check_seed_all, is_save_variable_valid
32
33
  from packaging import version
33
34
 
34
35
  try:
@@ -38,7 +39,9 @@ except ImportError:
38
39
  else:
39
40
  is_gpu = False
40
41
 
42
+
41
43
  torch_without_guard_version = torch.__version__ >= '2.1'
44
+ torch_version_above_or_equal_2 = torch.__version__.split('+')[0] >= '2.0'
42
45
 
43
46
  if not is_gpu and not torch_without_guard_version:
44
47
  from torch_npu.utils.device_guard import torch_device_guard as torch_npu_device_guard
@@ -57,7 +60,7 @@ def parameter_adapter(func):
57
60
 
58
61
  @wraps(func)
59
62
  def inner(self, *args, **kwargs):
60
- if self.op_name_ == "__getitem__" and len(args) > 1 and isinstance(args[1], torch.Tensor):
63
+ if self.api_name == "__getitem__" and len(args) > 1 and isinstance(args[1], torch.Tensor):
61
64
  input_tensor = args[0]
62
65
  indices = args[1]
63
66
  if indices.dtype == torch.uint8:
@@ -77,7 +80,7 @@ def parameter_adapter(func):
77
80
  else:
78
81
  res = [input_tensor[tensor_index] for tensor_index in indices]
79
82
  return getattr(torch._C._VariableFunctionsClass, "stack")(res, 0)
80
- if self.op_name_ == "__eq__" and len(args) > 1 and args[1] is None:
83
+ if self.api_name == "__eq__" and len(args) > 1 and args[1] is None:
81
84
  return False
82
85
  return func(self, *args, **kwargs)
83
86
 
@@ -261,6 +264,10 @@ class Const:
261
264
  NPU = 'NPU'
262
265
  DISTRIBUTED = 'Distributed'
263
266
 
267
+ HIFLOAT8_TYPE = "torch_npu.HiFloat8Tensor"
268
+ FLOAT8_E5M2_TYPE = "torch.float8_e5m2"
269
+ FLOAT8_E4M3FN_TYPE = "torch.float8_e4m3fn"
270
+
264
271
  RAISE_PRECISION = {
265
272
  torch.float16: torch.float32,
266
273
  torch.bfloat16: torch.float32,
@@ -309,14 +316,14 @@ def print_rank_0(message):
309
316
  logger.info(message)
310
317
 
311
318
 
312
- def load_pt(pt_path, to_cpu=False):
319
+ def load_pt(pt_path, to_cpu=False, weights_only=True):
313
320
  pt_path = os.path.realpath(pt_path)
314
321
  check_file_or_directory_path(pt_path)
315
322
  try:
316
323
  if to_cpu:
317
- pt = torch.load(pt_path, map_location=torch.device("cpu"), weights_only=True)
324
+ pt = torch.load(pt_path, map_location=torch.device("cpu"), weights_only=weights_only)
318
325
  else:
319
- pt = torch.load(pt_path, weights_only=True)
326
+ pt = torch.load(pt_path, weights_only=weights_only)
320
327
  except Exception as e:
321
328
  raise RuntimeError(f"load pt file {pt_path} failed") from e
322
329
  return pt
@@ -391,7 +398,7 @@ def save_api_data(api_data):
391
398
  io_buff = io.BytesIO()
392
399
  torch.save(api_data, io_buff)
393
400
  except Exception as e:
394
- raise RuntimeError(f"save api_data to io_buff failed") from e
401
+ raise RuntimeError("save api_data to io_buff failed") from e
395
402
  return io_buff
396
403
 
397
404
 
@@ -401,7 +408,7 @@ def load_api_data(api_data_bytes):
401
408
  buffer = io.BytesIO(api_data_bytes)
402
409
  buffer = torch.load(buffer, map_location="cpu")
403
410
  except Exception as e:
404
- raise RuntimeError(f"load api_data from bytes failed") from e
411
+ raise RuntimeError("load api_data from bytes failed") from e
405
412
  return buffer
406
413
 
407
414
 
@@ -419,7 +426,11 @@ def is_recomputation():
419
426
  bool: True if in the re-computation phase, False otherwise.
420
427
  """
421
428
  backward_function_indices = []
422
- call_stack = inspect.stack()
429
+ try:
430
+ call_stack = inspect.stack()
431
+ except Exception as e:
432
+ logger.warning(f"Failed to capture stack trace, recomputation validation may be incorrect, error info: {e}.")
433
+ return False
423
434
 
424
435
  # Identify the function 'backward' is being executed within the 'torch/_tensor.py' file.
425
436
  for frame_info in call_stack:
@@ -449,9 +460,11 @@ def is_recomputation():
449
460
 
450
461
  def check_save_param(variable, name, save_backward):
451
462
  # try catch this api to skip invalid call
452
- if not isinstance(variable, (list, dict, torch.Tensor, int, float, str)):
463
+ valid_data_types = (torch.Tensor, int, float, str)
464
+ if not is_save_variable_valid(variable, valid_data_types):
465
+ valid_data_types_with_nested_types = valid_data_types + (dict, tuple, list)
453
466
  logger.warning("PrecisionDebugger.save variable type not valid, "
454
- "should be one of list, dict, torch.Tensor, int, float or string. "
467
+ f"should be one of {valid_data_types_with_nested_types}"
455
468
  "Skip current save process.")
456
469
  raise ValueError
457
470
  if not isinstance(name, str):
@@ -466,10 +479,31 @@ def check_save_param(variable, name, save_backward):
466
479
  raise ValueError
467
480
 
468
481
 
469
- def replace_last_occurrence(text, old, new):
470
- if text is None:
471
- return text
472
- index = text.rfind(old)
473
- if index != -1:
474
- return text[:index] + text[index:].replace(old, new, 1)
475
- return text
482
+ def is_torch_nn_module(variable):
483
+ return isinstance(variable, torch.nn.Module) and not isinstance(variable, torch.jit.ScriptModule)
484
+
485
+
486
+ def is_hifloat8_tensor(tensor):
487
+ if not is_gpu and hasattr(torch_npu, "HiFloat8Tensor") and isinstance(tensor, torch_npu.HiFloat8Tensor):
488
+ return True
489
+ return False
490
+
491
+
492
+ def is_float8_tensor(tensor):
493
+ if str(tensor.dtype) in [Const.FLOAT8_E5M2_TYPE, Const.FLOAT8_E4M3FN_TYPE]:
494
+ return True
495
+ return is_hifloat8_tensor(tensor)
496
+
497
+
498
+ def register_forward_pre_hook(module, forward_pre_hook):
499
+ if torch_version_above_or_equal_2:
500
+ module.register_forward_pre_hook(forward_pre_hook, with_kwargs=True)
501
+ else:
502
+ module.register_forward_pre_hook(forward_pre_hook)
503
+
504
+
505
+ def register_forward_hook(module, forward_hook):
506
+ if torch_version_above_or_equal_2:
507
+ module.register_forward_hook(forward_hook, with_kwargs=True)
508
+ else:
509
+ module.register_forward_hook(forward_hook)
@@ -1,4 +1,4 @@
1
- # Copyright (c) 2019-2024, Huawei Technologies Co., Ltd.
1
+ # Copyright (c) 2024-2025, Huawei Technologies Co., Ltd.
2
2
  # All rights reserved.
3
3
  #
4
4
  # Licensed under the Apache License, Version 2.0 (the "License");
@@ -13,41 +13,9 @@
13
13
  # See the License for the specific language governing permissions and
14
14
  # limitations under the License.
15
15
 
16
- import os
17
-
18
- from msprobe.core.common.exceptions import FileCheckException
19
- from msprobe.core.common.file_utils import create_directory
20
- from msprobe.core.common.utils import CompareException, check_compare_param, check_configuration_param, get_dump_mode, \
21
- set_dump_path
22
- from msprobe.core.compare.acc_compare import ModeConfig
23
- from msprobe.core.compare.utils import check_and_return_dir_contents, extract_json, set_stack_json_path
24
- from msprobe.pytorch.common.log import logger
25
- from msprobe.pytorch.compare.pt_compare import PTComparator, compare
16
+ from msprobe.core.compare.utils import compare_distributed_inner
17
+ from msprobe.pytorch.compare.pt_compare import compare
26
18
 
27
19
 
28
20
  def compare_distributed(npu_dump_dir, bench_dump_dir, output_path, **kwargs):
29
- if kwargs.get("suffix"):
30
- logger.error("Argument 'suffix' is not supported for compare_distributed.")
31
- raise CompareException(CompareException.INVALID_PARAM_ERROR)
32
- is_print_compare_log = kwargs.get("is_print_compare_log", True)
33
- # get the ranks and match by order
34
- npu_ranks = sorted(check_and_return_dir_contents(npu_dump_dir, 'rank'))
35
- bench_ranks = sorted(check_and_return_dir_contents(bench_dump_dir, 'rank'))
36
- if len(npu_ranks) != len(bench_ranks):
37
- logger.error(
38
- "The number of ranks in the two runs are different. "
39
- "Unable to match the ranks. "
40
- "Please use another folder to compare or use compare() api and manually match the ranks.")
41
- raise CompareException(CompareException.INVALID_PATH_ERROR)
42
- for nr, br in zip(npu_ranks, bench_ranks):
43
- npu_data_dir = os.path.join(npu_dump_dir, nr)
44
- bench_data_dir = os.path.join(bench_dump_dir, br)
45
- npu_path = extract_json(npu_data_dir, stack_json=False)
46
- bench_path = extract_json(bench_data_dir, stack_json=False)
47
-
48
- dump_result_param = {
49
- "npu_json_path": npu_path,
50
- "bench_json_path": bench_path,
51
- "is_print_compare_log": is_print_compare_log
52
- }
53
- compare(input_param=dump_result_param, output_path=output_path, suffix=f'_{nr}-{br}', **kwargs)
21
+ compare_distributed_inner(npu_dump_dir, bench_dump_dir, output_path, compare, **kwargs)
@@ -1,4 +1,4 @@
1
- # Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
1
+ # Copyright (c) 2024-2025, Huawei Technologies Co., Ltd.
2
2
  # All rights reserved.
3
3
  #
4
4
  # Licensed under the Apache License, Version 2.0 (the "License");
@@ -13,92 +13,21 @@
13
13
  # See the License for the specific language governing permissions and
14
14
  # limitations under the License.
15
15
 
16
- import os.path
16
+ from msprobe.core.compare.acc_compare import Comparator, ModeConfig, MappingConfig, setup_comparison
17
+ from msprobe.pytorch.compare.utils import read_pt_data
17
18
 
18
- import torch
19
19
 
20
- from msprobe.core.common.const import FileCheckConst
21
- from msprobe.core.common.exceptions import FileCheckException
22
- from msprobe.core.common.file_utils import FileChecker, create_directory, load_yaml
23
- from msprobe.core.common.utils import CompareException, check_compare_param, check_configuration_param, get_dump_mode, \
24
- set_dump_path
25
- from msprobe.core.compare.acc_compare import Comparator, ModeConfig
26
- from msprobe.core.compare.utils import set_stack_json_path
27
- from msprobe.pytorch.common.log import logger
28
- from msprobe.pytorch.common.utils import load_pt
29
-
30
-
31
- class PTComparator(Comparator):
32
- def __init__(self, mode_config, data_mapping=None):
33
- super().__init__(mode_config)
34
-
35
- self.stack_mode = mode_config.stack_mode
36
- self.auto_analyze = mode_config.auto_analyze
37
- self.fuzzy_match = mode_config.fuzzy_match
38
- self.dump_mode = mode_config.dump_mode
39
-
40
- self.frame_name = PTComparator.__name__
41
- self.data_mapping = data_mapping
42
- if isinstance(self.data_mapping, str) or self.data_mapping is None:
43
- self.data_mapping_dict = self.load_mapping_file(self.data_mapping)
44
- elif isinstance(self.data_mapping, dict):
45
- self.data_mapping_dict = self.data_mapping
46
- else:
47
- raise TypeError(f"The type of parameter `data_mapping` must be dict, str or None, but got "
48
- f"{type(self.data_mapping)}")
49
-
50
- @staticmethod
51
- def load_mapping_file(mapping_file):
52
- if isinstance(mapping_file, str):
53
- mapping_dict = load_yaml(mapping_file)
54
- else:
55
- mapping_dict = {}
56
- return mapping_dict
57
-
58
- def read_npy_data(self, dir_path, file_name):
59
- if not file_name:
60
- return None
61
- data_path = os.path.join(dir_path, file_name)
62
- path_checker = FileChecker(data_path, FileCheckConst.FILE, FileCheckConst.READ_ABLE,
63
- FileCheckConst.PT_SUFFIX, False)
64
- data_path = path_checker.common_check()
65
- try:
66
- # detach because numpy can not process gradient information
67
- data_value = load_pt(data_path, to_cpu=True).detach()
68
- except RuntimeError as e:
69
- # 这里捕获 load_pt 中抛出的异常
70
- logger.error(f"Failed to load the .pt file at {data_path}.")
71
- raise CompareException(CompareException.INVALID_FILE_ERROR) from e
72
- except AttributeError as e:
73
- # 这里捕获 detach 方法抛出的异常
74
- logger.error(f"Failed to detach the loaded tensor.")
75
- raise CompareException(CompareException.DETACH_ERROR) from e
76
- if data_value.dtype == torch.bfloat16:
77
- data_value = data_value.to(torch.float32)
78
- data_value = data_value.numpy()
79
- return data_value
20
def read_real_data(npu_dir, npu_data_name, bench_dir, bench_data_name, _) -> tuple:
    """Load the saved NPU and bench tensors for one comparison entry.

    The trailing ``_`` parameter is accepted for interface compatibility with
    the comparator callback signature and is ignored here.

    Returns:
        tuple: ``(npu_value, bench_value)`` as produced by ``read_pt_data``
        (``None`` for an entry whose file name is empty).
    """
    return (
        read_pt_data(npu_dir, npu_data_name),
        read_pt_data(bench_dir, bench_data_name),
    )
80
24
 
81
25
 
82
26
def compare(input_param, output_path, **kwargs):
    """Entry point for comparing a single pair of PyTorch dump results.

    Validates and normalizes the inputs via ``setup_comparison``, then runs
    the core comparison with a PyTorch-specific real-data reader.

    Args:
        input_param: dict with the NPU/bench dump json paths and options.
        output_path: directory where the comparison result is written.
        **kwargs: comparison options (stack_mode, auto_analyze, fuzzy_match,
            data_mapping, suffix, ...), consumed by ``setup_comparison``.
    """
    config = setup_comparison(input_param, output_path, **kwargs)

    comparator = Comparator(
        read_real_data,
        ModeConfig(
            config.stack_mode,
            config.auto_analyze,
            config.fuzzy_match,
            config.dump_mode,
            config.compared_file_type,
        ),
        MappingConfig(data_mapping=config.data_mapping),
    )
    comparator.compare_core(input_param, output_path, suffix=config.suffix)
@@ -0,0 +1,47 @@
1
+ # Copyright (c) 2025-2025, Huawei Technologies Co., Ltd.
2
+ # All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import os
17
+
18
+ import torch
19
+
20
+ from msprobe.core.common.utils import logger, CompareException
21
+ from msprobe.core.common.file_utils import FileChecker, FileCheckConst
22
+ from msprobe.pytorch.common.utils import load_pt
23
+
24
+
25
def read_pt_data(dir_path, file_name):
    """Load a dumped ``.pt`` tensor file and return its data as a numpy array.

    Args:
        dir_path: directory containing the dump file.
        file_name: file name of the ``.pt`` dump; falsy means "no data".

    Returns:
        numpy.ndarray with the tensor's values, or ``None`` when
        ``file_name`` is empty.

    Raises:
        CompareException: INVALID_FILE_ERROR when the file cannot be loaded,
            DETACH_ERROR when the loaded object is not a detachable tensor.
    """
    if not file_name:
        return None

    data_path = os.path.join(dir_path, file_name)
    path_checker = FileChecker(data_path, FileCheckConst.FILE, FileCheckConst.READ_ABLE,
                               FileCheckConst.PT_SUFFIX, False)
    data_path = path_checker.common_check()
    try:
        # detach because numpy cannot process gradient information
        data_value = load_pt(data_path, to_cpu=True).detach()
    except RuntimeError as e:
        # raised by load_pt when the file is missing, corrupt or not a valid .pt file
        logger.error(f"Failed to load the .pt file at {data_path}.")
        raise CompareException(CompareException.INVALID_FILE_ERROR) from e
    except AttributeError as e:
        # raised by .detach() when the loaded object is not a tensor
        logger.error("Failed to detach the loaded tensor.")
        raise CompareException(CompareException.DETACH_ERROR) from e
    if data_value.dtype == torch.bfloat16:
        # numpy has no bfloat16 dtype; upcast losslessly to float32 first
        data_value = data_value.to(torch.float32)
    data_value = data_value.numpy()
    return data_value
@@ -13,11 +13,10 @@
13
13
  # See the License for the specific language governing permissions and
14
14
  # limitations under the License.
15
15
 
16
- import torch
17
-
18
16
  from msprobe.core.common.const import Const
19
17
  from msprobe.core.common.exceptions import MsprobeException
20
18
  from msprobe.pytorch.common.log import logger
19
+ from msprobe.pytorch.common.utils import is_torch_nn_module
21
20
 
22
21
 
23
22
  class DebuggerConfig:
@@ -60,6 +59,7 @@ class DebuggerConfig:
60
59
  if isinstance(task_config.online_run_ut_recompute, bool) else False
61
60
 
62
61
  self.check()
62
+ self._check_statistics_config(task_config)
63
63
 
64
64
  if self.level == Const.LEVEL_L2:
65
65
  self.is_backward_kernel_dump = False
@@ -78,10 +78,13 @@ class DebuggerConfig:
78
78
  if not isinstance(self.async_dump, bool):
79
79
  raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR,
80
80
  f"The parameters async_dump should be bool.")
81
- if self.async_dump and self.task == Const.TENSOR and not self.list:
82
- raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR,
83
- f"The parameters async_dump is true in tensor task, the parameters list cannot be "
84
- f"empty.")
81
+ if self.async_dump and self.task == Const.TENSOR:
82
+ if self.level == Const.LEVEL_DEBUG:
83
+ self.list = [] # async_dump + debug level case ignore list
84
+ if not self.list and self.level != Const.LEVEL_DEBUG:
85
+ raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR,
86
+ f"The parameters async_dump is true in tensor task, the parameters list cannot be "
87
+ f"empty.")
85
88
  if self.task == Const.STRUCTURE and self.level not in [Const.LEVEL_L0, Const.LEVEL_MIX]:
86
89
  logger.warning_on_rank_0(
87
90
  f"When the task is set to structure, the level should be one of {[Const.LEVEL_L0, Const.LEVEL_MIX]}. "
@@ -93,25 +96,24 @@ class DebuggerConfig:
93
96
  self.check_kwargs()
94
97
  return True
95
98
 
96
- def check_model(self, instance, start_model):
97
- if self.level not in [Const.LEVEL_L0, Const.LEVEL_MIX]:
98
- if instance.model is not None or start_model is not None:
99
- logger.info_on_rank_0(
100
- f"The current level is not L0 or mix level, so the model parameters will not be used.")
99
+ def check_model(self, instance, start_model, token_range=None):
100
+ instance.model = start_model if start_model is not None else instance.model
101
+ if self.level not in [Const.LEVEL_L0, Const.LEVEL_MIX] and token_range is None:
101
102
  return
102
- if start_model is None and instance.model is None:
103
+
104
+ if instance.model is None:
103
105
  logger.error_on_rank_0(
104
- f"For level {self.level}, PrecisionDebugger or start interface must receive a 'model' parameter.")
106
+ f"For level {self.level} or non-empty token_range, "
107
+ f"PrecisionDebugger or start interface must receive a 'model' parameter.")
105
108
  raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR, f"missing the parameter 'model'")
106
109
 
107
- instance.model = start_model if start_model is not None else instance.model
108
- if isinstance(instance.model, torch.nn.Module):
110
+ if is_torch_nn_module(instance.model):
109
111
  return
110
112
 
111
113
  error_model = None
112
114
  if isinstance(instance.model, (list, tuple)):
113
115
  for model in instance.model:
114
- if not isinstance(model, torch.nn.Module):
116
+ if not is_torch_nn_module(model):
115
117
  error_model = model
116
118
  break
117
119
  else:
@@ -119,7 +121,7 @@ class DebuggerConfig:
119
121
 
120
122
  if error_model is not None:
121
123
  error_info = (f"The 'model' parameter must be a torch.nn.Module or list[torch.nn.Module] "
122
- f"type, currently there is a {type(error_model)} type.")
124
+ f"type, currently there is an unsupported {type(error_model)} type.")
123
125
  raise MsprobeException(
124
126
  MsprobeException.INVALID_PARAM_ERROR, error_info)
125
127
 
@@ -130,8 +132,23 @@ class DebuggerConfig:
130
132
  if not self.list or len(self.list) != 1:
131
133
  raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR,
132
134
  f"When level is set to L2, the list must be configured as a list with one api name.")
135
+ if self.task != Const.TENSOR:
136
+ raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR,
137
+ f"When level is set to L2, the task must be set to tensor.")
138
+
133
139
  api_name = self.list[0]
134
140
  if api_name.endswith(Const.BACKWARD):
135
141
  self.is_backward_kernel_dump = True
136
142
  api_forward_name = api_name[:-len(Const.BACKWARD)] + Const.FORWARD
137
143
  self.list.append(api_forward_name)
144
+
145
def _check_statistics_config(self, task_config):
    """Initialize ``self.tensor_list`` from the task config for the statistics task.

    No-op for non-statistics tasks. Otherwise ``tensor_list`` defaults to an
    empty list and is only taken from ``task_config`` when present; at debug
    level a configured tensor_list is ignored with a warning.
    """
    if self.task != Const.STATISTICS:
        return
    # Default to empty; only override when the config provides a usable value.
    self.tensor_list = []
    _missing = object()
    configured = getattr(task_config, "tensor_list", _missing)
    if configured is _missing:
        return
    if self.level == Const.LEVEL_DEBUG and configured:
        logger.warning_on_rank_0("When level is set to debug, the tensor_list will be invalid.")
        return
    self.tensor_list = configured