mindstudio-probe 1.2.1__py3-none-any.whl → 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mindstudio_probe-1.2.1.dist-info → mindstudio_probe-1.3.0.dist-info}/METADATA +3 -3
- {mindstudio_probe-1.2.1.dist-info → mindstudio_probe-1.3.0.dist-info}/RECORD +168 -150
- msprobe/README.md +27 -22
- msprobe/core/common/const.py +129 -60
- msprobe/core/common/decorator.py +50 -0
- msprobe/core/common/exceptions.py +3 -1
- msprobe/core/common/file_utils.py +25 -2
- msprobe/core/common/inplace_ops.yaml +1 -0
- msprobe/core/common/utils.py +43 -33
- msprobe/core/compare/acc_compare.py +43 -74
- msprobe/core/compare/check.py +2 -6
- msprobe/core/compare/highlight.py +2 -0
- msprobe/core/compare/layer_mapping/data_scope_parser.py +1 -1
- msprobe/core/compare/layer_mapping/layer_mapping.py +2 -1
- msprobe/core/compare/merge_result/merge_result.py +16 -9
- msprobe/core/compare/merge_result/utils.py +81 -0
- msprobe/core/compare/multiprocessing_compute.py +19 -12
- msprobe/core/compare/npy_compare.py +30 -12
- msprobe/core/compare/utils.py +30 -10
- msprobe/core/data_dump/api_registry.py +176 -0
- msprobe/core/data_dump/data_collector.py +58 -13
- msprobe/core/data_dump/data_processor/base.py +94 -10
- msprobe/core/data_dump/data_processor/factory.py +3 -0
- msprobe/core/data_dump/data_processor/mindspore_processor.py +33 -33
- msprobe/core/data_dump/data_processor/pytorch_processor.py +99 -18
- msprobe/core/data_dump/json_writer.py +61 -40
- msprobe/core/grad_probe/constant.py +1 -0
- msprobe/core/grad_probe/grad_compare.py +1 -1
- msprobe/core/overflow_check/abnormal_scene.py +2 -0
- msprobe/docs/01.installation.md +27 -1
- msprobe/docs/02.config_introduction.md +27 -23
- msprobe/docs/03.config_examples.md +24 -0
- msprobe/docs/05.data_dump_PyTorch.md +103 -16
- msprobe/docs/06.data_dump_MindSpore.md +76 -32
- msprobe/docs/07.accuracy_checker_PyTorch.md +11 -1
- msprobe/docs/08.accuracy_checker_online_PyTorch.md +3 -1
- msprobe/docs/09.accuracy_checker_MindSpore.md +5 -3
- msprobe/docs/10.accuracy_compare_PyTorch.md +59 -33
- msprobe/docs/11.accuracy_compare_MindSpore.md +40 -16
- msprobe/docs/12.overflow_check_PyTorch.md +3 -1
- msprobe/docs/13.overflow_check_MindSpore.md +4 -2
- msprobe/docs/14.data_parse_PyTorch.md +1 -7
- msprobe/docs/18.online_dispatch.md +1 -1
- msprobe/docs/19.monitor.md +332 -273
- msprobe/docs/21.visualization_PyTorch.md +42 -13
- msprobe/docs/22.visualization_MindSpore.md +43 -13
- msprobe/docs/23.generate_operator_PyTorch.md +9 -9
- msprobe/docs/27.dump_json_instruction.md +301 -27
- msprobe/docs/28.debugger_save_instruction.md +94 -0
- msprobe/docs/28.kernel_dump_MindSpore.md +69 -0
- msprobe/docs/29.data_dump_MSAdapter.md +229 -0
- msprobe/docs/30.overflow_check_MSAdapter.md +31 -0
- msprobe/docs/FAQ.md +3 -11
- msprobe/docs/img/compare_result.png +0 -0
- msprobe/docs/img/merge_result.png +0 -0
- msprobe/docs/img/monitor/step_count_per_record.png +0 -0
- msprobe/docs/img/visualization/vis_browser_1.png +0 -0
- msprobe/docs/img/visualization/vis_match_info.png +0 -0
- msprobe/docs/img/visualization/vis_precision_info.png +0 -0
- msprobe/docs/img/visualization/vis_search_info.png +0 -0
- msprobe/docs/img/visualization/vis_show_info.png +0 -0
- msprobe/docs/img/visualization/vis_showcase.png +0 -0
- msprobe/docs/img/visualization/vis_unmatch_info.png +0 -0
- msprobe/mindspore/__init__.py +4 -2
- msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +32 -7
- msprobe/mindspore/api_accuracy_checker/api_runner.py +70 -22
- msprobe/mindspore/api_accuracy_checker/base_compare_algorithm.py +2 -1
- msprobe/mindspore/api_accuracy_checker/bench_functions/flash_attention_score.py +602 -0
- msprobe/mindspore/api_accuracy_checker/bench_functions/fusion_operator.py +41 -0
- msprobe/mindspore/api_accuracy_checker/compute_element.py +47 -1
- msprobe/mindspore/api_accuracy_checker/data_manager.py +2 -1
- msprobe/mindspore/api_accuracy_checker/multi_api_accuracy_checker.py +2 -1
- msprobe/mindspore/api_accuracy_checker/torch_mindtorch_importer.py +130 -0
- msprobe/mindspore/api_accuracy_checker/type_mapping.py +24 -1
- msprobe/mindspore/api_accuracy_checker/utils.py +6 -1
- msprobe/mindspore/common/const.py +61 -0
- msprobe/mindspore/common/utils.py +48 -18
- msprobe/mindspore/compare/ms_compare.py +27 -19
- msprobe/mindspore/compare/ms_graph_compare.py +6 -5
- msprobe/mindspore/debugger/debugger_config.py +31 -6
- msprobe/mindspore/debugger/precision_debugger.py +45 -14
- msprobe/mindspore/dump/dump_tool_factory.py +5 -3
- msprobe/mindspore/dump/hook_cell/api_register.py +142 -0
- msprobe/mindspore/dump/hook_cell/hook_cell.py +9 -10
- msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml +24 -26
- msprobe/mindspore/dump/jit_dump.py +21 -15
- msprobe/mindspore/dym_loader/hook_dynamic_loader.cc +22 -56
- msprobe/mindspore/dym_loader/hook_dynamic_loader.h +0 -1
- msprobe/mindspore/free_benchmark/api_pynative_self_check.py +10 -6
- msprobe/mindspore/free_benchmark/perturbation/perturbation_factory.py +4 -2
- msprobe/mindspore/free_benchmark/self_check_tool_factory.py +6 -3
- msprobe/mindspore/grad_probe/global_context.py +2 -0
- msprobe/mindspore/grad_probe/grad_analyzer.py +2 -1
- msprobe/mindspore/grad_probe/hook.py +2 -4
- msprobe/mindspore/monitor/anomaly_detect.py +404 -0
- msprobe/mindspore/monitor/distributed/__init__.py +0 -0
- msprobe/mindspore/monitor/distributed/distributed_ops.yaml +15 -0
- msprobe/mindspore/monitor/distributed/stack_blacklist.yaml +5 -0
- msprobe/mindspore/monitor/distributed/wrap_distributed.py +300 -0
- msprobe/mindspore/monitor/features.py +63 -0
- msprobe/mindspore/monitor/module_hook.py +873 -0
- msprobe/mindspore/monitor/module_spec_verifier.py +94 -0
- msprobe/mindspore/monitor/utils.py +309 -0
- msprobe/mindspore/ms_config.py +8 -2
- msprobe/mindspore/overflow_check/overflow_check_tool_factory.py +5 -3
- msprobe/mindspore/service.py +114 -34
- msprobe/pytorch/__init__.py +0 -1
- msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +3 -6
- msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +12 -7
- msprobe/pytorch/api_accuracy_checker/generate_op_script/operator_replication.template +2 -2
- msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +4 -5
- msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +5 -5
- msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +25 -6
- msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py +28 -19
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py +3 -1
- msprobe/pytorch/bench_functions/apply_adam.py +215 -0
- msprobe/pytorch/bench_functions/group_norm_silu.py +27 -0
- msprobe/pytorch/{parse.py → bench_functions/mish.py} +6 -4
- msprobe/pytorch/bench_functions/moe_gating_top_k_softmax.py +50 -0
- msprobe/pytorch/bench_functions/sort_v2.py +21 -0
- msprobe/pytorch/common/utils.py +97 -4
- msprobe/pytorch/debugger/debugger_config.py +19 -9
- msprobe/pytorch/debugger/precision_debugger.py +24 -1
- msprobe/pytorch/dump/module_dump/module_dump.py +4 -3
- msprobe/pytorch/dump/module_dump/module_processer.py +21 -35
- msprobe/pytorch/free_benchmark/common/utils.py +1 -1
- msprobe/pytorch/free_benchmark/compare/single_benchmark.py +1 -1
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py +3 -3
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py +3 -3
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/change_value.py +1 -1
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py +1 -1
- msprobe/pytorch/free_benchmark/result_handlers/check_handler.py +1 -1
- msprobe/pytorch/function_factory.py +8 -2
- msprobe/pytorch/grad_probe/grad_monitor.py +2 -2
- msprobe/pytorch/hook_module/api_register.py +131 -0
- msprobe/pytorch/hook_module/hook_module.py +19 -14
- msprobe/pytorch/hook_module/register_optimizer_hook.py +2 -1
- msprobe/pytorch/hook_module/support_wrap_ops.yaml +173 -75
- msprobe/pytorch/monitor/anomaly_detect.py +14 -29
- msprobe/pytorch/monitor/csv2tb.py +18 -14
- msprobe/pytorch/monitor/distributed/wrap_distributed.py +8 -2
- msprobe/pytorch/monitor/module_hook.py +238 -193
- msprobe/pytorch/monitor/module_metric.py +9 -6
- msprobe/pytorch/monitor/optimizer_collect.py +100 -67
- msprobe/pytorch/monitor/unittest/test_monitor.py +1 -1
- msprobe/pytorch/monitor/utils.py +76 -44
- msprobe/pytorch/online_dispatch/compare.py +0 -2
- msprobe/pytorch/online_dispatch/dispatch.py +9 -0
- msprobe/pytorch/online_dispatch/dump_compare.py +3 -0
- msprobe/pytorch/online_dispatch/utils.py +3 -0
- msprobe/pytorch/parse_tool/lib/interactive_cli.py +1 -6
- msprobe/pytorch/parse_tool/lib/utils.py +2 -1
- msprobe/pytorch/pt_config.py +30 -29
- msprobe/pytorch/service.py +114 -32
- msprobe/visualization/builder/graph_builder.py +75 -10
- msprobe/visualization/builder/msprobe_adapter.py +7 -6
- msprobe/visualization/compare/graph_comparator.py +42 -38
- msprobe/visualization/compare/mode_adapter.py +0 -19
- msprobe/visualization/graph/base_node.py +11 -3
- msprobe/visualization/graph/distributed_analyzer.py +71 -3
- msprobe/visualization/graph/graph.py +0 -11
- msprobe/visualization/graph/node_op.py +4 -3
- msprobe/visualization/graph_service.py +4 -5
- msprobe/visualization/utils.py +12 -35
- msprobe/mindspore/dump/hook_cell/api_registry.py +0 -205
- msprobe/mindspore/dump/hook_cell/wrap_api.py +0 -212
- msprobe/pytorch/hook_module/api_registry.py +0 -166
- msprobe/pytorch/hook_module/wrap_distributed.py +0 -75
- msprobe/pytorch/hook_module/wrap_functional.py +0 -66
- msprobe/pytorch/hook_module/wrap_npu_custom.py +0 -85
- msprobe/pytorch/hook_module/wrap_tensor.py +0 -69
- msprobe/pytorch/hook_module/wrap_torch.py +0 -84
- msprobe/pytorch/hook_module/wrap_vf.py +0 -60
- {mindstudio_probe-1.2.1.dist-info → mindstudio_probe-1.3.0.dist-info}/LICENSE +0 -0
- {mindstudio_probe-1.2.1.dist-info → mindstudio_probe-1.3.0.dist-info}/WHEEL +0 -0
- {mindstudio_probe-1.2.1.dist-info → mindstudio_probe-1.3.0.dist-info}/entry_points.txt +0 -0
- {mindstudio_probe-1.2.1.dist-info → mindstudio_probe-1.3.0.dist-info}/top_level.txt +0 -0
|
@@ -23,10 +23,10 @@ import numpy as np
|
|
|
23
23
|
from msprobe.core.common.const import Const
|
|
24
24
|
from msprobe.core.data_dump.data_processor.base import (BaseDataProcessor, TensorStatInfo,
|
|
25
25
|
ModuleForwardInputsOutputs, ModuleBackwardInputsOutputs)
|
|
26
|
-
from msprobe.core.common.file_utils import path_len_exceeds_limit
|
|
26
|
+
from msprobe.core.common.file_utils import path_len_exceeds_limit, save_npy
|
|
27
27
|
from msprobe.mindspore.common.utils import convert_bf16_to_fp32, save_tensor_as_npy
|
|
28
28
|
from msprobe.mindspore.common.log import logger
|
|
29
|
-
from msprobe.mindspore.dump.hook_cell.
|
|
29
|
+
from msprobe.mindspore.dump.hook_cell.api_register import get_api_register
|
|
30
30
|
|
|
31
31
|
has_adump = True
|
|
32
32
|
try:
|
|
@@ -44,6 +44,7 @@ class MindsporeDataProcessor(BaseDataProcessor):
|
|
|
44
44
|
"dtype": self.analyze_dtype_in_kwargs
|
|
45
45
|
}
|
|
46
46
|
self._async_dump_cache = {}
|
|
47
|
+
self.api_register = get_api_register()
|
|
47
48
|
|
|
48
49
|
@staticmethod
|
|
49
50
|
def get_md5_for_tensor(x):
|
|
@@ -74,61 +75,51 @@ class MindsporeDataProcessor(BaseDataProcessor):
|
|
|
74
75
|
else:
|
|
75
76
|
if not ops.is_floating_point(data) or data.dtype == ms.float64:
|
|
76
77
|
data = data.to(ms.float32)
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
if hasattr(mint, "norm"):
|
|
82
|
-
get_norm_value = api_register.mint_ops_ori_attr.get("norm", mint.norm)
|
|
83
|
-
else:
|
|
84
|
-
get_norm_value = api_register.functional_ori_attr.get("norm", ops.norm)
|
|
85
|
-
tensor_stat.max = get_max_value(data).item()
|
|
86
|
-
tensor_stat.min = get_min_value(data).item()
|
|
87
|
-
tensor_stat.mean = get_mean_value(data).item()
|
|
78
|
+
get_norm_value = mint.norm if hasattr(mint, "norm") else ops.norm
|
|
79
|
+
tensor_stat.max = mint.max(data).item()
|
|
80
|
+
tensor_stat.min = mint.min(data).item()
|
|
81
|
+
tensor_stat.mean = mint.mean(data).item()
|
|
88
82
|
tensor_stat.norm = get_norm_value(data).item()
|
|
89
|
-
api_register.norm_inner_op_set_hook_func()
|
|
90
83
|
return tensor_stat
|
|
91
84
|
|
|
92
85
|
@staticmethod
|
|
93
86
|
def get_stat_info_async(data):
|
|
94
87
|
tensor_stat = TensorStatInfo()
|
|
95
|
-
stack_method = api_register.functional_ori_attr.get("stack", ms.ops.stack)
|
|
96
88
|
if data.dtype == ms.complex64 or data.dtype == ms.complex128:
|
|
97
89
|
logger.warning("Async dump do not support complex data!")
|
|
98
90
|
return tensor_stat
|
|
99
91
|
elif data.dtype == ms.bool_:
|
|
100
|
-
tensor_stat.stack_tensor_stat = (["Max", "Min"],
|
|
92
|
+
tensor_stat.stack_tensor_stat = (["Max", "Min"], ops.stack([data.any(), data.all()]))
|
|
101
93
|
elif not data.shape:
|
|
102
|
-
tensor_stat.stack_tensor_stat = (["Max", "Min", "Mean", "Norm"],
|
|
94
|
+
tensor_stat.stack_tensor_stat = (["Max", "Min", "Mean", "Norm"], ops.stack([data, data, data, data]))
|
|
103
95
|
else:
|
|
104
96
|
if not ops.is_floating_point(data) or data.dtype == ms.float64:
|
|
105
97
|
data = data.to(ms.float32)
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
get_mean_value = api_register.mint_ops_ori_attr.get("mean", mint.mean)
|
|
110
|
-
if hasattr(mint, "norm"):
|
|
111
|
-
get_norm_value = api_register.mint_ops_ori_attr.get("norm", mint.norm)
|
|
112
|
-
else:
|
|
113
|
-
get_norm_value = api_register.functional_ori_attr.get("norm", ops.norm)
|
|
114
|
-
tensor_stat.stack_tensor_stat = (["Max", "Min", "Mean", "Norm"], stack_method(
|
|
115
|
-
[get_max_value(data), get_min_value(data), get_mean_value(data), get_norm_value(data)]))
|
|
116
|
-
api_register.norm_inner_op_set_hook_func()
|
|
98
|
+
get_norm_value = mint.norm if hasattr(mint, "norm") else ops.norm
|
|
99
|
+
tensor_stat.stack_tensor_stat = (["Max", "Min", "Mean", "Norm"], ops.stack(
|
|
100
|
+
[mint.max(data), mint.min(data), mint.mean(data), get_norm_value(data)]))
|
|
117
101
|
return tensor_stat
|
|
118
102
|
|
|
103
|
+
@staticmethod
|
|
104
|
+
def is_hookable_element(element):
|
|
105
|
+
return hasattr(element, "register_hook") and callable(element.register_hook)
|
|
106
|
+
|
|
119
107
|
@classmethod
|
|
120
108
|
def get_special_types(cls):
|
|
121
109
|
return super().get_special_types() + cls.mindspore_special_type
|
|
122
110
|
|
|
123
111
|
def get_stat_info(self, data):
|
|
112
|
+
self.api_register.restore_inner_used_api()
|
|
124
113
|
tensor_stat = TensorStatInfo()
|
|
125
114
|
if data.numel() == 0:
|
|
126
|
-
|
|
115
|
+
stat_info = tensor_stat
|
|
127
116
|
else:
|
|
128
117
|
if self.config.async_dump:
|
|
129
|
-
|
|
118
|
+
stat_info = MindsporeDataProcessor.get_stat_info_async(data)
|
|
130
119
|
else:
|
|
131
|
-
|
|
120
|
+
stat_info = MindsporeDataProcessor.get_stat_info_sync(data)
|
|
121
|
+
self.api_register.register_inner_used_api()
|
|
122
|
+
return stat_info
|
|
132
123
|
|
|
133
124
|
def analyze_single_element(self, element, suffix_stack):
|
|
134
125
|
if suffix_stack and suffix_stack[-1] in self.mindspore_object_key:
|
|
@@ -136,11 +127,13 @@ class MindsporeDataProcessor(BaseDataProcessor):
|
|
|
136
127
|
|
|
137
128
|
converted_numpy, numpy_type = self._convert_numpy_to_builtin(element)
|
|
138
129
|
if converted_numpy is not element:
|
|
139
|
-
return
|
|
130
|
+
return {"type": numpy_type, "value": converted_numpy}
|
|
140
131
|
if isinstance(element, Number):
|
|
141
132
|
return self.analyze_dtype_in_kwargs(element)
|
|
142
133
|
if isinstance(element, ms.Tensor):
|
|
143
|
-
return self._analyze_tensor(element, Const.SEP.join(suffix_stack))
|
|
134
|
+
return self._analyze_tensor(element, Const.SEP.join([str(suffix) for suffix in suffix_stack]))
|
|
135
|
+
if isinstance(element, np.ndarray):
|
|
136
|
+
return self._analyze_numpy(element, Const.SEP.join([str(suffix) for suffix in suffix_stack]))
|
|
144
137
|
if isinstance(element, (bool, int, float, str, slice, type(Ellipsis))):
|
|
145
138
|
return self._analyze_builtin(element)
|
|
146
139
|
return {}
|
|
@@ -186,6 +179,13 @@ class TensorDataProcessor(MindsporeDataProcessor):
|
|
|
186
179
|
save_tensor_as_npy(tensor, file_path)
|
|
187
180
|
return single_arg
|
|
188
181
|
|
|
182
|
+
def _analyze_numpy(self, ndarray, suffix):
|
|
183
|
+
dump_data_name, file_path = self.get_save_file_path(suffix)
|
|
184
|
+
save_npy(ndarray, file_path)
|
|
185
|
+
ndarray_json = super()._analyze_numpy(ndarray, suffix)
|
|
186
|
+
ndarray_json.update({"data_name": dump_data_name})
|
|
187
|
+
return ndarray_json
|
|
188
|
+
|
|
189
189
|
|
|
190
190
|
class OverflowCheckDataProcessor(MindsporeDataProcessor):
|
|
191
191
|
__slots__ = ["cached_tensors_and_file_paths"]
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# Copyright (c) 2024-
|
|
1
|
+
# Copyright (c) 2024-2025, Huawei Technologies Co., Ltd.
|
|
2
2
|
# All rights reserved.
|
|
3
3
|
#
|
|
4
4
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
@@ -21,16 +21,18 @@ from typing import List
|
|
|
21
21
|
import numpy as np
|
|
22
22
|
import torch
|
|
23
23
|
from torch import distributed as dist
|
|
24
|
+
from torch.distributed.distributed_c10d import _get_default_group
|
|
24
25
|
|
|
25
26
|
from msprobe.core.common.const import Const
|
|
27
|
+
from msprobe.core.common.exceptions import MsprobeException
|
|
26
28
|
from msprobe.core.common.file_utils import path_len_exceeds_limit
|
|
27
29
|
from msprobe.core.common.log import logger
|
|
28
30
|
from msprobe.core.common.utils import convert_tuple
|
|
31
|
+
from msprobe.core.common.decorator import recursion_depth_decorator
|
|
29
32
|
from msprobe.core.data_dump.data_processor.base import BaseDataProcessor, ModuleBackwardInputsOutputs, \
|
|
30
33
|
ModuleForwardInputsOutputs, TensorStatInfo
|
|
31
|
-
from msprobe.pytorch.common.utils import save_pt,
|
|
34
|
+
from msprobe.pytorch.common.utils import Const as PtConst, save_pt, is_hifloat8_tensor, is_float8_tensor
|
|
32
35
|
from msprobe.pytorch.free_benchmark import FreeBenchmarkCheck, UnequalRow
|
|
33
|
-
from msprobe.core.common.utils import recursion_depth_decorator
|
|
34
36
|
|
|
35
37
|
is_gpu = False
|
|
36
38
|
try:
|
|
@@ -40,7 +42,16 @@ except ImportError:
|
|
|
40
42
|
|
|
41
43
|
|
|
42
44
|
class PytorchDataProcessor(BaseDataProcessor):
|
|
43
|
-
pytorch_special_type = (
|
|
45
|
+
pytorch_special_type = (
|
|
46
|
+
torch.device,
|
|
47
|
+
torch.dtype,
|
|
48
|
+
torch.Size,
|
|
49
|
+
torch.Tensor,
|
|
50
|
+
torch.memory_format,
|
|
51
|
+
dist.ProcessGroup,
|
|
52
|
+
dist.P2POp,
|
|
53
|
+
dist.ReduceOp
|
|
54
|
+
)
|
|
44
55
|
memory_format = {
|
|
45
56
|
torch.contiguous_format: "contiguous_format",
|
|
46
57
|
torch.channels_last: "channels_last",
|
|
@@ -68,14 +79,16 @@ class PytorchDataProcessor(BaseDataProcessor):
|
|
|
68
79
|
def analyze_device_in_kwargs(element):
|
|
69
80
|
single_arg = {}
|
|
70
81
|
single_arg.update({'type': "torch.device"})
|
|
71
|
-
if
|
|
82
|
+
if isinstance(element, (int, str)):
|
|
83
|
+
single_arg.update({"value": element})
|
|
84
|
+
elif isinstance(element, torch.device):
|
|
72
85
|
if hasattr(element, "index"):
|
|
73
86
|
device_value = element.type + ":" + str(element.index)
|
|
74
87
|
else:
|
|
75
88
|
device_value = element.type
|
|
76
89
|
single_arg.update({"value": device_value})
|
|
77
90
|
else:
|
|
78
|
-
|
|
91
|
+
logger.debug(f"Device type {type(element)} is not supported.")
|
|
79
92
|
return single_arg
|
|
80
93
|
|
|
81
94
|
@staticmethod
|
|
@@ -133,7 +146,7 @@ class PytorchDataProcessor(BaseDataProcessor):
|
|
|
133
146
|
if data.is_meta:
|
|
134
147
|
return tensor_stat
|
|
135
148
|
data_clone = data.detach()
|
|
136
|
-
if data_clone.numel()
|
|
149
|
+
if not data_clone.numel() or not data_clone.data_ptr():
|
|
137
150
|
return tensor_stat
|
|
138
151
|
else:
|
|
139
152
|
if data_clone.device.type == Const.CPU_LOWERCASE or not async_dump:
|
|
@@ -168,6 +181,11 @@ class PytorchDataProcessor(BaseDataProcessor):
|
|
|
168
181
|
def is_distributed_op(module):
|
|
169
182
|
return getattr(module, "op_is_distributed", False)
|
|
170
183
|
|
|
184
|
+
@staticmethod
|
|
185
|
+
def is_hookable_element(element):
|
|
186
|
+
return (hasattr(element, "register_hook") and callable(element.register_hook)) and \
|
|
187
|
+
(hasattr(element, "requires_grad") and element.requires_grad)
|
|
188
|
+
|
|
171
189
|
@staticmethod
|
|
172
190
|
def _analyze_torch_size(arg):
|
|
173
191
|
return {"type": "torch.Size", "value": list(arg)}
|
|
@@ -176,7 +194,6 @@ class PytorchDataProcessor(BaseDataProcessor):
|
|
|
176
194
|
def _analyze_memory_format(arg):
|
|
177
195
|
# 获取内存格式
|
|
178
196
|
format_type = PytorchDataProcessor.memory_format.get(arg)
|
|
179
|
-
|
|
180
197
|
return {"type": "torch.memory_format", "format": format_type}
|
|
181
198
|
|
|
182
199
|
@staticmethod
|
|
@@ -188,9 +205,30 @@ class PytorchDataProcessor(BaseDataProcessor):
|
|
|
188
205
|
group_id = PytorchDataProcessor.process_group_hash(arg)
|
|
189
206
|
group_info.update({"group_id": group_id})
|
|
190
207
|
except Exception as e:
|
|
191
|
-
logger.warning(f"Failed to get process group
|
|
208
|
+
logger.warning(f"Failed to get process group ranks info with error info: {e}.")
|
|
192
209
|
return group_info
|
|
193
210
|
|
|
211
|
+
@staticmethod
|
|
212
|
+
def _analyze_reduce_op(arg):
|
|
213
|
+
op_type = None
|
|
214
|
+
try:
|
|
215
|
+
op_type = str(arg)
|
|
216
|
+
except Exception as e:
|
|
217
|
+
logger.warning(f"Failed to get value of torch.distributed.ReduceOp with error info: {e}.")
|
|
218
|
+
return {"type": "torch.distributed.ReduceOp", "value": op_type}
|
|
219
|
+
|
|
220
|
+
@staticmethod
|
|
221
|
+
def _cast_to_float_if_fp8(tensor):
|
|
222
|
+
dtype = str(tensor.dtype)
|
|
223
|
+
if is_float8_tensor(tensor):
|
|
224
|
+
dtype = PtConst.HIFLOAT8_TYPE if is_hifloat8_tensor(tensor) else dtype
|
|
225
|
+
logger.debug(
|
|
226
|
+
f"The {dtype} tensor analyzing/saving is unsupported in dump function."
|
|
227
|
+
f"Casting to float for processing."
|
|
228
|
+
)
|
|
229
|
+
tensor = tensor.float()
|
|
230
|
+
return tensor, dtype
|
|
231
|
+
|
|
194
232
|
@classmethod
|
|
195
233
|
def get_special_types(cls):
|
|
196
234
|
return super().get_special_types() + cls.pytorch_special_type
|
|
@@ -204,11 +242,17 @@ class PytorchDataProcessor(BaseDataProcessor):
|
|
|
204
242
|
return self._analyze_memory_format(element)
|
|
205
243
|
if isinstance(element, dist.ProcessGroup):
|
|
206
244
|
return self._analyze_process_group(element)
|
|
245
|
+
if isinstance(element, dist.P2POp):
|
|
246
|
+
return self._analyze_p2pop(element, Const.SEP.join([str(suffix) for suffix in suffix_stack]))
|
|
247
|
+
if isinstance(element, dist.ReduceOp):
|
|
248
|
+
return self._analyze_reduce_op(element)
|
|
207
249
|
converted_numpy, numpy_type = self._convert_numpy_to_builtin(element)
|
|
208
250
|
if converted_numpy is not element:
|
|
209
|
-
return
|
|
251
|
+
return {"type": numpy_type, "value": converted_numpy}
|
|
210
252
|
if isinstance(element, torch.Tensor):
|
|
211
|
-
return self._analyze_tensor(element, Const.SEP.join(suffix_stack))
|
|
253
|
+
return self._analyze_tensor(element, Const.SEP.join([str(suffix) for suffix in suffix_stack]))
|
|
254
|
+
if isinstance(element, np.ndarray):
|
|
255
|
+
return self._analyze_numpy(element, Const.SEP.join([str(suffix) for suffix in suffix_stack]))
|
|
212
256
|
if isinstance(element, (bool, int, float, str, slice, type(Ellipsis))):
|
|
213
257
|
return self._analyze_builtin(element)
|
|
214
258
|
return {}
|
|
@@ -218,11 +262,27 @@ class PytorchDataProcessor(BaseDataProcessor):
|
|
|
218
262
|
module_input_output.update_output_with_args_and_kwargs()
|
|
219
263
|
return super().analyze_forward_output(name, module, module_input_output)
|
|
220
264
|
|
|
265
|
+
def _analyze_p2pop(self, arg, suffix):
|
|
266
|
+
p2pop_info = {"class_type": "torch.distributed.P2POp"}
|
|
267
|
+
try:
|
|
268
|
+
tensor_info = self._analyze_tensor(arg.tensor, suffix)
|
|
269
|
+
p2pop_info.update({"tensor": tensor_info})
|
|
270
|
+
p2pop_info.update({"op": arg.op.__name__})
|
|
271
|
+
p2pop_info.update({"peer": arg.peer})
|
|
272
|
+
p2pop_info.update({"tag": arg.tag})
|
|
273
|
+
group_id = PytorchDataProcessor.process_group_hash(
|
|
274
|
+
arg.group) if arg.group else PytorchDataProcessor.process_group_hash(_get_default_group())
|
|
275
|
+
p2pop_info.update({"group_id": group_id})
|
|
276
|
+
except Exception as e:
|
|
277
|
+
logger.warning(f"Failed to parse the P2POp content with error info: {e}.")
|
|
278
|
+
return p2pop_info
|
|
279
|
+
|
|
221
280
|
def _analyze_tensor(self, tensor, suffix):
|
|
281
|
+
tensor, dtype = self._cast_to_float_if_fp8(tensor)
|
|
222
282
|
tensor_stat = self.get_stat_info(tensor, self.config.async_dump)
|
|
223
283
|
tensor_json = {}
|
|
224
284
|
tensor_json.update({'type': 'torch.Tensor'})
|
|
225
|
-
tensor_json.update({'dtype':
|
|
285
|
+
tensor_json.update({'dtype': dtype})
|
|
226
286
|
tensor_json.update({"shape": tensor.shape})
|
|
227
287
|
if tensor_stat.stack_tensor_stat is None:
|
|
228
288
|
tensor_json.update({"Max": tensor_stat.max})
|
|
@@ -261,6 +321,7 @@ class TensorDataProcessor(PytorchDataProcessor):
|
|
|
261
321
|
dump_data_name, file_path = self.get_save_file_path(suffix)
|
|
262
322
|
single_arg = super()._analyze_tensor(tensor, suffix)
|
|
263
323
|
single_arg.update({"data_name": dump_data_name})
|
|
324
|
+
tensor, _ = self._cast_to_float_if_fp8(tensor)
|
|
264
325
|
if self.config.async_dump:
|
|
265
326
|
self._async_dump_cache[file_path] = tensor.clone().detach()
|
|
266
327
|
else:
|
|
@@ -268,6 +329,13 @@ class TensorDataProcessor(PytorchDataProcessor):
|
|
|
268
329
|
save_pt(saved_tensor, file_path)
|
|
269
330
|
return single_arg
|
|
270
331
|
|
|
332
|
+
def _analyze_numpy(self, ndarray, suffix):
|
|
333
|
+
dump_data_name, file_path = self.get_save_file_path(suffix)
|
|
334
|
+
save_pt(torch.tensor(ndarray), file_path)
|
|
335
|
+
ndarray_json = super()._analyze_numpy(ndarray, suffix)
|
|
336
|
+
ndarray_json.update({"data_name": dump_data_name})
|
|
337
|
+
return ndarray_json
|
|
338
|
+
|
|
271
339
|
|
|
272
340
|
class OverflowCheckDataProcessor(PytorchDataProcessor):
|
|
273
341
|
__slots__ = ["cached_tensors_and_file_paths"]
|
|
@@ -319,7 +387,7 @@ class OverflowCheckDataProcessor(PytorchDataProcessor):
|
|
|
319
387
|
api_info_struct = super().analyze_backward(name, module, module_input_output)
|
|
320
388
|
self.handle_overflow()
|
|
321
389
|
return api_info_struct if self.has_overflow else None
|
|
322
|
-
|
|
390
|
+
|
|
323
391
|
def analyze_params(self, name, param_name, grad):
|
|
324
392
|
self.has_overflow = False
|
|
325
393
|
self._is_support_inf_nan()
|
|
@@ -332,7 +400,8 @@ class OverflowCheckDataProcessor(PytorchDataProcessor):
|
|
|
332
400
|
self._analyze_maybe_overflow_flag()
|
|
333
401
|
if self.has_overflow:
|
|
334
402
|
for file_path, tensor in self.cached_tensors_and_file_paths.items():
|
|
335
|
-
|
|
403
|
+
tensor, _ = self._cast_to_float_if_fp8(tensor)
|
|
404
|
+
save_pt(tensor.clone().contiguous().detach(), file_path)
|
|
336
405
|
self.real_overflow_nums += 1
|
|
337
406
|
if self.overflow_nums != -1 and self.real_overflow_nums >= self.overflow_nums:
|
|
338
407
|
logger.info(f"[{Const.TOOL_NAME}] Reached the preset overflow times, "
|
|
@@ -457,11 +526,13 @@ class KernelDumpDataProcessor(PytorchDataProcessor):
|
|
|
457
526
|
return
|
|
458
527
|
|
|
459
528
|
if self.config.is_backward_kernel_dump:
|
|
460
|
-
self.forward_args = self.clone_and_detach_tensor(module_input_output.args)
|
|
461
|
-
self.forward_kwargs = self.clone_and_detach_tensor(module_input_output.kwargs)
|
|
462
529
|
try:
|
|
530
|
+
self.forward_args = self.clone_and_detach_tensor(module_input_output.args)
|
|
531
|
+
self.forward_kwargs = self.clone_and_detach_tensor(module_input_output.kwargs)
|
|
463
532
|
output = module.forward(*self.forward_args, **self.forward_kwargs)
|
|
464
|
-
except Exception:
|
|
533
|
+
except Exception as e:
|
|
534
|
+
if isinstance(e, MsprobeException):
|
|
535
|
+
logger.warning(str(e))
|
|
465
536
|
self._print_unsupported_log(name)
|
|
466
537
|
self.enable_kernel_dump = False
|
|
467
538
|
return
|
|
@@ -503,9 +574,17 @@ class KernelDumpDataProcessor(PytorchDataProcessor):
|
|
|
503
574
|
self.stop_kernel_dump()
|
|
504
575
|
logger.info(f"The kernel data of {name} is dumped successfully.")
|
|
505
576
|
|
|
506
|
-
@recursion_depth_decorator(
|
|
577
|
+
@recursion_depth_decorator(
|
|
578
|
+
"KernelDump: KernelDumpDataProcessor.clone_and_detach_tensor",
|
|
579
|
+
max_depth=Const.DUMP_MAX_DEPTH
|
|
580
|
+
)
|
|
507
581
|
def clone_and_detach_tensor(self, input_params):
|
|
508
582
|
if isinstance(input_params, torch.Tensor):
|
|
583
|
+
if is_float8_tensor(input_params):
|
|
584
|
+
raise MsprobeException(
|
|
585
|
+
MsprobeException.UNSUPPORTED_TYPE_ERROR,
|
|
586
|
+
f"L2 backward dump does not support float8 type."
|
|
587
|
+
)
|
|
509
588
|
if input_params.requires_grad:
|
|
510
589
|
return input_params.clone().detach().requires_grad_()
|
|
511
590
|
return input_params.clone()
|
|
@@ -520,6 +599,8 @@ class KernelDumpDataProcessor(PytorchDataProcessor):
|
|
|
520
599
|
|
|
521
600
|
def analyze_single_element(self, element, suffix_stack):
|
|
522
601
|
if isinstance(element, torch.Tensor):
|
|
602
|
+
if is_float8_tensor(element):
|
|
603
|
+
return {}
|
|
523
604
|
if not self.is_found_output_tensor:
|
|
524
605
|
if element.requires_grad:
|
|
525
606
|
self.forward_output_tensor = element
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# Copyright (c) 2024-
|
|
1
|
+
# Copyright (c) 2024-2025, Huawei Technologies Co., Ltd.
|
|
2
2
|
# All rights reserved.
|
|
3
3
|
#
|
|
4
4
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
@@ -15,12 +15,15 @@
|
|
|
15
15
|
|
|
16
16
|
import csv
|
|
17
17
|
import os
|
|
18
|
-
import
|
|
18
|
+
import copy
|
|
19
|
+
import threading
|
|
19
20
|
|
|
20
21
|
from msprobe.core.common.const import Const, FileCheckConst
|
|
21
22
|
from msprobe.core.common.file_utils import change_mode, FileOpen, save_json, load_json
|
|
22
23
|
from msprobe.core.common.log import logger
|
|
23
|
-
from msprobe.core.common.
|
|
24
|
+
from msprobe.core.common.decorator import recursion_depth_decorator
|
|
25
|
+
|
|
26
|
+
lock = threading.Lock()
|
|
24
27
|
|
|
25
28
|
|
|
26
29
|
class DataWriter:
|
|
@@ -31,10 +34,12 @@ class DataWriter:
|
|
|
31
34
|
self.construct_file_path = None
|
|
32
35
|
self.free_benchmark_file_path = None
|
|
33
36
|
self.dump_tensor_data_dir = None
|
|
37
|
+
self.debug_file_path = None
|
|
34
38
|
self.flush_size = 1000
|
|
35
39
|
self.cache_data = {}
|
|
36
40
|
self.cache_stack = {}
|
|
37
41
|
self.cache_construct = {}
|
|
42
|
+
self.cache_debug = {}
|
|
38
43
|
|
|
39
44
|
@staticmethod
|
|
40
45
|
def write_data_to_csv(result: list, result_header: tuple, file_path: str):
|
|
@@ -57,6 +62,13 @@ class DataWriter:
|
|
|
57
62
|
self.cache_construct = {}
|
|
58
63
|
|
|
59
64
|
def initialize_json_file(self, **kwargs):
|
|
65
|
+
if self.debug_file_path and not self.cache_debug:
|
|
66
|
+
# debug level case only create debug.json
|
|
67
|
+
debug_dict = copy.deepcopy(kwargs)
|
|
68
|
+
debug_dict.update({"dump_data_dir": self.dump_tensor_data_dir, Const.DATA: {}})
|
|
69
|
+
self.cache_debug = debug_dict
|
|
70
|
+
save_json(self.debug_file_path, self.cache_debug, indent=1)
|
|
71
|
+
return
|
|
60
72
|
if not self.cache_data:
|
|
61
73
|
kwargs.update({"dump_data_dir": self.dump_tensor_data_dir, Const.DATA: {}})
|
|
62
74
|
self.cache_data = kwargs
|
|
@@ -66,13 +78,13 @@ class DataWriter:
|
|
|
66
78
|
if not self.cache_construct:
|
|
67
79
|
save_json(self.construct_file_path, self.cache_construct, indent=1)
|
|
68
80
|
|
|
69
|
-
def update_dump_paths(self,
|
|
70
|
-
|
|
71
|
-
self.
|
|
72
|
-
self.
|
|
73
|
-
self.
|
|
74
|
-
self.
|
|
75
|
-
self.
|
|
81
|
+
def update_dump_paths(self, dump_path_aggregation):
|
|
82
|
+
self.dump_file_path = dump_path_aggregation.dump_file_path
|
|
83
|
+
self.stack_file_path = dump_path_aggregation.stack_file_path
|
|
84
|
+
self.construct_file_path = dump_path_aggregation.construct_file_path
|
|
85
|
+
self.dump_tensor_data_dir = dump_path_aggregation.dump_tensor_data_dir
|
|
86
|
+
self.free_benchmark_file_path = dump_path_aggregation.free_benchmark_file_path
|
|
87
|
+
self.debug_file_path = dump_path_aggregation.debug_file_path
|
|
76
88
|
|
|
77
89
|
def flush_data_periodically(self):
|
|
78
90
|
dump_data = self.cache_data.get(Const.DATA)
|
|
@@ -80,25 +92,32 @@ class DataWriter:
|
|
|
80
92
|
self.write_json()
|
|
81
93
|
|
|
82
94
|
def update_data(self, new_data):
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
dump_data
|
|
94
|
-
|
|
95
|
-
|
|
95
|
+
with lock:
|
|
96
|
+
if not isinstance(new_data, dict) or len(new_data.keys()) != 1:
|
|
97
|
+
logger.warning(f"The data info({new_data}) should be a dict with only one outer key.")
|
|
98
|
+
return
|
|
99
|
+
dump_data = self.cache_data.get(Const.DATA)
|
|
100
|
+
if not isinstance(dump_data, dict):
|
|
101
|
+
logger.warning(f"The dump data({dump_data}) should be a dict.")
|
|
102
|
+
return
|
|
103
|
+
|
|
104
|
+
key = next(iter(new_data.keys()))
|
|
105
|
+
if key in dump_data:
|
|
106
|
+
dump_data.get(key).update(new_data.get(key))
|
|
107
|
+
else:
|
|
108
|
+
dump_data.update(new_data)
|
|
96
109
|
|
|
97
110
|
def update_stack(self, new_data):
|
|
98
|
-
|
|
111
|
+
with lock:
|
|
112
|
+
self.cache_stack.update(new_data)
|
|
99
113
|
|
|
100
114
|
def update_construct(self, new_data):
|
|
101
|
-
|
|
115
|
+
with lock:
|
|
116
|
+
self.cache_construct.update(new_data)
|
|
117
|
+
|
|
118
|
+
def update_debug(self, new_data):
|
|
119
|
+
with lock:
|
|
120
|
+
self.cache_debug['data'].update(new_data)
|
|
102
121
|
|
|
103
122
|
def write_data_json(self, file_path):
|
|
104
123
|
logger.info(f"dump.json is at {os.path.dirname(os.path.dirname(file_path))}. ")
|
|
@@ -110,21 +129,25 @@ class DataWriter:
|
|
|
110
129
|
def write_construct_info_json(self, file_path):
|
|
111
130
|
save_json(file_path, self.cache_construct, indent=1)
|
|
112
131
|
|
|
132
|
+
def write_debug_info_json(self, file_path):
|
|
133
|
+
save_json(file_path, self.cache_debug, indent=1)
|
|
134
|
+
|
|
113
135
|
def write_json(self):
|
|
114
|
-
|
|
115
|
-
self.
|
|
116
|
-
|
|
117
|
-
self.
|
|
118
|
-
|
|
119
|
-
self.
|
|
136
|
+
with lock:
|
|
137
|
+
if self.cache_data:
|
|
138
|
+
self.write_data_json(self.dump_file_path)
|
|
139
|
+
if self.cache_stack:
|
|
140
|
+
self.write_stack_info_json(self.stack_file_path)
|
|
141
|
+
if self.cache_construct:
|
|
142
|
+
self.write_construct_info_json(self.construct_file_path)
|
|
143
|
+
if self.cache_debug:
|
|
144
|
+
self.write_debug_info_json(self.debug_file_path)
|
|
120
145
|
|
|
121
146
|
def fill_stack_tensor_data(self):
|
|
122
147
|
self.process_stat_data_recursive(self.cache_data)
|
|
123
148
|
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
logger.error(f"The maximum depth of recursive process stat data, {Const.MAX_DEPTH} is reached.")
|
|
127
|
-
raise MsprobeException(MsprobeException.RECURSION_LIMIT_ERROR)
|
|
149
|
+
@recursion_depth_decorator("AsyncDump: DataWriter.process_stat_data_recursive", max_depth=Const.DUMP_MAX_DEPTH)
|
|
150
|
+
def process_stat_data_recursive(self, data):
|
|
128
151
|
if isinstance(data, dict):
|
|
129
152
|
if "tensor_stat" in data.keys():
|
|
130
153
|
tensor_stat = data["tensor_stat"]
|
|
@@ -132,14 +155,12 @@ class DataWriter:
|
|
|
132
155
|
logger.warning("Some bad data in async dump")
|
|
133
156
|
else:
|
|
134
157
|
tensor_stat_index, tensor_stat_data = tensor_stat[0], tensor_stat[1]
|
|
135
|
-
if hasattr(tensor_stat_data, "device") and tensor_stat_data.device != Const.CPU_LOWERCASE:
|
|
136
|
-
tensor_stat_data = tensor_stat_data.cpu()
|
|
137
158
|
for index, stat in zip(tensor_stat_index, tensor_stat_data):
|
|
138
|
-
data.update({index
|
|
159
|
+
data.update({index: stat.item()})
|
|
139
160
|
del data["tensor_stat"]
|
|
140
161
|
else:
|
|
141
162
|
for key in data.keys():
|
|
142
|
-
self.process_stat_data_recursive(data[key]
|
|
163
|
+
self.process_stat_data_recursive(data[key])
|
|
143
164
|
elif isinstance(data, (list, tuple)):
|
|
144
165
|
for i in data:
|
|
145
|
-
self.process_stat_data_recursive(i
|
|
166
|
+
self.process_stat_data_recursive(i)
|
|
@@ -112,7 +112,7 @@ class GradComparator:
|
|
|
112
112
|
result.append([key] + value)
|
|
113
113
|
result_csv_path = os.path.join(output_dir, "similarities.csv")
|
|
114
114
|
if os.path.exists(result_csv_path):
|
|
115
|
-
logger.warning(f"{result_csv_path} will be
|
|
115
|
+
logger.warning(f"{result_csv_path} will be deleted")
|
|
116
116
|
remove_path(result_csv_path)
|
|
117
117
|
write_csv(result, result_csv_path)
|
|
118
118
|
|
|
@@ -20,6 +20,7 @@ import numpy as np
|
|
|
20
20
|
from msprobe.core.overflow_check.api_info import APIInfo
|
|
21
21
|
from msprobe.core.overflow_check.level import OverflowLevel
|
|
22
22
|
from msprobe.core.overflow_check.utils import has_nan_inf
|
|
23
|
+
from msprobe.core.common.decorator import recursion_depth_decorator
|
|
23
24
|
|
|
24
25
|
|
|
25
26
|
class AnomalyScene:
|
|
@@ -35,6 +36,7 @@ class AnomalyScene:
|
|
|
35
36
|
raise NotImplementedError
|
|
36
37
|
|
|
37
38
|
@staticmethod
|
|
39
|
+
@recursion_depth_decorator("AbnormalScene: AnomalyScene._has_anomaly")
|
|
38
40
|
def _has_anomaly(data: Union[Dict, Any]) -> bool:
|
|
39
41
|
"""检查张量是否包含异常值"""
|
|
40
42
|
if isinstance(data, dict):
|
msprobe/docs/01.installation.md
CHANGED
|
@@ -16,6 +16,8 @@ pip install mindstudio-probe
|
|
|
16
16
|
|
|
17
17
|
|版本|发布日期|支持 PyTorch 版本|支持 MindSpore 版本|下载链接|校验码|
|
|
18
18
|
|:--:|:--:|:--:|:--:|:--:|:--:|
|
|
19
|
+
|1.2.2|2025.3.03|1.11/2.0/2.1/2.2|2.4.0|[mindstudio_probe-1.2.2-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/msprobe/1.2/mindstudio_probe-1.2.2-py3-none-any.whl)|961411bb460d327ea51d6ca4d0c8e8c5565f07c0852d7b8592b781ca35b87212|
|
|
20
|
+
|1.2.1|2025.2.07|1.11/2.0/2.1/2.2|2.4.0|[mindstudio_probe-1.2.1-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/msprobe/1.2/mindstudio_probe-1.2.1-py3-none-any.whl)|b64b342118558e0339b39237f88a49b93fd24551b0cb202c872fbfef4260c86b|
|
|
19
21
|
|1.2.0|2025.1.13|1.11/2.0/2.1/2.2|2.4.0|[mindstudio_probe-1.2.0-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/msprobe/1.2/mindstudio_probe-1.2.0-py3-none-any.whl)|1e3aeea1706112f6ee52fd1165037936bb209138f0b9ec42ea21e2c1c8942cdc|
|
|
20
22
|
|1.1.1|2024.12.09|1.11/2.0/2.1/2.2|2.4.0|[mindstudio_probe-1.1.1-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/msprobe/1.1/mindstudio_probe-1.1.1-py3-none-any.whl)|577b597555dc155b76ba1a62d575c3546004644e140a456c3ba0824d46283735|
|
|
21
23
|
|1.1.0|2024.10.14|1.11/2.0/2.1/2.2|2.4.0|[mindstudio_probe-1.1.0-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/msprobe/1.1/mindstudio_probe-1.1.0-py3-none-any.whl)|83a5a9b7c65a357639f8c9636d88c693b4cf0eb590d4f8f5cb56395ba69b1f6d|
|
|
@@ -50,10 +52,34 @@ pip install ./mindstudio_probe*.whl
|
|
|
50
52
|
|
|
51
53
|
|参数|说明|是否必选|
|
|
52
54
|
|--|--|:--:|
|
|
53
|
-
|--include-mod|指定可选模块,可取值`adump`,表示在编whl包时加入adump模块。默认未配置该参数,表示编基础包。<br>• adump模块用于MindSpore静态图场景L2级别的dump。<br>• 仅MindSpore 2.5.0及以上版本支持adump模块。<br>• 若使用源码安装,编译环境需支持GCC 7或以上版本,和CMAKE 3.14或以上版本。<br>• 生成的whl包仅限编译时使用的python版本和处理器架构可用。|否|
|
|
55
|
+
|--include-mod|指定可选模块,可取值`adump`,表示在编whl包时加入adump模块。默认未配置该参数,表示编基础包。<br>• adump模块用于MindSpore静态图场景L2级别的dump。<br>• 仅MindSpore 2.5.0及以上版本支持adump模块。<br>• 若使用源码安装,编译环境需支持GCC 7.5或以上版本,和CMAKE 3.14或以上版本。<br>• 生成的whl包仅限编译时使用的python版本和处理器架构可用。|否|
|
|
54
56
|
|
|
55
57
|
# 特性变更说明
|
|
56
58
|
|
|
59
|
+
## 1.2.0
|
|
60
|
+
|
|
61
|
+
【数据采集】
|
|
62
|
+
|
|
63
|
+
- 模块级dump支持采集权重及权重梯度
|
|
64
|
+
- 修复原地覆盖类API前向输入数据采集不正确的问题
|
|
65
|
+
- seed_all接口支持控制dropout失效功能
|
|
66
|
+
|
|
67
|
+
【精度预检】
|
|
68
|
+
|
|
69
|
+
- MindSpore场景新增支持Tensor类的mint API的预检
|
|
70
|
+
|
|
71
|
+
【训练状态监控】
|
|
72
|
+
|
|
73
|
+
- 支持FSDP和ZeRO-0
|
|
74
|
+
- 异常排序支持前向激活值和反向梯度
|
|
75
|
+
|
|
76
|
+
【分级可视化构图比对】
|
|
77
|
+
|
|
78
|
+
- 支持graph结构分页展示,支持graph批量构建和比对
|
|
79
|
+
- 支持溢出检测模式
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
|
|
57
83
|
## 1.1.1
|
|
58
84
|
|
|
59
85
|
【数据采集】
|