mindstudio-probe 1.2.2__py3-none-any.whl → 8.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/METADATA +4 -3
- {mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/RECORD +243 -191
- msprobe/README.md +57 -21
- msprobe/core/__init__.py +17 -0
- msprobe/core/common/const.py +224 -82
- msprobe/core/common/decorator.py +50 -0
- msprobe/core/common/exceptions.py +5 -3
- msprobe/core/common/file_utils.py +274 -40
- msprobe/core/common/framework_adapter.py +169 -0
- msprobe/core/common/global_lock.py +86 -0
- msprobe/core/common/runtime.py +25 -0
- msprobe/core/common/utils.py +148 -72
- msprobe/core/common_config.py +7 -0
- msprobe/core/compare/acc_compare.py +640 -462
- msprobe/core/compare/check.py +36 -107
- msprobe/core/compare/compare_cli.py +4 -0
- msprobe/core/compare/config.py +72 -0
- msprobe/core/compare/highlight.py +217 -215
- msprobe/core/compare/layer_mapping/layer_mapping.py +4 -1
- msprobe/core/compare/merge_result/merge_result.py +12 -6
- msprobe/core/compare/multiprocessing_compute.py +227 -107
- msprobe/core/compare/npy_compare.py +32 -16
- msprobe/core/compare/utils.py +218 -244
- msprobe/{mindspore/runtime.py → core/config_check/__init__.py} +2 -4
- msprobe/{pytorch/dump/kernel_dump/kernel_config.py → core/config_check/checkers/__init__.py} +8 -16
- msprobe/core/config_check/checkers/base_checker.py +60 -0
- msprobe/core/config_check/checkers/dataset_checker.py +138 -0
- msprobe/core/config_check/checkers/env_args_checker.py +96 -0
- msprobe/core/config_check/checkers/hyperparameter_checker.py +170 -0
- msprobe/core/config_check/checkers/pip_checker.py +90 -0
- msprobe/core/config_check/checkers/random_checker.py +367 -0
- msprobe/core/config_check/checkers/weights_checker.py +147 -0
- msprobe/core/config_check/ckpt_compare/ckpt_comparator.py +74 -0
- msprobe/core/config_check/ckpt_compare/megatron_loader.py +302 -0
- msprobe/core/config_check/ckpt_compare/metrics.py +83 -0
- msprobe/core/config_check/ckpt_compare/name_mapping.yaml +12 -0
- msprobe/core/config_check/config_check_cli.py +51 -0
- msprobe/core/config_check/config_checker.py +100 -0
- msprobe/{pytorch/parse.py → core/config_check/resource/dependency.yaml} +7 -4
- msprobe/core/config_check/resource/env.yaml +57 -0
- msprobe/core/config_check/resource/hyperparameter.yaml +21 -0
- msprobe/core/config_check/utils/hyperparameter_parser.py +115 -0
- msprobe/core/config_check/utils/utils.py +107 -0
- msprobe/core/data_dump/api_registry.py +239 -0
- msprobe/core/data_dump/data_collector.py +36 -9
- msprobe/core/data_dump/data_processor/base.py +74 -53
- msprobe/core/data_dump/data_processor/mindspore_processor.py +119 -78
- msprobe/core/data_dump/data_processor/pytorch_processor.py +134 -96
- msprobe/core/data_dump/json_writer.py +146 -57
- msprobe/core/debugger/precision_debugger.py +143 -0
- msprobe/core/grad_probe/constant.py +2 -1
- msprobe/core/grad_probe/grad_compare.py +2 -2
- msprobe/core/grad_probe/utils.py +1 -1
- msprobe/core/hook_manager.py +242 -0
- msprobe/core/monitor/anomaly_processor.py +384 -0
- msprobe/core/overflow_check/abnormal_scene.py +2 -0
- msprobe/core/service.py +356 -0
- msprobe/core/single_save/__init__.py +0 -0
- msprobe/core/single_save/single_comparator.py +243 -0
- msprobe/core/single_save/single_saver.py +157 -0
- msprobe/docs/01.installation.md +6 -5
- msprobe/docs/02.config_introduction.md +89 -30
- msprobe/docs/03.config_examples.md +1 -0
- msprobe/docs/04.kernel_dump_PyTorch.md +1 -1
- msprobe/docs/05.data_dump_PyTorch.md +184 -50
- msprobe/docs/06.data_dump_MindSpore.md +193 -28
- msprobe/docs/07.accuracy_checker_PyTorch.md +13 -3
- msprobe/docs/08.accuracy_checker_online_PyTorch.md +72 -10
- msprobe/docs/09.accuracy_checker_MindSpore.md +19 -7
- msprobe/docs/10.accuracy_compare_PyTorch.md +266 -102
- msprobe/docs/11.accuracy_compare_MindSpore.md +117 -43
- msprobe/docs/12.overflow_check_PyTorch.md +5 -3
- msprobe/docs/13.overflow_check_MindSpore.md +6 -4
- msprobe/docs/14.data_parse_PyTorch.md +4 -10
- msprobe/docs/17.grad_probe.md +2 -1
- msprobe/docs/18.online_dispatch.md +3 -3
- msprobe/docs/19.monitor.md +211 -103
- msprobe/docs/21.visualization_PyTorch.md +100 -28
- msprobe/docs/22.visualization_MindSpore.md +103 -31
- msprobe/docs/23.generate_operator_PyTorch.md +9 -9
- msprobe/docs/25.tool_function_introduction.md +23 -22
- msprobe/docs/26.data_dump_PyTorch_baseline.md +14 -3
- msprobe/docs/27.dump_json_instruction.md +278 -8
- msprobe/docs/28.debugger_save_instruction.md +111 -20
- msprobe/docs/28.kernel_dump_MindSpore.md +1 -1
- msprobe/docs/29.data_dump_MSAdapter.md +229 -0
- msprobe/docs/30.overflow_check_MSAdapter.md +31 -0
- msprobe/docs/31.config_check.md +95 -0
- msprobe/docs/32.ckpt_compare.md +69 -0
- msprobe/docs/33.generate_operator_MindSpore.md +190 -0
- msprobe/docs/34.RL_collect.md +92 -0
- msprobe/docs/35.nan_analyze.md +72 -0
- msprobe/docs/FAQ.md +3 -11
- msprobe/docs/data_dump_MindSpore/data_dump_MindSpore_baseline.md +12 -1
- msprobe/docs/data_dump_MindSpore/dynamic_graph_quick_start_example.md +3 -1
- msprobe/docs/img/compare_result.png +0 -0
- msprobe/docs/img/merge_result.png +0 -0
- msprobe/docs/img/save_compare_result_sample.png +0 -0
- msprobe/docs/img/visualization/proxy.png +0 -0
- msprobe/docs/img/visualization/vis_browser_1.png +0 -0
- msprobe/docs/img/visualization/vis_match_info.png +0 -0
- msprobe/docs/img/visualization/vis_precision_info.png +0 -0
- msprobe/docs/img/visualization/vis_search_info.png +0 -0
- msprobe/docs/img/visualization/vis_show_info.png +0 -0
- msprobe/docs/img/visualization/vis_showcase.png +0 -0
- msprobe/docs/img/visualization/vis_unmatch_info.png +0 -0
- msprobe/mindspore/__init__.py +3 -3
- msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +151 -55
- msprobe/mindspore/api_accuracy_checker/api_runner.py +25 -11
- msprobe/mindspore/api_accuracy_checker/base_compare_algorithm.py +2 -1
- msprobe/mindspore/api_accuracy_checker/bench_functions/flash_attention_score.py +580 -0
- msprobe/mindspore/api_accuracy_checker/bench_functions/fusion_operator.py +41 -0
- msprobe/mindspore/api_accuracy_checker/cmd_parser.py +4 -0
- msprobe/mindspore/api_accuracy_checker/data_manager.py +4 -3
- msprobe/mindspore/api_accuracy_checker/generate_op_script/config_op.json +9 -0
- msprobe/mindspore/api_accuracy_checker/generate_op_script/op_generator.py +451 -0
- msprobe/mindspore/api_accuracy_checker/generate_op_script/operator_replication.template +2081 -0
- msprobe/mindspore/api_accuracy_checker/multi_api_accuracy_checker.py +11 -1
- msprobe/mindspore/api_accuracy_checker/torch_mindtorch_importer.py +2 -1
- msprobe/mindspore/cell_processor.py +204 -33
- msprobe/mindspore/code_mapping/graph_parser.py +4 -21
- msprobe/mindspore/common/const.py +73 -2
- msprobe/mindspore/common/utils.py +157 -29
- msprobe/mindspore/compare/common_dir_compare.py +382 -0
- msprobe/mindspore/compare/distributed_compare.py +2 -26
- msprobe/mindspore/compare/ms_compare.py +18 -398
- msprobe/mindspore/compare/ms_graph_compare.py +20 -10
- msprobe/mindspore/compare/utils.py +37 -0
- msprobe/mindspore/debugger/debugger_config.py +59 -7
- msprobe/mindspore/debugger/precision_debugger.py +83 -90
- msprobe/mindspore/dump/cell_dump_process.py +902 -0
- msprobe/mindspore/dump/cell_dump_with_insert_gradient.py +889 -0
- msprobe/mindspore/dump/dump_tool_factory.py +18 -8
- msprobe/mindspore/dump/graph_mode_cell_dump.py +139 -0
- msprobe/mindspore/dump/graph_tensor_dump.py +123 -0
- msprobe/mindspore/dump/hook_cell/api_register.py +176 -0
- msprobe/mindspore/dump/hook_cell/hook_cell.py +22 -12
- msprobe/mindspore/dump/hook_cell/ms_hook_manager.py +88 -0
- msprobe/mindspore/dump/hook_cell/primitive_hooks.py +8 -2
- msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml +42 -26
- msprobe/mindspore/dump/jit_dump.py +35 -27
- msprobe/mindspore/dump/kernel_kbyk_dump.py +6 -3
- msprobe/mindspore/dym_loader/hook_dynamic_loader.cpp +110 -0
- msprobe/mindspore/dym_loader/hook_dynamic_loader.h +15 -16
- msprobe/mindspore/free_benchmark/api_pynative_self_check.py +22 -12
- msprobe/mindspore/free_benchmark/common/utils.py +1 -1
- msprobe/mindspore/free_benchmark/perturbation/perturbation_factory.py +4 -2
- msprobe/mindspore/free_benchmark/self_check_tool_factory.py +6 -3
- msprobe/mindspore/grad_probe/global_context.py +9 -2
- msprobe/mindspore/grad_probe/grad_analyzer.py +2 -1
- msprobe/mindspore/grad_probe/grad_stat_csv.py +3 -2
- msprobe/mindspore/grad_probe/hook.py +2 -4
- msprobe/mindspore/mindspore_service.py +111 -0
- msprobe/mindspore/monitor/common_func.py +52 -0
- msprobe/mindspore/monitor/data_writers.py +237 -0
- msprobe/mindspore/monitor/distributed/wrap_distributed.py +1 -1
- msprobe/mindspore/monitor/features.py +13 -1
- msprobe/mindspore/monitor/module_hook.py +568 -444
- msprobe/mindspore/monitor/optimizer_collect.py +331 -0
- msprobe/mindspore/monitor/utils.py +71 -9
- msprobe/mindspore/ms_config.py +16 -15
- msprobe/mindspore/overflow_check/overflow_check_tool_factory.py +5 -3
- msprobe/mindspore/task_handler_factory.py +5 -2
- msprobe/msprobe.py +19 -0
- msprobe/nan_analyze/__init__.py +14 -0
- msprobe/nan_analyze/analyzer.py +255 -0
- msprobe/nan_analyze/graph.py +189 -0
- msprobe/nan_analyze/utils.py +211 -0
- msprobe/pytorch/api_accuracy_checker/common/config.py +2 -2
- msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +3 -6
- msprobe/pytorch/api_accuracy_checker/compare/compare.py +36 -34
- msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +15 -13
- msprobe/pytorch/api_accuracy_checker/generate_op_script/operator_replication.template +206 -4
- msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +9 -9
- msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +6 -5
- msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +31 -9
- msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py +28 -20
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py +3 -1
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +29 -13
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +12 -2
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +45 -31
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/utils.py +154 -0
- msprobe/pytorch/attl_manager.py +65 -0
- msprobe/pytorch/bench_functions/moe_gating_top_k_softmax.py +6 -0
- msprobe/pytorch/bench_functions/npu_fusion_attention.py +27 -0
- msprobe/pytorch/common/utils.py +53 -19
- msprobe/pytorch/compare/distributed_compare.py +4 -36
- msprobe/pytorch/compare/pt_compare.py +13 -84
- msprobe/pytorch/compare/utils.py +47 -0
- msprobe/pytorch/debugger/debugger_config.py +34 -17
- msprobe/pytorch/debugger/precision_debugger.py +50 -96
- msprobe/pytorch/dump/module_dump/hook_wrapper.py +93 -0
- msprobe/pytorch/dump/module_dump/module_dump.py +15 -61
- msprobe/pytorch/dump/module_dump/module_processer.py +150 -114
- msprobe/pytorch/free_benchmark/common/utils.py +1 -1
- msprobe/pytorch/free_benchmark/compare/single_benchmark.py +1 -1
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py +3 -3
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py +3 -3
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/change_value.py +1 -1
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py +1 -1
- msprobe/pytorch/free_benchmark/result_handlers/check_handler.py +1 -1
- msprobe/pytorch/function_factory.py +1 -1
- msprobe/pytorch/grad_probe/grad_monitor.py +2 -2
- msprobe/pytorch/grad_probe/grad_stat_csv.py +3 -2
- msprobe/pytorch/hook_module/api_register.py +155 -0
- msprobe/pytorch/hook_module/hook_module.py +18 -22
- msprobe/pytorch/hook_module/jit_script_wrapper.py +33 -0
- msprobe/pytorch/hook_module/pt_hook_manager.py +68 -0
- msprobe/pytorch/hook_module/register_optimizer_hook.py +2 -1
- msprobe/pytorch/hook_module/support_wrap_ops.yaml +193 -75
- msprobe/pytorch/hook_module/utils.py +28 -2
- msprobe/pytorch/monitor/csv2tb.py +14 -4
- msprobe/pytorch/monitor/data_writers.py +259 -0
- msprobe/pytorch/monitor/distributed/wrap_distributed.py +8 -2
- msprobe/pytorch/monitor/module_hook.py +336 -241
- msprobe/pytorch/monitor/module_metric.py +17 -0
- msprobe/pytorch/monitor/optimizer_collect.py +244 -224
- msprobe/pytorch/monitor/utils.py +84 -4
- msprobe/pytorch/online_dispatch/compare.py +0 -2
- msprobe/pytorch/online_dispatch/dispatch.py +13 -2
- msprobe/pytorch/online_dispatch/dump_compare.py +8 -2
- msprobe/pytorch/online_dispatch/utils.py +3 -0
- msprobe/pytorch/parse_tool/lib/interactive_cli.py +1 -6
- msprobe/pytorch/parse_tool/lib/utils.py +5 -4
- msprobe/pytorch/pt_config.py +16 -11
- msprobe/pytorch/pytorch_service.py +70 -0
- msprobe/visualization/builder/graph_builder.py +69 -10
- msprobe/visualization/builder/msprobe_adapter.py +24 -12
- msprobe/visualization/compare/graph_comparator.py +63 -51
- msprobe/visualization/compare/mode_adapter.py +22 -20
- msprobe/visualization/graph/base_node.py +11 -4
- msprobe/visualization/graph/distributed_analyzer.py +1 -10
- msprobe/visualization/graph/graph.py +2 -13
- msprobe/visualization/graph/node_op.py +1 -2
- msprobe/visualization/graph_service.py +251 -104
- msprobe/visualization/utils.py +26 -44
- msprobe/mindspore/dump/hook_cell/api_registry.py +0 -207
- msprobe/mindspore/dump/hook_cell/wrap_api.py +0 -212
- msprobe/mindspore/dym_loader/hook_dynamic_loader.cc +0 -140
- msprobe/mindspore/monitor/anomaly_detect.py +0 -404
- msprobe/mindspore/monitor/module_spec_verifier.py +0 -94
- msprobe/mindspore/service.py +0 -543
- msprobe/pytorch/hook_module/api_registry.py +0 -166
- msprobe/pytorch/hook_module/wrap_distributed.py +0 -79
- msprobe/pytorch/hook_module/wrap_functional.py +0 -66
- msprobe/pytorch/hook_module/wrap_npu_custom.py +0 -85
- msprobe/pytorch/hook_module/wrap_tensor.py +0 -69
- msprobe/pytorch/hook_module/wrap_torch.py +0 -84
- msprobe/pytorch/hook_module/wrap_vf.py +0 -60
- msprobe/pytorch/monitor/anomaly_analyse.py +0 -201
- msprobe/pytorch/monitor/anomaly_detect.py +0 -410
- msprobe/pytorch/monitor/module_spec_verifier.py +0 -95
- msprobe/pytorch/monitor/unittest/test_monitor.py +0 -160
- msprobe/pytorch/service.py +0 -470
- {mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/LICENSE +0 -0
- {mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/WHEEL +0 -0
- {mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/entry_points.txt +0 -0
- {mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/top_level.txt +0 -0
- /msprobe/{mindspore → core}/compare/ms_to_pt_api.yaml +0 -0
- /msprobe/{mindspore/dump → core}/kernel_dump/kernel_config.py +0 -0
- /msprobe/{pytorch/monitor/unittest → core/monitor}/__init__.py +0 -0
msprobe/mindspore/dump/cell_dump_with_insert_gradient.py (new file, +889)

@@ -0,0 +1,889 @@

```python
# Copyright (c) 2025-2025, Huawei Technologies Co., Ltd.
# All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import atexit
from multiprocessing import Pool
import os
import re
import time

import numpy as np
import pandas as pd
import mindspore as ms
from mindspore import nn, ops

from msprobe.core.common.const import Const as CoreConst
from msprobe.core.common.const import FileCheckConst
from msprobe.core.common.file_utils import (
    load_npy, save_json, remove_path, load_yaml,
    create_directory, read_csv, write_df_to_csv, write_csv, move_file, move_directory)
from msprobe.mindspore.common.log import logger
from msprobe.mindspore.dump.cell_dump_process import CellDumpConfig


CONSTRUCT_FILE_NAME = "construct.json"
DEFAULT_RANK_DIR = "rank0"
KEY_LAYERS = "layers"
construct = {}
cell_list = []
KEY_SIDE_EFFECT = "side_effect_io"
KEY_TOPLAYER = "TopLayer"
KEY_FORWARD = CoreConst.FORWARD
KEY_BACKWARD = CoreConst.BACKWARD
KEY_INPUT = CoreConst.INPUT
KEY_OUTPUT = CoreConst.OUTPUT
KEY_DUMP_TENSOR_DATA = "dump_tensor_data_"
KEY_STATISTIC_CSV = "statistic.csv"
KEY_TD_FLAG = "td_flag"
td = ops.TensorDump()
if (ms.__version__ >= "2.5.0"):
    td_in = ops.TensorDump("in")
else:
    td_in = ops.TensorDump()
graph_step_flag = True
try:
    from mindspore._c_expression import _set_init_iter
except ImportError:
    graph_step_flag = False
td.add_prim_attr(KEY_SIDE_EFFECT, False)
td_in.add_prim_attr(KEY_SIDE_EFFECT, False)
td.add_prim_attr(KEY_TD_FLAG, True)
td_in.add_prim_attr(KEY_TD_FLAG, True)
dump_task = CoreConst.STATISTICS
np_ms_dtype_dict = {
    "bool": ms.bool_,
    "int8": ms.int8,
    "byte": ms.byte,
    "int16": ms.int16,
    "short": ms.short,
    "int32": ms.int32,
    "intc": ms.intc,
    "int64": ms.int64,
    "intp": ms.intp,
    "uint8": ms.uint8,
    "ubyte": ms.ubyte,
    "uint16": ms.uint16,
    "ushort": ms.ushort,
    "uint32": ms.uint32,
    "uintc": ms.uintc,
    "uint64": ms.uint64,
    "uintp": ms.uintp,
    "float16": ms.float16,
    "half": ms.half,
    "float32": ms.float32,
    "single": ms.single,
    "float64": ms.float64,
    "double": ms.double,
    "bfloat16": ms.bfloat16,
    "complex64": ms.complex64,
    "complex128": ms.complex128
}


def gen_file_path(dump_path, cell_prefix, suffix, io_type, index):
    data_path = os.path.join(dump_path, '{step}', '{rank}', CoreConst.DUMP_TENSOR_DATA)
    file_name = ""
    if dump_task == CoreConst.TENSOR:
        file_name = cell_prefix + CoreConst.SEP + suffix + CoreConst.SEP + io_type + CoreConst.SEP + str(index)
    if dump_task == CoreConst.STATISTICS:
        file_name = cell_prefix + CoreConst.HYPHEN + suffix + CoreConst.HYPHEN + io_type + CoreConst.HYPHEN + str(index)
    return os.path.join(data_path, file_name)


def partial_func(func, dump_path, cell_prefix, index, io_type):
    def newfunc(*args, **kwargs):
        return func(dump_path, cell_prefix, index, io_type, *args, **kwargs)
    return newfunc


def clip_gradient(dump_path, cell_prefix, index, io_type, dx):
    if io_type == KEY_OUTPUT:
        temp = td(gen_file_path(dump_path, cell_prefix, KEY_BACKWARD, io_type, index), dx)
        dx = ops.depend(dx, temp)
    elif io_type == KEY_INPUT:
        temp = td_in(gen_file_path(dump_path, cell_prefix, KEY_BACKWARD, io_type, index), dx)
        dx = ops.depend(dx, temp)
    return dx


def need_tensordump_in(cell_obj, attr):
    return hasattr(cell_obj, attr) and getattr(cell_obj, attr) == "in"


def cell_construct_wrapper(func, self):
    def new_construct(self, *args, **kwargs):
        new_args = []
        out_list = []

        index = 0
        item = None
        backward_or_all = self.data_mode in ["backward", "all"]
        forward_or_all = self.data_mode in ["forward", "all"]
        # The inputs of the cell.
        for index, item in enumerate(args):
            if backward_or_all and ops.is_tensor(item):
                item = self.output_clips[index](item)
            if forward_or_all and ops.is_tensor(item):
                if need_tensordump_in(self, 'input_dump_mode'):
                    temp = td_in(
                        gen_file_path(self.dump_path, self.cell_prefix, KEY_FORWARD, KEY_INPUT, index),
                        item
                    )
                else:
                    temp = td(
                        gen_file_path(self.dump_path, self.cell_prefix, KEY_FORWARD, KEY_INPUT, index),
                        item
                    )
                item = ops.depend(item, temp)
            new_args.append(item)

        out = func(*new_args, **kwargs)

        # The outputs of the cell.
        if isinstance(out, tuple):
            for index, item in enumerate(out):
                if backward_or_all and ops.is_tensor(item):
                    item = self.input_clips[index](item)
                if forward_or_all and ops.is_tensor(item):
                    if need_tensordump_in(self, 'output_dump_mode'):
                        temp = td_in(
                            gen_file_path(self.dump_path, self.cell_prefix, KEY_FORWARD, KEY_OUTPUT, index),
                            item
                        )
                    else:
                        temp = td(
                            gen_file_path(self.dump_path, self.cell_prefix, KEY_FORWARD, KEY_OUTPUT, index),
                            item
                        )
                    item = ops.depend(item, temp)
                    out_list.append(item)
                elif forward_or_all and not ops.is_tensor(item):
                    out_list.append(item)
            out_list = tuple(out_list)
            return out_list
        else:
            if backward_or_all:
                out = self.input_clips[0](out)
            if forward_or_all and ops.is_tensor(out):
                if need_tensordump_in(self, 'output_dump_mode'):
                    temp = td_in(
                        gen_file_path(self.dump_path, self.cell_prefix, KEY_FORWARD, KEY_OUTPUT, 0),
                        out
                    )
                else:
                    temp = td(
                        gen_file_path(self.dump_path, self.cell_prefix, KEY_FORWARD, KEY_OUTPUT, 0),
                        out
                    )
                out = ops.depend(out, temp)
            return out

    return new_construct.__get__(self, type(self))
```
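Throughout the wrapper above, every `TensorDump` call is chained back into the data flow with `ops.depend`, so graph compilation cannot prune the dump as a side-effect-free dead node. A minimal standalone sketch of that pattern (the cell and file path are hypothetical; `TensorDump` only writes files on back ends that support it, such as Ascend):

```python
import mindspore as ms
from mindspore import nn, ops


class DumpedCell(nn.Cell):
    """Hypothetical cell that dumps its input before using it."""

    def __init__(self):
        super().__init__()
        self.td = ops.TensorDump()

    def construct(self, x):
        # Dump x, then thread the dump handle back into the computation with
        # ops.depend so the TensorDump node survives graph optimization.
        handle = self.td("demo/x_input", x)
        x = ops.depend(x, handle)
        return x * 2
```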
```python
# Get all file names in the directory, sorted in ascending order by the
# auto-increment id that TensorDump appends when writing to disk.
def sort_filenames(path):
    filenames = os.listdir(path)
    id_pattern = re.compile(rf'{CoreConst.REPLACEMENT_CHARACTER}(\d+){CoreConst.NUMPY_SUFFIX}$')
    filenames.sort(key=lambda x: int(id_pattern.findall(x)[0]))
    return filenames


# Delete files dumped repeatedly: same custom file name and identical data.
def del_same_file(path, filenames):
    result_list = []
    seen_prefixes = {}
    for current_filename in filenames:
        parts = current_filename.rsplit(CoreConst.REPLACEMENT_CHARACTER, 1)
        prefix = parts[0]
        if prefix not in seen_prefixes:
            result_list.append(current_filename)
            seen_prefixes[prefix] = current_filename
        else:
            current_file_path = os.path.join(path, current_filename)
            current_file = load_npy(current_file_path)
            prev_filename = seen_prefixes[prefix]
            prev_file_path = os.path.join(path, prev_filename)
            prev_file = load_npy(prev_file_path)
            if np.array_equal(current_file, prev_file):
                remove_path(current_file_path)
                logger.warning(f"{current_file_path} is deleted!")
            else:
                result_list.append(current_filename)
    return result_list


def rename_filename(path="", data_df=None):
    if dump_task == CoreConst.TENSOR:
        filenames = sort_filenames(path)
        filenames = del_same_file(path, filenames)
    if dump_task == CoreConst.STATISTICS:
        filenames = data_df[CoreConst.OP_NAME].tolist()

    filename_dict = {}
    for index, filename in enumerate(filenames):
        if dump_task == CoreConst.TENSOR:
            name_field = filename.rsplit(CoreConst.REPLACEMENT_CHARACTER, 1)[0]
        if dump_task == CoreConst.STATISTICS:
            name_field = filename

        if name_field in filename_dict:
            filename_dict[name_field] += 1
        else:
            filename_dict[name_field] = 0

        cell_index = filename_dict[name_field]

        # Rewrite the file name, inserting the index of this repeated call of the cell.
        if CoreConst.FORWARD_PATTERN in filename:
            # Format: Cell.{cell_name}.{class_name}.{forward/backward}.{number}.{input/output}.{index}_{dtype}_{id}.npy
            new_file_name = filename.replace(CoreConst.FORWARD_PATTERN,
                                             CoreConst.FORWARD_PATTERN + str(cell_index) + CoreConst.SEP)
        if CoreConst.BACKWARD_PATTERN in filename:
            new_file_name = filename.replace(CoreConst.BACKWARD_PATTERN,
                                             CoreConst.BACKWARD_PATTERN + str(cell_index) + CoreConst.SEP)
        if dump_task == CoreConst.TENSOR:
            move_file(os.path.join(path, filename), os.path.join(path, new_file_name))
        if dump_task == CoreConst.STATISTICS:
            data_df.loc[index, CoreConst.OP_NAME] = new_file_name
    logger.info("==========The rename_filename phase is Finished!==========")
```
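For illustration, here is the renaming that the `CoreConst.FORWARD_PATTERN` branch performs, assuming `FORWARD_PATTERN == ".forward."` and `SEP == "."` (both constants live in `msprobe/core/common/const.py`, outside this hunk):

```python
# Illustration only, under the constant values assumed above.
filename = "Cell.network.Dense.forward.input.0_float32_42.npy"
cell_index = 0  # how many times this cell has already been dumped
new_file_name = filename.replace(".forward.", ".forward." + str(cell_index) + ".")
assert new_file_name == "Cell.network.Dense.forward.0.input.0_float32_42.npy"
```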
```python
# Extract the field between the first "." and the third to last ".", i.e. {cell_name}
def get_cell_name(string):
    parts = string.split(CoreConst.SEP)
    if len(parts) < 4:
        return None
    start_index = 1
    end_index = len(parts) - 3
    return CoreConst.SEP.join(parts[start_index:end_index])


# Extract the field between the last "." and the second to last ".", i.e. {data_mode}
def get_data_mode(string):
    last_dot_index = string.rfind(CoreConst.SEP)
    second_last_dot_index = string.rfind(CoreConst.SEP, 0, last_dot_index)
    data_mode = string[second_last_dot_index + 1:last_dot_index]
    return data_mode


# Determine whether the two cells have a parent-child relationship.
def check_relation(cell_name, parent_cell_name):
    layers_pattern = rf"{CoreConst.SEP}{KEY_LAYERS}{CoreConst.SEP}\d+$"
    last_dot_index = cell_name.rfind(CoreConst.SEP)
    if last_dot_index == -1:
        return False
    # If the part of cell_name before its last '.' equals parent_cell_name,
    # they are parent and child.
    sub_cell_name = cell_name[:last_dot_index]
    if sub_cell_name == parent_cell_name:
        return True
    elif re.search(layers_pattern, cell_name):
        # If cell_name ends with ".layers.{layer_id}" and equals parent_cell_name
        # once that suffix is removed, they are also parent and child.
        sub_cell_name = re.sub(layers_pattern, '', cell_name)
        if sub_cell_name == parent_cell_name:
            return True
    return False


def get_construct(cell_list_input):
    for cell in cell_list_input:
        cell_name = get_cell_name(cell)
        cell_data_mode = get_data_mode(cell)
        found_flag = False
        for parent_cell in cell_list_input:
            parent_cell_name = get_cell_name(parent_cell)
            parent_data_mode = get_data_mode(parent_cell)
            has_relation = check_relation(cell_name, parent_cell_name)
            if has_relation and parent_data_mode == cell_data_mode:
                construct.update({cell: parent_cell})
                found_flag = True
                break
        if not found_flag:
            construct.update({cell: None})
```
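`check_relation` accepts exactly two parent-child shapes: a direct `parent.child` nesting, or one extra hop through a `layers.{id}` segment. A quick sketch, assuming `CoreConst.SEP == "."` (so the pattern expands to `.layers.{id}`); the cell names are hypothetical, mirroring the `{cell_name}` fields that `get_cell_name()` extracts:

```python
assert check_relation("network.output_layer", "network")            # direct child
assert check_relation("network.layers.3", "network")                # one layers.{id} hop
assert not check_relation("network.layers.3.attention", "network")  # anything deeper
```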
```python
def generate_construct(path):
    global construct
    if dump_task == CoreConst.TENSOR:
        # filename format: Cell.clip_grad_norm.ClipGradNorm.forward.0.output.1_int32_0.npy
        filenames = sort_filenames(path)
        point_position = 3
    if dump_task == CoreConst.STATISTICS:
        df = read_csv(path)
        # filename format: Cell.clip_grad_norm.ClipGradNorm.forward.0.output.1
        filenames = df[CoreConst.OP_NAME].tolist()
        point_position = 2

    # Extract the Cell.{cell_name}.{class_name}.{data_mode}.{call index} field
    # from each file name and store it in cell_list.
    for filename in filenames:
        mid_field = filename.rsplit(CoreConst.SEP, point_position)[0]
        if KEY_INPUT in filename:
            if mid_field in cell_list:
                cell_list.remove(mid_field)
            cell_list.append(mid_field)
        else:
            if mid_field not in cell_list:
                index = filenames.index(filename)
                output_field = mid_field + KEY_OUTPUT
                find_flag = False
                for filename_other in cell_list[index + 1:]:
                    if output_field in filename_other:
                        find_flag = True
                if find_flag is False:
                    cell_list.append(mid_field)

    get_construct(cell_list)

    # Generate the JSON file.
    rank_dir = os.path.dirname(path)
    json_path = os.path.join(rank_dir, CONSTRUCT_FILE_NAME)
    save_json(json_path, construct, indent=1)

    # Clear 'construct' before processing the data under the next path.
    construct = {}
    logger.info(f"Construct data saved to {json_path}")


def process_file(file_path):
    try:
        # Load the content of the .npy file.
        npy_content = load_npy(file_path)
        logger.debug(f"Loaded {file_path}: shape is {npy_content.shape}, dtype is {npy_content.dtype}")

        # Example file name: Cell.network._backbone.loss.CrossEntropyLoss.forward.0.input.0_float32_165.npy
        parts = os.path.basename(file_path).split(CoreConst.SEP)
        data_dtype = ""
        # Get float32 out of 0_float32_165 or 0_in_float32_165.
        data_dtype_list = parts[-2].split('_')
        if len(data_dtype_list) > 1:
            data_dtype = data_dtype_list[-2]
        # op_name is Cell.network._backbone.loss.CrossEntropyLoss.forward.0
        op_name = CoreConst.SEP.join(parts[:-3])
        ms_dtype = np_ms_dtype_dict.get(data_dtype)
        if ms_dtype is None:
            logger.warning(f"Get dtype None from file {file_path}")

        # Rename the dumped file, dropping the dtype and auto-increment id
        # fields that TensorDump appends.
        data_file_name = os.path.basename(file_path)
        data_file_dir = os.path.dirname(file_path)
        parts = data_file_name.split(CoreConst.SEP)
        if len(parts) >= 2:
            param_index = parts[-2].split(CoreConst.REPLACEMENT_CHARACTER)[0]
            pre_parts = CoreConst.SEP.join(parts[:-2])
            new_file_name = pre_parts + CoreConst.SEP + param_index + CoreConst.NUMPY_SUFFIX
            move_file(os.path.join(data_file_dir, data_file_name), os.path.join(data_file_dir, new_file_name))
            logger.debug(f"{data_file_name} is renamed to {new_file_name}")
        else:
            logger.warning(f"Failed to rename {data_file_name}.")
            new_file_name = data_file_name

        tensor_json = {
            CoreConst.TYPE: 'mindspore.Tensor',
            CoreConst.DTYPE: str(ms_dtype),
            CoreConst.SHAPE: list(npy_content.shape),
            CoreConst.MAX: npy_content.max().item(),
            CoreConst.MIN: npy_content.min().item(),
            CoreConst.MEAN: npy_content.mean().item(),
            CoreConst.NORM: np.linalg.norm(npy_content).item(),
            CoreConst.DATA_NAME: new_file_name
        }

        # Route the record to input_args or output according to the
        # input/output part of the file name.
        if parts[-3] == KEY_INPUT:
            return op_name, CoreConst.INPUT_ARGS, tensor_json
        elif parts[-3] == KEY_OUTPUT:
            return op_name, KEY_OUTPUT, tensor_json
        else:
            return None, None, None

    except Exception as e:
        logger.error(f"Error reading {file_path}: {e}")
        return None, None, None


def custom_sort(item, key_to_index):
    key = item[0]
    return key_to_index.get(key, float('inf'))


def convert_special_values(value):
    if isinstance(value, str):
        if value.lower() == "true":
            return True
        elif value.lower() == "false":
            return False
        try:
            return float(value)
        except ValueError:
            return value
    elif pd.isna(value):
        return None
    return value


def process_csv(path):
    data_info = []
    df = read_csv(path)
    df = df.sort_values(by='Op Name', ascending=True)
    columns = df.columns
    colume_to_json_key = {
        'Max Value': CoreConst.MAX,
        'Min Value': CoreConst.MIN,
        'Avg Value': CoreConst.MEAN,
        'L2Norm Value': CoreConst.NORM
    }
    for _, row in df.iterrows():
        # op_name_value format: Cell.network._backbone.loss.CrossEntropyLoss.forward.0.input.0
        op_name_value = row['Op Name']
        op_name = op_name_value.rsplit(CoreConst.SEP, 2)[0]

        # Get the input/output field.
        io_key = op_name_value.split(CoreConst.SEP)[-2]

        # Shape is read as a string; convert it to a list: "(1,4096)" -> [1, 4096].
        shape_num = re.findall(r'\d+', row['Shape'])
        shape = [int(num) for num in shape_num]

        tensor_json = {
            CoreConst.TYPE: 'mindspore.Tensor',
            CoreConst.DTYPE: str(np_ms_dtype_dict.get(row['Data Type'])),
            CoreConst.SHAPE: shape
        }
        for col_name, json_key in colume_to_json_key.items():
            if col_name in columns:
                value = convert_special_values(row[col_name])
                tensor_json[json_key] = value

        if io_key == KEY_INPUT:
            data_info.append([op_name, CoreConst.INPUT_ARGS, tensor_json])
        elif io_key == KEY_OUTPUT:
            data_info.append([op_name, KEY_OUTPUT, tensor_json])
        else:
            data_info.append([None, None, None])
    return data_info


def generate_dump_info(path):
    if not os.path.exists(path):
        logger.error("The provided path does not exist.")
        return

    if dump_task == CoreConst.TENSOR:
        dump_data = {"task": "tensor", "level": "L0", "dump_data_dir": path, "data": {}}
        with Pool(processes=10) as pool:
            file_paths = []
            for file in os.listdir(path):
                if file.endswith(FileCheckConst.NUMPY_SUFFIX):
                    file_paths.append((os.path.join(path, file),))
            file_paths.sort()
            results = pool.starmap(process_file, file_paths)
    if dump_task == CoreConst.STATISTICS:
        dump_data = {"task": "statistics", "level": "L0", "framework": "mindspore", "dump_data_dir": None, "data": {}}
        results = process_csv(path)

    # Collect the results.
    for op_name, key, tensor_json in results:
        if op_name:
            if op_name not in dump_data.get(CoreConst.DATA, {}):
                dump_data.get(CoreConst.DATA, {})[op_name] = {CoreConst.INPUT_ARGS: [],
                                                              CoreConst.INPUT_KWARGS: {},
                                                              KEY_OUTPUT: []}
            if key not in dump_data.get(CoreConst.DATA, {}).get(op_name, {}):
                dump_data.get(CoreConst.DATA, {}).get(op_name, {})[key] = []
            dump_data.get(CoreConst.DATA, {}).get(op_name, {}).get(key, []).append(tensor_json)

    # Sort according to cell_list.
    data_dict = dump_data.get(CoreConst.DATA, {})
    key_to_index = {key: index for index, key in enumerate(cell_list)}
    sorted_data_dict = dict(sorted(data_dict.items(), key=lambda item: custom_sort(item, key_to_index)))
    dump_data[CoreConst.DATA] = sorted_data_dict

    # Write the data to dump.json.
    json_path = os.path.join(os.path.dirname(path), 'dump.json')
    save_json(json_path, dump_data, indent=1)

    logger.info(f"Dump data saved to {json_path}")


def generate_stack_info(path):
    if not os.path.exists(path):
        logger.error("The provided path does not exist.")
        return

    stack_data = {}
    for cell_name in cell_list:
        stack_data.update({cell_name: []})

    # Write the data to stack.json.
    json_path = os.path.join(os.path.dirname(path), 'stack.json')
    save_json(json_path, stack_data, indent=1)

    # Delete the csv file.
    if dump_task == CoreConst.STATISTICS:
        remove_path(path)

    logger.info(f"Stack data saved to {json_path}")


def is_download_finished(directory, interval=3):
    """
    Check whether the data in the given directory has finished downloading after a while.
    :param directory: path of the directory to check
    :param interval: interval between the two checks, in seconds (3 by default)
    :return: a tuple (is_finished, is_downloading)
    """
    # Check whether the directory exists.
    if not os.path.exists(directory):
        logger.warning(f"The specified directory {directory} does not exist.")
        return False, False
    initial_modification_time = os.path.getmtime(directory)
    time.sleep(interval)
    current_modification_time = os.path.getmtime(directory)
    # Compare the initial and current modification times.
    if current_modification_time > initial_modification_time:
        return False, True
    else:
        return True, False


def process(dump_path):
    rank_id = os.environ.get('RANK_ID')
    rank_dir = DEFAULT_RANK_DIR
    if rank_id is not None:
        rank_dir = CoreConst.RANK + str(rank_id)

    step_dir_list = os.listdir(dump_path)
    for step_dir in step_dir_list:
        step_path = os.path.join(dump_path, step_dir)
        rank_path = os.path.join(step_path, rank_dir)
        npy_path = os.path.join(rank_path, CoreConst.DUMP_TENSOR_DATA)
        check_times = 0
        while True:
            is_finished, is_downloading = is_download_finished(npy_path)
            if not is_finished:
                if not is_downloading:
                    logger.warning(f'{npy_path} does not exist.')
                    break
                check_times += 1
                if check_times < 1000:
                    logger.info("There is data being downloaded in the specified directory, continue checking...")
                else:
                    logger.warning('Download timeout, stop checking.')
                    break
            else:
                logger.info("There is no data being downloaded in the specified directory, stop checking.")
                break
        logger.info("==========Start processing data that has already been stored on the disk!==========")
        rename_filename(path=npy_path)
        generate_construct(npy_path)
        generate_dump_info(npy_path)
        generate_stack_info(npy_path)
        # Single-card scenario: the rank directory is simply named "rank".
        if rank_id is None:
            new_rank_path = os.path.join(step_path, CoreConst.RANK)
            try:
                move_directory(rank_path, new_rank_path)
                logger.debug(f"Directory was successfully renamed to: {new_rank_path}")
            except Exception as e:
                logger.warning(f"Failed to rename to {new_rank_path}: {e}")
    logger.info("==========JSON file generation completed!==========")


# Remove the trailing comma at the end of each data row of the csv file.
def remove_trailing_commas(filename):
    csv_data = read_csv(filename, as_pd=False)
    for i in range(1, len(csv_data)):
        if csv_data[i] and csv_data[i][-1] == "":
            csv_data[i].pop()
    write_csv(csv_data, filename, mode="w")


# Merge the csv files that belong to the same step, post-process them, and
# store the result under the corresponding step directory.
def merge_file(dump_path, rank_dir, file_dict):
    rank_dir = rank_dir.replace(CoreConst.REPLACEMENT_CHARACTER, '')
    for step_dir, file_list in file_dict.items():
        step_dir = CoreConst.STEP + step_dir
        rank_path = os.path.join(dump_path, step_dir, rank_dir)
        create_directory(rank_path)
        output_file = os.path.join(rank_path, KEY_STATISTIC_CSV)

        all_dfs = []
        try:
            for file_path in file_list:
                remove_trailing_commas(file_path)
                df = read_csv(file_path)
                all_dfs.append(df)

            # Merge all DataFrames.
            merged_df = pd.concat(all_dfs, ignore_index=True)

            # Sort by the Timestamp field in ascending order.
            merged_df = merged_df.sort_values(by='Timestamp', ascending=True)
            # Drop the rows whose Slot field is 0.
            merged_df = merged_df[merged_df['Slot'] != 0]
            # Reset the index so that it starts from 0.
            merged_df.reset_index(drop=True, inplace=True)

            # Extract op_name and reshape it into the
            # Cell.network._backbone.LlamaForCausalLM.forward.input.0 format.
            merged_df[CoreConst.OP_NAME] = merged_df[CoreConst.OP_NAME].str.split(KEY_DUMP_TENSOR_DATA, expand=True)[1]
            merged_df[CoreConst.OP_NAME] = (
                merged_df[CoreConst.OP_NAME].str.split(CoreConst.PIPE_SEPARATOR, expand=True)[0]
            )
            merged_df[CoreConst.OP_NAME] = (
                merged_df[CoreConst.OP_NAME].str.replace(CoreConst.HYPHEN, CoreConst.SEP, regex=False)
            )
            # Rename op_name into the Cell.{cell_name}.{class_name}.{forward/backward}.{number}.{input/output}.{index} format.
            rename_filename(data_df=merged_df)

            # Save the merged and sorted DataFrame under the corresponding step directory.
            write_df_to_csv(merged_df, output_file)
        except FileNotFoundError:
            logger.error("One or more files not found.")
        except KeyError:
            logger.error("The value of the 'Op Name' field does not contain KEY_DUMP_TENSOR_DATA,"
                         " and the index is out of bounds.")
        except Exception as e:
            logger.error(f"An error occurred: {e}")


def process_statistics(dump_path):
    rank_id = os.environ.get('RANK_ID')
    rank_dir_kbk = "rank_0"
    if rank_id is not None:
        rank_dir_kbk = CoreConst.RANK + CoreConst.REPLACEMENT_CHARACTER + str(rank_id)
    rank_path_kbk = os.path.join(dump_path, rank_dir_kbk)

    # Group the csv file paths that share the same step into file_dict.
    file_dict = {}
    depth_limit = 4
    base_depth = rank_path_kbk.count(os.sep)
    for root, _, files in os.walk(rank_path_kbk):
        current_depth = root.count(os.sep) - base_depth
        if current_depth > depth_limit:
            continue
        for file in files:
            if file == KEY_STATISTIC_CSV:
                file_path = os.path.join(root, file)
                step_dir = os.path.basename(os.path.dirname(file_path))
                if step_dir in file_dict:
                    file_dict[step_dir].append(file_path)
                else:
                    file_dict[step_dir] = [file_path]

    # Merge the csv files of the same step, post-process them, and store the
    # result under the corresponding step directory.
    merge_file(dump_path, rank_dir_kbk, file_dict)

    rank_dir = rank_dir_kbk.replace(CoreConst.REPLACEMENT_CHARACTER, '')
    dir_list = os.listdir(dump_path)
    step_dir_list = [d for d in dir_list if d.startswith(CoreConst.STEP)]
    for step_dir in step_dir_list:
        step_path = os.path.join(dump_path, step_dir)
        rank_path = os.path.join(step_path, rank_dir)
        csv_path = os.path.join(rank_path, KEY_STATISTIC_CSV)
        logger.info("==========Start processing data csv!==========")
        generate_construct(csv_path)
        generate_dump_info(csv_path)
        generate_stack_info(csv_path)
        remove_path(rank_path_kbk)
        # Single-card scenario: the rank directory is simply named "rank".
        if rank_id is None:
            new_rank_path = os.path.join(step_path, CoreConst.RANK)
            try:
                move_directory(rank_path, new_rank_path)
                logger.info(f"Directory was successfully renamed to: {new_rank_path}")
            except Exception as e:
                logger.warning(f"Failed to rename to {new_rank_path}: {e}")
    logger.info("==========JSON file generation completed!==========")


def get_yaml_keys(yaml_data):
    keys = []
    for key, _ in yaml_data.items():
        keys.append(key)
    return keys


def get_tensordump_mode(input_str):
    left_index = input_str.find('(')
    right_index = input_str.find(')')

    # Extract the string inside the parentheses.
    if left_index != -1 and right_index != -1:
        inner_str = input_str[left_index + 1:right_index]
        # Split the string into a list of elements.
        elements = inner_str.split(',')
        if len(elements) >= 2:
            # Strip the whitespace around each element.
            first_element = elements[0].strip()
            second_element = elements[1].strip()
            return first_element, second_element
    return None, None


def set_tensordump_mode(cell, input_str):
    first_str, second_str = get_tensordump_mode(input_str)
    if first_str and second_str:
        cell.input_dump_mode = first_str
        cell.output_dump_mode = second_str


def create_kbyk_json(dump_path, summary_mode, step):
    if step:
        step_str = ""
        for s in step:
            step_str += (str(s) + '|')
        iteration = step_str[:-1]
    else:
        iteration = "all"

    if summary_mode == "statistics":
        statistic_category = ["max", "min", "avg", "l2norm"]
    elif "mean" in summary_mode:
        mean_index = summary_mode.index("mean")
        summary_mode[mean_index] = "avg"
        statistic_category = summary_mode
    else:
        statistic_category = summary_mode

    config_json = {
        "common_dump_settings": {
            "op_debug_mode": 0,
            "dump_mode": 1,
            "path": dump_path,
            "net_name": "Net",
            "iteration": iteration,
            "saved_data": "statistic",
            "input_output": 0,
            "kernels": ["TensorDump"],
            "support_device": [0, 1, 2, 3, 4, 5, 6, 7],
            "statistic_category": statistic_category
        },
        "e2e_dump_settings": {
            "enable": False,
            "trans_flag": True,
            "stat_calc_mode": "device"
        }
    }

    create_directory(dump_path)
    rank_id = os.environ.get('RANK_ID')
    if rank_id is None:
        rank_id = 0
    config_json_path = os.path.join(dump_path, str(rank_id) + "kernel_kbyk_dump.json")
    save_json(config_json_path, config_json, indent=4)
    logger.info(config_json_path + " has been created.")
    return config_json_path


def start(config: CellDumpConfig):
    global dump_task
    dump_task = config.task
    net = config.net
    dump_path = config.dump_path
    data_mode = config.data_mode
    summary_mode = config.summary_mode
    step = config.step
    if dump_task == CoreConst.STATISTICS:
        # Enable KBK dump.
        config_json_path = create_kbyk_json(dump_path, summary_mode, step)
        os.environ["MINDSPORE_DUMP_CONFIG"] = config_json_path

        # Skip the TensorDump operators during execution.
        os.environ["MS_KERNEL_LAUNCH_SKIP"] = "TensorDump"

        # Initialize the step counter of the static-graph KBK dump, starting from 0.
        if not graph_step_flag:
            raise Exception(
                "Importing _set_init_iter failed, "
                "please use the latest version package of MindSpore."
            )
        _set_init_iter(0)
        remove_path(config_json_path)

    if net is None:
        return

    if isinstance(net, nn.Cell):
        net = (('', net),)

    td_config_path = ""
    try:
        import mindformers
        mindformers_file = mindformers.__file__
        mindformers_dir = os.path.dirname(mindformers_file)
        td_config_path = os.path.join(mindformers_dir, "configuration", "layer_mapping.yaml")
        if not os.path.exists(td_config_path):
            td_config_path = ""
            logger.warning("The configuration file in mindformers was not loaded, the default mode will be used.")
    except ImportError:
        logger.warning("mindformers failed to load, the default mode will be used.")

    if td_config_path == "":
        yaml_data = {}
    else:
        yaml_data = load_yaml(td_config_path)
    first_layer_key = get_yaml_keys(yaml_data)

    black_list = ["grad_reducer", ""]

    for name_and_model in net:
        for name, cell in name_and_model[1].cells_and_names(name_prefix=name_and_model[0]):
            class_name = cell.__class__.__name__
            # Skip the cells on the black list.
            if name in black_list:
                logger.info(f"Cell {name}.{class_name} is skipped!")
                continue
            # Skip the framework's internal cells.
            if class_name.startswith(CoreConst.REPLACEMENT_CHARACTER):
                logger.info(f"Cell {name}.{class_name} is skipped!")
                continue
            else:
                # Format: Cell.{cell_name}.{class_name}
                cell.cell_prefix = CoreConst.SEP.join([CoreConst.CELL, name, cell.__class__.__name__])
                if dump_task == CoreConst.STATISTICS:
                    cell.cell_prefix = cell.cell_prefix.replace(CoreConst.SEP, CoreConst.HYPHEN)

            # Set the cell's TensorDump mode according to the yaml configuration file.
            if class_name in first_layer_key:
                layer_data = yaml_data.get(class_name)
                if layer_data:
                    for child_name, child_cell in cell.cells_and_names():
                        if child_name in layer_data:
                            set_tensordump_mode(child_cell, layer_data[child_name])
            top_layer_data = yaml_data.get(KEY_TOPLAYER)
            if top_layer_data and name in top_layer_data:
                set_tensordump_mode(cell, top_layer_data[name])

            # Replace the construct function.
            cell.construct = cell_construct_wrapper(cell.construct, cell)
            logger.info(f"Cell {name}: construct function is wrapped!")
            cell.dump_path = dump_path
            cell.data_mode = data_mode
            cell.input_clips = []
            cell.output_clips = []
            # It is assumed that each cell has a maximum of 50 outputs and 50 inputs.
            for i in range(50):
                cell.input_clips.append(
                    ops.InsertGradientOf(partial_func(clip_gradient, cell.dump_path, cell.cell_prefix, i, KEY_INPUT))
                )
                cell.output_clips.append(
                    ops.InsertGradientOf(partial_func(clip_gradient, cell.dump_path, cell.cell_prefix, i, KEY_OUTPUT))
                )

    logger.info("==========The cell_dump_process_start phase is Finished!==========")
    if dump_task == CoreConst.TENSOR:
        atexit.register(process, dump_path=dump_path)
    if dump_task == CoreConst.STATISTICS:
        atexit.register(process_statistics, dump_path=dump_path)
```
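Everything in this module hangs off the single `start()` entry point; the wrapped `construct` calls do the dumping during execution, and the `atexit` hooks post-process the results. A usage sketch follows (the keyword names on `CellDumpConfig` are assumptions mirroring the fields `start()` reads; the real constructor lives in `cell_dump_process.py`, which this hunk only imports):

```python
from mindspore import nn
from msprobe.mindspore.dump.cell_dump_process import CellDumpConfig
from msprobe.mindspore.dump import cell_dump_with_insert_gradient as cell_dump

net = nn.Dense(4, 2)  # any nn.Cell; a tuple of (name_prefix, cell) pairs also works
config = CellDumpConfig(       # hypothetical keyword arguments
    task="statistics",         # CoreConst.STATISTICS selects the KBK statistics path
    net=net,
    dump_path="/tmp/cell_dump",
    data_mode="all",           # "forward", "backward", or "all"
    summary_mode="statistics", # expanded to ["max", "min", "avg", "l2norm"]
    step=[0, 1],               # serialized into the config's "iteration" field as "0|1"
)
cell_dump.start(config)
# Build and run the model as usual; construct.json, dump.json, and stack.json
# are generated at process exit.
```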