mindstudio-probe 1.2.2__py3-none-any.whl → 8.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (261) hide show
  1. {mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/METADATA +4 -3
  2. {mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/RECORD +243 -191
  3. msprobe/README.md +57 -21
  4. msprobe/core/__init__.py +17 -0
  5. msprobe/core/common/const.py +224 -82
  6. msprobe/core/common/decorator.py +50 -0
  7. msprobe/core/common/exceptions.py +5 -3
  8. msprobe/core/common/file_utils.py +274 -40
  9. msprobe/core/common/framework_adapter.py +169 -0
  10. msprobe/core/common/global_lock.py +86 -0
  11. msprobe/core/common/runtime.py +25 -0
  12. msprobe/core/common/utils.py +148 -72
  13. msprobe/core/common_config.py +7 -0
  14. msprobe/core/compare/acc_compare.py +640 -462
  15. msprobe/core/compare/check.py +36 -107
  16. msprobe/core/compare/compare_cli.py +4 -0
  17. msprobe/core/compare/config.py +72 -0
  18. msprobe/core/compare/highlight.py +217 -215
  19. msprobe/core/compare/layer_mapping/layer_mapping.py +4 -1
  20. msprobe/core/compare/merge_result/merge_result.py +12 -6
  21. msprobe/core/compare/multiprocessing_compute.py +227 -107
  22. msprobe/core/compare/npy_compare.py +32 -16
  23. msprobe/core/compare/utils.py +218 -244
  24. msprobe/{mindspore/runtime.py → core/config_check/__init__.py} +2 -4
  25. msprobe/{pytorch/dump/kernel_dump/kernel_config.py → core/config_check/checkers/__init__.py} +8 -16
  26. msprobe/core/config_check/checkers/base_checker.py +60 -0
  27. msprobe/core/config_check/checkers/dataset_checker.py +138 -0
  28. msprobe/core/config_check/checkers/env_args_checker.py +96 -0
  29. msprobe/core/config_check/checkers/hyperparameter_checker.py +170 -0
  30. msprobe/core/config_check/checkers/pip_checker.py +90 -0
  31. msprobe/core/config_check/checkers/random_checker.py +367 -0
  32. msprobe/core/config_check/checkers/weights_checker.py +147 -0
  33. msprobe/core/config_check/ckpt_compare/ckpt_comparator.py +74 -0
  34. msprobe/core/config_check/ckpt_compare/megatron_loader.py +302 -0
  35. msprobe/core/config_check/ckpt_compare/metrics.py +83 -0
  36. msprobe/core/config_check/ckpt_compare/name_mapping.yaml +12 -0
  37. msprobe/core/config_check/config_check_cli.py +51 -0
  38. msprobe/core/config_check/config_checker.py +100 -0
  39. msprobe/{pytorch/parse.py → core/config_check/resource/dependency.yaml} +7 -4
  40. msprobe/core/config_check/resource/env.yaml +57 -0
  41. msprobe/core/config_check/resource/hyperparameter.yaml +21 -0
  42. msprobe/core/config_check/utils/hyperparameter_parser.py +115 -0
  43. msprobe/core/config_check/utils/utils.py +107 -0
  44. msprobe/core/data_dump/api_registry.py +239 -0
  45. msprobe/core/data_dump/data_collector.py +36 -9
  46. msprobe/core/data_dump/data_processor/base.py +74 -53
  47. msprobe/core/data_dump/data_processor/mindspore_processor.py +119 -78
  48. msprobe/core/data_dump/data_processor/pytorch_processor.py +134 -96
  49. msprobe/core/data_dump/json_writer.py +146 -57
  50. msprobe/core/debugger/precision_debugger.py +143 -0
  51. msprobe/core/grad_probe/constant.py +2 -1
  52. msprobe/core/grad_probe/grad_compare.py +2 -2
  53. msprobe/core/grad_probe/utils.py +1 -1
  54. msprobe/core/hook_manager.py +242 -0
  55. msprobe/core/monitor/anomaly_processor.py +384 -0
  56. msprobe/core/overflow_check/abnormal_scene.py +2 -0
  57. msprobe/core/service.py +356 -0
  58. msprobe/core/single_save/__init__.py +0 -0
  59. msprobe/core/single_save/single_comparator.py +243 -0
  60. msprobe/core/single_save/single_saver.py +157 -0
  61. msprobe/docs/01.installation.md +6 -5
  62. msprobe/docs/02.config_introduction.md +89 -30
  63. msprobe/docs/03.config_examples.md +1 -0
  64. msprobe/docs/04.kernel_dump_PyTorch.md +1 -1
  65. msprobe/docs/05.data_dump_PyTorch.md +184 -50
  66. msprobe/docs/06.data_dump_MindSpore.md +193 -28
  67. msprobe/docs/07.accuracy_checker_PyTorch.md +13 -3
  68. msprobe/docs/08.accuracy_checker_online_PyTorch.md +72 -10
  69. msprobe/docs/09.accuracy_checker_MindSpore.md +19 -7
  70. msprobe/docs/10.accuracy_compare_PyTorch.md +266 -102
  71. msprobe/docs/11.accuracy_compare_MindSpore.md +117 -43
  72. msprobe/docs/12.overflow_check_PyTorch.md +5 -3
  73. msprobe/docs/13.overflow_check_MindSpore.md +6 -4
  74. msprobe/docs/14.data_parse_PyTorch.md +4 -10
  75. msprobe/docs/17.grad_probe.md +2 -1
  76. msprobe/docs/18.online_dispatch.md +3 -3
  77. msprobe/docs/19.monitor.md +211 -103
  78. msprobe/docs/21.visualization_PyTorch.md +100 -28
  79. msprobe/docs/22.visualization_MindSpore.md +103 -31
  80. msprobe/docs/23.generate_operator_PyTorch.md +9 -9
  81. msprobe/docs/25.tool_function_introduction.md +23 -22
  82. msprobe/docs/26.data_dump_PyTorch_baseline.md +14 -3
  83. msprobe/docs/27.dump_json_instruction.md +278 -8
  84. msprobe/docs/28.debugger_save_instruction.md +111 -20
  85. msprobe/docs/28.kernel_dump_MindSpore.md +1 -1
  86. msprobe/docs/29.data_dump_MSAdapter.md +229 -0
  87. msprobe/docs/30.overflow_check_MSAdapter.md +31 -0
  88. msprobe/docs/31.config_check.md +95 -0
  89. msprobe/docs/32.ckpt_compare.md +69 -0
  90. msprobe/docs/33.generate_operator_MindSpore.md +190 -0
  91. msprobe/docs/34.RL_collect.md +92 -0
  92. msprobe/docs/35.nan_analyze.md +72 -0
  93. msprobe/docs/FAQ.md +3 -11
  94. msprobe/docs/data_dump_MindSpore/data_dump_MindSpore_baseline.md +12 -1
  95. msprobe/docs/data_dump_MindSpore/dynamic_graph_quick_start_example.md +3 -1
  96. msprobe/docs/img/compare_result.png +0 -0
  97. msprobe/docs/img/merge_result.png +0 -0
  98. msprobe/docs/img/save_compare_result_sample.png +0 -0
  99. msprobe/docs/img/visualization/proxy.png +0 -0
  100. msprobe/docs/img/visualization/vis_browser_1.png +0 -0
  101. msprobe/docs/img/visualization/vis_match_info.png +0 -0
  102. msprobe/docs/img/visualization/vis_precision_info.png +0 -0
  103. msprobe/docs/img/visualization/vis_search_info.png +0 -0
  104. msprobe/docs/img/visualization/vis_show_info.png +0 -0
  105. msprobe/docs/img/visualization/vis_showcase.png +0 -0
  106. msprobe/docs/img/visualization/vis_unmatch_info.png +0 -0
  107. msprobe/mindspore/__init__.py +3 -3
  108. msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +151 -55
  109. msprobe/mindspore/api_accuracy_checker/api_runner.py +25 -11
  110. msprobe/mindspore/api_accuracy_checker/base_compare_algorithm.py +2 -1
  111. msprobe/mindspore/api_accuracy_checker/bench_functions/flash_attention_score.py +580 -0
  112. msprobe/mindspore/api_accuracy_checker/bench_functions/fusion_operator.py +41 -0
  113. msprobe/mindspore/api_accuracy_checker/cmd_parser.py +4 -0
  114. msprobe/mindspore/api_accuracy_checker/data_manager.py +4 -3
  115. msprobe/mindspore/api_accuracy_checker/generate_op_script/config_op.json +9 -0
  116. msprobe/mindspore/api_accuracy_checker/generate_op_script/op_generator.py +451 -0
  117. msprobe/mindspore/api_accuracy_checker/generate_op_script/operator_replication.template +2081 -0
  118. msprobe/mindspore/api_accuracy_checker/multi_api_accuracy_checker.py +11 -1
  119. msprobe/mindspore/api_accuracy_checker/torch_mindtorch_importer.py +2 -1
  120. msprobe/mindspore/cell_processor.py +204 -33
  121. msprobe/mindspore/code_mapping/graph_parser.py +4 -21
  122. msprobe/mindspore/common/const.py +73 -2
  123. msprobe/mindspore/common/utils.py +157 -29
  124. msprobe/mindspore/compare/common_dir_compare.py +382 -0
  125. msprobe/mindspore/compare/distributed_compare.py +2 -26
  126. msprobe/mindspore/compare/ms_compare.py +18 -398
  127. msprobe/mindspore/compare/ms_graph_compare.py +20 -10
  128. msprobe/mindspore/compare/utils.py +37 -0
  129. msprobe/mindspore/debugger/debugger_config.py +59 -7
  130. msprobe/mindspore/debugger/precision_debugger.py +83 -90
  131. msprobe/mindspore/dump/cell_dump_process.py +902 -0
  132. msprobe/mindspore/dump/cell_dump_with_insert_gradient.py +889 -0
  133. msprobe/mindspore/dump/dump_tool_factory.py +18 -8
  134. msprobe/mindspore/dump/graph_mode_cell_dump.py +139 -0
  135. msprobe/mindspore/dump/graph_tensor_dump.py +123 -0
  136. msprobe/mindspore/dump/hook_cell/api_register.py +176 -0
  137. msprobe/mindspore/dump/hook_cell/hook_cell.py +22 -12
  138. msprobe/mindspore/dump/hook_cell/ms_hook_manager.py +88 -0
  139. msprobe/mindspore/dump/hook_cell/primitive_hooks.py +8 -2
  140. msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml +42 -26
  141. msprobe/mindspore/dump/jit_dump.py +35 -27
  142. msprobe/mindspore/dump/kernel_kbyk_dump.py +6 -3
  143. msprobe/mindspore/dym_loader/hook_dynamic_loader.cpp +110 -0
  144. msprobe/mindspore/dym_loader/hook_dynamic_loader.h +15 -16
  145. msprobe/mindspore/free_benchmark/api_pynative_self_check.py +22 -12
  146. msprobe/mindspore/free_benchmark/common/utils.py +1 -1
  147. msprobe/mindspore/free_benchmark/perturbation/perturbation_factory.py +4 -2
  148. msprobe/mindspore/free_benchmark/self_check_tool_factory.py +6 -3
  149. msprobe/mindspore/grad_probe/global_context.py +9 -2
  150. msprobe/mindspore/grad_probe/grad_analyzer.py +2 -1
  151. msprobe/mindspore/grad_probe/grad_stat_csv.py +3 -2
  152. msprobe/mindspore/grad_probe/hook.py +2 -4
  153. msprobe/mindspore/mindspore_service.py +111 -0
  154. msprobe/mindspore/monitor/common_func.py +52 -0
  155. msprobe/mindspore/monitor/data_writers.py +237 -0
  156. msprobe/mindspore/monitor/distributed/wrap_distributed.py +1 -1
  157. msprobe/mindspore/monitor/features.py +13 -1
  158. msprobe/mindspore/monitor/module_hook.py +568 -444
  159. msprobe/mindspore/monitor/optimizer_collect.py +331 -0
  160. msprobe/mindspore/monitor/utils.py +71 -9
  161. msprobe/mindspore/ms_config.py +16 -15
  162. msprobe/mindspore/overflow_check/overflow_check_tool_factory.py +5 -3
  163. msprobe/mindspore/task_handler_factory.py +5 -2
  164. msprobe/msprobe.py +19 -0
  165. msprobe/nan_analyze/__init__.py +14 -0
  166. msprobe/nan_analyze/analyzer.py +255 -0
  167. msprobe/nan_analyze/graph.py +189 -0
  168. msprobe/nan_analyze/utils.py +211 -0
  169. msprobe/pytorch/api_accuracy_checker/common/config.py +2 -2
  170. msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +3 -6
  171. msprobe/pytorch/api_accuracy_checker/compare/compare.py +36 -34
  172. msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +15 -13
  173. msprobe/pytorch/api_accuracy_checker/generate_op_script/operator_replication.template +206 -4
  174. msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +9 -9
  175. msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +6 -5
  176. msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +31 -9
  177. msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py +28 -20
  178. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py +3 -1
  179. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +29 -13
  180. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +12 -2
  181. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +45 -31
  182. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/utils.py +154 -0
  183. msprobe/pytorch/attl_manager.py +65 -0
  184. msprobe/pytorch/bench_functions/moe_gating_top_k_softmax.py +6 -0
  185. msprobe/pytorch/bench_functions/npu_fusion_attention.py +27 -0
  186. msprobe/pytorch/common/utils.py +53 -19
  187. msprobe/pytorch/compare/distributed_compare.py +4 -36
  188. msprobe/pytorch/compare/pt_compare.py +13 -84
  189. msprobe/pytorch/compare/utils.py +47 -0
  190. msprobe/pytorch/debugger/debugger_config.py +34 -17
  191. msprobe/pytorch/debugger/precision_debugger.py +50 -96
  192. msprobe/pytorch/dump/module_dump/hook_wrapper.py +93 -0
  193. msprobe/pytorch/dump/module_dump/module_dump.py +15 -61
  194. msprobe/pytorch/dump/module_dump/module_processer.py +150 -114
  195. msprobe/pytorch/free_benchmark/common/utils.py +1 -1
  196. msprobe/pytorch/free_benchmark/compare/single_benchmark.py +1 -1
  197. msprobe/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py +3 -3
  198. msprobe/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py +3 -3
  199. msprobe/pytorch/free_benchmark/perturbed_layers/npu/change_value.py +1 -1
  200. msprobe/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py +1 -1
  201. msprobe/pytorch/free_benchmark/result_handlers/check_handler.py +1 -1
  202. msprobe/pytorch/function_factory.py +1 -1
  203. msprobe/pytorch/grad_probe/grad_monitor.py +2 -2
  204. msprobe/pytorch/grad_probe/grad_stat_csv.py +3 -2
  205. msprobe/pytorch/hook_module/api_register.py +155 -0
  206. msprobe/pytorch/hook_module/hook_module.py +18 -22
  207. msprobe/pytorch/hook_module/jit_script_wrapper.py +33 -0
  208. msprobe/pytorch/hook_module/pt_hook_manager.py +68 -0
  209. msprobe/pytorch/hook_module/register_optimizer_hook.py +2 -1
  210. msprobe/pytorch/hook_module/support_wrap_ops.yaml +193 -75
  211. msprobe/pytorch/hook_module/utils.py +28 -2
  212. msprobe/pytorch/monitor/csv2tb.py +14 -4
  213. msprobe/pytorch/monitor/data_writers.py +259 -0
  214. msprobe/pytorch/monitor/distributed/wrap_distributed.py +8 -2
  215. msprobe/pytorch/monitor/module_hook.py +336 -241
  216. msprobe/pytorch/monitor/module_metric.py +17 -0
  217. msprobe/pytorch/monitor/optimizer_collect.py +244 -224
  218. msprobe/pytorch/monitor/utils.py +84 -4
  219. msprobe/pytorch/online_dispatch/compare.py +0 -2
  220. msprobe/pytorch/online_dispatch/dispatch.py +13 -2
  221. msprobe/pytorch/online_dispatch/dump_compare.py +8 -2
  222. msprobe/pytorch/online_dispatch/utils.py +3 -0
  223. msprobe/pytorch/parse_tool/lib/interactive_cli.py +1 -6
  224. msprobe/pytorch/parse_tool/lib/utils.py +5 -4
  225. msprobe/pytorch/pt_config.py +16 -11
  226. msprobe/pytorch/pytorch_service.py +70 -0
  227. msprobe/visualization/builder/graph_builder.py +69 -10
  228. msprobe/visualization/builder/msprobe_adapter.py +24 -12
  229. msprobe/visualization/compare/graph_comparator.py +63 -51
  230. msprobe/visualization/compare/mode_adapter.py +22 -20
  231. msprobe/visualization/graph/base_node.py +11 -4
  232. msprobe/visualization/graph/distributed_analyzer.py +1 -10
  233. msprobe/visualization/graph/graph.py +2 -13
  234. msprobe/visualization/graph/node_op.py +1 -2
  235. msprobe/visualization/graph_service.py +251 -104
  236. msprobe/visualization/utils.py +26 -44
  237. msprobe/mindspore/dump/hook_cell/api_registry.py +0 -207
  238. msprobe/mindspore/dump/hook_cell/wrap_api.py +0 -212
  239. msprobe/mindspore/dym_loader/hook_dynamic_loader.cc +0 -140
  240. msprobe/mindspore/monitor/anomaly_detect.py +0 -404
  241. msprobe/mindspore/monitor/module_spec_verifier.py +0 -94
  242. msprobe/mindspore/service.py +0 -543
  243. msprobe/pytorch/hook_module/api_registry.py +0 -166
  244. msprobe/pytorch/hook_module/wrap_distributed.py +0 -79
  245. msprobe/pytorch/hook_module/wrap_functional.py +0 -66
  246. msprobe/pytorch/hook_module/wrap_npu_custom.py +0 -85
  247. msprobe/pytorch/hook_module/wrap_tensor.py +0 -69
  248. msprobe/pytorch/hook_module/wrap_torch.py +0 -84
  249. msprobe/pytorch/hook_module/wrap_vf.py +0 -60
  250. msprobe/pytorch/monitor/anomaly_analyse.py +0 -201
  251. msprobe/pytorch/monitor/anomaly_detect.py +0 -410
  252. msprobe/pytorch/monitor/module_spec_verifier.py +0 -95
  253. msprobe/pytorch/monitor/unittest/test_monitor.py +0 -160
  254. msprobe/pytorch/service.py +0 -470
  255. {mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/LICENSE +0 -0
  256. {mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/WHEEL +0 -0
  257. {mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/entry_points.txt +0 -0
  258. {mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/top_level.txt +0 -0
  259. /msprobe/{mindspore → core}/compare/ms_to_pt_api.yaml +0 -0
  260. /msprobe/{mindspore/dump → core}/kernel_dump/kernel_config.py +0 -0
  261. /msprobe/{pytorch/monitor/unittest → core/monitor}/__init__.py +0 -0
@@ -0,0 +1,902 @@
1
+ # Copyright (c) 2025-2025, Huawei Technologies Co., Ltd.
2
+ # All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import atexit
17
+ from multiprocessing import Pool
18
+ import os
19
+ import re
20
+ import time
21
+ from dataclasses import dataclass
22
+ from typing import List, Optional, Union, Any
23
+
24
+ import numpy as np
25
+ import pandas as pd
26
+ import mindspore as ms
27
+ from mindspore import nn, ops
28
+
29
+ from msprobe.core.common.const import Const as CoreConst
30
+ from msprobe.core.common.const import FileCheckConst
31
+ from msprobe.core.common.file_utils import (
32
+ load_npy, save_json, remove_path, load_yaml,
33
+ create_directory, read_csv, write_df_to_csv, write_csv, move_file, move_directory)
34
+ from msprobe.mindspore.common.log import logger
35
+
36
# --- Module-level state and dump operators -----------------------------------
# File name of the per-rank construct (parent/child cell relation) JSON.
CONSTRUCT_FILE_NAME = "construct.json"
DEFAULT_RANK_DIR = "rank0"
KEY_LAYERS = "layers"
# Mutable module-level accumulators: cell -> parent-cell mapping and the
# ordered list of seen cell identifiers (filled by generate_construct()).
construct = {}
cell_list = []
KEY_SIDE_EFFECT = "side_effect_io"
KEY_TOPLAYER = "TopLayer"
KEY_FORWARD = CoreConst.FORWARD
KEY_BACKWARD = CoreConst.BACKWARD
KEY_INPUT = CoreConst.INPUT
KEY_OUTPUT = CoreConst.OUTPUT
KEY_DUMP_TENSOR_DATA = "dump_tensor_data_"
KEY_STATISTIC_CSV = "statistic.csv"
KEY_TD_FLAG = "td_flag"
# td dumps tensors on the "out" side; td_in on the "in" side where the
# TensorDump operator supports a mode argument (MindSpore >= 2.5.0).
td = ops.TensorDump()
# NOTE(review): lexicographic string comparison of versions — "2.10.0" would
# compare as < "2.5.0". Works for the 2.x line shipped so far; confirm before
# a minor version >= 10 exists.
if (ms.__version__ >= "2.5.0"):
    td_in = ops.TensorDump("in")
else:
    td_in = ops.TensorDump()
# DumpGradient is required for dumping backward data in graph mode; fall back
# gracefully (with a warning) when this MindSpore build lacks the operator.
dump_gradient_op_existed = False
if hasattr(ops, 'DumpGradient'):
    gd = ops.DumpGradient()
    dump_gradient_op_existed = True
else:
    logger.warning('The operator "DumpGradient" does not exist. Cell dump can not work in graph mode.')
# _set_init_iter is a private C-extension hook; absence means per-step
# iteration reset is unavailable.
graph_step_flag = True
try:
    from mindspore._c_expression import _set_init_iter
except ImportError:
    graph_step_flag = False
# Mark the dump ops as side-effect-free and tag them so other passes can
# recognize tool-inserted TensorDump nodes.
td.add_prim_attr(KEY_SIDE_EFFECT, False)
td_in.add_prim_attr(KEY_SIDE_EFFECT, False)
td.add_prim_attr(KEY_TD_FLAG, True)
td_in.add_prim_attr(KEY_TD_FLAG, True)
# Current dump task ("statistics" or "tensor"); several functions below branch
# on this module-level flag.
dump_task = CoreConst.STATISTICS
# Mapping from numpy dtype names (as they appear in dumped file names / CSV
# columns) to the corresponding MindSpore dtype objects.
np_ms_dtype_dict = {
    "bool": ms.bool_,
    "int8": ms.int8,
    "byte": ms.byte,
    "int16": ms.int16,
    "short": ms.short,
    "int32": ms.int32,
    "intc": ms.intc,
    "int64": ms.int64,
    "intp": ms.intp,
    "uint8": ms.uint8,
    "ubyte": ms.ubyte,
    "uint16": ms.uint16,
    "ushort": ms.ushort,
    "uint32": ms.uint32,
    "uintc": ms.uintc,
    "uint64": ms.uint64,
    "uintp": ms.uintp,
    "float16": ms.float16,
    "half": ms.half,
    "float32": ms.float32,
    "single": ms.single,
    "float64": ms.float64,
    "double": ms.double,
    "bfloat16": ms.bfloat16,
    "complex64": ms.complex64,
    "complex128": ms.complex128
}
99
+
100
+
101
@dataclass
class CellDumpConfig:
    """Configuration bundle for a cell-dump run."""
    # The network whose cells are dumped — presumably an nn.Cell; TODO confirm.
    net: object
    # Root directory that dump artifacts are written under.
    dump_path: str
    # Which pass to dump; values used elsewhere in this file are
    # "forward" / "backward" / "all" (see cell_construct_wrapper).
    data_mode: str
    # Dump task kind; defaults to statistics collection.
    task: str = CoreConst.STATISTICS
    # Statistic summary mode(s) to compute; a single mode or a list of modes.
    summary_mode: Optional[Union[List[str], str]] = None
    # Step index the dump starts from.
    step: int = 0
109
+
110
+
111
def gen_file_path(dump_path, cell_prefix, suffix, io_type, index):
    """Build the templated dump-file path for one tensor of a cell.

    The "{step}" and "{rank}" segments are literal placeholders that are
    substituted later. The file-name separator depends on the current
    module-level dump_task: "." for tensor dumps, "-" for statistics.
    """
    data_path = os.path.join(dump_path, "{step}", "{rank}", CoreConst.DUMP_TENSOR_DATA)
    file_name = ""
    if dump_task == CoreConst.TENSOR:
        file_name = CoreConst.SEP.join((cell_prefix, suffix, io_type, str(index)))
    if dump_task == CoreConst.STATISTICS:
        file_name = CoreConst.HYPHEN.join((cell_prefix, suffix, io_type, str(index)))
    return os.path.join(data_path, file_name)
121
+
122
+
123
def need_tensordump_in(cell_obj, attr, index):
    """Return True when `cell_obj.attr` marks position `index` as "in".

    Missing attribute or out-of-range index both mean "no": dump on the
    "out" side by default.
    """
    if not hasattr(cell_obj, attr):
        return False
    modes = getattr(cell_obj, attr)
    return index < len(modes) and modes[index] == "in"
130
+
131
+
132
def cell_construct_wrapper(func, self):
    """Wrap a cell's construct() so its inputs/outputs are dumped in graph mode.

    `func` is the original bound construct; `self` is the cell, expected to
    carry `data_mode`, `dump_path` and `cell_prefix` attributes (set elsewhere
    before wrapping). Forward data is dumped with td/td_in (TensorDump);
    backward data with gd (DumpGradient). `ops.depend` ties each dump node
    into the graph so it is not pruned.
    """
    def new_construct(self, *args, **kwargs):
        new_args = []
        out_list = []

        index = 0
        item = None
        backward_or_all = self.data_mode in ["backward", "all"]
        forward_or_all = self.data_mode in ["forward", "all"]
        # The inputs of the cell.
        for index, item in enumerate(args):
            if backward_or_all and ops.is_tensor(item):
                # Gradient of a cell INPUT is a BACKWARD OUTPUT — hence the
                # KEY_BACKWARD/KEY_OUTPUT pairing here.
                if need_tensordump_in(self, 'input_dump_mode', index):
                    item = gd(gen_file_path(self.dump_path, self.cell_prefix, KEY_BACKWARD, KEY_OUTPUT, index),
                              item, "in")
                else:
                    item = gd(gen_file_path(self.dump_path, self.cell_prefix, KEY_BACKWARD, KEY_OUTPUT, index),
                              item, "out")
            if forward_or_all and ops.is_tensor(item):
                if need_tensordump_in(self, 'input_dump_mode', index):
                    temp = td_in(
                        gen_file_path(self.dump_path, self.cell_prefix, KEY_FORWARD, KEY_INPUT, index),
                        item
                    )
                else:
                    temp = td(
                        gen_file_path(self.dump_path, self.cell_prefix, KEY_FORWARD, KEY_INPUT, index),
                        item
                    )
                # Keep the dump node alive in the graph.
                item = ops.depend(item, temp)
            new_args.append(item)

        out = func(*new_args, **kwargs)

        # The outputs of the cell.
        if isinstance(out, tuple):
            for index, item in enumerate(out):
                if backward_or_all and ops.is_tensor(item):
                    # Gradient of a cell OUTPUT is a BACKWARD INPUT.
                    if need_tensordump_in(self, 'output_dump_mode', index):
                        item = gd(gen_file_path(self.dump_path, self.cell_prefix, KEY_BACKWARD, KEY_INPUT, index),
                                  item, "in")
                    else:
                        item = gd(gen_file_path(self.dump_path, self.cell_prefix, KEY_BACKWARD, KEY_INPUT, index),
                                  item, "out")
                if forward_or_all and ops.is_tensor(item):
                    if need_tensordump_in(self, 'output_dump_mode', index):
                        temp = td_in(
                            gen_file_path(self.dump_path, self.cell_prefix, KEY_FORWARD, KEY_OUTPUT, index),
                            item
                        )
                    else:
                        temp = td(
                            gen_file_path(self.dump_path, self.cell_prefix, KEY_FORWARD, KEY_OUTPUT, index),
                            item
                        )
                    item = ops.depend(item, temp)
                    out_list.append(item)
                elif forward_or_all and not ops.is_tensor(item):
                    out_list.append(item)
                # NOTE(review): when data_mode == "backward", neither branch
                # above appends, so tensor outputs are dropped from the
                # returned tuple — verify this is intended.
            out_list = tuple(out_list)
            return out_list
        else:
            if backward_or_all:
                # NOTE(review): `index` here is left over from the input loop
                # above (last input position), while the dump path uses a
                # hard-coded 0 — confirm the 'output_dump_mode' lookup should
                # not use 0 as well.
                if need_tensordump_in(self, 'output_dump_mode', index):
                    out = gd(gen_file_path(self.dump_path, self.cell_prefix, KEY_BACKWARD, KEY_INPUT, 0),
                             out, "in")
                else:
                    out = gd(gen_file_path(self.dump_path, self.cell_prefix, KEY_BACKWARD, KEY_INPUT, 0),
                             out, "out")
            if forward_or_all and ops.is_tensor(out):
                if need_tensordump_in(self, 'output_dump_mode', index):
                    temp = td_in(
                        gen_file_path(self.dump_path, self.cell_prefix, KEY_FORWARD, KEY_OUTPUT, 0),
                        out
                    )
                else:
                    temp = td(
                        gen_file_path(self.dump_path, self.cell_prefix, KEY_FORWARD, KEY_OUTPUT, 0),
                        out
                    )
                out = ops.depend(out, temp)
            return out

    # Re-bind so the wrapper behaves as an instance method of this cell.
    return new_construct.__get__(self, type(self))
216
+
217
+
218
# List every file in `path`, ordered by the auto-increment id TensorDump
# appends to each dumped file name.
def sort_filenames(path):
    id_pattern = re.compile(rf'{CoreConst.REPLACEMENT_CHARACTER}(\d+){CoreConst.NUMPY_SUFFIX}$')

    def dump_id(filename):
        # Extract the trailing "_<id>.npy" counter as the sort key.
        return int(id_pattern.findall(filename)[0])

    return sorted(os.listdir(path), key=dump_id)
224
+
225
+
226
# Drop duplicate dump files: same user-visible name (everything before the
# trailing auto-increment id) AND identical tensor data. Duplicates are
# removed from disk; the surviving names are returned in original order.
def del_same_file(path, filenames):
    kept = []
    first_with_prefix = {}
    for name in filenames:
        prefix = name.rsplit(CoreConst.REPLACEMENT_CHARACTER, 1)[0]
        earlier = first_with_prefix.get(prefix)
        if earlier is None:
            # First occurrence of this prefix — always keep it.
            first_with_prefix[prefix] = name
            kept.append(name)
            continue
        candidate_path = os.path.join(path, name)
        earlier_data = load_npy(os.path.join(path, earlier))
        if np.array_equal(load_npy(candidate_path), earlier_data):
            remove_path(candidate_path)
            logger.warning(f"{candidate_path} is deleted!")
        else:
            # Same prefix but different content: a genuine re-dump, keep it.
            kept.append(name)
    return kept
248
+
249
+
250
def rename_filename(path="", data_df=None):
    """Insert each cell's repeat-call index into its dumped file names.

    For tensor dumps the files under `path` are physically renamed; for
    statistics dumps the "Op Name" column of `data_df` is rewritten in place.
    The index counts how many times the same cell name has appeared so far,
    and is spliced in right after the forward/backward marker.

    Bug fixes vs. the original:
    - a name matching neither FORWARD_PATTERN nor BACKWARD_PATTERN previously
      reused the previous iteration's new name (or raised NameError on the
      first iteration); such entries are now skipped.
    - `filenames` is initialized so an unexpected dump_task no longer raises
      NameError.
    """
    filenames = []
    if dump_task == CoreConst.TENSOR:
        filenames = sort_filenames(path)
        filenames = del_same_file(path, filenames)
    elif dump_task == CoreConst.STATISTICS:
        filenames = data_df[CoreConst.OP_NAME].tolist()

    # cell name -> number of times it has been seen so far (0-based).
    call_counts = {}
    for index, filename in enumerate(filenames):
        if dump_task == CoreConst.TENSOR:
            # Strip the trailing "_<auto id>" so repeats of one cell share a key.
            name_field = filename.rsplit(CoreConst.REPLACEMENT_CHARACTER, 1)[0]
        else:
            name_field = filename

        call_counts[name_field] = call_counts.get(name_field, -1) + 1
        cell_index = call_counts[name_field]

        # Splice the repeat index in after the forward/backward marker.
        # Result: Cell.{cell_name}.{class_name}.{fwd/bwd}.{n}.{in/out}.{i}...
        if CoreConst.FORWARD_PATTERN in filename:
            new_file_name = filename.replace(CoreConst.FORWARD_PATTERN,
                                             CoreConst.FORWARD_PATTERN + str(cell_index) + CoreConst.SEP)
        elif CoreConst.BACKWARD_PATTERN in filename:
            new_file_name = filename.replace(CoreConst.BACKWARD_PATTERN,
                                             CoreConst.BACKWARD_PATTERN + str(cell_index) + CoreConst.SEP)
        else:
            # Neither marker present: nothing sensible to rename to.
            continue

        if dump_task == CoreConst.TENSOR:
            move_file(os.path.join(path, filename), os.path.join(path, new_file_name))
        elif dump_task == CoreConst.STATISTICS:
            data_df.loc[index, CoreConst.OP_NAME] = new_file_name
    logger.info("==========The rename_filename phase is Finished!==========")
284
+
285
+
286
# Extract the field between the first "." and the third-to-last ".",
# i.e. {cell_name} in Cell.{cell_name}.{class_name}.{data_mode}.{suffix}.
def get_cell_name(name):
    """Return the cell-name portion of a dump identifier, or None if the
    identifier has fewer than four "."-separated parts.

    (Renamed the parameter: it previously shadowed the builtin `str`.)
    """
    parts = name.split(CoreConst.SEP)
    if len(parts) < 4:
        return None
    # parts[0] is the leading "Cell" tag; the last three parts are
    # {class_name}.{data_mode}.{suffix}.
    return CoreConst.SEP.join(parts[1:len(parts) - 3])
294
+
295
+
296
# Extract the field between the second-to-last "." and the last ".",
# i.e. {data_mode} (forward/backward).
def get_data_mode(name):
    """Return the data-mode segment of a dump identifier.

    (Renamed the parameter: it previously shadowed the builtin `str`.)
    """
    last_dot_index = name.rfind(CoreConst.SEP)
    second_last_dot_index = name.rfind(CoreConst.SEP, 0, last_dot_index)
    return name[second_last_dot_index + 1:last_dot_index]
302
+
303
+
304
# Decide whether `parent_cell_name` is the direct parent of `cell_name`.
def check_relation(cell_name, parent_cell_name):
    """Return True when the two dotted cell names are in a parent/child
    relation: either dropping the last "."-segment of `cell_name`, or
    dropping a trailing ".layers.<n>" segment, yields `parent_cell_name`.
    """
    layers_pattern = rf"{CoreConst.SEP}{KEY_LAYERS}{CoreConst.SEP}\d+$"
    last_dot_index = cell_name.rfind(CoreConst.SEP)
    if last_dot_index == -1:
        # No separator at all: cell_name cannot have a parent.
        return False
    # Direct child: everything before the last "." equals the parent name.
    if cell_name[:last_dot_index] == parent_cell_name:
        return True
    # Child inside a layer list: strip the ".layers.<n>" tail and compare.
    if re.search(layers_pattern, cell_name):
        return re.sub(layers_pattern, '', cell_name) == parent_cell_name
    return False
320
+
321
+
322
def get_construct(cell_list_input):
    """Populate the module-level `construct` map with cell -> parent-cell
    entries; a cell with no matching parent maps to None.

    A candidate counts as the parent when check_relation() holds between the
    extracted cell names AND both entries share the same data mode.
    """
    for cell in cell_list_input:
        name = get_cell_name(cell)
        mode = get_data_mode(cell)
        parent = None
        for candidate in cell_list_input:
            related = check_relation(name, get_cell_name(candidate))
            if related and get_data_mode(candidate) == mode:
                parent = candidate
                break
        construct.update({cell: parent})
337
+
338
+
339
def generate_construct(path):
    """Derive the cell parent/child structure from dump artifacts under
    `path` and save it as construct.json next to the data directory.

    For tensor dumps `path` is a directory of .npy files; for statistics
    dumps it is the statistic CSV. Appends to the module-level `cell_list`
    and resets the module-level `construct` map when done.
    """
    global construct
    if dump_task == CoreConst.TENSOR:
        # File name format: Cell.clip_grad_norm.ClipGradNorm.forward.0.output.1_int32_0.npy
        filenames = sort_filenames(path)
        # Three trailing "."-fields to strip: {input/output}.{index}_{dtype}_{id}.npy
        point_position = 3
    if dump_task == CoreConst.STATISTICS:
        df = read_csv(path)
        # Op Name format: Cell.clip_grad_norm.ClipGradNorm.forward.0.output.1
        filenames = df[CoreConst.OP_NAME].tolist()
        # Two trailing "."-fields to strip: {input/output}.{index}
        point_position = 2

    # Collect Cell.{cell_name}.{class_name}.{data_mode}.{call index} fields
    # into cell_list, keeping the most recent occurrence last.
    for filename in filenames:
        mid_field = filename.rsplit(CoreConst.SEP, point_position)[0]
        if KEY_INPUT in filename:
            # Re-append so the latest input occurrence is at the end.
            if mid_field in cell_list:
                cell_list.remove(mid_field)
            cell_list.append(mid_field)
        else:
            # Output entry: only add if no later output of the same cell
            # is already accounted for.
            if mid_field not in cell_list:
                index = filenames.index(filename)
                # NOTE(review): output_field concatenates without a "."
                # separator, so `output_field in filename_other` can likely
                # never match — and the slice scans cell_list, not filenames,
                # although `index` was computed against filenames. Verify
                # this whole lookahead against the intended format.
                output_field = mid_field + KEY_OUTPUT
                find_flag = False
                for filename_other in cell_list[index + 1:]:
                    if output_field in filename_other:
                        find_flag = True
                if find_flag is False:
                    cell_list.append(mid_field)

    get_construct(cell_list)

    # Write the JSON file one level above the data directory.
    rank_dir = os.path.dirname(path)
    json_path = os.path.join(rank_dir, CONSTRUCT_FILE_NAME)
    save_json(json_path, construct, indent=1)

    # Clear `construct` so the next path starts fresh.
    construct = {}
    logger.info(f"Construct data saved to {json_path}")
379
+
380
+
381
def process_file(file_path):
    """Read one dumped .npy file, rename it to its canonical form, and return
    (op_name, io_kind, tensor_json) for merging into dump.json.

    io_kind is CoreConst.INPUT_ARGS or KEY_OUTPUT depending on whether the
    file holds a cell input or output; (None, None, None) is returned for
    unrecognized names or on any read error.
    """
    try:
        # Load the .npy payload.
        npy_content = load_npy(file_path)
        logger.debug(f"Loaded {file_path}: shape is {npy_content.shape}, dtype is {npy_content.dtype}")

        # Example file name:
        # Cell.network._backbone.loss.CrossEntropyLoss.forward.0.input.0_float32_165.npy
        parts = os.path.basename(file_path).split(CoreConst.SEP)
        data_dtype = ""
        # Pull "float32" out of "0_float32_165" (or "0_in_float32_165").
        data_dtype_list = parts[-2].split('_')
        if len(data_dtype_list) > 1:
            data_dtype = data_dtype_list[-2]
        # op_name example: Cell.network._backbone.loss.CrossEntropyLoss.forward.0
        op_name = CoreConst.SEP.join(parts[:-3])
        ms_dtype = np_ms_dtype_dict.get(data_dtype)
        if ms_dtype is None:
            logger.warning(f"Get dtype None from file {file_path}")

        # Rename the dumped file: drop the dtype and TensorDump auto-id
        # suffixes that the operator appended.
        data_file_name = os.path.basename(file_path)
        data_file_dir = os.path.dirname(file_path)
        parts = data_file_name.split(CoreConst.SEP)
        if len(parts) >= 2:
            param_index = parts[-2].split(CoreConst.REPLACEMENT_CHARACTER)[0]
            pre_parts = CoreConst.SEP.join(parts[:-2])
            new_file_name = pre_parts + CoreConst.SEP + param_index + CoreConst.NUMPY_SUFFIX
            move_file(os.path.join(data_file_dir, data_file_name), os.path.join(data_file_dir, new_file_name))
            logger.debug(f"{data_file_name} is renamed to {new_file_name}")
        else:
            logger.warning(f"Failed to rename {data_file_name}.")
            new_file_name = data_file_name

        # Statistics entry in the dump.json tensor format.
        tensor_json = {
            CoreConst.TYPE: 'mindspore.Tensor',
            CoreConst.DTYPE: str(ms_dtype),
            CoreConst.SHAPE: list(npy_content.shape),
            CoreConst.MAX: npy_content.max().item(),
            CoreConst.MIN: npy_content.min().item(),
            CoreConst.MEAN: npy_content.mean().item(),
            CoreConst.NORM: np.linalg.norm(npy_content).item(),
            CoreConst.DATA_NAME: new_file_name
        }

        # The third-from-last name part says whether this tensor belongs to
        # input_args or to the outputs of the op.
        if parts[-3] == KEY_INPUT:
            return op_name, CoreConst.INPUT_ARGS, tensor_json
        elif parts[-3] == KEY_OUTPUT:
            return op_name, KEY_OUTPUT, tensor_json
        else:
            return None, None, None

    except Exception as e:
        # Best-effort: a single unreadable file must not abort the whole scan.
        logger.error(f"Error reading {file_path}: {e}")
        return None, None, None
436
+
437
+
438
def custom_sort(item, key_to_index):
    """Sort key for (op_name, ...) pairs: the op name's position in the
    precomputed cell order; unknown names sort to the end (infinity)."""
    op_name = item[0]
    if op_name in key_to_index:
        return key_to_index[op_name]
    return float('inf')
441
+
442
+
443
def convert_special_values(value: Any) -> Union[bool, float, None, str, Any]:
    """Normalize a raw CSV cell value.

    "true"/"false" strings (any case) become bools, numeric-looking
    strings become floats, NaN becomes None; anything else is returned
    unchanged.
    """
    if isinstance(value, str):
        lowered = value.lower()
        if lowered == "true":
            return True
        if lowered == "false":
            return False
        try:
            return float(value)
        except ValueError:
            return value
    if pd.isna(value):
        return None
    return value
456
+
457
+
458
def process_csv(path):
    """Convert a merged statistics CSV into [op_name, io_key, tensor_json]
    triples, one per row, ready to be collected into dump.json."""
    column_to_json_key = {
        'Max Value': CoreConst.MAX,
        'Min Value': CoreConst.MIN,
        'Avg Value': CoreConst.MEAN,
        'L2Norm Value': CoreConst.NORM
    }
    df = read_csv(path).sort_values(by='Op Name', ascending=True)
    available_columns = df.columns
    data_info = []
    for _, row in df.iterrows():
        # 'Op Name' format: Cell.network._backbone.loss.CrossEntropyLoss.forward.0.input.0
        full_name = row['Op Name']
        op_name = full_name.rsplit(CoreConst.SEP, 2)[0]

        # input/output marker is the second-to-last dotted part
        io_key = full_name.split(CoreConst.SEP)[-2]

        # Shape arrives as a string, e.g. "(1,4096)" -> [1, 4096]
        shape = [int(num) for num in re.findall(r'\d+', row['Shape'])]

        tensor_json = {
            CoreConst.TYPE: 'mindspore.Tensor',
            CoreConst.DTYPE: str(np_ms_dtype_dict.get(row['Data Type'])),
            CoreConst.SHAPE: shape
        }
        for col_name, json_key in column_to_json_key.items():
            if col_name in available_columns:
                tensor_json[json_key] = convert_special_values(row[col_name])

        if io_key == KEY_INPUT:
            data_info.append([op_name, CoreConst.INPUT_ARGS, tensor_json])
        elif io_key == KEY_OUTPUT:
            data_info.append([op_name, KEY_OUTPUT, tensor_json])
        else:
            data_info.append([None, None, None])
    return data_info
498
+
499
+
500
def generate_dump_info(path):
    """Build dump.json next to `path` from the dumped data.

    In tensor mode, parses every .npy file under `path` in a process pool;
    in statistics mode, parses the merged statistics CSV at `path`. The
    collected tensors are grouped per op, ordered by `cell_list`, and
    written to dump.json in the parent directory.

    :param path: npy directory (tensor mode) or csv file (statistics mode)
    """
    if not os.path.exists(path):
        logger.error("The provided path does not exist.")
        return

    if dump_task == CoreConst.TENSOR:
        dump_data = {"task": "tensor", "level": "L0", "dump_data_dir": path, "data": {}}
        with Pool(processes=10) as pool:
            file_paths = []
            for file in os.listdir(path):
                if file.endswith(FileCheckConst.NUMPY_SUFFIX):
                    file_paths.append((os.path.join(path, file),))
            file_paths.sort()
            results = pool.starmap(process_file, file_paths)
    elif dump_task == CoreConst.STATISTICS:
        dump_data = {"task": "statistics", "level": "L0", "framework": "mindspore", "dump_data_dir": None, "data": {}}
        results = process_csv(path)
    else:
        # Fix: the original fell through here with `dump_data`/`results`
        # unbound and crashed with NameError on any other task value.
        logger.error(f"Unsupported dump task: {dump_task}.")
        return

    # Collect results per op
    data_dict = dump_data.get(CoreConst.DATA, {})
    for op_name, key, tensor_json in results:
        if not op_name:
            continue
        op_entry = data_dict.setdefault(op_name, {CoreConst.INPUT_ARGS: [],
                                                  CoreConst.INPUT_KWARGS: {},
                                                  KEY_OUTPUT: []})
        op_entry.setdefault(key, []).append(tensor_json)

    # Order ops by their position in cell_list
    key_to_index = {key: index for index, key in enumerate(cell_list)}
    dump_data[CoreConst.DATA] = dict(
        sorted(data_dict.items(), key=lambda item: custom_sort(item, key_to_index))
    )

    # Write the data to dump.json
    json_path = os.path.join(os.path.dirname(path), 'dump.json')
    save_json(json_path, dump_data, indent=1)

    logger.info(f"Dump data saved to {json_path}")
540
+
541
+
542
def generate_stack_info(path):
    """Write stack.json (one empty stack entry per collected cell name)
    next to `path`; in statistics mode the source CSV is then removed."""
    if not os.path.exists(path):
        logger.error("The provided path does not exist.")
        return

    # One empty stack entry per known cell
    stack_data = {cell_name: [] for cell_name in cell_list}

    # Write the data to stack.json
    json_path = os.path.join(os.path.dirname(path), 'stack.json')
    save_json(json_path, stack_data, indent=1)

    # Remove the csv file (no longer needed in statistics mode)
    if dump_task == CoreConst.STATISTICS:
        remove_path(path)

    logger.info(f"Stack data saved to {json_path}")
560
+
561
+
562
def is_download_finished(directory, interval=3):
    """Check whether a directory has stopped receiving data.

    Samples the directory's modification time twice, `interval` seconds
    apart; an unchanged mtime is taken to mean the download is finished.

    :param directory: path of the directory to watch
    :param interval: seconds between the two samples (default 3)
    :return: True if no modification was observed, False otherwise
    """
    # A missing directory can never be "finished"
    if not os.path.exists(directory):
        logger.warning(f"The specified directory {directory} does not exist.")
        return False
    mtime_before = os.path.getmtime(directory)
    time.sleep(interval)
    mtime_after = os.path.getmtime(directory)
    # Compare the initial and current modification times
    return mtime_after <= mtime_before
581
+
582
+
583
def process(dump_path):
    """Post-process tensor-mode dump data for every step directory.

    For each step under dump_path: wait until the dump directory's
    modification time stabilizes (data finished downloading), then rename
    the dumped files and generate construct.json, dump.json and
    stack.json. In the single-card case (no RANK_ID env var) the rank
    directory is renamed to plain "rank".
    """
    rank_id = os.environ.get('RANK_ID')
    rank_dir = DEFAULT_RANK_DIR
    if rank_id is not None:
        rank_dir = CoreConst.RANK + str(rank_id)

    step_dir_list = os.listdir(dump_path)
    for step_dir in step_dir_list:
        step_path = os.path.join(dump_path, step_dir)
        rank_path = os.path.join(step_path, rank_dir)
        npy_path = os.path.join(rank_path, CoreConst.DUMP_TENSOR_DATA)
        # Poll until no more data is being written into npy_path.
        while True:
            is_finished = is_download_finished(npy_path)
            if not is_finished:
                logger.info("There is data being downloaded in the specified directory, continue checking...")
            else:
                logger.info("There is no data being downloaded in the specified directory, Stop checking.")
                break
        logger.info("==========Start processing data that has already been stored on the disk!==========")
        rename_filename(path=npy_path)
        generate_construct(npy_path)
        generate_dump_info(npy_path)
        generate_stack_info(npy_path)
        # Single-card case: the rank directory is named just "rank"
        if rank_id is None:
            new_rank_path = os.path.join(step_path, CoreConst.RANK)
            try:
                move_directory(rank_path, new_rank_path)
                logger.info(f"Directory was successfully renamed to: {new_rank_path}")
            except Exception as e:
                logger.warning(f"Failed to renamed to {new_rank_path}: {e}")
    logger.info("==========JSON file generation completed!==========")
615
+
616
+
617
# Strip the trailing comma from every data row of a csv file
def remove_trailing_commas(filename):
    """Drop the empty trailing field (produced by a trailing comma) from
    every data row of `filename` and rewrite the file in place."""
    rows = read_csv(filename, as_pd=False)
    # Skip the header row; inner row lists are shared, so popping mutates
    # the data that gets written back.
    for row in rows[1:]:
        if row and row[-1] == "":
            row.pop()
    write_csv(rows, filename, mode="w")
624
+
625
+
626
# Merge the csv files of the same step, post-process them and store the
# result under the corresponding step directory
def merge_file(dump_path, rank_dir, file_dict):
    """Merge per-kernel statistic CSVs step by step.

    For each step in file_dict, concatenates its CSVs (sorted by
    Timestamp, rows with Slot == 0 dropped), rewrites the 'Op Name'
    column into Cell.<name>.<class>.<forward/backward>.<n>.<in/out>.<idx>
    format via rename_filename, and writes the result to
    <dump_path>/<step>/<rank>/statistic.csv.

    :param dump_path: root dump directory
    :param rank_dir: KBK rank directory name (underscores are stripped)
    :param file_dict: {step number string: [csv paths]}
    """
    rank_dir = rank_dir.replace(CoreConst.REPLACEMENT_CHARACTER, '')
    for step_dir, file_list in file_dict.items():
        step_dir = CoreConst.STEP + step_dir
        rank_path = os.path.join(dump_path, step_dir, rank_dir)
        create_directory(rank_path)
        output_file = os.path.join(rank_path, KEY_STATISTIC_CSV)

        all_dfs = []
        try:
            for file_path in file_list:
                remove_trailing_commas(file_path)
                all_dfs.append(read_csv(file_path))

            # Merge all DataFrames
            merged_df = pd.concat(all_dfs, ignore_index=True)
            # Sort ascending by the Timestamp field
            merged_df = merged_df.sort_values(by='Timestamp', ascending=True)
            # Drop rows whose Slot field is 0
            merged_df = merged_df[merged_df['Slot'] != 0]
            # Reset the index so it starts from 0
            merged_df.reset_index(drop=True, inplace=True)
        except FileNotFoundError as e:
            logger.error(f"File not found: {e.filename}")
            # Fix: the original fell through with `merged_df` unbound and
            # crashed with an uncaught NameError below; skip this step.
            continue

        try:
            # Extract op_name and reshape it into
            # Cell.network._backbone.LlamaForCausalLM.forward.input.0 format
            merged_df[CoreConst.OP_NAME] = merged_df[CoreConst.OP_NAME].str.split(KEY_DUMP_TENSOR_DATA, expand=True)[1]
            merged_df[CoreConst.OP_NAME] = (
                merged_df[CoreConst.OP_NAME].str.split(CoreConst.PIPE_SEPARATOR, expand=True)[0]
            )
            merged_df[CoreConst.OP_NAME] = (
                merged_df[CoreConst.OP_NAME].str.replace(CoreConst.HYPHEN, CoreConst.SEP, regex=False)
            )
            # Rename op_name into
            # Cell.{cell_name}.{class_name}.{forward/backward}.{number}.{input/output}.{index} format
            rename_filename(data_df=merged_df)

            # Save the merged and sorted DataFrame under the step directory
            write_df_to_csv(merged_df, output_file)
        except KeyError:
            logger.error("The value of the ‘Op Name’ field does not contain KEY_DUMP_TENSOR_DATA,"
                         " and the index is out of bounds.")
670
+
671
+
672
def process_statistics(dump_path):
    """Post-process statistics-mode (KBK) dump data.

    Groups the per-kernel statistic.csv files under the KBK rank
    directory by step, merges each group into one processed CSV under
    <dump_path>/<step>/<rank>/, then builds construct.json, dump.json and
    stack.json from it. In the single-card case (no RANK_ID env var) the
    rank directory is renamed to plain "rank".
    """
    rank_id = os.environ.get('RANK_ID')
    rank_dir_kbk = "rank_0"
    if rank_id is not None:
        rank_dir_kbk = CoreConst.RANK + CoreConst.REPLACEMENT_CHARACTER + str(rank_id)
    rank_path_kbk = os.path.join(dump_path, rank_dir_kbk)

    # Group the csv file paths by step number into file_dict
    file_dict = {}
    depth_limit = 4
    base_depth = rank_path_kbk.count(os.sep)
    for root, _, files in os.walk(rank_path_kbk):
        current_depth = root.count(os.sep) - base_depth
        if current_depth > depth_limit:
            continue
        for file in files:
            if file == KEY_STATISTIC_CSV:
                file_path = os.path.join(root, file)
                step_dir = os.path.basename(os.path.dirname(file_path))
                if step_dir in file_dict:
                    file_dict[step_dir].append(file_path)
                else:
                    file_dict[step_dir] = [file_path]

    # Merge csv files of the same step and store the processed result
    # under the corresponding step directory
    merge_file(dump_path, rank_dir_kbk, file_dict)

    rank_dir = rank_dir_kbk.replace(CoreConst.REPLACEMENT_CHARACTER, '')
    dir_list = os.listdir(dump_path)
    step_dir_list = [d for d in dir_list if d.startswith(CoreConst.STEP)]
    for step_dir in step_dir_list:
        step_path = os.path.join(dump_path, step_dir)
        rank_path = os.path.join(step_path, rank_dir)
        csv_path = os.path.join(rank_path, KEY_STATISTIC_CSV)
        logger.info("==========Start processing data csv!==========")
        generate_construct(csv_path)
        generate_dump_info(csv_path)
        generate_stack_info(csv_path)
        # NOTE(review): this removes the original KBK rank directory on
        # every step iteration; after the first pass the path is already
        # gone — confirm remove_path tolerates a missing path.
        remove_path(rank_path_kbk)
        # Single-card case: the rank directory is named just "rank"
        if rank_id is None:
            new_rank_path = os.path.join(step_path, CoreConst.RANK)
            try:
                move_directory(rank_path, new_rank_path)
                logger.info(f"Directory was successfully renamed to: {new_rank_path}")
            except Exception as e:
                logger.warning(f"Failed to renamed to {new_rank_path}: {e}")
    logger.info("==========JSON file generation completed!==========")
720
+
721
+
722
def get_yaml_keys(yaml_data):
    """Return the top-level keys of a parsed YAML mapping as a list.

    Idiom fix: the original re-implemented list(dict.keys()) with a
    manual loop over .items().
    """
    return list(yaml_data.keys())
727
+
728
+
729
def get_tensordump_mode(input_str):
    """Extract the first two comma-separated items found inside the first
    '(' ... ')' pair of input_str, with surrounding whitespace stripped.

    Returns (None, None) when either parenthesis is missing or fewer than
    two items are present.
    """
    open_idx = input_str.find('(')
    close_idx = input_str.find(')')
    if open_idx == -1 or close_idx == -1:
        return None, None
    # Split the parenthesized segment into its elements
    elements = input_str[open_idx + 1:close_idx].split(',')
    if len(elements) < 2:
        return None, None
    return elements[0].strip(), elements[1].strip()
744
+
745
+
746
def str_to_list(input_str):
    """Parse a string like "[a, b, c]" into ['a', 'b', 'c'], trimming the
    enclosing brackets and per-item whitespace."""
    return [element.strip() for element in input_str.strip('[]').split(',')]
751
+
752
+
753
def set_tensordump_mode(cell, input_str):
    """Configure a cell's TensorDump modes from a yaml value.

    input_str looks like "Op([in-modes], [out-modes])": the first
    bracketed list feeds cell.input_dump_mode, the second
    cell.output_dump_mode. The cell is left untouched when either part is
    missing or empty.
    """
    first_part, second_part = get_tensordump_mode(input_str)
    if not (first_part and second_part):
        return
    input_modes = str_to_list(first_part)
    output_modes = str_to_list(second_part)
    if input_modes and output_modes:
        cell.input_dump_mode = input_modes
        cell.output_dump_mode = output_modes
763
+
764
+
765
def create_kbyk_json(dump_path, summary_mode, step):
    """Create the MindSpore kernel-by-kernel dump config JSON under dump_path.

    :param dump_path: directory the config file and dump data go to
    :param summary_mode: "statistics" or a list of statistic names
        ("mean" is translated to the dump tool's "avg")
    :param step: iterable of step numbers to dump; falsy means all steps
    :return: path of the written config JSON
    """
    if step:
        # Build a "1|2|5"-style iteration selector
        iteration = '|'.join(str(s) for s in step)
    else:
        iteration = "all"

    if summary_mode == "statistics":
        statistic_category = ["max", "min", "avg", "l2norm"]
    elif "mean" in summary_mode:
        # Work on a copy: the original mutated the caller's list in place.
        statistic_category = list(summary_mode)
        statistic_category[statistic_category.index("mean")] = "avg"
    else:
        statistic_category = summary_mode

    config_json = {
        "common_dump_settings": {
            "op_debug_mode": 0,
            "dump_mode": 1,
            "path": dump_path,
            "net_name": "Net",
            "iteration": iteration,
            "saved_data": "statistic",
            "input_output": 0,
            "kernels": ["TensorDump"],
            "support_device": [0, 1, 2, 3, 4, 5, 6, 7],
            "statistic_category": statistic_category
        },
        "e2e_dump_settings": {
            "enable": False,
            "trans_flag": True,
            "stat_calc_mode": "device"
        }
    }

    create_directory(dump_path)
    rank_id = os.environ.get('RANK_ID')
    if rank_id is None:
        rank_id = 0
    # str() guards the single-card default (int 0): the original raised
    # TypeError on int + str concatenation when RANK_ID was unset.
    config_json_path = os.path.join(dump_path, str(rank_id) + "kernel_kbyk_dump.json")
    save_json(config_json_path, config_json, indent=4)
    logger.info(config_json_path + " has been created.")
    return config_json_path
811
+
812
+
813
def start(config: CellDumpConfig):
    """Enable cell-level dump for a MindSpore network.

    In statistics mode, writes a kernel-by-kernel dump config JSON,
    points MINDSPORE_DUMP_CONFIG at it, skips the TensorDump kernel
    launch and resets the static-graph dump step counter. For both
    tasks, wraps the construct function of every non-blacklisted cell of
    config.net and registers the matching post-processing function via
    atexit.

    :param config: CellDumpConfig carrying task, net, dump_path,
        data_mode, summary_mode and step.
    :raises Exception: in statistics mode when MindSpore's
        _set_init_iter could not be imported (graph_step_flag is False).
    """
    global dump_task
    dump_task = config.task
    net = config.net
    dump_path = config.dump_path
    data_mode = config.data_mode
    summary_mode = config.summary_mode
    step = config.step
    if dump_task == CoreConst.STATISTICS:
        # Enable KBK dump
        config_json_path = create_kbyk_json(dump_path, summary_mode, step)
        os.environ["MINDSPORE_DUMP_CONFIG"] = config_json_path

        # Skip the TensorDump operator during execution
        os.environ["MS_KERNEL_LAUNCH_SKIP"] = "TensorDump"

        # Initialize the static-graph KBK dump step counter to 0
        if not graph_step_flag:
            raise Exception(
                "Importing _set_init_iter failed, "
                "please use the latest version package of MindSpore."
            )
        _set_init_iter(0)
        remove_path(config_json_path)

    if not dump_gradient_op_existed or net is None:
        return

    if isinstance(net, nn.Cell):
        net = (('', net),)

    td_config_path = ""
    try:
        import mindformers
        mindformers_file = mindformers.__file__
        mindformers_dir = os.path.dirname(mindformers_file)
        td_config_path = os.path.join(mindformers_dir, "configuration", "layer_mapping.yaml")
        if not os.path.exists(td_config_path):
            td_config_path = ""
            logger.warning("The configuration file in mindformers was not loaded, the default mode will be used.")
    except ImportError:
        logger.warning("The mindFormers failed to load, the default mode will be used.")

    if td_config_path == "":
        yaml_data = {}
    else:
        yaml_data = load_yaml(td_config_path)
    first_layer_key = get_yaml_keys(yaml_data)

    black_list = ["grad_reducer", ""]

    for name_and_model in net:
        for name, cell in name_and_model[1].cells_and_names(name_prefix=name_and_model[0]):
            class_name = cell.__class__.__name__
            # Skip blacklisted cells
            if name in black_list:
                logger.info(f"Cell {name}.{class_name} is skipped!")
                continue
            # Skip framework-internal cells
            if class_name.startswith(CoreConst.REPLACEMENT_CHARACTER):
                logger.info(f"Cell {name}.{class_name} is skipped!")
                continue
            else:
                # Format: Cell.{cell_name}.{class_name}
                cell.cell_prefix = CoreConst.SEP.join([CoreConst.CELL, name, cell.__class__.__name__])
                if dump_task == CoreConst.STATISTICS:
                    cell.cell_prefix = cell.cell_prefix.replace(CoreConst.SEP, CoreConst.HYPHEN)

            # Set the cell's TensorDump mode according to the yaml config file
            if class_name in first_layer_key:
                layer_data = yaml_data.get(class_name)
                if layer_data:
                    for child_name, child_cell in cell.cells_and_names():
                        if child_name in layer_data:
                            set_tensordump_mode(child_cell, layer_data[child_name])
            top_layer_data = yaml_data.get(KEY_TOPLAYER)
            if top_layer_data and name in top_layer_data:
                set_tensordump_mode(cell, top_layer_data[name])

            # Replace the construct function with the dumping wrapper
            cell.construct = cell_construct_wrapper(cell.construct, cell)
            logger.info(f"Cell {name}: construct function is wrapped!")
            cell.dump_path = dump_path
            cell.data_mode = data_mode

    logger.info("==========The cell_dump_process_start phase is Finished!==========")
    if dump_task == CoreConst.TENSOR:
        atexit.register(process, dump_path=dump_path)
    if dump_task == CoreConst.STATISTICS:
        atexit.register(process_statistics, dump_path=dump_path)