mindspore 2.2.11-cp37-none-any.whl → 2.2.14-cp37-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this version of mindspore has been flagged as a potentially problematic release.

Files changed (118):
  1. mindspore/.commit_id +1 -1
  2. mindspore/__init__.py +2 -1
  3. mindspore/_akg/akg/topi/cpp/impl.py +1 -1
  4. mindspore/_akg/akg/tvm/_ffi/base.py +1 -1
  5. mindspore/_c_dataengine.cpython-37m-aarch64-linux-gnu.so +0 -0
  6. mindspore/_c_expression.cpython-37m-aarch64-linux-gnu.so +0 -0
  7. mindspore/_c_mindrecord.cpython-37m-aarch64-linux-gnu.so +0 -0
  8. mindspore/_mindspore_offline_debug.cpython-37m-aarch64-linux-gnu.so +0 -0
  9. mindspore/bin/cache_admin +0 -0
  10. mindspore/bin/cache_server +0 -0
  11. mindspore/common/tensor.py +0 -2
  12. mindspore/communication/management.py +3 -0
  13. mindspore/context.py +34 -4
  14. mindspore/dataset/engine/datasets.py +23 -0
  15. mindspore/dataset/engine/validators.py +1 -1
  16. mindspore/dataset/vision/py_transforms_util.py +2 -2
  17. mindspore/experimental/optim/lr_scheduler.py +5 -6
  18. mindspore/lib/libdnnl.so.2 +0 -0
  19. mindspore/lib/libmindspore.so +0 -0
  20. mindspore/lib/libmindspore_backend.so +0 -0
  21. mindspore/lib/libmindspore_common.so +0 -0
  22. mindspore/lib/libmindspore_core.so +0 -0
  23. mindspore/lib/libmindspore_glog.so.0 +0 -0
  24. mindspore/lib/libmindspore_gpr.so.15 +0 -0
  25. mindspore/lib/libmindspore_grpc.so.15 +0 -0
  26. mindspore/lib/libmindspore_shared_lib.so +0 -0
  27. mindspore/lib/libopencv_core.so.4.5 +0 -0
  28. mindspore/lib/plugin/ascend/custom_aicpu_ops/op_impl/cpu/aicpu_kernel/impl/libcust_aicpu_kernels.so +0 -0
  29. mindspore/lib/plugin/ascend/custom_aicpu_ops/op_impl/cpu/aicpu_kernel/impl/libcust_cpu_kernels.so +0 -0
  30. mindspore/lib/plugin/ascend/custom_aicpu_ops/op_impl/cpu/config/cust_aicpu_kernel.json +48 -0
  31. mindspore/lib/plugin/ascend/custom_aicpu_ops/op_proto/libcust_op_proto.so +0 -0
  32. mindspore/lib/plugin/ascend/libakg.so +0 -0
  33. mindspore/lib/plugin/ascend/libascend_collective.so +0 -0
  34. mindspore/lib/plugin/ascend/libdvpp_utils.so +0 -0
  35. mindspore/lib/plugin/ascend/libmindspore_aicpu_kernels.so +0 -0
  36. mindspore/lib/plugin/ascend/libmindspore_cpu_kernels.so +0 -0
  37. mindspore/lib/plugin/libmindspore_ascend.so.1 +0 -0
  38. mindspore/mindrecord/tools/cifar100_to_mr.py +49 -57
  39. mindspore/mindrecord/tools/cifar10_to_mr.py +46 -55
  40. mindspore/mindrecord/tools/csv_to_mr.py +3 -8
  41. mindspore/mindrecord/tools/mnist_to_mr.py +4 -9
  42. mindspore/mindrecord/tools/tfrecord_to_mr.py +1 -4
  43. mindspore/nn/layer/activation.py +1 -1
  44. mindspore/nn/layer/embedding.py +2 -2
  45. mindspore/nn/loss/loss.py +1 -1
  46. mindspore/nn/optim/ada_grad.py +2 -2
  47. mindspore/nn/optim/sgd.py +3 -2
  48. mindspore/numpy/math_ops.py +1 -1
  49. mindspore/ops/__init__.py +3 -0
  50. mindspore/ops/_grad_experimental/grad_array_ops.py +0 -31
  51. mindspore/ops/_grad_experimental/grad_comm_ops.py +4 -2
  52. mindspore/ops/_grad_experimental/grad_inner_ops.py +8 -0
  53. mindspore/ops/_grad_experimental/grad_math_ops.py +37 -17
  54. mindspore/ops/_op_impl/aicpu/__init__.py +1 -0
  55. mindspore/ops/_op_impl/aicpu/generate_eod_mask.py +38 -0
  56. mindspore/ops/function/array_func.py +6 -5
  57. mindspore/ops/function/debug_func.py +1 -1
  58. mindspore/ops/function/linalg_func.py +21 -11
  59. mindspore/ops/function/math_func.py +3 -0
  60. mindspore/ops/function/nn_func.py +13 -11
  61. mindspore/ops/function/parameter_func.py +2 -0
  62. mindspore/ops/function/sparse_unary_func.py +2 -2
  63. mindspore/ops/function/vmap_func.py +1 -0
  64. mindspore/ops/operations/_embedding_cache_ops.py +1 -1
  65. mindspore/ops/operations/_inner_ops.py +56 -1
  66. mindspore/ops/operations/_quant_ops.py +4 -4
  67. mindspore/ops/operations/_rl_inner_ops.py +1 -1
  68. mindspore/ops/operations/array_ops.py +15 -4
  69. mindspore/ops/operations/custom_ops.py +1 -1
  70. mindspore/ops/operations/debug_ops.py +1 -1
  71. mindspore/ops/operations/image_ops.py +3 -3
  72. mindspore/ops/operations/inner_ops.py +49 -0
  73. mindspore/ops/operations/math_ops.py +62 -0
  74. mindspore/ops/operations/nn_ops.py +7 -3
  75. mindspore/ops/operations/random_ops.py +2 -0
  76. mindspore/ops/operations/sparse_ops.py +4 -4
  77. mindspore/ops/silent_check.py +162 -0
  78. mindspore/parallel/__init__.py +3 -2
  79. mindspore/parallel/_auto_parallel_context.py +82 -3
  80. mindspore/parallel/_parallel_serialization.py +34 -2
  81. mindspore/parallel/_tensor.py +3 -1
  82. mindspore/parallel/_transformer/transformer.py +8 -8
  83. mindspore/parallel/checkpoint_transform.py +191 -45
  84. mindspore/profiler/parser/ascend_cluster_generator.py +111 -0
  85. mindspore/profiler/parser/ascend_communicate_generator.py +315 -0
  86. mindspore/profiler/parser/ascend_flops_generator.py +8 -2
  87. mindspore/profiler/parser/ascend_fpbp_generator.py +8 -2
  88. mindspore/profiler/parser/ascend_hccl_generator.py +2 -2
  89. mindspore/profiler/parser/ascend_msprof_exporter.py +30 -6
  90. mindspore/profiler/parser/ascend_msprof_generator.py +16 -5
  91. mindspore/profiler/parser/ascend_op_generator.py +15 -7
  92. mindspore/profiler/parser/ascend_timeline_generator.py +5 -2
  93. mindspore/profiler/parser/base_timeline_generator.py +11 -3
  94. mindspore/profiler/parser/cpu_gpu_timeline_generator.py +2 -1
  95. mindspore/profiler/parser/framework_parser.py +8 -2
  96. mindspore/profiler/parser/memory_usage_parser.py +8 -2
  97. mindspore/profiler/parser/minddata_analyzer.py +8 -2
  98. mindspore/profiler/parser/minddata_parser.py +1 -1
  99. mindspore/profiler/parser/msadvisor_analyzer.py +4 -2
  100. mindspore/profiler/parser/msadvisor_parser.py +9 -3
  101. mindspore/profiler/profiling.py +97 -25
  102. mindspore/rewrite/api/node.py +1 -1
  103. mindspore/rewrite/api/symbol_tree.py +2 -2
  104. mindspore/train/callback/_checkpoint.py +8 -8
  105. mindspore/train/callback/_landscape.py +2 -3
  106. mindspore/train/callback/_summary_collector.py +6 -7
  107. mindspore/train/dataset_helper.py +6 -0
  108. mindspore/train/model.py +17 -5
  109. mindspore/train/serialization.py +6 -1
  110. mindspore/train/summary/_writer_pool.py +1 -1
  111. mindspore/train/summary/summary_record.py +5 -6
  112. mindspore/version.py +1 -1
  113. {mindspore-2.2.11.dist-info → mindspore-2.2.14.dist-info}/METADATA +1 -1
  114. {mindspore-2.2.11.dist-info → mindspore-2.2.14.dist-info}/RECORD +117 -114
  115. mindspore/lib/plugin/libmindspore_ascend.so.2 +0 -0
  116. {mindspore-2.2.11.dist-info → mindspore-2.2.14.dist-info}/WHEEL +0 -0
  117. {mindspore-2.2.11.dist-info → mindspore-2.2.14.dist-info}/entry_points.txt +0 -0
  118. {mindspore-2.2.11.dist-info → mindspore-2.2.14.dist-info}/top_level.txt +0 -0
mindspore/profiler/parser/ascend_communicate_generator.py (new file)
@@ -0,0 +1,315 @@
+ # Copyright 2024 Huawei Technologies Co., Ltd
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ # ============================================================================
+ """communicate data analyze api file"""
+ import json
+ import re
+ import logging
+ import os
+ import stat
+ from collections import defaultdict
+
+ from mindspore.profiler.common.exceptions.exceptions import ProfilerIOException
+
+
+ class AscendCommunicationGenerator:
+     """
+     load and split communication info by step
+     """
+     COMMUNICATION_TIME_INFO = "Communication Time Info"
+     START_TIMESTAMP = "Start Timestamp(us)"
+     COMMUNICATION_BANDWIDTH_INFO = "Communication Bandwidth Info"
+     HCOM_SEND = "Send"
+     HCOM_RECEIVE = "Receive"
+     TOTAL = "Total"
+     SYNCHRONIZATION_TIME_RATIO = "Synchronization Time Ratio"
+     SYNCHRONIZATION_TIME_MS = "Synchronization Time(ms)"
+     WAIT_TIME_RATIO = "Wait Time Ratio"
+     TRANSIT_TIME_MS = "Transit Time(ms)"
+     TRANSIT_SIZE_MB = "Transit Size(MB)"
+     SIZE_DISTRIBUTION = "Size Distribution"
+     WAIT_TIME_MS = "Wait Time(ms)"
+     BANDWIDTH_GB_S = "Bandwidth(GB/s)"
+     COMMUNICATION = "communication.json"
+     COMMUNICATION_MATRIX = "communication_matrix.json"
+     P2P = "p2p"
+     COLLECTIVE = "collective"
+     TRANSPORT_TYPE = "Transport Type"
+     PATTERN1 = re.compile(r"receive|send")
+     PATTERN2 = re.compile(r"invalid|broadcast|allreduce|reduce|"
+                           r"allgather|reducescatter|scatter|alltoall|alltoallv|alltoallvc")
+
+     def __init__(self, source_path):
+         super().__init__()
+         self.root_path = source_path
+         self.step_list = [{"step_id": None, "start_ts": 0, "end_ts": float('inf'), "comm_ops": {}}]
+         self.output_communication = {}
+         self.output_matrix_data = {}
+
+     @staticmethod
+     def combine_size_distribution(op_dict: dict, total_dict: dict):
+         """combine size distribution"""
+         for size, size_info in op_dict.items():
+             total_dict[size][0] += size_info[0]
+             total_dict[size][1] += size_info[1]
+
+     @staticmethod
+     def compute_ratio(dividend: float, divisor: float):
+         """compute ratio"""
+         if abs(divisor) < 1e-15:
+             return 0
+         return round(dividend / divisor, 4)
+
+     def parse(self) -> None:
+         """parse"""
+         self.generate_communication()
+         self.generate_matrix()
+
+     def generate_communication(self):
+         """
+         generate communication.json
+         """
+         communication_file = os.path.join(self.root_path, self.COMMUNICATION)
+         with open(communication_file) as file:
+             communication_data = json.load(file)
+         if not communication_data:
+             return
+         self.split_comm_op_by_step(communication_data)
+
+         for step_info in self.step_list:
+             step = "step" + step_info.get("step_id") if step_info.get("step_id") else "step"
+             self.output_communication[step] = self.get_communication_ops_dict(step_info.get("comm_ops"))
+
+     def generate_matrix(self):
+         """generate matrix"""
+         communication_file = os.path.join(self.root_path, self.COMMUNICATION_MATRIX)
+         with open(communication_file) as file:
+             matrix_data = json.load(file)
+         if not matrix_data:
+             return
+         matrix_data_by_step = self.split_matrix_by_step(matrix_data)
+
+         for step, comm_matrix_data in matrix_data_by_step.items():
+             self.output_matrix_data[step] = self.get_matrix_ops_dict(comm_matrix_data)
+
+     def split_comm_op_by_step(self, communication_data: dict):
+         """split comm op by step"""
+         if len(self.step_list) == 1:
+             self.step_list[0]["comm_ops"] = communication_data
+         for communication_op, communication_op_info in communication_data.items():
+             start_time = communication_op_info.get(self.COMMUNICATION_TIME_INFO, {}).get(self.START_TIMESTAMP)
+             for step_info in self.step_list:
+                 if step_info.get("start_ts", -1) <= start_time <= step_info.get("end_ts", -1):
+                     step_info.get("comm_ops", {})[communication_op] = communication_op_info
+                     break
+
+     def split_communication_p2p_ops(self, op_data: dict):
+         """
+         split communicate
+         """
+         comm_op_dict = {self.P2P: {}, self.COLLECTIVE: {}}
+         for communication_op, communication_info in op_data.items():
+             if communication_op.find(self.HCOM_SEND) != -1 or communication_op.find(self.HCOM_RECEIVE) != -1:
+                 comm_op_dict[self.P2P][communication_op] = communication_info
+             elif communication_op.startswith(self.TOTAL):
+                 continue
+             else:
+                 comm_op_dict[self.COLLECTIVE][communication_op] = communication_info
+         return comm_op_dict
+
+     def split_matrix_by_step(self, matrix_data: dict) -> dict:
+         """
+         split matrix by step
+         """
+         matrix_data_by_step = {}
+         if self.is_step_list_empty():
+             matrix_data_by_step["step"] = matrix_data
+             return matrix_data_by_step
+
+         for comm_op in matrix_data:
+             for step_info in self.step_list:
+                 if comm_op in step_info.get("comm_ops", {}):
+                     step = "step" + step_info.get("step_id") if step_info.get("step_id") else "step"
+                     matrix_data_by_step.setdefault(step, {})[comm_op] = matrix_data.get(comm_op)
+                     break
+         return matrix_data_by_step
+
+     def get_communication_ops_dict(self, op_data: dict) -> dict:
+         """get communication ops dict"""
+         comm_op_dict = self.split_communication_p2p_ops(op_data)
+         self.compute_total_info(comm_op_dict[self.P2P])
+         self.compute_total_info(comm_op_dict[self.COLLECTIVE])
+         return comm_op_dict
+
+     def integrate_matrix_data(self, comm_op_dict_simple):
+         """integrate the matrix data"""
+         comm_op_dict = defaultdict(dict)
+         for new_comm_op_name, data in comm_op_dict_simple.items():
+             data.sort(key=lambda x: x[self.BANDWIDTH_GB_S], reverse=True)
+             t_type = data[0].get(self.TRANSPORT_TYPE, '')
+             t_size = sum(x.get(self.TRANSIT_SIZE_MB, 0) for x in data)
+             t_time = sum(x.get(self.TRANSIT_TIME_MS, 0) for x in data)
+             bandwidth = self.compute_ratio(t_size, t_time)
+
+             link = new_comm_op_name[2]
+
+             comm_op_dict[f'{new_comm_op_name[0]}-top1@{new_comm_op_name[1]}'].update({link: data[0]})
+             comm_op_dict[f'{new_comm_op_name[0]}-middle@{new_comm_op_name[1]}'].update({link: data[len(data) // 2]})
+             comm_op_dict[f'{new_comm_op_name[0]}-bottom1@{new_comm_op_name[1]}'].update({link: data[-1]})
+             index2 = -2
+             index3 = -3
+             if len(data) == 1:
+                 index2 = -1
+                 index3 = -1
+             elif len(data) == 2:
+                 index3 = -2
+             comm_op_dict[f'{new_comm_op_name[0]}-bottom2@{new_comm_op_name[1]}'].update({link: data[index2]})
+             comm_op_dict[f'{new_comm_op_name[0]}-bottom3@{new_comm_op_name[1]}'].update({link: data[index3]})
+             comm_op_dict[f'{new_comm_op_name[0]}-total@{new_comm_op_name[1]}'].update({link: {
+                 self.TRANSPORT_TYPE: t_type,
+                 self.TRANSIT_SIZE_MB: t_size,
+                 self.TRANSIT_TIME_MS: t_time,
+                 self.BANDWIDTH_GB_S: bandwidth
+             }})
+         return comm_op_dict
+
+     def get_matrix_ops_dict(self, op_data: dict) -> dict:
+         """parse matrix data"""
+         comm_op_dict_simple_p2p = defaultdict(list)
+         comm_op_dict_simple_collective = defaultdict(list)
+
+         for communication_op, communication_info in op_data.items():
+             if communication_op.find(self.HCOM_SEND) != -1 or communication_op.find(self.HCOM_RECEIVE) != -1:
+
+                 match_obj = self.PATTERN1.search(communication_op.lower())
+                 comm_op_type = match_obj.group()
+                 for link, data in communication_info.items():
+                     new_comm_op_name = (comm_op_type, communication_op.split("@")[-1], link)
+                     data['op_name'] = communication_op.split("@")[0]
+                     comm_op_dict_simple_p2p[new_comm_op_name].append(data)
+
+             elif communication_op.startswith(self.TOTAL):
+                 continue
+             else:
+                 match_obj = self.PATTERN2.search(communication_op.lower())
+                 if not match_obj:
+                     comm_op_type = communication_op.lower().split('/')[-1].split('-op')[0]
+                     logging.warning("Communication operator type not found communication_op: %s, use comm_op_type: %s",
+                                     communication_op, comm_op_type)
+                 else:
+                     comm_op_type = match_obj.group()
+
+                 for link, data in communication_info.items():
+                     new_comm_op_name = (comm_op_type, communication_op.split("@")[-1], link)
+                     data['op_name'] = communication_op.split("@")[0]
+                     comm_op_dict_simple_collective[new_comm_op_name].append(data)
+
+         comm_op_dict = {self.P2P: self.integrate_matrix_data(comm_op_dict_simple_p2p),
+                         self.COLLECTIVE: self.integrate_matrix_data(comm_op_dict_simple_collective)}
+
+         return comm_op_dict
+
+     def is_step_list_empty(self):
+         """is step list empty"""
+         for step_info in self.step_list:
+             if step_info.get("comm_ops"):
+                 return False
+         return True
+
+     def compute_total_info(self, comm_ops: dict):
+         """
+         compute total info
+         """
+         if not comm_ops:
+             return
+         total_time_info_dict = defaultdict(float)
+         total_bandwidth_info_dict = {}
+         for _, communication_op_info in comm_ops.items():
+             for com_info, com_info_dict in communication_op_info.items():
+                 if com_info == self.COMMUNICATION_TIME_INFO:
+                     self.combine_time_info(com_info_dict, total_time_info_dict)
+                 if com_info == self.COMMUNICATION_BANDWIDTH_INFO:
+                     self.combine_bandwidth_info(com_info_dict, total_bandwidth_info_dict)
+         self.compute_time_ratio(total_time_info_dict)
+         self.compute_bandwidth_ratio(total_bandwidth_info_dict)
+         comm_ops['Total Op Info'] = {
+             self.COMMUNICATION_TIME_INFO: total_time_info_dict,
+             self.COMMUNICATION_BANDWIDTH_INFO: total_bandwidth_info_dict
+         }
+
+     def combine_time_info(self, com_info_dict: dict, total_time_info_dict: dict):
+         """combine time info"""
+         ratio_list = [self.WAIT_TIME_RATIO, self.SYNCHRONIZATION_TIME_RATIO]
+         for time_info in com_info_dict:
+             if time_info not in ratio_list and time_info != self.START_TIMESTAMP:
+                 total_time_info_dict[time_info] += com_info_dict.get(time_info)
+
+     def combine_bandwidth_info(self, com_info_dict: dict, total_bandwidth_info_dict: dict):
+         """
+         combine bandwidth info
+         """
+         add_list = [self.TRANSIT_TIME_MS, self.TRANSIT_SIZE_MB]
+         dict_list = [self.SIZE_DISTRIBUTION]
+         for transport_type, part_transport_dict in com_info_dict.items():
+             if transport_type not in total_bandwidth_info_dict:
+                 total_bandwidth_info_dict[transport_type] = {
+                     self.TRANSIT_TIME_MS: 0,
+                     self.TRANSIT_SIZE_MB: 0,
+                     self.SIZE_DISTRIBUTION: defaultdict(lambda: [0, 0])
+                 }
+             for bandwidth_msg, value in part_transport_dict.items():
+                 if bandwidth_msg in add_list:
+                     total_bandwidth_info_dict[transport_type][bandwidth_msg] += value
+                 if bandwidth_msg in dict_list:
+                     self.combine_size_distribution(value, total_bandwidth_info_dict[transport_type][bandwidth_msg])
+
+     def compute_time_ratio(self, total_time_info_dict: dict):
+         """compute time ratio"""
+         total_time_info_dict[self.WAIT_TIME_RATIO] = \
+             self.compute_ratio(total_time_info_dict.get(self.WAIT_TIME_MS, 0),
+                                total_time_info_dict.get(self.WAIT_TIME_MS, 0) +
+                                total_time_info_dict.get(self.TRANSIT_TIME_MS, 0))
+         total_time_info_dict[self.SYNCHRONIZATION_TIME_RATIO] = \
+             self.compute_ratio(total_time_info_dict.get(self.SYNCHRONIZATION_TIME_MS, 0),
+                                total_time_info_dict.get(self.TRANSIT_TIME_MS, 0) +
+                                total_time_info_dict.get(self.SYNCHRONIZATION_TIME_MS, 0))
+
+     def compute_bandwidth_ratio(self, total_bandwidth_info_dict: dict):
+         """compute bandwidth ratio"""
+         for _, bandwidth_dict in total_bandwidth_info_dict.items():
+             self.compute_ratio(bandwidth_dict.get(self.TRANSIT_SIZE_MB, 0), bandwidth_dict.get(self.TRANSIT_TIME_MS, 0))
+
+     def write(self, communication_file_path, communication_matrix_file_path):
+         """
+         write communication file and communication matrix file
+         """
+         try:
+             with os.fdopen(os.open(communication_file_path,
+                                    os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600), 'w') as json_file:
+                 json.dump(self.output_communication, json_file)
+         except (IOError, OSError) as err:
+             logging.critical('Error occurred when write communication file: %s', err)
+             raise ProfilerIOException() from err
+         if os.path.exists(communication_file_path):
+             os.chmod(communication_file_path, stat.S_IREAD | stat.S_IWRITE)
+
+         try:
+             with os.fdopen(os.open(communication_matrix_file_path,
+                                    os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600), 'w') as json_file:
+                 json.dump(self.output_matrix_data, json_file)
+         except (IOError, OSError) as err:
+             logging.critical('Error occurred when write communication matrix file: %s', err)
+             raise ProfilerIOException() from err
+         if os.path.exists(communication_matrix_file_path):
+             os.chmod(communication_matrix_file_path, stat.S_IREAD | stat.S_IWRITE)
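
For orientation, a minimal usage sketch of this new parser, based only on the methods shown above (the paths are hypothetical; `source_path` must contain the `communication.json` and `communication_matrix.json` produced by the msprof `--analyze=on` step added further below):

    # Hypothetical driver; directory names are illustrative only.
    from mindspore.profiler.parser.ascend_communicate_generator import AscendCommunicationGenerator

    generator = AscendCommunicationGenerator("/path/to/PROF_XXX/analyze")
    generator.parse()  # fills output_communication / output_matrix_data, keyed by step
    generator.write("/path/to/out/communication.json",
                    "/path/to/out/communication_matrix.json")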
mindspore/profiler/parser/ascend_flops_generator.py
@@ -26,13 +26,19 @@ from mindspore.profiler.common.exceptions.exceptions import ProfilerIOException
  class AscendFlopsGenerator:
      """Generate ascend flops data from DataFrame."""

-     def __init__(self, op_summary):
+     def __init__(self, op_summary, pretty=False):
          self.op_summary = op_summary
          self.flops_dt = np.dtype(
              [('op_full_name', object), ('MFLOPs(10^6 cube)', float), ('GFLOPS(10^9 cube)', float),
               ('MFLOPs(10^6 vector)', float), ('GFLOPS(10^9 vector)', float)])
          self.flops = None
          self.flops_summary = None
+         self.pretty = pretty
+
+     @property
+     def indent(self):
+         indent = 1 if self.pretty else None
+         return indent

      def parse(self):
          """Analyse the op_summary data generate flops data."""
@@ -86,7 +92,7 @@ class AscendFlopsGenerator:
              with os.fdopen(os.open(flops_summary_path,
                                     os.O_WRONLY | os.O_CREAT | os.O_TRUNC, stat.S_IWUSR | stat.S_IRUSR),
                             'w') as json_file:
-                 json.dump(self.flops_summary, json_file)
+                 json.dump(self.flops_summary, json_file, indent=self.indent)
          except (IOError, OSError) as err:
              logging.critical('Errot occurred when write step trace point info file: %s', err)
              raise ProfilerIOException() from err
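
The `pretty`/`indent` pattern introduced here (and repeated in the hunks below) relies on `json.dump` treating `indent=None` as compact output. A standalone illustration, not mindspore code:

    import json

    data = {"cube": 1.5, "vector": 0.3}
    print(json.dumps(data, indent=None))  # single-line, compact output
    print(json.dumps(data, indent=1))     # pretty-printed, one space per nesting level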
mindspore/profiler/parser/ascend_fpbp_generator.py
@@ -26,10 +26,16 @@ from mindspore.profiler.common.exceptions.exceptions import ProfilerIOException
  class AscendFPBPGenerator:
      """Generate ascend fp bp data from DataFrame."""

-     def __init__(self, op_summary, steptrace):
+     def __init__(self, op_summary, steptrace, pretty=False):
          self.op_summary = op_summary
          self.steptrace = steptrace
          self.points = None
+         self.pretty = pretty
+
+     @property
+     def indent(self):
+         indent = 1 if self.pretty else None
+         return indent

      def parse(self):
          """Analyse the op_summary and steptrace data generate fpbp data."""
@@ -68,7 +74,7 @@ class AscendFPBPGenerator:
              with os.fdopen(os.open(step_trace_point_info_path,
                                     os.O_WRONLY | os.O_CREAT | os.O_TRUNC, stat.S_IWUSR | stat.S_IRUSR),
                             'w') as json_file:
-                 json.dump(self.points, json_file)
+                 json.dump(self.points, json_file, indent=self.indent)
          except (IOError, OSError) as err:
              logging.critical('Errot occurred when write step trace point info file: %s', err)
              raise ProfilerIOException() from err
mindspore/profiler/parser/ascend_hccl_generator.py
@@ -208,8 +208,8 @@ class AscendHCCLGenerator:
          name = row.get('name')
          pid = row.get('pid')
          tid = row.get('tid')
-         ts = row.get('ts')
-         dur = row.get('dur')
+         ts = float(row.get('ts'))
+         dur = float(row.get('dur'))
          te = ts + dur
          ph = row.get('ph')
          task_type = row.get('args', {}).get('task type', '')
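
The new `float(...)` casts matter because these rows come from a parsed JSON trace in which `ts` and `dur` may arrive as strings, so `te = ts + dur` would concatenate instead of add. A toy illustration with a made-up row:

    row = {"ts": "1234.5", "dur": "10.0"}              # hypothetical event with string fields
    te = float(row.get("ts")) + float(row.get("dur"))  # 1244.5
    # Without the casts: "1234.5" + "10.0" == "1234.510.0"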
mindspore/profiler/parser/ascend_msprof_exporter.py
@@ -60,14 +60,20 @@ class AscendMsprofExporter:

      def get_drv_version(self):
          """Get the drv_version for choosing the export mode."""
+         script_path = self._get_msprof_info_path()
+         if not script_path:
+             logger.warning("Can`t find get_msprof_info.py path, use single-export mode instead.")
+             return False
+
+         logger.info("get_msprof_info.py path is : %s", script_path)
          host_dir = os.path.join(self.prof_root_dir, 'host')
          cmd = ['python',
-                '/usr/local/Ascend/latest/tools/profiler/profiler_tool/analysis/interface/get_msprof_info.py',
+                script_path,
                 '-dir', host_dir]
          try:
              outs, _ = self._run_cmd(cmd)
              if not outs:
-                 logger.warning('Check the drvVersion can`t find the result, use single export mode instead.')
+                 logger.warning('Check the drvVersion can`t find the result, use single-export mode instead.')
                  return False
              result = json.loads(outs)
              logger.info('get drv_version result is : %s', result)
@@ -104,6 +110,10 @@ class AscendMsprofExporter:
          self._run_cmd(msprof_export_cmd)
          self._check_export_files(self.source_path)

+         msprof_analyze_cmd = [self._msprof_cmd, "--analyze=on", "--rule=communication,communication_matrix",
+                               "--output={}".format(self.prof_root_dir)]
+         self._run_cmd(msprof_analyze_cmd)
+
          return flag

      def _run_cmd(self, cmd):
@@ -176,6 +186,18 @@ class AscendMsprofExporter:

          logger.info("The msprof command has been added to the path!")

+     def _get_msprof_info_path(self):
+         """Check the existence of get_msprof_info.py script"""
+         outs, _ = self._run_cmd(['which', self._msprof_cmd])
+         if not outs:
+             return ""
+         msprof_path = os.path.realpath(outs.strip())
+         sup_path = msprof_path.split('tools')[0]
+         script_path = os.path.join(sup_path, 'tools/profiler/profiler_tool/analysis/interface/get_msprof_info.py')
+         if not os.path.exists(script_path):
+             return ""
+         return script_path
+
      def _generate_step_trace(self, prof_path, device_path):
          """"generate model_id iteration_id dict"""

@@ -228,9 +250,10 @@ class AscendMsprofExporter:
              op_statistic.add(summary_file)

          if not op_summary:
-             raise RuntimeError("The op_summary file was not found, perhaps the original data was not collected.")
+             logger.warning("The op_summary file was not found, perhaps the original data was not collected.")
+             return
          if not op_statistic:
-             raise RuntimeError("The op_statistics file was not found, perhaps the original data was not collected.")
+             logger.warning("The op_statistics file was not found, perhaps the original data was not collected.")

          logger.info("Finish checking files.")

@@ -250,7 +273,8 @@ class AscendMsprofExporter:
              op_statistic.add(summary_file)

          if not op_summary:
-             raise RuntimeError("The op_summary file was not found, perhaps the original data was not collected.")
+             logger.warning("The op_summary file was not found, perhaps the original data was not collected.")
+             return
          if not op_statistic:
-             raise RuntimeError("The op_statistics file was not found, perhaps the original data was not collected.")
+             logger.warning("The op_statistics file was not found, perhaps the original data was not collected.")
          logger.info("Finish checking files.")
mindspore/profiler/parser/ascend_msprof_generator.py
@@ -18,6 +18,7 @@ import fnmatch
  import os

  import numpy as np
+ from mindspore import log as logger


  class AscendMsprofDataGeneratorOld:
@@ -128,13 +129,22 @@ class AscendMsprofDataGeneratorOld:
          self.op_summary_name = self.op_summary_basis_name
          self.op_summary_name['Iteration ID'] = {'index': -1, 'dtype': ('Iteration ID', object)}
          for row in reader:
-             row = [row[index.get('index')] for index in self.op_summary_name.values()]
-             row[self.op_summary_name['Iteration ID']['index']] = iteration
-             row = ['0' if i == 'N/A' else i for i in row]
-             op_summary.append(tuple(row))
+             try:
+                 row = [row[index.get('index')] for index in self.op_summary_name.values()]
+                 row[self.op_summary_name['Iteration ID']['index']] = iteration
+                 row = ['0' if i == 'N/A' else i for i in row]
+                 row += ['0.000']  # Add one column for Task Start Time(us)
+                 op_summary.append(tuple(row))
+             except IndexError:
+                 logger.warning(f"Fail to read{file}. Will ignore this file and continue reading")

      op_summary_dt = np.dtype([value['dtype'] for value in self.op_summary_name.values()])

+     for i in range(0, len(op_summary)):
+         if len(op_summary[i]) < len(op_summary_dt):
+             new_raw = [j for j in op_summary[i]]
+             new_raw.extend([0 for _ in range(len(op_summary_dt) - len(op_summary[i]))])
+             op_summary[i] = tuple(new_raw)
      self.op_summary = np.array(op_summary, dtype=op_summary_dt)
      self.op_summary['Task Start Time'] = self.op_summary['Task Start Time'] * 1e-3
      self.op_summary['Task Duration'] = self.op_summary['Task Duration'] * 1e-3
@@ -348,7 +358,8 @@ class AscendMsprofDataGenerator:
                  new_row = tuple(['0' if d == 'N/A' else d for d in new_row])
                  op_statistic.append(new_row)
                  break
-
+         if not op_statistic:
+             return
          op_statistic_dt = np.dtype(self.op_statistic_type)
          self.op_statistic = np.array(op_statistic, dtype=op_statistic_dt)
          self.op_statistic['Total Time'] *= 1e-3
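
The padding loop added above protects the `np.array(..., dtype=op_summary_dt)` call from rows that carry fewer fields than the structured dtype declares, which would otherwise raise. A reduced example with a toy dtype (not the real op_summary schema):

    import numpy as np

    dt = np.dtype([("name", object), ("start", float), ("dur", float)])
    rows = [("op1", 1.0, 2.0), ("op2", 3.0)]  # second row is one field short
    rows = [tuple(list(r) + [0] * (len(dt) - len(r))) for r in rows]
    arr = np.array(rows, dtype=dt)            # succeeds once every tuple matches the dtype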
mindspore/profiler/parser/ascend_op_generator.py
@@ -35,6 +35,7 @@ class AscendOPGenerator:
          self.aicpu_detail = None
          self.framework_raw = None
          self.output_timeline_data = None
+         self.has_statistic_file = True

          self.op_detail_dt = np.dtype(
              [('full_op_name', object), ('task_duration', float), ('execution_frequency', int), ('task_type', object)])
@@ -61,12 +62,16 @@ class AscendOPGenerator:

          # aicore intermediation type
          self.op_type = self._parse_op_type(self.op_statistic)
+         if isinstance(self.op_type, np.ndarray) and not self.op_type.size or not isinstance(self.op_type, np.ndarray) \
+                 and not self.op_type:
+             self.has_statistic_file = False

          # aicpu_intermediation
          self.aicpu_detail = self._parse_aicpu_detail(self.op_summary)

          # framwork_raw
-         self.framework_raw = self._parse_framework_raw(self.op_summary)
+         if self.has_statistic_file:
+             self.framework_raw = self._parse_framework_raw(self.op_summary)

          self.output_timeline_data = self.op_summary[self.op_summary['Task Type'] == 'AI_CORE'][
              ['Op Name', 'Stream ID', 'Task Start Time', 'Task Duration']]
@@ -84,7 +89,7 @@ class AscendOPGenerator:
              output_timeline_data_path : output_timeline_data.txt path
          """
          # aicore intermediation detail
-         if self.op_detail.shape[0] != 0:
+         if isinstance(self.op_detail, np.ndarray) and self.op_detail.size and self.op_detail.shape[0] != 0:
              try:
                  with os.fdopen(os.open(aicore_intermediate_detail_path,
                                         os.O_WRONLY | os.O_CREAT | os.O_TRUNC, stat.S_IWUSR | stat.S_IRUSR),
@@ -99,7 +104,7 @@ class AscendOPGenerator:
              os.chmod(aicore_intermediate_detail_path, stat.S_IREAD | stat.S_IWRITE)

          # aicore intermediation type
-         if self.op_type.shape[0] != 0:
+         if isinstance(self.op_type, np.ndarray) and self.op_type.size and self.op_type.shape[0] != 0:
              try:
                  with os.fdopen(os.open(aicore_intermediate_type_path,
                                         os.O_WRONLY | os.O_CREAT | os.O_TRUNC, stat.S_IWUSR | stat.S_IRUSR),
@@ -114,7 +119,7 @@ class AscendOPGenerator:
              os.chmod(aicore_intermediate_type_path, stat.S_IREAD | stat.S_IWRITE)

          # aicpu_intermediation
-         if self.aicpu_detail.shape[0] != 0:
+         if isinstance(self.aicpu_detail, np.ndarray) and self.aicpu_detail.size and self.aicpu_detail.shape[0] != 0:
              try:
                  with os.fdopen(os.open(aicpu_intermediate_detail_path,
                                         os.O_WRONLY | os.O_CREAT | os.O_TRUNC, stat.S_IWUSR | stat.S_IRUSR),
@@ -129,7 +134,7 @@ class AscendOPGenerator:
              os.chmod(aicpu_intermediate_detail_path, stat.S_IREAD | stat.S_IWRITE)

          # framwork_raw
-         if self.framework_raw.shape[0] != 0:
+         if isinstance(self.framework_raw, np.ndarray) and self.framework_raw.size and self.framework_raw.shape[0] != 0:
              try:
                  with os.fdopen(os.open(framework_raw_path,
                                         os.O_WRONLY | os.O_CREAT | os.O_TRUNC, stat.S_IWUSR | stat.S_IRUSR),
@@ -144,7 +149,8 @@ class AscendOPGenerator:
              os.chmod(framework_raw_path, stat.S_IREAD | stat.S_IWRITE)

          # output_timeline_data
-         if self.output_timeline_data.shape[0] != 0 and output_timeline_data_path:
+         if isinstance(self.output_timeline_data, np.ndarray) and self.output_timeline_data.size and \
+                 self.output_timeline_data.shape[0] != 0 and output_timeline_data_path:
              try:
                  with os.fdopen(os.open(output_timeline_data_path,
                                         os.O_WRONLY | os.O_CREAT | os.O_TRUNC, stat.S_IWUSR | stat.S_IRUSR),
@@ -186,7 +192,9 @@ class AscendOPGenerator:
          Args:
              op_statistic(DataFrame): op statistic data.
          """
-
+         if isinstance(op_statistic, np.ndarray) and not op_statistic.size or not isinstance(op_statistic, np.ndarray) \
+                 and not op_statistic:
+             return None
          groups, _, inverse, _ = np.unique(op_statistic['Op Type'], return_index=True, return_inverse=True,
                                            return_counts=True)

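Since `_parse_op_type` can now return `None`, each `.shape[0]` test in the write path is widened to the same three-part guard. A hypothetical helper (not part of the diff) that captures the idiom:

    import numpy as np

    def has_rows(data):
        """True only for a numpy array with at least one row."""
        return isinstance(data, np.ndarray) and data.size > 0 and data.shape[0] != 0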
mindspore/profiler/parser/ascend_timeline_generator.py
@@ -49,17 +49,20 @@ class AscendTimelineGenerator(BaseTimelineGenerator):
          [('Op Name', object), ('Stream ID', int), ('Task Start Time', float), ('Task Duration', float),
           ('pid', int)])

-     def init_timeline(self, op_summary, steptrace):
+     def init_timeline(self, op_summary, steptrace, pretty=False):
          """
          Init timeline metadata, adding all collected info.

          Args:
              op_summary: op data
              steptrace: step data
+             pretty: whether to format json file
          """

          logger.info('Initiating timeline...')
-
+         self._pretty = pretty
+         if op_summary.size < 1:
+             return
          timeline_list = op_summary[~np.isin(op_summary['Task Type'], ['AI_CPU', 'HCCL'])][
              ['Op Name', 'Stream ID', 'Task Start Time', 'Task Duration']]
mindspore/profiler/parser/base_timeline_generator.py
@@ -102,6 +102,12 @@ class BaseTimelineGenerator:
          self._model = model
          self._step_start_op_name = ""
          self._step_end_op_name = ""
+         self._pretty = False
+
+     @property
+     def indent(self):
+         indent = 1 if self._pretty else None
+         return indent

      @staticmethod
      def get_parallel_context():
@@ -201,7 +207,9 @@ class BaseTimelineGenerator:
          with os.fdopen(os.open(display_file_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600), 'w') as json_file:
              json_file.write('[')
              for _, item in enumerate(self._timeline_meta):
-                 json.dump(item, json_file)
+                 item_json = json.dumps([item], indent=self.indent)
+                 item_json = item_json.lstrip('[').rstrip('\n]')
+                 json_file.write(item_json)
                  if "scope_level" in item.keys():
                      self._max_scope_name_num = max(
                          self._max_scope_name_num, item["scope_level"] + 1)
@@ -209,7 +217,7 @@ class BaseTimelineGenerator:
                  json_file.write(',')
                  if file_size > size_limit:
                      break
-             label_name_json = json.dumps(self.get_thread_label_name())
+             label_name_json = json.dumps(self.get_thread_label_name(), indent=self.indent)
              label_name_json = label_name_json.lstrip('[')
              json_file.write(label_name_json)
          os.chmod(display_file_path, stat.S_IREAD | stat.S_IWRITE)
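
The `json.dumps([item], ...)` change above is a streaming trick: each element is rendered as a one-element JSON array, the enclosing brackets are stripped, and the fragments are comma-joined inside a hand-written `[`/`]`, so a large timeline can be written item by item while still honoring `indent`. A distilled sketch:

    import io
    import json

    items = [{"name": "op1", "ts": 0}, {"name": "op2", "ts": 5}]
    buf = io.StringIO()
    buf.write('[')
    for i, item in enumerate(items):
        fragment = json.dumps([item], indent=1)        # render as a one-element array
        buf.write(fragment.lstrip('[').rstrip('\n]'))  # keep only the element's text
        if i < len(items) - 1:
            buf.write(',')
    buf.write(']')
    json.loads(buf.getvalue())  # parses back as a valid JSON array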
@@ -230,7 +238,7 @@ class BaseTimelineGenerator:
          try:
              with os.fdopen(os.open(timeline_summary_file_path,
                                     os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600), 'w') as json_file:
-                 json.dump(self._timeline_summary, json_file)
+                 json.dump(self._timeline_summary, json_file, indent=self.indent)
          except (IOError, OSError) as err:
              logger.critical('Error occurred when write timeline summary file: %s', err)
              raise ProfilerIOException() from err
mindspore/profiler/parser/cpu_gpu_timeline_generator.py
@@ -542,8 +542,9 @@ class CpuTimelineGenerator(GpuTimelineGenerator):

          return timeline_list

-     def init_timeline(self):
+     def init_timeline(self, pretty=False):
          """Init timeline metadata, adding all collected info."""
+         self._pretty = pretty
          timeline_list = self._load_timeline_data()

          # Init a dict for counting the num of streams.