mindspore 2.3.0-cp39-cp39-win_amd64.whl → 2.4.0-cp39-cp39-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of mindspore might be problematic.

Files changed (285)
  1. mindspore/.commit_id +1 -1
  2. mindspore/__init__.py +3 -1
  3. mindspore/_c_dataengine.cp39-win_amd64.pyd +0 -0
  4. mindspore/_c_expression.cp39-win_amd64.pyd +0 -0
  5. mindspore/_c_mindrecord.cp39-win_amd64.pyd +0 -0
  6. mindspore/_checkparam.py +50 -9
  7. mindspore/_extends/parse/compile_config.py +41 -0
  8. mindspore/_extends/parse/parser.py +9 -7
  9. mindspore/_extends/parse/standard_method.py +52 -14
  10. mindspore/_extends/pijit/pijit_func_white_list.py +350 -24
  11. mindspore/amp.py +24 -10
  12. mindspore/avcodec-59.dll +0 -0
  13. mindspore/avdevice-59.dll +0 -0
  14. mindspore/avfilter-8.dll +0 -0
  15. mindspore/avformat-59.dll +0 -0
  16. mindspore/avutil-57.dll +0 -0
  17. mindspore/common/__init__.py +6 -4
  18. mindspore/common/_pijit_context.py +190 -0
  19. mindspore/common/_register_for_tensor.py +2 -1
  20. mindspore/common/_tensor_overload.py +139 -0
  21. mindspore/common/api.py +102 -87
  22. mindspore/common/dump.py +5 -6
  23. mindspore/common/generator.py +1 -7
  24. mindspore/common/hook_handle.py +14 -26
  25. mindspore/common/mindir_util.py +2 -2
  26. mindspore/common/parameter.py +46 -13
  27. mindspore/common/recompute.py +39 -9
  28. mindspore/common/sparse_tensor.py +7 -3
  29. mindspore/common/tensor.py +209 -29
  30. mindspore/communication/__init__.py +1 -1
  31. mindspore/communication/_comm_helper.py +38 -3
  32. mindspore/communication/comm_func.py +310 -55
  33. mindspore/communication/management.py +14 -14
  34. mindspore/context.py +123 -22
  35. mindspore/dataset/__init__.py +1 -1
  36. mindspore/dataset/audio/__init__.py +1 -1
  37. mindspore/dataset/core/config.py +7 -0
  38. mindspore/dataset/core/validator_helpers.py +7 -0
  39. mindspore/dataset/engine/cache_client.py +1 -1
  40. mindspore/dataset/engine/datasets.py +72 -44
  41. mindspore/dataset/engine/datasets_audio.py +7 -7
  42. mindspore/dataset/engine/datasets_standard_format.py +53 -3
  43. mindspore/dataset/engine/datasets_text.py +20 -20
  44. mindspore/dataset/engine/datasets_user_defined.py +174 -104
  45. mindspore/dataset/engine/datasets_vision.py +33 -33
  46. mindspore/dataset/engine/iterators.py +29 -0
  47. mindspore/dataset/engine/obs/util.py +7 -0
  48. mindspore/dataset/engine/queue.py +114 -60
  49. mindspore/dataset/engine/serializer_deserializer.py +2 -2
  50. mindspore/dataset/engine/validators.py +34 -14
  51. mindspore/dataset/text/__init__.py +1 -4
  52. mindspore/dataset/transforms/__init__.py +0 -3
  53. mindspore/dataset/utils/line_reader.py +2 -0
  54. mindspore/dataset/vision/__init__.py +1 -4
  55. mindspore/dataset/vision/utils.py +1 -1
  56. mindspore/dataset/vision/validators.py +2 -1
  57. mindspore/dnnl.dll +0 -0
  58. mindspore/{nn/extend → experimental/es}/__init__.py +4 -11
  59. mindspore/experimental/es/embedding_service.py +883 -0
  60. mindspore/{nn/layer → experimental/es}/embedding_service_layer.py +218 -30
  61. mindspore/experimental/llm_boost/__init__.py +21 -0
  62. mindspore/{nn/extend/layer → experimental/llm_boost/atb}/__init__.py +4 -8
  63. mindspore/experimental/llm_boost/atb/boost_base.py +211 -0
  64. mindspore/experimental/llm_boost/atb/llama_boost.py +115 -0
  65. mindspore/experimental/llm_boost/atb/qwen_boost.py +101 -0
  66. mindspore/experimental/llm_boost/register.py +129 -0
  67. mindspore/experimental/llm_boost/utils.py +31 -0
  68. mindspore/experimental/optim/adamw.py +85 -0
  69. mindspore/experimental/optim/optimizer.py +3 -0
  70. mindspore/hal/__init__.py +3 -3
  71. mindspore/hal/contiguous_tensors_handle.py +175 -0
  72. mindspore/hal/stream.py +18 -0
  73. mindspore/include/api/model_group.h +13 -1
  74. mindspore/include/api/types.h +10 -10
  75. mindspore/include/dataset/config.h +2 -2
  76. mindspore/include/dataset/constants.h +2 -2
  77. mindspore/include/dataset/execute.h +2 -2
  78. mindspore/include/dataset/vision.h +4 -0
  79. mindspore/jpeg62.dll +0 -0
  80. mindspore/log.py +1 -1
  81. mindspore/mindrecord/filewriter.py +68 -51
  82. mindspore/mindspore_backend.dll +0 -0
  83. mindspore/mindspore_common.dll +0 -0
  84. mindspore/mindspore_core.dll +0 -0
  85. mindspore/mindspore_glog.dll +0 -0
  86. mindspore/mindspore_np_dtype.dll +0 -0
  87. mindspore/mindspore_ops.dll +0 -0
  88. mindspore/mint/__init__.py +495 -46
  89. mindspore/mint/distributed/__init__.py +31 -0
  90. mindspore/mint/distributed/distributed.py +254 -0
  91. mindspore/mint/nn/__init__.py +266 -21
  92. mindspore/mint/nn/functional.py +125 -19
  93. mindspore/mint/nn/layer/__init__.py +39 -0
  94. mindspore/mint/nn/layer/activation.py +133 -0
  95. mindspore/mint/nn/layer/normalization.py +477 -0
  96. mindspore/mint/nn/layer/pooling.py +110 -0
  97. mindspore/mint/optim/adamw.py +28 -7
  98. mindspore/mint/special/__init__.py +63 -0
  99. mindspore/multiprocessing/__init__.py +2 -1
  100. mindspore/nn/__init__.py +0 -1
  101. mindspore/nn/cell.py +275 -93
  102. mindspore/nn/layer/activation.py +211 -44
  103. mindspore/nn/layer/basic.py +113 -3
  104. mindspore/nn/layer/embedding.py +120 -2
  105. mindspore/nn/layer/normalization.py +101 -5
  106. mindspore/nn/layer/padding.py +34 -48
  107. mindspore/nn/layer/pooling.py +161 -7
  108. mindspore/nn/layer/transformer.py +3 -3
  109. mindspore/nn/loss/__init__.py +2 -2
  110. mindspore/nn/loss/loss.py +84 -6
  111. mindspore/nn/optim/__init__.py +2 -1
  112. mindspore/nn/optim/adadelta.py +1 -1
  113. mindspore/nn/optim/adam.py +1 -1
  114. mindspore/nn/optim/lamb.py +1 -1
  115. mindspore/nn/optim/tft_wrapper.py +127 -0
  116. mindspore/nn/wrap/cell_wrapper.py +12 -23
  117. mindspore/nn/wrap/grad_reducer.py +5 -5
  118. mindspore/nn/wrap/loss_scale.py +17 -3
  119. mindspore/numpy/__init__.py +1 -1
  120. mindspore/numpy/array_creations.py +65 -68
  121. mindspore/numpy/array_ops.py +64 -60
  122. mindspore/numpy/fft.py +610 -75
  123. mindspore/numpy/logic_ops.py +11 -10
  124. mindspore/numpy/math_ops.py +85 -84
  125. mindspore/numpy/utils_const.py +4 -4
  126. mindspore/opencv_core452.dll +0 -0
  127. mindspore/opencv_imgcodecs452.dll +0 -0
  128. mindspore/opencv_imgproc452.dll +0 -0
  129. mindspore/ops/__init__.py +6 -4
  130. mindspore/ops/_grad_experimental/grad_comm_ops.py +47 -3
  131. mindspore/ops/_grad_experimental/grad_math_ops.py +0 -22
  132. mindspore/ops/_vmap/vmap_array_ops.py +2 -4
  133. mindspore/ops/_vmap/vmap_math_ops.py +17 -1
  134. mindspore/ops/_vmap/vmap_nn_ops.py +43 -2
  135. mindspore/ops/auto_generate/cpp_create_prim_instance_helper.py +85 -7
  136. mindspore/ops/auto_generate/gen_arg_dtype_cast.py +2 -0
  137. mindspore/ops/auto_generate/gen_extend_func.py +734 -13
  138. mindspore/ops/auto_generate/gen_ops_def.py +2420 -381
  139. mindspore/ops/auto_generate/gen_ops_prim.py +5196 -1659
  140. mindspore/ops/auto_generate/pyboost_inner_prim.py +176 -56
  141. mindspore/ops/composite/base.py +85 -48
  142. mindspore/ops/composite/multitype_ops/_compile_utils.py +1 -0
  143. mindspore/ops/composite/multitype_ops/not_in_impl.py +2 -2
  144. mindspore/ops/function/__init__.py +22 -0
  145. mindspore/ops/function/array_func.py +490 -153
  146. mindspore/ops/function/debug_func.py +113 -1
  147. mindspore/ops/function/fft_func.py +15 -2
  148. mindspore/ops/function/grad/grad_func.py +3 -2
  149. mindspore/ops/function/math_func.py +558 -207
  150. mindspore/ops/function/nn_func.py +817 -383
  151. mindspore/ops/function/other_func.py +3 -2
  152. mindspore/ops/function/random_func.py +184 -8
  153. mindspore/ops/function/reshard_func.py +13 -11
  154. mindspore/ops/function/sparse_unary_func.py +1 -1
  155. mindspore/ops/function/vmap_func.py +3 -2
  156. mindspore/ops/functional.py +24 -14
  157. mindspore/ops/op_info_register.py +3 -3
  158. mindspore/ops/operations/__init__.py +6 -1
  159. mindspore/ops/operations/_grad_ops.py +2 -76
  160. mindspore/ops/operations/_infer_ops.py +1 -1
  161. mindspore/ops/operations/_inner_ops.py +71 -94
  162. mindspore/ops/operations/array_ops.py +12 -146
  163. mindspore/ops/operations/comm_ops.py +42 -53
  164. mindspore/ops/operations/custom_ops.py +83 -19
  165. mindspore/ops/operations/debug_ops.py +42 -10
  166. mindspore/ops/operations/manually_defined/_inner.py +12 -0
  167. mindspore/ops/operations/manually_defined/ops_def.py +265 -10
  168. mindspore/ops/operations/math_ops.py +12 -223
  169. mindspore/ops/operations/nn_ops.py +20 -114
  170. mindspore/ops/operations/other_ops.py +7 -4
  171. mindspore/ops/operations/random_ops.py +46 -1
  172. mindspore/ops/primitive.py +18 -6
  173. mindspore/ops_generate/arg_dtype_cast.py +2 -0
  174. mindspore/ops_generate/gen_aclnn_implement.py +11 -11
  175. mindspore/ops_generate/gen_constants.py +36 -0
  176. mindspore/ops_generate/gen_ops.py +67 -52
  177. mindspore/ops_generate/gen_ops_inner_prim.py +1 -1
  178. mindspore/ops_generate/gen_pyboost_func.py +131 -47
  179. mindspore/ops_generate/op_proto.py +10 -3
  180. mindspore/ops_generate/pyboost_utils.py +14 -1
  181. mindspore/ops_generate/template.py +43 -21
  182. mindspore/parallel/__init__.py +3 -1
  183. mindspore/parallel/_auto_parallel_context.py +28 -8
  184. mindspore/parallel/_cell_wrapper.py +83 -0
  185. mindspore/parallel/_parallel_serialization.py +47 -19
  186. mindspore/parallel/_tensor.py +81 -11
  187. mindspore/parallel/_utils.py +13 -1
  188. mindspore/parallel/algo_parameter_config.py +5 -5
  189. mindspore/parallel/checkpoint_transform.py +46 -39
  190. mindspore/parallel/cluster/process_entity/__init__.py +1 -1
  191. mindspore/parallel/cluster/process_entity/_api.py +31 -23
  192. mindspore/parallel/cluster/process_entity/_utils.py +2 -27
  193. mindspore/parallel/parameter_broadcast.py +3 -4
  194. mindspore/parallel/shard.py +162 -31
  195. mindspore/parallel/transform_safetensors.py +993 -0
  196. mindspore/profiler/__init__.py +2 -1
  197. mindspore/profiler/common/constant.py +29 -0
  198. mindspore/profiler/common/registry.py +47 -0
  199. mindspore/profiler/common/util.py +28 -0
  200. mindspore/profiler/dynamic_profiler.py +694 -0
  201. mindspore/profiler/envprofiling.py +17 -19
  202. mindspore/profiler/parser/ascend_analysis/constant.py +18 -0
  203. mindspore/profiler/parser/ascend_analysis/file_manager.py +25 -4
  204. mindspore/profiler/parser/ascend_analysis/function_event.py +43 -19
  205. mindspore/profiler/parser/ascend_analysis/fwk_cann_parser.py +31 -26
  206. mindspore/profiler/parser/ascend_analysis/fwk_file_parser.py +56 -10
  207. mindspore/profiler/parser/ascend_analysis/msprof_timeline_parser.py +55 -8
  208. mindspore/profiler/parser/ascend_analysis/path_manager.py +313 -0
  209. mindspore/profiler/parser/ascend_analysis/profiler_info_parser.py +27 -20
  210. mindspore/profiler/parser/ascend_analysis/trace_event_manager.py +9 -2
  211. mindspore/profiler/parser/ascend_msprof_exporter.py +5 -4
  212. mindspore/profiler/parser/ascend_timeline_generator.py +27 -25
  213. mindspore/profiler/parser/base_timeline_generator.py +19 -25
  214. mindspore/profiler/parser/cpu_gpu_timeline_generator.py +25 -12
  215. mindspore/profiler/parser/framework_parser.py +1 -391
  216. mindspore/profiler/parser/gpu_analysis/__init__.py +14 -0
  217. mindspore/profiler/parser/gpu_analysis/function_event.py +44 -0
  218. mindspore/profiler/parser/gpu_analysis/fwk_file_parser.py +89 -0
  219. mindspore/profiler/parser/gpu_analysis/profiler_info_parser.py +72 -0
  220. mindspore/profiler/parser/memory_usage_parser.py +0 -154
  221. mindspore/profiler/parser/profiler_info.py +78 -6
  222. mindspore/profiler/profiler.py +153 -0
  223. mindspore/profiler/profiling.py +280 -412
  224. mindspore/rewrite/__init__.py +1 -2
  225. mindspore/rewrite/common/namespace.py +4 -4
  226. mindspore/rewrite/symbol_tree/symbol_tree.py +3 -3
  227. mindspore/run_check/_check_version.py +36 -103
  228. mindspore/safeguard/rewrite_obfuscation.py +591 -247
  229. mindspore/swresample-4.dll +0 -0
  230. mindspore/swscale-6.dll +0 -0
  231. mindspore/tinyxml2.dll +0 -0
  232. mindspore/train/__init__.py +4 -3
  233. mindspore/train/_utils.py +28 -2
  234. mindspore/train/amp.py +171 -53
  235. mindspore/train/callback/__init__.py +2 -2
  236. mindspore/train/callback/_callback.py +4 -4
  237. mindspore/train/callback/_checkpoint.py +85 -22
  238. mindspore/train/callback/_cluster_monitor.py +1 -1
  239. mindspore/train/callback/_flops_collector.py +1 -0
  240. mindspore/train/callback/_loss_monitor.py +3 -3
  241. mindspore/train/callback/_on_request_exit.py +134 -31
  242. mindspore/train/callback/_summary_collector.py +5 -5
  243. mindspore/train/callback/_tft_register.py +352 -0
  244. mindspore/train/dataset_helper.py +7 -3
  245. mindspore/train/metrics/metric.py +3 -3
  246. mindspore/train/metrics/roc.py +4 -4
  247. mindspore/train/mind_ir_pb2.py +44 -39
  248. mindspore/train/model.py +134 -58
  249. mindspore/train/serialization.py +336 -112
  250. mindspore/turbojpeg.dll +0 -0
  251. mindspore/utils/__init__.py +21 -0
  252. mindspore/utils/utils.py +60 -0
  253. mindspore/version.py +1 -1
  254. {mindspore-2.3.0.dist-info → mindspore-2.4.0.dist-info}/METADATA +6 -2
  255. {mindspore-2.3.0.dist-info → mindspore-2.4.0.dist-info}/RECORD +258 -252
  256. mindspore/include/c_api/ms/abstract.h +0 -67
  257. mindspore/include/c_api/ms/attribute.h +0 -197
  258. mindspore/include/c_api/ms/base/handle_types.h +0 -43
  259. mindspore/include/c_api/ms/base/macros.h +0 -32
  260. mindspore/include/c_api/ms/base/status.h +0 -33
  261. mindspore/include/c_api/ms/base/types.h +0 -283
  262. mindspore/include/c_api/ms/context.h +0 -102
  263. mindspore/include/c_api/ms/graph.h +0 -160
  264. mindspore/include/c_api/ms/node.h +0 -606
  265. mindspore/include/c_api/ms/tensor.h +0 -161
  266. mindspore/include/c_api/ms/value.h +0 -84
  267. mindspore/mindspore_shared_lib.dll +0 -0
  268. mindspore/nn/extend/basic.py +0 -140
  269. mindspore/nn/extend/embedding.py +0 -143
  270. mindspore/nn/extend/layer/normalization.py +0 -109
  271. mindspore/nn/extend/pooling.py +0 -117
  272. mindspore/nn/layer/embedding_service.py +0 -531
  273. mindspore/ops/_op_impl/aicpu/strided_slice_v2.py +0 -93
  274. mindspore/ops/_op_impl/aicpu/strided_slice_v2_grad.py +0 -66
  275. mindspore/ops/extend/__init__.py +0 -53
  276. mindspore/ops/extend/array_func.py +0 -218
  277. mindspore/ops/extend/math_func.py +0 -76
  278. mindspore/ops/extend/nn_func.py +0 -308
  279. mindspore/ops/silent_check.py +0 -162
  280. mindspore/profiler/parser/msadvisor_analyzer.py +0 -82
  281. mindspore/profiler/parser/msadvisor_parser.py +0 -240
  282. mindspore/train/callback/_mindio_ttp.py +0 -443
  283. {mindspore-2.3.0.dist-info → mindspore-2.4.0.dist-info}/WHEEL +0 -0
  284. {mindspore-2.3.0.dist-info → mindspore-2.4.0.dist-info}/entry_points.txt +0 -0
  285. {mindspore-2.3.0.dist-info → mindspore-2.4.0.dist-info}/top_level.txt +0 -0
@@ -18,8 +18,8 @@ from __future__ import absolute_import
 import os
 import stat
 import time
-
 import threading
+
 import mindspore.context as context
 from mindspore import log as logger
 from mindspore import nn
@@ -37,8 +37,7 @@ from mindspore.common.tensor import Tensor
 from mindspore.common.parameter import Parameter
 from mindspore.common.generator import Generator
 from mindspore.common.api import _cell_graph_executor
-from mindspore._c_expression import _collect_host_info
-
+from mindspore._c_expression import collect_host_info, get_clock_syscnt
 
 _cur_dir = os.getcwd()
 SAVE_DIR = _cur_dir
@@ -88,9 +87,9 @@ def _chg_ckpt_file_name_if_same_exist(directory, prefix, exception=False):
             if index == 0:
                 suffix_num = max(suffix_num, 1)
             elif index != -1:
-                num = filename[pre_len+1:pre_len+index]
+                num = filename[pre_len + 1:pre_len + index]
                 if num.isdigit():
-                    suffix_num = max(suffix_num, int(num)+1)
+                    suffix_num = max(suffix_num, int(num) + 1)
 
     if suffix_num != 0:
         prefix = f'{prefix}_{suffix_num}'
@@ -98,6 +97,14 @@ def _chg_ckpt_file_name_if_same_exist(directory, prefix, exception=False):
     return prefix
 
 
+def _check_format_and_other_params(format, enc_key, enc_mode, crc_check=False, async_save=False, exception_save=False,
+                                   map_param_inc=False, global_step_num=None):
+    param_not_default = (enc_key is not None or enc_mode != "AES-GCM" or crc_check or async_save
+                         or exception_save or map_param_inc or global_step_num is not None)
+    if format == "safetensors" and param_not_default:
+        raise ValueError("For 'save_checkpoint', when format is 'safetensors', other param must be default.")
+
+
 class CheckpointConfig:
     """
     The configuration of model checkpoint.
@@ -136,6 +143,10 @@ class CheckpointConfig:
         exception_save (bool): Whether to save the current checkpoint when an exception occurs. Default: ``False`` .
         crc_check (bool): Whether to perform crc32 calculation when saving checkpoint and save the calculation
             result to the end of ckpt. Default: ``False`` .
+        remove_redundancy (bool): Whether to enable saving the checkpoint with redundancy removal.
+            Redundancy removal refers to eliminating redundant data in data parallelism mode. Default: ``False`` , means
+            redundant-free saving is not enabled.
+        format (str): Format of the output file, can be "ckpt" or "safetensors". Default: "ckpt".
         kwargs (dict): Configuration options dictionary.
 
     Raises:
@@ -188,6 +199,8 @@ class CheckpointConfig:
                  enc_mode='AES-GCM',
                  exception_save=False,
                  crc_check=False,
+                 remove_redundancy=False,
+                 format="ckpt",
                  **kwargs):
 
         if save_checkpoint_steps is not None:
@@ -231,8 +244,13 @@ class CheckpointConfig:
         self._enc_key = Validator.check_isinstance('enc_key', enc_key, (type(None), bytes))
         self._enc_mode = Validator.check_isinstance('enc_mode', enc_mode, str)
         self._crc_check = Validator.check_isinstance('crc_check', crc_check, bool)
+        self._format = Validator.check_isinstance('format', format, str)
         self._map_param_inc = kwargs.get('incremental', False)
         self.enable_redundance = kwargs.get('enable_redundance', False)
+        self.remove_redundancy = Validator.check_isinstance('remove_redundancy', remove_redundancy, bool)
+
+        _check_format_and_other_params(format, enc_key, enc_mode, crc_check, async_save, exception_save,
+                                       self._map_param_inc)
 
     @property
     def save_checkpoint_steps(self):
@@ -333,6 +351,10 @@ class CheckpointConfig:
         """
        return self._crc_check
 
+    @property
+    def format(self):
+        return self._format
+
     @property
     def append_dict(self):
         """
@@ -495,10 +517,10 @@ class ModelCheckpoint(Callback):
         self._aiturbo_init_flag = os.getenv("AITURBO") == "1"
         # get existing checkpoint files
         if self._aiturbo_init_flag:
-            import aiturbo
-            self._manager = aiturbo.CheckpointShmManager()
+            from aiturbo.checkpoint.aiturbo_mindspore_ckpt import CheckpointShmManager
+            self._manager = CheckpointShmManager()
         else:
-            self._manager = CheckpointManager()
+            self._manager = CheckpointManager(self._config.format)
         if not callable(directory) and not callable(prefix):
             self._prefix = _chg_ckpt_file_name_if_same_exist(self._directory, self._prefix)
         self._append_dict = self._config.append_dict or {}
@@ -517,7 +539,7 @@ class ModelCheckpoint(Callback):
         """
         cb_params = run_context.original_args()
         if self._aiturbo_init_flag:
-            import aiturbo
+            from aiturbo.checkpoint import aiturbo_mindspore as aiturbo
             ckpt_storage_path = self._directory
             rank_id = get_rank()
             stage_num = _get_auto_parallel_context("pipeline_stages")
@@ -536,7 +558,7 @@ class ModelCheckpoint(Callback):
                       "stage_layout": param_redundancy_dict}
             single_params = remove_param_redundancy(param_redundancy_dict)
             single_params = {device_id: list(params) for device_id, params in single_params.items()}
-            aiturbo.init(ckpt_storage_path, rank_id, layout, single_params, self._config.enable_redundance, dp)
+            aiturbo.init(ckpt_storage_path, rank_id, layout, single_params, not self._config.enable_redundance, dp)
             self._aiturbo_init_flag = False
         if self._prefix_func:
             self._prefix = self._prefix_func(cb_params)
@@ -546,7 +568,7 @@ class ModelCheckpoint(Callback):
                                  "string that does not contain '/', but got {}.".format(self._prefix))
         if self._directory_func:
             self._directory = self._directory_func(cb_params)
-        _collect_host_info("Callback", "ModelCheckpoint", "step_end", level=1)
+        collect_host_info("Callback", "ModelCheckpoint", "step_end", start_time=get_clock_syscnt(), level=1)
         # In disaster recovery scenario, the training process may be rolled back to the last step where
         # the ckpt was successfully saved, so the _last_triggered_step should be updated.
         if _get_recovery_context("enable_recovery") and cb_params.last_save_ckpt_step is not None:
@@ -575,7 +597,7 @@ class ModelCheckpoint(Callback):
             run_context (RunContext): Context of the train running.
         """
         cb_params = run_context.original_args()
-        _collect_host_info("Callback", "ModelCheckpoint", "end", level=1)
+        collect_host_info("Callback", "ModelCheckpoint", "end", start_time=get_clock_syscnt(), level=1)
         _to_save_last_ckpt = True
 
         self._save_ckpt(cb_params, _to_save_last_ckpt)
@@ -601,6 +623,13 @@
 
         return False
 
+    def _append_dict_content(self, epoch_num, step_num):
+        """Append append_dict content."""
+        if "epoch_num" in self._append_dict:
+            self._append_dict["epoch_num"] = self._append_epoch_num + epoch_num
+        if "step_num" in self._append_dict:
+            self._append_dict["step_num"] = self._append_step_num + step_num
+
     def _save_ckpt(self, cb_params, force_to_save=False):
         """Save checkpoint files."""
         if cb_params.cur_step_num == self._last_triggered_step:
@@ -615,10 +644,10 @@
 
         if save_ckpt:
             if self._prefix_func:
-                cur_ckpoint_file = self._prefix + ".ckpt"
+                cur_ckpoint_file = self._prefix + f".{self._config.format}"
             else:
                 cur_ckpoint_file = self._prefix + "-" + str(cb_params.cur_epoch_num) + "_" \
-                    + str(step_num_in_epoch) + ".ckpt"
+                    + str(step_num_in_epoch) + f".{self._config.format}"
             # update checkpoint file list.
             self._manager.update_ckpoint_filelist(self._directory, self._prefix)
             # keep checkpoint files number equal max number.
@@ -644,20 +673,51 @@
                 set_cur_net(cb_params.train_network)
                 cb_params.train_network.add_flags(ge_sync_data=True)
                 _cell_graph_executor(cb_params.train_network, phase='save')
-            if "epoch_num" in self._append_dict:
-                self._append_dict["epoch_num"] = self._append_epoch_num + cb_params.cur_epoch_num
-            if "step_num" in self._append_dict:
-                self._append_dict["step_num"] = self._append_step_num + cb_params.cur_step_num
+            self._append_dict_content(cb_params.cur_epoch_num, cb_params.cur_step_num)
             network = self._config.saved_network if self._config.saved_network is not None else cb_params.train_network
             if os.getenv("AITURBO") == "1":
                 save_checkpoint(network, cur_file, self._config.integrated_save, self._config.async_save,
                                 self._append_dict, self._config.enc_key, self._config.enc_mode,
                                 crc_check=self._config.crc_check, incremental=self._map_param_inc,
                                 global_step_num=cb_params.cur_step_num)
+            elif self._config.remove_redundancy:
+                parallel_mode = context.get_auto_parallel_context("parallel_mode")
+                if parallel_mode == "stand_alone":
+                    raise TypeError(f"The deduplication feature for saving checkpoint can only be used "
+                                    f"in parallel scenarios, but got {parallel_mode}.")
+                param_layout = network.parameter_layout_dict
+                rank_id = get_rank()
+                if param_layout:
+                    device_num = _get_device_num()
+                    stage_num = _get_auto_parallel_context("pipeline_stages")
+                    chunk_size = device_num // stage_num
+                    initial_rank = (rank_id // chunk_size) * chunk_size
+                    param_redundancy_dict = get_parameter_redundancy(param_layout, initial_rank)
+                    single_params = remove_param_redundancy(param_redundancy_dict)
+                    save_param_names = single_params.get(rank_id)
+                    param_layout_set = set(param_layout.keys())
+                    if save_param_names == param_layout.keys():
+                        logger.warning(
+                            f"For remove_redundancy save checkpoint, the saved parameters are non-redundant.")
+
+                    def choice_func(x):
+                        return x not in param_layout_set or x in save_param_names
+                else:
+                    param_redundancy_dict = get_parameter_redundancy(network)
+                    single_params = remove_param_redundancy(param_redundancy_dict)
+                    save_param_names = single_params.get(rank_id)
+
+                    def choice_func(x):
+                        return x in save_param_names
+                save_checkpoint(network, cur_file, False, self._config.async_save,
+                                self._append_dict, self._config.enc_key, self._config.enc_mode,
+                                crc_check=self._config.crc_check, format=self._config.format,
+                                incremental=self._map_param_inc, choice_func=choice_func)
             else:
                 save_checkpoint(network, cur_file, self._config.integrated_save, self._config.async_save,
                                 self._append_dict, self._config.enc_key, self._config.enc_mode,
-                                crc_check=self._config.crc_check, incremental=self._map_param_inc)
+                                crc_check=self._config.crc_check, format=self._config.format,
+                                incremental=self._map_param_inc)
 
             self._latest_ckpt_file_name = cur_file
 
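The remove_redundancy branch above ends up calling save_checkpoint with a choice_func filter so each rank only writes the parameters it owns. A rough standalone illustration of that filtering idea (the parameter names and set contents below are invented for the example, not taken from the diff):

    # Sketch of the per-rank filtering idea behind choice_func above.
    save_param_names = {"conv1.weight", "fc.bias"}                 # params this rank should save
    param_layout_set = {"conv1.weight", "fc.weight", "fc.bias"}    # params covered by the parallel layout

    def choice_func(name):
        # keep params outside the layout (every rank saves them) or assigned to this rank
        return name not in param_layout_set or name in save_param_names

    print(choice_func("conv1.weight"))  # True  - owned by this rank
    print(choice_func("fc.weight"))     # False - another rank saves it
    print(choice_func("moments.step"))  # True  - not in the layout, saved everywhere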
@@ -691,8 +751,9 @@
 class CheckpointManager:
     """Manage checkpoint files according to train_config of checkpoint."""
 
-    def __init__(self):
+    def __init__(self, format='ckpt'):
         self._ckpoint_filelist = []
+        self._format = format
 
     @property
     def ckpoint_filelist(self):
@@ -707,10 +768,12 @@
     def update_ckpoint_filelist(self, directory, prefix):
         """Update the checkpoint file list."""
         self._ckpoint_filelist = []
+        format = self._format
+        format_length = len(format) + 1
         files = os.listdir(directory)
         for filename in files:
-            if os.path.splitext(filename)[-1] == ".ckpt" and filename.startswith(prefix + "-"):
-                mid_name = filename[len(prefix):-5]
+            if os.path.splitext(filename)[-1] == f".{format}" and filename.startswith(prefix + "-"):
+                mid_name = filename[len(prefix):-format_length]
                 flag = not (True in [char.isalpha() for char in mid_name])
                 if flag:
                     self._ckpoint_filelist.append(os.path.join(directory, filename))
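The slicing change replaces the old hard-coded -5 (the length of ".ckpt") with a length derived from the configured format. A quick worked example with an invented file name:

    prefix, fmt = "resnet", "safetensors"
    filename = "resnet-3_200.safetensors"
    format_length = len(fmt) + 1                      # +1 accounts for the dot
    mid_name = filename[len(prefix):-format_length]   # "-3_200"
    print(mid_name)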
@@ -150,7 +150,7 @@ class ClusterMonitor(Callback):
         with _perf_mutex:
             dir_path = os.path.dirname(self.full_path)
             if not os.path.exists(dir_path):
-                os.makedirs(dir_path)
+                os.makedirs(dir_path, mode=0o700)
             if os.path.exists(self.full_path):
                 os.chmod(self.full_path, stat.S_IWUSR)
                 os.remove(self.full_path)
@@ -65,6 +65,7 @@ class FlopsUtilizationCollector(Callback):
     Raises:
         TypeError: If data_size is not positive int.
         TypeError: If full_flops is not bool.
+        AssertionError: If the training mode is not a static graph or not a static shape.
 
     Examples:
         >>> import numpy as np
@@ -19,7 +19,7 @@ import numpy as np
 
 from mindspore import _checkparam as Validator
 from mindspore.train.callback._callback import Callback, _handle_loss
-from mindspore._c_expression import _collect_host_info
+from mindspore._c_expression import collect_host_info, get_clock_syscnt
 
 
 class LossMonitor(Callback):
@@ -70,7 +70,7 @@ class LossMonitor(Callback):
             please refer to :class:`mindspore.train.RunContext`.
         """
         cb_params = run_context.original_args()
-        _collect_host_info("Callback", "LossMonitor", "step_end", level=1)
+        collect_host_info("Callback", "LossMonitor", "step_end", start_time=get_clock_syscnt(), level=1)
         cur_epoch_num = cb_params.get("cur_epoch_num", 1)
         loss = _handle_loss(cb_params.net_outputs)
 
@@ -101,7 +101,7 @@ class LossMonitor(Callback):
             please refer to :class:`mindspore.train.RunContext`.
         """
         cb_params = run_context.original_args()
-        _collect_host_info("Callback", "LossMonitor", "train_epoch_end", level=1)
+        collect_host_info("Callback", "LossMonitor", "train_epoch_end", start_time=get_clock_syscnt(), level=1)
         metrics = cb_params.get("metrics")
         if metrics:
             print("Eval result: epoch %d, metrics: %s" % (cb_params.cur_epoch_num, metrics))
@@ -16,12 +16,19 @@
 
 from __future__ import absolute_import
 import os
+import json
 import signal
-
-from mindspore import log
+import threading
+from mindspore.common import dtype as mstype
+from mindspore import context
+from mindspore import log as logger
+from mindspore.common.tensor import Tensor
+from mindspore.train._utils import _make_directory
 from mindspore import _checkparam as Validator
 from mindspore.train.serialization import load_checkpoint, save_checkpoint, export
 from mindspore.train.callback._callback import Callback
+from mindspore.parallel._utils import _get_parallel_mode
+from mindspore.context import ParallelMode
 
 
 class OnRequestExit(Callback):
@@ -29,7 +36,8 @@ class OnRequestExit(Callback):
     Respond to the user's closing request, exit the training or eval process, and save the checkpoint and mindir.
 
     Register OnRequestExit Callback before training, when the user want to exit the training process
-    and save the training data, could send the registered exit signal 'sig' to the training process.
+    and save the training data, could send the registered exit signal 'sig' to the training process or modify the
+    'GracefulExit' that a key in the json file specified by the 'config_file' to '1'.
     After the training process executes the current step, saves the current training status,
     including checkpoint and mindir, and then exit the training process.
 
@@ -38,9 +46,12 @@ class OnRequestExit(Callback):
         save_mindir (bool): Whether save the mindir before the training process exit. Default: ``True`` .
         file_name (str): The saved checkpoint and mindir file name,
             the checkpoint file add suffix '.ckpt', the mindir file add suffix '.mindir'. Default: ``'Net'`` .
-        directory (str): The directory save checkpoint and mindir. Default: ``'./'`` .
+        directory (str): The path to save files. It will generate a 'rank_{id}' path by rank_id
+            to save checkpoint and mindir. Default: ``'./'`` .
         sig (int): The user registered exit signal, it must be a captureable and negligible signal.
             When the process receives the signal, exits the training or eval process. Default: ``signal.SIGTERM`` .
+        config_file (str): A json config file used to exit training process gracefully. Key: ``{"GracefulExit": 1}`` .
+            Default: ``None`` .
 
     Raises:
         ValueError: If the 'save_ckpt' is not a bool.
@@ -67,20 +78,28 @@ class OnRequestExit(Callback):
         >>> model.train(10, dataset, callbacks=on_request_exit)
     """
 
-    def __init__(self, save_ckpt=True, save_mindir=True, file_name='Net', directory='./', sig=signal.SIGTERM):
+    def __init__(self, save_ckpt=True, save_mindir=True, file_name='Net', directory='./', config_file=None,
+                 sig=signal.SIGTERM):
         super(OnRequestExit, self).__init__()
         self.save_ckpt = Validator.check_isinstance('save_ckpt', save_ckpt, bool)
         self.save_mindir = Validator.check_isinstance('save_mindir', save_mindir, bool)
-        if self.save_ckpt or self.save_mindir:
-            file_name = Validator.check_isinstance('file_name', file_name, str)
-            directory = Validator.check_isinstance('directory', directory, str)
-            os.makedirs(os.path.abspath(directory), exist_ok=True)
-            self.train_file_path = os.path.abspath(os.path.join(directory, f"{file_name}_train"))
-            self.eval_file_path = os.path.abspath(os.path.join(directory, f"{file_name}_eval"))
         self.sig = Validator.check_isinstance('sig', sig, int)
         if hasattr(signal, "SIGKILL") and self.sig == signal.SIGKILL:
             raise ValueError("Not support send exit request by signal SIGKILL.")
-        self.exit = False
+        self.exit = False  # used signal to exit the training process
+        self.lock = threading.Lock()
+        self.save_path = directory
+        self.key = "GracefulExit"
+        self.remote_config_file = config_file  # used config file to save checkpoint and exit training process
+        self.use_graceful = os.environ.get("MS_ENABLE_GRACEFUL_EXIT") == "1"
+        self.is_distributed = _get_parallel_mode() != ParallelMode.STAND_ALONE
+        self.integrated_save = True
+        if self.is_distributed:
+            self.integrated_save = _get_parallel_mode() == ParallelMode.AUTO_PARALLEL
+        self.stop_train = False
+        self.need_do_step_end = False
+        if self.save_ckpt or self.save_mindir:
+            self.train_name, self.eval_name = self._get_save_path(file_name)
 
     def on_train_begin(self, run_context):
         """
@@ -91,22 +110,31 @@ class OnRequestExit(Callback):
             For more details, please refer to :class:`mindspore.train.RunContext`.
         """
         signal.signal(self.sig, self._handle_signal)
-        if self.save_ckpt and os.path.isfile(f"{self.train_file_path}.ckpt"):
+        if self.save_ckpt and os.path.isfile(f"{self.train_name}.ckpt"):
             cb_params = run_context.original_args()
             train_net = cb_params.train_network
-            load_checkpoint(f"{self.train_file_path}.ckpt", net=train_net)
+            load_checkpoint(f"{self.train_name}.ckpt", net=train_net)
+
+    def on_train_step_begin(self, run_context):
+        """
+        Check whether received the exit signal or
+        whether the value of 'GracefulExit' in 'config_file' was changed to '1'.
+
+        Args:
+            run_context (RunContext): Context information of the model.
+                For more details, please refer to :class:`mindspore.train.RunContext`.
+        """
+        self._do_step_begin(run_context)
 
     def on_train_step_end(self, run_context):
         """
-        When the train step end, if received the exit signal, set the 'run_context' attribute '_stop_requested' to True.
-        Then exit the training process after this step training.
+        Save checkpoint file or mindir file according to config, and exit the training process.
 
         Args:
             run_context (RunContext): Include some information of the model.
                 For more details, please refer to :class:`mindspore.train.RunContext`.
         """
-        if self.exit:
-            run_context.request_stop()
+        self._do_step_end(run_context)
 
     def on_train_epoch_end(self, run_context):
         """
@@ -118,8 +146,7 @@ class OnRequestExit(Callback):
             run_context (RunContext): Include some information of the model.
                 For more details, please refer to :class:`mindspore.train.RunContext`.
         """
-        if self.exit:
-            run_context.request_stop()
+        self._do_step_end(run_context)
 
     def on_train_end(self, run_context):
         """
@@ -135,10 +162,10 @@ class OnRequestExit(Callback):
         cb_params = run_context.original_args()
         train_net = cb_params.train_network
         if self.save_ckpt:
-            save_checkpoint(train_net, ckpt_file_name=self.train_file_path)
+            save_checkpoint(train_net, ckpt_file_name=self.train_name)
         if self.save_mindir:
            inputs = cb_params.train_dataset_element
-            export(train_net, *inputs, file_name=self.train_file_path, file_format='MINDIR')
+            export(train_net, *inputs, file_name=self.train_name, file_format='MINDIR')
 
     def on_eval_begin(self, run_context):
         """
@@ -153,15 +180,15 @@ class OnRequestExit(Callback):
             return
         cb_params = run_context.original_args()
         eval_net = cb_params.eval_network
-        if os.path.isfile(f"{self.eval_file_path}.ckpt"):
-            load_checkpoint(f"{self.eval_file_path}.ckpt", net=eval_net)
-        elif os.path.isfile(f"{self.train_file_path}.ckpt"):
-            load_checkpoint(f"{self.train_file_path}.ckpt", net=eval_net)
+        if os.path.isfile(f"{self.eval_name}.ckpt"):
+            load_checkpoint(f"{self.eval_name}.ckpt", net=eval_net)
+        elif os.path.isfile(f"{self.train_name}.ckpt"):
+            load_checkpoint(f"{self.train_name}.ckpt", net=eval_net)
 
     def on_eval_step_end(self, run_context):
         """
-        When the eval step end, if received the exit signal, set the 'run_context' attribute '_stop_requested' to True.
-        Then exit the eval process after this step eval.
+        When the eval step end, if received the exit signal, set attribute '_stop_requested' of the
+        'run_context' to True. Then exit the eval process after this step eval.
 
         Args:
             run_context (RunContext): Include some information of the model.
@@ -184,12 +211,88 @@ class OnRequestExit(Callback):
         cb_params = run_context.original_args()
         eval_net = cb_params.eval_network
         if self.save_ckpt:
-            save_checkpoint(eval_net, ckpt_file_name=self.eval_file_path)
+            save_checkpoint(eval_net, ckpt_file_name=self.eval_name)
         if self.save_mindir:
             inputs = cb_params.eval_dataset_element
-            export(eval_net, *inputs, file_name=self.eval_file_path, file_format='MINDIR')
+            export(eval_net, *inputs, file_name=self.eval_name, file_format='MINDIR')
 
     def _handle_signal(self, signum, frame):
         """Handle the received signal"""
-        log.debug(f"signum: {signum}, frame: {frame}")
+        logger.debug(f"signum: {signum}, frame: {frame}")
         self.exit = True
+
+    def _do_step_end(self, run_context):
+        """
+        Save the checkpoint or mindir, and then exit training process.
+
+        Args:
+            run_context (RunContext): Include some information of the model.
+                For more details, please refer to :class:`mindspore.train.RunContext`.
+        """
+        with self.lock:
+            # save once
+            if self.stop_train or not self.need_do_step_end:
+                return
+            logger.info("Gracefully exiting training process on step end.")
+            call_params = run_context.original_args()
+            net = call_params.train_network
+            for _, param in net.parameters_and_names():
+                if param.name == "graceful_exit" and param.asnumpy() == True:  # pylint: disable=C0121
+                    logger.warning("Graceful exit is triggered, stop training.")
+                    if self.save_ckpt:
+                        save_checkpoint(net, self.train_name, integrated_save=self.integrated_save)
+                    if self.save_mindir:
+                        inputs = call_params.train_dataset_element
+                        export(net, *inputs, file_name=self.train_name, file_format='MINDIR')
+                    run_context.request_stop()
+                    self.stop_train = True
+
+    def _do_step_begin(self, run_context):
+        """
+        Check training process exit configuration at the step begin.
+
+        Args:
+            run_context (RunContext): Include some information of the model.
+                For more details, please refer to :class:`mindspore.train.RunContext`.
+        """
+        with self.lock:
+            # no env
+            if not self.use_graceful:
+                return
+            if self._check_config_info() or self.exit:
+                call_params = run_context.original_args()
+                net = call_params.train_network
+                for _, param in net.parameters_and_names():
+                    if not self.is_distributed and param.name == "graceful_exit":
+                        param.set_data(Tensor(True, mstype.bool_))
+                        self.need_do_step_end = True
+                        break
+                    if param.name == "graceful_init":
+                        param.set_data(Tensor([1], mstype.int32))
+                        self.need_do_step_end = True
+                        break
+
+    def _check_config_info(self):
+        """check json config info"""
+        if self.remote_config_file is not None and os.path.exists(self.remote_config_file):
+            with open(self.remote_config_file, "r") as f:
+                try:
+                    config_info = json.load(f)
+                except json.JSONDecodeError as e:
+                    logger.warning(f"Parse json file failed: {e}, please check json file: {self.remote_config_file}")
+                    return False
+                if self.key in config_info and config_info[self.key] == 1:
+                    return True
+        return False
+
+    def _get_save_path(self, file_name):
+        """path to save checkpoint files or mindir files"""
+        device_id = context.get_context("device_id")
+        if self.save_path is None:
+            tmp = os.path.join(os.getcwd(), r"rank_" + str(device_id))
+            path_ = _make_directory(tmp)
+            return os.path.join(path_, f"{file_name}_train"), os.path.join(path_, f"{file_name}_eval")
+
+        save_path = os.path.join(self.save_path, r"rank_" + str(device_id))
+        save_path = _make_directory(save_path)
+        return os.path.join(save_path, f"{file_name}_train"), os.path.join(save_path, f"{file_name}_eval")
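The new on_train_step_begin/_do_step_begin/_check_config_info path lets a JSON file trigger the same save-and-stop behaviour as the exit signal, gated by the MS_ENABLE_GRACEFUL_EXIT environment variable. A hedged usage sketch (file names and the commented-out training call are placeholders, not taken from this diff):

    import json
    import os
    import signal
    from mindspore.train.callback import OnRequestExit

    # Opt in to the config-file-driven graceful exit added in 2.4.0.
    os.environ["MS_ENABLE_GRACEFUL_EXIT"] = "1"

    # Start with GracefulExit set to 0; flipping it to 1 while training runs
    # asks the callback to checkpoint and stop at the next step boundary.
    with open("graceful_exit.json", "w") as f:
        json.dump({"GracefulExit": 0}, f)

    cb = OnRequestExit(save_ckpt=True, save_mindir=False, file_name='Net',
                       directory='./ckpt', config_file="graceful_exit.json",
                       sig=signal.SIGTERM)
    # model.train(epoch, dataset, callbacks=[cb])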
@@ -41,7 +41,7 @@ from mindspore.nn.optim.optimizer import Optimizer
 from mindspore.nn.loss.loss import LossBase
 from mindspore.train._utils import check_value_type, _make_directory
 from mindspore._c_expression import security
-from mindspore._c_expression import _collect_host_info
+from mindspore._c_expression import collect_host_info, get_clock_syscnt
 
 HYPER_CONFIG_ENV_NAME = "MINDINSIGHT_HYPER_CONFIG"
 HYPER_CONFIG_LEN_LIMIT = 100000
@@ -472,7 +472,7 @@ class SummaryCollector(Callback):
 
     def begin(self, run_context):
         cb_params = run_context.original_args()
-        _collect_host_info("Callback", "SummaryCollector", "begin", level=1)
+        collect_host_info("Callback", "SummaryCollector", "begin", start_time=get_clock_syscnt(), level=1)
         self._check_callbacks(cb_params)
 
         if cb_params.mode not in ModeEnum.to_list():
@@ -484,7 +484,7 @@ class SummaryCollector(Callback):
 
     def step_end(self, run_context):
         cb_params = run_context.original_args()
-        _collect_host_info("Callback", "SummaryCollector", "step_end", level=1)
+        collect_host_info("Callback", "SummaryCollector", "step_end", start_time=get_clock_syscnt(), level=1)
         if cb_params.mode != ModeEnum.TRAIN.value:
             return
 
@@ -559,7 +559,7 @@ class SummaryCollector(Callback):
 
     def epoch_end(self, run_context):
         cb_params = run_context.original_args()
-        _collect_host_info("Callback", "SummaryCollector", "epoch_end", level=1)
+        collect_host_info("Callback", "SummaryCollector", "epoch_end", start_time=get_clock_syscnt(), level=1)
         self._collect_tensor_data(cb_params)
         collect_landscape = self._collect_specified_data.get('collect_landscape')
         if collect_landscape is not None:
@@ -576,7 +576,7 @@ class SummaryCollector(Callback):
 
     def end(self, run_context):
         cb_params = run_context.original_args()
-        _collect_host_info("Callback", "SummaryCollector", "end", level=1)
+        collect_host_info("Callback", "SummaryCollector", "end", start_time=get_clock_syscnt(), level=1)
         if cb_params.mode == ModeEnum.TRAIN.value:
             self._collect_train_lineage(cb_params)
         else: