mindspore 2.3.0rc1__cp38-cp38-manylinux1_x86_64.whl → 2.3.0rc2__cp38-cp38-manylinux1_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of mindspore might be problematic.

Files changed (223)
  1. mindspore/.commit_id +1 -1
  2. mindspore/__init__.py +1 -1
  3. mindspore/_akg/akg/utils/tbe_codegen_utils.py +13 -3
  4. mindspore/_c_dataengine.cpython-38-x86_64-linux-gnu.so +0 -0
  5. mindspore/_c_expression.cpython-38-x86_64-linux-gnu.so +0 -0
  6. mindspore/_checkparam.py +20 -0
  7. mindspore/_extends/parse/parser.py +1 -1
  8. mindspore/_extends/parse/standard_method.py +6 -5
  9. mindspore/_mindspore_offline_debug.cpython-38-x86_64-linux-gnu.so +0 -0
  10. mindspore/amp.py +5 -5
  11. mindspore/boost/boost_cell_wrapper.py +1 -1
  12. mindspore/boost/group_loss_scale_manager.py +1 -1
  13. mindspore/common/__init__.py +4 -2
  14. mindspore/common/_register_for_recompute.py +48 -0
  15. mindspore/common/_stub_tensor.py +1 -0
  16. mindspore/common/api.py +56 -4
  17. mindspore/common/dtype.py +5 -3
  18. mindspore/common/dump.py +2 -2
  19. mindspore/common/hook_handle.py +51 -4
  20. mindspore/common/initializer.py +1 -1
  21. mindspore/common/jit_config.py +17 -6
  22. mindspore/common/parameter.py +7 -2
  23. mindspore/common/recompute.py +247 -0
  24. mindspore/common/sparse_tensor.py +2 -2
  25. mindspore/common/symbol.py +1 -1
  26. mindspore/common/tensor.py +74 -36
  27. mindspore/communication/__init__.py +3 -3
  28. mindspore/communication/management.py +30 -30
  29. mindspore/context.py +28 -15
  30. mindspore/dataset/__init__.py +5 -5
  31. mindspore/dataset/audio/__init__.py +2 -2
  32. mindspore/dataset/audio/transforms.py +51 -51
  33. mindspore/dataset/callback/ds_callback.py +2 -2
  34. mindspore/dataset/engine/cache_client.py +1 -1
  35. mindspore/dataset/engine/datasets.py +3 -3
  36. mindspore/dataset/engine/datasets_audio.py +14 -14
  37. mindspore/dataset/engine/datasets_standard_format.py +3 -3
  38. mindspore/dataset/engine/datasets_text.py +38 -38
  39. mindspore/dataset/engine/datasets_user_defined.py +3 -3
  40. mindspore/dataset/engine/datasets_vision.py +68 -68
  41. mindspore/dataset/text/__init__.py +3 -3
  42. mindspore/dataset/text/transforms.py +26 -26
  43. mindspore/dataset/transforms/__init__.py +1 -1
  44. mindspore/dataset/vision/__init__.py +3 -3
  45. mindspore/dataset/vision/transforms.py +92 -92
  46. mindspore/dataset/vision/utils.py +1 -1
  47. mindspore/experimental/optim/adadelta.py +2 -2
  48. mindspore/experimental/optim/adagrad.py +2 -2
  49. mindspore/experimental/optim/adam.py +2 -2
  50. mindspore/experimental/optim/adamax.py +2 -2
  51. mindspore/experimental/optim/adamw.py +2 -2
  52. mindspore/experimental/optim/asgd.py +2 -2
  53. mindspore/experimental/optim/lr_scheduler.py +24 -20
  54. mindspore/experimental/optim/nadam.py +2 -2
  55. mindspore/experimental/optim/optimizer.py +1 -1
  56. mindspore/experimental/optim/radam.py +2 -2
  57. mindspore/experimental/optim/rmsprop.py +2 -2
  58. mindspore/experimental/optim/rprop.py +2 -2
  59. mindspore/experimental/optim/sgd.py +2 -2
  60. mindspore/hal/stream.py +2 -0
  61. mindspore/include/mindapi/base/types.h +5 -0
  62. mindspore/lib/libdnnl.so.2 +0 -0
  63. mindspore/lib/libmindspore.so +0 -0
  64. mindspore/lib/libmindspore_backend.so +0 -0
  65. mindspore/lib/libmindspore_common.so +0 -0
  66. mindspore/lib/libmindspore_core.so +0 -0
  67. mindspore/lib/libmindspore_gpr.so.15 +0 -0
  68. mindspore/lib/libmindspore_grpc++.so.1 +0 -0
  69. mindspore/lib/libmindspore_grpc.so.15 +0 -0
  70. mindspore/lib/libmindspore_shared_lib.so +0 -0
  71. mindspore/lib/libopencv_core.so.4.5 +0 -0
  72. mindspore/lib/libopencv_imgcodecs.so.4.5 +0 -0
  73. mindspore/lib/libopencv_imgproc.so.4.5 +0 -0
  74. mindspore/lib/plugin/ascend/custom_aicpu_ops/op_impl/cpu/aicpu_kernel/impl/libcust_cpu_kernels.so +0 -0
  75. mindspore/lib/plugin/ascend/custom_aicpu_ops/op_impl/cpu/config/cust_aicpu_kernel.json +6 -6
  76. mindspore/lib/plugin/ascend/custom_aicpu_ops/op_proto/libcust_op_proto.so +0 -0
  77. mindspore/lib/plugin/ascend/libdvpp_utils.so +0 -0
  78. mindspore/lib/plugin/ascend/libmindspore_cpu_kernels.so +0 -0
  79. mindspore/lib/plugin/gpu/libcuda_ops.so.10 +0 -0
  80. mindspore/lib/plugin/gpu/libcuda_ops.so.11 +0 -0
  81. mindspore/lib/plugin/gpu10.1/libnccl.so.2 +0 -0
  82. mindspore/lib/plugin/gpu11.1/libnccl.so.2 +0 -0
  83. mindspore/lib/plugin/gpu11.6/libnccl.so.2 +0 -0
  84. mindspore/lib/plugin/libmindspore_ascend.so.2 +0 -0
  85. mindspore/lib/plugin/libmindspore_gpu.so.10.1 +0 -0
  86. mindspore/lib/plugin/libmindspore_gpu.so.11.1 +0 -0
  87. mindspore/lib/plugin/libmindspore_gpu.so.11.6 +0 -0
  88. mindspore/log.py +2 -2
  89. mindspore/mint/__init__.py +457 -0
  90. mindspore/mint/nn/__init__.py +430 -0
  91. mindspore/mint/nn/functional.py +424 -0
  92. mindspore/mint/optim/__init__.py +24 -0
  93. mindspore/mint/optim/adamw.py +186 -0
  94. mindspore/multiprocessing/__init__.py +4 -0
  95. mindspore/nn/__init__.py +3 -0
  96. mindspore/nn/cell.py +51 -47
  97. mindspore/nn/extend/__init__.py +29 -0
  98. mindspore/nn/extend/basic.py +140 -0
  99. mindspore/nn/extend/embedding.py +143 -0
  100. mindspore/nn/extend/layer/__init__.py +27 -0
  101. mindspore/nn/extend/layer/normalization.py +107 -0
  102. mindspore/nn/extend/pooling.py +117 -0
  103. mindspore/nn/generator.py +297 -0
  104. mindspore/nn/layer/basic.py +109 -1
  105. mindspore/nn/layer/container.py +2 -2
  106. mindspore/nn/layer/conv.py +6 -6
  107. mindspore/nn/layer/embedding.py +1 -1
  108. mindspore/nn/layer/normalization.py +21 -43
  109. mindspore/nn/layer/padding.py +4 -0
  110. mindspore/nn/optim/ada_grad.py +2 -2
  111. mindspore/nn/optim/adadelta.py +1 -1
  112. mindspore/nn/optim/adafactor.py +1 -1
  113. mindspore/nn/optim/adam.py +7 -7
  114. mindspore/nn/optim/adamax.py +2 -2
  115. mindspore/nn/optim/adasum.py +2 -2
  116. mindspore/nn/optim/asgd.py +2 -2
  117. mindspore/nn/optim/ftrl.py +1 -1
  118. mindspore/nn/optim/lamb.py +3 -3
  119. mindspore/nn/optim/lars.py +1 -1
  120. mindspore/nn/optim/lazyadam.py +2 -2
  121. mindspore/nn/optim/momentum.py +2 -2
  122. mindspore/nn/optim/optimizer.py +2 -2
  123. mindspore/nn/optim/proximal_ada_grad.py +2 -2
  124. mindspore/nn/optim/rmsprop.py +2 -2
  125. mindspore/nn/optim/rprop.py +2 -2
  126. mindspore/nn/optim/sgd.py +2 -2
  127. mindspore/nn/optim/thor.py +2 -2
  128. mindspore/nn/wrap/cell_wrapper.py +9 -9
  129. mindspore/nn/wrap/grad_reducer.py +5 -5
  130. mindspore/ops/_grad_experimental/grad_comm_ops.py +4 -2
  131. mindspore/ops/_vmap/vmap_grad_nn_ops.py +41 -2
  132. mindspore/ops/_vmap/vmap_math_ops.py +27 -8
  133. mindspore/ops/_vmap/vmap_nn_ops.py +66 -8
  134. mindspore/ops/auto_generate/cpp_create_prim_instance_helper.py +73 -1
  135. mindspore/ops/auto_generate/gen_arg_dtype_cast.py +12 -3
  136. mindspore/ops/auto_generate/gen_arg_handler.py +24 -0
  137. mindspore/ops/auto_generate/gen_extend_func.py +274 -0
  138. mindspore/ops/auto_generate/gen_ops_def.py +889 -22
  139. mindspore/ops/auto_generate/gen_ops_prim.py +3541 -253
  140. mindspore/ops/auto_generate/pyboost_inner_prim.py +282 -0
  141. mindspore/ops/composite/multitype_ops/_compile_utils.py +2 -1
  142. mindspore/ops/composite/multitype_ops/_constexpr_utils.py +9 -0
  143. mindspore/ops/extend/__init__.py +9 -1
  144. mindspore/ops/extend/array_func.py +134 -27
  145. mindspore/ops/extend/math_func.py +3 -3
  146. mindspore/ops/extend/nn_func.py +363 -2
  147. mindspore/ops/function/__init__.py +19 -2
  148. mindspore/ops/function/array_func.py +463 -439
  149. mindspore/ops/function/clip_func.py +7 -18
  150. mindspore/ops/function/grad/grad_func.py +5 -5
  151. mindspore/ops/function/linalg_func.py +4 -4
  152. mindspore/ops/function/math_func.py +260 -243
  153. mindspore/ops/function/nn_func.py +825 -62
  154. mindspore/ops/function/random_func.py +73 -4
  155. mindspore/ops/function/sparse_unary_func.py +1 -1
  156. mindspore/ops/function/vmap_func.py +1 -1
  157. mindspore/ops/functional.py +2 -2
  158. mindspore/ops/op_info_register.py +1 -31
  159. mindspore/ops/operations/__init__.py +2 -3
  160. mindspore/ops/operations/_grad_ops.py +2 -107
  161. mindspore/ops/operations/_inner_ops.py +5 -5
  162. mindspore/ops/operations/_sequence_ops.py +2 -2
  163. mindspore/ops/operations/array_ops.py +11 -233
  164. mindspore/ops/operations/comm_ops.py +32 -32
  165. mindspore/ops/operations/custom_ops.py +7 -89
  166. mindspore/ops/operations/manually_defined/ops_def.py +329 -4
  167. mindspore/ops/operations/math_ops.py +13 -163
  168. mindspore/ops/operations/nn_ops.py +9 -316
  169. mindspore/ops/operations/random_ops.py +1 -1
  170. mindspore/ops/operations/sparse_ops.py +3 -3
  171. mindspore/ops/primitive.py +2 -2
  172. mindspore/ops_generate/arg_dtype_cast.py +12 -3
  173. mindspore/ops_generate/arg_handler.py +24 -0
  174. mindspore/ops_generate/gen_ops_inner_prim.py +2 -0
  175. mindspore/ops_generate/gen_pyboost_func.py +13 -6
  176. mindspore/ops_generate/pyboost_utils.py +2 -17
  177. mindspore/parallel/__init__.py +3 -2
  178. mindspore/parallel/_auto_parallel_context.py +106 -1
  179. mindspore/parallel/_parallel_serialization.py +34 -2
  180. mindspore/parallel/_utils.py +16 -0
  181. mindspore/parallel/algo_parameter_config.py +4 -4
  182. mindspore/parallel/checkpoint_transform.py +249 -77
  183. mindspore/parallel/cluster/process_entity/_api.py +1 -1
  184. mindspore/parallel/parameter_broadcast.py +1 -1
  185. mindspore/parallel/shard.py +1 -1
  186. mindspore/profiler/parser/ascend_analysis/fwk_cann_parser.py +1 -0
  187. mindspore/profiler/parser/ascend_analysis/profiler_info_parser.py +17 -5
  188. mindspore/profiler/parser/ascend_msprof_exporter.py +3 -3
  189. mindspore/profiler/parser/ascend_msprof_generator.py +10 -3
  190. mindspore/profiler/parser/ascend_op_generator.py +26 -9
  191. mindspore/profiler/parser/ascend_timeline_generator.py +7 -4
  192. mindspore/profiler/parser/profiler_info.py +11 -1
  193. mindspore/profiler/profiling.py +13 -5
  194. mindspore/rewrite/api/node.py +12 -12
  195. mindspore/rewrite/api/symbol_tree.py +11 -11
  196. mindspore/run_check/_check_version.py +1 -1
  197. mindspore/safeguard/rewrite_obfuscation.py +2 -2
  198. mindspore/train/amp.py +4 -4
  199. mindspore/train/anf_ir_pb2.py +8 -2
  200. mindspore/train/callback/_backup_and_restore.py +2 -2
  201. mindspore/train/callback/_callback.py +4 -4
  202. mindspore/train/callback/_checkpoint.py +2 -2
  203. mindspore/train/callback/_early_stop.py +2 -2
  204. mindspore/train/callback/_landscape.py +4 -4
  205. mindspore/train/callback/_loss_monitor.py +2 -2
  206. mindspore/train/callback/_on_request_exit.py +2 -2
  207. mindspore/train/callback/_reduce_lr_on_plateau.py +2 -2
  208. mindspore/train/callback/_summary_collector.py +2 -2
  209. mindspore/train/callback/_time_monitor.py +2 -2
  210. mindspore/train/dataset_helper.py +8 -3
  211. mindspore/train/loss_scale_manager.py +2 -2
  212. mindspore/train/metrics/metric.py +3 -3
  213. mindspore/train/mind_ir_pb2.py +22 -17
  214. mindspore/train/model.py +15 -15
  215. mindspore/train/serialization.py +18 -18
  216. mindspore/train/summary/summary_record.py +7 -7
  217. mindspore/train/train_thor/convert_utils.py +3 -3
  218. mindspore/version.py +1 -1
  219. {mindspore-2.3.0rc1.dist-info → mindspore-2.3.0rc2.dist-info}/METADATA +1 -1
  220. {mindspore-2.3.0rc1.dist-info → mindspore-2.3.0rc2.dist-info}/RECORD +223 -209
  221. {mindspore-2.3.0rc1.dist-info → mindspore-2.3.0rc2.dist-info}/WHEEL +0 -0
  222. {mindspore-2.3.0rc1.dist-info → mindspore-2.3.0rc2.dist-info}/entry_points.txt +0 -0
  223. {mindspore-2.3.0rc1.dist-info → mindspore-2.3.0rc2.dist-info}/top_level.txt +0 -0
mindspore/parallel/checkpoint_transform.py

@@ -26,18 +26,18 @@ from mindspore.parallel._utils import _is_in_auto_parallel_mode
  from mindspore.parallel._parallel_serialization import _rank_list_for_transform_parallel_checkpoint, \
  _transform_parallel_checkpoint, _get_device_num_from_strategy, _make_dir, \
  _extract_layout_map, _extract_src_dst_layout_map, _parameter_not_in_local_stage, _extract_pipeline_stage_num, \
- _merge_protobuf_strategy, _merge_json_strategy
+ _merge_protobuf_strategy, _merge_json_strategy, _extract_src_dst_layout_map_by_src


  __all__ = ["merge_pipeline_strategys", "rank_list_for_transform", "transform_checkpoint_by_rank",
- "transform_checkpoints", "sync_pipeline_shared_parameters"]
+ "transform_checkpoints", "sync_pipeline_shared_parameters", "load_segmented_checkpoints"]


  def merge_pipeline_strategys(src_strategy_dirs, dst_strategy_file):
  """
  Merge parallel strategy between all pipeline stages in pipeline parallel mode.
  For more details about converting distributed Checkpoint, please refer to
- `Model Transformation <https://www.mindspore.cn/tutorials/experts/en/r2.3.q1/parallel/model_transformation.html>`_.
+ `Model Transformation <https://www.mindspore.cn/tutorials/experts/en/master/parallel/model_transformation.html>`_.

  Note:
  Strategy file of each pipeline stage should be included in src_strategy_dirs.
@@ -77,7 +77,7 @@ def rank_list_for_transform(rank_id, src_strategy_file=None, dst_strategy_file=N
  """
  List of original distributed checkpoint rank index for obtaining the target checkpoint of a rank_id during the
  distributed checkpoint conversion. For more details about converting distributed Checkpoint, please refer to
- `Model Transformation <https://www.mindspore.cn/tutorials/experts/en/r2.3.q1/parallel/model_transformation.html>`_.
+ `Model Transformation <https://www.mindspore.cn/tutorials/experts/en/master/parallel/model_transformation.html>`_.

  Args:
  rank_id (int): The rank of which distributed checkpoint needs to be obtained after conversion.
@@ -140,7 +140,7 @@ def transform_checkpoint_by_rank(rank_id, checkpoint_files_map, save_checkpoint_
  """
  Transform distributed checkpoint from source sharding strategy to destination sharding strategy by rank
  for a network. For more details about converting distributed Checkpoint, please refer to
- `Model Transformation <https://www.mindspore.cn/tutorials/experts/en/r2.3.q1/parallel/model_transformation.html>`_.
+ `Model Transformation <https://www.mindspore.cn/tutorials/experts/en/master/parallel/model_transformation.html>`_.

  Args:
  rank_id (int): The rank of which distributed checkpoint needs to be obtained after conversion.
@@ -225,49 +225,63 @@ def transform_checkpoint_by_rank(rank_id, checkpoint_files_map, save_checkpoint_
  ms.save_checkpoint(transform_param_list, save_checkpoint_file_name)


- def transform_checkpoints(src_checkpoints_dir, dst_checkpoints_dir, ckpt_prefix, src_strategy_file=None,
- dst_strategy_file=None):
- """
- Transform distributed checkpoint from source sharding strategy to destination sharding strategy for a rank.
- For more details about converting distributed Checkpoint, please refer to
- `Model Transformation <https://www.mindspore.cn/tutorials/experts/en/r2.3.q1/parallel/model_transformation.html>`_.
-
- Note:
- The `src_checkpoints_dir` directory structure should be organized like "src_checkpoints_dir/rank_0/a.ckpt", the
- rank number should be set to a subdirectory and the checkpoint file is stored in this subdirectory. If multiple
- files exist in a rank directory, the last file in the lexicgraphic order would be selected.
-
- Args:
- src_checkpoints_dir (str): The source checkpoints directory.
- dst_checkpoints_dir (str): The destination checkpoints directory to save the converted checkpoints.
- ckpt_prefix (str): The destination checkpoint name prefix.
- src_strategy_file (str): Name of source sharding strategy file which saved by
- 'mindspore.set_auto_parallel_context(strategy_ckpt_save_file)'.
- when the `src_strategy_file` is ``None``, it means that the source sharding strategy is
- without any sharing for each parameter. Default: ``None``.
- dst_strategy_file (str): Name of destination sharding strategy file which saved by
- 'mindspore.set_auto_parallel_context(strategy_ckpt_save_file)'.
- when the `dst_strategy_file` is ``None``,
- it means that the destination sharding strategy
- is without any sharing for each parameter. Default: ``None``.
-
- Raises:
- ValueError: `src_strategy_file` or `dst_strategy_file` is incorrect.
- NotADirectoryError: `src_checkpoints_dir` or `dst_checkpoints_dir` is not a directory.
- ValueError: The checkpoint file is missing in `src_checkpoints_dir`.
- TypeError: `src_strategy_file` or `dst_strategy_file` is not a string.
-
- Examples:
- >>> import mindspore as ms
- >>> ms.transform_checkpoints(src_checkpoints_dir, dst_checkpoints_dir, "dst_checkpoint",
- ... "./src_strategy.ckpt", "./dst_strategy.ckpt")
-
- """
- if not os.path.isdir(src_checkpoints_dir):
- raise NotADirectoryError("src_checkpoints_dir {} is not a directory.".format(src_checkpoints_dir))
- _make_dir(dst_checkpoints_dir, "path")
- if not isinstance(ckpt_prefix, str):
- raise TypeError("The ckpt_prefix should be a str.")
+ def _transform_checkpoint_by_stage(src_checkpoints_dir, dst_checkpoints_dir, ckpt_prefix, src_strategy_file,
+ dst_strategy_file=None):
+ """Transform checkpoint for stage in src_strategy_file"""
+ param_total_dict = defaultdict(dict)
+ param_attr_dict = defaultdict(dict)
+ param_type_dict = defaultdict(dict)
+ src_strategy_list, dst_strategy_list, stage_id = _extract_src_dst_layout_map_by_src(src_strategy_file, \
+ dst_strategy_file)
+ src_stage_device_num = np.prod(src_strategy_list.get(list(src_strategy_list.keys())[0])[0]) if src_strategy_list \
+ is not None else 1
+ dst_stage_device_num = np.prod(dst_strategy_list.get(list(dst_strategy_list.keys())[0])[0]) if dst_strategy_list \
+ is not None else 1
+ origin_dst_strategy_list = _extract_layout_map(dst_strategy_file)
+ origin_src_strategy_list = _extract_layout_map(src_strategy_file)
+ checkpoint_files_map = {}
+ src_rank_id_start = stage_id * src_stage_device_num
+ for local_rank in range(src_stage_device_num):
+ rank_id = src_rank_id_start + local_rank
+ checkpoint_file_name = os.path.join(src_checkpoints_dir, "rank_{}".format(rank_id), "*.ckpt")
+ rank_ckpts = glob.glob(checkpoint_file_name)
+ rank_ckpts.sort()
+ for checkpoint_file in rank_ckpts:
+ if not os.path.isfile(checkpoint_file):
+ ms.log.warning("{} is not a checkpoint file.".format(checkpoint_file))
+ continue
+ checkpoint_files_map[rank_id] = checkpoint_file
+ for rank, local_file in checkpoint_files_map.items():
+ if not os.path.exists(local_file):
+ raise ValueError("Checkpoint file {} in rank {} not exits: ".format(local_file, rank))
+ for rank, file_name in checkpoint_files_map.items():
+ ckpt_dict = ms.load_checkpoint(file_name)
+ for param_name, param in ckpt_dict.items():
+ # cut the parameter not in the pipeline stage.
+ if _parameter_not_in_local_stage(param_name, origin_src_strategy_list, src_strategy_list) \
+ and _parameter_not_in_local_stage(param_name, origin_dst_strategy_list, dst_strategy_list):
+ continue
+ src_rank = rank % src_stage_device_num
+ param_type_dict[param_name][src_rank] = str(param.data.dtype)
+ if param.data.dtype == mstype.bfloat16:
+ param.set_dtype(mstype.float32)
+ param_total_dict[param_name][src_rank] = param.data.asnumpy()
+ param_attr_dict[param_name][src_rank] = (param.requires_grad, param.layerwise_parallel)
+ for local_rank_id in range(dst_stage_device_num):
+ transform_param_list = _transform_parallel_checkpoint(local_rank_id, param_total_dict,
+ param_attr_dict, src_strategy_list, dst_strategy_list,
+ param_type_dict)
+ save_checkpoint_file = "{}{}_part{}.ckpt".format(ckpt_prefix, local_rank_id, stage_id)
+ save_checkpoint_file_dir = os.path.join(dst_checkpoints_dir, "rank_{}".format(local_rank_id))
+ if not os.path.exists(save_checkpoint_file_dir):
+ _make_dir(save_checkpoint_file_dir, "path")
+ save_checkpoint_file_name = os.path.join(save_checkpoint_file_dir, save_checkpoint_file)
+ ms.save_checkpoint(transform_param_list, save_checkpoint_file_name)
+
+
+ def _transform_checkpoints(src_checkpoints_dir, dst_checkpoints_dir, ckpt_prefix, src_strategy_file=None,
+ dst_strategy_file=None):
+ """Transform checkpoints for all stages in src_strategy_file"""
  checkpoints_rank_dir_list = os.path.join(src_checkpoints_dir, "rank_[0-9]*")
  all_checkpoint_files_map = {}
  for checkpoint_dir in glob.glob(checkpoints_rank_dir_list):
@@ -342,6 +356,76 @@ def transform_checkpoints(src_checkpoints_dir, dst_checkpoints_dir, ckpt_prefix,
  del param_total_dict


+ def transform_checkpoints(src_checkpoints_dir, dst_checkpoints_dir, ckpt_prefix, src_strategy_file=None,
+ dst_strategy_file=None):
+ """
+ Transform distributed checkpoint from source sharding strategy to destination sharding strategy for a rank.
+ For more details about converting distributed Checkpoint, please refer to
+ `Model Transformation <https://www.mindspore.cn/tutorials/experts/en/master/parallel/model_transformation.html>`_.
+
+ Note:
+ The `src_checkpoints_dir` directory structure should be organized like "src_checkpoints_dir/rank_0/a.ckpt", the
+ rank number should be set to a subdirectory and the checkpoint file is stored in this subdirectory. If multiple
+ files exist in a rank directory, the last file in the lexicgraphic order would be selected.
+
+ Args:
+ src_checkpoints_dir (str): The source checkpoints directory.
+ dst_checkpoints_dir (str): The destination checkpoints directory to save the converted checkpoints.
+ ckpt_prefix (str): The destination checkpoint name prefix.
+ src_strategy_file (str): Name of source sharding strategy file which saved by
+ 'mindspore.set_auto_parallel_context(strategy_ckpt_save_file)'.
+ when the 'src_strategy_file' is None, it means that the source sharding strategy is
+ without any sharing for each parameter. Default:None.
+ dst_strategy_file (str): Name of destination sharding strategy file which saved by
+ 'mindspore.set_auto_parallel_context(strategy_ckpt_save_file)'.
+ when the 'dst_strategy_file' is None, it means that the destination sharding strategy
+ is without any sharing for each parameter. Default:None.
+
+ Raises:
+ ValueError: `src_strategy_file` or `dst_strategy_file` is incorrect.
+ NotADirectoryError: `src_checkpoints_dir` or `dst_checkpoints_dir` is not a directory.
+ ValueError: The checkpoint file is missing in `src_checkpoints_dir`.
+ TypeError: `src_strategy_file` or `dst_strategy_file` is not a string.
+
+ Examples:
+ >>> import mindspore as ms
+ >>> ms.transform_checkpoints(src_checkpoints_dir, dst_checkpoints_dir, "dst_checkpoint",
+ ... "./src_strategy.ckpt", "./dst_strategy.ckpt")
+
+ """
+ if not os.path.isdir(src_checkpoints_dir):
+ raise NotADirectoryError("src_checkpoints_dir {} is not a directory.".format(src_checkpoints_dir))
+ _make_dir(dst_checkpoints_dir, "path")
+ if not isinstance(ckpt_prefix, str):
+ raise TypeError("The ckpt_prefix should be a str.")
+ if src_strategy_file and os.path.dirname(src_strategy_file) and not os.path.exists(
+ os.path.dirname(src_strategy_file)):
+ raise ValueError("The director of src_strategy_file: {} is not exists.".
+ format(os.path.dirname(src_strategy_file)))
+ if dst_strategy_file and os.path.dirname(dst_strategy_file) and not os.path.exists(
+ os.path.dirname(dst_strategy_file)):
+ raise ValueError("The director of dst_strategy_file: {} is not exists.".
+ format(os.path.dirname(dst_strategy_file)))
+ src_layout_map = _extract_layout_map(src_strategy_file)
+ dst_layout_map = _extract_layout_map(dst_strategy_file)
+ pipeline_stage_num = _extract_pipeline_stage_num(src_strategy_file)
+ if src_layout_map:
+ src_param_keys = {param_name for param_name in src_layout_map if not param_name.startswith("accu_grads")}
+ if dst_layout_map:
+ dst_param_keys = {param_name for param_name in dst_layout_map if not param_name.startswith("accu_grads")}
+ if src_layout_map and dst_layout_map and pipeline_stage_num == 1 \
+ and src_param_keys.issubset(dst_param_keys) and len(src_param_keys) < len(dst_param_keys):
+ dst_stage_num = _extract_pipeline_stage_num(dst_strategy_file)
+ if dst_stage_num > 1:
+ raise NotImplementedError("When using unmerged src strategy, dst strategy doesn't \
+ support strategy with pipeline parallel.")
+ _transform_checkpoint_by_stage(src_checkpoints_dir, dst_checkpoints_dir, ckpt_prefix,
+ src_strategy_file, dst_strategy_file)
+ else:
+ _transform_checkpoints(src_checkpoints_dir, dst_checkpoints_dir, ckpt_prefix,
+ src_strategy_file, dst_strategy_file)
+
+
  def _sync_params(name, param, layout):
  """synchronize single parameter"""
  if len(layout) < 10:
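The reworked public `transform_checkpoints` now dispatches: when the source strategy is an unmerged, per-stage strategy file (its parameters are a strict subset of the merged destination strategy), it converts only that pipeline stage via `_transform_checkpoint_by_stage`, writing segment files named `{ckpt_prefix}{dst_rank}_part{stage}.ckpt` under each `rank_{dst_rank}` subdirectory; otherwise it falls back to the previous whole-model path. A rough usage sketch of the per-stage flow, where the paths, prefix and two-stage setup are illustrative assumptions rather than values from the diff:

    # Hypothetical sketch: convert a two-stage pipeline-parallel model one stage at a time.
    import mindspore as ms

    src_ckpt_dir = "./src_checkpoints"    # contains rank_0/, rank_1/, ... subdirectories
    dst_ckpt_dir = "./dst_checkpoints"
    dst_strategy = "./dst_strategy.ckpt"  # merged destination strategy file

    # One unmerged strategy file per pipeline stage; each call converts only that stage and
    # writes segments such as dst_checkpoints/rank_0/dst_checkpoint0_part0.ckpt, ..._part1.ckpt.
    for stage_strategy in ["./src_strategy_stage0.ckpt", "./src_strategy_stage1.ckpt"]:
        ms.transform_checkpoints(src_ckpt_dir, dst_ckpt_dir, "dst_checkpoint",
                                 stage_strategy, dst_strategy)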
@@ -394,53 +478,79 @@ def sync_pipeline_shared_parameters(net):
  Args:
  net (nn.Cell): the inference network.

+ Supported Platforms:
+ ``Ascend``
+
  Examples:
+ .. note::
+ Before running the following examples, you need to configure the communication environment variables.
+
+ For the Ascend device, users need to write a dynamic cluster startup script, please see the `Dynamic Cluster
+ Startup <https://www.mindspore.cn/tutorials/experts/en/master/parallel/dynamic_cluster.html>`_ .
+
  >>> import numpy as np
  >>> import mindspore as ms
- >>> from mindspore import nn, ops, Parameter, Tensor
- >>> class VocabEmbedding(nn.Cell):
- ... def __init__(self, vocab_size, embedding_size):
+ >>> import mindspore.communication.management as D
+ >>> from mindspore import lazy_inline, context, nn, ops, Parameter, Tensor
+ >>> context.set_context(mode=context.GRAPH_MODE)
+ >>> class Embedding(nn.Cell):
+ ... def __init__(self, shape):
  ... super().__init__()
- ... self.embedding_table = Parameter(Tensor(np.ones([vocab_size, embedding_size]), ms.float32),
- ... name='embedding')
- ... self.gather = ops.Gather()
- ...
+ ... self.w = Parameter(Tensor(np.ones(shape), ms.float32), name='w')
+ ... self.matmul = ops.MatMul().shard(((1, 1), (1, 1)))
  ... def construct(self, x):
- ... output = self.gather(self.embedding_table, x, 0)
- ... output = output.squeeze(1)
- ... return output, self.embedding_table.value()
+ ... return self.matmul(x, self.w), self.w
  ...
  >>> class LMHead(nn.Cell):
  ... def __init__(self):
  ... super().__init__()
- ... self.matmul = ops.MatMul(transpose_b=True)
- ...
- ... def construct(self, state, embed):
- ... return self.matmul(state, embed)
+ ... self.matmul = ops.MatMul(transpose_b=True).shard(((1, 1), (1, 1)))
+ ... def construct(self, x, w):
+ ... return self.matmul(x, w)
  ...
  >>> class Network(nn.Cell):
  ... @lazy_inline
  ... def __init__(self):
  ... super().__init__()
- ... self.word_embedding = VocabEmbedding(vocab_size=4, embedding_size=4)
- ... self.head = LMHead()
- ...
+ ... shape = (4, 4)
+ ... self.word_embedding = Embedding(shape)
+ ... self.lm_head = LMHead()
+ ... self.word_embedding.pipeline_stage = 0
+ ... self.lm_head.pipeline_stage = 1
  ... def construct(self, x):
  ... x, embed = self.word_embedding(x)
- ... x = self.head(x, embed)
- ... return x
- >>>
+ ... return self.lm_head(x, embed)
+ ...
+ >>> class PipelineCellInference(nn.Cell):
+ ... def __init__(self, network, micro_batch_num):
+ ... super().__init__()
+ ... self.network = network
+ ... self.micro_batch_num = micro_batch_num
+ ... self.concat = ops.Concat()
+ ... def construct(self, x):
+ ... ret = ()
+ ... for i in range(self.micro_batch_num):
+ ... micro_batch_size = x.shape[0] // self.micro_batch_num
+ ... start = micro_batch_size * i
+ ... end = micro_batch_size * (i + 1)
+ ... micro_input = x[start:end]
+ ... y = self.network(micro_input)
+ ... ret = ret + (y,)
+ ... ret = self.concat(ret)
+ ... return ret
+ >>> D.init()
+ >>> context.set_auto_parallel_context(parallel_mode='semi_auto_parallel', full_batch=True, pipeline_stages=2)
  >>> net = Network()
- >>> net.word_embedding.pipeline_stage = 0
- >>> net.head.pipeline_stage = 1
- >>> x = Tensor(np.ones((8, 4))
- >>> net.compile()
+ >>> net = PipelineCellInference(net, 2)
+ >>> net.set_train(False)
+ >>> x = Tensor(np.ones((2, 4)), ms.float32)
+ >>> net.compile(x)
  >>> ms.sync_pipeline_shared_parameters(net)
- >>> print(net.word_embedding.embedding_table.asnumpy())
- >>> [[1. 1. 1. 1.]
- [1. 1. 1. 1.]
- [1. 1. 1. 1.]
- [1. 1. 1. 1.]]
+ >>> print(net.network.word_embedding.w.asnumpy())
+ [[1. 1. 1. 1.]
+ [1. 1. 1. 1.]
+ [1. 1. 1. 1.]
+ [1. 1. 1. 1.]]
  """

  if not isinstance(net, ms.nn.Cell):
@@ -466,3 +576,65 @@ def sync_pipeline_shared_parameters(net):

  # restore parallel context
  ms.context.set_auto_parallel_context(parallel_mode=parallel_mode, full_batch=full_batch)
+
+
+ def load_segmented_checkpoints(ckpt_file_dir, net=None, strict_load=False, filter_prefix=None,
+ dec_key=None, dec_mode="AES-GCM", specify_prefix=None, choice_func=None):
+ """
+ Load checkpoint info from a specified file. If the specified ckpt_file_dir path contains multiple
+ checkpoint files, all checkpoint files will be loaded one by one and the combined dictionary will be return.
+
+ Note:
+ - `specify_prefix` and `filter_prefix` do not affect each other.
+ - If none of the parameters are loaded from checkpoint file, it will throw ValueError.
+ - `specify_prefix` and `filter_prefix` are in the process of being deprecated,
+ `choice_func` is recommended instead.
+ And using either of those two args will override `choice_func` at the same time.
+
+ Args:
+ ckpt_file_dir (str): Checkpoint file directory.
+ net (Cell): The network where the parameters will be loaded. Default: ``None`` .
+ strict_load (bool): Whether to strict load the parameter into net. If ``False`` , it will load parameter
+ into net when parameter name's suffix in checkpoint file is the same as the
+ parameter in the network. When the types are inconsistent perform type conversion
+ on the parameters of the same type, such as float32 to float16. Default: ``False`` .
+ filter_prefix (Union[str, list[str], tuple[str]]): Deprecated(see `choice_func`). Parameters starting with the
+ filter_prefix will not be loaded. Default: ``None`` .
+ dec_key (Union[None, bytes]): Byte type key used for decryption. If the value is ``None`` , the decryption
+ is not required. Default: ``None`` .
+ dec_mode (str): This parameter is valid only when dec_key is not set to ``None`` . Specifies the decryption
+ mode, currently supports ``"AES-GCM"`` and ``"AES-CBC"`` and ``"SM4-CBC"`` .
+ Default: ``"AES-GCM"`` .
+ specify_prefix (Union[str, list[str], tuple[str]]): Deprecated(see `choice_func`). Parameters starting with the
+ specify_prefix will be loaded. Default: ``None`` .
+ choice_func (Union[None, function]) : Input value of the function is a Parameter name of type string,
+ and the return value is a bool. If returns ``True`` , the Parameter
+ that matches the custom condition will be loaded. If returns ``False`` , the Parameter that
+ matches the custom condition will be removed. Default: ``None`` .
+
+ Returns:
+ Dict, key is parameter name, value is a Parameter or string. When the `append_dict` parameter of
+ :func:`mindspore.save_checkpoint` and the `append_info` parameter of :class:`mindspore.train.CheckpointConfig`
+ are used to save the checkpoint, `append_dict` and `append_info` are dict types, and their value are string,
+ then the return value obtained by loading checkpoint is string, and in other cases the return value is
+ Parameter.
+
+ Raises:
+ TypeError: Input ckpt_file_dir is not a string.
+ ValueError: Checkpoint file directory doesn't exist. Or it's not a directory
+ ValueError: Checkpoint file's format is incorrect.
+ ValueError: Parameter's dict is None after load checkpoint file.
+ TypeError: The type of `specify_prefix` or `filter_prefix` is incorrect.
+ """
+ if not isinstance(ckpt_file_dir, str):
+ raise TypeError("The ckpt_file_dir should be a str.")
+ if not os.path.isdir(ckpt_file_dir):
+ raise ValueError("The dst_strategy_file: {} doesn't exist. Or it's not a directory".
+ format(ckpt_file_dir))
+ checkpoint_file_name = os.path.join(ckpt_file_dir, "*.ckpt")
+ rank_ckpts = glob.glob(checkpoint_file_name)
+ parameter_dict = {}
+ for checkpoint_file in rank_ckpts:
+ parameter_dict.update(ms.load_checkpoint(checkpoint_file, net, strict_load, filter_prefix, dec_key,
+ dec_mode, specify_prefix, choice_func))
+ return parameter_dict
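Because the per-stage transform leaves each destination rank directory holding several `*_part*.ckpt` segments, the new `load_segmented_checkpoints` merges every checkpoint in a directory into one parameter dict at load time. A minimal sketch, in which the directory, the small network and the import path are assumptions (the diff adds the function to `mindspore/parallel/checkpoint_transform.py` and its `__all__`; it may also be re-exported from `mindspore.parallel`):

    # Hypothetical sketch: merge one rank's checkpoint segments and load them into a network.
    import mindspore as ms
    from mindspore import nn
    from mindspore.parallel.checkpoint_transform import load_segmented_checkpoints  # assumed import path

    net = nn.Dense(4, 4)  # placeholder network for illustration
    # Combines every *.ckpt file found under the rank directory into a single dict.
    param_dict = load_segmented_checkpoints("./dst_checkpoints/rank_0")
    ms.load_param_into_net(net, param_dict)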
mindspore/parallel/cluster/process_entity/_api.py

@@ -42,7 +42,7 @@ class _Node:
  os.environ["MS_WORKER_NUM"] = str(self.worker_num)
  os.environ["MS_SCHED_HOST"] = self.sched_host
  os.environ["MS_SCHED_PORT"] = str(self.sched_port)
- os.environ["MS_CLUSTER_TIMEOUT"] = str(self.timeout)
+ os.environ["MS_TOPO_TIMEOUT"] = str(self.timeout)

  class _MetaServerNode(_Node):
  """
mindspore/parallel/parameter_broadcast.py

@@ -84,7 +84,7 @@ def parameter_broadcast(net, layout, cur_rank=0, initial_rank=0):
  >>> net.matmul2.shard(((1, 8), (8, 1)))
  >>> net.relu2.shard(((8, 1),))
  >>> # Create the dataset taking MNIST as an example. Refer to
- >>> # https://gitee.com/mindspore/docs/blob/r2.3.q1/docs/mindspore/code/mnist.py
+ >>> # https://gitee.com/mindspore/docs/blob/master/docs/mindspore/code/mnist.py
  >>> dataset = create_dataset()
  >>> optim = nn.SGD(net.trainable_params(), 1e-2)
  >>> loss = nn.CrossEntropyLoss()
mindspore/parallel/shard.py

@@ -328,7 +328,7 @@ def shard(fn, in_strategy, out_strategy=None, parameter_plan=None, device="Ascen

  Tutorial Examples:
  - `Functional Operator Sharding
- <https://www.mindspore.cn/tutorials/experts/en/r2.3.q1/parallel/pynative_shard_function_parallel.html>`_
+ <https://www.mindspore.cn/tutorials/experts/en/master/parallel/pynative_shard_function_parallel.html>`_
  """
  if not isinstance(fn, (ms.nn.Cell)):
  logger.warning("'fn' is not a mindspore.nn.Cell, and its definition cannot involve Parameter; "
mindspore/profiler/parser/ascend_analysis/fwk_cann_parser.py

@@ -34,6 +34,7 @@ class FwkCANNParser:
  def __init__(self, source_path: str, msprof_data: List, rank_id: int):
  source_path = validate_and_normalize_path(source_path)
  ProfilerInfoParser.init_source_path(source_path)
+ ProfilerInfoParser.init_rank_id(rank_id)
  fwk_parser = FwkFileParser(source_path, rank_id)
  msprof_timeline_parser = MsprofTimelineParser(msprof_data)
  self._fwk_op_data = fwk_parser.get_op_range_data()
mindspore/profiler/parser/ascend_analysis/profiler_info_parser.py

@@ -20,9 +20,9 @@ from subprocess import CalledProcessError, TimeoutExpired
  from subprocess import Popen, PIPE

  from mindspore import log as logger
- import mindspore._c_expression as c_expression
  from mindspore.profiler.common.validator.validate_path import validate_and_normalize_path
  from mindspore.profiler.parser.ascend_analysis.constant import Constant
+ from mindspore.profiler.parser.profiler_info import ProfilerInfo


  class ProfilerInfoParser:
@@ -37,6 +37,7 @@ class ProfilerInfoParser:
  # profiler information related files
  _source_prof_path = None
  _loaded_frequency = False
+ _rank_id = 0

  @classmethod
  def init_source_path(cls, source_path: str):
@@ -48,12 +49,15 @@ class ProfilerInfoParser:
  raise RuntimeError("Input source path is invalid!")
  cls._source_prof_path = prof_path

+ @classmethod
+ def init_rank_id(cls, rank_id):
+ """initialize the rank id."""
+ cls._rank_id = rank_id
+
  @classmethod
  def get_local_time(cls, syscnt: int) -> Decimal:
  """Convert syscnt to local time."""
  if not cls._loaded_frequency:
- localtime_stamp = c_expression.get_clock_time()
- syscnt_stamp = c_expression.get_clock_syscnt()
  outs, _ = cls.__run_cmd(['which', cls._msprof_cmd])
  if not outs:
  raise FileNotFoundError("Failed to find msprof command!")
@@ -70,8 +74,16 @@ class ProfilerInfoParser:
  cls._freq = float(cpu_info.get("Frequency", cls._freq))
  except ValueError:
  pass
- cls._start_cnt = syscnt_stamp
- cls._time_offset = localtime_stamp
+ profiler_info_path = os.path.join(cls._source_prof_path, os.path.pardir,
+ f"profiler_info_{cls._rank_id}.json")
+ if not os.path.isfile(profiler_info_path):
+ raise RuntimeError(f"Can`t find the file {profiler_info_path}, please check!")
+ with os.fdopen(os.open(profiler_info_path, os.O_RDONLY, 0o600), 'r') as fr:
+ profiler_info_data = json.load(fr)
+ cls._start_cnt = profiler_info_data.get('system_cnt')
+ cls._time_offset = profiler_info_data.get('system_time')
+ ProfilerInfo.set_system_time(cls._time_offset)
+ ProfilerInfo.set_system_cnt(cls._start_cnt)
  cls._loaded_frequency = True
  start_ns = cls.__get_timestamp(syscnt)
  start_us = Decimal(start_ns) / Constant.NS_TO_US
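With this change the parser no longer samples the host clock at parse time through `_c_expression`; it reuses the `system_cnt` / `system_time` anchor recorded in `profiler_info_{rank_id}.json` at collection time, so reconstructed timestamps stay consistent with the original run. The conversion performed by `__get_timestamp` is not shown in the hunk; the following is only a hedged sketch of the usual syscnt-to-wall-clock mapping such an anchor enables, where the formula and units are assumptions rather than code from the diff:

    # Hedged sketch: map a raw system counter value to a host timestamp.
    # Assumptions: freq_mhz is the counter frequency in MHz (ticks per microsecond),
    # and system_time_ns is the anchor wall-clock time in nanoseconds.
    def syscnt_to_local_ns(syscnt: int, start_cnt: int, system_time_ns: int, freq_mhz: float) -> float:
        elapsed_us = (syscnt - start_cnt) / freq_mhz   # elapsed microseconds since the anchor
        return system_time_ns + elapsed_us * 1_000.0   # anchor plus elapsed, in nanoseconds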
mindspore/profiler/parser/ascend_msprof_exporter.py

@@ -251,10 +251,10 @@ class AscendMsprofExporter:
  msprof_json.add(f)

  if not op_summary:
- raise RuntimeError("The op_summary csv file was not found, perhaps the original data was not collected.")
+ logger.warning("The op_summary csv file was not found, perhaps the original data was not collected.")
  if not op_statistic:
- raise RuntimeError("The op_statistics csv file was not found, perhaps the original data was not collected.")
+ logger.warning("The op_statistics csv file was not found, perhaps the original data was not collected.")
  if not msprof_json:
- raise RuntimeError("The msprof json file was not found, perhaps the original data was not collected.")
+ logger.warning("The msprof json file was not found, perhaps the original data was not collected.")

  logger.info("Finish checking files.")
mindspore/profiler/parser/ascend_msprof_generator.py

@@ -88,7 +88,10 @@ class AscendMsprofDataGenerator:
  """read op summary to memory"""
  op_summary = []
  op_summary_name = fr'{self.mindstudio_profiler_output}/op_summary_*.csv'
- op_summary_file = get_newest_file(glob.glob(op_summary_name))[0]
+ op_summary_files = glob.glob(op_summary_name)
+ if not op_summary_files:
+ return
+ op_summary_file = get_newest_file(op_summary_files)[0]
  with open(op_summary_file, newline='') as csvfile:
  reader = csv.DictReader(csvfile, delimiter=',', quotechar='"')
  for row in reader:
@@ -129,7 +132,10 @@
  """read op statistic to memory"""
  op_statistic = []
  op_statistic_name = fr'{self.mindstudio_profiler_output}/op_statistic_*.csv'
- op_statistic_file = get_newest_file(glob.glob(op_statistic_name))[0]
+ op_statistic_files = glob.glob(op_statistic_name)
+ if not op_statistic_files:
+ return
+ op_statistic_file = get_newest_file(op_statistic_files)[0]
  with open(op_statistic_file, newline='') as csvfile:
  reader = csv.DictReader(csvfile, delimiter=',', quotechar='"')
  for row in reader:
@@ -140,7 +146,8 @@
  )
  new_row = tuple(['0' if d == 'N/A' else d for d in new_row])
  op_statistic.append(new_row)
-
+ if not op_statistic:
+ return
  op_statistic_dt = np.dtype(self.op_statistic_type)
  self.op_statistic = np.array(op_statistic, dtype=op_statistic_dt)
  self.op_statistic['Total Time'] *= 1e-3