mindspore 2.2.10-cp38-none-any.whl → 2.2.14-cp38-none-any.whl

This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.

Potentially problematic release.

Files changed (152)
  1. mindspore/.commit_id +1 -1
  2. mindspore/__init__.py +2 -1
  3. mindspore/_akg/akg/composite/build_module.py +95 -5
  4. mindspore/_akg/akg/topi/cpp/impl.py +1 -1
  5. mindspore/_akg/akg/tvm/_ffi/base.py +1 -1
  6. mindspore/_akg/akg/utils/composite_op_helper.py +7 -2
  7. mindspore/_akg/akg/utils/dump_ascend_meta.py +22 -3
  8. mindspore/_akg/akg/utils/util.py +18 -1
  9. mindspore/_c_dataengine.cpython-38-aarch64-linux-gnu.so +0 -0
  10. mindspore/_c_expression.cpython-38-aarch64-linux-gnu.so +0 -0
  11. mindspore/_c_mindrecord.cpython-38-aarch64-linux-gnu.so +0 -0
  12. mindspore/_extends/parse/__init__.py +3 -2
  13. mindspore/_extends/parse/parser.py +6 -1
  14. mindspore/_extends/parse/standard_method.py +12 -2
  15. mindspore/_mindspore_offline_debug.cpython-38-aarch64-linux-gnu.so +0 -0
  16. mindspore/bin/cache_admin +0 -0
  17. mindspore/bin/cache_server +0 -0
  18. mindspore/common/_utils.py +16 -0
  19. mindspore/common/tensor.py +0 -2
  20. mindspore/communication/management.py +3 -0
  21. mindspore/context.py +34 -4
  22. mindspore/dataset/engine/cache_client.py +8 -5
  23. mindspore/dataset/engine/datasets.py +23 -0
  24. mindspore/dataset/engine/validators.py +1 -1
  25. mindspore/dataset/vision/py_transforms_util.py +2 -2
  26. mindspore/experimental/optim/lr_scheduler.py +5 -6
  27. mindspore/lib/libdnnl.so.2 +0 -0
  28. mindspore/lib/libmindspore.so +0 -0
  29. mindspore/lib/libmindspore_backend.so +0 -0
  30. mindspore/lib/libmindspore_common.so +0 -0
  31. mindspore/lib/libmindspore_core.so +0 -0
  32. mindspore/lib/libmindspore_gpr.so.15 +0 -0
  33. mindspore/lib/libmindspore_grpc++.so.1 +0 -0
  34. mindspore/lib/libmindspore_grpc.so.15 +0 -0
  35. mindspore/lib/libmindspore_shared_lib.so +0 -0
  36. mindspore/lib/libopencv_core.so.4.5 +0 -0
  37. mindspore/lib/libopencv_imgcodecs.so.4.5 +0 -0
  38. mindspore/lib/libopencv_imgproc.so.4.5 +0 -0
  39. mindspore/lib/plugin/ascend/custom_aicpu_ops/op_impl/cpu/aicpu_kernel/impl/libcust_aicpu_kernels.so +0 -0
  40. mindspore/lib/plugin/ascend/custom_aicpu_ops/op_impl/cpu/aicpu_kernel/impl/libcust_cpu_kernels.so +0 -0
  41. mindspore/lib/plugin/ascend/custom_aicpu_ops/op_impl/cpu/config/cust_aicpu_kernel.json +118 -0
  42. mindspore/lib/plugin/ascend/custom_aicpu_ops/op_proto/libcust_op_proto.so +0 -0
  43. mindspore/lib/plugin/ascend/libakg.so +0 -0
  44. mindspore/lib/plugin/ascend/libascend_collective.so +0 -0
  45. mindspore/lib/plugin/ascend/libdvpp_utils.so +0 -0
  46. mindspore/lib/plugin/ascend/libmindspore_aicpu_kernels.so +0 -0
  47. mindspore/lib/plugin/ascend/libmindspore_cpu_kernels.so +0 -0
  48. mindspore/lib/plugin/cpu/libakg.so +0 -0
  49. mindspore/lib/plugin/libmindspore_ascend.so.1 +0 -0
  50. mindspore/mindrecord/tools/cifar100_to_mr.py +49 -57
  51. mindspore/mindrecord/tools/cifar10_to_mr.py +46 -55
  52. mindspore/mindrecord/tools/csv_to_mr.py +3 -8
  53. mindspore/mindrecord/tools/mnist_to_mr.py +4 -9
  54. mindspore/mindrecord/tools/tfrecord_to_mr.py +1 -4
  55. mindspore/nn/layer/activation.py +1 -1
  56. mindspore/nn/layer/embedding.py +2 -2
  57. mindspore/nn/layer/flash_attention.py +48 -135
  58. mindspore/nn/loss/loss.py +1 -1
  59. mindspore/nn/optim/ada_grad.py +2 -2
  60. mindspore/nn/optim/sgd.py +3 -2
  61. mindspore/nn/wrap/__init__.py +4 -2
  62. mindspore/nn/wrap/cell_wrapper.py +6 -3
  63. mindspore/numpy/math_ops.py +1 -1
  64. mindspore/ops/__init__.py +3 -0
  65. mindspore/ops/_grad_experimental/grad_array_ops.py +0 -31
  66. mindspore/ops/_grad_experimental/grad_comm_ops.py +4 -2
  67. mindspore/ops/_grad_experimental/grad_inner_ops.py +8 -0
  68. mindspore/ops/_grad_experimental/grad_math_ops.py +37 -17
  69. mindspore/ops/_op_impl/aicpu/__init__.py +1 -0
  70. mindspore/ops/_op_impl/aicpu/generate_eod_mask.py +38 -0
  71. mindspore/ops/_op_impl/aicpu/linear_sum_assignment.py +21 -2
  72. mindspore/ops/function/array_func.py +6 -5
  73. mindspore/ops/function/debug_func.py +1 -1
  74. mindspore/ops/function/linalg_func.py +21 -11
  75. mindspore/ops/function/math_func.py +3 -0
  76. mindspore/ops/function/nn_func.py +13 -11
  77. mindspore/ops/function/parameter_func.py +2 -0
  78. mindspore/ops/function/sparse_unary_func.py +2 -2
  79. mindspore/ops/function/vmap_func.py +1 -0
  80. mindspore/ops/operations/__init__.py +5 -2
  81. mindspore/ops/operations/_embedding_cache_ops.py +1 -1
  82. mindspore/ops/operations/_grad_ops.py +3 -4
  83. mindspore/ops/operations/_inner_ops.py +56 -1
  84. mindspore/ops/operations/_quant_ops.py +4 -4
  85. mindspore/ops/operations/_rl_inner_ops.py +1 -1
  86. mindspore/ops/operations/array_ops.py +15 -4
  87. mindspore/ops/operations/custom_ops.py +1 -1
  88. mindspore/ops/operations/debug_ops.py +1 -1
  89. mindspore/ops/operations/image_ops.py +3 -3
  90. mindspore/ops/operations/inner_ops.py +49 -0
  91. mindspore/ops/operations/math_ops.py +65 -3
  92. mindspore/ops/operations/nn_ops.py +95 -28
  93. mindspore/ops/operations/random_ops.py +2 -0
  94. mindspore/ops/operations/sparse_ops.py +4 -4
  95. mindspore/ops/silent_check.py +162 -0
  96. mindspore/parallel/__init__.py +3 -2
  97. mindspore/parallel/_auto_parallel_context.py +82 -3
  98. mindspore/parallel/_parallel_serialization.py +34 -2
  99. mindspore/parallel/_tensor.py +3 -1
  100. mindspore/parallel/_transformer/transformer.py +8 -8
  101. mindspore/parallel/checkpoint_transform.py +191 -45
  102. mindspore/profiler/parser/ascend_cluster_generator.py +111 -0
  103. mindspore/profiler/parser/ascend_communicate_generator.py +315 -0
  104. mindspore/profiler/parser/ascend_flops_generator.py +8 -2
  105. mindspore/profiler/parser/ascend_fpbp_generator.py +8 -2
  106. mindspore/profiler/parser/ascend_hccl_generator.py +2 -2
  107. mindspore/profiler/parser/ascend_msprof_exporter.py +30 -6
  108. mindspore/profiler/parser/ascend_msprof_generator.py +16 -5
  109. mindspore/profiler/parser/ascend_op_generator.py +15 -7
  110. mindspore/profiler/parser/ascend_timeline_generator.py +5 -2
  111. mindspore/profiler/parser/base_timeline_generator.py +11 -3
  112. mindspore/profiler/parser/cpu_gpu_timeline_generator.py +2 -1
  113. mindspore/profiler/parser/framework_parser.py +8 -2
  114. mindspore/profiler/parser/memory_usage_parser.py +8 -2
  115. mindspore/profiler/parser/minddata_analyzer.py +8 -2
  116. mindspore/profiler/parser/minddata_parser.py +1 -1
  117. mindspore/profiler/parser/msadvisor_analyzer.py +4 -2
  118. mindspore/profiler/parser/msadvisor_parser.py +9 -3
  119. mindspore/profiler/profiling.py +97 -25
  120. mindspore/rewrite/api/node.py +1 -1
  121. mindspore/rewrite/api/symbol_tree.py +2 -2
  122. mindspore/rewrite/parsers/for_parser.py +6 -6
  123. mindspore/rewrite/parsers/module_parser.py +4 -4
  124. mindspore/scipy/ops.py +55 -5
  125. mindspore/scipy/optimize/__init__.py +3 -2
  126. mindspore/scipy/optimize/linear_sum_assignment.py +38 -33
  127. mindspore/train/callback/_checkpoint.py +8 -8
  128. mindspore/train/callback/_landscape.py +2 -3
  129. mindspore/train/callback/_summary_collector.py +6 -7
  130. mindspore/train/dataset_helper.py +6 -0
  131. mindspore/train/model.py +17 -5
  132. mindspore/train/serialization.py +6 -1
  133. mindspore/train/summary/_writer_pool.py +1 -1
  134. mindspore/train/summary/summary_record.py +5 -6
  135. mindspore/version.py +1 -1
  136. {mindspore-2.2.10.dist-info → mindspore-2.2.14.dist-info}/METADATA +3 -2
  137. {mindspore-2.2.10.dist-info → mindspore-2.2.14.dist-info}/RECORD +140 -148
  138. mindspore/lib/plugin/libmindspore_ascend.so.2 +0 -0
  139. mindspore/ops/_op_impl/_custom_op/flash_attention/__init__.py +0 -0
  140. mindspore/ops/_op_impl/_custom_op/flash_attention/attention.py +0 -406
  141. mindspore/ops/_op_impl/_custom_op/flash_attention/constants.py +0 -41
  142. mindspore/ops/_op_impl/_custom_op/flash_attention/flash_attention_bwd.py +0 -467
  143. mindspore/ops/_op_impl/_custom_op/flash_attention/flash_attention_fwd.py +0 -563
  144. mindspore/ops/_op_impl/_custom_op/flash_attention/flash_attention_impl.py +0 -193
  145. mindspore/ops/_op_impl/_custom_op/flash_attention/tik_ops_utils.py +0 -435
  146. mindspore/ops/_op_impl/_custom_op/flash_attention/tiling_strategy/__init__.py +0 -0
  147. mindspore/ops/_op_impl/_custom_op/flash_attention/tiling_strategy/sparse_tiling.py +0 -45
  148. mindspore/ops/_op_impl/_custom_op/flash_attention/tiling_strategy/strategy.py +0 -67
  149. mindspore/ops/_op_impl/_custom_op/flash_attention/tiling_strategy/wukong_tiling.py +0 -62
  150. {mindspore-2.2.10.dist-info → mindspore-2.2.14.dist-info}/WHEEL +0 -0
  151. {mindspore-2.2.10.dist-info → mindspore-2.2.14.dist-info}/entry_points.txt +0 -0
  152. {mindspore-2.2.10.dist-info → mindspore-2.2.14.dist-info}/top_level.txt +0 -0
@@ -243,6 +243,33 @@ def _extract_pipeline_stage_num(strategy_file):
     return pipeline_stage_num


+def _extract_src_dst_layout_map_by_src(src_strategy_file=None, dst_strategy_file=None):
+    """Extract strategy list by src strategy"""
+    src_layout_map = _extract_layout_map(src_strategy_file)
+    dst_layout_map = _extract_layout_map(dst_strategy_file)
+    if dst_layout_map is None:
+        return src_layout_map, dst_layout_map
+    for param_name in list(dst_layout_map.keys()):
+        if param_name in src_layout_map.keys():
+            continue
+        dst_layout_map.pop(param_name)
+    stage_id = 0
+    if src_strategy_file[-5:] == ".json":
+        with open(src_strategy_file, 'r') as f:
+            json_content = json.load(f)
+            strategy_items = json_content.get("parallel_strategy_item")
+            if not strategy_items:
+                raise ValueError("The strategy file {} if empty.".format(src_strategy_file))
+            stage_id = strategy_items.get(list(strategy_items.keys())[0]).get('stage')
+    else:
+        src_parallel_strategy_map = _load_protobuf_strategy(src_strategy_file)
+        strategy_items = src_parallel_strategy_map.parallel_strategy_item
+        if not strategy_items:
+            raise ValueError("The strategy file {} if empty.".format(src_strategy_file))
+        stage_id = strategy_items[0].parallel_strategys.stage
+    return src_layout_map, dst_layout_map, stage_id
+
+
 def _extract_src_dst_layout_map(rank_id, src_strategy_file=None, dst_strategy_file=None):
     """Extract strategy list"""
     src_layout_map = _extract_layout_map(src_strategy_file)
@@ -341,6 +368,7 @@ def _transform_parallel_checkpoint(rank_id, param_total_dict, param_attr_dict, s
     Transform model parallel dimension for distributed checkpoint files.
     """
     transform_param_dict = {}
+    device_num = -1
     for param_name, _ in param_total_dict.items():
         tensor_shape = list(param_total_dict[param_name].values())[0].shape
         from_dev_matrix = [1]
@@ -394,14 +422,18 @@ def _transform_parallel_checkpoint(rank_id, param_total_dict, param_attr_dict, s
         to_info_tuple = (to_opt_shard_size, to_dev_matrix_origin, to_tensor_map_origin, origin_tensor_shape)
         _insert_opt_shard_reshape(param_rank_map, from_info_tuple, to_info_tuple)
         transform_operator_stack = _generate_transform_operator_stack(param_rank_map, rank_id)
-        _apply_tensor_transform_operators(transform_operator_stack, param_total_dict[param_name], device_num)
-        transform_tensor = ms.Tensor(param_total_dict[param_name][rank_id % device_num])
+        param_total_dict_copy = param_total_dict[param_name].copy()
+        _apply_tensor_transform_operators(transform_operator_stack, param_total_dict_copy, device_num)
+        transform_tensor = ms.Tensor(param_total_dict_copy[rank_id % device_num])
         requires_grad = param_attr_dict[param_name][rank_id % device_num][0]
         layerwise_parallel = param_attr_dict[param_name][rank_id % device_num][1]
         transform_para = ms.Parameter(transform_tensor, param_name, requires_grad, layerwise_parallel)
         if param_type_dict[param_name][rank_id % device_num] == "BFloat16":
             transform_para.set_dtype(ms.bfloat16)
         transform_param_dict[param_name] = transform_para
+    if device_num < 0:
+        raise ValueError("None of the parameters in checkpoint file are in either src strategy or "
+                         "dst strategy. Please check correctness of strategy files.")

     # Handle those parameter like learning_rate, global_step which not in strategy_file.
     for param_name, _ in param_total_dict.items():
@@ -223,7 +223,9 @@ def _load_tensor(tensor, dev_mat, tensor_map, rank_id=-1):
     tensor_strategy = _get_tensor_strategy(dev_mat, tensor_map)
     tensor_slice_index = _get_tensor_slice_index(dev_mat, tensor_strategy, tensor_map, rank)
     if tensor.dtype == mstype.bfloat16:
-        tensor = tensor.float()
+        from mindspore.ops.operations import Cast
+        cpu_cast = Cast().set_device("CPU")
+        tensor = cpu_cast(tensor, mstype.float32)
     np_tensor = tensor.asnumpy()
     np_tensor_list = _chunk_tensor_by_strategy(np_tensor, tensor_strategy)
     np_tensor_slice = np_tensor_list[int(tensor_slice_index)]
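The `_load_tensor` change above replaces the in-place `tensor.float()` call with an explicit CPU cast before `asnumpy()`. A minimal sketch of that pattern outside the diff, assuming a standalone bfloat16 tensor; the helper name `bf16_to_numpy` is illustrative and not part of the package:

    from mindspore.common import dtype as mstype
    from mindspore.ops.operations import Cast

    def bf16_to_numpy(tensor):
        # numpy has no bfloat16, so cast bfloat16 tensors to float32 on the CPU first,
        # mirroring the cpu_cast used in _load_tensor above.
        if tensor.dtype == mstype.bfloat16:
            cpu_cast = Cast().set_device("CPU")
            tensor = cpu_cast(tensor, mstype.float32)
        return tensor.asnumpy()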
@@ -805,14 +805,14 @@ class MultiHeadAttention(Cell):
         - **attention_mask** (Tensor) - If the use_past is False or is_first_iteration=True, the attention mask
           matrix should ba (batch_size, src_seq_length, tgt_seq_length), or None. None means there will be no mask
           in softmax computation. Otherwise, the mask must be (batch_size, 1, tgt_seq_length)
-        - **key_past** (Tensor) - Float16 tensor with shape (batch_size, num_heads, size_per_head, tgt_seq_length).
+        - **key_past** (Tensor) - float16 tensor with shape (batch_size, num_heads, size_per_head, tgt_seq_length).
           The past calculated key vector. Used for incremental prediction when the use_past is True.
           Default None.
-        - **value_past** (Tensor) - Float16 tensor with shape
+        - **value_past** (Tensor) - float16 tensor with shape
           (batch_size, num_heads, tgt_seq_length, size_per_head).
           The past calculated value vector. Used for incremental prediction when the use_past is True.
           Default None.
-        - **batch_valid_length** (Tensor) - Int32 tensor with shape (batch_size,) the past calculated the index.
+        - **batch_valid_length** (Tensor) - int32 tensor with shape (batch_size,) the past calculated the index.
           Used for incremental prediction when the use_past is True. Default None.

     Outputs:
@@ -1412,7 +1412,7 @@ class TransformerEncoderLayer(Cell):
           be no mask in softmax computation. Otherwise, should be [batch_size, 1, hidden_size]
        - **init_reset** (Tensor) - A bool tensor with shape [1], used to clear the past key parameter and
          past value parameter used in the incremental prediction. Only valid when use_past is True. Default True.
-        - **batch_valid_length** (Tensor) - Int32 tensor with shape [batch_size] the past calculated the index.
+        - **batch_valid_length** (Tensor) - int32 tensor with shape [batch_size] the past calculated the index.
          Used for incremental prediction when the use_past is True. Default None.

     Outputs:
@@ -1824,7 +1824,7 @@ class TransformerDecoderLayer(Cell):
           means there will be no mask in softmax computation in cross attention. Default None.
        - **init_reset** (Tensor) - A bool tensor with shape [1], used to clear the past key parameter and
          past value parameter used in the incremental prediction. Only valid when use_past is True. Default True.
-        - **batch_valid_length** (Tensor) - Int32 tensor with shape [batch_size] the past calculated the index.
+        - **batch_valid_length** (Tensor) - int32 tensor with shape [batch_size] the past calculated the index.
          Used for incremental prediction when the use_past is True. Default None.

     Outputs:
@@ -2333,7 +2333,7 @@ class TransformerEncoder(Cell):
           be no mask in softmax computation. Otherwise, should be [batch_size, 1, hidden_size]
        - **init_reset** (Tensor) - A bool tensor with shape [1], used to clear the past key parameter and
          past value parameter used in the incremental prediction. Only valid when use_past is True. Default True.
-        - **batch_valid_length** (Tensor) - Int32 tensor with shape [batch_size] the past calculated the index.
+        - **batch_valid_length** (Tensor) - int32 tensor with shape [batch_size] the past calculated the index.
          Used for incremental prediction when the use_past is True. Default None.

     Outputs:
@@ -2589,7 +2589,7 @@ class TransformerDecoder(Cell):
           means there will be no mask in softmax computation in cross attention. Default None.
        - **init_reset** (Tensor) - A bool tensor with shape [1], used to clear the past key parameter and
          past value parameter used in the incremental prediction. Only valid when use_past is True. Default True.
-        - **batch_valid_length** (Tensor) - Int32 tensor with shape [batch_size] the past calculated the index.
+        - **batch_valid_length** (Tensor) - int32 tensor with shape [batch_size] the past calculated the index.
          Used for incremental prediction when the use_past is True. Default None.

     Outputs:
@@ -2842,7 +2842,7 @@ class Transformer(Cell):
           seq_length, hidden_size], this should be none if the decoder layer is 0 or the user wants no mask.
        - **init_reset** (Tensor) - A bool tensor with shape [1], used to clear the past key parameter and
          past value parameter used in the incremental prediction. Only valid when use_past is True. Default True.
-        - **batch_valid_length** (Tensor) - Int32 tensor with shape [batch_size] the past calculated the index.
+        - **batch_valid_length** (Tensor) - int32 tensor with shape [batch_size] the past calculated the index.
          Used for incremental prediction when the use_past is True. Default None.

     Outputs:
@@ -25,11 +25,11 @@ from mindspore.common import dtype as mstype
 from mindspore.parallel._parallel_serialization import _rank_list_for_transform_parallel_checkpoint, \
     _transform_parallel_checkpoint, _get_device_num_from_strategy, _make_dir, \
     _extract_layout_map, _extract_src_dst_layout_map, _parameter_not_in_local_stage, _extract_pipeline_stage_num, \
-    _merge_protobuf_strategy, _merge_json_strategy
+    _merge_protobuf_strategy, _merge_json_strategy, _extract_src_dst_layout_map_by_src


 __all__ = ["merge_pipeline_strategys", "rank_list_for_transform", "transform_checkpoint_by_rank",
-           "transform_checkpoints"]
+           "transform_checkpoints", "load_segmented_checkpoints"]


 def merge_pipeline_strategys(src_strategy_dirs, dst_strategy_file):
@@ -71,7 +71,6 @@ def merge_pipeline_strategys(src_strategy_dirs, dst_strategy_file):
     _merge_json_strategy(src_strategy_files_json, dst_strategy_file)


-
 def rank_list_for_transform(rank_id, src_strategy_file=None, dst_strategy_file=None):
     """
     List of original distributed checkpoint rank index for obtaining the target checkpoint of a rank_id
@@ -222,48 +221,63 @@ def transform_checkpoint_by_rank(rank_id, checkpoint_files_map, save_checkpoint_
     ms.save_checkpoint(transform_param_list, save_checkpoint_file_name)


-def transform_checkpoints(src_checkpoints_dir, dst_checkpoints_dir, ckpt_prefix, src_strategy_file=None,
-                          dst_strategy_file=None):
-    """
-    Transform distributed checkpoint from source sharding strategy to destination sharding strategy for a rank.
-    For more details about converting distributed Checkpoint, please refer to
-    `Model Transformation <https://www.mindspore.cn/tutorials/experts/en/r2.2/parallel/model_transformation.html>`_.
-
-    Note:
-        The `src_checkpoints_dir` directory structure should be organized like "src_checkpoints_dir/rank_0/a.ckpt", the
-        rank number should be set to a subdirectory and the checkpoint file is stored in this subdirectory. If multiple
-        files exist in a rank directory, the last file in the lexicgraphic order would be selected.
-
-    Args:
-        src_checkpoints_dir (str): The source checkpoints directory.
-        dst_checkpoints_dir (str): The destination checkpoints directory to save the converted checkpoints.
-        ckpt_prefix (str): The destination checkpoint name prefix.
-        src_strategy_file (str): Name of source sharding strategy file which saved by
-                                 'mindspore.set_auto_parallel_context(strategy_ckpt_save_file)'.
-                                 when the 'src_strategy_file' is None, it means that the source sharding strategy is
-                                 without any sharing for each parameter. Default:None.
-        dst_strategy_file (str): Name of destination sharding strategy file which saved by
-                                 'mindspore.set_auto_parallel_context(strategy_ckpt_save_file)'.
-                                 when the 'dst_strategy_file' is None, it means that the destination sharding strategy
-                                 is without any sharing for each parameter. Default:None.
-
-    Raises:
-        ValueError: `src_strategy_file` or `dst_strategy_file` is incorrect.
-        NotADirectoryError: `src_checkpoints_dir` or `dst_checkpoints_dir` is not a directory.
-        ValueError: The checkpoint file is missing in `src_checkpoints_dir`.
-        TypeError: `src_strategy_file` or `dst_strategy_file` is not a string.
-
-    Examples:
-        >>> import mindspore as ms
-        >>> ms.transform_checkpoints(src_checkpoints_dir, dst_checkpoints_dir, "dst_checkpoint",
-        ...                          "./src_strategy.ckpt", "./dst_strategy.ckpt")
-
-    """
-    if not os.path.isdir(src_checkpoints_dir):
-        raise NotADirectoryError("src_checkpoints_dir {} is not a directory.".format(src_checkpoints_dir))
-    _make_dir(dst_checkpoints_dir, "path")
-    if not isinstance(ckpt_prefix, str):
-        raise TypeError("The ckpt_prefix should be a str.")
+def _transform_checkpoint_by_stage(src_checkpoints_dir, dst_checkpoints_dir, ckpt_prefix, src_strategy_file,
+                                   dst_strategy_file=None):
+    """Transform checkpoint for stage in src_strategy_file"""
+    param_total_dict = defaultdict(dict)
+    param_attr_dict = defaultdict(dict)
+    param_type_dict = defaultdict(dict)
+    src_strategy_list, dst_strategy_list, stage_id = _extract_src_dst_layout_map_by_src(src_strategy_file, \
+                                                                                        dst_strategy_file)
+    src_stage_device_num = np.prod(src_strategy_list.get(list(src_strategy_list.keys())[0])[0]) if src_strategy_list \
+        is not None else 1
+    dst_stage_device_num = np.prod(dst_strategy_list.get(list(dst_strategy_list.keys())[0])[0]) if dst_strategy_list \
+        is not None else 1
+    origin_dst_strategy_list = _extract_layout_map(dst_strategy_file)
+    origin_src_strategy_list = _extract_layout_map(src_strategy_file)
+    checkpoint_files_map = {}
+    src_rank_id_start = stage_id * src_stage_device_num
+    for local_rank in range(src_stage_device_num):
+        rank_id = src_rank_id_start + local_rank
+        checkpoint_file_name = os.path.join(src_checkpoints_dir, "rank_{}".format(rank_id), "*.ckpt")
+        rank_ckpts = glob.glob(checkpoint_file_name)
+        rank_ckpts.sort()
+        for checkpoint_file in rank_ckpts:
+            if not os.path.isfile(checkpoint_file):
+                ms.log.warning("{} is not a checkpoint file.".format(checkpoint_file))
+                continue
+            checkpoint_files_map[rank_id] = checkpoint_file
+    for rank, local_file in checkpoint_files_map.items():
+        if not os.path.exists(local_file):
+            raise ValueError("Checkpoint file {} in rank {} not exits: ".format(local_file, rank))
+    for rank, file_name in checkpoint_files_map.items():
+        ckpt_dict = ms.load_checkpoint(file_name)
+        for param_name, param in ckpt_dict.items():
+            # cut the parameter not in the pipeline stage.
+            if _parameter_not_in_local_stage(param_name, origin_src_strategy_list, src_strategy_list) \
+                    and _parameter_not_in_local_stage(param_name, origin_dst_strategy_list, dst_strategy_list):
+                continue
+            src_rank = rank % src_stage_device_num
+            param_type_dict[param_name][src_rank] = str(param.data.dtype)
+            if param.data.dtype == mstype.bfloat16:
+                param.set_dtype(mstype.float32)
+            param_total_dict[param_name][src_rank] = param.data.asnumpy()
+            param_attr_dict[param_name][src_rank] = (param.requires_grad, param.layerwise_parallel)
+    for local_rank_id in range(dst_stage_device_num):
+        transform_param_list = _transform_parallel_checkpoint(local_rank_id, param_total_dict,
+                                                              param_attr_dict, src_strategy_list, dst_strategy_list,
+                                                              param_type_dict)
+        save_checkpoint_file = "{}{}_part{}.ckpt".format(ckpt_prefix, local_rank_id, stage_id)
+        save_checkpoint_file_dir = os.path.join(dst_checkpoints_dir, "rank_{}".format(local_rank_id))
+        if not os.path.exists(save_checkpoint_file_dir):
+            _make_dir(save_checkpoint_file_dir, "path")
+        save_checkpoint_file_name = os.path.join(save_checkpoint_file_dir, save_checkpoint_file)
+        ms.save_checkpoint(transform_param_list, save_checkpoint_file_name)
+
+
+def _transform_checkpoints(src_checkpoints_dir, dst_checkpoints_dir, ckpt_prefix, src_strategy_file=None,
+                           dst_strategy_file=None):
+    """Transform checkpoints for all stages in src_strategy_file"""
     checkpoints_rank_dir_list = os.path.join(src_checkpoints_dir, "rank_[0-9]*")
     all_checkpoint_files_map = {}
     for checkpoint_dir in glob.glob(checkpoints_rank_dir_list):
@@ -336,3 +350,135 @@ def transform_checkpoints(src_checkpoints_dir, dst_checkpoints_dir, ckpt_prefix,
         ms.save_checkpoint(transform_param_list, save_checkpoint_file_name)
         del param_total_dict_copy
     del param_total_dict
+
+
+def transform_checkpoints(src_checkpoints_dir, dst_checkpoints_dir, ckpt_prefix, src_strategy_file=None,
+                          dst_strategy_file=None):
+    """
+    Transform distributed checkpoint from source sharding strategy to destination sharding strategy for a rank.
+    For more details about converting distributed Checkpoint, please refer to
+    `Model Transformation <https://www.mindspore.cn/tutorials/experts/en/r2.2/parallel/model_transformation.html>`_.
+
+    Note:
+        The `src_checkpoints_dir` directory structure should be organized like "src_checkpoints_dir/rank_0/a.ckpt", the
+        rank number should be set to a subdirectory and the checkpoint file is stored in this subdirectory. If multiple
+        files exist in a rank directory, the last file in the lexicgraphic order would be selected.
+
+    Args:
+        src_checkpoints_dir (str): The source checkpoints directory.
+        dst_checkpoints_dir (str): The destination checkpoints directory to save the converted checkpoints.
+        ckpt_prefix (str): The destination checkpoint name prefix.
+        src_strategy_file (str): Name of source sharding strategy file which saved by
+                                 'mindspore.set_auto_parallel_context(strategy_ckpt_save_file)'.
+                                 when the 'src_strategy_file' is None, it means that the source sharding strategy is
+                                 without any sharing for each parameter. Default:None.
+        dst_strategy_file (str): Name of destination sharding strategy file which saved by
+                                 'mindspore.set_auto_parallel_context(strategy_ckpt_save_file)'.
+                                 when the 'dst_strategy_file' is None, it means that the destination sharding strategy
+                                 is without any sharing for each parameter. Default:None.
+
+    Raises:
+        ValueError: `src_strategy_file` or `dst_strategy_file` is incorrect.
+        NotADirectoryError: `src_checkpoints_dir` or `dst_checkpoints_dir` is not a directory.
+        ValueError: The checkpoint file is missing in `src_checkpoints_dir`.
+        TypeError: `src_strategy_file` or `dst_strategy_file` is not a string.
+
+    Examples:
+        >>> import mindspore as ms
+        >>> ms.transform_checkpoints(src_checkpoints_dir, dst_checkpoints_dir, "dst_checkpoint",
+        ...                          "./src_strategy.ckpt", "./dst_strategy.ckpt")
+
+    """
+    if not os.path.isdir(src_checkpoints_dir):
+        raise NotADirectoryError("src_checkpoints_dir {} is not a directory.".format(src_checkpoints_dir))
+    _make_dir(dst_checkpoints_dir, "path")
+    if not isinstance(ckpt_prefix, str):
+        raise TypeError("The ckpt_prefix should be a str.")
+    if src_strategy_file and os.path.dirname(src_strategy_file) and not os.path.exists(
+            os.path.dirname(src_strategy_file)):
+        raise ValueError("The director of src_strategy_file: {} is not exists.".
+                         format(os.path.dirname(src_strategy_file)))
+    if dst_strategy_file and os.path.dirname(dst_strategy_file) and not os.path.exists(
+            os.path.dirname(dst_strategy_file)):
+        raise ValueError("The director of dst_strategy_file: {} is not exists.".
+                         format(os.path.dirname(dst_strategy_file)))
+    src_layout_map = _extract_layout_map(src_strategy_file)
+    dst_layout_map = _extract_layout_map(dst_strategy_file)
+    pipeline_stage_num = _extract_pipeline_stage_num(src_strategy_file)
+    if src_layout_map:
+        src_param_keys = {param_name for param_name in src_layout_map if not param_name.startswith("accu_grads")}
+    if dst_layout_map:
+        dst_param_keys = {param_name for param_name in dst_layout_map if not param_name.startswith("accu_grads")}
+    if src_layout_map and dst_layout_map and pipeline_stage_num == 1 \
+            and src_param_keys.issubset(dst_param_keys) and len(src_param_keys) < len(dst_param_keys):
+        dst_stage_num = _extract_pipeline_stage_num(dst_strategy_file)
+        if dst_stage_num > 1:
+            raise NotImplementedError("When using unmerged src strategy, dst strategy doesn't \
+            support strategy with pipeline parallel.")
+        _transform_checkpoint_by_stage(src_checkpoints_dir, dst_checkpoints_dir, ckpt_prefix,
+                                       src_strategy_file, dst_strategy_file)
+    else:
+        _transform_checkpoints(src_checkpoints_dir, dst_checkpoints_dir, ckpt_prefix,
+                               src_strategy_file, dst_strategy_file)
+
+
+def load_segmented_checkpoints(ckpt_file_dir, net=None, strict_load=False, filter_prefix=None,
+                               dec_key=None, dec_mode="AES-GCM", specify_prefix=None, choice_func=None):
+    """
+    Load checkpoint info from a specified file. If the specified ckpt_file_dir path contains multiple
+    checkpoint files, all checkpoint files will be loaded one by one and the combined dictionary will be return.
+
+    Note:
+        - `specify_prefix` and `filter_prefix` do not affect each other.
+        - If none of the parameters are loaded from checkpoint file, it will throw ValueError.
+        - `specify_prefix` and `filter_prefix` are in the process of being deprecated,
+          `choice_func` is recommended instead.
+          And using either of those two args will override `choice_func` at the same time.
+
+    Args:
+        ckpt_file_dir (str): Checkpoint file directory.
+        net (Cell): The network where the parameters will be loaded. Default: ``None`` .
+        strict_load (bool): Whether to strict load the parameter into net. If ``False`` , it will load parameter
+                            into net when parameter name's suffix in checkpoint file is the same as the
+                            parameter in the network. When the types are inconsistent perform type conversion
+                            on the parameters of the same type, such as float32 to float16. Default: ``False`` .
+        filter_prefix (Union[str, list[str], tuple[str]]): Deprecated(see `choice_func`). Parameters starting with the
+            filter_prefix will not be loaded. Default: ``None`` .
+        dec_key (Union[None, bytes]): Byte type key used for decryption. If the value is ``None`` , the decryption
+                                      is not required. Default: ``None`` .
+        dec_mode (str): This parameter is valid only when dec_key is not set to ``None`` . Specifies the decryption
+                        mode, currently supports ``"AES-GCM"`` and ``"AES-CBC"`` and ``"SM4-CBC"`` .
+                        Default: ``"AES-GCM"`` .
+        specify_prefix (Union[str, list[str], tuple[str]]): Deprecated(see `choice_func`). Parameters starting with the
+            specify_prefix will be loaded. Default: ``None`` .
+        choice_func (Union[None, function]) : Input value of the function is a Parameter name of type string,
+            and the return value is a bool. If returns ``True`` , the Parameter
+            that matches the custom condition will be loaded. If returns ``False`` , the Parameter that
+            matches the custom condition will be removed. Default: ``None`` .
+
+    Returns:
+        Dict, key is parameter name, value is a Parameter or string. When the `append_dict` parameter of
+        :func:`mindspore.save_checkpoint` and the `append_info` parameter of :class:`mindspore.train.CheckpointConfig`
+        are used to save the checkpoint, `append_dict` and `append_info` are dict types, and their value are string,
+        then the return value obtained by loading checkpoint is string, and in other cases the return value is
+        Parameter.
+
+    Raises:
+        TypeError: Input ckpt_file_dir is not a string.
+        ValueError: Checkpoint file directory doesn't exist. Or it's not a directory
+        ValueError: Checkpoint file's format is incorrect.
+        ValueError: Parameter's dict is None after load checkpoint file.
+        TypeError: The type of `specify_prefix` or `filter_prefix` is incorrect.
+    """
+    if not isinstance(ckpt_file_dir, str):
+        raise TypeError("The ckpt_file_dir should be a str.")
+    if not os.path.isdir(ckpt_file_dir):
+        raise ValueError("The dst_strategy_file: {} doesn't exists. Or it's not a directory.".
+                         format(os.path.dirname(ckpt_file_dir)))
+    checkpoint_file_name = os.path.join(ckpt_file_dir, "*.ckpt")
+    rank_ckpts = glob.glob(checkpoint_file_name)
+    parameter_dict = {}
+    for checkpoint_file in rank_ckpts:
+        parameter_dict.update(ms.load_checkpoint(checkpoint_file, net, strict_load, filter_prefix, dec_key,
+                                                 dec_mode, specify_prefix, choice_func))
+    return parameter_dict
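Taken together, this hunk turns the public `transform_checkpoints` into a dispatcher over the private `_transform_checkpoint_by_stage` / `_transform_checkpoints` helpers and adds the new `load_segmented_checkpoints` function. A minimal usage sketch, assuming the directory names and rank-0 layout below are placeholders rather than anything shipped with the package:

    import mindspore as ms
    from mindspore.parallel.checkpoint_transform import load_segmented_checkpoints

    # Re-shard checkpoints stored as ./src_checkpoints/rank_*/xxx.ckpt into the destination
    # sharding strategy described by ./dst_strategy.ckpt.
    ms.transform_checkpoints("./src_checkpoints", "./dst_checkpoints", "dst_checkpoint",
                             "./src_strategy.ckpt", "./dst_strategy.ckpt")

    # Load every *.ckpt file found in one destination rank directory and merge them into a dict.
    param_dict = load_segmented_checkpoints("./dst_checkpoints/rank_0")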
@@ -0,0 +1,111 @@
+# Copyright 2024 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""trace step time analyse model"""
+import csv
+import fnmatch
+import json
+import logging
+import os
+import stat
+
+import numpy as np
+from mindspore.profiler.common.exceptions.exceptions import ProfilerIOException
+
+
+def find_files(directory, pattern):
+    """Find files with feature 'pattern' from the directory"""
+    file_list = []
+    for root, _, files in os.walk(directory):
+        files.sort(key=lambda x: os.path.getctime(os.path.join(directory, x)))
+        for basename in files:
+            if fnmatch.fnmatch(basename, pattern):
+                filename = os.path.join(root, basename)
+                file_list.append(filename)
+    return file_list
+
+
+class AscendClusterGenerator:
+    """Generate step trace time data from msprof*.json"""
+
+    def __init__(self, source_path):
+        self.root_path = source_path
+        self.msprof_data = np.array([])
+        self.step_trace_time = {'Step': None, 'Computing': 0, 'comunNotOverlp': 0, 'Overlapped': 0, 'Communication': 0,
+                                'Free': 0, 'Stage': 0, 'Bubble': 0, 'comunNotOverlpRec': 0}
+        self.msprof_data_df = np.dtype([('name', object), ('ts', float), ('dur', float)])
+        self.trace_step_time_df = np.dtype(
+            [('Step', int), ('Computing', float), ('comunNotOverlp', float), ('Communication', float), ('Free', float),
+             ('Stage', float), ('Bubble', float), ('comunNotOverlpRec', float)])
+        self.title = ['Step', 'Computing', 'Communication(Not Overlapped)', 'Overlapped', 'Communication', 'Free',
+                      'Stage', 'Bubble', 'Communication(Not Overlapped and Exclude Receive)']
+
+    def parse(self):
+        """
+        Analyse msprof json generate cluster data.
+        """
+        self.read_msprof()
+        if self.msprof_data.size < 1:
+            return
+        self.step_trace_time['Computing'] = np.sum(self.msprof_data[self.msprof_data['name'] == 'Computing']['dur'])
+        self.step_trace_time['comunNotOverlp'] = np.sum(
+            self.msprof_data[self.msprof_data['name'] == 'Communication(Not Overlapped)']['dur'])
+        self.step_trace_time['Communication'] = np.sum(
+            self.msprof_data[self.msprof_data['name'] == 'Communication']['dur'])
+        self.step_trace_time['Free'] = np.sum(self.msprof_data[self.msprof_data['name'] == 'Free']['dur'])
+        self.step_trace_time['Bubble'] = np.sum(
+            self.msprof_data[np.char.find(self.msprof_data['name'].astype('str'), '/Receive-op')]['dur'])
+
+        self.step_trace_time['Overlapped'] = self.step_trace_time['Communication'] - self.step_trace_time[
+            'comunNotOverlp']
+        self.step_trace_time['Stage'] = np.max(self.msprof_data['ts'] + self.msprof_data['dur']) - np.min(
+            self.msprof_data['ts']) - self.step_trace_time['Bubble']
+        self.step_trace_time['comunNotOverlpRec'] = self.step_trace_time['comunNotOverlp'] - self.step_trace_time[
+            'Bubble']
+
+    def read_msprof(self):
+        """
+        read msprof json information into memory.
+        """
+        msprof_data = []
+        for file in find_files(self.root_path, "msprof_*.json"):
+            with open(file) as jsonfile:
+                for row in json.load(jsonfile):
+                    if row.get('name') in ['Computing', 'Communication', 'Communication(Not Overlapped)',
+                                           'Free'] or row.get('name').find('/Receive-op'):
+                        name = row.get('name', '')
+                        ts = row.get('ts', 0)
+                        dur = row.get('dur', 0)
+                        msprof_data.append(tuple([name, ts, dur]))
+        self.msprof_data = np.array(msprof_data, dtype=self.msprof_data_df)
+
+    def write(self, step_trace_time_path):
+        """
+        Write the step trace time csv.
+
+        Args:
+            step_trace_time_path(str): step_trace_time.csv path.
+        """
+        try:
+            with os.fdopen(os.open(step_trace_time_path,
+                                   os.O_WRONLY | os.O_CREAT | os.O_TRUNC, stat.S_IWUSR | stat.S_IRUSR),
+                           'w') as step_trace_time:
+                writer = csv.writer(step_trace_time)
+                writer.writerow(self.title)
+                writer.writerow([v for _, v in self.step_trace_time.items()])
+        except (IOError, OSError) as err:
+            logging.critical('Error occurred when write step trace time file: %s', err)
+            raise ProfilerIOException() from err
+        if os.path.exists(step_trace_time_path):
+            os.chmod(step_trace_time_path, stat.S_IREAD | stat.S_IWRITE)
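For context, a minimal usage sketch of the new `AscendClusterGenerator`, assuming the profiling output directory below is a placeholder that already contains exported msprof_*.json timeline files:

    from mindspore.profiler.parser.ascend_cluster_generator import AscendClusterGenerator

    # Aggregate Computing / Communication / Free / Bubble durations from the msprof_*.json
    # timelines found under the given directory.
    generator = AscendClusterGenerator("./profiler_data/PROF_XXX")
    generator.parse()
    # Write the aggregated step trace times as a single-row CSV.
    generator.write("./profiler_data/step_trace_time.csv")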