mindspore 2.3.0rc1-cp39-cp39-manylinux1_x86_64.whl → 2.3.0rc2-cp39-cp39-manylinux1_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of mindspore has been flagged as potentially problematic.
- mindspore/.commit_id +1 -1
- mindspore/__init__.py +1 -1
- mindspore/_akg/akg/utils/tbe_codegen_utils.py +13 -3
- mindspore/_c_dataengine.cpython-39-x86_64-linux-gnu.so +0 -0
- mindspore/_c_expression.cpython-39-x86_64-linux-gnu.so +0 -0
- mindspore/_checkparam.py +20 -0
- mindspore/_extends/parse/parser.py +1 -1
- mindspore/_extends/parse/standard_method.py +6 -5
- mindspore/_mindspore_offline_debug.cpython-39-x86_64-linux-gnu.so +0 -0
- mindspore/amp.py +5 -5
- mindspore/bin/cache_admin +0 -0
- mindspore/bin/cache_server +0 -0
- mindspore/boost/boost_cell_wrapper.py +1 -1
- mindspore/boost/group_loss_scale_manager.py +1 -1
- mindspore/common/__init__.py +4 -2
- mindspore/common/_register_for_recompute.py +48 -0
- mindspore/common/_stub_tensor.py +1 -0
- mindspore/common/api.py +56 -4
- mindspore/common/dtype.py +5 -3
- mindspore/common/dump.py +2 -2
- mindspore/common/hook_handle.py +51 -4
- mindspore/common/initializer.py +1 -1
- mindspore/common/jit_config.py +17 -6
- mindspore/common/parameter.py +7 -2
- mindspore/common/recompute.py +247 -0
- mindspore/common/sparse_tensor.py +2 -2
- mindspore/common/symbol.py +1 -1
- mindspore/common/tensor.py +74 -36
- mindspore/communication/__init__.py +3 -3
- mindspore/communication/management.py +30 -30
- mindspore/context.py +28 -15
- mindspore/dataset/__init__.py +5 -5
- mindspore/dataset/audio/__init__.py +2 -2
- mindspore/dataset/audio/transforms.py +51 -51
- mindspore/dataset/callback/ds_callback.py +2 -2
- mindspore/dataset/engine/cache_client.py +1 -1
- mindspore/dataset/engine/datasets.py +3 -3
- mindspore/dataset/engine/datasets_audio.py +14 -14
- mindspore/dataset/engine/datasets_standard_format.py +3 -3
- mindspore/dataset/engine/datasets_text.py +38 -38
- mindspore/dataset/engine/datasets_user_defined.py +3 -3
- mindspore/dataset/engine/datasets_vision.py +68 -68
- mindspore/dataset/text/__init__.py +3 -3
- mindspore/dataset/text/transforms.py +26 -26
- mindspore/dataset/transforms/__init__.py +1 -1
- mindspore/dataset/vision/__init__.py +3 -3
- mindspore/dataset/vision/transforms.py +92 -92
- mindspore/dataset/vision/utils.py +1 -1
- mindspore/experimental/optim/adadelta.py +2 -2
- mindspore/experimental/optim/adagrad.py +2 -2
- mindspore/experimental/optim/adam.py +2 -2
- mindspore/experimental/optim/adamax.py +2 -2
- mindspore/experimental/optim/adamw.py +2 -2
- mindspore/experimental/optim/asgd.py +2 -2
- mindspore/experimental/optim/lr_scheduler.py +24 -20
- mindspore/experimental/optim/nadam.py +2 -2
- mindspore/experimental/optim/optimizer.py +1 -1
- mindspore/experimental/optim/radam.py +2 -2
- mindspore/experimental/optim/rmsprop.py +2 -2
- mindspore/experimental/optim/rprop.py +2 -2
- mindspore/experimental/optim/sgd.py +2 -2
- mindspore/hal/stream.py +2 -0
- mindspore/include/mindapi/base/types.h +5 -0
- mindspore/lib/libdnnl.so.2 +0 -0
- mindspore/lib/libmindspore.so +0 -0
- mindspore/lib/libmindspore_backend.so +0 -0
- mindspore/lib/libmindspore_common.so +0 -0
- mindspore/lib/libmindspore_core.so +0 -0
- mindspore/lib/libmindspore_glog.so.0 +0 -0
- mindspore/lib/libmindspore_gpr.so.15 +0 -0
- mindspore/lib/libmindspore_grpc++.so.1 +0 -0
- mindspore/lib/libmindspore_grpc.so.15 +0 -0
- mindspore/lib/libmindspore_shared_lib.so +0 -0
- mindspore/lib/libopencv_core.so.4.5 +0 -0
- mindspore/lib/libopencv_imgcodecs.so.4.5 +0 -0
- mindspore/lib/libopencv_imgproc.so.4.5 +0 -0
- mindspore/lib/plugin/ascend/custom_aicpu_ops/op_impl/cpu/aicpu_kernel/impl/libcust_cpu_kernels.so +0 -0
- mindspore/lib/plugin/ascend/custom_aicpu_ops/op_impl/cpu/config/cust_aicpu_kernel.json +6 -6
- mindspore/lib/plugin/ascend/custom_aicpu_ops/op_proto/libcust_op_proto.so +0 -0
- mindspore/lib/plugin/ascend/libdvpp_utils.so +0 -0
- mindspore/lib/plugin/ascend/libmindspore_cpu_kernels.so +0 -0
- mindspore/lib/plugin/gpu/libcuda_ops.so.10 +0 -0
- mindspore/lib/plugin/gpu/libcuda_ops.so.11 +0 -0
- mindspore/lib/plugin/gpu10.1/libnccl.so.2 +0 -0
- mindspore/lib/plugin/gpu11.1/libnccl.so.2 +0 -0
- mindspore/lib/plugin/gpu11.6/libnccl.so.2 +0 -0
- mindspore/lib/plugin/libmindspore_ascend.so.2 +0 -0
- mindspore/lib/plugin/libmindspore_gpu.so.10.1 +0 -0
- mindspore/lib/plugin/libmindspore_gpu.so.11.1 +0 -0
- mindspore/lib/plugin/libmindspore_gpu.so.11.6 +0 -0
- mindspore/log.py +2 -2
- mindspore/mint/__init__.py +457 -0
- mindspore/mint/nn/__init__.py +430 -0
- mindspore/mint/nn/functional.py +424 -0
- mindspore/mint/optim/__init__.py +24 -0
- mindspore/mint/optim/adamw.py +186 -0
- mindspore/multiprocessing/__init__.py +4 -0
- mindspore/nn/__init__.py +3 -0
- mindspore/nn/cell.py +51 -47
- mindspore/nn/extend/__init__.py +29 -0
- mindspore/nn/extend/basic.py +140 -0
- mindspore/nn/extend/embedding.py +143 -0
- mindspore/nn/extend/layer/__init__.py +27 -0
- mindspore/nn/extend/layer/normalization.py +107 -0
- mindspore/nn/extend/pooling.py +117 -0
- mindspore/nn/generator.py +297 -0
- mindspore/nn/layer/basic.py +109 -1
- mindspore/nn/layer/container.py +2 -2
- mindspore/nn/layer/conv.py +6 -6
- mindspore/nn/layer/embedding.py +1 -1
- mindspore/nn/layer/normalization.py +21 -43
- mindspore/nn/layer/padding.py +4 -0
- mindspore/nn/optim/ada_grad.py +2 -2
- mindspore/nn/optim/adadelta.py +1 -1
- mindspore/nn/optim/adafactor.py +1 -1
- mindspore/nn/optim/adam.py +7 -7
- mindspore/nn/optim/adamax.py +2 -2
- mindspore/nn/optim/adasum.py +2 -2
- mindspore/nn/optim/asgd.py +2 -2
- mindspore/nn/optim/ftrl.py +1 -1
- mindspore/nn/optim/lamb.py +3 -3
- mindspore/nn/optim/lars.py +1 -1
- mindspore/nn/optim/lazyadam.py +2 -2
- mindspore/nn/optim/momentum.py +2 -2
- mindspore/nn/optim/optimizer.py +2 -2
- mindspore/nn/optim/proximal_ada_grad.py +2 -2
- mindspore/nn/optim/rmsprop.py +2 -2
- mindspore/nn/optim/rprop.py +2 -2
- mindspore/nn/optim/sgd.py +2 -2
- mindspore/nn/optim/thor.py +2 -2
- mindspore/nn/wrap/cell_wrapper.py +9 -9
- mindspore/nn/wrap/grad_reducer.py +5 -5
- mindspore/ops/_grad_experimental/grad_comm_ops.py +4 -2
- mindspore/ops/_vmap/vmap_grad_nn_ops.py +41 -2
- mindspore/ops/_vmap/vmap_math_ops.py +27 -8
- mindspore/ops/_vmap/vmap_nn_ops.py +66 -8
- mindspore/ops/auto_generate/cpp_create_prim_instance_helper.py +73 -1
- mindspore/ops/auto_generate/gen_arg_dtype_cast.py +12 -3
- mindspore/ops/auto_generate/gen_arg_handler.py +24 -0
- mindspore/ops/auto_generate/gen_extend_func.py +274 -0
- mindspore/ops/auto_generate/gen_ops_def.py +889 -22
- mindspore/ops/auto_generate/gen_ops_prim.py +3541 -253
- mindspore/ops/auto_generate/pyboost_inner_prim.py +282 -0
- mindspore/ops/composite/multitype_ops/_compile_utils.py +2 -1
- mindspore/ops/composite/multitype_ops/_constexpr_utils.py +9 -0
- mindspore/ops/extend/__init__.py +9 -1
- mindspore/ops/extend/array_func.py +134 -27
- mindspore/ops/extend/math_func.py +3 -3
- mindspore/ops/extend/nn_func.py +363 -2
- mindspore/ops/function/__init__.py +19 -2
- mindspore/ops/function/array_func.py +463 -439
- mindspore/ops/function/clip_func.py +7 -18
- mindspore/ops/function/grad/grad_func.py +5 -5
- mindspore/ops/function/linalg_func.py +4 -4
- mindspore/ops/function/math_func.py +260 -243
- mindspore/ops/function/nn_func.py +825 -62
- mindspore/ops/function/random_func.py +73 -4
- mindspore/ops/function/sparse_unary_func.py +1 -1
- mindspore/ops/function/vmap_func.py +1 -1
- mindspore/ops/functional.py +2 -2
- mindspore/ops/op_info_register.py +1 -31
- mindspore/ops/operations/__init__.py +2 -3
- mindspore/ops/operations/_grad_ops.py +2 -107
- mindspore/ops/operations/_inner_ops.py +5 -5
- mindspore/ops/operations/_sequence_ops.py +2 -2
- mindspore/ops/operations/array_ops.py +11 -233
- mindspore/ops/operations/comm_ops.py +32 -32
- mindspore/ops/operations/custom_ops.py +7 -89
- mindspore/ops/operations/manually_defined/ops_def.py +329 -4
- mindspore/ops/operations/math_ops.py +13 -163
- mindspore/ops/operations/nn_ops.py +9 -316
- mindspore/ops/operations/random_ops.py +1 -1
- mindspore/ops/operations/sparse_ops.py +3 -3
- mindspore/ops/primitive.py +2 -2
- mindspore/ops_generate/arg_dtype_cast.py +12 -3
- mindspore/ops_generate/arg_handler.py +24 -0
- mindspore/ops_generate/gen_ops_inner_prim.py +2 -0
- mindspore/ops_generate/gen_pyboost_func.py +13 -6
- mindspore/ops_generate/pyboost_utils.py +2 -17
- mindspore/parallel/__init__.py +3 -2
- mindspore/parallel/_auto_parallel_context.py +106 -1
- mindspore/parallel/_parallel_serialization.py +34 -2
- mindspore/parallel/_utils.py +16 -0
- mindspore/parallel/algo_parameter_config.py +4 -4
- mindspore/parallel/checkpoint_transform.py +249 -77
- mindspore/parallel/cluster/process_entity/_api.py +1 -1
- mindspore/parallel/parameter_broadcast.py +1 -1
- mindspore/parallel/shard.py +1 -1
- mindspore/profiler/parser/ascend_analysis/fwk_cann_parser.py +1 -0
- mindspore/profiler/parser/ascend_analysis/profiler_info_parser.py +17 -5
- mindspore/profiler/parser/ascend_msprof_exporter.py +3 -3
- mindspore/profiler/parser/ascend_msprof_generator.py +10 -3
- mindspore/profiler/parser/ascend_op_generator.py +26 -9
- mindspore/profiler/parser/ascend_timeline_generator.py +7 -4
- mindspore/profiler/parser/profiler_info.py +11 -1
- mindspore/profiler/profiling.py +13 -5
- mindspore/rewrite/api/node.py +12 -12
- mindspore/rewrite/api/symbol_tree.py +11 -11
- mindspore/run_check/_check_version.py +1 -1
- mindspore/safeguard/rewrite_obfuscation.py +2 -2
- mindspore/train/amp.py +4 -4
- mindspore/train/anf_ir_pb2.py +8 -2
- mindspore/train/callback/_backup_and_restore.py +2 -2
- mindspore/train/callback/_callback.py +4 -4
- mindspore/train/callback/_checkpoint.py +2 -2
- mindspore/train/callback/_early_stop.py +2 -2
- mindspore/train/callback/_landscape.py +4 -4
- mindspore/train/callback/_loss_monitor.py +2 -2
- mindspore/train/callback/_on_request_exit.py +2 -2
- mindspore/train/callback/_reduce_lr_on_plateau.py +2 -2
- mindspore/train/callback/_summary_collector.py +2 -2
- mindspore/train/callback/_time_monitor.py +2 -2
- mindspore/train/dataset_helper.py +8 -3
- mindspore/train/loss_scale_manager.py +2 -2
- mindspore/train/metrics/metric.py +3 -3
- mindspore/train/mind_ir_pb2.py +22 -17
- mindspore/train/model.py +15 -15
- mindspore/train/serialization.py +18 -18
- mindspore/train/summary/summary_record.py +7 -7
- mindspore/train/train_thor/convert_utils.py +3 -3
- mindspore/version.py +1 -1
- {mindspore-2.3.0rc1.dist-info → mindspore-2.3.0rc2.dist-info}/METADATA +1 -1
- {mindspore-2.3.0rc1.dist-info → mindspore-2.3.0rc2.dist-info}/RECORD +226 -212
- {mindspore-2.3.0rc1.dist-info → mindspore-2.3.0rc2.dist-info}/WHEEL +0 -0
- {mindspore-2.3.0rc1.dist-info → mindspore-2.3.0rc2.dist-info}/entry_points.txt +0 -0
- {mindspore-2.3.0rc1.dist-info → mindspore-2.3.0rc2.dist-info}/top_level.txt +0 -0
mindspore/parallel/checkpoint_transform.py
CHANGED
@@ -26,18 +26,18 @@ from mindspore.parallel._utils import _is_in_auto_parallel_mode
 from mindspore.parallel._parallel_serialization import _rank_list_for_transform_parallel_checkpoint, \
     _transform_parallel_checkpoint, _get_device_num_from_strategy, _make_dir, \
     _extract_layout_map, _extract_src_dst_layout_map, _parameter_not_in_local_stage, _extract_pipeline_stage_num, \
-    _merge_protobuf_strategy, _merge_json_strategy
+    _merge_protobuf_strategy, _merge_json_strategy, _extract_src_dst_layout_map_by_src
 
 
 __all__ = ["merge_pipeline_strategys", "rank_list_for_transform", "transform_checkpoint_by_rank",
-           "transform_checkpoints", "sync_pipeline_shared_parameters"]
+           "transform_checkpoints", "sync_pipeline_shared_parameters", "load_segmented_checkpoints"]
 
 
 def merge_pipeline_strategys(src_strategy_dirs, dst_strategy_file):
     """
     Merge parallel strategy between all pipeline stages in pipeline parallel mode.
     For more details about converting distributed Checkpoint, please refer to
-    `Model Transformation <https://www.mindspore.cn/tutorials/experts/en/
+    `Model Transformation <https://www.mindspore.cn/tutorials/experts/en/master/parallel/model_transformation.html>`_.
 
     Note:
         Strategy file of each pipeline stage should be included in src_strategy_dirs.
@@ -77,7 +77,7 @@ def rank_list_for_transform(rank_id, src_strategy_file=None, dst_strategy_file=N
     """
     List of original distributed checkpoint rank index for obtaining the target checkpoint of a rank_id during the
     distributed checkpoint conversion. For more details about converting distributed Checkpoint, please refer to
-    `Model Transformation <https://www.mindspore.cn/tutorials/experts/en/
+    `Model Transformation <https://www.mindspore.cn/tutorials/experts/en/master/parallel/model_transformation.html>`_.
 
     Args:
         rank_id (int): The rank of which distributed checkpoint needs to be obtained after conversion.
@@ -140,7 +140,7 @@ def transform_checkpoint_by_rank(rank_id, checkpoint_files_map, save_checkpoint_
     """
     Transform distributed checkpoint from source sharding strategy to destination sharding strategy by rank
     for a network. For more details about converting distributed Checkpoint, please refer to
-    `Model Transformation <https://www.mindspore.cn/tutorials/experts/en/
+    `Model Transformation <https://www.mindspore.cn/tutorials/experts/en/master/parallel/model_transformation.html>`_.
 
     Args:
         rank_id (int): The rank of which distributed checkpoint needs to be obtained after conversion.
@@ -225,49 +225,63 @@ def transform_checkpoint_by_rank(rank_id, checkpoint_files_map, save_checkpoint_
         ms.save_checkpoint(transform_param_list, save_checkpoint_file_name)
 
 
-def
-    """
-    (the rest of the removed block is truncated in the source diff rendering)
+def _transform_checkpoint_by_stage(src_checkpoints_dir, dst_checkpoints_dir, ckpt_prefix, src_strategy_file,
+                                   dst_strategy_file=None):
+    """Transform checkpoint for stage in src_strategy_file"""
+    param_total_dict = defaultdict(dict)
+    param_attr_dict = defaultdict(dict)
+    param_type_dict = defaultdict(dict)
+    src_strategy_list, dst_strategy_list, stage_id = _extract_src_dst_layout_map_by_src(src_strategy_file, \
+                                                                                        dst_strategy_file)
+    src_stage_device_num = np.prod(src_strategy_list.get(list(src_strategy_list.keys())[0])[0]) if src_strategy_list \
+        is not None else 1
+    dst_stage_device_num = np.prod(dst_strategy_list.get(list(dst_strategy_list.keys())[0])[0]) if dst_strategy_list \
+        is not None else 1
+    origin_dst_strategy_list = _extract_layout_map(dst_strategy_file)
+    origin_src_strategy_list = _extract_layout_map(src_strategy_file)
+    checkpoint_files_map = {}
+    src_rank_id_start = stage_id * src_stage_device_num
+    for local_rank in range(src_stage_device_num):
+        rank_id = src_rank_id_start + local_rank
+        checkpoint_file_name = os.path.join(src_checkpoints_dir, "rank_{}".format(rank_id), "*.ckpt")
+        rank_ckpts = glob.glob(checkpoint_file_name)
+        rank_ckpts.sort()
+        for checkpoint_file in rank_ckpts:
+            if not os.path.isfile(checkpoint_file):
+                ms.log.warning("{} is not a checkpoint file.".format(checkpoint_file))
+                continue
+            checkpoint_files_map[rank_id] = checkpoint_file
+    for rank, local_file in checkpoint_files_map.items():
+        if not os.path.exists(local_file):
+            raise ValueError("Checkpoint file {} in rank {} not exits: ".format(local_file, rank))
+    for rank, file_name in checkpoint_files_map.items():
+        ckpt_dict = ms.load_checkpoint(file_name)
+        for param_name, param in ckpt_dict.items():
+            # cut the parameter not in the pipeline stage.
+            if _parameter_not_in_local_stage(param_name, origin_src_strategy_list, src_strategy_list) \
+                    and _parameter_not_in_local_stage(param_name, origin_dst_strategy_list, dst_strategy_list):
+                continue
+            src_rank = rank % src_stage_device_num
+            param_type_dict[param_name][src_rank] = str(param.data.dtype)
+            if param.data.dtype == mstype.bfloat16:
+                param.set_dtype(mstype.float32)
+            param_total_dict[param_name][src_rank] = param.data.asnumpy()
+            param_attr_dict[param_name][src_rank] = (param.requires_grad, param.layerwise_parallel)
+    for local_rank_id in range(dst_stage_device_num):
+        transform_param_list = _transform_parallel_checkpoint(local_rank_id, param_total_dict,
+                                                              param_attr_dict, src_strategy_list, dst_strategy_list,
+                                                              param_type_dict)
+        save_checkpoint_file = "{}{}_part{}.ckpt".format(ckpt_prefix, local_rank_id, stage_id)
+        save_checkpoint_file_dir = os.path.join(dst_checkpoints_dir, "rank_{}".format(local_rank_id))
+        if not os.path.exists(save_checkpoint_file_dir):
+            _make_dir(save_checkpoint_file_dir, "path")
+        save_checkpoint_file_name = os.path.join(save_checkpoint_file_dir, save_checkpoint_file)
+        ms.save_checkpoint(transform_param_list, save_checkpoint_file_name)
+
+
+def _transform_checkpoints(src_checkpoints_dir, dst_checkpoints_dir, ckpt_prefix, src_strategy_file=None,
+                           dst_strategy_file=None):
+    """Transform checkpoints for all stages in src_strategy_file"""
     checkpoints_rank_dir_list = os.path.join(src_checkpoints_dir, "rank_[0-9]*")
     all_checkpoint_files_map = {}
     for checkpoint_dir in glob.glob(checkpoints_rank_dir_list):
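The per-stage device count used by the new `_transform_checkpoint_by_stage` helper is just the product of the device arrangement of the first parameter in the stage's strategy layout. A minimal sketch of that computation, with a made-up layout dict standing in for the structure returned by `_extract_src_dst_layout_map_by_src` (the real format lives in `_parallel_serialization` and is not shown in this diff):

    import numpy as np

    # Hypothetical layout map: parameter name -> (device_arrangement, tensor_map).
    # The shapes below are illustrative only.
    src_strategy_list = {"embedding.w": ([2, 4], [1, 0])}

    first_param = next(iter(src_strategy_list))
    src_stage_device_num = int(np.prod(src_strategy_list[first_param][0])) if src_strategy_list else 1
    print(src_stage_device_num)  # -> 8 devices per pipeline stage for this made-up layout

Rank `stage_id * src_stage_device_num + local_rank` then indexes the per-rank source checkpoint directories for that stage, matching the loop in the hunk above.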
@@ -342,6 +356,76 @@ def transform_checkpoints(src_checkpoints_dir, dst_checkpoints_dir, ckpt_prefix,
         del param_total_dict
 
 
+def transform_checkpoints(src_checkpoints_dir, dst_checkpoints_dir, ckpt_prefix, src_strategy_file=None,
+                          dst_strategy_file=None):
+    """
+    Transform distributed checkpoint from source sharding strategy to destination sharding strategy for a rank.
+    For more details about converting distributed Checkpoint, please refer to
+    `Model Transformation <https://www.mindspore.cn/tutorials/experts/en/master/parallel/model_transformation.html>`_.
+
+    Note:
+        The `src_checkpoints_dir` directory structure should be organized like "src_checkpoints_dir/rank_0/a.ckpt", the
+        rank number should be set to a subdirectory and the checkpoint file is stored in this subdirectory. If multiple
+        files exist in a rank directory, the last file in the lexicgraphic order would be selected.
+
+    Args:
+        src_checkpoints_dir (str): The source checkpoints directory.
+        dst_checkpoints_dir (str): The destination checkpoints directory to save the converted checkpoints.
+        ckpt_prefix (str): The destination checkpoint name prefix.
+        src_strategy_file (str): Name of source sharding strategy file which saved by
+            'mindspore.set_auto_parallel_context(strategy_ckpt_save_file)'.
+            when the 'src_strategy_file' is None, it means that the source sharding strategy is
+            without any sharing for each parameter. Default:None.
+        dst_strategy_file (str): Name of destination sharding strategy file which saved by
+            'mindspore.set_auto_parallel_context(strategy_ckpt_save_file)'.
+            when the 'dst_strategy_file' is None, it means that the destination sharding strategy
+            is without any sharing for each parameter. Default:None.
+
+    Raises:
+        ValueError: `src_strategy_file` or `dst_strategy_file` is incorrect.
+        NotADirectoryError: `src_checkpoints_dir` or `dst_checkpoints_dir` is not a directory.
+        ValueError: The checkpoint file is missing in `src_checkpoints_dir`.
+        TypeError: `src_strategy_file` or `dst_strategy_file` is not a string.
+
+    Examples:
+        >>> import mindspore as ms
+        >>> ms.transform_checkpoints(src_checkpoints_dir, dst_checkpoints_dir, "dst_checkpoint",
+        ...                          "./src_strategy.ckpt", "./dst_strategy.ckpt")
+
+    """
+    if not os.path.isdir(src_checkpoints_dir):
+        raise NotADirectoryError("src_checkpoints_dir {} is not a directory.".format(src_checkpoints_dir))
+    _make_dir(dst_checkpoints_dir, "path")
+    if not isinstance(ckpt_prefix, str):
+        raise TypeError("The ckpt_prefix should be a str.")
+    if src_strategy_file and os.path.dirname(src_strategy_file) and not os.path.exists(
+            os.path.dirname(src_strategy_file)):
+        raise ValueError("The director of src_strategy_file: {} is not exists.".
+                         format(os.path.dirname(src_strategy_file)))
+    if dst_strategy_file and os.path.dirname(dst_strategy_file) and not os.path.exists(
+            os.path.dirname(dst_strategy_file)):
+        raise ValueError("The director of dst_strategy_file: {} is not exists.".
+                         format(os.path.dirname(dst_strategy_file)))
+    src_layout_map = _extract_layout_map(src_strategy_file)
+    dst_layout_map = _extract_layout_map(dst_strategy_file)
+    pipeline_stage_num = _extract_pipeline_stage_num(src_strategy_file)
+    if src_layout_map:
+        src_param_keys = {param_name for param_name in src_layout_map if not param_name.startswith("accu_grads")}
+    if dst_layout_map:
+        dst_param_keys = {param_name for param_name in dst_layout_map if not param_name.startswith("accu_grads")}
+    if src_layout_map and dst_layout_map and pipeline_stage_num == 1 \
+            and src_param_keys.issubset(dst_param_keys) and len(src_param_keys) < len(dst_param_keys):
+        dst_stage_num = _extract_pipeline_stage_num(dst_strategy_file)
+        if dst_stage_num > 1:
+            raise NotImplementedError("When using unmerged src strategy, dst strategy doesn't \
+                                      support strategy with pipeline parallel.")
+        _transform_checkpoint_by_stage(src_checkpoints_dir, dst_checkpoints_dir, ckpt_prefix,
+                                       src_strategy_file, dst_strategy_file)
+    else:
+        _transform_checkpoints(src_checkpoints_dir, dst_checkpoints_dir, ckpt_prefix,
+                               src_strategy_file, dst_strategy_file)
+
+
 def _sync_params(name, param, layout):
     """synchronize single parameter"""
     if len(layout) < 10:
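A hedged usage sketch of the reworked public `transform_checkpoints`; the directory names and strategy files below are placeholders, and the call itself follows the docstring example above:

    import mindspore as ms

    # Source layout expected by the API (placeholder paths):
    #   ./src_checkpoints/rank_0/xxx.ckpt, ./src_checkpoints/rank_1/xxx.ckpt, ...
    ms.transform_checkpoints("./src_checkpoints", "./dst_checkpoints", "dst_checkpoint",
                             src_strategy_file="./src_strategy.ckpt",
                             dst_strategy_file="./dst_strategy.ckpt")

With this release, when both strategy files are given, the source strategy has a single pipeline stage, and its parameter set is a proper subset of the destination's, the call is routed to the new `_transform_checkpoint_by_stage`; otherwise it falls back to the whole-directory path in `_transform_checkpoints`.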
@@ -394,53 +478,79 @@ def sync_pipeline_shared_parameters(net):
     Args:
         net (nn.Cell): the inference network.
 
+    Supported Platforms:
+        ``Ascend``
+
     Examples:
+        .. note::
+            Before running the following examples, you need to configure the communication environment variables.
+
+            For the Ascend device, users need to write a dynamic cluster startup script, please see the `Dynamic Cluster
+            Startup <https://www.mindspore.cn/tutorials/experts/en/master/parallel/dynamic_cluster.html>`_ .
+
         >>> import numpy as np
         >>> import mindspore as ms
-        >>>
-        >>>
-
+        >>> import mindspore.communication.management as D
+        >>> from mindspore import lazy_inline, context, nn, ops, Parameter, Tensor
+        >>> context.set_context(mode=context.GRAPH_MODE)
+        >>> class Embedding(nn.Cell):
+        ...     def __init__(self, shape):
         ...         super().__init__()
-        ...         self.
-        ...
-        ...         self.gather = ops.Gather()
-        ...
+        ...         self.w = Parameter(Tensor(np.ones(shape), ms.float32), name='w')
+        ...         self.matmul = ops.MatMul().shard(((1, 1), (1, 1)))
         ...     def construct(self, x):
-        ...
-        ...         output = output.squeeze(1)
-        ...         return output, self.embedding_table.value()
+        ...         return self.matmul(x, self.w), self.w
         ...
         >>> class LMHead(nn.Cell):
         ...     def __init__(self):
         ...         super().__init__()
-        ...         self.matmul = ops.MatMul(transpose_b=True)
-        ...
-        ...
-        ...         return self.matmul(state, embed)
+        ...         self.matmul = ops.MatMul(transpose_b=True).shard(((1, 1), (1, 1)))
+        ...     def construct(self, x, w):
+        ...         return self.matmul(x, w)
         ...
         >>> class Network(nn.Cell):
         ...     @lazy_inline
         ...     def __init__(self):
         ...         super().__init__()
-        ...
-        ...         self.
-        ...
+        ...         shape = (4, 4)
+        ...         self.word_embedding = Embedding(shape)
+        ...         self.lm_head = LMHead()
+        ...         self.word_embedding.pipeline_stage = 0
+        ...         self.lm_head.pipeline_stage = 1
         ...     def construct(self, x):
         ...         x, embed = self.word_embedding(x)
-        ...
-        ...
-        >>>
+        ...         return self.lm_head(x, embed)
+        ...
+        >>> class PipelineCellInference(nn.Cell):
+        ...     def __init__(self, network, micro_batch_num):
+        ...         super().__init__()
+        ...         self.network = network
+        ...         self.micro_batch_num = micro_batch_num
+        ...         self.concat = ops.Concat()
+        ...     def construct(self, x):
+        ...         ret = ()
+        ...         for i in range(self.micro_batch_num):
+        ...             micro_batch_size = x.shape[0] // self.micro_batch_num
+        ...             start = micro_batch_size * i
+        ...             end = micro_batch_size * (i + 1)
+        ...             micro_input = x[start:end]
+        ...             y = self.network(micro_input)
+        ...             ret = ret + (y,)
+        ...         ret = self.concat(ret)
+        ...         return ret
+        >>> D.init()
+        >>> context.set_auto_parallel_context(parallel_mode='semi_auto_parallel', full_batch=True, pipeline_stages=2)
         >>> net = Network()
-        >>> net
-        >>> net.
-        >>> x = Tensor(np.ones((
-        >>> net.compile()
+        >>> net = PipelineCellInference(net, 2)
+        >>> net.set_train(False)
+        >>> x = Tensor(np.ones((2, 4)), ms.float32)
+        >>> net.compile(x)
         >>> ms.sync_pipeline_shared_parameters(net)
-        >>> print(net.word_embedding.
-
-
-
-
+        >>> print(net.network.word_embedding.w.asnumpy())
+        [[1. 1. 1. 1.]
+         [1. 1. 1. 1.]
+         [1. 1. 1. 1.]
+         [1. 1. 1. 1.]]
     """
 
     if not isinstance(net, ms.nn.Cell):
@@ -466,3 +576,65 @@ def sync_pipeline_shared_parameters(net):
 
     # restore parallel context
     ms.context.set_auto_parallel_context(parallel_mode=parallel_mode, full_batch=full_batch)
+
+
+def load_segmented_checkpoints(ckpt_file_dir, net=None, strict_load=False, filter_prefix=None,
+                               dec_key=None, dec_mode="AES-GCM", specify_prefix=None, choice_func=None):
+    """
+    Load checkpoint info from a specified file. If the specified ckpt_file_dir path contains multiple
+    checkpoint files, all checkpoint files will be loaded one by one and the combined dictionary will be return.
+
+    Note:
+        - `specify_prefix` and `filter_prefix` do not affect each other.
+        - If none of the parameters are loaded from checkpoint file, it will throw ValueError.
+        - `specify_prefix` and `filter_prefix` are in the process of being deprecated,
+          `choice_func` is recommended instead.
+          And using either of those two args will override `choice_func` at the same time.
+
+    Args:
+        ckpt_file_dir (str): Checkpoint file directory.
+        net (Cell): The network where the parameters will be loaded. Default: ``None`` .
+        strict_load (bool): Whether to strict load the parameter into net. If ``False`` , it will load parameter
+            into net when parameter name's suffix in checkpoint file is the same as the
+            parameter in the network. When the types are inconsistent perform type conversion
+            on the parameters of the same type, such as float32 to float16. Default: ``False`` .
+        filter_prefix (Union[str, list[str], tuple[str]]): Deprecated(see `choice_func`). Parameters starting with the
+            filter_prefix will not be loaded. Default: ``None`` .
+        dec_key (Union[None, bytes]): Byte type key used for decryption. If the value is ``None`` , the decryption
+            is not required. Default: ``None`` .
+        dec_mode (str): This parameter is valid only when dec_key is not set to ``None`` . Specifies the decryption
+            mode, currently supports ``"AES-GCM"`` and ``"AES-CBC"`` and ``"SM4-CBC"`` .
+            Default: ``"AES-GCM"`` .
+        specify_prefix (Union[str, list[str], tuple[str]]): Deprecated(see `choice_func`). Parameters starting with the
+            specify_prefix will be loaded. Default: ``None`` .
+        choice_func (Union[None, function]) : Input value of the function is a Parameter name of type string,
+            and the return value is a bool. If returns ``True`` , the Parameter
+            that matches the custom condition will be loaded. If returns ``False`` , the Parameter that
+            matches the custom condition will be removed. Default: ``None`` .
+
+    Returns:
+        Dict, key is parameter name, value is a Parameter or string. When the `append_dict` parameter of
+        :func:`mindspore.save_checkpoint` and the `append_info` parameter of :class:`mindspore.train.CheckpointConfig`
+        are used to save the checkpoint, `append_dict` and `append_info` are dict types, and their value are string,
+        then the return value obtained by loading checkpoint is string, and in other cases the return value is
+        Parameter.
+
+    Raises:
+        TypeError: Input ckpt_file_dir is not a string.
+        ValueError: Checkpoint file directory doesn't exist. Or it's not a directory
+        ValueError: Checkpoint file's format is incorrect.
+        ValueError: Parameter's dict is None after load checkpoint file.
+        TypeError: The type of `specify_prefix` or `filter_prefix` is incorrect.
+    """
+    if not isinstance(ckpt_file_dir, str):
+        raise TypeError("The ckpt_file_dir should be a str.")
+    if not os.path.isdir(ckpt_file_dir):
+        raise ValueError("The dst_strategy_file: {} doesn't exist. Or it's not a directory".
+                         format(ckpt_file_dir))
+    checkpoint_file_name = os.path.join(ckpt_file_dir, "*.ckpt")
+    rank_ckpts = glob.glob(checkpoint_file_name)
+    parameter_dict = {}
+    for checkpoint_file in rank_ckpts:
+        parameter_dict.update(ms.load_checkpoint(checkpoint_file, net, strict_load, filter_prefix, dec_key,
+                                                 dec_mode, specify_prefix, choice_func))
+    return parameter_dict
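A short usage sketch for the new `load_segmented_checkpoints`; the directory and the `choice_func` filter are illustrative, and the import path assumes the function is called from its defining module (it is also added to this module's `__all__` above):

    from mindspore.parallel.checkpoint_transform import load_segmented_checkpoints

    # Merge every *.ckpt segment under one rank directory into a single parameter dict,
    # keeping only parameters whose names start with "backbone" (filter is hypothetical).
    param_dict = load_segmented_checkpoints("./dst_checkpoints/rank_0",
                                            choice_func=lambda name: name.startswith("backbone"))
    print(len(param_dict), "parameters loaded")

Each file is passed through `ms.load_checkpoint`, so the same decryption and filtering arguments apply per segment, and a later segment overwrites an earlier one if both contain the same parameter name.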
mindspore/parallel/cluster/process_entity/_api.py
CHANGED
@@ -42,7 +42,7 @@ class _Node:
         os.environ["MS_WORKER_NUM"] = str(self.worker_num)
         os.environ["MS_SCHED_HOST"] = self.sched_host
         os.environ["MS_SCHED_PORT"] = str(self.sched_port)
-        os.environ["
+        os.environ["MS_TOPO_TIMEOUT"] = str(self.timeout)
 
 class _MetaServerNode(_Node):
     """
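For context, the `_Node` method shown above exports the cluster variables before a process starts. A minimal stand-alone reproduction of that environment setup, with placeholder values (in MindSpore the real values come from the launcher arguments):

    import os

    # Placeholder values mirroring the assignments above.
    os.environ["MS_WORKER_NUM"] = "8"
    os.environ["MS_SCHED_HOST"] = "127.0.0.1"
    os.environ["MS_SCHED_PORT"] = "8118"
    os.environ["MS_TOPO_TIMEOUT"] = "600"  # the timeout variable set by this change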
mindspore/parallel/parameter_broadcast.py
CHANGED
@@ -84,7 +84,7 @@ def parameter_broadcast(net, layout, cur_rank=0, initial_rank=0):
         >>> net.matmul2.shard(((1, 8), (8, 1)))
         >>> net.relu2.shard(((8, 1),))
         >>> # Create the dataset taking MNIST as an example. Refer to
-        >>> # https://gitee.com/mindspore/docs/blob/
+        >>> # https://gitee.com/mindspore/docs/blob/master/docs/mindspore/code/mnist.py
         >>> dataset = create_dataset()
         >>> optim = nn.SGD(net.trainable_params(), 1e-2)
         >>> loss = nn.CrossEntropyLoss()
mindspore/parallel/shard.py
CHANGED
@@ -328,7 +328,7 @@ def shard(fn, in_strategy, out_strategy=None, parameter_plan=None, device="Ascen
 
     Tutorial Examples:
         - `Functional Operator Sharding
-          <https://www.mindspore.cn/tutorials/experts/en/
+          <https://www.mindspore.cn/tutorials/experts/en/master/parallel/pynative_shard_function_parallel.html>`_
     """
     if not isinstance(fn, (ms.nn.Cell)):
         logger.warning("'fn' is not a mindspore.nn.Cell, and its definition cannot involve Parameter; "
mindspore/profiler/parser/ascend_analysis/fwk_cann_parser.py
CHANGED
@@ -34,6 +34,7 @@ class FwkCANNParser:
     def __init__(self, source_path: str, msprof_data: List, rank_id: int):
         source_path = validate_and_normalize_path(source_path)
         ProfilerInfoParser.init_source_path(source_path)
+        ProfilerInfoParser.init_rank_id(rank_id)
         fwk_parser = FwkFileParser(source_path, rank_id)
         msprof_timeline_parser = MsprofTimelineParser(msprof_data)
         self._fwk_op_data = fwk_parser.get_op_range_data()
mindspore/profiler/parser/ascend_analysis/profiler_info_parser.py
CHANGED
@@ -20,9 +20,9 @@ from subprocess import CalledProcessError, TimeoutExpired
 from subprocess import Popen, PIPE
 
 from mindspore import log as logger
-import mindspore._c_expression as c_expression
 from mindspore.profiler.common.validator.validate_path import validate_and_normalize_path
 from mindspore.profiler.parser.ascend_analysis.constant import Constant
+from mindspore.profiler.parser.profiler_info import ProfilerInfo
 
 
 class ProfilerInfoParser:
@@ -37,6 +37,7 @@ class ProfilerInfoParser:
     # profiler information related files
     _source_prof_path = None
     _loaded_frequency = False
+    _rank_id = 0
 
     @classmethod
     def init_source_path(cls, source_path: str):
@@ -48,12 +49,15 @@ class ProfilerInfoParser:
             raise RuntimeError("Input source path is invalid!")
         cls._source_prof_path = prof_path
 
+    @classmethod
+    def init_rank_id(cls, rank_id):
+        """initialize the rank id."""
+        cls._rank_id = rank_id
+
     @classmethod
     def get_local_time(cls, syscnt: int) -> Decimal:
         """Convert syscnt to local time."""
         if not cls._loaded_frequency:
-            localtime_stamp = c_expression.get_clock_time()
-            syscnt_stamp = c_expression.get_clock_syscnt()
             outs, _ = cls.__run_cmd(['which', cls._msprof_cmd])
             if not outs:
                 raise FileNotFoundError("Failed to find msprof command!")
@@ -70,8 +74,16 @@ class ProfilerInfoParser:
                 cls._freq = float(cpu_info.get("Frequency", cls._freq))
             except ValueError:
                 pass
-            cls.
-
+            profiler_info_path = os.path.join(cls._source_prof_path, os.path.pardir,
+                                              f"profiler_info_{cls._rank_id}.json")
+            if not os.path.isfile(profiler_info_path):
+                raise RuntimeError(f"Can`t find the file {profiler_info_path}, please check!")
+            with os.fdopen(os.open(profiler_info_path, os.O_RDONLY, 0o600), 'r') as fr:
+                profiler_info_data = json.load(fr)
+            cls._start_cnt = profiler_info_data.get('system_cnt')
+            cls._time_offset = profiler_info_data.get('system_time')
+            ProfilerInfo.set_system_time(cls._time_offset)
+            ProfilerInfo.set_system_cnt(cls._start_cnt)
             cls._loaded_frequency = True
         start_ns = cls.__get_timestamp(syscnt)
         start_us = Decimal(start_ns) / Constant.NS_TO_US
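The hunk above only caches `system_cnt` and `system_time` from `profiler_info_{rank}.json`; the conversion itself happens in the private `__get_timestamp`, which this diff does not show. A hedged sketch of the usual linear mapping, assuming `_freq` is a counter frequency in MHz and `system_time` is recorded in nanoseconds (both assumptions, not confirmed here):

    from decimal import Decimal

    def syscnt_to_ns(syscnt, start_cnt, time_offset_ns, freq_mhz):
        """Map a raw system counter value to wall-clock nanoseconds (illustrative only)."""
        elapsed_us = (syscnt - start_cnt) / freq_mhz  # ticks divided by ticks-per-microsecond
        return int(time_offset_ns + elapsed_us * 1000)

    start_ns = syscnt_to_ns(syscnt=10_050_000, start_cnt=10_000_000,
                            time_offset_ns=1_700_000_000_000_000_000, freq_mhz=100.0)
    start_us = Decimal(start_ns) / 1000  # mirrors the NS_TO_US division in the surrounding code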
mindspore/profiler/parser/ascend_msprof_exporter.py
CHANGED
@@ -251,10 +251,10 @@ class AscendMsprofExporter:
             msprof_json.add(f)
 
         if not op_summary:
-
+            logger.warning("The op_summary csv file was not found, perhaps the original data was not collected.")
         if not op_statistic:
-
+            logger.warning("The op_statistics csv file was not found, perhaps the original data was not collected.")
         if not msprof_json:
-
+            logger.warning("The msprof json file was not found, perhaps the original data was not collected.")
 
         logger.info("Finish checking files.")
mindspore/profiler/parser/ascend_msprof_generator.py
CHANGED
@@ -88,7 +88,10 @@ class AscendMsprofDataGenerator:
         """read op summary to memory"""
         op_summary = []
         op_summary_name = fr'{self.mindstudio_profiler_output}/op_summary_*.csv'
-
+        op_summary_files = glob.glob(op_summary_name)
+        if not op_summary_files:
+            return
+        op_summary_file = get_newest_file(op_summary_files)[0]
         with open(op_summary_file, newline='') as csvfile:
             reader = csv.DictReader(csvfile, delimiter=',', quotechar='"')
             for row in reader:
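`get_newest_file` comes from elsewhere in the profiler package and its implementation is not part of this diff; a plausible stand-in with the same call shape (a list with the newest match first) could look like:

    import glob
    import os

    def newest_first(pattern):
        """Return files matching pattern sorted by modification time, newest first (stand-in only)."""
        return sorted(glob.glob(pattern), key=os.path.getmtime, reverse=True)

    # Placeholder path; the real pattern is built from self.mindstudio_profiler_output above.
    op_summary_files = newest_first("/path/to/mindstudio_profiler_output/op_summary_*.csv")
    op_summary_file = op_summary_files[0] if op_summary_files else None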
@@ -129,7 +132,10 @@ class AscendMsprofDataGenerator:
         """read op statistic to memory"""
         op_statistic = []
         op_statistic_name = fr'{self.mindstudio_profiler_output}/op_statistic_*.csv'
-
+        op_statistic_files = glob.glob(op_statistic_name)
+        if not op_statistic_files:
+            return
+        op_statistic_file = get_newest_file(op_statistic_files)[0]
         with open(op_statistic_file, newline='') as csvfile:
             reader = csv.DictReader(csvfile, delimiter=',', quotechar='"')
             for row in reader:
@@ -140,7 +146,8 @@ class AscendMsprofDataGenerator:
             )
             new_row = tuple(['0' if d == 'N/A' else d for d in new_row])
             op_statistic.append(new_row)
-
+        if not op_statistic:
+            return
         op_statistic_dt = np.dtype(self.op_statistic_type)
         self.op_statistic = np.array(op_statistic, dtype=op_statistic_dt)
         self.op_statistic['Total Time'] *= 1e-3