mindspore 2.3.0-cp39-cp39-win_amd64.whl → 2.4.1-cp39-cp39-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mindspore has been flagged as potentially problematic by the registry.

Files changed (287)
  1. mindspore/.commit_id +1 -1
  2. mindspore/__init__.py +3 -1
  3. mindspore/_c_dataengine.cp39-win_amd64.pyd +0 -0
  4. mindspore/_c_expression.cp39-win_amd64.pyd +0 -0
  5. mindspore/_c_mindrecord.cp39-win_amd64.pyd +0 -0
  6. mindspore/_checkparam.py +50 -9
  7. mindspore/_extends/parse/compile_config.py +41 -0
  8. mindspore/_extends/parse/parser.py +9 -7
  9. mindspore/_extends/parse/standard_method.py +52 -14
  10. mindspore/_extends/pijit/pijit_func_white_list.py +350 -24
  11. mindspore/amp.py +24 -10
  12. mindspore/avcodec-59.dll +0 -0
  13. mindspore/avdevice-59.dll +0 -0
  14. mindspore/avfilter-8.dll +0 -0
  15. mindspore/avformat-59.dll +0 -0
  16. mindspore/avutil-57.dll +0 -0
  17. mindspore/common/__init__.py +6 -4
  18. mindspore/common/_pijit_context.py +190 -0
  19. mindspore/common/_register_for_tensor.py +2 -1
  20. mindspore/common/_tensor_overload.py +139 -0
  21. mindspore/common/api.py +102 -87
  22. mindspore/common/dump.py +5 -6
  23. mindspore/common/generator.py +1 -7
  24. mindspore/common/hook_handle.py +14 -26
  25. mindspore/common/initializer.py +51 -15
  26. mindspore/common/mindir_util.py +2 -2
  27. mindspore/common/parameter.py +62 -15
  28. mindspore/common/recompute.py +39 -9
  29. mindspore/common/sparse_tensor.py +7 -3
  30. mindspore/common/tensor.py +183 -37
  31. mindspore/communication/__init__.py +1 -1
  32. mindspore/communication/_comm_helper.py +38 -3
  33. mindspore/communication/comm_func.py +315 -60
  34. mindspore/communication/management.py +14 -14
  35. mindspore/context.py +132 -22
  36. mindspore/dataset/__init__.py +1 -1
  37. mindspore/dataset/audio/__init__.py +1 -1
  38. mindspore/dataset/core/config.py +7 -0
  39. mindspore/dataset/core/validator_helpers.py +7 -0
  40. mindspore/dataset/engine/cache_client.py +1 -1
  41. mindspore/dataset/engine/datasets.py +72 -44
  42. mindspore/dataset/engine/datasets_audio.py +7 -7
  43. mindspore/dataset/engine/datasets_standard_format.py +53 -3
  44. mindspore/dataset/engine/datasets_text.py +20 -20
  45. mindspore/dataset/engine/datasets_user_defined.py +174 -104
  46. mindspore/dataset/engine/datasets_vision.py +33 -33
  47. mindspore/dataset/engine/iterators.py +29 -0
  48. mindspore/dataset/engine/obs/util.py +7 -0
  49. mindspore/dataset/engine/queue.py +114 -60
  50. mindspore/dataset/engine/serializer_deserializer.py +2 -2
  51. mindspore/dataset/engine/validators.py +34 -14
  52. mindspore/dataset/text/__init__.py +1 -4
  53. mindspore/dataset/transforms/__init__.py +0 -3
  54. mindspore/dataset/utils/line_reader.py +2 -0
  55. mindspore/dataset/vision/__init__.py +1 -4
  56. mindspore/dataset/vision/utils.py +1 -1
  57. mindspore/dataset/vision/validators.py +2 -1
  58. mindspore/dnnl.dll +0 -0
  59. mindspore/{nn/extend → experimental/es}/__init__.py +4 -11
  60. mindspore/experimental/es/embedding_service.py +883 -0
  61. mindspore/{nn/layer → experimental/es}/embedding_service_layer.py +218 -30
  62. mindspore/experimental/llm_boost/__init__.py +21 -0
  63. mindspore/{nn/extend/layer → experimental/llm_boost/atb}/__init__.py +4 -8
  64. mindspore/experimental/llm_boost/atb/boost_base.py +211 -0
  65. mindspore/experimental/llm_boost/atb/llama_boost.py +115 -0
  66. mindspore/experimental/llm_boost/atb/qwen_boost.py +101 -0
  67. mindspore/experimental/llm_boost/register.py +129 -0
  68. mindspore/experimental/llm_boost/utils.py +31 -0
  69. mindspore/experimental/optim/adamw.py +85 -0
  70. mindspore/experimental/optim/optimizer.py +3 -0
  71. mindspore/hal/__init__.py +3 -3
  72. mindspore/hal/contiguous_tensors_handle.py +175 -0
  73. mindspore/hal/stream.py +18 -0
  74. mindspore/include/api/model_group.h +13 -1
  75. mindspore/include/api/types.h +10 -10
  76. mindspore/include/dataset/config.h +2 -2
  77. mindspore/include/dataset/constants.h +2 -2
  78. mindspore/include/dataset/execute.h +2 -2
  79. mindspore/include/dataset/vision.h +4 -0
  80. mindspore/jpeg62.dll +0 -0
  81. mindspore/log.py +1 -1
  82. mindspore/mindrecord/filewriter.py +68 -51
  83. mindspore/mindspore_backend.dll +0 -0
  84. mindspore/mindspore_common.dll +0 -0
  85. mindspore/mindspore_core.dll +0 -0
  86. mindspore/mindspore_glog.dll +0 -0
  87. mindspore/mindspore_np_dtype.dll +0 -0
  88. mindspore/mindspore_ops.dll +0 -0
  89. mindspore/mint/__init__.py +983 -46
  90. mindspore/mint/distributed/__init__.py +31 -0
  91. mindspore/mint/distributed/distributed.py +254 -0
  92. mindspore/mint/nn/__init__.py +268 -23
  93. mindspore/mint/nn/functional.py +125 -19
  94. mindspore/mint/nn/layer/__init__.py +39 -0
  95. mindspore/mint/nn/layer/activation.py +133 -0
  96. mindspore/mint/nn/layer/normalization.py +477 -0
  97. mindspore/mint/nn/layer/pooling.py +110 -0
  98. mindspore/mint/optim/adamw.py +26 -13
  99. mindspore/mint/special/__init__.py +63 -0
  100. mindspore/multiprocessing/__init__.py +2 -1
  101. mindspore/nn/__init__.py +0 -1
  102. mindspore/nn/cell.py +276 -96
  103. mindspore/nn/layer/activation.py +211 -44
  104. mindspore/nn/layer/basic.py +137 -10
  105. mindspore/nn/layer/embedding.py +137 -2
  106. mindspore/nn/layer/normalization.py +101 -5
  107. mindspore/nn/layer/padding.py +34 -48
  108. mindspore/nn/layer/pooling.py +161 -7
  109. mindspore/nn/layer/transformer.py +3 -3
  110. mindspore/nn/loss/__init__.py +2 -2
  111. mindspore/nn/loss/loss.py +84 -6
  112. mindspore/nn/optim/__init__.py +2 -1
  113. mindspore/nn/optim/adadelta.py +1 -1
  114. mindspore/nn/optim/adam.py +1 -1
  115. mindspore/nn/optim/lamb.py +1 -1
  116. mindspore/nn/optim/tft_wrapper.py +124 -0
  117. mindspore/nn/wrap/cell_wrapper.py +12 -23
  118. mindspore/nn/wrap/grad_reducer.py +5 -5
  119. mindspore/nn/wrap/loss_scale.py +17 -3
  120. mindspore/numpy/__init__.py +1 -1
  121. mindspore/numpy/array_creations.py +65 -68
  122. mindspore/numpy/array_ops.py +64 -60
  123. mindspore/numpy/fft.py +610 -75
  124. mindspore/numpy/logic_ops.py +11 -10
  125. mindspore/numpy/math_ops.py +85 -84
  126. mindspore/numpy/utils_const.py +4 -4
  127. mindspore/opencv_core452.dll +0 -0
  128. mindspore/opencv_imgcodecs452.dll +0 -0
  129. mindspore/opencv_imgproc452.dll +0 -0
  130. mindspore/ops/__init__.py +6 -4
  131. mindspore/ops/_grad_experimental/grad_array_ops.py +0 -11
  132. mindspore/ops/_grad_experimental/grad_comm_ops.py +67 -4
  133. mindspore/ops/_grad_experimental/grad_math_ops.py +0 -22
  134. mindspore/ops/_vmap/vmap_array_ops.py +2 -4
  135. mindspore/ops/_vmap/vmap_math_ops.py +17 -1
  136. mindspore/ops/_vmap/vmap_nn_ops.py +43 -2
  137. mindspore/ops/auto_generate/cpp_create_prim_instance_helper.py +91 -7
  138. mindspore/ops/auto_generate/gen_arg_dtype_cast.py +2 -0
  139. mindspore/ops/auto_generate/gen_extend_func.py +767 -13
  140. mindspore/ops/auto_generate/gen_ops_def.py +2452 -364
  141. mindspore/ops/auto_generate/gen_ops_prim.py +5442 -1756
  142. mindspore/ops/auto_generate/pyboost_inner_prim.py +176 -56
  143. mindspore/ops/composite/base.py +85 -48
  144. mindspore/ops/composite/multitype_ops/_compile_utils.py +1 -0
  145. mindspore/ops/composite/multitype_ops/not_in_impl.py +2 -2
  146. mindspore/ops/function/__init__.py +22 -0
  147. mindspore/ops/function/array_func.py +492 -153
  148. mindspore/ops/function/debug_func.py +113 -1
  149. mindspore/ops/function/fft_func.py +15 -2
  150. mindspore/ops/function/grad/grad_func.py +3 -2
  151. mindspore/ops/function/math_func.py +564 -207
  152. mindspore/ops/function/nn_func.py +817 -383
  153. mindspore/ops/function/other_func.py +3 -2
  154. mindspore/ops/function/random_func.py +402 -12
  155. mindspore/ops/function/reshard_func.py +13 -11
  156. mindspore/ops/function/sparse_unary_func.py +1 -1
  157. mindspore/ops/function/vmap_func.py +3 -2
  158. mindspore/ops/functional.py +24 -14
  159. mindspore/ops/op_info_register.py +3 -3
  160. mindspore/ops/operations/__init__.py +7 -2
  161. mindspore/ops/operations/_grad_ops.py +2 -76
  162. mindspore/ops/operations/_infer_ops.py +1 -1
  163. mindspore/ops/operations/_inner_ops.py +71 -94
  164. mindspore/ops/operations/array_ops.py +14 -146
  165. mindspore/ops/operations/comm_ops.py +63 -53
  166. mindspore/ops/operations/custom_ops.py +83 -19
  167. mindspore/ops/operations/debug_ops.py +42 -10
  168. mindspore/ops/operations/manually_defined/_inner.py +12 -0
  169. mindspore/ops/operations/manually_defined/ops_def.py +273 -20
  170. mindspore/ops/operations/math_ops.py +12 -223
  171. mindspore/ops/operations/nn_ops.py +20 -114
  172. mindspore/ops/operations/other_ops.py +7 -4
  173. mindspore/ops/operations/random_ops.py +46 -1
  174. mindspore/ops/primitive.py +18 -6
  175. mindspore/ops_generate/arg_dtype_cast.py +2 -0
  176. mindspore/ops_generate/gen_aclnn_implement.py +11 -11
  177. mindspore/ops_generate/gen_constants.py +36 -0
  178. mindspore/ops_generate/gen_ops.py +67 -52
  179. mindspore/ops_generate/gen_ops_inner_prim.py +1 -1
  180. mindspore/ops_generate/gen_pyboost_func.py +131 -47
  181. mindspore/ops_generate/op_proto.py +10 -3
  182. mindspore/ops_generate/pyboost_utils.py +14 -1
  183. mindspore/ops_generate/template.py +43 -21
  184. mindspore/parallel/__init__.py +3 -1
  185. mindspore/parallel/_auto_parallel_context.py +31 -9
  186. mindspore/parallel/_cell_wrapper.py +85 -0
  187. mindspore/parallel/_parallel_serialization.py +47 -19
  188. mindspore/parallel/_tensor.py +127 -13
  189. mindspore/parallel/_utils.py +53 -22
  190. mindspore/parallel/algo_parameter_config.py +5 -5
  191. mindspore/parallel/checkpoint_transform.py +46 -39
  192. mindspore/parallel/cluster/process_entity/__init__.py +1 -1
  193. mindspore/parallel/cluster/process_entity/_api.py +31 -23
  194. mindspore/parallel/cluster/process_entity/_utils.py +2 -27
  195. mindspore/parallel/parameter_broadcast.py +3 -4
  196. mindspore/parallel/shard.py +162 -31
  197. mindspore/parallel/transform_safetensors.py +1146 -0
  198. mindspore/profiler/__init__.py +2 -1
  199. mindspore/profiler/common/constant.py +29 -0
  200. mindspore/profiler/common/registry.py +47 -0
  201. mindspore/profiler/common/util.py +28 -0
  202. mindspore/profiler/dynamic_profiler.py +694 -0
  203. mindspore/profiler/envprofiling.py +17 -19
  204. mindspore/profiler/parser/ascend_analysis/constant.py +18 -0
  205. mindspore/profiler/parser/ascend_analysis/file_manager.py +25 -4
  206. mindspore/profiler/parser/ascend_analysis/function_event.py +43 -19
  207. mindspore/profiler/parser/ascend_analysis/fwk_cann_parser.py +31 -26
  208. mindspore/profiler/parser/ascend_analysis/fwk_file_parser.py +56 -10
  209. mindspore/profiler/parser/ascend_analysis/msprof_timeline_parser.py +55 -8
  210. mindspore/profiler/parser/ascend_analysis/path_manager.py +313 -0
  211. mindspore/profiler/parser/ascend_analysis/profiler_info_parser.py +27 -20
  212. mindspore/profiler/parser/ascend_analysis/trace_event_manager.py +9 -2
  213. mindspore/profiler/parser/ascend_msprof_exporter.py +5 -4
  214. mindspore/profiler/parser/ascend_timeline_generator.py +27 -25
  215. mindspore/profiler/parser/base_timeline_generator.py +19 -25
  216. mindspore/profiler/parser/cpu_gpu_timeline_generator.py +25 -12
  217. mindspore/profiler/parser/framework_parser.py +1 -391
  218. mindspore/profiler/parser/gpu_analysis/__init__.py +14 -0
  219. mindspore/profiler/parser/gpu_analysis/function_event.py +44 -0
  220. mindspore/profiler/parser/gpu_analysis/fwk_file_parser.py +89 -0
  221. mindspore/profiler/parser/gpu_analysis/profiler_info_parser.py +72 -0
  222. mindspore/profiler/parser/memory_usage_parser.py +0 -154
  223. mindspore/profiler/parser/profiler_info.py +78 -6
  224. mindspore/profiler/profiler.py +153 -0
  225. mindspore/profiler/profiling.py +285 -413
  226. mindspore/rewrite/__init__.py +1 -2
  227. mindspore/rewrite/common/namespace.py +4 -4
  228. mindspore/rewrite/symbol_tree/symbol_tree.py +3 -3
  229. mindspore/run_check/_check_version.py +39 -104
  230. mindspore/safeguard/rewrite_obfuscation.py +591 -247
  231. mindspore/swresample-4.dll +0 -0
  232. mindspore/swscale-6.dll +0 -0
  233. mindspore/tinyxml2.dll +0 -0
  234. mindspore/train/__init__.py +4 -3
  235. mindspore/train/_utils.py +105 -19
  236. mindspore/train/amp.py +171 -53
  237. mindspore/train/callback/__init__.py +2 -2
  238. mindspore/train/callback/_callback.py +4 -4
  239. mindspore/train/callback/_checkpoint.py +97 -31
  240. mindspore/train/callback/_cluster_monitor.py +1 -1
  241. mindspore/train/callback/_flops_collector.py +1 -0
  242. mindspore/train/callback/_loss_monitor.py +3 -3
  243. mindspore/train/callback/_on_request_exit.py +145 -31
  244. mindspore/train/callback/_summary_collector.py +5 -5
  245. mindspore/train/callback/_tft_register.py +375 -0
  246. mindspore/train/dataset_helper.py +15 -3
  247. mindspore/train/metrics/metric.py +3 -3
  248. mindspore/train/metrics/roc.py +4 -4
  249. mindspore/train/mind_ir_pb2.py +44 -39
  250. mindspore/train/model.py +154 -58
  251. mindspore/train/serialization.py +342 -128
  252. mindspore/turbojpeg.dll +0 -0
  253. mindspore/utils/__init__.py +21 -0
  254. mindspore/utils/utils.py +60 -0
  255. mindspore/version.py +1 -1
  256. {mindspore-2.3.0.dist-info → mindspore-2.4.1.dist-info}/METADATA +13 -7
  257. {mindspore-2.3.0.dist-info → mindspore-2.4.1.dist-info}/RECORD +260 -254
  258. {mindspore-2.3.0.dist-info → mindspore-2.4.1.dist-info}/WHEEL +1 -1
  259. mindspore/include/c_api/ms/abstract.h +0 -67
  260. mindspore/include/c_api/ms/attribute.h +0 -197
  261. mindspore/include/c_api/ms/base/handle_types.h +0 -43
  262. mindspore/include/c_api/ms/base/macros.h +0 -32
  263. mindspore/include/c_api/ms/base/status.h +0 -33
  264. mindspore/include/c_api/ms/base/types.h +0 -283
  265. mindspore/include/c_api/ms/context.h +0 -102
  266. mindspore/include/c_api/ms/graph.h +0 -160
  267. mindspore/include/c_api/ms/node.h +0 -606
  268. mindspore/include/c_api/ms/tensor.h +0 -161
  269. mindspore/include/c_api/ms/value.h +0 -84
  270. mindspore/mindspore_shared_lib.dll +0 -0
  271. mindspore/nn/extend/basic.py +0 -140
  272. mindspore/nn/extend/embedding.py +0 -143
  273. mindspore/nn/extend/layer/normalization.py +0 -109
  274. mindspore/nn/extend/pooling.py +0 -117
  275. mindspore/nn/layer/embedding_service.py +0 -531
  276. mindspore/ops/_op_impl/aicpu/strided_slice_v2.py +0 -93
  277. mindspore/ops/_op_impl/aicpu/strided_slice_v2_grad.py +0 -66
  278. mindspore/ops/extend/__init__.py +0 -53
  279. mindspore/ops/extend/array_func.py +0 -218
  280. mindspore/ops/extend/math_func.py +0 -76
  281. mindspore/ops/extend/nn_func.py +0 -308
  282. mindspore/ops/silent_check.py +0 -162
  283. mindspore/profiler/parser/msadvisor_analyzer.py +0 -82
  284. mindspore/profiler/parser/msadvisor_parser.py +0 -240
  285. mindspore/train/callback/_mindio_ttp.py +0 -443
  286. {mindspore-2.3.0.dist-info → mindspore-2.4.1.dist-info}/entry_points.txt +0 -0
  287. {mindspore-2.3.0.dist-info → mindspore-2.4.1.dist-info}/top_level.txt +0 -0
@@ -17,11 +17,18 @@
  Defines communication operators with functional form.
  """
  from mindspore.communication import GlobalComm, get_group_rank_from_world_rank, get_group_size
+ from mindspore.communication.management import _get_group
+ from mindspore.communication._comm_helper import _get_group_rank_from_world_rank_from_cache_helper
  from mindspore.common.tensor import Tensor
  from mindspore._c_expression import Tensor as Tensor_
  from mindspore.ops import ReduceOp, cat
  from mindspore.ops._primitive_cache import _get_cache_prim
  from mindspore.ops.primitive import _primexpr
+ from mindspore.ops.auto_generate.gen_ops_prim import (inner_comm_all_reduce_op, inner_comm_all_gather_op,
+ inner_comm_all_to_all_v_op, inner_comm_irecv_op,
+ inner_comm_isend_op, inner_comm_reduce_scatter_op)
+ from mindspore._c_expression import CommHandle as CommHandle_
+ from mindspore import jit_class

  __all__ = [
  'all_reduce',
@@ -36,15 +43,48 @@ __all__ = [
  'reduce_scatter_tensor',
  'reduce',
  'scatter_tensor',
+ 'send',
+ 'recv',
  'P2POp',
  'batch_isend_irecv',
  ]

  import mindspore.ops.operations as P

+ _GROPU_SIZE_CACHE = {}
+
+ @jit_class
+ class CommHandle(CommHandle_):
+ r"""
+ Usually, handles are created in C++during the execution of communication operators and returned to the Python
+ layer. It will not be created directly in Python. Only in scenarios where graph patterns are compatible,
+ handles will be created using Python.
+ """
+
+ def wait(self):
+ r"""
+ The wait for asynchronous handles will not take effect for handles created on the Python side.
+
+ >>> import numpy as np
+ >>> from mindspore.communication import init
+ >>> from mindspore.communication.comm_func import all_reduce
+ >>> from mindspore import Tensor
+ >>>
+ >>> init()
+ >>> input_tensor = Tensor(np.ones([2, 8]).astype(np.float32))
+ >>> output, handle = all_reduce(input_tensor, async_op=True)
+ >>> handle.wait()
+ >>> print(output)
+ [[2. 2. 2. 2. 2. 2. 2. 2.]
+ [2. 2. 2. 2. 2. 2. 2. 2.]]
+ """
+
+
+ default_handle = CommHandle()
+

  def _check_split_sizes_sequence(tensor, sequence):
- if sequence == []:
+ if not sequence:
  raise TypeError(f"sequence can not be empty list.")
  element0 = sequence[0]
  for idx in range(1, len(sequence)):
@@ -132,7 +172,17 @@ def _get_size(shape):
  return numel


- def all_reduce(tensor, op=ReduceOp.SUM, group=GlobalComm.WORLD_COMM_GROUP):
+ def _is_split_sizes_empty(split_sizes):
+ return split_sizes is None or not split_sizes
+
+
+ def _contiguous(tensor):
+ if not tensor.is_contiguous() or tensor.storage_offset() != 0:
+ tensor = tensor.contiguous()
+ return tensor
+
+
+ def all_reduce(tensor, op=ReduceOp.SUM, group=GlobalComm.WORLD_COMM_GROUP, async_op=False):
  """
  Reduce tensors across all devices in such a way that all deviceswill get the same final result,
  returns the tensor which is all reduced.
@@ -146,17 +196,20 @@ def all_reduce(tensor, op=ReduceOp.SUM, group=GlobalComm.WORLD_COMM_GROUP):
  On the CPU, only 'sum' is supported. Default: ``ReduceOp.SUM`` .
  group (str, optional): The communication group to work on. Default: ``GlobalComm.WORLD_COMM_GROUP`` , which
  means ``"hccl_world_group"`` in Ascend, and ``"nccl_world_group"`` in GPU.
+ async_op (bool, optional): Whether this operator should be an async operator. Default: ``False`` .

  Returns:
- Tensor, has the same shape of the input, i.e., :math:`(x_1, x_2, ..., x_R)`.
- The contents depend on the specified operation.
+ Tuple(Tensor, CommHandle), the output tensor has the same shape of the input,
+ i.e., :math:`(x_1, x_2, ..., x_R)`. The contents depend on the specified operation.
+ CommHandle is an async work handle, if `async_op` is set to True. CommHandle will be None,
+ when `async_op` is False.

  Raises:
  TypeError: If the type of the first input parameter is not Tensor, or any of `op` and `group` is not a str.
  RuntimeError: If device target is invalid, or backend is invalid, or distributed initialization fails.

  Supported Platforms:
- ``Ascend`` ``GPU`` ``CPU``
+ ``Ascend``

  Examples:
  .. note::
@@ -165,7 +218,7 @@ def all_reduce(tensor, op=ReduceOp.SUM, group=GlobalComm.WORLD_COMM_GROUP):
  For Ascend/GPU/CPU devices, it is recommended to use the msrun startup method
  without any third-party or configuration file dependencies.
  Please see the `msrun start up
- <https://www.mindspore.cn/tutorials/experts/zh-CN/master/parallel/msrun_launcher.html>`_
+ <https://www.mindspore.cn/docs/zh-CN/master/model_train/parallel/msrun_launcher.html>`_
  for more details.

  This example should be run with 2 devices.
@@ -185,11 +238,17 @@ def all_reduce(tensor, op=ReduceOp.SUM, group=GlobalComm.WORLD_COMM_GROUP):
  """
  if not isinstance(tensor, (Tensor, Tensor_)):
  raise TypeError("For all_reduce, the input tensor must be tensor")
- all_reduce_op = _get_cache_prim(P.AllReduce)(op=op, group=group)
- return all_reduce_op(tensor)
+ if not isinstance(op, str):
+ raise TypeError("For all_reduce, the input op type must be str")
+ if op not in ('sum', 'prod', 'min', 'max'):
+ raise TypeError("For all_reduce, the input op value must be one of sum, prod, min, max")
+ group = _get_group(group)
+ tensor = _contiguous(tensor)
+ output = inner_comm_all_reduce_op(tensor, op, group)
+ return _deal_comm_outputs(output, async_op)


- def all_gather_into_tensor(tensor, group=GlobalComm.WORLD_COMM_GROUP):
+ def all_gather_into_tensor(tensor, group=GlobalComm.WORLD_COMM_GROUP, async_op=False):
  """
  Gathers tensors from the specified communication group and returns the tensor which is all gathered.

@@ -201,10 +260,13 @@ def all_gather_into_tensor(tensor, group=GlobalComm.WORLD_COMM_GROUP):
  The shape of tensor is :math:`(x_1, x_2, ..., x_R)`.
  group (str, optional): The communication group to work on. Default: ``GlobalComm.WORLD_COMM_GROUP`` , which
  means ``"hccl_world_group"`` in Ascend, and ``"nccl_world_group"`` in GPU.
+ async_op (bool, optional): Whether this operator should be an async operator. Default: ``False`` .

  Returns:
- Tensor. If the number of devices in the group is N,
- then the shape of output is :math:`(N, x_1, x_2, ..., x_R)`.
+ Tuple(Tensor, CommHandle), if the number of devices in the group is N,
+ then the shape of output tensor is :math:`(N, x_1, x_2, ..., x_R)`.
+ CommHandle is an async work handle, if `async_op` is set to True.
+ CommHandle will be None, when `async_op` is False.

  Raises:
  TypeError: If the type of the first input parameter is not Tensor, or `group` is not a str.
@@ -213,7 +275,7 @@ def all_gather_into_tensor(tensor, group=GlobalComm.WORLD_COMM_GROUP):
  RuntimeError: If device target is invalid, or backend is invalid, or distributed initialization fails.

  Supported Platforms:
- ``Ascend`` ``GPU``
+ ``Ascend``

  Examples:
  .. note::
@@ -222,7 +284,7 @@ def all_gather_into_tensor(tensor, group=GlobalComm.WORLD_COMM_GROUP):
  For Ascend/GPU/CPU devices, it is recommended to use the msrun startup method
  without any third-party or configuration file dependencies.
  Please see the `msrun start up
- <https://www.mindspore.cn/tutorials/experts/zh-CN/master/parallel/msrun_launcher.html>`_
+ <https://www.mindspore.cn/docs/zh-CN/master/model_train/parallel/msrun_launcher.html>`_
  for more details.

  This example should be run with 2 devices.
@@ -248,11 +310,17 @@ def all_gather_into_tensor(tensor, group=GlobalComm.WORLD_COMM_GROUP):

  if not isinstance(tensor, (Tensor, Tensor_)):
  raise TypeError("For all_gather_into_tensor, the input tensor must be tensor")
- all_gather_op = _get_cache_prim(P.AllGather)(group=group)
- return all_gather_op(tensor)
+ group = _get_group(group)
+ global _GROPU_SIZE_CACHE
+ if group not in _GROPU_SIZE_CACHE:
+ _GROPU_SIZE_CACHE[group] = get_group_size(group)
+ group_size = _GROPU_SIZE_CACHE[group]
+ tensor = _contiguous(tensor)
+ output = inner_comm_all_gather_op(tensor, group_size, group)
+ return _deal_comm_outputs(output, async_op)


- def reduce_scatter_tensor(tensor, op=ReduceOp.SUM, group=GlobalComm.WORLD_COMM_GROUP):
+ def reduce_scatter_tensor(tensor, op=ReduceOp.SUM, group=GlobalComm.WORLD_COMM_GROUP, async_op=False):
  r"""
  Reduces and scatters tensors from the specified communication group and
  returns the tensor which is reduced and scattered.
@@ -268,9 +336,12 @@ def reduce_scatter_tensor(tensor, op=ReduceOp.SUM, group=GlobalComm.WORLD_COMM_G
  like SUM and MAX. Default: ``ReduceOp.SUM`` .
  group (str, optional): The communication group to work on. Default: ``GlobalComm.WORLD_COMM_GROUP`` , which
  means ``"hccl_world_group"`` in Ascend, and ``"nccl_world_group"`` in GPU.
+ async_op (bool, optional): Whether this operator should be an async operator. Default: ``False`` .

  Returns:
- Tensor, it has the same dtype as `input_x` with a shape of :math:`(N/rank\_size, *)`.
+ Tuple(Tensor, CommHandle), the output tensor has the same dtype as `input_x` with a shape of
+ :math:`(N/rank\_size, *)`. CommHandle is an async work handle, if `async_op` is set to True.
+ CommHandle will be None, when `async_op` is False.

  Raises:
  TypeError: If the type of the first input parameter is not Tensor, or any of `op` and `group` is not a str.
@@ -278,7 +349,7 @@ def reduce_scatter_tensor(tensor, op=ReduceOp.SUM, group=GlobalComm.WORLD_COMM_G
  RuntimeError: If device target is invalid, or backend is invalid, or distributed initialization fails.

  Supported Platforms:
- ``Ascend`` ``GPU``
+ ``Ascend``

  Examples:
  .. note::
@@ -287,7 +358,7 @@ def reduce_scatter_tensor(tensor, op=ReduceOp.SUM, group=GlobalComm.WORLD_COMM_G
  For Ascend/GPU/CPU devices, it is recommended to use the msrun startup method
  without any third-party or configuration file dependencies.
  Please see the `msrun start up
- <https://www.mindspore.cn/tutorials/experts/zh-CN/master/parallel/msrun_launcher.html>`_
+ <https://www.mindspore.cn/docs/zh-CN/master/model_train/parallel/msrun_launcher.html>`_
  for more details.

  This example should be run with 2 devices.
@@ -312,8 +383,14 @@ def reduce_scatter_tensor(tensor, op=ReduceOp.SUM, group=GlobalComm.WORLD_COMM_G

  if not isinstance(tensor, (Tensor, Tensor_)):
  raise TypeError("For reduce_scatter_tensor, the input tensor must be tensor")
- reduce_scatter_op = _get_cache_prim(P.ReduceScatter)(op=op, group=group)
- return reduce_scatter_op(tensor)
+ group = _get_group(group)
+ global _GROPU_SIZE_CACHE
+ if group not in _GROPU_SIZE_CACHE:
+ _GROPU_SIZE_CACHE[group] = get_group_size(group)
+ rank_size = _GROPU_SIZE_CACHE[group]
+ tensor = _contiguous(tensor)
+ output = inner_comm_reduce_scatter_op(tensor, rank_size, op, group)
+ return _deal_comm_outputs(output, async_op)


  def reduce(tensor, dst, op=ReduceOp.SUM, group=GlobalComm.WORLD_COMM_GROUP):
@@ -353,7 +430,7 @@ def reduce(tensor, dst, op=ReduceOp.SUM, group=GlobalComm.WORLD_COMM_GROUP):
  without any third-party or configuration file dependencies.

  Please see the `msrun start up
- <https://www.mindspore.cn/tutorials/experts/zh-CN/master/parallel/msrun_launcher.html>`_
+ <https://www.mindspore.cn/docs/zh-CN/master/model_train/parallel/msrun_launcher.html>`_
  for more details.

  This example should be run with 4 devices.
@@ -428,6 +505,7 @@ class P2POp:
  >>> recv_op = P2POp(irecv, recv_tensor, 0)
  >>> recv_op = P2POp('irecv', (), 0, recv_dtype=mindspore.float32)
  """
+
  def __init__(self, op, tensor, peer, group=None, tag=0, *, recv_dtype=None):
  self.op = op
  self.tensor = tensor
@@ -482,7 +560,7 @@ def batch_isend_irecv(p2p_op_list):
  For Ascend/GPU/CPU devices, it is recommended to use the msrun startup method
  without any third-party or configuration file dependencies.
  Please see the `msrun start up
- <https://www.mindspore.cn/tutorials/experts/zh-CN/master/parallel/msrun_launcher.html>`_
+ <https://www.mindspore.cn/docs/zh-CN/master/model_train/parallel/msrun_launcher.html>`_
  for more details.

  This example should be run with 2 devices.
@@ -519,6 +597,8 @@ def batch_isend_irecv(p2p_op_list):
  receive_shapes = []
  receive_dtypes = []
  tags = []
+ if not p2p_op_list:
+ raise TypeError(f"p2p_op_list can not be empty list.")
  group = p2p_op_list[0].group
  if group is None:
  group = GlobalComm.WORLD_COMM_GROUP
@@ -596,7 +676,7 @@ def scatter_tensor(tensor, src=0, group=GlobalComm.WORLD_COMM_GROUP):
  For Ascend/GPU/CPU devices, it is recommended to use the msrun startup method
  without any third-party or configuration file dependencies.
  Please see the `msrun start up
- <https://www.mindspore.cn/tutorials/experts/zh-CN/master/parallel/msrun_launcher.html>`_
+ <https://www.mindspore.cn/docs/zh-CN/master/model_train/parallel/msrun_launcher.html>`_
  for more details.

  This example should be run with 2 devices.
@@ -661,7 +741,7 @@ def gather_into_tensor(tensor, dst=0, group=GlobalComm.WORLD_COMM_GROUP):
  For Ascend/GPU/CPU devices, it is recommended to use the msrun startup method
  without any third-party or configuration file dependencies.
  Please see the `msrun start up
- <https://www.mindspore.cn/tutorials/experts/zh-CN/master/parallel/msrun_launcher.html>`_
+ <https://www.mindspore.cn/docs/zh-CN/master/model_train/parallel/msrun_launcher.html>`_
  for more details.

  This example should be run with 2 devices.
@@ -724,7 +804,7 @@ def broadcast(tensor, src=0, group=GlobalComm.WORLD_COMM_GROUP):
  For Ascend/GPU/CPU devices, it is recommended to use the msrun startup method
  without any third-party or configuration file dependencies.
  Please see the `msrun start up
- <https://www.mindspore.cn/tutorials/experts/zh-CN/master/parallel/msrun_launcher.html>`_
+ <https://www.mindspore.cn/docs/zh-CN/master/model_train/parallel/msrun_launcher.html>`_
  for more details.

  This example should be run with 2 devices.
@@ -778,7 +858,7 @@ def barrier(group=GlobalComm.WORLD_COMM_GROUP):
  For Ascend/GPU/CPU devices, it is recommended to use the msrun startup method
  without any third-party or configuration file dependencies.
  Please see the `msrun start up
- <https://www.mindspore.cn/tutorials/experts/zh-CN/master/parallel/msrun_launcher.html>`_
+ <https://www.mindspore.cn/docs/zh-CN/master/model_train/parallel/msrun_launcher.html>`_
  for more details.

  This example should be run with 2 devices.
@@ -797,7 +877,19 @@ def barrier(group=GlobalComm.WORLD_COMM_GROUP):
  return _op()


- def isend(tensor, dst=0, group=GlobalComm.WORLD_COMM_GROUP, tag=0):
+ def _deal_comm_outputs(output, async_op):
+ if isinstance(output, tuple):
+ if not async_op:
+ output[1].wait()
+ return (output[0], None)
+ return output
+
+ if not async_op:
+ return (output, None)
+ return (output, default_handle)
+
+
+ def send(tensor, dst=0, group=GlobalComm.WORLD_COMM_GROUP, tag=0):
  """
  Send tensors to the specified dest_rank.

@@ -817,7 +909,7 @@ def isend(tensor, dst=0, group=GlobalComm.WORLD_COMM_GROUP, tag=0):
  ValueError: If the rank ID of the process is greater than the rank size of the communication group.

  Supported Platforms:
- ``Ascend`` ``GPU``
+ ``Ascend``

  Examples:
  .. note::
@@ -826,7 +918,138 @@ def isend(tensor, dst=0, group=GlobalComm.WORLD_COMM_GROUP, tag=0):
  For Ascend/GPU/CPU devices, it is recommended to use the msrun startup method
  without any third-party or configuration file dependencies.
  Please see the `msrun start up
- <https://www.mindspore.cn/tutorials/experts/zh-CN/master/parallel/msrun_launcher.html>`_
+ <https://www.mindspore.cn/docs/zh-CN/master/model_train/parallel/msrun_launcher.html>`_
+ for more details.
+
+ This example should be run with 2 devices.
+
+ >>> from mindspore import ops
+ >>> import mindspore.nn as nn
+ >>> from mindspore.communication import init
+ >>> from mindspore.communication.comm_func import send
+ >>> from mindspore import Tensor
+ >>> import numpy as np
+ >>>
+ >>> init()
+ >>> input_ = Tensor(np.ones([2, 8]).astype(np.float32))
+ >>> send(input_, 0)
+ """
+ if not isinstance(tensor, (Tensor, Tensor_)):
+ raise TypeError("For send, the input tensor must be tensor")
+ group = _get_group(group)
+ _dst = _get_group_rank_from_world_rank_from_cache_helper(dst, group)
+ tensor = _contiguous(tensor)
+ output = inner_comm_isend_op(tensor, _dst, group, tag)
+ _deal_comm_outputs(output, False)
+
+
+ def recv(tensor, src=0, group=GlobalComm.WORLD_COMM_GROUP, tag=0):
+ """
+ Receive tensors from src.
+
+ Note:
+ Send and Receive must be used in combination and have same tag.
+ The shape and dtype of input `tensor` is used to receive tensor, but the value
+ of input `tensor` would not take effect.
+ Only support PyNative mode, Graph mode is not currently supported.
+
+ Args:
+ tensor (Tensor): The shape of tensor is :math:`(x_1, x_2, ..., x_R)`. The shape and dtype of this
+ tensor is used to receive tensor, but the value of input `tensor` would not take effect.
+ src (int, optional): A required integer identifying the source rank(global rank). Default: 0.
+ group (str, optional): The communication group to work on.
+ Default: "hccl_world_group" on Ascend, "nccl_world_group" on GPU.
+ tag (int, optional): A required integer identifying the send/recv message tag. The message will
+ be received by the Send op with the same "tag". Default: 0.
+
+ Returns:
+ Tensor, the shape of output is :math:`(x_1, x_2, ..., x_R)`.
+
+ Raises:
+ TypeError: If `src` is not an int or `group` is not a str.
+ ValueError: If the rank ID of the process is greater than the rank size of the communication group.
+
+ Supported Platforms:
+ ``Ascend``
+
+ Examples:
+ .. note::
+ Before running the following examples, you need to configure the communication environment variables.
+
+ For Ascend/GPU/CPU devices, it is recommended to use the msrun startup method
+ without any third-party or configuration file dependencies.
+ Please see the `msrun start up
+ <https://www.mindspore.cn/docs/zh-CN/master/model_train/parallel/msrun_launcher.html>`_
+ for more details.
+
+ This example should be run with 2 devices.
+
+ >>> from mindspore import ops
+ >>> import mindspore.nn as nn
+ >>> from mindspore.communication import init
+ >>> from mindspore.communication.comm_func import recv
+ >>> from mindspore import Tensor
+ >>> import numpy as np
+ >>>
+ # Launch 2 processes.
+ Process 0 send the following array to Process 1
+ [[ 0. 1.]
+ [ 2. 3.]]
+ >>> init()
+ >>> x = ms.Tensor(np.zeros([2, 2]))
+ # Process 1 receive tensor from Process 0.
+ >>> out = recv(x, src=0)
+ >>> print(out)
+ [[ 0. 1.]
+ [ 2. 3.]]
+ """
+ if not isinstance(tensor, (Tensor, Tensor_)):
+ raise TypeError("For recv, the input tensor must be tensor")
+ if not isinstance(src, int):
+ raise TypeError("For recv, the src must be int")
+ group = _get_group(group)
+ _src = _get_group_rank_from_world_rank_from_cache_helper(src, group)
+ tensor = _contiguous(tensor)
+ shape = tensor.shape
+ dtype = tensor.dtype
+ output, _ = _deal_comm_outputs(inner_comm_irecv_op(tag, _src, shape, group, dtype), False)
+ return output
+
+
+ def isend(tensor, dst=0, group=GlobalComm.WORLD_COMM_GROUP, tag=0):
+ """
+ Send tensors to the specified dest_rank asynchronously.
+
+ Note:
+ Send and Receive must be used in combination and have same tag.
+ Only support PyNative mode, Graph mode is not currently supported.
+
+ Args:
+ tensor (Tensor): The shape of tensor is :math:`(x_1, x_2, ..., x_R)`.
+ dst (int, optional): A required integer identifying the destination rank(global rank). Default: 0.
+ group (str, optional): The communication group to work on.
+ Default: "hccl_world_group" on Ascend, "nccl_world_group" on GPU.
+ tag (int, optional): A required integer identifying the send/recv message tag. The message will
+ be received by the Receive op with the same "tag". Default: 0.
+
+ Returns:
+ CommHandle, it is an async work handle.
+
+ Raises:
+ TypeError: `dst` is not an int or `group` is not a str。
+ ValueError: If the rank ID of the process is greater than the rank size of the communication group.
+
+ Supported Platforms:
+ ``Ascend``
+
+ Examples:
+ .. note::
+ Before running the following examples, you need to configure the communication environment variables.
+
+ For Ascend/GPU/CPU devices, it is recommended to use the msrun startup method
+ without any third-party or configuration file dependencies.
+ Please see the `msrun start up
+ <https://www.mindspore.cn/docs/zh-CN/master/model_train/parallel/msrun_launcher.html>`_
  for more details.

  This example should be run with 2 devices.
@@ -840,19 +1063,22 @@ def isend(tensor, dst=0, group=GlobalComm.WORLD_COMM_GROUP, tag=0):
  >>>
  >>> init()
  >>> input_ = Tensor(np.ones([2, 8]).astype(np.float32))
- >>> isend(input_, 0)
+ >>> handle = isend(input_, 0)
+ >>> handle.wait()
  """
  if not isinstance(tensor, (Tensor, Tensor_)):
  raise TypeError("For isend, the input tensor must be tensor")
- _dst = get_group_rank_from_world_rank(dst, group)
- _op = _get_cache_prim(P.Send)(tag, _dst, group, group)
- _depend = _get_cache_prim(P.Depend)()
- return _depend(tensor, _op(tensor))
+ group = _get_group(group)
+ _dst = _get_group_rank_from_world_rank_from_cache_helper(dst, group)
+ tensor = _contiguous(tensor)
+ output = inner_comm_isend_op(tensor, _dst, group, tag)
+ _, handle = _deal_comm_outputs(output, True)
+ return handle


  def irecv(tensor, src=0, group=GlobalComm.WORLD_COMM_GROUP, tag=0):
  """
- Receive tensors from src.
+ Receive tensors from src asynchronously.

  Note:
  Send and Receive must be used in combination and have same tag.
@@ -870,14 +1096,16 @@ def irecv(tensor, src=0, group=GlobalComm.WORLD_COMM_GROUP, tag=0):
  be received by the Send op with the same "tag". Default: 0.

  Returns:
- Tensor, the shape of output is :math:`(x_1, x_2, ..., x_R)`.
+ Tuple(Tensor, CommHandle), the shape of output is :math:`(x_1, x_2, ..., x_R)`.
+ CommHandle is an async work handle, if `async_op` is set to True.
+ CommHandle will be None, when `async_op` is False.

  Raises:
  TypeError: If `src` is not an int or `group` is not a str.
  ValueError: If the rank ID of the process is greater than the rank size of the communication group.

  Supported Platforms:
- ``Ascend`` ``GPU``
+ ``Ascend``

  Examples:
  .. note::
@@ -886,7 +1114,7 @@ def irecv(tensor, src=0, group=GlobalComm.WORLD_COMM_GROUP, tag=0):
  For Ascend/GPU/CPU devices, it is recommended to use the msrun startup method
  without any third-party or configuration file dependencies.
  Please see the `msrun start up
- <https://www.mindspore.cn/tutorials/experts/zh-CN/master/parallel/msrun_launcher.html>`_
+ <https://www.mindspore.cn/docs/zh-CN/master/model_train/parallel/msrun_launcher.html>`_
  for more details.

  This example should be run with 2 devices.
@@ -905,19 +1133,22 @@ def irecv(tensor, src=0, group=GlobalComm.WORLD_COMM_GROUP, tag=0):
  >>> init()
  >>> x = ms.Tensor(np.zeros([2, 2]))
  # Process 1 receive tensor from Process 0.
- >>> out = irecv(x, src=0)
+ >>> out, handle = irecv(x, src=0)
+ >>> handle.wait()
  >>> print(out)
  [[ 0. 1.]
  [ 2. 3.]]
  """
- _src = get_group_rank_from_world_rank(src, group)
+ group = _get_group(group)
+ _src = _get_group_rank_from_world_rank_from_cache_helper(src, group)
+ tensor = _contiguous(tensor)
  shape = tensor.shape
  dtype = tensor.dtype
- _op = _get_cache_prim(P.Receive)(tag, _src, shape, dtype, group, group)
- return _op(tensor)
+ output = inner_comm_irecv_op(tag, _src, shape, group, dtype)
+ return _deal_comm_outputs(output, True)


- def all_to_all_with_output_shape(output_shape_list, input_tensor_list, group=None):
+ def all_to_all_with_output_shape(output_shape_list, input_tensor_list, group=None, async_op=False):
  """
  scatter and gather list of tensor to/from all rank according to input/output tensor list.

@@ -932,9 +1163,12 @@ def all_to_all_with_output_shape(output_shape_list, input_tensor_list, group=Non
  List of tensors to scatter to the remote rank.
  group (str, optional): The communication group to work on.
  Default: None, which means "hccl_world_group" on Ascend, "nccl_world_group" on GPU.
+ async_op (bool, optional): Whether this operator should be an async operator. Default: ``False`` .

  Returns:
- Tuple(Tensor), the tensors gathered from remote ranks.
+ Tuple(Tuple(Tensor), CommHandle), the tensors is gathered from remote ranks.
+ CommHandle is an async work handle, if `async_op` is set to True.
+ CommHandle will be None, when `async_op` is False.

  Raises:
  TypeError: If `input_tensor_list` is not list of tensors.
@@ -951,7 +1185,7 @@ def all_to_all_with_output_shape(output_shape_list, input_tensor_list, group=Non
  For Ascend/GPU/CPU devices, it is recommended to use the msrun startup method
  without any third-party or configuration file dependencies.
  Please see the `msrun start up
- <https://www.mindspore.cn/tutorials/experts/zh-CN/master/parallel/msrun_launcher.html>`_
+ <https://www.mindspore.cn/docs/zh-CN/master/model_train/parallel/msrun_launcher.html>`_
  for more details.

  This example should be run with 2 devices.
@@ -1004,28 +1238,40 @@ def all_to_all_with_output_shape(output_shape_list, input_tensor_list, group=Non
  recv_numel_list.append(_get_size(_shape))
  recv_shape_list.append(_shape)

- _op = _get_cache_prim(P.AlltoAllV)(send_numel_list, recv_numel_list, group)
  send_flatten_tensor = cat(send_flatten_tensor)
- output = _op(send_flatten_tensor)
+ send_flatten_tensor = _contiguous(send_flatten_tensor)
+ group = GlobalComm.WORLD_COMM_GROUP if group is None else _get_group(group)
+ global _GROPU_SIZE_CACHE
+ if group not in _GROPU_SIZE_CACHE:
+ _GROPU_SIZE_CACHE[group] = get_group_size(group)
+ rank_size = _GROPU_SIZE_CACHE[group]
+ output = inner_comm_all_to_all_v_op(send_flatten_tensor, group, send_numel_list, recv_numel_list,
+ rank_size, False)
+ output, handle = _deal_comm_outputs(output, async_op)
  result = []
  offset = 0
  for numel, shape in zip(recv_numel_list, recv_shape_list):
  result.append(output[offset:offset + numel].reshape(shape))
  offset = offset + numel
- return tuple(result)
+ return (tuple(result), handle)


  def _get_all_to_all_single_numel_list(tensor, output_shape, output_split_sizes, input_split_sizes, group):
  """get numel list for all_to_all_single."""
- if input_split_sizes is None or not input_split_sizes:
- _world_size = get_group_size(group)
+ global _GROPU_SIZE_CACHE
+ if _is_split_sizes_empty(input_split_sizes):
+ if group not in _GROPU_SIZE_CACHE:
+ _GROPU_SIZE_CACHE[group] = get_group_size(group)
+ _world_size = _GROPU_SIZE_CACHE[group]
  if tensor.shape[0] % _world_size != 0:
  raise ValueError("input shape at dim 0 must be divided by world_size, "
  f"but got {tensor.shape[0]} and {_world_size}.")
  _split_size = tensor.shape[0] // _world_size
  input_split_sizes = (_split_size,) * _world_size
- if output_split_sizes is None or not output_split_sizes:
- _world_size = get_group_size(group)
+ if _is_split_sizes_empty(output_split_sizes):
+ if group not in _GROPU_SIZE_CACHE:
+ _GROPU_SIZE_CACHE[group] = get_group_size(group)
+ _world_size = _GROPU_SIZE_CACHE[group]
  shape_dim_0 = None
  if isinstance(output_shape, Tensor):
  shape_dim_0 = output_shape.shape[0]
@@ -1053,7 +1299,7 @@ def _get_all_to_all_single_numel_list(tensor, output_shape, output_split_sizes,


  def all_to_all_single_with_output_shape(output_shape, tensor, output_split_sizes=None,
- input_split_sizes=None, group=None):
+ input_split_sizes=None, group=None, async_op=False):
  """
  scatter and gather input with split size to/from all rank, and return result in a single tensor.

@@ -1071,11 +1317,13 @@ def all_to_all_single_with_output_shape(output_shape, tensor, output_split_sizes
  it means equally split by ``world_size``. Default: None.
  group (str, optional): The communication group to work on.
  Default: None, which means "hccl_world_group" on Ascend, "nccl_world_group" on GPU.
+ async_op (bool, optional): Whether this operator should be an async operator. Default: ``False`` .

  Returns:
- Tensor, the tensors gathered concatenated from remote ranks.
+ Tuple(Tensor, CommHandle), the output tensor is gathered concatenated from remote ranks.
  If the numel of tensor gathered from remote is zero, it will return a Tensor will value 0,
- which has no actual meanning.
+ which has no actual meanning. CommHandle is an async work handle, if `async_op` is set to True.
+ CommHandle will be None, when `async_op` is False.

  Raises:
  TypeError: If `tensor` is not tensor.
@@ -1091,7 +1339,7 @@ def all_to_all_single_with_output_shape(output_shape, tensor, output_split_sizes
  For Ascend/GPU/CPU devices, it is recommended to use the msrun startup method
  without any third-party or configuration file dependencies.
  Please see the `msrun start up
- <https://www.mindspore.cn/tutorials/experts/zh-CN/master/parallel/msrun_launcher.html>`_
+ <https://www.mindspore.cn/docs/zh-CN/master/model_train/parallel/msrun_launcher.html>`_
  for more details.

  This example should be run with 2 devices.
@@ -1129,12 +1377,19 @@ def all_to_all_single_with_output_shape(output_shape, tensor, output_split_sizes
  if group is None:
  group = GlobalComm.WORLD_COMM_GROUP

+ split_sizes_empty = _is_split_sizes_empty(output_split_sizes) and _is_split_sizes_empty(input_split_sizes)
  send_numel_list, recv_numel_list, recv_shape_without_first_dim = \
  _get_all_to_all_single_numel_list(tensor, output_shape, output_split_sizes, input_split_sizes, group)
- _op = _get_cache_prim(P.AlltoAllV)(send_numel_list, recv_numel_list, group)
+ tensor = _contiguous(tensor)
  _input = tensor.reshape(-1)
- result = _op(_input)
+ group = GlobalComm.WORLD_COMM_GROUP if group is None else _get_group(group)
+ global _GROPU_SIZE_CACHE
+ if group not in _GROPU_SIZE_CACHE:
+ _GROPU_SIZE_CACHE[group] = get_group_size(group)
+ rank_size = _GROPU_SIZE_CACHE[group]
+ result = inner_comm_all_to_all_v_op(_input, group, send_numel_list, recv_numel_list, rank_size, split_sizes_empty)
+ result, handle = _deal_comm_outputs(result, async_op)
  if any(recv_numel_list):
  result = result.reshape((-1,) + recv_shape_without_first_dim)

- return result
+ return result, handle
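
Taken together, the comm_func.py changes above swap the cached primitives (P.AllReduce, P.Send, P.Receive, P.AlltoAllV) for the new inner_comm_*_op kernels, add an async_op flag to the collectives, and make them return a (tensor, CommHandle) pair. The following is a minimal usage sketch based only on the docstrings shown in this diff; it assumes a 2-device Ascend job launched with msrun and a successful init(), and is not part of the package.

# Hedged sketch of the 2.4.1 async communication API documented in the diff above.
import numpy as np
from mindspore import Tensor
from mindspore.communication import init, get_rank
from mindspore.communication.comm_func import all_reduce, isend, irecv

init()

# all_reduce now returns (tensor, handle); handle is None when async_op=False.
x = Tensor(np.ones([2, 8]).astype(np.float32))
out, handle = all_reduce(x, async_op=True)
handle.wait()      # block until the collective completes
print(out)         # each rank sees the element-wise sum

# Point-to-point: isend returns a CommHandle, irecv returns (tensor, handle).
if get_rank() == 0:
    send_handle = isend(x, dst=1)
    send_handle.wait()
else:
    buf = Tensor(np.zeros([2, 8]).astype(np.float32))
    recv_out, recv_handle = irecv(buf, src=0)
    recv_handle.wait()
    print(recv_out)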