PyPI - mindspore - Versions diffs - 2.7.0rc1__cp310-cp310-win_amd64.whl → 2.7.1__cp310-cp310-win_amd64.whl - Mend

mindspore 2.7.0rc1__cp310-cp310-win_amd64.whl → 2.7.1__cp310-cp310-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of mindspore might be problematic. Click here for more details.

Files changed (370) hide show

mindspore/.commit_id +1 -1
mindspore/__init__.py +5 -2
mindspore/_c_dataengine.cp310-win_amd64.pyd +0 -0
mindspore/_c_expression.cp310-win_amd64.pyd +0 -0
mindspore/_c_mindrecord.cp310-win_amd64.pyd +0 -0
mindspore/_checkparam.py +2 -2
mindspore/_extends/builtin_operations.py +3 -3
mindspore/_extends/parallel_compile/akg_compiler/custom.py +1109 -0
mindspore/_extends/parallel_compile/akg_compiler/gen_custom_op_files.py +1 -1
mindspore/_extends/parse/__init__.py +3 -3
mindspore/_extends/parse/compile_config.py +24 -1
mindspore/_extends/parse/deprecated/deprecated_tensor_method.py +6 -3
mindspore/_extends/parse/parser.py +28 -22
mindspore/_extends/parse/resources.py +1 -1
mindspore/_extends/parse/standard_method.py +23 -2
mindspore/_extends/parse/trope.py +2 -1
mindspore/_extends/pijit/pijit_func_white_list.py +9 -27
mindspore/amp.py +0 -18
mindspore/avcodec-59.dll +0 -0
mindspore/avdevice-59.dll +0 -0
mindspore/avfilter-8.dll +0 -0
mindspore/avformat-59.dll +0 -0
mindspore/avutil-57.dll +0 -0
mindspore/boost/base.py +29 -2
mindspore/common/__init__.py +18 -12
mindspore/common/_decorator.py +3 -2
mindspore/common/_grad_function.py +3 -1
mindspore/common/_tensor_cpp_method.py +1 -1
mindspore/common/_tensor_docs.py +371 -96
mindspore/common/_utils.py +7 -43
mindspore/common/api.py +434 -135
mindspore/common/dtype.py +98 -57
mindspore/common/dump.py +7 -108
mindspore/common/dynamic_shape/__init__.py +0 -0
mindspore/common/{auto_dynamic_shape.py → dynamic_shape/auto_dynamic_shape.py} +15 -23
mindspore/common/dynamic_shape/enable_dynamic.py +197 -0
mindspore/common/file_system.py +59 -9
mindspore/common/hook_handle.py +82 -3
mindspore/common/jit_config.py +5 -1
mindspore/common/jit_trace.py +27 -12
mindspore/common/lazy_inline.py +5 -3
mindspore/common/np_dtype.py +3 -3
mindspore/common/parameter.py +17 -127
mindspore/common/recompute.py +4 -13
mindspore/common/tensor.py +50 -217
mindspore/communication/_comm_helper.py +11 -1
mindspore/communication/comm_func.py +138 -4
mindspore/communication/management.py +85 -1
mindspore/config/op_info.config +0 -15
mindspore/context.py +20 -106
mindspore/dataset/__init__.py +1 -1
mindspore/dataset/audio/transforms.py +1 -1
mindspore/dataset/core/config.py +35 -1
mindspore/dataset/engine/datasets.py +338 -319
mindspore/dataset/engine/datasets_user_defined.py +38 -22
mindspore/dataset/engine/datasets_vision.py +1 -1
mindspore/dataset/engine/validators.py +1 -15
mindspore/dataset/transforms/c_transforms.py +2 -2
mindspore/dataset/transforms/transforms.py +3 -3
mindspore/dataset/vision/__init__.py +1 -1
mindspore/dataset/vision/py_transforms.py +8 -8
mindspore/dataset/vision/transforms.py +17 -5
mindspore/dataset/vision/utils.py +632 -21
mindspore/device_context/ascend/op_tuning.py +35 -1
mindspore/dnnl.dll +0 -0
mindspore/{profiler/common/validator → graph}/__init__.py +9 -1
mindspore/graph/custom_pass.py +55 -0
mindspore/include/api/cell.h +28 -4
mindspore/include/api/cfg.h +24 -7
mindspore/include/api/context.h +1 -0
mindspore/include/api/delegate.h +0 -2
mindspore/include/api/dual_abi_helper.h +100 -19
mindspore/include/api/graph.h +14 -1
mindspore/include/api/kernel.h +16 -3
mindspore/include/api/kernel_api.h +9 -1
mindspore/include/api/metrics/accuracy.h +9 -0
mindspore/include/api/model.h +5 -1
mindspore/include/api/model_group.h +4 -0
mindspore/include/api/model_parallel_runner.h +2 -0
mindspore/include/api/status.h +48 -10
mindspore/include/api/types.h +6 -1
mindspore/include/dataset/constants.h +9 -0
mindspore/include/dataset/execute.h +2 -2
mindspore/jpeg62.dll +0 -0
mindspore/mindrecord/__init__.py +3 -3
mindspore/mindrecord/common/exceptions.py +1 -0
mindspore/mindrecord/config.py +1 -1
mindspore/{parallel/mpi → mindrecord/core}/__init__.py +4 -1
mindspore/mindrecord/{shardheader.py → core/shardheader.py} +2 -1
mindspore/mindrecord/{shardindexgenerator.py → core/shardindexgenerator.py} +1 -1
mindspore/mindrecord/{shardreader.py → core/shardreader.py} +2 -1
mindspore/mindrecord/{shardsegment.py → core/shardsegment.py} +2 -2
mindspore/mindrecord/{shardutils.py → core/shardutils.py} +1 -1
mindspore/mindrecord/{shardwriter.py → core/shardwriter.py} +1 -1
mindspore/mindrecord/filereader.py +4 -4
mindspore/mindrecord/filewriter.py +5 -5
mindspore/mindrecord/mindpage.py +2 -2
mindspore/mindrecord/tools/cifar10.py +4 -3
mindspore/mindrecord/tools/cifar100.py +1 -1
mindspore/mindrecord/tools/cifar100_to_mr.py +1 -1
mindspore/mindrecord/tools/cifar10_to_mr.py +6 -6
mindspore/mindrecord/tools/csv_to_mr.py +1 -1
mindspore/mindrecord/tools/imagenet_to_mr.py +1 -1
mindspore/mindrecord/tools/mnist_to_mr.py +1 -1
mindspore/mindrecord/tools/tfrecord_to_mr.py +1 -1
mindspore/mindspore_backend_common.dll +0 -0
mindspore/mindspore_backend_manager.dll +0 -0
mindspore/mindspore_cluster.dll +0 -0
mindspore/mindspore_common.dll +0 -0
mindspore/mindspore_core.dll +0 -0
mindspore/mindspore_cpu.dll +0 -0
mindspore/mindspore_dump.dll +0 -0
mindspore/mindspore_frontend.dll +0 -0
mindspore/mindspore_glog.dll +0 -0
mindspore/mindspore_hardware_abstract.dll +0 -0
mindspore/mindspore_memory_pool.dll +0 -0
mindspore/mindspore_ms_backend.dll +0 -0
mindspore/mindspore_ops.dll +0 -0
mindspore/{mindspore_ops_host.dll → mindspore_ops_cpu.dll} +0 -0
mindspore/mindspore_profiler.dll +0 -0
mindspore/mindspore_pyboost.dll +0 -0
mindspore/mindspore_pynative.dll +0 -0
mindspore/mindspore_runtime_pipeline.dll +0 -0
mindspore/mindspore_runtime_utils.dll +0 -0
mindspore/mindspore_tools.dll +0 -0
mindspore/mint/__init__.py +15 -10
mindspore/mint/distributed/__init__.py +4 -0
mindspore/mint/distributed/distributed.py +392 -69
mindspore/mint/nn/__init__.py +2 -16
mindspore/mint/nn/functional.py +4 -110
mindspore/mint/nn/layer/__init__.py +0 -2
mindspore/mint/nn/layer/_functions.py +1 -2
mindspore/mint/nn/layer/activation.py +0 -6
mindspore/mint/nn/layer/basic.py +0 -47
mindspore/mint/nn/layer/conv.py +10 -10
mindspore/mint/nn/layer/normalization.py +11 -16
mindspore/mint/nn/layer/pooling.py +0 -4
mindspore/nn/__init__.py +1 -3
mindspore/nn/cell.py +231 -239
mindspore/nn/layer/activation.py +4 -2
mindspore/nn/layer/basic.py +56 -14
mindspore/nn/layer/container.py +16 -0
mindspore/nn/layer/embedding.py +4 -169
mindspore/nn/layer/image.py +1 -1
mindspore/nn/layer/normalization.py +2 -1
mindspore/nn/layer/thor_layer.py +4 -85
mindspore/nn/optim/ada_grad.py +0 -1
mindspore/nn/optim/adafactor.py +0 -1
mindspore/nn/optim/adam.py +32 -127
mindspore/nn/optim/adamax.py +0 -1
mindspore/nn/optim/asgd.py +0 -1
mindspore/nn/optim/ftrl.py +8 -102
mindspore/nn/optim/lamb.py +1 -4
mindspore/nn/optim/lars.py +0 -3
mindspore/nn/optim/lazyadam.py +25 -218
mindspore/nn/optim/momentum.py +5 -43
mindspore/nn/optim/optimizer.py +6 -55
mindspore/nn/optim/proximal_ada_grad.py +0 -1
mindspore/nn/optim/rmsprop.py +0 -1
mindspore/nn/optim/rprop.py +0 -1
mindspore/nn/optim/sgd.py +0 -1
mindspore/nn/optim/tft_wrapper.py +2 -4
mindspore/nn/optim/thor.py +0 -2
mindspore/nn/probability/bijector/bijector.py +7 -8
mindspore/nn/probability/bijector/gumbel_cdf.py +2 -2
mindspore/nn/probability/bijector/power_transform.py +20 -21
mindspore/nn/probability/bijector/scalar_affine.py +5 -5
mindspore/nn/probability/bijector/softplus.py +13 -14
mindspore/nn/probability/distribution/_utils/utils.py +2 -2
mindspore/nn/wrap/cell_wrapper.py +39 -5
mindspore/nn/wrap/grad_reducer.py +4 -89
mindspore/numpy/array_creations.py +4 -4
mindspore/numpy/fft.py +9 -9
mindspore/numpy/utils_const.py +1 -1
mindspore/{nn/reinforcement → onnx}/__init__.py +5 -8
mindspore/onnx/onnx_export.py +137 -0
mindspore/opencv_core4110.dll +0 -0
mindspore/opencv_imgcodecs4110.dll +0 -0
mindspore/{opencv_imgproc452.dll → opencv_imgproc4110.dll} +0 -0
mindspore/ops/__init__.py +2 -0
mindspore/ops/_grad_experimental/grad_comm_ops.py +38 -2
mindspore/ops/_grad_experimental/grad_inner_ops.py +0 -9
mindspore/ops/_op_impl/aicpu/__init__.py +0 -10
mindspore/ops/_op_impl/cpu/__init__.py +1 -5
mindspore/ops/_op_impl/cpu/{buffer_append.py → joinedstr_op.py} +8 -8
mindspore/ops/auto_generate/cpp_create_prim_instance_helper.py +28 -24
mindspore/ops/auto_generate/gen_extend_func.py +6 -11
mindspore/ops/auto_generate/gen_ops_def.py +385 -154
mindspore/ops/auto_generate/gen_ops_prim.py +5676 -5167
mindspore/ops/communication.py +97 -0
mindspore/ops/composite/__init__.py +5 -2
mindspore/ops/composite/base.py +16 -2
mindspore/ops/composite/multitype_ops/__init__.py +3 -1
mindspore/ops/composite/multitype_ops/_compile_utils.py +150 -8
mindspore/ops/composite/multitype_ops/_constexpr_utils.py +1 -1
mindspore/ops/composite/multitype_ops/add_impl.py +7 -0
mindspore/ops/composite/multitype_ops/mod_impl.py +27 -0
mindspore/ops/function/__init__.py +2 -0
mindspore/ops/function/array_func.py +24 -18
mindspore/ops/function/comm_func.py +3883 -0
mindspore/ops/function/debug_func.py +7 -6
mindspore/ops/function/grad/grad_func.py +4 -12
mindspore/ops/function/math_func.py +89 -86
mindspore/ops/function/nn_func.py +92 -313
mindspore/ops/function/random_func.py +9 -18
mindspore/ops/functional.py +4 -1
mindspore/ops/functional_overload.py +377 -30
mindspore/ops/operations/__init__.py +2 -5
mindspore/ops/operations/_custom_ops_utils.py +7 -9
mindspore/ops/operations/_inner_ops.py +12 -50
mindspore/ops/operations/_rl_inner_ops.py +0 -933
mindspore/ops/operations/array_ops.py +5 -50
mindspore/ops/operations/comm_ops.py +95 -17
mindspore/ops/operations/custom_ops.py +237 -22
mindspore/ops/operations/debug_ops.py +33 -35
mindspore/ops/operations/manually_defined/ops_def.py +39 -318
mindspore/ops/operations/math_ops.py +5 -5
mindspore/ops/operations/nn_ops.py +3 -3
mindspore/ops/operations/sparse_ops.py +0 -83
mindspore/ops/primitive.py +4 -27
mindspore/ops/tensor_method.py +88 -10
mindspore/ops_generate/aclnn/aclnn_kernel_register_auto_cc_generator.py +5 -5
mindspore/ops_generate/aclnn/gen_aclnn_implement.py +8 -8
mindspore/ops_generate/api/functions_cc_generator.py +53 -4
mindspore/ops_generate/api/tensor_func_reg_cpp_generator.py +25 -11
mindspore/ops_generate/common/gen_constants.py +11 -10
mindspore/ops_generate/common/op_proto.py +18 -1
mindspore/ops_generate/common/template.py +102 -245
mindspore/ops_generate/common/template_utils.py +212 -0
mindspore/ops_generate/gen_custom_ops.py +69 -0
mindspore/ops_generate/op_def/ops_def_cc_generator.py +78 -7
mindspore/ops_generate/op_def_py/base_op_prim_py_generator.py +360 -0
mindspore/ops_generate/op_def_py/custom_op_prim_py_generator.py +140 -0
mindspore/ops_generate/op_def_py/op_def_py_generator.py +54 -7
mindspore/ops_generate/op_def_py/op_prim_py_generator.py +5 -312
mindspore/ops_generate/pyboost/auto_grad_impl_cc_generator.py +74 -17
mindspore/ops_generate/pyboost/auto_grad_reg_cc_generator.py +22 -5
mindspore/ops_generate/pyboost/gen_pyboost_func.py +0 -16
mindspore/ops_generate/pyboost/op_template_parser.py +3 -2
mindspore/ops_generate/pyboost/pyboost_functions_cpp_generator.py +21 -5
mindspore/ops_generate/pyboost/pyboost_functions_h_generator.py +2 -2
mindspore/ops_generate/pyboost/pyboost_functions_impl_cpp_generator.py +30 -10
mindspore/ops_generate/pyboost/pyboost_grad_function_cpp_generator.py +10 -3
mindspore/ops_generate/pyboost/pyboost_internal_kernel_info_adapter_generator.py +1 -1
mindspore/ops_generate/pyboost/pyboost_native_grad_functions_generator.py +19 -9
mindspore/ops_generate/pyboost/pyboost_op_cpp_code_generator.py +71 -28
mindspore/ops_generate/pyboost/pyboost_overload_functions_cpp_generator.py +10 -9
mindspore/ops_generate/pyboost/pyboost_utils.py +27 -16
mindspore/ops_generate/resources/yaml_loader.py +13 -0
mindspore/ops_generate/tensor_py_cc_generator.py +2 -2
mindspore/parallel/_auto_parallel_context.py +5 -15
mindspore/parallel/_cell_wrapper.py +1 -1
mindspore/parallel/_parallel_serialization.py +4 -6
mindspore/parallel/_ps_context.py +2 -2
mindspore/parallel/_utils.py +34 -17
mindspore/parallel/auto_parallel.py +23 -9
mindspore/parallel/checkpoint_transform.py +20 -2
mindspore/parallel/cluster/process_entity/_api.py +28 -33
mindspore/parallel/cluster/process_entity/_utils.py +9 -5
mindspore/parallel/cluster/run.py +5 -3
mindspore/{experimental/llm_boost/ascend_native → parallel/distributed}/__init__.py +21 -22
mindspore/parallel/distributed/distributed_data_parallel.py +393 -0
mindspore/parallel/distributed/flatten_grad_buffer.py +295 -0
mindspore/parallel/function/reshard_func.py +6 -5
mindspore/parallel/nn/parallel_cell_wrapper.py +40 -3
mindspore/parallel/nn/parallel_grad_reducer.py +0 -8
mindspore/parallel/shard.py +7 -21
mindspore/parallel/strategy.py +336 -0
mindspore/parallel/transform_safetensors.py +127 -20
mindspore/profiler/analysis/viewer/ascend_kernel_details_viewer.py +13 -9
mindspore/profiler/analysis/viewer/ascend_op_memory_viewer.py +1 -1
mindspore/profiler/analysis/viewer/ms_minddata_viewer.py +1 -1
mindspore/profiler/common/constant.py +5 -0
mindspore/profiler/common/file_manager.py +9 -0
mindspore/profiler/common/msprof_cmd_tool.py +40 -4
mindspore/profiler/common/path_manager.py +65 -24
mindspore/profiler/common/profiler_context.py +27 -14
mindspore/profiler/common/profiler_info.py +3 -3
mindspore/profiler/common/profiler_meta_data.py +1 -0
mindspore/profiler/common/profiler_op_analyse.py +10 -6
mindspore/profiler/common/profiler_path_manager.py +13 -0
mindspore/profiler/common/util.py +30 -3
mindspore/profiler/dynamic_profiler.py +91 -46
mindspore/profiler/envprofiler.py +30 -5
mindspore/profiler/experimental_config.py +18 -2
mindspore/profiler/platform/cpu_profiler.py +10 -4
mindspore/profiler/platform/npu_profiler.py +34 -7
mindspore/profiler/profiler.py +193 -145
mindspore/profiler/profiler_action_controller.py +1 -1
mindspore/profiler/profiler_interface.py +2 -2
mindspore/rewrite/symbol_tree/symbol_tree.py +1 -1
mindspore/run_check/_check_version.py +108 -24
mindspore/runtime/__init__.py +9 -6
mindspore/runtime/executor.py +35 -0
mindspore/runtime/memory.py +113 -0
mindspore/runtime/thread_bind_core.py +1 -1
mindspore/swresample-4.dll +0 -0
mindspore/swscale-6.dll +0 -0
mindspore/tinyxml2.dll +0 -0
mindspore/{experimental/llm_boost → tools}/__init__.py +5 -5
mindspore/tools/data_dump.py +130 -0
mindspore/tools/sdc_detect.py +91 -0
mindspore/tools/stress_detect.py +63 -0
mindspore/train/__init__.py +6 -6
mindspore/train/_utils.py +8 -21
mindspore/train/amp.py +6 -7
mindspore/train/callback/_callback.py +2 -1
mindspore/train/callback/_checkpoint.py +1 -17
mindspore/train/callback/_flops_collector.py +10 -6
mindspore/train/callback/_train_fault_tolerance.py +72 -25
mindspore/train/data_sink.py +5 -9
mindspore/train/dataset_helper.py +5 -5
mindspore/train/model.py +41 -230
mindspore/train/serialization.py +160 -401
mindspore/train/train_thor/model_thor.py +2 -2
mindspore/turbojpeg.dll +0 -0
mindspore/utils/__init__.py +6 -3
mindspore/utils/dlpack.py +92 -0
mindspore/utils/dryrun.py +1 -1
mindspore/utils/runtime_execution_order_check.py +10 -0
mindspore/utils/sdc_detect.py +14 -12
mindspore/utils/stress_detect.py +43 -0
mindspore/utils/utils.py +152 -16
mindspore/version.py +1 -1
{mindspore-2.7.0rc1.dist-info → mindspore-2.7.1.dist-info}/METADATA +3 -2
{mindspore-2.7.0rc1.dist-info → mindspore-2.7.1.dist-info}/RECORD +330 -344
mindspore/_extends/remote/kernel_build_server_ascend.py +0 -75
mindspore/communication/_hccl_management.py +0 -297
mindspore/experimental/llm_boost/ascend_native/llama_boost_ascend_native.py +0 -207
mindspore/experimental/llm_boost/ascend_native/llm_boost.py +0 -52
mindspore/experimental/llm_boost/atb/__init__.py +0 -23
mindspore/experimental/llm_boost/atb/boost_base.py +0 -385
mindspore/experimental/llm_boost/atb/llama_boost.py +0 -137
mindspore/experimental/llm_boost/atb/qwen_boost.py +0 -124
mindspore/experimental/llm_boost/register.py +0 -130
mindspore/experimental/llm_boost/utils.py +0 -31
mindspore/include/OWNERS +0 -7
mindspore/mindspore_cpu_res_manager.dll +0 -0
mindspore/mindspore_ops_kernel_common.dll +0 -0
mindspore/mindspore_res_manager.dll +0 -0
mindspore/nn/optim/_dist_optimizer_registry.py +0 -111
mindspore/nn/reinforcement/_batch_read_write.py +0 -142
mindspore/nn/reinforcement/_tensors_queue.py +0 -152
mindspore/nn/reinforcement/tensor_array.py +0 -145
mindspore/opencv_core452.dll +0 -0
mindspore/opencv_imgcodecs452.dll +0 -0
mindspore/ops/_op_impl/aicpu/priority_replay_buffer.py +0 -113
mindspore/ops/_op_impl/aicpu/reservoir_replay_buffer.py +0 -96
mindspore/ops/_op_impl/aicpu/sparse_cross.py +0 -42
mindspore/ops/_op_impl/cpu/buffer_get.py +0 -28
mindspore/ops/_op_impl/cpu/buffer_sample.py +0 -28
mindspore/ops/_op_impl/cpu/priority_replay_buffer.py +0 -42
mindspore/ops/operations/_tensor_array.py +0 -359
mindspore/ops/operations/rl_ops.py +0 -288
mindspore/parallel/_offload_context.py +0 -275
mindspore/parallel/_recovery_context.py +0 -115
mindspore/parallel/_transformer/__init__.py +0 -35
mindspore/parallel/_transformer/layers.py +0 -765
mindspore/parallel/_transformer/loss.py +0 -251
mindspore/parallel/_transformer/moe.py +0 -693
mindspore/parallel/_transformer/op_parallel_config.py +0 -222
mindspore/parallel/_transformer/transformer.py +0 -3124
mindspore/parallel/mpi/_mpi_config.py +0 -116
mindspore/profiler/common/validator/validate_path.py +0 -84
mindspore/train/memory_profiling_pb2.py +0 -298
mindspore/utils/hooks.py +0 -81
/mindspore/common/{_auto_dynamic.py → dynamic_shape/_auto_dynamic.py} +0 -0
{mindspore-2.7.0rc1.dist-info → mindspore-2.7.1.dist-info}/WHEEL +0 -0
{mindspore-2.7.0rc1.dist-info → mindspore-2.7.1.dist-info}/entry_points.txt +0 -0
{mindspore-2.7.0rc1.dist-info → mindspore-2.7.1.dist-info}/top_level.txt +0 -0

mindspore/ops/function/comm_func.py ADDED Viewed

@@ -0,0 +1,3883 @@
+# Copyright 2024 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""Communication management API"""
+from __future__ import absolute_import
+import hashlib
+import builtins
+import io
+import sys
+import pickle
+from datetime import timedelta
+import numpy as np
+from mindspore import log as logger
+from mindspore.common import dtype as mstype
+from mindspore._checkparam import args_type_check
+from mindspore.common.api import jit_class
+from mindspore.common.api import _pynative_executor
+from mindspore.runtime.stream import synchronize
+from mindspore.ops.operations.comm_ops import ReduceOp
+from mindspore.ops.auto_generate import cat
+from mindspore.common.tensor import Tensor
+from mindspore._c_expression import TensorPy as Tensor_
+from mindspore.ops.primitive import _primexpr
+from mindspore.communication.management import _init_without_sched
+from mindspore.communication._comm_helper import (
+    _destroy_group_helper,
+    _get_rank_helper,
+    _get_size_helper,
+    _get_backend,
+    _get_group_ranks,
+    _is_available,
+    _is_initialized,
+    _ExistingGroup,
+    _get_group_rank_from_world_rank_from_cache_helper,
+)
+from mindspore.communication import (
+    init,
+    get_group_size,
+    get_world_rank_from_group_rank,
+    create_group,
+    GlobalComm,
+    get_group_rank_from_world_rank,
+)
+from mindspore.ops.auto_generate.gen_ops_prim import (
+    inner_comm_all_reduce_op,
+    inner_comm_all_gather_op,
+    inner_comm_all_to_all_v_op,
+    inner_comm_irecv_op,
+    inner_comm_reduce_scatter_op
+)
+from mindspore.ops.auto_generate.gen_ops_prim import (
+    dist_comm_all_gather_op,
+    dist_comm_all_reduce_op,
+    dist_comm_reduce_scatter_op,
+    dist_comm_isend_op,
+    dist_comm_all_to_all_v_op,
+    dist_comm_reduce_scatter_tensor_op,
+    dist_comm_reduce_scatter_tensor_uneven_op,
+    dist_comm_all_to_all_v_single_op,
+    dist_comm_broadcast_op,
+    dist_comm_all_gather_into_tensor_op,
+    dist_comm_all_gather_into_tensor_uneven_op,
+    dist_comm_irecv_op,
+    dist_comm_scatter_tensor_op,
+    dist_comm_gather_into_tensor_op,
+    dist_comm_gather_op,
+    dist_comm_reduce_op,
+    dist_comm_scatter_op,
+    dist_comm_barrier_op,
+    dist_comm_batch_isend_irecv_op,
+)
+from mindspore._c_expression import TCPStoreClient, GroupOptions, _finalize_collective
+from mindspore._c_expression import CommHandle as CommHandle_
+# Names exported by this module when it is star-imported.
+__all__ = [
+    "TCPStore",
+    "init_process_group",
+    "destroy_process_group",
+    "get_rank",
+    "get_world_size",
+    "new_group",
+    "get_backend",
+    "get_global_rank",
+    "get_process_group_ranks",
+    "get_group_rank",
+    "all_reduce",
+    "all_gather_into_tensor",
+    "all_gather_into_tensor_uneven",
+    "all_to_all",
+    "all_to_all_single",
+    "reduce_scatter_tensor",
+    "reduce_scatter_tensor_uneven",
+    "isend",
+    "irecv",
+    "send",
+    "recv",
+    "gather",
+    "scatter",
+    "all_gather",
+    "reduce_scatter",
+    "barrier",
+    "broadcast",
+    "reduce",
+    "P2POp",
+    "batch_isend_irecv",
+    "all_gather_object",
+    "broadcast_object_list",
+    "gather_object",
+    "scatter_object_list",
+    "is_available",
+    "is_initialized",
+    'gather_into_tensor',
+    'scatter_tensor',
+    'set_comm_ops_inplace',
+    'all_to_all_v_c'
+]
+# Pickler class used by `_object_to_tensor` to serialize Python objects.
+_pickler = pickle.Pickler
+# Alias of the unrestricted unpickler. `_tensor_to_object` does not use it; it deserializes through
+# `restricted_loads` / `RestrictedUnpickler` instead.
+_unpickler = pickle.Unpickler
+# Backend name strings.
+BACKEND_HCCL = "hccl"
+BACKEND_MCCL = "mccl"
+# Per-group caches filled lazily by `get_cache_group_size` / `get_cache_group_rank`.
+# NOTE(review): "GROPU" looks like a typo of "GROUP"; renaming requires updating every reference
+# in the module.
+_GROPU_SIZE_CACHE = {}
+_GROPU_RANK_CACHE = {}
+# NOTE(review): presumably a cache for all_to_all bookkeeping -- confirm against its users.
+_ALL_TO_ALL_CACHE = {}
+# Names from the `builtins` module that `RestrictedUnpickler.find_class` is allowed to resolve.
+safe_builtins = {
+    'range',
+    'complex',
+    'set',
+    'frozenset',
+    'slice',
+}
+def get_cache_group_size(group=GlobalComm.WORLD_COMM_GROUP):
+    """get cache group size."""
+    global _GROPU_SIZE_CACHE
+    if group not in _GROPU_SIZE_CACHE:
+        _GROPU_SIZE_CACHE[group] = _get_size_helper(group)
+    group_size = _GROPU_SIZE_CACHE[group]
+    return group_size
+def get_cache_group_rank(group=GlobalComm.WORLD_COMM_GROUP):
+    """get cache rank id."""
+    global _GROPU_RANK_CACHE
+    if group not in _GROPU_RANK_CACHE:
+        _GROPU_RANK_CACHE[group] = _get_rank_helper(group)
+    group_rank = _GROPU_RANK_CACHE[group]
+    return group_rank
+class RestrictedUnpickler(pickle.Unpickler):
+    # Override find_class method.
+    def find_class(self, module, name):
+        # Only allow safe classes from builtins.
+        if module == "builtins" and name in safe_builtins:
+            return getattr(builtins, name)
+        # Forbid everything else.
+        raise pickle.UnpicklingError("global '%s.%s' is forbidden" %
+                                     (module, name))
+def restricted_loads(s):
+    """Helper function analogous to pickle.loads()."""
+    return RestrictedUnpickler(io.BytesIO(s)).load()
+def _object_to_tensor(obj, size=0):
+    f = io.BytesIO()
+    _pickler(f).dump(obj)
+    buf = np.frombuffer(f.getvalue(), dtype=np.int8)
+    tensor_size = buf.size
+    if size > tensor_size:
+        buf = np.resize(buf, size)
+        tensor_size = size
+    return Tensor(buf), tensor_size
+def _tensor_to_object(tensor, tensor_size):
+    buf = tensor.asnumpy().tobytes()[:tensor_size]
+    return restricted_loads(buf)
+comm_funcs = [
+    "all_reduce",
+    "all_gather_into_tensor",
+    "all_gather_into_tensor_uneven",
+    "all_to_all",
+    "all_to_all_single",
+    "reduce_scatter_tensor",
+    "reduce_scatter_tensor_uneven",
+    "isend",
+    "irecv",
+    "send",
+    "recv",
+    "gather",
+    "scatter",
+    "all_gather",
+    "reduce_scatter",
+    "barrier",
+    "broadcast",
+    "reduce",
+    "batch_isend_irecv",
+    "all_gather_object",
+    "broadcast_object_list",
+    "gather_object",
+    "scatter_object_list",
+    'gather_into_tensor',
+    'scatter_tensor',
+    'all_to_all_v_c'
+]
+_COMM_ENABLE_PLACE = {item: True for item in comm_funcs}
+def is_inplace_func():
+    """if is inplace func name."""
+    global _COMM_ENABLE_PLACE
+    caller_name = sys._getframe(1).f_code.co_name # pylint: disable=protected-access
+    if caller_name in _COMM_ENABLE_PLACE:
+        return _COMM_ENABLE_PLACE[caller_name]
+    return False
+def set_comm_ops_inplace(is_enable, func_list=None):
+    """
+    Set inplace attribute to communication function.
+    Args:
+        is_enable (bool): Whether to enable inplace.
+        func_list (list): Indicates which functions have their inplace attributes set.
+    Raises:
+        TypeError: If `is_enable` is not bool.
+        TypeError: If `func_list` is not None and not list.
+        ValueError: The function name in `func_list` is invalid.
+    Supported Platforms:
+        ``Ascend``
+    Examples:
+        .. note::
+            Before running the following examples, you need to configure the communication environment variables.
+            For Ascend devices, it is recommended to use the msrun startup method
+            without any third-party or configuration file dependencies.
+            Please see the `msrun start up
+            <https://www.mindspore.cn/tutorials/en/master/parallel/msrun_launcher.html>`_
+            for more details.
+        >>> from mindspore.ops.communication import set_comm_ops_inplace
+        >>> set_comm_ops_inplace(True)
+    """
+    global _COMM_ENABLE_PLACE
+    if not isinstance(is_enable, bool):
+        raise TypeError(
+            "For 'set_comm_ops_inplace', the argument 'is_enable' must be type of bool, "
+            "but got 'is_enable' type : {}.".format(type(is_enable))
+        )
+    if func_list is None:
+        for func in _COMM_ENABLE_PLACE:
+            _COMM_ENABLE_PLACE[func] = is_enable
+        return
+    if not isinstance(func_list, (list, tuple)):
+        raise TypeError(f"Expected list or tuple, but got {type(func_list)}.")
+    for func in func_list:
+        if func not in _COMM_ENABLE_PLACE:
+            raise ValueError(f"The function name in `func_list` must be correct, but got {func}.")
+        _COMM_ENABLE_PLACE[func] = is_enable
+@jit_class
+class CommHandle(CommHandle_):
+    r"""
+    Usually, handles are created in C++ during the execution of communication operators and returned to the
+    Python layer. It will not be created directly in Python. Only in scenarios where graph patterns are
+    compatible, handles will be created using Python.
+    Args:
+        handle (object, optional): Underlying handle whose `wait()` is called by `wait`. Default: ``None``,
+            in which case `wait` has nothing to wait on.
+        exec_sync (bool, optional): If ``True``, `wait` additionally calls `synchronize()`.
+            Default: ``False``.
+    """
+    def __init__(self, handle=None, exec_sync=False):
+        super(CommHandle, self).__init__()
+        self.handle = handle
+        self.exec_sync = exec_sync
+    def wait(self):
+        r"""
+        The wait for asynchronous handles will not take effect for handles created on the Python side.
+        Examples:
+            >>> import numpy as np
+            >>> from mindspore.communication import init
+            >>> from mindspore.ops.communication import all_reduce
+            >>> from mindspore import Tensor
+            >>>
+            >>> init()
+            >>> input_tensor = Tensor(np.ones([2, 8]).astype(np.float32))
+            >>> output, handle = all_reduce(input_tensor, async_op=True)
+            >>> handle.wait()
+            >>> print(output)
+            [[2. 2. 2. 2. 2. 2. 2. 2.]
+             [2. 2. 2. 2. 2. 2. 2. 2.]]
+        """
+        # A handle built without an underlying handle (the default) is falsy here, so the wait is skipped.
+        if self.handle:
+            self.handle.wait()
+        # Optional extra synchronization requested when the handle was created.
+        if self.exec_sync:
+            synchronize()
+# Shared handle with no underlying handle and no extra synchronization; `_deal_comm_outputs` returns it
+# for asynchronous calls whose output carries no handle of its own.
+default_handle = CommHandle()
+def _deal_comm_outputs(output, async_op, exec_sync=False):
+    """
+    deal with comm ops outputs.
+    """
+    if isinstance(output, tuple):
+        if not async_op:
+            output[1].wait()
+            if exec_sync:
+                synchronize()
+            return (output[0], None)
+        return (output[0], CommHandle(output[1], exec_sync))
+    if not async_op:
+        return (output, None)
+    return (output, default_handle)
+@_primexpr
+def _check_all_tensors(tensor_list):
+    """
+    Check that `tensor_list` is a list or tuple whose elements are all of type Tensor.
+    Raises:
+        TypeError: If `tensor_list` is not a list or tuple, or if any element is not a Tensor.
+    """
+    if not isinstance(tensor_list, (list, tuple)):
+        raise TypeError(f"Expected list or tuple, but got {type(tensor_list)}.")
+    for t in tensor_list:
+        if not isinstance(t, Tensor):
+            raise TypeError(f"Expected tensor, but got {type(t)}")
+@_primexpr
+def _check_all_tensors_or_tuple(tensor_list):
+    """
+    Check that `tensor_list` is a list or tuple whose elements are each a Tensor, tuple or list.
+    Nested tuples and lists are accepted as they are; their contents are not inspected.
+    Raises:
+        TypeError: If `tensor_list` is not a list or tuple, or if any element is not a Tensor, tuple
+            or list.
+    """
+    if not isinstance(tensor_list, (list, tuple)):
+        raise TypeError(f"Expected list or tuple, but got {type(tensor_list)}.")
+    for t in tensor_list:
+        if not isinstance(t, (Tensor, tuple, list)):
+            raise TypeError(f"Expected tensor or tuple, but got {type(t)}")
+@_primexpr
+def _check_all_tensor_same_dtype(*tensor_lists):
+    """
+    Check that every Tensor found in the given arguments has the same dtype.
+    Args:
+        tensor_lists: Any number of arguments. Each is either a list/tuple of items or a single item;
+            items that are not Tensor are ignored.
+    Raises:
+        TypeError: If two Tensors have different dtypes. The message always names ``all_to_all``,
+            whichever function performed the check.
+    """
+    # dtype of the first Tensor encountered; every later Tensor is compared against it.
+    consistent_dtype = None
+    for list_ in tensor_lists:
+        # Treat a bare item as a one-element list so both forms share the loop below.
+        if not isinstance(list_, (list, tuple)):
+            list_ = [list_]
+        for tensor_ in list_:
+            if not isinstance(tensor_, Tensor):
+                continue
+            dtype = tensor_.dtype
+            if consistent_dtype is None:
+                consistent_dtype = dtype
+            else:
+                if dtype != consistent_dtype:
+                    raise TypeError("all_to_all input dtype must be the same, "
+                                    f"but got {consistent_dtype} and {dtype}.")
+def _get_size(shape):
+    numel = 1
+    for s in shape:
+        numel *= s
+    return numel
+def _is_split_sizes_empty(split_sizes):
+    return split_sizes is None or not split_sizes
+class TCPStore:
+    """
+    A TCP-based distributed key-value store implementation.
+    Note:
+        - The function is implemented by CPU and does not involve any hardware operations related to Ascend.
+        - Only `host_name`, `port`, `world_size`, `is_master`, `timeout` and `wait_for_workers` take effect.
+          The remaining constructor parameters (`multi_tenant`, `master_listen_fd` and `use_libuv`) are
+          reserved, and setting them has no effect.
+        - The current TCPStore function is limited and only supports scenarios where the key is
+          less than 4k and the value is less than 1G. Complex scenarios are to be supported.
+    Args:
+        host_name (str): The hostname or IP Address the server store should run on.
+            Currently only supports user input IP addresses.
+        port (int): The port on which the server store should listen for incoming requests.
+            Must be in the range [0, 65535].
+        world_size (int, optional): The total number of store users (number of clients + 1 for the server).
+            Default is ``None``, indicates a non-fixed number of store users. This parameter is
+            only valid for the server.
+        is_master (bool, optional): True when initializing the server store and False for client stores.
+            Default is ``False``.
+        timeout (timedelta, optional): Timeout used by the store during initialization. Default is
+            ``timedelta(seconds=300)``.
+        wait_for_workers (bool, optional): Whether to wait for all the workers to connect with the server
+            store. This is only applicable when `world_size` is a fixed value. Default is ``True``. This
+            parameter is only valid for the server.
+        multi_tenant (bool, invalid, optional): If ``True``, all ``TCPStore`` instances in the current process with
+            the same host/port will use the same underlying ``TCPServer``. Default is ``False``.
+        master_listen_fd (int, invalid, optional): If specified, the underlying ``TCPServer`` will listen on this file
+            descriptor, which must be a socket already bound to ``port``. Useful to avoid port assignment races
+            in some scenarios. Default is ``None`` (meaning the server creates a new socket and attempts to bind it
+            to ``port``).
+        use_libuv (bool, invalid, optional): If True, use libuv for ``TCPServer`` backend. Default is ``True``.
+    Returns:
+        TCPStore Object.
+    Raises:
+        TypeError: If `host_name` is not a string, `port` or `world_size` is not an int, `is_master` or
+            `wait_for_workers` is not a bool, or `timeout` is not a timedelta.
+        ValueError: If `port` is not in the range [0, 65535], or `world_size` is not a positive integer.
+    Supported Platforms:
+        ``Ascend``
+    Examples:
+        .. note::
+            Before running the following examples, you need to configure the communication environment variables.
+            For Ascend devices, it is recommended to use the msrun startup method
+            without any third-party or configuration file dependencies.
+            Please see the `msrun start up
+            <https://www.mindspore.cn/tutorials/en/master/parallel/msrun_launcher.html>`_
+            for more details.
+        >>> from mindspore.ops.communication import TCPStore
+        >>> store = TCPStore("127.0.0.1", 1234)
+    """
+    @staticmethod
+    def _check_type(api_name, arg_name, arg_value, expected_type, type_desc):
+        """Raise the standard TypeError of `api_name` when `arg_value` is not an instance of `expected_type`."""
+        if not isinstance(arg_value, expected_type):
+            raise TypeError(
+                "For '{}', the argument '{}' must be type of {}, "
+                "but got '{}' type : {}.".format(api_name, arg_name, type_desc, arg_name, type(arg_value))
+            )
+    def __init__(self, host_name, port, world_size=None, is_master=False, timeout=timedelta(seconds=300),
+                 wait_for_workers=True, multi_tenant=False, master_listen_fd=None, use_libuv=True):
+        # `multi_tenant`, `master_listen_fd` and `use_libuv` are reserved: they are accepted but never read.
+        self._check_type("TCPStore", "host_name", host_name, str, "string")
+        self._check_type("TCPStore", "port", port, int, "int")
+        self._check_type("TCPStore", "is_master", is_master, bool, "bool")
+        self._check_type("TCPStore", "timeout", timeout, timedelta, "timedelta")
+        self._check_type("TCPStore", "wait_for_workers", wait_for_workers, bool, "bool")
+        if world_size is None:
+            # An unspecified world size is forwarded to the underlying client as 1.
+            world_size = 1
+        self._check_type("TCPStore", "world_size", world_size, int, "int")
+        if port < 0 or port > 65535:
+            raise ValueError(
+                "For 'TCPStore', the argument 'port' must be legal, "
+                f"but got {port}."
+            )
+        if world_size <= 0:
+            raise ValueError(
+                "For 'TCPStore', the argument 'world_size' must be legal, "
+                f"but got {world_size}."
+            )
+        # The underlying client takes the timeout as an integer number of milliseconds.
+        timeout_ms = int(timeout.total_seconds() * 1000)
+        self.instance = TCPStoreClient(host_name, port, is_master, timeout_ms, world_size, wait_for_workers)
+        self.host = host_name
+        self.port = port
+    def add(self, key, amount):
+        """
+        When the `add` function is called for the first time with a given key, it creates a counter in
+        the storage corresponding to that key, with the initial value set to `amount`. Subsequent calls
+        to `add` with the same key increment the counter by amount.
+        Args:
+            key (str): The key whose counter value will be incremented.
+            amount (int): The amount by which the counter will be incremented.
+        Returns:
+            int, value of counter with `key`.
+        Raises:
+            TypeError: If `key` is not string.
+            TypeError: If `amount` is not int.
+            RuntimeError: If the `add` and `set` pass the same `key` and the `value` passed by `set` cannot
+                be correctly converted to a numerical value, calling `add` will result in an error.
+        Supported Platforms:
+            ``Ascend``
+        Examples:
+            .. note::
+                Before running the following examples, you need to configure the communication environment variables.
+                For Ascend devices, it is recommended to use the msrun startup method
+                without any third-party or configuration file dependencies.
+                Please see the `msrun start up
+                <https://www.mindspore.cn/tutorials/en/master/parallel/msrun_launcher.html>`_
+                for more details.
+            >>> from mindspore.ops.communication import TCPStore
+            >>> store = TCPStore("127.0.0.1", 1234)
+            >>> store.add("first_key", 1)
+        """
+        self._check_type("TCPStore.add", "key", key, str, "string")
+        # Only int is accepted, so the message says "int" (it previously claimed "string or int").
+        self._check_type("TCPStore.add", "amount", amount, int, "int")
+        return self.instance.add(key, amount)
+    def set(self, key, value):
+        """
+        Inserts the key-value pair into the store based on the supplied `key` and
+        `value`. If `key` already exists in the store, it will overwrite the old
+        value with the new supplied `value`.
+        Args:
+            key (str): The key to be added to the store.
+            value (Union[bytes, str]): The value associated with `key` to be added to the store.
+        Raises:
+            TypeError: If `key` is not string.
+            TypeError: If `value` is not string or bytes.
+        Supported Platforms:
+            ``Ascend``
+        Examples:
+            .. note::
+                Before running the following examples, you need to configure the communication environment variables.
+                For Ascend devices, it is recommended to use the msrun startup method
+                without any third-party or configuration file dependencies.
+                Please see the `msrun start up
+                <https://www.mindspore.cn/tutorials/en/master/parallel/msrun_launcher.html>`_
+                for more details.
+            >>> from mindspore.ops.communication import TCPStore
+            >>> store = TCPStore("127.0.0.1", 1234)
+            >>> store.set("first_key", "first_value")
+        """
+        self._check_type("TCPStore.set", "key", key, str, "string")
+        self._check_type("TCPStore.set", "value", value, (str, bytes), "string or bytes")
+        return self.instance.set(key, value)
+    def get(self, key):
+        """
+        Retrieves the value associated with the given `key` in the store. If the `key` does not exist
+        in the storage, this function will wait for the `timeout` set by the class initialization and then
+        throw an exception.
+        Args:
+            key (str): The function will return the value associated with this key.
+        Returns:
+            bytes, Value associated with `key` if `key` is in the store.
+        Raises:
+            TypeError: If `key` is not string.
+            RuntimeError: If `get` runs out of time.
+        Supported Platforms:
+            ``Ascend``
+        Examples:
+            .. note::
+                Before running the following examples, you need to configure the communication environment variables.
+                For Ascend devices, it is recommended to use the msrun startup method
+                without any third-party or configuration file dependencies.
+                Please see the `msrun start up
+                <https://www.mindspore.cn/tutorials/en/master/parallel/msrun_launcher.html>`_
+                for more details.
+            >>> from mindspore.ops.communication import TCPStore
+            >>> store = TCPStore("127.0.0.1", 1234)
+            >>> store.set("first_key", "first_value")
+            >>> data = store.get("first_key")
+            >>> print(data)
+        """
+        self._check_type("TCPStore.get", "key", key, str, "string")
+        return self.instance.get(key)
+    def delete_key(self, key):
+        """
+        Deletes the key-value pair associated with `key` from the store.
+        Args:
+            key (str): The key to be deleted from the store.
+        Returns:
+            bool, ``True`` if `key` was deleted, otherwise ``False``.
+        Raises:
+            TypeError: If `key` is not string.
+        Supported Platforms:
+            ``Ascend``
+        Examples:
+            .. note::
+                Before running the following examples, you need to configure the communication environment variables.
+                For Ascend devices, it is recommended to use the msrun startup method
+                without any third-party or configuration file dependencies.
+                Please see the `msrun start up
+                <https://www.mindspore.cn/tutorials/en/master/parallel/msrun_launcher.html>`_
+                for more details.
+            >>> from mindspore.ops.communication import TCPStore
+            >>> store = TCPStore("127.0.0.1", 1234)
+            >>> store.set("first_key", "first_value")
+            >>> # This should return true
+            >>> store.delete_key("first_key")
+        """
+        self._check_type("TCPStore.delete_key", "key", key, str, "string")
+        return self.instance.delete_key(key)
+def is_available():
+    """
+    Report whether the distributed module can be used.
+    Note:
+        The result is always `True`, because MindSpore has distributed ability on all platforms.
+    Returns:
+        bool, ``True`` if the distributed module is available.
+    Supported Platforms:
+        ``Ascend``
+    Examples:
+        .. note::
+            Before running the following examples, you need to configure the communication environment variables.
+            For Ascend devices, it is recommended to use the msrun startup method
+            without any third-party or configuration file dependencies.
+            Please see the `msrun start up
+            <https://www.mindspore.cn/tutorials/en/master/parallel/msrun_launcher.html>`_
+            for more details.
+        >>> import mindspore as ms
+        >>> from mindspore.ops.communication import is_available
+        >>> ms.set_device(device_target="Ascend")
+        >>> is_available()
+        True
+    """
+    # Thin public wrapper: the answer comes straight from the internal helper.
+    available = _is_available()
+    return available
+def is_initialized():
+    """
+    Report whether the default process group has already been initialized.
+    Returns:
+        bool, ``True`` if the default process group has been initialized, otherwise ``False``.
+    Supported Platforms:
+        ``Ascend``
+    Examples:
+        .. note::
+            Before running the following examples, you need to configure the communication environment variables.
+            For Ascend devices, it is recommended to use the msrun startup method
+            without any third-party or configuration file dependencies.
+            Please see the `msrun start up
+            <https://www.mindspore.cn/tutorials/en/master/parallel/msrun_launcher.html>`_
+            for more details.
+        >>> import mindspore as ms
+        >>> from mindspore.ops.communication import init_process_group, is_initialized
+        >>> ms.set_device(device_target="Ascend")
+        >>> init_process_group()
+        >>> print(is_initialized())
+        True
+    """
+    # Thin public wrapper: the answer comes straight from the internal helper.
+    initialized = _is_initialized()
+    return initialized
+@args_type_check(init_method=str, timeout=timedelta, world_size=int, rank=int, store=TCPStore)
+def init_process_group(backend="hccl",
+                       init_method=None,
+                       timeout=None,
+                       world_size=-1,
+                       rank=-1,
+                       store=None,
+                       pg_options=None,
+                       device_id=None):
+    """
+    Init collective communication lib. And create a default collective communication group.
+    Note:
+        This method isn't supported in GPU and CPU versions of MindSpore.
+        In Ascend hardware platforms, this API should be set before the definition of any Tensor and Parameter,
+        and the instantiation and execution of any operation and net.
+    Args:
+        backend (str, optional): The backend to use. Default is ``"hccl"`` and now only support hccl.
+        init_method (str, optional): URL specifying how to init collective communication group. Default is ``None``.
+        timeout (timedelta, optional): Timeout for API executed. Default is ``None``, in which case
+            ``timedelta(seconds=300)`` is used. Currently, this parameter is only supported for host-side
+            cluster network configuration using `init_method` or `store`.
+        world_size (int, optional): Number of the processes participating in the job. Default is ``-1``.
+        rank (int, optional): Rank of the current process. Default is ``-1``.
+        store (Store, optional): An object that stores key/value data, facilitating the exchange of inter-process
+            communication addresses and connection information. Default is ``None``. Currently, only the
+            ``TCPStore`` type is supported.
+        pg_options (ProcessGroupOptions, invalid): process group options specifying what additional options need to be
+            passed in during the construction of specific process group. The provided parameter is a reserved
+            parameter, and the current setting does not take effect.
+        device_id (int, invalid): the device id to execute. The provided parameter is a reserved parameter,
+            and the current setting does not take effect.
+    Raises:
+        ValueError: If `backend` is not hccl.
+        ValueError: If `world_size` is neither -1 nor the size of the initialized process group.
+        ValueError: If both `init_method` and `store` are set.
+        ValueError: `world_size` is not correctly set as a positive integer value, when using the initialization
+            method `init_method` or `store`.
+        ValueError: `rank` is not correctly set as a non-negative integer, when using the initialization method
+            `init_method` or `store`.
+        RuntimeError: If device target is invalid, or backend is invalid, or distributed initialization fails,
+            or the environment variables RANK_ID/MINDSPORE_HCCL_CONFIG_PATH
+            have not been exported when backend is HCCL.
+    Supported Platforms:
+        ``Ascend``
+    Examples:
+        .. note::
+            Before running the following examples, you need to configure the communication environment variables.
+            For Ascend devices, it is recommended to use the msrun startup method
+            without any third-party or configuration file dependencies.
+            Please see the `msrun start up
+            <https://www.mindspore.cn/tutorials/en/master/parallel/msrun_launcher.html>`_
+            for more details.
+        >>> import mindspore as ms
+        >>> from mindspore.ops.communication import init_process_group, destroy_process_group
+        >>> ms.set_device(device_target="Ascend")
+        >>> init_process_group()
+        >>> destroy_process_group()
+    """
+    # Reserved parameters: warn so callers know the value is dropped.
+    if pg_options is not None:
+        logger.warning("pg_options is ignored, setting is invalid")
+    if device_id is not None:
+        logger.warning("device_id is ignored, setting is invalid")
+    if backend != "hccl":
+        raise ValueError(
+            "Only support hccl now, please setting backend to hccl or using default value"
+        )
+    if init_method is not None and store is not None:
+        raise ValueError(
+            "Only one of init_method and store is supported."
+        )
+    if init_method is not None or store is not None:
+        # Explicit rendezvous (URL or store): world_size and rank must be given by the caller.
+        if world_size <= 0:
+            raise ValueError(
+                "Specified world_size must be a positive integer."
+            )
+        if rank < 0:
+            raise ValueError(
+                "Specified rank must be a non-negative integer."
+            )
+        if timeout is None:
+            timeout = timedelta(seconds=300)
+        # The initializer takes the timeout as an integer number of milliseconds.
+        timeout_ms = int(timeout.total_seconds() * 1000)
+        _init_without_sched(backend, init_method, timeout_ms, world_size, rank, store)
+    else:
+        init(backend)
+    # The real group size is only known after initialization, so the cross-check has to happen here.
+    if world_size != -1:
+        group_size = get_group_size()
+        if world_size != group_size:
+            # One formatted string: passing the size as a second argument made the exception
+            # render as a tuple instead of a readable message.
+            raise ValueError(
+                f"world_size is wrong, please using default value or setting: {group_size}"
+            )
+def destroy_process_group(group=None):
+    """
+    Destroy the user collective communication group.
+    If group is None or "hccl_world_group", Destroy all group and release collective communication lib.
+    Note:
+        - This method isn't supported in GPU and CPU versions of MindSpore.
+        - This method should be used after :func:`mindspore.ops.communication.init_process_group`.
+    Args:
+        group (str, optional): The communication group to work on. Normally, the group should be created by
+            :func:`mindspore.ops.communication.new_group`. If ``None``, which means ``"hccl_world_group"`` in Ascend.
+            Default: ``None``.
+    Raises:
+        TypeError: If group is neither a string nor ``None``.
+        RuntimeError: If HCCL is not available or MindSpore is GPU/CPU version.
+    Supported Platforms:
+        ``Ascend``
+    Examples:
+        .. note::
+            Before running the following examples, you need to configure the communication environment variables.
+            For Ascend devices, it is recommended to use the msrun startup method
+            without any third-party or configuration file dependencies.
+            Please see the `msrun start up
+            <https://www.mindspore.cn/tutorials/en/master/parallel/msrun_launcher.html>`_
+            for more details.
+        >>> import mindspore as ms
+        >>> from mindspore.ops.communication import init_process_group, destroy_process_group
+        >>> ms.set_device(device_target="Ascend")
+        >>> init_process_group()
+        >>> destroy_process_group()
+    """
+    # Validate the type first so a non-string is rejected before it is compared with the world group name.
+    if group is not None and not isinstance(group, str):
+        raise TypeError(
+            "For 'destroy_process_group', the argument 'group' must be type of string or None, "
+            "but got 'group' type : {}.".format(type(group))
+        )
+    if group is None or group == GlobalComm.WORLD_COMM_GROUP:
+        # World group: synchronize, finalize the collective lib, then drop the cached group records.
+        _pynative_executor.sync()
+        _finalize_collective()
+        _ExistingGroup.ITEMS.clear()
+        _ExistingGroup.GROUP_RANKS.clear()
+    else:
+        _destroy_group_helper(group)
+def get_rank(group=None):
+    """
+    Return the rank ID of the current device inside the given collective communication group.
+    Note:
+        This method should be used after :func:`mindspore.ops.communication.init_process_group`.
+    Args:
+        group (str, optional): The communication group to work on. Normally, the group should be created by
+            :func:`mindspore.ops.communication.new_group`. If ``None``, which means ``"hccl_world_group"`` in Ascend.
+            Default: ``None``.
+    Returns:
+        int, the rank ID of the calling process within the group.
+        ``-1`` is returned (and the error is logged as a warning) if the rank cannot be obtained,
+        for example when the process is not part of the group.
+    Raises:
+        TypeError: If group is not a string.
+    Supported Platforms:
+        ``Ascend`` ``CPU``
+    Examples:
+        .. note::
+            Before running the following examples, you need to configure the communication environment variables.
+            For Ascend devices, it is recommended to use the msrun startup method
+            without any third-party or configuration file dependencies.
+            Please see the `msrun start up
+            <https://www.mindspore.cn/tutorials/en/master/parallel/msrun_launcher.html>`_
+            for more details.
+        >>> import mindspore as ms
+        >>> from mindspore.ops.communication import init_process_group, get_rank
+        >>> ms.set_device(device_target="Ascend")
+        >>> init_process_group()
+        >>> rank_id = get_rank()
+        >>> print(rank_id)
+        >>> # the result is the rank_id in world_group
+        #rank 0: 0
+        #rank 1: 1
+    """
+    # ``None`` selects the world group.
+    target_group = GlobalComm.WORLD_COMM_GROUP if group is None else group
+    if not isinstance(target_group, str):
+        raise TypeError(
+            "For 'get_rank', the argument 'group' must be type of string, "
+            "but got 'group' type : {}.".format(type(target_group))
+        )
+    try:
+        return _get_rank_helper(target_group)
+    except RuntimeError as err:
+        # A failed lookup is reported as -1 rather than propagated.
+        logger.warning(err)
+        return -1
+def get_world_size(group=None):
+    """
+    Return the number of ranks in the given collective communication group.
+    Note:
+        This method should be used after :func:`mindspore.ops.communication.init_process_group`.
+    Args:
+        group (str, optional): The communication group to work on. Normally, the group should be created by
+            :func:`mindspore.ops.communication.new_group`. If ``None``, which means ``"hccl_world_group"`` in Ascend.
+            Default: ``None``.
+    Returns:
+        int, the rank size of the group.
+        ``-1`` is returned (and the error is logged as a warning) if the group is not available.
+    Raises:
+        TypeError: If group is not a string.
+    Supported Platforms:
+        ``Ascend``
+    Examples:
+        .. note::
+            Before running the following examples, you need to configure the communication environment variables.
+            For Ascend devices, it is recommended to use the msrun startup method
+            without any third-party or configuration file dependencies.
+            Please see the `msrun start up
+            <https://www.mindspore.cn/tutorials/en/master/parallel/msrun_launcher.html>`_
+            for more details.
+            This example should be run with 8 devices.
+        >>> import mindspore as ms
+        >>> from mindspore.ops.communication import init_process_group, get_world_size
+        >>> ms.set_device(device_target="Ascend")
+        >>> init_process_group()
+        >>> group_size = get_world_size()
+        >>> print("group_size is: ", group_size)
+        group_size is: 8
+    """
+    # ``None`` selects the world group.
+    target_group = GlobalComm.WORLD_COMM_GROUP if group is None else group
+    if not isinstance(target_group, str):
+        raise TypeError(
+            "For 'get_world_size', the argument 'group' must be type of string, "
+            "but got 'group' type : {}.".format(type(target_group))
+        )
+    try:
+        return _get_size_helper(target_group)
+    except RuntimeError as err:
+        # A failed lookup is reported as -1 rather than propagated.
+        logger.warning(err)
+        return -1
+def new_group(ranks=None,
+              timeout=None,
+              backend=None,
+              pg_options=None,
+              use_local_synchronization=False,
+              group_desc=None):
+    """
+    Create a new distributed group.
+    Note:
+        This method should be used after :func:`mindspore.ops.communication.init_process_group`.
+    Args:
+        ranks (list[int], optional): List of ranks of group members. If ``None``,
+            the world group is returned. Default is ``None``.
+        timeout (int, invalid): Currently it is a reserved parameter.
+        backend (str, optional): Backend library to use, currently ``"hccl"`` or ``"mccl"``.
+            ``"hccl"`` selects the Huawei Collective Communication Library(HCCL) and
+            ``"mccl"`` selects the MindSpore Collective Communication Library(MCCL).
+            If ``None``, which means ``"hccl"`` in Ascend. Default is ``None``.
+        pg_options (GroupOptions, optional): Additional communication group configuration parameters.
+            The backend will automatically select supported parameters and apply them during group
+            initialization. i.e. for the ``HCCL`` backend, ``hccl_config`` can be specified so that
+            group initialization configurations can be applied. Default is ``None``.
+            `GroupOptions` is defined as a class that can be instantiated as a python object.
+            .. code-block::
+                GroupOptions {
+                    hccl_config(dict)
+                }
+            `hccl_config` currently only supports "hccl_buffer_size" or "hccl_comm".
+            - hccl_buffer_size (uint32): specifies the size of the HCCL communication buffer.
+            - hccl_comm (int64): specifies an existing HcclComm pointer. If "hccl_comm" is set,
+              "hccl_buffer_size" will be ignored.
+        use_local_synchronization (bool, invalid): Currently it is a reserved parameter.
+        group_desc (str, invalid): Currently it is a reserved parameter.
+    Returns:
+        A string with group name. Return "" in the abnormal scenarios, i.e. when creating the group
+        raises a RuntimeError (which is logged as a warning).
+    Raises:
+        TypeError: If `ranks` is neither ``None`` nor a list.
+        TypeError: If `backend` is not ``"hccl"`` or ``"mccl"``.
+        TypeError: If `pg_options` is neither ``None`` nor a `GroupOptions`.
+        TypeError: If list ranks in Group has duplicate rank id.
+    Supported Platforms:
+        ``Ascend`` ``CPU``
+    Examples:
+        .. note::
+            Before running the following examples, you need to configure the communication environment variables.
+            For Ascend devices, it is recommended to use the msrun startup method
+            without any third-party or configuration file dependencies.
+            Please see the `msrun start up
+            <https://www.mindspore.cn/tutorials/en/master/parallel/msrun_launcher.html>`_
+            for more details.
+        >>> import mindspore as ms
+        >>> from mindspore.ops.communication import init_process_group, new_group
+        >>> ms.set_device(device_target="Ascend")
+        >>> init_process_group()
+        >>> group = new_group()
+        >>> print("group is: ", group)
+        group is: hccl_world_group
+    """
+    # No explicit member list: the caller gets the world group back.
+    if ranks is None:
+        return GlobalComm.WORLD_COMM_GROUP
+    if not isinstance(ranks, list):
+        raise TypeError("ranks must be list, but got {}".format(type(ranks)))
+    # Sorting makes the derived group name independent of the order the caller listed the ranks in.
+    member_ranks = sorted(ranks)
+    backend_name = "hccl" if backend is None else backend
+    if not (isinstance(backend_name, str) and backend_name in ("hccl", "mccl")):
+        raise TypeError(f"the input backend must be hccl or mccl, but got {backend_name}")
+    # Group name layout: <backend>_<member count>_<sha1 of the "_"-joined sorted ranks>.
+    ranks_key = "_".join(map(str, member_ranks))
+    ranks_digest = hashlib.sha1(ranks_key.encode("utf-8")).hexdigest()
+    group_name = "_".join((backend_name, str(len(member_ranks)), ranks_digest))
+    if pg_options is not None and not isinstance(pg_options, GroupOptions):
+        raise TypeError("pg_options must be type GroupOptions, but got {}".format(type(pg_options)))
+    try:
+        create_group(group_name, member_ranks, pg_options)
+    except RuntimeError as err:
+        # Creation failures are reported through the empty-string return value.
+        logger.warning(err)
+        return ""
+    return group_name
+def get_backend(group=None):
+    """
+    Return the backend used by a communication process group.
+    Note:
+        Only one communication backend is supported by MindSpore for each process.
+        It should be one of `hccl`/`nccl`/`mccl`. Currently only support hccl and mccl.
+    Args:
+        group (str, optional): The communication group to work on.
+            Normally, the group should be created by :func:`mindspore.ops.communication.new_group`, If ``None``,
+            which means ``"hccl_world_group"`` in Ascend. Default: ``None``.
+    Returns:
+        string, the backend of the group.
+    Raises:
+        TypeError: If the `group` is not a str.
+    Supported Platforms:
+        ``Ascend`` ``CPU``
+    Examples:
+        .. note::
+            Before running the following examples, you need to configure the communication environment variables.
+            For Ascend devices, it is recommended to use the msrun startup method
+            without any third-party or configuration file dependencies.
+            Please see the `msrun start up
+            <https://www.mindspore.cn/tutorials/en/master/parallel/msrun_launcher.html>`_
+            for more details.
+        >>> import mindspore as ms
+        >>> from mindspore.ops.communication import init_process_group, get_backend
+        >>> ms.set_device(device_target="Ascend")
+        >>> init_process_group()
+        >>> backend = get_backend()
+        >>> print("backend is: ", backend)
+        backend is: hccl
+    """
+    if group is None:
+        return BACKEND_HCCL
+    if not isinstance(group, str):
+        raise TypeError(
+            "For 'get_backend', the argument 'group' must be type of string or None, "
+            "but got 'group' type : {}.".format(type(group))
+        )
+    # The backend is read off the group name when it contains a known backend tag; HCCL is tried first.
+    for backend_tag in (BACKEND_HCCL, BACKEND_MCCL):
+        if backend_tag in group:
+            return backend_tag
+    # No tag in the name: ask the runtime.
+    return _get_backend()
+def get_global_rank(group, group_rank):
+    """
+    A function that returns the rank id in the world group corresponding to the
+    rank which id is 'group_rank' in the user group.
+    Note:
+        This method should be used after :func:`mindspore.ops.communication.init_process_group`.
+    Args:
+        group (str): The communication group to work on. Normally, the group should
+            be created by :func:`mindspore.ops.communication.new_group`. If ``None``, which
+            means ``"hccl_world_group"`` in Ascend.
+        group_rank (int): Group rank to query.
+    Returns:
+        An integer scalar with the rank id in the world group. For the world group
+        (or ``None``) this is `group_rank` itself.
+    Raises:
+        TypeError: If the `group` is neither a str nor ``None``.
+        TypeError: If the `group_rank` is not an integer.
+        RuntimeError: If device target is invalid, or backend is invalid, or distributed initialization fails.
+    Supported Platforms:
+        ``Ascend``
+    Examples:
+        .. note::
+            Before running the following examples, you need to configure the communication environment variables.
+            For Ascend devices, it is recommended to use the msrun startup method
+            without any third-party or configuration file dependencies.
+            Please see the `msrun start up
+            <https://www.mindspore.cn/tutorials/en/master/parallel/msrun_launcher.html>`_
+            for more details.
+            This example should be run with 8 devices.
+        >>> import mindspore as ms
+        >>> from mindspore.ops.communication import init_process_group, get_global_rank, new_group, get_rank
+        >>> ms.set_device(device_target="Ascend")
+        >>> # Launch 8 processes.
+        >>> init_process_group()
+        >>> rank_ids = [0,4]
+        >>> if get_rank() in rank_ids:
+        ...     group = new_group(rank_ids)
+        ...     world_rank_id = get_global_rank(group, 1)
+        ...     print("world_rank_id is: ", world_rank_id)
+        #rank 0 and 4:
+        world_rank_id is: 4
+    """
+    if not isinstance(group_rank, int):
+        raise TypeError(
+            f"The group_rank argument must be integer, but got {type(group_rank)}."
+        )
+    # ``None`` stands for the world group, where group rank and world rank coincide.
+    if group is None:
+        return group_rank
+    if not isinstance(group, str):
+        raise TypeError(
+            "For 'get_global_rank', the argument 'group' must be type of string or None, "
+            "but got 'group' type : {}.".format(type(group))
+        )
+    # Compare by value: an identity test (`is`) misses an equal world-group name held in a different str object.
+    if group == GlobalComm.WORLD_COMM_GROUP:
+        return group_rank
+    return get_world_rank_from_group_rank(group, group_rank)
+def get_group_rank(group, global_rank):
+    """
+    Get the rank ID in the specified user communication group corresponding to
+    the rank ID in the world communication group.
+    Note:
+        This method should be used after :func:`mindspore.ops.communication.init_process_group`.
+    Args:
+        group (str): The communication group to work on. Normally, the group should be
+            created by :func:`mindspore.ops.communication.new_group`. If ``None``, which means
+            ``"hccl_world_group"`` in Ascend.
+        global_rank (int): A rank ID in the world communication group.
+    Returns:
+        int, the rank ID in the user communication group.
+    Raises:
+        TypeError: If global_rank is not an integer or the group is not a string.
+        RuntimeError: If device target is invalid, or backend is invalid, or distributed initialization fails.
+    Supported Platforms:
+        ``Ascend``
+    Examples:
+        .. note::
+            Before running the following examples, you need to configure the communication environment variables.
+            For Ascend devices, it is recommended to use the msrun startup method
+            without any third-party or configuration file dependencies.
+            Please see the `msrun start up
+            <https://www.mindspore.cn/tutorials/en/master/parallel/msrun_launcher.html>`_
+            for more details.
+            This example should be run with 8 devices.
+        >>> import mindspore as ms
+        >>> from mindspore.ops.communication import init_process_group, new_group, get_group_rank, get_rank
+        >>> ms.set_device(device_target="Ascend")
+        >>> # Launch 8 processes.
+        >>> init_process_group()
+        >>> rank_ids = [0,4]
+        >>> if get_rank() in rank_ids:
+        ...     group = new_group(rank_ids)
+        ...     group_rank_id = get_group_rank(group, 4)
+        ...     print("group_rank_id is: ", group_rank_id)
+        #rank 0 and 4:
+        group_rank_id is: 1
+    """
+    if not isinstance(global_rank, int):
+        raise TypeError(
+            f"The global_rank argument must be integer, but got {type(global_rank)}."
+        )
+    if group is None:
+        group = GlobalComm.WORLD_COMM_GROUP
+    if not isinstance(group, str):
+        raise TypeError(
+            "For 'get_group_rank_from_world_rank', the argument 'group' must be type of string, "
+            "but got 'group' type : {}.".format(type(group))
+        )
+    return _get_group_rank_from_world_rank_from_cache_helper(
+        world_rank_id=global_rank, group=group
+    )
+def get_process_group_ranks(group=None):
+    """
+    Gets the ranks of the specific group and returns the process ranks in the communication group as a list.
+    Args:
+        group (str, optional): The communication group to work on. Normally, the group should be created by
+            :func:`mindspore.ops.communication.new_group`. If ``None``, which means ``"hccl_world_group"`` in Ascend.
+            Default: ``None``.
+    Returns:
+        List (List[int]), List of process ranks in the specified communication group.
+    Raises:
+        TypeError: If the `group` is not a str.
+        RuntimeError: If device target is invalid, or backend is invalid, or distributed initialization fails.
+    Supported Platforms:
+        ``Ascend`` ``CPU``
+    Examples:
+        .. note::
+            Before running the following examples, you need to configure the communication environment variables.
+            For Ascend devices, it is recommended to use the msrun startup method
+            without any third-party or configuration file dependencies.
+            Please see the `msrun start up
+            <https://www.mindspore.cn/tutorials/en/master/parallel/msrun_launcher.html>`_
+            for more details.
+            This example should be run with 4 devices.
+        >>> import mindspore as ms
+        >>> from mindspore.ops.communication import init_process_group, get_process_group_ranks
+        >>> # Launch 4 processes.
+        >>> ms.set_device(device_target="Ascend")
+        >>> init_process_group()
+        >>> output = get_process_group_ranks()
+        >>> print(output)
+        [0, 1, 2, 3]
+    """
+    if group is None:
+        group = GlobalComm.WORLD_COMM_GROUP
+    if not isinstance(group, str):
+        raise TypeError(
+            "For 'get_process_group_ranks', the argument 'group' must be type of string or None, "
+            "but got 'group' type : {}.".format(type(group))
+        )
+    return _get_group_ranks(group)
+@_primexpr
+def _check_all_tensor_same_dtype_and_shape(*tensor_lists):
+    """check all the input tensor has same dtype and shape"""
+    consistent_dtype = None
+    consistent_shape = None
+    for list_ in tensor_lists:
+        if not isinstance(list_, (list, tuple)):
+            list_ = [list_]
+        for tensor_ in list_:
+            if not isinstance(tensor_, Tensor):
+                continue
+            dtype = tensor_.dtype
+            shape = tensor_.shape
+            if consistent_dtype is None:
+                consistent_dtype = dtype
+                consistent_shape = shape
+            else:
+                if dtype != consistent_dtype:
+                    raise TypeError(
+                        "tensor_lists dtype must be the same, "
+                        f"but got {consistent_dtype} and {dtype}."
+                    )
+                if shape != consistent_shape:
+                    raise TypeError(
+                        "tensor_lists shape must be the same, "
+                        f"but got {consistent_shape} and {shape}."
+                    )
+@_primexpr
+def _check_output_shape(output, expected_shape, op_name):
+    if output.shape != expected_shape:
+        raise TypeError(
+            f"For {op_name}, the output shape should be {expected_shape}, "
+            f"but got {output.shape}.")
+@_primexpr
+def _check_output_dtype(output, expected_dtype, op_name):
+    if output.dtype != expected_dtype:
+        raise TypeError(
+            f"For {op_name}, the output dtype should be {expected_dtype}, "
+            f"but got {output.dtype}.")
+def all_reduce(tensor, op=ReduceOp.SUM, group=None, async_op=False):
+    """
+    Reduce tensors across all devices in such a way that all deviceswill get the same final result,
+    returns the tensor which is all reduced.
+    Note:
+        The tensors must have the same shape and format in all processes of the collection.
+    Args:
+        tensor (Tensor): The input tensor of collective. The shape of tensor is :math:`(x_1, x_2, ..., x_R)`.
+            If the function operates in-place, this also means output of collective.
+        op (str, optional): Specifies an operation used for element-wise reductions, like sum, prod, max, and min.
+            Default: ``ReduceOp.SUM`` .
+        group (str, optional): The communication group to work on. If ``None``, which means ``"hccl_world_group"`` in
+            Ascend. Default: ``None``.
+        async_op (bool, optional): Whether this operator should be an async operator. Default: ``False`` .
+    Returns:
+        - CommHandle, if the function operates in-place, return it. CommHandle is an async work handle,
+            if `async_op` is set to True. CommHandle will be None, when `async_op` is False.
+        - Tuple(Tensor, CommHandle), if the function operates non in-place, return it.
+            the output tensor has the same shape of the input, i.e., :math:`(x_1, x_2, ..., x_R)`.
+            The contents depend on the specified operation. CommHandle is an async work handle,
+            if `async_op` is set to True. CommHandle will be None, when `async_op` is False.
+    Raises:
+        TypeError: If the type of the first input parameter is not Tensor, or any of `op` and `group` is not a str,
+                   `op` range is illegal or async_op is not bool.
+        RuntimeError: If device target is invalid, or backend is invalid, or distributed initialization fails.
+    Supported Platforms:
+        ``Ascend`` ``CPU``
+    Examples:
+        .. note::
+            Before running the following examples, you need to configure the communication environment variables.
+            For Ascend devices, it is recommended to use the msrun startup method
+            without any third-party or configuration file dependencies.
+            Please see the `msrun start up
+            <https://www.mindspore.cn/tutorials/en/master/parallel/msrun_launcher.html>`_
+            for more details.
+            This example should be run with 2 devices.
+        >>> import numpy as np
+        >>> from mindspore.ops.communication import init_process_group
+        >>> from mindspore.ops.communication import all_reduce
+        >>> from mindspore import Tensor
+        >>>
+        >>> init_process_group()
+        >>> tensor = Tensor(np.ones([2, 8]).astype(np.float32))
+        >>> output = all_reduce(tensor)
+        >>> print(tensor)
+        [[2. 2. 2. 2. 2. 2. 2. 2.]
+         [2. 2. 2. 2. 2. 2. 2. 2.]]
+    """
+    if not isinstance(tensor, (Tensor, Tensor_)):
+        raise TypeError("For all_reduce, the input tensor must be tensor")
+    if not isinstance(op, str):
+        raise TypeError("For all_reduce, the input op type must be str")
+    if op not in ("sum", "prod", "min", "max"):
+        raise TypeError(
+            "For all_reduce, the input op value must be one of sum, prod, min, max"
+        )
+    if group is None:
+        group = GlobalComm.WORLD_COMM_GROUP
+    if not isinstance(group, str):
+        raise TypeError(
+            "The argument 'group' must be type of string, "
+            "but got 'group' type : {}.".format(type(group))
+        )
+    if not isinstance(async_op, bool):
+        raise TypeError(
+            f"The argument 'async_op' must be a bool, but got {type(async_op)}."
+        )
+    if is_inplace_func() is True:
+        output = dist_comm_all_reduce_op(tensor, op, group)
+        _, handle = _deal_comm_outputs(output, async_op)
+        return handle
+    out = inner_comm_all_reduce_op(tensor, op, group)
+    return _deal_comm_outputs(out, async_op)
+def all_gather_into_tensor(output_tensor, input_tensor, group=None, async_op=False):
+    """
+    Gathers tensors from the specified communication group and returns the tensor which is all gathered.
+    Note:
+        The tensors must have the same shape and format in all processes of the collection.
+    Args:
+        output_tensor (Tensor): The output tensor to be all gathered into tensor.If the number of devices
+            in the group is N, then the shape of output tensor is :math:`(N*x_1, x_2, ..., x_R)`.
+            If the function operates non in-place, This parameter is invalid.
+        input_tensor (Tensor): The input tensor to be all gathered into tensor.
+            The shape of tensor is :math:`(x_1, x_2, ..., x_R)`.
+        group (str, optional): The communication group to work on. If ``None``, which means ``"hccl_world_group"`` in
+            Ascend. Default: ``None``.
+        async_op (bool, optional): Whether this operator should be an async operator. Default: ``False`` .
+    Returns:
+        - CommHandle, if the function operates in-place, return it. CommHandle is an async work handle,
+            if `async_op` is set to True. CommHandle will be None, when `async_op` is False.
+        - Tuple(Tensor, CommHandle), if the function operates non in-place, if the number of devices in the group is N,
+            then the shape of output tensor is :math:`(N, x_1, x_2, ..., x_R)`.
+            CommHandle is an async work handle, if `async_op` is set to True.
+            CommHandle will be None, when `async_op` is False.
+    Raises:
+        TypeError: If the type of the input_tensor or output_tensor parameter is not Tensor,
+            `group` is not a str, or async_op is not bool.
+        RuntimeError: If device target is invalid, or backend is invalid, or distributed initialization fails.
+    Supported Platforms:
+        ``Ascend``
+    Examples:
+        .. note::
+            Before running the following examples, you need to configure the communication environment variables.
+            For Ascend devices, it is recommended to use the msrun startup method
+            without any third-party or configuration file dependencies.
+            Please see the `msrun start up
+            <https://www.mindspore.cn/tutorials/en/master/parallel/msrun_launcher.html>`_
+            for more details.
+            This example should be run with 2 devices.
+        >>> import numpy as np
+        >>> import mindspore as ms
+        >>> from mindspore import ops
+        >>> from mindspore.ops.communication import init_process_group
+        >>> from mindspore.ops.communication import all_gather_into_tensor
+        >>> from mindspore import Tensor
+        >>>
+        >>> ms.set_device(device_target="Ascend")
+        >>> init_process_group()
+        >>> input_tensor = Tensor(np.ones([2, 8]).astype(np.float32))
+        >>> out_tensor = Tensor(np.zeros([4, 8]).astype(np.float32))
+        >>> output = all_gather_into_tensor(out_tensor, input_tensor)
+        >>> print(out_tensor)
+        [[1. 1. 1. 1. 1. 1. 1. 1.]
+         [1. 1. 1. 1. 1. 1. 1. 1.]
+         [1. 1. 1. 1. 1. 1. 1. 1.]
+         [1. 1. 1. 1. 1. 1. 1. 1.]]
+    """
+    if not isinstance(input_tensor, (Tensor, Tensor_)):
+        raise TypeError("For all_gather_into_tensor, the input tensor must be tensor")
+    if is_inplace_func() is True and \
+       not isinstance(output_tensor, (Tensor, Tensor_)):
+        raise TypeError("For all_gather_into_tensor, the output tensor must be tensor")
+    if group is None:
+        group = GlobalComm.WORLD_COMM_GROUP
+    if not isinstance(group, str):
+        raise TypeError(
+            "The argument 'group' must be type of string, "
+            "but got 'group' type : {}.".format(type(group))
+        )
+    if not isinstance(async_op, bool):
+        raise TypeError(
+            f"The argument 'async_op' must be a bool, but got {type(async_op)}."
+        )
+    group_size = get_cache_group_size(group)
+    if is_inplace_func() is True:
+        output = dist_comm_all_gather_into_tensor_op(
+            output_tensor, input_tensor, group_size, group
+        )
+        _, handle = _deal_comm_outputs(output, async_op)
+        return handle
+    output = inner_comm_all_gather_op(input_tensor, group_size, group)
+    return _deal_comm_outputs(output, async_op)
+def all_gather_into_tensor_uneven(output, input, output_split_sizes=None, group=None, async_op=False):
+    r"""
+    Gathers and concatenates tensors across devices with uneven first dimensions.
+    Note:
+        - Input tensors must have identical shapes except for the first dimension.
+        - Output tensor's first dimension should equal to the sum of all devices' input first dimensions.
+    Args:
+        output (Tensor): Concatenated output tensor with shape :math:`(\sum_{i=0}^{N-1} x_{i1}, x_2, ..., x_R)`,
+            where N is the number of devices in the group.
+        input (Tensor): Local input tensor with shape :math:`(x_{k1}, x_2, ..., x_R)`, where k is current device's rank.
+        output_split_sizes (list[int], optional): Specifies first dimension sizes from each device.
+            Must match actual input dimensions when provided.
+            If ``None``, assumes equal split sizes across devices. Default: ``None``.
+        group (str, optional): The communication group to work on. If ``None``,
+            which means ``"hccl_world_group"`` in Ascend. Default: ``None``.
+        async_op (bool, optional): Whether this operator should be an async operator. Default: ``False``.
+    Returns:
+        CommHandle, CommHandle is an async work handle, if `async_op` is set to True.
+        CommHandle will be None, when `async_op` is False.
+    Raises:
+        ValueError: If the shape of `input` does not match the constraints of `output_split_sizes`.
+        RuntimeError: If device target is invalid, or backend is invalid, or distributed initialization fails.
+    Supported Platforms:
+        ``Ascend``
+    Examples:
+        .. note::
+            Before running the following examples, you need to configure the communication environment variables.
+            For Ascend devices, it is recommended to use the msrun startup method
+            without any third-party or configuration file dependencies.
+            Please see the `msrun start up
+            <https://www.mindspore.cn/tutorials/en/master/parallel/msrun_launcher.html>`_
+            for more details.
+            This example should be run with 2 devices.
+        >>> import numpy as np
+        >>> import mindspore as ms
+        >>> from mindspore import ops
+        >>> from mindspore.ops.communication import init_process_group, get_rank
+        >>> from mindspore.ops.communication import all_gather_into_tensor_uneven
+        >>> from mindspore import Tensor
+        >>>
+        >>> ms.set_device(device_target="Ascend")
+        >>> init_process_group()
+        >>> if get_rank() == 0:
+        >>>     input_tensor = Tensor(np.ones([3, 4]).astype(np.float32))
+        >>> else:
+        >>>     input_tensor = Tensor(np.ones([2, 4]).astype(np.float32))
+        >>> out_tensor = Tensor(np.zeros([5, 4]).astype(np.float32))
+        >>> output_split_sizes = [3, 2]
+        >>> output = all_gather_into_tensor_uneven(out_tensor, input_tensor, output_split_sizes)
+        >>> print(out_tensor)
+        [[1. 1. 1. 1.]
+         [1. 1. 1. 1.]
+         [1. 1. 1. 1.]
+         [1. 1. 1. 1.]
+         [1. 1. 1. 1.]]
+    """
+    if is_inplace_func() is False:
+        raise ValueError("Non-inplace mode is currently not supported.")
+    if group is None:
+        group = GlobalComm.WORLD_COMM_GROUP
+    if not isinstance(group, str):
+        raise TypeError(
+            "The argument 'group' must be type of string, "
+            "but got 'group' type : {}.".format(type(group))
+        )
+    if not isinstance(async_op, bool):
+        raise TypeError(
+            f"The argument 'async_op' must be a bool, but got {type(async_op)}."
+        )
+    group_size = get_cache_group_size(group)
+    output_split_sizes = [] if output_split_sizes is None else output_split_sizes
+    result = dist_comm_all_gather_into_tensor_uneven_op(
+        output, input, output_split_sizes, group_size, group
+    )
+    _, handle = _deal_comm_outputs(result, async_op)
+    return handle
+def reduce_scatter_tensor(output, input, op=ReduceOp.SUM, group=None, async_op=False):
+    r"""
+    Reduces and scatters tensors from the specified communication group and
+    returns the tensor which is reduced and scattered.
+    Note:
+        The tensors must have the same shape and format in all processes of the collection.
+    Args:
+        output(Tensor): the output tensor has the same dtype as `input_x` with a shape of :math:`(N/rank\_size, *)`.
+            If the function operates non in-place, This parameter is invalid.
+        input(Tensor): The input tensor to be reduced and scattered, suppose it has a shape :math:`(N, *)`, where `*`
+            means any number of additional dimensions. N must be divisible by rank_size.
+            rank_size refers to the number of cards in the communication group.
+        op (str, optional): Specifies an operation used for element-wise reductions,
+            like SUM and MAX. Default: ``ReduceOp.SUM`` .
+        group (str, optional): The communication group to work on. If ``None``, which means ``"hccl_world_group"`` in
+            Ascend. Default: ``None``.
+        async_op (bool, optional): Whether this operator should be an async operator. Default: ``False`` .
+    Returns:
+        - CommHandle, if the function operates in-place, CommHandle is an async work handle,
+            if `async_op` is set to True. CommHandle will be None, when `async_op` is False.
+        - Tuple(Tensor, CommHandle), if the function operates non in-place, return it.
+            the output tensor has the same dtype as `input_x` with a shape of
+            :math:`(N/rank\_size, *)`. CommHandle is an async work handle, if `async_op` is set to True.
+            CommHandle will be None, when `async_op` is False.
+    Raises:
+        TypeError: If the type of the input and output parameter is not Tensor, any of `op` and `group` is not a str.
+            async_op is not bool or 'op' is invalid.
+        ValueError: If the first dimension of the input cannot be divided by the rank_size.
+        RuntimeError: If device target is invalid, or backend is invalid, or distributed initialization fails.
+    Supported Platforms:
+        ``Ascend``
+    Examples:
+        .. note::
+            Before running the following examples, you need to configure the communication environment variables.
+            For Ascend devices, it is recommended to use the msrun startup method
+            without any third-party or configuration file dependencies.
+            Please see the `msrun start up
+            <https://www.mindspore.cn/tutorials/en/master/parallel/msrun_launcher.html>`_
+            for more details.
+            This example should be run with 2 devices.
+        >>> import mindspore as ms
+        >>> from mindspore import Tensor
+        >>> from mindspore.ops.communication import init_process_group
+        >>> from mindspore.ops.communication import reduce_scatter_tensor
+        >>> import numpy as np
+        >>>
+        >>> ms.set_device(device_target="Ascend")
+        >>> init_process_group()
+        >>> input_tensor = Tensor(np.ones([8, 8]).astype(np.float32))
+        >>> output_tensor = Tensor(np.ones([4, 8]).astype(np.float32))
+        >>> output = reduce_scatter_tensor(output_tensor ,input_tensor)
+        >>> print(output_tensor)
+        [[2. 2. 2. 2. 2. 2. 2. 2.]
+         [2. 2. 2. 2. 2. 2. 2. 2.]
+         [2. 2. 2. 2. 2. 2. 2. 2.]
+         [2. 2. 2. 2. 2. 2. 2. 2.]]
+    """
+    if not isinstance(input, (Tensor, Tensor_)):
+        raise TypeError("For reduce_scatter_tensor, the input tensor must be tensor")
+    if is_inplace_func() is True and \
+       not isinstance(output, (Tensor, Tensor_)):
+        raise TypeError("For reduce_scatter_tensor, the output tensor must be tensor")
+    if not isinstance(op, str):
+        raise TypeError("For reduce_scatter_tensor, the input op type must be str")
+    if op not in ("sum", "prod", "min", "max"):
+        raise TypeError(
+            "For reduce_scatter_tensor, the input op value must be one of sum, prod, min, max"
+        )
+    if group is None:
+        group = GlobalComm.WORLD_COMM_GROUP
+    if not isinstance(group, str):
+        raise TypeError(
+            "The argument 'group' must be type of string, "
+            "but got 'group' type : {}.".format(type(group))
+        )
+    if not isinstance(async_op, bool):
+        raise TypeError(
+            f"The argument 'async_op' must be a bool, but got {type(async_op)}."
+        )
+    rank_size = get_cache_group_size(group)
+    if is_inplace_func() is True:
+        out = dist_comm_reduce_scatter_tensor_op(output, input, rank_size, op, group)
+        _, handle = _deal_comm_outputs(out, async_op)
+        return handle
+    out = inner_comm_reduce_scatter_op(input, rank_size, op, group)
+    return _deal_comm_outputs(out, async_op)
+def reduce_scatter_tensor_uneven(output, input, input_split_sizes=None, op=ReduceOp.SUM, group=None, async_op=False):
+    r"""
+    Reduce tensors from the specified communication group and scatter to the output tensor
+    according to `input_split_sizes`.
+    Note:
+        - The input tensor must have identical shape and format across all processes.
+        - The first dimension of input tensor should equal to the sum of `input_split_sizes`.
+    Args:
+        output(Tensor): the output tensor has the same dtype as `input` with a shape of
+            :math:`(input\_split\_sizes[rank], *)`, where rank is the local rank id of the device.
+        input(Tensor): The input tensor to be reduced and scattered, Expected shape :math:`(N, *)`, where `*`
+            means any number of additional dimensions. N must equal the sum of `input_split_sizes` across ranks.
+        input_split_sizes (list[int], optional): List specifying how to split the first dimension of input tensor.
+            If ``None``, splits evenly according to group size. Default: ``None``.
+        op (str, optional): Specifies an operation used for element-wise reductions,
+            One of ReduceOp: 'SUM', 'MIN', 'MAX'. Default: ``ReduceOp.SUM``.
+        group (str, optional): The communication group to work on. If ``None``, which means ``"hccl_world_group"`` in
+            Ascend. Default: ``None``.
+        async_op (bool, optional): Whether this operator should be an async operator. Default: ``False``.
+    Returns:
+        CommHandle, CommHandle is an async work handle, if `async_op` is set to True.
+        CommHandle will be None, when `async_op` is False.
+    Raises:
+        ValueError: If the shape of `output` does not match the constraints of `input_split_sizes`.
+        RuntimeError: If device target is invalid, or backend is invalid, or distributed initialization fails.
+    Supported Platforms:
+        ``Ascend``
+    Examples:
+        .. note::
+            Before running the following examples, you need to configure the communication environment variables.
+            For Ascend devices, it is recommended to use the msrun startup method
+            without any third-party or configuration file dependencies.
+            Please see the `msrun start up
+            <https://www.mindspore.cn/tutorials/en/master/parallel/msrun_launcher.html>`_
+            for more details.
+            This example should be run with 2 devices.
+        >>> import mindspore as ms
+        >>> from mindspore import Tensor
+        >>> from mindspore.ops.communication import init_process_group, get_rank
+        >>> from mindspore.ops.communication import reduce_scatter_tensor_uneven
+        >>> import numpy as np
+        >>>
+        >>> ms.set_device(device_target="Ascend")
+        >>> init_process_group()
+        >>> input_tensor = Tensor(np.ones([5, 8]).astype(np.float32))
+        >>> if get_rank() == 0:
+        >>>     output_tensor = Tensor(np.ones([2, 8]).astype(np.float32))
+        >>> else:
+        >>>     output_tensor = Tensor(np.ones([3, 8]).astype(np.float32))
+        >>> input_split_sizes = [2, 3]
+        >>> output = reduce_scatter_tensor_uneven(output_tensor, input_tensor, input_split_sizes)
+        >>> print(output_tensor)
+        rank 0:
+        [[2. 2. 2. 2. 2. 2. 2. 2.]
+         [2. 2. 2. 2. 2. 2. 2. 2.]]
+        rank 1:
+        [[2. 2. 2. 2. 2. 2. 2. 2.]
+         [2. 2. 2. 2. 2. 2. 2. 2.]
+         [2. 2. 2. 2. 2. 2. 2. 2.]]
+    """
+    if is_inplace_func() is False:
+        raise ValueError("Non-inplace mode is currently not supported.")
+    if not isinstance(op, str):
+        raise TypeError("For reduce_scatter_tensor_uneven, the input op type must be str")
+    if op not in ("sum", "min", "max"):
+        raise TypeError(
+            "For reduce_scatter_tensor_uneven, the input op value must be one of sum, prod, min, max"
+        )
+    if group is None:
+        group = GlobalComm.WORLD_COMM_GROUP
+    if not isinstance(group, str):
+        raise TypeError(
+            "The argument 'group' must be type of string, "
+            "but got 'group' type : {}.".format(type(group))
+        )
+    if not isinstance(async_op, bool):
+        raise TypeError(
+            f"The argument 'async_op' must be a bool, but got {type(async_op)}."
+        )
+    input_split_sizes = [] if input_split_sizes is None else input_split_sizes
+    rank_size = get_cache_group_size(group)
+    result = dist_comm_reduce_scatter_tensor_uneven_op(
+        output, input, input_split_sizes, rank_size, op, group
+    )
+    _, handle = _deal_comm_outputs(result, async_op)
+    return handle
+def reduce(tensor, dst, op=ReduceOp.SUM, group=None, async_op=False):
+    """
+    Reduces tensors across the processes in the specified communication group, sends the result
+    to the target dst(global rank), and returns the tensor which is sent to the target process.
+    Note:
+        - Only process with destination rank receives the reduced output.
+        - Only support PyNative mode, Graph mode is not currently supported.
+        - Other processes only get a tensor with shape [1], which has no mathematical meaning.
+    Args:
+        tensor (Tensor): Input and output of the collective. The function operates in-place.
+        dst (int): The target rank of the process(global rank) that receives the reduced output.
+        op (str, optional): Specifies an operation used for element-wise reductions, like sum, prod, max, and min.
+            Default: ``ReduceOp.SUM`` .
+        group (str, optional): The communication group to work on. If ``None``, which means ``"hccl_world_group"`` in
+            Ascend. Default: ``None``.
+        async_op (bool, optional): Whether this operator should be an async operator. Default: ``False`` .
+    Returns:
+        CommHandle, CommHandle is an async work handle, if `async_op` is set to ``True``.
+        CommHandle will be None, when `async_op` is ``False``.
+    Raises:
+        TypeError: If the type of `tensor` is not Tensor, any of `op` and `group` is not a str.
+            async_op is not bool or 'op' is invalid.
+        RuntimeError: If device target is invalid, or backend is invalid, or distributed initialization fails.
+    Supported Platforms:
+        ``Ascend``
+    Examples:
+        .. note::
+            Before running the following examples, you need to configure the communication environment variables.
+            For Ascend devices, it is recommended to use the msrun startup method
+            without any third-party or configuration file dependencies.
+            Please see the `msrun start up
+            <https://www.mindspore.cn/tutorials/en/master/parallel/msrun_launcher.html>`_
+            for more details.
+            This example should be run with 4 devices.
+        >>> from mindspore import ops
+        >>> import mindspore.nn as nn
+        >>> from mindspore.ops.communication import init_process_group, reduce
+        >>> from mindspore import Tensor
+        >>> import numpy as np
+        >>> # Launch 2 processes.
+        >>> init_process_group()
+        >>> dest_rank=1
+        >>> input_tensor = Tensor(np.ones([2, 8]).astype(np.float32))
+        >>> output = reduce(input_tensor, dest_rank)
+        >>> print(input_tensor)
+        Process with rank 0: [[1. 1. 1. 1. 1. 1. 1. 1.]
+                             [1. 1. 1. 1. 1. 1. 1. 1.]],
+        Process with rank 1: [[2. 2. 2. 2. 2. 2. 2. 2.]
+                             [2. 2. 2. 2. 2. 2. 2. 2.]],
+    """
+    if is_inplace_func() is False:
+        raise ValueError("Non-inplace mode is currently not supported.")
+    if not isinstance(tensor, (Tensor, Tensor_)):
+        raise TypeError("For reduce, the input tensor must be tensor")
+    if not isinstance(dst, int):
+        raise TypeError("For reduce, the dst must be int")
+    if not isinstance(op, str):
+        raise TypeError("For reduce, the input op type must be str")
+    if op not in ("sum", "prod", "min", "max"):
+        raise TypeError(
+            "For reduce, the input op value must be one of sum, prod, min, max"
+        )
+    if group is None:
+        group = GlobalComm.WORLD_COMM_GROUP
+    if not isinstance(group, str):
+        raise TypeError(
+            "The argument 'group' must be type of string, "
+            "but got 'group' type : {}.".format(type(group))
+        )
+    if not isinstance(async_op, bool):
+        raise TypeError(
+            f"The argument 'async_op' must be a bool, but got {type(async_op)}."
+        )
+    result = dist_comm_reduce_op(tensor, op, dst, group)
+    _, handle = _deal_comm_outputs(result, async_op)
+    return handle
+class P2POp:
+    """
+    Object for `batch_isend_irecv` input, to store information of ``"isend"`` and ``"irecv"``.
+    Note:
+        `tensor` will be modified in-place by final result when `op` is ``"irecv"``.
+    Args:
+        op(Union[str, function]): Only string of ``"isend"`` and ``"irecv"`` are allowed.
+            Or function of ``ops.isend`` and ``ops.irecv`` are allowed.
+        tensor(Tensor): tensor for sending/receiving.
+        peer(int): remote global rank for send/receive.
+        group (str, optional): The communication group to work on. If ``None``, which means ``"hccl_world_group"`` in
+            Ascend. Default: ``None``.
+        tag(int, optional): currently not supported yet. Default: ``0``.
+    Returns:
+        P2POp Object.
+    Raises:
+        TypeError: when `op` is not string or function of 'isend' and 'irecv'.
+        TypeError: when `tensor` is not type of Tensor or 'peer' is not int.
+        NotImplementedError: when `tag` is not 0.
+    Supported Platforms:
+        ``Ascend``
+    Examples:
+        >>> import numpy as np
+        >>> import mindspore
+        >>> from mindspore.ops.communication import P2POp, isend, irecv
+        >>> from mindspore import Tensor
+        >>> # Launch 2 processes.
+        >>> send_tensor = Tensor(1.)
+        >>> send_op = P2POp('isend', send_tensor, 1)
+        >>> send_op = P2POp(isend, send_tensor, 1)
+        >>> recv_tensor = Tensor(0.)
+        >>> recv_op = P2POp('irecv', recv_tensor, 0)
+        >>> recv_op = P2POp(irecv, recv_tensor, 0)
+    """
+    def __init__(self, op, tensor, peer, group=None, tag=0):
+        self.op = op
+        self.tensor = tensor
+        self.peer = peer
+        self.group = group
+        self.tag = tag
+    def __new__(cls, op, tensor, peer, group=None, tag=0):
+        if isinstance(op, str):
+            op_name = op
+            if op_name not in ["isend", "irecv"]:
+                raise TypeError(
+                    f"Expected op to be of type isend or irecv, but got {op_name}"
+                )
+        else:
+            if op not in [isend, irecv]:
+                raise TypeError(
+                    f"Expected op to be of type isend or irecv, but got {op}"
+                )
+            op_name = op.__name__
+        if not isinstance(tensor, (Tensor, Tensor_)):
+            raise TypeError(
+                f"Expected tensor to be Tensor, but got {type(tensor)}."
+            )
+        if not isinstance(peer, int):
+            raise TypeError("For P2POp, the peer must be int")
+        if tag != 0:
+            raise NotImplementedError("tag is not support yet.")
+        return object.__new__(cls)
+TYPE_ISEND = 0  # op-type code appended by batch_isend_irecv for an "isend" entry
+TYPE_IRECV = 1  # op-type code appended by batch_isend_irecv for an "irecv" entry
+def batch_isend_irecv(p2p_op_list):
+    """
+    Batch send and recv tensors asynchronously.
+    Note:
+        - The 'isend' and 'irecv' of `P2POp` in `p2p_op_list` between ranks need to match each other.
+        - `P2POp` in `p2p_op_list` can only use the same communication group.
+        - `tag` of `P2POp` in `p2p_op_list` is not support yet.
+        - `tensor` of `P2POp` in `p2p_op_list` will not be modified by result inplace.
+        - Only support PyNative mode, Graph mode is not currently supported.
+    Args:
+        p2p_op_list(list[P2POp]): list contains `P2POp`. `P2POp` is type of :class:`mindspore.ops.communication.P2POp`
+    Returns:
+        list[CommHandle], CommHandle is an async work handle, Currently only one packaging handle is supported.
+    Raises:
+        TypeError: If `p2p_op_list` is empty or `p2p_op_list` are not all type of `P2POp`.
+        TypeError: The group name in `p2p_op_list` are not consistent.
+        TypeError: The `tensor` in `p2p_op_list` are not Tensor.
+        TypeError: The `op` in `p2p_op_list` are not isend or irecv.
+    Supported Platforms:
+        ``Ascend``
+    Examples:
+        .. note::
+            Before running the following examples, you need to configure the communication environment variables.
+            For Ascend devices, it is recommended to use the msrun startup method
+            without any third-party or configuration file dependencies.
+            Please see the `msrun start up
+            <https://www.mindspore.cn/tutorials/en/master/parallel/msrun_launcher.html>`_
+            for more details.
+            This example should be run with 2 devices.
+        >>> import numpy as np
+        >>> import mindspore
+        >>> from mindspore.ops.communication import init_process_group, get_rank, get_world_size
+        >>> from mindspore.ops.communication import batch_isend_irecv, P2POp
+        >>> from mindspore import Tensor
+        >>>
+        >>> init_process_group()
+        >>> this_rank = get_rank()
+        >>> world_size = get_world_size()
+        >>> next_rank = (this_rank + 1) % world_size
+        >>> prev_rank = (this_rank + world_size - 1) % world_size
+        >>>
+        >>> send_tensor = Tensor(this_rank + 1, dtype=mindspore.float32)
+        >>> recv_tensor = Tensor(0., dtype=mindspore.float32)
+        >>>
+        >>> send_op = P2POp('isend', send_tensor, next_rank)
+        >>> recv_op = P2POp('irecv', recv_tensor, prev_rank)
+        >>>
+        >>> p2p_op_list = [send_op, recv_op]
+        >>> output = batch_isend_irecv(p2p_op_list)
+        >>> print(recv_tensor)
+        rank 0:
+        2.0
+        rank 1:
+        1.0
+    """
+    if is_inplace_func() is False:
+        raise ValueError("Non-inplace mode is currently not supported.")
+    tensors = []
+    op_types = []
+    remotes_ranks = []
+    tags = []
+    if not p2p_op_list:
+        raise TypeError(f"p2p_op_list can not be empty list.")
+    for _, p2p_op in enumerate(p2p_op_list):
+        if not isinstance(p2p_op, P2POp):
+            raise TypeError("The elements in p2p_op_list must be type of P2POp.")
+    group = p2p_op_list[0].group
+    type_ = None
+    for _, p2p_op in enumerate(p2p_op_list):
+        if group != p2p_op.group:
+            raise TypeError("The group name in p2p_op_list must be consistent.")
+        if isinstance(p2p_op.op, str):
+            type_ = p2p_op.op
+        else:
+            type_ = p2p_op.op.__name__
+        rank_ = (
+            p2p_op.peer
+            if p2p_op.group is None
+            else get_group_rank_from_world_rank(p2p_op.peer, p2p_op.group)
+        )
+        remotes_ranks.append(rank_)
+        tags.append(p2p_op.tag)
+        if type_ == "isend":
+            tensors.append(p2p_op.tensor)
+            op_types.append(TYPE_ISEND)
+        elif type_ == "irecv":
+            if isinstance(p2p_op.tensor, Tensor):
+                tensors.append(p2p_op.tensor)
+                op_types.append(TYPE_IRECV)
+            else:
+                raise TypeError("p2p_op.tensor must be tensor")
+        else:
+            raise TypeError("p2p_op.op must be isend or irecv")
+    if group is None:
+        group = GlobalComm.WORLD_COMM_GROUP
+    output = dist_comm_batch_isend_irecv_op(tensors, group, op_types, remotes_ranks)
+    _, handle = _deal_comm_outputs(output, True)
+    return [handle]
+def scatter_tensor(output_tensor, input_tensor, src=0, group=None, async_op=False):
+    r"""
+    Scatter tensor evently across the processes in the specified communication group.
+    Note:
+        - The interface behavior only support Tensor input and scatter evenly, which
+            is different from that of `pytoch.distributed.scatter`.
+        - Only the tensor in process `src` (global rank) will do scatter.
+        - Only support PyNative mode, Graph mode is not currently supported.
+    Args:
+        output_tensor (Tensor): Output tensor. It should have the same size across all ranks.
+        input_tensor (Tensor):  The input tensor to be scattered. The shape of tensor is :math:`(x_1, x_2, ..., x_R)`.
+        src (int, optional): Specifies the rank(global rank) of the process that send the tensor.
+            And only process `src` will send the tensor. Default is 0.
+        group (str, optional): The communication group to work on. If ``None``, which means ``"hccl_world_group"`` in
+            Ascend. Default: ``None``.
+        async_op (bool, optional): Whether this operator should be an async operator. Default: ``False`` .
+    Returns:
+        CommHandle, CommHandle is an async work handle, if `async_op` is set to True.
+        CommHandle will be None, when `async_op` is False.
+    Raises:
+        TypeError: If the type of the first input parameter is not Tensor, or any of `op` and `group` is not a str.
+        RuntimeError: If device target is invalid, or backend is invalid, or distributed initialization fails.
+    Supported Platforms:
+        ``Ascend``
+    Examples:
+        .. note::
+            Before running the following examples, you need to configure the communication environment variables.
+            For Ascend devices, it is recommended to use the msrun startup method
+            without any third-party or configuration file dependencies.
+            Please see the `msrun start up
+            <https://www.mindspore.cn/tutorials/en/master/parallel/msrun_launcher.html>`_
+            for more details.
+            This example should be run with 2 devices.
+        >>> import mindspore as ms
+        >>> from mindspore.ops.communication import init_process_group
+        >>> from mindspore.communication.comm_func import scatter_tensor
+        >>> import numpy as np
+        >>> # Launch 2 processes.
+        >>>
+        >>> init_process_group()
+        >>> input = ms.Tensor(np.arange(8).reshape([4, 2]).astype(np.float32))
+        >>> output = ms.Tensor(np.zeros([2, 2]).astype(np.float32))
+        >>> out = scatter_tensor(output, input, src=0)
+        >>> print(output)
+        # rank_0
+        [[0. 1.]
+         [2. 3.]]
+        # rank_1
+        [[4. 5.]
+         [6. 7.]]
+    """
+    if is_inplace_func() is False:
+        raise ValueError("Non-inplace mode is currently not supported.")
+    if not isinstance(input_tensor, (Tensor, Tensor_)):
+        raise TypeError("For scatter_tensor, the input tensor must be tensor")
+    if not isinstance(output_tensor, (Tensor, Tensor_)):
+        raise TypeError("For scatter_tensor, the output tensor must be tensor")
+    if not isinstance(src, int):
+        raise TypeError("For scatter_tensor, the src must be int")
+    if group is None:
+        group = GlobalComm.WORLD_COMM_GROUP
+    if not isinstance(group, str):
+        raise TypeError(
+            "The argument 'group' must be type of string, "
+            "but got 'group' type : {}.".format(type(group))
+        )
+    if not isinstance(async_op, bool):
+        raise TypeError(
+            f"The argument 'async_op' must be a bool, but got {type(async_op)}."
+        )
+    src = get_group_rank_from_world_rank(src, group)
+    rank_size = get_cache_group_size(group)
+    rank_id = get_cache_group_rank(group)
+    output = dist_comm_scatter_tensor_op(
+        output_tensor, input_tensor, rank_size, src, rank_id, group
+    )
+    _, handle = _deal_comm_outputs(output, async_op)
+    return handle
+def gather_into_tensor(output_tensor, input_tensor, dst=0, group=None, async_op=False):
+    r"""
+    Gathers tensors from the specified communication group. The operation will gather the tensor
+    from processes according to dimension 0.
+    Note:
+        - Only the tensor in process `dst` (global rank) will keep the gathered tensor. The other process
+            will keep a tensor with shape [1], which has no mathematical meaning.
+        - Only support PyNative mode, Graph mode is not currently supported.
+    Args:
+        output_tensor (Tensor):  Output tensor to accommodate tensor elements from all ranks.
+        input_tensor (Tensor): The tensor to be gathered. The shape of tensor is :math:`(x_1, x_2, ..., x_R)`.
+            the input tensors in this API must have the same size across all ranks.
+        dst(int, optional): Specifies the rank(global rank) of the process that receive the tensor.
+            And only process `dst` will receive the gathered tensor. Default: 0.
+        group (str, optional): The communication group to work on. If ``None``, which means ``"hccl_world_group"`` in
+            Ascend. Default: ``None``.
+        async_op (bool, optional): Whether this operator should be an async operator. Default: ``False`` .
+    Returns:
+        CommHandle, CommHandle is an async work handle, if `async_op` is set to True.
+        CommHandle will be None, when `async_op` is False.
+    Raises:
+        TypeError: If the type of the `input_tensor` or `output_tensor` parameter is not Tensor,
+            or any of `op` and `group` is not a str.
+        RuntimeError: If device target is invalid, or backend is invalid, or distributed initialization fails.
+    Supported Platforms:
+        ``Ascend``
+    Examples:
+        .. note::
+            Before running the following examples, you need to configure the communication environment variables.
+            For Ascend devices, it is recommended to use the msrun startup method
+            without any third-party or configuration file dependencies.
+            Please see the `msrun start up
+            <https://www.mindspore.cn/tutorials/en/master/parallel/msrun_launcher.html>`_
+            for more details.
+            This example should be run with 2 devices.
+        >>> import numpy as np
+        >>> import mindspore as ms
+        >>> import mindspore.nn as nn
+        >>> from mindspore.ops.communication import init_process_group
+        >>> from mindspore import Tensor
+        >>> from mindspore.communication.comm_func import gather_into_tensor
+        >>> # Launch 2 processes.
+        >>>
+        >>> init_process_group()
+        >>> input = Tensor(np.arange(4).reshape([2, 2]).astype(np.float32))
+        >>> output = Tensor(np.zeros([4, 2]).astype(np.float32))
+        >>> handle = gather_into_tensor(output, input, dst=0)
+        >>> print(output)
+        Process with rank 0: [[0. 1.],
+                              [2. 3.],
+                              [0. 1.],
+                              [2. 3.]]
+        Process with rank 1:  [[0. 0.],
+                              [0. 0.],
+                              [0. 0.],
+                              [0. 0.]]
+    """
+    if is_inplace_func() is False:
+        raise ValueError("Non-inplace mode is currently not supported.")
+    if not isinstance(input_tensor, (Tensor, Tensor_)):
+        raise TypeError("For gather_into_tensor, the input tensor must be tensor")
+    if not isinstance(output_tensor, (Tensor, Tensor_)):
+        raise TypeError("For gather_into_tensor, the output tensor must be tensor")
+    if not isinstance(dst, int):
+        raise TypeError("For gather_into_tensor, the dst must be int")
+    if group is None:
+        group = GlobalComm.WORLD_COMM_GROUP
+    if not isinstance(group, str):
+        raise TypeError(
+            "The argument 'group' must be type of string, "
+            "but got 'group' type : {}.".format(type(group))
+        )
+    if not isinstance(async_op, bool):
+        raise TypeError(
+            f"The argument 'async_op' must be a bool, but got {type(async_op)}."
+        )
+    group_size = get_cache_group_size(group)
+    dst = get_group_rank_from_world_rank(dst, group)
+    rank_id = get_cache_group_rank(group)
+    output = dist_comm_gather_into_tensor_op(
+        output_tensor, input_tensor, group_size, dst, rank_id, group
+    )
+    _, handle = _deal_comm_outputs(output, async_op)
+    return handle
+def broadcast(tensor, src, group=None, async_op=False):
+    """
+    Broadcasts the tensor to the whole group.
+    Note:
+        - The tensors must have the same shape and format in all processes of the collection.
+        - Only support PyNative mode, Graph mode is not currently supported.
+    Args:
+        tensor (Tensor): Data to be sent if src is the rank of current process,
+            and tensor to be used to save received data otherwise.
+        src (int): Specifies the rank(global rank) of the process that broadcast the tensor.
+            And only process `src` will broadcast the tensor.
+        group (str, optional): The communication group to work on. If ``None``, which means ``"hccl_world_group"`` in
+            Ascend. Default: ``None``.
+        async_op (bool, optional): Whether this operator should be an async operator. Default: ``False`` .
+    Returns:
+        CommHandle, CommHandle is an async work handle, if `async_op` is set to True.
+        CommHandle will be None, when `async_op` is False.
+    Raises:
+        TypeError: If the type of the `tensor` parameter is not Tensor, `src` is not an integer,
+            `group` is not a string or `async_op` is not bool.
+        RuntimeError: If device target is invalid, or backend is invalid, or distributed initialization fails.
+    Supported Platforms:
+        ``Ascend`` ``CPU``
+    Examples:
+        .. note::
+            Before running the following examples, you need to configure the communication environment variables.
+            For Ascend devices, it is recommended to use the msrun startup method
+            without any third-party or configuration file dependencies.
+            Please see the `msrun start up
+            <https://www.mindspore.cn/tutorials/en/master/parallel/msrun_launcher.html>`_
+            for more details.
+            This example should be run with 2 devices.
+        >>> import mindspore as ms
+        >>> from mindspore.ops.communication import init_process_group, broadcast
+        >>> import numpy as np
+        >>> # Launch 2 processes.
+        >>>
+        >>> init_process_group()
+        >>> data = ms.Tensor(np.arange(8).reshape([2, 4]).astype(np.float32))
+        >>> handle = broadcast(tensor=data, src=0)
+        >>> print(data)
+        [[0. 1. 2. 3.]
+         [4. 5. 6. 7.]]
+    """
+    if is_inplace_func() is False:
+        raise ValueError("Non-inplace mode is currently not supported.")
+    if not isinstance(tensor, (Tensor, Tensor_)):
+        raise TypeError("For broadcast, the input tensor must be tensor")
+    if not isinstance(src, int):
+        raise TypeError("For broadcast, the src must be int")
+    if group is None:
+        group = GlobalComm.WORLD_COMM_GROUP
+    if not isinstance(group, str):
+        raise TypeError(
+            "The argument 'group' must be type of string, "
+            "but got 'group' type : {}.".format(type(group))
+        )
+    if not isinstance(async_op, bool):
+        raise TypeError(
+            f"The argument 'async_op' must be a bool, but got {type(async_op)}."
+        )
+    src_rank = get_group_rank_from_world_rank(src, group)
+    rank_id = get_cache_group_rank(group)
+    output = dist_comm_broadcast_op(tensor, src_rank, rank_id, group)
+    _, handle = _deal_comm_outputs(output, async_op)
+    return handle
+def barrier(group=None, async_op=False, device_ids=None):
+    """
+    Synchronizes all processes in the specified group. Once the process call this operation, it will be blocked until
+    all processes call this operation. After all processes finish calling the operations, the blocked processes
+    will be woken and continue their task.
+    Args:
+        group (str, optional): The communication group to work on. If ``None``, which means ``"hccl_world_group"`` in
+            Ascend. Default: ``None``.
+        async_op (bool, optional): Whether this operator should be an async operator. Default: ``False`` .
+        device_ids (list[int], optional): Currently It is a reserved Parameter.
+    Returns:
+        CommHandle, CommHandle is an async work handle, if `async_op` is set to True.
+        CommHandle will be None, when `async_op` is False.
+    Raises:
+        TypeError: `group` is not a str or `async_op` is not a bool.
+        RuntimeError: If backend is invalid, or distributed initialization fails.
+    Supported Platforms:
+        ``Ascend`` ``CPU``
+    Examples:
+        .. note::
+            Before running the following examples, you need to configure the communication environment variables.
+            For Ascend devices, it is recommended to use the msrun startup method
+            without any third-party or configuration file dependencies.
+            Please see the `msrun start up
+            <https://www.mindspore.cn/tutorials/en/master/parallel/msrun_launcher.html>`_
+            for more details.
+            This example should be run with 2 devices.
+        >>> from mindspore.ops.communication import init_process_group
+        >>> from mindspore.communication.comm_func import barrier
+        >>> # Launch 2 processes.
+        >>> init_process_group()
+        >>> barrier()
+        >>> print("barrier finish!")
+        barrier finish!
+    """
+    if group is None:
+        group = GlobalComm.WORLD_COMM_GROUP
+    if not isinstance(group, str):
+        raise TypeError(
+            "The argument 'group' must be type of string, "
+            "but got 'group' type : {}.".format(type(group))
+        )
+    if not isinstance(async_op, bool):
+        raise TypeError(
+            f"The argument 'async_op' must be a bool, but got {type(async_op)}."
+        )
+    output = dist_comm_barrier_op(group)
+    _, handle = _deal_comm_outputs(output, async_op, True)
+    return handle
+def send(tensor, dst=0, group=None, tag=0):
+    """
+    Send tensors to the specified dest_rank.
+    Note:
+        Only support PyNative mode, Graph mode is not currently supported.
+    Args:
+        tensor (Tensor): Tensor to send.
+        dst (int, optional): A required integer identifying the destination rank(global rank). Default: 0.
+        group (str, optional): The communication group to work on. If ``None``, which means ``"hccl_world_group"`` in
+            Ascend. Default: ``None``.
+        tag (int, optional): A required integer identifying the send/recv message tag. The message will
+            be received by the Receive op with the same "tag". Default: 0. It is a reserved parameter currently.
+    Raises:
+        TypeError: If the `tensor` is not Tensor, `dst` is not an int or `group` is not a str.
+        ValueError: If the `dst` process rank id is same as the current process.
+    Supported Platforms:
+        ``Ascend`` ``CPU``
+    Examples:
+        .. note::
+            Before running the following examples, you need to configure the communication environment variables.
+            For Ascend devices, it is recommended to use the msrun startup method
+            without any third-party or configuration file dependencies.
+            Please see the `msrun start up
+            <https://www.mindspore.cn/tutorials/en/master/parallel/msrun_launcher.html>`_
+            for more details.
+            This example should be run with 2 devices.
+        >>> from mindspore.ops.communication import init_process_group
+        >>> from mindspore.ops.communication import send, recv, get_rank
+        >>> from mindspore import Tensor
+        >>> import numpy as np
+        >>>
+        # Launch 2 processes, Process 0 sends the array to Process 1.
+        >>> init_process_group()
+        >>> this_rank = get_rank()
+        >>> if this_rank == 0:
+        ...     input_ = Tensor(np.ones([2, 8]).astype(np.float32))
+        ...     send(input_, 1)
+        >>> if this_rank == 1:
+        ...     x = Tensor(np.zeros([2, 8]).astype(np.float32))
+        ...     out = recv(x, src=0)
+        ...     print(x)
+        rank 1:
+        [[1. 1. 1. 1. 1. 1. 1. 1.]
+         [1. 1. 1. 1. 1. 1. 1. 1.]]
+    """
+    if not isinstance(tensor, (Tensor, Tensor_)):
+        raise TypeError("For send, the input tensor must be tensor")
+    if not isinstance(dst, int):
+        raise TypeError("For send, the dst must be int")
+    if group is None:
+        group = GlobalComm.WORLD_COMM_GROUP
+    if not isinstance(group, str):
+        raise TypeError(
+            "The argument 'group' must be type of string, "
+            "but got 'group' type : {}.".format(type(group))
+        )
+    if get_cache_group_rank() == dst:
+        raise ValueError(
+            "Invalid destination rank: destination rank should not be the same as "
+            "the rank of the current process."
+        )
+    _dst = _get_group_rank_from_world_rank_from_cache_helper(dst, group)
+    output = dist_comm_isend_op(tensor, _dst, group, tag)
+    _deal_comm_outputs(output, False)
+def recv(tensor, src=0, group=None, tag=0):
+    """
+    Receive tensors from src.
+    Note:
+        Only support PyNative mode, Graph mode is not currently supported.
+    Args:
+        tensor (Tensor): Tensor to fill with received data, If the function operates in-place. Otherwise,
+            Indicates the the shape and dtype of this tensor is used to receive tensor, but the value of
+            input `tensor` would not take effect.
+        src (int, optional): A required integer identifying the source rank(global rank). Default: ``0``.
+        group (str, optional): The communication group to work on. If ``None``, which means ``"hccl_world_group"`` in
+            Ascend. Default: ``None``.
+        tag (int, optional): A required integer identifying the send/recv message tag. The message will
+            be received by the Send op with the same "tag". Default: ``0``. It is a reserved parameter currently.
+    Returns:
+        - int, if the function operates in-place, return it. If success, return ``0``.
+        - Tensor, if the function operates non in-place, return it.
+            the shape of output is :math:`(x_1, x_2, ..., x_R)`.
+    Raises:
+        TypeError: If the `tensor` is not Tensor, `src` is not an int or `group` is not a str.
+        ValueError: If the rank ID of the process is greater than the rank size of the communication group.
+    Supported Platforms:
+        ``Ascend`` ``CPU``
+    Examples:
+        .. note::
+            Before running the following examples, you need to configure the communication environment variables.
+            For Ascend devices, it is recommended to use the msrun startup method
+            without any third-party or configuration file dependencies.
+            Please see the `msrun start up
+            <https://www.mindspore.cn/tutorials/en/master/parallel/msrun_launcher.html>`_
+            for more details.
+            This example should be run with 2 devices.
+        >>> from mindspore.ops.communication import init_process_group
+        >>> from mindspore.ops.communication import send, recv, get_rank
+        >>> from mindspore import Tensor
+        >>> import numpy as np
+        >>>
+        # Launch 2 processes, Process 0 sends the array to Process 1.
+        >>> init_process_group()
+        >>> this_rank = get_rank()
+        >>> if this_rank == 0:
+        ...     input_ = Tensor(np.ones([2, 8]).astype(np.float32))
+        ...     send(input_, 1)
+        >>> if this_rank == 1:
+        ...     x = Tensor(np.zeros([2, 8]).astype(np.float32))
+        ...     out = recv(x, src=0)
+        ...     print(x)
+        rank 1:
+        [[1. 1. 1. 1. 1. 1. 1. 1.]
+         [1. 1. 1. 1. 1. 1. 1. 1.]]
+    """
+    if not isinstance(tensor, (Tensor, Tensor_)):
+        raise TypeError("For recv, the input tensor must be tensor")
+    if not isinstance(src, int):
+        raise TypeError("For recv, the src must be int")
+    if group is None:
+        group = GlobalComm.WORLD_COMM_GROUP
+    if not isinstance(group, str):
+        raise TypeError(
+            "The argument 'group' must be type of string, "
+            "but got 'group' type : {}.".format(type(group))
+        )
+    _src = _get_group_rank_from_world_rank_from_cache_helper(src, group)
+    if is_inplace_func() is True:
+        output = dist_comm_irecv_op(tensor, tag, _src, group)
+        _deal_comm_outputs(output, False)
+        return 0
+    shape = tensor.shape
+    dtype = tensor.dtype
+    output = inner_comm_irecv_op(tag, _src, shape, group, dtype)
+    output, _ = _deal_comm_outputs(output, False)
+    return output
+def isend(tensor, dst=0, group=None, tag=0):
+    """
+    Send tensors to the specified dest_rank asynchronously.
+    Note:
+        Only support PyNative mode, Graph mode is not currently supported.
+    Args:
+        tensor (Tensor): Tensor to send.
+        dst (int, optional): A required integer identifying the destination rank(global rank). Default: 0.
+        group (str, optional): The communication group to work on. If ``None``, which means ``"hccl_world_group"`` in
+            Ascend. Default: ``None``.
+        tag (int, optional): A required integer identifying the send/recv message tag. The message will
+            be received by the Receive op with the same "tag". Default: 0. It is a reserved parameter currently.
+    Returns:
+        CommHandle, it is an async work handle.
+    Raises:
+        TypeError: If the `tensor` is not Tensor, `dst` is not an int or `group` is not a str.
+        ValueError: If the `dst` process rank id is same as the current process.
+    Supported Platforms:
+        ``Ascend``
+    Examples:
+        .. note::
+            Before running the following examples, you need to configure the communication environment variables.
+            For Ascend devices, it is recommended to use the msrun startup method
+            without any third-party or configuration file dependencies.
+            Please see the `msrun start up
+            <https://www.mindspore.cn/tutorials/en/master/parallel/msrun_launcher.html>`_
+            for more details.
+            This example should be run with 2 devices.
+        >>> from mindspore.ops.communication import init_process_group
+        >>> from mindspore.ops.communication import isend, irecv, get_rank
+        >>> from mindspore import Tensor
+        >>> import numpy as np
+        >>>
+        # Launch 2 processes, Process 0 sends the array to Process 1.
+        >>> init_process_group()
+        >>> this_rank = get_rank()
+        >>> if this_rank == 0:
+        ...     input_ = Tensor(np.ones([2, 8]).astype(np.float32))
+        ...     handle = isend(input_, 1)
+        ...     handle.wait()
+        >>> if this_rank == 1:
+        ...     x = Tensor(np.zeros([2, 8]).astype(np.float32))
+        ...     handle = irecv(x, src=0)
+        ...     handle.wait()
+        ...     print(x)
+        rank 1:
+        [[1. 1. 1. 1. 1. 1. 1. 1.]
+         [1. 1. 1. 1. 1. 1. 1. 1.]]
+    """
+    if not isinstance(tensor, (Tensor, Tensor_)):
+        raise TypeError("For isend, the input tensor must be tensor")
+    if not isinstance(dst, int):
+        raise TypeError("For isend, the dst must be int")
+    if group is None:
+        group = GlobalComm.WORLD_COMM_GROUP
+    if not isinstance(group, str):
+        raise TypeError(
+            "The argument 'group' must be type of string, "
+            "but got 'group' type : {}.".format(type(group))
+        )
+    if get_cache_group_rank() == dst:
+        raise ValueError(
+            "Invalid destination rank: destination rank should not be the same as "
+            "the rank of the current process."
+        )
+    _dst = _get_group_rank_from_world_rank_from_cache_helper(dst, group)
+    output = dist_comm_isend_op(tensor, _dst, group, tag)
+    _, handle = _deal_comm_outputs(output, True)
+    return handle
+def irecv(tensor, src=0, group=None, tag=0):
+    """
+    Receive tensors from src asynchronously.
+    Note:
+        Only support PyNative mode, Graph mode is not currently supported.
+    Args:
+        tensor (Tensor): Tensor to fill with received data, if the function operates in-place.Otherwise,
+            Indicates the the shape and dtype of this tensor is used to receive tensor, but the value of
+            input `tensor` would not take effect.
+        src (int, optional): A required integer identifying the source rank(global rank). Default: ``0``.
+        group (str, optional): The communication group to work on. If ``None``, which means ``"hccl_world_group"`` in
+            Ascend. Default: ``None``.
+        tag (int, optional): A required integer identifying the send/recv message tag. The message will
+            be received by the Send op with the same "tag". Default: ``0``. It is a reserved parameter currently.
+    Returns:
+        - CommHandle, if the function operates in-place, return it. CommHandle is an async work handle,
+            if `async_op` is set to True. CommHandle will be None, when `async_op` is False.
+        - Tuple(Tensor, CommHandle), if the function operates non in-place, return it. the shape of output
+            is :math:`(x_1, x_2, ..., x_R)`. CommHandle is an async work handle, if `async_op` is set to True.
+            CommHandle will be None, when `async_op` is False.
+    Raises:
+        TypeError: If the type of `tensor` is not Tensor, If `src` is not an int or `group` is not a str.
+        ValueError: If the rank ID of the process is greater than the rank size of the communication group.
+    Supported Platforms:
+        ``Ascend``
+    Examples:
+        .. note::
+            Before running the following examples, you need to configure the communication environment variables.
+            For Ascend devices, it is recommended to use the msrun startup method
+            without any third-party or configuration file dependencies.
+            Please see the `msrun start up
+            <https://www.mindspore.cn/tutorials/en/master/parallel/msrun_launcher.html>`_
+            for more details.
+            This example should be run with 2 devices.
+        >>> from mindspore.ops.communication import init_process_group
+        >>> from mindspore.ops.communication import isend, irecv, get_rank
+        >>> from mindspore import Tensor
+        >>> import numpy as np
+        >>>
+        # Launch 2 processes, Process 0 sends the array to Process 1.
+        >>> init_process_group()
+        >>> this_rank = get_rank()
+        >>> if this_rank == 0:
+        ...     input_ = Tensor(np.ones([2, 8]).astype(np.float32))
+        ...     handle = isend(input_, 1)
+        ...     handle.wait()
+        >>> if this_rank == 1:
+        ...     x = Tensor(np.zeros([2, 8]).astype(np.float32))
+        ...     handle = irecv(x, src=0)
+        ...     handle.wait()
+        ...     print(x)
+        rank 1:
+        [[1. 1. 1. 1. 1. 1. 1. 1.]
+         [1. 1. 1. 1. 1. 1. 1. 1.]]
+    """
+    if not isinstance(tensor, (Tensor, Tensor_)):
+        raise TypeError("For irecv, the input tensor must be tensor")
+    if group is None:
+        group = GlobalComm.WORLD_COMM_GROUP
+    if not isinstance(group, str):
+        raise TypeError(
+            "The argument 'group' must be type of string, "
+            "but got 'group' type : {}.".format(type(group))
+        )
+    if not isinstance(src, int):
+        raise TypeError("For irecv, the src must be int")
+    _src = _get_group_rank_from_world_rank_from_cache_helper(src, group)
+    if is_inplace_func() is True:
+        output = dist_comm_irecv_op(tensor, tag, _src, group)
+        _, handle = _deal_comm_outputs(output, True)
+        return handle
+    shape = tensor.shape
+    dtype = tensor.dtype
+    output = inner_comm_irecv_op(tag, _src, shape, group, dtype)
+    return _deal_comm_outputs(output, True)
+def all_to_all(output_tensor_list, input_tensor_list, group=None, async_op=False):
+    """
+    scatter and gather list of tensor to/from all rank according to input/output tensor list.
+    Note:
+        - tensor shape in `output_shape_list` and `input_tensor_list` should be match across ranks.
+        - Only support PyNative mode, Graph mode is not currently supported.
+    Args:
+        output_tensor_list(Union[List(Tensor), List(Tuple(int))]): List of tensors that indicate the gathered
+            from remote ranks, If the function operates in-place. Otherwise, List of tensors or shape
+            that indicate the gathered tensors shape from remote ranks.
+        input_tensor_list (List[Tensor]): List of tensors to scatter to the remote rank.
+        group (str, optional): The communication group to work on. If ``None``, which means ``"hccl_world_group"`` in
+            Ascend. Default: ``None``.
+        async_op (bool, optional): Whether this operator should be an async operator. Default: ``False`` .
+    Returns:
+        - CommHandle, if the function operates in-place, return it. CommHandle is an async work handle,
+            if `async_op` is set to True. CommHandle will be None, when `async_op` is False.
+        - Tuple(Tensor, CommHandle), if the function operates non in-place, return it. the tensors is gathered
+            from remote ranks. CommHandle is an async work handle, if `async_op` is set to True.
+            CommHandle will be None, when `async_op` is False.
+    Raises:
+        TypeError: If not all elements in `input_tensor_list` or `output_tensor_list` are Tensor.
+        TypeError: If tensors in `input_tensor_list` or `output_tensor_list` are not the same type.
+        TypeError: If `group` is not str or `async_op` is not bool.
+    Supported Platforms:
+        ``Ascend``
+    Examples:
+        .. note::
+            Before running the following examples, you need to configure the communication environment variables.
+            For Ascend devices, it is recommended to use the msrun startup method
+            without any third-party or configuration file dependencies.
+            Please see the `msrun start up
+            <https://www.mindspore.cn/tutorials/en/master/parallel/msrun_launcher.html>`_
+            for more details.
+            This example should be run with 2 devices.
+        >>> import mindspore as ms
+        >>> from mindspore.ops.communication import init_process_group, get_rank
+        >>> from mindspore.ops.communication import all_to_all
+        >>> from mindspore import Tensor
+        >>>
+        >>> init_process_group()
+        >>> this_rank = get_rank()
+        >>> if this_rank == 0:
+        ...     send_tensor_list = [Tensor(1.), Tensor([[2, 3], [4, 5.]])]
+        ...     recv_tensor_list = [Tensor((0), dtype=ms.float32), Tensor([0, 0.])]
+        >>> if this_rank == 1:
+        ...     send_tensor_list = [Tensor([2, 2.]), Tensor([4, 5, 6, 7.])]
+        ...     recv_tensor_list = [Tensor([[0, 0.],[0, 0]]), Tensor([0, 0, 0, 0.])]
+        >>> handle = all_to_all(recv_tensor_list, send_tensor_list)
+        >>> print(recv_tensor_list)
+        rank 0:
+        (Tensor(shape=[], dtype=Float32, value= 1),
+         Tensor(shape=[2], dtype=Float32, value= [2.00000000e+00, 2.00000000e+00]))
+        rank 1:
+        (Tensor(shape=[2, 2], dtype=Float32, value=
+        [[2.00000000e+00, 3.00000000e+00],
+         [4.00000000e+00, 5.00000000e+00]]),
+         Tensor(shape=[4], dtype=Float32, value=[4.00000000e+00, 5.00000000e+00, 6.00000000e+00, 7.00000000e+00]))
+    """
+    if group is None:
+        group = GlobalComm.WORLD_COMM_GROUP
+    if not isinstance(group, str):
+        raise TypeError(
+            "The argument 'group' must be type of string, "
+            "but got 'group' type : {}.".format(type(group))
+        )
+    if not isinstance(async_op, bool):
+        raise TypeError(
+            f"The argument 'async_op' must be a bool, but got {type(async_op)}."
+        )
+    _check_all_tensors(input_tensor_list)
+    _check_all_tensor_same_dtype(input_tensor_list)
+    send_numel_list = []
+    send_flatten_tensor = []
+    recv_numel_list = []
+    recv_shape_list = []
+    for tensor in input_tensor_list:
+        send_numel_list.append(tensor.numel())
+        send_flatten_tensor.append(tensor.reshape(-1))
+    send_flatten_tensor = cat(send_flatten_tensor)
+    rank_size = get_cache_group_size(group)
+    if is_inplace_func() is False:
+        _check_all_tensors_or_tuple(output_tensor_list)
+        for tensor in output_tensor_list:
+            if isinstance(tensor, Tensor):
+                recv_numel_list.append(tensor.size)
+                recv_shape_list.append(tensor.shape)
+            else:
+                _shape = tensor
+                recv_numel_list.append(_get_size(_shape))
+                recv_shape_list.append(_shape)
+        output = inner_comm_all_to_all_v_op(send_flatten_tensor, group, send_numel_list, recv_numel_list,
+                                            rank_size, False)
+        output, handle = _deal_comm_outputs(output, async_op)
+        result = []
+        offset = 0
+        for numel, shape in zip(recv_numel_list, recv_shape_list):
+            result.append(output[offset:offset + numel].reshape(shape))
+            offset = offset + numel
+        return (tuple(result), handle)
+    _check_all_tensors(output_tensor_list)
+    _check_all_tensor_same_dtype(output_tensor_list)
+    for tensor in output_tensor_list:
+        recv_numel_list.append(tensor.numel())
+        recv_shape_list.append(tensor.shape)
+    output = dist_comm_all_to_all_v_op(
+        output_tensor_list,
+        send_flatten_tensor,
+        group,
+        send_numel_list,
+        recv_numel_list,
+        rank_size,
+    )
+    _, handle = _deal_comm_outputs(output, async_op)
+    return handle
+def _get_all_to_all_single_numel_list(tensor, output, output_split_sizes,
+                                      input_split_sizes, group):
+    """get numel list for all_to_all_single."""
+    if _is_split_sizes_empty(input_split_sizes):
+        _world_size = get_cache_group_size(group)
+        if tensor.shape[0] % _world_size != 0:
+            raise ValueError(
+                "input shape at dim 0 must be divided by world_size, "
+                f"but got {tensor.shape[0]} and {_world_size}."
+            )
+        _split_size = tensor.shape[0] // _world_size
+        input_split_sizes = (_split_size,) * _world_size
+    if _is_split_sizes_empty(output_split_sizes):
+        _world_size = get_cache_group_size(group)
+        shape_dim_0 = output.shape[0]
+        if shape_dim_0 % _world_size != 0:
+            raise ValueError(
+                "output shape at dim 0 must be divided by world_size, "
+                f"but got {shape_dim_0} and {_world_size}."
+            )
+        _split_size = shape_dim_0 // _world_size
+        output_split_sizes = (_split_size,) * _world_size
+    send_size_without_first_dim = _get_size(tensor.shape[1:])
+    send_numel_list = [size * send_size_without_first_dim for size in input_split_sizes]
+    recv_shape_without_first_dim = output.shape[1:]
+    recv_size_without_first_dim = _get_size(recv_shape_without_first_dim)
+    recv_numel_list = [
+        size * recv_size_without_first_dim for size in output_split_sizes
+    ]
+    return send_numel_list, recv_numel_list, recv_shape_without_first_dim
+def all_to_all_single(output,
+                      input,
+                      output_split_sizes=None,
+                      input_split_sizes=None,
+                      group=None,
+                      async_op=False):
+    """
+    scatter and gather input with split size to/from all rank, and return result in a single tensor.
+    Note:
+        - Only support PyNative mode, Graph mode is not currently supported.
+    Args:
+        output (Union(Tensor, Tuple(int))): the output tensor is gathered concatenated from remote ranks,
+            if the functionoperates in-place. Otherwise, the tensor or shape to indicate the shape
+            of tensor gathered concatenated from remote rank.
+        input (Tensor): tensor to be scattered to remote rank.
+        output_split_sizes (Union(Tuple(int), List(int)), optional): output split size at dim 0. If set to None,
+            it means equally split by ``world_size``. Default: ``None``.
+        input_split_sizes (Union(Tuple(int), List(int)), optional): input split size at dim 0. If set to None,
+            it means equally split by ``world_size``. Default: ``None``.
+        group (str, optional): The communication group to work on. If ``None``, which means ``"hccl_world_group"`` in
+            Ascend. Default: ``None``.
+        async_op (bool, optional): Whether this operator should be an async operator. Default: ``False`` .
+    Returns:
+        - CommHandle, if the function operates in-place, return it. CommHandle is an async work handle,
+            if `async_op` is set to True. CommHandle will be None, when `async_op` is False.
+        - Tuple(Tensor, CommHandle), if the function operates non in-place, return it.
+            The output tensor is gathered concatenated from remote ranks.
+            If the numel of tensor gathered from remote is zero, it will return a Tensor with shape `()`,
+            and value has no actual meanning. CommHandle is an async work handle, if `async_op` is set to True.
+            CommHandle will be None, when `async_op` is False.
+    Raises:
+        TypeError: If `input` or `output` is not tensor. `group` is not a str, or async_op is not bool.
+        ValueError: When `input_split_sizes` is empty, input dim 0 can not be divided by ``world_size``.
+        ValueError: When `output_split_sizes` is empty, output dim 0 can not be divided by ``world_size``.
+    Supported Platforms:
+        ``Ascend``
+    Examples:
+        .. note::
+            Before running the following examples, you need to configure the communication environment variables.
+            For Ascend devices, it is recommended to use the msrun startup method
+            without any third-party or configuration file dependencies.
+            Please see the `msrun start up
+            <https://www.mindspore.cn/tutorials/en/master/parallel/msrun_launcher.html>`_
+            for more details.
+            This example should be run with 2 devices.
+        >>> import numpy as np
+        >>> import mindspore
+        >>> from mindspore.ops.communication import init_process_group, get_rank
+        >>> from mindspore.ops.communication import all_to_all_single
+        >>> from mindspore import Tensor
+        >>> from mindspore.ops.communication import zeros
+        >>>
+        >>> init_process_group()
+        >>> this_rank = get_rank()
+        >>> if this_rank == 0:
+        ...     output = Tensor(np.zeros([3, 3]).astype(np.float32))
+        ...     tensor = Tensor([[0, 1, 2.], [3, 4, 5], [6, 7, 8]])
+        ...     result = all_to_all_single(output, tensor, [2, 1], [2, 1])
+        ...     print(output)
+        >>> if this_rank == 1:
+        ...     output = Tensor(np.zeros([2, 3]).astype(np.float32))
+        ...     tensor = Tensor([[9, 10., 11], [12, 13, 14]])
+        ...     result = all_to_all_single(output, tensor, [1, 1], [1, 1])
+        ...     print(output)
+        rank 0:
+        [[ 0.  1.  2.]
+         [ 3.  4.  5.]
+         [ 9. 10. 11.]]
+        rank 1:
+        [[ 6.  7.  8.]
+         [12. 13. 14.]]
+    """
+    _check_all_tensors([input])
+    _check_all_tensors([output])
+    if group is None:
+        group = GlobalComm.WORLD_COMM_GROUP
+    if not isinstance(group, str):
+        raise TypeError(
+            "The argument 'group' must be type of string, "
+            "but got 'group' type : {}.".format(type(group))
+        )
+    if not isinstance(async_op, bool):
+        raise TypeError(
+            f"The argument 'async_op' must be a bool, but got {type(async_op)}."
+        )
+    split_sizes_empty = _is_split_sizes_empty(output_split_sizes) and _is_split_sizes_empty(input_split_sizes)
+    _input = input.reshape(-1)
+    rank_size = get_cache_group_size(group)
+    if is_inplace_func() is False:
+        if isinstance(output_split_sizes, list):
+            output_split_sizes = tuple(output_split_sizes)
+        if isinstance(input_split_sizes, list):
+            input_split_sizes = tuple(input_split_sizes)
+        global _ALL_TO_ALL_CACHE
+        tensor_shape = output
+        cache_key = (tensor_shape, output, output_split_sizes, input_split_sizes, group)
+        if cache_key not in _ALL_TO_ALL_CACHE:
+            _ALL_TO_ALL_CACHE[cache_key] = _get_all_to_all_single_numel_list(*cache_key)
+        send_numel_list, recv_numel_list, recv_shape_without_first_dim = _ALL_TO_ALL_CACHE[cache_key]
+        result = \
+            inner_comm_all_to_all_v_op(_input, group, send_numel_list, recv_numel_list, rank_size, split_sizes_empty)
+        result, handle = _deal_comm_outputs(result, async_op)
+        if any(recv_numel_list):
+            result = result.reshape((-1,) + recv_shape_without_first_dim)
+        return result, handle
+    send_numel_list, recv_numel_list, _ = \
+        _get_all_to_all_single_numel_list(input, output, output_split_sizes, input_split_sizes, group)
+    result = dist_comm_all_to_all_v_single_op(
+        output,
+        _input,
+        group,
+        send_numel_list,
+        recv_numel_list,
+        rank_size,
+        split_sizes_empty,
+    )
+    _, handle = _deal_comm_outputs(result, async_op)
+    return handle
+def _check_tensor_list(tensor_list, tensor, group_size):
+    """check all elements in tensor_list are type of Tensor or tuple or list"""
+    _check_group_tensor_list(tensor_list, group_size)
+    if tensor.dtype != tensor_list[0].dtype:
+        raise TypeError(
+            f"The argument list tensor type must be equal to tensor type, but got {tensor_list[0].dtype}."
+        )
+    if tensor.shape != tensor_list[0].shape:
+        raise TypeError(
+            f"The argument list tensor shape must be equal to tensor shape, but got {tensor_list[0].shape}."
+        )
+def _check_group_tensor_list(tensor_list, group_size):
+    if not tensor_list or len(tensor_list) != group_size:
+        raise TypeError(
+            f"The argument list tensor len must be equal to group rank size, but got {len(tensor_list)}."
+        )
+def all_gather(tensor_list, tensor, group=None, async_op=False):
+    """
+    Gathers tensors from the specified communication group and returns the tensor list which is all gathered.
+    Args:
+        tensor_list (list[Tensor]): Output list.
+        tensor (Tensor): The input tensor to be all gathered into tensor.
+        group (str, optional): The communication group to work on. If ``None``, which means ``"hccl_world_group"`` in
+            Ascend. Default: ``None``.
+        async_op (bool, optional): Whether this operator should be an async operator. Default: ``False`` .
+    Returns:
+        CommHandle,  CommHandle is an async work handle, if `async_op` is set to True.
+        CommHandle will be None, when `async_op` is False.
+    Raises:
+        TypeError: If the type of input `tensor` is not Tensor, `tensor_list` is not Tensor List,
+            `group` is not a str or async_op is not bool.
+        TypeError: If size of `tensor_list` is not equal to group size。
+        TypeError: If the type or shape of `tensor` not equal to the member of `tensor_list`。
+        RuntimeError: If device target is invalid, or backend is invalid, or distributed initialization fails.
+    Supported Platforms:
+        ``Ascend`` ``CPU``
+    Examples:
+        .. note::
+            Before running the following examples, you need to configure the communication environment variables.
+            For Ascend devices, it is recommended to use the msrun startup method
+            without any third-party or configuration file dependencies.
+            Please see the `msrun start up
+            <https://www.mindspore.cn/tutorials/en/master/parallel/msrun_launcher.html>`_
+            for more details.
+            This example should be run with 2 devices.
+        >>> import numpy as np
+        >>> import mindspore as ms
+        >>> from mindspore.ops.communication import init_process_group
+        >>> from mindspore.ops.communication import all_gather
+        >>> from mindspore import Tensor
+        >>>
+        >>> init_process_group()
+        >>> input_tensor = Tensor(np.ones([2, 8]).astype(np.float32))
+        >>> out_tensors = [Tensor(np.zeros([2, 8]).astype(np.float32)), Tensor(np.zeros([2, 8]).astype(np.float32))]
+        >>> output = all_gather(out_tensors, input_tensor)
+        >>> print(out_tensors)
+        [Tensor(shape=[2, 8], dtype=Float32, value=
+        [[ 1.00000000e+00,  1.00000000e+00,  1.00000000e+00 ...  1.00000000e+00,  1.00000000e+00,  1.00000000e+00],
+         [ 1.00000000e+00,  1.00000000e+00,  1.00000000e+00 ...  1.00000000e+00,  1.00000000e+00,  1.00000000e+00]]),
+        Tensor(shape=[2, 8], dtype=Float32, value=
+        [[ 1.00000000e+00,  1.00000000e+00,  1.00000000e+00 ...  1.00000000e+00,  1.00000000e+00,  1.00000000e+00],
+         [ 1.00000000e+00,  1.00000000e+00,  1.00000000e+00 ...  1.00000000e+00,  1.00000000e+00,  1.00000000e+00]])]
+    """
+    if is_inplace_func() is False:
+        raise ValueError("Non-inplace mode is currently not supported.")
+    _check_all_tensors(tensor_list)
+    _check_all_tensor_same_dtype(tensor_list)
+    if not isinstance(tensor, (Tensor, Tensor_)):
+        raise TypeError("For all_gather_into_tensor, the input tensor must be tensor")
+    if group is None:
+        group = GlobalComm.WORLD_COMM_GROUP
+    if not isinstance(group, str):
+        raise TypeError(
+            "The argument 'group' must be type of string, "
+            "but got 'group' type : {}.".format(type(group))
+        )
+    if not isinstance(async_op, bool):
+        raise TypeError(
+            f"The argument 'async_op' must be a bool, but got {type(async_op)}."
+        )
+    group_size = get_cache_group_size(group)
+    _check_group_tensor_list(tensor_list, group_size)
+    rank_id = get_group_rank_from_world_rank(get_rank(), group)
+    _check_output_shape(tensor, tensor_list[rank_id].shape, "all_gather")
+    _check_output_dtype(tensor, tensor_list[0].dtype, "all_gather")
+    result = dist_comm_all_gather_op(tensor_list, tensor, group_size, group)
+    _, handle = _deal_comm_outputs(result, async_op)
+    return handle
+def reduce_scatter(output, input_list, op=ReduceOp.SUM, group=None, async_op=False):
+    r"""
+    Reduces and scatters tensors from the specified communication group and
+    returns the tensor which is reduced and scattered.
+    Args:
+        output (Tensor): the output tensor.
+        input_list (list[Tensor]): List of tensors to reduce and scatter.
+        op (str, optional): Specifies an operation used for element-wise reductions,
+            like SUM and MAX. Default: ``ReduceOp.SUM`` .
+        group (str, optional): The communication group to work on. If ``None``, which means ``"hccl_world_group"`` in
+            Ascend. Default: ``None``.
+        async_op (bool, optional): Whether this operator should be an async operator. Default: ``False`` .
+    Returns:
+        CommHandle, CommHandle is an async work handle, if `async_op` is set to True.
+        CommHandle will be None, when `async_op` is False.
+    Raises:
+        TypeError: If the type of `output` parameter is not Tensor, `input_list` is not Tensor List.
+        TypeError: If any of `op` and `group` is not a str. async_op is not bool or 'op' is invalid.
+        TypeError: If size of `input_list` is not equal to group size.
+        TypeError: If the type or shape of `output` not equal to the member of `input_list`.
+        RuntimeError: If device target is invalid, or backend is invalid, or distributed initialization fails.
+    Supported Platforms:
+        ``Ascend``
+    Examples:
+        .. note::
+            Before running the following examples, you need to configure the communication environment variables.
+            For Ascend devices, it is recommended to use the msrun startup method
+            without any third-party or configuration file dependencies.
+            Please see the `msrun start up
+            <https://www.mindspore.cn/tutorials/en/master/parallel/msrun_launcher.html>`_
+            for more details.
+            This example should be run with 2 devices.
+        >>> from mindspore import Tensor
+        >>> from mindspore.ops.communication import init_process_group
+        >>> from mindspore.ops.communication import reduce_scatter
+        >>> import numpy as np
+        >>>
+        >>> init_process_group()
+        >>> input_tensors = [Tensor(np.ones([4, 8]).astype(np.float32)), Tensor(np.ones([4, 8]).astype(np.float32))]
+        >>> output_tensor = Tensor(np.zeros([4, 8]).astype(np.float32))
+        >>> output = reduce_scatter(output_tensor ,input_tensors)
+        >>> print(output_tensor)
+        [[2. 2. 2. 2. 2. 2. 2. 2.]
+         [2. 2. 2. 2. 2. 2. 2. 2.]
+         [2. 2. 2. 2. 2. 2. 2. 2.]
+         [2. 2. 2. 2. 2. 2. 2. 2.]]
+    """
+    if is_inplace_func() is False:
+        raise ValueError("Non-inplace mode is currently not supported.")
+    _check_all_tensors(input_list)
+    _check_all_tensor_same_dtype(input_list)
+    if not isinstance(output, (Tensor, Tensor_)):
+        raise TypeError("For reduce_scatter, the output tensor must be tensor")
+    if group is None:
+        group = GlobalComm.WORLD_COMM_GROUP
+    if not isinstance(group, str):
+        raise TypeError(
+            "The argument 'group' must be type of string, "
+            "but got 'group' type : {}.".format(type(group))
+        )
+    if not isinstance(async_op, bool):
+        raise TypeError(
+            f"The argument 'async_op' must be a bool, but got {type(async_op)}."
+        )
+    if not isinstance(op, str):
+        raise TypeError("For reduce_scatter, the input op type must be str")
+    if op not in ("sum", "prod", "min", "max"):
+        raise TypeError(
+            "For reduce_scatter, the input op value must be one of sum, prod, min, max"
+        )
+    rank_size = get_cache_group_size(group)
+    _check_group_tensor_list(input_list, rank_size)
+    rank_id = get_group_rank_from_world_rank(get_rank(), group)
+    _check_output_shape(output, input_list[rank_id].shape, "reduce_scatter")
+    _check_output_dtype(output, input_list[0].dtype, "reduce_scatter")
+    result = dist_comm_reduce_scatter_op(output, input_list, rank_size, op, group)
+    _, handle = _deal_comm_outputs(result, async_op)
+    return handle
+def scatter(tensor, scatter_list, src=0, group=None, async_op=False):
+    r"""
+    Scatter tensor evently across the processes in the specified communication group.
+    Note:
+        - The interface behavior only support Tensor List input and scatter evenly.
+        - Only the tensor in process `src` (global rank) will do scatter.
+        - Only support PyNative mode, Graph mode is not currently supported.
+    Args:
+        tensor (Tensor): the output tensor.
+        scatter_list (list[Tensor]): List of same-sized tensors to scatter.
+            default is None, must be specified on the source rank.
+        src (int, optional): Specifies the rank(global rank) of the process that send the tensor.
+            And only process `src` will send the tensor.
+        group (str, optional): The communication group to work on. If ``None``, which means ``"hccl_world_group"`` in
+            Ascend. Default: ``None``.
+        async_op (bool, optional): Whether this operator should be an async operator. Default: ``False`` .
+    Returns:
+        CommHandle, CommHandle is an async work handle, if `async_op` is set to True.
+        CommHandle will be None, when `async_op` is False.
+    Raises:
+        TypeError: If the type of `tensor` parameter is not Tensor, `scatter_list` is not Tensor List.
+        TypeError: If any of `op` and `group` is not a str. async_op is not bool or 'op' is invalid.
+        TypeError: If size of `scatter_list` is not equal to group size.
+        TypeError: If the type or shape of `tensor` not equal to the member of `scatter_list`.
+        RuntimeError: If device target is invalid, or backend is invalid, or distributed initialization fails.
+    Supported Platforms:
+        ``Ascend`` ``CPU``
+    Examples:
+        .. note::
+            Before running the following examples, you need to configure the communication environment variables.
+            For Ascend devices, it is recommended to use the msrun startup method
+            without any third-party or configuration file dependencies.
+            Please see the `msrun start up
+            <https://www.mindspore.cn/tutorials/en/master/parallel/msrun_launcher.html>`_
+            for more details.
+            This example should be run with 2 devices.
+        >>> from mindspore import Tensor
+        >>> from mindspore.ops.communication import init_process_group, scatter
+        >>> import numpy as np
+        >>> # Launch 2 processes.
+        >>>
+        >>> init_process_group()
+        >>> inputs = [Tensor(np.ones([2, 2]).astype(np.float32)), Tensor(np.ones([2, 2]).astype(np.float32))]
+        >>> output = Tensor(np.zeros([2, 2]).astype(np.float32))
+        >>> scatter(output, inputs, src=0)
+        >>> print(output)
+        # rank_0
+        [[1. 1.]
+         [1. 1.]]
+        # rank_1
+        [[1. 1.]
+         [1. 1.]]
+    """
+    if is_inplace_func() is False:
+        raise ValueError("Non-inplace mode is currently not supported.")
+    _check_all_tensors(scatter_list)
+    _check_all_tensor_same_dtype_and_shape(scatter_list)
+    if not isinstance(tensor, (Tensor, Tensor_)):
+        raise TypeError("For scatter_tensor, the output tensor must be tensor")
+    if not isinstance(src, int):
+        raise TypeError("For scatter_tensor, the src must be int")
+    if group is None:
+        group = GlobalComm.WORLD_COMM_GROUP
+    if not isinstance(group, str):
+        raise TypeError(
+            "The argument 'group' must be type of string, "
+            "but got 'group' type : {}.".format(type(group))
+        )
+    if not isinstance(async_op, bool):
+        raise TypeError(
+            f"The argument 'async_op' must be a bool, but got {type(async_op)}."
+        )
+    src = get_group_rank_from_world_rank(src, group)
+    rank_size = get_cache_group_size(group)
+    rank_id = get_cache_group_rank(group)
+    if src == rank_id:
+        _check_tensor_list(scatter_list, tensor, rank_size)
+    output = dist_comm_scatter_op(tensor, scatter_list, rank_size, src, rank_id, group)
+    _, handle = _deal_comm_outputs(output, async_op)
+    return handle
+def gather(tensor, gather_list, dst=0, group=None, async_op=False):
+    r"""
+    Gathers tensors from the specified communication group. The operation will gather the tensor
+    from processes according to dimension 0.
+    Note:
+        - Only the tensor in process `dst` (global rank) will keep the gathered tensor. The other process
+          will keep a tensor list which has no mathematical meaning.
+        - The tensors must have the same shape and format in all processes of the collection.
+        - Only support PyNative mode, Graph mode is not currently supported.
+    Args:
+        tensor (Tensor): The tensor to be gathered.
+        gather_list (list[Tensor]): List of same-sized tensors to use for gathered data.
+        dst (int, optional): Specifies the rank(global rank) of the process that receive the tensor.
+            And only process `dst` will receive the gathered tensor. Default: ``0`` .
+        group (str, optional): The communication group to work on. If ``None``, which means ``"hccl_world_group"`` in
+            Ascend. Default: ``None``.
+        async_op (bool, optional): Whether this operator should be an async operator. Default: ``False`` .
+    Returns:
+        CommHandle, CommHandle is an async work handle, if `async_op` is set to True.
+        CommHandle will be None, when `async_op` is False.
+    Raises:
+        TypeError: If the type of input tensor is not Tensor, or gather_list is not Tensor list.
+        TypeError: If dst is not an integer, group is not a string or async_op is not bool.
+        TypeError: If size of `gather_list` is not equal to group size.
+        TypeError: If the type or shape of `tensor` not equal to the member of `gather_list`.
+        RuntimeError: If device target is invalid, or backend is invalid, or distributed initialization fails.
+    Supported Platforms:
+        ``Ascend`` ``CPU``
+    Examples:
+        .. note::
+            Before running the following examples, you need to configure the communication environment variables.
+            For Ascend devices, it is recommended to use the msrun startup method
+            without any third-party or configuration file dependencies.
+            Please see the `msrun start up
+            <https://www.mindspore.cn/tutorials/en/master/parallel/msrun_launcher.html>`_
+            for more details.
+            This example should be run with 2 devices.
+        >>> import numpy as np
+        >>> import mindspore as ms
+        >>> import mindspore.nn as nn
+        >>> from mindspore.ops.communication import init_process_group, gather
+        >>> from mindspore import Tensor
+        >>> # Launch 2 processes.
+        >>> init_process_group()
+        >>> input = Tensor(np.arange(4).reshape([2, 2]).astype(np.float32))
+        >>> outputs = [Tensor(np.zeros([2, 2]).astype(np.float32)),Tensor(np.zeros([2, 2]).astype(np.float32))]
+        >>> gather(input, outputs, dst=0)
+        >>> print(outputs)
+        # rank_0
+        [Tensor(shape=[2, 2], dtype=Float32, value=
+        [[ 0.00000000e+00,  1.00000000e+00],
+         [ 2.00000000e+00,  3.00000000e+00]]), Tensor(shape=[2, 2], dtype=Float32, value=
+        [[ 0.00000000e+00,  1.00000000e+00], [ 2.00000000e+00,  3.00000000e+00]])]
+        [Tensor(shape=[2, 2], dtype=Float32, value=[[ 0.00000000e+00,  1.00000000e+00],
+         [ 2.00000000e+00,  3.00000000e+00]]), Tensor(shape=[2, 2], dtype=Float32, value=
+        [[ 0.00000000e+00,  1.00000000e+00], [ 2.00000000e+00,  3.00000000e+00]])]
+        # rank_1
+        [Tensor(shape=[2, 2], dtype=Float32, value=[[ 0.00000000e+00,  0.00000000e+00],
+         [ 0.00000000e+00,  0.00000000e+00]]), Tensor(shape=[2, 2], dtype=Float32, value=
+        [[ 0.00000000e+00,  0.00000000e+00], [ 0.00000000e+00,  0.00000000e+00]])]
+        [Tensor(shape=[2, 2], dtype=Float32, value=
+        [[ 0.00000000e+00,  0.00000000e+00],
+         [ 0.00000000e+00,  0.00000000e+00]]), Tensor(shape=[2, 2], dtype=Float32, value=
+        [[ 0.00000000e+00,  0.00000000e+00], [ 0.00000000e+00,  0.00000000e+00]])]
+    """
+    if is_inplace_func() is False:
+        raise ValueError("Non-inplace mode is currently not supported.")
+    if not isinstance(tensor, (Tensor, Tensor_)):
+        raise TypeError("For gather, the input tensor must be tensor")
+    _check_all_tensors(gather_list)
+    _check_all_tensor_same_dtype_and_shape(gather_list)
+    if not isinstance(dst, int):
+        raise TypeError("For gather, the dst must be int")
+    if group is None:
+        group = GlobalComm.WORLD_COMM_GROUP
+    if not isinstance(group, str):
+        raise TypeError(
+            "The argument 'group' must be type of string, "
+            "but got 'group' type : {}.".format(type(group))
+        )
+    if not isinstance(async_op, bool):
+        raise TypeError(f"The argument 'async_op' must be a bool, but got {type(async_op)}.")
+    group_size = get_cache_group_size(group)
+    dst = get_group_rank_from_world_rank(dst, group)
+    rank_id = get_cache_group_rank(group)
+    if dst == rank_id:
+        _check_tensor_list(gather_list, tensor, group_size)
+    output = dist_comm_gather_op(tensor, gather_list, group_size, dst, rank_id, group)
+    _, handle = _deal_comm_outputs(output, async_op)
+    return handle
+def scatter_object_list(scatter_object_output_list, scatter_object_input_list, src=0, group=None):
+    r"""
+    Scatters picklable objects in scatter_object_input_list to the whole group.
+    Note:
+        - Similar to :func:`mindspore.ops.communication.scatter`, but Python objects can be passed in.
+        - Only the objects in process `src` (global rank) will do scatter.
+        - Only support PyNative mode, Graph mode is not currently supported.
+    Args:
+        scatter_object_output_list (list[Any]): Non-empty list whose first element
+            will store the object scattered to this rank.
+        scatter_object_input_list (list[Any]): List of python objects to scatter.
+            it must be specified on the source rank.
+        src (int, optional): Specifies the rank(global rank) of the process that send the tensor.
+            And only process `src` will send the tensor. Default: ``0`` .
+        group (str, optional): The communication group to work on. If ``None``, which means ``"hccl_world_group"`` in
+            Ascend. Default: ``None``.
+    Raises:
+        TypeError: If `group` is not a str or `src` is not an integer.
+        TypeError: If size of `scatter_object_input_list` is not equal to group size.
+        RuntimeError: If device target is invalid, or backend is invalid, or distributed initialization fails.
+    Supported Platforms:
+        ``Ascend``
+    Examples:
+        .. note::
+            Before running the following examples, you need to configure the communication environment variables.
+            For Ascend devices, it is recommended to use the msrun startup method
+            without any third-party or configuration file dependencies.
+            Please see the `msrun start up
+            <https://www.mindspore.cn/tutorials/en/master/parallel/msrun_launcher.html>`_
+            for more details.
+            This example should be run with 2 devices.
+        >>> from mindspore.ops.communication import init_process_group, scatter_object_list
+        >>> init_process_group()
+        >>> obj = ["test",  {1: 2}]
+        >>> scatter_object_output_list=[None]
+        >>> scatter_object_list(scatter_object_output_list, obj)
+        >>> print(scatter_object_output_list)
+        # rank_0
+        ['test']
+        # rank_1
+        [{1: 2}]
+    """
+    if is_inplace_func() is False:
+        raise ValueError("Non-inplace mode is currently not supported.")
+    if group is None:
+        group = GlobalComm.WORLD_COMM_GROUP
+    if not isinstance(group, str):
+        raise TypeError(
+            "For 'scatter_object_list', the argument 'group' must be type of string, "
+            "but got 'group' type : {}.".format(type(group))
+        )
+    if not isinstance(scatter_object_output_list, list) or not scatter_object_output_list:
+        raise TypeError(f"The scatter_object_output_list can not be empty.")
+    if not isinstance(src, int):
+        raise TypeError("For scatter_object_list, the src must be int")
+    group_size = get_cache_group_size(group)
+    rank_id = get_cache_group_rank()
+    tensor_sizes = []
+    tensor_list = []
+    if rank_id == src:
+        if not isinstance(scatter_object_input_list, list) or len(scatter_object_input_list) != group_size:
+            raise TypeError(
+                "The len of scatter_object_input_list must be equal to group rank size, "
+                "but got {len(scatter_object_input_list)}."
+            )
+        for obj in scatter_object_input_list:
+            _, size = _object_to_tensor(obj)
+            tensor_sizes.append(Tensor([size], dtype=mstype.int32))
+        max_size = int(max(tensor_sizes).item())
+        for obj in scatter_object_input_list:
+            tensor, _ = _object_to_tensor(obj, max_size)
+            tensor_list.append(tensor)
+    else:
+        tensor_sizes = [Tensor([0], dtype=mstype.int32) for i in range(group_size)]
+    object_size = cat(tensor_sizes)
+    broadcast(object_size, src, group)
+    max_object_size = int(max(object_size).item())
+    data = np.zeros((max_object_size)).astype(np.int8)
+    if rank_id != src:
+        tensor_list = [Tensor(data) for i in range(group_size)]
+    out_tensor = Tensor(data)
+    scatter(out_tensor, tensor_list, src, group)
+    group_id = get_group_rank_from_world_rank(rank_id, group)
+    scatter_object_output_list[0] = _tensor_to_object(out_tensor, object_size[group_id])
+def gather_object(obj, object_gather_list=None, dst=0, group=None):
+    r"""
+    Gathers python objects from the whole group in a single process.
+    Note:
+        - Similar to :func:`mindspore.ops.communication.gather`, but Python objects can be passed in.
+        - Only support PyNative mode, Graph mode is not currently supported.
+    Args:
+        obj (Any): The python objects to be gathered.
+        object_gather_list (list[Any], optional): List of same-sized tensors to use for gathered data.
+            On the ``dst`` rank, it should be correctly sized as the size of the group for this
+            collective and will contain the output. Default: ``None``.
+        dst (int, optional): Specifies the rank(global rank) of the process that receive the tensor.
+            And only process `dst` will receive the gathered tensor. Default: ``0`` .
+        group (str, optional): The communication group to work on. If ``None``, which means ``"hccl_world_group"`` in
+            Ascend. Default: ``None``.
+    Raises:
+        TypeError: If dst is not an integer, or group is not a string.
+        TypeError: If size of `object_gather_list` is not equal to group size.
+        RuntimeError: If device target is invalid, or backend is invalid, or distributed initialization fails.
+    Supported Platforms:
+        ``Ascend``
+    Examples:
+        .. note::
+            Before running the following examples, you need to configure the communication environment variables.
+            For Ascend devices, it is recommended to use the msrun startup method
+            without any third-party or configuration file dependencies.
+            Please see the `msrun start up
+            <https://www.mindspore.cn/tutorials/en/master/parallel/msrun_launcher.html>`_
+            for more details.
+            This example should be run with 2 devices.
+        >>> from mindspore.ops.communication import init_process_group, gather_object, get_rank
+        >>> init_process_group()
+        >>> rank = get_rank()
+        >>> obj = ["test", {1: 2}]
+        >>> object_gather_list=[None, None]
+        >>> gather_object(obj[rank], object_gather_list)
+        >>> print(object_gather_list)
+        # rank_0
+        ['test', {1: 2}]
+    """
+    if is_inplace_func() is False:
+        raise ValueError("Non-inplace mode is currently not supported.")
+    if group is None:
+        group = GlobalComm.WORLD_COMM_GROUP
+    if not isinstance(group, str):
+        raise TypeError(
+            "For 'gather_object', the argument 'group' must be type of string, "
+            "but got 'group' type : {}.".format(type(group))
+        )
+    if not isinstance(dst, int):
+        raise TypeError("For gather_object, the dst must be int")
+    group_size = get_cache_group_size(group)
+    rank_id = get_cache_group_rank()
+    if rank_id == dst:
+        if not isinstance(object_gather_list, list) or len(object_gather_list) != group_size:
+            raise TypeError(
+                f"The len of object_gather_list must be equal to group rank size, but got {len(object_gather_list)}."
+            )
+    _, size = _object_to_tensor(obj)
+    tensor = Tensor([size], dtype=mstype.int32)
+    object_size_list = [Tensor([0], dtype=mstype.int32) for i in range(group_size)]
+    all_gather(object_size_list, tensor, group=group)
+    max_object_size = int(max(object_size_list).item())
+    in_tensor, size = _object_to_tensor(obj, max_object_size)
+    data = np.zeros((size)).astype(np.int8)
+    object_tensor_list = [Tensor(data) for i in range(group_size)]
+    gather(in_tensor, object_tensor_list, dst, group)
+    if rank_id != dst:
+        return
+    for i, item in enumerate(object_size_list):
+        tensor_size = int(item.item())
+        tensor = object_tensor_list[i]
+        object_gather_list[i] = _tensor_to_object(tensor, tensor_size)
+def broadcast_object_list(object_list, src=0, group=None, device=None):
+    """
+    Broadcasts the entire group of input Python objects.
+    Note:
+        - Similar to :func:`mindspore.ops.communication.broadcast`, but Python objects can be passed in.
+        - Only support PyNative mode, Graph mode is not currently supported.
+    Args:
+        object_list (list[Any]): list of input to be sent if src is the rank of current process,
+            and list to be used to save received data otherwise.
+        src (int, optional): Specifies the rank(global rank) of the process that broadcast the Python objects.
+            And only process `src` will broadcast the Python objects. Default: ``0`` .
+        group (str, optional): The communication group to work on. If ``None``, which means ``"hccl_world_group"`` in
+            Ascend. Default: ``None``.
+        device (str, optional): Currently it is a reserved parameter. Default: ``None``.
+    Raises:
+        TypeError: If `src` is not an integer or `group` is not a string.
+        RuntimeError: If device target is invalid, or backend is invalid, or distributed initialization fails.
+    Supported Platforms:
+        ``Ascend``
+    Examples:
+        .. note::
+            Before running the following examples, you need to configure the communication environment variables.
+            For Ascend devices, it is recommended to use the msrun startup method
+            without any third-party or configuration file dependencies.
+            Please see the `msrun start up
+            <https://www.mindspore.cn/tutorials/en/master/parallel/msrun_launcher.html>`_
+            for more details.
+            This example should be run with 2 devices.
+        >>> from mindspore.ops.communication import init_process_group, broadcast_object_list, get_rank
+        >>> init_process_group()
+        >>> rank = get_rank()
+        >>> obj = ["test", 12, {1: 2}]
+        >>> if rank == 1:
+        ...     obj = [None, None, None]
+        >>> broadcast_object_list(obj)
+        >>> print(obj)
+        ['test', 12, {1: 2}]
+    """
+    if is_inplace_func() is False:
+        raise ValueError("Non-inplace mode is currently not supported.")
+    if group is None:
+        group = GlobalComm.WORLD_COMM_GROUP
+    if not isinstance(group, str):
+        raise TypeError(
+            "For 'broadcast_object_list', the argument 'group' must be type of string, "
+            "but got 'group' type : {}.".format(type(group))
+        )
+    if not isinstance(src, int):
+        raise TypeError("For broadcast_object_list, the src must be int")
+    if not isinstance(object_list, list) or not object_list:
+        raise TypeError(f"The object_list can not be empty.")
+    rank_id = get_cache_group_rank()
+    tensor_sizes = []
+    tensor_list = []
+    size = 0
+    object_size_list = [Tensor([0], dtype=mstype.int32) for i in range(len(object_list))]
+    if rank_id == src:
+        tensor_list, tensor_sizes = zip(
+            *[_object_to_tensor(obj) for obj in object_list]
+        )
+        object_size_list = [Tensor([tensor_sizes[i]], dtype=mstype.int32) for i in range(len(tensor_sizes))]
+        object_tensor = cat(tensor_list)
+    object_size = cat(object_size_list)
+    broadcast(object_size, src, group)
+    size = int(sum(object_size).item())
+    if rank_id != src:
+        data = np.zeros((size)).astype(np.int8)
+        object_tensor = Tensor(data)
+    broadcast(object_tensor, src, group)
+    if rank_id != src:
+        offset = 0
+        for i, item in enumerate(object_size):
+            obj_size = item
+            obj_view = object_tensor[offset: offset + obj_size]
+            offset += obj_size
+            object_list[i] = _tensor_to_object(obj_view, obj_size)
+def all_gather_object(object_list, obj, group=None):
+    """
+    Aggregates Python objects in a specified communication group.
+    Note:
+        Similar to :func:`mindspore.ops.communication.all_gather`, but Python objects can be passed in.
+    Args:
+        object_list (list[Any]): Output Python object list.
+        obj (Any): Python object to be broadcast from current process.
+        group (str, optional): The communication group to work on. If ``None``, which means ``"hccl_world_group"`` in
+            Ascend. Default: ``None``.
+    Raises:
+        TypeError: `group` is not a str.
+        TypeError: If size of `object_list` is not equal to group size.
+        RuntimeError: If device target is invalid, or backend is invalid, or distributed initialization fails.
+    Supported Platforms:
+        ``Ascend``
+    Examples:
+        .. note::
+            Before running the following examples, you need to configure the communication environment variables.
+            For Ascend devices, it is recommended to use the msrun startup method
+            without any third-party or configuration file dependencies.
+            Please see the `msrun start up
+            <https://www.mindspore.cn/tutorials/en/master/parallel/msrun_launcher.html>`_
+            for more details.
+            This example should be run with 2 devices.
+        >>> from mindspore.ops.communication import init_process_group, get_rank
+        >>> from mindspore.ops.communication import all_gather_object
+        >>> init_process_group()
+        >>> rank = get_rank()
+        >>> obj = ["test", {1: 2}]
+        >>> object_gather_list=[None, None]
+        >>> all_gather_object(object_gather_list, obj[rank])
+        >>> print(object_gather_list)
+        # rank_0
+        ['test', {1: 2}]
+        # rank_1
+        ['test', {1: 2}]
+    """
+    if is_inplace_func() is False:
+        raise ValueError("Non-inplace mode is currently not supported.")
+    if group is None:
+        group = GlobalComm.WORLD_COMM_GROUP
+    if not isinstance(group, str):
+        raise TypeError(
+            "For 'all_gather_object', the argument 'group' must be type of string, "
+            "but got 'group' type : {}.".format(type(group))
+        )
+    group_size = get_cache_group_size(group)
+    if not isinstance(object_list, list) or len(object_list) != group_size:
+        raise TypeError(
+            f"The len of argument object_list must be equal to group rank size, but got {len(object_list)}."
+        )
+    _, size = _object_to_tensor(obj)
+    tensor = Tensor([size], dtype=mstype.int32)
+    object_size_list = [Tensor([0], dtype=mstype.int32) for i in range(group_size)]
+    all_gather(object_size_list, tensor, group=group)
+    max_object_size = int(max(object_size_list).item())
+    in_tensor, size = _object_to_tensor(obj, max_object_size)
+    data = np.zeros((size)).astype(np.int8)
+    object_tensor_list = [Tensor(data) for i in range(group_size)]
+    all_gather(object_tensor_list, in_tensor, group=group)
+    for i, item in enumerate(object_size_list):
+        tensor_size = int(item.item())
+        tensor = object_tensor_list[i]
+        object_list[i] = _tensor_to_object(tensor, tensor_size)
+def all_to_all_v_c(output, input, send_count_matrix, group=None, async_op=False):
+    r"""
+    Based on the user-specified split size, the input tensor is divided and sent to other devices, where split chunks
+    are received and then merged into a single output tensor.
+    Note:
+        Only support PyNative mode, Graph mode is not currently supported.
+    Args:
+        output (Tensor): the output tensor is gathered concatenated from remote ranks.
+        input (Tensor): tensor to be scattered to remote rank.
+        send_count_matrix (list[int]) - The sending and receiving parameters of all ranks,
+            :math:`\text{send_count_matrix}[i*\text{rank_size}+j]` represents the amount of data sent by
+            rank i to rank j, and the basic unit is first dimension sizes. Among them, `rank_size`
+            indicates the size of the communication group.
+        group (str, optional): The communication group to work on. If ``None``, which means ``"hccl_world_group"`` in
+            Ascend. Default: ``None``.
+        async_op (bool, optional): Whether this operator should be an async operator. Default: ``False`` .
+    Returns:
+        CommHandle. CommHandle is an async work handle, if `async_op` is set to True.
+        CommHandle will be None, when `async_op` is False.
+    Raises:
+        TypeError: If `input` or `output` is not tensor. `group` is not a str, or async_op is not bool.
+    Supported Platforms:
+        ``Ascend``
+    Examples:
+        .. note::
+            Before running the following examples, you need to configure the communication environment variables.
+            For Ascend devices, it is recommended to use the msrun startup method
+            without any third-party or configuration file dependencies.
+            Please see the `msrun start up
+            <https://www.mindspore.cn/tutorials/en/master/parallel/msrun_launcher.html>`_
+            for more details.
+            This example should be run with 2 devices.
+        >>> import numpy as np
+        >>> import mindspore
+        >>> from mindspore.ops.communication import init_process_group, get_rank
+        >>> from mindspore.ops.communication import all_to_all_v_c
+        >>> from mindspore import Tensor
+        >>> from mindspore.ops import zeros
+        >>>
+        >>> init_process_group()
+        >>> this_rank = get_rank()
+        >>> if this_rank == 0:
+        ...     output = Tensor(np.zeros([3]).astype(np.float32))
+        ...     tensor = Tensor([0, 1, 2.]) * this_rank
+        ...     result = all_to_all_v_c(output, tensor, [0, 3, 3, 0])
+        ...     print(output)
+        >>> if this_rank == 1:
+        ...     output = Tensor(np.zeros([3]).astype(np.float32))
+        ...     tensor = Tensor([0, 1, 2.]) * this_rank
+        ...     result = all_to_all_v_c(output, tensor, [0, 3, 3, 0])
+        ...     print(output)
+        rank 0:
+        [0. 1. 2]
+        rank 1:
+        [0. 0. 0]
+    """
+    _check_all_tensors([input])
+    _check_all_tensors([output])
+    if group is None:
+        group = GlobalComm.WORLD_COMM_GROUP
+    if not isinstance(group, str):
+        raise TypeError(
+            "The argument 'group' must be type of string, "
+            "but got 'group' type : {}.".format(type(group))
+        )
+    if not isinstance(async_op, bool):
+        raise TypeError(
+            f"The argument 'async_op' must be a bool, but got {type(async_op)}."
+        )
+    if not isinstance(send_count_matrix, list):
+        raise TypeError("send_count_matrix must be list, but got {}".format(type(send_count_matrix)))
+    if not all(isinstance(x, int) for x in send_count_matrix):
+        raise TypeError("send_count_matrix elements must be of type int")
+    rank_size = get_cache_group_size(group)
+    if rank_size * rank_size != len(send_count_matrix):
+        raise TypeError(f"send_count_matrix must be square matrix, but got {len(send_count_matrix)}.")
+    _send_count_matrix = _get_all_to_all_v_c_numel_list(output, input, send_count_matrix)
+    _input = input.reshape(-1)
+    rank_id = get_cache_group_rank(group)
+    result = dist_comm_all_to_all_v_c_op(
+        output,
+        _input,
+        group,
+        _send_count_matrix,
+        rank_size,
+        rank_id,
+    )
+    _, handle = _deal_comm_outputs(result, async_op)
+    return handle