mindspore-2.4.1-cp39-cp39-win_amd64.whl → mindspore-2.5.0-cp39-cp39-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mindspore/.commit_id +1 -1
- mindspore/Microsoft.VisualStudio.Telemetry.dll +0 -0
- mindspore/Newtonsoft.Json.dll +0 -0
- mindspore/__init__.py +8 -3
- mindspore/_c_dataengine.cp39-win_amd64.pyd +0 -0
- mindspore/_c_expression.cp39-win_amd64.pyd +0 -0
- mindspore/_c_mindrecord.cp39-win_amd64.pyd +0 -0
- mindspore/_checkparam.py +0 -5
- mindspore/_extends/parallel_compile/akg_compiler/gen_custom_op_files.py +1 -1
- mindspore/_extends/parse/compile_config.py +64 -0
- mindspore/_extends/parse/deprecated/__init__.py +0 -0
- mindspore/_extends/parse/deprecated/deprecated_tensor_method.py +375 -0
- mindspore/_extends/parse/parser.py +23 -5
- mindspore/_extends/parse/standard_method.py +123 -27
- mindspore/_extends/pijit/pijit_func_white_list.py +1 -1
- mindspore/amp.py +7 -1
- mindspore/atlprov.dll +0 -0
- mindspore/avcodec-59.dll +0 -0
- mindspore/avdevice-59.dll +0 -0
- mindspore/avfilter-8.dll +0 -0
- mindspore/avformat-59.dll +0 -0
- mindspore/avutil-57.dll +0 -0
- mindspore/boost/boost_cell_wrapper.py +136 -41
- mindspore/c1.dll +0 -0
- mindspore/c1xx.dll +0 -0
- mindspore/c2.dll +0 -0
- mindspore/common/__init__.py +3 -1
- mindspore/common/_register_for_tensor.py +0 -1
- mindspore/common/_stub_tensor.py +25 -4
- mindspore/common/_tensor_cpp_method.py +17 -0
- mindspore/common/_tensor_docs.py +6132 -0
- mindspore/common/api.py +99 -25
- mindspore/common/dtype.py +34 -34
- mindspore/common/dump.py +2 -1
- mindspore/common/file_system.py +8 -1
- mindspore/common/generator.py +2 -0
- mindspore/common/hook_handle.py +3 -1
- mindspore/common/initializer.py +3 -4
- mindspore/common/lazy_inline.py +8 -2
- mindspore/common/mindir_util.py +10 -2
- mindspore/common/parameter.py +30 -27
- mindspore/common/tensor.py +713 -1337
- mindspore/communication/__init__.py +1 -1
- mindspore/communication/_comm_helper.py +10 -0
- mindspore/communication/comm_func.py +215 -173
- mindspore/communication/management.py +23 -20
- mindspore/context.py +292 -193
- mindspore/dataset/__init__.py +23 -19
- mindspore/dataset/callback/ds_callback.py +2 -1
- mindspore/dataset/core/config.py +84 -3
- mindspore/dataset/engine/cache_admin.py +3 -3
- mindspore/dataset/engine/cache_client.py +5 -4
- mindspore/dataset/engine/datasets.py +192 -149
- mindspore/dataset/engine/datasets_audio.py +14 -0
- mindspore/dataset/engine/datasets_standard_format.py +28 -11
- mindspore/dataset/engine/datasets_text.py +38 -1
- mindspore/dataset/engine/datasets_user_defined.py +125 -65
- mindspore/dataset/engine/datasets_vision.py +81 -8
- mindspore/dataset/engine/iterators.py +281 -63
- mindspore/dataset/engine/obs/util.py +8 -0
- mindspore/dataset/engine/queue.py +40 -0
- mindspore/dataset/engine/samplers.py +26 -2
- mindspore/dataset/engine/serializer_deserializer.py +1 -1
- mindspore/dataset/engine/validators.py +43 -11
- mindspore/dataset/transforms/py_transforms_util.py +17 -0
- mindspore/dataset/transforms/transforms.py +29 -12
- mindspore/dataset/vision/validators.py +1 -2
- mindspore/device_context/__init__.py +21 -0
- mindspore/device_context/ascend/__init__.py +25 -0
- mindspore/device_context/ascend/device.py +72 -0
- mindspore/device_context/ascend/op_debug.py +94 -0
- mindspore/device_context/ascend/op_precision.py +193 -0
- mindspore/device_context/ascend/op_tuning.py +127 -0
- mindspore/device_context/cpu/__init__.py +25 -0
- mindspore/device_context/cpu/device.py +62 -0
- mindspore/device_context/cpu/op_tuning.py +43 -0
- mindspore/device_context/gpu/__init__.py +21 -0
- mindspore/device_context/gpu/device.py +70 -0
- mindspore/device_context/gpu/op_precision.py +67 -0
- mindspore/device_context/gpu/op_tuning.py +175 -0
- mindspore/device_manager.py +134 -0
- mindspore/dnnl.dll +0 -0
- mindspore/dpcmi.dll +0 -0
- mindspore/experimental/llm_boost/__init__.py +3 -2
- mindspore/experimental/llm_boost/ascend_native/__init__.py +22 -0
- mindspore/experimental/llm_boost/ascend_native/llama_boost_ascend_native.py +211 -0
- mindspore/experimental/llm_boost/ascend_native/llm_boost.py +52 -0
- mindspore/experimental/llm_boost/atb/boost_base.py +239 -64
- mindspore/experimental/llm_boost/atb/llama_boost.py +52 -30
- mindspore/experimental/llm_boost/atb/qwen_boost.py +47 -24
- mindspore/experimental/llm_boost/register.py +1 -0
- mindspore/experimental/optim/adadelta.py +26 -22
- mindspore/experimental/optim/adam.py +3 -0
- mindspore/experimental/optim/lr_scheduler.py +33 -24
- mindspore/experimental/optim/radam.py +33 -30
- mindspore/hal/device.py +28 -0
- mindspore/hal/event.py +17 -0
- mindspore/hal/memory.py +94 -3
- mindspore/hal/stream.py +91 -6
- mindspore/include/api/context.h +1 -2
- mindspore/include/dataset/constants.h +2 -2
- mindspore/jpeg62.dll +0 -0
- mindspore/log.py +12 -0
- mindspore/mindrecord/__init__.py +1 -1
- mindspore/mindrecord/config.py +17 -316
- mindspore/mindrecord/filereader.py +1 -9
- mindspore/mindrecord/filewriter.py +5 -15
- mindspore/mindrecord/mindpage.py +1 -9
- mindspore/mindspore_backend.dll +0 -0
- mindspore/mindspore_common.dll +0 -0
- mindspore/mindspore_core.dll +0 -0
- mindspore/mindspore_glog.dll +0 -0
- mindspore/mindspore_ops.dll +0 -0
- mindspore/mint/__init__.py +824 -218
- mindspore/mint/distributed/__init__.py +66 -4
- mindspore/mint/distributed/distributed.py +2594 -44
- mindspore/mint/linalg/__init__.py +6 -0
- mindspore/mint/nn/__init__.py +473 -14
- mindspore/mint/nn/functional.py +486 -11
- mindspore/mint/nn/layer/__init__.py +17 -4
- mindspore/mint/nn/layer/_functions.py +330 -0
- mindspore/mint/nn/layer/activation.py +169 -1
- mindspore/mint/nn/layer/basic.py +123 -0
- mindspore/mint/nn/layer/conv.py +727 -0
- mindspore/mint/nn/layer/normalization.py +215 -19
- mindspore/mint/nn/layer/padding.py +797 -0
- mindspore/mint/nn/layer/pooling.py +170 -0
- mindspore/mint/optim/__init__.py +2 -1
- mindspore/mint/optim/adam.py +223 -0
- mindspore/mint/optim/adamw.py +26 -19
- mindspore/mint/special/__init__.py +2 -1
- mindspore/msobj140.dll +0 -0
- mindspore/mspdb140.dll +0 -0
- mindspore/mspdbcore.dll +0 -0
- mindspore/mspdbst.dll +0 -0
- mindspore/mspft140.dll +0 -0
- mindspore/msvcdis140.dll +0 -0
- mindspore/msvcp140_1.dll +0 -0
- mindspore/msvcp140_2.dll +0 -0
- mindspore/msvcp140_atomic_wait.dll +0 -0
- mindspore/msvcp140_codecvt_ids.dll +0 -0
- mindspore/multiprocessing/__init__.py +5 -0
- mindspore/nn/__init__.py +2 -0
- mindspore/nn/cell.py +142 -21
- mindspore/nn/dynamic_lr.py +2 -1
- mindspore/nn/layer/activation.py +6 -6
- mindspore/nn/layer/basic.py +35 -25
- mindspore/nn/layer/channel_shuffle.py +3 -3
- mindspore/nn/layer/conv.py +3 -0
- mindspore/nn/layer/embedding.py +3 -3
- mindspore/nn/layer/normalization.py +8 -7
- mindspore/nn/layer/padding.py +4 -3
- mindspore/nn/layer/pooling.py +55 -23
- mindspore/nn/layer/rnn_cells.py +1 -1
- mindspore/nn/layer/rnns.py +2 -1
- mindspore/nn/layer/timedistributed.py +5 -5
- mindspore/nn/layer/transformer.py +48 -26
- mindspore/nn/learning_rate_schedule.py +5 -3
- mindspore/nn/loss/loss.py +31 -36
- mindspore/nn/optim/ada_grad.py +1 -0
- mindspore/nn/optim/adadelta.py +2 -2
- mindspore/nn/optim/adam.py +1 -1
- mindspore/nn/optim/lars.py +1 -4
- mindspore/nn/optim/optimizer.py +1 -1
- mindspore/nn/optim/rprop.py +2 -2
- mindspore/nn/optim/thor.py +2 -1
- mindspore/nn/utils/__init__.py +22 -0
- mindspore/nn/utils/init.py +73 -0
- mindspore/nn/wrap/cell_wrapper.py +4 -6
- mindspore/nn/wrap/loss_scale.py +3 -4
- mindspore/numpy/array_creations.py +60 -62
- mindspore/numpy/array_ops.py +148 -143
- mindspore/numpy/logic_ops.py +41 -42
- mindspore/numpy/math_ops.py +361 -359
- mindspore/numpy/utils.py +16 -16
- mindspore/numpy/utils_const.py +4 -4
- mindspore/opencv_core452.dll +0 -0
- mindspore/opencv_imgcodecs452.dll +0 -0
- mindspore/opencv_imgproc452.dll +0 -0
- mindspore/ops/__init__.py +2 -1
- mindspore/ops/_grad_experimental/grad_comm_ops.py +107 -8
- mindspore/ops/_grad_experimental/grad_debug_ops.py +6 -1
- mindspore/ops/_grad_experimental/grad_inner_ops.py +9 -0
- mindspore/ops/_grad_experimental/grad_math_ops.py +2 -1
- mindspore/ops/_op_impl/cpu/__init__.py +1 -0
- mindspore/ops/_op_impl/cpu/raise_op.py +28 -0
- mindspore/ops/_vmap/vmap_array_ops.py +20 -19
- mindspore/ops/_vmap/vmap_base.py +0 -2
- mindspore/ops/_vmap/vmap_grad_nn_ops.py +19 -13
- mindspore/ops/_vmap/vmap_math_ops.py +11 -9
- mindspore/ops/_vmap/vmap_nn_ops.py +20 -34
- mindspore/ops/auto_generate/cpp_create_prim_instance_helper.py +149 -12
- mindspore/ops/auto_generate/gen_arg_handler.py +0 -61
- mindspore/ops/auto_generate/gen_extend_func.py +554 -60
- mindspore/ops/auto_generate/gen_ops_def.py +1621 -115
- mindspore/ops/auto_generate/gen_ops_prim.py +8027 -3411
- mindspore/ops/auto_generate/pyboost_inner_prim.py +183 -79
- mindspore/ops/composite/base.py +1 -1
- mindspore/ops/composite/multitype_ops/_compile_utils.py +229 -30
- mindspore/ops/composite/multitype_ops/pow_impl.py +0 -29
- mindspore/ops/function/__init__.py +12 -0
- mindspore/ops/function/array_func.py +561 -159
- mindspore/ops/function/clip_func.py +64 -0
- mindspore/ops/function/debug_func.py +28 -20
- mindspore/ops/function/image_func.py +1 -1
- mindspore/ops/function/linalg_func.py +5 -4
- mindspore/ops/function/math_func.py +1664 -294
- mindspore/ops/function/nn_func.py +988 -317
- mindspore/ops/function/parameter_func.py +3 -56
- mindspore/ops/function/random_func.py +243 -33
- mindspore/ops/function/sparse_unary_func.py +1 -1
- mindspore/ops/functional.py +18 -5
- mindspore/ops/functional_overload.py +897 -0
- mindspore/ops/operations/__init__.py +3 -2
- mindspore/ops/operations/_embedding_cache_ops.py +4 -4
- mindspore/ops/operations/_grad_ops.py +2 -34
- mindspore/ops/operations/_infer_ops.py +2 -1
- mindspore/ops/operations/_inner_ops.py +38 -8
- mindspore/ops/operations/array_ops.py +45 -303
- mindspore/ops/operations/comm_ops.py +23 -17
- mindspore/ops/operations/custom_ops.py +7 -49
- mindspore/ops/operations/debug_ops.py +42 -47
- mindspore/ops/operations/inner_ops.py +6 -4
- mindspore/ops/operations/linalg_ops.py +3 -2
- mindspore/ops/operations/manually_defined/ops_def.py +185 -104
- mindspore/ops/operations/math_ops.py +11 -216
- mindspore/ops/operations/nn_ops.py +153 -310
- mindspore/ops/primitive.py +23 -21
- mindspore/ops/tensor_method.py +1669 -0
- mindspore/ops_generate/aclnn_kernel_register_auto_cc_generator.py +110 -0
- mindspore/ops_generate/add_tensor_docs_generator.py +54 -0
- mindspore/ops_generate/arg_handler.py +0 -61
- mindspore/ops_generate/auto_grad_impl_cc_generator.py +135 -0
- mindspore/ops_generate/auto_grad_reg_cc_generator.py +93 -0
- mindspore/ops_generate/base_generator.py +11 -0
- mindspore/ops_generate/cpp_create_prim_instance_helper_generator.py +108 -0
- mindspore/ops_generate/functional_map_cpp_generator.py +491 -0
- mindspore/ops_generate/functional_overload_py_generator.py +110 -0
- mindspore/ops_generate/functions_cc_generator.py +233 -0
- mindspore/ops_generate/gen_aclnn_implement.py +110 -114
- mindspore/ops_generate/gen_constants.py +157 -3
- mindspore/ops_generate/gen_ops.py +245 -990
- mindspore/ops_generate/gen_pyboost_func.py +97 -998
- mindspore/ops_generate/gen_utils.py +119 -33
- mindspore/ops_generate/lite_ops_cpp_generator.py +155 -0
- mindspore/ops_generate/op_api_proto.py +206 -0
- mindspore/ops_generate/op_def_py_generator.py +131 -0
- mindspore/ops_generate/op_prim_py_generator.py +480 -0
- mindspore/ops_generate/op_proto.py +373 -108
- mindspore/ops_generate/op_template_parser.py +436 -0
- mindspore/ops_generate/ops_def_cc_generator.py +288 -0
- mindspore/ops_generate/ops_def_h_generator.py +74 -0
- mindspore/ops_generate/ops_name_h_generator.py +68 -0
- mindspore/ops_generate/ops_primitive_h_generator.py +81 -0
- mindspore/ops_generate/pyboost_functions_cpp_generator.py +370 -0
- mindspore/ops_generate/pyboost_functions_h_generator.py +68 -0
- mindspore/ops_generate/pyboost_functions_py_generator.py +148 -0
- mindspore/ops_generate/pyboost_grad_function_cpp_generator.py +154 -0
- mindspore/ops_generate/pyboost_inner_prim_generator.py +131 -0
- mindspore/ops_generate/pyboost_native_grad_functions_generator.py +268 -0
- mindspore/ops_generate/pyboost_op_cpp_code_generator.py +851 -0
- mindspore/ops_generate/pyboost_overload_functions_cpp_generator.py +344 -0
- mindspore/ops_generate/pyboost_utils.py +92 -33
- mindspore/ops_generate/template.py +294 -44
- mindspore/ops_generate/tensor_func_reg_cpp_generator.py +422 -0
- mindspore/parallel/__init__.py +3 -3
- mindspore/parallel/_auto_parallel_context.py +44 -34
- mindspore/parallel/_cell_wrapper.py +22 -3
- mindspore/parallel/_parallel_serialization.py +13 -2
- mindspore/parallel/_utils.py +4 -2
- mindspore/parallel/algo_parameter_config.py +1 -1
- mindspore/parallel/checkpoint_transform.py +44 -0
- mindspore/parallel/cluster/process_entity/_api.py +131 -37
- mindspore/parallel/cluster/process_entity/_utils.py +41 -6
- mindspore/parallel/cluster/run.py +20 -3
- mindspore/parallel/parameter_broadcast.py +1 -1
- mindspore/parallel/shard.py +3 -0
- mindspore/parallel/transform_safetensors.py +119 -253
- mindspore/pgodb140.dll +0 -0
- mindspore/pgort140.dll +0 -0
- mindspore/profiler/__init__.py +17 -4
- mindspore/profiler/analysis/__init__.py +0 -0
- mindspore/profiler/analysis/parser/__init__.py +0 -0
- mindspore/profiler/analysis/parser/ascend_cann_parser.py +166 -0
- mindspore/profiler/analysis/parser/base_parser.py +158 -0
- mindspore/profiler/analysis/parser/framework_cann_relation_parser.py +45 -0
- mindspore/profiler/analysis/parser/ms_framework_parser.py +142 -0
- mindspore/profiler/analysis/parser/ms_minddata_parser.py +145 -0
- mindspore/profiler/analysis/parser/timeline_assembly_factory/__init__.py +0 -0
- mindspore/profiler/analysis/parser/timeline_assembly_factory/ascend_timeline_assembler.py +261 -0
- mindspore/profiler/analysis/parser/timeline_assembly_factory/base_timeline_assembler.py +40 -0
- mindspore/profiler/analysis/parser/timeline_assembly_factory/trace_view_container.py +84 -0
- mindspore/profiler/analysis/parser/timeline_creator/__init__.py +0 -0
- mindspore/profiler/analysis/parser/timeline_creator/base_timeline_creator.py +44 -0
- mindspore/profiler/analysis/parser/timeline_creator/cpu_op_timeline_creator.py +90 -0
- mindspore/profiler/analysis/parser/timeline_creator/fwk_timeline_creator.py +76 -0
- mindspore/profiler/analysis/parser/timeline_creator/msprof_timeline_creator.py +103 -0
- mindspore/profiler/analysis/parser/timeline_creator/scope_layer_timeline_creator.py +134 -0
- mindspore/profiler/analysis/parser/timeline_event/__init__.py +0 -0
- mindspore/profiler/analysis/parser/timeline_event/base_event.py +233 -0
- mindspore/profiler/analysis/parser/timeline_event/cpu_op_event.py +47 -0
- mindspore/profiler/analysis/parser/timeline_event/flow_event.py +36 -0
- mindspore/profiler/analysis/parser/timeline_event/fwk_event.py +260 -0
- mindspore/profiler/analysis/parser/timeline_event/msprof_event.py +73 -0
- mindspore/profiler/analysis/parser/timeline_event/scope_layer_event.py +53 -0
- mindspore/profiler/analysis/parser/timeline_event/timeline_event_pool.py +146 -0
- mindspore/profiler/analysis/task_manager.py +131 -0
- mindspore/profiler/analysis/time_converter.py +84 -0
- mindspore/profiler/analysis/viewer/__init__.py +0 -0
- mindspore/profiler/analysis/viewer/ascend_communication_viewer.py +333 -0
- mindspore/profiler/analysis/viewer/ascend_integrate_viewer.py +87 -0
- mindspore/profiler/analysis/viewer/ascend_kernel_details_viewer.py +252 -0
- mindspore/profiler/analysis/viewer/ascend_memory_viewer.py +313 -0
- mindspore/profiler/analysis/viewer/ascend_op_memory_viewer.py +322 -0
- mindspore/profiler/analysis/viewer/ascend_step_trace_time_viewer.py +265 -0
- mindspore/profiler/analysis/viewer/ascend_timeline_viewer.py +58 -0
- mindspore/profiler/analysis/viewer/base_viewer.py +26 -0
- mindspore/profiler/analysis/viewer/ms_dataset_viewer.py +97 -0
- mindspore/profiler/analysis/viewer/ms_minddata_viewer.py +581 -0
- mindspore/profiler/analysis/work_flow.py +73 -0
- mindspore/profiler/common/ascend_msprof_exporter.py +138 -0
- mindspore/profiler/common/command_executor.py +90 -0
- mindspore/profiler/common/constant.py +174 -3
- mindspore/profiler/common/file_manager.py +208 -0
- mindspore/profiler/common/log.py +130 -0
- mindspore/profiler/common/msprof_cmd_tool.py +202 -0
- mindspore/profiler/common/path_manager.py +371 -0
- mindspore/profiler/common/process_bar.py +168 -0
- mindspore/profiler/common/process_pool.py +9 -3
- mindspore/profiler/common/profiler_context.py +476 -0
- mindspore/profiler/common/profiler_info.py +304 -0
- mindspore/profiler/common/profiler_output_path.py +284 -0
- mindspore/profiler/common/profiler_parameters.py +210 -0
- mindspore/profiler/common/profiler_path_manager.py +120 -0
- mindspore/profiler/common/record_function.py +76 -0
- mindspore/profiler/common/tlv_decoder.py +76 -0
- mindspore/profiler/common/util.py +75 -2
- mindspore/profiler/dynamic_profiler.py +270 -37
- mindspore/profiler/envprofiler.py +138 -0
- mindspore/profiler/mstx.py +199 -0
- mindspore/profiler/platform/__init__.py +21 -0
- mindspore/profiler/platform/base_profiler.py +40 -0
- mindspore/profiler/platform/cpu_profiler.py +124 -0
- mindspore/profiler/platform/gpu_profiler.py +74 -0
- mindspore/profiler/platform/npu_profiler.py +309 -0
- mindspore/profiler/profiler.py +580 -93
- mindspore/profiler/profiler_action_controller.py +187 -0
- mindspore/profiler/profiler_interface.py +114 -0
- mindspore/profiler/schedule.py +208 -0
- mindspore/rewrite/api/symbol_tree.py +1 -2
- mindspore/run_check/_check_version.py +18 -13
- mindspore/runtime/__init__.py +37 -0
- mindspore/runtime/device.py +27 -0
- mindspore/runtime/event.py +209 -0
- mindspore/runtime/executor.py +148 -0
- mindspore/runtime/memory.py +392 -0
- mindspore/runtime/stream.py +460 -0
- mindspore/runtime/thread_bind_core.py +401 -0
- mindspore/swresample-4.dll +0 -0
- mindspore/swscale-6.dll +0 -0
- mindspore/tbbmalloc.dll +0 -0
- mindspore/tinyxml2.dll +0 -0
- mindspore/train/__init__.py +2 -2
- mindspore/train/_utils.py +53 -18
- mindspore/train/amp.py +8 -4
- mindspore/train/callback/_checkpoint.py +32 -18
- mindspore/train/callback/_early_stop.py +1 -1
- mindspore/train/callback/_flops_collector.py +105 -69
- mindspore/train/callback/_history.py +1 -1
- mindspore/train/callback/_summary_collector.py +44 -6
- mindspore/train/callback/_tft_register.py +37 -15
- mindspore/train/dataset_helper.py +11 -11
- mindspore/train/metrics/precision.py +4 -5
- mindspore/train/mind_ir_pb2.py +167 -46
- mindspore/train/model.py +13 -14
- mindspore/train/serialization.py +461 -72
- mindspore/train/summary/summary_record.py +1 -2
- mindspore/train/train_thor/model_thor.py +1 -1
- mindspore/turbojpeg.dll +0 -0
- mindspore/utils/__init__.py +4 -2
- mindspore/utils/dryrun.py +138 -0
- mindspore/utils/runtime_execution_order_check.py +550 -0
- mindspore/vcmeta.dll +0 -0
- mindspore/vcruntime140.dll +0 -0
- mindspore/vcruntime140_1.dll +0 -0
- mindspore/version.py +1 -1
- {mindspore-2.4.1.dist-info → mindspore-2.5.0.dist-info}/METADATA +3 -4
- {mindspore-2.4.1.dist-info → mindspore-2.5.0.dist-info}/RECORD +391 -265
- {mindspore-2.4.1.dist-info → mindspore-2.5.0.dist-info}/entry_points.txt +1 -1
- mindspore/common/_tensor_overload.py +0 -139
- mindspore/mindspore_np_dtype.dll +0 -0
- mindspore/profiler/envprofiling.py +0 -254
- mindspore/profiler/profiling.py +0 -1926
- {mindspore-2.4.1.dist-info → mindspore-2.5.0.dist-info}/WHEEL +0 -0
- {mindspore-2.4.1.dist-info → mindspore-2.5.0.dist-info}/top_level.txt +0 -0
mindspore/communication/comm_func.py

@@ -28,7 +28,9 @@ from mindspore.ops.auto_generate.gen_ops_prim import (inner_comm_all_reduce_op,
                                                        inner_comm_all_to_all_v_op, inner_comm_irecv_op,
                                                        inner_comm_isend_op, inner_comm_reduce_scatter_op)
 from mindspore._c_expression import CommHandle as CommHandle_
+from mindspore._c_expression.typing import Type
 from mindspore import jit_class
+from mindspore.common.api import _pynative_executor

 __all__ = [
     'all_reduce',

@@ -218,19 +220,18 @@ def all_reduce(tensor, op=ReduceOp.SUM, group=GlobalComm.WORLD_COMM_GROUP, async
         For Ascend/GPU/CPU devices, it is recommended to use the msrun startup method
         without any third-party or configuration file dependencies.
         Please see the `msrun start up
-        <https://www.mindspore.cn/docs/
+        <https://www.mindspore.cn/docs/en/master/model_train/parallel/msrun_launcher.html>`_
         for more details.

         This example should be run with 2 devices.

         >>> import numpy as np
-        >>>
-        >>>
-        >>> from mindspore import Tensor
+        >>> import mindspore as ms
+        >>> import mindspore.communication as comm
         >>>
-        >>> init()
-        >>> input_tensor = Tensor(np.ones([2, 8]).astype(np.float32))
-        >>> output = all_reduce(input_tensor)
+        >>> comm.init()
+        >>> input_tensor = ms.Tensor(np.ones([2, 8]).astype(np.float32))
+        >>> output, _ = comm.comm_func.all_reduce(input_tensor)
         >>> print(output)
         [[2. 2. 2. 2. 2. 2. 2. 2.]
         [2. 2. 2. 2. 2. 2. 2. 2.]]
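The example updates above reflect two related changes: imports are consolidated to `import mindspore as ms` / `import mindspore.communication as comm`, and the collective is unpacked as `output, _ = ...` because the comm_func collectives hand back a `(tensor, handle)` pair (the handle slot is `None` for synchronous calls, as the `_deal_comm_outputs` helper further down shows). A minimal sketch of both call styles, assuming a two-device msrun launch and the `async_op` flag that appears in the signature in the hunk header:

>>> import numpy as np
>>> import mindspore as ms
>>> import mindspore.communication as comm
>>>
>>> comm.init()
>>> x = ms.Tensor(np.ones([2, 8]).astype(np.float32))
>>> # Synchronous call: the second element of the returned pair is None.
>>> out, _ = comm.comm_func.all_reduce(x)
>>> # Asynchronous call: wait on the returned handle before reading the result.
>>> out, handle = comm.comm_func.all_reduce(x, async_op=True)
>>> handle.wait()
>>> print(out)
[[2. 2. 2. 2. 2. 2. 2. 2.]
 [2. 2. 2. 2. 2. 2. 2. 2.]]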
@@ -284,22 +285,18 @@ def all_gather_into_tensor(tensor, group=GlobalComm.WORLD_COMM_GROUP, async_op=F
         For Ascend/GPU/CPU devices, it is recommended to use the msrun startup method
         without any third-party or configuration file dependencies.
         Please see the `msrun start up
-        <https://www.mindspore.cn/docs/
+        <https://www.mindspore.cn/docs/en/master/model_train/parallel/msrun_launcher.html>`_
         for more details.

         This example should be run with 2 devices.

         >>> import numpy as np
         >>> import mindspore as ms
-        >>>
-        >>> from mindspore.communication import init
-        >>> from mindspore.communication.comm_func import all_gather_into_tensor
-        >>> from mindspore import Tensor
+        >>> import mindspore.communication as comm
         >>>
-        >>>
-        >>>
-        >>>
-        >>> output = all_gather_into_tensor(input_tensor)
+        >>> comm.init()
+        >>> input_tensor = ms.Tensor(np.ones([2, 8]).astype(np.float32))
+        >>> output, _ = comm.comm_func.all_gather_into_tensor(input_tensor)
         >>> print(output)
         [[1. 1. 1. 1. 1. 1. 1. 1.]
         [1. 1. 1. 1. 1. 1. 1. 1.]

@@ -358,21 +355,18 @@ def reduce_scatter_tensor(tensor, op=ReduceOp.SUM, group=GlobalComm.WORLD_COMM_G
         For Ascend/GPU/CPU devices, it is recommended to use the msrun startup method
         without any third-party or configuration file dependencies.
         Please see the `msrun start up
-        <https://www.mindspore.cn/docs/
+        <https://www.mindspore.cn/docs/en/master/model_train/parallel/msrun_launcher.html>`_
         for more details.

         This example should be run with 2 devices.

-        >>> import mindspore as ms
-        >>> from mindspore import Tensor
-        >>> from mindspore.communication import init
-        >>> from mindspore.communication.comm_func import reduce_scatter_tensor
         >>> import numpy as np
+        >>> import mindspore as ms
+        >>> import mindspore.communication as comm
         >>>
-        >>>
-        >>>
-        >>>
-        >>> output = reduce_scatter_tensor(input_tensor)
+        >>> comm.init()
+        >>> input_tensor = ms.Tensor(np.ones([8, 8]).astype(np.float32))
+        >>> output, _ = comm.comm_func.reduce_scatter_tensor(input_tensor)
         >>> print(output)
         [[2. 2. 2. 2. 2. 2. 2. 2.]
         [2. 2. 2. 2. 2. 2. 2. 2.]

@@ -430,22 +424,20 @@ def reduce(tensor, dst, op=ReduceOp.SUM, group=GlobalComm.WORLD_COMM_GROUP):
         without any third-party or configuration file dependencies.

         Please see the `msrun start up
-        <https://www.mindspore.cn/docs/
+        <https://www.mindspore.cn/docs/en/master/model_train/parallel/msrun_launcher.html>`_
         for more details.

         This example should be run with 4 devices.

-        >>> from mindspore import ops
-        >>> import mindspore.nn as nn
-        >>> from mindspore.communication import init
-        >>> from mindspore.communication.comm_func import reduce
-        >>> from mindspore import Tensor
         >>> import numpy as np
+        >>> import mindspore as ms
+        >>> import mindspore.communication as comm
+        >>>
         >>> # Launch 4 processes.
-        >>> init()
+        >>> comm.init()
         >>> dest_rank=1
-        >>> input_tensor = Tensor(np.ones([2, 8]).astype(np.float32))
-        >>> output = reduce(input_tensor)
+        >>> input_tensor = ms.Tensor(np.ones([2, 8]).astype(np.float32))
+        >>> output = comm.comm_func.reduce(input_tensor, dst=dest_rank)
         >>> print(output)
         Process with rank 1: [[4. 4. 4. 4. 4. 4. 4. 4.]
         [4. 4. 4. 4. 4. 4. 4. 4.]],

@@ -494,27 +486,36 @@ class P2POp:

     Examples:
         >>> import numpy as np
-        >>> import mindspore
-        >>>
-        >>>
-        >>> send_tensor = Tensor(1.)
-        >>> send_op = P2POp('isend', send_tensor, 1)
-        >>> send_op = P2POp(isend, send_tensor, 1)
-        >>> recv_tensor = Tensor(0.)
-        >>> recv_op = P2POp('irecv', recv_tensor, 0)
-        >>> recv_op = P2POp(irecv, recv_tensor, 0)
-        >>> recv_op = P2POp('irecv', (), 0, recv_dtype=
+        >>> import mindspore as ms
+        >>> import mindspore.communication as comm
+        >>>
+        >>> send_tensor = ms.Tensor(1.)
+        >>> send_op = comm.comm_func.P2POp('isend', send_tensor, 1)
+        >>> send_op = comm.comm_func.P2POp(comm.comm_func.isend, send_tensor, 1)
+        >>> recv_tensor = ms.Tensor(0.)
+        >>> recv_op = comm.comm_func.P2POp('irecv', recv_tensor, 0)
+        >>> recv_op = comm.comm_func.P2POp(comm.comm_func.irecv, recv_tensor, 0)
+        >>> recv_op = comm.comm_func.P2POp('irecv', (), 0, recv_dtype=ms.float32)
     """

     def __init__(self, op, tensor, peer, group=None, tag=0, *, recv_dtype=None):
         self.op = op
         self.tensor = tensor
+        if not isinstance(peer, int):
+            raise TypeError(f"peer must be type of int, but got type of {type(peer)}")
+
+        if recv_dtype and not isinstance(recv_dtype, Type):
+            raise TypeError(f"recv_dtype must be type of mindspore dtype, but got type of {type(recv_dtype)}")
+
         self.peer = peer
         self.group = group
         self.tag = tag
         self.recv_dtype = recv_dtype

     def __new__(cls, op, tensor, peer, group=None, tag=0, recv_dtype=None):
+        if not (isinstance(op, str) or callable(op)):
+            raise TypeError(f"op must be type of string or function, but got type of {type(op)}")
+
         if isinstance(op, str):
             op_name = op
         else:
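The added `__init__`/`__new__` checks reject malformed `P2POp` arguments up front rather than failing later inside the dispatched communication op. A short sketch of what the new validation means for callers (illustrative values only; this assumes the checks shown above and that constructing a `P2POp` does not itself require the communication backend to be initialized):

>>> import mindspore as ms
>>> from mindspore.communication.comm_func import P2POp
>>>
>>> t = ms.Tensor(1.)
>>> P2POp('isend', t, peer='1')           # peer must be an int -> TypeError
>>> P2POp(42, t, 1)                       # op must be a string or callable -> TypeError
>>> P2POp('irecv', (), 0, recv_dtype=1)   # recv_dtype must be a mindspore dtype -> TypeError
>>> op = P2POp('irecv', (), 0, recv_dtype=ms.float32)  # accepted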
@@ -560,31 +561,29 @@ def batch_isend_irecv(p2p_op_list):
         For Ascend/GPU/CPU devices, it is recommended to use the msrun startup method
         without any third-party or configuration file dependencies.
         Please see the `msrun start up
-        <https://www.mindspore.cn/docs/
+        <https://www.mindspore.cn/docs/en/master/model_train/parallel/msrun_launcher.html>`_
         for more details.

         This example should be run with 2 devices.

         >>> import numpy as np
-        >>> import mindspore
-        >>>
-        >>> from mindspore.communication.comm_func import batch_isend_irecv, P2POp
-        >>> from mindspore import Tensor
+        >>> import mindspore as ms
+        >>> import mindspore.communication as comm
         >>>
-        >>> init()
-        >>> this_rank = get_rank()
-        >>> world_size = get_group_size()
+        >>> comm.init()
+        >>> this_rank = comm.get_rank()
+        >>> world_size = comm.get_group_size()
         >>> next_rank = (this_rank + 1) % world_size
         >>> prev_rank = (this_rank + world_size - 1) % world_size
         >>>
-        >>> send_tensor = Tensor(this_rank + 1, dtype=
-        >>> recv_tensor = Tensor(0., dtype=
+        >>> send_tensor = ms.Tensor(this_rank + 1, dtype=ms.float32)
+        >>> recv_tensor = ms.Tensor(0., dtype=ms.float32)
         >>>
-        >>> send_op = P2POp('isend', send_tensor, next_rank)
-        >>> recv_op = P2POp('irecv', recv_tensor, prev_rank)
+        >>> send_op = comm.comm_func.P2POp('isend', send_tensor, next_rank)
+        >>> recv_op = comm.comm_func.P2POp('irecv', recv_tensor, prev_rank)
         >>>
         >>> p2p_op_list = [send_op, recv_op]
-        >>> output = batch_isend_irecv(p2p_op_list)
+        >>> output = comm.comm_func.batch_isend_irecv(p2p_op_list)
         >>> print(output)
         rank 0:
         (Tensor(shape=[], dtype=Float32, value= 0), Tensor(shape=[], dtype=Float32, value= 2))

@@ -597,6 +596,10 @@ def batch_isend_irecv(p2p_op_list):
     receive_shapes = []
     receive_dtypes = []
     tags = []
+
+    if not isinstance(p2p_op_list, list):
+        raise TypeError(f"p2p_op_list must be type of list, but got type of {p2p_op_list}.")
+
     if not p2p_op_list:
         raise TypeError(f"p2p_op_list can not be empty list.")
     group = p2p_op_list[0].group

@@ -676,20 +679,20 @@ def scatter_tensor(tensor, src=0, group=GlobalComm.WORLD_COMM_GROUP):
         For Ascend/GPU/CPU devices, it is recommended to use the msrun startup method
         without any third-party or configuration file dependencies.
         Please see the `msrun start up
-        <https://www.mindspore.cn/docs/
+        <https://www.mindspore.cn/docs/en/master/model_train/parallel/msrun_launcher.html>`_
         for more details.

         This example should be run with 2 devices.

-        >>> import mindspore as ms
-        >>> from mindspore.communication import init
-        >>> from mindspore.communication.comm_func import scatter_tensor
         >>> import numpy as np
+        >>> import mindspore as ms
+        >>> import mindspore.communication as comm
+        >>>
         >>> # Launch 2 processes.
         >>>
-        >>> init()
+        >>> comm.init()
         >>> input = ms.Tensor(np.arange(8).reshape([4, 2]).astype(np.float32))
-        >>> out = scatter_tensor(tensor=
+        >>> out = comm.comm_func.scatter_tensor(tensor=input, src=0)
         >>> print(out)
         # rank_0
         [[0. 1.]

@@ -741,22 +744,20 @@ def gather_into_tensor(tensor, dst=0, group=GlobalComm.WORLD_COMM_GROUP):
         For Ascend/GPU/CPU devices, it is recommended to use the msrun startup method
         without any third-party or configuration file dependencies.
         Please see the `msrun start up
-        <https://www.mindspore.cn/docs/
+        <https://www.mindspore.cn/docs/en/master/model_train/parallel/msrun_launcher.html>`_
         for more details.

         This example should be run with 2 devices.

         >>> import numpy as np
         >>> import mindspore as ms
-        >>> import mindspore.
-        >>>
-        >>> from mindspore import Tensor
-        >>> from mindspore.communication.comm_func import gather_into_tensor
+        >>> import mindspore.communication as comm
+        >>>
         >>> # Launch 2 processes.
         >>>
-        >>> init()
-        >>> input = Tensor(np.arange(4).reshape([2, 2]).astype(np.float32))
-        >>> output = gather_into_tensor(tensor=
+        >>> comm.init()
+        >>> input = ms.Tensor(np.arange(4).reshape([2, 2]).astype(np.float32))
+        >>> output = comm.comm_func.gather_into_tensor(tensor=input, dst=0)
         >>> print(output)
         Process with rank 0: [[0. 1.],
         [2. 3.],

@@ -804,21 +805,20 @@ def broadcast(tensor, src=0, group=GlobalComm.WORLD_COMM_GROUP):
         For Ascend/GPU/CPU devices, it is recommended to use the msrun startup method
         without any third-party or configuration file dependencies.
         Please see the `msrun start up
-        <https://www.mindspore.cn/docs/
+        <https://www.mindspore.cn/docs/en/master/model_train/parallel/msrun_launcher.html>`_
         for more details.

         This example should be run with 2 devices.

-        >>> import mindspore as ms
-        >>> from mindspore import Tensor
-        >>> from mindspore.communication import init
-        >>> from mindspore.communication.comm_func import broadcast
         >>> import numpy as np
+        >>> import mindspore as ms
+        >>> import mindspore.communication as comm
+        >>>
         >>> # Launch 2 processes.
         >>>
-        >>> init()
+        >>> comm.init()
         >>> data = ms.Tensor(np.arange(8).reshape([2, 4]).astype(np.float32))
-        >>> out = broadcast(tensor=data, src=0)
+        >>> out = comm.comm_func.broadcast(tensor=data, src=0)
         [[0. 1. 2. 3.]
         [4. 5. 6. 7.]]

@@ -858,29 +858,37 @@ def barrier(group=GlobalComm.WORLD_COMM_GROUP):
         For Ascend/GPU/CPU devices, it is recommended to use the msrun startup method
         without any third-party or configuration file dependencies.
         Please see the `msrun start up
-        <https://www.mindspore.cn/docs/
+        <https://www.mindspore.cn/docs/en/master/model_train/parallel/msrun_launcher.html>`_
         for more details.

         This example should be run with 2 devices.

-        >>>
-        >>>
+        >>> import mindspore as ms
+        >>> import mindspore.communication as comm
+        >>>
         >>> # Launch 2 processes.
-        >>> init()
-        >>> barrier()
+        >>> comm.init()
+        >>> comm.comm_func.barrier()

     Tutorial Examples:
         - `Distributed Set Communication Primitives - Barrier
           <https://www.mindspore.cn/docs/en/master/api_python/samples/ops/communicate_ops.html#barrier>`_
     """
+    if not isinstance(group, str):
+        raise TypeError(f"group must be type of string, but got {type(group)}")
     _op = _get_cache_prim(P.Barrier)(group)
     return _op()


-def _deal_comm_outputs(output, async_op):
+def _deal_comm_outputs(output, async_op, exec_sync=False):
+    """
+    deal with comm ops outputs.
+    """
     if isinstance(output, tuple):
         if not async_op:
             output[1].wait()
+            if exec_sync:
+                _pynative_executor.sync()
         return (output[0], None)
     return output
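The reworked `_deal_comm_outputs` helper centralizes how the comm_func APIs post-process results: synchronous calls block on the communication handle, can additionally force a PyNative executor sync, and return `(tensor, None)`, while asynchronous calls return the `(tensor, handle)` pair untouched. A self-contained sketch of that control flow with a stand-in handle (the names `FakeHandle` and `sync_executor` are illustrative, not MindSpore APIs):

class FakeHandle:
    """Stand-in for the real communication handle; only used to illustrate the flow."""
    def wait(self):
        print("handle.wait() called")

def sync_executor():
    print("pynative executor sync called")

def deal_comm_outputs(output, async_op, exec_sync=False):
    # Mirrors the pattern in the diff: block on the handle for synchronous calls,
    # optionally force a full executor sync, then hand back (tensor, None).
    if isinstance(output, tuple):
        if not async_op:
            output[1].wait()
            if exec_sync:
                sync_executor()
        return (output[0], None)
    return output

result, handle = deal_comm_outputs(("tensor-data", FakeHandle()), async_op=False, exec_sync=True)
print(result, handle)  # tensor-data None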
@@ -918,21 +926,34 @@ def send(tensor, dst=0, group=GlobalComm.WORLD_COMM_GROUP, tag=0):
         For Ascend/GPU/CPU devices, it is recommended to use the msrun startup method
         without any third-party or configuration file dependencies.
         Please see the `msrun start up
-        <https://www.mindspore.cn/docs/
+        <https://www.mindspore.cn/docs/en/master/model_train/parallel/msrun_launcher.html>`_
         for more details.

         This example should be run with 2 devices.

-        >>> from mindspore import ops
-        >>> import mindspore.nn as nn
-        >>> from mindspore.communication import init
-        >>> from mindspore.communication.comm_func import send
-        >>> from mindspore import Tensor
         >>> import numpy as np
+        >>> import mindspore as ms
+        >>> from mindspore.communication import init
+        >>> from mindspore.communication.comm_func import send, recv
+        >>> from mindspore.communication import get_rank, get_group_size
         >>>
+        >>> np.random.seed(1)
         >>> init()
-        >>>
-        >>>
+        >>> rank = get_rank()
+        >>> size = get_group_size()
+        >>> x = np.ones([2, 2]).astype(np.float32) * 0.01 * (rank + 1)
+        >>> x2 = np.ones([2, 2]).astype(np.float32)
+        >>>
+        >>>
+        >>> if rank < size / 2:
+        >>>     _x = ms.Tensor(x)
+        >>>     send(_x, rank + size // 2)
+        >>> else:
+        >>>     _x2 = ms.Tensor(x2)
+        >>>     output = recv(_x2, rank - size // 2)
+        >>>     print(output)
+        [[0.01 0.01]
+        [0.01 0.01]]
     """
     if not isinstance(tensor, (Tensor, Tensor_)):
         raise TypeError("For send, the input tensor must be tensor")

@@ -979,29 +1000,34 @@ def recv(tensor, src=0, group=GlobalComm.WORLD_COMM_GROUP, tag=0):
         For Ascend/GPU/CPU devices, it is recommended to use the msrun startup method
         without any third-party or configuration file dependencies.
         Please see the `msrun start up
-        <https://www.mindspore.cn/docs/
+        <https://www.mindspore.cn/docs/en/master/model_train/parallel/msrun_launcher.html>`_
         for more details.

         This example should be run with 2 devices.

-        >>> from mindspore import ops
-        >>> import mindspore.nn as nn
-        >>> from mindspore.communication import init
-        >>> from mindspore.communication.comm_func import recv
-        >>> from mindspore import Tensor
         >>> import numpy as np
+        >>> import mindspore as ms
+        >>> from mindspore.communication import init
+        >>> from mindspore.communication.comm_func import send, recv
+        >>> from mindspore.communication import get_rank, get_group_size
         >>>
-
-        Process 0 send the following array to Process 1
-        [[ 0. 1.]
-        [ 2. 3.]]
+        >>> np.random.seed(1)
         >>> init()
-        >>>
-
-        >>>
-        >>>
-
-
+        >>> rank = get_rank()
+        >>> size = get_group_size()
+        >>> x = np.ones([2, 2]).astype(np.float32) * 0.01 * (rank + 1)
+        >>> x2 = np.ones([2, 2]).astype(np.float32)
+        >>>
+        >>>
+        >>> if rank < size / 2:
+        >>>     _x = ms.Tensor(x)
+        >>>     send(_x, rank + size // 2)
+        >>> else:
+        >>>     _x2 = ms.Tensor(x2)
+        >>>     output = recv(_x2, rank - size // 2)
+        >>>     print(output)
+        [[0.01 0.01]
+        [0.01 0.01]]
     """
     if not isinstance(tensor, (Tensor, Tensor_)):
         raise TypeError("For recv, the input tensor must be tensor")

@@ -1049,22 +1075,35 @@ def isend(tensor, dst=0, group=GlobalComm.WORLD_COMM_GROUP, tag=0):
         For Ascend/GPU/CPU devices, it is recommended to use the msrun startup method
         without any third-party or configuration file dependencies.
         Please see the `msrun start up
-        <https://www.mindspore.cn/docs/
+        <https://www.mindspore.cn/docs/en/master/model_train/parallel/msrun_launcher.html>`_
         for more details.

         This example should be run with 2 devices.

-        >>> from mindspore import ops
-        >>> import mindspore.nn as nn
-        >>> from mindspore.communication import init
-        >>> from mindspore.communication.comm_func import isend
-        >>> from mindspore import Tensor
         >>> import numpy as np
+        >>> import mindspore as ms
+        >>> from mindspore.communication import init
+        >>> from mindspore.communication.comm_func import isend, irecv
+        >>> from mindspore.communication import get_rank, get_group_size
         >>>
+        >>> np.random.seed(1)
         >>> init()
-        >>>
-        >>>
-        >>>
+        >>> rank = get_rank()
+        >>> size = get_group_size()
+        >>> x = np.ones([2, 2]).astype(np.float32) * 0.01 * (rank + 1)
+        >>> x2 = np.ones([2, 2]).astype(np.float32)
+        >>>
+        >>>
+        >>> if rank < size / 2:
+        >>>     _x = ms.Tensor(x)
+        >>>     isend(_x, rank + size // 2)
+        >>> else:
+        >>>     _x2 = ms.Tensor(x2)
+        >>>     output, handle = irecv(_x2, rank - size // 2)
+        >>>     handle.wait()
+        >>>     print(output)
+        [[0.01 0.01]
+        [0.01 0.01]]
     """
     if not isinstance(tensor, (Tensor, Tensor_)):
         raise TypeError("For isend, the input tensor must be tensor")

@@ -1114,30 +1153,35 @@ def irecv(tensor, src=0, group=GlobalComm.WORLD_COMM_GROUP, tag=0):
         For Ascend/GPU/CPU devices, it is recommended to use the msrun startup method
         without any third-party or configuration file dependencies.
         Please see the `msrun start up
-        <https://www.mindspore.cn/docs/
+        <https://www.mindspore.cn/docs/en/master/model_train/parallel/msrun_launcher.html>`_
        for more details.

         This example should be run with 2 devices.

-        >>> from mindspore import ops
-        >>> import mindspore.nn as nn
-        >>> from mindspore.communication import init
-        >>> from mindspore.communication.comm_func import irecv
-        >>> from mindspore import Tensor
         >>> import numpy as np
+        >>> import mindspore as ms
+        >>> from mindspore.communication import init
+        >>> from mindspore.communication.comm_func import isend, irecv
+        >>> from mindspore.communication import get_rank, get_group_size
         >>>
-
-        Process 0 send the following array to Process 1
-        [[ 0. 1.]
-        [ 2. 3.]]
+        >>> np.random.seed(1)
         >>> init()
-        >>>
-
-        >>>
-        >>>
-        >>>
-
-
+        >>> rank = get_rank()
+        >>> size = get_group_size()
+        >>> x = np.ones([2, 2]).astype(np.float32) * 0.01 * (rank + 1)
+        >>> x2 = np.ones([2, 2]).astype(np.float32)
+        >>>
+        >>>
+        >>> if rank < size / 2:
+        >>>     _x = ms.Tensor(x)
+        >>>     isend(_x, rank + size // 2)
+        >>> else:
+        >>>     _x2 = ms.Tensor(x2)
+        >>>     output, handle = irecv(_x2, rank - size // 2)
+        >>>     handle.wait()
+        >>>     print(output)
+        [[0.01 0.01]
+        [0.01 0.01]]
     """
     group = _get_group(group)
     _src = _get_group_rank_from_world_rank_from_cache_helper(src, group)

@@ -1185,27 +1229,24 @@ def all_to_all_with_output_shape(output_shape_list, input_tensor_list, group=Non
         For Ascend/GPU/CPU devices, it is recommended to use the msrun startup method
         without any third-party or configuration file dependencies.
         Please see the `msrun start up
-        <https://www.mindspore.cn/docs/
+        <https://www.mindspore.cn/docs/en/master/model_train/parallel/msrun_launcher.html>`_
         for more details.

         This example should be run with 2 devices.

         >>> import numpy as np
-        >>> import mindspore
-        >>>
-        >>> from mindspore.communication.comm_func import all_to_all_with_output_shape
-        >>> from mindspore import Tensor
-        >>> from mindspore.ops import zeros
+        >>> import mindspore as ms
+        >>> import mindspore.communication as comm
         >>>
-        >>> init()
-        >>> this_rank = get_rank()
+        >>> comm.init()
+        >>> this_rank = comm.get_rank()
         >>> if this_rank == 0:
-        >>>     send_tensor_list = [Tensor(1.), Tensor([[2, 3], [4, 5.]])]
+        >>>     send_tensor_list = [ms.Tensor(1.), ms.Tensor([[2, 3], [4, 5.]])]
         >>>     recv_tensor_list = [(), (2,)]
         >>> if this_rank == 1:
-        >>>     send_tensor_list = [Tensor([2, 2.]), Tensor([4, 5, 6, 7.])]
+        >>>     send_tensor_list = [ms.Tensor([2, 2.]), ms.Tensor([4, 5, 6, 7.])]
         >>>     recv_tensor_list = [(2, 2), (4,)]
-        >>> output = all_to_all_with_output_shape(recv_tensor_list, send_tensor_list)
+        >>> output, _ = comm.comm_func.all_to_all_with_output_shape(recv_tensor_list, send_tensor_list)
         >>> print(output)
         rank 0:
         (Tensor(shape=[], dtype=Float32, value= 1),

@@ -1256,17 +1297,17 @@ def all_to_all_with_output_shape(output_shape_list, input_tensor_list, group=Non
     return (tuple(result), handle)


-def _get_all_to_all_single_numel_list(
+def _get_all_to_all_single_numel_list(tensor_shape, output_shape, output_split_sizes, input_split_sizes, group):
     """get numel list for all_to_all_single."""
     global _GROPU_SIZE_CACHE
     if _is_split_sizes_empty(input_split_sizes):
         if group not in _GROPU_SIZE_CACHE:
             _GROPU_SIZE_CACHE[group] = get_group_size(group)
         _world_size = _GROPU_SIZE_CACHE[group]
-        if
+        if tensor_shape[0] % _world_size != 0:
             raise ValueError("input shape at dim 0 must be divided by world_size, "
-                             f"but got {
-        _split_size =
+                             f"but got {tensor_shape[0]} and {_world_size}.")
+        _split_size = tensor_shape[0] // _world_size
         input_split_sizes = (_split_size,) * _world_size
     if _is_split_sizes_empty(output_split_sizes):
         if group not in _GROPU_SIZE_CACHE:

@@ -1283,7 +1324,7 @@ def _get_all_to_all_single_numel_list(tensor, output_shape, output_split_sizes,
         _split_size = shape_dim_0 // _world_size
         output_split_sizes = (_split_size,) * _world_size

-    send_size_without_first_dim = _get_size(
+    send_size_without_first_dim = _get_size(tensor_shape[1:])
     send_numel_list = [size * send_size_without_first_dim for size in input_split_sizes]

     recv_size_without_first_dim = None

@@ -1298,6 +1339,9 @@ def _get_all_to_all_single_numel_list(tensor, output_shape, output_split_sizes,
     return send_numel_list, recv_numel_list, recv_shape_without_first_dim


+_ALL_TO_ALL_CACHE = {}
+
+
 def all_to_all_single_with_output_shape(output_shape, tensor, output_split_sizes=None,
                                         input_split_sizes=None, group=None, async_op=False):
     """

@@ -1339,36 +1383,25 @@ def all_to_all_single_with_output_shape(output_shape, tensor, output_split_sizes
         For Ascend/GPU/CPU devices, it is recommended to use the msrun startup method
         without any third-party or configuration file dependencies.
         Please see the `msrun start up
-        <https://www.mindspore.cn/docs/
+        <https://www.mindspore.cn/docs/en/master/model_train/parallel/msrun_launcher.html>`_
         for more details.

         This example should be run with 2 devices.

         >>> import numpy as np
-        >>> import mindspore
-        >>>
-        >>> from mindspore.communication.comm_func import all_to_all_single_with_output_shape
-        >>> from mindspore import Tensor
-        >>> from mindspore.ops import zeros
+        >>> import mindspore as ms
+        >>> import mindspore.communication as comm
         >>>
-        >>> init()
-        >>>
-        >>>
-        >>>
-        >>>
-        >>> result = all_to_all_single_with_output_shape(output_shape, tensor, [2, 1], [2, 1])
-        >>> if this_rank == 1:
-        >>>     output_shape = (2, 3)
-        >>>     tensor = Tensor([[9, 10., 11], [12, 13, 14]])
-        >>>     result = all_to_all_single_with_output_shape(output_shape, tensor)
+        >>> comm.init()
+        >>> rank = comm.get_rank()
+        >>> input = ms.Tensor([0, 1]) + rank * 2
+        >>> output_shape = (2,)
+        >>> result, _ = comm.comm_func.all_to_all_single_with_output_shape(output_shape, input)
         >>> print(result)
         rank 0:
-        [
-        [ 3. 4. 5.]
-        [ 9. 10. 11.]]
+        [ 0. 2.]
         rank 1:
-        [
-        [12. 13. 14.]]
+        [ 1. 3.]

     """

@@ -1378,8 +1411,17 @@ def all_to_all_single_with_output_shape(output_shape, tensor, output_split_sizes
         group = GlobalComm.WORLD_COMM_GROUP

     split_sizes_empty = _is_split_sizes_empty(output_split_sizes) and _is_split_sizes_empty(input_split_sizes)
-
-
+    if isinstance(output_split_sizes, list):
+        output_split_sizes = tuple(output_split_sizes)
+    if isinstance(input_split_sizes, list):
+        input_split_sizes = tuple(input_split_sizes)
+    global _ALL_TO_ALL_CACHE
+    tensor_shape = output_shape
+    cache_key = (tensor_shape, output_shape, output_split_sizes, input_split_sizes, group)
+    if cache_key not in _ALL_TO_ALL_CACHE:
+        _ALL_TO_ALL_CACHE[cache_key] = _get_all_to_all_single_numel_list(*cache_key)
+    send_numel_list, recv_numel_list, recv_shape_without_first_dim = _ALL_TO_ALL_CACHE[cache_key]
+
     tensor = _contiguous(tensor)
     _input = tensor.reshape(-1)
     group = GlobalComm.WORLD_COMM_GROUP if group is None else _get_group(group)
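The new module-level `_ALL_TO_ALL_CACHE` memoizes the send/receive numel bookkeeping for `all_to_all_single_with_output_shape`, keyed on the shapes, split sizes and group, so repeated calls with the same layout skip recomputing it; list-typed split sizes are converted to tuples first so the key stays hashable. A small self-contained sketch of the same memoization pattern with generic names (not the MindSpore internals):

_SPLIT_PLAN_CACHE = {}

def compute_split_plan(tensor_shape, world_size, split_sizes=None):
    """Expensive-to-recompute bookkeeping: how many elements each rank sends."""
    if split_sizes is None:
        if tensor_shape[0] % world_size != 0:
            raise ValueError("dim 0 must be divisible by world_size")
        split_sizes = (tensor_shape[0] // world_size,) * world_size
    per_row = 1
    for dim in tensor_shape[1:]:
        per_row *= dim
    return [s * per_row for s in split_sizes]

def cached_split_plan(tensor_shape, world_size, split_sizes=None):
    # Lists are unhashable, so normalize to tuples before building the cache key,
    # mirroring the tuple() conversion added in the diff above.
    if isinstance(split_sizes, list):
        split_sizes = tuple(split_sizes)
    key = (tuple(tensor_shape), world_size, split_sizes)
    if key not in _SPLIT_PLAN_CACHE:
        _SPLIT_PLAN_CACHE[key] = compute_split_plan(*key)
    return _SPLIT_PLAN_CACHE[key]

print(cached_split_plan((4, 3), 2))          # [6, 6], computed once
print(cached_split_plan((4, 3), 2))          # served from the cache
print(cached_split_plan((6, 2), 2, [4, 2]))  # [8, 4]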