mindspore-2.1.0-cp38-cp38-win_amd64.whl → mindspore-2.2.11-cp38-cp38-win_amd64.whl

This diff shows the contents of two publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.

Potentially problematic release.
Files changed (511)
  1. mindspore/.commit_id +1 -1
  2. mindspore/Microsoft.VisualStudio.Telemetry.dll +0 -0
  3. mindspore/Newtonsoft.Json.dll +0 -0
  4. mindspore/__init__.py +4 -1
  5. mindspore/_c_dataengine.cp38-win_amd64.pyd +0 -0
  6. mindspore/_c_expression.cp38-win_amd64.pyd +0 -0
  7. mindspore/_c_mindrecord.cp38-win_amd64.pyd +0 -0
  8. mindspore/_check_jit_forbidden_api.py +3 -1
  9. mindspore/_checkparam.py +23 -29
  10. mindspore/_extends/graph_kernel/__init__.py +0 -1
  11. mindspore/_extends/graph_kernel/model/graph_split.py +84 -76
  12. mindspore/_extends/graph_kernel/model/model_builder.py +9 -50
  13. mindspore/_extends/graph_kernel/splitter.py +4 -11
  14. mindspore/_extends/parallel_compile/akg_compiler/akg_process.py +122 -15
  15. mindspore/_extends/parallel_compile/akg_compiler/build_tbe_kernel.py +84 -67
  16. mindspore/_extends/parallel_compile/akg_compiler/tbe_topi.py +4 -2
  17. mindspore/_extends/parallel_compile/akg_compiler/util.py +10 -7
  18. mindspore/_extends/parallel_compile/tbe_compiler/tbe_adapter.py +2 -2
  19. mindspore/_extends/parallel_compile/tbe_compiler/tbe_helper.py +6 -5
  20. mindspore/_extends/parallel_compile/tbe_compiler/tbe_job.py +1 -1
  21. mindspore/_extends/parallel_compile/tbe_compiler/tbe_job_manager.py +1 -1
  22. mindspore/_extends/parse/__init__.py +13 -15
  23. mindspore/_extends/parse/namespace.py +7 -33
  24. mindspore/_extends/parse/parser.py +67 -72
  25. mindspore/_extends/parse/resources.py +1 -1
  26. mindspore/_extends/parse/standard_method.py +86 -106
  27. mindspore/_extends/parse/trope.py +1 -1
  28. mindspore/_extends/remote/kernel_build_server.py +25 -7
  29. mindspore/_extends/remote/kernel_build_server_akg_v2.py +55 -0
  30. mindspore/_install_custom.py +43 -0
  31. mindspore/amp.py +47 -11
  32. mindspore/atlprov.dll +0 -0
  33. mindspore/boost/boost.py +1 -8
  34. mindspore/boost/boost_cell_wrapper.py +3 -2
  35. mindspore/boost/grad_accumulation.py +1 -1
  36. mindspore/boost/group_loss_scale_manager.py +8 -7
  37. mindspore/c1.dll +0 -0
  38. mindspore/c1xx.dll +0 -0
  39. mindspore/c2.dll +0 -0
  40. mindspore/common/__init__.py +5 -3
  41. mindspore/common/_jit_fallback_utils.py +6 -0
  42. mindspore/common/_register_for_adapter.py +2 -0
  43. mindspore/common/_register_for_tensor.py +2 -2
  44. mindspore/common/_stub_tensor.py +13 -0
  45. mindspore/common/_utils.py +29 -0
  46. mindspore/common/api.py +174 -259
  47. mindspore/common/auto_dynamic_shape.py +494 -0
  48. mindspore/common/dtype.py +18 -11
  49. mindspore/common/dump.py +6 -4
  50. mindspore/common/initializer.py +14 -14
  51. mindspore/common/jit_config.py +33 -15
  52. mindspore/common/lazy_inline.py +126 -7
  53. mindspore/common/mindir_util.py +101 -0
  54. mindspore/common/parameter.py +51 -41
  55. mindspore/common/seed.py +4 -4
  56. mindspore/common/sparse_tensor.py +13 -14
  57. mindspore/common/tensor.py +243 -165
  58. mindspore/communication/__init__.py +7 -4
  59. mindspore/communication/_comm_helper.py +83 -4
  60. mindspore/communication/management.py +152 -84
  61. mindspore/config/op_info.config +14 -3
  62. mindspore/context.py +152 -61
  63. mindspore/dataset/__init__.py +5 -5
  64. mindspore/dataset/audio/__init__.py +2 -2
  65. mindspore/dataset/audio/transforms.py +52 -52
  66. mindspore/dataset/callback/ds_callback.py +16 -2
  67. mindspore/dataset/core/config.py +68 -51
  68. mindspore/dataset/engine/cache_client.py +33 -7
  69. mindspore/dataset/engine/datasets.py +250 -112
  70. mindspore/dataset/engine/datasets_audio.py +43 -211
  71. mindspore/dataset/engine/datasets_standard_format.py +16 -35
  72. mindspore/dataset/engine/datasets_text.py +43 -67
  73. mindspore/dataset/engine/datasets_user_defined.py +86 -100
  74. mindspore/dataset/engine/datasets_vision.py +219 -1029
  75. mindspore/dataset/engine/iterators.py +11 -4
  76. mindspore/dataset/engine/obs/obs_mindrecord_dataset.py +4 -0
  77. mindspore/dataset/engine/obs/util.py +3 -0
  78. mindspore/dataset/engine/samplers.py +1 -1
  79. mindspore/dataset/engine/validators.py +19 -5
  80. mindspore/dataset/text/__init__.py +3 -3
  81. mindspore/dataset/text/transforms.py +101 -127
  82. mindspore/dataset/text/utils.py +205 -138
  83. mindspore/dataset/transforms/__init__.py +1 -1
  84. mindspore/dataset/transforms/py_transforms_util.py +40 -12
  85. mindspore/dataset/transforms/transforms.py +95 -40
  86. mindspore/dataset/utils/browse_dataset.py +8 -2
  87. mindspore/dataset/utils/line_reader.py +17 -19
  88. mindspore/dataset/vision/__init__.py +3 -3
  89. mindspore/dataset/vision/c_transforms.py +6 -3
  90. mindspore/dataset/vision/transforms.py +409 -287
  91. mindspore/dataset/vision/utils.py +13 -14
  92. mindspore/dataset/vision/validators.py +11 -1
  93. mindspore/dnnl.dll +0 -0
  94. mindspore/dpcmi.dll +0 -0
  95. mindspore/experimental/map_parameter.py +14 -0
  96. mindspore/{nn/optim_ex → experimental/optim}/__init__.py +30 -29
  97. mindspore/{nn/optim_ex → experimental/optim}/adam.py +60 -67
  98. mindspore/{nn/optim_ex → experimental/optim}/adamw.py +181 -203
  99. mindspore/experimental/optim/lr_scheduler.py +1427 -0
  100. mindspore/{nn/optim_ex → experimental/optim}/optimizer.py +252 -259
  101. mindspore/{nn/optim_ex → experimental/optim}/sgd.py +147 -152
  102. mindspore/gen_ops.py +273 -0
  103. mindspore/include/OWNERS +0 -1
  104. mindspore/include/api/data_type.h +2 -1
  105. mindspore/include/api/graph.h +0 -15
  106. mindspore/include/api/kernel.h +2 -0
  107. mindspore/include/api/kernel_api.h +37 -12
  108. mindspore/include/api/model.h +17 -14
  109. mindspore/include/api/status.h +8 -3
  110. mindspore/include/api/types.h +37 -4
  111. mindspore/include/c_api/ms/abstract.h +67 -0
  112. mindspore/include/c_api/ms/attribute.h +197 -0
  113. mindspore/include/c_api/ms/base/handle_types.h +43 -0
  114. mindspore/include/c_api/ms/base/macros.h +32 -0
  115. mindspore/include/c_api/ms/base/status.h +33 -0
  116. mindspore/include/c_api/ms/base/types.h +282 -0
  117. mindspore/include/c_api/ms/context.h +102 -0
  118. mindspore/include/c_api/ms/graph.h +160 -0
  119. mindspore/include/c_api/ms/node.h +606 -0
  120. mindspore/include/c_api/ms/tensor.h +161 -0
  121. mindspore/include/c_api/ms/value.h +84 -0
  122. mindspore/include/dataset/constants.h +6 -5
  123. mindspore/include/dataset/execute.h +23 -13
  124. mindspore/include/dataset/text.h +26 -26
  125. mindspore/include/dataset/transforms.h +13 -13
  126. mindspore/include/dataset/vision.h +60 -60
  127. mindspore/include/dataset/vision_ascend.h +5 -6
  128. mindspore/include/dataset/vision_lite.h +17 -17
  129. mindspore/jpeg62.dll +0 -0
  130. mindspore/mindrecord/tools/imagenet_to_mr.py +1 -1
  131. mindspore/mindrecord/tools/mnist_to_mr.py +2 -2
  132. mindspore/mindspore_backend.dll +0 -0
  133. mindspore/mindspore_common.dll +0 -0
  134. mindspore/mindspore_core.dll +0 -0
  135. mindspore/mindspore_glog.dll +0 -0
  136. mindspore/mindspore_shared_lib.dll +0 -0
  137. mindspore/msobj140.dll +0 -0
  138. mindspore/mspdb140.dll +0 -0
  139. mindspore/mspdbcore.dll +0 -0
  140. mindspore/mspdbst.dll +0 -0
  141. mindspore/mspft140.dll +0 -0
  142. mindspore/msvcdis140.dll +0 -0
  143. mindspore/msvcp140_1.dll +0 -0
  144. mindspore/msvcp140_2.dll +0 -0
  145. mindspore/msvcp140_atomic_wait.dll +0 -0
  146. mindspore/msvcp140_codecvt_ids.dll +0 -0
  147. mindspore/nn/__init__.py +0 -2
  148. mindspore/nn/cell.py +313 -74
  149. mindspore/nn/dynamic_lr.py +21 -21
  150. mindspore/nn/layer/activation.py +22 -30
  151. mindspore/nn/layer/basic.py +15 -13
  152. mindspore/nn/layer/channel_shuffle.py +1 -1
  153. mindspore/nn/layer/container.py +271 -9
  154. mindspore/nn/layer/conv.py +323 -204
  155. mindspore/nn/layer/dense.py +8 -5
  156. mindspore/nn/layer/embedding.py +33 -27
  157. mindspore/nn/layer/flash_attention.py +61 -95
  158. mindspore/nn/layer/image.py +8 -6
  159. mindspore/nn/layer/math.py +16 -25
  160. mindspore/nn/layer/normalization.py +107 -66
  161. mindspore/nn/layer/padding.py +1 -1
  162. mindspore/nn/layer/pooling.py +131 -109
  163. mindspore/nn/layer/rnn_cells.py +27 -22
  164. mindspore/nn/layer/rnns.py +13 -16
  165. mindspore/nn/layer/thor_layer.py +1 -1
  166. mindspore/nn/layer/transformer.py +221 -154
  167. mindspore/nn/learning_rate_schedule.py +9 -1
  168. mindspore/nn/loss/loss.py +235 -174
  169. mindspore/nn/optim/ada_grad.py +2 -1
  170. mindspore/nn/optim/adadelta.py +1 -0
  171. mindspore/nn/optim/adafactor.py +2 -1
  172. mindspore/nn/optim/adam.py +7 -4
  173. mindspore/nn/optim/adamax.py +3 -2
  174. mindspore/nn/optim/adasum.py +2 -2
  175. mindspore/nn/optim/asgd.py +2 -3
  176. mindspore/nn/optim/ftrl.py +6 -5
  177. mindspore/nn/optim/lamb.py +7 -4
  178. mindspore/nn/optim/lars.py +1 -1
  179. mindspore/nn/optim/lazyadam.py +5 -3
  180. mindspore/nn/optim/momentum.py +2 -1
  181. mindspore/nn/optim/optimizer.py +53 -4
  182. mindspore/nn/optim/proximal_ada_grad.py +3 -4
  183. mindspore/nn/optim/rmsprop.py +4 -3
  184. mindspore/nn/optim/rprop.py +23 -12
  185. mindspore/nn/optim/sgd.py +26 -11
  186. mindspore/nn/optim/thor.py +9 -7
  187. mindspore/nn/probability/bijector/bijector.py +5 -5
  188. mindspore/nn/probability/bijector/power_transform.py +27 -27
  189. mindspore/nn/probability/bijector/softplus.py +3 -3
  190. mindspore/nn/probability/distribution/_utils/custom_ops.py +3 -3
  191. mindspore/nn/probability/distribution/bernoulli.py +5 -5
  192. mindspore/nn/probability/distribution/beta.py +3 -3
  193. mindspore/nn/probability/distribution/categorical.py +7 -7
  194. mindspore/nn/probability/distribution/cauchy.py +0 -1
  195. mindspore/nn/probability/distribution/distribution.py +3 -3
  196. mindspore/nn/probability/distribution/gamma.py +3 -3
  197. mindspore/nn/probability/distribution/geometric.py +4 -4
  198. mindspore/nn/probability/distribution/gumbel.py +4 -4
  199. mindspore/nn/probability/distribution/log_normal.py +2 -2
  200. mindspore/nn/probability/distribution/logistic.py +2 -2
  201. mindspore/nn/probability/distribution/poisson.py +4 -4
  202. mindspore/nn/probability/distribution/transformed_distribution.py +3 -3
  203. mindspore/nn/probability/distribution/uniform.py +6 -6
  204. mindspore/nn/wrap/__init__.py +4 -2
  205. mindspore/nn/wrap/cell_wrapper.py +87 -34
  206. mindspore/nn/wrap/grad_reducer.py +8 -5
  207. mindspore/nn/wrap/loss_scale.py +105 -42
  208. mindspore/numpy/array_creations.py +1 -2
  209. mindspore/numpy/array_ops.py +3 -2
  210. mindspore/numpy/utils_const.py +5 -5
  211. mindspore/opencv_core452.dll +0 -0
  212. mindspore/opencv_imgcodecs452.dll +0 -0
  213. mindspore/opencv_imgproc452.dll +0 -0
  214. mindspore/ops/_grad_experimental/__init__.py +0 -5
  215. mindspore/ops/_grad_experimental/grad_array_ops.py +2 -3
  216. mindspore/ops/_grad_experimental/grad_comm_ops.py +15 -2
  217. mindspore/ops/_grad_experimental/grad_debug_ops.py +0 -37
  218. mindspore/ops/_grad_experimental/grad_implementations.py +11 -1
  219. mindspore/ops/_grad_experimental/grad_inner_ops.py +2 -216
  220. mindspore/ops/_grad_experimental/grad_math_ops.py +19 -199
  221. mindspore/ops/_grad_experimental/grad_sparse.py +15 -0
  222. mindspore/ops/_grad_experimental/grad_sparse_ops.py +3 -3
  223. mindspore/ops/_op_impl/_custom_op/dsd_back_impl.py +1 -1
  224. mindspore/ops/_op_impl/aicpu/__init__.py +14 -2
  225. mindspore/ops/_op_impl/aicpu/add.py +3 -3
  226. mindspore/ops/_op_impl/aicpu/bias_add_grad.py +0 -1
  227. mindspore/ops/_op_impl/aicpu/count_nonzero.py +43 -0
  228. mindspore/ops/_op_impl/{_custom_op/flash_attention/constants.py → aicpu/eps.py} +18 -27
  229. mindspore/ops/_op_impl/aicpu/gamma.py +2 -2
  230. mindspore/ops/_op_impl/aicpu/linear_sum_assignment.py +21 -2
  231. mindspore/ops/_op_impl/aicpu/log_uniform_candidate_sampler.py +6 -3
  232. mindspore/ops/_op_impl/aicpu/lu_unpack_grad.py +0 -1
  233. mindspore/ops/_op_impl/aicpu/multinomial.py +3 -3
  234. mindspore/ops/_op_impl/aicpu/parameterized_truncated_normal.py +15 -7
  235. mindspore/ops/_op_impl/aicpu/random_categorical.py +39 -19
  236. mindspore/ops/_op_impl/aicpu/random_choice_with_mask.py +5 -2
  237. mindspore/ops/_op_impl/aicpu/random_poisson.py +103 -52
  238. mindspore/ops/_op_impl/aicpu/random_shuffle.py +17 -15
  239. mindspore/ops/_op_impl/aicpu/{sparseaddmm.py → sparse_addmm.py} +2 -2
  240. mindspore/ops/_op_impl/aicpu/{sparsesparsemaximum.py → sparse_sparse_maximum.py} +4 -4
  241. mindspore/ops/_op_impl/aicpu/standard_laplace.py +5 -5
  242. mindspore/ops/_op_impl/aicpu/standard_normal.py +5 -5
  243. mindspore/ops/_op_impl/aicpu/truncated_normal.py +9 -7
  244. mindspore/ops/_op_impl/aicpu/uniform.py +5 -3
  245. mindspore/ops/_op_impl/aicpu/uniform_candidate_sampler.py +8 -4
  246. mindspore/ops/_op_impl/aicpu/uniform_int.py +5 -5
  247. mindspore/ops/_op_impl/aicpu/uniform_real.py +4 -4
  248. mindspore/ops/_op_impl/tbe/__init__.py +4 -4
  249. mindspore/ops/_op_impl/tbe/inplace_index_add.py +7 -3
  250. mindspore/ops/_op_impl/tbe/trans_data_ds.py +2 -0
  251. mindspore/ops/_primitive_cache.py +1 -1
  252. mindspore/ops/_tracefunc.py +45 -13
  253. mindspore/ops/_utils/utils.py +6 -1
  254. mindspore/ops/_vmap/vmap_array_ops.py +3 -3
  255. mindspore/ops/_vmap/vmap_base.py +3 -3
  256. mindspore/ops/_vmap/vmap_convolution_ops.py +1 -1
  257. mindspore/ops/_vmap/vmap_grad_math_ops.py +6 -4
  258. mindspore/ops/_vmap/vmap_math_ops.py +5 -2
  259. mindspore/ops/_vmap/vmap_nn_ops.py +61 -7
  260. mindspore/ops/arg_dtype_cast.py +54 -0
  261. mindspore/ops/composite/base.py +37 -10
  262. mindspore/ops/composite/math_ops.py +5 -4
  263. mindspore/ops/composite/multitype_ops/_compile_utils.py +275 -73
  264. mindspore/ops/composite/multitype_ops/_constexpr_utils.py +16 -9
  265. mindspore/ops/composite/multitype_ops/add_impl.py +43 -4
  266. mindspore/ops/composite/multitype_ops/getitem_impl.py +42 -4
  267. mindspore/ops/composite/multitype_ops/ones_like_impl.py +6 -0
  268. mindspore/ops/composite/multitype_ops/setitem_impl.py +2 -1
  269. mindspore/ops/composite/multitype_ops/zeros_like_impl.py +9 -0
  270. mindspore/ops/deprecated.py +304 -0
  271. mindspore/ops/function/__init__.py +4 -1
  272. mindspore/ops/function/array_func.py +174 -193
  273. mindspore/ops/function/clip_func.py +81 -13
  274. mindspore/ops/function/debug_func.py +1 -1
  275. mindspore/ops/function/grad/grad_func.py +18 -9
  276. mindspore/ops/function/image_func.py +10 -4
  277. mindspore/ops/function/linalg_func.py +5 -5
  278. mindspore/ops/function/math_func.py +575 -386
  279. mindspore/ops/function/nn_func.py +568 -260
  280. mindspore/ops/function/random_func.py +88 -57
  281. mindspore/ops/function/sparse_func.py +1 -1
  282. mindspore/ops/function/sparse_unary_func.py +14 -12
  283. mindspore/ops/function/vmap_func.py +6 -5
  284. mindspore/ops/functional.py +15 -10
  285. mindspore/ops/op_info_register.py +244 -25
  286. mindspore/ops/operations/__init__.py +31 -19
  287. mindspore/ops/operations/_grad_ops.py +71 -7
  288. mindspore/ops/operations/_inner_ops.py +350 -17
  289. mindspore/ops/operations/_quant_ops.py +4 -8
  290. mindspore/ops/operations/_sequence_ops.py +42 -0
  291. mindspore/ops/operations/array_ops.py +68 -282
  292. mindspore/ops/operations/comm_ops.py +107 -59
  293. mindspore/ops/operations/custom_ops.py +94 -70
  294. mindspore/ops/operations/debug_ops.py +8 -4
  295. mindspore/ops/operations/image_ops.py +18 -12
  296. mindspore/ops/operations/inner_ops.py +26 -3
  297. mindspore/ops/operations/math_ops.py +192 -144
  298. mindspore/ops/operations/nn_ops.py +857 -489
  299. mindspore/ops/operations/other_ops.py +0 -22
  300. mindspore/ops/operations/random_ops.py +53 -111
  301. mindspore/ops/operations/sparse_ops.py +3 -1
  302. mindspore/ops/primitive.py +24 -18
  303. mindspore/parallel/_auto_parallel_context.py +68 -8
  304. mindspore/parallel/_cost_model_context.py +2 -2
  305. mindspore/parallel/_offload_context.py +17 -3
  306. mindspore/parallel/_parallel_serialization.py +12 -5
  307. mindspore/parallel/_ps_context.py +12 -0
  308. mindspore/parallel/_tensor.py +18 -13
  309. mindspore/parallel/_transformer/layers.py +5 -3
  310. mindspore/parallel/_transformer/loss.py +1 -0
  311. mindspore/parallel/_transformer/moe.py +2 -2
  312. mindspore/parallel/_transformer/op_parallel_config.py +12 -1
  313. mindspore/parallel/_transformer/transformer.py +23 -3
  314. mindspore/parallel/_utils.py +11 -7
  315. mindspore/parallel/algo_parameter_config.py +85 -5
  316. mindspore/parallel/checkpoint_transform.py +19 -12
  317. mindspore/parallel/shard.py +21 -14
  318. mindspore/pgodb140.dll +0 -0
  319. mindspore/pgort140.dll +0 -0
  320. mindspore/profiler/common/struct_type.py +3 -3
  321. mindspore/profiler/common/util.py +4 -2
  322. mindspore/profiler/envprofiling.py +1 -1
  323. mindspore/profiler/parser/aicpu_data_parser.py +5 -3
  324. mindspore/profiler/parser/ascend_flops_generator.py +2 -2
  325. mindspore/profiler/parser/ascend_fpbp_generator.py +1 -1
  326. mindspore/profiler/parser/ascend_hccl_generator.py +249 -12
  327. mindspore/profiler/parser/ascend_msprof_exporter.py +150 -255
  328. mindspore/profiler/parser/ascend_msprof_generator.py +204 -17
  329. mindspore/profiler/parser/ascend_op_generator.py +6 -6
  330. mindspore/profiler/parser/ascend_steptrace_generator.py +6 -4
  331. mindspore/profiler/parser/ascend_timeline_generator.py +14 -187
  332. mindspore/profiler/parser/base_timeline_generator.py +10 -8
  333. mindspore/profiler/parser/cpu_gpu_timeline_generator.py +16 -12
  334. mindspore/profiler/parser/flops_parser.py +15 -11
  335. mindspore/profiler/parser/framework_parser.py +38 -22
  336. mindspore/profiler/parser/hccl_parser.py +16 -12
  337. mindspore/profiler/parser/integrator.py +22 -11
  338. mindspore/profiler/parser/memory_usage_parser.py +2 -2
  339. mindspore/profiler/parser/minddata_analyzer.py +12 -14
  340. mindspore/profiler/parser/minddata_pipeline_parser.py +1 -1
  341. mindspore/profiler/parser/msadvisor_parser.py +8 -4
  342. mindspore/profiler/parser/op_intermediate_parser.py +5 -2
  343. mindspore/profiler/parser/optime_parser.py +1 -1
  344. mindspore/profiler/parser/profiler_info.py +21 -2
  345. mindspore/profiler/parser/step_trace_parser.py +11 -14
  346. mindspore/profiler/profiling.py +179 -89
  347. mindspore/rewrite/api/node.py +102 -19
  348. mindspore/rewrite/api/node_type.py +5 -1
  349. mindspore/rewrite/api/pattern_engine.py +1 -1
  350. mindspore/rewrite/api/scoped_value.py +9 -17
  351. mindspore/rewrite/api/symbol_tree.py +131 -47
  352. mindspore/rewrite/ast_helpers/__init__.py +2 -1
  353. mindspore/rewrite/ast_helpers/ast_finder.py +129 -0
  354. mindspore/rewrite/ast_helpers/ast_modifier.py +116 -104
  355. mindspore/rewrite/ast_transformers/flatten_recursive_stmt.py +93 -46
  356. mindspore/rewrite/common/rewrite_elog.py +5 -1
  357. mindspore/rewrite/namer.py +33 -24
  358. mindspore/rewrite/namespace.py +14 -5
  359. mindspore/{_extends/graph_kernel/expanders/complex → rewrite/node}/__init__.py +9 -9
  360. mindspore/rewrite/node/call_function.py +79 -0
  361. mindspore/rewrite/node/cell_container.py +135 -0
  362. mindspore/rewrite/node/control_flow.py +88 -0
  363. mindspore/rewrite/{node.py → node/node.py} +273 -234
  364. mindspore/rewrite/node/node_manager.py +254 -0
  365. mindspore/rewrite/{topological_manager.py → node/node_topological_manager.py} +13 -46
  366. mindspore/rewrite/parsers/arguments_parser.py +22 -21
  367. mindspore/rewrite/parsers/assign_parser.py +216 -221
  368. mindspore/rewrite/parsers/attribute_parser.py +9 -7
  369. mindspore/rewrite/parsers/class_def_parser.py +174 -113
  370. mindspore/rewrite/parsers/constant_parser.py +9 -6
  371. mindspore/rewrite/parsers/container_parser.py +9 -7
  372. mindspore/rewrite/parsers/for_parser.py +42 -21
  373. mindspore/rewrite/parsers/function_def_parser.py +24 -16
  374. mindspore/rewrite/parsers/if_parser.py +28 -24
  375. mindspore/rewrite/parsers/module_parser.py +196 -25
  376. mindspore/rewrite/{parser.py → parsers/parser.py} +4 -2
  377. mindspore/rewrite/{parser_register.py → parsers/parser_register.py} +1 -1
  378. mindspore/rewrite/parsers/return_parser.py +6 -6
  379. mindspore/rewrite/sparsify/sparse_transformer.py +12 -3
  380. mindspore/rewrite/sparsify/utils.py +1 -1
  381. mindspore/rewrite/symbol_tree.py +523 -578
  382. mindspore/rewrite/symbol_tree_builder.py +9 -193
  383. mindspore/rewrite/symbol_tree_dumper.py +2 -2
  384. mindspore/run_check/_check_version.py +6 -4
  385. mindspore/{ops/bprop_mindir → safeguard}/__init__.py +4 -3
  386. mindspore/safeguard/rewrite_obfuscation.py +541 -0
  387. mindspore/tbbmalloc.dll +0 -0
  388. mindspore/tinyxml2.dll +0 -0
  389. mindspore/train/_utils.py +7 -3
  390. mindspore/train/amp.py +323 -123
  391. mindspore/train/anf_ir_pb2.py +14 -2
  392. mindspore/train/callback/_backup_and_restore.py +2 -12
  393. mindspore/train/callback/_callback.py +29 -4
  394. mindspore/train/callback/_checkpoint.py +23 -8
  395. mindspore/train/callback/_early_stop.py +2 -2
  396. mindspore/train/callback/_landscape.py +4 -4
  397. mindspore/train/callback/_loss_monitor.py +2 -2
  398. mindspore/train/callback/_on_request_exit.py +2 -2
  399. mindspore/train/callback/_reduce_lr_on_plateau.py +3 -4
  400. mindspore/train/callback/_summary_collector.py +15 -8
  401. mindspore/train/callback/_time_monitor.py +58 -5
  402. mindspore/train/data_sink.py +5 -11
  403. mindspore/train/dataset_helper.py +84 -57
  404. mindspore/train/loss_scale_manager.py +2 -2
  405. mindspore/train/metrics/__init__.py +3 -3
  406. mindspore/train/metrics/cosine_similarity.py +1 -1
  407. mindspore/train/metrics/hausdorff_distance.py +3 -2
  408. mindspore/train/metrics/mean_surface_distance.py +3 -2
  409. mindspore/train/metrics/metric.py +39 -19
  410. mindspore/train/metrics/roc.py +2 -2
  411. mindspore/train/metrics/root_mean_square_surface_distance.py +4 -3
  412. mindspore/train/mind_ir_pb2.py +85 -36
  413. mindspore/train/model.py +187 -47
  414. mindspore/train/serialization.py +487 -161
  415. mindspore/train/summary/_summary_adapter.py +1 -1
  416. mindspore/train/summary/_writer_pool.py +3 -2
  417. mindspore/train/summary/summary_record.py +37 -17
  418. mindspore/train/train_thor/convert_utils.py +3 -3
  419. mindspore/train/train_thor/dataset_helper.py +1 -1
  420. mindspore/turbojpeg.dll +0 -0
  421. mindspore/vcmeta.dll +0 -0
  422. mindspore/vcruntime140.dll +0 -0
  423. mindspore/vcruntime140_1.dll +0 -0
  424. mindspore/version.py +1 -1
  425. {mindspore-2.1.0.dist-info → mindspore-2.2.11.dist-info}/METADATA +7 -4
  426. {mindspore-2.1.0.dist-info → mindspore-2.2.11.dist-info}/RECORD +429 -486
  427. mindspore/_extends/graph_kernel/expander.py +0 -80
  428. mindspore/_extends/graph_kernel/expanders/__init__.py +0 -54
  429. mindspore/_extends/graph_kernel/expanders/_utils.py +0 -269
  430. mindspore/_extends/graph_kernel/expanders/addn.py +0 -33
  431. mindspore/_extends/graph_kernel/expanders/batchnorm.py +0 -152
  432. mindspore/_extends/graph_kernel/expanders/batchnorm_grad.py +0 -105
  433. mindspore/_extends/graph_kernel/expanders/clip_by_norm_no_div_sum.py +0 -33
  434. mindspore/_extends/graph_kernel/expanders/complex/abs.py +0 -30
  435. mindspore/_extends/graph_kernel/expanders/complex/add.py +0 -44
  436. mindspore/_extends/graph_kernel/expanders/complex/div.py +0 -62
  437. mindspore/_extends/graph_kernel/expanders/complex/mul.py +0 -52
  438. mindspore/_extends/graph_kernel/expanders/complex/real_div.py +0 -62
  439. mindspore/_extends/graph_kernel/expanders/complex/sub.py +0 -45
  440. mindspore/_extends/graph_kernel/expanders/conv2d.py +0 -200
  441. mindspore/_extends/graph_kernel/expanders/dropout_grad.py +0 -30
  442. mindspore/_extends/graph_kernel/expanders/equal_count.py +0 -50
  443. mindspore/_extends/graph_kernel/expanders/erfc.py +0 -35
  444. mindspore/_extends/graph_kernel/expanders/expand_dims.py +0 -50
  445. mindspore/_extends/graph_kernel/expanders/fused_adam.py +0 -44
  446. mindspore/_extends/graph_kernel/expanders/fused_adam_weight_decay.py +0 -47
  447. mindspore/_extends/graph_kernel/expanders/fused_mul_add.py +0 -28
  448. mindspore/_extends/graph_kernel/expanders/gelu_grad.py +0 -70
  449. mindspore/_extends/graph_kernel/expanders/gkdropout.py +0 -40
  450. mindspore/_extends/graph_kernel/expanders/identity.py +0 -25
  451. mindspore/_extends/graph_kernel/expanders/layernorm.py +0 -93
  452. mindspore/_extends/graph_kernel/expanders/layernorm_grad.py +0 -113
  453. mindspore/_extends/graph_kernel/expanders/logsoftmax.py +0 -46
  454. mindspore/_extends/graph_kernel/expanders/logsoftmax_grad.py +0 -36
  455. mindspore/_extends/graph_kernel/expanders/matmul.py +0 -80
  456. mindspore/_extends/graph_kernel/expanders/maximum_grad.py +0 -59
  457. mindspore/_extends/graph_kernel/expanders/minimum_grad.py +0 -80
  458. mindspore/_extends/graph_kernel/expanders/oneslike.py +0 -26
  459. mindspore/_extends/graph_kernel/expanders/reduce_mean.py +0 -43
  460. mindspore/_extends/graph_kernel/expanders/relu_grad.py +0 -32
  461. mindspore/_extends/graph_kernel/expanders/sigmoid_cross_entropy_with_logits.py +0 -41
  462. mindspore/_extends/graph_kernel/expanders/sigmoid_cross_entropy_with_logits_grad.py +0 -35
  463. mindspore/_extends/graph_kernel/expanders/sigmoid_grad.py +0 -31
  464. mindspore/_extends/graph_kernel/expanders/slice.py +0 -35
  465. mindspore/_extends/graph_kernel/expanders/softmax_cross_entropy_with_logits.py +0 -42
  466. mindspore/_extends/graph_kernel/expanders/softmax_grad_ext.py +0 -41
  467. mindspore/_extends/graph_kernel/expanders/softsign.py +0 -28
  468. mindspore/_extends/graph_kernel/expanders/sqrt_grad.py +0 -29
  469. mindspore/_extends/graph_kernel/expanders/square_sum_all.py +0 -44
  470. mindspore/_extends/graph_kernel/expanders/square_sum_v1.py +0 -37
  471. mindspore/_extends/graph_kernel/expanders/squared_difference.py +0 -43
  472. mindspore/_extends/graph_kernel/expanders/tanh_grad.py +0 -31
  473. mindspore/_extends/graph_kernel/model/op_infer.py +0 -506
  474. mindspore/dataset/datapreprocess/__init__.py +0 -20
  475. mindspore/dataset/datapreprocess/preprocess_imagenet_validate_dataset.py +0 -54
  476. mindspore/include/api/net.h +0 -142
  477. mindspore/nn/lr_scheduler.py +0 -262
  478. mindspore/ops/_grad_experimental/grad_image_ops.py +0 -248
  479. mindspore/ops/_grad_experimental/grad_linalg_ops.py +0 -181
  480. mindspore/ops/_grad_experimental/grad_other_ops.py +0 -72
  481. mindspore/ops/_grad_experimental/grad_scalar_ops.py +0 -112
  482. mindspore/ops/_grad_experimental/grad_sequence_ops.py +0 -351
  483. mindspore/ops/_op_impl/_custom_op/flash_attention/__init__.py +0 -0
  484. mindspore/ops/_op_impl/_custom_op/flash_attention/attention.py +0 -350
  485. mindspore/ops/_op_impl/_custom_op/flash_attention/flash_attention_bwd.py +0 -409
  486. mindspore/ops/_op_impl/_custom_op/flash_attention/flash_attention_fwd.py +0 -578
  487. mindspore/ops/_op_impl/_custom_op/flash_attention/flash_attention_impl.py +0 -199
  488. mindspore/ops/_op_impl/_custom_op/flash_attention/tik_ops_utils.py +0 -446
  489. mindspore/ops/_op_impl/_custom_op/flash_attention/tiling_strategy/__init__.py +0 -0
  490. mindspore/ops/_op_impl/_custom_op/flash_attention/tiling_strategy/sparse_tiling.py +0 -45
  491. mindspore/ops/_op_impl/_custom_op/flash_attention/tiling_strategy/strategy.py +0 -67
  492. mindspore/ops/_op_impl/_custom_op/flash_attention/tiling_strategy/wukong_tiling.py +0 -62
  493. mindspore/ops/bprop_mindir/BNTrainingReduce_bprop.mindir +0 -0
  494. mindspore/ops/bprop_mindir/Broadcast_bprop.mindir +0 -0
  495. mindspore/ops/bprop_mindir/Depend_bprop.mindir +0 -0
  496. mindspore/ops/bprop_mindir/DepthwiseConv2dNative_bprop.mindir +0 -138
  497. mindspore/ops/bprop_mindir/EmbeddingLookup_bprop.mindir +0 -0
  498. mindspore/ops/bprop_mindir/Load_bprop.mindir +0 -0
  499. mindspore/ops/bprop_mindir/ScatterNonAliasingAdd_bprop.mindir +0 -0
  500. mindspore/ops/bprop_mindir/SparseGatherV2_bprop.mindir +0 -0
  501. mindspore/ops/bprop_mindir/SparseSoftmaxCrossEntropyWithLogits_bprop.mindir +0 -0
  502. mindspore/ops/bprop_mindir/Switch_bprop.mindir +0 -0
  503. mindspore/ops/bprop_mindir/TransShape_bprop.mindir +0 -0
  504. mindspore/ops/bprop_mindir/TupleGetItem_bprop.mindir +0 -0
  505. mindspore/ops/bprop_mindir/Unique_bprop.mindir +0 -0
  506. mindspore/ops/bprop_mindir/Unstack_bprop.mindir +0 -0
  507. mindspore/ops/bprop_mindir/generate_mindir.py +0 -114
  508. mindspore/rewrite/node_visitor.py +0 -44
  509. {mindspore-2.1.0.dist-info → mindspore-2.2.11.dist-info}/WHEEL +0 -0
  510. {mindspore-2.1.0.dist-info → mindspore-2.2.11.dist-info}/entry_points.txt +0 -0
  511. {mindspore-2.1.0.dist-info → mindspore-2.2.11.dist-info}/top_level.txt +0 -0
@@ -1,4 +1,4 @@
-# Copyright 2020-2022 Huawei Technologies Co., Ltd
+# Copyright 2020-2023 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -438,7 +438,10 @@ class Softmax(Primitive):
 
     Inputs:
         - **logits** (Tensor) - Tensor of shape :math:`(N, *)`, where :math:`*` means, any number of
-          additional dimensions, with float16, float32 or float64(CPU, GPU) data type.
+          additional dimensions. Supported dtypes:
+
+          - Ascend: float16, float32.
+          - GPU/CPU: float16, float32, float64.
 
     Outputs:
         Tensor, with the same type and shape as the logits.
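The recurring pattern in this series of docstring hunks is the dtype split: Ascend keeps float16/float32 while GPU/CPU also accept float64. A minimal usage sketch on the universally supported float32 path (illustrative only, not part of the diff):

import mindspore
import numpy as np
from mindspore import Tensor, ops

# Softmax over the last axis; float32 is listed for all three backends above.
softmax = ops.Softmax(axis=-1)
logits = Tensor(np.array([1.0, 2.0, 3.0, 4.0, 5.0]), mindspore.float32)
probs = softmax(logits)
print(probs.sum())  # ~1.0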
@@ -517,7 +520,11 @@ class Softplus(Primitive):
         \text{output} = \log(1 + \exp(\text{x}))
 
     Inputs:
-        - **input_x** (Tensor) - Tensor of any dimension, with float16, float32 or float64(CPU, GPU) data type.
+        - **input_x** (Tensor) - Tensor of any dimension.
+          Supported dtypes:
+
+          - GPU/CPU: float16, float32, float64.
+          - Ascend: float16, float32.
 
     Outputs:
         Tensor, with the same type and shape as the `input_x`.
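The hunk keeps the defining formula output = log(1 + exp(x)) as context, so the reworded dtype table can be sanity-checked numerically. A short illustrative sketch; np.logaddexp(0, x) is the overflow-safe form of log(1 + exp(x)):

import mindspore
import numpy as np
from mindspore import Tensor, ops

x = np.array([0.1, 0.2, 30.0, 25.0], dtype=np.float32)
out = ops.Softplus()(Tensor(x, mindspore.float32))
expected = np.logaddexp(0.0, x).astype(np.float32)  # stable log(1 + exp(x))
assert np.allclose(out.asnumpy(), expected, rtol=1e-3)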
@@ -626,7 +633,7 @@ class ReLUV3(Primitive):
     Inputs:
         - **input_x** (Tensor) - Tensor of shape :math:`(N, *)`, where :math:`*` means, any number of
           additional dimensions, data type is
-          `number <https://www.mindspore.cn/docs/en/r2.1/api_python/mindspore.html#mindspore.dtype>`_.
+          `number <https://www.mindspore.cn/docs/en/r2.2/api_python/mindspore.html#mindspore.dtype>`_.
 
     Outputs:
         Tensor of shape :math:`(N, *)`, with the same type and shape as the `input_x`.
@@ -659,7 +666,11 @@ class Mish(PrimitiveWithInfer):
     Refer to :func:`mindspore.ops.mish` for more details.
 
     Inputs:
-        - **x** (Tensor) - The input Tensor with float16, float32 or float64 data type.
+        - **x** (Tensor) - The input Tensor.
+          Supported dtypes:
+
+          - GPU/CPU: float16, float32, float64.
+          - Ascend: float16, float32.
 
     Outputs:
         Tensor, with the same type and shape as the `x`.
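The Mish entry now defers entirely to :func:`mindspore.ops.mish` for the definition; assuming the usual mish(x) = x * tanh(softplus(x)), an illustrative cross-check:

import mindspore
import numpy as np
from mindspore import Tensor, ops

x = np.array([[-1.0, 4.0, -8.0], [2.0, -5.0, 9.0]], dtype=np.float32)
out = ops.Mish()(Tensor(x, mindspore.float32))
expected = x * np.tanh(np.logaddexp(0.0, x))  # x * tanh(softplus(x))
assert np.allclose(out.asnumpy(), expected, rtol=1e-3)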
@@ -745,7 +756,9 @@ class ReLU6(PrimitiveWithCheck):
     Refer to :func:`mindspore.ops.relu6` for more details.
 
     Inputs:
-        - **input_x** (Tensor) - Input Tensor of float16 or float32 data type.
+        - **input_x** (Tensor) - Tensor of shape :math:`(N, *)`,
+          where :math:`*` means any number of additional dimensions.
+          Data type must be float16, float32.
 
     Outputs:
         Tensor, with the same type and shape as the `input_x`.
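ReLU6 is a clamp to [0, 6], so the reworded input spec is easy to verify against NumPy (illustrative sketch):

import mindspore
import numpy as np
from mindspore import Tensor, ops

x = np.array([-1.0, 0.5, 3.0, 7.5], dtype=np.float32)
out = ops.ReLU6()(Tensor(x, mindspore.float32))
assert np.allclose(out.asnumpy(), np.clip(x, 0.0, 6.0))  # min(max(x, 0), 6)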
@@ -1216,54 +1229,6 @@ class InstanceNormV2(Primitive):
         validator.check_bool(is_training, "is_training", self.name)
 
 
-class BNTrainingReduce(Primitive):
-    """
-    The BNTrainingReduce interface is deprecated, please use the :class:`mindspore.ops.BatchNorm` instead.
-
-    Supported Platforms:
-        Deprecated
-    """
-
-    @deprecated("1.5", "ops.BatchNorm", False)
-    @prim_attr_register
-    def __init__(self, data_format="NCHW"):
-        """Initialize BNTrainingReduce."""
-        self.init_prim_io_names(inputs=['x'], outputs=['sum', 'square_sum'])
-        self.format = validator.check_string(data_format, ['NCHW', 'NHWC'], 'format', self.name)
-        if context.get_context("device_target") != "GPU" and self.format == "NHWC":
-            raise ValueError(f"For '{self.name}', the 'NHWC' format is only supported in GPU target, "
-                             f"but got the 'data_format' is {self.format} and "
-                             f"the platform is {context.get_context('device_target')}.")
-        self.add_prim_attr('data_format', self.format)
-
-
-class BNTrainingUpdate(Primitive):
-    """
-    The BNTrainingUpdate interface is deprecated, please use the :class:`mindspore.ops.BatchNorm` instead.
-
-    Supported Platforms:
-        Deprecated
-    """
-
-    @deprecated("1.5", "ops.BatchNorm", False)
-    @prim_attr_register
-    def __init__(self, isRef=True, epsilon=1e-5, factor=0.1, data_format="NCHW"):
-        """Initialize BNTrainingUpdate."""
-        self.init_prim_io_names(inputs=['x', 'sum', 'square_sum', 'scale', 'b', 'mean', 'variance'],
-                                outputs=['y', 'running_mean', 'running_variance', 'save_mean', 'save_inv_variance'])
-        validator.check_value_type("isRef", isRef, [bool], self.name)
-        validator.check_value_type("epsilon", epsilon, [float], self.name)
-        validator.check_value_type("factor", factor, [float], self.name)
-        self.epsilon = validator.check_float_range(epsilon, 0, 1, validator.INC_RIGHT, 'epsilon', 'BNTrainingUpdate')
-        self.factor = validator.check_float_range(factor, 0, 1, validator.INC_BOTH, 'factor', 'BNTrainingUpdate')
-        self.format = validator.check_string(data_format, ['NCHW', 'NHWC'], 'format', self.name)
-        if context.get_context("device_target") != "GPU" and self.format == "NHWC":
-            raise ValueError(f"For '{self.name}', the 'NHWC' format is only supported in GPU target, "
-                             f"but got the 'data_format' is {self.format} and "
-                             f"the platform is {context.get_context('device_target')}.")
-        self.add_prim_attr('data_format', self.format)
-
-
 class BatchNorm(PrimitiveWithInfer):
     r"""
     Batch Normalization for input data and updated parameters.
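Both deleted primitives had been deprecated since 1.5, and their docstrings already pointed at :class:`mindspore.ops.BatchNorm`. A minimal inference-mode replacement sketch (shapes picked for illustration only):

import mindspore
import numpy as np
from mindspore import Tensor, ops

# NCHW input with 3 channels; scale/bias/mean/variance are per-channel.
x = Tensor(np.ones([2, 3, 4, 4]), mindspore.float32)
scale = Tensor(np.ones([3]), mindspore.float32)
bias = Tensor(np.zeros([3]), mindspore.float32)
mean = Tensor(np.zeros([3]), mindspore.float32)
variance = Tensor(np.ones([3]), mindspore.float32)
batch_norm = ops.BatchNorm(is_training=False, epsilon=1e-5)
y = batch_norm(x, scale, bias, mean, variance)[0]  # first output is the normalized tensor
print(y.shape)  # (2, 3, 4, 4)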
@@ -1400,33 +1365,40 @@ class Conv2D(Primitive):
     2D convolution layer.
 
     Applies a 2D convolution over an input tensor which is typically of shape :math:`(N, C_{in}, H_{in}, W_{in})`,
-    where :math:`N` is batch size, :math:`C` is channel number, :math:`H` is height, :math:`W` is width,
-    :math:`X_i` is
-    the :math:`i^{th}` input value and :math:`b_i` indicates the deviation value of the :math:`i^{th}` input value.
-    For each batch of shape :math:`(C_{in}, H_{in}, W_{in})`, the formula is defined as:
+    where :math:`N` is batch size, :math:`C` is channel number, :math:`H` is feature height, :math:`W` is feature width.
+
+    The output is calculated based on formula:
 
     .. math::
 
-        out_j = \sum_{i=0}^{C_{in} - 1} ccor(W_{ij}, X_i) + b_j,
-
-    where :math:`ccor` is the cross correlation operator, :math:`C_{in}` is the input channel number, :math:`j` ranges
-    from :math:`0` to :math:`C_{out} - 1`, :math:`W_{ij}` corresponds to the :math:`i`-th channel of the :math:`j`-th
-    filter and :math:`out_{j}` corresponds to the :math:`j`-th channel of the output. :math:`W_{ij}` is a slice
-    of kernel and it has shape :math:`(\text{kernel_size[0]}, \text{kernel_size[1]})`,
-    where :math:`\text{kernel_size[0]}` and :math:`\text{kernel_size[1]}` are the height and width of the
-    convolution kernel. The full kernel has shape
-    :math:`(C_{out}, C_{in} / \text{group}, \text{kernel_size[0]}, \text{kernel_size[1]})`,
-    where group is the group number to split the input in the channel dimension.
-
-    If the 'pad_mode' is set to be "pad", the output height and width will be
-    :math:`\left \lfloor{1 + \frac{H_{in} + \text{padding[0]} + \text{padding[1]} - \text{kernel_size[0]} -
-    (\text{kernel_size[0]} - 1) \times (\text{dilation[0]} - 1) }{\text{stride[0]}}} \right \rfloor` and
-    :math:`\left \lfloor{1 + \frac{W_{in} + \text{padding[2]} + \text{padding[3]} - \text{kernel_size[1]} -
-    (\text{kernel_size[1]} - 1) \times (\text{dilation[1]} - 1) }{\text{stride[1]}}} \right \rfloor` respectively.
-    Where :math:`dilation` is Spacing between kernel elements, :math:`stride` is The step length of each step,
-    :math:`padding` is zero-padding added to both sides of the input.
-
-    The first introduction can be found in paper `Gradient Based Learning Applied to Document Recognition
+        \text{out}(N_i, C_{\text{out}_j}) = \text{bias}(C_{\text{out}_j}) +
+        \sum_{k = 0}^{C_{in} - 1} \text{ccor}({\text{weight}(C_{\text{out}_j}, k), \text{X}(N_i, k)})
+
+    where :math:`bias` is the output channel bias, :math:`ccor` is
+    the `cross-correlation <https://en.wikipedia.org/wiki/Cross-correlation>`_,
+    , :math:`weight` is the convolution kernel value and :math:`X` represents the input feature map.
+
+    Here are the indices' meanings:
+    - :math:`i` corresponds to the batch number, ranging from 0 to N-1, where N is the batch size of the input.
+
+    - :math:`j` corresponds to the output channel, ranging from 0 to C_{out}-1, where C_{out} is the number of
+      output channels, which is also equal to the number of kernels.
+
+    - :math:`k` corresponds to the input channel, ranging from 0 to C_{in}-1, where C_{in} is the number of
+      input channels, which is also equal to the number of channels in the convolutional kernels.
+
+    Therefore, in the above formula, :math:`{bias}(C_{out_j})` represents the bias of the :math:`j`-th
+    output channel, :math:`{weight}(C_{out_j}, k)` represents the slice of the :math:`j`-th convolutional
+    kernel in the :math:`k`-th channel, and :math:`{X}(N_i, k)` represents the slice of the :math:`k`-th input
+    channel in the :math:`i`-th batch of the input feature map.
+
+    The shape of the convolutional kernel is given by :math:`(kernel\_size[0], kernel\_size[1])`,
+    where :math:`kernel\_size[0]` and :math:`kernel\_size[1]` are the height and width of the kernel, respectively.
+    If we consider the input and output channels as well as the `group` parameter, the complete kernel shape
+    will be :math:`(C_{out}, C_{in} / \text{group}, \text{kernel_size[0]}, \text{kernel_size[1]})`,
+    where `group` is the number of groups dividing `x`'s input channel when applying group convolution.
+
+    For more details about convolution layer, please refer to `Gradient Based Learning Applied to Document Recognition
     <http://vision.stanford.edu/cs598_spring07/papers/Lecun98.pdf>`_.
 
     Note:
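One side effect of this rewrite: the explicit 'pad'-mode output-size formula disappears from the docstring (the new text defers to :class:`mindspore.nn.Conv2d`). The removed formula translates directly into a small helper; the names here are ours, not MindSpore's:

import math

def conv2d_pad_out_size(size, pad_before, pad_after, kernel, stride, dilation):
    # floor(1 + (size + pads - kernel - (kernel - 1) * (dilation - 1)) / stride),
    # the "pad"-mode expression deleted above.
    effective_kernel = kernel + (kernel - 1) * (dilation - 1)
    return math.floor(1 + (size + pad_before + pad_after - effective_kernel) / stride)

# Reproduces case 2 of the examples further down: H_in=32, pad=(4, 10), k=3, s=1, d=1.
print(conv2d_pad_out_size(32, 4, 10, 3, 1, 1))  # 44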
@@ -1434,57 +1406,72 @@ class Conv2D(Primitive):
         That is, when `group>1`, condition `in\_channels` = `out\_channels` = `group` must be satisfied.
 
     Args:
-        out_channel (int): The number of output channel :math:`C_{out}`.
-        kernel_size (Union[int, tuple[int]]): The data type is int or a tuple of 2 integers. Specifies the height
-            and width of the 2D convolution window. Single int means the value is for both the height and the width of
-            the kernel. A tuple of 2 ints means the first value is for the height and the other is for the
-            width of the kernel.
-        mode (int): Modes for different convolutions. The value is currently not used. Default: ``1`` .
-        pad_mode (str): Specifies padding mode. The optional values are
-            ``"same"`` , ``"valid"`` and ``"pad"`` . Default: ``"valid"`` .
-
-            - ``"same"``: Adopts the way of completion. The height and width of the output will be equal to
-              the input `x` divided by stride. The padding will be evenly calculated in top and bottom,
-              left and right possiblily.
-              Otherwise, the last extra padding will be calculated from the bottom and the right side.
+        out_channel (int): Specifies output channel :math:`C_{out}`.
+        kernel_size (Union[int, tuple[int]]): Specifies the height and width of the 2D convolution kernel.
+            It can be a single int or a tuple of 2 integers. A single int means the value is for both the height
+            and the width. A tuple of 2 ints means the first value is for the height and the other is for the width.
+        mode (int, optional): Modes for different convolutions. The value is currently not used. Default: ``1`` .
+        pad_mode (str, optional): Specifies the padding mode with a padding value of 0. It can be set to:
+            ``"same"`` , ``"valid"`` or ``"pad"`` . Default: ``"valid"`` .
+
+            - ``"same"``: Pad the input around its edges so that the shape of input and output
+              are the same when `stride` is set to ``1``.
+              The amount of padding to is calculated by the operator internally, If the amount is even, it is
+              uniformly distributed around the input, if it is odd, the excess amount goes to the right/bottom side.
              If this mode is set, `pad` must be 0.
-
-            - ``"valid"``: Adopts the way of discarding. The possible largest height and width of output will be
-              returned without padding. Extra pixels will be discarded. If this mode is set, `pad` must be 0.
-
-            - ``"pad"``: Implicit paddings on both sides of the input `x`. The number of `pad` will be padded to the
-              input Tensor borders. `pad` must be greater than or equal to 0.
-        pad (Union(int, tuple[int])): Implicit paddings on both sides of the input `x`. If `pad` is one integer,
-            the paddings of top, bottom, left and right are the same, equal to pad. If `pad` is a tuple
-            with four integers, the paddings of top, bottom, left and right will be equal to pad[0],
-            pad[1], pad[2], and pad[3] accordingly. Default: ``0`` .
-        stride (Union(int, tuple[int])): The distance of kernel moving, an int number that represents
-            the height and width of movement are both strides, or a tuple of two or four int numbers that
-            represent height and width of movement respectively. Default: ``1`` .
-        dilation (Union(int, tuple[int])): The data type is int or a tuple of 2 or 4 integers. Specifies the dilation
-            rate to use for dilated convolution. If set to be :math:`k > 1`, there will
-            be :math:`k - 1` pixels skipped for each sampling location. Its value must
-            be greater than or equal to 1 and bounded by the height and width of the
-            input `x`. Default: ``1`` .
-        group (int): Splits input into groups. Default: ``1`` .
-        data_format (str): The optional value for data format, is ``'NHWC'`` or ``'NCHW'`` . Default: ``"NCHW"`` .
+            - ``"valid"``: No padding is applied to the input, and the output returns the maximum
+              possible height and width. Extra pixels that could not complete a full stride will
+              be discarded. If this mode is set, `pad` must be 0.
+            - ``"pad"``: Pad the input with a specified amount. In this mode, the amount of padding
+              in the height and width directions is determined by the `pad` parameter.
+              If this mode is set, `pad` must be greater than or equal to 0.
+
+        pad (Union(int, tuple[int]), optional): Specifies the amount of padding to apply on input
+            when `pad_mode` is set to ``"pad"``. It can be a single int or a tuple of 4 ints.
+            If `pad` is one integer, the paddings of top, bottom, left and right are the same, equal to `pad`.
+            If `pad` is a tuple with four integers, the paddings of top, bottom, left and right will be equal to pad[0],
+            pad[1], pad[2], and pad[3] accordingly. Default: ``0`` .
+        stride (Union(int, tuple[int]), optional): Specifies the stride of the convolution kernel's movement.
+            It can be a single int or a tuple of two or four ints. A single int means the stride is the same in
+            both the height and width directions. A tuple of two ints indicates the strides in the height and
+            width directions, respectively. For a tuple of four ints, the two ints correspond to (N, C) dimension
+            are treated as 1, and the two correspond to (H, W) dimensions is the step size in the height
+            and width directions respectively. Default: ``1`` .
+        dilation (Union(int, tuple[int]), optional): Specifies the dilation rate to use for dilated convolution.
+            It can be a single int or a tuple of 2 or 4 integers. A single int means the dilation size is the same
+            in both the height and width directions. A tuple of two ints represents the dilation size in
+            the height and width directions, respectively. For a tuple of four ints, the two ints correspond
+            to (N, C) dimension are treated as 1, and the two correspond to (H, W) dimensions is the
+            dilation size in the height and width directions respectively.
+            Assuming :math:`dilation=(d0, d1)`, the convolutional kernel samples the input with a
+            spacing of :math:`d0-1` elements in the height direction and :math:`d1-1` elements in the width direction.
+            The values in the height and width dimensions are in the ranges [1, H] and [1, W], respectively.
+            Default: ``1`` .
+        group (int, optional): Specifies the number of groups dividing `x`'s input channel when applying
+            group convolution. Default: ``1`` .
+        data_format (str, optional): The optional value for data format, is ``'NHWC'`` or ``'NCHW'`` .
+            Default: ``"NCHW"`` .
 
     Inputs:
-        - **x** (Tensor) - Tensor of shape :math:`(N, C_{in}, H_{in}, W_{in})`.
-        - **weight** (Tensor) - Set size of kernel is :math:`(\text{kernel_size[0]}, \text{kernel_size[1]})`,
-          then the shape is :math:`(C_{out}, C_{in}, \text{kernel_size[0]}, \text{kernel_size[1]})`.
+        - **x** (Tensor) - Input tensor of shape :math:`(N, C_{in}, H_{in}, W_{in})` or
+          :math:`(N, H_{in}, W_{in}, C_{in}, )` depending on `data_format` .
+        - **weight** (Tensor) - The convolutional kernel value, it should has shape
+          :math:`(C_{out}, C_{in} / \text{group}, \text{kernel_size[0]}, \text{kernel_size[1]})` .
 
     Outputs:
-        Tensor, the value that applied 2D convolution. The shape is :math:`(N, C_{out}, H_{out}, W_{out})`.
+        Tensor, the value that applied 2D convolution. The shape is :math:`(N, C_{out}, H_{out}, W_{out})`
+        or :math:`(N, H_{out}, W_{out}, C_{out}, )`.
+        To see how different pad modes affect the output shape, please refer to
+        :class:`mindspore.nn.Conv2d` for more details.
 
     Raises:
         TypeError: If `kernel_size`, `stride`, `pad` or `dilation` is neither an int nor a tuple.
         TypeError: If `out_channel` or `group` is not an int.
         ValueError: If `kernel_size`, `stride` or `dilation` is less than 1.
-        ValueError: If `pad_mode` is not one of 'same', 'valid' or 'pad'.
+        ValueError: If `pad_mode` is not one of ``'same'``, ``'valid'`` or ``'pad'``.
         ValueError: If `pad` is a tuple whose length is not equal to 4.
-        ValueError: If `pad_mode` it not equal to 'pad' and `pad` is not equal to (0, 0, 0, 0).
-        ValueError: If `data_format` is neither 'NCHW' nor 'NHWC'.
+        ValueError: If `pad_mode` it not equal to ``'pad'`` and `pad` is not equal to ``(0, 0, 0, 0)``.
+        ValueError: If `data_format` is neither ``'NHWC'`` nor ``'NCHW'`` .
 
     Supported Platforms:
         ``Ascend`` ``GPU`` ``CPU``
@@ -1493,12 +1480,49 @@ class Conv2D(Primitive):
         >>> import mindspore
         >>> import numpy as np
         >>> from mindspore import Tensor, ops
+        >>> # case 1: All parameters use default values.
         >>> x = Tensor(np.ones([10, 32, 32, 32]), mindspore.float32)
         >>> weight = Tensor(np.ones([32, 32, 3, 3]), mindspore.float32)
         >>> conv2d = ops.Conv2D(out_channel=32, kernel_size=3)
         >>> output = conv2d(x, weight)
         >>> print(output.shape)
         (10, 32, 30, 30)
+        >>> # case 2: pad_mode="pad", other parameters being default.
+        >>> x = Tensor(np.ones([10, 32, 32, 32]), mindspore.float32)
+        >>> weight = Tensor(np.ones([32, 32, 3, 3]), mindspore.float32)
+        >>> conv2d = ops.Conv2D(out_channel=32, kernel_size=3, pad_mode="pad", pad=(4, 10, 4, 10))
+        >>> output = conv2d(x, weight)
+        >>> print(output.shape)
+        (10, 32, 44, 44)
+        >>> # case 3: stride=(2, 4), other parameters being default.
+        >>> x = Tensor(np.ones([10, 32, 32, 32]), mindspore.float32)
+        >>> weight = Tensor(np.ones([32, 32, 3, 3]), mindspore.float32)
+        >>> conv2d = ops.Conv2D(out_channel=32, kernel_size=3, stride=(2, 4))
+        >>> output = conv2d(x, weight)
+        >>> print(output.shape)
+        (10, 32, 15, 8)
+        >>> # case 4: dilation=2, other parameters being default.
+        >>> x = Tensor(np.ones([10, 32, 32, 32]), mindspore.float32)
+        >>> weight = Tensor(np.ones([32, 32, 3, 3]), mindspore.float32)
+        >>> conv2d = ops.Conv2D(out_channel=32, kernel_size=3, dilation=2)
+        >>> output = conv2d(x, weight)
+        >>> print(output.shape)
+        (10, 32, 28, 28)
+        >>> # case 5: group=2, other parameters being default.
+        >>> x = Tensor(np.ones([10, 64, 32, 32]), mindspore.float32)
+        >>> weight = Tensor(np.ones([32, 32, 3, 3]), mindspore.float32)
+        >>> conv2d = ops.Conv2D(out_channel=32, kernel_size=3, group=2)
+        >>> output = conv2d(x, weight)
+        >>> print(output.shape)
+        (10, 32, 30, 30)
+        >>> # case 6: All parameters are specified.
+        >>> x = Tensor(np.ones([10, 64, 32, 32]), mindspore.float32)
+        >>> weight = Tensor(np.ones([32, 32, 3, 3]), mindspore.float32)
+        >>> conv2d = ops.Conv2D(out_channel=32, kernel_size=3, pad_mode="pad",
+        ...                     pad=(4, 10, 4, 10), stride=(2, 4), dilation=2, group=2)
+        >>> output = conv2d(x, weight)
+        >>> print(output.shape)
+        (10, 32, 21, 11)
     """
 
     @prim_attr_register
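The printed shapes in the new example cases all follow from standard convolution arithmetic; for 'valid' mode the output size is floor((size - effective_kernel) / stride) + 1. A quick check of cases 1, 3 and 4 (hypothetical helper, assuming that formula):

import math

def conv2d_valid_out_size(size, kernel, stride, dilation=1):
    # "valid": largest output without padding; leftover pixels are discarded.
    effective_kernel = kernel + (kernel - 1) * (dilation - 1)
    return math.floor((size - effective_kernel) / stride) + 1

print(conv2d_valid_out_size(32, 3, 1))                                   # case 1: 30
print(conv2d_valid_out_size(32, 3, 2), conv2d_valid_out_size(32, 3, 4))  # case 3: 15 8
print(conv2d_valid_out_size(32, 3, 1, dilation=2))                       # case 4: 28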
@@ -1779,8 +1803,13 @@ class _Pool(PrimitiveWithInfer):
             out_w = math.ceil(input_w / stride_w)
         out_shape = [batch, channel, out_h, out_w] if self.format == "NCHW" else [batch, out_h, out_w, channel]
 
-        for shape_value in out_shape:
-            if shape_value <= 0 and shape_value != -1:
+        is_dynamic_shape = False
+        for in_shape_val in x_shape_norm:
+            if in_shape_val == -1:
+                is_dynamic_shape = True
+
+        for out_shape_val in out_shape:
+            if out_shape_val <= 0 and not is_dynamic_shape:
                 raise ValueError(f"For '{self.name}', the each element of the output shape must be larger than 0, "
                                  f"but got output shape: {out_shape}. The input shape: {x_shape}, "
                                  f"kernel size: {self.kernel_size}, strides: {self.strides}."
@@ -1814,22 +1843,26 @@ class MaxPool(_Pool):
         strides (Union[int, tuple[int]]): The distance of kernel moving, an int number that represents
             not only the height of movement but also the width of movement, or a tuple of two int numbers that
             represent height and width of movement respectively. Default: ``1`` .
-        pad_mode (str): The optional value of pad mode is ``"same"`` or ``"valid"`` .
-            Default: ``"valid"`` .
+        pad_mode (str, optional): Specifies the padding mode with a padding value of 0. It can be set to:
+            ``"same"`` or ``"valid"`` . Default: ``"valid"`` .
 
-            - ``"same"``: Adopts the way of completion. The height and width of the output will be the same
-              as the input. The total number of padding will be calculated in horizontal and vertical
-              directions and evenly distributed to top, bottom, left and right if possible.
-              Otherwise, the last extra padding will be done from the bottom and the right side.
+            - ``"same"``: Pad the input around its edges so that the shape of input and output
+              are the same when `stride` is set to ``1``.
+              The amount of padding to is calculated by the operator internally, If the amount is even, it is
+              uniformly distributed around the input, if it is odd, the excess amount goes to the right/bottom side.
+            - ``"valid"``: No padding is applied to the input, and the output returns the maximum
+              possible height and width. Extra pixels that could not complete a full stride will
+              be discarded.
 
-            - ``"valid"``: Adopts the way of discarding. The possible largest height and width of output
-              will be returned without padding. Extra pixels will be discarded.
         data_format (str) : The optional value for data format, is ``'NHWC'`` or ``'NCHW'`` .
             Default: ``'NCHW'`` .
 
     Inputs:
         - **x** (Tensor) - Tensor of shape :math:`(N, C_{in}, H_{in}, W_{in})`.
-          Supported dtypes: float16, float32, float64.
+          Supported dtypes:
+
+          - CPU: float16, float32, float64.
+          - GPU/Ascend: float16, float32.
 
     Outputs:
         Tensor, with shape :math:`(N, C_{out}, H_{out}, W_{out})`.
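The MaxPool primitive's call signature is untouched by the docstring rework; a minimal example on the float32 path supported by all three backends (illustrative, not part of the diff):

import mindspore
import numpy as np
from mindspore import Tensor, ops

x = Tensor(np.arange(1 * 2 * 4 * 4).reshape((1, 2, 4, 4)), mindspore.float32)
max_pool = ops.MaxPool(kernel_size=2, strides=2, pad_mode="valid")
output = max_pool(x)
print(output.shape)  # (1, 2, 2, 2)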
@@ -1887,16 +1920,17 @@ class MaxPoolV1(Primitive):
         strides (Union[int, tuple[int]]): The distance of kernel moving, an integer that represents
             the height and width of movement are both strides, or a tuple of two integers that
             represent height and width of movement, respectively. Default: ``1`` .
-        pad_mode (str): The optional value for pad mode, is ``"same"`` or ``"valid"`` .
-            Default: ``"valid"`` .
+        pad_mode (str, optional): Specifies the padding mode with a padding value of 0. It can be set to:
+            ``"same"`` or ``"valid"`` . Default: ``"valid"`` .
 
-            - ``"same"``: Adopts the way of completion. The height and width of the output will be the same
-              as the input. The number of padding will be calculated in horizontal and vertical
-              directions, and evenly distributed to top and bottom, left and right if possible.
-              Otherwise, the extra padding will be done from the bottom and the right side.
+            - ``"same"``: Pad the input around its edges so that the shape of input and output
+              are the same when `stride` is set to ``1``.
+              The amount of padding to is calculated by the operator internally, If the amount is even, it is
+              uniformly distributed around the input, if it is odd, the excess amount goes to the right/bottom side.
+            - ``"valid"``: No padding is applied to the input, and the output returns the maximum
+              possible height and width. Extra pixels that could not complete a full stride will
+              be discarded.
 
-            - ``"valid"``: Adopts the way of discarding. The possible largest height and width of the
-              output will be returned without padding. Extra pixels will be discarded.
         data_format (str) : The optional value for data format, is ``'NCHW'`` or ``'NHWC'`` .
             Default: ``'NCHW'`` .
 
@@ -1957,55 +1991,6 @@
         self.add_prim_attr("strides", strides_adapted)
 
 
-class MaxPoolWithArgmax(Primitive):
-    r"""
-    :class:`mindspore.ops.MaxPoolWithArgmax` is deprecated from version 2.0 and will be removed in a future version,
-    use :class:`mindspore.ops.MaxPoolWithArgmaxV2` instead.
-
-    Supported Platforms:
-        Deprecated
-
-    Examples:
-        >>> import mindspore
-        >>> import numpy as np
-        >>> from mindspore import Tensor, ops
-        >>> x = Tensor(np.arange(1 * 3 * 3 * 4).reshape((1, 3, 3, 4)), mindspore.float32)
-        >>> maxpool_arg_op = ops.MaxPoolWithArgmax(pad_mode="VALID", kernel_size=2, strides=1)
-        >>> output_tensor, argmax = maxpool_arg_op(x)
-        >>> print(output_tensor)
-        [[[[ 5.  6.  7.]
-           [ 9. 10. 11.]]
-          [[17. 18. 19.]
-           [21. 22. 23.]]
-          [[29. 30. 31.]
-           [33. 34. 35.]]]]
-    """
-
-    @deprecated("2.0", "ops.MaxPoolWithArgmaxV2", False)
-    @prim_attr_register
-    def __init__(self, kernel_size=1, strides=1, pad_mode="valid", data_format="NCHW"):
-        """Initialize MaxPoolWithArgmax."""
-        self.init_prim_io_names(inputs=['x'], outputs=['output', 'mask'])
-        validator.check_value_type('kernel_size', kernel_size, [int, tuple], self.name)
-        validator.check_value_type('strides', strides, [int, tuple], self.name)
-        validator.check_value_type('pad_mode', pad_mode, [str], self.name)
-        self.pad_mode = validator.check_string(pad_mode.upper(), ['VALID', 'SAME'], 'pad_mode', self.name)
-        self.add_prim_attr("pad_mode", self.pad_mode)
-        self.format = validator.check_string(data_format, ['NCHW', 'NHWC'], 'format', self.name)
-        if context.get_context("device_target") != "GPU" and self.format == "NHWC":
-            raise ValueError(f"For '{self.name}', the 'NHWC' format is only supported in GPU target, "
-                             f"but got the 'data_format' is {self.format} and "
-                             f"the platform is {context.get_context('device_target')}.")
-        self.kernel_size = _check_positive_int_or_tuple(
-            "kernel_size", kernel_size, self.name, allow_four=False, ret_four=True)
-        self.kernel_size = (1, self.kernel_size[-2], self.kernel_size[-1], 1)
-        self.add_prim_attr("kernel_size", self.kernel_size)
-
-        self.strides = _check_positive_int_or_tuple("strides", strides, self.name, allow_four=False, ret_four=True)
-        self.strides = (1, self.strides[-2], self.strides[-1], 1)
-        self.add_prim_attr("strides", self.strides)
-
-
 class MaxPool3D(Primitive):
     r"""
     Applies a 3D max pooling over an input Tensor which can be regarded as a composition of 3D planes.
@@ -2026,19 +2011,21 @@ class MaxPool3D(Primitive):
2026
2011
  strides (Union[int, tuple[int]]): The distance of kernel moving, an int number that represents
2027
2012
  the depth, height and width of movement, or a tuple of three int numbers that
2028
2013
  represent depth, height and width of movement respectively. Default: ``1`` .
2029
- pad_mode (str): The optional value of pad mode is ``"SAME"`` , ``"VALID"`` or ``"PAD"`` .
2030
- Default: ``"VALID"`` .
2031
-
2032
- - ``"SAME"``: Adopts the way of completion. The height and width of the output will be the same
2033
- as the input. The total number of padding will be calculated in horizontal and vertical
2034
- directions and evenly distributed to top, bottom, left and right if possible.
2035
- Otherwise, the last extra padding will be done from the bottom and the right side.
2036
-
2037
- - ``"VALID"``: Adopts the way of discarding. The possible largest height and width of output
2038
- will be returned without padding. Extra pixels will be discarded.
2039
-
2040
- - ``"PAD"``: Implicit paddings on both sides of the input in depth, height and width. The number of
2041
- ``"PAD"`` will be padded to the input Tensor borders. "pad_list" must be greater than or equal to 0.
2014
+ pad_mode (str, optional): Specifies the padding mode with a padding value of 0. It can be set to:
2015
+ ``"SAME"`` , ``"VALID"`` or ``"PAD"`` . Default: ``"VALID"`` .
2016
+
2017
+ - ``"SAME"``: Pad the input around its depth/height/width dimension so that the shape of input and output
2018
+ are the same when `stride` is set to ``1``.
2019
+ The amount of padding is calculated by the operator internally. If the amount is even,
2020
+ it is uniformly distributed around the input; if it is odd, the excess amount goes
2021
+ to the front/right/bottom side.
2022
+ If this mode is set, `pad_list` must be 0.
2023
+ - ``"VALID"``: No padding is applied to the input, and the output returns the maximum
2024
+ possible depth, height and width. Extra pixels that could not complete a full stride will
2025
+ be discarded. If this mode is set, `pad_list` must be 0.
2026
+ - ``"PAD"``: Pad the input with a specified amount. In this mode, the amount of padding
2027
+ in the depth, height and width dimension is determined by the `pad_list` parameter.
2028
+ If this mode is set, `pad_list` must be greater than or equal to 0.
2042
2029
 
2043
2030
  pad_list (Union(int, tuple[int])): The pad value to be filled. Default: ``0`` . If `pad_list` is an integer, the
2044
2031
  paddings of head, tail, top, bottom, left and right are the same, equal to `pad_list`. If `pad_list` is a tuple of six
@@ -2347,14 +2334,17 @@ class AvgPool(Primitive):
2347
2334
  strides (Union[int, tuple[int]]): The distance of kernel moving, an int number that represents
2348
2335
  the height and width of movement are both strides, or a tuple of two int numbers that
2349
2336
  represent height and width of movement respectively. Default: ``1`` .
2350
- pad_mode (str, optional): The optional value for pad mode, is ``'same'`` or ``'valid'`` .
2351
- Default: ``'valid'`` .
2337
+ pad_mode (str, optional): Specifies the padding mode with a padding value of 0. It can be set to:
2338
+ ``"same"`` or ``"valid"`` . Default: ``"valid"`` .
2352
2339
 
2353
- - ``'same'``: The height and width of the output are the same as the input divided by 'strides'
2354
- and rounded up.
2340
+ - ``"same"``: Pad the input around its edges so that the shape of input and output
2341
+ are the same when `stride` is set to ``1``.
2342
+ The amount of padding is calculated by the operator internally. If the amount is even, it is
2343
+ uniformly distributed around the input; if it is odd, the excess amount goes to the right/bottom side.
2344
+ - ``"valid"``: No padding is applied to the input, and the output returns the maximum
2345
+ possible height and width. Extra pixels that could not complete a full stride will
2346
+ be discarded.
2355
2347
 
2356
- - ``'valid'``: Returns the output of the valid calculation without filling. Redundant pixels that
2357
- do not satisfy the calculation will be discarded.
2358
2348
  data_format (str, optional): The format of input and output data. It should be ``'NHWC'`` or ``'NCHW'`` .
2359
2349
  Default: ``'NCHW'`` .
2360
2350
 
@@ -2451,16 +2441,17 @@ class AvgPoolV1(Primitive):
2451
2441
  strides (Union[int, tuple[int]]): The distance of kernel moving, an integer that represents
2452
2442
  the height and width of movement are both strides, or a tuple of two integers that
2453
2443
  represent height and width of movement, respectively. Default: ``1`` .
2454
- pad_mode (str): The optional value for pad mode, should be one of ``"same"`` or ``"valid"`` .
2455
- Default: ``"valid"`` .
2444
+ pad_mode (str, optional): Specifies the padding mode with a padding value of 0. It can be set to:
2445
+ ``"same"`` or ``"valid"`` . Default: ``"valid"`` .
2456
2446
 
2457
- - ``"same"``: Adopts the way of completion. The height and width of output will be the same as
2458
- the input. The total number of padding will be calculated horizontally and vertically,
2459
- and evenly distributed to top and bottom, left and right if possible.
2460
- Otherwise, the last extra padding will be done from bottom and right.
2447
+ - ``"same"``: Pad the input around its edges so that the shape of input and output
2448
+ are the same when `stride` is set to ``1``.
2449
+ The amount of padding is calculated by the operator internally. If the amount is even, it is
2450
+ uniformly distributed around the input; if it is odd, the excess amount goes to the right/bottom side.
2451
+ - ``"valid"``: No padding is applied to the input, and the output returns the maximum
2452
+ possible height and width. Extra pixels that could not complete a full stride will
2453
+ be discarded.
2461
2454
 
2462
- - ``"valid"``: Adopts the way of discarding. The largest possible height and width of output
2463
- will be returned without padding. Extra pixels will be discarded.
2464
2455
  data_format (str): The format of input and output data. Should be ``'NHWC'`` or ``'NCHW'`` .
2465
2456
  Default: ``'NCHW'`` .
2466
2457
 
@@ -2708,8 +2699,21 @@ class Conv2DTranspose(Conv2DBackpropInput):
2708
2699
  Args:
2709
2700
  out_channel (int): The dimensionality of the output space.
2710
2701
  kernel_size (Union[int, tuple[int]]): The size of the convolution window.
2711
- pad_mode (str): Modes to fill padding. It could be ``"valid"`` , ``"same"`` , or ``"pad"`` .
2712
- Default: ``"valid"`` .
2702
+ pad_mode (str, optional): Specifies the padding mode with a padding value of 0. It can be set to:
2703
+ ``"same"`` , ``"valid"`` or ``"pad"`` . Default: ``"valid"`` .
2704
+
2705
+ - ``"same"``: Pad the input around its edges so that the shape of input and output
2706
+ are the same when `stride` is set to ``1``.
2707
+ The amount of padding is calculated by the operator internally. If the amount is even, it is
2708
+ uniformly distributed around the input; if it is odd, the excess amount goes to the right/bottom side.
2709
+ If this mode is set, `pad` must be 0.
2710
+ - ``"valid"``: No padding is applied to the input, and the output returns the maximum
2711
+ possible height and width. Extra pixels that could not complete a full stride will
2712
+ be discarded. If this mode is set, `pad` must be 0.
2713
+ - ``"pad"``: Pad the input with a specified amount. In this mode, the amount of padding
2714
+ in the height and width directions is determined by the `pad` parameter.
2715
+ If this mode is set, `pad` must be greater than or equal to 0.
2716
+
2713
2717
  Please refer to :class:`mindspore.nn.Conv2dTranspose` for more specifications about `pad_mode`.
2714
2718
  pad (Union[int, tuple[int]]): The pad value to be filled. Default: ``0`` . If `pad` is an integer, the paddings
2715
2719
  of top, bottom, left and right are the same, equal to pad. If `pad` is a tuple of four integers,
@@ -2779,9 +2783,13 @@ class BiasAdd(Primitive):
2779
2783
  Default: ``"NCHW"`` .
2780
2784
 
2781
2785
  Inputs:
2782
- - **input_x** (Tensor) - The input tensor. The shape can be 2-5 dimensions.
2786
+ - **input_x** (Tensor) - The input tensor. The shape can be 2-5 dimensions. Supported dtypes:
2787
+
2788
+ - Ascend/CPU: all Number types.
2789
+ - GPU: float16, float32, int8.
2790
+
2783
2791
  - **bias** (Tensor) - The bias tensor, with shape :math:`(C)`. C must be the same as channel dimension C of
2784
- `input_x`.
2792
+ `input_x`. It has the same type as `input_x`.
2785
2793
 
2786
2794
  Outputs:
2787
2795
  Tensor, with the same shape and data type as `input_x`.
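For intuition, the NCHW case is equivalent to a NumPy broadcast over the channel dimension (a sketch, not the operator's implementation):

>>> import numpy as np
>>> x = np.ones((2, 3, 4, 4), dtype=np.float32)         # NCHW input
>>> bias = np.array([0.1, 0.2, 0.3], dtype=np.float32)  # one value per channel
>>> y = x + bias.reshape(1, -1, 1, 1)                   # broadcast over N, H, W
>>> y.shape
(2, 3, 4, 4)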
@@ -2790,7 +2798,7 @@ class BiasAdd(Primitive):
2790
2798
  TypeError: If `data_format` is not a str.
2791
2799
  ValueError: If value of `data_format` is not in the range of ['NHWC','NCHW','NCDHW'].
2792
2800
  TypeError: If `input_x` or `bias` is not a Tensor.
2793
- TypeError: If dtype of `input_x` or `bias` is inconsistent.
2801
+ TypeError: If dtypes of `input_x` and `bias` are inconsistent.
2794
2802
  TypeError: If dimension of `input_x` is not in the range [2, 5].
2795
2803
 
2796
2804
  Supported Platforms:
@@ -2820,7 +2828,7 @@ class NLLLoss(Primitive):
2820
2828
  r"""
2821
2829
  Gets the negative log likelihood loss between logits and labels.
2822
2830
 
2823
- The nll loss with reduction=none can be described as:
2831
+ The nll loss with :math:`reduction = none` can be described as:
2824
2832
 
2825
2833
  .. math::
2826
2834
 
@@ -2831,7 +2839,7 @@ class NLLLoss(Primitive):
2831
2839
  where :math:`x` is the logits, :math:`t` is the labels, :math:`w` is the weight,
2832
2840
  N is the batch size, :math:`c` belonging to [0, C-1] is class index, where :math:`C` is the number of classes.
2833
2841
 
2834
- If reduction is not ``'none'`` (default ``'mean'`` ), then
2842
+ If :math:`reduction \neq none` (default ``'mean'`` ), then
2835
2843
 
2836
2844
  .. math::
2837
2845
 
@@ -2841,8 +2849,13 @@ class NLLLoss(Primitive):
2841
2849
  \end{array}\right.
2842
2850
 
2843
2851
  Args:
2844
- reduction (str): Apply specific reduction method to the output: ``"none"`` , ``"mean"`` , or ``"sum"`` .
2845
- Default: ``"mean"`` .
2852
+ reduction (str, optional): Apply specific reduction method to the output: ``'none'`` , ``'mean'`` ,
2853
+ ``'sum'`` . Default: ``'mean'`` .
2854
+
2855
+ - ``'none'``: no reduction will be applied.
2856
+ - ``'mean'``: compute and return the weighted mean of elements in the output.
2857
+ - ``'sum'``: the output elements will be summed.
2858
+
2846
2859
  ignore_index (int): Specifies a target value that is ignored
2847
2860
  and does not contribute to the input gradient. Default: ``-100`` .
2848
2861
 
@@ -2856,8 +2869,9 @@ class NLLLoss(Primitive):
2856
2869
  Outputs:
2857
2870
  Tuple of 2 tensors composed with `loss` and `total_weight`.
2858
2871
 
2859
- - **loss** (Tensor) - When `reduction` is 'none' and `logits` is a 2D tensor, the `loss` shape is :math:`(N,)`.
2860
- Otherwise, the `loss` is a scalar. The data type is the same with `input's`.
2872
+ - **loss** (Tensor) - When `reduction` is ``'none'`` and `logits` is a 2D tensor,
2873
+ the `loss` shape is :math:`(N,)`. Otherwise, the `loss` is a scalar.
2874
+ The data type is the same as that of `logits`.
2861
2875
  - **total_weight** (Tensor) - The `total_weight` is a scalar. The data type is the same as that of `weight`.
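A NumPy sketch of the formulas above for the 2D case with unit weights (illustrative only, not the operator itself):

>>> import numpy as np
>>> logits = np.log(np.array([[0.7, 0.2, 0.1],
...                           [0.1, 0.8, 0.1]], dtype=np.float32))
>>> labels = np.array([0, 1])
>>> weight = np.ones(3, dtype=np.float32)
>>> per_sample = -weight[labels] * logits[np.arange(2), labels]
>>> total_weight = weight[labels].sum()
>>> round(float(per_sample.sum() / total_weight), 4)   # 'mean' reduction
0.2899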
2862
2876
 
2863
2877
  Raises:
@@ -3155,6 +3169,10 @@ class SmoothL1Loss(Primitive):
3155
3169
  reduction (str, optional): Apply specific reduction method to the output: ``'none'`` , ``'mean'`` ,
3156
3170
  ``'sum'`` . Default: ``'none'`` .
3157
3171
 
3172
+ - ``'none'``: no reduction will be applied.
3173
+ - ``'mean'``: compute and return the mean of elements in the output.
3174
+ - ``'sum'``: the output elements will be summed.
3175
+
3158
3176
  Inputs:
3159
3177
  - **logits** (Tensor) - Input Tensor of any dimension. Data type must be float16, float32 or float64.
3160
3178
  - **labels** (Tensor) - Ground truth data, has the same shape and dtype as the `logits`.
@@ -3202,12 +3220,12 @@ class MultiMarginLoss(Primitive):
3202
3220
  Args:
3203
3221
  p (int, optional): The norm degree for pairwise distance. Should be 1 or 2. Default: ``1`` .
3204
3222
  margin (int, optional): A parameter to change pairwise distance. Default: ``1.0`` .
3205
- reduction (str, optional): Apply specific reduction method to the output: ``"none"`` ,
3206
- ``"mean"`` , ``"sum"`` . Default: ``"mean"`` .
3223
+ reduction (str, optional): Apply specific reduction method to the output: ``'none'`` , ``'mean'`` ,
3224
+ ``'sum'`` . Default: ``'mean'`` .
3207
3225
 
3208
- - ``"none"``: no reduction will be applied.
3209
- - ``"mean"``: the sum of the output will be divided by the number of elements in the output.
3210
- - ``"sum"``: the output will be summed.
3226
+ - ``'none'``: no reduction will be applied.
3227
+ - ``'mean'``: compute and return the weighted mean of elements in the output.
3228
+ - ``'sum'``: the output elements will be summed.
3211
3229
 
3212
3230
  Inputs:
3213
3231
  - **inputs** (Tensor) - Input, with shape :math:`(N, C)`. Data type only supports float32, float16
@@ -3218,7 +3236,7 @@ class MultiMarginLoss(Primitive):
3218
3236
  support float16, float32 or float64.
3219
3237
 
3220
3238
  Outputs:
3221
- Tensor, When `reduction` is 'none', the shape is :math:`(N,)`.
3239
+ Tensor, when `reduction` is ``'none'``, the shape is :math:`(N,)`.
3222
3240
  Otherwise, it is a scalar. Has the same data type as `inputs`.
3223
3241
 
3224
3242
  Supported Platforms:
@@ -3261,15 +3279,19 @@ class SoftMarginLoss(Primitive):
3261
3279
  where :math:`x.nelement()` is the number of elements of x.
3262
3280
 
3263
3281
  Args:
3264
- reduction (str): Apply specific reduction method to the output: ``"none"`` , ``"mean"`` or ``"sum"`` .
3265
- Default: ``"mean"`` .
3282
+ reduction (str, optional): Apply specific reduction method to the output: ``'none'`` , ``'mean'`` ,
3283
+ ``'sum'`` . Default: ``'mean'`` .
3284
+
3285
+ - ``'none'``: no reduction will be applied.
3286
+ - ``'mean'``: compute and return the mean of elements in the output.
3287
+ - ``'sum'``: the output elements will be summed.
3266
3288
 
3267
3289
  Inputs:
3268
3290
  - **logits** (Tensor) - Predict data. Data type must be float16 or float32.
3269
3291
  - **labels** (Tensor) - Ground truth data, with the same type and shape as `logits`.
3270
3292
 
3271
3293
  Outputs:
3272
- Tensor or Scalar, if `reduction` is "none", its shape is the same as `logits`.
3294
+ Tensor or Scalar, if `reduction` is ``"none"``, its shape is the same as `logits`.
3273
3295
  Otherwise, a scalar value will be returned.
3274
3296
 
3275
3297
  Raises:
@@ -3736,26 +3758,28 @@ class LayerNorm(Primitive):
3736
3758
 
3737
3759
  Args:
3738
3760
  begin_norm_axis (int): The begin axis of the `input_x` to apply LayerNorm,
3739
- the value must be in [-1, rank(input)). Default: ``1`` .
3761
+ the value must be in [-1, rank(input_x)). Default: ``1`` .
3740
3762
  begin_params_axis (int): The begin axis of the parameter input (`gamma`, `beta`) to
3741
- apply LayerNorm, the value must be in [-1, rank(input)). Default: ``1`` .
3742
- epsilon (float): A value added to the denominator for numerical stability. Default: ``1e-7`` .
3763
+ apply LayerNorm, the value must be in [-1, rank(input_x)). Default: ``1`` .
3764
+ epsilon (float): A value added to the denominator for numerical stability (:math:`\epsilon`). Default: ``1e-7`` .
3743
3765
 
3744
3766
  Inputs:
3745
3767
  - **input_x** (Tensor) - Tensor of shape :math:`(N, \ldots)`.
3746
3768
  The input of LayerNorm. Supported dtypes: float16, float32, float64.
3747
- - **gamma** (Tensor) - Tensor of shape :math:`(P_0, \ldots, P_\text{begin_params_axis})`.
3769
+ - **gamma** (Tensor) - Tensor of shape :math:`(P_\text{begin_params_axis}, \ldots, P_\text{rank(input_x)-1})`.
3748
3770
  The learnable parameter :math:`\gamma` as the scale on norm. Supported dtypes: float16, float32, float64.
3749
- - **beta** (Tensor) - Tensor of shape :math:`(P_0, \ldots, P_\text{begin_params_axis})`.
3771
+ - **beta** (Tensor) - Tensor of shape :math:`(P_\text{begin_params_axis}, \ldots, P_\text{rank(input_x)-1})`.
3750
3772
  The learnable parameter :math:`\beta` as the offset on norm. Supported dtypes: float16, float32, float64.
3751
3773
 
3752
3774
  Outputs:
3753
3775
  tuple[Tensor], tuple of 3 tensors, the normalized input and the updated parameters.
3754
3776
 
3755
3777
  - **output_x** (Tensor) - The normalized input, has the same type and shape as the `input_x`.
3756
- The shape is :math:`(N, C)`.
3757
- - **mean** (Tensor) - Tensor of shape :math:`(C,)`.
3758
- - **variance** (Tensor) - Tensor of shape :math:`(C,)`.
3778
+ - **mean** (Tensor) - The first `begin_norm_axis` dimensions of the `mean` shape are the same as those
3779
+ of `input_x`, and the remaining dimensions are 1. Suppose the shape of `input_x` is :math:`(x_1, x_2, \ldots, x_R)`,
3780
+ then the shape of `mean` is :math:`(x_1, \ldots, x_{begin\_norm\_axis}, 1, \ldots, 1)`
3781
+ (when `begin_norm_axis=0`, the shape of `mean` is :math:`(1, \ldots, 1)` ).
3782
+ - **variance** (Tensor) - Shape is the same as `mean` .
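The mean/variance shapes above follow from reducing over the trailing axes with keepdims; a NumPy sketch of the statistics only (gamma/beta omitted):

>>> import numpy as np
>>> x = np.random.randn(2, 3, 4).astype(np.float32)
>>> begin_norm_axis = 1
>>> axes = tuple(range(begin_norm_axis, x.ndim))
>>> mean = x.mean(axis=axes, keepdims=True)
>>> variance = x.var(axis=axes, keepdims=True)
>>> mean.shape, variance.shape
((2, 1, 1), (2, 1, 1))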
3759
3783
 
3760
3784
  Raises:
3761
3785
  TypeError: If `begin_norm_axis` or `begin_params_axis` is not an int.
@@ -3855,38 +3879,6 @@ class L2Normalize(Primitive):
3855
3879
  self.axis = axis
3856
3880
 
3857
3881
 
3858
- class DropoutGenMask(Primitive):
3859
- """
3860
- The DropoutGenMask interface is deprecated, please use the :class:`mindspore.ops.Dropout` instead.
3861
-
3862
- Supported Platforms:
3863
- Deprecated
3864
- """
3865
-
3866
- @deprecated("1.5", "ops.Dropout", False)
3867
- @prim_attr_register
3868
- def __init__(self, Seed0=0, Seed1=0):
3869
- """Initialize DropoutGenMask."""
3870
- self.init_prim_io_names(inputs=['shape', 'keep_prob'], outputs=['output'])
3871
- validator.check_value_type("Seed0", Seed0, [int], self.name)
3872
- validator.check_value_type("Seed1", Seed1, [int], self.name)
3873
- self.add_prim_attr("side_effect_hidden", True)
3874
-
3875
-
3876
- class DropoutDoMask(Primitive):
3877
- """
3878
- The DropoutDoMask interface is deprecated, please use the :class:`mindspore.ops.Dropout` instead.
3879
-
3880
- Supported Platforms:
3881
- Deprecated
3882
- """
3883
-
3884
- @deprecated("1.5", "ops.Dropout", False)
3885
- @prim_attr_register
3886
- def __init__(self):
3887
- pass
3888
-
3889
-
3890
3882
  class ResizeBilinear(PrimitiveWithInfer):
3891
3883
  r"""
3892
3884
  This API is deprecated, please use the :class:`mindspore.ops.ResizeBilinearV2` instead.
@@ -4017,6 +4009,7 @@ class OneHot(Primitive):
4017
4009
 
4018
4010
  Note:
4019
4011
  If the input indices is rank `N`, the output will have rank `N+1`. The new axis is created at dimension `axis`.
4012
+ On Ascend, if `on_value` is of int64 dtype, `indices` must also be of int64 dtype.
4020
4013
 
4021
4014
  Args:
4022
4015
  axis (int): Position to insert the value. e.g. If shape of `indices` is :math:`(N, C)`, and `axis` is -1,
@@ -4025,18 +4018,20 @@ class OneHot(Primitive):
4025
4018
 
4026
4019
  Inputs:
4027
4020
  - **indices** (Tensor) - A tensor of indices. Tensor of shape :math:`(X_0, \ldots, X_n)`.
4028
- Data type must be uint8, int32 or int64.
4021
+ Data type must be int32 or int64.
4029
4022
  - **depth** (int) - A scalar defining the depth of the one-hot dimension.
4030
- - **on_value** (Tensor) - A value to fill in output when `indices[j] = i`.
4023
+ - **on_value** (Tensor) - A value to fill in output when `indices[j] = i`. Data type must be int32, int64,
4024
+ float16 or float32.
4031
4025
  - **off_value** (Tensor) - A value to fill in output when `indices[j] != i`.
4032
4026
  It has the same data type as `on_value`.
4033
4027
 
4034
4028
  Outputs:
4035
- Tensor, one-hot tensor. Tensor of shape :math:`(X_0, \ldots, X_{axis}, \text{depth} ,X_{axis+1}, \ldots, X_n)`.
4029
+ Tensor, one-hot tensor. Tensor of shape :math:`(X_0, \ldots, X_{axis}, \text{depth} ,X_{axis+1}, \ldots, X_n)`,
4030
+ and it has the same data type as `on_value`.
4036
4031
 
4037
4032
  Raises:
4038
4033
  TypeError: If `axis` or `depth` is not an int.
4039
- TypeError: If dtype of `indices` is not uint8, int32 or int64.
4034
+ TypeError: If dtype of `indices` is not int32 or int64.
4040
4035
  TypeError: If `indices`, `on_value` or `off_value` is not a Tensor.
4041
4036
  ValueError: If `axis` is not in range [-1, len(indices_shape)].
4042
4037
  ValueError: If `depth` is less than 0.
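For `axis=-1` the result can be reproduced with the following NumPy construction (a sketch, not the primitive itself):

>>> import numpy as np
>>> indices = np.array([0, 1, 2])
>>> depth, on_value, off_value = 3, 1.0, 0.0
>>> out = np.full((indices.size, depth), off_value, dtype=np.float32)
>>> out[np.arange(indices.size), indices] = on_value
>>> out
array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]], dtype=float32)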
@@ -4065,26 +4060,6 @@ class OneHot(Primitive):
4065
4060
  validator.check_value_type("axis", axis, [int], self.name)
4066
4061
 
4067
4062
 
4068
- class Gelu(PrimitiveWithInfer):
4069
- """
4070
- Same as operator GeLU. Gelu will be deprecated in the future.
4071
- Please use GeLU instead.
4072
- """
4073
-
4074
- @deprecated("1.1", "GeLU", True)
4075
- @prim_attr_register
4076
- def __init__(self):
4077
- """Initialize Gelu"""
4078
- self.init_prim_io_names(inputs=['x'], outputs=['output'])
4079
-
4080
- def infer_shape(self, input_x):
4081
- return input_x
4082
-
4083
- def infer_dtype(self, input_x):
4084
- validator.check_tensor_dtype_valid("input_x", input_x, (mstype.float16, mstype.float32), self.name)
4085
- return input_x
4086
-
4087
-
4088
4063
  class GeLU(Primitive):
4089
4064
  r"""
4090
4065
  Gaussian Error Linear Units activation function.
@@ -4131,26 +4106,6 @@ class GeLU(Primitive):
4131
4106
  self.init_prim_io_names(inputs=['x'], outputs=['output'])
4132
4107
 
4133
4108
 
4134
- class FastGelu(PrimitiveWithInfer):
4135
- """
4136
- Same as operator FastGeLU. FastGelu will be deprecated in the future.
4137
- Please use FastGeLU instead.
4138
- """
4139
-
4140
- @deprecated("1.1", "FastGeLU", True)
4141
- @prim_attr_register
4142
- def __init__(self):
4143
- """Initialize FastGelu."""
4144
- self.init_prim_io_names(inputs=['x'], outputs=['output'])
4145
-
4146
- def infer_shape(self, input_x):
4147
- return input_x
4148
-
4149
- def infer_dtype(self, input_x):
4150
- validator.check_tensor_dtype_valid("input_x", input_x, (mstype.float16, mstype.float32), self.name)
4151
- return input_x
4152
-
4153
-
4154
4109
  class FastGeLU(Primitive):
4155
4110
  r"""
4156
4111
  Fast Gaussian Error Linear Units activation function.
@@ -4301,19 +4256,24 @@ class LSTM(Primitive):
4301
4256
  bidirectional (bool): Specifies whether it is a bidirectional LSTM.
4302
4257
  dropout (float): If not 0, append `Dropout` layer on the outputs of each
4303
4258
  LSTM layer except the last layer. The range of dropout is [0.0, 1.0].
4259
+ proj_size (int): If `proj_size` > 0, a projection of the corresponding size will be used,
4260
+ which is currently only supported on CPU. Default: ``0`` .
4304
4261
 
4305
4262
  Inputs:
4306
4263
  - **input** (Tensor) - Tensor of shape :math:`(seq\_len, batch\_size, input\_size)` or
4307
4264
  :math:`(batch\_size, seq\_len, input\_size)`.
4308
- - **h** (Tensor) - Tensor of shape :math:`(num\_directions * num\_layers, batch\_size, hidden\_size)`.
4265
+ - **h** (Tensor) - Tensor of shape :math:`(num\_directions * num\_layers, batch\_size, real\_hidden\_size)`.
4309
4266
  - **c** (Tensor) - Tensor of shape :math:`(num\_directions * num\_layers, batch\_size, hidden\_size)`.
4310
4267
  - **w** (Tensor) - A weight Tensor.
4311
4268
 
4269
+ If :math:`proj\_size > 0` , :math:`real\_hidden\_size = proj\_size` , otherwise
4270
+ :math:`real\_hidden\_size = hidden\_size` .
4271
+
4312
4272
  Outputs:
4313
- Tuple, a tuple contains (`output`, `h_n`, `c_n`, `reserve`, `state`).
4273
+ Tuple, a tuple containing `(output, h_n, c_n, reserve, state)`.
4314
4274
 
4315
- - **output** (Tensor) - Tensor of shape :math:`(seq\_len, batch\_size, num\_directions * hidden\_size)`.
4316
- - **h_n** (Tensor) - Tensor of shape :math:`(num\_directions * num\_layers, batch\_size, hidden\_size)`.
4275
+ - **output** (Tensor) - Tensor of shape :math:`(seq\_len, batch\_size, num\_directions * real\_hidden\_size)`.
4276
+ - **h_n** (Tensor) - Tensor of shape :math:`(num\_directions * num\_layers, batch\_size, real\_hidden\_size)`.
4317
4277
  - **c_n** (Tensor) - Tensor of shape :math:`(num\_directions * num\_layers, batch\_size, hidden\_size)`.
4318
4278
  - **reserve** (Tensor) - Tensor of shape :math:`(r, 1)`.
4319
4279
  - **state** (Tensor) - Random number generator state and its shape is :math:`(s, 1)`.
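The shape bookkeeping for `proj_size` can be spelled out directly (plain Python, values chosen arbitrarily for illustration):

>>> num_layers, num_directions, batch_size = 2, 1, 8
>>> hidden_size, proj_size = 16, 4
>>> real_hidden_size = proj_size if proj_size > 0 else hidden_size
>>> h_shape = (num_directions * num_layers, batch_size, real_hidden_size)
>>> c_shape = (num_directions * num_layers, batch_size, hidden_size)
>>> h_shape, c_shape
((2, 8, 4), (2, 8, 16))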
@@ -4323,6 +4283,7 @@ class LSTM(Primitive):
4323
4283
  TypeError: If `has_bias` or `bidirectional` is not a bool.
4324
4284
  TypeError: If `dropout` is not a float.
4325
4285
  ValueError: If `dropout` is not in range [0.0, 1.0].
4286
+ ValueError: If `proj_size` is not in range [0, `hidden_size`).
4326
4287
 
4327
4288
  Supported Platforms:
4328
4289
  ``GPU`` ``CPU``
@@ -4356,10 +4317,12 @@ class LSTM(Primitive):
4356
4317
  """
4357
4318
 
4358
4319
  @prim_attr_register
4359
- def __init__(self, input_size, hidden_size, num_layers, has_bias, bidirectional, dropout):
4320
+ def __init__(self, input_size, hidden_size, num_layers, has_bias, bidirectional, dropout, proj_size=0):
4360
4321
  """Initialize LSTM."""
4361
4322
  self.input_size = validator.check_positive_int(input_size, "input_size", self.name)
4362
4323
  self.hidden_size = validator.check_positive_int(hidden_size, "hidden_size", self.name)
4324
+ self.proj_size = validator.check_int_range(proj_size, 0, hidden_size, validator.INC_LEFT,
4325
+ 'proj_size', self.name)
4363
4326
  self.num_layers = validator.check_positive_int(num_layers, "num_layers", self.name)
4364
4327
  self.has_bias = validator.check_value_type("has_bias", has_bias, (bool,), self.name)
4365
4328
  self.bidirectional = validator.check_value_type("bidirectional", bidirectional, (bool,), self.name)
@@ -4466,8 +4429,12 @@ class BCEWithLogitsLoss(PrimitiveWithInfer):
4466
4429
  :math:`P_c>1` increases the recall, :math:`P_c<1` increases the precision.
4467
4430
 
4468
4431
  Args:
4469
- reduction (str): Type of reduction to be applied to loss. The optional values are ``'mean'`` , ``'sum'`` , and
4470
- ``'none'`` , not case sensitive. If ``'none'`` , do not perform reduction. Default: ``'mean'`` .
4432
+ reduction (str, optional): Apply specific reduction method to the output: ``'none'`` , ``'mean'`` ,
4433
+ ``'sum'`` . Default: ``'mean'`` .
4434
+
4435
+ - ``'none'``: no reduction will be applied.
4436
+ - ``'mean'``: compute and return the weighted mean of elements in the output.
4437
+ - ``'sum'``: the output elements will be summed.
4471
4438
 
4472
4439
  Inputs:
4473
4440
  - **logits** (Tensor) - Input logits. Data type must be float16 or float32.
@@ -4481,7 +4448,7 @@ class BCEWithLogitsLoss(PrimitiveWithInfer):
4481
4448
  Data type must be float16 or float32.
4482
4449
 
4483
4450
  Outputs:
4484
- Tensor or Scalar, if `reduction` is 'none', it's a tensor with the same shape and type as input `logits`.
4451
+ Tensor or Scalar, if `reduction` is ``'none'``, it's a tensor with the same shape and type as input `logits`.
4485
4452
  Otherwise, the output is a scalar.
4486
4453
 
4487
4454
  Raises:
@@ -4489,7 +4456,7 @@ class BCEWithLogitsLoss(PrimitiveWithInfer):
4489
4456
  TypeError: If data type of any input is neither float16 nor float32.
4490
4457
  TypeError: If data type of `reduction` is not string.
4491
4458
  ValueError: If `weight` or `pos_weight` can not be broadcast to a tensor with shape of `logits`.
4492
- ValueError: If `reduction` is not one of 'none', 'mean' or 'sum'.
4459
+ ValueError: If `reduction` is not one of ``'none'``, ``'mean'`` or ``'sum'``.
4493
4460
 
4494
4461
  Supported Platforms:
4495
4462
  ``Ascend`` ``GPU`` ``CPU``
@@ -4669,9 +4636,15 @@ class MirrorPad(Primitive):
4669
4636
  Pads the input tensor according to the paddings and mode.
4670
4637
 
4671
4638
  Args:
4672
- mode (str): Specifies the padding mode. The optional values are ``'REFLECT'`` and ``'SYMMETRIC'`` .
4639
+ mode (str, optional): Specifies the padding method.
4640
+ The optional values are ``'REFLECT'`` and ``'SYMMETRIC'`` .
4673
4641
  Default: ``'REFLECT'`` .
4674
4642
 
4643
+ - ``'REFLECT'``: Reflect the values across the edge, excluding the edge value itself.
4644
+ For example, padding [1, 2, 3, 4] with 2 elements on both sides results in [3, 2, 1, 2, 3, 4, 3, 2].
4645
+ - ``'SYMMETRIC'``: Reflect the values across the edge, including the edge value itself.
4646
+ For example, padding [1, 2, 3, 4] with 2 elements on both sides results in [2, 1, 1, 2, 3, 4, 4, 3].
4647
+
4675
4648
  Inputs:
4676
4649
  - **input_x** (Tensor) - Tensor of shape :math:`(N, *)`, where :math:`*` means, any number of
4677
4650
  additional dimensions.
@@ -4683,15 +4656,14 @@ class MirrorPad(Primitive):
4683
4656
  paddings[D, 0] and paddings[D, 1] must be no greater than input_x.dim_size(D)
4684
4657
  (or input_x.dim_size(D) - 1) if mode is SYMMETRIC (if REFLECT, respectively).
4685
4658
 
4686
-
4687
4659
  Outputs:
4688
4660
  Tensor, the tensor after padding.
4689
4661
 
4690
- - If `mode` is "REFLECT", it uses a way of symmetrical copying through the axis of symmetry to fill in.
4662
+ - If `mode` is ``'REFLECT'``, it fills by mirroring the values across the border, excluding the border itself.
4691
4663
  If the `input_x` is [[1,2,3], [4,5,6], [7,8,9]] and `paddings` is [[1,1], [2,2]], then the
4692
4664
  `Outputs` is [[6,5,4,5,6,5,4], [3,2,1,2,3,2,1], [6,5,4,5,6,5,4], [9,8,7,8,9,8,7], [6,5,4,5,6,5,4]].
4693
4665
  For a more intuitive understanding, please see the example below.
4694
- - If `mode` is "SYMMETRIC", the filling method is similar to the "REFLECT". It is also copied
4666
+ - If `mode` is ``'SYMMETRIC'``, the filling method is similar to ``'REFLECT'``. It is also copied
4695
4667
  according to the symmetry axis, except that it includes the symmetry axis. If the `input_x`
4696
4668
  is [[1,2,3], [4,5,6], [7,8,9]] and `paddings` is [[1,1], [2,2]], then the `Outputs` is
4697
4669
  [[2,1,1,2,3,3,2], [2,1,1,2,3,3,2], [5,4,4,5,6,6,5], [8,7,7,8,9,9,8], [8,7,7,8,9,9,8]].
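Both examples above agree with NumPy's `reflect` and `symmetric` pad modes, which can serve as a cross-check (first output row shown):

>>> import numpy as np
>>> x = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
>>> np.pad(x, ((1, 1), (2, 2)), mode='reflect')[0]
array([6, 5, 4, 5, 6, 5, 4])
>>> np.pad(x, ((1, 1), (2, 2)), mode='symmetric')[0]
array([2, 1, 1, 2, 3, 3, 2])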
@@ -5675,7 +5647,7 @@ class KLDivLoss(Primitive):
5675
5647
  - **labels** (Tensor) - The label Tensor which has the same shape and data type as `logits`.
5676
5648
 
5677
5649
  Outputs:
5678
- Tensor or Scalar, if `reduction` is 'none', then output is a tensor and has the same shape as `logits`.
5650
+ Tensor or Scalar, if `reduction` is ``'none'``, then output is a tensor and has the same shape as `logits`.
5679
5651
  Otherwise it is a scalar.
5680
5652
 
5681
5653
  Raises:
@@ -5750,8 +5722,12 @@ class BinaryCrossEntropy(Primitive):
5750
5722
  - The value of :math:`x` must range from 0 to 1.
5751
5723
 
5752
5724
  Args:
5753
- reduction (str): Specifies the reduction to be applied to the output.
5754
- Its value must be one of ``'none'`` , ``'mean'`` or ``'sum'`` . Default: ``'mean'`` .
5725
+ reduction (str, optional): Apply specific reduction method to the output: ``'none'`` , ``'mean'`` ,
5726
+ ``'sum'`` . Default: ``'mean'`` .
5727
+
5728
+ - ``'none'``: no reduction will be applied.
5729
+ - ``'mean'``: compute and return the weighted mean of elements in the output.
5730
+ - ``'sum'``: the output elements will be summed.
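A NumPy sketch of the element-wise BCE term with `weight` treated as 1 (illustrative only; note `logits` here are probabilities in [0, 1], as this operator requires):

>>> import numpy as np
>>> logits = np.array([0.2, 0.7, 0.1], dtype=np.float32)
>>> labels = np.array([0., 1., 0.], dtype=np.float32)
>>> l = -(labels * np.log(logits) + (1 - labels) * np.log(1 - logits))
>>> round(float(l.mean()), 4)   # 'none' keeps l as-is; 'sum' is l.sum()
0.2284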
5755
5731
 
5756
5732
  Inputs:
5757
5733
  - **logits** (Tensor) - The predictive value whose data type must be float16 or float32,
@@ -5766,7 +5742,7 @@ class BinaryCrossEntropy(Primitive):
5766
5742
 
5767
5743
  Raises:
5768
5744
  TypeError: If dtype of `logits`, `labels` or `weight` (if given) is neither float16 nor float32.
5769
- ValueError: If `reduction` is not one of 'none', 'mean' or 'sum'.
5745
+ ValueError: If `reduction` is not one of ``'none'``, ``'mean'`` or ``'sum'``.
5770
5746
  ValueError: If shape of `labels` is not the same as `logits` or `weight` (if given).
5771
5747
  TypeError: If `logits`, `labels` or `weight` is not a Tensor.
5772
5748
 
@@ -7173,7 +7149,19 @@ class Dropout(PrimitiveWithCheck):
7173
7149
 
7174
7150
  Outputs:
7175
7151
  - **output** (Tensor) - With the same shape and data type as `x`.
7176
- - **mask** (Tensor) - With the same shape as `x`.
7152
+ - **mask** (Tensor) - The mask applied to `x`.
7153
+
7154
+ - On GPU and CPU, `mask` has the same shape and data type as `x`.
7155
+ - On Ascend, for better performance, it is represented as a 1-D Tensor
7156
+ with uint8 data type. It has shape :math:`(byte\_counts, )`, where :math:`byte\_counts` is the
7157
+ number of bytes needed to mask the input `x` and is calculated using the
7158
+ following formula:
7159
+
7160
+ .. math::
7161
+
7162
+ byte\_counts = \text{ceil}(\text{prod}(x.shape) / 128) * 16
7163
+
7164
+ If shape of `x` is :math:`(2, 3, 4, 5, 6)`, the shape of `mask` will be :math:`(96, )`.
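The formula is easy to verify in plain Python (helper name ours):

>>> import math
>>> def byte_counts(shape):
...     # bytes in the Ascend bit mask, per the formula above
...     return math.ceil(math.prod(shape) / 128) * 16
>>> byte_counts((2, 3, 4, 5, 6))
96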
7177
7165
 
7178
7166
  Supported Platforms:
7179
7167
  ``Ascend`` ``GPU`` ``CPU``
@@ -7195,6 +7183,7 @@ class Dropout(PrimitiveWithCheck):
7195
7183
  self.seed0 = validator.check_value_type("Seed0", Seed0, [int], self.name)
7196
7184
  self.seed1 = validator.check_value_type("Seed1", Seed1, [int], self.name)
7197
7185
  self.keep_prob = validator.check_float_range(keep_prob, 0, 1, validator.INC_RIGHT, "keep_prob", self.name)
7186
+ self.add_prim_attr("side_effect_hidden", True)
7198
7187
 
7199
7188
  def check_shape(self, x_shape):
7200
7189
  validator.check_int(len(x_shape), 1, validator.GE, "x_shape", self.name)
@@ -7402,6 +7391,9 @@ class CTCGreedyDecoder(Primitive):
7402
7391
 
7403
7392
  Refer to :func:`mindspore.ops.ctc_greedy_decoder` for more details.
7404
7393
 
7394
+ Note:
7395
+ On Ascend, `merge_repeated` cannot be set to ``False``.
7396
+
7405
7397
  Args:
7406
7398
  merge_repeated (bool, optional): If ``True`` , merge repeated classes in output. Default: ``True`` .
7407
7399
 
@@ -7824,6 +7816,10 @@ class LRN(Primitive):
7824
7816
  r"""
7825
7817
  Local Response Normalization.
7826
7818
 
7819
+ .. warning::
7820
+ LRN is deprecated on Ascend due to potential accuracy problems. It is recommended to use other
7821
+ normalization methods, e.g. :class:`mindspore.ops.BatchNorm`.
7822
+
7827
7823
  .. math::
7828
7824
 
7829
7825
  b_{c} = a_{c}\left(k + \frac{\alpha}{n}
@@ -7854,7 +7850,7 @@ class LRN(Primitive):
7854
7850
  TypeError: If `x` is not a Tensor.
7855
7851
 
7856
7852
  Supported Platforms:
7857
- ``Ascend`` ``GPU`` ``CPU``
7853
+ ``GPU`` ``CPU``
7858
7854
 
7859
7855
  Examples:
7860
7856
  >>> import mindspore
@@ -7908,21 +7904,22 @@ class AvgPool3D(Primitive):
7908
7904
  strides (Union[int, tuple[int]]): The distance of kernel moving, an int number that represents
7909
7905
  the depth, height and width of movement are both strides, or a tuple of three int numbers that
7910
7906
  represent depth, height and width of movement respectively. Default: ``1`` .
7911
- pad_mode (str): The optional value for pad mode, is ``"same"`` , ``"valid"`` , ``"pad"`` .
7912
- Default: ``"valid"`` .
7913
-
7914
- - ``"same"``: Adopts the way of completion. The depth, height and width of the output will be the same
7915
- as the input. The total number of padding will be calculated in depth, horizontal and vertical
7916
- directions and evenly distributed to head and tail, top and bottom, left and right if possible.
7917
- Otherwise, the last extra padding will be done from the tail, bottom and the right side.
7907
+ pad_mode (str, optional): Specifies the padding mode with a padding value of 0. It can be set to:
7908
+ ``"same"`` , ``"valid"`` or ``"pad"`` . Default: ``"valid"`` .
7909
+
7910
+ - ``"same"``: Pad the input around its depth/height/width dimension so that the shape of input and output
7911
+ are the same when `stride` is set to ``1``.
7912
+ The amount of padding is calculated by the operator internally. If the amount is even,
7913
+ it is uniformly distributed around the input; if it is odd, the excess amount goes
7914
+ to the front/right/bottom side.
7918
7915
  If this mode is set, `pad` must be 0.
7916
+ - ``"valid"``: No padding is applied to the input, and the output returns the maximum
7917
+ possible depth, height and width. Extra pixels that could not complete a full stride will
7918
+ be discarded. If this mode is set, `pad` must be 0.
7919
+ - ``"pad"``: Pad the input with a specified amount. In this mode, the amount of padding
7920
+ in the depth, height and width dimension is determined by the `pad` parameter.
7921
+ If this mode is set, `pad` must be greater than or equal to 0.
7919
7922
 
7920
- - ``"valid"``: Adopts the way of discarding. The possible largest depth, height and width of output
7921
- will be returned without padding. Extra pixels will be discarded. If this mode is set, `pad`
7922
- must be 0.
7923
-
7924
- - pad: Implicit paddings on both sides of the input in depth, height, width. The number of `pad` will
7925
- be padded to the input Tensor borders. `pad` must be greater than or equal to 0.
7926
7923
  pad (Union(int, tuple[int], list[int])): The pad value to be filled. Default: ``0`` . If `pad` is an integer,
7927
7924
  the paddings of head, tail, top, bottom, left and right are the same, equal to pad.
7928
7925
  If `pad` is a tuple of six integers, the padding of head, tail, top, bottom, left and right equal to
@@ -8005,74 +8002,97 @@ class AvgPool3D(Primitive):
8005
8002
 
8006
8003
  class Conv3D(Primitive):
8007
8004
  r"""
8008
- Applies a 3D convolution over an input tensor. The input tensor is typically of shape
8009
- :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})` and output shape
8010
- :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})`, where :math:`N` is batch size, :math:`C` is channel number,
8011
- :math:`D` is depth, :math:`H, W` is feature height and width respectively.
8012
- the output value of a layer is calculated as:
8005
+ 3D convolution layer.
8006
+
8007
+ Applies a 3D convolution over an input tensor which is typically of shape
8008
+ :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})`,
8009
+ where :math:`N` is batch size, :math:`C` is channel number, :math:`D` is feature depth,
8010
+ :math:`H` is feature height, :math:`W` is feature width.
8011
+
8012
+ The output is calculated based on formula:
8013
8013
 
8014
8014
  .. math::
8015
- \operatorname{out}\left(N_{i}, C_{\text {out}_j}\right)=\operatorname{bias}\left(C_{\text {out}_j}\right)+
8016
- \sum_{k=0}^{C_{in}-1} ccor(\text {weight}\left(C_{\text {out}_j}, k\right),
8017
- \operatorname{input}\left(N_{i}, k\right))
8018
-
8019
- where :math:`k` is kernel,
8020
- :math:`ccor` is the `cross-correlation <https://en.wikipedia.org/wiki/Cross-correlation>`_ ,
8021
- :math:`C_{in}` is the channel number of the input, :math:`out_{j}` corresponds to the :math:`j`-th channel of
8022
- the output and :math:`j` is in the range of :math:`[0, C_{out} - 1]`. :math:`\text{weight}(C_{\text{out}_j}, k)`
8023
- is a convolution kernel slice with shape
8024
- :math:`(\text{kernel_size[0]}, \text{kernel_size[1]}, \text{kernel_size[2]})`,
8025
- where :math:`\text{kernel_size[0]}`, :math:`\text{kernel_size[1]}` and :math:`\text{kernel_size[2]}` are
8026
- the depth, height and width of the convolution kernel respectively. :math:`\text{bias}` is the bias parameter
8027
- and :math:`\text{X}` is the input tensor.
8028
- The shape of full convolution kernel is
8029
- :math:`(C_{out}, C_{in} / \text{groups}, \text{kernel_size[0]}, \text{kernel_size[1]}, \text{kernel_size[2]})`,
8030
- where `groups` is the number of groups to split `input` in the channel dimension.
8031
-
8032
- For more details, please refer to the paper `Gradient Based Learning Applied to Document
8033
- Recognition <http://vision.stanford.edu/cs598_spring07/papers/Lecun98.pdf>`_ .
8015
+
8016
+ \text{out}(N_i, C_{\text{out}_j}) = \text{bias}(C_{\text{out}_j}) +
8017
+ \sum_{k = 0}^{C_{in} - 1} \text{ccor}({\text{weight}(C_{\text{out}_j}, k), \text{X}(N_i, k)})
8018
+
8019
+ where :math:`bias` is the output channel bias, :math:`ccor` is
8020
+ the `cross-correlation <https://en.wikipedia.org/wiki/Cross-correlation>`_,
8021
+ :math:`weight` is the convolution kernel value and :math:`X` represents the input feature map.
8022
+
8023
+ Here are the indices' meanings:
8024
+ - :math:`i` corresponds to the batch number, ranging from 0 to N-1, where N is the batch size of the input.
8025
+
8026
+ - :math:`j` corresponds to the output channel, ranging from 0 to :math:`C_{out}-1`, where :math:`C_{out}` is the
8027
+ number of output channels, which is also equal to the number of kernels.
8028
+
8029
+ - :math:`k` corresponds to the input channel, ranging from 0 to :math:`C_{in}-1`, where :math:`C_{in}` is the
8030
+ number of input channels, which is also equal to the number of channels in the convolutional kernels.
8031
+
8032
+ Therefore, in the above formula, :math:`{bias}(C_{out_j})` represents the bias of the :math:`j`-th
8033
+ output channel, :math:`{weight}(C_{out_j}, k)` represents the slice of the :math:`j`-th convolutional
8034
+ kernel in the :math:`k`-th channel, and :math:`{X}(N_i, k)` represents the slice of the :math:`k`-th input
8035
+ channel in the :math:`i`-th batch of the input feature map.
8036
+
8037
+ The shape of the convolutional kernel is given by
8038
+ :math:`(\text{kernel_size[0]}, \text{kernel_size[1]}, \text{kernel_size[2]})`
8039
+ where :math:`kernel\_size[0]` , :math:`kernel\_size[1]` and :math:`kernel\_size[2]` are the depth,
8040
+ height and width of the kernel, respectively.
8041
+ If we consider the input and output channels as well as the `group` parameter, the complete kernel shape
8042
+ will be :math:`(C_{out}, C_{in} / \text{group}, \text{kernel_size[0]},
8043
+ \text{kernel_size[1]}, \text{kernel_size[2]})`,
8044
+ where `group` is the number of groups dividing `x`'s input channel when applying group convolution.
8045
+
8046
+ For more details about convolution layer, please refer to `Gradient Based Learning Applied to Document Recognition
8047
+ <http://vision.stanford.edu/cs598_spring07/papers/Lecun98.pdf>`_.
8034
8048
 
8035
8049
  Note:
8036
- On Ascend platform, `group = 1` must be satisfied.
8050
+ 1. On Ascend platform, `group = 1` must be satisfied.
8051
+ 2. On Ascend, `dilation` on depth only supports the case of 1.
8037
8052
 
8038
8053
  Args:
8039
- out_channel (int): The number of output channel :math:`C_{out}`.
8040
- kernel_size (Union[int, tuple[int]]): Specifies the depth, height
8041
- and width of the 3D convolution window. It can be a single int or a tuple of 3 integers.
8042
- Single int means the value is for the depth, height and width
8043
- of the kernel. A tuple of 3 ints corresponds to the depth, height and width of the kernel respectively.
8054
+ out_channel (int): Specifies output channel :math:`C_{out}`.
8055
+ kernel_size (Union[int, tuple[int]]): Specifies the depth, height and width of the 3D convolution kernel.
8056
+ It can be a single int or a tuple of 3 integers. A single int means the value is for depth, height
8057
+ and the width. A tuple of 3 ints means the first value is for depth and
8058
+ the rest is for the height and width.
8044
8059
  mode (int, optional): Modes for different convolutions. It is currently not used. Default: ``1`` .
8045
8060
  stride (Union[int, tuple[int]], optional): The distance of kernel moving, it can be an int number
8046
8061
  that represents the depth, height and width of movement or a tuple of three int numbers that
8047
8062
  represent depth, height and width movement respectively. Default: ``1`` .
8048
- pad_mode (str, optional): Specifies padding mode. The optional values are
8049
- ``"same"`` , ``"valid"`` and ``"pad"`` . Default: ``"valid"`` .
8050
-
8051
- - ``"same"``: Adopts the way of completion. The depth, height and width of the output will be equal to
8052
- the input `x` divided by stride. The padding will be evenly calculated in head and tail, top and bottom,
8053
- left and right directions possiblily.
8054
- Otherwise, the last extra padding will be calculated from the tail, bottom and the right side.
8063
+ pad_mode (str, optional): Specifies the padding mode with a padding value of 0. It can be set to:
8064
+ ``"same"`` , ``"valid"`` or ``"pad"`` . Default: ``"valid"`` .
8065
+
8066
+ - ``"same"``: Pad the input around its depth/height/width dimension so that the shape of input and output
8067
+ are the same when `stride` is set to ``1``.
8068
+ The amount of padding to is calculated by the operator internally. If the amount is even,
8069
+ it isuniformly distributed around the input, if it is odd, the excess amount goes
8070
+ to the front/right/bottom side.
8055
8071
  If this mode is set, `pad` must be 0.
8056
-
8057
- - ``"valid"``: Adopts the way of discarding. The possible largest depth, height and width of output
8058
- will be returned without padding. Extra pixels will be discarded. If this mode is set, `pad`
8059
- must be 0.
8060
-
8061
- - ``"pad"``: Implicit paddings on both sides of the input in depth, height and width. The number of `pad`
8062
- will be padded to the input Tensor borders. `pad` must be greater than or equal to 0.
8063
-
8064
- pad (Union(int, tuple[int]), optional): The pad value to be filled. Default: ``0`` .
8065
- If `pad` is an integer, the paddings
8066
- of head, tail, top, bottom, left and right are the same, equal to pad. If `pad` is a tuple of six
8067
- integers, the padding of head, tail, top, bottom, left and right equal to pad[0], pad[1], pad[2],
8068
- pad[3], pad[4] and pad[5] correspondingly.
8069
- dilation (Union[int, tuple[int]], optional): The data type is int or a tuple of 3 integers
8070
- :math:`(dilation_d, dilation_h, dilation_w)`. Currently, dilation on depth only supports the case of 1
8071
- on Ascend backend. Specifies the dilation rate to use for dilated convolution. If set :math:`k > 1`,
8072
- there will be :math:`k - 1` pixels skipped for each sampling location.
8073
- The value ranges for the depth, height, and width dimensions are [1, D], [1, H], and [1, W],
8074
- respectively. Default: ``1`` .
8075
- group (int, optional):The number of groups into which the filter is divided. `in_channels`
8072
+ - ``"valid"``: No padding is applied to the input, and the output returns the maximum
8073
+ possible depth, height and width. Extra pixels that could not complete a full stride will
8074
+ be discarded. If this mode is set, `pad` must be 0.
8075
+ - ``"pad"``: Pad the input with a specified amount. In this mode, the amount of padding
8076
+ in the depth, height and width dimension is determined by the `pad` parameter.
8077
+ If this mode is set, `pad` must be greater than or equal to 0.
8078
+
8079
+ pad (Union(int, tuple[int]), optional): Specifies the amount of padding to apply on input
8080
+ when `pad_mode` is set to ``"pad"``. It can be a single int or a tuple of 6 ints.
8081
+ If `pad` is one integer, the paddings of head, tail, top, bottom,
8082
+ left and right are the same, equal to `pad`. If `pad` is a tuple with 6 integers, the
8083
+ paddings of head, tail, top, bottom, left and right is equal to pad[0],
8084
+ pad[1], pad[2], pad[3], pad[4] and pad[5] accordingly. Default: ``0`` .
8085
+ dilation (Union[int, tuple[int]], optional): Specifies the dilation rate to use for dilated convolution.
8086
+ It can be a single int or a tuple of 3 integers. A single int means the dilation size is the same
8087
+ in the depth, height and width directions. A tuple of 3 ints represents the dilation size in
8088
+ the depth, height and width directions, respectively.
8089
+ Assuming :math:`dilation=(d0, d1, d2)`, the convolutional kernel samples the input with a
8090
+ spacing of :math:`d0-1` elements in the depth direction,
8091
+ :math:`d1-1` elements in the height direction, :math:`d2-1` elements in the
8092
+ width direction respectively. The values in the depth, height and width dimensions are in the
8093
+ ranges [1, D], [1, H] and [1, W], respectively.
8094
+ Default: ``1`` .
8095
+ group (int, optional): The number of groups into which the filter is divided. `in_channels`
8076
8096
  and `out_channels` must be divisible by `group`. Default: ``1`` .
8077
8097
  data_format (str, optional): The optional value for data format. Currently only support ``"NCDHW"`` .
8078
8098
 
@@ -8088,7 +8108,7 @@ class Conv3D(Primitive):
8088
8108
  Outputs:
8089
8109
  Tensor, the value that applied 3D convolution. The shape is :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})`.
8090
8110
 
8091
- `pad_mode` is 'same':
8111
+ `pad_mode` is ``"same"``:
8092
8112
 
8093
8113
  .. math::
8094
8114
  \begin{array}{ll} \\
@@ -8097,7 +8117,7 @@ class Conv3D(Primitive):
8097
8117
  W_{out} = \left \lceil{\frac{W_{in}}{\text{stride[2]}}} \right \rceil \\
8098
8118
  \end{array}
8099
8119
 
8100
- `pad_mode` is 'valid':
8120
+ `pad_mode` is ``"valid"``:
8101
8121
 
8102
8122
  .. math::
8103
8123
  \begin{array}{ll} \\
@@ -8109,15 +8129,15 @@ class Conv3D(Primitive):
8109
8129
  {\text{stride[2]}} + 1} \right \rfloor \\
8110
8130
  \end{array}
8111
8131
 
8112
- `pad_mode` is 'pad':
8132
+ `pad_mode` is ``"pad"``:
8113
8133
 
8114
8134
  .. math::
8115
8135
  \begin{array}{ll} \\
8116
- D_{out} = \left \lfloor{\frac{D_{in} + padding[0] + padding[1] - (\text{dilation[0]} - 1) \times
8136
+ D_{out} = \left \lfloor{\frac{D_{in} + pad[0] + pad[1] - \text{dilation[0]} \times
8117
8137
  (\text{kernel_size[0]} - 1) - 1 }{\text{stride[0]}} + 1} \right \rfloor \\
8118
- H_{out} = \left \lfloor{\frac{H_{in} + padding[2] + padding[3] - (\text{dilation[1]} - 1) \times
8138
+ H_{out} = \left \lfloor{\frac{H_{in} + pad[2] + pad[3] - \text{dilation[1]} \times
8119
8139
  (\text{kernel_size[1]} - 1) - 1 }{\text{stride[1]}} + 1} \right \rfloor \\
8120
- W_{out} = \left \lfloor{\frac{W_{in} + padding[4] + padding[5] - (\text{dilation[2]} - 1) \times
8140
+ W_{out} = \left \lfloor{\frac{W_{in} + pad[4] + pad[5] - \text{dilation[2]} \times
8121
8141
  (\text{kernel_size[2]} - 1) - 1 }{\text{stride[2]}} + 1} \right \rfloor \\
8122
8142
  \end{array}
8123
8143
 
@@ -8138,12 +8158,56 @@ class Conv3D(Primitive):
8138
8158
  >>> import mindspore
8139
8159
  >>> import numpy as np
8140
8160
  >>> from mindspore import Tensor, ops
8161
+ >>> # case 1: specify kernel_size with tuple, all parameters use default values.
8141
8162
  >>> x = Tensor(np.ones([16, 3, 10, 32, 32]), mindspore.float16)
8142
8163
  >>> weight = Tensor(np.ones([32, 3, 4, 3, 3]), mindspore.float16)
8143
8164
  >>> conv3d = ops.Conv3D(out_channel=32, kernel_size=(4, 3, 3))
8144
8165
  >>> output = conv3d(x, weight)
8145
8166
  >>> print(output.shape)
8146
8167
  (16, 32, 7, 30, 30)
8168
+ >>> # case 2: specify kernel_size with int, all parameters use default values.
8169
+ >>> x = Tensor(np.ones([10, 20, 32, 32, 32]), mindspore.float32)
8170
+ >>> weight = Tensor(np.ones([40, 20, 3, 3, 3]), mindspore.float32)
8171
+ >>> conv3d = ops.Conv3D(out_channel=40, kernel_size=3)
8172
+ >>> output = conv3d(x, weight)
8173
+ >>> print(output.shape)
8174
+ (10, 40, 30, 30, 30)
8175
+ >>> # case 3: stride=(1, 2, 3), other parameters being default.
8176
+ >>> x = Tensor(np.ones([10, 20, 32, 32, 32]), mindspore.float32)
8177
+ >>> weight = Tensor(np.ones([40, 20, 3, 3, 3]), mindspore.float32)
8178
+ >>> conv3d = ops.Conv3D(out_channel=40, kernel_size=3, stride=(1, 2, 3))
8179
+ >>> output = conv3d(x, weight)
8180
+ >>> print(output.shape)
8181
+ (10, 40, 30, 15, 10)
8182
+ >>> # case 4: pad_mode="pad", other parameters being default.
8183
+ >>> x = Tensor(np.ones([10, 20, 32, 32, 32]), mindspore.float32)
8184
+ >>> weight = Tensor(np.ones([40, 20, 3, 3, 3]), mindspore.float32)
8185
+ >>> conv3d = ops.Conv3D(out_channel=40, kernel_size=3, pad_mode="pad", pad=2)
8186
+ >>> output = conv3d(x, weight)
8187
+ >>> print(output.shape)
8188
+ (10, 40, 34, 34, 34)
8189
+ >>> # case 5: dilation=(1, 1, 1), other parameters being default.
8190
+ >>> x = Tensor(np.ones([10, 20, 32, 32, 32]), mindspore.float32)
8191
+ >>> weight = Tensor(np.ones([40, 20, 3, 3, 3]), mindspore.float32)
8192
+ >>> conv3d = ops.Conv3D(out_channel=40, kernel_size=3, dilation=(1, 1, 1))
8193
+ >>> output = conv3d(x, weight)
8194
+ >>> print(output.shape)
8195
+ (10, 40, 30, 30, 30)
8196
+ >>> # case 6: group=1, other parameters being default.
8197
+ >>> x = Tensor(np.ones([10, 20, 32, 32, 32]), mindspore.float32)
8198
+ >>> weight = Tensor(np.ones([40, 20, 3, 3, 3]), mindspore.float32)
8199
+ >>> conv3d = ops.Conv3D(out_channel=40, kernel_size=3, group=1)
8200
+ >>> output = conv3d(x, weight)
8201
+ >>> print(output.shape)
8202
+ (10, 40, 30, 30, 30)
8203
+ >>> # case 7: All parameters are specified.
8204
+ >>> x = Tensor(np.ones([10, 20, 32, 32, 32]), mindspore.float32)
8205
+ >>> weight = Tensor(np.ones([40, 20, 3, 3, 3]), mindspore.float32)
8206
+ >>> conv3d = ops.Conv3D(out_channel=40, kernel_size=3, stride=(1, 2, 3), pad_mode="pad",
8207
+ ... pad=2, dilation=(1), group=1)
8208
+ >>> output = conv3d(x, weight)
8209
+ >>> print(output.shape)
8210
+ (10, 40, 34, 17, 12)
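Case 7 can be cross-checked against the ``"pad"`` formula above with plain integer arithmetic (helper ours; symmetric padding of 2 on every side assumed):

>>> def out_size(size, kernel, stride, pad, dilation=1):
...     return (size + 2 * pad - dilation * (kernel - 1) - 1) // stride + 1
>>> [out_size(32, 3, s, 2) for s in (1, 2, 3)]   # depth, height, width strides
[34, 17, 12]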
8147
8211
  """
8148
8212
 
8149
8213
  @prim_attr_register
@@ -8218,8 +8282,22 @@ class Conv3DBackpropInput(Primitive):
8218
8282
  out_channel (int): The dimension of the output.
8219
8283
  kernel_size (Union[int, tuple[int]]): The kernel size of the 3D convolution.
8220
8284
  mode (int): Modes for different convolutions. Not currently used.
8221
- pad_mode (str): Modes to fill padding. It could be ``"valid"`` , ``"same"`` , or ``"pad"`` .
8222
- Default: ``"valid"`` .
8285
+ pad_mode (str, optional): Specifies the padding mode with a padding value of 0. It can be set to:
8286
+ ``"same"`` , ``"valid"`` or ``"pad"`` . Default: ``"valid"`` .
8287
+
8288
+ - ``"same"``: Pad the input around its depth/height/width dimension so that the shape of input and output
8289
+ are the same when `stride` is set to ``1``.
8290
+ The amount of padding is calculated by the operator internally. If the amount is even,
8291
+ it is uniformly distributed around the input; if it is odd, the excess amount goes
8292
+ to the front/right/bottom side.
8293
+ If this mode is set, `pad` must be 0.
8294
+ - ``"valid"``: No padding is applied to the input, and the output returns the maximum
8295
+ possible depth, height and width. Extra pixels that could not complete a full stride will
8296
+ be discarded. If this mode is set, `pad` must be 0.
8297
+ - ``"pad"``: Pad the input with a specified amount. In this mode, the amount of padding
8298
+ in the depth, height and width dimension is determined by the `pad` parameter.
8299
+ If this mode is set, `pad` must be greater than or equal to 0.
8300
+
8223
8301
  pad (Union(int, tuple[int])): The pad value to be filled. Default: ``0`` . If `pad` is an integer, the
8224
8302
  paddings of head, tail, top, bottom, left and right are the same, equal to pad. If `pad` is a
8225
8303
  tuple of six integers, the paddings of head, tail, top, bottom, left and right are equal to pad[0],
@@ -8443,13 +8521,14 @@ class CTCLossV2(Primitive):
 
 Args:
 blank (int, optional): The blank label. Default: ``0`` .
- reduction (str, optional): Apply specific reduction method to the output. Currently only support ``'none'`` ,
- not case sensitive. Default: ``"none"`` .
+ reduction (str, optional): Apply specific reduction method to the output. Currently only ``'none'``
+ is supported. Default: ``'none'`` .
+
 zero_infinity (bool, optional): If loss is infinite, this parameter determines whether to set that loss
 and its correlated gradient to zero. Default: ``False`` .
 
 Inputs:
- - **log_probs** (Tensor) - A tensor of shape :math:`(T, C, N)`, where :math:`T` is input length, :math:`N` is
+ - **log_probs** (Tensor) - A tensor of shape :math:`(T, N, C)`, where :math:`T` is input length, :math:`N` is
 batch size and :math:`C` is number of classes (including blank). Supported dtypes: float32, float64.
 - **targets** (Tensor) - A tensor of shape :math:`(N, S)`, where :math:`S` is max target length,
 means the target sequences. Supported dtypes: int32, int64.
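A hedged usage sketch of the corrected time-major :math:`(T, N, C)` layout (the `input_lengths`/`target_lengths` inputs and the pair of outputs are assumptions based on the usual CTC interface; they are not shown in this hunk):

>>> import numpy as np
>>> from mindspore import Tensor, ops
>>> T, N, C, S = 10, 2, 4, 5
>>> log_probs = Tensor(np.log(np.full((T, N, C), 0.25, dtype=np.float32)))  # (T, N, C)
>>> targets = Tensor(np.ones((N, S), dtype=np.int32))                       # blank=0 excluded
>>> input_lengths = Tensor(np.full((N,), T, dtype=np.int32))
>>> target_lengths = Tensor(np.full((N,), S, dtype=np.int32))
>>> ctc = ops.CTCLossV2(blank=0, reduction='none')
>>> neg_log_likelihood, log_alpha = ctc(log_probs, targets, input_lengths, target_lengths)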
@@ -8601,35 +8680,37 @@ class Conv3DTranspose(Primitive):
 Single int means the value is for the depth, height and width of the kernel.
 A tuple of 3 ints means the first value is for the depth, the second value is for the height and the
 other is for the width of the kernel.
- mode (int): Modes for different convolutions. Default is ``1`` . It is currently not used.
- pad_mode (str): Specifies padding mode. The optional values are
- ``"same"`` , ``"valid"`` , ``"pad"`` . Default: ``"valid"`` .
-
- - ``"same"``: Adopts the way of completion. The depth, height and width of the output will be equal to
- the input `x` divided by stride. The padding will be evenly calculated in head and tail, top and bottom,
- left and right directions possiblily.
- Otherwise, the last extra padding will be calculated from the tail, bottom and the right side.
+ mode (int, optional): Modes for different convolutions. Default is ``1`` . It is currently not used.
+ pad_mode (str, optional): Specifies the padding mode with a padding value of 0. It can be set to:
+ ``"same"`` , ``"valid"`` or ``"pad"`` . Default: ``"valid"`` .
+
+ - ``"same"``: Pad the input around its depth/height/width dimension so that the shapes of input and output
+ are the same when `stride` is set to ``1``.
+ The amount of padding is calculated by the operator internally. If the amount is even,
+ it is uniformly distributed around the input; if it is odd, the excess amount goes
+ to the front/right/bottom side.
 If this mode is set, `pad` must be 0.
-
- - ``"valid"``: Adopts the way of discarding. The possible largest depth, height and width of output
- will be returned without padding. Extra pixels will be discarded. If this mode is set, `pad`
- and `output_padding` must be 0.
-
- - ``"pad"``: Implicit paddings on both sides of the input in depth, height and width. The number of `pad`
- will be padded to the input Tensor borders. `pad` must be greater than or equal to 0.
-
- pad (Union(int, tuple[int])): The pad value to be filled. Default: ``0`` . If `pad` is an integer, the paddings
- of head, tail, top, bottom, left and right are the same, equal to pad. If `pad` is a tuple of six integers,
- the padding of head, tail, top, bottom, left and right equal to pad[0], pad[1], pad[2], pad[3], pad[4]
- and pad[5] correspondingly.
- stride (Union(int, tuple[int])): The distance of kernel moving, an int number that represents
+ - ``"valid"``: No padding is applied to the input, and the output returns the maximum
+ possible depth, height and width. Extra pixels that could not complete a full stride will
+ be discarded. If this mode is set, `pad` must be 0.
+ - ``"pad"``: Pad the input with a specified amount. In this mode, the amount of padding
+ in the depth, height and width dimensions is determined by the `pad` parameter.
+ If this mode is set, `pad` must be greater than or equal to 0.
+
+ pad (Union(int, tuple[int]), optional): The pad value to be filled. Default: ``0`` . If `pad` is an integer,
+ the paddings of head, tail, top, bottom, left and right are the same, equal to pad.
+ If `pad` is a tuple of six integers, the padding of head, tail, top, bottom, left and right equal
+ to pad[0], pad[1], pad[2], pad[3], pad[4] and pad[5] correspondingly.
+ stride (Union(int, tuple[int]), optional): The distance of kernel moving, an int number that represents
 the depth, height and width of movement are both strides, or a tuple of three int numbers that
 represent depth, height and width of movement respectively. Default: ``1`` .
- dilation (Union(int, tuple[int])): Specifies the space to use between kernel elements. Default: ``1`` .
- group (int): The number of groups into which the filter is divided. `in_channels`
+ dilation (Union(int, tuple[int]), optional): Specifies the space to use between kernel elements.
+ Default: ``1`` .
+ group (int, optional): The number of groups into which the filter is divided. `in_channels`
 and `out_channels` must be divisible by `group`. Default: ``1`` .
- output_padding (Union(int, tuple[int])): Add extra size to each dimension of the output. Default: ``0`` .
- data_format (str): The optional value for data format. Currently only ``'NCDHW'`` is supported.
+ output_padding (Union(int, tuple[int]), optional): Add extra size to each dimension of the output.
+ Default: ``0`` .
+ data_format (str, optional): The optional value for data format. Currently only ``'NCDHW'`` is supported.
 Default: ``'NCDHW'``.
 
 Inputs:
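A small sketch of the transposed-convolution size arithmetic implied by the "pad" mode and `output_padding` described in the hunk above (plain Python; this is the standard transposed-convolution formula, assumed rather than quoted from this diff):

>>> def deconv_out_size(n, k, stride=1, pad=0, dilation=1, output_padding=0):
...     return (n - 1) * stride - 2 * pad + dilation * (k - 1) + 1 + output_padding
>>> deconv_out_size(32, 3)    # stride 1, no padding: 32 -> 34
34
>>> deconv_out_size(32, 3, stride=2, pad=1, output_padding=1)   # a common "double the size" setup
64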
@@ -8794,14 +8875,17 @@ class Dilation2D(Primitive):
 each sampling location. Its value must be greater or equal to 1 and bounded by
 the height and width of the input `x`.
 
- pad_mode (str, optional): Specifies padding mode. The optional values are
- ``"same"`` , ``"valid"`` . Default: ``"same"`` . Both upper and lower case are supported.
+ pad_mode (str, optional): Specifies the padding mode with a padding value of 0. It can be set to:
+ ``"same"`` or ``"valid"`` . Default: ``"valid"`` .
 
- - ``"same"``: Adopts the way of completion. The height and width of the output will be the same as
- the input `x`.
+ - ``"same"``: Pad the input around its edges so that the shapes of input and output
+ are the same when `stride` is set to ``1``.
+ The amount of padding is calculated by the operator internally. If the amount is even, it is
+ uniformly distributed around the input; if it is odd, the excess amount goes to the right/bottom side.
+ - ``"valid"``: No padding is applied to the input, and the output returns the maximum
+ possible height and width. Extra pixels that could not complete a full stride will
+ be discarded.
 
- - ``"valid"``: Adopts the way of discarding. The possible largest height and width of output will be
- returned without padding. Extra pixels will be discarded.
 data_format (str, optional): The value for data format, only ``'NCHW'`` is supported at present.
 Default: ``"NCHW"`` .
 
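For orientation, a NumPy sketch of the grayscale-dilation arithmetic this operator performs (single channel, stride 1, dilation 1, "valid" mode; the add-then-max semantics are assumed from the TF-style dilation this primitive mirrors):

>>> import numpy as np
>>> x = np.random.rand(6, 6).astype(np.float32)        # input patch
>>> filt = np.random.rand(3, 3).astype(np.float32)     # dilation filter
>>> out = np.empty((4, 4), dtype=np.float32)
>>> for i in range(4):
...     for j in range(4):
...         out[i, j] = (x[i:i + 3, j:j + 3] + filt).max()   # max over the shifted sums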
@@ -8879,7 +8963,11 @@ class Dilation2D(Primitive):
 self.pad_mode = validator.check_string(pad_mode, ['VALID', 'SAME', 'valid', 'same'], 'pad_mode', self.name)
 self.add_prim_attr('pad_mode', self.pad_mode.upper())
 self.stride = _check_format_stride_or_dilation("stride", stride, self.name, self.data_format)
- if self.stride[2] < 1 or self.stride[2] > 255 or self.stride[3] < 1 or self.stride[3] > 255:
+
+ def is_in_range(x):
+ return 1 <= x <= 255
+
+ if not is_in_range(self.stride[2]) or not is_in_range(self.stride[3]):
 raise ValueError(f'For Dilation2D, size of stride is not supported, '
 f'stride should be in the range of [1, 255], '
 f'but got stride_h: `{self.stride[2]}`, stride_w: `{self.stride[3]}`.')
@@ -9418,8 +9506,8 @@ class MultilabelMarginLoss(Primitive):
 ``'sum'`` . Default: ``'mean'`` .
 
 - ``'none'``: no reduction will be applied.
- - ``'mean'``: the sum of the output will be divided by the number of elements in the output.
- - ``'sum'``: the output will be summed.
+ - ``'mean'``: compute and return the mean of elements in the output.
+ - ``'sum'``: the output elements will be summed.
 
 Inputs:
 - **x** (Tensor) - Predict data. Tensor of shape :math:`(C)` or :math:`(N, C)`, where :math:`N`
@@ -9428,7 +9516,7 @@ class MultilabelMarginLoss(Primitive):
 label targets padded by -1.
 
 Outputs:
- - **y** (Union[Tensor, Scalar]) - The loss of MultilabelMarginLoss. If `reduction` is "none", its shape
+ - **y** (Union[Tensor, Scalar]) - The loss of MultilabelMarginLoss. If `reduction` is ``"none"``, its shape
 is :math:`(N)`. Otherwise, a scalar value will be returned.
 - **is_target** (Tensor) - Output tensor for backward input, with the same shape as `target`,
 data type must be int32.
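A hedged sketch of calling the primitive with the shapes documented above (values illustrative only):

>>> import numpy as np
>>> from mindspore import Tensor, ops
>>> loss = ops.MultilabelMarginLoss(reduction='none')
>>> x = Tensor(np.array([[0.1, 0.2, 0.4, 0.8]], dtype=np.float32))   # (N, C)
>>> target = Tensor(np.array([[3, 0, -1, -1]], dtype=np.int32))      # labels padded by -1
>>> y, is_target = loss(x, target)                                   # y: (N,), is_target: (N, C)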
@@ -9694,8 +9782,22 @@ class GridSampler3D(Primitive):
 Args:
 interpolation_mode (str, optional): An optional string specifying the interpolation method.
 The optional values are ``"bilinear"`` or ``"nearest"`` . Default: ``"bilinear"`` .
+
+ - ``"nearest"``: Nearest neighbor interpolation. Each output pixel is assigned the value of the
+ nearest input pixel. This method is simple and fast but can result in blocky or pixelated outputs.
+ - ``"bilinear"``: Bilinear interpolation. Each output pixel is a weighted average of the four nearest input
+ pixels, computed using bilinear interpolation. This method produces smoother results compared
+ to nearest neighbor interpolation.
+
 padding_mode (str, optional): An optional string specifying the pad method.
 The optional values are ``"zeros"`` , ``"border"`` or ``"reflection"`` . Default: ``"zeros"`` .
+ When the sampling grid is outside input's bounds, effects of various padding modes are as follows:
+
+ - ``"zeros"``: Pads the input tensor with zeros.
+ - ``"border"``: Pads the input tensor with the values of the pixels on the border of the tensor.
+ - ``"reflection"``: Pads the input tensor by reflecting the values of the pixels at the
+ boundary of the tensor.
+
 align_corners (bool, optional): An optional bool specifying alignment method. If set to ``True`` ,
 the extrema (-1 and 1) are considered as referring to
 the center points of the input’s corner pixels. If set to ``False`` , they are instead considered as
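A hedged sketch combining the two attributes described above (shapes follow the usual :math:`(N, C, D, H, W)` input and :math:`(N, D_{out}, H_{out}, W_{out}, 3)` grid layout of 3-D grid sampling; treat them as assumptions, since this hunk shows only the Args section):

>>> import numpy as np
>>> from mindspore import Tensor, ops
>>> gs = ops.GridSampler3D(interpolation_mode='bilinear', padding_mode='zeros', align_corners=True)
>>> input_x = Tensor(np.random.rand(1, 1, 4, 4, 4).astype(np.float32))
>>> grid = Tensor(np.random.uniform(-1, 1, (1, 2, 2, 2, 3)).astype(np.float32))
>>> output = gs(input_x, grid)   # expected shape: (1, 1, 2, 2, 2)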
@@ -10178,8 +10280,12 @@ class TripletMarginLoss(Primitive):
 p (int, optional): The norm degree for pairwise distance. Default: ``2`` .
 eps (float, optional): Default: ``1e-6`` .
 swap (bool, optional): The distance swap. Default: ``False`` .
- reduction (str, optional): Apply specific reduction method to the
- output: ``"none"`` , ``"mean"`` , ``"sum"`` . Default: ``"mean"`` .
+ reduction (str, optional): Apply specific reduction method to the output: ``'none'`` , ``'mean'`` ,
+ ``'sum'`` . Default: ``'mean'`` .
+
+ - ``'none'``: no reduction will be applied.
+ - ``'mean'``: compute and return the mean of elements in the output.
+ - ``'sum'``: the output elements will be summed.
 
 Inputs:
 - **x** (Tensor) - A sample randomly selected from the training set. Data type must be BasicType.
@@ -10190,7 +10296,7 @@ class TripletMarginLoss(Primitive):
 - **margin** (Tensor) - Make a margin between the positive pair and the negative pair.
 
 Outputs:
- Union[Tensor, Scalar], if `reduction` is "none", its shape is :math:`(N)`.
+ Union[Tensor, Scalar], if `reduction` is ``"none"``, its shape is :math:`(N)`.
 Otherwise, a scalar value will be returned.
 
 Raises:
@@ -10207,7 +10313,7 @@ class TripletMarginLoss(Primitive):
 is bigger than or equal to 8.
 ValueError: If length of shape of `margin` is not 0.
 ValueError: If shape of `x`, `positive` and `negative` cannot broadcast.
- ValueError: If `reduction` is not one of 'none', 'mean', 'sum'.
+ ValueError: If `reduction` is not one of ``'none'``, ``'mean'``, ``'sum'``.
 
 Supported Platforms:
 ``GPU``
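A hedged sketch of the call described by the Inputs section above (GPU only, per the platform note; numbers illustrative):

>>> import numpy as np
>>> import mindspore
>>> from mindspore import Tensor, ops
>>> loss = ops.TripletMarginLoss(p=2, swap=False, reduction='none')
>>> x = Tensor(np.array([[0.3, 0.7], [0.5, 0.5]], dtype=np.float32))
>>> positive = Tensor(np.array([[0.4, 0.6], [0.4, 0.6]], dtype=np.float32))
>>> negative = Tensor(np.array([[0.2, 0.9], [0.3, 0.7]], dtype=np.float32))
>>> margin = Tensor(1.0, mindspore.float32)
>>> output = loss(x, positive, negative, margin)   # shape (2,) since reduction='none'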
@@ -10303,6 +10409,13 @@ class GridSampler2D(Primitive):
 interpolation_mode (str, optional): An optional string specifying the interpolation method.
 The optional values are
 ``"bilinear"`` or ``"nearest"`` . Default: ``"bilinear"`` .
+
+ - ``"nearest"``: Nearest neighbor interpolation. Each output pixel is assigned the value of the
+ nearest input pixel. This method is simple and fast but can result in blocky or pixelated outputs.
+ - ``"bilinear"``: Bilinear interpolation. Each output pixel is a weighted average of the four nearest input
+ pixels, computed using bilinear interpolation. This method produces smoother results compared
+ to nearest neighbor interpolation.
+
 padding_mode (str, optional): An optional string specifying the pad method.
 The optional values are ``"zeros"`` , ``"border"`` or ``"reflection"`` . Default: ``"zeros"`` .
 When the sampling grid is outside input's bounds, effects of various padding modes are as follows:
@@ -10317,8 +10430,12 @@ class GridSampler2D(Primitive):
 and output tensors are aligned. When set to ``False`` , it is not aligned. Default: ``False`` .
 
 Inputs:
- - **input_x** (Tensor) - A 4-D tensor with dtype of float16, float32 or float64 and shape of
- :math:`(N, C, H_{in}, W_{in})`.
+ - **input_x** (Tensor) - A 4-D tensor with shape
+ :math:`(N, C, H_{in}, W_{in})`. Supported dtypes:
+
+ - Ascend: float16, float32.
+ - GPU/CPU: float16, float32, float64.
+
 - **grid** (Tensor) - A 4-D tensor whose dtype is the same as `input_x` and whose shape is
 :math:`(N, H_{out}, W_{out}, 2)`.
 Used to specify the sampling pixel locations normalized by the input spatial
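A hedged 2-D counterpart sketch, highlighting the "border" padding mode described above (out-of-bounds grid points clamp to the edge pixels):

>>> import numpy as np
>>> from mindspore import Tensor, ops
>>> gs = ops.GridSampler2D(interpolation_mode='nearest', padding_mode='border', align_corners=False)
>>> input_x = Tensor(np.arange(16, dtype=np.float32).reshape(1, 1, 4, 4))
>>> grid = Tensor(np.random.uniform(-1.2, 1.2, (1, 3, 3, 2)).astype(np.float32))
>>> output = gs(input_x, grid)   # expected shape: (1, 1, 3, 3)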
@@ -10409,7 +10526,7 @@ class UpsampleNearest3D(Primitive):
 This operator scale up the volumetric input with specified `output_size` or `scales` factors, using nearest
 neighbor algorithm.
 
- One of `output_size` or `scales` must be given, and can not be specified both.
+ One of `output_size` or `scales` must be given, and they can not both be specified at the same time.
 
 Inputs:
 - **x** (Tensor) - 5D tensor of shape :math:`(N, C, D_{in}, H_{in}, W_{in})`.
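For integer scale factors the nearest-neighbor semantics are easy to pin down; a NumPy reference (independent of the primitive's exact call signature, which this hunk does not show):

>>> import numpy as np
>>> x = np.arange(8).reshape(1, 1, 2, 2, 2)
>>> up = x.repeat(2, axis=2).repeat(2, axis=3).repeat(2, axis=4)   # scales = (2, 2, 2)
>>> up.shape
(1, 1, 4, 4, 4)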
@@ -11116,46 +11233,45 @@ class Dense(Primitive):
 Applies dense connected operator for the input. The implement of the operation is as:
 
 .. math::
- \text{output} = \text{x} * \text{w} + \text{b},
+ output = x @ w ^ T + b,
 
- where :math:`x` is the input tensor, :math:`\text{w}` is a weight matrix with the same data type as the :math:`x` ,
- and :math:`\text{b}` is a bias vector with the same data type as the :math:`x` (only if has_bias is True).
-
- Args:
- has_bias (bool): Specifies whether the layer uses a bias vector :math:`\text{b}`. Default: True.
+ where :math:`x` is the input tensor, :math:`w` is a weight matrix with the same data type as the :math:`x` ,
+ and :math:`b` is a bias vector with the same data type as the :math:`x` (only if `b` is not ``None``).
 
 Inputs:
- - **x** (Union[Tensor, Parameter]) - The input tensor with data type of float16, float32 or float64.
- - **w** (Union[Tensor, Parameter]) - The weight tensor with data type of float16, float32 or float64.
- - **b** (Union[Tensor, Parameter]) - The bias tensor with data type of float16, float32 or float64.
+ - **x** (Tensor) - The shape must meet the following requirement: :math:`len(x.shape)>0`.
+ - **w** (Tensor) - The shape must meet the following requirements:
+ If :math:`len(x.shape)>1`, :math:`len(w.shape)=2`. If :math:`len(x.shape)=1`, :math:`len(w.shape)=1`.
+ :math:`w.shape[-1]=x.shape[-1]`.
+ - **b** (Union[Tensor, None]) - If `b` is not ``None``, the shape must meet the following requirements:
+ If :math:`len(x.shape)>1`, :math:`len(b.shape)=0` or :math:`len(b.shape)=1` .
+ If :math:`len(b.shape)=1`, :math:`b.shape[0]=w.shape[0]`.
+ If :math:`len(x.shape)=1`, :math:`len(b.shape)=0`.
 
 Outputs:
- Tensor of shape :math:`(*x.shape[:-1], w.shape[0])`.
-
- Raises:
- TypeError: If `has_bias` is not a bool.
+ If :math:`len(x.shape)>1`, Tensor of shape :math:`(*x.shape[:-1], w.shape[0])`.
+ If :math:`len(x.shape)=1`, Tensor of shape :math:`()`.
 
 Supported Platforms:
- ``GPU``
+ ``Ascend`` ``GPU`` ``CPU``
 
 Examples:
- >>> from mindspore.ops.operations import nn_ops
+ >>> import numpy as np
+ >>> from mindspore import Tensor, ops
 >>> x = Tensor(np.random.random((4, 5, 6, 7)).astype(np.float32))
- >>> weight = Parameter(np.random.random((6, 7)).astype(np.float32))
- >>> bias = Parameter(np.random.random((6,)).astype(np.float32))
- >>> dense = nn_ops.Dense()
+ >>> weight = Tensor(np.random.random((6, 7)).astype(np.float32))
+ >>> bias = Tensor(np.random.random((6,)).astype(np.float32))
+ >>> dense = ops.Dense()
 >>> output = dense(x, weight, bias)
 >>> print(output.shape)
 (4, 5, 6, 6)
 """
 
 @prim_attr_register
- def __init__(self, has_bias=True):
+ def __init__(self):
 """Initialize Dense."""
 self.init_prim_io_names(inputs=['x', 'w', 'b'], outputs=["output"])
- self.has_bias = has_bias
- self.has_bias = validator.check_bool(has_bias, "has_bias", "Dense")
- self.add_prim_attr("has_bias", self.has_bias)
+ self.add_prim_attr("has_bias", True)
 
 
 class WKV(Primitive):
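The new 1-D case in the Dense hunk above reduces to a dot product; a hedged sketch (passing `b` as None is assumed to be allowed per the Union[Tensor, None] annotation):

>>> import numpy as np
>>> from mindspore import Tensor, ops
>>> x1 = Tensor(np.ones(7).astype(np.float32))
>>> w1 = Tensor(np.ones(7).astype(np.float32))
>>> out = ops.Dense()(x1, w1, None)   # scalar output, shape ()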
@@ -11166,22 +11282,22 @@ class WKV(Primitive):
 
 Inputs:
 - **w** (Tensor) - The time_first tensor with data type of float32.
- Input tensor of shape :math:`(hidden_size,)`.
+ Input tensor of shape :math:`(hidden\_size,)`.
 - **u** (Tensor]) - The time_decay tensor with data type of float32.
- Input tensor of shape :math:`(hidden_size,)`.
+ Input tensor of shape :math:`(hidden\_size,)`.
 - **k** (Tensor) - The key tensor with data type of float32.
- Input tensor of shape :math:`(batch_size, seq_length, hidden_size)`.
+ Input tensor of shape :math:`(batch\_size, seq\_length, hidden\_size)`.
 - **v** (Tensor) - The value tensor with data type of float32.
- Input tensor of shape :math:`(batch_size, seq_length, hidden_size)`.
+ Input tensor of shape :math:`(batch\_size, seq\_length, hidden\_size)`.
 - **sp** (Tensor) - The states_p tensor with data type of float32.
- Input tensor of shape :math:`(batch_size, seq_length, hidden_size)`.
+ Input tensor of shape :math:`(batch\_size, seq\_length, hidden\_size)`.
 - **sq** (Tensor) - The states_q tensor with data type of float32.
- Input tensor of shape :math:`(batch_size, hidden_size)`.
+ Input tensor of shape :math:`(batch\_size, hidden\_size)`.
 - **sm** (Tensor) - The states_m tensor with data type of float32.
- Input tensor of shape :math:`(batch_size, hidden_size)`.
+ Input tensor of shape :math:`(batch\_size, hidden\_size)`.
 
 Outputs:
- Tensor of shape :math:`(batch_size, seq_length, hidden_size)`.
+ Tensor of shape :math:`(batch\_size, seq\_length, hidden\_size)`.
 
 Supported Platforms:
 ``Ascend``
@@ -11209,3 +11325,255 @@ class WKV(Primitive):
 """Initialize WKV."""
 self.init_prim_io_names(inputs=["time_first", "time_decay", "key", "value", "sp", "sq", "sm"],
 outputs=["output", "out_sp", "out_sq", "out_sm"])
+
+
+ class PromptFlashAttention(Primitive):
+ r"""
+ The interface for full inference.
+ B -- Batch size
+ S -- Sequence length
+ H -- Hidden size
+
+ Refer to :func:`mindspore.ops.prompt_flash_attention` for more detail.
+
+ .. warning::
+ This is an experimental API that is subject to change or deletion.
+
+ Args:
+ num_heads (int): The number of heads.
+ scale_value (float): The scale value indicating the scale coefficient, which is used as the scalar of
+ Muls in the calculation. Default: 1.0.
+ pre_tokens (int): Previous tokens. Default: 2147483547.
+ next_tokens (int): Next tokens, indicating the number of data blocks in the upper triangle that are
+ involved in the calculation. The value 0 indicates that the data blocks in the upper triangle are
+ not involved in the calculation. Default: 0.
+ input_layout (str): The data layout of the input qkv, supports ``"BSH"`` and ``"BNSD"``. Default: ``"BSH"``.
+ num_key_value_heads (int): Head numbers of key/value, used in the GQA algorithm.
+ The value 0 indicates that key and value have the same head numbers as `num_heads`. Default: 0.
+ sparse_mode (int): Default: 0.
+
+ Inputs:
+ - **query** (Tensor) - The query tensor with data type of float16 or float32.
+ Input tensor of shape :math:`(B, S, H)` / `(B, N, S, D)`.
+ - **key** (Tensor) - The key tensor with data type of float16 or float32.
+ Input tensor of shape :math:`(B, S, H)` / `(B, N, S, D)`.
+ - **value** (Tensor) - The value tensor with data type of float16 or float32.
+ Input tensor of shape :math:`(B, S, H)` / `(B, N, S, D)`.
+ - **attn_mask** (Tensor) - The attention mask tensor with data type of float16 or float32.
+ For each element, 0 indicates retention and 1 indicates discard. Input tensor of shape :math:`(B, 1, S, S)`.
+ - **actual_seq_lengths** (Tensor) - Describe actual sequence length of each input with data type of int.
+ - **actual_seq_lengths_kv** (Tensor) - Describe actual sequence length of each input with data type of int.
+ - **padding_mask** (Tensor) - The padding mask tensor with data type of float16 or float32.
+ - **deq_scale1** (Tensor)
+ - **quant_scale1** (Tensor)
+ - **deq_scale2** (Tensor)
+ - **quant_scale2** (Tensor)
+ - **quant_offset2** (Tensor)
+
+
+ Outputs:
+ - **attention_out** (Tensor) - Output tensor of shape :math:`(B, S, H)` / `(B, N, S, D)`.
+
+ Supported Platforms:
+ ``Ascend``
+
+ Examples:
+ >>> import mindspore.ops.operations.nn_ops as P
+ >>> from mindspore import Tensor
+ >>> import numpy as np
+ >>> B = 1
+ >>> N = 16
+ >>> S = 256
+ >>> D = 16
+ >>> query = Tensor(np.ones((B, N, S, D), dtype=np.float16))
+ >>> key = Tensor(np.ones((B, N, S, D), dtype=np.float16))
+ >>> value = Tensor(np.ones((B, N, S, D), dtype=np.float16))
+ >>> pfa = P.PromptFlashAttention(N, input_layout='BNSD')
+ >>> out = pfa(query, key, value, None, None, None, None, None, None, None, None, None)
+ >>> print(out[0].shape)
+ (1, 16, 256, 16)
+ """
+
+ @prim_attr_register
+ def __init__(self, num_heads, scale_value=1.0, pre_tokens=2147483547, next_tokens=0, input_layout='BSH',
+ num_key_value_heads=0, sparse_mode=0):
+ """Initialize PromptFlashAttention."""
+ validator.check_value_type('num_heads', num_heads, [int], self.name)
+ validator.check_value_type('scale_value', scale_value, [float], self.name)
+ validator.check_value_type('pre_tokens', pre_tokens, [int], self.name)
+ validator.check_value_type('next_tokens', next_tokens, [int], self.name)
+ validator.check_value_type('input_layout', input_layout, [str], self.name)
+ validator.check_value_type('num_key_value_heads', num_key_value_heads, [int], self.name)
+ validator.check_value_type('sparse_mode', sparse_mode, [int], self.name)
+ self.init_prim_io_names(inputs=["query", "key", "value", "attn_mask", "actual_seq_lengths",
+ "actual_seq_lengths_kv", "padding_mask", "deq_scale1", "quant_scale1",
+ "deq_scale2", "quant_scale2", "quant_offset2"],
+ outputs=["attention_out"])
+
+
+ class FlashAttentionScore(Primitive):
+ r"""
+ FlashAttentionScore.
+
+ .. warning::
+ This is an experimental API that is subject to change or deletion.
+
+ B -- Batch size
+ S1 -- Sequence length of query
+ S2 -- Sequence length of key and value
+ N1 -- Num heads of query
+ N2 -- Num heads of key and value, and N2 must be a factor of N1
+ D -- Head size
+ H1 -- Hidden size of query, which equals N1 * D
+ H2 -- Hidden size of key and value, which equals N2 * D
+
+ Args:
+ head_num (int): The head num of query.
+ keep_prob (float): The keep probability of dropout. Default: 1.0.
+ scale_value (float): The scale value. Default: 1.0.
+ pre_tokens (int): Previous tokens. Default: 65536.
+ next_tokens (int): Next tokens. Default: 65536.
+ inner_precise (int): Specify the execution mode, where 0 indicates high precision mode and 1 indicates high
+ performance mode. Only 0 is supported currently. Default: 0.
+ input_layout (str, optional): Specifies the layout of `query`, the value must be one of ["BSH", "BNSD"].
+ Default: "BSH".
+ sparse_mode (int): Default: 0.
+
+ Inputs:
+ - **query** (Tensor[float16, float32, bfloat16]) - The query tensor.
+ Input tensor of shape :math:`(B, S1, H1)` or `(B, N1, S1, D)`.
+ - **key** (Tensor[float16, float32, bfloat16]) - The key tensor.
+ Input tensor of shape :math:`(B, S2, H2)` or `(B, N2, S2, D)`.
+ - **value** (Tensor[float16, float32, bfloat16]) - The value tensor.
+ Input tensor of shape :math:`(B, S2, H2)` or `(B, N2, S2, D)`.
+ - **real_shift** (Tensor[float16, float32, bfloat16], None) - The position embedding code.
+ Input tensor of shape :math:`(B, N1, S1, S2)` or `(B, N1, 1, S2)`.
+ - **drop_mask** (Tensor[uint8], None) - The dropout mask tensor.
+ Input tensor of shape :math:`(B, N1, S1, S2 // 8)` or None.
+ - **padding_mask** (None) - The padding mask of float16 or float32, not implemented yet.
+ - **attn_mask** (Tensor[uint8], None) - The attention mask tensor.
+ For each element, 0 indicates retention and 1 indicates discard.
+ Input tensor of shape :math:`(B, N1, S1, S2)`, `(B, 1, S1, S2)` or `(S1, S2)`.
+ - **prefix** (Tensor[int64], None) - Not implemented yet.
+ Input tensor of shape :math:`(B,)`.
+
+ Outputs:
+ - **softmax_max** (Tensor[float32]) - Output tensor of shape :math:`(B, N1, S1, 8)`.
+ - **softmax_sum** (Tensor[float32]) - Output tensor of shape :math:`(B, N1, S1, 8)`.
+ - **softmax_out** (Tensor[float32]) - Useless output, ignore it. Output tensor of shape :math:`()`.
+ - **attention_out** (Tensor[float16, float32, bfloat16]) - The output of attention; its shape and data type
+ are the same as the query.
+
+ Supported Platforms:
+ ``Ascend``
+ """
+
+ @prim_attr_register
+ def __init__(self, head_num, keep_prob=1.0, scale_value=1.0, pre_tokens=65536, next_tokens=65536, inner_precise=0,
+ input_layout="BSH", sparse_mode=0):
+ """Initialize FlashAttentionScore"""
+ validator.check_value_type('head_num', head_num, [int], self.name)
+ validator.check_value_type('keep_prob', keep_prob, [int, float], self.name)
+ validator.check_float(keep_prob, 0.0, validator.GE, "keep_prob", self.name)
+ validator.check_float(keep_prob, 1.0, validator.LE, "keep_prob", self.name)
+ validator.check_value_type('scale_value', scale_value, [float], self.name)
+ validator.check_value_type('pre_tokens', pre_tokens, [int], self.name)
+ validator.check_value_type('next_tokens', next_tokens, [int], self.name)
+ validator.check_value_type('inner_precise', inner_precise, [int], self.name)
+ validator.check_value_type('sparse_mode', sparse_mode, [int], self.name)
+ if inner_precise not in [0]:
+ raise ValueError(f"Attribute 'inner_precise' must be 0, but got {inner_precise}")
+ validator.check_value_type('input_layout', input_layout, [str], self.name)
+ if input_layout not in ["BSH", "BNSD"]:
+ raise ValueError(f"Attribute 'input_layout' must be either 'BSH' or 'BNSD', but got {input_layout}")
+ self.init_prim_io_names(
+ inputs=['query', 'key', 'value', 'real_shift', 'drop_mask', 'padding_mask', 'attn_mask', 'prefix'],
+ outputs=['softmax_max', 'softmax_sum', 'softmax_out', 'attention_out'])
+
+
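FlashAttentionScore ships without an Examples block; a hedged sketch of what a call might look like on Ascend, based solely on the signature and the IO names registered above (shapes follow the BNSD layout; the None placeholders for the five optional inputs are an untested assumption mirroring the PromptFlashAttention example):

>>> import numpy as np
>>> import mindspore.ops.operations.nn_ops as P
>>> from mindspore import Tensor
>>> B, N1, S, D = 1, 8, 128, 16
>>> query = Tensor(np.random.rand(B, N1, S, D).astype(np.float16))
>>> key = Tensor(np.random.rand(B, N1, S, D).astype(np.float16))
>>> value = Tensor(np.random.rand(B, N1, S, D).astype(np.float16))
>>> fas = P.FlashAttentionScore(head_num=N1, input_layout='BNSD')
>>> _, _, _, attention_out = fas(query, key, value, None, None, None, None, None)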
+ class RmsNorm(Primitive):
+ r"""
+ The RmsNorm operator is a normalization operation, and its formula is:
+
+ .. math::
+ y_i=\frac{x_i}{\sqrt{\frac{1}{n}\sum_{j=1}^{n}x_j^2+\varepsilon}}\gamma_i
+
+ .. warning::
+ This is an experimental API that is subject to change or deletion.
+
+ Args:
+ epsilon (float): A value added to the denominator to prevent division by 0. Default: `1e-6`.
+
+ Inputs:
+ - **input_x** (Tensor) - Input data of RmsNorm, supported data types: float16, float32, bfloat16.
+ - **gamma** (Tensor) - Supported data types: float16, float32, bfloat16.
+
+ Outputs:
+ - **y** (Tensor) - Has the same type and shape as `input_x`.
+ - **rstd** (Tensor) - Has the same type as `input_x`, used by gradient calculation.
+
+ Raises:
+ TypeError: If data type of `input_x` is not one of the following: float16, float32, bfloat16.
+ TypeError: If data type of `gamma` is not one of the following: float16, float32, bfloat16.
+ TypeError: If data type of `input_x` is not the same as the data type of `gamma`.
+
+ Supported Platforms:
+ ``Ascend``
+ """
+
+ @prim_attr_register
+ def __init__(self, epsilon=1e-6):
+ """Initialize RmsNorm."""
+ validator.check_value_type("epsilon", epsilon, [float], self.name)
+ self.init_prim_io_names(inputs=['x', 'gamma'], outputs=["y", "rstd"])
+
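A NumPy reference for the corrected formula above (reduction over the last axis assumed):

>>> import numpy as np
>>> x = np.array([[1.0, 2.0, 3.0]], dtype=np.float32)
>>> gamma = np.ones(3, dtype=np.float32)
>>> eps = 1e-6
>>> rstd = 1.0 / np.sqrt((x ** 2).mean(axis=-1, keepdims=True) + eps)   # the second output
>>> y = x * rstd * gamma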
+
+ class PagedAttention(Primitive):
+ r"""
+ .. warning::
+ This is an experimental API that is subject to change or deletion.
+ """
+ @prim_attr_register
+ def __init__(self, head_num, scale_value=1.0, kv_head_num=0):
+ """Initialize PagedAttention"""
+ validator.check_value_type('head_num', head_num, [int], self.name)
+ validator.check_value_type('scale_value', scale_value, [float], self.name)  # scale after qkbmm
+ validator.check_value_type('kv_head_num', kv_head_num, [int], self.name)  # for MQA
+ self.init_prim_io_names(
+ inputs=['query', 'key_cache', 'value_cache', 'block_tables', 'context_lens'],
+ outputs=['attention_out'])
+
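PagedAttention carries no docstring beyond the warning; a hedged NumPy sketch of the paged-KV lookup implied by its 'block_tables' and 'key_cache' input names (the block size and cache layout below are assumptions borrowed from paged-attention serving stacks, not from this diff):

>>> import numpy as np
>>> num_blocks, block_size, hidden = 16, 4, 8
>>> key_cache = np.random.rand(num_blocks, block_size, hidden).astype(np.float32)
>>> block_table = np.array([3, 7])                         # logical -> physical blocks of one sequence
>>> seq_keys = key_cache[block_table].reshape(-1, hidden)  # contiguous keys for attention, (8, hidden)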
+
+ class PagedAttentionMask(Primitive):
+ r"""
+ .. warning::
+ This is an experimental API that is subject to change or deletion.
+ """
+ @prim_attr_register
+ def __init__(self, head_num, scale_value=1.0, kv_head_num=0):
+ """Initialize PagedAttentionMask"""
+ validator.check_value_type('head_num', head_num, [int], self.name)
+ validator.check_value_type('scale_value', scale_value, [float], self.name)  # scale after qkbmm
+ validator.check_value_type('kv_head_num', kv_head_num, [int], self.name)  # for MQA
+ self.init_prim_io_names(
+ inputs=['query', 'key_cache', 'value_cache', 'block_tables', 'context_lens', 'alibi_mask'],
+ outputs=['attention_out'])
+
+
+ class ReshapeAndCache(Primitive):
+ r"""
+ .. warning::
+ This is an experimental API that is subject to change or deletion.
+ """
+ __mindspore_signature__ = (
+ sig.make_sig('key', dtype=sig.sig_dtype.T),
+ sig.make_sig('value', dtype=sig.sig_dtype.T),
+ sig.make_sig('key_cache', sig.sig_rw.RW_WRITE, dtype=sig.sig_dtype.T),
+ sig.make_sig('value_cache', sig.sig_rw.RW_WRITE, dtype=sig.sig_dtype.T),
+ sig.make_sig('slot_mapping', dtype=sig.sig_dtype.T1),
+ )
+
+ @prim_attr_register
+ def __init__(self):
+ """Initialize ReshapeAndCache"""
+ self.init_prim_io_names(
+ inputs=['key', 'value', 'key_cache', 'value_cache', 'slot_mapping'],
+ outputs=['key_out'])
+ self.add_prim_attr('side_effect_mem', True)
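ReshapeAndCache likewise documents only input names; a hedged NumPy sketch of the in-place scatter that its RW_WRITE signature entries and 'side_effect_mem' attribute suggest (the flat slot layout is an assumption):

>>> import numpy as np
>>> num_slots, hidden = 8, 4
>>> key_cache = np.zeros((num_slots, hidden), dtype=np.float32)
>>> key = np.ones((2, hidden), dtype=np.float32)   # two new tokens
>>> slot_mapping = np.array([5, 2])                # destination slot of each token
>>> key_cache[slot_mapping] = key                  # in-place update, mirroring the memory side effect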