mindspore-2.1.0-cp38-cp38-win_amd64.whl → mindspore-2.2.11-cp38-cp38-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mindspore/.commit_id +1 -1
- mindspore/Microsoft.VisualStudio.Telemetry.dll +0 -0
- mindspore/Newtonsoft.Json.dll +0 -0
- mindspore/__init__.py +4 -1
- mindspore/_c_dataengine.cp38-win_amd64.pyd +0 -0
- mindspore/_c_expression.cp38-win_amd64.pyd +0 -0
- mindspore/_c_mindrecord.cp38-win_amd64.pyd +0 -0
- mindspore/_check_jit_forbidden_api.py +3 -1
- mindspore/_checkparam.py +23 -29
- mindspore/_extends/graph_kernel/__init__.py +0 -1
- mindspore/_extends/graph_kernel/model/graph_split.py +84 -76
- mindspore/_extends/graph_kernel/model/model_builder.py +9 -50
- mindspore/_extends/graph_kernel/splitter.py +4 -11
- mindspore/_extends/parallel_compile/akg_compiler/akg_process.py +122 -15
- mindspore/_extends/parallel_compile/akg_compiler/build_tbe_kernel.py +84 -67
- mindspore/_extends/parallel_compile/akg_compiler/tbe_topi.py +4 -2
- mindspore/_extends/parallel_compile/akg_compiler/util.py +10 -7
- mindspore/_extends/parallel_compile/tbe_compiler/tbe_adapter.py +2 -2
- mindspore/_extends/parallel_compile/tbe_compiler/tbe_helper.py +6 -5
- mindspore/_extends/parallel_compile/tbe_compiler/tbe_job.py +1 -1
- mindspore/_extends/parallel_compile/tbe_compiler/tbe_job_manager.py +1 -1
- mindspore/_extends/parse/__init__.py +13 -15
- mindspore/_extends/parse/namespace.py +7 -33
- mindspore/_extends/parse/parser.py +67 -72
- mindspore/_extends/parse/resources.py +1 -1
- mindspore/_extends/parse/standard_method.py +86 -106
- mindspore/_extends/parse/trope.py +1 -1
- mindspore/_extends/remote/kernel_build_server.py +25 -7
- mindspore/_extends/remote/kernel_build_server_akg_v2.py +55 -0
- mindspore/_install_custom.py +43 -0
- mindspore/amp.py +47 -11
- mindspore/atlprov.dll +0 -0
- mindspore/boost/boost.py +1 -8
- mindspore/boost/boost_cell_wrapper.py +3 -2
- mindspore/boost/grad_accumulation.py +1 -1
- mindspore/boost/group_loss_scale_manager.py +8 -7
- mindspore/c1.dll +0 -0
- mindspore/c1xx.dll +0 -0
- mindspore/c2.dll +0 -0
- mindspore/common/__init__.py +5 -3
- mindspore/common/_jit_fallback_utils.py +6 -0
- mindspore/common/_register_for_adapter.py +2 -0
- mindspore/common/_register_for_tensor.py +2 -2
- mindspore/common/_stub_tensor.py +13 -0
- mindspore/common/_utils.py +29 -0
- mindspore/common/api.py +174 -259
- mindspore/common/auto_dynamic_shape.py +494 -0
- mindspore/common/dtype.py +18 -11
- mindspore/common/dump.py +6 -4
- mindspore/common/initializer.py +14 -14
- mindspore/common/jit_config.py +33 -15
- mindspore/common/lazy_inline.py +126 -7
- mindspore/common/mindir_util.py +101 -0
- mindspore/common/parameter.py +51 -41
- mindspore/common/seed.py +4 -4
- mindspore/common/sparse_tensor.py +13 -14
- mindspore/common/tensor.py +243 -165
- mindspore/communication/__init__.py +7 -4
- mindspore/communication/_comm_helper.py +83 -4
- mindspore/communication/management.py +152 -84
- mindspore/config/op_info.config +14 -3
- mindspore/context.py +152 -61
- mindspore/dataset/__init__.py +5 -5
- mindspore/dataset/audio/__init__.py +2 -2
- mindspore/dataset/audio/transforms.py +52 -52
- mindspore/dataset/callback/ds_callback.py +16 -2
- mindspore/dataset/core/config.py +68 -51
- mindspore/dataset/engine/cache_client.py +33 -7
- mindspore/dataset/engine/datasets.py +250 -112
- mindspore/dataset/engine/datasets_audio.py +43 -211
- mindspore/dataset/engine/datasets_standard_format.py +16 -35
- mindspore/dataset/engine/datasets_text.py +43 -67
- mindspore/dataset/engine/datasets_user_defined.py +86 -100
- mindspore/dataset/engine/datasets_vision.py +219 -1029
- mindspore/dataset/engine/iterators.py +11 -4
- mindspore/dataset/engine/obs/obs_mindrecord_dataset.py +4 -0
- mindspore/dataset/engine/obs/util.py +3 -0
- mindspore/dataset/engine/samplers.py +1 -1
- mindspore/dataset/engine/validators.py +19 -5
- mindspore/dataset/text/__init__.py +3 -3
- mindspore/dataset/text/transforms.py +101 -127
- mindspore/dataset/text/utils.py +205 -138
- mindspore/dataset/transforms/__init__.py +1 -1
- mindspore/dataset/transforms/py_transforms_util.py +40 -12
- mindspore/dataset/transforms/transforms.py +95 -40
- mindspore/dataset/utils/browse_dataset.py +8 -2
- mindspore/dataset/utils/line_reader.py +17 -19
- mindspore/dataset/vision/__init__.py +3 -3
- mindspore/dataset/vision/c_transforms.py +6 -3
- mindspore/dataset/vision/transforms.py +409 -287
- mindspore/dataset/vision/utils.py +13 -14
- mindspore/dataset/vision/validators.py +11 -1
- mindspore/dnnl.dll +0 -0
- mindspore/dpcmi.dll +0 -0
- mindspore/experimental/map_parameter.py +14 -0
- mindspore/{nn/optim_ex → experimental/optim}/__init__.py +30 -29
- mindspore/{nn/optim_ex → experimental/optim}/adam.py +60 -67
- mindspore/{nn/optim_ex → experimental/optim}/adamw.py +181 -203
- mindspore/experimental/optim/lr_scheduler.py +1427 -0
- mindspore/{nn/optim_ex → experimental/optim}/optimizer.py +252 -259
- mindspore/{nn/optim_ex → experimental/optim}/sgd.py +147 -152
- mindspore/gen_ops.py +273 -0
- mindspore/include/OWNERS +0 -1
- mindspore/include/api/data_type.h +2 -1
- mindspore/include/api/graph.h +0 -15
- mindspore/include/api/kernel.h +2 -0
- mindspore/include/api/kernel_api.h +37 -12
- mindspore/include/api/model.h +17 -14
- mindspore/include/api/status.h +8 -3
- mindspore/include/api/types.h +37 -4
- mindspore/include/c_api/ms/abstract.h +67 -0
- mindspore/include/c_api/ms/attribute.h +197 -0
- mindspore/include/c_api/ms/base/handle_types.h +43 -0
- mindspore/include/c_api/ms/base/macros.h +32 -0
- mindspore/include/c_api/ms/base/status.h +33 -0
- mindspore/include/c_api/ms/base/types.h +282 -0
- mindspore/include/c_api/ms/context.h +102 -0
- mindspore/include/c_api/ms/graph.h +160 -0
- mindspore/include/c_api/ms/node.h +606 -0
- mindspore/include/c_api/ms/tensor.h +161 -0
- mindspore/include/c_api/ms/value.h +84 -0
- mindspore/include/dataset/constants.h +6 -5
- mindspore/include/dataset/execute.h +23 -13
- mindspore/include/dataset/text.h +26 -26
- mindspore/include/dataset/transforms.h +13 -13
- mindspore/include/dataset/vision.h +60 -60
- mindspore/include/dataset/vision_ascend.h +5 -6
- mindspore/include/dataset/vision_lite.h +17 -17
- mindspore/jpeg62.dll +0 -0
- mindspore/mindrecord/tools/imagenet_to_mr.py +1 -1
- mindspore/mindrecord/tools/mnist_to_mr.py +2 -2
- mindspore/mindspore_backend.dll +0 -0
- mindspore/mindspore_common.dll +0 -0
- mindspore/mindspore_core.dll +0 -0
- mindspore/mindspore_glog.dll +0 -0
- mindspore/mindspore_shared_lib.dll +0 -0
- mindspore/msobj140.dll +0 -0
- mindspore/mspdb140.dll +0 -0
- mindspore/mspdbcore.dll +0 -0
- mindspore/mspdbst.dll +0 -0
- mindspore/mspft140.dll +0 -0
- mindspore/msvcdis140.dll +0 -0
- mindspore/msvcp140_1.dll +0 -0
- mindspore/msvcp140_2.dll +0 -0
- mindspore/msvcp140_atomic_wait.dll +0 -0
- mindspore/msvcp140_codecvt_ids.dll +0 -0
- mindspore/nn/__init__.py +0 -2
- mindspore/nn/cell.py +313 -74
- mindspore/nn/dynamic_lr.py +21 -21
- mindspore/nn/layer/activation.py +22 -30
- mindspore/nn/layer/basic.py +15 -13
- mindspore/nn/layer/channel_shuffle.py +1 -1
- mindspore/nn/layer/container.py +271 -9
- mindspore/nn/layer/conv.py +323 -204
- mindspore/nn/layer/dense.py +8 -5
- mindspore/nn/layer/embedding.py +33 -27
- mindspore/nn/layer/flash_attention.py +61 -95
- mindspore/nn/layer/image.py +8 -6
- mindspore/nn/layer/math.py +16 -25
- mindspore/nn/layer/normalization.py +107 -66
- mindspore/nn/layer/padding.py +1 -1
- mindspore/nn/layer/pooling.py +131 -109
- mindspore/nn/layer/rnn_cells.py +27 -22
- mindspore/nn/layer/rnns.py +13 -16
- mindspore/nn/layer/thor_layer.py +1 -1
- mindspore/nn/layer/transformer.py +221 -154
- mindspore/nn/learning_rate_schedule.py +9 -1
- mindspore/nn/loss/loss.py +235 -174
- mindspore/nn/optim/ada_grad.py +2 -1
- mindspore/nn/optim/adadelta.py +1 -0
- mindspore/nn/optim/adafactor.py +2 -1
- mindspore/nn/optim/adam.py +7 -4
- mindspore/nn/optim/adamax.py +3 -2
- mindspore/nn/optim/adasum.py +2 -2
- mindspore/nn/optim/asgd.py +2 -3
- mindspore/nn/optim/ftrl.py +6 -5
- mindspore/nn/optim/lamb.py +7 -4
- mindspore/nn/optim/lars.py +1 -1
- mindspore/nn/optim/lazyadam.py +5 -3
- mindspore/nn/optim/momentum.py +2 -1
- mindspore/nn/optim/optimizer.py +53 -4
- mindspore/nn/optim/proximal_ada_grad.py +3 -4
- mindspore/nn/optim/rmsprop.py +4 -3
- mindspore/nn/optim/rprop.py +23 -12
- mindspore/nn/optim/sgd.py +26 -11
- mindspore/nn/optim/thor.py +9 -7
- mindspore/nn/probability/bijector/bijector.py +5 -5
- mindspore/nn/probability/bijector/power_transform.py +27 -27
- mindspore/nn/probability/bijector/softplus.py +3 -3
- mindspore/nn/probability/distribution/_utils/custom_ops.py +3 -3
- mindspore/nn/probability/distribution/bernoulli.py +5 -5
- mindspore/nn/probability/distribution/beta.py +3 -3
- mindspore/nn/probability/distribution/categorical.py +7 -7
- mindspore/nn/probability/distribution/cauchy.py +0 -1
- mindspore/nn/probability/distribution/distribution.py +3 -3
- mindspore/nn/probability/distribution/gamma.py +3 -3
- mindspore/nn/probability/distribution/geometric.py +4 -4
- mindspore/nn/probability/distribution/gumbel.py +4 -4
- mindspore/nn/probability/distribution/log_normal.py +2 -2
- mindspore/nn/probability/distribution/logistic.py +2 -2
- mindspore/nn/probability/distribution/poisson.py +4 -4
- mindspore/nn/probability/distribution/transformed_distribution.py +3 -3
- mindspore/nn/probability/distribution/uniform.py +6 -6
- mindspore/nn/wrap/__init__.py +4 -2
- mindspore/nn/wrap/cell_wrapper.py +87 -34
- mindspore/nn/wrap/grad_reducer.py +8 -5
- mindspore/nn/wrap/loss_scale.py +105 -42
- mindspore/numpy/array_creations.py +1 -2
- mindspore/numpy/array_ops.py +3 -2
- mindspore/numpy/utils_const.py +5 -5
- mindspore/opencv_core452.dll +0 -0
- mindspore/opencv_imgcodecs452.dll +0 -0
- mindspore/opencv_imgproc452.dll +0 -0
- mindspore/ops/_grad_experimental/__init__.py +0 -5
- mindspore/ops/_grad_experimental/grad_array_ops.py +2 -3
- mindspore/ops/_grad_experimental/grad_comm_ops.py +15 -2
- mindspore/ops/_grad_experimental/grad_debug_ops.py +0 -37
- mindspore/ops/_grad_experimental/grad_implementations.py +11 -1
- mindspore/ops/_grad_experimental/grad_inner_ops.py +2 -216
- mindspore/ops/_grad_experimental/grad_math_ops.py +19 -199
- mindspore/ops/_grad_experimental/grad_sparse.py +15 -0
- mindspore/ops/_grad_experimental/grad_sparse_ops.py +3 -3
- mindspore/ops/_op_impl/_custom_op/dsd_back_impl.py +1 -1
- mindspore/ops/_op_impl/aicpu/__init__.py +14 -2
- mindspore/ops/_op_impl/aicpu/add.py +3 -3
- mindspore/ops/_op_impl/aicpu/bias_add_grad.py +0 -1
- mindspore/ops/_op_impl/aicpu/count_nonzero.py +43 -0
- mindspore/ops/_op_impl/{_custom_op/flash_attention/constants.py → aicpu/eps.py} +18 -27
- mindspore/ops/_op_impl/aicpu/gamma.py +2 -2
- mindspore/ops/_op_impl/aicpu/linear_sum_assignment.py +21 -2
- mindspore/ops/_op_impl/aicpu/log_uniform_candidate_sampler.py +6 -3
- mindspore/ops/_op_impl/aicpu/lu_unpack_grad.py +0 -1
- mindspore/ops/_op_impl/aicpu/multinomial.py +3 -3
- mindspore/ops/_op_impl/aicpu/parameterized_truncated_normal.py +15 -7
- mindspore/ops/_op_impl/aicpu/random_categorical.py +39 -19
- mindspore/ops/_op_impl/aicpu/random_choice_with_mask.py +5 -2
- mindspore/ops/_op_impl/aicpu/random_poisson.py +103 -52
- mindspore/ops/_op_impl/aicpu/random_shuffle.py +17 -15
- mindspore/ops/_op_impl/aicpu/{sparseaddmm.py → sparse_addmm.py} +2 -2
- mindspore/ops/_op_impl/aicpu/{sparsesparsemaximum.py → sparse_sparse_maximum.py} +4 -4
- mindspore/ops/_op_impl/aicpu/standard_laplace.py +5 -5
- mindspore/ops/_op_impl/aicpu/standard_normal.py +5 -5
- mindspore/ops/_op_impl/aicpu/truncated_normal.py +9 -7
- mindspore/ops/_op_impl/aicpu/uniform.py +5 -3
- mindspore/ops/_op_impl/aicpu/uniform_candidate_sampler.py +8 -4
- mindspore/ops/_op_impl/aicpu/uniform_int.py +5 -5
- mindspore/ops/_op_impl/aicpu/uniform_real.py +4 -4
- mindspore/ops/_op_impl/tbe/__init__.py +4 -4
- mindspore/ops/_op_impl/tbe/inplace_index_add.py +7 -3
- mindspore/ops/_op_impl/tbe/trans_data_ds.py +2 -0
- mindspore/ops/_primitive_cache.py +1 -1
- mindspore/ops/_tracefunc.py +45 -13
- mindspore/ops/_utils/utils.py +6 -1
- mindspore/ops/_vmap/vmap_array_ops.py +3 -3
- mindspore/ops/_vmap/vmap_base.py +3 -3
- mindspore/ops/_vmap/vmap_convolution_ops.py +1 -1
- mindspore/ops/_vmap/vmap_grad_math_ops.py +6 -4
- mindspore/ops/_vmap/vmap_math_ops.py +5 -2
- mindspore/ops/_vmap/vmap_nn_ops.py +61 -7
- mindspore/ops/arg_dtype_cast.py +54 -0
- mindspore/ops/composite/base.py +37 -10
- mindspore/ops/composite/math_ops.py +5 -4
- mindspore/ops/composite/multitype_ops/_compile_utils.py +275 -73
- mindspore/ops/composite/multitype_ops/_constexpr_utils.py +16 -9
- mindspore/ops/composite/multitype_ops/add_impl.py +43 -4
- mindspore/ops/composite/multitype_ops/getitem_impl.py +42 -4
- mindspore/ops/composite/multitype_ops/ones_like_impl.py +6 -0
- mindspore/ops/composite/multitype_ops/setitem_impl.py +2 -1
- mindspore/ops/composite/multitype_ops/zeros_like_impl.py +9 -0
- mindspore/ops/deprecated.py +304 -0
- mindspore/ops/function/__init__.py +4 -1
- mindspore/ops/function/array_func.py +174 -193
- mindspore/ops/function/clip_func.py +81 -13
- mindspore/ops/function/debug_func.py +1 -1
- mindspore/ops/function/grad/grad_func.py +18 -9
- mindspore/ops/function/image_func.py +10 -4
- mindspore/ops/function/linalg_func.py +5 -5
- mindspore/ops/function/math_func.py +575 -386
- mindspore/ops/function/nn_func.py +568 -260
- mindspore/ops/function/random_func.py +88 -57
- mindspore/ops/function/sparse_func.py +1 -1
- mindspore/ops/function/sparse_unary_func.py +14 -12
- mindspore/ops/function/vmap_func.py +6 -5
- mindspore/ops/functional.py +15 -10
- mindspore/ops/op_info_register.py +244 -25
- mindspore/ops/operations/__init__.py +31 -19
- mindspore/ops/operations/_grad_ops.py +71 -7
- mindspore/ops/operations/_inner_ops.py +350 -17
- mindspore/ops/operations/_quant_ops.py +4 -8
- mindspore/ops/operations/_sequence_ops.py +42 -0
- mindspore/ops/operations/array_ops.py +68 -282
- mindspore/ops/operations/comm_ops.py +107 -59
- mindspore/ops/operations/custom_ops.py +94 -70
- mindspore/ops/operations/debug_ops.py +8 -4
- mindspore/ops/operations/image_ops.py +18 -12
- mindspore/ops/operations/inner_ops.py +26 -3
- mindspore/ops/operations/math_ops.py +192 -144
- mindspore/ops/operations/nn_ops.py +857 -489
- mindspore/ops/operations/other_ops.py +0 -22
- mindspore/ops/operations/random_ops.py +53 -111
- mindspore/ops/operations/sparse_ops.py +3 -1
- mindspore/ops/primitive.py +24 -18
- mindspore/parallel/_auto_parallel_context.py +68 -8
- mindspore/parallel/_cost_model_context.py +2 -2
- mindspore/parallel/_offload_context.py +17 -3
- mindspore/parallel/_parallel_serialization.py +12 -5
- mindspore/parallel/_ps_context.py +12 -0
- mindspore/parallel/_tensor.py +18 -13
- mindspore/parallel/_transformer/layers.py +5 -3
- mindspore/parallel/_transformer/loss.py +1 -0
- mindspore/parallel/_transformer/moe.py +2 -2
- mindspore/parallel/_transformer/op_parallel_config.py +12 -1
- mindspore/parallel/_transformer/transformer.py +23 -3
- mindspore/parallel/_utils.py +11 -7
- mindspore/parallel/algo_parameter_config.py +85 -5
- mindspore/parallel/checkpoint_transform.py +19 -12
- mindspore/parallel/shard.py +21 -14
- mindspore/pgodb140.dll +0 -0
- mindspore/pgort140.dll +0 -0
- mindspore/profiler/common/struct_type.py +3 -3
- mindspore/profiler/common/util.py +4 -2
- mindspore/profiler/envprofiling.py +1 -1
- mindspore/profiler/parser/aicpu_data_parser.py +5 -3
- mindspore/profiler/parser/ascend_flops_generator.py +2 -2
- mindspore/profiler/parser/ascend_fpbp_generator.py +1 -1
- mindspore/profiler/parser/ascend_hccl_generator.py +249 -12
- mindspore/profiler/parser/ascend_msprof_exporter.py +150 -255
- mindspore/profiler/parser/ascend_msprof_generator.py +204 -17
- mindspore/profiler/parser/ascend_op_generator.py +6 -6
- mindspore/profiler/parser/ascend_steptrace_generator.py +6 -4
- mindspore/profiler/parser/ascend_timeline_generator.py +14 -187
- mindspore/profiler/parser/base_timeline_generator.py +10 -8
- mindspore/profiler/parser/cpu_gpu_timeline_generator.py +16 -12
- mindspore/profiler/parser/flops_parser.py +15 -11
- mindspore/profiler/parser/framework_parser.py +38 -22
- mindspore/profiler/parser/hccl_parser.py +16 -12
- mindspore/profiler/parser/integrator.py +22 -11
- mindspore/profiler/parser/memory_usage_parser.py +2 -2
- mindspore/profiler/parser/minddata_analyzer.py +12 -14
- mindspore/profiler/parser/minddata_pipeline_parser.py +1 -1
- mindspore/profiler/parser/msadvisor_parser.py +8 -4
- mindspore/profiler/parser/op_intermediate_parser.py +5 -2
- mindspore/profiler/parser/optime_parser.py +1 -1
- mindspore/profiler/parser/profiler_info.py +21 -2
- mindspore/profiler/parser/step_trace_parser.py +11 -14
- mindspore/profiler/profiling.py +179 -89
- mindspore/rewrite/api/node.py +102 -19
- mindspore/rewrite/api/node_type.py +5 -1
- mindspore/rewrite/api/pattern_engine.py +1 -1
- mindspore/rewrite/api/scoped_value.py +9 -17
- mindspore/rewrite/api/symbol_tree.py +131 -47
- mindspore/rewrite/ast_helpers/__init__.py +2 -1
- mindspore/rewrite/ast_helpers/ast_finder.py +129 -0
- mindspore/rewrite/ast_helpers/ast_modifier.py +116 -104
- mindspore/rewrite/ast_transformers/flatten_recursive_stmt.py +93 -46
- mindspore/rewrite/common/rewrite_elog.py +5 -1
- mindspore/rewrite/namer.py +33 -24
- mindspore/rewrite/namespace.py +14 -5
- mindspore/{_extends/graph_kernel/expanders/complex → rewrite/node}/__init__.py +9 -9
- mindspore/rewrite/node/call_function.py +79 -0
- mindspore/rewrite/node/cell_container.py +135 -0
- mindspore/rewrite/node/control_flow.py +88 -0
- mindspore/rewrite/{node.py → node/node.py} +273 -234
- mindspore/rewrite/node/node_manager.py +254 -0
- mindspore/rewrite/{topological_manager.py → node/node_topological_manager.py} +13 -46
- mindspore/rewrite/parsers/arguments_parser.py +22 -21
- mindspore/rewrite/parsers/assign_parser.py +216 -221
- mindspore/rewrite/parsers/attribute_parser.py +9 -7
- mindspore/rewrite/parsers/class_def_parser.py +174 -113
- mindspore/rewrite/parsers/constant_parser.py +9 -6
- mindspore/rewrite/parsers/container_parser.py +9 -7
- mindspore/rewrite/parsers/for_parser.py +42 -21
- mindspore/rewrite/parsers/function_def_parser.py +24 -16
- mindspore/rewrite/parsers/if_parser.py +28 -24
- mindspore/rewrite/parsers/module_parser.py +196 -25
- mindspore/rewrite/{parser.py → parsers/parser.py} +4 -2
- mindspore/rewrite/{parser_register.py → parsers/parser_register.py} +1 -1
- mindspore/rewrite/parsers/return_parser.py +6 -6
- mindspore/rewrite/sparsify/sparse_transformer.py +12 -3
- mindspore/rewrite/sparsify/utils.py +1 -1
- mindspore/rewrite/symbol_tree.py +523 -578
- mindspore/rewrite/symbol_tree_builder.py +9 -193
- mindspore/rewrite/symbol_tree_dumper.py +2 -2
- mindspore/run_check/_check_version.py +6 -4
- mindspore/{ops/bprop_mindir → safeguard}/__init__.py +4 -3
- mindspore/safeguard/rewrite_obfuscation.py +541 -0
- mindspore/tbbmalloc.dll +0 -0
- mindspore/tinyxml2.dll +0 -0
- mindspore/train/_utils.py +7 -3
- mindspore/train/amp.py +323 -123
- mindspore/train/anf_ir_pb2.py +14 -2
- mindspore/train/callback/_backup_and_restore.py +2 -12
- mindspore/train/callback/_callback.py +29 -4
- mindspore/train/callback/_checkpoint.py +23 -8
- mindspore/train/callback/_early_stop.py +2 -2
- mindspore/train/callback/_landscape.py +4 -4
- mindspore/train/callback/_loss_monitor.py +2 -2
- mindspore/train/callback/_on_request_exit.py +2 -2
- mindspore/train/callback/_reduce_lr_on_plateau.py +3 -4
- mindspore/train/callback/_summary_collector.py +15 -8
- mindspore/train/callback/_time_monitor.py +58 -5
- mindspore/train/data_sink.py +5 -11
- mindspore/train/dataset_helper.py +84 -57
- mindspore/train/loss_scale_manager.py +2 -2
- mindspore/train/metrics/__init__.py +3 -3
- mindspore/train/metrics/cosine_similarity.py +1 -1
- mindspore/train/metrics/hausdorff_distance.py +3 -2
- mindspore/train/metrics/mean_surface_distance.py +3 -2
- mindspore/train/metrics/metric.py +39 -19
- mindspore/train/metrics/roc.py +2 -2
- mindspore/train/metrics/root_mean_square_surface_distance.py +4 -3
- mindspore/train/mind_ir_pb2.py +85 -36
- mindspore/train/model.py +187 -47
- mindspore/train/serialization.py +487 -161
- mindspore/train/summary/_summary_adapter.py +1 -1
- mindspore/train/summary/_writer_pool.py +3 -2
- mindspore/train/summary/summary_record.py +37 -17
- mindspore/train/train_thor/convert_utils.py +3 -3
- mindspore/train/train_thor/dataset_helper.py +1 -1
- mindspore/turbojpeg.dll +0 -0
- mindspore/vcmeta.dll +0 -0
- mindspore/vcruntime140.dll +0 -0
- mindspore/vcruntime140_1.dll +0 -0
- mindspore/version.py +1 -1
- {mindspore-2.1.0.dist-info → mindspore-2.2.11.dist-info}/METADATA +7 -4
- {mindspore-2.1.0.dist-info → mindspore-2.2.11.dist-info}/RECORD +429 -486
- mindspore/_extends/graph_kernel/expander.py +0 -80
- mindspore/_extends/graph_kernel/expanders/__init__.py +0 -54
- mindspore/_extends/graph_kernel/expanders/_utils.py +0 -269
- mindspore/_extends/graph_kernel/expanders/addn.py +0 -33
- mindspore/_extends/graph_kernel/expanders/batchnorm.py +0 -152
- mindspore/_extends/graph_kernel/expanders/batchnorm_grad.py +0 -105
- mindspore/_extends/graph_kernel/expanders/clip_by_norm_no_div_sum.py +0 -33
- mindspore/_extends/graph_kernel/expanders/complex/abs.py +0 -30
- mindspore/_extends/graph_kernel/expanders/complex/add.py +0 -44
- mindspore/_extends/graph_kernel/expanders/complex/div.py +0 -62
- mindspore/_extends/graph_kernel/expanders/complex/mul.py +0 -52
- mindspore/_extends/graph_kernel/expanders/complex/real_div.py +0 -62
- mindspore/_extends/graph_kernel/expanders/complex/sub.py +0 -45
- mindspore/_extends/graph_kernel/expanders/conv2d.py +0 -200
- mindspore/_extends/graph_kernel/expanders/dropout_grad.py +0 -30
- mindspore/_extends/graph_kernel/expanders/equal_count.py +0 -50
- mindspore/_extends/graph_kernel/expanders/erfc.py +0 -35
- mindspore/_extends/graph_kernel/expanders/expand_dims.py +0 -50
- mindspore/_extends/graph_kernel/expanders/fused_adam.py +0 -44
- mindspore/_extends/graph_kernel/expanders/fused_adam_weight_decay.py +0 -47
- mindspore/_extends/graph_kernel/expanders/fused_mul_add.py +0 -28
- mindspore/_extends/graph_kernel/expanders/gelu_grad.py +0 -70
- mindspore/_extends/graph_kernel/expanders/gkdropout.py +0 -40
- mindspore/_extends/graph_kernel/expanders/identity.py +0 -25
- mindspore/_extends/graph_kernel/expanders/layernorm.py +0 -93
- mindspore/_extends/graph_kernel/expanders/layernorm_grad.py +0 -113
- mindspore/_extends/graph_kernel/expanders/logsoftmax.py +0 -46
- mindspore/_extends/graph_kernel/expanders/logsoftmax_grad.py +0 -36
- mindspore/_extends/graph_kernel/expanders/matmul.py +0 -80
- mindspore/_extends/graph_kernel/expanders/maximum_grad.py +0 -59
- mindspore/_extends/graph_kernel/expanders/minimum_grad.py +0 -80
- mindspore/_extends/graph_kernel/expanders/oneslike.py +0 -26
- mindspore/_extends/graph_kernel/expanders/reduce_mean.py +0 -43
- mindspore/_extends/graph_kernel/expanders/relu_grad.py +0 -32
- mindspore/_extends/graph_kernel/expanders/sigmoid_cross_entropy_with_logits.py +0 -41
- mindspore/_extends/graph_kernel/expanders/sigmoid_cross_entropy_with_logits_grad.py +0 -35
- mindspore/_extends/graph_kernel/expanders/sigmoid_grad.py +0 -31
- mindspore/_extends/graph_kernel/expanders/slice.py +0 -35
- mindspore/_extends/graph_kernel/expanders/softmax_cross_entropy_with_logits.py +0 -42
- mindspore/_extends/graph_kernel/expanders/softmax_grad_ext.py +0 -41
- mindspore/_extends/graph_kernel/expanders/softsign.py +0 -28
- mindspore/_extends/graph_kernel/expanders/sqrt_grad.py +0 -29
- mindspore/_extends/graph_kernel/expanders/square_sum_all.py +0 -44
- mindspore/_extends/graph_kernel/expanders/square_sum_v1.py +0 -37
- mindspore/_extends/graph_kernel/expanders/squared_difference.py +0 -43
- mindspore/_extends/graph_kernel/expanders/tanh_grad.py +0 -31
- mindspore/_extends/graph_kernel/model/op_infer.py +0 -506
- mindspore/dataset/datapreprocess/__init__.py +0 -20
- mindspore/dataset/datapreprocess/preprocess_imagenet_validate_dataset.py +0 -54
- mindspore/include/api/net.h +0 -142
- mindspore/nn/lr_scheduler.py +0 -262
- mindspore/ops/_grad_experimental/grad_image_ops.py +0 -248
- mindspore/ops/_grad_experimental/grad_linalg_ops.py +0 -181
- mindspore/ops/_grad_experimental/grad_other_ops.py +0 -72
- mindspore/ops/_grad_experimental/grad_scalar_ops.py +0 -112
- mindspore/ops/_grad_experimental/grad_sequence_ops.py +0 -351
- mindspore/ops/_op_impl/_custom_op/flash_attention/__init__.py +0 -0
- mindspore/ops/_op_impl/_custom_op/flash_attention/attention.py +0 -350
- mindspore/ops/_op_impl/_custom_op/flash_attention/flash_attention_bwd.py +0 -409
- mindspore/ops/_op_impl/_custom_op/flash_attention/flash_attention_fwd.py +0 -578
- mindspore/ops/_op_impl/_custom_op/flash_attention/flash_attention_impl.py +0 -199
- mindspore/ops/_op_impl/_custom_op/flash_attention/tik_ops_utils.py +0 -446
- mindspore/ops/_op_impl/_custom_op/flash_attention/tiling_strategy/__init__.py +0 -0
- mindspore/ops/_op_impl/_custom_op/flash_attention/tiling_strategy/sparse_tiling.py +0 -45
- mindspore/ops/_op_impl/_custom_op/flash_attention/tiling_strategy/strategy.py +0 -67
- mindspore/ops/_op_impl/_custom_op/flash_attention/tiling_strategy/wukong_tiling.py +0 -62
- mindspore/ops/bprop_mindir/BNTrainingReduce_bprop.mindir +0 -0
- mindspore/ops/bprop_mindir/Broadcast_bprop.mindir +0 -0
- mindspore/ops/bprop_mindir/Depend_bprop.mindir +0 -0
- mindspore/ops/bprop_mindir/DepthwiseConv2dNative_bprop.mindir +0 -138
- mindspore/ops/bprop_mindir/EmbeddingLookup_bprop.mindir +0 -0
- mindspore/ops/bprop_mindir/Load_bprop.mindir +0 -0
- mindspore/ops/bprop_mindir/ScatterNonAliasingAdd_bprop.mindir +0 -0
- mindspore/ops/bprop_mindir/SparseGatherV2_bprop.mindir +0 -0
- mindspore/ops/bprop_mindir/SparseSoftmaxCrossEntropyWithLogits_bprop.mindir +0 -0
- mindspore/ops/bprop_mindir/Switch_bprop.mindir +0 -0
- mindspore/ops/bprop_mindir/TransShape_bprop.mindir +0 -0
- mindspore/ops/bprop_mindir/TupleGetItem_bprop.mindir +0 -0
- mindspore/ops/bprop_mindir/Unique_bprop.mindir +0 -0
- mindspore/ops/bprop_mindir/Unstack_bprop.mindir +0 -0
- mindspore/ops/bprop_mindir/generate_mindir.py +0 -114
- mindspore/rewrite/node_visitor.py +0 -44
- {mindspore-2.1.0.dist-info → mindspore-2.2.11.dist-info}/WHEEL +0 -0
- {mindspore-2.1.0.dist-info → mindspore-2.2.11.dist-info}/entry_points.txt +0 -0
- {mindspore-2.1.0.dist-info → mindspore-2.2.11.dist-info}/top_level.txt +0 -0
--- mindspore/ops/function/nn_func.py (2.1.0)
+++ mindspore/ops/function/nn_func.py (2.2.11)
@@ -27,7 +27,7 @@ from mindspore.ops.operations import nn_ops as NN_OPS
 from mindspore.ops.operations import _sequence_ops as seq
 import mindspore.common.dtype as mstype
 from mindspore.ops.function.math_func import logsumexp
-from mindspore.ops.function.random_func import _get_seed
+from mindspore.ops.function.random_func import _get_seed, _set_prim_op_user_data
 from mindspore.common.tensor import Tensor
 from mindspore._c_expression import Tensor as Tensor_
 from mindspore.ops._primitive_cache import _get_cache_prim
@@ -40,6 +40,7 @@ from mindspore.ops.operations.nn_ops import ChannelShuffle
 from mindspore.ops.operations.nn_ops import TripletMarginLoss
 from mindspore.ops.operations._inner_ops import SiLU
 from mindspore.ops.operations._sequence_ops import TupleToTensor, TensorToTuple, ListToTensor
+from mindspore.common.api import _function_forbid_reuse

 slice_ = P.Slice()
 fast_gelu_ = P.FastGeLU()
@@ -232,7 +233,7 @@ def adaptive_avg_pool3d(input, output_size):
 def _check_avgpool_1d_type_and_int(kernel_size, stride, ceil_mode, count_include_pad):
     """Checks the type of avgpool1d input"""
     validator.check_value_type('kernel_size', kernel_size, [int], 'avg_pool1d')
-    validator.check_value_type('stride', stride,
+    validator.check_value_type('stride', stride, (int, tuple), 'avg_pool1d')
     validator.check_value_type('ceil_mode', ceil_mode, bool, 'avg_pool1d')
     validator.check_value_type('count_include_pad', count_include_pad, bool, 'avg_pool1d')
     validator.check_int(kernel_size, 1, validator.GE, "kernel_size", 'avg_pool1d')
@@ -263,12 +264,10 @@ def avg_pool1d(input_x, kernel_size=1, stride=1, padding=0, ceil_mode=False, cou
     Args:
         input_x (Tensor): Tensor of shape :math:`(N, C_{in}, L_{in})`.
         kernel_size (int): The size of kernel window used to take the average value. Default: ``1`` .
-        stride (Union(int, tuple[int])): The distance of kernel moving
-
-
-
-            and right are the same, equal to pad. If `padding` is a tuple of `2` integers, the padding of left and right
-            equal to `padding[0]` and `padding[1]` correspondingly. Default: ``0`` .
+        stride (Union(int, tuple[int])): The distance of kernel moving. `stride` can either be an int
+            number or a tuple of one int number. Default: ``1`` .
+        padding (Union(int, tuple[int])): The pad value to be filled. `padding` can either be an integer
+            or a tuple of one integer. Default: ``0`` .
         ceil_mode (bool): If True, apply ceil instead of floor to compute the output shape. Default: ``False``.
         count_include_pad (bool): If True, include the zero-padding in the averaging calculation. Default: ``True`` .

@@ -300,20 +299,25 @@ def avg_pool1d(input_x, kernel_size=1, stride=1, padding=0, ceil_mode=False, cou
         raise TypeError("For avg_pool1d, the input input_x must be tensor")

     if len(input_x.shape) != 3:
-        raise ValueError("For avg_pool1d, input must have 3 dim, but got {
+        raise ValueError(f"For avg_pool1d, input must have 3 dim, but got {len(input_x.shape)}.")

     _check_avgpool_1d_type_and_int(kernel_size, stride, ceil_mode, count_include_pad)
     if isinstance(padding, int):
         check_non_negative_int(padding, 'padding', 'avg_pool1d')
         padding = (0, 0, 0, 0, padding, padding)
     elif isinstance(padding, tuple):
-        if len(padding) !=
-            raise ValueError("For avg_pool1d, padding should be int or tuple of length
+        if len(padding) != 1:
+            raise ValueError("For avg_pool1d, padding should be int or tuple of length 1.")
         for item in padding:
             check_non_negative_int(item, 'padding', 'avg_pool1d')
-        padding = (0, 0, 0, 0, padding[0], padding[
+        padding = (0, 0, 0, 0, padding[0], padding[0])
     else:
-        raise TypeError("For avg_pool1d, padding should be int or tuple of length
+        raise TypeError("For avg_pool1d, padding should be int or tuple of length 1.")
+
+    if isinstance(stride, tuple):
+        if len(stride) != 1:
+            raise ValueError("For avg_pool1d, stride should be int or tuple of length 1.")
+        stride = stride[0]

     expand_op = _get_cache_prim(P.ExpandDims)()
     squeeze_op = _get_cache_prim(P.Squeeze)((2, 3))
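The new block above normalizes a 1-tuple `stride` to a plain int before the pooling primitives are built. A minimal usage sketch of the relaxed signature (input values are illustrative, not taken from the package):

    import numpy as np
    import mindspore as ms
    from mindspore import ops

    x = ms.Tensor(np.random.randn(1, 3, 6), ms.float32)
    # After this change, both calls are accepted and equivalent:
    out_int = ops.avg_pool1d(x, kernel_size=2, stride=2)
    out_tup = ops.avg_pool1d(x, kernel_size=2, stride=(2,))
    print(out_int.shape)  # (1, 3, 3)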
@@ -419,7 +423,7 @@ def avg_pool2d(input_x, kernel_size=1, stride=1, padding=0, ceil_mode=False, cou
         ceil_mode (bool): If True, apply ceil instead of floor to compute the output shape. Default: ``False``.
         count_include_pad (bool): If True, include the zero-padding in the averaging calculation. Default: ``True`` .
         divisor_override (int): If specified, it will be used as divisor in the averaging calculation, otherwise
-            `kernel_size` will be used. Default: ``0
+            `kernel_size` will be used. Default: ``0``, which means not specified.

     Returns:
         Tensor, with shape :math:`(N, C_{out}, H_{out}, W_{out})`.
@@ -456,7 +460,7 @@ def avg_pool2d(input_x, kernel_size=1, stride=1, padding=0, ceil_mode=False, cou
         raise TypeError("For avg_pool2d, the input input_x must be tensor")

     if len(input_x.shape) != 4:
-        raise ValueError("For avg_pool2d, input must have 4 dim, but got {
+        raise ValueError(f"For avg_pool2d, input must have 4 dim, but got {len(input_x.shape)}.")

     kernel_size = _check_avgpool_2d_kernel_size(kernel_size)
     stride = _check_avgpool_2d_stride(stride)
@@ -527,7 +531,7 @@ def avg_pool3d(input_x, kernel_size=1, stride=1, padding=0, ceil_mode=False, cou
         count_include_pad (bool, optional): If ``True`` , averaging calculation
             will include the zero-padding. Default: ``True`` .
         divisor_override (int, optional): If specified, it will be used as divisor in the averaging calculation,
-            otherwise `kernel_size` will be used. Default: ``0`` .
+            otherwise `kernel_size` will be used. Default: ``0`` , which means not specified.

     Returns:
         Tensor, with shape :math:`(N, C, D_{out}, H_{out}, W_{out})`. Has the same data type with `input_x`.
@@ -560,7 +564,7 @@ def avg_pool3d(input_x, kernel_size=1, stride=1, padding=0, ceil_mode=False, cou
         raise TypeError("For avg_pool3d, the input input_x must be tensor")

     if len(input_x.shape) != 5:
-        raise ValueError("For avg_pool3d, input must have 5 dim, but got {
+        raise ValueError(f"For avg_pool3d, input must have 5 dim, but got {len(input_x.shape)}.")

     _check_avg_pool3d_padding(padding)

@@ -637,21 +641,21 @@ def adaptive_max_pool1d(input, output_size):
     x_dtype = _get_cache_prim(P.DType)()(input)

     if len(x_in_shape) != 3:
-        raise ValueError("For adaptive_max_pool1d input must have 3 dim, but got {
+        raise ValueError(f"For adaptive_max_pool1d input must have 3 dim, but got {len(x_in_shape)}.")
     if x_in_shape[2] < output_size:
-        raise ValueError("For adaptive_max_pool1d input's last dimension must be greater or equal to "
-                         "output size {}, but got {
+        raise ValueError(f"For adaptive_max_pool1d input's last dimension must be greater or equal to "
+                         f"output size {output_size}, but got {x_in_shape[2]}.")
     if x_in_shape[2] % output_size != 0:
-        raise ValueError("For adaptive_max_pool1d input's last dimension must be divisible by "
-                         "output size {}, but got {
+        raise ValueError(f"For adaptive_max_pool1d input's last dimension must be divisible by "
+                         f"output size {output_size}, but got {x_in_shape[2]}.")
     if is_ascend_backend():
         if x_dtype not in [mstype.float16]:
-            raise TypeError("For adaptive_max_pool1d in Ascend platform, the input dtype must be float16, "
-                            "but got {}."
+            raise TypeError(f"For adaptive_max_pool1d in Ascend platform, the input dtype must be float16, "
+                            f"but got {x_dtype}.")
     else:
         if x_dtype not in [mstype.float16, mstype.float32]:
-            raise TypeError("For adaptive_max_pool1d, the input dtype must be float16 or float32, "
-                            "but got {}."
+            raise TypeError(f"For adaptive_max_pool1d, the input dtype must be float16 or float32, "
+                            f"but got {x_dtype}.")

     expand_ = _get_cache_prim(P.ExpandDims)()
     squeeze_ = _get_cache_prim(P.Squeeze)(2)
@@ -1147,7 +1151,7 @@ def max_unpool3d(x, indices, kernel_size, stride=None, padding=0, output_size=No
     return out


-def binary_cross_entropy_with_logits(logits, label, weight, pos_weight, reduction='mean'):
+def binary_cross_entropy_with_logits(logits, label, weight=None, pos_weight=None, reduction='mean'):
     r"""
     Adds sigmoid activation function to input `logits`, and uses the given logits to compute binary cross entropy
     between the logits and the label.
@@ -1177,7 +1181,7 @@ def binary_cross_entropy_with_logits(logits, label, weight, pos_weight, reductio

     This operator will multiply the output by the corresponding weight.
     The tensor :math:`weight` assigns different weights to each piece of data in the batch,
-    and the tensor :math:`
+    and the tensor :math:`pos\_weight` adds corresponding weights to the positive examples of each category.

     In addition, it can trade off recall and precision by adding weights to positive examples.
     In the case of multi-label classification the loss can be described as:
@@ -1196,17 +1200,21 @@ def binary_cross_entropy_with_logits(logits, label, weight, pos_weight, reductio
         logits (Tensor): Input logits. Data type must be float16 or float32.
         label (Tensor): Ground truth label, has the same shape as `logits`.
             Data type must be float16 or float32.
-        weight (Tensor): A rescaling weight applied to the loss of each batch element. It can be
+        weight (Tensor, optional): A rescaling weight applied to the loss of each batch element. It can be
             broadcast to a tensor with shape of `logits`. Data type must be float16 or float32.
-
+            Default: ``None``, `weight` is a Tensor whose value is ``1``.
+        pos_weight (Tensor, optional): A weight of positive examples. Must be a vector with length equal to the
             number of classes. It can be broadcast to a tensor with shape of `logits`.
-            Data type must be float16 or float32.
-        reduction (str):
-
-
+            Data type must be float16 or float32. Default: ``None``, `pos_weight` is a Tensor whose value is ``1``.
+        reduction (str, optional): Apply specific reduction method to the output: ``'none'`` , ``'mean'`` ,
+            ``'sum'`` . Default: ``'mean'`` .
+
+            - ``'none'``: no reduction will be applied.
+            - ``'mean'``: compute and return the weighted mean of elements in the output.
+            - ``'sum'``: the output elements will be summed.

     Returns:
-        Tensor or Scalar, if `reduction` is 'none'
+        Tensor or Scalar, if `reduction` is ``'none'``, it's a tensor with the same shape and type as input `logits`.
         Otherwise, the output is a scalar.
@@ -1214,7 +1222,7 @@ def binary_cross_entropy_with_logits(logits, label, weight, pos_weight, reductio
         TypeError: If data type of input `logits`, `label`, `weight`, `pos_weight` is neither float16 nor float32.
         TypeError: If data type of input `reduction` is not string.
         ValueError: If `weight` or `pos_weight` can not be broadcast to a tensor with shape of `logits`.
-        ValueError: If `reduction` is not one of 'none'
+        ValueError: If `reduction` is not one of ``'none'``, ``'mean'`` or ``'sum'``.

     Supported Platforms:
         ``Ascend`` ``GPU`` ``CPU``
@@ -1232,10 +1240,15 @@ def binary_cross_entropy_with_logits(logits, label, weight, pos_weight, reductio
         0.3463612
     """

+    if weight is None:
+        weight = ops.ones_like(logits)
+    if pos_weight is None:
+        pos_weight = ops.ones_like(logits)
     bce_with_logits_loss_op = _get_cache_prim(NN_OPS.BCEWithLogitsLoss)(reduction)
     return bce_with_logits_loss_op(logits, label, weight, pos_weight)


+@_function_forbid_reuse
 def dropout(input, p=0.5, training=True, seed=None):
     r"""
     During training, randomly zeroes some of the elements of the input tensor
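With `weight` and `pos_weight` now defaulting to ``None`` and filled in with `ops.ones_like(logits)`, the loss can be called with logits and labels alone. A sketch of the relaxed calling convention (values illustrative):

    import numpy as np
    import mindspore as ms
    from mindspore import ops

    logits = ms.Tensor(np.array([[-0.8, 1.2, 0.7], [-0.1, -0.4, 0.7]]), ms.float32)
    label = ms.Tensor(np.array([[0., 1., 1.], [0., 0., 1.]]), ms.float32)
    # 2.1.0 required explicit weight and pos_weight tensors; 2.2.11 defaults both to ones.
    loss = ops.binary_cross_entropy_with_logits(logits, label)
    print(loss)  # a scalar, since reduction defaults to 'mean'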
@@ -1275,7 +1288,9 @@ def dropout(input, p=0.5, training=True, seed=None):
         return input
     keep_prob = 1 - p
     seed0, seed1 = _get_seed(seed, "dropout")
-
+    dropout_op = P.Dropout(keep_prob=keep_prob, Seed0=seed0, Seed1=seed1)
+    dropout_op = _set_prim_op_user_data(dropout_op, "random_cache", False)
+    out, _ = dropout_op(input)
     return out

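The rewritten body constructs the `P.Dropout` primitive explicitly so it can be tagged with the ``"random_cache"`` user-data flag; together with the `@_function_forbid_reuse` decorator added earlier, this plausibly keeps the stateful random primitive from being cached and replayed across calls. The public call is unchanged; a quick sketch:

    import numpy as np
    import mindspore as ms
    from mindspore import ops

    x = ms.Tensor(np.ones((2, 4)), ms.float32)
    out = ops.dropout(x, p=0.5, seed=1)  # kept elements are scaled by 1/(1-p)
    print(out.shape)  # (2, 4)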
@@ -1820,7 +1835,7 @@ def kl_div(logits, labels, reduction='mean'):
         Its value must be one of ``'none'`` , ``'mean'`` , ``'batchmean'`` or ``'sum'`` . Default: ``'mean'`` .

     Returns:
-        Tensor or Scalar, if `reduction` is 'none'
+        Tensor or Scalar, if `reduction` is ``'none'``, then output is a tensor and has the same shape as `logits`.
         Otherwise, it is a scalar.

     Raises:
@@ -2220,7 +2235,9 @@ def interpolate(input,
             One and only one of size and scale_factor can be set to None. Default: ``None`` .
         mode (str): The sampling algorithm.
             One of 'nearest', 'linear' (3D only), 'bilinear' (4D only), 'trilinear' (5D only), 'bicubic' (4D only),
-            'area', 'nearest-exact'(
+            'area', 'nearest-exact'(matches Scikit-Image and PIL nearest neighbours interpolation algorithms and fixes
+            knows issues with `nearest`, 3D and 4D). Default: ``"nearest"`` .
+
         align_corners (bool): If True, rescale input by :math:`(new\_height - 1) / (height - 1)`, which exactly
             aligns the corners of data and resized data. If False, rescale by :math:`new\_height / height`.
             Default: ``None`` .
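The expanded `mode` description documents ``'nearest-exact'`` for 3D and 4D inputs. A hedged sketch of a call (assumes the `size`/`mode` keywords of the 2.2 `ops.interpolate` signature shown in this file; not verified against every backend):

    import numpy as np
    import mindspore as ms
    from mindspore import ops

    x = ms.Tensor(np.arange(4, dtype=np.float32).reshape(1, 1, 4))
    out = ops.interpolate(x, size=8, mode='nearest-exact')
    print(out.shape)  # (1, 1, 8)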
@@ -2568,10 +2585,12 @@ def soft_margin_loss(input, target, reduction='mean'):
     Args:
         input (Tensor): Predict data. Data type must be float16 or float32.
         target (Tensor): Ground truth data, with the same type and shape as `logits`.
-        reduction (str, optional):
-
-
-
+        reduction (str, optional): Apply specific reduction method to the output: ``'none'`` , ``'mean'`` ,
+            ``'sum'`` . Default: ``'mean'`` .
+
+            - ``'none'``: no reduction will be applied.
+            - ``'mean'``: compute and return the mean of elements in the output.
+            - ``'sum'``: the output elements will be summed.

     Outputs:
         Tensor or Scalar. If `reduction` is ``'none'``, its shape is the same as `logits`.
@@ -2751,6 +2770,55 @@ def soft_shrink(input, lambd=0.5):
     return soft_shrink_op(input)


+def softplus(input, beta=1, threshold=20):  # pylint:disable=redefined-outer-name
+    r"""
+    Applies softplus function to `input` element-wise.
+
+    The softplus function is shown as follows, x is the element of `input` :
+
+    .. math::
+
+        \text{output} = \frac{1}{beta}\log(1 + \exp(\text{beta * x}))
+
+    When :math:`input * beta > threshold`, the implementation converts to the linear function
+    to ensure numerical stability.
+
+    Args:
+        input (Tensor) - Tensor of any dimension.
+            Supported dtypes:
+
+            - GPU/CPU: float16, float32, float64.
+            - Ascend: float16, float32.
+
+        beta (int, optional) - The :math:`\beta` value in softplus function. Default: ``1`` .
+        threshold (int, optional) - When :math:`input * beta > threshold`, converting softplus to a linear function.
+            Default: ``20`` .
+
+    Returns:
+        Tensor, with the same type and shape as the `input` .
+
+    Raises:
+        TypeError: If `input` is not a Tensor.
+        TypeError: If the dtype of `input` is not float16, float32 or float64.
+
+    Supported Platforms:
+        ``Ascend`` ``GPU`` ``CPU``
+
+    Examples:
+        >>> import mindspore
+        >>> import numpy as np
+        >>> from mindspore import Tensor, ops
+        >>> input = Tensor(np.array([0.1, 0.2, 30, 25]), mindspore.float32)
+        >>> output = ops.softplus(input)
+        >>> print(output)
+        [0.7443967 0.79813886 30. 25.]
+    """
+    softplus_op = _get_cache_prim(P.Softplus)()
+    scaling_input = beta * input
+    op_output = (1 / beta) * softplus_op(scaling_input)
+    return ops.select(input * beta > threshold, input, op_output)
+
+
 def silu(x):
     r"""
     Computes Sigmoid Linear Unit of input element-wise. The SiLU function is defined as:
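The `ops.select` fallback in the new `softplus` is the numerical-stability device its docstring mentions: `exp(beta * x)` overflows for large inputs, while softplus(x) is within rounding error of `x` there. A numpy-only sketch of the failure mode the guard avoids:

    import numpy as np

    x = np.float32(100.0)
    print(np.log1p(np.exp(x)))               # inf: exp(100) overflows float32
    print(np.log1p(np.exp(np.float64(x))))   # ~100.0, so returning x itself is safe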
@@ -2860,7 +2928,7 @@ def sigmoid(input):
     >>> print(output)
     [0.7310586 0.880797 0.95257413 0.98201376 0.9933072 ]
     """
-    return
+    return _get_cache_prim(NN_OPS.Sigmoid)()(input)


 def logsigmoid(x):
@@ -2946,11 +3014,19 @@ def dense(input, weight, bias=None):
         _check_is_tensor("bias", bias, "dense")
     weight = ops.t(weight)
     input = ops.matmul(input, weight)
+    input_shape = input.shape
     if bias is not None:
         input = input + bias
+        _check_dense_add_bias_shape(input_shape, input.shape, bias.shape)
     return input


+def _check_dense_add_bias_shape(input_shape, output_shape, bias_shape):
+    """Check that the output has the correct shape after adding bias."""
+    if input_shape != output_shape:
+        raise ValueError(f"For dense, the bias shape {bias_shape} does not match the input shape {input_shape}.")
+
+
 @_primexpr
 def check_dense_inputs_same_shape(input1_shape, input2_shape, prim_name=None):
     """check bidense input Tensors' shape"""
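`dense` computes `matmul(input, weight.T) + bias`; the new helper rejects a `bias` whose broadcasting would silently enlarge the result. A sketch of the accepted and rejected cases (shapes illustrative):

    import numpy as np
    import mindspore as ms
    from mindspore import ops

    x = ms.Tensor(np.random.randn(4, 8), ms.float32)
    w = ms.Tensor(np.random.randn(3, 8), ms.float32)  # (out_features, in_features)
    y = ops.dense(x, w, ms.Tensor(np.zeros(3), ms.float32))
    print(y.shape)  # (4, 3)
    # A bias of shape (2, 4, 3) would broadcast the (4, 3) product up to (2, 4, 3);
    # the added check raises ValueError instead of returning the inflated result.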
@@ -2965,7 +3041,10 @@ def bidense(input1, input2, weight, bias=None):
     Applies bilinear dense connected layer for `input1` and `input2`. The bilinear dense function is defined as:

     .. math::
-        output =
+        output = x_{1}^{T}Ax_{2} + b
+
+    :math:`x_{1}` represents `input1` , :math:`x_{2}` represents `input2` , :math:`A` represents `weight` ,
+    :math:`b` represents `bias` .

     .. warning::
         This is an experimental API that is subject to change or deletion.
@@ -3391,7 +3470,9 @@ def relu6(x):
     It returns :math:`\min(\max(0,x), 6)` element-wise.

     Args:
-        x (Tensor):
+        x (Tensor): Tensor of shape :math:`(N, *)`,
+            where :math:`*` means any number of additional dimensions.
+            Data type must be float16, float32.

     Returns:
         Tensor, with the same dtype and shape as the `x`.
@@ -3528,6 +3609,9 @@ def rrelu(input, lower=1.0 / 8, upper=1.0 / 3):
     _lower = Tensor(lower, mstype.float32)
     _upper = Tensor(upper, mstype.float32)
     _size = input.shape
+    if ops.is_sequence_value_unknown(_size):
+        dyn_shape = _get_cache_prim(P.TensorShape)()
+        _size = dyn_shape(input)
     sign_matrix = _get_cache_prim(P.Sign)()(input)
     negative_filter = sign_matrix.clip(None, 0)
     positive_filter = sign_matrix.clip(0, None)
@@ -3615,11 +3699,10 @@ def cross_entropy(input, target, weight=None, ignore_index=-100, reduction='mean
         l_n = - w_{y_n} \log \frac{\exp(x_{n,y_n})}{\sum_{c=1}^C \exp(x_{n,c})}
         \cdot \mathbb{1}\{y_n \not= \text{ignore_index}\}

-    where :math:`x` is the inputs, :math:`y` is the target, :math:`w` is the weight,
-
-    classes.
+    where :math:`x` is the inputs, :math:`y` is the target, :math:`w` is the weight, N is the batch size,
+    :math:`c` belonging to :math:`[0, C-1]` is class index, where :math:`C` is the number of classes.

-    If reduction is not
+    If `reduction` is not ``None`` (default ``'mean'`` ), then

     .. math::

@@ -3638,11 +3721,10 @@ def cross_entropy(input, target, weight=None, ignore_index=-100, reduction='mean
         \ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad
         l_n = - \sum_{c=1}^C w_c \log \frac{\exp(x_{n,c})}{\sum_{i=1}^C \exp(x_{n,i})} y_{n,c}

-    where :math:`x` is the inputs, :math:`y` is the target, :math:`w` is the weight,
-
-    classes.
+    where :math:`x` is the inputs, :math:`y` is the target, :math:`w` is the weight, N is the batch size,
+    :math:`c` belonging to :math:`[0, C-1]` is class index, where :math:`C` is the number of classes.

-    If reduction is not
+    If `reduction` is not ``None`` (default ``'mean'`` ), then

     .. math::

@@ -3658,16 +3740,19 @@ def cross_entropy(input, target, weight=None, ignore_index=-100, reduction='mean
             in case of 2D Loss, or :math:`(N, C, d_1, d_2, ..., d_K)`.
             `input` is expected to be log-probabilities, data type must be float16 or float32.
         target (Tensor): For class indices, tensor of shape :math:`()`, :math:`(N)` or
-            :math:`(N, d_1, d_2, ..., d_K)` , data type must be int32.
-
-            data type must be float16 or float32.
+            :math:`(N, d_1, d_2, ..., d_K)` , data type must be int32. For probabilities, tensor of shape :math:`(C,)` ,
+            :math:`(N, C)` or :math:`(N, C, d_1, d_2, ..., d_K)` , data type must be float16 or float32.
         weight (Tensor): A rescaling weight applied to the loss of each batch element.
-            If not None, the shape is :math:`(C,)`,
-            data type must be float16 or float32. Default: ``None`` .
+            If not None, the shape is :math:`(C,)`, data type must be float16 or float32. Default: ``None`` .
         ignore_index (int): Specifies a target value that is ignored
             and does not contribute to the input gradient. Default: ``-100`` .
-        reduction (str):
-            Default: ``'mean'`` .
+        reduction (str, optional): Apply specific reduction method to the output: ``'none'`` , ``'mean'`` ,
+            ``'sum'`` . Default: ``'mean'`` .
+
+            - ``'none'``: no reduction will be applied.
+            - ``'mean'``: compute and return the weighted mean of elements in the output.
+            - ``'sum'``: the output elements will be summed.
+
         label_smoothing (float): Label smoothing values, a regularization tool used to prevent the model
             from overfitting when calculating Loss. The value range is [0.0, 1.0]. Default value: ``0.0`` .

@@ -3678,17 +3763,16 @@ def cross_entropy(input, target, weight=None, ignore_index=-100, reduction='mean
         ``Ascend`` ``GPU`` ``CPU``

     Examples:
-        >>> import mindspore
+        >>> import mindspore as ms
         >>> import numpy as np
-        >>> from mindspore import Tensor, ops
         >>> # Case 1: Indices labels
-        >>> inputs =
-        >>> target =
-        >>> output = ops.cross_entropy(inputs, target)
+        >>> inputs = ms.Tensor(np.random.randn(3, 5), ms.float32)
+        >>> target = ms.Tensor(np.array([1, 0, 4]), ms.int32)
+        >>> output = ms.ops.cross_entropy(inputs, target)
         >>> # Case 2: Probability labels
-        >>> inputs =
-        >>> target =
-        >>> output = ops.cross_entropy(inputs, target)
+        >>> inputs = ms.Tensor(np.random.randn(3, 5), ms.float32)
+        >>> target = ms.Tensor(np.random.randn(3, 5), ms.float32)
+        >>> output = ms.ops.cross_entropy(inputs, target)
     """
     _check_is_tensor('input', input, "cross_entropy_loss")
     _check_is_tensor('target', target, "cross_entropy_loss")
@@ -3743,7 +3827,7 @@ def nll_loss(inputs, target, weight=None, ignore_index=-100, reduction='mean', l
     N is the batch size, :math:`c` belonging to :math:`[0, C-1]` is class index, where :math:`C` is the number of
     classes.

-    If reduction is not
+    If `reduction` is not ``None`` (default 'mean'), then

     .. math::

@@ -3763,8 +3847,13 @@ def nll_loss(inputs, target, weight=None, ignore_index=-100, reduction='mean', l
             The data type must be float16 or float32. Default: ``None`` .
         ignore_index (int): Specifies a target value that is ignored
             and does not contribute to the input gradient. Default: ``-100`` .
-        reduction (str):
-            Default: ``'mean'`` .
+        reduction (str, optional): Apply specific reduction method to the output: ``'none'`` , ``'mean'`` ,
+            ``'sum'`` . Default: ``'mean'`` .
+
+            - ``'none'``: no reduction will be applied.
+            - ``'mean'``: compute and return the weighted mean of elements in the output.
+            - ``'sum'``: the output elements will be summed.
+
         label_smoothing (float): Label smoothing values, a regularization tool used to prevent the model
             from overfitting when calculating Loss. The value range is [0.0, 1.0]. Default value: ``0.0`` .

@@ -3858,7 +3947,7 @@ def l1_loss(input, target, reduction='mean'):
     r"""
     Calculate the mean absolute error between the `input` value and the `target` value.

-    Assuming that the :math:`x` and :math:`y` are 1-D Tensor, length :math:`N`, `reduction` is set to "none"
+    Assuming that the :math:`x` and :math:`y` are 1-D Tensor, length :math:`N`, `reduction` is set to ``"none"``,
     then calculate the loss of :math:`x` and :math:`y` without dimensionality reduction.

     The formula is as follows:
@@ -3881,18 +3970,21 @@ def l1_loss(input, target, reduction='mean'):
         input (Tensor): Predicted value, Tensor of any dimension.
         target (Tensor): Target value, usually has the same shape as the `input`.
             If `input` and `target` have different shape, make sure they can broadcast to each other.
-        reduction (str, optional):
-
-
+        reduction (str, optional): Apply specific reduction method to the output: ``'none'`` , ``'mean'`` ,
+            ``'sum'`` . Default: ``'mean'`` .
+
+            - ``'none'``: no reduction will be applied.
+            - ``'mean'``: compute and return the mean of elements in the output.
+            - ``'sum'``: the output elements will be summed.

     Returns:
-        Tensor or Scalar, if `reduction` is "none"
+        Tensor or Scalar, if `reduction` is ``"none"``, return a Tensor with same shape and dtype as `input`.
         Otherwise, a scalar value will be returned.

     Raises:
         TypeError: If `input` is not a Tensor.
         TypeError: If `target` is not a Tensor.
-        ValueError: If `reduction` is not one of "none"
+        ValueError: If `reduction` is not one of ``"none"``, ``"mean"`` or ``"sum"``.

     Supported Platforms:
         ``Ascend`` ``GPU`` ``CPU``
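The reduction triple ('none' / 'mean' / 'sum') documented above recurs across this release's loss functions. For reference, a minimal NumPy sketch of its semantics for l1_loss (an illustration, not the MindSpore kernel; the helper name is invented):

import numpy as np

def l1_loss_ref(x, y, reduction="mean"):
    # Element-wise absolute error, then the selected reduction.
    loss = np.abs(x - y)
    if reduction == "none":
        return loss          # same shape and dtype as the input
    if reduction == "mean":
        return loss.mean()   # scalar
    if reduction == "sum":
        return loss.sum()    # scalar
    raise ValueError("reduction must be one of 'none', 'mean', 'sum'")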
@@ -3948,16 +4040,20 @@ def smooth_l1_loss(input, target, beta=1.0, reduction='none'):
         target (Tensor): Ground truth data, tensor of shape :math:`(N, *)`, same shape and dtype as the `input`.
         beta (float): A parameter used to control the point where the function will change between
             L1 to L2 loss. The value should be greater than zero. Default: ``1.0`` .
-        reduction (str): Apply specific reduction method to the output: ``'none'`` , ``'mean'``
-            Default: ``'none'`` .
+        reduction (str, optional): Apply specific reduction method to the output: ``'none'`` , ``'mean'`` ,
+            ``'sum'`` . Default: ``'none'`` .
+
+            - ``'none'``: no reduction will be applied.
+            - ``'mean'``: compute and return the mean of elements in the output.
+            - ``'sum'``: the output elements will be summed.

     Returns:
-        Tensor, if `reduction` is 'none'
+        Tensor, if `reduction` is ``'none'``, then output is a tensor with the same shape as `input`.
         Otherwise, the shape of output tensor is :math:`(1,)`.

     Raises:
         TypeError: If `beta` is not a float.
-        ValueError: If `reduction` is not one of 'none'
+        ValueError: If `reduction` is not one of ``'none'``, ``'mean'``, ``'sum'``.
         TypeError: If dtype of `input` or `target` is not one of float16, float32, float64.
         ValueError: If `beta` is less than or equal to 0.
         ValueError: If shape of `input` is not the same as `target`.
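As a quick reference for the beta switch point named above, a NumPy sketch of the piecewise definition (assuming the standard beta-parameterized form; not the kernel itself):

import numpy as np

def smooth_l1_ref(x, y, beta=1.0, reduction="none"):
    diff = np.abs(x - y)
    # Quadratic for |x - y| < beta, linear beyond it; continuous at the switch.
    loss = np.where(diff < beta, 0.5 * diff ** 2 / beta, diff - 0.5 * beta)
    if reduction == "mean":
        return loss.mean()
    if reduction == "sum":
        return loss.sum()
    return loss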
@@ -4072,6 +4168,7 @@ def leaky_relu(input, alpha=0.2):
     select_op = _get_cache_prim(P.Maximum)()
     if alpha > 1:
         select_op = _get_cache_prim(P.Minimum)()
+    alpha = _get_cache_prim(P.Cast)()(F.scalar_to_tensor(alpha), input.dtype)
     return select_op(alpha * input, input)

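The new Cast line keeps `alpha` in the input's dtype before the elementwise multiply. The max/min trick itself is easiest to see in plain NumPy (a sketch of the semantics, not the primitive-based code above):

import numpy as np

def leaky_relu_ref(x, alpha=0.2):
    # For alpha <= 1, max(alpha*x, x) leaves x >= 0 unchanged and scales x < 0.
    # For alpha > 1 the inequality flips, so min is used instead.
    return np.maximum(alpha * x, x) if alpha <= 1 else np.minimum(alpha * x, x)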
@@ -4158,6 +4255,10 @@ def lrn(x, depth_radius=5, bias=1.0, alpha=1.0, beta=0.5, norm_region="ACROSS_CH
     r"""
     Local Response Normalization.

+    .. warning::
+        lrn is deprecated on Ascend due to potential accuracy problem. It's recommended to use other
+        normalization methods, e.g. :class:`mindspore.ops.batch_norm`.
+
     .. math::

         b_{c} = a_{c}\left(k + \frac{\alpha}{n}
@@ -4186,7 +4287,7 @@ def lrn(x, depth_radius=5, bias=1.0, alpha=1.0, beta=0.5, norm_region="ACROSS_CH
         TypeError: If `x` is not a Tensor.

     Supported Platforms:
-        ``
+        ``GPU`` ``CPU``

     Examples:
         >>> import mindspore
@@ -4219,7 +4320,11 @@ def mish(x):
     <https://arxiv.org/abs/1908.08681>`_.

     Args:
-        x (Tensor): The input Tensor
+        x (Tensor): The input Tensor.
+            Supported dtypes:
+
+            - GPU/CPU: float16, float32, float64.
+            - Ascend: float16, float32.

     Returns:
         Tensor, with the same type and shape as the `x`.
@@ -4320,10 +4425,40 @@ def _check_type_and_shape_same(param_name1, input_data1, param_name2, input_data


 def margin_ranking_loss(input1, input2, target, margin=0.0, reduction='mean'):
-    """
+    r"""
     MarginRankingLoss creates a criterion that measures the loss.

-
+    Given two tensors :math:`input1`, :math:`input2` and a Tensor label :math:`target` with values 1 or -1,
+    the operation is as follows:
+
+    .. math::
+        \text{loss}(input1, input2, target) = \max(0, -target * (input1 - input2) + \text{margin})
+
+    Args:
+        input1 (Tensor): Tensor of shape :math:`(N, *)` where :math:`*` means, any number of additional dimensions.
+        input2 (Tensor): Tensor of shape :math:`(N, *)`, same shape and dtype as `input1`.
+        target (Tensor): Contains value 1 or -1. Suppose the shape of `input1` is
+            :math:`(x_1, x_2, x_3, ..., x_R)`, then the shape of `target` must be :math:`(x_1, x_2, x_3, ..., x_R)`.
+        margin (float, optional): Specify the adjustment factor of the operation. Default: ``0.0`` .
+        reduction (str, optional): Apply specific reduction method to the output: ``'none'`` , ``'mean'`` ,
+            ``'sum'`` . Default: ``'mean'`` .
+
+            - ``'none'``: no reduction will be applied.
+            - ``'mean'``: compute and return the mean of elements in the output.
+            - ``'sum'``: the output elements will be summed.
+
+    Returns:
+        Tensor or Scalar. If `reduction` is ``"none"``, its shape is the same as `labels`.
+        Otherwise, a scalar value will be returned.
+
+    Raises:
+        TypeError: If `margin` is not a float.
+        TypeError: If `input1`, `input2` or `target` is not a Tensor.
+        TypeError: If the types of `input1` and `input2` are inconsistent.
+        TypeError: If the types of `input1` and `target` are inconsistent.
+        ValueError: If the shape of `input1` and `input2` are inconsistent.
+        ValueError: If the shape of `input1` and `target` are inconsistent.
+        ValueError: If `reduction` is not one of ``'none'``, ``'mean'`` , ``'sum'``.

     Supported Platforms:
         ``Ascend`` ``GPU`` ``CPU``
@@ -4334,7 +4469,7 @@ def margin_ranking_loss(input1, input2, target, margin=0.0, reduction='mean'):
         >>> import numpy as np
         >>> input1 = Tensor(np.array([0.3864, -2.4093, -1.4076]), ms.float32)
         >>> input2 = Tensor(np.array([-0.6012, -1.6681, 1.2928]), ms.float32)
-        >>> target =
+        >>> target = ops.Sign()(Tensor(np.array([-2, -2, 3]), ms.float32))
         >>> output = ops.margin_ranking_loss(input1, input2, target)
         >>> print(output)
         1.2293333
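The doctest value can be reproduced by hand from the formula above: target is sign([-2, -2, 3]) = [-1, -1, 1], and the mean of max(0, -target * (input1 - input2)) over the three elements is about 1.2293. A NumPy check (illustrative, not the operator):

import numpy as np

input1 = np.array([0.3864, -2.4093, -1.4076])
input2 = np.array([-0.6012, -1.6681, 1.2928])
target = np.sign(np.array([-2.0, -2.0, 3.0]))        # [-1., -1., 1.]
loss = np.maximum(0.0, -target * (input1 - input2))  # margin = 0.0
print(loss.mean())                                   # ~1.2293333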
@@ -4375,17 +4510,20 @@ def cosine_embedding_loss(input1, input2, target, margin=0.0, reduction="mean"):
         target (Tensor): Contains value 1 or -1. Suppose the shape of `input1` is
             :math:`(x_1, x_2, x_3, ..., x_R)`, then the shape of `target` must be :math:`(x_1, x_3, x_4, ..., x_R)`.
         margin (float, optional): Should be in [-1.0, 1.0]. Default: 0.0.
-        reduction (str, optional):
-
-
+        reduction (str, optional): Apply specific reduction method to the output: ``'none'`` , ``'mean'`` ,
+            ``'sum'`` . Default: ``'mean'`` .
+
+            - ``'none'``: no reduction will be applied.
+            - ``'mean'``: compute and return the mean of elements in the output.
+            - ``'sum'``: the output elements will be summed.

     Returns:
-        Tensor or Scalar, if `reduction` is "none"
+        Tensor or Scalar, if `reduction` is ``"none"``, its shape is the same as `target`.
         Otherwise, a scalar value will be returned.

     Raises:
         TypeError: If `margin` is not a float.
-        ValueError: If `reduction` is not one of 'none'
+        ValueError: If `reduction` is not one of ``'none'``, ``'mean'``, ``'sum'``.
         ValueError: If `margin` is not in range [-1, 1].

     Supported Platforms:
@@ -4471,6 +4609,19 @@ def max_pool3d(x, kernel_size, stride=None, padding=0, dilation=1, ceil_mode=Fal

         - **output** (Tensor) - Maxpooling result, with shape :math:`(N_{out}, C_{out}, D_{out}, H_{out}, W_{out})`.
           It has the same data type as `x`.
+
+          .. math::
+              D_{out} = \left\lfloor\frac{D_{in} + 2 \times \text{padding}[0] - \text{dilation}[0] \times
+              (\text{kernel_size}[0] - 1) - 1}{\text{stride}[0]} + 1\right\rfloor
+
+          .. math::
+              H_{out} = \left\lfloor\frac{H_{in} + 2 \times \text{padding}[1] - \text{dilation}[1] \times
+              (\text{kernel_size}[1] - 1) - 1}{\text{stride}[1]} + 1\right\rfloor
+
+          .. math::
+              W_{out} = \left\lfloor\frac{W_{in} + 2 \times \text{padding}[2] - \text{dilation}[2] \times
+              (\text{kernel_size}[2] - 1) - 1}{\text{stride}[2]} + 1\right\rfloor
+
         - **argmax** (Tensor) - Index corresponding to the maximum value. Data type is int64. It will be returned
           only when `return_indices` is ``True`` .

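All three output-size formulas share one per-axis pattern; a small helper mirroring it (the helper name is ours, for illustration):

import math

def pooled_dim(size, padding, dilation, kernel_size, stride):
    # floor((size + 2*padding - dilation*(kernel_size - 1) - 1) / stride + 1)
    return math.floor((size + 2 * padding - dilation * (kernel_size - 1) - 1) / stride + 1)

print(pooled_dim(16, 1, 1, 3, 2))  # D_in=16, padding=1, dilation=1, kernel=3, stride=2 -> 8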
@@ -4529,14 +4680,24 @@ def grid_sample(input, grid, mode='bilinear', padding_mode='zeros', align_corner

     Args:
         input (Tensor): input with shape of :math:`(N, C, H_{in}, W_{in})` (4-D case) or :math:`(N, C, D_{in},
-            H_{in}, W_{in})` (5-D case) and dtype of
+            H_{in}, W_{in})` (5-D case) and dtype of float32 or float64.
         grid (Tensor): flow-field with shape of :math:`(N, H_{out}, W_{out}, 2)` (4-D case) or :math:`(N, D_{out},
             H_{out}, W_{out}, 3)` (5-D case) and same dtype as `input`.
         mode (str): An optional string specifying the interpolation method. The optional values are
-            'bilinear'
+            ``'bilinear'``, ``'nearest'``. Default: ``'bilinear'`` . Note: `bicubic` is not supported yet. When
             `mode="bilinear"` and the input is 5-D, the interpolation mode used internally will actually
             be trilinear. However, when the input is 4-D, the interpolation mode will legitimately be bilinear.
             Default: ``'bilinear'`` .
+
+            - ``'nearest'``: Nearest neighbor interpolation. Each output pixel is assigned the value of the
+              nearest input pixel. This method is simple and fast but can result in blocky or pixelated outputs.
+            - ``'bilinear'``: Bilinear interpolation. Each output pixel is a weighted average of the four nearest input
+              pixels, computed using bilinear interpolation. This method produces smoother results compared
+              to nearest neighbor interpolation.
+            - ``'trilinear'``: Trilinear interpolation. This is an extension of bilinear interpolation to 3D data.
+              It performs bilinear interpolation in the two spatial dimensions and linear interpolation along
+              the third dimension. It is commonly used for volume or 3D image interpolation.
+
         padding_mode (str): An optional string specifying the pad method. The optional values are "zeros", "border" or
             "reflection". Default: ``'zeros'`` .
         align_corners (bool): An optional bool. If set to `True`, the extrema (-1 and 1) are considered as referring to
@@ -4617,10 +4778,13 @@ def ctc_loss(log_probs, targets, input_lengths, target_lengths, blank=0, reducti
         input_lengths (Union(tuple, Tensor)): Lengths of the input. A tuple or Tensor of shape(N).
         target_lengths (Union(tuple, Tensor)): Lengths of the target. A tuple or Tensor of shape(N).
         blank (int, optional): The blank label. Default: ``0`` .
-        reduction (str, optional):
-            ``'
-
-
+        reduction (str, optional): Apply specific reduction method to the output: ``'none'`` , ``'mean'`` ,
+            ``'sum'`` . Default: ``'mean'`` .
+
+            - ``'none'``: no reduction will be applied.
+            - ``'mean'``: compute and return the mean of elements in the output.
+            - ``'sum'``: the output elements will be summed.
+
         zero_infinity (bool, optional): Whether to set infinite loss and correlation gradient to 0. Default: ``False`` .

     Returns:
@@ -4704,8 +4868,12 @@ def gaussian_nll_loss(x, target, var, full=False, eps=1e-6, reduction='mean'):
         full (bool, optional): Include the constant term in the loss calculation. When :math:`full=True`,
             the constant term will be :math:`const = 0.5*log(2\pi)`. Default: ``False``.
         eps (float, optional): Used to improve the stability of log function must be greater than 0. Default: ``1e-6`` .
-        reduction (str, optional): Apply specific reduction method to the
-
+        reduction (str, optional): Apply specific reduction method to the output: ``'none'`` , ``'mean'`` ,
+            ``'sum'`` . Default: ``'mean'`` .
+
+            - ``'none'``: no reduction will be applied.
+            - ``'mean'``: compute and return the mean of elements in the output.
+            - ``'sum'``: the output elements will be summed.

     Returns:
         Tensor or Tensor scalar, the computed loss depending on :math:`reduction`.
@@ -4722,8 +4890,7 @@ def gaussian_nll_loss(x, target, var, full=False, eps=1e-6, reduction='mean'):

     Examples:
         >>> import numpy as np
-        >>> from mindspore import Tensor
-        >>> import mindspore.ops as ops
+        >>> from mindspore import Tensor, ops
         >>> import mindspore.common.dtype as mstype
         >>> arr1 = np.arange(8).reshape((4, 2))
         >>> arr2 = np.array([2, 3, 1, 4, 6, 4, 4, 9]).reshape((4, 2))
@@ -4831,9 +4998,12 @@ def hinge_embedding_loss(inputs, targets, margin=1.0, reduction='mean'):
             Has the same shape as `inputs`, contains -1 or 1.
         margin (float, int): Threshold defined by Hinge Embedding Loss :math:`margin`.
             Represented as :math:`\Delta` in the formula. Default: ``1.0`` .
-        reduction (str):
-            ``'
-
+        reduction (str, optional): Apply specific reduction method to the output: ``'none'`` , ``'mean'`` ,
+            ``'sum'`` . Default: ``'mean'`` .
+
+            - ``'none'``: no reduction will be applied.
+            - ``'mean'``: compute and return the mean of elements in the output.
+            - ``'sum'``: the output elements will be summed.

     Returns:
         Tensor or Tensor scalar, the computed loss depending on :math:`reduction`.
@@ -4843,7 +5013,7 @@ def hinge_embedding_loss(inputs, targets, margin=1.0, reduction='mean'):
         TypeError: If `targets` is not a Tensor.
         TypeError: If `margin` is not a float or int.
         ValueError: If `targets` does not have the same shape as `inputs` or they could not broadcast to each other.
-        ValueError: If `reduction` is not one of 'none'
+        ValueError: If `reduction` is not one of ``'none'``, ``'mean'``, ``'sum'``.

     Supported Platforms:
         ``Ascend`` ``GPU`` ``CPU``
@@ -4889,6 +5059,9 @@ def ctc_greedy_decoder(inputs, sequence_length, merge_repeated=True):
     r"""
     Performs greedy decoding on the logits given in inputs.

+    Note:
+        On Ascend, 'merge_repeated' cannot be set to False.
+
     Args:
         inputs (Tensor): The input Tensor must be a 3-D tensor whose shape is
             :math:`(max\_time, batch\_size, num\_classes)`. `num_classes` must be `num_labels + 1` classes,
@@ -5068,74 +5241,87 @@ def _check_conv_iterable_lengths(iterable, dim, iter_name):

 def conv1d(input, weight, bias=None, stride=1, pad_mode="valid", padding=0, dilation=1, groups=1):
     r"""
-    Applies a 1D convolution over an input tensor.
-
-    where :math:`N` is batch size, :math:`
-
-
+    Applies a 1D convolution over an input tensor. The input Tensor is typically
+    of shape :math:`(N, C_{in}, L_{in})`,
+    where :math:`N` is batch size, :math:`C` is channel number, :math:`L` is input sequence width.
+
+    The output is calculated based on formula:

     .. math::

-
+        \text{out}(N_i, C_{\text{out}_j}) = \text{bias}(C_{\text{out}_j}) +
+        \sum_{k = 0}^{C_{in} - 1} \text{ccor}({\text{weight}(C_{\text{out}_j}, k), \text{X}(N_i, k)})
+
+    where :math:`bias` is the output channel bias, :math:`ccor` is
+    the `cross-correlation <https://en.wikipedia.org/wiki/Cross-correlation>`_,
+    :math:`weight` is the convolution kernel value and :math:`X` represents the input feature map.
+
+    Here are the indices' meanings:
+
+    - :math:`i` corresponds to the batch number, ranging from 0 to N-1, where N is the batch size of the input.
+
+    - :math:`j` corresponds to the output channel, ranging from 0 to C_{out}-1, where C_{out} is the number of
+      output channels, which is also equal to the number of kernels.

-
-
-    from :math:`0` to :math:`C_{out} - 1`, :math:`W_{ij}` corresponds to the :math:`i`-th channel of the :math:`j`-th
-    filter and :math:`out_{j}` corresponds to the :math:`j`-th channel of the output. :math:`W_{j}` is a slice
-    of kernel, and it has shape :math:`(\text{kernal_size})`, where :math:`\text{kernel_size}` is the width of
-    the convolution kernel. The full kernel has shape :math:`(C_{out}, C_{in} / \text{groups}, \text{kernel_size})`,
-    where `groups` is the group number to split the input in the channel dimension.
+    - :math:`k` corresponds to the input channel, ranging from 0 to C_{in}-1, where C_{in} is the number of
+      input channels, which is also equal to the number of channels in the convolutional kernels.

-
-
-
+    Therefore, in the above formula, :math:`{bias}(C_{out_j})` represents the bias of the :math:`j`-th
+    output channel, :math:`{weight}(C_{out_j}, k)` represents the slice of the :math:`j`-th convolutional
+    kernel in the :math:`k`-th channel, and :math:`{X}(N_i, k)` represents the slice of the :math:`k`-th input
+    channel in the :math:`i`-th batch of the input feature map.

-
-    :math:`
-
-
+    The shape of the convolutional kernel is given by :math:`(kernel\_size)`,
+    where :math:`kernel\_size` is the width of the kernel.
+    If we consider the input and output channels as well as the `group` parameter, the complete kernel shape
+    will be :math:`(C_{out}, C_{in} / \text{group}, \text{kernel_size})`,
+    where `group` is the number of groups dividing `x`'s input channel when applying group convolution.

-
-    <http://vision.stanford.edu/cs598_spring07/papers/Lecun98.pdf>`_
-    `ConvNets <http://cs231n.github.io/convolutional-networks/>`_ .
+    For more details about convolution layer, please refer to `Gradient Based Learning Applied to Document Recognition
+    <http://vision.stanford.edu/cs598_spring07/papers/Lecun98.pdf>`_
+    and `ConvNets <http://cs231n.github.io/convolutional-networks/>`_ .

     Note:
         On Ascend platform, only group convolution in depthwise convolution scenarios is supported.
         That is, when `groups>1`, condition `C_{in}` = `C_{out}` = `groups` must be satisfied.

     Args:
-        input (Tensor): Tensor of shape :math:`(N, C_{in},
-        weight (Tensor):
-            :math:`(N, C_{in} / \text{groups}, \text{kernel_size})
-
-        bias (Tensor): Bias Tensor with shape :math:`(C_{out})`.
+        input (Tensor): Input Tensor of shape :math:`(N, C_{in}, L_{in})`.
+        weight (Tensor): The convolutional kernel value, it should have shape
+            :math:`(N, C_{in} / \text{groups}, \text{kernel_size})`.
+        bias (Tensor, optional): Bias Tensor with shape :math:`(C_{out})`.
             When bias is None, zeros will be used. Default: ``None`` .
         stride (Union(int, tuple[int]), optional): The distance of kernel moving, an int number or a tuple of one int
-            that represents width of movement. Default: 1
+            that represents width of movement. Default: ``1``.
         pad_mode (str, optional): Specifies padding mode. The optional values are
             ``"same"`` , ``"valid"`` and ``"pad"`` . Default: ``"valid"`` .

-            - same
+            - ``"same"``: Adopts the way of completion. The height and width of the output will be equal to
              the input `x` divided by stride. The padding will be evenly calculated in left and right possibly.
              Otherwise, the last extra padding will be calculated from the right side.
              If this mode is set, `padding` must be 0.

-            - valid
+            - ``"valid"``: Adopts the way of discarding. The possible largest width of output will be returned
              without padding. Extra pixels will be discarded. If this mode is set, `padding` must be 0.

-            - pad
+            - ``"pad"``: Implicit paddings on both sides of the input `x`.
+              The number of `padding` will be padded to the input
              Tensor borders. `padding` must be greater than or equal to 0.
-        padding (Union(int, tuple[int], list[int]), optional):
+        padding (Union(int, tuple[int], list[int]), optional): Specifies the amount of padding to apply on
+            both side of `input` when `pad_mode` is set to ``"pad"``. The
            paddings of left and right are the same, equal to padding or padding[0] when padding is a tuple of
            1 integer. Default: ``0`` .
-        dilation (Union(int, tuple[int]), optional):
-
-
-
+        dilation (Union(int, tuple[int]), optional): Specifies the dilation rate to use for dilated convolution.
+            It can be a single int or a tuple of 1 integer.
+            Assuming :math:`dilation=(d0,)`, the convolutional kernel samples the input with a
+            spacing of :math:`d0-1` elements in the width direction.
+            The value should be in the ranges [1, L].
+            Default: ``1`` .
         groups (int, optional): Splits `input` into groups. Default: ``1`` .

     Returns:
-        Tensor, the value that applied 1D convolution. The shape is :math:`(N, C_{out},
+        Tensor, the value that applied 1D convolution. The shape is :math:`(N, C_{out}, L_{out})`.
+        To see how different pad modes affect the output shape, please refer to
+        :class:`mindspore.nn.Conv1d` for more details.

     Raises:
         TypeError: If `stride`, `padding` or `dilation` is neither an int nor a tuple.
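The rewritten formula is a per-output-channel sum of 1D cross-correlations. A direct NumPy transcription for the groups=1, stride=1, unpadded case (an illustrative reference, not the operator):

import numpy as np

def conv1d_ref(x, weight, bias=None):
    # x: (N, C_in, L_in); weight: (C_out, C_in, kernel_size); bias: (C_out,)
    n, c_in, l_in = x.shape
    c_out, _, k = weight.shape
    out = np.zeros((n, c_out, l_in - k + 1))
    for i in range(n):                  # batch index i
        for j in range(c_out):          # output channel j
            for t in range(out.shape[-1]):
                # ccor(weight(j), X(i)) summed over input channels
                out[i, j, t] = np.sum(x[i, :, t:t + k] * weight[j])
    if bias is not None:
        out += bias.reshape(1, -1, 1)
    return out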
@@ -5204,40 +5390,44 @@ def conv1d(input, weight, bias=None, stride=1, pad_mode="valid", padding=0, dila

 def conv2d(input, weight, bias=None, stride=1, pad_mode="valid", padding=0, dilation=1, groups=1):
     r"""
-    Applies a 2D convolution over an input tensor.
-
-
-
-
+    Applies a 2D convolution over an input tensor. The input tensor is typically of
+    shape :math:`(N, C_{in}, H_{in}, W_{in})`, where :math:`N` is batch size, :math:`C` is
+    channel number, :math:`H` is feature height, :math:`W` is feature width.
+
+    The output is calculated based on formula:

     .. math::

-
+        \text{out}(N_i, C_{\text{out}_j}) = \text{bias}(C_{\text{out}_j}) +
+        \sum_{k = 0}^{C_{in} - 1} \text{ccor}({\text{weight}(C_{\text{out}_j}, k), \text{X}(N_i, k)})
+
+    where :math:`bias` is the output channel bias, :math:`ccor` is
+    the `cross-correlation <https://en.wikipedia.org/wiki/Cross-correlation>`_,
+    :math:`weight` is the convolution kernel value and :math:`X` represents the input feature map.
+
+    Here are the indices' meanings:
+
+    - :math:`i` corresponds to the batch number, ranging from 0 to N-1, where N is the batch size of the input.

-
-
-    from :math:`0` to :math:`C_{out} - 1`, :math:`W_{ij}` corresponds to the :math:`i`-th channel of the :math:`j`-th
-    filter and :math:`out_{j}` corresponds to the :math:`j`-th channel of the output. :math:`W_{ij}` is a slice
-    of kernel, and it has shape :math:`(\text{kernel_size[0]}, \text{kernel_size[1]})`, where :math:`\text{
-    kernel_size[0]}` and :math:`\text{kernel_size[1]}` are the height and width of the convolution kernel.
-    The full kernel has shape :math:`(C_{out}, C_{in} / \text{groups}, \text{kernel_size[0]}, \text{kernel_size[1]})`,
-    where `groups` is the group number to split the input in the channel dimension.
+    - :math:`j` corresponds to the output channel, ranging from 0 to C_{out}-1, where C_{out} is the number of
+      output channels, which is also equal to the number of kernels.

-
-
-    (\text{kernel_size[0]} - 1) \times(\text{dilation[0]} - 1)} {\text { stride[0] }}} \right \rfloor` and
+    - :math:`k` corresponds to the input channel, ranging from 0 to C_{in}-1, where C_{in} is the number of
+      input channels, which is also equal to the number of channels in the convolutional kernels.

-
-
+    Therefore, in the above formula, :math:`{bias}(C_{out_j})` represents the bias of the :math:`j`-th
+    output channel, :math:`{weight}(C_{out_j}, k)` represents the slice of the :math:`j`-th convolutional
+    kernel in the :math:`k`-th channel, and :math:`{X}(N_i, k)` represents the slice of the :math:`k`-th input
+    channel in the :math:`i`-th batch of the input feature map.

-
-    :math:`
-
-
+    The shape of the convolutional kernel is given by :math:`(kernel\_size[0], kernel\_size[1])`,
+    where :math:`kernel\_size[0]` and :math:`kernel\_size[1]` are the height and width of the kernel, respectively.
+    If we consider the input and output channels as well as the `group` parameter, the complete kernel shape
+    will be :math:`(C_{out}, C_{in} / \text{group}, \text{kernel_size[0]}, \text{kernel_size[1]})`,
+    where `group` is the number of groups dividing `x`'s input channel when applying group convolution.

-
-    <http://vision.stanford.edu/cs598_spring07/papers/Lecun98.pdf>`_
-    `ConvNets <http://cs231n.github.io/convolutional-networks/>`_
+    For more details about convolution layer, please refer to `Gradient Based Learning Applied to Document Recognition
+    <http://vision.stanford.edu/cs598_spring07/papers/Lecun98.pdf>`_ and
+    `ConvNets <http://cs231n.github.io/convolutional-networks/>`_.

     Note:
         On Ascend platform, only group convolution in depthwise convolution scenarios is supported.
@@ -5248,7 +5438,7 @@ def conv2d(input, weight, bias=None, stride=1, pad_mode="valid", padding=0, dila
         weight (Tensor): Tensor of shape
             :math:`(N, C_{in} / \text{groups}, \text{kernel_size[0]}, \text{kernel_size[1]})`, then the size of kernel
             is :math:`(\text{kernel_size[0]}, \text{kernel_size[1]})`.
-        bias (Tensor): Bias Tensor with shape :math:`(C_{out})`.
+        bias (Tensor, optional): Bias Tensor with shape :math:`(C_{out})`.
             When bias is ``None`` , zeros will be used. Default: ``None`` .
         stride (Union(int, tuple[int]), optional): The distance of kernel moving, an int number that represents
             the height and width of movement are both strides, or a tuple of two int numbers that
@@ -5278,6 +5468,9 @@ def conv2d(input, weight, bias=None, stride=1, pad_mode="valid", padding=0, dila

     Returns:
         Tensor, the value that applied 2D convolution. The shape is :math:`(N, C_{out}, H_{out}, W_{out})`.
+        To see how different pad modes affect the output shape, please refer to
+        :class:`mindspore.nn.Conv2d` for more details.
+

     Raises:
         TypeError: If `stride`, `padding` or `dilation` is neither an int nor a tuple.
@@ -5421,8 +5614,9 @@ def huber_loss(input, target, reduction='mean', delta=1.0):
     Calculates the error between the predicted value and the target value,
     which has the best of both the loss of l1 and the loss of mse.

-    Assuming that the :math:`x` and :math:`y` are 1-D Tensor, length :math:`N`, the reduction parameter
-    then calculate the loss of :math:`x` and :math:`y` without dimensionality reduction.
+    Assuming that the :math:`x` and :math:`y` are 1-D Tensor, length :math:`N`, the `reduction` parameter
+    is set to ``"none"`` then calculate the loss of :math:`x` and :math:`y` without dimensionality reduction.
+    The formula is as follows:

     .. math::
         \ell(x, y) = L = \{l_1,\dots,l_N\}^\top
@@ -5451,21 +5645,25 @@ def huber_loss(input, target, reduction='mean', delta=1.0):
         target (Tensor): Target value, has same dtype and shape as the `input` in common cases.
             However, when the shape of `target` is different from the shape of `input`,
             and they should be broadcasted to each other.
-        reduction (str):
-
-
+        reduction (str, optional): Apply specific reduction method to the output: ``'none'`` , ``'mean'`` ,
+            ``'sum'`` . Default: ``'mean'`` .
+
+            - ``'none'``: no reduction will be applied.
+            - ``'mean'``: compute and return the mean of elements in the output.
+            - ``'sum'``: the output elements will be summed.
+
         delta (Union[int, float]): The threshold to change between two type of loss.
             The value must be greater than zero. Default: ``1.0`` .

     Returns:
-        Tensor or Scalar, if `reduction` is "none"
+        Tensor or Scalar, if `reduction` is ``"none"``, return a Tensor with same shape and dtype as `input`.
         Otherwise, a scalar value will be returned.

     Raises:
         TypeError: If `input` or `target` is not a Tensor.
         TypeError: If dtype of `delta` is neither float nor int.
         ValueError: If `delta` is less than or equal to 0.
-        ValueError: If `reduction` is not one of "none"
+        ValueError: If `reduction` is not one of ``"none"``, ``"mean"``, ``"sum"``.
         ValueError: If `input` and `target` have different shapes and cannot be broadcasted to each other.

     Supported Platforms:
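For reference, the piecewise Huber definition the docstring describes, in NumPy (a sketch of the delta-parameterized form, not the kernel):

import numpy as np

def huber_ref(x, y, delta=1.0, reduction="mean"):
    diff = np.abs(x - y)
    # MSE-like inside |x - y| < delta, L1-like outside; continuous at delta.
    loss = np.where(diff < delta, 0.5 * diff ** 2, delta * (diff - 0.5 * delta))
    if reduction == "none":
        return loss
    return loss.mean() if reduction == "mean" else loss.sum()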
@@ -5655,15 +5853,20 @@ def bias_add(input_x, bias):
         consistent with the shape of the `input_x` Tensor.

     Args:
-        input_x (Tensor): The input tensor. The shape can be 2-5 dimensions.
-
+        input_x (Tensor): The input tensor. The shape can be 2-5 dimensions. Supported dtypes:
+
+            - Ascend/CPU: all Number type.
+            - GPU: float16, float32, int8.
+
+        bias (Tensor): The bias tensor, with shape :math:`(C)`. C must be the same as channel dimension C of
+            `input_x`. It has the same type as `input_x`.

     Returns:
         Tensor, with the same shape and data type as `input_x`.

     Raises:
         TypeError: If `input_x` or `bias` is not a Tensor.
-        TypeError: If dtype of `input_x`
+        TypeError: If dtype of `input_x` and `bias` is inconsistent.
         TypeError: If dimension of `input_x` is not in the range [2, 5].

     Supported Platforms:
@@ -5718,11 +5921,12 @@ def binary_cross_entropy(logits, labels, weight=None, reduction='mean'):
             the loss function
             will not consider any sample weights, and each sample will be treated as having equal importance
             when calculating the loss.
-        reduction (str, optional):
-
-
-
-
+        reduction (str, optional): Apply specific reduction method to the output: ``'none'`` , ``'mean'`` ,
+            ``'sum'`` . Default: ``'mean'`` .
+
+            - ``'none'``: no reduction will be applied.
+            - ``'mean'``: compute and return the weighted mean of elements in the output.
+            - ``'sum'``: the output elements will be summed.

     Returns:
         Tensor or Scalar. Returns Tensor that has the same dtype and shape as `logits` if `reduction` is 'none'.
@@ -5731,7 +5935,7 @@ def binary_cross_entropy(logits, labels, weight=None, reduction='mean'):
     Raises:
         TypeError: If `logits`, `labels` or `weight` is not a Tensor.
         TypeError: If dtype of `logits`, `labels` or `weight` (if given) is neither float16 nor float32.
-        ValueError: If `reduction` is not one of 'none'
+        ValueError: If `reduction` is not one of ``'none'``, ``'mean'`` or ``'sum'``.
         ValueError: If shape of `labels` is not the same as `logits` or `weight` (if given).

     Supported Platforms:
@@ -5754,32 +5958,46 @@ def binary_cross_entropy(logits, labels, weight=None, reduction='mean'):

 def conv3d(input, weight, bias=None, stride=1, pad_mode="valid", padding=0, dilation=1, groups=1):
     r"""
-    Applies a 3D convolution over an input tensor. The input tensor is typically of
-    :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})`
-    :math:`
-
-
+    Applies a 3D convolution over an input tensor. The input tensor is typically of
+    shape :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})`, where :math:`N` is batch size, :math:`C`
+    is channel number, :math:`D` is feature depth, :math:`H` is feature height, :math:`W` is feature width.
+
+    The output is calculated based on formula:

     .. math::
-
-    \
-    \
-
-    where :math:`
-
-    :math:`
-
-
-    :math:`
-
-
-
-
-    :math:`
-
-
-
-
+
+        \text{out}(N_i, C_{\text{out}_j}) = \text{bias}(C_{\text{out}_j}) +
+        \sum_{k = 0}^{C_{in} - 1} \text{ccor}({\text{weight}(C_{\text{out}_j}, k), \text{X}(N_i, k)})
+
+    where :math:`bias` is the output channel bias, :math:`ccor` is
+    the `cross-correlation <https://en.wikipedia.org/wiki/Cross-correlation>`_,
+    :math:`weight` is the convolution kernel value and :math:`X` represents the input feature map.
+
+    Here are the indices' meanings:
+
+    - :math:`i` corresponds to the batch number, ranging from 0 to N-1, where N is the batch size of the input.
+
+    - :math:`j` corresponds to the output channel, ranging from 0 to C_{out}-1, where C_{out} is the number of
+      output channels, which is also equal to the number of kernels.
+
+    - :math:`k` corresponds to the input channel, ranging from 0 to C_{in}-1, where C_{in} is the number of
+      input channels, which is also equal to the number of channels in the convolutional kernels.
+
+    Therefore, in the above formula, :math:`{bias}(C_{out_j})` represents the bias of the :math:`j`-th
+    output channel, :math:`{weight}(C_{out_j}, k)` represents the slice of the :math:`j`-th convolutional
+    kernel in the :math:`k`-th channel, and :math:`{X}(N_i, k)` represents the slice of the :math:`k`-th input
+    channel in the :math:`i`-th batch of the input feature map.
+
+    The shape of the convolutional kernel is given by
+    :math:`(\text{kernel_size[0]}, \text{kernel_size[1]}, \text{kernel_size[2]})`,
+    where :math:`kernel\_size[0]` , :math:`kernel\_size[1]` and :math:`kernel\_size[2]` are the depth,
+    height and width of the kernel, respectively.
+    If we consider the input and output channels as well as the `group` parameter, the complete kernel shape
+    will be :math:`(C_{out}, C_{in} / \text{group}, \text{kernel_size[0]},
+    \text{kernel_size[1]}, \text{kernel_size[2]})`,
+    where `group` is the number of groups dividing `x`'s input channel when applying group convolution.
+
+    For more details about convolution layer, please refer to `Gradient Based Learning Applied to Document Recognition
+    <http://vision.stanford.edu/cs598_spring07/papers/Lecun98.pdf>`_.

     Note:
         1. On Ascend platform, `groups = 1` must be satisfied.
@@ -5790,8 +6008,8 @@ def conv3d(input, weight, bias=None, stride=1, pad_mode="valid", padding=0, dila
         weight (Tensor): Set size of kernel is :math:`(\text{kernel_size[0]}, \text{kernel_size[1]},
             \text{kernel_size[2]})`, then the shape is :math:`(C_{out}, C_{in}, \text{kernel_size[0]},
             \text{kernel_size[1]}, \text{kernel_size[2]})`.
-        bias (Tensor): Bias Tensor with shape :math:`(C_{out})`.
-            ``None`` .
+        bias (Tensor, optional): Bias Tensor with shape :math:`(C_{out})`.
+            When bias is None, zeros will be used. Default: ``None`` .
         stride (Union[int, tuple[int]], optional): The distance of kernel moving,
             it can be an int number that represents
             the depth, height and width of movement or a tuple of three int numbers that
@@ -5799,18 +6017,18 @@ def conv3d(input, weight, bias=None, stride=1, pad_mode="valid", padding=0, dila
         pad_mode (str, optional): Specifies padding mode. The optional values are
             ``"same"`` , ``"valid"`` and ``"pad"`` . Default: ``"valid"`` .

-            - same
+            - ``"same"``: Adopts the way of completion. The depth, height and width of the output will be equal to
              the input `x` divided by stride. The padding will be evenly calculated in head and tail, top and bottom,
              left and right directions possibly.
              Otherwise, the last extra padding will be calculated from the tail, bottom and the right side.
              If this mode is set, `pad` must be 0.

-            - valid
+            - ``"valid"``: Adopts the way of discarding. The possible largest depth, height and width of output
              will be returned without padding. Extra pixels will be discarded. If this mode is set, `pad`
              must be 0.

-            - pad
-              be padded to the input Tensor borders. `pad` must be greater than or equal to 0.
+            - ``"pad"``: Implicit paddings on both sides of the input in depth, height and width.
+              The number of `pad` will be padded to the input Tensor borders. `pad` must be greater than or equal to 0.

         padding (Union[int, tuple[int], list[int]], optional): The pad value to be filled. If `pad` is an integer,
             the paddings of head, tail, top, bottom, left and right are the same, equal to pad.
@@ -5828,36 +6046,36 @@ def conv3d(input, weight, bias=None, stride=1, pad_mode="valid", padding=0, dila
     Returns:
         Tensor, the value that applied 3D convolution. The shape is :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})`.

-        `pad_mode` is
+        `pad_mode` is ``"same"``:

         .. math::
             \begin{array}{ll} \\
-                D_{out}
-                H_{out}
-                W_{out}
+                D_{out} = \left \lceil{\frac{D_{in}}{\text{stride[0]}}} \right \rceil \\
+                H_{out} = \left \lceil{\frac{H_{in}}{\text{stride[1]}}} \right \rceil \\
+                W_{out} = \left \lceil{\frac{W_{in}}{\text{stride[2]}}} \right \rceil \\
             \end{array}

-        `pad_mode` is
+        `pad_mode` is ``"valid"``:

         .. math::
             \begin{array}{ll} \\
-                D_{out}
+                D_{out} = \left \lfloor{\frac{D_{in} - \text{dilation[0]} \times (\text{kernel_size[0]} - 1) }
                {\text{stride[0]}} + 1} \right \rfloor \\
-                H_{out}
+                H_{out} = \left \lfloor{\frac{H_{in} - \text{dilation[1]} \times (\text{kernel_size[1]} - 1) }
                {\text{stride[1]}} + 1} \right \rfloor \\
-                W_{out}
+                W_{out} = \left \lfloor{\frac{W_{in} - \text{dilation[2]} \times (\text{kernel_size[2]} - 1) }
                {\text{stride[2]}} + 1} \right \rfloor \\
             \end{array}

-        `pad_mode` is
+        `pad_mode` is ``"pad"``:

         .. math::
             \begin{array}{ll} \\
-                D_{out}
+                D_{out} = \left \lfloor{\frac{D_{in} + padding[0] + padding[1] - (\text{dilation[0]} - 1) \times
                \text{kernel_size[0]} - 1 }{\text{stride[0]}} + 1} \right \rfloor \\
-                H_{out}
+                H_{out} = \left \lfloor{\frac{H_{in} + padding[2] + padding[3] - (\text{dilation[1]} - 1) \times
                \text{kernel_size[1]} - 1 }{\text{stride[1]}} + 1} \right \rfloor \\
-                W_{out}
+                W_{out} = \left \lfloor{\frac{W_{in} + padding[4] + padding[5] - (\text{dilation[2]} - 1) \times
                \text{kernel_size[2]} - 1 }{\text{stride[2]}} + 1} \right \rfloor \\
             \end{array}

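A helper that mirrors the three pad-mode formulas above verbatim, one spatial axis at a time (the function name is ours; pad_lo and pad_hi are the two padding entries for that axis):

import math

def conv3d_out_dim(size, kernel_size, stride, dilation, pad_lo, pad_hi, pad_mode):
    if pad_mode == "same":
        return math.ceil(size / stride)
    if pad_mode == "valid":
        return math.floor((size - dilation * (kernel_size - 1)) / stride + 1)
    # "pad": note the docstring's (dilation - 1) * kernel_size term.
    return math.floor((size + pad_lo + pad_hi - (dilation - 1) * kernel_size - 1) / stride + 1)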
@@ -6082,7 +6300,7 @@ def glu(x, axis=-1):
         TypeError: If `x` is not a Tensor.

     Supported Platforms:
-        ``Ascend`` ``CPU``
+        ``Ascend`` ``GPU`` ``CPU``

     Examples:
         >>> from mindspore import Tensor, ops
@@ -6128,12 +6346,12 @@ def multi_margin_loss(input, target, p=1, margin=1, weight=None, reduction='mean
         reduction (str, optional): Apply specific reduction method to the output: ``'none'`` , ``'mean'`` ,
             ``'sum'`` . Default: ``'mean'`` .

-            - ``'none'
-            - ``'mean'
-            - ``'sum'
+            - ``'none'``: no reduction will be applied.
+            - ``'mean'``: compute and return the weighted mean of elements in the output.
+            - ``'sum'``: the output elements will be summed.

     Returns:
-        Tensor. If `reduction` is 'none'
+        Tensor. If `reduction` is ``'none'``, returns a Tensor with the same shape as `target`.
         Otherwise, it is a scalar.

     Raises:
@@ -6200,13 +6418,14 @@ def multilabel_margin_loss(input, target, reduction='mean'):
         reduction (str, optional): Apply specific reduction method to the output: ``'none'`` , ``'mean'`` ,
             ``'sum'`` . Default: ``'mean'`` .

-            - ``'none'
-            - ``'mean'
-            - ``'sum'
+            - ``'none'``: no reduction will be applied.
+            - ``'mean'``: compute and return the mean of elements in the output.
+            - ``'sum'``: the output elements will be summed.

     Returns:
-        - **outputs** (Union[Tensor, Scalar]) - The loss of MultilabelMarginLoss.
-          is :math:`(N)`.
+        - **outputs** (Union[Tensor, Scalar]) - The loss of MultilabelMarginLoss.
+          If `reduction` is ``"none"``, its shape is :math:`(N)`.
+          Otherwise, a scalar value will be returned.

     Raises:
         TypeError: If `input` or `target` is not a Tensor.
@@ -6214,7 +6433,7 @@ def multilabel_margin_loss(input, target, reduction='mean'):
         TypeError: If dtype of `target` is not int32.
         ValueError: If length of shape of `input` is neither 1 nor 2.
         ValueError: If shape of `input` is not the same as `target`.
-        ValueError: If `reduction` is not one of 'none'
+        ValueError: If `reduction` is not one of ``'none'``, ``'mean'``, ``'sum'``.

     Supported Platforms:
         ``Ascend`` ``GPU``
@@ -6260,12 +6479,15 @@ def multilabel_soft_margin_loss(input, target, weight=None, reduction='mean'):
         input (Tensor): A tensor of shape :math:`(N, C)` , where N is batch size and C is number of classes.
         target (Tensor): The label target Tensor which has the same shape as `input`.
         weight (Union[Tensor, int, float]): The manual rescaling weight given to each class. Default: ``None``.
-        reduction (str):
-            ``'
-
+        reduction (str, optional): Apply specific reduction method to the output: ``'none'`` , ``'mean'`` ,
+            ``'sum'`` . Default: ``'mean'`` .
+
+            - ``'none'``: no reduction will be applied.
+            - ``'mean'``: compute and return the weighted mean of elements in the output.
+            - ``'sum'``: the output elements will be summed.

     Returns:
-        Tensor, the data type is the same as input, if the reduction is 'none'
+        Tensor, the data type is the same as input, if the `reduction` is ``'none'``,
         its shape is :math:`(N)` , otherwise it is zero.

     Raises:
@@ -6409,15 +6631,15 @@ def gelu(input_x, approximate='none'):

     x_dtype = _get_cache_prim(P.DType)()(input_x)
     if x_dtype not in [mstype.float16, mstype.float32, mstype.float64]:
-        raise TypeError("For gelu, the input dtype must be float16, float32 or float64, "
-                        "but got {}."
+        raise TypeError(f"For gelu, the input dtype must be float16, float32 or float64, "
+                        f"but got {x_dtype}.")
     if approximate == 'tanh':
         output = _get_cache_prim(P.GeLU)()(input_x)
     else:
-        output = _get_cache_prim(P.Sqrt)()(Tensor(2.0))
+        output = _get_cache_prim(P.Sqrt)()(Tensor(2.0, x_dtype))
         output = _get_cache_prim(P.Div)()(input_x, output)
-        output = _get_cache_prim(P.Erf)()(output) + Tensor(1.0)
-        output = input_x * output * Tensor(0.5)
+        output = _get_cache_prim(P.Erf)()(output) + Tensor(1.0, x_dtype)
+        output = input_x * output * Tensor(0.5, x_dtype)

     return output

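The change threads `x_dtype` into every constant, so a float16 input is no longer promoted by float32 defaults of `Tensor(2.0)` and friends. The non-'tanh' branch builds the exact erf-based GELU; a scalar reference check:

import math

def gelu_ref(x):
    # 0.5 * x * (1 + erf(x / sqrt(2))) -- the formula the branch above computes.
    return 0.5 * x * (1.0 + math.erf(x / math.sqrt(2.0)))

print(round(gelu_ref(1.0), 6))  # 0.841345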
@@ -6655,8 +6877,12 @@ def mse_loss(input, target, reduction='mean'):
         target (Tensor): The input label. Tensor of any dimension, same shape as the `input` in common cases.
             However, it supports that the shape of `input` is different from the shape of `target`
             and they should be broadcasted to each other.
-        reduction (str, optional):
-
+        reduction (str, optional): Apply specific reduction method to the output: ``'none'`` , ``'mean'`` ,
+            ``'sum'`` . Default: ``'mean'`` .
+
+            - ``'none'``: no reduction will be applied.
+            - ``'mean'``: compute and return the mean of elements in the output.
+            - ``'sum'``: the output elements will be summed.

     Returns:
         Tensor, loss of type float, the shape is zero if `reduction` is ``'mean'`` or ``'sum'`` ,
@@ -6759,11 +6985,15 @@ def triplet_margin_loss(anchor, positive, negative, margin=1.0, p=2, eps=1e-06,
         eps (float, optional): Add small value to avoid division by zero. Default: ``1e-06``.
         swap (bool, optional): The distance swap change the negative distance to the distance between positive
             sample and negative sample. Default: ``False`` .
-        reduction (str, optional): Apply specific reduction method to the output: ``'none'`` , ``'mean'`` ,
-            Default: ``'mean'`` .
+        reduction (str, optional): Apply specific reduction method to the output: ``'none'`` , ``'mean'`` ,
+            ``'sum'`` . Default: ``'mean'`` .
+
+            - ``'none'``: no reduction will be applied.
+            - ``'mean'``: compute and return the mean of elements in the output.
+            - ``'sum'``: the output elements will be summed.

     Returns:
-        Tensor. If `reduction` is "none"
+        Tensor. If `reduction` is ``"none"``, its shape is :math:`(N)`. Otherwise, a scalar value will be returned.

     Raises:
         TypeError: If `anchor` or `positive` or `negative` is not a Tensor.
@@ -6776,7 +7006,7 @@ def triplet_margin_loss(anchor, positive, negative, margin=1.0, p=2, eps=1e-06,
             same time.
         ValueError: If the dimension of input `anchor` or `positive` or `negative` is bigger than or equal to 8.
         ValueError: If shape of `anchor`, `positive` and `negative` cannot broadcast.
-        ValueError: If `reduction` is not one of 'none'
+        ValueError: If `reduction` is not one of ``'none'``, ``'mean'``, ``'sum'``.

     Supported Platforms:
         ``GPU``
@@ -6811,7 +7041,7 @@ def linear(x, w, b):
 def _inner_dropout(x, p, training):
     """inner dropout"""
     _dropout = _get_cache_prim(P.Dropout)(1 - p)
-    if p
+    if 0. < p <= 1. and training:
         return _dropout(x)[0]
     return x

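The rewritten guard makes the fast path explicit: dropout is applied only for p in (0, 1] while training; otherwise the input passes through unchanged. A self-contained NumPy sketch of the same control flow (inverted dropout, like P.Dropout with keep_prob = 1 - p):

import numpy as np

def inner_dropout_ref(x, p, training, rng=np.random.default_rng(0)):
    if 0. < p <= 1. and training:
        keep_prob = 1.0 - p
        if keep_prob == 0.0:
            return np.zeros_like(x)            # p == 1 drops everything
        mask = rng.random(x.shape) < keep_prob
        return np.where(mask, x / keep_prob, 0.0)  # scale kept values by 1/keep_prob
    return x  # identity when p == 0 or when evaluating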
@@ -6864,10 +7094,11 @@ def _in_projection_packed(q, k, v, w, b, k_is_v, q_is_k):
     return linear(q, w_q, b_q), linear(k, w_k, b_k), linear(v, w_v, b_v)


-def _scaled_dot_product_attention(query, key, value, attn_mask, dropout_p, is_causal, is_training):
+def _scaled_dot_product_attention(query, key, value, attn_mask, dropout_p, is_causal, is_training, dtype):
     """scaled dot product attention"""
     embed_size = query.shape[-1]
-
+    embed_size_tensor = scalar_to_tensor_(embed_size, dtype)
+    scaling_factor = embed_size_tensor.sqrt().sqrt()
     query = query / scaling_factor

     if is_causal:
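The `sqrt().sqrt()` is the fourth root of the embedding size: scaling the query (and, symmetrically, the key) by d ** -0.25 yields the usual 1/sqrt(d) attention scaling while keeping intermediate magnitudes small, which matters in float16. A NumPy sketch, under the assumption that the key side is scaled the same way before the matmul:

import numpy as np

def scaled_scores_ref(q, k):
    d = q.shape[-1]
    factor = np.sqrt(np.sqrt(float(d)))  # d ** 0.25, i.e. sqrt().sqrt()
    # (q / d**0.25) @ (k / d**0.25).T == (q @ k.T) / sqrt(d)
    return (q / factor) @ (k / factor).swapaxes(-1, -2)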
@@ -6960,7 +7191,7 @@ def multi_head_attention_forward(query, key, value, embed_dim_to_check, num_head
                                  out_proj_bias, training=True, key_padding_mask=None, attn_mask=None,
                                  use_separate_proj_weight=False, q_proj_weight=None, k_proj_weight=None,
                                  v_proj_weight=None, static_k=None, static_v=None, average_attn_weights=True,
-                                 is_causal=False, k_is_v=False, q_is_k=False):
+                                 is_causal=False, k_is_v=False, q_is_k=False, dtype=mstype.float32):
     """multi head attention forward function"""
     is_batched = _check_qkv_shape(query.ndim, key.ndim, value.ndim)
     if key_padding_mask is not None:
@@ -7117,7 +7348,7 @@ def multi_head_attention_forward(query, key, value, embed_dim_to_check, num_head
     v = v.view((bsz, num_heads, src_len, head_dim))

     attn_output, attn_output_weights = _scaled_dot_product_attention(
-        q, k, v, attn_mask, dropout_p, is_causal, training)
+        q, k, v, attn_mask, dropout_p, is_causal, training, dtype)
     attn_output = attn_output.transpose(2, 0, 1, 3).view((bsz * tgt_len, embed_dim))

     attn_output = linear(attn_output, out_proj_weight, out_proj_bias)
@@ -7213,6 +7444,82 @@ def max_pool2d(x, kernel_size, stride=None, padding=0, dilation=1, return_indice
     return out


+def prompt_flash_attention(query, key, value, padding_mask, attn_mask, actual_seq_lengths,
+                           actual_seq_lengths_kv, deq_scale1, quant_scale1,
+                           deq_scale2, quant_scale2, quant_offset2, num_heads, scale_value=1.0, pre_tokens=2147483547,
+                           next_tokens=0, input_layout='BSH',
+                           num_key_value_heads=0, sparse_mode=0):
+    r"""
+    The interface for fully inference.
+    B -- Batch size
+    S -- Sequence length
+    H -- Hidden size
+
+    Note:
+        Only supported on Ascend910B.
+
+    .. warning::
+        This is an experimental API that is subject to change or deletion.
+
+    Inputs:
+        query (Tensor) - The query tensor with data type of float16 or float32.
+            Input tensor of shape :math:`(B, S, H)` / `(B, N, S, D)`.
+        key (Tensor) - The key tensor with data type of float16 or float32.
+            Input tensor of shape :math:`(B, S, H)` / `(B, N, S, D)`.
+        value (Tensor) - The value tensor with data type of float16 or float32.
+            Input tensor of shape :math:`(B, S, H)` / `(B, N, S, D)`.
+        padding_mask (Tensor) - The padding mask tensor with data type of float16 or float32.
+        attn_mask (Tensor) - The attention mask tensor with data type of float16 or float32.
+            For each element, 0 indicates retention and 1 indicates discard. Input tensor of shape :math:`(B, 1, S, S)`.
+        actual_seq_lengths (list[int]): Describe actual sequence length of each input with data type of int.
+        actual_seq_lengths_kv (list[int]): Describe actual sequence length of each input with data type of int.
+        deq_scale1 (Tensor)
+        quant_scale1 (Tensor)
+        deq_scale2 (Tensor)
+        quant_scale2 (Tensor)
+        quant_offset2 (Tensor)
+        num_heads (int): The number of heads.
+        scale_value (float): The scale value indicating the scale coefficient, which is used as the scalar of
+            Muls in the calculation. Default: 1.0.
+        pre_tokens (int): Previous tokens. Default: 2147483547.
+        next_tokens (int): Next tokens, indicating the number of data blocks in the upper triangle involved
+            in the calculation. The value 0 indicates that the data blocks in the upper triangle are not
+            involved in the calculation. Default: 0.
+        input_layout (str): the data layout of the input qkv, support `(BSH)` and `(BNSD)`, Default `BSH`.
+        num_key_value_heads (int): head numbers of key/value which are used in GQA algorithm.
+            The value 0 indicates if the key and value have the same head nums, use numHeads. Default: 0.
+        sparse_mode (int): Default: 0
+
+    Outputs:
+        attention_out (Tensor) - Input tensor of shape :math:`(B, S, H)` / `(B, N, S, D)`.
+
+    Supported Platforms:
+        ``Ascend``
+
+    Examples:
+        >>> from mindspore.ops.function.nn_func import prompt_flash_attention
+        >>> from mindspore import Tensor
+        >>> import numpy as np
+        >>> B = 1
+        >>> N = 16
+        >>> S = 256
+        >>> D = 16
+        >>> query = Tensor(np.ones((B, N, S, D), dtype=np.float16))
+        >>> key = Tensor(np.ones((B, N, S, D), dtype=np.float16))
+        >>> value = Tensor(np.ones((B, N, S, D), dtype=np.float16))
+        >>> out = ops.prompt_flash_attention(query, key, value, None, None, None, None, None, None, None, None,
+        ...                                  None, N, input_layout='BNSD')
+        >>> print(out[0].shape)
+        (1, 16, 256, 16)
+    """
+
+    pfa = _get_cache_prim(NN_OPS.PromptFlashAttention)(num_heads, scale_value, pre_tokens, next_tokens, input_layout,
+                                                       num_key_value_heads, sparse_mode)
+    return pfa(query, key, value, padding_mask, attn_mask, actual_seq_lengths, actual_seq_lengths_kv, deq_scale1,
+               quant_scale1, deq_scale2, quant_scale2, quant_offset2)
+
+
 __all__ = [
     'adaptive_avg_pool1d',
     'adaptive_avg_pool2d',
@@ -7260,6 +7567,7 @@ __all__ = [
     'softsign',
     'softshrink',
     'soft_shrink',
+    'softplus',
     'selu',
     'silu',
     'soft_margin_loss',