mindspore-2.4.1-cp310-cp310-manylinux1_x86_64.whl → mindspore-2.5.0-cp310-cp310-manylinux1_x86_64.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
Potentially problematic release: this version of mindspore might be problematic.
- mindspore/.commit_id +1 -1
- mindspore/Third_Party_Open_Source_Software_Notice +39 -0
- mindspore/__init__.py +8 -3
- mindspore/_akg/akg/composite/build_module.py +6 -2
- mindspore/_akg/akg/utils/kernel_exec.py +2 -2
- mindspore/_c_dataengine.cpython-310-x86_64-linux-gnu.so +0 -0
- mindspore/_c_expression.cpython-310-x86_64-linux-gnu.so +0 -0
- mindspore/_c_mindrecord.cpython-310-x86_64-linux-gnu.so +0 -0
- mindspore/_checkparam.py +0 -5
- mindspore/_extends/parallel_compile/akg_compiler/gen_custom_op_files.py +1 -1
- mindspore/_extends/parse/compile_config.py +64 -0
- mindspore/_extends/parse/deprecated/__init__.py +0 -0
- mindspore/_extends/parse/deprecated/deprecated_tensor_method.py +375 -0
- mindspore/_extends/parse/parser.py +23 -5
- mindspore/_extends/parse/standard_method.py +123 -27
- mindspore/_extends/pijit/pijit_func_white_list.py +1 -1
- mindspore/amp.py +7 -1
- mindspore/boost/boost_cell_wrapper.py +136 -41
- mindspore/common/__init__.py +3 -1
- mindspore/common/_register_for_tensor.py +0 -1
- mindspore/common/_stub_tensor.py +25 -4
- mindspore/common/_tensor_cpp_method.py +17 -0
- mindspore/common/_tensor_docs.py +6132 -0
- mindspore/common/api.py +99 -25
- mindspore/common/dtype.py +34 -34
- mindspore/common/dump.py +2 -1
- mindspore/common/file_system.py +8 -1
- mindspore/common/generator.py +2 -0
- mindspore/common/hook_handle.py +3 -1
- mindspore/common/initializer.py +3 -4
- mindspore/common/lazy_inline.py +8 -2
- mindspore/common/mindir_util.py +10 -2
- mindspore/common/parameter.py +30 -27
- mindspore/common/tensor.py +713 -1337
- mindspore/communication/__init__.py +1 -1
- mindspore/communication/_comm_helper.py +10 -0
- mindspore/communication/comm_func.py +215 -173
- mindspore/communication/management.py +23 -20
- mindspore/context.py +292 -193
- mindspore/dataset/__init__.py +23 -19
- mindspore/dataset/callback/ds_callback.py +2 -1
- mindspore/dataset/core/config.py +84 -3
- mindspore/dataset/engine/cache_admin.py +3 -3
- mindspore/dataset/engine/cache_client.py +5 -4
- mindspore/dataset/engine/datasets.py +192 -149
- mindspore/dataset/engine/datasets_audio.py +14 -0
- mindspore/dataset/engine/datasets_standard_format.py +28 -11
- mindspore/dataset/engine/datasets_text.py +38 -1
- mindspore/dataset/engine/datasets_user_defined.py +125 -65
- mindspore/dataset/engine/datasets_vision.py +81 -8
- mindspore/dataset/engine/iterators.py +281 -63
- mindspore/dataset/engine/obs/util.py +8 -0
- mindspore/dataset/engine/queue.py +40 -0
- mindspore/dataset/engine/samplers.py +26 -2
- mindspore/dataset/engine/serializer_deserializer.py +1 -1
- mindspore/dataset/engine/validators.py +43 -11
- mindspore/dataset/transforms/py_transforms_util.py +17 -0
- mindspore/dataset/transforms/transforms.py +29 -12
- mindspore/dataset/vision/validators.py +1 -2
- mindspore/device_context/__init__.py +21 -0
- mindspore/device_context/ascend/__init__.py +25 -0
- mindspore/device_context/ascend/device.py +72 -0
- mindspore/device_context/ascend/op_debug.py +94 -0
- mindspore/device_context/ascend/op_precision.py +193 -0
- mindspore/device_context/ascend/op_tuning.py +127 -0
- mindspore/device_context/cpu/__init__.py +25 -0
- mindspore/device_context/cpu/device.py +62 -0
- mindspore/device_context/cpu/op_tuning.py +43 -0
- mindspore/device_context/gpu/__init__.py +21 -0
- mindspore/device_context/gpu/device.py +70 -0
- mindspore/device_context/gpu/op_precision.py +67 -0
- mindspore/device_context/gpu/op_tuning.py +175 -0
- mindspore/device_manager.py +134 -0
- mindspore/experimental/llm_boost/__init__.py +3 -2
- mindspore/experimental/llm_boost/ascend_native/__init__.py +22 -0
- mindspore/experimental/llm_boost/ascend_native/llama_boost_ascend_native.py +211 -0
- mindspore/experimental/llm_boost/ascend_native/llm_boost.py +52 -0
- mindspore/experimental/llm_boost/atb/boost_base.py +239 -64
- mindspore/experimental/llm_boost/atb/llama_boost.py +52 -30
- mindspore/experimental/llm_boost/atb/qwen_boost.py +47 -24
- mindspore/experimental/llm_boost/register.py +1 -0
- mindspore/experimental/optim/adadelta.py +26 -22
- mindspore/experimental/optim/adam.py +3 -0
- mindspore/experimental/optim/lr_scheduler.py +33 -24
- mindspore/experimental/optim/radam.py +33 -30
- mindspore/hal/device.py +28 -0
- mindspore/hal/event.py +17 -0
- mindspore/hal/memory.py +94 -3
- mindspore/hal/stream.py +91 -6
- mindspore/include/api/context.h +1 -2
- mindspore/include/dataset/constants.h +2 -2
- mindspore/lib/libavcodec.so.59 +0 -0
- mindspore/lib/libavdevice.so.59 +0 -0
- mindspore/lib/libavfilter.so.8 +0 -0
- mindspore/lib/libavformat.so.59 +0 -0
- mindspore/lib/libavutil.so.57 +0 -0
- mindspore/lib/libdnnl.so.2 +0 -0
- mindspore/lib/libmindspore_backend.so +0 -0
- mindspore/lib/libmindspore_common.so +0 -0
- mindspore/lib/libmindspore_core.so +0 -0
- mindspore/lib/libmindspore_glog.so.0 +0 -0
- mindspore/lib/libmindspore_gpr.so.15 +0 -0
- mindspore/lib/libmindspore_grpc++.so.1 +0 -0
- mindspore/lib/libmindspore_grpc.so.15 +0 -0
- mindspore/lib/libmindspore_ops.so +0 -0
- mindspore/lib/libmpi_adapter.so +0 -0
- mindspore/lib/libmpi_collective.so +0 -0
- mindspore/lib/libnnacl.so +0 -0
- mindspore/lib/libopencv_core.so.4.5 +0 -0
- mindspore/lib/libopencv_imgcodecs.so.4.5 +0 -0
- mindspore/lib/libopencv_imgproc.so.4.5 +0 -0
- mindspore/lib/libps_cache.so +0 -0
- mindspore/lib/libswresample.so.4 +0 -0
- mindspore/lib/libswscale.so.6 +0 -0
- mindspore/lib/plugin/ascend/custom_aicore_ops/op_impl/ai_core/tbe/config/ascend910_93/aic-ascend910_93-ops-info.json +2048 -0
- mindspore/lib/plugin/ascend/custom_aicore_ops/op_proto/libop_proto.so +0 -0
- mindspore/lib/plugin/ascend/custom_aicpu_ops/op_impl/cpu/aicpu_kernel/impl/libcust_cpu_kernels.so +0 -0
- mindspore/lib/plugin/ascend/custom_aicpu_ops/op_proto/libcust_op_proto.so +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/framework/npu_supported_ops.json +10 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_api/lib/libcust_opapi.so +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/config/ascend910/aic-ascend910-ops-info.json +182 -0
- mindspore/lib/plugin/ascend/{custom_ascendc_ops/op_impl/ai_core/tbe/custom_ascendc_ops_impl → custom_ascendc_910/op_impl/ai_core/tbe/custom_ascendc_910_impl}/dynamic/decoder_kv_cache.py +51 -16
- mindspore/lib/plugin/ascend/{custom_ascendc_ops/op_impl/ai_core/tbe/custom_ascendc_ops_impl → custom_ascendc_910/op_impl/ai_core/tbe/custom_ascendc_910_impl}/dynamic/prompt_kv_cache.py +51 -16
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/decoder_kv_cache/DecoderKvCache_0d5520cc587ad44ce634bf3fbcffc272.json +158 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/decoder_kv_cache/DecoderKvCache_0d5520cc587ad44ce634bf3fbcffc272.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/decoder_kv_cache/DecoderKvCache_20390d30b3c4c0d23167ccca6c030c2b.json +158 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/decoder_kv_cache/DecoderKvCache_20390d30b3c4c0d23167ccca6c030c2b.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/decoder_kv_cache/DecoderKvCache_2d151f0b1d2db51faa2968d5b67544e2.json +158 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/decoder_kv_cache/DecoderKvCache_2d151f0b1d2db51faa2968d5b67544e2.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/decoder_kv_cache/DecoderKvCache_561690ec17cc1def3d2fcf68c1b07b56.json +158 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/decoder_kv_cache/DecoderKvCache_561690ec17cc1def3d2fcf68c1b07b56.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/decoder_kv_cache/DecoderKvCache_570f9aaa99e5e773b3dd0a33784363f4.json +158 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/decoder_kv_cache/DecoderKvCache_570f9aaa99e5e773b3dd0a33784363f4.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/decoder_kv_cache/DecoderKvCache_59668a0f0764afb98fda8ab9e84126f1.json +158 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/decoder_kv_cache/DecoderKvCache_59668a0f0764afb98fda8ab9e84126f1.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/decoder_kv_cache/DecoderKvCache_91d9833e4792b70b670e4e2b916abd86.json +158 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/decoder_kv_cache/DecoderKvCache_91d9833e4792b70b670e4e2b916abd86.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/decoder_kv_cache/DecoderKvCache_c74cdc5fef094383401856f8519504af.json +158 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/decoder_kv_cache/DecoderKvCache_c74cdc5fef094383401856f8519504af.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/prompt_kv_cache/PromptKvCache_0515c7b1a4cd614449e38c5e9a7e3f8d.json +167 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/prompt_kv_cache/PromptKvCache_0515c7b1a4cd614449e38c5e9a7e3f8d.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/prompt_kv_cache/PromptKvCache_09f22d898d6358c91e7c4fc48bac48e7.json +167 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/prompt_kv_cache/PromptKvCache_09f22d898d6358c91e7c4fc48bac48e7.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/prompt_kv_cache/PromptKvCache_0cb9a6f894b925250227136e5aab7061.json +167 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/prompt_kv_cache/PromptKvCache_0cb9a6f894b925250227136e5aab7061.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/prompt_kv_cache/PromptKvCache_2fa8702ffd7ca85e9e194f62644415d5.json +167 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/prompt_kv_cache/PromptKvCache_2fa8702ffd7ca85e9e194f62644415d5.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/prompt_kv_cache/PromptKvCache_570b62f187dfd439b64613d881deedb7.json +167 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/prompt_kv_cache/PromptKvCache_570b62f187dfd439b64613d881deedb7.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/prompt_kv_cache/PromptKvCache_585218c11411ff84709b9e725b66c435.json +167 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/prompt_kv_cache/PromptKvCache_585218c11411ff84709b9e725b66c435.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/prompt_kv_cache/PromptKvCache_5c9365ccde170b358c5b126d69dae13e.json +167 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/prompt_kv_cache/PromptKvCache_5c9365ccde170b358c5b126d69dae13e.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/prompt_kv_cache/PromptKvCache_6d97c45b7c43bc16fcff8baa5dacac4e.json +167 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/prompt_kv_cache/PromptKvCache_6d97c45b7c43bc16fcff8baa5dacac4e.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/config/ascend910/binary_info_config.json +302 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/config/ascend910/decoder_kv_cache.json +892 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/config/ascend910/prompt_kv_cache.json +892 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64/libcust_opmaster_rt2.0.so +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/op_tiling/liboptiling.so +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_proto/inc/op_proto.h +33 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_proto/lib/linux/x86_64/libcust_opsproto_rt2.0.so +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/version.info +1 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/framework/npu_supported_ops.json +14 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_api/include/aclnn_decoder_kv_cache.h +59 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_api/include/aclnn_prompt_kv_cache.h +59 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_api/lib/libcust_opapi.so +0 -0
- mindspore/lib/plugin/ascend/{custom_ascendc_ops/op_impl/ai_core/tbe/custom_ascendc_ops_impl → custom_ascendc_910b/op_impl/ai_core/tbe/custom_ascendc_910b_impl}/dynamic/all_finite.py +51 -16
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/custom_ascendc_910b_impl/dynamic/decoder_kv_cache.cpp +192 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/custom_ascendc_910b_impl/dynamic/decoder_kv_cache.py +215 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/custom_ascendc_910b_impl/dynamic/prompt_kv_cache.cpp +274 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/custom_ascendc_910b_impl/dynamic/prompt_kv_cache.py +215 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/all_finite/AllFinite_52f59e2a65d9b1bb002de35c2819754a.json +80 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/all_finite/AllFinite_52f59e2a65d9b1bb002de35c2819754a.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/all_finite/AllFinite_6b5e50e30256d85838d6ce83514df20f.json +80 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/all_finite/AllFinite_6b5e50e30256d85838d6ce83514df20f.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/all_finite/AllFinite_74e4ac02880d452e3308c94af273562e.json +80 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/all_finite/AllFinite_74e4ac02880d452e3308c94af273562e.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/decoder_kv_cache/DecoderKvCache_0d5520cc587ad44ce634bf3fbcffc272.json +158 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/decoder_kv_cache/DecoderKvCache_0d5520cc587ad44ce634bf3fbcffc272.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/decoder_kv_cache/DecoderKvCache_20390d30b3c4c0d23167ccca6c030c2b.json +158 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/decoder_kv_cache/DecoderKvCache_20390d30b3c4c0d23167ccca6c030c2b.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/decoder_kv_cache/DecoderKvCache_2d151f0b1d2db51faa2968d5b67544e2.json +158 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/decoder_kv_cache/DecoderKvCache_2d151f0b1d2db51faa2968d5b67544e2.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/decoder_kv_cache/DecoderKvCache_561690ec17cc1def3d2fcf68c1b07b56.json +158 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/decoder_kv_cache/DecoderKvCache_561690ec17cc1def3d2fcf68c1b07b56.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/decoder_kv_cache/DecoderKvCache_570f9aaa99e5e773b3dd0a33784363f4.json +158 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/decoder_kv_cache/DecoderKvCache_570f9aaa99e5e773b3dd0a33784363f4.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/decoder_kv_cache/DecoderKvCache_59668a0f0764afb98fda8ab9e84126f1.json +158 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/decoder_kv_cache/DecoderKvCache_59668a0f0764afb98fda8ab9e84126f1.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/decoder_kv_cache/DecoderKvCache_91d9833e4792b70b670e4e2b916abd86.json +158 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/decoder_kv_cache/DecoderKvCache_91d9833e4792b70b670e4e2b916abd86.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/decoder_kv_cache/DecoderKvCache_c74cdc5fef094383401856f8519504af.json +158 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/decoder_kv_cache/DecoderKvCache_c74cdc5fef094383401856f8519504af.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/prompt_kv_cache/PromptKvCache_0515c7b1a4cd614449e38c5e9a7e3f8d.json +167 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/prompt_kv_cache/PromptKvCache_0515c7b1a4cd614449e38c5e9a7e3f8d.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/prompt_kv_cache/PromptKvCache_09f22d898d6358c91e7c4fc48bac48e7.json +167 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/prompt_kv_cache/PromptKvCache_09f22d898d6358c91e7c4fc48bac48e7.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/prompt_kv_cache/PromptKvCache_0cb9a6f894b925250227136e5aab7061.json +167 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/prompt_kv_cache/PromptKvCache_0cb9a6f894b925250227136e5aab7061.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/prompt_kv_cache/PromptKvCache_2fa8702ffd7ca85e9e194f62644415d5.json +167 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/prompt_kv_cache/PromptKvCache_2fa8702ffd7ca85e9e194f62644415d5.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/prompt_kv_cache/PromptKvCache_570b62f187dfd439b64613d881deedb7.json +167 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/prompt_kv_cache/PromptKvCache_570b62f187dfd439b64613d881deedb7.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/prompt_kv_cache/PromptKvCache_585218c11411ff84709b9e725b66c435.json +167 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/prompt_kv_cache/PromptKvCache_585218c11411ff84709b9e725b66c435.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/prompt_kv_cache/PromptKvCache_5c9365ccde170b358c5b126d69dae13e.json +167 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/prompt_kv_cache/PromptKvCache_5c9365ccde170b358c5b126d69dae13e.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/prompt_kv_cache/PromptKvCache_6d97c45b7c43bc16fcff8baa5dacac4e.json +167 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/prompt_kv_cache/PromptKvCache_6d97c45b7c43bc16fcff8baa5dacac4e.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/all_finite/AllFinite_52f59e2a65d9b1bb002de35c2819754a.json +78 -0
- mindspore/lib/plugin/ascend/{custom_ascendc_ops/op_impl/ai_core/tbe/kernel/ascend910b/all_finite/AllFinite_f55e0ebaad1f2f572e43677336992fa0.o → custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/all_finite/AllFinite_52f59e2a65d9b1bb002de35c2819754a.o} +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/all_finite/AllFinite_6b5e50e30256d85838d6ce83514df20f.json +78 -0
- mindspore/lib/plugin/ascend/{custom_ascendc_ops/op_impl/ai_core/tbe/kernel/ascend910b/all_finite/AllFinite_576ceaeef5870c451cab59af55ea46ad.o → custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/all_finite/AllFinite_6b5e50e30256d85838d6ce83514df20f.o} +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/all_finite/AllFinite_74e4ac02880d452e3308c94af273562e.json +78 -0
- mindspore/lib/plugin/ascend/{custom_ascendc_ops/op_impl/ai_core/tbe/kernel/ascend910b/all_finite/AllFinite_86a73ff6e28d734c96bb8d3054f7dd18.o → custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/all_finite/AllFinite_74e4ac02880d452e3308c94af273562e.o} +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/decoder_kv_cache/DecoderKvCache_0d5520cc587ad44ce634bf3fbcffc272.json +156 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/decoder_kv_cache/DecoderKvCache_0d5520cc587ad44ce634bf3fbcffc272.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/decoder_kv_cache/DecoderKvCache_20390d30b3c4c0d23167ccca6c030c2b.json +156 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/decoder_kv_cache/DecoderKvCache_20390d30b3c4c0d23167ccca6c030c2b.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/decoder_kv_cache/DecoderKvCache_2d151f0b1d2db51faa2968d5b67544e2.json +156 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/decoder_kv_cache/DecoderKvCache_2d151f0b1d2db51faa2968d5b67544e2.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/decoder_kv_cache/DecoderKvCache_561690ec17cc1def3d2fcf68c1b07b56.json +156 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/decoder_kv_cache/DecoderKvCache_561690ec17cc1def3d2fcf68c1b07b56.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/decoder_kv_cache/DecoderKvCache_570f9aaa99e5e773b3dd0a33784363f4.json +156 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/decoder_kv_cache/DecoderKvCache_570f9aaa99e5e773b3dd0a33784363f4.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/decoder_kv_cache/DecoderKvCache_59668a0f0764afb98fda8ab9e84126f1.json +156 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/decoder_kv_cache/DecoderKvCache_59668a0f0764afb98fda8ab9e84126f1.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/decoder_kv_cache/DecoderKvCache_91d9833e4792b70b670e4e2b916abd86.json +156 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/decoder_kv_cache/DecoderKvCache_91d9833e4792b70b670e4e2b916abd86.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/decoder_kv_cache/DecoderKvCache_c74cdc5fef094383401856f8519504af.json +156 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/decoder_kv_cache/DecoderKvCache_c74cdc5fef094383401856f8519504af.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/prompt_kv_cache/PromptKvCache_0515c7b1a4cd614449e38c5e9a7e3f8d.json +165 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/prompt_kv_cache/PromptKvCache_0515c7b1a4cd614449e38c5e9a7e3f8d.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/prompt_kv_cache/PromptKvCache_09f22d898d6358c91e7c4fc48bac48e7.json +165 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/prompt_kv_cache/PromptKvCache_09f22d898d6358c91e7c4fc48bac48e7.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/prompt_kv_cache/PromptKvCache_0cb9a6f894b925250227136e5aab7061.json +165 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/prompt_kv_cache/PromptKvCache_0cb9a6f894b925250227136e5aab7061.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/prompt_kv_cache/PromptKvCache_2fa8702ffd7ca85e9e194f62644415d5.json +165 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/prompt_kv_cache/PromptKvCache_2fa8702ffd7ca85e9e194f62644415d5.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/prompt_kv_cache/PromptKvCache_570b62f187dfd439b64613d881deedb7.json +165 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/prompt_kv_cache/PromptKvCache_570b62f187dfd439b64613d881deedb7.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/prompt_kv_cache/PromptKvCache_585218c11411ff84709b9e725b66c435.json +165 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/prompt_kv_cache/PromptKvCache_585218c11411ff84709b9e725b66c435.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/prompt_kv_cache/PromptKvCache_5c9365ccde170b358c5b126d69dae13e.json +165 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/prompt_kv_cache/PromptKvCache_5c9365ccde170b358c5b126d69dae13e.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/prompt_kv_cache/PromptKvCache_6d97c45b7c43bc16fcff8baa5dacac4e.json +165 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/prompt_kv_cache/PromptKvCache_6d97c45b7c43bc16fcff8baa5dacac4e.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/all_finite/AllFinite_52f59e2a65d9b1bb002de35c2819754a.json +78 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/all_finite/AllFinite_52f59e2a65d9b1bb002de35c2819754a.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/all_finite/AllFinite_6b5e50e30256d85838d6ce83514df20f.json +78 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/all_finite/AllFinite_6b5e50e30256d85838d6ce83514df20f.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/all_finite/AllFinite_74e4ac02880d452e3308c94af273562e.json +78 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/all_finite/AllFinite_74e4ac02880d452e3308c94af273562e.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/decoder_kv_cache/DecoderKvCache_0d5520cc587ad44ce634bf3fbcffc272.json +156 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/decoder_kv_cache/DecoderKvCache_0d5520cc587ad44ce634bf3fbcffc272.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/decoder_kv_cache/DecoderKvCache_20390d30b3c4c0d23167ccca6c030c2b.json +156 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/decoder_kv_cache/DecoderKvCache_20390d30b3c4c0d23167ccca6c030c2b.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/decoder_kv_cache/DecoderKvCache_2d151f0b1d2db51faa2968d5b67544e2.json +156 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/decoder_kv_cache/DecoderKvCache_2d151f0b1d2db51faa2968d5b67544e2.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/decoder_kv_cache/DecoderKvCache_561690ec17cc1def3d2fcf68c1b07b56.json +156 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/decoder_kv_cache/DecoderKvCache_561690ec17cc1def3d2fcf68c1b07b56.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/decoder_kv_cache/DecoderKvCache_570f9aaa99e5e773b3dd0a33784363f4.json +156 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/decoder_kv_cache/DecoderKvCache_570f9aaa99e5e773b3dd0a33784363f4.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/decoder_kv_cache/DecoderKvCache_59668a0f0764afb98fda8ab9e84126f1.json +156 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/decoder_kv_cache/DecoderKvCache_59668a0f0764afb98fda8ab9e84126f1.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/decoder_kv_cache/DecoderKvCache_91d9833e4792b70b670e4e2b916abd86.json +156 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/decoder_kv_cache/DecoderKvCache_91d9833e4792b70b670e4e2b916abd86.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/decoder_kv_cache/DecoderKvCache_c74cdc5fef094383401856f8519504af.json +156 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/decoder_kv_cache/DecoderKvCache_c74cdc5fef094383401856f8519504af.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/prompt_kv_cache/PromptKvCache_0515c7b1a4cd614449e38c5e9a7e3f8d.json +165 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/prompt_kv_cache/PromptKvCache_0515c7b1a4cd614449e38c5e9a7e3f8d.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/prompt_kv_cache/PromptKvCache_09f22d898d6358c91e7c4fc48bac48e7.json +165 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/prompt_kv_cache/PromptKvCache_09f22d898d6358c91e7c4fc48bac48e7.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/prompt_kv_cache/PromptKvCache_0cb9a6f894b925250227136e5aab7061.json +165 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/prompt_kv_cache/PromptKvCache_0cb9a6f894b925250227136e5aab7061.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/prompt_kv_cache/PromptKvCache_2fa8702ffd7ca85e9e194f62644415d5.json +165 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/prompt_kv_cache/PromptKvCache_2fa8702ffd7ca85e9e194f62644415d5.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/prompt_kv_cache/PromptKvCache_570b62f187dfd439b64613d881deedb7.json +165 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/prompt_kv_cache/PromptKvCache_570b62f187dfd439b64613d881deedb7.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/prompt_kv_cache/PromptKvCache_585218c11411ff84709b9e725b66c435.json +165 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/prompt_kv_cache/PromptKvCache_585218c11411ff84709b9e725b66c435.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/prompt_kv_cache/PromptKvCache_5c9365ccde170b358c5b126d69dae13e.json +165 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/prompt_kv_cache/PromptKvCache_5c9365ccde170b358c5b126d69dae13e.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/prompt_kv_cache/PromptKvCache_6d97c45b7c43bc16fcff8baa5dacac4e.json +165 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/prompt_kv_cache/PromptKvCache_6d97c45b7c43bc16fcff8baa5dacac4e.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/config/ascend310p/all_finite.json +139 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/config/ascend310p/binary_info_config.json +361 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/config/ascend310p/decoder_kv_cache.json +892 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/config/ascend310p/prompt_kv_cache.json +892 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/config/ascend910_93/all_finite.json +139 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/config/ascend910_93/binary_info_config.json +361 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/config/ascend910_93/decoder_kv_cache.json +892 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/config/ascend910_93/prompt_kv_cache.json +892 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/config/ascend910b/all_finite.json +139 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/config/ascend910b/binary_info_config.json +361 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/config/ascend910b/decoder_kv_cache.json +892 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/config/ascend910b/prompt_kv_cache.json +892 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64/libcust_opmaster_rt2.0.so +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/op_tiling/liboptiling.so +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_proto/lib/linux/x86_64/libcust_opsproto_rt2.0.so +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/version.info +1 -0
- mindspore/lib/plugin/ascend/custom_compiler/setup.py +1 -1
- mindspore/lib/plugin/ascend/libascend_collective.so +0 -0
- mindspore/lib/plugin/ascend/libdvpp_utils.so +0 -0
- mindspore/lib/plugin/ascend/libhccl_plugin.so +0 -0
- mindspore/lib/plugin/ascend/liblowlatency_collective.so +0 -0
- mindspore/lib/plugin/ascend/libmindspore_cpu_kernels.so +0 -0
- mindspore/lib/plugin/ascend/libmindspore_internal_kernels.so +0 -0
- mindspore/lib/plugin/ascend/libms_ascend_native_boost.so +0 -0
- mindspore/lib/plugin/ascend/libms_atb_boost.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/asdops/bin/PkgInspect +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/asdops/bin/op_man +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/asdops/device/ascend910b/bin/ascend910b.bin +960 -958
- mindspore/lib/plugin/ascend/ms_kernels_internal/asdops/host/libasdops_cann_host.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/asdops/host/libasdops_host.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/asdops/lib/libasdops.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/asdops/lib/libasdops_static.a +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/{acme/include/base_type.h → base_type.h} +25 -20
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/{cast/cast_tiling.h → internal.h} +6 -4
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/internal_op.h +114 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/llm/boost_kernel.h +70 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/llm/llama_impl.h +85 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/llm/model_interface.h +52 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/llm/tensor.h +81 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/op_creator.h +123 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/op_param.h +155 -110
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/{acme/include/tiling_info.h → tiling_info.h} +12 -9
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/tiling_utils.h +178 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libadd_layer_norm_op.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libadd_rms_norm_op.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libadd_rms_norm_quant_op.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libapply_rotary_pos_emb_310p_op.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libapply_rotary_pos_emb_op.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libcast_op.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libcompare_op.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libgelu_op.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libllama_op.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libmatmul_op.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libms_kernels_internal.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libms_optiling.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libmulti_weight_matmul_kernel_op.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libreshape_and_cache_nz_op.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libreshape_and_cache_op.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/librms_norm_op.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend310p/object_kernels/internal_pp_matmul_f16_nz/internal_pp_matmul_f16_nz.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend310p/object_kernels/internal_pp_matmul_f16_nz/internal_pp_matmul_f16_nz_0.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend310p/object_kernels/internal_pp_matmul_i8_nz_compress/internal_pp_matmul_i8_nz_compress.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend310p/object_kernels/internal_pp_matmul_i8_nz_compress/internal_pp_matmul_i8_nz_compress_0.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend310p/object_kernels/internal_pp_matmul_int8_nz/internal_pp_matmul_int8_nz.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend310p/object_kernels/internal_pp_matmul_int8_nz/internal_pp_matmul_int8_nz_0.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend310p/so_kernels/libadd_rms_norm_quant_ascend310p.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/{lib/libapply_rotary_pos_emb_310p_impl.so → op_kernels/ascend310p/so_kernels/libapply_rotary_pos_emb_310p_ascend310p.so} +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend310p/so_kernels/libcast_ascend310p.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend310p/so_kernels/libcompare_ascend310p.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend310p/so_kernels/libgelu_ascend310p.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend310p/so_kernels/libmatmul_ascend310p.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend310p/so_kernels/libreshape_and_cache_nz_ascend310p.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/hphol_kernels/add_rms_norm_dynamic_quant/AddRmsNormDynamicQuant_4b60f88cdc28b25a36bad2d8b0a88092.json +163 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/hphol_kernels/add_rms_norm_dynamic_quant/AddRmsNormDynamicQuant_4b60f88cdc28b25a36bad2d8b0a88092.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/hphol_kernels/add_rms_norm_dynamic_quant/AddRmsNormDynamicQuant_cde61da2bd6fededcb1ba310a6ad16ee.json +163 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/hphol_kernels/add_rms_norm_dynamic_quant/AddRmsNormDynamicQuant_cde61da2bd6fededcb1ba310a6ad16ee.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/object_kernels/flash_attention_score/flash_attention_score_bf16_bnsd_full_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/object_kernels/flash_attention_score/flash_attention_score_bf16_bnsd_tri_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/object_kernels/flash_attention_score/flash_attention_score_bf16_bsh_full_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/object_kernels/flash_attention_score/flash_attention_score_bf16_bsh_tri_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/object_kernels/flash_attention_score/flash_attention_score_fp16_bnsd_full_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/object_kernels/flash_attention_score/flash_attention_score_fp16_bnsd_tri_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/object_kernels/flash_attention_score/flash_attention_score_fp16_bsh_full_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/object_kernels/flash_attention_score/flash_attention_score_fp16_bsh_tri_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/object_kernels/internal_matmul_postfusion_mix/internal_matmul_postfusion_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/object_kernels/internal_matmul_postfusion_mix/internal_matmul_postfusion_mix_mix_aic_0.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/object_kernels/internal_matmul_postfusion_mix/internal_matmul_postfusion_mix_mix_aiv_0.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/object_kernels/internal_multi_weight_matmul_postfusion_mix/internal_multi_weight_matmul_postfusion_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/object_kernels/internal_multi_weight_matmul_postfusion_mix/internal_multi_weight_matmul_postfusion_mix_mix_aic_0.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/object_kernels/internal_multi_weight_matmul_postfusion_mix/internal_multi_weight_matmul_postfusion_mix_mix_aiv_0.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/object_kernels/matmul_add_rmsnorm/matmul_add_rmsnorm_bf16_bf16.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/object_kernels/matmul_add_rmsnorm/matmul_add_rmsnorm_bf16_fp16.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/object_kernels/matmul_add_rmsnorm/matmul_add_rmsnorm_bf16_fp32.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/object_kernels/matmul_add_rmsnorm/matmul_add_rmsnorm_fp16_bf16.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/object_kernels/matmul_add_rmsnorm/matmul_add_rmsnorm_fp16_fp16.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/object_kernels/matmul_add_rmsnorm/matmul_add_rmsnorm_fp16_fp32.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/object_kernels/paged_attention_v2/paged_attention_v2.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/object_kernels/paged_attention_v2/paged_attention_v2_mix_aic_0.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/object_kernels/paged_attention_v2/paged_attention_v2_mix_aiv_0.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/so_kernels/libadd_layer_norm_ascend910b.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/{lib/libadd_rms_norm_impl.so → op_kernels/ascend910b/so_kernels/libadd_rms_norm_ascend910b.so} +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/so_kernels/libadd_rms_norm_quant_ascend910b.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/{lib/libapply_rotary_pos_emb_impl.so → op_kernels/ascend910b/so_kernels/libapply_rotary_pos_emb_ascend910b.so} +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/{lib/libcast_impl.so → op_kernels/ascend910b/so_kernels/libcast_ascend910b.so} +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/{lib/libnot_equal_impl.so → op_kernels/ascend910b/so_kernels/libcompare_ascend910b.so} +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/{lib/libgelu_impl.so → op_kernels/ascend910b/so_kernels/libgelu_ascend910b.so} +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/so_kernels/libllama_ascend910b.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/{lib/libmatmul_impl.so → op_kernels/ascend910b/so_kernels/libmatmul_ascend910b.so} +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/{lib/libmulti_weight_matmul_kernel_impl.so → op_kernels/ascend910b/so_kernels/libmulti_weight_matmul_kernel_ascend910b.so} +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/{lib/libreshape_and_cache_impl.so → op_kernels/ascend910b/so_kernels/libreshape_and_cache_ascend910b.so} +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/{lib/librms_norm_impl.so → op_kernels/ascend910b/so_kernels/librms_norm_ascend910b.so} +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/lccl/lib/liblccl_wrapper.so +0 -0
- mindspore/lib/plugin/gpu/libcuda_ops.so.10 +0 -0
- mindspore/lib/plugin/gpu/libcuda_ops.so.11 +0 -0
- mindspore/lib/plugin/gpu10.1/libnccl.so.2 +0 -0
- mindspore/lib/plugin/gpu10.1/libnvidia_collective.so +0 -0
- mindspore/lib/plugin/gpu11.1/libnccl.so.2 +0 -0
- mindspore/lib/plugin/gpu11.1/libnvidia_collective.so +0 -0
- mindspore/lib/plugin/gpu11.6/libnccl.so.2 +0 -0
- mindspore/lib/plugin/gpu11.6/libnvidia_collective.so +0 -0
- mindspore/lib/plugin/libmindspore_ascend.so.2 +0 -0
- mindspore/lib/plugin/libmindspore_gpu.so.10.1 +0 -0
- mindspore/lib/plugin/libmindspore_gpu.so.11.1 +0 -0
- mindspore/lib/plugin/libmindspore_gpu.so.11.6 +0 -0
- mindspore/log.py +12 -0
- mindspore/mindrecord/__init__.py +1 -1
- mindspore/mindrecord/config.py +17 -316
- mindspore/mindrecord/filereader.py +1 -9
- mindspore/mindrecord/filewriter.py +5 -15
- mindspore/mindrecord/mindpage.py +1 -9
- mindspore/mint/__init__.py +824 -218
- mindspore/mint/distributed/__init__.py +66 -4
- mindspore/mint/distributed/distributed.py +2594 -44
- mindspore/mint/linalg/__init__.py +6 -0
- mindspore/mint/nn/__init__.py +473 -14
- mindspore/mint/nn/functional.py +486 -11
- mindspore/mint/nn/layer/__init__.py +17 -4
- mindspore/mint/nn/layer/_functions.py +330 -0
- mindspore/mint/nn/layer/activation.py +169 -1
- mindspore/mint/nn/layer/basic.py +123 -0
- mindspore/mint/nn/layer/conv.py +727 -0
- mindspore/mint/nn/layer/normalization.py +215 -19
- mindspore/mint/nn/layer/padding.py +797 -0
- mindspore/mint/nn/layer/pooling.py +170 -0
- mindspore/mint/optim/__init__.py +2 -1
- mindspore/mint/optim/adam.py +223 -0
- mindspore/mint/optim/adamw.py +26 -19
- mindspore/mint/special/__init__.py +2 -1
- mindspore/multiprocessing/__init__.py +5 -0
- mindspore/nn/__init__.py +2 -0
- mindspore/nn/cell.py +142 -21
- mindspore/nn/dynamic_lr.py +2 -1
- mindspore/nn/layer/activation.py +6 -6
- mindspore/nn/layer/basic.py +35 -25
- mindspore/nn/layer/channel_shuffle.py +3 -3
- mindspore/nn/layer/conv.py +3 -0
- mindspore/nn/layer/embedding.py +3 -3
- mindspore/nn/layer/normalization.py +8 -7
- mindspore/nn/layer/padding.py +4 -3
- mindspore/nn/layer/pooling.py +55 -23
- mindspore/nn/layer/rnn_cells.py +1 -1
- mindspore/nn/layer/rnns.py +2 -1
- mindspore/nn/layer/timedistributed.py +5 -5
- mindspore/nn/layer/transformer.py +48 -26
- mindspore/nn/learning_rate_schedule.py +5 -3
- mindspore/nn/loss/loss.py +31 -36
- mindspore/nn/optim/ada_grad.py +1 -0
- mindspore/nn/optim/adadelta.py +2 -2
- mindspore/nn/optim/adam.py +1 -1
- mindspore/nn/optim/lars.py +1 -4
- mindspore/nn/optim/optimizer.py +1 -1
- mindspore/nn/optim/rprop.py +2 -2
- mindspore/nn/optim/thor.py +2 -1
- mindspore/nn/utils/__init__.py +22 -0
- mindspore/nn/utils/init.py +73 -0
- mindspore/nn/wrap/cell_wrapper.py +4 -6
- mindspore/nn/wrap/loss_scale.py +3 -4
- mindspore/numpy/array_creations.py +60 -62
- mindspore/numpy/array_ops.py +148 -143
- mindspore/numpy/logic_ops.py +41 -42
- mindspore/numpy/math_ops.py +361 -359
- mindspore/numpy/utils.py +16 -16
- mindspore/numpy/utils_const.py +4 -4
- mindspore/ops/__init__.py +2 -1
- mindspore/ops/_grad_experimental/grad_comm_ops.py +107 -8
- mindspore/ops/_grad_experimental/grad_debug_ops.py +6 -1
- mindspore/ops/_grad_experimental/grad_inner_ops.py +9 -0
- mindspore/ops/_grad_experimental/grad_math_ops.py +2 -1
- mindspore/ops/_op_impl/cpu/__init__.py +1 -0
- mindspore/ops/_op_impl/cpu/raise_op.py +28 -0
- mindspore/ops/_vmap/vmap_array_ops.py +20 -19
- mindspore/ops/_vmap/vmap_base.py +0 -2
- mindspore/ops/_vmap/vmap_grad_nn_ops.py +19 -13
- mindspore/ops/_vmap/vmap_math_ops.py +11 -9
- mindspore/ops/_vmap/vmap_nn_ops.py +20 -34
- mindspore/ops/auto_generate/cpp_create_prim_instance_helper.py +149 -12
- mindspore/ops/auto_generate/gen_arg_handler.py +0 -61
- mindspore/ops/auto_generate/gen_extend_func.py +554 -60
- mindspore/ops/auto_generate/gen_ops_def.py +1621 -115
- mindspore/ops/auto_generate/gen_ops_prim.py +8027 -3411
- mindspore/ops/auto_generate/pyboost_inner_prim.py +183 -79
- mindspore/ops/composite/base.py +1 -1
- mindspore/ops/composite/multitype_ops/_compile_utils.py +229 -30
- mindspore/ops/composite/multitype_ops/pow_impl.py +0 -29
- mindspore/ops/function/__init__.py +12 -0
- mindspore/ops/function/array_func.py +561 -159
- mindspore/ops/function/clip_func.py +64 -0
- mindspore/ops/function/debug_func.py +28 -20
- mindspore/ops/function/image_func.py +1 -1
- mindspore/ops/function/linalg_func.py +5 -4
- mindspore/ops/function/math_func.py +1664 -294
- mindspore/ops/function/nn_func.py +988 -317
- mindspore/ops/function/parameter_func.py +3 -56
- mindspore/ops/function/random_func.py +243 -33
- mindspore/ops/function/sparse_unary_func.py +1 -1
- mindspore/ops/functional.py +18 -5
- mindspore/ops/functional_overload.py +897 -0
- mindspore/ops/operations/__init__.py +3 -2
- mindspore/ops/operations/_embedding_cache_ops.py +4 -4
- mindspore/ops/operations/_grad_ops.py +2 -34
- mindspore/ops/operations/_infer_ops.py +2 -1
- mindspore/ops/operations/_inner_ops.py +38 -8
- mindspore/ops/operations/array_ops.py +45 -303
- mindspore/ops/operations/comm_ops.py +23 -17
- mindspore/ops/operations/custom_ops.py +7 -49
- mindspore/ops/operations/debug_ops.py +42 -47
- mindspore/ops/operations/inner_ops.py +6 -4
- mindspore/ops/operations/linalg_ops.py +3 -2
- mindspore/ops/operations/manually_defined/ops_def.py +185 -104
- mindspore/ops/operations/math_ops.py +11 -216
- mindspore/ops/operations/nn_ops.py +153 -310
- mindspore/ops/primitive.py +23 -21
- mindspore/ops/tensor_method.py +1669 -0
- mindspore/ops_generate/aclnn_kernel_register_auto_cc_generator.py +110 -0
- mindspore/ops_generate/add_tensor_docs_generator.py +54 -0
- mindspore/ops_generate/arg_handler.py +0 -61
- mindspore/ops_generate/auto_grad_impl_cc_generator.py +135 -0
- mindspore/ops_generate/auto_grad_reg_cc_generator.py +93 -0
- mindspore/ops_generate/base_generator.py +11 -0
- mindspore/ops_generate/cpp_create_prim_instance_helper_generator.py +108 -0
- mindspore/ops_generate/functional_map_cpp_generator.py +491 -0
- mindspore/ops_generate/functional_overload_py_generator.py +110 -0
- mindspore/ops_generate/functions_cc_generator.py +233 -0
- mindspore/ops_generate/gen_aclnn_implement.py +110 -114
- mindspore/ops_generate/gen_constants.py +157 -3
- mindspore/ops_generate/gen_ops.py +245 -990
- mindspore/ops_generate/gen_pyboost_func.py +97 -998
- mindspore/ops_generate/gen_utils.py +119 -33
- mindspore/ops_generate/lite_ops_cpp_generator.py +155 -0
- mindspore/ops_generate/op_api_proto.py +206 -0
- mindspore/ops_generate/op_def_py_generator.py +131 -0
- mindspore/ops_generate/op_prim_py_generator.py +480 -0
- mindspore/ops_generate/op_proto.py +373 -108
- mindspore/ops_generate/op_template_parser.py +436 -0
- mindspore/ops_generate/ops_def_cc_generator.py +288 -0
- mindspore/ops_generate/ops_def_h_generator.py +74 -0
- mindspore/ops_generate/ops_name_h_generator.py +68 -0
- mindspore/ops_generate/ops_primitive_h_generator.py +81 -0
- mindspore/ops_generate/pyboost_functions_cpp_generator.py +370 -0
- mindspore/ops_generate/pyboost_functions_h_generator.py +68 -0
- mindspore/ops_generate/pyboost_functions_py_generator.py +148 -0
- mindspore/ops_generate/pyboost_grad_function_cpp_generator.py +154 -0
- mindspore/ops_generate/pyboost_inner_prim_generator.py +131 -0
- mindspore/ops_generate/pyboost_native_grad_functions_generator.py +268 -0
- mindspore/ops_generate/pyboost_op_cpp_code_generator.py +851 -0
- mindspore/ops_generate/pyboost_overload_functions_cpp_generator.py +344 -0
- mindspore/ops_generate/pyboost_utils.py +92 -33
- mindspore/ops_generate/template.py +294 -44
- mindspore/ops_generate/tensor_func_reg_cpp_generator.py +422 -0
- mindspore/parallel/__init__.py +3 -3
- mindspore/parallel/_auto_parallel_context.py +44 -34
- mindspore/parallel/_cell_wrapper.py +22 -3
- mindspore/parallel/_parallel_serialization.py +13 -2
- mindspore/parallel/_utils.py +4 -2
- mindspore/parallel/algo_parameter_config.py +1 -1
- mindspore/parallel/checkpoint_transform.py +44 -0
- mindspore/parallel/cluster/process_entity/_api.py +131 -37
- mindspore/parallel/cluster/process_entity/_utils.py +41 -6
- mindspore/parallel/cluster/run.py +20 -3
- mindspore/parallel/parameter_broadcast.py +1 -1
- mindspore/parallel/shard.py +3 -0
- mindspore/parallel/transform_safetensors.py +119 -253
- mindspore/profiler/__init__.py +17 -4
- mindspore/profiler/analysis/__init__.py +0 -0
- mindspore/profiler/analysis/parser/__init__.py +0 -0
- mindspore/profiler/analysis/parser/ascend_cann_parser.py +166 -0
- mindspore/profiler/analysis/parser/base_parser.py +158 -0
- mindspore/profiler/analysis/parser/framework_cann_relation_parser.py +45 -0
- mindspore/profiler/analysis/parser/ms_framework_parser.py +142 -0
- mindspore/profiler/analysis/parser/ms_minddata_parser.py +145 -0
- mindspore/profiler/analysis/parser/timeline_assembly_factory/__init__.py +0 -0
- mindspore/profiler/analysis/parser/timeline_assembly_factory/ascend_timeline_assembler.py +261 -0
- mindspore/profiler/analysis/parser/timeline_assembly_factory/base_timeline_assembler.py +40 -0
- mindspore/profiler/analysis/parser/timeline_assembly_factory/trace_view_container.py +84 -0
- mindspore/profiler/analysis/parser/timeline_creator/__init__.py +0 -0
- mindspore/profiler/analysis/parser/timeline_creator/base_timeline_creator.py +44 -0
- mindspore/profiler/analysis/parser/timeline_creator/cpu_op_timeline_creator.py +90 -0
- mindspore/profiler/analysis/parser/timeline_creator/fwk_timeline_creator.py +76 -0
- mindspore/profiler/analysis/parser/timeline_creator/msprof_timeline_creator.py +103 -0
- mindspore/profiler/analysis/parser/timeline_creator/scope_layer_timeline_creator.py +134 -0
- mindspore/profiler/analysis/parser/timeline_event/__init__.py +0 -0
- mindspore/profiler/analysis/parser/timeline_event/base_event.py +233 -0
- mindspore/profiler/analysis/parser/timeline_event/cpu_op_event.py +47 -0
- mindspore/profiler/analysis/parser/timeline_event/flow_event.py +36 -0
- mindspore/profiler/analysis/parser/timeline_event/fwk_event.py +260 -0
- mindspore/profiler/analysis/parser/timeline_event/msprof_event.py +73 -0
- mindspore/profiler/analysis/parser/timeline_event/scope_layer_event.py +53 -0
- mindspore/profiler/analysis/parser/timeline_event/timeline_event_pool.py +146 -0
- mindspore/profiler/analysis/task_manager.py +131 -0
- mindspore/profiler/analysis/time_converter.py +84 -0
- mindspore/profiler/analysis/viewer/__init__.py +0 -0
- mindspore/profiler/analysis/viewer/ascend_communication_viewer.py +333 -0
- mindspore/profiler/analysis/viewer/ascend_integrate_viewer.py +87 -0
- mindspore/profiler/analysis/viewer/ascend_kernel_details_viewer.py +252 -0
- mindspore/profiler/analysis/viewer/ascend_memory_viewer.py +313 -0
- mindspore/profiler/analysis/viewer/ascend_op_memory_viewer.py +322 -0
- mindspore/profiler/analysis/viewer/ascend_step_trace_time_viewer.py +265 -0
- mindspore/profiler/analysis/viewer/ascend_timeline_viewer.py +58 -0
- mindspore/profiler/analysis/viewer/base_viewer.py +26 -0
- mindspore/profiler/analysis/viewer/ms_dataset_viewer.py +97 -0
- mindspore/profiler/analysis/viewer/ms_minddata_viewer.py +581 -0
- mindspore/profiler/analysis/work_flow.py +73 -0
- mindspore/profiler/common/ascend_msprof_exporter.py +138 -0
- mindspore/profiler/common/command_executor.py +90 -0
- mindspore/profiler/common/constant.py +174 -3
- mindspore/profiler/common/file_manager.py +208 -0
- mindspore/profiler/common/log.py +130 -0
- mindspore/profiler/common/msprof_cmd_tool.py +202 -0
- mindspore/profiler/common/path_manager.py +371 -0
- mindspore/profiler/common/process_bar.py +168 -0
- mindspore/profiler/common/process_pool.py +9 -3
- mindspore/profiler/common/profiler_context.py +476 -0
- mindspore/profiler/common/profiler_info.py +304 -0
- mindspore/profiler/common/profiler_output_path.py +284 -0
- mindspore/profiler/common/profiler_parameters.py +210 -0
- mindspore/profiler/common/profiler_path_manager.py +120 -0
- mindspore/profiler/common/record_function.py +76 -0
- mindspore/profiler/common/tlv_decoder.py +76 -0
- mindspore/profiler/common/util.py +75 -2
- mindspore/profiler/dynamic_profiler.py +270 -37
- mindspore/profiler/envprofiler.py +138 -0
- mindspore/profiler/mstx.py +199 -0
- mindspore/profiler/platform/__init__.py +21 -0
- mindspore/profiler/platform/base_profiler.py +40 -0
- mindspore/profiler/platform/cpu_profiler.py +124 -0
- mindspore/profiler/platform/gpu_profiler.py +74 -0
- mindspore/profiler/platform/npu_profiler.py +309 -0
- mindspore/profiler/profiler.py +580 -93
- mindspore/profiler/profiler_action_controller.py +187 -0
- mindspore/profiler/profiler_interface.py +114 -0
- mindspore/profiler/schedule.py +208 -0
- mindspore/rewrite/api/symbol_tree.py +1 -2
- mindspore/run_check/_check_version.py +18 -13
- mindspore/runtime/__init__.py +37 -0
- mindspore/runtime/device.py +27 -0
- mindspore/runtime/event.py +209 -0
- mindspore/runtime/executor.py +148 -0
- mindspore/runtime/memory.py +392 -0
- mindspore/runtime/stream.py +460 -0
- mindspore/runtime/thread_bind_core.py +401 -0
- mindspore/train/__init__.py +2 -2
- mindspore/train/_utils.py +53 -18
- mindspore/train/amp.py +8 -4
- mindspore/train/callback/_checkpoint.py +32 -18
- mindspore/train/callback/_early_stop.py +1 -1
- mindspore/train/callback/_flops_collector.py +105 -69
- mindspore/train/callback/_history.py +1 -1
- mindspore/train/callback/_summary_collector.py +44 -6
- mindspore/train/callback/_tft_register.py +37 -15
- mindspore/train/dataset_helper.py +11 -11
- mindspore/train/metrics/precision.py +4 -5
- mindspore/train/mind_ir_pb2.py +167 -46
- mindspore/train/model.py +13 -14
- mindspore/train/serialization.py +461 -72
- mindspore/train/summary/summary_record.py +1 -2
- mindspore/train/train_thor/model_thor.py +1 -1
- mindspore/utils/__init__.py +4 -2
- mindspore/utils/bin/dataset-cache +0 -0
- mindspore/utils/bin/dataset-cache-server +0 -0
- mindspore/utils/dryrun.py +138 -0
- mindspore/utils/runtime_execution_order_check.py +550 -0
- mindspore/version.py +1 -1
- {mindspore-2.4.1.dist-info → mindspore-2.5.0.dist-info}/METADATA +3 -4
- {mindspore-2.4.1.dist-info → mindspore-2.5.0.dist-info}/RECORD +683 -490
- {mindspore-2.4.1.dist-info → mindspore-2.5.0.dist-info}/entry_points.txt +1 -1
- mindspore/_data_dump.cpython-310-x86_64-linux-gnu.so +0 -0
- mindspore/bin/cache_admin +0 -0
- mindspore/bin/cache_server +0 -0
- mindspore/common/_tensor_overload.py +0 -139
- mindspore/lib/libmindspore_np_dtype.so +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_ops/op_api/lib/libcust_opapi.so +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_ops/op_impl/ai_core/tbe/kernel/ascend910b/all_finite/AllFinite_576ceaeef5870c451cab59af55ea46ad.json +0 -58
- mindspore/lib/plugin/ascend/custom_ascendc_ops/op_impl/ai_core/tbe/kernel/ascend910b/all_finite/AllFinite_86a73ff6e28d734c96bb8d3054f7dd18.json +0 -58
- mindspore/lib/plugin/ascend/custom_ascendc_ops/op_impl/ai_core/tbe/kernel/ascend910b/all_finite/AllFinite_f55e0ebaad1f2f572e43677336992fa0.json +0 -58
- mindspore/lib/plugin/ascend/custom_ascendc_ops/op_impl/ai_core/tbe/kernel/config/ascend910b/all_finite.json +0 -109
- mindspore/lib/plugin/ascend/custom_ascendc_ops/op_impl/ai_core/tbe/kernel/config/ascend910b/binary_info_config.json +0 -38
- mindspore/lib/plugin/ascend/custom_ascendc_ops/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64/libcust_opmaster_rt2.0.so +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_ops/op_impl/ai_core/tbe/op_tiling/liboptiling.so +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_ops/op_proto/lib/linux/x86_64/libcust_opsproto_rt2.0.so +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_ops/version.info +0 -1
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/include/acme.h +0 -24
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/include/acme_op.h +0 -82
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/include/op_creator.h +0 -113
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/include/op_param.h +0 -193
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/core/dtype_registry.h +0 -90
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/core/kernel_register.h +0 -46
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/core/platform/platform_configs.h +0 -89
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/core/platform/rt_funcs.h +0 -135
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/ops/host_src/add_layer_norm_op.h +0 -60
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/ops/host_src/add_rms_norm_op.h +0 -50
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/ops/host_src/add_rms_norm_quant_op.h +0 -50
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/ops/host_src/apply_rotary_pos_emb_nz_op.h +0 -42
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/ops/host_src/apply_rotary_pos_emb_op.h +0 -55
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/ops/host_src/asd_elewise_op.h +0 -34
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/ops/host_src/asd_only_ops.h +0 -94
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/ops/host_src/asd_op_base.h +0 -97
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/ops/host_src/cast_op.h +0 -52
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/ops/host_src/flash_attention_score_op.h +0 -92
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/ops/host_src/gelu_op.h +0 -44
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/ops/host_src/matmul_add_rmsnorm_op.h +0 -73
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/ops/host_src/matmul_op.h +0 -108
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/ops/host_src/multi_impls_op.h +0 -64
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/ops/host_src/multi_weight_matmul_op.h +0 -91
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/ops/host_src/paged_attention_op.h +0 -99
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/ops/host_src/reshape_and_cache_nz_op.h +0 -44
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/ops/host_src/reshape_and_cache_op.h +0 -44
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/ops/host_src/rms_norm_op.h +0 -64
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/utils/asd_utils.h +0 -179
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/utils/comm_utils.h +0 -69
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/utils/profiling_util.h +0 -366
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/add/add_impl.h +0 -56
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/add/kernel/add.h +0 -21
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/add/tiling/add_tiling.h +0 -43
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/apply_rotary_pos_emb/apply_rotary_pos_emb_impl.h +0 -46
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/apply_rotary_pos_emb/kernel/apply_rotary_pos_emb.h +0 -23
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/apply_rotary_pos_emb/kernel/apply_rotary_pos_emb_base.h +0 -456
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/apply_rotary_pos_emb/kernel/apply_rotary_pos_emb_bf16.h +0 -217
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/apply_rotary_pos_emb/kernel/apply_rotary_pos_emb_fp.h +0 -391
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/apply_rotary_pos_emb/kernel/apply_rotary_pos_emb_fp16.h +0 -126
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/apply_rotary_pos_emb/kernel/apply_rotary_pos_emb_fp32.h +0 -230
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/apply_rotary_pos_emb/kernel/apply_rotary_pos_emb_tiling.h +0 -43
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/apply_rotary_pos_emb/kernel/apply_rotary_pos_emb_value.h +0 -27
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/apply_rotary_pos_emb_nz/apply_rotary_pos_emb_nz_impl.h +0 -34
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/apply_rotary_pos_emb_nz/kernel/apply_rotary_pos_emb_nz.h +0 -23
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/apply_rotary_pos_emb_nz/kernel/apply_rotary_pos_emb_nz_base.h +0 -460
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/apply_rotary_pos_emb_nz/kernel/apply_rotary_pos_emb_nz_fp16.h +0 -116
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/apply_rotary_pos_emb_nz/kernel/apply_rotary_pos_emb_nz_fp32.h +0 -230
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/apply_rotary_pos_emb_nz/kernel/apply_rotary_pos_emb_nz_tiling.h +0 -43
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/apply_rotary_pos_emb_nz/kernel/apply_rotary_pos_emb_nz_value.h +0 -27
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/asdop/asd_op_impl.h +0 -74
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/backend_param.h +0 -74
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/cast/cast_impl.h +0 -48
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/cast/kernel/cast_kernel.h +0 -21
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/compare/compare_impl.h +0 -55
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/compare/compare_tiling.h +0 -27
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/compare/kernel/compare_kernel.h +0 -23
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_binary/and_impl.h +0 -29
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_binary/div_impl.h +0 -29
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_binary/elewise_binary_impl.h +0 -48
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_binary/elewise_binary_tiling.h +0 -25
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_binary/kernel/and_kernel.h +0 -46
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_binary/kernel/div_kernel.h +0 -46
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_binary/kernel/elewise_binary_base.h +0 -260
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_binary/kernel/elewise_binary_kernel.h +0 -35
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_binary/kernel/max_kernel.h +0 -66
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_binary/kernel/min_kernel.h +0 -66
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_binary/kernel/mul_kernel.h +0 -66
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_binary/kernel/or_kernel.h +0 -46
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_binary/max_impl.h +0 -29
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_binary/min_impl.h +0 -29
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_binary/mul_impl.h +0 -29
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_binary/or_impl.h +0 -29
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_unary/abs_impl.h +0 -29
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_unary/elewise_unary_impl.h +0 -47
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_unary/elewise_unary_tiling.h +0 -24
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_unary/exp_impl.h +0 -29
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_unary/kernel/abs_kernel.h +0 -45
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_unary/kernel/elewise_unary_base.h +0 -148
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_unary/kernel/elewise_unary_kernel.h +0 -31
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_unary/kernel/exp_kernel.h +0 -45
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_unary/kernel/ln_kernel.h +0 -45
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_unary/kernel/not_kernel.h +0 -45
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_unary/kernel/reciprocal_kernel.h +0 -45
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_unary/kernel/relu_kernel.h +0 -55
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_unary/kernel/rsqrt_kernel.h +0 -45
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_unary/kernel/sqrt_kernel.h +0 -45
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_unary/ln_impl.h +0 -29
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_unary/not_impl.h +0 -29
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_unary/reciprocal_impl.h +0 -29
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_unary/relu_impl.h +0 -29
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_unary/rsqrt_impl.h +0 -29
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_unary/sqrt_impl.h +0 -29
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/flash_attention_score/flash_attention_score_impl.h +0 -68
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/internal_kernel.h +0 -99
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/internal_rtbackend.h +0 -21
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/lccl/lccl_wrapper.h +0 -58
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/ms_int_types.h +0 -91
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/ms_int_utils.h +0 -108
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/paged_attention/paged_attention_impl.h +0 -64
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/param/add_param.h +0 -68
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/param/attention_param.h +0 -40
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/param/cast_param.h +0 -30
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/param/compare_param.h +0 -31
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/param/elewise_param.h +0 -41
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/param/grouped_matmul_param.h +0 -40
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/param/matmul_ext_param.h +0 -38
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/param/matmul_qkv_param.h +0 -42
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/param/sub_param.h +0 -33
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/profiling_util.h +0 -377
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/reshape_and_cache_nz/kernel/reshape_and_cache_nz.h +0 -24
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/reshape_and_cache_nz/reshape_and_cache_nz_impl.h +0 -42
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/reshape_and_cache_nz/reshape_and_cache_nz_tiling.h +0 -27
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/rms_norm/rms_norm_impl.h +0 -46
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/sub/kernel/sub_kernel.h +0 -20
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/sub/sub_impl.h +0 -48
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/sub/sub_tiling.h +0 -25
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/tune_repo/matmul_table.h +0 -399
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/tune_repo/utils.h +0 -41
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/utils/backend.h +0 -45
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/utils/elewise_tiling.h +0 -29
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/utils/elewise_utils.h +0 -30
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/utils/log/log.h +0 -69
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/utils/log/log_core.h +0 -43
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/utils/log/log_entity.h +0 -38
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/utils/log/log_sink.h +0 -69
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/utils/log/log_stream.h +0 -41
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/utils/log/log_tiling.h +0 -71
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/utils/log/log_utils.h +0 -165
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/utils/math.h +0 -20
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/utils/register/kernel_creator.h +0 -39
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/utils/register/kernel_registry.h +0 -121
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/utils/utils.h +0 -106
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libAdd_impl.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libSub_impl.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libadd_layer_norm_impl.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libadd_rms_norm_quant_acme_impl.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libapply_rotary_pos_emb_310p_old_impl.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libapply_rotary_pos_emb_old_impl.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libreshape_and_cache_nz_impl.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libreshape_and_cache_nz_old_impl.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/AcmeMatMulPostFusionMixTactic/acme_matmul_postfusion_mix.json +0 -19
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/AcmeMatMulPostFusionMixTactic/acme_matmul_postfusion_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/AcmeMatMulPostFusionMixTactic/acme_matmul_postfusion_mix_mix_aic_0.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/AcmeMatMulPostFusionMixTactic/acme_matmul_postfusion_mix_mix_aiv_0.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/AcmeMultiWeightMatMulPostFusionMixTactic/acme_multi_weight_matmul_postfusion_mix.json +0 -19
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/AcmeMultiWeightMatMulPostFusionMixTactic/acme_multi_weight_matmul_postfusion_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/AcmeMultiWeightMatMulPostFusionMixTactic/acme_multi_weight_matmul_postfusion_mix_mix_aic_0.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/AcmeMultiWeightMatMulPostFusionMixTactic/acme_multi_weight_matmul_postfusion_mix_mix_aiv_0.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/flash_attention_score/flash_attention_score_bf16_bnsd_full_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/flash_attention_score/flash_attention_score_bf16_bnsd_tri_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/flash_attention_score/flash_attention_score_bf16_bsh_full_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/flash_attention_score/flash_attention_score_bf16_bsh_tri_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/flash_attention_score/flash_attention_score_fp16_bnsd_full_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/flash_attention_score/flash_attention_score_fp16_bnsd_tri_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/flash_attention_score/flash_attention_score_fp16_bsh_full_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/flash_attention_score/flash_attention_score_fp16_bsh_tri_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/matmul_add_rmsnorm/matmul_add_rmsnorm_bf16_bf16.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/matmul_add_rmsnorm/matmul_add_rmsnorm_bf16_fp16.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/matmul_add_rmsnorm/matmul_add_rmsnorm_bf16_fp32.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/matmul_add_rmsnorm/matmul_add_rmsnorm_fp16_bf16.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/matmul_add_rmsnorm/matmul_add_rmsnorm_fp16_fp16.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/matmul_add_rmsnorm/matmul_add_rmsnorm_fp16_fp32.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/paged_attention/paged_attention_bf16_bnsd_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/paged_attention/paged_attention_bf16_bsh_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/paged_attention/paged_attention_fp16_bnsd_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/paged_attention/paged_attention_fp16_bsh_mix.o +0 -0
- mindspore/profiler/envprofiling.py +0 -254
- mindspore/profiler/profiling.py +0 -1926
- /mindspore/lib/plugin/ascend/{custom_ascendc_ops → custom_ascendc_910}/op_api/include/aclnn_decoder_kv_cache.h +0 -0
- /mindspore/lib/plugin/ascend/{custom_ascendc_ops → custom_ascendc_910}/op_api/include/aclnn_prompt_kv_cache.h +0 -0
- /mindspore/lib/plugin/ascend/{custom_ascendc_ops/op_impl/ai_core/tbe/custom_ascendc_ops_impl → custom_ascendc_910/op_impl/ai_core/tbe/custom_ascendc_910_impl}/dynamic/decoder_kv_cache.cpp +0 -0
- /mindspore/lib/plugin/ascend/{custom_ascendc_ops/op_impl/ai_core/tbe/custom_ascendc_ops_impl → custom_ascendc_910/op_impl/ai_core/tbe/custom_ascendc_910_impl}/dynamic/prompt_kv_cache.cpp +0 -0
- /mindspore/lib/plugin/ascend/{custom_ascendc_ops → custom_ascendc_910b}/op_api/include/aclnn_all_finite.h +0 -0
- /mindspore/lib/plugin/ascend/{custom_ascendc_ops → custom_ascendc_910b}/op_impl/ai_core/tbe/config/ascend310p/aic-ascend310p-ops-info.json +0 -0
- /mindspore/lib/plugin/ascend/{custom_ascendc_ops/op_impl/ai_core/tbe/config/ascend910/aic-ascend910-ops-info.json → custom_ascendc_910b/op_impl/ai_core/tbe/config/ascend910_93/aic-ascend910_93-ops-info.json} +0 -0
- /mindspore/lib/plugin/ascend/{custom_ascendc_ops → custom_ascendc_910b}/op_impl/ai_core/tbe/config/ascend910b/aic-ascend910b-ops-info.json +0 -0
- /mindspore/lib/plugin/ascend/{custom_ascendc_ops/op_impl/ai_core/tbe/custom_ascendc_ops_impl → custom_ascendc_910b/op_impl/ai_core/tbe/custom_ascendc_910b_impl}/dynamic/all_finite.cpp +0 -0
- /mindspore/lib/plugin/ascend/{custom_ascendc_ops → custom_ascendc_910b}/op_proto/inc/op_proto.h +0 -0
- {mindspore-2.4.1.dist-info → mindspore-2.5.0.dist-info}/WHEEL +0 -0
- {mindspore-2.4.1.dist-info → mindspore-2.5.0.dist-info}/top_level.txt +0 -0
@@ -26,7 +26,7 @@ from mindspore.ops import functional as F
 from mindspore.ops.operations import nn_ops as NN_OPS
 from mindspore.ops.operations import _sequence_ops as seq
 import mindspore.common.dtype as mstype
-from mindspore.ops.function.math_func import logsumexp
+from mindspore.ops.function.math_func import logsumexp, div
 from mindspore.ops.function.random_func import _get_seed, _set_prim_op_user_data
 from mindspore.common.tensor import Tensor
 from mindspore._c_expression import Tensor as Tensor_
@@ -40,22 +40,29 @@ from mindspore.ops.operations.nn_ops import ChannelShuffle
 from mindspore.ops.operations.nn_ops import TripletMarginLoss
 from mindspore.ops.operations._sequence_ops import TupleToTensor, TensorToTuple, ListToTensor
 from mindspore.common.api import _function_forbid_reuse
-from mindspore.ops.auto_generate import log_softmax, dense, prelu, celu,
+from mindspore.ops.auto_generate import log_softmax, dense, prelu, celu, fast_gelu, silu, elu, sigmoid, relu6, \
     softmax_impl, swiglu, logsigmoid_op
-from mindspore.ops.auto_generate import
+from mindspore.ops.auto_generate import relu_op, inplace_relu_op
+from mindspore.ops.auto_generate import group_norm_op, rms_norm, add_rms_norm, layer_norm_ext_op, batch_norm_ext_op,\
+    mse_loss_ext
 from mindspore.ops.auto_generate import (reflection_pad_1d_op, reflection_pad_2d_op, add_layernorm_v2_op,
                                          reflection_pad_3d_op,  # pylint: disable=W0611
                                          replication_pad_1d_op, replication_pad_2d_op, replication_pad_3d_op,
                                          constant_pad_nd_op, dropout_ext_op, reverse_v2_impl, avg_pool2d_op,
                                          upsample_nearest1d_op, upsample_nearest2d_op, upsample_nearest3d_op,
                                          upsample_linear1d_op, upsample_bilinear2d_op, upsample_bicubic2d_op,
-                                         upsample_trilinear3d_impl, fill_scalar_op, floor_op
-
-
+                                         upsample_trilinear3d_impl, fill_scalar_op, floor_op, nllloss_2d_op,
+                                         masked_fill_op, masked_select, ones, flatten_ext, conv_transpose2d)
+from mindspore.ops.auto_generate.gen_ops_prim import embedding_op, MaxPoolWithIndices, \
+    PromptFlashAttention, MaxPoolWithMask
+from mindspore.ops.auto_generate.gen_ops_prim import conv3d_ext_op, conv3d_padding_op, conv2d_ext_op, conv2d_padding_op
 from mindspore.common.generator import default_generator
 from mindspore.ops.auto_generate import hardshrink, hardsigmoid, hardswish
 from mindspore.ops.auto_generate import softshrink
 from mindspore.ops.auto_generate import adaptive_avg_pool2d_ext_op
+from mindspore.ops.auto_generate.pyboost_inner_prim import nllloss_impl
+from mindspore.ops.function.array_func import gather_ext
+from mindspore.ops.operations.manually_defined import flash_attention_score
 
 abs_ = P.Abs()
 add_ = P.Add()
@@ -111,7 +118,7 @@ check_int_const = validator.check_is_int
 check_non_negative_float_const = validator.check_non_negative_float
 check_string_const = constexpr(validator.check_string)
 
-generator_step_ = Tensor(
+generator_step_ = Tensor(12, mstype.int64)
 
 
 def adaptive_avg_pool2d(input, output_size):
@@ -247,11 +254,11 @@ def adaptive_avg_pool2d_ext(input, output_size):
     .. math::
 
         out\_shape = \begin{cases}
-        input\_shape[-2] + output\_size[1], & \text{if } output\_size text{ is (None, w);}\\
-        output\_size[0] + input\_shape[-1], & \text{if } output\_size text{ is (h, None);}\\
-        input\_shape[-2:], & \text{if } output\_size text{ is (None, None);}\\
-        (h, h), & \text{if } output\_size text{ is h;}\\
-        (h, w), & \text{if } output\_size text{ is (h, w)}
+        input\_shape[-2] + output\_size[1], & \text{if } output\_size \text{ is (None, w);}\\
+        output\_size[0] + input\_shape[-1], & \text{if } output\_size \text{ is (h, None);}\\
+        input\_shape[-2:], & \text{if } output\_size \text{ is (None, None);}\\
+        (h, h), & \text{if } output\_size \text{ is h;}\\
+        (h, w), & \text{if } output\_size \text{ is (h, w)}
         \end{cases}
 
     Raises:
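Read as a lookup table, the cases above map `output_size` entries to output dimensions, with `None` meaning "keep the input dimension". A minimal sketch of that mapping (the helper name is hypothetical, not MindSpore API):

```python
# Hypothetical helper mirroring the out_shape cases documented above.
def adaptive_avg_pool2d_out_shape(input_shape, output_size):
    if isinstance(output_size, int):   # output_size is h -> (h, h)
        return (output_size, output_size)
    h, w = output_size                 # (h, w); either entry may be None
    return (input_shape[-2] if h is None else h,
            input_shape[-1] if w is None else w)

print(adaptive_avg_pool2d_out_shape((1, 3, 32, 24), (None, 7)))  # (32, 7)
print(adaptive_avg_pool2d_out_shape((1, 3, 32, 24), 5))          # (5, 5)
```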
@@ -620,13 +627,13 @@ def avg_pool2d_ext(input, kernel_size, stride=None, padding=0, ceil_mode=False,
     outputs regional average in the :math:`(H_{in}, W_{in})` -dimension.
     Given kernel size :math:`(kH, kW)` and `stride` , the operation is as follows.
 
-    .. note::
-        On the Atlas platform, when calculating the input, the precision is degraded from float32 to float16.
-
     .. math::
         \text{output}(N_i, C_j, h, w) = \frac{1}{kH * kW} \sum_{m=0}^{kH-1} \sum_{n=0}^{kW-1}
         \text{input}(N_i, C_j, stride[0] \times h + m, stride[1] \times w + n)
 
+    .. note::
+        On the Atlas platform, when calculating the input, the precision is degraded from float32 to float16.
+
     Args:
         input (Tensor): Tensor of shape :math:`(N, C, H_{in}, W_{in})` or :math:`(C, H_{in}, W_{in})`.
         kernel_size (Union[int, tuple[int], list[int]]): The size of kernel used to take the average value.
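The formula is plain windowed averaging; a small numpy sketch (independent of MindSpore, no padding or ceil_mode) makes it concrete:

```python
import numpy as np

def avg_pool2d_naive(x, kh, kw, sh, sw):
    """Average over each (kh, kw) window starting at multiples of the stride."""
    n, c, h, w = x.shape
    oh, ow = (h - kh) // sh + 1, (w - kw) // sw + 1
    out = np.zeros((n, c, oh, ow), x.dtype)
    for i in range(oh):
        for j in range(ow):
            out[:, :, i, j] = x[:, :, i*sh:i*sh+kh, j*sw:j*sw+kw].mean(axis=(2, 3))
    return out

x = np.arange(16, dtype=np.float32).reshape(1, 1, 4, 4)
print(avg_pool2d_naive(x, 2, 2, 2, 2))  # 2x2 output of window means
```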
@@ -1473,7 +1480,7 @@ def dropout(input, p=0.5, training=True, seed=None):
 
 
 @_function_forbid_reuse
-def dropout_ext(input, p=0.5, training=True):
+def dropout_ext(input, p=0.5, training=True, inplace=False):
     r"""
     During training, randomly zeroes some of the elements of the input tensor
     with probability `p` from a Bernoulli distribution. It plays the role of reducing neuron correlation and
@@ -1482,10 +1489,12 @@ def dropout_ext(input, p=0.5, training=True):
 
     Args:
         input (Tensor): The input Tensor of shape :math:`(*, N)`.
-        p (float): The dropping rate of input neurons, between 0 and 1, e.g. `p` = 0.1,
+        p (float, optional): The dropping rate of input neurons, between 0 and 1, e.g. `p` = 0.1,
            means dropping out 10% of input neurons. Default: ``0.5`` .
-        training (bool): Apply dropout if it is ``True`` ,
-
+        training (bool, optional): Apply dropout if it is ``True`` ,
+            if it is ``False`` , the input is returned directly,
+            and `p` is invalid. Default: ``True`` .
+        inplace (bool, optional): If set to ``True`` , will do this operation in-place. Default: ``False`` .
 
     Returns:
         - **output** (Tensor) - Zeroed tensor, with the same shape and data type as `input`.
@@ -1506,10 +1515,14 @@ def dropout_ext(input, p=0.5, training=True):
         (2, 2)
     """
     check_bool_const(training, "training", "dropout_ext")
-
+    check_bool_const(inplace, "inplace", "dropout_ext")
+    if not training:
         return input
     seed, offset = default_generator._step(generator_step_)  # pylint: disable=protected-access
     out, _ = dropout_ext_op(input, p, seed, offset)
+    if inplace:
+        input.copy_(out)
+        return input
     return out
 
 
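A hedged usage sketch of the new `inplace` flag; it assumes `dropout_ext` is surfaced through the public `mint.nn.functional.dropout` wrapper, the same way the `nll_loss` example later in this file exposes its `_ext` counterpart:

```python
import numpy as np
import mindspore
from mindspore import Tensor, mint

x = Tensor(np.ones((2, 2)), mindspore.float32)
# Out-of-place: x is left untouched and a new tensor is returned.
y = mint.nn.functional.dropout(x, p=0.5, training=True)
# training=False short-circuits: the input itself comes back and p is ignored.
same = mint.nn.functional.dropout(x, p=0.5, training=False)
# inplace=True (new in 2.5.0): the result is copied back into x via copy_
# and x itself is returned, matching the `if inplace:` branch above.
z = mint.nn.functional.dropout(x, p=0.5, training=True, inplace=True)
```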
@@ -2415,8 +2428,8 @@ def interpolate(input,
         >>> input = Tensor([[[1, 2, 3], [4, 5, 6]]], mindspore.float32)
         >>> output = ops.interpolate(input, size=(6,), mode='nearest')
         >>> print(output)
-
-
+        [[[1. 1. 2. 2. 3. 3.]
+          [4. 4. 5. 5. 6. 6.]]]
     """
 
     def run_nearest(x, size, align_corners=None, scale_factor=None):
@@ -2667,7 +2680,7 @@ def interpolate_ext(input,
     r"""
     Samples the input Tensor to the given size or scale_factor by using one of the interpolate algorithms.
 
-    ..
+    .. warning::
         This is an experimental API that is subject to change or deletion.
 
     .. note::
@@ -2675,7 +2688,7 @@ def interpolate_ext(input,
           is not supported.
         - In 'nearest' mode, there may exist precision problem in the scenarios, where input is 3-D/4-D Tensor
           and the image is scaled by scale_factor.
-        - `mode` and `
+        - `mode` and `recompute_scale_factor` should be constants.
 
     Args:
         input (Tensor): Tensor to be resized.
|
|
|
2690
2703
|
after removing the first two dimensions N, C.
|
|
2691
2704
|
One and only one of size and scale_factor can be set to None. Default: ``None`` .
|
|
2692
2705
|
mode (str): The sampling algorithm.
|
|
2693
|
-
One of 'nearest', 'linear' (3D only),
|
|
2706
|
+
One of 'nearest', 'linear' (3D only),
|
|
2707
|
+
'bilinear' (4D only), 'trilinear' (5D only), and 'bicubic' (4D only).
|
|
2694
2708
|
Default: ``"nearest"`` .
|
|
2695
|
-
align_corners (bool): Whether to use corner alignment for coordinate mapping.
|
|
2709
|
+
align_corners (bool, optional): Whether to use corner alignment for coordinate mapping.
|
|
2710
|
+
Assuming a transformation is
|
|
2696
2711
|
applied to the input Tensor along the x-axis, the specific calculation formula is as follows:
|
|
2697
2712
|
|
|
2698
2713
|
.. code-block::
|
|
@@ -2771,12 +2786,12 @@ def interpolate_ext(input,
 
     Examples:
         >>> import mindspore
-        >>> from mindspore import Tensor,
+        >>> from mindspore import Tensor, ops
         >>> input = Tensor([[[1, 2, 3], [4, 5, 6]]], mindspore.float32)
-        >>> output =
+        >>> output = ops.interpolate_ext(input, size=(6,), mode='nearest')
         >>> print(output)
-
-
+        [[[1. 1. 2. 2. 3. 3.]
+          [4. 4. 5. 5. 6. 6.]]]
     """
 
     def run_nearest(x, size, align_corners=None, scale_factor=None):
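As a sanity check on the restored example output, 'nearest' resizing from length 3 to 6 just repeats each element twice; a small numpy sketch (not MindSpore code) reproduces it:

```python
import numpy as np

x = np.array([[[1., 2., 3.], [4., 5., 6.]]])
out = np.repeat(x, 2, axis=-1)  # size 6 from length 3 -> repeat factor 2
print(out)
# [[[1. 1. 2. 2. 3. 3.]
#   [4. 4. 5. 5. 6. 6.]]]
```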
@@ -3030,8 +3045,6 @@ def softmax_ext(input, dim=None, dtype=None):
         input (Tensor): Tensor of shape :math:`(N, *)`, where :math:`*` means, any number of
             additional dimensions.
         dim (int, optional): The dim to perform the Softmax operation. Default: ``None`` .
-
-    Keyword Args:
         dtype (:class:`mindspore.dtype`, optional): When set, `input` will be converted to the specified type,
             `dtype`, before execution, and dtype of returned Tensor will also be `dtype`. Default: ``None`` .
 
@@ -3631,7 +3644,7 @@ def _replication_pad(input, pad):
     return out
 
 
-def pad_ext(input, pad, mode='constant', value=0.0):
+def pad_ext(input, pad, mode='constant', value=None):
     r"""
     Pads the input tensor according to the pad.
 
@@ -3679,7 +3692,7 @@ def pad_ext(input, pad, mode='constant', value=0.0):
 
         value (Union[int, float, None], optional): Valid only in ``'constant'`` mode.
             Set the padding value in ``'constant'`` mode. If the value is None, 0 is used as the default padding value.
-            Default: ``0.0`` .
+            Default: ``None`` .
 
     Returns:
         Tensor, the tensor after padding.
@@ -3689,7 +3702,7 @@ def pad_ext(input, pad, mode='constant', value=0.0):
         TypeError: If `input` is not a Tensor.
         ValueError: If length of `pad` is not even.
         ValueError: If length of `pad` is greater than 6.
-        ValueError: If `mode` is not ``'constant'`` and `value`
+        ValueError: If `mode` is not ``'constant'`` and `value` is neither ``None`` nor 0.
 
     Supported Platforms:
         ``Ascend``
@@ -3717,7 +3730,7 @@ def pad_ext(input, pad, mode='constant', value=0.0):
         value = 0 if value is None else value
         out = constant_pad_nd_op(input, pad, value)
     else:
-        if value != 0
+        if value is not None and value != 0:
             raise ValueError(f"Padding mode {mode} doesn\'t take in value argument.")
         if mode == "circular":
             out = _circular_pad(input, pad)
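The behavioral point of the one-line fix: the new default `value=None` previously tripped the `value != 0` check for non-'constant' modes, since `None != 0` is true in Python. A standalone sketch of the corrected handling (hypothetical helper, not the MindSpore source):

```python
def _check_pad_value(mode, value):
    """Sketch of pad_ext's value handling after the fix."""
    if mode == 'constant':
        return 0 if value is None else value   # None falls back to 0
    # Fixed check: both None and 0 now pass for non-constant modes;
    # the old `if value != 0` raised on value=None as well.
    if value is not None and value != 0:
        raise ValueError(f"Padding mode {mode} doesn't take in value argument.")
    return None

print(_check_pad_value('constant', None))  # 0
print(_check_pad_value('reflect', None))   # None, no error after the fix
```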
@@ -3897,9 +3910,9 @@ def rrelu(input, lower=1.0 / 8, upper=1.0 / 3):
     `Empirical Evaluation of Rectified Activations in Convolution Network <https://arxiv.org/pdf/1505.00853.pdf>`_ .
 
     Args:
-        input
-        lower (Union[int, float]): Slope of the activation function at
-        upper (Union[int, float]): Slope of the activation function at
+        input (Tensor): The input of rrelu is a Tensor of any dimension.
+        lower (Union[int, float]): Slope of the activation function at data of `input` < 0. Default: ``1.0 / 8`` .
+        upper (Union[int, float]): Slope of the activation function at data of `input` < 0. Default: ``1.0 / 3`` .
 
     Returns:
         Tensor, after rrelu, has the same type and shape as the `input`.
@@ -4271,6 +4284,303 @@ def _nll_loss(inputs, target, target_dim=-1, weight=None, ignore_index=None, red
|
|
|
4271
4284
|
return loss
|
|
4272
4285
|
|
|
4273
4286
|
|
|
4287
|
+
def nll_loss_ext(input, target, weight=None, ignore_index=-100, reduction='mean'):
|
|
4288
|
+
r"""
|
|
4289
|
+
Gets the negative log likelihood loss between input and target.
|
|
4290
|
+
|
|
4291
|
+
The nll loss with reduction=none can be described as:
|
|
4292
|
+
|
|
4293
|
+
.. math::
|
|
4294
|
+
|
|
4295
|
+
\ell(x, t)=L=\left\{l_{1}, \ldots, l_{N}\right\}^{\top},
|
|
4296
|
+
\quad l_{n}=-w_{t_{n}} x_{n, t_{n}},
|
|
4297
|
+
\quad w_{c}=\text { weight }[c] \cdot \mathbb{1}
|
|
4298
|
+
\{c \not= \text{ignore_index}\},
|
|
4299
|
+
|
|
4300
|
+
where :math:`x` is the input, :math:`t` is the target, :math:`w` is the weight,
|
|
4301
|
+
:math:`N` is the batch size, :math:`c` belonging to :math:`[0, C-1]` is class index,
|
|
4302
|
+
where :math:`C` is the number of classes.
|
|
4303
|
+
|
|
4304
|
+
If `reduction` is not ``None`` (default ``'mean'``), then
|
|
4305
|
+
|
|
4306
|
+
.. math::
|
|
4307
|
+
|
|
4308
|
+
\ell(x, t)=\left\{\begin{array}{ll}
|
|
4309
|
+
\sum_{n=1}^{N} \frac{1}{\sum_{n=1}^{N} w_{t n}} l_{n}, & \text { if reduction }=\text { 'mean', } \\
|
|
4310
|
+
\sum_{n=1}^{N} l_{n}, & \text { if reduction }=\text { 'sum' }
|
|
4311
|
+
\end{array}\right.
|
|
4312
|
+
|
|
4313
|
+
.. warning::
|
|
4314
|
+
This is an experimental API that is subject to change or deletion.
|
|
4315
|
+
|
|
4316
|
+
Args:
|
|
4317
|
+
input (Tensor): :math:`(N)` or :math:`(N, C)` where `C = number of classes` , `N = batch size` ,
|
|
4318
|
+
or :math:`(N, C, d_1, d_2, ..., d_K)` (for high-dimensional data).
|
|
4319
|
+
`input` is expected to be log-probabilities.
|
|
4320
|
+
Data type only supports float32 or float16 or bfloat16(only supported by
|
|
4321
|
+
Atlas A2 training series products).
|
|
4322
|
+
target (Tensor): :math:`()` or :math:`(N)` ,
|
|
4323
|
+
where the value range is :math:`[0, C-1]`, or :math:`(N, d_1, d_2, ..., d_K)` for
|
|
4324
|
+
high-dimensional loss, data type must be int32 or int64 or uint8.
|
|
4325
|
+
weight (Tensor, optional): A rescaling weight applied to the loss of each batch element.
|
|
4326
|
+
If not None, the shape is :math:`(C,)`.
|
|
4327
|
+
The data type must be float16 or float32 or bfloat16(only supported by Atlas A2 training series products).
|
|
4328
|
+
It should have the same data type as `input` . Default: ``'None'`` .
|
|
4329
|
+
ignore_index (int, optional): Specifies a target value that is ignored
|
|
4330
|
+
and does not contribute to the input gradient. Default: ``-100`` .
|
|
4331
|
+
reduction (str, optional): Apply specific reduction method to the output: ``'none'`` , ``'mean'`` ,
|
|
4332
|
+
``'sum'`` . Default: ``'mean'`` .
|
|
4333
|
+
|
|
4334
|
+
- ``'none'``: no reduction will be applied.
|
|
4335
|
+
- ``'mean'``: compute and return the weighted mean of elements in the output.
|
|
4336
|
+
- ``'sum'``: the output elements will be summed.
|
|
4337
|
+
|
|
4338
|
+
Returns:
|
|
4339
|
+
Tensor. The data type is the same as that of `input`.
|
|
4340
|
+
|
|
4341
|
+
Supported Platforms:
|
|
4342
|
+
``Ascend``
|
|
4343
|
+
|
|
4344
|
+
Examples:
|
|
4345
|
+
>>> import mindspore
|
|
4346
|
+
>>> import numpy as np
|
|
4347
|
+
>>> from mindspore import Tensor, mint
|
|
4348
|
+
>>> input = mindspore.Tensor(np.random.randn(3, 5), mindspore.float32)
|
|
4349
|
+
>>> target = mindspore.Tensor(np.array([1, 0, 4]), mindspore.int32)
|
|
4350
|
+
>>> output = mint.nn.functional.nll_loss(input, target)
|
|
4351
|
+
|
|
4352
|
+
"""
|
|
4353
|
+
return _nllloss_nd(input, target, weight, ignore_index, reduction)
|
|
4354
|
+
|
|
4355
|
+
|
|
4356
|
+
def _nllloss_nd(input, target, weight=None, ingore_index=-100, reduction='mean'):
|
|
4357
|
+
"""nllloss_nd inner function"""
|
|
4358
|
+
input_dim = input.ndim
|
|
4359
|
+
class_dim = 0 if input_dim == 1 else 1
|
|
4360
|
+
n_classes = input.shape[class_dim]
|
|
4361
|
+
if weight is None:
|
|
4362
|
+
weight = ones(n_classes, input.dtype)
|
|
4363
|
+
if input_dim < 1:
|
|
4364
|
+
raise ValueError(f"input dim should be less than 1, but got {input_dim}")
|
|
4365
|
+
if input_dim != 1 and input.shape[0] != target.shape[0]:
|
|
4366
|
+
raise ValueError(f"input bacth_size should be equal to target batch_size, but got {input.shape[0]} and "
|
|
4367
|
+
f"{target.shape[0]}")
|
|
4368
|
+
if input_dim == 1 or input_dim == 2:
|
|
4369
|
+
return nllloss_impl(input, target, weight, reduction, ingore_index)[0]
|
|
4370
|
+
if input_dim == 4:
|
|
4371
|
+
return nllloss_2d_op(input, target, weight, reduction, ingore_index)[0]
|
|
4372
|
+
# input_dim==3 or input_dim>4
|
|
4373
|
+
n = input.shape[0]
|
|
4374
|
+
c = input.shape[1]
|
|
4375
|
+
out_size = (n,) + input.shape[2:]
|
|
4376
|
+
if input.size > 0:
|
|
4377
|
+
input = input.view((n, c, 1, -1))
|
|
4378
|
+
else:
|
|
4379
|
+
input = input.view((n, c, 0, 0))
|
|
4380
|
+
if target.size > 0:
|
|
4381
|
+
target = target.view((n, 1, -1))
|
|
4382
|
+
else:
|
|
4383
|
+
target = target.view((n, 0, 0))
|
|
4384
|
+
if reduction != 'none':
|
|
4385
|
+
return nllloss_2d_op(input, target, weight, reduction, ingore_index)[0]
|
|
4386
|
+
ret = nllloss_2d_op(input, target, weight, reduction, ingore_index)[0]
|
|
4387
|
+
return ret.view(out_size)
|
|
4388
|
+
|
|
4389
|
+
|
|
4390
|
+
def _cross_entropy_for_probabilities(input, target, weight, reduction, label_smoothing, class_dim, n_classes):
|
|
4391
|
+
"""cross_entropy inner function for class probabilities"""
|
|
4392
|
+
if input.shape != target.shape:
|
|
4393
|
+
raise ValueError("For cross_entropy that target is probabilities, input shape should equal to target shape.")
|
|
4394
|
+
if label_smoothing > 0.0:
|
|
4395
|
+
target = target * (1 - label_smoothing) + label_smoothing / n_classes
|
|
4396
|
+
loss = input * target
|
|
4397
|
+
if weight is not None:
|
|
4398
|
+
weight_ = weight
|
|
4399
|
+
ori_shape = loss.shape
|
|
4400
|
+
if input.ndim > 2:
|
|
4401
|
+
loss = loss.view(ori_shape[:2] + (-1,))
|
|
4402
|
+
weight_ = weight_.view(1, -1, 1)
|
|
4403
|
+
loss = loss * weight_
|
|
4404
|
+
loss = loss.view(ori_shape)
|
|
4405
|
+
if reduction == "mean":
|
|
4406
|
+
return -div(loss.sum(), (input.size / n_classes))
|
|
4407
|
+
if reduction == "sum":
|
|
4408
|
+
return -loss.sum()
|
|
4409
|
+
if reduction == "none":
|
|
4410
|
+
return -loss.sum(class_dim)
|
|
4411
|
+
raise ValueError(f"redution value {reduction} not valid.")
|
|
4412
|
+
|
|
4413
|
+
|
|
4414
|
+
def _cross_entropy_for_class_indices(input, target, weight, ingore_index, reduction, label_smoothing, class_dim,
|
|
4415
|
+
n_classes):
|
|
4416
|
+
"""cross_entropy inner function for class indices"""
|
|
4417
|
+
nllloss = _nllloss_nd(input, target, weight, ingore_index, reduction)
|
|
4418
|
+
if label_smoothing > 0.0:
|
|
4419
|
+
if weight is not None:
|
|
4420
|
+
weight_ = weight
|
|
4421
|
+
input_ = input
|
|
4422
|
+
ori_shape = input.shape
|
|
4423
|
+
if input.ndim > 2:
|
|
4424
|
+
input_ = input.view(ori_shape[:2] + (-1,))
|
|
4425
|
+
weight_ = weight_.view(1, -1, 1)
|
|
4426
|
+
loss = input_ * weight_
|
|
4427
|
+
loss = loss.view(ori_shape)
|
|
4428
|
+
smooth_loss = -loss.sum(class_dim)
|
|
4429
|
+
else:
|
|
4430
|
+
smooth_loss = -input.sum(class_dim)
|
|
4431
|
+
ignore_mask = ops.eq(target, ingore_index)
|
|
4432
|
+
smooth_loss = masked_fill_op(smooth_loss, ignore_mask, 0)
|
|
4433
|
+
if reduction == "mean":
|
|
4434
|
+
true_mask = ~ignore_mask
|
|
4435
|
+
if weight is not None:
|
|
4436
|
+
weight_sum = gather_ext(weight, 0, flatten_ext(masked_select(target, true_mask))).sum()
|
|
4437
|
+
if weight_sum == 0:
|
|
4438
|
+
ret = smooth_loss.sum()
|
|
4439
|
+
else:
|
|
4440
|
+
ret = smooth_loss.sum() / weight_sum
|
|
4441
|
+
else:
|
|
4442
|
+
weight_sum = true_mask.sum()
|
|
4443
|
+
if weight_sum == 0:
|
|
4444
|
+
ret = smooth_loss.sum()
|
|
4445
|
+
else:
|
|
4446
|
+
ret = smooth_loss.sum() / weight_sum
|
|
4447
|
+
elif reduction == "sum":
|
|
4448
|
+
ret = smooth_loss.sum()
|
|
4449
|
+
elif reduction == "none":
|
|
4450
|
+
ret = smooth_loss
|
|
4451
|
+
else:
|
|
4452
|
+
raise ValueError(f"redution value {reduction} not valid.")
|
|
4453
|
+
return (1 - label_smoothing) * nllloss + ret * (label_smoothing / n_classes)
|
|
4454
|
+
return nllloss
|
|
4455
|
+
|
|
4456
|
+
|
|
4457
|
+
def cross_entropy_ext(input, target, weight=None, ingore_index=-100, reduction='mean', label_smoothing=0.0):
|
|
4458
|
+
r"""
|
|
4459
|
+
The cross entropy loss between input and target.
|
|
4460
|
+
|
|
4461
|
+
The cross entropy supports two kind of targets:
|
|
4462
|
+
|
|
4463
|
+
- Class indices (int) in the range :math:`[0, C)` where :math:`C` is the number of classes,
|
|
4464
|
+
the loss with reduction=none can be described as:
|
|
4465
|
+
|
|
4466
|
+
.. math::
|
|
4467
|
+
|
|
4468
|
+
\ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad
|
|
4469
|
+
l_n = - w_{y_n} \log \frac{\exp(x_{n,y_n})}{\sum_{c=1}^C \exp(x_{n,c})}
|
|
4470
|
+
\cdot \mathbb{1}\{y_n \not= \text{ignore_index}\}
|
|
4471
|
+
|
|
4472
|
+
where :math:`x` is the inputs, :math:`y` is the target, :math:`w` is the weight, :math:`N` is the batch size,
|
|
4473
|
+
:math:`c` belonging to :math:`[0, C-1]` is class index, where :math:`C` is the number of classes.
|
|
4474
|
+
|
|
4475
|
+
If `reduction` is not ``None`` (default ``'mean'`` ), then
|
|
4476
|
+
|
|
4477
|
+
.. math::
|
|
4478
|
+
|
|
4479
|
+
\ell(x, y) = \begin{cases}
|
|
4480
|
+
\sum_{n=1}^N \frac{1}{\sum_{n=1}^N w_{y_n} \cdot \mathbb{1}\{y_n \not= \text{ignore_index}\}} l_n, &
|
|
4481
|
+
\text{if reduction} = \text{'mean',}\\
|
|
4482
|
+
\sum_{n=1}^N l_n, &
|
|
4483
|
+
\text{if reduction} = \text{'sum'.}
|
|
4484
|
+
\end{cases}
|
|
4485
|
+
|
|
4486
|
+
- Probabilities (float) for each class, useful when labels beyond a single class per minibatch item
|
|
4487
|
+
are required, the loss with reduction=none can be described as:
|
|
4488
|
+
|
|
4489
|
+
.. math::
|
|
4490
|
+
|
|
4491
|
+
\ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad
|
|
4492
|
+
l_n = - \sum_{c=1}^C w_c \log \frac{\exp(x_{n,c})}{\sum_{i=1}^C \exp(x_{n,i})} y_{n,c}
|
|
4493
|
+
|
|
4494
|
+
where :math:`x` is the inputs, :math:`y` is the target, :math:`w` is the weight, N is the batch size,
|
|
4495
|
+
:math:`c` belonging to :math:`[0, C-1]` is class index, where :math:`C` is the number of classes.
|
|
4496
|
+
|
|
4497
|
+
If `reduction` is not ``None`` (default ``'mean'`` ), then
|
|
4498
|
+
|
|
4499
|
+
.. math::
|
|
4500
|
+
|
|
4501
|
+
\ell(x, y) = \begin{cases}
|
|
4502
|
+
\frac{\sum_{n=1}^N l_n}{N}, &
|
|
4503
|
+
\text{if reduction} = \text{'mean',}\\
|
|
4504
|
+
\sum_{n=1}^N l_n, &
|
|
4505
|
+
\text{if reduction} = \text{'sum'.}
|
|
4506
|
+
\end{cases}
|
|
4507
|
+
|
|
4508
|
+
.. warning::
|
|
4509
|
+
This is an experimental API that is subject to change or deletion.
|
|
4510
|
+
|
|
4511
|
+
Note:
|
|
4512
|
+
Dynamic shape, dynamic rank and variable constant input are not supported in `strict graph mode
|
|
4513
|
+
(jit_syntax_level=mindspore.STRICT)
|
|
4514
|
+
<https://www.mindspore.cn/docs/en/master/model_train/program_form/static_graph.html>`_.
|
|
4515
|
+
|
|
4516
|
+
Args:
|
|
4517
|
+
input (Tensor): :math:`(N)` or :math:`(N, C)` where `C = number of classes` or :math:`(N, C, H, W)`
|
|
4518
|
+
in case of 2D Loss, or :math:`(N, C, d_1, d_2, ..., d_K)`.
|
|
4519
|
+
`input` is expected to be log-probabilities, data type must be float16 or float32 or bfloat16(only supported
|
|
4520
|
+
by Atlas A2 training series products).
|
|
4521
|
+
target (Tensor): For class indices, tensor of shape :math:`()`, :math:`(N)` or
|
|
4522
|
+
:math:`(N, d_1, d_2, ..., d_K)` , data type must be int32 or int64. For probabilities, tensor of shape
|
|
4523
|
+
:math:`(N,)` , :math:`(N, C)` or :math:`(N, C, d_1, d_2, ..., d_K)` , data type must be float16 or float32
|
|
4524
|
+
or bfloat16(only supported by Atlas A2 training series products).
|
|
4525
|
+
weight (Tensor, optional): A rescaling weight applied to the loss of each batch element.
|
|
4526
|
+
If not None, the shape is :math:`(C,)`, data type must be float16 or float32 or bfloat16(only supported by
|
|
4527
|
+
Atlas A2 training series products). Default: ``None`` .
|
|
4528
|
+
ignore_index (int, optional): Specifies a target value that is ignored and does not contribute to the input
|
|
4529
|
+
gradient. Only valid in class indices, please set it to a negative number in probabilities.
|
|
4530
|
+
Default: ``-100`` .
|
|
4531
|
+
reduction (str, optional): Apply specific reduction method to the output: ``'none'`` , ``'mean'`` ,
|
|
4532
|
+
``'sum'`` . Default: ``'mean'`` .
|
|
4533
|
+
|
|
4534
|
+
- ``'none'``: no reduction will be applied.
|
|
4535
|
+
- ``'mean'``: compute and return the weighted mean of elements in the output.
|
|
4536
|
+
- ``'sum'``: the output elements will be summed.
|
|
4537
|
+
|
|
4538
|
+
label_smoothing (float, optional): Label smoothing values, a regularization tool used to prevent the model
|
|
4539
|
+
from overfitting when calculating Loss. The value range is [0.0, 1.0]. Default: ``0.0`` .
|
|
4540
|
+
|
|
4541
|
+
Returns:
|
|
4542
|
+
Tensor, the data type is the same as `input` .
|
|
4543
|
+
|
|
4544
|
+
Supported Platforms:
|
|
4545
|
+
``Ascend``
|
|
4546
|
+
|
|
4547
|
+
Examples:
|
|
4548
|
+
>>> import mindspore as ms
|
|
4549
|
+
>>> from mindspore import ops, Tensor
|
|
4550
|
+
>>> import numpy as np
|
|
4551
|
+
>>> # Case 1: Indices labels
|
|
4552
|
+
>>> inputs = Tensor(np.random.randn(3, 5), ms.float32)
|
|
4553
|
+
>>> target = Tensor(np.array([1, 0, 4]), ms.int32)
|
|
4554
|
+
>>> output = ops.cross_entropy_ext(inputs, target)
|
|
4555
|
+
>>> # Case 2: Probability labels
|
|
4556
|
+
>>> inputs = Tensor(np.random.randn(3, 5), ms.float32)
|
|
4557
|
+
>>> target = Tensor(np.random.randn(3, 5), ms.float32)
|
|
4558
|
+
>>> output = ops.cross_entropy_ext(inputs, target)
|
|
4559
|
+
"""
|
|
4560
|
+
if not isinstance(input, Tensor) or not isinstance(target, Tensor):
|
|
4561
|
+
raise TypeError(
|
|
4562
|
+
f"For cross_entropy, input and target must be Tensor, but got input:{type(input)}, target:{type(target)}.")
|
|
4563
|
+
if weight is not None and not isinstance(weight, Tensor):
|
|
4564
|
+
raise TypeError(f"For cross_entropy, weight must be Tensor or None, but got {type(weight)}.")
|
|
4565
|
+
if label_smoothing < 0.0 or label_smoothing > 1.0:
|
|
4566
|
+
raise ValueError(f"For cross_entropy, label_smoothing must in [0, 1]")
|
|
4567
|
+
if input.ndim == 0 or input.shape[0] == 0:
|
|
4568
|
+
raise ValueError(f"For cross_entropy, input don't support 0-dim and shape[0].")
|
|
4569
|
+
class_dim = 0 if input.ndim == 1 else 1
|
|
4570
|
+
n_classes = input.shape[class_dim]
|
|
4571
|
+
input = log_softmax_ext(input, class_dim, dtype=input.dtype)
|
|
4572
|
+
# for probabilities
|
|
4573
|
+
target_dtype = target.dtype
|
|
4574
|
+
if isinstance(target_dtype, type(mstype.tensor_type)):
|
|
4575
|
+
target_dtype = target_dtype.element_type()
|
|
4576
|
+
if target_dtype in mstype.float_type:
|
|
4577
|
+
return _cross_entropy_for_probabilities(input, target, weight, reduction, label_smoothing, class_dim,
|
|
4578
|
+
n_classes)
|
|
4579
|
+
# for class indices
|
|
4580
|
+
return _cross_entropy_for_class_indices(input, target, weight, ignore_index, reduction, label_smoothing,
|
|
4581
|
+
class_dim, n_classes)
|
|
4582
|
+
|
|
4583
|
+
|
|
4274
4584
|
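For intuition, the dtype dispatch above (floating-point targets take the probabilities path, integer targets the class-indices path) mirrors the standard decomposition of cross entropy into a log-softmax followed by a negative log-likelihood gather. A minimal NumPy sketch of the class-indices path with ``'mean'`` reduction (an illustration only, not the MindSpore kernel):

    import numpy as np

    def cross_entropy_indices(logits, target):
        # numerically stabilized log-softmax along the class axis
        shifted = logits - logits.max(axis=1, keepdims=True)
        log_probs = shifted - np.log(np.exp(shifted).sum(axis=1, keepdims=True))
        # negative log-likelihood of the target class, 'mean' reduction
        return -log_probs[np.arange(len(target)), target].mean()

    logits = np.random.randn(3, 5).astype(np.float32)
    target = np.array([1, 0, 4])
    print(cross_entropy_indices(logits, target))  # a non-negative scalar loss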
def l1_loss(input, target, reduction='mean'):
|
|
4275
4585
|
r"""
|
|
4276
4586
|
Calculate the mean absolute error between the `input` value and the `target` value.
|
|
@@ -4348,8 +4658,8 @@ def smooth_l1_loss(input, target, beta=1.0, reduction='none'):
|
|
|
4348
4658
|
.. math::
|
|
4349
4659
|
L_{i} =
|
|
4350
4660
|
\begin{cases}
|
|
4351
|
-
\frac{0.5 (x_i - y_i)^{2}}{\beta}, & \text{if } |x_i - y_i| < \beta \\
|
|
4352
|
-
|x_i - y_i| - 0.5 * \beta, & \text{otherwise. }
|
|
4661
|
+
\frac{0.5 (x_i - y_i)^{2}}{\text{beta}}, & \text{if } |x_i - y_i| < \text{beta} \\
|
|
4662
|
+
|x_i - y_i| - 0.5 * \text{beta}, & \text{otherwise. }
|
|
4353
4663
|
\end{cases}
|
|
4354
4664
|
|
|
4355
4665
|
If `reduction` is not `none`, then:
|
|
@@ -4364,12 +4674,26 @@ def smooth_l1_loss(input, target, beta=1.0, reduction='none'):
|
|
|
4364
4674
|
Here :math:`\text{beta}` controls the point where the loss function changes from quadratic to linear.
|
|
4365
4675
|
:math:`\text{beta}>0` , its default value is ``1.0`` . :math:`N` is the batch size.
|
|
4366
4676
|
|
|
4677
|
+
.. warning::
|
|
4678
|
+
This API has poor performance on CPU, so it is recommended to run it on Ascend or GPU.
|
|
4679
|
+
|
|
4367
4680
|
Args:
|
|
4368
|
-
input (Tensor): Tensor of shape :math:`(N, *)` where :math:`*` means,
|
|
4369
|
-
|
|
4370
|
-
|
|
4371
|
-
|
|
4372
|
-
|
|
4681
|
+
input (Tensor): Tensor of shape :math:`(N, *)` where :math:`*` means,
|
|
4682
|
+
any number of additional dimensions. Supported dtypes:
|
|
4683
|
+
|
|
4684
|
+
- Ascend: float16, float32, bfloat16.
|
|
4685
|
+
- CPU/GPU: float16, float32, float64.
|
|
4686
|
+
target (Tensor): Ground truth data, tensor of shape :math:`(N, *)`.
|
|
4687
|
+
|
|
4688
|
+
- CPU/Ascend: has the same shape as `input`. `target` and `input`
|
|
4689
|
+
comply with the implicit type conversion rules to make the data types consistent.
|
|
4690
|
+
- GPU: has the same shape and dtype as the `input`.
|
|
4691
|
+
|
|
4692
|
+
beta (number, optional): A parameter used to control the point where the function will change from
|
|
4693
|
+
L1 to L2 loss. Default: ``1.0`` .
|
|
4694
|
+
|
|
4695
|
+
- Ascend: The value should be equal to or greater than zero.
|
|
4696
|
+
- CPU/GPU: The value should be greater than zero.
|
|
4373
4697
|
reduction (str, optional): Apply specific reduction method to the output: ``'none'`` , ``'mean'`` ,
|
|
4374
4698
|
``'sum'`` . Default: ``'none'`` .
|
|
4375
4699
|
|
|
@@ -4379,14 +4703,15 @@ def smooth_l1_loss(input, target, beta=1.0, reduction='none'):
|
|
|
4379
4703
|
|
|
4380
4704
|
Returns:
|
|
4381
4705
|
Tensor, if `reduction` is ``'none'``, then output is a tensor with the same shape as `input`.
|
|
4382
|
-
Otherwise, the shape of output tensor is :math:`(
|
|
4706
|
+
Otherwise, the shape of output tensor is :math:`()`.
|
|
4383
4707
|
|
|
4384
4708
|
Raises:
|
|
4385
|
-
TypeError: If `
|
|
4386
|
-
|
|
4387
|
-
TypeError: If dtype of `input` or `target` is not one of float16, float32, float64.
|
|
4388
|
-
ValueError: If `beta` is less than or equal to 0.
|
|
4709
|
+
TypeError: If `input` or `target` is not a Tensor.
|
|
4710
|
+
RuntimeError: If dtype of `input` or `target` is not one of float16, float32, float64, bfloat16.
|
|
4389
4711
|
ValueError: If shape of `input` is not the same as `target`.
|
|
4712
|
+
ValueError: If `reduction` is not one of ``'none'``, ``'mean'``, ``'sum'``.
|
|
4713
|
+
TypeError: If `beta` is not a float, int or bool.
|
|
4714
|
+
RuntimeError: If `beta` is less than or equal to 0.
|
|
4390
4715
|
|
|
4391
4716
|
Supported Platforms:
|
|
4392
4717
|
``Ascend`` ``GPU`` ``CPU``
|
|
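The piecewise definition above is easy to check by hand; the NumPy sketch below (an illustration only) shows the quadratic branch below `beta` and the linear branch above it:

    import numpy as np

    def smooth_l1(x, y, beta=1.0):
        diff = np.abs(x - y)
        # quadratic for |x - y| < beta, linear otherwise
        return np.where(diff < beta, 0.5 * diff ** 2 / beta, diff - 0.5 * beta)

    print(smooth_l1(np.array([0.1, 0.5, 3.0]), np.zeros(3)))  # [0.005 0.125 2.5]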
@@ -4961,7 +5286,7 @@ def grid_sample(input, grid, mode='bilinear', padding_mode='zeros', align_corner
|
|
|
4961
5286
|
H_{in}, W_{in})` (5-D case) and dtype of float32 or float64.
|
|
4962
5287
|
grid (Tensor): flow-field with shape of :math:`(N, H_{out}, W_{out}, 2)` (4-D case) or :math:`(N, D_{out},
|
|
4963
5288
|
H_{out}, W_{out}, 3)` (5-D case) and same dtype as `input`.
|
|
4964
|
-
mode (str): An optional string specifying the interpolation method. The optional values are
|
|
5289
|
+
mode (str, optional): An optional string specifying the interpolation method. The optional values are
|
|
4965
5290
|
``'bilinear'``, ``'nearest'``. Default: ``'bilinear'`` . Note: `bicubic` is not supported yet. When
|
|
4966
5291
|
`mode="bilinear"` and the input is 5-D, the interpolation mode used internally will actually
|
|
4967
5292
|
be trilinear. However, when the input is 4-D, the interpolation mode will legitimately be bilinear.
|
|
@@ -4976,9 +5301,10 @@ def grid_sample(input, grid, mode='bilinear', padding_mode='zeros', align_corner
|
|
|
4976
5301
|
It performs bilinear interpolation in the two spatial dimensions and linear interpolation along
|
|
4977
5302
|
the third dimension. It is commonly used for volume or 3D image interpolation.
|
|
4978
5303
|
|
|
4979
|
-
padding_mode (str): An optional string specifying the pad method.
|
|
5304
|
+
padding_mode (str, optional): An optional string specifying the pad method.
|
|
5305
|
+
The optional values are "zeros", "border" or
|
|
4980
5306
|
"reflection". Default: ``'zeros'`` .
|
|
4981
|
-
align_corners (bool): If set to `True`, the extrema (-1 and 1) are considered as referring to
|
|
5307
|
+
align_corners (bool, optional): If set to `True`, the extrema (-1 and 1) are considered as referring to
|
|
4982
5308
|
the center points of the input's corner pixels. If set to `False`, they are instead considered as referring
|
|
4983
5309
|
to the corner points of the input's corner pixels, making the sampling more resolution agnostic. Default:
|
|
4984
5310
|
``False`` .
|
|
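To make the `grid` convention concrete, the sketch below builds an identity flow field in the documented [-1, 1] normalized range; with `align_corners=True` it reproduces the input up to interpolation. This is an assumed usage example constructed from the parameter descriptions above, not taken from the library's test suite:

    import numpy as np
    import mindspore as ms
    from mindspore import Tensor, ops

    N, C, H, W = 1, 1, 4, 4
    x = Tensor(np.arange(N * C * H * W, dtype=np.float32).reshape(N, C, H, W))
    # identity grid: per-pixel (x, y) sampling coordinates, normalized to [-1, 1]
    ys, xs = np.meshgrid(np.linspace(-1, 1, H), np.linspace(-1, 1, W), indexing='ij')
    grid = Tensor(np.stack([xs, ys], axis=-1)[None].astype(np.float32))  # (1, H, W, 2)
    out = ops.grid_sample(x, grid, mode='bilinear', align_corners=True)
    # out equals x: the extrema -1 and 1 refer to the corner pixel centers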
@@ -5389,12 +5715,15 @@ def conv3d_transpose(inputs, weight, pad_mode='valid', padding=0, stride=1, dila
|
|
|
5389
5715
|
Args:
|
|
5390
5716
|
inputs (Tensor): The gradients with respect to the output of the convolution.
|
|
5391
5717
|
The shape conforms to the default.
|
|
5392
|
-
data_format :math:`(N, C_{in}, D_{out}, H_{out}, W_{out})`.
|
|
5393
|
-
|
|
5718
|
+
data_format :math:`(N, C_{in}, D_{out}, H_{out}, W_{out})`.
|
|
5719
|
+
Supported dtypes:
|
|
5720
|
+
|
|
5721
|
+
- Ascend: float16.
|
|
5722
|
+
- GPU/CPU: float16, float32.
|
|
5394
5723
|
weight (Tensor): Set size of kernel is :math:`(K_d, K_h, K_w)`, then the shape is
|
|
5395
5724
|
:math:`(C_{in}, C_{out}//group, K_d, K_h, K_w)`. Where :math:`group` is the Args parameter,
|
|
5396
5725
|
:math:`//` is the symbol for integer division.
|
|
5397
|
-
|
|
5726
|
+
It has the same dtype as `inputs`.
|
|
5398
5727
|
pad_mode (str): Specifies padding mode. The optional values are
|
|
5399
5728
|
"same", "valid", "pad". Default: "valid".
|
|
5400
5729
|
|
|
@@ -5538,9 +5867,9 @@ def conv1d(input, weight, bias=None, stride=1, pad_mode="valid", padding=0, dila
|
|
|
5538
5867
|
|
|
5539
5868
|
The shape of the convolutional kernel is given by :math:`(\text{kernel_size})`,
|
|
5540
5869
|
where :math:`\text{kernel_size}` is the width of the kernel.
|
|
5541
|
-
If we consider the input and output channels as well as the `
|
|
5542
|
-
will be :math:`(C_{out}, C_{in} / \text{
|
|
5543
|
-
where `
|
|
5870
|
+
If we consider the input and output channels as well as the `groups` parameter, the complete kernel shape
|
|
5871
|
+
will be :math:`(C_{out}, C_{in} / \text{groups}, \text{kernel_size})`,
|
|
5872
|
+
where `groups` is the number of groups dividing `x`'s input channel when applying group convolution.
|
|
5544
5873
|
|
|
5545
5874
|
For more details about convolution layer, please refer to `Gradient Based Learning Applied to Document Recognition
|
|
5546
5875
|
<http://vision.stanford.edu/cs598_spring07/papers/Lecun98.pdf>`_
|
|
@@ -5655,7 +5984,7 @@ def conv1d(input, weight, bias=None, stride=1, pad_mode="valid", padding=0, dila
|
|
|
5655
5984
|
|
|
5656
5985
|
def conv2d(input, weight, bias=None, stride=1, pad_mode="valid", padding=0, dilation=1, groups=1):
|
|
5657
5986
|
r"""
|
|
5658
|
-
Applies a 2D convolution over an input tensor. The input
|
|
5987
|
+
Applies a 2D convolution over an input tensor. The input tensor is typically of
|
|
5659
5988
|
shape :math:`(N, C_{in}, H_{in}, W_{in})`, where :math:`N` is batch size, :math:`C` is
|
|
5660
5989
|
channel number, :math:`H` is feature height, :math:`W` is feature width.
|
|
5661
5990
|
|
|
@@ -5690,9 +6019,9 @@ def conv2d(input, weight, bias=None, stride=1, pad_mode="valid", padding=0, dila
|
|
|
5690
6019
|
The shape of the convolutional kernel is given by :math:`(\text{kernel_size[0]}, \text{kernel_size[1]})`,
|
|
5691
6020
|
where :math:`\text{kernel_size[0]}` and :math:`\text{kernel_size[1]}` are the height and width of the kernel,
|
|
5692
6021
|
respectively.
|
|
5693
|
-
If we consider the input and output channels as well as the `
|
|
5694
|
-
will be :math:`(C_{out}, C_{in} / \text{
|
|
5695
|
-
where `
|
|
6022
|
+
If we consider the input and output channels as well as the `groups` parameter, the complete kernel shape
|
|
6023
|
+
will be :math:`(C_{out}, C_{in} / \text{groups}, \text{kernel_size[0]}, \text{kernel_size[1]})`,
|
|
6024
|
+
where `groups` is the number of groups dividing `x`'s input channel when applying group convolution.
|
|
5696
6025
|
|
|
5697
6026
|
For more details about convolution layer, please refer to `Gradient Based Learning Applied to Document Recognition
|
|
5698
6027
|
<http://vision.stanford.edu/cs598_spring07/papers/Lecun98.pdf>`_ and
|
|
@@ -5840,9 +6169,9 @@ def _get_pad_nd_info(pad_l, pad_r):
|
|
|
5840
6169
|
|
|
5841
6170
|
def conv2d_ext(input, weight, bias=None, stride=1, padding=0, dilation=1, groups=1):
|
|
5842
6171
|
r"""
|
|
5843
|
-
Applies a 2D convolution over an input tensor. The input
|
|
5844
|
-
shape :math:`(N, C_{in}, H_{in}, W_{in})
|
|
5845
|
-
channel number, :math:`H` is feature height, :math:`W` is feature width.
|
|
6172
|
+
Applies a 2D convolution over an input tensor. The input tensor is typically of
|
|
6173
|
+
shape :math:`(N, C_{in}, H_{in}, W_{in})` or :math:`(C_{in}, H_{in}, W_{in})`,
|
|
6174
|
+
where :math:`N` is batch size, :math:`C` is channel number, :math:`H` is feature height, :math:`W` is feature width.
|
|
5846
6175
|
|
|
5847
6176
|
The output is calculated based on formula:
|
|
5848
6177
|
|
|
@@ -5855,8 +6184,6 @@ def conv2d_ext(input, weight, bias=None, stride=1, padding=0, dilation=1, groups
|
|
|
5855
6184
|
the `cross-correlation <https://en.wikipedia.org/wiki/Cross-correlation>`_,
|
|
5856
6185
|
, :math:`weight` is the convolution kernel value and :math:`X` represents the input feature map.
|
|
5857
6186
|
|
|
5858
|
-
Here are the indices' meanings:
|
|
5859
|
-
|
|
5860
6187
|
- :math:`i` corresponds to the batch number, the range is :math:`[0, N-1]`,
|
|
5861
6188
|
where :math:`N` is the batch size of the input.
|
|
5862
6189
|
|
|
@@ -5883,12 +6210,11 @@ def conv2d_ext(input, weight, bias=None, stride=1, padding=0, dilation=1, groups
|
|
|
5883
6210
|
<http://vision.stanford.edu/cs598_spring07/papers/Lecun98.pdf>`_ and
|
|
5884
6211
|
`ConvNets <http://cs231n.github.io/convolutional-networks/>`_.
|
|
5885
6212
|
|
|
5886
|
-
|
|
5887
|
-
|
|
5888
|
-
That is, when `groups>1`, condition :math:`C_{in}` = :math:`C_{out}` = `groups` must be satisfied.
|
|
6213
|
+
.. warning::
|
|
6214
|
+
This is an experimental API that is subject to change or deletion.
|
|
5889
6215
|
|
|
5890
6216
|
Args:
|
|
5891
|
-
input (Tensor): Tensor of shape :math:`(N, C_{in}, H_{in}, W_{in})`.
|
|
6217
|
+
input (Tensor): Tensor of shape :math:`(N, C_{in}, H_{in}, W_{in})` or :math:`(C_{in}, H_{in}, W_{in})`.
|
|
5892
6218
|
weight (Tensor): Tensor of shape
|
|
5893
6219
|
:math:`(N, C_{in} / \text{groups}, \text{kernel_size[0]}, \text{kernel_size[1]})`, then the size of kernel
|
|
5894
6220
|
is :math:`(\text{kernel_size[0]}, \text{kernel_size[1]})`.
|
|
@@ -5897,41 +6223,53 @@ def conv2d_ext(input, weight, bias=None, stride=1, padding=0, dilation=1, groups
|
|
|
5897
6223
|
stride (Union(int, tuple[int]), optional): The distance of kernel moving, an int number that represents
|
|
5898
6224
|
the height and width of movement are both strides, or a tuple of two int numbers that
|
|
5899
6225
|
represent height and width of movement respectively. Default: ``1`` .
|
|
5900
|
-
padding (Union
|
|
5901
|
-
|
|
5902
|
-
|
|
5903
|
-
|
|
5904
|
-
|
|
5905
|
-
|
|
5906
|
-
|
|
5907
|
-
|
|
5908
|
-
|
|
5909
|
-
|
|
5910
|
-
|
|
6226
|
+
padding (Union[int, tuple[int], str], optional): The number of padding
|
|
6227
|
+
on the height and width directions of the input.
|
|
6228
|
+
The data type is an integer, a tuple of two integers, or a string in {``"valid"``, ``"same"``}. If `padding` is an
|
|
6229
|
+
integer, then `padding_{H}` and `padding_{W}` are both equal to `padding`.
|
|
6230
|
+
If `padding` is a tuple of 2 integers, then `padding_{H}` and `padding_{W}`
|
|
6231
|
+
are equal to `padding[0]` and `padding[1]` respectively.
|
|
6232
|
+
The value should be greater than or equal to 0. Default: ``0`` .
|
|
6233
|
+
|
|
6234
|
+
- ``"same"``: Pad the input around its edges so that the shape of input and output
|
|
6235
|
+
are the same when `stride` is set to ``1``.
|
|
6236
|
+
The amount of padding is calculated by the operator internally. If the amount is even, it is
|
|
6237
|
+
uniformly distributed around the input; if it is odd, the excess amount goes to the right/bottom side.
|
|
6238
|
+
If this mode is set, `stride` must be 1.
|
|
6239
|
+
|
|
6240
|
+
- ``"valid"``: No padding is applied to the input, and the output returns the maximum
|
|
6241
|
+
possible height and width. Extra pixels that could not complete a full stride will
|
|
6242
|
+
be discarded.
|
|
5911
6243
|
|
|
5912
|
-
If `padding` is one integer, the paddings of top, bottom, left and right are the same, equal to padding.
|
|
5913
|
-
If `padding` is a tuple/list with 2 integers, the padding of top adn bottom is padding[0],
|
|
5914
|
-
and the padding of left and right is padding[1]. Default: ``0`` .
|
|
5915
6244
|
dilation (Union(int, tuple[int]), optional): Gaps between kernel elements. The data type is int or a tuple of
|
|
5916
6245
|
2 integers. Specifies the dilation rate to use for dilated convolution. If set to be :math:`k > 1`,
|
|
5917
6246
|
there will be :math:`k - 1` pixels skipped for each sampling location. Its value must
|
|
5918
6247
|
be greater than or equal to 1 and bounded by the height and width of the input `x`. Default: ``1`` .
|
|
5919
6248
|
groups (int, optional): Splits `input` into groups. Default: ``1`` .
|
|
5920
6249
|
|
|
6250
|
+
- :math:`(C_{in} \text{ % } \text{groups} == 0)` , :math:`(C_{out} \text{ % } \text{groups} == 0)` ,
|
|
6251
|
+
:math:`(C_{out} >= \text{groups})` , :math:`(\text{weight[1]} = C_{in} / \text{groups})`
|
|
6252
|
+
|
|
5921
6253
|
Returns:
|
|
5922
6254
|
Tensor, the value that applied 2D convolution. The shape is :math:`(N, C_{out}, H_{out}, W_{out})`.
|
|
5923
6255
|
To see how different pad modes affect the output shape, please refer to
|
|
5924
|
-
:class:`mindspore.nn.Conv2d` for more details.
|
|
5925
|
-
|
|
6256
|
+
:class:`mindspore.mint.nn.Conv2d` for more details.
|
|
5926
6257
|
|
|
5927
6258
|
Raises:
|
|
5928
|
-
|
|
5929
|
-
|
|
6259
|
+
ValueError: If the arguments and the size of the input feature map do not satisfy the output formula, i.e. the
|
|
6260
|
+
computed size of the output feature map is not positive. For more details on the output
|
|
6261
|
+
formula, please refer to :class:`mindspore.mint.nn.Conv2d`.
|
|
6262
|
+
RuntimeError: On Ascend, due to the limitation of the L1 cache size of different NPU chip, if input size or
|
|
6263
|
+
kernel size is too large, it may trigger an error.
|
|
6264
|
+
TypeError: If `in_channels` , `out_channels` or `groups` is not an int.
|
|
6265
|
+
TypeError: If `kernel_size` , `stride` or `dilation` is neither an int nor a tuple.
|
|
5930
6266
|
TypeError: If `bias` is not a Tensor.
|
|
5931
6267
|
ValueError: If the shape of `bias` is not :math:`(C_{out})` .
|
|
5932
6268
|
ValueError: If `stride` or `dilation` is less than 1.
|
|
5933
|
-
ValueError: If `
|
|
5934
|
-
ValueError:
|
|
6269
|
+
ValueError: If `padding` is ``"same"`` and `stride` is not equal to 1.
|
|
6270
|
+
ValueError: If the input parameters do not satisfy the convolution output formula.
|
|
6271
|
+
ValueError: If the kernel size exceeds the size of the input feature map.
|
|
6272
|
+
ValueError: If the value of `padding` causes the calculation area to exceed the input size.
|
|
5935
6273
|
|
|
5936
6274
|
Supported Platforms:
|
|
5937
6275
|
``Ascend``
|
|
@@ -5939,123 +6277,21 @@ def conv2d_ext(input, weight, bias=None, stride=1, padding=0, dilation=1, groups
|
|
|
5939
6277
|
Examples:
|
|
5940
6278
|
>>> import mindspore
|
|
5941
6279
|
>>> import numpy as np
|
|
5942
|
-
>>> from mindspore import Tensor, ops
|
|
5943
|
-
>>> from mindspore.ops.function.nn_func import conv2d_ext
|
|
6280
|
+
>>> from mindspore import Tensor, ops, mint
|
|
5944
6281
|
>>> x = Tensor(np.ones([10, 32, 32, 32]), mindspore.float32)
|
|
5945
6282
|
>>> weight = Tensor(np.ones([32, 32, 3, 3]), mindspore.float32)
|
|
5946
|
-
>>> output =
|
|
6283
|
+
>>> output = mint.nn.functional.conv2d(x, weight)
|
|
5947
6284
|
>>> print(output.shape)
|
|
5948
6285
|
(10, 32, 30, 30)
|
|
5949
6286
|
"""
|
|
5950
|
-
|
|
5951
|
-
|
|
5952
|
-
""" convolution when mode is 'same' """
|
|
5953
|
-
if isinstance(dilation, int):
|
|
5954
|
-
dilation = (dilation,) * 2
|
|
5955
|
-
validator.check_int(len(weight.shape), 4, validator.EQ, "weight.shape", 'conv2d')
|
|
5956
|
-
validator.check_int(len(dilation), 2, validator.EQ, "dilation", 'conv2d')
|
|
5957
|
-
|
|
5958
|
-
# Calc padding info
|
|
5959
|
-
need_pad_nd, pad_l, pad_r = _get_pad_info(dilation, weight)
|
|
5960
|
-
if not need_pad_nd:
|
|
5961
|
-
conv = _get_cache_prim(Convolution)(stride, pad_l, dilation, False, (0, 0), groups)
|
|
5962
|
-
return conv(input, weight, bias)
|
|
5963
|
-
|
|
5964
|
-
# Calc pad nd info
|
|
5965
|
-
pad_nd, pad_l = _get_pad_nd_info(pad_l, pad_r)
|
|
5966
|
-
pad_nd_op = _get_cache_prim(ConstantPadND)()
|
|
5967
|
-
padded_input = pad_nd_op(input, pad_nd, 0)
|
|
5968
|
-
conv = _get_cache_prim(Convolution)(stride, pad_l, dilation, False, (0, 0), groups)
|
|
5969
|
-
return conv(padded_input, weight, bias)
|
|
5970
|
-
|
|
5971
|
-
if isinstance(padding, int):
|
|
5972
|
-
padding = (padding,) * 2
|
|
5973
|
-
|
|
5974
|
-
if isinstance(padding, (tuple, list)):
|
|
5975
|
-
conv = _get_cache_prim(Convolution)(stride, padding, dilation, False, (0, 0), groups)
|
|
5976
|
-
return conv(input, weight, bias)
|
|
6287
|
+
if isinstance(padding, (int, tuple, list)):
|
|
6288
|
+
return conv2d_ext_op(input, weight, bias, stride, padding, dilation, groups)
|
|
5977
6289
|
if isinstance(padding, str):
|
|
5978
|
-
|
|
5979
|
-
conv = _get_cache_prim(Convolution)(stride, (0, 0), dilation, False, (0, 0), groups)
|
|
5980
|
-
return conv(input, weight, bias)
|
|
5981
|
-
if padding == 'same':
|
|
5982
|
-
_check_stride_when_same_mode(stride)
|
|
5983
|
-
return _convolution_same(input, weight, bias, dilation, groups)
|
|
5984
|
-
raise ValueError(f"For conv2d, the parameter 'padding' must be 'same' or 'valid' when " \
|
|
5985
|
-
f"the type of 'padding' is string.")
|
|
6290
|
+
return conv2d_padding_op(input, weight, bias, stride, padding, dilation, groups)
|
|
5986
6291
|
raise TypeError(f"For conv2d, the parameter 'padding' must be a tuple/list " \
|
|
5987
6292
|
f"or a string, but got {type(padding)}")
|
|
5988
6293
|
|
|
5989
6294
|
|
|
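A short usage sketch of the two `padding` forms dispatched above (integer vs. string), assuming the `mint.nn.functional.conv2d` entry point shown in the Examples and an Ascend device, since that is the only listed platform:

    import numpy as np
    import mindspore
    from mindspore import Tensor, mint

    x = Tensor(np.ones([1, 8, 16, 16]), mindspore.float32)
    w = Tensor(np.ones([8, 8, 3, 3]), mindspore.float32)
    y1 = mint.nn.functional.conv2d(x, w, padding=1)       # explicit 1-pixel pad
    y2 = mint.nn.functional.conv2d(x, w, padding="same")  # stride must stay 1
    print(y1.shape, y2.shape)  # both (1, 8, 16, 16): 16 + 2*1 - 3 + 1 = 16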
5990
|
-
def conv_transpose2d(input, weight, bias=None, stride=1, padding=0, output_padding=0, groups=1, dilation=1):
|
|
5991
|
-
r"""
|
|
5992
|
-
Calculates a 2D transposed convolution, which can be regarded as Conv2d for the gradient of the input,
|
|
5993
|
-
also called deconvolution (although it is not an actual deconvolution).
|
|
5994
|
-
|
|
5995
|
-
The input is typically of shape :math:`(N, C_{in}, H_{in}, W_{in})`,
|
|
5996
|
-
where :math:`N` is batch size, :math:`C_{in}` is space dimension,
|
|
5997
|
-
:math:`H_{in}, W_{in}` are the height and width of the feature layer respectively.
|
|
5998
|
-
|
|
5999
|
-
When Conv2d and Conv2dTranspose are initialized with the same parameters, and `pad_mode` is set to 'pad',
|
|
6000
|
-
:math:`dilation * (kernel\_size - 1) - padding` amount of zero will be paded to the height and width
|
|
6001
|
-
directions of the input, they are inverses of each other in regard to the input and output shapes in this case.
|
|
6002
|
-
However, when `stride` > 1, Conv2d maps multiple input shapes to the same output shape. Deconvolutional network
|
|
6003
|
-
can refer to `Deconvolutional Networks <https://www.matthewzeiler.com/mattzeiler/deconvolutionalnetworks.pdf>`_.
|
|
6004
|
-
|
|
6005
|
-
Args:
|
|
6006
|
-
input (Tensor): Tensor of shape :math:`(N, C_{in}, H_{in}, W_{in})`.
|
|
6007
|
-
weight (Tensor): Tensor of shape
|
|
6008
|
-
:math:`(N, C_{in} / \text{groups}, \text{kernel_size[0]}, \text{kernel_size[1]})`, then the size of kernel
|
|
6009
|
-
is :math:`(\text{kernel_size[0]}, \text{kernel_size[1]})`.
|
|
6010
|
-
bias (Tensor, optional): Bias Tensor with shape :math:`(C_{out})`.
|
|
6011
|
-
When bias is ``None`` , zeros will be used. Default: ``None`` .
|
|
6012
|
-
stride (Union(int, tuple[int]), optional): The distance of kernel moving, an int number that represents
|
|
6013
|
-
the height and width of movement are both strides, or a tuple of two int numbers that
|
|
6014
|
-
represent height and width of movement respectively. Default: ``1`` .
|
|
6015
|
-
padding (Union(int, tuple[int], list[int]), optional): Implicit paddings on both sides of the input `x`.
|
|
6016
|
-
Can be an integer or a tuple/list with 2 integers.
|
|
6017
|
-
output_padding (Union[int, tuple[int]]): The number of padding on the height and width directions of the output.
|
|
6018
|
-
The data type is an integer or a tuple of two integers. If `output_padding` is an integer,
|
|
6019
|
-
then the bottom and right padding are all equal to `output_padding`. If `output_padding` is a tuple of
|
|
6020
|
-
2 integers, then the bottom and right padding is equal to `output_padding[0]`, `output_padding[1]`
|
|
6021
|
-
respectively.
|
|
6022
|
-
groups (int, optional): Splits `input` into groups. Default: ``1`` .
|
|
6023
|
-
dilation (Union(int, tuple[int]), optional): Gaps between kernel elements.The data type is int or a tuple of
|
|
6024
|
-
2 integers. Specifies the dilation rate to use for dilated convolution. If set to be :math:`k > 1`,
|
|
6025
|
-
there will be :math:`k - 1` pixels skipped for each sampling location. Its value must
|
|
6026
|
-
be greater than or equal to 1 and bounded by the height and width of the input `x`. Default: ``1`` .
|
|
6027
|
-
|
|
6028
|
-
Returns:
|
|
6029
|
-
Tensor, the value that applied 2D convolution. The shape is :math:`(N, C_{out}, H_{out}, W_{out})`.
|
|
6030
|
-
To see how different pad modes affect the output shape, please refer to
|
|
6031
|
-
:class:`mindspore.nn.Conv2dTranspose` for more details.
|
|
6032
|
-
|
|
6033
|
-
|
|
6034
|
-
Raises:
|
|
6035
|
-
TypeError: If `stride`, `padding` or `dilation` is neither an int nor a tuple.
|
|
6036
|
-
TypeError: `groups` is not an int.
|
|
6037
|
-
TypeError: If `bias` is not a Tensor.
|
|
6038
|
-
ValueError: If the shape of `bias` is not :math:`(C_{out})` .
|
|
6039
|
-
ValueError: If `stride` or `dilation` is less than 1.
|
|
6040
|
-
ValueError: If `padding` is a tuple/list whose length is not equal to 2.
|
|
6041
|
-
|
|
6042
|
-
Supported Platforms:
|
|
6043
|
-
``Ascend``
|
|
6044
|
-
|
|
6045
|
-
Examples:
|
|
6046
|
-
>>> import mindspore
|
|
6047
|
-
>>> import numpy as np
|
|
6048
|
-
>>> from mindspore import Tensor, ops
|
|
6049
|
-
>>> x = Tensor(np.ones([1, 6, 32, 32]), mindspore.float32)
|
|
6050
|
-
>>> weight = Tensor(np.ones([6, 3, 5, 5]), mindspore.float32)
|
|
6051
|
-
>>> output = ops.conv_transpose2d(x, weight)
|
|
6052
|
-
>>> print(output.shape)
|
|
6053
|
-
(1, 3, 36, 36)
|
|
6054
|
-
"""
|
|
6055
|
-
conv = _get_cache_prim(Convolution)(stride, padding, dilation, True, output_padding, groups)
|
|
6056
|
-
return conv(input, weight, bias)
|
|
6057
|
-
|
|
6058
|
-
|
|
6059
6295
|
def hardtanh(input, min_val=-1.0, max_val=1.0):
|
|
6060
6296
|
r"""
|
|
6061
6297
|
Applies the hardtanh activation function element-wise. The activation function is defined as:
|
|
@@ -6742,10 +6978,10 @@ def conv3d(input, weight, bias=None, stride=1, pad_mode="valid", padding=0, dila
|
|
|
6742
6978
|
:math:`(\text{kernel_size[0]}, \text{kernel_size[1]}, \text{kernel_size[2]})`
|
|
6743
6979
|
where :math:`\text{kernel_size[0]}` , :math:`\text{kernel_size[1]}` and :math:`\text{kernel_size[2]}` are the depth,
|
|
6744
6980
|
height and width of the kernel, respectively.
|
|
6745
|
-
If we consider the input and output channels as well as the `
|
|
6746
|
-
will be :math:`(C_{out}, C_{in} / \text{
|
|
6981
|
+
If we consider the input and output channels as well as the `groups` parameter, the complete kernel shape
|
|
6982
|
+
will be :math:`(C_{out}, C_{in} / \text{groups}, \text{kernel_size[0]},
|
|
6747
6983
|
\text{kernel_size[1]}, \text{kernel_size[2]})`,
|
|
6748
|
-
where `
|
|
6984
|
+
where `groups` is the number of groups dividing `x`'s input channel when applying group convolution.
|
|
6749
6985
|
|
|
6750
6986
|
For more details about convolution layer, please refer to `Gradient Based Learning Applied to Document Recognition
|
|
6751
6987
|
<http://vision.stanford.edu/cs598_spring07/papers/Lecun98.pdf>`_.
|
|
@@ -6882,6 +7118,146 @@ def conv3d(input, weight, bias=None, stride=1, pad_mode="valid", padding=0, dila
|
|
|
6882
7118
|
return output
|
|
6883
7119
|
|
|
6884
7120
|
|
|
7121
|
+
def conv3d_ext(input, weight, bias=None, stride=1, padding=0, dilation=1, groups=1):
|
|
7122
|
+
r"""
|
|
7123
|
+
Applies a 3D convolution over an input tensor. The input tensor is typically of
|
|
7124
|
+
shape :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})` or :math:`(C_{in}, D_{in}, H_{in}, W_{in})`,
|
|
7125
|
+
where :math:`N` is batch size, :math:`C` is channel number, :math:`D, H, W` are the depth,
|
|
7126
|
+
height and width of the feature map, respectively.
|
|
7127
|
+
|
|
7128
|
+
The output is calculated based on formula:
|
|
7129
|
+
|
|
7130
|
+
.. math::
|
|
7131
|
+
|
|
7132
|
+
\text{out}(N_i, C_{\text{out}_j}) = \text{bias}(C_{\text{out}_j}) +
|
|
7133
|
+
\sum_{k = 0}^{C_{in} - 1} \text{ccor}({\text{weight}(C_{\text{out}_j}, k), \text{X}(N_i, k)})
|
|
7134
|
+
|
|
7135
|
+
where :math:`bias` is the output channel bias, :math:`ccor` is
|
|
7136
|
+
the `cross-correlation <https://en.wikipedia.org/wiki/Cross-correlation>`_
|
|
7137
|
+
, :math:`weight` is the convolution kernel value and :math:`X` represents the input feature map.
|
|
7138
|
+
|
|
7139
|
+
Here are the indices' meanings:
|
|
7140
|
+
|
|
7141
|
+
- :math:`i` corresponds to the batch number, the range is :math:`[0, N-1]`,
|
|
7142
|
+
where :math:`N` is the batch size of the input.
|
|
7143
|
+
|
|
7144
|
+
- :math:`j` corresponds to the output channel, the range is :math:`[0, C_{out}-1]`,
|
|
7145
|
+
where :math:`C_{out}` is the number of
|
|
7146
|
+
output channels, which is also equal to the number of kernels.
|
|
7147
|
+
|
|
7148
|
+
- :math:`k` corresponds to the input channel, the range is :math:`[0, C_{in}-1]`,
|
|
7149
|
+
where :math:`C_{in}` is the number of
|
|
7150
|
+
input channels, which is also equal to the number of channels in the convolutional kernels.
|
|
7151
|
+
|
|
7152
|
+
Therefore, in the above formula, :math:`{bias}(C_{\text{out}_j})` represents the bias of the :math:`j`-th
|
|
7153
|
+
output channel, :math:`{weight}(C_{\text{out}_j}, k)` represents the slice of the :math:`j`-th convolutional
|
|
7154
|
+
kernel in the :math:`k`-th channel, and :math:`{X}(N_i, k)` represents the slice of the :math:`k`-th input
|
|
7155
|
+
channel in the :math:`i`-th batch of the input feature map.
|
|
7156
|
+
|
|
7157
|
+
The shape of the convolutional kernel is given by :math:`(kd, kh, kw)` where :math:`kd` , :math:`kh` and\
|
|
7158
|
+
:math:`kw` are the depth, height and width of the kernel, respectively.
|
|
7159
|
+
If we consider the input and output channels as well as the `groups` parameter, the complete kernel shape
|
|
7160
|
+
will be :math:`(C_{out}, C_{in} / \text{groups}, kd, kh, kw)`,
|
|
7161
|
+
where `groups` is the number of groups dividing `x`'s input channel when applying group convolution.
|
|
7162
|
+
|
|
7163
|
+
For more details about convolution layer, please refer to `Gradient Based Learning Applied to Document Recognition
|
|
7164
|
+
<http://vision.stanford.edu/cs598_spring07/papers/Lecun98.pdf>`_.
|
|
7165
|
+
|
|
7166
|
+
The following lists some of the limitations of the parameters.
|
|
7167
|
+
|
|
7168
|
+
- input -- The input to the conv3d. The input must have each dimension size within the range [1, int32_max].
|
|
7169
|
+
- weight -- Filters of shape :math:`(C_{out}, C_{in} / groups, kd, kh, kw)`. The value of :math:`kh`
|
|
7170
|
+
and :math:`kw` is in the range [1, 511]. The remaining values are in the range [1, int32_max].
|
|
7171
|
+
And :math:`kh*kw*k0` must be less than 65536 (k0 is 16; if the data type is float32, k0 is 8).
|
|
7172
|
+
- bias -- Bias Tensor with shape :math:`(C_{out})`. The shape must equal the first dimension of the weight.
|
|
7173
|
+
- stride -- The distance of kernel moving. It can be an int number or
|
|
7174
|
+
tuple (noted by :math:`(stride_d, stride_h, stride_w)`). stride_h and stride_w are in the range [1, 63].
|
|
7175
|
+
stride_d is in the range [1, 255].
|
|
7176
|
+
- padding -- If padding is an int number, it is in the range [0, 255].
|
|
7177
|
+
- dilation -- The value is in the range [1, 255].
|
|
7178
|
+
- groups -- The value is in the range [1, 65535].
|
|
7179
|
+
- :math:`C_{in} \% \text{groups} == 0 \quad \text{and} \quad C_{out} \% \text{groups} == 0` .
|
|
7180
|
+
- :math:`weight[1] == C_{in} / groups` .
|
|
7181
|
+
- :math:`H_{in} + PadUp + PadDown >= (kh - 1) * DilationH + 1` .
|
|
7182
|
+
- :math:`W_{in} + PadLeft + PadRight >= (kw - 1) * DilationW + 1` .
|
|
7183
|
+
- :math:`D_{in} + PadFront + PadBack >= (kd - 1) * DilationD + 1` .
|
|
7184
|
+
- :math:`H_{out} = (H_{in} + PadUp + PadDown - ((kh - 1) * DilationH + 1)) / StrideH + 1` .
|
|
7185
|
+
- :math:`W_{out} = (W_{in} + PadLeft + PadRight - ((kw - 1) * DilationW + 1)) / StrideW + 1` .
|
|
7186
|
+
- :math:`D_{out} = (D_{in} + PadFront + PadBack - ((kd - 1) * DilationD + 1)) / StrideD + 1` .
|
|
7187
|
+
- :math:`(D_{in}+PadFront+PadBack - ((kd-1)*DilationD+1)) \% StrideD <= PadBack` .
|
|
7188
|
+
- :math:`(H_{in}+PadUp+PadDown - ((kh-1)*DilationH+1)) \% StrideH <= PadDown` .
|
|
7189
|
+
- :math:`stride_d <= kernel_d` .
|
|
7190
|
+
- :math:`PadUp < kh` and :math:`PadDown < kh` . When `padding` = ``'valid'``, both PadUp and PadDown are zeros.
|
|
7191
|
+
When `padding` = ``'same'``, pad can be calculated by
|
|
7192
|
+
:math:`floor(((H_{out}-1) * strideH + (kh - 1) * DilationH + 1 - H_{in}) / 2)` for the height dimension.
|
|
7193
|
+
The padding for the depth and width dimensions is calculated in a similar way, and the depth and width
|
|
7194
|
+
dimensions also have the same constraints.
|
|
7195
|
+
- :math:`((kh - 1) * DilationH - PadUp)` should be in [0, 255]. It is the same constraint for depth
|
|
7196
|
+
and width dimension.
|
|
7197
|
+
- If `padding` is ``'same'``, `stride` must be 1.
|
|
7198
|
+
|
|
7199
|
+
.. warning::
|
|
7200
|
+
This API does not support Atlas series products.
|
|
7201
|
+
This is an experimental API that is subject to change or deletion.
|
|
7202
|
+
|
|
7203
|
+
Args:
|
|
7204
|
+
input (Tensor): Tensor of shape :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})` or :math:`(C_{in}, D_{in}, H_{in}, W_{in})`.
|
|
7205
|
+
weight (Tensor): Set size of kernel is :math:`(kd, kh,
|
|
7206
|
+
kw)`, then the shape is :math:`(C_{out}, C_{in} / groups, kd, kh, kw)`.
|
|
7207
|
+
bias (Tensor, optional): Bias Tensor with shape :math:`(C_{out})`.
|
|
7208
|
+
When bias is ``None`` , zeros will be used. Default: ``None`` .
|
|
7209
|
+
stride (Union(int, tuple[int]), optional): The distance of kernel moving, an int number that represents
|
|
7210
|
+
the depth, height and width of movement are all strides, or a tuple of three int numbers that
|
|
7211
|
+
represent the depth, height and width of movement respectively. Default: ``1`` .
|
|
7212
|
+
padding (Union(int, tuple[int], str), optional): Implicit paddings on both sides of the input `x`.
|
|
7213
|
+
Can be a string, one integer or a tuple/list with 3 integers.
|
|
7214
|
+
If `padding` is a string, the optional values are ``"same"`` , ``"valid"``.
|
|
7215
|
+
|
|
7216
|
+
- same: Adopts the way of completion. The height and width of the output will be equal to
|
|
7217
|
+
the input `x` divided by stride. The padding will be evenly calculated in top and bottom,
|
|
7218
|
+
left and right as far as possible. Otherwise, the last extra padding will be calculated from the bottom
|
|
7219
|
+
and the right side. If this mode is set, `stride` must be 1.
|
|
7220
|
+
|
|
7221
|
+
- valid: Adopts the way of discarding. The possible largest height and width of output will be returned
|
|
7222
|
+
without padding. Extra pixels will be discarded.
|
|
7223
|
+
|
|
7224
|
+
If `padding` is one integer, the paddings of head, tail, top, bottom, left and right are the same, equal to padding.
|
|
7225
|
+
If `padding` is a tuple/list with 3 integers, the padding of head, tail, top, bottom,
|
|
7226
|
+
left and right equal to pad[0], pad[0], pad[1], pad[1], pad[2] and pad[2] correspondingly. Default: ``0`` .
|
|
7227
|
+
dilation (Union[int, tuple[int]], optional): Controlling the space between the kernel points. Default: ``1`` .
|
|
7228
|
+
groups (int, optional): Splits `input` into groups. Default: ``1`` .
|
|
7229
|
+
|
|
7230
|
+
Returns:
|
|
7231
|
+
Tensor, the same dtype as the `input`, with the shape :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})`
|
|
7232
|
+
or :math:`(C_{out}, D_{out}, H_{out}, W_{out})`.
|
|
7233
|
+
|
|
7234
|
+
Raises:
|
|
7235
|
+
TypeError: If `stride`, `padding` or `dilation` is neither an int nor a tuple.
|
|
7236
|
+
TypeError: If `groups` is not an int.
|
|
7237
|
+
TypeError: If `bias` is not a Tensor.
|
|
7238
|
+
|
|
7239
|
+
Supported Platforms:
|
|
7240
|
+
``Ascend``
|
|
7241
|
+
|
|
7242
|
+
Examples:
|
|
7243
|
+
>>> import mindspore
|
|
7244
|
+
>>> import numpy as np
|
|
7245
|
+
>>> from mindspore import mint
|
|
7246
|
+
>>> x = mindspore.Tensor(np.random.randn(12, 1, 60, 50, 8), mindspore.float16)
|
|
7247
|
+
>>> w = mindspore.Tensor(np.random.randn(26, 1, 2, 4, 4), mindspore.float16)
|
|
7248
|
+
>>> out = mint.nn.functional.conv3d(x, w)
|
|
7249
|
+
>>> print(out.shape)
|
|
7250
|
+
(12, 26, 59, 47, 5)
|
|
7251
|
+
"""
|
|
7252
|
+
|
|
7253
|
+
if isinstance(padding, (tuple, list, int)):
|
|
7254
|
+
return conv3d_ext_op(input, weight, bias, stride, padding, dilation, groups)
|
|
7255
|
+
if isinstance(padding, str):
|
|
7256
|
+
return conv3d_padding_op(input, weight, bias, stride, padding, dilation, groups)
|
|
7257
|
+
raise TypeError(f"For conv3d, the parameter 'padding' must be a tuple/list " \
|
|
7258
|
+
f"or a string, but got {type(padding)}")
|
|
7259
|
+
|
|
7260
|
+
|
|
6885
7261
|
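The output-size formulas in the conv3d docstring above can be checked against its own example; a pure-Python sketch, no device required:

    def out_size(n, k, pad=0, dilation=1, stride=1):
        # L_out = (L_in + 2*pad - ((k - 1)*dilation + 1)) // stride + 1
        return (n + 2 * pad - ((k - 1) * dilation + 1)) // stride + 1

    # input (12, 1, 60, 50, 8) with kernel (2, 4, 4), stride 1, no padding:
    print(out_size(60, 2), out_size(50, 4), out_size(8, 4))  # 59 47 5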
@_primexpr
|
|
6886
7262
|
def _check_positive_int(arg_value, arg_name=None, prim_name=None):
|
|
6887
7263
|
validator.check_positive_int(arg_value, arg_name=arg_name, prim_name=prim_name)
|
|
@@ -7062,6 +7438,50 @@ def glu(x, axis=-1):
|
|
|
7062
7438
|
return x * y
|
|
7063
7439
|
|
|
7064
7440
|
|
|
7441
|
+
def glu_ext(input, dim=-1):
|
|
7442
|
+
r"""
|
|
7443
|
+
Computes GLU (Gated Linear Unit activation function) of the input tensor.
|
|
7444
|
+
|
|
7445
|
+
.. math::
|
|
7446
|
+
{GLU}(a, b)= a \otimes \sigma(b)
|
|
7447
|
+
|
|
7448
|
+
where :math:`a` is the first half of the `input` Tensor after `input` is split and :math:`b` is the second half.
|
|
7449
|
+
|
|
7450
|
+
Here :math:`\sigma` is the sigmoid function, and :math:`\otimes` is the Hadamard product.
|
|
7451
|
+
See `Language Modeling with Gated Convolutional Networks <https://arxiv.org/abs/1612.08083>`_.
|
|
7452
|
+
|
|
7453
|
+
.. warning::
|
|
7454
|
+
This is an experimental API that is subject to change or deletion.
|
|
7455
|
+
|
|
7456
|
+
Args:
|
|
7457
|
+
input (Tensor): Tensor to be calculated. Dtype is floating point and the shape
|
|
7458
|
+
is :math:`(\ast_1, N, \ast_2)` where `*` means, any number of additional dimensions. :math:`N`
|
|
7459
|
+
is required to be an even number, where :math:`N` is the size of `input` on the dimension
|
|
7460
|
+
selected by `dim`.
|
|
7461
|
+
dim (int, optional): The dimension to split the input `input`. The value range is `[-r, r)` where `r`
|
|
7462
|
+
is the number of dimensions of `input`. Default: ``-1`` , the last dimension in `input`.
|
|
7463
|
+
|
|
7464
|
+
Returns:
Tensor, the same dtype as the `input`, with the shape :math:`(\ast_1, M, \ast_2)` where :math:`M=N/2`.

Raises:
|
|
7465
|
+
TypeError: If `input` is not a Tensor or `dim` is not an int.
|
|
7466
|
+
IndexError: If the value of `dim` is out of the range of `[-r, r)`, where `r` is the number
|
|
7467
|
+
of dimensions of `input`.
|
|
7468
|
+
RuntimeError: If dtype of `input` is not supported.
|
|
7469
|
+
RuntimeError: If the length of `input` in the dimension selected by `dim` is not even.
|
|
7470
|
+
|
|
7471
|
+
Supported Platforms:
|
|
7472
|
+
``Ascend`` ``CPU``
|
|
7473
|
+
|
|
7474
|
+
Examples:
|
|
7475
|
+
>>> from mindspore import Tensor, mint
|
|
7476
|
+
>>> input = Tensor([[0.1,0.2,0.3,0.4],[0.5,0.6,0.7,0.8]])
|
|
7477
|
+
>>> output = mint.nn.functional.glu(input)
|
|
7478
|
+
>>> print(output)
|
|
7479
|
+
[[0.05744425 0.11973753]
|
|
7480
|
+
[0.33409387 0.41398472]]
|
|
7481
|
+
"""
|
|
7482
|
+
return _get_cache_prim(P.GLU)(axis=dim)(input)
|
|
7483
|
+
|
|
7484
|
+
|
|
7065
7485
|
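The printed values in the glu example above can be reproduced by hand, splitting along the last dimension and gating with the sigmoid (NumPy sketch, illustration only):

    import numpy as np

    x = np.array([[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8]])
    a, b = np.split(x, 2, axis=-1)         # first and second halves along dim=-1
    out = a * (1.0 / (1.0 + np.exp(-b)))   # GLU(a, b) = a * sigmoid(b)
    print(out)  # matches the mint.nn.functional.glu output above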
def multi_margin_loss(input, target, p=1, margin=1, weight=None, reduction='mean'):
|
|
7066
7486
|
r"""
|
|
7067
7487
|
Hinge loss for optimizing a multi-class classification.
|
|
@@ -7291,7 +7711,8 @@ def gelu(input, approximate='none'):
|
|
|
7291
7711
|
|
|
7292
7712
|
Args:
|
|
7293
7713
|
input (Tensor): The input of the activation function GeLU, the data type is float16, float32 or float64.
|
|
7294
|
-
approximate (str): the gelu approximation algorithm to use.
|
|
7714
|
+
approximate (str, optional): the gelu approximation algorithm to use.
|
|
7715
|
+
Acceptable values are ``'none'`` and ``'tanh'`` .
|
|
7295
7716
|
Default: ``'none'`` .
|
|
7296
7717
|
|
|
7297
7718
|
Returns:
|
|
@@ -7309,7 +7730,7 @@ def gelu(input, approximate='none'):
|
|
|
7309
7730
|
>>> import mindspore
|
|
7310
7731
|
>>> from mindspore import Tensor, ops
|
|
7311
7732
|
>>> x = Tensor([1.0, 2.0, 3.0], mindspore.float32)
|
|
7312
|
-
>>> result = ops.gelu(x)
|
|
7733
|
+
>>> result = ops.gelu(x, approximate='none')
|
|
7313
7734
|
>>> print(result)
|
|
7314
7735
|
[0.8413447 1.9544997 2.9959505]
|
|
7315
7736
|
"""
|
|
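The two `approximate` modes correspond to the exact erf form and the widely used tanh approximation; the closed forms below are the standard ones, not code lifted from the implementation:

    import math

    def gelu_exact(x):       # approximate='none': x * Phi(x)
        return 0.5 * x * (1.0 + math.erf(x / math.sqrt(2.0)))

    def gelu_tanh(x):        # approximate='tanh'
        return 0.5 * x * (1.0 + math.tanh(math.sqrt(2.0 / math.pi)
                                          * (x + 0.044715 * x ** 3)))

    print([round(gelu_exact(v), 7) for v in (1.0, 2.0, 3.0)])
    # [0.8413447, 1.9544997, 2.9959503]; the float32 example prints 2.9959505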
@@ -7334,7 +7755,8 @@ def gelu(input, approximate='none'):
|
|
|
7334
7755
|
def channel_shuffle(x, groups):
|
|
7335
7756
|
r"""
|
|
7336
7757
|
Divide the channels in a tensor of shape :math:`(*, C, H, W)` into :math:`g` groups and
|
|
7337
|
-
rearrange them as :math:`(*, \frac{C}{g}, g, H*W)`, while
|
|
7758
|
+
rearrange them as :math:`(*, \frac{C}{g}, g, H*W)`, while retaining the original tensor
|
|
7759
|
+
shape in the final output.
|
|
7338
7760
|
|
|
7339
7761
|
Args:
|
|
7340
7762
|
x (Tensor): Tensor to be divided, it has shape :math:`(*, C, H, W)`,
|
|
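Following the `channel_shuffle` hunk above, the rearrangement is the usual reshape-transpose-reshape; a NumPy sketch of the equivalent index permutation, assuming `g` divides `C` (illustration only):

    import numpy as np

    def channel_shuffle_np(x, g):
        n, c, h, w = x.shape
        # (N, C, H, W) -> (N, g, C//g, H, W) -> swap the group axes -> flatten back
        return x.reshape(n, g, c // g, h, w).transpose(0, 2, 1, 3, 4).reshape(n, c, h, w)

    x = np.arange(1 * 4 * 2 * 2).reshape(1, 4, 2, 2)
    print(channel_shuffle_np(x, 2)[0, :, 0, 0])  # [ 0  8  4 12]: channels 0, 2, 1, 3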
@@ -7550,6 +7972,96 @@ def lp_pool2d(x, norm_type, kernel_size, stride=None, ceil_mode=False):
|
|
|
7550
7972
|
return ((sign(out) * ops.relu(ops.abs(out))) * (kernel_size[0] * kernel_size[1])).pow(1.0 / norm_type)
|
|
7551
7973
|
|
|
7552
7974
|
|
|
7975
|
+
def relu(input, inplace=False):
|
|
7976
|
+
r"""
|
|
7977
|
+
Computes ReLU (Rectified Linear Unit activation function) of input tensors element-wise.
|
|
7978
|
+
|
|
7979
|
+
It returns :math:`\max(input,\ 0)` element-wise. Specifically, the neurons with negative output
|
|
7980
|
+
will be suppressed and the active neurons will stay the same.
|
|
7981
|
+
|
|
7982
|
+
.. math::
|
|
7983
|
+
|
|
7984
|
+
ReLU(input) = (input)^+ = \max(0, input)
|
|
7985
|
+
|
|
7986
|
+
ReLU Activation Function Graph:
|
|
7987
|
+
|
|
7988
|
+
.. image:: ../images/ReLU.png
|
|
7989
|
+
:align: center
|
|
7990
|
+
|
|
7991
|
+
Args:
|
|
7992
|
+
input (Tensor): The input Tensor.
|
|
7993
|
+
inplace (bool, optional): Whether to apply the operation in-place. Default: ``False`` .
|
|
7994
|
+
|
|
7995
|
+
Returns:
|
|
7996
|
+
Tensor, with the same dtype and shape as the `input`.
|
|
7997
|
+
|
|
7998
|
+
Raises:
|
|
7999
|
+
TypeError: If dtype of `input` is not Number type.
|
|
8000
|
+
TypeError: If `input` is not a Tensor.
|
|
8001
|
+
|
|
8002
|
+
Supported Platforms:
|
|
8003
|
+
``Ascend`` ``GPU`` ``CPU``
|
|
8004
|
+
|
|
8005
|
+
Examples:
|
|
8006
|
+
>>> import mindspore
|
|
8007
|
+
>>> import numpy as np
|
|
8008
|
+
>>> from mindspore import Tensor, ops
|
|
8009
|
+
>>> input = Tensor(np.array([[-1.0, 4.0, -8.0], [2.0, -5.0, 9.0]]), mindspore.float32)
|
|
8010
|
+
>>> output = ops.relu(input)
|
|
8011
|
+
>>> print(output)
|
|
8012
|
+
[[0. 4. 0.]
|
|
8013
|
+
[2. 0. 9.]]
|
|
8014
|
+
"""
|
|
8015
|
+
if inplace:
|
|
8016
|
+
return inplace_relu_op(input)
|
|
8017
|
+
return relu_op(input)
|
|
8018
|
+
|
|
8019
|
+
|
|
8020
|
+
def relu_(input):
|
|
8021
|
+
r"""
|
|
8022
|
+
Computes ReLU (Rectified Linear Unit activation function) of input tensors element-wise, in-place.
|
|
8023
|
+
|
|
8024
|
+
It returns :math:`\max(input,\ 0)` element-wise. Specifically, the neurons with negative output
|
|
8025
|
+
will be suppressed and the active neurons will stay the same.
|
|
8026
|
+
|
|
8027
|
+
.. math::
|
|
8028
|
+
|
|
8029
|
+
ReLU(input) = (input)^+ = \max(0, input)
|
|
8030
|
+
|
|
8031
|
+
ReLU Activation Function Graph:
|
|
8032
|
+
|
|
8033
|
+
.. image:: ../images/ReLU.png
|
|
8034
|
+
:align: center
|
|
8035
|
+
|
|
8036
|
+
.. warning::
|
|
8037
|
+
This is an experimental API that is subject to change or deletion.
|
|
8038
|
+
|
|
8039
|
+
Args:
|
|
8040
|
+
input (Tensor): The input Tensor.
|
|
8041
|
+
|
|
8042
|
+
Returns:
|
|
8043
|
+
Tensor, with the same dtype and shape as the `input`.
|
|
8044
|
+
|
|
8045
|
+
Raises:
|
|
8046
|
+
TypeError: If dtype of `input` is not Number type.
|
|
8047
|
+
TypeError: If `input` is not a Tensor.
|
|
8048
|
+
|
|
8049
|
+
Supported Platforms:
|
|
8050
|
+
``Ascend``
|
|
8051
|
+
|
|
8052
|
+
Examples:
|
|
8053
|
+
>>> import mindspore
|
|
8054
|
+
>>> import numpy as np
|
|
8055
|
+
>>> from mindspore import Tensor, ops
|
|
8056
|
+
>>> input = Tensor(np.array([[-1.0, 4.0, -8.0], [2.0, -5.0, 9.0]]), mindspore.float32)
|
|
8057
|
+
>>> ops.relu_(input)
|
|
8058
|
+
>>> print(input)
|
|
8059
|
+
[[0. 4. 0.]
|
|
8060
|
+
[2. 0. 9.]]
|
|
8061
|
+
"""
|
|
8062
|
+
return inplace_relu_op(input)
|
|
8063
|
+
|
|
8064
|
+
|
|
7553
8065
|
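A brief sketch contrasting the two entry points defined above: `ops.relu` returns a new tensor while `ops.relu_` (or `relu` with `inplace=True`) rewrites its argument. The in-place variant is listed for Ascend only, so this assumes an Ascend device:

    import numpy as np
    import mindspore
    from mindspore import Tensor, ops

    x = Tensor(np.array([-1.0, 2.0]), mindspore.float32)
    y = ops.relu(x)   # x unchanged, y holds [0., 2.]
    ops.relu_(x)      # x itself now holds [0., 2.]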
def mse_loss(input, target, reduction='mean'):
|
|
7554
8066
|
r"""
|
|
7555
8067
|
Calculates the mean squared error between the predicted value and the label value.
|
|
@@ -7667,7 +8179,8 @@ def triplet_margin_loss(anchor, positive, negative, margin=1.0, p=2, eps=1e-06,
|
|
|
7667
8179
|
as `anchor`.
|
|
7668
8180
|
negative (Tensor): A sample belonging to the different class from `anchor`, with the same type and shape
|
|
7669
8181
|
as `anchor`.
|
|
7670
|
-
margin (float, optional): Make a margin between the positive pair and the negative pair.
|
|
8182
|
+
margin (float, optional): Make a margin between the positive pair and the negative pair. The shape of margin
|
|
8183
|
+
must be 0, i.e. a scalar. Default: ``1.0`` .
|
|
7671
8184
|
p (int, optional): The degree of norm for pairwise distance. Default: ``2`` .
|
|
7672
8185
|
eps (float, optional): Add small value to avoid division by zero. Default: ``1e-06``.
|
|
7673
8186
|
swap (bool, optional): The distance swap change the negative distance to the distance between positive
|
|
@@ -8131,7 +8644,7 @@ def max_pool2d(x, kernel_size, stride=None, padding=0, dilation=1, return_indice
|
|
|
8131
8644
|
return out
|
|
8132
8645
|
|
|
8133
8646
|
|
|
8134
|
-
def max_pool2d_ext(input, kernel_size, stride=None, padding=0, dilation=1,
|
|
8647
|
+
def max_pool2d_ext(input, kernel_size, stride=None, padding=0, dilation=1, ceil_mode=False, return_indices=False):
|
|
8135
8648
|
r"""
|
|
8136
8649
|
Performs a 2D max pooling on the input Tensor.
|
|
8137
8650
|
|
|
@@ -8153,21 +8666,23 @@ def max_pool2d_ext(input, kernel_size, stride=None, padding=0, dilation=1, *, ce
|
|
|
8153
8666
|
kernel_size (Union[int, tuple[int]]): The size of kernel used to take the maximum value and arg
|
|
8154
8667
|
value, is an int number that represents height and width of the kernel, or a tuple of
|
|
8155
8668
|
two int numbers that represent height and width respectively.
|
|
8156
|
-
stride (Union[int, tuple[int], None]): The distance of kernel moving, an int number that represents
|
|
8669
|
+
stride (Union[int, tuple[int], None], optional): The distance of kernel moving, an int number that represents
|
|
8157
8670
|
the height and width of movement are both stride, or a tuple of two int numbers that
|
|
8158
8671
|
represent height and width of movement respectively.
|
|
8159
8672
|
Default: ``None`` , which indicates the moving step is `kernel_size` .
|
|
8160
|
-
padding (Union[int, tuple[int]]):
|
|
8673
|
+
padding (Union[int, tuple[int]], optional):
|
|
8674
|
+
An int number that represents the padding on both the height and width directions, or a tuple of two
|
|
8161
8675
|
int numbers that represent the height and width padding respectively.
|
|
8162
8676
|
Default: ``0`` .
|
|
8163
|
-
dilation (Union[int, tuple[int]]): Control the stride of elements in the kernel. Default: ``1`` .
|
|
8164
|
-
ceil_mode (bool): Whether to use ceil instead of floor to calculate output shape. Default: ``False`` .
|
|
8165
|
-
return_indices (bool): Whether to output the indices of max value. Default: ``False`` .
|
|
8677
|
+
dilation (Union[int, tuple[int]], optional): Control the stride of elements in the kernel. Default: ``1`` .
|
|
8678
|
+
ceil_mode (bool, optional): Whether to use ceil instead of floor to calculate output shape. Default: ``False`` .
|
|
8679
|
+
return_indices (bool, optional): Whether to output the indices of max value. Default: ``False`` .
|
|
8166
8680
|
|
|
8167
8681
|
Returns:
|
|
8168
8682
|
If `return_indices` is ``False`` , return a Tensor `output`, else return a tuple (`output`, `argmax`).
|
|
8169
8683
|
|
|
8170
|
-
- **output** (Tensor) - Maxpooling result,
|
|
8684
|
+
- **output** (Tensor) - Maxpooling result,
|
|
8685
|
+
with shape :math:`(N_{out}, C_{out}, H_{out}, W_{out})`.
|
|
8171
8686
|
It has the same data type as `input`.
|
|
8172
8687
|
|
|
8173
8688
|
.. math::
|
|
@@ -8198,10 +8713,9 @@ def max_pool2d_ext(input, kernel_size, stride=None, padding=0, dilation=1, *, ce
|
|
|
8198
8713
|
>>> import mindspore
|
|
8199
8714
|
>>> import numpy as np
|
|
8200
8715
|
>>> from mindspore import Tensor, ops
|
|
8201
|
-
>>> from mindspore.ops.function.nn_func import max_pool2d_ext
|
|
8202
8716
|
>>> input = Tensor(np.arange(20 * 16 * 50 * 32).reshape((20, 16, 50, 32)), mindspore.float32)
|
|
8203
|
-
>>> output_tensor, argmax = max_pool2d_ext(input, kernel_size=(3, 2), stride=(2, 1),
|
|
8204
|
-
|
|
8717
|
+
>>> output_tensor, argmax = ops.function.nn_func.max_pool2d_ext(input, kernel_size=(3, 2), stride=(2, 1),
|
|
8718
|
+
... ceil_mode=False, return_indices=True)
|
|
8205
8719
|
>>> print(output_tensor.shape)
|
|
8206
8720
|
(20, 16, 24, 31)
|
|
8207
8721
|
>>> print(argmax.shape)
|
|
@@ -8219,62 +8733,199 @@ def max_pool2d_ext(input, kernel_size, stride=None, padding=0, dilation=1, *, ce
|
|
|
8219
8733
|
return out
|
|
8220
8734
|
|
|
8221
8735
|
|
|
8222
|
-
def prompt_flash_attention(query, key, value, attn_mask, actual_seq_lengths, actual_seq_lengths_kv,
|
|
8223
|
-
deq_scale1, quant_scale1, deq_scale2, quant_scale2,
|
|
8224
|
-
scale_value=1.0, pre_tokens=
|
|
8225
|
-
num_key_value_heads=0, sparse_mode=0, inner_precise=1):
|
|
8736
|
+
def prompt_flash_attention(query, key, value, attn_mask=None, actual_seq_lengths=None, actual_seq_lengths_kv=None,
|
|
8737
|
+
pse_shift=None, deq_scale1=None, quant_scale1=None, deq_scale2=None, quant_scale2=None,
|
|
8738
|
+
quant_offset2=None, num_heads=1, scale_value=1.0, pre_tokens=2147483647, next_tokens=0,
|
|
8739
|
+
input_layout='BSH', num_key_value_heads=0, sparse_mode=0, inner_precise=1):
|
|
8226
8740
|
r"""
|
|
8227
8741
|
The interface for fully inference.
|
|
8228
|
-
|
|
8229
|
-
|
|
8230
|
-
|
|
8742
|
+
|
|
8743
|
+
- B: Batch size
|
|
8744
|
+
- N: Num of attention heads
|
|
8745
|
+
- S: Sequence length
|
|
8746
|
+
- D: Head dim
|
|
8747
|
+
- H: Hidden layer size
|
|
8748
|
+
|
|
8749
|
+
Self attention constructs an attention model based on the relationship between input samples themselves. The
|
|
8750
|
+
principle is to assume that there is an input sample sequence :math:`x` of length :math:`n`, and each
|
|
8751
|
+
element of :math:`x` is a :math:`d` dimensional vector, which can be viewed as a token embedding. This sequence
|
|
8752
|
+
can be transformed through 3 weight matrices to obtain 3 matrices with dimensions of :math:`n\times d`.
|
|
8753
|
+
|
|
8754
|
+
The self attention calculation formula is defined as:
|
|
8755
|
+
|
|
8756
|
+
.. math::
|
|
8757
|
+
Attention(Q,K,V)=Softmax(\frac{QK^{T} }{\sqrt{d} } )V
|
|
8758
|
+
|
|
8759
|
+
where the product of :math:`Q` and :math:`K^{T}` represents the attention of input :math:`x`. To avoid the value
|
|
8760
|
+
becoming too large, it is usually scaled by dividing it by the square root of :math:`d` and perform softmax
|
|
8761
|
+
normalization on each row, yields a matrix of :math:`n\times d` after multiplying :math:`V`.
|
|
8762
|
+
|
|
8763
|
+
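As a plain NumPy sketch of this formula (an illustration of the math only, not the fused Ascend kernel):

    import numpy as np

    def attention(q, k, v):
        d = q.shape[-1]
        scores = q @ k.T / np.sqrt(d)                    # QK^T / sqrt(d)
        w = np.exp(scores - scores.max(-1, keepdims=True))
        w /= w.sum(-1, keepdims=True)                    # row-wise softmax
        return w @ v

    n, d = 4, 8
    q, k, v = (np.random.randn(n, d) for _ in range(3))
    print(attention(q, k, v).shape)  # (4, 8)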
.. warning::
|
|
8764
|
+
- Support for float16 dtype of `attn_mask` will be deprecated in the future.
|
|
8765
|
+
- When `sparse_mode` is 2, 3 or 4, the shape of `attn_mask` must be :math:`(2048, 2048)` /
|
|
8766
|
+
:math:`(B, 1, 2048, 2048)` / :math:`(1, 1, 2048, 2048)`.
|
|
8231
8767
|
|
|
8232
8768
|
Note:
|
|
8233
|
-
|
|
8769
|
+
- Maximum Support for each axis
|
|
8770
|
+
|
|
8771
|
+
- Supports B-axis values less than or equal to 65536 (64k).
|
|
8772
|
+
When the input type includes int8 with D-axis not aligned to 32, or the input type is
|
|
8773
|
+
float16 or bfloat16 with D-axis not aligned to 16, the B-axis supports up to 128 only.
|
|
8774
|
+
- Supports N-axis values less than or equal to 256.
|
|
8775
|
+
- Supports S-axis values less than or equal to 20971520 (20M).
|
|
8776
|
+
- Supports D-axis values less than or equal to 512.
|
|
8777
|
+
|
|
8778
|
+
- Quantization
|
|
8779
|
+
|
|
8780
|
+
- int8 Input, int8 Output: Parameters `deq_scale1`, `quant_scale1`, `deq_scale2`, and `quant_scale2`
|
|
8781
|
+
must all be provided. `quant_offset2` is optional (default is 0 if not provided).
|
|
8782
|
+
- int8 Input, float16 Output: Parameters `deq_scale1`, `quant_scale1`, and `deq_scale2` must all be provided.
|
|
8783
|
+
If `quant_offset2` or `quant_scale2` is provided (i.e., not null), it will result in an error.
|
|
8784
|
+
- float16 or bfloat16 Input, int8 Output: Parameter `quant_scale2` must be provided. `quant_offset2` is
|
|
8785
|
+
optional (default is 0 if not provided). If `deq_scale1`, `quant_scale1`, or `deq_scale2` is
|
|
8786
|
+
provided (i.e., not null), it will result in an error.
|
|
8787
|
+
- int8 Output:
|
|
8788
|
+
|
|
8789
|
+
- `quant_scale2` and `quant_offset2` in per-channel format do not support scenarios with
|
|
8790
|
+
left padding, Ring Attention, or non-32-byte aligned D-axis.
|
|
8791
|
+
- In GE mode: `quant_scale2` and `quant_offset2` in per-tensor format do not support scenarios
|
|
8792
|
+
with non-32-byte aligned D-axis.
|
|
8793
|
+
- Does not support band sparse mode with negative `pre_tokens`/`next_tokens`.
|
|
8794
|
+
|
|
8795
|
+
- `quant_scale2` and `quant_offset2` can be bfloat16 only when `query` is bfloat16.
|
|
8796
|
+
|
|
8797
|
+
|
|
8798
|
+
- Other Usage Caveats:
|
|
8799
|
+
|
|
8800
|
+
- :math:`N` of parameter `query` must be equal to `num_heads`. :math:`N` of parameter `key` and parameter
|
|
8801
|
+
`value` must be equal to `num_key_value_heads`.
|
|
8802
|
+
- `num_heads` must be divisible by `num_key_value_heads` and `num_heads` divided by `num_key_value_heads`
|
|
8803
|
+
can not be greater than 64.
|
|
8804
|
+
- When `query` dtype is bfloat16, D axis should align with 16.
|
|
8805
|
+
- Each element of `actual_seq_lengths` must not exceed q_S and each element
|
|
8806
|
+
of `actual_seq_lengths_kv` must not exceed kv_S.
|
|
8234
8807
|
|
|
8235
8808
|
.. warning::
|
|
8236
|
-
|
|
8809
|
+
Only supported on Atlas A2 training series products.
|
|
8237
8810
|
|
|
8238
|
-
|
|
8239
|
-
query (Tensor)
|
|
8240
|
-
|
|
8241
|
-
key (Tensor)
|
|
8242
|
-
|
|
8243
|
-
value (Tensor)
|
|
8244
|
-
|
|
8245
|
-
attn_mask (Tensor) - The attention mask tensor with data type of
|
|
8246
|
-
|
|
8247
|
-
|
|
8248
|
-
|
|
8249
|
-
|
|
8250
|
-
|
|
8251
|
-
|
|
8252
|
-
|
|
8253
|
-
|
|
8254
|
-
|
|
8255
|
-
|
|
8256
|
-
|
|
8257
|
-
|
|
8258
|
-
|
|
8259
|
-
|
|
8260
|
-
|
|
8261
|
-
|
|
8262
|
-
|
|
8263
|
-
num_key_value_heads (int): head numbers of key/value which are used in GQA algorithm.
|
|
8264
|
-
The value o indicates if the key and value have the same head nums, use numHeads. Default: 0.
|
|
8265
|
-
sparse_mode (int): Default: 0
|
|
8266
|
-
inner_precise (int): 0, float16 high precision. 1, high performance. default 1
|
|
8811
|
+
Args:
|
|
8812
|
+
query (Tensor): The query tensor with data type of int8, float16 or bfloat16.
|
|
8813
|
+
The shape is :math:`(B, q_S, q_H)` / :math:`(B, q_N, q_S, q_D)`.
|
|
8814
|
+
key (Tensor): The key tensor with the same dtype as `query`.
|
|
8815
|
+
The shape is :math:`(B, kv_S, kv_H)` / :math:`(B, kv_N, kv_S, kv_D)`.
|
|
8816
|
+
value (Tensor): The value tensor with the same dtype as `query`.
|
|
8817
|
+
The shape is :math:`(B, kv_S, kv_H)` / :math:`(B, kv_N, kv_S, kv_D)`.
|
|
8818
|
+
attn_mask (Tensor, optional): The attention mask tensor with data type of bool, int8, uint8 or float16.
|
|
8819
|
+
For each element, 0/False indicates retention and 1/True indicates discard.
|
|
8820
|
+
If `sparse_mode` is 0 or 1: the shape is :math:`(q_S, kv_S)` / :math:`(B, q_S, kv_S)` /
|
|
8821
|
+
:math:`(1, q_S, kv_S)` / :math:`(B, 1, q_S, kv_S)` / :math:`(1, 1, q_S, kv_S)`.
|
|
8822
|
+
If `sparse_mode` is 2, 3 or 4, the shape is :math:`(2048, 2048)` / :math:`(1, 2048, 2048)` /
|
|
8823
|
+
:math:`(1, 1, 2048, 2048)`.
|
|
8824
|
+
Default: ``None``.
|
|
8825
|
+
actual_seq_lengths (Union[Tensor, tuple[int], list[int]], optional): Describes the actual sequence length of each
|
|
8826
|
+
batch of `query` with data type of int64. The shape is :math:`(B, )` and every element should be
|
|
8827
|
+
positive integer.
|
|
8828
|
+
Default: ``None``.
|
|
8829
|
+
actual_seq_lengths_kv (Union[Tensor, tuple[int], list[int]], optional): Describes the actual sequence length of each
|
|
8830
|
+
batch of `key` or `value` with data type of int64. The shape is :math:`(B, )` and every element should be
|
|
8831
|
+
positive integer.
|
|
8832
|
+
Default: ``None``.
|
|
8833
|
+
pse_shift (Tensor, optional): The position encoding tensor with data type of float16 or bfloat16.
|
|
8834
|
+
Input tensor of shape :math:`(B, N, q_S, kv_S)` / :math:`(1, N, q_S, kv_S)`.
|
|
8835
|
+
Default: ``None``.
|
|
8267
8836
|
|
|
8837
|
+
- q_S must be greater than or equal to the query's S length, and kv_S must be greater than or
|
|
8838
|
+
equal to the key's S length.
|
|
8839
|
+
- If `pse_shift` has dtype float16, `query` should have dtype float16 or int8, in which case high
|
|
8840
|
+
precision mode is enabled automatically.
|
|
8841
|
+
- If `pse_shift` has dtype bfloat16, `query` should have dtype bfloat16.
|
|
8268
8842
|
|
|
8269
|
-
|
|
8270
|
-
|
|
8843
|
+
deq_scale1 (Tensor, optional): Quantization parameter, a tensor with data type of uint64 or float32.
|
|
8844
|
+
Input Tensor of shape :math:`(1,)`.
|
|
8845
|
+
Default: ``None``.
|
|
8846
|
+
quant_scale1 (Tensor, optional): Quantitative parametor, the tensor with data type of float32.
|
|
8847
|
+
Input Tensor of shape :math:`(1,)`.
|
|
8848
|
+
Default: ``None``.
|
|
8849
|
+
deq_scale2 (Tensor, optional): Quantitative parametor, input Tensor of shape :math:`(1,)` and it has
|
|
8850
|
+
the same dtype as `deq_scale1`.
|
|
8851
|
+
Default: ``None``.
|
|
8852
|
+
quant_scale2 (Tensor, optional): Quantitative parametor, the tensor with data type of float32 or bfloat16.
|
|
8853
|
+
The suggested shape is :math:`(1,)` / :math:`(1, 1, q_H)` / :math:`(q_H, )` when output layout is BSH,
|
|
8854
|
+
:math:`(1,)` / :math:`(1, q_N, 1, D)` / :math:`(q_N, D) when layout is BNSD.
|
|
8855
|
+
Default: ``None``.
|
|
8856
|
+
quant_offset2 (Tensor, optional): Quantitative parametor, the tensor with data type of float32 or bfloat16.
|
|
8857
|
+
It has the same dtype and shape as `quant_scale2`.
|
|
8858
|
+
Default: ``None``.
|
|
8859
|
+
num_heads (int, optional): The number of heads. It is an integer in range [0, 256].
|
|
8860
|
+
Default: ``1``.
|
|
8861
|
+
scale_value (double, optional): The scale value indicating the scale coefficient, which is used as the scalar of
|
|
8862
|
+
Muls in the calculation.
|
|
8863
|
+
Default: ``1.0``.
|
|
8864
|
+
pre_tokens (int, optional): For sparse cumputing, indicating the number of previous tokens Attention needs to
|
|
8865
|
+
associated with.
|
|
8866
|
+
Default: ``2147483647``.
|
|
8867
|
+
next_tokens (int, optional): For sparse cumputing, indicating the number of next tokens Attention needs to
|
|
8868
|
+
associated with.
|
|
8869
|
+
Default: ``0``.
|
|
8870
|
+
input_layout (str, optional): the data layout of the input qkv, support `(BSH)` and `(BNSD)`.
|
|
8871
|
+
Default ``BSH``.
|
|
8872
|
+
num_key_value_heads (int, optional): An int indicates head numbers of ``key``/``value`` which are used
|
|
8873
|
+
in GQA algorithm. The value 0 indicates if the key and value have the same head nums, use `num_heads`.
|
|
8874
|
+
It it is specified(not 0), it must be a factor of `num_heads` and it must be equal to kv_n.
|
|
8875
|
+
Default: ``0``.
|
|
8876
|
+
sparse_mode (int, optional): An int specifies sparse mode, can be int from {0, 1, 2, 3, 4}.
|
|
8877
|
+
Default: ``0``.
|
|
8878
|
+
|
|
8879
|
+
- sparseMode = 0: If `attn_mask` is a null pointer, `pre_tokens` and `next_tokens` inputs are ignored
|
|
8880
|
+
(internally set to INT_MAX).
|
|
8881
|
+
- sparseMode = 2, 3, 4: `attn_mask` shape must be :math:`(S, S)` or :math:`(1, S, S)` or
|
|
8882
|
+
:math:`(1, 1, S, S)`, with S fixed at 2048. User must ensure that `attn_mask` is lower triangular.
|
|
8883
|
+
If not provided or incorrect shape, it will result in an error.
|
|
8884
|
+
- sparseMode = 1, 2, 3: Ignores `pre_tokens`, `next_tokens` inputs and sets values according
|
|
8885
|
+
to specific rules.
|
|
8886
|
+
- sparseMode = 4: `pre_tokens` and `next_tokens` must be non-negative.
|
|
8887
|
+
|
|
8888
|
+
inner_precise (int, optional): An int number from {0, 1} indicates computing mode.
|
|
8889
|
+
``0`` for high precision mode for float16 dtype. ``1`` for high performance mode.
|
|
8890
|
+
Default: ``1``.
|
|
8271
8891
|
|
|
8272
|
-
|
|
8892
|
+
Returns:
|
|
8893
|
+
attention_out (Tensor) - Output tensor, has the same shape as `query` of
|
|
8894
|
+
:math:`(B, q_S, q_H)` / :math:`(B, q_N, q_S, q_D)`.
|
|
8895
|
+
Output dtype is determined by multiple factors, please refer to Note above for details.
|
|
8896
|
+
|
|
8897
|
+
Raises:
|
|
8898
|
+
TypeError: Dtype of `query` is not int8, float16 or bfloat16.
|
|
8899
|
+
TypeError: `query`, `key` and `value` don't have the same dtype.
|
|
8900
|
+
TypeError: Dtype of `attn_mask` is not bool, int8 or uint8.
|
|
8901
|
+
TypeError: Dtype of `pse_shift` is not bfloat16 or float16.
|
|
8902
|
+
TypeError: `scale_value` is not a double number.
|
|
8903
|
+
TypeError: `input_layout` is not a string.
|
|
8904
|
+
TypeError: `num_key_value_heads` is not an int.
|
|
8905
|
+
TypeError: `sparse_mode` is not an int.
|
|
8906
|
+
TypeError: `sparse_inner_precisemode` is not an int.
|
|
8907
|
+
TypeError: `quant_scale1` is not Tensor of type float32.
|
|
8908
|
+
TypeError: `deq_scale1` is not Tensor of type uint64 or float32.
|
|
8909
|
+
TypeError: `quant_scale2` is not Tensor of type float32.
|
|
8910
|
+
TypeError: `deq_scale2` is not Tensor of type uint64 or float32.
|
|
8911
|
+
TypeError: `quant_offset2` is not Tensor of type float32.
|
|
8912
|
+
ValueError: `input_layout` is a string but of `(BSH)` or `(BNSD)`.
|
|
8913
|
+
RuntimeError: `num_heads` is not divisible by `num_key_value_heads`.
|
|
8914
|
+
RuntimeError: `num_heads` is not greater than 0.
|
|
8915
|
+
RuntimeError: `num_key_value_heads` is not greater than or equal to 0.
|
|
8916
|
+
RuntimeError: kv_n is not equal to `num_key_value_heads`.
|
|
8917
|
+
RuntimeError: `attn_mask` shape is not valid.
|
|
8918
|
+
RuntimeError: `sparse_mode` is specified but is not 0, 1, 2, 3 or 4.
|
|
8919
|
+
RuntimeError: `query` dtype is bfloat16 and D axis is not aligned with 16.
|
|
8920
|
+
RuntimeError: `input_layout` is BSH and kv_h is not divisible by `num_key_value_heads`.
|
|
8921
|
+
RuntimeError: D-axis of `query`, `key` and `value` is not the same.
|
|
8922
|
+
RuntimeError: In post quant per-channel scenario, D-axis is not 32 Byte aligned.
|
|
8923
|
+
|
|
8924
|
+
Supported Platforms:
|
|
8273
8925
|
``Ascend``
|
|
8274
8926
|
|
|
8275
8927
|
Examples:
|
|
8276
|
-
>>> from mindspore
|
|
8277
|
-
>>> from mindspore import Tensor
|
|
8928
|
+
>>> from mindspore import Tensor, ops
|
|
8278
8929
|
>>> import numpy as np
|
|
8279
8930
|
>>> B = 1
|
|
8280
8931
|
>>> N = 16
|
|
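Editor's note on the sparse_mode entries above: for modes 2, 3 and 4 the new docstring fixes `attn_mask` at 2048x2048 and requires it to be lower triangular, with 1 marking positions to discard. A minimal sketch of building such a mask under that stated convention (illustrative only, not part of the package):

    import numpy as np
    from mindspore import Tensor

    # Strict upper triangle = 1 (discard), leaving a lower-triangular
    # retained region, as the docstring above requires for sparse_mode 2/3/4.
    mask = np.triu(np.ones((2048, 2048), dtype=np.int8), k=1)
    attn_mask = Tensor(mask)  # (2048, 2048); (1, 2048, 2048) is also accepted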
@@ -8284,13 +8935,13 @@ def prompt_flash_attention(query, key, value, attn_mask, actual_seq_lengths, act
         >>> key = Tensor(np.ones((B, N, S, D), dtype=np.float16))
         >>> value = Tensor(np.ones((B, N, S, D), dtype=np.float16))
         >>> out = ops.prompt_flash_attention(query, key, value, None, None, None, None, None, None, None, None,
-
+        ...                                  None, N, input_layout='BNSD')
         >>> print(out.shape)
         (1, 16, 256, 16)
     """

-    pfa = _get_cache_prim(
-
+    pfa = _get_cache_prim(PromptFlashAttention)(num_heads, scale_value, pre_tokens, next_tokens, input_layout,
+                                                num_key_value_heads, sparse_mode, inner_precise)
     return pfa(query, key, value, attn_mask, actual_seq_lengths, actual_seq_lengths_kv, pse_shift, deq_scale1,
                quant_scale1, deq_scale2, quant_scale2, quant_offset2)
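For context on the rewritten body above: `_get_cache_prim(PromptFlashAttention)(...)` follows MindSpore's primitive-cache idiom, reusing one primitive instance per distinct set of init arguments instead of constructing a new op on every functional call. A rough, hypothetical stand-in showing the idea (`_cached_prim` is invented for illustration and is not the library's actual helper):

    from functools import lru_cache

    @lru_cache(maxsize=None)
    def _cached_prim(prim_class, *init_args):
        # One primitive instance per (class, init args) combination.
        return prim_class(*init_args)

    # pfa = _cached_prim(PromptFlashAttention, num_heads, scale_value, pre_tokens,
    #                    next_tokens, input_layout, num_key_value_heads,
    #                    sparse_mode, inner_precise)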
@@ -8301,21 +8952,17 @@ def incre_flash_attention(query, key, value, attn_mask=None, actual_seq_lengths=
                           num_heads=1, input_layout='BSH', scale_value=1.0, num_key_value_heads=0,
                           block_size=0, inner_precise=1, kv_padding_size=None):
     r"""
-    N -- Num heads
-    kvN -- Num key value heads
+    The interface for incremental inference.

+    - B: Batch size
+    - N: Num of attention heads
+    - kvN: Num of `key` / `value` heads
+    - S: Sequence length
+    - D: Head dim
+    - H: Hidden layer size
+    - kvH: Hidden size of `key` / `value`

-    D
-    H -- Hidden size
-    kvH -- Hidden size of key value
-    where :math:`H=N\times D`, :math:`kvH=kvN\times D`
+    where :math:`H=N\times D`, :math:`kvH=kvN\times D`.

     Self attention constructs an attention model based on the relationship between input samples themselves. The
     principle is to assume that there is a length of the input sample sequence :math:`x` of :math:`n`, and each
@@ -8330,62 +8977,62 @@ def incre_flash_attention(query, key, value, attn_mask=None, actual_seq_lengths=
     becoming too large, it is usually scaled by dividing it by the square root of :math:`d` and performing softmax
     normalization on each row, which yields a matrix of :math:`n\times d` after multiplying :math:`V`.

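The context lines above (together with the hunk elided between them) describe standard scaled dot-product self-attention. A plain NumPy sketch of that math, for orientation only; the operator itself is a fused Ascend kernel:

    import numpy as np

    def self_attention(x):
        # Q = K = V = x with shape (n, d); scores scaled by sqrt(d),
        # then row-wise softmax, then weighted sum over V.
        n, d = x.shape
        scores = x @ x.T / np.sqrt(d)
        w = np.exp(scores - scores.max(axis=-1, keepdims=True))
        w /= w.sum(axis=-1, keepdims=True)
        return w @ x  # (n, d)

    print(self_attention(np.random.randn(5, 8).astype(np.float32)).shape)  # (5, 8)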
-    .. warning::
-        This is an experimental API that is subject to change or deletion.
-
     Note:
-        - If there is no input parameter and no default value, None needs to be passed.
-        - The shape of the tensor corresponding to the key and value parameters needs to be completely consistent.
-        - :math:`N` of parameter query is equal with num_heads
-          with num_key_value_heads
+        - If there is no input parameter and no default value, ``None`` needs to be passed.
+        - The shape of the tensor corresponding to the `key` and `value` parameters needs to be completely
+          consistent.
+        - :math:`N` of parameter `query` is equal to `num_heads`. :math:`N` of parameter `key` and parameter
+          `value` is equal to `num_key_value_heads`. `num_heads` is a multiple of `num_key_value_heads`.
         - Quantization

-            - When the data type of query
-              parameter quant_scale2 is required and quant_offset2 is optional.
-            - When antiquant_scale exists, key and value need to be passed by int8. antiquant_offset is optional.
-            - The data type of antiquant_scale and antiquant_offset should be
-        - pse_shift
+            - When the data type of `query`, `key`, and `value` is float16 and the data type of output is int8,
+              the input parameter `quant_scale2` is required and `quant_offset2` is optional.
+            - When `antiquant_scale` exists, `key` and `value` need to be passed by int8. `antiquant_offset` is
+              optional.
+            - The data type of `antiquant_scale` and `antiquant_offset` should be consistent with that of `query`.
+        - `pse_shift`

-            - The pse_shift data type needs to be consistent with
+            - The `pse_shift` data type needs to be consistent with `query`, and only supports D-axis alignment,
               which means that the D-axis can be divided by 16.
         - Page attention:

-            - The necessary condition for enabling page attention is that the block_table exists, and the key
-              and value are arranged in a contiguous memory according to the index in the block_table
-              key and value
-            - In the enabling scenario of page attention, 16 alignment is required when input types of key
-              float16/bfloat16, and 32 alignment is required when input
-              recommended to use 128.
+            - The necessary condition for enabling page attention is that the `block_table` exists, and the `key`
+              and `value` are arranged in a contiguous memory according to the index in the `block_table`. The
+              supported dtype for `key` and `value` is float16/bfloat16/int8.
+            - In the enabling scenario of page attention, 16 alignment is required when the input types of `key`
+              and `value` are float16/bfloat16, and 32 alignment is required when the input dtype of `key` and
+              `value` is int8. It is recommended to use 128.
             - The maximum max_block_num_per_seq currently supported by block_table is 16k, and exceeding 16k will
               result in interception and error messages; if you encounter :math:`S` being too large and causing
-              max_block_num_per_seq to exceed 16k, you can increase the block_size to solve the problem.
-            - The multiplication of all dimensions of the shape of the parameters key and value in the page attention
+              max_block_num_per_seq to exceed 16k, you can increase the `block_size` to solve the problem.
+            - The multiplication of all dimensions of the shape of the parameters `key` and `value` in the page
+              attention scenario cannot exceed the representation range of int32.
            - When performing per-channel post quantization, page attention cannot be enabled simultaneously.
-        - kv_padding_size
+        - `kv_padding_size`:

            - The calculation formula for the starting point of KV cache transfer is
              :math:`S-kv\_padding\_size-actual\_seq\_lengths`. The calculation formula for the transfer endpoint of
              KV cache is :math:`S-kv\_padding\_size`. When the starting or ending point of the KV cache transfer is
              less than 0, the returned data result is all 0.
-            - When kv_padding_size is less than 0, it will be set to 0.
-            - kv_padding_size needs to be enabled together with the actual_seq_lengths parameter, otherwise it is
+            - When `kv_padding_size` is less than 0, it will be set to 0.
+            - `kv_padding_size` needs to be enabled together with the `actual_seq_lengths` parameter, otherwise it
              is considered as the KV right padding scene.
            - It needs to be enabled together with the `attn_mask` parameter and ensure that the meaning of
              `attn_mask` is correct, that is, it can correctly hide invalid data. Otherwise, it will introduce
              accuracy issues.
-            - kv_padding_size does not support page attention scenarios
+            - `kv_padding_size` does not support page attention scenarios.
+
+    .. warning::
+        Only support on Atlas A2 training series.

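The kv_padding_size formulas in the note above define the slice of the KV cache that actually participates in the computation. A small sketch of that window arithmetic in plain Python (function name assumed, for illustration only):

    def kv_cache_window(S, kv_padding_size, actual_seq_length):
        kv_padding_size = max(kv_padding_size, 0)  # negative padding is clamped to 0
        start = S - kv_padding_size - actual_seq_length
        end = S - kv_padding_size
        # If either endpoint is negative, the operator returns all-zero results.
        return (start, end) if start >= 0 and end >= 0 else None

    print(kv_cache_window(S=1024, kv_padding_size=16, actual_seq_length=100))  # (908, 1008)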
     Args:
         query (Tensor): The query tensor with data type of float16 or bfloat16.
             The shape is :math:`(B, 1, H)` / :math:`(B, N, 1, D)`.
-        key (
+        key (Union[tuple, list]): The key tensor with data type of float16 or bfloat16 or int8.
             The shape is :math:`(B, S, kvH)` / :math:`(B, kvN, S, D)`.
-        value (
+        value (Union[tuple, list]): The value tensor with data type of float16 or bfloat16 or int8.
             The shape is :math:`(B, S, kvH)` / :math:`(B, kvN, S, D)`.
         attn_mask (Tensor, optional): The attention mask tensor with data type of bool or int8 or uint8.
             The shape is :math:`(B, S)` / :math:`(B, 1, S)` / :math:`(B, 1, 1, S)`. Default: ``None``.
         actual_seq_lengths (Union[Tensor, tuple[int], list[int]], optional): Describe actual sequence length of each
-            input with data type of
+            input with data type of int64. The shape is :math:`(B, )`. Default: ``None``.
         pse_shift (Tensor, optional): The position encoding tensor with data type of float16 or bfloat16. Input
             tensor of shape :math:`(1, N, 1, S)` / :math:`(B, N, 1, S)`. Default: ``None``.
         dequant_scale1 (Tensor, optional): Quantization parameter, the tensor with data type of uint64 or float32. It
@@ -8400,22 +9047,25 @@ def incre_flash_attention(query, key, value, attn_mask=None, actual_seq_lengths=
             The shape is :math:`(1,)`. Default: ``None``.
         antiquant_scale (Tensor, optional): Pseudo-quantization parameter, the tensor with data type of float16 or
             bfloat16. The shape is :math:`(2, kvN, 1, D)` when input_layout is 'BNSD' or :math:`(2, kvH)` when
-
+            input_layout is 'BSH'. Default: ``None``.
         antiquant_offset (Tensor, optional): Pseudo-quantization parameter, the tensor with data type of float16 or
             bfloat16. The shape is :math:`(2, kvN, 1, D)` when input_layout is 'BNSD' or :math:`(2, kvH)` when
-
+            input_layout is 'BSH'. Default: ``None``.
         block_table (Tensor, optional): The tensor with data type of int32. The shape is
             :math:`(B, max\_block\_num\_per\_seq)`,
             where :math:`max\_block\_num\_per\_seq = ceil(\frac{max(actual\_seq\_length)}{block\_size})`.
             Default: ``None``.
-        num_heads (int): The number of heads.
-        input_layout (str): The data layout of the input qkv, support 'BSH' and 'BNSD'. Default ``'BSH'``.
-        scale_value (double): The scale value indicating the scale coefficient, which is used as
-            Muls in the calculation. Default: ``1.0``.
-        num_key_value_heads (int): Head numbers of key
-            The value 0 indicates if the key and value have the same head nums, use numHeads. Default: ``0``.
-        block_size (int): The maximum number of tokens stored in each block of KV in page attention.
-
+        num_heads (int, optional): The number of heads. Default: ``1``.
+        input_layout (str, optional): The data layout of the input qkv, supports 'BSH' and 'BNSD'.
+            Default ``'BSH'``.
+        scale_value (double, optional): The scale value indicating the scale coefficient, which is used as
+            the scalar of Muls in the calculation. Default: ``1.0``.
+        num_key_value_heads (int, optional): Head numbers of `key`/`value` which are used in the GQA algorithm.
+            The value 0 indicates if the `key` and `value` have the same head nums, use numHeads. Default: ``0``.
+        block_size (int, optional): The maximum number of tokens stored in each block of KV in page attention.
+            Default: ``0``.
+        inner_precise (int, optional): An int from {0, 1} indicating the computing mode.
+            ``0`` is high precision mode for float16 dtype; ``1`` is high performance mode.
+            Default: ``1``.
         kv_padding_size (Tensor, optional): The tensor with data type of int64. The range of values is
             :math:`0\le kv\_padding\_size \le S-max(actual\_seq\_length)`. The shape is :math:`()` or :math:`(1,)`.
             Default: ``None``.
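The block_table shape above depends on max_block_num_per_seq = ceil(max(actual_seq_length) / block_size); the page-attention notes cap it at 16k and recommend a block_size of 128. A quick sketch of the sizing rule (illustrative):

    import math

    def max_block_num_per_seq(actual_seq_lengths, block_size=128):
        # One block per block_size tokens, rounded up for the longest sequence.
        return math.ceil(max(actual_seq_lengths) / block_size)

    print(max_block_num_per_seq([1000, 4096]))  # ceil(4096 / 128) = 32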
@@ -8423,6 +9073,25 @@ def incre_flash_attention(query, key, value, attn_mask=None, actual_seq_lengths=
     Returns:
         attention_out (Tensor), the shape is :math:`(B, 1, H)` / :math:`(B, N, 1, D)`.

+    Raises:
+        TypeError: Dtype of `query` is not float16 or bfloat16.
+        TypeError: `key` and `value` don't have the same dtype.
+        TypeError: Dtype of `attn_mask` is not bool, int8 or uint8.
+        TypeError: Dtype of `pse_shift` is not bfloat16 or float16.
+        TypeError: `scale_value` is not a double number.
+        TypeError: `input_layout` is not a string.
+        TypeError: `num_key_value_heads` or `num_heads` is not an int.
+        TypeError: `inner_precise` is not an int.
+        TypeError: `quant_scale1` is not a Tensor of type float32.
+        TypeError: `quant_scale2` is not a Tensor of type float32.
+        TypeError: `quant_offset2` is not a Tensor of type float32.
+        ValueError: Size of `actual_seq_lengths` is not 1 or B.
+        ValueError: `input_layout` is a string but not 'BSH' or 'BNSD'.
+        ValueError: Q_H is not divisible by `num_heads`.
+        ValueError: `num_heads` is not divisible by `num_key_value_heads`.
+        RuntimeError: `num_heads` is not greater than 0.
+        RuntimeError: `attn_mask` shape is not valid.
+
     Supported Platforms:
         ``Ascend``

@@ -8435,7 +9104,7 @@ def incre_flash_attention(query, key, value, attn_mask=None, actual_seq_lengths=
         >>> query = Tensor(np.random.randn(B, 1, N * D), mstype.float16)
         >>> key = [Tensor(np.random.randn(B, S, kvN * D), mstype.float16)]
         >>> value = [Tensor(np.random.randn(B, S, kvN * D), mstype.float16)]
-        >>> ifa_ms = ops.
+        >>> ifa_ms = ops.incre_flash_attention
         >>> attn_out = ifa_ms(query, key, value, num_heads=N, num_key_value_heads=kvN)
         >>> attn_out
         Tensor(shape=[1, 1, 512], dtype=Float16, value=
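The example above uses the default BSH layout. Under the same docstring, a BNSD-layout call would look roughly like the sketch below (shapes invented for illustration; running it requires an Ascend device):

    import numpy as np
    from mindspore import Tensor, ops
    import mindspore.common.dtype as mstype

    B, N, kvN, S, D = 1, 4, 2, 256, 128
    # BNSD layout: query has a single new token (S=1) per batch.
    query = Tensor(np.random.randn(B, N, 1, D), mstype.float16)
    key = [Tensor(np.random.randn(B, kvN, S, D), mstype.float16)]
    value = [Tensor(np.random.randn(B, kvN, S, D), mstype.float16)]
    out = ops.incre_flash_attention(query, key, value, num_heads=N,
                                    num_key_value_heads=kvN, input_layout='BNSD')
    # expected: out.shape == (B, N, 1, D)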
@@ -8458,7 +9127,7 @@ def embedding(input, weight, padding_idx=None, max_norm=None, norm_type=2.0, sca
     Args:
         input (Tensor): The indices used to lookup in the `weight`. The data type must be mindspore.int32 or
             mindspore.int64, and the value should be in range `[0, weight.shape[0])`.
-        weight (Parameter): The matrix where to lookup from. The shape must be 2D.
+        weight (Union[Parameter, Tensor]): The matrix where to lookup from. The shape must be 2D.
         padding_idx (int, optional): If the value is not None, the corresponding row of `weight` will not be updated
             in training. The value should be in range `[-weight.shape[0], weight.shape[0])` if it's not ``None``.
             Default ``None``.
@@ -8475,7 +9144,6 @@ def embedding(input, weight, padding_idx=None, max_norm=None, norm_type=2.0, sca
     Raises:
         ValueError: If `padding_idx` is out of valid range.
         ValueError: If the shape of `weight` is invalid.
-        TypeError: `weight` is not a :class:`mindspore.Parameter`.

     Supported Platforms:
         ``Ascend``
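The change above relaxes `weight` from `Parameter` to `Union[Parameter, Tensor]` (and drops the matching TypeError), so a frozen lookup table can be passed directly. A minimal sketch with a plain Tensor (values invented for illustration):

    import numpy as np
    import mindspore as ms
    from mindspore import Tensor, ops

    weight = Tensor(np.arange(12, dtype=np.float32).reshape(4, 3))  # 2D lookup table
    idx = Tensor([1, 3], ms.int32)
    out = ops.embedding(idx, weight)
    # rows 1 and 3 of `weight`: [[3., 4., 5.], [9., 10., 11.]]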
@@ -8535,6 +9203,8 @@ __all__ = [
     'hardshrink',
     'is_floating_point',
     'incre_flash_attention',
+    'prompt_flash_attention',
+    'flash_attention_score',
     'flip',
     'fliplr',
     'flipud',
@@ -8605,5 +9275,6 @@ __all__ = [
     'add_layer_norm',
     'group_norm',
     'rms_norm',
+    'add_rms_norm',
 ]
 __all__.sort()