PyPI - mindspore - Versions diffs - 2.2.0__cp38-cp38-win_amd64.whl → 2.2.11__cp38-cp38-win_amd64.whl - Mend

mindspore 2.2.0__cp38-cp38-win_amd64.whl → 2.2.11__cp38-cp38-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of mindspore might be problematic. Click here for more details.

Files changed (112)

mindspore/.commit_id +1 -1
mindspore/_c_dataengine.cp38-win_amd64.pyd +0 -0
mindspore/_c_expression.cp38-win_amd64.pyd +0 -0
mindspore/_c_mindrecord.cp38-win_amd64.pyd +0 -0
mindspore/_checkparam.py +3 -3
mindspore/_extends/graph_kernel/model/graph_split.py +84 -76
mindspore/_extends/graph_kernel/splitter.py +3 -2
mindspore/_extends/parallel_compile/akg_compiler/build_tbe_kernel.py +83 -66
mindspore/_extends/parallel_compile/akg_compiler/tbe_topi.py +4 -4
mindspore/_extends/parallel_compile/akg_compiler/util.py +10 -7
mindspore/_extends/parallel_compile/tbe_compiler/tbe_helper.py +2 -1
mindspore/_extends/parse/__init__.py +3 -2
mindspore/_extends/parse/parser.py +6 -1
mindspore/_extends/parse/standard_method.py +14 -11
mindspore/_extends/remote/kernel_build_server.py +2 -1
mindspore/common/_utils.py +16 -0
mindspore/common/api.py +1 -1
mindspore/common/auto_dynamic_shape.py +81 -85
mindspore/common/dump.py +1 -1
mindspore/common/tensor.py +3 -20
mindspore/config/op_info.config +1 -1
mindspore/context.py +11 -4
mindspore/dataset/engine/cache_client.py +8 -5
mindspore/dataset/engine/datasets_standard_format.py +5 -0
mindspore/dataset/vision/transforms.py +21 -21
mindspore/experimental/optim/adam.py +1 -1
mindspore/gen_ops.py +1 -1
mindspore/include/api/model.h +17 -0
mindspore/include/api/status.h +8 -3
mindspore/mindspore_backend.dll +0 -0
mindspore/mindspore_common.dll +0 -0
mindspore/mindspore_core.dll +0 -0
mindspore/mindspore_shared_lib.dll +0 -0
mindspore/nn/cell.py +0 -3
mindspore/nn/layer/activation.py +4 -5
mindspore/nn/layer/conv.py +39 -23
mindspore/nn/layer/flash_attention.py +54 -129
mindspore/nn/layer/math.py +3 -7
mindspore/nn/layer/rnn_cells.py +5 -5
mindspore/nn/wrap/__init__.py +4 -2
mindspore/nn/wrap/cell_wrapper.py +12 -3
mindspore/numpy/utils_const.py +5 -5
mindspore/opencv_core452.dll +0 -0
mindspore/opencv_imgcodecs452.dll +0 -0
mindspore/opencv_imgproc452.dll +0 -0
mindspore/ops/_grad_experimental/grad_array_ops.py +1 -1
mindspore/ops/_grad_experimental/grad_implementations.py +2 -2
mindspore/ops/_grad_experimental/grad_math_ops.py +19 -18
mindspore/ops/_grad_experimental/grad_sparse_ops.py +3 -3
mindspore/ops/_op_impl/aicpu/add.py +3 -3
mindspore/ops/_op_impl/aicpu/linear_sum_assignment.py +21 -2
mindspore/ops/_utils/utils.py +2 -0
mindspore/ops/composite/multitype_ops/_compile_utils.py +2 -1
mindspore/ops/composite/multitype_ops/getitem_impl.py +2 -2
mindspore/ops/function/array_func.py +10 -7
mindspore/ops/function/grad/grad_func.py +0 -1
mindspore/ops/function/nn_func.py +98 -9
mindspore/ops/function/random_func.py +2 -1
mindspore/ops/op_info_register.py +24 -21
mindspore/ops/operations/__init__.py +6 -2
mindspore/ops/operations/_grad_ops.py +25 -6
mindspore/ops/operations/_inner_ops.py +155 -23
mindspore/ops/operations/array_ops.py +9 -7
mindspore/ops/operations/comm_ops.py +2 -2
mindspore/ops/operations/custom_ops.py +85 -68
mindspore/ops/operations/inner_ops.py +26 -3
mindspore/ops/operations/math_ops.py +7 -6
mindspore/ops/operations/nn_ops.py +193 -49
mindspore/parallel/_parallel_serialization.py +10 -3
mindspore/parallel/_tensor.py +4 -1
mindspore/parallel/checkpoint_transform.py +13 -2
mindspore/parallel/shard.py +17 -10
mindspore/profiler/common/util.py +1 -0
mindspore/profiler/parser/ascend_hccl_generator.py +232 -0
mindspore/profiler/parser/ascend_msprof_exporter.py +86 -43
mindspore/profiler/parser/ascend_msprof_generator.py +196 -9
mindspore/profiler/parser/ascend_op_generator.py +1 -1
mindspore/profiler/parser/ascend_timeline_generator.py +6 -182
mindspore/profiler/parser/base_timeline_generator.py +1 -1
mindspore/profiler/parser/cpu_gpu_timeline_generator.py +2 -2
mindspore/profiler/parser/framework_parser.py +1 -1
mindspore/profiler/parser/profiler_info.py +19 -0
mindspore/profiler/profiling.py +46 -24
mindspore/rewrite/api/pattern_engine.py +1 -1
mindspore/rewrite/parsers/for_parser.py +7 -7
mindspore/rewrite/parsers/module_parser.py +4 -4
mindspore/rewrite/symbol_tree.py +1 -4
mindspore/run_check/_check_version.py +5 -3
mindspore/safeguard/rewrite_obfuscation.py +52 -28
mindspore/train/callback/_summary_collector.py +1 -1
mindspore/train/dataset_helper.py +1 -0
mindspore/train/model.py +2 -2
mindspore/train/serialization.py +97 -11
mindspore/train/summary/_summary_adapter.py +1 -1
mindspore/train/summary/summary_record.py +23 -7
mindspore/version.py +1 -1
{mindspore-2.2.0.dist-info → mindspore-2.2.11.dist-info}/METADATA +3 -2
{mindspore-2.2.0.dist-info → mindspore-2.2.11.dist-info}/RECORD +101 -112
mindspore/ops/_op_impl/_custom_op/flash_attention/__init__.py +0 -0
mindspore/ops/_op_impl/_custom_op/flash_attention/attention.py +0 -406
mindspore/ops/_op_impl/_custom_op/flash_attention/constants.py +0 -41
mindspore/ops/_op_impl/_custom_op/flash_attention/flash_attention_bwd.py +0 -467
mindspore/ops/_op_impl/_custom_op/flash_attention/flash_attention_fwd.py +0 -563
mindspore/ops/_op_impl/_custom_op/flash_attention/flash_attention_impl.py +0 -193
mindspore/ops/_op_impl/_custom_op/flash_attention/tik_ops_utils.py +0 -435
mindspore/ops/_op_impl/_custom_op/flash_attention/tiling_strategy/__init__.py +0 -0
mindspore/ops/_op_impl/_custom_op/flash_attention/tiling_strategy/sparse_tiling.py +0 -45
mindspore/ops/_op_impl/_custom_op/flash_attention/tiling_strategy/strategy.py +0 -67
mindspore/ops/_op_impl/_custom_op/flash_attention/tiling_strategy/wukong_tiling.py +0 -62
{mindspore-2.2.0.dist-info → mindspore-2.2.11.dist-info}/WHEEL +0 -0
{mindspore-2.2.0.dist-info → mindspore-2.2.11.dist-info}/entry_points.txt +0 -0
{mindspore-2.2.0.dist-info → mindspore-2.2.11.dist-info}/top_level.txt +0 -0

mindspore/dataset/vision/transforms.py CHANGED Viewed

@@ -144,14 +144,14 @@ class AdjustBrightness(ImageTensorOperation, PyTensorOperation):
         Args:
             device_target (str, optional): The operator will be executed on this device. Currently supports
-                ``CPU`` and ``Ascend`` , where ``Ascend`` refers to Ascend910B device. Default: ``CPU`` .
+                ``CPU`` . Default: ``CPU`` .
         Raises:
             TypeError: If `device_target` is not of type str.
-            ValueError: If `device_target` is not within the valid set of ['CPU', 'Ascend'].
+            ValueError: If `device_target` is not ``CPU`` .
         Supported Platforms:
-            ``CPU`` ``Ascend``
+            ``CPU``
         Examples:
             >>> import mindspore.dataset as ds
@@ -227,14 +227,14 @@ class AdjustContrast(ImageTensorOperation, PyTensorOperation):
         Args:
             device_target (str, optional): The operator will be executed on this device. Currently supports
-                ``CPU`` and ``Ascend`` , where ``Ascend`` refers to Ascend910B device. Default: ``CPU`` .
+                ``CPU`` . Default: ``CPU`` .
         Raises:
             TypeError: If `device_target` is not of type str.
-            ValueError: If `device_target` is not within the valid set of ['CPU', 'Ascend'].
+            ValueError: If `device_target` is not ``CPU`` .
         Supported Platforms:
-            ``CPU`` ``Ascend``
+            ``CPU``
         Examples:
             >>> import mindspore.dataset as ds
@@ -373,14 +373,14 @@ class AdjustHue(ImageTensorOperation, PyTensorOperation):
         Args:
             device_target (str, optional): The operator will be executed on this device. Currently supports
-                ``CPU`` and ``Ascend`` , where ``Ascend`` refers to Ascend910B device. Default: ``CPU`` .
+                ``CPU`` . Default: ``CPU`` .
         Raises:
             TypeError: If `device_target` is not of type str.
-            ValueError: If `device_target` is not within the valid set of ['CPU', 'Ascend'].
+            ValueError: If `device_target` is not ``CPU`` .
         Supported Platforms:
-            ``CPU`` ``Ascend``
+            ``CPU``
         Examples:
             >>> import mindspore.dataset as ds
@@ -457,14 +457,14 @@ class AdjustSaturation(ImageTensorOperation, PyTensorOperation):
         Args:
             device_target (str, optional): The operator will be executed on this device. Currently supports
-                ``CPU`` and ``Ascend`` , where ``Ascend`` refers to Ascend910B device. Default: ``CPU`` .
+                ``CPU`` . Default: ``CPU`` .
         Raises:
             TypeError: If `device_target` is not of type str.
-            ValueError: If `device_target` is not within the valid set of ['CPU', 'Ascend'].
+            ValueError: If `device_target` is not ``CPU`` .
         Supported Platforms:
-            ``CPU`` ``Ascend``
+            ``CPU``
         Examples:
             >>> import mindspore.dataset as ds
@@ -1159,14 +1159,14 @@ class Decode(ImageTensorOperation, PyTensorOperation):
         Args:
             device_target (str, optional): The operator will be executed on this device. Currently supports
-                ``CPU`` and ``Ascend`` , where ``Ascend`` refers to Ascend910B device. Default: ``CPU`` .
+                ``CPU`` . Default: ``CPU`` .
         Raises:
             TypeError: If `device_target` is not of type str.
-            ValueError: If `device_target` is not within the valid set of ['CPU', 'Ascend'].
+            ValueError: If `device_target` is not ``CPU`` .
         Supported Platforms:
-            ``CPU`` ``Ascend``
+            ``CPU``
         Examples:
             >>> import mindspore.dataset as ds
@@ -1908,14 +1908,14 @@ class Normalize(ImageTensorOperation):
         Args:
             device_target (str, optional): The operator will be executed on this device. Currently supports
-                ``CPU`` and ``Ascend`` , where ``Ascend`` refers to Ascend910B device. Default: ``CPU`` .
+                ``CPU`` . Default: ``CPU`` .
         Raises:
             TypeError: If `device_target` is not of type str.
-            ValueError: If `device_target` is not within the valid set of ['CPU', 'Ascend'].
+            ValueError: If `device_target` is not ``CPU`` .
         Supported Platforms:
-            ``CPU`` ``Ascend``
+            ``CPU``
         Examples:
             >>> import mindspore.dataset as ds
@@ -4182,14 +4182,14 @@ class Resize(ImageTensorOperation, PyTensorOperation):
         Args:
             device_target (str, optional): The operator will be executed on this device. Currently supports
-                ``CPU`` and ``Ascend`` , where ``Ascend`` refers to Ascend910B device. Default: ``CPU`` .
+                ``CPU`` . Default: ``CPU`` .
         Raises:
             TypeError: If `device_target` is not of type str.
-            ValueError: If `device_target` is not within the valid set of ['CPU', 'Ascend'].
+            ValueError: If `device_target` is not ``CPU`` .
         Supported Platforms:
-            ``CPU`` ``Ascend``
+            ``CPU``
         Examples:
             >>> import mindspore.dataset as ds

mindspore/experimental/optim/adam.py CHANGED Viewed

@@ -43,7 +43,7 @@ def _run_adam_with_amsgrad_opt(opt, beta1_power, beta2_power, lr, gradient, para
 class Adam(Optimizer):
     r"""
-    Implements Adam algorithm..
+    Implements Adam algorithm.
     The updating formulas are as follows:

mindspore/gen_ops.py CHANGED Viewed

@@ -120,7 +120,7 @@ def generate_py_primitive(yaml_data):
                 assign_str += arg_name
             args_assign.append(assign_str)
-        args_assign = '\n'.join(assign for assign in args_assign)
+        args_assign = '\n'.join([assign for assign in args_assign])
         primitive_code = f"""
 class {class_name}(Primitive):
     def __init__(self, {', '.join(init_args_with_default)}):

mindspore/include/api/model.h CHANGED Viewed

@@ -136,6 +136,13 @@ class MS_API Model {
   /// \return Status.
   Status UpdateWeights(const std::vector<MSTensor> &new_weights);
+  /// \brief Change the size and or content of weight tensors
+  ///
+  /// \param[in]  A vector where model constant are arranged in sequence
+  ///
+  /// \return Status.
+  Status UpdateWeights(const std::vector<std::vector<MSTensor>> &new_weights);
   /// \brief Inference model API. If use this API in train mode, it's equal to RunStep API.
   ///
   /// \param[in] inputs A vector where model inputs are arranged in sequence.
@@ -358,6 +365,13 @@ class MS_API Model {
   const std::shared_ptr<ModelImpl> impl() const { return impl_; }
+  /// \brief Get model info by key
+  ///
+  /// \param[in] key The key of model info key-value pair
+  ///
+  /// \return The value of the model info associated with the given key.
+  inline std::string GetModelInfo(const std::string &key);
  private:
   friend class Serialization;
   // api without std::string
@@ -374,6 +388,7 @@ class MS_API Model {
                const std::vector<char> &cropto_lib_path);
   Status Build(const std::vector<char> &model_path, ModelType model_type, const std::shared_ptr<Context> &model_context,
                const Key &dec_key, const std::vector<char> &dec_mode, const std::vector<char> &cropto_lib_path);
+  std::vector<char> GetModelInfo(const std::vector<char> &key);
   std::shared_ptr<ModelImpl> impl_;
 };
@@ -416,5 +431,7 @@ Status Model::Build(const std::string &model_path, ModelType model_type,
                     const std::shared_ptr<Context> &model_context) {
   return Build(StringToChar(model_path), model_type, model_context);
 }
+inline std::string Model::GetModelInfo(const std::string &key) { return CharToString(GetModelInfo(StringToChar(key))); }
 }  // namespace mindspore
 #endif  // MINDSPORE_INCLUDE_API_MODEL_H

mindspore/include/api/status.h CHANGED Viewed

@@ -83,9 +83,14 @@ enum StatusCode : uint32_t {
   kLiteModelRebuild = kLite | (0x0FFFFFFF & -12),    /**< Model has been built. */
   // Executor error code, range: [-100,-200)
-  kLiteOutOfTensorRange = kLite | (0x0FFFFFFF & -100), /**< Failed to check range. */
-  kLiteInputTensorError = kLite | (0x0FFFFFFF & -101), /**< Failed to check input tensor. */
-  kLiteReentrantError = kLite | (0x0FFFFFFF & -102),   /**< Exist executor running. */
+  kLiteOutOfTensorRange = kLite | (0x0FFFFFFF & -100),           /**< Failed to check range. */
+  kLiteInputTensorError = kLite | (0x0FFFFFFF & -101),           /**< Failed to check input tensor. */
+  kLiteReentrantError = kLite | (0x0FFFFFFF & -102),             /**< Exist executor running. */
+  kLiteLLMWaitProcessTimeOut = kLite | (0x0FFFFFFF & -103),      /**< Wait to be processed time out. */
+  kLiteLLMKVCacheNotExist = kLite | (0x0FFFFFFF & -104),         /**< KV Cache not exist. */
+  kLiteLLMRepeatRequest = kLite | (0x0FFFFFFF & -105),           /**< repeat request. */
+  kLiteLLMRequestAlreadyCompleted = kLite | (0x0FFFFFFF & -106), /**< request already complete!. */
+  kLiteLLMEngineFinalized = kLite | (0x0FFFFFFF & -107),         /**< llm engine finalized. */
   // Graph error code, range: [-200,-300)
   kLiteGraphFileError = kLite | (0x0FFFFFFF & -200), /**< Failed to verify graph file. */

mindspore/mindspore_backend.dll CHANGED Viewed

Binary file

mindspore/mindspore_common.dll CHANGED Viewed

Binary file

mindspore/mindspore_core.dll CHANGED Viewed

Binary file

mindspore/mindspore_shared_lib.dll CHANGED Viewed

Binary file

mindspore/nn/cell.py CHANGED Viewed

@@ -1081,9 +1081,6 @@ class Cell(Cell_):
         if not isinstance(param, Parameter) and param is not None:
             raise TypeError(f"For 'insert_param_to_cell', the argument 'param' must be 'Parameter' if not None, "
                             f"but got {type(param)}.")
-        if param is None:
-            raise TypeError(f"For 'insert_param_to_cell', the argument 'param' must not be None, "
-                            f"but got None.")
         if isinstance(param, Parameter) and param.name == PARAMETER_NAME_DEFAULT:
             param.name = param_name
         self._params[param_name] = param

mindspore/nn/layer/activation.py CHANGED Viewed

@@ -932,10 +932,8 @@ class GELU(Cell):
         """Initialize GELU."""
         super(GELU, self).__init__()
         validator.check_bool(approximate, 'approximate', self.cls_name)
-        self.approximate = approximate
-        if approximate:
-            self.approximate = 'tanh'
-        else:
+        self.approximate = 'tanh'
+        if not approximate:
             self.approximate = 'none'
     def construct(self, x):
@@ -1335,7 +1333,8 @@ class LRN(Cell):
     .. warning::
         LRN is deprecated on Ascend due to potential accuracy problem. It's recommended to use other
-        normalization methods, e.g. :class:`mindspore.nn.BatchNorm`.
+        normalization methods, e.g. :class:`mindspore.nn.BatchNorm1d` ,
+        :class:`mindspore.nn.BatchNorm2d` , :class:`mindspore.nn.BatchNorm3d`.
     Refer to :func:`mindspore.ops.lrn` for more details.

mindspore/nn/layer/conv.py CHANGED Viewed

@@ -718,9 +718,9 @@ class Conv3d(_Conv):
         .. math::
             \begin{array}{ll} \\
-                D_{out} ＝ \left \lceil{\frac{D_{in}}{\text{stride[0]}}} \right \rceil \\
-                H_{out} ＝ \left \lceil{\frac{H_{in}}{\text{stride[1]}}} \right \rceil \\
-                W_{out} ＝ \left \lceil{\frac{W_{in}}{\text{stride[2]}}} \right \rceil \\
+                D_{out} = \left \lceil{\frac{D_{in}}{\text{stride[0]}}} \right \rceil \\
+                H_{out} = \left \lceil{\frac{H_{in}}{\text{stride[1]}}} \right \rceil \\
+                W_{out} = \left \lceil{\frac{W_{in}}{\text{stride[2]}}} \right \rceil \\
             \end{array}
@@ -728,11 +728,11 @@ class Conv3d(_Conv):
         .. math::
             \begin{array}{ll} \\
-                D_{out} ＝ \left \lfloor{\frac{D_{in} - \text{dilation[0]} \times (\text{kernel_size[0]} - 1) }
+                D_{out} = \left \lfloor{\frac{D_{in} - \text{dilation[0]} \times (\text{kernel_size[0]} - 1) }
                 {\text{stride[0]}} + 1} \right \rfloor \\
-                H_{out} ＝ \left \lfloor{\frac{H_{in} - \text{dilation[1]} \times (\text{kernel_size[1]} - 1) }
+                H_{out} = \left \lfloor{\frac{H_{in} - \text{dilation[1]} \times (\text{kernel_size[1]} - 1) }
                 {\text{stride[1]}} + 1} \right \rfloor \\
-                W_{out} ＝ \left \lfloor{\frac{W_{in} - \text{dilation[2]} \times (\text{kernel_size[2]} - 1) }
+                W_{out} = \left \lfloor{\frac{W_{in} - \text{dilation[2]} \times (\text{kernel_size[2]} - 1) }
                 {\text{stride[2]}} + 1} \right \rfloor \\
             \end{array}
@@ -740,11 +740,11 @@ class Conv3d(_Conv):
         .. math::
             \begin{array}{ll} \\
-                D_{out} ＝ \left \lfloor{\frac{D_{in} + padding[0] + padding[1] - (\text{dilation[0]} - 1) \times
+                D_{out} = \left \lfloor{\frac{D_{in} + padding[0] + padding[1] - (\text{dilation[0]} - 1) \times
                 \text{kernel_size[0]} - 1 }{\text{stride[0]}} + 1} \right \rfloor \\
-                H_{out} ＝ \left \lfloor{\frac{H_{in} + padding[2] + padding[3] - (\text{dilation[1]} - 1) \times
+                H_{out} = \left \lfloor{\frac{H_{in} + padding[2] + padding[3] - (\text{dilation[1]} - 1) \times
                 \text{kernel_size[1]} - 1 }{\text{stride[1]}} + 1} \right \rfloor \\
-                W_{out} ＝ \left \lfloor{\frac{W_{in} + padding[4] + padding[5] - (\text{dilation[2]} - 1) \times
+                W_{out} = \left \lfloor{\frac{W_{in} + padding[4] + padding[5] - (\text{dilation[2]} - 1) \times
                 \text{kernel_size[2]} - 1 }{\text{stride[2]}} + 1} \right \rfloor \\
             \end{array}
@@ -812,7 +812,7 @@ class Conv3d(_Conv):
             bias_init,
             data_format,
             dtype=dtype)
-        out_channels = self.out_channels
+        out_channels = self.out_channels // group
         self.conv3d = P.Conv3D(out_channel=out_channels,
                                kernel_size=self.kernel_size,
                                mode=1,
@@ -820,17 +820,33 @@ class Conv3d(_Conv):
                                pad=self.padding,
                                stride=self.stride,
                                dilation=self.dilation,
-                               group=group,
+                               group=1,
                                data_format=self.data_format)
         self.bias_add = P.BiasAdd(data_format=self.data_format)
         self.shape = P.Shape()
+        self.concat = P.Concat(1)
+        self.split_0 = P.Split(0, self.group)
+        self.split_1 = P.Split(1, self.group)
     def construct(self, x):
         x_shape = self.shape(x)
         _check_input_5dims(x_shape, self.cls_name)
-        out = self.conv3d(x, self.weight)
-        if self.has_bias:
-            out = self.bias_add(out, self.bias)
+        if self.group == 1:
+            out = self.conv3d(x, self.weight)
+            if self.has_bias:
+                out = self.bias_add(out, self.bias)
+        else:
+            features = self.split_1(x)
+            weights = self.split_0(self.weight)
+            outputs = ()
+            for i in range(self.group):
+                output = self.conv3d(features[i], weights[i])
+                outputs = outputs + (output,)
+            out = self.concat(outputs)
+            if self.bias is not None:
+                new_shape = [1 for _ in range(out.ndim)]
+                new_shape[1] = self.out_channels
+                out = out + self.bias.reshape(new_shape)
         return out
@@ -921,9 +937,9 @@ class Conv3dTranspose(_Conv):
         .. math::
             \begin{array}{ll} \\
-                D_{out} ＝ \left \lfloor{\frac{D_{in}}{\text{stride[0]}} + 1} \right \rfloor \\
-                H_{out} ＝ \left \lfloor{\frac{H_{in}}{\text{stride[1]}} + 1} \right \rfloor \\
-                W_{out} ＝ \left \lfloor{\frac{W_{in}}{\text{stride[2]}} + 1} \right \rfloor \\
+                D_{out} = \left \lfloor{\frac{D_{in}}{\text{stride[0]}} + 1} \right \rfloor \\
+                H_{out} = \left \lfloor{\frac{H_{in}}{\text{stride[1]}} + 1} \right \rfloor \\
+                W_{out} = \left \lfloor{\frac{W_{in}}{\text{stride[2]}} + 1} \right \rfloor \\
             \end{array}
@@ -931,11 +947,11 @@ class Conv3dTranspose(_Conv):
         .. math::
             \begin{array}{ll} \\
-                D_{out} ＝ \left \lfloor{\frac{D_{in} - \text{dilation[0]} \times (\text{kernel_size[0]} - 1) }
+                D_{out} = \left \lfloor{\frac{D_{in} - \text{dilation[0]} \times (\text{kernel_size[0]} - 1) }
                 {\text{stride[0]}} + 1} \right \rfloor \\
-                H_{out} ＝ \left \lfloor{\frac{H_{in} - \text{dilation[1]} \times (\text{kernel_size[1]} - 1) }
+                H_{out} = \left \lfloor{\frac{H_{in} - \text{dilation[1]} \times (\text{kernel_size[1]} - 1) }
                 {\text{stride[1]}} + 1} \right \rfloor \\
-                W_{out} ＝ \left \lfloor{\frac{W_{in} - \text{dilation[2]} \times (\text{kernel_size[2]} - 1) }
+                W_{out} = \left \lfloor{\frac{W_{in} - \text{dilation[2]} \times (\text{kernel_size[2]} - 1) }
                 {\text{stride[2]}} + 1} \right \rfloor \\
             \end{array}
@@ -943,11 +959,11 @@ class Conv3dTranspose(_Conv):
         .. math::
             \begin{array}{ll} \\
-                D_{out} ＝ \left \lfloor{\frac{D_{in} + padding[0] + padding[1] - (\text{dilation[0]} - 1) \times
+                D_{out} = \left \lfloor{\frac{D_{in} + padding[0] + padding[1] - (\text{dilation[0]} - 1) \times
                 \text{kernel_size[0]} - 1 }{\text{stride[0]}} + 1} \right \rfloor \\
-                H_{out} ＝ \left \lfloor{\frac{H_{in} + padding[2] + padding[3] - (\text{dilation[1]} - 1) \times
+                H_{out} = \left \lfloor{\frac{H_{in} + padding[2] + padding[3] - (\text{dilation[1]} - 1) \times
                 \text{kernel_size[1]} - 1 }{\text{stride[1]}} + 1} \right \rfloor \\
-                W_{out} ＝ \left \lfloor{\frac{W_{in} + padding[4] + padding[5] - (\text{dilation[2]} - 1) \times
+                W_{out} = \left \lfloor{\frac{W_{in} + padding[4] + padding[5] - (\text{dilation[2]} - 1) \times
                 \text{kernel_size[2]} - 1 }{\text{stride[2]}} + 1} \right \rfloor \\
             \end{array}

mindspore/nn/layer/flash_attention.py CHANGED Viewed

@@ -21,9 +21,7 @@ import mindspore.common.dtype as mstype
 from mindspore.common.tensor import Tensor
 from mindspore import ops
 from mindspore.nn.cell import Cell
-from mindspore.ops._op_impl._custom_op.flash_attention.flash_attention_impl import get_flash_attention
 from mindspore.ops.operations.nn_ops import FlashAttentionScore
-from mindspore._c_expression import MSContext
 __all__ = ['FlashAttention']
@@ -46,25 +44,25 @@ class FlashAttention(Cell):
             Default 65536.
         next_block_num(int): A integer to define the number of blocks to look behind for local block sparse attention.
             Default 65536.
-        tiling_stgy_name(str): A str to define tiling strategy of flash attention.
         dp(int): data parallel.
             Default 1.
         mp(int): model parallel.
             Default 1.
-        high_precision(bool): This mode has higher precision but some performance loss.
+        high_precision(bool): This mode has higher precision but some performance loss. Only take effect on Ascend910A.
             Default False.
         have_attention_mask_batch(bool): indicates whether attention_mask contains the batch dimension.
             Default True
         alibi(bool): This parameter indicates whether the flashattention supports the Alibi.
             Default: False
+        use_mqa(bool): Using MQA if True, only take effect under 910B. Default: False.
     Inputs:
       - **query** (Tensor) - Tensor query (:class:`mstype.fp16` [batch_size, head_num, seq_length, head_dim])
       - **key** (Tensor) - Tensor key (:class:`mstype.fp16` [batch_size, head_num, seq_length, head_dim])
       - **value** (Tensor) - Tensor value (:class:`mstype.fp16` [batch_size, head_num, seq_length, head_dim])
-      - **attention_mask** (Tensor) - Float Tensor the mask of (:class:`mstype.fp16` [batch_size, seq_length,
-          seq_length]): A matrix to pass masked information.
+      - **attention_mask** (Tensor) - Float Tensor the mask of (:class:`mstype.fp16` `mstype.uint8`
+        [batch_size, seq_length, seq_length]): A matrix to pass masked information.
     Outputs:
         A Tensor. The output of the attention with shape [batch_size, head_num, seq_length, head_dim]
@@ -97,56 +95,51 @@ class FlashAttention(Cell):
                  dropout_rate=0.0,
                  prev_block_num=65536,
                  next_block_num=65536,
-                 tiling_stgy_name="sparse",
                  dp=1,
                  mp=1,
                  high_precision=False,
                  have_attention_mask_batch=True,
-                 alibi=False
+                 alibi=False,
+                 use_mqa=False
                  ):
         super(FlashAttention, self).__init__()
         scaling_constant = math.sqrt(head_dim)
         if scaling_constant == 0:
             raise ValueError("the scaling constant must not be 0.")
-        self.scale_factor = Tensor([1. / scaling_constant], dtype=mstype.float16)
+        self.dropout_rate = dropout_rate
+        self.alibi = alibi
+        self.have_attention_mask_batch = have_attention_mask_batch
-        self.is_910A = MSContext.get_instance().get_ascend_soc_version() == "Ascend910"
-        if self.is_910A:
-            self.flash_attention = get_flash_attention(
-                prev_block_num=prev_block_num,
-                next_block_num=next_block_num,
-                tiling_stgy_name=tiling_stgy_name,
-                high_precision=high_precision
-            )
-            self.flash_attention.add_prim_attr("primitive_target", "Ascend")
-        else:
-            if alibi:
-                raise ValueError(f"When soc_version is not Ascend910A, alibi must be False")
-            self.transpose_4d_pre = ops.Transpose().shard(((dp, mp, 1, 1),))
-            self.transpose_4d_post = ops.Transpose().shard(((dp, 1, mp, 1),))
-            self.reshape = ops.Reshape()
-            self.zeros_like = ops.ZerosLike().shard(((dp, mp, 1, 1),))
-            self.zeros = ops.Zeros()
-            self.attn_expand_dims = ops.ExpandDims().shard(((dp, 1, 1),))
-            fa_strategies = ((dp, 1, mp),
-                             (dp, 1, mp),
-                             (dp, 1, mp),
+        self.transpose_4d_pre = ops.Transpose().shard(((dp, mp, 1, 1),))
+        self.transpose_4d_post = ops.Transpose().shard(((dp, 1, mp, 1),))
+        self.reshape = ops.Reshape()
+        self.zeros_like = ops.ZerosLike().shard(((dp, mp, 1, 1),))
+        self.zeros = ops.Zeros()
+        self.attn_cast = ops.Cast()
+        if use_mqa:
+            fa_strategies = ((dp, mp, 1, 1),
+                             (dp, 1, 1, 1),
                              (dp, 1, 1, 1))
-            if dropout_rate > 1e-5:
-                fa_strategies += ((dp, mp, 1, 1),)
-            self.flash_attention = FlashAttentionScore(head_num=head_num, pre_tokens=prev_block_num,
-                                                       next_tokens=next_block_num,
-                                                       keep_prob=1 - dropout_rate,
-                                                       scale_value=1.0,
-                                                       inner_precise=0 if high_precision else 1).shard(fa_strategies)
+        else:
+            fa_strategies = ((dp, mp, 1, 1),
+                             (dp, mp, 1, 1),
+                             (dp, mp, 1, 1))
+        if self.alibi:
+            self.alibi_rescale_mul = ops.Mul().shard(((dp, mp, 1, 1), (1,)))
+            self.alibi_rescale_factor = Tensor([scaling_constant], dtype=mstype.float16)
+            fa_strategies += ((dp, mp, 1, 1),)
+        if dropout_rate > 1e-5:
+            fa_strategies += ((dp, mp, 1, 1),)
+        fa_strategies += ((dp, 1, 1, 1),)
+        self.flash_attention = FlashAttentionScore(head_num=head_num, pre_tokens=prev_block_num,
+                                                   next_tokens=next_block_num,
+                                                   keep_prob=1 - dropout_rate,
+                                                   scale_value=1. / scaling_constant,
+                                                   inner_precise=0,
+                                                   input_layout="BNSD").shard(fa_strategies)
-        self.ones = ops.Ones()
-        self.dim_mask = Tensor([1 for _ in range(head_dim)], dtype=mstype.int8)
-        self.scale_mul = ops.Mul().shard(((dp, mp, 1, 1), (1,)))
         self.dropout_rate = dropout_rate
-        self.have_attention_mask_batch = have_attention_mask_batch
-        self.alibi = alibi
         if self.dropout_rate > 1e-5:
             self.keep_prob = Tensor(1 - self.dropout_rate, dtype=mstype.float16)
             self.fill_v2 = ops.FillV2().shard(((dp, mp, 1, 1), ()))
@@ -162,46 +155,7 @@ class FlashAttention(Cell):
                                   such as MatMul. Default: None.
         :return:
         """
-        if in_strategy is None:
-            # default: dp=1, mp=1, construct inputs only contain query, key, value
-            in_strategy = (
-                (1, 1, 1, 1),
-                (1, 1, 1, 1),
-                (1, 1, 1, 1),
-            )
         self.flash_attention.shard(in_strategy)
-        dp = in_strategy[0][0]
-        mp = in_strategy[0][1]
-        self.flash_attention.add_prim_attr("dev_matrix_shape", [dp, mp, 1, 1])
-        inputs_tensor_map = [
-            [3, 2, 1, 0],
-            [3, 2, 1, 0],
-            [3, 2, 1, 0],
-        ]
-        if self.have_attention_mask_batch:
-            inputs_tensor_map.append([3, 1, 0])
-        else:
-            inputs_tensor_map.append([-1, 1, 0])
-        input_empty_args_num = 2
-        # dropout_mask
-        if self.dropout_rate > 1e-5:
-            input_empty_args_num -= 1
-            inputs_tensor_map.append([3, 2, 1, 0])
-        if self.alibi:
-            input_empty_args_num -= 1
-            inputs_tensor_map.append([3, 2, 1, 0])
-        self.flash_attention.add_prim_attr("inputs_tensor_map", inputs_tensor_map)
-        self.flash_attention.add_prim_attr("outputs_tensor_map", [
-            [3, 2, 1, 0],  # O
-            [3, 2, 1],  # L
-            [3, 2, 1]  # M
-        ])
-        self.flash_attention.add_prim_attr("as_loss_divisor", 0)
-        self.flash_attention.add_prim_attr("empty_mirror_ops", input_empty_args_num)
     def construct(self, query, key, value, attn_mask=None, alibi_mask=None):
         """FlashAttention forward
@@ -212,53 +166,24 @@ class FlashAttention(Cell):
         :param alibi_mask: [bsz, head_num, 1, seq_len], if not None
         :return: output          [bsz, head_num, seq_len, head_dim]
         """
-        query = self.scale_mul(query, self.scale_factor)
-        bsz, head_num, seq_len, head_dim = query.shape
-        _, k_head_num, k_seq_len, _ = key.shape
-        _, v_head_num, v_seq_len, _ = value.shape
-        if head_num != k_head_num or head_num != v_head_num:
-            raise ValueError(
-                "the head_num of query, key and value must be the same, "
-                "If different head_num are used, users need to change themselves to be same by tile.")
-        if seq_len % 16 != 0 or k_seq_len % 16 != 0 or k_seq_len != v_seq_len:
-            raise ValueError(
-                "query, key, value seq_len must be a multiple of 16, and key seq_len, value seq_len must be the same.")
-        if head_dim > 304:
-            raise ValueError(
-                "the head_dim must be less than 304, otherwise the ub would be OOM.")
-        if self.is_910A:
-            # 910A -- FlashAttentionPrimtive
-            if self.dropout_rate > 1e-5:
-                drop_mask_bits = self.drop_gen_mask((bsz, head_num, seq_len, seq_len), self.keep_prob)
-                tensor_shape = Tensor((bsz, head_num, seq_len, seq_len), mstype.int32)
-                ones = self.fill_v2(tensor_shape, self.tensor_one)
-                ones = self.depend(ones, query)
-                drop_mask = self.do_dropout(ones, drop_mask_bits, self.keep_prob)
-            else:
-                drop_mask = None
-            output, _, _ = self.flash_attention(query, key, value, attn_mask, drop_mask, alibi_mask)
+        bsz, head_num, seq_len, _ = query.shape
+        # 910B -- FlashAttentionScore
+        if self.dropout_rate > 1e-5:
+            drop_mask_bits = self.reshape(self.drop_gen_mask((bsz, head_num, seq_len, seq_len), self.keep_prob),
+                                          (bsz, head_num, seq_len, seq_len // 8))
         else:
-            # FlashAttentionScore
-            # Useless input, just for binary calls.
-            if self.dropout_rate > 1e-5:
-                drop_mask_bits = self.reshape(self.drop_gen_mask((bsz, head_num, seq_len, seq_len), self.keep_prob),
-                                              (bsz, head_num, seq_len, seq_len // 8))
-            else:
-                drop_mask_bits = None
-            # (B, N, S, D) -> (B, S, H)
-            query = self.reshape(self.transpose_4d_pre(query, (0, 2, 1, 3)), (bsz, seq_len, -1))
-            key = self.reshape(self.transpose_4d_pre(key, (0, 2, 1, 3)), (bsz, seq_len, -1))
-            value = self.reshape(self.transpose_4d_pre(value, (0, 2, 1, 3)), (bsz, seq_len, -1))
-            attn_mask = self.attn_expand_dims(attn_mask, 1)
-            output, _, _ = self.flash_attention(query,
-                                                key,
-                                                value,
-                                                attn_mask,
-                                                drop_mask_bits,
-                                                None,
-                                                None)
-            output = self.transpose_4d_post(self.reshape(output, (bsz, seq_len, head_num, head_dim)), (0, 2, 1, 3))
+            drop_mask_bits = None
+        if self.alibi:
+            alibi_mask = self.alibi_rescale_mul(alibi_mask, self.cast(self.alibi_rescale_factor, alibi_mask.dtype))
+        # (B, S, S) -> (B, 1, S, S)
+        if self.have_attention_mask_batch:
+            attn_mask = self.cast(self.reshape(attn_mask, (bsz, 1, seq_len, seq_len)), mstype.uint8)
+        _, _, _, output = self.flash_attention(query,
+                                               key,
+                                               value,
+                                               alibi_mask,
+                                               drop_mask_bits,
+                                               None,
+                                               attn_mask,
+                                               None)
         return output