mindspore-2.2.0-cp39-cp39-win_amd64.whl → mindspore-2.2.10-cp39-cp39-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mindspore might be problematic.

Files changed (122)
  1. mindspore/.commit_id +1 -1
  2. mindspore/Microsoft.VisualStudio.Telemetry.dll +0 -0
  3. mindspore/Newtonsoft.Json.dll +0 -0
  4. mindspore/_c_dataengine.cp39-win_amd64.pyd +0 -0
  5. mindspore/_c_expression.cp39-win_amd64.pyd +0 -0
  6. mindspore/_c_mindrecord.cp39-win_amd64.pyd +0 -0
  7. mindspore/_checkparam.py +3 -3
  8. mindspore/_extends/graph_kernel/model/graph_split.py +84 -76
  9. mindspore/_extends/graph_kernel/splitter.py +3 -2
  10. mindspore/_extends/parallel_compile/akg_compiler/build_tbe_kernel.py +83 -66
  11. mindspore/_extends/parallel_compile/akg_compiler/tbe_topi.py +4 -4
  12. mindspore/_extends/parallel_compile/akg_compiler/util.py +10 -7
  13. mindspore/_extends/parallel_compile/tbe_compiler/tbe_helper.py +2 -1
  14. mindspore/_extends/parse/standard_method.py +2 -9
  15. mindspore/_extends/remote/kernel_build_server.py +2 -1
  16. mindspore/atlprov.dll +0 -0
  17. mindspore/c1.dll +0 -0
  18. mindspore/c1xx.dll +0 -0
  19. mindspore/c2.dll +0 -0
  20. mindspore/common/api.py +1 -1
  21. mindspore/common/auto_dynamic_shape.py +81 -85
  22. mindspore/common/dump.py +1 -1
  23. mindspore/common/tensor.py +3 -20
  24. mindspore/config/op_info.config +1 -1
  25. mindspore/context.py +11 -4
  26. mindspore/dataset/engine/datasets_standard_format.py +5 -0
  27. mindspore/dataset/vision/transforms.py +21 -21
  28. mindspore/dnnl.dll +0 -0
  29. mindspore/dpcmi.dll +0 -0
  30. mindspore/experimental/optim/adam.py +1 -1
  31. mindspore/gen_ops.py +1 -1
  32. mindspore/include/api/model.h +17 -0
  33. mindspore/include/api/status.h +8 -3
  34. mindspore/jpeg62.dll +0 -0
  35. mindspore/mindspore_backend.dll +0 -0
  36. mindspore/mindspore_common.dll +0 -0
  37. mindspore/mindspore_core.dll +0 -0
  38. mindspore/mindspore_glog.dll +0 -0
  39. mindspore/mindspore_shared_lib.dll +0 -0
  40. mindspore/msobj140.dll +0 -0
  41. mindspore/mspdb140.dll +0 -0
  42. mindspore/mspdbcore.dll +0 -0
  43. mindspore/mspdbst.dll +0 -0
  44. mindspore/mspft140.dll +0 -0
  45. mindspore/msvcdis140.dll +0 -0
  46. mindspore/msvcp140_1.dll +0 -0
  47. mindspore/msvcp140_2.dll +0 -0
  48. mindspore/msvcp140_atomic_wait.dll +0 -0
  49. mindspore/msvcp140_codecvt_ids.dll +0 -0
  50. mindspore/nn/cell.py +0 -3
  51. mindspore/nn/layer/activation.py +4 -5
  52. mindspore/nn/layer/conv.py +39 -23
  53. mindspore/nn/layer/flash_attention.py +90 -78
  54. mindspore/nn/layer/math.py +3 -7
  55. mindspore/nn/layer/rnn_cells.py +5 -5
  56. mindspore/nn/wrap/cell_wrapper.py +6 -0
  57. mindspore/numpy/utils_const.py +5 -5
  58. mindspore/opencv_core452.dll +0 -0
  59. mindspore/opencv_imgcodecs452.dll +0 -0
  60. mindspore/opencv_imgproc452.dll +0 -0
  61. mindspore/ops/_grad_experimental/grad_array_ops.py +1 -1
  62. mindspore/ops/_grad_experimental/grad_implementations.py +2 -2
  63. mindspore/ops/_grad_experimental/grad_math_ops.py +19 -18
  64. mindspore/ops/_grad_experimental/grad_sparse_ops.py +3 -3
  65. mindspore/ops/_op_impl/aicpu/add.py +3 -3
  66. mindspore/ops/_utils/utils.py +2 -0
  67. mindspore/ops/composite/multitype_ops/_compile_utils.py +2 -1
  68. mindspore/ops/composite/multitype_ops/getitem_impl.py +2 -2
  69. mindspore/ops/function/array_func.py +10 -7
  70. mindspore/ops/function/grad/grad_func.py +0 -1
  71. mindspore/ops/function/nn_func.py +98 -9
  72. mindspore/ops/function/random_func.py +2 -1
  73. mindspore/ops/op_info_register.py +24 -21
  74. mindspore/ops/operations/__init__.py +3 -2
  75. mindspore/ops/operations/_grad_ops.py +24 -4
  76. mindspore/ops/operations/_inner_ops.py +155 -23
  77. mindspore/ops/operations/array_ops.py +9 -7
  78. mindspore/ops/operations/comm_ops.py +2 -2
  79. mindspore/ops/operations/custom_ops.py +85 -68
  80. mindspore/ops/operations/inner_ops.py +26 -3
  81. mindspore/ops/operations/math_ops.py +4 -3
  82. mindspore/ops/operations/nn_ops.py +109 -28
  83. mindspore/parallel/_parallel_serialization.py +10 -3
  84. mindspore/parallel/_tensor.py +4 -1
  85. mindspore/parallel/checkpoint_transform.py +13 -2
  86. mindspore/parallel/shard.py +17 -10
  87. mindspore/pgodb140.dll +0 -0
  88. mindspore/pgort140.dll +0 -0
  89. mindspore/profiler/common/util.py +1 -0
  90. mindspore/profiler/parser/ascend_hccl_generator.py +232 -0
  91. mindspore/profiler/parser/ascend_msprof_exporter.py +86 -43
  92. mindspore/profiler/parser/ascend_msprof_generator.py +196 -9
  93. mindspore/profiler/parser/ascend_op_generator.py +1 -1
  94. mindspore/profiler/parser/ascend_timeline_generator.py +6 -182
  95. mindspore/profiler/parser/base_timeline_generator.py +1 -1
  96. mindspore/profiler/parser/cpu_gpu_timeline_generator.py +2 -2
  97. mindspore/profiler/parser/framework_parser.py +1 -1
  98. mindspore/profiler/parser/profiler_info.py +19 -0
  99. mindspore/profiler/profiling.py +46 -24
  100. mindspore/rewrite/api/pattern_engine.py +1 -1
  101. mindspore/rewrite/parsers/for_parser.py +1 -1
  102. mindspore/rewrite/symbol_tree.py +1 -4
  103. mindspore/run_check/_check_version.py +5 -3
  104. mindspore/safeguard/rewrite_obfuscation.py +52 -28
  105. mindspore/tbbmalloc.dll +0 -0
  106. mindspore/tinyxml2.dll +0 -0
  107. mindspore/train/callback/_summary_collector.py +1 -1
  108. mindspore/train/dataset_helper.py +1 -0
  109. mindspore/train/model.py +2 -2
  110. mindspore/train/serialization.py +97 -11
  111. mindspore/train/summary/_summary_adapter.py +1 -1
  112. mindspore/train/summary/summary_record.py +23 -7
  113. mindspore/turbojpeg.dll +0 -0
  114. mindspore/vcmeta.dll +0 -0
  115. mindspore/vcruntime140.dll +0 -0
  116. mindspore/vcruntime140_1.dll +0 -0
  117. mindspore/version.py +1 -1
  118. {mindspore-2.2.0.dist-info → mindspore-2.2.10.dist-info}/METADATA +1 -1
  119. {mindspore-2.2.0.dist-info → mindspore-2.2.10.dist-info}/RECORD +122 -122
  120. {mindspore-2.2.0.dist-info → mindspore-2.2.10.dist-info}/WHEEL +0 -0
  121. {mindspore-2.2.0.dist-info → mindspore-2.2.10.dist-info}/entry_points.txt +0 -0
  122. {mindspore-2.2.0.dist-info → mindspore-2.2.10.dist-info}/top_level.txt +0 -0
mindspore/include/api/model.h CHANGED
@@ -136,6 +136,13 @@ class MS_API Model {
  /// \return Status.
  Status UpdateWeights(const std::vector<MSTensor> &new_weights);

+ /// \brief Change the size and or content of weight tensors
+ ///
+ /// \param[in] A vector where model constant are arranged in sequence
+ ///
+ /// \return Status.
+ Status UpdateWeights(const std::vector<std::vector<MSTensor>> &new_weights);
+
  /// \brief Inference model API. If use this API in train mode, it's equal to RunStep API.
  ///
  /// \param[in] inputs A vector where model inputs are arranged in sequence.
@@ -358,6 +365,13 @@ class MS_API Model {

  const std::shared_ptr<ModelImpl> impl() const { return impl_; }

+ /// \brief Get model info by key
+ ///
+ /// \param[in] key The key of model info key-value pair
+ ///
+ /// \return The value of the model info associated with the given key.
+ inline std::string GetModelInfo(const std::string &key);
+
  private:
  friend class Serialization;
  // api without std::string
@@ -374,6 +388,7 @@ class MS_API Model {
  const std::vector<char> &cropto_lib_path);
  Status Build(const std::vector<char> &model_path, ModelType model_type, const std::shared_ptr<Context> &model_context,
  const Key &dec_key, const std::vector<char> &dec_mode, const std::vector<char> &cropto_lib_path);
+ std::vector<char> GetModelInfo(const std::vector<char> &key);
  std::shared_ptr<ModelImpl> impl_;
  };

@@ -416,5 +431,7 @@ Status Model::Build(const std::string &model_path, ModelType model_type,
  const std::shared_ptr<Context> &model_context) {
  return Build(StringToChar(model_path), model_type, model_context);
  }
+
+ inline std::string Model::GetModelInfo(const std::string &key) { return CharToString(GetModelInfo(StringToChar(key))); }
  } // namespace mindspore
  #endif // MINDSPORE_INCLUDE_API_MODEL_H
mindspore/include/api/status.h CHANGED
@@ -83,9 +83,14 @@ enum StatusCode : uint32_t {
  kLiteModelRebuild = kLite | (0x0FFFFFFF & -12), /**< Model has been built. */

  // Executor error code, range: [-100,-200)
- kLiteOutOfTensorRange = kLite | (0x0FFFFFFF & -100), /**< Failed to check range. */
- kLiteInputTensorError = kLite | (0x0FFFFFFF & -101), /**< Failed to check input tensor. */
- kLiteReentrantError = kLite | (0x0FFFFFFF & -102), /**< Exist executor running. */
+ kLiteOutOfTensorRange = kLite | (0x0FFFFFFF & -100), /**< Failed to check range. */
+ kLiteInputTensorError = kLite | (0x0FFFFFFF & -101), /**< Failed to check input tensor. */
+ kLiteReentrantError = kLite | (0x0FFFFFFF & -102), /**< Exist executor running. */
+ kLiteLLMWaitProcessTimeOut = kLite | (0x0FFFFFFF & -103), /**< Wait to be processed time out. */
+ kLiteLLMKVCacheNotExist = kLite | (0x0FFFFFFF & -104), /**< KV Cache not exist. */
+ kLiteLLMRepeatRequest = kLite | (0x0FFFFFFF & -105), /**< repeat request. */
+ kLiteLLMRequestAlreadyCompleted = kLite | (0x0FFFFFFF & -106), /**< request already complete!. */
+ kLiteLLMEngineFinalized = kLite | (0x0FFFFFFF & -107), /**< llm engine finalized. */

  // Graph error code, range: [-200,-300)
  kLiteGraphFileError = kLite | (0x0FFFFFFF & -200), /**< Failed to verify graph file. */
mindspore/jpeg62.dll CHANGED
Binary file
mindspore/mindspore_backend.dll CHANGED
Binary file
mindspore/mindspore_common.dll CHANGED
Binary file
mindspore/mindspore_core.dll CHANGED
Binary file
mindspore/mindspore_glog.dll CHANGED
Binary file
mindspore/mindspore_shared_lib.dll CHANGED
Binary file
mindspore/msobj140.dll CHANGED
Binary file
mindspore/mspdb140.dll CHANGED
Binary file
mindspore/mspdbcore.dll CHANGED
Binary file
mindspore/mspdbst.dll CHANGED
Binary file
mindspore/mspft140.dll CHANGED
Binary file
mindspore/msvcdis140.dll CHANGED
Binary file
mindspore/msvcp140_1.dll CHANGED
Binary file
mindspore/msvcp140_2.dll CHANGED
Binary file
mindspore/msvcp140_atomic_wait.dll CHANGED
Binary file
mindspore/msvcp140_codecvt_ids.dll CHANGED
Binary file
mindspore/nn/cell.py CHANGED
@@ -1081,9 +1081,6 @@ class Cell(Cell_):
  if not isinstance(param, Parameter) and param is not None:
  raise TypeError(f"For 'insert_param_to_cell', the argument 'param' must be 'Parameter' if not None, "
  f"but got {type(param)}.")
- if param is None:
- raise TypeError(f"For 'insert_param_to_cell', the argument 'param' must not be None, "
- f"but got None.")
  if isinstance(param, Parameter) and param.name == PARAMETER_NAME_DEFAULT:
  param.name = param_name
  self._params[param_name] = param
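The removed check above means insert_param_to_cell no longer rejects a None param in 2.2.10; the slot is simply stored. A minimal sketch of the relaxed behavior (class and parameter names below are illustrative, not from the diff):

import numpy as np
import mindspore.nn as nn
from mindspore import Parameter, Tensor

class Demo(nn.Cell):
    def __init__(self):
        super().__init__()
        # 2.2.0 raised TypeError for a None param; 2.2.10 stores the empty slot.
        self.insert_param_to_cell("placeholder", None)
        # A real Parameter is handled exactly as before.
        self.insert_param_to_cell("weight", Parameter(Tensor(np.ones((2, 2), np.float32))))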
mindspore/nn/layer/activation.py CHANGED
@@ -932,10 +932,8 @@ class GELU(Cell):
  """Initialize GELU."""
  super(GELU, self).__init__()
  validator.check_bool(approximate, 'approximate', self.cls_name)
- self.approximate = approximate
- if approximate:
- self.approximate = 'tanh'
- else:
+ self.approximate = 'tanh'
+ if not approximate:
  self.approximate = 'none'

  def construct(self, x):
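With the rewrite above, the boolean approximate flag is normalized to the string 'tanh' or 'none' at construction time; the public interface is unchanged. A quick sketch:

import mindspore.nn as nn

gelu_tanh = nn.GELU(approximate=True)    # stored internally as 'tanh'
gelu_exact = nn.GELU(approximate=False)  # stored internally as 'none'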
@@ -1335,7 +1333,8 @@ class LRN(Cell):

  .. warning::
  LRN is deprecated on Ascend due to potential accuracy problem. It's recommended to use other
- normalization methods, e.g. :class:`mindspore.nn.BatchNorm`.
+ normalization methods, e.g. :class:`mindspore.nn.BatchNorm1d` ,
+ :class:`mindspore.nn.BatchNorm2d` , :class:`mindspore.nn.BatchNorm3d`.

  Refer to :func:`mindspore.ops.lrn` for more details.

mindspore/nn/layer/conv.py CHANGED
@@ -718,9 +718,9 @@ class Conv3d(_Conv):

  .. math::
  \begin{array}{ll} \\
- D_{out} \left \lceil{\frac{D_{in}}{\text{stride[0]}}} \right \rceil \\
- H_{out} \left \lceil{\frac{H_{in}}{\text{stride[1]}}} \right \rceil \\
- W_{out} \left \lceil{\frac{W_{in}}{\text{stride[2]}}} \right \rceil \\
+ D_{out} = \left \lceil{\frac{D_{in}}{\text{stride[0]}}} \right \rceil \\
+ H_{out} = \left \lceil{\frac{H_{in}}{\text{stride[1]}}} \right \rceil \\
+ W_{out} = \left \lceil{\frac{W_{in}}{\text{stride[2]}}} \right \rceil \\
  \end{array}


@@ -728,11 +728,11 @@ class Conv3d(_Conv):

  .. math::
  \begin{array}{ll} \\
- D_{out} \left \lfloor{\frac{D_{in} - \text{dilation[0]} \times (\text{kernel_size[0]} - 1) }
+ D_{out} = \left \lfloor{\frac{D_{in} - \text{dilation[0]} \times (\text{kernel_size[0]} - 1) }
  {\text{stride[0]}} + 1} \right \rfloor \\
- H_{out} \left \lfloor{\frac{H_{in} - \text{dilation[1]} \times (\text{kernel_size[1]} - 1) }
+ H_{out} = \left \lfloor{\frac{H_{in} - \text{dilation[1]} \times (\text{kernel_size[1]} - 1) }
  {\text{stride[1]}} + 1} \right \rfloor \\
- W_{out} \left \lfloor{\frac{W_{in} - \text{dilation[2]} \times (\text{kernel_size[2]} - 1) }
+ W_{out} = \left \lfloor{\frac{W_{in} - \text{dilation[2]} \times (\text{kernel_size[2]} - 1) }
  {\text{stride[2]}} + 1} \right \rfloor \\
  \end{array}

@@ -740,11 +740,11 @@ class Conv3d(_Conv):

  .. math::
  \begin{array}{ll} \\
- D_{out} \left \lfloor{\frac{D_{in} + padding[0] + padding[1] - (\text{dilation[0]} - 1) \times
+ D_{out} = \left \lfloor{\frac{D_{in} + padding[0] + padding[1] - (\text{dilation[0]} - 1) \times
  \text{kernel_size[0]} - 1 }{\text{stride[0]}} + 1} \right \rfloor \\
- H_{out} \left \lfloor{\frac{H_{in} + padding[2] + padding[3] - (\text{dilation[1]} - 1) \times
+ H_{out} = \left \lfloor{\frac{H_{in} + padding[2] + padding[3] - (\text{dilation[1]} - 1) \times
  \text{kernel_size[1]} - 1 }{\text{stride[1]}} + 1} \right \rfloor \\
- W_{out} \left \lfloor{\frac{W_{in} + padding[4] + padding[5] - (\text{dilation[2]} - 1) \times
+ W_{out} = \left \lfloor{\frac{W_{in} + padding[4] + padding[5] - (\text{dilation[2]} - 1) \times
  \text{kernel_size[2]} - 1 }{\text{stride[2]}} + 1} \right \rfloor \\
  \end{array}

@@ -812,7 +812,7 @@ class Conv3d(_Conv):
  bias_init,
  data_format,
  dtype=dtype)
- out_channels = self.out_channels
+ out_channels = self.out_channels // group
  self.conv3d = P.Conv3D(out_channel=out_channels,
  kernel_size=self.kernel_size,
  mode=1,
@@ -820,17 +820,33 @@ class Conv3d(_Conv):
  pad=self.padding,
  stride=self.stride,
  dilation=self.dilation,
- group=group,
+ group=1,
  data_format=self.data_format)
  self.bias_add = P.BiasAdd(data_format=self.data_format)
  self.shape = P.Shape()
+ self.concat = P.Concat(1)
+ self.split_0 = P.Split(0, self.group)
+ self.split_1 = P.Split(1, self.group)

  def construct(self, x):
  x_shape = self.shape(x)
  _check_input_5dims(x_shape, self.cls_name)
- out = self.conv3d(x, self.weight)
- if self.has_bias:
- out = self.bias_add(out, self.bias)
+ if self.group == 1:
+ out = self.conv3d(x, self.weight)
+ if self.has_bias:
+ out = self.bias_add(out, self.bias)
+ else:
+ features = self.split_1(x)
+ weights = self.split_0(self.weight)
+ outputs = ()
+ for i in range(self.group):
+ output = self.conv3d(features[i], weights[i])
+ outputs = outputs + (output,)
+ out = self.concat(outputs)
+ if self.bias is not None:
+ new_shape = [1 for _ in range(out.ndim)]
+ new_shape[1] = self.out_channels
+ out = out + self.bias.reshape(new_shape)
  return out
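With group > 1, the construct above now splits the input along the channel axis (P.Split(1, group)) and the weight along the output-channel axis (P.Split(0, group)), runs the single-group Conv3D on each slice, concatenates the results, and adds the bias through a reshape. A minimal construction sketch, assuming a backend that accepts group > 1 (shapes are illustrative):

import numpy as np
import mindspore as ms
import mindspore.nn as nn

# in_channels and out_channels must both be divisible by group.
net = nn.Conv3d(in_channels=4, out_channels=8, kernel_size=3, group=2, has_bias=True)
x = ms.Tensor(np.ones((1, 4, 8, 16, 16)), ms.float32)  # (N, C_in, D, H, W)
y = net(x)  # (1, 8, 8, 16, 16) with the default pad_mode='same'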


@@ -921,9 +937,9 @@ class Conv3dTranspose(_Conv):

  .. math::
  \begin{array}{ll} \\
- D_{out} \left \lfloor{\frac{D_{in}}{\text{stride[0]}} + 1} \right \rfloor \\
- H_{out} \left \lfloor{\frac{H_{in}}{\text{stride[1]}} + 1} \right \rfloor \\
- W_{out} \left \lfloor{\frac{W_{in}}{\text{stride[2]}} + 1} \right \rfloor \\
+ D_{out} = \left \lfloor{\frac{D_{in}}{\text{stride[0]}} + 1} \right \rfloor \\
+ H_{out} = \left \lfloor{\frac{H_{in}}{\text{stride[1]}} + 1} \right \rfloor \\
+ W_{out} = \left \lfloor{\frac{W_{in}}{\text{stride[2]}} + 1} \right \rfloor \\
  \end{array}


@@ -931,11 +947,11 @@ class Conv3dTranspose(_Conv):

  .. math::
  \begin{array}{ll} \\
- D_{out} \left \lfloor{\frac{D_{in} - \text{dilation[0]} \times (\text{kernel_size[0]} - 1) }
+ D_{out} = \left \lfloor{\frac{D_{in} - \text{dilation[0]} \times (\text{kernel_size[0]} - 1) }
  {\text{stride[0]}} + 1} \right \rfloor \\
- H_{out} \left \lfloor{\frac{H_{in} - \text{dilation[1]} \times (\text{kernel_size[1]} - 1) }
+ H_{out} = \left \lfloor{\frac{H_{in} - \text{dilation[1]} \times (\text{kernel_size[1]} - 1) }
  {\text{stride[1]}} + 1} \right \rfloor \\
- W_{out} \left \lfloor{\frac{W_{in} - \text{dilation[2]} \times (\text{kernel_size[2]} - 1) }
+ W_{out} = \left \lfloor{\frac{W_{in} - \text{dilation[2]} \times (\text{kernel_size[2]} - 1) }
  {\text{stride[2]}} + 1} \right \rfloor \\
  \end{array}

@@ -943,11 +959,11 @@ class Conv3dTranspose(_Conv):

  .. math::
  \begin{array}{ll} \\
- D_{out} \left \lfloor{\frac{D_{in} + padding[0] + padding[1] - (\text{dilation[0]} - 1) \times
+ D_{out} = \left \lfloor{\frac{D_{in} + padding[0] + padding[1] - (\text{dilation[0]} - 1) \times
  \text{kernel_size[0]} - 1 }{\text{stride[0]}} + 1} \right \rfloor \\
- H_{out} \left \lfloor{\frac{H_{in} + padding[2] + padding[3] - (\text{dilation[1]} - 1) \times
+ H_{out} = \left \lfloor{\frac{H_{in} + padding[2] + padding[3] - (\text{dilation[1]} - 1) \times
  \text{kernel_size[1]} - 1 }{\text{stride[1]}} + 1} \right \rfloor \\
- W_{out} \left \lfloor{\frac{W_{in} + padding[4] + padding[5] - (\text{dilation[2]} - 1) \times
+ W_{out} = \left \lfloor{\frac{W_{in} + padding[4] + padding[5] - (\text{dilation[2]} - 1) \times
  \text{kernel_size[2]} - 1 }{\text{stride[2]}} + 1} \right \rfloor \\
  \end{array}

mindspore/nn/layer/flash_attention.py CHANGED
@@ -57,14 +57,15 @@ class FlashAttention(Cell):
  Default True
  alibi(bool): This parameter indicates whether the flashattention supports the Alibi.
  Default: False
+ use_mqa(bool): Using MHA if True, only take effect under 910B. Default: False.


  Inputs:
  - **query** (Tensor) - Tensor query (:class:`mstype.fp16` [batch_size, head_num, seq_length, head_dim])
  - **key** (Tensor) - Tensor key (:class:`mstype.fp16` [batch_size, head_num, seq_length, head_dim])
  - **value** (Tensor) - Tensor value (:class:`mstype.fp16` [batch_size, head_num, seq_length, head_dim])
- - **attention_mask** (Tensor) - Float Tensor the mask of (:class:`mstype.fp16` [batch_size, seq_length,
- seq_length]): A matrix to pass masked information.
+ - **attention_mask** (Tensor) - Float Tensor the mask of (:class:`mstype.fp16` `mstype.uint8`
+ [batch_size, seq_length, seq_length]): A matrix to pass masked information.

  Outputs:
  A Tensor. The output of the attention with shape [batch_size, head_num, seq_length, head_dim]
@@ -102,17 +103,23 @@
  mp=1,
  high_precision=False,
  have_attention_mask_batch=True,
- alibi=False
+ alibi=False,
+ use_mqa=False
  ):
  super(FlashAttention, self).__init__()

  scaling_constant = math.sqrt(head_dim)
  if scaling_constant == 0:
  raise ValueError("the scaling constant must not be 0.")
- self.scale_factor = Tensor([1. / scaling_constant], dtype=mstype.float16)
-
- self.is_910A = MSContext.get_instance().get_ascend_soc_version() == "Ascend910"
+ self.dropout_rate = dropout_rate
+ self.is_910A = MSContext.get_instance().get_ascend_soc_version() == "ascend910"
  if self.is_910A:
+ self.scale_factor = Tensor([1. / math.sqrt(scaling_constant)], dtype=mstype.float16)
+ self.scale_mul = ops.Mul().shard(((dp, mp, 1, 1), (1,)))
+ self.ones = ops.Ones()
+ self.dim_mask = Tensor([1 for _ in range(head_dim)], dtype=mstype.int8)
+ self.have_attention_mask_batch = have_attention_mask_batch
+ self.alibi = alibi
  self.flash_attention = get_flash_attention(
  prev_block_num=prev_block_num,
  next_block_num=next_block_num,
@@ -120,6 +127,10 @@
  high_precision=high_precision
  )
  self.flash_attention.add_prim_attr("primitive_target", "Ascend")
+ fa_strategies = ((dp, mp, 1, 1),
+ (dp, mp, 1, 1),
+ (dp, mp, 1, 1))
+ self.shard(fa_strategies)
  else:
  if alibi:
  raise ValueError(f"When soc_version is not Ascend910A, alibi must be False")
@@ -128,25 +139,27 @@
  self.reshape = ops.Reshape()
  self.zeros_like = ops.ZerosLike().shard(((dp, mp, 1, 1),))
  self.zeros = ops.Zeros()
- self.attn_expand_dims = ops.ExpandDims().shard(((dp, 1, 1),))
- fa_strategies = ((dp, 1, mp),
- (dp, 1, mp),
- (dp, 1, mp),
- (dp, 1, 1, 1))
+ self.attn_cast = ops.Cast()
+ if use_mqa:
+ fa_strategies = ((dp, mp, 1, 1),
+ (dp, 1, 1, 1),
+ (dp, 1, 1, 1),
+ (dp, 1, 1, 1))
+ else:
+ fa_strategies = ((dp, mp, 1, 1),
+ (dp, mp, 1, 1),
+ (dp, mp, 1, 1),
+ (dp, 1, 1, 1))
  if dropout_rate > 1e-5:
  fa_strategies += ((dp, mp, 1, 1),)
  self.flash_attention = FlashAttentionScore(head_num=head_num, pre_tokens=prev_block_num,
  next_tokens=next_block_num,
  keep_prob=1 - dropout_rate,
- scale_value=1.0,
- inner_precise=0 if high_precision else 1).shard(fa_strategies)
+ scale_value=1. / scaling_constant,
+ inner_precise=0 if high_precision else 1,
+ input_layout="BNSD").shard(fa_strategies)

- self.ones = ops.Ones()
- self.dim_mask = Tensor([1 for _ in range(head_dim)], dtype=mstype.int8)
- self.scale_mul = ops.Mul().shard(((dp, mp, 1, 1), (1,)))
  self.dropout_rate = dropout_rate
- self.have_attention_mask_batch = have_attention_mask_batch
- self.alibi = alibi
  if self.dropout_rate > 1e-5:
  self.keep_prob = Tensor(1 - self.dropout_rate, dtype=mstype.float16)
  self.fill_v2 = ops.FillV2().shard(((dp, mp, 1, 1), ()))
@@ -162,46 +175,49 @@
  such as MatMul. Default: None.
  :return:
  """
- if in_strategy is None:
- # default: dp=1, mp=1, construct inputs only contain query, key, value
- in_strategy = (
- (1, 1, 1, 1),
- (1, 1, 1, 1),
- (1, 1, 1, 1),
- )
- self.flash_attention.shard(in_strategy)
- dp = in_strategy[0][0]
- mp = in_strategy[0][1]
- self.flash_attention.add_prim_attr("dev_matrix_shape", [dp, mp, 1, 1])
- inputs_tensor_map = [
- [3, 2, 1, 0],
- [3, 2, 1, 0],
- [3, 2, 1, 0],
- ]
- if self.have_attention_mask_batch:
- inputs_tensor_map.append([3, 1, 0])
- else:
- inputs_tensor_map.append([-1, 1, 0])
+ if self.is_910A:
+ if in_strategy is None:
+ # default: dp=1, mp=1, construct inputs only contain query, key, value
+ in_strategy = (
+ (1, 1, 1, 1),
+ (1, 1, 1, 1),
+ (1, 1, 1, 1),
+ )
+ self.flash_attention.shard(in_strategy)
+ dp = in_strategy[0][0]
+ mp = in_strategy[0][1]
+ self.flash_attention.add_prim_attr("dev_matrix_shape", [dp, mp, 1, 1])
+ inputs_tensor_map = [
+ [3, 2, 1, 0],
+ [3, 2, 1, 0],
+ [3, 2, 1, 0],
+ ]
+ if self.have_attention_mask_batch:
+ inputs_tensor_map.append([3, 1, 0])
+ else:
+ inputs_tensor_map.append([-1, 1, 0])

- input_empty_args_num = 2
- # dropout_mask
- if self.dropout_rate > 1e-5:
- input_empty_args_num -= 1
- inputs_tensor_map.append([3, 2, 1, 0])
+ input_empty_args_num = 2
+ # dropout_mask
+ if self.dropout_rate > 1e-5:
+ input_empty_args_num -= 1
+ inputs_tensor_map.append([3, 2, 1, 0])

- if self.alibi:
- input_empty_args_num -= 1
- inputs_tensor_map.append([3, 2, 1, 0])
+ if self.alibi:
+ input_empty_args_num -= 1
+ inputs_tensor_map.append([3, 2, 1, 0])

- self.flash_attention.add_prim_attr("inputs_tensor_map", inputs_tensor_map)
+ self.flash_attention.add_prim_attr("inputs_tensor_map", inputs_tensor_map)

- self.flash_attention.add_prim_attr("outputs_tensor_map", [
- [3, 2, 1, 0], # O
- [3, 2, 1], # L
- [3, 2, 1] # M
- ])
- self.flash_attention.add_prim_attr("as_loss_divisor", 0)
- self.flash_attention.add_prim_attr("empty_mirror_ops", input_empty_args_num)
+ self.flash_attention.add_prim_attr("outputs_tensor_map", [
+ [3, 2, 1, 0], # O
+ [3, 2, 1], # L
+ [3, 2, 1] # M
+ ])
+ self.flash_attention.add_prim_attr("as_loss_divisor", 0)
+ self.flash_attention.add_prim_attr("empty_mirror_ops", input_empty_args_num)
+ else:
+ self.flash_attention.shard(in_strategy)

  def construct(self, query, key, value, attn_mask=None, alibi_mask=None):
  """FlashAttention forward
@@ -212,24 +228,22 @@
  :param alibi_mask: [bsz, head_num, 1, seq_len], if not None
  :return: output [bsz, head_num, seq_len, head_dim]
  """
- query = self.scale_mul(query, self.scale_factor)
  bsz, head_num, seq_len, head_dim = query.shape
- _, k_head_num, k_seq_len, _ = key.shape
- _, v_head_num, v_seq_len, _ = value.shape
- if head_num != k_head_num or head_num != v_head_num:
- raise ValueError(
- "the head_num of query, key and value must be the same, "
- "If different head_num are used, users need to change themselves to be same by tile.")
- if seq_len % 16 != 0 or k_seq_len % 16 != 0 or k_seq_len != v_seq_len:
- raise ValueError(
- "query, key, value seq_len must be a multiple of 16, and key seq_len, value seq_len must be the same.")
-
- if head_dim > 304:
- raise ValueError(
- "the head_dim must be less than 304, otherwise the ub would be OOM.")
-
  if self.is_910A:
+ _, k_head_num, k_seq_len, _ = key.shape
+ _, v_head_num, v_seq_len, _ = value.shape
+ if head_num != k_head_num or head_num != v_head_num:
+ raise ValueError(
+ "the head_num of query, key and value must be the same, "
+ "If different head_num are used, users need to change themselves to be same by tile.")
+ if seq_len % 16 != 0 or k_seq_len % 16 != 0 or k_seq_len != v_seq_len:
+ raise ValueError(
+ "query, key, value seq_len must be a multiple of 16, "
+ "and the seq_len between key and value must be equal.")
  # 910A -- FlashAttentionPrimtive
+ if head_dim > 304:
+ raise ValueError(
+ "the head_dim must be less than 304, otherwise the ub would be OOM.")
  if self.dropout_rate > 1e-5:
  drop_mask_bits = self.drop_gen_mask((bsz, head_num, seq_len, seq_len), self.keep_prob)
  tensor_shape = Tensor((bsz, head_num, seq_len, seq_len), mstype.int32)
@@ -238,27 +252,25 @@
  drop_mask = self.do_dropout(ones, drop_mask_bits, self.keep_prob)
  else:
  drop_mask = None
+ query = self.scale_mul(query, self.scale_factor)
+ key = self.scale_mul(key, self.scale_factor)
+ attn_mask = self.cast(attn_mask, mstype.float16)
  output, _, _ = self.flash_attention(query, key, value, attn_mask, drop_mask, alibi_mask)
  else:
- # FlashAttentionScore
- # Useless input, just for binary calls.
+ # 910B -- FlashAttentionScore
  if self.dropout_rate > 1e-5:
  drop_mask_bits = self.reshape(self.drop_gen_mask((bsz, head_num, seq_len, seq_len), self.keep_prob),
  (bsz, head_num, seq_len, seq_len // 8))
  else:
  drop_mask_bits = None
- # (B, N, S, D) -> (B, S, H)
- query = self.reshape(self.transpose_4d_pre(query, (0, 2, 1, 3)), (bsz, seq_len, -1))
- key = self.reshape(self.transpose_4d_pre(key, (0, 2, 1, 3)), (bsz, seq_len, -1))
- value = self.reshape(self.transpose_4d_pre(value, (0, 2, 1, 3)), (bsz, seq_len, -1))
- attn_mask = self.attn_expand_dims(attn_mask, 1)
+ # (B, S, S) -> (B, 1, S, S)
+ attn_mask = self.cast(self.reshape(attn_mask, (bsz, 1, seq_len, seq_len)), mstype.uint8)
  output, _, _ = self.flash_attention(query,
  key,
  value,
  attn_mask,
  drop_mask_bits,
  None,
+ None,
  None)
- output = self.transpose_4d_post(self.reshape(output, (bsz, seq_len, head_num, head_dim)), (0, 2, 1, 3))
-
  return output
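Taken together, these FlashAttention changes route non-910A (910B-class) SoCs through FlashAttentionScore with input_layout="BNSD", move the 1/sqrt(head_dim) scaling into the primitive via scale_value, cast the attention mask to uint8, and add the use_mqa switch for multi-query sharding. A hedged construction sketch that only uses keywords visible in this diff (values are illustrative, and an Ascend context is assumed):

from mindspore.nn.layer.flash_attention import FlashAttention

fa = FlashAttention(head_dim=128, head_num=16, dropout_rate=0.0, use_mqa=False)
# query/key/value: float16 tensors shaped (batch_size, head_num, seq_length, head_dim)
# attn_mask: (batch_size, seq_length, seq_length); reshaped to (batch_size, 1, seq_length, seq_length) on 910B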
mindspore/nn/layer/math.py CHANGED
@@ -375,9 +375,6 @@ class DiGamma(Cell):
  nan, real_result)


- eps_fp32 = Tensor(np.finfo(np.float32).eps, mstype.float32)
-
-
  def _while_helper_func(cond, body, vals):
  while cond(vals).any():
  vals = body(vals)
@@ -394,7 +391,7 @@ def _igamma_series(ax, x, a, enabled):
  select = P.Select()

  # If more data types are supported, this epsilon need to be selected.
- epsilon = eps_fp32
+ epsilon = Tensor(np.finfo(np.float32).eps, mstype.float32)

  def cond(vals):
  enabled = vals[0]
@@ -443,7 +440,7 @@ def _igammac_continued_fraction(ax, x, a, enabled):
  select = P.Select()

  # If more data types are supported, this epsilon need to be selected.
- epsilon = eps_fp32
+ epsilon = Tensor(np.finfo(np.float32).eps, mstype.float32)

  def cond(vals):
  enabled = vals[0]
@@ -620,8 +617,7 @@ class IGamma(Cell):
  x = F.broadcast_to(x, para_shape)
  a = F.broadcast_to(a, para_shape)
  x_is_zero = self.equal(x, 0)
- log_maxfloat = self.log_maxfloat32
- underflow = self.less(ax, self.neg(log_maxfloat))
+ underflow = self.less(ax, self.neg(self.log_maxfloat32))
  ax = self.exp(ax)
  enabled = self.logicalnot(self.logicalor(self.logicalor(x_is_zero, domain_error), underflow))
  output = self.select(use_igammac,
mindspore/nn/layer/rnn_cells.py CHANGED
@@ -83,7 +83,7 @@ def _check_lstmcell_init(func):


  def _rnn_tanh_cell(inputs, hidden, w_ih, w_hh, b_ih, b_hh):
- '''RNN cell function with tanh activation'''
+ """RNN cell function with tanh activation"""
  if b_ih is None:
  igates = P.MatMul(False, True)(inputs, w_ih)
  hgates = P.MatMul(False, True)(hidden, w_hh)
@@ -94,7 +94,7 @@ def _rnn_tanh_cell(inputs, hidden, w_ih, w_hh, b_ih, b_hh):


  def _rnn_relu_cell(inputs, hidden, w_ih, w_hh, b_ih, b_hh):
- '''RNN cell function with relu activation'''
+ """RNN cell function with relu activation"""
  if b_ih is None:
  igates = P.MatMul(False, True)(inputs, w_ih)
  hgates = P.MatMul(False, True)(hidden, w_hh)
@@ -105,7 +105,7 @@ def _rnn_relu_cell(inputs, hidden, w_ih, w_hh, b_ih, b_hh):


  def _lstm_cell(inputs, hidden, w_ih, w_hh, b_ih, b_hh):
- '''LSTM cell function'''
+ """LSTM cell function"""
  hx, cx = hidden
  if b_ih is None:
  gates = P.MatMul(False, True)(inputs, w_ih) + P.MatMul(False, True)(hx, w_hh)
@@ -125,7 +125,7 @@ def _lstm_cell(inputs, hidden, w_ih, w_hh, b_ih, b_hh):


  def _gru_cell(inputs, hidden, w_ih, w_hh, b_ih, b_hh):
- '''GRU cell function'''
+ """GRU cell function"""
  if b_ih is None:
  gi = P.MatMul(False, True)(inputs, w_ih)
  gh = P.MatMul(False, True)(hidden, w_hh)
@@ -144,7 +144,7 @@ def _gru_cell(inputs, hidden, w_ih, w_hh, b_ih, b_hh):


  class RNNCellBase(Cell):
- '''Basic class for RNN Cells'''
+ """Basic class for RNN Cells"""
  def __init__(self, input_size: int, hidden_size: int, has_bias: bool, num_chunks: int,
  dtype=mstype.float32):
  super().__init__()
mindspore/nn/wrap/cell_wrapper.py CHANGED
@@ -644,6 +644,9 @@ class PipelineCell(Cell):
  self.micro_inputs = nn.CellList()
  self.micro_size = micro_size
  self.add_list = []
+ if not isinstance(network, Cell):
+ raise TypeError("For 'PipelineCell', the argument 'network' must cell type, "
+ "but got the type : {}.".format(type(network)))
  if not isinstance(micro_size, int):
  raise TypeError("For 'PipelineCell', the argument 'micro_size' must be integer, "
  "but got the type : {}.".format(type(micro_size)))
@@ -689,6 +692,9 @@ class GradAccumulationCell(Cell):
  self.micro_inputs = nn.CellList()
  self.micro_size = micro_size
  self.add_list = []
+ if not isinstance(network, Cell):
+ raise TypeError("For 'GradAccumulationCell', the argument 'network' must cell type, "
+ "but got the type : {}.".format(type(network)))
  if not isinstance(micro_size, int):
  raise TypeError("For 'GradAccumulationCell', the argument 'micro_size' must be integer, "
  "but got the type : {}.".format(type(micro_size)))