PyPI - mindspore - Versions diffs - 2.2.0__cp38-none-any.whl → 2.2.10__cp38-none-any.whl - Mend

mindspore 2.2.0-cp38-none-any.whl → 2.2.10-cp38-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of mindspore might be problematic. Click here for more details.

Files changed (138)

mindspore/.commit_id +1 -1
mindspore/_akg/akg/composite/build_module.py +9 -15
mindspore/_akg/akg/utils/ascend_profilier/__init__.py +0 -0
mindspore/_akg/akg/utils/ascend_profilier/cann_file_parser.py +76 -0
mindspore/_akg/akg/utils/ascend_profilier/file_manager.py +56 -0
mindspore/_akg/akg/utils/ascend_profilier/op_summary_bean.py +23 -0
mindspore/_akg/akg/utils/ascend_profilier/op_summary_headers.py +8 -0
mindspore/_akg/akg/utils/ascend_profilier/op_summary_parser.py +42 -0
mindspore/_akg/akg/utils/ascend_profilier/path_manager.py +65 -0
mindspore/_akg/akg/utils/kernel_exec.py +41 -15
mindspore/_akg/akg/utils/tbe_codegen_utils.py +27 -6
mindspore/_akg/akg/utils/util.py +38 -0
mindspore/_c_dataengine.cpython-38-aarch64-linux-gnu.so +0 -0
mindspore/_c_expression.cpython-38-aarch64-linux-gnu.so +0 -0
mindspore/_checkparam.py +3 -3
mindspore/_extends/graph_kernel/model/graph_split.py +84 -76
mindspore/_extends/graph_kernel/splitter.py +3 -2
mindspore/_extends/parallel_compile/akg_compiler/build_tbe_kernel.py +83 -66
mindspore/_extends/parallel_compile/akg_compiler/tbe_topi.py +4 -4
mindspore/_extends/parallel_compile/akg_compiler/util.py +10 -7
mindspore/_extends/parallel_compile/tbe_compiler/tbe_helper.py +2 -1
mindspore/_extends/parse/standard_method.py +2 -9
mindspore/_extends/remote/kernel_build_server.py +2 -1
mindspore/_mindspore_offline_debug.cpython-38-aarch64-linux-gnu.so +0 -0
mindspore/bin/cache_admin +0 -0
mindspore/bin/cache_server +0 -0
mindspore/common/api.py +1 -1
mindspore/common/auto_dynamic_shape.py +81 -85
mindspore/common/dump.py +1 -1
mindspore/common/tensor.py +3 -20
mindspore/config/op_info.config +1 -1
mindspore/context.py +11 -4
mindspore/dataset/engine/datasets_standard_format.py +5 -0
mindspore/dataset/vision/transforms.py +21 -21
mindspore/experimental/optim/adam.py +1 -1
mindspore/gen_ops.py +1 -1
mindspore/include/api/model.h +17 -0
mindspore/include/api/status.h +8 -3
mindspore/lib/libdnnl.so.2 +0 -0
mindspore/lib/libmindspore.so +0 -0
mindspore/lib/libmindspore_backend.so +0 -0
mindspore/lib/libmindspore_common.so +0 -0
mindspore/lib/libmindspore_core.so +0 -0
mindspore/lib/libmindspore_glog.so.0 +0 -0
mindspore/lib/libmindspore_gpr.so.15 +0 -0
mindspore/lib/libmindspore_grpc++.so.1 +0 -0
mindspore/lib/libmindspore_grpc.so.15 +0 -0
mindspore/lib/libmindspore_shared_lib.so +0 -0
mindspore/lib/libnnacl.so +0 -0
mindspore/lib/libopencv_core.so.4.5 +0 -0
mindspore/lib/libopencv_imgcodecs.so.4.5 +0 -0
mindspore/lib/libopencv_imgproc.so.4.5 +0 -0
mindspore/lib/plugin/ascend/custom_aicore_ops/op_impl/ai_core/tbe/config/ascend310/aic-ascend310-ops-info.json +123 -0
mindspore/lib/plugin/ascend/custom_aicore_ops/op_impl/ai_core/tbe/config/ascend310p/aic-ascend310p-ops-info.json +123 -0
mindspore/lib/plugin/ascend/custom_aicore_ops/op_impl/ai_core/tbe/config/ascend910/aic-ascend910-ops-info.json +158 -0
mindspore/lib/plugin/ascend/custom_aicore_ops/op_impl/ai_core/tbe/config/ascend910b/aic-ascend910b-ops-info.json +37 -0
mindspore/lib/plugin/ascend/custom_aicore_ops/op_impl/ai_core/tbe/custom_aicore_ops_impl/add_dsl.py +46 -0
mindspore/lib/plugin/ascend/custom_aicore_ops/op_impl/ai_core/tbe/custom_aicore_ops_impl/add_tik.py +51 -0
mindspore/lib/plugin/ascend/custom_aicore_ops/op_impl/ai_core/tbe/custom_aicore_ops_impl/kv_cache_mgr.py +241 -0
mindspore/lib/plugin/ascend/custom_aicore_ops/op_impl/ai_core/tbe/custom_aicore_ops_impl/matmul_tik.py +212 -0
mindspore/lib/plugin/ascend/custom_aicore_ops/op_impl/vector_core/tbe/custom_aicore_ops_impl/add_dsl.py +46 -0
mindspore/lib/plugin/ascend/custom_aicore_ops/op_impl/vector_core/tbe/custom_aicore_ops_impl/add_tik.py +51 -0
mindspore/lib/plugin/ascend/custom_aicore_ops/op_impl/vector_core/tbe/custom_aicore_ops_impl/kv_cache_mgr.py +241 -0
mindspore/lib/plugin/ascend/custom_aicore_ops/op_impl/vector_core/tbe/custom_aicore_ops_impl/matmul_tik.py +212 -0
mindspore/lib/plugin/ascend/custom_aicore_ops/op_proto/libop_proto.so +0 -0
mindspore/lib/plugin/ascend/custom_aicpu_ops/op_impl/cpu/aicpu_kernel/impl/libcust_aicpu_kernels.so +0 -0
mindspore/lib/plugin/ascend/custom_aicpu_ops/op_impl/cpu/aicpu_kernel/impl/libcust_cpu_kernels.so +0 -0
mindspore/lib/plugin/ascend/custom_aicpu_ops/op_impl/cpu/config/cust_aicpu_kernel.json +8 -80
mindspore/lib/plugin/ascend/custom_aicpu_ops/op_proto/libcust_op_proto.so +0 -0
mindspore/lib/plugin/ascend/libakg.so +0 -0
mindspore/lib/plugin/ascend/libhccl_plugin.so +0 -0
mindspore/lib/plugin/ascend/libmindspore_aicpu_kernels.so +0 -0
mindspore/lib/plugin/ascend/libmindspore_cpu_kernels.so +0 -0
mindspore/lib/plugin/cpu/libakg.so +0 -0
mindspore/lib/plugin/libmindspore_ascend.so.1 +0 -0
mindspore/lib/plugin/libmindspore_ascend.so.2 +0 -0
mindspore/nn/cell.py +0 -3
mindspore/nn/layer/activation.py +4 -5
mindspore/nn/layer/conv.py +39 -23
mindspore/nn/layer/flash_attention.py +90 -78
mindspore/nn/layer/math.py +3 -7
mindspore/nn/layer/rnn_cells.py +5 -5
mindspore/nn/wrap/cell_wrapper.py +6 -0
mindspore/numpy/utils_const.py +5 -5
mindspore/ops/_grad_experimental/grad_array_ops.py +1 -1
mindspore/ops/_grad_experimental/grad_implementations.py +2 -2
mindspore/ops/_grad_experimental/grad_math_ops.py +19 -18
mindspore/ops/_grad_experimental/grad_sparse_ops.py +3 -3
mindspore/ops/_op_impl/aicpu/add.py +3 -3
mindspore/ops/_utils/utils.py +2 -0
mindspore/ops/composite/multitype_ops/_compile_utils.py +2 -1
mindspore/ops/composite/multitype_ops/getitem_impl.py +2 -2
mindspore/ops/function/array_func.py +10 -7
mindspore/ops/function/grad/grad_func.py +0 -1
mindspore/ops/function/nn_func.py +98 -9
mindspore/ops/function/random_func.py +2 -1
mindspore/ops/op_info_register.py +24 -21
mindspore/ops/operations/__init__.py +3 -2
mindspore/ops/operations/_grad_ops.py +24 -4
mindspore/ops/operations/_inner_ops.py +155 -23
mindspore/ops/operations/array_ops.py +9 -7
mindspore/ops/operations/comm_ops.py +2 -2
mindspore/ops/operations/custom_ops.py +85 -68
mindspore/ops/operations/inner_ops.py +26 -3
mindspore/ops/operations/math_ops.py +4 -3
mindspore/ops/operations/nn_ops.py +109 -28
mindspore/parallel/_parallel_serialization.py +10 -3
mindspore/parallel/_tensor.py +4 -1
mindspore/parallel/checkpoint_transform.py +13 -2
mindspore/parallel/shard.py +17 -10
mindspore/profiler/common/util.py +1 -0
mindspore/profiler/parser/ascend_hccl_generator.py +232 -0
mindspore/profiler/parser/ascend_msprof_exporter.py +86 -43
mindspore/profiler/parser/ascend_msprof_generator.py +196 -9
mindspore/profiler/parser/ascend_op_generator.py +1 -1
mindspore/profiler/parser/ascend_timeline_generator.py +6 -182
mindspore/profiler/parser/base_timeline_generator.py +1 -1
mindspore/profiler/parser/cpu_gpu_timeline_generator.py +2 -2
mindspore/profiler/parser/framework_parser.py +1 -1
mindspore/profiler/parser/profiler_info.py +19 -0
mindspore/profiler/profiling.py +46 -24
mindspore/rewrite/api/pattern_engine.py +1 -1
mindspore/rewrite/parsers/for_parser.py +1 -1
mindspore/rewrite/symbol_tree.py +1 -4
mindspore/run_check/_check_version.py +5 -3
mindspore/safeguard/rewrite_obfuscation.py +52 -28
mindspore/train/callback/_summary_collector.py +1 -1
mindspore/train/dataset_helper.py +1 -0
mindspore/train/model.py +2 -2
mindspore/train/serialization.py +97 -11
mindspore/train/summary/_summary_adapter.py +1 -1
mindspore/train/summary/summary_record.py +23 -7
mindspore/version.py +1 -1
{mindspore-2.2.0.dist-info → mindspore-2.2.10.dist-info}/METADATA +1 -1
{mindspore-2.2.0.dist-info → mindspore-2.2.10.dist-info}/RECORD +138 -118
{mindspore-2.2.0.dist-info → mindspore-2.2.10.dist-info}/WHEEL +0 -0
{mindspore-2.2.0.dist-info → mindspore-2.2.10.dist-info}/entry_points.txt +0 -0
{mindspore-2.2.0.dist-info → mindspore-2.2.10.dist-info}/top_level.txt +0 -0

mindspore/nn/layer/conv.py CHANGED Viewed

@@ -718,9 +718,9 @@ class Conv3d(_Conv):
         .. math::
             \begin{array}{ll} \\
-                D_{out} ＝ \left \lceil{\frac{D_{in}}{\text{stride[0]}}} \right \rceil \\
-                H_{out} ＝ \left \lceil{\frac{H_{in}}{\text{stride[1]}}} \right \rceil \\
-                W_{out} ＝ \left \lceil{\frac{W_{in}}{\text{stride[2]}}} \right \rceil \\
+                D_{out} = \left \lceil{\frac{D_{in}}{\text{stride[0]}}} \right \rceil \\
+                H_{out} = \left \lceil{\frac{H_{in}}{\text{stride[1]}}} \right \rceil \\
+                W_{out} = \left \lceil{\frac{W_{in}}{\text{stride[2]}}} \right \rceil \\
             \end{array}
@@ -728,11 +728,11 @@ class Conv3d(_Conv):
         .. math::
             \begin{array}{ll} \\
-                D_{out} ＝ \left \lfloor{\frac{D_{in} - \text{dilation[0]} \times (\text{kernel_size[0]} - 1) }
+                D_{out} = \left \lfloor{\frac{D_{in} - \text{dilation[0]} \times (\text{kernel_size[0]} - 1) }
                 {\text{stride[0]}} + 1} \right \rfloor \\
-                H_{out} ＝ \left \lfloor{\frac{H_{in} - \text{dilation[1]} \times (\text{kernel_size[1]} - 1) }
+                H_{out} = \left \lfloor{\frac{H_{in} - \text{dilation[1]} \times (\text{kernel_size[1]} - 1) }
                 {\text{stride[1]}} + 1} \right \rfloor \\
-                W_{out} ＝ \left \lfloor{\frac{W_{in} - \text{dilation[2]} \times (\text{kernel_size[2]} - 1) }
+                W_{out} = \left \lfloor{\frac{W_{in} - \text{dilation[2]} \times (\text{kernel_size[2]} - 1) }
                 {\text{stride[2]}} + 1} \right \rfloor \\
             \end{array}
@@ -740,11 +740,11 @@ class Conv3d(_Conv):
         .. math::
             \begin{array}{ll} \\
-                D_{out} ＝ \left \lfloor{\frac{D_{in} + padding[0] + padding[1] - (\text{dilation[0]} - 1) \times
+                D_{out} = \left \lfloor{\frac{D_{in} + padding[0] + padding[1] - (\text{dilation[0]} - 1) \times
                 \text{kernel_size[0]} - 1 }{\text{stride[0]}} + 1} \right \rfloor \\
-                H_{out} ＝ \left \lfloor{\frac{H_{in} + padding[2] + padding[3] - (\text{dilation[1]} - 1) \times
+                H_{out} = \left \lfloor{\frac{H_{in} + padding[2] + padding[3] - (\text{dilation[1]} - 1) \times
                 \text{kernel_size[1]} - 1 }{\text{stride[1]}} + 1} \right \rfloor \\
-                W_{out} ＝ \left \lfloor{\frac{W_{in} + padding[4] + padding[5] - (\text{dilation[2]} - 1) \times
+                W_{out} = \left \lfloor{\frac{W_{in} + padding[4] + padding[5] - (\text{dilation[2]} - 1) \times
                 \text{kernel_size[2]} - 1 }{\text{stride[2]}} + 1} \right \rfloor \\
             \end{array}
@@ -812,7 +812,7 @@ class Conv3d(_Conv):
             bias_init,
             data_format,
             dtype=dtype)
-        out_channels = self.out_channels
+        out_channels = self.out_channels // group
         self.conv3d = P.Conv3D(out_channel=out_channels,
                                kernel_size=self.kernel_size,
                                mode=1,
@@ -820,17 +820,33 @@ class Conv3d(_Conv):
                                pad=self.padding,
                                stride=self.stride,
                                dilation=self.dilation,
-                               group=group,
+                               group=1,
                                data_format=self.data_format)
         self.bias_add = P.BiasAdd(data_format=self.data_format)
         self.shape = P.Shape()
+        self.concat = P.Concat(1)
+        self.split_0 = P.Split(0, self.group)
+        self.split_1 = P.Split(1, self.group)
     def construct(self, x):
         x_shape = self.shape(x)
         _check_input_5dims(x_shape, self.cls_name)
-        out = self.conv3d(x, self.weight)
-        if self.has_bias:
-            out = self.bias_add(out, self.bias)
+        if self.group == 1:
+            out = self.conv3d(x, self.weight)
+            if self.has_bias:
+                out = self.bias_add(out, self.bias)
+        else:
+            features = self.split_1(x)
+            weights = self.split_0(self.weight)
+            outputs = ()
+            for i in range(self.group):
+                output = self.conv3d(features[i], weights[i])
+                outputs = outputs + (output,)
+            out = self.concat(outputs)
+            if self.bias is not None:
+                new_shape = [1 for _ in range(out.ndim)]
+                new_shape[1] = self.out_channels
+                out = out + self.bias.reshape(new_shape)
         return out
@@ -921,9 +937,9 @@ class Conv3dTranspose(_Conv):
         .. math::
             \begin{array}{ll} \\
-                D_{out} ＝ \left \lfloor{\frac{D_{in}}{\text{stride[0]}} + 1} \right \rfloor \\
-                H_{out} ＝ \left \lfloor{\frac{H_{in}}{\text{stride[1]}} + 1} \right \rfloor \\
-                W_{out} ＝ \left \lfloor{\frac{W_{in}}{\text{stride[2]}} + 1} \right \rfloor \\
+                D_{out} = \left \lfloor{\frac{D_{in}}{\text{stride[0]}} + 1} \right \rfloor \\
+                H_{out} = \left \lfloor{\frac{H_{in}}{\text{stride[1]}} + 1} \right \rfloor \\
+                W_{out} = \left \lfloor{\frac{W_{in}}{\text{stride[2]}} + 1} \right \rfloor \\
             \end{array}
@@ -931,11 +947,11 @@ class Conv3dTranspose(_Conv):
         .. math::
             \begin{array}{ll} \\
-                D_{out} ＝ \left \lfloor{\frac{D_{in} - \text{dilation[0]} \times (\text{kernel_size[0]} - 1) }
+                D_{out} = \left \lfloor{\frac{D_{in} - \text{dilation[0]} \times (\text{kernel_size[0]} - 1) }
                 {\text{stride[0]}} + 1} \right \rfloor \\
-                H_{out} ＝ \left \lfloor{\frac{H_{in} - \text{dilation[1]} \times (\text{kernel_size[1]} - 1) }
+                H_{out} = \left \lfloor{\frac{H_{in} - \text{dilation[1]} \times (\text{kernel_size[1]} - 1) }
                 {\text{stride[1]}} + 1} \right \rfloor \\
-                W_{out} ＝ \left \lfloor{\frac{W_{in} - \text{dilation[2]} \times (\text{kernel_size[2]} - 1) }
+                W_{out} = \left \lfloor{\frac{W_{in} - \text{dilation[2]} \times (\text{kernel_size[2]} - 1) }
                 {\text{stride[2]}} + 1} \right \rfloor \\
             \end{array}
@@ -943,11 +959,11 @@ class Conv3dTranspose(_Conv):
         .. math::
             \begin{array}{ll} \\
-                D_{out} ＝ \left \lfloor{\frac{D_{in} + padding[0] + padding[1] - (\text{dilation[0]} - 1) \times
+                D_{out} = \left \lfloor{\frac{D_{in} + padding[0] + padding[1] - (\text{dilation[0]} - 1) \times
                 \text{kernel_size[0]} - 1 }{\text{stride[0]}} + 1} \right \rfloor \\
-                H_{out} ＝ \left \lfloor{\frac{H_{in} + padding[2] + padding[3] - (\text{dilation[1]} - 1) \times
+                H_{out} = \left \lfloor{\frac{H_{in} + padding[2] + padding[3] - (\text{dilation[1]} - 1) \times
                 \text{kernel_size[1]} - 1 }{\text{stride[1]}} + 1} \right \rfloor \\
-                W_{out} ＝ \left \lfloor{\frac{W_{in} + padding[4] + padding[5] - (\text{dilation[2]} - 1) \times
+                W_{out} = \left \lfloor{\frac{W_{in} + padding[4] + padding[5] - (\text{dilation[2]} - 1) \times
                 \text{kernel_size[2]} - 1 }{\text{stride[2]}} + 1} \right \rfloor \\
             \end{array}

mindspore/nn/layer/flash_attention.py CHANGED Viewed

@@ -57,14 +57,15 @@ class FlashAttention(Cell):
             Default True
         alibi(bool): This parameter indicates whether the flashattention supports the Alibi.
             Default: False
+        use_mqa(bool): Using MHA if True, only take effect under 910B. Default: False.
     Inputs:
       - **query** (Tensor) - Tensor query (:class:`mstype.fp16` [batch_size, head_num, seq_length, head_dim])
       - **key** (Tensor) - Tensor key (:class:`mstype.fp16` [batch_size, head_num, seq_length, head_dim])
       - **value** (Tensor) - Tensor value (:class:`mstype.fp16` [batch_size, head_num, seq_length, head_dim])
-      - **attention_mask** (Tensor) - Float Tensor the mask of (:class:`mstype.fp16` [batch_size, seq_length,
-          seq_length]): A matrix to pass masked information.
+      - **attention_mask** (Tensor) - Float Tensor the mask of (:class:`mstype.fp16` `mstype.uint8`
+        [batch_size, seq_length, seq_length]): A matrix to pass masked information.
     Outputs:
         A Tensor. The output of the attention with shape [batch_size, head_num, seq_length, head_dim]
@@ -102,17 +103,23 @@ class FlashAttention(Cell):
                  mp=1,
                  high_precision=False,
                  have_attention_mask_batch=True,
-                 alibi=False
+                 alibi=False,
+                 use_mqa=False
                  ):
         super(FlashAttention, self).__init__()
         scaling_constant = math.sqrt(head_dim)
         if scaling_constant == 0:
             raise ValueError("the scaling constant must not be 0.")
-        self.scale_factor = Tensor([1. / scaling_constant], dtype=mstype.float16)
-        self.is_910A = MSContext.get_instance().get_ascend_soc_version() == "Ascend910"
+        self.dropout_rate = dropout_rate
+        self.is_910A = MSContext.get_instance().get_ascend_soc_version() == "ascend910"
         if self.is_910A:
+            self.scale_factor = Tensor([1. / math.sqrt(scaling_constant)], dtype=mstype.float16)
+            self.scale_mul = ops.Mul().shard(((dp, mp, 1, 1), (1,)))
+            self.ones = ops.Ones()
+            self.dim_mask = Tensor([1 for _ in range(head_dim)], dtype=mstype.int8)
+            self.have_attention_mask_batch = have_attention_mask_batch
+            self.alibi = alibi
             self.flash_attention = get_flash_attention(
                 prev_block_num=prev_block_num,
                 next_block_num=next_block_num,
@@ -120,6 +127,10 @@ class FlashAttention(Cell):
                 high_precision=high_precision
             )
             self.flash_attention.add_prim_attr("primitive_target", "Ascend")
+            fa_strategies = ((dp, mp, 1, 1),
+                             (dp, mp, 1, 1),
+                             (dp, mp, 1, 1))
+            self.shard(fa_strategies)
         else:
             if alibi:
                 raise ValueError(f"When soc_version is not Ascend910A, alibi must be False")
@@ -128,25 +139,27 @@ class FlashAttention(Cell):
             self.reshape = ops.Reshape()
             self.zeros_like = ops.ZerosLike().shard(((dp, mp, 1, 1),))
             self.zeros = ops.Zeros()
-            self.attn_expand_dims = ops.ExpandDims().shard(((dp, 1, 1),))
-            fa_strategies = ((dp, 1, mp),
-                             (dp, 1, mp),
-                             (dp, 1, mp),
-                             (dp, 1, 1, 1))
+            self.attn_cast = ops.Cast()
+            if use_mqa:
+                fa_strategies = ((dp, mp, 1, 1),
+                                 (dp, 1, 1, 1),
+                                 (dp, 1, 1, 1),
+                                 (dp, 1, 1, 1))
+            else:
+                fa_strategies = ((dp, mp, 1, 1),
+                                 (dp, mp, 1, 1),
+                                 (dp, mp, 1, 1),
+                                 (dp, 1, 1, 1))
             if dropout_rate > 1e-5:
                 fa_strategies += ((dp, mp, 1, 1),)
             self.flash_attention = FlashAttentionScore(head_num=head_num, pre_tokens=prev_block_num,
                                                        next_tokens=next_block_num,
                                                        keep_prob=1 - dropout_rate,
-                                                       scale_value=1.0,
-                                                       inner_precise=0 if high_precision else 1).shard(fa_strategies)
+                                                       scale_value=1. / scaling_constant,
+                                                       inner_precise=0 if high_precision else 1,
+                                                       input_layout="BNSD").shard(fa_strategies)
-        self.ones = ops.Ones()
-        self.dim_mask = Tensor([1 for _ in range(head_dim)], dtype=mstype.int8)
-        self.scale_mul = ops.Mul().shard(((dp, mp, 1, 1), (1,)))
         self.dropout_rate = dropout_rate
-        self.have_attention_mask_batch = have_attention_mask_batch
-        self.alibi = alibi
         if self.dropout_rate > 1e-5:
             self.keep_prob = Tensor(1 - self.dropout_rate, dtype=mstype.float16)
             self.fill_v2 = ops.FillV2().shard(((dp, mp, 1, 1), ()))
@@ -162,46 +175,49 @@ class FlashAttention(Cell):
                                   such as MatMul. Default: None.
         :return:
         """
-        if in_strategy is None:
-            # default: dp=1, mp=1, construct inputs only contain query, key, value
-            in_strategy = (
-                (1, 1, 1, 1),
-                (1, 1, 1, 1),
-                (1, 1, 1, 1),
-            )
-        self.flash_attention.shard(in_strategy)
-        dp = in_strategy[0][0]
-        mp = in_strategy[0][1]
-        self.flash_attention.add_prim_attr("dev_matrix_shape", [dp, mp, 1, 1])
-        inputs_tensor_map = [
-            [3, 2, 1, 0],
-            [3, 2, 1, 0],
-            [3, 2, 1, 0],
-        ]
-        if self.have_attention_mask_batch:
-            inputs_tensor_map.append([3, 1, 0])
-        else:
-            inputs_tensor_map.append([-1, 1, 0])
+        if self.is_910A:
+            if in_strategy is None:
+                # default: dp=1, mp=1, construct inputs only contain query, key, value
+                in_strategy = (
+                    (1, 1, 1, 1),
+                    (1, 1, 1, 1),
+                    (1, 1, 1, 1),
+                )
+            self.flash_attention.shard(in_strategy)
+            dp = in_strategy[0][0]
+            mp = in_strategy[0][1]
+            self.flash_attention.add_prim_attr("dev_matrix_shape", [dp, mp, 1, 1])
+            inputs_tensor_map = [
+                [3, 2, 1, 0],
+                [3, 2, 1, 0],
+                [3, 2, 1, 0],
+            ]
+            if self.have_attention_mask_batch:
+                inputs_tensor_map.append([3, 1, 0])
+            else:
+                inputs_tensor_map.append([-1, 1, 0])
-        input_empty_args_num = 2
-        # dropout_mask
-        if self.dropout_rate > 1e-5:
-            input_empty_args_num -= 1
-            inputs_tensor_map.append([3, 2, 1, 0])
+            input_empty_args_num = 2
+            # dropout_mask
+            if self.dropout_rate > 1e-5:
+                input_empty_args_num -= 1
+                inputs_tensor_map.append([3, 2, 1, 0])
-        if self.alibi:
-            input_empty_args_num -= 1
-            inputs_tensor_map.append([3, 2, 1, 0])
+            if self.alibi:
+                input_empty_args_num -= 1
+                inputs_tensor_map.append([3, 2, 1, 0])
-        self.flash_attention.add_prim_attr("inputs_tensor_map", inputs_tensor_map)
+            self.flash_attention.add_prim_attr("inputs_tensor_map", inputs_tensor_map)
-        self.flash_attention.add_prim_attr("outputs_tensor_map", [
-            [3, 2, 1, 0],  # O
-            [3, 2, 1],  # L
-            [3, 2, 1]  # M
-        ])
-        self.flash_attention.add_prim_attr("as_loss_divisor", 0)
-        self.flash_attention.add_prim_attr("empty_mirror_ops", input_empty_args_num)
+            self.flash_attention.add_prim_attr("outputs_tensor_map", [
+                [3, 2, 1, 0],  # O
+                [3, 2, 1],  # L
+                [3, 2, 1]  # M
+            ])
+            self.flash_attention.add_prim_attr("as_loss_divisor", 0)
+            self.flash_attention.add_prim_attr("empty_mirror_ops", input_empty_args_num)
+        else:
+            self.flash_attention.shard(in_strategy)
     def construct(self, query, key, value, attn_mask=None, alibi_mask=None):
         """FlashAttention forward
@@ -212,24 +228,22 @@ class FlashAttention(Cell):
         :param alibi_mask: [bsz, head_num, 1, seq_len], if not None
         :return: output          [bsz, head_num, seq_len, head_dim]
         """
-        query = self.scale_mul(query, self.scale_factor)
         bsz, head_num, seq_len, head_dim = query.shape
-        _, k_head_num, k_seq_len, _ = key.shape
-        _, v_head_num, v_seq_len, _ = value.shape
-        if head_num != k_head_num or head_num != v_head_num:
-            raise ValueError(
-                "the head_num of query, key and value must be the same, "
-                "If different head_num are used, users need to change themselves to be same by tile.")
-        if seq_len % 16 != 0 or k_seq_len % 16 != 0 or k_seq_len != v_seq_len:
-            raise ValueError(
-                "query, key, value seq_len must be a multiple of 16, and key seq_len, value seq_len must be the same.")
-        if head_dim > 304:
-            raise ValueError(
-                "the head_dim must be less than 304, otherwise the ub would be OOM.")
         if self.is_910A:
+            _, k_head_num, k_seq_len, _ = key.shape
+            _, v_head_num, v_seq_len, _ = value.shape
+            if head_num != k_head_num or head_num != v_head_num:
+                raise ValueError(
+                    "the head_num of query, key and value must be the same, "
+                    "If different head_num are used, users need to change themselves to be same by tile.")
+            if seq_len % 16 != 0 or k_seq_len % 16 != 0 or k_seq_len != v_seq_len:
+                raise ValueError(
+                    "query, key, value seq_len must be a multiple of 16, "
+                    "and the seq_len between key and value must be equal.")
             # 910A -- FlashAttentionPrimtive
+            if head_dim > 304:
+                raise ValueError(
+                    "the head_dim must be less than 304, otherwise the ub would be OOM.")
             if self.dropout_rate > 1e-5:
                 drop_mask_bits = self.drop_gen_mask((bsz, head_num, seq_len, seq_len), self.keep_prob)
                 tensor_shape = Tensor((bsz, head_num, seq_len, seq_len), mstype.int32)
@@ -238,27 +252,25 @@ class FlashAttention(Cell):
                 drop_mask = self.do_dropout(ones, drop_mask_bits, self.keep_prob)
             else:
                 drop_mask = None
+            query = self.scale_mul(query, self.scale_factor)
+            key = self.scale_mul(key, self.scale_factor)
+            attn_mask = self.cast(attn_mask, mstype.float16)
             output, _, _ = self.flash_attention(query, key, value, attn_mask, drop_mask, alibi_mask)
         else:
-            # FlashAttentionScore
-            # Useless input, just for binary calls.
+            # 910B -- FlashAttentionScore
             if self.dropout_rate > 1e-5:
                 drop_mask_bits = self.reshape(self.drop_gen_mask((bsz, head_num, seq_len, seq_len), self.keep_prob),
                                               (bsz, head_num, seq_len, seq_len // 8))
             else:
                 drop_mask_bits = None
-            # (B, N, S, D) -> (B, S, H)
-            query = self.reshape(self.transpose_4d_pre(query, (0, 2, 1, 3)), (bsz, seq_len, -1))
-            key = self.reshape(self.transpose_4d_pre(key, (0, 2, 1, 3)), (bsz, seq_len, -1))
-            value = self.reshape(self.transpose_4d_pre(value, (0, 2, 1, 3)), (bsz, seq_len, -1))
-            attn_mask = self.attn_expand_dims(attn_mask, 1)
+            # (B, S, S) -> (B, 1, S, S)
+            attn_mask = self.cast(self.reshape(attn_mask, (bsz, 1, seq_len, seq_len)), mstype.uint8)
             output, _, _ = self.flash_attention(query,
                                                 key,
                                                 value,
                                                 attn_mask,
                                                 drop_mask_bits,
                                                 None,
+                                                None,
                                                 None)
-            output = self.transpose_4d_post(self.reshape(output, (bsz, seq_len, head_num, head_dim)), (0, 2, 1, 3))
         return output

mindspore/nn/layer/math.py CHANGED Viewed

@@ -375,9 +375,6 @@ class DiGamma(Cell):
                            nan, real_result)
-eps_fp32 = Tensor(np.finfo(np.float32).eps, mstype.float32)
 def _while_helper_func(cond, body, vals):
     while cond(vals).any():
         vals = body(vals)
@@ -394,7 +391,7 @@ def _igamma_series(ax, x, a, enabled):
     select = P.Select()
     # If more data types are supported, this epsilon need to be selected.
-    epsilon = eps_fp32
+    epsilon = Tensor(np.finfo(np.float32).eps, mstype.float32)
     def cond(vals):
         enabled = vals[0]
@@ -443,7 +440,7 @@ def _igammac_continued_fraction(ax, x, a, enabled):
     select = P.Select()
     # If more data types are supported, this epsilon need to be selected.
-    epsilon = eps_fp32
+    epsilon = Tensor(np.finfo(np.float32).eps, mstype.float32)
     def cond(vals):
         enabled = vals[0]
@@ -620,8 +617,7 @@ class IGamma(Cell):
             x = F.broadcast_to(x, para_shape)
             a = F.broadcast_to(a, para_shape)
         x_is_zero = self.equal(x, 0)
-        log_maxfloat = self.log_maxfloat32
-        underflow = self.less(ax, self.neg(log_maxfloat))
+        underflow = self.less(ax, self.neg(self.log_maxfloat32))
         ax = self.exp(ax)
         enabled = self.logicalnot(self.logicalor(self.logicalor(x_is_zero, domain_error), underflow))
         output = self.select(use_igammac,

mindspore/nn/layer/rnn_cells.py CHANGED Viewed

@@ -83,7 +83,7 @@ def _check_lstmcell_init(func):
 def _rnn_tanh_cell(inputs, hidden, w_ih, w_hh, b_ih, b_hh):
-    '''RNN cell function with tanh activation'''
+    """RNN cell function with tanh activation"""
     if b_ih is None:
         igates = P.MatMul(False, True)(inputs, w_ih)
         hgates = P.MatMul(False, True)(hidden, w_hh)
@@ -94,7 +94,7 @@ def _rnn_tanh_cell(inputs, hidden, w_ih, w_hh, b_ih, b_hh):
 def _rnn_relu_cell(inputs, hidden, w_ih, w_hh, b_ih, b_hh):
-    '''RNN cell function with relu activation'''
+    """RNN cell function with relu activation"""
     if b_ih is None:
         igates = P.MatMul(False, True)(inputs, w_ih)
         hgates = P.MatMul(False, True)(hidden, w_hh)
@@ -105,7 +105,7 @@ def _rnn_relu_cell(inputs, hidden, w_ih, w_hh, b_ih, b_hh):
 def _lstm_cell(inputs, hidden, w_ih, w_hh, b_ih, b_hh):
-    '''LSTM cell function'''
+    """LSTM cell function"""
     hx, cx = hidden
     if b_ih is None:
         gates = P.MatMul(False, True)(inputs, w_ih) + P.MatMul(False, True)(hx, w_hh)
@@ -125,7 +125,7 @@ def _lstm_cell(inputs, hidden, w_ih, w_hh, b_ih, b_hh):
 def _gru_cell(inputs, hidden, w_ih, w_hh, b_ih, b_hh):
-    '''GRU cell function'''
+    """GRU cell function"""
     if b_ih is None:
         gi = P.MatMul(False, True)(inputs, w_ih)
         gh = P.MatMul(False, True)(hidden, w_hh)
@@ -144,7 +144,7 @@ def _gru_cell(inputs, hidden, w_ih, w_hh, b_ih, b_hh):
 class RNNCellBase(Cell):
-    '''Basic class for RNN Cells'''
+    """Basic class for RNN Cells"""
     def __init__(self, input_size: int, hidden_size: int, has_bias: bool, num_chunks: int,
                  dtype=mstype.float32):
         super().__init__()

mindspore/nn/wrap/cell_wrapper.py CHANGED Viewed

@@ -644,6 +644,9 @@ class PipelineCell(Cell):
         self.micro_inputs = nn.CellList()
         self.micro_size = micro_size
         self.add_list = []
+        if not isinstance(network, Cell):
+            raise TypeError("For 'PipelineCell', the argument 'network' must cell type, "
+                            "but got the type : {}.".format(type(network)))
         if not isinstance(micro_size, int):
             raise TypeError("For 'PipelineCell', the argument 'micro_size' must be integer, "
                             "but got the type : {}.".format(type(micro_size)))
@@ -689,6 +692,9 @@ class GradAccumulationCell(Cell):
         self.micro_inputs = nn.CellList()
         self.micro_size = micro_size
         self.add_list = []
+        if not isinstance(network, Cell):
+            raise TypeError("For 'GradAccumulationCell', the argument 'network' must cell type, "
+                            "but got the type : {}.".format(type(network)))
         if not isinstance(micro_size, int):
             raise TypeError("For 'GradAccumulationCell', the argument 'micro_size' must be integer, "
                             "but got the type : {}.".format(type(micro_size)))

mindspore/numpy/utils_const.py CHANGED Viewed

@@ -143,8 +143,8 @@ def _infer_out_shape(*shapes):
     shape_out = list()
     max_len = max([len(it) for it in shapes])
     for i in range(max_len):
-        items = [it[i-max_len+len(it)] if i-max_len +
-                 len(it) >= 0 else 1 for it in shapes]
+        items = [
+            it[i - max_len + len(it)] if i - max_len + len(it) >= 0 else 1 for it in shapes]
         max_size = 0 if 0 in items else max(items)
         _check()
         shape_out.append(max_size)
@@ -158,8 +158,8 @@ def _can_broadcast(*shapes):
     """
     max_len = max([len(it) for it in shapes])
     for i in range(max_len):
-        items = [it[i-max_len+len(it)] if i-max_len +
-                 len(it) >= 0 else 1 for it in shapes]
+        items = [
+            it[i - max_len + len(it)] if i - max_len + len(it) >= 0 else 1 for it in shapes]
         max_size = 0 if 0 in items else max(items)
         if any(item not in (1, max_size) for item in items):
             return False
@@ -399,7 +399,7 @@ def _broadcast_tuples(tup1, tup2):
         if not isinstance(tup1, (tuple, list)) or not isinstance(tup2, (tuple, list)):
             raise TypeError("input shift and axis must be tuple or list or int.")
         if len(tup1) == len(tup2) or len(tup1) == 1 or len(tup2) == 1:
-            return None
+            return
         raise ValueError("shape mismatch: objects cannot be broadcast to a single shape")
     tup1 = (tup1,) if isinstance(tup1, int) else tup1

mindspore/ops/_grad_experimental/grad_array_ops.py CHANGED Viewed

@@ -203,7 +203,7 @@ def get_bprop_index_put(self):
         if is_ascend:
             indices_ms = [convert_idx_positive(indices_ms[i], x1.shape[i]) for i in range(len(indices_ms))]
         indices_me = stack(indices_ms)
-        indices_grad = F.transpose(indices_me, F.make_range(F.rank(indices_me)-1, -1, -1))
+        indices_grad = F.transpose(indices_me, F.make_range(F.rank(indices_me) - 1, -1, -1))
         values_grad = gather_nd(dout, indices_grad)
         if equal(cast(x2.shape[0], mstype.int32), Tensor(1)):
             values_grad = values_grad.sum().reshape(1)

mindspore/ops/_grad_experimental/grad_implementations.py CHANGED Viewed

@@ -19,7 +19,7 @@ from mindspore.ops import functional as F
 from mindspore.ops import operations as P
 from mindspore.ops.composite import multitype_ops as C
 from mindspore.ops.composite.multitype_ops.zeros_like_impl import zeros_like
-from mindspore.ops._grad_experimental.grad_base import bprops
+from mindspore.ops._grad_experimental.grad_base import bprops, bprop_getters
 from mindspore.common import dtype as mstype
 get_dtype = P.DType()
@@ -193,7 +193,7 @@ def bprop_tensor_move(x, out, dout):
     return (dout,)
-@bprops.register("DictInplaceSetItem")
+@bprop_getters.register("DictInplaceSetItem")
 def get_bprop_dict_inplace_setitem(self):
     """Generate bprop for dict inplace pop"""

mindspore/ops/_grad_experimental/grad_math_ops.py CHANGED Viewed

@@ -135,7 +135,7 @@ def get_bprop_matrix_triangular_solve(self):
     def bprop(matrix, rhs, out, dout):
         grad_rhs = matrix_triangular_solve_op(matrix, dout)
-        if matrix.dtype == mstype.complex64 or matrix.dtype == mstype.complex128:
+        if matrix.dtype in (mstype.complex64, mstype.complex128):
             grad_rhs_temp = _adjoint(grad_rhs)
             out_temp = _adjoint(out)
         else:
@@ -156,14 +156,14 @@ def get_bprop_matrix_triangular_solve(self):
                 grad_matrix = mat_mul_op(grad_rhs, out_temp)
                 grad_matrix = neg_op(grad_matrix)
         if lower_a:
-            if grad_matrix.dtype == mstype.complex64 or grad_matrix.dtype == mstype.complex128:
+            if grad_matrix.dtype in (mstype.complex64, mstype.complex128):
                 grad_matrix_real = matrix_band_part_op(real_op(grad_matrix), -1, 0)
                 grad_matrix_imag = matrix_band_part_op(imag_op(grad_matrix), -1, 0)
                 grad_matrix = complex_op(grad_matrix_real, grad_matrix_imag)
             else:
                 grad_matrix = matrix_band_part_op(grad_matrix, -1, 0)
         else:
-            if grad_matrix.dtype == mstype.complex64 or grad_matrix.dtype == mstype.complex128:
+            if grad_matrix.dtype in (mstype.complex64, mstype.complex128):
                 grad_matrix_real = matrix_band_part_op(real_op(grad_matrix), 0, -1)
                 grad_matrix_imag = matrix_band_part_op(imag_op(grad_matrix), 0, -1)
                 grad_matrix = complex_op(grad_matrix_real, grad_matrix_imag)
@@ -219,7 +219,7 @@ def get_bprop_matrix_solve(self):
 @_primexpr
 def _generate_perm_matrix_solve_ls(x_dim):
     perm = tuple(range(x_dim - 2))
-    perm = perm + (x_dim-1, x_dim-2)
+    perm = perm + (x_dim - 1, x_dim - 2)
     return perm
@@ -647,20 +647,21 @@ def _fft_rank_offset(norm_shape, rank):
 @_primexpr
 def _fft_with_size_back_norm(norm_shape, norm, inverse, rank):
     """generate reverse term for fft_with_size"""
+    norm_ = None
     if inverse is False:
         if norm == "forward":
-            norm_ = 1 / _fft_rank_offset(norm_shape, rank)
-        if norm == "backward":
-            norm_ = 1 * _fft_rank_offset(norm_shape, rank)
-        if norm == "ortho":
-            norm_ = 1
-    if inverse is True:
+            norm_ = 1.0 / _fft_rank_offset(norm_shape, rank)
+        elif norm == "backward":
+            norm_ = 1.0 * _fft_rank_offset(norm_shape, rank)
+        elif norm == "ortho":
+            norm_ = 1.0
+    else:
         if norm == "forward":
-            norm_ = 1 * _fft_rank_offset(norm_shape, rank)
-        if norm == "backward":
-            norm_ = 1 / _fft_rank_offset(norm_shape, rank)
-        if norm == "ortho":
-            norm_ = 1
+            norm_ = 1.0 * _fft_rank_offset(norm_shape, rank)
+        elif norm == "backward":
+            norm_ = 1.0 / _fft_rank_offset(norm_shape, rank)
+        elif norm == "ortho":
+            norm_ = 1.0
     return norm_
@@ -670,9 +671,9 @@ def _rfft_norm(norm_shape, norm, rank):
     norm_ = 1.0
     if norm == "forward":
         norm_ = 1 / _fft_rank_offset(norm_shape, rank)
-    if norm == "backward":
-        norm_ = 1
-    if norm == "ortho":
+    elif norm == "backward":
+        norm_ = 1.0
+    elif norm == "ortho":
         norm_ = 1 / np.sqrt(_fft_rank_offset(norm_shape, rank))
     return norm_

mindspore/ops/_grad_experimental/grad_sparse_ops.py CHANGED Viewed

@@ -358,10 +358,10 @@ def get_bprop_ragged_tensor_to_sparse(self):
                 split.append(zeros_like(i))
             all_d = (split, ragged_values_grad)
             return all_d
-        split = ()
+        split_ = ()
         for i in enumerate(rt_nested_splits):
-            split = split + (zeros_like(i),)
-        all_d = (split, ragged_values_grad)
+            split_ = split_ + (zeros_like(i),)
+        all_d = (split_, ragged_values_grad)
         return all_d
     return bprop

mindspore 2.2.0__cp38-none-any.whl → 2.2.10__cp38-none-any.whl

Potentially problematic release.

mindspore 2.2.0__cp38-none-any.whl → 2.2.10__cp38-none-any.whl