mindspore 2.2.0-cp37-cp37m-manylinux1_x86_64.whl → 2.2.11-cp37-cp37m-manylinux1_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mindspore/.commit_id +1 -1
- mindspore/_akg/akg/composite/build_module.py +104 -20
- mindspore/_akg/akg/utils/ascend_profilier/cann_file_parser.py +76 -0
- mindspore/_akg/akg/utils/ascend_profilier/file_manager.py +56 -0
- mindspore/_akg/akg/utils/ascend_profilier/op_summary_bean.py +23 -0
- mindspore/_akg/akg/utils/ascend_profilier/op_summary_headers.py +8 -0
- mindspore/_akg/akg/utils/ascend_profilier/op_summary_parser.py +42 -0
- mindspore/_akg/akg/utils/ascend_profilier/path_manager.py +65 -0
- mindspore/_akg/akg/utils/composite_op_helper.py +7 -2
- mindspore/_akg/akg/utils/dump_ascend_meta.py +22 -3
- mindspore/_akg/akg/utils/kernel_exec.py +41 -15
- mindspore/_akg/akg/utils/tbe_codegen_utils.py +27 -6
- mindspore/_akg/akg/utils/util.py +56 -1
- mindspore/_c_dataengine.cpython-37m-x86_64-linux-gnu.so +0 -0
- mindspore/_c_expression.cpython-37m-x86_64-linux-gnu.so +0 -0
- mindspore/_checkparam.py +3 -3
- mindspore/_extends/graph_kernel/model/graph_split.py +84 -76
- mindspore/_extends/graph_kernel/splitter.py +3 -2
- mindspore/_extends/parallel_compile/akg_compiler/build_tbe_kernel.py +83 -66
- mindspore/_extends/parallel_compile/akg_compiler/tbe_topi.py +4 -4
- mindspore/_extends/parallel_compile/akg_compiler/util.py +10 -7
- mindspore/_extends/parallel_compile/tbe_compiler/tbe_helper.py +2 -1
- mindspore/_extends/parse/__init__.py +3 -2
- mindspore/_extends/parse/parser.py +6 -1
- mindspore/_extends/parse/standard_method.py +14 -11
- mindspore/_extends/remote/kernel_build_server.py +2 -1
- mindspore/_mindspore_offline_debug.cpython-37m-x86_64-linux-gnu.so +0 -0
- mindspore/bin/cache_admin +0 -0
- mindspore/bin/cache_server +0 -0
- mindspore/common/_utils.py +16 -0
- mindspore/common/api.py +1 -1
- mindspore/common/auto_dynamic_shape.py +81 -85
- mindspore/common/dump.py +1 -1
- mindspore/common/tensor.py +3 -20
- mindspore/config/op_info.config +1 -1
- mindspore/context.py +11 -4
- mindspore/dataset/engine/cache_client.py +8 -5
- mindspore/dataset/engine/datasets_standard_format.py +5 -0
- mindspore/dataset/vision/transforms.py +21 -21
- mindspore/experimental/optim/adam.py +1 -1
- mindspore/gen_ops.py +1 -1
- mindspore/include/api/model.h +17 -0
- mindspore/include/api/status.h +8 -3
- mindspore/lib/libdnnl.so.2 +0 -0
- mindspore/lib/libmindspore.so +0 -0
- mindspore/lib/libmindspore_backend.so +0 -0
- mindspore/lib/libmindspore_common.so +0 -0
- mindspore/lib/libmindspore_core.so +0 -0
- mindspore/lib/libmindspore_glog.so.0 +0 -0
- mindspore/lib/libmindspore_gpr.so.15 +0 -0
- mindspore/lib/libmindspore_grpc++.so.1 +0 -0
- mindspore/lib/libmindspore_grpc.so.15 +0 -0
- mindspore/lib/libmindspore_shared_lib.so +0 -0
- mindspore/lib/libnnacl.so +0 -0
- mindspore/lib/libopencv_core.so.4.5 +0 -0
- mindspore/lib/libopencv_imgcodecs.so.4.5 +0 -0
- mindspore/lib/libopencv_imgproc.so.4.5 +0 -0
- mindspore/lib/plugin/ascend/custom_aicore_ops/op_impl/ai_core/tbe/config/ascend310/aic-ascend310-ops-info.json +123 -0
- mindspore/lib/plugin/ascend/custom_aicore_ops/op_impl/ai_core/tbe/config/ascend310p/aic-ascend310p-ops-info.json +123 -0
- mindspore/lib/plugin/ascend/custom_aicore_ops/op_impl/ai_core/tbe/config/ascend910/aic-ascend910-ops-info.json +158 -0
- mindspore/lib/plugin/ascend/custom_aicore_ops/op_impl/ai_core/tbe/config/ascend910b/aic-ascend910b-ops-info.json +37 -0
- mindspore/lib/plugin/ascend/custom_aicore_ops/op_impl/ai_core/tbe/custom_aicore_ops_impl/add_dsl.py +46 -0
- mindspore/lib/plugin/ascend/custom_aicore_ops/op_impl/ai_core/tbe/custom_aicore_ops_impl/add_tik.py +51 -0
- mindspore/lib/plugin/ascend/custom_aicore_ops/op_impl/ai_core/tbe/custom_aicore_ops_impl/kv_cache_mgr.py +241 -0
- mindspore/lib/plugin/ascend/custom_aicore_ops/op_impl/ai_core/tbe/custom_aicore_ops_impl/matmul_tik.py +212 -0
- mindspore/lib/plugin/ascend/custom_aicore_ops/op_impl/vector_core/tbe/custom_aicore_ops_impl/add_dsl.py +46 -0
- mindspore/lib/plugin/ascend/custom_aicore_ops/op_impl/vector_core/tbe/custom_aicore_ops_impl/add_tik.py +51 -0
- mindspore/lib/plugin/ascend/custom_aicore_ops/op_impl/vector_core/tbe/custom_aicore_ops_impl/kv_cache_mgr.py +241 -0
- mindspore/lib/plugin/ascend/custom_aicore_ops/op_impl/vector_core/tbe/custom_aicore_ops_impl/matmul_tik.py +212 -0
- mindspore/lib/plugin/ascend/custom_aicore_ops/op_proto/libop_proto.so +0 -0
- mindspore/lib/plugin/ascend/custom_aicpu_ops/op_impl/cpu/aicpu_kernel/impl/libcust_aicpu_kernels.so +0 -0
- mindspore/lib/plugin/ascend/custom_aicpu_ops/op_impl/cpu/aicpu_kernel/impl/libcust_cpu_kernels.so +0 -0
- mindspore/lib/plugin/ascend/custom_aicpu_ops/op_impl/cpu/config/cust_aicpu_kernel.json +78 -80
- mindspore/lib/plugin/ascend/custom_aicpu_ops/op_proto/libcust_op_proto.so +0 -0
- mindspore/lib/plugin/ascend/libakg.so +0 -0
- mindspore/lib/plugin/ascend/libhccl_plugin.so +0 -0
- mindspore/lib/plugin/ascend/libmindspore_aicpu_kernels.so +0 -0
- mindspore/lib/plugin/ascend/libmindspore_cpu_kernels.so +0 -0
- mindspore/lib/plugin/cpu/libakg.so +0 -0
- mindspore/lib/plugin/gpu/libcuda_ops.so.10 +0 -0
- mindspore/lib/plugin/gpu/libcuda_ops.so.11 +0 -0
- mindspore/lib/plugin/gpu10.1/libakg.so +0 -0
- mindspore/lib/plugin/gpu10.1/libnccl.so.2 +0 -0
- mindspore/lib/plugin/gpu11.1/libakg.so +0 -0
- mindspore/lib/plugin/gpu11.6/libakg.so +0 -0
- mindspore/lib/plugin/gpu11.6/libnccl.so.2 +0 -0
- mindspore/lib/plugin/libmindspore_ascend.so.1 +0 -0
- mindspore/lib/plugin/libmindspore_ascend.so.2 +0 -0
- mindspore/lib/plugin/libmindspore_gpu.so.10.1 +0 -0
- mindspore/lib/plugin/libmindspore_gpu.so.11.1 +0 -0
- mindspore/lib/plugin/libmindspore_gpu.so.11.6 +0 -0
- mindspore/nn/cell.py +0 -3
- mindspore/nn/layer/activation.py +4 -5
- mindspore/nn/layer/conv.py +39 -23
- mindspore/nn/layer/flash_attention.py +54 -129
- mindspore/nn/layer/math.py +3 -7
- mindspore/nn/layer/rnn_cells.py +5 -5
- mindspore/nn/wrap/__init__.py +4 -2
- mindspore/nn/wrap/cell_wrapper.py +12 -3
- mindspore/numpy/utils_const.py +5 -5
- mindspore/ops/_grad_experimental/grad_array_ops.py +1 -1
- mindspore/ops/_grad_experimental/grad_implementations.py +2 -2
- mindspore/ops/_grad_experimental/grad_math_ops.py +19 -18
- mindspore/ops/_grad_experimental/grad_sparse_ops.py +3 -3
- mindspore/ops/_op_impl/aicpu/add.py +3 -3
- mindspore/ops/_op_impl/aicpu/linear_sum_assignment.py +21 -2
- mindspore/ops/_utils/utils.py +2 -0
- mindspore/ops/composite/multitype_ops/_compile_utils.py +2 -1
- mindspore/ops/composite/multitype_ops/getitem_impl.py +2 -2
- mindspore/ops/function/array_func.py +10 -7
- mindspore/ops/function/grad/grad_func.py +0 -1
- mindspore/ops/function/nn_func.py +98 -9
- mindspore/ops/function/random_func.py +2 -1
- mindspore/ops/op_info_register.py +24 -21
- mindspore/ops/operations/__init__.py +6 -2
- mindspore/ops/operations/_grad_ops.py +25 -6
- mindspore/ops/operations/_inner_ops.py +155 -23
- mindspore/ops/operations/array_ops.py +9 -7
- mindspore/ops/operations/comm_ops.py +2 -2
- mindspore/ops/operations/custom_ops.py +85 -68
- mindspore/ops/operations/inner_ops.py +26 -3
- mindspore/ops/operations/math_ops.py +7 -6
- mindspore/ops/operations/nn_ops.py +193 -49
- mindspore/parallel/_parallel_serialization.py +10 -3
- mindspore/parallel/_tensor.py +4 -1
- mindspore/parallel/checkpoint_transform.py +13 -2
- mindspore/parallel/shard.py +17 -10
- mindspore/profiler/common/util.py +1 -0
- mindspore/profiler/parser/ascend_hccl_generator.py +232 -0
- mindspore/profiler/parser/ascend_msprof_exporter.py +86 -43
- mindspore/profiler/parser/ascend_msprof_generator.py +196 -9
- mindspore/profiler/parser/ascend_op_generator.py +1 -1
- mindspore/profiler/parser/ascend_timeline_generator.py +6 -182
- mindspore/profiler/parser/base_timeline_generator.py +1 -1
- mindspore/profiler/parser/cpu_gpu_timeline_generator.py +2 -2
- mindspore/profiler/parser/framework_parser.py +1 -1
- mindspore/profiler/parser/profiler_info.py +19 -0
- mindspore/profiler/profiling.py +46 -24
- mindspore/rewrite/api/pattern_engine.py +1 -1
- mindspore/rewrite/parsers/for_parser.py +7 -7
- mindspore/rewrite/parsers/module_parser.py +4 -4
- mindspore/rewrite/symbol_tree.py +1 -4
- mindspore/run_check/_check_version.py +5 -3
- mindspore/safeguard/rewrite_obfuscation.py +52 -28
- mindspore/scipy/ops.py +55 -5
- mindspore/scipy/optimize/__init__.py +3 -2
- mindspore/scipy/optimize/linear_sum_assignment.py +38 -33
- mindspore/train/callback/_summary_collector.py +1 -1
- mindspore/train/dataset_helper.py +1 -0
- mindspore/train/model.py +2 -2
- mindspore/train/serialization.py +97 -11
- mindspore/train/summary/_summary_adapter.py +1 -1
- mindspore/train/summary/summary_record.py +23 -7
- mindspore/version.py +1 -1
- {mindspore-2.2.0.dist-info → mindspore-2.2.11.dist-info}/METADATA +3 -2
- {mindspore-2.2.0.dist-info → mindspore-2.2.11.dist-info}/RECORD +160 -151
- mindspore/ops/_op_impl/_custom_op/flash_attention/attention.py +0 -406
- mindspore/ops/_op_impl/_custom_op/flash_attention/constants.py +0 -41
- mindspore/ops/_op_impl/_custom_op/flash_attention/flash_attention_bwd.py +0 -467
- mindspore/ops/_op_impl/_custom_op/flash_attention/flash_attention_fwd.py +0 -563
- mindspore/ops/_op_impl/_custom_op/flash_attention/flash_attention_impl.py +0 -193
- mindspore/ops/_op_impl/_custom_op/flash_attention/tik_ops_utils.py +0 -435
- mindspore/ops/_op_impl/_custom_op/flash_attention/tiling_strategy/__init__.py +0 -0
- mindspore/ops/_op_impl/_custom_op/flash_attention/tiling_strategy/sparse_tiling.py +0 -45
- mindspore/ops/_op_impl/_custom_op/flash_attention/tiling_strategy/strategy.py +0 -67
- mindspore/ops/_op_impl/_custom_op/flash_attention/tiling_strategy/wukong_tiling.py +0 -62
- /mindspore/{ops/_op_impl/_custom_op/flash_attention → _akg/akg/utils/ascend_profilier}/__init__.py +0 -0
- {mindspore-2.2.0.dist-info → mindspore-2.2.11.dist-info}/WHEEL +0 -0
- {mindspore-2.2.0.dist-info → mindspore-2.2.11.dist-info}/entry_points.txt +0 -0
- {mindspore-2.2.0.dist-info → mindspore-2.2.11.dist-info}/top_level.txt +0 -0
mindspore/nn/layer/flash_attention.py
CHANGED

@@ -21,9 +21,7 @@ import mindspore.common.dtype as mstype
 from mindspore.common.tensor import Tensor
 from mindspore import ops
 from mindspore.nn.cell import Cell
-from mindspore.ops._op_impl._custom_op.flash_attention.flash_attention_impl import get_flash_attention
 from mindspore.ops.operations.nn_ops import FlashAttentionScore
-from mindspore._c_expression import MSContext

 __all__ = ['FlashAttention']

@@ -46,25 +44,25 @@ class FlashAttention(Cell):
             Default 65536.
         next_block_num(int): A integer to define the number of blocks to look behind for local block sparse attention.
             Default 65536.
-        tiling_stgy_name(str): A str to define tiling strategy of flash attention.
         dp(int): data parallel.
             Default 1.
         mp(int): model parallel.
             Default 1.
-        high_precision(bool): This mode has higher precision but some performance loss.
+        high_precision(bool): This mode has higher precision but some performance loss. Only take effect on Ascend910A.
             Default False.
         have_attention_mask_batch(bool): indicates whether attention_mask contains the batch dimension.
             Default True
         alibi(bool): This parameter indicates whether the flashattention supports the Alibi.
             Default: False
+        use_mqa(bool): Using MQA if True, only take effect under 910B. Default: False.


     Inputs:
         - **query** (Tensor) - Tensor query (:class:`mstype.fp16` [batch_size, head_num, seq_length, head_dim])
         - **key** (Tensor) - Tensor key (:class:`mstype.fp16` [batch_size, head_num, seq_length, head_dim])
         - **value** (Tensor) - Tensor value (:class:`mstype.fp16` [batch_size, head_num, seq_length, head_dim])
-        - **attention_mask** (Tensor) - Float Tensor the mask of (:class:`mstype.fp16`
-
+        - **attention_mask** (Tensor) - Float Tensor the mask of (:class:`mstype.fp16` `mstype.uint8`
+          [batch_size, seq_length, seq_length]): A matrix to pass masked information.

     Outputs:
         A Tensor. The output of the attention with shape [batch_size, head_num, seq_length, head_dim]

@@ -97,56 +95,51 @@ class FlashAttention(Cell):
                  dropout_rate=0.0,
                  prev_block_num=65536,
                  next_block_num=65536,
-                 tiling_stgy_name="sparse",
                  dp=1,
                  mp=1,
                  high_precision=False,
                  have_attention_mask_batch=True,
-                 alibi=False
+                 alibi=False,
+                 use_mqa=False
                  ):
         super(FlashAttention, self).__init__()

         scaling_constant = math.sqrt(head_dim)
         if scaling_constant == 0:
             raise ValueError("the scaling constant must not be 0.")
-        self.
+        self.dropout_rate = dropout_rate
+        self.alibi = alibi
+        self.have_attention_mask_batch = have_attention_mask_batch

-        self.
-
-
-
-
-
-
-            )
-
-        else:
-            if alibi:
-                raise ValueError(f"When soc_version is not Ascend910A, alibi must be False")
-            self.transpose_4d_pre = ops.Transpose().shard(((dp, mp, 1, 1),))
-            self.transpose_4d_post = ops.Transpose().shard(((dp, 1, mp, 1),))
-            self.reshape = ops.Reshape()
-            self.zeros_like = ops.ZerosLike().shard(((dp, mp, 1, 1),))
-            self.zeros = ops.Zeros()
-            self.attn_expand_dims = ops.ExpandDims().shard(((dp, 1, 1),))
-            fa_strategies = ((dp, 1, mp),
-                             (dp, 1, mp),
-                             (dp, 1, mp),
+        self.transpose_4d_pre = ops.Transpose().shard(((dp, mp, 1, 1),))
+        self.transpose_4d_post = ops.Transpose().shard(((dp, 1, mp, 1),))
+        self.reshape = ops.Reshape()
+        self.zeros_like = ops.ZerosLike().shard(((dp, mp, 1, 1),))
+        self.zeros = ops.Zeros()
+        self.attn_cast = ops.Cast()
+        if use_mqa:
+            fa_strategies = ((dp, mp, 1, 1),
+                             (dp, 1, 1, 1),
                              (dp, 1, 1, 1))
-
-
-
-
-
-
-
+        else:
+            fa_strategies = ((dp, mp, 1, 1),
+                             (dp, mp, 1, 1),
+                             (dp, mp, 1, 1))
+        if self.alibi:
+            self.alibi_rescale_mul = ops.Mul().shard(((dp, mp, 1, 1), (1,)))
+            self.alibi_rescale_factor = Tensor([scaling_constant], dtype=mstype.float16)
+            fa_strategies += ((dp, mp, 1, 1),)
+        if dropout_rate > 1e-5:
+            fa_strategies += ((dp, mp, 1, 1),)
+        fa_strategies += ((dp, 1, 1, 1),)
+        self.flash_attention = FlashAttentionScore(head_num=head_num, pre_tokens=prev_block_num,
+                                                   next_tokens=next_block_num,
+                                                   keep_prob=1 - dropout_rate,
+                                                   scale_value=1. / scaling_constant,
+                                                   inner_precise=0,
+                                                   input_layout="BNSD").shard(fa_strategies)

-        self.ones = ops.Ones()
-        self.dim_mask = Tensor([1 for _ in range(head_dim)], dtype=mstype.int8)
-        self.scale_mul = ops.Mul().shard(((dp, mp, 1, 1), (1,)))
         self.dropout_rate = dropout_rate
-        self.have_attention_mask_batch = have_attention_mask_batch
-        self.alibi = alibi
         if self.dropout_rate > 1e-5:
             self.keep_prob = Tensor(1 - self.dropout_rate, dtype=mstype.float16)
             self.fill_v2 = ops.FillV2().shard(((dp, mp, 1, 1), ()))

@@ -162,46 +155,7 @@ class FlashAttention(Cell):
                            such as MatMul. Default: None.
         :return:
         """
-        if in_strategy is None:
-            # default: dp=1, mp=1, construct inputs only contain query, key, value
-            in_strategy = (
-                (1, 1, 1, 1),
-                (1, 1, 1, 1),
-                (1, 1, 1, 1),
-            )
         self.flash_attention.shard(in_strategy)
-        dp = in_strategy[0][0]
-        mp = in_strategy[0][1]
-        self.flash_attention.add_prim_attr("dev_matrix_shape", [dp, mp, 1, 1])
-        inputs_tensor_map = [
-            [3, 2, 1, 0],
-            [3, 2, 1, 0],
-            [3, 2, 1, 0],
-        ]
-        if self.have_attention_mask_batch:
-            inputs_tensor_map.append([3, 1, 0])
-        else:
-            inputs_tensor_map.append([-1, 1, 0])
-
-        input_empty_args_num = 2
-        # dropout_mask
-        if self.dropout_rate > 1e-5:
-            input_empty_args_num -= 1
-            inputs_tensor_map.append([3, 2, 1, 0])
-
-        if self.alibi:
-            input_empty_args_num -= 1
-            inputs_tensor_map.append([3, 2, 1, 0])
-
-        self.flash_attention.add_prim_attr("inputs_tensor_map", inputs_tensor_map)
-
-        self.flash_attention.add_prim_attr("outputs_tensor_map", [
-            [3, 2, 1, 0],  # O
-            [3, 2, 1],  # L
-            [3, 2, 1]  # M
-        ])
-        self.flash_attention.add_prim_attr("as_loss_divisor", 0)
-        self.flash_attention.add_prim_attr("empty_mirror_ops", input_empty_args_num)

     def construct(self, query, key, value, attn_mask=None, alibi_mask=None):
         """FlashAttention forward

@@ -212,53 +166,24 @@ class FlashAttention(Cell):
         :param alibi_mask: [bsz, head_num, 1, seq_len], if not None
         :return: output [bsz, head_num, seq_len, head_dim]
         """
-
-
-
-
-
-            raise ValueError(
-                "the head_num of query, key and value must be the same, "
-                "If different head_num are used, users need to change themselves to be same by tile.")
-        if seq_len % 16 != 0 or k_seq_len % 16 != 0 or k_seq_len != v_seq_len:
-            raise ValueError(
-                "query, key, value seq_len must be a multiple of 16, and key seq_len, value seq_len must be the same.")
-
-        if head_dim > 304:
-            raise ValueError(
-                "the head_dim must be less than 304, otherwise the ub would be OOM.")
-
-        if self.is_910A:
-            # 910A -- FlashAttentionPrimtive
-            if self.dropout_rate > 1e-5:
-                drop_mask_bits = self.drop_gen_mask((bsz, head_num, seq_len, seq_len), self.keep_prob)
-                tensor_shape = Tensor((bsz, head_num, seq_len, seq_len), mstype.int32)
-                ones = self.fill_v2(tensor_shape, self.tensor_one)
-                ones = self.depend(ones, query)
-                drop_mask = self.do_dropout(ones, drop_mask_bits, self.keep_prob)
-            else:
-                drop_mask = None
-            output, _, _ = self.flash_attention(query, key, value, attn_mask, drop_mask, alibi_mask)
+        bsz, head_num, seq_len, _ = query.shape
+        # 910B -- FlashAttentionScore
+        if self.dropout_rate > 1e-5:
+            drop_mask_bits = self.reshape(self.drop_gen_mask((bsz, head_num, seq_len, seq_len), self.keep_prob),
+                                          (bsz, head_num, seq_len, seq_len // 8))
         else:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                                                    value,
-                                                    attn_mask,
-                                                    drop_mask_bits,
-                                                    None,
-                                                    None)
-            output = self.transpose_4d_post(self.reshape(output, (bsz, seq_len, head_num, head_dim)), (0, 2, 1, 3))
-
+            drop_mask_bits = None
+        if self.alibi:
+            alibi_mask = self.alibi_rescale_mul(alibi_mask, self.cast(self.alibi_rescale_factor, alibi_mask.dtype))
+        # (B, S, S) -> (B, 1, S, S)
+        if self.have_attention_mask_batch:
+            attn_mask = self.cast(self.reshape(attn_mask, (bsz, 1, seq_len, seq_len)), mstype.uint8)
+        _, _, _, output = self.flash_attention(query,
+                                               key,
+                                               value,
+                                               alibi_mask,
+                                               drop_mask_bits,
+                                               None,
+                                               attn_mask,
+                                               None)
         return output
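Reading the new __init__ and construct signatures together gives a rough picture of how the 2.2.11 wrapper is meant to be called. The sketch below is illustrative only and is based solely on the parameters visible in this diff; the leading constructor arguments (head count and head dim) are not shown in the hunk above, so their keyword names here are assumptions, and actually running it requires an Ascend back end.

import numpy as np
import mindspore as ms
from mindspore import Tensor
from mindspore.nn.layer.flash_attention import FlashAttention

# Shapes follow the docstring: [batch_size, head_num, seq_length, head_dim], fp16 inputs.
bsz, head_num, seq_len, head_dim = 1, 8, 1024, 128
query = Tensor(np.random.randn(bsz, head_num, seq_len, head_dim), ms.float16)
key = Tensor(np.random.randn(bsz, head_num, seq_len, head_dim), ms.float16)
value = Tensor(np.random.randn(bsz, head_num, seq_len, head_dim), ms.float16)
# attention_mask is (batch_size, seq_length, seq_length); the new code reshapes it to
# (batch_size, 1, seq_length, seq_length) and casts it to uint8 before the fused op.
attn_mask = Tensor(np.zeros((bsz, seq_len, seq_len)), ms.float16)

# head_num / head_dim keyword names are assumed, not confirmed by this hunk.
# use_mqa is the flag added in 2.2.11 and, per the docstring, only takes effect on 910B.
fa = FlashAttention(head_num=head_num, head_dim=head_dim, dropout_rate=0.0, use_mqa=False)
output = fa(query, key, value, attn_mask)   # [bsz, head_num, seq_len, head_dim]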
mindspore/nn/layer/math.py
CHANGED

@@ -375,9 +375,6 @@ class DiGamma(Cell):
                              nan, real_result)


-eps_fp32 = Tensor(np.finfo(np.float32).eps, mstype.float32)
-
-
 def _while_helper_func(cond, body, vals):
     while cond(vals).any():
         vals = body(vals)

@@ -394,7 +391,7 @@ def _igamma_series(ax, x, a, enabled):
     select = P.Select()

     # If more data types are supported, this epsilon need to be selected.
-    epsilon =
+    epsilon = Tensor(np.finfo(np.float32).eps, mstype.float32)

     def cond(vals):
         enabled = vals[0]

@@ -443,7 +440,7 @@ def _igammac_continued_fraction(ax, x, a, enabled):
     select = P.Select()

     # If more data types are supported, this epsilon need to be selected.
-    epsilon =
+    epsilon = Tensor(np.finfo(np.float32).eps, mstype.float32)

     def cond(vals):
         enabled = vals[0]

@@ -620,8 +617,7 @@ class IGamma(Cell):
         x = F.broadcast_to(x, para_shape)
         a = F.broadcast_to(a, para_shape)
         x_is_zero = self.equal(x, 0)
-
-        underflow = self.less(ax, self.neg(log_maxfloat))
+        underflow = self.less(ax, self.neg(self.log_maxfloat32))
         ax = self.exp(ax)
         enabled = self.logicalnot(self.logicalor(self.logicalor(x_is_zero, domain_error), underflow))
         output = self.select(use_igammac,
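Both epsilon hunks above now build float32 machine epsilon inline rather than reading a module-level constant; the value itself is easy to sanity-check with plain NumPy (nothing MindSpore-specific here):

import numpy as np

eps = np.finfo(np.float32).eps           # 1.1920929e-07, gap between 1.0 and the next float32
one = np.float32(1.0)
print(one + np.float32(eps) > one)       # True: adding a full epsilon is representable
print(one + np.float32(eps) / 2 == one)  # True: half an epsilon rounds back to 1.0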
mindspore/nn/layer/rnn_cells.py
CHANGED

@@ -83,7 +83,7 @@ def _check_lstmcell_init(func):


 def _rnn_tanh_cell(inputs, hidden, w_ih, w_hh, b_ih, b_hh):
-
+    """RNN cell function with tanh activation"""
     if b_ih is None:
         igates = P.MatMul(False, True)(inputs, w_ih)
         hgates = P.MatMul(False, True)(hidden, w_hh)

@@ -94,7 +94,7 @@ def _rnn_tanh_cell(inputs, hidden, w_ih, w_hh, b_ih, b_hh):


 def _rnn_relu_cell(inputs, hidden, w_ih, w_hh, b_ih, b_hh):
-
+    """RNN cell function with relu activation"""
     if b_ih is None:
         igates = P.MatMul(False, True)(inputs, w_ih)
         hgates = P.MatMul(False, True)(hidden, w_hh)

@@ -105,7 +105,7 @@ def _rnn_relu_cell(inputs, hidden, w_ih, w_hh, b_ih, b_hh):


 def _lstm_cell(inputs, hidden, w_ih, w_hh, b_ih, b_hh):
-
+    """LSTM cell function"""
     hx, cx = hidden
     if b_ih is None:
         gates = P.MatMul(False, True)(inputs, w_ih) + P.MatMul(False, True)(hx, w_hh)

@@ -125,7 +125,7 @@ def _lstm_cell(inputs, hidden, w_ih, w_hh, b_ih, b_hh):


 def _gru_cell(inputs, hidden, w_ih, w_hh, b_ih, b_hh):
-
+    """GRU cell function"""
     if b_ih is None:
         gi = P.MatMul(False, True)(inputs, w_ih)
         gh = P.MatMul(False, True)(hidden, w_hh)

@@ -144,7 +144,7 @@ def _gru_cell(inputs, hidden, w_ih, w_hh, b_ih, b_hh):


 class RNNCellBase(Cell):
-
+    """Basic class for RNN Cells"""
     def __init__(self, input_size: int, hidden_size: int, has_bias: bool, num_chunks: int,
                  dtype=mstype.float32):
         super().__init__()
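The newly documented helpers all share one pattern: project the input and the hidden state with MatMul(transpose_b=True), add the biases if present, then apply the cell's gating nonlinearity. A plain-NumPy sketch of the tanh variant (illustrative only, not the MindSpore kernel):

import numpy as np

def rnn_tanh_cell(inputs, hidden, w_ih, w_hh, b_ih=None, b_hh=None):
    # inputs: (batch, input_size), hidden: (batch, hidden_size)
    # w_ih: (hidden_size, input_size), w_hh: (hidden_size, hidden_size)
    igates = inputs @ w_ih.T             # MatMul(False, True): transpose the weight
    hgates = hidden @ w_hh.T
    if b_ih is not None:
        igates = igates + b_ih
        hgates = hgates + b_hh
    return np.tanh(igates + hgates)      # h' = tanh(W_ih x + b_ih + W_hh h + b_hh)

h_next = rnn_tanh_cell(np.zeros((2, 3)), np.zeros((2, 4)),
                       np.random.randn(4, 3), np.random.randn(4, 4))
print(h_next.shape)                      # (2, 4)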
mindspore/nn/wrap/__init__.py
CHANGED

@@ -20,7 +20,8 @@ Use the Wrapper to combine the loss or build the training steps.
 from __future__ import absolute_import

 from mindspore.nn.wrap.cell_wrapper import ForwardValueAndGrad, TrainOneStepCell, WithLossCell, WithGradCell, \
-    WithEvalCell, ParameterUpdate, GetNextSingleOp, VirtualDatasetCellTriple, MicroBatchInterleaved, PipelineCell
+    WithEvalCell, ParameterUpdate, GetNextSingleOp, VirtualDatasetCellTriple, MicroBatchInterleaved, PipelineCell, \
+    GradAccumulationCell
 from mindspore.nn.wrap.loss_scale import TrainOneStepWithLossScaleCell,\
     DynamicLossScaleUpdateCell, FixedLossScaleUpdateCell
 from mindspore.nn.wrap.grad_reducer import DistributedGradReducer

@@ -42,5 +43,6 @@ __all__ = [
     "ParameterUpdate",
     "DynamicLossScaleUpdateCell",
     "FixedLossScaleUpdateCell",
-    "VirtualDatasetCellTriple"
+    "VirtualDatasetCellTriple",
+    "GradAccumulationCell"
 ]

mindspore/nn/wrap/cell_wrapper.py
CHANGED

@@ -644,6 +644,9 @@ class PipelineCell(Cell):
         self.micro_inputs = nn.CellList()
         self.micro_size = micro_size
         self.add_list = []
+        if not isinstance(network, Cell):
+            raise TypeError("For 'PipelineCell', the argument 'network' must cell type, "
+                            "but got the type : {}.".format(type(network)))
         if not isinstance(micro_size, int):
             raise TypeError("For 'PipelineCell', the argument 'micro_size' must be integer, "
                             "but got the type : {}.".format(type(micro_size)))

@@ -670,7 +673,7 @@ class PipelineCell(Cell):

 class GradAccumulationCell(Cell):
     """
-    Wrap the network with Micro Batch.
+    Wrap the network with Micro Batch to enable the grad accumulation in semi_auto_parallel/auto_parallel mode.

     Args:
         network (Cell): The target network to wrap.

@@ -680,8 +683,11 @@ class GradAccumulationCell(Cell):
         ``Ascend`` ``GPU``

     Examples:
-        >>>
-        >>>
+        >>> import mindspore.nn as nn
+        >>> # Define the network structure of LeNet5. Refer to
+        >>> # https://gitee.com/mindspore/docs/blob/r2.2/docs/mindspore/code/lenet.py
+        >>> net = LeNet5()
+        >>> net = nn.GradAccumulationCell(net, 4)
     """
     def __init__(self, network, micro_size):
         super(GradAccumulationCell, self).__init__(auto_prefix=False)

@@ -689,6 +695,9 @@ class GradAccumulationCell(Cell):
         self.micro_inputs = nn.CellList()
         self.micro_size = micro_size
         self.add_list = []
+        if not isinstance(network, Cell):
+            raise TypeError("For 'GradAccumulationCell', the argument 'network' must cell type, "
+                            "but got the type : {}.".format(type(network)))
         if not isinstance(micro_size, int):
             raise TypeError("For 'GradAccumulationCell', the argument 'micro_size' must be integer, "
                             "but got the type : {}.".format(type(micro_size)))
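Both wrappers now validate the network argument up front, so passing something that is not a Cell fails immediately with a TypeError instead of surfacing a less obvious error later during graph construction. A hypothetical check of that behaviour (GradAccumulationCell is re-exported from mindspore.nn.wrap in this release; normal usage follows the docstring example above):

from mindspore.nn.wrap import GradAccumulationCell

try:
    GradAccumulationCell("not a cell", 4)    # network must be a Cell
except TypeError as err:
    print(err)   # "For 'GradAccumulationCell', the argument 'network' must cell type, ..."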
mindspore/numpy/utils_const.py
CHANGED

@@ -143,8 +143,8 @@ def _infer_out_shape(*shapes):
     shape_out = list()
     max_len = max([len(it) for it in shapes])
     for i in range(max_len):
-        items = [
-
+        items = [
+            it[i - max_len + len(it)] if i - max_len + len(it) >= 0 else 1 for it in shapes]
         max_size = 0 if 0 in items else max(items)
         _check()
         shape_out.append(max_size)

@@ -158,8 +158,8 @@ def _can_broadcast(*shapes):
     """
     max_len = max([len(it) for it in shapes])
     for i in range(max_len):
-        items = [
-
+        items = [
+            it[i - max_len + len(it)] if i - max_len + len(it) >= 0 else 1 for it in shapes]
         max_size = 0 if 0 in items else max(items)
         if any(item not in (1, max_size) for item in items):
             return False

@@ -399,7 +399,7 @@ def _broadcast_tuples(tup1, tup2):
     if not isinstance(tup1, (tuple, list)) or not isinstance(tup2, (tuple, list)):
         raise TypeError("input shift and axis must be tuple or list or int.")
     if len(tup1) == len(tup2) or len(tup1) == 1 or len(tup2) == 1:
-        return
+        return
     raise ValueError("shape mismatch: objects cannot be broadcast to a single shape")

     tup1 = (tup1,) if isinstance(tup1, int) else tup1
mindspore/ops/_grad_experimental/grad_array_ops.py
CHANGED

@@ -203,7 +203,7 @@ def get_bprop_index_put(self):
         if is_ascend:
             indices_ms = [convert_idx_positive(indices_ms[i], x1.shape[i]) for i in range(len(indices_ms))]
             indices_me = stack(indices_ms)
-            indices_grad = F.transpose(indices_me, F.make_range(F.rank(indices_me)-1, -1, -1))
+            indices_grad = F.transpose(indices_me, F.make_range(F.rank(indices_me) - 1, -1, -1))
             values_grad = gather_nd(dout, indices_grad)
             if equal(cast(x2.shape[0], mstype.int32), Tensor(1)):
                 values_grad = values_grad.sum().reshape(1)
mindspore/ops/_grad_experimental/grad_implementations.py
CHANGED

@@ -19,7 +19,7 @@ from mindspore.ops import functional as F
 from mindspore.ops import operations as P
 from mindspore.ops.composite import multitype_ops as C
 from mindspore.ops.composite.multitype_ops.zeros_like_impl import zeros_like
-from mindspore.ops._grad_experimental.grad_base import bprops
+from mindspore.ops._grad_experimental.grad_base import bprops, bprop_getters
 from mindspore.common import dtype as mstype

 get_dtype = P.DType()

@@ -193,7 +193,7 @@ def bprop_tensor_move(x, out, dout):
     return (dout,)


-@
+@bprop_getters.register("DictInplaceSetItem")
 def get_bprop_dict_inplace_setitem(self):
     """Generate bprop for dict inplace pop"""

mindspore/ops/_grad_experimental/grad_math_ops.py
CHANGED

@@ -135,7 +135,7 @@ def get_bprop_matrix_triangular_solve(self):

     def bprop(matrix, rhs, out, dout):
         grad_rhs = matrix_triangular_solve_op(matrix, dout)
-        if matrix.dtype
+        if matrix.dtype in (mstype.complex64, mstype.complex128):
             grad_rhs_temp = _adjoint(grad_rhs)
             out_temp = _adjoint(out)
         else:

@@ -156,14 +156,14 @@ def get_bprop_matrix_triangular_solve(self):
         grad_matrix = mat_mul_op(grad_rhs, out_temp)
         grad_matrix = neg_op(grad_matrix)
         if lower_a:
-            if grad_matrix.dtype
+            if grad_matrix.dtype in (mstype.complex64, mstype.complex128):
                 grad_matrix_real = matrix_band_part_op(real_op(grad_matrix), -1, 0)
                 grad_matrix_imag = matrix_band_part_op(imag_op(grad_matrix), -1, 0)
                 grad_matrix = complex_op(grad_matrix_real, grad_matrix_imag)
             else:
                 grad_matrix = matrix_band_part_op(grad_matrix, -1, 0)
         else:
-            if grad_matrix.dtype
+            if grad_matrix.dtype in (mstype.complex64, mstype.complex128):
                 grad_matrix_real = matrix_band_part_op(real_op(grad_matrix), 0, -1)
                 grad_matrix_imag = matrix_band_part_op(imag_op(grad_matrix), 0, -1)
                 grad_matrix = complex_op(grad_matrix_real, grad_matrix_imag)

@@ -219,7 +219,7 @@ def get_bprop_matrix_solve(self):
 @_primexpr
 def _generate_perm_matrix_solve_ls(x_dim):
     perm = tuple(range(x_dim - 2))
-    perm = perm + (x_dim-1, x_dim-2)
+    perm = perm + (x_dim - 1, x_dim - 2)
     return perm


@@ -647,20 +647,21 @@ def _fft_rank_offset(norm_shape, rank):
 @_primexpr
 def _fft_with_size_back_norm(norm_shape, norm, inverse, rank):
     """generate reverse term for fft_with_size"""
+    norm_ = None
     if inverse is False:
         if norm == "forward":
-            norm_ = 1 / _fft_rank_offset(norm_shape, rank)
-
-            norm_ = 1 * _fft_rank_offset(norm_shape, rank)
-
-            norm_ = 1
-
+            norm_ = 1.0 / _fft_rank_offset(norm_shape, rank)
+        elif norm == "backward":
+            norm_ = 1.0 * _fft_rank_offset(norm_shape, rank)
+        elif norm == "ortho":
+            norm_ = 1.0
+    else:
         if norm == "forward":
-            norm_ = 1 * _fft_rank_offset(norm_shape, rank)
-
-            norm_ = 1 / _fft_rank_offset(norm_shape, rank)
-
-            norm_ = 1
+            norm_ = 1.0 * _fft_rank_offset(norm_shape, rank)
+        elif norm == "backward":
+            norm_ = 1.0 / _fft_rank_offset(norm_shape, rank)
+        elif norm == "ortho":
+            norm_ = 1.0
     return norm_


@@ -670,9 +671,9 @@ def _rfft_norm(norm_shape, norm, rank):
     norm_ = 1.0
     if norm == "forward":
         norm_ = 1 / _fft_rank_offset(norm_shape, rank)
-
-        norm_ = 1
-
+    elif norm == "backward":
+        norm_ = 1.0
+    elif norm == "ortho":
         norm_ = 1 / np.sqrt(_fft_rank_offset(norm_shape, rank))
     return norm_

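The rewritten branches encode the usual FFT normalization conventions: "forward" scales the forward transform by 1/N, "backward" leaves it unscaled, and "ortho" uses 1/sqrt(N), with the roles of forward and inverse swapped on the way back. Those conventions can be checked independently with NumPy (illustration only, unrelated to the MindSpore gradient code):

import numpy as np

x = np.random.randn(8).astype(np.complex64)
n = x.size

# "backward" (NumPy's default): forward FFT unscaled, inverse divides by n.
assert np.allclose(np.fft.ifft(np.fft.fft(x)), x, atol=1e-5)
# "ortho": both directions scale by 1/sqrt(n), so the round trip is still the identity.
assert np.allclose(np.fft.ifft(np.fft.fft(x, norm="ortho"), norm="ortho"), x, atol=1e-5)
# "forward": the forward FFT itself carries the 1/n factor.
assert np.allclose(np.fft.fft(x, norm="forward"), np.fft.fft(x) / n, atol=1e-5)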
mindspore/ops/_grad_experimental/grad_sparse_ops.py
CHANGED

@@ -358,10 +358,10 @@ def get_bprop_ragged_tensor_to_sparse(self):
             split.append(zeros_like(i))
             all_d = (split, ragged_values_grad)
             return all_d
-
+        split_ = ()
         for i in enumerate(rt_nested_splits):
-
-        all_d = (
+            split_ = split_ + (zeros_like(i),)
+        all_d = (split_, ragged_values_grad)
         return all_d

     return bprop
mindspore/ops/_op_impl/aicpu/add.py
CHANGED

@@ -29,9 +29,9 @@ add_op_info = AiCPURegOp("Add") \
     .dtype_format(DataType.I32_Default, DataType.I32_Default, DataType.I32_Default) \
     .dtype_format(DataType.I64_Default, DataType.I64_Default, DataType.I64_Default) \
     .dtype_format(DataType.U8_Default, DataType.U8_Default, DataType.U8_Default) \
-    .dtype_format(DataType.U16_Default, DataType.
-    .dtype_format(DataType.U32_Default, DataType.
-    .dtype_format(DataType.U64_Default, DataType.
+    .dtype_format(DataType.U16_Default, DataType.U16_Default, DataType.U16_Default) \
+    .dtype_format(DataType.U32_Default, DataType.U32_Default, DataType.U32_Default) \
+    .dtype_format(DataType.U64_Default, DataType.U64_Default, DataType.U64_Default) \
     .dtype_format(DataType.C64_Default, DataType.C64_Default, DataType.C64_Default) \
     .dtype_format(DataType.C128_Default, DataType.C128_Default, DataType.C128_Default) \
     .get_op_info()
mindspore/ops/_op_impl/aicpu/linear_sum_assignment.py
CHANGED

@@ -1,4 +1,4 @@
-# Copyright
+# Copyright 2023 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

@@ -24,11 +24,30 @@ lsap_op_info = AiCPURegOp("LinearSumAssignment") \
     .input(2, 'maximize', "required") \
     .output(0, "row_ind", "required") \
     .output(1, "col_ind", "required") \
-    .attr("cust_aicpu", "str") \
     .dtype_format(DataType.F64_Default, DataType.I64_Default,
                   DataType.BOOL_Default, DataType.I64_Default, DataType.I64_Default,) \
     .dtype_format(DataType.F32_Default, DataType.I64_Default,
                   DataType.BOOL_Default, DataType.I64_Default, DataType.I64_Default,) \
+    .dtype_format(DataType.F16_Default, DataType.I64_Default,
+                  DataType.BOOL_Default, DataType.I64_Default, DataType.I64_Default,) \
+    .dtype_format(DataType.BOOL_Default, DataType.I64_Default,
+                  DataType.BOOL_Default, DataType.I64_Default, DataType.I64_Default,) \
+    .dtype_format(DataType.I16_Default, DataType.I64_Default,
+                  DataType.BOOL_Default, DataType.I64_Default, DataType.I64_Default,) \
+    .dtype_format(DataType.I32_Default, DataType.I64_Default,
+                  DataType.BOOL_Default, DataType.I64_Default, DataType.I64_Default,) \
+    .dtype_format(DataType.I64_Default, DataType.I64_Default,
+                  DataType.BOOL_Default, DataType.I64_Default, DataType.I64_Default,) \
+    .dtype_format(DataType.I8_Default, DataType.I64_Default,
+                  DataType.BOOL_Default, DataType.I64_Default, DataType.I64_Default,) \
+    .dtype_format(DataType.U16_Default, DataType.I64_Default,
+                  DataType.BOOL_Default, DataType.I64_Default, DataType.I64_Default,) \
+    .dtype_format(DataType.U32_Default, DataType.I64_Default,
+                  DataType.BOOL_Default, DataType.I64_Default, DataType.I64_Default,) \
+    .dtype_format(DataType.U64_Default, DataType.I64_Default,
+                  DataType.BOOL_Default, DataType.I64_Default, DataType.I64_Default,) \
+    .dtype_format(DataType.U8_Default, DataType.I64_Default,
+                  DataType.BOOL_Default, DataType.I64_Default, DataType.I64_Default,) \
     .get_op_info()

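The registration above mirrors the contract of SciPy's linear_sum_assignment (a cost matrix plus a maximize flag in, row_ind/col_ind out), which makes SciPy a convenient host-side reference for sanity-checking results. The snippet below uses SciPy itself, not the MindSpore operator:

import numpy as np
from scipy.optimize import linear_sum_assignment

cost = np.array([[4, 1, 3],
                 [2, 0, 5],
                 [3, 2, 2]])
row_ind, col_ind = linear_sum_assignment(cost, maximize=False)
print(row_ind, col_ind)              # [0 1 2] [1 0 2]
print(cost[row_ind, col_ind].sum())  # 5, the minimal total cost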
mindspore/ops/_utils/utils.py
CHANGED

@@ -77,10 +77,12 @@ def get_broadcast_shape(x_shape, y_shape, prim_name, arg_name1="x", arg_name2="y
     broadcast_shape = list(broadcast_shape_front) + broadcast_shape_back
     return broadcast_shape

+
 def dim_not_equal(dim1, dim2):
     """Compare dim in shape"""
     return dim1 != dim2 and dim1 >= 0 and dim2 >= 0

+
 def get_concat_offset(x_shp, x_type, axis, prim_name):
     """for concat and concatoffset check args and compute offset"""
     validator.check_value_type("shape", x_shp, [tuple, list], prim_name)
mindspore/ops/composite/multitype_ops/_compile_utils.py
CHANGED

@@ -1255,7 +1255,8 @@ def _tensor_setitem_by_bool_tensor_with_tensor(data, index, value):
     index = index.reshape(const_utils.generate_padding_shape(index.shape, len(data.shape)))
     index = F.broadcast_to(index, data.shape)
     value = F.cast(value, F.dtype(data))
-
+    while value.ndim < data.ndim:
+        value = value.unsqueeze(-1)
     value = F.broadcast_to(value, data.shape)
     result = F.select(index, value, data)
     return result
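The added while-loop pads trailing singleton axes onto value until its rank matches data, which is what makes the subsequent broadcast_to(data.shape) legal when value carries fewer dimensions than data. The same alignment issue is easy to reproduce in plain NumPy (illustrative analogue, not the MindSpore code path):

import numpy as np

data = np.zeros((3, 4))
value = np.array([1.0, 2.0, 3.0])          # one value per row, shape (3,)

try:
    np.broadcast_to(value, data.shape)     # fails: (3,) aligns with the last axis (size 4)
except ValueError as err:
    print(err)

while value.ndim < data.ndim:              # analogue of the new unsqueeze(-1) loop
    value = value[..., np.newaxis]         # shape becomes (3, 1)
print(np.broadcast_to(value, data.shape))  # rows of 1.0, 2.0, 3.0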
mindspore/ops/composite/multitype_ops/getitem_impl.py
CHANGED

@@ -161,7 +161,7 @@ def _tuple_getitem_by_slice(data, slice_index):
         if start is None:
             start = 0 if step >= 1 else -1
         if stop is None:
-            stop = (2**31-1) if step >= 1 else -(2**31-1)
+            stop = (2**31 - 1) if step >= 1 else -(2**31 - 1)
         return sequence_slice(data, start, stop, step)
     return _tuple_slice(data, slice_index)

@@ -236,7 +236,7 @@ def _list_getitem_by_slice(data, slice_index):
         if start is None:
             start = 0 if step >= 1 else -1
         if stop is None:
-            stop = (2**31-1) if step >= 1 else -(2**31-1)
+            stop = (2**31 - 1) if step >= 1 else -(2**31 - 1)
         return sequence_slice(data, start, stop, step)
     return _list_slice(data, slice_index)
