mindspore-2.3.0rc1-cp38-none-any.whl → mindspore-2.3.0rc2-cp38-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mindspore has been flagged as possibly problematic.

Files changed (318)
  1. mindspore/.commit_id +1 -1
  2. mindspore/__init__.py +1 -1
  3. mindspore/_akg/akg/utils/tbe_codegen_utils.py +13 -3
  4. mindspore/_c_dataengine.cpython-38-aarch64-linux-gnu.so +0 -0
  5. mindspore/_c_expression.cpython-38-aarch64-linux-gnu.so +0 -0
  6. mindspore/_checkparam.py +20 -0
  7. mindspore/_extends/parse/parser.py +1 -1
  8. mindspore/_extends/parse/standard_method.py +6 -5
  9. mindspore/_mindspore_offline_debug.cpython-38-aarch64-linux-gnu.so +0 -0
  10. mindspore/amp.py +5 -5
  11. mindspore/bin/cache_admin +0 -0
  12. mindspore/bin/cache_server +0 -0
  13. mindspore/boost/boost_cell_wrapper.py +1 -1
  14. mindspore/boost/group_loss_scale_manager.py +1 -1
  15. mindspore/common/__init__.py +4 -2
  16. mindspore/common/_register_for_recompute.py +48 -0
  17. mindspore/common/_stub_tensor.py +1 -0
  18. mindspore/common/api.py +56 -4
  19. mindspore/common/dtype.py +5 -3
  20. mindspore/common/dump.py +2 -2
  21. mindspore/common/hook_handle.py +51 -4
  22. mindspore/common/initializer.py +1 -1
  23. mindspore/common/jit_config.py +17 -6
  24. mindspore/common/parameter.py +7 -2
  25. mindspore/common/recompute.py +247 -0
  26. mindspore/common/sparse_tensor.py +2 -2
  27. mindspore/common/symbol.py +1 -1
  28. mindspore/common/tensor.py +74 -36
  29. mindspore/communication/__init__.py +3 -3
  30. mindspore/communication/management.py +30 -30
  31. mindspore/context.py +28 -15
  32. mindspore/dataset/__init__.py +5 -5
  33. mindspore/dataset/audio/__init__.py +2 -2
  34. mindspore/dataset/audio/transforms.py +51 -51
  35. mindspore/dataset/callback/ds_callback.py +2 -2
  36. mindspore/dataset/engine/cache_client.py +1 -1
  37. mindspore/dataset/engine/datasets.py +3 -3
  38. mindspore/dataset/engine/datasets_audio.py +14 -14
  39. mindspore/dataset/engine/datasets_standard_format.py +3 -3
  40. mindspore/dataset/engine/datasets_text.py +38 -38
  41. mindspore/dataset/engine/datasets_user_defined.py +3 -3
  42. mindspore/dataset/engine/datasets_vision.py +68 -68
  43. mindspore/dataset/text/__init__.py +3 -3
  44. mindspore/dataset/text/transforms.py +26 -26
  45. mindspore/dataset/transforms/__init__.py +1 -1
  46. mindspore/dataset/vision/__init__.py +3 -3
  47. mindspore/dataset/vision/transforms.py +92 -92
  48. mindspore/dataset/vision/utils.py +1 -1
  49. mindspore/experimental/optim/adadelta.py +2 -2
  50. mindspore/experimental/optim/adagrad.py +2 -2
  51. mindspore/experimental/optim/adam.py +2 -2
  52. mindspore/experimental/optim/adamax.py +2 -2
  53. mindspore/experimental/optim/adamw.py +2 -2
  54. mindspore/experimental/optim/asgd.py +2 -2
  55. mindspore/experimental/optim/lr_scheduler.py +24 -20
  56. mindspore/experimental/optim/nadam.py +2 -2
  57. mindspore/experimental/optim/optimizer.py +1 -1
  58. mindspore/experimental/optim/radam.py +2 -2
  59. mindspore/experimental/optim/rmsprop.py +2 -2
  60. mindspore/experimental/optim/rprop.py +2 -2
  61. mindspore/experimental/optim/sgd.py +2 -2
  62. mindspore/hal/stream.py +2 -0
  63. mindspore/include/mindapi/base/types.h +5 -0
  64. mindspore/lib/libdnnl.so.2 +0 -0
  65. mindspore/lib/libmindspore.so +0 -0
  66. mindspore/lib/libmindspore_backend.so +0 -0
  67. mindspore/lib/libmindspore_common.so +0 -0
  68. mindspore/lib/libmindspore_core.so +0 -0
  69. mindspore/lib/libmindspore_glog.so.0 +0 -0
  70. mindspore/lib/libmindspore_gpr.so.15 +0 -0
  71. mindspore/lib/libmindspore_grpc.so.15 +0 -0
  72. mindspore/lib/libmindspore_shared_lib.so +0 -0
  73. mindspore/lib/libopencv_core.so.4.5 +0 -0
  74. mindspore/lib/libopencv_imgcodecs.so.4.5 +0 -0
  75. mindspore/lib/libopencv_imgproc.so.4.5 +0 -0
  76. mindspore/lib/plugin/ascend/custom_aicpu_ops/op_impl/cpu/aicpu_kernel/impl/libcust_cpu_kernels.so +0 -0
  77. mindspore/lib/plugin/ascend/custom_aicpu_ops/op_impl/cpu/config/cust_aicpu_kernel.json +6 -6
  78. mindspore/lib/plugin/ascend/custom_aicpu_ops/op_proto/libcust_op_proto.so +0 -0
  79. mindspore/lib/plugin/ascend/libdvpp_utils.so +0 -0
  80. mindspore/lib/plugin/ascend/liblowlatency_collective.so +0 -0
  81. mindspore/lib/plugin/ascend/libmindspore_cpu_kernels.so +0 -0
  82. mindspore/lib/plugin/ascend/ms_kernels_internal/asdops/bin/DeviceBin +0 -0
  83. mindspore/lib/plugin/ascend/ms_kernels_internal/asdops/bin/PkgInspect +0 -0
  84. mindspore/lib/plugin/ascend/ms_kernels_internal/asdops/bin/op_man +0 -0
  85. mindspore/lib/plugin/ascend/ms_kernels_internal/asdops/device/ascend910b/bin/ascend910b.bin +101787 -98559
  86. mindspore/lib/plugin/ascend/ms_kernels_internal/asdops/host/libasdops_cann_host.so +0 -0
  87. mindspore/lib/plugin/ascend/ms_kernels_internal/asdops/host/libasdops_host.so +0 -0
  88. mindspore/lib/plugin/ascend/ms_kernels_internal/asdops/include/asdops/base/op_register.h +2 -2
  89. mindspore/lib/plugin/ascend/ms_kernels_internal/asdops/include/asdops/params/mix.h +8 -1
  90. mindspore/lib/plugin/ascend/ms_kernels_internal/asdops/include/asdops/params/norm.h +5 -3
  91. mindspore/lib/plugin/ascend/ms_kernels_internal/asdops/include/asdops/params/reduce.h +2 -2
  92. mindspore/lib/plugin/ascend/ms_kernels_internal/asdops/include/asdops/utils/rt/backend/backend.h +3 -3
  93. mindspore/lib/plugin/ascend/ms_kernels_internal/asdops/include/asdops/utils/rt/backend/rtbackend.h +3 -3
  94. mindspore/lib/plugin/ascend/ms_kernels_internal/asdops/include/asdops/utils/rt/base/types.h +0 -1
  95. mindspore/lib/plugin/ascend/ms_kernels_internal/asdops/include/asdops/utils/rt/module/module.h +3 -3
  96. mindspore/lib/plugin/ascend/ms_kernels_internal/asdops/include/asdops/utils/svector/svector.h +3 -2
  97. mindspore/lib/plugin/ascend/ms_kernels_internal/asdops/lib/libasdops.so +0 -0
  98. mindspore/lib/plugin/ascend/ms_kernels_internal/asdops/lib/libasdops_static.a +0 -0
  99. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/add/tiling/add_tiling.h +9 -9
  100. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/apply_rotary_pos_emb/apply_rotary_pos_emb_impl.h +2 -6
  101. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/apply_rotary_pos_emb/kernel/apply_rotary_pos_emb.h +2 -2
  102. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/apply_rotary_pos_emb/kernel/apply_rotary_pos_emb_base.h +460 -0
  103. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/apply_rotary_pos_emb/kernel/apply_rotary_pos_emb_bf16.h +217 -0
  104. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/apply_rotary_pos_emb/kernel/apply_rotary_pos_emb_fp16.h +116 -0
  105. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/apply_rotary_pos_emb/kernel/apply_rotary_pos_emb_tiling.h +16 -24
  106. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/apply_rotary_pos_emb/kernel/apply_rotary_pos_emb_value.h +27 -0
  107. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/asdop/asd_op_impl.h +0 -4
  108. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/{attention/FlashAttentionScore_impl.h → flash_attention_score/flash_attention_score_impl.h} +2 -1
  109. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/{attention/bs_attention_tiling.h → flash_attention_score/flash_attention_score_tiling.h} +15 -19
  110. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/gelu/tiling/gelu_tiling.h +7 -9
  111. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/lccl/lccl_wrapper.h +58 -0
  112. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/matmul/matmul_impl.h +19 -8
  113. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/{matmul → matmul_common}/pp_matmul_common_tiling.h +18 -8
  114. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/{matmul → matmul_common}/pp_matmul_info.h +7 -4
  115. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/{matmul → matmul_common}/tiling_data.h +44 -6
  116. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/matmul_common/tiling_utils.h +65 -0
  117. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/matmul_stridedslice/matmul_stridedslice_fusion_impl.h +10 -6
  118. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/op_param.h +4 -1
  119. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/paged_attention/kernel/paged_attention_mix_hwsync.h +41 -0
  120. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/{attention/PagedAttention_impl.h → paged_attention/paged_attention_impl.h} +1 -1
  121. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/paged_attention/paged_attention_tiling.h +63 -0
  122. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/param/add_param.h +2 -2
  123. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/{attention_param.h → param/attention_param.h} +11 -2
  124. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/param/matmul_ext_param.h +37 -0
  125. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/param/sub_param.h +45 -0
  126. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/reshape_and_cache/reshape_and_cache_tiling.h +1 -2
  127. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/rms_norm/kernel/rms_norm.h +23 -0
  128. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/rms_norm/kernel/rms_norm_base.h +175 -0
  129. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/rms_norm/kernel/rms_norm_normal.h +276 -0
  130. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/rms_norm/kernel/rms_norm_split_d.h +280 -0
  131. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/rms_norm/kernel/tiling_data.h +35 -0
  132. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/rms_norm/rms_norm_impl.h +45 -0
  133. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/sub/kernel/sub_kernel.h +20 -0
  134. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/sub/sub_impl.h +47 -0
  135. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/sub/sub_tiling.h +25 -0
  136. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/tune_repo/matmul_table.h +323 -23
  137. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/types.h +15 -4
  138. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/utils/log/log_tiling.h +8 -0
  139. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libAdd_impl.so +0 -0
  140. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libSub_impl.so +0 -0
  141. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libadd_layernorm_impl.so +0 -0
  142. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libadd_rms_norm_impl.so +0 -0
  143. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libapply_rotary_pos_emb_impl.so +0 -0
  144. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libcast_impl.so +0 -0
  145. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libgelu_impl.so +0 -0
  146. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libmatmul_impl.so +0 -0
  147. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libmatmul_stridedslice_fusion_impl.so +0 -0
  148. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libms_kernels_internal.so +0 -0
  149. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libnot_equal_impl.so +0 -0
  150. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libreshape_and_cache_impl.so +0 -0
  151. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/librms_norm_impl.so +0 -0
  152. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/BSAttention/flash_attention_score_bf16_bnsd_full_mix.o +0 -0
  153. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/BSAttention/flash_attention_score_bf16_bnsd_tri_mix.o +0 -0
  154. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/BSAttention/flash_attention_score_bf16_bsh_full_mix.o +0 -0
  155. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/BSAttention/flash_attention_score_bf16_bsh_tri_mix.o +0 -0
  156. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/BSAttention/flash_attention_score_fp16_bnsd_full_mix.o +0 -0
  157. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/BSAttention/flash_attention_score_fp16_bnsd_tri_mix.o +0 -0
  158. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/BSAttention/flash_attention_score_fp16_bsh_full_mix.o +0 -0
  159. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/BSAttention/flash_attention_score_fp16_bsh_tri_mix.o +0 -0
  160. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/BSAttention/paged_attention_bf16_bnsd_full_mix.o +0 -0
  161. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/BSAttention/paged_attention_bf16_bsh_full_mix.o +0 -0
  162. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/BSAttention/paged_attention_fp16_bnsd_full_mix.o +0 -0
  163. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/BSAttention/paged_attention_fp16_bsh_full_mix.o +0 -0
  164. mindspore/lib/plugin/ascend/ms_kernels_internal/lccl/include/lcal.h +22 -0
  165. mindspore/lib/plugin/ascend/ms_kernels_internal/lccl/include/lcal_comm.h +70 -0
  166. mindspore/lib/plugin/ascend/ms_kernels_internal/lccl/include/lcal_types.h +103 -0
  167. mindspore/lib/plugin/ascend/ms_kernels_internal/lccl/include/lccl.h +47 -0
  168. mindspore/lib/plugin/ascend/ms_kernels_internal/lccl/include/lccl_wrapper.h +58 -0
  169. mindspore/lib/plugin/ascend/ms_kernels_internal/lccl/include/lcoc.h +154 -0
  170. mindspore/lib/plugin/ascend/ms_kernels_internal/lccl/lib/liblcal.so +0 -0
  171. mindspore/lib/plugin/ascend/ms_kernels_internal/lccl/lib/liblccl_wrapper.so +0 -0
  172. mindspore/lib/plugin/libmindspore_ascend.so.2 +0 -0
  173. mindspore/log.py +2 -2
  174. mindspore/mint/__init__.py +457 -0
  175. mindspore/mint/nn/__init__.py +430 -0
  176. mindspore/mint/nn/functional.py +424 -0
  177. mindspore/mint/optim/__init__.py +24 -0
  178. mindspore/mint/optim/adamw.py +186 -0
  179. mindspore/multiprocessing/__init__.py +4 -0
  180. mindspore/nn/__init__.py +3 -0
  181. mindspore/nn/cell.py +51 -47
  182. mindspore/nn/extend/__init__.py +29 -0
  183. mindspore/nn/extend/basic.py +140 -0
  184. mindspore/nn/extend/embedding.py +143 -0
  185. mindspore/nn/extend/layer/__init__.py +27 -0
  186. mindspore/nn/extend/layer/normalization.py +107 -0
  187. mindspore/nn/extend/pooling.py +117 -0
  188. mindspore/nn/generator.py +297 -0
  189. mindspore/nn/layer/basic.py +109 -1
  190. mindspore/nn/layer/container.py +2 -2
  191. mindspore/nn/layer/conv.py +6 -6
  192. mindspore/nn/layer/embedding.py +1 -1
  193. mindspore/nn/layer/normalization.py +21 -43
  194. mindspore/nn/layer/padding.py +4 -0
  195. mindspore/nn/optim/ada_grad.py +2 -2
  196. mindspore/nn/optim/adadelta.py +1 -1
  197. mindspore/nn/optim/adafactor.py +1 -1
  198. mindspore/nn/optim/adam.py +7 -7
  199. mindspore/nn/optim/adamax.py +2 -2
  200. mindspore/nn/optim/adasum.py +2 -2
  201. mindspore/nn/optim/asgd.py +2 -2
  202. mindspore/nn/optim/ftrl.py +1 -1
  203. mindspore/nn/optim/lamb.py +3 -3
  204. mindspore/nn/optim/lars.py +1 -1
  205. mindspore/nn/optim/lazyadam.py +2 -2
  206. mindspore/nn/optim/momentum.py +2 -2
  207. mindspore/nn/optim/optimizer.py +2 -2
  208. mindspore/nn/optim/proximal_ada_grad.py +2 -2
  209. mindspore/nn/optim/rmsprop.py +2 -2
  210. mindspore/nn/optim/rprop.py +2 -2
  211. mindspore/nn/optim/sgd.py +2 -2
  212. mindspore/nn/optim/thor.py +2 -2
  213. mindspore/nn/wrap/cell_wrapper.py +9 -9
  214. mindspore/nn/wrap/grad_reducer.py +5 -5
  215. mindspore/ops/_grad_experimental/grad_comm_ops.py +4 -2
  216. mindspore/ops/_vmap/vmap_grad_nn_ops.py +41 -2
  217. mindspore/ops/_vmap/vmap_math_ops.py +27 -8
  218. mindspore/ops/_vmap/vmap_nn_ops.py +66 -8
  219. mindspore/ops/auto_generate/cpp_create_prim_instance_helper.py +73 -1
  220. mindspore/ops/auto_generate/gen_arg_dtype_cast.py +12 -3
  221. mindspore/ops/auto_generate/gen_arg_handler.py +24 -0
  222. mindspore/ops/auto_generate/gen_extend_func.py +274 -0
  223. mindspore/ops/auto_generate/gen_ops_def.py +889 -22
  224. mindspore/ops/auto_generate/gen_ops_prim.py +3541 -253
  225. mindspore/ops/auto_generate/pyboost_inner_prim.py +282 -0
  226. mindspore/ops/composite/multitype_ops/_compile_utils.py +2 -1
  227. mindspore/ops/composite/multitype_ops/_constexpr_utils.py +9 -0
  228. mindspore/ops/extend/__init__.py +9 -1
  229. mindspore/ops/extend/array_func.py +134 -27
  230. mindspore/ops/extend/math_func.py +3 -3
  231. mindspore/ops/extend/nn_func.py +363 -2
  232. mindspore/ops/function/__init__.py +19 -2
  233. mindspore/ops/function/array_func.py +463 -439
  234. mindspore/ops/function/clip_func.py +7 -18
  235. mindspore/ops/function/grad/grad_func.py +5 -5
  236. mindspore/ops/function/linalg_func.py +4 -4
  237. mindspore/ops/function/math_func.py +260 -243
  238. mindspore/ops/function/nn_func.py +825 -62
  239. mindspore/ops/function/random_func.py +73 -4
  240. mindspore/ops/function/sparse_unary_func.py +1 -1
  241. mindspore/ops/function/vmap_func.py +1 -1
  242. mindspore/ops/functional.py +2 -2
  243. mindspore/ops/op_info_register.py +1 -31
  244. mindspore/ops/operations/__init__.py +2 -3
  245. mindspore/ops/operations/_grad_ops.py +2 -107
  246. mindspore/ops/operations/_inner_ops.py +5 -5
  247. mindspore/ops/operations/_sequence_ops.py +2 -2
  248. mindspore/ops/operations/array_ops.py +11 -233
  249. mindspore/ops/operations/comm_ops.py +32 -32
  250. mindspore/ops/operations/custom_ops.py +7 -89
  251. mindspore/ops/operations/manually_defined/ops_def.py +329 -4
  252. mindspore/ops/operations/math_ops.py +13 -163
  253. mindspore/ops/operations/nn_ops.py +9 -316
  254. mindspore/ops/operations/random_ops.py +1 -1
  255. mindspore/ops/operations/sparse_ops.py +3 -3
  256. mindspore/ops/primitive.py +2 -2
  257. mindspore/ops_generate/arg_dtype_cast.py +12 -3
  258. mindspore/ops_generate/arg_handler.py +24 -0
  259. mindspore/ops_generate/gen_ops_inner_prim.py +2 -0
  260. mindspore/ops_generate/gen_pyboost_func.py +13 -6
  261. mindspore/ops_generate/pyboost_utils.py +2 -17
  262. mindspore/parallel/__init__.py +3 -2
  263. mindspore/parallel/_auto_parallel_context.py +106 -1
  264. mindspore/parallel/_parallel_serialization.py +34 -2
  265. mindspore/parallel/_utils.py +16 -0
  266. mindspore/parallel/algo_parameter_config.py +4 -4
  267. mindspore/parallel/checkpoint_transform.py +249 -77
  268. mindspore/parallel/cluster/process_entity/_api.py +1 -1
  269. mindspore/parallel/parameter_broadcast.py +1 -1
  270. mindspore/parallel/shard.py +1 -1
  271. mindspore/profiler/parser/ascend_analysis/fwk_cann_parser.py +1 -0
  272. mindspore/profiler/parser/ascend_analysis/profiler_info_parser.py +17 -5
  273. mindspore/profiler/parser/ascend_msprof_exporter.py +3 -3
  274. mindspore/profiler/parser/ascend_msprof_generator.py +10 -3
  275. mindspore/profiler/parser/ascend_op_generator.py +26 -9
  276. mindspore/profiler/parser/ascend_timeline_generator.py +7 -4
  277. mindspore/profiler/parser/profiler_info.py +11 -1
  278. mindspore/profiler/profiling.py +13 -5
  279. mindspore/rewrite/api/node.py +12 -12
  280. mindspore/rewrite/api/symbol_tree.py +11 -11
  281. mindspore/run_check/_check_version.py +1 -1
  282. mindspore/safeguard/rewrite_obfuscation.py +2 -2
  283. mindspore/train/amp.py +4 -4
  284. mindspore/train/anf_ir_pb2.py +8 -2
  285. mindspore/train/callback/_backup_and_restore.py +2 -2
  286. mindspore/train/callback/_callback.py +4 -4
  287. mindspore/train/callback/_checkpoint.py +2 -2
  288. mindspore/train/callback/_early_stop.py +2 -2
  289. mindspore/train/callback/_landscape.py +4 -4
  290. mindspore/train/callback/_loss_monitor.py +2 -2
  291. mindspore/train/callback/_on_request_exit.py +2 -2
  292. mindspore/train/callback/_reduce_lr_on_plateau.py +2 -2
  293. mindspore/train/callback/_summary_collector.py +2 -2
  294. mindspore/train/callback/_time_monitor.py +2 -2
  295. mindspore/train/dataset_helper.py +8 -3
  296. mindspore/train/loss_scale_manager.py +2 -2
  297. mindspore/train/metrics/metric.py +3 -3
  298. mindspore/train/mind_ir_pb2.py +22 -17
  299. mindspore/train/model.py +15 -15
  300. mindspore/train/serialization.py +18 -18
  301. mindspore/train/summary/summary_record.py +7 -7
  302. mindspore/train/train_thor/convert_utils.py +3 -3
  303. mindspore/version.py +1 -1
  304. {mindspore-2.3.0rc1.dist-info → mindspore-2.3.0rc2.dist-info}/METADATA +1 -1
  305. {mindspore-2.3.0rc1.dist-info → mindspore-2.3.0rc2.dist-info}/RECORD +309 -262
  306. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/matmul_stridedslice/tiling_data.h +0 -59
  307. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/BSAttention/FlashAttentionScore_bf16_BNSD_mix.o +0 -0
  308. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/BSAttention/FlashAttentionScore_bf16_BSH_mix.o +0 -0
  309. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/BSAttention/FlashAttentionScore_fp16_BNSD_mix.o +0 -0
  310. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/BSAttention/FlashAttentionScore_fp16_BSH_mix.o +0 -0
  311. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/BSAttention/PagedAttention_bf16_BNSD_mix.o +0 -0
  312. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/BSAttention/PagedAttention_bf16_BSH_mix.o +0 -0
  313. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/BSAttention/PagedAttention_fp16_BNSD_mix.o +0 -0
  314. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/BSAttention/PagedAttention_fp16_BSH_mix.o +0 -0
  315. mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/{attention/bs_attention_mix_hwsync.h → flash_attention_score/kernel/flash_attention_score_mix_hwsync.h} +0 -0
  316. {mindspore-2.3.0rc1.dist-info → mindspore-2.3.0rc2.dist-info}/WHEEL +0 -0
  317. {mindspore-2.3.0rc1.dist-info → mindspore-2.3.0rc2.dist-info}/entry_points.txt +0 -0
  318. {mindspore-2.3.0rc1.dist-info → mindspore-2.3.0rc2.dist-info}/top_level.txt +0 -0
mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/matmul_common/tiling_utils.h
@@ -0,0 +1,65 @@
+ /**
+  * Copyright 2024 Huawei Technologies Co., Ltd
+  *
+  * Licensed under the Apache License, Version 2.0 (the "License");
+  * you may not use this file except in compliance with the License.
+  * You may obtain a copy of the License at
+  *
+  * http://www.apache.org/licenses/LICENSE-2.0
+  *
+  * Unless required by applicable law or agreed to in writing, software
+  * distributed under the License is distributed on an "AS IS" BASIS,
+  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  * See the License for the specific language governing permissions and
+  * limitations under the License.
+  */
+
+ #ifndef MATMUL_TILING_UTILS_H
+ #define MATMUL_TILING_UTILS_H
+
+ #include <stdint.h>
+ #include <sstream>
+ #include <cstdlib>
+ #include <vector>
+
+ namespace mindspore {
+ namespace internal {
+ namespace tiling {
+
+ static std::vector<int> getMatMulTilingFromEnv() {
+   std::vector<int> result;
+   auto env_name = "INTERNAL_MATMUL_TILING";
+   const char* envVarValue = std::getenv(env_name);
+
+   if (envVarValue != nullptr) {
+     std::string envVarString(envVarValue);
+     std::stringstream ss(envVarString);
+     std::string item;
+
+     while (std::getline(ss, item, ',')) {
+       result.push_back(std::stoi(item));
+     }
+   }
+
+   return result;
+ }
+
+
+ static bool getShuffleFlagFromEnv() {
+   auto env_name = "CUSTOM_MATMUL_SHUFFLE";
+   const char* envVarValue = std::getenv(env_name);
+   if (envVarValue != nullptr) {
+     std::string envVarString(envVarValue);
+     if (envVarString != "0" && envVarString != "off") {
+       return true;
+     }
+     return false;
+   }
+   return true;
+ }
+
+
+ } // namespace tiling
+ } // namespace internal
+ } // namespace mindspore
+ #endif // MATMUL_TILING_UTILS_H
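
For reference, a minimal sketch of how these two new hooks behave when exercised from a standalone program; the include path, environment values, and the meaning of the individual tiling integers are illustrative assumptions, not documented API:

```cpp
// Hypothetical driver for the new env hooks (POSIX setenv; Linux wheel target).
#include <cstdio>
#include <cstdlib>
#include "matmul_common/tiling_utils.h"  // assumed to be on the include path

int main() {
  // Comma-separated integers; which tiling fields they map to is not
  // documented in this diff, so the values below are placeholders.
  setenv("INTERNAL_MATMUL_TILING", "128,256,64", 1);
  for (int v : mindspore::internal::tiling::getMatMulTilingFromEnv()) {
    std::printf("%d\n", v);  // prints 128, 256, 64
  }

  // Shuffle defaults to enabled; only the strings "0" or "off" disable it.
  setenv("CUSTOM_MATMUL_SHUFFLE", "off", 1);
  std::printf("shuffle: %s\n",
              mindspore::internal::tiling::getShuffleFlagFromEnv() ? "on" : "off");
  return 0;
}
```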
mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/matmul_stridedslice/matmul_stridedslice_fusion_impl.h
@@ -24,11 +24,12 @@
  #include "asdops/tensor.h"

  #include "utils.h"
- // #include "pp_matmul_info.h"
  #include "backend_param.h"
- #include "tiling_data.h"
+ #include "matmul_common/pp_matmul_info.h"
+ #include "matmul_common/tiling_utils.h"
+ #include "matmul_common/tiling_data.h"
+ #include "matmul_common/pp_matmul_common_tiling.h"
  #include "param/matmul_qkv_param.h"
- // #include "pp_matmul_common_tiling.h"
  #include "tune_repo/utils.h"

  #include "internal_kernel.h"
@@ -39,6 +40,8 @@
  namespace mindspore {
  namespace internal {

+ using namespace tiling;
+
  class MatMulStridedSliceFusionImpl : public InternelKernelImpl {
  public:
   MatMulStridedSliceFusionImpl(const OpParamPtr &param) : InternelKernelImpl(param){};
@@ -48,7 +51,8 @@ class MatMulStridedSliceFusionImpl : public InternelKernelImpl {
   int Launch() override;
   size_t GetTilingBufSize() override;
   int Tiling(HostRawBuf &tilingBuf) override;
-  int TilingLLMCustom(HostRawBuf &tilingBuf);
+  void TilingBasicFromPp(uint32_t &blockDim, PpTilingData &tilingdata);
+  int TilingLLMCustom(HostRawBuf &tilingBuf, const uint32_t &blockDim, const PpTilingData &tilingdata, bool has_tuned);
   std::vector<uint64_t> GetWorkSpaceSize() override;
   int InferShape(const std::vector<DIMS> &input_shapes, std::vector<DIMS> &output_shapes) override;

@@ -66,8 +70,8 @@ class MatMulStridedSliceFusionImpl : public InternelKernelImpl {

   REPO tuningTable_;
   tiling::MatmulStridedSliceFusionTilingData t_;
-  void GetTunedKey(std::vector<int> &tune_key);
-  void GetTunedValue(const std::vector<int> &tuned_config);
+  std::vector<int> GetTunedKey();
+  void SetTunedValue(const std::vector<int> &tuned_config);
  };

  } // namespace internal
@@ -27,7 +27,6 @@
27
27
  #include "asdops/params/norm.h"
28
28
  #include "asdops/params/softmax.h"
29
29
  #include "asdops/params/split.h"
30
- #include "attention_param.h"
31
30
  #include "asdops/params/expand.h"
32
31
  #include "asdops/params/fill.h"
33
32
  #include "asdops/params/reduce.h"
@@ -99,6 +98,10 @@ struct AddLayerNormParam {
99
98
  };
100
99
 
101
100
  struct ApplyRotaryPosEmbParam {
101
+ // cosFormat=0 shape是[maxSeqLen, headDim], cos/sin不交替
102
+ // cosFormat=1 shape是[maxSeqLen, headDim], cos/sin交替
103
+ // cosFormat=2 shape是[batch*seqLen, headDim], cos/sin不交替
104
+ // cosFormat=3 shape是[batch*seqLen, headDim], cos/sin交替
102
105
  int32_t cosFormat{0};
103
106
  };
104
107
 
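The four cosFormat layouts above reduce to a simple shape rule; a small host-side sketch restating them (the helper name and signature are hypothetical, introduced only for illustration):

```cpp
#include <array>
#include <cstdint>

// Hypothetical helper restating the cosFormat comments: even values store
// cos/sin non-interleaved, odd values interleaved; 0/1 index the table by
// absolute position, 2/3 by the flattened (batch, seqLen) axis.
std::array<int64_t, 2> CosSinTableShape(int32_t cos_format, int64_t max_seq_len,
                                        int64_t batch, int64_t seq_len,
                                        int64_t head_dim) {
  if (cos_format == 0 || cos_format == 1) {
    return {max_seq_len, head_dim};
  }
  return {batch * seq_len, head_dim};
}
```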
mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/paged_attention/kernel/paged_attention_mix_hwsync.h
@@ -0,0 +1,41 @@
+ #ifndef BS_FLASHATTENTION_BS__ATTENTION_MIX_HWSYNC_H
+ #define BS_FLASHATTENTION_BS__ATTENTION_MIX_HWSYNC_H
+ constexpr float DROPOUT_PROP = 0.5;
+ constexpr uint32_t LOOP_LEN = 5;
+ constexpr uint32_t UB_HALF_BUF_SIZE = 8 * 2048;
+ constexpr uint32_t BIT_UINT8 = 8;
+ constexpr uint32_t BIT_BLOCK = 256;
+ constexpr uint32_t BLOCK_SIZE = 16;
+ constexpr uint32_t VECTOR_SIZE = 128;
+ constexpr uint32_t VECTOR_SIZE_FP32 = 64;
+ constexpr uint32_t CUBE_MATRIX_SIZE = 256;  // 16 * 16
+ constexpr uint64_t UB_UINT8_BLOCK_SIZE = 16384;  // 64 * 128 * 2B
+ constexpr uint64_t UB_UINT8_LINE_SIZE = 512;  // 64 * 4B; double-sized to guard against overruns
+ constexpr uint64_t UB_FLOAT_LINE_SIZE = 128;  // 64; double-sized to guard against overruns
+ constexpr uint64_t UB_HALF_LINE_SIZE = 256;  // UB_FLOAT_LINE_SIZE * 2
+
+ constexpr uint32_t L0AB_HALF_BUF_SIZE = 16384;  // 128 * 128
+ constexpr uint64_t L1_SIZE = 512 * 1024;  // 512KB
+ constexpr uint64_t L0AB_UINT8_BLOCK_SIZE = 32768;  // 128 * 128 * 2B
+ constexpr uint64_t L1_MAX_SHARE_NUM = (L1_SIZE - 8 * L0AB_UINT8_BLOCK_SIZE) / L0AB_UINT8_BLOCK_SIZE / 2;
+ constexpr uint64_t SUB_SP_SIZE = 2048 * 8;  // 1024*16, 2048*8, 4096*4, 8192*2, 16K*1: five tiling choices
+
+ enum class L1Mode{load,     // load data into the shared region of L1
+                   share,    // consume data already in the shared region
+                   noshare}; // neither load into nor use the shared region
+
+ inline uint64_t ceil(uint64_t y, uint64_t x) {
+   return (y + x - 1) / x;
+ }
+
+ inline uint64_t round(uint64_t y, uint64_t x) {
+   return ceil(y, x) * x;
+ }
+
+ #if BFLOAT16
+ #define CALC_DATA_TYPE bfloat16_t
+ #else
+ #define CALC_DATA_TYPE half
+ #endif
+
+ #endif  // BS_FLASHATTENTION_BS__ATTENTION_MIX_HWSYNC_H
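
The L1 budget arithmetic in this header works out to four shareable blocks; a compile-time check, with the ceil/round helpers re-declared locally so the sketch stands alone:

```cpp
#include <cstdint>

// (512 KB - 8 guard blocks) / 32 KB per block / 2 (double buffering) = 4.
constexpr uint64_t kL1Size = 512 * 1024;
constexpr uint64_t kL0abBlock = 32768;  // 128 * 128 * 2B
static_assert((kL1Size - 8 * kL0abBlock) / kL0abBlock / 2 == 4,
              "L1_MAX_SHARE_NUM evaluates to 4");

// Local copies of the header's ceil/round: round y up to a multiple of x.
constexpr uint64_t Ceil(uint64_t y, uint64_t x) { return (y + x - 1) / x; }
constexpr uint64_t Round(uint64_t y, uint64_t x) { return Ceil(y, x) * x; }
static_assert(Ceil(100, 16) == 7 && Round(100, 16) == 112,
              "100 elements occupy 7 blocks of 16, i.e. 112 padded elements");
```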
mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/paged_attention/paged_attention_impl.h
@@ -24,7 +24,7 @@
  #include "asdops/tensor.h"

  #include "internal_kernel.h"
-
+ #include "param/attention_param.h"
  #include "acl_rt.h"

  #include <unordered_map>
mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/paged_attention/paged_attention_tiling.h
@@ -0,0 +1,63 @@
+ #ifndef __BS_ATTENTION_TILING_H__
+ #define __BS_ATTENTION_TILING_H__
+
+ #pragma pack (8)
+ typedef struct {
+   uint64_t batch_size;
+   uint64_t num_heads;
+   uint64_t max_seqlen;
+   uint64_t head_dim;
+   uint64_t num_group;
+   uint64_t q_seqlen;
+   uint64_t kv_seqlen;
+   uint64_t table_block_size;
+   uint64_t sync_addr;
+   uint64_t core_num;
+   float tor;
+ } BSAttentionTilingData;
+ #pragma pack()
+
+ #define MAX_CORE_NUM 25
+ #define ATTENTION_DEBUG false  // when enabled, debug data is written into S/P
+ #define ROWMAX true
+ #define OP_NAME PagedAttention
+ #define BUFFER_NUM 4  // inter-core pipeline depth; changing it is not yet supported
+ constexpr uint64_t WORKSPACE_MAX_SEQLEN = 16384;  // max seqlen
+ constexpr uint64_t WORKSPACE_SIZE = 64 * WORKSPACE_MAX_SEQLEN;
+
+ #if BFLOAT16
+ #define TYPE_NAME _bf16
+ #else
+ #define TYPE_NAME _fp16
+ #endif
+
+ #if BSH
+ #define LAYOUT_NAME _BSH
+ #else
+ #define LAYOUT_NAME _BNSD
+ #endif
+
+ #define TRI_NAME _full
+
+ #define CONCAT_(A, B, C, D, E) A##B##C##D##E
+ #define CONCAT(A, B, C, D, E) CONCAT_(A, B, C, D, E)
+ #define FUNC_NAME_AIC CONCAT(OP_NAME, TYPE_NAME, LAYOUT_NAME, TRI_NAME, _mix_aic)
+ #define FUNC_NAME_AIV CONCAT(OP_NAME, TYPE_NAME, LAYOUT_NAME, TRI_NAME, _mix_aiv)
+
+ // ************** mask pattern modes **************//
+ // Mode 1: lower triangle. When LOWER_TRIANGLE is enabled, the lower-triangular pattern is applied directly, with no dependence on a mask input.
+ // #define LOWER_TRIANGLE false
+
+ // Mode 2: block sparse. With LOWER_TRIANGLE off, enabling BLOCK_SPARSE uses pre_token and next_token, with no dependence on a mask input (to be developed).
+ // #define BLOCK_SPARSE false
+
+ // Mode 3: explicit mask. With LOWER_TRIANGLE and BLOCK_SPARSE off, enabling AMASK takes the mask tensor as an input.
+ // #define AMASK true
+
+ // Mode 4: full matrix. If LOWER_TRIANGLE, BLOCK_SPARSE, and AMASK are all off, the attention runs over the full matrix and no elements of S are suppressed.
+ // *******************************************//
+
+ constexpr uint64_t WORKSPACE_MAX_SEQLEN_BLOCK = WORKSPACE_MAX_SEQLEN / 16;
+ constexpr uint64_t BUFFER_SIZE = MAX_CORE_NUM * WORKSPACE_SIZE * sizeof(uint16_t);
+
+ #endif
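
The CONCAT machinery stitches the kernel entry symbol out of the op name, dtype, layout, and mask suffix; a standalone demonstration (macros copied from the header, stringification added here only for printing):

```cpp
#include <cstdio>

#define OP_NAME PagedAttention
#define TYPE_NAME _bf16    // as if BFLOAT16 were defined
#define LAYOUT_NAME _BSH   // as if BSH were defined
#define TRI_NAME _full
#define CONCAT_(A, B, C, D, E) A##B##C##D##E
#define CONCAT(A, B, C, D, E) CONCAT_(A, B, C, D, E)
#define FUNC_NAME_AIC CONCAT(OP_NAME, TYPE_NAME, LAYOUT_NAME, TRI_NAME, _mix_aic)
#define STRINGIFY_(x) #x
#define STRINGIFY(x) STRINGIFY_(x)

int main() {
  // Prints PagedAttention_bf16_BSH_full_mix_aic, matching the shipped
  // paged_attention_bf16_bsh_full_mix.o kernel object up to letter case.
  std::puts(STRINGIFY(FUNC_NAME_AIC));
  return 0;
}
```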
mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/param/add_param.h
@@ -34,11 +34,11 @@ struct AddParam : public OpParam {
   DIMS input1_dims_;
   DIMS input2_dims_;
   bool canSupport() {
-    if (ADD_SUPPORT_DTYPE.find(input1_dtype_) == ADD_SUPPORT_DTYPE.end()) {
+    if (ADD_SUPPORT_DTYPE.find(input1_dtype_) == ADD_SUPPORT_DTYPE.end() || input1_dims_ != input2_dims_) {
      return false;
    }
    if (input1_dims_ == input2_dims_) {
-     return true;
+     return false;
    }
    if (std::abs(int(input1_dims_.size()) - int(input2_dims_.size())) > 1) {
      return false;
mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/param/attention_param.h
@@ -16,12 +16,21 @@
  #ifndef ATTENTION_PARAMS_H
  #define ATTENTION_PARAMS_H

+ #include "types.h"
+ #include "op_param.h"
+
  namespace mindspore {
  namespace internal {
- struct FlashAttentionScoreParam {
+ struct FlashAttentionScoreParam : public OpParam {
+   int head_num = 0;
+   int inner_precise = 0;
+   int pre_tokens = 2147483647;
+   int next_tokens = 0;
+   int sparse_mode = 0;
  };

- struct PagedAttentionParam {
+ struct PagedAttentionParam : public OpParam {
+   int inner_precise = 0;
  };
  } // namespace internal
  } // namespace mindspore
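
For orientation, the new defaults amount to an unbounded backward window (pre_tokens = 2147483647, i.e. INT32_MAX) and no lookahead. A standalone copy of the struct for illustration; the field semantics are assumptions based on the usual pre/next-token window convention, not confirmed by this diff:

```cpp
#include <cassert>
#include <climits>

// Standalone copy of FlashAttentionScoreParam above, for illustration only.
struct FlashAttentionScoreParam {
  int head_num = 0;
  int inner_precise = 0;
  int pre_tokens = 2147483647;  // INT_MAX: assumed to mean "no backward limit"
  int next_tokens = 0;          // assumed to mean "no lookahead", i.e. causal
  int sparse_mode = 0;
};

int main() {
  FlashAttentionScoreParam p;
  assert(p.pre_tokens == INT_MAX);
  p.head_num = 32;  // hypothetical 32-head model
  return 0;
}
```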
mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/param/matmul_ext_param.h
@@ -0,0 +1,37 @@
+ /**
+  * Copyright 2024 Huawei Technologies Co., Ltd
+  *
+  * Licensed under the Apache License, Version 2.0 (the "License");
+  * you may not use this file except in compliance with the License.
+  * You may obtain a copy of the License at
+  *
+  * http://www.apache.org/licenses/LICENSE-2.0
+  *
+  * Unless required by applicable law or agreed to in writing, software
+  * distributed under the License is distributed on an "AS IS" BASIS,
+  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  * See the License for the specific language governing permissions and
+  * limitations under the License.
+  */
+ #ifndef MATMUL_EXT_PARAMS_H_
+ #define MATMUL_EXT_PARAMS_H_
+
+ #include "types.h"
+ #include "op_param.h"
+
+ namespace mindspore {
+ namespace internal {
+
+ struct MatMulExtParam : public OpParam {
+   int input_dtype = -1;
+   int weight_dtype = -1;
+   int output_dtype = -1;
+   bool with_relu = false;
+   bool with_gelu = false;
+   bool with_bias = false;
+   bool with_bias_fastgelu = false;
+ };
+
+ } // namespace internal
+ } // namespace mindspore
+ #endif
mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/param/sub_param.h
@@ -0,0 +1,45 @@
+ /**
+  * Copyright 2024 Huawei Technologies Co., Ltd
+  *
+  * Licensed under the Apache License, Version 2.0 (the "License");
+  * you may not use this file except in compliance with the License.
+  * You may obtain a copy of the License at
+  *
+  * http://www.apache.org/licenses/LICENSE-2.0
+  *
+  * Unless required by applicable law or agreed to in writing, software
+  * distributed under the License is distributed on an "AS IS" BASIS,
+  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  * See the License for the specific language governing permissions and
+  * limitations under the License.
+  */
+ #ifndef SUB_PARAMS_H_
+ #define SUB_PARAMS_H_
+
+ #include "types.h"
+ #include "op_param.h"
+ #include <set>
+
+ namespace mindspore {
+ namespace internal {
+ struct SubParam : public OpParam {
+   TensorDType input1_dtype_;
+   TensorDType input2_dtype_;
+   DIMS input1_dims_;
+   DIMS input2_dims_;
+   bool canSupport() {
+     if (input2_dtype_ != AsdOps::TensorDType::TENSOR_DTYPE_INT32) {
+       return false;
+     }
+     if (input2_dims_.size() == 0 || (input2_dims_.size() == 1 && input2_dims_[0] == 1)) {
+       return true;
+     }
+     if (input1_dims_.size() == 0 || (input1_dims_.size() == 1 && input1_dims_[0] == 1)) {
+       return true;
+     }
+     return false;
+   }
+ };
+ } // namespace internal
+ } // namespace mindspore
+ #endif
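
SubParam::canSupport gates this internal Sub kernel to tensor-minus-scalar cases with an int32 second input; here is the predicate restated as a testable free function (Dims and the dtype check are stubbed with plain types; this mirrors the logic, it is not the real API):

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

using Dims = std::vector<int64_t>;

// Mirrors SubParam::canSupport above: int32 second input, and at least one
// operand must be scalar-like (rank 0, or rank 1 with a single element).
bool SubCanSupport(bool input2_is_int32, const Dims &d1, const Dims &d2) {
  if (!input2_is_int32) return false;
  auto is_scalar = [](const Dims &d) {
    return d.empty() || (d.size() == 1 && d[0] == 1);
  };
  return is_scalar(d2) || is_scalar(d1);
}

int main() {
  assert(SubCanSupport(true, {4, 16}, {1}));       // tensor - scalar: supported
  assert(SubCanSupport(true, {}, {4, 16}));        // scalar - tensor: supported
  assert(!SubCanSupport(true, {4, 16}, {4, 16}));  // elementwise: rejected
  assert(!SubCanSupport(false, {4, 16}, {1}));     // non-int32 input2: rejected
  return 0;
}
```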
mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/reshape_and_cache/reshape_and_cache_tiling.h
@@ -19,8 +19,7 @@

  struct ReshapeAndCacheTilingData {
   int32_t num_tokens;
-  int32_t num_heads;
-  int32_t head_size;
+  int32_t hidden_size;
  };

  #endif
mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/rms_norm/kernel/rms_norm.h
@@ -0,0 +1,23 @@
+ /**
+  * Copyright 2024 Huawei Technologies Co., Ltd
+  *
+  * Licensed under the Apache License, Version 2.0 (the "License");
+  * you may not use this file except in compliance with the License.
+  * You may obtain a copy of the License at
+  *
+  * http://www.apache.org/licenses/LICENSE-2.0
+  *
+  * Unless required by applicable law or agreed to in writing, software
+  * distributed under the License is distributed on an "AS IS" BASIS,
+  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  * See the License for the specific language governing permissions and
+  * limitations under the License.
+  */
+
+ #ifndef MS_KERNELS_INTERNAL_KERNEL_ASCENDC_RMS_NORM_H_
+ #define MS_KERNELS_INTERNAL_KERNEL_ASCENDC_RMS_NORM_H_
+
+ void rms_norm_do(uint32_t blockDim, void *l2ctrl, void *stream, uint8_t *x, uint8_t *gamma, uint8_t *y, uint8_t *rstd,
+                  uint8_t *workspace, uint8_t *tiling);
+
+ #endif
mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/rms_norm/kernel/rms_norm_base.h
@@ -0,0 +1,175 @@
+ /**
+  * Copyright 2024 Huawei Technologies Co., Ltd
+  *
+  * Licensed under the Apache License, Version 2.0 (the "License");
+  * you may not use this file except in compliance with the License.
+  * You may obtain a copy of the License at
+  *
+  * http://www.apache.org/licenses/LICENSE-2.0
+  *
+  * Unless required by applicable law or agreed to in writing, software
+  * distributed under the License is distributed on an "AS IS" BASIS,
+  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  * See the License for the specific language governing permissions and
+  * limitations under the License.
+  */
+
+ /*!
+  * \file rms_norm_base.h
+  * \brief
+  */
+ #ifndef _RMS_NORM_BASE_H_
+ #define _RMS_NORM_BASE_H_
+ #include "kernel_operator.h"
+
+ using namespace AscendC;
+
+ #if __CCE_AICORE__ != 220
+ #define bfloat16_t int16_t
+ #endif
+ constexpr int32_t BUFFER_NUM = 1;  // tensor num for each queue
+ constexpr int32_t NUM_PER_REP_FP32 = 64;  // ONE_REPEAT_BYTE_SIZE / sizeof(float);
+ constexpr int32_t NUM_PER_BLK_FP32 = 8;
+ constexpr float MINUS_HALF = -0.5;
+ constexpr float ZERO = 0;
+ constexpr float ONE = 1;
+
+ template <typename T>
+ __aicore__ inline T CeilDiv(T x, T y) {
+   return y == 0 ? x : (x + y - 1) / y;
+ }
+
+ template <typename Tp, Tp v>
+ struct integral_constant {
+   static constexpr Tp value = v;
+ };
+ using true_type = integral_constant<bool, true>;
+ using false_type = integral_constant<bool, false>;
+ template <typename, typename>
+ struct is_same : public false_type {};
+ template <typename Tp>
+ struct is_same<Tp, Tp> : public true_type {};
+
+ __aicore__ inline void ReduceSumFP32(const LocalTensor<float> &dst_local, const LocalTensor<float> &src_local,
+                                      const LocalTensor<float> &work_local, int32_t count) {
+   // count need smaller than 255 repeat
+   if (g_coreType == AIV) {
+     uint64_t mask = NUM_PER_REP_FP32;
+     int32_t repeatTimes = count / NUM_PER_REP_FP32;
+     int32_t tailCount = count % NUM_PER_REP_FP32;
+     int32_t bodyCount = repeatTimes * NUM_PER_REP_FP32;
+     BinaryRepeatParams repeatParams;
+     repeatParams.src0RepStride = ONE_REPEAT_BYTE_SIZE / ONE_BLK_SIZE;
+     repeatParams.src0BlkStride = 1;
+     repeatParams.src1RepStride = 0;
+     repeatParams.src1BlkStride = 1;
+     repeatParams.dstRepStride = 0;
+     repeatParams.dstBlkStride = 1;
+     Duplicate(work_local, ZERO, NUM_PER_REP_FP32);
+     pipe_barrier(PIPE_V);
+     if (likely(repeatTimes > 0)) {
+       Add(work_local, src_local, work_local, mask, repeatTimes, repeatParams);
+       pipe_barrier(PIPE_V);
+     }
+     if (unlikely(tailCount != 0)) {
+       Add(work_local, src_local[bodyCount], work_local, tailCount, 1, repeatParams);
+       pipe_barrier(PIPE_V);
+     }
+     AscendCUtils::SetMask<float>(NUM_PER_REP_FP32);
+     vcadd((__ubuf__ float *)dst_local.GetPhyAddr(), (__ubuf__ float *)work_local.GetPhyAddr(), 1, 0, 1, 0, false);
+     pipe_barrier(PIPE_V);
+   }
+ }
+
+ __aicore__ inline void ReduceSumCustom(const LocalTensor<float> &dst_local, const LocalTensor<float> &src_local,
+                                        const LocalTensor<float> &work_local, int32_t count) {
+ #if __CCE_AICORE__ == 220
+   ReduceSumFP32(dst_local, src_local, work_local, count);
+ #else
+   ReduceSum(dst_local, src_local, dst_local, count);
+ #endif
+ }
+
+ __aicore__ inline void ReduceSumFP32ToBlock(const LocalTensor<float> &dst_local, const LocalTensor<float> &src_local,
+                                             const LocalTensor<float> &work_local, int32_t count) {
+   // count need smaller than 255 repeat
+   uint64_t mask = NUM_PER_REP_FP32;
+   int32_t repeatTimes = count / NUM_PER_REP_FP32;
+   int32_t tailCount = count % NUM_PER_REP_FP32;
+   int32_t bodyCount = repeatTimes * NUM_PER_REP_FP32;
+   BinaryRepeatParams repeatParams;
+   repeatParams.src0RepStride = ONE_REPEAT_BYTE_SIZE / ONE_BLK_SIZE;
+   repeatParams.src0BlkStride = 1;
+   repeatParams.src1RepStride = 0;
+   repeatParams.src1BlkStride = 1;
+   repeatParams.dstRepStride = 0;
+   repeatParams.dstBlkStride = 1;
+   Duplicate(work_local, ZERO, NUM_PER_REP_FP32);
+   pipe_barrier(PIPE_V);
+   if (likely(repeatTimes > 0)) {
+     Add(work_local, src_local, work_local, mask, repeatTimes, repeatParams);
+     pipe_barrier(PIPE_V);
+   }
+   if (unlikely(tailCount != 0)) {
+     Add(work_local, src_local[bodyCount], work_local, tailCount, 1, repeatParams);
+     pipe_barrier(PIPE_V);
+   }
+   BlockReduceSum(dst_local, work_local, 1, mask, 1, 1, DEFAULT_REPEAT_STRIDE);
+   pipe_barrier(PIPE_V);
+ }
+
+ __aicore__ inline void BlockReduceSumFP32(const LocalTensor<float> &dst_local, const LocalTensor<float> &src_local,
+                                           int32_t count) {
+   // count need multiple of 8
+   int32_t repeatTimes = count / NUM_PER_REP_FP32;
+   int32_t tailCount = count % NUM_PER_REP_FP32;
+   int32_t dstAddr = repeatTimes * 8;
+   int32_t srcAddr = repeatTimes * NUM_PER_REP_FP32;
+   if (likely(repeatTimes > 0)) {
+     BlockReduceSum(dst_local, src_local, repeatTimes, NUM_PER_REP_FP32, 1, 1, DEFAULT_REPEAT_STRIDE);
+     pipe_barrier(PIPE_V);
+   }
+   if (tailCount != 0) {
+     BlockReduceSum(dst_local[dstAddr], src_local[srcAddr], 1, tailCount, 1, 1, DEFAULT_REPEAT_STRIDE);
+     pipe_barrier(PIPE_V);
+   }
+ }
+
+ template <typename T, typename U, typename R>
+ __aicore__ inline void DataCopyCustom(const U &dstTensor, const R &srcTensor, const uint32_t count) {
+ #if __CCE_AICORE__ == 220
+   DataCopyParams copyParams;
+   copyParams.blockLen = count * sizeof(T);
+   copyParams.blockCount = 1;
+   if constexpr (is_same<U, AscendC::LocalTensor<T>>::value) {
+     DataCopyPadParams padParams;
+     DataCopyPad(dstTensor, srcTensor, copyParams, padParams);
+   } else {
+     DataCopyPad(dstTensor, srcTensor, copyParams);
+   }
+ #else
+   // only support count greater than 32byte
+   int32_t numPerBlock = ONE_BLK_SIZE / sizeof(T);
+   if (count % numPerBlock == 0) {
+     DataCopy(dstTensor, srcTensor, count);
+   } else {
+     if constexpr (is_same<U, AscendC::LocalTensor<T>>::value) {
+       int32_t num = AlignUp(count, numPerBlock);
+       DataCopy(dstTensor, srcTensor, num);
+     } else {
+       int32_t num = count / numPerBlock * numPerBlock;
+       DataCopy(dstTensor, srcTensor, num);
+       set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0);
+       wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0);
+       for (int32_t i = 0; i < numPerBlock; i++) {
+         T tensorValue = srcTensor.GetValue(count - numPerBlock + i);
+         srcTensor.SetValue(i, tensorValue);
+       }
+       set_flag(PIPE_S, PIPE_MTE3, EVENT_ID0);
+       wait_flag(PIPE_S, PIPE_MTE3, EVENT_ID0);
+       DataCopy(dstTensor[count - numPerBlock], srcTensor, numPerBlock);
+     }
+   }
+ #endif
+ }
+ #endif  // RMS_NORM_BASE_H_
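
The reductions above all follow the same body/tail decomposition: full 64-lane fp32 repeats plus one masked tail repeat. A plain host-side C++ stand-in of that index arithmetic (not AscendC; the element count is illustrative):

```cpp
#include <cassert>
#include <cstdint>

int main() {
  constexpr int32_t kNumPerRepFp32 = 64;  // matches NUM_PER_REP_FP32
  const int32_t count = 200;              // hypothetical element count
  const int32_t repeatTimes = count / kNumPerRepFp32;      // 3 full repeats
  const int32_t tailCount = count % kNumPerRepFp32;        // 8 tail elements
  const int32_t bodyCount = repeatTimes * kNumPerRepFp32;  // 192 body elements
  // The tail Add starts at src_local[bodyCount] with repeat count 1, exactly
  // as in ReduceSumFP32 and ReduceSumFP32ToBlock above.
  assert(repeatTimes == 3 && tailCount == 8 && bodyCount == 192);
  return 0;
}
```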