npm - @novastera-oss/llamarn - Versions diffs - 0.4.0 → 0.4.3-beta4 - Mend

@novastera-oss/llamarn 0.4.0 → 0.4.3-beta4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (979) hide show

package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h CHANGED Viewed

@@ -34,6 +34,7 @@ void ggml_compute_forward_add1(const struct ggml_compute_params * params, struct
 void ggml_compute_forward_acc(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_sum(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_sum_rows(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_cumsum(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_mean(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_argmax(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_count_equal(const struct ggml_compute_params * params, struct ggml_tensor * dst);
@@ -51,10 +52,6 @@ void ggml_compute_forward_scale(const struct ggml_compute_params * params, struc
 void ggml_compute_forward_set(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_cpy(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_cont(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_reshape(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_view(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_permute(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_transpose(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_get_rows(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_get_rows_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_set_rows(const struct ggml_compute_params * params, struct ggml_tensor * dst);
@@ -69,6 +66,7 @@ void ggml_compute_forward_clamp(const struct ggml_compute_params * params, struc
 void ggml_compute_forward_conv_transpose_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_im2col(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_im2col_back_f32(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_im2col_3d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_conv_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_conv_3d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_conv_transpose_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
@@ -84,6 +82,8 @@ void ggml_compute_forward_arange(const struct ggml_compute_params * params, stru
 void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_leaky_relu(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_tri(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_fill(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_flash_attn_ext(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_flash_attn_back(
         const struct ggml_compute_params * params,
@@ -99,6 +99,7 @@ void ggml_compute_forward_get_rel_pos(const struct ggml_compute_params * params,
 void ggml_compute_forward_add_rel_pos(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_rwkv_wkv6(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_rwkv_wkv7(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_solve_tri(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_gla(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_map_custom1(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_map_custom2(const struct ggml_compute_params * params, struct ggml_tensor * dst);

package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp CHANGED Viewed

@@ -1600,6 +1600,55 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
         return false;
     }
+    void forward_mul_mat_one_chunk(ggml_compute_params * params,
+                                   ggml_tensor *         op,
+                                   int64_t               src0_start,
+                                   int64_t               src0_end,
+                                   int64_t               src1_start,
+                                   int64_t               src1_end) {
+        const ggml_tensor * src0 = op->src[0];
+        const ggml_tensor * src1 = op->src[1];
+        ggml_tensor *       dst  = op;
+        GGML_TENSOR_BINARY_OP_LOCALS
+        const size_t src1_col_stride = ggml_row_size(PARAM_TYPE, ne10);
+        GGML_ASSERT(ne03 == 1 && ne13 == 1);
+        GGML_ASSERT(ne12 % ne02 == 0);
+        const int64_t r2 = ne12 / ne02;
+        const int64_t i12 = src1_start / ne1;
+        const int64_t i11 = src1_start - i12 * ne1;
+        // Determine batch index
+        const int64_t i02 = i12 / r2;
+        const int64_t i1 = i11;
+        const int64_t i2 = i12;
+        const char * src0_ptr = (const char *) src0->data + i02 * nb02;
+        const char * src1_ptr = (const char *) params->wdata + (i11 + i12 * ne11) * src1_col_stride;
+        char *       dst_ptr  = ((char *) dst->data + (i1 * nb1 + i2 * nb2));
+        const int64_t nrows = src1_end - src1_start;
+        const int64_t ncols = src0_end - src0_start;
+        GGML_ASSERT(src1_ptr + src1_col_stride * nrows <= (const char *) params->wdata + params->wsize);
+        // If there are more than three rows in src1, use gemm; otherwise, use gemv.
+        if (nrows > 3) {
+            gemm<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00, (float *) (dst_ptr) + src0_start, nb1 / nb0,
+                                                             src0_ptr + src0_start * nb01, src1_ptr,
+                                                             nrows - (nrows % 4), ncols);
+        }
+        for (int iter = nrows - (nrows % 4); iter < nrows; iter++) {
+            gemv<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00, (float *) (dst_ptr + (iter * nb1)) + src0_start,
+                                                             ne01, src0_ptr + src0_start * nb01,
+                                                             src1_ptr + (src1_col_stride * iter), 1 /* nrows */, ncols);
+        }
+    }
     void forward_mul_mat(ggml_compute_params * params, ggml_tensor * op) {
         const ggml_tensor * src0 = op->src[0];
         const ggml_tensor * src1 = op->src[1];
@@ -1621,6 +1670,12 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
         GGML_ASSERT(nb1 <= nb2);
         GGML_ASSERT(nb2 <= nb3);
+        // TODO: General batched mul mat for 4D tensors
+        // Currently only supports 3D tensors
+        GGML_ASSERT(ne03 == 1);
+        GGML_ASSERT(ne13 == 1);
+        GGML_ASSERT(ne3 == 1);
         GGML_ASSERT(src1->type == GGML_TYPE_F32);
         GGML_ASSERT(ggml_n_dims(op->src[0]) == 2);
@@ -1628,46 +1683,101 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
         char *       wdata = static_cast<char *>(params->wdata);
         const size_t nbw1  = ggml_row_size(PARAM_TYPE, ne10);
+        const size_t nbw2  = nbw1 * ne11;
-        assert(params->wsize >= nbw1 * ne11);
+        assert(params->wsize >= nbw2 * ne12);
         const ggml_from_float_t from_float = ggml_get_type_traits_cpu(PARAM_TYPE)->from_float;
-        int64_t i11_processed = 0;
-        for (int64_t i11 = ith * 4; i11 < ne11 - ne11 % 4; i11 += nth * 4) {
-            ggml_quantize_mat_t<INTER_SIZE, PARAM_TYPE>((float *) ((char *) src1->data + i11 * nb11), (void *) (wdata + i11 * nbw1), 4, ne10);
-        }
+        // INFO: Quantization is done in planes to avoid extra complexity in chunking.
+        // Flattening dimensions not multiple of INTER_SIZE would require extra handling depending on how
+        // the planes are broadcast.
+        for (int64_t i12 = 0; i12 < ne12; i12++) {
+            char * data_ptr  = (char *) src1->data + i12 * nb12;
+            char * wdata_ptr = wdata + i12 * nbw2;
+            for (int64_t i11 = ith * 4; i11 < ne11 - ne11 % 4; i11 += nth * 4) {
+                ggml_quantize_mat_t<INTER_SIZE, PARAM_TYPE>((float *) (data_ptr + i11 * nb11),
+                                                            (void *) (wdata_ptr + i11 * nbw1), 4, ne10);
+            }
-        i11_processed = ne11 - ne11 % 4;
-        for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) {
-            from_float((float *) ((char *) src1->data + i11 * nb11), (void *) (wdata + i11 * nbw1), ne10);
+            const int64_t i11_processed = ne11 - ne11 % 4;
+            for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) {
+                from_float((float *) (data_ptr + i11 * nb11), (void *) (wdata_ptr + i11 * nbw1), ne10);
+            }
         }
-        ggml_barrier(params->threadpool);
+        // disable for NUMA
+        const bool disable_chunking = ggml_is_numa();
-        const void * src1_wdata      = params->wdata;
-        const size_t src1_col_stride = ggml_row_size(PARAM_TYPE, ne10);
-        int64_t      src0_start      = (ith * ne01) / nth;
-        int64_t      src0_end        = ((ith + 1) * ne01) / nth;
-        src0_start = (src0_start % NB_COLS) ? src0_start + NB_COLS - (src0_start % NB_COLS) : src0_start;
-        src0_end   = (src0_end   % NB_COLS) ? src0_end   + NB_COLS - (src0_end   % NB_COLS) : src0_end;
-        if (src0_start >= src0_end) {
-            return;
+        // 4x chunks per thread
+        const int64_t nr0 = ggml_nrows(op->src[0]);
+        int     nth_scaled  = nth * 4;
+        int64_t chunk_size0 = (nr0 + nth_scaled - 1) / nth_scaled;
+        int64_t nchunk0     = (nr0 + chunk_size0 - 1) / chunk_size0;
+        // src1 is chunked only by full planes.
+        // When we flatten we need to address dimensions not multiple of the q8 INTER_SIZE
+        // to route them thorugh GEMV.
+        // nchunk1 = ne12 also avoids messing the chunking for models with no 3d tensors
+        // to avoid affecting their performance
+        int64_t nchunk1 = ne12;
+        // Ensure minimum chunk size to avoid alignment issues with high thread counts
+        // Minimum chunk size should be at least NB_COLS to prevent overlapping chunks after alignment
+        const int64_t min_chunk_size = NB_COLS;
+        if (nchunk0 > 0 && (nr0 / nchunk0) < min_chunk_size && nr0 >= min_chunk_size) {
+            nchunk0 = (nr0 + min_chunk_size - 1) / min_chunk_size;
         }
-        // If there are more than three rows in src1, use gemm; otherwise, use gemv.
-        if (ne11 > 3) {
-            gemm<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00,
-                    (float *) ((char *) dst->data) + src0_start, ne01,
-                    (const char *) src0->data + src0_start * nb01,
-                    (const char *) src1_wdata, ne11 - ne11 % 4, src0_end - src0_start);
+        if (nth == 1 || nchunk0 < nth || disable_chunking) {
+            nchunk0 = nth;
         }
-        for (int iter = ne11 - ne11 % 4; iter < ne11; iter++) {
-            gemv<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00,
-                    (float *) ((char *) dst->data + (iter * nb1)) + src0_start, ne01,
-                    (const char *) src0->data + src0_start * nb01,
-                    (const char *) src1_wdata + (src1_col_stride * iter), 1,
-                    src0_end - src0_start);
+        const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0;
+        // Ensure nchunk doesn't exceed the number of rows divided by minimum chunk size
+        // This prevents creating too many tiny chunks that could overlap after alignment
+        const int64_t max_nchunk = (nr0 + min_chunk_size - 1) / min_chunk_size;
+        nchunk0                  = MIN(nchunk0, max_nchunk);
+        if (ith == 0) {
+            // Every thread starts at ith, so the first unprocessed chunk is nth.  This save a bit of coordination right at the start.
+            ggml_threadpool_chunk_set(params->threadpool, nth);
+        }
+        ggml_barrier(params->threadpool);
+        // The first chunk comes from our thread_id, the rest will get auto-assigned.
+        int current_chunk = ith;
+        while (current_chunk < nchunk0 * nchunk1) {
+            const int64_t ith0 = current_chunk % nchunk0;
+            const int64_t ith1 = current_chunk / nchunk0;
+            int64_t src0_start = dr0 * ith0;
+            int64_t src0_end   = MIN(src0_start + dr0, nr0);
+            // full-plane range for src1
+            int64_t src1_start = ith1 * ne11;
+            int64_t src1_end = (ith1 + 1) * ne11;
+            // Align boundaries to NB_COLS - round up to ensure all data is included
+            // The chunk size limiting above ensures chunks are large enough to prevent overlaps
+            src0_start = (src0_start % NB_COLS) ? src0_start + NB_COLS - (src0_start % NB_COLS) : src0_start;
+            src0_end   = (src0_end % NB_COLS) ? src0_end + NB_COLS - (src0_end % NB_COLS) : src0_end;
+            src0_end   = MIN(src0_end, ne01);
+            // Make sure current plane is the last one before exiting
+            if (src0_start >= src0_end) {
+                current_chunk = ggml_threadpool_chunk_add(params->threadpool, 1);
+                continue;
+            }
+            forward_mul_mat_one_chunk(params, dst, src0_start, src0_end, src1_start, src1_end);
+            current_chunk = ggml_threadpool_chunk_add(params->threadpool, 1);
         }
     }
@@ -1772,8 +1882,12 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
             int64_t src0_cur_start = (ith * ne01) / nth;
             int64_t src0_cur_end   = ((ith + 1) * ne01) / nth;
+            // Align boundaries to NB_COLS - round up to ensure all data is included
             src0_cur_start = (src0_cur_start % NB_COLS) ? src0_cur_start + NB_COLS - (src0_cur_start % NB_COLS) : src0_cur_start;
             src0_cur_end   = (src0_cur_end   % NB_COLS) ? src0_cur_end   + NB_COLS - (src0_cur_end   % NB_COLS) : src0_cur_end;
+            if (src0_cur_end > ne01) {
+                src0_cur_end = ne01;
+            }
             if (src0_cur_start >= src0_cur_end) {
                 return;

package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h CHANGED Viewed

@@ -114,26 +114,6 @@ extern "C" {
     #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) riscv_compute_fp32_to_fp16(x)
     #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
     #define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x)
-#elif defined(__NNPA__)
-    #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) nnpa_compute_fp16_to_fp32(x)
-    #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) nnpa_compute_fp32_to_fp16(x)
-    #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
-    #define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x)
-    static inline float nnpa_compute_fp16_to_fp32(ggml_fp16_t h) {
-        uint16x8_t v_h = vec_splats(h);
-        uint16x8_t v_hd = vec_convert_from_fp16(v_h, 0);
-        return vec_extend_to_fp32_hi(v_hd, 0)[0];
-    }
-    static inline ggml_fp16_t nnpa_compute_fp32_to_fp16(float f) {
-        float32x4_t v_f = vec_splats(f);
-        float32x4_t v_zero = vec_splats(0.0f);
-        uint16x8_t v_hd = vec_round_from_fp32(v_f, v_zero, 0);
-        uint16x8_t v_h = vec_convert_to_fp16(v_hd, 0);
-        return vec_extract(v_h, 0);
-    }
 #endif
 // precomputed f32 table for f16 (256 KB)
@@ -180,18 +160,18 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
 #define GGML_F32xt                        svfloat32_t
 #define GGML_F32xt_ZERO                   svdup_n_f32(0.0f)
 #define GGML_F32xt_SET1(x)                svdup_n_f32(x)
-#define GGML_F32xt_LOAD_IMPL(pg, a, ...)  svld1_f32(pg, a)
-#define GGML_F32xt_LOAD(...)              GGML_F32xt_LOAD_IMPL(DEFAULT_PG, __VA_ARGS__)
-#define GGML_F32xt_STORE_IMPL(pg,a,b)     svst1_f32(pg, a, b)
-#define GGML_F32xt_STORE(...)             GGML_F32xt_STORE_IMPL(DEFAULT_PG, __VA_ARGS__)
+#define GGML_F32xt_LOAD_IMPL(pg, a)       svld1_f32(pg, a)
+#define GGML_F32xt_LOAD(a)                GGML_F32xt_LOAD_IMPL(DEFAULT_PG, a)
+#define GGML_F32xt_STORE_IMPL(pg, a, b)   svst1_f32(pg, a, b)
+#define GGML_F32xt_STORE(a, b)            GGML_F32xt_STORE_IMPL(DEFAULT_PG, a, b)
 #define GGML_F32xt_FMA_IMPL(pg, a, b, c)  svmad_f32_m(pg, b, c, a)
-#define GGML_F32xt_FMA(...)               GGML_F32xt_FMA_IMPL(DEFAULT_PG, __VA_ARGS__)
+#define GGML_F32xt_FMA(a, b, c)           GGML_F32xt_FMA_IMPL(DEFAULT_PG, a, b, c)
 #define GGML_F32xt_ADD_IMPL(pg, a, b)     svadd_f32_m(pg, a, b)
-#define GGML_F32xt_ADD(...)               GGML_F32xt_ADD_IMPL(DEFAULT_PG, __VA_ARGS__)
+#define GGML_F32xt_ADD(a, b)              GGML_F32xt_ADD_IMPL(DEFAULT_PG, a, b)
 #define GGML_F32xt_MUL_IMPL(pg, a, b)     svmul_f32_m(pg, a, b)
-#define GGML_F32xt_MUL(...)               GGML_F32xt_MUL_IMPL(DEFAULT_PG, __VA_ARGS__)
+#define GGML_F32xt_MUL(a, b)              GGML_F32xt_MUL_IMPL(DEFAULT_PG, a, b)
 #define GGML_F32xt_REDUCE_ONE_IMPL(pg, a) svaddv(pg, a)
-#define GGML_F32xt_REDUCE_ONE(...)        GGML_F32xt_REDUCE_ONE_IMPL(DEFAULT_PG, __VA_ARGS__)
+#define GGML_F32xt_REDUCE_ONE(a)          GGML_F32xt_REDUCE_ONE_IMPL(DEFAULT_PG, a)
 #define GGML_F32xt_REDUCE_IMPL(pg, res, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8)  \
 {                                                      \
     sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum2);        \
@@ -203,7 +183,8 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
     sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum5);        \
     (res) = (ggml_float) GGML_F32xt_REDUCE_ONE(sum1);  \
 }
-#define GGML_F32xt_REDUCE(...) GGML_F32xt_REDUCE_IMPL(DEFAULT_PG, __VA_ARGS__)
+#define GGML_F32xt_REDUCE(res, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8)  \
+        GGML_F32xt_REDUCE_IMPL(DEFAULT_PG, res, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8)
 #define GGML_F32_VEC        GGML_F32xt
 #define GGML_F32_VEC_ZERO   GGML_F32xt_ZERO
@@ -215,6 +196,48 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
 #define GGML_F32_VEC_MUL    GGML_F32xt_MUL
 #define GGML_F32_VEC_REDUCE GGML_F32xt_REDUCE
+// F16 SVE
+#define DEFAULT_PG32    svptrue_b32()
+#define DEFAULT_PG16    svptrue_b16()
+#define GGML_F32Cxt                         svfloat16_t
+#define GGML_F32Cxt_ZERO                    svdup_n_f16(0.0f)
+#define GGML_F32Cxt_SET1(x)                 svdup_n_f16(x)
+#define GGML_F32Cxt_LOAD(p)                 svld1_f16(DEFAULT_PG16, (const __fp16 *)(p))
+#define GGML_F32Cxt_STORE(dst_ptr, src_vec) svst1_f16(DEFAULT_PG16, (__fp16 *)(dst_ptr), (src_vec))
+#define GGML_F32Cxt_FMA_IMPL(pg, a, b, c)   svmad_f16_x(pg, b, c, a)
+#define GGML_F32Cxt_FMA(a, b, c)            GGML_F32Cxt_FMA_IMPL(DEFAULT_PG16, a, b, c)
+#define GGML_F32Cxt_ADD_IMPL(pg, a, b)      svadd_f16_x(pg, a, b)
+#define GGML_F32Cxt_ADD(a, b)               GGML_F32Cxt_ADD_IMPL(DEFAULT_PG16, a, b)
+#define GGML_F32Cxt_MUL_IMPL(pg, a, b)      svmul_f16_x(pg, a, b)
+#define GGML_F32Cxt_MUL(a, b)               GGML_F32Cxt_MUL_IMPL(DEFAULT_PG16, a, b)
+#define GGML_F32Cxt_REDUCE                  GGML_F16xt_REDUCE_MIXED
+#define GGML_F16x_VEC                GGML_F32Cxt
+#define GGML_F16x_VEC_ZERO           GGML_F32Cxt_ZERO
+#define GGML_F16x_VEC_SET1           GGML_F32Cxt_SET1
+#define GGML_F16x_VEC_LOAD(p, i)     GGML_F32Cxt_LOAD(p)
+#define GGML_F16x_VEC_STORE(p, r, i) GGML_F32Cxt_STORE((__fp16 *)(p), r)
+#define GGML_F16x_VEC_FMA            GGML_F32Cxt_FMA
+#define GGML_F16x_VEC_ADD            GGML_F32Cxt_ADD
+#define GGML_F16x_VEC_MUL            GGML_F32Cxt_MUL
+#define GGML_F16x_VEC_REDUCE         GGML_F32Cxt_REDUCE
+#define GGML_F16xt_REDUCE_ONE_IMPL(pg, a) svaddv_f16(pg, a)
+#define GGML_F16xt_REDUCE_ONE(a)          GGML_F16xt_REDUCE_ONE_IMPL(DEFAULT_PG16, a)
+#define GGML_F16xt_REDUCE_MIXED_IMPL(pg16, res, sum1, sum2, sum3, sum4)  \
+{                                                      \
+    sum1 = svadd_f16_x(pg16, sum1, sum2);              \
+    sum3 = svadd_f16_x(pg16, sum3, sum4);              \
+    sum1 = svadd_f16_x(pg16, sum1, sum3);              \
+    __fp16 sum_f16 = svaddv_f16(pg16, sum1);           \
+    (res) = (ggml_float) sum_f16;                      \
+}
+#define GGML_F16xt_REDUCE_MIXED(res, sum1, sum2, sum3, sum4)  \
+        GGML_F16xt_REDUCE_MIXED_IMPL(DEFAULT_PG16, res, sum1, sum2, sum3, sum4)
 // F16 NEON
 #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
@@ -935,7 +958,7 @@ do {                                                              \
 #define GGML_F32Cx8          __m256
 #define GGML_F32Cx8_ZERO    (__m256)__lasx_xvldi(0)
-#define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplgr2vr_w((x))
+#define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplfr2vr_s((x))
 static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t * x) {
     __m256i a;
@@ -977,35 +1000,35 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
 #define GGML_F32_EPR  4
 #define GGML_F32x4         __m128
-#define GGML_F32x4_ZERO    __lsx_vldi(0)
-#define GGML_F32x4_SET1(x) __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
-#define GGML_F32x4_LOAD(x) __lsx_vld((x), 0)
+#define GGML_F32x4_ZERO    (__m128)__lsx_vldi(0)
+#define GGML_F32x4_SET1(x) (__m128)__lsx_vreplfr2vr_s((x))
+#define GGML_F32x4_LOAD(x) (__m128)__lsx_vld((x), 0)
 #define GGML_F32x4_STORE(x, y)   __lsx_vst(y, x, 0)
 #define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a)
 #define GGML_F32x4_ADD     __lsx_vfadd_s
 #define GGML_F32x4_MUL     __lsx_vfmul_s
-#define GGML_F32x4_REDUCE(res, x)                                                     \
-{                                                                                     \
-    int offset = GGML_F32_ARR >> 1;                                                   \
-    for (int i = 0; i < offset; ++i) {                                                \
-        x[i] = __lsx_vfadd_s(x[i], x[offset + i]);                                    \
-    }                                                                                 \
-    offset >>= 1;                                                                     \
-    for (int i = 0; i < offset; ++i) {                                                \
-        x[i] = __lsx_vfadd_s(x[i], x[offset + i]);                                    \
-    }                                                                                 \
-    offset >>= 1;                                                                     \
-    for (int i = 0; i < offset; ++i) {                                                \
-        x[i] = __lsx_vfadd_s(x[i], x[offset + i]);                                    \
-    }                                                                                 \
-    __m128i tmp     = __lsx_vsrli_d((__m128i) x[0], 32);                              \
-    tmp             = (__m128i) __lsx_vfadd_s((__m128) tmp, x[0]);                    \
-    tmp             = __lsx_vpickev_w(__lsx_vldi(0), tmp);                            \
-    const __m128 t0 = __lsx_vshuf4i_w(tmp, 0x88);                                     \
-    tmp             = __lsx_vsrli_d((__m128i) t0, 32);                                \
-    tmp             = (__m128i) __lsx_vfadd_s((__m128) tmp, t0);                      \
-    tmp             = __lsx_vpickev_w(__lsx_vldi(0), tmp);                            \
-    res             = (ggml_float) __lsx_vpickve2gr_w(__lsx_vshuf4i_w(tmp, 0x88), 0); \
+#define GGML_F32x4_REDUCE(res, x)                               \
+{                                                               \
+    int offset = GGML_F32_ARR >> 1;                             \
+    for (int i = 0; i < offset; ++i) {                          \
+        x[i] = __lsx_vfadd_s(x[i], x[offset+i]);                \
+    }                                                           \
+    offset >>= 1;                                               \
+    for (int i = 0; i < offset; ++i) {                          \
+        x[i] = __lsx_vfadd_s(x[i], x[offset+i]);                \
+    }                                                           \
+    offset >>= 1;                                               \
+    for (int i = 0; i < offset; ++i) {                          \
+        x[i] = __lsx_vfadd_s(x[i], x[offset+i]);                \
+    }                                                           \
+    __m128i t0 = __lsx_vpickev_w((__m128i)x[0], (__m128i)x[0]); \
+    __m128i t1 = __lsx_vpickod_w((__m128i)x[0], (__m128i)x[0]); \
+    __m128 t2 = __lsx_vfadd_s((__m128)t0, (__m128)t1);          \
+    __m128i t3 = __lsx_vpickev_w((__m128i)t2, (__m128i)t2);     \
+    __m128i t4 = __lsx_vpickod_w((__m128i)t2, (__m128i)t2);     \
+    __m128 t5 = __lsx_vfadd_s((__m128)t3, (__m128)t4);          \
+    res = (ggml_float) ((v4f32)t5)[0];                          \
 }
 #define GGML_F32_VEC        GGML_F32x4
@@ -1031,7 +1054,7 @@ static inline __m128 __lsx_f16x4_load(const ggml_fp16_t * x) {
     tmp[2] = GGML_CPU_FP16_TO_FP32(x[2]);
     tmp[3] = GGML_CPU_FP16_TO_FP32(x[3]);
-    return __lsx_vld(tmp, 0);
+    return (__m128)__lsx_vld(tmp, 0);
 }
 static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
@@ -1046,9 +1069,9 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
 }
 #define GGML_F32Cx4             __m128
-#define GGML_F32Cx4_ZERO        __lsx_vldi(0)
-#define GGML_F32Cx4_SET1(x)     __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
-#define GGML_F32Cx4_LOAD(x)     __lsx_f16x4_load(x)
+#define GGML_F32Cx4_ZERO        (__m128)__lsx_vldi(0)
+#define GGML_F32Cx4_SET1(x)     (__m128)__lsx_vreplfr2vr_s((x))
+#define GGML_F32Cx4_LOAD(x)     (__m128)__lsx_f16x4_load(x)
 #define GGML_F32Cx4_STORE(x, y) __lsx_f16x4_store(x, y)
 #define GGML_F32Cx4_FMA         GGML_F32x4_FMA
 #define GGML_F32Cx4_ADD         __lsx_vfadd_s
@@ -1115,11 +1138,6 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
 #define GGML_F16_EPR  GGML_F32_EPR
 static inline float32x4_t __lzs_f16cx4_load(const ggml_fp16_t * x) {
-#if defined(__NNPA__)
-    uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)x);
-    uint16x8_t v_xd = vec_convert_from_fp16(v_x, 0);
-    return vec_extend_to_fp32_hi(v_xd, 0);
-#else
     float tmp[4];
     for (int i = 0; i < 4; i++) {
@@ -1129,20 +1147,9 @@ static inline float32x4_t __lzs_f16cx4_load(const ggml_fp16_t * x) {
     // note: keep type-cast here to prevent compiler bugs
     // see: https://github.com/ggml-org/llama.cpp/issues/12846
     return vec_xl(0, (const float *)(tmp));
-#endif
 }
 static inline void __lzs_f16cx4_store(ggml_fp16_t * x, float32x4_t v_y) {
-#if defined(__NNPA__)
-    float32x4_t v_zero = vec_splats(0.0f);
-    uint16x8_t v_xd = vec_round_from_fp32(v_y, v_zero, 0);
-    uint16x8_t v_x = vec_convert_to_fp16(v_xd, 0);
-    x[0] = vec_extract(v_x, 0);
-    x[1] = vec_extract(v_x, 1);
-    x[2] = vec_extract(v_x, 2);
-    x[3] = vec_extract(v_x, 3);
-#else
     float arr[4];
     // note: keep type-cast here to prevent compiler bugs
@@ -1152,7 +1159,6 @@ static inline void __lzs_f16cx4_store(ggml_fp16_t * x, float32x4_t v_y) {
     for (int i = 0; i < 4; i++) {
         x[i] = GGML_CPU_FP32_TO_FP16(arr[i]);
     }
-#endif
 }
 #define GGML_F16_VEC                GGML_F32x4