whisper.rn 0.5.0-rc.9 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (136) hide show
  1. package/android/build.gradle +2 -1
  2. package/android/gradle.properties +1 -1
  3. package/cpp/ggml-alloc.c +265 -141
  4. package/cpp/ggml-backend-impl.h +4 -1
  5. package/cpp/ggml-backend-reg.cpp +30 -13
  6. package/cpp/ggml-backend.cpp +221 -38
  7. package/cpp/ggml-backend.h +17 -1
  8. package/cpp/ggml-common.h +17 -0
  9. package/cpp/ggml-cpu/amx/amx.cpp +4 -2
  10. package/cpp/ggml-cpu/arch/arm/quants.c +132 -596
  11. package/cpp/ggml-cpu/arch/arm/repack.cpp +14 -286
  12. package/cpp/ggml-cpu/arch/x86/quants.c +184 -675
  13. package/cpp/ggml-cpu/arch/x86/repack.cpp +4679 -1657
  14. package/cpp/ggml-cpu/arch-fallback.h +32 -2
  15. package/cpp/ggml-cpu/common.h +14 -0
  16. package/cpp/ggml-cpu/ggml-cpu-impl.h +13 -6
  17. package/cpp/ggml-cpu/ggml-cpu.c +70 -42
  18. package/cpp/ggml-cpu/ggml-cpu.cpp +35 -28
  19. package/cpp/ggml-cpu/ops.cpp +1587 -1177
  20. package/cpp/ggml-cpu/ops.h +5 -8
  21. package/cpp/ggml-cpu/quants.c +35 -0
  22. package/cpp/ggml-cpu/quants.h +8 -0
  23. package/cpp/ggml-cpu/repack.cpp +458 -47
  24. package/cpp/ggml-cpu/repack.h +22 -0
  25. package/cpp/ggml-cpu/simd-mappings.h +89 -60
  26. package/cpp/ggml-cpu/traits.cpp +2 -2
  27. package/cpp/ggml-cpu/traits.h +1 -1
  28. package/cpp/ggml-cpu/vec.cpp +170 -26
  29. package/cpp/ggml-cpu/vec.h +506 -63
  30. package/cpp/ggml-cpu.h +1 -1
  31. package/cpp/ggml-impl.h +119 -9
  32. package/cpp/ggml-metal/ggml-metal-common.cpp +446 -0
  33. package/cpp/ggml-metal/ggml-metal-common.h +52 -0
  34. package/cpp/ggml-metal/ggml-metal-context.h +33 -0
  35. package/cpp/ggml-metal/ggml-metal-context.m +600 -0
  36. package/cpp/ggml-metal/ggml-metal-device.cpp +1376 -0
  37. package/cpp/ggml-metal/ggml-metal-device.h +226 -0
  38. package/cpp/ggml-metal/ggml-metal-device.m +1312 -0
  39. package/cpp/ggml-metal/ggml-metal-impl.h +722 -0
  40. package/cpp/ggml-metal/ggml-metal-ops.cpp +3158 -0
  41. package/cpp/ggml-metal/ggml-metal-ops.h +82 -0
  42. package/cpp/ggml-metal/ggml-metal.cpp +718 -0
  43. package/cpp/ggml-metal/ggml-whisper-sim.metallib +0 -0
  44. package/cpp/ggml-metal/ggml-whisper.metallib +0 -0
  45. package/cpp/ggml-metal-impl.h +90 -51
  46. package/cpp/ggml-metal.h +1 -6
  47. package/cpp/ggml-opt.cpp +97 -41
  48. package/cpp/ggml-opt.h +25 -6
  49. package/cpp/ggml-quants.c +111 -16
  50. package/cpp/ggml-quants.h +6 -0
  51. package/cpp/ggml.c +486 -98
  52. package/cpp/ggml.h +221 -16
  53. package/cpp/gguf.cpp +8 -1
  54. package/cpp/jsi/RNWhisperJSI.cpp +25 -6
  55. package/cpp/jsi/ThreadPool.h +3 -3
  56. package/cpp/whisper.cpp +100 -76
  57. package/cpp/whisper.h +1 -0
  58. package/ios/CMakeLists.txt +6 -1
  59. package/ios/RNWhisper.mm +6 -6
  60. package/ios/RNWhisperContext.mm +2 -0
  61. package/ios/RNWhisperVadContext.mm +16 -13
  62. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
  63. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend.h +17 -1
  64. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-common.h +17 -0
  65. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
  66. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-impl.h +119 -9
  67. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +90 -51
  68. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal.h +1 -6
  69. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-opt.h +25 -6
  70. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-quants.h +6 -0
  71. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml.h +221 -16
  72. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/whisper.h +1 -0
  73. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Info.plist +0 -0
  74. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
  75. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/rnwhisper +0 -0
  76. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
  77. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +17 -1
  78. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-common.h +17 -0
  79. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
  80. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +119 -9
  81. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +90 -51
  82. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal.h +1 -6
  83. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-opt.h +25 -6
  84. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-quants.h +6 -0
  85. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +221 -16
  86. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +1 -0
  87. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
  88. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
  89. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
  90. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  91. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
  92. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend.h +17 -1
  93. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-common.h +17 -0
  94. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
  95. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-impl.h +119 -9
  96. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +90 -51
  97. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal.h +1 -6
  98. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-opt.h +25 -6
  99. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-quants.h +6 -0
  100. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml.h +221 -16
  101. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/whisper.h +1 -0
  102. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Info.plist +0 -0
  103. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
  104. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/rnwhisper +0 -0
  105. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
  106. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +17 -1
  107. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-common.h +17 -0
  108. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
  109. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +119 -9
  110. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +90 -51
  111. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal.h +1 -6
  112. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-opt.h +25 -6
  113. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-quants.h +6 -0
  114. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +221 -16
  115. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +1 -0
  116. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
  117. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
  118. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
  119. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  120. package/lib/commonjs/realtime-transcription/RealtimeTranscriber.js +13 -0
  121. package/lib/commonjs/realtime-transcription/RealtimeTranscriber.js.map +1 -1
  122. package/lib/commonjs/version.json +1 -1
  123. package/lib/module/realtime-transcription/RealtimeTranscriber.js +13 -0
  124. package/lib/module/realtime-transcription/RealtimeTranscriber.js.map +1 -1
  125. package/lib/module/version.json +1 -1
  126. package/lib/typescript/realtime-transcription/RealtimeTranscriber.d.ts.map +1 -1
  127. package/lib/typescript/realtime-transcription/types.d.ts +6 -0
  128. package/lib/typescript/realtime-transcription/types.d.ts.map +1 -1
  129. package/package.json +1 -1
  130. package/src/realtime-transcription/RealtimeTranscriber.ts +17 -0
  131. package/src/realtime-transcription/types.ts +6 -0
  132. package/src/version.json +1 -1
  133. package/whisper-rn.podspec +8 -9
  134. package/cpp/ggml-metal.m +0 -6284
  135. package/cpp/ggml-whisper-sim.metallib +0 -0
  136. package/cpp/ggml-whisper.metallib +0 -0
@@ -55,7 +55,22 @@ inline static void wsp_ggml_vec_cpy_i32(const int n, int32_t * y, const int32_t
55
55
 
56
56
  inline static void wsp_ggml_vec_set_f16(const int n, wsp_ggml_fp16_t * x, const wsp_ggml_fp16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
57
57
  inline static void wsp_ggml_vec_set_bf16(const int n, wsp_ggml_bf16_t * x, const wsp_ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
58
- inline static void wsp_ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; }
58
+
59
+ inline static void wsp_ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) {
60
+ int i = 0;
61
+ #if defined(__AVX2__)
62
+ for (; i + 7 < n; i += 8) {
63
+ __m256 vx = _mm256_loadu_ps(x + i);
64
+ __m256 vy = _mm256_loadu_ps(y + i);
65
+ __m256 vz = _mm256_add_ps(vx, vy);
66
+ _mm256_storeu_ps(z + i, vz);
67
+ }
68
+ #endif
69
+ for (; i < n; ++i) {
70
+ z[i] = x[i] + y[i];
71
+ }
72
+ }
73
+
59
74
  inline static void wsp_ggml_vec_add_f16 (const int n, wsp_ggml_fp16_t * z, const wsp_ggml_fp16_t * x, const wsp_ggml_fp16_t * y) {
60
75
  for (int i = 0; i < n; ++i) {
61
76
  z[i] = WSP_GGML_CPU_FP32_TO_FP16(WSP_GGML_CPU_FP16_TO_FP32(x[i]) + WSP_GGML_CPU_FP16_TO_FP32(y[i]));
@@ -104,36 +119,149 @@ inline static void wsp_ggml_vec_dot_f16_unroll(const int n, const int xs, float
104
119
  }
105
120
 
106
121
  #if defined(WSP_GGML_SIMD)
107
- const int np = (n & ~(WSP_GGML_F16_STEP - 1));
122
+ #if defined(__ARM_FEATURE_SVE)
123
+
124
+ const int sve_register_length = svcntb() * 8;
125
+ const int wsp_ggml_f16_epr = sve_register_length / 16; // running when 16
126
+ const int wsp_ggml_f16_step = 8 * wsp_ggml_f16_epr; // choose 8 SVE registers
127
+
128
+ const int np = (n & ~(wsp_ggml_f16_step - 1));
129
+
130
+ svfloat16_t sum_00 = svdup_n_f16(0.0f);
131
+ svfloat16_t sum_01 = svdup_n_f16(0.0f);
132
+ svfloat16_t sum_02 = svdup_n_f16(0.0f);
133
+ svfloat16_t sum_03 = svdup_n_f16(0.0f);
134
+
135
+ svfloat16_t sum_10 = svdup_n_f16(0.0f);
136
+ svfloat16_t sum_11 = svdup_n_f16(0.0f);
137
+ svfloat16_t sum_12 = svdup_n_f16(0.0f);
138
+ svfloat16_t sum_13 = svdup_n_f16(0.0f);
139
+
140
+ svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
141
+ svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
142
+
143
+ for (int i = 0; i < np; i += wsp_ggml_f16_step) {
144
+ ay1 = WSP_GGML_F16x_VEC_LOAD(y + i + 0 * wsp_ggml_f16_epr, 0); // 8 elements
145
+
146
+ ax1 = WSP_GGML_F16x_VEC_LOAD(x[0] + i + 0*wsp_ggml_f16_epr, 0); // 8 elements
147
+ sum_00 = WSP_GGML_F16x_VEC_FMA(sum_00, ax1, ay1); // sum_00 = sum_00+ax1*ay1
148
+ ax1 = WSP_GGML_F16x_VEC_LOAD(x[1] + i + 0*wsp_ggml_f16_epr, 0); // 8 elements
149
+ sum_10 = WSP_GGML_F16x_VEC_FMA(sum_10, ax1, ay1);
150
+
151
+ ay2 = WSP_GGML_F16x_VEC_LOAD(y + i + 1 * wsp_ggml_f16_epr, 1); // next 8 elements
152
+
153
+ ax2 = WSP_GGML_F16x_VEC_LOAD(x[0] + i + 1*wsp_ggml_f16_epr, 1); // next 8 elements
154
+ sum_01 = WSP_GGML_F16x_VEC_FMA(sum_01, ax2, ay2);
155
+ ax2 = WSP_GGML_F16x_VEC_LOAD(x[1] + i + 1*wsp_ggml_f16_epr, 1);
156
+ sum_11 = WSP_GGML_F16x_VEC_FMA(sum_11, ax2, ay2);
157
+
158
+ ay3 = WSP_GGML_F16x_VEC_LOAD(y + i + 2 * wsp_ggml_f16_epr, 2);
159
+
160
+ ax3 = WSP_GGML_F16x_VEC_LOAD(x[0] + i + 2*wsp_ggml_f16_epr, 2);
161
+ sum_02 = WSP_GGML_F16x_VEC_FMA(sum_02, ax3, ay3);
162
+ ax1 = WSP_GGML_F16x_VEC_LOAD(x[1] + i + 2*wsp_ggml_f16_epr, 2);
163
+ sum_12 = WSP_GGML_F16x_VEC_FMA(sum_12, ax3, ay3);
164
+
165
+ ay4 = WSP_GGML_F16x_VEC_LOAD(y + i + 3 * wsp_ggml_f16_epr, 3);
166
+
167
+ ax4 = WSP_GGML_F16x_VEC_LOAD(x[0] + i + 3*wsp_ggml_f16_epr, 3);
168
+ sum_03 = WSP_GGML_F16x_VEC_FMA(sum_03, ax4, ay4);
169
+ ax4 = WSP_GGML_F16x_VEC_LOAD(x[1] + i + 3*wsp_ggml_f16_epr, 3);
170
+ sum_13 = WSP_GGML_F16x_VEC_FMA(sum_13, ax4, ay4);
171
+
172
+ ay5 = WSP_GGML_F16x_VEC_LOAD(y + i + 4 * wsp_ggml_f16_epr, 4);
173
+
174
+ ax5 = WSP_GGML_F16x_VEC_LOAD(x[0] + i + 4*wsp_ggml_f16_epr, 4);
175
+
176
+ sum_00 = WSP_GGML_F16x_VEC_FMA(sum_00, ax5, ay5);
177
+ ax5 = WSP_GGML_F16x_VEC_LOAD(x[1] + i + 4*wsp_ggml_f16_epr, 4);
178
+ sum_10 = WSP_GGML_F16x_VEC_FMA(sum_10, ax5, ay5);
179
+
180
+ ay6 = WSP_GGML_F16x_VEC_LOAD(y + i + 5 * wsp_ggml_f16_epr, 5);
108
181
 
109
- WSP_GGML_F16_VEC sum[WSP_GGML_VEC_DOT_UNROLL][WSP_GGML_F16_ARR] = { { WSP_GGML_F16_VEC_ZERO } };
182
+ ax6 = WSP_GGML_F16x_VEC_LOAD(x[0] + i + 5*wsp_ggml_f16_epr, 5);
110
183
 
111
- WSP_GGML_F16_VEC ax[WSP_GGML_F16_ARR];
112
- WSP_GGML_F16_VEC ay[WSP_GGML_F16_ARR];
184
+ sum_01 = WSP_GGML_F16x_VEC_FMA(sum_01, ax6, ay6);
185
+ ax6 = WSP_GGML_F16x_VEC_LOAD(x[1] + i + 5*wsp_ggml_f16_epr, 5);
186
+ sum_11 = WSP_GGML_F16x_VEC_FMA(sum_11, ax6, ay6);
113
187
 
114
- for (int i = 0; i < np; i += WSP_GGML_F16_STEP) {
115
- for (int j = 0; j < WSP_GGML_F16_ARR; j++) {
116
- ay[j] = WSP_GGML_F16_VEC_LOAD(y + i + j*WSP_GGML_F16_EPR, j);
188
+ ay7 = WSP_GGML_F16x_VEC_LOAD(y + i + 6 * wsp_ggml_f16_epr, 6);
117
189
 
118
- for (int k = 0; k < WSP_GGML_VEC_DOT_UNROLL; ++k) {
119
- ax[j] = WSP_GGML_F16_VEC_LOAD(x[k] + i + j*WSP_GGML_F16_EPR, j);
190
+ ax7 = WSP_GGML_F16x_VEC_LOAD(x[0] + i + 6*wsp_ggml_f16_epr, 6);
120
191
 
121
- sum[k][j] = WSP_GGML_F16_VEC_FMA(sum[k][j], ax[j], ay[j]);
192
+ sum_02 = WSP_GGML_F16x_VEC_FMA(sum_02, ax7, ay7);
193
+ ax7 = WSP_GGML_F16x_VEC_LOAD(x[1] + i + 6*wsp_ggml_f16_epr, 6);
194
+ sum_12 = WSP_GGML_F16x_VEC_FMA(sum_12, ax7, ay7);
195
+
196
+ ay8 = WSP_GGML_F16x_VEC_LOAD(y + i + 7 * wsp_ggml_f16_epr, 7);
197
+
198
+ ax8 = WSP_GGML_F16x_VEC_LOAD(x[0] + i + 7*wsp_ggml_f16_epr, 7);
199
+
200
+ sum_03 = WSP_GGML_F16x_VEC_FMA(sum_03, ax8, ay8);
201
+ ax8 = WSP_GGML_F16x_VEC_LOAD(x[1] + i + 7*wsp_ggml_f16_epr, 7);
202
+ sum_13 = WSP_GGML_F16x_VEC_FMA(sum_13, ax8, ay8);
203
+ }
204
+
205
+ const int np2 = (n & ~(wsp_ggml_f16_epr - 1));
206
+ for (int k = np; k < np2; k += wsp_ggml_f16_epr) {
207
+ svfloat16_t ry = WSP_GGML_F16x_VEC_LOAD(y + k, 0);
208
+
209
+ svfloat16_t rx = WSP_GGML_F16x_VEC_LOAD(x[0] + k, 0);
210
+ sum_00 = WSP_GGML_F16x_VEC_FMA(sum_00, rx, ry);
211
+ rx = WSP_GGML_F16x_VEC_LOAD(x[1] + k, 0);
212
+ sum_10 = WSP_GGML_F16x_VEC_FMA(sum_10, rx, ry);
213
+ }
214
+
215
+ if (np2 < n) {
216
+ svbool_t pg = svwhilelt_b16(np2, n);
217
+ svfloat16_t hx_0 = svld1_f16(pg, (const __fp16 *)(x[0] + np2));
218
+ svfloat16_t hx_1 = svld1_f16(pg, (const __fp16 *)(x[1] + np2));
219
+ svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
220
+
221
+ sum_00 = svmad_f16_x(pg, hx_0, hy, sum_00);
222
+ sum_10 = svmad_f16_x(pg, hx_1, hy, sum_10);
223
+ }
224
+ WSP_GGML_F16x_VEC_REDUCE(sumf[0], sum_00, sum_01, sum_02, sum_03);
225
+ WSP_GGML_F16x_VEC_REDUCE(sumf[1], sum_10, sum_11, sum_12, sum_13);
226
+ #elif defined(__riscv_v_intrinsic)
227
+ // todo: RVV impl
228
+ for (int i = 0; i < n; ++i) {
229
+ for (int j = 0; j < WSP_GGML_VEC_DOT_UNROLL; ++j) {
230
+ sumf[j] += (wsp_ggml_float)(WSP_GGML_CPU_FP16_TO_FP32(x[j][i])*WSP_GGML_CPU_FP16_TO_FP32(y[i]));
231
+ }
232
+ }
233
+ #else
234
+ const int np = (n & ~(WSP_GGML_F16_STEP - 1));
235
+
236
+ WSP_GGML_F16_VEC sum[WSP_GGML_VEC_DOT_UNROLL][WSP_GGML_F16_ARR] = { { WSP_GGML_F16_VEC_ZERO } };
237
+
238
+ WSP_GGML_F16_VEC ax[WSP_GGML_F16_ARR];
239
+ WSP_GGML_F16_VEC ay[WSP_GGML_F16_ARR];
240
+
241
+ for (int i = 0; i < np; i += WSP_GGML_F16_STEP) {
242
+ for (int j = 0; j < WSP_GGML_F16_ARR; j++) {
243
+ ay[j] = WSP_GGML_F16_VEC_LOAD(y + i + j*WSP_GGML_F16_EPR, j);
244
+
245
+ for (int k = 0; k < WSP_GGML_VEC_DOT_UNROLL; ++k) {
246
+ ax[j] = WSP_GGML_F16_VEC_LOAD(x[k] + i + j*WSP_GGML_F16_EPR, j);
247
+
248
+ sum[k][j] = WSP_GGML_F16_VEC_FMA(sum[k][j], ax[j], ay[j]);
249
+ }
122
250
  }
123
251
  }
124
- }
125
252
 
126
- // reduce sum0..sum3 to sum0
127
- for (int k = 0; k < WSP_GGML_VEC_DOT_UNROLL; ++k) {
128
- WSP_GGML_F16_VEC_REDUCE(sumf[k], sum[k]);
129
- }
253
+ // reduce sum0..sum3 to sum0
254
+ for (int k = 0; k < WSP_GGML_VEC_DOT_UNROLL; ++k) {
255
+ WSP_GGML_F16_VEC_REDUCE(sumf[k], sum[k]);
256
+ }
130
257
 
131
- // leftovers
132
- for (int i = np; i < n; ++i) {
133
- for (int j = 0; j < WSP_GGML_VEC_DOT_UNROLL; ++j) {
134
- sumf[j] += (wsp_ggml_float)(WSP_GGML_CPU_FP16_TO_FP32(x[j][i])*WSP_GGML_CPU_FP16_TO_FP32(y[i]));
258
+ // leftovers
259
+ for (int i = np; i < n; ++i) {
260
+ for (int j = 0; j < WSP_GGML_VEC_DOT_UNROLL; ++j) {
261
+ sumf[j] += (wsp_ggml_float)(WSP_GGML_CPU_FP16_TO_FP32(x[j][i])*WSP_GGML_CPU_FP16_TO_FP32(y[i]));
262
+ }
135
263
  }
136
- }
264
+ #endif
137
265
  #else
138
266
  for (int i = 0; i < n; ++i) {
139
267
  for (int j = 0; j < WSP_GGML_VEC_DOT_UNROLL; ++j) {
@@ -163,49 +291,49 @@ inline static void wsp_ggml_vec_mad_f32(const int n, float * WSP_GGML_RESTRICT y
163
291
 
164
292
  ax1 = WSP_GGML_F32_VEC_LOAD(x + i);
165
293
  ay1 = WSP_GGML_F32_VEC_LOAD(y + i);
166
- ay1 = WSP_GGML_F32_VEC_FMA(ax1, vx, ay1);
294
+ ay1 = WSP_GGML_F32_VEC_FMA(ay1, ax1, vx);
167
295
 
168
296
  WSP_GGML_F32_VEC_STORE(y + i, ay1);
169
297
 
170
298
  ax2 = WSP_GGML_F32_VEC_LOAD(x + i + 1*wsp_ggml_f32_epr);
171
299
  ay2 = WSP_GGML_F32_VEC_LOAD(y + i + 1*wsp_ggml_f32_epr);
172
- ay2 = WSP_GGML_F32_VEC_FMA(ax2, vx, ay2);
300
+ ay2 = WSP_GGML_F32_VEC_FMA(ay2, ax2, vx);
173
301
 
174
302
  WSP_GGML_F32_VEC_STORE(y + i + 1*wsp_ggml_f32_epr, ay2);
175
303
 
176
304
  ax3 = WSP_GGML_F32_VEC_LOAD(x + i + 2*wsp_ggml_f32_epr);
177
305
  ay3 = WSP_GGML_F32_VEC_LOAD(y + i + 2*wsp_ggml_f32_epr);
178
- ay3 = WSP_GGML_F32_VEC_FMA(ax3, vx, ay3);
306
+ ay3 = WSP_GGML_F32_VEC_FMA(ay3, ax3, vx);
179
307
 
180
308
  WSP_GGML_F32_VEC_STORE(y + i + 2*wsp_ggml_f32_epr, ay3);
181
309
 
182
310
  ax4 = WSP_GGML_F32_VEC_LOAD(x + i + 3*wsp_ggml_f32_epr);
183
311
  ay4 = WSP_GGML_F32_VEC_LOAD(y + i + 3*wsp_ggml_f32_epr);
184
- ay4 = WSP_GGML_F32_VEC_FMA(ax4, vx, ay4);
312
+ ay4 = WSP_GGML_F32_VEC_FMA(ay4, ax4, vx);
185
313
 
186
314
  WSP_GGML_F32_VEC_STORE(y + i + 3*wsp_ggml_f32_epr, ay4);
187
315
 
188
316
  ax5 = WSP_GGML_F32_VEC_LOAD(x + i + 4*wsp_ggml_f32_epr);
189
317
  ay5 = WSP_GGML_F32_VEC_LOAD(y + i + 4*wsp_ggml_f32_epr);
190
- ay5 = WSP_GGML_F32_VEC_FMA(ax5, vx, ay5);
318
+ ay5 = WSP_GGML_F32_VEC_FMA(ay5, ax5, vx);
191
319
 
192
320
  WSP_GGML_F32_VEC_STORE(y + i + 4*wsp_ggml_f32_epr, ay5);
193
321
 
194
322
  ax6 = WSP_GGML_F32_VEC_LOAD(x + i + 5*wsp_ggml_f32_epr);
195
323
  ay6 = WSP_GGML_F32_VEC_LOAD(y + i + 5*wsp_ggml_f32_epr);
196
- ay6 = WSP_GGML_F32_VEC_FMA(ax6, vx, ay6);
324
+ ay6 = WSP_GGML_F32_VEC_FMA(ay6, ax6, vx);
197
325
 
198
326
  WSP_GGML_F32_VEC_STORE(y + i + 5*wsp_ggml_f32_epr, ay6);
199
327
 
200
328
  ax7 = WSP_GGML_F32_VEC_LOAD(x + i + 6*wsp_ggml_f32_epr);
201
329
  ay7 = WSP_GGML_F32_VEC_LOAD(y + i + 6*wsp_ggml_f32_epr);
202
- ay7 = WSP_GGML_F32_VEC_FMA(ax7, vx, ay7);
330
+ ay7 = WSP_GGML_F32_VEC_FMA(ay7, ax7, vx);
203
331
 
204
332
  WSP_GGML_F32_VEC_STORE(y + i + 6*wsp_ggml_f32_epr, ay7);
205
333
 
206
334
  ax8 = WSP_GGML_F32_VEC_LOAD(x + i + 7*wsp_ggml_f32_epr);
207
335
  ay8 = WSP_GGML_F32_VEC_LOAD(y + i + 7*wsp_ggml_f32_epr);
208
- ay8 = WSP_GGML_F32_VEC_FMA(ax8, vx, ay8);
336
+ ay8 = WSP_GGML_F32_VEC_FMA(ay8, ax8, vx);
209
337
 
210
338
  WSP_GGML_F32_VEC_STORE(y + i + 7*wsp_ggml_f32_epr, ay8);
211
339
  }
@@ -215,7 +343,7 @@ inline static void wsp_ggml_vec_mad_f32(const int n, float * WSP_GGML_RESTRICT y
215
343
  for (int i = np; i < np2; i += wsp_ggml_f32_epr) {
216
344
  ax1 = WSP_GGML_F32_VEC_LOAD(x + i);
217
345
  ay1 = WSP_GGML_F32_VEC_LOAD(y + i);
218
- ay1 = WSP_GGML_F32_VEC_FMA(ax1, vx, ay1);
346
+ ay1 = WSP_GGML_F32_VEC_FMA(ay1, ax1, vx);
219
347
 
220
348
  WSP_GGML_F32_VEC_STORE(y + i, ay1);
221
349
  }
@@ -228,6 +356,14 @@ inline static void wsp_ggml_vec_mad_f32(const int n, float * WSP_GGML_RESTRICT y
228
356
 
229
357
  svst1_f32(pg, y + np2, ay1);
230
358
  }
359
+ #elif defined(__riscv_v_intrinsic)
360
+ for (int i = 0, avl; i < n; i += avl) {
361
+ avl = __riscv_vsetvl_e32m8(n - i);
362
+ vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
363
+ vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
364
+ vfloat32m8_t ny = __riscv_vfmadd_vf_f32m8(ax, v, ay, avl);
365
+ __riscv_vse32_v_f32m8(&y[i], ny, avl);
366
+ }
231
367
  #else
232
368
  const int np = (n & ~(WSP_GGML_F32_STEP - 1));
233
369
 
@@ -261,27 +397,112 @@ inline static void wsp_ggml_vec_mad_f32(const int n, float * WSP_GGML_RESTRICT y
261
397
 
262
398
  inline static void wsp_ggml_vec_mad_f16(const int n, wsp_ggml_fp16_t * WSP_GGML_RESTRICT y, const wsp_ggml_fp16_t * WSP_GGML_RESTRICT x, const float v) {
263
399
  #if defined(WSP_GGML_SIMD)
264
- const int np = (n & ~(WSP_GGML_F16_STEP - 1));
400
+ #if defined(__ARM_FEATURE_SVE)
401
+ const int sve_register_length = svcntb() * 8;
402
+ const int wsp_ggml_f16_epr = sve_register_length / 16;
403
+ const int wsp_ggml_f16_step = 8 * wsp_ggml_f16_epr;
404
+
405
+ WSP_GGML_F16x_VEC vx = WSP_GGML_F16x_VEC_SET1(v);
406
+
407
+ const int np= (n & ~(wsp_ggml_f16_step - 1));
408
+
409
+ svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
410
+ svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
411
+ for (int i = 0; i < np; i += wsp_ggml_f16_step) {
412
+ ax1 = WSP_GGML_F16x_VEC_LOAD(x + i + 0 * wsp_ggml_f16_epr, 0);
413
+ ay1 = WSP_GGML_F16x_VEC_LOAD(y + i + 0 * wsp_ggml_f16_epr, 0);
414
+ ay1 = WSP_GGML_F16x_VEC_FMA(ay1, ax1, vx);
415
+
416
+ WSP_GGML_F16x_VEC_STORE(y + i + 0 * wsp_ggml_f16_epr, ay1, 0);
417
+
418
+ ax2 = WSP_GGML_F16x_VEC_LOAD(x + i + 1 * wsp_ggml_f16_epr, 1);
419
+ ay2 = WSP_GGML_F16x_VEC_LOAD(y + i + 1 * wsp_ggml_f16_epr, 1);
420
+ ay2 = WSP_GGML_F16x_VEC_FMA(ay2, ax2, vx);
421
+
422
+ WSP_GGML_F16x_VEC_STORE(y + i + 1 * wsp_ggml_f16_epr, ay2, 1);
265
423
 
266
- WSP_GGML_F16_VEC vx = WSP_GGML_F16_VEC_SET1(v);
424
+ ax3 = WSP_GGML_F16x_VEC_LOAD(x + i + 2 * wsp_ggml_f16_epr, 2);
425
+ ay3 = WSP_GGML_F16x_VEC_LOAD(y + i + 2 * wsp_ggml_f16_epr, 2);
426
+ ay3 = WSP_GGML_F16x_VEC_FMA(ay3, ax3, vx);
267
427
 
268
- WSP_GGML_F16_VEC ax[WSP_GGML_F16_ARR];
269
- WSP_GGML_F16_VEC ay[WSP_GGML_F16_ARR];
428
+ WSP_GGML_F16x_VEC_STORE(y + i + 2 * wsp_ggml_f16_epr, ay3, 2);
270
429
 
271
- for (int i = 0; i < np; i += WSP_GGML_F16_STEP) {
272
- for (int j = 0; j < WSP_GGML_F16_ARR; j++) {
273
- ax[j] = WSP_GGML_F16_VEC_LOAD(x + i + j*WSP_GGML_F16_EPR, j);
274
- ay[j] = WSP_GGML_F16_VEC_LOAD(y + i + j*WSP_GGML_F16_EPR, j);
275
- ay[j] = WSP_GGML_F16_VEC_FMA(ay[j], ax[j], vx);
430
+ ax4 = WSP_GGML_F16x_VEC_LOAD(x + i + 3 * wsp_ggml_f16_epr, 3);
431
+ ay4 = WSP_GGML_F16x_VEC_LOAD(y + i + 3 * wsp_ggml_f16_epr, 3);
432
+ ay4 = WSP_GGML_F16x_VEC_FMA(ay4, ax4, vx);
276
433
 
277
- WSP_GGML_F16_VEC_STORE(y + i + j*WSP_GGML_F16_EPR, ay, j);
434
+ WSP_GGML_F16x_VEC_STORE(y + i + 3 * wsp_ggml_f16_epr, ay4, 3);
435
+
436
+ ax5 = WSP_GGML_F16x_VEC_LOAD(x + i + 4 * wsp_ggml_f16_epr, 4);
437
+ ay5 = WSP_GGML_F16x_VEC_LOAD(y + i + 4 * wsp_ggml_f16_epr, 4);
438
+ ay5 = WSP_GGML_F16x_VEC_FMA(ay5, ax5, vx);
439
+
440
+ WSP_GGML_F16x_VEC_STORE(y + i + 4 * wsp_ggml_f16_epr, ay5, 4);
441
+
442
+ ax6 = WSP_GGML_F16x_VEC_LOAD(x + i + 5 * wsp_ggml_f16_epr, 5);
443
+ ay6 = WSP_GGML_F16x_VEC_LOAD(y + i + 5 * wsp_ggml_f16_epr, 5);
444
+ ay6 = WSP_GGML_F16x_VEC_FMA(ay6, ax6, vx);
445
+
446
+ WSP_GGML_F16x_VEC_STORE(y + i + 5 * wsp_ggml_f16_epr, ay6, 5);
447
+
448
+ ax7 = WSP_GGML_F16x_VEC_LOAD(x + i + 6 * wsp_ggml_f16_epr, 6);
449
+ ay7 = WSP_GGML_F16x_VEC_LOAD(y + i + 6 * wsp_ggml_f16_epr, 6);
450
+ ay7 = WSP_GGML_F16x_VEC_FMA(ay7, ax7, vx);
451
+
452
+ WSP_GGML_F16x_VEC_STORE(y + i + 6 * wsp_ggml_f16_epr, ay7, 6);
453
+
454
+ ax8 = WSP_GGML_F16x_VEC_LOAD(x + i + 7 * wsp_ggml_f16_epr, 7);
455
+ ay8 = WSP_GGML_F16x_VEC_LOAD(y + i + 7 * wsp_ggml_f16_epr, 7);
456
+ ay8 = WSP_GGML_F16x_VEC_FMA(ay8, ax8, vx);
457
+
458
+ WSP_GGML_F16x_VEC_STORE(y + i + 7 * wsp_ggml_f16_epr, ay8, 7);
278
459
  }
279
- }
460
+ const int np2 = (n & ~(wsp_ggml_f16_epr - 1));
461
+ for (int k = np; k < np2; k += wsp_ggml_f16_epr) {
462
+ svfloat16_t rx = WSP_GGML_F16x_VEC_LOAD(x + k, 0);
463
+ svfloat16_t ry = WSP_GGML_F16x_VEC_LOAD(y + k, 0);
464
+ ry = WSP_GGML_F16x_VEC_FMA(ry, rx, vx);
280
465
 
281
- // leftovers
282
- for (int i = np; i < n; ++i) {
283
- y[i] = WSP_GGML_CPU_FP32_TO_FP16(WSP_GGML_CPU_FP16_TO_FP32(y[i]) + WSP_GGML_CPU_FP16_TO_FP32(x[i])*v);
284
- }
466
+ WSP_GGML_F16x_VEC_STORE(y + k, ry, 0);
467
+ }
468
+
469
+ if (np2 < n) {
470
+ svbool_t pg = svwhilelt_b16(np2, n);
471
+ svfloat16_t hx = svld1_f16(pg, (const __fp16 *)(x + np2));
472
+ svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
473
+ hy = svmad_f16_x(pg, hx, vx, hy);
474
+ svst1_f16(pg, (__fp16 *)(y + np2), hy);
475
+ }
476
+
477
+ #elif defined(__riscv_v_intrinsic)
478
+ // todo: RVV impl
479
+ // scalar
480
+ for (int i = 0; i < n; ++i) {
481
+ y[i] = WSP_GGML_CPU_FP32_TO_FP16(WSP_GGML_CPU_FP16_TO_FP32(y[i]) + WSP_GGML_CPU_FP16_TO_FP32(x[i])*v);
482
+ }
483
+ #else
484
+ const int np = (n & ~(WSP_GGML_F16_STEP - 1));
485
+
486
+ WSP_GGML_F16_VEC vx = WSP_GGML_F16_VEC_SET1(v);
487
+
488
+ WSP_GGML_F16_VEC ax[WSP_GGML_F16_ARR];
489
+ WSP_GGML_F16_VEC ay[WSP_GGML_F16_ARR];
490
+
491
+ for (int i = 0; i < np; i += WSP_GGML_F16_STEP) {
492
+ for (int j = 0; j < WSP_GGML_F16_ARR; j++) {
493
+ ax[j] = WSP_GGML_F16_VEC_LOAD(x + i + j*WSP_GGML_F16_EPR, j);
494
+ ay[j] = WSP_GGML_F16_VEC_LOAD(y + i + j*WSP_GGML_F16_EPR, j);
495
+ ay[j] = WSP_GGML_F16_VEC_FMA(ay[j], ax[j], vx);
496
+
497
+ WSP_GGML_F16_VEC_STORE(y + i + j*WSP_GGML_F16_EPR, ay, j);
498
+ }
499
+ }
500
+
501
+ // leftovers
502
+ for (int i = np; i < n; ++i) {
503
+ y[i] = WSP_GGML_CPU_FP32_TO_FP16(WSP_GGML_CPU_FP16_TO_FP32(y[i]) + WSP_GGML_CPU_FP16_TO_FP32(x[i])*v);
504
+ }
505
+ #endif
285
506
  #else
286
507
  // scalar
287
508
  for (int i = 0; i < n; ++i) {
@@ -309,6 +530,16 @@ inline static void wsp_ggml_vec_mad_f32_unroll(const int n, const int xs, const
309
530
  y[i] += x[k][i]*v[k][0];
310
531
  }
311
532
  }
533
+ #elif defined(__riscv_v_intrinsic)
534
+ for (int i = 0, avl; i < n; i += avl) {
535
+ avl = __riscv_vsetvl_e32m8(n - i);
536
+ vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
537
+ for (int k = 0; k < WSP_GGML_VEC_MAD_UNROLL; k++) {
538
+ vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[k][i], avl);
539
+ ay = __riscv_vfmadd_vf_f32m8(ax, v[k][0], ay, avl);
540
+ }
541
+ __riscv_vse32_v_f32m8(&y[i], ay, avl);
542
+ }
312
543
  #else
313
544
  const int np = (n & ~(WSP_GGML_F32_STEP - 1));
314
545
 
@@ -351,6 +582,53 @@ inline static void wsp_ggml_vec_mad_f32_unroll(const int n, const int xs, const
351
582
  #endif
352
583
  }
353
584
 
585
+ inline static void wsp_ggml_vec_mad1_f32(const int n, float * y, const float * x, const float s, const float b) {
586
+ #if defined(WSP_GGML_USE_ACCELERATE)
587
+ vDSP_vsmsa(x, 1, &s, &b, y, 1, n);
588
+ #elif defined(WSP_GGML_SIMD)
589
+ #if defined(__ARM_FEATURE_SVE)
590
+ // scalar ; TODO: Write SVE code
591
+ for (int i = 0; i < n; ++i) {
592
+ y[i] = x[i]*s + b;
593
+ }
594
+ #elif defined(__riscv_v_intrinsic)
595
+ for (int i = 0, avl; i < n; i += avl) {
596
+ avl = __riscv_vsetvl_e32m8(n - i);
597
+ vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
598
+ vfloat32m8_t vb = __riscv_vfmv_v_f_f32m8(b, avl);
599
+ vfloat32m8_t ny = __riscv_vfmadd_vf_f32m8(ax, s, vb, avl);
600
+ __riscv_vse32_v_f32m8(&y[i], ny, avl);
601
+ }
602
+ #else
603
+ const int np = (n & ~(WSP_GGML_F32_STEP - 1));
604
+
605
+ WSP_GGML_F32_VEC vs = WSP_GGML_F32_VEC_SET1(s);
606
+ WSP_GGML_F32_VEC vb = WSP_GGML_F32_VEC_SET1(b);
607
+
608
+ WSP_GGML_F32_VEC ay[WSP_GGML_F32_ARR];
609
+
610
+ for (int i = 0; i < np; i += WSP_GGML_F32_STEP) {
611
+ for (int j = 0; j < WSP_GGML_F32_ARR; j++) {
612
+ ay[j] = WSP_GGML_F32_VEC_LOAD(x + i + j*WSP_GGML_F32_EPR);
613
+ ay[j] = WSP_GGML_F32_VEC_FMA(vb, ay[j], vs);
614
+
615
+ WSP_GGML_F32_VEC_STORE(y + i + j*WSP_GGML_F32_EPR, ay[j]);
616
+ }
617
+ }
618
+
619
+ // leftovers
620
+ for (int i = np; i < n; ++i) {
621
+ y[i] = x[i]*s + b;
622
+ }
623
+ #endif
624
+ #else
625
+ // scalar
626
+ for (int i = 0; i < n; ++i) {
627
+ y[i] = x[i]*s + b;
628
+ }
629
+ #endif
630
+ }
631
+
354
632
  //inline static void wsp_ggml_vec_scale_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] *= v; }
355
633
  inline static void wsp_ggml_vec_scale_f32(const int n, float * y, const float v) {
356
634
  #if defined(WSP_GGML_USE_ACCELERATE)
@@ -382,6 +660,13 @@ inline static void wsp_ggml_vec_scale_f32(const int n, float * y, const float
382
660
  ay1 = svmul_f32_m(pg, ay1, vx);
383
661
  svst1_f32(pg, y + np, ay1);
384
662
  }
663
+ #elif defined(__riscv_v_intrinsic)
664
+ for (int i = 0, avl; i < n; i += avl) {
665
+ avl = __riscv_vsetvl_e32m8(n - i);
666
+ vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
667
+ vfloat32m8_t ny = __riscv_vfmul_vf_f32m8(ay, v, avl);
668
+ __riscv_vse32_v_f32m8(&y[i], ny, avl);
669
+ }
385
670
  #else
386
671
  const int np = (n & ~(WSP_GGML_F32_STEP - 1));
387
672
 
@@ -413,25 +698,59 @@ inline static void wsp_ggml_vec_scale_f32(const int n, float * y, const float
413
698
 
414
699
  inline static void wsp_ggml_vec_scale_f16(const int n, wsp_ggml_fp16_t * y, const float v) {
415
700
  #if defined(WSP_GGML_SIMD)
416
- const int np = (n & ~(WSP_GGML_F16_STEP - 1));
701
+ #if defined(__ARM_FEATURE_SVE)
702
+ const int sve_register_length = svcntb() * 8;
703
+ const int wsp_ggml_f16_epr = sve_register_length / 16;
704
+ const int wsp_ggml_f16_step = 2 * wsp_ggml_f16_epr;
705
+
706
+ WSP_GGML_F16x_VEC vx = WSP_GGML_F16x_VEC_SET1(v);
707
+ const int np = (n & ~(wsp_ggml_f16_step - 1));
708
+ svfloat16_t ay1, ay2;
709
+
710
+ for (int i = 0; i < np; i += wsp_ggml_f16_step) {
711
+ ay1 = WSP_GGML_F16x_VEC_LOAD(y + i + 0*wsp_ggml_f16_epr, 0);
712
+ ay1 = WSP_GGML_F16x_VEC_MUL(ay1, vx);
713
+ WSP_GGML_F16x_VEC_STORE(y + i + 0*wsp_ggml_f16_epr, ay1, 0);
714
+
715
+ ay2 = WSP_GGML_F16x_VEC_LOAD(y + i + 1*wsp_ggml_f16_epr, 1);
716
+ ay2 = WSP_GGML_F16x_VEC_MUL(ay2, vx);
717
+ WSP_GGML_F16x_VEC_STORE(y + i + 1*wsp_ggml_f16_epr, ay2, 1);
718
+ }
719
+ // leftovers
720
+ // maximum number of leftover elements will be less that ggmlF_16x_epr. Apply predicated svmad on available elements only
721
+ if (np < n) {
722
+ svbool_t pg = svwhilelt_b16(np, n);
723
+ svfloat16_t hy = svld1_f16(pg, (__fp16 *)(y + np));
724
+ svfloat16_t out = svmul_f16_m(pg, hy, vx);
725
+ svst1_f16(pg, (__fp16 *)(y + np), out);
726
+ }
727
+ #elif defined(__riscv_v_intrinsic)
728
+ // todo: RVV impl
729
+ // scalar
730
+ for (int i = 0; i < n; ++i) {
731
+ y[i] = WSP_GGML_CPU_FP32_TO_FP16(WSP_GGML_CPU_FP16_TO_FP32(y[i])*v);
732
+ }
733
+ #else
734
+ const int np = (n & ~(WSP_GGML_F16_STEP - 1));
417
735
 
418
- WSP_GGML_F16_VEC vx = WSP_GGML_F16_VEC_SET1(v);
736
+ WSP_GGML_F16_VEC vx = WSP_GGML_F16_VEC_SET1(v);
419
737
 
420
- WSP_GGML_F16_VEC ay[WSP_GGML_F16_ARR];
738
+ WSP_GGML_F16_VEC ay[WSP_GGML_F16_ARR];
421
739
 
422
- for (int i = 0; i < np; i += WSP_GGML_F16_STEP) {
423
- for (int j = 0; j < WSP_GGML_F16_ARR; j++) {
424
- ay[j] = WSP_GGML_F16_VEC_LOAD(y + i + j*WSP_GGML_F16_EPR, j);
425
- ay[j] = WSP_GGML_F16_VEC_MUL(ay[j], vx);
740
+ for (int i = 0; i < np; i += WSP_GGML_F16_STEP) {
741
+ for (int j = 0; j < WSP_GGML_F16_ARR; j++) {
742
+ ay[j] = WSP_GGML_F16_VEC_LOAD(y + i + j*WSP_GGML_F16_EPR, j);
743
+ ay[j] = WSP_GGML_F16_VEC_MUL(ay[j], vx);
426
744
 
427
- WSP_GGML_F16_VEC_STORE(y + i + j*WSP_GGML_F16_EPR, ay, j);
745
+ WSP_GGML_F16_VEC_STORE(y + i + j*WSP_GGML_F16_EPR, ay, j);
746
+ }
428
747
  }
429
- }
430
748
 
431
- // leftovers
432
- for (int i = np; i < n; ++i) {
433
- y[i] = WSP_GGML_CPU_FP32_TO_FP16(WSP_GGML_CPU_FP16_TO_FP32(y[i])*v);
434
- }
749
+ // leftovers
750
+ for (int i = np; i < n; ++i) {
751
+ y[i] = WSP_GGML_CPU_FP32_TO_FP16(WSP_GGML_CPU_FP16_TO_FP32(y[i])*v);
752
+ }
753
+ #endif
435
754
  #else
436
755
  // scalar
437
756
  for (int i = 0; i < n; ++i) {
@@ -683,7 +1002,39 @@ https://github.com/openvinotoolkit/openvino/blob/master/src/plugins/intel_cpu/sr
683
1002
  }
684
1003
  #endif
685
1004
 
686
- #if defined(__ARM_NEON) && defined(__aarch64__)
1005
+ #if defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
1006
+
1007
+ inline static svfloat32_t wsp_ggml_v_expf(svbool_t pg, svfloat32_t x) {
1008
+ const svfloat32_t r = svdup_n_f32_x(pg, 0x1.8p23f);
1009
+ const svfloat32_t z = svmla_n_f32_x(pg, r, x, 0x1.715476p+0f);
1010
+ const svfloat32_t n = svsub_f32_x(pg, z, r);
1011
+ const svfloat32_t b = svmls_n_f32_x(pg, svmls_n_f32_x(pg, x, n, 0x1.62e4p-1f), n, 0x1.7f7d1cp-20f);
1012
+ const svuint32_t e = svlsl_n_u32_x(pg, svreinterpret_u32_f32(z), 23);
1013
+ const svfloat32_t k = svreinterpret_f32_u32(svadd_u32_x(pg, e, svreinterpret_u32_f32(svdup_n_f32_x(pg, 1))));
1014
+ const svbool_t c = svacgt_n_f32(pg, n, 126);
1015
+ const svfloat32_t u = svmul_f32_x(pg, b, b);
1016
+ const svfloat32_t j = svmla_f32_x(pg,
1017
+ svmul_n_f32_x(pg, b, 0x1.ffffecp-1f),
1018
+ svmla_f32_x(pg, svmla_f32_x(pg, svdup_n_f32_x(pg, 0x1.fffdb6p-2f), svdup_n_f32_x(pg, 0x1.555e66p-3f), b),
1019
+ svmla_f32_x(pg, svdup_n_f32_x(pg, 0x1.573e2ep-5f), svdup_n_f32_x(pg, 0x1.0e4020p-7f), b), u), u);
1020
+ const svuint32_t d = svdup_n_u32_z(svcmple_n_f32(pg, n, 0.0), 0x82000000);
1021
+ const svfloat32_t s1 = svreinterpret_f32_u32(svadd_n_u32_x(pg, d, 0x7f000000));
1022
+ const svfloat32_t s2 = svreinterpret_f32_u32(svsub_u32_x(pg, e, d));
1023
+ return svsel_f32(svacgt_f32(pg, n, svdup_n_f32_x(pg, 192)), svmul_f32_x(pg, s1, s1),
1024
+ svsel_f32(c, svmul_f32_x(pg, svmla_f32_x(pg, s2, s2, j), s1), svmla_f32_x(pg, k, k, j)));
1025
+ }
1026
+
1027
+ // computes silu x/(1+exp(-x)) in single precision vector
1028
+ inline static svfloat32_t wsp_ggml_v_silu(svbool_t pg, svfloat32_t x) {
1029
+ const svfloat32_t one = svdup_n_f32_x(pg, 1.0f);
1030
+ const svfloat32_t zero = svdup_n_f32_x(pg, 0.0f);
1031
+ const svfloat32_t neg_x = svsub_f32_x(pg, zero, x);
1032
+ const svfloat32_t exp_neg_x = wsp_ggml_v_expf(pg, neg_x);
1033
+ const svfloat32_t one_plus_exp_neg_x = svadd_f32_x(pg, one, exp_neg_x);
1034
+ return svdiv_f32_x(pg, x, one_plus_exp_neg_x);
1035
+ }
1036
+
1037
+ #elif defined(__ARM_NEON) && defined(__aarch64__)
687
1038
 
688
1039
  // adapted from arm limited optimized routine
689
1040
  // the maximum error is 1.45358 plus 0.5 ulps
@@ -874,7 +1225,59 @@ inline static __m128 wsp_ggml_v_silu(__m128 x) {
874
1225
  return _mm_div_ps(x, one_plus_exp_neg_x);
875
1226
  }
876
1227
 
877
- #endif // __ARM_NEON / __AVX2__ / __SSE2__
1228
+ #elif defined(__riscv_v_intrinsic)
1229
+
1230
+ // adapted from arm limited optimized routine
1231
+ // the maximum error is 1.45358 plus 0.5 ulps
1232
+ // numbers above 88.38 will flush to infinity
1233
+ // numbers beneath -103.97 will flush to zero
1234
+ inline static vfloat32m2_t wsp_ggml_v_expf_m2(vfloat32m2_t x, int vl) {
1235
+ const vfloat32m2_t r = __riscv_vfmv_v_f_f32m2(0x1.8p23f, vl);
1236
+ #ifdef __riscv_xtheadvector
1237
+ // workaround for compiler bug (gcc 14.3.0: Error: unrecognized opcode `th.vmv1r.v v2,v4')
1238
+ vfloat32m2_t z = __riscv_vfadd_vf_f32m2(r, 0.0f, vl);
1239
+ z = __riscv_vfmacc_vf_f32m2(z, 0x1.715476p+0f, x, vl);
1240
+ #else
1241
+ const vfloat32m2_t z = __riscv_vfmacc_vf_f32m2(r, 0x1.715476p+0f, x, vl);
1242
+ #endif
1243
+ const vfloat32m2_t n = __riscv_vfsub_vv_f32m2(z, r, vl);
1244
+ const vfloat32m2_t b = __riscv_vfnmsac_vf_f32m2(__riscv_vfnmsac_vf_f32m2(x, 0x1.62e4p-1f, n, vl),
1245
+ 0x1.7f7d1cp-20f, n, vl);
1246
+ const vuint32m2_t e = __riscv_vsll_vx_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(z), 23, vl);
1247
+ const vfloat32m2_t k = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vadd_vx_u32m2(e, 0x3f800000, vl)); // 1.0f
1248
+ const vbool16_t c = __riscv_vmfgt_vf_f32m2_b16(__riscv_vfabs_v_f32m2(n, vl), 126.0f, vl);
1249
+ const vfloat32m2_t u = __riscv_vfmul_vv_f32m2(b, b, vl);
1250
+ const vfloat32m2_t j = __riscv_vfmacc_vv_f32m2(
1251
+ __riscv_vfmul_vf_f32m2(b, 0x1.ffffecp-1f, vl),
1252
+ __riscv_vfmacc_vv_f32m2(
1253
+ __riscv_vfmacc_vf_f32m2(__riscv_vfmv_v_f_f32m2(0x1.fffdb6p-2f, vl), 0x1.555e66p-3f, b, vl),
1254
+ __riscv_vfmacc_vf_f32m2(__riscv_vfmv_v_f_f32m2(0x1.573e2ep-5f, vl), 0x1.0e4020p-7f, b, vl),
1255
+ u, vl), u, vl);
1256
+ if (!__riscv_vcpop_m_b16(c, vl))
1257
+ return __riscv_vfmacc_vv_f32m2(k, j, k, vl);
1258
+ const vbool16_t dm = __riscv_vmfle_vf_f32m2_b16(n, 0.0f, vl);
1259
+ const vuint32m2_t d = __riscv_vmerge_vxm_u32m2(__riscv_vmv_v_x_u32m2(0, vl), 0x82000000, dm, vl);
1260
+ const vfloat32m2_t s1 = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vadd_vx_u32m2(d, 0x7f000000, vl));
1261
+ const vfloat32m2_t s2 = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vsub_vv_u32m2(e, d, vl));
1262
+ const vfloat32m2_t r1 = __riscv_vmerge_vvm_f32m2(
1263
+ __riscv_vfmacc_vv_f32m2(k, k, j, vl),
1264
+ __riscv_vfmul_vv_f32m2(__riscv_vfmacc_vv_f32m2(s2, s2, j, vl), s1, vl),
1265
+ c, vl);
1266
+ return __riscv_vmerge_vvm_f32m2(
1267
+ r1, __riscv_vfmul_vv_f32m2(s1, s1, vl),
1268
+ __riscv_vmfgt_vf_f32m2_b16(__riscv_vfabs_v_f32m2(n, vl), 192.0f, vl),
1269
+ vl);
1270
+ }
1271
+
1272
+ // computes silu x/(1+exp(-x)) in single precision vector
1273
+ inline static vfloat32m2_t wsp_ggml_v_silu_m2(vfloat32m2_t x, int vl) {
1274
+ const vfloat32m2_t neg_x = __riscv_vfneg_v_f32m2(x, vl);
1275
+ const vfloat32m2_t exp_neg_x = wsp_ggml_v_expf_m2(neg_x, vl);
1276
+ const vfloat32m2_t one_plus_exp_neg_x = __riscv_vfadd_vf_f32m2(exp_neg_x, 1.0f, vl);
1277
+ return __riscv_vfdiv_vv_f32m2(x, one_plus_exp_neg_x, vl);
1278
+ }
1279
+
1280
+ #endif // __ARM_NEON / __AVX2__ / __SSE2__ / __riscv_v_intrinsic
878
1281
 
879
1282
  inline static void wsp_ggml_vec_silu_f16(const int n, wsp_ggml_fp16_t * y, const wsp_ggml_fp16_t * x) {
880
1283
  for (int i = 0; i < n; ++i) {
@@ -953,9 +1356,49 @@ void wsp_ggml_vec_swiglu_f32(const int n, float * y, const float * x, const floa
953
1356
 
954
1357
  inline static void wsp_ggml_vec_swiglu_f16(const int n, wsp_ggml_fp16_t * y, const wsp_ggml_fp16_t * x, const wsp_ggml_fp16_t * g) {
955
1358
  for (int i = 0; i < n; ++i) {
956
- float v = WSP_GGML_CPU_FP16_TO_FP32(x[i]);
957
- float w = WSP_GGML_CPU_FP16_TO_FP32(g[i]);
958
- y[i] = WSP_GGML_CPU_FP32_TO_FP16((v/(1.0f + expf(-v))) * w);
1359
+ float xi = WSP_GGML_CPU_FP16_TO_FP32(x[i]);
1360
+ float gi = WSP_GGML_CPU_FP16_TO_FP32(g[i]);
1361
+ y[i] = WSP_GGML_CPU_FP32_TO_FP16((xi/(1.0f + expf(-xi))) * gi);
1362
+ }
1363
+ }
1364
+
1365
+ inline static void wsp_ggml_vec_geglu_erf_f32(const int n, float * y, const float * x, const float * g) {
1366
+ for (int i = 0; i < n; ++i) {
1367
+ float xi = x[i];
1368
+ y[i] = 0.5f * xi * (1.0f + erff(xi*SQRT_2_INV)) * g[i];
1369
+ }
1370
+ }
1371
+
1372
+ inline static void wsp_ggml_vec_geglu_erf_f16(const int n, wsp_ggml_fp16_t * y, const wsp_ggml_fp16_t * x, const wsp_ggml_fp16_t * g) {
1373
+ for (int i = 0; i < n; ++i) {
1374
+ float xi = WSP_GGML_CPU_FP16_TO_FP32(x[i]);
1375
+ float gi = WSP_GGML_CPU_FP16_TO_FP32(g[i]);
1376
+ y[i] = WSP_GGML_CPU_FP32_TO_FP16(0.5f * xi * (1.0f + erff(xi*SQRT_2_INV)) * gi);
1377
+ }
1378
+ }
1379
+
1380
+ #ifdef WSP_GGML_GELU_QUICK_FP16
1381
+ inline static void wsp_ggml_vec_geglu_quick_f32(const int n, float * y, const float * x, const float * g) {
1382
+ uint16_t t;
1383
+ for (int i = 0; i < n; ++i) {
1384
+ wsp_ggml_fp16_t fp16 = WSP_GGML_CPU_FP32_TO_FP16(x[i]);
1385
+ memcpy(&t, &fp16, sizeof(uint16_t));
1386
+ y[i] = WSP_GGML_CPU_FP16_TO_FP32(wsp_ggml_table_gelu_quick_f16[t]) * g[i];
1387
+ }
1388
+ }
1389
+ #else
1390
+ inline static void wsp_ggml_vec_geglu_quick_f32(const int n, float * y, const float * x, const float * g) {
1391
+ for (int i = 0; i < n; ++i) {
1392
+ y[i] = wsp_ggml_gelu_quick_f32(x[i]) * g[i];
1393
+ }
1394
+ }
1395
+ #endif
1396
+
1397
+ inline static void wsp_ggml_vec_geglu_quick_f16(const int n, wsp_ggml_fp16_t * y, const wsp_ggml_fp16_t * x, const wsp_ggml_fp16_t * g) {
1398
+ const uint16_t * i16 = (const uint16_t *) x;
1399
+ for (int i = 0; i < n; ++i) {
1400
+ float v = WSP_GGML_CPU_FP16_TO_FP32(g[i]);
1401
+ y[i] = WSP_GGML_CPU_FP32_TO_FP16(WSP_GGML_CPU_FP16_TO_FP32(wsp_ggml_table_gelu_quick_f16[i16[i]]) * v);
959
1402
  }
960
1403
  }
961
1404