cui-llama.rn 1.7.4 → 1.7.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +217 -17
- package/android/src/main/CMakeLists.txt +34 -15
- package/android/src/main/java/com/rnllama/LlamaContext.java +79 -5
- package/android/src/main/java/com/rnllama/RNLlama.java +237 -0
- package/android/src/main/jni.cpp +213 -14
- package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
- package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +35 -0
- package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +34 -0
- package/cpp/README.md +1 -1
- package/cpp/chat-parser.cpp +385 -0
- package/cpp/chat-parser.h +120 -0
- package/cpp/chat.cpp +726 -596
- package/cpp/chat.h +71 -6
- package/cpp/common.cpp +56 -38
- package/cpp/common.h +9 -3
- package/cpp/ggml-backend-reg.cpp +5 -0
- package/cpp/ggml-backend.cpp +10 -2
- package/cpp/ggml-common.h +4 -0
- package/cpp/ggml-cpu/amx/amx.cpp +1 -1
- package/cpp/ggml-cpu/amx/mmq.cpp +11 -10
- package/cpp/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
- package/cpp/ggml-cpu/arch/arm/quants.c +4114 -0
- package/cpp/ggml-cpu/arch/arm/repack.cpp +2163 -0
- package/cpp/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
- package/cpp/ggml-cpu/arch/x86/quants.c +4311 -0
- package/cpp/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
- package/cpp/ggml-cpu/arch-fallback.h +184 -0
- package/cpp/ggml-cpu/common.h +4 -3
- package/cpp/ggml-cpu/ggml-cpu-impl.h +21 -16
- package/cpp/ggml-cpu/ggml-cpu.c +123 -104
- package/cpp/ggml-cpu/ggml-cpu.cpp +11 -8
- package/cpp/ggml-cpu/ops.cpp +330 -148
- package/cpp/ggml-cpu/ops.h +1 -0
- package/cpp/ggml-cpu/quants.c +1158 -0
- package/cpp/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
- package/cpp/ggml-cpu/repack.cpp +1571 -0
- package/cpp/ggml-cpu/repack.h +98 -0
- package/cpp/ggml-cpu/simd-mappings.h +330 -38
- package/cpp/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
- package/cpp/ggml-cpu/vec.cpp +87 -18
- package/cpp/ggml-cpu/vec.h +249 -94
- package/cpp/ggml-cpu.h +1 -0
- package/cpp/ggml-impl.h +63 -183
- package/cpp/ggml-llama-sim.metallib +0 -0
- package/cpp/ggml-llama.metallib +0 -0
- package/cpp/ggml-metal.m +152 -45
- package/cpp/ggml-quants.c +0 -2
- package/cpp/ggml.c +61 -21
- package/cpp/ggml.h +22 -3
- package/cpp/gguf.cpp +24 -3
- package/cpp/json-partial.cpp +256 -0
- package/cpp/json-partial.h +38 -0
- package/cpp/json-schema-to-grammar.cpp +5 -47
- package/cpp/json-schema-to-grammar.h +4 -4
- package/cpp/llama-arch.cpp +153 -3
- package/cpp/llama-arch.h +27 -1
- package/cpp/llama-batch.cpp +741 -272
- package/cpp/llama-batch.h +112 -54
- package/cpp/llama-chat.cpp +30 -8
- package/cpp/llama-chat.h +1 -0
- package/cpp/llama-context.cpp +524 -339
- package/cpp/llama-context.h +38 -17
- package/cpp/llama-cparams.cpp +4 -0
- package/cpp/llama-cparams.h +2 -0
- package/cpp/llama-grammar.cpp +12 -2
- package/cpp/llama-graph.cpp +431 -356
- package/cpp/llama-graph.h +126 -58
- package/cpp/llama-hparams.cpp +10 -2
- package/cpp/llama-hparams.h +19 -2
- package/cpp/llama-kv-cache-unified-iswa.cpp +279 -0
- package/cpp/llama-kv-cache-unified-iswa.h +128 -0
- package/cpp/llama-kv-cache-unified.cpp +1841 -0
- package/cpp/llama-kv-cache-unified.h +303 -0
- package/cpp/llama-kv-cells.h +439 -0
- package/cpp/llama-memory-hybrid.cpp +246 -0
- package/cpp/llama-memory-hybrid.h +138 -0
- package/cpp/llama-memory-recurrent.cpp +1112 -0
- package/cpp/llama-memory-recurrent.h +183 -0
- package/cpp/llama-memory.cpp +41 -0
- package/cpp/llama-memory.h +86 -5
- package/cpp/llama-mmap.cpp +1 -1
- package/cpp/llama-model-loader.cpp +42 -17
- package/cpp/llama-model-saver.cpp +1 -0
- package/cpp/llama-model.cpp +1639 -513
- package/cpp/llama-model.h +26 -0
- package/cpp/llama-sampling.cpp +2 -2
- package/cpp/llama-vocab.cpp +65 -28
- package/cpp/llama-vocab.h +1 -0
- package/cpp/llama.cpp +11 -7
- package/cpp/llama.h +150 -42
- package/cpp/minja/chat-template.hpp +1 -1
- package/cpp/minja/minja.hpp +1 -1
- package/cpp/{json.hpp → nlohmann/json.hpp} +3027 -2267
- package/cpp/nlohmann/json_fwd.hpp +187 -0
- package/cpp/regex-partial.cpp +204 -0
- package/cpp/regex-partial.h +56 -0
- package/cpp/rn-llama.cpp +646 -35
- package/cpp/rn-llama.h +32 -1
- package/cpp/rn-tts.h +39 -0
- package/cpp/sampling.cpp +7 -8
- package/cpp/tools/mtmd/clip-impl.h +5 -0
- package/cpp/tools/mtmd/clip.cpp +572 -436
- package/cpp/tools/mtmd/clip.h +14 -4
- package/cpp/tools/mtmd/mtmd-audio.cpp +0 -86
- package/cpp/tools/mtmd/mtmd-audio.h +2 -17
- package/cpp/tools/mtmd/mtmd-helper.cpp +175 -12
- package/cpp/tools/mtmd/mtmd-helper.h +91 -0
- package/cpp/tools/mtmd/mtmd.cpp +368 -248
- package/cpp/tools/mtmd/mtmd.h +6 -70
- package/cpp/unicode.cpp +5 -0
- package/ios/CMakeLists.txt +26 -6
- package/ios/RNLlama.h +1 -1
- package/ios/RNLlama.mm +153 -3
- package/ios/RNLlamaContext.h +9 -1
- package/ios/RNLlamaContext.mm +112 -9
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat-parser.h +120 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat.h +71 -6
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/common.h +9 -3
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-common.h +4 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-impl.h +63 -183
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml.h +22 -3
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json-partial.h +38 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-arch.h +27 -1
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-batch.h +112 -54
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-chat.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-context.h +38 -17
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-cparams.h +2 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-graph.h +126 -58
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-hparams.h +19 -2
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cells.h +439 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory.h +86 -5
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model.h +26 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-vocab.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama.h +150 -42
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/minja.hpp +1 -1
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/{json.hpp → nlohmann/json.hpp} +3027 -2267
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/regex-partial.h +56 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-llama.h +32 -1
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-tts.h +39 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/ggml-llama.metallib +0 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/rnllama +0 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat-parser.h +120 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +71 -6
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +9 -3
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-common.h +4 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +63 -183
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +22 -3
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/json-partial.h +38 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +27 -1
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +112 -54
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +38 -17
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +2 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +126 -58
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +19 -2
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cells.h +439 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +86 -5
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +26 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +150 -42
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +1 -1
- package/ios/rnllama.xcframework/{tvos-arm64/rnllama.framework/Headers → ios-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann}/json.hpp +3027 -2267
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/regex-partial.h +56 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +32 -1
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-tts.h +39 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat-parser.h +120 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat.h +71 -6
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/common.h +9 -3
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-common.h +4 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-impl.h +63 -183
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml.h +22 -3
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/json-partial.h +38 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-arch.h +27 -1
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-batch.h +112 -54
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-chat.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-context.h +38 -17
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-cparams.h +2 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-graph.h +126 -58
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-hparams.h +19 -2
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cells.h +439 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory.h +86 -5
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model.h +26 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-vocab.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama.h +150 -42
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/minja.hpp +1 -1
- package/ios/rnllama.xcframework/{ios-arm64_x86_64-simulator/rnllama.framework/Headers → tvos-arm64/rnllama.framework/Headers/nlohmann}/json.hpp +3027 -2267
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/regex-partial.h +56 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-llama.h +32 -1
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-tts.h +39 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/ggml-llama.metallib +0 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/rnllama +0 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat-parser.h +120 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +71 -6
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +9 -3
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-common.h +4 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +63 -183
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +22 -3
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json-partial.h +38 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +27 -1
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +112 -54
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +38 -17
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +2 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +126 -58
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +19 -2
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cells.h +439 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +86 -5
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +26 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +150 -42
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +1 -1
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json.hpp +25526 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/regex-partial.h +56 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +32 -1
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-tts.h +39 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
- package/jest/mock.js +24 -0
- package/package.json +1 -1
- package/src/NativeRNLlama.ts +46 -2
- package/src/index.ts +105 -1
- package/cpp/ggml-cpu/ggml-cpu-aarch64.h +0 -8
- package/cpp/ggml-cpu/ggml-cpu-quants.c +0 -13326
- package/cpp/ggml-cpu/sgemm.cpp +0 -3544
- package/cpp/ggml-cpu/sgemm.h +0 -14
- package/cpp/llama-kv-cache.cpp +0 -2827
- package/cpp/llama-kv-cache.h +0 -515
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache.h +0 -515
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +0 -515
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache.h +0 -515
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json.hpp +0 -24766
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +0 -515
- /package/cpp/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
- /package/cpp/tools/mtmd/{miniaudio.h → miniaudio/miniaudio.h} +0 -0
- /package/cpp/tools/mtmd/{stb_image.h → stb/stb_image.h} +0 -0
package/cpp/ggml-cpu/vec.h
CHANGED
@@ -5,6 +5,7 @@
 #include "ggml-impl.h"
 #include "simd-mappings.h"
 #include "ggml.h"
+#include "ggml-cpu.h"
 
 #if defined(LM_GGML_USE_ACCELERATE)
 #include <Accelerate/Accelerate.h>
@@ -57,7 +58,7 @@ inline static void lm_ggml_vec_set_bf16(const int n, lm_ggml_bf16_t * x, const l
 inline static void lm_ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; }
 inline static void lm_ggml_vec_add_f16 (const int n, lm_ggml_fp16_t * z, const lm_ggml_fp16_t * x, const lm_ggml_fp16_t * y) {
     for (int i = 0; i < n; ++i) {
-        z[i] =
+        z[i] = LM_GGML_CPU_FP32_TO_FP16(LM_GGML_CPU_FP16_TO_FP32(x[i]) + LM_GGML_CPU_FP16_TO_FP32(y[i]));
     }
 }
 inline static void lm_ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) { for (int i = 0; i < n; ++i) z[i] = x[i] + v; }
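Note: every f16 hunk in this file applies the same mechanical change — the kernels widen operands to FP32, compute, and narrow the result back, with the conversions now routed through the CPU backend's LM_GGML_CPU_FP16_TO_FP32 / LM_GGML_CPU_FP32_TO_FP16 macros. A minimal sketch of that widen-compute-narrow pattern, using hypothetical normal-range-only helpers as stand-ins for the real macros (which also handle subnormals, inf/NaN and rounding, and may use hardware conversions or lookup tables):

#include <stdint.h>
#include <string.h>

static float h2f(uint16_t h) {                           /* fp16 -> fp32, normals only */
    uint32_t bits = ((uint32_t)(h & 0x8000) << 16)       /* sign */
                  | ((((h >> 10) & 0x1Fu) + 112u) << 23) /* rebias exponent 15 -> 127 */
                  | ((uint32_t)(h & 0x3FF) << 13);       /* widen mantissa 10 -> 23 bits */
    float f;
    memcpy(&f, &bits, sizeof f);
    return f;
}

static uint16_t f2h(float f) {                           /* fp32 -> fp16, truncating */
    uint32_t bits;
    memcpy(&bits, &f, sizeof bits);
    return (uint16_t)(((bits >> 16) & 0x8000u)
                    | ((((bits >> 23) & 0xFFu) - 112u) << 10)
                    | ((bits >> 13) & 0x3FFu));
}

/* the shape of the new lm_ggml_vec_add_f16 body: */
static void vec_add_h(int n, uint16_t *z, const uint16_t *x, const uint16_t *y) {
    for (int i = 0; i < n; ++i) {
        z[i] = f2h(h2f(x[i]) + h2f(y[i]));
    }
}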
@@ -66,7 +67,7 @@ inline static void lm_ggml_vec_acc1_f32(const int n, float * y, const float v)
 inline static void lm_ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; }
 inline static void lm_ggml_vec_sub_f16 (const int n, lm_ggml_fp16_t * z, const lm_ggml_fp16_t * x, const lm_ggml_fp16_t * y) {
     for (int i = 0; i < n; ++i) {
-        z[i] =
+        z[i] = LM_GGML_CPU_FP32_TO_FP16(LM_GGML_CPU_FP16_TO_FP32(x[i]) - LM_GGML_CPU_FP16_TO_FP32(y[i]));
     }
 }
 inline static void lm_ggml_vec_set_f32 (const int n, float * x, const float v) { for (int i = 0; i < n; ++i) x[i] = v; }
@@ -74,20 +75,20 @@ inline static void lm_ggml_vec_cpy_f32 (const int n, float * y, const float * x)
 inline static void lm_ggml_vec_neg_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = -x[i]; }
 inline static void lm_ggml_vec_neg_f16 (const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] =
+        y[i] = LM_GGML_CPU_FP32_TO_FP16(-LM_GGML_CPU_FP16_TO_FP32(x[i]));
     }
 }
 
 inline static void lm_ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; }
 inline static void lm_ggml_vec_mul_f16 (const int n, lm_ggml_fp16_t * z, const lm_ggml_fp16_t * x, const lm_ggml_fp16_t * y) {
     for (int i = 0; i < n; ++i) {
-        z[i] =
+        z[i] = LM_GGML_CPU_FP32_TO_FP16(LM_GGML_CPU_FP16_TO_FP32(x[i]) * LM_GGML_CPU_FP16_TO_FP32(y[i]));
     }
 }
 inline static void lm_ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; }
 inline static void lm_ggml_vec_div_f16 (const int n, lm_ggml_fp16_t * z, const lm_ggml_fp16_t * x, const lm_ggml_fp16_t * y) {
     for (int i = 0; i < n; ++i) {
-        z[i] =
+        z[i] = LM_GGML_CPU_FP32_TO_FP16(LM_GGML_CPU_FP16_TO_FP32(x[i]) / LM_GGML_CPU_FP16_TO_FP32(y[i]));
     }
 }
 
@@ -130,13 +131,13 @@ inline static void lm_ggml_vec_dot_f16_unroll(const int n, const int xs, float *
     // leftovers
     for (int i = np; i < n; ++i) {
         for (int j = 0; j < LM_GGML_VEC_DOT_UNROLL; ++j) {
-            sumf[j] += (lm_ggml_float)(
+            sumf[j] += (lm_ggml_float)(LM_GGML_CPU_FP16_TO_FP32(x[j][i])*LM_GGML_CPU_FP16_TO_FP32(y[i]));
         }
     }
 #else
     for (int i = 0; i < n; ++i) {
         for (int j = 0; j < LM_GGML_VEC_DOT_UNROLL; ++j) {
-            sumf[j] += (lm_ggml_float)(
+            sumf[j] += (lm_ggml_float)(LM_GGML_CPU_FP16_TO_FP32(x[j][i])*LM_GGML_CPU_FP16_TO_FP32(y[i]));
         }
     }
 #endif
@@ -148,27 +149,108 @@ inline static void lm_ggml_vec_dot_f16_unroll(const int n, const int xs, float *
 
 inline static void lm_ggml_vec_mad_f32(const int n, float * LM_GGML_RESTRICT y, const float * LM_GGML_RESTRICT x, const float v) {
 #if defined(LM_GGML_SIMD)
-
+    #if defined(__ARM_FEATURE_SVE)
 
-
+        const int sve_register_length = lm_ggml_cpu_get_sve_cnt() * 8;
+        const int lm_ggml_f32_epr = sve_register_length / 32;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16
+        const int lm_ggml_f32_step = 8 * lm_ggml_f32_epr; // choose 8 SVE registers
+        LM_GGML_F32_VEC vx = LM_GGML_F32_VEC_SET1(v);
 
-
-
+        const int np = (n & ~(lm_ggml_f32_step - 1));
+        svfloat32_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
+        svfloat32_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
+        for (int i = 0; i < np; i += lm_ggml_f32_step) {
 
-
-
-
-            ay[j] = LM_GGML_F32_VEC_LOAD(y + i + j*LM_GGML_F32_EPR);
-            ay[j] = LM_GGML_F32_VEC_FMA(ay[j], ax[j], vx);
+            ax1 = LM_GGML_F32_VEC_LOAD(x + i);
+            ay1 = LM_GGML_F32_VEC_LOAD(y + i);
+            ay1 = LM_GGML_F32_VEC_FMA(ax1, vx, ay1);
 
-            LM_GGML_F32_VEC_STORE(y + i
+            LM_GGML_F32_VEC_STORE(y + i, ay1);
+
+            ax2 = LM_GGML_F32_VEC_LOAD(x + i + 1*lm_ggml_f32_epr);
+            ay2 = LM_GGML_F32_VEC_LOAD(y + i + 1*lm_ggml_f32_epr);
+            ay2 = LM_GGML_F32_VEC_FMA(ax2, vx, ay2);
+
+            LM_GGML_F32_VEC_STORE(y + i + 1*lm_ggml_f32_epr, ay2);
+
+            ax3 = LM_GGML_F32_VEC_LOAD(x + i + 2*lm_ggml_f32_epr);
+            ay3 = LM_GGML_F32_VEC_LOAD(y + i + 2*lm_ggml_f32_epr);
+            ay3 = LM_GGML_F32_VEC_FMA(ax3, vx, ay3);
+
+            LM_GGML_F32_VEC_STORE(y + i + 2*lm_ggml_f32_epr, ay3);
+
+            ax4 = LM_GGML_F32_VEC_LOAD(x + i + 3*lm_ggml_f32_epr);
+            ay4 = LM_GGML_F32_VEC_LOAD(y + i + 3*lm_ggml_f32_epr);
+            ay4 = LM_GGML_F32_VEC_FMA(ax4, vx, ay4);
+
+            LM_GGML_F32_VEC_STORE(y + i + 3*lm_ggml_f32_epr, ay4);
+
+            ax5 = LM_GGML_F32_VEC_LOAD(x + i + 4*lm_ggml_f32_epr);
+            ay5 = LM_GGML_F32_VEC_LOAD(y + i + 4*lm_ggml_f32_epr);
+            ay5 = LM_GGML_F32_VEC_FMA(ax5, vx, ay5);
+
+            LM_GGML_F32_VEC_STORE(y + i + 4*lm_ggml_f32_epr, ay5);
+
+            ax6 = LM_GGML_F32_VEC_LOAD(x + i + 5*lm_ggml_f32_epr);
+            ay6 = LM_GGML_F32_VEC_LOAD(y + i + 5*lm_ggml_f32_epr);
+            ay6 = LM_GGML_F32_VEC_FMA(ax6, vx, ay6);
+
+            LM_GGML_F32_VEC_STORE(y + i + 5*lm_ggml_f32_epr, ay6);
+
+            ax7 = LM_GGML_F32_VEC_LOAD(x + i + 6*lm_ggml_f32_epr);
+            ay7 = LM_GGML_F32_VEC_LOAD(y + i + 6*lm_ggml_f32_epr);
+            ay7 = LM_GGML_F32_VEC_FMA(ax7, vx, ay7);
+
+            LM_GGML_F32_VEC_STORE(y + i + 6*lm_ggml_f32_epr, ay7);
+
+            ax8 = LM_GGML_F32_VEC_LOAD(x + i + 7*lm_ggml_f32_epr);
+            ay8 = LM_GGML_F32_VEC_LOAD(y + i + 7*lm_ggml_f32_epr);
+            ay8 = LM_GGML_F32_VEC_FMA(ax8, vx, ay8);
+
+            LM_GGML_F32_VEC_STORE(y + i + 7*lm_ggml_f32_epr, ay8);
         }
-
+        // leftovers
+        // Since 8 unrolls are done in above loop, leftovers lie in range [0, lm_ggml_f32_step] which is handled in below loop
+        const int np2 = (n & ~(lm_ggml_f32_epr - 1));
+        for (int i = np; i < np2; i += lm_ggml_f32_epr) {
+            ax1 = LM_GGML_F32_VEC_LOAD(x + i);
+            ay1 = LM_GGML_F32_VEC_LOAD(y + i);
+            ay1 = LM_GGML_F32_VEC_FMA(ax1, vx, ay1);
+
+            LM_GGML_F32_VEC_STORE(y + i, ay1);
+        }
+        // maximum number of leftover elements will be less that lm_ggml_f32_epr. Apply predicated svmad on available elements only
+        if (np2 < n) {
+            svbool_t pg =svwhilelt_b32(np2, n);
+            ax1 = svld1_f32(pg, x + np2);
+            ay1 = svld1_f32(pg, y + np2);
+            ay1 = svmad_f32_m(pg, ax1, vx, ay1);
+
+            svst1_f32(pg, y + np2, ay1);
+        }
+    #else
+        const int np = (n & ~(LM_GGML_F32_STEP - 1));
 
-
-
-
-
+        LM_GGML_F32_VEC vx = LM_GGML_F32_VEC_SET1(v);
+
+        LM_GGML_F32_VEC ax[LM_GGML_F32_ARR];
+        LM_GGML_F32_VEC ay[LM_GGML_F32_ARR];
+
+        for (int i = 0; i < np; i += LM_GGML_F32_STEP) {
+            for (int j = 0; j < LM_GGML_F32_ARR; j++) {
+                ax[j] = LM_GGML_F32_VEC_LOAD(x + i + j*LM_GGML_F32_EPR);
+                ay[j] = LM_GGML_F32_VEC_LOAD(y + i + j*LM_GGML_F32_EPR);
+                ay[j] = LM_GGML_F32_VEC_FMA(ay[j], ax[j], vx);
+
+                LM_GGML_F32_VEC_STORE(y + i + j*LM_GGML_F32_EPR, ay[j]);
+            }
+        }
+
+        // leftovers
+        for (int i = np; i < n; ++i) {
+            y[i] += x[i]*v;
+        }
+    #endif
 #else
     // scalar
     for (int i = 0; i < n; ++i) {
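Note: the new SVE path above is vector-length-agnostic — lm_ggml_f32_epr is derived from the runtime register size via lm_ggml_cpu_get_sve_cnt() (which is why vec.h now includes ggml-cpu.h), the main loop consumes 8 SVE registers per iteration, and the final partial vector is handled with a governing predicate instead of a scalar loop. A minimal sketch of that predicated-tail idea (assumes an AArch64 toolchain with SVE enabled; axpy_f32_sve is an illustrative name, not part of the package), collapsed to a single whilelt-driven loop:

#include <arm_sve.h>

void axpy_f32_sve(int n, float *y, const float *x, float a) {
    const svfloat32_t va = svdup_n_f32(a);
    for (int i = 0; i < n; i += (int) svcntw()) {
        svbool_t pg = svwhilelt_b32(i, n);      // active lanes: those with i + lane < n
        svfloat32_t vx = svld1_f32(pg, x + i);  // predicated load, no out-of-bounds reads
        svfloat32_t vy = svld1_f32(pg, y + i);
        vy = svmla_f32_m(pg, vy, vx, va);       // y += a*x on active lanes only
        svst1_f32(pg, y + i, vy);               // predicated store
    }
}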
@@ -198,12 +280,12 @@ inline static void lm_ggml_vec_mad_f16(const int n, lm_ggml_fp16_t * LM_GGML_RES
 
     // leftovers
     for (int i = np; i < n; ++i) {
-        y[i] =
+        y[i] = LM_GGML_CPU_FP32_TO_FP16(LM_GGML_CPU_FP16_TO_FP32(y[i]) + LM_GGML_CPU_FP16_TO_FP32(x[i])*v);
     }
 #else
     // scalar
     for (int i = 0; i < n; ++i) {
-        y[i] =
+        y[i] = LM_GGML_CPU_FP32_TO_FP16(LM_GGML_CPU_FP16_TO_FP32(y[i]) + LM_GGML_CPU_FP16_TO_FP32(x[i])*v);
     }
 #endif
 }
@@ -220,36 +302,45 @@ inline static void lm_ggml_vec_mad_f32_unroll(const int n, const int xs, const i
     }
 
 #if defined(LM_GGML_SIMD)
-
+    #if defined(__ARM_FEATURE_SVE)
+        // scalar Route to scalar implementation //TODO: Write SVE code
+        for (int k = 0; k < LM_GGML_VEC_MAD_UNROLL; ++k) {
+            for (int i = 0; i < n; ++i) {
+                y[i] += x[k][i]*v[k][0];
+            }
+        }
+    #else
+        const int np = (n & ~(LM_GGML_F32_STEP - 1));
 
-
+        LM_GGML_F32_VEC vx[LM_GGML_VEC_MAD_UNROLL];
 
-
-
-
+        for (int k = 0; k < LM_GGML_VEC_MAD_UNROLL; ++k) {
+            vx[k] = LM_GGML_F32_VEC_SET1(v[k][0]);
+        }
 
-
-
+        LM_GGML_F32_VEC ax[LM_GGML_VEC_MAD_UNROLL][LM_GGML_F32_ARR];
+        LM_GGML_F32_VEC ay[LM_GGML_F32_ARR];
 
-
-
-
+        for (int i = 0; i < np; i += LM_GGML_F32_STEP) {
+            for (int j = 0; j < LM_GGML_F32_ARR; j++) {
+                ay[j] = LM_GGML_F32_VEC_LOAD(y + i + j*LM_GGML_F32_EPR);
 
-
-
-
-
+                for (int k = 0; k < LM_GGML_VEC_MAD_UNROLL; ++k) {
+                    ax[k][j] = LM_GGML_F32_VEC_LOAD(x[k] + i + j*LM_GGML_F32_EPR);
+                    ay[j] = LM_GGML_F32_VEC_FMA(ay[j], ax[k][j], vx[k]);
+                }
 
-
+                LM_GGML_F32_VEC_STORE(y + i + j*LM_GGML_F32_EPR, ay[j]);
+            }
         }
-    }
 
-
-
-
-
+        // leftovers
+        for (int k = 0; k < LM_GGML_VEC_MAD_UNROLL; ++k) {
+            for (int i = np; i < n; ++i) {
+                y[i] += x[k][i]*v[k][0];
+            }
         }
-
+    #endif
 #else
     // scalar
     for (int k = 0; k < LM_GGML_VEC_MAD_UNROLL; ++k) {
@@ -265,25 +356,53 @@ inline static void lm_ggml_vec_scale_f32(const int n, float * y, const float   v
 #if defined(LM_GGML_USE_ACCELERATE)
     vDSP_vsmul(y, 1, &v, y, 1, n);
 #elif defined(LM_GGML_SIMD)
-
+    #if defined(__ARM_FEATURE_SVE)
+        const int sve_register_length = lm_ggml_cpu_get_sve_cnt() * 8;
+        const int lm_ggml_f32_epr = sve_register_length / 32;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16
+        const int lm_ggml_f32_step = 2 * lm_ggml_f32_epr;
+
+        LM_GGML_F32_VEC vx = LM_GGML_F32_VEC_SET1(v);
+        const int np = (n & ~(lm_ggml_f32_step - 1));
+        svfloat32_t ay1;
+        svfloat32_t ay2;
+        for (int i = 0; i < np; i += lm_ggml_f32_step) {
+            ay1 = LM_GGML_F32_VEC_LOAD(y + i);
+            ay1 = LM_GGML_F32_VEC_MUL(ay1, vx);
+            LM_GGML_F32_VEC_STORE(y + i, ay1);
+
+            ay2 = LM_GGML_F32_VEC_LOAD(y + i + 1*lm_ggml_f32_epr);
+            ay2 = LM_GGML_F32_VEC_MUL(ay2, vx);
+            LM_GGML_F32_VEC_STORE(y + i + 1*lm_ggml_f32_epr, ay2);
+        }
+        // leftovers
+        // maximum number of leftover elements will be less that lm_ggml_f32_epr. Apply predicated svmad on available elements only
+        if (np < n) {
+            svbool_t pg = svwhilelt_b32(np, n);
+            ay1 = svld1_f32(pg, y + np);
+            ay1 = svmul_f32_m(pg, ay1, vx);
+            svst1_f32(pg, y + np, ay1);
+        }
+    #else
+        const int np = (n & ~(LM_GGML_F32_STEP - 1));
 
-
+        LM_GGML_F32_VEC vx = LM_GGML_F32_VEC_SET1(v);
 
-
+        LM_GGML_F32_VEC ay[LM_GGML_F32_ARR];
 
-
-
-
-
+        for (int i = 0; i < np; i += LM_GGML_F32_STEP) {
+            for (int j = 0; j < LM_GGML_F32_ARR; j++) {
+                ay[j] = LM_GGML_F32_VEC_LOAD(y + i + j*LM_GGML_F32_EPR);
+                ay[j] = LM_GGML_F32_VEC_MUL(ay[j], vx);
 
-
+                LM_GGML_F32_VEC_STORE(y + i + j*LM_GGML_F32_EPR, ay[j]);
+            }
         }
-    }
 
-
-
-
-
+        // leftovers
+        for (int i = np; i < n; ++i) {
+            y[i] *= v;
+        }
+    #endif
 #else
     // scalar
     for (int i = 0; i < n; ++i) {
@@ -311,12 +430,12 @@ inline static void lm_ggml_vec_scale_f16(const int n, lm_ggml_fp16_t * y, const
 
     // leftovers
     for (int i = np; i < n; ++i) {
-        y[i] =
+        y[i] = LM_GGML_CPU_FP32_TO_FP16(LM_GGML_CPU_FP16_TO_FP32(y[i])*v);
     }
 #else
     // scalar
     for (int i = 0; i < n; ++i) {
-        y[i] =
+        y[i] = LM_GGML_CPU_FP32_TO_FP16(LM_GGML_CPU_FP16_TO_FP32(y[i])*v);
     }
 #endif
 }
@@ -325,103 +444,103 @@ inline static void lm_ggml_vec_norm_f32 (const int n, float * s, const float * x
 inline static void lm_ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; }
 inline static void lm_ggml_vec_sqr_f16 (const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        float v =
-        y[i] =
+        float v = LM_GGML_CPU_FP16_TO_FP32(x[i]);
+        y[i] = LM_GGML_CPU_FP32_TO_FP16(v*v);
     }
 }
 inline static void lm_ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); }
 inline static void lm_ggml_vec_sqrt_f16 (const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] =
+        y[i] = LM_GGML_CPU_FP32_TO_FP16(sqrtf(LM_GGML_CPU_FP16_TO_FP32(x[i])));
     }
 }
 inline static void lm_ggml_vec_log_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); }
 inline static void lm_ggml_vec_log_f16 (const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] =
+        y[i] = LM_GGML_CPU_FP32_TO_FP16(logf(LM_GGML_CPU_FP16_TO_FP32(x[i])));
     }
 }
 inline static void lm_ggml_vec_sin_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sinf(x[i]); }
 inline static void lm_ggml_vec_sin_f16 (const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] =
+        y[i] = LM_GGML_CPU_FP32_TO_FP16(sinf(LM_GGML_CPU_FP16_TO_FP32(x[i])));
     }
 }
 inline static void lm_ggml_vec_cos_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = cosf(x[i]); }
 inline static void lm_ggml_vec_cos_f16 (const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] =
+        y[i] = LM_GGML_CPU_FP32_TO_FP16(cosf(LM_GGML_CPU_FP16_TO_FP32(x[i])));
     }
 }
 inline static void lm_ggml_vec_abs_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); }
 inline static void lm_ggml_vec_abs_f16 (const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] =
+        y[i] = LM_GGML_CPU_FP32_TO_FP16(fabsf(LM_GGML_CPU_FP16_TO_FP32(x[i])));
     }
 }
 inline static void lm_ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); }
 inline static void lm_ggml_vec_sgn_f16 (const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        float v =
-        y[i] =
+        float v = LM_GGML_CPU_FP16_TO_FP32(x[i]);
+        y[i] = LM_GGML_CPU_FP32_TO_FP16((v > 0.f) ? 1.f : ((v < 0.f) ? -1.f : 0.f));
     }
 }
 inline static void lm_ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; }
 inline static void lm_ggml_vec_step_f16 (const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] =
+        y[i] = LM_GGML_CPU_FP32_TO_FP16((LM_GGML_CPU_FP16_TO_FP32(x[i]) > 0.f) ? 1.f : 0.f);
     }
 }
 inline static void lm_ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); }
 inline static void lm_ggml_vec_tanh_f16 (const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] =
+        y[i] = LM_GGML_CPU_FP32_TO_FP16(tanhf(LM_GGML_CPU_FP16_TO_FP32(x[i])));
     }
 }
 inline static void lm_ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); }
 inline static void lm_ggml_vec_elu_f16 (const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] =
+        y[i] = LM_GGML_CPU_FP32_TO_FP16(expm1f(LM_GGML_CPU_FP16_TO_FP32(x[i])));
     }
 }
 inline static void lm_ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
 inline static void lm_ggml_vec_relu_f16 (const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        float v =
-        y[i] =
+        float v = LM_GGML_CPU_FP16_TO_FP32(x[i]);
+        y[i] = LM_GGML_CPU_FP32_TO_FP16((v > 0.f) ? v : 0.f);
     }
 }
 inline static void lm_ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
 inline static void lm_ggml_vec_leaky_relu_f16 (const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x, const float ns) {
     for (int i = 0; i < n; ++i) {
-        float v =
-        y[i] =
+        float v = LM_GGML_CPU_FP16_TO_FP32(x[i]);
+        y[i] = LM_GGML_CPU_FP32_TO_FP16(((v > 0.f) ? v : 0.f) + ns * ((v < 0.0f) ? v : 0.f));
     }
 }
 inline static void lm_ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); }
 inline static void lm_ggml_vec_sigmoid_f16 (const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] =
+        y[i] = LM_GGML_CPU_FP32_TO_FP16(1.f / (1.f + expf(-LM_GGML_CPU_FP16_TO_FP32(x[i]))));
     }
 }
 // TODO: optimize performance
 inline static void lm_ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
 inline static void lm_ggml_vec_hardswish_f16 (const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        float v =
-        y[i] =
+        float v = LM_GGML_CPU_FP16_TO_FP32(x[i]);
+        y[i] = LM_GGML_CPU_FP32_TO_FP16(v * fminf(1.0f, fmaxf(0.0f, (v + 3.0f) / 6.0f)));
     }
 }
 inline static void lm_ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
 inline static void lm_ggml_vec_hardsigmoid_f16 (const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] =
+        y[i] = LM_GGML_CPU_FP32_TO_FP16(fminf(1.0f, fmaxf(0.0f, (LM_GGML_CPU_FP16_TO_FP32(x[i]) + 3.0f) / 6.0f)));
     }
 }
 inline static void lm_ggml_vec_exp_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = expf(x[i]); }
 inline static void lm_ggml_vec_exp_f16 (const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] =
+        y[i] = LM_GGML_CPU_FP32_TO_FP16(expf(LM_GGML_CPU_FP16_TO_FP32(x[i])));
     }
 }
 
@@ -443,9 +562,9 @@ inline static void lm_ggml_vec_gelu_f16(const int n, lm_ggml_fp16_t * y, const l
 
 inline static void lm_ggml_vec_gelu_erf_f16(const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        float xi =
+        float xi = LM_GGML_CPU_FP16_TO_FP32(x[i]);
         float res = 0.5f*xi*(1.0f + erff(xi*SQRT_2_INV));
-        y[i] =
+        y[i] = LM_GGML_CPU_FP32_TO_FP16(res);
     }
 }
 
@@ -458,9 +577,9 @@ inline static void lm_ggml_vec_gelu_f32(const int n, float * y, const float * x)
         } else if (x[i] >= 10.0f) {
             y[i] = x[i];
         } else {
-            lm_ggml_fp16_t fp16 =
+            lm_ggml_fp16_t fp16 = LM_GGML_CPU_FP32_TO_FP16(x[i]);
             memcpy(&t, &fp16, sizeof(uint16_t));
-            y[i] =
+            y[i] = LM_GGML_CPU_FP16_TO_FP32(lm_ggml_table_gelu_f16[t]);
         }
     }
 }
@@ -494,9 +613,9 @@ inline static float lm_ggml_gelu_quick_f32(float x) {
 inline static void lm_ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
     uint16_t t;
     for (int i = 0; i < n; ++i) {
-        lm_ggml_fp16_t fp16 =
+        lm_ggml_fp16_t fp16 = LM_GGML_CPU_FP32_TO_FP16(x[i]);
         memcpy(&t, &fp16, sizeof(uint16_t));
-        y[i] =
+        y[i] = LM_GGML_CPU_FP16_TO_FP32(lm_ggml_table_gelu_quick_f16[t]);
     }
 }
 #else
@@ -509,8 +628,8 @@ inline static void lm_ggml_vec_gelu_quick_f32(const int n, float * y, const floa
 
 inline static void lm_ggml_vec_gelu_quick_f16(const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        float v =
-        y[i] =
+        float v = LM_GGML_CPU_FP16_TO_FP32(x[i]);
+        y[i] = LM_GGML_CPU_FP32_TO_FP16(v*(1.0f/(1.0f+expf(GELU_QUICK_COEF*v))));
     }
 }
 
@@ -519,8 +638,8 @@ inline static float lm_ggml_silu_f32(float x) {
     return x/(1.0f + expf(-x));
 }
 inline static lm_ggml_fp16_t lm_ggml_silu_f16(lm_ggml_fp16_t x) {
-    float v =
-    return
+    float v = LM_GGML_CPU_FP16_TO_FP32(x);
+    return LM_GGML_CPU_FP32_TO_FP16(v/(1.0f + expf(-v)));
 }
 
 #if __FINITE_MATH_ONLY__
@@ -528,6 +647,42 @@ inline static lm_ggml_fp16_t lm_ggml_silu_f16(lm_ggml_fp16_t x) {
 #error "ref: https://github.com/ggml-org/llama.cpp/pull/7154#issuecomment-2143844461"
 #endif
 
+/* Below function was borrowed from the GitHub repository:
+https://github.com/openvinotoolkit/openvino/blob/master/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/common.hpp */
+#if defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
+inline static svfloat32_t exp_ps_sve(svbool_t pg, svfloat32_t src) {
+    // Constants
+    const svfloat32_t log2_e = svdup_n_f32(1.4426950409f);
+    const svfloat32_t ln2 = svdup_n_f32(0.6931473921f);
+    const svfloat32_t half_ln2_sq = svdup_n_f32(0.2413862043f);
+    const svuint32_t not_mask17 = svdup_n_u32(~((1u << 17) - 1));
+    const svfloat32_t one = svdup_n_f32(1.0f);
+    const svfloat32_t inactive1 = svdup_n_f32(0.0f);
+    const svint32_t inactive2 = svdup_n_s32(0);
+
+    // Algorithm starts here
+    svfloat32_t t0 = svmul_f32_m(pg, src, log2_e);      // y = x * log2(e)
+    svfloat32_t t1 = svrintm_f32_m(inactive1, pg, t0);  // rount to int (float)
+    svint32_t t2 = svcvt_s32_f32_m(inactive2, pg, t1);  // n
+
+    t1 = svsub_f32_m(pg, t0, t1);  // a = y - floor(y)
+    t1 = svadd_f32_m(pg, t1, one); // b = a + 1
+
+    svuint32_t t3 = svlsr_n_u32_m(pg, svreinterpret_u32_f32(t1), 17);  // v = b >> 17 (u32)
+    svfloat32_t t4 = svexpa_f32(t3);                                   // c = fexpa(v)
+    t4 = svscale_f32_m(pg, t4, t2);                                    // fexpa(v) * 2^(n)
+
+    // and_(t2.d, t1.d, not_mask17.d)
+    svfloat32_t t5 = svreinterpret_f32_u32(svand_u32_m(pg, svreinterpret_u32_f32(t1), not_mask17));
+    t5 = svsub_f32_m(pg, t1, t5);               // z
+    t0 = svmla_f32_m(pg, ln2, t5, half_ln2_sq); // ln2 + half_ln2_sq * z
+    t0 = svmla_f32_m(pg, one, t5, t0);          // 1 + (ln2 * z) + (half_ln2_sq * z * z)
+    t0 = svmul_f32_m(pg, t0, t4);               // Final result
+
+    return t0;
+}
+#endif
+
 #if defined(__ARM_NEON) && defined(__aarch64__)
 
 // adapted from arm limited optimized routine
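Note: exp_ps_sve evaluates exp(x) with the SVE FEXPA instruction. Reading the added code, the scheme appears to be: compute $y = x\log_2 e$, split off $n = \lfloor y \rfloor$, form $b = (y - n) + 1 \in [1, 2)$ so the exponent bits of $b$ are fixed, let FEXPA produce a table-based estimate from the top mantissa bits of $b$, and correct the discarded low 17 bits $z = b - (b \,\&\, \mathrm{not\_mask17})$ with a short polynomial:

$$e^x = 2^n \cdot 2^{\,b-1} \approx 2^n \cdot \mathrm{fexpa}(b \gg 17) \cdot \bigl(1 + c_1 z + c_2 z^2\bigr),$$

where $c_1 = 0.6931473921 \approx \ln 2$ and $c_2 = 0.2413862043 \approx (\ln 2)^2/2$ are the tuned constants in the code. This reading is an interpretation of the code above, not documentation shipped with the package.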
@@ -733,9 +888,9 @@ inline static float lm_ggml_silu_backward_f32(float x, float dy) {
 }
 
 inline static lm_ggml_fp16_t lm_ggml_silu_backward_f16(lm_ggml_fp16_t x, lm_ggml_fp16_t dy) {
-    const float v =
+    const float v = LM_GGML_CPU_FP16_TO_FP32(x);
     const float s = 1.0f/(1.0f + expf(-v));
-    return
+    return LM_GGML_CPU_FP32_TO_FP16(LM_GGML_CPU_FP16_TO_FP32(dy)*s*(1.0f + v*(1.0f - s)));
 }
 
 inline static void lm_ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) {
@@ -773,7 +928,7 @@ inline static void lm_ggml_vec_sum_f32_ggf(const int n, lm_ggml_float * s, const
 inline static void lm_ggml_vec_sum_f16_ggf(const int n, float * s, const lm_ggml_fp16_t * x) {
     float sum = 0.0f;
     for (int i = 0; i < n; ++i) {
-        sum +=
+        sum += LM_GGML_CPU_FP16_TO_FP32(x[i]);
     }
     *s = sum;
 }
package/cpp/ggml-cpu.h
CHANGED
@@ -101,6 +101,7 @@ extern "C" {
     LM_GGML_BACKEND_API int lm_ggml_cpu_has_riscv_v     (void);
     LM_GGML_BACKEND_API int lm_ggml_cpu_has_vsx         (void);
     LM_GGML_BACKEND_API int lm_ggml_cpu_has_vxe         (void);
+    LM_GGML_BACKEND_API int lm_ggml_cpu_has_nnpa        (void);
     LM_GGML_BACKEND_API int lm_ggml_cpu_has_wasm_simd   (void);
    LM_GGML_BACKEND_API int lm_ggml_cpu_has_llamafile   (void);
 
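Note: the only API change here is the new lm_ggml_cpu_has_nnpa query. Judging by its placement next to the s390x lm_ggml_cpu_has_vxe check, it presumably reports IBM z (s390x) NNPA (Neural Network Processing Assist) support — an inference from the name, since the diff only shows the declaration. A hypothetical usage sketch against the declarations above:

#include <stdio.h>
#include "ggml-cpu.h"

int main(void) {
    // Capability queries return non-zero when the feature was detected at runtime.
    printf("vsx:  %d\n", lm_ggml_cpu_has_vsx());
    printf("vxe:  %d\n", lm_ggml_cpu_has_vxe());
    printf("nnpa: %d\n", lm_ggml_cpu_has_nnpa()); // added in this diff
    return 0;
}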