npm - cui-llama.rn - Versions diffs - 1.7.3 → 1.7.6 - Mend

cui-llama.rn 1.7.3 → 1.7.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (276)

package/cpp/ggml-cpu/vec.cpp CHANGED Viewed

@@ -17,29 +17,98 @@ void lm_ggml_vec_dot_f32(int n, float * LM_GGML_RESTRICT s, size_t bs, const flo
 #if defined(LM_GGML_SIMD)
     float sumf = 0.0f;
-    const int np = (n & ~(LM_GGML_F32_STEP - 1));
-    LM_GGML_F32_VEC sum[LM_GGML_F32_ARR] = { LM_GGML_F32_VEC_ZERO };
+    #if defined(__ARM_FEATURE_SVE)
+        const int sve_register_length = lm_ggml_cpu_get_sve_cnt() * 8;
+        const int lm_ggml_f32_epr = sve_register_length / 32;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16
+        const int lm_ggml_f32_step = 8 * lm_ggml_f32_epr; // choose 8 SVE registers
+        const int np = (n & ~(lm_ggml_f32_step - 1));
+        svfloat32_t sum1 = svdup_n_f32(0.0f);
+        svfloat32_t sum2 = svdup_n_f32(0.0f);
+        svfloat32_t sum3 = svdup_n_f32(0.0f);
+        svfloat32_t sum4 = svdup_n_f32(0.0f);
+        svfloat32_t sum5 = svdup_n_f32(0.0f);
+        svfloat32_t sum6 = svdup_n_f32(0.0f);
+        svfloat32_t sum7 = svdup_n_f32(0.0f);
+        svfloat32_t sum8 = svdup_n_f32(0.0f);
+        svfloat32_t ax1,ax2,ax3,ax4,ax5,ax6,ax7,ax8;
+        svfloat32_t ay1,ay2,ay3,ay4,ay5,ay6,ay7,ay8;
+        for (int i = 0; i < np; i += lm_ggml_f32_step) {
+            ax1 = LM_GGML_F32_VEC_LOAD(x + i);
+            ay1 = LM_GGML_F32_VEC_LOAD(y + i);
+            sum1 = LM_GGML_F32_VEC_FMA(ax1, ay1, sum1);
+            ax2 = LM_GGML_F32_VEC_LOAD(x + i + 1*lm_ggml_f32_epr);
+            ay2 = LM_GGML_F32_VEC_LOAD(y + i + 1*lm_ggml_f32_epr);
+            sum2 = LM_GGML_F32_VEC_FMA(ax2, ay2, sum2);
+            ax3 = LM_GGML_F32_VEC_LOAD(x + i + 2*lm_ggml_f32_epr);
+            ay3 = LM_GGML_F32_VEC_LOAD(y + i + 2*lm_ggml_f32_epr);
+            sum3 = LM_GGML_F32_VEC_FMA(ax3, ay3, sum3);
+            ax4 = LM_GGML_F32_VEC_LOAD(x + i + 3*lm_ggml_f32_epr);
+            ay4 = LM_GGML_F32_VEC_LOAD(y + i + 3*lm_ggml_f32_epr);
+            sum4 = LM_GGML_F32_VEC_FMA(ax4, ay4, sum4);
+            ax5 = LM_GGML_F32_VEC_LOAD(x + i + 4*lm_ggml_f32_epr);
+            ay5 = LM_GGML_F32_VEC_LOAD(y + i + 4*lm_ggml_f32_epr);
+            sum5 = LM_GGML_F32_VEC_FMA(ax5, ay5, sum5);
+            ax6 = LM_GGML_F32_VEC_LOAD(x + i + 5*lm_ggml_f32_epr);
+            ay6 = LM_GGML_F32_VEC_LOAD(y + i + 5*lm_ggml_f32_epr);
+            sum6 = LM_GGML_F32_VEC_FMA(ax6, ay6, sum6);
+            ax7 = LM_GGML_F32_VEC_LOAD(x + i + 6*lm_ggml_f32_epr);
+            ay7 = LM_GGML_F32_VEC_LOAD(y + i + 6*lm_ggml_f32_epr);
+            sum7 = LM_GGML_F32_VEC_FMA(ax7, ay7, sum7);
+            ax8 = LM_GGML_F32_VEC_LOAD(x + i + 7*lm_ggml_f32_epr);
+            ay8 = LM_GGML_F32_VEC_LOAD(y + i + 7*lm_ggml_f32_epr);
+            sum8 = LM_GGML_F32_VEC_FMA(ax8, ay8, sum8);
+        }
+        // leftovers
+        // Since the loop above is unrolled 8x, the leftover count lies in [0, lm_ggml_f32_step); whole registers among the leftovers are handled in the loop below
+        const int np2 = (n & ~(lm_ggml_f32_epr - 1));
+        for (int i = np; i < np2; i += lm_ggml_f32_epr) {
+            ax1 = LM_GGML_F32_VEC_LOAD(x + i);
+            ay1 = LM_GGML_F32_VEC_LOAD(y + i);
+            sum1 = LM_GGML_F32_VEC_FMA(ax1, ay1, sum1);
+        }
+        // the number of remaining elements is now less than lm_ggml_f32_epr; apply a predicated svmad to the available elements only
+        if (np2 < n) {
+            svbool_t pg = svwhilelt_b32(np2, n);
+            ax1 = svld1_f32(pg, x + np2);
+            ay1 = svld1_f32(pg, y + np2);
+            sum1 = svmad_f32_m(pg, ax1, ay1, sum1);
+        }
+        // reduce sum1..sum8 into sumf
+        LM_GGML_F32_VEC_REDUCE(sumf, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8);
+    #else
+        const int np = (n & ~(LM_GGML_F32_STEP - 1));
-    LM_GGML_F32_VEC ax[LM_GGML_F32_ARR];
-    LM_GGML_F32_VEC ay[LM_GGML_F32_ARR];
+        LM_GGML_F32_VEC sum[LM_GGML_F32_ARR] = { LM_GGML_F32_VEC_ZERO };
-    for (int i = 0; i < np; i += LM_GGML_F32_STEP) {
-        for (int j = 0; j < LM_GGML_F32_ARR; j++) {
-            ax[j] = LM_GGML_F32_VEC_LOAD(x + i + j*LM_GGML_F32_EPR);
-            ay[j] = LM_GGML_F32_VEC_LOAD(y + i + j*LM_GGML_F32_EPR);
+        LM_GGML_F32_VEC ax[LM_GGML_F32_ARR];
+        LM_GGML_F32_VEC ay[LM_GGML_F32_ARR];
-            sum[j] = LM_GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]);
+        for (int i = 0; i < np; i += LM_GGML_F32_STEP) {
+            for (int j = 0; j < LM_GGML_F32_ARR; j++) {
+                ax[j] = LM_GGML_F32_VEC_LOAD(x + i + j*LM_GGML_F32_EPR);
+                ay[j] = LM_GGML_F32_VEC_LOAD(y + i + j*LM_GGML_F32_EPR);
+                sum[j] = LM_GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]);
+            }
         }
-    }
-    // reduce sum0..sum3 to sum0
-    LM_GGML_F32_VEC_REDUCE(sumf, sum);
+        // reduce sum0..sum3 to sum0
+        LM_GGML_F32_VEC_REDUCE(sumf, sum);
-    // leftovers
-    for (int i = np; i < n; ++i) {
-        sumf += x[i]*y[i];
-    }
+        // leftovers
+        for (int i = np; i < n; ++i) {
+            sumf += x[i]*y[i];
+        }
+    #endif
 #else
     // scalar
     lm_ggml_float sumf = 0.0;
@@ -150,11 +219,11 @@ void lm_ggml_vec_dot_f16(int n, float * LM_GGML_RESTRICT s, size_t bs, lm_ggml_f
     // leftovers
     for (int i = np; i < n; ++i) {
-        sumf += (lm_ggml_float)(LM_GGML_FP16_TO_FP32(x[i])*LM_GGML_FP16_TO_FP32(y[i]));
+        sumf += (lm_ggml_float)(LM_GGML_CPU_FP16_TO_FP32(x[i])*LM_GGML_CPU_FP16_TO_FP32(y[i]));
     }
 #else
     for (int i = 0; i < n; ++i) {
-        sumf += (lm_ggml_float)(LM_GGML_FP16_TO_FP32(x[i])*LM_GGML_FP16_TO_FP32(y[i]));
+        sumf += (lm_ggml_float)(LM_GGML_CPU_FP16_TO_FP32(x[i])*LM_GGML_CPU_FP16_TO_FP32(y[i]));
     }
 #endif