npm - cui-llama.rn - Versions diffs - 1.7.3 → 1.7.6 - Mend

cui-llama.rn 1.7.3 → 1.7.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (276) hide show

package/cpp/ggml-cpu/repack.h ADDED Viewed

@@ -0,0 +1,98 @@
+#pragma once
+#define LM_GGML_COMMON_DECL_CPP
+#include "ggml-common.h"
+#include "traits.h"
+#include "ggml.h"
+// GGML internal header
+lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_repack_buffer_type(void);
+template <int K> constexpr int QK_0() {
+    if constexpr (K == 4) {
+        return QK4_0;
+    }
+    if constexpr (K == 8) {
+        return QK8_0;
+    }
+    return -1;
+}
+template <int K, int N> struct block {
+    lm_ggml_half d[N];                         // deltas for N qK_0 blocks
+    int8_t    qs[(QK_0<K>() * N * K) / 8];  // quants for N qK_0 blocks
+};
+// control size
+static_assert(sizeof(block<4, 4>) == 4 * sizeof(lm_ggml_half) + QK8_0 * 2, "wrong block<4,4> size/padding");
+static_assert(sizeof(block<4, 8>) == 8 * sizeof(lm_ggml_half) + QK8_0 * 4, "wrong block<4,8> size/padding");
+static_assert(sizeof(block<8, 4>) == 4 * sizeof(lm_ggml_half) + QK8_0 * 4, "wrong block<8,4> size/padding");
+static_assert(sizeof(block<8, 8>) == 8 * sizeof(lm_ggml_half) + QK8_0 * 8, "wrong block<8,8> size/padding");
+using block_q4_0x4 = block<4, 4>;
+using block_q4_0x8 = block<4, 8>;
+using block_q8_0x4 = block<8, 4>;
+using block_q8_0x8 = block<8, 8>;
+struct block_q4_Kx8 {
+    lm_ggml_half d[8];      // super-block scale for quantized scales
+    lm_ggml_half dmin[8];   // super-block scale for quantized mins
+    uint8_t scales[96];  // scales and mins, quantized with 6 bits
+    uint8_t qs[1024];    // 4--bit quants
+};
+static_assert(sizeof(block_q4_Kx8) == sizeof(lm_ggml_half) * 16 + K_SCALE_SIZE * 8 + QK_K * 4, "wrong q4_K block size/padding");
+struct block_q8_Kx4 {
+    float d[4];              // delta
+    int8_t qs[QK_K * 4];     // quants
+    int16_t bsums[QK_K / 4]; // sum of quants in groups of 16
+};
+static_assert(sizeof(block_q8_Kx4) == sizeof(float) * 4 + QK_K * 4 + (QK_K / 4) * sizeof(int16_t), "wrong q8_K block size/padding");
+struct block_iq4_nlx4 {
+    lm_ggml_half d[4];            // deltas for 4 iq4_nl blocks
+    uint8_t   qs[QK4_NL * 2];  // nibbles / quants for 4 iq4_nl blocks
+};
+static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(lm_ggml_half) + QK4_NL * 2, "wrong iq4_nlx4 block size/padding");
+#if defined(__cplusplus)
+extern "C" {
+#endif
+void lm_ggml_quantize_mat_q8_0_4x4(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT vy, int64_t k);
+void lm_ggml_quantize_mat_q8_0_4x8(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT vy, int64_t k);
+void lm_ggml_quantize_mat_q8_K_4x8(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT vy, int64_t k);
+void lm_ggml_gemv_q4_0_4x4_q8_0(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, const void * LM_GGML_RESTRICT vy, int nr, int nc);
+void lm_ggml_gemv_q4_0_4x8_q8_0(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, const void * LM_GGML_RESTRICT vy, int nr, int nc);
+void lm_ggml_gemv_q4_0_8x8_q8_0(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, const void * LM_GGML_RESTRICT vy, int nr, int nc);
+void lm_ggml_gemv_q4_K_8x8_q8_K(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, const void * LM_GGML_RESTRICT vy, int nr, int nc);
+void lm_ggml_gemv_iq4_nl_4x4_q8_0(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, const void * LM_GGML_RESTRICT vy, int nr, int nc);
+void lm_ggml_gemm_q4_0_4x4_q8_0(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, const void * LM_GGML_RESTRICT vy, int nr, int nc);
+void lm_ggml_gemm_q4_0_4x8_q8_0(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, const void * LM_GGML_RESTRICT vy, int nr, int nc);
+void lm_ggml_gemm_q4_0_8x8_q8_0(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, const void * LM_GGML_RESTRICT vy, int nr, int nc);
+void lm_ggml_gemm_q4_K_8x8_q8_K(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, const void * LM_GGML_RESTRICT vy, int nr, int nc);
+void lm_ggml_gemm_iq4_nl_4x4_q8_0(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, const void * LM_GGML_RESTRICT vy, int nr, int nc);
+// Native implementations
+void lm_ggml_quantize_mat_q8_0_4x4_generic(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT vy, int64_t k);
+void lm_ggml_quantize_mat_q8_0_4x8_generic(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT vy, int64_t k);
+void lm_ggml_quantize_mat_q8_K_4x8_generic(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT vy, int64_t k);
+void lm_ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, const void * LM_GGML_RESTRICT vy, int nr, int nc);
+void lm_ggml_gemv_q4_0_4x8_q8_0_generic(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, const void * LM_GGML_RESTRICT vy, int nr, int nc);
+void lm_ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, const void * LM_GGML_RESTRICT vy, int nr, int nc);
+void lm_ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, const void * LM_GGML_RESTRICT vy, int nr, int nc);
+void lm_ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, const void * LM_GGML_RESTRICT vy, int nr, int nc);
+void lm_ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, const void * LM_GGML_RESTRICT vy, int nr, int nc);
+void lm_ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, const void * LM_GGML_RESTRICT vy, int nr, int nc);
+void lm_ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, const void * LM_GGML_RESTRICT vy, int nr, int nc);
+void lm_ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, const void * LM_GGML_RESTRICT vy, int nr, int nc);
+void lm_ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, const void * LM_GGML_RESTRICT vy, int nr, int nc);
+#if defined(__cplusplus)
+} // extern "C"
+#endif

package/cpp/ggml-cpu/simd-mappings.h CHANGED Viewed

@@ -2,10 +2,167 @@
 #include "ggml-cpu-impl.h"
+#ifdef __ARM_FEATURE_SVE
+#include <arm_sve.h>
+#endif // __ARM_FEATURE_SVE
+#if defined(__ARM_NEON) && !defined(__CUDACC__) && !defined(__MUSACC__)
+// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
+//
+//   $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
+//
+#include <arm_neon.h>
+#endif
+#if defined(__F16C__)
+#include <immintrin.h>
+#endif
+#ifdef __cplusplus
+extern "C" {
+#endif
 //
 // simd mappings
 //
+// FP16 to FP32 conversion
+// 16-bit float
+// on Arm, we use __fp16
+// on x86, we use uint16_t
+//
+// for old CUDA compilers (<= 11), we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/10616
+// for     MUSA compilers        , we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/11843
+//
+#if defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__)
+    #define LM_GGML_CPU_COMPUTE_FP16_TO_FP32(x) neon_compute_fp16_to_fp32(x)
+    #define LM_GGML_CPU_COMPUTE_FP32_TO_FP16(x) neon_compute_fp32_to_fp16(x)
+    #define LM_GGML_CPU_FP16_TO_FP32(x) LM_GGML_CPU_COMPUTE_FP16_TO_FP32(x)
+    static inline float neon_compute_fp16_to_fp32(lm_ggml_fp16_t h) {
+        __fp16 tmp;
+        memcpy(&tmp, &h, sizeof(lm_ggml_fp16_t));
+        return (float)tmp;
+    }
+    static inline lm_ggml_fp16_t neon_compute_fp32_to_fp16(float f) {
+        lm_ggml_fp16_t res;
+        __fp16 tmp = f;
+        memcpy(&res, &tmp, sizeof(lm_ggml_fp16_t));
+        return res;
+    }
+#elif defined(__F16C__)
+    #ifdef _MSC_VER
+        #define LM_GGML_CPU_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
+        #define LM_GGML_CPU_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
+    #else
+        #define LM_GGML_CPU_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
+        #define LM_GGML_CPU_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
+    #endif
+#elif defined(__POWER9_VECTOR__)
+    #define LM_GGML_CPU_COMPUTE_FP16_TO_FP32(x) power_compute_fp16_to_fp32(x)
+    #define LM_GGML_CPU_COMPUTE_FP32_TO_FP16(x) power_compute_fp32_to_fp16(x)
+    /* the inline asm below is about 12% faster than the lookup method */
+    #define LM_GGML_CPU_FP16_TO_FP32(x) LM_GGML_CPU_COMPUTE_FP16_TO_FP32(x)
+    #define LM_GGML_CPU_FP32_TO_FP16(x) LM_GGML_CPU_COMPUTE_FP32_TO_FP16(x)
+    static inline float power_compute_fp16_to_fp32(lm_ggml_fp16_t h) {
+        float f;
+        double d;
+        __asm__(
+            "mtfprd %0,%2\n"
+            "xscvhpdp %0,%0\n"
+            "frsp %1,%0\n" :
+            /* temp */ "=d"(d),
+            /* out */  "=f"(f):
+            /* in */   "r"(h));
+        return f;
+    }
+    static inline lm_ggml_fp16_t power_compute_fp32_to_fp16(float f) {
+        double d;
+        lm_ggml_fp16_t r;
+        __asm__( /* xscvdphp can work on double or single precision */
+            "xscvdphp %0,%2\n"
+            "mffprd %1,%0\n" :
+            /* temp */ "=d"(d),
+            /* out */  "=r"(r):
+            /* in */   "f"(f));
+        return r;
+    }
+#elif defined(__riscv) && defined(__riscv_zfhmin)
+    static inline float riscv_compute_fp16_to_fp32(lm_ggml_fp16_t h) {
+        float f;
+        __asm__(
+            "fmv.h.x %[f], %[h]\n\t"
+            "fcvt.s.h %[f], %[f]"
+            : [f] "=&f" (f)
+            : [h] "r" (h)
+        );
+        return f;
+    }
+    static inline lm_ggml_fp16_t riscv_compute_fp32_to_fp16(float f) {
+        lm_ggml_fp16_t res;
+        __asm__(
+            "fcvt.h.s %[f], %[f]\n\t"
+            "fmv.x.h %[h], %[f]"
+            : [h] "=&r" (res)
+            : [f] "f" (f)
+        );
+        return res;
+    }
+    #define LM_GGML_CPU_COMPUTE_FP16_TO_FP32(x) riscv_compute_fp16_to_fp32(x)
+    #define LM_GGML_CPU_COMPUTE_FP32_TO_FP16(x) riscv_compute_fp32_to_fp16(x)
+    #define LM_GGML_CPU_FP16_TO_FP32(x) LM_GGML_CPU_COMPUTE_FP16_TO_FP32(x)
+    #define LM_GGML_CPU_FP32_TO_FP16(x) LM_GGML_CPU_COMPUTE_FP32_TO_FP16(x)
+#elif defined(__NNPA__)
+    #define LM_GGML_CPU_COMPUTE_FP16_TO_FP32(x) nnpa_compute_fp16_to_fp32(x)
+    #define LM_GGML_CPU_COMPUTE_FP32_TO_FP16(x) nnpa_compute_fp32_to_fp16(x)
+    #define LM_GGML_CPU_FP16_TO_FP32(x) LM_GGML_CPU_COMPUTE_FP16_TO_FP32(x)
+    #define LM_GGML_CPU_FP32_TO_FP16(x) LM_GGML_CPU_COMPUTE_FP32_TO_FP16(x)
+    static inline float nnpa_compute_fp16_to_fp32(lm_ggml_fp16_t h) {
+        uint16x8_t v_h = vec_splats(h);
+        uint16x8_t v_hd = vec_convert_from_fp16(v_h, 0);
+        return vec_extend_to_fp32_hi(v_hd, 0)[0];
+    }
+    static inline lm_ggml_fp16_t nnpa_compute_fp32_to_fp16(float f) {
+        float32x4_t v_f = vec_splats(f);
+        float32x4_t v_zero = vec_splats(0.0f);
+        uint16x8_t v_hd = vec_round_from_fp32(v_f, v_zero, 0);
+        uint16x8_t v_h = vec_convert_to_fp16(v_hd, 0);
+        return vec_extract(v_h, 0);
+    }
+#endif
+// precomputed f32 table for f16 (256 KB)
+// defined in ggml-cpu.c, initialized in lm_ggml_cpu_init()
+extern float lm_ggml_table_f32_f16[1 << 16];
+// On ARM NEON, it's quicker to directly convert x -> x instead of calling into lm_ggml_lookup_fp16_to_fp32,
+// so we define LM_GGML_CPU_FP16_TO_FP32 and LM_GGML_CPU_FP32_TO_FP16 elsewhere for NEON.
+// This is also true for POWER9.
+#if !defined(LM_GGML_CPU_FP16_TO_FP32)
+inline static float lm_ggml_lookup_fp16_to_fp32(lm_ggml_fp16_t f) {
+    uint16_t s;
+    memcpy(&s, &f, sizeof(uint16_t));
+    return lm_ggml_table_f32_f16[s];
+}
+#define LM_GGML_CPU_FP16_TO_FP32(x) lm_ggml_lookup_fp16_to_fp32(x)
+#endif
+#if !defined(LM_GGML_CPU_FP32_TO_FP16)
+#define LM_GGML_CPU_FP32_TO_FP16(x) LM_GGML_COMPUTE_FP32_TO_FP16(x)
+#endif
 // we define a common set of C macros which map to specific intrinsics based on the current architecture
 // we then implement the fundamental computation operations below using only these macros
 // adding support for new architectures requires to define the corresponding SIMD macros
@@ -17,7 +174,123 @@
 //   number of elements to fit in a single register
 //
-#if defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA)
+#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_FMA)
+#define LM_GGML_SIMD
+// F32 SVE
+#define LM_GGML_F32_EPR 8
+#define DEFAULT_PG svptrue_b32()
+#define LM_GGML_F32xt                        svfloat32_t
+#define LM_GGML_F32xt_ZERO                   svdup_n_f32(0.0f)
+#define LM_GGML_F32xt_SET1(x)                svdup_n_f32(x)
+#define LM_GGML_F32xt_LOAD_IMPL(pg, a, ...)  svld1_f32(pg, a)
+#define LM_GGML_F32xt_LOAD(...)              LM_GGML_F32xt_LOAD_IMPL(DEFAULT_PG, __VA_ARGS__)
+#define LM_GGML_F32xt_STORE_IMPL(pg,a,b)     svst1_f32(pg, a, b)
+#define LM_GGML_F32xt_STORE(...)             LM_GGML_F32xt_STORE_IMPL(DEFAULT_PG, __VA_ARGS__)
+#define LM_GGML_F32xt_FMA_IMPL(pg, a, b, c)  svmad_f32_m(pg, a, b, c)
+#define LM_GGML_F32xt_FMA(...)               LM_GGML_F32xt_FMA_IMPL(DEFAULT_PG, __VA_ARGS__)
+#define LM_GGML_F32xt_ADD_IMPL(pg, a, b)     svadd_f32_m(pg, a, b)
+#define LM_GGML_F32xt_ADD(...)               LM_GGML_F32xt_ADD_IMPL(DEFAULT_PG, __VA_ARGS__)
+#define LM_GGML_F32xt_MUL_IMPL(pg, a, b)     svmul_f32_m(pg, a, b)
+#define LM_GGML_F32xt_MUL(...)               LM_GGML_F32xt_MUL_IMPL(DEFAULT_PG, __VA_ARGS__)
+#define LM_GGML_F32xt_REDUCE_ONE_IMPL(pg, a) svaddv(pg, a)
+#define LM_GGML_F32xt_REDUCE_ONE(...)        LM_GGML_F32xt_REDUCE_ONE_IMPL(DEFAULT_PG, __VA_ARGS__)
+#define LM_GGML_F32xt_REDUCE_IMPL(pg, res, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8)  \
+{                                                      \
+    sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum2);        \
+    sum3 = svadd_f32_m(DEFAULT_PG, sum3, sum4);        \
+    sum5 = svadd_f32_m(DEFAULT_PG, sum5, sum6);        \
+    sum7 = svadd_f32_m(DEFAULT_PG, sum7, sum8);        \
+    sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum3);        \
+    sum5 = svadd_f32_m(DEFAULT_PG, sum5, sum7);        \
+    sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum5);        \
+    (res) = (lm_ggml_float) LM_GGML_F32xt_REDUCE_ONE(sum1);  \
+}
+#define LM_GGML_F32xt_REDUCE(...) LM_GGML_F32xt_REDUCE_IMPL(DEFAULT_PG, __VA_ARGS__)
+#define LM_GGML_F32_VEC        LM_GGML_F32xt
+#define LM_GGML_F32_VEC_ZERO   LM_GGML_F32xt_ZERO
+#define LM_GGML_F32_VEC_SET1   LM_GGML_F32xt_SET1
+#define LM_GGML_F32_VEC_LOAD   LM_GGML_F32xt_LOAD
+#define LM_GGML_F32_VEC_STORE  LM_GGML_F32xt_STORE
+#define LM_GGML_F32_VEC_FMA    LM_GGML_F32xt_FMA
+#define LM_GGML_F32_VEC_ADD    LM_GGML_F32xt_ADD
+#define LM_GGML_F32_VEC_MUL    LM_GGML_F32xt_MUL
+#define LM_GGML_F32_VEC_REDUCE LM_GGML_F32xt_REDUCE
+// F16 NEON
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+    #define LM_GGML_F16_STEP 32
+    #define LM_GGML_F16_EPR  8
+    #define LM_GGML_F16x8              float16x8_t
+    #define LM_GGML_F16x8_ZERO         vdupq_n_f16(0.0f)
+    #define LM_GGML_F16x8_SET1(x)      vdupq_n_f16(x)
+    #define LM_GGML_F16x8_LOAD(x)      vld1q_f16((const __fp16 *)(x))
+    #define LM_GGML_F16x8_STORE        vst1q_f16
+    #define LM_GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
+    #define LM_GGML_F16x8_ADD          vaddq_f16
+    #define LM_GGML_F16x8_MUL          vmulq_f16
+    #define LM_GGML_F16x8_REDUCE(res, x)                               \
+    do {                                                            \
+        int offset = LM_GGML_F16_ARR >> 1;                             \
+        for (int i = 0; i < offset; ++i) {                          \
+            (x)[i] = vaddq_f16((x)[i], (x)[offset+i]);              \
+        }                                                           \
+        offset >>= 1;                                               \
+        for (int i = 0; i < offset; ++i) {                          \
+            (x)[i] = vaddq_f16((x)[i], (x)[offset+i]);              \
+        }                                                           \
+        offset >>= 1;                                               \
+        for (int i = 0; i < offset; ++i) {                          \
+            (x)[i] = vaddq_f16((x)[i], (x)[offset+i]);              \
+        }                                                           \
+        const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); \
+        const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); \
+        (res) = (lm_ggml_float) vaddvq_f32(vaddq_f32(t0, t1));         \
+    } while (0)
+    #define LM_GGML_F16_VEC                LM_GGML_F16x8
+    #define LM_GGML_F16_VEC_ZERO           LM_GGML_F16x8_ZERO
+    #define LM_GGML_F16_VEC_SET1           LM_GGML_F16x8_SET1
+    #define LM_GGML_F16_VEC_LOAD(p, i)     LM_GGML_F16x8_LOAD(p)
+    #define LM_GGML_F16_VEC_STORE(p, r, i) LM_GGML_F16x8_STORE((__fp16 *)(p), (r)[i])
+    #define LM_GGML_F16_VEC_FMA            LM_GGML_F16x8_FMA
+    #define LM_GGML_F16_VEC_ADD            LM_GGML_F16x8_ADD
+    #define LM_GGML_F16_VEC_MUL            LM_GGML_F16x8_MUL
+    #define LM_GGML_F16_VEC_REDUCE         LM_GGML_F16x8_REDUCE
+#else
+    // if FP16 vector arithmetic is not supported, we use FP32 instead
+    // and take advantage of the vcvt_ functions to convert to/from FP16
+    #define LM_GGML_F16_STEP 16
+    #define LM_GGML_F16_EPR  4
+    #define LM_GGML_F32Cx4              float32x4_t
+    #define LM_GGML_F32Cx4_ZERO         vdupq_n_f32(0.0f)
+    #define LM_GGML_F32Cx4_SET1(x)      vdupq_n_f32(x)
+    #define LM_GGML_F32Cx4_LOAD(x)      vcvt_f32_f16(vld1_f16((const __fp16 *)(x)))
+    #define LM_GGML_F32Cx4_STORE(x, y)  vst1_f16(x, vcvt_f16_f32(y))
+    #define LM_GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
+    #define LM_GGML_F32Cx4_ADD          vaddq_f32
+    #define LM_GGML_F32Cx4_MUL          vmulq_f32
+    #define LM_GGML_F32Cx4_REDUCE       LM_GGML_F32x4_REDUCE
+    #define LM_GGML_F16_VEC                LM_GGML_F32Cx4
+    #define LM_GGML_F16_VEC_ZERO           LM_GGML_F32Cx4_ZERO
+    #define LM_GGML_F16_VEC_SET1           LM_GGML_F32Cx4_SET1
+    #define LM_GGML_F16_VEC_LOAD(p, i)     LM_GGML_F32Cx4_LOAD(p)
+    #define LM_GGML_F16_VEC_STORE(p, r, i) LM_GGML_F32Cx4_STORE((__fp16 *)(p), r[i])
+    #define LM_GGML_F16_VEC_FMA            LM_GGML_F32Cx4_FMA
+    #define LM_GGML_F16_VEC_ADD            LM_GGML_F32Cx4_ADD
+    #define LM_GGML_F16_VEC_MUL            LM_GGML_F32Cx4_MUL
+    #define LM_GGML_F16_VEC_REDUCE         LM_GGML_F32Cx4_REDUCE
+#endif
+#elif defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA)
 #define LM_GGML_SIMD
@@ -299,7 +572,7 @@ static inline __m256 __avx_f32cx8_load(const lm_ggml_fp16_t * x) {
     float tmp[8];
     for (int i = 0; i < 8; i++) {
-        tmp[i] = LM_GGML_FP16_TO_FP32(x[i]);
+        tmp[i] = LM_GGML_CPU_FP16_TO_FP32(x[i]);
     }
     return _mm256_loadu_ps(tmp);
@@ -310,7 +583,7 @@ static inline void __avx_f32cx8_store(lm_ggml_fp16_t *x, __m256 y) {
     _mm256_storeu_ps(arr, y);
     for (int i = 0; i < 8; i++)
-        x[i] = LM_GGML_FP32_TO_FP16(arr[i]);
+        x[i] = LM_GGML_CPU_FP32_TO_FP16(arr[i]);
 }
 #define LM_GGML_F32Cx8_LOAD(x)     __avx_f32cx8_load(x)
 #define LM_GGML_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y)
@@ -458,10 +731,10 @@ static inline unsigned char lm_ggml_endian_byte(int i) {
 inline static v128_t __wasm_f16x4_load(const lm_ggml_fp16_t * p) {
     float tmp[4];
-    tmp[0] = LM_GGML_FP16_TO_FP32(p[0]);
-    tmp[1] = LM_GGML_FP16_TO_FP32(p[1]);
-    tmp[2] = LM_GGML_FP16_TO_FP32(p[2]);
-    tmp[3] = LM_GGML_FP16_TO_FP32(p[3]);
+    tmp[0] = LM_GGML_CPU_FP16_TO_FP32(p[0]);
+    tmp[1] = LM_GGML_CPU_FP16_TO_FP32(p[1]);
+    tmp[2] = LM_GGML_CPU_FP16_TO_FP32(p[2]);
+    tmp[3] = LM_GGML_CPU_FP16_TO_FP32(p[3]);
     return wasm_v128_load(tmp);
 }
@@ -471,10 +744,10 @@ inline static void __wasm_f16x4_store(lm_ggml_fp16_t * p, v128_t x) {
     wasm_v128_store(tmp, x);
-    p[0] = LM_GGML_FP32_TO_FP16(tmp[0]);
-    p[1] = LM_GGML_FP32_TO_FP16(tmp[1]);
-    p[2] = LM_GGML_FP32_TO_FP16(tmp[2]);
-    p[3] = LM_GGML_FP32_TO_FP16(tmp[3]);
+    p[0] = LM_GGML_CPU_FP32_TO_FP16(tmp[0]);
+    p[1] = LM_GGML_CPU_FP32_TO_FP16(tmp[1]);
+    p[2] = LM_GGML_CPU_FP32_TO_FP16(tmp[2]);
+    p[3] = LM_GGML_CPU_FP32_TO_FP16(tmp[3]);
 }
 #define LM_GGML_F16x4             v128_t
@@ -574,10 +847,10 @@ inline static void __wasm_f16x4_store(lm_ggml_fp16_t * p, v128_t x) {
 static inline __m128 __sse_f16x4_load(const lm_ggml_fp16_t * x) {
     float tmp[4];
-    tmp[0] = LM_GGML_FP16_TO_FP32(x[0]);
-    tmp[1] = LM_GGML_FP16_TO_FP32(x[1]);
-    tmp[2] = LM_GGML_FP16_TO_FP32(x[2]);
-    tmp[3] = LM_GGML_FP16_TO_FP32(x[3]);
+    tmp[0] = LM_GGML_CPU_FP16_TO_FP32(x[0]);
+    tmp[1] = LM_GGML_CPU_FP16_TO_FP32(x[1]);
+    tmp[2] = LM_GGML_CPU_FP16_TO_FP32(x[2]);
+    tmp[3] = LM_GGML_CPU_FP16_TO_FP32(x[3]);
     return _mm_loadu_ps(tmp);
 }
@@ -587,10 +860,10 @@ static inline void __sse_f16x4_store(lm_ggml_fp16_t * x, __m128 y) {
     _mm_storeu_ps(arr, y);
-    x[0] = LM_GGML_FP32_TO_FP16(arr[0]);
-    x[1] = LM_GGML_FP32_TO_FP16(arr[1]);
-    x[2] = LM_GGML_FP32_TO_FP16(arr[2]);
-    x[3] = LM_GGML_FP32_TO_FP16(arr[3]);
+    x[0] = LM_GGML_CPU_FP32_TO_FP16(arr[0]);
+    x[1] = LM_GGML_CPU_FP32_TO_FP16(arr[1]);
+    x[2] = LM_GGML_CPU_FP32_TO_FP16(arr[2]);
+    x[3] = LM_GGML_CPU_FP32_TO_FP16(arr[3]);
 }
 #define LM_GGML_F32Cx4             __m128
@@ -712,7 +985,7 @@ static inline void __lasx_f32cx8_store(lm_ggml_fp16_t * x, __m256 y) {
 #define LM_GGML_F32x4_ZERO    __lsx_vldi(0)
 #define LM_GGML_F32x4_SET1(x) __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
 #define LM_GGML_F32x4_LOAD(x) __lsx_vld((x), 0)
-#define LM_GGML_F32x4_STORE((x),(y))   __lsx_vst((y), (x), 0)
+#define LM_GGML_F32x4_STORE(x, y)   __lsx_vst(y, x, 0)
 #define LM_GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a)
 #define LM_GGML_F32x4_ADD     __lsx_vfadd_s
 #define LM_GGML_F32x4_MUL     __lsx_vfmul_s
@@ -758,10 +1031,10 @@ static inline void __lasx_f32cx8_store(lm_ggml_fp16_t * x, __m256 y) {
 static inline __m128 __lsx_f16x4_load(const lm_ggml_fp16_t * x) {
     float tmp[4];
-    tmp[0] = LM_GGML_FP16_TO_FP32(x[0]);
-    tmp[1] = LM_GGML_FP16_TO_FP32(x[1]);
-    tmp[2] = LM_GGML_FP16_TO_FP32(x[2]);
-    tmp[3] = LM_GGML_FP16_TO_FP32(x[3]);
+    tmp[0] = LM_GGML_CPU_FP16_TO_FP32(x[0]);
+    tmp[1] = LM_GGML_CPU_FP16_TO_FP32(x[1]);
+    tmp[2] = LM_GGML_CPU_FP16_TO_FP32(x[2]);
+    tmp[3] = LM_GGML_CPU_FP16_TO_FP32(x[3]);
     return __lsx_vld(tmp, 0);
 }
@@ -771,10 +1044,10 @@ static inline void __lsx_f16x4_store(lm_ggml_fp16_t * x, __m128 y) {
     __lsx_vst(y, arr, 0);
-    x[0] = LM_GGML_FP32_TO_FP16(arr[0]);
-    x[1] = LM_GGML_FP32_TO_FP16(arr[1]);
-    x[2] = LM_GGML_FP32_TO_FP16(arr[2]);
-    x[3] = LM_GGML_FP32_TO_FP16(arr[3]);
+    x[0] = LM_GGML_CPU_FP32_TO_FP16(arr[0]);
+    x[1] = LM_GGML_CPU_FP32_TO_FP16(arr[1]);
+    x[2] = LM_GGML_CPU_FP32_TO_FP16(arr[2]);
+    x[3] = LM_GGML_CPU_FP32_TO_FP16(arr[3]);
 }
 #define LM_GGML_F32Cx4             __m128
@@ -806,7 +1079,7 @@ static inline void __lsx_f16x4_store(lm_ggml_fp16_t * x, __m128 y) {
 #define LM_GGML_F32_STEP 32
 #define LM_GGML_F32_EPR  4
-#define LM_GGML_F32x4              __vector float
+#define LM_GGML_F32x4              float32x4_t
 #define LM_GGML_F32x4_ZERO         vec_splats(0.0f)
 #define LM_GGML_F32x4_SET1         vec_splats
 #define LM_GGML_F32x4_LOAD(p)      vec_xl(0, p)
@@ -828,10 +1101,8 @@ static inline void __lsx_f16x4_store(lm_ggml_fp16_t * x, __m128 y) {
     for (int i = 0; i < offset; ++i) {              \
         x[i] = vec_add(x[i], x[offset + i]);        \
     }                                               \
-    res = vec_extract(x[0], 0) +                    \
-          vec_extract(x[0], 1) +                    \
-          vec_extract(x[0], 2) +                    \
-          vec_extract(x[0], 3);                     \
+    float32x4_t tmp = x[0] + vec_reve(x[0]);        \
+    res = tmp[0] + tmp[1];                          \
 }
 #define LM_GGML_F32_VEC        LM_GGML_F32x4
@@ -848,28 +1119,45 @@ static inline void __lsx_f16x4_store(lm_ggml_fp16_t * x, __m128 y) {
 #define LM_GGML_F16_STEP LM_GGML_F32_STEP
 #define LM_GGML_F16_EPR  LM_GGML_F32_EPR
-static inline __vector float __lzs_f16cx4_load(const lm_ggml_fp16_t * x) {
+static inline float32x4_t __lzs_f16cx4_load(const lm_ggml_fp16_t * x) {
+#if defined(__NNPA__)
+    uint16x8_t v_x = vec_xl(0, (const lm_ggml_fp16_t *)x);
+    uint16x8_t v_xd = vec_convert_from_fp16(v_x, 0);
+    return vec_extend_to_fp32_hi(v_xd, 0);
+#else
     float tmp[4];
     for (int i = 0; i < 4; i++) {
-        tmp[i] = LM_GGML_FP16_TO_FP32(x[i]);
+        tmp[i] = LM_GGML_CPU_FP16_TO_FP32(x[i]);
     }
     // note: keep type-cast here to prevent compiler bugs
     // see: https://github.com/ggml-org/llama.cpp/issues/12846
     return vec_xl(0, (const float *)(tmp));
+#endif
 }
-static inline void __lzs_f16cx4_store(lm_ggml_fp16_t * x, __vector float y) {
+static inline void __lzs_f16cx4_store(lm_ggml_fp16_t * x, float32x4_t v_y) {
+#if defined(__NNPA__)
+    float32x4_t v_zero = vec_splats(0.0f);
+    uint16x8_t v_xd = vec_round_from_fp32(v_y, v_zero, 0);
+    uint16x8_t v_x = vec_convert_to_fp16(v_xd, 0);
+    x[0] = vec_extract(v_x, 0);
+    x[1] = vec_extract(v_x, 1);
+    x[2] = vec_extract(v_x, 2);
+    x[3] = vec_extract(v_x, 3);
+#else
     float arr[4];
     // note: keep type-cast here to prevent compiler bugs
     // see: https://github.com/ggml-org/llama.cpp/issues/12846
-    vec_xst(y, 0, (float *)(arr));
+    vec_xst(v_y, 0, (float *)(arr));
     for (int i = 0; i < 4; i++) {
-        x[i] = LM_GGML_FP32_TO_FP16(arr[i]);
+        x[i] = LM_GGML_CPU_FP32_TO_FP16(arr[i]);
     }
+#endif
 }
 #define LM_GGML_F16_VEC                LM_GGML_F32x4
@@ -890,3 +1178,7 @@ static inline void __lzs_f16cx4_store(lm_ggml_fp16_t * x, __vector float y) {
 #define LM_GGML_F32_ARR (LM_GGML_F32_STEP/LM_GGML_F32_EPR)
 #define LM_GGML_F16_ARR (LM_GGML_F16_STEP/LM_GGML_F16_EPR)
 #endif
+#ifdef __cplusplus
+}
+#endif

package/cpp/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} RENAMED Viewed

@@ -1,4 +1,4 @@
-#include "ggml-cpu-traits.h"
+#include "traits.h"
 #include "ggml-backend-impl.h"
 #include "ggml-backend.h"