@fugood/llama.node 1.0.0-beta.6 → 1.0.0
- package/CMakeLists.txt +2 -0
- package/lib/binding.ts +12 -0
- package/lib/index.js +10 -0
- package/lib/index.ts +17 -1
- package/package.json +14 -14
- package/src/EmbeddingWorker.cpp +1 -1
- package/src/LlamaCompletionWorker.cpp +7 -3
- package/src/LlamaCompletionWorker.h +2 -0
- package/src/LlamaContext.cpp +49 -6
- package/src/LlamaContext.h +1 -0
- package/src/RerankWorker.h +26 -0
- package/src/common.hpp +1 -1
- package/src/llama.cpp/CMakeLists.txt +1 -1
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
- package/src/llama.cpp/ggml/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +10 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +109 -108
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +13 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +53 -52
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +56 -55
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +42 -41
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +24 -23
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +29 -28
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +30 -29
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +83 -82
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +20 -19
- package/src/llama.cpp/ggml/src/ggml-cpu/common.h +3 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +9 -3
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +59 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +48 -48
- package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +25 -24
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +15 -14
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +211 -33
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
- package/src/llama.cpp/include/llama.h +6 -3
- package/src/llama.cpp/src/llama-arch.cpp +54 -0
- package/src/llama.cpp/src/llama-arch.h +17 -0
- package/src/llama.cpp/src/llama-batch.cpp +20 -7
- package/src/llama.cpp/src/llama-chat.cpp +11 -6
- package/src/llama.cpp/src/llama-context.cpp +0 -1
- package/src/llama.cpp/src/llama-graph.cpp +19 -4
- package/src/llama.cpp/src/llama-graph.h +14 -2
- package/src/llama.cpp/src/llama-hparams.h +6 -0
- package/src/llama.cpp/src/llama-kv-cache-unified.cpp +28 -2
- package/src/llama.cpp/src/llama-kv-cells.h +33 -9
- package/src/llama.cpp/src/llama-model.cpp +518 -1
- package/src/llama.cpp/src/llama-model.h +22 -0
- package/src/llama.cpp/src/llama-quant.cpp +87 -5
package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h:

@@ -2,10 +2,167 @@
 
 #include "ggml-cpu-impl.h"
 
+#ifdef __ARM_FEATURE_SVE
+#include <arm_sve.h>
+#endif // __ARM_FEATURE_SVE
+
+#if defined(__ARM_NEON) && !defined(__CUDACC__) && !defined(__MUSACC__)
+// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
+//
+// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
+//
+#include <arm_neon.h>
+#endif
+
+#if defined(__F16C__)
+#include <immintrin.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 //
 // simd mappings
 //
 
+// FP16 to FP32 conversion
+
+// 16-bit float
+// on Arm, we use __fp16
+// on x86, we use uint16_t
+//
+// for old CUDA compilers (<= 11), we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/10616
+// for MUSA compilers, we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/11843
+//
+#if defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__)
+    #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) neon_compute_fp16_to_fp32(x)
+    #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) neon_compute_fp32_to_fp16(x)
+
+    #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
+
+    static inline float neon_compute_fp16_to_fp32(ggml_fp16_t h) {
+        __fp16 tmp;
+        memcpy(&tmp, &h, sizeof(ggml_fp16_t));
+        return (float)tmp;
+    }
+
+    static inline ggml_fp16_t neon_compute_fp32_to_fp16(float f) {
+        ggml_fp16_t res;
+        __fp16 tmp = f;
+        memcpy(&res, &tmp, sizeof(ggml_fp16_t));
+        return res;
+    }
+#elif defined(__F16C__)
+    #ifdef _MSC_VER
+        #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
+        #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
+    #else
+        #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
+        #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
+    #endif
+#elif defined(__POWER9_VECTOR__)
+    #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) power_compute_fp16_to_fp32(x)
+    #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) power_compute_fp32_to_fp16(x)
+    /* the inline asm below is about 12% faster than the lookup method */
+    #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
+    #define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x)
+
+    static inline float power_compute_fp16_to_fp32(ggml_fp16_t h) {
+        float f;
+        double d;
+        __asm__(
+            "mtfprd %0,%2\n"
+            "xscvhpdp %0,%0\n"
+            "frsp %1,%0\n" :
+            /* temp */ "=d"(d),
+            /* out */  "=f"(f):
+            /* in */   "r"(h));
+        return f;
+    }
+
+    static inline ggml_fp16_t power_compute_fp32_to_fp16(float f) {
+        double d;
+        ggml_fp16_t r;
+        __asm__( /* xscvdphp can work on double or single precision */
+            "xscvdphp %0,%2\n"
+            "mffprd %1,%0\n" :
+            /* temp */ "=d"(d),
+            /* out */  "=r"(r):
+            /* in */   "f"(f));
+        return r;
+    }
+#elif defined(__riscv) && defined(__riscv_zfhmin)
+    static inline float riscv_compute_fp16_to_fp32(ggml_fp16_t h) {
+        float f;
+        __asm__(
+            "fmv.h.x %[f], %[h]\n\t"
+            "fcvt.s.h %[f], %[f]"
+            : [f] "=&f" (f)
+            : [h] "r" (h)
+        );
+        return f;
+    }
+
+    static inline ggml_fp16_t riscv_compute_fp32_to_fp16(float f) {
+        ggml_fp16_t res;
+        __asm__(
+            "fcvt.h.s %[f], %[f]\n\t"
+            "fmv.x.h %[h], %[f]"
+            : [h] "=&r" (res)
+            : [f] "f" (f)
+        );
+        return res;
+    }
+
+    #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) riscv_compute_fp16_to_fp32(x)
+    #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) riscv_compute_fp32_to_fp16(x)
+    #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
+    #define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x)
+#elif defined(__NNPA__)
+    #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) nnpa_compute_fp16_to_fp32(x)
+    #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) nnpa_compute_fp32_to_fp16(x)
+
+    #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
+    #define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x)
+
+    static inline float nnpa_compute_fp16_to_fp32(ggml_fp16_t h) {
+        uint16x8_t v_h = vec_splats(h);
+        uint16x8_t v_hd = vec_convert_from_fp16(v_h, 0);
+        return vec_extend_to_fp32_hi(v_hd, 0)[0];
+    }
+
+    static inline ggml_fp16_t nnpa_compute_fp32_to_fp16(float f) {
+        float32x4_t v_f = vec_splats(f);
+        float32x4_t v_zero = vec_splats(0.0f);
+        uint16x8_t v_hd = vec_round_from_fp32(v_f, v_zero, 0);
+        uint16x8_t v_h = vec_convert_to_fp16(v_hd, 0);
+        return vec_extract(v_h, 0);
+    }
+#endif
+
+// precomputed f32 table for f16 (256 KB)
+// defined in ggml-cpu.c, initialized in ggml_cpu_init()
+extern float ggml_table_f32_f16[1 << 16];
+
+// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
+// so we define GGML_CPU_FP16_TO_FP32 and GGML_CPU_FP32_TO_FP16 elsewhere for NEON.
+// This is also true for POWER9.
+#if !defined(GGML_CPU_FP16_TO_FP32)
+inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
+    uint16_t s;
+    memcpy(&s, &f, sizeof(uint16_t));
+    return ggml_table_f32_f16[s];
+}
+
+#define GGML_CPU_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
+#endif
+
+#if !defined(GGML_CPU_FP32_TO_FP16)
+#define GGML_CPU_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
+#endif
+
+
 // we define a common set of C macros which map to specific intrinsics based on the current architecture
 // we then implement the fundamental computation operations below using only these macros
 // adding support for new architectures requires to define the corresponding SIMD macros
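The fallback path above turns every FP16-to-FP32 conversion into a single table load: `ggml_table_f32_f16` holds the FP32 value for each of the 65,536 possible fp16 bit patterns. Below is a minimal standalone sketch of that strategy; the portable binary16 decoder and all names are illustrative, not code from this package:

```c
#include <stdint.h>
#include <stdio.h>
#include <string.h>

// Portable IEEE-754 binary16 -> binary32 decoder (handles subnormals, inf, NaN).
static float fp16_to_fp32(uint16_t h) {
    uint32_t sign = (uint32_t)(h & 0x8000) << 16;
    uint32_t exp  = (h >> 10) & 0x1F;
    uint32_t mant = h & 0x3FF;
    uint32_t bits;
    if (exp == 0) {
        if (mant == 0) {
            bits = sign;                          // +-0
        } else {                                  // subnormal: renormalize
            int e = -1;
            do { mant <<= 1; e++; } while ((mant & 0x400) == 0);
            bits = sign | ((uint32_t)(112 - e) << 23) | ((mant & 0x3FF) << 13);
        }
    } else if (exp == 0x1F) {
        bits = sign | 0x7F800000u | (mant << 13); // inf / NaN
    } else {
        bits = sign | ((exp + 112) << 23) | (mant << 13); // rebias 15 -> 127
    }
    float f;
    memcpy(&f, &bits, sizeof(f));
    return f;
}

// 256 KB table indexed by the raw fp16 bit pattern, like ggml_table_f32_f16.
static float table_f32_f16[1 << 16];

int main(void) {
    // One-time initialization, analogous to what ggml_cpu_init() does.
    for (uint32_t i = 0; i < (1u << 16); ++i) {
        table_f32_f16[i] = fp16_to_fp32((uint16_t) i);
    }
    // A conversion is then a single indexed load, cf. ggml_lookup_fp16_to_fp32().
    uint16_t half = 0x3800; // 0.5f in binary16
    printf("0x%04x -> %f\n", half, table_f32_f16[half]);
    return 0;
}
```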
@@ -415,7 +572,7 @@ static inline __m256 __avx_f32cx8_load(const ggml_fp16_t * x) {
     float tmp[8];
 
     for (int i = 0; i < 8; i++) {
-        tmp[i] =
+        tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]);
     }
 
     return _mm256_loadu_ps(tmp);
@@ -426,7 +583,7 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
     _mm256_storeu_ps(arr, y);
 
     for (int i = 0; i < 8; i++)
-        x[i] =
+        x[i] = GGML_CPU_FP32_TO_FP16(arr[i]);
 }
 #define GGML_F32Cx8_LOAD(x) __avx_f32cx8_load(x)
 #define GGML_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y)
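The `__avx_f32cx8_load`/`__avx_f32cx8_store` helpers above round-trip through a scalar temporary array using the per-element conversion macros. On F16C hardware the same eight-wide conversion can be done with dedicated instructions; a hedged sketch of that alternative (not this package's code; compile with `-mf16c`):

```c
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

// Load 8 fp16 values and widen to 8 floats in one instruction (VCVTPH2PS).
static __m256 f16x8_load(const uint16_t *src) {
    return _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *) src));
}

// Narrow 8 floats back to fp16 with round-to-nearest-even (VCVTPS2PH).
static void f16x8_store(uint16_t *dst, __m256 v) {
    _mm_storeu_si128((__m128i *) dst,
                     _mm256_cvtps_ph(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
}

int main(void) {
    float in[8] = {0.0f, 1.0f, 2.0f, 3.0f, 0.5f, -1.0f, 65504.0f, 1e-4f};
    uint16_t h[8];
    float out[8];
    f16x8_store(h, _mm256_loadu_ps(in));   // f32 -> f16
    _mm256_storeu_ps(out, f16x8_load(h));  // f16 -> f32
    for (int i = 0; i < 8; i++) printf("%g -> %g\n", in[i], out[i]);
    return 0;
}
```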
@@ -574,10 +731,10 @@ static inline unsigned char ggml_endian_byte(int i) {
 inline static v128_t __wasm_f16x4_load(const ggml_fp16_t * p) {
     float tmp[4];
 
-    tmp[0] =
-    tmp[1] =
-    tmp[2] =
-    tmp[3] =
+    tmp[0] = GGML_CPU_FP16_TO_FP32(p[0]);
+    tmp[1] = GGML_CPU_FP16_TO_FP32(p[1]);
+    tmp[2] = GGML_CPU_FP16_TO_FP32(p[2]);
+    tmp[3] = GGML_CPU_FP16_TO_FP32(p[3]);
 
     return wasm_v128_load(tmp);
 }
@@ -587,10 +744,10 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
 
     wasm_v128_store(tmp, x);
 
-    p[0] =
-    p[1] =
-    p[2] =
-    p[3] =
+    p[0] = GGML_CPU_FP32_TO_FP16(tmp[0]);
+    p[1] = GGML_CPU_FP32_TO_FP16(tmp[1]);
+    p[2] = GGML_CPU_FP32_TO_FP16(tmp[2]);
+    p[3] = GGML_CPU_FP32_TO_FP16(tmp[3]);
 }
 
 #define GGML_F16x4 v128_t
@@ -690,10 +847,10 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
 static inline __m128 __sse_f16x4_load(const ggml_fp16_t * x) {
     float tmp[4];
 
-    tmp[0] =
-    tmp[1] =
-    tmp[2] =
-    tmp[3] =
+    tmp[0] = GGML_CPU_FP16_TO_FP32(x[0]);
+    tmp[1] = GGML_CPU_FP16_TO_FP32(x[1]);
+    tmp[2] = GGML_CPU_FP16_TO_FP32(x[2]);
+    tmp[3] = GGML_CPU_FP16_TO_FP32(x[3]);
 
     return _mm_loadu_ps(tmp);
 }
@@ -703,10 +860,10 @@ static inline void __sse_f16x4_store(ggml_fp16_t * x, __m128 y) {
 
     _mm_storeu_ps(arr, y);
 
-    x[0] =
-    x[1] =
-    x[2] =
-    x[3] =
+    x[0] = GGML_CPU_FP32_TO_FP16(arr[0]);
+    x[1] = GGML_CPU_FP32_TO_FP16(arr[1]);
+    x[2] = GGML_CPU_FP32_TO_FP16(arr[2]);
+    x[3] = GGML_CPU_FP32_TO_FP16(arr[3]);
 }
 
 #define GGML_F32Cx4 __m128
@@ -828,7 +985,7 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
 #define GGML_F32x4_ZERO __lsx_vldi(0)
 #define GGML_F32x4_SET1(x) __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
 #define GGML_F32x4_LOAD(x) __lsx_vld((x), 0)
-#define GGML_F32x4_STORE(
+#define GGML_F32x4_STORE(x, y) __lsx_vst(y, x, 0)
 #define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a)
 #define GGML_F32x4_ADD __lsx_vfadd_s
 #define GGML_F32x4_MUL __lsx_vfmul_s
@@ -874,10 +1031,10 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
 static inline __m128 __lsx_f16x4_load(const ggml_fp16_t * x) {
     float tmp[4];
 
-    tmp[0] =
-    tmp[1] =
-    tmp[2] =
-    tmp[3] =
+    tmp[0] = GGML_CPU_FP16_TO_FP32(x[0]);
+    tmp[1] = GGML_CPU_FP16_TO_FP32(x[1]);
+    tmp[2] = GGML_CPU_FP16_TO_FP32(x[2]);
+    tmp[3] = GGML_CPU_FP16_TO_FP32(x[3]);
 
     return __lsx_vld(tmp, 0);
 }
@@ -887,10 +1044,10 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
 
     __lsx_vst(y, arr, 0);
 
-    x[0] =
-    x[1] =
-    x[2] =
-    x[3] =
+    x[0] = GGML_CPU_FP32_TO_FP16(arr[0]);
+    x[1] = GGML_CPU_FP32_TO_FP16(arr[1]);
+    x[2] = GGML_CPU_FP32_TO_FP16(arr[2]);
+    x[3] = GGML_CPU_FP32_TO_FP16(arr[3]);
 }
 
 #define GGML_F32Cx4 __m128
@@ -922,7 +1079,7 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
 #define GGML_F32_STEP 32
 #define GGML_F32_EPR 4
 
-#define GGML_F32x4
+#define GGML_F32x4 float32x4_t
 #define GGML_F32x4_ZERO vec_splats(0.0f)
 #define GGML_F32x4_SET1 vec_splats
 #define GGML_F32x4_LOAD(p) vec_xl(0, p)
@@ -962,28 +1119,45 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
 #define GGML_F16_STEP GGML_F32_STEP
 #define GGML_F16_EPR GGML_F32_EPR
 
-static inline
+static inline float32x4_t __lzs_f16cx4_load(const ggml_fp16_t * x) {
+#if defined(__NNPA__)
+    uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)x);
+    uint16x8_t v_xd = vec_convert_from_fp16(v_x, 0);
+    return vec_extend_to_fp32_hi(v_xd, 0);
+#else
     float tmp[4];
 
     for (int i = 0; i < 4; i++) {
-        tmp[i] =
+        tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]);
     }
 
     // note: keep type-cast here to prevent compiler bugs
     // see: https://github.com/ggml-org/llama.cpp/issues/12846
     return vec_xl(0, (const float *)(tmp));
+#endif
 }
 
-static inline void __lzs_f16cx4_store(ggml_fp16_t * x,
+static inline void __lzs_f16cx4_store(ggml_fp16_t * x, float32x4_t v_y) {
+#if defined(__NNPA__)
+    float32x4_t v_zero = vec_splats(0.0f);
+    uint16x8_t v_xd = vec_round_from_fp32(v_y, v_zero, 0);
+    uint16x8_t v_x = vec_convert_to_fp16(v_xd, 0);
+
+    x[0] = vec_extract(v_x, 0);
+    x[1] = vec_extract(v_x, 1);
+    x[2] = vec_extract(v_x, 2);
+    x[3] = vec_extract(v_x, 3);
+#else
     float arr[4];
 
     // note: keep type-cast here to prevent compiler bugs
     // see: https://github.com/ggml-org/llama.cpp/issues/12846
-    vec_xst(
+    vec_xst(v_y, 0, (float *)(arr));
 
     for (int i = 0; i < 4; i++) {
-        x[i] =
+        x[i] = GGML_CPU_FP32_TO_FP16(arr[i]);
     }
+#endif
 }
 
 #define GGML_F16_VEC GGML_F32x4
@@ -1004,3 +1178,7 @@ static inline void __lzs_f16cx4_store(ggml_fp16_t * x, __vector float y) {
 #define GGML_F32_ARR (GGML_F32_STEP/GGML_F32_EPR)
 #define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR)
 #endif
+
+#ifdef __cplusplus
+}
+#endif
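Per the closing comment of the first hunk, every architecture supplies the same macro vocabulary (GGML_F32_STEP, GGML_F32_EPR, GGML_F32_ARR, and a vector type with ZERO/LOAD/STORE/FMA/REDUCE mappings), and the compute kernels are written purely against it. A deliberately scalar instantiation of that contract, to show the shape the intrinsic-backed definitions above fill in; the macro bodies here are stand-ins, not ggml's definitions:

```c
#include <stdio.h>

// Scalar stand-ins for the per-arch SIMD mappings: one float per "register",
// a 4-way unrolled step, and the same ZERO/LOAD/FMA/REDUCE vocabulary.
#define F32_EPR  1
#define F32_STEP 4
#define F32_ARR  (F32_STEP/F32_EPR)
#define F32_VEC            float
#define F32_VEC_ZERO       0.0f
#define F32_VEC_LOAD(p)    (*(p))
#define F32_VEC_FMA(a,b,c) ((a) + (b)*(c))
// Reduce the F32_ARR accumulators into one scalar (assumes F32_ARR == 4).
#define F32_VEC_REDUCE(res, s) do { (res) = (s)[0] + (s)[1] + (s)[2] + (s)[3]; } while (0)

// The structure of ggml's vector dot product, written only against the macros.
static float dot_f32(int n, const float *x, const float *y) {
    F32_VEC sum[F32_ARR] = { F32_VEC_ZERO };
    const int np = n & ~(F32_STEP - 1);  // main loop bound: multiple of STEP
    for (int i = 0; i < np; i += F32_STEP) {
        for (int j = 0; j < F32_ARR; j++) {
            sum[j] = F32_VEC_FMA(sum[j],
                                 F32_VEC_LOAD(x + i + j*F32_EPR),
                                 F32_VEC_LOAD(y + i + j*F32_EPR));
        }
    }
    float res;
    F32_VEC_REDUCE(res, sum);
    for (int i = np; i < n; ++i) {       // leftovers, as in vec.cpp
        res += x[i]*y[i];
    }
    return res;
}

int main(void) {
    float a[6] = {1, 2, 3, 4, 5, 6}, b[6] = {1, 1, 1, 1, 1, 1};
    printf("%f\n", dot_f32(6, a, b));    // 21.0
    return 0;
}
```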
package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp:

@@ -219,11 +219,11 @@ void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * G
 
     // leftovers
     for (int i = np; i < n; ++i) {
-        sumf += (ggml_float)(
+        sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i]));
     }
 #else
     for (int i = 0; i < n; ++i) {
-        sumf += (ggml_float)(
+        sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i]));
     }
 #endif
 
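Both loops accumulate into `sumf`, which has type `ggml_float` (typically `double` in ggml) rather than `float`. The wider accumulator matters for long reductions: once a `float` accumulator grows large, small addends round away entirely. A quick illustrative demonstration (not package code):

```c
#include <stdio.h>

int main(void) {
    float  acc32 = 0.0f;
    double acc64 = 0.0;
    for (int i = 0; i < (1 << 25); ++i) { // 33,554,432 additions of 1.0
        acc32 += 1.0f;
        acc64 += 1.0;
    }
    // The float accumulator stalls at 2^24 = 16777216: beyond that point,
    // 1.0f is half a ulp or less and rounds away entirely.
    printf("float:  %.1f\n", acc32);      // 16777216.0
    printf("double: %.1f\n", acc64);      // 33554432.0
    return 0;
}
```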
package/src/llama.cpp/ggml/src/ggml-cpu/vec.h:

@@ -58,7 +58,7 @@ inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf
 inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; }
 inline static void ggml_vec_add_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
     for (int i = 0; i < n; ++i) {
-        z[i] =
+        z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) + GGML_CPU_FP16_TO_FP32(y[i]));
     }
 }
 inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) { for (int i = 0; i < n; ++i) z[i] = x[i] + v; }
@@ -67,7 +67,7 @@ inline static void ggml_vec_acc1_f32(const int n, float * y, const float v)
 inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; }
 inline static void ggml_vec_sub_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
     for (int i = 0; i < n; ++i) {
-        z[i] =
+        z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) - GGML_CPU_FP16_TO_FP32(y[i]));
     }
 }
 inline static void ggml_vec_set_f32 (const int n, float * x, const float v) { for (int i = 0; i < n; ++i) x[i] = v; }
@@ -75,20 +75,20 @@ inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x)
 inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = -x[i]; }
 inline static void ggml_vec_neg_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] =
+        y[i] = GGML_CPU_FP32_TO_FP16(-GGML_CPU_FP16_TO_FP32(x[i]));
     }
 }
 
 inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; }
 inline static void ggml_vec_mul_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
     for (int i = 0; i < n; ++i) {
-        z[i] =
+        z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) * GGML_CPU_FP16_TO_FP32(y[i]));
     }
 }
 inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; }
 inline static void ggml_vec_div_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
     for (int i = 0; i < n; ++i) {
-        z[i] =
+        z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) / GGML_CPU_FP16_TO_FP32(y[i]));
     }
 }
 
@@ -131,13 +131,13 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
     // leftovers
     for (int i = np; i < n; ++i) {
         for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
-            sumf[j] += (ggml_float)(
+            sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
         }
     }
 #else
     for (int i = 0; i < n; ++i) {
         for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
-            sumf[j] += (ggml_float)(
+            sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
         }
     }
 #endif
@@ -280,12 +280,12 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y,
 
     // leftovers
     for (int i = np; i < n; ++i) {
-        y[i] =
+        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
     }
 #else
     // scalar
     for (int i = 0; i < n; ++i) {
-        y[i] =
+        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
     }
 #endif
 }
@@ -430,12 +430,12 @@ inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float
 
     // leftovers
     for (int i = np; i < n; ++i) {
-        y[i] =
+        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
     }
 #else
     // scalar
     for (int i = 0; i < n; ++i) {
-        y[i] =
+        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
     }
 #endif
 }
@@ -444,103 +444,103 @@ inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) {
 inline static void ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; }
 inline static void ggml_vec_sqr_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        float v =
-        y[i] =
+        float v = GGML_CPU_FP16_TO_FP32(x[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16(v*v);
     }
 }
 inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); }
 inline static void ggml_vec_sqrt_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] =
+        y[i] = GGML_CPU_FP32_TO_FP16(sqrtf(GGML_CPU_FP16_TO_FP32(x[i])));
     }
 }
 inline static void ggml_vec_log_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); }
 inline static void ggml_vec_log_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] =
+        y[i] = GGML_CPU_FP32_TO_FP16(logf(GGML_CPU_FP16_TO_FP32(x[i])));
     }
 }
 inline static void ggml_vec_sin_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sinf(x[i]); }
 inline static void ggml_vec_sin_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] =
+        y[i] = GGML_CPU_FP32_TO_FP16(sinf(GGML_CPU_FP16_TO_FP32(x[i])));
     }
 }
 inline static void ggml_vec_cos_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = cosf(x[i]); }
 inline static void ggml_vec_cos_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] =
+        y[i] = GGML_CPU_FP32_TO_FP16(cosf(GGML_CPU_FP16_TO_FP32(x[i])));
     }
 }
 inline static void ggml_vec_abs_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); }
 inline static void ggml_vec_abs_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] =
+        y[i] = GGML_CPU_FP32_TO_FP16(fabsf(GGML_CPU_FP16_TO_FP32(x[i])));
     }
 }
 inline static void ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); }
 inline static void ggml_vec_sgn_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        float v =
-        y[i] =
+        float v = GGML_CPU_FP16_TO_FP32(x[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? 1.f : ((v < 0.f) ? -1.f : 0.f));
     }
 }
 inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; }
 inline static void ggml_vec_step_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] =
+        y[i] = GGML_CPU_FP32_TO_FP16((GGML_CPU_FP16_TO_FP32(x[i]) > 0.f) ? 1.f : 0.f);
     }
 }
 inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); }
 inline static void ggml_vec_tanh_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] =
+        y[i] = GGML_CPU_FP32_TO_FP16(tanhf(GGML_CPU_FP16_TO_FP32(x[i])));
     }
 }
 inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); }
 inline static void ggml_vec_elu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] =
+        y[i] = GGML_CPU_FP32_TO_FP16(expm1f(GGML_CPU_FP16_TO_FP32(x[i])));
     }
 }
 inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
 inline static void ggml_vec_relu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        float v =
-        y[i] =
+        float v = GGML_CPU_FP16_TO_FP32(x[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? v : 0.f);
     }
 }
 inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
 inline static void ggml_vec_leaky_relu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const float ns) {
     for (int i = 0; i < n; ++i) {
-        float v =
-        y[i] =
+        float v = GGML_CPU_FP16_TO_FP32(x[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16(((v > 0.f) ? v : 0.f) + ns * ((v < 0.0f) ? v : 0.f));
     }
 }
 inline static void ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); }
 inline static void ggml_vec_sigmoid_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] =
+        y[i] = GGML_CPU_FP32_TO_FP16(1.f / (1.f + expf(-GGML_CPU_FP16_TO_FP32(x[i]))));
     }
 }
 // TODO: optimize performance
 inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
 inline static void ggml_vec_hardswish_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        float v =
-        y[i] =
+        float v = GGML_CPU_FP16_TO_FP32(x[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16(v * fminf(1.0f, fmaxf(0.0f, (v + 3.0f) / 6.0f)));
     }
 }
 inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
 inline static void ggml_vec_hardsigmoid_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] =
+        y[i] = GGML_CPU_FP32_TO_FP16(fminf(1.0f, fmaxf(0.0f, (GGML_CPU_FP16_TO_FP32(x[i]) + 3.0f) / 6.0f)));
     }
 }
 inline static void ggml_vec_exp_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = expf(x[i]); }
 inline static void ggml_vec_exp_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] =
+        y[i] = GGML_CPU_FP32_TO_FP16(expf(GGML_CPU_FP16_TO_FP32(x[i])));
     }
 }
 
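Every f16 op in the hunk above follows the same pattern: widen the fp16 operand to f32, apply the scalar operation, narrow the result back to fp16. A self-contained sketch of that pattern, assuming a compiler with C23 `_Float16` support (recent GCC/Clang) in place of the per-architecture conversions from simd-mappings.h:

```c
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef uint16_t ggml_fp16_t; // fp16 storage type, as on x86

// Stand-ins for GGML_CPU_FP16_TO_FP32 / GGML_CPU_FP32_TO_FP16, using the
// compiler's native _Float16 (an assumption, not this package's code).
static float fp16_to_fp32(ggml_fp16_t h) {
    _Float16 t;
    memcpy(&t, &h, sizeof(t));
    return (float)t;
}

static ggml_fp16_t fp32_to_fp16(float f) {
    _Float16 t = (_Float16)f;
    ggml_fp16_t h;
    memcpy(&h, &t, sizeof(h));
    return h;
}

// Same shape as ggml_vec_tanh_f16 after this change: widen, compute, narrow.
static void vec_tanh_f16(const int n, ggml_fp16_t *y, const ggml_fp16_t *x) {
    for (int i = 0; i < n; ++i) {
        y[i] = fp32_to_fp16(tanhf(fp16_to_fp32(x[i])));
    }
}

int main(void) {
    ggml_fp16_t x[2] = { fp32_to_fp16(0.0f), fp32_to_fp16(1.0f) };
    ggml_fp16_t y[2];
    vec_tanh_f16(2, y, x);
    printf("tanh(0) = %f, tanh(1) = %f\n", fp16_to_fp32(y[0]), fp16_to_fp32(y[1]));
    return 0;
}
```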
@@ -562,9 +562,9 @@ inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp
 
 inline static void ggml_vec_gelu_erf_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        float xi =
+        float xi = GGML_CPU_FP16_TO_FP32(x[i]);
         float res = 0.5f*xi*(1.0f + erff(xi*SQRT_2_INV));
-        y[i] =
+        y[i] = GGML_CPU_FP32_TO_FP16(res);
     }
 }
 
@@ -577,9 +577,9 @@ inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
         } else if (x[i] >= 10.0f) {
             y[i] = x[i];
         } else {
-            ggml_fp16_t fp16 =
+            ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
             memcpy(&t, &fp16, sizeof(uint16_t));
-            y[i] =
+            y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[t]);
         }
     }
 }
@@ -613,9 +613,9 @@ inline static float ggml_gelu_quick_f32(float x) {
 inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
     uint16_t t;
     for (int i = 0; i < n; ++i) {
-        ggml_fp16_t fp16 =
+        ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
         memcpy(&t, &fp16, sizeof(uint16_t));
-        y[i] =
+        y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]);
    }
 }
 #else
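`ggml_vec_gelu_f32` and `ggml_vec_gelu_quick_f32` above avoid a transcendental call per element by narrowing the input to fp16 and indexing a precomputed 65,536-entry table keyed by the fp16 bit pattern. A hypothetical reconstruction of that trick (the table name, init function, and `_Float16` conversions are illustrative; the GELU formula is the standard tanh-based approximation):

```c
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint16_t table_gelu_f16[1 << 16]; // fp16 results keyed by fp16 bits (128 KB)

static float f16_bits_to_f32(uint16_t b) {
    _Float16 t; memcpy(&t, &b, sizeof(t)); return (float)t;
}
static uint16_t f32_to_f16_bits(float f) {
    _Float16 t = (_Float16)f; uint16_t b; memcpy(&b, &t, sizeof(b)); return b;
}

// Standard tanh-based GELU approximation.
static float gelu_ref(float x) {
    const float a = 0.044715f;
    const float s = 0.7978845608f; // sqrt(2/pi)
    return 0.5f*x*(1.0f + tanhf(s*x*(1.0f + a*x*x)));
}

// One-time init: evaluate the function once for every fp16 bit pattern.
static void gelu_table_init(void) {
    for (uint32_t i = 0; i < (1u << 16); ++i) {
        table_gelu_f16[i] = f32_to_f16_bits(gelu_ref(f16_bits_to_f32((uint16_t)i)));
    }
}

// Per element: one narrow plus one load instead of a tanhf call.
static float gelu_lookup(float x) {
    return f16_bits_to_f32(table_gelu_f16[f32_to_f16_bits(x)]);
}

int main(void) {
    gelu_table_init();
    printf("gelu(1.0) ~ %f\n", gelu_lookup(1.0f)); // ~0.8412 at fp16 precision
    return 0;
}
```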
@@ -628,8 +628,8 @@ inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float *
 
 inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        float v =
-        y[i] =
+        float v = GGML_CPU_FP16_TO_FP32(x[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16(v*(1.0f/(1.0f+expf(GELU_QUICK_COEF*v))));
     }
 }
 
@@ -638,8 +638,8 @@ inline static float ggml_silu_f32(float x) {
     return x/(1.0f + expf(-x));
 }
 inline static ggml_fp16_t ggml_silu_f16(ggml_fp16_t x) {
-    float v =
-    return
+    float v = GGML_CPU_FP16_TO_FP32(x);
+    return GGML_CPU_FP32_TO_FP16(v/(1.0f + expf(-v)));
 }
 
 #if __FINITE_MATH_ONLY__
@@ -888,9 +888,9 @@ inline static float ggml_silu_backward_f32(float x, float dy) {
 }
 
 inline static ggml_fp16_t ggml_silu_backward_f16(ggml_fp16_t x, ggml_fp16_t dy) {
-    const float v =
+    const float v = GGML_CPU_FP16_TO_FP32(x);
     const float s = 1.0f/(1.0f + expf(-v));
-    return
+    return GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(dy)*s*(1.0f + v*(1.0f - s)));
 }
 
 inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) {
@@ -928,7 +928,7 @@ inline static void ggml_vec_sum_f32_ggf(const int n, ggml_float * s, const float
 inline static void ggml_vec_sum_f16_ggf(const int n, float * s, const ggml_fp16_t * x) {
     float sum = 0.0f;
     for (int i = 0; i < n; ++i) {
-        sum +=
+        sum += GGML_CPU_FP16_TO_FP32(x[i]);
     }
     *s = sum;
 }