@fugood/llama.node 1.0.0-beta.7 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +2 -0
- package/lib/binding.ts +10 -0
- package/lib/index.js +8 -0
- package/lib/index.ts +14 -0
- package/package.json +14 -14
- package/src/LlamaContext.cpp +37 -0
- package/src/LlamaContext.h +1 -0
- package/src/RerankWorker.h +26 -0
- package/src/llama.cpp/CMakeLists.txt +1 -1
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
- package/src/llama.cpp/ggml/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +10 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +109 -108
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +13 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +53 -52
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +56 -55
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +42 -41
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +24 -23
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +29 -28
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +30 -29
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +83 -82
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +20 -19
- package/src/llama.cpp/ggml/src/ggml-cpu/common.h +3 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +9 -3
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +59 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +48 -48
- package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +25 -24
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +15 -14
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +211 -33
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
- package/src/llama.cpp/include/llama.h +6 -3
- package/src/llama.cpp/src/llama-arch.cpp +54 -0
- package/src/llama.cpp/src/llama-arch.h +17 -0
- package/src/llama.cpp/src/llama-batch.cpp +20 -7
- package/src/llama.cpp/src/llama-chat.cpp +11 -6
- package/src/llama.cpp/src/llama-context.cpp +0 -1
- package/src/llama.cpp/src/llama-graph.cpp +19 -4
- package/src/llama.cpp/src/llama-graph.h +14 -2
- package/src/llama.cpp/src/llama-hparams.h +6 -0
- package/src/llama.cpp/src/llama-kv-cache-unified.cpp +28 -2
- package/src/llama.cpp/src/llama-kv-cells.h +33 -9
- package/src/llama.cpp/src/llama-model.cpp +518 -1
- package/src/llama.cpp/src/llama-model.h +22 -0
- package/src/llama.cpp/src/llama-quant.cpp +87 -5
package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp

@@ -6,6 +6,7 @@
 #include "ggml-impl.h"
 #include "ggml-cpu.h"
 #include "ggml-cpu-impl.h"
+#include "simd-mappings.h"
 #include "traits.h"

 #include <cmath>
@@ -39,11 +40,11 @@ static inline __m512 __avx512_f32cx8x2_load(ggml_fp16_t *x, ggml_fp16_t *y) {
     float tmp[16];

     for (int i = 0; i < 8; i++) {
-        tmp[i] =
+        tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]);
     }

     for (int i = 0; i < 8; i++) {
-        tmp[i + 8] =
+        tmp[i + 8] = GGML_CPU_FP16_TO_FP32(y[i]);
     }

     return _mm512_loadu_ps(tmp);
@@ -54,10 +55,10 @@ static inline __m512 __avx512_repeat_f32cx16_load(__m128i x) {
     _mm_storeu_si128((__m128i*)tmphalf, x);

     for (int i = 0; i < 4; i++) {
-        tmp[i] =
-        tmp[i + 4] =
-        tmp[i + 8] =
-        tmp[i + 12] =
+        tmp[i] = GGML_CPU_FP16_TO_FP32(tmphalf[i]);
+        tmp[i + 4] = GGML_CPU_FP16_TO_FP32(tmphalf[i]);
+        tmp[i + 8] = GGML_CPU_FP16_TO_FP32(tmphalf[i]);
+        tmp[i + 12] = GGML_CPU_FP16_TO_FP32(tmphalf[i]);
     }

     return _mm512_loadu_ps(tmp);
@@ -67,7 +68,7 @@ static inline __m256 __avx_f32cx8_load(ggml_fp16_t *x) {
     float tmp[8];

     for (int i = 0; i < 8; i++) {
-        tmp[i] =
+        tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]);
     }

     return _mm256_loadu_ps(tmp);
@@ -76,8 +77,8 @@ static inline __m256 __avx_repeat_f32cx8_load(ggml_fp16_t *x) {
     float tmp[8];

     for (int i = 0; i < 4; i++) {
-        tmp[i] =
-        tmp[i + 4] =
+        tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]);
+        tmp[i + 4] = GGML_CPU_FP16_TO_FP32(x[i]);
     }

     return _mm256_loadu_ps(tmp);
@@ -88,7 +89,7 @@ static inline __m256 __avx_rearranged_f32cx8_load(ggml_fp16_t *x, __m128i arrang

     _mm_storeu_si128((__m128i*)tmphalf, _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *) x), arrangeMask));
     for (int i = 0; i < 8; i++) {
-        tmp[i] =
+        tmp[i] = GGML_CPU_FP16_TO_FP32(tmphalf[i]);
     }

     return _mm256_loadu_ps(tmp);
@@ -211,7 +212,7 @@ void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTR
        id[row_iter] = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f; //d ? 1.0f / d : 0.0f;

        // Store the scale for the individual block
-        y[i].d[row_iter] =
+        y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);

        // Store the values in blocks of eight values - Aim is to use these later for block interleaving
        srcv[row_iter][0] = v0;
@@ -297,7 +298,7 @@ void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTR
            const float d = amax / ((1 << 7) - 1);
            id[row_iter] = d ? 1.0f / d : 0.0f;

-            y[i].d[row_iter] =
+            y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
        }

        for (int j = 0; j < QK8_0 * 4; j++) {
@@ -647,7 +648,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
            const __m256 col_scale_f32 = GGML_F32Cx8_REARRANGE_LOAD(b_ptr[b].d, changemask);

            // Load and convert to FP32 scale from block_q8_0
-            const __m256 row_scale_f32 = _mm256_set1_ps(
+            const __m256 row_scale_f32 = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(a_ptr[b].d));

            // Load the block values in block_q8_0 in batches of 16 bytes and replicate the same across 256 bit vector
            __m256i lhs_vec_0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)a_ptr[b].qs));
@@ -706,7 +707,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
                    const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
                    sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
                }
-                sumf[j] += sumi *
+                sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
            }
        }
    }
@@ -972,13 +973,13 @@ void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
                    sumi2 = sumi2 * scales_1[j];
                    sumi += sumi1 + sumi2;
                }
-                sumf[j] += sumi *
+                sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
            }
        }
        for (int sb = 0; sb < 8; sb++) {
            uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16;
            for (int j = 0; j < ncols_interleaved; j++) {
-                sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) *
+                sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
            }
        }
    }
@@ -1755,7 +1756,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
                            sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
                                     (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
                        }
-                        sumf[m][j] += sumi *
+                        sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
                    }
                }
            }
@@ -3259,7 +3260,7 @@ void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
                        sumi2 = sumi2 * scales_1[j];
                        sumi += sumi1 + sumi2;
                    }
-                    sumf[m][j] += sumi *
+                    sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
                }
            }
        }
@@ -3268,7 +3269,7 @@ void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
            for(int m = 0; m < 4; m++) {
                const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
                for(int j = 0; j < ncols_interleaved; j++) {
-                    sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) *
+                    sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
                }
            }
        }
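All of the gemv/gemm changes above apply the same pattern: each block's integer dot product (sumi) is scaled by the block's fp16 scales, widened to fp32 through the CPU-backend macro that now comes from simd-mappings.h. The scalar sketch below only illustrates that pattern; the block size, argument layout, and the fp16 decode helper are assumptions made for this example, not the actual repack.cpp definitions.

// Illustrative scalar version of the scale application seen in the hunks above:
//   sumf += sumi * fp16_to_fp32(weight_scale) * fp16_to_fp32(activation_scale)
#include <math.h>
#include <stdint.h>
#include <stdio.h>

// minimal fp16 -> fp32 decode (normals and subnormals; inf/NaN not handled)
static float fp16_to_fp32(uint16_t h) {
    float sign = (h & 0x8000) ? -1.0f : 1.0f;
    int   exp  = (h >> 10) & 0x1F;
    int   man  = h & 0x3FF;
    if (exp == 0) {
        return sign * ldexpf((float) man, -24);            // subnormal
    }
    return sign * ldexpf((float) (man + 1024), exp - 25);  // normal
}

// one weight block and one activation block, each carrying a single fp16 scale
static float block_dot(const int8_t *w, uint16_t w_scale,
                       const int8_t *a, uint16_t a_scale, int block_size) {
    int32_t sumi = 0;
    for (int i = 0; i < block_size; i++) {
        sumi += (int32_t) w[i] * (int32_t) a[i];  // integer accumulation
    }
    // the fp16 scales are widened to fp32 once per block, as in the diff
    return (float) sumi * fp16_to_fp32(w_scale) * fp16_to_fp32(a_scale);
}

int main(void) {
    int8_t w[4] = { 1, -2, 3, -4 };
    int8_t a[4] = { 5,  6, 7,  8 };
    // 0x3C00 is fp16 1.0, 0x3800 is fp16 0.5 -> (5 - 12 + 21 - 32) * 0.5 = -9
    printf("%f\n", block_dot(w, 0x3C00, a, 0x3800, 4));
    return 0;
}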
package/src/llama.cpp/ggml/src/ggml-cpu/common.h

@@ -4,6 +4,7 @@
 #include "traits.h"
 #include "ggml-cpu-impl.h"
 #include "ggml-impl.h"
+#include "simd-mappings.h"

 #ifdef __cplusplus

@@ -12,11 +13,11 @@
 // convenience functions/macros for use in template calls
 // note: these won't be required after the 'traits' lookup table is used.
 static inline ggml_fp16_t f32_to_f16(float x) {
-    return
+    return GGML_CPU_FP32_TO_FP16(x);
 }

 static inline float f16_to_f32(ggml_fp16_t x) {
-    return
+    return GGML_CPU_FP16_TO_FP32(x);
 }

 static inline ggml_bf16_t f32_to_bf16(float x) {
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h

@@ -62,11 +62,17 @@ struct ggml_compute_params {
 #if defined(__s390x__) && defined(__VEC__)
 #ifndef __VXE__
 #define __VXE__
-#endif
+#endif // __VXE__
 #ifndef __VXE2__
 #define __VXE2__
-#endif
-#endif
+#endif // __VXE2__
+#endif // __s390x__ && __VEC__
+
+#if defined(__s390x__) && defined(GGML_NNPA)
+#ifndef __NNPA__
+#define __NNPA__
+#endif // __NNPA__
+#endif // __s390x__ && GGML_NNPA

 #if defined(__ARM_FEATURE_SVE)
 #include <sys/prctl.h>
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c

@@ -72,6 +72,9 @@
 #define UNUSED GGML_UNUSED
 #define SWAP(x, y, T) do { T SWAP = x; (x) = y; (y) = SWAP; } while (0)

+// precomputed f32 table for f16 (256 KB) (simd-mappings.h)
+float ggml_table_f32_f16[1 << 16];
+
 #if defined(__ARM_ARCH)
 struct ggml_arm_arch_features_type {
     int sve_cnt;
@@ -736,7 +739,7 @@ struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value) {
            {
                assert(tensor->nb[0] == sizeof(ggml_fp16_t));
                for (int i = 0; i < n; i++) {
-                    ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1),
+                    ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_CPU_FP32_TO_FP16(value));
                }
            } break;
        case GGML_TYPE_BF16:
@@ -795,7 +798,7 @@ struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) {
            {
                assert(tensor->nb[0] == sizeof(ggml_fp16_t));
                for (int i = 0; i < n; i++) {
-                    ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1),
+                    ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_CPU_FP32_TO_FP16(value));
                }
            } break;
        case GGML_TYPE_BF16:
@@ -846,7 +849,7 @@ int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i) {
        case GGML_TYPE_F16:
            {
                GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
-                return
+                return GGML_CPU_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
            }
        case GGML_TYPE_BF16:
            {
@@ -891,7 +894,7 @@ void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value) {
        case GGML_TYPE_F16:
            {
                GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
-                ((ggml_fp16_t *)(tensor->data))[i] =
+                ((ggml_fp16_t *)(tensor->data))[i] = GGML_CPU_FP32_TO_FP16(value);
            } break;
        case GGML_TYPE_BF16:
            {
@@ -920,7 +923,7 @@ int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i
        case GGML_TYPE_I32:
            return ((int32_t *) data)[0];
        case GGML_TYPE_F16:
-            return
+            return GGML_CPU_FP16_TO_FP32(((ggml_fp16_t *) data)[0]);
        case GGML_TYPE_BF16:
            return GGML_BF16_TO_FP32(((ggml_bf16_t *) data)[0]);
        case GGML_TYPE_F32:
@@ -947,7 +950,7 @@ void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2,
            } break;
        case GGML_TYPE_F16:
            {
-                ((ggml_fp16_t *)(data))[0] =
+                ((ggml_fp16_t *)(data))[0] = GGML_CPU_FP32_TO_FP16(value);
            } break;
        case GGML_TYPE_BF16:
            {
@@ -985,7 +988,7 @@ float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) {
            }
        case GGML_TYPE_F16:
            {
-                return
+                return GGML_CPU_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
            }
        case GGML_TYPE_BF16:
            {
@@ -1024,7 +1027,7 @@ void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) {
            } break;
        case GGML_TYPE_F16:
            {
-                ((ggml_fp16_t *)(tensor->data))[i] =
+                ((ggml_fp16_t *)(tensor->data))[i] = GGML_CPU_FP32_TO_FP16(value);
            } break;
        case GGML_TYPE_BF16:
            {
@@ -1051,7 +1054,7 @@ float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2,
        case GGML_TYPE_I32:
            return ((int32_t *) data)[0];
        case GGML_TYPE_F16:
-            return
+            return GGML_CPU_FP16_TO_FP32(((ggml_fp16_t *) data)[0]);
        case GGML_TYPE_BF16:
            return GGML_BF16_TO_FP32(((ggml_bf16_t *) data)[0]);
        case GGML_TYPE_F32:
@@ -1078,7 +1081,7 @@ void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2,
            } break;
        case GGML_TYPE_F16:
            {
-                ((ggml_fp16_t *)(data))[0] =
+                ((ggml_fp16_t *)(data))[0] = GGML_CPU_FP32_TO_FP16(value);
            } break;
        case GGML_TYPE_BF16:
            {
@@ -3141,9 +3144,24 @@ void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) {
        __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
        _mm_storel_epi64((__m128i *)(y + i), y_vec);
    }
+#elif defined(__NNPA__)
+    for (; i + 7 < n; i += 8) {
+        float32x4_t v_xh = vec_xl(0, (const float *)(x + i + 0));
+        float32x4_t v_xl = vec_xl(0, (const float *)(x + i + 4));
+        uint16x8_t v_yd = vec_round_from_fp32(v_xh, v_xl, 0);
+        uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0);
+        vec_xst(v_y, 0, (ggml_fp16_t *)(y + i));
+    }
+    for (; i + 3 < n; i += 4) {
+        float32x4_t v_x = vec_xl(0, (const float *)(x + i));
+        float32x4_t v_zero = vec_splats(0.0f);
+        uint16x8_t v_yd = vec_round_from_fp32(v_x, v_zero, 0);
+        uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0);
+        vec_xst(v_y, 0, (ggml_fp16_t *)(y + i));
+    }
 #endif
    for (; i < n; ++i) {
-        y[i] =
+        y[i] = GGML_CPU_FP32_TO_FP16(x[i]);
    }
 }

@@ -3167,9 +3185,25 @@ void ggml_cpu_fp16_to_fp32(const ggml_fp16_t * x, float * y, int64_t n) {
        __m128 y_vec = _mm_cvtph_ps(x_vec);
        _mm_storeu_ps(y + i, y_vec);
    }
+#elif defined(__NNPA__)
+    for (; i + 7 < n; i += 8) {
+        uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)(x + i));
+        uint16x8_t v_yd = vec_convert_from_fp16(v_x, 0);
+        float32x4_t v_yh = vec_extend_to_fp32_hi(v_yd, 0);
+        float32x4_t v_yl = vec_extend_to_fp32_lo(v_yd, 0);
+        vec_xst(v_yh, 0, (float *)(y + i + 0));
+        vec_xst(v_yl, 0, (float *)(y + i + 4));
+    }
+    for (; i + 3 < n; i += 4) {
+        uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)(x + i));
+        uint16x8_t v_yd = vec_convert_from_fp16(v_x, 0);
+        float32x4_t v_yh = vec_extend_to_fp32_hi(v_yd, 0);
+        vec_xst(v_yh, 0, (float *)(y + i));
+    }
 #endif
+
    for (; i < n; ++i) {
-        y[i] =
+        y[i] = GGML_CPU_FP16_TO_FP32(x[i]);
    }
 }

@@ -3369,6 +3403,14 @@ int ggml_cpu_has_vxe(void) {
 #endif
 }

+int ggml_cpu_has_nnpa(void) {
+#if defined(GGML_NNPA)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 int ggml_cpu_has_neon(void) {
 #if defined(__ARM_ARCH) && defined(__ARM_NEON)
     return 1;
@@ -3418,7 +3460,7 @@ int ggml_cpu_has_sme(void) {
 }

 void ggml_cpu_init(void) {
-    // needed to initialize
+    // needed to initialize ggml_time
    {
        struct ggml_init_params params = { 0, NULL, false };
        struct ggml_context * ctx = ggml_init(params);
@@ -3439,9 +3481,10 @@ void ggml_cpu_init(void) {
                uint16_t u16;
                ggml_fp16_t fp16;
            } u = {i};
-            float f =
-
-
+            float f = GGML_COMPUTE_FP16_TO_FP32(u.fp16);
+            ggml_table_f32_f16[i] = f;
+            ggml_table_gelu_f16[i] = GGML_CPU_FP32_TO_FP16(ggml_gelu_f32(f));
+            ggml_table_gelu_quick_f16[i] = GGML_CPU_FP32_TO_FP16(ggml_gelu_quick_f32(f));
        }

        const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
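The two additions above, the ggml_table_f32_f16[1 << 16] definition and the ggml_cpu_init loop that fills it from every 16-bit pattern, amount to a precomputed fp16-to-fp32 lookup table. The sketch below shows the same idea in isolation; the helper names are assumptions for this example, and the actual lookup macro lives in simd-mappings.h, which is not part of this diff excerpt.

// Illustrative stand-alone version of a precomputed fp16 -> fp32 table.
#include <math.h>
#include <stdint.h>
#include <stdio.h>

static float f32_from_f16_table[1 << 16];  // 65536 entries, 256 KB

// init-time decode (normals and subnormals; inf/NaN not handled here)
static float decode_fp16(uint16_t h) {
    float sign = (h & 0x8000) ? -1.0f : 1.0f;
    int   exp  = (h >> 10) & 0x1F;
    int   man  = h & 0x3FF;
    if (exp == 0) {
        return sign * ldexpf((float) man, -24);
    }
    return sign * ldexpf((float) (man + 1024), exp - 25);
}

static void init_table(void) {
    // walk every possible fp16 bit pattern once, like the loop in ggml_cpu_init
    for (uint32_t i = 0; i < (1u << 16); i++) {
        f32_from_f16_table[i] = decode_fp16((uint16_t) i);
    }
}

// after init, a conversion is a single indexed load on the fp16 bit pattern
static float lookup_fp16_to_fp32(uint16_t fp16_bits) {
    return f32_from_f16_table[fp16_bits];
}

int main(void) {
    init_table();
    printf("%f %f\n", lookup_fp16_to_fp32(0x3C00), lookup_fp16_to_fp32(0xC000));  // 1.0 -2.0
    return 0;
}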
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp

@@ -578,6 +578,9 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
        if (ggml_cpu_has_vxe()) {
            features.push_back({ "VXE", "1" });
        }
+        if (ggml_cpu_has_nnpa()) {
+            features.push_back({ "NNPA", "1" });
+        }
        if (ggml_cpu_has_wasm_simd()) {
            features.push_back({ "WASM_SIMD", "1" });
        }
package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp

@@ -52,6 +52,7 @@
 #include "ggml-impl.h"
 #include "ggml-cpu-impl.h"
 #include "ggml-quants.h"
+#include "simd-mappings.h"

 #include <array>
 #include <type_traits>
@@ -73,7 +74,7 @@
 namespace {

 inline float unhalf(ggml_fp16_t d) {
-    return
+    return GGML_CPU_FP16_TO_FP32(d);
 }

 ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -252,7 +253,7 @@ template <> inline float32x4_t load(const ggml_fp16_t * p) {
     float tmp[4];

     for (int i = 0; i < 4; i++) {
-        tmp[i] =
+        tmp[i] = GGML_CPU_FP16_TO_FP32(p[i]);
     }

     return vec_xl(0, (const float *)(tmp));