@fugood/llama.node 1.1.11 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +5 -8
- package/lib/binding.ts +18 -1
- package/lib/index.js +2 -2
- package/lib/index.ts +2 -2
- package/package.json +20 -16
- package/src/DecodeAudioTokenWorker.cpp +23 -26
- package/src/DecodeAudioTokenWorker.h +6 -8
- package/src/DetokenizeWorker.cpp +5 -8
- package/src/DetokenizeWorker.h +6 -5
- package/src/DisposeWorker.cpp +23 -3
- package/src/DisposeWorker.h +4 -2
- package/src/EmbeddingWorker.cpp +9 -35
- package/src/EmbeddingWorker.h +3 -2
- package/src/LlamaCompletionWorker.cpp +217 -315
- package/src/LlamaCompletionWorker.h +6 -12
- package/src/LlamaContext.cpp +166 -396
- package/src/LlamaContext.h +8 -13
- package/src/LoadSessionWorker.cpp +22 -19
- package/src/LoadSessionWorker.h +3 -2
- package/src/RerankWorker.h +3 -2
- package/src/SaveSessionWorker.cpp +22 -19
- package/src/SaveSessionWorker.h +3 -2
- package/src/TokenizeWorker.cpp +38 -35
- package/src/TokenizeWorker.h +12 -3
- package/src/common.hpp +0 -458
- package/src/llama.cpp/common/arg.cpp +50 -30
- package/src/llama.cpp/common/chat.cpp +250 -1
- package/src/llama.cpp/common/chat.h +4 -0
- package/src/llama.cpp/common/common.h +1 -1
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +21 -1
- package/src/llama.cpp/common/log.cpp +53 -2
- package/src/llama.cpp/common/log.h +10 -4
- package/src/llama.cpp/common/sampling.cpp +23 -2
- package/src/llama.cpp/common/sampling.h +3 -1
- package/src/llama.cpp/common/speculative.cpp +1 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +3 -2
- package/src/llama.cpp/ggml/include/ggml-backend.h +15 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-metal.h +0 -6
- package/src/llama.cpp/ggml/include/ggml.h +56 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +21 -14
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +210 -96
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +57 -59
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +6 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +25 -38
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +4 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +379 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +41 -37
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +150 -28
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +320 -73
- package/src/llama.cpp/include/llama.h +5 -6
- package/src/llama.cpp/src/llama-adapter.cpp +33 -0
- package/src/llama.cpp/src/llama-adapter.h +3 -0
- package/src/llama.cpp/src/llama-arch.cpp +28 -4
- package/src/llama.cpp/src/llama-arch.h +3 -0
- package/src/llama.cpp/src/llama-context.cpp +65 -57
- package/src/llama.cpp/src/llama-context.h +1 -1
- package/src/llama.cpp/src/llama-graph.cpp +57 -11
- package/src/llama.cpp/src/llama-graph.h +8 -0
- package/src/llama.cpp/src/llama-hparams.cpp +37 -0
- package/src/llama.cpp/src/llama-hparams.h +10 -3
- package/src/llama.cpp/src/llama-kv-cache.cpp +56 -38
- package/src/llama.cpp/src/llama-kv-cache.h +9 -0
- package/src/llama.cpp/src/llama-model.cpp +217 -97
- package/src/llama.cpp/src/llama-model.h +0 -1
- package/src/llama.cpp/src/llama-quant.cpp +3 -3
- package/src/llama.cpp/src/llama-sampling.cpp +226 -126
- package/src/llama.cpp/src/llama.cpp +53 -10
- package/src/anyascii.c +0 -22223
- package/src/anyascii.h +0 -42
- package/src/tts_utils.cpp +0 -371
- package/src/tts_utils.h +0 -103
@@ -53,9 +53,9 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i

 #if defined(__VXE__) || defined(__VXE2__)
     for (int i = 0; i < nb; i++) {
-
-
-
+        float32x4_t srcv [8];
+        float32x4_t asrcv[8];
+        float32x4_t amaxv[8];

         for (int j = 0; j < 8; j++) srcv[j] = vec_xl(0, x + i*32 + 4*j);
         for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]);
@@ -74,8 +74,8 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
         y[i].d = GGML_CPU_FP32_TO_FP16(d);

         for (int j = 0; j < 8; j++) {
-            const
-            const
+            const float32x4_t v = vec_mul(srcv[j], vec_splats(id));
+            const int32x4_t vi = vec_signed(v);

             y[i].qs[4*j + 0] = vec_extract(vi, 0);
             y[i].qs[4*j + 1] = vec_extract(vi, 1);
@@ -98,9 +98,9 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i

 #if defined(__VXE__) || defined(__VXE2__)
     for (int i = 0; i < nb; i++) {
-
-
-
+        float32x4_t srcv [8];
+        float32x4_t asrcv[8];
+        float32x4_t amaxv[8];

         for (int j = 0; j < 8; j++) srcv[j] = vec_xl(0, x + i*32 + 4*j);
         for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]);
@@ -118,11 +118,11 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i

         y[i].d = GGML_CPU_FP32_TO_FP16(d);

-
+        int32x4_t acc = vec_splats(0);

         for (int j = 0; j < 8; j++) {
-            const
-            const
+            const float32x4_t v = vec_mul(srcv[j], vec_splats(id));
+            const int32x4_t vi = vec_signed(v);

             y[i].qs[4*j + 0] = vec_extract(vi, 0);
             y[i].qs[4*j + 1] = vec_extract(vi, 1);
@@ -162,37 +162,36 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
     float sumf = 0;

 #if defined(__VXE__) || defined(__VXE2__)
-
+    float32x4_t acc = vec_splats(0.0f);

-    const
-    const
+    const uint8x16_t v_m = vec_splats((const uint8_t)0x0F);
+    const int8x16_t v_s = vec_splats( (const int8_t)0x08);

     for (; ib < nb; ++ib) {
-        const
-        const
-        const
+        const uint8x16_t v_x = vec_xl(0, x[ib].qs);
+        const int8x16_t v_xl = (const int8x16_t)(v_x & v_m);
+        const int8x16_t v_xh = (const int8x16_t)(v_x >> 4);

-        const
-        const
+        const int8x16_t v_xls = vec_sub(v_xl, v_s);
+        const int8x16_t v_xhs = vec_sub(v_xh, v_s);

-        const
-        const
+        const int8x16_t v_yl = vec_xl(0 , y[ib].qs);
+        const int8x16_t v_yh = vec_xl(QK8_0/2, y[ib].qs);

-        const
-        const
-        const
-        const
+        const int16x8_t v_xylso = vec_mulo(v_xls, v_yl);
+        const int16x8_t v_xylse = vec_mule(v_xls, v_yl);
+        const int16x8_t v_xyhso = vec_mulo(v_xhs, v_yh);
+        const int16x8_t v_xyhse = vec_mule(v_xhs, v_yh);

-
+        int16x8_t v_xy_ = v_xylso + v_xylse + v_xyhso + v_xyhse; v_xy_ += vec_reve(v_xy_);

-        const
-        const
+        const float32x4_t v_xy = vec_float(vec_unpackh(v_xy_));
+        const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));

         acc = vec_madd(v_xy, v_d, acc);
     }

-    sumf = acc
-
+    sumf = vec_hsum_f32x4(acc);
     *s = sumf;
 #else
     UNUSED(nb);
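The rewritten q4_0 x q8_0 kernel above unpacks two 4-bit weights from every q4_0 byte with the 0x0F mask (v_m) and removes the +8 bias with v_s before the widening multiplies. A minimal scalar sketch of that unpacking, with a hypothetical helper name and no VXE intrinsics:

    #include <stdint.h>

    // Scalar model of the v_m / v_s step: each q4_0 byte packs two 4-bit
    // values stored with a +8 bias, so unpacking yields signed values in [-8, 7].
    static inline void unpack_q4_0_byte(uint8_t q, int8_t *lo, int8_t *hi) {
        *lo = (int8_t)((q & 0x0F) - 8);  // low nibble
        *hi = (int8_t)((q >> 4)   - 8);  // high nibble
    }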
@@ -249,8 +248,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
         acc = vec_madd(v_xy, v_d, acc);
     }

-    sumf = acc
-
+    sumf = vec_hsum_f32x4(acc) + summs;
     *s = sumf;
 #else
     UNUSED(nb);
@@ -351,7 +349,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
         v_sum1 = vec_madd(v_xy1f, v_d1, v_sum1);
     }

-    sumf +=
+    sumf += vec_hsum_f32x4(v_sum0) + vec_hsum_f32x4(v_sum1);

 #pragma GCC unroll 4
     for (; ib < nb; ++ib) {
@@ -390,7 +388,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
         const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d));
         const float32x4_t v_acc = vec_madd(v_xyf, v_d, vec_splats(0.0f));

-        sumf +=
+        sumf += vec_hsum_f32x4(v_acc);
     }

     *s = sumf;
@@ -502,7 +500,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
         v_sum1 = vec_madd(v_xy1f, v_d1, v_sum1);
     }

-    sumf +=
+    sumf += vec_hsum_f32x4(v_sum0) + vec_hsum_f32x4(v_sum1) + summs0 + summs1;

 #pragma GCC unroll 4
     for (; ib < nb; ++ib) {
@@ -543,7 +541,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
         const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d));
         const float32x4_t v_acc = vec_madd(v_xyf, v_d, v_acc);

-        sumf +=
+        sumf += vec_hsum_f32x4(v_acc) + summs;
     }

     *s = sumf;
@@ -575,7 +573,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
     float sumf = 0;

 #if defined(__VXE__) || defined(__VXE2__)
-
+    float32x4_t acc = vec_splats(0.0f);

 #pragma GCC unroll 8
     for (; ib < nb; ++ib) {
@@ -594,7 +592,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
         acc = vec_madd(v_xy, v_d, acc);
     }

-    sumf = acc
+    sumf = vec_hsum_f32x4(acc);

     *s = sumf;
 #else
@@ -718,10 +716,10 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
             isum2 = ggml_vec_dot(v_z, q3bytes[2], q8bytes[6]);
             isum3 = ggml_vec_dot(v_z, q3bytes[3], q8bytes[7]);

-            isum += (isum0
-            isum += (isum1
-            isum += (isum2
-            isum += (isum3
+            isum += vec_hsum_i32x4(isum0) * scale[0];
+            isum += vec_hsum_i32x4(isum1) * scale[1];
+            isum += vec_hsum_i32x4(isum2) * scale[2];
+            isum += vec_hsum_i32x4(isum3) * scale[3];

             scale += 4;

@@ -819,7 +817,7 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
         v_xl[1] = (int8x16_t)vec_and(v_x[1], v_lm);

         const int32x4_t p1 = ggml_vec_dot(ggml_vec_dot(v_z, v_xl[0], v_y[0]), v_xl[1], v_y[1]);
-        sumi1 += (p1
+        sumi1 += vec_hsum_i32x4(p1) * scales[2*j+0];

         v_y[0] = vec_xl(0 , y0);
         v_y[1] = vec_xl(16, y0);
@@ -829,7 +827,7 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
         v_xl[1] = (int8x16_t)vec_sr(v_x[1], 4);

         const int32x4_t p2 = ggml_vec_dot(ggml_vec_dot(v_z, v_xl[0], v_y[0]), v_xl[1], v_y[1]);
-        sumi2 += (p2
+        sumi2 += vec_hsum_i32x4(p2) * scales[2*j+1];
     }

     sumf += d * (sumi1 + sumi2);
@@ -911,7 +909,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
     const int32x4_t v_minsho = vec_mulo(v_ysums, v_minsh);
     const int32x4_t v_minshe = vec_mule(v_ysums, v_minsh);
     const int32x4_t v_mins = vec_add(v_minsho, v_minshe);
-    const int32_t mins = v_mins
+    const int32_t mins = vec_hsum_i32x4(v_mins);

     const uint8_t * scales = (const uint8_t *)utmp;
     const uint8_t * GGML_RESTRICT x0l = x[i].qs;
@@ -948,8 +946,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
         int32x4_t sumi0 = ggml_vec_dot(ggml_vec_dot(v_z, q5b[0], v_y[0]), q5b[1], v_y[1]);
         int32x4_t sumi1 = ggml_vec_dot(ggml_vec_dot(v_z, q5b[2], v_y[2]), q5b[3], v_y[3]);

-        sumi += (sumi0
-        sumi += (sumi1
+        sumi += vec_hsum_i32x4(sumi0) * *scales++;
+        sumi += vec_hsum_i32x4(sumi1) * *scales++;
     }

     sumf += d * sumi - dmin * mins;
@@ -1020,7 +1018,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
     const int32x4_t v_minshe = vec_mule(v_ysumsh, v_scaleh);
     const int32x4_t v_mins = v_minslo + v_minsle + v_minsho + v_minshe;

-    const int32_t mins = v_mins
+    const int32_t mins = vec_hsum_i32x4(v_mins);

     int32_t isum = 0;
     for (int j = 0; j < QK_K/128; ++j) {
@@ -1060,10 +1058,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
             int32x4_t summs2 = ggml_vec_dot(v_z, q6b[2], v_y[2]);
             int32x4_t summs3 = ggml_vec_dot(v_z, q6b[3], v_y[3]);

-            isum += (summs0
-                     (summs1
-                     (summs2
-                     (summs3
+            isum += vec_hsum_i32x4(summs0) * scale[0] +
+                    vec_hsum_i32x4(summs1) * scale[1] +
+                    vec_hsum_i32x4(summs2) * scale[2] +
+                    vec_hsum_i32x4(summs3) * scale[3];

             scale += 4;

@@ -1094,10 +1092,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
             summs2 = ggml_vec_dot(v_z, q6b[2], v_y[2]);
             summs3 = ggml_vec_dot(v_z, q6b[3], v_y[3]);

-            isum += (summs0
-                     (summs1
-                     (summs2
-                     (summs3
+            isum += vec_hsum_i32x4(summs0) * scale[0] +
+                    vec_hsum_i32x4(summs1) * scale[1] +
+                    vec_hsum_i32x4(summs2) * scale[2] +
+                    vec_hsum_i32x4(summs3) * scale[3];

             scale += 4;
         }
@@ -1285,7 +1283,7 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
         const int8x16_t v_yh = vec_xl(QK8_0/2, y0->qs);
         const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh);

-        sumf += GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d) * (v_xy
+        sumf += GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d) * vec_hsum_i32x4(v_xy);
     }

     *s = sumf;
@@ -1354,8 +1352,8 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v

             h >>= 4;

-            sumi1 += (vsumi0
-            sumi2 += (vsumi1
+            sumi1 += vec_hsum_i32x4(vsumi0) * ls1;
+            sumi2 += vec_hsum_i32x4(vsumi1) * ls2;
         }

         sumf += GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2);
@@ -68,12 +68,6 @@ struct ggml_compute_params {
 #endif // __VXE2__
 #endif // __s390x__ && __VEC__

-#if defined(__s390x__) && defined(GGML_NNPA)
-#ifndef __NNPA__
-#define __NNPA__
-#endif // __NNPA__
-#endif // __s390x__ && GGML_NNPA
-
 #if defined(__ARM_FEATURE_SVE)
 #include <sys/prctl.h>
 #endif
@@ -489,11 +483,16 @@ inline static int16x8_t vec_padd_s16(int16x8_t a, int16x8_t b) {
 /**
  * @see https://github.com/ggml-org/llama.cpp/pull/14037
  */
-inline static float
+inline static float vec_hsum_f32x4(float32x4_t v) {
     float32x4_t v_temp = v + vec_reve(v);
     return v_temp[0] + v_temp[1];
 }

+inline static int32_t vec_hsum_i32x4(int32x4_t v) {
+    int32x4_t v_temp = v + vec_reve(v);
+    return v_temp[0] + v_temp[1];
+}
+
 inline static int32x4_t ggml_vec_dot(int32x4_t acc, int8x16_t a, int8x16_t b) {
     const int16x8_t p = vec_mule(a, b) + vec_mulo(a, b);
     return acc + (vec_unpackh(p) + vec_unpackl(p));
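The two helpers added above reduce a 4-lane vector to a scalar: adding the vector to its reversed copy (vec_reve) leaves the full sum in lanes 0 and 1 combined, which is what the various vec_hsum_* call sites in the quantized dot products now rely on. A scalar model of the same reduction, with hypothetical names and no VXE types:

    #include <stdint.h>

    // v + vec_reve(v) == { v0+v3, v1+v2, v2+v1, v3+v0 }, so lanes 0 and 1
    // of the temporary already add up to the horizontal sum.
    static inline float hsum_f32x4_scalar(const float v[4]) {
        return (v[0] + v[3]) + (v[1] + v[2]);
    }

    static inline int32_t hsum_i32x4_scalar(const int32_t v[4]) {
        return (v[0] + v[3]) + (v[1] + v[2]);
    }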
@@ -373,6 +373,9 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
         .vec_dot_type = GGML_TYPE_Q8_K,
         .nrows = 1,
     },
+    [GGML_TYPE_I32] = {
+        .from_float = (ggml_from_float_t) ggml_cpu_fp32_to_i32,
+    },
 };

 const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type) {
@@ -1876,6 +1879,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_im2col_back_f32(params, tensor);
             } break;
+        case GGML_OP_IM2COL_3D:
+            {
+                ggml_compute_forward_im2col_3d(params, tensor);
+            } break;
         case GGML_OP_CONV_2D:
             {
                 ggml_compute_forward_conv_2d(params, tensor);
@@ -2255,6 +2262,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             } break;
         case GGML_OP_IM2COL:
         case GGML_OP_IM2COL_BACK:
+        case GGML_OP_IM2COL_3D:
         case GGML_OP_CONV_2D:
         case GGML_OP_CONV_3D:
         case GGML_OP_CONV_2D_DW:
@@ -2691,7 +2699,10 @@ struct ggml_cplan ggml_graph_plan(
                 if (ggml_is_quantized(node->type) ||
                     // F16 -> BF16 and BF16 -> F16 copies go through intermediate F32
                     (node->src[0]->type == GGML_TYPE_F16 && node->src[1] && node->src[1]->type == GGML_TYPE_BF16) ||
-                    (node->src[0]->type == GGML_TYPE_BF16 && node->src[1] && node->src[1]->type == GGML_TYPE_F16)
+                    (node->src[0]->type == GGML_TYPE_BF16 && node->src[1] && node->src[1]->type == GGML_TYPE_F16) ||
+                    // conversion between F32 and I32
+                    (node->src[0]->type == GGML_TYPE_F32 && node->src[1] && node->src[1]->type == GGML_TYPE_I32) ||
+                    (node->src[0]->type == GGML_TYPE_I32 && node->src[1] && node->src[1]->type == GGML_TYPE_F32)) {
                     cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
                 }
             } break;
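With this change, a copy node converting between F32 and I32 reserves the same intermediate F32 work buffer as the quantized and F16/BF16 cases; as a rough worked example, a row of ne[0] = 4096 elements planned across n_tasks = 8 threads would reserve 4096 * 8 * sizeof(float) = 128 KiB.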
@@ -3206,20 +3217,12 @@ void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) {
         __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
         _mm_storel_epi64((__m128i *)(y + i), y_vec);
     }
-#elif defined(
-    for (; i
-
-
-
-
-        vec_xst(v_y, 0, (ggml_fp16_t *)(y + i));
-    }
-    for (; i + 3 < n; i += 4) {
-        float32x4_t v_x = vec_xl(0, (const float *)(x + i));
-        float32x4_t v_zero = vec_splats(0.0f);
-        uint16x8_t v_yd = vec_round_from_fp32(v_x, v_zero, 0);
-        uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0);
-        vec_xst(v_y, 0, (ggml_fp16_t *)(y + i));
+#elif defined(__riscv_zvfh)
+    for (int vl; i < n; i += vl) {
+        vl = __riscv_vsetvl_e32m2(n - i);
+        vfloat32m2_t vx = __riscv_vle32_v_f32m2(&x[i], vl);
+        vfloat16m1_t vy = __riscv_vfncvt_f_f_w_f16m1(vx, vl);
+        __riscv_vse16_v_f16m1((_Float16 *)&y[i], vy, vl);
     }
 #endif
     for (; i < n; ++i) {
@@ -3247,21 +3250,6 @@ void ggml_cpu_fp16_to_fp32(const ggml_fp16_t * x, float * y, int64_t n) {
         __m128 y_vec = _mm_cvtph_ps(x_vec);
         _mm_storeu_ps(y + i, y_vec);
     }
-#elif defined(__NNPA__)
-    for (; i + 7 < n; i += 8) {
-        uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)(x + i));
-        uint16x8_t v_yd = vec_convert_from_fp16(v_x, 0);
-        float32x4_t v_yh = vec_extend_to_fp32_hi(v_yd, 0);
-        float32x4_t v_yl = vec_extend_to_fp32_lo(v_yd, 0);
-        vec_xst(v_yh, 0, (float *)(y + i + 0));
-        vec_xst(v_yl, 0, (float *)(y + i + 4));
-    }
-    for (; i + 3 < n; i += 4) {
-        uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)(x + i));
-        uint16x8_t v_yd = vec_convert_from_fp16(v_x, 0);
-        float32x4_t v_yh = vec_extend_to_fp32_hi(v_yd, 0);
-        vec_xst(v_yh, 0, (float *)(y + i));
-    }
 #endif

     for (; i < n; ++i) {
@@ -3276,6 +3264,13 @@ void ggml_cpu_fp32_to_bf16(const float * x, ggml_bf16_t * y, int64_t n) {
     }
 }

+void ggml_cpu_fp32_to_i32(const float * x, int32_t * y, int64_t n) {
+    int64_t i = 0;
+    for (; i < n; ++i) {
+        y[i] = x[i];
+    }
+}
+
 void ggml_cpu_bf16_to_fp32(const ggml_bf16_t * x, float * y, int64_t n) {
     int64_t i = 0;
 #if defined(__AVX2__)
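The new ggml_cpu_fp32_to_i32 above is a plain element-wise cast, so the added F32 -> I32 copy path truncates toward zero like any C float-to-int conversion. A small standalone sketch (the prototype is assumed here; it is presumably declared alongside the other conversions in ggml-cpu.h, which this release also touches):

    #include <stdint.h>
    #include <stdio.h>

    // Assumed prototype, matching the definition added in the hunk above.
    void ggml_cpu_fp32_to_i32(const float * x, int32_t * y, int64_t n);

    int main(void) {
        const float x[4] = { 1.9f, -2.7f, 0.0f, 42.0f };
        int32_t y[4];
        ggml_cpu_fp32_to_i32(x, y, 4);
        for (int i = 0; i < 4; ++i) {
            printf("%d\n", y[i]);  // 1, -2, 0, 42 (truncation toward zero)
        }
        return 0;
    }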
@@ -3465,14 +3460,6 @@ int ggml_cpu_has_vxe(void) {
 #endif
 }

-int ggml_cpu_has_nnpa(void) {
-#if defined(GGML_NNPA)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
 int ggml_cpu_has_neon(void) {
 #if defined(__ARM_ARCH) && defined(__ARM_NEON)
     return 1;
@@ -190,6 +190,7 @@ static const struct ggml_backend_i ggml_backend_cpu_i = {
     /* .graph_compute = */ ggml_backend_cpu_graph_compute,
     /* .event_record = */ NULL,
     /* .event_wait = */ NULL,
+    /* .optimize_graph = */ NULL,
 };

 static ggml_guid_t ggml_backend_cpu_guid(void) {
@@ -348,8 +349,10 @@ static void ggml_backend_cpu_device_get_memory(ggml_backend_dev_t dev, size_t *
     long pages = sysconf(_SC_PHYS_PAGES);
     long page_size = sysconf(_SC_PAGE_SIZE);
     *total = pages * page_size;
+
+    // "free" system memory is ill-defined, for practical purposes assume that all of it is free:
     *free = *total;
-#endif
+#endif // _WIN32

     GGML_UNUSED(dev);
 }
@@ -576,9 +579,6 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
         if (ggml_cpu_has_vxe()) {
             features.push_back({ "VXE", "1" });
         }
-        if (ggml_cpu_has_nnpa()) {
-            features.push_back({ "NNPA", "1" });
-        }
         if (ggml_cpu_has_wasm_simd()) {
             features.push_back({ "WASM_SIMD", "1" });
         }
@@ -154,7 +154,7 @@ class tensor_traits : public ggml::cpu::tensor_traits {
             if (dst->src[0]->type == GGML_TYPE_Q4_0) {
                 return compute_forward_q4_0(params, dst);
             } else if (dst->src[0]->type == GGML_TYPE_F16) {
-                return
+                return compute_forward_fp16(params, dst);
             }
         } else if (dst->op == GGML_OP_GET_ROWS) {
             if (dst->src[0]->type == GGML_TYPE_Q4_0) {
@@ -164,7 +164,7 @@ class tensor_traits : public ggml::cpu::tensor_traits {
             return false;
         }

-    bool
+    bool compute_forward_fp16(ggml_compute_params * params, struct ggml_tensor * dst) {
         static std::atomic_flag first_to_arrive = ATOMIC_FLAG_INIT;

         const ggml_tensor * src0 = dst->src[0];
@@ -515,9 +515,6 @@ class extra_buffer_type : ggml::cpu::extra_buffer_type {
             op->src[0]->buffer &&
             (ggml_n_dims(op->src[0]) == 2) &&
             op->src[0]->buffer->buft == ggml_backend_cpu_kleidiai_buffer_type() && ctx.kernels) {
-            if (op->op == GGML_OP_GET_ROWS && op->src[1]->ne[0] != 8) {
-                return false;
-            }
             if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
                 return false;
             }
@@ -534,13 +531,8 @@ class extra_buffer_type : ggml::cpu::extra_buffer_type {
         if (op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_cpu_kleidiai_buffer_type()) {
             return (ggml::cpu::tensor_traits *) op->src[0]->extra;
         }
-        else if (ggml_kleidiai_select_kernels(ctx.features, op) &&
-
-            (op->src[1]->op == GGML_OP_PERMUTE || op->src[1]->op == GGML_OP_SOFT_MAX) &&
-            op->src[1]->ne[1] > 1) {
-            if ((op->src[0]->nb[0] != 2) ||
-                (op->src[1]->nb[0] != 4) ||
-                (op->src[0]->nb[1] * op->src[0]->ne[1] != op->src[0]->nb[2]) ||
+        else if (ggml_kleidiai_select_kernels(ctx.features, op) && op->src[1]->ne[1] > 1) {
+            if ((op->src[0]->nb[1] * op->src[0]->ne[1] != op->src[0]->nb[2]) ||
                 (op->src[1]->nb[1] * op->src[1]->ne[1] != op->src[1]->nb[2])) {
                 return nullptr;
             }