@fugood/llama.node 1.3.8 → 1.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.js +25 -18
- package/lib/binding.ts +19 -1
- package/lib/index.js +3 -3
- package/lib/index.ts +1 -1
- package/package.json +17 -17
- package/scripts/llama.cpp.patch +53 -4
- package/src/LlamaCompletionWorker.cpp +2 -2
- package/src/LlamaContext.cpp +6 -1
- package/src/llama.cpp/common/arg.cpp +1 -1
- package/src/llama.cpp/common/chat-parser.cpp +968 -0
- package/src/llama.cpp/common/chat.cpp +0 -952
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +2 -2
- package/src/llama.cpp/ggml/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +1 -1
- package/src/llama.cpp/ggml/src/CMakeLists.txt +11 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +336 -3
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp +11 -8
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +22 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +234 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +6 -0
- package/src/llama.cpp/src/CMakeLists.txt +1 -0
- package/src/llama.cpp/src/llama-arch.cpp +48 -3
- package/src/llama.cpp/src/llama-arch.h +2 -0
- package/src/llama.cpp/src/llama-context.cpp +6 -2
- package/src/llama.cpp/src/llama-hparams.h +1 -1
- package/src/llama.cpp/src/llama-model.cpp +102 -5
- package/src/llama.cpp/src/llama-model.h +4 -0
- package/src/llama.cpp/src/llama-quant.cpp +13 -5
- package/src/llama.cpp/src/models/lfm2.cpp +5 -3
- package/src/llama.cpp/src/models/models.h +51 -1
- package/src/llama.cpp/src/models/qwen3next.cpp +1042 -0
@@ -124,6 +124,58 @@ void ggml_quantize_mat_q8_0_4x8_generic(const float * GGML_RESTRICT x, void * GG
     }
 }
 
+
+void ggml_quantize_mat_q8_K_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
+    assert(QK_K == 256);
+    assert(k % QK_K == 0);
+    const int nb = k / QK_K;
+
+    block_q8_Kx4 * GGML_RESTRICT y = (block_q8_Kx4 *) vy;
+
+    // scalar
+    const int blck_size_interleave = 4;
+    float srcv[4][QK_K];
+    float iscale[4];
+
+    for (int i = 0; i < nb; i++) {
+        for (int row_iter = 0; row_iter < 4; row_iter++) {
+            float amax = 0.0f; // absolute max
+            float max = 0;
+
+            for (int j = 0; j < QK_K; j++) {
+                srcv[row_iter][j] = x[row_iter * k + i * QK_K + j];
+                // Update the maximum value of the corresponding super block
+                if(amax < fabsf(srcv[row_iter][j])) {
+                    amax = fabsf(srcv[row_iter][j]);
+                    max = srcv[row_iter][j];
+                }
+            }
+
+            iscale[row_iter] = amax ? -127.f/max : 0;
+
+            y[i].d[row_iter] = amax ? 1/iscale[row_iter] : 0;
+        }
+
+        for (int j = 0; j < QK_K / 4; j++) {
+            y[i].bsums[j] = 0;
+        }
+
+        // Quants values are interleaved in sequence of four bytes from corresponding super blocks
+        // Bsums values are interleaved in sequence of four bsums from each super block taken for interleaving
+        // i.e first four bsums from the first super block, followed by first four bsums from second super block and so on
+        for (int j = 0; j < QK_K * 4; j++) {
+            int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
+            int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
+            src_offset += (j % blck_size_interleave);
+            int index = (((j & 15) >> 2) << 2) + ((j >> 8) << 4) + ((j >> 6) & 3);
+
+            float x0 = srcv[src_id][src_offset] * iscale[src_id];
+            y[i].qs[j] = nearest_int(x0);
+            y[i].bsums[index] += y[i].qs[j];
+        }
+    }
+}
+
 void ggml_quantize_mat_q8_K_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
     assert(QK_K == 256);
     assert(k % QK_K == 0);
@@ -192,6 +244,12 @@ template <> void ggml_quantize_mat_t<8, GGML_TYPE_Q8_0>(const float * GGML_RESTR
     ggml_quantize_mat_q8_0_4x8(x, vy, n_per_row);
 }
 
+template <> void ggml_quantize_mat_t<4, GGML_TYPE_Q8_K>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) {
+    assert(nrow == 4);
+    UNUSED(nrow);
+    ggml_quantize_mat_q8_K_4x4(x, vy, n_per_row);
+}
+
 template <> void ggml_quantize_mat_t<8, GGML_TYPE_Q8_K>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) {
     assert(nrow == 4);
     UNUSED(nrow);
@@ -333,6 +391,77 @@ void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
     }
 }
 
+void ggml_gemv_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+    const int qk = QK_K;
+    const int nb = n / qk;
+    const int ncols_interleaved = 8;
+    const int blocklen = 4;
+    static const uint32_t kmask1 = 0x3f3f3f3f;
+    static const uint32_t kmask2 = 0x0f0f0f0f;
+    static const uint32_t kmask3 = 0x03030303;
+
+    assert (n % qk == 0);
+    assert (nc % ncols_interleaved == 0);
+
+    UNUSED(bs);
+    UNUSED(nr);
+
+    float sumf[8];
+    float sum_minf[8];
+    uint32_t utmp[32];
+    int sumi1;
+    int sumi2;
+    int sumi;
+
+    const block_q8_K * a_ptr = (const block_q8_K *) vy;
+    for (int x = 0; x < nc / ncols_interleaved; x++) {
+        const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb);
+
+        for (int j = 0; j < ncols_interleaved; j++) {
+            sumf[j] = 0.0;
+            sum_minf[j] = 0.0;
+        }
+        for (int l = 0; l < nb; l++) {
+            for (int sb = 0; sb < 8; sb++) {
+                memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
+                utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
+                const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
+                utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
+                utmp[sb * 4 + 2] = uaux_0;
+                utmp[sb * 4 + 0] &= kmask1;
+            }
+            for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                uint8_t * scales_0 = (uint8_t *) utmp + (k / 8) * 32;
+                uint8_t * scales_1 = (uint8_t *) utmp + (k / 8) * 32 + 16;
+                for (int j = 0; j < ncols_interleaved; j++) {
+                    sumi1 = 0;
+                    sumi2 = 0;
+                    sumi = 0;
+                    for (int i = 0; i < blocklen; ++i) {
+                        const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF);
+                        const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
+                        sumi1 = (v0 * a_ptr[l].qs[(k / 8) * 64 + (k % 8) * blocklen + i]);
+                        sumi2 = (v1 * a_ptr[l].qs[(k / 8) * 64 + (k % 8) * blocklen + i + 32]);
+                        sumi1 = sumi1 * scales_0[j];
+                        sumi2 = sumi2 * scales_1[j];
+                        sumi += sumi1 + sumi2;
+                    }
+                    sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
+                }
+            }
+            for (int sb = 0; sb < 8; sb++) {
+                uint8_t * mins = (uint8_t *) utmp + 8 + sb * 16;
+                for (int j = 0; j < ncols_interleaved; j++) {
+                    sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
+                }
+            }
+        }
+        for (int j = 0; j < ncols_interleaved; j++) {
+            s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
+        }
+    }
+}
+
 void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
     const int qk = QK_K;
     const int nb = n / qk;
@@ -727,6 +856,89 @@ void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
     }
 }
 
+void ggml_gemm_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+    const int qk = QK_K;
+    const int nb = n / qk;
+    const int ncols_interleaved = 8;
+    const int blocklen = 4;
+    static const uint32_t kmask1 = 0x3f3f3f3f;
+    static const uint32_t kmask2 = 0x0f0f0f0f;
+    static const uint32_t kmask3 = 0x03030303;
+
+    assert (n % qk == 0);
+    assert (nr % 4 == 0);
+    assert (nc % ncols_interleaved == 0);
+
+    UNUSED(nb);
+    UNUSED(ncols_interleaved);
+    UNUSED(blocklen);
+
+    float sumf[4][8];
+    float sum_minf[4][8];
+    uint32_t utmp[32];
+    int sumi1;
+    int sumi2;
+    int sumi;
+
+    for (int y = 0; y < nr / 4; y++) {
+        const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
+        for (int x = 0; x < nc / ncols_interleaved; x++) {
+            const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb);
+            for (int m = 0; m < 4; m++) {
+                for (int j = 0; j < ncols_interleaved; j++) {
+                    sumf[m][j] = 0.0;
+                    sum_minf[m][j] = 0.0;
+                }
+            }
+            for (int l = 0; l < nb; l++) {
+                for (int sb = 0; sb < 8; sb++) {
+                    memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
+                    utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
+                    const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
+                    utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
+                    utmp[sb * 4 + 2] = uaux_0;
+                    utmp[sb * 4 + 0] &= kmask1;
+                }
+                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                    uint8_t * scales_0 = (uint8_t *) utmp + (k / 8) * 32;
+                    uint8_t * scales_1 = (uint8_t *) utmp + (k / 8) * 32 + 16;
+                    for (int m = 0; m < 4; m++) {
+                        for (int j = 0; j < ncols_interleaved; j++) {
+                            sumi1 = 0;
+                            sumi2 = 0;
+                            sumi = 0;
+                            for (int i = 0; i < blocklen; ++i) {
+                                const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF);
+                                const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
+                                sumi1 = (v0 * a_ptr[l].qs[(k / 8) * 256 + (k % 8) * 4 * blocklen + m * blocklen + i]);
+                                sumi2 = (v1 * a_ptr[l].qs[(k / 8) * 256 + (k % 8) * 4 * blocklen + m * blocklen + i + 128]);
+                                sumi1 = sumi1 * scales_0[j];
+                                sumi2 = sumi2 * scales_1[j];
+                                sumi += sumi1 + sumi2;
+                            }
+                            sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
+                        }
+                    }
+                }
+                for (int sb = 0; sb < 8; sb++) {
+                    uint8_t * mins = (uint8_t *) utmp + 8 + sb * 16;
+                    for(int m = 0; m < 4; m++) {
+                        const int16_t * bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
+                        for(int j = 0; j < ncols_interleaved; j++) {
+                            sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
+                        }
+                    }
+                }
+            }
+            for (int m = 0; m < 4; m++) {
+                for (int j = 0; j < ncols_interleaved; j++) {
+                    s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
+                }
+            }
+        }
+    }
+}
+
 void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
     const int qk = QK_K;
     const int nb = n / qk;
@@ -1228,9 +1440,10 @@ static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block
 
     GGML_UNUSED(data_size);
 }
+
 static int repack_q4_K_to_q4_K_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
     GGML_ASSERT(t->type == GGML_TYPE_Q4_K);
-    GGML_ASSERT(interleave_block == 8);
+    GGML_ASSERT(interleave_block == 8 || interleave_block == 4);
     constexpr int nrows_interleaved = 8;
 
     block_q4_Kx8 * dst = (block_q4_Kx8*)t->data;
@@ -1468,6 +1681,10 @@ template <> int repack<block_q4_K, 8, 8>(struct ggml_tensor * t, const void * da
     return repack_q4_K_to_q4_K_8_bl(t, 8, data, data_size);
 }
 
+template <> int repack<block_q4_K, 4, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
+    return repack_q4_K_to_q4_K_8_bl(t, 4, data, data_size);
+}
+
 template <> int repack<block_q2_K, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
     return repack_q2_K_to_q2_K_8_bl(t, 8, data, data_size);
 }
@@ -1501,6 +1718,10 @@ template <> void gemv<block_q4_0, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t
     ggml_gemv_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
 }
 
+template <> void gemv<block_q4_K, 4, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+    ggml_gemv_q4_K_8x4_q8_K(n, s, bs, vx, vy, nr, nc);
+}
+
 template <> void gemv<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
     ggml_gemv_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
 }
@@ -1529,6 +1750,10 @@ template <> void gemm<block_q4_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t
     ggml_gemm_q4_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
 }
 
+template <> void gemm<block_q4_K, 4, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+    ggml_gemm_q4_K_8x4_q8_K(n, s, bs, vx, vy, nr, nc);
+}
+
 template <> void gemm<block_q4_0, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
     ggml_gemm_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
 }
@@ -1931,6 +2156,9 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
     static const ggml::cpu::repack::tensor_traits<block_q4_0, 4, 4, GGML_TYPE_Q8_0> q4_0_4x4_q8_0;
     static const ggml::cpu::repack::tensor_traits<block_q4_0, 8, 4, GGML_TYPE_Q8_0> q4_0_4x8_q8_0;
     static const ggml::cpu::repack::tensor_traits<block_q4_0, 8, 8, GGML_TYPE_Q8_0> q4_0_8x8_q8_0;
+
+    // instance for Q4_K
+    static const ggml::cpu::repack::tensor_traits<block_q4_K, 4, 8, GGML_TYPE_Q8_K> q4_K_8x4_q8_K;
     static const ggml::cpu::repack::tensor_traits<block_q4_K, 8, 8, GGML_TYPE_Q8_K> q4_K_8x8_q8_K;
 
     // instance for Q2
@@ -1967,6 +2195,11 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
                 return &q4_K_8x8_q8_K;
             }
         }
+        if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
+            if (cur->ne[1] % 8 == 0) {
+                return &q4_K_8x4_q8_K;
+            }
+        }
     } else if (cur->type == GGML_TYPE_Q2_K) {
         if (ggml_cpu_has_avx512()) {
             if (cur->ne[1] % 8 == 0) {
@@ -80,10 +80,12 @@ extern "C" {
 
 void ggml_quantize_mat_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
 void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
+void ggml_quantize_mat_q8_K_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
 void ggml_quantize_mat_q8_K_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
 void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemv_q4_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
@@ -91,6 +93,7 @@ void ggml_gemv_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
 void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemm_q4_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
@@ -99,10 +102,12 @@ void ggml_gemm_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
 // Native implementations
 void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
 void ggml_quantize_mat_q8_0_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
+void ggml_quantize_mat_q8_K_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
 void ggml_quantize_mat_q8_K_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
 void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemv_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
@@ -110,6 +115,7 @@ void ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
 void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemm_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
@@ -32,6 +32,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_QWEN2VL, "qwen2vl" },
     { LLM_ARCH_QWEN3, "qwen3" },
     { LLM_ARCH_QWEN3MOE, "qwen3moe" },
+    { LLM_ARCH_QWEN3NEXT, "qwen3next" },
     { LLM_ARCH_QWEN3VL, "qwen3vl" },
     { LLM_ARCH_QWEN3VLMOE, "qwen3vlmoe" },
     { LLM_ARCH_PHI2, "phi2" },
@@ -829,6 +830,38 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
         },
     },
+    {
+        LLM_ARCH_QWEN3NEXT,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
+            { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
+            { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
+            { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
+            { LLM_TENSOR_SSM_A, "blk.%d.ssm_a" },
+            { LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" },
+            { LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" },
+            { LLM_TENSOR_SSM_BETA_ALPHA, "blk.%d.ssm_ba" },
+            { LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" },
+            { LLM_TENSOR_SSM_NORM, "blk.%d.ssm_norm" },
+            { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
+        },
+    },
     {
         LLM_ARCH_QWEN3VL,
         {
@@ -2237,7 +2270,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_SHORTCONV_INPROJ, "blk.%d.shortconv.in_proj" },
             { LLM_TENSOR_SHORTCONV_OUTPROJ, "blk.%d.shortconv.out_proj" },
             { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
-            {
+            { LLM_TENSOR_OUTPUT_NORM, "token_embd_norm" }, // note: wrong tensor name
             { LLM_TENSOR_OUTPUT, "output" },
         }
     },
@@ -2259,7 +2292,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_SHORTCONV_INPROJ, "blk.%d.shortconv.in_proj" },
            { LLM_TENSOR_SHORTCONV_OUTPROJ, "blk.%d.shortconv.out_proj" },
             { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
-            {
+            { LLM_TENSOR_OUTPUT_NORM, "token_embd_norm" }, // note: wrong tensor name
             { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
             { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
             { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
@@ -2487,11 +2520,21 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
     },
 };
 
+// declare information about the model weight tensors:
+// - the layer in which the tensor is going to be used. this is needed in order to assign the correct buffer type for the weight
+// - the operator which is going to use the weight. this is needed to determine if the respective backend supports the operator
+//
+// for example, input layers are usually assigned to CPU/host buffer types
+//
+// a mismatch between the declared information and the actual layer/op in which the tensor is used can lead to sub-optimal
+// assignment of the buffer types and extra overhead during computation
+// example: https://github.com/ggml-org/llama.cpp/pull/17548
+//
 static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_TOKEN_EMBD, {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}},
     {LLM_TENSOR_POS_EMBD, {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}},
-    {LLM_TENSOR_TOKEN_EMBD_NORM, {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}},
     {LLM_TENSOR_TOKEN_TYPES, {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}},
+    {LLM_TENSOR_TOKEN_EMBD_NORM, {LLM_TENSOR_LAYER_INPUT, GGML_OP_MUL}},
     {LLM_TENSOR_OUTPUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_CLS, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_CLS_OUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
@@ -2546,6 +2589,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_SSM_X, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_SSM_DT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_SSM_OUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_SSM_BETA_ALPHA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_TIME_MIX_W1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_TIME_MIX_W2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_TIME_MIX_A1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
@@ -2744,6 +2788,7 @@ bool llm_arch_is_hybrid(const llm_arch & arch) {
         case LLM_ARCH_LFM2:
         case LLM_ARCH_LFM2MOE:
         case LLM_ARCH_NEMOTRON_H:
+        case LLM_ARCH_QWEN3NEXT:
             return true;
         default:
             return false;
@@ -36,6 +36,7 @@ enum llm_arch {
     LLM_ARCH_QWEN2VL,
     LLM_ARCH_QWEN3,
     LLM_ARCH_QWEN3MOE,
+    LLM_ARCH_QWEN3NEXT,
     LLM_ARCH_QWEN3VL,
     LLM_ARCH_QWEN3VLMOE,
     LLM_ARCH_PHI2,
@@ -381,6 +382,7 @@ enum llm_tensor {
     LLM_TENSOR_SSM_D,
     LLM_TENSOR_SSM_NORM,
     LLM_TENSOR_SSM_OUT,
+    LLM_TENSOR_SSM_BETA_ALPHA, // qwen3next
     LLM_TENSOR_TIME_MIX_W0,
     LLM_TENSOR_TIME_MIX_W1,
     LLM_TENSOR_TIME_MIX_W2,
@@ -1,5 +1,6 @@
 #include "llama-context.h"
 
+#include "llama-arch.h"
 #include "llama-impl.h"
 #include "llama-batch.h"
 #include "llama-io.h"
@@ -299,7 +300,7 @@ llama_context::llama_context(
 
     cross.v_embd.clear();
 
-    const uint32_t n_seqs = cparams.
+    const uint32_t n_seqs = cparams.n_seq_max;
     const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
 
     // avoid reserving graphs with zero outputs - assume one output per sequence
@@ -542,7 +543,7 @@ bool llama_context::memory_update(bool optimize) {
         throw std::runtime_error("failed to initialize memory context");
     }
 
-    const uint32_t n_seqs = cparams.
+    const uint32_t n_seqs = cparams.n_seq_max;
     const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
 
     auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
@@ -1386,6 +1387,9 @@ void llama_context::output_reorder() {
 //
 
 uint32_t llama_context::graph_max_nodes() const {
+    if (model.arch == LLM_ARCH_QWEN3NEXT) {
+        return std::max<uint32_t>(8192u, 32u*model.n_tensors());
+    }
     return std::max<uint32_t>(1024u, 8u*model.n_tensors());
 }
 