@fugood/llama.node 1.1.1 → 1.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +14 -14
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +71 -596
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +14 -286
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +90 -569
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +103 -596
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +55 -341
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +3 -58
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +62 -305
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +54 -314
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +94 -673
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +16 -249
- package/src/llama.cpp/src/llama-arch.cpp +22 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +2 -2
- package/src/llama.cpp/src/llama-graph.cpp +94 -0
- package/src/llama.cpp/src/llama-graph.h +12 -0
- package/src/llama.cpp/src/llama-hparams.cpp +9 -3
- package/src/llama.cpp/src/llama-hparams.h +11 -4
- package/src/llama.cpp/src/llama-model.cpp +195 -8
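This release syncs the bundled llama.cpp sources with upstream. The visible changes fall into two groups: the per-architecture CPU backends (`ggml-cpu/arch/*/quants.c` and `repack.cpp`) drop their inlined scalar fallback loops in favor of calls to shared `*_generic` implementations, and the llama core gains the `smallthinker` architecture, a `build_moe_ffn_from_probs` MoE graph helper, and a `dense_first` option for SWA layer patterns. Only a subset of hunks is shown below; the file attributions are inferred from the per-file line counts above (e.g. the six repack hunks total +16/−249, matching `arch/x86/repack.cpp`).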
package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp:

```diff
@@ -281,35 +281,9 @@ void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTR
 }
 
 #else
-
-
-
-    float id[4];
-
-    for (int i = 0; i < nb; i++) {
-        for (int row_iter = 0; row_iter < 4; row_iter++) {
-            float amax = 0.0f; // absolute max
-
-            for (int j = 0; j < QK8_0; j++) {
-                srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j];
-                amax = MAX(amax, fabsf(srcv[row_iter][j]));
-            }
-
-            const float d = amax / ((1 << 7) - 1);
-            id[row_iter] = d ? 1.0f / d : 0.0f;
-
-            y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
-        }
-
-        for (int j = 0; j < QK8_0 * 4; j++) {
-            int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
-            int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
-            src_offset += (j % blck_size_interleave);
-
-            float x0 = srcv[src_id][src_offset] * id[src_id];
-            y[i].qs[j] = roundf(x0);
-        }
-    }
+    UNUSED(nb);
+    UNUSED(y);
+    ggml_quantize_mat_q8_0_4x8_generic(x, vy, k);
 #endif
 }
 
```
```diff
@@ -531,49 +505,9 @@ void ggml_quantize_mat_q8_K_4x8(const float * GGML_RESTRICT x, void * GGML_RESTR
 }
 
 #else
-
-
-
-    float srcv[4][QK_K];
-    float iscale[4];
-
-    for (int i = 0; i < nb; i++) {
-        for (int row_iter = 0; row_iter < 4; row_iter++) {
-            float amax = 0.0f; // absolute max
-            float max = 0;
-
-            for (int j = 0; j < QK_K; j++) {
-                srcv[row_iter][j] = x[row_iter * k + i * QK_K + j];
-                // Update the maximum value of the corresponding super block
-                if(amax < fabsf(srcv[row_iter][j])) {
-                    amax = fabsf(srcv[row_iter][j]);
-                    max = srcv[row_iter][j];
-                }
-            }
-
-            iscale[row_iter] = amax ? -127.f/max : 0;
-
-            y[i].d[row_iter] = amax ? 1/iscale[row_iter] : 0;
-        }
-
-        for (int j = 0; j < QK_K / 4; j++) {
-            y[i].bsums[j] = 0;
-        }
-
-        // Quants values are interleaved in sequence of eight bytes from corresponding super blocks
-        // Bsums values are interleaved in sequence of four bsums from each super block taken for interleaving
-        // i.e first four bsums from the first super block, followed by first four bsums from second super block and so on
-        for (int j = 0; j < QK_K * 4; j++) {
-            int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
-            int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
-            src_offset += (j % blck_size_interleave);
-            int index = (((j & 31) >> 3) << 2) + ((j >> 8) << 4) + ((j >> 6) & 3);
-
-            float x0 = srcv[src_id][src_offset] * iscale[src_id];
-            y[i].qs[j] = nearest_int(x0);
-            y[i].bsums[index] += y[i].qs[j];
-        }
-    }
+    UNUSED(nb);
+    UNUSED(y);
+    ggml_quantize_mat_q8_K_4x8_generic(x, vy, k);
 #endif
 }
 
```
```diff
@@ -689,31 +623,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
     return;
 
 #endif
-
-    float sumf[8];
-    int sumi;
-
-    const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
-    for (int x = 0; x < nc / ncols_interleaved; x++) {
-        const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
-
-        for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
-        for (int l = 0; l < nb; l++) {
-            for (int k = 0; k < (qk / (2 * blocklen)); k++) {
-                for (int j = 0; j < ncols_interleaved; j++) {
-                    sumi = 0;
-                    for (int i = 0; i < blocklen; ++i) {
-                        const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
-                        const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
-                        sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
-                    }
-                    sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
-                }
-            }
-        }
-        for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
-    }
-}
+    ggml_gemv_q4_0_8x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
 }
 
 void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
```
```diff
@@ -932,61 +842,10 @@ void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
 }
 
 #else
-
-
-
-
-    int sumi1;
-    int sumi2;
-    int sumi;
-
-    const block_q8_K * a_ptr = (const block_q8_K *) vy;
-    for (int x = 0; x < nc / ncols_interleaved; x++) {
-        const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb);
-
-        for (int j = 0; j < ncols_interleaved; j++) {
-            sumf[j] = 0.0;
-            sum_minf[j] = 0.0;
-        }
-        for (int l = 0; l < nb; l++) {
-            for (int sb = 0; sb < 8; sb++) {
-                memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
-                utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
-                const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
-                utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
-                utmp[sb * 4 + 2] = uaux_0;
-                utmp[sb * 4 + 0] &= kmask1;
-            }
-            for (int k = 0; k < (qk / (2 * blocklen)); k++) {
-                uint8_t *scales_0 = (uint8_t*) utmp + (k / 4) * 32;
-                uint8_t *scales_1 = (uint8_t*) utmp + (k / 4) * 32 + 16;
-                for (int j = 0; j < ncols_interleaved; j++) {
-                    sumi1 = 0;
-                    sumi2 = 0;
-                    sumi = 0;
-                    for (int i = 0; i < blocklen; ++i) {
-                        const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF);
-                        const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
-                        sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 64 + (k % 4) * blocklen + i]);
-                        sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 64 + (k % 4) * blocklen + i + 32]);
-                        sumi1 = sumi1 * scales_0[j];
-                        sumi2 = sumi2 * scales_1[j];
-                        sumi += sumi1 + sumi2;
-                    }
-                    sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
-                }
-            }
-            for (int sb = 0; sb < 8; sb++) {
-                uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16;
-                for (int j = 0; j < ncols_interleaved; j++) {
-                    sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
-                }
-            }
-        }
-        for (int j = 0; j < ncols_interleaved; j++) {
-            s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
-        }
-    }
+    UNUSED(kmask1);
+    UNUSED(kmask2);
+    UNUSED(kmask3);
+    ggml_gemv_q4_K_8x8_q8_K_generic(n, s, bs, vx, vy, nr, nc);
 #endif
 }
 
```
```diff
@@ -1735,38 +1594,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
 }
 
 #endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
-
-    int sumi;
-
-    for (int y = 0; y < nr / 4; y++) {
-        const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
-        for (int x = 0; x < nc / ncols_interleaved; x++) {
-            const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
-            for (int m = 0; m < 4; m++) {
-                for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
-            }
-            for (int l = 0; l < nb; l++) {
-                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
-                    for (int m = 0; m < 4; m++) {
-                        for (int j = 0; j < ncols_interleaved; j++) {
-                            sumi = 0;
-                            for (int i = 0; i < blocklen; ++i) {
-                                const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
-                                const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
-                                sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
-                                         (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
-                            }
-                            sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
-                        }
-                    }
-                }
-            }
-            for (int m = 0; m < 4; m++) {
-                for (int j = 0; j < ncols_interleaved; j++)
-                    s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
-            }
-        }
-    }
+    ggml_gemm_q4_0_8x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
 }
 
 void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
```
```diff
@@ -3216,70 +3044,9 @@ void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
 }
 
 #else
-
-
-
-
-    int sumi1;
-    int sumi2;
-    int sumi;
-
-    for (int y = 0; y < nr / 4; y++) {
-        const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
-        for (int x = 0; x < nc / ncols_interleaved; x++) {
-            const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb);
-            for (int m = 0; m < 4; m++) {
-                for (int j = 0; j < ncols_interleaved; j++) {
-                    sumf[m][j] = 0.0;
-                    sum_minf[m][j] = 0.0;
-                }
-            }
-            for (int l = 0; l < nb; l++) {
-                for (int sb = 0; sb < 8; sb++) {
-                    memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
-                    utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
-                    const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
-                    utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
-                    utmp[sb * 4 + 2] = uaux_0;
-                    utmp[sb * 4 + 0] &= kmask1;
-                }
-                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
-                    uint8_t *scales_0 = (uint8_t*) utmp + (k / 4) * 32;
-                    uint8_t *scales_1 = (uint8_t*) utmp + (k / 4) * 32 + 16;
-                    for (int m = 0; m < 4; m++) {
-                        for (int j = 0; j < ncols_interleaved; j++) {
-                            sumi1 = 0;
-                            sumi2 = 0;
-                            sumi = 0;
-                            for (int i = 0; i < blocklen; ++i) {
-                                const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF);
-                                const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
-                                sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 256 + (k % 4) * 4 * blocklen + m * blocklen + i]);
-                                sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 256 + (k % 4) * 4 * blocklen + m * blocklen + i + 128]);
-                                sumi1 = sumi1 * scales_0[j];
-                                sumi2 = sumi2 * scales_1[j];
-                                sumi += sumi1 + sumi2;
-                            }
-                            sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
-                        }
-                    }
-                }
-                for (int sb = 0; sb < 8; sb++) {
-                    uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16;
-                    for(int m = 0; m < 4; m++) {
-                        const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
-                        for(int j = 0; j < ncols_interleaved; j++) {
-                            sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
-                        }
-                    }
-                }
-            }
-            for (int m = 0; m < 4; m++) {
-                for (int j = 0; j < ncols_interleaved; j++) {
-                    s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
-                }
-            }
-        }
-    }
+    UNUSED(kmask1);
+    UNUSED(kmask2);
+    UNUSED(kmask3);
+    ggml_gemm_q4_K_8x8_q8_K_generic(n, s, bs, vx, vy, nr, nc);
 #endif
 }
```
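All six hunks above apply the same refactor: the scalar `#else` fallback bodies that used to be copied into every arch file are deleted, and each function now delegates to a single shared `*_generic` implementation (with `UNUSED(...)` silencing the now-unreferenced locals). A minimal compilable sketch of the pattern, with hypothetical names rather than the actual ggml-cpu symbols:

```cpp
// sketch of the fallback-consolidation pattern (hypothetical names)
#include <cstdint>
#include <cstdio>

// the one shared scalar implementation (lives in a common source file)
static void op_generic(const float * x, float * y, int64_t k) {
    for (int64_t i = 0; i < k; ++i) {
        y[i] = x[i] * 0.5f; // stand-in for the real scalar math
    }
}

// per-arch entry point: SIMD when compiled for it, otherwise delegate
void op_arch(const float * x, float * y, int64_t k) {
#if defined(__AVX2__)
    // the vectorized path (unchanged by this diff) would live here
    op_generic(x, y, k); // placeholder so the sketch runs everywhere
#else
    op_generic(x, y, k); // what the removed per-arch scalar loops became
#endif
}

int main() {
    const float x[4] = {2, 4, 6, 8};
    float y[4];
    op_arch(x, y, 4);
    std::printf("%g %g %g %g\n", y[0], y[1], y[2], y[3]); // 1 2 3 4
}
```

Each arch file keeps only its SIMD code; the scalar math now exists in exactly one place, which is why the per-file stats above are dominated by deletions.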
package/src/llama.cpp/src/llama-arch.cpp:

```diff
@@ -88,6 +88,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_SMOLLM3, "smollm3" },
     { LLM_ARCH_LFM2, "lfm2" },
     { LLM_ARCH_DREAM, "dream" },
+    { LLM_ARCH_SMALLTHINKER, "smallthinker" },
     { LLM_ARCH_UNKNOWN, "(unknown)" },
 };
 
```
```diff
@@ -1933,6 +1934,27 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
         }
     },
+    {
+        LLM_ARCH_SMALLTHINKER,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }
+        },
+    },
     {
         LLM_ARCH_DREAM,
         {
```
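The `%d` placeholders in the per-layer entries are expanded with the block index when tensors are looked up by name. A tiny illustration of that expansion (llama.cpp itself routes this through its `LLM_TN` helpers):

```cpp
// expanding a "blk.%d.*" pattern into concrete tensor names
#include <cstdio>

int main() {
    const char * pattern = "blk.%d.ffn_gate_exps";
    for (int il = 0; il < 3; ++il) {
        char name[64];
        std::snprintf(name, sizeof(name), pattern, il);
        std::printf("%s\n", name); // blk.0.ffn_gate_exps, blk.1..., blk.2...
    }
}
```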
package/src/llama.cpp/src/llama-context.cpp:

```diff
@@ -298,7 +298,7 @@ llama_context::llama_context(
 
         cross.v_embd.clear();
 
-        // reserve pp graph first so that buffers are only allocated once
+        // reserve pp (prompt processing) graph first so that buffers are only allocated once
         {
             auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
             if (!gf) {
```
```diff
@@ -309,7 +309,7 @@ llama_context::llama_context(
             n_nodes_pp = ggml_graph_n_nodes(gf);
         }
 
-        // reserve with tg graph to get the number of splits and nodes
+        // reserve with tg (token generation) graph to get the number of splits and nodes
         {
             auto * gf = graph_reserve(n_seqs, n_seqs, n_seqs, mctx.get());
             if (!gf) {
```
package/src/llama.cpp/src/llama-graph.cpp:

```diff
@@ -938,6 +938,100 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
     return moe_out;
 }
 
+ggml_tensor * llm_graph_context::build_moe_ffn_from_probs(
+        ggml_tensor * cur,
+        ggml_tensor * probs,
+        ggml_tensor * up_exps,
+        ggml_tensor * gate_exps,
+        ggml_tensor * down_exps,
+        ggml_tensor * exp_probs_b,
+        int64_t n_expert,
+        int64_t n_expert_used,
+        llama_expert_gating_func_type gating_op,
+        int il) const {
+    const int64_t n_embd = cur->ne[0];
+    const int64_t n_tokens = cur->ne[1];
+
+    // add experts selection bias - introduced in DeepSeek V3
+    // leave probs unbiased as it's later used to get expert weights
+    ggml_tensor * selection_probs = probs;
+    if (exp_probs_b != nullptr) {
+        selection_probs = ggml_add(ctx0, probs, exp_probs_b);
+        cb(selection_probs, "ffn_moe_probs_biased", il);
+    }
+
+    // select experts
+    ggml_tensor * selected_experts = ggml_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
+    cb(selected_experts->src[0], "ffn_moe_argsort", il);
+    cb(selected_experts, "ffn_moe_topk", il);
+
+    ggml_tensor * weights = ggml_get_rows(ctx0,
+            ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens]
+    cb(weights, "ffn_moe_weights", il);
+
+    weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens);
+    if (gating_op == LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX) {
+        weights = ggml_soft_max(ctx0, weights);
+    } else {
+        weights = ggml_sigmoid(ctx0, weights);
+        ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights); // [1, n_tokens]
+        cb(weights_sum, "ffn_moe_weights_sum", il);
+
+        weights = ggml_div(ctx0, weights, weights_sum); // [n_expert_used, n_tokens]
+        cb(weights, "ffn_moe_weights_norm", il);
+    }
+
+    weights = ggml_reshape_3d(ctx0, weights, 1, n_expert_used, n_tokens);
+
+    cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens);
+
+    ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
+    cb(up, "ffn_moe_up", il);
+
+    ggml_tensor * experts = nullptr;
+    cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
+    cb(cur, "ffn_moe_gate", il);
+
+    cur = ggml_reglu_split(ctx0, cur, up);
+    cb(cur, "ffn_moe_reglu", il);
+
+    experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
+    cb(experts, "ffn_moe_down", il);
+
+    experts = ggml_mul(ctx0, experts, weights);
+    cb(cur, "ffn_moe_weighted", il);
+
+    ggml_tensor * cur_experts[LLAMA_MAX_EXPERTS] = { nullptr };
+
+    assert(n_expert_used > 0);
+
+    // order the views before the adds
+    for (uint32_t i = 0; i < hparams.n_expert_used; ++i) {
+        cur_experts[i] = ggml_view_2d(ctx0, experts, n_embd, n_tokens, experts->nb[2], i*experts->nb[1]);
+
+        ggml_build_forward_expand(gf, cur_experts[i]);
+    }
+
+    // aggregate experts
+    // note: here we explicitly use hparams.n_expert_used instead of n_expert_used
+    // to avoid potentially a large number of add nodes during warmup
+    // ref: https://github.com/ggml-org/llama.cpp/pull/14753
+    ggml_tensor * moe_out = cur_experts[0];
+
+    for (uint32_t i = 1; i < hparams.n_expert_used; ++i) {
+        moe_out = ggml_add(ctx0, moe_out, cur_experts[i]);
+    }
+
+    if (n_expert_used == 1) {
+        // avoid returning a non-contiguous tensor
+        moe_out = ggml_cont(ctx0, moe_out);
+    }
+
+    cb(moe_out, "ffn_moe_out", il);
+
+    return moe_out;
+}
+
 // input embeddings with optional lora
 ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
     const int64_t n_embd = hparams.n_embd;
```
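The routing math in `build_moe_ffn_from_probs` is easiest to see for a single token: take the top `n_expert_used` experts on the (optionally biased) probs, then weight them either with a softmax or, in the non-softmax branch, with a sigmoid followed by renormalization. A scalar sketch of the sigmoid branch, independent of ggml:

```cpp
// per-token expert routing: top-k selection, sigmoid gating, renormalize
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
    std::vector<float> probs = {0.1f, 2.0f, 0.5f, 1.5f}; // router output per expert
    const int n_expert_used = 2;

    // top-k expert ids (what ggml_top_k produces from selection_probs)
    std::vector<int> ids(probs.size());
    std::iota(ids.begin(), ids.end(), 0);
    std::partial_sort(ids.begin(), ids.begin() + n_expert_used, ids.end(),
                      [&](int a, int b) { return probs[a] > probs[b]; });
    ids.resize(n_expert_used);

    // sigmoid gating + renormalization (ggml_sigmoid, ggml_sum_rows, ggml_div)
    std::vector<float> w(n_expert_used);
    float sum = 0.0f;
    for (int i = 0; i < n_expert_used; ++i) {
        w[i] = 1.0f / (1.0f + std::exp(-probs[ids[i]]));
        sum += w[i];
    }
    for (float & wi : w) wi /= sum;

    for (int i = 0; i < n_expert_used; ++i) {
        std::printf("expert %d weight %.3f\n", ids[i], w[i]); // weights sum to 1
    }
}
```

Each selected expert's FFN output is then scaled by its weight and the results are summed, which is what the `ggml_mul` and the chain of `ggml_add` nodes over `cur_experts[]` implement in the graph.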
package/src/llama.cpp/src/llama-graph.h:

```diff
@@ -625,6 +625,18 @@ struct llm_graph_context {
             llama_expert_gating_func_type gating_op,
             int il) const;
 
+    ggml_tensor * build_moe_ffn_from_probs(
+            ggml_tensor * cur,
+            ggml_tensor * probs,
+            ggml_tensor * up_exps,
+            ggml_tensor * gate_exps,
+            ggml_tensor * down_exps,
+            ggml_tensor * exp_probs_b,
+            int64_t n_expert,
+            int64_t n_expert_used,
+            llama_expert_gating_func_type gating_op,
+            int il) const;
+
     //
     // inputs
     //
```
package/src/llama.cpp/src/llama-hparams.cpp (the two removed body lines were lost in extraction and are reconstructed here from the matching `else` branch):

```diff
@@ -2,9 +2,15 @@
 
 #include "ggml.h"
 
-void llama_hparams::set_swa_pattern(uint32_t n_pattern) {
-    for (uint32_t il = 0; il < n_layer; ++il) {
-        swa_layers[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1));
+void llama_hparams::set_swa_pattern(uint32_t n_pattern, bool dense_first) {
+    if (dense_first) {
+        for (uint32_t il = 0; il < n_layer; ++il) {
+            swa_layers[il] = n_pattern == 0 || (il % n_pattern != 0);
+        }
+    } else {
+        for (uint32_t il = 0; il < n_layer; ++il) {
+            swa_layers[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1));
+        }
     }
 }
 
```
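Extracting the new loop bodies into a free function makes the two patterns easy to check against the header comments (see the llama-hparams.h hunks below):

```cpp
// worked example of the set_swa_pattern() logic shown above
#include <cstdint>
#include <cstdio>
#include <vector>

static std::vector<bool> swa_pattern(uint32_t n_layer, uint32_t n_pattern, bool dense_first) {
    std::vector<bool> swa_layers(n_layer);
    for (uint32_t il = 0; il < n_layer; ++il) {
        swa_layers[il] = dense_first
            ? (n_pattern == 0 || (il % n_pattern != 0))
            : (n_pattern == 0 || (il % n_pattern < (n_pattern - 1)));
    }
    return swa_layers;
}

int main() {
    // n_pattern = 3, dense_first = false -> swa swa dense swa swa dense
    for (bool b : swa_pattern(6, 3, false)) std::printf("%s ", b ? "swa" : "dense");
    std::printf("\n");
    // n_pattern = 2, dense_first = true -> dense swa dense swa dense swa
    for (bool b : swa_pattern(6, 2, true)) std::printf("%s ", b ? "swa" : "dense");
    std::printf("\n");
}
```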
package/src/llama.cpp/src/llama-hparams.h:

```diff
@@ -98,7 +98,7 @@ struct llama_hparams {
     float rope_freq_scale_train;
     float rope_freq_scale_train_swa;
     uint32_t n_ctx_orig_yarn;
-    float rope_yarn_log_mul;
+    float rope_yarn_log_mul = 0.0f;
 
     std::array<int, 4> rope_sections;
 
```
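Giving `rope_yarn_log_mul` an in-class default of `0.0f` means architectures that never assign it no longer read an indeterminate value.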
```diff
@@ -140,7 +140,7 @@ struct llama_hparams {
     // for Classifiers
     uint32_t n_cls_out = 1;
 
-    // llama4
+    // llama4 smallthinker
     uint32_t n_moe_layer_step = 0;
     uint32_t n_no_rope_layer_step = 4;
     uint32_t n_attn_temp_floor_scale = 8192;
```
```diff
@@ -161,9 +161,10 @@ struct llama_hparams {
     enum llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE;
 
     // this value n_pattern means that every nth layer is dense (i.e. non-SWA)
+    // dense_first means whether the pattern is start with a dense layer
     // note that if n_pattern == 0, all layers are SWA
     // if n_pattern == 1, all layers are dense
-    // example: n_pattern = 3
+    // example 1: n_pattern = 3, dense_first = false
     // il == 0: swa
     // il == 1: swa
     // il == 2: dense
```
The removed declaration in the next hunk was also lost in extraction; it is reconstructed from the added counterpart:

```diff
@@ -172,7 +173,13 @@ struct llama_hparams {
     // il == 5: dense
     // il == 6: swa
     // etc ...
-    void set_swa_pattern(uint32_t n_pattern);
+    // example 2: n_pattern = 2, dense_first = true
+    // il == 0: dense
+    // il == 1: swa
+    // il == 2: dense
+    // il == 3: swa
+    // etc ...
+    void set_swa_pattern(uint32_t n_pattern, bool dense_first = false);
 
     // return true if one of the layers is SWA
     bool is_swa_any() const;
```