@fugood/llama.node 1.1.5 → 1.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +4 -0
- package/lib/index.js +6 -1
- package/lib/index.ts +6 -0
- package/lib/version.js +5 -0
- package/lib/version.ts +2 -0
- package/package.json +14 -14
- package/scripts/llama.cpp.patch +19 -15
- package/src/LlamaCompletionWorker.cpp +73 -18
- package/src/LlamaCompletionWorker.h +8 -0
- package/src/llama.cpp/CMakeLists.txt +2 -0
- package/src/llama.cpp/common/arg.cpp +147 -46
- package/src/llama.cpp/common/chat-parser.cpp +9 -1
- package/src/llama.cpp/common/chat.cpp +350 -3
- package/src/llama.cpp/common/chat.h +11 -3
- package/src/llama.cpp/common/common.cpp +54 -0
- package/src/llama.cpp/common/common.h +44 -9
- package/src/llama.cpp/ggml/CMakeLists.txt +5 -2
- package/src/llama.cpp/ggml/include/ggml-opt.h +25 -6
- package/src/llama.cpp/ggml/include/ggml-zdnn.h +16 -0
- package/src/llama.cpp/ggml/include/ggml.h +65 -3
- package/src/llama.cpp/ggml/src/CMakeLists.txt +13 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +61 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +96 -8
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +1136 -1077
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +20 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +20 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +21 -24
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +16 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +270 -11
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +3 -8
- package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +35 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/quants.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +200 -51
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +11 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/traits.cpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/traits.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +19 -4
- package/src/llama.cpp/include/llama.h +26 -0
- package/src/llama.cpp/src/llama-arch.cpp +65 -0
- package/src/llama.cpp/src/llama-arch.h +10 -0
- package/src/llama.cpp/src/llama-batch.cpp +1 -1
- package/src/llama.cpp/src/llama-chat.cpp +15 -4
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +37 -25
- package/src/llama.cpp/src/llama-context.h +6 -5
- package/src/llama.cpp/src/llama-graph.cpp +118 -9
- package/src/llama.cpp/src/llama-graph.h +38 -0
- package/src/llama.cpp/src/llama-hparams.h +5 -3
- package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +12 -6
- package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +2 -2
- package/src/llama.cpp/src/llama-kv-cache-unified.cpp +93 -69
- package/src/llama.cpp/src/llama-kv-cache-unified.h +2 -2
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +6 -2
- package/src/llama.cpp/src/llama-memory-hybrid.h +2 -2
- package/src/llama.cpp/src/llama-memory-recurrent.cpp +6 -2
- package/src/llama.cpp/src/llama-memory-recurrent.h +2 -2
- package/src/llama.cpp/src/llama-memory.h +2 -2
- package/src/llama.cpp/src/llama-model-loader.cpp +1 -0
- package/src/llama.cpp/src/llama-model-loader.h +3 -2
- package/src/llama.cpp/src/llama-model.cpp +500 -4
- package/src/llama.cpp/src/llama-model.h +25 -4
- package/src/llama.cpp/src/llama-quant.cpp +37 -1
- package/src/llama.cpp/src/llama-vocab.cpp +43 -0
@@ -206,8 +206,9 @@ void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
     const int ncols_interleaved = 4;
     const int blocklen = 4;

-    assert
-    assert
+    assert(nr == 1);
+    assert(n % qk == 0);
+    assert(nc % ncols_interleaved == 0);

     UNUSED(s);
     UNUSED(bs);
@@ -307,30 +308,28 @@ void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
     UNUSED(ncols_interleaved);
     UNUSED(blocklen);

-
-
-    int sumi;
+    float sumf[8];
+    int sumi;

-
-
-
+    const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+    for (int x = 0; x < nc / ncols_interleaved; x++) {
+        const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);

-
-
-
-
-
-
-
-
-
-                    }
-                    sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
+        for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
+        for (int l = 0; l < nb; l++) {
+            for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                for (int j = 0; j < ncols_interleaved; j++) {
+                    sumi = 0;
+                    for (int i = 0; i < blocklen; ++i) {
+                        const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
+                        const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
+                        sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
                     }
+                    sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
                 }
             }
-            for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
         }
+        for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
     }
 }

@@ -494,43 +493,73 @@ void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
     const int ncols_interleaved = 4;
     const int blocklen = 4;

-    assert
-    assert
+    assert(nr == 1);
+    assert(n % qk == 0);
+    assert(nc % ncols_interleaved == 0);

-    UNUSED(s);
     UNUSED(bs);
-    UNUSED(vx);
-    UNUSED(vy);
     UNUSED(nr);
-    UNUSED(nc);
-    UNUSED(nb);
-    UNUSED(ncols_interleaved);
-    UNUSED(blocklen);

-
-
-    int sumi;
+    float sumf[4];
+    int sumi;

-
-
-
+    const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+    for (int x = 0; x < nc / ncols_interleaved; x++) {
+        const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);

-
-
-
-
-
-
-
-
-
-                    }
-                    sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
+        for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
+        for (int l = 0; l < nb; l++) {
+            for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                for (int j = 0; j < ncols_interleaved; j++) {
+                    sumi = 0;
+                    for (int i = 0; i < blocklen; ++i) {
+                        const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
+                        const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
+                        sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
                     }
+                    sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
                 }
             }
-        for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
         }
+        for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
+    }
+}
+
+void ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+    const int ncols_interleaved = 8;
+    const int blocklen = 8;
+
+    assert(nr == 1);
+    assert(n % qk == 0);
+    assert(nc % ncols_interleaved == 0);
+
+    UNUSED(bs);
+    UNUSED(nr);
+
+    float sumf[8];
+    int sumi;
+
+    const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+    for (int x = 0; x < nc / ncols_interleaved; x++) {
+        const block_iq4_nlx8 * b_ptr = (const block_iq4_nlx8 *) vx + (x * nb);
+
+        for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
+        for (int l = 0; l < nb; l++) {
+            for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                for (int j = 0; j < ncols_interleaved; j++) {
+                    sumi = 0;
+                    for (int i = 0; i < blocklen; ++i) {
+                        const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
+                        const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
+                        sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
+                    }
+                    sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
+                }
+            }
+        }
+        for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
     }
 }

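For context on the generic kernels above: each byte of an interleaved 4-bit block packs two quants, and the scalar fallbacks recover them either by shift-based sign extension (the q4_0 paths) or by using each nibble as an index into the kvalues_iq4nl table (the iq4_nl paths), then scale the integer dot product by the per-block fp16 deltas. A minimal standalone C++ sketch of the sign-extension trick, written for illustration and not taken from the package:

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    // Unpack one packed byte into two sign-extended 4-bit values, the same way the
    // generic q4_0 gemv kernels above do: shift the low nibble into the sign position,
    // mask the high nibble in place, and let the int8_t cast do the sign extension.
    // Both results come out multiplied by 16, which the kernel undoes with its final ">> 4".
    static void unpack_nibbles(uint8_t packed, int & lo_x16, int & hi_x16) {
        lo_x16 = (int8_t) (packed << 4);   // low  nibble * 16, sign-extended
        hi_x16 = (int8_t) (packed & 0xF0); // high nibble * 16, sign-extended
    }

    int main() {
        // 0x7A -> low nibble 0xA (-6 as a signed 4-bit value), high nibble 0x7 (+7)
        int lo, hi;
        unpack_nibbles(0x7A, lo, hi);
        assert(lo == -6 * 16 && hi == 7 * 16);
        printf("lo=%d hi=%d\n", lo >> 4, hi >> 4); // prints lo=-6 hi=7
        return 0;
    }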
@@ -934,6 +963,50 @@ void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
     }
 }

+void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+    const int ncols_interleaved = 8;
+    const int blocklen = 8;
+
+    assert(n % qk == 0);
+    assert(nr % 4 == 0);
+    assert(nc % ncols_interleaved == 0);
+
+    float sumf[4][8];
+    int sumi;
+
+    for (int y = 0; y < nr / 4; y++) {
+        const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
+        for (int x = 0; x < nc / ncols_interleaved; x++) {
+            const block_iq4_nlx8 * b_ptr = (const block_iq4_nlx8 *) vx + (x * nb);
+            for (int m = 0; m < 4; m++) {
+                for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
+            }
+            for (int l = 0; l < nb; l++) {
+                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                    for (int m = 0; m < 4; m++) {
+                        for (int j = 0; j < ncols_interleaved; j++) {
+                            sumi = 0;
+                            for (int i = 0; i < blocklen; ++i) {
+                                const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
+                                const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
+                                sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
+                                         (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
+                            }
+                            sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
+                        }
+                    }
+                }
+            }
+            for (int m = 0; m < 4; m++) {
+                for (int j = 0; j < ncols_interleaved; j++)
+                    s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
+            }
+        }
+    }
+}
+
 } // extern "C"

 static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
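The new ggml_gemm_iq4_nl_8x8_q8_0_generic fallback above works one 4-row by 8-column tile per (y, x) iteration: four q8_0 activation rows packed as block_q8_0x4 against eight interleaved iq4_nl columns, accumulated in sumf[4][8] and written out with row stride bs. Its output indexing can be read as the following small helper (illustrative only; neither the helper nor its names exist in the package):

    #include <cstddef>
    #include <cstdio>

    // Map a tile-local coordinate to the flat output index used by the generic
    // 8x8 gemm above: y selects a group of 4 activation rows, x a group of 8
    // interleaved columns, and (m, j) one cell inside that 4x8 tile.
    static inline size_t out_index(size_t bs, int y, int x, int m, int j) {
        const int ncols_interleaved = 8;
        return (size_t) (y * 4 + m) * bs + (size_t) x * ncols_interleaved + j;
    }

    int main() {
        printf("%zu\n", out_index(1024, 2, 3, 1, 5)); // (2*4+1)*1024 + 3*8 + 5 = 9245
        return 0;
    }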
@@ -1285,15 +1358,16 @@ static block_iq4_nlx4 make_block_iq4_nlx4(block_iq4_nl * in, unsigned int blck_s

 static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
     GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL);
-    //GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
     GGML_ASSERT(interleave_block == 4);

-
-
+    const block_iq4_nl * src = (const block_iq4_nl *)data;
+    block_iq4_nlx4 * dst = ( block_iq4_nlx4 *)t->data;
+
     block_iq4_nl dst_tmp[4];
+
     int nrow = ggml_nrows(t);
     int nrows_interleaved = 4;
-    int nblocks = t->ne[0] /
+    int nblocks = t->ne[0] / QK4_NL;

     GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl));

@@ -1315,6 +1389,63 @@ static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_b
     GGML_UNUSED(data_size);
 }

+static block_iq4_nlx8 make_block_iq4_nlx8(block_iq4_nl * in, unsigned int blck_size_interleave) {
+    block_iq4_nlx8 out;
+
+    for (int i = 0; i < 8; i++) {
+        out.d[i] = in[i].d;
+    }
+
+    const int end = QK4_NL * 4 / blck_size_interleave;
+
+    if (blck_size_interleave == 8) {
+        for (int i = 0; i < end; ++i) {
+            int src_id = i % 8;
+            int src_offset = (i / 8) * blck_size_interleave;
+            int dst_offset = i * blck_size_interleave;
+
+            memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint64_t));
+        }
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    return out;
+}
+
+static int repack_iq4_nl_to_iq4_nl_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
+    GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL);
+    GGML_ASSERT(interleave_block == 8);
+
+    const block_iq4_nl * src = (const block_iq4_nl *)data;
+    block_iq4_nlx8 * dst = ( block_iq4_nlx8 *)t->data;
+
+    block_iq4_nl dst_tmp[8];
+
+    int nrow = ggml_nrows(t);
+    int nrows_interleaved = 8;
+    int nblocks = t->ne[0] / QK4_NL;
+
+    GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl));
+
+    if (t->ne[1] % nrows_interleaved != 0) {
+        return -1;
+    }
+
+    for (int b = 0; b < nrow; b += nrows_interleaved) {
+        for (int64_t x = 0; x < nblocks; x++) {
+            for (int i = 0; i < nrows_interleaved; i++) {
+                dst_tmp[i] = src[x + i * nblocks];
+            }
+            *dst++ = make_block_iq4_nlx8(dst_tmp, interleave_block);
+        }
+        src += nrows_interleaved * nblocks;
+    }
+    return 0;
+
+    GGML_UNUSED(data_size);
+}
+
 namespace ggml::cpu::repack {
 // repack
 template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS>
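make_block_iq4_nlx8 above gathers the eight source deltas into d[0..7] and weaves the quant bytes together eight bytes (one uint64_t) at a time, so the packed qs stream is row 0 bytes 0..7, row 1 bytes 0..7, ..., row 7 bytes 0..7, then row 0 bytes 8..15, and so on. A standalone sketch that only prints that mapping (illustrative; it assumes ggml's QK4_NL of 32, so each source block has 16 quant bytes):

    #include <cstdio>

    // Reproduce the qs byte mapping performed by make_block_iq4_nlx8 for
    // blck_size_interleave == 8: 16 copies of 8 bytes, cycling through the
    // 8 source blocks before moving to their second half.
    int main() {
        const int QK4_NL = 32;             // assumption: matches ggml's QK4_NL
        const int end    = QK4_NL * 4 / 8; // 16 copies of 8 bytes
        for (int i = 0; i < end; ++i) {
            int src_id     = i % 8;        // which of the 8 source blocks
            int src_offset = (i / 8) * 8;  // byte offset inside that block
            int dst_offset = i * 8;        // byte offset inside the x8 block
            printf("qs[%3d..%3d] <- block %d bytes [%2d..%2d]\n",
                   dst_offset, dst_offset + 7, src_id, src_offset, src_offset + 7);
        }
        return 0;
    }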
@@ -1350,6 +1481,10 @@ template <> int repack<block_iq4_nl, 4, 4>(struct ggml_tensor * t, const void *
 //    return repack_iq4_nl_to_iq4_nl_4_bl(t, 8, data, data_size);
 //}

+template <> int repack<block_iq4_nl, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
+    return repack_iq4_nl_to_iq4_nl_8_bl(t, 8, data, data_size);
+}
+
 // gemv
 template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
 void gemv(int, float *, size_t, const void *, const void *, int, int);
@@ -1378,6 +1513,10 @@ template <> void gemv<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size
     ggml_gemv_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
 }

+template <> void gemv<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+    ggml_gemv_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
+}
+
 // gemm
 template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
 void gemm(int, float *, size_t, const void *, const void *, int, int);
@@ -1406,6 +1545,10 @@ template <> void gemm<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size
     ggml_gemm_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
 }

+template <> void gemm<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+    ggml_gemm_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
+}
+
 class tensor_traits_base : public ggml::cpu::tensor_traits {
   public:
     virtual int repack(struct ggml_tensor * t, const void * data, size_t data_size) = 0;
@@ -1680,6 +1823,7 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons

     // instance for IQ4
     static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0> iq4_nl_4x4_q8_0;
+    static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0> iq4_nl_8x8_q8_0;

     if (cur->type == GGML_TYPE_Q4_0) {
         if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {
@@ -1710,6 +1854,11 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
             }
         }
     } else if (cur->type == GGML_TYPE_IQ4_NL) {
+        if (ggml_cpu_has_avx2()) {
+            if (cur->ne[1] % 8 == 0) {
+                return &iq4_nl_8x8_q8_0;
+            }
+        }
         if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
             if (cur->ne[1] % 4 == 0) {
                 return &iq4_nl_4x4_q8_0;
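ggml_repack_get_optimal_repack_type above now prefers the new AVX2 8x8 layout for IQ4_NL tensors whose ne[1] is a multiple of 8, and otherwise keeps the existing NEON/dotprod 4x4 path. A condensed sketch of that decision order (illustrative only; the enum and helper below are not from the package):

    #include <cstdio>

    // Restate the IQ4_NL branch added above as a small pure function.
    enum class iq4_nl_repack { none, x4_neon_dotprod, x8_avx2 };

    static iq4_nl_repack pick_iq4_nl_repack(bool has_avx2, bool has_neon_dotprod, long long ne1) {
        if (has_avx2 && ne1 % 8 == 0)         return iq4_nl_repack::x8_avx2;         // new 8x8 path
        if (has_neon_dotprod && ne1 % 4 == 0) return iq4_nl_repack::x4_neon_dotprod; // existing 4x4 path
        return iq4_nl_repack::none;                                                  // leave tensor unrepacked
    }

    int main() {
        printf("%d\n", (int) pick_iq4_nl_repack(true,  false, 4096)); // 2 -> x8_avx2
        printf("%d\n", (int) pick_iq4_nl_repack(false, true,  4096)); // 1 -> x4_neon_dotprod
        return 0;
    }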
@@ -67,6 +67,13 @@ struct block_iq4_nlx4 {

 static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wrong iq4_nlx4 block size/padding");

+struct block_iq4_nlx8 {
+    ggml_half d[8];           // deltas for 8 iq4_nl blocks
+    uint8_t   qs[QK4_NL * 4]; // nibbles / quants for 8 iq4_nl blocks
+};
+
+static_assert(sizeof(block_iq4_nlx8) == 8 * sizeof(ggml_half) + QK4_NL * 4, "wrong iq4_nlx8 block size/padding");
+
 #if defined(__cplusplus)
 extern "C" {
 #endif
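The static_assert above pins the new x8 block at 8 * sizeof(ggml_half) + QK4_NL * 4 bytes, i.e. 16 bytes of deltas plus 128 bytes of packed nibbles = 144 bytes with no padding. A mirrored layout that makes the arithmetic concrete (illustrative only; it assumes ggml_half is a 16-bit type and QK4_NL == 32):

    #include <cstdint>
    #include <cstdio>

    // Stand-in for block_iq4_nlx8 with the same field sizes, so the size
    // arithmetic from the static_assert can be checked in isolation.
    struct block_iq4_nlx8_mirror {
        uint16_t d[8];       // stands in for ggml_half d[8]
        uint8_t  qs[32 * 4]; // stands in for uint8_t qs[QK4_NL * 4]
    };

    static_assert(sizeof(block_iq4_nlx8_mirror) == 8 * 2 + 32 * 4, "expected a 144-byte block");

    int main() {
        printf("sizeof(block_iq4_nlx8_mirror) = %zu\n", sizeof(block_iq4_nlx8_mirror)); // 144
        return 0;
    }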
@@ -80,12 +87,14 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
 void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemv_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemm_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);

 // Native implementations
 void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
@@ -97,12 +106,14 @@ void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
 void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);

 #if defined(__cplusplus)
 } // extern "C"
@@ -10,7 +10,7 @@ extra_buffer_type::~extra_buffer_type() {}
 } // namespace ggml::cpu

 bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) {
-    for (auto extra :
+    for (auto extra : ggml_backend_cpu_get_extra_buffer_types()) {
         if (extra && extra->context) {
             auto buf_extra = (ggml::cpu::extra_buffer_type *) extra->context;
             auto tensor_traits = buf_extra->get_tensor_traits(op);
@@ -23,7 +23,7 @@ bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct
 }

 bool ggml_cpu_extra_work_size(int n_threads, const struct ggml_tensor * op, size_t * size) {
-    for (auto extra :
+    for (auto extra : ggml_backend_cpu_get_extra_buffer_types()) {
         if (extra && extra->context) {
             auto buf_extra = (ggml::cpu::extra_buffer_type *) extra->context;
             auto tensor_traits = buf_extra->get_tensor_traits(op);
@@ -33,6 +33,6 @@ class extra_buffer_type {
 } // namespace ggml::cpu

 // implemented in ggml-cpu.cpp.
-std::vector<ggml_backend_buffer_type_t> &
+std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffer_types();

 #endif
@@ -55,7 +55,22 @@ inline static void ggml_vec_cpy_i32(const int n, int32_t * y, const int32_t * x)

 inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const ggml_fp16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
 inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
-
+
+inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) {
+    int i = 0;
+#if defined(__AVX2__)
+    for (; i + 7 < n; i += 8) {
+        __m256 vx = _mm256_loadu_ps(x + i);
+        __m256 vy = _mm256_loadu_ps(y + i);
+        __m256 vz = _mm256_add_ps(vx, vy);
+        _mm256_storeu_ps(z + i, vz);
+    }
+#endif
+    for (; i < n; ++i) {
+        z[i] = x[i] + y[i];
+    }
+}
+
 inline static void ggml_vec_add_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
     for (int i = 0; i < n; ++i) {
         z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) + GGML_CPU_FP16_TO_FP32(y[i]));
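The new ggml_vec_add_f32 above follows the usual pattern of an unaligned 8-wide AVX2 main loop plus a scalar tail, so it handles any n and any pointer alignment. A tiny standalone check of the trip counts that pattern produces (illustrative only, plain scalars instead of intrinsics):

    #include <cstdio>

    // Count how many 8-wide iterations and scalar tail elements the loop
    // structure of ggml_vec_add_f32 would execute for a given n.
    static void vec_add_f32_shape(int n) {
        int i = 0;
        int vec_iters = 0, tail_iters = 0;
        for (; i + 7 < n; i += 8) vec_iters++;  // one _mm256_add_ps per iteration
        for (; i < n; ++i)        tail_iters++; // scalar remainder
        printf("n=%d -> %d vector iterations, %d scalar tail elements\n",
               n, vec_iters, tail_iters);
    }

    int main() {
        vec_add_f32_shape(13); // 1 vector iteration, 5 tail elements
        vec_add_f32_shape(32); // 4 vector iterations, 0 tail elements
        return 0;
    }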
@@ -992,9 +1007,9 @@ void ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float *

 inline static void ggml_vec_swiglu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
     for (int i = 0; i < n; ++i) {
-        float
-        float
-        y[i] = GGML_CPU_FP32_TO_FP16((
+        float xi = GGML_CPU_FP16_TO_FP32(x[i]);
+        float gi = GGML_CPU_FP16_TO_FP32(g[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16((xi/(1.0f + expf(-xi))) * gi);
     }
 }

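The reworked ggml_vec_swiglu_f16 above converts each fp16 input to float once and computes y = silu(x) * g per element, with silu(x) = x / (1 + exp(-x)). A plain-float reference of the same formula (illustrative only, not from the package):

    #include <cmath>
    #include <cstdio>

    // Per-element swiglu as computed by the fixed fp16 kernel, in plain float.
    static float swiglu_ref(float x, float g) {
        return (x / (1.0f + expf(-x))) * g;
    }

    int main() {
        printf("%f\n", swiglu_ref(0.0f, 2.0f)); // 0.000000  (silu(0) == 0)
        printf("%f\n", swiglu_ref(1.0f, 1.0f)); // ~0.731059 (sigmoid(1) * 1 * 1)
        return 0;
    }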
@@ -152,6 +152,7 @@ extern "C" {
         //LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // removed from gguf files, use Q4_0 and runtime repack
         LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38, // except 1d tensors

         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
@@ -869,6 +870,29 @@ extern "C" {
             size_t n_token_capacity,
             size_t * n_token_count_out);

+#define LLAMA_STATE_SEQ_FLAGS_SWA_ONLY 1
+
+    typedef uint32_t llama_state_seq_flags;
+
+    LLAMA_API size_t llama_state_seq_get_size_ext(
+            struct llama_context * ctx,
+            llama_seq_id seq_id,
+            llama_state_seq_flags flags);
+
+    LLAMA_API size_t llama_state_seq_get_data_ext(
+            struct llama_context * ctx,
+            uint8_t * dst,
+            size_t size,
+            llama_seq_id seq_id,
+            llama_state_seq_flags flags);
+
+    LLAMA_API size_t llama_state_seq_set_data_ext(
+            struct llama_context * ctx,
+            const uint8_t * src,
+            size_t size,
+            llama_seq_id dest_seq_id,
+            llama_state_seq_flags flags);
+
     //
     // Decoding
     //
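The three *_ext entry points above extend the existing sequence-state API with a flags argument; LLAMA_STATE_SEQ_FLAGS_SWA_ONLY is the only flag defined in this header. A minimal sketch of how they might be called to snapshot and restore one sequence (illustrative; llama_context creation and error handling are omitted, and the helper names are not part of the library):

    #include <cstdint>
    #include <vector>
    #include "llama.h"

    // Size, serialize, and later restore the state of a single sequence.
    // Passing 0 for the flags argument requests the default behaviour;
    // LLAMA_STATE_SEQ_FLAGS_SWA_ONLY can be passed instead (see the header above).
    static std::vector<uint8_t> snapshot_seq(llama_context * ctx, llama_seq_id seq) {
        const llama_state_seq_flags flags = 0;
        std::vector<uint8_t> buf(llama_state_seq_get_size_ext(ctx, seq, flags));
        llama_state_seq_get_data_ext(ctx, buf.data(), buf.size(), seq, flags);
        return buf;
    }

    static void restore_seq(llama_context * ctx, llama_seq_id seq, const std::vector<uint8_t> & buf) {
        llama_state_seq_set_data_ext(ctx, buf.data(), buf.size(), seq, 0);
    }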
@@ -1436,6 +1460,8 @@ extern "C" {

         ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
         void * get_opt_pars_ud;                     // userdata for calculating optimizer parameters
+
+        enum ggml_opt_optimizer_type optimizer_type;
     };

     LLAMA_API void llama_opt_init(struct llama_context * lctx, struct llama_model * model, struct llama_opt_params lopt_params);
@@ -62,6 +62,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_DEEPSEEK2, "deepseek2" },
     { LLM_ARCH_CHATGLM, "chatglm" },
     { LLM_ARCH_GLM4, "glm4" },
+    { LLM_ARCH_GLM4_MOE, "glm4moe" },
     { LLM_ARCH_BITNET, "bitnet" },
     { LLM_ARCH_T5, "t5" },
     { LLM_ARCH_T5ENCODER, "t5encoder" },
@@ -87,6 +88,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_HUNYUAN_MOE, "hunyuan-moe" },
     { LLM_ARCH_HUNYUAN_DENSE, "hunyuan-dense" },
     { LLM_ARCH_SMOLLM3, "smollm3" },
+    { LLM_ARCH_OPENAI_MOE, "gpt-oss" },
     { LLM_ARCH_LFM2, "lfm2" },
     { LLM_ARCH_DREAM, "dream" },
     { LLM_ARCH_SMALLTHINKER, "smallthinker" },
@@ -127,6 +129,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_EXPERT_WEIGHTS_NORM, "%s.expert_weights_norm" },
     { LLM_KV_EXPERT_GATING_FUNC, "%s.expert_gating_func" },
     { LLM_KV_MOE_EVERY_N_LAYERS, "%s.moe_every_n_layers" },
+    { LLM_KV_NEXTN_PREDICT_LAYERS, "%s.nextn_predict_layers" },
     { LLM_KV_POOLING_TYPE, "%s.pooling_type" },
     { LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
     { LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
@@ -1391,6 +1394,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
         },
     },
+    {
+        LLM_ARCH_GLM4_MOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
+            { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
+            { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
+            { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
+            // NextN/MTP tensors - preserved but unused (in final layer, dynamic layer number)
+            { LLM_TENSOR_NEXTN_EH_PROJ, "blk.%d.nextn.eh_proj" },
+            { LLM_TENSOR_NEXTN_EMBED_TOKENS, "blk.%d.nextn.embed_tokens" },
+            { LLM_TENSOR_NEXTN_ENORM, "blk.%d.nextn.enorm" },
+            { LLM_TENSOR_NEXTN_HNORM, "blk.%d.nextn.hnorm" },
+            { LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "blk.%d.nextn.shared_head_head" },
+            { LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "blk.%d.nextn.shared_head_norm" },
+        },
+    },
     {
         LLM_ARCH_BITNET,
         {
@@ -1935,6 +1972,25 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_OPENAI_MOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_SINKS, "blk.%d.attn_sinks" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+        },
+    },
     {
         LLM_ARCH_LFM2,
         {
@@ -2050,6 +2106,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_ATTN_KV_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_ATTN_K_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_ATTN_V_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_SINKS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SCALE}},
     {LLM_TENSOR_DEC_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_DEC_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_DEC_ATTN_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
@@ -2181,6 +2238,14 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_SHORTCONV_CONV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_CONV}},
     {LLM_TENSOR_SHORTCONV_INPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_SHORTCONV_OUTPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    // NextN/MTP tensors are currently ignored (reserved for future MTP support)
+    // These tensors only exist in the last layer(s) and are treated as output tensors
+    {LLM_TENSOR_NEXTN_EH_PROJ, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_NEXTN_EMBED_TOKENS, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}},
+    {LLM_TENSOR_NEXTN_ENORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}},
+    {LLM_TENSOR_NEXTN_HNORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
+    {LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
 };

 LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}
@@ -66,6 +66,7 @@ enum llm_arch {
     LLM_ARCH_DEEPSEEK2,
     LLM_ARCH_CHATGLM,
     LLM_ARCH_GLM4,
+    LLM_ARCH_GLM4_MOE,
     LLM_ARCH_BITNET,
     LLM_ARCH_T5,
     LLM_ARCH_T5ENCODER,
@@ -91,6 +92,7 @@ enum llm_arch {
     LLM_ARCH_HUNYUAN_MOE,
     LLM_ARCH_HUNYUAN_DENSE,
     LLM_ARCH_SMOLLM3,
+    LLM_ARCH_OPENAI_MOE,
     LLM_ARCH_LFM2,
     LLM_ARCH_DREAM,
     LLM_ARCH_SMALLTHINKER,
@@ -131,6 +133,7 @@ enum llm_kv {
     LLM_KV_EXPERT_WEIGHTS_NORM,
     LLM_KV_EXPERT_GATING_FUNC,
     LLM_KV_MOE_EVERY_N_LAYERS,
+    LLM_KV_NEXTN_PREDICT_LAYERS,
     LLM_KV_POOLING_TYPE,
     LLM_KV_LOGIT_SCALE,
     LLM_KV_DECODER_START_TOKEN_ID,
@@ -263,6 +266,7 @@ enum llm_tensor {
     LLM_TENSOR_ATTN_OUT_NORM,
     LLM_TENSOR_ATTN_POST_NORM,
     LLM_TENSOR_ATTN_ROT_EMBD,
+    LLM_TENSOR_ATTN_SINKS,
     LLM_TENSOR_FFN_GATE_INP,
     LLM_TENSOR_FFN_GATE_INP_SHEXP,
     LLM_TENSOR_FFN_NORM,
@@ -409,6 +413,12 @@ enum llm_tensor {
     LLM_TENSOR_SHORTCONV_CONV,
     LLM_TENSOR_SHORTCONV_INPROJ,
     LLM_TENSOR_SHORTCONV_OUTPROJ,
+    LLM_TENSOR_NEXTN_EH_PROJ,
+    LLM_TENSOR_NEXTN_EMBED_TOKENS,
+    LLM_TENSOR_NEXTN_ENORM,
+    LLM_TENSOR_NEXTN_HNORM,
+    LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD,
+    LLM_TENSOR_NEXTN_SHARED_HEAD_NORM,
 };

 enum llm_tensor_layer {