@fugood/llama.node 1.1.5 → 1.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. package/lib/binding.ts +4 -0
  2. package/lib/index.js +6 -1
  3. package/lib/index.ts +6 -0
  4. package/lib/version.js +5 -0
  5. package/lib/version.ts +2 -0
  6. package/package.json +14 -14
  7. package/scripts/llama.cpp.patch +19 -15
  8. package/src/LlamaCompletionWorker.cpp +73 -18
  9. package/src/LlamaCompletionWorker.h +8 -0
  10. package/src/llama.cpp/CMakeLists.txt +2 -0
  11. package/src/llama.cpp/common/arg.cpp +147 -46
  12. package/src/llama.cpp/common/chat-parser.cpp +9 -1
  13. package/src/llama.cpp/common/chat.cpp +350 -3
  14. package/src/llama.cpp/common/chat.h +11 -3
  15. package/src/llama.cpp/common/common.cpp +54 -0
  16. package/src/llama.cpp/common/common.h +44 -9
  17. package/src/llama.cpp/ggml/CMakeLists.txt +5 -2
  18. package/src/llama.cpp/ggml/include/ggml-opt.h +25 -6
  19. package/src/llama.cpp/ggml/include/ggml-zdnn.h +16 -0
  20. package/src/llama.cpp/ggml/include/ggml.h +65 -3
  21. package/src/llama.cpp/ggml/src/CMakeLists.txt +13 -1
  22. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +1 -1
  23. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +61 -0
  24. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +96 -8
  25. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +1136 -1077
  26. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +20 -0
  27. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +20 -1
  28. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +21 -24
  29. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +16 -7
  30. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +270 -11
  31. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +3 -8
  32. package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +35 -0
  33. package/src/llama.cpp/ggml/src/ggml-cpu/quants.h +8 -0
  34. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +200 -51
  35. package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +11 -0
  36. package/src/llama.cpp/ggml/src/ggml-cpu/traits.cpp +2 -2
  37. package/src/llama.cpp/ggml/src/ggml-cpu/traits.h +1 -1
  38. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +19 -4
  39. package/src/llama.cpp/include/llama.h +26 -0
  40. package/src/llama.cpp/src/llama-arch.cpp +65 -0
  41. package/src/llama.cpp/src/llama-arch.h +10 -0
  42. package/src/llama.cpp/src/llama-batch.cpp +1 -1
  43. package/src/llama.cpp/src/llama-chat.cpp +15 -4
  44. package/src/llama.cpp/src/llama-chat.h +1 -0
  45. package/src/llama.cpp/src/llama-context.cpp +37 -25
  46. package/src/llama.cpp/src/llama-context.h +6 -5
  47. package/src/llama.cpp/src/llama-graph.cpp +118 -9
  48. package/src/llama.cpp/src/llama-graph.h +38 -0
  49. package/src/llama.cpp/src/llama-hparams.h +5 -3
  50. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +12 -6
  51. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +2 -2
  52. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +93 -69
  53. package/src/llama.cpp/src/llama-kv-cache-unified.h +2 -2
  54. package/src/llama.cpp/src/llama-memory-hybrid.cpp +6 -2
  55. package/src/llama.cpp/src/llama-memory-hybrid.h +2 -2
  56. package/src/llama.cpp/src/llama-memory-recurrent.cpp +6 -2
  57. package/src/llama.cpp/src/llama-memory-recurrent.h +2 -2
  58. package/src/llama.cpp/src/llama-memory.h +2 -2
  59. package/src/llama.cpp/src/llama-model-loader.cpp +1 -0
  60. package/src/llama.cpp/src/llama-model-loader.h +3 -2
  61. package/src/llama.cpp/src/llama-model.cpp +500 -4
  62. package/src/llama.cpp/src/llama-model.h +25 -4
  63. package/src/llama.cpp/src/llama-quant.cpp +37 -1
  64. package/src/llama.cpp/src/llama-vocab.cpp +43 -0
package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp

@@ -206,8 +206,9 @@ void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
     const int ncols_interleaved = 4;
     const int blocklen = 4;

-    assert (n % qk == 0);
-    assert (nc % ncols_interleaved == 0);
+    assert(nr == 1);
+    assert(n % qk == 0);
+    assert(nc % ncols_interleaved == 0);

     UNUSED(s);
     UNUSED(bs);
@@ -307,30 +308,28 @@ void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
     UNUSED(ncols_interleaved);
     UNUSED(blocklen);

-    {
-        float sumf[8];
-        int sumi;
+    float sumf[8];
+    int sumi;

-        const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
-        for (int x = 0; x < nc / ncols_interleaved; x++) {
-            const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
+    const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+    for (int x = 0; x < nc / ncols_interleaved; x++) {
+        const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);

-            for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
-            for (int l = 0; l < nb; l++) {
-                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
-                    for (int j = 0; j < ncols_interleaved; j++) {
-                        sumi = 0;
-                        for (int i = 0; i < blocklen; ++i) {
-                            const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
-                            const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
-                            sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
-                        }
-                        sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
+        for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
+        for (int l = 0; l < nb; l++) {
+            for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                for (int j = 0; j < ncols_interleaved; j++) {
+                    sumi = 0;
+                    for (int i = 0; i < blocklen; ++i) {
+                        const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
+                        const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
+                        sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
                     }
+                    sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
                 }
             }
-            for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
         }
+        for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
     }
 }

@@ -494,43 +493,73 @@ void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
     const int ncols_interleaved = 4;
     const int blocklen = 4;

-    assert (n % qk == 0);
-    assert (nc % ncols_interleaved == 0);
+    assert(nr == 1);
+    assert(n % qk == 0);
+    assert(nc % ncols_interleaved == 0);

-    UNUSED(s);
     UNUSED(bs);
-    UNUSED(vx);
-    UNUSED(vy);
     UNUSED(nr);
-    UNUSED(nc);
-    UNUSED(nb);
-    UNUSED(ncols_interleaved);
-    UNUSED(blocklen);

-    {
-        float sumf[4];
-        int sumi;
+    float sumf[4];
+    int sumi;

-        const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
-        for (int x = 0; x < nc / ncols_interleaved; x++) {
-            const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
+    const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+    for (int x = 0; x < nc / ncols_interleaved; x++) {
+        const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);

-            for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
-            for (int l = 0; l < nb; l++) {
-                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
-                    for (int j = 0; j < ncols_interleaved; j++) {
-                        sumi = 0;
-                        for (int i = 0; i < blocklen; ++i) {
-                            const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
-                            const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
-                            sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
-                        }
-                        sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
+        for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
+        for (int l = 0; l < nb; l++) {
+            for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                for (int j = 0; j < ncols_interleaved; j++) {
+                    sumi = 0;
+                    for (int i = 0; i < blocklen; ++i) {
+                        const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
+                        const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
+                        sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
                     }
+                    sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
                 }
             }
-            for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
         }
+        for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
+    }
+}
+
+void ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+    const int ncols_interleaved = 8;
+    const int blocklen = 8;
+
+    assert(nr == 1);
+    assert(n % qk == 0);
+    assert(nc % ncols_interleaved == 0);
+
+    UNUSED(bs);
+    UNUSED(nr);
+
+    float sumf[8];
+    int sumi;
+
+    const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+    for (int x = 0; x < nc / ncols_interleaved; x++) {
+        const block_iq4_nlx8 * b_ptr = (const block_iq4_nlx8 *) vx + (x * nb);
+
+        for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
+        for (int l = 0; l < nb; l++) {
+            for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                for (int j = 0; j < ncols_interleaved; j++) {
+                    sumi = 0;
+                    for (int i = 0; i < blocklen; ++i) {
+                        const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
+                        const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
+                        sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
+                    }
+                    sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
+                }
+            }
+        }
+        for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
     }
 }

@@ -934,6 +963,50 @@ void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
     }
 }

+void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+    const int ncols_interleaved = 8;
+    const int blocklen = 8;
+
+    assert(n % qk == 0);
+    assert(nr % 4 == 0);
+    assert(nc % ncols_interleaved == 0);
+
+    float sumf[4][8];
+    int sumi;
+
+    for (int y = 0; y < nr / 4; y++) {
+        const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
+        for (int x = 0; x < nc / ncols_interleaved; x++) {
+            const block_iq4_nlx8 * b_ptr = (const block_iq4_nlx8 *) vx + (x * nb);
+            for (int m = 0; m < 4; m++) {
+                for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
+            }
+            for (int l = 0; l < nb; l++) {
+                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                    for (int m = 0; m < 4; m++) {
+                        for (int j = 0; j < ncols_interleaved; j++) {
+                            sumi = 0;
+                            for (int i = 0; i < blocklen; ++i) {
+                                const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
+                                const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
+                                sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
+                                         (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
+                            }
+                            sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
+                        }
+                    }
+                }
+            }
+            for (int m = 0; m < 4; m++) {
+                for (int j = 0; j < ncols_interleaved; j++)
+                    s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
+            }
+        }
+    }
+}
+
 } // extern "C"

 static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
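For context on the new iq4_nl 8x8 kernels above: each byte of a block's qs packs two 4-bit indices into the IQ4_NL codebook, and the generic gemv/gemm loops look the low and high nibble up in kvalues_iq4nl to obtain the two signed weights paired with the first and second half of the q8_0 activations. A minimal standalone sketch of that lookup follows; the codebook values are copied from ggml-common.h for illustration, and decode_iq4_nl_byte is a hypothetical helper, not something added by this diff.

#include <cstdint>
#include <cstdio>

// IQ4_NL codebook, as defined in ggml/src/ggml-common.h (reproduced here for illustration).
static const int8_t kvalues_iq4nl[16] = {
    -127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113,
};

// Hypothetical helper: unpack one packed byte into the two dequantized integer weights
// it encodes (low nibble first, high nibble second), mirroring the v0/v1 lookups in
// ggml_gemv_iq4_nl_8x8_q8_0_generic / ggml_gemm_iq4_nl_8x8_q8_0_generic above.
static void decode_iq4_nl_byte(uint8_t packed, int & v0, int & v1) {
    v0 = kvalues_iq4nl[packed & 0x0F];
    v1 = kvalues_iq4nl[packed >> 4];
}

int main() {
    int v0, v1;
    decode_iq4_nl_byte(0x3A /* high nibble 0x3, low nibble 0xA */, v0, v1);
    std::printf("v0 = %d, v1 = %d\n", v0, v1); // v0 = 25 (index 10), v1 = -65 (index 3)
    return 0;
}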
@@ -1285,15 +1358,16 @@ static block_iq4_nlx4 make_block_iq4_nlx4(block_iq4_nl * in, unsigned int blck_s

 static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
     GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL);
-    //GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
     GGML_ASSERT(interleave_block == 4);

-    block_iq4_nlx4 * dst = (block_iq4_nlx4 *)t->data;
-    const block_iq4_nl * src = (const block_iq4_nl *)data;
+    const block_iq4_nl * src = (const block_iq4_nl *)data;
+    block_iq4_nlx4 * dst = ( block_iq4_nlx4 *)t->data;
+
     block_iq4_nl dst_tmp[4];
+
     int nrow = ggml_nrows(t);
     int nrows_interleaved = 4;
-    int nblocks = t->ne[0] / QK4_0;
+    int nblocks = t->ne[0] / QK4_NL;

     GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl));

@@ -1315,6 +1389,63 @@ static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_b
     GGML_UNUSED(data_size);
 }

+static block_iq4_nlx8 make_block_iq4_nlx8(block_iq4_nl * in, unsigned int blck_size_interleave) {
+    block_iq4_nlx8 out;
+
+    for (int i = 0; i < 8; i++) {
+        out.d[i] = in[i].d;
+    }
+
+    const int end = QK4_NL * 4 / blck_size_interleave;
+
+    if (blck_size_interleave == 8) {
+        for (int i = 0; i < end; ++i) {
+            int src_id = i % 8;
+            int src_offset = (i / 8) * blck_size_interleave;
+            int dst_offset = i * blck_size_interleave;
+
+            memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint64_t));
+        }
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    return out;
+}
+
+static int repack_iq4_nl_to_iq4_nl_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
+    GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL);
+    GGML_ASSERT(interleave_block == 8);
+
+    const block_iq4_nl * src = (const block_iq4_nl *)data;
+    block_iq4_nlx8 * dst = ( block_iq4_nlx8 *)t->data;
+
+    block_iq4_nl dst_tmp[8];
+
+    int nrow = ggml_nrows(t);
+    int nrows_interleaved = 8;
+    int nblocks = t->ne[0] / QK4_NL;
+
+    GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl));
+
+    if (t->ne[1] % nrows_interleaved != 0) {
+        return -1;
+    }
+
+    for (int b = 0; b < nrow; b += nrows_interleaved) {
+        for (int64_t x = 0; x < nblocks; x++) {
+            for (int i = 0; i < nrows_interleaved; i++) {
+                dst_tmp[i] = src[x + i * nblocks];
+            }
+            *dst++ = make_block_iq4_nlx8(dst_tmp, interleave_block);
+        }
+        src += nrows_interleaved * nblocks;
+    }
+    return 0;
+
+    GGML_UNUSED(data_size);
+}
+
 namespace ggml::cpu::repack {
 // repack
 template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS>
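The repack step above gathers 8 consecutive rows: make_block_iq4_nlx8 copies the 8 per-block scales into d[8] and then weaves the eight 16-byte qs arrays together in 8-byte chunks (src_id = i % 8). A small standalone sketch that only prints this index mapping, assuming QK4_NL = 32; it is an illustration, not code from the diff.

#include <cstdio>

// Sketch of the index mapping used by make_block_iq4_nlx8 above (QK4_NL = 32, so each
// source block carries 16 packed bytes and one block_iq4_nlx8 carries 8 * 16 = 128 bytes).
int main() {
    const int QK4_NL = 32;
    const int blck_size_interleave = 8;
    const int end = QK4_NL * 4 / blck_size_interleave; // 16 chunks of 8 bytes

    for (int i = 0; i < end; ++i) {
        int src_id     = i % 8;                          // which of the 8 source rows
        int src_offset = (i / 8) * blck_size_interleave; // byte offset inside that row's block
        int dst_offset = i * blck_size_interleave;       // byte offset inside out.qs
        std::printf("out.qs[%3d..%3d] <- row %d, qs[%2d..%2d]\n",
                    dst_offset, dst_offset + 7, src_id, src_offset, src_offset + 7);
    }
    return 0;
}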
@@ -1350,6 +1481,10 @@ template <> int repack<block_iq4_nl, 4, 4>(struct ggml_tensor * t, const void *
 //    return repack_iq4_nl_to_iq4_nl_4_bl(t, 8, data, data_size);
 //}

+template <> int repack<block_iq4_nl, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
+    return repack_iq4_nl_to_iq4_nl_8_bl(t, 8, data, data_size);
+}
+
 // gemv
 template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
 void gemv(int, float *, size_t, const void *, const void *, int, int);
@@ -1378,6 +1513,10 @@ template <> void gemv<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size
     ggml_gemv_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
 }

+template <> void gemv<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+    ggml_gemv_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
+}
+
 // gemm
 template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
 void gemm(int, float *, size_t, const void *, const void *, int, int);
@@ -1406,6 +1545,10 @@ template <> void gemm<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size
     ggml_gemm_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
 }

+template <> void gemm<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+    ggml_gemm_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
+}
+
 class tensor_traits_base : public ggml::cpu::tensor_traits {
   public:
     virtual int repack(struct ggml_tensor * t, const void * data, size_t data_size) = 0;
@@ -1680,6 +1823,7 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons

     // instance for IQ4
     static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0> iq4_nl_4x4_q8_0;
+    static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0> iq4_nl_8x8_q8_0;

     if (cur->type == GGML_TYPE_Q4_0) {
         if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {
@@ -1710,6 +1854,11 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
             }
         }
     } else if (cur->type == GGML_TYPE_IQ4_NL) {
+        if (ggml_cpu_has_avx2()) {
+            if (cur->ne[1] % 8 == 0) {
+                return &iq4_nl_8x8_q8_0;
+            }
+        }
         if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
             if (cur->ne[1] % 4 == 0) {
                 return &iq4_nl_4x4_q8_0;
package/src/llama.cpp/ggml/src/ggml-cpu/repack.h

@@ -67,6 +67,13 @@ struct block_iq4_nlx4 {

 static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wrong iq4_nlx4 block size/padding");

+struct block_iq4_nlx8 {
+    ggml_half d[8];           // deltas for 8 iq4_nl blocks
+    uint8_t   qs[QK4_NL * 4]; // nibbles / quants for 8 iq4_nl blocks
+};
+
+static_assert(sizeof(block_iq4_nlx8) == 8 * sizeof(ggml_half) + QK4_NL * 4, "wrong iq4_nlx8 block size/padding");
+
 #if defined(__cplusplus)
 extern "C" {
 #endif
@@ -80,12 +87,14 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
 void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemv_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemm_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);

 // Native implementations
 void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
@@ -97,12 +106,14 @@ void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
 void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);

 #if defined(__cplusplus)
 } // extern "C"
package/src/llama.cpp/ggml/src/ggml-cpu/traits.cpp

@@ -10,7 +10,7 @@ extra_buffer_type::~extra_buffer_type() {}
 } // namespace ggml::cpu

 bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) {
-    for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
+    for (auto extra : ggml_backend_cpu_get_extra_buffer_types()) {
         if (extra && extra->context) {
             auto buf_extra = (ggml::cpu::extra_buffer_type *) extra->context;
             auto tensor_traits = buf_extra->get_tensor_traits(op);
@@ -23,7 +23,7 @@ bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct
 }

 bool ggml_cpu_extra_work_size(int n_threads, const struct ggml_tensor * op, size_t * size) {
-    for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
+    for (auto extra : ggml_backend_cpu_get_extra_buffer_types()) {
         if (extra && extra->context) {
             auto buf_extra = (ggml::cpu::extra_buffer_type *) extra->context;
             auto tensor_traits = buf_extra->get_tensor_traits(op);
package/src/llama.cpp/ggml/src/ggml-cpu/traits.h

@@ -33,6 +33,6 @@ class extra_buffer_type {
 } // namespace ggml::cpu

 // implemented in ggml-cpu.cpp.
-std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffers_type();
+std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffer_types();

 #endif
package/src/llama.cpp/ggml/src/ggml-cpu/vec.h

@@ -55,7 +55,22 @@ inline static void ggml_vec_cpy_i32(const int n, int32_t * y, const int32_t * x)

 inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const ggml_fp16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
 inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
-inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; }
+
+inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) {
+    int i = 0;
+#if defined(__AVX2__)
+    for (; i + 7 < n; i += 8) {
+        __m256 vx = _mm256_loadu_ps(x + i);
+        __m256 vy = _mm256_loadu_ps(y + i);
+        __m256 vz = _mm256_add_ps(vx, vy);
+        _mm256_storeu_ps(z + i, vz);
+    }
+#endif
+    for (; i < n; ++i) {
+        z[i] = x[i] + y[i];
+    }
+}
+
 inline static void ggml_vec_add_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
     for (int i = 0; i < n; ++i) {
         z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) + GGML_CPU_FP16_TO_FP32(y[i]));
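The rewritten ggml_vec_add_f32 above keeps the plain scalar loop as a tail, so builds without AVX2 and element counts that are not a multiple of 8 still produce the same result. Below is a self-contained sketch of the same loop structure, using a stand-in function name rather than the ggml one.

#if defined(__AVX2__)
#include <immintrin.h>
#endif
#include <cassert>
#include <cstdio>

// Stand-in for illustration: AVX2 main loop over 8 floats at a time plus a scalar tail.
static void vec_add_f32_demo(int n, float * z, const float * x, const float * y) {
    int i = 0;
#if defined(__AVX2__)
    for (; i + 7 < n; i += 8) {
        _mm256_storeu_ps(z + i, _mm256_add_ps(_mm256_loadu_ps(x + i), _mm256_loadu_ps(y + i)));
    }
#endif
    for (; i < n; ++i) {
        z[i] = x[i] + y[i]; // scalar tail handles the n % 8 leftovers (and non-AVX2 builds)
    }
}

int main() {
    const int n = 13; // deliberately not a multiple of 8 to exercise the tail
    float x[n], y[n], z[n];
    for (int i = 0; i < n; ++i) { x[i] = 1.0f * i; y[i] = 0.5f * i; }
    vec_add_f32_demo(n, z, x, y);
    for (int i = 0; i < n; ++i) { assert(z[i] == x[i] + y[i]); }
    std::printf("ok\n");
    return 0;
}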
@@ -992,9 +1007,9 @@ void ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float *

 inline static void ggml_vec_swiglu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
     for (int i = 0; i < n; ++i) {
-        float v = GGML_CPU_FP16_TO_FP32(x[i]);
-        float w = GGML_CPU_FP16_TO_FP32(g[i]);
-        y[i] = GGML_CPU_FP32_TO_FP16((v/(1.0f + expf(-v))) * w);
+        float xi = GGML_CPU_FP16_TO_FP32(x[i]);
+        float gi = GGML_CPU_FP16_TO_FP32(g[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16((xi/(1.0f + expf(-xi))) * gi);
     }
 }

package/src/llama.cpp/include/llama.h

@@ -152,6 +152,7 @@ extern "C" {
         //LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // removed from gguf files, use Q4_0 and runtime repack
         LLAMA_FTYPE_MOSTLY_TQ1_0     = 36, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_TQ2_0     = 37, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38, // except 1d tensors

         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
@@ -869,6 +870,29 @@ extern "C" {
             size_t   n_token_capacity,
             size_t * n_token_count_out);

+#define LLAMA_STATE_SEQ_FLAGS_SWA_ONLY 1
+
+    typedef uint32_t llama_state_seq_flags;
+
+    LLAMA_API size_t llama_state_seq_get_size_ext(
+            struct llama_context * ctx,
+            llama_seq_id           seq_id,
+            llama_state_seq_flags  flags);
+
+    LLAMA_API size_t llama_state_seq_get_data_ext(
+            struct llama_context * ctx,
+            uint8_t              * dst,
+            size_t                 size,
+            llama_seq_id           seq_id,
+            llama_state_seq_flags  flags);
+
+    LLAMA_API size_t llama_state_seq_set_data_ext(
+            struct llama_context * ctx,
+            const uint8_t        * src,
+            size_t                 size,
+            llama_seq_id           dest_seq_id,
+            llama_state_seq_flags  flags);
+
     //
     // Decoding
     //
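The new *_ext entry points mirror the existing llama_state_seq_get_size / get_data / set_data calls but take a llama_state_seq_flags argument; passing LLAMA_STATE_SEQ_FLAGS_SWA_ONLY restricts the serialized state to the SWA (sliding-window attention) portion of the sequence. A hedged usage sketch follows, assuming an already-initialized llama_context; the helper names are purely illustrative and not part of the diff.

#include <cstdint>
#include <vector>
#include "llama.h"

// Snapshot the SWA-only state of one sequence into a byte buffer.
static std::vector<uint8_t> snapshot_seq_swa(llama_context * ctx, llama_seq_id seq_id) {
    const llama_state_seq_flags flags = LLAMA_STATE_SEQ_FLAGS_SWA_ONLY;

    std::vector<uint8_t> buf(llama_state_seq_get_size_ext(ctx, seq_id, flags));
    const size_t written = llama_state_seq_get_data_ext(ctx, buf.data(), buf.size(), seq_id, flags);
    buf.resize(written);
    return buf;
}

// Restore a previously captured SWA-only snapshot into a destination sequence.
static bool restore_seq_swa(llama_context * ctx, llama_seq_id dest_seq_id, const std::vector<uint8_t> & buf) {
    const llama_state_seq_flags flags = LLAMA_STATE_SEQ_FLAGS_SWA_ONLY;

    const size_t read = llama_state_seq_set_data_ext(ctx, buf.data(), buf.size(), dest_seq_id, flags);
    // the non-_ext setter returns 0 on failure; assuming the same contract here
    return read > 0;
}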
@@ -1436,6 +1460,8 @@ extern "C" {
 
     ggml_opt_get_optimizer_params get_opt_pars;    // callback for calculating optimizer parameters
     void *                        get_opt_pars_ud; // userdata for calculating optimizer parameters
+
+    enum ggml_opt_optimizer_type optimizer_type;
 };

 LLAMA_API void llama_opt_init(struct llama_context * lctx, struct llama_model * model, struct llama_opt_params lopt_params);
package/src/llama.cpp/src/llama-arch.cpp

@@ -62,6 +62,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_DEEPSEEK2, "deepseek2" },
     { LLM_ARCH_CHATGLM, "chatglm" },
     { LLM_ARCH_GLM4, "glm4" },
+    { LLM_ARCH_GLM4_MOE, "glm4moe" },
     { LLM_ARCH_BITNET, "bitnet" },
     { LLM_ARCH_T5, "t5" },
     { LLM_ARCH_T5ENCODER, "t5encoder" },
@@ -87,6 +88,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_HUNYUAN_MOE, "hunyuan-moe" },
     { LLM_ARCH_HUNYUAN_DENSE, "hunyuan-dense" },
     { LLM_ARCH_SMOLLM3, "smollm3" },
+    { LLM_ARCH_OPENAI_MOE, "gpt-oss" },
     { LLM_ARCH_LFM2, "lfm2" },
     { LLM_ARCH_DREAM, "dream" },
     { LLM_ARCH_SMALLTHINKER, "smallthinker" },
@@ -127,6 +129,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_EXPERT_WEIGHTS_NORM, "%s.expert_weights_norm" },
     { LLM_KV_EXPERT_GATING_FUNC, "%s.expert_gating_func" },
     { LLM_KV_MOE_EVERY_N_LAYERS, "%s.moe_every_n_layers" },
+    { LLM_KV_NEXTN_PREDICT_LAYERS, "%s.nextn_predict_layers" },
     { LLM_KV_POOLING_TYPE, "%s.pooling_type" },
     { LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
     { LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
@@ -1391,6 +1394,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
         { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
         },
     },
+    {
+        LLM_ARCH_GLM4_MOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
+            { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
+            { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
+            { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
+            // NextN/MTP tensors - preserved but unused (in final layer, dynamic layer number)
+            { LLM_TENSOR_NEXTN_EH_PROJ, "blk.%d.nextn.eh_proj" },
+            { LLM_TENSOR_NEXTN_EMBED_TOKENS, "blk.%d.nextn.embed_tokens" },
+            { LLM_TENSOR_NEXTN_ENORM, "blk.%d.nextn.enorm" },
+            { LLM_TENSOR_NEXTN_HNORM, "blk.%d.nextn.hnorm" },
+            { LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "blk.%d.nextn.shared_head_head" },
+            { LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "blk.%d.nextn.shared_head_norm" },
+        },
+    },
     {
         LLM_ARCH_BITNET,
         {
@@ -1935,6 +1972,25 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
         { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_OPENAI_MOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_SINKS, "blk.%d.attn_sinks" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+        },
+    },
     {
         LLM_ARCH_LFM2,
         {
@@ -2050,6 +2106,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_ATTN_KV_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_ATTN_K_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_ATTN_V_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_SINKS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SCALE}},
     {LLM_TENSOR_DEC_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_DEC_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_DEC_ATTN_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
@@ -2181,6 +2238,14 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_SHORTCONV_CONV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_CONV}},
     {LLM_TENSOR_SHORTCONV_INPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_SHORTCONV_OUTPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    // NextN/MTP tensors are currently ignored (reserved for future MTP support)
+    // These tensors only exist in the last layer(s) and are treated as output tensors
+    {LLM_TENSOR_NEXTN_EH_PROJ, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_NEXTN_EMBED_TOKENS, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}},
+    {LLM_TENSOR_NEXTN_ENORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}},
+    {LLM_TENSOR_NEXTN_HNORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
+    {LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
 };

 LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}
package/src/llama.cpp/src/llama-arch.h

@@ -66,6 +66,7 @@ enum llm_arch {
     LLM_ARCH_DEEPSEEK2,
     LLM_ARCH_CHATGLM,
     LLM_ARCH_GLM4,
+    LLM_ARCH_GLM4_MOE,
     LLM_ARCH_BITNET,
     LLM_ARCH_T5,
     LLM_ARCH_T5ENCODER,
@@ -91,6 +92,7 @@ enum llm_arch {
     LLM_ARCH_HUNYUAN_MOE,
     LLM_ARCH_HUNYUAN_DENSE,
     LLM_ARCH_SMOLLM3,
+    LLM_ARCH_OPENAI_MOE,
     LLM_ARCH_LFM2,
     LLM_ARCH_DREAM,
     LLM_ARCH_SMALLTHINKER,
@@ -131,6 +133,7 @@ enum llm_kv {
     LLM_KV_EXPERT_WEIGHTS_NORM,
     LLM_KV_EXPERT_GATING_FUNC,
     LLM_KV_MOE_EVERY_N_LAYERS,
+    LLM_KV_NEXTN_PREDICT_LAYERS,
     LLM_KV_POOLING_TYPE,
     LLM_KV_LOGIT_SCALE,
     LLM_KV_DECODER_START_TOKEN_ID,
@@ -263,6 +266,7 @@ enum llm_tensor {
     LLM_TENSOR_ATTN_OUT_NORM,
     LLM_TENSOR_ATTN_POST_NORM,
     LLM_TENSOR_ATTN_ROT_EMBD,
+    LLM_TENSOR_ATTN_SINKS,
     LLM_TENSOR_FFN_GATE_INP,
     LLM_TENSOR_FFN_GATE_INP_SHEXP,
     LLM_TENSOR_FFN_NORM,
@@ -409,6 +413,12 @@ enum llm_tensor {
     LLM_TENSOR_SHORTCONV_CONV,
     LLM_TENSOR_SHORTCONV_INPROJ,
     LLM_TENSOR_SHORTCONV_OUTPROJ,
+    LLM_TENSOR_NEXTN_EH_PROJ,
+    LLM_TENSOR_NEXTN_EMBED_TOKENS,
+    LLM_TENSOR_NEXTN_ENORM,
+    LLM_TENSOR_NEXTN_HNORM,
+    LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD,
+    LLM_TENSOR_NEXTN_SHARED_HEAD_NORM,
 };

 enum llm_tensor_layer {