@fugood/llama.node 1.1.4 → 1.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. package/lib/binding.ts +8 -0
  2. package/package.json +14 -14
  3. package/scripts/llama.cpp.patch +17 -13
  4. package/src/LlamaCompletionWorker.cpp +2 -0
  5. package/src/LlamaContext.cpp +3 -0
  6. package/src/llama.cpp/common/arg.cpp +80 -10
  7. package/src/llama.cpp/common/chat.cpp +52 -8
  8. package/src/llama.cpp/common/chat.h +7 -2
  9. package/src/llama.cpp/common/common.cpp +1 -0
  10. package/src/llama.cpp/common/common.h +16 -6
  11. package/src/llama.cpp/common/speculative.cpp +135 -54
  12. package/src/llama.cpp/common/speculative.h +8 -1
  13. package/src/llama.cpp/ggml/CMakeLists.txt +4 -2
  14. package/src/llama.cpp/ggml/include/ggml.h +37 -1
  15. package/src/llama.cpp/ggml/src/CMakeLists.txt +12 -1
  16. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +61 -0
  17. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +96 -8
  18. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +3196 -0
  19. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +20 -0
  20. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14 -1
  21. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +207 -9
  22. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -7
  23. package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +35 -0
  24. package/src/llama.cpp/ggml/src/ggml-cpu/quants.h +8 -0
  25. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +263 -0
  26. package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +11 -0
  27. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +19 -4
  28. package/src/llama.cpp/include/llama.h +9 -4
  29. package/src/llama.cpp/src/llama-arch.cpp +105 -0
  30. package/src/llama.cpp/src/llama-arch.h +12 -0
  31. package/src/llama.cpp/src/llama-batch.cpp +1 -1
  32. package/src/llama.cpp/src/llama-chat.cpp +33 -1
  33. package/src/llama.cpp/src/llama-chat.h +2 -0
  34. package/src/llama.cpp/src/llama-context.cpp +19 -10
  35. package/src/llama.cpp/src/llama-context.h +4 -1
  36. package/src/llama.cpp/src/llama-graph.cpp +175 -148
  37. package/src/llama.cpp/src/llama-graph.h +60 -23
  38. package/src/llama.cpp/src/llama-hparams.h +5 -3
  39. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +6 -2
  40. package/src/llama.cpp/src/llama-kv-cache-unified.h +1 -1
  41. package/src/llama.cpp/src/llama-memory-hybrid.cpp +2 -1
  42. package/src/llama.cpp/src/llama-memory-hybrid.h +1 -0
  43. package/src/llama.cpp/src/llama-model-loader.cpp +1 -0
  44. package/src/llama.cpp/src/llama-model-loader.h +3 -2
  45. package/src/llama.cpp/src/llama-model.cpp +949 -75
  46. package/src/llama.cpp/src/llama-model.h +24 -4
  47. package/src/llama.cpp/src/llama-quant.cpp +40 -4
  48. package/src/llama.cpp/src/llama-vocab.cpp +49 -1
  49. package/src/llama.cpp/src/llama-vocab.h +1 -0
@@ -412,6 +412,82 @@ void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
     }
 }
 
+void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+    const int qk = QK_K;
+    const int nb = n / qk;
+    const int ncols_interleaved = 8;
+    const int blocklen = 8;
+
+    assert (n % qk == 0);
+    assert (nc % ncols_interleaved == 0);
+
+    UNUSED(s);
+    UNUSED(bs);
+    UNUSED(vx);
+    UNUSED(vy);
+    UNUSED(nr);
+    UNUSED(nc);
+    UNUSED(nb);
+    UNUSED(ncols_interleaved);
+    UNUSED(blocklen);
+
+    float sumf[8];
+    float sum_minf[8];
+    int sumi1, sumi2, sumi3, sumi4;
+    int sumi;
+
+    const block_q8_K * a_ptr = (const block_q8_K *) vy;
+    for (int x = 0; x < nc / ncols_interleaved; x++) {
+        const block_q2_Kx8 * b_ptr = (const block_q2_Kx8 *) vx + (x * nb);
+        for (int j = 0; j < ncols_interleaved; j++) {
+            sumf[j] = 0.0;
+            sum_minf[j] = 0.0;
+        }
+        for (int l = 0; l < nb; l++) {
+            for (int k = 0; k < (qk / (4 * blocklen)); k++) {
+                const uint8_t * scales_0 = b_ptr[l].scales + (k / 4) * 64;
+                const uint8_t * scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16;
+                const uint8_t * scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32;
+                const uint8_t * scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48;
+                for (int j = 0; j < ncols_interleaved; j++) {
+                    sumi1 = 0;
+                    sumi2 = 0;
+                    sumi3 = 0;
+                    sumi4 = 0;
+                    sumi = 0;
+                    int offset = ((k / 2) % 2) + j * 2;
+                    for (int i = 0; i < blocklen; ++i) {
+                        const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 3);
+                        const int v1 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 2) & 3);
+                        const int v2 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4) & 3);
+                        const int v3 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 6) & 3);
+                        sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i]);
+                        sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 32]);
+                        sumi3 = (v2 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 64]);
+                        sumi4 = (v3 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 96]);
+
+                        sumi1 = sumi1 * (scales_0[offset] & 0xF);
+                        sumi2 = sumi2 * (scales_1[offset] & 0xF);
+                        sumi3 = sumi3 * (scales_2[offset] & 0xF);
+                        sumi4 = sumi4 * (scales_3[offset] & 0xF);
+                        sumi += sumi1 + sumi2 + sumi3 + sumi4;
+                    }
+                    sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
+                }
+            }
+            for (int sb = 0; sb < 8; sb++) {
+                const uint8_t * mins = b_ptr[l].scales + sb * 16;
+                for (int j = 0; j < ncols_interleaved; j++) {
+                    sum_minf[j] += ((mins[j * 2] >> 4) * a_ptr[l].bsums[sb * 2] + (mins[(j * 2) + 1] >> 4) * a_ptr[l].bsums[sb * 2 + 1]) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
+                }
+            }
+        }
+        for (int j = 0; j < ncols_interleaved; j++) {
+            s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
+        }
+    }
+}
+
 void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
     const int qk = QK8_0;
     const int nb = n / qk;
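
Each byte of the interleaved `qs` stream packs four 2-bit quants, which the kernel above peels apart with shifts and a `& 3` mask before applying the 4-bit sub-block scales. A minimal standalone sketch of just that unpacking step (hypothetical demo value, not code from the package):

    #include <cstdint>
    #include <cstdio>

    int main() {
        const uint8_t packed = 0xB1; // 0b10110001 -> quants 1, 0, 3, 2 from low to high bits
        for (int shift = 0; shift < 8; shift += 2) {
            // same (>> shift) & 3 idiom as v0..v3 above
            printf("q = %d\n", (packed >> shift) & 3);
        }
        return 0;
    }
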
@@ -711,6 +787,97 @@ void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
     }
 }
 
+void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+    const int qk = QK_K;
+    const int nb = n / qk;
+    const int ncols_interleaved = 8;
+    const int blocklen = 8;
+
+    assert (n % qk == 0);
+    assert (nr % 4 == 0);
+    assert (nc % ncols_interleaved == 0);
+
+    UNUSED(s);
+    UNUSED(bs);
+    UNUSED(vx);
+    UNUSED(vy);
+    UNUSED(nr);
+    UNUSED(nc);
+    UNUSED(nb);
+    UNUSED(ncols_interleaved);
+    UNUSED(blocklen);
+
+    float sumf[4][8];
+    float sum_minf[4][8];
+    int sumi1, sumi2, sumi3, sumi4;
+    int sumi;
+
+    for (int y = 0; y < nr / 4; y++) {
+        const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
+        for (int x = 0; x < nc / ncols_interleaved; x++) {
+            const block_q2_Kx8 * b_ptr = (const block_q2_Kx8 *) vx + (x * nb);
+            for (int m = 0; m < 4; m++) {
+                for (int j = 0; j < ncols_interleaved; j++) {
+                    sumf[m][j] = 0.0;
+                    sum_minf[m][j] = 0.0;
+                }
+            }
+            for (int l = 0; l < nb; l++) {
+                for (int k = 0; k < (qk / (4 * blocklen)); k++) {
+
+                    const uint8_t * scales_0 = b_ptr[l].scales + (k / 4) * 64;
+                    const uint8_t * scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16;
+                    const uint8_t * scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32;
+                    const uint8_t * scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48;
+                    for (int m = 0; m < 4; m++) {
+                        for (int j = 0; j < ncols_interleaved; j++) {
+                            sumi1 = 0;
+                            sumi2 = 0;
+                            sumi3 = 0;
+                            sumi4 = 0;
+                            sumi = 0;
+                            int offset = ((k / 2) % 2) + j * 2;
+                            for (int i = 0; i < blocklen; ++i) {
+                                const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 3);
+                                const int v1 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 2) & 3);
+                                const int v2 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4) & 3);
+                                const int v3 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 6) & 3);
+                                sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i]);
+                                sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 128]);
+                                sumi3 = (v2 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 256]);
+                                sumi4 = (v3 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 384]);
+                                sumi1 = sumi1 * (scales_0[offset] & 0xF);
+                                sumi2 = sumi2 * (scales_1[offset] & 0xF);
+                                sumi3 = sumi3 * (scales_2[offset] & 0xF);
+                                sumi4 = sumi4 * (scales_3[offset] & 0xF);
+                                sumi += sumi1 + sumi2 + sumi3 + sumi4;
+                            }
+                            sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
+                        }
+                    }
+                }
+                for (int sb = 0; sb < 8; sb++) {
+                    const uint8_t * mins = b_ptr[l].scales + sb * 16;
+                    for (int m = 0; m < 4; m++) {
+                        const int16_t * bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
+                        for (int j = 0; j < ncols_interleaved; j++) {
+                            int mins_prod = ((mins[j * 2] >> 4) * bsums[0] + (mins[(j * 2) + 1] >> 4) * bsums[1]);
+                            sum_minf[m][j] += (mins_prod) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
+                        }
+                    }
+                }
+            }
+
+            for (int m = 0; m < 4; m++) {
+                for (int j = 0; j < ncols_interleaved; j++) {
+                    s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
+                }
+            }
+        }
+    }
+}
+
+
 void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
     const int qk = QK8_0;
     const int nb = n / qk;
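
Unlike the GEMV path, the GEMM kernel consumes activations in block_q8_Kx4 groups of four rows and writes one 4x8 tile of the row-major output per (y, x) iteration, with `bs` as the row stride. A small sketch that just prints the destination indices such a tile touches (assumed demo values, not package code):

    #include <cstdio>

    int main() {
        const int bs = 16, y = 1, x = 1; // hypothetical row stride and tile coordinates
        for (int m = 0; m < 4; m++) {        // tile rows
            for (int j = 0; j < 8; j++) {    // tile columns
                // matches s[(y * 4 + m) * bs + x * 8 + j] above
                printf("%4d", (y * 4 + m) * bs + x * 8 + j);
            }
            printf("\n");
        }
        return 0;
    }
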
@@ -914,6 +1081,50 @@ static block_q4_Kx8 make_block_q4_Kx8(block_q4_K * in, unsigned int blck_size_in
     return out;
 }
 
+static block_q2_Kx8 make_block_q2_Kx8(block_q2_K * in, unsigned int blck_size_interleave) {
+    block_q2_Kx8 out;
+
+    // Delta (scale) and dmin values of the eight Q2_K structures are copied onto the output interleaved structure
+    for (int i = 0; i < 8; i++) {
+        out.d[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d;
+    }
+
+    for (int i = 0; i < 8; i++) {
+        out.dmin[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.dmin;
+    }
+
+    const int end = QK_K * 2 / blck_size_interleave;
+
+    // Interleave Q2_K quants by taking 8 bytes at a time
+    for (int i = 0; i < end; ++i) {
+        int src_id = i % 8;
+        int src_offset = (i / 8) * blck_size_interleave;
+        int dst_offset = i * blck_size_interleave;
+
+        uint64_t elems;
+        memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
+        memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
+    }
+
+    // The logic below unpacks and rearranges the scales and mins of Q2_K
+    // The Q2_K structure has 16 scales and 16 mins packed in 16 bytes (4 bits for each value)
+    // The output Q2_Kx8 structure has 128 bytes for storing scales and mins
+    // Every 16 bytes are packed such that they contain the scales and mins for corresponding sub blocks of the Q2_K structures
+    // For example, the first 16 bytes contain the scales and mins of the first and second sub blocks from the different Q2_K structures
+
+    for (int i = 0; i < 128; i++) {
+
+        // Index for selecting which q2k super block
+        int src1 = (i % 16) / 2;
+        // Index for selecting scale
+        int src2 = ((i / 16) * 2) + (i % 2);
+
+        out.scales[i] = in[src1].scales[src2];
+    }
+    return out;
+
+}
+
 static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
     GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
     GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
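
The quant interleave above is easiest to see by index: destination chunk i takes 8 bytes from source block i % 8 at offset (i / 8) * 8, so the eight source rows alternate chunk by chunk in the output. A sketch that prints the mapping for the first two passes (illustrative only):

    #include <cstdio>

    int main() {
        const int blck = 8; // blck_size_interleave
        for (int i = 0; i < 16; ++i) {
            printf("out.qs[%3d..%3d] <- in[%d].qs[%2d..%2d]\n",
                   i * blck, i * blck + 7,                // dst_offset range
                   i % 8,                                 // src_id
                   (i / 8) * blck, (i / 8) * blck + 7);   // src_offset range
        }
        return 0;
    }
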
@@ -975,6 +1186,37 @@ static int repack_q4_K_to_q4_K_8_bl(struct ggml_tensor * t, int interleave_block
     GGML_UNUSED(data_size);
 }
 
+static int repack_q2_K_to_q2_K_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
+    GGML_ASSERT(t->type == GGML_TYPE_Q2_K);
+    GGML_ASSERT(interleave_block == 8);
+    constexpr int nrows_interleaved = 8;
+
+    block_q2_Kx8 * dst = (block_q2_Kx8 *) t->data;
+    const block_q2_K * src = (const block_q2_K *) data;
+    block_q2_K dst_tmp[8];
+    int nrow = ggml_nrows(t);
+    int nblocks = t->ne[0] / QK_K;
+
+    GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q2_K));
+
+    if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
+        return -1;
+    }
+
+    for (int b = 0; b < nrow; b += nrows_interleaved) {
+        for (int64_t x = 0; x < nblocks; x++) {
+            for (int i = 0; i < nrows_interleaved; i++) {
+                dst_tmp[i] = src[x + i * nblocks];
+            }
+            *dst++ = make_block_q2_Kx8(dst_tmp, interleave_block);
+        }
+        src += nrows_interleaved * nblocks;
+    }
+    return 0;
+
+    GGML_UNUSED(data_size);
+}
+
 static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
     GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
     GGML_ASSERT(interleave_block == 8);
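
For each group of eight rows, the repack walks the super-block columns and gathers the x-th block of every row (src[x + i * nblocks]) into dst_tmp before interleaving. A sketch of the gather pattern with an assumed nblocks of 4 (demo value only):

    #include <cstdio>

    int main() {
        const int nblocks = 4; // assumed super blocks per row
        for (int x = 0; x < nblocks; x++) {
            printf("dst_tmp for column %d <- src[", x);
            for (int i = 0; i < 8; i++) {
                // one block from each of the 8 rows in the group
                printf(i ? ", %d" : "%d", x + i * nblocks);
            }
            printf("]\n");
        }
        return 0;
    }
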
@@ -1095,6 +1337,10 @@ template <> int repack<block_q4_K, 8, 8>(struct ggml_tensor * t, const void * da
     return repack_q4_K_to_q4_K_8_bl(t, 8, data, data_size);
 }
 
+template <> int repack<block_q2_K, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
+    return repack_q2_K_to_q2_K_8_bl(t, 8, data, data_size);
+}
+
 template <> int repack<block_iq4_nl, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
     return repack_iq4_nl_to_iq4_nl_4_bl(t, 4, data, data_size);
 }
@@ -1124,6 +1370,10 @@ template <> void gemv<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t
     ggml_gemv_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
 }
 
+template <> void gemv<block_q2_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+    ggml_gemv_q2_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
+}
+
 template <> void gemv<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
     ggml_gemv_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
 }
@@ -1148,6 +1398,10 @@ template <> void gemm<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t
     ggml_gemm_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
 }
 
+template <> void gemm<block_q2_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+    ggml_gemm_q2_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
+}
+
 template <> void gemm<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
     ggml_gemm_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
 }
@@ -1421,6 +1675,9 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
     static const ggml::cpu::repack::tensor_traits<block_q4_0, 8, 8, GGML_TYPE_Q8_0> q4_0_8x8_q8_0;
     static const ggml::cpu::repack::tensor_traits<block_q4_K, 8, 8, GGML_TYPE_Q8_K> q4_K_8x8_q8_K;
 
+    // instance for Q2
+    static const ggml::cpu::repack::tensor_traits<block_q2_K, 8, 8, GGML_TYPE_Q8_K> q2_K_8x8_q8_K;
+
     // instance for IQ4
     static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0> iq4_nl_4x4_q8_0;
 
@@ -1446,6 +1703,12 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
                 return &q4_K_8x8_q8_K;
             }
         }
+    } else if (cur->type == GGML_TYPE_Q2_K) {
+        if (ggml_cpu_has_avx512()) {
+            if (cur->ne[1] % 8 == 0) {
+                return &q2_K_8x8_q8_K;
+            }
+        }
     } else if (cur->type == GGML_TYPE_IQ4_NL) {
         if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
             if (cur->ne[1] % 4 == 0) {
@@ -44,7 +44,14 @@ struct block_q4_Kx8 {
 };
 
 static_assert(sizeof(block_q4_Kx8) == sizeof(ggml_half) * 16 + K_SCALE_SIZE * 8 + QK_K * 4, "wrong q4_K block size/padding");
+struct block_q2_Kx8 {
+    ggml_half d[8];      // super-block scale for quantized scales
+    ggml_half dmin[8];   // super-block scale for quantized mins
+    uint8_t scales[128]; // scales and mins, quantized with 4 bits
+    uint8_t qs[512];     // 2-bit quants
+};
 
+static_assert(sizeof(block_q2_Kx8) == sizeof(ggml_half) * 16 + QK_K/2 + QK_K * 2, "wrong q2_K block size/padding");
 struct block_q8_Kx4 {
     float d[4];          // delta
     int8_t qs[QK_K * 4]; // quants
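
With QK_K = 256, the static_assert arithmetic for the new struct works out to 16 halves (2 bytes each) plus QK_K/2 scale bytes plus QK_K*2 quant bytes. A quick standalone check of the expected size (a sketch, not package code):

    #include <cstdio>

    int main() {
        const int QK_K = 256;
        const int expected = 2 * 16     // d[8] + dmin[8], ggml_half is 2 bytes
                           + QK_K / 2   // scales[128]
                           + QK_K * 2;  // qs[512]
        printf("block_q2_Kx8 should be %d bytes\n", expected); // 672
        return 0;
    }
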
@@ -71,11 +78,13 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
 void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemv_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 
 // Native implementations
@@ -86,11 +95,13 @@ void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
 void ggml_gemv_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 
 #if defined(__cplusplus)
@@ -55,7 +55,22 @@ inline static void ggml_vec_cpy_i32(const int n, int32_t * y, const int32_t * x)
 
 inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const ggml_fp16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
 inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
-inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; }
+
+inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) {
+    int i = 0;
+#if defined(__AVX2__)
+    for (; i + 7 < n; i += 8) {
+        __m256 vx = _mm256_loadu_ps(x + i);
+        __m256 vy = _mm256_loadu_ps(y + i);
+        __m256 vz = _mm256_add_ps(vx, vy);
+        _mm256_storeu_ps(z + i, vz);
+    }
+#endif
+    for (; i < n; ++i) {
+        z[i] = x[i] + y[i];
+    }
+}
+
 inline static void ggml_vec_add_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
     for (int i = 0; i < n; ++i) {
         z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) + GGML_CPU_FP16_TO_FP32(y[i]));
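
The new ggml_vec_add_f32 follows the standard SIMD-body-plus-scalar-tail pattern: process 8 floats per iteration while at least 8 remain, then let the plain loop finish the last n % 8 elements; on non-AVX2 targets only the scalar loop survives the preprocessor. A standalone sketch of the same shape (not the package's code):

    #include <cstddef>
    #if defined(__AVX2__)
    #include <immintrin.h>
    #endif

    void vec_add(size_t n, float * z, const float * x, const float * y) {
        size_t i = 0;
    #if defined(__AVX2__)
        for (; i + 7 < n; i += 8) { // 8-wide vector body
            _mm256_storeu_ps(z + i, _mm256_add_ps(_mm256_loadu_ps(x + i),
                                                  _mm256_loadu_ps(y + i)));
        }
    #endif
        for (; i < n; ++i) {        // scalar tail covers the remaining n % 8 elements
            z[i] = x[i] + y[i];
        }
    }
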
@@ -992,9 +1007,9 @@ void ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float *
 
 inline static void ggml_vec_swiglu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
     for (int i = 0; i < n; ++i) {
-        float v = GGML_CPU_FP16_TO_FP32(x[i]);
-        float w = GGML_CPU_FP16_TO_FP32(g[i]);
-        y[i] = GGML_CPU_FP32_TO_FP16((v/(1.0f + expf(-v))) * w);
+        float xi = GGML_CPU_FP16_TO_FP32(x[i]);
+        float gi = GGML_CPU_FP16_TO_FP32(g[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16((xi/(1.0f + expf(-xi))) * gi);
     }
 }
 
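
This hunk only renames the locals (v/w become xi/gi); the math is the usual SwiGLU form y = silu(x) * g with silu(x) = x / (1 + exp(-x)). The same computation in plain float (demo inputs assumed):

    #include <cmath>
    #include <cstdio>

    int main() {
        const float x = 1.5f, g = 0.5f;                    // arbitrary demo inputs
        const float y = (x / (1.0f + std::exp(-x))) * g;   // silu(x) * g
        printf("swiglu = %f\n", y);
        return 0;
    }
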
@@ -152,6 +152,7 @@ extern "C" {
         //LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // removed from gguf files, use Q4_0 and runtime repack
         LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38, // except 1d tensors
 
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
@@ -284,10 +285,11 @@ extern "C" {
         const struct llama_model_kv_override * kv_overrides;
 
         // Keep the booleans together to avoid misalignment during copy-by-value.
-        bool vocab_only;      // only load the vocabulary, no weights
-        bool use_mmap;        // use mmap if possible
-        bool use_mlock;       // force system to keep model in RAM
-        bool check_tensors;   // validate model tensor data
+        bool vocab_only;      // only load the vocabulary, no weights
+        bool use_mmap;        // use mmap if possible
+        bool use_mlock;       // force system to keep model in RAM
+        bool check_tensors;   // validate model tensor data
+        bool use_extra_bufts; // use extra buffer types (used for weight repacking)
     };
 
     // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
@@ -537,6 +539,9 @@ extern "C" {
     // Returns true if the model is recurrent (like Mamba, RWKV, etc.)
     LLAMA_API bool llama_model_is_recurrent(const struct llama_model * model);
 
+    // Returns true if the model is diffusion-based (like LLaDA, Dream, etc.)
+    LLAMA_API bool llama_model_is_diffusion(const struct llama_model * model);
+
     // Returns 0 on success
     LLAMA_API uint32_t llama_model_quantize(
             const char * fname_inp,
@@ -62,6 +62,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_DEEPSEEK2, "deepseek2" },
     { LLM_ARCH_CHATGLM, "chatglm" },
     { LLM_ARCH_GLM4, "glm4" },
+    { LLM_ARCH_GLM4_MOE, "glm4moe" },
     { LLM_ARCH_BITNET, "bitnet" },
     { LLM_ARCH_T5, "t5" },
    { LLM_ARCH_T5ENCODER, "t5encoder" },
@@ -85,10 +86,13 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_ERNIE4_5, "ernie4_5" },
     { LLM_ARCH_ERNIE4_5_MOE, "ernie4_5-moe" },
     { LLM_ARCH_HUNYUAN_MOE, "hunyuan-moe" },
+    { LLM_ARCH_HUNYUAN_DENSE, "hunyuan-dense" },
     { LLM_ARCH_SMOLLM3, "smollm3" },
+    { LLM_ARCH_OPENAI_MOE, "gpt-oss" },
     { LLM_ARCH_LFM2, "lfm2" },
     { LLM_ARCH_DREAM, "dream" },
     { LLM_ARCH_SMALLTHINKER, "smallthinker" },
+    { LLM_ARCH_LLADA, "llada" },
     { LLM_ARCH_UNKNOWN, "(unknown)" },
 };
 
@@ -125,6 +129,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_EXPERT_WEIGHTS_NORM, "%s.expert_weights_norm" },
     { LLM_KV_EXPERT_GATING_FUNC, "%s.expert_gating_func" },
     { LLM_KV_MOE_EVERY_N_LAYERS, "%s.moe_every_n_layers" },
+    { LLM_KV_NEXTN_PREDICT_LAYERS, "%s.nextn_predict_layers" },
     { LLM_KV_POOLING_TYPE, "%s.pooling_type" },
     { LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
     { LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
@@ -1389,6 +1394,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
         },
     },
+    {
+        LLM_ARCH_GLM4_MOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
+            { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
+            { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
+            { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
+            // NextN/MTP tensors - preserved but unused (in final layer, dynamic layer number)
+            { LLM_TENSOR_NEXTN_EH_PROJ, "blk.%d.nextn.eh_proj" },
+            { LLM_TENSOR_NEXTN_EMBED_TOKENS, "blk.%d.nextn.embed_tokens" },
+            { LLM_TENSOR_NEXTN_ENORM, "blk.%d.nextn.enorm" },
+            { LLM_TENSOR_NEXTN_HNORM, "blk.%d.nextn.hnorm" },
+            { LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "blk.%d.nextn.shared_head_head" },
+            { LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "blk.%d.nextn.shared_head_norm" },
+        },
+    },
     {
         LLM_ARCH_BITNET,
         {
@@ -1896,6 +1935,26 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
         },
     },
+    {
+        LLM_ARCH_HUNYUAN_DENSE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+
+        },
+    },
     {
         LLM_ARCH_SMOLLM3,
         {
@@ -1913,6 +1972,25 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_OPENAI_MOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_SINKS, "blk.%d.attn_sinks" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+        },
+    },
     {
         LLM_ARCH_LFM2,
         {
@@ -1972,6 +2050,23 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_LLADA,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -2011,6 +2106,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_ATTN_KV_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_ATTN_K_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_ATTN_V_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_SINKS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SCALE}},
     {LLM_TENSOR_DEC_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_DEC_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_DEC_ATTN_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
@@ -2142,6 +2238,14 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_SHORTCONV_CONV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_CONV}},
     {LLM_TENSOR_SHORTCONV_INPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_SHORTCONV_OUTPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    // NextN/MTP tensors are currently ignored (reserved for future MTP support)
+    // These tensors only exist in the last layer(s) and are treated as output tensors
+    {LLM_TENSOR_NEXTN_EH_PROJ, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_NEXTN_EMBED_TOKENS, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}},
+    {LLM_TENSOR_NEXTN_ENORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}},
+    {LLM_TENSOR_NEXTN_HNORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
+    {LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
 };
 
 LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}
@@ -2224,6 +2328,7 @@ bool llm_arch_is_hybrid(const llm_arch & arch) {
 bool llm_arch_is_diffusion(const llm_arch & arch) {
     switch (arch) {
         case LLM_ARCH_DREAM:
+        case LLM_ARCH_LLADA:
            return true;
         default:
             return false;
@@ -66,6 +66,7 @@ enum llm_arch {
     LLM_ARCH_DEEPSEEK2,
     LLM_ARCH_CHATGLM,
     LLM_ARCH_GLM4,
+    LLM_ARCH_GLM4_MOE,
     LLM_ARCH_BITNET,
     LLM_ARCH_T5,
     LLM_ARCH_T5ENCODER,
@@ -89,10 +90,13 @@ enum llm_arch {
     LLM_ARCH_ERNIE4_5,
     LLM_ARCH_ERNIE4_5_MOE,
     LLM_ARCH_HUNYUAN_MOE,
+    LLM_ARCH_HUNYUAN_DENSE,
     LLM_ARCH_SMOLLM3,
+    LLM_ARCH_OPENAI_MOE,
     LLM_ARCH_LFM2,
     LLM_ARCH_DREAM,
     LLM_ARCH_SMALLTHINKER,
+    LLM_ARCH_LLADA,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -129,6 +133,7 @@ enum llm_kv {
     LLM_KV_EXPERT_WEIGHTS_NORM,
     LLM_KV_EXPERT_GATING_FUNC,
     LLM_KV_MOE_EVERY_N_LAYERS,
+    LLM_KV_NEXTN_PREDICT_LAYERS,
     LLM_KV_POOLING_TYPE,
     LLM_KV_LOGIT_SCALE,
     LLM_KV_DECODER_START_TOKEN_ID,
@@ -261,6 +266,7 @@ enum llm_tensor {
     LLM_TENSOR_ATTN_OUT_NORM,
     LLM_TENSOR_ATTN_POST_NORM,
     LLM_TENSOR_ATTN_ROT_EMBD,
+    LLM_TENSOR_ATTN_SINKS,
     LLM_TENSOR_FFN_GATE_INP,
     LLM_TENSOR_FFN_GATE_INP_SHEXP,
     LLM_TENSOR_FFN_NORM,
@@ -407,6 +413,12 @@ enum llm_tensor {
     LLM_TENSOR_SHORTCONV_CONV,
     LLM_TENSOR_SHORTCONV_INPROJ,
     LLM_TENSOR_SHORTCONV_OUTPROJ,
+    LLM_TENSOR_NEXTN_EH_PROJ,
+    LLM_TENSOR_NEXTN_EMBED_TOKENS,
+    LLM_TENSOR_NEXTN_ENORM,
+    LLM_TENSOR_NEXTN_HNORM,
+    LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD,
+    LLM_TENSOR_NEXTN_SHARED_HEAD_NORM,
 };
 
 enum llm_tensor_layer {