@fugood/llama.node 1.3.8 → 1.4.1

This diff shows the contents of publicly available package versions as published to their respective public registries. It is provided for informational purposes only and reflects the changes between those published versions.
Files changed (32)
  1. package/lib/binding.js +25 -18
  2. package/lib/binding.ts +19 -1
  3. package/lib/index.js +3 -3
  4. package/lib/index.ts +1 -1
  5. package/package.json +17 -17
  6. package/scripts/llama.cpp.patch +53 -4
  7. package/src/LlamaCompletionWorker.cpp +2 -2
  8. package/src/LlamaContext.cpp +6 -1
  9. package/src/llama.cpp/common/arg.cpp +1 -1
  10. package/src/llama.cpp/common/chat-parser.cpp +968 -0
  11. package/src/llama.cpp/common/chat.cpp +0 -952
  12. package/src/llama.cpp/common/json-schema-to-grammar.cpp +2 -2
  13. package/src/llama.cpp/ggml/CMakeLists.txt +1 -0
  14. package/src/llama.cpp/ggml/include/ggml-rpc.h +1 -1
  15. package/src/llama.cpp/ggml/src/CMakeLists.txt +11 -4
  16. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +336 -3
  17. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp +11 -8
  18. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +22 -0
  19. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +2 -1
  20. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +234 -1
  21. package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +6 -0
  22. package/src/llama.cpp/src/CMakeLists.txt +1 -0
  23. package/src/llama.cpp/src/llama-arch.cpp +48 -3
  24. package/src/llama.cpp/src/llama-arch.h +2 -0
  25. package/src/llama.cpp/src/llama-context.cpp +6 -2
  26. package/src/llama.cpp/src/llama-hparams.h +1 -1
  27. package/src/llama.cpp/src/llama-model.cpp +102 -5
  28. package/src/llama.cpp/src/llama-model.h +4 -0
  29. package/src/llama.cpp/src/llama-quant.cpp +13 -5
  30. package/src/llama.cpp/src/models/lfm2.cpp +5 -3
  31. package/src/llama.cpp/src/models/models.h +51 -1
  32. package/src/llama.cpp/src/models/qwen3next.cpp +1042 -0
package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp
@@ -124,6 +124,58 @@ void ggml_quantize_mat_q8_0_4x8_generic(const float * GGML_RESTRICT x, void * GG
  }
  }
 
+
+ void ggml_quantize_mat_q8_K_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
+ assert(QK_K == 256);
+ assert(k % QK_K == 0);
+ const int nb = k / QK_K;
+
+ block_q8_Kx4 * GGML_RESTRICT y = (block_q8_Kx4 *) vy;
+
+ // scalar
+ const int blck_size_interleave = 4;
+ float srcv[4][QK_K];
+ float iscale[4];
+
+ for (int i = 0; i < nb; i++) {
+ for (int row_iter = 0; row_iter < 4; row_iter++) {
+ float amax = 0.0f; // absolute max
+ float max = 0;
+
+ for (int j = 0; j < QK_K; j++) {
+ srcv[row_iter][j] = x[row_iter * k + i * QK_K + j];
+ // Update the maximum value of the corresponding super block
+ if(amax < fabsf(srcv[row_iter][j])) {
+ amax = fabsf(srcv[row_iter][j]);
+ max = srcv[row_iter][j];
+ }
+ }
+
+ iscale[row_iter] = amax ? -127.f/max : 0;
+
+ y[i].d[row_iter] = amax ? 1/iscale[row_iter] : 0;
+ }
+
+ for (int j = 0; j < QK_K / 4; j++) {
+ y[i].bsums[j] = 0;
+ }
+
+ // Quants values are interleaved in sequence of four bytes from corresponding super blocks
+ // Bsums values are interleaved in sequence of four bsums from each super block taken for interleaving
+ // i.e first four bsums from the first super block, followed by first four bsums from second super block and so on
+ for (int j = 0; j < QK_K * 4; j++) {
+ int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
+ int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
+ src_offset += (j % blck_size_interleave);
+ int index = (((j & 15) >> 2) << 2) + ((j >> 8) << 4) + ((j >> 6) & 3);
+
+ float x0 = srcv[src_id][src_offset] * iscale[src_id];
+ y[i].qs[j] = nearest_int(x0);
+ y[i].bsums[index] += y[i].qs[j];
+ }
+ }
+ }
+
  void ggml_quantize_mat_q8_K_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
  assert(QK_K == 256);
  assert(k % QK_K == 0);
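The interleave arithmetic in the new quantizer is easier to see with a small standalone check (illustrative only, not part of the package): with QK_K == 256 and blck_size_interleave == 4, destination byte j takes four consecutive values from one source row before rotating to the next row, advancing the source window by four every 16 output bytes.

    // Hypothetical verification sketch of the quant interleave in
    // ggml_quantize_mat_q8_K_4x4_generic; constants mirror the kernel above.
    #include <cassert>

    int main() {
        const int QK_K = 256;                 // assumed, as asserted by the kernel
        const int blck_size_interleave = 4;
        for (int j = 0; j < QK_K * 4; j++) {
            int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
            int src_id     = (j % (4 * blck_size_interleave)) / blck_size_interleave;
            src_offset    += (j % blck_size_interleave);
            // expected pattern: row (j/4)%4, source positions 4*(j/16) .. 4*(j/16)+3
            assert(src_id     == (j / 4) % 4);
            assert(src_offset == (j / 16) * 4 + j % 4);
        }
        return 0;
    }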
@@ -192,6 +244,12 @@ template <> void ggml_quantize_mat_t<8, GGML_TYPE_Q8_0>(const float * GGML_RESTR
  ggml_quantize_mat_q8_0_4x8(x, vy, n_per_row);
  }
 
+ template <> void ggml_quantize_mat_t<4, GGML_TYPE_Q8_K>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) {
+ assert(nrow == 4);
+ UNUSED(nrow);
+ ggml_quantize_mat_q8_K_4x4(x, vy, n_per_row);
+ }
+
  template <> void ggml_quantize_mat_t<8, GGML_TYPE_Q8_K>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) {
  assert(nrow == 4);
  UNUSED(nrow);
@@ -333,6 +391,77 @@ void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
  }
  }
 
+ void ggml_gemv_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+ const int qk = QK_K;
+ const int nb = n / qk;
+ const int ncols_interleaved = 8;
+ const int blocklen = 4;
+ static const uint32_t kmask1 = 0x3f3f3f3f;
+ static const uint32_t kmask2 = 0x0f0f0f0f;
+ static const uint32_t kmask3 = 0x03030303;
+
+ assert (n % qk == 0);
+ assert (nc % ncols_interleaved == 0);
+
+ UNUSED(bs);
+ UNUSED(nr);
+
+ float sumf[8];
+ float sum_minf[8];
+ uint32_t utmp[32];
+ int sumi1;
+ int sumi2;
+ int sumi;
+
+ const block_q8_K * a_ptr = (const block_q8_K *) vy;
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
+ const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb);
+
+ for (int j = 0; j < ncols_interleaved; j++) {
+ sumf[j] = 0.0;
+ sum_minf[j] = 0.0;
+ }
+ for (int l = 0; l < nb; l++) {
+ for (int sb = 0; sb < 8; sb++) {
+ memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
+ utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
+ const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
+ utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
+ utmp[sb * 4 + 2] = uaux_0;
+ utmp[sb * 4 + 0] &= kmask1;
+ }
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+ uint8_t * scales_0 = (uint8_t *) utmp + (k / 8) * 32;
+ uint8_t * scales_1 = (uint8_t *) utmp + (k / 8) * 32 + 16;
+ for (int j = 0; j < ncols_interleaved; j++) {
+ sumi1 = 0;
+ sumi2 = 0;
+ sumi = 0;
+ for (int i = 0; i < blocklen; ++i) {
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF);
+ const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
+ sumi1 = (v0 * a_ptr[l].qs[(k / 8) * 64 + (k % 8) * blocklen + i]);
+ sumi2 = (v1 * a_ptr[l].qs[(k / 8) * 64 + (k % 8) * blocklen + i + 32]);
+ sumi1 = sumi1 * scales_0[j];
+ sumi2 = sumi2 * scales_1[j];
+ sumi += sumi1 + sumi2;
+ }
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
+ }
+ }
+ for (int sb = 0; sb < 8; sb++) {
+ uint8_t * mins = (uint8_t *) utmp + 8 + sb * 16;
+ for (int j = 0; j < ncols_interleaved; j++) {
+ sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
+ }
+ }
+ }
+ for (int j = 0; j < ncols_interleaved; j++) {
+ s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
+ }
+ }
+ }
+
  void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
  const int qk = QK_K;
  const int nb = n / qk;
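The kmask/utmp shuffle at the top of the block loop expands each 12-byte packed group of Q4_K scales into 8 six-bit scales followed by 8 six-bit mins inside utmp. For orientation only, a scalar equivalent along the lines of the helper used elsewhere in llama.cpp's Q4_K code (a sketch, not taken from this patch):

    #include <stdint.h>

    // Extract scale d and min m for sub-block j (0..7) from the 12 packed
    // bytes q of one Q4_K super-block; sub-blocks 4..7 stitch their high
    // two bits from the first four bytes.
    static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t * d, uint8_t * m) {
        if (j < 4) {
            *d = q[j] & 63;
            *m = q[j + 4] & 63;
        } else {
            *d = (q[j + 4] & 0xF) | ((q[j - 4] >> 6) << 4);
            *m = (q[j + 4] >>  4) | ((q[j]     >> 6) << 4);
        }
    }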
@@ -727,6 +856,89 @@ void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
  }
  }
 
+ void ggml_gemm_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+ const int qk = QK_K;
+ const int nb = n / qk;
+ const int ncols_interleaved = 8;
+ const int blocklen = 4;
+ static const uint32_t kmask1 = 0x3f3f3f3f;
+ static const uint32_t kmask2 = 0x0f0f0f0f;
+ static const uint32_t kmask3 = 0x03030303;
+
+ assert (n % qk == 0);
+ assert (nr % 4 == 0);
+ assert (nc % ncols_interleaved == 0);
+
+ UNUSED(nb);
+ UNUSED(ncols_interleaved);
+ UNUSED(blocklen);
+
+ float sumf[4][8];
+ float sum_minf[4][8];
+ uint32_t utmp[32];
+ int sumi1;
+ int sumi2;
+ int sumi;
+
+ for (int y = 0; y < nr / 4; y++) {
+ const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
+ const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb);
+ for (int m = 0; m < 4; m++) {
+ for (int j = 0; j < ncols_interleaved; j++) {
+ sumf[m][j] = 0.0;
+ sum_minf[m][j] = 0.0;
+ }
+ }
+ for (int l = 0; l < nb; l++) {
+ for (int sb = 0; sb < 8; sb++) {
+ memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
+ utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
+ const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
+ utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
+ utmp[sb * 4 + 2] = uaux_0;
+ utmp[sb * 4 + 0] &= kmask1;
+ }
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+ uint8_t * scales_0 = (uint8_t *) utmp + (k / 8) * 32;
+ uint8_t * scales_1 = (uint8_t *) utmp + (k / 8) * 32 + 16;
+ for (int m = 0; m < 4; m++) {
+ for (int j = 0; j < ncols_interleaved; j++) {
+ sumi1 = 0;
+ sumi2 = 0;
+ sumi = 0;
+ for (int i = 0; i < blocklen; ++i) {
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF);
+ const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
+ sumi1 = (v0 * a_ptr[l].qs[(k / 8) * 256 + (k % 8) * 4 * blocklen + m * blocklen + i]);
+ sumi2 = (v1 * a_ptr[l].qs[(k / 8) * 256 + (k % 8) * 4 * blocklen + m * blocklen + i + 128]);
+ sumi1 = sumi1 * scales_0[j];
+ sumi2 = sumi2 * scales_1[j];
+ sumi += sumi1 + sumi2;
+ }
+ sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
+ }
+ }
+ }
+ for (int sb = 0; sb < 8; sb++) {
+ uint8_t * mins = (uint8_t *) utmp + 8 + sb * 16;
+ for(int m = 0; m < 4; m++) {
+ const int16_t * bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
+ for(int j = 0; j < ncols_interleaved; j++) {
+ sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
+ }
+ }
+ }
+ }
+ for (int m = 0; m < 4; m++) {
+ for (int j = 0; j < ncols_interleaved; j++) {
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
+ }
+ }
+ }
+ }
+ }
+
  void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
  const int qk = QK_K;
  const int nb = n / qk;
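Relative to the gemv kernel, the only non-obvious detail in the gemm variant is the bsums pointer arithmetic: ggml_quantize_mat_q8_K_4x4_generic stores the per-16-quant sums interleaved across the four rows, so the two partial sums for row m and Q4_K sub-block sb sit at offset sb*8 + m*4 - (sb%2)*6 and the entry after it. A standalone check of that relationship (illustrative only, assuming the interleaved layout derived from the quantizer above):

    // Group g (of 16 quants) of row m is written by the quantizer to
    // bsums index (g/4)*16 + m*4 + g%4; sub-block sb covers groups 2*sb
    // and 2*sb + 1, which must be the two consecutive entries read here.
    #include <cassert>

    int main() {
        for (int sb = 0; sb < 8; sb++) {
            for (int m = 0; m < 4; m++) {
                const int base = sb * 8 + m * 4 - (sb % 2) * 6;
                const int g0 = 2 * sb, g1 = g0 + 1;
                assert(base     == (g0 / 4) * 16 + m * 4 + g0 % 4);
                assert(base + 1 == (g1 / 4) * 16 + m * 4 + g1 % 4);
            }
        }
        return 0;
    }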
@@ -1228,9 +1440,10 @@ static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block
 
  GGML_UNUSED(data_size);
  }
+
  static int repack_q4_K_to_q4_K_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
  GGML_ASSERT(t->type == GGML_TYPE_Q4_K);
- GGML_ASSERT(interleave_block == 8);
+ GGML_ASSERT(interleave_block == 8 || interleave_block == 4);
  constexpr int nrows_interleaved = 8;
 
  block_q4_Kx8 * dst = (block_q4_Kx8*)t->data;
@@ -1468,6 +1681,10 @@ template <> int repack<block_q4_K, 8, 8>(struct ggml_tensor * t, const void * da
  return repack_q4_K_to_q4_K_8_bl(t, 8, data, data_size);
  }
 
+ template <> int repack<block_q4_K, 4, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
+ return repack_q4_K_to_q4_K_8_bl(t, 4, data, data_size);
+ }
+
  template <> int repack<block_q2_K, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
  return repack_q2_K_to_q2_K_8_bl(t, 8, data, data_size);
  }
@@ -1501,6 +1718,10 @@ template <> void gemv<block_q4_0, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t
  ggml_gemv_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
  }
 
+ template <> void gemv<block_q4_K, 4, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+ ggml_gemv_q4_K_8x4_q8_K(n, s, bs, vx, vy, nr, nc);
+ }
+
  template <> void gemv<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
  ggml_gemv_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
  }
@@ -1529,6 +1750,10 @@ template <> void gemm<block_q4_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t
  ggml_gemm_q4_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
  }
 
+ template <> void gemm<block_q4_K, 4, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+ ggml_gemm_q4_K_8x4_q8_K(n, s, bs, vx, vy, nr, nc);
+ }
+
  template <> void gemm<block_q4_0, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
  ggml_gemm_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
  }
@@ -1931,6 +2156,9 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
  static const ggml::cpu::repack::tensor_traits<block_q4_0, 4, 4, GGML_TYPE_Q8_0> q4_0_4x4_q8_0;
  static const ggml::cpu::repack::tensor_traits<block_q4_0, 8, 4, GGML_TYPE_Q8_0> q4_0_4x8_q8_0;
  static const ggml::cpu::repack::tensor_traits<block_q4_0, 8, 8, GGML_TYPE_Q8_0> q4_0_8x8_q8_0;
+
+ // instance for Q4_K
+ static const ggml::cpu::repack::tensor_traits<block_q4_K, 4, 8, GGML_TYPE_Q8_K> q4_K_8x4_q8_K;
  static const ggml::cpu::repack::tensor_traits<block_q4_K, 8, 8, GGML_TYPE_Q8_K> q4_K_8x8_q8_K;
 
  // instance for Q2
@@ -1967,6 +2195,11 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
  return &q4_K_8x8_q8_K;
  }
  }
+ if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
+ if (cur->ne[1] % 8 == 0) {
+ return &q4_K_8x4_q8_K;
+ }
+ }
  } else if (cur->type == GGML_TYPE_Q2_K) {
  if (ggml_cpu_has_avx512()) {
  if (cur->ne[1] % 8 == 0) {
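Restating the new selection branch in ggml_repack_get_optimal_repack_type (illustrative only; wants_q4_K_8x4 is a hypothetical helper, not an API of the package): a Q4_K weight tensor is repacked to the 8x4 layout when NEON and the dot-product extension are available and its second dimension is a multiple of 8, and only if none of the earlier, more specific branches returned first.

    #include "ggml.h"
    #include "ggml-cpu.h"

    // Hypothetical condensation of the condition added above for Q4_K tensors.
    static bool wants_q4_K_8x4(const struct ggml_tensor * cur) {
        return cur->type == GGML_TYPE_Q4_K
            && ggml_cpu_has_neon() != 0
            && ggml_cpu_has_dotprod() != 0
            && cur->ne[1] % 8 == 0;
    }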
package/src/llama.cpp/ggml/src/ggml-cpu/repack.h
@@ -80,10 +80,12 @@ extern "C" {
 
  void ggml_quantize_mat_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
  void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
+ void ggml_quantize_mat_q8_K_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
  void ggml_quantize_mat_q8_K_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
  void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
  void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
  void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+ void ggml_gemv_q4_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
  void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
  void ggml_gemv_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
  void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
@@ -91,6 +93,7 @@ void ggml_gemv_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
  void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
  void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
  void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+ void ggml_gemm_q4_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
  void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
  void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
  void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
@@ -99,10 +102,12 @@ void ggml_gemm_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
  // Native implementations
  void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
  void ggml_quantize_mat_q8_0_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
+ void ggml_quantize_mat_q8_K_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
  void ggml_quantize_mat_q8_K_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
  void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
  void ggml_gemv_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
  void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+ void ggml_gemv_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
  void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
  void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
  void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
@@ -110,6 +115,7 @@ void ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
  void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
  void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
  void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+ void ggml_gemm_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
  void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
  void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
  void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
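The file list also shows ggml-cpu/arch-fallback.h gaining 22 lines (not reproduced in this section); presumably those map the three new public symbols back to the _generic implementations declared above on architectures without a tuned kernel, in the style of the header's existing entries. A sketch of what such a mapping looks like (assumption, not the actual patch content):

    // Hypothetical fallback defines for an architecture lacking
    // hand-written Q4_K 8x4 kernels; names match the declarations above.
    #define ggml_quantize_mat_q8_K_4x4 ggml_quantize_mat_q8_K_4x4_generic
    #define ggml_gemv_q4_K_8x4_q8_K    ggml_gemv_q4_K_8x4_q8_K_generic
    #define ggml_gemm_q4_K_8x4_q8_K    ggml_gemm_q4_K_8x4_q8_K_generic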
package/src/llama.cpp/src/CMakeLists.txt
@@ -114,6 +114,7 @@ add_library(llama
  models/qwen3vl.cpp
  models/qwen3vl-moe.cpp
  models/qwen3moe.cpp
+ models/qwen3next.cpp
  models/refact.cpp
  models/rnd1.cpp
  models/rwkv6-base.cpp
package/src/llama.cpp/src/llama-arch.cpp
@@ -32,6 +32,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_QWEN2VL, "qwen2vl" },
  { LLM_ARCH_QWEN3, "qwen3" },
  { LLM_ARCH_QWEN3MOE, "qwen3moe" },
+ { LLM_ARCH_QWEN3NEXT, "qwen3next" },
  { LLM_ARCH_QWEN3VL, "qwen3vl" },
  { LLM_ARCH_QWEN3VLMOE, "qwen3vlmoe" },
  { LLM_ARCH_PHI2, "phi2" },
@@ -829,6 +830,38 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
  { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
  },
  },
+ {
+ LLM_ARCH_QWEN3NEXT,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+ { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
+ { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
+ { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
+ { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
+ { LLM_TENSOR_SSM_A, "blk.%d.ssm_a" },
+ { LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" },
+ { LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" },
+ { LLM_TENSOR_SSM_BETA_ALPHA, "blk.%d.ssm_ba" },
+ { LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" },
+ { LLM_TENSOR_SSM_NORM, "blk.%d.ssm_norm" },
+ { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
+ },
+ },
  {
  LLM_ARCH_QWEN3VL,
  {
@@ -2237,7 +2270,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
  { LLM_TENSOR_SHORTCONV_INPROJ, "blk.%d.shortconv.in_proj" },
  { LLM_TENSOR_SHORTCONV_OUTPROJ, "blk.%d.shortconv.out_proj" },
  { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
- { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+ { LLM_TENSOR_OUTPUT_NORM, "token_embd_norm" }, // note: wrong tensor name
  { LLM_TENSOR_OUTPUT, "output" },
  }
  },
@@ -2259,7 +2292,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
  { LLM_TENSOR_SHORTCONV_INPROJ, "blk.%d.shortconv.in_proj" },
  { LLM_TENSOR_SHORTCONV_OUTPROJ, "blk.%d.shortconv.out_proj" },
  { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
- { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+ { LLM_TENSOR_OUTPUT_NORM, "token_embd_norm" }, // note: wrong tensor name
  { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
  { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
  { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
@@ -2487,11 +2520,21 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
  },
  };
 
+ // declare information about the model weight tensors:
+ // - the layer in which the tensor is going to be used. this is needed in order to assign the correct buffer type for the weight
+ // - the operator which is going to use the weight. this is needed to determine if the respective backend supports the operator
+ //
+ // for example, input layers are usually assigned to CPU/host buffer types
+ //
+ // a mismatch between the declared information and the actual layer/op in which the tensor is used can lead to sub-optimal
+ // assignment of the buffer types and extra overhead during computation
+ // example: https://github.com/ggml-org/llama.cpp/pull/17548
+ //
  static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
  {LLM_TENSOR_TOKEN_EMBD, {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}},
  {LLM_TENSOR_POS_EMBD, {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}},
- {LLM_TENSOR_TOKEN_EMBD_NORM, {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}},
  {LLM_TENSOR_TOKEN_TYPES, {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}},
+ {LLM_TENSOR_TOKEN_EMBD_NORM, {LLM_TENSOR_LAYER_INPUT, GGML_OP_MUL}},
  {LLM_TENSOR_OUTPUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
  {LLM_TENSOR_CLS, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
  {LLM_TENSOR_CLS_OUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
@@ -2546,6 +2589,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
  {LLM_TENSOR_SSM_X, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
  {LLM_TENSOR_SSM_DT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
  {LLM_TENSOR_SSM_OUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_SSM_BETA_ALPHA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
  {LLM_TENSOR_TIME_MIX_W1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
  {LLM_TENSOR_TIME_MIX_W2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
  {LLM_TENSOR_TIME_MIX_A1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
@@ -2744,6 +2788,7 @@ bool llm_arch_is_hybrid(const llm_arch & arch) {
  case LLM_ARCH_LFM2:
  case LLM_ARCH_LFM2MOE:
  case LLM_ARCH_NEMOTRON_H:
+ case LLM_ARCH_QWEN3NEXT:
  return true;
  default:
  return false;
package/src/llama.cpp/src/llama-arch.h
@@ -36,6 +36,7 @@ enum llm_arch {
  LLM_ARCH_QWEN2VL,
  LLM_ARCH_QWEN3,
  LLM_ARCH_QWEN3MOE,
+ LLM_ARCH_QWEN3NEXT,
  LLM_ARCH_QWEN3VL,
  LLM_ARCH_QWEN3VLMOE,
  LLM_ARCH_PHI2,
@@ -381,6 +382,7 @@ enum llm_tensor {
  LLM_TENSOR_SSM_D,
  LLM_TENSOR_SSM_NORM,
  LLM_TENSOR_SSM_OUT,
+ LLM_TENSOR_SSM_BETA_ALPHA, // qwen3next
  LLM_TENSOR_TIME_MIX_W0,
  LLM_TENSOR_TIME_MIX_W1,
  LLM_TENSOR_TIME_MIX_W2,
package/src/llama.cpp/src/llama-context.cpp
@@ -1,5 +1,6 @@
  #include "llama-context.h"
 
+ #include "llama-arch.h"
  #include "llama-impl.h"
  #include "llama-batch.h"
  #include "llama-io.h"
@@ -299,7 +300,7 @@ llama_context::llama_context(
 
  cross.v_embd.clear();
 
- const uint32_t n_seqs = cparams.kv_unified ? 1 : cparams.n_seq_max;
+ const uint32_t n_seqs = cparams.n_seq_max;
  const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
 
  // avoid reserving graphs with zero outputs - assume one output per sequence
@@ -542,7 +543,7 @@ bool llama_context::memory_update(bool optimize) {
  throw std::runtime_error("failed to initialize memory context");
  }
 
- const uint32_t n_seqs = cparams.kv_unified ? 1 : cparams.n_seq_max;
+ const uint32_t n_seqs = cparams.n_seq_max;
  const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
 
  auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
@@ -1386,6 +1387,9 @@ void llama_context::output_reorder() {
  //
 
  uint32_t llama_context::graph_max_nodes() const {
+ if (model.arch == LLM_ARCH_QWEN3NEXT) {
+ return std::max<uint32_t>(8192u, 32u*model.n_tensors());
+ }
  return std::max<uint32_t>(1024u, 8u*model.n_tensors());
  }
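For a sense of scale (illustrative numbers only): a model with roughly 1,000 weight tensors gets max(1024, 8 × 1000) = 8,000 reserved graph nodes on the default path, while a Qwen3 Next model of the same size is allowed max(8192, 32 × 1000) = 32,000, leaving room for the much larger per-layer graphs its hybrid recurrent/attention blocks build.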
 
package/src/llama.cpp/src/llama-hparams.h
@@ -6,7 +6,7 @@
 
  // bump if necessary
  #define LLAMA_MAX_LAYERS 512
- #define LLAMA_MAX_EXPERTS 384 // Kimi-K2
+ #define LLAMA_MAX_EXPERTS 512 // Qwen3 Next
 
  enum llama_expert_gating_func_type {
  LLAMA_EXPERT_GATING_FUNC_TYPE_NONE = 0,