@fugood/llama.node 1.3.7 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. package/lib/binding.js +18 -1
  2. package/lib/binding.ts +19 -1
  3. package/lib/index.js +3 -3
  4. package/lib/index.ts +1 -1
  5. package/package.json +15 -15
  6. package/scripts/llama.cpp.patch +7 -7
  7. package/src/LlamaCompletionWorker.cpp +2 -2
  8. package/src/llama.cpp/common/arg.cpp +27 -2
  9. package/src/llama.cpp/common/chat-parser.cpp +968 -0
  10. package/src/llama.cpp/common/chat.cpp +0 -952
  11. package/src/llama.cpp/common/common.cpp +55 -0
  12. package/src/llama.cpp/common/common.h +18 -0
  13. package/src/llama.cpp/common/json-schema-to-grammar.cpp +2 -2
  14. package/src/llama.cpp/ggml/CMakeLists.txt +6 -4
  15. package/src/llama.cpp/ggml/include/ggml-rpc.h +1 -1
  16. package/src/llama.cpp/ggml/include/ggml.h +12 -4
  17. package/src/llama.cpp/ggml/src/CMakeLists.txt +26 -4
  18. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +29 -15
  19. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +721 -0
  20. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp +38 -0
  21. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +22 -2
  22. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +9 -0
  23. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +71 -4
  24. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
  25. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +243 -4
  26. package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +6 -0
  27. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +84 -85
  28. package/src/llama.cpp/include/llama.h +18 -0
  29. package/src/llama.cpp/src/CMakeLists.txt +2 -0
  30. package/src/llama.cpp/src/llama-arch.cpp +95 -16
  31. package/src/llama.cpp/src/llama-arch.h +15 -0
  32. package/src/llama.cpp/src/llama-context.cpp +7 -3
  33. package/src/llama.cpp/src/llama-graph.cpp +3 -3
  34. package/src/llama.cpp/src/llama-hparams.h +1 -1
  35. package/src/llama.cpp/src/llama-model.cpp +141 -6
  36. package/src/llama.cpp/src/llama-model.h +4 -0
  37. package/src/llama.cpp/src/llama-quant.cpp +13 -5
  38. package/src/llama.cpp/src/models/lfm2.cpp +5 -3
  39. package/src/llama.cpp/src/models/models.h +55 -1
  40. package/src/llama.cpp/src/models/qwen3next.cpp +1042 -0
  41. package/src/llama.cpp/src/models/rnd1.cpp +126 -0

package/src/llama.cpp/ggml/src/ggml-cpu/repack.h

@@ -80,10 +80,12 @@ extern "C" {
 
 void ggml_quantize_mat_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
 void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
+void ggml_quantize_mat_q8_K_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
 void ggml_quantize_mat_q8_K_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
 void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemv_q4_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
@@ -91,6 +93,7 @@ void ggml_gemv_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
 void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemm_q4_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
@@ -99,10 +102,12 @@ void ggml_gemm_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
 // Native implementations
 void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
 void ggml_quantize_mat_q8_0_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
+void ggml_quantize_mat_q8_K_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
 void ggml_quantize_mat_q8_K_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
 void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemv_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
@@ -110,6 +115,7 @@ void ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
 void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemm_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);

package/src/llama.cpp/ggml/src/ggml-cpu/vec.h

@@ -397,119 +397,118 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const
 }
 
 inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, const ggml_fp16_t * GGML_RESTRICT x, const float v) {
-#if defined(GGML_SIMD)
-    #if defined(__ARM_FEATURE_SVE)
-        const int sve_register_length = svcntb() * 8;
-        const int ggml_f16_epr = sve_register_length / 16;
-        const int ggml_f16_step = 8 * ggml_f16_epr;
+#if defined(GGML_SIMD) && defined(__ARM_FEATURE_SVE)
+    const int sve_register_length = svcntb() * 8;
+    const int ggml_f16_epr = sve_register_length / 16;
+    const int ggml_f16_step = 8 * ggml_f16_epr;
 
-        GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v);
+    GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v);
 
-        const int np= (n & ~(ggml_f16_step - 1));
+    int np = (n & ~(ggml_f16_step - 1));
 
-        svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
-        svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
-        for (int i = 0; i < np; i += ggml_f16_step) {
-            ax1 = GGML_F16x_VEC_LOAD(x + i + 0 * ggml_f16_epr, 0);
-            ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0);
-            ay1 = GGML_F16x_VEC_FMA(ay1, ax1, vx);
+    svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
+    svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
+    for (int i = 0; i < np; i += ggml_f16_step) {
+        ax1 = GGML_F16x_VEC_LOAD(x + i + 0 * ggml_f16_epr, 0);
+        ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0);
+        ay1 = GGML_F16x_VEC_FMA(ay1, ax1, vx);
 
-            GGML_F16x_VEC_STORE(y + i + 0 * ggml_f16_epr, ay1, 0);
+        GGML_F16x_VEC_STORE(y + i + 0 * ggml_f16_epr, ay1, 0);
 
-            ax2 = GGML_F16x_VEC_LOAD(x + i + 1 * ggml_f16_epr, 1);
-            ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1);
-            ay2 = GGML_F16x_VEC_FMA(ay2, ax2, vx);
+        ax2 = GGML_F16x_VEC_LOAD(x + i + 1 * ggml_f16_epr, 1);
+        ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1);
+        ay2 = GGML_F16x_VEC_FMA(ay2, ax2, vx);
 
-            GGML_F16x_VEC_STORE(y + i + 1 * ggml_f16_epr, ay2, 1);
+        GGML_F16x_VEC_STORE(y + i + 1 * ggml_f16_epr, ay2, 1);
 
-            ax3 = GGML_F16x_VEC_LOAD(x + i + 2 * ggml_f16_epr, 2);
-            ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);
-            ay3 = GGML_F16x_VEC_FMA(ay3, ax3, vx);
+        ax3 = GGML_F16x_VEC_LOAD(x + i + 2 * ggml_f16_epr, 2);
+        ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);
+        ay3 = GGML_F16x_VEC_FMA(ay3, ax3, vx);
 
-            GGML_F16x_VEC_STORE(y + i + 2 * ggml_f16_epr, ay3, 2);
+        GGML_F16x_VEC_STORE(y + i + 2 * ggml_f16_epr, ay3, 2);
 
-            ax4 = GGML_F16x_VEC_LOAD(x + i + 3 * ggml_f16_epr, 3);
-            ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
-            ay4 = GGML_F16x_VEC_FMA(ay4, ax4, vx);
+        ax4 = GGML_F16x_VEC_LOAD(x + i + 3 * ggml_f16_epr, 3);
+        ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
+        ay4 = GGML_F16x_VEC_FMA(ay4, ax4, vx);
 
-            GGML_F16x_VEC_STORE(y + i + 3 * ggml_f16_epr, ay4, 3);
+        GGML_F16x_VEC_STORE(y + i + 3 * ggml_f16_epr, ay4, 3);
 
-            ax5 = GGML_F16x_VEC_LOAD(x + i + 4 * ggml_f16_epr, 4);
-            ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);
-            ay5 = GGML_F16x_VEC_FMA(ay5, ax5, vx);
+        ax5 = GGML_F16x_VEC_LOAD(x + i + 4 * ggml_f16_epr, 4);
+        ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);
+        ay5 = GGML_F16x_VEC_FMA(ay5, ax5, vx);
 
-            GGML_F16x_VEC_STORE(y + i + 4 * ggml_f16_epr, ay5, 4);
+        GGML_F16x_VEC_STORE(y + i + 4 * ggml_f16_epr, ay5, 4);
 
-            ax6 = GGML_F16x_VEC_LOAD(x + i + 5 * ggml_f16_epr, 5);
-            ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);
-            ay6 = GGML_F16x_VEC_FMA(ay6, ax6, vx);
+        ax6 = GGML_F16x_VEC_LOAD(x + i + 5 * ggml_f16_epr, 5);
+        ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);
+        ay6 = GGML_F16x_VEC_FMA(ay6, ax6, vx);
 
-            GGML_F16x_VEC_STORE(y + i + 5 * ggml_f16_epr, ay6, 5);
+        GGML_F16x_VEC_STORE(y + i + 5 * ggml_f16_epr, ay6, 5);
 
-            ax7 = GGML_F16x_VEC_LOAD(x + i + 6 * ggml_f16_epr, 6);
-            ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);
-            ay7 = GGML_F16x_VEC_FMA(ay7, ax7, vx);
+        ax7 = GGML_F16x_VEC_LOAD(x + i + 6 * ggml_f16_epr, 6);
+        ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);
+        ay7 = GGML_F16x_VEC_FMA(ay7, ax7, vx);
 
-            GGML_F16x_VEC_STORE(y + i + 6 * ggml_f16_epr, ay7, 6);
+        GGML_F16x_VEC_STORE(y + i + 6 * ggml_f16_epr, ay7, 6);
 
-            ax8 = GGML_F16x_VEC_LOAD(x + i + 7 * ggml_f16_epr, 7);
-            ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);
-            ay8 = GGML_F16x_VEC_FMA(ay8, ax8, vx);
+        ax8 = GGML_F16x_VEC_LOAD(x + i + 7 * ggml_f16_epr, 7);
+        ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);
+        ay8 = GGML_F16x_VEC_FMA(ay8, ax8, vx);
 
-            GGML_F16x_VEC_STORE(y + i + 7 * ggml_f16_epr, ay8, 7);
-        }
-        const int np2 = (n & ~(ggml_f16_epr - 1));
-        for (int k = np; k < np2; k += ggml_f16_epr) {
-            svfloat16_t rx = GGML_F16x_VEC_LOAD(x + k, 0);
-            svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
-            ry = GGML_F16x_VEC_FMA(ry, rx, vx);
-
-            GGML_F16x_VEC_STORE(y + k, ry, 0);
-        }
-
-        if (np2 < n) {
-            svbool_t pg = svwhilelt_b16(np2, n);
-            svfloat16_t hx = svld1_f16(pg, (const __fp16 *)(x + np2));
-            svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
-            hy = svmad_f16_x(pg, hx, vx, hy);
-            svst1_f16(pg, (__fp16 *)(y + np2), hy);
-        }
+        GGML_F16x_VEC_STORE(y + i + 7 * ggml_f16_epr, ay8, 7);
+    }
+    const int np2 = (n & ~(ggml_f16_epr - 1));
+    for (int k = np; k < np2; k += ggml_f16_epr) {
+        svfloat16_t rx = GGML_F16x_VEC_LOAD(x + k, 0);
+        svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
+        ry = GGML_F16x_VEC_FMA(ry, rx, vx);
 
-    #elif defined(__riscv_v_intrinsic)
-        // todo: RVV impl
-        // scalar
-        for (int i = 0; i < n; ++i) {
-            y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
-        }
-    #else
-        const int np = (n & ~(GGML_F16_STEP - 1));
+        GGML_F16x_VEC_STORE(y + k, ry, 0);
+    }
 
-        GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
+    if (np2 < n) {
+        svbool_t pg = svwhilelt_b16(np2, n);
+        svfloat16_t hx = svld1_f16(pg, (const __fp16 *)(x + np2));
+        svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
+        hy = svmad_f16_x(pg, hx, vx, hy);
+        svst1_f16(pg, (__fp16 *)(y + np2), hy);
+    }
+    np = n;
+#elif defined(__riscv_zvfh) // implies __riscv_v_intrinsic
+    const int np = n;
+    _Float16 hv = (_Float16)v;
+    for (int i = 0, avl; i < n; i += avl) {
+        avl = __riscv_vsetvl_e16m8(n - i);
+        vfloat16m8_t ax = __riscv_vle16_v_f16m8((const _Float16 *)&x[i], avl);
+        vfloat16m8_t ay = __riscv_vle16_v_f16m8((_Float16 *)&y[i], avl);
+        vfloat16m8_t ny = __riscv_vfmadd_vf_f16m8(ax, hv, ay, avl);
+        __riscv_vse16_v_f16m8((_Float16 *)&y[i], ny, avl);
+    }
+#elif defined(GGML_SIMD)
+    const int np = (n & ~(GGML_F16_STEP - 1));
 
-        GGML_F16_VEC ax[GGML_F16_ARR];
-        GGML_F16_VEC ay[GGML_F16_ARR];
+    GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
 
-        for (int i = 0; i < np; i += GGML_F16_STEP) {
-            for (int j = 0; j < GGML_F16_ARR; j++) {
-                ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
-                ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
-                ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx);
+    GGML_F16_VEC ax[GGML_F16_ARR];
+    GGML_F16_VEC ay[GGML_F16_ARR];
 
-                GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
-            }
-        }
+    for (int i = 0; i < np; i += GGML_F16_STEP) {
+        for (int j = 0; j < GGML_F16_ARR; j++) {
+            ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
+            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
+            ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx);
 
-        // leftovers
-        for (int i = np; i < n; ++i) {
-            y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
+            GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
         }
-    #endif
+    }
 #else
-    // scalar
-    for (int i = 0; i < n; ++i) {
+    const int np = 0;
+#endif
+
+    // leftovers
+    for (int i = np; i < n; ++i) {
         y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
     }
-#endif
 }
 
 // xs and vs are byte strides of x and v
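
Editorial note (not part of the published diff): the restructuring above makes each SIMD branch record in np how many leading elements it has already handled, so one shared scalar loop finishes the tail. A minimal sketch of that control-flow pattern, with the vector bodies elided and a hypothetical function name:

    // Sketch only: each branch covers [0, np) with vector code; the shared loop covers [np, n).
    inline static void vec_mad_f16_sketch(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const float v) {
    #if defined(GGML_SIMD) && defined(__ARM_FEATURE_SVE)
        int np = n;                                // SVE path handles everything, including a predicated tail
        // ... SVE body elided ...
    #elif defined(__riscv_zvfh)
        const int np = n;                          // RVV path strip-mines all n elements via vsetvl
        // ... RVV body elided ...
    #elif defined(GGML_SIMD)
        const int np = (n & ~(GGML_F16_STEP - 1)); // generic SIMD handles whole GGML_F16_STEP blocks only
        // ... generic SIMD body elided ...
    #else
        const int np = 0;                          // no SIMD: everything falls through to the scalar loop
    #endif
        // shared scalar leftovers
        for (int i = np; i < n; ++i) {
            y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
        }
    }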

package/src/llama.cpp/include/llama.h

@@ -246,6 +246,21 @@ extern "C" {
         LLAMA_KV_OVERRIDE_TYPE_STR,
     };
 
+    enum llama_model_meta_key {
+        LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE,
+        LLAMA_MODEL_META_KEY_SAMPLING_TOP_K,
+        LLAMA_MODEL_META_KEY_SAMPLING_TOP_P,
+        LLAMA_MODEL_META_KEY_SAMPLING_MIN_P,
+        LLAMA_MODEL_META_KEY_SAMPLING_XTC_PROBABILITY,
+        LLAMA_MODEL_META_KEY_SAMPLING_XTC_THRESHOLD,
+        LLAMA_MODEL_META_KEY_SAMPLING_TEMP,
+        LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_LAST_N,
+        LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_REPEAT,
+        LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT,
+        LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_TAU,
+        LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA,
+    };
+
     struct llama_model_kv_override {
         enum llama_model_kv_override_type tag;
 
@@ -518,6 +533,9 @@ extern "C" {
     // Get the number of metadata key/value pairs
     LLAMA_API int32_t llama_model_meta_count(const struct llama_model * model);
 
+    // Get sampling metadata key name. Returns nullptr if the key is invalid
+    LLAMA_API const char * llama_model_meta_key_str(enum llama_model_meta_key key);
+
     // Get metadata key name by index
     LLAMA_API int32_t llama_model_meta_key_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size);
 
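Editorial note (not part of the published diff): combined with the existing llama_model_meta_val_str(), the new llama_model_meta_key_str() lets callers look up any sampling defaults a GGUF file declares under general.sampling.*. A minimal sketch assuming the key-to-string mapping added in llama-arch.cpp below; error handling is kept deliberately small:

    #include <stdio.h>
    #include "llama.h"

    // Print the model's suggested sampling temperature, if the GGUF carries one (sketch only).
    static void print_suggested_temp(const struct llama_model * model) {
        const char * key = llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_TEMP); // expected: "general.sampling.temp"
        char val[64];
        if (key != NULL && llama_model_meta_val_str(model, key, val, sizeof(val)) >= 0) {
            printf("model suggests temp = %s\n", val);
        } else {
            printf("no sampling temperature metadata present\n");
        }
    }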
 
package/src/llama.cpp/src/CMakeLists.txt

@@ -114,7 +114,9 @@ add_library(llama
             models/qwen3vl.cpp
             models/qwen3vl-moe.cpp
             models/qwen3moe.cpp
+            models/qwen3next.cpp
             models/refact.cpp
+            models/rnd1.cpp
             models/rwkv6-base.cpp
             models/rwkv6.cpp
             models/rwkv6qwen2.cpp

package/src/llama.cpp/src/llama-arch.cpp

@@ -32,6 +32,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_QWEN2VL,     "qwen2vl"        },
     { LLM_ARCH_QWEN3,       "qwen3"          },
     { LLM_ARCH_QWEN3MOE,    "qwen3moe"       },
+    { LLM_ARCH_QWEN3NEXT,   "qwen3next"      },
     { LLM_ARCH_QWEN3VL,     "qwen3vl"        },
     { LLM_ARCH_QWEN3VLMOE,  "qwen3vlmoe"     },
     { LLM_ARCH_PHI2,        "phi2"           },
@@ -108,24 +109,37 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_APERTUS,     "apertus"        },
     { LLM_ARCH_MINIMAX_M2,  "minimax-m2"     },
     { LLM_ARCH_COGVLM,      "cogvlm"         },
+    { LLM_ARCH_RND1,        "rnd1"           },
     { LLM_ARCH_PANGU_EMBED, "pangu-embedded" },
     { LLM_ARCH_UNKNOWN,     "(unknown)"      },
 };
 
 static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
-    { LLM_KV_GENERAL_TYPE,                 "general.type"                          },
-    { LLM_KV_GENERAL_ARCHITECTURE,         "general.architecture"                  },
-    { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version"          },
-    { LLM_KV_GENERAL_ALIGNMENT,            "general.alignment"                     },
-    { LLM_KV_GENERAL_FILE_TYPE,            "general.file_type"                     },
-    { LLM_KV_GENERAL_NAME,                 "general.name"                          },
-    { LLM_KV_GENERAL_AUTHOR,               "general.author"                        },
-    { LLM_KV_GENERAL_VERSION,              "general.version"                       },
-    { LLM_KV_GENERAL_URL,                  "general.url"                           },
-    { LLM_KV_GENERAL_DESCRIPTION,          "general.description"                   },
-    { LLM_KV_GENERAL_LICENSE,              "general.license"                       },
-    { LLM_KV_GENERAL_SOURCE_URL,           "general.source.url"                    },
-    { LLM_KV_GENERAL_SOURCE_HF_REPO,       "general.source.huggingface.repository" },
+    { LLM_KV_GENERAL_TYPE,                     "general.type"                          },
+    { LLM_KV_GENERAL_ARCHITECTURE,             "general.architecture"                  },
+    { LLM_KV_GENERAL_QUANTIZATION_VERSION,     "general.quantization_version"          },
+    { LLM_KV_GENERAL_ALIGNMENT,                "general.alignment"                     },
+    { LLM_KV_GENERAL_FILE_TYPE,                "general.file_type"                     },
+    { LLM_KV_GENERAL_SAMPLING_SEQUENCE,        "general.sampling.sequence"             },
+    { LLM_KV_GENERAL_SAMPLING_TOP_K,           "general.sampling.top_k"                },
+    { LLM_KV_GENERAL_SAMPLING_TOP_P,           "general.sampling.top_p"                },
+    { LLM_KV_GENERAL_SAMPLING_MIN_P,           "general.sampling.min_p"                },
+    { LLM_KV_GENERAL_SAMPLING_XTC_PROBABILITY, "general.sampling.xtc_probability"      },
+    { LLM_KV_GENERAL_SAMPLING_XTC_THRESHOLD,   "general.sampling.xtc_threshold"        },
+    { LLM_KV_GENERAL_SAMPLING_TEMP,            "general.sampling.temp"                 },
+    { LLM_KV_GENERAL_SAMPLING_PENALTY_LAST_N,  "general.sampling.penalty_last_n"       },
+    { LLM_KV_GENERAL_SAMPLING_PENALTY_REPEAT,  "general.sampling.penalty_repeat"       },
+    { LLM_KV_GENERAL_SAMPLING_MIROSTAT,        "general.sampling.mirostat"             },
+    { LLM_KV_GENERAL_SAMPLING_MIROSTAT_TAU,    "general.sampling.mirostat_tau"         },
+    { LLM_KV_GENERAL_SAMPLING_MIROSTAT_ETA,    "general.sampling.mirostat_eta"         },
+    { LLM_KV_GENERAL_NAME,                     "general.name"                          },
+    { LLM_KV_GENERAL_AUTHOR,                   "general.author"                        },
+    { LLM_KV_GENERAL_VERSION,                  "general.version"                       },
+    { LLM_KV_GENERAL_URL,                      "general.url"                           },
+    { LLM_KV_GENERAL_DESCRIPTION,              "general.description"                   },
+    { LLM_KV_GENERAL_LICENSE,                  "general.license"                       },
+    { LLM_KV_GENERAL_SOURCE_URL,               "general.source.url"                    },
+    { LLM_KV_GENERAL_SOURCE_HF_REPO,           "general.source.huggingface.repository" },
 
     { LLM_KV_VOCAB_SIZE,                       "%s.vocab_size"                         },
     { LLM_KV_CONTEXT_LENGTH,                   "%s.context_length"                     },
@@ -816,6 +830,38 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP_EXPS,        "blk.%d.ffn_up_exps" },
         },
     },
+    {
+        LLM_ARCH_QWEN3NEXT,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,         "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,        "output_norm" },
+            { LLM_TENSOR_OUTPUT,             "output" },
+            { LLM_TENSOR_ATTN_NORM,          "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_POST_NORM,     "blk.%d.post_attention_norm" },
+            { LLM_TENSOR_ATTN_Q,             "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_NORM,        "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K,             "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_K_NORM,        "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_V,             "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,           "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,           "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE_INP,       "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS,      "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS,      "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,        "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
+            { LLM_TENSOR_FFN_GATE_SHEXP,     "blk.%d.ffn_gate_shexp" },
+            { LLM_TENSOR_FFN_DOWN_SHEXP,     "blk.%d.ffn_down_shexp" },
+            { LLM_TENSOR_FFN_UP_SHEXP,       "blk.%d.ffn_up_shexp" },
+            { LLM_TENSOR_SSM_A,              "blk.%d.ssm_a" },
+            { LLM_TENSOR_SSM_CONV1D,         "blk.%d.ssm_conv1d" },
+            { LLM_TENSOR_SSM_DT,             "blk.%d.ssm_dt" },
+            { LLM_TENSOR_SSM_BETA_ALPHA,     "blk.%d.ssm_ba" },
+            { LLM_TENSOR_SSM_IN,             "blk.%d.ssm_in" },
+            { LLM_TENSOR_SSM_NORM,           "blk.%d.ssm_norm" },
+            { LLM_TENSOR_SSM_OUT,            "blk.%d.ssm_out" },
+        },
+    },
     {
         LLM_ARCH_QWEN3VL,
         {
@@ -2224,7 +2270,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_SHORTCONV_INPROJ,  "blk.%d.shortconv.in_proj" },
             { LLM_TENSOR_SHORTCONV_OUTPROJ, "blk.%d.shortconv.out_proj" },
             { LLM_TENSOR_TOKEN_EMBD,        "token_embd" },
-            { LLM_TENSOR_TOKEN_EMBD_NORM,   "token_embd_norm" },
+            { LLM_TENSOR_OUTPUT_NORM,       "token_embd_norm" }, // note: wrong tensor name
             { LLM_TENSOR_OUTPUT,            "output" },
         }
     },
@@ -2246,7 +2292,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_SHORTCONV_INPROJ,  "blk.%d.shortconv.in_proj" },
             { LLM_TENSOR_SHORTCONV_OUTPROJ, "blk.%d.shortconv.out_proj" },
             { LLM_TENSOR_TOKEN_EMBD,        "token_embd" },
-            { LLM_TENSOR_TOKEN_EMBD_NORM,   "token_embd_norm" },
+            { LLM_TENSOR_OUTPUT_NORM,       "token_embd_norm" }, // note: wrong tensor name
             { LLM_TENSOR_FFN_GATE_INP,      "blk.%d.ffn_gate_inp" },
             { LLM_TENSOR_FFN_GATE_EXPS,     "blk.%d.ffn_gate_exps" },
             { LLM_TENSOR_FFN_DOWN_EXPS,     "blk.%d.ffn_down_exps" },
@@ -2446,6 +2492,26 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_VISEXP_FFN_UP,   "blk.%d.vis_up" },
         },
     },
+    {
+        LLM_ARCH_RND1,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_NORM,     "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_K_NORM,     "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE_INP,    "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS,   "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS,   "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,     "blk.%d.ffn_up_exps" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -2454,11 +2520,21 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
     },
 };
 
+// declare information about the model weight tensors:
+//  - the layer in which the tensor is going to be used. this is needed in order to assign the correct buffer type for the weight
+//  - the operator which is going to use the weight. this is needed to determine if the respective backend supports the operator
+//
+// for example, input layers are usually assigned to CPU/host buffer types
+//
+// a mismatch between the declared information and the actual layer/op in which the tensor is used can lead to sub-optimal
+// assignment of the buffer types and extra overhead during computation
+// example: https://github.com/ggml-org/llama.cpp/pull/17548
+//
 static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_TOKEN_EMBD,      {LLM_TENSOR_LAYER_INPUT,     GGML_OP_GET_ROWS}},
     {LLM_TENSOR_POS_EMBD,        {LLM_TENSOR_LAYER_INPUT,     GGML_OP_GET_ROWS}},
-    {LLM_TENSOR_TOKEN_EMBD_NORM, {LLM_TENSOR_LAYER_INPUT,     GGML_OP_GET_ROWS}},
     {LLM_TENSOR_TOKEN_TYPES,     {LLM_TENSOR_LAYER_INPUT,     GGML_OP_GET_ROWS}},
+    {LLM_TENSOR_TOKEN_EMBD_NORM, {LLM_TENSOR_LAYER_INPUT,     GGML_OP_MUL}},
     {LLM_TENSOR_OUTPUT,          {LLM_TENSOR_LAYER_OUTPUT,    GGML_OP_MUL_MAT}},
     {LLM_TENSOR_CLS,             {LLM_TENSOR_LAYER_OUTPUT,    GGML_OP_MUL_MAT}},
    {LLM_TENSOR_CLS_OUT,          {LLM_TENSOR_LAYER_OUTPUT,    GGML_OP_MUL_MAT}},
@@ -2513,6 +2589,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_SSM_X,           {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_SSM_DT,          {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_SSM_OUT,         {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_SSM_BETA_ALPHA,  {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_TIME_MIX_W1,     {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_TIME_MIX_W2,     {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_TIME_MIX_A1,     {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
@@ -2711,6 +2788,7 @@ bool llm_arch_is_hybrid(const llm_arch & arch) {
         case LLM_ARCH_LFM2:
         case LLM_ARCH_LFM2MOE:
         case LLM_ARCH_NEMOTRON_H:
+        case LLM_ARCH_QWEN3NEXT:
             return true;
         default:
            return false;
@@ -2722,6 +2800,7 @@ bool llm_arch_is_diffusion(const llm_arch & arch) {
         case LLM_ARCH_DREAM:
         case LLM_ARCH_LLADA:
         case LLM_ARCH_LLADA_MOE:
+        case LLM_ARCH_RND1:
             return true;
         default:
             return false;

package/src/llama.cpp/src/llama-arch.h

@@ -36,6 +36,7 @@ enum llm_arch {
     LLM_ARCH_QWEN2VL,
     LLM_ARCH_QWEN3,
     LLM_ARCH_QWEN3MOE,
+    LLM_ARCH_QWEN3NEXT,
     LLM_ARCH_QWEN3VL,
     LLM_ARCH_QWEN3VLMOE,
     LLM_ARCH_PHI2,
@@ -112,6 +113,7 @@ enum llm_arch {
     LLM_ARCH_APERTUS,
     LLM_ARCH_MINIMAX_M2,
     LLM_ARCH_COGVLM,
+    LLM_ARCH_RND1,
     LLM_ARCH_PANGU_EMBED,
     LLM_ARCH_UNKNOWN,
 };
@@ -122,6 +124,18 @@ enum llm_kv {
     LLM_KV_GENERAL_QUANTIZATION_VERSION,
     LLM_KV_GENERAL_ALIGNMENT,
     LLM_KV_GENERAL_FILE_TYPE,
+    LLM_KV_GENERAL_SAMPLING_SEQUENCE,
+    LLM_KV_GENERAL_SAMPLING_TOP_K,
+    LLM_KV_GENERAL_SAMPLING_TOP_P,
+    LLM_KV_GENERAL_SAMPLING_MIN_P,
+    LLM_KV_GENERAL_SAMPLING_XTC_PROBABILITY,
+    LLM_KV_GENERAL_SAMPLING_XTC_THRESHOLD,
+    LLM_KV_GENERAL_SAMPLING_TEMP,
+    LLM_KV_GENERAL_SAMPLING_PENALTY_LAST_N,
+    LLM_KV_GENERAL_SAMPLING_PENALTY_REPEAT,
+    LLM_KV_GENERAL_SAMPLING_MIROSTAT,
+    LLM_KV_GENERAL_SAMPLING_MIROSTAT_TAU,
+    LLM_KV_GENERAL_SAMPLING_MIROSTAT_ETA,
     LLM_KV_GENERAL_NAME,
     LLM_KV_GENERAL_AUTHOR,
     LLM_KV_GENERAL_VERSION,
@@ -368,6 +382,7 @@ enum llm_tensor {
     LLM_TENSOR_SSM_D,
     LLM_TENSOR_SSM_NORM,
     LLM_TENSOR_SSM_OUT,
+    LLM_TENSOR_SSM_BETA_ALPHA, // qwen3next
     LLM_TENSOR_TIME_MIX_W0,
     LLM_TENSOR_TIME_MIX_W1,
     LLM_TENSOR_TIME_MIX_W2,

package/src/llama.cpp/src/llama-context.cpp

@@ -1,5 +1,6 @@
 #include "llama-context.h"
 
+#include "llama-arch.h"
 #include "llama-impl.h"
 #include "llama-batch.h"
 #include "llama-io.h"
@@ -299,7 +300,7 @@ llama_context::llama_context(
 
     cross.v_embd.clear();
 
-    const uint32_t n_seqs = cparams.kv_unified ? 1 : cparams.n_seq_max;
+    const uint32_t n_seqs = cparams.n_seq_max;
     const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
 
     // avoid reserving graphs with zero outputs - assume one output per sequence
@@ -542,7 +543,7 @@ bool llama_context::memory_update(bool optimize) {
             throw std::runtime_error("failed to initialize memory context");
         }
 
-        const uint32_t n_seqs = cparams.kv_unified ? 1 : cparams.n_seq_max;
+        const uint32_t n_seqs = cparams.n_seq_max;
         const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
 
         auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
@@ -1248,7 +1249,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
 
     // make the outputs have the same order they had in the user-provided batch
     // note: this is mostly relevant for recurrent models atm
-    if (!sorted_output) {
+    if (!sorted_output && n_outputs > 1) {
         GGML_ASSERT((size_t) n_outputs == out_ids.size());
 
         // TODO: is there something more efficient which also minimizes swaps?
@@ -1386,6 +1387,9 @@ void llama_context::output_reorder() {
 //
 
 uint32_t llama_context::graph_max_nodes() const {
+    if (model.arch == LLM_ARCH_QWEN3NEXT) {
+        return std::max<uint32_t>(8192u, 32u*model.n_tensors());
+    }
     return std::max<uint32_t>(1024u, 8u*model.n_tensors());
 }
 

package/src/llama.cpp/src/llama-graph.cpp

@@ -961,14 +961,14 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
         // organize experts into n_expert_groups
         ggml_tensor * selection_groups = ggml_reshape_3d(ctx0, selection_probs, n_exp_per_group, hparams.n_expert_groups, n_tokens); // [n_exp_per_group, n_expert_groups, n_tokens]
 
-        ggml_tensor * group_scores = ggml_top_k(ctx0, selection_groups, 2); // [2, n_expert_groups, n_tokens]
+        ggml_tensor * group_scores = ggml_argsort_top_k(ctx0, selection_groups, 2); // [2, n_expert_groups, n_tokens]
         group_scores = ggml_get_rows(ctx0, ggml_reshape_4d(ctx0, selection_groups, 1, selection_groups->ne[0], selection_groups->ne[1], selection_groups->ne[2]), group_scores); // [1, 2, n_expert_groups, n_tokens]
 
         // get top n_group_used expert groups
         group_scores = ggml_sum_rows(ctx0, ggml_reshape_3d(ctx0, group_scores, group_scores->ne[1], group_scores->ne[2], group_scores->ne[3])); // [1, n_expert_groups, n_tokens]
         group_scores = ggml_reshape_2d(ctx0, group_scores, group_scores->ne[1], group_scores->ne[2]); // [n_expert_groups, n_tokens]
 
-        ggml_tensor * expert_groups = ggml_top_k(ctx0, group_scores, hparams.n_group_used); // [n_group_used, n_tokens]
+        ggml_tensor * expert_groups = ggml_argsort_top_k(ctx0, group_scores, hparams.n_group_used); // [n_group_used, n_tokens]
         cb(expert_groups, "ffn_moe_group_topk", il);
 
         // mask out the other groups
@@ -979,7 +979,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
     }
 
     // select experts
-    ggml_tensor * selected_experts = ggml_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
+    ggml_tensor * selected_experts = ggml_argsort_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
     cb(selected_experts->src[0], "ffn_moe_argsort", il);
     cb(selected_experts, "ffn_moe_topk", il);
 

package/src/llama.cpp/src/llama-hparams.h

@@ -6,7 +6,7 @@
 
 // bump if necessary
 #define LLAMA_MAX_LAYERS  512
-#define LLAMA_MAX_EXPERTS 384 // Kimi-K2
+#define LLAMA_MAX_EXPERTS 512 // Qwen3 Next
 
 enum llama_expert_gating_func_type {
     LLAMA_EXPERT_GATING_FUNC_TYPE_NONE = 0,