@fugood/llama.node 1.3.7 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.js +18 -1
- package/lib/binding.ts +19 -1
- package/lib/index.js +3 -3
- package/lib/index.ts +1 -1
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +7 -7
- package/src/LlamaCompletionWorker.cpp +2 -2
- package/src/llama.cpp/common/arg.cpp +27 -2
- package/src/llama.cpp/common/chat-parser.cpp +968 -0
- package/src/llama.cpp/common/chat.cpp +0 -952
- package/src/llama.cpp/common/common.cpp +55 -0
- package/src/llama.cpp/common/common.h +18 -0
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +2 -2
- package/src/llama.cpp/ggml/CMakeLists.txt +6 -4
- package/src/llama.cpp/ggml/include/ggml-rpc.h +1 -1
- package/src/llama.cpp/ggml/include/ggml.h +12 -4
- package/src/llama.cpp/ggml/src/CMakeLists.txt +26 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +29 -15
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +721 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp +38 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +22 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +9 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +71 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +243 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +6 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +84 -85
- package/src/llama.cpp/include/llama.h +18 -0
- package/src/llama.cpp/src/CMakeLists.txt +2 -0
- package/src/llama.cpp/src/llama-arch.cpp +95 -16
- package/src/llama.cpp/src/llama-arch.h +15 -0
- package/src/llama.cpp/src/llama-context.cpp +7 -3
- package/src/llama.cpp/src/llama-graph.cpp +3 -3
- package/src/llama.cpp/src/llama-hparams.h +1 -1
- package/src/llama.cpp/src/llama-model.cpp +141 -6
- package/src/llama.cpp/src/llama-model.h +4 -0
- package/src/llama.cpp/src/llama-quant.cpp +13 -5
- package/src/llama.cpp/src/models/lfm2.cpp +5 -3
- package/src/llama.cpp/src/models/models.h +55 -1
- package/src/llama.cpp/src/models/qwen3next.cpp +1042 -0
- package/src/llama.cpp/src/models/rnd1.cpp +126 -0

package/src/llama.cpp/ggml/src/ggml-cpu/repack.h:

@@ -80,10 +80,12 @@ extern "C" {
 
 void ggml_quantize_mat_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
 void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
+void ggml_quantize_mat_q8_K_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
 void ggml_quantize_mat_q8_K_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
 void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemv_q4_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
@@ -91,6 +93,7 @@ void ggml_gemv_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
 void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemm_q4_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
@@ -99,10 +102,12 @@ void ggml_gemm_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
 // Native implementations
 void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
 void ggml_quantize_mat_q8_0_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
+void ggml_quantize_mat_q8_K_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
 void ggml_quantize_mat_q8_K_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
 void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemv_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
@@ -110,6 +115,7 @@ void ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
 void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemm_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);

package/src/llama.cpp/ggml/src/ggml-cpu/vec.h:

@@ -397,119 +397,118 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const
 }
 
 inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, const ggml_fp16_t * GGML_RESTRICT x, const float v) {
-#if defined(GGML_SIMD)
- …
- …
- …
-        const int ggml_f16_step = 8 * ggml_f16_epr;
+#if defined(GGML_SIMD) && defined(__ARM_FEATURE_SVE)
+    const int sve_register_length = svcntb() * 8;
+    const int ggml_f16_epr = sve_register_length / 16;
+    const int ggml_f16_step = 8 * ggml_f16_epr;
 
- …
+    GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v);
 
- …
+    int np = (n & ~(ggml_f16_step - 1));
 
- …
- …
- …
- …
- …
- …
+    svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
+    svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
+    for (int i = 0; i < np; i += ggml_f16_step) {
+        ax1 = GGML_F16x_VEC_LOAD(x + i + 0 * ggml_f16_epr, 0);
+        ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0);
+        ay1 = GGML_F16x_VEC_FMA(ay1, ax1, vx);
 
- …
+        GGML_F16x_VEC_STORE(y + i + 0 * ggml_f16_epr, ay1, 0);
 
- …
- …
- …
+        ax2 = GGML_F16x_VEC_LOAD(x + i + 1 * ggml_f16_epr, 1);
+        ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1);
+        ay2 = GGML_F16x_VEC_FMA(ay2, ax2, vx);
 
- …
+        GGML_F16x_VEC_STORE(y + i + 1 * ggml_f16_epr, ay2, 1);
 
- …
- …
- …
+        ax3 = GGML_F16x_VEC_LOAD(x + i + 2 * ggml_f16_epr, 2);
+        ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);
+        ay3 = GGML_F16x_VEC_FMA(ay3, ax3, vx);
 
- …
+        GGML_F16x_VEC_STORE(y + i + 2 * ggml_f16_epr, ay3, 2);
 
- …
- …
- …
+        ax4 = GGML_F16x_VEC_LOAD(x + i + 3 * ggml_f16_epr, 3);
+        ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
+        ay4 = GGML_F16x_VEC_FMA(ay4, ax4, vx);
 
- …
+        GGML_F16x_VEC_STORE(y + i + 3 * ggml_f16_epr, ay4, 3);
 
- …
- …
- …
+        ax5 = GGML_F16x_VEC_LOAD(x + i + 4 * ggml_f16_epr, 4);
+        ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);
+        ay5 = GGML_F16x_VEC_FMA(ay5, ax5, vx);
 
- …
+        GGML_F16x_VEC_STORE(y + i + 4 * ggml_f16_epr, ay5, 4);
 
- …
- …
- …
+        ax6 = GGML_F16x_VEC_LOAD(x + i + 5 * ggml_f16_epr, 5);
+        ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);
+        ay6 = GGML_F16x_VEC_FMA(ay6, ax6, vx);
 
- …
+        GGML_F16x_VEC_STORE(y + i + 5 * ggml_f16_epr, ay6, 5);
 
- …
- …
- …
+        ax7 = GGML_F16x_VEC_LOAD(x + i + 6 * ggml_f16_epr, 6);
+        ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);
+        ay7 = GGML_F16x_VEC_FMA(ay7, ax7, vx);
 
- …
+        GGML_F16x_VEC_STORE(y + i + 6 * ggml_f16_epr, ay7, 6);
 
- …
- …
- …
+        ax8 = GGML_F16x_VEC_LOAD(x + i + 7 * ggml_f16_epr, 7);
+        ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);
+        ay8 = GGML_F16x_VEC_FMA(ay8, ax8, vx);
 
- …
- …
- …
- …
- …
- …
- …
- …
-            GGML_F16x_VEC_STORE(y + k, ry, 0);
-        }
- …
-        if (np2 < n) {
-            svbool_t pg = svwhilelt_b16(np2, n);
-            svfloat16_t hx = svld1_f16(pg, (const __fp16 *)(x + np2));
-            svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
-            hy = svmad_f16_x(pg, hx, vx, hy);
-            svst1_f16(pg, (__fp16 *)(y + np2), hy);
-        }
+        GGML_F16x_VEC_STORE(y + i + 7 * ggml_f16_epr, ay8, 7);
+    }
+    const int np2 = (n & ~(ggml_f16_epr - 1));
+    for (int k = np; k < np2; k += ggml_f16_epr) {
+        svfloat16_t rx = GGML_F16x_VEC_LOAD(x + k, 0);
+        svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
+        ry = GGML_F16x_VEC_FMA(ry, rx, vx);
 
- …
- …
-        // scalar
-        for (int i = 0; i < n; ++i) {
-            y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
-        }
-    #else
-        const int np = (n & ~(GGML_F16_STEP - 1));
+        GGML_F16x_VEC_STORE(y + k, ry, 0);
+    }
 
- …
+    if (np2 < n) {
+        svbool_t pg = svwhilelt_b16(np2, n);
+        svfloat16_t hx = svld1_f16(pg, (const __fp16 *)(x + np2));
+        svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
+        hy = svmad_f16_x(pg, hx, vx, hy);
+        svst1_f16(pg, (__fp16 *)(y + np2), hy);
+    }
+    np = n;
+#elif defined(__riscv_zvfh) // implies __riscv_v_intrinsic
+    const int np = n;
+    _Float16 hv = (_Float16)v;
+    for (int i = 0, avl; i < n; i += avl) {
+        avl = __riscv_vsetvl_e16m8(n - i);
+        vfloat16m8_t ax = __riscv_vle16_v_f16m8((const _Float16 *)&x[i], avl);
+        vfloat16m8_t ay = __riscv_vle16_v_f16m8((_Float16 *)&y[i], avl);
+        vfloat16m8_t ny = __riscv_vfmadd_vf_f16m8(ax, hv, ay, avl);
+        __riscv_vse16_v_f16m8((_Float16 *)&y[i], ny, avl);
+    }
+#elif defined(GGML_SIMD)
+    const int np = (n & ~(GGML_F16_STEP - 1));
 
- …
-        GGML_F16_VEC ay[GGML_F16_ARR];
+    GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
 
- …
- …
-            ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
-            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
-            ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx);
+    GGML_F16_VEC ax[GGML_F16_ARR];
+    GGML_F16_VEC ay[GGML_F16_ARR];
 
- …
- …
- …
+    for (int i = 0; i < np; i += GGML_F16_STEP) {
+        for (int j = 0; j < GGML_F16_ARR; j++) {
+            ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
+            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
+            ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx);
 
- …
-    for (int i = np; i < n; ++i) {
-        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
+            GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
         }
- …
+    }
 #else
- …
- …
+    const int np = 0;
+#endif
+
+    // leftovers
+    for (int i = np; i < n; ++i) {
         y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
     }
-#endif
 }
 
 // xs and vs are byte strides of x and v
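
The rewrite above folds the per-branch tail handling into one shared leftover loop: the SVE path finishes with a predicated tail and then sets np = n, the new RISC-V zvfh path is fully strip-mined so np = n as well, the generic GGML_SIMD path leaves np at the last full vector step, and the scalar build sets np = 0. A minimal sketch of that control flow (plain C, float instead of ggml_fp16_t, with the vector body stubbed out; FAKE_SIMD_WIDTH is a hypothetical stand-in for the real GGML_SIMD/SVE/RVV dispatch):

// Sketch only: the same y[i] += x[i] * v recurrence every branch above computes,
// showing how `np` records what the fast path handled and a single shared loop
// mops up the remainder.
static void vec_mad_sketch(const int n, float * y, const float * x, const float v) {
    int np;
#if defined(FAKE_SIMD_WIDTH)
    np = n & ~(FAKE_SIMD_WIDTH - 1);            // elements covered by full vector steps
    for (int i = 0; i < np; i += FAKE_SIMD_WIDTH) {
        for (int j = 0; j < FAKE_SIMD_WIDTH; ++j) {
            y[i + j] += x[i + j] * v;           // imagine a vector load/FMA/store here
        }
    }
#else
    np = 0;                                     // no fast path: everything is a leftover
#endif
    for (int i = np; i < n; ++i) {              // shared leftover loop
        y[i] += x[i] * v;
    }
}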

package/src/llama.cpp/include/llama.h:

@@ -246,6 +246,21 @@ extern "C" {
         LLAMA_KV_OVERRIDE_TYPE_STR,
     };
 
+    enum llama_model_meta_key {
+        LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE,
+        LLAMA_MODEL_META_KEY_SAMPLING_TOP_K,
+        LLAMA_MODEL_META_KEY_SAMPLING_TOP_P,
+        LLAMA_MODEL_META_KEY_SAMPLING_MIN_P,
+        LLAMA_MODEL_META_KEY_SAMPLING_XTC_PROBABILITY,
+        LLAMA_MODEL_META_KEY_SAMPLING_XTC_THRESHOLD,
+        LLAMA_MODEL_META_KEY_SAMPLING_TEMP,
+        LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_LAST_N,
+        LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_REPEAT,
+        LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT,
+        LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_TAU,
+        LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA,
+    };
+
     struct llama_model_kv_override {
         enum llama_model_kv_override_type tag;
 
@@ -518,6 +533,9 @@ extern "C" {
     // Get the number of metadata key/value pairs
     LLAMA_API int32_t llama_model_meta_count(const struct llama_model * model);
 
+    // Get sampling metadata key name. Returns nullptr if the key is invalid
+    LLAMA_API const char * llama_model_meta_key_str(enum llama_model_meta_key key);
+
     // Get metadata key name by index
     LLAMA_API int32_t llama_model_meta_key_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size);
 
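
Together with the existing llama_model_meta_val_str accessor, the new enum and llama_model_meta_key_str make the GGUF-embedded sampling defaults queryable by name. A hypothetical usage sketch (model is assumed to be an already-loaded llama_model; the key string is expected to be "general.sampling.temp" per the LLM_KV_NAMES table below):

#include <stdlib.h>
#include "llama.h"

// Hypothetical helper: read the sampling temperature suggested by the model's
// GGUF metadata, falling back to a caller-provided default when absent.
static float model_suggested_temp(const struct llama_model * model, float fallback) {
    const char * key = llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_TEMP);
    char val[64];
    if (key == NULL || llama_model_meta_val_str(model, key, val, sizeof(val)) < 0) {
        return fallback; // invalid key or not present in this model's metadata
    }
    return (float) atof(val);
}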

package/src/llama.cpp/src/llama-arch.cpp:

@@ -32,6 +32,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_QWEN2VL, "qwen2vl" },
     { LLM_ARCH_QWEN3, "qwen3" },
     { LLM_ARCH_QWEN3MOE, "qwen3moe" },
+    { LLM_ARCH_QWEN3NEXT, "qwen3next" },
     { LLM_ARCH_QWEN3VL, "qwen3vl" },
     { LLM_ARCH_QWEN3VLMOE, "qwen3vlmoe" },
     { LLM_ARCH_PHI2, "phi2" },
@@ -108,24 +109,37 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_APERTUS, "apertus" },
     { LLM_ARCH_MINIMAX_M2, "minimax-m2" },
     { LLM_ARCH_COGVLM, "cogvlm" },
+    { LLM_ARCH_RND1, "rnd1" },
     { LLM_ARCH_PANGU_EMBED, "pangu-embedded" },
     { LLM_ARCH_UNKNOWN, "(unknown)" },
 };
 
 static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
-    { LLM_KV_GENERAL_TYPE,
-    { LLM_KV_GENERAL_ARCHITECTURE,
-    { LLM_KV_GENERAL_QUANTIZATION_VERSION,
-    { LLM_KV_GENERAL_ALIGNMENT,
-    { LLM_KV_GENERAL_FILE_TYPE,
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
+    { LLM_KV_GENERAL_TYPE, "general.type" },
+    { LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" },
+    { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" },
+    { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
+    { LLM_KV_GENERAL_FILE_TYPE, "general.file_type" },
+    { LLM_KV_GENERAL_SAMPLING_SEQUENCE, "general.sampling.sequence" },
+    { LLM_KV_GENERAL_SAMPLING_TOP_K, "general.sampling.top_k" },
+    { LLM_KV_GENERAL_SAMPLING_TOP_P, "general.sampling.top_p" },
+    { LLM_KV_GENERAL_SAMPLING_MIN_P, "general.sampling.min_p" },
+    { LLM_KV_GENERAL_SAMPLING_XTC_PROBABILITY, "general.sampling.xtc_probability" },
+    { LLM_KV_GENERAL_SAMPLING_XTC_THRESHOLD, "general.sampling.xtc_threshold" },
+    { LLM_KV_GENERAL_SAMPLING_TEMP, "general.sampling.temp" },
+    { LLM_KV_GENERAL_SAMPLING_PENALTY_LAST_N, "general.sampling.penalty_last_n" },
+    { LLM_KV_GENERAL_SAMPLING_PENALTY_REPEAT, "general.sampling.penalty_repeat" },
+    { LLM_KV_GENERAL_SAMPLING_MIROSTAT, "general.sampling.mirostat" },
+    { LLM_KV_GENERAL_SAMPLING_MIROSTAT_TAU, "general.sampling.mirostat_tau" },
+    { LLM_KV_GENERAL_SAMPLING_MIROSTAT_ETA, "general.sampling.mirostat_eta" },
+    { LLM_KV_GENERAL_NAME, "general.name" },
+    { LLM_KV_GENERAL_AUTHOR, "general.author" },
+    { LLM_KV_GENERAL_VERSION, "general.version" },
+    { LLM_KV_GENERAL_URL, "general.url" },
+    { LLM_KV_GENERAL_DESCRIPTION, "general.description" },
+    { LLM_KV_GENERAL_LICENSE, "general.license" },
+    { LLM_KV_GENERAL_SOURCE_URL, "general.source.url" },
+    { LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source.huggingface.repository" },
 
     { LLM_KV_VOCAB_SIZE, "%s.vocab_size" },
     { LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
@@ -816,6 +830,38 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
         },
     },
+    {
+        LLM_ARCH_QWEN3NEXT,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
+            { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
+            { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
+            { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
+            { LLM_TENSOR_SSM_A, "blk.%d.ssm_a" },
+            { LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" },
+            { LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" },
+            { LLM_TENSOR_SSM_BETA_ALPHA, "blk.%d.ssm_ba" },
+            { LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" },
+            { LLM_TENSOR_SSM_NORM, "blk.%d.ssm_norm" },
+            { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
+        },
+    },
     {
         LLM_ARCH_QWEN3VL,
         {
@@ -2224,7 +2270,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_SHORTCONV_INPROJ, "blk.%d.shortconv.in_proj" },
             { LLM_TENSOR_SHORTCONV_OUTPROJ, "blk.%d.shortconv.out_proj" },
             { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
-            {
+            { LLM_TENSOR_OUTPUT_NORM, "token_embd_norm" }, // note: wrong tensor name
             { LLM_TENSOR_OUTPUT, "output" },
         }
     },
@@ -2246,7 +2292,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_SHORTCONV_INPROJ, "blk.%d.shortconv.in_proj" },
             { LLM_TENSOR_SHORTCONV_OUTPROJ, "blk.%d.shortconv.out_proj" },
             { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
-            {
+            { LLM_TENSOR_OUTPUT_NORM, "token_embd_norm" }, // note: wrong tensor name
            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
@@ -2446,6 +2492,26 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_VISEXP_FFN_UP, "blk.%d.vis_up" },
         },
     },
+    {
+        LLM_ARCH_RND1,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -2454,11 +2520,21 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
     },
 };
 
+// declare information about the model weight tensors:
+//   - the layer in which the tensor is going to be used. this is needed in order to assign the correct buffer type for the weight
+//   - the operator which is going to use the weight. this is needed to determine if the respective backend supports the operator
+//
+// for example, input layers are usually assigned to CPU/host buffer types
+//
+// a mismatch between the declared information and the actual layer/op in which the tensor is used can lead to sub-optimal
+// assignment of the buffer types and extra overhead during computation
+// example: https://github.com/ggml-org/llama.cpp/pull/17548
+//
 static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_TOKEN_EMBD, {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}},
     {LLM_TENSOR_POS_EMBD, {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}},
-    {LLM_TENSOR_TOKEN_EMBD_NORM, {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}},
     {LLM_TENSOR_TOKEN_TYPES, {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}},
+    {LLM_TENSOR_TOKEN_EMBD_NORM, {LLM_TENSOR_LAYER_INPUT, GGML_OP_MUL}},
     {LLM_TENSOR_OUTPUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_CLS, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_CLS_OUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
@@ -2513,6 +2589,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_SSM_X, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_SSM_DT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_SSM_OUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_SSM_BETA_ALPHA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_TIME_MIX_W1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_TIME_MIX_W2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_TIME_MIX_A1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
@@ -2711,6 +2788,7 @@ bool llm_arch_is_hybrid(const llm_arch & arch) {
         case LLM_ARCH_LFM2:
         case LLM_ARCH_LFM2MOE:
         case LLM_ARCH_NEMOTRON_H:
+        case LLM_ARCH_QWEN3NEXT:
             return true;
         default:
             return false;
@@ -2722,6 +2800,7 @@ bool llm_arch_is_diffusion(const llm_arch & arch) {
         case LLM_ARCH_DREAM:
         case LLM_ARCH_LLADA:
        case LLM_ARCH_LLADA_MOE:
+        case LLM_ARCH_RND1:
             return true;
         default:
             return false;

package/src/llama.cpp/src/llama-arch.h:

@@ -36,6 +36,7 @@ enum llm_arch {
     LLM_ARCH_QWEN2VL,
     LLM_ARCH_QWEN3,
     LLM_ARCH_QWEN3MOE,
+    LLM_ARCH_QWEN3NEXT,
     LLM_ARCH_QWEN3VL,
     LLM_ARCH_QWEN3VLMOE,
     LLM_ARCH_PHI2,
@@ -112,6 +113,7 @@ enum llm_arch {
     LLM_ARCH_APERTUS,
     LLM_ARCH_MINIMAX_M2,
     LLM_ARCH_COGVLM,
+    LLM_ARCH_RND1,
     LLM_ARCH_PANGU_EMBED,
     LLM_ARCH_UNKNOWN,
 };
@@ -122,6 +124,18 @@ enum llm_kv {
     LLM_KV_GENERAL_QUANTIZATION_VERSION,
     LLM_KV_GENERAL_ALIGNMENT,
     LLM_KV_GENERAL_FILE_TYPE,
+    LLM_KV_GENERAL_SAMPLING_SEQUENCE,
+    LLM_KV_GENERAL_SAMPLING_TOP_K,
+    LLM_KV_GENERAL_SAMPLING_TOP_P,
+    LLM_KV_GENERAL_SAMPLING_MIN_P,
+    LLM_KV_GENERAL_SAMPLING_XTC_PROBABILITY,
+    LLM_KV_GENERAL_SAMPLING_XTC_THRESHOLD,
+    LLM_KV_GENERAL_SAMPLING_TEMP,
+    LLM_KV_GENERAL_SAMPLING_PENALTY_LAST_N,
+    LLM_KV_GENERAL_SAMPLING_PENALTY_REPEAT,
+    LLM_KV_GENERAL_SAMPLING_MIROSTAT,
+    LLM_KV_GENERAL_SAMPLING_MIROSTAT_TAU,
+    LLM_KV_GENERAL_SAMPLING_MIROSTAT_ETA,
     LLM_KV_GENERAL_NAME,
     LLM_KV_GENERAL_AUTHOR,
     LLM_KV_GENERAL_VERSION,
@@ -368,6 +382,7 @@ enum llm_tensor {
     LLM_TENSOR_SSM_D,
     LLM_TENSOR_SSM_NORM,
     LLM_TENSOR_SSM_OUT,
+    LLM_TENSOR_SSM_BETA_ALPHA, // qwen3next
     LLM_TENSOR_TIME_MIX_W0,
     LLM_TENSOR_TIME_MIX_W1,
     LLM_TENSOR_TIME_MIX_W2,

package/src/llama.cpp/src/llama-context.cpp:

@@ -1,5 +1,6 @@
 #include "llama-context.h"
 
+#include "llama-arch.h"
 #include "llama-impl.h"
 #include "llama-batch.h"
 #include "llama-io.h"
@@ -299,7 +300,7 @@ llama_context::llama_context(
 
     cross.v_embd.clear();
 
-    const uint32_t n_seqs = cparams.
+    const uint32_t n_seqs = cparams.n_seq_max;
     const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
 
     // avoid reserving graphs with zero outputs - assume one output per sequence
@@ -542,7 +543,7 @@ bool llama_context::memory_update(bool optimize) {
         throw std::runtime_error("failed to initialize memory context");
     }
 
-    const uint32_t n_seqs = cparams.
+    const uint32_t n_seqs = cparams.n_seq_max;
     const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
 
     auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
@@ -1248,7 +1249,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
 
     // make the outputs have the same order they had in the user-provided batch
    // note: this is mostly relevant for recurrent models atm
-    if (!sorted_output) {
+    if (!sorted_output && n_outputs > 1) {
         GGML_ASSERT((size_t) n_outputs == out_ids.size());
 
         // TODO: is there something more efficient which also minimizes swaps?
@@ -1386,6 +1387,9 @@ void llama_context::output_reorder() {
 //
 
 uint32_t llama_context::graph_max_nodes() const {
+    if (model.arch == LLM_ARCH_QWEN3NEXT) {
+        return std::max<uint32_t>(8192u, 32u*model.n_tensors());
+    }
     return std::max<uint32_t>(1024u, 8u*model.n_tensors());
 }
 
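
For scale: a hypothetical model with 1,000 tensors would previously get a graph budget of max(1024, 8 × 1000) = 8,000 nodes; under the Qwen3-Next branch above it gets max(8192, 32 × 1000) = 32,000, presumably because the hybrid layers of that architecture expand into many more graph nodes per tensor.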

package/src/llama.cpp/src/llama-graph.cpp:

@@ -961,14 +961,14 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
         // organize experts into n_expert_groups
         ggml_tensor * selection_groups = ggml_reshape_3d(ctx0, selection_probs, n_exp_per_group, hparams.n_expert_groups, n_tokens); // [n_exp_per_group, n_expert_groups, n_tokens]
 
-        ggml_tensor * group_scores =
+        ggml_tensor * group_scores = ggml_argsort_top_k(ctx0, selection_groups, 2); // [2, n_expert_groups, n_tokens]
         group_scores = ggml_get_rows(ctx0, ggml_reshape_4d(ctx0, selection_groups, 1, selection_groups->ne[0], selection_groups->ne[1], selection_groups->ne[2]), group_scores); // [1, 2, n_expert_groups, n_tokens]
 
         // get top n_group_used expert groups
         group_scores = ggml_sum_rows(ctx0, ggml_reshape_3d(ctx0, group_scores, group_scores->ne[1], group_scores->ne[2], group_scores->ne[3])); // [1, n_expert_groups, n_tokens]
         group_scores = ggml_reshape_2d(ctx0, group_scores, group_scores->ne[1], group_scores->ne[2]); // [n_expert_groups, n_tokens]
 
-        ggml_tensor * expert_groups =
+        ggml_tensor * expert_groups = ggml_argsort_top_k(ctx0, group_scores, hparams.n_group_used); // [n_group_used, n_tokens]
         cb(expert_groups, "ffn_moe_group_topk", il);
 
         // mask out the other groups
@@ -979,7 +979,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
     }
 
     // select experts
-    ggml_tensor * selected_experts =
+    ggml_tensor * selected_experts = ggml_argsort_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
     cb(selected_experts->src[0], "ffn_moe_argsort", il);
     cb(selected_experts, "ffn_moe_topk", il);
 
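
The call sites above now lean on a single ggml_argsort_top_k primitive: top-2 experts inside each group, then the top n_group_used groups, then the top n_expert_used experts overall. As a reference for the contract those calls assume (indices of the k largest entries, largest first), here is a scalar sketch; it is an illustration only, not the ggml kernel, which operates row-wise on whole tensors inside the compute graph:

// Illustrative scalar "argsort top-k": write the indices of the k largest
// values of vals[0..n) into out_idx, largest first. Ties resolve to the
// lower index.
static void argsort_top_k_sketch(const float * vals, int n, int * out_idx, int k) {
    for (int j = 0; j < k; ++j) {
        int best = -1;
        for (int i = 0; i < n; ++i) {
            int used = 0;
            for (int m = 0; m < j; ++m) {      // skip indices already emitted
                if (out_idx[m] == i) { used = 1; break; }
            }
            if (!used && (best < 0 || vals[i] > vals[best])) {
                best = i;
            }
        }
        out_idx[j] = best;                     // index of the j-th largest value
    }
}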