@fugood/llama.node 1.3.7 → 1.3.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +5 -5
- package/src/llama.cpp/common/arg.cpp +26 -1
- package/src/llama.cpp/common/common.cpp +55 -0
- package/src/llama.cpp/common/common.h +18 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +5 -4
- package/src/llama.cpp/ggml/include/ggml.h +12 -4
- package/src/llama.cpp/ggml/src/CMakeLists.txt +15 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +29 -15
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +388 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp +35 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +0 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +9 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +69 -3
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +9 -3
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +84 -85
- package/src/llama.cpp/include/llama.h +18 -0
- package/src/llama.cpp/src/CMakeLists.txt +1 -0
- package/src/llama.cpp/src/llama-arch.cpp +47 -13
- package/src/llama.cpp/src/llama-arch.h +13 -0
- package/src/llama.cpp/src/llama-context.cpp +1 -1
- package/src/llama.cpp/src/llama-graph.cpp +3 -3
- package/src/llama.cpp/src/llama-model.cpp +39 -1
- package/src/llama.cpp/src/models/models.h +4 -0
- package/src/llama.cpp/src/models/rnd1.cpp +126 -0
package/src/llama.cpp/ggml/src/ggml-cpu/vec.h

@@ -397,119 +397,118 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const
 }
 
 inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, const ggml_fp16_t * GGML_RESTRICT x, const float v) {
-#if defined(GGML_SIMD)
-    #if defined(__ARM_FEATURE_SVE)
-        const int sve_register_length = svcntb() * 8;
-        const int ggml_f16_epr = sve_register_length / 16;
-        const int ggml_f16_step = 8 * ggml_f16_epr;
+#if defined(GGML_SIMD) && defined(__ARM_FEATURE_SVE)
+    const int sve_register_length = svcntb() * 8;
+    const int ggml_f16_epr = sve_register_length / 16;
+    const int ggml_f16_step = 8 * ggml_f16_epr;
 
-        GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v);
+    GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v);
 
-        const int np = (n & ~(ggml_f16_step - 1));
+    int np = (n & ~(ggml_f16_step - 1));
 
-        svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
-        svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
-        for (int i = 0; i < np; i += ggml_f16_step) {
-            ax1 = GGML_F16x_VEC_LOAD(x + i + 0 * ggml_f16_epr, 0);
-            ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0);
-            ay1 = GGML_F16x_VEC_FMA(ay1, ax1, vx);
+    svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
+    svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
+    for (int i = 0; i < np; i += ggml_f16_step) {
+        ax1 = GGML_F16x_VEC_LOAD(x + i + 0 * ggml_f16_epr, 0);
+        ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0);
+        ay1 = GGML_F16x_VEC_FMA(ay1, ax1, vx);
 
-            GGML_F16x_VEC_STORE(y + i + 0 * ggml_f16_epr, ay1, 0);
+        GGML_F16x_VEC_STORE(y + i + 0 * ggml_f16_epr, ay1, 0);
 
-            ax2 = GGML_F16x_VEC_LOAD(x + i + 1 * ggml_f16_epr, 1);
-            ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1);
-            ay2 = GGML_F16x_VEC_FMA(ay2, ax2, vx);
+        ax2 = GGML_F16x_VEC_LOAD(x + i + 1 * ggml_f16_epr, 1);
+        ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1);
+        ay2 = GGML_F16x_VEC_FMA(ay2, ax2, vx);
 
-            GGML_F16x_VEC_STORE(y + i + 1 * ggml_f16_epr, ay2, 1);
+        GGML_F16x_VEC_STORE(y + i + 1 * ggml_f16_epr, ay2, 1);
 
-            ax3 = GGML_F16x_VEC_LOAD(x + i + 2 * ggml_f16_epr, 2);
-            ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);
-            ay3 = GGML_F16x_VEC_FMA(ay3, ax3, vx);
+        ax3 = GGML_F16x_VEC_LOAD(x + i + 2 * ggml_f16_epr, 2);
+        ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);
+        ay3 = GGML_F16x_VEC_FMA(ay3, ax3, vx);
 
-            GGML_F16x_VEC_STORE(y + i + 2 * ggml_f16_epr, ay3, 2);
+        GGML_F16x_VEC_STORE(y + i + 2 * ggml_f16_epr, ay3, 2);
 
-            ax4 = GGML_F16x_VEC_LOAD(x + i + 3 * ggml_f16_epr, 3);
-            ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
-            ay4 = GGML_F16x_VEC_FMA(ay4, ax4, vx);
+        ax4 = GGML_F16x_VEC_LOAD(x + i + 3 * ggml_f16_epr, 3);
+        ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
+        ay4 = GGML_F16x_VEC_FMA(ay4, ax4, vx);
 
-            GGML_F16x_VEC_STORE(y + i + 3 * ggml_f16_epr, ay4, 3);
+        GGML_F16x_VEC_STORE(y + i + 3 * ggml_f16_epr, ay4, 3);
 
-            ax5 = GGML_F16x_VEC_LOAD(x + i + 4 * ggml_f16_epr, 4);
-            ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);
-            ay5 = GGML_F16x_VEC_FMA(ay5, ax5, vx);
+        ax5 = GGML_F16x_VEC_LOAD(x + i + 4 * ggml_f16_epr, 4);
+        ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);
+        ay5 = GGML_F16x_VEC_FMA(ay5, ax5, vx);
 
-            GGML_F16x_VEC_STORE(y + i + 4 * ggml_f16_epr, ay5, 4);
+        GGML_F16x_VEC_STORE(y + i + 4 * ggml_f16_epr, ay5, 4);
 
-            ax6 = GGML_F16x_VEC_LOAD(x + i + 5 * ggml_f16_epr, 5);
-            ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);
-            ay6 = GGML_F16x_VEC_FMA(ay6, ax6, vx);
+        ax6 = GGML_F16x_VEC_LOAD(x + i + 5 * ggml_f16_epr, 5);
+        ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);
+        ay6 = GGML_F16x_VEC_FMA(ay6, ax6, vx);
 
-            GGML_F16x_VEC_STORE(y + i + 5 * ggml_f16_epr, ay6, 5);
+        GGML_F16x_VEC_STORE(y + i + 5 * ggml_f16_epr, ay6, 5);
 
-            ax7 = GGML_F16x_VEC_LOAD(x + i + 6 * ggml_f16_epr, 6);
-            ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);
-            ay7 = GGML_F16x_VEC_FMA(ay7, ax7, vx);
+        ax7 = GGML_F16x_VEC_LOAD(x + i + 6 * ggml_f16_epr, 6);
+        ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);
+        ay7 = GGML_F16x_VEC_FMA(ay7, ax7, vx);
 
-            GGML_F16x_VEC_STORE(y + i + 6 * ggml_f16_epr, ay7, 6);
+        GGML_F16x_VEC_STORE(y + i + 6 * ggml_f16_epr, ay7, 6);
 
-            ax8 = GGML_F16x_VEC_LOAD(x + i + 7 * ggml_f16_epr, 7);
-            ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);
-            ay8 = GGML_F16x_VEC_FMA(ay8, ax8, vx);
+        ax8 = GGML_F16x_VEC_LOAD(x + i + 7 * ggml_f16_epr, 7);
+        ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);
+        ay8 = GGML_F16x_VEC_FMA(ay8, ax8, vx);
 
-            GGML_F16x_VEC_STORE(y + i + 7 * ggml_f16_epr, ay8, 7);
-        }
-        const int np2 = (n & ~(ggml_f16_epr - 1));
-        for (int k = np; k < np2; k += ggml_f16_epr) {
-            svfloat16_t rx = GGML_F16x_VEC_LOAD(x + k, 0);
-            svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
-            ry = GGML_F16x_VEC_FMA(ry, rx, vx);
-
-            GGML_F16x_VEC_STORE(y + k, ry, 0);
-        }
-
-        if (np2 < n) {
-            svbool_t pg = svwhilelt_b16(np2, n);
-            svfloat16_t hx = svld1_f16(pg, (const __fp16 *)(x + np2));
-            svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
-            hy = svmad_f16_x(pg, hx, vx, hy);
-            svst1_f16(pg, (__fp16 *)(y + np2), hy);
-        }
+        GGML_F16x_VEC_STORE(y + i + 7 * ggml_f16_epr, ay8, 7);
+    }
+    const int np2 = (n & ~(ggml_f16_epr - 1));
+    for (int k = np; k < np2; k += ggml_f16_epr) {
+        svfloat16_t rx = GGML_F16x_VEC_LOAD(x + k, 0);
+        svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
+        ry = GGML_F16x_VEC_FMA(ry, rx, vx);
 
-    #elif defined(__riscv_v_intrinsic)
-        // todo: RVV impl
-        // scalar
-        for (int i = 0; i < n; ++i) {
-            y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
-        }
-    #else
-        const int np = (n & ~(GGML_F16_STEP - 1));
+        GGML_F16x_VEC_STORE(y + k, ry, 0);
+    }
 
-        GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
+    if (np2 < n) {
+        svbool_t pg = svwhilelt_b16(np2, n);
+        svfloat16_t hx = svld1_f16(pg, (const __fp16 *)(x + np2));
+        svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
+        hy = svmad_f16_x(pg, hx, vx, hy);
+        svst1_f16(pg, (__fp16 *)(y + np2), hy);
+    }
+    np = n;
+#elif defined(__riscv_zvfh) // implies __riscv_v_intrinsic
+    const int np = n;
+    _Float16 hv = (_Float16)v;
+    for (int i = 0, avl; i < n; i += avl) {
+        avl = __riscv_vsetvl_e16m8(n - i);
+        vfloat16m8_t ax = __riscv_vle16_v_f16m8((const _Float16 *)&x[i], avl);
+        vfloat16m8_t ay = __riscv_vle16_v_f16m8((_Float16 *)&y[i], avl);
+        vfloat16m8_t ny = __riscv_vfmadd_vf_f16m8(ax, hv, ay, avl);
+        __riscv_vse16_v_f16m8((_Float16 *)&y[i], ny, avl);
+    }
+#elif defined(GGML_SIMD)
+    const int np = (n & ~(GGML_F16_STEP - 1));
 
-        GGML_F16_VEC ax[GGML_F16_ARR];
-        GGML_F16_VEC ay[GGML_F16_ARR];
+    GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
 
-        for (int i = 0; i < np; i += GGML_F16_STEP) {
-            for (int j = 0; j < GGML_F16_ARR; j++) {
-                ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
-                ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
-                ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx);
+    GGML_F16_VEC ax[GGML_F16_ARR];
+    GGML_F16_VEC ay[GGML_F16_ARR];
 
-                GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
-            }
-        }
+    for (int i = 0; i < np; i += GGML_F16_STEP) {
+        for (int j = 0; j < GGML_F16_ARR; j++) {
+            ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
+            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
+            ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx);
 
-        // leftovers
-        for (int i = np; i < n; ++i) {
-            y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
+            GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
         }
-    #endif
+    }
 #else
-    // scalar
-    for (int i = 0; i < n; ++i) {
+    const int np = 0;
+#endif
+
+    // leftovers
+    for (int i = np; i < n; ++i) {
         y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
     }
-#endif
 }
 
 // xs and vs are byte strides of x and v
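The restructuring above lets every SIMD flavour share one scalar tail: each branch reports how many leading elements it handled via np (SVE finishes its own tail and sets np = n, the new RISC-V zvfh path strip-mines the whole range so np = n, generic GGML_SIMD covers a multiple of GGML_F16_STEP, and non-SIMD builds set np = 0), after which a single leftovers loop finishes the rest. A minimal sketch of that control pattern (plain C++ with float instead of ggml_fp16_t for brevity; not ggml code):

// Sketch only: the shared-leftovers pattern from the new vec.h.
static void vec_mad_sketch(const int n, float * y, const float * x, const float v) {
    int np = 0; // elements already handled by a (hypothetical) SIMD body
    // ... a SIMD branch would process a prefix here and raise np accordingly ...
    for (int i = np; i < n; ++i) {
        y[i] = y[i] + x[i]*v; // same arithmetic as the f16 leftovers loop
    }
}
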
package/src/llama.cpp/include/llama.h

@@ -246,6 +246,21 @@ extern "C" {
         LLAMA_KV_OVERRIDE_TYPE_STR,
     };
 
+    enum llama_model_meta_key {
+        LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE,
+        LLAMA_MODEL_META_KEY_SAMPLING_TOP_K,
+        LLAMA_MODEL_META_KEY_SAMPLING_TOP_P,
+        LLAMA_MODEL_META_KEY_SAMPLING_MIN_P,
+        LLAMA_MODEL_META_KEY_SAMPLING_XTC_PROBABILITY,
+        LLAMA_MODEL_META_KEY_SAMPLING_XTC_THRESHOLD,
+        LLAMA_MODEL_META_KEY_SAMPLING_TEMP,
+        LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_LAST_N,
+        LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_REPEAT,
+        LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT,
+        LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_TAU,
+        LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA,
+    };
+
     struct llama_model_kv_override {
         enum llama_model_kv_override_type tag;
 
@@ -518,6 +533,9 @@ extern "C" {
     // Get the number of metadata key/value pairs
     LLAMA_API int32_t llama_model_meta_count(const struct llama_model * model);
 
+    // Get sampling metadata key name. Returns nullptr if the key is invalid
+    LLAMA_API const char * llama_model_meta_key_str(enum llama_model_meta_key key);
+
     // Get metadata key name by index
     LLAMA_API int32_t llama_model_meta_key_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size);
 
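These llama.h hunks expose the new general.sampling.* GGUF metadata through a stable enum. A sketch of how a caller might combine the new llama_model_meta_key_str with the existing llama_model_meta_val_str lookup (the usage is an assumption based on the declarations above, not code shipped in this package):

#include "llama.h"

#include <cstdio>

// Sketch: print the sampling temperature suggested by the model file, if any.
static void print_suggested_temp(const struct llama_model * model) {
    const char * key = llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_TEMP);
    char val[128];
    // llama_model_meta_val_str returns a negative value when the key is absent
    if (key && llama_model_meta_val_str(model, key, val, sizeof(val)) >= 0) {
        printf("%s = %s\n", key, val);
    }
}
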
package/src/llama.cpp/src/llama-arch.cpp

@@ -108,24 +108,37 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_APERTUS,          "apertus"          },
     { LLM_ARCH_MINIMAX_M2,       "minimax-m2"       },
     { LLM_ARCH_COGVLM,           "cogvlm"           },
+    { LLM_ARCH_RND1,             "rnd1"             },
     { LLM_ARCH_PANGU_EMBED,      "pangu-embedded"   },
     { LLM_ARCH_UNKNOWN,          "(unknown)"        },
 };
 
 static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
-    { LLM_KV_GENERAL_TYPE,                 "general.type"                 },
-    { LLM_KV_GENERAL_ARCHITECTURE,         "general.architecture"         },
-    { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" },
-    { LLM_KV_GENERAL_ALIGNMENT,            "general.alignment"            },
-    { LLM_KV_GENERAL_FILE_TYPE,            "general.file_type"            },
-    { LLM_KV_GENERAL_NAME,                 "general.name"                 },
-    { LLM_KV_GENERAL_AUTHOR,               "general.author"               },
-    { LLM_KV_GENERAL_VERSION,              "general.version"              },
-    { LLM_KV_GENERAL_URL,                  "general.url"                  },
-    { LLM_KV_GENERAL_DESCRIPTION,          "general.description"          },
-    { LLM_KV_GENERAL_LICENSE,              "general.license"              },
-    { LLM_KV_GENERAL_SOURCE_URL,           "general.source.url"           },
-    { LLM_KV_GENERAL_SOURCE_HF_REPO,       "general.source.huggingface.repository" },
+    { LLM_KV_GENERAL_TYPE,                     "general.type"                     },
+    { LLM_KV_GENERAL_ARCHITECTURE,             "general.architecture"             },
+    { LLM_KV_GENERAL_QUANTIZATION_VERSION,     "general.quantization_version"     },
+    { LLM_KV_GENERAL_ALIGNMENT,                "general.alignment"                },
+    { LLM_KV_GENERAL_FILE_TYPE,                "general.file_type"                },
+    { LLM_KV_GENERAL_SAMPLING_SEQUENCE,        "general.sampling.sequence"        },
+    { LLM_KV_GENERAL_SAMPLING_TOP_K,           "general.sampling.top_k"           },
+    { LLM_KV_GENERAL_SAMPLING_TOP_P,           "general.sampling.top_p"           },
+    { LLM_KV_GENERAL_SAMPLING_MIN_P,           "general.sampling.min_p"           },
+    { LLM_KV_GENERAL_SAMPLING_XTC_PROBABILITY, "general.sampling.xtc_probability" },
+    { LLM_KV_GENERAL_SAMPLING_XTC_THRESHOLD,   "general.sampling.xtc_threshold"   },
+    { LLM_KV_GENERAL_SAMPLING_TEMP,            "general.sampling.temp"            },
+    { LLM_KV_GENERAL_SAMPLING_PENALTY_LAST_N,  "general.sampling.penalty_last_n"  },
+    { LLM_KV_GENERAL_SAMPLING_PENALTY_REPEAT,  "general.sampling.penalty_repeat"  },
+    { LLM_KV_GENERAL_SAMPLING_MIROSTAT,        "general.sampling.mirostat"        },
+    { LLM_KV_GENERAL_SAMPLING_MIROSTAT_TAU,    "general.sampling.mirostat_tau"    },
+    { LLM_KV_GENERAL_SAMPLING_MIROSTAT_ETA,    "general.sampling.mirostat_eta"    },
+    { LLM_KV_GENERAL_NAME,                     "general.name"                     },
+    { LLM_KV_GENERAL_AUTHOR,                   "general.author"                   },
+    { LLM_KV_GENERAL_VERSION,                  "general.version"                  },
+    { LLM_KV_GENERAL_URL,                      "general.url"                      },
+    { LLM_KV_GENERAL_DESCRIPTION,              "general.description"              },
+    { LLM_KV_GENERAL_LICENSE,                  "general.license"                  },
+    { LLM_KV_GENERAL_SOURCE_URL,               "general.source.url"               },
+    { LLM_KV_GENERAL_SOURCE_HF_REPO,           "general.source.huggingface.repository" },
 
     { LLM_KV_VOCAB_SIZE,                   "%s.vocab_size"     },
     { LLM_KV_CONTEXT_LENGTH,               "%s.context_length" },
@@ -2446,6 +2459,26 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
         { LLM_TENSOR_VISEXP_FFN_UP,   "blk.%d.vis_up"   },
         },
     },
+    {
+        LLM_ARCH_RND1,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_NORM,     "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_K_NORM,     "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE_INP,    "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS,   "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS,   "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,     "blk.%d.ffn_up_exps" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -2722,6 +2755,7 @@ bool llm_arch_is_diffusion(const llm_arch & arch) {
         case LLM_ARCH_DREAM:
         case LLM_ARCH_LLADA:
         case LLM_ARCH_LLADA_MOE:
+        case LLM_ARCH_RND1:
             return true;
         default:
             return false;
package/src/llama.cpp/src/llama-arch.h

@@ -112,6 +112,7 @@ enum llm_arch {
     LLM_ARCH_APERTUS,
     LLM_ARCH_MINIMAX_M2,
     LLM_ARCH_COGVLM,
+    LLM_ARCH_RND1,
     LLM_ARCH_PANGU_EMBED,
     LLM_ARCH_UNKNOWN,
 };
@@ -122,6 +123,18 @@ enum llm_kv {
     LLM_KV_GENERAL_QUANTIZATION_VERSION,
     LLM_KV_GENERAL_ALIGNMENT,
     LLM_KV_GENERAL_FILE_TYPE,
+    LLM_KV_GENERAL_SAMPLING_SEQUENCE,
+    LLM_KV_GENERAL_SAMPLING_TOP_K,
+    LLM_KV_GENERAL_SAMPLING_TOP_P,
+    LLM_KV_GENERAL_SAMPLING_MIN_P,
+    LLM_KV_GENERAL_SAMPLING_XTC_PROBABILITY,
+    LLM_KV_GENERAL_SAMPLING_XTC_THRESHOLD,
+    LLM_KV_GENERAL_SAMPLING_TEMP,
+    LLM_KV_GENERAL_SAMPLING_PENALTY_LAST_N,
+    LLM_KV_GENERAL_SAMPLING_PENALTY_REPEAT,
+    LLM_KV_GENERAL_SAMPLING_MIROSTAT,
+    LLM_KV_GENERAL_SAMPLING_MIROSTAT_TAU,
+    LLM_KV_GENERAL_SAMPLING_MIROSTAT_ETA,
     LLM_KV_GENERAL_NAME,
     LLM_KV_GENERAL_AUTHOR,
     LLM_KV_GENERAL_VERSION,
package/src/llama.cpp/src/llama-context.cpp

@@ -1248,7 +1248,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
 
     // make the outputs have the same order they had in the user-provided batch
     // note: this is mostly relevant for recurrent models atm
-    if (!sorted_output) {
+    if (!sorted_output && n_outputs > 1) {
         GGML_ASSERT((size_t) n_outputs == out_ids.size());
 
         // TODO: is there something more efficient which also minimizes swaps?
package/src/llama.cpp/src/llama-graph.cpp

@@ -961,14 +961,14 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
         // organize experts into n_expert_groups
         ggml_tensor * selection_groups = ggml_reshape_3d(ctx0, selection_probs, n_exp_per_group, hparams.n_expert_groups, n_tokens); // [n_exp_per_group, n_expert_groups, n_tokens]
 
-        ggml_tensor * group_scores = ggml_top_k(ctx0, selection_groups, 2); // [2, n_expert_groups, n_tokens]
+        ggml_tensor * group_scores = ggml_argsort_top_k(ctx0, selection_groups, 2); // [2, n_expert_groups, n_tokens]
         group_scores = ggml_get_rows(ctx0, ggml_reshape_4d(ctx0, selection_groups, 1, selection_groups->ne[0], selection_groups->ne[1], selection_groups->ne[2]), group_scores); // [1, 2, n_expert_groups, n_tokens]
 
         // get top n_group_used expert groups
         group_scores = ggml_sum_rows(ctx0, ggml_reshape_3d(ctx0, group_scores, group_scores->ne[1], group_scores->ne[2], group_scores->ne[3])); // [1, n_expert_groups, n_tokens]
         group_scores = ggml_reshape_2d(ctx0, group_scores, group_scores->ne[1], group_scores->ne[2]); // [n_expert_groups, n_tokens]
 
-        ggml_tensor * expert_groups = ggml_top_k(ctx0, group_scores, hparams.n_group_used); // [n_group_used, n_tokens]
+        ggml_tensor * expert_groups = ggml_argsort_top_k(ctx0, group_scores, hparams.n_group_used); // [n_group_used, n_tokens]
         cb(expert_groups, "ffn_moe_group_topk", il);
 
         // mask out the other groups
@@ -979,7 +979,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
     }
 
     // select experts
-    ggml_tensor * selected_experts = ggml_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
+    ggml_tensor * selected_experts = ggml_argsort_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
     cb(selected_experts->src[0], "ffn_moe_argsort", il);
     cb(selected_experts, "ffn_moe_topk", il);
 
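Both build_moe_ffn hunks switch the expert selection over to the ggml_argsort_top_k op. Its per-row behaviour, the indices of the k highest scores in descending score order, can be sketched outside ggml like this (an illustration of the semantics, not the ggml implementation):

#include <algorithm>
#include <numeric>
#include <vector>

// Illustration only: indices of the k largest scores (assumes k <= scores.size()),
// sorted by descending score, which is the shape of result the MoE routing needs.
static std::vector<int> argsort_top_k_sketch(const std::vector<float> & scores, int k) {
    std::vector<int> idx(scores.size());
    std::iota(idx.begin(), idx.end(), 0); // 0, 1, ..., n-1
    std::partial_sort(idx.begin(), idx.begin() + k, idx.end(),
                      [&](int a, int b) { return scores[a] > scores[b]; });
    idx.resize(k); // keep only the top-k indices
    return idx;
}
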
package/src/llama.cpp/src/llama-model.cpp

@@ -1036,6 +1036,18 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_RND1:
+            {
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
+
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                switch (hparams.n_layer) {
+                    case 48: type = LLM_TYPE_30B_A3B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+                // Set non-causal attention for diffusion models
+                hparams.causal_attn = false;
+            } break;
         case LLM_ARCH_QWEN2MOE:
             {
                 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
@@ -3402,6 +3414,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             } break;
         case LLM_ARCH_QWEN3MOE:
         case LLM_ARCH_QWEN3VLMOE:
+        case LLM_ARCH_RND1:
             {
                 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
@@ -6720,7 +6733,7 @@ void llama_model::print_info() const {
         LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
     }
 
-    if (arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE || arch == LLM_ARCH_QWEN3VLMOE) {
+    if (arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE || arch == LLM_ARCH_QWEN3VLMOE || arch == LLM_ARCH_RND1) {
         LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
     }
 
@@ -6882,6 +6895,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
         case LLM_ARCH_DREAM:
         case LLM_ARCH_LLADA:
         case LLM_ARCH_LLADA_MOE:
+        case LLM_ARCH_RND1:
             {
                 res = nullptr;
             } break;
@@ -7075,6 +7089,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
                 llm = std::make_unique<llm_build_llada_moe>(*this, params);
             }
             break;
+        case LLM_ARCH_RND1:
+            {
+                llm = std::make_unique<llm_build_rnd1>(*this, params);
+            }
+            break;
         case LLM_ARCH_QWEN2VL:
             {
                 llm = std::make_unique<llm_build_qwen2vl>(*this, params);
@@ -7595,6 +7614,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_QWEN3:
         case LLM_ARCH_QWEN3MOE:
         case LLM_ARCH_LLADA_MOE:
+        case LLM_ARCH_RND1:
         case LLM_ARCH_OLMO2:
         case LLM_ARCH_OLMOE:
         case LLM_ARCH_PHI2:
@@ -7667,6 +7687,24 @@ int32_t llama_model_meta_count(const llama_model * model) {
     return (int)model->gguf_kv.size();
 }
 
+const char * llama_model_meta_key_str(llama_model_meta_key key) {
+    switch (key) {
+        case LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE:        return "general.sampling.sequence";
+        case LLAMA_MODEL_META_KEY_SAMPLING_TOP_K:           return "general.sampling.top_k";
+        case LLAMA_MODEL_META_KEY_SAMPLING_TOP_P:           return "general.sampling.top_p";
+        case LLAMA_MODEL_META_KEY_SAMPLING_MIN_P:           return "general.sampling.min_p";
+        case LLAMA_MODEL_META_KEY_SAMPLING_XTC_PROBABILITY: return "general.sampling.xtc_probability";
+        case LLAMA_MODEL_META_KEY_SAMPLING_XTC_THRESHOLD:   return "general.sampling.xtc_threshold";
+        case LLAMA_MODEL_META_KEY_SAMPLING_TEMP:            return "general.sampling.temp";
+        case LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_LAST_N:  return "general.sampling.penalty_last_n";
+        case LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_REPEAT:  return "general.sampling.penalty_repeat";
+        case LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT:        return "general.sampling.mirostat";
+        case LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_TAU:    return "general.sampling.mirostat_tau";
+        case LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA:    return "general.sampling.mirostat_eta";
+        default: return nullptr;
+    }
+}
+
 int32_t llama_model_meta_key_by_index(const llama_model * model, int i, char * buf, size_t buf_size) {
     if (i < 0 || i >= (int)model->gguf_kv.size()) {
         if (buf_size > 0) {
package/src/llama.cpp/src/models/models.h

@@ -431,6 +431,10 @@ struct llm_build_refact : public llm_graph_context {
     llm_build_refact(const llama_model & model, const llm_graph_params & params);
 };
 
+struct llm_build_rnd1 : public llm_graph_context {
+    llm_build_rnd1(const llama_model & model, const llm_graph_params & params);
+};
+
 struct llm_build_rwkv6 : public llm_build_rwkv6_base {
     llm_build_rwkv6(const llama_model & model, const llm_graph_params & params);
 };
package/src/llama.cpp/src/models/rnd1.cpp

@@ -0,0 +1,126 @@
+#include "models.h"
+
+// RND1 is a Qwen3Moe AR model converted to diffusion model.
+llm_build_rnd1::llm_build_rnd1(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    const int64_t n_embd_head = hparams.n_embd_head_v;
+
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+    ggml_tensor * cur;
+    ggml_tensor * inpL;
+
+    inpL = build_inp_embd(model.tok_embd);
+
+    // inp_pos - contains the positions
+    ggml_tensor * inp_pos = build_inp_pos();
+
+    // Non-causal attention for diffusion
+    auto * inp_attn = build_attn_inp_no_cache();
+
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        ggml_tensor * inpSA = inpL;
+
+        // norm
+        cur = build_norm(inpL,
+                model.layers[il].attn_norm, NULL,
+                LLM_NORM_RMS, il);
+        cb(cur, "attn_norm", il);
+
+        // self_attention
+        {
+            // compute Q and K and RoPE them
+            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+            cb(Qcur, "Qcur", il);
+
+            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+            cb(Kcur, "Kcur", il);
+
+            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+            cb(Vcur, "Vcur", il);
+
+            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+            Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+            cb(Qcur, "Qcur_normed", il);
+
+            Qcur = ggml_rope_ext(
+                    ctx0, Qcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                    );
+
+            Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+            cb(Kcur, "Kcur_normed", il);
+
+            Kcur = ggml_rope_ext(
+                    ctx0, Kcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                    );
+
+            cb(Qcur, "Qcur", il);
+            cb(Kcur, "Kcur", il);
+            cb(Vcur, "Vcur", il);
+
+            cur = build_attn(inp_attn,
+                    model.layers[il].wo, model.layers[il].bo,
+                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+        }
+        if (il == n_layer - 1 && inp_out_ids) {
+            cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+        }
+        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+        cb(ffn_inp, "ffn_inp", il);
+
+        // MoE branch
+        cur = build_norm(ffn_inp,
+                model.layers[il].ffn_norm, NULL,
+                LLM_NORM_RMS, il);
+        cb(cur, "ffn_norm", il);
+
+        ggml_tensor * moe_out =
+            build_moe_ffn(cur,
+                    model.layers[il].ffn_gate_inp,
+                    model.layers[il].ffn_up_exps,
+                    model.layers[il].ffn_gate_exps,
+                    model.layers[il].ffn_down_exps,
+                    nullptr,
+                    n_expert, n_expert_used,
+                    LLM_FFN_SILU, true,
+                    false, 0.0,
+                    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                    il);
+        cb(moe_out, "ffn_moe_out", il);
+        cur = moe_out;
+
+        cur = ggml_add(ctx0, cur, ffn_inp);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // input for next layer
+        inpL = cur;
+    }
+    cur = inpL;
+
+    cur = build_norm(cur,
+            model.output_norm, NULL,
+            LLM_NORM_RMS, -1);
+
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;
+
+    // lm_head
+    cur = build_lora_mm(model.output, cur);
+
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
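The new rnd1.cpp graph mirrors the Qwen3MoE layer stack but runs it bidirectionally: build_attn_inp_no_cache, hparams.causal_attn = false in load_hparams, and the nullptr memory in create_memory together mean no causal mask and no KV cache, which is what a diffusion decoder needs when it re-scores the whole sequence each step. The masking difference can be sketched outside llama.cpp (illustrative only, not code from this package):

#include <cmath>
#include <cstddef>
#include <vector>

// Illustration only: a causal mask blocks attention to future positions with
// -INFINITY; the non-causal (diffusion) case leaves the mask fully open.
static std::vector<float> make_attn_mask(int n_tokens, bool causal) {
    std::vector<float> mask((std::size_t) n_tokens * n_tokens, 0.0f);
    if (causal) {
        for (int q = 0; q < n_tokens; ++q) {
            for (int k = q + 1; k < n_tokens; ++k) {
                mask[(std::size_t) q * n_tokens + k] = -INFINITY;
            }
        }
    }
    return mask;
}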