@fugood/llama.node 1.4.12 → 1.4.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +9 -9
- package/src/llama.cpp/common/arg.cpp +99 -45
- package/src/llama.cpp/common/chat.cpp +4 -4
- package/src/llama.cpp/common/common.cpp +19 -0
- package/src/llama.cpp/common/common.h +10 -0
- package/src/llama.cpp/common/llguidance.cpp +10 -6
- package/src/llama.cpp/common/regex-partial.cpp +13 -13
- package/src/llama.cpp/common/sampling.cpp +58 -14
- package/src/llama.cpp/common/sampling.h +3 -1
- package/src/llama.cpp/include/llama.h +87 -8
- package/src/llama.cpp/src/llama-arch.cpp +2 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +615 -28
- package/src/llama.cpp/src/llama-context.h +43 -1
- package/src/llama.cpp/src/llama-grammar.cpp +40 -13
- package/src/llama.cpp/src/llama-grammar.h +2 -0
- package/src/llama.cpp/src/llama-graph.cpp +173 -5
- package/src/llama.cpp/src/llama-graph.h +71 -6
- package/src/llama.cpp/src/llama-hparams.cpp +4 -0
- package/src/llama.cpp/src/llama-hparams.h +8 -2
- package/src/llama.cpp/src/llama-model-saver.cpp +3 -0
- package/src/llama.cpp/src/llama-model.cpp +51 -11
- package/src/llama.cpp/src/llama-sampling.cpp +1232 -170
- package/src/llama.cpp/src/llama-sampling.h +16 -7
- package/src/llama.cpp/src/llama.cpp +38 -30
- package/src/llama.cpp/src/models/afmoe.cpp +9 -5
- package/src/llama.cpp/src/models/cohere2-iswa.cpp +3 -0
- package/src/llama.cpp/src/models/gemma2-iswa.cpp +5 -2
- package/src/llama.cpp/src/models/llama-iswa.cpp +6 -2
- package/src/llama.cpp/src/models/modern-bert.cpp +4 -3
- package/src/llama.cpp/src/models/openai-moe-iswa.cpp +5 -2
- package/src/llama.cpp/src/models/smallthinker.cpp +11 -5
@@ -507,6 +507,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
 
     ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
     ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
+    ml.get_key(LLM_KV_EMBEDDING_LENGTH_OUT, hparams.n_embd_out, false);
     ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
     ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
     ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
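Note: the new optional LLM_KV_EMBEDDING_LENGTH_OUT key fills hparams.n_embd_out, which later hunks read back through hparams.get_n_embd_out(). The helper itself lives in llama-hparams.{h,cpp} (changed in this release but not shown on this page); a hypothetical sketch of its likely fallback behavior, assuming it simply defaults to the regular embedding width:

    // Hypothetical paraphrase only -- the real definition is in llama-hparams.cpp,
    // which this diff page does not display. Assumed behavior: use the optional
    // output-embedding width when the GGUF key was present (non-zero), otherwise
    // fall back to the hidden size n_embd.
    uint32_t get_n_embd_out_sketch(uint32_t n_embd, uint32_t n_embd_out) {
        return n_embd_out > 0 ? n_embd_out : n_embd;
    }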
@@ -578,6 +579,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
     GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED);
 
+    // TODO: Handle SWA metadata similarly when models start implementing it
     // rope_freq_scale (inverse of the kv) is optional
     float ropescale = 0.0f;
     if (!ml.get_key(LLM_KV_ROPE_SCALING_FACTOR, ropescale, false)) {
@@ -586,10 +588,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     }
     hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;
 
-    // by default assume that the sliding-window layers use the same scaling type as the non-sliding-window layers
-    hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
-    hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
-
     ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);
 
     // non-transformer models do not have attention heads
@@ -677,6 +675,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
         hparams.f_attn_temp_scale = 0.1f;
         hparams.f_attn_temp_offset = 1.0f;
         hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
+
+        hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+        hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+        ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
     }
 
     switch (hparams.n_expert) {
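The same three-line idiom repeats in the per-architecture hunks below: seed the sliding-window-attention (SWA) RoPE parameters from the full-attention values, then let an optional GGUF key override the SWA base frequency. A condensed sketch of the idiom (paraphrased from the hunks on this page, not a separate upstream snippet):

    // defaults: SWA layers reuse the full-attention RoPE settings
    hparams.rope_freq_base_train_swa  = hparams.rope_freq_base_train;
    hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;

    // optional override; the trailing `false` marks the key as non-required,
    // so models without the metadata keep the defaults above
    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);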
@@ -722,6 +724,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     if (hparams.n_swa > 0) {
         hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
         hparams.set_swa_pattern(4);
+
+        hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+        hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+        ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
     } else {
         hparams.swa_type = LLAMA_SWA_TYPE_NONE;
     }
@@ -1243,7 +1249,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     if (found_swa && hparams.n_swa > 0) {
         uint32_t swa_period = 8;
         hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
-        hparams.rope_freq_scale_train_swa = 1.0f;
         ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
         ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
         hparams.set_swa_pattern(swa_period);
@@ -1309,7 +1314,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     hparams.n_swa = 4096; // default value of gemma 2
     hparams.set_swa_pattern(2);
     hparams.attn_soft_cap = true;
+    hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+    hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
 
+    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
     ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
     ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false);
@@ -1334,8 +1342,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
         hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
         hparams.set_swa_pattern(6);
 
-        hparams.rope_freq_base_train_swa = 10000.0f;
-        hparams.rope_freq_scale_train_swa = 1.0f;
+        ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
     } else {
         hparams.swa_type = LLAMA_SWA_TYPE_NONE;
     }
@@ -1365,10 +1372,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     hparams.set_swa_pattern(5);
 
     hparams.n_layer_kv_from_start = 20;
-    hparams.rope_freq_base_train_swa = 10000.0f;
-    hparams.rope_freq_scale_train_swa = 1.0f;
     hparams.f_attention_scale = 1.0f;
 
+    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
     ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
@@ -1384,9 +1390,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     hparams.set_swa_pattern(6);
 
     hparams.causal_attn = false; // embeddings do not use causal attention
-    hparams.rope_freq_base_train_swa = 10000.0f;
-    hparams.rope_freq_scale_train_swa = 1.0f;
 
+    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
     ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
     ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
@@ -1525,7 +1530,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     {
         hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
         hparams.set_swa_pattern(4);
+        hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+        hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
 
+        ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
         ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
         ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
         ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -1564,6 +1572,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     if (found_swa && hparams.n_swa > 0) {
         hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
         hparams.set_swa_pattern(4);
+
+        hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+        hparams.rope_freq_scale_train_swa = 1.0; // See olmo2.cpp
+        ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
     } else {
         hparams.swa_type = LLAMA_SWA_TYPE_NONE;
     }
@@ -1906,6 +1918,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
         hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
         hparams.n_swa = 4096;
         hparams.set_swa_pattern(4);
+
+        hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+        hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+        ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
     }
 
     ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
@@ -2208,6 +2224,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
     hparams.set_swa_pattern(2);
 
+    hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+    hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
+
     switch (hparams.n_layer) {
         case 24: type = LLM_TYPE_20B; break;
         case 36: type = LLM_TYPE_120B; break;
@@ -2252,6 +2272,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
         hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
         hparams.n_swa = 4096;
         hparams.set_swa_pattern(4, true);
+
+        hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+        hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+        ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
     } else {
         hparams.swa_type = LLAMA_SWA_TYPE_NONE;
         hparams.n_no_rope_layer_step = hparams.n_layer;
@@ -6446,6 +6470,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             layer.shortconv.out_proj = create_tensor(tn(LLM_TENSOR_SHORTCONV_OUTPROJ, "weight", i), {n_embd, n_embd}, 0);
         }
     }
+
+    // for LFM2-ColBert-350M
+    dense_2_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "weight"), {n_embd, hparams.get_n_embd_out()}, TENSOR_NOT_REQUIRED);
 } break;
 case LLM_ARCH_SMALLTHINKER:
     {
@@ -7098,6 +7125,10 @@ void llama_model::print_info() const {
     LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
     LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
     LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
+    if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+        LLAMA_LOG_INFO("%s: freq_base_swa = %.1f\n", __func__, hparams.rope_freq_base_train_swa);
+        LLAMA_LOG_INFO("%s: freq_scale_swa = %g\n", __func__, hparams.rope_freq_scale_train_swa);
+    }
     LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
     LLAMA_LOG_INFO("%s: rope_yarn_log_mul= %.4f\n", __func__, hparams.rope_yarn_log_mul);
     LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
@@ -7910,12 +7941,17 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
     // add on pooling layer
     llm->build_pooling(cls, cls_b, cls_out, cls_out_b);
 
+    // add backend sampling layers (if any)
+    llm->build_sampling();
+
     // if the gguf model was converted with --sentence-transformers-dense-modules
     // there will be two additional dense projection layers
     // dense linear projections are applied after pooling
     // TODO: move reranking logic here and generalize
     llm->build_dense_out(dense_2_out_layers, dense_3_out_layers);
 
+    llm->res->set_outputs();
+
     return llm->res->get_gf();
 }
 
@@ -7971,6 +8007,10 @@ int32_t llama_model_n_embd_inp(const llama_model * model) {
     return model->hparams.n_embd_inp();
 }
 
+int32_t llama_model_n_embd_out(const llama_model * model) {
+    return model->hparams.get_n_embd_out();
+}
+
 int32_t llama_model_n_layer(const llama_model * model) {
     return model->hparams.n_layer;
 }
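llama_model_n_embd_out() is a new public accessor declared in the updated include/llama.h. A minimal, hypothetical caller-side sketch (not part of the diff), assuming the existing llama_model_load_from_file / llama_model_n_embd / llama_model_free API:

    #include "llama.h"
    #include <cstdio>

    int main(int argc, char ** argv) {
        if (argc < 2) {
            return 1;
        }

        llama_model_params mparams = llama_model_default_params();
        llama_model * model = llama_model_load_from_file(argv[1], mparams);
        if (model == NULL) {
            return 1;
        }

        // hidden size vs. output embedding width; these differ for models such as
        // LFM2-ColBert-350M that apply a dense output projection after pooling
        printf("n_embd     = %d\n", llama_model_n_embd(model));
        printf("n_embd_out = %d\n", llama_model_n_embd_out(model)); // new accessor in the bundled llama.cpp

        llama_model_free(model);
        return 0;
    }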