@fugood/llama.node 1.4.12 → 1.4.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. package/package.json +15 -15
  2. package/scripts/llama.cpp.patch +9 -9
  3. package/src/llama.cpp/common/arg.cpp +99 -45
  4. package/src/llama.cpp/common/chat.cpp +4 -4
  5. package/src/llama.cpp/common/common.cpp +19 -0
  6. package/src/llama.cpp/common/common.h +10 -0
  7. package/src/llama.cpp/common/llguidance.cpp +10 -6
  8. package/src/llama.cpp/common/regex-partial.cpp +13 -13
  9. package/src/llama.cpp/common/sampling.cpp +58 -14
  10. package/src/llama.cpp/common/sampling.h +3 -1
  11. package/src/llama.cpp/include/llama.h +87 -8
  12. package/src/llama.cpp/src/llama-arch.cpp +2 -0
  13. package/src/llama.cpp/src/llama-arch.h +1 -0
  14. package/src/llama.cpp/src/llama-context.cpp +615 -28
  15. package/src/llama.cpp/src/llama-context.h +43 -1
  16. package/src/llama.cpp/src/llama-grammar.cpp +40 -13
  17. package/src/llama.cpp/src/llama-grammar.h +2 -0
  18. package/src/llama.cpp/src/llama-graph.cpp +173 -5
  19. package/src/llama.cpp/src/llama-graph.h +71 -6
  20. package/src/llama.cpp/src/llama-hparams.cpp +4 -0
  21. package/src/llama.cpp/src/llama-hparams.h +8 -2
  22. package/src/llama.cpp/src/llama-model-saver.cpp +3 -0
  23. package/src/llama.cpp/src/llama-model.cpp +51 -11
  24. package/src/llama.cpp/src/llama-sampling.cpp +1232 -170
  25. package/src/llama.cpp/src/llama-sampling.h +16 -7
  26. package/src/llama.cpp/src/llama.cpp +38 -30
  27. package/src/llama.cpp/src/models/afmoe.cpp +9 -5
  28. package/src/llama.cpp/src/models/cohere2-iswa.cpp +3 -0
  29. package/src/llama.cpp/src/models/gemma2-iswa.cpp +5 -2
  30. package/src/llama.cpp/src/models/llama-iswa.cpp +6 -2
  31. package/src/llama.cpp/src/models/modern-bert.cpp +4 -3
  32. package/src/llama.cpp/src/models/openai-moe-iswa.cpp +5 -2
  33. package/src/llama.cpp/src/models/smallthinker.cpp +11 -5
package/src/llama.cpp/src/llama-model.cpp
@@ -507,6 +507,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
 
  ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
  ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
+ ml.get_key(LLM_KV_EMBEDDING_LENGTH_OUT, hparams.n_embd_out, false);
  ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
  ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
  ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
@@ -578,6 +579,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
  GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED);
 
+ // TODO: Handle SWA metadata similarly when models start implementing it
  // rope_freq_scale (inverse of the kv) is optional
  float ropescale = 0.0f;
  if (!ml.get_key(LLM_KV_ROPE_SCALING_FACTOR, ropescale, false)) {
@@ -586,10 +588,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  }
  hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;
 
- // by default assume that the sliding-window layers use the same scaling type as the non-sliding-window layers
- hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
- hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
-
  ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);
 
  // non-transformer models do not have attention heads
@@ -677,6 +675,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  hparams.f_attn_temp_scale = 0.1f;
  hparams.f_attn_temp_offset = 1.0f;
  hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
+
+ hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+ hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
  }
 
  switch (hparams.n_expert) {
@@ -722,6 +724,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  if (hparams.n_swa > 0) {
  hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  hparams.set_swa_pattern(4);
+
+ hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+ hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
  } else {
  hparams.swa_type = LLAMA_SWA_TYPE_NONE;
  }
@@ -1243,7 +1249,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  if (found_swa && hparams.n_swa > 0) {
  uint32_t swa_period = 8;
  hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
- hparams.rope_freq_scale_train_swa = 1.0f;
  ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
  ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
  hparams.set_swa_pattern(swa_period);
@@ -1309,7 +1314,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  hparams.n_swa = 4096; // default value of gemma 2
  hparams.set_swa_pattern(2);
  hparams.attn_soft_cap = true;
+ hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+ hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
 
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
  ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false);
@@ -1334,8 +1342,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  hparams.set_swa_pattern(6);
 
- hparams.rope_freq_base_train_swa = 10000.0f;
- hparams.rope_freq_scale_train_swa = 1.0f;
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
  } else {
  hparams.swa_type = LLAMA_SWA_TYPE_NONE;
  }
@@ -1365,10 +1372,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  hparams.set_swa_pattern(5);
 
  hparams.n_layer_kv_from_start = 20;
- hparams.rope_freq_base_train_swa = 10000.0f;
- hparams.rope_freq_scale_train_swa = 1.0f;
  hparams.f_attention_scale = 1.0f;
 
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
  ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
@@ -1384,9 +1390,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  hparams.set_swa_pattern(6);
 
  hparams.causal_attn = false; // embeddings do not use causal attention
- hparams.rope_freq_base_train_swa = 10000.0f;
- hparams.rope_freq_scale_train_swa = 1.0f;
 
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
  ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
@@ -1525,7 +1530,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  {
  hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  hparams.set_swa_pattern(4);
+ hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+ hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
 
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
  ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
  ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -1564,6 +1572,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  if (found_swa && hparams.n_swa > 0) {
  hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  hparams.set_swa_pattern(4);
+
+ hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+ hparams.rope_freq_scale_train_swa = 1.0; // See olmo2.cpp
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
  } else {
  hparams.swa_type = LLAMA_SWA_TYPE_NONE;
  }
@@ -1906,6 +1918,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  hparams.n_swa = 4096;
  hparams.set_swa_pattern(4);
+
+ hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+ hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
  }
 
  ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
@@ -2208,6 +2224,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  hparams.set_swa_pattern(2);
 
+ hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+ hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
+
  switch (hparams.n_layer) {
  case 24: type = LLM_TYPE_20B; break;
  case 36: type = LLM_TYPE_120B; break;
@@ -2252,6 +2272,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  hparams.n_swa = 4096;
  hparams.set_swa_pattern(4, true);
+
+ hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+ hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
  } else {
  hparams.swa_type = LLAMA_SWA_TYPE_NONE;
  hparams.n_no_rope_layer_step = hparams.n_layer;
@@ -6446,6 +6470,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.shortconv.out_proj = create_tensor(tn(LLM_TENSOR_SHORTCONV_OUTPROJ, "weight", i), {n_embd, n_embd}, 0);
  }
  }
+
+ // for LFM2-ColBert-350M
+ dense_2_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "weight"), {n_embd, hparams.get_n_embd_out()}, TENSOR_NOT_REQUIRED);
  } break;
  case LLM_ARCH_SMALLTHINKER:
  {
@@ -7098,6 +7125,10 @@ void llama_model::print_info() const {
  LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
  LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
  LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
+ if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+ LLAMA_LOG_INFO("%s: freq_base_swa = %.1f\n", __func__, hparams.rope_freq_base_train_swa);
+ LLAMA_LOG_INFO("%s: freq_scale_swa = %g\n", __func__, hparams.rope_freq_scale_train_swa);
+ }
  LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
  LLAMA_LOG_INFO("%s: rope_yarn_log_mul= %.4f\n", __func__, hparams.rope_yarn_log_mul);
  LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
@@ -7910,12 +7941,17 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
  // add on pooling layer
  llm->build_pooling(cls, cls_b, cls_out, cls_out_b);
 
+ // add backend sampling layers (if any)
+ llm->build_sampling();
+
  // if the gguf model was converted with --sentence-transformers-dense-modules
  // there will be two additional dense projection layers
  // dense linear projections are applied after pooling
  // TODO: move reranking logic here and generalize
  llm->build_dense_out(dense_2_out_layers, dense_3_out_layers);
 
+ llm->res->set_outputs();
+
  return llm->res->get_gf();
  }
 
@@ -7971,6 +8007,10 @@ int32_t llama_model_n_embd_inp(const llama_model * model) {
  return model->hparams.n_embd_inp();
  }
 
+ int32_t llama_model_n_embd_out(const llama_model * model) {
+ return model->hparams.get_n_embd_out();
+ }
+
  int32_t llama_model_n_layer(const llama_model * model) {
  return model->hparams.n_layer;
  }
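Note: the last hunk above adds a new public accessor, llama_model_n_embd_out(), next to the existing llama_model_n_embd_inp() (its declaration lands in the llama.h changes listed above). The code below is a minimal usage sketch, not part of the package diff; it assumes the standard llama.cpp C API loaders (llama_model_default_params, llama_model_load_from_file, llama_model_free) and uses a placeholder model path.

#include <cstdio>
#include "llama.h"

int main() {
    llama_backend_init();

    // Hypothetical example: load a model and compare input vs. output embedding sizes.
    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_model_load_from_file("model.gguf", mparams); // placeholder path
    if (model == nullptr) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    // Per the hparams hunk above, n_embd_out is read from the optional
    // LLM_KV_EMBEDDING_LENGTH_OUT key; get_n_embd_out() presumably falls back
    // to n_embd when that key is absent (see the llama-hparams.cpp change in
    // this release), so most models will report the same value for both.
    const int32_t n_embd     = llama_model_n_embd(model);
    const int32_t n_embd_out = llama_model_n_embd_out(model);
    printf("n_embd = %d, n_embd_out = %d\n", n_embd, n_embd_out);

    llama_model_free(model);
    llama_backend_free();
    return 0;
}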