@fugood/llama.node 1.4.12 → 1.4.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. package/package.json +15 -15
  2. package/scripts/llama.cpp.patch +9 -9
  3. package/src/llama.cpp/common/arg.cpp +99 -45
  4. package/src/llama.cpp/common/chat.cpp +4 -4
  5. package/src/llama.cpp/common/common.cpp +19 -0
  6. package/src/llama.cpp/common/common.h +10 -0
  7. package/src/llama.cpp/common/llguidance.cpp +10 -6
  8. package/src/llama.cpp/common/regex-partial.cpp +13 -13
  9. package/src/llama.cpp/common/sampling.cpp +58 -14
  10. package/src/llama.cpp/common/sampling.h +3 -1
  11. package/src/llama.cpp/include/llama.h +87 -8
  12. package/src/llama.cpp/src/llama-arch.cpp +2 -0
  13. package/src/llama.cpp/src/llama-arch.h +1 -0
  14. package/src/llama.cpp/src/llama-context.cpp +615 -28
  15. package/src/llama.cpp/src/llama-context.h +43 -1
  16. package/src/llama.cpp/src/llama-grammar.cpp +40 -13
  17. package/src/llama.cpp/src/llama-grammar.h +2 -0
  18. package/src/llama.cpp/src/llama-graph.cpp +173 -5
  19. package/src/llama.cpp/src/llama-graph.h +71 -6
  20. package/src/llama.cpp/src/llama-hparams.cpp +4 -0
  21. package/src/llama.cpp/src/llama-hparams.h +8 -2
  22. package/src/llama.cpp/src/llama-model-saver.cpp +3 -0
  23. package/src/llama.cpp/src/llama-model.cpp +51 -11
  24. package/src/llama.cpp/src/llama-sampling.cpp +1232 -170
  25. package/src/llama.cpp/src/llama-sampling.h +16 -7
  26. package/src/llama.cpp/src/llama.cpp +38 -30
  27. package/src/llama.cpp/src/models/afmoe.cpp +9 -5
  28. package/src/llama.cpp/src/models/cohere2-iswa.cpp +3 -0
  29. package/src/llama.cpp/src/models/gemma2-iswa.cpp +5 -2
  30. package/src/llama.cpp/src/models/llama-iswa.cpp +6 -2
  31. package/src/llama.cpp/src/models/modern-bert.cpp +4 -3
  32. package/src/llama.cpp/src/models/openai-moe-iswa.cpp +5 -2
  33. package/src/llama.cpp/src/models/smallthinker.cpp +11 -5
package/package.json CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "1.4.12",
+  "version": "1.4.13",
   "description": "An another Node binding of llama.cpp",
   "main": "lib/index.js",
   "scripts": {
@@ -72,20 +72,20 @@
     "CMakeLists.txt"
   ],
   "optionalDependencies": {
-    "@fugood/node-llama-darwin-arm64": "1.4.12",
-    "@fugood/node-llama-darwin-x64": "1.4.12",
-    "@fugood/node-llama-linux-arm64": "1.4.12",
-    "@fugood/node-llama-linux-arm64-cuda": "1.4.12",
-    "@fugood/node-llama-linux-arm64-snapdragon": "1.4.12",
-    "@fugood/node-llama-linux-arm64-vulkan": "1.4.12",
-    "@fugood/node-llama-linux-x64": "1.4.12",
-    "@fugood/node-llama-linux-x64-cuda": "1.4.12",
-    "@fugood/node-llama-linux-x64-vulkan": "1.4.12",
-    "@fugood/node-llama-win32-arm64": "1.4.12",
-    "@fugood/node-llama-win32-arm64-vulkan": "1.4.12",
-    "@fugood/node-llama-win32-x64": "1.4.12",
-    "@fugood/node-llama-win32-x64-cuda": "1.4.12",
-    "@fugood/node-llama-win32-x64-vulkan": "1.4.12"
+    "@fugood/node-llama-darwin-arm64": "1.4.13",
+    "@fugood/node-llama-darwin-x64": "1.4.13",
+    "@fugood/node-llama-linux-arm64": "1.4.13",
+    "@fugood/node-llama-linux-arm64-cuda": "1.4.13",
+    "@fugood/node-llama-linux-arm64-snapdragon": "1.4.13",
+    "@fugood/node-llama-linux-arm64-vulkan": "1.4.13",
+    "@fugood/node-llama-linux-x64": "1.4.13",
+    "@fugood/node-llama-linux-x64-cuda": "1.4.13",
+    "@fugood/node-llama-linux-x64-vulkan": "1.4.13",
+    "@fugood/node-llama-win32-arm64": "1.4.13",
+    "@fugood/node-llama-win32-arm64-vulkan": "1.4.13",
+    "@fugood/node-llama-win32-x64": "1.4.13",
+    "@fugood/node-llama-win32-x64-cuda": "1.4.13",
+    "@fugood/node-llama-win32-x64-vulkan": "1.4.13"
   },
   "devDependencies": {
     "@babel/preset-env": "^7.24.4",
package/scripts/llama.cpp.patch CHANGED
@@ -32,7 +32,7 @@ index 1bcba9cd8..b7cd68734 100644
 static std::string_view trim_trailing_space(std::string_view sv, int max = -1) {
     int count = 0;
 diff --git a/src/llama.cpp/common/chat.cpp b/src/llama.cpp/common/chat.cpp
-index b98ab21ce..2f782837a 100644
+index 22e527bab..c3d0affca 100644
 --- a/src/llama.cpp/common/chat.cpp
 +++ b/src/llama.cpp/common/chat.cpp
 @@ -7,9 +7,6 @@
@@ -96,10 +96,10 @@ index 8bd4a325f..333b3301f 100644
 struct common_chat_tool_call {
     std::string name;
 diff --git a/src/llama.cpp/common/common.cpp b/src/llama.cpp/common/common.cpp
-index 79c475612..cf189f8bc 100644
+index 41b2b6833..fe9ba05aa 100644
 --- a/src/llama.cpp/common/common.cpp
 +++ b/src/llama.cpp/common/common.cpp
-@@ -1342,6 +1342,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
+@@ -1361,6 +1361,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
         mparams.devices = params.devices.data();
     }
 
@@ -108,10 +108,10 @@ index 79c475612..cf189f8bc 100644
     mparams.main_gpu = params.main_gpu;
     mparams.split_mode = params.split_mode;
 diff --git a/src/llama.cpp/common/common.h b/src/llama.cpp/common/common.h
-index f8bc686b6..555ba044a 100644
+index d6fd0d37a..477209ce5 100644
 --- a/src/llama.cpp/common/common.h
 +++ b/src/llama.cpp/common/common.h
-@@ -307,6 +307,7 @@ struct lr_opt {
+@@ -310,6 +310,7 @@ struct lr_opt {
 struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);
 
 struct common_params {
@@ -133,10 +133,10 @@ index 7622d0bf4..d2edcfddb 100644
 check_cxx_compiler_flag(-mfp16-format=ieee GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E)
 if (NOT "${GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
 diff --git a/src/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/src/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp
-index 13b96d61f..5fa163442 100644
+index 365a24b49..83bf4ee62 100644
 --- a/src/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp
 +++ b/src/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp
-@@ -2680,9 +2680,24 @@ static const char * ggml_backend_hexagon_device_get_description(ggml_backend_dev
+@@ -2798,9 +2798,24 @@ static const char * ggml_backend_hexagon_device_get_description(ggml_backend_dev
     GGML_UNUSED(dev);
 }
 
@@ -163,7 +163,7 @@ index 13b96d61f..5fa163442 100644
     *total = *free;
 
     GGML_UNUSED(dev);
-@@ -2879,10 +2894,17 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
+@@ -3010,10 +3025,17 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
     }
 }
 
     GGML_LOG_INFO("ggml-hex: Hexagon Arch version v%d\n", opt_arch);
 
-@@ -2895,6 +2917,8 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
+@@ -3026,6 +3048,8 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
     } catch (const std::exception & exc) {
         GGML_LOG_ERROR("ggml-hex: failed to create device/session %zu\n", i);
         devices[i].context = nullptr;
package/src/llama.cpp/common/arg.cpp CHANGED
@@ -679,7 +679,6 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
         "llama-quantize",
         "llama-qwen2vl-cli",
         "llama-retrieval",
-        "llama-run",
         "llama-save-load-state",
         "llama-server",
         "llama-simple",
@@ -854,6 +853,54 @@ bool common_arg_utils::is_autoy(const std::string & value) {
     return value == "auto" || value == "-1";
 }
 
+// Simple CSV parser that handles quoted fields and escaped quotes
+// example:
+//   input:  value1,"value, with, commas","value with ""escaped"" quotes",value4
+//   output: [value1] [value, with, commas] [value with "escaped" quotes] [value4]
+static std::vector<std::string> parse_csv_row(const std::string& input) {
+    std::vector<std::string> fields;
+    std::string field;
+    bool in_quotes = false;
+
+    for (size_t i = 0; i < input.length(); ++i) {
+        char ch = input[i];
+
+        if (ch == '"') {
+            if (!in_quotes) {
+                // start of quoted field (only valid if at beginning of field)
+                if (!field.empty()) {
+                    // quote appeared in middle of unquoted field, treat as literal
+                    field += '"';
+                } else {
+                    in_quotes = true; // start
+                }
+            } else {
+                if (i + 1 < input.length() && input[i + 1] == '"') {
+                    // escaped quote: ""
+                    field += '"';
+                    ++i; // skip the next quote
+                } else {
+                    in_quotes = false; // end
+                }
+            }
+        } else if (ch == ',') {
+            if (in_quotes) {
+                field += ',';
+            } else {
+                fields.push_back(std::move(field));
+                field.clear();
+            }
+        } else {
+            field += ch;
+        }
+    }
+
+    // Add the last field
+    fields.push_back(std::move(field));
+
+    return fields;
+}
+
 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
     // per-example default params
     // we define here to make sure it's included in llama-gen-docs
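
For reference, a behavior sketch of the helper above (the call is illustrative, not part of the diff):

    // a quoted field keeps its commas; doubled quotes collapse to a single one
    std::vector<std::string> v = parse_csv_row("a.gguf,\"b,c.gguf\",\"say \"\"hi\"\"\"");
    // v == { "a.gguf", "b,c.gguf", "say \"hi\"" }
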
@@ -1250,7 +1297,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--in-file"}, "FNAME",
         "an input file (use comma-separated values to specify multiple files)",
         [](common_params & params, const std::string & value) {
-            for (const auto & item : string_split<std::string>(value, ',')) {
+            for (const auto & item : parse_csv_row(value)) {
                 std::ifstream file(item);
                 if (!file) {
                     throw std::runtime_error(string_format("error: failed to open file '%s'\n", item.c_str()));
@@ -1397,7 +1444,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, bool value) {
             params.warmup = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_DEBUG}));
     add_opt(common_arg(
         {"--spm-infill"},
         string_format(
@@ -1695,6 +1742,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.sampling.grammar = json_schema_to_grammar(json::parse(schema));
         }
     ).set_sparam());
+    add_opt(common_arg(
+        {"-bs", "--backend-sampling"},
+        "enable backend sampling (experimental) (default: disabled)",
+        [](common_params & params) {
+            params.sampling.backend_sampling = true;
+        }
+    ).set_sparam().set_env("LLAMA_ARG_BACKEND_SAMPLING"));
     add_opt(common_arg(
         {"--pooling"}, "{none,mean,cls,last,rank}",
         "pooling type for embeddings, use model default if unspecified",
@@ -1706,7 +1760,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             else if (value == "rank") { params.pooling_type = LLAMA_POOLING_TYPE_RANK; }
             else { throw std::invalid_argument("invalid value"); }
         }
-    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_POOLING"));
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_DEBUG}).set_env("LLAMA_ARG_POOLING"));
     add_opt(common_arg(
         {"--attention"}, "{causal,non-causal}",
         "attention type for embeddings, use model default if unspecified",
@@ -1995,7 +2049,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--image", "--audio"}, "FILE",
         "path to an image or audio file. use with multimodal models, use comma-separated values for multiple files\n",
         [](common_params & params, const std::string & value) {
-            for (const auto & item : string_split<std::string>(value, ',')) {
+            for (const auto & item : parse_csv_row(value)) {
                 params.image.emplace_back(item);
             }
         }
@@ -2252,37 +2306,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ));
     add_opt(common_arg(
         {"--override-kv"}, "KEY=TYPE:VALUE,...",
-        "advanced option to override model metadata by key. to specify multiple overrides, either use comma-separated or repeat this argument.\n"
+        "advanced option to override model metadata by key. to specify multiple overrides, either use comma-separated values.\n"
         "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false,tokenizer.ggml.add_eos_token=bool:false",
         [](common_params & params, const std::string & value) {
-            std::vector<std::string> kv_overrides;
-
-            std::string current;
-            bool escaping = false;
-
-            for (const char c : value) {
-                if (escaping) {
-                    current.push_back(c);
-                    escaping = false;
-                } else if (c == '\\') {
-                    escaping = true;
-                } else if (c == ',') {
-                    kv_overrides.push_back(current);
-                    current.clear();
-                } else {
-                    current.push_back(c);
-                }
-            }
-
-            if (escaping) {
-                current.push_back('\\');
-            }
-
-            kv_overrides.push_back(current);
-
-            for (const auto & kv_override : kv_overrides) {
-                if (!string_parse_kv_override(kv_override.c_str(), params.kv_overrides)) {
-                    throw std::runtime_error(string_format("error: Invalid type for KV override: %s\n", kv_override.c_str()));
+            for (const auto & item : parse_csv_row(value)) {
+                if (!string_parse_kv_override(item.c_str(), params.kv_overrides)) {
+                    throw std::runtime_error(string_format("error: Invalid type for KV override: %s\n", item.c_str()));
                 }
             }
         }
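
One behavioral consequence worth noting: the removed splitter honored backslash-escaped commas, while parse_csv_row uses CSV-style quoting, so an override value containing a comma must now be quoted rather than escaped. A hedged before/after (the new form assumes the whole KEY=TYPE:VALUE item is wrapped in CSV quotes, per parse_csv_row above):

    // old: --override-kv tokenizer.chat_template=str:a\,b       (backslash kept the comma)
    // new: --override-kv '"tokenizer.chat_template=str:a,b"'    (CSV-style quoting of the whole item)
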
@@ -2299,7 +2328,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--lora"}, "FNAME",
         "path to LoRA adapter (use comma-separated values to load multiple adapters)",
         [](common_params & params, const std::string & value) {
-            for (const auto & item : string_split<std::string>(value, ',')) {
+            for (const auto & item : parse_csv_row(value)) {
                 params.lora_adapters.push_back({ item, 1.0, "", "", nullptr });
             }
         }
@@ -2310,7 +2339,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         "path to LoRA adapter with user defined scaling (format: FNAME:SCALE,...)\n"
         "note: use comma-separated values",
         [](common_params & params, const std::string & value) {
-            for (const auto & item : string_split<std::string>(value, ',')) {
+            for (const auto & item : parse_csv_row(value)) {
                 auto parts = string_split<std::string>(item, ':');
                 if (parts.size() != 2) {
                     throw std::invalid_argument("lora-scaled format: FNAME:SCALE");
@@ -2324,7 +2353,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--control-vector"}, "FNAME",
         "add a control vector\nnote: use comma-separated values to add multiple control vectors",
         [](common_params & params, const std::string & value) {
-            for (const auto & item : string_split<std::string>(value, ',')) {
+            for (const auto & item : parse_csv_row(value)) {
                 params.control_vectors.push_back({ 1.0f, item, });
             }
         }
@@ -2334,7 +2363,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         "add a control vector with user defined scaling SCALE\n"
         "note: use comma-separated values (format: FNAME:SCALE,...)",
         [](common_params & params, const std::string & value) {
-            for (const auto & item : string_split<std::string>(value, ',')) {
+            for (const auto & item : parse_csv_row(value)) {
                 auto parts = string_split<std::string>(item, ':');
                 if (parts.size() != 2) {
                     throw std::invalid_argument("control-vector-scaled format: FNAME:SCALE");
@@ -2432,7 +2461,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--context-file"}, "FNAME",
         "file to load context from (use comma-separated values to specify multiple files)",
         [](common_params & params, const std::string & value) {
-            for (const auto & item : string_split<std::string>(value, ',')) {
+            for (const auto & item : parse_csv_row(value)) {
                 std::ifstream file(item, std::ios::binary);
                 if (!file) {
                     throw std::runtime_error(string_format("error: failed to open file '%s'\n", item.c_str()));
@@ -2579,7 +2608,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, int value) {
             params.embd_normalize = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_DEBUG}));
     add_opt(common_arg(
         {"--embd-output-format"}, "FORMAT",
         "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix, \"raw\" = plain whitespace-delimited output (one embedding per line)",
@@ -2657,7 +2686,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.embedding = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_DEBUG}).set_env("LLAMA_ARG_EMBEDDINGS"));
     add_opt(common_arg(
         {"--rerank", "--reranking"},
         string_format("enable reranking endpoint on server (default: %s)", "disabled"),
@@ -2668,9 +2697,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_RERANKING"));
     add_opt(common_arg(
         {"--api-key"}, "KEY",
-        "API key to use for authentication (default: none)",
+        "API key to use for authentication, multiple keys can be provided as a comma-separated list (default: none)",
         [](common_params & params, const std::string & value) {
-            params.api_keys.push_back(value);
+            for (const auto & key : parse_csv_row(value)) {
+                if (!key.empty()) {
+                    params.api_keys.push_back(key);
+                }
+            }
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_API_KEY"));
     add_opt(common_arg(
@@ -2684,7 +2717,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             std::string key;
             while (std::getline(key_file, key)) {
                 if (!key.empty()) {
-                    params.api_keys.push_back(key);
+                    params.api_keys.push_back(key);
                 }
             }
             key_file.close();
@@ -2706,7 +2739,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE"));
     add_opt(common_arg(
         {"--chat-template-kwargs"}, "STRING",
-        string_format("sets additional params for the json template parser"),
+        "sets additional params for the json template parser, must be a valid json object string, e.g. '{\"key1\":\"value1\",\"key2\":\"value2\"}'",
        [](common_params & params, const std::string & value) {
             auto parsed = json::parse(value);
             for (const auto & item : parsed.items()) {
@@ -3344,6 +3377,27 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            }
        }
    ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+    add_opt(common_arg(
+        {"--save-logits"},
+        string_format("save final logits to files for verification (default: %s)", params.save_logits ? "true" : "false"),
+        [](common_params & params) {
+            params.save_logits = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_DEBUG}));
+    add_opt(common_arg(
+        {"--logits-output-dir"}, "PATH",
+        string_format("directory for saving logits output files (default: %s)", params.logits_output_dir.c_str()),
+        [](common_params & params, const std::string & value) {
+            params.logits_output_dir = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_DEBUG}));
+    add_opt(common_arg(
+        {"--tensor-filter"}, "REGEX",
+        "filter tensor names for debug output (regex pattern, can be specified multiple times)",
+        [](common_params & params, const std::string & value) {
+            params.tensor_filter.push_back(value);
+        }
+    ).set_examples({LLAMA_EXAMPLE_DEBUG}));
 
     // presets
     add_opt(common_arg(
package/src/llama.cpp/common/chat.cpp CHANGED
@@ -2052,7 +2052,7 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
     // Trigger on tool calls that appear in the commentary channel
     data.grammar_triggers.push_back({
         COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
-        "<\\|channel\\|>(commentary|analysis) to"
+        "<\\|channel\\|>(?:commentary|analysis) to"
     });
 
     // Trigger tool calls that appear in the role section, either at the
@@ -2385,17 +2385,17 @@ static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat
         (inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call));
     // Trigger on some common known "good bad" outputs (only from the start and with a json that's about a specific argument name to avoid false positives)
     data.grammar_triggers.push_back({
-        COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
+        COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
         // If thinking_forced_open, then we capture the </think> tag in the grammar,
         // (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
-        std::string(data.thinking_forced_open ? "[\\s\\S]*?(</think>\\s*)" : "(?:<think>[\\s\\S]*?</think>\\s*)?") + (
+        std::string(data.thinking_forced_open ? "(</think>\\s*)" : "") + (
             "\\s*("
                 "(?:<tool_call>"
                 "|<function"
                 "|(?:```(?:json|xml)?\n\\s*)?(?:<function_call>|<tools>|<xml><json>|<response>)?"
                 "\\s*\\{\\s*\"name\"\\s*:\\s*\"(?:" + string_join(escaped_names, "|") + ")\""
             ")"
-        ")[\\s\\S]*"
+        ")"
         ),
     });
     data.preserved_tokens = {
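
These two trigger hunks appear to pair with the regex-partial.cpp change at the end of this diff: once trigger patterns are applied with anchored search semantics rather than full-string matching, the leading [\s\S]*? and trailing [\s\S]* padding (and the PATTERN_FULL trigger type) become unnecessary.
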
package/src/llama.cpp/common/common.cpp CHANGED
@@ -1086,6 +1086,7 @@ struct common_init_result::impl {
     std::vector<llama_adapter_lora_ptr> lora;
 
     std::vector<common_sampler_ptr> samplers;
+    std::vector<llama_sampler_seq_config> samplers_seq_config;
 };
 
 common_init_result::common_init_result(common_params & params) :
@@ -1162,10 +1163,19 @@ common_init_result::common_init_result(common_params & params) :
     // params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
     //}
 
+    // init the backend samplers as part of the context creation
     pimpl->samplers.resize(cparams.n_seq_max);
+    pimpl->samplers_seq_config.resize(cparams.n_seq_max);
 
     for (int i = 0; i < (int) cparams.n_seq_max; ++i) {
         pimpl->samplers[i].reset(common_sampler_init(model, params.sampling));
+        pimpl->samplers_seq_config[i] = { i, common_sampler_get(pimpl->samplers[i].get()) };
+    }
+
+    // TODO: temporarily gated behind a flag
+    if (params.sampling.backend_sampling) {
+        cparams.samplers = pimpl->samplers_seq_config.data();
+        cparams.n_samplers = pimpl->samplers_seq_config.size();
     }
 
     llama_context * lctx = llama_init_from_model(model, cparams);
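
Taken together with the -bs / --backend-sampling flag registered in arg.cpp above, this hunk is the whole opt-in path: the flag only flips params.sampling.backend_sampling, and the constructor here hands one sampler chain per sequence to the context. A hedged standalone sketch of the same wiring (llama_sampler_seq_config is assumed to pair {seq_id, sampler}, as the aggregate initializer above suggests; chain construction elided):

    // hypothetical wiring, mirroring the hunk above
    const int32_t n_seq_max = 4;                        // assumption: 4 parallel sequences
    std::vector<llama_sampler *> chains(n_seq_max);     // assume: filled via llama_sampler_chain_init/_add
    std::vector<llama_sampler_seq_config> cfgs(n_seq_max);
    for (int32_t s = 0; s < n_seq_max; ++s) {
        cfgs[s] = { s, chains[s] };                     // seq_id -> its sampler chain
    }
    llama_context_params cparams = llama_context_default_params();
    cparams.samplers   = cfgs.data();                   // sampling now runs as part of the backend graph
    cparams.n_samplers = cfgs.size();                   // one config per sequence

Note that common.cpp keeps the configs in the pimpl alongside the samplers themselves, which also keeps them alive for as long as the context may reference them.
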
@@ -1189,6 +1199,12 @@ common_sampler * common_init_result::sampler(llama_seq_id seq_id) {
     return pimpl->samplers[seq_id].get();
 }
 
+void common_init_result::reset_samplers() {
+    for (int i = 0; i < (int) pimpl->samplers.size(); ++i) {
+        llama_sampler_reset(common_sampler_get(pimpl->samplers[i].get()));
+    }
+}
+
 std::vector<llama_adapter_lora_ptr> & common_init_result::lora() {
     return pimpl->lora;
 }
@@ -1304,6 +1320,9 @@ common_init_result_ptr common_init_from_params(common_params & params) {
         llama_synchronize(lctx);
         llama_perf_context_reset(lctx);
         llama_set_warmup(lctx, false);
+
+        // reset samplers to reset RNG state after warmup to the seeded state
+        res->reset_samplers();
     }
 
     return res;
package/src/llama.cpp/common/common.h CHANGED
@@ -80,6 +80,7 @@ int32_t cpu_get_num_math();
 //
 
 enum llama_example {
+    LLAMA_EXAMPLE_DEBUG,
     LLAMA_EXAMPLE_COMMON,
     LLAMA_EXAMPLE_SPECULATIVE,
     LLAMA_EXAMPLE_COMPLETION,
@@ -216,6 +217,8 @@ struct common_params_sampling {
     std::vector<llama_logit_bias> logit_bias;     // logit biases to apply
     std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens
 
+    bool backend_sampling = false;
+
     bool has_logit_bias() const {
         return !logit_bias.empty();
     }
@@ -371,6 +374,11 @@ struct common_params {
     std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
     std::string logits_file = "";          // file for saving *all* logits // NOLINT
 
+    // llama-debug specific options
+    std::string logits_output_dir = "data"; // directory for saving logits output files // NOLINT
+    bool save_logits = false;               // whether to save logits to files // NOLINT
+    std::vector<std::string> tensor_filter; // filter tensor names for debug output (regex) // NOLINT
+
     std::vector<std::string> in_files;   // all input files
     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
     std::vector<llama_model_kv_override> kv_overrides;
@@ -690,7 +698,9 @@ struct common_init_result {
 
     llama_model * model();
     llama_context * context();
+
     common_sampler * sampler(llama_seq_id seq_id);
+    void reset_samplers();
 
     std::vector<llama_adapter_lora_ptr> & lora();
 
package/src/llama.cpp/common/llguidance.cpp CHANGED
@@ -106,12 +106,16 @@ static void llama_sampler_llg_free(llama_sampler * smpl) {
 }
 
 static llama_sampler_i llama_sampler_llg_i = {
-    /* .name   = */ llama_sampler_llg_name,
-    /* .accept = */ llama_sampler_llg_accept_impl,
-    /* .apply  = */ llama_sampler_llg_apply,
-    /* .reset  = */ llama_sampler_llg_reset,
-    /* .clone  = */ llama_sampler_llg_clone,
-    /* .free   = */ llama_sampler_llg_free,
+    /* .name              = */ llama_sampler_llg_name,
+    /* .accept            = */ llama_sampler_llg_accept_impl,
+    /* .apply             = */ llama_sampler_llg_apply,
+    /* .reset             = */ llama_sampler_llg_reset,
+    /* .clone             = */ llama_sampler_llg_clone,
+    /* .free              = */ llama_sampler_llg_free,
+    /* .backend_init      = */ NULL,
+    /* .backend_accept    = */ NULL,
+    /* .backend_apply     = */ NULL,
+    /* .backend_set_input = */ NULL,
 };
 
 static size_t llama_sampler_llg_tokenize_fn(const void * user_data, const uint8_t * bytes, size_t bytes_len,
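
The visible takeaway: llama_sampler_i gains four optional backend hooks, and samplers that only operate on CPU logits leave them NULL. A hedged sketch of what any out-of-tree custom sampler would now look like (field order taken from the hunk above; the exact callback signatures live in the updated llama.h, which this diff view does not show):

    // hypothetical custom sampler vtable after this change
    static llama_sampler_i my_sampler_i = {
        /* .name              = */ my_sampler_name,
        /* .accept            = */ my_sampler_accept,
        /* .apply             = */ my_sampler_apply,
        /* .reset             = */ my_sampler_reset,
        /* .clone             = */ my_sampler_clone,
        /* .free              = */ my_sampler_free,
        /* .backend_init      = */ NULL, // no graph-side initialization
        /* .backend_accept    = */ NULL, // no on-device token accounting
        /* .backend_apply     = */ NULL, // keep applying to CPU logits
        /* .backend_set_input = */ NULL, // nothing to upload per batch
    };
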
package/src/llama.cpp/common/regex-partial.cpp CHANGED
@@ -27,7 +27,7 @@ common_regex_match common_regex::search(const std::string & input, size_t pos, b
         return res;
     }
     std::match_results<std::string::const_reverse_iterator> srmatch;
-    if (std::regex_match(input.rbegin(), input.rend() - pos, srmatch, rx_reversed_partial)) {
+    if (std::regex_search(input.rbegin(), input.rend() - pos, srmatch, rx_reversed_partial, std::regex_constants::match_continuous)) {
         auto group = srmatch[1].str();
         if (group.length() != 0) {
             auto it = srmatch[1].second.base();
@@ -55,18 +55,18 @@ common_regex_match common_regex::search(const std::string & input, size_t pos, b
   to see if a string ends with a partial regex match, but it's not in std::regex yet.
   Instead, we'll transform the regex into a partial match regex operating as a full match on the reverse iterators of the input.
 
-  - /abcd/ -> (dcba|cba|ba|a).* -> ((?:(?:(?:(?:d)?c)?b)?a).*
-  - /a|b/ -> (a|b).*
+  - /abcd/ -> ^(dcba|cba|ba|a) -> ^((?:(?:(?:(?:d)?c)?b)?a)
+  - /a|b/ -> ^(a|b)
   - /a*?/ -> error, could match ""
-  - /a*b/ -> ((?:b)?a*+).* (final repetitions become eager)
-  - /.*?ab/ -> ((?:b)?a).* (merge .*)
-  - /a.*?b/ -> ((?:b)?.*?a).* (keep reluctant matches)
-  - /a(bc)d/ -> ((?:(?:d)?(?:(?:c)?b))?a).*
-  - /a(bc|de)/ -> ((?:(?:(?:e)?d)?|(?:(?:c)?b)?)?a).*
-  - /ab{2,4}c/ -> abbb?b?c -> ((?:(?:(?:(?:(?:c)?b)?b)?b?)?b?)?a).*
+  - /a*b/ -> ^((?:b)?a*+) (final repetitions become eager)
+  - /.*?ab/ -> ^((?:b)?a) (omit .*)
+  - /a.*?b/ -> ^((?:b)?.*?a) (keep reluctant matches)
+  - /a(bc)d/ -> ^((?:(?:d)?(?:(?:c)?b))?a)
+  - /a(bc|de)/ -> ^((?:(?:(?:e)?d)?|(?:(?:c)?b)?)?a)
+  - /ab{2,4}c/ -> ^cbbb?b?a -> ^((?:(?:(?:(?:(?:c)?b)?b)?b?)?b?)?a)
 
-  The regex will match a reversed string fully, and the end of the first (And only) capturing group will indicate the reversed start of the original partial pattern
-  (i.e. just where the final .* starts in the inverted pattern; all other groups are turned into non-capturing groups, and reluctant quantifiers are ignored)
+  The regex will match a reversed string fully, and the end of the first (And only) capturing group will indicate the reversed start of the original partial pattern.
+  All other groups are turned into non-capturing groups, and reluctant quantifiers are ignored.
 */
 std::string regex_to_reversed_partial_regex(const std::string & pattern) {
     auto it = pattern.begin();
@@ -177,7 +177,7 @@ std::string regex_to_reversed_partial_regex(const std::string & pattern) {
         }
     }
 
-    // /abcd/ -> (dcba|cba|ba|a).* -> ((?:(?:(?:d)?c)?b)?a).*
+    // /abcd/ -> ^(dcba|cba|ba|a) -> ^((?:(?:(?:d)?c)?b)?a)
     // if n(=4) parts, opening n-1(=3) non-capturing groups after the 1 capturing group
     // We'll do the outermost capturing group and final .* in the enclosing function.
     std::vector<std::string> res_alts;
@@ -200,5 +200,5 @@ std::string regex_to_reversed_partial_regex(const std::string & pattern) {
         throw std::runtime_error("Unmatched '(' in pattern");
     }
 
-    return "(" + res + ")[\\s\\S]*";
+    return "^(" + res + ")";
 }
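
To see why the anchored form is equivalent to the old (...)[\s\S]* full-match trick, here is a small self-contained check of the idea (a standalone sketch, not the diff's common_regex class; the pattern is the hand-built reversed-partial form of /abcd/ from the comment above):

    #include <cassert>
    #include <regex>
    #include <string>

    int main() {
        // ^((?:(?:(?:d)?c)?b)?a) is the reversed-partial pattern for /abcd/
        const std::regex rx("^((?:(?:(?:d)?c)?b)?a)");
        const std::string input = "xyzab";                   // ends with a prefix of "abcd"
        const std::string rev(input.rbegin(), input.rend()); // "bazyx"
        std::smatch m;
        // match_continuous pins the search to the start of the reversed string,
        // i.e. to the end of the original input, replacing the old trailing [\s\S]*
        const bool ok = std::regex_search(rev, m, rx, std::regex_constants::match_continuous);
        assert(ok && m[1].str() == "ba");                    // "ab" reversed: partial match found
        return 0;
    }
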