@fugood/llama.node 1.4.11 → 1.4.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. package/package.json +15 -15
  2. package/scripts/llama.cpp.patch +31 -31
  3. package/src/llama.cpp/common/arg.cpp +128 -59
  4. package/src/llama.cpp/common/arg.h +1 -0
  5. package/src/llama.cpp/common/chat-parser.cpp +11 -0
  6. package/src/llama.cpp/common/chat.cpp +36 -7
  7. package/src/llama.cpp/common/chat.h +1 -0
  8. package/src/llama.cpp/common/common.cpp +42 -23
  9. package/src/llama.cpp/common/common.h +11 -1
  10. package/src/llama.cpp/common/llguidance.cpp +10 -6
  11. package/src/llama.cpp/common/regex-partial.cpp +13 -13
  12. package/src/llama.cpp/common/sampling.cpp +58 -14
  13. package/src/llama.cpp/common/sampling.h +3 -1
  14. package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
  15. package/src/llama.cpp/ggml/include/ggml-backend.h +1 -1
  16. package/src/llama.cpp/ggml/src/CMakeLists.txt +23 -9
  17. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +12 -2
  18. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -1
  19. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +86 -25
  20. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +15 -8
  21. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +768 -0
  22. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +0 -4
  23. package/src/llama.cpp/include/llama.h +100 -12
  24. package/src/llama.cpp/src/CMakeLists.txt +4 -0
  25. package/src/llama.cpp/src/llama-adapter.cpp +12 -3
  26. package/src/llama.cpp/src/llama-adapter.h +7 -1
  27. package/src/llama.cpp/src/llama-arch.cpp +78 -0
  28. package/src/llama.cpp/src/llama-arch.h +8 -0
  29. package/src/llama.cpp/src/llama-chat.cpp +11 -0
  30. package/src/llama.cpp/src/llama-chat.h +1 -0
  31. package/src/llama.cpp/src/llama-context.cpp +637 -49
  32. package/src/llama.cpp/src/llama-context.h +43 -1
  33. package/src/llama.cpp/src/llama-grammar.cpp +40 -13
  34. package/src/llama.cpp/src/llama-grammar.h +2 -0
  35. package/src/llama.cpp/src/llama-graph.cpp +173 -5
  36. package/src/llama.cpp/src/llama-graph.h +71 -6
  37. package/src/llama.cpp/src/llama-hparams.cpp +4 -0
  38. package/src/llama.cpp/src/llama-hparams.h +12 -5
  39. package/src/llama.cpp/src/llama-kv-cache.h +1 -1
  40. package/src/llama.cpp/src/llama-mmap.cpp +11 -4
  41. package/src/llama.cpp/src/llama-model-loader.cpp +23 -0
  42. package/src/llama.cpp/src/llama-model-loader.h +2 -0
  43. package/src/llama.cpp/src/llama-model-saver.cpp +3 -0
  44. package/src/llama.cpp/src/llama-model.cpp +337 -26
  45. package/src/llama.cpp/src/llama-model.h +13 -2
  46. package/src/llama.cpp/src/llama-sampling.cpp +1259 -186
  47. package/src/llama.cpp/src/llama-sampling.h +19 -7
  48. package/src/llama.cpp/src/llama-vocab.cpp +101 -33
  49. package/src/llama.cpp/src/llama-vocab.h +2 -0
  50. package/src/llama.cpp/src/llama.cpp +87 -64
  51. package/src/llama.cpp/src/models/afmoe.cpp +9 -5
  52. package/src/llama.cpp/src/models/bert.cpp +4 -2
  53. package/src/llama.cpp/src/models/cogvlm.cpp +5 -3
  54. package/src/llama.cpp/src/models/cohere2-iswa.cpp +3 -0
  55. package/src/llama.cpp/src/models/deepseek2.cpp +1 -1
  56. package/src/llama.cpp/src/models/gemma-embedding.cpp +2 -6
  57. package/src/llama.cpp/src/models/gemma2-iswa.cpp +5 -2
  58. package/src/llama.cpp/src/models/gemma3.cpp +3 -4
  59. package/src/llama.cpp/src/models/gemma3n-iswa.cpp +4 -7
  60. package/src/llama.cpp/src/models/llama-iswa.cpp +6 -2
  61. package/src/llama.cpp/src/models/llama.cpp +19 -6
  62. package/src/llama.cpp/src/models/maincoder.cpp +117 -0
  63. package/src/llama.cpp/src/models/mimo2-iswa.cpp +123 -0
  64. package/src/llama.cpp/src/models/models.h +18 -0
  65. package/src/llama.cpp/src/models/modern-bert.cpp +116 -0
  66. package/src/llama.cpp/src/models/openai-moe-iswa.cpp +5 -2
  67. package/src/llama.cpp/src/models/plamo3.cpp +128 -0
  68. package/src/llama.cpp/src/models/smallthinker.cpp +11 -5
  69. package/src/llama.cpp/src/unicode.cpp +23 -14
package/src/llama.cpp/common/chat.cpp

@@ -306,7 +306,7 @@ json common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msg
             }
         }
     } else {
-        jmsg["content"] = json(); // null
+        jmsg["content"] = "";
     }
     if (!msg.reasoning_content.empty()) {
         jmsg["reasoning_content"] = msg.reasoning_content;

@@ -367,8 +367,8 @@ std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const json & too
         const auto & function = tool.at("function");
         result.push_back({
             /* .name = */ function.at("name"),
-            /* .description = */ function.at("description"),
-            /* .parameters = */ function.at("parameters").dump(),
+            /* .description = */ function.value("description", ""),
+            /* .parameters = */ function.value("parameters", json::object()).dump(),
         });
     }
 }

@@ -656,6 +656,7 @@ const char * common_chat_format_name(common_chat_format format) {
         case COMMON_CHAT_FORMAT_QWEN3_CODER_XML: return "Qwen3 Coder";
         case COMMON_CHAT_FORMAT_APRIEL_1_5: return "Apriel 1.5";
         case COMMON_CHAT_FORMAT_XIAOMI_MIMO: return "Xiaomi MiMo";
+        case COMMON_CHAT_FORMAT_SOLAR_OPEN: return "Solar Open";
         case COMMON_CHAT_FORMAT_PEG_SIMPLE: return "peg-simple";
         case COMMON_CHAT_FORMAT_PEG_NATIVE: return "peg-native";
         case COMMON_CHAT_FORMAT_PEG_CONSTRUCTED: return "peg-constructed";

@@ -2051,7 +2052,7 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
     // Trigger on tool calls that appear in the commentary channel
     data.grammar_triggers.push_back({
         COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
-        "<\\|channel\\|>(commentary|analysis) to"
+        "<\\|channel\\|>(?:commentary|analysis) to"
     });

     // Trigger tool calls that appear in the role section, either at the

@@ -2384,17 +2385,17 @@ static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat
         (inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call));
     // Trigger on some common known "good bad" outputs (only from the start and with a json that's about a specific argument name to avoid false positives)
     data.grammar_triggers.push_back({
-        COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
+        COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
         // If thinking_forced_open, then we capture the </think> tag in the grammar,
         // (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
-        std::string(data.thinking_forced_open ? "[\\s\\S]*?(</think>\\s*)" : "(?:<think>[\\s\\S]*?</think>\\s*)?") + (
+        std::string(data.thinking_forced_open ? "(</think>\\s*)" : "") + (
            "\\s*("
            "(?:<tool_call>"
            "|<function"
            "|(?:```(?:json|xml)?\n\\s*)?(?:<function_call>|<tools>|<xml><json>|<response>)?"
            "\\s*\\{\\s*\"name\"\\s*:\\s*\"(?:" + string_join(escaped_names, "|") + ")\""
            ")"
-        ")[\\s\\S]*"
+        ")"
        ),
     });
     data.preserved_tokens = {

@@ -2504,6 +2505,27 @@ static common_chat_params common_chat_params_init_granite(const common_chat_temp
     return data;
 }

+static common_chat_params common_chat_params_init_solar_open(const common_chat_template & tmpl, const struct templates_params & inputs) {
+    common_chat_params data;
+
+    // TODO: Reasoning effort
+    json additional_context = {};
+
+    data.prompt = apply(tmpl, inputs, std::nullopt, std::nullopt, additional_context);
+    data.format = COMMON_CHAT_FORMAT_SOLAR_OPEN;
+
+    data.preserved_tokens = {
+        "<|think|>",
+        "<|content|>",
+        "<|begin|>",
+        "<|end|>",
+    };
+
+    // TODO: Tool calling
+
+    return data;
+}
+
 static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
     common_chat_params data;
     data.prompt = apply(tmpl, inputs);

@@ -2767,6 +2789,13 @@ static common_chat_params common_chat_templates_apply_jinja(
         return common_chat_params_init_magistral(tmpl, params);
     }

+    // Solar Open
+    if (src.find("<|tool_response:begin|>") != std::string::npos &&
+        src.find("<|tool_response:name|>") != std::string::npos &&
+        src.find("<|tool_response:result|>") != std::string::npos) {
+        return common_chat_params_init_solar_open(tmpl, params);
+    }
+
     // Plain handler (no tools)
     if (params.tools.is_null() || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
         return common_chat_params_init_without_tools(tmpl, params);
package/src/llama.cpp/common/chat.h

@@ -135,6 +135,7 @@ enum common_chat_format {
     COMMON_CHAT_FORMAT_QWEN3_CODER_XML,
     COMMON_CHAT_FORMAT_APRIEL_1_5,
     COMMON_CHAT_FORMAT_XIAOMI_MIMO,
+    COMMON_CHAT_FORMAT_SOLAR_OPEN,

     // These are intended to be parsed by the PEG parser
     COMMON_CHAT_FORMAT_PEG_SIMPLE,
package/src/llama.cpp/common/common.cpp

@@ -251,7 +251,7 @@ bool set_process_priority(enum ggml_sched_priority prio) {
         case GGML_SCHED_PRIO_REALTIME: p = -20; break;
     }

-    if (!setpriority(PRIO_PROCESS, 0, p)) {
+    if (setpriority(PRIO_PROCESS, 0, p) != 0) {
         LOG_WRN("failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno);
         return false;
     }

@@ -1078,12 +1078,15 @@ struct common_init_result::impl {
     impl() = default;
     ~impl() = default;

+    // note: the order in which model, context, etc. are declared matters because their destructors will be called bottom-to-top
+
     llama_model_ptr model;
     llama_context_ptr context;

     std::vector<llama_adapter_lora_ptr> lora;

     std::vector<common_sampler_ptr> samplers;
+    std::vector<llama_sampler_seq_config> samplers_seq_config;
 };

 common_init_result::common_init_result(common_params & params) :

@@ -1107,6 +1110,25 @@ common_init_result::common_init_result(common_params & params) :

     const llama_vocab * vocab = llama_model_get_vocab(model);

+    // load and optionally apply lora adapters (must be loaded before context creation)
+    for (auto & la : params.lora_adapters) {
+        llama_adapter_lora_ptr lora;
+        lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
+        if (lora == nullptr) {
+            LOG_ERR("%s: failed to load lora adapter '%s'\n", __func__, la.path.c_str());
+            pimpl->model.reset(model);
+            return;
+        }
+
+        char buf[1024];
+        la.ptr = lora.get();
+        llama_adapter_meta_val_str(la.ptr, "adapter.lora.task_name", buf, sizeof(buf));
+        la.task_name = buf;
+        llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf));
+        la.prompt_prefix = buf;
+        pimpl->lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
+    }
+
     // updates params.sampling
     // TODO: fix naming
     common_init_sampler_from_model(model, params.sampling);

@@ -1141,10 +1163,19 @@
     // params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
     //}

+    // init the backend samplers as part of the context creation
     pimpl->samplers.resize(cparams.n_seq_max);
+    pimpl->samplers_seq_config.resize(cparams.n_seq_max);

     for (int i = 0; i < (int) cparams.n_seq_max; ++i) {
         pimpl->samplers[i].reset(common_sampler_init(model, params.sampling));
+        pimpl->samplers_seq_config[i] = { i, common_sampler_get(pimpl->samplers[i].get()) };
+    }
+
+    // TODO: temporarily gated behind a flag
+    if (params.sampling.backend_sampling) {
+        cparams.samplers   = pimpl->samplers_seq_config.data();
+        cparams.n_samplers = pimpl->samplers_seq_config.size();
     }

     llama_context * lctx = llama_init_from_model(model, cparams);

@@ -1168,6 +1199,12 @@ common_sampler * common_init_result::sampler(llama_seq_id seq_id) {
     return pimpl->samplers[seq_id].get();
 }

+void common_init_result::reset_samplers() {
+    for (int i = 0; i < (int) pimpl->samplers.size(); ++i) {
+        llama_sampler_reset(common_sampler_get(pimpl->samplers[i].get()));
+    }
+}
+
 std::vector<llama_adapter_lora_ptr> & common_init_result::lora() {
     return pimpl->lora;
 }

@@ -1243,24 +1280,6 @@ common_init_result_ptr common_init_from_params(common_params & params) {
         }
     }

-    // load and optionally apply lora adapters
-    for (auto & la : params.lora_adapters) {
-        llama_adapter_lora_ptr lora;
-        lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
-        if (lora == nullptr) {
-            LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
-            return res;
-        }
-
-        char buf[1024];
-        la.ptr = lora.get();
-        llama_adapter_meta_val_str(la.ptr, "adapter.lora.task_name", buf, sizeof(buf));
-        la.task_name = buf;
-        llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf));
-        la.prompt_prefix = buf;
-        res->lora().emplace_back(std::move(lora)); // copy to list of loaded adapters
-    }
-
     if (!params.lora_init_without_apply) {
         common_set_adapter_lora(lctx, params.lora_adapters);
     }

@@ -1301,6 +1320,9 @@ common_init_result_ptr common_init_from_params(common_params & params) {
         llama_synchronize(lctx);
         llama_perf_context_reset(lctx);
         llama_set_warmup(lctx, false);
+
+        // reset samplers to reset RNG state after warmup to the seeded state
+        res->reset_samplers();
     }

     return res;

@@ -1339,11 +1361,8 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
         mparams.devices = params.devices.data();
     }

-    if (params.n_gpu_layers != -1) {
-        mparams.n_gpu_layers = params.n_gpu_layers;
-    }
-
     mparams.vocab_only = params.vocab_only;
+    mparams.n_gpu_layers = params.n_gpu_layers;
     mparams.main_gpu = params.main_gpu;
     mparams.split_mode = params.split_mode;
     mparams.tensor_split = params.tensor_split;
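
How an embedder would opt in: the new backend_sampling flag is read during context creation in the common.cpp hunks above (it is declared in the common.h hunk below). A minimal sketch, assuming the usual common_params_parse entry point from common/arg.h; only the flag itself and common_init_from_params come from this diff, the rest is illustrative:

    #include "arg.h"     // common_params_parse (assumed to be the usual CLI entry point)
    #include "common.h"  // common_params, common_init_from_params

    int main(int argc, char ** argv) {
        common_params params;
        if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
            return 1;
        }

        // New flag from this diff; default is false and the TODO above notes it is
        // temporarily gated, so treat this as experimental.
        params.sampling.backend_sampling = true;

        // common_init_from_params() now wires one llama_sampler_seq_config per sequence
        // into cparams.samplers / cparams.n_samplers, so sampling can run on the backend.
        common_init_result_ptr init = common_init_from_params(params);
        return init == nullptr ? 1 : 0;
    }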
package/src/llama.cpp/common/common.h

@@ -80,6 +80,7 @@ int32_t cpu_get_num_math();
 //

 enum llama_example {
+    LLAMA_EXAMPLE_DEBUG,
     LLAMA_EXAMPLE_COMMON,
     LLAMA_EXAMPLE_SPECULATIVE,
     LLAMA_EXAMPLE_COMPLETION,

@@ -216,6 +217,8 @@ struct common_params_sampling {
     std::vector<llama_logit_bias> logit_bias;     // logit biases to apply
     std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens

+    bool backend_sampling = false;
+
     bool has_logit_bias() const {
         return !logit_bias.empty();
     }

@@ -330,7 +333,7 @@ struct common_params {
     // offload params
     std::vector<ggml_backend_dev_t> devices; // devices to use for offloading

-    int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
+    int32_t n_gpu_layers = -1; // number of layers to store in VRAM, -1 is auto, <= -2 is all
     int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
     float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
     bool fit_params = true; // whether to fit unset model/context parameters to free device memory

@@ -371,6 +374,11 @@ struct common_params {
     std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
     std::string logits_file = ""; // file for saving *all* logits // NOLINT

+    // llama-debug specific options
+    std::string logits_output_dir = "data"; // directory for saving logits output files // NOLINT
+    bool save_logits = false; // whether to save logits to files // NOLINT
+    std::vector<std::string> tensor_filter; // filter tensor names for debug output (regex) // NOLINT
+
     std::vector<std::string> in_files; // all input files
     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
     std::vector<llama_model_kv_override> kv_overrides;

@@ -690,7 +698,9 @@ struct common_init_result {

     llama_model * model();
     llama_context * context();
+
     common_sampler * sampler(llama_seq_id seq_id);
+    void reset_samplers();

     std::vector<llama_adapter_lora_ptr> & lora();

package/src/llama.cpp/common/llguidance.cpp

@@ -106,12 +106,16 @@ static void llama_sampler_llg_free(llama_sampler * smpl) {
 }

 static llama_sampler_i llama_sampler_llg_i = {
-    /* .name   = */ llama_sampler_llg_name,
-    /* .accept = */ llama_sampler_llg_accept_impl,
-    /* .apply  = */ llama_sampler_llg_apply,
-    /* .reset  = */ llama_sampler_llg_reset,
-    /* .clone  = */ llama_sampler_llg_clone,
-    /* .free   = */ llama_sampler_llg_free,
+    /* .name              = */ llama_sampler_llg_name,
+    /* .accept            = */ llama_sampler_llg_accept_impl,
+    /* .apply             = */ llama_sampler_llg_apply,
+    /* .reset             = */ llama_sampler_llg_reset,
+    /* .clone             = */ llama_sampler_llg_clone,
+    /* .free              = */ llama_sampler_llg_free,
+    /* .backend_init      = */ NULL,
+    /* .backend_accept    = */ NULL,
+    /* .backend_apply     = */ NULL,
+    /* .backend_set_input = */ NULL,
 };

 static size_t llama_sampler_llg_tokenize_fn(const void * user_data, const uint8_t * bytes, size_t bytes_len,
package/src/llama.cpp/common/regex-partial.cpp

@@ -27,7 +27,7 @@ common_regex_match common_regex::search(const std::string & input, size_t pos, b
         return res;
     }
     std::match_results<std::string::const_reverse_iterator> srmatch;
-    if (std::regex_match(input.rbegin(), input.rend() - pos, srmatch, rx_reversed_partial)) {
+    if (std::regex_search(input.rbegin(), input.rend() - pos, srmatch, rx_reversed_partial, std::regex_constants::match_continuous)) {
         auto group = srmatch[1].str();
         if (group.length() != 0) {
             auto it = srmatch[1].second.base();

@@ -55,18 +55,18 @@ common_regex_match common_regex::search(const std::string & input, size_t pos, b
     to see if a string ends with a partial regex match, but but it's not in std::regex yet.
     Instead, we'll the regex into a partial match regex operating as a full match on the reverse iterators of the input.

-    - /abcd/ -> (dcba|cba|ba|a).* -> ((?:(?:(?:(?:d)?c)?b)?a).*
-    - /a|b/ -> (a|b).*
+    - /abcd/ -> ^(dcba|cba|ba|a) -> ^((?:(?:(?:(?:d)?c)?b)?a)
+    - /a|b/ -> ^(a|b)
     - /a*?/ -> error, could match ""
-    - /a*b/ -> ((?:b)?a*+).* (final repetitions become eager)
-    - /.*?ab/ -> ((?:b)?a).* (merge .*)
-    - /a.*?b/ -> ((?:b)?.*?a).* (keep reluctant matches)
-    - /a(bc)d/ -> ((?:(?:d)?(?:(?:c)?b))?a).*
-    - /a(bc|de)/ -> ((?:(?:(?:e)?d)?|(?:(?:c)?b)?)?a).*
-    - /ab{2,4}c/ -> abbb?b?c -> ((?:(?:(?:(?:(?:c)?b)?b)?b?)?b?)?a).*
+    - /a*b/ -> ^((?:b)?a*+) (final repetitions become eager)
+    - /.*?ab/ -> ^((?:b)?a) (omit .*)
+    - /a.*?b/ -> ^((?:b)?.*?a) (keep reluctant matches)
+    - /a(bc)d/ -> ^((?:(?:d)?(?:(?:c)?b))?a)
+    - /a(bc|de)/ -> ^((?:(?:(?:e)?d)?|(?:(?:c)?b)?)?a)
+    - /ab{2,4}c/ -> ^cbbb?b?a -> ^((?:(?:(?:(?:(?:c)?b)?b)?b?)?b?)?a)

-    The regex will match a reversed string fully, and the end of the first (And only) capturing group will indicate the reversed start of the original partial pattern
-    (i.e. just where the final .* starts in the inverted pattern; all other groups are turned into non-capturing groups, and reluctant quantifiers are ignored)
+    The regex will match a reversed string fully, and the end of the first (And only) capturing group will indicate the reversed start of the original partial pattern.
+    All other groups are turned into non-capturing groups, and reluctant quantifiers are ignored.
 */
 std::string regex_to_reversed_partial_regex(const std::string & pattern) {
     auto it = pattern.begin();

@@ -177,7 +177,7 @@ std::string regex_to_reversed_partial_regex(const std::string & pattern) {
         }
     }

-    // /abcd/ -> (dcba|cba|ba|a).* -> ((?:(?:(?:d)?c)?b)?a).*
+    // /abcd/ -> ^(dcba|cba|ba|a) -> ^((?:(?:(?:d)?c)?b)?a)
     // if n(=4) parts, opening n-1(=3) non-capturing groups after the 1 capturing group
     // We'll do the outermost capturing group and final .* in the enclosing function.
     std::vector<std::string> res_alts;

@@ -200,5 +200,5 @@ std::string regex_to_reversed_partial_regex(const std::string & pattern) {
         throw std::runtime_error("Unmatched '(' in pattern");
     }

-    return "(" + res + ")[\\s\\S]*";
+    return "^(" + res + ")";
 }
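
The effect of the regex-partial change is easier to see in isolation. Below is a small standalone sketch (my own illustration, not code from the package) that matches the reversed-partial pattern for /abcd/ listed in the comment above against a reversed input, using the same std::regex_search + match_continuous combination the patched common_regex::search now uses:

    #include <iostream>
    #include <regex>
    #include <string>

    int main() {
        // reversed-partial pattern for /abcd/ from the comment above: ^((?:(?:(?:d)?c)?b)?a)
        const std::regex rx_reversed_partial("^((?:(?:(?:d)?c)?b)?a)");
        const std::string input = "hello ab"; // ends with a prefix of "abcd"

        std::match_results<std::string::const_reverse_iterator> m;
        if (std::regex_search(input.rbegin(), input.rend(), m, rx_reversed_partial,
                              std::regex_constants::match_continuous)) {
            // the end of group 1 (in reversed coordinates) marks where the partial match starts
            const size_t start = input.size() - (m.position(1) + m.length(1));
            std::cout << "partial match of /abcd/ starts at offset " << start << "\n"; // prints 6
        }
        return 0;
    }

Anchoring the generated pattern with ^ and dropping the trailing [\s\S]* means the engine no longer has to consume the rest of the reversed input just to declare a match; match_continuous at the call site expresses the same "must start here" requirement.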
package/src/llama.cpp/common/sampling.cpp

@@ -120,17 +120,34 @@ struct common_sampler {
     }

     void set_logits(struct llama_context * ctx, int idx) {
-        const auto * logits = llama_get_logits_ith(ctx, idx);
+        const float       * sampled_probs  = llama_get_sampled_probs_ith     (ctx, idx);
+        const float       * sampled_logits = llama_get_sampled_logits_ith    (ctx, idx);
+        const llama_token * sampled_ids    = llama_get_sampled_candidates_ith(ctx, idx);

         const llama_model * model = llama_get_model(ctx);
         const llama_vocab * vocab = llama_model_get_vocab(model);

         const int n_vocab = llama_vocab_n_tokens(vocab);

-        cur.resize(n_vocab);
-
-        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-            cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
+        if (sampled_probs) {
+            const uint32_t sampled_probs_count = llama_get_sampled_probs_count_ith(ctx, idx);
+            cur.resize(sampled_probs_count);
+            for (uint32_t i = 0; i < sampled_probs_count; ++i) {
+                cur[i] = llama_token_data{sampled_ids[i], sampled_logits[i], sampled_probs[i]};
+            }
+        } else if (sampled_logits) {
+            const uint32_t sampled_logits_count = llama_get_sampled_logits_count_ith(ctx, idx);
+            cur.resize(sampled_logits_count);
+            for (uint32_t i = 0; i < sampled_logits_count; i++) {
+                cur[i] = llama_token_data{sampled_ids[i], sampled_logits[i], 0.0f};
+            }
+        } else {
+            const auto * logits = llama_get_logits_ith(ctx, idx);
+            GGML_ASSERT(logits != nullptr);
+            cur.resize(n_vocab);
+            for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+                cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
+            }
         }

         cur_p = { cur.data(), cur.size(), -1, false };

@@ -159,7 +176,7 @@ std::string common_params_sampling::print() const {
     return std::string(result);
 }

-struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params) {
+struct common_sampler * common_sampler_init(const struct llama_model * model, struct common_params_sampling & params) {
     const llama_vocab * vocab = llama_model_get_vocab(model);

     llama_sampler_chain_params lparams = llama_sampler_chain_default_params();

@@ -179,24 +196,30 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
 #endif // LLAMA_USE_LLGUIDANCE
     } else {
         std::vector<std::string> trigger_patterns;
-        std::vector<std::string> patterns_anywhere;
         std::vector<llama_token> trigger_tokens;
         for (const auto & trigger : params.grammar_triggers) {
             switch (trigger.type) {
                 case COMMON_GRAMMAR_TRIGGER_TYPE_WORD:
                 {
                     const auto & word = trigger.value;
-                    patterns_anywhere.push_back(regex_escape(word));
+                    trigger_patterns.push_back(regex_escape(word));
                     break;
                 }
                 case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN:
                 {
-                    patterns_anywhere.push_back(trigger.value);
+                    trigger_patterns.push_back(trigger.value);
                     break;
                 }
                 case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL:
                 {
-                    trigger_patterns.push_back(trigger.value);
+                    const auto & pattern = trigger.value;
+                    std::string anchored = "^$";
+                    if (!pattern.empty()) {
+                        anchored = (pattern.front() != '^' ? "^" : "")
+                            + pattern
+                            + (pattern.back() != '$' ? "$" : "");
+                    }
+                    trigger_patterns.push_back(anchored);
                     break;
                 }
                 case COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN:

@@ -210,10 +233,6 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
             }
         }

-        if (!patterns_anywhere.empty()) {
-            trigger_patterns.push_back("^[\\s\\S]*?(" + string_join(patterns_anywhere, "|") + ")[\\s\\S]*");
-        }
-
         std::vector<const char *> trigger_patterns_c;
         trigger_patterns_c.reserve(trigger_patterns.size());
         for (const auto & regex : trigger_patterns) {

@@ -296,6 +315,12 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
         llama_sampler_chain_add(chain, smpl);
     }

+    if (grmr && params.backend_sampling) {
+        LOG_WRN("%s: backend sampling is not compatible with grammar, disabling\n", __func__);
+
+        params.backend_sampling = false;
+    }
+
     auto * result = new common_sampler {
         /* .params = */ params,
         /* .grmr   = */ grmr,

@@ -405,6 +430,25 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
     auto & chain = gsmpl->chain;
     auto & cur_p = gsmpl->cur_p; // initialized by set_logits

+    // Check if a backend sampler has already sampled a token in which case we
+    // return that token id directly.
+    {
+        id = llama_get_sampled_token_ith(ctx, idx);
+
+        if (id != LLAMA_TOKEN_NULL) {
+            LOG_DBG("%s: Backend sampler selected token: '%d'. Will not run any CPU samplers\n", __func__, id);
+
+            GGML_ASSERT(!gsmpl->grmr && "using grammar in combination with backend sampling is not supported");
+
+            // TODO: simplify
+            gsmpl->cur.resize(1);
+            gsmpl->cur[0] = { id, 0.0f, 1.0f };
+            cur_p = { gsmpl->cur.data(), gsmpl->cur.size(), 0, true };
+
+            return id;
+        }
+    }
+
     gsmpl->set_logits(ctx, idx);

     if (grammar_first) {
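
For readers tracking the backend-sampling work, here is a compact sketch of the decision order these sampling.cpp hunks introduce. It uses only accessors that appear in this diff; the helper itself is illustrative (common_sampler_sample already performs the same check internally) and the signatures are assumed from the call sites rather than quoted from a header:

    #include "llama.h"     // llama_get_sampled_token_ith, LLAMA_TOKEN_NULL
    #include "sampling.h"  // common_sampler, common_sampler_sample

    // Illustrative only: make the new fallback order explicit for one output index.
    static llama_token pick_token(llama_context * ctx, common_sampler * smpl, int idx) {
        // 1. a backend (in-graph) sampler may already have chosen a token -> use it directly
        const llama_token sampled = llama_get_sampled_token_ith(ctx, idx);
        if (sampled != LLAMA_TOKEN_NULL) {
            return sampled;
        }
        // 2. otherwise run the CPU chain; its set_logits() now prefers the backend-provided
        //    candidate arrays (sampled probs/logits) and only scans the full vocabulary via
        //    llama_get_logits_ith() as a last resort
        return common_sampler_sample(smpl, ctx, idx);
    }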
package/src/llama.cpp/common/sampling.h

@@ -36,7 +36,8 @@ struct common_sampler;

 // llama_sampler API overloads

-struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params);
+// note: can mutate params in some cases
+struct common_sampler * common_sampler_init(const struct llama_model * model, struct common_params_sampling & params);

 void common_sampler_free(struct common_sampler * gsmpl);


@@ -48,6 +49,7 @@ struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl);
 // arguments can be nullptr to skip printing
 void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl);

+// get the underlying llama_sampler_chain
 struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl);

 // extended sampling implementation:
package/src/llama.cpp/ggml/CMakeLists.txt

@@ -4,7 +4,7 @@ project("ggml" C CXX ASM)
 ### GGML Version
 set(GGML_VERSION_MAJOR 0)
 set(GGML_VERSION_MINOR 9)
-set(GGML_VERSION_PATCH 4)
+set(GGML_VERSION_PATCH 5)
 set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")

 find_program(GIT_EXE NAMES git git.exe NO_CMAKE_FIND_ROOT_PATH)

@@ -430,10 +430,22 @@ if (MSVC)
     configure_msvc_target(ggml-cpu-x64)
     configure_msvc_target(ggml-cpu-sse42)
     configure_msvc_target(ggml-cpu-sandybridge)
+    # __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
+    # skipping ggml-cpu-ivybridge
+    # skipping ggml-cpu-piledriver
     configure_msvc_target(ggml-cpu-haswell)
     configure_msvc_target(ggml-cpu-skylakex)
+    configure_msvc_target(ggml-cpu-cannonlake)
+    configure_msvc_target(ggml-cpu-cascadelake)
     configure_msvc_target(ggml-cpu-icelake)
+    # MSVC 2022 doesn't support BF16 intrinsics without `/arch:AVX10.1` ?!
+    # https://learn.microsoft.com/en-us/cpp/intrinsics/x64-amd64-intrinsics-list?view=msvc-170
+    # https://learn.microsoft.com/en-us/cpp/build/reference/arch-x64?view=msvc-170
+    # skipping ggml-cpu-cooperlake
+    # skipping ggml-cpu-zen4
     configure_msvc_target(ggml-cpu-alderlake)
+    # MSVC doesn't support AMX
+    # skipping ggml-cpu-sapphirerapids

     if (GGML_BUILD_EXAMPLES)
         configure_msvc_target(common-ggml)
package/src/llama.cpp/ggml/include/ggml-backend.h

@@ -358,7 +358,7 @@ extern "C" {
     typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);

     // Compare the output of two backends
-    GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data, struct ggml_tensor * test_node);
+    GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data, struct ggml_tensor const * const * test_nodes, size_t num_test_nodes);

     // Tensor initialization
     GGML_API enum ggml_status ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
package/src/llama.cpp/ggml/src/CMakeLists.txt

@@ -357,15 +357,29 @@ if (GGML_CPU_ALL_VARIANTS)
     endif()
     if (GGML_SYSTEM_ARCH STREQUAL "x86")
         ggml_add_cpu_backend_variant(x64)
-        ggml_add_cpu_backend_variant(sse42 SSE42)
-        ggml_add_cpu_backend_variant(sandybridge SSE42 AVX)
-        ggml_add_cpu_backend_variant(haswell SSE42 AVX F16C AVX2 BMI2 FMA)
-        ggml_add_cpu_backend_variant(skylakex SSE42 AVX F16C AVX2 BMI2 FMA AVX512)
-        ggml_add_cpu_backend_variant(icelake SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
-        ggml_add_cpu_backend_variant(alderlake SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI)
+        ggml_add_cpu_backend_variant(sse42       SSE42)
+        ggml_add_cpu_backend_variant(sandybridge SSE42 AVX)
+        if (NOT MSVC)
+            # __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
+            ggml_add_cpu_backend_variant(ivybridge   SSE42 AVX F16C)
+            ggml_add_cpu_backend_variant(piledriver  SSE42 AVX F16C FMA)
+        endif()
+        ggml_add_cpu_backend_variant(haswell     SSE42 AVX F16C FMA AVX2 BMI2)
+        ggml_add_cpu_backend_variant(skylakex    SSE42 AVX F16C FMA AVX2 BMI2 AVX512)
+        ggml_add_cpu_backend_variant(cannonlake  SSE42 AVX F16C FMA AVX2 BMI2 AVX512 AVX512_VBMI)
+        ggml_add_cpu_backend_variant(cascadelake SSE42 AVX F16C FMA AVX2 BMI2 AVX512 AVX512_VNNI)
+        ggml_add_cpu_backend_variant(icelake     SSE42 AVX F16C FMA AVX2 BMI2 AVX512 AVX512_VBMI AVX512_VNNI)
+        if (NOT MSVC)
+            # MSVC 2022 doesn't support BF16 intrinsics without `/arch:AVX10.1` ?!
+            # https://learn.microsoft.com/en-us/cpp/intrinsics/x64-amd64-intrinsics-list?view=msvc-170
+            # https://learn.microsoft.com/en-us/cpp/build/reference/arch-x64?view=msvc-170
+            ggml_add_cpu_backend_variant(cooperlake  SSE42 AVX F16C FMA AVX2 BMI2 AVX512 AVX512_VNNI AVX512_BF16)
+            ggml_add_cpu_backend_variant(zen4        SSE42 AVX F16C FMA AVX2 BMI2 AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16)
+        endif()
+        ggml_add_cpu_backend_variant(alderlake   SSE42 AVX F16C FMA AVX2 BMI2 AVX_VNNI)
         if (NOT MSVC)
             # MSVC doesn't support AMX
-            ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
+            ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C FMA AVX2 BMI2 AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
         endif()
     elseif(GGML_SYSTEM_ARCH STREQUAL "ARM")
         if (CMAKE_SYSTEM_NAME MATCHES "Linux")

@@ -387,8 +401,8 @@ if (GGML_CPU_ALL_VARIANTS)
             ggml_add_cpu_backend_variant(android_armv8.2_2 DOTPROD FP16_VECTOR_ARITHMETIC)
             ggml_add_cpu_backend_variant(android_armv8.6_1 DOTPROD FP16_VECTOR_ARITHMETIC MATMUL_INT8)
             ggml_add_cpu_backend_variant(android_armv9.0_1 DOTPROD MATMUL_INT8 FP16_VECTOR_ARITHMETIC SVE2)
-            ggml_add_cpu_backend_variant(android_armv9.2_1 DOTPROD MATMUL_INT8 FP16_VECTOR_ARITHMETIC SME)
-            ggml_add_cpu_backend_variant(android_armv9.2_2 DOTPROD MATMUL_INT8 FP16_VECTOR_ARITHMETIC SVE SME)
+            ggml_add_cpu_backend_variant(android_armv9.2_1 DOTPROD MATMUL_INT8 FP16_VECTOR_ARITHMETIC SVE SME)
+            ggml_add_cpu_backend_variant(android_armv9.2_2 DOTPROD MATMUL_INT8 FP16_VECTOR_ARITHMETIC SVE SVE2 SME)
         elseif (APPLE)
             ggml_add_cpu_backend_variant(apple_m1 DOTPROD)
             ggml_add_cpu_backend_variant(apple_m2_m3 DOTPROD MATMUL_INT8)
package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt

@@ -561,9 +561,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)

         # Fetch KleidiAI sources:
         include(FetchContent)
-        set(KLEIDIAI_COMMIT_TAG "v1.14.0")
+        set(KLEIDIAI_COMMIT_TAG "v1.16.0")
         set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
-        set(KLEIDIAI_ARCHIVE_MD5 "45e110675d93f99f82c23a1afcca76bc")
+        set(KLEIDIAI_ARCHIVE_MD5 "0a9e9008adb6031f9e8cf70dff4a3321")

         if (POLICY CMP0135)
             cmake_policy(SET CMP0135 NEW)

@@ -615,6 +615,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
         string(FIND "${ARCH_FLAGS_TEMP}" "+dotprod" DOTPROD_ENABLED)
         string(FIND "${ARCH_FLAGS_TEMP}" "+i8mm" I8MM_ENABLED)
         string(FIND "${ARCH_FLAGS_TEMP}" "+sme" SME_ENABLED)
+        string(FIND "${ARCH_FLAGS_TEMP}" "+sve" SVE_ENABLED)

         set(PRIVATE_ARCH_FLAGS ${ARCH_FLAGS_TEMP})

@@ -659,6 +660,15 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
             set(PRIVATE_ARCH_FLAGS "-fno-tree-vectorize;${PRIVATE_ARCH_FLAGS}+sve+sve2")
         endif()

+        if (NOT SVE_ENABLED MATCHES -1)
+            list(APPEND GGML_KLEIDIAI_SOURCES
+                ${KLEIDIAI_SRC}/kai/kai_common_sve_asm.S
+                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod_asm.S
+                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod.c
+                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm_asm.S
+                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm.c)
+        endif()
+
         set_source_files_properties(${GGML_KLEIDIAI_SOURCES} PROPERTIES COMPILE_OPTIONS "${PRIVATE_ARCH_FLAGS}")
         list(APPEND GGML_CPU_SOURCES ${GGML_KLEIDIAI_SOURCES})
     endif()
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h

@@ -328,7 +328,7 @@ inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b)

 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <intrin.h>
-#elif defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__)
+#elif defined(__SSE__) || defined(__SSE3__) || defined(__SSSE3__) || defined(__AVX__) || defined(__F16C__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX512BF16__)
 #include <immintrin.h>
 #endif