@fugood/llama.node 0.3.12 → 0.3.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.ts +1 -0
  18. package/package.json +1 -1
  19. package/src/LlamaCompletionWorker.cpp +14 -0
  20. package/src/LlamaContext.cpp +13 -4
  21. package/src/llama.cpp/.github/workflows/build.yml +35 -3
  22. package/src/llama.cpp/.github/workflows/docker.yml +2 -0
  23. package/src/llama.cpp/.github/workflows/labeler.yml +1 -1
  24. package/src/llama.cpp/common/CMakeLists.txt +20 -3
  25. package/src/llama.cpp/common/arg.cpp +180 -3
  26. package/src/llama.cpp/common/chat-template.hpp +21 -7
  27. package/src/llama.cpp/common/chat.cpp +220 -101
  28. package/src/llama.cpp/common/chat.hpp +3 -0
  29. package/src/llama.cpp/common/common.h +15 -7
  30. package/src/llama.cpp/common/llguidance.cpp +3 -3
  31. package/src/llama.cpp/common/log.cpp +1 -0
  32. package/src/llama.cpp/common/log.h +2 -1
  33. package/src/llama.cpp/common/minja.hpp +24 -9
  34. package/src/llama.cpp/common/sampling.cpp +52 -46
  35. package/src/llama.cpp/common/speculative.h +1 -1
  36. package/src/llama.cpp/docs/build.md +2 -2
  37. package/src/llama.cpp/examples/imatrix/imatrix.cpp +2 -1
  38. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +6 -5
  39. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +1 -1
  40. package/src/llama.cpp/examples/perplexity/perplexity.cpp +1 -0
  41. package/src/llama.cpp/examples/run/run.cpp +5 -12
  42. package/src/llama.cpp/examples/server/CMakeLists.txt +1 -1
  43. package/src/llama.cpp/examples/server/httplib.h +381 -292
  44. package/src/llama.cpp/examples/server/server.cpp +58 -47
  45. package/src/llama.cpp/examples/server/utils.hpp +7 -5
  46. package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -1
  47. package/src/llama.cpp/ggml/include/ggml-metal.h +1 -1
  48. package/src/llama.cpp/ggml/include/ggml-vulkan.h +0 -2
  49. package/src/llama.cpp/ggml/include/ggml.h +1 -1
  50. package/src/llama.cpp/ggml/src/ggml-common.h +0 -2
  51. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +6 -12
  52. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +852 -268
  53. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +200 -107
  54. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -5
  55. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +9 -8
  56. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +2 -2
  57. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +26 -4
  58. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +6 -7
  59. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +812 -569
  60. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +25 -1
  61. package/src/llama.cpp/ggml/src/ggml.c +1 -1
  62. package/src/llama.cpp/include/llama.h +14 -10
  63. package/src/llama.cpp/src/llama-grammar.cpp +1 -1
  64. package/src/llama.cpp/src/llama-grammar.h +1 -1
  65. package/src/llama.cpp/src/llama-impl.h +6 -6
  66. package/src/llama.cpp/src/llama-kv-cache.h +1 -1
  67. package/src/llama.cpp/src/llama-mmap.h +1 -0
  68. package/src/llama.cpp/src/llama-model.cpp +1 -1
  69. package/src/llama.cpp/src/llama-sampling.cpp +131 -57
  70. package/src/llama.cpp/src/llama.cpp +7 -5
  71. package/src/llama.cpp/src/unicode.cpp +9 -2
  72. package/src/llama.cpp/tests/test-backend-ops.cpp +5 -5
  73. package/src/llama.cpp/tests/test-chat.cpp +237 -69
  74. package/src/llama.cpp/tests/test-gguf.cpp +4 -4
  75. package/src/llama.cpp/tests/test-sampling.cpp +15 -0
@@ -42,7 +42,7 @@ enum stop_type {
     STOP_TYPE_LIMIT,
 };
 
-// state diagram: https://github.com/ggerganov/llama.cpp/pull/9283
+// state diagram: https://github.com/ggml-org/llama.cpp/pull/9283
 enum slot_state {
     SLOT_STATE_IDLE,
     SLOT_STATE_STARTED, // TODO: this state is only used for setting up the initial prompt processing; maybe merge it with launch_slot_with_task in the future
@@ -173,6 +173,7 @@ struct slot_params {
             {"grammar_trigger_words", grammar_trigger_words},
             {"grammar_trigger_tokens", sampling.grammar_trigger_tokens},
             {"preserved_tokens", sampling.preserved_tokens},
+            {"chat_format", common_chat_format_name(oaicompat_chat_format)},
             {"samplers", samplers},
             {"speculative.n_max", speculative.n_max},
             {"speculative.n_min", speculative.n_min},
@@ -334,24 +335,24 @@ struct server_task {
         if (data.contains("json_schema") && !data.contains("grammar")) {
             try {
                 auto schema = json_value(data, "json_schema", json::object());
-                LOG_DBG("JSON schema: %s\n", schema.dump(2).c_str());
+                SRV_DBG("JSON schema: %s\n", schema.dump(2).c_str());
                 params.sampling.grammar = json_schema_to_grammar(schema);
-                LOG_DBG("Converted grammar: %s\n", params.sampling.grammar.c_str());
+                SRV_DBG("Converted grammar: %s\n", params.sampling.grammar.c_str());
             } catch (const std::exception & e) {
                 throw std::runtime_error(std::string("\"json_schema\": ") + e.what());
             }
         } else {
             params.sampling.grammar = json_value(data, "grammar", defaults.sampling.grammar);
-            LOG_DBG("Grammar: %s\n", params.sampling.grammar.c_str());
+            SRV_DBG("Grammar: %s\n", params.sampling.grammar.c_str());
             params.sampling.grammar_lazy = json_value(data, "grammar_lazy", defaults.sampling.grammar_lazy);
-            LOG_DBG("Grammar lazy: %s\n", params.sampling.grammar_lazy ? "true" : "false");
+            SRV_DBG("Grammar lazy: %s\n", params.sampling.grammar_lazy ? "true" : "false");
         }
 
         {
             auto it = data.find("chat_format");
             if (it != data.end()) {
                 params.oaicompat_chat_format = static_cast<common_chat_format>(it->get<int>());
-                LOG_INF("Chat format: %s\n", common_chat_format_name(params.oaicompat_chat_format).c_str());
+                SRV_INF("Chat format: %s\n", common_chat_format_name(params.oaicompat_chat_format).c_str());
             } else {
                 params.oaicompat_chat_format = defaults.oaicompat_chat_format;
             }
@@ -367,12 +368,12 @@ struct server_task {
 
                 auto ids = common_tokenize(vocab, trigger.word, /* add_special= */ false, /* parse_special= */ true);
                 if (ids.size() == 1) {
-                    LOG_DBG("Grammar trigger token: %d (`%s`)\n", ids[0], trigger.word.c_str());
+                    SRV_DBG("Grammar trigger token: %d (`%s`)\n", ids[0], trigger.word.c_str());
                     params.sampling.grammar_trigger_tokens.push_back(ids[0]);
                     params.sampling.preserved_tokens.insert(ids[0]);
                     continue;
                 }
-                LOG_DBG("Grammar trigger word: `%s`\n", trigger.word.c_str());
+                SRV_DBG("Grammar trigger word: `%s`\n", trigger.word.c_str());
                 params.sampling.grammar_trigger_words.push_back(trigger);
             }
         }
@@ -381,11 +382,11 @@ struct server_task {
             for (const auto & t : *preserved_tokens) {
                 auto ids = common_tokenize(vocab, t.get<std::string>(), /* add_special= */ false, /* parse_special= */ true);
                 if (ids.size() == 1) {
-                    LOG_DBG("Preserved token: %d\n", ids[0]);
+                    SRV_DBG("Preserved token: %d\n", ids[0]);
                     params.sampling.preserved_tokens.insert(ids[0]);
                 } else {
                     // This may happen when using a tool call style meant for a model with special tokens to preserve on a model without said tokens.
-                    LOG_WRN("Not preserved because more than 1 token (wrong chat template override?): %s\n", t.get<std::string>().c_str());
+                    SRV_WRN("Not preserved because more than 1 token (wrong chat template override?): %s\n", t.get<std::string>().c_str());
                 }
             }
         }
@@ -717,16 +718,26 @@ struct server_task_result_cmpl_final : server_task_result {
         std::string finish_reason = "length";
         common_chat_msg msg;
         if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) {
-            LOG_DBG("Parsing chat message: %s\n", content.c_str());
+            SRV_DBG("Parsing chat message: %s\n", content.c_str());
             msg = common_chat_parse(content, oaicompat_chat_format);
             finish_reason = msg.tool_calls.empty() ? "stop" : "tool_calls";
         } else {
             msg.content = content;
         }
 
-        json tool_calls;
+        json message {
+            {"role", "assistant"},
+        };
+        if (!msg.reasoning_content.empty()) {
+            message["reasoning_content"] = msg.reasoning_content;
+        }
+        if (msg.content.empty() && !msg.tool_calls.empty()) {
+            message["content"] = json();
+        } else {
+            message["content"] = msg.content;
+        }
         if (!msg.tool_calls.empty()) {
-            tool_calls = json::array();
+            auto tool_calls = json::array();
             for (const auto & tc : msg.tool_calls) {
                 tool_calls.push_back({
                     {"type", "function"},
@@ -737,15 +748,7 @@ struct server_task_result_cmpl_final : server_task_result {
                     {"id", tc.id},
                 });
             }
-        }
-
-        json message {
-            {"content", msg.content},
-            {"tool_calls", tool_calls},
-            {"role", "assistant"},
-        };
-        if (!msg.tool_plan.empty()) {
-            message["tool_plan"] = msg.tool_plan;
+            message["tool_calls"] = tool_calls;
         }
 
         json choice {
@@ -1600,6 +1603,10 @@ struct server_queue {
 
         while (true) {
             std::unique_lock<std::mutex> lock(mutex_tasks);
+            if (!running) {
+                QUE_DBG("%s", "terminate\n");
+                return;
+            }
             if (queue_tasks.empty()) {
                 lock.unlock();
                 break;
@@ -1620,11 +1627,11 @@ struct server_queue {
             QUE_DBG("%s", "waiting for new tasks\n");
             {
                 std::unique_lock<std::mutex> lock(mutex_tasks);
+                if (!running) {
+                    QUE_DBG("%s", "terminate\n");
+                    return;
+                }
                 if (queue_tasks.empty()) {
-                    if (!running) {
-                        QUE_DBG("%s", "terminate\n");
-                        return;
-                    }
                     condition_tasks.wait(lock, [&]{
                         return (!queue_tasks.empty() || !running);
                     });
@@ -1885,7 +1892,7 @@ struct server_context {
         }
 
         if (params_base.chat_template.empty() && !validate_builtin_chat_template(params.use_jinja)) {
-            LOG_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__);
+            SRV_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__);
             chat_templates = common_chat_templates_from_model(model, "chatml");
         } else {
             chat_templates = common_chat_templates_from_model(model, params_base.chat_template);
@@ -2069,8 +2076,8 @@ struct server_context {
 
         if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) {
             // Might be better to reject the request with a 400 ?
+            SLT_WRN(slot, "n_predict = %d exceeds server configuration, setting to %d", slot.params.n_predict, slot.n_predict);
             slot.params.n_predict = slot.n_predict;
-            SLT_WRN(slot, "n_predict = %d exceeds server configuration, setting to %d", slot.n_predict, slot.n_predict);
         }
 
         if (slot.params.ignore_eos && has_eos_token) {
@@ -2275,7 +2282,7 @@ struct server_context {
             for (size_t i = 0; i < std::min(max_probs, n_probs); i++) {
                 result.probs.push_back({
                     cur_p->data[i].id,
-                    common_detokenize(ctx, {cur_p->data[i].id}, special),
+                    common_token_to_piece(ctx, cur_p->data[i].id, special),
                     cur_p->data[i].p
                 });
             }
@@ -2297,7 +2304,7 @@ struct server_context {
             for (size_t i = 0; i < std::min(n_vocab, n_probs); i++) {
                 result.probs.push_back({
                     cur[i].id,
-                    common_detokenize(ctx, {cur[i].id}, special),
+                    common_token_to_piece(ctx, cur[i].id, special),
                     cur[i].p
                 });
             }
@@ -3355,10 +3362,10 @@ static void log_server_request(const httplib::Request & req, const httplib::Resp
 
     // reminder: this function is not covered by httplib's exception handler; if someone does more complicated stuff, think about wrapping it in try-catch
 
-    LOG_INF("request: %s %s %s %d\n", req.method.c_str(), req.path.c_str(), req.remote_addr.c_str(), res.status);
+    SRV_INF("request: %s %s %s %d\n", req.method.c_str(), req.path.c_str(), req.remote_addr.c_str(), res.status);
 
-    LOG_DBG("request: %s\n", req.body.c_str());
-    LOG_DBG("response: %s\n", res.body.c_str());
+    SRV_DBG("request: %s\n", req.body.c_str());
+    SRV_DBG("response: %s\n", res.body.c_str());
 }
 
 std::function<void(int)> shutdown_handler;
@@ -3649,7 +3656,7 @@ int main(int argc, char ** argv) {
             }, {
                     {"name", "n_busy_slots_per_decode"},
                     {"help", "Average number of busy slots per llama_decode() call"},
-                    {"value", (float) res_metrics->n_busy_slots_total / (float) res_metrics->n_decode_total}
+                    {"value", (float) res_metrics->n_busy_slots_total / std::max((float) res_metrics->n_decode_total, 1.f)}
             }}},
             {"gauge", {{
                     {"name", "prompt_tokens_seconds"},
@@ -3860,7 +3867,9 @@ int main(int argc, char ** argv) {
 
         try {
             const auto & prompt = data.at("prompt");
-            LOG_DBG("Prompt: %s\n", prompt.is_string() ? prompt.get<std::string>().c_str() : prompt.dump(2).c_str());
+            // TODO: this log can become very long, put it behind a flag or think about a more compact format
+            //SRV_DBG("Prompt: %s\n", prompt.is_string() ? prompt.get<std::string>().c_str() : prompt.dump(2).c_str());
+
             std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true);
             tasks.reserve(tokenized_prompts.size());
             for (size_t i = 0; i < tokenized_prompts.size(); i++) {
@@ -4054,7 +4063,7 @@ int main(int argc, char ** argv) {
         }
 
         auto body = json::parse(req.body);
-        json data = oaicompat_completion_params_parse(body, params.use_jinja, ctx_server.chat_templates);
+        json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates);
 
         return handle_completions_impl(
             SERVER_TASK_TYPE_COMPLETION,
@@ -4067,7 +4076,7 @@ int main(int argc, char ** argv) {
     // same with handle_chat_completions, but without inference part
     const auto handle_apply_template = [&ctx_server, &params, &res_ok](const httplib::Request & req, httplib::Response & res) {
         auto body = json::parse(req.body);
-        json data = oaicompat_completion_params_parse(body, params.use_jinja, ctx_server.chat_templates);
+        json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates);
         res_ok(res, {{ "prompt", std::move(data.at("prompt")) }});
     };
 
@@ -4376,6 +4385,9 @@ int main(int argc, char ** argv) {
                 res.set_content("Error: gzip is not supported by this browser", "text/plain");
             } else {
                 res.set_header("Content-Encoding", "gzip");
+                // COEP and COOP headers, required by pyodide (python interpreter)
+                res.set_header("Cross-Origin-Embedder-Policy", "require-corp");
+                res.set_header("Cross-Origin-Opener-Policy", "same-origin");
                 res.set_content(reinterpret_cast<const char*>(index_html_gz), index_html_gz_len, "text/html; charset=utf-8");
             }
             return false;
@@ -4425,6 +4437,7 @@ int main(int argc, char ** argv) {
 
     // clean up function, to be called before exit
     auto clean_up = [&svr]() {
+        SRV_INF("%s: cleaning up before exit...\n", __func__);
         svr->stop();
         llama_backend_free();
     };
@@ -4441,10 +4454,6 @@ int main(int argc, char ** argv) {
     }
 
     if (!was_bound) {
-        //LOG_ERROR("couldn't bind HTTP server socket", {
-        //    {"hostname", params.hostname},
-        //    {"port", params.port},
-        //});
         LOG_ERR("%s: couldn't bind HTTP server socket, hostname: %s, port: %d\n", __func__, params.hostname.c_str(), params.port);
         clean_up();
         return 1;
@@ -4461,7 +4470,7 @@ int main(int argc, char ** argv) {
 
     if (!ctx_server.load_model(params)) {
         clean_up();
-        t.join();
+        // t.join(); // FIXME: see below
         LOG_ERR("%s: exiting due to model loading error\n", __func__);
         return 1;
     }
@@ -4485,13 +4494,10 @@ int main(int argc, char ** argv) {
     });
 
     shutdown_handler = [&](int) {
+        // this will unblock start_loop()
         ctx_server.queue_tasks.terminate();
     };
 
-    LOG_INF("%s: server is listening on http://%s:%d - starting the main loop\n", __func__, params.hostname.c_str(), params.port);
-
-    ctx_server.queue_tasks.start_loop();
-
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
     struct sigaction sigint_action;
     sigint_action.sa_handler = signal_handler;
@@ -4506,8 +4512,13 @@ int main(int argc, char ** argv) {
     SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
 #endif
 
+    LOG_INF("%s: server is listening on http://%s:%d - starting the main loop\n", __func__, params.hostname.c_str(), params.port);
+
+    // this call blocks the main thread until queue_tasks.terminate() is called
+    ctx_server.queue_tasks.start_loop();
+
     clean_up();
-    t.join();
+    // t.join(); // FIXME: http thread may stuck if there is an on-going request. we don't need to care about this for now as the HTTP connection will already be closed at this point, but it's better to fix this
 
     return 0;
 }
@@ -367,10 +367,10 @@ inline std::string format_chat(const common_chat_template & tmpl, const std::vec
                 }
             }
         } else {
-            throw std::runtime_error("Invalid 'content' type (ref: https://github.com/ggerganov/llama.cpp/issues/8367)");
+            throw std::runtime_error("Invalid 'content' type (ref: https://github.com/ggml-org/llama.cpp/issues/8367)");
         }
     } else {
-        throw std::runtime_error("Missing 'content' (ref: https://github.com/ggerganov/llama.cpp/issues/8367)");
+        throw std::runtime_error("Missing 'content' (ref: https://github.com/ggml-org/llama.cpp/issues/8367)");
     }
 
     chat.push_back({role, content, /* tool_calls= */ {}});
@@ -578,6 +578,7 @@ static json oaicompat_completion_params_parse(const json & body) {
 static json oaicompat_completion_params_parse(
     const json & body, /* openai api json semantics */
     bool use_jinja,
+    common_reasoning_format reasoning_format,
     const common_chat_templates & chat_templates)
 {
     json llama_params;
@@ -633,9 +634,10 @@ static json oaicompat_completion_params_parse(
             throw std::runtime_error("Cannot use custom grammar constraints with tools.");
         }
         common_chat_inputs inputs;
-        inputs.messages = body.at("messages");
-        inputs.tools = tools;
-        inputs.tool_choice = tool_choice;
+        inputs.extract_reasoning = reasoning_format != COMMON_REASONING_FORMAT_NONE;
+        inputs.messages = body.at("messages");
+        inputs.tools = tools;
+        inputs.tool_choice = tool_choice;
         inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", false);
         if (inputs.parallel_tool_calls && !tmpl.original_caps().supports_parallel_tool_calls) {
             LOG_DBG("Disabling parallel_tool_calls because the template does not support it\n");
@@ -8,7 +8,7 @@ extern "C" {
 #endif
 
     // the compute plan that needs to be prepared for ggml_graph_compute()
-    // since https://github.com/ggerganov/ggml/issues/287
+    // since https://github.com/ggml-org/ggml/issues/287
     struct ggml_cplan {
         size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()`
         uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
@@ -45,7 +45,7 @@ GGML_BACKEND_API bool ggml_backend_is_metal(ggml_backend_t backend);
 
 GGML_DEPRECATED(
     GGML_BACKEND_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size),
-    "obsoleted by the new device interface - https://github.com/ggerganov/llama.cpp/pull/9713");
+    "obsoleted by the new device interface - https://github.com/ggml-org/llama.cpp/pull/9713");
 
 GGML_BACKEND_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data);
 
@@ -10,8 +10,6 @@ extern "C" {
 #define GGML_VK_NAME "Vulkan"
 #define GGML_VK_MAX_DEVICES 16
 
-GGML_BACKEND_API void ggml_vk_instance_init(void);
-
 // backend API
 GGML_BACKEND_API ggml_backend_t ggml_backend_vk_init(size_t dev_num);
 
@@ -198,7 +198,7 @@
 
 #ifndef __GNUC__
 # define GGML_ATTRIBUTE_FORMAT(...)
-#elif defined(__MINGW32__)
+#elif defined(__MINGW32__) && !defined(__clang__)
 # define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
 #else
 # define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
@@ -473,7 +473,6 @@ GGML_TABLE_BEGIN(uint8_t, ksigns_iq2xs, 128)
     240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255,
 GGML_TABLE_END()
 
-//#if __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A // lowest compute capability for integer intrinsics
 GGML_TABLE_BEGIN(uint64_t, ksigns64, 128)
     0x0000000000000000, 0xff000000000000ff, 0xff0000000000ff00, 0x000000000000ffff,
     0xff00000000ff0000, 0x0000000000ff00ff, 0x0000000000ffff00, 0xff00000000ffffff,
@@ -508,7 +507,6 @@ GGML_TABLE_BEGIN(uint64_t, ksigns64, 128)
     0x00ffffffff000000, 0xffffffffff0000ff, 0xffffffffff00ff00, 0x00ffffffff00ffff,
     0xffffffffffff0000, 0x00ffffffffff00ff, 0x00ffffffffffff00, 0xffffffffffffffff,
 GGML_TABLE_END()
-//#endif
 
 
 GGML_TABLE_BEGIN(uint64_t, iq2xxs_grid, 256)
@@ -360,21 +360,15 @@ inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b)
 #endif
 
 #if defined(__loongarch_asx)
-
-typedef union {
-    int32_t i;
-    float f;
-} ft_union;
-
 /* float type data load instructions */
-static __m128 __lsx_vreplfr2vr_s(float val) {
-    ft_union fi_tmpval = {.f = val};
-    return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i);
+static __m128 __lsx_vreplfr2vr_s(const float val) {
+    v4f32 res = {val, val, val, val};
+    return (__m128)res;
 }
 
-static __m256 __lasx_xvreplfr2vr_s(float val) {
-    ft_union fi_tmpval = {.f = val};
-    return (__m256)__lasx_xvreplgr2vr_w(fi_tmpval.i);
+static __m256 __lasx_xvreplfr2vr_s(const float val) {
+    v8f32 res = {val, val, val, val, val, val, val, val};
+    return (__m256)res;
 }
 #endif
 