@fugood/llama.node 0.3.11 → 0.3.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +1 -0
- package/lib/index.js +26 -20
- package/lib/index.ts +32 -28
- package/package.json +1 -1
- package/src/LlamaCompletionWorker.cpp +14 -0
- package/src/LlamaContext.cpp +13 -4
- package/src/llama.cpp/.github/workflows/build.yml +35 -3
- package/src/llama.cpp/.github/workflows/docker.yml +2 -0
- package/src/llama.cpp/.github/workflows/labeler.yml +1 -1
- package/src/llama.cpp/common/CMakeLists.txt +20 -3
- package/src/llama.cpp/common/arg.cpp +180 -3
- package/src/llama.cpp/common/chat-template.hpp +21 -7
- package/src/llama.cpp/common/chat.cpp +220 -101
- package/src/llama.cpp/common/chat.hpp +3 -0
- package/src/llama.cpp/common/common.h +15 -7
- package/src/llama.cpp/common/llguidance.cpp +3 -3
- package/src/llama.cpp/common/log.cpp +1 -0
- package/src/llama.cpp/common/log.h +2 -1
- package/src/llama.cpp/common/minja.hpp +24 -9
- package/src/llama.cpp/common/sampling.cpp +52 -46
- package/src/llama.cpp/common/speculative.h +1 -1
- package/src/llama.cpp/docs/build.md +2 -2
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +2 -1
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +6 -5
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +1 -0
- package/src/llama.cpp/examples/run/run.cpp +5 -12
- package/src/llama.cpp/examples/server/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/server/httplib.h +381 -292
- package/src/llama.cpp/examples/server/server.cpp +58 -47
- package/src/llama.cpp/examples/server/utils.hpp +7 -5
- package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-metal.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +0 -2
- package/src/llama.cpp/ggml/include/ggml.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-common.h +0 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +6 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +852 -268
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +200 -107
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +9 -8
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +2 -2
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +26 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +6 -7
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +812 -569
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +25 -1
- package/src/llama.cpp/ggml/src/ggml.c +1 -1
- package/src/llama.cpp/include/llama.h +14 -10
- package/src/llama.cpp/src/llama-grammar.cpp +1 -1
- package/src/llama.cpp/src/llama-grammar.h +1 -1
- package/src/llama.cpp/src/llama-impl.h +6 -6
- package/src/llama.cpp/src/llama-kv-cache.h +1 -1
- package/src/llama.cpp/src/llama-mmap.h +1 -0
- package/src/llama.cpp/src/llama-model.cpp +1 -1
- package/src/llama.cpp/src/llama-sampling.cpp +131 -57
- package/src/llama.cpp/src/llama.cpp +7 -5
- package/src/llama.cpp/src/unicode.cpp +9 -2
- package/src/llama.cpp/tests/test-backend-ops.cpp +5 -5
- package/src/llama.cpp/tests/test-chat.cpp +237 -69
- package/src/llama.cpp/tests/test-gguf.cpp +4 -4
- package/src/llama.cpp/tests/test-sampling.cpp +15 -0
package/src/llama.cpp/examples/server/server.cpp

@@ -42,7 +42,7 @@ enum stop_type {
     STOP_TYPE_LIMIT,
 };
 
-// state diagram: https://github.com/
+// state diagram: https://github.com/ggml-org/llama.cpp/pull/9283
 enum slot_state {
     SLOT_STATE_IDLE,
     SLOT_STATE_STARTED, // TODO: this state is only used for setting up the initial prompt processing; maybe merge it with launch_slot_with_task in the future
@@ -173,6 +173,7 @@ struct slot_params {
             {"grammar_trigger_words", grammar_trigger_words},
             {"grammar_trigger_tokens", sampling.grammar_trigger_tokens},
             {"preserved_tokens", sampling.preserved_tokens},
+            {"chat_format", common_chat_format_name(oaicompat_chat_format)},
             {"samplers", samplers},
             {"speculative.n_max", speculative.n_max},
             {"speculative.n_min", speculative.n_min},
@@ -334,24 +335,24 @@ struct server_task {
         if (data.contains("json_schema") && !data.contains("grammar")) {
             try {
                 auto schema = json_value(data, "json_schema", json::object());
-
+                SRV_DBG("JSON schema: %s\n", schema.dump(2).c_str());
                 params.sampling.grammar = json_schema_to_grammar(schema);
-
+                SRV_DBG("Converted grammar: %s\n", params.sampling.grammar.c_str());
             } catch (const std::exception & e) {
                 throw std::runtime_error(std::string("\"json_schema\": ") + e.what());
             }
         } else {
             params.sampling.grammar = json_value(data, "grammar", defaults.sampling.grammar);
-
+            SRV_DBG("Grammar: %s\n", params.sampling.grammar.c_str());
             params.sampling.grammar_lazy = json_value(data, "grammar_lazy", defaults.sampling.grammar_lazy);
-
+            SRV_DBG("Grammar lazy: %s\n", params.sampling.grammar_lazy ? "true" : "false");
         }
 
         {
             auto it = data.find("chat_format");
             if (it != data.end()) {
                 params.oaicompat_chat_format = static_cast<common_chat_format>(it->get<int>());
-
+                SRV_INF("Chat format: %s\n", common_chat_format_name(params.oaicompat_chat_format).c_str());
             } else {
                 params.oaicompat_chat_format = defaults.oaicompat_chat_format;
             }
@@ -367,12 +368,12 @@ struct server_task {
 
                     auto ids = common_tokenize(vocab, trigger.word, /* add_special= */ false, /* parse_special= */ true);
                     if (ids.size() == 1) {
-
+                        SRV_DBG("Grammar trigger token: %d (`%s`)\n", ids[0], trigger.word.c_str());
                         params.sampling.grammar_trigger_tokens.push_back(ids[0]);
                         params.sampling.preserved_tokens.insert(ids[0]);
                         continue;
                     }
-
+                    SRV_DBG("Grammar trigger word: `%s`\n", trigger.word.c_str());
                     params.sampling.grammar_trigger_words.push_back(trigger);
                 }
             }
@@ -381,11 +382,11 @@ struct server_task {
                 for (const auto & t : *preserved_tokens) {
                     auto ids = common_tokenize(vocab, t.get<std::string>(), /* add_special= */ false, /* parse_special= */ true);
                     if (ids.size() == 1) {
-
+                        SRV_DBG("Preserved token: %d\n", ids[0]);
                         params.sampling.preserved_tokens.insert(ids[0]);
                     } else {
                         // This may happen when using a tool call style meant for a model with special tokens to preserve on a model without said tokens.
-
+                        SRV_WRN("Not preserved because more than 1 token (wrong chat template override?): %s\n", t.get<std::string>().c_str());
                     }
                 }
             }
@@ -717,16 +718,26 @@ struct server_task_result_cmpl_final : server_task_result {
         std::string finish_reason = "length";
         common_chat_msg msg;
         if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) {
-
+            SRV_DBG("Parsing chat message: %s\n", content.c_str());
             msg = common_chat_parse(content, oaicompat_chat_format);
             finish_reason = msg.tool_calls.empty() ? "stop" : "tool_calls";
         } else {
             msg.content = content;
         }
 
-        json
+        json message {
+            {"role", "assistant"},
+        };
+        if (!msg.reasoning_content.empty()) {
+            message["reasoning_content"] = msg.reasoning_content;
+        }
+        if (msg.content.empty() && !msg.tool_calls.empty()) {
+            message["content"] = json();
+        } else {
+            message["content"] = msg.content;
+        }
         if (!msg.tool_calls.empty()) {
-            tool_calls = json::array();
+            auto tool_calls = json::array();
             for (const auto & tc : msg.tool_calls) {
                 tool_calls.push_back({
                     {"type", "function"},
@@ -737,15 +748,7 @@ struct server_task_result_cmpl_final : server_task_result {
                     {"id", tc.id},
                 });
             }
-
-
-        json message {
-            {"content", msg.content},
-            {"tool_calls", tool_calls},
-            {"role", "assistant"},
-        };
-        if (!msg.tool_plan.empty()) {
-            message["tool_plan"] = msg.tool_plan;
+            message["tool_calls"] = tool_calls;
         }
 
         json choice {
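Note: the two hunks above rework how the final chat-completion message is assembled: `reasoning_content` is included when the parser extracted it, `content` becomes JSON null when the model produced only tool calls, and the `tool_plan` field is dropped. Below is a rough sketch of the resulting shape using nlohmann::json; it is illustrative only (the tool call id and sample strings are made up), not the server code.

```cpp
#include <iostream>
#include <string>
#include <nlohmann/json.hpp>

using json = nlohmann::json;

int main() {
    std::string content;                                   // empty: the model emitted only a tool call
    std::string reasoning = "thinking about the request";  // made-up example value
    json tool_calls = json::array({
        {{"type", "function"}, {"function", {{"name", "get_weather"}, {"arguments", "{}"}}}, {"id", "call_0"}},
    });

    json message {
        {"role", "assistant"},
    };
    if (!reasoning.empty()) {
        message["reasoning_content"] = reasoning;
    }
    if (content.empty() && !tool_calls.empty()) {
        message["content"] = json();                       // serialized as null
    } else {
        message["content"] = content;
    }
    if (!tool_calls.empty()) {
        message["tool_calls"] = tool_calls;
    }

    std::cout << message.dump(2) << std::endl;
    return 0;
}
```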
@@ -1600,6 +1603,10 @@ struct server_queue {
 
         while (true) {
             std::unique_lock<std::mutex> lock(mutex_tasks);
+            if (!running) {
+                QUE_DBG("%s", "terminate\n");
+                return;
+            }
             if (queue_tasks.empty()) {
                 lock.unlock();
                 break;
@@ -1620,11 +1627,11 @@ struct server_queue {
             QUE_DBG("%s", "waiting for new tasks\n");
             {
                 std::unique_lock<std::mutex> lock(mutex_tasks);
+                if (!running) {
+                    QUE_DBG("%s", "terminate\n");
+                    return;
+                }
                 if (queue_tasks.empty()) {
-                    if (!running) {
-                        QUE_DBG("%s", "terminate\n");
-                        return;
-                    }
                     condition_tasks.wait(lock, [&]{
                         return (!queue_tasks.empty() || !running);
                     });
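Note: the two `server_queue` hunks above move the `running` check so it happens right after the lock is acquired, rather than only inside the empty-queue branch, so a pending terminate() is honored even while tasks are still queued. A simplified sketch of the pattern follows, with member names borrowed from the server but not the actual implementation.

```cpp
#include <condition_variable>
#include <deque>
#include <mutex>

struct task_queue {
    std::mutex              mutex_tasks;
    std::condition_variable condition_tasks;
    std::deque<int>         queue_tasks;
    bool                    running = true;

    void loop() {
        while (true) {
            std::unique_lock<std::mutex> lock(mutex_tasks);
            if (!running) {
                return;                           // honored even if tasks are still queued
            }
            if (queue_tasks.empty()) {
                condition_tasks.wait(lock, [&] {
                    return !queue_tasks.empty() || !running;
                });
            }
            // ... pop and process one task here ...
        }
    }

    void terminate() {
        {
            std::lock_guard<std::mutex> lock(mutex_tasks);
            running = false;
        }
        condition_tasks.notify_all();             // unblocks a loop() waiting on the condition
    }
};

int main() {
    task_queue q;
    q.terminate();   // with no worker thread running this just flips the flag
    q.loop();        // returns immediately because running == false
    return 0;
}
```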
@@ -1885,7 +1892,7 @@ struct server_context {
         }
 
         if (params_base.chat_template.empty() && !validate_builtin_chat_template(params.use_jinja)) {
-
+            SRV_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__);
             chat_templates = common_chat_templates_from_model(model, "chatml");
         } else {
             chat_templates = common_chat_templates_from_model(model, params_base.chat_template);
@@ -2069,8 +2076,8 @@ struct server_context {
 
         if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) {
             // Might be better to reject the request with a 400 ?
+            SLT_WRN(slot, "n_predict = %d exceeds server configuration, setting to %d", slot.params.n_predict, slot.n_predict);
             slot.params.n_predict = slot.n_predict;
-            SLT_WRN(slot, "n_predict = %d exceeds server configuration, setting to %d", slot.n_predict, slot.n_predict);
         }
 
         if (slot.params.ignore_eos && has_eos_token) {
@@ -2275,7 +2282,7 @@ struct server_context {
             for (size_t i = 0; i < std::min(max_probs, n_probs); i++) {
                 result.probs.push_back({
                     cur_p->data[i].id,
-
+                    common_token_to_piece(ctx, cur_p->data[i].id, special),
                     cur_p->data[i].p
                 });
             }
@@ -2297,7 +2304,7 @@ struct server_context {
             for (size_t i = 0; i < std::min(n_vocab, n_probs); i++) {
                 result.probs.push_back({
                     cur[i].id,
-
+                    common_token_to_piece(ctx, cur[i].id, special),
                     cur[i].p
                 });
             }
@@ -3355,10 +3362,10 @@ static void log_server_request(const httplib::Request & req, const httplib::Resp
 
     // reminder: this function is not covered by httplib's exception handler; if someone does more complicated stuff, think about wrapping it in try-catch
 
-
+    SRV_INF("request: %s %s %s %d\n", req.method.c_str(), req.path.c_str(), req.remote_addr.c_str(), res.status);
 
-
-
+    SRV_DBG("request: %s\n", req.body.c_str());
+    SRV_DBG("response: %s\n", res.body.c_str());
 }
 
 std::function<void(int)> shutdown_handler;
@@ -3649,7 +3656,7 @@ int main(int argc, char ** argv) {
             }, {
                     {"name", "n_busy_slots_per_decode"},
                     {"help", "Average number of busy slots per llama_decode() call"},
-                    {"value", (float) res_metrics->n_busy_slots_total / (float) res_metrics->n_decode_total}
+                    {"value", (float) res_metrics->n_busy_slots_total / std::max((float) res_metrics->n_decode_total, 1.f)}
             }}},
             {"gauge", {{
                     {"name", "prompt_tokens_seconds"},
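Note: the metrics change above clamps the denominator so `n_busy_slots_per_decode` stays finite (0 instead of NaN from 0/0) before any llama_decode() call has happened. A minimal illustration of the guard, not the server code:

```cpp
#include <algorithm>
#include <cstdio>

int main() {
    float n_busy_slots_total = 0.f;
    float n_decode_total     = 0.f;   // no llama_decode() calls yet
    float avg = n_busy_slots_total / std::max(n_decode_total, 1.f);
    std::printf("n_busy_slots_per_decode = %g\n", avg);   // prints 0 instead of NaN
    return 0;
}
```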
@@ -3860,7 +3867,9 @@ int main(int argc, char ** argv) {
 
         try {
             const auto & prompt = data.at("prompt");
-
+            // TODO: this log can become very long, put it behind a flag or think about a more compact format
+            //SRV_DBG("Prompt: %s\n", prompt.is_string() ? prompt.get<std::string>().c_str() : prompt.dump(2).c_str());
+
             std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true);
             tasks.reserve(tokenized_prompts.size());
             for (size_t i = 0; i < tokenized_prompts.size(); i++) {
@@ -4054,7 +4063,7 @@ int main(int argc, char ** argv) {
         }
 
         auto body = json::parse(req.body);
-        json data = oaicompat_completion_params_parse(body, params.use_jinja, ctx_server.chat_templates);
+        json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates);
 
         return handle_completions_impl(
             SERVER_TASK_TYPE_COMPLETION,
@@ -4067,7 +4076,7 @@ int main(int argc, char ** argv) {
     // same with handle_chat_completions, but without inference part
     const auto handle_apply_template = [&ctx_server, &params, &res_ok](const httplib::Request & req, httplib::Response & res) {
         auto body = json::parse(req.body);
-        json data = oaicompat_completion_params_parse(body, params.use_jinja, ctx_server.chat_templates);
+        json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates);
         res_ok(res, {{ "prompt", std::move(data.at("prompt")) }});
     };
 
@@ -4376,6 +4385,9 @@ int main(int argc, char ** argv) {
                 res.set_content("Error: gzip is not supported by this browser", "text/plain");
             } else {
                 res.set_header("Content-Encoding", "gzip");
+                // COEP and COOP headers, required by pyodide (python interpreter)
+                res.set_header("Cross-Origin-Embedder-Policy", "require-corp");
+                res.set_header("Cross-Origin-Opener-Policy", "same-origin");
                 res.set_content(reinterpret_cast<const char*>(index_html_gz), index_html_gz_len, "text/html; charset=utf-8");
             }
             return false;
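Note: the added Cross-Origin-Embedder-Policy and Cross-Origin-Opener-Policy headers make the served Web UI cross-origin isolated, which pyodide (and anything else built on SharedArrayBuffer) requires. A standalone cpp-httplib sketch of the same idea follows; the route and payload are made up, only the two header values come from the diff.

```cpp
#include "httplib.h"   // cpp-httplib, the same single-header library bundled as examples/server/httplib.h

int main() {
    httplib::Server svr;

    svr.Get("/", [](const httplib::Request &, httplib::Response & res) {
        // both headers must be present for the page to become crossOriginIsolated
        res.set_header("Cross-Origin-Embedder-Policy", "require-corp");
        res.set_header("Cross-Origin-Opener-Policy", "same-origin");
        res.set_content("<html><body>isolated page</body></html>", "text/html; charset=utf-8");
    });

    svr.listen("127.0.0.1", 8080);
    return 0;
}
```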
@@ -4425,6 +4437,7 @@ int main(int argc, char ** argv) {
 
     // clean up function, to be called before exit
     auto clean_up = [&svr]() {
+        SRV_INF("%s: cleaning up before exit...\n", __func__);
         svr->stop();
         llama_backend_free();
     };
@@ -4441,10 +4454,6 @@ int main(int argc, char ** argv) {
     }
 
     if (!was_bound) {
-        //LOG_ERROR("couldn't bind HTTP server socket", {
-        //    {"hostname", params.hostname},
-        //    {"port", params.port},
-        //});
         LOG_ERR("%s: couldn't bind HTTP server socket, hostname: %s, port: %d\n", __func__, params.hostname.c_str(), params.port);
         clean_up();
         return 1;
@@ -4461,7 +4470,7 @@ int main(int argc, char ** argv) {
 
     if (!ctx_server.load_model(params)) {
         clean_up();
-        t.join();
+        // t.join(); // FIXME: see below
         LOG_ERR("%s: exiting due to model loading error\n", __func__);
         return 1;
     }
@@ -4485,13 +4494,10 @@ int main(int argc, char ** argv) {
     });
 
     shutdown_handler = [&](int) {
+        // this will unblock start_loop()
         ctx_server.queue_tasks.terminate();
     };
 
-    LOG_INF("%s: server is listening on http://%s:%d - starting the main loop\n", __func__, params.hostname.c_str(), params.port);
-
-    ctx_server.queue_tasks.start_loop();
-
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
     struct sigaction sigint_action;
     sigint_action.sa_handler = signal_handler;
@@ -4506,8 +4512,13 @@ int main(int argc, char ** argv) {
     SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
 #endif
 
+    LOG_INF("%s: server is listening on http://%s:%d - starting the main loop\n", __func__, params.hostname.c_str(), params.port);
+
+    // this call blocks the main thread until queue_tasks.terminate() is called
+    ctx_server.queue_tasks.start_loop();
+
     clean_up();
-    t.join();
+    // t.join(); // FIXME: http thread may stuck if there is an on-going request. we don't need to care about this for now as the HTTP connection will already be closed at this point, but it's better to fix this
 
     return 0;
 }
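Note: taken together, the hunks above reorder startup and shutdown: the SIGINT/CTRL-C handlers are installed before the blocking task loop starts, start_loop() now runs after the "server is listening" log, and clean_up() runs only once terminate() unblocks the loop (t.join() stays commented out with a FIXME). A stripped-down sketch of that control flow, illustrative only and not the server code:

```cpp
#include <atomic>
#include <csignal>
#include <cstdio>

static std::atomic<bool> running{true};

static void signal_handler(int) {
    running = false;                        // stands in for ctx_server.queue_tasks.terminate()
}

int main() {
    std::signal(SIGINT, signal_handler);    // handlers are registered before the loop starts
    std::printf("server is listening - starting the main loop\n");
    while (running) {
        // process queued tasks (stands in for ctx_server.queue_tasks.start_loop())
    }
    std::printf("cleaning up before exit...\n");  // clean_up() only runs after the loop unblocks
    return 0;
}
```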
package/src/llama.cpp/examples/server/utils.hpp

@@ -367,10 +367,10 @@ inline std::string format_chat(const common_chat_template & tmpl, const std::vec
                     }
                 }
             } else {
-                throw std::runtime_error("Invalid 'content' type (ref: https://github.com/
+                throw std::runtime_error("Invalid 'content' type (ref: https://github.com/ggml-org/llama.cpp/issues/8367)");
             }
         } else {
-            throw std::runtime_error("Missing 'content' (ref: https://github.com/
+            throw std::runtime_error("Missing 'content' (ref: https://github.com/ggml-org/llama.cpp/issues/8367)");
         }
 
         chat.push_back({role, content, /* tool_calls= */ {}});
@@ -578,6 +578,7 @@ static json oaicompat_completion_params_parse(const json & body) {
 static json oaicompat_completion_params_parse(
     const json & body, /* openai api json semantics */
     bool use_jinja,
+    common_reasoning_format reasoning_format,
     const common_chat_templates & chat_templates)
 {
     json llama_params;
@@ -633,9 +634,10 @@
             throw std::runtime_error("Cannot use custom grammar constraints with tools.");
         }
         common_chat_inputs inputs;
-        inputs.
-        inputs.
-        inputs.
+        inputs.extract_reasoning = reasoning_format != COMMON_REASONING_FORMAT_NONE;
+        inputs.messages = body.at("messages");
+        inputs.tools = tools;
+        inputs.tool_choice = tool_choice;
         inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", false);
         if (inputs.parallel_tool_calls && !tmpl.original_caps().supports_parallel_tool_calls) {
             LOG_DBG("Disabling parallel_tool_calls because the template does not support it\n");
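Note: the utils.hpp change above threads the new reasoning-format setting into the chat pipeline: any value other than COMMON_REASONING_FORMAT_NONE enables extraction of the model's reasoning into `reasoning_content`. A hedged sketch of that mapping; the DEEPSEEK enumerator is assumed here as a stand-in, check common/common.h for the real definition.

```cpp
// Illustrative only; mirrors "inputs.extract_reasoning = reasoning_format != COMMON_REASONING_FORMAT_NONE;"
enum common_reasoning_format {
    COMMON_REASONING_FORMAT_NONE,
    COMMON_REASONING_FORMAT_DEEPSEEK,   // assumed member; verify against common/common.h
};

static bool should_extract_reasoning(common_reasoning_format fmt) {
    return fmt != COMMON_REASONING_FORMAT_NONE;
}

int main() {
    return should_extract_reasoning(COMMON_REASONING_FORMAT_DEEPSEEK) ? 0 : 1;
}
```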
package/src/llama.cpp/ggml/include/ggml-cpu.h

@@ -8,7 +8,7 @@ extern "C" {
 #endif
 
     // the compute plan that needs to be prepared for ggml_graph_compute()
-    // since https://github.com/
+    // since https://github.com/ggml-org/ggml/issues/287
     struct ggml_cplan {
         size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()`
         uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
package/src/llama.cpp/ggml/include/ggml-metal.h

@@ -45,7 +45,7 @@ GGML_BACKEND_API bool ggml_backend_is_metal(ggml_backend_t backend);
 
 GGML_DEPRECATED(
     GGML_BACKEND_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size),
-    "obsoleted by the new device interface - https://github.com/
+    "obsoleted by the new device interface - https://github.com/ggml-org/llama.cpp/pull/9713");
 
 GGML_BACKEND_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data);
 
package/src/llama.cpp/ggml/include/ggml.h

@@ -198,7 +198,7 @@
 
 #ifndef __GNUC__
 #    define GGML_ATTRIBUTE_FORMAT(...)
-#elif defined(__MINGW32__)
+#elif defined(__MINGW32__) && !defined(__clang__)
 #    define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
 #else
 #    define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
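Note: the ggml.h change above keeps the `gnu_printf` archetype for MinGW GCC while letting clang on MinGW fall through to plain `printf`, which clang understands. A small self-contained sketch of the same guard and of how such an attribute is typically applied; `my_log` is a hypothetical helper, not ggml API.

```cpp
#include <cstdarg>
#include <cstdio>

// mirrors the guard above, just under a different macro name
#ifndef __GNUC__
#    define MY_ATTRIBUTE_FORMAT(...)
#elif defined(__MINGW32__) && !defined(__clang__)
#    define MY_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
#else
#    define MY_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
#endif

MY_ATTRIBUTE_FORMAT(1, 2)
static void my_log(const char * fmt, ...) {
    va_list args;
    va_start(args, fmt);
    vfprintf(stderr, fmt, args);
    va_end(args);
}

int main() {
    my_log("loaded %d tensors\n", 42);          // OK
    // my_log("loaded %d tensors\n", "oops");   // would now trigger -Wformat on clang/MinGW too
    return 0;
}
```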
package/src/llama.cpp/ggml/src/ggml-common.h

@@ -473,7 +473,6 @@ GGML_TABLE_BEGIN(uint8_t, ksigns_iq2xs, 128)
     240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255,
 GGML_TABLE_END()
 
-//#if __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A // lowest compute capability for integer intrinsics
 GGML_TABLE_BEGIN(uint64_t, ksigns64, 128)
     0x0000000000000000, 0xff000000000000ff, 0xff0000000000ff00, 0x000000000000ffff,
     0xff00000000ff0000, 0x0000000000ff00ff, 0x0000000000ffff00, 0xff00000000ffffff,
@@ -508,7 +507,6 @@ GGML_TABLE_BEGIN(uint64_t, ksigns64, 128)
     0x00ffffffff000000, 0xffffffffff0000ff, 0xffffffffff00ff00, 0x00ffffffff00ffff,
     0xffffffffffff0000, 0x00ffffffffff00ff, 0x00ffffffffffff00, 0xffffffffffffffff,
 GGML_TABLE_END()
-//#endif
 
 
 GGML_TABLE_BEGIN(uint64_t, iq2xxs_grid, 256)
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h

@@ -360,21 +360,15 @@ inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b)
 #endif
 
 #if defined(__loongarch_asx)
-
-typedef union {
-    int32_t i;
-    float f;
-} ft_union;
-
 /* float type data load instructions */
-static __m128 __lsx_vreplfr2vr_s(float val) {
-
-    return (__m128)
+static __m128 __lsx_vreplfr2vr_s(const float val) {
+    v4f32 res = {val, val, val, val};
+    return (__m128)res;
 }
 
-static __m256 __lasx_xvreplfr2vr_s(float val) {
-
-    return (__m256)
+static __m256 __lasx_xvreplfr2vr_s(const float val) {
+    v8f32 res = {val, val, val, val, val, val, val, val};
+    return (__m256)res;
 }
 
 #endif
 
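Note: the LoongArch helpers above now build the broadcast vector with a plain vector literal (v4f32 / v8f32) instead of round-tripping the float through the removed ft_union bit-cast. A portable illustration of the same splat-a-scalar idea in standard C++, without LoongArch intrinsics:

```cpp
#include <array>
#include <cstdio>

// stand-in for: v4f32 res = {val, val, val, val};
static std::array<float, 4> splat4(const float val) {
    return {val, val, val, val};
}

int main() {
    const auto v = splat4(1.5f);
    std::printf("%g %g %g %g\n", v[0], v[1], v[2], v[3]);   // 1.5 1.5 1.5 1.5
    return 0;
}
```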