@fugood/llama.node 0.3.12 → 0.3.14
This diff compares the contents of publicly released package versions as published to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +2 -1
- package/package.json +1 -1
- package/src/LlamaCompletionWorker.cpp +14 -0
- package/src/LlamaContext.cpp +110 -79
- package/src/LlamaContext.h +1 -1
- package/src/common.hpp +1 -2
- package/src/llama.cpp/.github/workflows/build.yml +95 -13
- package/src/llama.cpp/.github/workflows/docker.yml +2 -0
- package/src/llama.cpp/.github/workflows/labeler.yml +1 -1
- package/src/llama.cpp/.github/workflows/server.yml +2 -0
- package/src/llama.cpp/common/CMakeLists.txt +23 -6
- package/src/llama.cpp/common/arg.cpp +292 -14
- package/src/llama.cpp/common/chat.cpp +1128 -315
- package/src/llama.cpp/common/chat.h +135 -0
- package/src/llama.cpp/common/common.cpp +27 -171
- package/src/llama.cpp/common/common.h +41 -73
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
- package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
- package/src/llama.cpp/common/llguidance.cpp +3 -3
- package/src/llama.cpp/common/log.cpp +1 -0
- package/src/llama.cpp/common/log.h +2 -1
- package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +21 -7
- package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +61 -14
- package/src/llama.cpp/common/ngram-cache.cpp +1 -0
- package/src/llama.cpp/common/sampling.cpp +93 -49
- package/src/llama.cpp/common/speculative.cpp +6 -5
- package/src/llama.cpp/common/speculative.h +1 -1
- package/src/llama.cpp/docs/build.md +47 -9
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +3 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +1 -0
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +4 -4
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +6 -5
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +1 -1
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +373 -107
- package/src/llama.cpp/examples/llava/clip.h +19 -3
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
- package/src/llama.cpp/examples/llava/llava.cpp +4 -2
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -0
- package/src/llama.cpp/examples/main/main.cpp +73 -28
- package/src/llama.cpp/examples/parallel/parallel.cpp +1 -0
- package/src/llama.cpp/examples/passkey/passkey.cpp +1 -0
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +1 -0
- package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
- package/src/llama.cpp/examples/run/run.cpp +115 -79
- package/src/llama.cpp/examples/server/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/server/httplib.h +381 -292
- package/src/llama.cpp/examples/server/server.cpp +134 -128
- package/src/llama.cpp/examples/server/utils.hpp +95 -106
- package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
- package/src/llama.cpp/examples/tts/tts.cpp +251 -142
- package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
- package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-cpu.h +4 -1
- package/src/llama.cpp/ggml/include/ggml-metal.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +0 -2
- package/src/llama.cpp/ggml/include/ggml.h +6 -2
- package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
- package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
- package/src/llama.cpp/ggml/src/ggml-common.h +0 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +132 -17
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +156 -11
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +2235 -641
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1572 -198
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +24 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +9 -8
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +16 -3
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +235 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -2
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +246 -120
- package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +51 -10
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +136 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +308 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +174 -728
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -77
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +949 -602
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +37 -3
- package/src/llama.cpp/ggml/src/ggml.c +9 -4
- package/src/llama.cpp/include/llama.h +32 -14
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +1 -0
- package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
- package/src/llama.cpp/requirements.txt +1 -0
- package/src/llama.cpp/src/llama-arch.cpp +21 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-chat.cpp +1 -0
- package/src/llama.cpp/src/llama-grammar.cpp +183 -183
- package/src/llama.cpp/src/llama-grammar.h +13 -4
- package/src/llama.cpp/src/llama-impl.h +6 -6
- package/src/llama.cpp/src/llama-kv-cache.h +2 -1
- package/src/llama.cpp/src/llama-mmap.cpp +11 -1
- package/src/llama.cpp/src/llama-mmap.h +1 -0
- package/src/llama.cpp/src/llama-model.cpp +70 -6
- package/src/llama.cpp/src/llama-sampling.cpp +174 -67
- package/src/llama.cpp/src/llama-vocab.cpp +12 -0
- package/src/llama.cpp/src/llama.cpp +154 -5
- package/src/llama.cpp/src/unicode.cpp +9 -2
- package/src/llama.cpp/tests/test-backend-ops.cpp +171 -115
- package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
- package/src/llama.cpp/tests/test-chat.cpp +691 -325
- package/src/llama.cpp/tests/test-gguf.cpp +4 -4
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
- package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
- package/src/llama.cpp/tests/test-sampling.cpp +15 -0
- package/src/llama.cpp/Sources/llama/llama.h +0 -4
- package/src/llama.cpp/common/chat.hpp +0 -52

package/src/llama.cpp/examples/server/server.cpp

@@ -42,7 +42,7 @@ enum stop_type {
     STOP_TYPE_LIMIT,
 };
 
-// state diagram: https://github.com/
+// state diagram: https://github.com/ggml-org/llama.cpp/pull/9283
 enum slot_state {
     SLOT_STATE_IDLE,
     SLOT_STATE_STARTED, // TODO: this state is only used for setting up the initial prompt processing; maybe merge it with launch_slot_with_task in the future
@@ -131,9 +131,9 @@ struct slot_params {
             lora.push_back({{"id", i}, {"scale", this->lora[i].scale}});
         }
 
-
-        for (const auto & trigger : sampling.
-
+        auto grammar_triggers = json::array();
+        for (const auto & trigger : sampling.grammar_triggers) {
+            grammar_triggers.push_back(trigger.to_json<json>());
         }
 
         return json {
@@ -170,9 +170,10 @@ struct slot_params {
             {"n_probs", sampling.n_probs},
             {"min_keep", sampling.min_keep},
             {"grammar", sampling.grammar},
-            {"
-            {"
+            {"grammar_lazy", sampling.grammar_lazy},
+            {"grammar_triggers", grammar_triggers},
             {"preserved_tokens", sampling.preserved_tokens},
+            {"chat_format", common_chat_format_name(oaicompat_chat_format)},
             {"samplers", samplers},
             {"speculative.n_max", speculative.n_max},
             {"speculative.n_min", speculative.n_min},
@@ -273,7 +274,7 @@ struct server_task {
         params.speculative.p_min = json_value(data, "speculative.p_min", defaults.speculative.p_min);
 
         params.speculative.n_min = std::min(params.speculative.n_max, params.speculative.n_min);
-        params.speculative.n_min = std::max(params.speculative.n_min,
+        params.speculative.n_min = std::max(params.speculative.n_min, 0);
         params.speculative.n_max = std::max(params.speculative.n_max, 0);
 
         // Use OpenAI API logprobs only if n_probs wasn't provided
@@ -328,69 +329,75 @@ struct server_task {
         }
 
         // process "json_schema" and "grammar"
-        if (data.contains("json_schema") && !data.at("json_schema").is_null() && data.contains("grammar") && !data.at("grammar").is_null()) {
-            throw std::runtime_error("Either \"json_schema\" or \"grammar\" can be specified, but not both");
-        }
         if (data.contains("json_schema") && !data.contains("grammar")) {
             try {
                 auto schema = json_value(data, "json_schema", json::object());
-
+                SRV_DBG("JSON schema: %s\n", schema.dump(2).c_str());
                 params.sampling.grammar = json_schema_to_grammar(schema);
-
+                SRV_DBG("Converted grammar: %s\n", params.sampling.grammar.c_str());
             } catch (const std::exception & e) {
                 throw std::runtime_error(std::string("\"json_schema\": ") + e.what());
             }
         } else {
             params.sampling.grammar = json_value(data, "grammar", defaults.sampling.grammar);
-
+            SRV_DBG("Grammar: %s\n", params.sampling.grammar.c_str());
             params.sampling.grammar_lazy = json_value(data, "grammar_lazy", defaults.sampling.grammar_lazy);
-
+            SRV_DBG("Grammar lazy: %s\n", params.sampling.grammar_lazy ? "true" : "false");
         }
 
         {
             auto it = data.find("chat_format");
             if (it != data.end()) {
                 params.oaicompat_chat_format = static_cast<common_chat_format>(it->get<int>());
-
+                SRV_INF("Chat format: %s\n", common_chat_format_name(params.oaicompat_chat_format).c_str());
             } else {
                 params.oaicompat_chat_format = defaults.oaicompat_chat_format;
            }
        }
 
        {
-            const auto grammar_triggers = data.find("grammar_triggers");
-            if (grammar_triggers != data.end()) {
-                for (const auto & t : *grammar_triggers) {
-                    common_grammar_trigger trigger;
-                    trigger.word = t.at("word");
-                    trigger.at_start = t.at("at_start");
-
-                    auto ids = common_tokenize(vocab, trigger.word, /* add_special= */ false, /* parse_special= */ true);
-                    if (ids.size() == 1) {
-                        LOG_DBG("Grammar trigger token: %d (`%s`)\n", ids[0], trigger.word.c_str());
-                        params.sampling.grammar_trigger_tokens.push_back(ids[0]);
-                        params.sampling.preserved_tokens.insert(ids[0]);
-                        continue;
-                    }
-                    LOG_DBG("Grammar trigger word: `%s`\n", trigger.word.c_str());
-                    params.sampling.grammar_trigger_words.push_back(trigger);
-                }
-            }
            const auto preserved_tokens = data.find("preserved_tokens");
            if (preserved_tokens != data.end()) {
                for (const auto & t : *preserved_tokens) {
                    auto ids = common_tokenize(vocab, t.get<std::string>(), /* add_special= */ false, /* parse_special= */ true);
                    if (ids.size() == 1) {
-
+                        SRV_DBG("Preserved token: %d\n", ids[0]);
                        params.sampling.preserved_tokens.insert(ids[0]);
                    } else {
                        // This may happen when using a tool call style meant for a model with special tokens to preserve on a model without said tokens.
-
+                        SRV_DBG("Not preserved because more than 1 token: %s\n", t.get<std::string>().c_str());
                    }
                }
            }
-
-
+            const auto grammar_triggers = data.find("grammar_triggers");
+            if (grammar_triggers != data.end()) {
+                for (const auto & t : *grammar_triggers) {
+                    auto ct = common_grammar_trigger::from_json(t);
+                    if (ct.type == COMMON_GRAMMAR_TRIGGER_TYPE_WORD) {
+                        const auto & word = ct.value;
+                        auto ids = common_tokenize(vocab, word, /* add_special= */ false, /* parse_special= */ true);
+                        if (ids.size() == 1) {
+                            auto token = ids[0];
+                            if (std::find(params.sampling.preserved_tokens.begin(), params.sampling.preserved_tokens.end(), (llama_token) token) == params.sampling.preserved_tokens.end()) {
+                                throw std::runtime_error("Grammar trigger word should be marked as preserved token: " + word);
+                            }
+                            SRV_DBG("Grammar trigger token: %d (`%s`)\n", token, word.c_str());
+                            common_grammar_trigger trigger;
+                            trigger.type = COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN;
+                            trigger.value = word;
+                            trigger.token = token;
+                            params.sampling.grammar_triggers.push_back(std::move(trigger));
+                        } else {
+                            SRV_DBG("Grammar trigger word: `%s`\n", word.c_str());
+                            params.sampling.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, word});
+                        }
+                    } else {
+                        params.sampling.grammar_triggers.push_back(ct);
+                    }
+                }
+            }
+            if (params.sampling.grammar_lazy && params.sampling.grammar_triggers.empty()) {
+                throw std::runtime_error("Error: no triggers set for lazy grammar!");
            }
        }
 
@@ -717,16 +724,26 @@ struct server_task_result_cmpl_final : server_task_result {
        std::string finish_reason = "length";
        common_chat_msg msg;
        if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) {
-
+            SRV_DBG("Parsing chat message: %s\n", content.c_str());
            msg = common_chat_parse(content, oaicompat_chat_format);
            finish_reason = msg.tool_calls.empty() ? "stop" : "tool_calls";
        } else {
            msg.content = content;
        }
 
-        json
+        json message {
+            {"role", "assistant"},
+        };
+        if (!msg.reasoning_content.empty()) {
+            message["reasoning_content"] = msg.reasoning_content;
+        }
+        if (msg.content.empty() && !msg.tool_calls.empty()) {
+            message["content"] = json();
+        } else {
+            message["content"] = msg.content;
+        }
        if (!msg.tool_calls.empty()) {
-            tool_calls = json::array();
+            auto tool_calls = json::array();
            for (const auto & tc : msg.tool_calls) {
                tool_calls.push_back({
                    {"type", "function"},
@@ -734,18 +751,13 @@ struct server_task_result_cmpl_final : server_task_result {
                        {"name", tc.name},
                        {"arguments", tc.arguments},
                    }},
-
+                    // Some templates generate and require an id (sometimes in a very specific format, e.g. Mistral Nemo).
+                    // We only generate a random id for the ones that don't generate one by themselves
+                    // (they also won't get to see it as their template likely doesn't use it, so it's all for the client)
+                    {"id", tc.id.empty() ? gen_tool_call_id() : tc.id},
                });
            }
-
-
-        json message {
-            {"content", msg.content},
-            {"tool_calls", tool_calls},
-            {"role", "assistant"},
-        };
-        if (!msg.tool_plan.empty()) {
-            message["tool_plan"] = msg.tool_plan;
+            message["tool_calls"] = tool_calls;
        }
 
        json choice {
@@ -1304,7 +1316,7 @@ struct server_slot {
        return task_type == SERVER_TASK_TYPE_EMBEDDING || task_type == SERVER_TASK_TYPE_RERANK;
    }
 
-    bool can_batch_with(server_slot & other_slot) {
+    bool can_batch_with(server_slot & other_slot) const {
        return is_non_causal() == other_slot.is_non_causal()
            && are_lora_equal(lora, other_slot.lora);
    }
@@ -1600,6 +1612,10 @@ struct server_queue {
 
        while (true) {
            std::unique_lock<std::mutex> lock(mutex_tasks);
+            if (!running) {
+                QUE_DBG("%s", "terminate\n");
+                return;
+            }
            if (queue_tasks.empty()) {
                lock.unlock();
                break;
@@ -1620,11 +1636,11 @@ struct server_queue {
            QUE_DBG("%s", "waiting for new tasks\n");
            {
                std::unique_lock<std::mutex> lock(mutex_tasks);
+                if (!running) {
+                    QUE_DBG("%s", "terminate\n");
+                    return;
+                }
                if (queue_tasks.empty()) {
-                    if (!running) {
-                        QUE_DBG("%s", "terminate\n");
-                        return;
-                    }
                    condition_tasks.wait(lock, [&]{
                        return (!queue_tasks.empty() || !running);
                    });
@@ -1800,7 +1816,7 @@ struct server_context {
    // Necessary similarity of prompt for slot selection
    float slot_prompt_similarity = 0.0f;
 
-
+    common_chat_templates_ptr chat_templates;
 
    ~server_context() {
        // Clear any sampling context
@@ -1884,45 +1900,18 @@ struct server_context {
            llama_init_dft.context.reset();
        }
 
-
-
-        chat_templates
-        }
-
+        chat_templates = common_chat_templates_init(model, params_base.chat_template);
+        try {
+            common_chat_format_example(chat_templates.get(), params.use_jinja);
+        } catch (const std::exception & e) {
+            SRV_WRN("%s: Chat template parsing error: %s\n", __func__, e.what());
+            SRV_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__);
+            chat_templates = common_chat_templates_init(model, "chatml");
        }
-        GGML_ASSERT(chat_templates.template_default.get() != nullptr);
 
        return true;
    }
 
-    bool validate_builtin_chat_template(bool use_jinja) const {
-        llama_chat_message chat[] = {{"user", "test"}};
-
-        if (use_jinja) {
-            auto templates = common_chat_templates_from_model(model, "");
-            common_chat_inputs inputs;
-            inputs.messages = json::array({{
-                {"role", "user"},
-                {"content", "test"},
-            }});
-            GGML_ASSERT(templates.template_default);
-            try {
-                common_chat_params_init(*templates.template_default, inputs);
-                if (templates.template_tool_use) {
-                    common_chat_params_init(*templates.template_tool_use, inputs);
-                }
-                return true;
-            } catch (const std::exception & e) {
-                SRV_ERR("failed to apply template: %s\n", e.what());
-                return false;
-            }
-        } else {
-            const char * tmpl = llama_model_chat_template(model, /* name */ nullptr);
-            const int32_t chat_res = llama_chat_apply_template(tmpl, chat, 1, true, nullptr, 0);
-            return chat_res > 0;
-        }
-    }
-
    void init() {
        const int32_t n_ctx_slot = n_ctx / params_base.n_parallel;
 
@@ -2069,8 +2058,8 @@ struct server_context {
 
        if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) {
            // Might be better to reject the request with a 400 ?
+            SLT_WRN(slot, "n_predict = %d exceeds server configuration, setting to %d\n", slot.params.n_predict, slot.n_predict);
            slot.params.n_predict = slot.n_predict;
-            SLT_WRN(slot, "n_predict = %d exceeds server configuration, setting to %d", slot.n_predict, slot.n_predict);
        }
 
        if (slot.params.ignore_eos && has_eos_token) {
@@ -2172,14 +2161,6 @@ struct server_context {
        }
 
        if (slot.has_new_line) {
-            // if we have already seen a new line, we stop after a certain time limit
-            if (slot.params.t_max_predict_ms > 0 && (ggml_time_us() - slot.t_start_generation > 1000.0f*slot.params.t_max_predict_ms)) {
-                slot.stop = STOP_TYPE_LIMIT;
-                slot.has_next_token = false;
-
-                SLT_DBG(slot, "stopped by time limit, n_decoded = %d, t_max_predict_ms = %d ms\n", slot.n_decoded, (int) slot.params.t_max_predict_ms);
-            }
-
            // require that each new line has a whitespace prefix (i.e. indentation) of at least slot.params.n_indent
            if (slot.params.n_indent > 0) {
                // check the current indentation
@@ -2218,6 +2199,14 @@ struct server_context {
        // check if there is a new line in the generated text
        if (result.text_to_send.find('\n') != std::string::npos) {
            slot.has_new_line = true;
+
+            // if we have seen a new line, we stop after a certain time limit, but only upon another new line
+            if (slot.params.t_max_predict_ms > 0 && (ggml_time_us() - slot.t_start_generation > 1000.0f*slot.params.t_max_predict_ms)) {
+                slot.stop = STOP_TYPE_LIMIT;
+                slot.has_next_token = false;
+
+                SLT_DBG(slot, "stopped by time limit, n_decoded = %d, t_max_predict_ms = %d ms\n", slot.n_decoded, (int) slot.params.t_max_predict_ms);
+            }
        }
 
        // if context shift is disabled, we stop when it reaches the context limit
@@ -2275,7 +2264,7 @@ struct server_context {
            for (size_t i = 0; i < std::min(max_probs, n_probs); i++) {
                result.probs.push_back({
                    cur_p->data[i].id,
-
+                    common_token_to_piece(ctx, cur_p->data[i].id, special),
                    cur_p->data[i].p
                });
            }
@@ -2297,7 +2286,7 @@ struct server_context {
            for (size_t i = 0; i < std::min(n_vocab, n_probs); i++) {
                result.probs.push_back({
                    cur[i].id,
-
+                    common_token_to_piece(ctx, cur[i].id, special),
                    cur[i].p
                });
            }
@@ -3027,7 +3016,7 @@ struct server_context {
                    const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c;
 
                    llama_kv_cache_seq_rm (ctx, slot.id, head_p, head_c);
-                    llama_kv_cache_seq_add(ctx, slot.id, head_c,
+                    llama_kv_cache_seq_add(ctx, slot.id, head_c, head_c + n_match, kv_shift);
 
                    for (size_t i = 0; i < n_match; i++) {
                        slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i];
@@ -3355,10 +3344,10 @@ static void log_server_request(const httplib::Request & req, const httplib::Resp
 
    // reminder: this function is not covered by httplib's exception handler; if someone does more complicated stuff, think about wrapping it in try-catch
 
-
+    SRV_INF("request: %s %s %s %d\n", req.method.c_str(), req.path.c_str(), req.remote_addr.c_str(), res.status);
 
-
-
+    SRV_DBG("request: %s\n", req.body.c_str());
+    SRV_DBG("response: %s\n", res.body.c_str());
 }
 
 std::function<void(int)> shutdown_handler;
@@ -3649,7 +3638,7 @@ int main(int argc, char ** argv) {
            }, {
                    {"name", "n_busy_slots_per_decode"},
                    {"help", "Average number of busy slots per llama_decode() call"},
-                    {"value", (float) res_metrics->n_busy_slots_total / (float) res_metrics->n_decode_total}
+                    {"value", (float) res_metrics->n_busy_slots_total / std::max((float) res_metrics->n_decode_total, 1.f)}
            }}},
            {"gauge", {{
                    {"name", "prompt_tokens_seconds"},
@@ -3815,13 +3804,15 @@ int main(int argc, char ** argv) {
            { "default_generation_settings", ctx_server.default_generation_settings_for_props },
            { "total_slots", ctx_server.params_base.n_parallel },
            { "model_path", ctx_server.params_base.model },
-            { "chat_template", ctx_server.chat_templates.
-            { "bos_token", ctx_server.
-            { "eos_token", ctx_server.
+            { "chat_template", common_chat_templates_source(ctx_server.chat_templates.get()) },
+            { "bos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_bos(ctx_server.vocab), /* special= */ true)},
+            { "eos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_eos(ctx_server.vocab), /* special= */ true)},
            { "build_info", build_info },
        };
-        if (ctx_server.params_base.use_jinja
-
+        if (ctx_server.params_base.use_jinja) {
+            if (auto tool_use_src = common_chat_templates_source(ctx_server.chat_templates.get(), "tool_use")) {
+                data["chat_template_tool_use"] = tool_use_src;
+            }
        }
 
        res_ok(res, data);
@@ -3860,7 +3851,9 @@ int main(int argc, char ** argv) {
 
        try {
            const auto & prompt = data.at("prompt");
-
+            // TODO: this log can become very long, put it behind a flag or think about a more compact format
+            //SRV_DBG("Prompt: %s\n", prompt.is_string() ? prompt.get<std::string>().c_str() : prompt.dump(2).c_str());
+
            std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true);
            tasks.reserve(tokenized_prompts.size());
            for (size_t i = 0; i < tokenized_prompts.size(); i++) {
@@ -4054,7 +4047,7 @@ int main(int argc, char ** argv) {
        }
 
        auto body = json::parse(req.body);
-        json data = oaicompat_completion_params_parse(body, params.use_jinja, ctx_server.chat_templates);
+        json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates.get());
 
        return handle_completions_impl(
            SERVER_TASK_TYPE_COMPLETION,
@@ -4067,7 +4060,7 @@ int main(int argc, char ** argv) {
    // same with handle_chat_completions, but without inference part
    const auto handle_apply_template = [&ctx_server, &params, &res_ok](const httplib::Request & req, httplib::Response & res) {
        auto body = json::parse(req.body);
-        json data = oaicompat_completion_params_parse(body, params.use_jinja, ctx_server.chat_templates);
+        json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates.get());
        res_ok(res, {{ "prompt", std::move(data.at("prompt")) }});
    };
 
@@ -4254,6 +4247,11 @@ int main(int argc, char ** argv) {
        // return;
        //}
 
+        // if true, use TEI API format, otherwise use Jina API format
+        // Jina: https://jina.ai/reranker/
+        // TEI: https://huggingface.github.io/text-embeddings-inference/#/Text%20Embeddings%20Inference/rerank
+        bool is_tei_format = body.contains("texts");
+
        json query;
        if (body.count("query") == 1) {
            query = body.at("query");
@@ -4266,7 +4264,8 @@ int main(int argc, char ** argv) {
            return;
        }
 
-        std::vector<std::string> documents = json_value(body, "documents",
+        std::vector<std::string> documents = json_value(body, "documents",
+            json_value(body, "texts", std::vector<std::string>()));
        if (documents.empty()) {
            res_error(res, format_error_response("\"documents\" must be a non-empty string array", ERROR_TYPE_INVALID_REQUEST));
            return;
@@ -4311,7 +4310,12 @@ int main(int argc, char ** argv) {
        }
 
        // write JSON response
-        json root = format_response_rerank(
+        json root = format_response_rerank(
+            body,
+            responses,
+            is_tei_format,
+            documents);
+
        res_ok(res, root);
    };
 
@@ -4376,6 +4380,9 @@ int main(int argc, char ** argv) {
                res.set_content("Error: gzip is not supported by this browser", "text/plain");
            } else {
                res.set_header("Content-Encoding", "gzip");
+                // COEP and COOP headers, required by pyodide (python interpreter)
+                res.set_header("Cross-Origin-Embedder-Policy", "require-corp");
+                res.set_header("Cross-Origin-Opener-Policy", "same-origin");
                res.set_content(reinterpret_cast<const char*>(index_html_gz), index_html_gz_len, "text/html; charset=utf-8");
            }
            return false;
@@ -4425,6 +4432,7 @@ int main(int argc, char ** argv) {
 
    // clean up function, to be called before exit
    auto clean_up = [&svr]() {
+        SRV_INF("%s: cleaning up before exit...\n", __func__);
        svr->stop();
        llama_backend_free();
    };
@@ -4441,10 +4449,6 @@ int main(int argc, char ** argv) {
    }
 
    if (!was_bound) {
-        //LOG_ERROR("couldn't bind HTTP server socket", {
-        //    {"hostname", params.hostname},
-        //    {"port", params.port},
-        //});
        LOG_ERR("%s: couldn't bind HTTP server socket, hostname: %s, port: %d\n", __func__, params.hostname.c_str(), params.port);
        clean_up();
        return 1;
@@ -4461,7 +4465,7 @@ int main(int argc, char ** argv) {
 
    if (!ctx_server.load_model(params)) {
        clean_up();
-        t.join();
+        // t.join(); // FIXME: see below
        LOG_ERR("%s: exiting due to model loading error\n", __func__);
        return 1;
    }
@@ -4473,8 +4477,8 @@ int main(int argc, char ** argv) {
 
    // print sample chat example to make it clear which template is used
    LOG_INF("%s: chat template, chat_template: %s, example_format: '%s'\n", __func__,
-        ctx_server.chat_templates.
-        common_chat_format_example(
+        common_chat_templates_source(ctx_server.chat_templates.get()),
+        common_chat_format_example(ctx_server.chat_templates.get(), ctx_server.params_base.use_jinja).c_str());
 
    ctx_server.queue_tasks.on_new_task([&ctx_server](const server_task & task) {
        ctx_server.process_single_task(task);
@@ -4485,13 +4489,10 @@ int main(int argc, char ** argv) {
    });
 
    shutdown_handler = [&](int) {
+        // this will unblock start_loop()
        ctx_server.queue_tasks.terminate();
    };
 
-    LOG_INF("%s: server is listening on http://%s:%d - starting the main loop\n", __func__, params.hostname.c_str(), params.port);
-
-    ctx_server.queue_tasks.start_loop();
-
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
    struct sigaction sigint_action;
    sigint_action.sa_handler = signal_handler;
@@ -4506,8 +4507,13 @@ int main(int argc, char ** argv) {
    SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
 #endif
 
+    LOG_INF("%s: server is listening on http://%s:%d - starting the main loop\n", __func__, params.hostname.c_str(), params.port);
+
+    // this call blocks the main thread until queue_tasks.terminate() is called
+    ctx_server.queue_tasks.start_loop();
+
    clean_up();
-    t.join();
+    // t.join(); // FIXME: http thread may stuck if there is an on-going request. we don't need to care about this for now as the HTTP connection will already be closed at this point, but it's better to fix this
 
    return 0;
 }