@fugood/llama.node 0.3.13 → 0.3.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +1 -1
- package/package.json +1 -1
- package/src/LlamaContext.cpp +98 -76
- package/src/LlamaContext.h +1 -1
- package/src/common.hpp +1 -2
- package/src/llama.cpp/.github/workflows/build.yml +89 -10
- package/src/llama.cpp/.github/workflows/server.yml +2 -0
- package/src/llama.cpp/CMakeLists.txt +9 -1
- package/src/llama.cpp/cmake/common.cmake +2 -0
- package/src/llama.cpp/common/CMakeLists.txt +3 -3
- package/src/llama.cpp/common/arg.cpp +132 -13
- package/src/llama.cpp/common/chat.cpp +960 -266
- package/src/llama.cpp/common/chat.h +135 -0
- package/src/llama.cpp/common/common.cpp +33 -174
- package/src/llama.cpp/common/common.h +27 -67
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
- package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
- package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +37 -5
- package/src/llama.cpp/common/ngram-cache.cpp +1 -0
- package/src/llama.cpp/common/sampling.cpp +45 -7
- package/src/llama.cpp/common/speculative.cpp +10 -9
- package/src/llama.cpp/common/speculative.h +1 -1
- package/src/llama.cpp/docs/build.md +45 -7
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +2 -2
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +4 -2
- package/src/llama.cpp/examples/embedding/embedding.cpp +2 -1
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +2 -2
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +3 -4
- package/src/llama.cpp/examples/infill/infill.cpp +2 -2
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +2 -2
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +5 -5
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +373 -107
- package/src/llama.cpp/examples/llava/clip.h +19 -3
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
- package/src/llama.cpp/examples/llava/llava.cpp +4 -2
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +7 -6
- package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
- package/src/llama.cpp/examples/main/main.cpp +79 -34
- package/src/llama.cpp/examples/parallel/parallel.cpp +6 -5
- package/src/llama.cpp/examples/passkey/passkey.cpp +15 -14
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +6 -6
- package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -2
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +1 -1
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
- package/src/llama.cpp/examples/run/run.cpp +196 -108
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +2 -2
- package/src/llama.cpp/examples/server/server.cpp +113 -101
- package/src/llama.cpp/examples/server/utils.hpp +94 -105
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
- package/src/llama.cpp/examples/speculative/speculative.cpp +14 -14
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
- package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
- package/src/llama.cpp/examples/tts/tts.cpp +263 -151
- package/src/llama.cpp/ggml/CMakeLists.txt +14 -1
- package/src/llama.cpp/ggml/cmake/common.cmake +26 -0
- package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-cpu.h +3 -0
- package/src/llama.cpp/ggml/include/ggml.h +29 -1
- package/src/llama.cpp/ggml/src/CMakeLists.txt +15 -34
- package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +6 -2
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -7
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +139 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +151 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1546 -387
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1645 -113
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +22 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +15 -2
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +2 -1
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +3 -1
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +242 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -6
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +315 -138
- package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +5 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +117 -36
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +147 -16
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +40 -40
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +307 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +262 -746
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +0 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -78
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +114 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +4 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +305 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +498 -188
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -4
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +16 -3
- package/src/llama.cpp/ggml/src/ggml.c +93 -5
- package/src/llama.cpp/include/llama.h +105 -27
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +1 -0
- package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
- package/src/llama.cpp/requirements.txt +1 -0
- package/src/llama.cpp/src/CMakeLists.txt +5 -2
- package/src/llama.cpp/src/llama-adapter.cpp +19 -20
- package/src/llama.cpp/src/llama-adapter.h +11 -9
- package/src/llama.cpp/src/llama-arch.cpp +123 -16
- package/src/llama.cpp/src/llama-arch.h +19 -0
- package/src/llama.cpp/src/llama-batch.h +2 -2
- package/src/llama.cpp/src/llama-chat.cpp +1 -0
- package/src/llama.cpp/src/llama-context.cpp +2253 -1222
- package/src/llama.cpp/src/llama-context.h +214 -77
- package/src/llama.cpp/src/llama-cparams.h +1 -0
- package/src/llama.cpp/src/llama-grammar.cpp +182 -182
- package/src/llama.cpp/src/llama-grammar.h +12 -3
- package/src/llama.cpp/src/llama-graph.cpp +1662 -0
- package/src/llama.cpp/src/llama-graph.h +574 -0
- package/src/llama.cpp/src/llama-hparams.cpp +8 -0
- package/src/llama.cpp/src/llama-hparams.h +9 -0
- package/src/llama.cpp/src/llama-io.cpp +15 -0
- package/src/llama.cpp/src/llama-io.h +35 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +1006 -291
- package/src/llama.cpp/src/llama-kv-cache.h +178 -109
- package/src/llama.cpp/src/llama-memory.cpp +1 -0
- package/src/llama.cpp/src/llama-memory.h +21 -0
- package/src/llama.cpp/src/llama-mmap.cpp +11 -1
- package/src/llama.cpp/src/llama-model.cpp +8230 -122
- package/src/llama.cpp/src/llama-model.h +34 -1
- package/src/llama.cpp/src/llama-quant.cpp +10 -1
- package/src/llama.cpp/src/llama-sampling.cpp +43 -10
- package/src/llama.cpp/src/llama-vocab.cpp +12 -0
- package/src/llama.cpp/src/llama.cpp +51 -9837
- package/src/llama.cpp/tests/test-backend-ops.cpp +247 -112
- package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
- package/src/llama.cpp/tests/test-chat.cpp +593 -395
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
- package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
- package/src/llama.cpp/Sources/llama/llama.h +0 -4
- package/src/llama.cpp/common/chat.hpp +0 -55
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +0 -143
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +0 -9
- package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +0 -0

package/src/llama.cpp/examples/server/server.cpp (+113 -101):

```diff
@@ -131,9 +131,9 @@ struct slot_params {
             lora.push_back({{"id", i}, {"scale", this->lora[i].scale}});
         }
 
-
-        for (const auto & trigger : sampling.
-
+        auto grammar_triggers = json::array();
+        for (const auto & trigger : sampling.grammar_triggers) {
+            grammar_triggers.push_back(trigger.to_json<json>());
         }
 
         return json {
@@ -170,8 +170,8 @@ struct slot_params {
             {"n_probs", sampling.n_probs},
             {"min_keep", sampling.min_keep},
             {"grammar", sampling.grammar},
-            {"
-            {"
+            {"grammar_lazy", sampling.grammar_lazy},
+            {"grammar_triggers", grammar_triggers},
             {"preserved_tokens", sampling.preserved_tokens},
             {"chat_format", common_chat_format_name(oaicompat_chat_format)},
             {"samplers", samplers},
@@ -274,7 +274,7 @@ struct server_task {
         params.speculative.p_min = json_value(data, "speculative.p_min", defaults.speculative.p_min);
 
         params.speculative.n_min = std::min(params.speculative.n_max, params.speculative.n_min);
-        params.speculative.n_min = std::max(params.speculative.n_min,
+        params.speculative.n_min = std::max(params.speculative.n_min, 0);
         params.speculative.n_max = std::max(params.speculative.n_max, 0);
 
         // Use OpenAI API logprobs only if n_probs wasn't provided
@@ -329,9 +329,6 @@ struct server_task {
         }
 
         // process "json_schema" and "grammar"
-        if (data.contains("json_schema") && !data.at("json_schema").is_null() && data.contains("grammar") && !data.at("grammar").is_null()) {
-            throw std::runtime_error("Either \"json_schema\" or \"grammar\" can be specified, but not both");
-        }
         if (data.contains("json_schema") && !data.contains("grammar")) {
             try {
                 auto schema = json_value(data, "json_schema", json::object());
@@ -359,24 +356,6 @@ struct server_task {
         }
 
         {
-            const auto grammar_triggers = data.find("grammar_triggers");
-            if (grammar_triggers != data.end()) {
-                for (const auto & t : *grammar_triggers) {
-                    common_grammar_trigger trigger;
-                    trigger.word = t.at("word");
-                    trigger.at_start = t.at("at_start");
-
-                    auto ids = common_tokenize(vocab, trigger.word, /* add_special= */ false, /* parse_special= */ true);
-                    if (ids.size() == 1) {
-                        SRV_DBG("Grammar trigger token: %d (`%s`)\n", ids[0], trigger.word.c_str());
-                        params.sampling.grammar_trigger_tokens.push_back(ids[0]);
-                        params.sampling.preserved_tokens.insert(ids[0]);
-                        continue;
-                    }
-                    SRV_DBG("Grammar trigger word: `%s`\n", trigger.word.c_str());
-                    params.sampling.grammar_trigger_words.push_back(trigger);
-                }
-            }
             const auto preserved_tokens = data.find("preserved_tokens");
             if (preserved_tokens != data.end()) {
                 for (const auto & t : *preserved_tokens) {
@@ -386,12 +365,39 @@ struct server_task {
                         params.sampling.preserved_tokens.insert(ids[0]);
                     } else {
                         // This may happen when using a tool call style meant for a model with special tokens to preserve on a model without said tokens.
-
+                        SRV_DBG("Not preserved because more than 1 token: %s\n", t.get<std::string>().c_str());
                     }
                 }
             }
-
-
+            const auto grammar_triggers = data.find("grammar_triggers");
+            if (grammar_triggers != data.end()) {
+                for (const auto & t : *grammar_triggers) {
+                    auto ct = common_grammar_trigger::from_json(t);
+                    if (ct.type == COMMON_GRAMMAR_TRIGGER_TYPE_WORD) {
+                        const auto & word = ct.value;
+                        auto ids = common_tokenize(vocab, word, /* add_special= */ false, /* parse_special= */ true);
+                        if (ids.size() == 1) {
+                            auto token = ids[0];
+                            if (std::find(params.sampling.preserved_tokens.begin(), params.sampling.preserved_tokens.end(), (llama_token) token) == params.sampling.preserved_tokens.end()) {
+                                throw std::runtime_error("Grammar trigger word should be marked as preserved token: " + word);
+                            }
+                            SRV_DBG("Grammar trigger token: %d (`%s`)\n", token, word.c_str());
+                            common_grammar_trigger trigger;
+                            trigger.type = COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN;
+                            trigger.value = word;
+                            trigger.token = token;
+                            params.sampling.grammar_triggers.push_back(std::move(trigger));
+                        } else {
+                            SRV_DBG("Grammar trigger word: `%s`\n", word.c_str());
+                            params.sampling.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, word});
+                        }
+                    } else {
+                        params.sampling.grammar_triggers.push_back(ct);
+                    }
+                }
+            }
+            if (params.sampling.grammar_lazy && params.sampling.grammar_triggers.empty()) {
+                throw std::runtime_error("Error: no triggers set for lazy grammar!");
             }
         }
 
@@ -745,7 +751,10 @@ struct server_task_result_cmpl_final : server_task_result {
                         {"name", tc.name},
                         {"arguments", tc.arguments},
                     }},
-
+                    // Some templates generate and require an id (sometimes in a very specific format, e.g. Mistral Nemo).
+                    // We only generate a random id for the ones that don't generate one by themselves
+                    // (they also won't get to see it as their template likely doesn't use it, so it's all for the client)
+                    {"id", tc.id.empty() ? gen_tool_call_id() : tc.id},
                 });
             }
             message["tool_calls"] = tool_calls;
@@ -1307,7 +1316,7 @@ struct server_slot {
         return task_type == SERVER_TASK_TYPE_EMBEDDING || task_type == SERVER_TASK_TYPE_RERANK;
     }
 
-    bool can_batch_with(server_slot & other_slot) {
+    bool can_batch_with(server_slot & other_slot) const {
         return is_non_causal() == other_slot.is_non_causal()
             && are_lora_equal(lora, other_slot.lora);
     }
@@ -1807,7 +1816,7 @@ struct server_context {
     // Necessary similarity of prompt for slot selection
    float slot_prompt_similarity = 0.0f;
 
-
+    common_chat_templates_ptr chat_templates;
 
     ~server_context() {
        // Clear any sampling context
@@ -1863,6 +1872,10 @@ struct server_context {
             params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
             params_dft.n_parallel = 1;
 
+            // force F16 KV cache for the draft model for extra performance
+            params_dft.cache_type_k = GGML_TYPE_F16;
+            params_dft.cache_type_v = GGML_TYPE_F16;
+
             llama_init_dft = common_init_from_params(params_dft);
 
             model_dft = llama_init_dft.model.get();
@@ -1883,53 +1896,22 @@ struct server_context {
             cparams_dft = common_context_params_to_llama(params_dft);
             cparams_dft.n_batch = n_ctx_dft;
 
-            // force F16 KV cache for the draft model for extra performance
-            cparams_dft.type_k = GGML_TYPE_F16;
-            cparams_dft.type_v = GGML_TYPE_F16;
-
             // the context is not needed - we will create one for each slot
             llama_init_dft.context.reset();
         }
 
-
+        chat_templates = common_chat_templates_init(model, params_base.chat_template);
+        try {
+            common_chat_format_example(chat_templates.get(), params.use_jinja);
+        } catch (const std::exception & e) {
+            SRV_WRN("%s: Chat template parsing error: %s\n", __func__, e.what());
             SRV_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__);
-            chat_templates =
-        } else {
-            chat_templates = common_chat_templates_from_model(model, params_base.chat_template);
+            chat_templates = common_chat_templates_init(model, "chatml");
         }
-        GGML_ASSERT(chat_templates.template_default.get() != nullptr);
 
         return true;
     }
 
-    bool validate_builtin_chat_template(bool use_jinja) const {
-        llama_chat_message chat[] = {{"user", "test"}};
-
-        if (use_jinja) {
-            auto templates = common_chat_templates_from_model(model, "");
-            common_chat_inputs inputs;
-            inputs.messages = json::array({{
-                {"role", "user"},
-                {"content", "test"},
-            }});
-            GGML_ASSERT(templates.template_default);
-            try {
-                common_chat_params_init(*templates.template_default, inputs);
-                if (templates.template_tool_use) {
-                    common_chat_params_init(*templates.template_tool_use, inputs);
-                }
-                return true;
-            } catch (const std::exception & e) {
-                SRV_ERR("failed to apply template: %s\n", e.what());
-                return false;
-            }
-        } else {
-            const char * tmpl = llama_model_chat_template(model, /* name */ nullptr);
-            const int32_t chat_res = llama_chat_apply_template(tmpl, chat, 1, true, nullptr, 0);
-            return chat_res > 0;
-        }
-    }
-
     void init() {
         const int32_t n_ctx_slot = n_ctx / params_base.n_parallel;
 
@@ -2058,6 +2040,18 @@ struct server_context {
         return ret;
     }
 
+    bool can_be_detokenized(const struct llama_context * ctx, const std::vector<llama_token> & tokens) {
+        const llama_model * model = llama_get_model(ctx);
+        const llama_vocab * vocab = llama_model_get_vocab(model);
+        const int32_t n_vocab = llama_vocab_n_tokens(vocab);
+        for (const auto & token : tokens) {
+            if (token < 0 || token >= n_vocab) {
+                return false;
+            }
+        }
+        return true;
+    }
+
     bool launch_slot_with_task(server_slot & slot, const server_task & task) {
         slot.reset();
         slot.id_task = task.id;
@@ -2072,11 +2066,16 @@ struct server_context {
             slot.lora = task.params.lora;
         }
 
+        bool can_detokenize = can_be_detokenized(ctx, slot.prompt_tokens);
+        if (!can_detokenize) {
+            send_error(task, "Prompt contains invalid tokens", ERROR_TYPE_INVALID_REQUEST);
+            return false;
+        }
         SLT_DBG(slot, "launching slot : %s\n", safe_json_to_str(slot.to_json()).c_str());
 
         if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) {
             // Might be better to reject the request with a 400 ?
-            SLT_WRN(slot, "n_predict = %d exceeds server configuration, setting to %d", slot.params.n_predict, slot.n_predict);
+            SLT_WRN(slot, "n_predict = %d exceeds server configuration, setting to %d\n", slot.params.n_predict, slot.n_predict);
             slot.params.n_predict = slot.n_predict;
         }
 
@@ -2114,7 +2113,7 @@ struct server_context {
         SRV_DBG("%s", "clearing KV cache\n");
 
         // clear the entire KV cache
-
+        llama_kv_self_clear(ctx);
         clean_kv_cache = false;
     }
 
@@ -2179,14 +2178,6 @@ struct server_context {
         }
 
         if (slot.has_new_line) {
-            // if we have already seen a new line, we stop after a certain time limit
-            if (slot.params.t_max_predict_ms > 0 && (ggml_time_us() - slot.t_start_generation > 1000.0f*slot.params.t_max_predict_ms)) {
-                slot.stop = STOP_TYPE_LIMIT;
-                slot.has_next_token = false;
-
-                SLT_DBG(slot, "stopped by time limit, n_decoded = %d, t_max_predict_ms = %d ms\n", slot.n_decoded, (int) slot.params.t_max_predict_ms);
-            }
-
             // require that each new line has a whitespace prefix (i.e. indentation) of at least slot.params.n_indent
             if (slot.params.n_indent > 0) {
                 // check the current indentation
@@ -2225,6 +2216,14 @@ struct server_context {
         // check if there is a new line in the generated text
         if (result.text_to_send.find('\n') != std::string::npos) {
             slot.has_new_line = true;
+
+            // if we have seen a new line, we stop after a certain time limit, but only upon another new line
+            if (slot.params.t_max_predict_ms > 0 && (ggml_time_us() - slot.t_start_generation > 1000.0f*slot.params.t_max_predict_ms)) {
+                slot.stop = STOP_TYPE_LIMIT;
+                slot.has_next_token = false;
+
+                SLT_DBG(slot, "stopped by time limit, n_decoded = %d, t_max_predict_ms = %d ms\n", slot.n_decoded, (int) slot.params.t_max_predict_ms);
+            }
         }
 
         // if context shift is disabled, we stop when it reaches the context limit
@@ -2656,8 +2655,8 @@ struct server_context {
                     res->n_tasks_deferred = queue_tasks.queue_tasks_deferred.size();
                     res->t_start = metrics.t_start;
 
-                    res->kv_cache_tokens_count =
-                    res->kv_cache_used_cells =
+                    res->kv_cache_tokens_count = llama_kv_self_n_tokens(ctx);
+                    res->kv_cache_used_cells = llama_kv_self_used_cells(ctx);
 
                     res->n_prompt_tokens_processed_total = metrics.n_prompt_tokens_processed_total;
                     res->t_prompt_processing_total = metrics.t_prompt_processing_total;
@@ -2773,7 +2772,7 @@ struct server_context {
 
                     // Erase token cache
                     const size_t n_erased = slot->cache_tokens.size();
-
+                    llama_kv_self_seq_rm(ctx, slot->id, -1, -1);
                     slot->cache_tokens.clear();
 
                     auto res = std::make_unique<server_task_result_slot_erase>();
@@ -2841,8 +2840,8 @@ struct server_context {
 
             SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard);
 
-
-
+            llama_kv_self_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard);
+            llama_kv_self_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard);
 
            if (slot.params.cache_prompt) {
                 for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) {
@@ -3033,8 +3032,8 @@ struct server_context {
 
                         const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c;
 
-
-
+                        llama_kv_self_seq_rm (ctx, slot.id, head_p, head_c);
+                        llama_kv_self_seq_add(ctx, slot.id, head_c, head_c + n_match, kv_shift);
 
                         for (size_t i = 0; i < n_match; i++) {
                             slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i];
@@ -3072,9 +3071,9 @@ struct server_context {
                 }
 
                 // keep only the common part
-                if (!
+                if (!llama_kv_self_seq_rm(ctx, slot.id, slot.n_past, -1)) {
                     // could not partially delete (likely using a non-Transformer model)
-
+                    llama_kv_self_seq_rm(ctx, slot.id, -1, -1);
 
                     // there is no common part left
                     slot.n_past = 0;
@@ -3314,7 +3313,7 @@ struct server_context {
                 slot.cache_tokens.push_back(id);
                 slot.cache_tokens.insert(slot.cache_tokens.end(), ids.begin(), ids.end() - 1);
 
-
+                llama_kv_self_seq_rm(ctx, slot.id, slot.n_past, -1);
 
                 for (size_t i = 0; i < ids.size(); ++i) {
                     completion_token_output result;
@@ -3822,13 +3821,15 @@ int main(int argc, char ** argv) {
             { "default_generation_settings", ctx_server.default_generation_settings_for_props },
             { "total_slots", ctx_server.params_base.n_parallel },
             { "model_path", ctx_server.params_base.model },
-            { "chat_template", ctx_server.chat_templates.
-            { "bos_token", ctx_server.
-            { "eos_token", ctx_server.
+            { "chat_template", common_chat_templates_source(ctx_server.chat_templates.get()) },
+            { "bos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_bos(ctx_server.vocab), /* special= */ true)},
+            { "eos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_eos(ctx_server.vocab), /* special= */ true)},
             { "build_info", build_info },
         };
-        if (ctx_server.params_base.use_jinja
-
+        if (ctx_server.params_base.use_jinja) {
+            if (auto tool_use_src = common_chat_templates_source(ctx_server.chat_templates.get(), "tool_use")) {
+                data["chat_template_tool_use"] = tool_use_src;
+            }
         }
 
         res_ok(res, data);
@@ -4063,7 +4064,7 @@ int main(int argc, char ** argv) {
         }
 
         auto body = json::parse(req.body);
-        json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates);
+        json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates.get());
 
         return handle_completions_impl(
             SERVER_TASK_TYPE_COMPLETION,
@@ -4076,7 +4077,7 @@ int main(int argc, char ** argv) {
     // same with handle_chat_completions, but without inference part
     const auto handle_apply_template = [&ctx_server, &params, &res_ok](const httplib::Request & req, httplib::Response & res) {
         auto body = json::parse(req.body);
-        json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates);
+        json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates.get());
         res_ok(res, {{ "prompt", std::move(data.at("prompt")) }});
     };
 
@@ -4263,6 +4264,11 @@ int main(int argc, char ** argv) {
         //    return;
         //}
 
+        // if true, use TEI API format, otherwise use Jina API format
+        // Jina: https://jina.ai/reranker/
+        // TEI: https://huggingface.github.io/text-embeddings-inference/#/Text%20Embeddings%20Inference/rerank
+        bool is_tei_format = body.contains("texts");
+
         json query;
         if (body.count("query") == 1) {
             query = body.at("query");
@@ -4275,7 +4281,8 @@ int main(int argc, char ** argv) {
             return;
         }
 
-        std::vector<std::string> documents = json_value(body, "documents",
+        std::vector<std::string> documents = json_value(body, "documents",
+            json_value(body, "texts", std::vector<std::string>()));
         if (documents.empty()) {
             res_error(res, format_error_response("\"documents\" must be a non-empty string array", ERROR_TYPE_INVALID_REQUEST));
             return;
@@ -4320,7 +4327,12 @@ int main(int argc, char ** argv) {
         }
 
         // write JSON response
-        json root = format_response_rerank(
+        json root = format_response_rerank(
+            body,
+            responses,
+            is_tei_format,
+            documents);
+
         res_ok(res, root);
     };
 
@@ -4482,8 +4494,8 @@ int main(int argc, char ** argv) {
 
     // print sample chat example to make it clear which template is used
     LOG_INF("%s: chat template, chat_template: %s, example_format: '%s'\n", __func__,
-        ctx_server.chat_templates.
-        common_chat_format_example(
+        common_chat_templates_source(ctx_server.chat_templates.get()),
+        common_chat_format_example(ctx_server.chat_templates.get(), ctx_server.params_base.use_jinja).c_str());
 
     ctx_server.queue_tasks.on_new_task([&ctx_server](const server_task & task) {
         ctx_server.process_single_task(task);
```