@fugood/llama.node 0.3.13 → 0.3.14
This diff shows the changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +1 -1
- package/package.json +1 -1
- package/src/LlamaContext.cpp +98 -76
- package/src/LlamaContext.h +1 -1
- package/src/common.hpp +1 -2
- package/src/llama.cpp/.github/workflows/build.yml +60 -10
- package/src/llama.cpp/.github/workflows/server.yml +2 -0
- package/src/llama.cpp/common/CMakeLists.txt +3 -3
- package/src/llama.cpp/common/arg.cpp +112 -11
- package/src/llama.cpp/common/chat.cpp +960 -266
- package/src/llama.cpp/common/chat.h +135 -0
- package/src/llama.cpp/common/common.cpp +27 -171
- package/src/llama.cpp/common/common.h +27 -67
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
- package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
- package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +37 -5
- package/src/llama.cpp/common/ngram-cache.cpp +1 -0
- package/src/llama.cpp/common/sampling.cpp +45 -7
- package/src/llama.cpp/common/speculative.cpp +6 -5
- package/src/llama.cpp/common/speculative.h +1 -1
- package/src/llama.cpp/docs/build.md +45 -7
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +3 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +1 -0
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +2 -3
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +1 -1
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +373 -107
- package/src/llama.cpp/examples/llava/clip.h +19 -3
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
- package/src/llama.cpp/examples/llava/llava.cpp +4 -2
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -0
- package/src/llama.cpp/examples/main/main.cpp +73 -28
- package/src/llama.cpp/examples/parallel/parallel.cpp +1 -0
- package/src/llama.cpp/examples/passkey/passkey.cpp +1 -0
- package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
- package/src/llama.cpp/examples/run/run.cpp +110 -67
- package/src/llama.cpp/examples/server/server.cpp +82 -87
- package/src/llama.cpp/examples/server/utils.hpp +94 -107
- package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
- package/src/llama.cpp/examples/tts/tts.cpp +251 -142
- package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
- package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-cpu.h +3 -0
- package/src/llama.cpp/ggml/include/ggml.h +5 -1
- package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
- package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +132 -17
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +151 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1396 -386
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1432 -151
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +22 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +15 -2
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +235 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -2
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +220 -116
- package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +51 -10
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +136 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +308 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +168 -721
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -77
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +146 -42
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +13 -3
- package/src/llama.cpp/ggml/src/ggml.c +8 -3
- package/src/llama.cpp/include/llama.h +19 -5
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +1 -0
- package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
- package/src/llama.cpp/requirements.txt +1 -0
- package/src/llama.cpp/src/llama-arch.cpp +21 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-chat.cpp +1 -0
- package/src/llama.cpp/src/llama-grammar.cpp +182 -182
- package/src/llama.cpp/src/llama-grammar.h +12 -3
- package/src/llama.cpp/src/llama-kv-cache.h +1 -0
- package/src/llama.cpp/src/llama-mmap.cpp +11 -1
- package/src/llama.cpp/src/llama-model.cpp +69 -5
- package/src/llama.cpp/src/llama-sampling.cpp +43 -10
- package/src/llama.cpp/src/llama-vocab.cpp +12 -0
- package/src/llama.cpp/src/llama.cpp +147 -0
- package/src/llama.cpp/tests/test-backend-ops.cpp +166 -110
- package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
- package/src/llama.cpp/tests/test-chat.cpp +593 -395
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
- package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
- package/src/llama.cpp/Sources/llama/llama.h +0 -4
- package/src/llama.cpp/common/chat.hpp +0 -55
- package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +0 -0
package/src/llama.cpp/examples/server/server.cpp

@@ -131,9 +131,9 @@ struct slot_params {
             lora.push_back({{"id", i}, {"scale", this->lora[i].scale}});
         }

-
-        for (const auto & trigger : sampling.
-
+        auto grammar_triggers = json::array();
+        for (const auto & trigger : sampling.grammar_triggers) {
+            grammar_triggers.push_back(trigger.to_json<json>());
         }

         return json {

@@ -170,8 +170,8 @@ struct slot_params {
             {"n_probs", sampling.n_probs},
             {"min_keep", sampling.min_keep},
             {"grammar", sampling.grammar},
-            {"
-            {"
+            {"grammar_lazy", sampling.grammar_lazy},
+            {"grammar_triggers", grammar_triggers},
             {"preserved_tokens", sampling.preserved_tokens},
             {"chat_format", common_chat_format_name(oaicompat_chat_format)},
             {"samplers", samplers},

@@ -274,7 +274,7 @@ struct server_task {
         params.speculative.p_min = json_value(data, "speculative.p_min", defaults.speculative.p_min);

         params.speculative.n_min = std::min(params.speculative.n_max, params.speculative.n_min);
-        params.speculative.n_min = std::max(params.speculative.n_min,
+        params.speculative.n_min = std::max(params.speculative.n_min, 0);
         params.speculative.n_max = std::max(params.speculative.n_max, 0);

         // Use OpenAI API logprobs only if n_probs wasn't provided

@@ -329,9 +329,6 @@ struct server_task {
         }

         // process "json_schema" and "grammar"
-        if (data.contains("json_schema") && !data.at("json_schema").is_null() && data.contains("grammar") && !data.at("grammar").is_null()) {
-            throw std::runtime_error("Either \"json_schema\" or \"grammar\" can be specified, but not both");
-        }
         if (data.contains("json_schema") && !data.contains("grammar")) {
             try {
                 auto schema = json_value(data, "json_schema", json::object());

@@ -359,24 +356,6 @@ struct server_task {
         }

         {
-            const auto grammar_triggers = data.find("grammar_triggers");
-            if (grammar_triggers != data.end()) {
-                for (const auto & t : *grammar_triggers) {
-                    common_grammar_trigger trigger;
-                    trigger.word = t.at("word");
-                    trigger.at_start = t.at("at_start");
-
-                    auto ids = common_tokenize(vocab, trigger.word, /* add_special= */ false, /* parse_special= */ true);
-                    if (ids.size() == 1) {
-                        SRV_DBG("Grammar trigger token: %d (`%s`)\n", ids[0], trigger.word.c_str());
-                        params.sampling.grammar_trigger_tokens.push_back(ids[0]);
-                        params.sampling.preserved_tokens.insert(ids[0]);
-                        continue;
-                    }
-                    SRV_DBG("Grammar trigger word: `%s`\n", trigger.word.c_str());
-                    params.sampling.grammar_trigger_words.push_back(trigger);
-                }
-            }
             const auto preserved_tokens = data.find("preserved_tokens");
             if (preserved_tokens != data.end()) {
                 for (const auto & t : *preserved_tokens) {

@@ -386,12 +365,39 @@ struct server_task {
                         params.sampling.preserved_tokens.insert(ids[0]);
                     } else {
                         // This may happen when using a tool call style meant for a model with special tokens to preserve on a model without said tokens.
-
+                        SRV_DBG("Not preserved because more than 1 token: %s\n", t.get<std::string>().c_str());
+                    }
+                }
+            }
+            const auto grammar_triggers = data.find("grammar_triggers");
+            if (grammar_triggers != data.end()) {
+                for (const auto & t : *grammar_triggers) {
+                    auto ct = common_grammar_trigger::from_json(t);
+                    if (ct.type == COMMON_GRAMMAR_TRIGGER_TYPE_WORD) {
+                        const auto & word = ct.value;
+                        auto ids = common_tokenize(vocab, word, /* add_special= */ false, /* parse_special= */ true);
+                        if (ids.size() == 1) {
+                            auto token = ids[0];
+                            if (std::find(params.sampling.preserved_tokens.begin(), params.sampling.preserved_tokens.end(), (llama_token) token) == params.sampling.preserved_tokens.end()) {
+                                throw std::runtime_error("Grammar trigger word should be marked as preserved token: " + word);
+                            }
+                            SRV_DBG("Grammar trigger token: %d (`%s`)\n", token, word.c_str());
+                            common_grammar_trigger trigger;
+                            trigger.type = COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN;
+                            trigger.value = word;
+                            trigger.token = token;
+                            params.sampling.grammar_triggers.push_back(std::move(trigger));
+                        } else {
+                            SRV_DBG("Grammar trigger word: `%s`\n", word.c_str());
+                            params.sampling.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, word});
+                        }
+                    } else {
+                        params.sampling.grammar_triggers.push_back(ct);
                     }
                 }
             }
-            if (params.sampling.grammar_lazy) {
-
+            if (params.sampling.grammar_lazy && params.sampling.grammar_triggers.empty()) {
+                throw std::runtime_error("Error: no triggers set for lazy grammar!");
             }
         }
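The hunk above replaces the old word/token trigger fields with a single `grammar_triggers` list that the server decodes via `common_grammar_trigger::from_json`, and it now rejects trigger words that are not also listed in `preserved_tokens`. Below is a minimal sketch of what a raw `/completion` payload using a lazy grammar might look like, built with nlohmann::json; the field names inside each trigger (`type`, `value`) and the numeric value of the word-trigger type are assumptions, since `common_grammar_trigger::to_json`/`from_json` live in `common/common.h` and are not part of this diff.

// Hypothetical client payload sketch (not from this diff). Built with nlohmann::json,
// which the server already vendors as json.hpp. Trigger field names are assumptions.
#include <nlohmann/json.hpp>
#include <iostream>

int main() {
    nlohmann::json body = {
        {"prompt",       "List the tools you want to call."},
        {"grammar",      "root ::= \"<tool_call>\" [a-z]*"},   // placeholder GBNF grammar
        {"grammar_lazy", true},
        {"grammar_triggers", nlohmann::json::array({
            // assumed shape of common_grammar_trigger::to_json: {"type": <enum>, "value": <string>}
            { {"type", 1 /* assumed: word trigger */}, {"value", "<tool_call>"} },
        })},
        // the trigger word must also be preserved, otherwise the new code above throws
        {"preserved_tokens", nlohmann::json::array({"<tool_call>"})},
    };
    std::cout << body.dump(2) << std::endl;   // body for POST /completion
    return 0;
}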
@@ -745,7 +751,10 @@ struct server_task_result_cmpl_final : server_task_result {
                         {"name", tc.name},
                         {"arguments", tc.arguments},
                     }},
-
+                    // Some templates generate and require an id (sometimes in a very specific format, e.g. Mistral Nemo).
+                    // We only generate a random id for the ones that don't generate one by themselves
+                    // (they also won't get to see it as their template likely doesn't use it, so it's all for the client)
+                    {"id", tc.id.empty() ? gen_tool_call_id() : tc.id},
                 });
             }
             message["tool_calls"] = tool_calls;

@@ -1307,7 +1316,7 @@ struct server_slot {
         return task_type == SERVER_TASK_TYPE_EMBEDDING || task_type == SERVER_TASK_TYPE_RERANK;
     }

-    bool can_batch_with(server_slot & other_slot) {
+    bool can_batch_with(server_slot & other_slot) const {
         return is_non_causal() == other_slot.is_non_causal()
             && are_lora_equal(lora, other_slot.lora);
     }

@@ -1807,7 +1816,7 @@ struct server_context {
     // Necessary similarity of prompt for slot selection
     float slot_prompt_similarity = 0.0f;

-
+    common_chat_templates_ptr chat_templates;

     ~server_context() {
         // Clear any sampling context

@@ -1891,45 +1900,18 @@ struct server_context {
             llama_init_dft.context.reset();
         }

-
+        chat_templates = common_chat_templates_init(model, params_base.chat_template);
+        try {
+            common_chat_format_example(chat_templates.get(), params.use_jinja);
+        } catch (const std::exception & e) {
+            SRV_WRN("%s: Chat template parsing error: %s\n", __func__, e.what());
             SRV_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__);
-            chat_templates =
-        } else {
-            chat_templates = common_chat_templates_from_model(model, params_base.chat_template);
+            chat_templates = common_chat_templates_init(model, "chatml");
         }
-        GGML_ASSERT(chat_templates.template_default.get() != nullptr);

         return true;
     }

-    bool validate_builtin_chat_template(bool use_jinja) const {
-        llama_chat_message chat[] = {{"user", "test"}};
-
-        if (use_jinja) {
-            auto templates = common_chat_templates_from_model(model, "");
-            common_chat_inputs inputs;
-            inputs.messages = json::array({{
-                {"role", "user"},
-                {"content", "test"},
-            }});
-            GGML_ASSERT(templates.template_default);
-            try {
-                common_chat_params_init(*templates.template_default, inputs);
-                if (templates.template_tool_use) {
-                    common_chat_params_init(*templates.template_tool_use, inputs);
-                }
-                return true;
-            } catch (const std::exception & e) {
-                SRV_ERR("failed to apply template: %s\n", e.what());
-                return false;
-            }
-        } else {
-            const char * tmpl = llama_model_chat_template(model, /* name */ nullptr);
-            const int32_t chat_res = llama_chat_apply_template(tmpl, chat, 1, true, nullptr, 0);
-            return chat_res > 0;
-        }
-    }
-
     void init() {
         const int32_t n_ctx_slot = n_ctx / params_base.n_parallel;
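For readers following the API change above: the server now keeps its templates in a `common_chat_templates_ptr` and validates them up front with `common_chat_format_example`, falling back to `chatml` when parsing fails. A standalone sketch of that pattern, assuming `common/chat.h` from this source tree (which declares these calls and the llama types) and an already loaded `llama_model *`:

// Sketch only; mirrors the fallback logic in the hunk above outside the server class.
#include "chat.h"        // common_chat_templates_init / common_chat_format_example (this tree)
#include <cstdio>
#include <exception>
#include <string>

static common_chat_templates_ptr load_templates(const llama_model * model,
                                                const std::string & tmpl_override,
                                                bool use_jinja) {
    common_chat_templates_ptr tmpls = common_chat_templates_init(model, tmpl_override);
    try {
        // render a dummy conversation once to surface template errors early
        common_chat_format_example(tmpls.get(), use_jinja);
    } catch (const std::exception & e) {
        fprintf(stderr, "chat template not supported (%s), falling back to chatml\n", e.what());
        tmpls = common_chat_templates_init(model, "chatml");
    }
    return tmpls;
}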
@@ -2076,7 +2058,7 @@ struct server_context {

         if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) {
             // Might be better to reject the request with a 400 ?
-            SLT_WRN(slot, "n_predict = %d exceeds server configuration, setting to %d", slot.params.n_predict, slot.n_predict);
+            SLT_WRN(slot, "n_predict = %d exceeds server configuration, setting to %d\n", slot.params.n_predict, slot.n_predict);
             slot.params.n_predict = slot.n_predict;
         }

@@ -2179,14 +2161,6 @@ struct server_context {
         }

         if (slot.has_new_line) {
-            // if we have already seen a new line, we stop after a certain time limit
-            if (slot.params.t_max_predict_ms > 0 && (ggml_time_us() - slot.t_start_generation > 1000.0f*slot.params.t_max_predict_ms)) {
-                slot.stop = STOP_TYPE_LIMIT;
-                slot.has_next_token = false;
-
-                SLT_DBG(slot, "stopped by time limit, n_decoded = %d, t_max_predict_ms = %d ms\n", slot.n_decoded, (int) slot.params.t_max_predict_ms);
-            }
-
             // require that each new line has a whitespace prefix (i.e. indentation) of at least slot.params.n_indent
             if (slot.params.n_indent > 0) {
                 // check the current indentation

@@ -2225,6 +2199,14 @@ struct server_context {
         // check if there is a new line in the generated text
         if (result.text_to_send.find('\n') != std::string::npos) {
             slot.has_new_line = true;
+
+            // if we have seen a new line, we stop after a certain time limit, but only upon another new line
+            if (slot.params.t_max_predict_ms > 0 && (ggml_time_us() - slot.t_start_generation > 1000.0f*slot.params.t_max_predict_ms)) {
+                slot.stop = STOP_TYPE_LIMIT;
+                slot.has_next_token = false;
+
+                SLT_DBG(slot, "stopped by time limit, n_decoded = %d, t_max_predict_ms = %d ms\n", slot.n_decoded, (int) slot.params.t_max_predict_ms);
+            }
         }

         // if context shift is disabled, we stop when it reaches the context limit

@@ -3034,7 +3016,7 @@ struct server_context {
                 const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c;

                 llama_kv_cache_seq_rm (ctx, slot.id, head_p, head_c);
-                llama_kv_cache_seq_add(ctx, slot.id, head_c,
+                llama_kv_cache_seq_add(ctx, slot.id, head_c, head_c + n_match, kv_shift);

                 for (size_t i = 0; i < n_match; i++) {
                     slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i];

@@ -3822,13 +3804,15 @@ int main(int argc, char ** argv) {
             { "default_generation_settings", ctx_server.default_generation_settings_for_props },
             { "total_slots", ctx_server.params_base.n_parallel },
             { "model_path", ctx_server.params_base.model },
-            { "chat_template", ctx_server.chat_templates.
-            { "bos_token", ctx_server.
-            { "eos_token", ctx_server.
+            { "chat_template", common_chat_templates_source(ctx_server.chat_templates.get()) },
+            { "bos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_bos(ctx_server.vocab), /* special= */ true)},
+            { "eos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_eos(ctx_server.vocab), /* special= */ true)},
             { "build_info", build_info },
         };
-        if (ctx_server.params_base.use_jinja
-
+        if (ctx_server.params_base.use_jinja) {
+            if (auto tool_use_src = common_chat_templates_source(ctx_server.chat_templates.get(), "tool_use")) {
+                data["chat_template_tool_use"] = tool_use_src;
+            }
         }

         res_ok(res, data);

@@ -4063,7 +4047,7 @@ int main(int argc, char ** argv) {
         }

         auto body = json::parse(req.body);
-        json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates);
+        json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates.get());

         return handle_completions_impl(
             SERVER_TASK_TYPE_COMPLETION,

@@ -4076,7 +4060,7 @@ int main(int argc, char ** argv) {
     // same with handle_chat_completions, but without inference part
     const auto handle_apply_template = [&ctx_server, &params, &res_ok](const httplib::Request & req, httplib::Response & res) {
         auto body = json::parse(req.body);
-        json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates);
+        json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates.get());
         res_ok(res, {{ "prompt", std::move(data.at("prompt")) }});
     };

@@ -4263,6 +4247,11 @@ int main(int argc, char ** argv) {
         //    return;
         //}

+        // if true, use TEI API format, otherwise use Jina API format
+        // Jina: https://jina.ai/reranker/
+        // TEI: https://huggingface.github.io/text-embeddings-inference/#/Text%20Embeddings%20Inference/rerank
+        bool is_tei_format = body.contains("texts");
+
         json query;
         if (body.count("query") == 1) {
             query = body.at("query");

@@ -4275,7 +4264,8 @@ int main(int argc, char ** argv) {
             return;
         }

-        std::vector<std::string> documents = json_value(body, "documents",
+        std::vector<std::string> documents = json_value(body, "documents",
+            json_value(body, "texts", std::vector<std::string>()));
         if (documents.empty()) {
             res_error(res, format_error_response("\"documents\" must be a non-empty string array", ERROR_TYPE_INVALID_REQUEST));
             return;
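The two rerank hunks above let the endpoint accept either the Jina-style body (`documents`) or the TEI-style body (`texts`); the presence of a `texts` field is what switches `is_tei_format`. For illustration, the two request payloads could be assembled as below; the values are invented, and the `return_text` option comes from the `format_response_rerank` hunk further down.

// Illustrative rerank request bodies; only the field names come from this diff.
#include <nlohmann/json.hpp>

nlohmann::json make_jina_style_request() {
    return {
        {"query", "What is a panda?"},
        {"documents", {"hi", "The giant panda is a bear species endemic to China."}},
    };
}

nlohmann::json make_tei_style_request() {
    return {
        {"query", "What is a panda?"},
        {"texts", {"hi", "The giant panda is a bear species endemic to China."}},
        {"return_text", true},   // TEI option: echo each document back in the response
    };
}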
@@ -4320,7 +4310,12 @@ int main(int argc, char ** argv) {
         }

         // write JSON response
-        json root = format_response_rerank(
+        json root = format_response_rerank(
+            body,
+            responses,
+            is_tei_format,
+            documents);
+
         res_ok(res, root);
     };

@@ -4482,8 +4477,8 @@ int main(int argc, char ** argv) {

     // print sample chat example to make it clear which template is used
     LOG_INF("%s: chat template, chat_template: %s, example_format: '%s'\n", __func__,
-        ctx_server.chat_templates.
-        common_chat_format_example(
+        common_chat_templates_source(ctx_server.chat_templates.get()),
+        common_chat_format_example(ctx_server.chat_templates.get(), ctx_server.params_base.use_jinja).c_str());

     ctx_server.queue_tasks.on_new_task([&ctx_server](const server_task & task) {
         ctx_server.process_single_task(task);
package/src/llama.cpp/examples/server/utils.hpp

@@ -7,14 +7,14 @@

 // increase max payload length to allow use of larger context size
 #define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
+// disable Nagle's algorithm
+#define CPPHTTPLIB_TCP_NODELAY true
 #include "httplib.h"

 // Change JSON_ASSERT from assert() to GGML_ASSERT:
 #define JSON_ASSERT GGML_ASSERT
 #include "json.hpp"
-#include "
-#include "chat.hpp"
-#include "chat-template.hpp"
+#include "chat.h"

 #include <random>
 #include <sstream>
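Note the placement of the new define: cpp-httplib reads its configuration macros at include time, so `CPPHTTPLIB_TCP_NODELAY` only takes effect when defined before `httplib.h`, exactly as ordered above. A minimal illustration (the tiny server below is a sketch, not the package's code):

// Configuration macros for cpp-httplib must precede the include to take effect.
#define CPPHTTPLIB_TCP_NODELAY true   // disable Nagle's algorithm for lower latency
#include "httplib.h"

int main() {
    httplib::Server svr;
    svr.Get("/health", [](const httplib::Request &, httplib::Response & res) {
        res.set_content("ok", "text/plain");
    });
    // svr.listen("127.0.0.1", 8080);   // illustration only
    return 0;
}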
@@ -347,41 +347,6 @@ static llama_tokens format_infill(
     return embd_inp;
 }

-// Format given chat. If tmpl is empty, we take the template from model metadata
-inline std::string format_chat(const common_chat_template & tmpl, const std::vector<json> & messages) {
-    std::vector<common_chat_msg> chat;
-
-    for (size_t i = 0; i < messages.size(); ++i) {
-        const auto & curr_msg = messages[i];
-
-        std::string role = json_value(curr_msg, "role", std::string(""));
-
-        std::string content;
-        if (curr_msg.contains("content")) {
-            if (curr_msg["content"].is_string()) {
-                content = curr_msg["content"].get<std::string>();
-            } else if (curr_msg["content"].is_array()) {
-                for (const auto & part : curr_msg["content"]) {
-                    if (part.contains("text")) {
-                        content += "\n" + part["text"].get<std::string>();
-                    }
-                }
-            } else {
-                throw std::runtime_error("Invalid 'content' type (ref: https://github.com/ggml-org/llama.cpp/issues/8367)");
-            }
-        } else {
-            throw std::runtime_error("Missing 'content' (ref: https://github.com/ggml-org/llama.cpp/issues/8367)");
-        }
-
-        chat.push_back({role, content, /* tool_calls= */ {}});
-    }
-
-    const auto formatted_chat = common_chat_apply_template(tmpl, chat, true, /* use_jinja= */ false);
-    LOG_DBG("formatted_chat: '%s'\n", formatted_chat.c_str());
-
-    return formatted_chat;
-}
-
 //
 // base64 utils (TODO: move to common in the future)
 //

@@ -470,6 +435,10 @@ static std::string gen_chatcmplid() {
     return "chatcmpl-" + random_string();
 }

+static std::string gen_tool_call_id() {
+    return random_string();
+}
+
 //
 // other common utils
 //

@@ -556,8 +525,13 @@ static json oaicompat_completion_params_parse(const json & body) {
         throw std::runtime_error("Only one completion choice is allowed");
     }

+    // Handle "echo" field
+    if (json_value(body, "echo", false)) {
+        throw std::runtime_error("Only no echo is supported");
+    }
+
     // Params supported by OAI but unsupported by llama.cpp
-    static const std::vector<std::string> unsupported_params { "best_of", "
+    static const std::vector<std::string> unsupported_params { "best_of", "suffix" };
     for (const auto & param : unsupported_params) {
         if (body.contains(param)) {
             throw std::runtime_error("Unsupported param: " + param);

@@ -579,12 +553,9 @@ static json oaicompat_completion_params_parse(
     const json & body, /* openai api json semantics */
     bool use_jinja,
     common_reasoning_format reasoning_format,
-    const common_chat_templates
+    const struct common_chat_templates * tmpls)
 {
     json llama_params;
-    const auto & tmpl = body.contains("tools") && chat_templates.template_tool_use
-        ? *chat_templates.template_tool_use
-        : *chat_templates.template_default;

     auto tools = json_value(body, "tools", json());
     auto stream = json_value(body, "stream", false);

@@ -610,62 +581,56 @@ static json oaicompat_completion_params_parse(
         llama_params["stop"] = json_value(body, "stop", json::array());
     }

+    auto json_schema = json_value(body, "json_schema", json());
+    auto grammar = json_value(body, "grammar", std::string());
+    if (!json_schema.is_null() && !grammar.empty()) {
+        throw std::runtime_error("Cannot use both json_schema and grammar");
+    }
+
     // Handle "response_format" field
     if (body.contains("response_format")) {
         json response_format = json_value(body, "response_format", json::object());
         std::string response_type = json_value(response_format, "type", std::string());
         if (response_type == "json_object") {
-
+            json_schema = json_value(response_format, "schema", json::object());
         } else if (response_type == "json_schema") {
-
-
+            auto schema_wrapper = json_value(response_format, "json_schema", json::object());
+            json_schema = json_value(schema_wrapper, "schema", json::object());
         } else if (!response_type.empty() && response_type != "text") {
             throw std::runtime_error("response_format type must be one of \"text\" or \"json_object\", but got: " + response_type);
         }
     }

+    common_chat_templates_inputs inputs;
+    inputs.messages = common_chat_msgs_parse_oaicompat(body.at("messages"));
+    inputs.tools = common_chat_tools_parse_oaicompat(tools);
+    inputs.tool_choice = common_chat_tool_choice_parse_oaicompat(json_value(body, "tool_choice", std::string("auto")));
+    inputs.json_schema = json_schema.is_null() ? "" : json_schema.dump();
+    inputs.grammar = grammar;
+    inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true);
+    inputs.use_jinja = use_jinja;
+    inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", false);
+    inputs.extract_reasoning = reasoning_format != COMMON_REASONING_FORMAT_NONE;
+    inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true);
+    if (!inputs.tools.empty() && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && body.contains("grammar")) {
+        throw std::runtime_error("Cannot use custom grammar constraints with tools.");
+    }
+
     // Apply chat template to the list of messages
-
-
-
-
-
-
-
-
-
-
-
-
-
-        if (inputs.parallel_tool_calls && !tmpl.original_caps().supports_parallel_tool_calls) {
-            LOG_DBG("Disabling parallel_tool_calls because the template does not support it\n");
-            inputs.parallel_tool_calls = false;
-        }
-        inputs.stream = stream;
-        // TODO: support mixing schema w/ tools beyond generic format.
-        inputs.json_schema = json_value(llama_params, "json_schema", json());
-        auto chat_params = common_chat_params_init(tmpl, inputs);
-
-        llama_params["chat_format"] = static_cast<int>(chat_params.format);
-        llama_params["prompt"] = chat_params.prompt;
-        llama_params["grammar"] = chat_params.grammar;
-        llama_params["grammar_lazy"] = chat_params.grammar_lazy;
-        auto grammar_triggers = json::array();
-        for (const auto & trigger : chat_params.grammar_triggers) {
-            grammar_triggers.push_back({
-                {"word", trigger.word},
-                {"at_start", trigger.at_start},
-            });
-        }
-        llama_params["grammar_triggers"] = grammar_triggers;
-        llama_params["preserved_tokens"] = chat_params.preserved_tokens;
-        for (const auto & stop : chat_params.additional_stops) {
-            llama_params["stop"].push_back(stop);
-        }
-    } else {
-        llama_params["prompt"] = format_chat(tmpl, body.at("messages"));
+    auto chat_params = common_chat_templates_apply(tmpls, inputs);
+
+    llama_params["chat_format"] = static_cast<int>(chat_params.format);
+    llama_params["prompt"] = chat_params.prompt;
+    llama_params["grammar"] = chat_params.grammar;
+    llama_params["grammar_lazy"] = chat_params.grammar_lazy;
+    auto grammar_triggers = json::array();
+    for (const auto & trigger : chat_params.grammar_triggers) {
+        grammar_triggers.push_back(trigger.to_json<json>());
+    }
+    llama_params["grammar_triggers"] = grammar_triggers;
+    llama_params["preserved_tokens"] = chat_params.preserved_tokens;
+    for (const auto & stop : chat_params.additional_stops) {
+        llama_params["stop"].push_back(stop);
     }

     // Handle "n" field
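The rewritten parser above no longer selects a template and calls `common_chat_params_init` itself; it fills a `common_chat_templates_inputs` struct and lets a single `common_chat_templates_apply` call render the prompt and derive the grammar, trigger, and stop constraints. A trimmed sketch of that flow, assuming the same headers utils.hpp already includes (`chat.h`, and `json.hpp` with `json` aliased to an nlohmann type) and omitting error handling and the remaining input fields:

// Sketch of the new chat-template flow; not the server's exact code.
static std::string render_oaicompat_prompt(const struct common_chat_templates * tmpls, const json & body) {
    common_chat_templates_inputs inputs;
    inputs.messages              = common_chat_msgs_parse_oaicompat(body.at("messages"));
    inputs.tools                 = common_chat_tools_parse_oaicompat(body.contains("tools") ? body.at("tools") : json());
    inputs.tool_choice           = common_chat_tool_choice_parse_oaicompat("auto");
    inputs.use_jinja             = true;
    inputs.add_generation_prompt = true;

    // one call renders the prompt and derives grammar/trigger/stop constraints
    auto chat_params = common_chat_templates_apply(tmpls, inputs);
    return chat_params.prompt;
}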
@@ -737,28 +702,50 @@ static json format_embeddings_response_oaicompat(const json & request, const jso
     return res;
 }

-static json format_response_rerank(
-
-
-
-
-
-
-
-
+static json format_response_rerank(
+    const json & request,
+    const json & ranks,
+    bool is_tei_format,
+    std::vector<std::string> & texts) {
+    json res;
+    if (is_tei_format) {
+        // TEI response format
+        res = json::array();
+        bool return_text = json_value(request, "return_text", false);
+        for (const auto & rank : ranks) {
+            int index = json_value(rank, "index", 0);
+            json elem = json{
+                {"index", index},
+                {"score", json_value(rank, "score", 0.0)},
+            };
+            if (return_text) {
+                elem["text"] = std::move(texts[index]);
+            }
+            res.push_back(elem);
+        }
+    } else {
+        // Jina response format
+        json results = json::array();
+        int32_t n_tokens = 0;
+        for (const auto & rank : ranks) {
+            results.push_back(json{
+                {"index", json_value(rank, "index", 0)},
+                {"relevance_score", json_value(rank, "score", 0.0)},
+            });

-
-
+            n_tokens += json_value(rank, "tokens_evaluated", 0);
+        }

-
-
-
-
-
-
-
-
-
+        res = json{
+            {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
+            {"object", "list"},
+            {"usage", json{
+                {"prompt_tokens", n_tokens},
+                {"total_tokens", n_tokens}
+            }},
+            {"results", results}
+        };
+    }

     return res;
 }
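From the function above, the response shape depends on the detected request format: TEI mode returns a bare JSON array of `{index, score}` objects (plus `text` when `return_text` was set), while Jina mode wraps a `results` array with `model`, `object`, and token `usage` fields. For illustration only, with invented values:

// Illustration of the two rerank response shapes; values are invented.
#include <nlohmann/json.hpp>

// TEI-style: a bare array, optionally echoing each document's text.
const nlohmann::json tei_response = nlohmann::json::array({
    { {"index", 1}, {"score", 8.5},  {"text", "The giant panda is a bear species ..."} },
    { {"index", 0}, {"score", -4.2}, {"text", "hi"} },
});

// Jina-style: results wrapped with model/object/usage metadata.
const nlohmann::json jina_response = {
    {"model", "some-model.gguf"},
    {"object", "list"},
    {"usage", { {"prompt_tokens", 42}, {"total_tokens", 42} }},
    {"results", nlohmann::json::array({
        { {"index", 1}, {"relevance_score", 8.5} },
        { {"index", 0}, {"relevance_score", -4.2} },
    })},
};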
package/src/llama.cpp/examples/sycl/run-llama2.sh

@@ -3,7 +3,7 @@
 # MIT license
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: MIT
-
+export ONEAPI_DEVICE_SELECTOR="level_zero:0"
 source /opt/intel/oneapi/setvars.sh

 #export GGML_SYCL_DEBUG=1

@@ -13,7 +13,7 @@ source /opt/intel/oneapi/setvars.sh
 INPUT_PROMPT="Building a website can be done in 10 simple steps:\nStep 1:"
 MODEL_FILE=models/llama-2-7b.Q4_0.gguf
 NGL=33
-CONEXT=
+CONEXT=4096

 if [ $# -gt 0 ]; then
     GGML_SYCL_DEVICE=$1