@fugood/llama.node 0.3.9 → 0.3.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.js +2 -2
- package/lib/binding.ts +47 -8
- package/lib/index.js +21 -1
- package/lib/index.ts +31 -1
- package/package.json +12 -3
- package/src/LlamaCompletionWorker.cpp +33 -6
- package/src/LlamaCompletionWorker.h +3 -1
- package/src/LlamaContext.cpp +336 -28
- package/src/LlamaContext.h +2 -0
- package/src/common.hpp +19 -2
- package/src/llama.cpp/.github/workflows/build.yml +289 -107
- package/src/llama.cpp/.github/workflows/close-issue.yml +1 -1
- package/src/llama.cpp/.github/workflows/docker.yml +2 -1
- package/src/llama.cpp/.github/workflows/server.yml +25 -2
- package/src/llama.cpp/CMakeLists.txt +10 -19
- package/src/llama.cpp/cmake/build-info.cmake +1 -1
- package/src/llama.cpp/common/CMakeLists.txt +32 -0
- package/src/llama.cpp/common/arg.cpp +66 -16
- package/src/llama.cpp/common/chat-template.hpp +515 -0
- package/src/llama.cpp/common/chat.cpp +966 -0
- package/src/llama.cpp/common/chat.hpp +52 -0
- package/src/llama.cpp/common/common.cpp +159 -36
- package/src/llama.cpp/common/common.h +56 -14
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +46 -66
- package/src/llama.cpp/common/json-schema-to-grammar.h +15 -1
- package/src/llama.cpp/common/llguidance.cpp +270 -0
- package/src/llama.cpp/common/log.cpp +1 -10
- package/src/llama.cpp/common/log.h +10 -0
- package/src/llama.cpp/common/minja.hpp +2868 -0
- package/src/llama.cpp/common/sampling.cpp +22 -1
- package/src/llama.cpp/common/sampling.h +3 -0
- package/src/llama.cpp/docs/build.md +54 -9
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +12 -2
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +1 -1
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +59 -0
- package/src/llama.cpp/examples/llava/clip.cpp +133 -14
- package/src/llama.cpp/examples/llava/clip.h +2 -0
- package/src/llama.cpp/examples/llava/llava.cpp +22 -8
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +9 -1
- package/src/llama.cpp/examples/main/main.cpp +26 -25
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +136 -137
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +18 -4
- package/src/llama.cpp/examples/run/run.cpp +224 -69
- package/src/llama.cpp/examples/server/server.cpp +252 -81
- package/src/llama.cpp/examples/server/utils.hpp +73 -21
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +6 -4
- package/src/llama.cpp/examples/simple-cmake-pkg/CMakeLists.txt +11 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +78 -1
- package/src/llama.cpp/ggml/include/ggml.h +1 -1
- package/src/llama.cpp/ggml/src/CMakeLists.txt +21 -4
- package/src/llama.cpp/ggml/src/ggml-alloc.c +1 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +91 -78
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +7 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +1 -1
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +46 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +16 -1
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +1 -1
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +28 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +5 -7
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +33 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +1 -5
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +323 -121
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +13 -3
- package/src/llama.cpp/ggml/src/ggml.c +23 -13
- package/src/llama.cpp/include/llama.h +14 -1
- package/src/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.out +46 -0
- package/src/llama.cpp/src/CMakeLists.txt +1 -1
- package/src/llama.cpp/src/llama-arch.cpp +7 -2
- package/src/llama.cpp/src/llama-arch.h +3 -1
- package/src/llama.cpp/src/llama-chat.cpp +11 -2
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-grammar.cpp +86 -6
- package/src/llama.cpp/src/llama-grammar.h +22 -1
- package/src/llama.cpp/src/llama-mmap.cpp +1 -0
- package/src/llama.cpp/src/llama-model-loader.cpp +1 -1
- package/src/llama.cpp/src/llama-model.cpp +76 -6
- package/src/llama.cpp/src/llama-sampling.cpp +47 -4
- package/src/llama.cpp/src/llama-vocab.cpp +10 -4
- package/src/llama.cpp/src/llama.cpp +181 -123
- package/src/llama.cpp/tests/CMakeLists.txt +4 -0
- package/src/llama.cpp/tests/test-backend-ops.cpp +158 -57
- package/src/llama.cpp/tests/test-chat-template.cpp +154 -31
- package/src/llama.cpp/tests/test-chat.cpp +607 -0
- package/src/llama.cpp/tests/test-grammar-integration.cpp +2 -2
- package/src/llama.cpp/tests/test-grammar-llguidance.cpp +1140 -0
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +1 -1
- package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +0 -32

package/src/llama.cpp/common/chat.hpp

@@ -0,0 +1,52 @@
+// Chat support (incl. tool call grammar constraining & output parsing) w/ generic & custom template handlers.
+
+#pragma once
+
+#include "common.h"
+#include <json.hpp>
+#include <optional>
+#include <string>
+#include <vector>
+
+using json = nlohmann::ordered_json;
+
+struct common_chat_inputs {
+    json messages;
+    json tools;
+    json tool_choice;
+    json json_schema;
+    bool parallel_tool_calls;
+    bool stream;
+    std::string grammar;
+    bool add_generation_prompt = true;
+};
+
+enum common_chat_format {
+    COMMON_CHAT_FORMAT_CONTENT_ONLY,
+    COMMON_CHAT_FORMAT_GENERIC,
+    COMMON_CHAT_FORMAT_MISTRAL_NEMO,
+    COMMON_CHAT_FORMAT_LLAMA_3_X,
+    COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
+    COMMON_CHAT_FORMAT_DEEPSEEK_R1,
+    COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
+    COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
+    COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
+    COMMON_CHAT_FORMAT_HERMES_2_PRO,
+    COMMON_CHAT_FORMAT_COMMAND_R7B,
+
+    COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
+};
+
+struct common_chat_params {
+    common_chat_format format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
+    json prompt;
+    std::string grammar;
+    bool grammar_lazy = false;
+    std::vector<common_grammar_trigger> grammar_triggers;
+    std::vector<std::string> preserved_tokens;
+    std::vector<std::string> additional_stops;
+};
+
+struct common_chat_params common_chat_params_init(const common_chat_template & tmpl, const struct common_chat_inputs & params);
+std::string common_chat_format_name(common_chat_format format);
+common_chat_msg common_chat_parse(const std::string & input, common_chat_format format);
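
The new common/chat.hpp header above is the entry point for llama.cpp's reworked chat handling: common_chat_params_init() renders a prompt (and, for tool calls, a constraining grammar) from a chat template plus the request inputs, and common_chat_parse() turns raw model output back into a structured message for the detected format. A minimal usage sketch, not taken from the package; the template object tmpl and the model_output string are assumed to come from elsewhere:

    // Hypothetical sketch of driving the new chat API declared above.
    common_chat_inputs inputs;
    inputs.messages = json::array({{{"role", "user"}, {"content", "What's the weather in Tokyo?"}}});
    inputs.tools    = json::array(); // tool definitions, if any

    common_chat_params params = common_chat_params_init(tmpl, inputs); // tmpl: a loaded common_chat_template
    // params.prompt  -> rendered prompt to feed to the model
    // params.grammar -> optional grammar constraining tool-call output
    // params.format  -> which parser to use on the generated text

    common_chat_msg reply = common_chat_parse(model_output, params.format); // model_output: generated text
    // reply.content holds plain text, reply.tool_calls any parsed tool invocations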

package/src/llama.cpp/common/common.cpp

@@ -12,6 +12,8 @@
 #include "json.hpp"
 #include "json-schema-to-grammar.h"
 #include "llama.h"
+#include "chat.hpp"
+#include "chat-template.hpp"
 
 #include <algorithm>
 #include <cinttypes>
@@ -483,6 +485,48 @@ void string_replace_all(std::string & s, const std::string & search, const std::
     s = std::move(builder);
 }
 
+std::string string_join(const std::vector<std::string> & values, const std::string & separator) {
+    std::ostringstream result;
+    for (size_t i = 0; i < values.size(); ++i) {
+        if (i > 0) {
+            result << separator;
+        }
+        result << values[i];
+    }
+    return result.str();
+}
+
+std::vector<std::string> string_split(const std::string & str, const std::string & delimiter) {
+    std::vector<std::string> parts;
+    size_t start = 0;
+    size_t end = str.find(delimiter);
+
+    while (end != std::string::npos) {
+        parts.push_back(str.substr(start, end - start));
+        start = end + delimiter.length();
+        end = str.find(delimiter, start);
+    }
+
+    parts.push_back(str.substr(start));
+
+    return parts;
+}
+
+std::string string_repeat(const std::string & str, size_t n) {
+    if (n == 0) {
+        return "";
+    }
+
+    std::string result;
+    result.reserve(str.length() * n);
+
+    for (size_t i = 0; i < n; ++i) {
+        result += str;
+    }
+
+    return result;
+}
+
 std::string string_from(bool value) {
     return value ? "true" : "false";
 }
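
The string_join, string_split, and string_repeat helpers added above are generalized versions of static utilities that previously lived in json-schema-to-grammar.cpp (removed further down in this diff) and are now declared in common.h. A small illustration of their behaviour, based only on the implementations above:

    // Illustrative only.
    std::vector<std::string> parts = string_split("a,b,c", ",");  // {"a", "b", "c"}
    std::string joined = string_join(parts, " | ");               // "a | b | c"
    std::string ruler  = string_repeat("-=", 3);                  // "-=-=-="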

package/src/llama.cpp/common/common.cpp

@@ -1728,67 +1772,80 @@ std::string common_detokenize(const struct llama_vocab * vocab, const std::vecto
 // Chat template utils
 //
 
-std::string
-
-
-
-
-
+bool common_chat_verify_template(const std::string & tmpl, bool use_jinja) {
+    if (use_jinja) {
+        try {
+            auto chat_template = common_chat_template(tmpl, "<s>", "</s>");
+            common_chat_inputs inputs;
+            inputs.messages = json::array({{
+                {"role", "user"},
+                {"content", "test"},
+            }});
+            common_chat_params_init(chat_template, inputs);
+            return true;
+        } catch (const std::exception & e) {
+            LOG_ERR("%s: failed to apply template: %s\n", __func__, e.what());
+            return false;
+        }
+    }
     llama_chat_message chat[] = {{"user", "test"}};
     const int res = llama_chat_apply_template(tmpl.c_str(), chat, 1, true, nullptr, 0);
     return res >= 0;
 }
 
-std::string common_chat_apply_template(
-    const
+std::string common_chat_apply_template(
+    const common_chat_template & tmpl,
     const std::vector<common_chat_msg> & msgs,
-    bool add_ass
+    bool add_ass,
+    bool use_jinja) {
+    if (use_jinja) {
+        auto messages = json::array();
+        for (const auto & msg : msgs) {
+            messages.push_back({{"role", msg.role}, {"content", msg.content}});
+        }
+        common_chat_inputs inputs;
+        inputs.messages = messages;
+        inputs.add_generation_prompt = add_ass;
+        return common_chat_params_init(tmpl, inputs).prompt;
+    }
+
     int alloc_size = 0;
-    bool fallback = false; // indicate if we must fallback to default chatml
     std::vector<llama_chat_message> chat;
     for (const auto & msg : msgs) {
         chat.push_back({msg.role.c_str(), msg.content.c_str()});
         alloc_size += (msg.role.size() + msg.content.size()) * 1.25;
     }
 
-    const char * ptr_tmpl = tmpl.empty() ? llama_model_chat_template(model) : tmpl.c_str();
     std::vector<char> buf(alloc_size);
 
     // run the first time to get the total output length
-    int32_t res = llama_chat_apply_template(
+    int32_t res = llama_chat_apply_template(tmpl.source().c_str(), chat.data(), chat.size(), add_ass, buf.data(), buf.size());
 
     // error: chat template is not supported
     if (res < 0) {
-        if
-
-
-            throw std::runtime_error("this custom template is not supported");
-        }
-
-        // If the built-in template is not supported, we default to chatml
-        res = llama_chat_apply_template("chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
-        fallback = true;
+        // if the custom "tmpl" is not supported, we throw an error
+        // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
+        throw std::runtime_error("this custom template is not supported");
     }
 
     // if it turns out that our buffer is too small, we resize it
     if ((size_t) res > buf.size()) {
         buf.resize(res);
-        res = llama_chat_apply_template(
-            fallback ? "chatml" : ptr_tmpl,
-            chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+        res = llama_chat_apply_template(tmpl.source().c_str(), chat.data(), chat.size(), add_ass, buf.data(), buf.size());
     }
 
     std::string formatted_chat(buf.data(), res);
     return formatted_chat;
 }
 
-std::string common_chat_format_single(
-    const
+std::string common_chat_format_single(
+    const common_chat_template & tmpl,
     const std::vector<common_chat_msg> & past_msg,
     const common_chat_msg & new_msg,
-    bool add_ass
+    bool add_ass,
+    bool use_jinja) {
     std::ostringstream ss;
-    auto fmt_past_msg = past_msg.empty() ? "" : common_chat_apply_template(
+    auto fmt_past_msg = past_msg.empty() ? "" : common_chat_apply_template(tmpl, past_msg, false, use_jinja);
     std::vector<common_chat_msg> chat_new(past_msg);
     // if the past_msg ends with a newline, we must preserve it in the formatted version
     if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') {
@@ -1796,21 +1853,87 @@ std::string common_chat_format_single(const struct llama_model * model,
     };
     // format chat with new_msg
     chat_new.push_back(new_msg);
-    auto fmt_new_msg = common_chat_apply_template(
+    auto fmt_new_msg = common_chat_apply_template(tmpl, chat_new, add_ass, use_jinja);
     // get the diff part
     ss << fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
     return ss.str();
 }
 
-std::string common_chat_format_example(const
-        const std::string & tmpl) {
+std::string common_chat_format_example(const common_chat_template & tmpl, bool use_jinja) {
     std::vector<common_chat_msg> msgs = {
-        {"system", "You are a helpful assistant"},
-        {"user", "Hello"},
-        {"assistant", "Hi there"},
-        {"user", "How are you?"},
+        {"system", "You are a helpful assistant", {}},
+        {"user", "Hello", {}},
+        {"assistant", "Hi there", {}},
+        {"user", "How are you?", {}},
     };
-    return common_chat_apply_template(
+    return common_chat_apply_template(tmpl, msgs, true, use_jinja);
+}
+
+#define CHATML_TEMPLATE_SRC \
+    "{%- for message in messages -%}\n" \
+    "  {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>\n' -}}\n" \
+    "{%- endfor -%}\n" \
+    "{%- if add_generation_prompt -%}\n" \
+    "  {{- '<|im_start|>assistant\n' -}}\n" \
+    "{%- endif -%}"
+
+common_chat_templates common_chat_templates_from_model(const struct llama_model * model, const std::string & chat_template_override)
+{
+    std::string default_template_src;
+    std::string template_tool_use_src;
+
+    bool has_explicit_template = !chat_template_override.empty();
+    if (chat_template_override.empty()) {
+        auto str = llama_model_chat_template(model, /* name */ nullptr);
+        if (str) {
+            default_template_src = str;
+            has_explicit_template = true;
+        }
+        str = llama_model_chat_template(model, /* name */ "tool_use");
+        if (str) {
+            template_tool_use_src = str;
+            has_explicit_template = true;
+        }
+    } else {
+        default_template_src = chat_template_override;
+    }
+    if (default_template_src.empty() || default_template_src == "chatml") {
+        if (!template_tool_use_src.empty()) {
+            default_template_src = template_tool_use_src;
+        } else {
+            default_template_src = CHATML_TEMPLATE_SRC;
+        }
+    }
+    auto vocab = llama_model_get_vocab(model);
+    const auto get_token = [&](llama_token token, const char * name, const char * jinja_variable_name) {
+        if (token == LLAMA_TOKEN_NULL) {
+            if (default_template_src.find(jinja_variable_name) != std::string::npos
+                || template_tool_use_src.find(jinja_variable_name) != std::string::npos) {
+                LOG_WRN("%s: warning: vocab does not have a %s token, jinja template won't work as intended.\n", __func__, name);
+            }
+            return std::string();
+        } else {
+            return common_token_to_piece(vocab, token, true);
+        }
+    };
+    auto token_bos = get_token(llama_vocab_bos(vocab), "BOS", "bos_token");
+    auto token_eos = get_token(llama_vocab_eos(vocab), "EOS", "eos_token");
+    try {
+        return {
+            has_explicit_template,
+            std::make_unique<minja::chat_template>(default_template_src, token_bos, token_eos),
+            template_tool_use_src.empty()
+                ? nullptr
+                : std::make_unique<minja::chat_template>(template_tool_use_src, token_bos, token_eos),
+        };
+    } catch (const std::exception & e) {
+        LOG_ERR("%s: failed to parse chat template: %s\n", __func__, e.what());
+        return {
+            has_explicit_template,
+            std::make_unique<minja::chat_template>(CHATML_TEMPLATE_SRC, token_bos, token_eos),
+            nullptr,
+        };
+    }
 }
 
 //
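
common_chat_templates_from_model() above gathers the model's built-in Jinja templates (the default one and an optional tool_use variant), falls back to a minimal ChatML template, and resolves the BOS/EOS token strings for minja. A hedged sketch of how a caller such as the server or CLI might use it; model, the params fields, and the logging macro are assumed from elsewhere in llama.cpp:

    // Hypothetical usage sketch (not from the package).
    common_chat_templates chat_templates = common_chat_templates_from_model(model, params.chat_template);
    const common_chat_template & tmpl = *chat_templates.template_default; // always set, falls back to chatml

    // Render a short example conversation through the selected template:
    LOG_INF("example chat: %s\n", common_chat_format_example(tmpl, params.use_jinja).c_str());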

package/src/llama.cpp/common/common.h

@@ -4,6 +4,7 @@
 
 #include "llama-cpp.h"
 
+#include <set>
 #include <string>
 #include <vector>
 #include <sstream>
@@ -109,6 +110,11 @@ enum common_conversation_mode {
     COMMON_CONVERSATION_MODE_AUTO = 2,
 };
 
+struct common_grammar_trigger {
+    std::string word;
+    bool at_start;
+};
+
 // sampling parameters
 struct common_params_sampling {
     uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
@@ -154,7 +160,11 @@
         COMMON_SAMPLER_TYPE_TEMPERATURE,
     };
 
-    std::string
+    std::string grammar; // optional BNF-like grammar to constrain sampling
+    bool grammar_lazy = false;
+    std::vector<common_grammar_trigger> grammar_trigger_words; // optional trigger words to trigger lazy grammar
+    std::vector<llama_token> grammar_trigger_tokens; // optional trigger tokens to trigger lazy grammar and print trigger special tokens.
+    std::set<llama_token> preserved_tokens;
 
     std::vector<llama_logit_bias> logit_bias; // logit biases to apply
 
@@ -175,7 +185,11 @@ struct common_params_speculative {
     struct cpu_params cpuparams;
     struct cpu_params cpuparams_batch;
 
-    std::string
+    std::string hf_repo = ""; // HF repo // NOLINT
+    std::string hf_file = ""; // HF file // NOLINT
+
+    std::string model = ""; // draft model for speculative decoding // NOLINT
+    std::string model_url = ""; // model url to download // NOLINT
 };
 
 struct common_params_vocoder {
@@ -330,6 +344,7 @@ struct common_params {
     std::string hostname = "127.0.0.1";
     std::string public_path = ""; // NOLINT
     std::string chat_template = ""; // NOLINT
+    bool use_jinja = false; // NOLINT
     bool enable_chat_template = true;
 
     std::vector<std::string> api_keys;
@@ -424,6 +439,10 @@ std::string string_format(const char * fmt, ...);
 std::string string_strip(const std::string & str);
 std::string string_get_sortable_timestamp();
 
+std::string string_join(const std::vector<std::string> & values, const std::string & separator);
+std::vector<std::string> string_split(const std::string & str, const std::string & delimiter);
+std::string string_repeat(const std::string & str, size_t n);
+
 void string_replace_all(std::string & s, const std::string & search, const std::string & replace);
 
 template<class T>
@@ -508,12 +527,14 @@ struct llama_model * common_load_model_from_url(
     const std::string & local_path,
     const std::string & hf_token,
     const struct llama_model_params & params);
+
 struct llama_model * common_load_model_from_hf(
     const std::string & repo,
     const std::string & remote_path,
     const std::string & local_path,
     const std::string & hf_token,
     const struct llama_model_params & params);
+
 std::pair<std::string, std::string> common_get_hf_file(
     const std::string & hf_repo_with_tag,
     const std::string & hf_token);
@@ -591,36 +612,57 @@ std::string common_detokenize(
 // Chat template utils
 //
 
+struct common_tool_call {
+    std::string name;
+    std::string arguments;
+    std::string id;
+};
+
 // same with llama_chat_message, but uses std::string
 struct common_chat_msg {
     std::string role;
     std::string content;
+    std::vector<common_tool_call> tool_calls;
+    std::string tool_plan = "";
 };
 
-// Get the built-in chat template for the model. Return empty string if not present.
-std::string common_get_builtin_chat_template(const struct llama_model * model);
-
 // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
-bool common_chat_verify_template(const std::string & tmpl);
+bool common_chat_verify_template(const std::string & tmpl, bool use_jinja);
+
+namespace minja {
+    class chat_template;
+}
+
+typedef minja::chat_template common_chat_template;
+
+struct common_chat_templates {
+    bool has_explicit_template; // Model had builtin template or template overridde was specified.
+    std::unique_ptr<common_chat_template> template_default; // always set (defaults to chatml)
+    std::unique_ptr<common_chat_template> template_tool_use;
+};
 
 // CPP wrapper for llama_chat_apply_template
 // If the built-in template is not supported, we default to chatml
 // If the custom "tmpl" is not supported, we throw an error
-std::string common_chat_apply_template(
-    const
+std::string common_chat_apply_template(
+    const common_chat_template & tmpl,
    const std::vector<common_chat_msg> & chat,
-    bool add_ass
+    bool add_ass,
+    bool use_jinja);
 
 // Format single message, while taking into account the position of that message in chat history
-std::string common_chat_format_single(
-    const
+std::string common_chat_format_single(
+    const common_chat_template & tmpl,
    const std::vector<common_chat_msg> & past_msg,
    const common_chat_msg & new_msg,
-    bool add_ass
+    bool add_ass,
+    bool use_jinja);
 
 // Returns an example of formatted chat
-std::string common_chat_format_example(
-
+std::string common_chat_format_example(
+    const common_chat_template & tmpl, bool use_jinja);
+
+common_chat_templates common_chat_templates_from_model(const struct llama_model * model, const std::string & chat_template_override);
 
 //
 // KV cache utils
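
The new common_grammar_trigger struct and the grammar_lazy / grammar_trigger_* sampling fields above support "lazy" grammars: the grammar is only enforced once a trigger word or token appears in the output, which is how tool-call constraining is wired up. A hypothetical illustration of filling these fields (the grammar text and trigger word are placeholders, not values from the package):

    // Hypothetical illustration; values are placeholders.
    common_params_sampling sparams;
    sparams.grammar      = "root ::= ...";   // BNF-like grammar (placeholder)
    sparams.grammar_lazy = true;              // only constrain output after a trigger fires
    sparams.grammar_trigger_words.push_back({ "<tool_call>", /* at_start */ false });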

package/src/llama.cpp/common/json-schema-to-grammar.cpp

@@ -1,4 +1,6 @@
 #include "json-schema-to-grammar.h"
+#include "common.h"
+
 #include <algorithm>
 #include <fstream>
 #include <map>
@@ -11,11 +13,6 @@
 
 using json = nlohmann::ordered_json;
 
-template <typename Iterator>
-static std::string join(Iterator begin, Iterator end, const std::string & separator);
-
-static std::string repeat(const std::string & str, size_t n);
-
 static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "") {
     auto has_max = max_items != std::numeric_limits<int>::max();
 
@@ -128,8 +125,8 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
         if (sub_len > 0) {
             auto from_sub = from.substr(i + 1);
             auto to_sub = to.substr(i + 1);
-            auto sub_zeros =
-            auto sub_nines =
+            auto sub_zeros = string_repeat("0", sub_len);
+            auto sub_nines = string_repeat("9", sub_len);
 
             auto to_reached = false;
             out << "(";
@@ -188,8 +185,8 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
         auto max_digits = max_s.length();
 
         for (auto digits = min_digits; digits < max_digits; digits++) {
-            uniform_range(min_s,
-            min_s = "1" +
+            uniform_range(min_s, string_repeat("9", digits));
+            min_s = "1" + string_repeat("0", digits);
             out << " | ";
         }
         uniform_range(min_s, max_s);
@@ -318,49 +315,6 @@ std::unordered_map<char, std::string> GRAMMAR_LITERAL_ESCAPES = {
 std::unordered_set<char> NON_LITERAL_SET = {'|', '.', '(', ')', '[', ']', '{', '}', '*', '+', '?'};
 std::unordered_set<char> ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'^', '$', '.', '[', ']', '(', ')', '|', '{', '}', '*', '+', '?'};
 
-template <typename Iterator>
-std::string join(Iterator begin, Iterator end, const std::string & separator) {
-    std::ostringstream result;
-    if (begin != end) {
-        result << *begin;
-        for (Iterator it = begin + 1; it != end; ++it) {
-            result << separator << *it;
-        }
-    }
-    return result.str();
-}
-
-static std::vector<std::string> split(const std::string & str, const std::string & delimiter) {
-    std::vector<std::string> tokens;
-    size_t start = 0;
-    size_t end = str.find(delimiter);
-
-    while (end != std::string::npos) {
-        tokens.push_back(str.substr(start, end - start));
-        start = end + delimiter.length();
-        end = str.find(delimiter, start);
-    }
-
-    tokens.push_back(str.substr(start));
-
-    return tokens;
-}
-
-static std::string repeat(const std::string & str, size_t n) {
-    if (n == 0) {
-        return "";
-    }
-
-    std::string result;
-    result.reserve(str.length() * n);
-
-    for (size_t i = 0; i < n; ++i) {
-        result += str;
-    }
-
-    return result;
-}
-
 static std::string replacePattern(const std::string & input, const std::regex & regex, const std::function<std::string(const std::smatch &)> & replacement) {
     std::smatch match;
     std::string result;
@@ -389,6 +343,7 @@ static std::string format_literal(const std::string & literal) {
 
 class SchemaConverter {
 private:
+    friend std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options);
     std::function<json(const std::string &)> _fetch_json;
     bool _dotall;
     std::map<std::string, std::string> _rules;
@@ -418,7 +373,7 @@ private:
         for (size_t i = 0; i < alt_schemas.size(); i++) {
            rules.push_back(visit(alt_schemas[i], name + (name.empty() ? "alternative-" : "-") + std::to_string(i)));
        }
-        return
+        return string_join(rules, " | ");
     }
 
     std::string _visit_pattern(const std::string & pattern, const std::string & name) {
@@ -481,7 +436,7 @@ private:
            for (const auto & item : ret) {
                results.push_back(to_rule(item));
            }
-            return std::make_pair(
+            return std::make_pair(string_join(results, " "), false);
         };
 
         while (i < length) {
@@ -539,7 +494,7 @@ private:
                    }
                    curly_brackets += '}';
                    i++;
-                    auto nums =
+                    auto nums = string_split(curly_brackets.substr(1, curly_brackets.length() - 2), ",");
                    int min_times = 0;
                    int max_times = std::numeric_limits<int>::max();
                    try {
@@ -809,10 +764,11 @@
 public:
     SchemaConverter(
         const std::function<json(const std::string &)> & fetch_json,
-        bool dotall
+        bool dotall,
+        bool compact_spaces)
         : _fetch_json(fetch_json), _dotall(dotall)
     {
-        _rules["space"] = SPACE_RULE;
+        _rules["space"] = compact_spaces ? "\" \"?" : SPACE_RULE;
     }
 
     void resolve_refs(json & schema, const std::string & url) {
@@ -854,7 +810,7 @@ public:
            return;
        }
        std::string pointer = ref.substr(ref.find('#') + 1);
-        std::vector<std::string> tokens =
+        std::vector<std::string> tokens = string_split(pointer, "/");
        for (size_t i = 1; i < tokens.size(); ++i) {
            std::string sel = tokens[i];
            if (target.is_null() || !target.contains(sel)) {
@@ -905,7 +861,7 @@ public:
            for (const auto & v : schema["enum"]) {
                enum_values.push_back(_generate_constant_rule(v));
            }
-            return _add_rule(rule_name, "(" +
+            return _add_rule(rule_name, "(" + string_join(enum_values, " | ") + ") space");
        } else if ((schema_type.is_null() || schema_type == "object")
                && (schema.contains("properties") ||
                    (schema.contains("additionalProperties") && schema["additionalProperties"] != true))) {
@@ -1019,10 +975,10 @@ public:
 
     void check_errors() {
         if (!_errors.empty()) {
-            throw std::runtime_error("JSON schema conversion failed:\n" +
+            throw std::runtime_error("JSON schema conversion failed:\n" + string_join(_errors, "\n"));
         }
         if (!_warnings.empty()) {
-            fprintf(stderr, "WARNING: JSON schema conversion was incomplete: %s\n",
+            fprintf(stderr, "WARNING: JSON schema conversion was incomplete: %s\n", string_join(_warnings, "; ").c_str());
         }
     }
 
@@ -1035,11 +991,35 @@ public:
     }
 };
 
-std::string json_schema_to_grammar(const json & schema) {
-
-
-
-
+std::string json_schema_to_grammar(const json & schema, bool force_gbnf) {
+#ifdef LLAMA_USE_LLGUIDANCE
+    if (!force_gbnf) {
+        return "%llguidance {}\nstart: %json " + schema.dump();
+    }
+#else
+    (void)force_gbnf;
+#endif // LLAMA_USE_LLGUIDANCE
+    return build_grammar([&](const common_grammar_builder & callbacks) {
+        auto copy = schema;
+        callbacks.resolve_refs(copy);
+        callbacks.add_schema("", copy);
+    });
+}
+
+std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options) {
+    SchemaConverter converter([&](const std::string &) { return json(); }, options.dotall, options.compact_spaces);
+    common_grammar_builder builder {
+        /* .add_rule = */ [&](const std::string & name, const std::string & rule) {
+            return converter._add_rule(name, rule);
+        },
+        /* .add_schema = */ [&](const std::string & name, const nlohmann::ordered_json & schema) {
+            return converter.visit(schema, name == "root" ? "" : name);
+        },
+        /* .resolve_refs = */ [&](nlohmann::ordered_json & schema) {
+            converter.resolve_refs(schema, "");
+        }
+    };
+    cb(builder);
     converter.check_errors();
     return converter.format_grammar();
 }