@fugood/llama.node 0.3.13 → 0.3.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.ts +1 -1
  18. package/package.json +1 -1
  19. package/src/LlamaContext.cpp +98 -76
  20. package/src/LlamaContext.h +1 -1
  21. package/src/common.hpp +1 -2
  22. package/src/llama.cpp/.github/workflows/build.yml +60 -10
  23. package/src/llama.cpp/.github/workflows/server.yml +2 -0
  24. package/src/llama.cpp/common/CMakeLists.txt +3 -3
  25. package/src/llama.cpp/common/arg.cpp +112 -11
  26. package/src/llama.cpp/common/chat.cpp +960 -266
  27. package/src/llama.cpp/common/chat.h +135 -0
  28. package/src/llama.cpp/common/common.cpp +27 -171
  29. package/src/llama.cpp/common/common.h +27 -67
  30. package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
  31. package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
  32. package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +37 -5
  33. package/src/llama.cpp/common/ngram-cache.cpp +1 -0
  34. package/src/llama.cpp/common/sampling.cpp +45 -7
  35. package/src/llama.cpp/common/speculative.cpp +6 -5
  36. package/src/llama.cpp/common/speculative.h +1 -1
  37. package/src/llama.cpp/docs/build.md +45 -7
  38. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +3 -1
  39. package/src/llama.cpp/examples/embedding/embedding.cpp +1 -0
  40. package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
  41. package/src/llama.cpp/examples/imatrix/imatrix.cpp +2 -3
  42. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +1 -1
  43. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  44. package/src/llama.cpp/examples/llava/clip.cpp +373 -107
  45. package/src/llama.cpp/examples/llava/clip.h +19 -3
  46. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
  47. package/src/llama.cpp/examples/llava/llava.cpp +4 -2
  48. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
  49. package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -0
  50. package/src/llama.cpp/examples/main/main.cpp +73 -28
  51. package/src/llama.cpp/examples/parallel/parallel.cpp +1 -0
  52. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -0
  53. package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
  54. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
  55. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
  56. package/src/llama.cpp/examples/run/run.cpp +110 -67
  57. package/src/llama.cpp/examples/server/server.cpp +82 -87
  58. package/src/llama.cpp/examples/server/utils.hpp +94 -107
  59. package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
  60. package/src/llama.cpp/examples/tts/tts.cpp +251 -142
  61. package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
  62. package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
  63. package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
  64. package/src/llama.cpp/ggml/include/ggml-cpu.h +3 -0
  65. package/src/llama.cpp/ggml/include/ggml.h +5 -1
  66. package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
  67. package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
  68. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
  69. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
  70. package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
  71. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -2
  72. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
  73. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +132 -17
  74. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
  75. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
  76. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
  77. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +151 -0
  78. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1396 -386
  79. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1432 -151
  80. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +22 -0
  81. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
  82. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
  83. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
  84. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  85. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +15 -2
  86. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
  87. package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
  88. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
  89. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +235 -0
  90. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -2
  91. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
  92. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +220 -116
  93. package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
  94. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
  95. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +2 -0
  96. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  97. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
  98. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +51 -10
  99. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
  100. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
  101. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
  102. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
  103. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
  104. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +136 -4
  105. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +308 -0
  106. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
  107. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +168 -721
  108. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -77
  109. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -0
  110. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
  111. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
  112. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +146 -42
  113. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +13 -3
  114. package/src/llama.cpp/ggml/src/ggml.c +8 -3
  115. package/src/llama.cpp/include/llama.h +19 -5
  116. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
  117. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
  118. package/src/llama.cpp/requirements/requirements-all.txt +1 -0
  119. package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
  120. package/src/llama.cpp/requirements.txt +1 -0
  121. package/src/llama.cpp/src/llama-arch.cpp +21 -0
  122. package/src/llama.cpp/src/llama-arch.h +1 -0
  123. package/src/llama.cpp/src/llama-chat.cpp +1 -0
  124. package/src/llama.cpp/src/llama-grammar.cpp +182 -182
  125. package/src/llama.cpp/src/llama-grammar.h +12 -3
  126. package/src/llama.cpp/src/llama-kv-cache.h +1 -0
  127. package/src/llama.cpp/src/llama-mmap.cpp +11 -1
  128. package/src/llama.cpp/src/llama-model.cpp +69 -5
  129. package/src/llama.cpp/src/llama-sampling.cpp +43 -10
  130. package/src/llama.cpp/src/llama-vocab.cpp +12 -0
  131. package/src/llama.cpp/src/llama.cpp +147 -0
  132. package/src/llama.cpp/tests/test-backend-ops.cpp +166 -110
  133. package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
  134. package/src/llama.cpp/tests/test-chat.cpp +593 -395
  135. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
  136. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
  137. package/src/llama.cpp/Sources/llama/llama.h +0 -4
  138. package/src/llama.cpp/common/chat.hpp +0 -55
  139. package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +0 -0
@@ -0,0 +1,135 @@
+ // Chat support (incl. tool call grammar constraining & output parsing) w/ generic & custom template handlers.
+
+ #pragma once
+
+ #include "common.h"
+ #include <string>
+ #include <vector>
+
+ struct common_chat_templates;
+
+ struct common_chat_tool_call {
+     std::string name;
+     std::string arguments;
+     std::string id;
+ };
+
+ struct common_chat_msg_content_part {
+     std::string type;
+     std::string text;
+ };
+
+ struct common_chat_msg {
+     std::string role;
+     std::string content;
+     std::vector<common_chat_msg_content_part> content_parts = {};
+     std::vector<common_chat_tool_call> tool_calls = {};
+     std::string reasoning_content;
+     std::string tool_name;
+     std::string tool_call_id;
+ };
+
+ struct common_chat_tool {
+     std::string name;
+     std::string description;
+     std::string parameters;
+ };
+
+ enum common_chat_tool_choice {
+     COMMON_CHAT_TOOL_CHOICE_AUTO,
+     COMMON_CHAT_TOOL_CHOICE_REQUIRED,
+     COMMON_CHAT_TOOL_CHOICE_NONE,
+ };
+
+ enum common_chat_format {
+     COMMON_CHAT_FORMAT_CONTENT_ONLY,
+     COMMON_CHAT_FORMAT_GENERIC,
+     COMMON_CHAT_FORMAT_MISTRAL_NEMO,
+     COMMON_CHAT_FORMAT_LLAMA_3_X,
+     COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
+     COMMON_CHAT_FORMAT_DEEPSEEK_R1,
+     COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING,
+     COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
+     COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
+     COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
+     COMMON_CHAT_FORMAT_HERMES_2_PRO,
+     COMMON_CHAT_FORMAT_HERMES_2_PRO_EXTRACT_REASONING,
+     COMMON_CHAT_FORMAT_COMMAND_R7B,
+     COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING,
+
+     COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
+ };
+
+ struct common_chat_templates_inputs {
+     std::vector<common_chat_msg> messages;
+     std::string grammar;
+     std::string json_schema;
+     bool add_generation_prompt = true;
+     bool use_jinja = true;
+     // Parameters below only supported when use_jinja is true
+     std::vector<common_chat_tool> tools;
+     common_chat_tool_choice tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO;
+     bool parallel_tool_calls = false;
+     bool extract_reasoning = true;
+ };
+
+ struct common_chat_params {
+     common_chat_format format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
+     std::string prompt;
+     std::string grammar;
+     bool grammar_lazy = false;
+     std::vector<common_grammar_trigger> grammar_triggers;
+     std::vector<std::string> preserved_tokens;
+     std::vector<std::string> additional_stops;
+ };
+
+ // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
+ bool common_chat_verify_template(const std::string & tmpl, bool use_jinja);
+
+ void common_chat_templates_free(struct common_chat_templates * tmpls);
+
+ struct common_chat_templates_deleter { void operator()(common_chat_templates * tmpls) { common_chat_templates_free(tmpls); } };
+
+ typedef std::unique_ptr<struct common_chat_templates, common_chat_templates_deleter> common_chat_templates_ptr;
+
+ common_chat_templates_ptr common_chat_templates_init(
+     const struct llama_model * model,
+     const std::string & chat_template_override,
+     const std::string & bos_token_override = "",
+     const std::string & eos_token_override = "");
+
+ bool common_chat_templates_was_explicit(const struct common_chat_templates * tmpls);
+ const char * common_chat_templates_source(const struct common_chat_templates * tmpls, const char * variant = nullptr);
+
+
+ struct common_chat_params common_chat_templates_apply(
+     const struct common_chat_templates * tmpls,
+     const struct common_chat_templates_inputs & inputs);
+
+ // Format single message, while taking into account the position of that message in chat history
+ std::string common_chat_format_single(
+     const struct common_chat_templates * tmpls,
+     const std::vector<common_chat_msg> & past_msg,
+     const common_chat_msg & new_msg,
+     bool add_ass,
+     bool use_jinja);
+
+ // Returns an example of formatted chat
+ std::string common_chat_format_example(
+     const struct common_chat_templates * tmpls,
+     bool use_jinja);
+
+ std::string common_chat_format_name(common_chat_format format);
+ common_chat_msg common_chat_parse( const std::string & input, common_chat_format format);
+
+ common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);
+
+ // Parses a JSON array of messages in OpenAI's chat completion API format.
+ // T can be std::string containing JSON or nlohmann::ordered_json
+ template <class T> std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const T & messages);
+ template <class T> T common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msgs, bool concat_typed_text = false);
+
+ // Parses a JSON array of tools in OpenAI's chat completion tool call API format.
+ // T can be std::string containing JSON or nlohmann::ordered_json
+ template <class T> std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const T & tools);
+ template <class T> T common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools);
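For orientation, a minimal sketch of how the new chat.h API is meant to be strung together, based solely on the declarations above; the model pointer, message text, and generated text are placeholders, and error handling is omitted:

    // Hedged usage sketch of the new common_chat_templates API (not from the package).
    #include "chat.h"

    static std::string build_prompt(const llama_model * model) {
        // Load the model's built-in template (empty string = no override).
        common_chat_templates_ptr tmpls = common_chat_templates_init(model, /* chat_template_override */ "");

        common_chat_templates_inputs inputs;
        common_chat_msg user_msg;
        user_msg.role    = "user";
        user_msg.content = "What is the weather in Tokyo?";   // placeholder
        inputs.messages.push_back(user_msg);
        inputs.add_generation_prompt = true;

        common_chat_params params = common_chat_templates_apply(tmpls.get(), inputs);
        // params.format later selects the parser for the model's raw output:
        //   common_chat_msg reply = common_chat_parse(generated_text, params.format);
        return params.prompt;
    }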
@@ -10,10 +10,7 @@
  // Change JSON_ASSERT from assert() to GGML_ASSERT:
  #define JSON_ASSERT GGML_ASSERT
  #include "json.hpp"
- #include "json-schema-to-grammar.h"
  #include "llama.h"
- #include "chat.hpp"
- #include "chat-template.hpp"

  #include <algorithm>
  #include <cinttypes>
@@ -485,6 +482,11 @@ void string_replace_all(std::string & s, const std::string & search, const std::
      s = std::move(builder);
  }

+ std::string regex_escape(const std::string & s) {
+     static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
+     return std::regex_replace(s, special_chars, "\\$0");
+ }
+
  std::string string_join(const std::vector<std::string> & values, const std::string & separator) {
      std::ostringstream result;
      for (size_t i = 0; i < values.size(); ++i) {
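A quick illustration of what the new regex_escape helper does (fragment only; the input string is an arbitrary example, not taken from the package):

    // regex_escape backslash-escapes regex metacharacters so the result can be
    // embedded in a std::regex and match the original text literally.
    std::string pattern = regex_escape("<|tool_call|>");   // yields "<\|tool_call\|>"
    std::regex  re(pattern);                               // matches the literal token text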
@@ -1768,174 +1770,6 @@ std::string common_detokenize(const struct llama_vocab * vocab, const std::vecto
      return text;
  }

- //
- // Chat template utils
- //
-
- bool common_chat_verify_template(const std::string & tmpl, bool use_jinja) {
-     if (use_jinja) {
-         try {
-             auto chat_template = common_chat_template(tmpl, "<s>", "</s>");
-             common_chat_inputs inputs;
-             inputs.messages = json::array({{
-                 {"role", "user"},
-                 {"content", "test"},
-             }});
-             common_chat_params_init(chat_template, inputs);
-             return true;
-         } catch (const std::exception & e) {
-             LOG_ERR("%s: failed to apply template: %s\n", __func__, e.what());
-             return false;
-         }
-     }
-     llama_chat_message chat[] = {{"user", "test"}};
-     const int res = llama_chat_apply_template(tmpl.c_str(), chat, 1, true, nullptr, 0);
-     return res >= 0;
- }
-
- std::string common_chat_apply_template(
-         const common_chat_template & tmpl,
-         const std::vector<common_chat_msg> & msgs,
-         bool add_ass,
-         bool use_jinja) {
-     if (use_jinja) {
-         auto messages = json::array();
-         for (const auto & msg : msgs) {
-             messages.push_back({{"role", msg.role}, {"content", msg.content}});
-         }
-         common_chat_inputs inputs;
-         inputs.messages = messages;
-         inputs.add_generation_prompt = add_ass;
-         return common_chat_params_init(tmpl, inputs).prompt;
-     }
-
-     int alloc_size = 0;
-     std::vector<llama_chat_message> chat;
-     for (const auto & msg : msgs) {
-         chat.push_back({msg.role.c_str(), msg.content.c_str()});
-         alloc_size += (msg.role.size() + msg.content.size()) * 1.25;
-     }
-
-     std::vector<char> buf(alloc_size);
-
-     // run the first time to get the total output length
-     int32_t res = llama_chat_apply_template(tmpl.source().c_str(), chat.data(), chat.size(), add_ass, buf.data(), buf.size());
-
-     // error: chat template is not supported
-     if (res < 0) {
-         // if the custom "tmpl" is not supported, we throw an error
-         // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
-         throw std::runtime_error("this custom template is not supported");
-     }
-
-     // if it turns out that our buffer is too small, we resize it
-     if ((size_t) res > buf.size()) {
-         buf.resize(res);
-         res = llama_chat_apply_template(tmpl.source().c_str(), chat.data(), chat.size(), add_ass, buf.data(), buf.size());
-     }
-
-     std::string formatted_chat(buf.data(), res);
-     return formatted_chat;
- }
-
- std::string common_chat_format_single(
-         const common_chat_template & tmpl,
-         const std::vector<common_chat_msg> & past_msg,
-         const common_chat_msg & new_msg,
-         bool add_ass,
-         bool use_jinja) {
-     std::ostringstream ss;
-     auto fmt_past_msg = past_msg.empty() ? "" : common_chat_apply_template(tmpl, past_msg, false, use_jinja);
-     std::vector<common_chat_msg> chat_new(past_msg);
-     // if the past_msg ends with a newline, we must preserve it in the formatted version
-     if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') {
-         ss << "\n";
-     };
-     // format chat with new_msg
-     chat_new.push_back(new_msg);
-     auto fmt_new_msg = common_chat_apply_template(tmpl, chat_new, add_ass, use_jinja);
-     // get the diff part
-     ss << fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
-     return ss.str();
- }
-
- std::string common_chat_format_example(const common_chat_template & tmpl, bool use_jinja) {
-     std::vector<common_chat_msg> msgs = {
-         {"system", "You are a helpful assistant", {}},
-         {"user", "Hello", {}},
-         {"assistant", "Hi there", {}},
-         {"user", "How are you?", {}},
-     };
-     return common_chat_apply_template(tmpl, msgs, true, use_jinja);
- }
-
- #define CHATML_TEMPLATE_SRC \
-     "{%- for message in messages -%}\n" \
-     " {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>\n' -}}\n" \
-     "{%- endfor -%}\n" \
-     "{%- if add_generation_prompt -%}\n" \
-     " {{- '<|im_start|>assistant\n' -}}\n" \
-     "{%- endif -%}"
-
- common_chat_templates common_chat_templates_from_model(const struct llama_model * model, const std::string & chat_template_override)
- {
-     std::string default_template_src;
-     std::string template_tool_use_src;
-
-     bool has_explicit_template = !chat_template_override.empty();
-     if (chat_template_override.empty()) {
-         auto str = llama_model_chat_template(model, /* name */ nullptr);
-         if (str) {
-             default_template_src = str;
-             has_explicit_template = true;
-         }
-         str = llama_model_chat_template(model, /* name */ "tool_use");
-         if (str) {
-             template_tool_use_src = str;
-             has_explicit_template = true;
-         }
-     } else {
-         default_template_src = chat_template_override;
-     }
-     if (default_template_src.empty() || default_template_src == "chatml") {
-         if (!template_tool_use_src.empty()) {
-             default_template_src = template_tool_use_src;
-         } else {
-             default_template_src = CHATML_TEMPLATE_SRC;
-         }
-     }
-     auto vocab = llama_model_get_vocab(model);
-     const auto get_token = [&](llama_token token, const char * name, const char * jinja_variable_name) {
-         if (token == LLAMA_TOKEN_NULL) {
-             if (default_template_src.find(jinja_variable_name) != std::string::npos
-                 || template_tool_use_src.find(jinja_variable_name) != std::string::npos) {
-                 LOG_WRN("%s: warning: vocab does not have a %s token, jinja template won't work as intended.\n", __func__, name);
-             }
-             return std::string();
-         } else {
-             return common_token_to_piece(vocab, token, true);
-         }
-     };
-     auto token_bos = get_token(llama_vocab_bos(vocab), "BOS", "bos_token");
-     auto token_eos = get_token(llama_vocab_eos(vocab), "EOS", "eos_token");
-     try {
-         return {
-             has_explicit_template,
-             std::make_unique<minja::chat_template>(default_template_src, token_bos, token_eos),
-             template_tool_use_src.empty()
-                 ? nullptr
-                 : std::make_unique<minja::chat_template>(template_tool_use_src, token_bos, token_eos),
-         };
-     } catch (const std::exception & e) {
-         LOG_ERR("%s: failed to parse chat template: %s\n", __func__, e.what());
-         return {
-             has_explicit_template,
-             std::make_unique<minja::chat_template>(CHATML_TEMPLATE_SRC, token_bos, token_eos),
-             nullptr,
-         };
-     }
- }
-
  //
  // KV cache utils
  //
@@ -2196,3 +2030,25 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
      return result;
  }

+ template <>
+ json common_grammar_trigger::to_json() const {
+     json out {
+         {"type", (int) type},
+         {"value", value},
+     };
+     if (type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
+         out["token"] = (int) token;
+     }
+     return out;
+ }
+
+ template <>
+ common_grammar_trigger common_grammar_trigger::from_json(const json & in) {
+     common_grammar_trigger out;
+     out.type = (common_grammar_trigger_type) in.at("type").get<int>();
+     out.value = in.at("value").get<std::string>();
+     if (out.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
+         out.token = (llama_token) in.at("token").get<int>();
+     }
+     return out;
+ }
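As a rough illustration of the new specializations above (the trigger word here is made up, and `json` is the nlohmann::ordered_json alias used in common.cpp), a trigger can be serialized and restored like so:

    // Hedged sketch: round-trip a grammar trigger through JSON.
    common_grammar_trigger trig;
    trig.type  = COMMON_GRAMMAR_TRIGGER_TYPE_WORD;
    trig.value = "<tool_call>";                       // arbitrary example value

    json j = trig.to_json<json>();                    // {"type": 1, "value": "<tool_call>"}
    common_grammar_trigger back = common_grammar_trigger::from_json(j);
    // back.type/back.value match trig; "token" is only written/read for TOKEN triggers.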
@@ -110,9 +110,21 @@ enum common_conversation_mode {
      COMMON_CONVERSATION_MODE_AUTO = 2,
  };

+ enum common_grammar_trigger_type {
+     COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN,
+     COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
+     COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
+     COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START,
+ };
+
  struct common_grammar_trigger {
-     std::string word;
-     bool at_start;
+     common_grammar_trigger_type type;
+     std::string value;
+     llama_token token = LLAMA_TOKEN_NULL;
+
+     // T can only be nlohmann::ordered_json
+     template <class T> T to_json() const;
+     template <class T> static common_grammar_trigger from_json(const T & in);
  };

  // sampling parameters
@@ -163,8 +175,7 @@ struct common_params_sampling {

      std::string grammar; // optional BNF-like grammar to constrain sampling
      bool grammar_lazy = false;
-     std::vector<common_grammar_trigger> grammar_trigger_words; // optional trigger words to trigger lazy grammar
-     std::vector<llama_token> grammar_trigger_tokens; // optional trigger tokens to trigger lazy grammar and print trigger special tokens.
+     std::vector<common_grammar_trigger> grammar_triggers; // optional triggers (for lazy grammars)
      std::set<llama_token> preserved_tokens;

      std::vector<llama_logit_bias> logit_bias; // logit biases to apply
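Putting the two pieces together (the trigger struct above and the new grammar_triggers field here), a lazy-grammar setup might be filled in roughly like this; the grammar text and trigger word are illustrative placeholders, not values from the package:

    // Hedged sketch: constrain sampling with a grammar that only activates after
    // a trigger word appears in the output.
    common_params_sampling sparams;
    sparams.grammar      = "root ::= \"yes\" | \"no\"";   // placeholder GBNF
    sparams.grammar_lazy = true;

    common_grammar_trigger trigger;
    trigger.type  = COMMON_GRAMMAR_TRIGGER_TYPE_WORD;
    trigger.value = "<tool_call>";                         // placeholder trigger word
    sparams.grammar_triggers.push_back(trigger);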
@@ -178,10 +189,10 @@ struct common_params_speculative {

      int32_t n_ctx = 0; // draft context size
      int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
-     int32_t n_min = 5; // minimum number of draft tokens to use for speculative decoding
+     int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding
      int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
      float p_split = 0.1f; // speculative decoding split probability
-     float p_min = 0.9f; // minimum speculative decoding probability (greedy)
+     float p_min = 0.75f; // minimum speculative decoding probability (greedy)

      struct cpu_params cpuparams;
      struct cpu_params cpuparams_batch;
@@ -200,6 +211,8 @@ struct common_params_vocoder {
      std::string model = ""; // model path // NOLINT
      std::string model_url = ""; // model url to download // NOLINT

+     std::string speaker_file = ""; // speaker file path // NOLINT
+
      bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy // NOLINT
  };

@@ -261,6 +274,7 @@ struct common_params {
      std::string hf_repo = ""; // HF repo // NOLINT
      std::string hf_file = ""; // HF file // NOLINT
      std::string prompt = ""; // NOLINT
+     std::string system_prompt = ""; // NOLINT
      std::string prompt_file = ""; // store the external prompt file name // NOLINT
      std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT
      std::string input_prefix = ""; // string to prefix user inputs with // NOLINT
@@ -325,6 +339,8 @@
      bool warmup = true; // warmup run
      bool check_tensors = false; // validate tensor data

+     bool single_turn = false; // single turn chat conversation
+
      ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
      ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V

@@ -391,8 +407,6 @@
      int32_t i_pos = -1; // position of the passkey in the junk text

      // imatrix params
-     std::string out_file = "imatrix.dat"; // save the resulting imatrix to this file
-
      int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations
      int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
      int32_t i_chunk = 0; // start processing from this chunk
@@ -404,16 +418,16 @@
      int n_pca_batch = 100;
      int n_pca_iterations = 1000;
      dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
-     std::string cvector_outfile = "control_vector.gguf";
      std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
      std::string cvector_negative_file = "examples/cvector-generator/negative.txt";

      bool spm_infill = false; // suffix/prefix/middle pattern for infill

-     std::string lora_outfile = "ggml-lora-merged-f16.gguf";
-
      // batched-bench params
      bool batched_bench_output_jsonl = false;
+
+     // common params
+     std::string out_file; // output filename for all example programs
  };

  // call once at the start of a program if it uses libcommon
@@ -453,6 +467,8 @@ std::string string_repeat(const std::string & str, size_t n);

  void string_replace_all(std::string & s, const std::string & search, const std::string & replace);

+ std::string regex_escape(const std::string & s);
+
  template<class T>
  static std::vector<T> string_split(const std::string & str, char delim) {
      static_assert(!std::is_same<T, std::string>::value, "Please use the specialized version for std::string");
@@ -616,62 +632,6 @@ std::string common_detokenize(
      const std::vector<llama_token> & tokens,
      bool special = true);

- //
- // Chat template utils
- //
-
- struct common_tool_call {
-     std::string name;
-     std::string arguments;
-     std::string id;
- };
-
- // same with llama_chat_message, but uses std::string
- struct common_chat_msg {
-     std::string role;
-     std::string content;
-     std::vector<common_tool_call> tool_calls;
-     std::string reasoning_content = "";
- };
-
- // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
- bool common_chat_verify_template(const std::string & tmpl, bool use_jinja);
-
- namespace minja {
-     class chat_template;
- }
-
- typedef minja::chat_template common_chat_template;
-
- struct common_chat_templates {
-     bool has_explicit_template; // Model had builtin template or template overridde was specified.
-     std::unique_ptr<common_chat_template> template_default; // always set (defaults to chatml)
-     std::unique_ptr<common_chat_template> template_tool_use;
- };
-
- // CPP wrapper for llama_chat_apply_template
- // If the built-in template is not supported, we default to chatml
- // If the custom "tmpl" is not supported, we throw an error
- std::string common_chat_apply_template(
-     const common_chat_template & tmpl,
-     const std::vector<common_chat_msg> & chat,
-     bool add_ass,
-     bool use_jinja);
-
- // Format single message, while taking into account the position of that message in chat history
- std::string common_chat_format_single(
-     const common_chat_template & tmpl,
-     const std::vector<common_chat_msg> & past_msg,
-     const common_chat_msg & new_msg,
-     bool add_ass,
-     bool use_jinja);
-
- // Returns an example of formatted chat
- std::string common_chat_format_example(
-     const common_chat_template & tmpl, bool use_jinja);
-
- common_chat_templates common_chat_templates_from_model(const struct llama_model * model, const std::string & chat_template_override);
-
  //
  // KV cache utils
  //
@@ -264,7 +264,7 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
      throw std::runtime_error("At least one of min_value or max_value must be set");
  }

- const std::string SPACE_RULE = "| \" \" | \"\\n\" [ \\t]{0,20}";
+ const std::string SPACE_RULE = "| \" \" | \"\\n\"{1,2} [ \\t]{0,20}";

  struct BuiltinRule {
      std::string content;
@@ -764,11 +764,10 @@
  public:
      SchemaConverter(
          const std::function<json(const std::string &)> & fetch_json,
-         bool dotall,
-         bool compact_spaces)
+         bool dotall)
      : _fetch_json(fetch_json), _dotall(dotall)
      {
-         _rules["space"] = compact_spaces ? "\" \"?" : SPACE_RULE;
+         _rules["space"] = SPACE_RULE;
      }

      void resolve_refs(json & schema, const std::string & url) {
@@ -1007,7 +1006,7 @@ std::string json_schema_to_grammar(const json & schema, bool force_gbnf) {
  }

  std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options) {
-     SchemaConverter converter([&](const std::string &) { return json(); }, options.dotall, options.compact_spaces);
+     SchemaConverter converter([&](const std::string &) { return json(); }, options.dotall);
      common_grammar_builder builder {
          /* .add_rule = */ [&](const std::string & name, const std::string & rule) {
              return converter._add_rule(name, rule);
@@ -16,7 +16,6 @@ struct common_grammar_builder {

  struct common_grammar_options {
      bool dotall = false;
-     bool compact_spaces = false;
  };

  std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options = {});
@@ -1378,13 +1378,27 @@ struct ArgumentsExpression {
      }
  };

- static std::string strip(const std::string & s) {
-     auto start = s.find_first_not_of(" \t\n\r");
+ static std::string strip(const std::string & s, const std::string & chars = "", bool left = true, bool right = true) {
+     auto charset = chars.empty() ? " \t\n\r" : chars;
+     auto start = left ? s.find_first_not_of(charset) : 0;
      if (start == std::string::npos) return "";
-     auto end = s.find_last_not_of(" \t\n\r");
+     auto end = right ? s.find_last_not_of(charset) : s.size() - 1;
      return s.substr(start, end - start + 1);
  }

+ static std::vector<std::string> split(const std::string & s, const std::string & sep) {
+     std::vector<std::string> result;
+     size_t start = 0;
+     size_t end = s.find(sep);
+     while (end != std::string::npos) {
+         result.push_back(s.substr(start, end - start));
+         start = end + sep.length();
+         end = s.find(sep, start);
+     }
+     result.push_back(s.substr(start));
+     return result;
+ }
+
  static std::string capitalize(const std::string & s) {
      if (s.empty()) return s;
      auto result = s;
@@ -1467,8 +1481,26 @@
  } else if (obj.is_string()) {
      auto str = obj.get<std::string>();
      if (method->get_name() == "strip") {
-         vargs.expectArgs("strip method", {0, 0}, {0, 0});
-         return Value(strip(str));
+         vargs.expectArgs("strip method", {0, 1}, {0, 0});
+         auto chars = vargs.args.empty() ? "" : vargs.args[0].get<std::string>();
+         return Value(strip(str, chars));
+     } else if (method->get_name() == "lstrip") {
+         vargs.expectArgs("lstrip method", {0, 1}, {0, 0});
+         auto chars = vargs.args.empty() ? "" : vargs.args[0].get<std::string>();
+         return Value(strip(str, chars, /* left= */ true, /* right= */ false));
+     } else if (method->get_name() == "rstrip") {
+         vargs.expectArgs("rstrip method", {0, 1}, {0, 0});
+         auto chars = vargs.args.empty() ? "" : vargs.args[0].get<std::string>();
+         return Value(strip(str, chars, /* left= */ false, /* right= */ true));
+     } else if (method->get_name() == "split") {
+         vargs.expectArgs("split method", {1, 1}, {0, 0});
+         auto sep = vargs.args[0].get<std::string>();
+         auto parts = split(str, sep);
+         Value result = Value::array();
+         for (const auto& part : parts) {
+             result.push_back(Value(part));
+         }
+         return result;
      } else if (method->get_name() == "capitalize") {
          vargs.expectArgs("capitalize method", {0, 0}, {0, 0});
          return Value(capitalize(str));
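To make the intent of the new minja string helpers concrete, here is a small sketch of what the C++ functions in the hunks above compute; the inputs are made-up examples, and templates reach the same behavior through Jinja-style calls such as "...".strip() or "...".split(","):

    // Hedged sketch: direct calls to the helpers added above.
    std::string t = strip("  hello  ");                  // "hello"
    std::string l = strip("xx-yy", "x", true, false);    // "-yy" (lstrip semantics)
    std::vector<std::string> p = split("a,b,c", ",");    // {"a", "b", "c"}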
@@ -7,6 +7,7 @@
  #include <cstdio>
  #include <fstream>
  #include <thread>
+ #include <algorithm>

  void common_ngram_cache_update(common_ngram_cache & ngram_cache, int ngram_min, int ngram_max,
      std::vector<llama_token> & inp, int nnew, bool print_progress) {