@fugood/llama.node 0.3.9 → 0.3.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.js +2 -2
  18. package/lib/binding.ts +46 -8
  19. package/lib/index.ts +3 -1
  20. package/package.json +8 -1
  21. package/src/LlamaCompletionWorker.cpp +33 -6
  22. package/src/LlamaCompletionWorker.h +3 -1
  23. package/src/LlamaContext.cpp +292 -28
  24. package/src/LlamaContext.h +1 -0
  25. package/src/common.hpp +19 -2
  26. package/src/llama.cpp/.github/workflows/build.yml +289 -107
  27. package/src/llama.cpp/.github/workflows/close-issue.yml +1 -1
  28. package/src/llama.cpp/.github/workflows/docker.yml +2 -1
  29. package/src/llama.cpp/.github/workflows/server.yml +25 -2
  30. package/src/llama.cpp/CMakeLists.txt +10 -19
  31. package/src/llama.cpp/cmake/build-info.cmake +1 -1
  32. package/src/llama.cpp/common/CMakeLists.txt +32 -0
  33. package/src/llama.cpp/common/arg.cpp +66 -16
  34. package/src/llama.cpp/common/chat-template.hpp +515 -0
  35. package/src/llama.cpp/common/chat.cpp +966 -0
  36. package/src/llama.cpp/common/chat.hpp +52 -0
  37. package/src/llama.cpp/common/common.cpp +159 -36
  38. package/src/llama.cpp/common/common.h +56 -14
  39. package/src/llama.cpp/common/json-schema-to-grammar.cpp +46 -66
  40. package/src/llama.cpp/common/json-schema-to-grammar.h +15 -1
  41. package/src/llama.cpp/common/llguidance.cpp +270 -0
  42. package/src/llama.cpp/common/log.cpp +1 -10
  43. package/src/llama.cpp/common/log.h +10 -0
  44. package/src/llama.cpp/common/minja.hpp +2868 -0
  45. package/src/llama.cpp/common/sampling.cpp +22 -1
  46. package/src/llama.cpp/common/sampling.h +3 -0
  47. package/src/llama.cpp/docs/build.md +54 -9
  48. package/src/llama.cpp/examples/export-lora/export-lora.cpp +12 -2
  49. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +1 -1
  50. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  51. package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +59 -0
  52. package/src/llama.cpp/examples/llava/clip.cpp +133 -14
  53. package/src/llama.cpp/examples/llava/clip.h +2 -0
  54. package/src/llama.cpp/examples/llava/llava.cpp +22 -8
  55. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +9 -1
  56. package/src/llama.cpp/examples/main/main.cpp +26 -25
  57. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +136 -137
  58. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +18 -4
  59. package/src/llama.cpp/examples/run/run.cpp +224 -69
  60. package/src/llama.cpp/examples/server/server.cpp +252 -81
  61. package/src/llama.cpp/examples/server/utils.hpp +73 -21
  62. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +6 -4
  63. package/src/llama.cpp/examples/simple-cmake-pkg/CMakeLists.txt +11 -0
  64. package/src/llama.cpp/ggml/CMakeLists.txt +78 -1
  65. package/src/llama.cpp/ggml/include/ggml.h +1 -1
  66. package/src/llama.cpp/ggml/src/CMakeLists.txt +21 -4
  67. package/src/llama.cpp/ggml/src/ggml-alloc.c +1 -13
  68. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +91 -78
  69. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +7 -7
  70. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -1
  71. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +1 -1
  72. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +46 -0
  73. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +16 -1
  74. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +1 -1
  75. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +28 -8
  76. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +5 -7
  77. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +33 -23
  78. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +1 -5
  79. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +323 -121
  80. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +13 -3
  81. package/src/llama.cpp/ggml/src/ggml.c +23 -13
  82. package/src/llama.cpp/include/llama.h +14 -1
  83. package/src/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.inp +112 -0
  84. package/src/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.out +46 -0
  85. package/src/llama.cpp/src/CMakeLists.txt +1 -1
  86. package/src/llama.cpp/src/llama-arch.cpp +7 -2
  87. package/src/llama.cpp/src/llama-arch.h +3 -1
  88. package/src/llama.cpp/src/llama-chat.cpp +11 -2
  89. package/src/llama.cpp/src/llama-chat.h +1 -0
  90. package/src/llama.cpp/src/llama-grammar.cpp +86 -6
  91. package/src/llama.cpp/src/llama-grammar.h +22 -1
  92. package/src/llama.cpp/src/llama-mmap.cpp +1 -0
  93. package/src/llama.cpp/src/llama-model-loader.cpp +1 -1
  94. package/src/llama.cpp/src/llama-model.cpp +76 -6
  95. package/src/llama.cpp/src/llama-sampling.cpp +47 -4
  96. package/src/llama.cpp/src/llama-vocab.cpp +10 -4
  97. package/src/llama.cpp/src/llama.cpp +181 -123
  98. package/src/llama.cpp/tests/CMakeLists.txt +4 -0
  99. package/src/llama.cpp/tests/test-backend-ops.cpp +158 -57
  100. package/src/llama.cpp/tests/test-chat-template.cpp +154 -31
  101. package/src/llama.cpp/tests/test-chat.cpp +607 -0
  102. package/src/llama.cpp/tests/test-grammar-integration.cpp +2 -2
  103. package/src/llama.cpp/tests/test-grammar-llguidance.cpp +1140 -0
  104. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +1 -1
  105. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +0 -32
@@ -0,0 +1,52 @@
+ // Chat support (incl. tool call grammar constraining & output parsing) w/ generic & custom template handlers.
+
+ #pragma once
+
+ #include "common.h"
+ #include <json.hpp>
+ #include <optional>
+ #include <string>
+ #include <vector>
+
+ using json = nlohmann::ordered_json;
+
+ struct common_chat_inputs {
+     json messages;
+     json tools;
+     json tool_choice;
+     json json_schema;
+     bool parallel_tool_calls;
+     bool stream;
+     std::string grammar;
+     bool add_generation_prompt = true;
+ };
+
+ enum common_chat_format {
+     COMMON_CHAT_FORMAT_CONTENT_ONLY,
+     COMMON_CHAT_FORMAT_GENERIC,
+     COMMON_CHAT_FORMAT_MISTRAL_NEMO,
+     COMMON_CHAT_FORMAT_LLAMA_3_X,
+     COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
+     COMMON_CHAT_FORMAT_DEEPSEEK_R1,
+     COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
+     COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
+     COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
+     COMMON_CHAT_FORMAT_HERMES_2_PRO,
+     COMMON_CHAT_FORMAT_COMMAND_R7B,
+
+     COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
+ };
+
+ struct common_chat_params {
+     common_chat_format format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
+     json prompt;
+     std::string grammar;
+     bool grammar_lazy = false;
+     std::vector<common_grammar_trigger> grammar_triggers;
+     std::vector<std::string> preserved_tokens;
+     std::vector<std::string> additional_stops;
+ };
+
+ struct common_chat_params common_chat_params_init(const common_chat_template & tmpl, const struct common_chat_inputs & params);
+ std::string common_chat_format_name(common_chat_format format);
+ common_chat_msg common_chat_parse(const std::string & input, common_chat_format format);
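The header above is the new public surface for chat-format handling. A rough usage sketch, not part of the diff: the template construction mirrors the verify-template path added to common.cpp further down, and the tool-definition JSON shape is an assumption.

// Hypothetical caller of the new chat.hpp API declared above.
#include "chat.hpp"
#include "chat-template.hpp"

static std::string render_prompt(const common_chat_template & tmpl) {
    common_chat_inputs inputs;
    inputs.messages = json::array({
        {{"role", "user"}, {"content", "What's the weather in Paris?"}},
    });
    inputs.tools = json::array();        // OpenAI-style tool definitions would go here (assumed shape)
    inputs.add_generation_prompt = true;

    // Produces the rendered prompt plus (when tools are present) a grammar,
    // lazy-grammar triggers and extra stop strings for constrained decoding.
    common_chat_params params = common_chat_params_init(tmpl, inputs);
    return params.prompt;                // json -> std::string, as common_chat_apply_template does below
}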
@@ -12,6 +12,8 @@
  #include "json.hpp"
  #include "json-schema-to-grammar.h"
  #include "llama.h"
+ #include "chat.hpp"
+ #include "chat-template.hpp"

  #include <algorithm>
  #include <cinttypes>
@@ -483,6 +485,48 @@ void string_replace_all(std::string & s, const std::string & search, const std::
      s = std::move(builder);
  }

+ std::string string_join(const std::vector<std::string> & values, const std::string & separator) {
+     std::ostringstream result;
+     for (size_t i = 0; i < values.size(); ++i) {
+         if (i > 0) {
+             result << separator;
+         }
+         result << values[i];
+     }
+     return result.str();
+ }
+
+ std::vector<std::string> string_split(const std::string & str, const std::string & delimiter) {
+     std::vector<std::string> parts;
+     size_t start = 0;
+     size_t end = str.find(delimiter);
+
+     while (end != std::string::npos) {
+         parts.push_back(str.substr(start, end - start));
+         start = end + delimiter.length();
+         end = str.find(delimiter, start);
+     }
+
+     parts.push_back(str.substr(start));
+
+     return parts;
+ }
+
+ std::string string_repeat(const std::string & str, size_t n) {
+     if (n == 0) {
+         return "";
+     }
+
+     std::string result;
+     result.reserve(str.length() * n);
+
+     for (size_t i = 0; i < n; ++i) {
+         result += str;
+     }
+
+     return result;
+ }
+
  std::string string_from(bool value) {
      return value ? "true" : "false";
  }
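For reference, the new string helpers behave as follows; a quick sanity sketch inferred from the implementations above, not code from the diff.

#include <cassert>
#include <string>
#include <vector>
#include "common.h"

// Expected behaviour of string_join / string_split / string_repeat added above.
static void string_helpers_example() {
    assert(string_join({"a", "b", "c"}, ", ") == "a, b, c");
    assert(string_split("1.2.3", ".") == (std::vector<std::string>{"1", "2", "3"}));
    assert(string_repeat("ab", 3) == "ababab");
    // A delimiter that never occurs yields the whole input as a single part.
    assert(string_split("abc", ",") == (std::vector<std::string>{"abc"}));
}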
@@ -1728,67 +1772,80 @@ std::string common_detokenize(const struct llama_vocab * vocab, const std::vecto
  // Chat template utils
  //

- std::string common_get_builtin_chat_template(const struct llama_model * model) {
-     const char * ptr_tmpl = llama_model_chat_template(model);
-     return ptr_tmpl == nullptr ? "" : ptr_tmpl;
- }
-
- bool common_chat_verify_template(const std::string & tmpl) {
+ bool common_chat_verify_template(const std::string & tmpl, bool use_jinja) {
+     if (use_jinja) {
+         try {
+             auto chat_template = common_chat_template(tmpl, "<s>", "</s>");
+             common_chat_inputs inputs;
+             inputs.messages = json::array({{
+                 {"role", "user"},
+                 {"content", "test"},
+             }});
+             common_chat_params_init(chat_template, inputs);
+             return true;
+         } catch (const std::exception & e) {
+             LOG_ERR("%s: failed to apply template: %s\n", __func__, e.what());
+             return false;
+         }
+     }
      llama_chat_message chat[] = {{"user", "test"}};
      const int res = llama_chat_apply_template(tmpl.c_str(), chat, 1, true, nullptr, 0);
      return res >= 0;
  }

- std::string common_chat_apply_template(const struct llama_model * model,
-     const std::string & tmpl,
+ std::string common_chat_apply_template(
+     const common_chat_template & tmpl,
      const std::vector<common_chat_msg> & msgs,
-     bool add_ass) {
+     bool add_ass,
+     bool use_jinja) {
+     if (use_jinja) {
+         auto messages = json::array();
+         for (const auto & msg : msgs) {
+             messages.push_back({{"role", msg.role}, {"content", msg.content}});
+         }
+         common_chat_inputs inputs;
+         inputs.messages = messages;
+         inputs.add_generation_prompt = add_ass;
+         return common_chat_params_init(tmpl, inputs).prompt;
+     }
+
      int alloc_size = 0;
-     bool fallback = false; // indicate if we must fallback to default chatml
      std::vector<llama_chat_message> chat;
      for (const auto & msg : msgs) {
          chat.push_back({msg.role.c_str(), msg.content.c_str()});
          alloc_size += (msg.role.size() + msg.content.size()) * 1.25;
      }

-     const char * ptr_tmpl = tmpl.empty() ? llama_model_chat_template(model) : tmpl.c_str();
      std::vector<char> buf(alloc_size);

      // run the first time to get the total output length
-     int32_t res = llama_chat_apply_template(ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+     int32_t res = llama_chat_apply_template(tmpl.source().c_str(), chat.data(), chat.size(), add_ass, buf.data(), buf.size());

      // error: chat template is not supported
      if (res < 0) {
-         if (ptr_tmpl != nullptr) {
-             // if the custom "tmpl" is not supported, we throw an error
-             // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
-             throw std::runtime_error("this custom template is not supported");
-         }
-
-         // If the built-in template is not supported, we default to chatml
-         res = llama_chat_apply_template("chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
-         fallback = true;
+         // if the custom "tmpl" is not supported, we throw an error
+         // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
+         throw std::runtime_error("this custom template is not supported");
      }

      // if it turns out that our buffer is too small, we resize it
      if ((size_t) res > buf.size()) {
          buf.resize(res);
-         res = llama_chat_apply_template(
-             fallback ? "chatml" : ptr_tmpl,
-             chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+         res = llama_chat_apply_template(tmpl.source().c_str(), chat.data(), chat.size(), add_ass, buf.data(), buf.size());
      }

      std::string formatted_chat(buf.data(), res);
      return formatted_chat;
  }

- std::string common_chat_format_single(const struct llama_model * model,
-     const std::string & tmpl,
+ std::string common_chat_format_single(
+     const common_chat_template & tmpl,
      const std::vector<common_chat_msg> & past_msg,
      const common_chat_msg & new_msg,
-     bool add_ass) {
+     bool add_ass,
+     bool use_jinja) {
      std::ostringstream ss;
-     auto fmt_past_msg = past_msg.empty() ? "" : common_chat_apply_template(model, tmpl, past_msg, false);
+     auto fmt_past_msg = past_msg.empty() ? "" : common_chat_apply_template(tmpl, past_msg, false, use_jinja);
      std::vector<common_chat_msg> chat_new(past_msg);
      // if the past_msg ends with a newline, we must preserve it in the formatted version
      if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') {
@@ -1796,21 +1853,87 @@ std::string common_chat_format_single(const struct llama_model * model,
      };
      // format chat with new_msg
      chat_new.push_back(new_msg);
-     auto fmt_new_msg = common_chat_apply_template(model, tmpl, chat_new, add_ass);
+     auto fmt_new_msg = common_chat_apply_template(tmpl, chat_new, add_ass, use_jinja);
      // get the diff part
      ss << fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
      return ss.str();
  }

- std::string common_chat_format_example(const struct llama_model * model,
-     const std::string & tmpl) {
+ std::string common_chat_format_example(const common_chat_template & tmpl, bool use_jinja) {
      std::vector<common_chat_msg> msgs = {
-         {"system", "You are a helpful assistant"},
-         {"user", "Hello"},
-         {"assistant", "Hi there"},
-         {"user", "How are you?"},
+         {"system", "You are a helpful assistant", {}},
+         {"user", "Hello", {}},
+         {"assistant", "Hi there", {}},
+         {"user", "How are you?", {}},
      };
-     return common_chat_apply_template(model, tmpl, msgs, true);
+     return common_chat_apply_template(tmpl, msgs, true, use_jinja);
+ }
+
+ #define CHATML_TEMPLATE_SRC \
+     "{%- for message in messages -%}\n" \
+     " {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>\n' -}}\n" \
+     "{%- endfor -%}\n" \
+     "{%- if add_generation_prompt -%}\n" \
+     " {{- '<|im_start|>assistant\n' -}}\n" \
+     "{%- endif -%}"
+
+ common_chat_templates common_chat_templates_from_model(const struct llama_model * model, const std::string & chat_template_override)
+ {
+     std::string default_template_src;
+     std::string template_tool_use_src;
+
+     bool has_explicit_template = !chat_template_override.empty();
+     if (chat_template_override.empty()) {
+         auto str = llama_model_chat_template(model, /* name */ nullptr);
+         if (str) {
+             default_template_src = str;
+             has_explicit_template = true;
+         }
+         str = llama_model_chat_template(model, /* name */ "tool_use");
+         if (str) {
+             template_tool_use_src = str;
+             has_explicit_template = true;
+         }
+     } else {
+         default_template_src = chat_template_override;
+     }
+     if (default_template_src.empty() || default_template_src == "chatml") {
+         if (!template_tool_use_src.empty()) {
+             default_template_src = template_tool_use_src;
+         } else {
+             default_template_src = CHATML_TEMPLATE_SRC;
+         }
+     }
+     auto vocab = llama_model_get_vocab(model);
+     const auto get_token = [&](llama_token token, const char * name, const char * jinja_variable_name) {
+         if (token == LLAMA_TOKEN_NULL) {
+             if (default_template_src.find(jinja_variable_name) != std::string::npos
+                 || template_tool_use_src.find(jinja_variable_name) != std::string::npos) {
+                 LOG_WRN("%s: warning: vocab does not have a %s token, jinja template won't work as intended.\n", __func__, name);
+             }
+             return std::string();
+         } else {
+             return common_token_to_piece(vocab, token, true);
+         }
+     };
+     auto token_bos = get_token(llama_vocab_bos(vocab), "BOS", "bos_token");
+     auto token_eos = get_token(llama_vocab_eos(vocab), "EOS", "eos_token");
+     try {
+         return {
+             has_explicit_template,
+             std::make_unique<minja::chat_template>(default_template_src, token_bos, token_eos),
+             template_tool_use_src.empty()
+                 ? nullptr
+                 : std::make_unique<minja::chat_template>(template_tool_use_src, token_bos, token_eos),
+         };
+     } catch (const std::exception & e) {
+         LOG_ERR("%s: failed to parse chat template: %s\n", __func__, e.what());
+         return {
+             has_explicit_template,
+             std::make_unique<minja::chat_template>(CHATML_TEMPLATE_SRC, token_bos, token_eos),
+             nullptr,
+         };
+     }
  }

  //
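Taken together with the declarations added to common.h below, a caller would typically load the templates once per model and render through either path. A sketch under those assumptions, not code from the diff:

#include "common.h"
#include "chat-template.hpp"
#include "log.h"

// Hypothetical preview helper; `params` is a populated common_params.
static void preview_chat_template(const struct llama_model * model, const common_params & params) {
    common_chat_templates templates = common_chat_templates_from_model(model, params.chat_template);
    // template_default is always set; it falls back to ChatML when the model ships no template.
    const common_chat_template & tmpl = *templates.template_default;
    LOG_INF("example prompt:\n%s\n", common_chat_format_example(tmpl, params.use_jinja).c_str());
}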
@@ -4,6 +4,7 @@

  #include "llama-cpp.h"

+ #include <set>
  #include <string>
  #include <vector>
  #include <sstream>
@@ -109,6 +110,11 @@ enum common_conversation_mode {
      COMMON_CONVERSATION_MODE_AUTO = 2,
  };

+ struct common_grammar_trigger {
+     std::string word;
+     bool at_start;
+ };
+
  // sampling parameters
  struct common_params_sampling {
      uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
@@ -154,7 +160,11 @@
      COMMON_SAMPLER_TYPE_TEMPERATURE,
  };

-     std::string grammar; // optional BNF-like grammar to constrain sampling
+     std::string grammar; // optional BNF-like grammar to constrain sampling
+     bool grammar_lazy = false;
+     std::vector<common_grammar_trigger> grammar_trigger_words; // optional trigger words to trigger lazy grammar
+     std::vector<llama_token> grammar_trigger_tokens; // optional trigger tokens to trigger lazy grammar and print trigger special tokens.
+     std::set<llama_token> preserved_tokens;

      std::vector<llama_logit_bias> logit_bias; // logit biases to apply

@@ -175,7 +185,11 @@ struct common_params_speculative {
      struct cpu_params cpuparams;
      struct cpu_params cpuparams_batch;

-     std::string model = ""; // draft model for speculative decoding // NOLINT
+     std::string hf_repo = ""; // HF repo // NOLINT
+     std::string hf_file = ""; // HF file // NOLINT
+
+     std::string model = ""; // draft model for speculative decoding // NOLINT
+     std::string model_url = ""; // model url to download // NOLINT
  };

  struct common_params_vocoder {
@@ -330,6 +344,7 @@ struct common_params {
      std::string hostname = "127.0.0.1";
      std::string public_path = ""; // NOLINT
      std::string chat_template = ""; // NOLINT
+     bool use_jinja = false; // NOLINT
      bool enable_chat_template = true;

      std::vector<std::string> api_keys;
@@ -424,6 +439,10 @@ std::string string_format(const char * fmt, ...);
  std::string string_strip(const std::string & str);
  std::string string_get_sortable_timestamp();

+ std::string string_join(const std::vector<std::string> & values, const std::string & separator);
+ std::vector<std::string> string_split(const std::string & str, const std::string & delimiter);
+ std::string string_repeat(const std::string & str, size_t n);
+
  void string_replace_all(std::string & s, const std::string & search, const std::string & replace);

  template<class T>
@@ -508,12 +527,14 @@ struct llama_model * common_load_model_from_url(
      const std::string & local_path,
      const std::string & hf_token,
      const struct llama_model_params & params);
+
  struct llama_model * common_load_model_from_hf(
      const std::string & repo,
      const std::string & remote_path,
      const std::string & local_path,
      const std::string & hf_token,
      const struct llama_model_params & params);
+
  std::pair<std::string, std::string> common_get_hf_file(
      const std::string & hf_repo_with_tag,
      const std::string & hf_token);
@@ -591,36 +612,57 @@ std::string common_detokenize(
  // Chat template utils
  //

+ struct common_tool_call {
+     std::string name;
+     std::string arguments;
+     std::string id;
+ };
+
  // same with llama_chat_message, but uses std::string
  struct common_chat_msg {
      std::string role;
      std::string content;
+     std::vector<common_tool_call> tool_calls;
+     std::string tool_plan = "";
  };

- // Get the built-in chat template for the model. Return empty string if not present.
- std::string common_get_builtin_chat_template(const struct llama_model * model);
-
  // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
- bool common_chat_verify_template(const std::string & tmpl);
+ bool common_chat_verify_template(const std::string & tmpl, bool use_jinja);
+
+ namespace minja {
+     class chat_template;
+ }
+
+ typedef minja::chat_template common_chat_template;
+
+ struct common_chat_templates {
+     bool has_explicit_template; // Model had builtin template or template overridde was specified.
+     std::unique_ptr<common_chat_template> template_default; // always set (defaults to chatml)
+     std::unique_ptr<common_chat_template> template_tool_use;
+ };

  // CPP wrapper for llama_chat_apply_template
  // If the built-in template is not supported, we default to chatml
  // If the custom "tmpl" is not supported, we throw an error
- std::string common_chat_apply_template(const struct llama_model * model,
-     const std::string & tmpl,
+ std::string common_chat_apply_template(
+     const common_chat_template & tmpl,
      const std::vector<common_chat_msg> & chat,
-     bool add_ass);
+     bool add_ass,
+     bool use_jinja);

  // Format single message, while taking into account the position of that message in chat history
- std::string common_chat_format_single(const struct llama_model * model,
-     const std::string & tmpl,
+ std::string common_chat_format_single(
+     const common_chat_template & tmpl,
      const std::vector<common_chat_msg> & past_msg,
      const common_chat_msg & new_msg,
-     bool add_ass);
+     bool add_ass,
+     bool use_jinja);

  // Returns an example of formatted chat
- std::string common_chat_format_example(const struct llama_model * model,
-     const std::string & tmpl);
+ std::string common_chat_format_example(
+     const common_chat_template & tmpl, bool use_jinja);
+
+ common_chat_templates common_chat_templates_from_model(const struct llama_model * model, const std::string & chat_template_override);

  //
  // KV cache utils
@@ -1,4 +1,6 @@
  #include "json-schema-to-grammar.h"
+ #include "common.h"
+
  #include <algorithm>
  #include <fstream>
  #include <map>
@@ -11,11 +13,6 @@

  using json = nlohmann::ordered_json;

- template <typename Iterator>
- static std::string join(Iterator begin, Iterator end, const std::string & separator);
-
- static std::string repeat(const std::string & str, size_t n);
-
  static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "") {
      auto has_max = max_items != std::numeric_limits<int>::max();

@@ -128,8 +125,8 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
      if (sub_len > 0) {
          auto from_sub = from.substr(i + 1);
          auto to_sub = to.substr(i + 1);
-         auto sub_zeros = repeat("0", sub_len);
-         auto sub_nines = repeat("9", sub_len);
+         auto sub_zeros = string_repeat("0", sub_len);
+         auto sub_nines = string_repeat("9", sub_len);

          auto to_reached = false;
          out << "(";
@@ -188,8 +185,8 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
      auto max_digits = max_s.length();

      for (auto digits = min_digits; digits < max_digits; digits++) {
-         uniform_range(min_s, repeat("9", digits));
-         min_s = "1" + repeat("0", digits);
+         uniform_range(min_s, string_repeat("9", digits));
+         min_s = "1" + string_repeat("0", digits);
          out << " | ";
      }
      uniform_range(min_s, max_s);
@@ -318,49 +315,6 @@ std::unordered_map<char, std::string> GRAMMAR_LITERAL_ESCAPES = {
  std::unordered_set<char> NON_LITERAL_SET = {'|', '.', '(', ')', '[', ']', '{', '}', '*', '+', '?'};
  std::unordered_set<char> ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'^', '$', '.', '[', ']', '(', ')', '|', '{', '}', '*', '+', '?'};

- template <typename Iterator>
- std::string join(Iterator begin, Iterator end, const std::string & separator) {
-     std::ostringstream result;
-     if (begin != end) {
-         result << *begin;
-         for (Iterator it = begin + 1; it != end; ++it) {
-             result << separator << *it;
-         }
-     }
-     return result.str();
- }
-
- static std::vector<std::string> split(const std::string & str, const std::string & delimiter) {
-     std::vector<std::string> tokens;
-     size_t start = 0;
-     size_t end = str.find(delimiter);
-
-     while (end != std::string::npos) {
-         tokens.push_back(str.substr(start, end - start));
-         start = end + delimiter.length();
-         end = str.find(delimiter, start);
-     }
-
-     tokens.push_back(str.substr(start));
-
-     return tokens;
- }
-
- static std::string repeat(const std::string & str, size_t n) {
-     if (n == 0) {
-         return "";
-     }
-
-     std::string result;
-     result.reserve(str.length() * n);
-
-     for (size_t i = 0; i < n; ++i) {
-         result += str;
-     }
-
-     return result;
- }
-
  static std::string replacePattern(const std::string & input, const std::regex & regex, const std::function<std::string(const std::smatch &)> & replacement) {
      std::smatch match;
      std::string result;
@@ -389,6 +343,7 @@ static std::string format_literal(const std::string & literal) {

  class SchemaConverter {
  private:
+     friend std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options);
      std::function<json(const std::string &)> _fetch_json;
      bool _dotall;
      std::map<std::string, std::string> _rules;
@@ -418,7 +373,7 @@
      for (size_t i = 0; i < alt_schemas.size(); i++) {
          rules.push_back(visit(alt_schemas[i], name + (name.empty() ? "alternative-" : "-") + std::to_string(i)));
      }
-     return join(rules.begin(), rules.end(), " | ");
+     return string_join(rules, " | ");
  }

  std::string _visit_pattern(const std::string & pattern, const std::string & name) {
@@ -481,7 +436,7 @@
      for (const auto & item : ret) {
          results.push_back(to_rule(item));
      }
-     return std::make_pair(join(results.begin(), results.end(), " "), false);
+     return std::make_pair(string_join(results, " "), false);
  };

  while (i < length) {
@@ -539,7 +494,7 @@
      }
      curly_brackets += '}';
      i++;
-     auto nums = split(curly_brackets.substr(1, curly_brackets.length() - 2), ",");
+     auto nums = string_split(curly_brackets.substr(1, curly_brackets.length() - 2), ",");
      int min_times = 0;
      int max_times = std::numeric_limits<int>::max();
      try {
@@ -809,10 +764,11 @@
  public:
      SchemaConverter(
          const std::function<json(const std::string &)> & fetch_json,
-         bool dotall)
+         bool dotall,
+         bool compact_spaces)
          : _fetch_json(fetch_json), _dotall(dotall)
      {
-         _rules["space"] = SPACE_RULE;
+         _rules["space"] = compact_spaces ? "\" \"?" : SPACE_RULE;
      }

      void resolve_refs(json & schema, const std::string & url) {
@@ -854,7 +810,7 @@
          return;
      }
      std::string pointer = ref.substr(ref.find('#') + 1);
-     std::vector<std::string> tokens = split(pointer, "/");
+     std::vector<std::string> tokens = string_split(pointer, "/");
      for (size_t i = 1; i < tokens.size(); ++i) {
          std::string sel = tokens[i];
          if (target.is_null() || !target.contains(sel)) {
@@ -905,7 +861,7 @@
      for (const auto & v : schema["enum"]) {
          enum_values.push_back(_generate_constant_rule(v));
      }
-     return _add_rule(rule_name, "(" + join(enum_values.begin(), enum_values.end(), " | ") + ") space");
+     return _add_rule(rule_name, "(" + string_join(enum_values, " | ") + ") space");
  } else if ((schema_type.is_null() || schema_type == "object")
      && (schema.contains("properties") ||
      (schema.contains("additionalProperties") && schema["additionalProperties"] != true))) {
@@ -1019,10 +975,10 @@

  void check_errors() {
      if (!_errors.empty()) {
-         throw std::runtime_error("JSON schema conversion failed:\n" + join(_errors.begin(), _errors.end(), "\n"));
+         throw std::runtime_error("JSON schema conversion failed:\n" + string_join(_errors, "\n"));
      }
      if (!_warnings.empty()) {
-         fprintf(stderr, "WARNING: JSON schema conversion was incomplete: %s\n", join(_warnings.begin(), _warnings.end(), "; ").c_str());
+         fprintf(stderr, "WARNING: JSON schema conversion was incomplete: %s\n", string_join(_warnings, "; ").c_str());
      }
  }

@@ -1035,11 +991,35 @@
  }
  };

- std::string json_schema_to_grammar(const json & schema) {
-     SchemaConverter converter([](const std::string &) { return json::object(); }, /* dotall= */ false);
-     auto copy = schema;
-     converter.resolve_refs(copy, "input");
-     converter.visit(copy, "");
+ std::string json_schema_to_grammar(const json & schema, bool force_gbnf) {
+ #ifdef LLAMA_USE_LLGUIDANCE
+     if (!force_gbnf) {
+         return "%llguidance {}\nstart: %json " + schema.dump();
+     }
+ #else
+     (void)force_gbnf;
+ #endif // LLAMA_USE_LLGUIDANCE
+     return build_grammar([&](const common_grammar_builder & callbacks) {
+         auto copy = schema;
+         callbacks.resolve_refs(copy);
+         callbacks.add_schema("", copy);
+     });
+ }
+
+ std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options) {
+     SchemaConverter converter([&](const std::string &) { return json(); }, options.dotall, options.compact_spaces);
+     common_grammar_builder builder {
+         /* .add_rule = */ [&](const std::string & name, const std::string & rule) {
+             return converter._add_rule(name, rule);
+         },
+         /* .add_schema = */ [&](const std::string & name, const nlohmann::ordered_json & schema) {
+             return converter.visit(schema, name == "root" ? "" : name);
+         },
+         /* .resolve_refs = */ [&](nlohmann::ordered_json & schema) {
+             converter.resolve_refs(schema, "");
+         }
+     };
+     cb(builder);
      converter.check_errors();
      return converter.format_grammar();
  }
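The new build_grammar() entry point exposes the SchemaConverter through the common_grammar_builder callbacks, so callers (such as the chat handlers in the new chat.cpp) can mix literal rules with JSON-schema-derived ones. A hedged sketch follows; the option defaults and the exact member types of common_grammar_builder/common_grammar_options live in json-schema-to-grammar.h, which this diff only shows in part.

#include "json-schema-to-grammar.h"

// Hypothetical caller: wrap a tool's argument schema in literal tool-call markers.
static std::string tool_call_grammar(const nlohmann::ordered_json & args_schema) {
    common_grammar_options options;
    options.dotall = false;
    options.compact_spaces = true;   // emits "\" \"?" for the space rule instead of SPACE_RULE

    return build_grammar([&](const common_grammar_builder & builder) {
        auto schema = args_schema;
        builder.resolve_refs(schema);
        auto args_rule = builder.add_schema("args", schema);
        builder.add_rule("root", "\"<tool_call>\" space " + args_rule + " space \"</tool_call>\"");
    }, options);
}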