@fugood/llama.node 1.0.0-beta.5 → 1.0.0-beta.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110)
  1. package/lib/binding.ts +1 -1
  2. package/package.json +14 -14
  3. package/scripts/llama.cpp.patch +27 -26
  4. package/src/LlamaCompletionWorker.cpp +21 -4
  5. package/src/LlamaCompletionWorker.h +2 -0
  6. package/src/LlamaContext.cpp +3 -12
  7. package/src/common.hpp +6 -5
  8. package/src/llama.cpp/CMakeLists.txt +15 -4
  9. package/src/llama.cpp/common/CMakeLists.txt +15 -24
  10. package/src/llama.cpp/common/arg.cpp +172 -110
  11. package/src/llama.cpp/common/chat-parser.cpp +385 -0
  12. package/src/llama.cpp/common/chat-parser.h +120 -0
  13. package/src/llama.cpp/common/chat.cpp +726 -596
  14. package/src/llama.cpp/common/chat.h +74 -8
  15. package/src/llama.cpp/common/common.cpp +56 -38
  16. package/src/llama.cpp/common/common.h +9 -3
  17. package/src/llama.cpp/common/json-partial.cpp +256 -0
  18. package/src/llama.cpp/common/json-partial.h +38 -0
  19. package/src/llama.cpp/common/json-schema-to-grammar.cpp +2 -1
  20. package/src/llama.cpp/common/json-schema-to-grammar.h +4 -4
  21. package/src/llama.cpp/common/sampling.cpp +7 -8
  22. package/src/llama.cpp/common/speculative.cpp +6 -4
  23. package/src/llama.cpp/ggml/CMakeLists.txt +48 -3
  24. package/src/llama.cpp/ggml/include/ggml.h +22 -3
  25. package/src/llama.cpp/ggml/src/CMakeLists.txt +81 -22
  26. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +131 -49
  27. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
  28. package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +1 -1
  29. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  30. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4113 -0
  31. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2162 -0
  32. package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2638 -0
  33. package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  34. package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2731 -0
  35. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2068 -0
  36. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +396 -0
  37. package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1299 -0
  38. package/src/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1480 -0
  39. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +4310 -0
  40. package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +59 -3206
  41. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +184 -0
  42. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +1 -1
  43. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +12 -13
  44. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +64 -88
  45. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +8 -8
  46. package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
  47. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
  48. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +56 -7
  49. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
  50. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +282 -100
  51. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
  52. package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +1157 -0
  53. package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
  54. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +1570 -0
  55. package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +98 -0
  56. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +119 -5
  57. package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
  58. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +85 -16
  59. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +204 -49
  60. package/src/llama.cpp/include/llama.h +145 -40
  61. package/src/llama.cpp/src/CMakeLists.txt +5 -1
  62. package/src/llama.cpp/src/llama-arch.cpp +99 -3
  63. package/src/llama.cpp/src/llama-arch.h +10 -1
  64. package/src/llama.cpp/src/llama-batch.cpp +728 -272
  65. package/src/llama.cpp/src/llama-batch.h +112 -54
  66. package/src/llama.cpp/src/llama-chat.cpp +19 -2
  67. package/src/llama.cpp/src/llama-chat.h +1 -0
  68. package/src/llama.cpp/src/llama-context.cpp +525 -339
  69. package/src/llama.cpp/src/llama-context.h +38 -17
  70. package/src/llama.cpp/src/llama-cparams.cpp +4 -0
  71. package/src/llama.cpp/src/llama-cparams.h +2 -0
  72. package/src/llama.cpp/src/llama-grammar.cpp +12 -2
  73. package/src/llama.cpp/src/llama-graph.cpp +413 -353
  74. package/src/llama.cpp/src/llama-graph.h +112 -56
  75. package/src/llama.cpp/src/llama-hparams.cpp +10 -2
  76. package/src/llama.cpp/src/llama-hparams.h +13 -2
  77. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +279 -0
  78. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +128 -0
  79. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +1815 -0
  80. package/src/llama.cpp/src/llama-kv-cache-unified.h +303 -0
  81. package/src/llama.cpp/src/llama-kv-cells.h +415 -0
  82. package/src/llama.cpp/src/llama-memory-hybrid.cpp +246 -0
  83. package/src/llama.cpp/src/llama-memory-hybrid.h +138 -0
  84. package/src/llama.cpp/src/llama-memory-recurrent.cpp +1112 -0
  85. package/src/llama.cpp/src/llama-memory-recurrent.h +183 -0
  86. package/src/llama.cpp/src/llama-memory.cpp +41 -0
  87. package/src/llama.cpp/src/llama-memory.h +86 -5
  88. package/src/llama.cpp/src/llama-mmap.cpp +1 -1
  89. package/src/llama.cpp/src/llama-model-loader.cpp +42 -17
  90. package/src/llama.cpp/src/llama-model-saver.cpp +1 -0
  91. package/src/llama.cpp/src/llama-model.cpp +1137 -528
  92. package/src/llama.cpp/src/llama-model.h +4 -0
  93. package/src/llama.cpp/src/llama-quant.cpp +2 -1
  94. package/src/llama.cpp/src/llama-sampling.cpp +2 -2
  95. package/src/llama.cpp/src/llama-vocab.cpp +69 -32
  96. package/src/llama.cpp/src/llama-vocab.h +1 -0
  97. package/src/llama.cpp/src/llama.cpp +11 -7
  98. package/src/llama.cpp/src/unicode.cpp +5 -0
  99. package/src/tts_utils.h +1 -1
  100. package/src/llama.cpp/common/json.hpp +0 -24766
  101. package/src/llama.cpp/common/minja/chat-template.hpp +0 -541
  102. package/src/llama.cpp/common/minja/minja.hpp +0 -2974
  103. package/src/llama.cpp/common/stb_image.h +0 -7988
  104. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  105. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13326
  106. package/src/llama.cpp/src/llama-kv-cache.cpp +0 -2827
  107. package/src/llama.cpp/src/llama-kv-cache.h +0 -515
  108. /package/src/llama.cpp/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
  109. /package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
  110. /package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
@@ -3,11 +3,13 @@
3
3
  #pragma once
4
4
 
5
5
  #include "common.h"
6
+ #include <functional>
6
7
  #include <chrono>
7
8
  #include <string>
8
9
  #include <vector>
9
- #include "minja/chat-template.hpp"
10
- #include "minja/minja.hpp"
10
+
11
+ #include <minja/chat-template.hpp>
12
+ #include <minja/minja.hpp>
11
13
 
12
14
  typedef minja::chat_template common_chat_template;
13
15
 
@@ -21,11 +23,19 @@ struct common_chat_tool_call {
21
23
  std::string name;
22
24
  std::string arguments;
23
25
  std::string id;
26
+
27
+ bool operator==(const common_chat_tool_call & other) const {
28
+ return name == other.name && arguments == other.arguments && id == other.id;
29
+ }
24
30
  };
25
31
 
26
32
  struct common_chat_msg_content_part {
27
33
  std::string type;
28
34
  std::string text;
35
+
36
+ bool operator==(const common_chat_msg_content_part & other) const {
37
+ return type == other.type && text == other.text;
38
+ }
29
39
  };
30
40
 
31
41
  struct common_chat_msg {
@@ -36,6 +46,51 @@ struct common_chat_msg {
36
46
  std::string reasoning_content;
37
47
  std::string tool_name;
38
48
  std::string tool_call_id;
49
+
50
+ template <class T> T to_json_oaicompat() const;
51
+
52
+ bool empty() const {
53
+ return content.empty() && content_parts.empty() && tool_calls.empty() && reasoning_content.empty() && tool_name.empty() && tool_call_id.empty();
54
+ }
55
+ void ensure_tool_call_ids_set(std::vector<std::string> & ids_cache, const std::function<std::string()> & gen_tool_call_id) {
56
+ for (auto i = 0u; i < tool_calls.size(); i++) {
57
+ if (ids_cache.size() <= i) {
58
+ auto id = tool_calls[i].id;
59
+ if (id.empty()) {
60
+ id = gen_tool_call_id();
61
+ }
62
+ ids_cache.push_back(id);
63
+ }
64
+ tool_calls[i].id = ids_cache[i];
65
+ }
66
+ }
67
+ bool operator==(const common_chat_msg & other) const {
68
+ return role == other.role
69
+ && content == other.content
70
+ && content_parts == other.content_parts
71
+ && tool_calls == other.tool_calls
72
+ && reasoning_content == other.reasoning_content
73
+ && tool_name == other.tool_name
74
+ && tool_call_id == other.tool_call_id;
75
+ }
76
+ bool operator!=(const common_chat_msg & other) const {
77
+ return !(*this == other);
78
+ }
79
+ };
80
+
81
+ struct common_chat_msg_diff {
82
+ std::string reasoning_content_delta;
83
+ std::string content_delta;
84
+ size_t tool_call_index = std::string::npos;
85
+ common_chat_tool_call tool_call_delta;
86
+
87
+ static std::vector<common_chat_msg_diff> compute_diffs(const common_chat_msg & previous_msg, const common_chat_msg & new_msg);
88
+
89
+ bool operator==(const common_chat_msg_diff & other) const {
90
+ return content_delta == other.content_delta
91
+ && tool_call_index == other.tool_call_index
92
+ && tool_call_delta == other.tool_call_delta;
93
+ }
39
94
  };
40
95
 
41
96
  struct common_chat_tool {
@@ -57,14 +112,11 @@ enum common_chat_format {
57
112
  COMMON_CHAT_FORMAT_LLAMA_3_X,
58
113
  COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
59
114
  COMMON_CHAT_FORMAT_DEEPSEEK_R1,
60
- COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING,
61
115
  COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
62
116
  COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
63
117
  COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
64
118
  COMMON_CHAT_FORMAT_HERMES_2_PRO,
65
- COMMON_CHAT_FORMAT_HERMES_2_PRO_EXTRACT_REASONING,
66
119
  COMMON_CHAT_FORMAT_COMMAND_R7B,
67
- COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING,
68
120
 
69
121
  COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
70
122
  };
@@ -79,7 +131,8 @@ struct common_chat_templates_inputs {
79
131
  std::vector<common_chat_tool> tools;
80
132
  common_chat_tool_choice tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO;
81
133
  bool parallel_tool_calls = false;
82
- bool extract_reasoning = true;
134
+ common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
135
+ bool enable_thinking = true;
83
136
  std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
84
137
  };
85
138
 
@@ -88,11 +141,21 @@ struct common_chat_params {
88
141
  std::string prompt;
89
142
  std::string grammar;
90
143
  bool grammar_lazy = false;
144
+ bool thinking_forced_open = false;
91
145
  std::vector<common_grammar_trigger> grammar_triggers;
92
146
  std::vector<std::string> preserved_tokens;
93
147
  std::vector<std::string> additional_stops;
94
148
  };
95
149
 
150
+ struct common_chat_syntax {
151
+ common_chat_format format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
152
+ common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
153
+ // Whether reasoning_content should be inlined in the content (e.g. for reasoning_format=deepseek in stream mode)
154
+ bool reasoning_in_content = false;
155
+ bool thinking_forced_open = false;
156
+ bool parse_tool_calls = true;
157
+ };
158
+
96
159
  // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
97
160
  bool common_chat_verify_template(const std::string & tmpl, bool use_jinja);
98
161
 
@@ -129,8 +192,9 @@ std::string common_chat_format_example(
129
192
  const struct common_chat_templates * tmpls,
130
193
  bool use_jinja);
131
194
 
132
- std::string common_chat_format_name(common_chat_format format);
133
- common_chat_msg common_chat_parse( const std::string & input, common_chat_format format);
195
+ const char* common_chat_format_name(common_chat_format format);
196
+ const char* common_reasoning_format_name(common_reasoning_format format);
197
+ common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax);
134
198
 
135
199
  common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);
136
200
 
@@ -143,3 +207,5 @@ template <class T> T common_chat_msgs_to_json_oaicompat(const std::vector<common
143
207
  // T can be std::string containing JSON or nlohmann::ordered_json
144
208
  template <class T> std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const T & tools);
145
209
  template <class T> T common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools);
210
+
211
+ template <class T> T common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff);
@@ -203,6 +203,7 @@ bool set_process_priority(enum ggml_sched_priority prio) {
203
203
 
204
204
  DWORD p = NORMAL_PRIORITY_CLASS;
205
205
  switch (prio) {
206
+ case GGML_SCHED_PRIO_LOW: p = BELOW_NORMAL_PRIORITY_CLASS; break;
206
207
  case GGML_SCHED_PRIO_NORMAL: p = NORMAL_PRIORITY_CLASS; break;
207
208
  case GGML_SCHED_PRIO_MEDIUM: p = ABOVE_NORMAL_PRIORITY_CLASS; break;
208
209
  case GGML_SCHED_PRIO_HIGH: p = HIGH_PRIORITY_CLASS; break;
@@ -228,6 +229,7 @@ bool set_process_priority(enum ggml_sched_priority prio) {
228
229
 
229
230
  int p = 0;
230
231
  switch (prio) {
232
+ case GGML_SCHED_PRIO_LOW: p = 5; break;
231
233
  case GGML_SCHED_PRIO_NORMAL: p = 0; break;
232
234
  case GGML_SCHED_PRIO_MEDIUM: p = -5; break;
233
235
  case GGML_SCHED_PRIO_HIGH: p = -10; break;
@@ -464,7 +466,7 @@ size_t string_find_partial_stop(const std::string_view & str, const std::string_
464
466
 
465
467
  std::string regex_escape(const std::string & s) {
466
468
  static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
467
- return std::regex_replace(s, special_chars, "\\$0");
469
+ return std::regex_replace(s, special_chars, "\\$&");
468
470
  }
469
471
 
470
472
  std::string string_join(const std::vector<std::string> & values, const std::string & separator) {
@@ -704,11 +706,17 @@ bool fs_validate_filename(const std::string & filename) {
704
706
  // disable C++17 deprecation warning for std::codecvt_utf8
705
707
  # pragma clang diagnostic push
706
708
  # pragma clang diagnostic ignored "-Wdeprecated-declarations"
709
+ #elif defined(__GNUC__)
710
+ # pragma GCC diagnostic push
711
+ # pragma GCC diagnostic ignored "-Wdeprecated-declarations"
707
712
  #endif
713
+
708
714
  std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
709
715
 
710
716
  #if defined(__clang__)
711
717
  # pragma clang diagnostic pop
718
+ #elif defined(__GNUC__)
719
+ # pragma GCC diagnostic pop
712
720
  #endif
713
721
 
714
722
  filename_utf32 = converter.from_bytes(filename);
@@ -765,6 +773,9 @@ bool fs_validate_filename(const std::string & filename) {
765
773
  return true;
766
774
  }
767
775
 
776
+ #include <iostream>
777
+
778
+
768
779
  // returns true if successful, false otherwise
769
780
  bool fs_create_directory_with_parents(const std::string & path) {
770
781
  #ifdef _WIN32
@@ -782,9 +793,16 @@ bool fs_create_directory_with_parents(const std::string & path) {
782
793
  // process path from front to back, procedurally creating directories
783
794
  while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) {
784
795
  const std::wstring subpath = wpath.substr(0, pos_slash);
785
- const wchar_t * test = subpath.c_str();
786
796
 
787
- const bool success = CreateDirectoryW(test, NULL);
797
+ pos_slash += 1;
798
+
799
+ // skip the drive letter, in some systems it can return an access denied error
800
+ if (subpath.length() == 2 && subpath[1] == ':') {
801
+ continue;
802
+ }
803
+
804
+ const bool success = CreateDirectoryW(subpath.c_str(), NULL);
805
+
788
806
  if (!success) {
789
807
  const DWORD error = GetLastError();
790
808
 
@@ -798,8 +816,6 @@ bool fs_create_directory_with_parents(const std::string & path) {
798
816
  return false;
799
817
  }
800
818
  }
801
-
802
- pos_slash += 1;
803
819
  }
804
820
 
805
821
  return true;
@@ -849,7 +865,7 @@ std::string fs_get_cache_directory() {
849
865
  if (getenv("LLAMA_CACHE")) {
850
866
  cache_directory = std::getenv("LLAMA_CACHE");
851
867
  } else {
852
- #if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)
868
+ #if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX) || defined(__OpenBSD__)
853
869
  if (std::getenv("XDG_CACHE_HOME")) {
854
870
  cache_directory = std::getenv("XDG_CACHE_HOME");
855
871
  } else {
@@ -895,31 +911,6 @@ struct common_init_result common_init_from_params(common_params & params) {
895
911
 
896
912
  const llama_vocab * vocab = llama_model_get_vocab(model);
897
913
 
898
- if (params.reranking) {
899
- bool ok = true;
900
-
901
- if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
902
- LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
903
- ok = false;
904
- }
905
-
906
- if (llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
907
- LOG_WRN("%s: warning: vocab does not have an EOS token, reranking will not work\n", __func__);
908
- ok = false;
909
- }
910
-
911
- if (llama_vocab_sep(vocab) == LLAMA_TOKEN_NULL) {
912
- LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
913
- ok = false;
914
- }
915
-
916
- if (!ok) {
917
- llama_model_free(model);
918
-
919
- return iparams;
920
- }
921
- }
922
-
923
914
  auto cparams = common_context_params_to_llama(params);
924
915
 
925
916
  llama_context * lctx = llama_init_from_model(model, cparams);
@@ -929,7 +920,7 @@ struct common_init_result common_init_from_params(common_params & params) {
929
920
  return iparams;
930
921
  }
931
922
 
932
- if (params.ctx_shift && !llama_kv_self_can_shift(lctx)) {
923
+ if (params.ctx_shift && !llama_memory_can_shift(llama_get_memory(lctx))) {
933
924
  LOG_WRN("%s: KV cache shifting is not supported for this context, disabling KV cache shifting\n", __func__);
934
925
  params.ctx_shift = false;
935
926
  }
@@ -961,6 +952,35 @@ struct common_init_result common_init_from_params(common_params & params) {
961
952
  }
962
953
  }
963
954
 
955
+ if (llama_pooling_type(lctx) == LLAMA_POOLING_TYPE_RANK) {
956
+ bool ok = true;
957
+
958
+ if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
959
+ LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
960
+ ok = false;
961
+ }
962
+
963
+ bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
964
+ bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
965
+
966
+ if (!has_eos && !has_sep) {
967
+ LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
968
+ ok = false;
969
+ } else if (!has_eos) {
970
+ LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
971
+ } else if (!has_sep) {
972
+ LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
973
+ ok = false;
974
+ }
975
+
976
+ if (!ok) {
977
+ llama_free(lctx);
978
+ llama_model_free(model);
979
+
980
+ return iparams;
981
+ }
982
+ }
983
+
964
984
  // load and optionally apply lora adapters
965
985
  for (auto & la : params.lora_adapters) {
966
986
  llama_adapter_lora_ptr lora;
@@ -1036,7 +1056,7 @@ struct common_init_result common_init_from_params(common_params & params) {
1036
1056
  if (llama_model_has_decoder(model)) {
1037
1057
  llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
1038
1058
  }
1039
- llama_kv_self_clear(lctx);
1059
+ llama_memory_clear(llama_get_memory(lctx), true);
1040
1060
  llama_synchronize(lctx);
1041
1061
  llama_perf_context_reset(lctx);
1042
1062
  llama_set_warmup(lctx, false);
@@ -1139,11 +1159,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
1139
1159
  cparams.op_offload = !params.no_op_offload;
1140
1160
  cparams.swa_full = params.swa_full;
1141
1161
 
1142
- if (params.reranking) {
1143
- cparams.embeddings = true;
1144
- cparams.pooling_type = LLAMA_POOLING_TYPE_RANK;
1145
- }
1146
-
1147
1162
  cparams.type_k = params.cache_type_k;
1148
1163
  cparams.type_v = params.cache_type_v;
1149
1164
 
@@ -1276,6 +1291,9 @@ std::vector<llama_token> common_tokenize(
1276
1291
  int n_tokens = text.length() + 2 * add_special;
1277
1292
  std::vector<llama_token> result(n_tokens);
1278
1293
  n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
1294
+ if (n_tokens == std::numeric_limits<int32_t>::min()) {
1295
+ throw std::runtime_error("Tokenization failed: input text too large, tokenization result exceeds int32_t limit");
1296
+ }
1279
1297
  if (n_tokens < 0) {
1280
1298
  result.resize(-n_tokens);
1281
1299
  int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
@@ -115,7 +115,7 @@ enum common_grammar_trigger_type {
115
115
  COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN,
116
116
  COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
117
117
  COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
118
- COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START,
118
+ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
119
119
  };
120
120
 
121
121
  struct common_grammar_trigger {
@@ -199,6 +199,9 @@ struct common_params_speculative {
199
199
  float p_split = 0.1f; // speculative decoding split probability
200
200
  float p_min = 0.75f; // minimum speculative decoding probability (greedy)
201
201
 
202
+ ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
203
+ ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
204
+
202
205
  struct cpu_params cpuparams;
203
206
  struct cpu_params cpuparams_batch;
204
207
 
@@ -215,7 +218,8 @@ struct common_params_vocoder {
215
218
 
216
219
  enum common_reasoning_format {
217
220
  COMMON_REASONING_FORMAT_NONE,
218
- COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`
221
+ COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
222
+ COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
219
223
  };
220
224
 
221
225
  struct common_params {
@@ -292,6 +296,7 @@ struct common_params {
292
296
  int32_t verbosity = 0;
293
297
  int32_t control_vector_layer_start = -1; // layer range for control vector
294
298
  int32_t control_vector_layer_end = -1; // layer range for control vector
299
+ bool offline = false;
295
300
 
296
301
  int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
297
302
  int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
@@ -354,7 +359,7 @@ struct common_params {
354
359
  int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
355
360
  std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
356
361
  std::string embd_sep = "\n"; // separator of embeddings
357
- bool reranking = false; // enable reranking support on server
362
+ std::string cls_sep = "\t"; // separator of classification sequences
358
363
 
359
364
  // server params
360
365
  int32_t port = 8080; // server listens on this network port
@@ -369,6 +374,7 @@ struct common_params {
369
374
  bool use_jinja = false; // NOLINT
370
375
  bool enable_chat_template = true;
371
376
  common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
377
+ int reasoning_budget = -1;
372
378
  bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
373
379
 
374
380
  std::vector<std::string> api_keys;
@@ -0,0 +1,256 @@
1
+ #include "json-partial.h"
2
+
3
+ #include "log.h"
4
+
5
+ #include <nlohmann/json.hpp>
6
+
7
+ #include <string>
8
+
9
+ using json = nlohmann::ordered_json;
10
+
11
+ enum common_json_stack_element_type {
12
+ COMMON_JSON_STACK_ELEMENT_OBJECT,
13
+ COMMON_JSON_STACK_ELEMENT_KEY,
14
+ COMMON_JSON_STACK_ELEMENT_ARRAY,
15
+ };
16
+
17
+ struct common_json_stack_element {
18
+ common_json_stack_element_type type;
19
+ std::string key;
20
+ };
21
+
22
+ bool common_json_parse(
23
+ const std::string & input,
24
+ const std::string & healing_marker,
25
+ common_json & out)
26
+ {
27
+ std::string::const_iterator it = input.begin();
28
+ const auto end = input.end();
29
+ return common_json_parse(it, end, healing_marker, out);
30
+ }
31
+
32
+ bool common_json_parse(
33
+ std::string::const_iterator & it,
34
+ const std::string::const_iterator & end,
35
+ const std::string & healing_marker,
36
+ common_json & out)
37
+ {
38
+ // // https://json.nlohmann.me/features/parsing/sax_interface/
39
+ struct json_error_locator : public nlohmann::json_sax<json> {
40
+ std::size_t position;
41
+ bool found_error;
42
+ std::string last_token;
43
+ std::string exception_message;
44
+ std::vector<common_json_stack_element> stack;
45
+
46
+ json_error_locator() : position(0), found_error(false) {}
47
+
48
+ bool parse_error(std::size_t position, const std::string & last_token, const json::exception & ex) override { // NOLINT
49
+ this->position = position - 1;
50
+ this->found_error = true;
51
+ this->last_token = last_token;
52
+ this->exception_message = ex.what();
53
+ return false;
54
+ }
55
+ void close_value() {
56
+ if (!stack.empty() && (stack.back().type == COMMON_JSON_STACK_ELEMENT_KEY)) {
57
+ stack.pop_back();
58
+ }
59
+ }
60
+ bool null() override { // NOLINT
61
+ close_value();
62
+ return true;
63
+ }
64
+ bool boolean(bool) override { // NOLINT
65
+ close_value();
66
+ return true;
67
+ }
68
+ bool number_integer(number_integer_t) override { // NOLINT
69
+ close_value();
70
+ return true;
71
+ }
72
+ bool number_unsigned(number_unsigned_t) override { // NOLINT
73
+ close_value();
74
+ return true;
75
+ }
76
+ bool number_float(number_float_t, const string_t &) override { // NOLINT
77
+ close_value();
78
+ return true;
79
+ }
80
+ bool string(string_t &) override { // NOLINT
81
+ close_value();
82
+ return true;
83
+ }
84
+ bool binary(binary_t &) override { // NOLINT
85
+ close_value();
86
+ return true;
87
+ }
88
+ bool start_object(std::size_t) override { // NOLINT
89
+ stack.push_back({COMMON_JSON_STACK_ELEMENT_OBJECT, ""});
90
+ return true;
91
+ }
92
+ bool end_object() override {
93
+ GGML_ASSERT(!stack.empty() && stack.back().type == COMMON_JSON_STACK_ELEMENT_OBJECT);
94
+ stack.pop_back();
95
+ close_value();
96
+ return true;
97
+ }
98
+ bool key(string_t & key) override { // NOLINT
99
+ stack.push_back({COMMON_JSON_STACK_ELEMENT_KEY, key});
100
+ return true;
101
+ }
102
+ bool start_array(std::size_t) override { // NOLINT
103
+ stack.push_back({COMMON_JSON_STACK_ELEMENT_ARRAY, ""});
104
+ return true;
105
+ }
106
+ bool end_array() override {
107
+ GGML_ASSERT(!stack.empty() && stack.back().type == COMMON_JSON_STACK_ELEMENT_ARRAY);
108
+ stack.pop_back();
109
+ close_value();
110
+ return true;
111
+ }
112
+ };
113
+ json_error_locator err_loc;
114
+ auto start = it;
115
+ json::sax_parse(it, end, &err_loc);
116
+
117
+ if (err_loc.found_error) {
118
+ it = start;
119
+ auto temptative_end = it + err_loc.position;
120
+ // LOG_DBG("Error at position %zu (is_end = %s): %s\n", err_loc.position, temptative_end == end ? "true" : "false", err_loc.exception_message.c_str());
121
+
122
+ auto input = std::string(it, temptative_end);
123
+ try {
124
+ out.json = json::parse(input);
125
+ // out.json = json::parse(it, temptative_end);
126
+ it = temptative_end;
127
+ return true;
128
+ } catch (const std::exception & ex) {
129
+ // No, needs healing.
130
+ LOG_DBG("Failed to parse up to error: %s: <<<%s>>>\n", ex.what(), std::string(it, temptative_end).c_str());
131
+ }
132
+ auto can_parse = [](const std::string & str) {
133
+ try {
134
+ auto _ = json::parse(str); // NOLINT
135
+ return true;
136
+ } catch (const std::exception &) {
137
+ return false;
138
+ }
139
+ };
140
+ if (!healing_marker.empty() && !err_loc.stack.empty()) {
141
+ std::string str(it, temptative_end);
142
+ auto last_non_sp_pos = str.find_last_not_of(" \n\r\t");
143
+ if (last_non_sp_pos == std::string::npos) {
144
+ throw std::runtime_error("Cannot heal a truncated JSON that stopped in an unknown location");
145
+ }
146
+ auto last_non_sp_char = str[last_non_sp_pos];
147
+ // Used to detect stops on a number, which may not be complete.
148
+ auto was_maybe_number = [&]() {
149
+ if (!str.empty() && std::isspace(str.back())) {
150
+ return false;
151
+ }
152
+ return std::isdigit(last_non_sp_char) ||
153
+ last_non_sp_char == '.' ||
154
+ last_non_sp_char == 'e' ||
155
+ last_non_sp_char == 'E' ||
156
+ last_non_sp_char == '-';
157
+ };
158
+
159
+ std::string closing;
160
+ for (size_t i = err_loc.stack.size(); i > 0; i--) {
161
+ auto & el = err_loc.stack[i - 1];
162
+ if (el.type == COMMON_JSON_STACK_ELEMENT_OBJECT) {
163
+ closing += "}";
164
+ } else if (el.type == COMMON_JSON_STACK_ELEMENT_ARRAY) {
165
+ closing += "]";
166
+ } else if (el.type != COMMON_JSON_STACK_ELEMENT_KEY) {
167
+ throw std::runtime_error("Unexpected stack element type");
168
+ }
169
+ }
170
+
171
+ const auto & magic_seed = out.healing_marker.marker = healing_marker;//"$llama.cpp.json$";
172
+
173
+ if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_KEY) {
174
+ // We're inside an object value
175
+ if (last_non_sp_char == ':' && can_parse(str + "1" + closing)) {
176
+ // Was about to create an object value
177
+ str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
178
+ } else if (can_parse(str + ": 1" + closing)) {
179
+ str += (out.healing_marker.json_dump_marker = ":\"" + magic_seed) + "\"" + closing;
180
+ } else if (last_non_sp_char == '{' && can_parse(str + closing)) {
181
+ // Was about to create an object
182
+ str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\": 1" + closing;
183
+ } else if (can_parse(str + "\"" + closing)) {
184
+ // Was inside an object value string
185
+ str += (out.healing_marker.json_dump_marker = magic_seed) + "\"" + closing;
186
+ } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"" + closing)) {
187
+ // Was inside an object value string after an escape
188
+ str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"" + closing;
189
+ } else {
190
+ // find last :
191
+ auto last_pos = str.find_last_of(':');
192
+ if (last_pos == std::string::npos) {
193
+ throw std::runtime_error("Cannot heal a truncated JSON that stopped in an unknown location");
194
+ }
195
+ // Cutting back to opening : for object value
196
+ str = str.substr(0, last_pos + 1) + (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
197
+ }
198
+ } else if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_ARRAY) {
199
+ if ((last_non_sp_char == ',' || last_non_sp_char == '[') && can_parse(str + "1" + closing)) {
200
+ // Was about to create an array value
201
+ str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
202
+ } else if (can_parse(str + "\"" + closing)) {
203
+ // Was inside an array value string
204
+ str += (out.healing_marker.json_dump_marker = magic_seed) + "\"" + closing;
205
+ } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"" + closing)) {
206
+ // Was inside an array value string after an escape
207
+ str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"" + closing;
208
+ } else if (!was_maybe_number() && can_parse(str + ", 1" + closing)) {
209
+ // Had just finished a value
210
+ str += (out.healing_marker.json_dump_marker = ",\"" + magic_seed) + "\"" + closing;
211
+ } else {
212
+ auto last_pos = str.find_last_of("[,");
213
+ if (last_pos == std::string::npos) {
214
+ throw std::runtime_error("Cannot heal a truncated JSON array stopped in an unknown location");
215
+ }
216
+ // Cutting back to last [ or , for array value
217
+ str = str.substr(0, last_pos + 1) + (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
218
+ }
219
+ } else if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_OBJECT) {
220
+ if ((last_non_sp_char == '{' && can_parse(str + closing)) ||
221
+ (last_non_sp_char == ',' && can_parse(str + "\"\": 1" + closing))) {
222
+ // Was about to create an object key+value
223
+ str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\": 1" + closing;
224
+ } else if (!was_maybe_number() && can_parse(str + ",\"\": 1" + closing)) {
225
+ // Was about to create an object key+value
226
+ str += (out.healing_marker.json_dump_marker = ",\"" + magic_seed) + "\": 1" + closing;
227
+ } else if (can_parse(str + "\": 1" + closing)) {
228
+ // Was inside an object key string
229
+ str += (out.healing_marker.json_dump_marker = magic_seed) + "\": 1" + closing;
230
+ } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\": 1" + closing)) {
231
+ // Was inside an object key string after an escape
232
+ str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\": 1" + closing;
233
+ } else {
234
+ auto last_pos = str.find_last_of(':');
235
+ if (last_pos == std::string::npos) {
236
+ throw std::runtime_error("Cannot heal a truncated JSON object stopped in an unknown location");
237
+ }
238
+ // fprintf(stderr, "Cutting back to last : for object key+value\n");
239
+ str = str.substr(0, last_pos + 1) + (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
240
+ }
241
+ } else {
242
+ throw std::runtime_error("Cannot heal a truncated JSON object stopped in an unknown location");
243
+ }
244
+ // fprintf(stderr, "HEALED:\nSTRING <<<\n%s\n>>>\n\nmagic_cut: <<<\n%s\n>>>\n\n", str.c_str(), out.healing_marker.json_dump_marker.c_str());
245
+ out.json = json::parse(str);
246
+ it = temptative_end;
247
+ return true;
248
+ }
249
+ // TODO: handle unclosed top-level primitive if the stack was empty but we got an error (e.g. "tru", "\"", etc...)
250
+ // fprintf(stderr, "Closing: TODO\n");
251
+ return false;
252
+ }
253
+ out.json = json::parse(it, end);
254
+ it = end;
255
+ return true;
256
+ }