@fugood/llama.node 0.3.7 → 0.3.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (186)
  1. package/README.md +17 -2
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +8 -0
  19. package/lib/index.js +16 -1
  20. package/lib/index.ts +16 -0
  21. package/package.json +1 -1
  22. package/src/EmbeddingWorker.cpp +4 -3
  23. package/src/LlamaCompletionWorker.cpp +4 -2
  24. package/src/LlamaContext.cpp +156 -6
  25. package/src/LlamaContext.h +5 -0
  26. package/src/common.hpp +6 -11
  27. package/src/llama.cpp/.github/workflows/build.yml +19 -17
  28. package/src/llama.cpp/.github/workflows/docker.yml +77 -30
  29. package/src/llama.cpp/.github/workflows/editorconfig.yml +3 -1
  30. package/src/llama.cpp/.github/workflows/server.yml +22 -3
  31. package/src/llama.cpp/CMakeLists.txt +49 -24
  32. package/src/llama.cpp/common/arg.cpp +82 -26
  33. package/src/llama.cpp/common/arg.h +3 -0
  34. package/src/llama.cpp/common/common.cpp +192 -72
  35. package/src/llama.cpp/common/common.h +51 -18
  36. package/src/llama.cpp/common/ngram-cache.cpp +12 -12
  37. package/src/llama.cpp/common/ngram-cache.h +2 -2
  38. package/src/llama.cpp/common/sampling.cpp +11 -6
  39. package/src/llama.cpp/common/speculative.cpp +18 -15
  40. package/src/llama.cpp/docs/build.md +2 -0
  41. package/src/llama.cpp/examples/batched/batched.cpp +9 -7
  42. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +3 -3
  43. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +10 -8
  44. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +11 -8
  45. package/src/llama.cpp/examples/cvector-generator/mean.hpp +1 -1
  46. package/src/llama.cpp/examples/cvector-generator/pca.hpp +1 -1
  47. package/src/llama.cpp/examples/embedding/embedding.cpp +8 -7
  48. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +7 -6
  49. package/src/llama.cpp/examples/export-lora/export-lora.cpp +8 -7
  50. package/src/llama.cpp/examples/gguf/gguf.cpp +10 -6
  51. package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +1 -0
  52. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +8 -7
  53. package/src/llama.cpp/examples/gritlm/gritlm.cpp +13 -10
  54. package/src/llama.cpp/examples/imatrix/imatrix.cpp +13 -12
  55. package/src/llama.cpp/examples/infill/infill.cpp +23 -24
  56. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +44 -13
  57. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -6
  58. package/src/llama.cpp/examples/llava/clip.cpp +4 -2
  59. package/src/llama.cpp/examples/llava/llava-cli.cpp +9 -6
  60. package/src/llama.cpp/examples/llava/llava.cpp +2 -2
  61. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +8 -4
  62. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +11 -8
  63. package/src/llama.cpp/examples/lookahead/lookahead.cpp +6 -7
  64. package/src/llama.cpp/examples/lookup/lookup-create.cpp +4 -9
  65. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +3 -7
  66. package/src/llama.cpp/examples/lookup/lookup.cpp +5 -6
  67. package/src/llama.cpp/examples/main/main.cpp +51 -29
  68. package/src/llama.cpp/examples/parallel/parallel.cpp +5 -6
  69. package/src/llama.cpp/examples/passkey/passkey.cpp +7 -5
  70. package/src/llama.cpp/examples/perplexity/perplexity.cpp +37 -23
  71. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -14
  72. package/src/llama.cpp/examples/retrieval/retrieval.cpp +8 -8
  73. package/src/llama.cpp/examples/rpc/rpc-server.cpp +12 -0
  74. package/src/llama.cpp/examples/run/CMakeLists.txt +1 -1
  75. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +1351 -0
  76. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +114 -0
  77. package/src/llama.cpp/examples/run/run.cpp +175 -61
  78. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +4 -25
  79. package/src/llama.cpp/examples/server/CMakeLists.txt +1 -0
  80. package/src/llama.cpp/examples/server/httplib.h +1295 -409
  81. package/src/llama.cpp/examples/server/server.cpp +387 -181
  82. package/src/llama.cpp/examples/server/tests/requirements.txt +1 -0
  83. package/src/llama.cpp/examples/server/utils.hpp +170 -58
  84. package/src/llama.cpp/examples/simple/simple.cpp +9 -8
  85. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +16 -12
  86. package/src/llama.cpp/examples/speculative/speculative.cpp +22 -23
  87. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +8 -12
  88. package/src/llama.cpp/examples/tokenize/tokenize.cpp +17 -5
  89. package/src/llama.cpp/examples/tts/tts.cpp +64 -23
  90. package/src/llama.cpp/ggml/CMakeLists.txt +5 -21
  91. package/src/llama.cpp/ggml/include/ggml-backend.h +2 -0
  92. package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -0
  93. package/src/llama.cpp/ggml/include/ggml.h +36 -145
  94. package/src/llama.cpp/ggml/include/gguf.h +202 -0
  95. package/src/llama.cpp/ggml/src/CMakeLists.txt +6 -3
  96. package/src/llama.cpp/ggml/src/ggml-alloc.c +5 -0
  97. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +0 -1
  98. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +79 -49
  99. package/src/llama.cpp/ggml/src/ggml-backend.cpp +5 -2
  100. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +33 -23
  101. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +57 -72
  102. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +87 -2
  103. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +335 -66
  104. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +10 -2
  105. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1090 -378
  106. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +2 -2
  107. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +1 -0
  108. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +3 -0
  109. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +3 -0
  110. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +3 -1
  111. package/src/llama.cpp/ggml/src/ggml-impl.h +11 -16
  112. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +16 -0
  113. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +6 -6
  114. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +154 -35
  115. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  116. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +9 -3
  117. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +18 -0
  118. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +3 -2
  119. package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +1 -2
  120. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +3 -2
  121. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +1 -2
  122. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +40 -95
  123. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +48 -48
  124. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +24 -24
  125. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +238 -164
  126. package/src/llama.cpp/ggml/src/ggml-sycl/gla.cpp +105 -0
  127. package/src/llama.cpp/ggml/src/ggml-sycl/gla.hpp +8 -0
  128. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +3 -3
  129. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +1 -2
  130. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -2
  131. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +1 -2
  132. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +7 -5
  133. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +1 -2
  134. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +74 -4
  135. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +314 -116
  136. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -2
  137. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +9 -3
  138. package/src/llama.cpp/ggml/src/ggml.c +117 -1327
  139. package/src/llama.cpp/ggml/src/gguf.cpp +1329 -0
  140. package/src/llama.cpp/include/llama-cpp.h +6 -1
  141. package/src/llama.cpp/include/llama.h +138 -75
  142. package/src/llama.cpp/src/CMakeLists.txt +13 -1
  143. package/src/llama.cpp/src/llama-adapter.cpp +347 -0
  144. package/src/llama.cpp/src/llama-adapter.h +74 -0
  145. package/src/llama.cpp/src/llama-arch.cpp +1487 -0
  146. package/src/llama.cpp/src/llama-arch.h +400 -0
  147. package/src/llama.cpp/src/llama-batch.cpp +368 -0
  148. package/src/llama.cpp/src/llama-batch.h +88 -0
  149. package/src/llama.cpp/src/llama-chat.cpp +578 -0
  150. package/src/llama.cpp/src/llama-chat.h +52 -0
  151. package/src/llama.cpp/src/llama-context.cpp +1775 -0
  152. package/src/llama.cpp/src/llama-context.h +128 -0
  153. package/src/llama.cpp/src/llama-cparams.cpp +1 -0
  154. package/src/llama.cpp/src/llama-cparams.h +37 -0
  155. package/src/llama.cpp/src/llama-grammar.cpp +5 -4
  156. package/src/llama.cpp/src/llama-grammar.h +3 -1
  157. package/src/llama.cpp/src/llama-hparams.cpp +71 -0
  158. package/src/llama.cpp/src/llama-hparams.h +139 -0
  159. package/src/llama.cpp/src/llama-impl.cpp +167 -0
  160. package/src/llama.cpp/src/llama-impl.h +16 -136
  161. package/src/llama.cpp/src/llama-kv-cache.cpp +718 -0
  162. package/src/llama.cpp/src/llama-kv-cache.h +218 -0
  163. package/src/llama.cpp/src/llama-mmap.cpp +589 -0
  164. package/src/llama.cpp/src/llama-mmap.h +67 -0
  165. package/src/llama.cpp/src/llama-model-loader.cpp +1124 -0
  166. package/src/llama.cpp/src/llama-model-loader.h +167 -0
  167. package/src/llama.cpp/src/llama-model.cpp +3953 -0
  168. package/src/llama.cpp/src/llama-model.h +370 -0
  169. package/src/llama.cpp/src/llama-quant.cpp +934 -0
  170. package/src/llama.cpp/src/llama-quant.h +1 -0
  171. package/src/llama.cpp/src/llama-sampling.cpp +147 -32
  172. package/src/llama.cpp/src/llama-sampling.h +3 -19
  173. package/src/llama.cpp/src/llama-vocab.cpp +1832 -575
  174. package/src/llama.cpp/src/llama-vocab.h +97 -142
  175. package/src/llama.cpp/src/llama.cpp +7160 -20314
  176. package/src/llama.cpp/src/unicode.cpp +8 -3
  177. package/src/llama.cpp/tests/CMakeLists.txt +2 -0
  178. package/src/llama.cpp/tests/test-autorelease.cpp +3 -3
  179. package/src/llama.cpp/tests/test-backend-ops.cpp +370 -59
  180. package/src/llama.cpp/tests/test-chat-template.cpp +162 -125
  181. package/src/llama.cpp/tests/test-gguf.cpp +222 -187
  182. package/src/llama.cpp/tests/test-model-load-cancel.cpp +1 -1
  183. package/src/llama.cpp/tests/test-sampling.cpp +0 -1
  184. package/src/llama.cpp/tests/test-tokenizer-0.cpp +4 -4
  185. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +9 -7
  186. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +8 -6
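
The hunks below appear to come from the bundled llama.cpp server example (package/src/llama.cpp/examples/server/server.cpp, entry 81 above). Among other changes, they add per-request "lora" adapter selection, a "response_fields" filter for /completion responses, an OAI-compatible /v1/completions handler, and an "encoding_format" option ("float" or "base64") for the embeddings endpoints. The sketch that follows is not part of the package; it is a hypothetical TypeScript client showing how those request fields could be exercised, assuming a server built from this source is listening on localhost:8080.

// Hypothetical client sketch (not part of this package). Field names are taken from the
// diff below; the server address is an assumption.
async function main(): Promise<void> {
  const base = "http://localhost:8080"; // assumed server address

  // /completion: "response_fields" trims the JSON response to the listed keys,
  // and "lora" selects adapter scales for this request only.
  const completion = await fetch(`${base}/completion`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      prompt: "Hello",
      n_predict: 16,
      response_fields: ["content", "timings"],
      lora: [{ id: 0, scale: 0.5 }],
    }),
  });
  console.log(await completion.json());

  // /v1/embeddings: "encoding_format" may be "float" (default) or "base64".
  const embedding = await fetch(`${base}/v1/embeddings`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({ input: "Hello", encoding_format: "base64" }),
  });
  console.log(await embedding.json());
}

main().catch(console.error);
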
@@ -19,6 +19,7 @@
 #include "loading.html.hpp"
 
 #include <atomic>
+#include <chrono>
 #include <condition_variable>
 #include <cstddef>
 #include <cinttypes>
@@ -32,6 +33,8 @@
 
 using json = nlohmann::ordered_json;
 
+constexpr int HTTP_POLLING_SECONDS = 1;
+
 enum stop_type {
     STOP_TYPE_NONE,
     STOP_TYPE_EOS,
@@ -67,6 +70,13 @@ enum server_task_type {
     SERVER_TASK_TYPE_SET_LORA,
 };
 
+enum oaicompat_type {
+    OAICOMPAT_TYPE_NONE,
+    OAICOMPAT_TYPE_CHAT,
+    OAICOMPAT_TYPE_COMPLETION,
+    OAICOMPAT_TYPE_EMBEDDING,
+};
+
 // https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11
 enum error_type {
     ERROR_TYPE_INVALID_REQUEST,
@@ -91,7 +101,10 @@ struct slot_params {
     int64_t t_max_prompt_ms = -1; // TODO: implement
     int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit
 
+    std::vector<common_adapter_lora_info> lora;
+
     std::vector<std::string> antiprompt;
+    std::vector<std::string> response_fields;
     bool timings_per_token = false;
     bool post_sampling_probs = false;
     bool ignore_eos = false;
@@ -100,11 +113,10 @@ struct slot_params {
     struct common_params_speculative speculative;
 
     // OAI-compat fields
-    bool verbose = false;
-    bool oaicompat = false;
-    bool oaicompat_chat = true;
-    std::string oaicompat_model;
-    std::string oaicompat_cmpl_id;
+    bool           verbose = false;
+    oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE;
+    std::string    oaicompat_model;
+    std::string    oaicompat_cmpl_id;
 
     json to_json() const {
         std::vector<std::string> samplers;
@@ -113,6 +125,11 @@ struct slot_params {
             samplers.emplace_back(common_sampler_type_to_str(sampler));
         }
 
+        json lora = json::array();
+        for (size_t i = 0; i < this->lora.size(); ++i) {
+            lora.push_back({{"id", i}, {"scale", this->lora[i].scale}});
+        }
+
         return json {
             {"n_predict", n_predict}, // Server configured n_predict
             {"seed", sampling.seed},
@@ -153,6 +170,7 @@ struct slot_params {
             {"speculative.p_min", speculative.p_min},
             {"timings_per_token", timings_per_token},
             {"post_sampling_probs", post_sampling_probs},
+            {"lora", lora},
         };
     }
 };
@@ -182,13 +200,18 @@ struct server_task {
     // used by SERVER_TASK_TYPE_METRICS
     bool metrics_reset_bucket = false;
 
+    // used by SERVER_TASK_TYPE_SET_LORA
+    std::vector<common_adapter_lora_info> set_lora;
+
     server_task(server_task_type type) : type(type) {}
 
     static slot_params params_from_json_cmpl(
-            const llama_model * model,
             const llama_context * ctx,
             const common_params & params_base,
             const json & data) {
+        const llama_model * model = llama_get_model(ctx);
+        const llama_vocab * vocab = llama_model_get_vocab(model);
+
         slot_params params;
 
         // Sampling parameter defaults are loaded from the global server context (but individual requests can still override them)
@@ -209,6 +232,7 @@ struct server_task {
         params.n_discard = json_value(data, "n_discard", defaults.n_discard);
         //params.t_max_prompt_ms = json_value(data, "t_max_prompt_ms", defaults.t_max_prompt_ms); // TODO: implement
         params.t_max_predict_ms = json_value(data, "t_max_predict_ms", defaults.t_max_predict_ms);
+        params.response_fields = json_value(data, "response_fields", std::vector<std::string>());
 
         params.sampling.top_k = json_value(data, "top_k", defaults.sampling.top_k);
         params.sampling.top_p = json_value(data, "top_p", defaults.sampling.top_p);
@@ -243,6 +267,16 @@ struct server_task {
         params.speculative.n_min = std::max(params.speculative.n_min, 2);
         params.speculative.n_max = std::max(params.speculative.n_max, 0);
 
+        if (data.contains("lora")) {
+            if (data.at("lora").is_array()) {
+                params.lora = parse_lora_request(params_base.lora_adapters, data.at("lora"));
+            } else {
+                throw std::runtime_error("Error: 'lora' must be an array of objects with 'id' and 'scale' fields");
+            }
+        } else {
+            params.lora = params_base.lora_adapters;
+        }
+
         // TODO: add more sanity checks for the input parameters
 
         if (params.sampling.penalty_last_n < -1) {
@@ -300,7 +334,7 @@ struct server_task {
 
         const auto & logit_bias = data.find("logit_bias");
         if (logit_bias != data.end() && logit_bias->is_array()) {
-            const int n_vocab = llama_n_vocab(model);
+            const int n_vocab = llama_vocab_n_tokens(vocab);
             for (const auto & el : *logit_bias) {
                 // TODO: we may want to throw errors here, in case "el" is incorrect
                 if (el.is_array() && el.size() == 2) {
@@ -319,7 +353,7 @@ struct server_task {
                             params.sampling.logit_bias.push_back({tok, bias});
                         }
                     } else if (el[0].is_string()) {
-                        auto toks = common_tokenize(model, el[0].get<std::string>(), false);
+                        auto toks = common_tokenize(vocab, el[0].get<std::string>(), false);
                         for (auto tok : toks) {
                             params.sampling.logit_bias.push_back({tok, bias});
                         }
@@ -522,15 +556,15 @@ struct server_task_result_cmpl_final : server_task_result {
 
     bool post_sampling_probs;
     std::vector<completion_token_output> probs_output;
+    std::vector<std::string> response_fields;
 
     slot_params generation_params;
 
     // OAI-compat fields
-    bool verbose = false;
-    bool oaicompat = false;
-    bool oaicompat_chat = true; // TODO: support oaicompat for non-chat
-    std::string oaicompat_model;
-    std::string oaicompat_cmpl_id;
+    bool           verbose = false;
+    oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE;
+    std::string    oaicompat_model;
+    std::string    oaicompat_cmpl_id;
 
     virtual int get_index() override {
         return index;
@@ -541,9 +575,16 @@ struct server_task_result_cmpl_final : server_task_result {
     }
 
     virtual json to_json() override {
-        return oaicompat
-            ? (stream ? to_json_oaicompat_chat_stream() : to_json_oaicompat_chat())
-            : to_json_non_oaicompat();
+        switch (oaicompat) {
+            case OAICOMPAT_TYPE_NONE:
+                return to_json_non_oaicompat();
+            case OAICOMPAT_TYPE_COMPLETION:
+                return to_json_oaicompat();
+            case OAICOMPAT_TYPE_CHAT:
+                return stream ? to_json_oaicompat_chat_stream() : to_json_oaicompat_chat();
+            default:
+                GGML_ASSERT(false && "Invalid oaicompat_type");
+        }
     }
 
     json to_json_non_oaicompat() {
@@ -568,6 +609,50 @@ struct server_task_result_cmpl_final : server_task_result {
         if (!stream && !probs_output.empty()) {
             res["completion_probabilities"] = completion_token_output::probs_vector_to_json(probs_output, post_sampling_probs);
         }
+        return response_fields.empty() ? res : json_get_nested_values(response_fields, res);
+    }
+
+    json to_json_oaicompat() {
+        std::time_t t = std::time(0);
+        json logprobs = json(nullptr); // OAI default to null
+        if (!stream && probs_output.size() > 0) {
+            logprobs = json{
+                {"content", completion_token_output::probs_vector_to_json(probs_output, post_sampling_probs)},
+            };
+        }
+        json finish_reason = "length";
+        if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) {
+            finish_reason = "stop";
+        }
+        json res = json {
+            {"choices", json::array({
+                json{
+                    {"text", stream ? "" : content}, // in stream mode, content is already in last partial chunk
+                    {"index", index},
+                    {"logprobs", logprobs},
+                    {"finish_reason", finish_reason},
+                }
+            })},
+            {"created", t},
+            {"model", oaicompat_model},
+            {"system_fingerprint", build_info},
+            {"object", "text_completion"},
+            {"usage", json {
+                {"completion_tokens", n_decoded},
+                {"prompt_tokens", n_prompt_tokens},
+                {"total_tokens", n_decoded + n_prompt_tokens}
+            }},
+            {"id", oaicompat_cmpl_id}
+        };
+
+        // extra fields for debugging purposes
+        if (verbose) {
+            res["__verbose"] = to_json_non_oaicompat();
+        }
+        if (timings.prompt_n >= 0) {
+            res.push_back({"timings", timings.to_json()});
+        }
+
         return res;
     }
 
@@ -595,10 +680,11 @@ struct server_task_result_cmpl_final : server_task_result {
         std::time_t t = std::time(0);
 
         json res = json {
-            {"choices", json::array({choice})},
-            {"created", t},
-            {"model", oaicompat_model},
-            {"object", "chat.completion"},
+            {"choices",            json::array({choice})},
+            {"created",            t},
+            {"model",              oaicompat_model},
+            {"system_fingerprint", build_info},
+            {"object",             "chat.completion"},
             {"usage", json {
                 {"completion_tokens", n_decoded},
                 {"prompt_tokens", n_prompt_tokens},
@@ -632,11 +718,12 @@ struct server_task_result_cmpl_final : server_task_result {
         };
 
         json ret = json {
-            {"choices", json::array({choice})},
-            {"created", t},
-            {"id", oaicompat_cmpl_id},
-            {"model", oaicompat_model},
-            {"object", "chat.completion.chunk"},
+            {"choices",            json::array({choice})},
+            {"created",            t},
+            {"id",                 oaicompat_cmpl_id},
+            {"model",              oaicompat_model},
+            {"system_fingerprint", build_info},
+            {"object",             "chat.completion.chunk"},
             {"usage", json {
                 {"completion_tokens", n_decoded},
                 {"prompt_tokens", n_prompt_tokens},
@@ -666,11 +753,10 @@ struct server_task_result_cmpl_partial : server_task_result {
     result_timings timings;
 
     // OAI-compat fields
-    bool verbose = false;
-    bool oaicompat = false;
-    bool oaicompat_chat = true; // TODO: support oaicompat for non-chat
-    std::string oaicompat_model;
-    std::string oaicompat_cmpl_id;
+    bool           verbose = false;
+    oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE;
+    std::string    oaicompat_model;
+    std::string    oaicompat_cmpl_id;
 
     virtual int get_index() override {
         return index;
@@ -681,7 +767,16 @@ struct server_task_result_cmpl_partial : server_task_result {
     }
 
    virtual json to_json() override {
-        return oaicompat ? to_json_oaicompat() : to_json_non_oaicompat();
+        switch (oaicompat) {
+            case OAICOMPAT_TYPE_NONE:
+                return to_json_non_oaicompat();
+            case OAICOMPAT_TYPE_COMPLETION:
+                return to_json_oaicompat();
+            case OAICOMPAT_TYPE_CHAT:
+                return to_json_oaicompat_chat();
+            default:
+                GGML_ASSERT(false && "Invalid oaicompat_type");
+        }
     }
 
     json to_json_non_oaicompat() {
@@ -706,6 +801,41 @@ struct server_task_result_cmpl_partial : server_task_result {
     }
 
     json to_json_oaicompat() {
+        std::time_t t = std::time(0);
+        json logprobs = json(nullptr); // OAI default to null
+        if (prob_output.probs.size() > 0) {
+            logprobs = json{
+                {"content", completion_token_output::probs_vector_to_json({prob_output}, post_sampling_probs)},
+            };
+        }
+        json res = json {
+            {"choices", json::array({
+                json{
+                    {"text", content},
+                    {"index", index},
+                    {"logprobs", logprobs},
+                    {"finish_reason", nullptr},
+                }
+            })},
+            {"created", t},
+            {"model", oaicompat_model},
+            {"system_fingerprint", build_info},
+            {"object", "text_completion"},
+            {"id", oaicompat_cmpl_id}
+        };
+
+        // extra fields for debugging purposes
+        if (verbose) {
+            res["__verbose"] = to_json_non_oaicompat();
+        }
+        if (timings.prompt_n >= 0) {
+            res.push_back({"timings", timings.to_json()});
+        }
+
+        return res;
+    }
+
+    json to_json_oaicompat_chat() {
         bool first = n_decoded == 0;
         std::time_t t = std::time(0);
         json choices;
@@ -761,11 +891,12 @@ struct server_task_result_cmpl_partial : server_task_result {
         }
 
         json ret = json {
-            {"choices", choices},
-            {"created", t},
-            {"id", oaicompat_cmpl_id},
-            {"model", oaicompat_model},
-            {"object", "chat.completion.chunk"}
+            {"choices",            choices},
+            {"created",            t},
+            {"id",                 oaicompat_cmpl_id},
+            {"model",              oaicompat_model},
+            {"system_fingerprint", build_info},
+            {"object",             "chat.completion.chunk"}
        };
 
        if (timings.prompt_n >= 0) {
@@ -783,14 +914,16 @@ struct server_task_result_embd : server_task_result {
     int32_t n_tokens;
 
     // OAI-compat fields
-    bool oaicompat = false;
+    oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE;
 
     virtual int get_index() override {
         return index;
     }
 
     virtual json to_json() override {
-        return oaicompat ? to_json_oaicompat() : to_json_non_oaicompat();
+        return oaicompat == OAICOMPAT_TYPE_EMBEDDING
+            ? to_json_oaicompat()
+            : to_json_non_oaicompat();
     }
 
     json to_json_non_oaicompat() {
@@ -1003,6 +1136,8 @@ struct server_slot {
 
     common_speculative * spec = nullptr;
 
+    std::vector<common_adapter_lora_info> lora;
+
     // the index relative to completion multi-task request
     size_t index = 0;
 
@@ -1084,6 +1219,11 @@ struct server_slot {
         return task_type == SERVER_TASK_TYPE_EMBEDDING || task_type == SERVER_TASK_TYPE_RERANK;
     }
 
+    bool can_batch_with(server_slot & other_slot) {
+        return is_non_causal() == other_slot.is_non_causal()
+            && are_lora_equal(lora, other_slot.lora);
+    }
+
     bool has_budget(const common_params & global_params) {
         if (params.n_predict == -1 && global_params.n_predict == -1) {
             return true; // limitless
@@ -1465,6 +1605,30 @@ struct server_response {
         // should never reach here
     }
 
+    // same as recv(), but have timeout in seconds
+    // if timeout is reached, nullptr is returned
+    server_task_result_ptr recv_with_timeout(const std::unordered_set<int> & id_tasks, int timeout) {
+        while (true) {
+            std::unique_lock<std::mutex> lock(mutex_results);
+            bool cr_res = condition_results.wait_for(lock, std::chrono::seconds(timeout), [&]{
+                return !queue_results.empty();
+            });
+            if (!cr_res) {
+                return nullptr;
+            }
+
+            for (int i = 0; i < (int) queue_results.size(); i++) {
+                if (id_tasks.find(queue_results[i]->id) != id_tasks.end()) {
+                    server_task_result_ptr res = std::move(queue_results[i]);
+                    queue_results.erase(queue_results.begin() + i);
+                    return res;
+                }
+            }
+        }
+
+        // should never reach here
+    }
+
     // single-task version of recv()
     server_task_result_ptr recv(int id_task) {
         std::unordered_set<int> id_tasks = {id_task};
@@ -1491,11 +1655,17 @@
 struct server_context {
     common_params params_base;
 
+    // note: keep these alive - they determine the lifetime of the model, context, etc.
+    common_init_result llama_init;
+    common_init_result llama_init_dft;
+
     llama_model * model = nullptr;
     llama_context * ctx = nullptr;
-    std::vector<common_lora_adapter_container> loras;
+
+    const llama_vocab * vocab = nullptr;
 
     llama_model * model_dft = nullptr;
+
     llama_context_params cparams_dft;
 
     llama_batch batch = {};
@@ -1519,21 +1689,6 @@ struct server_context {
     float slot_prompt_similarity = 0.0f;
 
     ~server_context() {
-        if (ctx) {
-            llama_free(ctx);
-            ctx = nullptr;
-        }
-
-        if (model) {
-            llama_free_model(model);
-            model = nullptr;
-        }
-
-        if (model_dft) {
-            llama_free_model(model_dft);
-            model_dft = nullptr;
-        }
-
         // Clear any sampling context
         for (server_slot & slot : slots) {
             common_sampler_free(slot.smpl);
@@ -1556,21 +1711,22 @@ struct server_context {
 
         params_base = params;
 
-        common_init_result llama_init = common_init_from_params(params_base);
+        llama_init = common_init_from_params(params_base);
 
-        model = llama_init.model;
-        ctx = llama_init.context;
-        loras = llama_init.lora_adapters;
+        model = llama_init.model.get();
+        ctx = llama_init.context.get();
 
        if (model == nullptr) {
            SRV_ERR("failed to load model, '%s'\n", params_base.model.c_str());
            return false;
        }
 
+        vocab = llama_model_get_vocab(model);
+
        n_ctx = llama_n_ctx(ctx);
 
-        add_bos_token = llama_add_bos_token(model);
-        has_eos_token = llama_token_eos(model) != LLAMA_TOKEN_NULL;
+        add_bos_token = llama_vocab_get_add_bos(vocab);
+        has_eos_token = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
 
        if (!params_base.speculative.model.empty()) {
            SRV_INF("loading draft model '%s'\n", params_base.speculative.model.c_str());
@@ -1583,25 +1739,22 @@ struct server_context {
             params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
             params_dft.n_parallel = 1;
 
-            common_init_result llama_init_dft = common_init_from_params(params_dft);
+            llama_init_dft = common_init_from_params(params_dft);
 
-            model_dft = llama_init_dft.model;
+            model_dft = llama_init_dft.model.get();
 
            if (model_dft == nullptr) {
                SRV_ERR("failed to load draft model, '%s'\n", params_base.speculative.model.c_str());
                return false;
            }
 
-            if (!common_speculative_are_compatible(ctx, llama_init_dft.context)) {
+            if (!common_speculative_are_compatible(ctx, llama_init_dft.context.get())) {
                SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params_base.speculative.model.c_str(), params_base.model.c_str());
 
-                llama_free (llama_init_dft.context);
-                llama_free_model(llama_init_dft.model);
-
                return false;
            }
 
-            const int n_ctx_dft = llama_n_ctx(llama_init_dft.context);
+            const int n_ctx_dft = llama_n_ctx(llama_init_dft.context.get());
 
            cparams_dft = common_context_params_to_llama(params_dft);
            cparams_dft.n_batch = n_ctx_dft;
@@ -1609,25 +1762,16 @@ struct server_context {
             // force F16 KV cache for the draft model for extra performance
             cparams_dft.type_k = GGML_TYPE_F16;
             cparams_dft.type_v = GGML_TYPE_F16;
-
-            // the context is not needed - we will create one for each slot
-            llama_free(llama_init_dft.context);
        }
 
        return true;
    }
 
-    bool validate_model_chat_template() const {
-        std::vector<char> model_template(2048, 0); // longest known template is about 1200 bytes
-        std::string template_key = "tokenizer.chat_template";
-        int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
-        if (res >= 0) {
-            llama_chat_message chat[] = {{"user", "test"}};
-            std::string tmpl = std::string(model_template.data(), model_template.size());
-            int32_t chat_res = llama_chat_apply_template(model, tmpl.c_str(), chat, 1, true, nullptr, 0);
-            return chat_res > 0;
-        }
-        return false;
+    bool validate_builtin_chat_template() const {
+        llama_chat_message chat[] = {{"user", "test"}};
+        const char * tmpl = llama_model_chat_template(model);
+        const int32_t chat_res = llama_chat_apply_template(tmpl, chat, 1, true, nullptr, 0);
+        return chat_res > 0;
    }
 
    void init() {
@@ -1646,7 +1790,7 @@ struct server_context {
            if (model_dft) {
                slot.batch_spec = llama_batch_init(params_base.speculative.n_max + 1, 0, 1);
 
-                slot.ctx_dft = llama_new_context_with_model(model_dft, cparams_dft);
+                slot.ctx_dft = llama_init_from_model(model_dft, cparams_dft);
                if (slot.ctx_dft == nullptr) {
                    SRV_ERR("%s", "failed to create draft context\n");
                    return;
@@ -1766,6 +1910,12 @@ struct server_context {
        slot.params = std::move(task.params);
        slot.prompt_tokens = std::move(task.prompt_tokens);
 
+        if (!are_lora_equal(task.params.lora, slot.lora)) {
+            // if lora is changed, we cannot reuse cached tokens
+            slot.cache_tokens.clear();
+            slot.lora = task.params.lora;
+        }
+
        SLT_DBG(slot, "launching slot : %s\n", safe_json_to_str(slot.to_json()).c_str());
 
        if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) {
@@ -1775,7 +1925,7 @@ struct server_context {
        }
 
        if (slot.params.ignore_eos && has_eos_token) {
-            slot.params.sampling.logit_bias.push_back({llama_token_eos(model), -INFINITY});
+            slot.params.sampling.logit_bias.push_back({llama_vocab_eos(vocab), -INFINITY});
        }
 
        {
@@ -1850,6 +2000,8 @@ struct server_context {
            result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
            slot.n_sent_text += result.text_to_send.size();
            // add the token to slot queue and cache
+        } else {
+            result.text_to_send = "";
        }
 
        slot.add_token(result);
@@ -1929,14 +2081,14 @@ struct server_context {
                slot.n_decoded, slot.n_prompt_tokens, slot.n_past, slot.n_ctx);
        }
 
-        if (llama_token_is_eog(model, result.tok)) {
+        if (llama_vocab_is_eog(vocab, result.tok)) {
            slot.stop = STOP_TYPE_EOS;
            slot.has_next_token = false;
 
            SLT_DBG(slot, "%s", "stopped by EOS\n");
        }
 
-        const auto n_ctx_train = llama_n_ctx_train(model);
+        const auto n_ctx_train = llama_model_n_ctx_train(model);
 
        if (slot.params.n_predict < 1 && slot.n_predict < 1 && slot.n_prompt_tokens + slot.n_decoded >= n_ctx_train) {
            slot.truncated = true;
@@ -1956,7 +2108,7 @@ struct server_context {
 
    void populate_token_probs(const server_slot & slot, completion_token_output & result, bool post_sampling, bool special, int idx) {
        size_t n_probs = slot.params.sampling.n_probs;
-        size_t n_vocab = llama_n_vocab(llama_get_model(ctx));
+        size_t n_vocab = llama_vocab_n_tokens(vocab);
        if (post_sampling) {
            const auto * cur_p = common_sampler_get_candidates(slot.smpl);
            const size_t max_probs = cur_p->size;
@@ -2036,7 +2188,6 @@ struct server_context {
 
        res->verbose = slot.params.verbose;
        res->oaicompat = slot.params.oaicompat;
-        res->oaicompat_chat = slot.params.oaicompat_chat;
        res->oaicompat_model = slot.params.oaicompat_model;
        res->oaicompat_cmpl_id = slot.params.oaicompat_cmpl_id;
 
@@ -2063,6 +2214,7 @@ struct server_context {
        res->tokens = slot.generated_tokens;
        res->timings = slot.get_timings();
        res->prompt = common_detokenize(ctx, slot.prompt_tokens, true);
+        res->response_fields = slot.params.response_fields;
 
        res->truncated = slot.truncated;
        res->n_decoded = slot.n_decoded;
@@ -2076,7 +2228,6 @@ struct server_context {
        res->verbose = slot.params.verbose;
        res->stream = slot.params.stream;
        res->oaicompat = slot.params.oaicompat;
-        res->oaicompat_chat = slot.params.oaicompat_chat;
        res->oaicompat_model = slot.params.oaicompat_model;
        res->oaicompat_cmpl_id = slot.params.oaicompat_cmpl_id;
 
@@ -2108,7 +2259,7 @@ struct server_context {
        res->n_tokens = slot.n_prompt_tokens;
        res->oaicompat = slot.params.oaicompat;
 
-        const int n_embd = llama_n_embd(model);
+        const int n_embd = llama_model_n_embd(model);
 
        std::vector<float> embd_res(n_embd, 0.0f);
 
@@ -2198,10 +2349,21 @@ struct server_context {
    void receive_multi_results(
            const std::unordered_set<int> & id_tasks,
            const std::function<void(std::vector<server_task_result_ptr>&)> & result_handler,
-            const std::function<void(json)> & error_handler) {
+            const std::function<void(json)> & error_handler,
+            const std::function<bool()> & is_connection_closed) {
        std::vector<server_task_result_ptr> results(id_tasks.size());
-        for (size_t i = 0; i < id_tasks.size(); i++) {
-            server_task_result_ptr result = queue_results.recv(id_tasks);
+        for (int i = 0; i < (int)id_tasks.size(); i++) {
+            server_task_result_ptr result = queue_results.recv_with_timeout(id_tasks, HTTP_POLLING_SECONDS);
+
+            if (is_connection_closed()) {
+                cancel_tasks(id_tasks);
+                return;
+            }
+
+            if (result == nullptr) {
+                i--; // retry
+                continue;
+            }
 
            if (result->is_error()) {
                error_handler(result->to_json());
@@ -2225,10 +2387,20 @@ struct server_context {
    void receive_cmpl_results_stream(
            const std::unordered_set<int> & id_tasks,
            const std::function<bool(server_task_result_ptr&)> & result_handler,
-            const std::function<void(json)> & error_handler) {
+            const std::function<void(json)> & error_handler,
+            const std::function<bool()> & is_connection_closed) {
        size_t n_finished = 0;
        while (true) {
-            server_task_result_ptr result = queue_results.recv(id_tasks);
+            server_task_result_ptr result = queue_results.recv_with_timeout(id_tasks, HTTP_POLLING_SECONDS);
+
+            if (is_connection_closed()) {
+                cancel_tasks(id_tasks);
+                return;
+            }
+
+            if (result == nullptr) {
+                continue; // retry
+            }
 
            if (result->is_error()) {
                error_handler(result->to_json());
@@ -2456,7 +2628,7 @@ struct server_context {
                } break;
            case SERVER_TASK_TYPE_SET_LORA:
                {
-                    common_lora_adapters_apply(ctx, loras);
+                    params_base.lora_adapters = std::move(task.set_lora);
                    auto res = std::make_unique<server_task_result_apply_lora>();
                    res->id = task.id;
                    queue_results.send(std::move(res));
@@ -2533,12 +2705,22 @@ struct server_context {
        // start populating the batch for this iteration
        common_batch_clear(batch);
 
+        // track if given slot can be batched with slots already in the batch
+        server_slot * slot_batched = nullptr;
+
        // frist, add sampled tokens from any ongoing sequences
        for (auto & slot : slots) {
            if (slot.state != SLOT_STATE_GENERATING) {
                continue;
            }
 
+            // check if we can batch this slot with the previous one
+            if (!slot_batched) {
+                slot_batched = &slot;
+            } else if (!slot_batched->can_batch_with(slot)) {
+                continue;
+            }
+
            slot.i_batch = batch.n_tokens;
 
            common_batch_add(batch, slot.sampled, slot.n_past, { slot.id }, true);
@@ -2557,15 +2739,18 @@ struct server_context {
        int32_t n_batch = llama_n_batch(ctx);
        int32_t n_ubatch = llama_n_ubatch(ctx);
 
-        // track if this is an embedding or non-embedding batch
-        // if we've added sampled tokens above, we are in non-embedding mode
-        // -1: none, 0: non-embedding, 1: embedding
-        // TODO: make enum
-        int32_t batch_type = batch.n_tokens > 0 ? 0 : -1;
-
        // next, batch any pending prompts without exceeding n_batch
        if (params_base.cont_batching || batch.n_tokens == 0) {
            for (auto & slot : slots) {
+                // check if we can batch this slot with the previous one
+                if (slot.is_processing()) {
+                    if (!slot_batched) {
+                        slot_batched = &slot;
+                    } else if (!slot_batched->can_batch_with(slot)) {
+                        continue;
+                    }
+                }
+
                // this slot still has a prompt to be processed
                if (slot.state == SLOT_STATE_PROCESSING_PROMPT || slot.state == SLOT_STATE_STARTED) {
                    auto & prompt_tokens = slot.prompt_tokens;
@@ -2726,14 +2911,6 @@ struct server_context {
                        }
                    }
 
-                    // check that we are in the right batch_type, if not defer the slot
-                    int slot_type = slot.is_non_causal();
-                    if (batch_type == -1) {
-                        batch_type = slot_type;
-                    } else if (batch_type != slot_type) {
-                        continue;
-                    }
-
                    // keep only the common part
                    if (!llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1)) {
                        // could not partially delete (likely using a non-Transformer model)
@@ -2801,8 +2978,12 @@ struct server_context {
 
        SRV_DBG("decoding batch, n_tokens = %d\n", batch.n_tokens);
 
-        // make sure we're in the right embedding mode
-        llama_set_embeddings(ctx, batch_type == 1);
+        if (slot_batched) {
+            // make sure we're in the right embedding mode
+            llama_set_embeddings(ctx, slot_batched->is_non_causal());
+            // apply lora, only need to do it once per batch
+            common_set_adapter_lora(ctx, slot_batched->lora);
+        }
 
        // process the created batch of tokens
        for (int32_t i = 0; i < batch.n_tokens; i += n_batch) {
@@ -3003,12 +3184,12 @@ struct server_context {
 
    json model_meta() const {
        return json {
-            {"vocab_type", llama_vocab_type (model)},
-            {"n_vocab", llama_n_vocab (model)},
-            {"n_ctx_train", llama_n_ctx_train (model)},
-            {"n_embd", llama_n_embd (model)},
-            {"n_params", llama_model_n_params(model)},
-            {"size", llama_model_size (model)},
+            {"vocab_type", llama_vocab_type (vocab)},
+            {"n_vocab", llama_vocab_n_tokens (vocab)},
+            {"n_ctx_train", llama_model_n_ctx_train(model)},
+            {"n_embd", llama_model_n_embd (model)},
+            {"n_params", llama_model_n_params (model)},
+            {"size", llama_model_size (model)},
        };
    }
};
@@ -3475,7 +3656,8 @@ int main(int argc, char ** argv) {
            { "default_generation_settings", ctx_server.default_generation_settings_for_props },
            { "total_slots", ctx_server.params_base.n_parallel },
            { "model_path", ctx_server.params_base.model },
-            { "chat_template", llama_get_chat_template(ctx_server.model) },
+            { "chat_template", common_get_builtin_chat_template(ctx_server.model) },
+            { "build_info", build_info },
        };
 
        res_ok(res, data);
@@ -3496,12 +3678,12 @@ int main(int argc, char ** argv) {
 
    // handle completion-like requests (completion, chat, infill)
    // we can optionally provide a custom format for partial results and final results
-    const auto handle_completions_generic = [&ctx_server, &res_error, &res_ok](
+    const auto handle_completions_impl = [&ctx_server, &res_error, &res_ok](
            server_task_type type,
            json & data,
+            std::function<bool()> is_connection_closed,
            httplib::Response & res,
-            bool oaicompat = false,
-            bool oaicompat_chat = false) {
+            oaicompat_type oaicompat) {
        GGML_ASSERT(type == SERVER_TASK_TYPE_COMPLETION || type == SERVER_TASK_TYPE_INFILL);
 
        if (ctx_server.params_base.embedding) {
@@ -3513,7 +3695,7 @@ int main(int argc, char ** argv) {
        std::vector<server_task> tasks;
 
        try {
-            std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.ctx, data.at("prompt"), true, true);
+            std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, data.at("prompt"), true, true);
            tasks.reserve(tokenized_prompts.size());
            for (size_t i = 0; i < tokenized_prompts.size(); i++) {
                server_task task = server_task(type);
@@ -3522,13 +3704,15 @@ int main(int argc, char ** argv) {
                task.index = i;
 
                task.prompt_tokens = std::move(tokenized_prompts[i]);
-                task.params = server_task::params_from_json_cmpl(ctx_server.model, ctx_server.ctx, ctx_server.params_base, data);
+                task.params = server_task::params_from_json_cmpl(
+                    ctx_server.ctx,
+                    ctx_server.params_base,
+                    data);
                task.id_selected_slot = json_value(data, "id_slot", -1);
 
                // OAI-compat
-                task.params.oaicompat = oaicompat;
-                task.params.oaicompat_chat = oaicompat_chat;
-                task.params.oaicompat_cmpl_id = completion_id;
+                task.params.oaicompat         = oaicompat;
+                task.params.oaicompat_cmpl_id = completion_id;
                // oaicompat_model is already populated by params_from_json_cmpl
 
                tasks.push_back(task);
@@ -3559,7 +3743,7 @@ int main(int argc, char ** argv) {
                }
            }, [&](const json & error_data) {
                res_error(res, error_data);
-            });
+            }, is_connection_closed);
 
            ctx_server.queue_results.remove_waiting_task_ids(task_ids);
        } else {
@@ -3569,6 +3753,7 @@ int main(int argc, char ** argv) {
                if (res_json.is_array()) {
                    for (const auto & res : res_json) {
                        if (!server_sent_event(sink, "data", res)) {
+                            // sending failed (HTTP connection closed), cancel the generation
                            return false;
                        }
                    }
@@ -3578,8 +3763,11 @@ int main(int argc, char ** argv) {
                    }
                }
            }, [&](const json & error_data) {
                server_sent_event(sink, "error", error_data);
+            }, [&sink]() {
+                // note: do not use req.is_connection_closed here because req is already destroyed
+                return !sink.is_writable();
            });
-            if (oaicompat) {
+            if (oaicompat != OAICOMPAT_TYPE_NONE) {
                static const std::string ev_done = "data: [DONE]\n\n";
                sink.write(ev_done.data(), ev_done.size());
            }
@@ -3595,26 +3783,36 @@ int main(int argc, char ** argv) {
        }
    };
 
-    const auto handle_completions = [&handle_completions_generic](const httplib::Request & req, httplib::Response & res) {
+    const auto handle_completions = [&handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
        json data = json::parse(req.body);
-        return handle_completions_generic(
+        return handle_completions_impl(
+            SERVER_TASK_TYPE_COMPLETION,
+            data,
+            req.is_connection_closed,
+            res,
+            OAICOMPAT_TYPE_NONE);
+    };
+
+    const auto handle_completions_oai = [&handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
+        json data = oaicompat_completion_params_parse(json::parse(req.body));
+        return handle_completions_impl(
            SERVER_TASK_TYPE_COMPLETION,
            data,
+            req.is_connection_closed,
            res,
-            /* oaicompat */ false,
-            /* oaicompat_chat */ false);
+            OAICOMPAT_TYPE_COMPLETION);
    };
 
-    const auto handle_infill = [&ctx_server, &res_error, &handle_completions_generic](const httplib::Request & req, httplib::Response & res) {
+    const auto handle_infill = [&ctx_server, &res_error, &handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
        // check model compatibility
        std::string err;
-        if (llama_token_fim_pre(ctx_server.model) == LLAMA_TOKEN_NULL) {
+        if (llama_vocab_fim_pre(ctx_server.vocab) == LLAMA_TOKEN_NULL) {
            err += "prefix token is missing. ";
        }
-        if (llama_token_fim_suf(ctx_server.model) == LLAMA_TOKEN_NULL) {
+        if (llama_vocab_fim_suf(ctx_server.vocab) == LLAMA_TOKEN_NULL) {
            err += "suffix token is missing. ";
        }
-        if (llama_token_fim_mid(ctx_server.model) == LLAMA_TOKEN_NULL) {
+        if (llama_vocab_fim_mid(ctx_server.vocab) == LLAMA_TOKEN_NULL) {
            err += "middle token is missing. ";
        }
        if (!err.empty()) {
@@ -3660,10 +3858,10 @@ int main(int argc, char ** argv) {
        data["input_extra"] = input_extra; // default to empty array if it's not exist
 
        std::string prompt = json_value(data, "prompt", std::string());
-        std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.ctx, prompt, true, true);
+        std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, false, true);
        SRV_DBG("creating infill tasks, n_prompts = %d\n", (int) tokenized_prompts.size());
        data["prompt"] = format_infill(
-            ctx_server.ctx,
+            ctx_server.vocab,
            data.at("input_prefix"),
            data.at("input_suffix"),
            data.at("input_extra"),
@@ -3674,22 +3872,27 @@ int main(int argc, char ** argv) {
            tokenized_prompts[0]
        );
 
-        return handle_completions_generic(SERVER_TASK_TYPE_INFILL, data, res);
+        return handle_completions_impl(
+            SERVER_TASK_TYPE_INFILL,
+            data,
+            req.is_connection_closed,
+            res,
+            OAICOMPAT_TYPE_NONE); // infill is not OAI compatible
    };
 
-    const auto handle_chat_completions = [&ctx_server, &params, &res_error, &handle_completions_generic](const httplib::Request & req, httplib::Response & res) {
+    const auto handle_chat_completions = [&ctx_server, &params, &res_error, &handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
        if (ctx_server.params_base.embedding) {
            res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
            return;
        }
 
-        json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template);
-        return handle_completions_generic(
+        json data = oaicompat_chat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template);
+        return handle_completions_impl(
            SERVER_TASK_TYPE_COMPLETION,
            data,
+            req.is_connection_closed,
            res,
-            /* oaicompat */ true,
-            /* oaicompat_chat */ true);
+            OAICOMPAT_TYPE_CHAT);
    };
 
    const auto handle_models = [&params, &ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) {
@@ -3697,7 +3900,7 @@ int main(int argc, char ** argv) {
            {"object", "list"},
            {"data", {
                {
-                    {"id", params.model_alias},
+                    {"id", params.model_alias.empty() ? params.model : params.model_alias},
                    {"object", "model"},
                    {"created", std::time(0)},
                    {"owned_by", "llamacpp"},
@@ -3717,7 +3920,7 @@ int main(int argc, char ** argv) {
            const bool add_special = json_value(body, "add_special", false);
            const bool with_pieces = json_value(body, "with_pieces", false);
 
-            llama_tokens tokens = tokenize_mixed(ctx_server.ctx, body.at("content"), add_special, true);
+            llama_tokens tokens = tokenize_mixed(ctx_server.vocab, body.at("content"), add_special, true);
 
            if (with_pieces) {
                for (const auto& token : tokens) {
@@ -3762,10 +3965,10 @@ int main(int argc, char ** argv) {
        res_ok(res, data);
    };
 
-    const auto handle_embeddings_impl = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res, bool oaicompat) {
+    const auto handle_embeddings_impl = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res, oaicompat_type oaicompat) {
        const json body = json::parse(req.body);
 
-        if (oaicompat && llama_pooling_type(ctx_server.ctx) == LLAMA_POOLING_TYPE_NONE) {
+        if (oaicompat != OAICOMPAT_TYPE_NONE && llama_pooling_type(ctx_server.ctx) == LLAMA_POOLING_TYPE_NONE) {
            res_error(res, format_error_response("Pooling type 'none' is not OAI compatible. Please use a different pooling type", ERROR_TYPE_INVALID_REQUEST));
            return;
        }
@@ -3775,14 +3978,25 @@ int main(int argc, char ** argv) {
        if (body.count("input") != 0) {
            prompt = body.at("input");
        } else if (body.contains("content")) {
-            oaicompat = false;
+            oaicompat = OAICOMPAT_TYPE_NONE; // "content" field is not OAI compatible
            prompt = body.at("content");
        } else {
            res_error(res, format_error_response("\"input\" or \"content\" must be provided", ERROR_TYPE_INVALID_REQUEST));
            return;
        }
 
-        std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.ctx, prompt, true, true);
+        bool use_base64 = false;
+        if (body.count("encoding_format") != 0) {
+            const std::string& format = body.at("encoding_format");
+            if (format == "base64") {
+                use_base64 = true;
+            } else if (format != "float") {
+                res_error(res, format_error_response("The format to return the embeddings in. Can be either float or base64", ERROR_TYPE_INVALID_REQUEST));
+                return;
+            }
+        }
+
+        std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true);
        for (const auto & tokens : tokenized_prompts) {
            // this check is necessary for models that do not add BOS token to the input
            if (tokens.empty()) {
@@ -3823,7 +4037,7 @@ int main(int argc, char ** argv) {
            }, [&](const json & error_data) {
                res_error(res, error_data);
                error = true;
-            });
+            }, req.is_connection_closed);
 
            ctx_server.queue_results.remove_waiting_task_ids(task_ids);
        }
@@ -3833,16 +4047,18 @@ int main(int argc, char ** argv) {
        }
 
        // write JSON response
-        json root = oaicompat ? format_embeddings_response_oaicompat(body, responses) : json(responses);
+        json root = oaicompat == OAICOMPAT_TYPE_EMBEDDING
+            ? format_embeddings_response_oaicompat(body, responses, use_base64)
+            : json(responses);
        res_ok(res, root);
    };
 
    const auto handle_embeddings = [&handle_embeddings_impl](const httplib::Request & req, httplib::Response & res) {
-        handle_embeddings_impl(req, res, false);
+        handle_embeddings_impl(req, res, OAICOMPAT_TYPE_NONE);
    };
 
    const auto handle_embeddings_oai = [&handle_embeddings_impl](const httplib::Request & req, httplib::Response & res) {
-        handle_embeddings_impl(req, res, true);
+        handle_embeddings_impl(req, res, OAICOMPAT_TYPE_EMBEDDING);
    };
 
    const auto handle_rerank = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) {
@@ -3880,20 +4096,20 @@ int main(int argc, char ** argv) {
            return;
        }
 
-        llama_tokens tokenized_query = tokenize_input_prompts(ctx_server.ctx, query, /* add_special */ false, true)[0];
+        llama_tokens tokenized_query = tokenize_input_prompts(ctx_server.vocab, query, /* add_special */ false, true)[0];
 
        // create and queue the task
        json responses = json::array();
        bool error = false;
        {
            std::vector<server_task> tasks;
-            std::vector<llama_tokens> tokenized_docs = tokenize_input_prompts(ctx_server.ctx, documents, /* add_special */ false, true);
+            std::vector<llama_tokens> tokenized_docs = tokenize_input_prompts(ctx_server.vocab, documents, /* add_special */ false, true);
            tasks.reserve(tokenized_docs.size());
            for (size_t i = 0; i < tokenized_docs.size(); i++) {
                server_task task = server_task(SERVER_TASK_TYPE_RERANK);
                task.id = ctx_server.queue_tasks.get_new_id();
                task.index = i;
-                task.prompt_tokens = format_rerank(ctx_server.model, tokenized_query, tokenized_docs[i]);
+                task.prompt_tokens = format_rerank(ctx_server.vocab, tokenized_query, tokenized_docs[i]);
                tasks.push_back(task);
            }
 
@@ -3911,7 +4127,7 @@ int main(int argc, char ** argv) {
            }, [&](const json & error_data) {
                res_error(res, error_data);
                error = true;
-            });
+            }, req.is_connection_closed);
        }
 
        if (error) {
@@ -3925,8 +4141,9 @@ int main(int argc, char ** argv) {
 
    const auto handle_lora_adapters_list = [&](const httplib::Request &, httplib::Response & res) {
        json result = json::array();
-        for (size_t i = 0; i < ctx_server.loras.size(); ++i) {
-            auto & lora = ctx_server.loras[i];
+        const auto & loras = ctx_server.params_base.lora_adapters;
+        for (size_t i = 0; i < loras.size(); ++i) {
+            auto & lora = loras[i];
            result.push_back({
                {"id", i},
                {"path", lora.path},
@@ -3938,27 +4155,14 @@ int main(int argc, char ** argv) {
    };
 
    const auto handle_lora_adapters_apply = [&](const httplib::Request & req, httplib::Response & res) {
-        const std::vector<json> body = json::parse(req.body);
-        int max_idx = ctx_server.loras.size();
-
-        // clear existing value
-        for (auto & lora : ctx_server.loras) {
-            lora.scale = 0.0f;
-        }
-
-        // set value
-        for (auto entry : body) {
-            int id = entry.at("id");
-            float scale = entry.at("scale");
-            if (0 <= id && id < max_idx) {
-                ctx_server.loras[id].scale = scale;
-            } else {
-                throw std::runtime_error("invalid adapter id");
-            }
+        const json body = json::parse(req.body);
+        if (!body.is_array()) {
+            res_error(res, format_error_response("Request body must be an array", ERROR_TYPE_INVALID_REQUEST));
+            return;
        }
-
        server_task task(SERVER_TASK_TYPE_SET_LORA);
        task.id = ctx_server.queue_tasks.get_new_id();
+        task.set_lora = parse_lora_request(ctx_server.params_base.lora_adapters, body);
        ctx_server.queue_results.add_waiting_task_id(task.id);
        ctx_server.queue_tasks.post(task);
 
@@ -4012,7 +4216,7 @@ int main(int argc, char ** argv) {
    svr->Get ("/v1/models", handle_models); // public endpoint (no API key check)
    svr->Post("/completion", handle_completions); // legacy
    svr->Post("/completions", handle_completions);
-    svr->Post("/v1/completions", handle_completions);
+    svr->Post("/v1/completions", handle_completions_oai);
    svr->Post("/chat/completions", handle_chat_completions);
    svr->Post("/v1/chat/completions", handle_chat_completions);
    svr->Post("/infill", handle_infill);
@@ -4092,14 +4296,16 @@ int main(int argc, char ** argv) {
 
    // if a custom chat template is not supplied, we will use the one that comes with the model (if any)
    if (params.chat_template.empty()) {
-        if (!ctx_server.validate_model_chat_template()) {
+        if (!ctx_server.validate_builtin_chat_template()) {
            LOG_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__);
            params.chat_template = "chatml";
        }
    }
 
    // print sample chat example to make it clear which template is used
-    LOG_INF("%s: chat template, built_in: %d, chat_example: '%s'\n", __func__, params.chat_template.empty(), common_chat_format_example(ctx_server.model, params.chat_template).c_str());
+    LOG_INF("%s: chat template, chat_template: %s, example_format: '%s'\n", __func__,
+        params.chat_template.empty() ? "(built-in)" : params.chat_template.c_str(),
+        common_chat_format_example(ctx_server.model, params.chat_template).c_str());
 
    ctx_server.queue_tasks.on_new_task(std::bind(
        &server_context::process_single_task, &ctx_server, std::placeholders::_1));