@fugood/llama.node 0.3.6 → 0.3.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +17 -2
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +3 -1
- package/lib/index.js +16 -1
- package/lib/index.ts +16 -0
- package/package.json +1 -1
- package/src/EmbeddingWorker.cpp +4 -3
- package/src/LlamaCompletionWorker.cpp +4 -2
- package/src/LlamaContext.cpp +61 -6
- package/src/LlamaContext.h +1 -0
- package/src/common.hpp +6 -11
- package/src/llama.cpp/.github/workflows/build.yml +19 -17
- package/src/llama.cpp/.github/workflows/docker.yml +77 -30
- package/src/llama.cpp/.github/workflows/editorconfig.yml +3 -1
- package/src/llama.cpp/.github/workflows/server.yml +22 -3
- package/src/llama.cpp/CMakeLists.txt +49 -24
- package/src/llama.cpp/common/arg.cpp +82 -26
- package/src/llama.cpp/common/arg.h +3 -0
- package/src/llama.cpp/common/common.cpp +192 -72
- package/src/llama.cpp/common/common.h +51 -18
- package/src/llama.cpp/common/ngram-cache.cpp +12 -12
- package/src/llama.cpp/common/ngram-cache.h +2 -2
- package/src/llama.cpp/common/sampling.cpp +11 -6
- package/src/llama.cpp/common/speculative.cpp +18 -15
- package/src/llama.cpp/docs/build.md +2 -0
- package/src/llama.cpp/examples/batched/batched.cpp +9 -7
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +3 -3
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +10 -8
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +11 -8
- package/src/llama.cpp/examples/cvector-generator/mean.hpp +1 -1
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +8 -7
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +7 -6
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +8 -7
- package/src/llama.cpp/examples/gguf/gguf.cpp +10 -6
- package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +1 -0
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +8 -7
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +13 -10
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +13 -12
- package/src/llama.cpp/examples/infill/infill.cpp +23 -24
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +44 -13
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -6
- package/src/llama.cpp/examples/llava/clip.cpp +4 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +9 -6
- package/src/llama.cpp/examples/llava/llava.cpp +2 -2
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +8 -4
- package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +11 -8
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +6 -7
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +4 -9
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +3 -7
- package/src/llama.cpp/examples/lookup/lookup.cpp +5 -6
- package/src/llama.cpp/examples/main/main.cpp +51 -29
- package/src/llama.cpp/examples/parallel/parallel.cpp +5 -6
- package/src/llama.cpp/examples/passkey/passkey.cpp +7 -5
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +37 -23
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -14
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +8 -8
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +12 -0
- package/src/llama.cpp/examples/run/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +1351 -0
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +114 -0
- package/src/llama.cpp/examples/run/run.cpp +175 -61
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +4 -25
- package/src/llama.cpp/examples/server/CMakeLists.txt +1 -0
- package/src/llama.cpp/examples/server/httplib.h +1295 -409
- package/src/llama.cpp/examples/server/server.cpp +387 -181
- package/src/llama.cpp/examples/server/tests/requirements.txt +1 -0
- package/src/llama.cpp/examples/server/utils.hpp +170 -58
- package/src/llama.cpp/examples/simple/simple.cpp +9 -8
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +16 -12
- package/src/llama.cpp/examples/speculative/speculative.cpp +22 -23
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +8 -12
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +17 -5
- package/src/llama.cpp/examples/tts/tts.cpp +64 -23
- package/src/llama.cpp/ggml/CMakeLists.txt +5 -21
- package/src/llama.cpp/ggml/include/ggml-backend.h +2 -0
- package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -0
- package/src/llama.cpp/ggml/include/ggml.h +36 -145
- package/src/llama.cpp/ggml/include/gguf.h +202 -0
- package/src/llama.cpp/ggml/src/CMakeLists.txt +6 -3
- package/src/llama.cpp/ggml/src/ggml-alloc.c +5 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +0 -1
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +79 -49
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +5 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +33 -23
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +57 -72
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +87 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +335 -66
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +10 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1090 -378
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +2 -2
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +3 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +3 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +3 -1
- package/src/llama.cpp/ggml/src/ggml-impl.h +11 -16
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +16 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +6 -6
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +154 -35
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +9 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +18 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +1 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +1 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +40 -95
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +48 -48
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +24 -24
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +238 -164
- package/src/llama.cpp/ggml/src/ggml-sycl/gla.cpp +105 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/gla.hpp +8 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +3 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +1 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +1 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +7 -5
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +1 -2
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +74 -4
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +314 -116
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -2
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +9 -3
- package/src/llama.cpp/ggml/src/ggml.c +117 -1327
- package/src/llama.cpp/ggml/src/gguf.cpp +1329 -0
- package/src/llama.cpp/include/llama-cpp.h +6 -1
- package/src/llama.cpp/include/llama.h +138 -75
- package/src/llama.cpp/src/CMakeLists.txt +13 -1
- package/src/llama.cpp/src/llama-adapter.cpp +347 -0
- package/src/llama.cpp/src/llama-adapter.h +74 -0
- package/src/llama.cpp/src/llama-arch.cpp +1487 -0
- package/src/llama.cpp/src/llama-arch.h +400 -0
- package/src/llama.cpp/src/llama-batch.cpp +368 -0
- package/src/llama.cpp/src/llama-batch.h +88 -0
- package/src/llama.cpp/src/llama-chat.cpp +578 -0
- package/src/llama.cpp/src/llama-chat.h +52 -0
- package/src/llama.cpp/src/llama-context.cpp +1775 -0
- package/src/llama.cpp/src/llama-context.h +128 -0
- package/src/llama.cpp/src/llama-cparams.cpp +1 -0
- package/src/llama.cpp/src/llama-cparams.h +37 -0
- package/src/llama.cpp/src/llama-grammar.cpp +5 -4
- package/src/llama.cpp/src/llama-grammar.h +3 -1
- package/src/llama.cpp/src/llama-hparams.cpp +71 -0
- package/src/llama.cpp/src/llama-hparams.h +139 -0
- package/src/llama.cpp/src/llama-impl.cpp +167 -0
- package/src/llama.cpp/src/llama-impl.h +16 -136
- package/src/llama.cpp/src/llama-kv-cache.cpp +718 -0
- package/src/llama.cpp/src/llama-kv-cache.h +218 -0
- package/src/llama.cpp/src/llama-mmap.cpp +589 -0
- package/src/llama.cpp/src/llama-mmap.h +67 -0
- package/src/llama.cpp/src/llama-model-loader.cpp +1124 -0
- package/src/llama.cpp/src/llama-model-loader.h +167 -0
- package/src/llama.cpp/src/llama-model.cpp +3953 -0
- package/src/llama.cpp/src/llama-model.h +370 -0
- package/src/llama.cpp/src/llama-quant.cpp +934 -0
- package/src/llama.cpp/src/llama-quant.h +1 -0
- package/src/llama.cpp/src/llama-sampling.cpp +147 -32
- package/src/llama.cpp/src/llama-sampling.h +3 -19
- package/src/llama.cpp/src/llama-vocab.cpp +1832 -575
- package/src/llama.cpp/src/llama-vocab.h +97 -142
- package/src/llama.cpp/src/llama.cpp +7160 -20314
- package/src/llama.cpp/src/unicode.cpp +8 -3
- package/src/llama.cpp/tests/CMakeLists.txt +2 -0
- package/src/llama.cpp/tests/test-autorelease.cpp +3 -3
- package/src/llama.cpp/tests/test-backend-ops.cpp +370 -59
- package/src/llama.cpp/tests/test-chat-template.cpp +162 -125
- package/src/llama.cpp/tests/test-gguf.cpp +222 -187
- package/src/llama.cpp/tests/test-model-load-cancel.cpp +1 -1
- package/src/llama.cpp/tests/test-sampling.cpp +0 -1
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +4 -4
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +9 -7
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +8 -6
package/src/llama.cpp/examples/server/server.cpp:

```diff
@@ -19,6 +19,7 @@
 #include "loading.html.hpp"
 
 #include <atomic>
+#include <chrono>
 #include <condition_variable>
 #include <cstddef>
 #include <cinttypes>
@@ -32,6 +33,8 @@
 
 using json = nlohmann::ordered_json;
 
+constexpr int HTTP_POLLING_SECONDS = 1;
+
 enum stop_type {
     STOP_TYPE_NONE,
     STOP_TYPE_EOS,
@@ -67,6 +70,13 @@ enum server_task_type {
     SERVER_TASK_TYPE_SET_LORA,
 };
 
+enum oaicompat_type {
+    OAICOMPAT_TYPE_NONE,
+    OAICOMPAT_TYPE_CHAT,
+    OAICOMPAT_TYPE_COMPLETION,
+    OAICOMPAT_TYPE_EMBEDDING,
+};
+
 // https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11
 enum error_type {
     ERROR_TYPE_INVALID_REQUEST,
@@ -91,7 +101,10 @@ struct slot_params {
     int64_t t_max_prompt_ms = -1; // TODO: implement
     int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit
 
+    std::vector<common_adapter_lora_info> lora;
+
     std::vector<std::string> antiprompt;
+    std::vector<std::string> response_fields;
     bool timings_per_token = false;
     bool post_sampling_probs = false;
     bool ignore_eos = false;
@@ -100,11 +113,10 @@ struct slot_params {
     struct common_params_speculative speculative;
 
     // OAI-compat fields
-    bool
-
-
-    std::string
-    std::string oaicompat_cmpl_id;
+    bool verbose = false;
+    oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE;
+    std::string oaicompat_model;
+    std::string oaicompat_cmpl_id;
 
     json to_json() const {
         std::vector<std::string> samplers;
@@ -113,6 +125,11 @@ struct slot_params {
             samplers.emplace_back(common_sampler_type_to_str(sampler));
         }
 
+        json lora = json::array();
+        for (size_t i = 0; i < this->lora.size(); ++i) {
+            lora.push_back({{"id", i}, {"scale", this->lora[i].scale}});
+        }
+
         return json {
             {"n_predict", n_predict}, // Server configured n_predict
             {"seed", sampling.seed},
@@ -153,6 +170,7 @@ struct slot_params {
             {"speculative.p_min", speculative.p_min},
             {"timings_per_token", timings_per_token},
             {"post_sampling_probs", post_sampling_probs},
+            {"lora", lora},
         };
     }
 };
@@ -182,13 +200,18 @@ struct server_task {
     // used by SERVER_TASK_TYPE_METRICS
     bool metrics_reset_bucket = false;
 
+    // used by SERVER_TASK_TYPE_SET_LORA
+    std::vector<common_adapter_lora_info> set_lora;
+
     server_task(server_task_type type) : type(type) {}
 
     static slot_params params_from_json_cmpl(
-            const llama_model * model,
             const llama_context * ctx,
             const common_params & params_base,
             const json & data) {
+        const llama_model * model = llama_get_model(ctx);
+        const llama_vocab * vocab = llama_model_get_vocab(model);
+
         slot_params params;
 
         // Sampling parameter defaults are loaded from the global server context (but individual requests can still override them)
@@ -209,6 +232,7 @@ struct server_task {
         params.n_discard = json_value(data, "n_discard", defaults.n_discard);
         //params.t_max_prompt_ms = json_value(data, "t_max_prompt_ms", defaults.t_max_prompt_ms); // TODO: implement
         params.t_max_predict_ms = json_value(data, "t_max_predict_ms", defaults.t_max_predict_ms);
+        params.response_fields = json_value(data, "response_fields", std::vector<std::string>());
 
         params.sampling.top_k = json_value(data, "top_k", defaults.sampling.top_k);
         params.sampling.top_p = json_value(data, "top_p", defaults.sampling.top_p);
@@ -243,6 +267,16 @@ struct server_task {
         params.speculative.n_min = std::max(params.speculative.n_min, 2);
         params.speculative.n_max = std::max(params.speculative.n_max, 0);
 
+        if (data.contains("lora")) {
+            if (data.at("lora").is_array()) {
+                params.lora = parse_lora_request(params_base.lora_adapters, data.at("lora"));
+            } else {
+                throw std::runtime_error("Error: 'lora' must be an array of objects with 'id' and 'scale' fields");
+            }
+        } else {
+            params.lora = params_base.lora_adapters;
+        }
+
         // TODO: add more sanity checks for the input parameters
 
         if (params.sampling.penalty_last_n < -1) {
@@ -300,7 +334,7 @@ struct server_task {
 
         const auto & logit_bias = data.find("logit_bias");
         if (logit_bias != data.end() && logit_bias->is_array()) {
-            const int n_vocab =
+            const int n_vocab = llama_vocab_n_tokens(vocab);
             for (const auto & el : *logit_bias) {
                 // TODO: we may want to throw errors here, in case "el" is incorrect
                 if (el.is_array() && el.size() == 2) {
@@ -319,7 +353,7 @@ struct server_task {
                         params.sampling.logit_bias.push_back({tok, bias});
                     }
                 } else if (el[0].is_string()) {
-                    auto toks = common_tokenize(
+                    auto toks = common_tokenize(vocab, el[0].get<std::string>(), false);
                     for (auto tok : toks) {
                         params.sampling.logit_bias.push_back({tok, bias});
                     }
@@ -522,15 +556,15 @@ struct server_task_result_cmpl_final : server_task_result {
 
     bool post_sampling_probs;
     std::vector<completion_token_output> probs_output;
+    std::vector<std::string> response_fields;
 
     slot_params generation_params;
 
     // OAI-compat fields
-    bool
-
-
-    std::string
-    std::string oaicompat_cmpl_id;
+    bool verbose = false;
+    oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE;
+    std::string oaicompat_model;
+    std::string oaicompat_cmpl_id;
 
     virtual int get_index() override {
         return index;
@@ -541,9 +575,16 @@ struct server_task_result_cmpl_final : server_task_result {
     }
 
     virtual json to_json() override {
-
-
-
+        switch (oaicompat) {
+            case OAICOMPAT_TYPE_NONE:
+                return to_json_non_oaicompat();
+            case OAICOMPAT_TYPE_COMPLETION:
+                return to_json_oaicompat();
+            case OAICOMPAT_TYPE_CHAT:
+                return stream ? to_json_oaicompat_chat_stream() : to_json_oaicompat_chat();
+            default:
+                GGML_ASSERT(false && "Invalid oaicompat_type");
+        }
     }
 
     json to_json_non_oaicompat() {
@@ -568,6 +609,50 @@ struct server_task_result_cmpl_final : server_task_result {
         if (!stream && !probs_output.empty()) {
             res["completion_probabilities"] = completion_token_output::probs_vector_to_json(probs_output, post_sampling_probs);
         }
+        return response_fields.empty() ? res : json_get_nested_values(response_fields, res);
+    }
+
+    json to_json_oaicompat() {
+        std::time_t t = std::time(0);
+        json logprobs = json(nullptr); // OAI default to null
+        if (!stream && probs_output.size() > 0) {
+            logprobs = json{
+                {"content", completion_token_output::probs_vector_to_json(probs_output, post_sampling_probs)},
+            };
+        }
+        json finish_reason = "length";
+        if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) {
+            finish_reason = "stop";
+        }
+        json res = json {
+            {"choices", json::array({
+                json{
+                    {"text", stream ? "" : content}, // in stream mode, content is already in last partial chunk
+                    {"index", index},
+                    {"logprobs", logprobs},
+                    {"finish_reason", finish_reason},
+                }
+            })},
+            {"created", t},
+            {"model", oaicompat_model},
+            {"system_fingerprint", build_info},
+            {"object", "text_completion"},
+            {"usage", json {
+                {"completion_tokens", n_decoded},
+                {"prompt_tokens", n_prompt_tokens},
+                {"total_tokens", n_decoded + n_prompt_tokens}
+            }},
+            {"id", oaicompat_cmpl_id}
+        };
+
+        // extra fields for debugging purposes
+        if (verbose) {
+            res["__verbose"] = to_json_non_oaicompat();
+        }
+        if (timings.prompt_n >= 0) {
+            res.push_back({"timings", timings.to_json()});
+        }
+
         return res;
     }
 
@@ -595,10 +680,11 @@ struct server_task_result_cmpl_final : server_task_result {
         std::time_t t = std::time(0);
 
         json res = json {
-            {"choices",
-            {"created",
-            {"model",
-            {"
+            {"choices", json::array({choice})},
+            {"created", t},
+            {"model", oaicompat_model},
+            {"system_fingerprint", build_info},
+            {"object", "chat.completion"},
             {"usage", json {
                 {"completion_tokens", n_decoded},
                 {"prompt_tokens", n_prompt_tokens},
@@ -632,11 +718,12 @@ struct server_task_result_cmpl_final : server_task_result {
         };
 
         json ret = json {
-            {"choices",
-            {"created",
-            {"id",
-            {"model",
-            {"
+            {"choices", json::array({choice})},
+            {"created", t},
+            {"id", oaicompat_cmpl_id},
+            {"model", oaicompat_model},
+            {"system_fingerprint", build_info},
+            {"object", "chat.completion.chunk"},
             {"usage", json {
                 {"completion_tokens", n_decoded},
                 {"prompt_tokens", n_prompt_tokens},
@@ -666,11 +753,10 @@ struct server_task_result_cmpl_partial : server_task_result {
     result_timings timings;
 
     // OAI-compat fields
-    bool
-
-
-    std::string
-    std::string oaicompat_cmpl_id;
+    bool verbose = false;
+    oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE;
+    std::string oaicompat_model;
+    std::string oaicompat_cmpl_id;
 
     virtual int get_index() override {
         return index;
@@ -681,7 +767,16 @@ struct server_task_result_cmpl_partial : server_task_result {
     }
 
     virtual json to_json() override {
-
+        switch (oaicompat) {
+            case OAICOMPAT_TYPE_NONE:
+                return to_json_non_oaicompat();
+            case OAICOMPAT_TYPE_COMPLETION:
+                return to_json_oaicompat();
+            case OAICOMPAT_TYPE_CHAT:
+                return to_json_oaicompat_chat();
+            default:
+                GGML_ASSERT(false && "Invalid oaicompat_type");
+        }
     }
 
     json to_json_non_oaicompat() {
@@ -706,6 +801,41 @@ struct server_task_result_cmpl_partial : server_task_result {
     }
 
     json to_json_oaicompat() {
+        std::time_t t = std::time(0);
+        json logprobs = json(nullptr); // OAI default to null
+        if (prob_output.probs.size() > 0) {
+            logprobs = json{
+                {"content", completion_token_output::probs_vector_to_json({prob_output}, post_sampling_probs)},
+            };
+        }
+        json res = json {
+            {"choices", json::array({
+                json{
+                    {"text", content},
+                    {"index", index},
+                    {"logprobs", logprobs},
+                    {"finish_reason", nullptr},
+                }
+            })},
+            {"created", t},
+            {"model", oaicompat_model},
+            {"system_fingerprint", build_info},
+            {"object", "text_completion"},
+            {"id", oaicompat_cmpl_id}
+        };
+
+        // extra fields for debugging purposes
+        if (verbose) {
+            res["__verbose"] = to_json_non_oaicompat();
+        }
+        if (timings.prompt_n >= 0) {
+            res.push_back({"timings", timings.to_json()});
+        }
+
+        return res;
+    }
+
+    json to_json_oaicompat_chat() {
         bool first = n_decoded == 0;
         std::time_t t = std::time(0);
         json choices;
@@ -761,11 +891,12 @@ struct server_task_result_cmpl_partial : server_task_result {
         }
 
         json ret = json {
-            {"choices",
-            {"created",
-            {"id",
-            {"model",
-            {"
+            {"choices", choices},
+            {"created", t},
+            {"id", oaicompat_cmpl_id},
+            {"model", oaicompat_model},
+            {"system_fingerprint", build_info},
+            {"object", "chat.completion.chunk"}
         };
 
         if (timings.prompt_n >= 0) {
@@ -783,14 +914,16 @@ struct server_task_result_embd : server_task_result {
     int32_t n_tokens;
 
     // OAI-compat fields
-
+    oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE;
 
     virtual int get_index() override {
         return index;
     }
 
     virtual json to_json() override {
-        return oaicompat
+        return oaicompat == OAICOMPAT_TYPE_EMBEDDING
+            ? to_json_oaicompat()
+            : to_json_non_oaicompat();
     }
 
     json to_json_non_oaicompat() {
@@ -1003,6 +1136,8 @@ struct server_slot {
 
     common_speculative * spec = nullptr;
 
+    std::vector<common_adapter_lora_info> lora;
+
     // the index relative to completion multi-task request
     size_t index = 0;
 
@@ -1084,6 +1219,11 @@ struct server_slot {
         return task_type == SERVER_TASK_TYPE_EMBEDDING || task_type == SERVER_TASK_TYPE_RERANK;
     }
 
+    bool can_batch_with(server_slot & other_slot) {
+        return is_non_causal() == other_slot.is_non_causal()
+            && are_lora_equal(lora, other_slot.lora);
+    }
+
     bool has_budget(const common_params & global_params) {
         if (params.n_predict == -1 && global_params.n_predict == -1) {
             return true; // limitless
@@ -1465,6 +1605,30 @@ struct server_response {
         // should never reach here
     }
 
+    // same as recv(), but have timeout in seconds
+    // if timeout is reached, nullptr is returned
+    server_task_result_ptr recv_with_timeout(const std::unordered_set<int> & id_tasks, int timeout) {
+        while (true) {
+            std::unique_lock<std::mutex> lock(mutex_results);
+            bool cr_res = condition_results.wait_for(lock, std::chrono::seconds(timeout), [&]{
+                return !queue_results.empty();
+            });
+            if (!cr_res) {
+                return nullptr;
+            }
+
+            for (int i = 0; i < (int) queue_results.size(); i++) {
+                if (id_tasks.find(queue_results[i]->id) != id_tasks.end()) {
+                    server_task_result_ptr res = std::move(queue_results[i]);
+                    queue_results.erase(queue_results.begin() + i);
+                    return res;
+                }
+            }
+        }
+
+        // should never reach here
+    }
+
     // single-task version of recv()
     server_task_result_ptr recv(int id_task) {
         std::unordered_set<int> id_tasks = {id_task};
@@ -1491,11 +1655,17 @@ struct server_response {
 struct server_context {
     common_params params_base;
 
+    // note: keep these alive - they determine the lifetime of the model, context, etc.
+    common_init_result llama_init;
+    common_init_result llama_init_dft;
+
     llama_model * model = nullptr;
     llama_context * ctx = nullptr;
-
+
+    const llama_vocab * vocab = nullptr;
 
     llama_model * model_dft = nullptr;
+
     llama_context_params cparams_dft;
 
     llama_batch batch = {};
@@ -1519,21 +1689,6 @@ struct server_context {
     float slot_prompt_similarity = 0.0f;
 
     ~server_context() {
-        if (ctx) {
-            llama_free(ctx);
-            ctx = nullptr;
-        }
-
-        if (model) {
-            llama_free_model(model);
-            model = nullptr;
-        }
-
-        if (model_dft) {
-            llama_free_model(model_dft);
-            model_dft = nullptr;
-        }
-
         // Clear any sampling context
         for (server_slot & slot : slots) {
             common_sampler_free(slot.smpl);
@@ -1556,21 +1711,22 @@ struct server_context {
 
         params_base = params;
 
-
+        llama_init = common_init_from_params(params_base);
 
-        model = llama_init.model;
-        ctx = llama_init.context;
-        loras = llama_init.lora_adapters;
+        model = llama_init.model.get();
+        ctx = llama_init.context.get();
 
         if (model == nullptr) {
             SRV_ERR("failed to load model, '%s'\n", params_base.model.c_str());
             return false;
         }
 
+        vocab = llama_model_get_vocab(model);
+
         n_ctx = llama_n_ctx(ctx);
 
-        add_bos_token =
-        has_eos_token =
+        add_bos_token = llama_vocab_get_add_bos(vocab);
+        has_eos_token = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
 
         if (!params_base.speculative.model.empty()) {
             SRV_INF("loading draft model '%s'\n", params_base.speculative.model.c_str());
@@ -1583,25 +1739,22 @@ struct server_context {
             params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
             params_dft.n_parallel = 1;
 
-
+            llama_init_dft = common_init_from_params(params_dft);
 
-            model_dft = llama_init_dft.model;
+            model_dft = llama_init_dft.model.get();
 
             if (model_dft == nullptr) {
                 SRV_ERR("failed to load draft model, '%s'\n", params_base.speculative.model.c_str());
                 return false;
             }
 
-            if (!common_speculative_are_compatible(ctx, llama_init_dft.context)) {
+            if (!common_speculative_are_compatible(ctx, llama_init_dft.context.get())) {
                 SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params_base.speculative.model.c_str(), params_base.model.c_str());
 
-                llama_free (llama_init_dft.context);
-                llama_free_model(llama_init_dft.model);
-
                 return false;
             }
 
-            const int n_ctx_dft = llama_n_ctx(llama_init_dft.context);
+            const int n_ctx_dft = llama_n_ctx(llama_init_dft.context.get());
 
             cparams_dft = common_context_params_to_llama(params_dft);
             cparams_dft.n_batch = n_ctx_dft;
@@ -1609,25 +1762,16 @@ struct server_context {
             // force F16 KV cache for the draft model for extra performance
             cparams_dft.type_k = GGML_TYPE_F16;
             cparams_dft.type_v = GGML_TYPE_F16;
-
-            // the context is not needed - we will create one for each slot
-            llama_free(llama_init_dft.context);
         }
 
         return true;
     }
 
-    bool
-
-
-    int32_t
-
-        llama_chat_message chat[] = {{"user", "test"}};
-        std::string tmpl = std::string(model_template.data(), model_template.size());
-        int32_t chat_res = llama_chat_apply_template(model, tmpl.c_str(), chat, 1, true, nullptr, 0);
-        return chat_res > 0;
-    }
-    return false;
+    bool validate_builtin_chat_template() const {
+        llama_chat_message chat[] = {{"user", "test"}};
+        const char * tmpl = llama_model_chat_template(model);
+        const int32_t chat_res = llama_chat_apply_template(tmpl, chat, 1, true, nullptr, 0);
+        return chat_res > 0;
     }
 
     void init() {
@@ -1646,7 +1790,7 @@ struct server_context {
             if (model_dft) {
                 slot.batch_spec = llama_batch_init(params_base.speculative.n_max + 1, 0, 1);
 
-                slot.ctx_dft =
+                slot.ctx_dft = llama_init_from_model(model_dft, cparams_dft);
                 if (slot.ctx_dft == nullptr) {
                     SRV_ERR("%s", "failed to create draft context\n");
                     return;
@@ -1766,6 +1910,12 @@ struct server_context {
         slot.params = std::move(task.params);
         slot.prompt_tokens = std::move(task.prompt_tokens);
 
+        if (!are_lora_equal(task.params.lora, slot.lora)) {
+            // if lora is changed, we cannot reuse cached tokens
+            slot.cache_tokens.clear();
+            slot.lora = task.params.lora;
+        }
+
         SLT_DBG(slot, "launching slot : %s\n", safe_json_to_str(slot.to_json()).c_str());
 
         if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) {
@@ -1775,7 +1925,7 @@ struct server_context {
         }
 
         if (slot.params.ignore_eos && has_eos_token) {
-            slot.params.sampling.logit_bias.push_back({
+            slot.params.sampling.logit_bias.push_back({llama_vocab_eos(vocab), -INFINITY});
         }
 
         {
@@ -1850,6 +2000,8 @@ struct server_context {
             result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
             slot.n_sent_text += result.text_to_send.size();
             // add the token to slot queue and cache
+        } else {
+            result.text_to_send = "";
         }
 
         slot.add_token(result);
@@ -1929,14 +2081,14 @@ struct server_context {
                 slot.n_decoded, slot.n_prompt_tokens, slot.n_past, slot.n_ctx);
         }
 
-        if (
+        if (llama_vocab_is_eog(vocab, result.tok)) {
            slot.stop = STOP_TYPE_EOS;
            slot.has_next_token = false;
 
            SLT_DBG(slot, "%s", "stopped by EOS\n");
        }
 
-        const auto n_ctx_train =
+        const auto n_ctx_train = llama_model_n_ctx_train(model);
 
        if (slot.params.n_predict < 1 && slot.n_predict < 1 && slot.n_prompt_tokens + slot.n_decoded >= n_ctx_train) {
            slot.truncated = true;
@@ -1956,7 +2108,7 @@ struct server_context {
 
     void populate_token_probs(const server_slot & slot, completion_token_output & result, bool post_sampling, bool special, int idx) {
         size_t n_probs = slot.params.sampling.n_probs;
-        size_t n_vocab =
+        size_t n_vocab = llama_vocab_n_tokens(vocab);
         if (post_sampling) {
             const auto * cur_p = common_sampler_get_candidates(slot.smpl);
             const size_t max_probs = cur_p->size;
@@ -2036,7 +2188,6 @@ struct server_context {
 
         res->verbose = slot.params.verbose;
         res->oaicompat = slot.params.oaicompat;
-        res->oaicompat_chat = slot.params.oaicompat_chat;
         res->oaicompat_model = slot.params.oaicompat_model;
         res->oaicompat_cmpl_id = slot.params.oaicompat_cmpl_id;
 
@@ -2063,6 +2214,7 @@ struct server_context {
         res->tokens = slot.generated_tokens;
         res->timings = slot.get_timings();
         res->prompt = common_detokenize(ctx, slot.prompt_tokens, true);
+        res->response_fields = slot.params.response_fields;
 
         res->truncated = slot.truncated;
         res->n_decoded = slot.n_decoded;
@@ -2076,7 +2228,6 @@ struct server_context {
         res->verbose = slot.params.verbose;
         res->stream = slot.params.stream;
         res->oaicompat = slot.params.oaicompat;
-        res->oaicompat_chat = slot.params.oaicompat_chat;
         res->oaicompat_model = slot.params.oaicompat_model;
         res->oaicompat_cmpl_id = slot.params.oaicompat_cmpl_id;
 
@@ -2108,7 +2259,7 @@ struct server_context {
         res->n_tokens = slot.n_prompt_tokens;
         res->oaicompat = slot.params.oaicompat;
 
-        const int n_embd =
+        const int n_embd = llama_model_n_embd(model);
 
         std::vector<float> embd_res(n_embd, 0.0f);
 
@@ -2198,10 +2349,21 @@ struct server_context {
     void receive_multi_results(
             const std::unordered_set<int> & id_tasks,
             const std::function<void(std::vector<server_task_result_ptr>&)> & result_handler,
-            const std::function<void(json)> & error_handler
+            const std::function<void(json)> & error_handler,
+            const std::function<bool()> & is_connection_closed) {
         std::vector<server_task_result_ptr> results(id_tasks.size());
-        for (
-            server_task_result_ptr result = queue_results.
+        for (int i = 0; i < (int)id_tasks.size(); i++) {
+            server_task_result_ptr result = queue_results.recv_with_timeout(id_tasks, HTTP_POLLING_SECONDS);
+
+            if (is_connection_closed()) {
+                cancel_tasks(id_tasks);
+                return;
+            }
+
+            if (result == nullptr) {
+                i--; // retry
+                continue;
+            }
 
             if (result->is_error()) {
                 error_handler(result->to_json());
@@ -2225,10 +2387,20 @@ struct server_context {
     void receive_cmpl_results_stream(
             const std::unordered_set<int> & id_tasks,
             const std::function<bool(server_task_result_ptr&)> & result_handler,
-            const std::function<void(json)> & error_handler
+            const std::function<void(json)> & error_handler,
+            const std::function<bool()> & is_connection_closed) {
         size_t n_finished = 0;
         while (true) {
-            server_task_result_ptr result = queue_results.
+            server_task_result_ptr result = queue_results.recv_with_timeout(id_tasks, HTTP_POLLING_SECONDS);
+
+            if (is_connection_closed()) {
+                cancel_tasks(id_tasks);
+                return;
+            }
+
+            if (result == nullptr) {
+                continue; // retry
+            }
 
             if (result->is_error()) {
                 error_handler(result->to_json());
@@ -2456,7 +2628,7 @@ struct server_context {
                 } break;
             case SERVER_TASK_TYPE_SET_LORA:
                 {
-
+                    params_base.lora_adapters = std::move(task.set_lora);
                     auto res = std::make_unique<server_task_result_apply_lora>();
                     res->id = task.id;
                     queue_results.send(std::move(res));
@@ -2533,12 +2705,22 @@ struct server_context {
         // start populating the batch for this iteration
         common_batch_clear(batch);
 
+        // track if given slot can be batched with slots already in the batch
+        server_slot * slot_batched = nullptr;
+
        // frist, add sampled tokens from any ongoing sequences
        for (auto & slot : slots) {
            if (slot.state != SLOT_STATE_GENERATING) {
                continue;
            }
 
+            // check if we can batch this slot with the previous one
+            if (!slot_batched) {
+                slot_batched = &slot;
+            } else if (!slot_batched->can_batch_with(slot)) {
+                continue;
+            }
+
            slot.i_batch = batch.n_tokens;
 
            common_batch_add(batch, slot.sampled, slot.n_past, { slot.id }, true);
@@ -2557,15 +2739,18 @@ struct server_context {
         int32_t n_batch = llama_n_batch(ctx);
         int32_t n_ubatch = llama_n_ubatch(ctx);
 
-        // track if this is an embedding or non-embedding batch
-        // if we've added sampled tokens above, we are in non-embedding mode
-        // -1: none, 0: non-embedding, 1: embedding
-        // TODO: make enum
-        int32_t batch_type = batch.n_tokens > 0 ? 0 : -1;
-
         // next, batch any pending prompts without exceeding n_batch
         if (params_base.cont_batching || batch.n_tokens == 0) {
             for (auto & slot : slots) {
+                // check if we can batch this slot with the previous one
+                if (slot.is_processing()) {
+                    if (!slot_batched) {
+                        slot_batched = &slot;
+                    } else if (!slot_batched->can_batch_with(slot)) {
+                        continue;
+                    }
+                }
+
                 // this slot still has a prompt to be processed
                 if (slot.state == SLOT_STATE_PROCESSING_PROMPT || slot.state == SLOT_STATE_STARTED) {
                     auto & prompt_tokens = slot.prompt_tokens;
@@ -2726,14 +2911,6 @@ struct server_context {
                         }
                     }
 
-                    // check that we are in the right batch_type, if not defer the slot
-                    int slot_type = slot.is_non_causal();
-                    if (batch_type == -1) {
-                        batch_type = slot_type;
-                    } else if (batch_type != slot_type) {
-                        continue;
-                    }
-
                    // keep only the common part
                    if (!llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1)) {
                        // could not partially delete (likely using a non-Transformer model)
@@ -2801,8 +2978,12 @@ struct server_context {
 
         SRV_DBG("decoding batch, n_tokens = %d\n", batch.n_tokens);
 
-
-
+        if (slot_batched) {
+            // make sure we're in the right embedding mode
+            llama_set_embeddings(ctx, slot_batched->is_non_causal());
+            // apply lora, only need to do it once per batch
+            common_set_adapter_lora(ctx, slot_batched->lora);
+        }
 
         // process the created batch of tokens
         for (int32_t i = 0; i < batch.n_tokens; i += n_batch) {
@@ -3003,12 +3184,12 @@ struct server_context {
 
     json model_meta() const {
         return json {
-            {"vocab_type", llama_vocab_type
-            {"n_vocab",
-            {"n_ctx_train",
-            {"n_embd",
-            {"n_params", llama_model_n_params(model)},
-            {"size", llama_model_size
+            {"vocab_type",  llama_vocab_type       (vocab)},
+            {"n_vocab",     llama_vocab_n_tokens   (vocab)},
+            {"n_ctx_train", llama_model_n_ctx_train(model)},
+            {"n_embd",      llama_model_n_embd     (model)},
+            {"n_params",    llama_model_n_params   (model)},
+            {"size",        llama_model_size       (model)},
         };
     }
 };
@@ -3475,7 +3656,8 @@ int main(int argc, char ** argv) {
            { "default_generation_settings", ctx_server.default_generation_settings_for_props },
            { "total_slots",                 ctx_server.params_base.n_parallel },
            { "model_path",                  ctx_server.params_base.model },
-            { "chat_template",
+            { "chat_template",               common_get_builtin_chat_template(ctx_server.model) },
+            { "build_info",                  build_info },
        };
 
        res_ok(res, data);
@@ -3496,12 +3678,12 @@ int main(int argc, char ** argv) {
 
    // handle completion-like requests (completion, chat, infill)
    // we can optionally provide a custom format for partial results and final results
-    const auto
+    const auto handle_completions_impl = [&ctx_server, &res_error, &res_ok](
            server_task_type type,
            json & data,
+            std::function<bool()> is_connection_closed,
            httplib::Response & res,
-
-            bool oaicompat_chat = false) {
+            oaicompat_type oaicompat) {
        GGML_ASSERT(type == SERVER_TASK_TYPE_COMPLETION || type == SERVER_TASK_TYPE_INFILL);
 
        if (ctx_server.params_base.embedding) {
@@ -3513,7 +3695,7 @@ int main(int argc, char ** argv) {
        std::vector<server_task> tasks;
 
        try {
-            std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.
+            std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, data.at("prompt"), true, true);
            tasks.reserve(tokenized_prompts.size());
            for (size_t i = 0; i < tokenized_prompts.size(); i++) {
                server_task task = server_task(type);
@@ -3522,13 +3704,15 @@ int main(int argc, char ** argv) {
                task.index = i;
 
                task.prompt_tokens = std::move(tokenized_prompts[i]);
-                task.params = server_task::params_from_json_cmpl(
+                task.params = server_task::params_from_json_cmpl(
+                    ctx_server.ctx,
+                    ctx_server.params_base,
+                    data);
                task.id_selected_slot = json_value(data, "id_slot", -1);
 
                // OAI-compat
-                task.params.oaicompat
-                task.params.
-                task.params.oaicompat_cmpl_id = completion_id;
+                task.params.oaicompat = oaicompat;
+                task.params.oaicompat_cmpl_id = completion_id;
                // oaicompat_model is already populated by params_from_json_cmpl
 
                tasks.push_back(task);
@@ -3559,7 +3743,7 @@ int main(int argc, char ** argv) {
                }
            }, [&](const json & error_data) {
                res_error(res, error_data);
-            });
+            }, is_connection_closed);
 
            ctx_server.queue_results.remove_waiting_task_ids(task_ids);
        } else {
@@ -3569,6 +3753,7 @@ int main(int argc, char ** argv) {
                if (res_json.is_array()) {
                    for (const auto & res : res_json) {
                        if (!server_sent_event(sink, "data", res)) {
+                            // sending failed (HTTP connection closed), cancel the generation
                            return false;
                        }
                    }
@@ -3578,8 +3763,11 @@ int main(int argc, char ** argv) {
                }
            }, [&](const json & error_data) {
                server_sent_event(sink, "error", error_data);
+            }, [&sink]() {
+                // note: do not use req.is_connection_closed here because req is already destroyed
+                return !sink.is_writable();
            });
-            if (oaicompat) {
+            if (oaicompat != OAICOMPAT_TYPE_NONE) {
                static const std::string ev_done = "data: [DONE]\n\n";
                sink.write(ev_done.data(), ev_done.size());
            }
@@ -3595,26 +3783,36 @@ int main(int argc, char ** argv) {
        }
    };
 
-    const auto handle_completions = [&
+    const auto handle_completions = [&handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
        json data = json::parse(req.body);
-        return
+        return handle_completions_impl(
+            SERVER_TASK_TYPE_COMPLETION,
+            data,
+            req.is_connection_closed,
+            res,
+            OAICOMPAT_TYPE_NONE);
+    };
+
+    const auto handle_completions_oai = [&handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
+        json data = oaicompat_completion_params_parse(json::parse(req.body));
+        return handle_completions_impl(
            SERVER_TASK_TYPE_COMPLETION,
            data,
+            req.is_connection_closed,
            res,
-
-            /* oaicompat_chat */ false);
+            OAICOMPAT_TYPE_COMPLETION);
    };
 
-    const auto handle_infill = [&ctx_server, &res_error, &
+    const auto handle_infill = [&ctx_server, &res_error, &handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
        // check model compatibility
        std::string err;
-        if (
+        if (llama_vocab_fim_pre(ctx_server.vocab) == LLAMA_TOKEN_NULL) {
            err += "prefix token is missing. ";
        }
-        if (
+        if (llama_vocab_fim_suf(ctx_server.vocab) == LLAMA_TOKEN_NULL) {
            err += "suffix token is missing. ";
        }
-        if (
+        if (llama_vocab_fim_mid(ctx_server.vocab) == LLAMA_TOKEN_NULL) {
            err += "middle token is missing. ";
        }
        if (!err.empty()) {
@@ -3660,10 +3858,10 @@ int main(int argc, char ** argv) {
        data["input_extra"] = input_extra; // default to empty array if it's not exist
 
        std::string prompt = json_value(data, "prompt", std::string());
-        std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.
+        std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, false, true);
        SRV_DBG("creating infill tasks, n_prompts = %d\n", (int) tokenized_prompts.size());
        data["prompt"] = format_infill(
-            ctx_server.
+            ctx_server.vocab,
            data.at("input_prefix"),
            data.at("input_suffix"),
            data.at("input_extra"),
@@ -3674,22 +3872,27 @@ int main(int argc, char ** argv) {
            tokenized_prompts[0]
        );
 
-        return
+        return handle_completions_impl(
+            SERVER_TASK_TYPE_INFILL,
+            data,
+            req.is_connection_closed,
+            res,
+            OAICOMPAT_TYPE_NONE); // infill is not OAI compatible
    };
 
-    const auto handle_chat_completions = [&ctx_server, &params, &res_error, &
+    const auto handle_chat_completions = [&ctx_server, &params, &res_error, &handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
        if (ctx_server.params_base.embedding) {
            res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
            return;
        }
 
-        json data =
-        return
+        json data = oaicompat_chat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template);
+        return handle_completions_impl(
            SERVER_TASK_TYPE_COMPLETION,
            data,
+            req.is_connection_closed,
            res,
-
-            /* oaicompat_chat */ true);
+            OAICOMPAT_TYPE_CHAT);
    };
 
    const auto handle_models = [&params, &ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) {
@@ -3697,7 +3900,7 @@ int main(int argc, char ** argv) {
            {"object", "list"},
            {"data", {
                {
-                    {"id", params.model_alias},
+                    {"id", params.model_alias.empty() ? params.model : params.model_alias},
                    {"object", "model"},
                    {"created", std::time(0)},
                    {"owned_by", "llamacpp"},
@@ -3717,7 +3920,7 @@ int main(int argc, char ** argv) {
        const bool add_special = json_value(body, "add_special", false);
        const bool with_pieces = json_value(body, "with_pieces", false);
 
-        llama_tokens tokens = tokenize_mixed(ctx_server.
+        llama_tokens tokens = tokenize_mixed(ctx_server.vocab, body.at("content"), add_special, true);
 
        if (with_pieces) {
            for (const auto& token : tokens) {
@@ -3762,10 +3965,10 @@ int main(int argc, char ** argv) {
        res_ok(res, data);
    };
 
-    const auto handle_embeddings_impl = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res,
+    const auto handle_embeddings_impl = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res, oaicompat_type oaicompat) {
        const json body = json::parse(req.body);
 
-        if (oaicompat && llama_pooling_type(ctx_server.ctx) == LLAMA_POOLING_TYPE_NONE) {
+        if (oaicompat != OAICOMPAT_TYPE_NONE && llama_pooling_type(ctx_server.ctx) == LLAMA_POOLING_TYPE_NONE) {
            res_error(res, format_error_response("Pooling type 'none' is not OAI compatible. Please use a different pooling type", ERROR_TYPE_INVALID_REQUEST));
            return;
        }
@@ -3775,14 +3978,25 @@ int main(int argc, char ** argv) {
        if (body.count("input") != 0) {
            prompt = body.at("input");
        } else if (body.contains("content")) {
-            oaicompat =
+            oaicompat = OAICOMPAT_TYPE_NONE; // "content" field is not OAI compatible
            prompt = body.at("content");
        } else {
            res_error(res, format_error_response("\"input\" or \"content\" must be provided", ERROR_TYPE_INVALID_REQUEST));
            return;
        }
 
-
+        bool use_base64 = false;
+        if (body.count("encoding_format") != 0) {
+            const std::string& format = body.at("encoding_format");
+            if (format == "base64") {
+                use_base64 = true;
+            } else if (format != "float") {
+                res_error(res, format_error_response("The format to return the embeddings in. Can be either float or base64", ERROR_TYPE_INVALID_REQUEST));
+                return;
+            }
+        }
+
+        std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true);
        for (const auto & tokens : tokenized_prompts) {
            // this check is necessary for models that do not add BOS token to the input
            if (tokens.empty()) {
@@ -3823,7 +4037,7 @@ int main(int argc, char ** argv) {
            }, [&](const json & error_data) {
                res_error(res, error_data);
                error = true;
-            });
+            }, req.is_connection_closed);
 
            ctx_server.queue_results.remove_waiting_task_ids(task_ids);
        }
@@ -3833,16 +4047,18 @@ int main(int argc, char ** argv) {
        }
 
        // write JSON response
-        json root = oaicompat
+        json root = oaicompat == OAICOMPAT_TYPE_EMBEDDING
+            ? format_embeddings_response_oaicompat(body, responses, use_base64)
+            : json(responses);
        res_ok(res, root);
    };
 
    const auto handle_embeddings = [&handle_embeddings_impl](const httplib::Request & req, httplib::Response & res) {
-        handle_embeddings_impl(req, res,
+        handle_embeddings_impl(req, res, OAICOMPAT_TYPE_NONE);
    };
 
    const auto handle_embeddings_oai = [&handle_embeddings_impl](const httplib::Request & req, httplib::Response & res) {
-        handle_embeddings_impl(req, res,
+        handle_embeddings_impl(req, res, OAICOMPAT_TYPE_EMBEDDING);
    };
 
    const auto handle_rerank = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) {
@@ -3880,20 +4096,20 @@ int main(int argc, char ** argv) {
            return;
        }
 
-        llama_tokens tokenized_query = tokenize_input_prompts(ctx_server.
+        llama_tokens tokenized_query = tokenize_input_prompts(ctx_server.vocab, query, /* add_special */ false, true)[0];
 
        // create and queue the task
        json responses = json::array();
        bool error = false;
        {
            std::vector<server_task> tasks;
-            std::vector<llama_tokens> tokenized_docs = tokenize_input_prompts(ctx_server.
+            std::vector<llama_tokens> tokenized_docs = tokenize_input_prompts(ctx_server.vocab, documents, /* add_special */ false, true);
            tasks.reserve(tokenized_docs.size());
            for (size_t i = 0; i < tokenized_docs.size(); i++) {
                server_task task = server_task(SERVER_TASK_TYPE_RERANK);
                task.id = ctx_server.queue_tasks.get_new_id();
                task.index = i;
-                task.prompt_tokens = format_rerank(ctx_server.
+                task.prompt_tokens = format_rerank(ctx_server.vocab, tokenized_query, tokenized_docs[i]);
                tasks.push_back(task);
            }
 
@@ -3911,7 +4127,7 @@ int main(int argc, char ** argv) {
            }, [&](const json & error_data) {
                res_error(res, error_data);
                error = true;
-            });
+            }, req.is_connection_closed);
        }
 
        if (error) {
@@ -3925,8 +4141,9 @@ int main(int argc, char ** argv) {
 
    const auto handle_lora_adapters_list = [&](const httplib::Request &, httplib::Response & res) {
        json result = json::array();
-
-
+        const auto & loras = ctx_server.params_base.lora_adapters;
+        for (size_t i = 0; i < loras.size(); ++i) {
+            auto & lora = loras[i];
            result.push_back({
                {"id", i},
                {"path", lora.path},
@@ -3938,27 +4155,14 @@ int main(int argc, char ** argv) {
    };
 
    const auto handle_lora_adapters_apply = [&](const httplib::Request & req, httplib::Response & res) {
-        const
-
-
-
-        for (auto & lora : ctx_server.loras) {
-            lora.scale = 0.0f;
-        }
-
-        // set value
-        for (auto entry : body) {
-            int id = entry.at("id");
-            float scale = entry.at("scale");
-            if (0 <= id && id < max_idx) {
-                ctx_server.loras[id].scale = scale;
-            } else {
-                throw std::runtime_error("invalid adapter id");
-            }
+        const json body = json::parse(req.body);
+        if (!body.is_array()) {
+            res_error(res, format_error_response("Request body must be an array", ERROR_TYPE_INVALID_REQUEST));
+            return;
        }
-
        server_task task(SERVER_TASK_TYPE_SET_LORA);
        task.id = ctx_server.queue_tasks.get_new_id();
+        task.set_lora = parse_lora_request(ctx_server.params_base.lora_adapters, body);
        ctx_server.queue_results.add_waiting_task_id(task.id);
        ctx_server.queue_tasks.post(task);
 
@@ -4012,7 +4216,7 @@ int main(int argc, char ** argv) {
    svr->Get ("/v1/models", handle_models); // public endpoint (no API key check)
    svr->Post("/completion", handle_completions); // legacy
    svr->Post("/completions", handle_completions);
-    svr->Post("/v1/completions",
+    svr->Post("/v1/completions", handle_completions_oai);
    svr->Post("/chat/completions", handle_chat_completions);
    svr->Post("/v1/chat/completions", handle_chat_completions);
    svr->Post("/infill", handle_infill);
@@ -4092,14 +4296,16 @@ int main(int argc, char ** argv) {
 
    // if a custom chat template is not supplied, we will use the one that comes with the model (if any)
    if (params.chat_template.empty()) {
-        if (!ctx_server.
+        if (!ctx_server.validate_builtin_chat_template()) {
            LOG_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__);
            params.chat_template = "chatml";
        }
    }
 
    // print sample chat example to make it clear which template is used
-    LOG_INF("%s: chat template,
+    LOG_INF("%s: chat template, chat_template: %s, example_format: '%s'\n", __func__,
+        params.chat_template.empty() ? "(built-in)" : params.chat_template.c_str(),
+        common_chat_format_example(ctx_server.model, params.chat_template).c_str());
 
    ctx_server.queue_tasks.on_new_task(std::bind(
        &server_context::process_single_task, &ctx_server, std::placeholders::_1));
```
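
Two of the request-level options introduced in the server.cpp hunks above are a per-request `lora` array of `{id, scale}` objects (adapters themselves are still loaded at server start) and a `response_fields` list that trims the returned JSON to the named fields. A minimal client-side sketch in TypeScript; the host, port, prompt, adapter id, and chosen field names are illustrative assumptions, not values taken from the diff:

```typescript
// Sketch only: exercises the per-request "lora" and "response_fields" options
// added to the completion endpoint in the diff above. Requires Node 18+ (global fetch).
async function completeWithLora(): Promise<void> {
  const res = await fetch("http://127.0.0.1:8080/completion", {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      prompt: "Hello",
      n_predict: 32,
      // scale adapter 0 for this request only (assumes one adapter was loaded at startup)
      lora: [{ id: 0, scale: 0.5 }],
      // ask the server to return only these fields of the final result
      response_fields: ["content", "timings"],
    }),
  });
  console.log(await res.json());
}

completeWithLora().catch(console.error);
```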
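
The embeddings handler also gains an OpenAI-style `encoding_format` option that accepts `float` (the default) or `base64` and rejects anything else. A hedged TypeScript sketch of requesting base64 output; the `/v1/embeddings` route, the OpenAI-style response shape, and the float32 layout of the decoded payload are assumptions inferred from the `handle_embeddings_oai` handler and the `format_embeddings_response_oaicompat(..., use_base64)` call in the diff, not confirmed by it:

```typescript
// Sketch only: requests base64-encoded embeddings via the new "encoding_format" option.
// Assumes a Node.js runtime (Buffer, global fetch) and an OpenAI-shaped response body.
async function embedBase64(text: string): Promise<Float32Array> {
  const res = await fetch("http://127.0.0.1:8080/v1/embeddings", {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({ input: text, encoding_format: "base64" }),
  });
  const body = await res.json();
  const b64: string = body.data[0].embedding;   // base64 string instead of number[]
  const bytes = Buffer.from(b64, "base64");     // decode to raw bytes
  // reinterpret the bytes as little-endian 32-bit floats (assumed layout)
  return new Float32Array(bytes.buffer, bytes.byteOffset, bytes.byteLength / 4);
}

embedBase64("hello world").then((v) => console.log(v.length)).catch(console.error);
```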