@fugood/llama.node 0.3.15 → 0.3.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +3 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +5 -0
- package/package.json +1 -1
- package/src/LlamaCompletionWorker.cpp +8 -0
- package/src/LlamaCompletionWorker.h +1 -0
- package/src/LlamaContext.cpp +3 -2
- package/src/llama.cpp/.github/workflows/build-linux-cross.yml +124 -0
- package/src/llama.cpp/.github/workflows/build.yml +70 -27
- package/src/llama.cpp/.github/workflows/docker.yml +6 -6
- package/src/llama.cpp/.github/workflows/server.yml +7 -11
- package/src/llama.cpp/CMakeLists.txt +23 -1
- package/src/llama.cpp/common/CMakeLists.txt +6 -3
- package/src/llama.cpp/common/arg.cpp +809 -105
- package/src/llama.cpp/common/arg.h +9 -0
- package/src/llama.cpp/common/chat.cpp +1 -1
- package/src/llama.cpp/common/common.cpp +31 -521
- package/src/llama.cpp/common/common.h +17 -36
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
- package/src/llama.cpp/common/llguidance.cpp +30 -47
- package/src/llama.cpp/common/minja/chat-template.hpp +15 -7
- package/src/llama.cpp/common/minja/minja.hpp +119 -93
- package/src/llama.cpp/common/sampling.cpp +3 -0
- package/src/llama.cpp/docs/build.md +122 -7
- package/src/llama.cpp/examples/CMakeLists.txt +0 -9
- package/src/llama.cpp/examples/batched/batched.cpp +1 -1
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +7 -1
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +15 -16
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +210 -8
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llava/CMakeLists.txt +39 -24
- package/src/llama.cpp/examples/llava/clip-impl.h +345 -0
- package/src/llama.cpp/examples/llava/clip.cpp +2152 -1803
- package/src/llama.cpp/examples/llava/clip.h +39 -22
- package/src/llama.cpp/examples/llava/deprecation-warning.cpp +22 -0
- package/src/llama.cpp/examples/llava/llava.cpp +64 -52
- package/src/llama.cpp/examples/llava/mtmd-cli.cpp +344 -0
- package/src/llama.cpp/examples/llava/mtmd.cpp +708 -0
- package/src/llama.cpp/examples/llava/mtmd.h +168 -0
- package/src/llama.cpp/examples/llava/{qwen2vl-cli.cpp → qwen2vl-test.cpp} +83 -31
- package/src/llama.cpp/examples/main/main.cpp +16 -5
- package/src/llama.cpp/examples/parallel/parallel.cpp +3 -1
- package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +17 -3
- package/src/llama.cpp/examples/quantize/quantize.cpp +115 -2
- package/src/llama.cpp/examples/rpc/CMakeLists.txt +4 -2
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +163 -8
- package/src/llama.cpp/examples/run/CMakeLists.txt +12 -1
- package/src/llama.cpp/examples/run/run.cpp +14 -28
- package/src/llama.cpp/examples/server/httplib.h +313 -247
- package/src/llama.cpp/examples/server/server.cpp +243 -139
- package/src/llama.cpp/examples/server/utils.hpp +51 -2
- package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
- package/src/llama.cpp/examples/sycl/build.sh +2 -2
- package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
- package/src/llama.cpp/examples/tts/tts.cpp +14 -9
- package/src/llama.cpp/ggml/CMakeLists.txt +8 -2
- package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
- package/src/llama.cpp/ggml/include/ggml.h +66 -99
- package/src/llama.cpp/ggml/src/CMakeLists.txt +15 -8
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
- package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +48 -22
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2413 -228
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +754 -404
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1004 -13516
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +2 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +0 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +3 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +533 -88
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8809 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +258 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
- package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +70 -3
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -260
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +293 -40
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +127 -33
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +350 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +29 -293
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +967 -438
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +12 -43
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +210 -286
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +23 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +692 -126
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +12 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +21 -10
- package/src/llama.cpp/ggml/src/ggml.c +141 -245
- package/src/llama.cpp/ggml/src/gguf.cpp +1 -0
- package/src/llama.cpp/include/llama.h +30 -11
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +2 -0
- package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
- package/src/llama.cpp/src/CMakeLists.txt +3 -2
- package/src/llama.cpp/src/llama-adapter.cpp +37 -1
- package/src/llama.cpp/src/llama-arch.cpp +161 -17
- package/src/llama.cpp/src/llama-arch.h +16 -0
- package/src/llama.cpp/src/llama-chat.cpp +82 -17
- package/src/llama.cpp/src/llama-chat.h +6 -2
- package/src/llama.cpp/src/llama-context.cpp +108 -92
- package/src/llama.cpp/src/llama-context.h +1 -2
- package/src/llama.cpp/src/llama-graph.cpp +189 -119
- package/src/llama.cpp/src/llama-graph.h +26 -6
- package/src/llama.cpp/src/llama-hparams.h +13 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +70 -123
- package/src/llama.cpp/src/llama-kv-cache.h +41 -115
- package/src/llama.cpp/src/llama-memory.h +1 -1
- package/src/llama.cpp/src/llama-mmap.cpp +1 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +10 -5
- package/src/llama.cpp/src/llama-model-loader.h +5 -3
- package/src/llama.cpp/src/llama-model.cpp +1544 -291
- package/src/llama.cpp/src/llama-model.h +13 -1
- package/src/llama.cpp/src/llama-quant.cpp +29 -8
- package/src/llama.cpp/src/llama-sampling.cpp +7 -1
- package/src/llama.cpp/src/llama-vocab.cpp +44 -6
- package/src/llama.cpp/src/llama.cpp +1 -1
- package/src/llama.cpp/tests/CMakeLists.txt +43 -30
- package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
- package/src/llama.cpp/tests/test-backend-ops.cpp +139 -57
- package/src/llama.cpp/tests/test-chat-template.cpp +34 -13
- package/src/llama.cpp/tests/test-chat.cpp +12 -2
- package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
- package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
- package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
- package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
- package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
- package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
- package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
|
@@ -133,7 +133,8 @@ struct slot_params {
|
|
|
133
133
|
|
|
134
134
|
auto grammar_triggers = json::array();
|
|
135
135
|
for (const auto & trigger : sampling.grammar_triggers) {
|
|
136
|
-
|
|
136
|
+
server_grammar_trigger ct(std::move(trigger));
|
|
137
|
+
grammar_triggers.push_back(ct.to_json());
|
|
137
138
|
}
|
|
138
139
|
|
|
139
140
|
return json {
|
|
@@ -372,9 +373,9 @@ struct server_task {
|
|
|
372
373
|
const auto grammar_triggers = data.find("grammar_triggers");
|
|
373
374
|
if (grammar_triggers != data.end()) {
|
|
374
375
|
for (const auto & t : *grammar_triggers) {
|
|
375
|
-
|
|
376
|
-
if (ct.type == COMMON_GRAMMAR_TRIGGER_TYPE_WORD) {
|
|
377
|
-
const auto & word = ct.value;
|
|
376
|
+
server_grammar_trigger ct(t);
|
|
377
|
+
if (ct.value.type == COMMON_GRAMMAR_TRIGGER_TYPE_WORD) {
|
|
378
|
+
const auto & word = ct.value.value;
|
|
378
379
|
auto ids = common_tokenize(vocab, word, /* add_special= */ false, /* parse_special= */ true);
|
|
379
380
|
if (ids.size() == 1) {
|
|
380
381
|
auto token = ids[0];
|
|
@@ -392,7 +393,7 @@ struct server_task {
|
|
|
392
393
|
params.sampling.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, word});
|
|
393
394
|
}
|
|
394
395
|
} else {
|
|
395
|
-
params.sampling.grammar_triggers.push_back(ct);
|
|
396
|
+
params.sampling.grammar_triggers.push_back(std::move(ct.value));
|
|
396
397
|
}
|
|
397
398
|
}
|
|
398
399
|
}
|
|
@@ -489,8 +490,12 @@ struct result_timings {
|
|
|
489
490
|
double predicted_per_token_ms;
|
|
490
491
|
double predicted_per_second;
|
|
491
492
|
|
|
493
|
+
// Optional speculative metrics - only included when > 0
|
|
494
|
+
int32_t draft_n = 0;
|
|
495
|
+
int32_t draft_n_accepted = 0;
|
|
496
|
+
|
|
492
497
|
json to_json() const {
|
|
493
|
-
|
|
498
|
+
json base = {
|
|
494
499
|
{"prompt_n", prompt_n},
|
|
495
500
|
{"prompt_ms", prompt_ms},
|
|
496
501
|
{"prompt_per_token_ms", prompt_per_token_ms},
|
|
@@ -501,6 +506,13 @@ struct result_timings {
|
|
|
501
506
|
{"predicted_per_token_ms", predicted_per_token_ms},
|
|
502
507
|
{"predicted_per_second", predicted_per_second},
|
|
503
508
|
};
|
|
509
|
+
|
|
510
|
+
if (draft_n > 0) {
|
|
511
|
+
base["draft_n"] = draft_n;
|
|
512
|
+
base["draft_n_accepted"] = draft_n_accepted;
|
|
513
|
+
}
|
|
514
|
+
|
|
515
|
+
return base;
|
|
504
516
|
}
|
|
505
517
|
};
|
|
506
518
|
|
|
@@ -830,6 +842,11 @@ struct server_task_result_cmpl_final : server_task_result {
|
|
|
830
842
|
ret.push_back({"timings", timings.to_json()});
|
|
831
843
|
}
|
|
832
844
|
|
|
845
|
+
// extra fields for debugging purposes
|
|
846
|
+
if (verbose) {
|
|
847
|
+
ret["__verbose"] = to_json_non_oaicompat();
|
|
848
|
+
}
|
|
849
|
+
|
|
833
850
|
return ret;
|
|
834
851
|
}
|
|
835
852
|
};
|
|
@@ -1294,6 +1311,10 @@ struct server_slot {
|
|
|
1294
1311
|
|
|
1295
1312
|
std::function<void(int)> callback_on_release;
|
|
1296
1313
|
|
|
1314
|
+
// Speculative decoding stats
|
|
1315
|
+
int32_t n_draft_total = 0; // Total draft tokens generated
|
|
1316
|
+
int32_t n_draft_accepted = 0; // Draft tokens actually accepted
|
|
1317
|
+
|
|
1297
1318
|
void reset() {
|
|
1298
1319
|
SLT_DBG(*this, "%s", "\n");
|
|
1299
1320
|
|
|
@@ -1310,6 +1331,10 @@ struct server_slot {
|
|
|
1310
1331
|
|
|
1311
1332
|
generated_tokens.clear();
|
|
1312
1333
|
generated_token_probs.clear();
|
|
1334
|
+
|
|
1335
|
+
// clear speculative decoding stats
|
|
1336
|
+
n_draft_total = 0;
|
|
1337
|
+
n_draft_accepted = 0;
|
|
1313
1338
|
}
|
|
1314
1339
|
|
|
1315
1340
|
bool is_non_causal() const {
|
|
@@ -1376,6 +1401,12 @@ struct server_slot {
|
|
|
1376
1401
|
timings.predicted_per_token_ms = t_token_generation / n_decoded;
|
|
1377
1402
|
timings.predicted_per_second = 1e3 / t_token_generation * n_decoded;
|
|
1378
1403
|
|
|
1404
|
+
// Add speculative metrics
|
|
1405
|
+
if (n_draft_total > 0) {
|
|
1406
|
+
timings.draft_n = n_draft_total;
|
|
1407
|
+
timings.draft_n_accepted = n_draft_accepted;
|
|
1408
|
+
}
|
|
1409
|
+
|
|
1379
1410
|
return timings;
|
|
1380
1411
|
}
|
|
1381
1412
|
|
|
@@ -1423,6 +1454,15 @@ struct server_slot {
|
|
|
1423
1454
|
t_prompt_processing, n_prompt_tokens_processed, t_prompt, n_prompt_second,
|
|
1424
1455
|
t_token_generation, n_decoded, t_gen, n_gen_second,
|
|
1425
1456
|
t_prompt_processing + t_token_generation, n_prompt_tokens_processed + n_decoded);
|
|
1457
|
+
|
|
1458
|
+
if (n_draft_total > 0) {
|
|
1459
|
+
const float draft_ratio = (float) n_draft_accepted / n_draft_total;
|
|
1460
|
+
SLT_INF(*this,
|
|
1461
|
+
"\n"
|
|
1462
|
+
"draft acceptance rate = %0.5f (%5d accepted / %5d generated)\n",
|
|
1463
|
+
draft_ratio, n_draft_accepted, n_draft_total
|
|
1464
|
+
);
|
|
1465
|
+
}
|
|
1426
1466
|
}
|
|
1427
1467
|
|
|
1428
1468
|
json to_json() const {
|
|
@@ -1512,29 +1552,30 @@ struct server_queue {
|
|
|
1512
1552
|
std::condition_variable condition_tasks;
|
|
1513
1553
|
|
|
1514
1554
|
// callback functions
|
|
1515
|
-
std::function<void(server_task)> callback_new_task;
|
|
1516
|
-
std::function<void(void)>
|
|
1555
|
+
std::function<void(server_task &&)> callback_new_task;
|
|
1556
|
+
std::function<void(void)> callback_update_slots;
|
|
1517
1557
|
|
|
1518
1558
|
// Add a new task to the end of the queue
|
|
1519
|
-
int post(server_task task, bool front = false) {
|
|
1559
|
+
int post(server_task && task, bool front = false) {
|
|
1520
1560
|
std::unique_lock<std::mutex> lock(mutex_tasks);
|
|
1521
1561
|
GGML_ASSERT(task.id != -1);
|
|
1522
1562
|
// if this is cancel task make sure to clean up pending tasks
|
|
1523
1563
|
if (task.type == SERVER_TASK_TYPE_CANCEL) {
|
|
1524
1564
|
cleanup_pending_task(task.id_target);
|
|
1525
1565
|
}
|
|
1526
|
-
|
|
1566
|
+
const int task_id = task.id;
|
|
1567
|
+
QUE_DBG("new task, id = %d, front = %d\n", task_id, front);
|
|
1527
1568
|
if (front) {
|
|
1528
1569
|
queue_tasks.push_front(std::move(task));
|
|
1529
1570
|
} else {
|
|
1530
1571
|
queue_tasks.push_back(std::move(task));
|
|
1531
1572
|
}
|
|
1532
1573
|
condition_tasks.notify_one();
|
|
1533
|
-
return
|
|
1574
|
+
return task_id;
|
|
1534
1575
|
}
|
|
1535
1576
|
|
|
1536
1577
|
// multi-task version of post()
|
|
1537
|
-
int post(std::vector<server_task>
|
|
1578
|
+
int post(std::vector<server_task> && tasks, bool front = false) {
|
|
1538
1579
|
std::unique_lock<std::mutex> lock(mutex_tasks);
|
|
1539
1580
|
for (auto & task : tasks) {
|
|
1540
1581
|
if (task.id == -1) {
|
|
@@ -1556,7 +1597,7 @@ struct server_queue {
|
|
|
1556
1597
|
}
|
|
1557
1598
|
|
|
1558
1599
|
// Add a new task, but defer until one slot is available
|
|
1559
|
-
void defer(server_task task) {
|
|
1600
|
+
void defer(server_task && task) {
|
|
1560
1601
|
std::unique_lock<std::mutex> lock(mutex_tasks);
|
|
1561
1602
|
QUE_DBG("defer task, id = %d\n", task.id);
|
|
1562
1603
|
queue_tasks_deferred.push_back(std::move(task));
|
|
@@ -1571,7 +1612,7 @@ struct server_queue {
|
|
|
1571
1612
|
}
|
|
1572
1613
|
|
|
1573
1614
|
// Register function to process a new task
|
|
1574
|
-
void on_new_task(std::function<void(server_task)> callback) {
|
|
1615
|
+
void on_new_task(std::function<void(server_task &&)> callback) {
|
|
1575
1616
|
callback_new_task = std::move(callback);
|
|
1576
1617
|
}
|
|
1577
1618
|
|
|
@@ -1620,7 +1661,7 @@ struct server_queue {
|
|
|
1620
1661
|
lock.unlock();
|
|
1621
1662
|
break;
|
|
1622
1663
|
}
|
|
1623
|
-
server_task task = queue_tasks.front();
|
|
1664
|
+
server_task task = std::move(queue_tasks.front());
|
|
1624
1665
|
queue_tasks.pop_front();
|
|
1625
1666
|
lock.unlock();
|
|
1626
1667
|
|
|
@@ -1665,6 +1706,8 @@ private:
|
|
|
1665
1706
|
};
|
|
1666
1707
|
|
|
1667
1708
|
struct server_response {
|
|
1709
|
+
bool running = true;
|
|
1710
|
+
|
|
1668
1711
|
// for keeping track of all tasks waiting for the result
|
|
1669
1712
|
std::unordered_set<int> waiting_task_ids;
|
|
1670
1713
|
|
|
@@ -1719,6 +1762,10 @@ struct server_response {
|
|
|
1719
1762
|
while (true) {
|
|
1720
1763
|
std::unique_lock<std::mutex> lock(mutex_results);
|
|
1721
1764
|
condition_results.wait(lock, [&]{
|
|
1765
|
+
if (!running) {
|
|
1766
|
+
SRV_DBG("%s : queue result stop\n", __func__);
|
|
1767
|
+
std::terminate(); // we cannot return here since the caller is HTTP code
|
|
1768
|
+
}
|
|
1722
1769
|
return !queue_results.empty();
|
|
1723
1770
|
});
|
|
1724
1771
|
|
|
@@ -1749,6 +1796,10 @@ struct server_response {
|
|
|
1749
1796
|
}
|
|
1750
1797
|
|
|
1751
1798
|
std::cv_status cr_res = condition_results.wait_for(lock, std::chrono::seconds(timeout));
|
|
1799
|
+
if (!running) {
|
|
1800
|
+
SRV_DBG("%s : queue result stop\n", __func__);
|
|
1801
|
+
std::terminate(); // we cannot return here since the caller is HTTP code
|
|
1802
|
+
}
|
|
1752
1803
|
if (cr_res == std::cv_status::timeout) {
|
|
1753
1804
|
return nullptr;
|
|
1754
1805
|
}
|
|
@@ -1778,6 +1829,12 @@ struct server_response {
|
|
|
1778
1829
|
}
|
|
1779
1830
|
}
|
|
1780
1831
|
}
|
|
1832
|
+
|
|
1833
|
+
// terminate the waiting loop
|
|
1834
|
+
void terminate() {
|
|
1835
|
+
running = false;
|
|
1836
|
+
condition_results.notify_all();
|
|
1837
|
+
}
|
|
1781
1838
|
};
|
|
1782
1839
|
|
|
1783
1840
|
struct server_context {
|
|
@@ -1837,7 +1894,7 @@ struct server_context {
|
|
|
1837
1894
|
}
|
|
1838
1895
|
|
|
1839
1896
|
bool load_model(const common_params & params) {
|
|
1840
|
-
SRV_INF("loading model '%s'\n", params.model.c_str());
|
|
1897
|
+
SRV_INF("loading model '%s'\n", params.model.path.c_str());
|
|
1841
1898
|
|
|
1842
1899
|
params_base = params;
|
|
1843
1900
|
|
|
@@ -1847,7 +1904,7 @@ struct server_context {
|
|
|
1847
1904
|
ctx = llama_init.context.get();
|
|
1848
1905
|
|
|
1849
1906
|
if (model == nullptr) {
|
|
1850
|
-
SRV_ERR("failed to load model, '%s'\n", params_base.model.c_str());
|
|
1907
|
+
SRV_ERR("failed to load model, '%s'\n", params_base.model.path.c_str());
|
|
1851
1908
|
return false;
|
|
1852
1909
|
}
|
|
1853
1910
|
|
|
@@ -1858,16 +1915,13 @@ struct server_context {
|
|
|
1858
1915
|
add_bos_token = llama_vocab_get_add_bos(vocab);
|
|
1859
1916
|
has_eos_token = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
|
|
1860
1917
|
|
|
1861
|
-
if (!params_base.speculative.model.empty() || !params_base.speculative.hf_repo.empty()) {
|
|
1862
|
-
SRV_INF("loading draft model '%s'\n", params_base.speculative.model.c_str());
|
|
1918
|
+
if (!params_base.speculative.model.path.empty() || !params_base.speculative.model.hf_repo.empty()) {
|
|
1919
|
+
SRV_INF("loading draft model '%s'\n", params_base.speculative.model.path.c_str());
|
|
1863
1920
|
|
|
1864
1921
|
auto params_dft = params_base;
|
|
1865
1922
|
|
|
1866
1923
|
params_dft.devices = params_base.speculative.devices;
|
|
1867
|
-
params_dft.hf_file = params_base.speculative.hf_file;
|
|
1868
|
-
params_dft.hf_repo = params_base.speculative.hf_repo;
|
|
1869
1924
|
params_dft.model = params_base.speculative.model;
|
|
1870
|
-
params_dft.model_url = params_base.speculative.model_url;
|
|
1871
1925
|
params_dft.n_ctx = params_base.speculative.n_ctx == 0 ? params_base.n_ctx / params_base.n_parallel : params_base.speculative.n_ctx;
|
|
1872
1926
|
params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
|
|
1873
1927
|
params_dft.n_parallel = 1;
|
|
@@ -1881,12 +1935,12 @@ struct server_context {
|
|
|
1881
1935
|
model_dft = llama_init_dft.model.get();
|
|
1882
1936
|
|
|
1883
1937
|
if (model_dft == nullptr) {
|
|
1884
|
-
SRV_ERR("failed to load draft model, '%s'\n", params_base.speculative.model.c_str());
|
|
1938
|
+
SRV_ERR("failed to load draft model, '%s'\n", params_base.speculative.model.path.c_str());
|
|
1885
1939
|
return false;
|
|
1886
1940
|
}
|
|
1887
1941
|
|
|
1888
1942
|
if (!common_speculative_are_compatible(ctx, llama_init_dft.context.get())) {
|
|
1889
|
-
SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params_base.speculative.model.c_str(), params_base.model.c_str());
|
|
1943
|
+
SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params_base.speculative.model.path.c_str(), params_base.model.path.c_str());
|
|
1890
1944
|
|
|
1891
1945
|
return false;
|
|
1892
1946
|
}
|
|
@@ -1951,7 +2005,7 @@ struct server_context {
|
|
|
1951
2005
|
|
|
1952
2006
|
slot.reset();
|
|
1953
2007
|
|
|
1954
|
-
slots.push_back(slot);
|
|
2008
|
+
slots.push_back(std::move(slot));
|
|
1955
2009
|
}
|
|
1956
2010
|
|
|
1957
2011
|
default_generation_settings_for_props = slots[0].to_json();
|
|
@@ -2052,7 +2106,7 @@ struct server_context {
|
|
|
2052
2106
|
return true;
|
|
2053
2107
|
}
|
|
2054
2108
|
|
|
2055
|
-
bool launch_slot_with_task(server_slot & slot,
|
|
2109
|
+
bool launch_slot_with_task(server_slot & slot, server_task && task) {
|
|
2056
2110
|
slot.reset();
|
|
2057
2111
|
slot.id_task = task.id;
|
|
2058
2112
|
slot.index = task.index;
|
|
@@ -2060,10 +2114,10 @@ struct server_context {
|
|
|
2060
2114
|
slot.params = std::move(task.params);
|
|
2061
2115
|
slot.prompt_tokens = std::move(task.prompt_tokens);
|
|
2062
2116
|
|
|
2063
|
-
if (!are_lora_equal(
|
|
2117
|
+
if (!are_lora_equal(slot.params.lora, slot.lora)) {
|
|
2064
2118
|
// if lora is changed, we cannot reuse cached tokens
|
|
2065
2119
|
slot.cache_tokens.clear();
|
|
2066
|
-
slot.lora =
|
|
2120
|
+
slot.lora = slot.params.lora;
|
|
2067
2121
|
}
|
|
2068
2122
|
|
|
2069
2123
|
bool can_detokenize = can_be_detokenized(ctx, slot.prompt_tokens);
|
|
@@ -2494,10 +2548,10 @@ struct server_context {
|
|
|
2494
2548
|
server_task task(SERVER_TASK_TYPE_CANCEL);
|
|
2495
2549
|
task.id_target = id_task;
|
|
2496
2550
|
queue_results.remove_waiting_task_id(id_task);
|
|
2497
|
-
cancel_tasks.push_back(task);
|
|
2551
|
+
cancel_tasks.push_back(std::move(task));
|
|
2498
2552
|
}
|
|
2499
2553
|
// push to beginning of the queue, so it has highest priority
|
|
2500
|
-
queue_tasks.post(cancel_tasks, true);
|
|
2554
|
+
queue_tasks.post(std::move(cancel_tasks), true);
|
|
2501
2555
|
}
|
|
2502
2556
|
|
|
2503
2557
|
// receive the results from task(s)
|
|
@@ -2584,7 +2638,7 @@ struct server_context {
|
|
|
2584
2638
|
// Functions to process the task
|
|
2585
2639
|
//
|
|
2586
2640
|
|
|
2587
|
-
void process_single_task(server_task task) {
|
|
2641
|
+
void process_single_task(server_task && task) {
|
|
2588
2642
|
switch (task.type) {
|
|
2589
2643
|
case SERVER_TASK_TYPE_COMPLETION:
|
|
2590
2644
|
case SERVER_TASK_TYPE_INFILL:
|
|
@@ -2598,17 +2652,17 @@ struct server_context {
|
|
|
2598
2652
|
if (slot == nullptr) {
|
|
2599
2653
|
// if no slot is available, we defer this task for processing later
|
|
2600
2654
|
SRV_DBG("no slot is available, defer task, id_task = %d\n", task.id);
|
|
2601
|
-
queue_tasks.defer(task);
|
|
2655
|
+
queue_tasks.defer(std::move(task));
|
|
2602
2656
|
break;
|
|
2603
2657
|
}
|
|
2604
2658
|
if (slot->is_processing()) {
|
|
2605
2659
|
// if requested slot is unavailable, we defer this task for processing later
|
|
2606
2660
|
SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id);
|
|
2607
|
-
queue_tasks.defer(task);
|
|
2661
|
+
queue_tasks.defer(std::move(task));
|
|
2608
2662
|
break;
|
|
2609
2663
|
}
|
|
2610
2664
|
|
|
2611
|
-
if (!launch_slot_with_task(*slot, task)) {
|
|
2665
|
+
if (!launch_slot_with_task(*slot, std::move(task))) {
|
|
2612
2666
|
SRV_ERR("failed to launch slot with task, id_task = %d\n", task.id);
|
|
2613
2667
|
break;
|
|
2614
2668
|
}
|
|
@@ -2687,7 +2741,7 @@ struct server_context {
|
|
|
2687
2741
|
if (slot->is_processing()) {
|
|
2688
2742
|
// if requested slot is unavailable, we defer this task for processing later
|
|
2689
2743
|
SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id);
|
|
2690
|
-
queue_tasks.defer(task);
|
|
2744
|
+
queue_tasks.defer(std::move(task));
|
|
2691
2745
|
break;
|
|
2692
2746
|
}
|
|
2693
2747
|
|
|
@@ -2723,7 +2777,7 @@ struct server_context {
|
|
|
2723
2777
|
if (slot->is_processing()) {
|
|
2724
2778
|
// if requested slot is unavailable, we defer this task for processing later
|
|
2725
2779
|
SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id);
|
|
2726
|
-
queue_tasks.defer(task);
|
|
2780
|
+
queue_tasks.defer(std::move(task));
|
|
2727
2781
|
break;
|
|
2728
2782
|
}
|
|
2729
2783
|
|
|
@@ -2766,7 +2820,7 @@ struct server_context {
|
|
|
2766
2820
|
if (slot->is_processing()) {
|
|
2767
2821
|
// if requested slot is unavailable, we defer this task for processing later
|
|
2768
2822
|
SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id);
|
|
2769
|
-
queue_tasks.defer(task);
|
|
2823
|
+
queue_tasks.defer(std::move(task));
|
|
2770
2824
|
break;
|
|
2771
2825
|
}
|
|
2772
2826
|
|
|
@@ -2818,7 +2872,7 @@ struct server_context {
|
|
|
2818
2872
|
|
|
2819
2873
|
server_task task(SERVER_TASK_TYPE_NEXT_RESPONSE);
|
|
2820
2874
|
task.id = queue_tasks.get_new_id();
|
|
2821
|
-
queue_tasks.post(task);
|
|
2875
|
+
queue_tasks.post(std::move(task));
|
|
2822
2876
|
}
|
|
2823
2877
|
|
|
2824
2878
|
// apply context-shift if needed
|
|
@@ -3285,6 +3339,9 @@ struct server_context {
|
|
|
3285
3339
|
|
|
3286
3340
|
llama_tokens draft = common_speculative_gen_draft(slot.spec, params_spec, slot.cache_tokens, id);
|
|
3287
3341
|
|
|
3342
|
+
// keep track of total number of tokens generated in the draft
|
|
3343
|
+
slot.n_draft_total += draft.size();
|
|
3344
|
+
|
|
3288
3345
|
// ignore small drafts
|
|
3289
3346
|
if (slot.params.speculative.n_min > (int) draft.size()) {
|
|
3290
3347
|
SLT_DBG(slot, "ignoring small draft: %d < %d\n", (int) draft.size(), slot.params.speculative.n_min);
|
|
@@ -3310,6 +3367,9 @@ struct server_context {
|
|
|
3310
3367
|
slot.n_past += ids.size();
|
|
3311
3368
|
slot.n_decoded += ids.size();
|
|
3312
3369
|
|
|
3370
|
+
// update how many tokens out of draft was accepted
|
|
3371
|
+
slot.n_draft_accepted += ids.size() - 1;
|
|
3372
|
+
|
|
3313
3373
|
slot.cache_tokens.push_back(id);
|
|
3314
3374
|
slot.cache_tokens.insert(slot.cache_tokens.end(), ids.begin(), ids.end() - 1);
|
|
3315
3375
|
|
|
@@ -3574,14 +3634,17 @@ int main(int argc, char ** argv) {
|
|
|
3574
3634
|
}
|
|
3575
3635
|
|
|
3576
3636
|
// request slots data using task queue
|
|
3577
|
-
|
|
3578
|
-
|
|
3579
|
-
|
|
3580
|
-
|
|
3637
|
+
int task_id = ctx_server.queue_tasks.get_new_id();
|
|
3638
|
+
{
|
|
3639
|
+
server_task task(SERVER_TASK_TYPE_METRICS);
|
|
3640
|
+
task.id = task_id;
|
|
3641
|
+
ctx_server.queue_results.add_waiting_task_id(task_id);
|
|
3642
|
+
ctx_server.queue_tasks.post(std::move(task), true); // high-priority task
|
|
3643
|
+
}
|
|
3581
3644
|
|
|
3582
3645
|
// get the result
|
|
3583
|
-
server_task_result_ptr result = ctx_server.queue_results.recv(
|
|
3584
|
-
ctx_server.queue_results.remove_waiting_task_id(
|
|
3646
|
+
server_task_result_ptr result = ctx_server.queue_results.recv(task_id);
|
|
3647
|
+
ctx_server.queue_results.remove_waiting_task_id(task_id);
|
|
3585
3648
|
|
|
3586
3649
|
if (result->is_error()) {
|
|
3587
3650
|
res_error(res, result->to_json());
|
|
@@ -3610,16 +3673,17 @@ int main(int argc, char ** argv) {
|
|
|
3610
3673
|
}
|
|
3611
3674
|
|
|
3612
3675
|
// request slots data using task queue
|
|
3613
|
-
|
|
3614
|
-
|
|
3615
|
-
|
|
3616
|
-
|
|
3617
|
-
|
|
3618
|
-
|
|
3676
|
+
int task_id = ctx_server.queue_tasks.get_new_id();
|
|
3677
|
+
{
|
|
3678
|
+
server_task task(SERVER_TASK_TYPE_METRICS);
|
|
3679
|
+
task.id = task_id;
|
|
3680
|
+
ctx_server.queue_results.add_waiting_task_id(task_id);
|
|
3681
|
+
ctx_server.queue_tasks.post(std::move(task), true); // high-priority task
|
|
3682
|
+
}
|
|
3619
3683
|
|
|
3620
3684
|
// get the result
|
|
3621
|
-
server_task_result_ptr result = ctx_server.queue_results.recv(
|
|
3622
|
-
ctx_server.queue_results.remove_waiting_task_id(
|
|
3685
|
+
server_task_result_ptr result = ctx_server.queue_results.recv(task_id);
|
|
3686
|
+
ctx_server.queue_results.remove_waiting_task_id(task_id);
|
|
3623
3687
|
|
|
3624
3688
|
if (result->is_error()) {
|
|
3625
3689
|
res_error(res, result->to_json());
|
|
@@ -3716,17 +3780,20 @@ int main(int argc, char ** argv) {
|
|
|
3716
3780
|
}
|
|
3717
3781
|
std::string filepath = params.slot_save_path + filename;
|
|
3718
3782
|
|
|
3719
|
-
|
|
3720
|
-
|
|
3721
|
-
|
|
3722
|
-
|
|
3723
|
-
|
|
3783
|
+
int task_id = ctx_server.queue_tasks.get_new_id();
|
|
3784
|
+
{
|
|
3785
|
+
server_task task(SERVER_TASK_TYPE_SLOT_SAVE);
|
|
3786
|
+
task.id = task_id;
|
|
3787
|
+
task.slot_action.slot_id = id_slot;
|
|
3788
|
+
task.slot_action.filename = filename;
|
|
3789
|
+
task.slot_action.filepath = filepath;
|
|
3724
3790
|
|
|
3725
|
-
|
|
3726
|
-
|
|
3791
|
+
ctx_server.queue_results.add_waiting_task_id(task_id);
|
|
3792
|
+
ctx_server.queue_tasks.post(std::move(task));
|
|
3793
|
+
}
|
|
3727
3794
|
|
|
3728
|
-
server_task_result_ptr result = ctx_server.queue_results.recv(
|
|
3729
|
-
ctx_server.queue_results.remove_waiting_task_id(
|
|
3795
|
+
server_task_result_ptr result = ctx_server.queue_results.recv(task_id);
|
|
3796
|
+
ctx_server.queue_results.remove_waiting_task_id(task_id);
|
|
3730
3797
|
|
|
3731
3798
|
if (result->is_error()) {
|
|
3732
3799
|
res_error(res, result->to_json());
|
|
@@ -3745,17 +3812,20 @@ int main(int argc, char ** argv) {
|
|
|
3745
3812
|
}
|
|
3746
3813
|
std::string filepath = params.slot_save_path + filename;
|
|
3747
3814
|
|
|
3748
|
-
|
|
3749
|
-
|
|
3750
|
-
|
|
3751
|
-
|
|
3752
|
-
|
|
3815
|
+
int task_id = ctx_server.queue_tasks.get_new_id();
|
|
3816
|
+
{
|
|
3817
|
+
server_task task(SERVER_TASK_TYPE_SLOT_RESTORE);
|
|
3818
|
+
task.id = task_id;
|
|
3819
|
+
task.slot_action.slot_id = id_slot;
|
|
3820
|
+
task.slot_action.filename = filename;
|
|
3821
|
+
task.slot_action.filepath = filepath;
|
|
3753
3822
|
|
|
3754
|
-
|
|
3755
|
-
|
|
3823
|
+
ctx_server.queue_results.add_waiting_task_id(task_id);
|
|
3824
|
+
ctx_server.queue_tasks.post(std::move(task));
|
|
3825
|
+
}
|
|
3756
3826
|
|
|
3757
|
-
server_task_result_ptr result = ctx_server.queue_results.recv(
|
|
3758
|
-
ctx_server.queue_results.remove_waiting_task_id(
|
|
3827
|
+
server_task_result_ptr result = ctx_server.queue_results.recv(task_id);
|
|
3828
|
+
ctx_server.queue_results.remove_waiting_task_id(task_id);
|
|
3759
3829
|
|
|
3760
3830
|
if (result->is_error()) {
|
|
3761
3831
|
res_error(res, result->to_json());
|
|
@@ -3767,15 +3837,18 @@ int main(int argc, char ** argv) {
|
|
|
3767
3837
|
};
|
|
3768
3838
|
|
|
3769
3839
|
const auto handle_slots_erase = [&ctx_server, &res_error, &res_ok](const httplib::Request & /* req */, httplib::Response & res, int id_slot) {
|
|
3770
|
-
|
|
3771
|
-
|
|
3772
|
-
|
|
3840
|
+
int task_id = ctx_server.queue_tasks.get_new_id();
|
|
3841
|
+
{
|
|
3842
|
+
server_task task(SERVER_TASK_TYPE_SLOT_ERASE);
|
|
3843
|
+
task.id = task_id;
|
|
3844
|
+
task.slot_action.slot_id = id_slot;
|
|
3773
3845
|
|
|
3774
|
-
|
|
3775
|
-
|
|
3846
|
+
ctx_server.queue_results.add_waiting_task_id(task_id);
|
|
3847
|
+
ctx_server.queue_tasks.post(std::move(task));
|
|
3848
|
+
}
|
|
3776
3849
|
|
|
3777
|
-
server_task_result_ptr result = ctx_server.queue_results.recv(
|
|
3778
|
-
ctx_server.queue_results.remove_waiting_task_id(
|
|
3850
|
+
server_task_result_ptr result = ctx_server.queue_results.recv(task_id);
|
|
3851
|
+
ctx_server.queue_results.remove_waiting_task_id(task_id);
|
|
3779
3852
|
|
|
3780
3853
|
if (result->is_error()) {
|
|
3781
3854
|
res_error(res, result->to_json());
|
|
@@ -3820,7 +3893,7 @@ int main(int argc, char ** argv) {
|
|
|
3820
3893
|
json data = {
|
|
3821
3894
|
{ "default_generation_settings", ctx_server.default_generation_settings_for_props },
|
|
3822
3895
|
{ "total_slots", ctx_server.params_base.n_parallel },
|
|
3823
|
-
{ "model_path", ctx_server.params_base.model },
|
|
3896
|
+
{ "model_path", ctx_server.params_base.model.path },
|
|
3824
3897
|
{ "chat_template", common_chat_templates_source(ctx_server.chat_templates.get()) },
|
|
3825
3898
|
{ "bos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_bos(ctx_server.vocab), /* special= */ true)},
|
|
3826
3899
|
{ "eos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_eos(ctx_server.vocab), /* special= */ true)},
|
|
@@ -3848,6 +3921,21 @@ int main(int argc, char ** argv) {
|
|
|
3848
3921
|
res_ok(res, {{ "success", true }});
|
|
3849
3922
|
};
|
|
3850
3923
|
|
|
3924
|
+
const auto handle_api_show = [&ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) {
|
|
3925
|
+
json data = {
|
|
3926
|
+
{
|
|
3927
|
+
"template", common_chat_templates_source(ctx_server.chat_templates.get()),
|
|
3928
|
+
},
|
|
3929
|
+
{
|
|
3930
|
+
"model_info", {
|
|
3931
|
+
{ "llama.context_length", ctx_server.slots.back().n_ctx, },
|
|
3932
|
+
}
|
|
3933
|
+
},
|
|
3934
|
+
};
|
|
3935
|
+
|
|
3936
|
+
res_ok(res, data);
|
|
3937
|
+
};
|
|
3938
|
+
|
|
3851
3939
|
// handle completion-like requests (completion, chat, infill)
|
|
3852
3940
|
// we can optionally provide a custom format for partial results and final results
|
|
3853
3941
|
const auto handle_completions_impl = [&ctx_server, &res_error, &res_ok](
|
|
@@ -3864,9 +3952,10 @@ int main(int argc, char ** argv) {
|
|
|
3864
3952
|
}
|
|
3865
3953
|
|
|
3866
3954
|
auto completion_id = gen_chatcmplid();
|
|
3867
|
-
std::
|
|
3868
|
-
|
|
3955
|
+
std::unordered_set<int> task_ids;
|
|
3869
3956
|
try {
|
|
3957
|
+
std::vector<server_task> tasks;
|
|
3958
|
+
|
|
3870
3959
|
const auto & prompt = data.at("prompt");
|
|
3871
3960
|
// TODO: this log can become very long, put it behind a flag or think about a more compact format
|
|
3872
3961
|
//SRV_DBG("Prompt: %s\n", prompt.is_string() ? prompt.get<std::string>().c_str() : prompt.dump(2).c_str());
|
|
@@ -3881,9 +3970,9 @@ int main(int argc, char ** argv) {
|
|
|
3881
3970
|
|
|
3882
3971
|
task.prompt_tokens = std::move(tokenized_prompts[i]);
|
|
3883
3972
|
task.params = server_task::params_from_json_cmpl(
|
|
3884
|
-
|
|
3885
|
-
|
|
3886
|
-
|
|
3973
|
+
ctx_server.ctx,
|
|
3974
|
+
ctx_server.params_base,
|
|
3975
|
+
data);
|
|
3887
3976
|
task.id_selected_slot = json_value(data, "id_slot", -1);
|
|
3888
3977
|
|
|
3889
3978
|
// OAI-compat
|
|
@@ -3891,18 +3980,18 @@ int main(int argc, char ** argv) {
|
|
|
3891
3980
|
task.params.oaicompat_cmpl_id = completion_id;
|
|
3892
3981
|
// oaicompat_model is already populated by params_from_json_cmpl
|
|
3893
3982
|
|
|
3894
|
-
tasks.push_back(task);
|
|
3983
|
+
tasks.push_back(std::move(task));
|
|
3895
3984
|
}
|
|
3985
|
+
|
|
3986
|
+
task_ids = server_task::get_list_id(tasks);
|
|
3987
|
+
ctx_server.queue_results.add_waiting_tasks(tasks);
|
|
3988
|
+
ctx_server.queue_tasks.post(std::move(tasks));
|
|
3896
3989
|
} catch (const std::exception & e) {
|
|
3897
3990
|
res_error(res, format_error_response(e.what(), ERROR_TYPE_INVALID_REQUEST));
|
|
3898
3991
|
return;
|
|
3899
3992
|
}
|
|
3900
3993
|
|
|
3901
|
-
ctx_server.queue_results.add_waiting_tasks(tasks);
|
|
3902
|
-
ctx_server.queue_tasks.post(tasks);
|
|
3903
|
-
|
|
3904
3994
|
bool stream = json_value(data, "stream", false);
|
|
3905
|
-
const auto task_ids = server_task::get_list_id(tasks);
|
|
3906
3995
|
|
|
3907
3996
|
if (!stream) {
|
|
3908
3997
|
ctx_server.receive_multi_results(task_ids, [&](std::vector<server_task_result_ptr> & results) {
|
|
@@ -4086,7 +4175,7 @@ int main(int argc, char ** argv) {
|
|
|
4086
4175
|
{"object", "list"},
|
|
4087
4176
|
{"data", {
|
|
4088
4177
|
{
|
|
4089
|
-
{"id", params.model_alias.empty() ? params.model : params.model_alias},
|
|
4178
|
+
{"id", params.model_alias.empty() ? params.model.path : params.model_alias},
|
|
4090
4179
|
{"object", "model"},
|
|
4091
4180
|
{"created", std::time(0)},
|
|
4092
4181
|
{"owned_by", "llamacpp"},
|
|
@@ -4194,6 +4283,7 @@ int main(int argc, char ** argv) {
|
|
|
4194
4283
|
// create and queue the task
|
|
4195
4284
|
json responses = json::array();
|
|
4196
4285
|
bool error = false;
|
|
4286
|
+
std::unordered_set<int> task_ids;
|
|
4197
4287
|
{
|
|
4198
4288
|
std::vector<server_task> tasks;
|
|
4199
4289
|
for (size_t i = 0; i < tokenized_prompts.size(); i++) {
|
|
@@ -4206,27 +4296,26 @@ int main(int argc, char ** argv) {
|
|
|
4206
4296
|
// OAI-compat
|
|
4207
4297
|
task.params.oaicompat = oaicompat;
|
|
4208
4298
|
|
|
4209
|
-
tasks.push_back(task);
|
|
4299
|
+
tasks.push_back(std::move(task));
|
|
4210
4300
|
}
|
|
4211
4301
|
|
|
4302
|
+
task_ids = server_task::get_list_id(tasks);
|
|
4212
4303
|
ctx_server.queue_results.add_waiting_tasks(tasks);
|
|
4213
|
-
ctx_server.queue_tasks.post(tasks);
|
|
4214
|
-
|
|
4215
|
-
// get the result
|
|
4216
|
-
std::unordered_set<int> task_ids = server_task::get_list_id(tasks);
|
|
4304
|
+
ctx_server.queue_tasks.post(std::move(tasks));
|
|
4305
|
+
}
|
|
4217
4306
|
|
|
4218
|
-
|
|
4219
|
-
|
|
4220
|
-
|
|
4221
|
-
|
|
4222
|
-
|
|
4223
|
-
}
|
|
4224
|
-
|
|
4225
|
-
|
|
4226
|
-
|
|
4307
|
+
// get the result
|
|
4308
|
+
ctx_server.receive_multi_results(task_ids, [&](std::vector<server_task_result_ptr> & results) {
|
|
4309
|
+
for (auto & res : results) {
|
|
4310
|
+
GGML_ASSERT(dynamic_cast<server_task_result_embd*>(res.get()) != nullptr);
|
|
4311
|
+
responses.push_back(res->to_json());
|
|
4312
|
+
}
|
|
4313
|
+
}, [&](const json & error_data) {
|
|
4314
|
+
res_error(res, error_data);
|
|
4315
|
+
error = true;
|
|
4316
|
+
}, req.is_connection_closed);
|
|
4227
4317
|
|
|
4228
|
-
|
|
4229
|
-
}
|
|
4318
|
+
ctx_server.queue_results.remove_waiting_task_ids(task_ids);
|
|
4230
4319
|
|
|
4231
4320
|
if (error) {
|
|
4232
4321
|
return;
|
|
@@ -4293,6 +4382,7 @@ int main(int argc, char ** argv) {
|
|
|
4293
4382
|
// create and queue the task
|
|
4294
4383
|
json responses = json::array();
|
|
4295
4384
|
bool error = false;
|
|
4385
|
+
std::unordered_set<int> task_ids;
|
|
4296
4386
|
{
|
|
4297
4387
|
std::vector<server_task> tasks;
|
|
4298
4388
|
std::vector<llama_tokens> tokenized_docs = tokenize_input_prompts(ctx_server.vocab, documents, /* add_special */ false, true);
|
|
@@ -4302,26 +4392,24 @@ int main(int argc, char ** argv) {
|
|
|
4302
4392
|
task.id = ctx_server.queue_tasks.get_new_id();
|
|
4303
4393
|
task.index = i;
|
|
4304
4394
|
task.prompt_tokens = format_rerank(ctx_server.vocab, tokenized_query, tokenized_docs[i]);
|
|
4305
|
-
tasks.push_back(task);
|
|
4395
|
+
tasks.push_back(std::move(task));
|
|
4306
4396
|
}
|
|
4307
4397
|
|
|
4398
|
+
task_ids = server_task::get_list_id(tasks);
|
|
4308
4399
|
ctx_server.queue_results.add_waiting_tasks(tasks);
|
|
4309
|
-
ctx_server.queue_tasks.post(tasks);
|
|
4310
|
-
|
|
4311
|
-
// get the result
|
|
4312
|
-
std::unordered_set<int> task_ids = server_task::get_list_id(tasks);
|
|
4313
|
-
|
|
4314
|
-
ctx_server.receive_multi_results(task_ids, [&](std::vector<server_task_result_ptr> & results) {
|
|
4315
|
-
for (auto & res : results) {
|
|
4316
|
-
GGML_ASSERT(dynamic_cast<server_task_result_rerank*>(res.get()) != nullptr);
|
|
4317
|
-
responses.push_back(res->to_json());
|
|
4318
|
-
}
|
|
4319
|
-
}, [&](const json & error_data) {
|
|
4320
|
-
res_error(res, error_data);
|
|
4321
|
-
error = true;
|
|
4322
|
-
}, req.is_connection_closed);
|
|
4400
|
+
ctx_server.queue_tasks.post(std::move(tasks));
|
|
4323
4401
|
}
|
|
4324
4402
|
|
|
4403
|
+
ctx_server.receive_multi_results(task_ids, [&](std::vector<server_task_result_ptr> & results) {
|
|
4404
|
+
for (auto & res : results) {
|
|
4405
|
+
GGML_ASSERT(dynamic_cast<server_task_result_rerank*>(res.get()) != nullptr);
|
|
4406
|
+
responses.push_back(res->to_json());
|
|
4407
|
+
}
|
|
4408
|
+
}, [&](const json & error_data) {
|
|
4409
|
+
res_error(res, error_data);
|
|
4410
|
+
error = true;
|
|
4411
|
+
}, req.is_connection_closed);
|
|
4412
|
+
|
|
4325
4413
|
if (error) {
|
|
4326
4414
|
return;
|
|
4327
4415
|
}
|
|
@@ -4357,14 +4445,19 @@ int main(int argc, char ** argv) {
|
|
|
4357
4445
|
res_error(res, format_error_response("Request body must be an array", ERROR_TYPE_INVALID_REQUEST));
|
|
4358
4446
|
return;
|
|
4359
4447
|
}
|
|
4360
|
-
server_task task(SERVER_TASK_TYPE_SET_LORA);
|
|
4361
|
-
task.id = ctx_server.queue_tasks.get_new_id();
|
|
4362
|
-
task.set_lora = parse_lora_request(ctx_server.params_base.lora_adapters, body);
|
|
4363
|
-
ctx_server.queue_results.add_waiting_task_id(task.id);
|
|
4364
|
-
ctx_server.queue_tasks.post(task);
|
|
4365
4448
|
|
|
4366
|
-
|
|
4367
|
-
|
|
4449
|
+
int task_id = ctx_server.queue_tasks.get_new_id();
|
|
4450
|
+
{
|
|
4451
|
+
server_task task(SERVER_TASK_TYPE_SET_LORA);
|
|
4452
|
+
task.id = task_id;
|
|
4453
|
+
task.set_lora = parse_lora_request(ctx_server.params_base.lora_adapters, body);
|
|
4454
|
+
ctx_server.queue_results.add_waiting_task_id(task_id);
|
|
4455
|
+
ctx_server.queue_tasks.post(std::move(task));
|
|
4456
|
+
}
|
|
4457
|
+
|
|
4458
|
+
// get the result
|
|
4459
|
+
server_task_result_ptr result = ctx_server.queue_results.recv(task_id);
|
|
4460
|
+
ctx_server.queue_results.remove_waiting_task_id(task_id);
|
|
4368
4461
|
|
|
4369
4462
|
if (result->is_error()) {
|
|
4370
4463
|
res_error(res, result->to_json());
|
|
@@ -4412,6 +4505,7 @@ int main(int argc, char ** argv) {
|
|
|
4412
4505
|
svr->Get ("/metrics", handle_metrics);
|
|
4413
4506
|
svr->Get ("/props", handle_props);
|
|
4414
4507
|
svr->Post("/props", handle_props_change);
|
|
4508
|
+
svr->Post("/api/show", handle_api_show);
|
|
4415
4509
|
svr->Get ("/models", handle_models); // public endpoint (no API key check)
|
|
4416
4510
|
svr->Get ("/v1/models", handle_models); // public endpoint (no API key check)
|
|
4417
4511
|
svr->Post("/completion", handle_completions); // legacy
|
|
@@ -4448,21 +4542,31 @@ int main(int argc, char ** argv) {
|
|
|
4448
4542
|
svr->new_task_queue = [&params] { return new httplib::ThreadPool(params.n_threads_http); };
|
|
4449
4543
|
|
|
4450
4544
|
// clean up function, to be called before exit
|
|
4451
|
-
auto clean_up = [&svr]() {
|
|
4545
|
+
auto clean_up = [&svr, &ctx_server]() {
|
|
4452
4546
|
SRV_INF("%s: cleaning up before exit...\n", __func__);
|
|
4453
4547
|
svr->stop();
|
|
4548
|
+
ctx_server.queue_results.terminate();
|
|
4454
4549
|
llama_backend_free();
|
|
4455
4550
|
};
|
|
4456
4551
|
|
|
4457
|
-
// bind HTTP listen port
|
|
4458
4552
|
bool was_bound = false;
|
|
4459
|
-
if (params.
|
|
4460
|
-
|
|
4461
|
-
|
|
4462
|
-
|
|
4463
|
-
|
|
4553
|
+
if (string_ends_with(std::string(params.hostname), ".sock")) {
|
|
4554
|
+
LOG_INF("%s: setting address family to AF_UNIX\n", __func__);
|
|
4555
|
+
svr->set_address_family(AF_UNIX);
|
|
4556
|
+
// bind_to_port requires a second arg, any value other than 0 should
|
|
4557
|
+
// simply get ignored
|
|
4558
|
+
was_bound = svr->bind_to_port(params.hostname, 8080);
|
|
4464
4559
|
} else {
|
|
4465
|
-
|
|
4560
|
+
LOG_INF("%s: binding port with default address family\n", __func__);
|
|
4561
|
+
// bind HTTP listen port
|
|
4562
|
+
if (params.port == 0) {
|
|
4563
|
+
int bound_port = svr->bind_to_any_port(params.hostname);
|
|
4564
|
+
if ((was_bound = (bound_port >= 0))) {
|
|
4565
|
+
params.port = bound_port;
|
|
4566
|
+
}
|
|
4567
|
+
} else {
|
|
4568
|
+
was_bound = svr->bind_to_port(params.hostname, params.port);
|
|
4569
|
+
}
|
|
4466
4570
|
}
|
|
4467
4571
|
|
|
4468
4572
|
if (!was_bound) {
|
|
@@ -4482,7 +4586,7 @@ int main(int argc, char ** argv) {
|
|
|
4482
4586
|
|
|
4483
4587
|
if (!ctx_server.load_model(params)) {
|
|
4484
4588
|
clean_up();
|
|
4485
|
-
|
|
4589
|
+
t.join();
|
|
4486
4590
|
LOG_ERR("%s: exiting due to model loading error\n", __func__);
|
|
4487
4591
|
return 1;
|
|
4488
4592
|
}
|
|
@@ -4497,8 +4601,8 @@ int main(int argc, char ** argv) {
|
|
|
4497
4601
|
common_chat_templates_source(ctx_server.chat_templates.get()),
|
|
4498
4602
|
common_chat_format_example(ctx_server.chat_templates.get(), ctx_server.params_base.use_jinja).c_str());
|
|
4499
4603
|
|
|
4500
|
-
ctx_server.queue_tasks.on_new_task([&ctx_server](
|
|
4501
|
-
ctx_server.process_single_task(task);
|
|
4604
|
+
ctx_server.queue_tasks.on_new_task([&ctx_server](server_task && task) {
|
|
4605
|
+
ctx_server.process_single_task(std::move(task));
|
|
4502
4606
|
});
|
|
4503
4607
|
|
|
4504
4608
|
ctx_server.queue_tasks.on_update_slots([&ctx_server]() {
|
|
@@ -4530,7 +4634,7 @@ int main(int argc, char ** argv) {
|
|
|
4530
4634
|
ctx_server.queue_tasks.start_loop();
|
|
4531
4635
|
|
|
4532
4636
|
clean_up();
|
|
4533
|
-
|
|
4637
|
+
t.join();
|
|
4534
4638
|
|
|
4535
4639
|
return 0;
|
|
4536
4640
|
}
|