@fugood/llama.node 0.3.16 → 0.3.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +3 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +5 -0
- package/package.json +1 -1
- package/src/LlamaCompletionWorker.cpp +8 -0
- package/src/LlamaCompletionWorker.h +1 -0
- package/src/LlamaContext.cpp +3 -2
- package/src/llama.cpp/.github/workflows/build-linux-cross.yml +124 -0
- package/src/llama.cpp/.github/workflows/build.yml +70 -27
- package/src/llama.cpp/.github/workflows/docker.yml +6 -6
- package/src/llama.cpp/.github/workflows/server.yml +7 -11
- package/src/llama.cpp/CMakeLists.txt +23 -1
- package/src/llama.cpp/common/CMakeLists.txt +6 -3
- package/src/llama.cpp/common/arg.cpp +809 -105
- package/src/llama.cpp/common/arg.h +9 -0
- package/src/llama.cpp/common/chat.cpp +1 -1
- package/src/llama.cpp/common/common.cpp +31 -521
- package/src/llama.cpp/common/common.h +17 -36
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
- package/src/llama.cpp/common/llguidance.cpp +30 -47
- package/src/llama.cpp/common/minja/chat-template.hpp +15 -7
- package/src/llama.cpp/common/minja/minja.hpp +119 -93
- package/src/llama.cpp/common/sampling.cpp +3 -0
- package/src/llama.cpp/docs/build.md +122 -7
- package/src/llama.cpp/examples/CMakeLists.txt +0 -9
- package/src/llama.cpp/examples/batched/batched.cpp +1 -1
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +7 -1
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +15 -16
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +210 -8
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llava/CMakeLists.txt +39 -24
- package/src/llama.cpp/examples/llava/clip-impl.h +345 -0
- package/src/llama.cpp/examples/llava/clip.cpp +2152 -1803
- package/src/llama.cpp/examples/llava/clip.h +39 -22
- package/src/llama.cpp/examples/llava/deprecation-warning.cpp +22 -0
- package/src/llama.cpp/examples/llava/llava.cpp +64 -52
- package/src/llama.cpp/examples/llava/mtmd-cli.cpp +344 -0
- package/src/llama.cpp/examples/llava/mtmd.cpp +708 -0
- package/src/llama.cpp/examples/llava/mtmd.h +168 -0
- package/src/llama.cpp/examples/llava/{qwen2vl-cli.cpp → qwen2vl-test.cpp} +83 -31
- package/src/llama.cpp/examples/main/main.cpp +16 -5
- package/src/llama.cpp/examples/parallel/parallel.cpp +3 -1
- package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +17 -3
- package/src/llama.cpp/examples/quantize/quantize.cpp +115 -2
- package/src/llama.cpp/examples/rpc/CMakeLists.txt +4 -2
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +163 -8
- package/src/llama.cpp/examples/run/CMakeLists.txt +12 -1
- package/src/llama.cpp/examples/run/run.cpp +14 -28
- package/src/llama.cpp/examples/server/httplib.h +313 -247
- package/src/llama.cpp/examples/server/server.cpp +238 -139
- package/src/llama.cpp/examples/server/utils.hpp +51 -2
- package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
- package/src/llama.cpp/examples/sycl/build.sh +2 -2
- package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
- package/src/llama.cpp/examples/tts/tts.cpp +6 -9
- package/src/llama.cpp/ggml/CMakeLists.txt +8 -2
- package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
- package/src/llama.cpp/ggml/include/ggml.h +66 -99
- package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
- package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +48 -22
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -192
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +754 -404
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1003 -13519
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +2 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +0 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +3 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +533 -88
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8809 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +258 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
- package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +70 -3
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -260
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +293 -40
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +96 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +350 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +2 -292
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +967 -438
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +204 -280
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +23 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +646 -114
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +12 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +17 -8
- package/src/llama.cpp/ggml/src/ggml.c +141 -245
- package/src/llama.cpp/ggml/src/gguf.cpp +1 -0
- package/src/llama.cpp/include/llama.h +30 -11
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +2 -0
- package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
- package/src/llama.cpp/src/CMakeLists.txt +3 -2
- package/src/llama.cpp/src/llama-adapter.cpp +37 -1
- package/src/llama.cpp/src/llama-arch.cpp +160 -17
- package/src/llama.cpp/src/llama-arch.h +16 -0
- package/src/llama.cpp/src/llama-chat.cpp +82 -17
- package/src/llama.cpp/src/llama-chat.h +6 -2
- package/src/llama.cpp/src/llama-context.cpp +108 -92
- package/src/llama.cpp/src/llama-context.h +1 -2
- package/src/llama.cpp/src/llama-graph.cpp +189 -119
- package/src/llama.cpp/src/llama-graph.h +26 -6
- package/src/llama.cpp/src/llama-hparams.h +13 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +70 -123
- package/src/llama.cpp/src/llama-kv-cache.h +41 -115
- package/src/llama.cpp/src/llama-memory.h +1 -1
- package/src/llama.cpp/src/llama-mmap.cpp +1 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +10 -5
- package/src/llama.cpp/src/llama-model-loader.h +5 -3
- package/src/llama.cpp/src/llama-model.cpp +1760 -534
- package/src/llama.cpp/src/llama-model.h +13 -1
- package/src/llama.cpp/src/llama-quant.cpp +29 -8
- package/src/llama.cpp/src/llama-sampling.cpp +7 -1
- package/src/llama.cpp/src/llama-vocab.cpp +44 -6
- package/src/llama.cpp/src/llama.cpp +1 -1
- package/src/llama.cpp/tests/CMakeLists.txt +43 -30
- package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
- package/src/llama.cpp/tests/test-backend-ops.cpp +82 -43
- package/src/llama.cpp/tests/test-chat-template.cpp +34 -13
- package/src/llama.cpp/tests/test-chat.cpp +12 -2
- package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
- package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
- package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
- package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
- package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
- package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
- package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
|
@@ -133,7 +133,8 @@ struct slot_params {
|
|
|
133
133
|
|
|
134
134
|
auto grammar_triggers = json::array();
|
|
135
135
|
for (const auto & trigger : sampling.grammar_triggers) {
|
|
136
|
-
|
|
136
|
+
server_grammar_trigger ct(std::move(trigger));
|
|
137
|
+
grammar_triggers.push_back(ct.to_json());
|
|
137
138
|
}
|
|
138
139
|
|
|
139
140
|
return json {
|
|
@@ -372,9 +373,9 @@ struct server_task {
|
|
|
372
373
|
const auto grammar_triggers = data.find("grammar_triggers");
|
|
373
374
|
if (grammar_triggers != data.end()) {
|
|
374
375
|
for (const auto & t : *grammar_triggers) {
|
|
375
|
-
|
|
376
|
-
if (ct.type == COMMON_GRAMMAR_TRIGGER_TYPE_WORD) {
|
|
377
|
-
const auto & word = ct.value;
|
|
376
|
+
server_grammar_trigger ct(t);
|
|
377
|
+
if (ct.value.type == COMMON_GRAMMAR_TRIGGER_TYPE_WORD) {
|
|
378
|
+
const auto & word = ct.value.value;
|
|
378
379
|
auto ids = common_tokenize(vocab, word, /* add_special= */ false, /* parse_special= */ true);
|
|
379
380
|
if (ids.size() == 1) {
|
|
380
381
|
auto token = ids[0];
|
|
@@ -392,7 +393,7 @@ struct server_task {
|
|
|
392
393
|
params.sampling.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, word});
|
|
393
394
|
}
|
|
394
395
|
} else {
|
|
395
|
-
params.sampling.grammar_triggers.push_back(ct);
|
|
396
|
+
params.sampling.grammar_triggers.push_back(std::move(ct.value));
|
|
396
397
|
}
|
|
397
398
|
}
|
|
398
399
|
}
|
|
@@ -489,8 +490,12 @@ struct result_timings {
|
|
|
489
490
|
double predicted_per_token_ms;
|
|
490
491
|
double predicted_per_second;
|
|
491
492
|
|
|
493
|
+
// Optional speculative metrics - only included when > 0
|
|
494
|
+
int32_t draft_n = 0;
|
|
495
|
+
int32_t draft_n_accepted = 0;
|
|
496
|
+
|
|
492
497
|
json to_json() const {
|
|
493
|
-
|
|
498
|
+
json base = {
|
|
494
499
|
{"prompt_n", prompt_n},
|
|
495
500
|
{"prompt_ms", prompt_ms},
|
|
496
501
|
{"prompt_per_token_ms", prompt_per_token_ms},
|
|
@@ -501,6 +506,13 @@ struct result_timings {
|
|
|
501
506
|
{"predicted_per_token_ms", predicted_per_token_ms},
|
|
502
507
|
{"predicted_per_second", predicted_per_second},
|
|
503
508
|
};
|
|
509
|
+
|
|
510
|
+
if (draft_n > 0) {
|
|
511
|
+
base["draft_n"] = draft_n;
|
|
512
|
+
base["draft_n_accepted"] = draft_n_accepted;
|
|
513
|
+
}
|
|
514
|
+
|
|
515
|
+
return base;
|
|
504
516
|
}
|
|
505
517
|
};
|
|
506
518
|
|
|
@@ -1299,6 +1311,10 @@ struct server_slot {
|
|
|
1299
1311
|
|
|
1300
1312
|
std::function<void(int)> callback_on_release;
|
|
1301
1313
|
|
|
1314
|
+
// Speculative decoding stats
|
|
1315
|
+
int32_t n_draft_total = 0; // Total draft tokens generated
|
|
1316
|
+
int32_t n_draft_accepted = 0; // Draft tokens actually accepted
|
|
1317
|
+
|
|
1302
1318
|
void reset() {
|
|
1303
1319
|
SLT_DBG(*this, "%s", "\n");
|
|
1304
1320
|
|
|
@@ -1315,6 +1331,10 @@ struct server_slot {
|
|
|
1315
1331
|
|
|
1316
1332
|
generated_tokens.clear();
|
|
1317
1333
|
generated_token_probs.clear();
|
|
1334
|
+
|
|
1335
|
+
// clear speculative decoding stats
|
|
1336
|
+
n_draft_total = 0;
|
|
1337
|
+
n_draft_accepted = 0;
|
|
1318
1338
|
}
|
|
1319
1339
|
|
|
1320
1340
|
bool is_non_causal() const {
|
|
@@ -1381,6 +1401,12 @@ struct server_slot {
|
|
|
1381
1401
|
timings.predicted_per_token_ms = t_token_generation / n_decoded;
|
|
1382
1402
|
timings.predicted_per_second = 1e3 / t_token_generation * n_decoded;
|
|
1383
1403
|
|
|
1404
|
+
// Add speculative metrics
|
|
1405
|
+
if (n_draft_total > 0) {
|
|
1406
|
+
timings.draft_n = n_draft_total;
|
|
1407
|
+
timings.draft_n_accepted = n_draft_accepted;
|
|
1408
|
+
}
|
|
1409
|
+
|
|
1384
1410
|
return timings;
|
|
1385
1411
|
}
|
|
1386
1412
|
|
|
@@ -1428,6 +1454,15 @@ struct server_slot {
|
|
|
1428
1454
|
t_prompt_processing, n_prompt_tokens_processed, t_prompt, n_prompt_second,
|
|
1429
1455
|
t_token_generation, n_decoded, t_gen, n_gen_second,
|
|
1430
1456
|
t_prompt_processing + t_token_generation, n_prompt_tokens_processed + n_decoded);
|
|
1457
|
+
|
|
1458
|
+
if (n_draft_total > 0) {
|
|
1459
|
+
const float draft_ratio = (float) n_draft_accepted / n_draft_total;
|
|
1460
|
+
SLT_INF(*this,
|
|
1461
|
+
"\n"
|
|
1462
|
+
"draft acceptance rate = %0.5f (%5d accepted / %5d generated)\n",
|
|
1463
|
+
draft_ratio, n_draft_accepted, n_draft_total
|
|
1464
|
+
);
|
|
1465
|
+
}
|
|
1431
1466
|
}
|
|
1432
1467
|
|
|
1433
1468
|
json to_json() const {
|
|
@@ -1517,29 +1552,30 @@ struct server_queue {
|
|
|
1517
1552
|
std::condition_variable condition_tasks;
|
|
1518
1553
|
|
|
1519
1554
|
// callback functions
|
|
1520
|
-
std::function<void(server_task)> callback_new_task;
|
|
1521
|
-
std::function<void(void)>
|
|
1555
|
+
std::function<void(server_task &&)> callback_new_task;
|
|
1556
|
+
std::function<void(void)> callback_update_slots;
|
|
1522
1557
|
|
|
1523
1558
|
// Add a new task to the end of the queue
|
|
1524
|
-
int post(server_task task, bool front = false) {
|
|
1559
|
+
int post(server_task && task, bool front = false) {
|
|
1525
1560
|
std::unique_lock<std::mutex> lock(mutex_tasks);
|
|
1526
1561
|
GGML_ASSERT(task.id != -1);
|
|
1527
1562
|
// if this is cancel task make sure to clean up pending tasks
|
|
1528
1563
|
if (task.type == SERVER_TASK_TYPE_CANCEL) {
|
|
1529
1564
|
cleanup_pending_task(task.id_target);
|
|
1530
1565
|
}
|
|
1531
|
-
|
|
1566
|
+
const int task_id = task.id;
|
|
1567
|
+
QUE_DBG("new task, id = %d, front = %d\n", task_id, front);
|
|
1532
1568
|
if (front) {
|
|
1533
1569
|
queue_tasks.push_front(std::move(task));
|
|
1534
1570
|
} else {
|
|
1535
1571
|
queue_tasks.push_back(std::move(task));
|
|
1536
1572
|
}
|
|
1537
1573
|
condition_tasks.notify_one();
|
|
1538
|
-
return
|
|
1574
|
+
return task_id;
|
|
1539
1575
|
}
|
|
1540
1576
|
|
|
1541
1577
|
// multi-task version of post()
|
|
1542
|
-
int post(std::vector<server_task>
|
|
1578
|
+
int post(std::vector<server_task> && tasks, bool front = false) {
|
|
1543
1579
|
std::unique_lock<std::mutex> lock(mutex_tasks);
|
|
1544
1580
|
for (auto & task : tasks) {
|
|
1545
1581
|
if (task.id == -1) {
|
|
@@ -1561,7 +1597,7 @@ struct server_queue {
|
|
|
1561
1597
|
}
|
|
1562
1598
|
|
|
1563
1599
|
// Add a new task, but defer until one slot is available
|
|
1564
|
-
void defer(server_task task) {
|
|
1600
|
+
void defer(server_task && task) {
|
|
1565
1601
|
std::unique_lock<std::mutex> lock(mutex_tasks);
|
|
1566
1602
|
QUE_DBG("defer task, id = %d\n", task.id);
|
|
1567
1603
|
queue_tasks_deferred.push_back(std::move(task));
|
|
@@ -1576,7 +1612,7 @@ struct server_queue {
|
|
|
1576
1612
|
}
|
|
1577
1613
|
|
|
1578
1614
|
// Register function to process a new task
|
|
1579
|
-
void on_new_task(std::function<void(server_task)> callback) {
|
|
1615
|
+
void on_new_task(std::function<void(server_task &&)> callback) {
|
|
1580
1616
|
callback_new_task = std::move(callback);
|
|
1581
1617
|
}
|
|
1582
1618
|
|
|
@@ -1625,7 +1661,7 @@ struct server_queue {
|
|
|
1625
1661
|
lock.unlock();
|
|
1626
1662
|
break;
|
|
1627
1663
|
}
|
|
1628
|
-
server_task task = queue_tasks.front();
|
|
1664
|
+
server_task task = std::move(queue_tasks.front());
|
|
1629
1665
|
queue_tasks.pop_front();
|
|
1630
1666
|
lock.unlock();
|
|
1631
1667
|
|
|
@@ -1670,6 +1706,8 @@ private:
|
|
|
1670
1706
|
};
|
|
1671
1707
|
|
|
1672
1708
|
struct server_response {
|
|
1709
|
+
bool running = true;
|
|
1710
|
+
|
|
1673
1711
|
// for keeping track of all tasks waiting for the result
|
|
1674
1712
|
std::unordered_set<int> waiting_task_ids;
|
|
1675
1713
|
|
|
@@ -1724,6 +1762,10 @@ struct server_response {
|
|
|
1724
1762
|
while (true) {
|
|
1725
1763
|
std::unique_lock<std::mutex> lock(mutex_results);
|
|
1726
1764
|
condition_results.wait(lock, [&]{
|
|
1765
|
+
if (!running) {
|
|
1766
|
+
SRV_DBG("%s : queue result stop\n", __func__);
|
|
1767
|
+
std::terminate(); // we cannot return here since the caller is HTTP code
|
|
1768
|
+
}
|
|
1727
1769
|
return !queue_results.empty();
|
|
1728
1770
|
});
|
|
1729
1771
|
|
|
@@ -1754,6 +1796,10 @@ struct server_response {
|
|
|
1754
1796
|
}
|
|
1755
1797
|
|
|
1756
1798
|
std::cv_status cr_res = condition_results.wait_for(lock, std::chrono::seconds(timeout));
|
|
1799
|
+
if (!running) {
|
|
1800
|
+
SRV_DBG("%s : queue result stop\n", __func__);
|
|
1801
|
+
std::terminate(); // we cannot return here since the caller is HTTP code
|
|
1802
|
+
}
|
|
1757
1803
|
if (cr_res == std::cv_status::timeout) {
|
|
1758
1804
|
return nullptr;
|
|
1759
1805
|
}
|
|
@@ -1783,6 +1829,12 @@ struct server_response {
|
|
|
1783
1829
|
}
|
|
1784
1830
|
}
|
|
1785
1831
|
}
|
|
1832
|
+
|
|
1833
|
+
// terminate the waiting loop
|
|
1834
|
+
void terminate() {
|
|
1835
|
+
running = false;
|
|
1836
|
+
condition_results.notify_all();
|
|
1837
|
+
}
|
|
1786
1838
|
};
|
|
1787
1839
|
|
|
1788
1840
|
struct server_context {
|
|
@@ -1842,7 +1894,7 @@ struct server_context {
|
|
|
1842
1894
|
}
|
|
1843
1895
|
|
|
1844
1896
|
bool load_model(const common_params & params) {
|
|
1845
|
-
SRV_INF("loading model '%s'\n", params.model.c_str());
|
|
1897
|
+
SRV_INF("loading model '%s'\n", params.model.path.c_str());
|
|
1846
1898
|
|
|
1847
1899
|
params_base = params;
|
|
1848
1900
|
|
|
@@ -1852,7 +1904,7 @@ struct server_context {
|
|
|
1852
1904
|
ctx = llama_init.context.get();
|
|
1853
1905
|
|
|
1854
1906
|
if (model == nullptr) {
|
|
1855
|
-
SRV_ERR("failed to load model, '%s'\n", params_base.model.c_str());
|
|
1907
|
+
SRV_ERR("failed to load model, '%s'\n", params_base.model.path.c_str());
|
|
1856
1908
|
return false;
|
|
1857
1909
|
}
|
|
1858
1910
|
|
|
@@ -1863,16 +1915,13 @@ struct server_context {
|
|
|
1863
1915
|
add_bos_token = llama_vocab_get_add_bos(vocab);
|
|
1864
1916
|
has_eos_token = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
|
|
1865
1917
|
|
|
1866
|
-
if (!params_base.speculative.model.empty() || !params_base.speculative.hf_repo.empty()) {
|
|
1867
|
-
SRV_INF("loading draft model '%s'\n", params_base.speculative.model.c_str());
|
|
1918
|
+
if (!params_base.speculative.model.path.empty() || !params_base.speculative.model.hf_repo.empty()) {
|
|
1919
|
+
SRV_INF("loading draft model '%s'\n", params_base.speculative.model.path.c_str());
|
|
1868
1920
|
|
|
1869
1921
|
auto params_dft = params_base;
|
|
1870
1922
|
|
|
1871
1923
|
params_dft.devices = params_base.speculative.devices;
|
|
1872
|
-
params_dft.hf_file = params_base.speculative.hf_file;
|
|
1873
|
-
params_dft.hf_repo = params_base.speculative.hf_repo;
|
|
1874
1924
|
params_dft.model = params_base.speculative.model;
|
|
1875
|
-
params_dft.model_url = params_base.speculative.model_url;
|
|
1876
1925
|
params_dft.n_ctx = params_base.speculative.n_ctx == 0 ? params_base.n_ctx / params_base.n_parallel : params_base.speculative.n_ctx;
|
|
1877
1926
|
params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
|
|
1878
1927
|
params_dft.n_parallel = 1;
|
|
@@ -1886,12 +1935,12 @@ struct server_context {
|
|
|
1886
1935
|
model_dft = llama_init_dft.model.get();
|
|
1887
1936
|
|
|
1888
1937
|
if (model_dft == nullptr) {
|
|
1889
|
-
SRV_ERR("failed to load draft model, '%s'\n", params_base.speculative.model.c_str());
|
|
1938
|
+
SRV_ERR("failed to load draft model, '%s'\n", params_base.speculative.model.path.c_str());
|
|
1890
1939
|
return false;
|
|
1891
1940
|
}
|
|
1892
1941
|
|
|
1893
1942
|
if (!common_speculative_are_compatible(ctx, llama_init_dft.context.get())) {
|
|
1894
|
-
SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params_base.speculative.model.c_str(), params_base.model.c_str());
|
|
1943
|
+
SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params_base.speculative.model.path.c_str(), params_base.model.path.c_str());
|
|
1895
1944
|
|
|
1896
1945
|
return false;
|
|
1897
1946
|
}
|
|
@@ -1956,7 +2005,7 @@ struct server_context {
|
|
|
1956
2005
|
|
|
1957
2006
|
slot.reset();
|
|
1958
2007
|
|
|
1959
|
-
slots.push_back(slot);
|
|
2008
|
+
slots.push_back(std::move(slot));
|
|
1960
2009
|
}
|
|
1961
2010
|
|
|
1962
2011
|
default_generation_settings_for_props = slots[0].to_json();
|
|
@@ -2057,7 +2106,7 @@ struct server_context {
|
|
|
2057
2106
|
return true;
|
|
2058
2107
|
}
|
|
2059
2108
|
|
|
2060
|
-
bool launch_slot_with_task(server_slot & slot,
|
|
2109
|
+
bool launch_slot_with_task(server_slot & slot, server_task && task) {
|
|
2061
2110
|
slot.reset();
|
|
2062
2111
|
slot.id_task = task.id;
|
|
2063
2112
|
slot.index = task.index;
|
|
@@ -2065,10 +2114,10 @@ struct server_context {
|
|
|
2065
2114
|
slot.params = std::move(task.params);
|
|
2066
2115
|
slot.prompt_tokens = std::move(task.prompt_tokens);
|
|
2067
2116
|
|
|
2068
|
-
if (!are_lora_equal(
|
|
2117
|
+
if (!are_lora_equal(slot.params.lora, slot.lora)) {
|
|
2069
2118
|
// if lora is changed, we cannot reuse cached tokens
|
|
2070
2119
|
slot.cache_tokens.clear();
|
|
2071
|
-
slot.lora =
|
|
2120
|
+
slot.lora = slot.params.lora;
|
|
2072
2121
|
}
|
|
2073
2122
|
|
|
2074
2123
|
bool can_detokenize = can_be_detokenized(ctx, slot.prompt_tokens);
|
|
@@ -2499,10 +2548,10 @@ struct server_context {
|
|
|
2499
2548
|
server_task task(SERVER_TASK_TYPE_CANCEL);
|
|
2500
2549
|
task.id_target = id_task;
|
|
2501
2550
|
queue_results.remove_waiting_task_id(id_task);
|
|
2502
|
-
cancel_tasks.push_back(task);
|
|
2551
|
+
cancel_tasks.push_back(std::move(task));
|
|
2503
2552
|
}
|
|
2504
2553
|
// push to beginning of the queue, so it has highest priority
|
|
2505
|
-
queue_tasks.post(cancel_tasks, true);
|
|
2554
|
+
queue_tasks.post(std::move(cancel_tasks), true);
|
|
2506
2555
|
}
|
|
2507
2556
|
|
|
2508
2557
|
// receive the results from task(s)
|
|
@@ -2589,7 +2638,7 @@ struct server_context {
|
|
|
2589
2638
|
// Functions to process the task
|
|
2590
2639
|
//
|
|
2591
2640
|
|
|
2592
|
-
void process_single_task(server_task task) {
|
|
2641
|
+
void process_single_task(server_task && task) {
|
|
2593
2642
|
switch (task.type) {
|
|
2594
2643
|
case SERVER_TASK_TYPE_COMPLETION:
|
|
2595
2644
|
case SERVER_TASK_TYPE_INFILL:
|
|
@@ -2603,17 +2652,17 @@ struct server_context {
|
|
|
2603
2652
|
if (slot == nullptr) {
|
|
2604
2653
|
// if no slot is available, we defer this task for processing later
|
|
2605
2654
|
SRV_DBG("no slot is available, defer task, id_task = %d\n", task.id);
|
|
2606
|
-
queue_tasks.defer(task);
|
|
2655
|
+
queue_tasks.defer(std::move(task));
|
|
2607
2656
|
break;
|
|
2608
2657
|
}
|
|
2609
2658
|
if (slot->is_processing()) {
|
|
2610
2659
|
// if requested slot is unavailable, we defer this task for processing later
|
|
2611
2660
|
SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id);
|
|
2612
|
-
queue_tasks.defer(task);
|
|
2661
|
+
queue_tasks.defer(std::move(task));
|
|
2613
2662
|
break;
|
|
2614
2663
|
}
|
|
2615
2664
|
|
|
2616
|
-
if (!launch_slot_with_task(*slot, task)) {
|
|
2665
|
+
if (!launch_slot_with_task(*slot, std::move(task))) {
|
|
2617
2666
|
SRV_ERR("failed to launch slot with task, id_task = %d\n", task.id);
|
|
2618
2667
|
break;
|
|
2619
2668
|
}
|
|
@@ -2692,7 +2741,7 @@ struct server_context {
|
|
|
2692
2741
|
if (slot->is_processing()) {
|
|
2693
2742
|
// if requested slot is unavailable, we defer this task for processing later
|
|
2694
2743
|
SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id);
|
|
2695
|
-
queue_tasks.defer(task);
|
|
2744
|
+
queue_tasks.defer(std::move(task));
|
|
2696
2745
|
break;
|
|
2697
2746
|
}
|
|
2698
2747
|
|
|
@@ -2728,7 +2777,7 @@ struct server_context {
|
|
|
2728
2777
|
if (slot->is_processing()) {
|
|
2729
2778
|
// if requested slot is unavailable, we defer this task for processing later
|
|
2730
2779
|
SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id);
|
|
2731
|
-
queue_tasks.defer(task);
|
|
2780
|
+
queue_tasks.defer(std::move(task));
|
|
2732
2781
|
break;
|
|
2733
2782
|
}
|
|
2734
2783
|
|
|
@@ -2771,7 +2820,7 @@ struct server_context {
|
|
|
2771
2820
|
if (slot->is_processing()) {
|
|
2772
2821
|
// if requested slot is unavailable, we defer this task for processing later
|
|
2773
2822
|
SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id);
|
|
2774
|
-
queue_tasks.defer(task);
|
|
2823
|
+
queue_tasks.defer(std::move(task));
|
|
2775
2824
|
break;
|
|
2776
2825
|
}
|
|
2777
2826
|
|
|
@@ -2823,7 +2872,7 @@ struct server_context {
|
|
|
2823
2872
|
|
|
2824
2873
|
server_task task(SERVER_TASK_TYPE_NEXT_RESPONSE);
|
|
2825
2874
|
task.id = queue_tasks.get_new_id();
|
|
2826
|
-
queue_tasks.post(task);
|
|
2875
|
+
queue_tasks.post(std::move(task));
|
|
2827
2876
|
}
|
|
2828
2877
|
|
|
2829
2878
|
// apply context-shift if needed
|
|
@@ -3290,6 +3339,9 @@ struct server_context {
|
|
|
3290
3339
|
|
|
3291
3340
|
llama_tokens draft = common_speculative_gen_draft(slot.spec, params_spec, slot.cache_tokens, id);
|
|
3292
3341
|
|
|
3342
|
+
// keep track of total number of tokens generated in the draft
|
|
3343
|
+
slot.n_draft_total += draft.size();
|
|
3344
|
+
|
|
3293
3345
|
// ignore small drafts
|
|
3294
3346
|
if (slot.params.speculative.n_min > (int) draft.size()) {
|
|
3295
3347
|
SLT_DBG(slot, "ignoring small draft: %d < %d\n", (int) draft.size(), slot.params.speculative.n_min);
|
|
@@ -3315,6 +3367,9 @@ struct server_context {
|
|
|
3315
3367
|
slot.n_past += ids.size();
|
|
3316
3368
|
slot.n_decoded += ids.size();
|
|
3317
3369
|
|
|
3370
|
+
// update how many tokens out of draft was accepted
|
|
3371
|
+
slot.n_draft_accepted += ids.size() - 1;
|
|
3372
|
+
|
|
3318
3373
|
slot.cache_tokens.push_back(id);
|
|
3319
3374
|
slot.cache_tokens.insert(slot.cache_tokens.end(), ids.begin(), ids.end() - 1);
|
|
3320
3375
|
|
|
@@ -3579,14 +3634,17 @@ int main(int argc, char ** argv) {
|
|
|
3579
3634
|
}
|
|
3580
3635
|
|
|
3581
3636
|
// request slots data using task queue
|
|
3582
|
-
|
|
3583
|
-
|
|
3584
|
-
|
|
3585
|
-
|
|
3637
|
+
int task_id = ctx_server.queue_tasks.get_new_id();
|
|
3638
|
+
{
|
|
3639
|
+
server_task task(SERVER_TASK_TYPE_METRICS);
|
|
3640
|
+
task.id = task_id;
|
|
3641
|
+
ctx_server.queue_results.add_waiting_task_id(task_id);
|
|
3642
|
+
ctx_server.queue_tasks.post(std::move(task), true); // high-priority task
|
|
3643
|
+
}
|
|
3586
3644
|
|
|
3587
3645
|
// get the result
|
|
3588
|
-
server_task_result_ptr result = ctx_server.queue_results.recv(
|
|
3589
|
-
ctx_server.queue_results.remove_waiting_task_id(
|
|
3646
|
+
server_task_result_ptr result = ctx_server.queue_results.recv(task_id);
|
|
3647
|
+
ctx_server.queue_results.remove_waiting_task_id(task_id);
|
|
3590
3648
|
|
|
3591
3649
|
if (result->is_error()) {
|
|
3592
3650
|
res_error(res, result->to_json());
|
|
@@ -3615,16 +3673,17 @@ int main(int argc, char ** argv) {
|
|
|
3615
3673
|
}
|
|
3616
3674
|
|
|
3617
3675
|
// request slots data using task queue
|
|
3618
|
-
|
|
3619
|
-
|
|
3620
|
-
|
|
3621
|
-
|
|
3622
|
-
|
|
3623
|
-
|
|
3676
|
+
int task_id = ctx_server.queue_tasks.get_new_id();
|
|
3677
|
+
{
|
|
3678
|
+
server_task task(SERVER_TASK_TYPE_METRICS);
|
|
3679
|
+
task.id = task_id;
|
|
3680
|
+
ctx_server.queue_results.add_waiting_task_id(task_id);
|
|
3681
|
+
ctx_server.queue_tasks.post(std::move(task), true); // high-priority task
|
|
3682
|
+
}
|
|
3624
3683
|
|
|
3625
3684
|
// get the result
|
|
3626
|
-
server_task_result_ptr result = ctx_server.queue_results.recv(
|
|
3627
|
-
ctx_server.queue_results.remove_waiting_task_id(
|
|
3685
|
+
server_task_result_ptr result = ctx_server.queue_results.recv(task_id);
|
|
3686
|
+
ctx_server.queue_results.remove_waiting_task_id(task_id);
|
|
3628
3687
|
|
|
3629
3688
|
if (result->is_error()) {
|
|
3630
3689
|
res_error(res, result->to_json());
|
|
@@ -3721,17 +3780,20 @@ int main(int argc, char ** argv) {
|
|
|
3721
3780
|
}
|
|
3722
3781
|
std::string filepath = params.slot_save_path + filename;
|
|
3723
3782
|
|
|
3724
|
-
|
|
3725
|
-
|
|
3726
|
-
|
|
3727
|
-
|
|
3728
|
-
|
|
3783
|
+
int task_id = ctx_server.queue_tasks.get_new_id();
|
|
3784
|
+
{
|
|
3785
|
+
server_task task(SERVER_TASK_TYPE_SLOT_SAVE);
|
|
3786
|
+
task.id = task_id;
|
|
3787
|
+
task.slot_action.slot_id = id_slot;
|
|
3788
|
+
task.slot_action.filename = filename;
|
|
3789
|
+
task.slot_action.filepath = filepath;
|
|
3729
3790
|
|
|
3730
|
-
|
|
3731
|
-
|
|
3791
|
+
ctx_server.queue_results.add_waiting_task_id(task_id);
|
|
3792
|
+
ctx_server.queue_tasks.post(std::move(task));
|
|
3793
|
+
}
|
|
3732
3794
|
|
|
3733
|
-
server_task_result_ptr result = ctx_server.queue_results.recv(
|
|
3734
|
-
ctx_server.queue_results.remove_waiting_task_id(
|
|
3795
|
+
server_task_result_ptr result = ctx_server.queue_results.recv(task_id);
|
|
3796
|
+
ctx_server.queue_results.remove_waiting_task_id(task_id);
|
|
3735
3797
|
|
|
3736
3798
|
if (result->is_error()) {
|
|
3737
3799
|
res_error(res, result->to_json());
|
|
@@ -3750,17 +3812,20 @@ int main(int argc, char ** argv) {
|
|
|
3750
3812
|
}
|
|
3751
3813
|
std::string filepath = params.slot_save_path + filename;
|
|
3752
3814
|
|
|
3753
|
-
|
|
3754
|
-
|
|
3755
|
-
|
|
3756
|
-
|
|
3757
|
-
|
|
3815
|
+
int task_id = ctx_server.queue_tasks.get_new_id();
|
|
3816
|
+
{
|
|
3817
|
+
server_task task(SERVER_TASK_TYPE_SLOT_RESTORE);
|
|
3818
|
+
task.id = task_id;
|
|
3819
|
+
task.slot_action.slot_id = id_slot;
|
|
3820
|
+
task.slot_action.filename = filename;
|
|
3821
|
+
task.slot_action.filepath = filepath;
|
|
3758
3822
|
|
|
3759
|
-
|
|
3760
|
-
|
|
3823
|
+
ctx_server.queue_results.add_waiting_task_id(task_id);
|
|
3824
|
+
ctx_server.queue_tasks.post(std::move(task));
|
|
3825
|
+
}
|
|
3761
3826
|
|
|
3762
|
-
server_task_result_ptr result = ctx_server.queue_results.recv(
|
|
3763
|
-
ctx_server.queue_results.remove_waiting_task_id(
|
|
3827
|
+
server_task_result_ptr result = ctx_server.queue_results.recv(task_id);
|
|
3828
|
+
ctx_server.queue_results.remove_waiting_task_id(task_id);
|
|
3764
3829
|
|
|
3765
3830
|
if (result->is_error()) {
|
|
3766
3831
|
res_error(res, result->to_json());
|
|
@@ -3772,15 +3837,18 @@ int main(int argc, char ** argv) {
|
|
|
3772
3837
|
};
|
|
3773
3838
|
|
|
3774
3839
|
const auto handle_slots_erase = [&ctx_server, &res_error, &res_ok](const httplib::Request & /* req */, httplib::Response & res, int id_slot) {
|
|
3775
|
-
|
|
3776
|
-
|
|
3777
|
-
|
|
3840
|
+
int task_id = ctx_server.queue_tasks.get_new_id();
|
|
3841
|
+
{
|
|
3842
|
+
server_task task(SERVER_TASK_TYPE_SLOT_ERASE);
|
|
3843
|
+
task.id = task_id;
|
|
3844
|
+
task.slot_action.slot_id = id_slot;
|
|
3778
3845
|
|
|
3779
|
-
|
|
3780
|
-
|
|
3846
|
+
ctx_server.queue_results.add_waiting_task_id(task_id);
|
|
3847
|
+
ctx_server.queue_tasks.post(std::move(task));
|
|
3848
|
+
}
|
|
3781
3849
|
|
|
3782
|
-
server_task_result_ptr result = ctx_server.queue_results.recv(
|
|
3783
|
-
ctx_server.queue_results.remove_waiting_task_id(
|
|
3850
|
+
server_task_result_ptr result = ctx_server.queue_results.recv(task_id);
|
|
3851
|
+
ctx_server.queue_results.remove_waiting_task_id(task_id);
|
|
3784
3852
|
|
|
3785
3853
|
if (result->is_error()) {
|
|
3786
3854
|
res_error(res, result->to_json());
|
|
@@ -3825,7 +3893,7 @@ int main(int argc, char ** argv) {
|
|
|
3825
3893
|
json data = {
|
|
3826
3894
|
{ "default_generation_settings", ctx_server.default_generation_settings_for_props },
|
|
3827
3895
|
{ "total_slots", ctx_server.params_base.n_parallel },
|
|
3828
|
-
{ "model_path", ctx_server.params_base.model },
|
|
3896
|
+
{ "model_path", ctx_server.params_base.model.path },
|
|
3829
3897
|
{ "chat_template", common_chat_templates_source(ctx_server.chat_templates.get()) },
|
|
3830
3898
|
{ "bos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_bos(ctx_server.vocab), /* special= */ true)},
|
|
3831
3899
|
{ "eos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_eos(ctx_server.vocab), /* special= */ true)},
|
|
@@ -3853,6 +3921,21 @@ int main(int argc, char ** argv) {
|
|
|
3853
3921
|
res_ok(res, {{ "success", true }});
|
|
3854
3922
|
};
|
|
3855
3923
|
|
|
3924
|
+
const auto handle_api_show = [&ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) {
|
|
3925
|
+
json data = {
|
|
3926
|
+
{
|
|
3927
|
+
"template", common_chat_templates_source(ctx_server.chat_templates.get()),
|
|
3928
|
+
},
|
|
3929
|
+
{
|
|
3930
|
+
"model_info", {
|
|
3931
|
+
{ "llama.context_length", ctx_server.slots.back().n_ctx, },
|
|
3932
|
+
}
|
|
3933
|
+
},
|
|
3934
|
+
};
|
|
3935
|
+
|
|
3936
|
+
res_ok(res, data);
|
|
3937
|
+
};
|
|
3938
|
+
|
|
3856
3939
|
// handle completion-like requests (completion, chat, infill)
|
|
3857
3940
|
// we can optionally provide a custom format for partial results and final results
|
|
3858
3941
|
const auto handle_completions_impl = [&ctx_server, &res_error, &res_ok](
|
|
@@ -3869,9 +3952,10 @@ int main(int argc, char ** argv) {
|
|
|
3869
3952
|
}
|
|
3870
3953
|
|
|
3871
3954
|
auto completion_id = gen_chatcmplid();
|
|
3872
|
-
std::
|
|
3873
|
-
|
|
3955
|
+
std::unordered_set<int> task_ids;
|
|
3874
3956
|
try {
|
|
3957
|
+
std::vector<server_task> tasks;
|
|
3958
|
+
|
|
3875
3959
|
const auto & prompt = data.at("prompt");
|
|
3876
3960
|
// TODO: this log can become very long, put it behind a flag or think about a more compact format
|
|
3877
3961
|
//SRV_DBG("Prompt: %s\n", prompt.is_string() ? prompt.get<std::string>().c_str() : prompt.dump(2).c_str());
|
|
@@ -3886,9 +3970,9 @@ int main(int argc, char ** argv) {
|
|
|
3886
3970
|
|
|
3887
3971
|
task.prompt_tokens = std::move(tokenized_prompts[i]);
|
|
3888
3972
|
task.params = server_task::params_from_json_cmpl(
|
|
3889
|
-
|
|
3890
|
-
|
|
3891
|
-
|
|
3973
|
+
ctx_server.ctx,
|
|
3974
|
+
ctx_server.params_base,
|
|
3975
|
+
data);
|
|
3892
3976
|
task.id_selected_slot = json_value(data, "id_slot", -1);
|
|
3893
3977
|
|
|
3894
3978
|
// OAI-compat
|
|
@@ -3896,18 +3980,18 @@ int main(int argc, char ** argv) {
|
|
|
3896
3980
|
task.params.oaicompat_cmpl_id = completion_id;
|
|
3897
3981
|
// oaicompat_model is already populated by params_from_json_cmpl
|
|
3898
3982
|
|
|
3899
|
-
tasks.push_back(task);
|
|
3983
|
+
tasks.push_back(std::move(task));
|
|
3900
3984
|
}
|
|
3985
|
+
|
|
3986
|
+
task_ids = server_task::get_list_id(tasks);
|
|
3987
|
+
ctx_server.queue_results.add_waiting_tasks(tasks);
|
|
3988
|
+
ctx_server.queue_tasks.post(std::move(tasks));
|
|
3901
3989
|
} catch (const std::exception & e) {
|
|
3902
3990
|
res_error(res, format_error_response(e.what(), ERROR_TYPE_INVALID_REQUEST));
|
|
3903
3991
|
return;
|
|
3904
3992
|
}
|
|
3905
3993
|
|
|
3906
|
-
ctx_server.queue_results.add_waiting_tasks(tasks);
|
|
3907
|
-
ctx_server.queue_tasks.post(tasks);
|
|
3908
|
-
|
|
3909
3994
|
bool stream = json_value(data, "stream", false);
|
|
3910
|
-
const auto task_ids = server_task::get_list_id(tasks);
|
|
3911
3995
|
|
|
3912
3996
|
if (!stream) {
|
|
3913
3997
|
ctx_server.receive_multi_results(task_ids, [&](std::vector<server_task_result_ptr> & results) {
|
|
@@ -4091,7 +4175,7 @@ int main(int argc, char ** argv) {
|
|
|
4091
4175
|
{"object", "list"},
|
|
4092
4176
|
{"data", {
|
|
4093
4177
|
{
|
|
4094
|
-
{"id", params.model_alias.empty() ? params.model : params.model_alias},
|
|
4178
|
+
{"id", params.model_alias.empty() ? params.model.path : params.model_alias},
|
|
4095
4179
|
{"object", "model"},
|
|
4096
4180
|
{"created", std::time(0)},
|
|
4097
4181
|
{"owned_by", "llamacpp"},
|
|
@@ -4199,6 +4283,7 @@ int main(int argc, char ** argv) {
|
|
|
4199
4283
|
// create and queue the task
|
|
4200
4284
|
json responses = json::array();
|
|
4201
4285
|
bool error = false;
|
|
4286
|
+
std::unordered_set<int> task_ids;
|
|
4202
4287
|
{
|
|
4203
4288
|
std::vector<server_task> tasks;
|
|
4204
4289
|
for (size_t i = 0; i < tokenized_prompts.size(); i++) {
|
|
@@ -4211,27 +4296,26 @@ int main(int argc, char ** argv) {
|
|
|
4211
4296
|
// OAI-compat
|
|
4212
4297
|
task.params.oaicompat = oaicompat;
|
|
4213
4298
|
|
|
4214
|
-
tasks.push_back(task);
|
|
4299
|
+
tasks.push_back(std::move(task));
|
|
4215
4300
|
}
|
|
4216
4301
|
|
|
4302
|
+
task_ids = server_task::get_list_id(tasks);
|
|
4217
4303
|
ctx_server.queue_results.add_waiting_tasks(tasks);
|
|
4218
|
-
ctx_server.queue_tasks.post(tasks);
|
|
4304
|
+
ctx_server.queue_tasks.post(std::move(tasks));
|
|
4305
|
+
}
|
|
4219
4306
|
|
|
4220
|
-
|
|
4221
|
-
|
|
4307
|
+
// get the result
|
|
4308
|
+
ctx_server.receive_multi_results(task_ids, [&](std::vector<server_task_result_ptr> & results) {
|
|
4309
|
+
for (auto & res : results) {
|
|
4310
|
+
GGML_ASSERT(dynamic_cast<server_task_result_embd*>(res.get()) != nullptr);
|
|
4311
|
+
responses.push_back(res->to_json());
|
|
4312
|
+
}
|
|
4313
|
+
}, [&](const json & error_data) {
|
|
4314
|
+
res_error(res, error_data);
|
|
4315
|
+
error = true;
|
|
4316
|
+
}, req.is_connection_closed);
|
|
4222
4317
|
|
|
4223
|
-
|
|
4224
|
-
for (auto & res : results) {
|
|
4225
|
-
GGML_ASSERT(dynamic_cast<server_task_result_embd*>(res.get()) != nullptr);
|
|
4226
|
-
responses.push_back(res->to_json());
|
|
4227
|
-
}
|
|
4228
|
-
}, [&](const json & error_data) {
|
|
4229
|
-
res_error(res, error_data);
|
|
4230
|
-
error = true;
|
|
4231
|
-
}, req.is_connection_closed);
|
|
4232
|
-
|
|
4233
|
-
ctx_server.queue_results.remove_waiting_task_ids(task_ids);
|
|
4234
|
-
}
|
|
4318
|
+
ctx_server.queue_results.remove_waiting_task_ids(task_ids);
|
|
4235
4319
|
|
|
4236
4320
|
if (error) {
|
|
4237
4321
|
return;
|
|
@@ -4298,6 +4382,7 @@ int main(int argc, char ** argv) {
|
|
|
4298
4382
|
// create and queue the task
|
|
4299
4383
|
json responses = json::array();
|
|
4300
4384
|
bool error = false;
|
|
4385
|
+
std::unordered_set<int> task_ids;
|
|
4301
4386
|
{
|
|
4302
4387
|
std::vector<server_task> tasks;
|
|
4303
4388
|
std::vector<llama_tokens> tokenized_docs = tokenize_input_prompts(ctx_server.vocab, documents, /* add_special */ false, true);
|
|
@@ -4307,26 +4392,24 @@ int main(int argc, char ** argv) {
|
|
|
4307
4392
|
task.id = ctx_server.queue_tasks.get_new_id();
|
|
4308
4393
|
task.index = i;
|
|
4309
4394
|
task.prompt_tokens = format_rerank(ctx_server.vocab, tokenized_query, tokenized_docs[i]);
|
|
4310
|
-
tasks.push_back(task);
|
|
4395
|
+
tasks.push_back(std::move(task));
|
|
4311
4396
|
}
|
|
4312
4397
|
|
|
4398
|
+
task_ids = server_task::get_list_id(tasks);
|
|
4313
4399
|
ctx_server.queue_results.add_waiting_tasks(tasks);
|
|
4314
|
-
ctx_server.queue_tasks.post(tasks);
|
|
4315
|
-
|
|
4316
|
-
// get the result
|
|
4317
|
-
std::unordered_set<int> task_ids = server_task::get_list_id(tasks);
|
|
4318
|
-
|
|
4319
|
-
ctx_server.receive_multi_results(task_ids, [&](std::vector<server_task_result_ptr> & results) {
|
|
4320
|
-
for (auto & res : results) {
|
|
4321
|
-
GGML_ASSERT(dynamic_cast<server_task_result_rerank*>(res.get()) != nullptr);
|
|
4322
|
-
responses.push_back(res->to_json());
|
|
4323
|
-
}
|
|
4324
|
-
}, [&](const json & error_data) {
|
|
4325
|
-
res_error(res, error_data);
|
|
4326
|
-
error = true;
|
|
4327
|
-
}, req.is_connection_closed);
|
|
4400
|
+
ctx_server.queue_tasks.post(std::move(tasks));
|
|
4328
4401
|
}
|
|
4329
4402
|
|
|
4403
|
+
ctx_server.receive_multi_results(task_ids, [&](std::vector<server_task_result_ptr> & results) {
|
|
4404
|
+
for (auto & res : results) {
|
|
4405
|
+
GGML_ASSERT(dynamic_cast<server_task_result_rerank*>(res.get()) != nullptr);
|
|
4406
|
+
responses.push_back(res->to_json());
|
|
4407
|
+
}
|
|
4408
|
+
}, [&](const json & error_data) {
|
|
4409
|
+
res_error(res, error_data);
|
|
4410
|
+
error = true;
|
|
4411
|
+
}, req.is_connection_closed);
|
|
4412
|
+
|
|
4330
4413
|
if (error) {
|
|
4331
4414
|
return;
|
|
4332
4415
|
}
|
|
@@ -4362,14 +4445,19 @@ int main(int argc, char ** argv) {
|
|
|
4362
4445
|
res_error(res, format_error_response("Request body must be an array", ERROR_TYPE_INVALID_REQUEST));
|
|
4363
4446
|
return;
|
|
4364
4447
|
}
|
|
4365
|
-
server_task task(SERVER_TASK_TYPE_SET_LORA);
|
|
4366
|
-
task.id = ctx_server.queue_tasks.get_new_id();
|
|
4367
|
-
task.set_lora = parse_lora_request(ctx_server.params_base.lora_adapters, body);
|
|
4368
|
-
ctx_server.queue_results.add_waiting_task_id(task.id);
|
|
4369
|
-
ctx_server.queue_tasks.post(task);
|
|
4370
4448
|
|
|
4371
|
-
|
|
4372
|
-
|
|
4449
|
+
int task_id = ctx_server.queue_tasks.get_new_id();
|
|
4450
|
+
{
|
|
4451
|
+
server_task task(SERVER_TASK_TYPE_SET_LORA);
|
|
4452
|
+
task.id = task_id;
|
|
4453
|
+
task.set_lora = parse_lora_request(ctx_server.params_base.lora_adapters, body);
|
|
4454
|
+
ctx_server.queue_results.add_waiting_task_id(task_id);
|
|
4455
|
+
ctx_server.queue_tasks.post(std::move(task));
|
|
4456
|
+
}
|
|
4457
|
+
|
|
4458
|
+
// get the result
|
|
4459
|
+
server_task_result_ptr result = ctx_server.queue_results.recv(task_id);
|
|
4460
|
+
ctx_server.queue_results.remove_waiting_task_id(task_id);
|
|
4373
4461
|
|
|
4374
4462
|
if (result->is_error()) {
|
|
4375
4463
|
res_error(res, result->to_json());
|
|
@@ -4417,6 +4505,7 @@ int main(int argc, char ** argv) {
|
|
|
4417
4505
|
svr->Get ("/metrics", handle_metrics);
|
|
4418
4506
|
svr->Get ("/props", handle_props);
|
|
4419
4507
|
svr->Post("/props", handle_props_change);
|
|
4508
|
+
svr->Post("/api/show", handle_api_show);
|
|
4420
4509
|
svr->Get ("/models", handle_models); // public endpoint (no API key check)
|
|
4421
4510
|
svr->Get ("/v1/models", handle_models); // public endpoint (no API key check)
|
|
4422
4511
|
svr->Post("/completion", handle_completions); // legacy
|
|
@@ -4453,21 +4542,31 @@ int main(int argc, char ** argv) {
|
|
|
4453
4542
|
svr->new_task_queue = [&params] { return new httplib::ThreadPool(params.n_threads_http); };
|
|
4454
4543
|
|
|
4455
4544
|
// clean up function, to be called before exit
|
|
4456
|
-
auto clean_up = [&svr]() {
|
|
4545
|
+
auto clean_up = [&svr, &ctx_server]() {
|
|
4457
4546
|
SRV_INF("%s: cleaning up before exit...\n", __func__);
|
|
4458
4547
|
svr->stop();
|
|
4548
|
+
ctx_server.queue_results.terminate();
|
|
4459
4549
|
llama_backend_free();
|
|
4460
4550
|
};
|
|
4461
4551
|
|
|
4462
|
-
// bind HTTP listen port
|
|
4463
4552
|
bool was_bound = false;
|
|
4464
|
-
if (params.
|
|
4465
|
-
|
|
4466
|
-
|
|
4467
|
-
|
|
4468
|
-
|
|
4553
|
+
if (string_ends_with(std::string(params.hostname), ".sock")) {
|
|
4554
|
+
LOG_INF("%s: setting address family to AF_UNIX\n", __func__);
|
|
4555
|
+
svr->set_address_family(AF_UNIX);
|
|
4556
|
+
// bind_to_port requires a second arg, any value other than 0 should
|
|
4557
|
+
// simply get ignored
|
|
4558
|
+
was_bound = svr->bind_to_port(params.hostname, 8080);
|
|
4469
4559
|
} else {
|
|
4470
|
-
|
|
4560
|
+
LOG_INF("%s: binding port with default address family\n", __func__);
|
|
4561
|
+
// bind HTTP listen port
|
|
4562
|
+
if (params.port == 0) {
|
|
4563
|
+
int bound_port = svr->bind_to_any_port(params.hostname);
|
|
4564
|
+
if ((was_bound = (bound_port >= 0))) {
|
|
4565
|
+
params.port = bound_port;
|
|
4566
|
+
}
|
|
4567
|
+
} else {
|
|
4568
|
+
was_bound = svr->bind_to_port(params.hostname, params.port);
|
|
4569
|
+
}
|
|
4471
4570
|
}
|
|
4472
4571
|
|
|
4473
4572
|
if (!was_bound) {
|
|
@@ -4487,7 +4586,7 @@ int main(int argc, char ** argv) {
|
|
|
4487
4586
|
|
|
4488
4587
|
if (!ctx_server.load_model(params)) {
|
|
4489
4588
|
clean_up();
|
|
4490
|
-
|
|
4589
|
+
t.join();
|
|
4491
4590
|
LOG_ERR("%s: exiting due to model loading error\n", __func__);
|
|
4492
4591
|
return 1;
|
|
4493
4592
|
}
|
|
@@ -4502,8 +4601,8 @@ int main(int argc, char ** argv) {
|
|
|
4502
4601
|
common_chat_templates_source(ctx_server.chat_templates.get()),
|
|
4503
4602
|
common_chat_format_example(ctx_server.chat_templates.get(), ctx_server.params_base.use_jinja).c_str());
|
|
4504
4603
|
|
|
4505
|
-
ctx_server.queue_tasks.on_new_task([&ctx_server](
|
|
4506
|
-
ctx_server.process_single_task(task);
|
|
4604
|
+
ctx_server.queue_tasks.on_new_task([&ctx_server](server_task && task) {
|
|
4605
|
+
ctx_server.process_single_task(std::move(task));
|
|
4507
4606
|
});
|
|
4508
4607
|
|
|
4509
4608
|
ctx_server.queue_tasks.on_update_slots([&ctx_server]() {
|
|
@@ -4535,7 +4634,7 @@ int main(int argc, char ** argv) {
|
|
|
4535
4634
|
ctx_server.queue_tasks.start_loop();
|
|
4536
4635
|
|
|
4537
4636
|
clean_up();
|
|
4538
|
-
|
|
4637
|
+
t.join();
|
|
4539
4638
|
|
|
4540
4639
|
return 0;
|
|
4541
4640
|
}
|