@fugood/llama.node 0.3.17 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +3 -1
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +39 -2
- package/lib/index.js +132 -1
- package/lib/index.ts +203 -3
- package/package.json +2 -1
- package/src/EmbeddingWorker.cpp +1 -1
- package/src/LlamaCompletionWorker.cpp +366 -19
- package/src/LlamaCompletionWorker.h +30 -10
- package/src/LlamaContext.cpp +213 -5
- package/src/LlamaContext.h +12 -0
- package/src/common.hpp +15 -0
- package/src/llama.cpp/.github/workflows/build-linux-cross.yml +133 -24
- package/src/llama.cpp/.github/workflows/build.yml +41 -762
- package/src/llama.cpp/.github/workflows/docker.yml +5 -2
- package/src/llama.cpp/.github/workflows/release.yml +716 -0
- package/src/llama.cpp/.github/workflows/server.yml +12 -12
- package/src/llama.cpp/CMakeLists.txt +5 -17
- package/src/llama.cpp/cmake/build-info.cmake +8 -2
- package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
- package/src/llama.cpp/common/CMakeLists.txt +31 -3
- package/src/llama.cpp/common/arg.cpp +48 -29
- package/src/llama.cpp/common/chat.cpp +128 -106
- package/src/llama.cpp/common/chat.h +2 -0
- package/src/llama.cpp/common/common.cpp +37 -1
- package/src/llama.cpp/common/common.h +18 -9
- package/src/llama.cpp/common/llguidance.cpp +1 -0
- package/src/llama.cpp/common/minja/chat-template.hpp +9 -5
- package/src/llama.cpp/common/minja/minja.hpp +69 -36
- package/src/llama.cpp/common/regex-partial.cpp +204 -0
- package/src/llama.cpp/common/regex-partial.h +56 -0
- package/src/llama.cpp/common/sampling.cpp +57 -50
- package/src/llama.cpp/examples/CMakeLists.txt +2 -23
- package/src/llama.cpp/examples/embedding/embedding.cpp +2 -11
- package/src/llama.cpp/examples/parallel/parallel.cpp +86 -14
- package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/training/finetune.cpp +96 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +27 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
- package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
- package/src/llama.cpp/ggml/include/ggml.h +10 -7
- package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -1
- package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +20 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +0 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +306 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +4 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +29 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +88 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +264 -69
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +501 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +0 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +0 -6
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +36 -11
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +0 -2
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
- package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +41 -27
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +29 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +9 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +121 -232
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +7 -15
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +0 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +338 -166
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
- package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +81 -70
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +657 -193
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +20 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +123 -29
- package/src/llama.cpp/ggml/src/ggml.c +29 -20
- package/src/llama.cpp/ggml/src/gguf.cpp +33 -33
- package/src/llama.cpp/include/llama.h +52 -11
- package/src/llama.cpp/requirements/requirements-all.txt +3 -3
- package/src/llama.cpp/scripts/xxd.cmake +1 -1
- package/src/llama.cpp/src/CMakeLists.txt +1 -0
- package/src/llama.cpp/src/llama-adapter.cpp +6 -0
- package/src/llama.cpp/src/llama-arch.cpp +3 -0
- package/src/llama.cpp/src/llama-batch.cpp +5 -1
- package/src/llama.cpp/src/llama-batch.h +2 -1
- package/src/llama.cpp/src/llama-chat.cpp +17 -7
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +389 -501
- package/src/llama.cpp/src/llama-context.h +44 -32
- package/src/llama.cpp/src/llama-cparams.h +1 -0
- package/src/llama.cpp/src/llama-graph.cpp +20 -38
- package/src/llama.cpp/src/llama-graph.h +12 -8
- package/src/llama.cpp/src/llama-kv-cache.cpp +1503 -389
- package/src/llama.cpp/src/llama-kv-cache.h +271 -85
- package/src/llama.cpp/src/llama-memory.h +11 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +24 -15
- package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
- package/src/llama.cpp/src/llama-model-saver.h +37 -0
- package/src/llama.cpp/src/llama-model.cpp +316 -69
- package/src/llama.cpp/src/llama-model.h +8 -1
- package/src/llama.cpp/src/llama-quant.cpp +15 -13
- package/src/llama.cpp/src/llama-sampling.cpp +18 -6
- package/src/llama.cpp/src/llama-vocab.cpp +42 -4
- package/src/llama.cpp/src/llama-vocab.h +6 -0
- package/src/llama.cpp/src/llama.cpp +14 -0
- package/src/llama.cpp/tests/CMakeLists.txt +10 -2
- package/src/llama.cpp/tests/test-backend-ops.cpp +107 -47
- package/src/llama.cpp/tests/test-chat-template.cpp +10 -11
- package/src/llama.cpp/tests/test-chat.cpp +3 -1
- package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
- package/src/llama.cpp/tests/test-opt.cpp +33 -21
- package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
- package/src/llama.cpp/tests/test-sampling.cpp +1 -1
- package/src/llama.cpp/tools/CMakeLists.txt +39 -0
- package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +2 -2
- package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
- package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +495 -348
- package/src/llama.cpp/{examples → tools}/main/main.cpp +6 -9
- package/src/llama.cpp/{examples/llava → tools/mtmd}/CMakeLists.txt +1 -35
- package/src/llama.cpp/{examples/llava → tools/mtmd}/clip-impl.h +25 -5
- package/src/llama.cpp/{examples/llava → tools/mtmd}/clip.cpp +1440 -1349
- package/src/llama.cpp/tools/mtmd/clip.h +99 -0
- package/src/llama.cpp/{examples/llava → tools/mtmd}/mtmd-cli.cpp +70 -44
- package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
- package/src/llama.cpp/{examples/llava → tools/mtmd}/mtmd.cpp +251 -281
- package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
- package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +4 -2
- package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +13 -76
- package/src/llama.cpp/{examples → tools}/rpc/rpc-server.cpp +70 -74
- package/src/llama.cpp/{examples → tools}/run/run.cpp +18 -4
- package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
- package/src/llama.cpp/{examples → tools}/server/server.cpp +291 -76
- package/src/llama.cpp/{examples → tools}/server/utils.hpp +377 -5
- package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
- package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/infill/infill.cpp +0 -590
- package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
- package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
- package/src/llama.cpp/examples/llava/clip.h +0 -135
- package/src/llama.cpp/examples/llava/llava.cpp +0 -586
- package/src/llama.cpp/examples/llava/llava.h +0 -49
- package/src/llama.cpp/examples/llava/mtmd.h +0 -168
- package/src/llama.cpp/examples/llava/qwen2vl-test.cpp +0 -636
- /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples/llava → tools/mtmd}/deprecation-warning.cpp +0 -0
- /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/rpc/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/run/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
- /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/server/httplib.h +0 -0
- /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/tts/tts.cpp +0 -0
package/src/llama.cpp/{examples → tools}/server/server.cpp

@@ -7,6 +7,7 @@
 #include "log.h"
 #include "sampling.h"
 #include "speculative.h"
+#include "mtmd.h"
 
 // Change JSON_ASSERT from assert() to GGML_ASSERT:
 #define JSON_ASSERT GGML_ASSERT
@@ -146,6 +147,7 @@ struct slot_params {
             {"top_k", sampling.top_k},
             {"top_p", sampling.top_p},
             {"min_p", sampling.min_p},
+            {"top_n_sigma", sampling.top_n_sigma},
             {"xtc_probability", sampling.xtc_probability},
             {"xtc_threshold", sampling.xtc_threshold},
             {"typical_p", sampling.typ_p},
@@ -196,8 +198,8 @@ struct server_task {
     int id_target = -1;
 
     // used by SERVER_TASK_TYPE_INFERENCE
-    slot_params  params;
-    llama_tokens prompt_tokens;
+    slot_params   params;
+    server_tokens prompt_tokens;
     int id_selected_slot = -1;
 
     // used by SERVER_TASK_TYPE_SLOT_SAVE, SERVER_TASK_TYPE_SLOT_RESTORE, SERVER_TASK_TYPE_SLOT_ERASE
@@ -248,6 +250,7 @@ struct server_task {
         params.sampling.top_k = json_value(data, "top_k", defaults.sampling.top_k);
         params.sampling.top_p = json_value(data, "top_p", defaults.sampling.top_p);
         params.sampling.min_p = json_value(data, "min_p", defaults.sampling.min_p);
+        params.sampling.top_n_sigma = json_value(data, "top_n_sigma", defaults.sampling.top_n_sigma);
         params.sampling.xtc_probability = json_value(data, "xtc_probability", defaults.sampling.xtc_probability);
         params.sampling.xtc_threshold = json_value(data, "xtc_threshold", defaults.sampling.xtc_threshold);
         params.sampling.typ_p = json_value(data, "typical_p", defaults.sampling.typ_p);
@@ -1246,6 +1249,9 @@ struct server_slot {
     llama_context * ctx = nullptr;
     llama_context * ctx_dft = nullptr;
 
+    // multimodal
+    mtmd_context * mctx = nullptr;
+
     common_speculative * spec = nullptr;
 
     std::vector<common_adapter_lora_info> lora;
@@ -1273,14 +1279,14 @@ struct server_slot {
     int32_t n_prompt_tokens_processed = 0;
 
     // input prompt tokens
-    llama_tokens prompt_tokens;
+    server_tokens prompt_tokens;
 
     size_t last_nl_pos = 0;
 
     std::string generated_text;
     llama_tokens generated_tokens;
 
-    llama_tokens cache_tokens;
+    server_tokens cache_tokens;
 
     std::vector<completion_token_output> generated_token_probs;
 
@@ -1423,7 +1429,7 @@ struct server_slot {
                 pos = text.find(word, from_pos);
             } else {
                 // otherwise, partial stop
-                pos = find_partial_stop_string(word, text);
+                pos = string_find_partial_stop(text, word);
             }
 
             if (pos != std::string::npos && (stop_pos == std::string::npos || pos < stop_pos)) {
@@ -1474,7 +1480,7 @@ struct server_slot {
             {"is_processing", is_processing()},
            {"non_causal", is_non_causal()},
             {"params", params.to_json()},
-            {"prompt", common_detokenize(ctx, prompt_tokens)},
+            {"prompt", prompt_tokens.detokenize(ctx, true)},
             {"next_token",
                 {
                     {"has_next_token", has_next_token},
@@ -1847,13 +1853,16 @@ struct server_context {
     llama_model * model = nullptr;
    llama_context * ctx = nullptr;
 
+    // multimodal
+    mtmd_context * mctx = nullptr;
+
     const llama_vocab * vocab = nullptr;
 
     llama_model * model_dft = nullptr;
 
     llama_context_params cparams_dft;
 
-    llama_batch batch;
+    llama_batch batch {};
 
     bool clean_kv_cache = true;
     bool add_bos_token = true;
@@ -1876,6 +1885,8 @@ struct server_context {
     common_chat_templates_ptr chat_templates;
 
     ~server_context() {
+        mtmd_free(mctx);
+
         // Clear any sampling context
         for (server_slot & slot : slots) {
             common_sampler_free(slot.smpl);
@@ -1963,6 +1974,36 @@ struct server_context {
             chat_templates = common_chat_templates_init(model, "chatml");
         }
 
+        std::string & mmproj_path = params_base.mmproj.path;
+        if (!mmproj_path.empty()) {
+            mtmd_context_params mparams = mtmd_context_params_default();
+            mparams.use_gpu = params_base.mmproj_use_gpu;
+            mparams.print_timings = false;
+            mparams.n_threads = params_base.cpuparams.n_threads;
+            mparams.verbosity = params_base.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO;
+            mctx = mtmd_init_from_file(mmproj_path.c_str(), model, mparams);
+            if (mctx == nullptr) {
+                SRV_ERR("failed to load multimodal model, '%s'\n", mmproj_path.c_str());
+                return false;
+            }
+            SRV_INF("loaded multimodal model, '%s'\n", mmproj_path.c_str());
+
+            if (params_base.ctx_shift) {
+                params_base.ctx_shift = false;
+                SRV_WRN("%s\n", "ctx_shift is not supported by multimodal, it will be disabled");
+            }
+
+            if (params_base.n_cache_reuse) {
+                params_base.n_cache_reuse = 0;
+                SRV_WRN("%s\n", "cache_reuse is not supported by multimodal, it will be disabled");
+            }
+
+            if (!params_base.speculative.model.path.empty()) {
+                SRV_ERR("%s\n", "err: speculative decode is not supported by multimodal");
+                return false;
+            }
+        }
+
         return true;
     }
 
@@ -1978,6 +2019,8 @@ struct server_context {
             slot.ctx = ctx;
             slot.n_ctx = n_ctx_slot;
             slot.n_predict = params_base.n_predict;
+            slot.mctx = mctx;
+            slot.cache_tokens.has_mtmd = mctx != nullptr;
 
             if (model_dft) {
                 slot.batch_spec = llama_batch_init(params_base.speculative.n_max + 1, 0, 1);
@@ -2014,8 +2057,6 @@ struct server_context {
         // note that n_batch can be > n_ctx (e.g. for non-causal attention models such as BERT where the KV cache is not used)
         {
             const int32_t n_batch = llama_n_batch(ctx);
-
-            // only a single seq_id per token is needed
             batch = llama_batch_init(std::max(n_batch, params_base.n_parallel), 0, 1);
         }
 
@@ -2052,7 +2093,7 @@ struct server_context {
                 }
 
                 // length of the Longest Common Subsequence between the current slot's prompt and the input prompt
-                int cur_lcs_len = common_lcs(slot.cache_tokens, task.prompt_tokens);
+                int cur_lcs_len = slot.cache_tokens.get_common_prefix(task.prompt_tokens);
 
                 // fraction of the common subsequence length compared to the current slot's prompt length
                 float cur_similarity = static_cast<float>(cur_lcs_len) / static_cast<int>(slot.cache_tokens.size());
@@ -2094,18 +2135,6 @@ struct server_context {
         return ret;
     }
 
-    bool can_be_detokenized(const struct llama_context * ctx, const std::vector<llama_token> & tokens) {
-        const llama_model * model = llama_get_model(ctx);
-        const llama_vocab * vocab = llama_model_get_vocab(model);
-        const int32_t n_vocab = llama_vocab_n_tokens(vocab);
-        for (const auto & token : tokens) {
-            if (token < 0 || token >= n_vocab) {
-                return false;
-            }
-        }
-        return true;
-    }
-
     bool launch_slot_with_task(server_slot & slot, server_task && task) {
         slot.reset();
         slot.id_task = task.id;
@@ -2120,8 +2149,7 @@ struct server_context {
             slot.lora = slot.params.lora;
         }
 
-        bool can_detokenize = can_be_detokenized(ctx, slot.prompt_tokens);
-        if (!can_detokenize) {
+        if (!slot.prompt_tokens.validate(ctx)) {
             send_error(task, "Prompt contains invalid tokens", ERROR_TYPE_INVALID_REQUEST);
             return false;
         }
@@ -2223,6 +2251,14 @@ struct server_context {
             slot.has_next_token = true;
         }
 
+        // if context shifting is disabled, make sure that we don't run out of context
+        if (!params_base.ctx_shift && slot.n_past + 1 >= slot.n_ctx) {
+            slot.stop = STOP_TYPE_LIMIT;
+            slot.has_next_token = false;
+
+            SLT_DBG(slot, "stopped due to running out of context, n_past = %d, n_ctx = %d\n", slot.n_past, slot.n_ctx);
+        }
+
         // check the limits
         if (slot.n_decoded > 0 && slot.has_next_token && !slot.has_budget(params_base)) {
             slot.stop = STOP_TYPE_LIMIT;
@@ -2383,6 +2419,15 @@ struct server_context {
         queue_results.send(std::move(res));
     }
 
+    // if multimodal is enabled, send an error and return false
+    bool ensure_no_mtmd(const int id_task) {
+        if (mctx) {
+            send_error(id_task, "This feature is not supported by multimodal", ERROR_TYPE_NOT_SUPPORTED);
+            return false;
+        }
+        return true;
+    }
+
     void send_partial_response(server_slot & slot, const completion_token_output & tkn) {
         auto res = std::make_unique<server_task_result_cmpl_partial>();
 
@@ -2422,7 +2467,7 @@ struct server_context {
         res->content = std::move(slot.generated_text);
         res->tokens = std::move(slot.generated_tokens);
         res->timings = slot.get_timings();
-        res->prompt = common_detokenize(ctx, slot.prompt_tokens, true);
+        res->prompt = slot.prompt_tokens.detokenize(ctx, true);
         res->response_fields = std::move(slot.params.response_fields);
 
         res->truncated = slot.truncated;
@@ -2732,6 +2777,10 @@ struct server_context {
                 } break;
             case SERVER_TASK_TYPE_SLOT_SAVE:
                 {
+                    if (!ensure_no_mtmd(task.id)) {
+                        break;
+                    }
+
                     int id_slot = task.slot_action.slot_id;
                     server_slot * slot = get_slot_by_id(id_slot);
                     if (slot == nullptr) {
@@ -2751,7 +2800,8 @@ struct server_context {
                     std::string filename = task.slot_action.filename;
                    std::string filepath = task.slot_action.filepath;
 
-                    const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id, slot->cache_tokens.data(), token_count);
+                    const llama_tokens & tokens = slot->cache_tokens.get_text_tokens();
+                    const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id, tokens.data(), token_count);
 
                     const int64_t t_end = ggml_time_us();
                     const double t_save_ms = (t_end - t_start) / 1000.0;
@@ -2768,6 +2818,7 @@ struct server_context {
                 } break;
            case SERVER_TASK_TYPE_SLOT_RESTORE:
                 {
+                    if (!ensure_no_mtmd(task.id)) break;
                     int id_slot = task.slot_action.slot_id;
                     server_slot * slot = get_slot_by_id(id_slot);
                     if (slot == nullptr) {
@@ -2786,15 +2837,18 @@ struct server_context {
                     std::string filename = task.slot_action.filename;
                     std::string filepath = task.slot_action.filepath;
 
-                    slot->cache_tokens.resize(slot->n_ctx);
+                    llama_tokens tokens;
+                    tokens.resize(slot->n_ctx);
                     size_t token_count = 0;
-                    size_t nread = llama_state_seq_load_file(ctx, filepath.c_str(), slot->id, slot->cache_tokens.data(), slot->cache_tokens.size(), &token_count);
+                    size_t nread = llama_state_seq_load_file(ctx, filepath.c_str(), slot->id, tokens.data(), tokens.size(), &token_count);
                     if (nread == 0) {
-                        slot->cache_tokens.resize(0);
+                        slot->cache_tokens.clear(); // KV may already been invalidated?
                         send_error(task, "Unable to restore slot, no available space in KV cache or invalid slot save file", ERROR_TYPE_INVALID_REQUEST);
                         break;
                     }
-                    slot->cache_tokens.resize(token_count);
+                    tokens.resize(token_count);
+                    slot->cache_tokens.clear();
+                    slot->cache_tokens.insert(tokens);
 
                     const int64_t t_end = ggml_time_us();
                     const double t_restore_ms = (t_end - t_start) / 1000.0;
@@ -2811,6 +2865,7 @@ struct server_context {
                 } break;
             case SERVER_TASK_TYPE_SLOT_ERASE:
                 {
+                    if (!ensure_no_mtmd(task.id)) break;
                     int id_slot = task.slot_action.slot_id;
                     server_slot * slot = get_slot_by_id(id_slot);
                     if (slot == nullptr) {
@@ -2842,6 +2897,7 @@ struct server_context {
                     res->id = task.id;
                     queue_results.send(std::move(res));
                 } break;
+
         }
     }
 
@@ -2887,6 +2943,12 @@ struct server_context {
                     continue;
                 }
 
+                if (mctx) {
+                    // we should never reach this because params_base.ctx_shift is automatically disabled if mmproj is loaded
+                    // we don't support ctx_shift because an image chunk may contains multiple tokens
+                    GGML_ABORT("not supported by multimodal");
+                }
+
                 // Shift context
                 const int n_keep = slot.params.n_keep + add_bos_token;
                 const int n_left = slot.n_past - n_keep;
@@ -2897,12 +2959,16 @@ struct server_context {
                 llama_kv_self_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard);
                 llama_kv_self_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard);
 
-                if (slot.params.cache_prompt) {
-                    for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) {
-                        slot.cache_tokens[i - n_discard] = slot.cache_tokens[i];
+                // add generated tokens to cache
+                {
+                    llama_tokens new_tokens = slot.cache_tokens.get_text_tokens(); // copy
+                    for (size_t i = n_keep + n_discard; i < new_tokens.size(); i++) {
+                        new_tokens[i - n_discard] = new_tokens[i];
                     }
 
-                    slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard);
+                    new_tokens.resize(slot.cache_tokens.size() - n_discard);
+                    slot.cache_tokens.clear();
+                    slot.cache_tokens.insert(new_tokens);
                 }
 
                 slot.n_past -= n_discard;
@@ -2939,10 +3005,7 @@ struct server_context {
                 common_batch_add(batch, slot.sampled, slot.n_past, { slot.id }, true);
 
                 slot.n_past += 1;
-
-                if (slot.params.cache_prompt) {
-                    slot.cache_tokens.push_back(slot.sampled);
-                }
+                slot.cache_tokens.push_back(slot.sampled);
 
                 SLT_DBG(slot, "slot decode token, n_ctx = %d, n_past = %d, n_cache_tokens = %d, truncated = %d\n",
                     slot.n_ctx, slot.n_past, (int) slot.cache_tokens.size(), slot.truncated);
@@ -2980,7 +3043,7 @@ struct server_context {
                     SLT_INF(slot, "new prompt, n_ctx_slot = %d, n_keep = %d, n_prompt_tokens = %d\n", slot.n_ctx, slot.params.n_keep, slot.n_prompt_tokens);
 
                     // print prompt tokens (for debugging)
-                    if (1) {
+                    /*if (1) {
                         // first 16 tokens (avoid flooding logs)
                         for (int i = 0; i < std::min<int>(16, prompt_tokens.size()); i++) {
                             SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str());
@@ -2990,7 +3053,7 @@ struct server_context {
                         for (int i = 0; i < (int) prompt_tokens.size(); i++) {
                             SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str());
                         }
-                    }
+                    }*/
 
                     // empty prompt passed -> release the slot and send empty response
                     if (prompt_tokens.empty()) {
@@ -3032,21 +3095,27 @@ struct server_context {
 
                         // if input prompt is too big, truncate it
                         if (slot.n_prompt_tokens >= slot.n_ctx) {
+                            if (mctx) {
+                                // we should never reach this
+                                GGML_ABORT("not supported by multimodal");
+                            }
                            const int n_left = slot.n_ctx - slot.params.n_keep;
 
                             const int n_block_size = n_left / 2;
                             const int erased_blocks = (slot.n_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size;
 
+                            const llama_tokens & curr_tokens = slot.prompt_tokens.get_text_tokens();
                             llama_tokens new_tokens(
-                                prompt_tokens.begin(),
-                                prompt_tokens.begin() + slot.params.n_keep);
+                                curr_tokens.begin(),
+                                curr_tokens.begin() + slot.params.n_keep);
 
                             new_tokens.insert(
                                 new_tokens.end(),
-                                prompt_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size,
-                                prompt_tokens.end());
+                                curr_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size,
+                                curr_tokens.end());
 
-                            prompt_tokens = std::move(new_tokens);
+                            prompt_tokens.clear();
+                            prompt_tokens.insert(new_tokens);
 
                             slot.truncated = true;
                             slot.n_prompt_tokens = prompt_tokens.size();
@@ -3058,13 +3127,18 @@ struct server_context {
 
                         if (slot.params.cache_prompt) {
                             // reuse any previously computed tokens that are common with the new prompt
-                            slot.n_past = common_lcp(slot.cache_tokens, prompt_tokens);
+                            slot.n_past = slot.cache_tokens.get_common_prefix(prompt_tokens);
 
                             // reuse chunks from the cached prompt by shifting their KV cache in the new position
                             if (params_base.n_cache_reuse > 0) {
                                 size_t head_c = slot.n_past; // cache
                                 size_t head_p = slot.n_past; // current prompt
 
+                                if (mctx) {
+                                    // we should never reach this
+                                    GGML_ABORT("not supported by multimodal");
+                                }
+
                                 SLT_DBG(slot, "trying to reuse chunks with size > %d, slot.n_past = %d\n", params_base.n_cache_reuse, slot.n_past);
 
                                 while (head_c < slot.cache_tokens.size() &&
@@ -3090,7 +3164,7 @@ struct server_context {
                                         llama_kv_self_seq_add(ctx, slot.id, head_c, head_c + n_match, kv_shift);
 
                                         for (size_t i = 0; i < n_match; i++) {
-                                            slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i];
+                                            slot.cache_tokens.set_token(head_p + i, slot.cache_tokens[head_c + i]);
                                             slot.n_past++;
                                         }
 
@@ -3103,6 +3177,11 @@ struct server_context {
 
                                 SLT_DBG(slot, "after context reuse, new slot.n_past = %d\n", slot.n_past);
                             }
+                        } else {
+                            // if we don't cache the prompt, we have to remove the entire KV cache
+                            llama_kv_self_seq_rm(ctx, slot.id, 0, -1);
+                            slot.n_past = 0;
+                            slot.cache_tokens.clear();
                         }
                     }
 
@@ -3136,23 +3215,53 @@ struct server_context {
                     SLT_INF(slot, "kv cache rm [%d, end)\n", slot.n_past);
 
                     // remove the non-common part from the cache
-                    slot.cache_tokens.resize(slot.n_past);
+                    slot.cache_tokens.keep_first(slot.n_past);
+
+                    // check if we should process the image
+                    if (slot.n_past < slot.n_prompt_tokens
+                            && slot.prompt_tokens[slot.n_past] == LLAMA_TOKEN_NULL) {
+                        // process the image
+                        int32_t new_n_past;
+                        int32_t res = slot.prompt_tokens.process_chunk(ctx, mctx, slot.n_past, slot.id, new_n_past);
+                        int32_t n_pos = new_n_past - slot.n_past;
+
+                        if (res != 0) {
+                            SLT_ERR(slot, "failed to process image, res = %d\n", res);
+                            slot.release();
+                            send_error(slot, "failed to process image", ERROR_TYPE_SERVER);
+                            continue;
+                        }
+
+                        // add the image chunk to cache
+                        {
+                            const auto & chunk = slot.prompt_tokens.find_chunk(slot.n_past);
+                            slot.cache_tokens.push_back(chunk.get()); // copy
+                        }
+
+                        slot.n_past += n_pos;
+                        slot.n_prompt_tokens_processed += n_pos;
+                    }
 
                     // add prompt tokens for processing in the current batch
                     while (slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch) {
+                        // get next token to process
+                        llama_token cur_tok = slot.prompt_tokens[slot.n_past];
+                        if (cur_tok == LLAMA_TOKEN_NULL) {
+                            break; // end of text chunk
+                        }
+
                        // without pooling, we want to output the embeddings for all the tokens in the batch
                         const bool need_embd = slot.task_type == SERVER_TASK_TYPE_EMBEDDING && llama_pooling_type(slot.ctx) == LLAMA_POOLING_TYPE_NONE;
 
-                        common_batch_add(batch, prompt_tokens[slot.n_past], slot.n_past, { slot.id }, need_embd);
-
-                        if (slot.params.cache_prompt) {
-                            slot.cache_tokens.push_back(prompt_tokens[slot.n_past]);
-                        }
+                        common_batch_add(batch, cur_tok, slot.n_past, { slot.id }, need_embd);
+                        slot.cache_tokens.push_back(cur_tok);
 
                         slot.n_prompt_tokens_processed++;
                         slot.n_past++;
                     }
 
+                    // SLT_INF(slot, "new cache_tokens: %s\n", slot.cache_tokens.str().c_str());
+
                    SLT_INF(slot, "prompt processing progress, n_past = %d, n_tokens = %d, progress = %f\n", slot.n_past, batch.n_tokens, (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens);
 
                     // entire prompt has been processed
@@ -3160,12 +3269,16 @@ struct server_context {
                         slot.state = SLOT_STATE_DONE_PROMPT;
 
                         GGML_ASSERT(batch.n_tokens > 0);
+                        GGML_ASSERT((size_t) slot.n_prompt_tokens == slot.prompt_tokens.size());
 
                         common_sampler_reset(slot.smpl);
 
                         // Process all prompt tokens through sampler system
                         for (int i = 0; i < slot.n_prompt_tokens; ++i) {
-                            common_sampler_accept(slot.smpl, prompt_tokens[i], false);
+                            llama_token id = slot.prompt_tokens[i];
+                            if (id != LLAMA_TOKEN_NULL) {
+                                common_sampler_accept(slot.smpl, id, false);
+                            }
                         }
 
                         // extract the logits only for the last token
@@ -3212,7 +3325,14 @@ struct server_context {
                 batch.logits + i,
             };
 
-            const int ret = llama_decode(ctx, batch_view);
+            int ret = 0;
+
+            if (params_base.embedding || params_base.reranking) {
+                ret = llama_encode(ctx, batch_view);
+            } else {
+                ret = llama_decode(ctx, batch_view);
+            }
+
             metrics.on_decoded(slots);
 
             if (ret != 0) {
@@ -3311,6 +3431,11 @@ struct server_context {
                 continue;
             }
 
+            if (mctx) {
+                // we should never reach this, as speculative is automatically disabled if mmproj is loaded
+                GGML_ABORT("not supported by multimodal");
+            }
+
            // determine the max draft that fits the current slot state
            int n_draft_max = slot.params.speculative.n_max;
 
@@ -3337,7 +3462,8 @@ struct server_context {
             params_spec.n_reuse = llama_n_ctx(slot.ctx_dft) - slot.params.speculative.n_max;
             params_spec.p_min = slot.params.speculative.p_min;
 
-            llama_tokens draft = common_speculative_gen_draft(slot.spec, params_spec, slot.cache_tokens, id);
+            const llama_tokens & cached_text_tokens = slot.cache_tokens.get_text_tokens();
+            llama_tokens draft = common_speculative_gen_draft(slot.spec, params_spec, cached_text_tokens, id);
 
             // keep track of total number of tokens generated in the draft
             slot.n_draft_total += draft.size();
@@ -3371,7 +3497,7 @@ struct server_context {
             slot.n_draft_accepted += ids.size() - 1;
 
             slot.cache_tokens.push_back(id);
-            slot.cache_tokens.insert(slot.cache_tokens.end(), ids.begin(), ids.end() - 1);
+            slot.cache_tokens.insert({ids.begin(), ids.end() - 1});
 
             llama_kv_self_seq_rm(ctx, slot.id, slot.n_past, -1);
 
@@ -3589,6 +3715,9 @@ int main(int argc, char ** argv) {
             if (req.path == "/" || tmp.back() == "html") {
                 res.set_content(reinterpret_cast<const char*>(loading_html), loading_html_len, "text/html; charset=utf-8");
                 res.status = 503;
+            } else if (req.path == "/models" || req.path == "/v1/models") {
+                // allow the models endpoint to be accessed during loading
+                return true;
             } else {
                 res_error(res, format_error_response("Loading model", ERROR_TYPE_UNAVAILABLE));
             }
@@ -3894,6 +4023,7 @@ int main(int argc, char ** argv) {
             { "default_generation_settings", ctx_server.default_generation_settings_for_props },
             { "total_slots", ctx_server.params_base.n_parallel },
             { "model_path", ctx_server.params_base.model.path },
+            { "modalities", json{{"vision", ctx_server.mctx != nullptr}} }, // TODO: add more in the future
             { "chat_template", common_chat_templates_source(ctx_server.chat_templates.get()) },
             { "bos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_bos(ctx_server.vocab), /* special= */ true)},
             { "eos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_eos(ctx_server.vocab), /* special= */ true)},
@@ -3941,9 +4071,10 @@ int main(int argc, char ** argv) {
     const auto handle_completions_impl = [&ctx_server, &res_error, &res_ok](
             server_task_type type,
             json & data,
-            std::function<bool()> is_connection_closed,
+            const std::vector<raw_buffer> & files,
+            const std::function<bool()> & is_connection_closed,
             httplib::Response & res,
-            oaicompat_type oaicompat) {
+            oaicompat_type oaicompat) -> void {
         GGML_ASSERT(type == SERVER_TASK_TYPE_COMPLETION || type == SERVER_TASK_TYPE_INFILL);
 
         if (ctx_server.params_base.embedding) {
@@ -3960,15 +4091,69 @@ int main(int argc, char ** argv) {
             // TODO: this log can become very long, put it behind a flag or think about a more compact format
            //SRV_DBG("Prompt: %s\n", prompt.is_string() ? prompt.get<std::string>().c_str() : prompt.dump(2).c_str());
 
-            std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true);
-            tasks.reserve(tokenized_prompts.size());
-            for (size_t i = 0; i < tokenized_prompts.size(); i++) {
+            // process files
+            mtmd::bitmaps bitmaps;
+            const bool has_mtmd = ctx_server.mctx != nullptr;
+            {
+                if (!has_mtmd && !files.empty()) {
+                    throw std::runtime_error("This server does not support multimodal");
+                }
+                for (auto & file : files) {
+                    mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(file.data(), file.size()));
+                    if (!bmp.ptr) {
+                        throw std::runtime_error("Failed to load image");
+                    }
+                    // calculate bitmap hash (for KV caching)
+                    std::string hash = fnv_hash(bmp.data(), bmp.nx()*bmp.ny()*3);
+                    bmp.set_id(hash.c_str());
+                    bitmaps.entries.push_back(std::move(bmp));
+                }
+            }
+
+            // process prompt
+            std::vector<server_tokens> inputs;
+            if (oaicompat && !prompt.is_string()) {
+                throw std::runtime_error("prompt must be a string");
+            }
+
+            if (oaicompat && has_mtmd) {
+                // multimodal
+                std::string prompt_str = prompt.get<std::string>();
+                mtmd_input_text inp_txt = {
+                    prompt_str.c_str(),
+                    /* add_special */ true,
+                    /* parse_special */ true,
+                };
+                mtmd::input_chunks chunks(mtmd_input_chunks_init());
+                auto bitmaps_c_ptr = bitmaps.c_ptr();
+                int32_t tokenized = mtmd_tokenize(ctx_server.mctx,
+                                                  chunks.ptr.get(),
+                                                  &inp_txt,
+                                                  bitmaps_c_ptr.data(),
+                                                  bitmaps_c_ptr.size());
+                if (tokenized != 0) {
+                    throw std::runtime_error("Failed to tokenize prompt");
+                }
+
+                server_tokens tmp(chunks, true);
+                inputs.push_back(std::move(tmp));
+            } else {
+                // non-multimodal version
+                auto tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true);
+                for (auto & p : tokenized_prompts) {
+                    auto tmp = server_tokens(p, ctx_server.mctx != nullptr);
+                    inputs.push_back(std::move(tmp));
+                }
+            }
+
+            tasks.reserve(inputs.size());
+            for (size_t i = 0; i < inputs.size(); i++) {
                 server_task task = server_task(type);
 
                 task.id = ctx_server.queue_tasks.get_new_id();
                 task.index = i;
 
-                task.prompt_tokens = std::move(tokenized_prompts[i]);
+                task.prompt_tokens = std::move(inputs[i]);
                 task.params = server_task::params_from_json_cmpl(
                     ctx_server.ctx,
                     ctx_server.params_base,
@@ -4050,9 +4235,11 @@ int main(int argc, char ** argv) {
 
     const auto handle_completions = [&handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
         json data = json::parse(req.body);
-        return handle_completions_impl(
+        std::vector<raw_buffer> files; // dummy
+        handle_completions_impl(
             SERVER_TASK_TYPE_COMPLETION,
             data,
+            files,
             req.is_connection_closed,
             res,
             OAICOMPAT_TYPE_NONE);
@@ -4060,9 +4247,11 @@ int main(int argc, char ** argv) {
 
     const auto handle_completions_oai = [&handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
         json data = oaicompat_completion_params_parse(json::parse(req.body));
-        return handle_completions_impl(
+        std::vector<raw_buffer> files; // dummy
+        handle_completions_impl(
            SERVER_TASK_TYPE_COMPLETION,
            data,
+            files,
            req.is_connection_closed,
            res,
            OAICOMPAT_TYPE_COMPLETION);
@@ -4137,9 +4326,11 @@ int main(int argc, char ** argv) {
             tokenized_prompts[0]
         );
 
-        return handle_completions_impl(
+        std::vector<raw_buffer> files; // dummy
+        handle_completions_impl(
             SERVER_TASK_TYPE_INFILL,
             data,
+            files,
             req.is_connection_closed,
             res,
             OAICOMPAT_TYPE_NONE); // infill is not OAI compatible
@@ -4153,11 +4344,20 @@ int main(int argc, char ** argv) {
         }
 
         auto body = json::parse(req.body);
-        json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates.get());
-
-        return handle_completions_impl(
+        std::vector<raw_buffer> files;
+        json data = oaicompat_completion_params_parse(
+            body,
+            params.use_jinja,
+            params.prefill_assistant,
+            params.reasoning_format,
+            ctx_server.chat_templates.get(),
+            ctx_server.mctx,
+            files);
+
+        handle_completions_impl(
             SERVER_TASK_TYPE_COMPLETION,
             data,
+            files,
             req.is_connection_closed,
             res,
             OAICOMPAT_TYPE_CHAT);
@@ -4166,11 +4366,25 @@ int main(int argc, char ** argv) {
     // same with handle_chat_completions, but without inference part
     const auto handle_apply_template = [&ctx_server, &params, &res_ok](const httplib::Request & req, httplib::Response & res) {
         auto body = json::parse(req.body);
-        json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates.get());
+        std::vector<raw_buffer> files; // dummy, unused
+        json data = oaicompat_completion_params_parse(
+            body,
+            params.use_jinja,
+            params.prefill_assistant,
+            params.reasoning_format,
+            ctx_server.chat_templates.get(),
+            ctx_server.mctx,
+            files);
         res_ok(res, {{ "prompt", std::move(data.at("prompt")) }});
     };
 
-    const auto handle_models = [&params, &ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) {
+    const auto handle_models = [&params, &ctx_server, &state, &res_ok](const httplib::Request &, httplib::Response & res) {
+        server_state current_state = state.load();
+        json model_meta = nullptr;
+        if (current_state == SERVER_STATE_READY) {
+            model_meta = ctx_server.model_meta();
+        }
+
         json models = {
             {"object", "list"},
             {"data", {
@@ -4179,7 +4393,7 @@ int main(int argc, char ** argv) {
                     {"object", "model"},
                     {"created", std::time(0)},
                     {"owned_by", "llamacpp"},
-                    {"meta", ctx_server.model_meta()},
+                    {"meta", model_meta},
                 },
             }}
         };
@@ -4271,7 +4485,7 @@ int main(int argc, char ** argv) {
             }
         }
 
-        std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true);
+        auto tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true);
         for (const auto & tokens : tokenized_prompts) {
             // this check is necessary for models that do not add BOS token to the input
             if (tokens.empty()) {
@@ -4291,7 +4505,7 @@ int main(int argc, char ** argv) {
 
                 task.id = ctx_server.queue_tasks.get_new_id();
                 task.index = i;
-                task.prompt_tokens = std::move(tokenized_prompts[i]);
+                task.prompt_tokens = server_tokens(tokenized_prompts[i], ctx_server.mctx != nullptr);
 
                 // OAI-compat
                 task.params.oaicompat = oaicompat;
@@ -4385,13 +4599,14 @@ int main(int argc, char ** argv) {
         std::unordered_set<int> task_ids;
         {
            std::vector<server_task> tasks;
-            std::vector<llama_tokens> tokenized_docs = tokenize_input_prompts(ctx_server.vocab, documents, /* add_special */ false, true);
+            auto tokenized_docs = tokenize_input_prompts(ctx_server.vocab, documents, /* add_special */ false, true);
            tasks.reserve(tokenized_docs.size());
            for (size_t i = 0; i < tokenized_docs.size(); i++) {
+                auto tmp = format_rerank(ctx_server.vocab, tokenized_query, tokenized_docs[i]);
                server_task task = server_task(SERVER_TASK_TYPE_RERANK);
                task.id = ctx_server.queue_tasks.get_new_id();
                task.index = i;
-                task.prompt_tokens = format_rerank(ctx_server.vocab, tokenized_query, tokenized_docs[i]);
+                task.prompt_tokens = server_tokens(tmp, ctx_server.mctx != nullptr);
                tasks.push_back(std::move(task));
            }
 
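
Two of the server-side additions above are directly visible to HTTP clients: /props now reports a "modalities" object (its "vision" flag is true only when the server was started with a multimodal projector), and completion requests accept the new "top_n_sigma" sampling parameter. The TypeScript sketch below is illustrative only and is not part of the package; it assumes a llama-server built from this llama.cpp revision listening on http://localhost:8080, and everything other than the endpoint paths and field names taken from the diff (port, prompt, parameter value) is made up.

// client-sketch.ts — probe the new fields exposed by this llama.cpp revision
const base = "http://localhost:8080"; // assumed local llama-server instance

async function main(): Promise<void> {
  // /props now includes "modalities"; "vision" reflects whether an --mmproj projector is loaded
  const props = await fetch(`${base}/props`).then(r => r.json());
  console.log("vision supported:", props.modalities?.vision === true);

  // /completion accepts the new "top_n_sigma" sampling parameter added in this diff;
  // when omitted, the server falls back to its sampling defaults
  const completion = await fetch(`${base}/completion`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      prompt: "The capital of France is",
      n_predict: 16,
      top_n_sigma: 1.5, // illustrative value
    }),
  }).then(r => r.json());

  console.log(completion.content);
}

main().catch(console.error);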