@fugood/llama.node 0.3.16 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +6 -1
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +44 -2
- package/lib/index.js +132 -1
- package/lib/index.ts +203 -3
- package/package.json +2 -1
- package/src/EmbeddingWorker.cpp +1 -1
- package/src/LlamaCompletionWorker.cpp +374 -19
- package/src/LlamaCompletionWorker.h +31 -10
- package/src/LlamaContext.cpp +216 -7
- package/src/LlamaContext.h +12 -0
- package/src/common.hpp +15 -0
- package/src/llama.cpp/.github/workflows/build-linux-cross.yml +233 -0
- package/src/llama.cpp/.github/workflows/build.yml +89 -767
- package/src/llama.cpp/.github/workflows/docker.yml +9 -6
- package/src/llama.cpp/.github/workflows/release.yml +716 -0
- package/src/llama.cpp/.github/workflows/server.yml +19 -23
- package/src/llama.cpp/CMakeLists.txt +11 -1
- package/src/llama.cpp/cmake/build-info.cmake +8 -2
- package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
- package/src/llama.cpp/common/CMakeLists.txt +35 -4
- package/src/llama.cpp/common/arg.cpp +844 -121
- package/src/llama.cpp/common/arg.h +9 -0
- package/src/llama.cpp/common/chat.cpp +129 -107
- package/src/llama.cpp/common/chat.h +2 -0
- package/src/llama.cpp/common/common.cpp +64 -518
- package/src/llama.cpp/common/common.h +35 -45
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
- package/src/llama.cpp/common/llguidance.cpp +31 -47
- package/src/llama.cpp/common/minja/chat-template.hpp +23 -11
- package/src/llama.cpp/common/minja/minja.hpp +186 -127
- package/src/llama.cpp/common/regex-partial.cpp +204 -0
- package/src/llama.cpp/common/regex-partial.h +56 -0
- package/src/llama.cpp/common/sampling.cpp +60 -50
- package/src/llama.cpp/docs/build.md +122 -7
- package/src/llama.cpp/examples/CMakeLists.txt +2 -32
- package/src/llama.cpp/examples/batched/batched.cpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +9 -12
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/parallel/parallel.cpp +89 -15
- package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
- package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
- package/src/llama.cpp/examples/sycl/build.sh +2 -2
- package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
- package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/training/finetune.cpp +96 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +35 -2
- package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
- package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
- package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
- package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
- package/src/llama.cpp/ggml/include/ggml.h +76 -106
- package/src/llama.cpp/ggml/src/CMakeLists.txt +11 -8
- package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
- package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +66 -33
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -194
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1060 -410
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1008 -13533
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +31 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +90 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +266 -72
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1034 -88
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8796 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +252 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
- package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +106 -14
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -262
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
- package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +307 -40
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +125 -45
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +10 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +239 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +9 -307
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +944 -438
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +507 -411
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
- package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +83 -49
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1278 -282
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +32 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +133 -30
- package/src/llama.cpp/ggml/src/ggml.c +170 -265
- package/src/llama.cpp/ggml/src/gguf.cpp +34 -33
- package/src/llama.cpp/include/llama.h +82 -22
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +5 -3
- package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
- package/src/llama.cpp/scripts/xxd.cmake +1 -1
- package/src/llama.cpp/src/CMakeLists.txt +4 -2
- package/src/llama.cpp/src/llama-adapter.cpp +43 -1
- package/src/llama.cpp/src/llama-arch.cpp +163 -17
- package/src/llama.cpp/src/llama-arch.h +16 -0
- package/src/llama.cpp/src/llama-batch.cpp +5 -1
- package/src/llama.cpp/src/llama-batch.h +2 -1
- package/src/llama.cpp/src/llama-chat.cpp +91 -16
- package/src/llama.cpp/src/llama-chat.h +7 -2
- package/src/llama.cpp/src/llama-context.cpp +479 -575
- package/src/llama.cpp/src/llama-context.h +44 -33
- package/src/llama.cpp/src/llama-cparams.h +1 -0
- package/src/llama.cpp/src/llama-graph.cpp +209 -157
- package/src/llama.cpp/src/llama-graph.h +38 -14
- package/src/llama.cpp/src/llama-hparams.h +13 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +1604 -543
- package/src/llama.cpp/src/llama-kv-cache.h +283 -171
- package/src/llama.cpp/src/llama-memory.h +12 -2
- package/src/llama.cpp/src/llama-mmap.cpp +1 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +34 -20
- package/src/llama.cpp/src/llama-model-loader.h +5 -3
- package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
- package/src/llama.cpp/src/llama-model-saver.h +37 -0
- package/src/llama.cpp/src/llama-model.cpp +1803 -330
- package/src/llama.cpp/src/llama-model.h +21 -2
- package/src/llama.cpp/src/llama-quant.cpp +33 -10
- package/src/llama.cpp/src/llama-sampling.cpp +25 -7
- package/src/llama.cpp/src/llama-vocab.cpp +86 -10
- package/src/llama.cpp/src/llama-vocab.h +6 -0
- package/src/llama.cpp/src/llama.cpp +15 -1
- package/src/llama.cpp/tests/CMakeLists.txt +52 -31
- package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
- package/src/llama.cpp/tests/test-backend-ops.cpp +189 -90
- package/src/llama.cpp/tests/test-chat-template.cpp +26 -6
- package/src/llama.cpp/tests/test-chat.cpp +15 -3
- package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
- package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
- package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
- package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
- package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
- package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
- package/src/llama.cpp/tests/test-opt.cpp +33 -21
- package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
- package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
- package/src/llama.cpp/tests/test-sampling.cpp +1 -1
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
- package/src/llama.cpp/tools/CMakeLists.txt +39 -0
- package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +3 -3
- package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +1 -1
- package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +15 -16
- package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
- package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +623 -274
- package/src/llama.cpp/{examples → tools}/main/main.cpp +22 -14
- package/src/llama.cpp/tools/mtmd/CMakeLists.txt +47 -0
- package/src/llama.cpp/tools/mtmd/clip-impl.h +365 -0
- package/src/llama.cpp/tools/mtmd/clip.cpp +3646 -0
- package/src/llama.cpp/tools/mtmd/clip.h +99 -0
- package/src/llama.cpp/tools/mtmd/deprecation-warning.cpp +22 -0
- package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +370 -0
- package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
- package/src/llama.cpp/tools/mtmd/mtmd.cpp +678 -0
- package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
- package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +21 -5
- package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +53 -3
- package/src/llama.cpp/tools/rpc/CMakeLists.txt +4 -0
- package/src/llama.cpp/tools/rpc/rpc-server.cpp +322 -0
- package/src/llama.cpp/tools/run/CMakeLists.txt +16 -0
- package/src/llama.cpp/{examples → tools}/run/run.cpp +30 -30
- package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
- package/src/llama.cpp/{examples → tools}/server/httplib.h +313 -247
- package/src/llama.cpp/{examples → tools}/server/server.cpp +529 -215
- package/src/llama.cpp/{examples → tools}/server/utils.hpp +427 -6
- package/src/llama.cpp/{examples → tools}/tts/tts.cpp +6 -9
- package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/infill/infill.cpp +0 -590
- package/src/llama.cpp/examples/llava/CMakeLists.txt +0 -66
- package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
- package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
- package/src/llama.cpp/examples/llava/clip.cpp +0 -3206
- package/src/llama.cpp/examples/llava/clip.h +0 -118
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
- package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
- package/src/llama.cpp/examples/llava/llava.cpp +0 -574
- package/src/llama.cpp/examples/llava/llava.h +0 -49
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
- package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +0 -584
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
- package/src/llama.cpp/examples/rpc/CMakeLists.txt +0 -2
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +0 -171
- package/src/llama.cpp/examples/run/CMakeLists.txt +0 -5
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
- /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
- /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0
package/src/llama.cpp/{examples → tools}/server/server.cpp:

```diff
@@ -7,6 +7,7 @@
 #include "log.h"
 #include "sampling.h"
 #include "speculative.h"
+#include "mtmd.h"
 
 // Change JSON_ASSERT from assert() to GGML_ASSERT:
 #define JSON_ASSERT GGML_ASSERT
@@ -133,7 +134,8 @@ struct slot_params {
 
         auto grammar_triggers = json::array();
         for (const auto & trigger : sampling.grammar_triggers) {
-            grammar_triggers.push_back(trigger.to_json<json>());
+            server_grammar_trigger ct(std::move(trigger));
+            grammar_triggers.push_back(ct.to_json());
         }
 
         return json {
@@ -145,6 +147,7 @@ struct slot_params {
             {"top_k", sampling.top_k},
             {"top_p", sampling.top_p},
             {"min_p", sampling.min_p},
+            {"top_n_sigma", sampling.top_n_sigma},
             {"xtc_probability", sampling.xtc_probability},
             {"xtc_threshold", sampling.xtc_threshold},
             {"typical_p", sampling.typ_p},
@@ -195,8 +198,8 @@ struct server_task {
     int id_target = -1;
 
     // used by SERVER_TASK_TYPE_INFERENCE
-    slot_params  params;
-    llama_tokens prompt_tokens;
+    slot_params   params;
+    server_tokens prompt_tokens;
     int id_selected_slot = -1;
 
     // used by SERVER_TASK_TYPE_SLOT_SAVE, SERVER_TASK_TYPE_SLOT_RESTORE, SERVER_TASK_TYPE_SLOT_ERASE
@@ -247,6 +250,7 @@ struct server_task {
         params.sampling.top_k = json_value(data, "top_k", defaults.sampling.top_k);
         params.sampling.top_p = json_value(data, "top_p", defaults.sampling.top_p);
         params.sampling.min_p = json_value(data, "min_p", defaults.sampling.min_p);
+        params.sampling.top_n_sigma = json_value(data, "top_n_sigma", defaults.sampling.top_n_sigma);
         params.sampling.xtc_probability = json_value(data, "xtc_probability", defaults.sampling.xtc_probability);
         params.sampling.xtc_threshold = json_value(data, "xtc_threshold", defaults.sampling.xtc_threshold);
         params.sampling.typ_p = json_value(data, "typical_p", defaults.sampling.typ_p);
```
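The `top_n_sigma` sampler parameter added above is parsed from completion requests and echoed back in `slot_params::to_json()`. A minimal sketch of a request body exercising it (values are illustrative; omitted fields keep the server defaults):

```json
{
  "prompt": "Write a haiku about autumn.",
  "n_predict": 64,
  "top_k": 40,
  "top_p": 0.95,
  "top_n_sigma": 1.5
}
```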
```diff
@@ -372,9 +376,9 @@ struct server_task {
         const auto grammar_triggers = data.find("grammar_triggers");
         if (grammar_triggers != data.end()) {
             for (const auto & t : *grammar_triggers) {
-                auto ct = common_grammar_trigger::from_json(t);
-                if (ct.type == COMMON_GRAMMAR_TRIGGER_TYPE_WORD) {
-                    const auto & word = ct.value;
+                server_grammar_trigger ct(t);
+                if (ct.value.type == COMMON_GRAMMAR_TRIGGER_TYPE_WORD) {
+                    const auto & word = ct.value.value;
                     auto ids = common_tokenize(vocab, word, /* add_special= */ false, /* parse_special= */ true);
                     if (ids.size() == 1) {
                         auto token = ids[0];
@@ -392,7 +396,7 @@ struct server_task {
                         params.sampling.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, word});
                     }
                 } else {
-                    params.sampling.grammar_triggers.push_back(ct);
+                    params.sampling.grammar_triggers.push_back(std::move(ct.value));
                 }
             }
         }
@@ -489,8 +493,12 @@ struct result_timings {
     double predicted_per_token_ms;
     double predicted_per_second;
 
+    // Optional speculative metrics - only included when > 0
+    int32_t draft_n = 0;
+    int32_t draft_n_accepted = 0;
+
     json to_json() const {
-        return json {
+        json base = {
             {"prompt_n", prompt_n},
             {"prompt_ms", prompt_ms},
             {"prompt_per_token_ms", prompt_per_token_ms},
@@ -501,6 +509,13 @@ struct result_timings {
             {"predicted_per_token_ms", predicted_per_token_ms},
             {"predicted_per_second", predicted_per_second},
         };
+
+        if (draft_n > 0) {
+            base["draft_n"] = draft_n;
+            base["draft_n_accepted"] = draft_n_accepted;
+        }
+
+        return base;
     }
 };
 
```
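With the optional speculative metrics above, a completion response's `timings` object gains `draft_n` and `draft_n_accepted` whenever a draft model produced tokens. An abridged sketch of the resulting JSON (values are illustrative):

```json
{
  "timings": {
    "prompt_n": 12,
    "prompt_ms": 41.7,
    "predicted_n": 128,
    "predicted_ms": 1748.6,
    "predicted_per_second": 73.2,
    "draft_n": 96,
    "draft_n_accepted": 74
  }
}
```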
```diff
@@ -1234,6 +1249,9 @@ struct server_slot {
     llama_context * ctx = nullptr;
     llama_context * ctx_dft = nullptr;
 
+    // multimodal
+    mtmd_context * mctx = nullptr;
+
     common_speculative * spec = nullptr;
 
     std::vector<common_adapter_lora_info> lora;
@@ -1261,14 +1279,14 @@ struct server_slot {
     int32_t n_prompt_tokens_processed = 0;
 
     // input prompt tokens
-    llama_tokens prompt_tokens;
+    server_tokens prompt_tokens;
 
     size_t last_nl_pos = 0;
 
     std::string generated_text;
     llama_tokens generated_tokens;
 
-    llama_tokens cache_tokens;
+    server_tokens cache_tokens;
 
     std::vector<completion_token_output> generated_token_probs;
 
@@ -1299,6 +1317,10 @@ struct server_slot {
 
     std::function<void(int)> callback_on_release;
 
+    // Speculative decoding stats
+    int32_t n_draft_total = 0;    // Total draft tokens generated
+    int32_t n_draft_accepted = 0; // Draft tokens actually accepted
+
     void reset() {
         SLT_DBG(*this, "%s", "\n");
 
@@ -1315,6 +1337,10 @@ struct server_slot {
 
         generated_tokens.clear();
         generated_token_probs.clear();
+
+        // clear speculative decoding stats
+        n_draft_total = 0;
+        n_draft_accepted = 0;
     }
 
     bool is_non_causal() const {
@@ -1381,6 +1407,12 @@ struct server_slot {
         timings.predicted_per_token_ms = t_token_generation / n_decoded;
         timings.predicted_per_second = 1e3 / t_token_generation * n_decoded;
 
+        // Add speculative metrics
+        if (n_draft_total > 0) {
+            timings.draft_n = n_draft_total;
+            timings.draft_n_accepted = n_draft_accepted;
+        }
+
         return timings;
     }
 
@@ -1397,7 +1429,7 @@ struct server_slot {
                 pos = text.find(word, from_pos);
             } else {
                 // otherwise, partial stop
-                pos = find_partial_stop_string(word, text);
+                pos = string_find_partial_stop(text, word);
             }
 
             if (pos != std::string::npos && (stop_pos == std::string::npos || pos < stop_pos)) {
@@ -1428,6 +1460,15 @@ struct server_slot {
                 t_prompt_processing, n_prompt_tokens_processed, t_prompt, n_prompt_second,
                 t_token_generation, n_decoded, t_gen, n_gen_second,
                 t_prompt_processing + t_token_generation, n_prompt_tokens_processed + n_decoded);
+
+        if (n_draft_total > 0) {
+            const float draft_ratio = (float) n_draft_accepted / n_draft_total;
+            SLT_INF(*this,
+                    "\n"
+                    "draft acceptance rate = %0.5f (%5d accepted / %5d generated)\n",
+                    draft_ratio, n_draft_accepted, n_draft_total
+            );
+        }
     }
 
     json to_json() const {
@@ -1439,7 +1480,7 @@ struct server_slot {
             {"is_processing", is_processing()},
             {"non_causal", is_non_causal()},
             {"params", params.to_json()},
-            {"prompt", common_detokenize(ctx, prompt_tokens)},
+            {"prompt", prompt_tokens.detokenize(ctx, true)},
             {"next_token",
                 {
                     {"has_next_token", has_next_token},
@@ -1517,29 +1558,30 @@ struct server_queue {
    std::condition_variable condition_tasks;
 
    // callback functions
-   std::function<void(server_task)> callback_new_task;
-   std::function<void(void)>        callback_update_slots;
+   std::function<void(server_task &&)> callback_new_task;
+   std::function<void(void)>           callback_update_slots;
 
    // Add a new task to the end of the queue
-   int post(server_task task, bool front = false) {
+   int post(server_task && task, bool front = false) {
        std::unique_lock<std::mutex> lock(mutex_tasks);
        GGML_ASSERT(task.id != -1);
        // if this is cancel task make sure to clean up pending tasks
        if (task.type == SERVER_TASK_TYPE_CANCEL) {
            cleanup_pending_task(task.id_target);
        }
-       QUE_DBG("new task, id = %d, front = %d\n", task.id, front);
+       const int task_id = task.id;
+       QUE_DBG("new task, id = %d, front = %d\n", task_id, front);
        if (front) {
            queue_tasks.push_front(std::move(task));
        } else {
            queue_tasks.push_back(std::move(task));
        }
        condition_tasks.notify_one();
-       return task.id;
+       return task_id;
    }
 
    // multi-task version of post()
-   int post(std::vector<server_task> & tasks, bool front = false) {
+   int post(std::vector<server_task> && tasks, bool front = false) {
        std::unique_lock<std::mutex> lock(mutex_tasks);
        for (auto & task : tasks) {
            if (task.id == -1) {
@@ -1561,7 +1603,7 @@ struct server_queue {
    }
 
    // Add a new task, but defer until one slot is available
-   void defer(server_task task) {
+   void defer(server_task && task) {
        std::unique_lock<std::mutex> lock(mutex_tasks);
        QUE_DBG("defer task, id = %d\n", task.id);
        queue_tasks_deferred.push_back(std::move(task));
@@ -1576,7 +1618,7 @@ struct server_queue {
    }
 
    // Register function to process a new task
-   void on_new_task(std::function<void(server_task)> callback) {
+   void on_new_task(std::function<void(server_task &&)> callback) {
        callback_new_task = std::move(callback);
    }
 
@@ -1625,7 +1667,7 @@ struct server_queue {
                lock.unlock();
                break;
            }
-           server_task task = queue_tasks.front();
+           server_task task = std::move(queue_tasks.front());
            queue_tasks.pop_front();
            lock.unlock();
 
@@ -1670,6 +1712,8 @@ private:
 };
 
 struct server_response {
+    bool running = true;
+
     // for keeping track of all tasks waiting for the result
     std::unordered_set<int> waiting_task_ids;
 
@@ -1724,6 +1768,10 @@ struct server_response {
         while (true) {
             std::unique_lock<std::mutex> lock(mutex_results);
             condition_results.wait(lock, [&]{
+                if (!running) {
+                    SRV_DBG("%s : queue result stop\n", __func__);
+                    std::terminate(); // we cannot return here since the caller is HTTP code
+                }
                 return !queue_results.empty();
             });
 
@@ -1754,6 +1802,10 @@ struct server_response {
             }
 
             std::cv_status cr_res = condition_results.wait_for(lock, std::chrono::seconds(timeout));
+            if (!running) {
+                SRV_DBG("%s : queue result stop\n", __func__);
+                std::terminate(); // we cannot return here since the caller is HTTP code
+            }
             if (cr_res == std::cv_status::timeout) {
                 return nullptr;
             }
@@ -1783,6 +1835,12 @@ struct server_response {
             }
         }
     }
+
+    // terminate the waiting loop
+    void terminate() {
+        running = false;
+        condition_results.notify_all();
+    }
 };
 
 struct server_context {
```
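The new `server_response::terminate()` flips the `running` flag and wakes every waiter; a woken waiter calls `std::terminate()` instead of returning, because the callers are HTTP handler threads with no way to represent a missing result. A minimal standalone sketch of that shutdown pattern (simplified types, not the actual server structs):

```cpp
#include <condition_variable>
#include <deque>
#include <exception>
#include <mutex>

// Simplified stand-in for server_response: results are ints, not task objects.
struct result_queue {
    std::mutex              mutex;
    std::condition_variable cv;
    std::deque<int>         results;
    bool                    running = true;

    int recv() {
        std::unique_lock<std::mutex> lock(mutex);
        cv.wait(lock, [&] {
            if (!running) {
                std::terminate(); // callers cannot handle a "no result" state
            }
            return !results.empty();
        });
        const int r = results.front();
        results.pop_front();
        return r;
    }

    void terminate() {
        {
            std::lock_guard<std::mutex> lock(mutex);
            running = false;
        }
        cv.notify_all(); // wake all recv() calls so they hit the running check
    }
};
```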
```diff
@@ -1795,13 +1853,16 @@ struct server_context {
     llama_model * model = nullptr;
     llama_context * ctx = nullptr;
 
+    // multimodal
+    mtmd_context * mctx = nullptr;
+
     const llama_vocab * vocab = nullptr;
 
     llama_model * model_dft = nullptr;
 
     llama_context_params cparams_dft;
 
-    llama_batch batch;
+    llama_batch batch {};
 
     bool clean_kv_cache = true;
     bool add_bos_token = true;
@@ -1824,6 +1885,8 @@ struct server_context {
     common_chat_templates_ptr chat_templates;
 
     ~server_context() {
+        mtmd_free(mctx);
+
         // Clear any sampling context
         for (server_slot & slot : slots) {
             common_sampler_free(slot.smpl);
@@ -1842,7 +1905,7 @@ struct server_context {
     }
 
     bool load_model(const common_params & params) {
-        SRV_INF("loading model '%s'\n", params.model.c_str());
+        SRV_INF("loading model '%s'\n", params.model.path.c_str());
 
         params_base = params;
 
@@ -1852,7 +1915,7 @@ struct server_context {
         ctx = llama_init.context.get();
 
         if (model == nullptr) {
-            SRV_ERR("failed to load model, '%s'\n", params_base.model.c_str());
+            SRV_ERR("failed to load model, '%s'\n", params_base.model.path.c_str());
             return false;
         }
 
@@ -1863,16 +1926,13 @@ struct server_context {
         add_bos_token = llama_vocab_get_add_bos(vocab);
         has_eos_token = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
 
-        if (!params_base.speculative.model.empty() || !params_base.speculative.hf_repo.empty()) {
-            SRV_INF("loading draft model '%s'\n", params_base.speculative.model.c_str());
+        if (!params_base.speculative.model.path.empty() || !params_base.speculative.model.hf_repo.empty()) {
+            SRV_INF("loading draft model '%s'\n", params_base.speculative.model.path.c_str());
 
             auto params_dft = params_base;
 
             params_dft.devices = params_base.speculative.devices;
-            params_dft.hf_file = params_base.speculative.hf_file;
-            params_dft.hf_repo = params_base.speculative.hf_repo;
             params_dft.model = params_base.speculative.model;
-            params_dft.model_url = params_base.speculative.model_url;
             params_dft.n_ctx = params_base.speculative.n_ctx == 0 ? params_base.n_ctx / params_base.n_parallel : params_base.speculative.n_ctx;
             params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
             params_dft.n_parallel = 1;
@@ -1886,12 +1946,12 @@ struct server_context {
             model_dft = llama_init_dft.model.get();
 
             if (model_dft == nullptr) {
-                SRV_ERR("failed to load draft model, '%s'\n", params_base.speculative.model.c_str());
+                SRV_ERR("failed to load draft model, '%s'\n", params_base.speculative.model.path.c_str());
                 return false;
             }
 
             if (!common_speculative_are_compatible(ctx, llama_init_dft.context.get())) {
-                SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params_base.speculative.model.c_str(), params_base.model.c_str());
+                SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params_base.speculative.model.path.c_str(), params_base.model.path.c_str());
 
                 return false;
             }
@@ -1914,6 +1974,36 @@ struct server_context {
             chat_templates = common_chat_templates_init(model, "chatml");
         }
 
+        std::string & mmproj_path = params_base.mmproj.path;
+        if (!mmproj_path.empty()) {
+            mtmd_context_params mparams = mtmd_context_params_default();
+            mparams.use_gpu = params_base.mmproj_use_gpu;
+            mparams.print_timings = false;
+            mparams.n_threads = params_base.cpuparams.n_threads;
+            mparams.verbosity = params_base.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO;
+            mctx = mtmd_init_from_file(mmproj_path.c_str(), model, mparams);
+            if (mctx == nullptr) {
+                SRV_ERR("failed to load multimodal model, '%s'\n", mmproj_path.c_str());
+                return false;
+            }
+            SRV_INF("loaded multimodal model, '%s'\n", mmproj_path.c_str());
+
+            if (params_base.ctx_shift) {
+                params_base.ctx_shift = false;
+                SRV_WRN("%s\n", "ctx_shift is not supported by multimodal, it will be disabled");
+            }
+
+            if (params_base.n_cache_reuse) {
+                params_base.n_cache_reuse = 0;
+                SRV_WRN("%s\n", "cache_reuse is not supported by multimodal, it will be disabled");
+            }
+
+            if (!params_base.speculative.model.path.empty()) {
+                SRV_ERR("%s\n", "err: speculative decode is not supported by multimodal");
+                return false;
+            }
+        }
+
         return true;
     }
 
```
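With the block above, the server gains first-class multimodal support: passing a projector file initializes an `mtmd_context` next to the base model. A sketch of a corresponding launch (file names are placeholders; the `--mmproj` flag is the one wired up in `common/arg.cpp` in this tree):

```sh
llama-server \
  -m gemma-3-4b-it-Q4_K_M.gguf \
  --mmproj mmproj-gemma-3-4b-it-f16.gguf \
  --port 8080
```

Note the guard rails: context shift and `n_cache_reuse` are force-disabled, and combining a projector with a speculative draft model aborts model loading.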
```diff
@@ -1929,6 +2019,8 @@ struct server_context {
             slot.ctx = ctx;
             slot.n_ctx = n_ctx_slot;
             slot.n_predict = params_base.n_predict;
+            slot.mctx = mctx;
+            slot.cache_tokens.has_mtmd = mctx != nullptr;
 
             if (model_dft) {
                 slot.batch_spec = llama_batch_init(params_base.speculative.n_max + 1, 0, 1);
@@ -1956,7 +2048,7 @@ struct server_context {
 
             slot.reset();
 
-            slots.push_back(slot);
+            slots.push_back(std::move(slot));
         }
 
         default_generation_settings_for_props = slots[0].to_json();
@@ -1965,8 +2057,6 @@ struct server_context {
         // note that n_batch can be > n_ctx (e.g. for non-causal attention models such as BERT where the KV cache is not used)
         {
             const int32_t n_batch = llama_n_batch(ctx);
-
-            // only a single seq_id per token is needed
             batch = llama_batch_init(std::max(n_batch, params_base.n_parallel), 0, 1);
         }
 
@@ -2003,7 +2093,7 @@ struct server_context {
             }
 
             // length of the Longest Common Subsequence between the current slot's prompt and the input prompt
-            int cur_lcs_len = longest_common_subsequence(slot.cache_tokens, task.prompt_tokens);
+            int cur_lcs_len = slot.cache_tokens.get_common_prefix(task.prompt_tokens);
 
             // fraction of the common subsequence length compared to the current slot's prompt length
             float cur_similarity = static_cast<float>(cur_lcs_len) / static_cast<int>(slot.cache_tokens.size());
@@ -2045,19 +2135,7 @@ struct server_context {
         return ret;
     }
 
-    bool can_be_detokenized(const struct llama_context * ctx, const std::vector<llama_token> & tokens) {
-        const llama_model * model = llama_get_model(ctx);
-        const llama_vocab * vocab = llama_model_get_vocab(model);
-        const int32_t n_vocab = llama_vocab_n_tokens(vocab);
-        for (const auto & token : tokens) {
-            if (token < 0 || token >= n_vocab) {
-                return false;
-            }
-        }
-        return true;
-    }
-
-    bool launch_slot_with_task(server_slot & slot, const server_task & task) {
+    bool launch_slot_with_task(server_slot & slot, server_task && task) {
         slot.reset();
         slot.id_task = task.id;
         slot.index = task.index;
@@ -2065,14 +2143,13 @@ struct server_context {
         slot.params = std::move(task.params);
         slot.prompt_tokens = std::move(task.prompt_tokens);
 
-        if (!are_lora_equal(task.params.lora, slot.lora)) {
+        if (!are_lora_equal(slot.params.lora, slot.lora)) {
             // if lora is changed, we cannot reuse cached tokens
             slot.cache_tokens.clear();
-            slot.lora = task.params.lora;
+            slot.lora = slot.params.lora;
        }
 
-        bool can_detokenize = can_be_detokenized(ctx, slot.prompt_tokens);
-        if (!can_detokenize) {
+        if (!slot.prompt_tokens.validate(ctx)) {
             send_error(task, "Prompt contains invalid tokens", ERROR_TYPE_INVALID_REQUEST);
             return false;
         }
@@ -2174,6 +2251,14 @@ struct server_context {
             slot.has_next_token = true;
         }
 
+        // if context shifting is disabled, make sure that we don't run out of context
+        if (!params_base.ctx_shift && slot.n_past + 1 >= slot.n_ctx) {
+            slot.stop = STOP_TYPE_LIMIT;
+            slot.has_next_token = false;
+
+            SLT_DBG(slot, "stopped due to running out of context, n_past = %d, n_ctx = %d\n", slot.n_past, slot.n_ctx);
+        }
+
         // check the limits
         if (slot.n_decoded > 0 && slot.has_next_token && !slot.has_budget(params_base)) {
             slot.stop = STOP_TYPE_LIMIT;
@@ -2334,6 +2419,15 @@ struct server_context {
         queue_results.send(std::move(res));
     }
 
+    // if multimodal is enabled, send an error and return false
+    bool ensure_no_mtmd(const int id_task) {
+        if (mctx) {
+            send_error(id_task, "This feature is not supported by multimodal", ERROR_TYPE_NOT_SUPPORTED);
+            return false;
+        }
+        return true;
+    }
+
     void send_partial_response(server_slot & slot, const completion_token_output & tkn) {
         auto res = std::make_unique<server_task_result_cmpl_partial>();
 
@@ -2373,7 +2467,7 @@ struct server_context {
         res->content = std::move(slot.generated_text);
         res->tokens = std::move(slot.generated_tokens);
         res->timings = slot.get_timings();
-        res->prompt = common_detokenize(ctx, slot.prompt_tokens);
+        res->prompt = slot.prompt_tokens.detokenize(ctx, true);
         res->response_fields = std::move(slot.params.response_fields);
 
         res->truncated = slot.truncated;
@@ -2499,10 +2593,10 @@ struct server_context {
             server_task task(SERVER_TASK_TYPE_CANCEL);
             task.id_target = id_task;
             queue_results.remove_waiting_task_id(id_task);
-            cancel_tasks.push_back(task);
+            cancel_tasks.push_back(std::move(task));
         }
         // push to beginning of the queue, so it has highest priority
-        queue_tasks.post(cancel_tasks, true);
+        queue_tasks.post(std::move(cancel_tasks), true);
     }
 
     // receive the results from task(s)
@@ -2589,7 +2683,7 @@ struct server_context {
     // Functions to process the task
     //
 
-    void process_single_task(server_task task) {
+    void process_single_task(server_task && task) {
        switch (task.type) {
            case SERVER_TASK_TYPE_COMPLETION:
            case SERVER_TASK_TYPE_INFILL:
@@ -2603,17 +2697,17 @@ struct server_context {
                    if (slot == nullptr) {
                        // if no slot is available, we defer this task for processing later
                        SRV_DBG("no slot is available, defer task, id_task = %d\n", task.id);
-                       queue_tasks.defer(task);
+                       queue_tasks.defer(std::move(task));
                        break;
                    }
                    if (slot->is_processing()) {
                        // if requested slot is unavailable, we defer this task for processing later
                        SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id);
-                       queue_tasks.defer(task);
+                       queue_tasks.defer(std::move(task));
                        break;
                    }
 
-                   if (!launch_slot_with_task(*slot, task)) {
+                   if (!launch_slot_with_task(*slot, std::move(task))) {
                        SRV_ERR("failed to launch slot with task, id_task = %d\n", task.id);
                        break;
                    }
@@ -2683,6 +2777,10 @@ struct server_context {
                } break;
            case SERVER_TASK_TYPE_SLOT_SAVE:
                {
+                   if (!ensure_no_mtmd(task.id)) {
+                       break;
+                   }
+
                    int id_slot = task.slot_action.slot_id;
                    server_slot * slot = get_slot_by_id(id_slot);
                    if (slot == nullptr) {
@@ -2692,7 +2790,7 @@ struct server_context {
                    if (slot->is_processing()) {
                        // if requested slot is unavailable, we defer this task for processing later
                        SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id);
-                       queue_tasks.defer(task);
+                       queue_tasks.defer(std::move(task));
                        break;
                    }
 
@@ -2702,7 +2800,8 @@ struct server_context {
                    std::string filename = task.slot_action.filename;
                    std::string filepath = task.slot_action.filepath;
 
-                   const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id, slot->cache_tokens.data(), token_count);
+                   const llama_tokens & tokens = slot->cache_tokens.get_text_tokens();
+                   const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id, tokens.data(), token_count);
 
                    const int64_t t_end = ggml_time_us();
                    const double t_save_ms = (t_end - t_start) / 1000.0;
@@ -2719,6 +2818,7 @@ struct server_context {
                } break;
            case SERVER_TASK_TYPE_SLOT_RESTORE:
                {
+                   if (!ensure_no_mtmd(task.id)) break;
                    int id_slot = task.slot_action.slot_id;
                    server_slot * slot = get_slot_by_id(id_slot);
                    if (slot == nullptr) {
@@ -2728,7 +2828,7 @@ struct server_context {
                    if (slot->is_processing()) {
                        // if requested slot is unavailable, we defer this task for processing later
                        SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id);
-                       queue_tasks.defer(task);
+                       queue_tasks.defer(std::move(task));
                        break;
                    }
 
@@ -2737,15 +2837,18 @@ struct server_context {
                    std::string filename = task.slot_action.filename;
                    std::string filepath = task.slot_action.filepath;
 
-                   slot->cache_tokens.resize(slot->n_ctx);
+                   llama_tokens tokens;
+                   tokens.resize(slot->n_ctx);
                    size_t token_count = 0;
-                   size_t nread = llama_state_seq_load_file(ctx, filepath.c_str(), slot->id, slot->cache_tokens.data(), slot->cache_tokens.size(), &token_count);
+                   size_t nread = llama_state_seq_load_file(ctx, filepath.c_str(), slot->id, tokens.data(), tokens.size(), &token_count);
                    if (nread == 0) {
-                       slot->cache_tokens.resize(0);
+                       slot->cache_tokens.clear(); // KV may already been invalidated?
                        send_error(task, "Unable to restore slot, no available space in KV cache or invalid slot save file", ERROR_TYPE_INVALID_REQUEST);
                        break;
                    }
-                   slot->cache_tokens.resize(token_count);
+                   tokens.resize(token_count);
+                   slot->cache_tokens.clear();
+                   slot->cache_tokens.insert(tokens);
 
                    const int64_t t_end = ggml_time_us();
                    const double t_restore_ms = (t_end - t_start) / 1000.0;
@@ -2762,6 +2865,7 @@ struct server_context {
                } break;
            case SERVER_TASK_TYPE_SLOT_ERASE:
                {
+                   if (!ensure_no_mtmd(task.id)) break;
                    int id_slot = task.slot_action.slot_id;
                    server_slot * slot = get_slot_by_id(id_slot);
                    if (slot == nullptr) {
@@ -2771,7 +2875,7 @@ struct server_context {
                    if (slot->is_processing()) {
                        // if requested slot is unavailable, we defer this task for processing later
                        SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id);
-                       queue_tasks.defer(task);
+                       queue_tasks.defer(std::move(task));
                        break;
                    }
 
@@ -2793,6 +2897,7 @@ struct server_context {
                    res->id = task.id;
                    queue_results.send(std::move(res));
                } break;
+
        }
     }
 
```
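The slot save/restore/erase tasks now round-trip through `server_tokens::get_text_tokens()` and are rejected outright under multimodal via `ensure_no_mtmd()`, presumably because image chunks cannot be serialized through `llama_state_seq_save_file`. For text-only models the endpoints behave as before; a sketch (assuming the server was started with `--slot-save-path`):

```sh
# save slot 0's KV cache plus cached tokens to a file
curl -X POST "http://localhost:8080/slots/0?action=save" \
  -H "Content-Type: application/json" \
  -d '{"filename": "slot0.bin"}'

# restore it into the same slot later
curl -X POST "http://localhost:8080/slots/0?action=restore" \
  -H "Content-Type: application/json" \
  -d '{"filename": "slot0.bin"}'
```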
```diff
@@ -2823,7 +2928,7 @@ struct server_context {
 
             server_task task(SERVER_TASK_TYPE_NEXT_RESPONSE);
             task.id = queue_tasks.get_new_id();
-            queue_tasks.post(task);
+            queue_tasks.post(std::move(task));
         }
 
         // apply context-shift if needed
@@ -2838,6 +2943,12 @@ struct server_context {
                     continue;
                 }
 
+                if (mctx) {
+                    // we should never reach this because params_base.ctx_shift is automatically disabled if mmproj is loaded
+                    // we don't support ctx_shift because an image chunk may contains multiple tokens
+                    GGML_ABORT("not supported by multimodal");
+                }
+
                 // Shift context
                 const int n_keep = slot.params.n_keep + add_bos_token;
                 const int n_left = slot.n_past - n_keep;
@@ -2848,12 +2959,16 @@ struct server_context {
                 llama_kv_self_seq_rm (ctx, slot.id, n_keep            , n_keep + n_discard);
                 llama_kv_self_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard);
 
-                if (slot.params.cache_prompt) {
-                    for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) {
-                        slot.cache_tokens[i - n_discard] = slot.cache_tokens[i];
+                // add generated tokens to cache
+                {
+                    llama_tokens new_tokens = slot.cache_tokens.get_text_tokens(); // copy
+                    for (size_t i = n_keep + n_discard; i < new_tokens.size(); i++) {
+                        new_tokens[i - n_discard] = new_tokens[i];
                     }
 
-                    slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard);
+                    new_tokens.resize(slot.cache_tokens.size() - n_discard);
+                    slot.cache_tokens.clear();
+                    slot.cache_tokens.insert(new_tokens);
                 }
 
                 slot.n_past -= n_discard;
@@ -2890,10 +3005,7 @@ struct server_context {
             common_batch_add(batch, slot.sampled, slot.n_past, { slot.id }, true);
 
             slot.n_past += 1;
-
-            if (slot.params.cache_prompt) {
-                slot.cache_tokens.push_back(slot.sampled);
-            }
+            slot.cache_tokens.push_back(slot.sampled);
 
             SLT_DBG(slot, "slot decode token, n_ctx = %d, n_past = %d, n_cache_tokens = %d, truncated = %d\n",
                     slot.n_ctx, slot.n_past, (int) slot.cache_tokens.size(), slot.truncated);
@@ -2931,7 +3043,7 @@ struct server_context {
                     SLT_INF(slot, "new prompt, n_ctx_slot = %d, n_keep = %d, n_prompt_tokens = %d\n", slot.n_ctx, slot.params.n_keep, slot.n_prompt_tokens);
 
                     // print prompt tokens (for debugging)
-                    if (1) {
+                    /*if (1) {
                         // first 16 tokens (avoid flooding logs)
                         for (int i = 0; i < std::min<int>(16, prompt_tokens.size()); i++) {
                             SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str());
@@ -2941,7 +3053,7 @@ struct server_context {
                         for (int i = 0; i < (int) prompt_tokens.size(); i++) {
                             SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str());
                         }
-                    }
+                    }*/
 
                     // empty prompt passed -> release the slot and send empty response
                     if (prompt_tokens.empty()) {
@@ -2983,21 +3095,27 @@ struct server_context {
 
                     // if input prompt is too big, truncate it
                     if (slot.n_prompt_tokens >= slot.n_ctx) {
+                        if (mctx) {
+                            // we should never reach this
+                            GGML_ABORT("not supported by multimodal");
+                        }
                        const int n_left = slot.n_ctx - slot.params.n_keep;
 
                        const int n_block_size = n_left / 2;
                        const int erased_blocks = (slot.n_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size;
 
+                        const llama_tokens & curr_tokens = slot.prompt_tokens.get_text_tokens();
                        llama_tokens new_tokens(
-                                curr_tokens.begin(),
-                                curr_tokens.begin() + slot.params.n_keep);
+                                curr_tokens.begin(),
+                                curr_tokens.begin() + slot.params.n_keep);
 
                        new_tokens.insert(
                                new_tokens.end(),
-                                curr_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size,
-                                curr_tokens.end());
+                                curr_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size,
+                                curr_tokens.end());
 
-                        prompt_tokens = std::move(new_tokens);
+                        prompt_tokens.clear();
+                        prompt_tokens.insert(new_tokens);
 
                        slot.truncated = true;
                        slot.n_prompt_tokens = prompt_tokens.size();
@@ -3009,13 +3127,18 @@ struct server_context {
 
                    if (slot.params.cache_prompt) {
                        // reuse any previously computed tokens that are common with the new prompt
-                        slot.n_past = longest_common_prefix(slot.cache_tokens, prompt_tokens);
+                        slot.n_past = slot.cache_tokens.get_common_prefix(prompt_tokens);
 
                        // reuse chunks from the cached prompt by shifting their KV cache in the new position
                        if (params_base.n_cache_reuse > 0) {
                            size_t head_c = slot.n_past; // cache
                            size_t head_p = slot.n_past; // current prompt
 
+                            if (mctx) {
+                                // we should never reach this
+                                GGML_ABORT("not supported by multimodal");
+                            }
+
                            SLT_DBG(slot, "trying to reuse chunks with size > %d, slot.n_past = %d\n", params_base.n_cache_reuse, slot.n_past);
 
                            while (head_c < slot.cache_tokens.size() &&
@@ -3041,7 +3164,7 @@ struct server_context {
                                llama_kv_self_seq_add(ctx, slot.id, head_c, head_c + n_match, kv_shift);
 
                                for (size_t i = 0; i < n_match; i++) {
-                                    slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i];
+                                    slot.cache_tokens.set_token(head_p + i, slot.cache_tokens[head_c + i]);
                                    slot.n_past++;
                                }
 
@@ -3054,6 +3177,11 @@ struct server_context {
 
                            SLT_DBG(slot, "after context reuse, new slot.n_past = %d\n", slot.n_past);
                        }
+                    } else {
+                        // if we don't cache the prompt, we have to remove the entire KV cache
+                        llama_kv_self_seq_rm(ctx, slot.id, 0, -1);
+                        slot.n_past = 0;
+                        slot.cache_tokens.clear();
                    }
                }
 
@@ -3087,23 +3215,53 @@ struct server_context {
                    SLT_INF(slot, "kv cache rm [%d, end)\n", slot.n_past);
 
                    // remove the non-common part from the cache
-                    slot.cache_tokens.resize(slot.n_past);
+                    slot.cache_tokens.keep_first(slot.n_past);
+
+                    // check if we should process the image
+                    if (slot.n_past < slot.n_prompt_tokens
+                            && slot.prompt_tokens[slot.n_past] == LLAMA_TOKEN_NULL) {
+                        // process the image
+                        int32_t new_n_past;
+                        int32_t res = slot.prompt_tokens.process_chunk(ctx, mctx, slot.n_past, slot.id, new_n_past);
+                        int32_t n_pos = new_n_past - slot.n_past;
+
+                        if (res != 0) {
+                            SLT_ERR(slot, "failed to process image, res = %d\n", res);
+                            slot.release();
+                            send_error(slot, "failed to process image", ERROR_TYPE_SERVER);
+                            continue;
+                        }
+
+                        // add the image chunk to cache
+                        {
+                            const auto & chunk = slot.prompt_tokens.find_chunk(slot.n_past);
+                            slot.cache_tokens.push_back(chunk.get()); // copy
+                        }
+
+                        slot.n_past += n_pos;
+                        slot.n_prompt_tokens_processed += n_pos;
+                    }
 
                    // add prompt tokens for processing in the current batch
                    while (slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch) {
+                        // get next token to process
+                        llama_token cur_tok = slot.prompt_tokens[slot.n_past];
+                        if (cur_tok == LLAMA_TOKEN_NULL) {
+                            break; // end of text chunk
+                        }
+
                        // without pooling, we want to output the embeddings for all the tokens in the batch
                        const bool need_embd = slot.task_type == SERVER_TASK_TYPE_EMBEDDING && llama_pooling_type(slot.ctx) == LLAMA_POOLING_TYPE_NONE;
 
-                        common_batch_add(batch, prompt_tokens[slot.n_past], slot.n_past, { slot.id }, need_embd);
-
-                        if (slot.params.cache_prompt) {
-                            slot.cache_tokens.push_back(prompt_tokens[slot.n_past]);
-                        }
+                        common_batch_add(batch, cur_tok, slot.n_past, { slot.id }, need_embd);
+                        slot.cache_tokens.push_back(cur_tok);
 
                        slot.n_prompt_tokens_processed++;
                        slot.n_past++;
                    }
 
+                    // SLT_INF(slot, "new cache_tokens: %s\n", slot.cache_tokens.str().c_str());
+
                    SLT_INF(slot, "prompt processing progress, n_past = %d, n_tokens = %d, progress = %f\n", slot.n_past, batch.n_tokens, (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens);
 
                    // entire prompt has been processed
@@ -3111,12 +3269,16 @@ struct server_context {
                        slot.state = SLOT_STATE_DONE_PROMPT;
 
                        GGML_ASSERT(batch.n_tokens > 0);
+                        GGML_ASSERT((size_t) slot.n_prompt_tokens == slot.prompt_tokens.size());
 
                        common_sampler_reset(slot.smpl);
 
                        // Process all prompt tokens through sampler system
                        for (int i = 0; i < slot.n_prompt_tokens; ++i) {
-                            common_sampler_accept(slot.smpl, prompt_tokens[i], false);
+                            llama_token id = slot.prompt_tokens[i];
+                            if (id != LLAMA_TOKEN_NULL) {
+                                common_sampler_accept(slot.smpl, id, false);
+                            }
                        }
 
                        // extract the logits only for the last token
```
@@ -3163,7 +3325,14 @@ struct server_context {
|
|
|
3163
3325
|
batch.logits + i,
|
|
3164
3326
|
};
|
|
3165
3327
|
|
|
3166
|
-
|
|
3328
|
+
int ret = 0;
|
|
3329
|
+
|
|
3330
|
+
if (params_base.embedding || params_base.reranking) {
|
|
3331
|
+
ret = llama_encode(ctx, batch_view);
|
|
3332
|
+
} else {
|
|
3333
|
+
ret = llama_decode(ctx, batch_view);
|
|
3334
|
+
}
|
|
3335
|
+
|
|
3167
3336
|
metrics.on_decoded(slots);
|
|
3168
3337
|
|
|
3169
3338
|
if (ret != 0) {
|
|
@@ -3262,6 +3431,11 @@ struct server_context {
|
|
|
3262
3431
|
continue;
|
|
3263
3432
|
}
|
|
3264
3433
|
|
|
3434
|
+
if (mctx) {
|
|
3435
|
+
// we should never reach this, as speculative is automatically disabled if mmproj is loaded
|
|
3436
|
+
GGML_ABORT("not supported by multimodal");
|
|
3437
|
+
}
|
|
3438
|
+
|
|
3265
3439
|
// determine the max draft that fits the current slot state
|
|
3266
3440
|
int n_draft_max = slot.params.speculative.n_max;
|
|
3267
3441
|
|
|
@@ -3288,7 +3462,11 @@ struct server_context {
|
|
|
3288
3462
|
params_spec.n_reuse = llama_n_ctx(slot.ctx_dft) - slot.params.speculative.n_max;
|
|
3289
3463
|
params_spec.p_min = slot.params.speculative.p_min;
|
|
3290
3464
|
|
|
3291
|
-
llama_tokens
|
|
3465
|
+
const llama_tokens & cached_text_tokens = slot.cache_tokens.get_text_tokens();
|
|
3466
|
+
llama_tokens draft = common_speculative_gen_draft(slot.spec, params_spec, cached_text_tokens, id);
|
|
3467
|
+
|
|
3468
|
+
// keep track of total number of tokens generated in the draft
|
|
3469
|
+
slot.n_draft_total += draft.size();
|
|
3292
3470
|
|
|
3293
3471
|
// ignore small drafts
|
|
3294
3472
|
if (slot.params.speculative.n_min > (int) draft.size()) {
|
|
@@ -3315,8 +3493,11 @@ struct server_context {
|
|
|
3315
3493
|
slot.n_past += ids.size();
|
|
3316
3494
|
slot.n_decoded += ids.size();
|
|
3317
3495
|
|
|
3496
|
+
// update how many tokens out of draft was accepted
|
|
3497
|
+
slot.n_draft_accepted += ids.size() - 1;
|
|
3498
|
+
|
|
3318
3499
|
slot.cache_tokens.push_back(id);
|
|
3319
|
-
slot.cache_tokens.insert(
|
|
3500
|
+
slot.cache_tokens.insert({ids.begin(), ids.end() - 1});
|
|
3320
3501
|
|
|
3321
3502
|
llama_kv_self_seq_rm(ctx, slot.id, slot.n_past, -1);
|
|
3322
3503
|
|
|
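The n_draft_total / n_draft_accepted counters added above make the speculative-decoding hit rate observable per slot. A self-contained sketch of turning them into a percentage (the struct is a stand-in for the slot, not the server's type):

    #include <cstdio>

    struct slot_stats {
        int n_draft_total    = 0; // tokens proposed by the draft model
        int n_draft_accepted = 0; // proposals the target model agreed with
    };

    int main() {
        slot_stats s;
        s.n_draft_total    = 48;
        s.n_draft_accepted = 31;
        if (s.n_draft_total > 0) {
            std::printf("draft acceptance: %.1f%%\n",
                        100.0 * s.n_draft_accepted / s.n_draft_total);
        }
        return 0;
    }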
@@ -3534,6 +3715,9 @@ int main(int argc, char ** argv) {
        if (req.path == "/" || tmp.back() == "html") {
            res.set_content(reinterpret_cast<const char*>(loading_html), loading_html_len, "text/html; charset=utf-8");
            res.status = 503;
+        } else if (req.path == "/models" || req.path == "/v1/models") {
+            // allow the models endpoint to be accessed during loading
+            return true;
        } else {
            res_error(res, format_error_response("Loading model", ERROR_TYPE_UNAVAILABLE));
        }

@@ -3579,14 +3763,17 @@ int main(int argc, char ** argv) {
        }

        // request slots data using task queue
-
-
-
-
+        int task_id = ctx_server.queue_tasks.get_new_id();
+        {
+            server_task task(SERVER_TASK_TYPE_METRICS);
+            task.id = task_id;
+            ctx_server.queue_results.add_waiting_task_id(task_id);
+            ctx_server.queue_tasks.post(std::move(task), true); // high-priority task
+        }

        // get the result
-        server_task_result_ptr result = ctx_server.queue_results.recv(
-        ctx_server.queue_results.remove_waiting_task_id(
+        server_task_result_ptr result = ctx_server.queue_results.recv(task_id);
+        ctx_server.queue_results.remove_waiting_task_id(task_id);

        if (result->is_error()) {
            res_error(res, result->to_json());
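Every handler rewritten in this part of the diff follows the same shape: reserve a task_id up front, post the task inside a scope so the moved-from task cannot be touched afterwards, then block on recv(task_id). A toy, self-contained version of that post/recv hand-off; the queue class below is illustrative, not the server's real queue types:

    #include <condition_variable>
    #include <cstdio>
    #include <mutex>
    #include <queue>
    #include <thread>
    #include <utility>

    struct result_queue {
        std::mutex m;
        std::condition_variable cv;
        std::queue<std::pair<int, int>> results; // (task_id, payload)

        void send(int id, int payload) {
            { std::lock_guard<std::mutex> lk(m); results.push({id, payload}); }
            cv.notify_all();
        }
        // blocks until the front result matches the id (fine for this toy,
        // which only ever has one task in flight)
        int recv(int id) {
            std::unique_lock<std::mutex> lk(m);
            cv.wait(lk, [&] { return !results.empty() && results.front().first == id; });
            int payload = results.front().second;
            results.pop();
            return payload;
        }
    };

    int main() {
        result_queue q;
        int task_id = 42;                                // stands in for get_new_id()
        std::thread worker([&] { q.send(task_id, 7); }); // stands in for the task loop
        std::printf("result for task %d: %d\n", task_id, q.recv(task_id));
        worker.join();
        return 0;
    }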
@@ -3615,16 +3802,17 @@ int main(int argc, char ** argv) {
        }

        // request slots data using task queue
-
-
-
-
-
-
+        int task_id = ctx_server.queue_tasks.get_new_id();
+        {
+            server_task task(SERVER_TASK_TYPE_METRICS);
+            task.id = task_id;
+            ctx_server.queue_results.add_waiting_task_id(task_id);
+            ctx_server.queue_tasks.post(std::move(task), true); // high-priority task
+        }

        // get the result
-        server_task_result_ptr result = ctx_server.queue_results.recv(
-        ctx_server.queue_results.remove_waiting_task_id(
+        server_task_result_ptr result = ctx_server.queue_results.recv(task_id);
+        ctx_server.queue_results.remove_waiting_task_id(task_id);

        if (result->is_error()) {
            res_error(res, result->to_json());

@@ -3721,17 +3909,20 @@ int main(int argc, char ** argv) {
        }
        std::string filepath = params.slot_save_path + filename;

-
-
-
-
-
+        int task_id = ctx_server.queue_tasks.get_new_id();
+        {
+            server_task task(SERVER_TASK_TYPE_SLOT_SAVE);
+            task.id = task_id;
+            task.slot_action.slot_id = id_slot;
+            task.slot_action.filename = filename;
+            task.slot_action.filepath = filepath;

-
-
+            ctx_server.queue_results.add_waiting_task_id(task_id);
+            ctx_server.queue_tasks.post(std::move(task));
+        }

-        server_task_result_ptr result = ctx_server.queue_results.recv(
-        ctx_server.queue_results.remove_waiting_task_id(
+        server_task_result_ptr result = ctx_server.queue_results.recv(task_id);
+        ctx_server.queue_results.remove_waiting_task_id(task_id);

        if (result->is_error()) {
            res_error(res, result->to_json());

@@ -3750,17 +3941,20 @@ int main(int argc, char ** argv) {
        }
        std::string filepath = params.slot_save_path + filename;

-
-
-
-
-
+        int task_id = ctx_server.queue_tasks.get_new_id();
+        {
+            server_task task(SERVER_TASK_TYPE_SLOT_RESTORE);
+            task.id = task_id;
+            task.slot_action.slot_id = id_slot;
+            task.slot_action.filename = filename;
+            task.slot_action.filepath = filepath;

-
-
+            ctx_server.queue_results.add_waiting_task_id(task_id);
+            ctx_server.queue_tasks.post(std::move(task));
+        }

-        server_task_result_ptr result = ctx_server.queue_results.recv(
-        ctx_server.queue_results.remove_waiting_task_id(
+        server_task_result_ptr result = ctx_server.queue_results.recv(task_id);
+        ctx_server.queue_results.remove_waiting_task_id(task_id);

        if (result->is_error()) {
            res_error(res, result->to_json());

@@ -3772,15 +3966,18 @@ int main(int argc, char ** argv) {
        };

        const auto handle_slots_erase = [&ctx_server, &res_error, &res_ok](const httplib::Request & /* req */, httplib::Response & res, int id_slot) {
-
-
-
+            int task_id = ctx_server.queue_tasks.get_new_id();
+            {
+                server_task task(SERVER_TASK_TYPE_SLOT_ERASE);
+                task.id = task_id;
+                task.slot_action.slot_id = id_slot;

-
-
+                ctx_server.queue_results.add_waiting_task_id(task_id);
+                ctx_server.queue_tasks.post(std::move(task));
+            }

-            server_task_result_ptr result = ctx_server.queue_results.recv(
-            ctx_server.queue_results.remove_waiting_task_id(
+            server_task_result_ptr result = ctx_server.queue_results.recv(task_id);
+            ctx_server.queue_results.remove_waiting_task_id(task_id);

            if (result->is_error()) {
                res_error(res, result->to_json());

@@ -3825,7 +4022,8 @@ int main(int argc, char ** argv) {
        json data = {
            { "default_generation_settings", ctx_server.default_generation_settings_for_props },
            { "total_slots", ctx_server.params_base.n_parallel },
-            { "model_path", ctx_server.params_base.model },
+            { "model_path", ctx_server.params_base.model.path },
+            { "modalities", json{{"vision", ctx_server.mctx != nullptr}} }, // TODO: add more in the future
            { "chat_template", common_chat_templates_source(ctx_server.chat_templates.get()) },
            { "bos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_bos(ctx_server.vocab), /* special= */ true)},
            { "eos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_eos(ctx_server.vocab), /* special= */ true)},

@@ -3853,14 +4051,30 @@ int main(int argc, char ** argv) {
        res_ok(res, {{ "success", true }});
    };

+    const auto handle_api_show = [&ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) {
+        json data = {
+            {
+                "template", common_chat_templates_source(ctx_server.chat_templates.get()),
+            },
+            {
+                "model_info", {
+                    { "llama.context_length", ctx_server.slots.back().n_ctx, },
+                }
+            },
+        };
+
+        res_ok(res, data);
+    };
+
    // handle completion-like requests (completion, chat, infill)
    // we can optionally provide a custom format for partial results and final results
    const auto handle_completions_impl = [&ctx_server, &res_error, &res_ok](
            server_task_type type,
            json & data,
-            std::
+            const std::vector<raw_buffer> & files,
+            const std::function<bool()> & is_connection_closed,
            httplib::Response & res,
-            oaicompat_type oaicompat) {
+            oaicompat_type oaicompat) -> void {
        GGML_ASSERT(type == SERVER_TASK_TYPE_COMPLETION || type == SERVER_TASK_TYPE_INFILL);

        if (ctx_server.params_base.embedding) {
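The new handle_api_show lambda above (routed to POST /api/show later in the diff) exposes a minimal Ollama-style model description. A sketch of the response shape it produces, built with the same nlohmann::json library the server already depends on; the values here are invented for illustration:

    #include <nlohmann/json.hpp>
    #include <iostream>

    int main() {
        nlohmann::json data = {
            { "template",   "{%- for message in messages -%}..." }, // chat template source
            { "model_info", { { "llama.context_length", 4096 } } }, // per-slot n_ctx
        };
        std::cout << data.dump(2) << std::endl;
        return 0;
    }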
@@ -3869,26 +4083,81 @@ int main(int argc, char ** argv) {
        }

        auto completion_id = gen_chatcmplid();
-        std::
-
+        std::unordered_set<int> task_ids;
        try {
+            std::vector<server_task> tasks;
+
            const auto & prompt = data.at("prompt");
            // TODO: this log can become very long, put it behind a flag or think about a more compact format
            //SRV_DBG("Prompt: %s\n", prompt.is_string() ? prompt.get<std::string>().c_str() : prompt.dump(2).c_str());

-
-
-
+            // process files
+            mtmd::bitmaps bitmaps;
+            const bool has_mtmd = ctx_server.mctx != nullptr;
+            {
+                if (!has_mtmd && !files.empty()) {
+                    throw std::runtime_error("This server does not support multimodal");
+                }
+                for (auto & file : files) {
+                    mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(file.data(), file.size()));
+                    if (!bmp.ptr) {
+                        throw std::runtime_error("Failed to load image");
+                    }
+                    // calculate bitmap hash (for KV caching)
+                    std::string hash = fnv_hash(bmp.data(), bmp.nx()*bmp.ny()*3);
+                    bmp.set_id(hash.c_str());
+                    bitmaps.entries.push_back(std::move(bmp));
+                }
+            }
+
+            // process prompt
+            std::vector<server_tokens> inputs;
+            if (oaicompat && !prompt.is_string()) {
+                throw std::runtime_error("prompt must be a string");
+            }
+
+            if (oaicompat && has_mtmd) {
+                // multimodal
+                std::string prompt_str = prompt.get<std::string>();
+                mtmd_input_text inp_txt = {
+                    prompt_str.c_str(),
+                    /* add_special */   true,
+                    /* parse_special */ true,
+                };
+                mtmd::input_chunks chunks(mtmd_input_chunks_init());
+                auto bitmaps_c_ptr = bitmaps.c_ptr();
+                int32_t tokenized = mtmd_tokenize(ctx_server.mctx,
+                                                  chunks.ptr.get(),
+                                                  &inp_txt,
+                                                  bitmaps_c_ptr.data(),
+                                                  bitmaps_c_ptr.size());
+                if (tokenized != 0) {
+                    throw std::runtime_error("Failed to tokenize prompt");
+                }
+
+                server_tokens tmp(chunks, true);
+                inputs.push_back(std::move(tmp));
+            } else {
+                // non-multimodal version
+                auto tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true);
+                for (auto & p : tokenized_prompts) {
+                    auto tmp = server_tokens(p, ctx_server.mctx != nullptr);
+                    inputs.push_back(std::move(tmp));
+                }
+            }
+
+            tasks.reserve(inputs.size());
+            for (size_t i = 0; i < inputs.size(); i++) {
                server_task task = server_task(type);

                task.id = ctx_server.queue_tasks.get_new_id();
                task.index = i;

-                task.prompt_tokens = std::move(
+                task.prompt_tokens = std::move(inputs[i]);
                task.params = server_task::params_from_json_cmpl(
-
-
-
+                    ctx_server.ctx,
+                    ctx_server.params_base,
+                    data);
                task.id_selected_slot = json_value(data, "id_slot", -1);

                // OAI-compat

@@ -3896,18 +4165,18 @@ int main(int argc, char ** argv) {
                task.params.oaicompat_cmpl_id = completion_id;
                // oaicompat_model is already populated by params_from_json_cmpl

-                tasks.push_back(task);
+                tasks.push_back(std::move(task));
            }
+
+            task_ids = server_task::get_list_id(tasks);
+            ctx_server.queue_results.add_waiting_tasks(tasks);
+            ctx_server.queue_tasks.post(std::move(tasks));
        } catch (const std::exception & e) {
            res_error(res, format_error_response(e.what(), ERROR_TYPE_INVALID_REQUEST));
            return;
        }

-        ctx_server.queue_results.add_waiting_tasks(tasks);
-        ctx_server.queue_tasks.post(tasks);
-
        bool stream = json_value(data, "stream", false);
-        const auto task_ids = server_task::get_list_id(tasks);

        if (!stream) {
            ctx_server.receive_multi_results(task_ids, [&](std::vector<server_task_result_ptr> & results) {
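Note the hashing step in the hunk above: each decoded bitmap gets an id derived from its RGB bytes so a resubmitted image can reuse cached KV state. The diff does not show fnv_hash() itself; below is a self-contained FNV-1a sketch of the idea (standard 64-bit FNV constants; the server's actual helper may differ in output format):

    #include <cstdint>
    #include <cstdio>
    #include <string>

    // FNV-1a over a byte buffer, returned as a hex string usable as an id.
    static std::string fnv1a_hex(const uint8_t * data, size_t len) {
        uint64_t h = 0xcbf29ce484222325ull;       // FNV offset basis
        for (size_t i = 0; i < len; ++i) {
            h ^= data[i];
            h *= 0x100000001b3ull;                // FNV prime
        }
        char buf[17];
        std::snprintf(buf, sizeof(buf), "%016llx", (unsigned long long) h);
        return buf;
    }

    int main() {
        const uint8_t rgb[] = { 255, 0, 0, 255, 0, 0 }; // two red pixels
        std::printf("bitmap id: %s\n", fnv1a_hex(rgb, sizeof(rgb)).c_str());
        return 0;
    }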
@@ -3966,9 +4235,11 @@ int main(int argc, char ** argv) {

    const auto handle_completions = [&handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
        json data = json::parse(req.body);
-
+        std::vector<raw_buffer> files; // dummy
+        handle_completions_impl(
            SERVER_TASK_TYPE_COMPLETION,
            data,
+            files,
            req.is_connection_closed,
            res,
            OAICOMPAT_TYPE_NONE);

@@ -3976,9 +4247,11 @@ int main(int argc, char ** argv) {

    const auto handle_completions_oai = [&handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
        json data = oaicompat_completion_params_parse(json::parse(req.body));
-
+        std::vector<raw_buffer> files; // dummy
+        handle_completions_impl(
            SERVER_TASK_TYPE_COMPLETION,
            data,
+            files,
            req.is_connection_closed,
            res,
            OAICOMPAT_TYPE_COMPLETION);

@@ -4053,9 +4326,11 @@ int main(int argc, char ** argv) {
            tokenized_prompts[0]
        );

-
+        std::vector<raw_buffer> files; // dummy
+        handle_completions_impl(
            SERVER_TASK_TYPE_INFILL,
            data,
+            files,
            req.is_connection_closed,
            res,
            OAICOMPAT_TYPE_NONE); // infill is not OAI compatible

@@ -4069,11 +4344,20 @@ int main(int argc, char ** argv) {
        }

        auto body = json::parse(req.body);
-
-
-
+        std::vector<raw_buffer> files;
+        json data = oaicompat_completion_params_parse(
+            body,
+            params.use_jinja,
+            params.prefill_assistant,
+            params.reasoning_format,
+            ctx_server.chat_templates.get(),
+            ctx_server.mctx,
+            files);
+
+        handle_completions_impl(
            SERVER_TASK_TYPE_COMPLETION,
            data,
+            files,
            req.is_connection_closed,
            res,
            OAICOMPAT_TYPE_CHAT);

@@ -4082,20 +4366,34 @@ int main(int argc, char ** argv) {
    // same with handle_chat_completions, but without inference part
    const auto handle_apply_template = [&ctx_server, &params, &res_ok](const httplib::Request & req, httplib::Response & res) {
        auto body = json::parse(req.body);
-
+        std::vector<raw_buffer> files; // dummy, unused
+        json data = oaicompat_completion_params_parse(
+            body,
+            params.use_jinja,
+            params.prefill_assistant,
+            params.reasoning_format,
+            ctx_server.chat_templates.get(),
+            ctx_server.mctx,
+            files);
        res_ok(res, {{ "prompt", std::move(data.at("prompt")) }});
    };

-    const auto handle_models = [&params, &ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) {
+    const auto handle_models = [&params, &ctx_server, &state, &res_ok](const httplib::Request &, httplib::Response & res) {
+        server_state current_state = state.load();
+        json model_meta = nullptr;
+        if (current_state == SERVER_STATE_READY) {
+            model_meta = ctx_server.model_meta();
+        }
+
        json models = {
            {"object", "list"},
            {"data", {
                {
-                    {"id", params.model_alias.empty() ? params.model : params.model_alias},
+                    {"id", params.model_alias.empty() ? params.model.path : params.model_alias},
                    {"object", "model"},
                    {"created", std::time(0)},
                    {"owned_by", "llamacpp"},
-                    {"meta",
+                    {"meta", model_meta},
                },
            }}
        };

@@ -4187,7 +4485,7 @@ int main(int argc, char ** argv) {
            }
        }

-
+        auto tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true);
        for (const auto & tokens : tokenized_prompts) {
            // this check is necessary for models that do not add BOS token to the input
            if (tokens.empty()) {

@@ -4199,6 +4497,7 @@ int main(int argc, char ** argv) {
        // create and queue the task
        json responses = json::array();
        bool error = false;
+        std::unordered_set<int> task_ids;
        {
            std::vector<server_task> tasks;
            for (size_t i = 0; i < tokenized_prompts.size(); i++) {

@@ -4206,32 +4505,31 @@ int main(int argc, char ** argv) {

                task.id = ctx_server.queue_tasks.get_new_id();
                task.index = i;
-                task.prompt_tokens =
+                task.prompt_tokens = server_tokens(tokenized_prompts[i], ctx_server.mctx != nullptr);

                // OAI-compat
                task.params.oaicompat = oaicompat;

-                tasks.push_back(task);
+                tasks.push_back(std::move(task));
            }

+            task_ids = server_task::get_list_id(tasks);
            ctx_server.queue_results.add_waiting_tasks(tasks);
-            ctx_server.queue_tasks.post(tasks);
-
-            // get the result
-            std::unordered_set<int> task_ids = server_task::get_list_id(tasks);
+            ctx_server.queue_tasks.post(std::move(tasks));
+        }

-
-
-
-
-
-        }
-
-
-
+        // get the result
+        ctx_server.receive_multi_results(task_ids, [&](std::vector<server_task_result_ptr> & results) {
+            for (auto & res : results) {
+                GGML_ASSERT(dynamic_cast<server_task_result_embd*>(res.get()) != nullptr);
+                responses.push_back(res->to_json());
+            }
+        }, [&](const json & error_data) {
+            res_error(res, error_data);
+            error = true;
+        }, req.is_connection_closed);

-
-        }
+        ctx_server.queue_results.remove_waiting_task_ids(task_ids);

        if (error) {
            return;

@@ -4298,35 +4596,35 @@ int main(int argc, char ** argv) {
        // create and queue the task
        json responses = json::array();
        bool error = false;
+        std::unordered_set<int> task_ids;
        {
            std::vector<server_task> tasks;
-
+            auto tokenized_docs = tokenize_input_prompts(ctx_server.vocab, documents, /* add_special */ false, true);
            tasks.reserve(tokenized_docs.size());
            for (size_t i = 0; i < tokenized_docs.size(); i++) {
+                auto tmp = format_rerank(ctx_server.vocab, tokenized_query, tokenized_docs[i]);
                server_task task = server_task(SERVER_TASK_TYPE_RERANK);
                task.id = ctx_server.queue_tasks.get_new_id();
                task.index = i;
-                task.prompt_tokens =
-                tasks.push_back(task);
+                task.prompt_tokens = server_tokens(tmp, ctx_server.mctx != nullptr);
+                tasks.push_back(std::move(task));
            }

+            task_ids = server_task::get_list_id(tasks);
            ctx_server.queue_results.add_waiting_tasks(tasks);
-            ctx_server.queue_tasks.post(tasks);
-
-            // get the result
-            std::unordered_set<int> task_ids = server_task::get_list_id(tasks);
-
-            ctx_server.receive_multi_results(task_ids, [&](std::vector<server_task_result_ptr> & results) {
-                for (auto & res : results) {
-                    GGML_ASSERT(dynamic_cast<server_task_result_rerank*>(res.get()) != nullptr);
-                    responses.push_back(res->to_json());
-                }
-            }, [&](const json & error_data) {
-                res_error(res, error_data);
-                error = true;
-            }, req.is_connection_closed);
+            ctx_server.queue_tasks.post(std::move(tasks));
        }

+        ctx_server.receive_multi_results(task_ids, [&](std::vector<server_task_result_ptr> & results) {
+            for (auto & res : results) {
+                GGML_ASSERT(dynamic_cast<server_task_result_rerank*>(res.get()) != nullptr);
+                responses.push_back(res->to_json());
+            }
+        }, [&](const json & error_data) {
+            res_error(res, error_data);
+            error = true;
+        }, req.is_connection_closed);
+
        if (error) {
            return;
        }

@@ -4362,14 +4660,19 @@ int main(int argc, char ** argv) {
        res_error(res, format_error_response("Request body must be an array", ERROR_TYPE_INVALID_REQUEST));
        return;
    }
-    server_task task(SERVER_TASK_TYPE_SET_LORA);
-    task.id = ctx_server.queue_tasks.get_new_id();
-    task.set_lora = parse_lora_request(ctx_server.params_base.lora_adapters, body);
-    ctx_server.queue_results.add_waiting_task_id(task.id);
-    ctx_server.queue_tasks.post(task);

-
-
+    int task_id = ctx_server.queue_tasks.get_new_id();
+    {
+        server_task task(SERVER_TASK_TYPE_SET_LORA);
+        task.id = task_id;
+        task.set_lora = parse_lora_request(ctx_server.params_base.lora_adapters, body);
+        ctx_server.queue_results.add_waiting_task_id(task_id);
+        ctx_server.queue_tasks.post(std::move(task));
+    }
+
+    // get the result
+    server_task_result_ptr result = ctx_server.queue_results.recv(task_id);
+    ctx_server.queue_results.remove_waiting_task_id(task_id);

    if (result->is_error()) {
        res_error(res, result->to_json());

@@ -4417,6 +4720,7 @@ int main(int argc, char ** argv) {
    svr->Get ("/metrics", handle_metrics);
    svr->Get ("/props", handle_props);
    svr->Post("/props", handle_props_change);
+    svr->Post("/api/show", handle_api_show);
    svr->Get ("/models", handle_models); // public endpoint (no API key check)
    svr->Get ("/v1/models", handle_models); // public endpoint (no API key check)
    svr->Post("/completion", handle_completions); // legacy

@@ -4453,21 +4757,31 @@ int main(int argc, char ** argv) {
    svr->new_task_queue = [&params] { return new httplib::ThreadPool(params.n_threads_http); };

    // clean up function, to be called before exit
-    auto clean_up = [&svr]() {
+    auto clean_up = [&svr, &ctx_server]() {
        SRV_INF("%s: cleaning up before exit...\n", __func__);
        svr->stop();
+        ctx_server.queue_results.terminate();
        llama_backend_free();
    };

-    // bind HTTP listen port
    bool was_bound = false;
-    if (params.
-
-
-
-
+    if (string_ends_with(std::string(params.hostname), ".sock")) {
+        LOG_INF("%s: setting address family to AF_UNIX\n", __func__);
+        svr->set_address_family(AF_UNIX);
+        // bind_to_port requires a second arg, any value other than 0 should
+        // simply get ignored
+        was_bound = svr->bind_to_port(params.hostname, 8080);
    } else {
-
+        LOG_INF("%s: binding port with default address family\n", __func__);
+        // bind HTTP listen port
+        if (params.port == 0) {
+            int bound_port = svr->bind_to_any_port(params.hostname);
+            if ((was_bound = (bound_port >= 0))) {
+                params.port = bound_port;
+            }
+        } else {
+            was_bound = svr->bind_to_port(params.hostname, params.port);
+        }
    }

    if (!was_bound) {
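The rewritten binding logic above dispatches on the hostname: a value ending in ".sock" is treated as a UNIX domain socket path, anything else goes through the usual TCP bind, including the new port-0 "pick any free port" case. A self-contained sketch of the suffix check; ends_with here mirrors what the tree's string_ends_with helper does:

    #include <cstdio>
    #include <string>

    static bool ends_with(const std::string & s, const std::string & suf) {
        return s.size() >= suf.size() &&
               s.compare(s.size() - suf.size(), suf.size(), suf) == 0;
    }

    int main() {
        for (const char * host : { "127.0.0.1", "/tmp/llama.sock" }) {
            std::printf("%-16s -> %s\n", host,
                        ends_with(host, ".sock") ? "AF_UNIX" : "default family");
        }
        return 0;
    }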
@@ -4487,7 +4801,7 @@ int main(int argc, char ** argv) {

    if (!ctx_server.load_model(params)) {
        clean_up();
-
+        t.join();
        LOG_ERR("%s: exiting due to model loading error\n", __func__);
        return 1;
    }

@@ -4502,8 +4816,8 @@ int main(int argc, char ** argv) {
        common_chat_templates_source(ctx_server.chat_templates.get()),
        common_chat_format_example(ctx_server.chat_templates.get(), ctx_server.params_base.use_jinja).c_str());

-    ctx_server.queue_tasks.on_new_task([&ctx_server](
-        ctx_server.process_single_task(task);
+    ctx_server.queue_tasks.on_new_task([&ctx_server](server_task && task) {
+        ctx_server.process_single_task(std::move(task));
    });

    ctx_server.queue_tasks.on_update_slots([&ctx_server]() {
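The callback change above is about ownership: on_new_task now receives the task by rvalue reference, so the queue hands it (prompt tokens, multimodal chunks and all) to the worker without copying. A self-contained sketch of the pattern with illustrative types:

    #include <cstdio>
    #include <functional>
    #include <string>
    #include <utility>

    struct task { std::string payload; };

    int main() {
        std::function<void(task &&)> on_new_task = [](task && t) {
            task local = std::move(t); // take ownership, no copy of payload
            std::printf("processing %zu bytes\n", local.payload.size());
        };
        task t{ std::string(1 << 20, 'x') }; // a large prompt stand-in
        on_new_task(std::move(t));
        return 0;
    }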
@@ -4535,7 +4849,7 @@ int main(int argc, char ** argv) {
    ctx_server.queue_tasks.start_loop();

    clean_up();
-
+    t.join();

    return 0;
}