@fugood/llama.node 0.3.1 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +1 -8
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +4 -2
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/EmbeddingWorker.cpp +2 -2
- package/src/LlamaCompletionWorker.cpp +10 -10
- package/src/LlamaCompletionWorker.h +2 -2
- package/src/LlamaContext.cpp +14 -17
- package/src/TokenizeWorker.cpp +1 -1
- package/src/common.hpp +5 -4
- package/src/llama.cpp/.github/workflows/build.yml +137 -29
- package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
- package/src/llama.cpp/.github/workflows/docker.yml +46 -34
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
- package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
- package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
- package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
- package/src/llama.cpp/.github/workflows/server.yml +7 -0
- package/src/llama.cpp/CMakeLists.txt +26 -11
- package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
- package/src/llama.cpp/common/CMakeLists.txt +10 -10
- package/src/llama.cpp/common/arg.cpp +2041 -0
- package/src/llama.cpp/common/arg.h +77 -0
- package/src/llama.cpp/common/common.cpp +523 -1861
- package/src/llama.cpp/common/common.h +234 -106
- package/src/llama.cpp/common/console.cpp +3 -0
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
- package/src/llama.cpp/common/log.cpp +401 -0
- package/src/llama.cpp/common/log.h +66 -698
- package/src/llama.cpp/common/ngram-cache.cpp +39 -36
- package/src/llama.cpp/common/ngram-cache.h +19 -19
- package/src/llama.cpp/common/sampling.cpp +356 -350
- package/src/llama.cpp/common/sampling.h +62 -139
- package/src/llama.cpp/common/stb_image.h +5990 -6398
- package/src/llama.cpp/docs/build.md +72 -17
- package/src/llama.cpp/examples/CMakeLists.txt +1 -2
- package/src/llama.cpp/examples/batched/batched.cpp +49 -65
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +42 -53
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +22 -22
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
- package/src/llama.cpp/examples/embedding/embedding.cpp +147 -91
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +37 -37
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +39 -38
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
- package/src/llama.cpp/examples/{baby-llama → gen-docs}/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +46 -39
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +75 -69
- package/src/llama.cpp/examples/infill/infill.cpp +131 -192
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +276 -178
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +40 -36
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +686 -150
- package/src/llama.cpp/examples/llava/clip.h +11 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +60 -71
- package/src/llama.cpp/examples/llava/llava.cpp +146 -26
- package/src/llama.cpp/examples/llava/llava.h +2 -3
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
- package/src/llama.cpp/examples/llava/requirements.txt +1 -0
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +55 -56
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +15 -13
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +34 -33
- package/src/llama.cpp/examples/lookup/lookup.cpp +60 -63
- package/src/llama.cpp/examples/main/main.cpp +216 -313
- package/src/llama.cpp/examples/parallel/parallel.cpp +58 -59
- package/src/llama.cpp/examples/passkey/passkey.cpp +53 -61
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +277 -311
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -12
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +57 -52
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +27 -2
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +60 -46
- package/src/llama.cpp/examples/server/CMakeLists.txt +7 -18
- package/src/llama.cpp/examples/server/server.cpp +1347 -1531
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
- package/src/llama.cpp/examples/server/utils.hpp +396 -107
- package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple/simple.cpp +132 -106
- package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
- package/src/llama.cpp/examples/speculative/speculative.cpp +153 -124
- package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +27 -29
- package/src/llama.cpp/ggml/CMakeLists.txt +29 -12
- package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +166 -68
- package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +17 -19
- package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
- package/src/llama.cpp/ggml/include/ggml-cuda.h +17 -17
- package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
- package/src/llama.cpp/ggml/include/ggml-metal.h +13 -12
- package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
- package/src/llama.cpp/ggml/include/ggml.h +272 -505
- package/src/llama.cpp/ggml/src/CMakeLists.txt +69 -1110
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +52 -2116
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
- package/src/llama.cpp/ggml/src/ggml-alloc.c +29 -27
- package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
- package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
- package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +144 -81
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
- package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +394 -635
- package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
- package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +217 -70
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
- package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +458 -353
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
- package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +371 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1885 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +380 -584
- package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
- package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +233 -87
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
- package/src/llama.cpp/ggml/src/ggml-quants.c +369 -9994
- package/src/llama.cpp/ggml/src/ggml-quants.h +78 -110
- package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
- package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +560 -335
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +51 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +310 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +18 -25
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
- package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3350 -3980
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +70 -68
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +9 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +8 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
- package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
- package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2034 -1718
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +152 -185
- package/src/llama.cpp/ggml/src/ggml.c +2075 -16579
- package/src/llama.cpp/include/llama.h +296 -285
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
- package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
- package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
- package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
- package/src/llama.cpp/src/CMakeLists.txt +2 -1
- package/src/llama.cpp/src/llama-grammar.cpp +721 -122
- package/src/llama.cpp/src/llama-grammar.h +120 -15
- package/src/llama.cpp/src/llama-impl.h +156 -1
- package/src/llama.cpp/src/llama-sampling.cpp +2058 -346
- package/src/llama.cpp/src/llama-sampling.h +39 -47
- package/src/llama.cpp/src/llama-vocab.cpp +390 -127
- package/src/llama.cpp/src/llama-vocab.h +60 -20
- package/src/llama.cpp/src/llama.cpp +6215 -3263
- package/src/llama.cpp/src/unicode-data.cpp +6 -4
- package/src/llama.cpp/src/unicode-data.h +4 -4
- package/src/llama.cpp/src/unicode.cpp +15 -7
- package/src/llama.cpp/tests/CMakeLists.txt +4 -2
- package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
- package/src/llama.cpp/tests/test-backend-ops.cpp +1725 -297
- package/src/llama.cpp/tests/test-barrier.cpp +94 -0
- package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
- package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
- package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +23 -8
- package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
- package/src/llama.cpp/tests/test-log.cpp +39 -0
- package/src/llama.cpp/tests/test-opt.cpp +853 -142
- package/src/llama.cpp/tests/test-quantize-fns.cpp +28 -19
- package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
- package/src/llama.cpp/tests/test-rope.cpp +2 -1
- package/src/llama.cpp/tests/test-sampling.cpp +226 -142
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +56 -36
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
- package/patches/llama.patch +0 -22
- package/src/llama.cpp/.github/workflows/bench.yml +0 -310
- package/src/llama.cpp/common/grammar-parser.cpp +0 -536
- package/src/llama.cpp/common/grammar-parser.h +0 -29
- package/src/llama.cpp/common/train.cpp +0 -1513
- package/src/llama.cpp/common/train.h +0 -233
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1640
- package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
- package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
- package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +0 -1027
- package/src/llama.cpp/tests/test-grad0.cpp +0 -1566
- /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
- /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0

package/src/llama.cpp/examples/server/utils.hpp +396 -107

@@ -1,20 +1,45 @@
 #pragma once
 
-#include "llama.h"
 #include "common.h"
+#include "log.h"
+#include "llama.h"
+
+#ifndef NDEBUG
+// crash the server in debug mode, otherwise send an http 500 error
+#define CPPHTTPLIB_NO_EXCEPTIONS 1
+#endif
+// increase max payload length to allow use of larger context size
+#define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
+#include "httplib.h"
 
 // Change JSON_ASSERT from assert() to GGML_ASSERT:
 #define JSON_ASSERT GGML_ASSERT
 #include "json.hpp"
 
+#include <random>
+#include <sstream>
 #include <string>
 #include <vector>
-#include <sstream>
-#include <random>
 
 #define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"
 
 using json = nlohmann::ordered_json;
+using llama_tokens = std::vector<llama_token>;
+
+#define SLT_INF(slot, fmt, ...) LOG_INF("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
+#define SLT_WRN(slot, fmt, ...) LOG_WRN("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
+#define SLT_ERR(slot, fmt, ...) LOG_ERR("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
+#define SLT_DBG(slot, fmt, ...) LOG_DBG("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
+
+#define SRV_INF(fmt, ...) LOG_INF("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+#define SRV_WRN(fmt, ...) LOG_WRN("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+#define SRV_ERR(fmt, ...) LOG_ERR("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+#define SRV_DBG(fmt, ...) LOG_DBG("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+
+#define QUE_INF(fmt, ...) LOG_INF("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+#define QUE_WRN(fmt, ...) LOG_WRN("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+#define QUE_ERR(fmt, ...) LOG_ERR("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+#define QUE_DBG(fmt, ...) LOG_DBG("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
 
 // https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11
 enum error_type {
@@ -27,32 +52,6 @@ enum error_type {
     ERROR_TYPE_NOT_SUPPORTED, // custom error
 };
 
-extern bool server_verbose;
-extern bool server_log_json;
-
-#ifndef SERVER_VERBOSE
-#define SERVER_VERBOSE 1
-#endif
-
-#if SERVER_VERBOSE != 1
-#define LOG_VERBOSE(MSG, ...)
-#else
-#define LOG_VERBOSE(MSG, ...) \
-    do \
-    { \
-        if (server_verbose) \
-        { \
-            server_log("VERB", __func__, __LINE__, MSG, __VA_ARGS__); \
-        } \
-    } while (0)
-#endif
-
-#define LOG_ERROR( MSG, ...) server_log("ERR", __func__, __LINE__, MSG, __VA_ARGS__)
-#define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__)
-#define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
-
-static inline void server_log(const char * level, const char * function, int line, const char * message, const json & extra);
-
 template <typename T>
 static T json_value(const json & body, const std::string & key, const T & default_value) {
     // Fallback null to default value
@@ -60,9 +59,7 @@ static T json_value(const json & body, const std::string & key, const T & defaul
     try {
         return body.at(key);
     } catch (NLOHMANN_JSON_NAMESPACE::detail::type_error const &) {
-
-        ss << "Wrong type supplied for parameter '" << key << "'. Expected '" << json(default_value).type_name() << "', using default value.";
-        LOG_WARNING(ss.str().c_str(), body);
+        LOG_WRN("Wrong type supplied for parameter '%s'. Expected '%s', using default value\n", key.c_str(), json(default_value).type_name());
         return default_value;
     }
 } else {
@@ -70,55 +67,241 @@ static T json_value(const json & body, const std::string & key, const T & defaul
     }
 }
 
-
-
-
-    json log = json{
-        {"tid", ss_tid.str()},
-        {"timestamp", time(nullptr)},
-    };
+//
+// tokenizer and input processing utils
+//
 
-
-
-
-    {
-
-
-    }
+static bool json_is_array_of_numbers(const json & data) {
+    if (data.is_array()) {
+        for (const auto & e : data) {
+            if (!e.is_number_integer()) {
+                return false;
+            }
+        }
+        return true;
+    }
+    return false;
+}
 
-
-
+// is array having BOTH numbers & strings?
+static bool json_is_array_of_mixed_numbers_strings(const json & data) {
+    bool seen_string = false;
+    bool seen_number = false;
+    if (data.is_array()) {
+        for (const auto & e : data) {
+            seen_string |= e.is_string();
+            seen_number |= e.is_number_integer();
+            if (seen_number && seen_string) {
+                return true;
+            }
         }
+    }
+    return false;
+}
 
-
-
-
-
+/**
+ * this handles 2 cases:
+ * - only string, example: "string"
+ * - mixed string and tokens, example: [12, 34, "string", 56, 78]
+ */
+static llama_tokens tokenize_mixed(const llama_context * ctx, const json & json_prompt, bool add_special, bool parse_special) {
+    // If `add_bos` is true, we only add BOS, when json_prompt is a string,
+    // or the first element of the json_prompt array is a string.
+    llama_tokens prompt_tokens;
+
+    if (json_prompt.is_array()) {
+        bool first = true;
+        for (const auto & p : json_prompt) {
+            if (p.is_string()) {
+                auto s = p.template get<std::string>();
+
+                llama_tokens p;
+                if (first) {
+                    p = common_tokenize(ctx, s, add_special, parse_special);
+                    first = false;
+                } else {
+                    p = common_tokenize(ctx, s, false, parse_special);
+                }
 
-
-
-
-
-
-
-
-
-            ss << " " << el.key() << "=" << value;
+                prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
+            } else {
+                if (first) {
+                    first = false;
+                }
+
+                prompt_tokens.push_back(p.template get<llama_token>());
+            }
         }
+    } else {
+        auto s = json_prompt.template get<std::string>();
+        prompt_tokens = common_tokenize(ctx, s, add_special, parse_special);
+    }
+
+    return prompt_tokens;
+}
 
-
-
+/**
+ * break the input "prompt" object into multiple prompt if needed, then tokenize them
+ * this supports these cases:
+ * - "prompt": "string"
+ * - "prompt": [12, 34, 56]
+ * - "prompt": [12, 34, "string", 56, 78]
+ * and multiple prompts (multi-tasks):
+ * - "prompt": ["string1", "string2"]
+ * - "prompt": ["string1", [12, 34, 56]]
+ * - "prompt": [[12, 34, "string", 56, 78], [12, 34, 56]]
+ */
+static std::vector<llama_tokens> tokenize_input_prompts(llama_context * ctx, const json & json_prompt, bool add_special, bool parse_special) {
+    std::vector<llama_tokens> result;
+    if (json_prompt.is_string() || json_is_array_of_mixed_numbers_strings(json_prompt)) {
+        // string or mixed
+        result.push_back(tokenize_mixed(ctx, json_prompt, add_special, parse_special));
+    } else if (json_is_array_of_numbers(json_prompt)) {
+        // array of tokens
+        result.push_back(json_prompt.get<llama_tokens>());
+    } else if (json_prompt.is_array()) {
+        // array of prompts
+        result.reserve(json_prompt.size());
+        for (const auto & p : json_prompt) {
+            if (p.is_string() || json_is_array_of_mixed_numbers_strings(p)) {
+                result.push_back(tokenize_mixed(ctx, p, add_special, parse_special));
+            } else if (json_is_array_of_numbers(p)) {
+                // array of tokens
+                result.push_back(p.get<llama_tokens>());
+            } else {
+                throw std::runtime_error("element of \"prompt\" must be a string, an list of tokens, or a list of mixed strings & tokens");
+            }
+        }
+    } else {
+        throw std::runtime_error("\"prompt\" must be a string, an list of tokens, a list of mixed strings & tokens, or a list of prompts");
     }
-
+    return result;
 }
 
 //
-//
+// template utils
 //
 
+// format rerank task: [BOS]query[EOS][SEP]doc[EOS]
+static llama_tokens format_rerank(const struct llama_model * model, const llama_tokens & query, const llama_tokens & doc) {
+    llama_tokens result;
+    result.reserve(doc.size() + query.size() + 4);
+    result.push_back(llama_token_bos(model));
+    result.insert(result.end(), query.begin(), query.end());
+    result.push_back(llama_token_eos(model));
+    result.push_back(llama_token_sep(model));
+    result.insert(result.end(), doc.begin(), doc.end());
+    result.push_back(llama_token_eos(model));
+    return result;
+}
+
+// format infill task
+static llama_tokens format_infill(
+    const llama_context * ctx,
+    const json & input_prefix,
+    const json & input_suffix,
+    const json & input_extra,
+    const int n_batch,
+    const int n_predict,
+    const int n_ctx,
+    const bool spm_infill,
+    const llama_tokens & tokens_prompt
+) {
+    // TODO: optimize this block by reducing memory allocations and movement
+
+    // use FIM repo-level pattern:
+    // ref: https://arxiv.org/pdf/2409.12186
+    //
+    // [FIM_REP]myproject
+    // [FIM_SEP]filename0
+    // extra chunk 0
+    // [FIM_SEP]filename1
+    // extra chunk 1
+    // ...
+    // [FIM_SEP]filename
+    // [FIM_PRE]prefix[FIM_SUF]suffix[FIM_MID]prompt
+    //
+    llama_tokens extra_tokens;
+    extra_tokens.reserve(n_ctx);
+
+    auto model = llama_get_model(ctx);
+    auto tokens_prefix = tokenize_mixed(ctx, input_prefix, false, false);
+    auto tokens_suffix = tokenize_mixed(ctx, input_suffix, false, false);
+
+    if (llama_token_fim_rep(model) != LLAMA_TOKEN_NULL) {
+        // TODO: make project name an input
+        static const auto k_fim_repo = common_tokenize(ctx, "myproject\n", false, false);
+
+        extra_tokens.push_back(llama_token_fim_rep(model));
+        extra_tokens.insert(extra_tokens.end(), k_fim_repo.begin(), k_fim_repo.end());
+    }
+    for (const auto & chunk : input_extra) {
+        // { "text": string, "filename": string }
+        const std::string text = json_value(chunk, "text", std::string());
+        const std::string filename = json_value(chunk, "filename", std::string("tmp"));
+
+        if (llama_token_fim_sep(model) != LLAMA_TOKEN_NULL) {
+            const auto k_fim_file = common_tokenize(ctx, filename + "\n", false, false);
+
+            extra_tokens.insert(extra_tokens.end(), llama_token_fim_sep(model));
+            extra_tokens.insert(extra_tokens.end(), k_fim_file.begin(), k_fim_file.end());
+        } else {
+            // chunk separator in binary form to avoid confusing the AI
+            static const char k_chunk_prefix_str[] = {0x0a, 0x0a, 0x2d, 0x2d, 0x2d, 0x20, 0x73, 0x6e, 0x69, 0x70, 0x70, 0x65, 0x74, 0x20, 0x2d, 0x2d, 0x2d, 0x0a, 0x0a, 0x00};
            static const auto k_chunk_prefix_tokens = common_tokenize(ctx, k_chunk_prefix_str, false, false);
+
+            extra_tokens.insert(extra_tokens.end(), k_chunk_prefix_tokens.begin(), k_chunk_prefix_tokens.end());
+        }
+
+        const auto chunk_tokens = common_tokenize(ctx, text, false, false);
+        extra_tokens.insert(extra_tokens.end(), chunk_tokens.begin(), chunk_tokens.end());
+    }
+
+    if (llama_token_fim_sep(model) != LLAMA_TOKEN_NULL) {
+        // TODO: current filename
+        static const auto k_fim_file = common_tokenize(ctx, "filename\n", false, false);
+
+        extra_tokens.insert(extra_tokens.end(), llama_token_fim_sep(model));
+        extra_tokens.insert(extra_tokens.end(), k_fim_file.begin(), k_fim_file.end());
+    }
+
+    // for now pick FIM context to fit in a batch (ratio prefix:suffix = 3:1, TODO: configurable?)
+    const int n_prefix_take = std::min<int>(tokens_prefix.size(), 3*(n_batch/4));
+    const int n_suffix_take = std::min<int>(tokens_suffix.size(), std::max<int>(0, (n_batch/4) - (2 + tokens_prompt.size())));
+
+    SRV_DBG("n_prefix_take = %d, n_suffix_take = %d, total = %d\n", n_prefix_take, n_suffix_take, (n_prefix_take + n_suffix_take));
+
+    // fill the rest of the context with extra chunks
+    const int n_extra_take = std::min<int>(std::max<int>(0, n_ctx - (n_batch) - 2*n_predict), extra_tokens.size());
+
+    tokens_prefix.erase(tokens_prefix.begin(), tokens_prefix.begin() + tokens_prefix.size() - n_prefix_take);
+    tokens_suffix.resize(n_suffix_take);
+
+    tokens_prefix.insert(tokens_prefix.begin(), llama_token_fim_pre(model));
+    tokens_prefix.insert(tokens_prefix.end(), tokens_prompt.begin(), tokens_prompt.end());
+    tokens_suffix.insert(tokens_suffix.begin(), llama_token_fim_suf(model));
+
+    auto embd_inp = spm_infill ? tokens_suffix : tokens_prefix;
+    auto embd_end = spm_infill ? tokens_prefix : tokens_suffix;
+
+    if (llama_add_bos_token(model)) {
+        embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
+    }
+
+    SRV_DBG("extra: n_ctx = %d, n_extra_take = %d, n_extra = %d\n", n_ctx, n_extra_take, (int) extra_tokens.size());
+
+    // put the extra context before the FIM prefix
+    embd_inp.insert(embd_inp.begin(), extra_tokens.end() - n_extra_take, extra_tokens.end());
+
+    embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
+    embd_inp.push_back(llama_token_fim_mid(model));
+
+    return embd_inp;
+}
+
 // Format given chat. If tmpl is empty, we take the template from model metadata
 inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages) {
-    std::vector<
+    std::vector<common_chat_msg> chat;
 
     for (size_t i = 0; i < messages.size(); ++i) {
         const auto & curr_msg = messages[i];
@@ -145,11 +328,25 @@ inline std::string format_chat(const struct llama_model * model, const std::stri
         chat.push_back({role, content});
     }
 
-    auto formatted_chat =
-
+    const auto formatted_chat = common_chat_apply_template(model, tmpl, chat, true);
+    LOG_DBG("formatted_chat: '%s'\n", formatted_chat.c_str());
+
     return formatted_chat;
 }
 
+static std::string llama_get_chat_template(const struct llama_model * model) {
+    std::string template_key = "tokenizer.chat_template";
+    // call with NULL buffer to get the total size of the string
+    int32_t res = llama_model_meta_val_str(model, template_key.c_str(), NULL, 0);
+    if (res < 0) {
+        return "";
+    } else {
+        std::vector<char> model_template(res, 0);
+        llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
+        return std::string(model_template.data(), model_template.size());
+    }
+}
+
 //
 // base64 utils (TODO: move to common in the future)
 //
@@ -235,28 +432,67 @@ static std::string random_string() {
 }
 
 static std::string gen_chatcmplid() {
-
-    chatcmplid << "chatcmpl-" << random_string();
-
-    return chatcmplid.str();
+    return "chatcmpl-" + random_string();
 }
 
 //
 // other common utils
 //
 
-static size_t
+static size_t longest_common_prefix(const llama_tokens & a, const llama_tokens & b) {
     size_t i;
     for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
 
     return i;
 }
 
-static size_t
-
-
+static size_t longest_common_subsequence(const llama_tokens & a, const llama_tokens & b) {
+    // check for empty sequences
+    if (a.empty() || b.empty()) {
+        return 0;
+    }
 
-
+    // get the lengths of the input sequences
+    size_t a_len = a.size();
+    size_t b_len = b.size();
+
+    // initialize the maximum length of the longest common subsequence (LCS)
+    size_t max_length = 0;
+
+    // use two rows instead of a 2D matrix to optimize space
+    std::vector<size_t> prev_row(b_len + 1, 0);
+    std::vector<size_t> curr_row(b_len + 1, 0);
+
+    // iterate through the elements of a
+    for (size_t i = 1; i <= a_len; i++) {
+        // iterate through the elements of b
+        for (size_t j = 1; j <= b_len; j++) {
+            // if elements at the current positions match
+            if (a[i - 1] == b[j - 1]) {
+                // if it's the first element of either sequences, set LCS length to 1
+                if (i == 1 || j == 1) {
+                    curr_row[j] = 1;
+                } else {
+                    // increment LCS length by 1 compared to the previous element
+                    curr_row[j] = prev_row[j - 1] + 1;
+                }
+
+                // update max_length if necessary
+                if (curr_row[j] > max_length) {
+                    max_length = curr_row[j];
+                }
+            } else {
+                // reset LCS length if elements don't match
+                curr_row[j] = 0;
+            }
+        }
+
+        // update the previous row for the next iteration
+        prev_row = curr_row;
+    }
+
+    // return the maximum length of the LCS
+    return max_length;
 }
 
 static bool ends_with(const std::string & str, const std::string & suffix) {
@@ -284,7 +520,7 @@ template <class Iter>
 static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
     std::string ret;
     for (; begin != end; ++begin) {
-        ret +=
+        ret += common_token_to_piece(ctx, *begin);
     }
 
     return ret;
@@ -292,7 +528,7 @@ static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
 
 // format incomplete utf-8 multibyte character for output
 static std::string tokens_to_output_formatted_string(const llama_context * ctx, const llama_token token) {
-    std::string out = token == -1 ? "" :
+    std::string out = token == -1 ? "" : common_token_to_piece(ctx, token);
 
     // if the size is 1 and first bit is 1, meaning it's a partial character
    // (size > 1 meaning it's already a known token)
@@ -343,6 +579,17 @@ static json probs_vector_to_json(const llama_context * ctx, const std::vector<co
     return out;
 }
 
+static bool server_sent_event(httplib::DataSink & sink, const char * event, const json & data) {
+    const std::string str =
+        std::string(event) + ": " +
+        data.dump(-1, ' ', false, json::error_handler_t::replace) +
+        "\n\n"; // note: these newlines are important (not sure why though, if you know, add a comment to explain)
+
+    LOG_DBG("data stream, to_send: %s", str.c_str());
+
+    return sink.write(str.c_str(), str.size());
+}
+
 //
 // OAI utils
 //
@@ -355,24 +602,6 @@ static json oaicompat_completion_params_parse(
 
     llama_params["__oaicompat"] = true;
 
-    // Map OpenAI parameters to llama.cpp parameters
-    //
-    // For parameters that are defined by the OpenAI documentation (e.g.
-    // temperature), we explicitly specify OpenAI's intended default; we
-    // need to do that because sometimes OpenAI disagrees with llama.cpp
-    //
-    // https://platform.openai.com/docs/api-reference/chat/create
-    llama_sampling_params default_sparams;
-    llama_params["model"] = json_value(body, "model", std::string("unknown"));
-    llama_params["frequency_penalty"] = json_value(body, "frequency_penalty", 0.0);
-    llama_params["logit_bias"] = json_value(body, "logit_bias", json::object());
-    llama_params["n_predict"] = json_value(body, "max_tokens", -1);
-    llama_params["presence_penalty"] = json_value(body, "presence_penalty", 0.0);
-    llama_params["seed"] = json_value(body, "seed", LLAMA_DEFAULT_SEED);
-    llama_params["stream"] = json_value(body, "stream", false);
-    llama_params["temperature"] = json_value(body, "temperature", 1.0);
-    llama_params["top_p"] = json_value(body, "top_p", 1.0);
-
     // Apply chat template to the list of messages
     llama_params["prompt"] = format_chat(model, chat_template, body.at("messages"));
 
@@ -389,6 +618,9 @@ static json oaicompat_completion_params_parse(
     std::string response_type = json_value(response_format, "type", std::string());
     if (response_type == "json_object") {
         llama_params["json_schema"] = json_value(response_format, "schema", json::object());
+    } else if (response_type == "json_schema") {
+        json json_schema = json_value(response_format, "json_schema", json::object());
+        llama_params["json_schema"] = json_value(json_schema, "schema", json::object());
     } else if (!response_type.empty() && response_type != "text") {
         throw std::runtime_error("response_format type must be one of \"text\" or \"json_object\", but got: " + response_type);
     }
@@ -402,22 +634,22 @@ static json oaicompat_completion_params_parse(
 
     // Handle "logprobs" field
     // TODO: The response format of this option is not yet OAI-compatible, but seems like no one really using it; We may need to fix it in the future
-    if (body
+    if (json_value(body, "logprobs", false)) {
         llama_params["n_probs"] = json_value(body, "top_logprobs", 20);
-    } else if (body.contains("top_logprobs")) {
+    } else if (body.contains("top_logprobs") && !body.at("top_logprobs").is_null()) {
         throw std::runtime_error("top_logprobs requires logprobs to be set to true");
     }
 
     // Params supported by OAI but unsupported by llama.cpp
     static const std::vector<std::string> unsupported_params { "tools", "tool_choice" };
-    for (auto & param : unsupported_params) {
+    for (const auto & param : unsupported_params) {
        if (body.contains(param)) {
            throw std::runtime_error("Unsupported param: " + param);
        }
    }
 
    // Copy remaining properties to llama_params
-    // This allows user to use llama.cpp-specific params like "mirostat",
+    // This allows user to use llama.cpp-specific params like "mirostat", ... via OAI endpoint.
    // See "launch_slot_with_task()" for a complete list of params supported by llama.cpp
    for (const auto & item : body.items()) {
        // Exception: if "n_predict" is present, we overwrite the value specified earlier by "max_tokens"
@@ -429,7 +661,7 @@ static json oaicompat_completion_params_parse(
     return llama_params;
 }
 
-static json format_final_response_oaicompat(const json & request, json result, const std::string & completion_id, bool streaming = false) {
+static json format_final_response_oaicompat(const json & request, const json & result, const std::string & completion_id, bool streaming = false, bool verbose = false) {
     bool stopped_word = result.count("stopped_word") != 0;
     bool stopped_eos = json_value(result, "stopped_eos", false);
     int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
@@ -466,7 +698,8 @@ static json format_final_response_oaicompat(const json & request, json result, c
         {"id", completion_id}
     };
 
-
+    // extra fields for debugging purposes
+    if (verbose) {
        res["__verbose"] = result;
    }
 
@@ -478,7 +711,7 @@ static json format_final_response_oaicompat(const json & request, json result, c
 }
 
 // return value is vector as there is one case where we might need to generate two responses
-static std::vector<json> format_partial_response_oaicompat(json result, const std::string & completion_id) {
+static std::vector<json> format_partial_response_oaicompat(const json & result, const std::string & completion_id) {
     if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) {
         return std::vector<json>({result});
     }
@@ -580,7 +813,7 @@ static std::vector<json> format_partial_response_oaicompat(json result, const st
 static json format_embeddings_response_oaicompat(const json & request, const json & embeddings) {
     json data = json::array();
     int i = 0;
-    for (auto & elem : embeddings) {
+    for (const auto & elem : embeddings) {
        data.push_back(json{
            {"embedding", json_value(elem, "embedding", json::array())},
            {"index", i++},
@@ -591,7 +824,7 @@ static json format_embeddings_response_oaicompat(const json & request, const jso
     json res = json {
         {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
         {"object", "list"},
-        {"usage", json {
+        {"usage", json { // TODO: fill
            {"prompt_tokens", 0},
            {"total_tokens", 0}
        }},
@@ -601,7 +834,63 @@ static json format_embeddings_response_oaicompat(const json & request, const jso
     return res;
 }
 
-static json
+static json format_response_rerank(const json & request, const json & ranks) {
+    json data = json::array();
+    int i = 0;
+    for (const auto & rank : ranks) {
+        data.push_back(json{
+            {"index", i++},
+            {"relevance_score", json_value(rank, "score", 0.0)},
+        });
+    }
+
+    json res = json {
+        {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
+        {"object", "list"},
+        {"usage", json { // TODO: fill
+            {"prompt_tokens", 0},
+            {"total_tokens", 0}
+        }},
+        {"results", data}
+    };
+
+    return res;
+}
+
+static bool is_valid_utf8(const std::string & str) {
+    const unsigned char* bytes = reinterpret_cast<const unsigned char*>(str.data());
+    const unsigned char* end = bytes + str.length();
+
+    while (bytes < end) {
+        if (*bytes <= 0x7F) {
+            // 1-byte sequence (0xxxxxxx)
+            bytes++;
+        } else if ((*bytes & 0xE0) == 0xC0) {
+            // 2-byte sequence (110xxxxx 10xxxxxx)
+            if (end - bytes < 2 || (bytes[1] & 0xC0) != 0x80)
+                return false;
+            bytes += 2;
+        } else if ((*bytes & 0xF0) == 0xE0) {
+            // 3-byte sequence (1110xxxx 10xxxxxx 10xxxxxx)
+            if (end - bytes < 3 || (bytes[1] & 0xC0) != 0x80 || (bytes[2] & 0xC0) != 0x80)
+                return false;
+            bytes += 3;
+        } else if ((*bytes & 0xF8) == 0xF0) {
+            // 4-byte sequence (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
+            if (end - bytes < 4 || (bytes[1] & 0xC0) != 0x80 ||
+                (bytes[2] & 0xC0) != 0x80 || (bytes[3] & 0xC0) != 0x80)
+                return false;
+            bytes += 4;
+        } else {
+            // Invalid UTF-8 lead byte
+            return false;
+        }
+    }
+
+    return true;
+}
+
+static json format_tokenizer_response(const json & tokens) {
     return json {
         {"tokens", tokens}
     };

package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1

@@ -1,5 +1,5 @@
 set(TARGET llama-simple)
 add_executable(${TARGET} simple.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE
+target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
|