@fugood/llama.node 0.3.2 → 0.3.4
This diff shows the changes between publicly released versions of this package as they appear in their respective public registries. It is provided for informational purposes only.
- package/CMakeLists.txt +7 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +18 -1
- package/package.json +1 -1
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/EmbeddingWorker.cpp +17 -7
- package/src/EmbeddingWorker.h +2 -1
- package/src/LlamaCompletionWorker.cpp +8 -8
- package/src/LlamaCompletionWorker.h +2 -2
- package/src/LlamaContext.cpp +89 -27
- package/src/LlamaContext.h +2 -0
- package/src/TokenizeWorker.cpp +1 -1
- package/src/common.hpp +4 -4
- package/src/llama.cpp/.github/workflows/build.yml +240 -168
- package/src/llama.cpp/.github/workflows/docker.yml +8 -8
- package/src/llama.cpp/.github/workflows/python-lint.yml +8 -1
- package/src/llama.cpp/.github/workflows/server.yml +21 -14
- package/src/llama.cpp/CMakeLists.txt +14 -6
- package/src/llama.cpp/Sources/llama/llama.h +4 -0
- package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
- package/src/llama.cpp/cmake/common.cmake +33 -0
- package/src/llama.cpp/cmake/x64-windows-llvm.cmake +11 -0
- package/src/llama.cpp/common/CMakeLists.txt +6 -4
- package/src/llama.cpp/common/arg.cpp +986 -770
- package/src/llama.cpp/common/arg.h +22 -22
- package/src/llama.cpp/common/common.cpp +212 -351
- package/src/llama.cpp/common/common.h +204 -117
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
- package/src/llama.cpp/common/log.cpp +50 -50
- package/src/llama.cpp/common/log.h +18 -18
- package/src/llama.cpp/common/ngram-cache.cpp +36 -36
- package/src/llama.cpp/common/ngram-cache.h +19 -19
- package/src/llama.cpp/common/sampling.cpp +163 -121
- package/src/llama.cpp/common/sampling.h +41 -20
- package/src/llama.cpp/common/speculative.cpp +274 -0
- package/src/llama.cpp/common/speculative.h +28 -0
- package/src/llama.cpp/docs/build.md +134 -161
- package/src/llama.cpp/examples/CMakeLists.txt +33 -14
- package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched/batched.cpp +19 -18
- package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
- package/src/llama.cpp/examples/cvector-generator/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
- package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +1 -1
- package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
- package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +3 -2
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
- package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +4 -7
- package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
- package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +8 -1
- package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +2 -2
- package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
- package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +31 -13
- package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/infill/infill.cpp +41 -87
- package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +439 -459
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +2 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
- package/src/llama.cpp/examples/llava/CMakeLists.txt +10 -3
- package/src/llama.cpp/examples/llava/clip.cpp +263 -66
- package/src/llama.cpp/examples/llava/clip.h +8 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
- package/src/llama.cpp/examples/llava/llava.cpp +83 -22
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
- package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +581 -0
- package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
- package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +16 -15
- package/src/llama.cpp/examples/lookup/lookup.cpp +30 -30
- package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/main/main.cpp +73 -114
- package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
- package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
- package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +0 -3
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
- package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +16 -16
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
- package/src/llama.cpp/examples/run/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/run/run.cpp +911 -0
- package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +38 -21
- package/src/llama.cpp/examples/server/CMakeLists.txt +3 -16
- package/src/llama.cpp/examples/server/server.cpp +2073 -1339
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
- package/src/llama.cpp/examples/server/utils.hpp +354 -277
- package/src/llama.cpp/examples/simple/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/simple/simple.cpp +130 -94
- package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +200 -0
- package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/speculative/speculative.cpp +68 -64
- package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +265 -0
- package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +3 -3
- package/src/llama.cpp/examples/tts/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/tts/tts.cpp +932 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +54 -36
- package/src/llama.cpp/ggml/include/ggml-backend.h +63 -34
- package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
- package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +135 -0
- package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
- package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
- package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
- package/src/llama.cpp/ggml/include/ggml-opencl.h +26 -0
- package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
- package/src/llama.cpp/ggml/include/ggml.h +159 -417
- package/src/llama.cpp/ggml/src/CMakeLists.txt +121 -1155
- package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -28
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +57 -36
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +552 -0
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +306 -867
- package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +87 -0
- package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +216 -65
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +76 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +456 -111
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +6 -3
- package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +343 -177
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -5
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +22 -9
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +24 -13
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +23 -13
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +17 -0
- package/src/llama.cpp/ggml/src/ggml-common.h +42 -42
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +336 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/common.h +91 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +2511 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.h +10 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
- package/src/llama.cpp/ggml/src/{ggml-aarch64.c → ggml-cpu/ggml-cpu-aarch64.cpp} +1299 -246
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
- package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +14 -242
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10835 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14123 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +628 -0
- package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +666 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +152 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +104 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +393 -22
- package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +166 -0
- package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +360 -127
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +105 -0
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +288 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +107 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +147 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +4004 -0
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +854 -0
- package/src/llama.cpp/ggml/src/ggml-quants.c +188 -10702
- package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
- package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
- package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +478 -300
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +84 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +36 -5
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +259 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +5 -5
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +34 -35
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1030 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +4 -4
- package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3638 -4151
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +6 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -87
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +7 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +4 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +7 -7
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +141 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
- package/src/llama.cpp/ggml/src/ggml-threading.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +92 -0
- package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2138 -887
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +3 -1
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +593 -0
- package/src/llama.cpp/ggml/src/ggml.c +4427 -20125
- package/src/llama.cpp/include/llama-cpp.h +25 -0
- package/src/llama.cpp/include/llama.h +93 -52
- package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +46 -0
- package/src/llama.cpp/pocs/CMakeLists.txt +3 -1
- package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
- package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
- package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
- package/src/llama.cpp/src/CMakeLists.txt +4 -8
- package/src/llama.cpp/src/llama-grammar.cpp +15 -15
- package/src/llama.cpp/src/llama-grammar.h +2 -5
- package/src/llama.cpp/src/llama-sampling.cpp +779 -194
- package/src/llama.cpp/src/llama-sampling.h +21 -2
- package/src/llama.cpp/src/llama-vocab.cpp +55 -10
- package/src/llama.cpp/src/llama-vocab.h +35 -11
- package/src/llama.cpp/src/llama.cpp +4317 -2979
- package/src/llama.cpp/src/unicode-data.cpp +2 -2
- package/src/llama.cpp/src/unicode.cpp +62 -51
- package/src/llama.cpp/src/unicode.h +9 -10
- package/src/llama.cpp/tests/CMakeLists.txt +48 -38
- package/src/llama.cpp/tests/test-arg-parser.cpp +15 -15
- package/src/llama.cpp/tests/test-backend-ops.cpp +324 -80
- package/src/llama.cpp/tests/test-barrier.cpp +1 -0
- package/src/llama.cpp/tests/test-chat-template.cpp +59 -9
- package/src/llama.cpp/tests/test-gguf.cpp +1303 -0
- package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -6
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
- package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -4
- package/src/llama.cpp/tests/test-log.cpp +2 -2
- package/src/llama.cpp/tests/test-opt.cpp +853 -142
- package/src/llama.cpp/tests/test-quantize-fns.cpp +24 -21
- package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
- package/src/llama.cpp/tests/test-rope.cpp +62 -20
- package/src/llama.cpp/tests/test-sampling.cpp +163 -138
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +0 -72
- package/src/llama.cpp/.github/workflows/nix-ci.yml +0 -79
- package/src/llama.cpp/.github/workflows/nix-flake-update.yml +0 -22
- package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +0 -36
- package/src/llama.cpp/common/train.cpp +0 -1515
- package/src/llama.cpp/common/train.h +0 -233
- package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -39
- package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +0 -600
- package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
- /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
- /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
package/src/llama.cpp/examples/server/utils.hpp

@@ -20,21 +20,26 @@
 #include <sstream>
 #include <string>
 #include <vector>
+#include <memory>
 
-#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo
+#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo"
 
 using json = nlohmann::ordered_json;
 
-
-
-
-
-
-
-
-
-
-
+#define SLT_INF(slot, fmt, ...) LOG_INF("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
+#define SLT_WRN(slot, fmt, ...) LOG_WRN("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
+#define SLT_ERR(slot, fmt, ...) LOG_ERR("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
+#define SLT_DBG(slot, fmt, ...) LOG_DBG("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
+
+#define SRV_INF(fmt, ...) LOG_INF("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+#define SRV_WRN(fmt, ...) LOG_WRN("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+#define SRV_ERR(fmt, ...) LOG_ERR("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+#define SRV_DBG(fmt, ...) LOG_DBG("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+
+#define QUE_INF(fmt, ...) LOG_INF("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+#define QUE_WRN(fmt, ...) LOG_WRN("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+#define QUE_ERR(fmt, ...) LOG_ERR("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+#define QUE_DBG(fmt, ...) LOG_DBG("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
 
 template <typename T>
 static T json_value(const json & body, const std::string & key, const T & default_value) {
@@ -52,12 +57,274 @@ static T json_value(const json & body, const std::string & key, const T & defaul
 }
 
 //
-//
+// tokenizer and input processing utils
 //
 
+static bool json_is_array_of_numbers(const json & data) {
+    if (data.is_array()) {
+        for (const auto & e : data) {
+            if (!e.is_number_integer()) {
+                return false;
+            }
+        }
+        return true;
+    }
+    return false;
+}
+
+// is array having BOTH numbers & strings?
+static bool json_is_array_of_mixed_numbers_strings(const json & data) {
+    bool seen_string = false;
+    bool seen_number = false;
+    if (data.is_array()) {
+        for (const auto & e : data) {
+            seen_string |= e.is_string();
+            seen_number |= e.is_number_integer();
+            if (seen_number && seen_string) {
+                return true;
+            }
+        }
+    }
+    return false;
+}
+
+/**
+ * this handles 2 cases:
+ * - only string, example: "string"
+ * - mixed string and tokens, example: [12, 34, "string", 56, 78]
+ */
+static llama_tokens tokenize_mixed(const llama_context * ctx, const json & json_prompt, bool add_special, bool parse_special) {
+    // If `add_bos` is true, we only add BOS, when json_prompt is a string,
+    // or the first element of the json_prompt array is a string.
+    llama_tokens prompt_tokens;
+
+    if (json_prompt.is_array()) {
+        bool first = true;
+        for (const auto & p : json_prompt) {
+            if (p.is_string()) {
+                auto s = p.template get<std::string>();
+
+                llama_tokens p;
+                if (first) {
+                    p = common_tokenize(ctx, s, add_special, parse_special);
+                    first = false;
+                } else {
+                    p = common_tokenize(ctx, s, false, parse_special);
+                }
+
+                prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
+            } else {
+                if (first) {
+                    first = false;
+                }
+
+                prompt_tokens.push_back(p.template get<llama_token>());
+            }
+        }
+    } else {
+        auto s = json_prompt.template get<std::string>();
+        prompt_tokens = common_tokenize(ctx, s, add_special, parse_special);
+    }
+
+    return prompt_tokens;
+}
+
+/**
+ * break the input "prompt" object into multiple prompt if needed, then tokenize them
+ * this supports these cases:
+ * - "prompt": "string"
+ * - "prompt": [12, 34, 56]
+ * - "prompt": [12, 34, "string", 56, 78]
+ * and multiple prompts (multi-tasks):
+ * - "prompt": ["string1", "string2"]
+ * - "prompt": ["string1", [12, 34, 56]]
+ * - "prompt": [[12, 34, 56], [78, 90, 12]]
+ * - "prompt": [[12, 34, "string", 56, 78], [12, 34, 56]]
+ */
+static std::vector<llama_tokens> tokenize_input_prompts(llama_context * ctx, const json & json_prompt, bool add_special, bool parse_special) {
+    std::vector<llama_tokens> result;
+    if (json_prompt.is_string() || json_is_array_of_mixed_numbers_strings(json_prompt)) {
+        // string or mixed
+        result.push_back(tokenize_mixed(ctx, json_prompt, add_special, parse_special));
+    } else if (json_is_array_of_numbers(json_prompt)) {
+        // array of tokens
+        result.push_back(json_prompt.get<llama_tokens>());
+    } else if (json_prompt.is_array()) {
+        // array of prompts
+        result.reserve(json_prompt.size());
+        for (const auto & p : json_prompt) {
+            if (p.is_string() || json_is_array_of_mixed_numbers_strings(p)) {
+                result.push_back(tokenize_mixed(ctx, p, add_special, parse_special));
+            } else if (json_is_array_of_numbers(p)) {
+                // array of tokens
+                result.push_back(p.get<llama_tokens>());
+            } else {
+                throw std::runtime_error("element of \"prompt\" must be a string, an list of tokens, or a list of mixed strings & tokens");
+            }
+        }
+    } else {
+        throw std::runtime_error("\"prompt\" must be a string, an list of tokens, a list of mixed strings & tokens, or a list of prompts");
+    }
+    if (result.empty()) {
+        throw std::runtime_error("\"prompt\" must not be empty");
+    }
+    return result;
+}
+
+// return the last index of character that can form a valid string
+// if the last character is potentially cut in half, return the index before the cut
+// if validate_utf8(text) == text.size(), then the whole text is valid utf8
+static size_t validate_utf8(const std::string& text) {
+    size_t len = text.size();
+    if (len == 0) return 0;
+
+    // Check the last few bytes to see if a multi-byte character is cut off
+    for (size_t i = 1; i <= 4 && i <= len; ++i) {
+        unsigned char c = text[len - i];
+        // Check for start of a multi-byte sequence from the end
+        if ((c & 0xE0) == 0xC0) {
+            // 2-byte character start: 110xxxxx
+            // Needs at least 2 bytes
+            if (i < 2) return len - i;
+        } else if ((c & 0xF0) == 0xE0) {
+            // 3-byte character start: 1110xxxx
+            // Needs at least 3 bytes
+            if (i < 3) return len - i;
+        } else if ((c & 0xF8) == 0xF0) {
+            // 4-byte character start: 11110xxx
+            // Needs at least 4 bytes
+            if (i < 4) return len - i;
+        }
+    }
+
+    // If no cut-off multi-byte character is found, return full length
+    return len;
+}
+
+//
+// template utils
+//
+
+// format rerank task: [BOS]query[EOS][SEP]doc[EOS]
+static llama_tokens format_rerank(const struct llama_model * model, const llama_tokens & query, const llama_tokens & doc) {
+    llama_tokens result;
+    result.reserve(doc.size() + query.size() + 4);
+    result.push_back(llama_token_bos(model));
+    result.insert(result.end(), query.begin(), query.end());
+    result.push_back(llama_token_eos(model));
+    result.push_back(llama_token_sep(model));
+    result.insert(result.end(), doc.begin(), doc.end());
+    result.push_back(llama_token_eos(model));
+    return result;
+}
+
+// format infill task
+static llama_tokens format_infill(
+        const llama_context * ctx,
+        const json & input_prefix,
+        const json & input_suffix,
+        const json & input_extra,
+        const int n_batch,
+        const int n_predict,
+        const int n_ctx,
+        const bool spm_infill,
+        const llama_tokens & tokens_prompt
+    ) {
+    // TODO: optimize this block by reducing memory allocations and movement
+
+    // use FIM repo-level pattern:
+    // ref: https://arxiv.org/pdf/2409.12186
+    //
+    // [FIM_REP]myproject
+    // [FIM_SEP]filename0
+    // extra chunk 0
+    // [FIM_SEP]filename1
+    // extra chunk 1
+    // ...
+    // [FIM_SEP]filename
+    // [FIM_PRE]prefix[FIM_SUF]suffix[FIM_MID]prompt
+    //
+    llama_tokens extra_tokens;
+    extra_tokens.reserve(n_ctx);
+
+    auto model = llama_get_model(ctx);
+    auto tokens_prefix = tokenize_mixed(ctx, input_prefix, false, false);
+    auto tokens_suffix = tokenize_mixed(ctx, input_suffix, false, false);
+
+    if (llama_token_fim_rep(model) != LLAMA_TOKEN_NULL) {
+        // TODO: make project name an input
+        static const auto k_fim_repo = common_tokenize(ctx, "myproject\n", false, false);
+
+        extra_tokens.push_back(llama_token_fim_rep(model));
+        extra_tokens.insert(extra_tokens.end(), k_fim_repo.begin(), k_fim_repo.end());
+    }
+    for (const auto & chunk : input_extra) {
+        // { "text": string, "filename": string }
+        const std::string text = json_value(chunk, "text", std::string());
+        const std::string filename = json_value(chunk, "filename", std::string("tmp"));
+
+        if (llama_token_fim_sep(model) != LLAMA_TOKEN_NULL) {
+            const auto k_fim_file = common_tokenize(ctx, filename + "\n", false, false);
+
+            extra_tokens.insert(extra_tokens.end(), llama_token_fim_sep(model));
+            extra_tokens.insert(extra_tokens.end(), k_fim_file.begin(), k_fim_file.end());
+        } else {
+            // chunk separator in binary form to avoid confusing the AI
+            static const char k_chunk_prefix_str[] = {0x0a, 0x0a, 0x2d, 0x2d, 0x2d, 0x20, 0x73, 0x6e, 0x69, 0x70, 0x70, 0x65, 0x74, 0x20, 0x2d, 0x2d, 0x2d, 0x0a, 0x0a, 0x00};
+            static const auto k_chunk_prefix_tokens = common_tokenize(ctx, k_chunk_prefix_str, false, false);
+
+            extra_tokens.insert(extra_tokens.end(), k_chunk_prefix_tokens.begin(), k_chunk_prefix_tokens.end());
+        }
+
+        const auto chunk_tokens = common_tokenize(ctx, text, false, false);
+        extra_tokens.insert(extra_tokens.end(), chunk_tokens.begin(), chunk_tokens.end());
+    }
+
+    if (llama_token_fim_sep(model) != LLAMA_TOKEN_NULL) {
+        // TODO: current filename
+        static const auto k_fim_file = common_tokenize(ctx, "filename\n", false, false);
+
+        extra_tokens.insert(extra_tokens.end(), llama_token_fim_sep(model));
+        extra_tokens.insert(extra_tokens.end(), k_fim_file.begin(), k_fim_file.end());
+    }
+
+    // for now pick FIM context to fit in a batch (ratio prefix:suffix = 3:1, TODO: configurable?)
+    const int n_prefix_take = std::min<int>(tokens_prefix.size(), 3*(n_batch/4));
+    const int n_suffix_take = std::min<int>(tokens_suffix.size(), std::max<int>(0, (n_batch/4) - (2 + tokens_prompt.size())));
+
+    SRV_DBG("n_prefix_take = %d, n_suffix_take = %d, total = %d\n", n_prefix_take, n_suffix_take, (n_prefix_take + n_suffix_take));
+
+    // fill the rest of the context with extra chunks
+    const int n_extra_take = std::min<int>(std::max<int>(0, n_ctx - (n_batch) - 2*n_predict), extra_tokens.size());
+
+    tokens_prefix.erase(tokens_prefix.begin(), tokens_prefix.begin() + tokens_prefix.size() - n_prefix_take);
+    tokens_suffix.resize(n_suffix_take);
+
+    tokens_prefix.insert(tokens_prefix.begin(), llama_token_fim_pre(model));
+    tokens_prefix.insert(tokens_prefix.end(), tokens_prompt.begin(), tokens_prompt.end());
+    tokens_suffix.insert(tokens_suffix.begin(), llama_token_fim_suf(model));
+
+    auto embd_inp = spm_infill ? tokens_suffix : tokens_prefix;
+    auto embd_end = spm_infill ? tokens_prefix : tokens_suffix;
+
+    if (llama_add_bos_token(model)) {
+        embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
+    }
+
+    SRV_DBG("extra: n_ctx = %d, n_extra_take = %d, n_extra = %d\n", n_ctx, n_extra_take, (int) extra_tokens.size());
+
+    // put the extra context before the FIM prefix
+    embd_inp.insert(embd_inp.begin(), extra_tokens.end() - n_extra_take, extra_tokens.end());
+
+    embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
+    embd_inp.push_back(llama_token_fim_mid(model));
+
+    return embd_inp;
+}
+
 // Format given chat. If tmpl is empty, we take the template from model metadata
 inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages) {
-    std::vector<
+    std::vector<common_chat_msg> chat;
 
     for (size_t i = 0; i < messages.size(); ++i) {
         const auto & curr_msg = messages[i];
@@ -84,12 +351,25 @@ inline std::string format_chat(const struct llama_model * model, const std::stri
         chat.push_back({role, content});
     }
 
-    const auto formatted_chat =
+    const auto formatted_chat = common_chat_apply_template(model, tmpl, chat, true);
    LOG_DBG("formatted_chat: '%s'\n", formatted_chat.c_str());
 
     return formatted_chat;
 }
 
+static std::string llama_get_chat_template(const struct llama_model * model) {
+    std::string template_key = "tokenizer.chat_template";
+    // call with NULL buffer to get the total size of the string
+    int32_t res = llama_model_meta_val_str(model, template_key.c_str(), NULL, 0);
+    if (res < 2) {
+        return "";
+    } else {
+        std::vector<char> model_template(res + 1, 0);
+        llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
+        return std::string(model_template.data(), model_template.size() - 1);
+    }
+}
+
 //
 // base64 utils (TODO: move to common in the future)
 //
@@ -182,20 +462,6 @@ static std::string gen_chatcmplid() {
 // other common utils
 //
 
-static size_t common_part(const std::vector<llama_token> & a, const std::vector<llama_token> & b) {
-    size_t i;
-    for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
-
-    return i;
-}
-
-static size_t common_part(const std::string & a, const std::string & b) {
-    size_t i;
-    for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
-
-    return i;
-}
-
 static bool ends_with(const std::string & str, const std::string & suffix) {
     return str.size() >= suffix.size() && 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix);
 }
@@ -216,24 +482,12 @@ static size_t find_partial_stop_string(const std::string &stop, const std::strin
     return std::string::npos;
 }
 
-static bool json_is_array_of_numbers(const json & data) {
-    if (data.is_array()) {
-        for (const auto & e : data) {
-            if (!e.is_number()) {
-                return false;
-            }
-        }
-        return true;
-    }
-    return false;
-}
-
 // TODO: reuse llama_detokenize
 template <class Iter>
 static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
     std::string ret;
     for (; begin != end; ++begin) {
-        ret +=
+        ret += common_token_to_piece(ctx, *begin);
     }
 
     return ret;
@@ -241,7 +495,7 @@ static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
 
 // format incomplete utf-8 multibyte character for output
 static std::string tokens_to_output_formatted_string(const llama_context * ctx, const llama_token token) {
-    std::string out = token == -1 ? "" :
+    std::string out = token == -1 ? "" : common_token_to_piece(ctx, token);
 
     // if the size is 1 and first bit is 1, meaning it's a partial character
     // (size > 1 meaning it's already a known token)
@@ -255,48 +509,11 @@ static std::string tokens_to_output_formatted_string(const llama_context * ctx,
     return out;
 }
 
-struct completion_token_output {
-    llama_token tok;
-    std::string text_to_send;
-
-    struct token_prob {
-        llama_token tok;
-        float prob;
-    };
-
-    std::vector<token_prob> probs;
-};
-
-// convert a vector of completion_token_output to json
-static json probs_vector_to_json(const llama_context * ctx, const std::vector<completion_token_output> & probs) {
-    json out = json::array();
-
-    for (const auto & prob : probs) {
-        json probs_for_token = json::array();
-
-        for (const auto & p : prob.probs) {
-            const std::string tok_str = tokens_to_output_formatted_string(ctx, p.tok);
-            probs_for_token.push_back(json {
-                {"tok_str", tok_str},
-                {"prob", p.prob},
-            });
-        }
-
-        const std::string tok_str = tokens_to_output_formatted_string(ctx, prob.tok);
-        out.push_back(json {
-            {"content", tok_str},
-            {"probs", probs_for_token},
-        });
-    }
-
-    return out;
-}
-
 static bool server_sent_event(httplib::DataSink & sink, const char * event, const json & data) {
     const std::string str =
         std::string(event) + ": " +
         data.dump(-1, ' ', false, json::error_handler_t::replace) +
-        "\n\n"; //
+        "\n\n"; // required by RFC 8895 - A message is terminated by a blank line (two line terminators in a row).
 
     LOG_DBG("data stream, to_send: %s", str.c_str());
 
@@ -313,8 +530,6 @@ static json oaicompat_completion_params_parse(
     const std::string & chat_template) {
     json llama_params;
 
-    llama_params["__oaicompat"] = true;
-
     // Apply chat template to the list of messages
     llama_params["prompt"] = format_chat(model, chat_template, body.at("messages"));
 
@@ -347,9 +562,9 @@ static json oaicompat_completion_params_parse(
 
     // Handle "logprobs" field
     // TODO: The response format of this option is not yet OAI-compatible, but seems like no one really using it; We may need to fix it in the future
-    if (body
+    if (json_value(body, "logprobs", false)) {
         llama_params["n_probs"] = json_value(body, "top_logprobs", 20);
-    } else if (body.contains("top_logprobs")) {
+    } else if (body.contains("top_logprobs") && !body.at("top_logprobs").is_null()) {
         throw std::runtime_error("top_logprobs requires logprobs to be set to true");
     }
 
@@ -362,7 +577,7 @@ static json oaicompat_completion_params_parse(
     }
 
     // Copy remaining properties to llama_params
-    // This allows user to use llama.cpp-specific params like "mirostat",
+    // This allows user to use llama.cpp-specific params like "mirostat", ... via OAI endpoint.
     // See "launch_slot_with_task()" for a complete list of params supported by llama.cpp
     for (const auto & item : body.items()) {
         // Exception: if "n_predict" is present, we overwrite the value specified earlier by "max_tokens"
@@ -374,157 +589,9 @@ static json oaicompat_completion_params_parse(
     return llama_params;
 }
 
-static json format_final_response_oaicompat(const json & request, const json & result, const std::string & completion_id, bool streaming = false, bool verbose = false) {
-    bool stopped_word = result.count("stopped_word") != 0;
-    bool stopped_eos = json_value(result, "stopped_eos", false);
-    int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
-    int num_prompt_tokens = json_value(result, "tokens_evaluated", 0);
-    std::string content = json_value(result, "content", std::string(""));
-
-    std::string finish_reason = "length";
-    if (stopped_word || stopped_eos) {
-        finish_reason = "stop";
-    }
-
-    json choices =
-        streaming ? json::array({json{{"finish_reason", finish_reason},
-                                        {"index", 0},
-                                        {"delta", json::object()}}})
-                  : json::array({json{{"finish_reason", finish_reason},
-                                        {"index", 0},
-                                        {"message", json{{"content", content},
-                                                         {"role", "assistant"}}}}});
-
-    std::time_t t = std::time(0);
-
-    json res = json {
-        {"choices", choices},
-        {"created", t},
-        {"model",
-            json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
-        {"object", streaming ? "chat.completion.chunk" : "chat.completion"},
-        {"usage", json {
-            {"completion_tokens", num_tokens_predicted},
-            {"prompt_tokens", num_prompt_tokens},
-            {"total_tokens", num_tokens_predicted + num_prompt_tokens}
-        }},
-        {"id", completion_id}
-    };
-
-    // extra fields for debugging purposes
-    if (verbose) {
-        res["__verbose"] = result;
-    }
-
-    if (result.contains("completion_probabilities")) {
-        res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array());
-    }
-
-    return res;
-}
-
-// return value is vector as there is one case where we might need to generate two responses
-static std::vector<json> format_partial_response_oaicompat(const json & result, const std::string & completion_id) {
-    if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) {
-        return std::vector<json>({result});
-    }
-
-    bool first = json_value(result, "oaicompat_token_ctr", 0) == 0;
-    std::string modelname = json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
-
-    bool stopped_word = json_value(result, "stopped_word", false);
-    bool stopped_eos = json_value(result, "stopped_eos", false);
-    bool stopped_limit = json_value(result, "stopped_limit", false);
-    std::string content = json_value(result, "content", std::string(""));
-
-    std::string finish_reason;
-    if (stopped_word || stopped_eos) {
-        finish_reason = "stop";
-    }
-    if (stopped_limit) {
-        finish_reason = "length";
-    }
-
-    std::time_t t = std::time(0);
-
-    json choices;
-
-    if (!finish_reason.empty()) {
-        choices = json::array({json{{"finish_reason", finish_reason},
-                                    {"index", 0},
-                                    {"delta", json::object()}}});
-    } else {
-        if (first) {
-            if (content.empty()) {
-                choices = json::array({json{{"finish_reason", nullptr},
-                                            {"index", 0},
-                                            {"delta", json{{"role", "assistant"}}}}});
-            } else {
-                // We have to send this as two updates to conform to openai behavior
-                json initial_ret = json{{"choices", json::array({json{
-                                        {"finish_reason", nullptr},
-                                        {"index", 0},
-                                        {"delta", json{
-                                            {"role", "assistant"}
-                                        }}}})},
-                            {"created", t},
-                            {"id", completion_id},
-                            {"model", modelname},
-                            {"object", "chat.completion.chunk"}};
-
-                json second_ret = json{
-                            {"choices", json::array({json{{"finish_reason", nullptr},
-                                                            {"index", 0},
-                                                            {"delta", json{
-                                                            {"content", content}}}
-                                                            }})},
-                            {"created", t},
-                            {"id", completion_id},
-                            {"model", modelname},
-                            {"object", "chat.completion.chunk"}};
-
-                return std::vector<json>({initial_ret, second_ret});
-            }
-        } else {
-            // Some idiosyncrasy in task processing logic makes several trailing calls
-            // with empty content, we ignore these at the calee site.
-            if (content.empty()) {
-                return std::vector<json>({json::object()});
-            }
-
-            choices = json::array({json{
-                {"finish_reason", nullptr},
-                {"index", 0},
-                {"delta",
-                json{
-                    {"content", content},
-                }},
-            }});
-        }
-    }
-
-    json ret = json {
-        {"choices", choices},
-        {"created", t},
-        {"id", completion_id},
-        {"model", modelname},
-        {"object", "chat.completion.chunk"}
-    };
-    if (!finish_reason.empty()) {
-        int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
-        int num_prompt_tokens = json_value(result, "tokens_evaluated", 0);
-        ret.push_back({"usage", json {
-            {"completion_tokens", num_tokens_predicted},
-            {"prompt_tokens", num_prompt_tokens},
-            {"total_tokens", num_tokens_predicted + num_prompt_tokens}
-        }});
-    }
-
-    return std::vector<json>({ret});
-}
-
 static json format_embeddings_response_oaicompat(const json & request, const json & embeddings) {
     json data = json::array();
+    int32_t n_tokens = 0;
     int i = 0;
     for (const auto & elem : embeddings) {
         data.push_back(json{
@@ -532,14 +599,16 @@ static json format_embeddings_response_oaicompat(const json & request, const jso
             {"index", i++},
             {"object", "embedding"}
         });
+
+        n_tokens += json_value(elem, "tokens_evaluated", 0);
     }
 
     json res = json {
         {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
         {"object", "list"},
-        {"usage", json {
-            {"prompt_tokens",
-            {"total_tokens",
+        {"usage", json {
+            {"prompt_tokens", n_tokens},
+            {"total_tokens", n_tokens}
         }},
         {"data", data}
     };
@@ -549,20 +618,23 @@ static json format_embeddings_response_oaicompat(const json & request, const jso
 
 static json format_response_rerank(const json & request, const json & ranks) {
     json data = json::array();
+    int32_t n_tokens = 0;
     int i = 0;
     for (const auto & rank : ranks) {
         data.push_back(json{
             {"index", i++},
            {"relevance_score", json_value(rank, "score", 0.0)},
         });
+
+        n_tokens += json_value(rank, "tokens_evaluated", 0);
     }
 
     json res = json {
         {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
         {"object", "list"},
-        {"usage", json {
-            {"prompt_tokens",
-            {"total_tokens",
+        {"usage", json {
+            {"prompt_tokens", n_tokens},
+            {"total_tokens", n_tokens}
        }},
         {"results", data}
     };
@@ -615,42 +687,47 @@ static json format_detokenized_response(const std::string & content) {
     };
 }
 
-static json
-
-
-
-
-
-            break;
-        case ERROR_TYPE_AUTHENTICATION:
-            type_str = "authentication_error";
-            code = 401;
-            break;
-        case ERROR_TYPE_NOT_FOUND:
-            type_str = "not_found_error";
-            code = 404;
-            break;
-        case ERROR_TYPE_SERVER:
-            type_str = "server_error";
-            code = 500;
-            break;
-        case ERROR_TYPE_PERMISSION:
-            type_str = "permission_error";
-            code = 403;
-            break;
-        case ERROR_TYPE_NOT_SUPPORTED:
-            type_str = "not_supported_error";
-            code = 501;
-            break;
-        case ERROR_TYPE_UNAVAILABLE:
-            type_str = "unavailable_error";
-            code = 503;
-            break;
+static json format_logit_bias(const std::vector<llama_logit_bias> & logit_bias) {
+    json data = json::array();
+    for (const auto & lb : logit_bias) {
+        data.push_back(json{
+            {"bias", lb.bias},
+            {"token", lb.token},
+        });
     }
-    return
-
-
-
-
+    return data;
+}
+
+static std::string safe_json_to_str(json data) {
+    return data.dump(-1, ' ', false, json::error_handler_t::replace);
+}
+
+static std::vector<llama_token_data> get_token_probabilities(llama_context * ctx, int idx) {
+    std::vector<llama_token_data> cur;
+    const auto * logits = llama_get_logits_ith(ctx, idx);
+    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
+
+    cur.resize(n_vocab);
+    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+        cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
+    }
+
+    // sort tokens by logits
+    std::sort(cur.begin(), cur.end(), [](const llama_token_data & a, const llama_token_data & b) {
+        return a.logit > b.logit;
+    });
+
+    // apply softmax
+    float max_l = cur[0].logit;
+    float cum_sum = 0.0f;
+    for (size_t i = 0; i < cur.size(); ++i) {
+        float p = expf(cur[i].logit - max_l);
+        cur[i].p = p;
+        cum_sum += p;
+    }
+    for (size_t i = 0; i < cur.size(); ++i) {
+        cur[i].p /= cum_sum;
+    }
+
+    return cur;
 }