@fugood/llama.node 0.3.16 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +6 -1
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +44 -2
- package/lib/index.js +132 -1
- package/lib/index.ts +203 -3
- package/package.json +2 -1
- package/src/EmbeddingWorker.cpp +1 -1
- package/src/LlamaCompletionWorker.cpp +374 -19
- package/src/LlamaCompletionWorker.h +31 -10
- package/src/LlamaContext.cpp +216 -7
- package/src/LlamaContext.h +12 -0
- package/src/common.hpp +15 -0
- package/src/llama.cpp/.github/workflows/build-linux-cross.yml +233 -0
- package/src/llama.cpp/.github/workflows/build.yml +89 -767
- package/src/llama.cpp/.github/workflows/docker.yml +9 -6
- package/src/llama.cpp/.github/workflows/release.yml +716 -0
- package/src/llama.cpp/.github/workflows/server.yml +19 -23
- package/src/llama.cpp/CMakeLists.txt +11 -1
- package/src/llama.cpp/cmake/build-info.cmake +8 -2
- package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
- package/src/llama.cpp/common/CMakeLists.txt +35 -4
- package/src/llama.cpp/common/arg.cpp +844 -121
- package/src/llama.cpp/common/arg.h +9 -0
- package/src/llama.cpp/common/chat.cpp +129 -107
- package/src/llama.cpp/common/chat.h +2 -0
- package/src/llama.cpp/common/common.cpp +64 -518
- package/src/llama.cpp/common/common.h +35 -45
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
- package/src/llama.cpp/common/llguidance.cpp +31 -47
- package/src/llama.cpp/common/minja/chat-template.hpp +23 -11
- package/src/llama.cpp/common/minja/minja.hpp +186 -127
- package/src/llama.cpp/common/regex-partial.cpp +204 -0
- package/src/llama.cpp/common/regex-partial.h +56 -0
- package/src/llama.cpp/common/sampling.cpp +60 -50
- package/src/llama.cpp/docs/build.md +122 -7
- package/src/llama.cpp/examples/CMakeLists.txt +2 -32
- package/src/llama.cpp/examples/batched/batched.cpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +9 -12
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/parallel/parallel.cpp +89 -15
- package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
- package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
- package/src/llama.cpp/examples/sycl/build.sh +2 -2
- package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
- package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/training/finetune.cpp +96 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +35 -2
- package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
- package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
- package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
- package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
- package/src/llama.cpp/ggml/include/ggml.h +76 -106
- package/src/llama.cpp/ggml/src/CMakeLists.txt +11 -8
- package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
- package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +66 -33
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -194
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1060 -410
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1008 -13533
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +31 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +90 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +266 -72
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1034 -88
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8796 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +252 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
- package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +106 -14
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -262
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
- package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +307 -40
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +125 -45
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +10 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +239 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +9 -307
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +944 -438
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +507 -411
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
- package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +83 -49
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1278 -282
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +32 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +133 -30
- package/src/llama.cpp/ggml/src/ggml.c +170 -265
- package/src/llama.cpp/ggml/src/gguf.cpp +34 -33
- package/src/llama.cpp/include/llama.h +82 -22
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +5 -3
- package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
- package/src/llama.cpp/scripts/xxd.cmake +1 -1
- package/src/llama.cpp/src/CMakeLists.txt +4 -2
- package/src/llama.cpp/src/llama-adapter.cpp +43 -1
- package/src/llama.cpp/src/llama-arch.cpp +163 -17
- package/src/llama.cpp/src/llama-arch.h +16 -0
- package/src/llama.cpp/src/llama-batch.cpp +5 -1
- package/src/llama.cpp/src/llama-batch.h +2 -1
- package/src/llama.cpp/src/llama-chat.cpp +91 -16
- package/src/llama.cpp/src/llama-chat.h +7 -2
- package/src/llama.cpp/src/llama-context.cpp +479 -575
- package/src/llama.cpp/src/llama-context.h +44 -33
- package/src/llama.cpp/src/llama-cparams.h +1 -0
- package/src/llama.cpp/src/llama-graph.cpp +209 -157
- package/src/llama.cpp/src/llama-graph.h +38 -14
- package/src/llama.cpp/src/llama-hparams.h +13 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +1604 -543
- package/src/llama.cpp/src/llama-kv-cache.h +283 -171
- package/src/llama.cpp/src/llama-memory.h +12 -2
- package/src/llama.cpp/src/llama-mmap.cpp +1 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +34 -20
- package/src/llama.cpp/src/llama-model-loader.h +5 -3
- package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
- package/src/llama.cpp/src/llama-model-saver.h +37 -0
- package/src/llama.cpp/src/llama-model.cpp +1803 -330
- package/src/llama.cpp/src/llama-model.h +21 -2
- package/src/llama.cpp/src/llama-quant.cpp +33 -10
- package/src/llama.cpp/src/llama-sampling.cpp +25 -7
- package/src/llama.cpp/src/llama-vocab.cpp +86 -10
- package/src/llama.cpp/src/llama-vocab.h +6 -0
- package/src/llama.cpp/src/llama.cpp +15 -1
- package/src/llama.cpp/tests/CMakeLists.txt +52 -31
- package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
- package/src/llama.cpp/tests/test-backend-ops.cpp +189 -90
- package/src/llama.cpp/tests/test-chat-template.cpp +26 -6
- package/src/llama.cpp/tests/test-chat.cpp +15 -3
- package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
- package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
- package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
- package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
- package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
- package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
- package/src/llama.cpp/tests/test-opt.cpp +33 -21
- package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
- package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
- package/src/llama.cpp/tests/test-sampling.cpp +1 -1
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
- package/src/llama.cpp/tools/CMakeLists.txt +39 -0
- package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +3 -3
- package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +1 -1
- package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +15 -16
- package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
- package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +623 -274
- package/src/llama.cpp/{examples → tools}/main/main.cpp +22 -14
- package/src/llama.cpp/tools/mtmd/CMakeLists.txt +47 -0
- package/src/llama.cpp/tools/mtmd/clip-impl.h +365 -0
- package/src/llama.cpp/tools/mtmd/clip.cpp +3646 -0
- package/src/llama.cpp/tools/mtmd/clip.h +99 -0
- package/src/llama.cpp/tools/mtmd/deprecation-warning.cpp +22 -0
- package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +370 -0
- package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
- package/src/llama.cpp/tools/mtmd/mtmd.cpp +678 -0
- package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
- package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +21 -5
- package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +53 -3
- package/src/llama.cpp/tools/rpc/CMakeLists.txt +4 -0
- package/src/llama.cpp/tools/rpc/rpc-server.cpp +322 -0
- package/src/llama.cpp/tools/run/CMakeLists.txt +16 -0
- package/src/llama.cpp/{examples → tools}/run/run.cpp +30 -30
- package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
- package/src/llama.cpp/{examples → tools}/server/httplib.h +313 -247
- package/src/llama.cpp/{examples → tools}/server/server.cpp +529 -215
- package/src/llama.cpp/{examples → tools}/server/utils.hpp +427 -6
- package/src/llama.cpp/{examples → tools}/tts/tts.cpp +6 -9
- package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/infill/infill.cpp +0 -590
- package/src/llama.cpp/examples/llava/CMakeLists.txt +0 -66
- package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
- package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
- package/src/llama.cpp/examples/llava/clip.cpp +0 -3206
- package/src/llama.cpp/examples/llava/clip.h +0 -118
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
- package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
- package/src/llama.cpp/examples/llava/llava.cpp +0 -574
- package/src/llama.cpp/examples/llava/llava.h +0 -49
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
- package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +0 -584
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
- package/src/llama.cpp/examples/rpc/CMakeLists.txt +0 -2
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +0 -171
- package/src/llama.cpp/examples/run/CMakeLists.txt +0 -5
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
- /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
- /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0

package/src/llama.cpp/{examples → tools}/server/utils.hpp

@@ -3,7 +3,9 @@
 #include "common.h"
 #include "log.h"
 #include "llama.h"
-#include "
+#include "arg.h" // common_remote_get_content
+#include "base64.hpp"
+#include "mtmd.h"
 
 // increase max payload length to allow use of larger context size
 #define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
@@ -21,6 +23,7 @@
 #include <string>
 #include <vector>
 #include <memory>
+#include <cinttypes>
 
 #define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo"
 
@@ -41,6 +44,8 @@ using json = nlohmann::ordered_json;
 #define QUE_ERR(fmt, ...) LOG_ERR("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
 #define QUE_DBG(fmt, ...) LOG_DBG("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
 
+using raw_buffer = std::vector<uint8_t>;
+
 template <typename T>
 static T json_value(const json & body, const std::string & key, const T & default_value) {
     // Fallback null to default value
@@ -58,6 +63,32 @@ static T json_value(const json & body, const std::string & key, const T & default_value) {
 
 const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT);
 
+// thin wrapper around common_grammar_trigger with (de)serialization functions
+struct server_grammar_trigger {
+    common_grammar_trigger value;
+
+    server_grammar_trigger() = default;
+    server_grammar_trigger(const common_grammar_trigger & value) : value(value) {}
+    server_grammar_trigger(const json & in) {
+        value.type = (common_grammar_trigger_type) in.at("type").get<int>();
+        value.value = in.at("value").get<std::string>();
+        if (value.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
+            value.token = (llama_token) in.at("token").get<int>();
+        }
+    }
+
+    json to_json() const {
+        json out {
+            {"type", (int) value.type},
+            {"value", value.value},
+        };
+        if (value.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
+            out["token"] = (int) value.token;
+        }
+        return out;
+    }
+};
+
 //
 // tokenizer and input processing utils
 //
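
For illustration (not part of the diff above): a minimal sketch of how the new `server_grammar_trigger` wrapper round-trips a lazy-grammar trigger through JSON. Only the `type`/`value`/`token` fields come from the code above; the concrete trigger text and token id are made up.

```cpp
// Sketch only: assumes common.h and the nlohmann `json` alias are available as in utils.hpp.
static void sketch_grammar_trigger_roundtrip() {
    common_grammar_trigger trig;
    trig.type  = COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN;
    trig.value = "<tool_call>"; // hypothetical trigger text
    trig.token = 128010;        // hypothetical token id

    server_grammar_trigger wrapped(trig);
    json j = wrapped.to_json(); // {"type": ..., "value": "<tool_call>", "token": 128010}

    server_grammar_trigger restored(j); // deserialize again on the receiving side
    GGML_ASSERT(restored.value.token == trig.token);
}
```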
@@ -360,7 +391,7 @@ static inline bool is_base64(uint8_t c) {
     return (isalnum(c) || (c == '+') || (c == '/'));
 }
 
-static inline std::vector<uint8_t> base64_decode(const std::string & encoded_string) {
+static inline raw_buffer base64_decode(const std::string & encoded_string) {
     int i = 0;
     int j = 0;
     int in_ = 0;
@@ -370,7 +401,7 @@ static inline std::vector<uint8_t> base64_decode(const std::string & encoded_string) {
     uint8_t char_array_4[4];
     uint8_t char_array_3[3];
 
-    std::vector<uint8_t> ret;
+    raw_buffer ret;
 
     while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_])) {
         char_array_4[i++] = encoded_string[in_]; in_++;
@@ -552,8 +583,11 @@ static json oaicompat_completion_params_parse(const json & body) {
 static json oaicompat_completion_params_parse(
     const json & body, /* openai api json semantics */
     bool use_jinja,
+    bool prefill_assistant,
     common_reasoning_format reasoning_format,
-    const struct common_chat_templates * tmpls)
+    const struct common_chat_templates * tmpls,
+    bool allow_non_text,
+    std::vector<raw_buffer> & out_files)
 {
     json llama_params;
 
@@ -601,8 +635,89 @@ static json oaicompat_completion_params_parse(
         }
     }
 
+    // get input files
+    if (!body.contains("messages")) {
+        throw std::runtime_error("'messages' is required");
+    }
+    json messages = body.at("messages");
+    if (!messages.is_array()) {
+        throw std::runtime_error("Expected 'messages' to be an array");
+    }
+    for (auto & msg : messages) {
+        std::string role = json_value(msg, "role", std::string());
+        if (role != "assistant" && !msg.contains("content")) {
+            throw std::runtime_error("All non-assistant messages must contain 'content'");
+        }
+        if (role == "assistant") {
+            if (!msg.contains("content") && !msg.contains("tool_calls")) {
+                throw std::runtime_error("Assistant message must contain either 'content' or 'tool_calls'!");
+            }
+            if (!msg.contains("content")) {
+                continue; // avoid errors with no content
+            }
+        }
+        json & content = msg.at("content");
+        if (content.is_string() || content.is_null()) {
+            continue;
+        }
+
+        if (!content.is_array()) {
+            throw std::runtime_error("Expected 'content' to be a string or an array");
+        }
+
+        for (auto & p : content) {
+            std::string type = json_value(p, "type", std::string());
+            json image_url = json_value(p, "image_url", json::object());
+            if (type == "image_url") {
+                if (!allow_non_text) {
+                    throw std::runtime_error("image input is not supported by this server");
+                }
+
+                std::string url = json_value(image_url, "url", std::string());
+                if (string_starts_with(url, "http")) {
+                    // download remote image
+                    // TODO @ngxson : maybe make these params configurable
+                    common_remote_params params;
+                    params.headers.push_back("User-Agent: llama.cpp/" + build_info);
+                    params.max_size = 1024 * 1024 * 10; // 10MB
+                    params.timeout = 10; // seconds
+                    SRV_INF("downloading image from '%s'\n", url.c_str());
+                    auto res = common_remote_get_content(url, params);
+                    if (200 <= res.first && res.first < 300) {
+                        SRV_INF("downloaded %ld bytes\n", res.second.size());
+                        raw_buffer data;
+                        data.insert(data.end(), res.second.begin(), res.second.end());
+                        out_files.push_back(data);
+                    } else {
+                        throw std::runtime_error("Failed to download image");
+                    }
+
+                } else {
+                    // try to decode base64 image
+                    std::vector<std::string> parts = string_split<std::string>(url, /*separator*/ ',');
+                    if (parts.size() != 2) {
+                        throw std::runtime_error("Invalid image_url.url value");
+                    } else if (!string_starts_with(parts[0], "data:image/")) {
+                        throw std::runtime_error("Invalid image_url.url format: " + parts[0]);
+                    } else if (!string_ends_with(parts[0], "base64")) {
+                        throw std::runtime_error("image_url.url must be base64 encoded");
+                    } else {
+                        auto base64_data = parts[1];
+                        auto decoded_data = base64_decode(base64_data);
+                        out_files.push_back(decoded_data);
+                    }
+                }
+
+                // replace this chunk with a marker
+                p["type"] = "text";
+                p["text"] = MTMD_DEFAULT_IMAGE_MARKER;
+                p.erase("image_url");
+            }
+        }
+    }
+
     common_chat_templates_inputs inputs;
-    inputs.messages = common_chat_msgs_parse_oaicompat(body.at("messages"));
+    inputs.messages = common_chat_msgs_parse_oaicompat(messages);
     inputs.tools = common_chat_tools_parse_oaicompat(tools);
     inputs.tool_choice = common_chat_tool_choice_parse_oaicompat(json_value(body, "tool_choice", std::string("auto")));
     inputs.json_schema = json_schema.is_null() ? "" : json_schema.dump();
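
As a rough sketch (not part of the diff), this is the request shape the new `messages` pre-processing loop accepts: an `image_url` content part carrying either an `http(s)` URL or a base64 `data:` URL. The part is rewritten in place to a text part containing `MTMD_DEFAULT_IMAGE_MARKER`, and the decoded bytes are appended to `out_files`. The base64 payload below is only a placeholder.

```cpp
// Sketch only: builds a chat body of the shape accepted by the parsing loop above,
// using the same nlohmann `json` alias as utils.hpp.
static json make_image_chat_body() {
    json text_part;
    text_part["type"] = "text";
    text_part["text"] = "What is in this picture?";

    json image_part;
    image_part["type"]             = "image_url";
    image_part["image_url"]["url"] = "data:image/png;base64,...."; // placeholder payload

    json user_msg;
    user_msg["role"]    = "user";
    user_msg["content"] = json::array({ text_part, image_part });

    json body;
    body["messages"] = json::array({ user_msg });
    return body;
}
```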
@@ -616,9 +731,31 @@ static json oaicompat_completion_params_parse(
         throw std::runtime_error("Cannot use custom grammar constraints with tools.");
     }
 
+    // if the assistant message appears at the end of list, we do not add end-of-turn token
+    // for ex. this can be useful to modify the reasoning process in reasoning models
+    bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant" && prefill_assistant;
+    common_chat_msg last_message;
+    if (prefill_assistant_message) {
+        last_message = inputs.messages.back();
+        inputs.messages.pop_back();
+
+        /* sanity check, max one assistant message at the end of the list */
+        if (!inputs.messages.empty() && inputs.messages.back().role == "assistant"){
+            throw std::runtime_error("Cannot have 2 or more assistant messages at the end of the list.");
+        }
+
+        inputs.extract_reasoning = false;
+        inputs.add_generation_prompt = true;
+    }
+
     // Apply chat template to the list of messages
     auto chat_params = common_chat_templates_apply(tmpls, inputs);
 
+    /* Append assistant prefilled message */
+    if (prefill_assistant_message) {
+        chat_params.prompt += last_message.content;
+    }
+
     llama_params["chat_format"] = static_cast<int>(chat_params.format);
     llama_params["prompt"] = chat_params.prompt;
     if (!chat_params.grammar.empty()) {
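
A small sketch (not part of the diff) of a request that exercises the assistant-prefill path: the final message is an assistant message, which the code above pops from the list, applies the chat template without it, and then appends its content to the prompt so the model continues the partial reply instead of opening a new turn.

```cpp
// Sketch only: a message list whose last entry is an assistant prefix to be continued.
static json make_prefill_messages() {
    json user_msg;
    user_msg["role"]    = "user";
    user_msg["content"] = "Summarize the build instructions.";

    json assistant_prefix;
    assistant_prefix["role"]    = "assistant";
    assistant_prefix["content"] = "Sure, the key steps are:"; // completion continues from here

    return json::array({ user_msg, assistant_prefix });
}
```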
@@ -627,7 +764,8 @@ static json oaicompat_completion_params_parse(
         llama_params["grammar_lazy"] = chat_params.grammar_lazy;
         auto grammar_triggers = json::array();
         for (const auto & trigger : chat_params.grammar_triggers) {
-            grammar_triggers.push_back(trigger.to_json<json>());
+            server_grammar_trigger ct(trigger);
+            grammar_triggers.push_back(ct.to_json());
         }
         llama_params["grammar_triggers"] = grammar_triggers;
         llama_params["preserved_tokens"] = chat_params.preserved_tokens;
@@ -886,3 +1024,286 @@ static std::vector<common_adapter_lora_info> parse_lora_request(
 
     return lora;
 }
+
+//
+// utils for interacting with libmtmd
+// (may need to refactor in near future)
+//
+
+/**
+ * server_tokens is a helper to manage the input tokens and image for the server.
+ * it is made this way to simplify the logic of KV cache management.
+ */
+struct server_tokens {
+    bool has_mtmd = false;
+
+private: // disallow accessing these members directly, risking out-of-sync
+
+    // map a **start** position in tokens to the image chunk
+    std::unordered_map<llama_pos, mtmd::input_chunk_ptr> map_pos_to_image;
+
+    // list of tokens
+    // it can include LLAMA_TOKEN_NULL, which is used to indicate a token that is not a text token
+    // a mtmd_input_chunk can occupy multiple tokens, one llama_token per **position**
+    // important: for models using mrope, an image can contain multiple tokens but will use only one **position**
+    llama_tokens tokens;
+
+    // for ex. with input of 5 text tokens and 2 images:
+    //      [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1]
+    // pos   0   1   2   3   4    5      6      7      8      9
+    // map_pos_to_image will contain: {5, img0}, {8, img1}
+
+public:
+    server_tokens() = default;
+    ~server_tokens() = default;
+
+    // Prevent copying
+    server_tokens(const server_tokens&) = delete;
+    server_tokens& operator=(const server_tokens&) = delete;
+
+    // Allow moving (usually implicitly generated if members are movable)
+    server_tokens(server_tokens&&) = default;
+    server_tokens& operator=(server_tokens&&) = default;
+
+    // Allow accessing elements using [] operator
+    llama_token operator[](size_t index) { return tokens[index]; }
+    const llama_token& operator[](size_t index) const { return tokens[index]; }
+
+    server_tokens(mtmd::input_chunks & mtmd_chunks, bool has_mtmd) : has_mtmd(has_mtmd) {
+        for (size_t i = 0; i < mtmd_chunks.size(); ++i) {
+            push_back(mtmd_chunks[i]);
+        }
+    }
+
+    server_tokens(llama_tokens & tokens, bool has_mtmd) : has_mtmd(has_mtmd), tokens(tokens) {}
+
+    // for debugging
+    std::string str() const {
+        std::ostringstream oss;
+        oss << "tokens: ";
+        for (const auto & t : tokens) {
+            if (t == LLAMA_TOKEN_NULL) {
+                oss << "<embd> ";
+            } else {
+                oss << t << " ";
+            }
+        }
+        oss << "\n";
+        oss << "image pos: ";
+        for (const auto & it : map_pos_to_image) {
+            oss << it.first << ", ";
+        }
+        return oss.str();
+    }
+
+    const mtmd::input_chunk_ptr & find_chunk(llama_pos pos) const {
+        auto it = map_pos_to_image.find(pos);
+        if (it != map_pos_to_image.end()) {
+            return it->second;
+        } else {
+            throw std::runtime_error("Chunk not found");
+        }
+    }
+
+    void push_back(llama_token tok) {
+        if (tok == LLAMA_TOKEN_NULL) {
+            throw std::runtime_error("Invalid token");
+        }
+        tokens.emplace_back(tok);
+    }
+
+    // will create a copy of the chunk if it contains non-text data
+    void push_back(const mtmd_input_chunk * chunk) {
+        auto type = mtmd_input_chunk_get_type(chunk);
+        if (type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
+            GGML_ASSERT(has_mtmd);
+            auto img_tokens = mtmd_input_chunk_get_tokens_image(chunk);
+            const int n_pos = mtmd_image_tokens_get_n_pos(img_tokens);
+            llama_pos start_pos = tokens.size();
+            for (int i = 0; i < n_pos; ++i) {
+                tokens.emplace_back(LLAMA_TOKEN_NULL);
+            }
+            mtmd::input_chunk_ptr new_chunk(mtmd_input_chunk_copy(chunk));
+            map_pos_to_image[start_pos] = std::move(new_chunk);
+        } else if (type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
+            size_t n_tokens;
+            auto text_tokens = mtmd_input_chunk_get_tokens_text(chunk, &n_tokens);
+            for (size_t i = 0; i < n_tokens; ++i) {
+                push_back(text_tokens[i]);
+            }
+        } else {
+            GGML_ABORT("Invalid chunk type");
+        }
+    }
+
+    // for compatibility with context shift and prompt truncation
+    void insert(const llama_tokens & inp_tokens) {
+        GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
+        tokens.insert(tokens.end(), inp_tokens.begin(), inp_tokens.end());
+    }
+
+    // for compatibility with speculative decoding, ctx shift, slot save/load
+    const llama_tokens & get_text_tokens() const {
+        GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
+        return tokens;
+    }
+
+    // for compatibility with speculative decoding
+    void set_token(llama_pos pos, llama_token id) {
+        GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
+        tokens[pos] = id;
+    }
+
+    size_t size() const {
+        return tokens.size();
+    }
+
+    bool empty() const {
+        return tokens.empty();
+    }
+
+    void clear() {
+        tokens.clear();
+    }
+
+    void keep_first(size_t n) {
+        GGML_ASSERT(n <= tokens.size());
+        if (has_mtmd) {
+            // we throw an error if we try to remove a token in the middle of an image
+            // for ex. with input of 5 text tokens and 2 images:
+            //    [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1]
+            // n   1   2   3   4   5    6      7      8      9     10
+            // allowed to resize       ^                    ^
+            // disallowed to resize           ^      ^            ^
+            if (n > 0) {
+                llama_token last_token = tokens[n - 1];
+                // make sure we never remove tokens in the middle of an image
+                if (last_token == LLAMA_TOKEN_NULL) {
+                    find_chunk(n - 1); // will throw an error if the token is not begin-of-chunk
+                }
+            }
+            // remove all image chunks that are not used anymore
+            for (auto it = map_pos_to_image.begin(); it != map_pos_to_image.end(); ) {
+                llama_pos pos = it->first;
+                if (pos >= (llama_pos)n) {
+                    it = map_pos_to_image.erase(it);
+                } else {
+                    ++it;
+                }
+            }
+        }
+        tokens.resize(n);
+    }
+
+    std::string detokenize(const llama_context * ctx, bool special) const {
+        llama_tokens text_tokens;
+        text_tokens.reserve(tokens.size());
+        for (const auto & t : tokens) {
+            if (t != LLAMA_TOKEN_NULL) {
+                text_tokens.push_back(t);
+            }
+        }
+        return common_detokenize(ctx, text_tokens, special);
+    }
+
+    size_t get_common_prefix(const server_tokens & b) const {
+        size_t max_idx = std::min(tokens.size(), b.tokens.size());
+        for (size_t i = 0; i < max_idx; ++i) {
+            auto & ai = tokens[i];
+            auto & bi = b.tokens[i];
+
+            if (ai == LLAMA_TOKEN_NULL && bi == LLAMA_TOKEN_NULL) {
+                GGML_ASSERT(has_mtmd);
+                const auto & a_chunk = find_chunk(i);
+                const auto & b_chunk = b.find_chunk(i);
+                GGML_ASSERT(a_chunk && b_chunk);
+                const auto * a_img = mtmd_input_chunk_get_tokens_image(a_chunk.get());
+                const auto * b_img = mtmd_input_chunk_get_tokens_image(b_chunk.get());
+                std::string ai_id = mtmd_image_tokens_get_id(a_img);
+                std::string bi_id = mtmd_image_tokens_get_id(b_img);
+                size_t a_pos = mtmd_image_tokens_get_n_pos(a_img);
+                size_t b_pos = mtmd_image_tokens_get_n_pos(b_img);
+                if (ai_id == bi_id && a_pos == b_pos) {
+                    GGML_ASSERT(a_pos > 0 && "Invalid image token"); // should never happen
+                    i += a_pos - 1; // will be +1 by the for loop
+                    continue;
+                } else {
+                    return i;
+                }
+            } else if (ai == bi) {
+                continue;
+            } else {
+                return i;
+            }
+        }
+        return max_idx; // all tokens are equal
+    }
+
+    // make sure all text tokens are within the vocab range
+    bool validate(const struct llama_context * ctx) const {
+        const llama_model * model = llama_get_model(ctx);
+        const llama_vocab * vocab = llama_model_get_vocab(model);
+        const int32_t n_vocab = llama_vocab_n_tokens(vocab);
+
+        for (size_t i = 0; i < tokens.size(); ++i) {
+            auto & t = tokens[i];
+            if (t == LLAMA_TOKEN_NULL) {
+                try {
+                    const auto & chunk = find_chunk(i);
+                    const auto * img_tokens = mtmd_input_chunk_get_tokens_image(chunk.get());
+                    size_t n_pos = mtmd_image_tokens_get_n_pos(img_tokens);
+                    i += n_pos - 1; // will be +1 by the for loop
+                } catch (const std::exception & e) {
+                    return false;
+                }
+            } else if (t < 0 || t >= n_vocab) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    // encode and decode the image chunk
+    int32_t process_chunk(
+                llama_context * ctx,
+                mtmd_context * mctx,
+                llama_pos n_past,
+                int32_t seq_id,
+                llama_pos & n_pos_out) {
+        auto it = map_pos_to_image.find(n_past);
+        if (it == map_pos_to_image.end()) {
+            throw std::runtime_error("Chunk not found");
+        }
+        SRV_INF("%s\n", "processing image...");
+        int32_t n_batch = llama_n_batch(ctx);
+        int64_t t0 = ggml_time_ms();
+        llama_pos new_n_past = n_past;
+        int32_t result = mtmd_helper_eval_chunk_single(mctx, ctx,
+            it->second.get(), // chunk
+            n_past,
+            seq_id,
+            n_batch,
+            true, // logits last
+            &new_n_past);
+        SRV_INF("image processed in %" PRId64 " ms\n", ggml_time_ms() - t0);
+        if (result != 0) {
+            LOG_ERR("mtmd_helper_eval failed with status %d", result);
+            n_pos_out = n_past;
+            return result;
+        }
+        n_pos_out = new_n_past;
+        return 0;
+    }
+};
+
+// Computes FNV-1a hash of the data
+static std::string fnv_hash(const uint8_t * data, size_t len) {
+    const uint64_t fnv_prime = 0x100000001b3ULL;
+    uint64_t hash = 0xcbf29ce484222325ULL;
+
+    for (size_t i = 0; i < len; ++i) {
+        hash ^= data[i];
+        hash *= fnv_prime;
+    }
+    return std::to_string(hash);
+}
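
`fnv_hash` above is a plain 64-bit FNV-1a over a byte buffer, returned as a decimal string. A standalone sketch (not part of the diff) of the same scheme, runnable on its own; for empty input the loop never executes, so the result is simply the FNV offset basis `0xcbf29ce484222325`.

```cpp
// Sketch only: standalone 64-bit FNV-1a, mirroring the fnv_hash helper above.
#include <cstddef>
#include <cstdint>
#include <cstdio>

static uint64_t fnv1a_64(const uint8_t * data, size_t len) {
    const uint64_t fnv_prime = 0x100000001b3ULL;
    uint64_t hash = 0xcbf29ce484222325ULL; // offset basis
    for (size_t i = 0; i < len; ++i) {
        hash ^= data[i];   // xor the byte in first ...
        hash *= fnv_prime; // ... then multiply: this order is what makes it FNV-1a
    }
    return hash;
}

int main() {
    const uint8_t sample[] = { 'l', 'l', 'a', 'm', 'a' };
    std::printf("fnv1a(empty)   = %llu\n", (unsigned long long) fnv1a_64(nullptr, 0));
    std::printf("fnv1a(\"llama\") = %llu\n", (unsigned long long) fnv1a_64(sample, sizeof(sample)));
    return 0;
}
```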

package/src/llama.cpp/{examples → tools}/tts/tts.cpp

@@ -577,12 +577,7 @@ int main(int argc, char ** argv) {
 
     const llama_vocab * vocab = llama_model_get_vocab(model_ttc);
 
-
-    params.model = params.vocoder.model;
-    params.model_url = params.vocoder.model_url;
-    params.hf_repo = params.vocoder.hf_repo;
-    params.hf_file = params.vocoder.hf_file;
-
+    params.model = params.vocoder.model;
     params.embedding = true;
 
     common_init_result llama_init_cts = common_init_from_params(params);
@@ -699,11 +694,13 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14
     const std::string voice_data = audio_data;
 
     auto tmp = common_tokenize(vocab, voice_data, false, true);
-    printf("\n\n");
+
+    std::ostringstream tokens_oss;
     for (size_t i = 0; i < tmp.size(); ++i) {
-        printf("%d, ", tmp[i]);
+        tokens_oss << tmp[i] << ", ";
     }
-    printf("\n\n");
+
+    LOG_INF("\n\n%s: llama tokens: %s\n\n", __func__, tokens_oss.str().c_str());
+
     prompt_add(prompt_inp, tmp);
 #else
     prompt_add(prompt_inp, llama_tokens {