@fugood/llama.node 0.3.16 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +6 -1
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +44 -2
- package/lib/index.js +132 -1
- package/lib/index.ts +203 -3
- package/package.json +2 -1
- package/src/EmbeddingWorker.cpp +1 -1
- package/src/LlamaCompletionWorker.cpp +374 -19
- package/src/LlamaCompletionWorker.h +31 -10
- package/src/LlamaContext.cpp +216 -7
- package/src/LlamaContext.h +12 -0
- package/src/common.hpp +15 -0
- package/src/llama.cpp/.github/workflows/build-linux-cross.yml +233 -0
- package/src/llama.cpp/.github/workflows/build.yml +89 -767
- package/src/llama.cpp/.github/workflows/docker.yml +9 -6
- package/src/llama.cpp/.github/workflows/release.yml +716 -0
- package/src/llama.cpp/.github/workflows/server.yml +19 -23
- package/src/llama.cpp/CMakeLists.txt +11 -1
- package/src/llama.cpp/cmake/build-info.cmake +8 -2
- package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
- package/src/llama.cpp/common/CMakeLists.txt +35 -4
- package/src/llama.cpp/common/arg.cpp +844 -121
- package/src/llama.cpp/common/arg.h +9 -0
- package/src/llama.cpp/common/chat.cpp +129 -107
- package/src/llama.cpp/common/chat.h +2 -0
- package/src/llama.cpp/common/common.cpp +64 -518
- package/src/llama.cpp/common/common.h +35 -45
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
- package/src/llama.cpp/common/llguidance.cpp +31 -47
- package/src/llama.cpp/common/minja/chat-template.hpp +23 -11
- package/src/llama.cpp/common/minja/minja.hpp +186 -127
- package/src/llama.cpp/common/regex-partial.cpp +204 -0
- package/src/llama.cpp/common/regex-partial.h +56 -0
- package/src/llama.cpp/common/sampling.cpp +60 -50
- package/src/llama.cpp/docs/build.md +122 -7
- package/src/llama.cpp/examples/CMakeLists.txt +2 -32
- package/src/llama.cpp/examples/batched/batched.cpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +9 -12
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/parallel/parallel.cpp +89 -15
- package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
- package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
- package/src/llama.cpp/examples/sycl/build.sh +2 -2
- package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
- package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/training/finetune.cpp +96 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +35 -2
- package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
- package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
- package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
- package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
- package/src/llama.cpp/ggml/include/ggml.h +76 -106
- package/src/llama.cpp/ggml/src/CMakeLists.txt +11 -8
- package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
- package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +66 -33
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -194
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1060 -410
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1008 -13533
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +31 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +90 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +266 -72
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1034 -88
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8796 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +252 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
- package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +106 -14
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -262
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
- package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +307 -40
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +125 -45
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +10 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +239 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +9 -307
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +944 -438
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +507 -411
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
- package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +83 -49
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1278 -282
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +32 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +133 -30
- package/src/llama.cpp/ggml/src/ggml.c +170 -265
- package/src/llama.cpp/ggml/src/gguf.cpp +34 -33
- package/src/llama.cpp/include/llama.h +82 -22
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +5 -3
- package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
- package/src/llama.cpp/scripts/xxd.cmake +1 -1
- package/src/llama.cpp/src/CMakeLists.txt +4 -2
- package/src/llama.cpp/src/llama-adapter.cpp +43 -1
- package/src/llama.cpp/src/llama-arch.cpp +163 -17
- package/src/llama.cpp/src/llama-arch.h +16 -0
- package/src/llama.cpp/src/llama-batch.cpp +5 -1
- package/src/llama.cpp/src/llama-batch.h +2 -1
- package/src/llama.cpp/src/llama-chat.cpp +91 -16
- package/src/llama.cpp/src/llama-chat.h +7 -2
- package/src/llama.cpp/src/llama-context.cpp +479 -575
- package/src/llama.cpp/src/llama-context.h +44 -33
- package/src/llama.cpp/src/llama-cparams.h +1 -0
- package/src/llama.cpp/src/llama-graph.cpp +209 -157
- package/src/llama.cpp/src/llama-graph.h +38 -14
- package/src/llama.cpp/src/llama-hparams.h +13 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +1604 -543
- package/src/llama.cpp/src/llama-kv-cache.h +283 -171
- package/src/llama.cpp/src/llama-memory.h +12 -2
- package/src/llama.cpp/src/llama-mmap.cpp +1 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +34 -20
- package/src/llama.cpp/src/llama-model-loader.h +5 -3
- package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
- package/src/llama.cpp/src/llama-model-saver.h +37 -0
- package/src/llama.cpp/src/llama-model.cpp +1803 -330
- package/src/llama.cpp/src/llama-model.h +21 -2
- package/src/llama.cpp/src/llama-quant.cpp +33 -10
- package/src/llama.cpp/src/llama-sampling.cpp +25 -7
- package/src/llama.cpp/src/llama-vocab.cpp +86 -10
- package/src/llama.cpp/src/llama-vocab.h +6 -0
- package/src/llama.cpp/src/llama.cpp +15 -1
- package/src/llama.cpp/tests/CMakeLists.txt +52 -31
- package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
- package/src/llama.cpp/tests/test-backend-ops.cpp +189 -90
- package/src/llama.cpp/tests/test-chat-template.cpp +26 -6
- package/src/llama.cpp/tests/test-chat.cpp +15 -3
- package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
- package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
- package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
- package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
- package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
- package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
- package/src/llama.cpp/tests/test-opt.cpp +33 -21
- package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
- package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
- package/src/llama.cpp/tests/test-sampling.cpp +1 -1
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
- package/src/llama.cpp/tools/CMakeLists.txt +39 -0
- package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +3 -3
- package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +1 -1
- package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +15 -16
- package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
- package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +623 -274
- package/src/llama.cpp/{examples → tools}/main/main.cpp +22 -14
- package/src/llama.cpp/tools/mtmd/CMakeLists.txt +47 -0
- package/src/llama.cpp/tools/mtmd/clip-impl.h +365 -0
- package/src/llama.cpp/tools/mtmd/clip.cpp +3646 -0
- package/src/llama.cpp/tools/mtmd/clip.h +99 -0
- package/src/llama.cpp/tools/mtmd/deprecation-warning.cpp +22 -0
- package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +370 -0
- package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
- package/src/llama.cpp/tools/mtmd/mtmd.cpp +678 -0
- package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
- package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +21 -5
- package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +53 -3
- package/src/llama.cpp/tools/rpc/CMakeLists.txt +4 -0
- package/src/llama.cpp/tools/rpc/rpc-server.cpp +322 -0
- package/src/llama.cpp/tools/run/CMakeLists.txt +16 -0
- package/src/llama.cpp/{examples → tools}/run/run.cpp +30 -30
- package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
- package/src/llama.cpp/{examples → tools}/server/httplib.h +313 -247
- package/src/llama.cpp/{examples → tools}/server/server.cpp +529 -215
- package/src/llama.cpp/{examples → tools}/server/utils.hpp +427 -6
- package/src/llama.cpp/{examples → tools}/tts/tts.cpp +6 -9
- package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/infill/infill.cpp +0 -590
- package/src/llama.cpp/examples/llava/CMakeLists.txt +0 -66
- package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
- package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
- package/src/llama.cpp/examples/llava/clip.cpp +0 -3206
- package/src/llama.cpp/examples/llava/clip.h +0 -118
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
- package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
- package/src/llama.cpp/examples/llava/llava.cpp +0 -574
- package/src/llama.cpp/examples/llava/llava.h +0 -49
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
- package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +0 -584
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
- package/src/llama.cpp/examples/rpc/CMakeLists.txt +0 -2
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +0 -171
- package/src/llama.cpp/examples/run/CMakeLists.txt +0 -5
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
- /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
- /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0
|
@@ -1,12 +1,24 @@
|
|
|
1
|
+
#include "gguf.h" // for reading GGUF splits
|
|
1
2
|
#include "arg.h"
|
|
2
3
|
|
|
4
|
+
#include "common.h"
|
|
3
5
|
#include "log.h"
|
|
4
6
|
#include "sampling.h"
|
|
5
7
|
#include "chat.h"
|
|
6
8
|
|
|
9
|
+
// fix problem with std::min and std::max
|
|
10
|
+
#if defined(_WIN32)
|
|
11
|
+
#define WIN32_LEAN_AND_MEAN
|
|
12
|
+
#ifndef NOMINMAX
|
|
13
|
+
# define NOMINMAX
|
|
14
|
+
#endif
|
|
15
|
+
#include <windows.h>
|
|
16
|
+
#endif
|
|
17
|
+
|
|
7
18
|
#include <algorithm>
|
|
8
19
|
#include <climits>
|
|
9
20
|
#include <cstdarg>
|
|
21
|
+
#include <filesystem>
|
|
10
22
|
#include <fstream>
|
|
11
23
|
#include <regex>
|
|
12
24
|
#include <set>
|
|
@@ -14,10 +26,42 @@
|
|
|
14
26
|
#include <thread>
|
|
15
27
|
#include <vector>
|
|
16
28
|
|
|
29
|
+
//#define LLAMA_USE_CURL
|
|
30
|
+
|
|
31
|
+
#if defined(LLAMA_USE_CURL)
|
|
32
|
+
#include <curl/curl.h>
|
|
33
|
+
#include <curl/easy.h>
|
|
34
|
+
#include <future>
|
|
35
|
+
#endif
|
|
36
|
+
|
|
17
37
|
#include "json-schema-to-grammar.h"
|
|
18
38
|
|
|
19
39
|
using json = nlohmann::ordered_json;
|
|
20
40
|
|
|
41
|
+
std::initializer_list<enum llama_example> mmproj_examples = {
|
|
42
|
+
LLAMA_EXAMPLE_LLAVA,
|
|
43
|
+
LLAMA_EXAMPLE_SERVER,
|
|
44
|
+
};
|
|
45
|
+
|
|
46
|
+
static std::string read_file(const std::string & fname) {
|
|
47
|
+
std::ifstream file(fname);
|
|
48
|
+
if (!file) {
|
|
49
|
+
throw std::runtime_error(string_format("error: failed to open file '%s'\n", fname.c_str()));
|
|
50
|
+
}
|
|
51
|
+
std::string content((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
|
|
52
|
+
file.close();
|
|
53
|
+
return content;
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
static void write_file(const std::string & fname, const std::string & content) {
|
|
57
|
+
std::ofstream file(fname);
|
|
58
|
+
if (!file) {
|
|
59
|
+
throw std::runtime_error(string_format("error: failed to open file '%s'\n", fname.c_str()));
|
|
60
|
+
}
|
|
61
|
+
file << content;
|
|
62
|
+
file.close();
|
|
63
|
+
}
|
|
64
|
+
|
|
21
65
|
common_arg & common_arg::set_examples(std::initializer_list<enum llama_example> examples) {
|
|
22
66
|
this->examples = std::move(examples);
|
|
23
67
|
return *this;
|
|
@@ -126,47 +170,635 @@ std::string common_arg::to_string() {
|
|
|
126
170
|
}
|
|
127
171
|
|
|
128
172
|
//
|
|
129
|
-
//
|
|
173
|
+
// downloader
|
|
130
174
|
//
|
|
131
175
|
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
176
|
+
struct common_hf_file_res {
|
|
177
|
+
std::string repo; // repo name with ":tag" removed
|
|
178
|
+
std::string ggufFile;
|
|
179
|
+
std::string mmprojFile;
|
|
180
|
+
};
|
|
181
|
+
|
|
182
|
+
#ifdef LLAMA_USE_CURL
|
|
183
|
+
|
|
184
|
+
bool common_has_curl() {
|
|
185
|
+
return true;
|
|
186
|
+
}
|
|
187
|
+
|
|
188
|
+
#ifdef __linux__
|
|
189
|
+
#include <linux/limits.h>
|
|
190
|
+
#elif defined(_WIN32)
|
|
191
|
+
# if !defined(PATH_MAX)
|
|
192
|
+
# define PATH_MAX MAX_PATH
|
|
193
|
+
# endif
|
|
194
|
+
#elif defined(_AIX)
|
|
195
|
+
#include <sys/limits.h>
|
|
196
|
+
#else
|
|
197
|
+
#include <sys/syslimits.h>
|
|
198
|
+
#endif
|
|
199
|
+
#define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
|
|
200
|
+
|
|
201
|
+
//
|
|
202
|
+
// CURL utils
|
|
203
|
+
//
|
|
204
|
+
|
|
205
|
+
using curl_ptr = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;
|
|
206
|
+
|
|
207
|
+
// cannot use unique_ptr for curl_slist, because we cannot update without destroying the old one
|
|
208
|
+
struct curl_slist_ptr {
|
|
209
|
+
struct curl_slist * ptr = nullptr;
|
|
210
|
+
~curl_slist_ptr() {
|
|
211
|
+
if (ptr) {
|
|
212
|
+
curl_slist_free_all(ptr);
|
|
213
|
+
}
|
|
214
|
+
}
|
|
215
|
+
};
|
|
216
|
+
|
|
217
|
+
#define CURL_MAX_RETRY 3
|
|
218
|
+
#define CURL_RETRY_DELAY_SECONDS 2
|
|
219
|
+
|
|
220
|
+
static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds, const char * method_name) {
|
|
221
|
+
int remaining_attempts = max_attempts;
|
|
222
|
+
|
|
223
|
+
while (remaining_attempts > 0) {
|
|
224
|
+
LOG_INF("%s: %s %s (attempt %d of %d)...\n", __func__ , method_name, url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);
|
|
225
|
+
|
|
226
|
+
CURLcode res = curl_easy_perform(curl);
|
|
227
|
+
if (res == CURLE_OK) {
|
|
228
|
+
return true;
|
|
229
|
+
}
|
|
230
|
+
|
|
231
|
+
int exponential_backoff_delay = std::pow(retry_delay_seconds, max_attempts - remaining_attempts) * 1000;
|
|
232
|
+
LOG_WRN("%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay);
|
|
233
|
+
|
|
234
|
+
remaining_attempts--;
|
|
235
|
+
if (remaining_attempts == 0) break;
|
|
236
|
+
std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
|
|
237
|
+
}
|
|
238
|
+
|
|
239
|
+
LOG_ERR("%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts);
|
|
240
|
+
|
|
241
|
+
return false;
|
|
242
|
+
}
|
|
243
|
+
|
|
244
|
+
// download one single file from remote URL to local path
|
|
245
|
+
static bool common_download_file_single(const std::string & url, const std::string & path, const std::string & bearer_token) {
|
|
246
|
+
// Initialize libcurl
|
|
247
|
+
curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
|
|
248
|
+
curl_slist_ptr http_headers;
|
|
249
|
+
if (!curl) {
|
|
250
|
+
LOG_ERR("%s: error initializing libcurl\n", __func__);
|
|
251
|
+
return false;
|
|
252
|
+
}
|
|
253
|
+
|
|
254
|
+
// Set the URL, allow to follow http redirection
|
|
255
|
+
curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
|
|
256
|
+
curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
|
|
257
|
+
|
|
258
|
+
http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
|
|
259
|
+
// Check if hf-token or bearer-token was specified
|
|
260
|
+
if (!bearer_token.empty()) {
|
|
261
|
+
std::string auth_header = "Authorization: Bearer " + bearer_token;
|
|
262
|
+
http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
|
|
263
|
+
}
|
|
264
|
+
curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
|
|
265
|
+
|
|
266
|
+
#if defined(_WIN32)
|
|
267
|
+
// CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
|
|
268
|
+
// operating system. Currently implemented under MS-Windows.
|
|
269
|
+
curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
|
|
270
|
+
#endif
|
|
271
|
+
|
|
272
|
+
// Check if the file already exists locally
|
|
273
|
+
auto file_exists = std::filesystem::exists(path);
|
|
274
|
+
|
|
275
|
+
// If the file exists, check its JSON metadata companion file.
|
|
276
|
+
std::string metadata_path = path + ".json";
|
|
277
|
+
nlohmann::json metadata; // TODO @ngxson : get rid of this json, use regex instead
|
|
278
|
+
std::string etag;
|
|
279
|
+
std::string last_modified;
|
|
280
|
+
|
|
281
|
+
if (file_exists) {
|
|
282
|
+
// Try and read the JSON metadata file (note: stream autoclosed upon exiting this block).
|
|
283
|
+
std::ifstream metadata_in(metadata_path);
|
|
284
|
+
if (metadata_in.good()) {
|
|
285
|
+
try {
|
|
286
|
+
metadata_in >> metadata;
|
|
287
|
+
LOG_DBG("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
|
|
288
|
+
if (metadata.contains("etag") && metadata.at("etag").is_string()) {
|
|
289
|
+
etag = metadata.at("etag");
|
|
146
290
|
}
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
291
|
+
if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) {
|
|
292
|
+
last_modified = metadata.at("lastModified");
|
|
293
|
+
}
|
|
294
|
+
} catch (const nlohmann::json::exception & e) {
|
|
295
|
+
LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
|
|
151
296
|
}
|
|
152
297
|
}
|
|
153
|
-
//
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
298
|
+
// if we cannot open the metadata file, we assume that the downloaded file is not valid (etag and last-modified are left empty, so we will download it again)
|
|
299
|
+
} else {
|
|
300
|
+
LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
|
|
301
|
+
}
|
|
302
|
+
|
|
303
|
+
// Send a HEAD request to retrieve the etag and last-modified headers
|
|
304
|
+
struct common_load_model_from_url_headers {
|
|
305
|
+
std::string etag;
|
|
306
|
+
std::string last_modified;
|
|
307
|
+
};
|
|
308
|
+
|
|
309
|
+
common_load_model_from_url_headers headers;
|
|
310
|
+
bool head_request_ok = false;
|
|
311
|
+
bool should_download = !file_exists; // by default, we should download if the file does not exist
|
|
312
|
+
|
|
313
|
+
// get ETag to see if the remote file has changed
|
|
314
|
+
{
|
|
315
|
+
typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
|
|
316
|
+
auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
|
|
317
|
+
common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
|
|
318
|
+
|
|
319
|
+
static std::regex header_regex("([^:]+): (.*)\r\n");
|
|
320
|
+
static std::regex etag_regex("ETag", std::regex_constants::icase);
|
|
321
|
+
static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase);
|
|
322
|
+
|
|
323
|
+
std::string header(buffer, n_items);
|
|
324
|
+
std::smatch match;
|
|
325
|
+
if (std::regex_match(header, match, header_regex)) {
|
|
326
|
+
const std::string & key = match[1];
|
|
327
|
+
const std::string & value = match[2];
|
|
328
|
+
if (std::regex_match(key, match, etag_regex)) {
|
|
329
|
+
headers->etag = value;
|
|
330
|
+
} else if (std::regex_match(key, match, last_modified_regex)) {
|
|
331
|
+
headers->last_modified = value;
|
|
332
|
+
}
|
|
333
|
+
}
|
|
334
|
+
return n_items;
|
|
335
|
+
};
|
|
336
|
+
|
|
337
|
+
curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
|
|
338
|
+
curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); // hide head request progress
|
|
339
|
+
curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
|
|
340
|
+
curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
|
|
341
|
+
|
|
342
|
+
// we only allow retrying once for HEAD requests
|
|
343
|
+
// this is for the use case of using running offline (no internet), retrying can be annoying
|
|
344
|
+
bool was_perform_successful = curl_perform_with_retry(url, curl.get(), 1, 0, "HEAD");
|
|
345
|
+
if (!was_perform_successful) {
|
|
346
|
+
head_request_ok = false;
|
|
160
347
|
}
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
348
|
+
|
|
349
|
+
long http_code = 0;
|
|
350
|
+
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
|
|
351
|
+
if (http_code == 200) {
|
|
352
|
+
head_request_ok = true;
|
|
353
|
+
} else {
|
|
354
|
+
LOG_WRN("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
|
|
355
|
+
head_request_ok = false;
|
|
356
|
+
}
|
|
357
|
+
}
|
|
358
|
+
|
|
359
|
+
// if head_request_ok is false, we don't have the etag or last-modified headers
|
|
360
|
+
// we leave should_download as-is, which is true if the file does not exist
|
|
361
|
+
if (head_request_ok) {
|
|
362
|
+
// check if ETag or Last-Modified headers are different
|
|
363
|
+
// if it is, we need to download the file again
|
|
364
|
+
if (!etag.empty() && etag != headers.etag) {
|
|
365
|
+
LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str());
|
|
366
|
+
should_download = true;
|
|
367
|
+
} else if (!last_modified.empty() && last_modified != headers.last_modified) {
|
|
368
|
+
LOG_WRN("%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str());
|
|
369
|
+
should_download = true;
|
|
166
370
|
}
|
|
167
|
-
} else if (model.empty()) {
|
|
168
|
-
model = model_default;
|
|
169
371
|
}
|
|
372
|
+
|
|
373
|
+
if (should_download) {
|
|
374
|
+
std::string path_temporary = path + ".downloadInProgress";
|
|
375
|
+
if (file_exists) {
|
|
376
|
+
LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
|
|
377
|
+
if (remove(path.c_str()) != 0) {
|
|
378
|
+
LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
|
|
379
|
+
return false;
|
|
380
|
+
}
|
|
381
|
+
}
|
|
382
|
+
|
|
383
|
+
// Set the output file
|
|
384
|
+
|
|
385
|
+
struct FILE_deleter {
|
|
386
|
+
void operator()(FILE * f) const {
|
|
387
|
+
fclose(f);
|
|
388
|
+
}
|
|
389
|
+
};
|
|
390
|
+
|
|
391
|
+
std::unique_ptr<FILE, FILE_deleter> outfile(fopen(path_temporary.c_str(), "wb"));
|
|
392
|
+
if (!outfile) {
|
|
393
|
+
LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path.c_str());
|
|
394
|
+
return false;
|
|
395
|
+
}
|
|
396
|
+
|
|
397
|
+
typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * data, size_t size, size_t nmemb, void * fd);
|
|
398
|
+
auto write_callback = [](void * data, size_t size, size_t nmemb, void * fd) -> size_t {
|
|
399
|
+
return fwrite(data, size, nmemb, (FILE *)fd);
|
|
400
|
+
};
|
|
401
|
+
curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 0L);
|
|
402
|
+
curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
|
|
403
|
+
curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, outfile.get());
|
|
404
|
+
|
|
405
|
+
// display download progress
|
|
406
|
+
curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 0L);
|
|
407
|
+
|
|
408
|
+
// helper function to hide password in URL
|
|
409
|
+
auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string {
|
|
410
|
+
std::size_t protocol_pos = url.find("://");
|
|
411
|
+
if (protocol_pos == std::string::npos) {
|
|
412
|
+
return url; // Malformed URL
|
|
413
|
+
}
|
|
414
|
+
|
|
415
|
+
std::size_t at_pos = url.find('@', protocol_pos + 3);
|
|
416
|
+
if (at_pos == std::string::npos) {
|
|
417
|
+
return url; // No password in URL
|
|
418
|
+
}
|
|
419
|
+
|
|
420
|
+
return url.substr(0, protocol_pos + 3) + "********" + url.substr(at_pos);
|
|
421
|
+
};
|
|
422
|
+
|
|
423
|
+
// start the download
|
|
424
|
+
LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
|
|
425
|
+
llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
|
|
426
|
+
bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS, "GET");
|
|
427
|
+
if (!was_perform_successful) {
|
|
428
|
+
return false;
|
|
429
|
+
}
|
|
430
|
+
|
|
431
|
+
long http_code = 0;
|
|
432
|
+
curl_easy_getinfo (curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
|
|
433
|
+
if (http_code < 200 || http_code >= 400) {
|
|
434
|
+
LOG_ERR("%s: invalid http status code received: %ld\n", __func__, http_code);
|
|
435
|
+
return false;
|
|
436
|
+
}
|
|
437
|
+
|
|
438
|
+
// Causes file to be closed explicitly here before we rename it.
|
|
439
|
+
outfile.reset();
|
|
440
|
+
|
|
441
|
+
// Write the updated JSON metadata file.
|
|
442
|
+
metadata.update({
|
|
443
|
+
{"url", url},
|
|
444
|
+
{"etag", headers.etag},
|
|
445
|
+
{"lastModified", headers.last_modified}
|
|
446
|
+
});
|
|
447
|
+
write_file(metadata_path, metadata.dump(4));
|
|
448
|
+
LOG_DBG("%s: file metadata saved: %s\n", __func__, metadata_path.c_str());
|
|
449
|
+
|
|
450
|
+
if (rename(path_temporary.c_str(), path.c_str()) != 0) {
|
|
451
|
+
LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
|
|
452
|
+
return false;
|
|
453
|
+
}
|
|
454
|
+
} else {
|
|
455
|
+
LOG_INF("%s: using cached file: %s\n", __func__, path.c_str());
|
|
456
|
+
}
|
|
457
|
+
|
|
458
|
+
return true;
|
|
459
|
+
}
|
|
460
|
+
|
|
461
|
+
// download multiple files from remote URLs to local paths
|
|
462
|
+
// the input is a vector of pairs <url, path>
|
|
463
|
+
static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls, const std::string & bearer_token) {
|
|
464
|
+
// Prepare download in parallel
|
|
465
|
+
std::vector<std::future<bool>> futures_download;
|
|
466
|
+
for (auto const & item : urls) {
|
|
467
|
+
futures_download.push_back(std::async(std::launch::async, [bearer_token](const std::pair<std::string, std::string> & it) -> bool {
|
|
468
|
+
return common_download_file_single(it.first, it.second, bearer_token);
|
|
469
|
+
}, item));
|
|
470
|
+
}
|
|
471
|
+
|
|
472
|
+
// Wait for all downloads to complete
|
|
473
|
+
for (auto & f : futures_download) {
|
|
474
|
+
if (!f.get()) {
|
|
475
|
+
return false;
|
|
476
|
+
}
|
|
477
|
+
}
|
|
478
|
+
|
|
479
|
+
return true;
|
|
480
|
+
}
|
|
481
|
+
|
|
482
|
+
static bool common_download_model(
|
|
483
|
+
const common_params_model & model,
|
|
484
|
+
const std::string & bearer_token) {
|
|
485
|
+
// Basic validation of the model.url
|
|
486
|
+
if (model.url.empty()) {
|
|
487
|
+
LOG_ERR("%s: invalid model url\n", __func__);
|
|
488
|
+
return false;
|
|
489
|
+
}
|
|
490
|
+
|
|
491
|
+
if (!common_download_file_single(model.url, model.path, bearer_token)) {
|
|
492
|
+
return false;
|
|
493
|
+
}
|
|
494
|
+
|
|
495
|
+
// check for additional GGUFs split to download
|
|
496
|
+
int n_split = 0;
|
|
497
|
+
{
|
|
498
|
+
struct gguf_init_params gguf_params = {
|
|
499
|
+
/*.no_alloc = */ true,
|
|
500
|
+
/*.ctx = */ NULL,
|
|
501
|
+
};
|
|
502
|
+
auto * ctx_gguf = gguf_init_from_file(model.path.c_str(), gguf_params);
|
|
503
|
+
if (!ctx_gguf) {
|
|
504
|
+
LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, model.path.c_str());
|
|
505
|
+
return false;
|
|
506
|
+
}
|
|
507
|
+
|
|
508
|
+
auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT);
|
|
509
|
+
if (key_n_split >= 0) {
|
|
510
|
+
n_split = gguf_get_val_u16(ctx_gguf, key_n_split);
|
|
511
|
+
}
|
|
512
|
+
|
|
513
|
+
gguf_free(ctx_gguf);
|
|
514
|
+
}
|
|
515
|
+
|
|
516
|
+
if (n_split > 1) {
|
|
517
|
+
char split_prefix[PATH_MAX] = {0};
|
|
518
|
+
char split_url_prefix[LLAMA_CURL_MAX_URL_LENGTH] = {0};
|
|
519
|
+
|
|
520
|
+
// Verify the first split file format
|
|
521
|
+
// and extract split URL and PATH prefixes
|
|
522
|
+
{
|
|
523
|
+
if (!llama_split_prefix(split_prefix, sizeof(split_prefix), model.path.c_str(), 0, n_split)) {
|
|
524
|
+
LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, model.path.c_str(), n_split);
|
|
525
|
+
return false;
|
|
526
|
+
}
|
|
527
|
+
|
|
528
|
+
if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model.url.c_str(), 0, n_split)) {
|
|
529
|
+
LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model.url.c_str(), n_split);
|
|
530
|
+
return false;
|
|
531
|
+
}
|
|
532
|
+
}
|
|
533
|
+
|
|
534
|
+
std::vector<std::pair<std::string, std::string>> urls;
|
|
535
|
+
for (int idx = 1; idx < n_split; idx++) {
|
|
536
|
+
char split_path[PATH_MAX] = {0};
|
|
537
|
+
llama_split_path(split_path, sizeof(split_path), split_prefix, idx, n_split);
|
|
538
|
+
|
|
539
|
+
char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
|
|
540
|
+
llama_split_path(split_url, sizeof(split_url), split_url_prefix, idx, n_split);
|
|
541
|
+
|
|
542
|
+
if (std::string(split_path) == model.path) {
|
|
543
|
+
continue; // skip the already downloaded file
|
|
544
|
+
}
|
|
545
|
+
|
|
546
|
+
urls.push_back({split_url, split_path});
|
|
547
|
+
}
|
|
548
|
+
|
|
549
|
+
// Download in parallel
|
|
550
|
+
common_download_file_multiple(urls, bearer_token);
|
|
551
|
+
}
|
|
552
|
+
|
|
553
|
+
return true;
|
|
554
|
+
}
|
|
555
|
+
|
|
556
|
+
std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params) {
|
|
557
|
+
curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
|
|
558
|
+
curl_slist_ptr http_headers;
|
|
559
|
+
std::vector<char> res_buffer;
|
|
560
|
+
|
|
561
|
+
curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
|
|
562
|
+
curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
|
|
563
|
+
curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
|
|
564
|
+
typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
|
|
565
|
+
auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t {
|
|
566
|
+
auto data_vec = static_cast<std::vector<char> *>(data);
|
|
567
|
+
data_vec->insert(data_vec->end(), (char *)ptr, (char *)ptr + size * nmemb);
|
|
568
|
+
return size * nmemb;
|
|
569
|
+
};
|
|
570
|
+
curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
|
|
571
|
+
curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_buffer);
|
|
572
|
+
#if defined(_WIN32)
|
|
573
|
+
curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
|
|
574
|
+
#endif
|
|
575
|
+
if (params.timeout > 0) {
|
|
576
|
+
curl_easy_setopt(curl.get(), CURLOPT_TIMEOUT, params.timeout);
|
|
577
|
+
}
|
|
578
|
+
if (params.max_size > 0) {
|
|
579
|
+
curl_easy_setopt(curl.get(), CURLOPT_MAXFILESIZE, params.max_size);
|
|
580
|
+
}
|
|
581
|
+
http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
|
|
582
|
+
for (const auto & header : params.headers) {
|
|
583
|
+
http_headers.ptr = curl_slist_append(http_headers.ptr, header.c_str());
|
|
584
|
+
}
|
|
585
|
+
curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
|
|
586
|
+
|
|
587
|
+
CURLcode res = curl_easy_perform(curl.get());
|
|
588
|
+
|
|
589
|
+
if (res != CURLE_OK) {
|
|
590
|
+
std::string error_msg = curl_easy_strerror(res);
|
|
591
|
+
throw std::runtime_error("error: cannot make GET request: " + error_msg);
|
|
592
|
+
}
|
|
593
|
+
|
|
594
|
+
long res_code;
|
|
595
|
+
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code);
|
|
596
|
+
|
|
597
|
+
return { res_code, std::move(res_buffer) };
|
|
598
|
+
}
|
|
599
|
+
|
|
600
|
+
/**
|
|
601
|
+
* Allow getting the HF file from the HF repo with tag (like ollama), for example:
|
|
602
|
+
* - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
|
|
603
|
+
* - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M
|
|
604
|
+
* - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s
|
|
605
|
+
* Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo)
|
|
606
|
+
*
|
|
607
|
+
* Return pair of <repo, file> (with "repo" already having tag removed)
|
|
608
|
+
*
|
|
609
|
+
* Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
|
|
610
|
+
*/
|
|
611
|
+
static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & bearer_token) {
|
|
612
|
+
auto parts = string_split<std::string>(hf_repo_with_tag, ':');
|
|
613
|
+
std::string tag = parts.size() > 1 ? parts.back() : "latest";
|
|
614
|
+
std::string hf_repo = parts[0];
|
|
615
|
+
if (string_split<std::string>(hf_repo, '/').size() != 2) {
|
|
616
|
+
throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n");
|
|
617
|
+
}
|
|
618
|
+
|
|
619
|
+
std::string url = get_model_endpoint() + "v2/" + hf_repo + "/manifests/" + tag;
|
|
620
|
+
|
|
621
|
+
// headers
|
|
622
|
+
std::vector<std::string> headers;
|
|
623
|
+
headers.push_back("Accept: application/json");
|
|
624
|
+
if (!bearer_token.empty()) {
|
|
625
|
+
headers.push_back("Authorization: Bearer " + bearer_token);
|
|
626
|
+
}
|
|
627
|
+
// Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
|
|
628
|
+
// User-Agent header is already set in common_remote_get_content, no need to set it here
|
|
629
|
+
|
|
630
|
+
// we use "=" to avoid clashing with other component, while still being allowed on windows
|
|
631
|
+
std::string cached_response_fname = "manifest=" + hf_repo + "=" + tag + ".json";
|
|
632
|
+
string_replace_all(cached_response_fname, "/", "_");
|
|
633
|
+
std::string cached_response_path = fs_get_cache_file(cached_response_fname);
|
|
634
|
+
|
|
635
|
+
// make the request
|
|
636
|
+
common_remote_params params;
|
|
637
|
+
params.headers = headers;
|
|
638
|
+
long res_code = 0;
|
|
639
|
+
std::string res_str;
|
|
640
|
+
bool use_cache = false;
|
|
641
|
+
try {
|
|
642
|
+
auto res = common_remote_get_content(url, params);
|
|
643
|
+
res_code = res.first;
|
|
644
|
+
res_str = std::string(res.second.data(), res.second.size());
|
|
645
|
+
} catch (const std::exception & e) {
|
|
646
|
+
LOG_WRN("error: failed to get manifest: %s\n", e.what());
|
|
647
|
+
LOG_WRN("try reading from cache\n");
|
|
648
|
+
// try to read from cache
|
|
649
|
+
try {
|
|
650
|
+
res_str = read_file(cached_response_path);
|
|
651
|
+
res_code = 200;
|
|
652
|
+
use_cache = true;
|
|
653
|
+
} catch (const std::exception & e) {
|
|
654
|
+
throw std::runtime_error("error: failed to get manifest (check your internet connection)");
|
|
655
|
+
}
|
|
656
|
+
}
|
|
657
|
+
std::string ggufFile;
|
|
658
|
+
std::string mmprojFile;
|
|
659
|
+
|
|
660
|
+
if (res_code == 200 || res_code == 304) {
|
|
661
|
+
// extract ggufFile.rfilename in json, using regex
|
|
662
|
+
{
|
|
663
|
+
std::regex pattern("\"ggufFile\"[\\s\\S]*?\"rfilename\"\\s*:\\s*\"([^\"]+)\"");
|
|
664
|
+
std::smatch match;
|
|
665
|
+
if (std::regex_search(res_str, match, pattern)) {
|
|
666
|
+
ggufFile = match[1].str();
|
|
667
|
+
}
|
|
668
|
+
}
|
|
669
|
+
// extract mmprojFile.rfilename in json, using regex
|
|
670
|
+
{
|
|
671
|
+
std::regex pattern("\"mmprojFile\"[\\s\\S]*?\"rfilename\"\\s*:\\s*\"([^\"]+)\"");
|
|
672
|
+
std::smatch match;
|
|
673
|
+
if (std::regex_search(res_str, match, pattern)) {
|
|
674
|
+
mmprojFile = match[1].str();
|
|
675
|
+
}
|
|
676
|
+
}
|
|
677
|
+
if (!use_cache) {
|
|
678
|
+
// if not using cached response, update the cache file
|
|
679
|
+
write_file(cached_response_path, res_str);
|
|
680
|
+
}
|
|
681
|
+
} else if (res_code == 401) {
|
|
682
|
+
throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token");
|
|
683
|
+
} else {
|
|
684
|
+
throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str()));
|
|
685
|
+
}
|
|
686
|
+
|
|
687
|
+
// check response
|
|
688
|
+
if (ggufFile.empty()) {
|
|
689
|
+
throw std::runtime_error("error: model does not have ggufFile");
|
|
690
|
+
}
|
|
691
|
+
|
|
692
|
+
return { hf_repo, ggufFile, mmprojFile };
|
|
693
|
+
}
|
|
694
|
+
|
|
695
|
+
#else
|
|
696
|
+
|
|
697
|
+
bool common_has_curl() {
|
|
698
|
+
return false;
|
|
699
|
+
}
|
|
700
|
+
|
|
701
|
+
static bool common_download_file_single(const std::string &, const std::string &, const std::string &) {
|
|
702
|
+
LOG_ERR("error: built without CURL, cannot download model from internet\n");
|
|
703
|
+
return false;
|
|
704
|
+
}
|
|
705
|
+
|
|
706
|
+
static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> &, const std::string &) {
|
|
707
|
+
LOG_ERR("error: built without CURL, cannot download model from the internet\n");
|
|
708
|
+
return false;
|
|
709
|
+
}
|
|
710
|
+
|
|
711
|
+
static bool common_download_model(
|
|
712
|
+
const common_params_model &,
|
|
713
|
+
const std::string &) {
|
|
714
|
+
LOG_ERR("error: built without CURL, cannot download model from the internet\n");
|
|
715
|
+
return false;
|
|
716
|
+
}
|
|
717
|
+
|
|
718
|
+
static struct common_hf_file_res common_get_hf_file(const std::string &, const std::string &) {
|
|
719
|
+
LOG_ERR("error: built without CURL, cannot download model from the internet\n");
|
|
720
|
+
return {};
|
|
721
|
+
}
|
|
722
|
+
|
|
723
|
+
std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params &) {
|
|
724
|
+
if (!url.empty()) {
|
|
725
|
+
throw std::runtime_error("error: built without CURL, cannot download model from the internet");
|
|
726
|
+
}
|
|
727
|
+
|
|
728
|
+
return {};
|
|
729
|
+
}
|
|
730
|
+
|
|
731
|
+
#endif // LLAMA_USE_CURL
|
|
732
|
+
|
|
733
|
+
//
|
|
734
|
+
// utils
|
|
735
|
+
//
|
|
736
|
+
|
|
737
|
+
struct handle_model_result {
|
|
738
|
+
bool found_mmproj = false;
|
|
739
|
+
common_params_model mmproj;
|
|
740
|
+
};
|
|
741
|
+
|
|
742
|
+
static handle_model_result common_params_handle_model(
|
|
743
|
+
struct common_params_model & model,
|
|
744
|
+
const std::string & bearer_token,
|
|
745
|
+
const std::string & model_path_default) {
|
|
746
|
+
handle_model_result result;
|
|
747
|
+
// handle pre-fill default model path and url based on hf_repo and hf_file
|
|
748
|
+
{
|
|
749
|
+
if (!model.hf_repo.empty()) {
|
|
750
|
+
// short-hand to avoid specifying --hf-file -> default it to --model
|
|
751
|
+
if (model.hf_file.empty()) {
|
|
752
|
+
if (model.path.empty()) {
|
|
753
|
+
auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token);
|
|
754
|
+
if (auto_detected.repo.empty() || auto_detected.ggufFile.empty()) {
|
|
755
|
+
exit(1); // built without CURL, error message already printed
|
|
756
|
+
}
|
|
757
|
+
model.hf_repo = auto_detected.repo;
|
|
758
|
+
model.hf_file = auto_detected.ggufFile;
|
|
759
|
+
if (!auto_detected.mmprojFile.empty()) {
|
|
760
|
+
result.found_mmproj = true;
|
|
761
|
+
result.mmproj.hf_repo = model.hf_repo;
|
|
762
|
+
result.mmproj.hf_file = auto_detected.mmprojFile;
|
|
763
|
+
}
|
|
764
|
+
} else {
|
|
765
|
+
model.hf_file = model.path;
|
|
766
|
+
}
|
|
767
|
+
}
|
|
768
|
+
|
|
769
|
+
std::string model_endpoint = get_model_endpoint();
|
|
770
|
+
model.url = model_endpoint + model.hf_repo + "/resolve/main/" + model.hf_file;
|
|
771
|
+
// make sure model path is present (for caching purposes)
|
|
772
|
+
if (model.path.empty()) {
|
|
773
|
+
// this is to avoid different repo having same file name, or same file name in different subdirs
|
|
774
|
+
std::string filename = model.hf_repo + "_" + model.hf_file;
|
|
775
|
+
// to make sure we don't have any slashes in the filename
|
|
776
|
+
string_replace_all(filename, "/", "_");
|
|
777
|
+
model.path = fs_get_cache_file(filename);
|
|
778
|
+
}
|
|
779
|
+
|
|
780
|
+
} else if (!model.url.empty()) {
|
|
781
|
+
if (model.path.empty()) {
|
|
782
|
+
auto f = string_split<std::string>(model.url, '#').front();
|
|
783
|
+
f = string_split<std::string>(f, '?').front();
|
|
784
|
+
model.path = fs_get_cache_file(string_split<std::string>(f, '/').back());
|
|
785
|
+
}
|
|
786
|
+
|
|
787
|
+
} else if (model.path.empty()) {
|
|
788
|
+
model.path = model_path_default;
|
|
789
|
+
}
|
|
790
|
+
}
|
|
791
|
+
|
|
792
|
+
// then, download it if needed
|
|
793
|
+
if (!model.url.empty()) {
|
|
794
|
+
bool ok = common_download_model(model, bearer_token);
|
|
795
|
+
if (!ok) {
|
|
796
|
+
LOG_ERR("error: failed to download model from %s\n", model.url.c_str());
|
|
797
|
+
exit(1);
|
|
798
|
+
}
|
|
799
|
+
}
|
|
800
|
+
|
|
801
|
+
return result;
|
|
170
802
|
}
|
|
171
803
|
|
|
172
804
|
const std::vector<ggml_type> kv_cache_types = {
|
|
@@ -300,10 +932,25 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
|
|
|
300
932
|
throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
|
|
301
933
|
}
|
|
302
934
|
|
|
303
|
-
//
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
935
|
+
// handle model and download
|
|
936
|
+
{
|
|
937
|
+
auto res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH);
|
|
938
|
+
if (params.no_mmproj) {
|
|
939
|
+
params.mmproj = {};
|
|
940
|
+
} else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
|
|
941
|
+
// optionally, handle mmproj model when -hf is specified
|
|
942
|
+
params.mmproj = res.mmproj;
|
|
943
|
+
}
|
|
944
|
+
// only download mmproj if the current example is using it
|
|
945
|
+
for (auto & ex : mmproj_examples) {
|
|
946
|
+
if (ctx_arg.ex == ex) {
|
|
947
|
+
common_params_handle_model(params.mmproj, params.hf_token, "");
|
|
948
|
+
break;
|
|
949
|
+
}
|
|
950
|
+
}
|
|
951
|
+
common_params_handle_model(params.speculative.model, params.hf_token, "");
|
|
952
|
+
common_params_handle_model(params.vocoder.model, params.hf_token, "");
|
|
953
|
+
}
|
|
307
954
|
|
|
308
955
|
if (params.escape) {
|
|
309
956
|
string_process_escapes(params.prompt);
|
|
@@ -322,6 +969,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
|
|
|
322
969
|
params.kv_overrides.back().key[0] = 0;
|
|
323
970
|
}
|
|
324
971
|
|
|
972
|
+
if (!params.tensor_buft_overrides.empty()) {
|
|
973
|
+
params.tensor_buft_overrides.push_back({nullptr, nullptr});
|
|
974
|
+
}
|
|
975
|
+
|
|
325
976
|
if (params.reranking && params.embedding) {
|
|
326
977
|
throw std::invalid_argument("error: either --embedding or --reranking can be specified, but not both");
|
|
327
978
|
}
|
|
@@ -431,7 +1082,6 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
|
|
|
431
1082
|
"llama-embedding",
|
|
432
1083
|
"llama-eval-callback",
|
|
433
1084
|
"llama-export-lora",
|
|
434
|
-
"llama-gbnf-validator",
|
|
435
1085
|
"llama-gen-docs",
|
|
436
1086
|
"llama-gguf",
|
|
437
1087
|
"llama-gguf-hash",
|
|
@@ -439,20 +1089,18 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
|
|
|
439
1089
|
"llama-gritlm",
|
|
440
1090
|
"llama-imatrix",
|
|
441
1091
|
"llama-infill",
|
|
442
|
-
"llama-
|
|
1092
|
+
"llama-mtmd-cli",
|
|
443
1093
|
"llama-llava-clip-quantize-cli",
|
|
444
1094
|
"llama-lookahead",
|
|
445
1095
|
"llama-lookup",
|
|
446
1096
|
"llama-lookup-create",
|
|
447
1097
|
"llama-lookup-merge",
|
|
448
1098
|
"llama-lookup-stats",
|
|
449
|
-
"llama-minicpmv-cli",
|
|
450
1099
|
"llama-parallel",
|
|
451
1100
|
"llama-passkey",
|
|
452
1101
|
"llama-perplexity",
|
|
453
1102
|
"llama-q8dot",
|
|
454
1103
|
"llama-quantize",
|
|
455
|
-
"llama-quantize-stats",
|
|
456
1104
|
"llama-qwen2vl-cli",
|
|
457
1105
|
"llama-retrieval",
|
|
458
1106
|
"llama-run",
|
|
@@ -541,6 +1189,9 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e
|
|
|
541
1189
|
fprintf(stderr, "%s\n", ex.what());
|
|
542
1190
|
ctx_arg.params = params_org;
|
|
543
1191
|
return false;
|
|
1192
|
+
} catch (std::exception & ex) {
|
|
1193
|
+
fprintf(stderr, "%s\n", ex.what());
|
|
1194
|
+
exit(1); // for other exceptions, we exit with status code 1
|
|
544
1195
|
}
|
|
545
1196
|
|
|
546
1197
|
return true;
|
|
@@ -632,7 +1283,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
632
1283
|
[](common_params & params) {
|
|
633
1284
|
params.use_color = true;
|
|
634
1285
|
}
|
|
635
|
-
).set_examples({LLAMA_EXAMPLE_MAIN,
|
|
1286
|
+
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
|
|
636
1287
|
add_opt(common_arg(
|
|
637
1288
|
{"-t", "--threads"}, "N",
|
|
638
1289
|
string_format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads),
|
|
@@ -765,7 +1416,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
765
1416
|
add_opt(common_arg(
|
|
766
1417
|
{"-n", "--predict", "--n-predict"}, "N",
|
|
767
1418
|
string_format(
|
|
768
|
-
ex == LLAMA_EXAMPLE_MAIN
|
|
1419
|
+
ex == LLAMA_EXAMPLE_MAIN
|
|
769
1420
|
? "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)"
|
|
770
1421
|
: "number of tokens to predict (default: %d, -1 = infinity)",
|
|
771
1422
|
params.n_predict),
|
|
@@ -841,13 +1492,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
841
1492
|
{"-f", "--file"}, "FNAME",
|
|
842
1493
|
"a file containing the prompt (default: none)",
|
|
843
1494
|
[](common_params & params, const std::string & value) {
|
|
844
|
-
|
|
845
|
-
if (!file) {
|
|
846
|
-
throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
|
|
847
|
-
}
|
|
1495
|
+
params.prompt = read_file(value);
|
|
848
1496
|
// store the external file name in params
|
|
849
1497
|
params.prompt_file = value;
|
|
850
|
-
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
|
|
851
1498
|
if (!params.prompt.empty() && params.prompt.back() == '\n') {
|
|
852
1499
|
params.prompt.pop_back();
|
|
853
1500
|
}
|
|
@@ -857,11 +1504,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
857
1504
|
{"-sysf", "--system-prompt-file"}, "FNAME",
|
|
858
1505
|
"a file containing the system prompt (default: none)",
|
|
859
1506
|
[](common_params & params, const std::string & value) {
|
|
860
|
-
|
|
861
|
-
if (!file) {
|
|
862
|
-
throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
|
|
863
|
-
}
|
|
864
|
-
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.system_prompt));
|
|
1507
|
+
params.system_prompt = read_file(value);
|
|
865
1508
|
if (!params.system_prompt.empty() && params.system_prompt.back() == '\n') {
|
|
866
1509
|
params.system_prompt.pop_back();
|
|
867
1510
|
}
|
|
@@ -1012,7 +1655,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
1012
1655
|
params.input_prefix = value;
|
|
1013
1656
|
params.enable_chat_template = false;
|
|
1014
1657
|
}
|
|
1015
|
-
).set_examples({LLAMA_EXAMPLE_MAIN
|
|
1658
|
+
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
|
1016
1659
|
add_opt(common_arg(
|
|
1017
1660
|
{"--in-suffix"}, "STRING",
|
|
1018
1661
|
"string to suffix after user inputs with (default: empty)",
|
|
@@ -1020,7 +1663,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
1020
1663
|
params.input_suffix = value;
|
|
1021
1664
|
params.enable_chat_template = false;
|
|
1022
1665
|
}
|
|
1023
|
-
).set_examples({LLAMA_EXAMPLE_MAIN
|
|
1666
|
+
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
|
1024
1667
|
add_opt(common_arg(
|
|
1025
1668
|
{"--no-warmup"},
|
|
1026
1669
|
"skip warming up the model with an empty run",
|
|
@@ -1037,7 +1680,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
1037
1680
|
[](common_params & params) {
|
|
1038
1681
|
params.spm_infill = true;
|
|
1039
1682
|
}
|
|
1040
|
-
).set_examples({LLAMA_EXAMPLE_SERVER
|
|
1683
|
+
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
|
1041
1684
|
add_opt(common_arg(
|
|
1042
1685
|
{"--samplers"}, "SAMPLERS",
|
|
1043
1686
|
string_format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()),
|
|
@@ -1285,23 +1928,32 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     add_opt(common_arg(
         {"--grammar-file"}, "FNAME",
         "file to read grammar from",
+        [](common_params & params, const std::string & value) {
+            params.sampling.grammar = read_file(value);
+        }
+    ).set_sparam());
+    add_opt(common_arg(
+        {"-j", "--json-schema"}, "SCHEMA",
+        "JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead",
+        [](common_params & params, const std::string & value) {
+            params.sampling.grammar = json_schema_to_grammar(json::parse(value));
+        }
+    ).set_sparam());
+    add_opt(common_arg(
+        {"-jf", "--json-schema-file"}, "FILE",
+        "File containing a JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead",
         [](common_params & params, const std::string & value) {
             std::ifstream file(value);
             if (!file) {
                 throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
             }
+            std::string schema;
             std::copy(
                 std::istreambuf_iterator<char>(file),
                 std::istreambuf_iterator<char>(),
-                std::back_inserter(
+                std::back_inserter(schema)
             );
-
-    ).set_sparam());
-    add_opt(common_arg(
-        {"-j", "--json-schema"}, "SCHEMA",
-        "JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead",
-        [](common_params & params, const std::string & value) {
-            params.sampling.grammar = json_schema_to_grammar(json::parse(value));
+            params.sampling.grammar = json_schema_to_grammar(json::parse(schema));
         }
     ).set_sparam());
     add_opt(common_arg(
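The old `--grammar-file` body is reused for the new `-jf, --json-schema-file` option, which converts the file contents through `json_schema_to_grammar` instead of appending raw text, while `-j, --json-schema` keeps converting inline. A rough, self-contained illustration of that conversion call, with assumed include paths and the `json` alias used in `common/`:

```cpp
#include <cstdio>
#include <string>

#include <nlohmann/json.hpp>
#include "json-schema-to-grammar.h"  // assumed header declaring json_schema_to_grammar()

using json = nlohmann::ordered_json;

int main() {
    // "{}" is the permissive schema mentioned in the help text: it accepts any JSON object.
    const std::string grammar = json_schema_to_grammar(json::parse("{}"));
    std::printf("%s\n", grammar.c_str());
    return 0;
}
```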
@@ -1445,13 +2097,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.cache_type_v = kv_cache_type_from_str(value);
         }
     ).set_env("LLAMA_ARG_CACHE_TYPE_V"));
-    add_opt(common_arg(
-        {"--perplexity", "--all-logits"},
-        string_format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"),
-        [](common_params & params) {
-            params.logits_all = true;
-        }
-    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
     add_opt(common_arg(
         {"--hellaswag"},
         "compute HellaSwag score over random tasks from datafile supplied with -f",
@@ -1559,11 +2204,33 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING"));
     add_opt(common_arg(
         {"--mmproj"}, "FILE",
-        "path to a multimodal projector file
+        "path to a multimodal projector file. see tools/mtmd/README.md\n"
+        "note: if -hf is used, this argument can be omitted",
         [](common_params & params, const std::string & value) {
-            params.mmproj = value;
+            params.mmproj.path = value;
         }
-    ).set_examples(
+    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ"));
+    add_opt(common_arg(
+        {"--mmproj-url"}, "URL",
+        "URL to a multimodal projector file. see tools/mtmd/README.md",
+        [](common_params & params, const std::string & value) {
+            params.mmproj.url = value;
+        }
+    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_URL"));
+    add_opt(common_arg(
+        {"--no-mmproj"},
+        "explicitly disable multimodal projector, useful when using -hf",
+        [](common_params & params) {
+            params.no_mmproj = true;
+        }
+    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ"));
+    add_opt(common_arg(
+        {"--no-mmproj-offload"},
+        "do not offload multimodal projector to GPU",
+        [](common_params & params) {
+            params.mmproj_use_gpu = false;
+        }
+    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ_OFFLOAD"));
     add_opt(common_arg(
         {"--image"}, "FILE",
         "path to an image file. use with multimodal models. Specify multiple times for batching",
@@ -1647,6 +2314,41 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             exit(0);
         }
     ));
+    add_opt(common_arg(
+        {"--override-tensor", "-ot"}, "<tensor name pattern>=<buffer type>,...",
+        "override tensor buffer type", [](common_params & params, const std::string & value) {
+            /* static */ std::map<std::string, ggml_backend_buffer_type_t> buft_list;
+            if (buft_list.empty()) {
+                // enumerate all the devices and add their buffer types to the list
+                for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+                    auto * dev = ggml_backend_dev_get(i);
+                    auto * buft = ggml_backend_dev_buffer_type(dev);
+                    if (buft) {
+                        buft_list[ggml_backend_buft_name(buft)] = buft;
+                    }
+                }
+            }
+
+            for (const auto & override : string_split<std::string>(value, ',')) {
+                std::string::size_type pos = override.find('=');
+                if (pos == std::string::npos) {
+                    throw std::invalid_argument("invalid value");
+                }
+                std::string tensor_name = override.substr(0, pos);
+                std::string buffer_type = override.substr(pos + 1);
+
+                if (buft_list.find(buffer_type) == buft_list.end()) {
+                    printf("Available buffer types:\n");
+                    for (const auto & it : buft_list) {
+                        printf("  %s\n", ggml_backend_buft_name(it.second));
+                    }
+                    throw std::invalid_argument("unknown buffer type");
+                }
+                // FIXME: this leaks memory
+                params.tensor_buft_overrides.push_back({strdup(tensor_name.c_str()), buft_list.at(buffer_type)});
+            }
+        }
+    ));
     add_opt(common_arg(
         {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
         "number of layers to store in VRAM",
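The `-ot, --override-tensor` handler splits its argument on `,` and then on the first `=` of each item, mapping a tensor-name pattern to a backend buffer type looked up in `buft_list`. A standalone sketch of just that parsing step, using a hypothetical `parse_overrides` helper rather than the real ggml backend enumeration:

```cpp
#include <sstream>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

// Hypothetical helper (illustration only): turn "pat1=TYPE1,pat2=TYPE2" into
// (pattern, buffer type) pairs, throwing on malformed items like the real handler does.
static std::vector<std::pair<std::string, std::string>> parse_overrides(const std::string & value) {
    std::vector<std::pair<std::string, std::string>> out;
    std::stringstream ss(value);
    std::string item;
    while (std::getline(ss, item, ',')) {
        const auto pos = item.find('=');
        if (pos == std::string::npos) {
            throw std::invalid_argument("invalid value");
        }
        out.emplace_back(item.substr(0, pos), item.substr(pos + 1));
    }
    return out;
}

// e.g. parse_overrides("blk\\..*\\.ffn_.*=CPU") yields one pair:
// pattern "blk\..*\.ffn_.*" mapped to buffer type name "CPU"
```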
@@ -1735,6 +2437,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
         }
     ));
+    add_opt(common_arg(
+        {"--no-op-offload"},
+        string_format("disable offloading host tensor operations to device (default: %s)", params.no_op_offload ? "true" : "false"),
+        [](common_params & params) {
+            params.no_op_offload = true;
+        }
+    ));
     add_opt(common_arg(
         {"--lora"}, "FNAME",
         "path to LoRA adapter (can be repeated to use multiple adapters)",
@@ -1790,51 +2499,52 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             "or `--model-url` if set, otherwise %s)", DEFAULT_MODEL_PATH
         ),
         [](common_params & params, const std::string & value) {
-            params.model = value;
+            params.model.path = value;
         }
     ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL"));
     add_opt(common_arg(
         {"-mu", "--model-url"}, "MODEL_URL",
         "model download url (default: unused)",
         [](common_params & params, const std::string & value) {
-            params.
+            params.model.url = value;
         }
     ).set_env("LLAMA_ARG_MODEL_URL"));
     add_opt(common_arg(
         {"-hf", "-hfr", "--hf-repo"}, "<user>/<model>[:quant]",
         "Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n"
+        "mmproj is also downloaded automatically if available. to disable, add --no-mmproj\n"
         "example: unsloth/phi-4-GGUF:q4_k_m\n"
         "(default: unused)",
         [](common_params & params, const std::string & value) {
-            params.hf_repo = value;
+            params.model.hf_repo = value;
         }
     ).set_env("LLAMA_ARG_HF_REPO"));
     add_opt(common_arg(
         {"-hfd", "-hfrd", "--hf-repo-draft"}, "<user>/<model>[:quant]",
         "Same as --hf-repo, but for the draft model (default: unused)",
         [](common_params & params, const std::string & value) {
-            params.speculative.hf_repo = value;
+            params.speculative.model.hf_repo = value;
         }
     ).set_env("LLAMA_ARG_HFD_REPO"));
     add_opt(common_arg(
         {"-hff", "--hf-file"}, "FILE",
         "Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)",
         [](common_params & params, const std::string & value) {
-            params.hf_file = value;
+            params.model.hf_file = value;
         }
     ).set_env("LLAMA_ARG_HF_FILE"));
     add_opt(common_arg(
         {"-hfv", "-hfrv", "--hf-repo-v"}, "<user>/<model>[:quant]",
         "Hugging Face model repository for the vocoder model (default: unused)",
         [](common_params & params, const std::string & value) {
-            params.vocoder.hf_repo = value;
+            params.vocoder.model.hf_repo = value;
         }
     ).set_env("LLAMA_ARG_HF_REPO_V"));
     add_opt(common_arg(
         {"-hffv", "--hf-file-v"}, "FILE",
         "Hugging Face model file for the vocoder model (default: unused)",
         [](common_params & params, const std::string & value) {
-            params.vocoder.hf_file = value;
+            params.vocoder.model.hf_file = value;
         }
     ).set_env("LLAMA_ARG_HF_FILE_V"));
     add_opt(common_arg(
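All of these handlers now write into nested fields (`params.model.path`, `.url`, `.hf_repo`, `.hf_file`), and the draft and vocoder settings gain the same shape under `params.speculative.model` and `params.vocoder.model`. A minimal sketch of the descriptor struct this implies, inferred from the call sites above; the real definition lives in `common/common.h` and may carry extra fields or defaults:

```cpp
#include <string>

// Sketch of the assumed model descriptor behind params.model, params.speculative.model
// and params.vocoder.model after this refactor (field names taken from the handlers above).
struct common_params_model {
    std::string path;     // local file path   (-m, --model / -md, --model-draft / -mv, --model-vocoder)
    std::string url;      // direct download   (-mu, --model-url)
    std::string hf_repo;  // Hugging Face repo (-hf, --hf-repo and friends)
    std::string hf_file;  // file in the repo  (-hff, --hf-file and friends)
};
```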
@@ -1875,7 +2585,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, int value) {
             params.n_junk = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_PASSKEY}));
+    ).set_examples({LLAMA_EXAMPLE_PASSKEY, LLAMA_EXAMPLE_PARALLEL}));
     add_opt(common_arg(
         {"--pos"}, "N",
         string_format("position of the passkey in the junk text (default: %d)", params.i_pos),
@@ -1925,13 +2635,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.i_chunk = value;
         }
     ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
+    add_opt(common_arg(
+        {"--parse-special"},
+        string_format("prase special tokens (chat, tool, etc) (default: %s)", params.parse_special ? "true" : "false"),
+        [](common_params & params) {
+            params.parse_special = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
     add_opt(common_arg(
         {"-pps"},
         string_format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false"),
         [](common_params & params) {
             params.is_pp_shared = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_BENCH}));
+    ).set_examples({LLAMA_EXAMPLE_BENCH, LLAMA_EXAMPLE_PARALLEL}));
     add_opt(common_arg(
         {"-npp"}, "n0,n1,...",
         "number of prompt tokens",
@@ -1979,7 +2696,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
     add_opt(common_arg(
         {"--host"}, "HOST",
-        string_format("ip address to listen (default: %s)", params.hostname.c_str()),
+        string_format("ip address to listen, or bind to an UNIX socket if the address ends with .sock (default: %s)", params.hostname.c_str()),
         [](common_params & params, const std::string & value) {
             params.hostname = value;
         }
@@ -2074,7 +2791,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP"));
     add_opt(common_arg(
         {"--cache-reuse"}, "N",
-        string_format(
+        string_format(
+            "min chunk size to attempt reusing from the cache via KV shifting (default: %d)\n"
+            "[(card)](https://ggml.ai/f0.png)", params.n_cache_reuse
+        ),
         [](common_params & params, int value) {
             params.n_cache_reuse = value;
         }
@@ -2147,7 +2867,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.chat_template = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_LLAVA}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
     add_opt(common_arg(
         {"--chat-template-file"}, "JINJA_TEMPLATE_FILE",
         string_format(
@@ -2157,16 +2877,19 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         "list of built-in templates:\n%s", list_builtin_chat_templates().c_str()
         ),
         [](common_params & params, const std::string & value) {
-            std::ifstream file(value);
-            if (!file) {
-                throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
-            }
-            std::copy(
-                std::istreambuf_iterator<char>(file),
-                std::istreambuf_iterator<char>(),
-                std::back_inserter(params.chat_template));
+            params.chat_template = read_file(value);
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
+    add_opt(common_arg(
+        {"--no-prefill-assistant"},
+        string_format(
+            "whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)\n"
+            "when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled\n"
+        ),
+        [](common_params & params) {
+            params.prefill_assistant = false;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_PREFILL_ASSISTANT"));
     add_opt(common_arg(
         {"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
         string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),
@@ -2187,7 +2910,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.simple_io = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(common_arg(
         {"--positive-file"}, "FNAME",
         string_format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()),
@@ -2454,7 +3177,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"-md", "--model-draft"}, "FNAME",
         "draft model for speculative decoding (default: unused)",
         [](common_params & params, const std::string & value) {
-            params.speculative.model = value;
+            params.speculative.model.path = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));

@@ -2462,7 +3185,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"-mv", "--model-vocoder"}, "FNAME",
         "vocoder model for audio generation (default: unused)",
         [](common_params & params, const std::string & value) {
-            params.vocoder.model = value;
+            params.vocoder.model.path = value;
         }
     ).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
@@ -2485,10 +3208,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--tts-oute-default"},
         string_format("use default OuteTTS models (note: can download weights from the internet)"),
         [](common_params & params) {
-            params.hf_repo = "OuteAI/OuteTTS-0.2-500M-GGUF";
-            params.hf_file = "OuteTTS-0.2-500M-Q8_0.gguf";
-            params.vocoder.hf_repo = "ggml-org/WavTokenizer";
-            params.vocoder.hf_file = "WavTokenizer-Large-75-F16.gguf";
+            params.model.hf_repo = "OuteAI/OuteTTS-0.2-500M-GGUF";
+            params.model.hf_file = "OuteTTS-0.2-500M-Q8_0.gguf";
+            params.vocoder.model.hf_repo = "ggml-org/WavTokenizer";
+            params.vocoder.model.hf_file = "WavTokenizer-Large-75-F16.gguf";
         }
     ).set_examples({LLAMA_EXAMPLE_TTS}));

@@ -2496,8 +3219,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--embd-bge-small-en-default"},
         string_format("use default bge-small-en-v1.5 model (note: can download weights from the internet)"),
         [](common_params & params) {
-            params.hf_repo = "ggml-org/bge-small-en-v1.5-Q8_0-GGUF";
-            params.hf_file = "bge-small-en-v1.5-q8_0.gguf";
+            params.model.hf_repo = "ggml-org/bge-small-en-v1.5-Q8_0-GGUF";
+            params.model.hf_file = "bge-small-en-v1.5-q8_0.gguf";
             params.pooling_type = LLAMA_POOLING_TYPE_NONE;
             params.embd_normalize = 2;
             params.n_ctx = 512;
@@ -2510,8 +3233,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--embd-e5-small-en-default"},
         string_format("use default e5-small-v2 model (note: can download weights from the internet)"),
         [](common_params & params) {
-            params.hf_repo = "ggml-org/e5-small-v2-Q8_0-GGUF";
-            params.hf_file = "e5-small-v2-q8_0.gguf";
+            params.model.hf_repo = "ggml-org/e5-small-v2-Q8_0-GGUF";
+            params.model.hf_file = "e5-small-v2-q8_0.gguf";
             params.pooling_type = LLAMA_POOLING_TYPE_NONE;
             params.embd_normalize = 2;
             params.n_ctx = 512;
@@ -2524,8 +3247,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--embd-gte-small-default"},
        string_format("use default gte-small model (note: can download weights from the internet)"),
         [](common_params & params) {
-            params.hf_repo = "ggml-org/gte-small-Q8_0-GGUF";
-            params.hf_file = "gte-small-q8_0.gguf";
+            params.model.hf_repo = "ggml-org/gte-small-Q8_0-GGUF";
+            params.model.hf_file = "gte-small-q8_0.gguf";
             params.pooling_type = LLAMA_POOLING_TYPE_NONE;
             params.embd_normalize = 2;
             params.n_ctx = 512;
@@ -2538,8 +3261,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--fim-qwen-1.5b-default"},
         string_format("use default Qwen 2.5 Coder 1.5B (note: can download weights from the internet)"),
         [](common_params & params) {
-            params.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF";
-            params.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf";
+            params.model.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF";
+            params.model.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf";
             params.port = 8012;
             params.n_gpu_layers = 99;
             params.flash_attn = true;
@@ -2554,8 +3277,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--fim-qwen-3b-default"},
         string_format("use default Qwen 2.5 Coder 3B (note: can download weights from the internet)"),
         [](common_params & params) {
-            params.hf_repo = "ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF";
-            params.hf_file = "qwen2.5-coder-3b-q8_0.gguf";
+            params.model.hf_repo = "ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF";
+            params.model.hf_file = "qwen2.5-coder-3b-q8_0.gguf";
             params.port = 8012;
             params.n_gpu_layers = 99;
             params.flash_attn = true;
@@ -2570,8 +3293,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--fim-qwen-7b-default"},
         string_format("use default Qwen 2.5 Coder 7B (note: can download weights from the internet)"),
         [](common_params & params) {
-            params.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
-            params.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
+            params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
+            params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
             params.port = 8012;
             params.n_gpu_layers = 99;
             params.flash_attn = true;
@@ -2586,10 +3309,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--fim-qwen-7b-spec"},
         string_format("use Qwen 2.5 Coder 7B + 0.5B draft for speculative decoding (note: can download weights from the internet)"),
         [](common_params & params) {
-            params.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
-            params.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
-            params.speculative.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
-            params.speculative.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
+            params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
+            params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
+            params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
+            params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
             params.speculative.n_gpu_layers = 99;
             params.port = 8012;
             params.n_gpu_layers = 99;
@@ -2605,10 +3328,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--fim-qwen-14b-spec"},
         string_format("use Qwen 2.5 Coder 14B + 0.5B draft for speculative decoding (note: can download weights from the internet)"),
         [](common_params & params) {
-            params.hf_repo = "ggml-org/Qwen2.5-Coder-14B-Q8_0-GGUF";
-            params.hf_file = "qwen2.5-coder-14b-q8_0.gguf";
-            params.speculative.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
-            params.speculative.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
+            params.model.hf_repo = "ggml-org/Qwen2.5-Coder-14B-Q8_0-GGUF";
+            params.model.hf_file = "qwen2.5-coder-14b-q8_0.gguf";
+            params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
+            params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
             params.speculative.n_gpu_layers = 99;
             params.port = 8012;
             params.n_gpu_layers = 99;