@fugood/llama.node 0.3.16 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +6 -1
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +44 -2
- package/lib/index.js +132 -1
- package/lib/index.ts +203 -3
- package/package.json +2 -1
- package/src/EmbeddingWorker.cpp +1 -1
- package/src/LlamaCompletionWorker.cpp +374 -19
- package/src/LlamaCompletionWorker.h +31 -10
- package/src/LlamaContext.cpp +216 -7
- package/src/LlamaContext.h +12 -0
- package/src/common.hpp +15 -0
- package/src/llama.cpp/.github/workflows/build-linux-cross.yml +233 -0
- package/src/llama.cpp/.github/workflows/build.yml +89 -767
- package/src/llama.cpp/.github/workflows/docker.yml +9 -6
- package/src/llama.cpp/.github/workflows/release.yml +716 -0
- package/src/llama.cpp/.github/workflows/server.yml +19 -23
- package/src/llama.cpp/CMakeLists.txt +11 -1
- package/src/llama.cpp/cmake/build-info.cmake +8 -2
- package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
- package/src/llama.cpp/common/CMakeLists.txt +35 -4
- package/src/llama.cpp/common/arg.cpp +844 -121
- package/src/llama.cpp/common/arg.h +9 -0
- package/src/llama.cpp/common/chat.cpp +129 -107
- package/src/llama.cpp/common/chat.h +2 -0
- package/src/llama.cpp/common/common.cpp +64 -518
- package/src/llama.cpp/common/common.h +35 -45
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
- package/src/llama.cpp/common/llguidance.cpp +31 -47
- package/src/llama.cpp/common/minja/chat-template.hpp +23 -11
- package/src/llama.cpp/common/minja/minja.hpp +186 -127
- package/src/llama.cpp/common/regex-partial.cpp +204 -0
- package/src/llama.cpp/common/regex-partial.h +56 -0
- package/src/llama.cpp/common/sampling.cpp +60 -50
- package/src/llama.cpp/docs/build.md +122 -7
- package/src/llama.cpp/examples/CMakeLists.txt +2 -32
- package/src/llama.cpp/examples/batched/batched.cpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +9 -12
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/parallel/parallel.cpp +89 -15
- package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
- package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
- package/src/llama.cpp/examples/sycl/build.sh +2 -2
- package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
- package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/training/finetune.cpp +96 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +35 -2
- package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
- package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
- package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
- package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
- package/src/llama.cpp/ggml/include/ggml.h +76 -106
- package/src/llama.cpp/ggml/src/CMakeLists.txt +11 -8
- package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
- package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +66 -33
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -194
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1060 -410
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1008 -13533
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +31 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +90 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +266 -72
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1034 -88
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8796 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +252 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
- package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +106 -14
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -262
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
- package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +307 -40
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +125 -45
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +10 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +239 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +9 -307
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +944 -438
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +507 -411
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
- package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +83 -49
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1278 -282
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +32 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +133 -30
- package/src/llama.cpp/ggml/src/ggml.c +170 -265
- package/src/llama.cpp/ggml/src/gguf.cpp +34 -33
- package/src/llama.cpp/include/llama.h +82 -22
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +5 -3
- package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
- package/src/llama.cpp/scripts/xxd.cmake +1 -1
- package/src/llama.cpp/src/CMakeLists.txt +4 -2
- package/src/llama.cpp/src/llama-adapter.cpp +43 -1
- package/src/llama.cpp/src/llama-arch.cpp +163 -17
- package/src/llama.cpp/src/llama-arch.h +16 -0
- package/src/llama.cpp/src/llama-batch.cpp +5 -1
- package/src/llama.cpp/src/llama-batch.h +2 -1
- package/src/llama.cpp/src/llama-chat.cpp +91 -16
- package/src/llama.cpp/src/llama-chat.h +7 -2
- package/src/llama.cpp/src/llama-context.cpp +479 -575
- package/src/llama.cpp/src/llama-context.h +44 -33
- package/src/llama.cpp/src/llama-cparams.h +1 -0
- package/src/llama.cpp/src/llama-graph.cpp +209 -157
- package/src/llama.cpp/src/llama-graph.h +38 -14
- package/src/llama.cpp/src/llama-hparams.h +13 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +1604 -543
- package/src/llama.cpp/src/llama-kv-cache.h +283 -171
- package/src/llama.cpp/src/llama-memory.h +12 -2
- package/src/llama.cpp/src/llama-mmap.cpp +1 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +34 -20
- package/src/llama.cpp/src/llama-model-loader.h +5 -3
- package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
- package/src/llama.cpp/src/llama-model-saver.h +37 -0
- package/src/llama.cpp/src/llama-model.cpp +1803 -330
- package/src/llama.cpp/src/llama-model.h +21 -2
- package/src/llama.cpp/src/llama-quant.cpp +33 -10
- package/src/llama.cpp/src/llama-sampling.cpp +25 -7
- package/src/llama.cpp/src/llama-vocab.cpp +86 -10
- package/src/llama.cpp/src/llama-vocab.h +6 -0
- package/src/llama.cpp/src/llama.cpp +15 -1
- package/src/llama.cpp/tests/CMakeLists.txt +52 -31
- package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
- package/src/llama.cpp/tests/test-backend-ops.cpp +189 -90
- package/src/llama.cpp/tests/test-chat-template.cpp +26 -6
- package/src/llama.cpp/tests/test-chat.cpp +15 -3
- package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
- package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
- package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
- package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
- package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
- package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
- package/src/llama.cpp/tests/test-opt.cpp +33 -21
- package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
- package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
- package/src/llama.cpp/tests/test-sampling.cpp +1 -1
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
- package/src/llama.cpp/tools/CMakeLists.txt +39 -0
- package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +3 -3
- package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +1 -1
- package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +15 -16
- package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
- package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +623 -274
- package/src/llama.cpp/{examples → tools}/main/main.cpp +22 -14
- package/src/llama.cpp/tools/mtmd/CMakeLists.txt +47 -0
- package/src/llama.cpp/tools/mtmd/clip-impl.h +365 -0
- package/src/llama.cpp/tools/mtmd/clip.cpp +3646 -0
- package/src/llama.cpp/tools/mtmd/clip.h +99 -0
- package/src/llama.cpp/tools/mtmd/deprecation-warning.cpp +22 -0
- package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +370 -0
- package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
- package/src/llama.cpp/tools/mtmd/mtmd.cpp +678 -0
- package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
- package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +21 -5
- package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +53 -3
- package/src/llama.cpp/tools/rpc/CMakeLists.txt +4 -0
- package/src/llama.cpp/tools/rpc/rpc-server.cpp +322 -0
- package/src/llama.cpp/tools/run/CMakeLists.txt +16 -0
- package/src/llama.cpp/{examples → tools}/run/run.cpp +30 -30
- package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
- package/src/llama.cpp/{examples → tools}/server/httplib.h +313 -247
- package/src/llama.cpp/{examples → tools}/server/server.cpp +529 -215
- package/src/llama.cpp/{examples → tools}/server/utils.hpp +427 -6
- package/src/llama.cpp/{examples → tools}/tts/tts.cpp +6 -9
- package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/infill/infill.cpp +0 -590
- package/src/llama.cpp/examples/llava/CMakeLists.txt +0 -66
- package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
- package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
- package/src/llama.cpp/examples/llava/clip.cpp +0 -3206
- package/src/llama.cpp/examples/llava/clip.h +0 -118
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
- package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
- package/src/llama.cpp/examples/llava/llava.cpp +0 -574
- package/src/llama.cpp/examples/llava/llava.h +0 -49
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
- package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +0 -584
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
- package/src/llama.cpp/examples/rpc/CMakeLists.txt +0 -2
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +0 -171
- package/src/llama.cpp/examples/run/CMakeLists.txt +0 -5
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
- /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
- /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0
package/src/llama.cpp/common/common.cpp

@@ -7,9 +7,6 @@
 
 #include "common.h"
 #include "log.h"
-// Change JSON_ASSERT from assert() to GGML_ASSERT:
-#define JSON_ASSERT GGML_ASSERT
-#include "json.hpp"
 #include "llama.h"
 
 #include <algorithm>
@@ -51,47 +48,11 @@
 #include <sys/stat.h>
 #include <unistd.h>
 #endif
-#if defined(LLAMA_USE_CURL)
-#include <curl/curl.h>
-#include <curl/easy.h>
-#include <future>
-#endif
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
-#if defined(LLAMA_USE_CURL)
-#ifdef __linux__
-#include <linux/limits.h>
-#elif defined(_WIN32)
-# if !defined(PATH_MAX)
-# define PATH_MAX MAX_PATH
-# endif
-#else
-#include <sys/syslimits.h>
-#endif
-#define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
-
-//
-// CURL utils
-//
-
-using curl_ptr = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;
-
-// cannot use unique_ptr for curl_slist, because we cannot update without destroying the old one
-struct curl_slist_ptr {
-    struct curl_slist * ptr = nullptr;
-    ~curl_slist_ptr() {
-        if (ptr) {
-            curl_slist_free_all(ptr);
-        }
-    }
-};
-#endif // LLAMA_USE_CURL
-
-using json = nlohmann::ordered_json;
-
 //
 // CPU utils
 //
@@ -482,6 +443,25 @@ void string_replace_all(std::string & s, const std::string & search, const std::
     s = std::move(builder);
 }
 
+bool string_ends_with(const std::string_view & str, const std::string_view & suffix) {
+    return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
+}
+size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop) {
+    if (!str.empty() && !stop.empty()) {
+        const char text_last_char = str.back();
+        for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--) {
+            if (stop[char_index] == text_last_char) {
+                const auto current_partial = stop.substr(0, char_index + 1);
+                if (string_ends_with(str, current_partial)) {
+                    return str.size() - char_index - 1;
+                }
+            }
+        }
+    }
+
+    return std::string::npos;
+}
+
 std::string regex_escape(const std::string & s) {
     static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
     return std::regex_replace(s, special_chars, "\\$0");
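Note: the helpers added in this hunk back the new partial stop-string handling in the completion workers. Below is a minimal standalone sketch (not part of the package diff) of how such a helper can be used when streaming text; the generated text and stop string are made-up values, and the local functions simply mirror the helpers added above.

// Sketch: hold back output that might be the beginning of a stop sequence.
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <string>
#include <string_view>

// local copies mirroring the helpers added to common.cpp in this release
static bool ends_with(std::string_view str, std::string_view suffix) {
    return str.size() >= suffix.size() &&
           str.compare(str.size() - suffix.size(), suffix.size(), suffix) == 0;
}

static size_t find_partial_stop(std::string_view str, std::string_view stop) {
    if (!str.empty() && !stop.empty()) {
        const char last = str.back();
        for (int64_t i = (int64_t) stop.size() - 1; i >= 0; i--) {
            if (stop[i] == last && ends_with(str, stop.substr(0, i + 1))) {
                return str.size() - i - 1;  // index where the partial stop begins
            }
        }
    }
    return std::string::npos;
}

int main() {
    const std::string generated = "Hello, wor";  // hypothetical streamed text
    const std::string stop      = "world";       // hypothetical stop sequence

    const size_t pos = find_partial_stop(generated, stop);
    if (pos != std::string::npos) {
        // "wor" could be the start of "world": emit only the safe prefix for now
        std::cout << generated.substr(0, pos) << "\n";  // prints "Hello, "
    } else {
        std::cout << generated << "\n";
    }
}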
@@ -869,7 +849,7 @@ std::string fs_get_cache_directory() {
     if (getenv("LLAMA_CACHE")) {
         cache_directory = std::getenv("LLAMA_CACHE");
     } else {
-#
+#if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)
         if (std::getenv("XDG_CACHE_HOME")) {
             cache_directory = std::getenv("XDG_CACHE_HOME");
         } else {
@@ -879,7 +859,9 @@ std::string fs_get_cache_directory() {
         cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
 #elif defined(_WIN32)
         cache_directory = std::getenv("LOCALAPPDATA");
-#
+#else
+# error Unknown architecture
+#endif
         cache_directory = ensure_trailing_slash(cache_directory);
         cache_directory += "llama.cpp";
     }
@@ -900,22 +882,14 @@ std::string fs_get_cache_file(const std::string & filename) {
 //
 // Model utils
 //
+
 struct common_init_result common_init_from_params(common_params & params) {
     common_init_result iparams;
     auto mparams = common_model_params_to_llama(params);
 
-    llama_model * model =
-
-    if (!params.hf_repo.empty() && !params.hf_file.empty()) {
-        model = common_load_model_from_hf(params.hf_repo, params.hf_file, params.model, params.hf_token, mparams);
-    } else if (!params.model_url.empty()) {
-        model = common_load_model_from_url(params.model_url, params.model, params.hf_token, mparams);
-    } else {
-        model = llama_model_load_from_file(params.model.c_str(), mparams);
-    }
-
+    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
     if (model == NULL) {
-        LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.c_str());
+        LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str());
         return iparams;
     }
 
@@ -950,7 +924,7 @@ struct common_init_result common_init_from_params(common_params & params) {
 
     llama_context * lctx = llama_init_from_model(model, cparams);
     if (lctx == NULL) {
-        LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
+        LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
         llama_model_free(model);
         return iparams;
     }
@@ -1074,6 +1048,19 @@ struct common_init_result common_init_from_params(common_params & params) {
     return iparams;
 }
 
+std::string get_model_endpoint() {
+    const char * model_endpoint_env = getenv("MODEL_ENDPOINT");
+    // We still respect the use of environment-variable "HF_ENDPOINT" for backward-compatibility.
+    const char * hf_endpoint_env = getenv("HF_ENDPOINT");
+    const char * endpoint_env = model_endpoint_env ? model_endpoint_env : hf_endpoint_env;
+    std::string model_endpoint = "https://huggingface.co/";
+    if (endpoint_env) {
+        model_endpoint = endpoint_env;
+        if (model_endpoint.back() != '/') model_endpoint += '/';
+    }
+    return model_endpoint;
+}
+
 void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora) {
     llama_clear_adapter_lora(ctx);
     for (auto & la : lora) {
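Note: the new get_model_endpoint() centralizes how the download endpoint is chosen: MODEL_ENDPOINT wins, HF_ENDPOINT is still honored for backward compatibility, and https://huggingface.co/ is the fallback, always normalized to end with a slash. A self-contained sketch of that resolution order follows (not part of the package diff; the mirror URL in the comment is hypothetical).

// Sketch: endpoint resolution order implemented by the new helper.
#include <cstdlib>
#include <iostream>
#include <string>

static std::string resolve_model_endpoint() {
    const char * model_env = std::getenv("MODEL_ENDPOINT");  // preferred variable
    const char * hf_env    = std::getenv("HF_ENDPOINT");     // kept for backward compatibility
    const char * env       = model_env ? model_env : hf_env;

    std::string endpoint = "https://huggingface.co/";         // default
    if (env) {
        endpoint = env;
        if (endpoint.back() != '/') {
            endpoint += '/';                                   // normalize trailing slash
        }
    }
    return endpoint;
}

int main() {
    // With MODEL_ENDPOINT=https://mirror.example.com/ this prints the mirror,
    // otherwise HF_ENDPOINT, otherwise the Hugging Face default.
    std::cout << resolve_model_endpoint() << "\n";
}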
@@ -1089,15 +1076,18 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
     if (!params.devices.empty()) {
         mparams.devices = params.devices.data();
     }
+
     if (params.n_gpu_layers != -1) {
         mparams.n_gpu_layers = params.n_gpu_layers;
     }
+
     mparams.main_gpu = params.main_gpu;
     mparams.split_mode = params.split_mode;
     mparams.tensor_split = params.tensor_split;
     mparams.use_mmap = params.use_mmap;
     mparams.use_mlock = params.use_mlock;
     mparams.check_tensors = params.check_tensors;
+
     if (params.kv_overrides.empty()) {
         mparams.kv_overrides = NULL;
     } else {
@@ -1105,6 +1095,13 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
         mparams.kv_overrides = params.kv_overrides.data();
     }
 
+    if (params.tensor_buft_overrides.empty()) {
+        mparams.tensor_buft_overrides = NULL;
+    } else {
+        GGML_ASSERT(params.tensor_buft_overrides.back().pattern == nullptr && "Tensor buffer overrides not terminated with empty pattern");
+        mparams.tensor_buft_overrides = params.tensor_buft_overrides.data();
+    }
+
     return mparams;
 }
 
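Note: the tensor_buft_overrides plumbing added here forwards a caller-supplied list of pattern-to-buffer-type overrides and, per the GGML_ASSERT, expects the list to end with an entry whose pattern is nullptr. Below is a hedged sketch (not part of the package diff) of building such a list; the struct and field names are assumed from the llama.h bundled in this release, and the regex pattern plus the choice of the CPU buffer type are purely illustrative.

// Sketch: a sentinel-terminated override list as the assertion above requires.
#include <vector>
#include "llama.h"
#include "ggml-backend.h"

static std::vector<llama_model_tensor_buft_override> make_overrides() {
    std::vector<llama_model_tensor_buft_override> overrides;

    // keep tensors matching this (hypothetical) pattern in host memory
    overrides.push_back({ "ffn_.*_exps", ggml_backend_cpu_buffer_type() });

    // sentinel entry: pattern == nullptr marks the end of the list
    overrides.push_back({ nullptr, nullptr });

    return overrides;
}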
@@ -1118,7 +1115,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.n_threads = params.cpuparams.n_threads;
     cparams.n_threads_batch = params.cpuparams_batch.n_threads == -1 ?
         params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
-    cparams.logits_all = params.logits_all;
     cparams.embeddings = params.embedding;
     cparams.rope_scaling_type = params.rope_scaling_type;
     cparams.rope_freq_base = params.rope_freq_base;
@@ -1136,6 +1132,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.offload_kqv = !params.no_kv_offload;
     cparams.flash_attn = params.flash_attn;
     cparams.no_perf = params.no_perf;
+    cparams.op_offload = !params.no_op_offload;
 
     if (params.reranking) {
         cparams.embeddings = true;
@@ -1164,451 +1161,6 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
     return tpp;
 }
 
-#ifdef LLAMA_USE_CURL
-
-#define CURL_MAX_RETRY 3
-#define CURL_RETRY_DELAY_SECONDS 2
-
-static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds) {
-    int remaining_attempts = max_attempts;
-
-    while (remaining_attempts > 0) {
-        LOG_INF("%s: Trying to download from %s (attempt %d of %d)...\n", __func__ , url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);
-
-        CURLcode res = curl_easy_perform(curl);
-        if (res == CURLE_OK) {
-            return true;
-        }
-
-        int exponential_backoff_delay = std::pow(retry_delay_seconds, max_attempts - remaining_attempts) * 1000;
-        LOG_WRN("%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay);
-
-        remaining_attempts--;
-        std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
-    }
-
-    LOG_ERR("%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts);
-
-    return false;
-}
-
-static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
-    // Initialize libcurl
-    curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
-    curl_slist_ptr http_headers;
-    if (!curl) {
-        LOG_ERR("%s: error initializing libcurl\n", __func__);
-        return false;
-    }
-
-    bool force_download = false;
-
-    // Set the URL, allow to follow http redirection
-    curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
-    curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
-
-    // Check if hf-token or bearer-token was specified
-    if (!hf_token.empty()) {
-        std::string auth_header = "Authorization: Bearer " + hf_token;
-        http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
-        curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
-    }
-
-#if defined(_WIN32)
-    // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
-    // operating system. Currently implemented under MS-Windows.
-    curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
-#endif
-
-    // Check if the file already exists locally
-    auto file_exists = std::filesystem::exists(path);
-
-    // If the file exists, check its JSON metadata companion file.
-    std::string metadata_path = path + ".json";
-    nlohmann::json metadata;
-    std::string etag;
-    std::string last_modified;
-
-    if (file_exists) {
-        // Try and read the JSON metadata file (note: stream autoclosed upon exiting this block).
-        std::ifstream metadata_in(metadata_path);
-        if (metadata_in.good()) {
-            try {
-                metadata_in >> metadata;
-                LOG_INF("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
-                if (metadata.contains("url") && metadata.at("url").is_string()) {
-                    auto previous_url = metadata.at("url").get<std::string>();
-                    if (previous_url != url) {
-                        LOG_ERR("%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str());
-                        return false;
-                    }
-                }
-                if (metadata.contains("etag") && metadata.at("etag").is_string()) {
-                    etag = metadata.at("etag");
-                }
-                if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) {
-                    last_modified = metadata.at("lastModified");
-                }
-            } catch (const nlohmann::json::exception & e) {
-                LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
-                return false;
-            }
-        }
-    } else {
-        LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
-    }
-
-    // Send a HEAD request to retrieve the etag and last-modified headers
-    struct common_load_model_from_url_headers {
-        std::string etag;
-        std::string last_modified;
-    };
-
-    common_load_model_from_url_headers headers;
-
-    {
-        typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
-        auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
-            common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
-
-            static std::regex header_regex("([^:]+): (.*)\r\n");
-            static std::regex etag_regex("ETag", std::regex_constants::icase);
-            static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase);
-
-            std::string header(buffer, n_items);
-            std::smatch match;
-            if (std::regex_match(header, match, header_regex)) {
-                const std::string & key = match[1];
-                const std::string & value = match[2];
-                if (std::regex_match(key, match, etag_regex)) {
-                    headers->etag = value;
-                } else if (std::regex_match(key, match, last_modified_regex)) {
-                    headers->last_modified = value;
-                }
-            }
-            return n_items;
-        };
-
-        curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
-        curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); // hide head request progress
-        curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
-        curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
-
-        bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
-        if (!was_perform_successful) {
-            return false;
-        }
-
-        long http_code = 0;
-        curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
-        if (http_code != 200) {
-            // HEAD not supported, we don't know if the file has changed
-            // force trigger downloading
-            force_download = true;
-            LOG_ERR("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
-        }
-    }
-
-    bool should_download = !file_exists || force_download;
-    if (!should_download) {
-        if (!etag.empty() && etag != headers.etag) {
-            LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str());
-            should_download = true;
-        } else if (!last_modified.empty() && last_modified != headers.last_modified) {
-            LOG_WRN("%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str());
-            should_download = true;
-        }
-    }
-    if (should_download) {
-        std::string path_temporary = path + ".downloadInProgress";
-        if (file_exists) {
-            LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
-            if (remove(path.c_str()) != 0) {
-                LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
-                return false;
-            }
-        }
-
-        // Set the output file
-
-        struct FILE_deleter {
-            void operator()(FILE * f) const {
-                fclose(f);
-            }
-        };
-
-        std::unique_ptr<FILE, FILE_deleter> outfile(fopen(path_temporary.c_str(), "wb"));
-        if (!outfile) {
-            LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path.c_str());
-            return false;
-        }
-
-        typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * data, size_t size, size_t nmemb, void * fd);
-        auto write_callback = [](void * data, size_t size, size_t nmemb, void * fd) -> size_t {
-            return fwrite(data, size, nmemb, (FILE *)fd);
-        };
-        curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 0L);
-        curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
-        curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, outfile.get());
-
-        // display download progress
-        curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 0L);
-
-        // helper function to hide password in URL
-        auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string {
-            std::size_t protocol_pos = url.find("://");
-            if (protocol_pos == std::string::npos) {
-                return url; // Malformed URL
-            }
-
-            std::size_t at_pos = url.find('@', protocol_pos + 3);
-            if (at_pos == std::string::npos) {
-                return url; // No password in URL
-            }
-
-            return url.substr(0, protocol_pos + 3) + "********" + url.substr(at_pos);
-        };
-
-        // start the download
-        LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
-            llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
-        bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
-        if (!was_perform_successful) {
-            return false;
-        }
-
-        long http_code = 0;
-        curl_easy_getinfo (curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
-        if (http_code < 200 || http_code >= 400) {
-            LOG_ERR("%s: invalid http status code received: %ld\n", __func__, http_code);
-            return false;
-        }
-
-        // Causes file to be closed explicitly here before we rename it.
-        outfile.reset();
-
-        // Write the updated JSON metadata file.
-        metadata.update({
-            {"url", url},
-            {"etag", headers.etag},
-            {"lastModified", headers.last_modified}
-        });
-        std::ofstream(metadata_path) << metadata.dump(4);
-        LOG_INF("%s: file metadata saved: %s\n", __func__, metadata_path.c_str());
-
-        if (rename(path_temporary.c_str(), path.c_str()) != 0) {
-            LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
-            return false;
-        }
-    }
-
-    return true;
-}
-
-struct llama_model * common_load_model_from_url(
-        const std::string & model_url,
-        const std::string & local_path,
-        const std::string & hf_token,
-        const struct llama_model_params & params) {
-    // Basic validation of the model_url
-    if (model_url.empty()) {
-        LOG_ERR("%s: invalid model_url\n", __func__);
-        return NULL;
-    }
-
-    if (!common_download_file(model_url, local_path, hf_token)) {
-        return NULL;
-    }
-
-    // check for additional GGUFs split to download
-    int n_split = 0;
-    {
-        struct gguf_init_params gguf_params = {
-            /*.no_alloc = */ true,
-            /*.ctx      = */ NULL,
-        };
-        auto * ctx_gguf = gguf_init_from_file(local_path.c_str(), gguf_params);
-        if (!ctx_gguf) {
-            LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, local_path.c_str());
-            return NULL;
-        }
-
-        auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT);
-        if (key_n_split >= 0) {
-            n_split = gguf_get_val_u16(ctx_gguf, key_n_split);
-        }
-
-        gguf_free(ctx_gguf);
-    }
-
-    if (n_split > 1) {
-        char split_prefix[PATH_MAX] = {0};
-        char split_url_prefix[LLAMA_CURL_MAX_URL_LENGTH] = {0};
-
-        // Verify the first split file format
-        // and extract split URL and PATH prefixes
-        {
-            if (!llama_split_prefix(split_prefix, sizeof(split_prefix), local_path.c_str(), 0, n_split)) {
-                LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, local_path.c_str(), n_split);
-                return NULL;
-            }
-
-            if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url.c_str(), 0, n_split)) {
-                LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url.c_str(), n_split);
-                return NULL;
-            }
-        }
-
-        // Prepare download in parallel
-        std::vector<std::future<bool>> futures_download;
-        for (int idx = 1; idx < n_split; idx++) {
-            futures_download.push_back(std::async(std::launch::async, [&split_prefix, &split_url_prefix, &n_split, hf_token](int download_idx) -> bool {
-                char split_path[PATH_MAX] = {0};
-                llama_split_path(split_path, sizeof(split_path), split_prefix, download_idx, n_split);
-
-                char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
-                llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split);
-
-                return common_download_file(split_url, split_path, hf_token);
-            }, idx));
-        }
-
-        // Wait for all downloads to complete
-        for (auto & f : futures_download) {
-            if (!f.get()) {
-                return NULL;
-            }
-        }
-    }
-
-    return llama_model_load_from_file(local_path.c_str(), params);
-}
-
-struct llama_model * common_load_model_from_hf(
-        const std::string & repo,
-        const std::string & remote_path,
-        const std::string & local_path,
-        const std::string & hf_token,
-        const struct llama_model_params & params) {
-    // construct hugging face model url:
-    //
-    //  --repo ggml-org/models --file tinyllama-1.1b/ggml-model-f16.gguf
-    //    https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf
-    //
-    //  --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf
-    //    https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf
-    //
-
-    std::string model_url = "https://huggingface.co/";
-    model_url += repo;
-    model_url += "/resolve/main/";
-    model_url += remote_path;
-
-    return common_load_model_from_url(model_url, local_path, hf_token, params);
-}
-
-/**
- * Allow getting the HF file from the HF repo with tag (like ollama), for example:
- * - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
- * - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M
- * - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s
- * Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo)
- *
- * Return pair of <repo, file> (with "repo" already having tag removed)
- *
- * Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
- */
-std::pair<std::string, std::string> common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & hf_token) {
-    auto parts = string_split<std::string>(hf_repo_with_tag, ':');
-    std::string tag = parts.size() > 1 ? parts.back() : "latest";
-    std::string hf_repo = parts[0];
-    if (string_split<std::string>(hf_repo, '/').size() != 2) {
-        throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n");
-    }
-
-    // fetch model info from Hugging Face Hub API
-    json model_info;
-    curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
-    curl_slist_ptr http_headers;
-    std::string res_str;
-    std::string url = "https://huggingface.co/v2/" + hf_repo + "/manifests/" + tag;
-    curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
-    curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
-    typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
-    auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t {
-        static_cast<std::string *>(data)->append((char * ) ptr, size * nmemb);
-        return size * nmemb;
-    };
-    curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
-    curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_str);
-#if defined(_WIN32)
-    curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
-#endif
-    if (!hf_token.empty()) {
-        std::string auth_header = "Authorization: Bearer " + hf_token;
-        http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
-    }
-    // Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
-    http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
-    http_headers.ptr = curl_slist_append(http_headers.ptr, "Accept: application/json");
-    curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
-
-    CURLcode res = curl_easy_perform(curl.get());
-
-    if (res != CURLE_OK) {
-        throw std::runtime_error("error: cannot make GET request to HF API");
-    }
-
-    long res_code;
-    curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code);
-    if (res_code == 200) {
-        model_info = json::parse(res_str);
-    } else if (res_code == 401) {
-        throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token");
-    } else {
-        throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str()));
-    }
-
-    // check response
-    if (!model_info.contains("ggufFile")) {
-        throw std::runtime_error("error: model does not have ggufFile");
-    }
-    json & gguf_file = model_info.at("ggufFile");
-    if (!gguf_file.contains("rfilename")) {
-        throw std::runtime_error("error: ggufFile does not have rfilename");
-    }
-
-    return std::make_pair(hf_repo, gguf_file.at("rfilename"));
-}
-
-#else
-
-struct llama_model * common_load_model_from_url(
-        const std::string & /*model_url*/,
-        const std::string & /*local_path*/,
-        const std::string & /*hf_token*/,
-        const struct llama_model_params & /*params*/) {
-    LOG_WRN("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
-    return nullptr;
-}
-
-struct llama_model * common_load_model_from_hf(
-        const std::string & /*repo*/,
-        const std::string & /*remote_path*/,
-        const std::string & /*local_path*/,
-        const std::string & /*hf_token*/,
-        const struct llama_model_params & /*params*/) {
-    LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
-    return nullptr;
-}
-
-std::pair<std::string, std::string> common_get_hf_file(const std::string &, const std::string &) {
-    LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
-    return std::make_pair("", "");
-}
-
-#endif // LLAMA_USE_CURL
-
 //
 // Batch utils
 //
@@ -2033,25 +1585,19 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
     return result;
 }
 
-
-
-
-
-
-
-
-
-    }
-    return out;
-}
+ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride) {
+    const int64_t ne_datapoint = llama_n_ctx(ctx);
+    const int64_t ndata = (tokens.size() - ne_datapoint - 1) / stride;
+    ggml_opt_dataset_t result = ggml_opt_dataset_init(
+        GGML_TYPE_I32, GGML_TYPE_I32, ne_datapoint, ne_datapoint, ndata, /*ndata_shard =*/ 1);
+
+    llama_token * data = (llama_token *) ggml_opt_dataset_data(result)->data;
+    llama_token * labels = (llama_token *) ggml_opt_dataset_labels(result)->data;
 
-
-
-
-    out.type = (common_grammar_trigger_type) in.at("type").get<int>();
-    out.value = in.at("value").get<std::string>();
-    if (out.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
-        out.token = (llama_token) in.at("token").get<int>();
+    for (int64_t idata = 0; idata < ndata; ++idata) {
+        memcpy(data + idata*ne_datapoint, tokens.data() + idata*stride + 0, ne_datapoint*sizeof(llama_token));
+        memcpy(labels + idata*ne_datapoint, tokens.data() + idata*stride + 1, ne_datapoint*sizeof(llama_token));
     }
-
+
+    return result;
 }