@fugood/llama.node 0.3.16 → 0.3.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +3 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +5 -0
- package/package.json +1 -1
- package/src/LlamaCompletionWorker.cpp +8 -0
- package/src/LlamaCompletionWorker.h +1 -0
- package/src/LlamaContext.cpp +3 -2
- package/src/llama.cpp/.github/workflows/build-linux-cross.yml +124 -0
- package/src/llama.cpp/.github/workflows/build.yml +70 -27
- package/src/llama.cpp/.github/workflows/docker.yml +6 -6
- package/src/llama.cpp/.github/workflows/server.yml +7 -11
- package/src/llama.cpp/CMakeLists.txt +23 -1
- package/src/llama.cpp/common/CMakeLists.txt +6 -3
- package/src/llama.cpp/common/arg.cpp +809 -105
- package/src/llama.cpp/common/arg.h +9 -0
- package/src/llama.cpp/common/chat.cpp +1 -1
- package/src/llama.cpp/common/common.cpp +31 -521
- package/src/llama.cpp/common/common.h +17 -36
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
- package/src/llama.cpp/common/llguidance.cpp +30 -47
- package/src/llama.cpp/common/minja/chat-template.hpp +15 -7
- package/src/llama.cpp/common/minja/minja.hpp +119 -93
- package/src/llama.cpp/common/sampling.cpp +3 -0
- package/src/llama.cpp/docs/build.md +122 -7
- package/src/llama.cpp/examples/CMakeLists.txt +0 -9
- package/src/llama.cpp/examples/batched/batched.cpp +1 -1
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +7 -1
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +15 -16
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +210 -8
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llava/CMakeLists.txt +39 -24
- package/src/llama.cpp/examples/llava/clip-impl.h +345 -0
- package/src/llama.cpp/examples/llava/clip.cpp +2152 -1803
- package/src/llama.cpp/examples/llava/clip.h +39 -22
- package/src/llama.cpp/examples/llava/deprecation-warning.cpp +22 -0
- package/src/llama.cpp/examples/llava/llava.cpp +64 -52
- package/src/llama.cpp/examples/llava/mtmd-cli.cpp +344 -0
- package/src/llama.cpp/examples/llava/mtmd.cpp +708 -0
- package/src/llama.cpp/examples/llava/mtmd.h +168 -0
- package/src/llama.cpp/examples/llava/{qwen2vl-cli.cpp → qwen2vl-test.cpp} +83 -31
- package/src/llama.cpp/examples/main/main.cpp +16 -5
- package/src/llama.cpp/examples/parallel/parallel.cpp +3 -1
- package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +17 -3
- package/src/llama.cpp/examples/quantize/quantize.cpp +115 -2
- package/src/llama.cpp/examples/rpc/CMakeLists.txt +4 -2
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +163 -8
- package/src/llama.cpp/examples/run/CMakeLists.txt +12 -1
- package/src/llama.cpp/examples/run/run.cpp +14 -28
- package/src/llama.cpp/examples/server/httplib.h +313 -247
- package/src/llama.cpp/examples/server/server.cpp +238 -139
- package/src/llama.cpp/examples/server/utils.hpp +51 -2
- package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
- package/src/llama.cpp/examples/sycl/build.sh +2 -2
- package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
- package/src/llama.cpp/examples/tts/tts.cpp +6 -9
- package/src/llama.cpp/ggml/CMakeLists.txt +8 -2
- package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
- package/src/llama.cpp/ggml/include/ggml.h +66 -99
- package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
- package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +48 -22
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -192
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +754 -404
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1003 -13519
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +2 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +0 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +3 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +533 -88
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8809 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +258 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
- package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +70 -3
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -260
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +293 -40
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +96 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +350 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +2 -292
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +967 -438
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +204 -280
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +23 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +646 -114
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +12 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +17 -8
- package/src/llama.cpp/ggml/src/ggml.c +141 -245
- package/src/llama.cpp/ggml/src/gguf.cpp +1 -0
- package/src/llama.cpp/include/llama.h +30 -11
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +2 -0
- package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
- package/src/llama.cpp/src/CMakeLists.txt +3 -2
- package/src/llama.cpp/src/llama-adapter.cpp +37 -1
- package/src/llama.cpp/src/llama-arch.cpp +160 -17
- package/src/llama.cpp/src/llama-arch.h +16 -0
- package/src/llama.cpp/src/llama-chat.cpp +82 -17
- package/src/llama.cpp/src/llama-chat.h +6 -2
- package/src/llama.cpp/src/llama-context.cpp +108 -92
- package/src/llama.cpp/src/llama-context.h +1 -2
- package/src/llama.cpp/src/llama-graph.cpp +189 -119
- package/src/llama.cpp/src/llama-graph.h +26 -6
- package/src/llama.cpp/src/llama-hparams.h +13 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +70 -123
- package/src/llama.cpp/src/llama-kv-cache.h +41 -115
- package/src/llama.cpp/src/llama-memory.h +1 -1
- package/src/llama.cpp/src/llama-mmap.cpp +1 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +10 -5
- package/src/llama.cpp/src/llama-model-loader.h +5 -3
- package/src/llama.cpp/src/llama-model.cpp +1760 -534
- package/src/llama.cpp/src/llama-model.h +13 -1
- package/src/llama.cpp/src/llama-quant.cpp +29 -8
- package/src/llama.cpp/src/llama-sampling.cpp +7 -1
- package/src/llama.cpp/src/llama-vocab.cpp +44 -6
- package/src/llama.cpp/src/llama.cpp +1 -1
- package/src/llama.cpp/tests/CMakeLists.txt +43 -30
- package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
- package/src/llama.cpp/tests/test-backend-ops.cpp +82 -43
- package/src/llama.cpp/tests/test-chat-template.cpp +34 -13
- package/src/llama.cpp/tests/test-chat.cpp +12 -2
- package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
- package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
- package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
- package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
- package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
- package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
- package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295

package/src/llama.cpp/common/arg.h

@@ -78,3 +78,12 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e
 
 // function to be used by test-arg-parser
 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
+bool common_has_curl();
+
+struct common_remote_params {
+    std::vector<std::string> headers;
+    long timeout = 0; // CURLOPT_TIMEOUT, in seconds ; 0 means no timeout
+    long max_size = 0; // max size of the response ; unlimited if 0 ; max is 2GB
+};
+// get remote file content, returns <http_code, raw_response_body>
+std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params);
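
The declarations above expose llama.cpp's new generic download helper to code outside common.cpp. As a rough usage sketch only (the include path, the raw header-string format, and the status-code handling are assumptions, not part of this diff), a caller linked against a curl-enabled build of the common library might fetch a small remote file like this:

```cpp
#include <cstdio>
#include <string>
#include <utility>
#include <vector>

#include "arg.h"  // assumed location of the declarations shown in the hunk above

// Sketch: fetch a small JSON manifest with a bearer token and a 30-second timeout.
static bool fetch_manifest(const std::string & url, const std::string & token) {
    if (!common_has_curl()) {
        return false;  // built without libcurl: remote fetches are unavailable
    }

    common_remote_params params;
    params.timeout  = 30;            // seconds; 0 would mean no timeout
    params.max_size = 1024 * 1024;   // cap the response at 1 MiB
    if (!token.empty()) {
        // raw HTTP header strings, mirroring typical libcurl usage (assumption)
        params.headers.push_back("Authorization: Bearer " + token);
    }

    // returns <http_code, raw_response_body> per the comment in the header
    std::pair<long, std::vector<char>> res = common_remote_get_content(url, params);
    if (res.first != 200) {
        std::fprintf(stderr, "request failed, HTTP %ld\n", res.first);
        return false;
    }

    std::string body(res.second.begin(), res.second.end());
    std::printf("received %zu bytes\n", body.size());
    return true;
}
```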

package/src/llama.cpp/common/chat.cpp

@@ -1622,7 +1622,7 @@ static common_chat_params common_chat_templates_apply_jinja(
     }
 
     // Hermes 2/3 Pro, Qwen 2.5 Instruct (w/ tools)
-    if (src.find("<tool_call>") != std::string::npos && params.json_schema.is_null()) {
+    if (src.find("<tool_call>") != std::string::npos && params.json_schema.is_null() && params.tools.is_array() && params.json_schema.is_null()) {
         return common_chat_params_init_hermes_2_pro(tmpl, params);
     }
 

package/src/llama.cpp/common/common.cpp (all remaining hunks)

@@ -7,9 +7,6 @@
 
 #include "common.h"
 #include "log.h"
-// Change JSON_ASSERT from assert() to GGML_ASSERT:
-#define JSON_ASSERT GGML_ASSERT
-#include "json.hpp"
 #include "llama.h"
 
 #include <algorithm>

@@ -51,47 +48,11 @@
 #include <sys/stat.h>
 #include <unistd.h>
 #endif
-#if defined(LLAMA_USE_CURL)
-#include <curl/curl.h>
-#include <curl/easy.h>
-#include <future>
-#endif
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
-#if defined(LLAMA_USE_CURL)
-#ifdef __linux__
-#include <linux/limits.h>
-#elif defined(_WIN32)
-# if !defined(PATH_MAX)
-# define PATH_MAX MAX_PATH
-# endif
-#else
-#include <sys/syslimits.h>
-#endif
-#define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
-
-//
-// CURL utils
-//
-
-using curl_ptr = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;
-
-// cannot use unique_ptr for curl_slist, because we cannot update without destroying the old one
-struct curl_slist_ptr {
-    struct curl_slist * ptr = nullptr;
-    ~curl_slist_ptr() {
-        if (ptr) {
-            curl_slist_free_all(ptr);
-        }
-    }
-};
-#endif // LLAMA_USE_CURL
-
-using json = nlohmann::ordered_json;
-
 //
 // CPU utils
 //

@@ -869,7 +830,7 @@ std::string fs_get_cache_directory() {
     if (getenv("LLAMA_CACHE")) {
        cache_directory = std::getenv("LLAMA_CACHE");
     } else {
-#
+#if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)
         if (std::getenv("XDG_CACHE_HOME")) {
             cache_directory = std::getenv("XDG_CACHE_HOME");
         } else {

@@ -879,7 +840,9 @@ std::string fs_get_cache_directory() {
         cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
 #elif defined(_WIN32)
         cache_directory = std::getenv("LOCALAPPDATA");
-#endif
+#else
+# error Unknown architecture
+#endif
         cache_directory = ensure_trailing_slash(cache_directory);
         cache_directory += "llama.cpp";
     }

@@ -900,22 +863,14 @@ std::string fs_get_cache_file(const std::string & filename) {
 //
 // Model utils
 //
+
 struct common_init_result common_init_from_params(common_params & params) {
     common_init_result iparams;
     auto mparams = common_model_params_to_llama(params);
 
-    llama_model * model =
-
-    if (!params.hf_repo.empty() && !params.hf_file.empty()) {
-        model = common_load_model_from_hf(params.hf_repo, params.hf_file, params.model, params.hf_token, mparams);
-    } else if (!params.model_url.empty()) {
-        model = common_load_model_from_url(params.model_url, params.model, params.hf_token, mparams);
-    } else {
-        model = llama_model_load_from_file(params.model.c_str(), mparams);
-    }
-
+    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
     if (model == NULL) {
-        LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.c_str());
+        LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str());
         return iparams;
     }
 

@@ -950,7 +905,7 @@ struct common_init_result common_init_from_params(common_params & params) {
 
     llama_context * lctx = llama_init_from_model(model, cparams);
     if (lctx == NULL) {
-        LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
+        LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
         llama_model_free(model);
         return iparams;
     }
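
Both LOG_ERR call sites above now read the path through `params.model.path`, which indicates that `common_params::model` is a struct rather than a plain `std::string` in this llama.cpp revision. A minimal sketch of what that means for embedding code that fills the struct directly (only the `.path` member is taken from these hunks; the file name and the `model` member check on the result are assumptions):

```cpp
#include "common.h"  // assumed header defining common_params / common_init_from_params

int main() {
    common_params params;

    // before this update:  params.model      = "models/llama.gguf";
    // after this update:   params.model.path = "models/llama.gguf";
    params.model.path = "models/llama.gguf";  // hypothetical local file

    common_init_result llama_init = common_init_from_params(params);
    return llama_init.model ? 0 : 1;  // non-zero exit if the model failed to load
}
```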

@@ -1074,6 +1029,19 @@ struct common_init_result common_init_from_params(common_params & params) {
     return iparams;
 }
 
+std::string get_model_endpoint() {
+    const char * model_endpoint_env = getenv("MODEL_ENDPOINT");
+    // We still respect the use of environment-variable "HF_ENDPOINT" for backward-compatibility.
+    const char * hf_endpoint_env = getenv("HF_ENDPOINT");
+    const char * endpoint_env = model_endpoint_env ? model_endpoint_env : hf_endpoint_env;
+    std::string model_endpoint = "https://huggingface.co/";
+    if (endpoint_env) {
+        model_endpoint = endpoint_env;
+        if (model_endpoint.back() != '/') model_endpoint += '/';
+    }
+    return model_endpoint;
+}
+
 void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora) {
     llama_clear_adapter_lora(ctx);
     for (auto & la : lora) {
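
The added `get_model_endpoint()` helper resolves the download base URL from `MODEL_ENDPOINT`, falls back to `HF_ENDPOINT` for backward compatibility, defaults to `https://huggingface.co/`, and always returns a trailing slash. A small sketch of the resulting behaviour (the mirror URL is made up, and the forward declaration plus `setenv` calls assume a POSIX environment):

```cpp
#include <cstdio>
#include <cstdlib>
#include <string>

// assumed to be declared in llama.cpp's common headers alongside the definition above
std::string get_model_endpoint();

int main() {
    // no endpoint variables set -> default Hugging Face endpoint
    unsetenv("MODEL_ENDPOINT");
    unsetenv("HF_ENDPOINT");
    std::printf("%s\n", get_model_endpoint().c_str());  // https://huggingface.co/

    // MODEL_ENDPOINT wins over HF_ENDPOINT; a missing trailing slash is appended
    setenv("HF_ENDPOINT",    "https://hf-mirror.example/hf/", 1);
    setenv("MODEL_ENDPOINT", "https://hf-mirror.example",     1);
    std::printf("%s\n", get_model_endpoint().c_str());  // https://hf-mirror.example/
    return 0;
}
```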

@@ -1089,15 +1057,18 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
     if (!params.devices.empty()) {
         mparams.devices = params.devices.data();
     }
+
     if (params.n_gpu_layers != -1) {
         mparams.n_gpu_layers = params.n_gpu_layers;
     }
+
     mparams.main_gpu = params.main_gpu;
     mparams.split_mode = params.split_mode;
     mparams.tensor_split = params.tensor_split;
     mparams.use_mmap = params.use_mmap;
     mparams.use_mlock = params.use_mlock;
     mparams.check_tensors = params.check_tensors;
+
     if (params.kv_overrides.empty()) {
         mparams.kv_overrides = NULL;
     } else {

@@ -1105,6 +1076,13 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
         mparams.kv_overrides = params.kv_overrides.data();
     }
 
+    if (params.tensor_buft_overrides.empty()) {
+        mparams.tensor_buft_overrides = NULL;
+    } else {
+        GGML_ASSERT(params.tensor_buft_overrides.back().pattern == nullptr && "Tensor buffer overrides not terminated with empty pattern");
+        mparams.tensor_buft_overrides = params.tensor_buft_overrides.data();
+    }
+
     return mparams;
 }
 
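
The new `tensor_buft_overrides` pass-through asserts that the caller's override list is terminated by an entry whose `pattern` is `nullptr`. A hedged sketch of what that contract looks like for code that fills `common_params` by hand (the element's field order, the example regex, and the use of `ggml_backend_cpu_buffer_type()` are assumptions for illustration, not part of this diff):

```cpp
#include "common.h"        // assumed header defining common_params
#include "ggml-backend.h"  // ggml_backend_cpu_buffer_type()

// Sketch: keep tensors whose names match "ffn_.*" in CPU buffers, then close the
// list with an empty-pattern terminator as required by the GGML_ASSERT above.
static void set_tensor_overrides(common_params & params) {
    params.tensor_buft_overrides.push_back({ "ffn_.*", ggml_backend_cpu_buffer_type() });
    params.tensor_buft_overrides.push_back({ nullptr, nullptr });  // terminator checked in common_model_params_to_llama()
}
```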

@@ -1164,451 +1142,6 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
     return tpp;
 }
 
-#ifdef LLAMA_USE_CURL
-
-#define CURL_MAX_RETRY 3
-#define CURL_RETRY_DELAY_SECONDS 2
-
-static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds) {
-    int remaining_attempts = max_attempts;
-
-    while (remaining_attempts > 0) {
-        LOG_INF("%s: Trying to download from %s (attempt %d of %d)...\n", __func__ , url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);
-
-        CURLcode res = curl_easy_perform(curl);
-        if (res == CURLE_OK) {
-            return true;
-        }
-
-        int exponential_backoff_delay = std::pow(retry_delay_seconds, max_attempts - remaining_attempts) * 1000;
-        LOG_WRN("%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay);
-
-        remaining_attempts--;
-        std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
-    }
-
-    LOG_ERR("%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts);
-
-    return false;
-}
-
-static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
-    // Initialize libcurl
-    curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
-    curl_slist_ptr http_headers;
-    if (!curl) {
-        LOG_ERR("%s: error initializing libcurl\n", __func__);
-        return false;
-    }
-
-    bool force_download = false;
-
-    // Set the URL, allow to follow http redirection
-    curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
-    curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
-
-    // Check if hf-token or bearer-token was specified
-    if (!hf_token.empty()) {
-        std::string auth_header = "Authorization: Bearer " + hf_token;
-        http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
-        curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
-    }
-
-#if defined(_WIN32)
-    // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
-    // operating system. Currently implemented under MS-Windows.
-    curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
-#endif
-
-    // Check if the file already exists locally
-    auto file_exists = std::filesystem::exists(path);
-
-    // If the file exists, check its JSON metadata companion file.
-    std::string metadata_path = path + ".json";
-    nlohmann::json metadata;
-    std::string etag;
-    std::string last_modified;
-
-    if (file_exists) {
-        // Try and read the JSON metadata file (note: stream autoclosed upon exiting this block).
-        std::ifstream metadata_in(metadata_path);
-        if (metadata_in.good()) {
-            try {
-                metadata_in >> metadata;
-                LOG_INF("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
-                if (metadata.contains("url") && metadata.at("url").is_string()) {
-                    auto previous_url = metadata.at("url").get<std::string>();
-                    if (previous_url != url) {
-                        LOG_ERR("%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str());
-                        return false;
-                    }
-                }
-                if (metadata.contains("etag") && metadata.at("etag").is_string()) {
-                    etag = metadata.at("etag");
-                }
-                if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) {
-                    last_modified = metadata.at("lastModified");
-                }
-            } catch (const nlohmann::json::exception & e) {
-                LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
-                return false;
-            }
-        }
-    } else {
-        LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
-    }
-
-    // Send a HEAD request to retrieve the etag and last-modified headers
-    struct common_load_model_from_url_headers {
-        std::string etag;
-        std::string last_modified;
-    };
-
-    common_load_model_from_url_headers headers;
-
-    {
-        typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
-        auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
-            common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
-
-            static std::regex header_regex("([^:]+): (.*)\r\n");
-            static std::regex etag_regex("ETag", std::regex_constants::icase);
-            static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase);
-
-            std::string header(buffer, n_items);
-            std::smatch match;
-            if (std::regex_match(header, match, header_regex)) {
-                const std::string & key = match[1];
-                const std::string & value = match[2];
-                if (std::regex_match(key, match, etag_regex)) {
-                    headers->etag = value;
-                } else if (std::regex_match(key, match, last_modified_regex)) {
-                    headers->last_modified = value;
-                }
-            }
-            return n_items;
-        };
-
-        curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
-        curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); // hide head request progress
-        curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
-        curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
-
-        bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
-        if (!was_perform_successful) {
-            return false;
-        }
-
-        long http_code = 0;
-        curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
-        if (http_code != 200) {
-            // HEAD not supported, we don't know if the file has changed
-            // force trigger downloading
-            force_download = true;
-            LOG_ERR("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
-        }
-    }
-
-    bool should_download = !file_exists || force_download;
-    if (!should_download) {
-        if (!etag.empty() && etag != headers.etag) {
-            LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str());
-            should_download = true;
-        } else if (!last_modified.empty() && last_modified != headers.last_modified) {
-            LOG_WRN("%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str());
-            should_download = true;
-        }
-    }
-    if (should_download) {
-        std::string path_temporary = path + ".downloadInProgress";
-        if (file_exists) {
-            LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
-            if (remove(path.c_str()) != 0) {
-                LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
-                return false;
-            }
-        }
-
-        // Set the output file
-
-        struct FILE_deleter {
-            void operator()(FILE * f) const {
-                fclose(f);
-            }
-        };
-
-        std::unique_ptr<FILE, FILE_deleter> outfile(fopen(path_temporary.c_str(), "wb"));
-        if (!outfile) {
-            LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path.c_str());
-            return false;
-        }
-
-        typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * data, size_t size, size_t nmemb, void * fd);
-        auto write_callback = [](void * data, size_t size, size_t nmemb, void * fd) -> size_t {
-            return fwrite(data, size, nmemb, (FILE *)fd);
-        };
-        curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 0L);
-        curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
-        curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, outfile.get());
-
-        // display download progress
-        curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 0L);
-
-        // helper function to hide password in URL
-        auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string {
-            std::size_t protocol_pos = url.find("://");
-            if (protocol_pos == std::string::npos) {
-                return url; // Malformed URL
-            }
-
-            std::size_t at_pos = url.find('@', protocol_pos + 3);
-            if (at_pos == std::string::npos) {
-                return url; // No password in URL
-            }
-
-            return url.substr(0, protocol_pos + 3) + "********" + url.substr(at_pos);
-        };
-
-        // start the download
-        LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
-            llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
-        bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
-        if (!was_perform_successful) {
-            return false;
-        }
-
-        long http_code = 0;
-        curl_easy_getinfo (curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
-        if (http_code < 200 || http_code >= 400) {
-            LOG_ERR("%s: invalid http status code received: %ld\n", __func__, http_code);
-            return false;
-        }
-
-        // Causes file to be closed explicitly here before we rename it.
-        outfile.reset();
-
-        // Write the updated JSON metadata file.
-        metadata.update({
-            {"url", url},
-            {"etag", headers.etag},
-            {"lastModified", headers.last_modified}
-        });
-        std::ofstream(metadata_path) << metadata.dump(4);
-        LOG_INF("%s: file metadata saved: %s\n", __func__, metadata_path.c_str());
-
-        if (rename(path_temporary.c_str(), path.c_str()) != 0) {
-            LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
-            return false;
-        }
-    }
-
-    return true;
-}
-
-struct llama_model * common_load_model_from_url(
-        const std::string & model_url,
-        const std::string & local_path,
-        const std::string & hf_token,
-        const struct llama_model_params & params) {
-    // Basic validation of the model_url
-    if (model_url.empty()) {
-        LOG_ERR("%s: invalid model_url\n", __func__);
-        return NULL;
-    }
-
-    if (!common_download_file(model_url, local_path, hf_token)) {
-        return NULL;
-    }
-
-    // check for additional GGUFs split to download
-    int n_split = 0;
-    {
-        struct gguf_init_params gguf_params = {
-            /*.no_alloc = */ true,
-            /*.ctx = */ NULL,
-        };
-        auto * ctx_gguf = gguf_init_from_file(local_path.c_str(), gguf_params);
-        if (!ctx_gguf) {
-            LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, local_path.c_str());
-            return NULL;
-        }
-
-        auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT);
-        if (key_n_split >= 0) {
-            n_split = gguf_get_val_u16(ctx_gguf, key_n_split);
-        }
-
-        gguf_free(ctx_gguf);
-    }
-
-    if (n_split > 1) {
-        char split_prefix[PATH_MAX] = {0};
-        char split_url_prefix[LLAMA_CURL_MAX_URL_LENGTH] = {0};
-
-        // Verify the first split file format
-        // and extract split URL and PATH prefixes
-        {
-            if (!llama_split_prefix(split_prefix, sizeof(split_prefix), local_path.c_str(), 0, n_split)) {
-                LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, local_path.c_str(), n_split);
-                return NULL;
-            }
-
-            if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url.c_str(), 0, n_split)) {
-                LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url.c_str(), n_split);
-                return NULL;
-            }
-        }
-
-        // Prepare download in parallel
-        std::vector<std::future<bool>> futures_download;
-        for (int idx = 1; idx < n_split; idx++) {
-            futures_download.push_back(std::async(std::launch::async, [&split_prefix, &split_url_prefix, &n_split, hf_token](int download_idx) -> bool {
-                char split_path[PATH_MAX] = {0};
-                llama_split_path(split_path, sizeof(split_path), split_prefix, download_idx, n_split);
-
-                char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
-                llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split);
-
-                return common_download_file(split_url, split_path, hf_token);
-            }, idx));
-        }
-
-        // Wait for all downloads to complete
-        for (auto & f : futures_download) {
-            if (!f.get()) {
-                return NULL;
-            }
-        }
-    }
-
-    return llama_model_load_from_file(local_path.c_str(), params);
-}
-
-struct llama_model * common_load_model_from_hf(
-        const std::string & repo,
-        const std::string & remote_path,
-        const std::string & local_path,
-        const std::string & hf_token,
-        const struct llama_model_params & params) {
-    // construct hugging face model url:
-    //
-    //   --repo ggml-org/models --file tinyllama-1.1b/ggml-model-f16.gguf
-    //     https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf
-    //
-    //   --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf
-    //     https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf
-    //
-
-    std::string model_url = "https://huggingface.co/";
-    model_url += repo;
-    model_url += "/resolve/main/";
-    model_url += remote_path;
-
-    return common_load_model_from_url(model_url, local_path, hf_token, params);
-}
-
-/**
- * Allow getting the HF file from the HF repo with tag (like ollama), for example:
- * - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
- * - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M
- * - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s
- * Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo)
- *
- * Return pair of <repo, file> (with "repo" already having tag removed)
- *
- * Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
- */
-std::pair<std::string, std::string> common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & hf_token) {
-    auto parts = string_split<std::string>(hf_repo_with_tag, ':');
-    std::string tag = parts.size() > 1 ? parts.back() : "latest";
-    std::string hf_repo = parts[0];
-    if (string_split<std::string>(hf_repo, '/').size() != 2) {
-        throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n");
-    }
-
-    // fetch model info from Hugging Face Hub API
-    json model_info;
-    curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
-    curl_slist_ptr http_headers;
-    std::string res_str;
-    std::string url = "https://huggingface.co/v2/" + hf_repo + "/manifests/" + tag;
-    curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
-    curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
-    typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
-    auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t {
-        static_cast<std::string *>(data)->append((char * ) ptr, size * nmemb);
-        return size * nmemb;
-    };
-    curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
-    curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_str);
-#if defined(_WIN32)
-    curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
-#endif
-    if (!hf_token.empty()) {
-        std::string auth_header = "Authorization: Bearer " + hf_token;
-        http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
-    }
-    // Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
-    http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
-    http_headers.ptr = curl_slist_append(http_headers.ptr, "Accept: application/json");
-    curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
-
-    CURLcode res = curl_easy_perform(curl.get());
-
-    if (res != CURLE_OK) {
-        throw std::runtime_error("error: cannot make GET request to HF API");
-    }
-
-    long res_code;
-    curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code);
-    if (res_code == 200) {
-        model_info = json::parse(res_str);
-    } else if (res_code == 401) {
-        throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token");
-    } else {
-        throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str()));
-    }
-
-    // check response
-    if (!model_info.contains("ggufFile")) {
-        throw std::runtime_error("error: model does not have ggufFile");
-    }
-    json & gguf_file = model_info.at("ggufFile");
-    if (!gguf_file.contains("rfilename")) {
-        throw std::runtime_error("error: ggufFile does not have rfilename");
-    }
-
-    return std::make_pair(hf_repo, gguf_file.at("rfilename"));
-}
-
-#else
-
-struct llama_model * common_load_model_from_url(
-        const std::string & /*model_url*/,
-        const std::string & /*local_path*/,
-        const std::string & /*hf_token*/,
-        const struct llama_model_params & /*params*/) {
-    LOG_WRN("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
-    return nullptr;
-}
-
-struct llama_model * common_load_model_from_hf(
-        const std::string & /*repo*/,
-        const std::string & /*remote_path*/,
-        const std::string & /*local_path*/,
-        const std::string & /*hf_token*/,
-        const struct llama_model_params & /*params*/) {
-    LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
-    return nullptr;
-}
-
-std::pair<std::string, std::string> common_get_hf_file(const std::string &, const std::string &) {
-    LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
-    return std::make_pair("", "");
-}
-
-#endif // LLAMA_USE_CURL
-
 //
 // Batch utils
 //

@@ -2032,26 +1565,3 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
 
     return result;
 }
-
-template <>
-json common_grammar_trigger::to_json() const {
-    json out {
-        {"type", (int) type},
-        {"value", value},
-    };
-    if (type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
-        out["token"] = (int) token;
-    }
-    return out;
-}
-
-template <>
-common_grammar_trigger common_grammar_trigger::from_json(const json & in) {
-    common_grammar_trigger out;
-    out.type = (common_grammar_trigger_type) in.at("type").get<int>();
-    out.value = in.at("value").get<std::string>();
-    if (out.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
-        out.token = (llama_token) in.at("token").get<int>();
-    }
-    return out;
-}