@fugood/llama.node 0.3.16 → 0.3.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +3 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +5 -0
- package/package.json +1 -1
- package/src/LlamaCompletionWorker.cpp +8 -0
- package/src/LlamaCompletionWorker.h +1 -0
- package/src/LlamaContext.cpp +3 -2
- package/src/llama.cpp/.github/workflows/build-linux-cross.yml +124 -0
- package/src/llama.cpp/.github/workflows/build.yml +70 -27
- package/src/llama.cpp/.github/workflows/docker.yml +6 -6
- package/src/llama.cpp/.github/workflows/server.yml +7 -11
- package/src/llama.cpp/CMakeLists.txt +23 -1
- package/src/llama.cpp/common/CMakeLists.txt +6 -3
- package/src/llama.cpp/common/arg.cpp +809 -105
- package/src/llama.cpp/common/arg.h +9 -0
- package/src/llama.cpp/common/chat.cpp +1 -1
- package/src/llama.cpp/common/common.cpp +31 -521
- package/src/llama.cpp/common/common.h +17 -36
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
- package/src/llama.cpp/common/llguidance.cpp +30 -47
- package/src/llama.cpp/common/minja/chat-template.hpp +15 -7
- package/src/llama.cpp/common/minja/minja.hpp +119 -93
- package/src/llama.cpp/common/sampling.cpp +3 -0
- package/src/llama.cpp/docs/build.md +122 -7
- package/src/llama.cpp/examples/CMakeLists.txt +0 -9
- package/src/llama.cpp/examples/batched/batched.cpp +1 -1
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +7 -1
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +15 -16
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +210 -8
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llava/CMakeLists.txt +39 -24
- package/src/llama.cpp/examples/llava/clip-impl.h +345 -0
- package/src/llama.cpp/examples/llava/clip.cpp +2152 -1803
- package/src/llama.cpp/examples/llava/clip.h +39 -22
- package/src/llama.cpp/examples/llava/deprecation-warning.cpp +22 -0
- package/src/llama.cpp/examples/llava/llava.cpp +64 -52
- package/src/llama.cpp/examples/llava/mtmd-cli.cpp +344 -0
- package/src/llama.cpp/examples/llava/mtmd.cpp +708 -0
- package/src/llama.cpp/examples/llava/mtmd.h +168 -0
- package/src/llama.cpp/examples/llava/{qwen2vl-cli.cpp → qwen2vl-test.cpp} +83 -31
- package/src/llama.cpp/examples/main/main.cpp +16 -5
- package/src/llama.cpp/examples/parallel/parallel.cpp +3 -1
- package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +17 -3
- package/src/llama.cpp/examples/quantize/quantize.cpp +115 -2
- package/src/llama.cpp/examples/rpc/CMakeLists.txt +4 -2
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +163 -8
- package/src/llama.cpp/examples/run/CMakeLists.txt +12 -1
- package/src/llama.cpp/examples/run/run.cpp +14 -28
- package/src/llama.cpp/examples/server/httplib.h +313 -247
- package/src/llama.cpp/examples/server/server.cpp +238 -139
- package/src/llama.cpp/examples/server/utils.hpp +51 -2
- package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
- package/src/llama.cpp/examples/sycl/build.sh +2 -2
- package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
- package/src/llama.cpp/examples/tts/tts.cpp +6 -9
- package/src/llama.cpp/ggml/CMakeLists.txt +8 -2
- package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
- package/src/llama.cpp/ggml/include/ggml.h +66 -99
- package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
- package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +48 -22
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -192
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +754 -404
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1003 -13519
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +2 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +0 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +3 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +533 -88
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8809 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +258 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
- package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +70 -3
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -260
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +293 -40
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +96 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +350 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +2 -292
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +967 -438
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +204 -280
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +23 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +646 -114
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +12 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +17 -8
- package/src/llama.cpp/ggml/src/ggml.c +141 -245
- package/src/llama.cpp/ggml/src/gguf.cpp +1 -0
- package/src/llama.cpp/include/llama.h +30 -11
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +2 -0
- package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
- package/src/llama.cpp/src/CMakeLists.txt +3 -2
- package/src/llama.cpp/src/llama-adapter.cpp +37 -1
- package/src/llama.cpp/src/llama-arch.cpp +160 -17
- package/src/llama.cpp/src/llama-arch.h +16 -0
- package/src/llama.cpp/src/llama-chat.cpp +82 -17
- package/src/llama.cpp/src/llama-chat.h +6 -2
- package/src/llama.cpp/src/llama-context.cpp +108 -92
- package/src/llama.cpp/src/llama-context.h +1 -2
- package/src/llama.cpp/src/llama-graph.cpp +189 -119
- package/src/llama.cpp/src/llama-graph.h +26 -6
- package/src/llama.cpp/src/llama-hparams.h +13 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +70 -123
- package/src/llama.cpp/src/llama-kv-cache.h +41 -115
- package/src/llama.cpp/src/llama-memory.h +1 -1
- package/src/llama.cpp/src/llama-mmap.cpp +1 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +10 -5
- package/src/llama.cpp/src/llama-model-loader.h +5 -3
- package/src/llama.cpp/src/llama-model.cpp +1760 -534
- package/src/llama.cpp/src/llama-model.h +13 -1
- package/src/llama.cpp/src/llama-quant.cpp +29 -8
- package/src/llama.cpp/src/llama-sampling.cpp +7 -1
- package/src/llama.cpp/src/llama-vocab.cpp +44 -6
- package/src/llama.cpp/src/llama.cpp +1 -1
- package/src/llama.cpp/tests/CMakeLists.txt +43 -30
- package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
- package/src/llama.cpp/tests/test-backend-ops.cpp +82 -43
- package/src/llama.cpp/tests/test-chat-template.cpp +34 -13
- package/src/llama.cpp/tests/test-chat.cpp +12 -2
- package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
- package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
- package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
- package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
- package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
- package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
- package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
package/src/llama.cpp/common/arg.cpp

@@ -1,12 +1,24 @@
+#include "gguf.h" // for reading GGUF splits
 #include "arg.h"
 
+#include "common.h"
 #include "log.h"
 #include "sampling.h"
 #include "chat.h"
 
+// fix problem with std::min and std::max
+#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+# define NOMINMAX
+#endif
+#include <windows.h>
+#endif
+
 #include <algorithm>
 #include <climits>
 #include <cstdarg>
+#include <filesystem>
 #include <fstream>
 #include <regex>
 #include <set>
@@ -14,10 +26,42 @@
 #include <thread>
 #include <vector>
 
+//#define LLAMA_USE_CURL
+
+#if defined(LLAMA_USE_CURL)
+#include <curl/curl.h>
+#include <curl/easy.h>
+#include <future>
+#endif
+
 #include "json-schema-to-grammar.h"
 
 using json = nlohmann::ordered_json;
 
+std::initializer_list<enum llama_example> mmproj_examples = {
+    LLAMA_EXAMPLE_LLAVA,
+    // TODO: add LLAMA_EXAMPLE_SERVER when it's ready
+};
+
+static std::string read_file(const std::string & fname) {
+    std::ifstream file(fname);
+    if (!file) {
+        throw std::runtime_error(string_format("error: failed to open file '%s'\n", fname.c_str()));
+    }
+    std::string content((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
+    file.close();
+    return content;
+}
+
+static void write_file(const std::string & fname, const std::string & content) {
+    std::ofstream file(fname);
+    if (!file) {
+        throw std::runtime_error(string_format("error: failed to open file '%s'\n", fname.c_str()));
+    }
+    file << content;
+    file.close();
+}
+
 common_arg & common_arg::set_examples(std::initializer_list<enum llama_example> examples) {
     this->examples = std::move(examples);
     return *this;
@@ -126,47 +170,637 @@ std::string common_arg::to_string() {
 }
 
 //
-//
+// downloader
+//
+
+struct common_hf_file_res {
+    std::string repo; // repo name with ":tag" removed
+    std::string ggufFile;
+    std::string mmprojFile;
+};
+
+#ifdef LLAMA_USE_CURL
+
+bool common_has_curl() {
+    return true;
+}
+
+#ifdef __linux__
+#include <linux/limits.h>
+#elif defined(_WIN32)
+# if !defined(PATH_MAX)
+# define PATH_MAX MAX_PATH
+# endif
+#elif defined(_AIX)
+#include <sys/limits.h>
+#else
+#include <sys/syslimits.h>
+#endif
+#define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
+
+//
+// CURL utils
 //
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+using curl_ptr = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;
+
+// cannot use unique_ptr for curl_slist, because we cannot update without destroying the old one
+struct curl_slist_ptr {
+    struct curl_slist * ptr = nullptr;
+    ~curl_slist_ptr() {
+        if (ptr) {
+            curl_slist_free_all(ptr);
+        }
+    }
+};
+
+#define CURL_MAX_RETRY 3
+#define CURL_RETRY_DELAY_SECONDS 2
+
+static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds) {
+    int remaining_attempts = max_attempts;
+    char * method = nullptr;
+    curl_easy_getinfo(curl, CURLINFO_EFFECTIVE_METHOD, &method);
+
+    while (remaining_attempts > 0) {
+        LOG_INF("%s: %s %s (attempt %d of %d)...\n", __func__ , method, url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);
+
+        CURLcode res = curl_easy_perform(curl);
+        if (res == CURLE_OK) {
+            return true;
+        }
+
+        int exponential_backoff_delay = std::pow(retry_delay_seconds, max_attempts - remaining_attempts) * 1000;
+        LOG_WRN("%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay);
+
+        remaining_attempts--;
+        if (remaining_attempts == 0) break;
+        std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
+    }
+
+    LOG_ERR("%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts);
+
+    return false;
+}
+
+// download one single file from remote URL to local path
+static bool common_download_file_single(const std::string & url, const std::string & path, const std::string & bearer_token) {
+    // Initialize libcurl
+    curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
+    curl_slist_ptr http_headers;
+    if (!curl) {
+        LOG_ERR("%s: error initializing libcurl\n", __func__);
+        return false;
+    }
+
+    // Set the URL, allow to follow http redirection
+    curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
+    curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
+
+    http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
+    // Check if hf-token or bearer-token was specified
+    if (!bearer_token.empty()) {
+        std::string auth_header = "Authorization: Bearer " + bearer_token;
+        http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
+    }
+    curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
+
+#if defined(_WIN32)
+    // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
+    // operating system. Currently implemented under MS-Windows.
+    curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
+#endif
+
+    // Check if the file already exists locally
+    auto file_exists = std::filesystem::exists(path);
+
+    // If the file exists, check its JSON metadata companion file.
+    std::string metadata_path = path + ".json";
+    nlohmann::json metadata; // TODO @ngxson : get rid of this json, use regex instead
+    std::string etag;
+    std::string last_modified;
+
+    if (file_exists) {
+        // Try and read the JSON metadata file (note: stream autoclosed upon exiting this block).
+        std::ifstream metadata_in(metadata_path);
+        if (metadata_in.good()) {
+            try {
+                metadata_in >> metadata;
+                LOG_DBG("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
+                if (metadata.contains("etag") && metadata.at("etag").is_string()) {
+                    etag = metadata.at("etag");
                 }
-
-
-
-
+                if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) {
+                    last_modified = metadata.at("lastModified");
+                }
+            } catch (const nlohmann::json::exception & e) {
+                LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
+            }
+        }
+        // if we cannot open the metadata file, we assume that the downloaded file is not valid (etag and last-modified are left empty, so we will download it again)
+    } else {
+        LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
+    }
+
+    // Send a HEAD request to retrieve the etag and last-modified headers
+    struct common_load_model_from_url_headers {
+        std::string etag;
+        std::string last_modified;
+    };
+
+    common_load_model_from_url_headers headers;
+    bool head_request_ok = false;
+    bool should_download = !file_exists; // by default, we should download if the file does not exist
+
+    // get ETag to see if the remote file has changed
+    {
+        typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
+        auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
+            common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
+
+            static std::regex header_regex("([^:]+): (.*)\r\n");
+            static std::regex etag_regex("ETag", std::regex_constants::icase);
+            static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase);
+
+            std::string header(buffer, n_items);
+            std::smatch match;
+            if (std::regex_match(header, match, header_regex)) {
+                const std::string & key = match[1];
+                const std::string & value = match[2];
+                if (std::regex_match(key, match, etag_regex)) {
+                    headers->etag = value;
+                } else if (std::regex_match(key, match, last_modified_regex)) {
+                    headers->last_modified = value;
+                }
+            }
+            return n_items;
+        };
+
+        curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
+        curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); // hide head request progress
+        curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
+        curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
+
+        // we only allow retrying once for HEAD requests
+        // this is for the use case of using running offline (no internet), retrying can be annoying
+        bool was_perform_successful = curl_perform_with_retry(url, curl.get(), 1, 0);
+        if (!was_perform_successful) {
+            head_request_ok = false;
+        }
+
+        long http_code = 0;
+        curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
+        if (http_code == 200) {
+            head_request_ok = true;
+        } else {
+            LOG_WRN("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
+            head_request_ok = false;
+        }
+    }
+
+    // if head_request_ok is false, we don't have the etag or last-modified headers
+    // we leave should_download as-is, which is true if the file does not exist
+    if (head_request_ok) {
+        // check if ETag or Last-Modified headers are different
+        // if it is, we need to download the file again
+        if (!etag.empty() && etag != headers.etag) {
+            LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str());
+            should_download = true;
+        } else if (!last_modified.empty() && last_modified != headers.last_modified) {
+            LOG_WRN("%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str());
+            should_download = true;
+        }
+    }
+
+    if (should_download) {
+        std::string path_temporary = path + ".downloadInProgress";
+        if (file_exists) {
+            LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
+            if (remove(path.c_str()) != 0) {
+                LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
+                return false;
+            }
+        }
+
+        // Set the output file
+
+        struct FILE_deleter {
+            void operator()(FILE * f) const {
+                fclose(f);
+            }
+        };
+
+        std::unique_ptr<FILE, FILE_deleter> outfile(fopen(path_temporary.c_str(), "wb"));
+        if (!outfile) {
+            LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path.c_str());
+            return false;
+        }
+
+        typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * data, size_t size, size_t nmemb, void * fd);
+        auto write_callback = [](void * data, size_t size, size_t nmemb, void * fd) -> size_t {
+            return fwrite(data, size, nmemb, (FILE *)fd);
+        };
+        curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 0L);
+        curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
+        curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, outfile.get());
+
+        // display download progress
+        curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 0L);
+
+        // helper function to hide password in URL
+        auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string {
+            std::size_t protocol_pos = url.find("://");
+            if (protocol_pos == std::string::npos) {
+                return url;  // Malformed URL
+            }
+
+            std::size_t at_pos = url.find('@', protocol_pos + 3);
+            if (at_pos == std::string::npos) {
+                return url;  // No password in URL
+            }
+
+            return url.substr(0, protocol_pos + 3) + "********" + url.substr(at_pos);
+        };
+
+        // start the download
+        LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
+            llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
+        bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
+        if (!was_perform_successful) {
+            return false;
+        }
+
+        long http_code = 0;
+        curl_easy_getinfo (curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
+        if (http_code < 200 || http_code >= 400) {
+            LOG_ERR("%s: invalid http status code received: %ld\n", __func__, http_code);
+            return false;
+        }
+
+        // Causes file to be closed explicitly here before we rename it.
+        outfile.reset();
+
+        // Write the updated JSON metadata file.
+        metadata.update({
+            {"url", url},
+            {"etag", headers.etag},
+            {"lastModified", headers.last_modified}
+        });
+        write_file(metadata_path, metadata.dump(4));
+        LOG_DBG("%s: file metadata saved: %s\n", __func__, metadata_path.c_str());
+
+        if (rename(path_temporary.c_str(), path.c_str()) != 0) {
+            LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
+            return false;
+        }
+    } else {
+        LOG_INF("%s: using cached file: %s\n", __func__, path.c_str());
+    }
+
+    return true;
+}
+
+// download multiple files from remote URLs to local paths
+// the input is a vector of pairs <url, path>
+static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls, const std::string & bearer_token) {
+    // Prepare download in parallel
+    std::vector<std::future<bool>> futures_download;
+    for (auto const & item : urls) {
+        futures_download.push_back(std::async(std::launch::async, [bearer_token](const std::pair<std::string, std::string> & it) -> bool {
+            return common_download_file_single(it.first, it.second, bearer_token);
+        }, item));
+    }
+
+    // Wait for all downloads to complete
+    for (auto & f : futures_download) {
+        if (!f.get()) {
+            return false;
+        }
+    }
+
+    return true;
+}
+
+static bool common_download_model(
+        const common_params_model & model,
+        const std::string & bearer_token) {
+    // Basic validation of the model.url
+    if (model.url.empty()) {
+        LOG_ERR("%s: invalid model url\n", __func__);
+        return false;
+    }
+
+    if (!common_download_file_single(model.url, model.path, bearer_token)) {
+        return false;
+    }
+
+    // check for additional GGUFs split to download
+    int n_split = 0;
+    {
+        struct gguf_init_params gguf_params = {
+            /*.no_alloc = */ true,
+            /*.ctx      = */ NULL,
+        };
+        auto * ctx_gguf = gguf_init_from_file(model.path.c_str(), gguf_params);
+        if (!ctx_gguf) {
+            LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, model.path.c_str());
+            return false;
+        }
+
+        auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT);
+        if (key_n_split >= 0) {
+            n_split = gguf_get_val_u16(ctx_gguf, key_n_split);
+        }
+
+        gguf_free(ctx_gguf);
+    }
+
+    if (n_split > 1) {
+        char split_prefix[PATH_MAX] = {0};
+        char split_url_prefix[LLAMA_CURL_MAX_URL_LENGTH] = {0};
+
+        // Verify the first split file format
+        // and extract split URL and PATH prefixes
+        {
+            if (!llama_split_prefix(split_prefix, sizeof(split_prefix), model.path.c_str(), 0, n_split)) {
+                LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, model.path.c_str(), n_split);
+                return false;
+            }
+
+            if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model.url.c_str(), 0, n_split)) {
+                LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model.url.c_str(), n_split);
+                return false;
+            }
+        }
+
+        std::vector<std::pair<std::string, std::string>> urls;
+        for (int idx = 1; idx < n_split; idx++) {
+            char split_path[PATH_MAX] = {0};
+            llama_split_path(split_path, sizeof(split_path), split_prefix, idx, n_split);
+
+            char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
+            llama_split_path(split_url, sizeof(split_url), split_url_prefix, idx, n_split);
+
+            if (std::string(split_path) == model.path) {
+                continue; // skip the already downloaded file
+            }
+
+            urls.push_back({split_url, split_path});
+        }
+
+        // Download in parallel
+        common_download_file_multiple(urls, bearer_token);
+    }
+
+    return true;
+}
+
+std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params) {
+    curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
+    curl_slist_ptr http_headers;
+    std::vector<char> res_buffer;
+
+    curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
+    curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
+    curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
+    typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
+    auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t {
+        auto data_vec = static_cast<std::vector<char> *>(data);
+        data_vec->insert(data_vec->end(), (char *)ptr, (char *)ptr + size * nmemb);
+        return size * nmemb;
+    };
+    curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
+    curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_buffer);
+#if defined(_WIN32)
+    curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
+#endif
+    if (params.timeout > 0) {
+        curl_easy_setopt(curl.get(), CURLOPT_TIMEOUT, params.timeout);
+    }
+    if (params.max_size > 0) {
+        curl_easy_setopt(curl.get(), CURLOPT_MAXFILESIZE, params.max_size);
+    }
+    http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
+    for (const auto & header : params.headers) {
+        http_headers.ptr = curl_slist_append(http_headers.ptr, header.c_str());
+    }
+    curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
+
+    CURLcode res = curl_easy_perform(curl.get());
+
+    if (res != CURLE_OK) {
+        std::string error_msg = curl_easy_strerror(res);
+        throw std::runtime_error("error: cannot make GET request: " + error_msg);
+    }
+
+    long res_code;
+    curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code);
+
+    return { res_code, std::move(res_buffer) };
+}
+
+/**
+ * Allow getting the HF file from the HF repo with tag (like ollama), for example:
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s
+ * Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo)
+ *
+ * Return pair of <repo, file> (with "repo" already having tag removed)
+ *
+ * Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
+ */
+static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & bearer_token) {
+    auto parts = string_split<std::string>(hf_repo_with_tag, ':');
+    std::string tag = parts.size() > 1 ? parts.back() : "latest";
+    std::string hf_repo = parts[0];
+    if (string_split<std::string>(hf_repo, '/').size() != 2) {
+        throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n");
+    }
+
+    std::string url = get_model_endpoint() + "v2/" + hf_repo + "/manifests/" + tag;
+
+    // headers
+    std::vector<std::string> headers;
+    headers.push_back("Accept: application/json");
+    if (!bearer_token.empty()) {
+        headers.push_back("Authorization: Bearer " + bearer_token);
+    }
+    // Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
+    // User-Agent header is already set in common_remote_get_content, no need to set it here
+
+    // we use "=" to avoid clashing with other component, while still being allowed on windows
+    std::string cached_response_fname = "manifest=" + hf_repo + "=" + tag + ".json";
+    string_replace_all(cached_response_fname, "/", "_");
+    std::string cached_response_path = fs_get_cache_file(cached_response_fname);
+
+    // make the request
+    common_remote_params params;
+    params.headers = headers;
+    long res_code = 0;
+    std::string res_str;
+    bool use_cache = false;
+    try {
+        auto res = common_remote_get_content(url, params);
+        res_code = res.first;
+        res_str = std::string(res.second.data(), res.second.size());
+    } catch (const std::exception & e) {
+        LOG_WRN("error: failed to get manifest: %s\n", e.what());
+        LOG_WRN("try reading from cache\n");
+        // try to read from cache
+        try {
+            res_str = read_file(cached_response_path);
+            res_code = 200;
+            use_cache = true;
+        } catch (const std::exception & e) {
+            throw std::runtime_error("error: failed to get manifest (check your internet connection)");
+        }
+    }
+    std::string ggufFile;
+    std::string mmprojFile;
+
+    if (res_code == 200 || res_code == 304) {
+        // extract ggufFile.rfilename in json, using regex
+        {
+            std::regex pattern("\"ggufFile\"[\\s\\S]*?\"rfilename\"\\s*:\\s*\"([^\"]+)\"");
+            std::smatch match;
+            if (std::regex_search(res_str, match, pattern)) {
+                ggufFile = match[1].str();
+            }
+        }
+        // extract mmprojFile.rfilename in json, using regex
+        {
+            std::regex pattern("\"mmprojFile\"[\\s\\S]*?\"rfilename\"\\s*:\\s*\"([^\"]+)\"");
+            std::smatch match;
+            if (std::regex_search(res_str, match, pattern)) {
+                mmprojFile = match[1].str();
             }
         }
-
-
-
-
-
-
-
+        if (!use_cache) {
+            // if not using cached response, update the cache file
+            write_file(cached_response_path, res_str);
+        }
+    } else if (res_code == 401) {
+        throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token");
+    } else {
+        throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str()));
+    }
+
+    // check response
+    if (ggufFile.empty()) {
+        throw std::runtime_error("error: model does not have ggufFile");
+    }
+
+    return { hf_repo, ggufFile, mmprojFile };
+}
+
+#else
+
+bool common_has_curl() {
+    return false;
+}
+
+static bool common_download_file_single(const std::string &, const std::string &, const std::string &) {
+    LOG_ERR("error: built without CURL, cannot download model from internet\n");
+    return false;
+}
+
+static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> &, const std::string &) {
+    LOG_ERR("error: built without CURL, cannot download model from the internet\n");
+    return false;
+}
+
+static bool common_download_model(
+        const common_params_model &,
+        const std::string &) {
+    LOG_ERR("error: built without CURL, cannot download model from the internet\n");
+    return false;
+}
+
+static struct common_hf_file_res common_get_hf_file(const std::string &, const std::string &) {
+    LOG_ERR("error: built without CURL, cannot download model from the internet\n");
+    return {};
+}
+
+std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params &) {
+    if (!url.empty()) {
+        throw std::runtime_error("error: built without CURL, cannot download model from the internet");
+    }
+
+    return {};
+}
+
+#endif // LLAMA_USE_CURL
+
+//
+// utils
+//
+
+struct handle_model_result {
+    bool found_mmproj = false;
+    common_params_model mmproj;
+};
+
+static handle_model_result common_params_handle_model(
+        struct common_params_model & model,
+        const std::string & bearer_token,
+        const std::string & model_path_default) {
+    handle_model_result result;
+    // handle pre-fill default model path and url based on hf_repo and hf_file
+    {
+        if (!model.hf_repo.empty()) {
+            // short-hand to avoid specifying --hf-file -> default it to --model
+            if (model.hf_file.empty()) {
+                if (model.path.empty()) {
+                    auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token);
+                    if (auto_detected.repo.empty() || auto_detected.ggufFile.empty()) {
+                        exit(1); // built without CURL, error message already printed
+                    }
+                    model.hf_repo = auto_detected.repo;
+                    model.hf_file = auto_detected.ggufFile;
+                    if (!auto_detected.mmprojFile.empty()) {
+                        result.found_mmproj = true;
+                        result.mmproj.hf_repo = model.hf_repo;
+                        result.mmproj.hf_file = auto_detected.mmprojFile;
+                    }
+                } else {
+                    model.hf_file = model.path;
+                }
+            }
+
+            std::string model_endpoint = get_model_endpoint();
+            model.url = model_endpoint + model.hf_repo + "/resolve/main/" + model.hf_file;
+            // make sure model path is present (for caching purposes)
+            if (model.path.empty()) {
+                // this is to avoid different repo having same file name, or same file name in different subdirs
+                std::string filename = model.hf_repo + "_" + model.hf_file;
+                // to make sure we don't have any slashes in the filename
+                string_replace_all(filename, "/", "_");
+                model.path = fs_get_cache_file(filename);
+            }
+
+        } else if (!model.url.empty()) {
+            if (model.path.empty()) {
+                auto f = string_split<std::string>(model.url, '#').front();
+                f = string_split<std::string>(f, '?').front();
+                model.path = fs_get_cache_file(string_split<std::string>(f, '/').back());
+            }
+
+        } else if (model.path.empty()) {
+            model.path = model_path_default;
         }
-        }
-
-
-
-
+    }
+
+    // then, download it if needed
+    if (!model.url.empty()) {
+        bool ok = common_download_model(model, bearer_token);
+        if (!ok) {
+            LOG_ERR("error: failed to download model from %s\n", model.url.c_str());
+            exit(1);
         }
-    } else if (model.empty()) {
-        model = model_default;
     }
+
+    return result;
 }
 
 const std::vector<ggml_type> kv_cache_types = {
@@ -300,10 +934,25 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
     }
 
-    //
-
-
-
+    // handle model and download
+    {
+        auto res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH);
+        if (params.no_mmproj) {
+            params.mmproj = {};
+        } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
+            // optionally, handle mmproj model when -hf is specified
+            params.mmproj = res.mmproj;
+        }
+        // only download mmproj if the current example is using it
+        for (auto & ex : mmproj_examples) {
+            if (ctx_arg.ex == ex) {
+                common_params_handle_model(params.mmproj, params.hf_token, "");
+                break;
+            }
+        }
+        common_params_handle_model(params.speculative.model, params.hf_token, "");
+        common_params_handle_model(params.vocoder.model, params.hf_token, "");
+    }
 
     if (params.escape) {
         string_process_escapes(params.prompt);
@@ -322,6 +971,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         params.kv_overrides.back().key[0] = 0;
     }
 
+    if (!params.tensor_buft_overrides.empty()) {
+        params.tensor_buft_overrides.push_back({nullptr, nullptr});
+    }
+
     if (params.reranking && params.embedding) {
         throw std::invalid_argument("error: either --embedding or --reranking can be specified, but not both");
     }
@@ -431,7 +1084,6 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
         "llama-embedding",
         "llama-eval-callback",
         "llama-export-lora",
-        "llama-gbnf-validator",
         "llama-gen-docs",
         "llama-gguf",
         "llama-gguf-hash",
@@ -439,20 +1091,18 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
         "llama-gritlm",
         "llama-imatrix",
         "llama-infill",
-        "llama-
+        "llama-mtmd-cli",
         "llama-llava-clip-quantize-cli",
         "llama-lookahead",
         "llama-lookup",
         "llama-lookup-create",
         "llama-lookup-merge",
         "llama-lookup-stats",
-        "llama-minicpmv-cli",
         "llama-parallel",
         "llama-passkey",
         "llama-perplexity",
         "llama-q8dot",
         "llama-quantize",
-        "llama-quantize-stats",
         "llama-qwen2vl-cli",
         "llama-retrieval",
         "llama-run",
@@ -541,6 +1191,9 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e
         fprintf(stderr, "%s\n", ex.what());
         ctx_arg.params = params_org;
         return false;
+    } catch (std::exception & ex) {
+        fprintf(stderr, "%s\n", ex.what());
+        exit(1); // for other exceptions, we exit with status code 1
     }
 
     return true;
@@ -841,13 +1494,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"-f", "--file"}, "FNAME",
         "a file containing the prompt (default: none)",
         [](common_params & params, const std::string & value) {
-
-            if (!file) {
-                throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
-            }
+            params.prompt = read_file(value);
             // store the external file name in params
             params.prompt_file = value;
-            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
             if (!params.prompt.empty() && params.prompt.back() == '\n') {
                 params.prompt.pop_back();
             }
@@ -857,11 +1506,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"-sysf", "--system-prompt-file"}, "FNAME",
         "a file containing the system prompt (default: none)",
         [](common_params & params, const std::string & value) {
-
-            if (!file) {
-                throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
-            }
-            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.system_prompt));
+            params.system_prompt = read_file(value);
             if (!params.system_prompt.empty() && params.system_prompt.back() == '\n') {
                 params.system_prompt.pop_back();
             }
@@ -1285,23 +1930,32 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     add_opt(common_arg(
         {"--grammar-file"}, "FNAME",
         "file to read grammar from",
+        [](common_params & params, const std::string & value) {
+            params.sampling.grammar = read_file(value);
+        }
+    ).set_sparam());
+    add_opt(common_arg(
+        {"-j", "--json-schema"}, "SCHEMA",
+        "JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead",
+        [](common_params & params, const std::string & value) {
+            params.sampling.grammar = json_schema_to_grammar(json::parse(value));
+        }
+    ).set_sparam());
+    add_opt(common_arg(
+        {"-jf", "--json-schema-file"}, "FILE",
+        "File containing a JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead",
         [](common_params & params, const std::string & value) {
             std::ifstream file(value);
             if (!file) {
                 throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
             }
+            std::string schema;
             std::copy(
                 std::istreambuf_iterator<char>(file),
                 std::istreambuf_iterator<char>(),
-                std::back_inserter(
+                std::back_inserter(schema)
             );
-
-    ).set_sparam());
-    add_opt(common_arg(
-        {"-j", "--json-schema"}, "SCHEMA",
-        "JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead",
-        [](common_params & params, const std::string & value) {
-            params.sampling.grammar = json_schema_to_grammar(json::parse(value));
+            params.sampling.grammar = json_schema_to_grammar(json::parse(schema));
         }
     ).set_sparam());
     add_opt(common_arg(
@@ -1559,11 +2213,32 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING"));
     add_opt(common_arg(
         {"--mmproj"}, "FILE",
-        "path to a multimodal projector file
+        "path to a multimodal projector file. see examples/llava/README.md",
         [](common_params & params, const std::string & value) {
-            params.mmproj = value;
+            params.mmproj.path = value;
         }
-    ).set_examples(
+    ).set_examples(mmproj_examples));
+    add_opt(common_arg(
+        {"--mmproj-url"}, "URL",
+        "URL to a multimodal projector file. see examples/llava/README.md",
+        [](common_params & params, const std::string & value) {
+            params.mmproj.url = value;
+        }
+    ).set_examples(mmproj_examples));
+    add_opt(common_arg(
+        {"--no-mmproj"},
+        "explicitly disable multimodal projector, useful when using -hf",
+        [](common_params & params) {
+            params.no_mmproj = true;
+        }
+    ).set_examples(mmproj_examples));
+    add_opt(common_arg(
+        {"--no-mmproj-offload"},
+        "do not offload multimodal projector to GPU",
+        [](common_params & params) {
+            params.mmproj_use_gpu = false;
+        }
+    ).set_examples(mmproj_examples));
     add_opt(common_arg(
         {"--image"}, "FILE",
         "path to an image file. use with multimodal models. Specify multiple times for batching",
@@ -1647,6 +2322,41 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             exit(0);
         }
     ));
+    add_opt(common_arg(
+        {"--override-tensor", "-ot"}, "<tensor name pattern>=<buffer type>,...",
+        "override tensor buffer type", [](common_params & params, const std::string & value) {
+            /* static */ std::map<std::string, ggml_backend_buffer_type_t> buft_list;
+            if (buft_list.empty()) {
+                // enumerate all the devices and add their buffer types to the list
+                for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+                    auto * dev = ggml_backend_dev_get(i);
+                    auto * buft = ggml_backend_dev_buffer_type(dev);
+                    if (buft) {
+                        buft_list[ggml_backend_buft_name(buft)] = buft;
+                    }
+                }
+            }
+
+            for (const auto & override : string_split<std::string>(value, ',')) {
+                std::string::size_type pos = override.find('=');
+                if (pos == std::string::npos) {
+                    throw std::invalid_argument("invalid value");
+                }
+                std::string tensor_name = override.substr(0, pos);
+                std::string buffer_type = override.substr(pos + 1);
+
+                if (buft_list.find(buffer_type) == buft_list.end()) {
+                    printf("Available buffer types:\n");
+                    for (const auto & it : buft_list) {
+                        printf("  %s\n", ggml_backend_buft_name(it.second));
+                    }
+                    throw std::invalid_argument("unknown buffer type");
+                }
+                // FIXME: this leaks memory
+                params.tensor_buft_overrides.push_back({strdup(tensor_name.c_str()), buft_list.at(buffer_type)});
+            }
+        }
+    ));
     add_opt(common_arg(
         {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
         "number of layers to store in VRAM",
@@ -1790,51 +2500,52 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             "or `--model-url` if set, otherwise %s)", DEFAULT_MODEL_PATH
         ),
         [](common_params & params, const std::string & value) {
-            params.model = value;
+            params.model.path = value;
         }
     ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL"));
     add_opt(common_arg(
         {"-mu", "--model-url"}, "MODEL_URL",
         "model download url (default: unused)",
         [](common_params & params, const std::string & value) {
-            params.model_url = value;
+            params.model.url = value;
         }
     ).set_env("LLAMA_ARG_MODEL_URL"));
     add_opt(common_arg(
         {"-hf", "-hfr", "--hf-repo"}, "<user>/<model>[:quant]",
         "Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n"
+        "mmproj is also downloaded automatically if available. to disable, add --no-mmproj\n"
         "example: unsloth/phi-4-GGUF:q4_k_m\n"
         "(default: unused)",
         [](common_params & params, const std::string & value) {
-            params.hf_repo = value;
+            params.model.hf_repo = value;
         }
     ).set_env("LLAMA_ARG_HF_REPO"));
     add_opt(common_arg(
         {"-hfd", "-hfrd", "--hf-repo-draft"}, "<user>/<model>[:quant]",
         "Same as --hf-repo, but for the draft model (default: unused)",
         [](common_params & params, const std::string & value) {
-            params.speculative.hf_repo = value;
+            params.speculative.model.hf_repo = value;
         }
     ).set_env("LLAMA_ARG_HFD_REPO"));
     add_opt(common_arg(
         {"-hff", "--hf-file"}, "FILE",
         "Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)",
         [](common_params & params, const std::string & value) {
-            params.hf_file = value;
+            params.model.hf_file = value;
         }
     ).set_env("LLAMA_ARG_HF_FILE"));
     add_opt(common_arg(
         {"-hfv", "-hfrv", "--hf-repo-v"}, "<user>/<model>[:quant]",
         "Hugging Face model repository for the vocoder model (default: unused)",
         [](common_params & params, const std::string & value) {
-            params.vocoder.hf_repo = value;
+            params.vocoder.model.hf_repo = value;
         }
     ).set_env("LLAMA_ARG_HF_REPO_V"));
     add_opt(common_arg(
         {"-hffv", "--hf-file-v"}, "FILE",
         "Hugging Face model file for the vocoder model (default: unused)",
         [](common_params & params, const std::string & value) {
-            params.vocoder.hf_file = value;
+            params.vocoder.model.hf_file = value;
         }
     ).set_env("LLAMA_ARG_HF_FILE_V"));
     add_opt(common_arg(
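Taken together, the hunks above move the flat model fields (path, URL, Hugging Face repo and file) into a nested model struct that is reused for the main, draft (speculative), and vocoder models. A rough sketch of the grouping these assignments imply follows; the struct names and shape are assumptions inferred from the diff, and the authoritative definition lives in common/common.h, which also changed in this release:

#include <string>

// Assumed shape, inferred from the assignments above; not the authoritative definition.
struct common_params_model {
    std::string path;     // local file (-m)
    std::string url;      // direct download URL (-mu)
    std::string hf_repo;  // Hugging Face repository (-hf)
    std::string hf_file;  // file inside the repository (-hff)
};

struct common_params_sketch {
    common_params_model model;                           // main model
    struct { common_params_model model; } speculative;   // draft model
    struct { common_params_model model; } vocoder;       // vocoder model
};

int main() {
    common_params_sketch params;
    params.model.hf_repo             = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
    params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
    return 0;
}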
@@ -1979,7 +2690,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
     add_opt(common_arg(
         {"--host"}, "HOST",
-        string_format("ip address to listen (default: %s)", params.hostname.c_str()),
+        string_format("ip address to listen, or bind to an UNIX socket if the address ends with .sock (default: %s)", params.hostname.c_str()),
         [](common_params & params, const std::string & value) {
             params.hostname = value;
         }
@@ -2147,7 +2858,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.chat_template = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_LLAVA}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
     add_opt(common_arg(
         {"--chat-template-file"}, "JINJA_TEMPLATE_FILE",
         string_format(
@@ -2157,14 +2868,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             "list of built-in templates:\n%s", list_builtin_chat_templates().c_str()
         ),
         [](common_params & params, const std::string & value) {
-            std::ifstream file(value);
-            if (!file) {
-                throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
-            }
-            std::copy(
-                std::istreambuf_iterator<char>(file),
-                std::istreambuf_iterator<char>(),
-                std::back_inserter(params.chat_template));
+            params.chat_template = read_file(value);
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
     add_opt(common_arg(
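The chat-template-file handler now delegates to a read_file helper instead of copying the stream inline with istreambuf iterators. A hedged sketch of what such a helper typically does is shown below, under the assumption that it slurps the whole file into a string and throws if the file cannot be opened; the actual helper is defined elsewhere in common/ and its exact signature is not shown in this diff:

#include <fstream>
#include <sstream>
#include <stdexcept>
#include <string>

// Hypothetical stand-in for the read_file helper referenced above.
static std::string read_file_sketch(const std::string & path) {
    std::ifstream file(path);
    if (!file) {
        throw std::runtime_error("failed to open file '" + path + "'");
    }
    std::ostringstream ss;
    ss << file.rdbuf();   // read the entire file into the string
    return ss.str();
}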
@@ -2454,7 +3158,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"-md", "--model-draft"}, "FNAME",
         "draft model for speculative decoding (default: unused)",
         [](common_params & params, const std::string & value) {
-            params.speculative.model = value;
+            params.speculative.model.path = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));

@@ -2462,7 +3166,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"-mv", "--model-vocoder"}, "FNAME",
         "vocoder model for audio generation (default: unused)",
         [](common_params & params, const std::string & value) {
-            params.vocoder.model = value;
+            params.vocoder.model.path = value;
         }
     ).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
@@ -2485,10 +3189,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--tts-oute-default"},
         string_format("use default OuteTTS models (note: can download weights from the internet)"),
         [](common_params & params) {
-            params.hf_repo = "OuteAI/OuteTTS-0.2-500M-GGUF";
-            params.hf_file = "OuteTTS-0.2-500M-Q8_0.gguf";
-            params.vocoder.hf_repo = "ggml-org/WavTokenizer";
-            params.vocoder.hf_file = "WavTokenizer-Large-75-F16.gguf";
+            params.model.hf_repo = "OuteAI/OuteTTS-0.2-500M-GGUF";
+            params.model.hf_file = "OuteTTS-0.2-500M-Q8_0.gguf";
+            params.vocoder.model.hf_repo = "ggml-org/WavTokenizer";
+            params.vocoder.model.hf_file = "WavTokenizer-Large-75-F16.gguf";
         }
     ).set_examples({LLAMA_EXAMPLE_TTS}));

@@ -2496,8 +3200,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--embd-bge-small-en-default"},
         string_format("use default bge-small-en-v1.5 model (note: can download weights from the internet)"),
         [](common_params & params) {
-            params.hf_repo = "ggml-org/bge-small-en-v1.5-Q8_0-GGUF";
-            params.hf_file = "bge-small-en-v1.5-q8_0.gguf";
+            params.model.hf_repo = "ggml-org/bge-small-en-v1.5-Q8_0-GGUF";
+            params.model.hf_file = "bge-small-en-v1.5-q8_0.gguf";
             params.pooling_type = LLAMA_POOLING_TYPE_NONE;
             params.embd_normalize = 2;
             params.n_ctx = 512;
@@ -2510,8 +3214,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--embd-e5-small-en-default"},
         string_format("use default e5-small-v2 model (note: can download weights from the internet)"),
         [](common_params & params) {
-            params.hf_repo = "ggml-org/e5-small-v2-Q8_0-GGUF";
-            params.hf_file = "e5-small-v2-q8_0.gguf";
+            params.model.hf_repo = "ggml-org/e5-small-v2-Q8_0-GGUF";
+            params.model.hf_file = "e5-small-v2-q8_0.gguf";
             params.pooling_type = LLAMA_POOLING_TYPE_NONE;
             params.embd_normalize = 2;
             params.n_ctx = 512;
@@ -2524,8 +3228,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--embd-gte-small-default"},
         string_format("use default gte-small model (note: can download weights from the internet)"),
         [](common_params & params) {
-            params.hf_repo = "ggml-org/gte-small-Q8_0-GGUF";
-            params.hf_file = "gte-small-q8_0.gguf";
+            params.model.hf_repo = "ggml-org/gte-small-Q8_0-GGUF";
+            params.model.hf_file = "gte-small-q8_0.gguf";
             params.pooling_type = LLAMA_POOLING_TYPE_NONE;
             params.embd_normalize = 2;
             params.n_ctx = 512;
@@ -2538,8 +3242,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--fim-qwen-1.5b-default"},
         string_format("use default Qwen 2.5 Coder 1.5B (note: can download weights from the internet)"),
         [](common_params & params) {
-            params.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF";
-            params.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf";
+            params.model.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF";
+            params.model.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf";
             params.port = 8012;
             params.n_gpu_layers = 99;
             params.flash_attn = true;
@@ -2554,8 +3258,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--fim-qwen-3b-default"},
         string_format("use default Qwen 2.5 Coder 3B (note: can download weights from the internet)"),
         [](common_params & params) {
-            params.hf_repo = "ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF";
-            params.hf_file = "qwen2.5-coder-3b-q8_0.gguf";
+            params.model.hf_repo = "ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF";
+            params.model.hf_file = "qwen2.5-coder-3b-q8_0.gguf";
             params.port = 8012;
             params.n_gpu_layers = 99;
             params.flash_attn = true;
@@ -2570,8 +3274,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--fim-qwen-7b-default"},
         string_format("use default Qwen 2.5 Coder 7B (note: can download weights from the internet)"),
         [](common_params & params) {
-            params.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
-            params.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
+            params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
+            params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
             params.port = 8012;
             params.n_gpu_layers = 99;
             params.flash_attn = true;
@@ -2586,10 +3290,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--fim-qwen-7b-spec"},
         string_format("use Qwen 2.5 Coder 7B + 0.5B draft for speculative decoding (note: can download weights from the internet)"),
         [](common_params & params) {
-            params.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
-            params.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
-            params.speculative.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
-            params.speculative.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
+            params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
+            params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
+            params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
+            params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
             params.speculative.n_gpu_layers = 99;
             params.port = 8012;
             params.n_gpu_layers = 99;
@@ -2605,10 +3309,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--fim-qwen-14b-spec"},
         string_format("use Qwen 2.5 Coder 14B + 0.5B draft for speculative decoding (note: can download weights from the internet)"),
         [](common_params & params) {
-            params.hf_repo = "ggml-org/Qwen2.5-Coder-14B-Q8_0-GGUF";
-            params.hf_file = "qwen2.5-coder-14b-q8_0.gguf";
-            params.speculative.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
-            params.speculative.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
+            params.model.hf_repo = "ggml-org/Qwen2.5-Coder-14B-Q8_0-GGUF";
+            params.model.hf_file = "qwen2.5-coder-14b-q8_0.gguf";
+            params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
+            params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
             params.speculative.n_gpu_layers = 99;
             params.port = 8012;
             params.n_gpu_layers = 99;