@fugood/llama.node 1.0.0-beta.5 → 1.0.0-beta.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +3 -1
- package/lib/index.js +2 -0
- package/lib/index.ts +3 -1
- package/package.json +14 -14
- package/scripts/llama.cpp.patch +27 -26
- package/src/EmbeddingWorker.cpp +1 -1
- package/src/LlamaCompletionWorker.cpp +28 -7
- package/src/LlamaCompletionWorker.h +4 -0
- package/src/LlamaContext.cpp +14 -17
- package/src/common.hpp +7 -6
- package/src/llama.cpp/CMakeLists.txt +15 -4
- package/src/llama.cpp/common/CMakeLists.txt +15 -24
- package/src/llama.cpp/common/arg.cpp +172 -110
- package/src/llama.cpp/common/chat-parser.cpp +385 -0
- package/src/llama.cpp/common/chat-parser.h +120 -0
- package/src/llama.cpp/common/chat.cpp +726 -596
- package/src/llama.cpp/common/chat.h +74 -8
- package/src/llama.cpp/common/common.cpp +56 -38
- package/src/llama.cpp/common/common.h +9 -3
- package/src/llama.cpp/common/json-partial.cpp +256 -0
- package/src/llama.cpp/common/json-partial.h +38 -0
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +2 -1
- package/src/llama.cpp/common/json-schema-to-grammar.h +4 -4
- package/src/llama.cpp/common/sampling.cpp +7 -8
- package/src/llama.cpp/common/speculative.cpp +6 -4
- package/src/llama.cpp/ggml/CMakeLists.txt +48 -3
- package/src/llama.cpp/ggml/include/ggml.h +22 -3
- package/src/llama.cpp/ggml/src/CMakeLists.txt +81 -22
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +131 -49
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4113 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2162 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2638 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2731 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2068 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +396 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1299 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1480 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +4310 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +59 -3206
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +184 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/common.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +12 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +64 -88
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +8 -8
- package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +56 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +282 -100
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +1157 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +1570 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +98 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +119 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +85 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +204 -49
- package/src/llama.cpp/include/llama.h +145 -40
- package/src/llama.cpp/src/CMakeLists.txt +5 -1
- package/src/llama.cpp/src/llama-arch.cpp +99 -3
- package/src/llama.cpp/src/llama-arch.h +10 -1
- package/src/llama.cpp/src/llama-batch.cpp +728 -272
- package/src/llama.cpp/src/llama-batch.h +112 -54
- package/src/llama.cpp/src/llama-chat.cpp +19 -2
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +525 -339
- package/src/llama.cpp/src/llama-context.h +38 -17
- package/src/llama.cpp/src/llama-cparams.cpp +4 -0
- package/src/llama.cpp/src/llama-cparams.h +2 -0
- package/src/llama.cpp/src/llama-grammar.cpp +12 -2
- package/src/llama.cpp/src/llama-graph.cpp +413 -353
- package/src/llama.cpp/src/llama-graph.h +112 -56
- package/src/llama.cpp/src/llama-hparams.cpp +10 -2
- package/src/llama.cpp/src/llama-hparams.h +13 -2
- package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +279 -0
- package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +128 -0
- package/src/llama.cpp/src/llama-kv-cache-unified.cpp +1815 -0
- package/src/llama.cpp/src/llama-kv-cache-unified.h +303 -0
- package/src/llama.cpp/src/llama-kv-cells.h +415 -0
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +246 -0
- package/src/llama.cpp/src/llama-memory-hybrid.h +138 -0
- package/src/llama.cpp/src/llama-memory-recurrent.cpp +1112 -0
- package/src/llama.cpp/src/llama-memory-recurrent.h +183 -0
- package/src/llama.cpp/src/llama-memory.cpp +41 -0
- package/src/llama.cpp/src/llama-memory.h +86 -5
- package/src/llama.cpp/src/llama-mmap.cpp +1 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +42 -17
- package/src/llama.cpp/src/llama-model-saver.cpp +1 -0
- package/src/llama.cpp/src/llama-model.cpp +1137 -528
- package/src/llama.cpp/src/llama-model.h +4 -0
- package/src/llama.cpp/src/llama-quant.cpp +2 -1
- package/src/llama.cpp/src/llama-sampling.cpp +2 -2
- package/src/llama.cpp/src/llama-vocab.cpp +69 -32
- package/src/llama.cpp/src/llama-vocab.h +1 -0
- package/src/llama.cpp/src/llama.cpp +11 -7
- package/src/llama.cpp/src/unicode.cpp +5 -0
- package/src/tts_utils.h +1 -1
- package/src/llama.cpp/common/json.hpp +0 -24766
- package/src/llama.cpp/common/minja/chat-template.hpp +0 -541
- package/src/llama.cpp/common/minja/minja.hpp +0 -2974
- package/src/llama.cpp/common/stb_image.h +0 -7988
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13326
- package/src/llama.cpp/src/llama-kv-cache.cpp +0 -2827
- package/src/llama.cpp/src/llama-kv-cache.h +0 -515
- /package/src/llama.cpp/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
- /package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
- /package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
package/src/llama.cpp/common/arg.cpp

@@ -1,10 +1,11 @@
-#include "gguf.h" // for reading GGUF splits
 #include "arg.h"
 
+#include "chat.h"
 #include "common.h"
+#include "gguf.h" // for reading GGUF splits
+#include "json-schema-to-grammar.h"
 #include "log.h"
 #include "sampling.h"
-#include "chat.h"
 
 // fix problem with std::min and std::max
 #if defined(_WIN32)
@@ -15,6 +16,9 @@
 #include <windows.h>
 #endif
 
+#define JSON_ASSERT GGML_ASSERT
+#include <nlohmann/json.hpp>
+
 #include <algorithm>
 #include <climits>
 #include <cstdarg>
@@ -34,8 +38,6 @@
 #include <future>
 #endif
 
-#include "json-schema-to-grammar.h"
-
 using json = nlohmann::ordered_json;
 
 std::initializer_list<enum llama_example> mmproj_examples = {
@@ -242,33 +244,7 @@ static bool curl_perform_with_retry(const std::string & url, CURL * curl, int ma
 }
 
 // download one single file from remote URL to local path
-static bool common_download_file_single(const std::string & url, const std::string & path, const std::string & bearer_token) {
-    // Initialize libcurl
-    curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
-    curl_slist_ptr http_headers;
-    if (!curl) {
-        LOG_ERR("%s: error initializing libcurl\n", __func__);
-        return false;
-    }
-
-    // Set the URL, allow to follow http redirection
-    curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
-    curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
-
-    http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
-    // Check if hf-token or bearer-token was specified
-    if (!bearer_token.empty()) {
-        std::string auth_header = "Authorization: Bearer " + bearer_token;
-        http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
-    }
-    curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
-
-#if defined(_WIN32)
-    // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
-    // operating system. Currently implemented under MS-Windows.
-    curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
-#endif
-
+static bool common_download_file_single(const std::string & url, const std::string & path, const std::string & bearer_token, bool offline) {
     // Check if the file already exists locally
     auto file_exists = std::filesystem::exists(path);
 
@@ -279,6 +255,10 @@ static bool common_download_file_single(const std::string & url, const std::stri
     std::string last_modified;
 
     if (file_exists) {
+        if (offline) {
+            LOG_INF("%s: using cached file (offline mode): %s\n", __func__, path.c_str());
+            return true; // skip verification/downloading
+        }
         // Try and read the JSON metadata file (note: stream autoclosed upon exiting this block).
         std::ifstream metadata_in(metadata_path);
         if (metadata_in.good()) {
@@ -297,6 +277,10 @@ static bool common_download_file_single(const std::string & url, const std::stri
             }
         // if we cannot open the metadata file, we assume that the downloaded file is not valid (etag and last-modified are left empty, so we will download it again)
         } else {
+            if (offline) {
+                LOG_ERR("%s: required file is not available in cache (offline mode): %s\n", __func__, path.c_str());
+                return false;
+            }
            LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
         }
 
@@ -310,50 +294,73 @@ static bool common_download_file_single(const std::string & url, const std::stri
     bool head_request_ok = false;
     bool should_download = !file_exists; // by default, we should download if the file does not exist
 
-    //
-
-
-
-
+    // Initialize libcurl
+    curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
+    curl_slist_ptr http_headers;
+    if (!curl) {
+        LOG_ERR("%s: error initializing libcurl\n", __func__);
+        return false;
+    }
 
-
-
-
+    // Set the URL, allow to follow http redirection
+    curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
+    curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
 
-
-
-
-
-
-
-
-            } else if (std::regex_match(key, match, last_modified_regex)) {
-                headers->last_modified = value;
-            }
-        }
-        return n_items;
-    };
+    http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
+    // Check if hf-token or bearer-token was specified
+    if (!bearer_token.empty()) {
+        std::string auth_header = "Authorization: Bearer " + bearer_token;
+        http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
+    }
+    curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
 
-
-
-
-
+#if defined(_WIN32)
+    // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
+    // operating system. Currently implemented under MS-Windows.
+    curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
+#endif
 
-
-
-
-        if (!was_perform_successful) {
-            head_request_ok = false;
-        }
+    typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
+    auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
+        common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
 
-
-
-
-
-
-
-
+        static std::regex header_regex("([^:]+): (.*)\r\n");
+        static std::regex etag_regex("ETag", std::regex_constants::icase);
+        static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase);
+
+        std::string header(buffer, n_items);
+        std::smatch match;
+        if (std::regex_match(header, match, header_regex)) {
+            const std::string & key = match[1];
+            const std::string & value = match[2];
+            if (std::regex_match(key, match, etag_regex)) {
+                headers->etag = value;
+            } else if (std::regex_match(key, match, last_modified_regex)) {
+                headers->last_modified = value;
+            }
         }
+        return n_items;
+    };
+
+    curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
+    curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); // hide head request progress
+    curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
+    curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
+
+    // we only allow retrying once for HEAD requests
+    // this is for the use case of using running offline (no internet), retrying can be annoying
+    bool was_perform_successful = curl_perform_with_retry(url, curl.get(), 1, 0, "HEAD");
+    if (!was_perform_successful) {
+        head_request_ok = false;
+    }
+
+    long http_code = 0;
+    curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
+    if (http_code == 200) {
+        head_request_ok = true;
+    } else {
+        LOG_WRN("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
+        head_request_ok = false;
     }
 
     // if head_request_ok is false, we don't have the etag or last-modified headers
@@ -460,12 +467,12 @@ static bool common_download_file_single(const std::string & url, const std::stri
 
 // download multiple files from remote URLs to local paths
 // the input is a vector of pairs <url, path>
-static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls, const std::string & bearer_token) {
+static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls, const std::string & bearer_token, bool offline) {
     // Prepare download in parallel
     std::vector<std::future<bool>> futures_download;
     for (auto const & item : urls) {
-        futures_download.push_back(std::async(std::launch::async, [bearer_token](const std::pair<std::string, std::string> & it) -> bool {
-            return common_download_file_single(it.first, it.second, bearer_token);
+        futures_download.push_back(std::async(std::launch::async, [bearer_token, offline](const std::pair<std::string, std::string> & it) -> bool {
+            return common_download_file_single(it.first, it.second, bearer_token, offline);
         }, item));
     }
 
@@ -481,14 +488,15 @@ static bool common_download_file_multiple(const std::vector<std::pair<std::strin
 
 static bool common_download_model(
         const common_params_model & model,
-        const std::string & bearer_token) {
+        const std::string & bearer_token,
+        bool offline) {
     // Basic validation of the model.url
     if (model.url.empty()) {
         LOG_ERR("%s: invalid model url\n", __func__);
         return false;
     }
 
-    if (!common_download_file_single(model.url, model.path, bearer_token)) {
+    if (!common_download_file_single(model.url, model.path, bearer_token, offline)) {
         return false;
     }
 
@@ -547,7 +555,7 @@ static bool common_download_model(
         }
 
         // Download in parallel
-        common_download_file_multiple(urls, bearer_token);
+        common_download_file_multiple(urls, bearer_token, offline);
     }
 
     return true;
@@ -608,7 +616,7 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string &
  *
 * Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
  */
-static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & bearer_token) {
+static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & bearer_token, bool offline) {
     auto parts = string_split<std::string>(hf_repo_with_tag, ':');
     std::string tag = parts.size() > 1 ? parts.back() : "latest";
     std::string hf_repo = parts[0];
@@ -638,20 +646,25 @@ static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_
     long res_code = 0;
     std::string res_str;
     bool use_cache = false;
-
-        auto res = common_remote_get_content(url, params);
-        res_code = res.first;
-        res_str = std::string(res.second.data(), res.second.size());
-    } catch (const std::exception & e) {
-        LOG_WRN("error: failed to get manifest: %s\n", e.what());
-        LOG_WRN("try reading from cache\n");
-        // try to read from cache
+    if (!offline) {
         try {
+            auto res = common_remote_get_content(url, params);
+            res_code = res.first;
+            res_str = std::string(res.second.data(), res.second.size());
+        } catch (const std::exception & e) {
+            LOG_WRN("error: failed to get manifest at %s: %s\n", url.c_str(), e.what());
+        }
+    }
+    if (res_code == 0) {
+        if (std::filesystem::exists(cached_response_path)) {
+            LOG_WRN("trying to read manifest from cache: %s\n", cached_response_path.c_str());
            res_str = read_file(cached_response_path);
            res_code = 200;
            use_cache = true;
-        }
-            throw std::runtime_error(
+        } else {
+            throw std::runtime_error(
+                offline ? "error: failed to get manifest (offline mode)"
+                        : "error: failed to get manifest (check your internet connection)");
         }
     }
     std::string ggufFile;
@@ -698,24 +711,25 @@ bool common_has_curl() {
     return false;
 }
 
-static bool common_download_file_single(const std::string &, const std::string &, const std::string &) {
+static bool common_download_file_single(const std::string &, const std::string &, const std::string &, bool) {
     LOG_ERR("error: built without CURL, cannot download model from internet\n");
     return false;
 }
 
-static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> &, const std::string &) {
+static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> &, const std::string &, bool) {
     LOG_ERR("error: built without CURL, cannot download model from the internet\n");
     return false;
 }
 
 static bool common_download_model(
         const common_params_model &,
-        const std::string &) {
+        const std::string &,
+        bool) {
     LOG_ERR("error: built without CURL, cannot download model from the internet\n");
     return false;
 }
 
-static struct common_hf_file_res common_get_hf_file(const std::string &, const std::string &) {
+static struct common_hf_file_res common_get_hf_file(const std::string &, const std::string &, bool) {
     LOG_ERR("error: built without CURL, cannot download model from the internet\n");
     return {};
 }
@@ -742,7 +756,8 @@ struct handle_model_result {
 static handle_model_result common_params_handle_model(
         struct common_params_model & model,
         const std::string & bearer_token,
-        const std::string & model_path_default) {
+        const std::string & model_path_default,
+        bool offline) {
     handle_model_result result;
     // handle pre-fill default model path and url based on hf_repo and hf_file
     {
@@ -750,7 +765,7 @@ static handle_model_result common_params_handle_model(
             // short-hand to avoid specifying --hf-file -> default it to --model
             if (model.hf_file.empty()) {
                 if (model.path.empty()) {
-                    auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token);
+                    auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token, offline);
                     if (auto_detected.repo.empty() || auto_detected.ggufFile.empty()) {
                         exit(1); // built without CURL, error message already printed
                     }
@@ -791,7 +806,7 @@ static handle_model_result common_params_handle_model(
 
     // then, download it if needed
     if (!model.url.empty()) {
-        bool ok = common_download_model(model, bearer_token);
+        bool ok = common_download_model(model, bearer_token, offline);
         if (!ok) {
             LOG_ERR("error: failed to download model from %s\n", model.url.c_str());
             exit(1);
@@ -934,7 +949,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
 
     // handle model and download
     {
-        auto res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH);
+        auto res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH, params.offline);
         if (params.no_mmproj) {
             params.mmproj = {};
         } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
@@ -944,12 +959,12 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         // only download mmproj if the current example is using it
         for (auto & ex : mmproj_examples) {
             if (ctx_arg.ex == ex) {
-                common_params_handle_model(params.mmproj, params.hf_token, "");
+                common_params_handle_model(params.mmproj, params.hf_token, "", params.offline);
                 break;
             }
         }
-        common_params_handle_model(params.speculative.model, params.hf_token, "");
-        common_params_handle_model(params.vocoder.model, params.hf_token, "");
+        common_params_handle_model(params.speculative.model, params.hf_token, "", params.offline);
+        common_params_handle_model(params.vocoder.model, params.hf_token, "", params.offline);
     }
 
     if (params.escape) {
@@ -973,10 +988,6 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         params.tensor_buft_overrides.push_back({nullptr, nullptr});
     }
 
-    if (params.reranking && params.embedding) {
-        throw std::invalid_argument("error: either --embedding or --reranking can be specified, but not both");
-    }
-
     if (!params.chat_template.empty() && !common_chat_verify_template(params.chat_template, params.use_jinja)) {
         throw std::runtime_error(string_format(
             "error: the supplied chat template is not supported: %s%s\n",
@@ -1333,9 +1344,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ));
     add_opt(common_arg(
         {"--prio"}, "N",
-        string_format("set process/thread priority :
+        string_format("set process/thread priority : low(-1), normal(0), medium(1), high(2), realtime(3) (default: %d)\n", params.cpuparams.priority),
         [](common_params & params, int prio) {
-            if (prio <
+            if (prio < GGML_SCHED_PRIO_LOW || prio > GGML_SCHED_PRIO_REALTIME) {
                 throw std::invalid_argument("invalid value");
             }
             params.cpuparams.priority = (enum ggml_sched_priority) prio;
@@ -2695,6 +2706,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.embd_sep = value;
         }
     ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
+    add_opt(common_arg(
+        {"--cls-separator"}, "STRING",
+        "separator of classification sequences (default \\t) for example \"<#seq#>\"",
+        [](common_params & params, const std::string & value) {
+            params.cls_sep = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
     add_opt(common_arg(
         {"--host"}, "HOST",
         string_format("ip address to listen, or bind to an UNIX socket if the address ends with .sock (default: %s)", params.hostname.c_str()),
@@ -2732,9 +2750,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS"));
     add_opt(common_arg(
         {"--reranking", "--rerank"},
-        string_format("enable reranking endpoint on server (default: %s)",
+        string_format("enable reranking endpoint on server (default: %s)", "disabled"),
         [](common_params & params) {
-            params.
+            params.embedding = true;
+            params.pooling_type = LLAMA_POOLING_TYPE_RANK;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_RERANKING"));
     add_opt(common_arg(
@@ -2848,15 +2867,25 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_JINJA"));
     add_opt(common_arg(
         {"--reasoning-format"}, "FORMAT",
-        "
-        "
-        "
+        "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
+        "- none: leaves thoughts unparsed in `message.content`\n"
+        "- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n"
+        "(default: deepseek)",
         [](common_params & params, const std::string & value) {
             /**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
+            else if (value == "deepseek-legacy") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY; }
             else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
-            else { std::invalid_argument("invalid value"); }
+            else { throw std::invalid_argument("invalid value"); }
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
+    add_opt(common_arg(
+        {"--reasoning-budget"}, "N",
+        "controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)",
+        [](common_params & params, int value) {
+            if (value != 0 && value != -1) { throw std::invalid_argument("invalid value"); }
+            params.reasoning_budget = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK_BUDGET"));
     add_opt(common_arg(
         {"--chat-template"}, "JINJA_TEMPLATE",
         string_format(
@@ -2955,7 +2984,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             /**/ if (value == "jsonl") { params.batched_bench_output_jsonl = true; }
             else if (value == "md") { params.batched_bench_output_jsonl = false; }
-            else { std::invalid_argument("invalid value"); }
+            else { throw std::invalid_argument("invalid value"); }
         }
     ).set_examples({LLAMA_EXAMPLE_BENCH}));
     add_opt(common_arg(
@@ -2987,6 +3016,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             common_log_set_verbosity_thold(INT_MAX);
         }
     ));
+    add_opt(common_arg(
+        {"--offline"},
+        "Offline mode: forces use of cache, prevents network access",
+        [](common_params & params) {
+            params.offline = true;
+        }
+    ).set_env("LLAMA_OFFLINE"));
    add_opt(common_arg(
        {"-lv", "--verbosity", "--log-verbosity"}, "N",
        "Set the verbosity threshold. Messages with a higher verbosity will be ignored.",
@@ -3181,6 +3217,32 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.speculative.model.path = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
+    add_opt(common_arg(
+        {"-ctkd", "--cache-type-k-draft"}, "TYPE",
+        string_format(
+            "KV cache data type for K for the draft model\n"
+            "allowed values: %s\n"
+            "(default: %s)",
+            get_all_kv_cache_types().c_str(),
+            ggml_type_name(params.speculative.cache_type_k)
+        ),
+        [](common_params & params, const std::string & value) {
+            params.speculative.cache_type_k = kv_cache_type_from_str(value);
+        }
+    ).set_env("LLAMA_ARG_CACHE_TYPE_K_DRAFT"));
+    add_opt(common_arg(
+        {"-ctvd", "--cache-type-v-draft"}, "TYPE",
+        string_format(
+            "KV cache data type for V for the draft model\n"
+            "allowed values: %s\n"
+            "(default: %s)",
+            get_all_kv_cache_types().c_str(),
+            ggml_type_name(params.speculative.cache_type_v)
+        ),
+        [](common_params & params, const std::string & value) {
+            params.speculative.cache_type_v = kv_cache_type_from_str(value);
+        }
+    ).set_env("LLAMA_ARG_CACHE_TYPE_V_DRAFT"));
 
     add_opt(common_arg(
         {"-mv", "--model-vocoder"}, "FNAME",
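
The offline-mode change threaded through the hunks above follows one pattern: when offline is set, a file that is already cached is used as-is and a missing file is a hard error, and no network request is attempted. Below is a minimal standalone C++ sketch of that decision only; the function name, messages, and online branch are illustrative assumptions, not part of the llama.cpp API.

#include <cstdio>
#include <filesystem>
#include <string>

// Sketch: mirrors the offline check added to common_download_file_single.
// resolve_local_file is a hypothetical helper, not a library function.
static bool resolve_local_file(const std::string & path, bool offline) {
    const bool file_exists = std::filesystem::exists(path);

    if (offline) {
        if (file_exists) {
            std::printf("using cached file (offline mode): %s\n", path.c_str());
            return true; // skip verification/downloading
        }
        std::fprintf(stderr, "required file is not available in cache (offline mode): %s\n", path.c_str());
        return false;
    }

    // Online path: the real code verifies the cached copy (HEAD request,
    // ETag/Last-Modified comparison) and re-downloads it when stale.
    std::printf("checking remote copy of: %s\n", path.c_str());
    return true;
}

int main() {
    // Example: behave as if --offline (or LLAMA_OFFLINE) were set.
    return resolve_local_file("model.gguf", /*offline=*/true) ? 0 : 1;
}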