cui-llama.rn 1.4.0 → 1.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/jni.cpp +9 -9
- package/cpp/common.cpp +163 -60
- package/cpp/common.h +43 -12
- package/cpp/ggml-alloc.c +1042 -1037
- package/cpp/ggml-backend-impl.h +255 -256
- package/cpp/ggml-backend-reg.cpp +582 -582
- package/cpp/ggml-backend.cpp +2002 -2002
- package/cpp/ggml-backend.h +354 -352
- package/cpp/ggml-common.h +1853 -1853
- package/cpp/ggml-cpp.h +39 -39
- package/cpp/ggml-cpu-aarch64.cpp +4247 -4247
- package/cpp/ggml-cpu-aarch64.h +8 -8
- package/cpp/ggml-cpu-impl.h +386 -386
- package/cpp/ggml-cpu-quants.c +10920 -10839
- package/cpp/ggml-cpu-traits.cpp +36 -36
- package/cpp/ggml-cpu-traits.h +38 -38
- package/cpp/ggml-cpu.c +329 -60
- package/cpp/ggml-cpu.cpp +10 -2
- package/cpp/ggml-cpu.h +135 -135
- package/cpp/ggml-impl.h +567 -567
- package/cpp/ggml-metal-impl.h +17 -17
- package/cpp/ggml-metal.m +4884 -4884
- package/cpp/ggml-quants.c +5238 -5238
- package/cpp/ggml-threading.h +14 -14
- package/cpp/ggml.c +6514 -6448
- package/cpp/ggml.h +2194 -2163
- package/cpp/gguf.cpp +1329 -1325
- package/cpp/gguf.h +202 -202
- package/cpp/json-schema-to-grammar.cpp +1045 -1045
- package/cpp/json-schema-to-grammar.h +8 -8
- package/cpp/json.hpp +24766 -24766
- package/cpp/llama-adapter.cpp +347 -346
- package/cpp/llama-adapter.h +74 -73
- package/cpp/llama-arch.cpp +1487 -1434
- package/cpp/llama-arch.h +400 -395
- package/cpp/llama-batch.cpp +368 -368
- package/cpp/llama-batch.h +88 -88
- package/cpp/llama-chat.cpp +578 -567
- package/cpp/llama-chat.h +52 -51
- package/cpp/llama-context.cpp +1775 -1771
- package/cpp/llama-context.h +128 -128
- package/cpp/llama-cparams.cpp +1 -1
- package/cpp/llama-cparams.h +37 -37
- package/cpp/llama-cpp.h +30 -30
- package/cpp/llama-grammar.cpp +1139 -1139
- package/cpp/llama-grammar.h +143 -143
- package/cpp/llama-hparams.cpp +71 -71
- package/cpp/llama-hparams.h +139 -140
- package/cpp/llama-impl.cpp +167 -167
- package/cpp/llama-impl.h +61 -61
- package/cpp/llama-kv-cache.cpp +718 -718
- package/cpp/llama-kv-cache.h +218 -218
- package/cpp/llama-mmap.cpp +2 -1
- package/cpp/llama-mmap.h +67 -67
- package/cpp/llama-model-loader.cpp +1124 -1011
- package/cpp/llama-model-loader.h +167 -158
- package/cpp/llama-model.cpp +3997 -2202
- package/cpp/llama-model.h +370 -391
- package/cpp/llama-sampling.cpp +2408 -2406
- package/cpp/llama-sampling.h +32 -48
- package/cpp/llama-vocab.cpp +3247 -1982
- package/cpp/llama-vocab.h +125 -182
- package/cpp/llama.cpp +416 -2886
- package/cpp/llama.h +1323 -1285
- package/cpp/log.cpp +401 -401
- package/cpp/log.h +121 -121
- package/cpp/rn-llama.hpp +18 -12
- package/cpp/sampling.cpp +505 -500
- package/cpp/sgemm.cpp +2597 -2597
- package/cpp/speculative.cpp +277 -274
- package/cpp/speculative.h +28 -28
- package/cpp/unicode.cpp +2 -3
- package/package.json +1 -1
package/android/src/main/jni.cpp
CHANGED
@@ -345,10 +345,10 @@ Java_com_rnllama_LlamaContext_initContext(
  llama_free(llama->ctx);
  }

- std::vector<
+ std::vector<common_adapter_lora_info> lora;
  const char *lora_chars = env->GetStringUTFChars(lora_str, nullptr);
  if (lora_chars != nullptr && lora_chars[0] != '\0') {
-
+ common_adapter_lora_info la;
  la.path = lora_chars;
  la.scale = lora_scaled;
  lora.push_back(la);
@@ -362,7 +362,7 @@ Java_com_rnllama_LlamaContext_initContext(
  jstring path = readablemap::getString(env, lora_adapter, "path", nullptr);
  if (path != nullptr) {
  const char *path_chars = env->GetStringUTFChars(path, nullptr);
-
+ common_adapter_lora_info la;
  la.path = path_chars;
  la.scale = readablemap::getFloat(env, lora_adapter, "scaled", 1.0f);
  lora.push_back(la);
@@ -409,7 +409,7 @@ Java_com_rnllama_LlamaContext_loadModelDetails(
  for (int i = 0; i < count; i++) {
  char key[256];
  llama_model_meta_key_by_index(llama->model, i, key, sizeof(key));
- char val[
+ char val[4096];
  llama_model_meta_val_str_by_index(llama->model, i, val, sizeof(val));

  putString(env, meta, key, val);
@@ -623,7 +623,7 @@ Java_com_rnllama_LlamaContext_doCompletion(

  sparams.logit_bias.clear();
  if (ignore_eos) {
- sparams.logit_bias[
+ sparams.logit_bias[llama_vocab_eos(llama_model_get_vocab(llama->model))].bias = -INFINITY;
  }

  // dry break seq
@@ -642,7 +642,7 @@ Java_com_rnllama_LlamaContext_doCompletion(
  sparams.dry_sequence_breakers = dry_sequence_breakers_vector;

  // logit bias
- const int n_vocab =
+ const int n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(llama->model));
  jsize logit_bias_len = env->GetArrayLength(logit_bias);

  for (jsize i = 0; i < logit_bias_len; i++) {
@@ -921,7 +921,7 @@ Java_com_rnllama_LlamaContext_applyLoraAdapters(
  auto llama = context_map[(long) context_ptr];

  // lora_adapters: ReadableArray<ReadableMap>
- std::vector<
+ std::vector<common_adapter_lora_info> lora_adapters;
  int lora_adapters_size = readablearray::size(env, loraAdapters);
  for (int i = 0; i < lora_adapters_size; i++) {
  jobject lora_adapter = readablearray::getMap(env, loraAdapters, i);
@@ -930,7 +930,7 @@ Java_com_rnllama_LlamaContext_applyLoraAdapters(
  const char *path_chars = env->GetStringUTFChars(path, nullptr);
  env->ReleaseStringUTFChars(path, path_chars);
  float scaled = readablemap::getFloat(env, lora_adapter, "scaled", 1.0f);
-
+ common_adapter_lora_info la;
  la.path = path_chars;
  la.scale = scaled;
  lora_adapters.push_back(la);
@@ -955,7 +955,7 @@ Java_com_rnllama_LlamaContext_getLoadedLoraAdapters(
  auto llama = context_map[(long) context_ptr];
  auto loaded_lora_adapters = llama->getLoadedLoraAdapters();
  auto result = createWritableArray(env);
- for (
+ for (common_adapter_lora_info &la : loaded_lora_adapters) {
  auto map = createWriteableMap(env);
  putString(env, map, "path", la.path.c_str());
  putDouble(env, map, "scaled", la.scale);
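
The jni.cpp hunks above track the upstream rename of the LoRA adapter helpers: the bindings now build std::vector<common_adapter_lora_info> lists and query token metadata through llama_model_get_vocab(). A minimal sketch of that pattern follows; it is illustrative only and not taken from the package, and the helper names and adapter path are assumptions:

#include <vector>
#include "common.h"   // common_adapter_lora_info
#include "llama.h"    // llama_model_get_vocab, llama_vocab_eos, llama_vocab_n_tokens

// Hypothetical helper: collect one adapter entry the way the JNI code does.
static std::vector<common_adapter_lora_info> build_lora_list(const char * adapter_path, float scale) {
    std::vector<common_adapter_lora_info> lora;
    common_adapter_lora_info la;
    la.path  = adapter_path; // e.g. a local .gguf adapter file (hypothetical path)
    la.scale = scale;
    lora.push_back(la);
    return lora;
}

// Hypothetical helper: vocab-level queries now go through the llama_vocab object.
static void inspect_vocab(const llama_model * model) {
    const llama_vocab * vocab   = llama_model_get_vocab(model);
    const llama_token   eos     = llama_vocab_eos(vocab);      // EOS token id
    const int32_t       n_vocab = llama_vocab_n_tokens(vocab); // vocabulary size
    (void) eos; (void) n_vocab;
}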
package/cpp/common.cpp
CHANGED
@@ -79,6 +79,22 @@ char const *LLAMA_BUILD_TARGET = "unknown";
  #include <sys/syslimits.h>
  #endif
  #define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
+
+ //
+ // CURL utils
+ //
+
+ using curl_ptr = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;
+
+ // cannot use unique_ptr for curl_slist, because we cannot update without destroying the old one
+ struct curl_slist_ptr {
+ struct curl_slist * ptr = nullptr;
+ ~curl_slist_ptr() {
+ if (ptr) {
+ curl_slist_free_all(ptr);
+ }
+ }
+ };
  #endif // LLAMA_USE_CURL

  using json = nlohmann::ordered_json;
@@ -863,21 +879,23 @@ struct common_init_result common_init_from_params(common_params & params) {
  return iparams;
  }

+ const llama_vocab * vocab = llama_model_get_vocab(model);
+
  if (params.reranking) {
  bool ok = true;

- if (
- LOG_WRN("%s: warning:
+ if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
+ LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
  ok = false;
  }

- if (
- LOG_WRN("%s: warning:
+ if (llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
+ LOG_WRN("%s: warning: vocab does not have an EOS token, reranking will not work\n", __func__);
  ok = false;
  }

- if (
- LOG_WRN("%s: warning:
+ if (llama_vocab_sep(vocab) == LLAMA_TOKEN_NULL) {
+ LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
  ok = false;
  }

@@ -890,7 +908,7 @@ struct common_init_result common_init_from_params(common_params & params) {

  auto cparams = common_context_params_to_llama(params);

- llama_context * lctx =
+ llama_context * lctx = llama_init_from_model(model, cparams);
  if (lctx == NULL) {
  LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
  llama_model_free(model);
@@ -904,7 +922,7 @@ struct common_init_result common_init_from_params(common_params & params) {

  if (!params.control_vectors.empty()) {
  if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
- if (params.control_vector_layer_end <= 0) params.control_vector_layer_end =
+ if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_model_n_layer(model);

  const auto cvec = common_control_vector_load(params.control_vectors);
  if (cvec.n_embd == -1) {
@@ -914,12 +932,13 @@ struct common_init_result common_init_from_params(common_params & params) {
  return iparams;
  }

- int err =
-
-
-
-
-
+ int err = llama_apply_adapter_cvec(
+ lctx,
+ cvec.data.data(),
+ cvec.data.size(),
+ cvec.n_embd,
+ params.control_vector_layer_start,
+ params.control_vector_layer_end);
  if (err) {
  llama_free(lctx);
  llama_model_free(model);
@@ -930,8 +949,8 @@ struct common_init_result common_init_from_params(common_params & params) {

  // load and optionally apply lora adapters
  for (auto & la : params.lora_adapters) {
-
- lora.reset(
+ llama_adapter_lora_ptr lora;
+ lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
  if (lora == nullptr) {
  LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
  llama_free(lctx);
@@ -944,17 +963,17 @@ struct common_init_result common_init_from_params(common_params & params) {
  }

  if (!params.lora_init_without_apply) {
-
+ common_set_adapter_lora(lctx, params.lora_adapters);
  }

- if (params.sampling.ignore_eos &&
- LOG_WRN("%s: warning:
+ if (params.sampling.ignore_eos && llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
+ LOG_WRN("%s: warning: vocab does not have an EOS token, ignoring --ignore-eos\n", __func__);
  params.sampling.ignore_eos = false;
  }

  if (params.sampling.ignore_eos) {
- for (llama_token i = 0; i <
- if (
+ for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
+ if (llama_vocab_is_eog(vocab, i)) {
  LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
  params.sampling.logit_bias.push_back({i, -INFINITY});
  }
@@ -975,8 +994,9 @@ struct common_init_result common_init_from_params(common_params & params) {
  LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);

  std::vector<llama_token> tmp;
- llama_token bos =
- llama_token eos =
+ llama_token bos = llama_vocab_bos(vocab);
+ llama_token eos = llama_vocab_eos(vocab);
+
  // some models (e.g. T5) don't have a BOS token
  if (bos != LLAMA_TOKEN_NULL) {
  tmp.push_back(bos);
@@ -1011,11 +1031,11 @@ struct common_init_result common_init_from_params(common_params & params) {
  return iparams;
  }

- void
-
+ void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora) {
+ llama_clear_adapter_lora(ctx);
  for (auto & la : lora) {
  if (la.scale != 0.0f) {
-
+ llama_set_adapter_lora(ctx, la.ptr, la.scale);
  }
  }
  }
@@ -1033,7 +1053,6 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
  mparams.progress_callback_user_data = params.progress_callback_user_data;
  mparams.progress_callback = params.progress_callback;
  mparams.vocab_only = params.vocab_only;
- mparams.rpc_servers = params.rpc_servers.c_str();
  mparams.main_gpu = params.main_gpu;
  mparams.split_mode = params.split_mode;
  mparams.tensor_split = params.tensor_split;
@@ -1136,7 +1155,8 @@ static bool curl_perform_with_retry(const std::string & url, CURL * curl, int ma

  static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
  // Initialize libcurl
-
+ curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
+ curl_slist_ptr http_headers;
  if (!curl) {
  LOG_ERR("%s: error initializing libcurl\n", __func__);
  return false;
@@ -1150,11 +1170,9 @@ static bool common_download_file(const std::string & url, const std::string & pa

  // Check if hf-token or bearer-token was specified
  if (!hf_token.empty()) {
-
-
-
- http_headers = curl_slist_append(http_headers, auth_header.c_str());
- curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers);
+ std::string auth_header = "Authorization: Bearer " + hf_token;
+ http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
+ curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
  }

  #if defined(_WIN32)
@@ -1450,6 +1468,80 @@ struct llama_model * common_load_model_from_hf(
  return common_load_model_from_url(model_url, local_path, hf_token, params);
  }

+ /**
+ * Allow getting the HF file from the HF repo with tag (like ollama), for example:
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s
+ * Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo)
+ *
+ * Return pair of <repo, file> (with "repo" already having tag removed)
+ *
+ * Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
+ */
+ std::pair<std::string, std::string> common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & hf_token) {
+ auto parts = string_split<std::string>(hf_repo_with_tag, ':');
+ std::string tag = parts.size() > 1 ? parts.back() : "latest";
+ std::string hf_repo = parts[0];
+ if (string_split<std::string>(hf_repo, '/').size() != 2) {
+ throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n");
+ }
+
+ // fetch model info from Hugging Face Hub API
+ json model_info;
+ curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
+ curl_slist_ptr http_headers;
+ std::string res_str;
+ std::string url = "https://huggingface.co/v2/" + hf_repo + "/manifests/" + tag;
+ curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
+ curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
+ typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
+ auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t {
+ static_cast<std::string *>(data)->append((char * ) ptr, size * nmemb);
+ return size * nmemb;
+ };
+ curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
+ curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_str);
+ #if defined(_WIN32)
+ curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
+ #endif
+ if (!hf_token.empty()) {
+ std::string auth_header = "Authorization: Bearer " + hf_token;
+ http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
+ }
+ // Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
+ http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
+ http_headers.ptr = curl_slist_append(http_headers.ptr, "Accept: application/json");
+ curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
+
+ CURLcode res = curl_easy_perform(curl.get());
+
+ if (res != CURLE_OK) {
+ throw std::runtime_error("error: cannot make GET request to HF API");
+ }
+
+ long res_code;
+ curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code);
+ if (res_code == 200) {
+ model_info = json::parse(res_str);
+ } else if (res_code == 401) {
+ throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token");
+ } else {
+ throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str()));
+ }
+
+ // check response
+ if (!model_info.contains("ggufFile")) {
+ throw std::runtime_error("error: model does not have ggufFile");
+ }
+ json & lm_gguf_file = model_info.at("ggufFile");
+ if (!lm_gguf_file.contains("rfilename")) {
+ throw std::runtime_error("error: ggufFile does not have rfilename");
+ }
+
+ return std::make_pair(hf_repo, lm_gguf_file.at("rfilename"));
+ }
+
  #else

  struct llama_model * common_load_model_from_url(
@@ -1471,6 +1563,11 @@ struct llama_model * common_load_model_from_hf(
  return nullptr;
  }

+ std::pair<std::string, std::string> common_get_hf_file(const std::string &, const std::string &) {
+ LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
+ return std::make_pair("", "");
+ }
+
  #endif // LLAMA_USE_CURL

  //
@@ -1569,21 +1666,23 @@ std::vector<llama_token> common_tokenize(
  const std::string & text,
  bool add_special,
  bool parse_special) {
-
+ const llama_model * model = llama_get_model(ctx);
+ const llama_vocab * vocab = llama_model_get_vocab(model);
+ return common_tokenize(vocab, text, add_special, parse_special);
  }

  std::vector<llama_token> common_tokenize(
- const struct
+ const struct llama_vocab * vocab,
  const std::string & text,
  bool add_special,
  bool parse_special) {
  // upper limit for the number of tokens
  int n_tokens = text.length() + 2 * add_special;
  std::vector<llama_token> result(n_tokens);
- n_tokens = llama_tokenize(
+ n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
  if (n_tokens < 0) {
  result.resize(-n_tokens);
- int check = llama_tokenize(
+ int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
  LM_GGML_ASSERT(check == -n_tokens);
  } else {
  result.resize(n_tokens);
@@ -1592,12 +1691,18 @@ std::vector<llama_token> common_tokenize(
  }

  std::string common_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
+ const llama_model * model = llama_get_model(ctx);
+ const llama_vocab * vocab = llama_model_get_vocab(model);
+ return common_token_to_piece(vocab, token, special);
+ }
+
+ std::string common_token_to_piece(const struct llama_vocab * vocab, llama_token token, bool special) {
  std::string piece;
  piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
- const int n_chars = llama_token_to_piece(
+ const int n_chars = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
  if (n_chars < 0) {
  piece.resize(-n_chars);
- int check = llama_token_to_piece(
+ int check = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
  LM_GGML_ASSERT(check == -n_chars);
  }
  else {
@@ -1607,13 +1712,19 @@ std::string common_token_to_piece(const struct llama_context * ctx, llama_token
  return piece;
  }

- std::string common_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
+ std::string common_detokenize(const struct llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
+ const llama_model * model = llama_get_model(ctx);
+ const llama_vocab * vocab = llama_model_get_vocab(model);
+ return common_detokenize(vocab, tokens, special);
+ }
+
+ std::string common_detokenize(const struct llama_vocab * vocab, const std::vector<llama_token> & tokens, bool special) {
  std::string text;
  text.resize(std::max(text.capacity(), tokens.size()));
- int32_t n_chars = llama_detokenize(
+ int32_t n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
  if (n_chars < 0) {
  text.resize(-n_chars);
- n_chars = llama_detokenize(
+ n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
  LM_GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization
  }

@@ -1628,20 +1739,13 @@ std::string common_detokenize(llama_context * ctx, const std::vector<llama_token
  //

  std::string common_get_builtin_chat_template(const struct llama_model * model) {
-
-
- int32_t res = llama_model_meta_val_str(model, template_key, NULL, 0);
- if (res > 0) {
- std::vector<char> model_template(res + 1, 0);
- llama_model_meta_val_str(model, template_key, model_template.data(), model_template.size());
- return std::string(model_template.data(), model_template.size() - 1);
- }
- return "";
+ const char * ptr_tmpl = llama_model_chat_template(model);
+ return ptr_tmpl == nullptr ? "" : ptr_tmpl;
  }

  bool common_chat_verify_template(const std::string & tmpl) {
  llama_chat_message chat[] = {{"user", "test"}};
- int res = llama_chat_apply_template(
+ const int res = llama_chat_apply_template(tmpl.c_str(), chat, 1, true, nullptr, 0);
  return res >= 0;
  }

@@ -1652,16 +1756,16 @@ std::string common_chat_apply_template(const struct llama_model * model,
  int alloc_size = 0;
  bool fallback = false; // indicate if we must fallback to default chatml
  std::vector<llama_chat_message> chat;
- for (auto & msg : msgs) {
+ for (const auto & msg : msgs) {
  chat.push_back({msg.role.c_str(), msg.content.c_str()});
  alloc_size += (msg.role.size() + msg.content.size()) * 1.25;
  }

- const char * ptr_tmpl = tmpl.empty() ?
+ const char * ptr_tmpl = tmpl.empty() ? llama_model_chat_template(model) : tmpl.c_str();
  std::vector<char> buf(alloc_size);

  // run the first time to get the total output length
- int32_t res = llama_chat_apply_template(
+ int32_t res = llama_chat_apply_template(ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());

  // error: chat template is not supported
  if (res < 0) {
@@ -1669,18 +1773,17 @@ std::string common_chat_apply_template(const struct llama_model * model,
  // if the custom "tmpl" is not supported, we throw an error
  // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
  throw std::runtime_error("this custom template is not supported");
- } else {
- // If the built-in template is not supported, we default to chatml
- res = llama_chat_apply_template(nullptr, "chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
- fallback = true;
  }
+
+ // If the built-in template is not supported, we default to chatml
+ res = llama_chat_apply_template("chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+ fallback = true;
  }

  // if it turns out that our buffer is too small, we resize it
  if ((size_t) res > buf.size()) {
  buf.resize(res);
  res = llama_chat_apply_template(
- fallback ? nullptr : model,
  fallback ? "chatml" : ptr_tmpl,
  chat.data(), chat.size(), add_ass, buf.data(), buf.size());
  }
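
Worth noting from the common.cpp changes: the new common_get_hf_file() helper resolves a Hugging Face repo of the form <user>/<model>[:quant] to a concrete GGUF filename via the Hub manifests endpoint. A usage sketch follows; it is illustrative only, the repo string is taken from the doc comment in the diff, and the surrounding main() is an assumption:

#include <cstdio>
#include <string>
#include "common.h"

int main() {
    // tag is optional and defaults to "latest" when omitted
    const std::string repo_with_tag = "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M";
    const std::string hf_token      = ""; // only needed for gated or private repos

    // returns {repo without tag, resolved GGUF filename}; throws on bad input or
    // HTTP errors when built with libcurl, and returns empty strings otherwise
    const auto [hf_repo, hf_file] = common_get_hf_file(repo_with_tag, hf_token);
    std::printf("repo: %s\nfile: %s\n", hf_repo.c_str(), hf_file.c_str());
    return 0;
}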
package/cpp/common.h
CHANGED
@@ -2,7 +2,6 @@

  #pragma once

- #include "llama-cpp.h"
  #include "llama-cpp.h"

  #include <string>
@@ -25,11 +24,11 @@

  #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"

- struct
+ struct common_adapter_lora_info {
  std::string path;
  float scale;

- struct
+ struct llama_adapter_lora * ptr;
  };

  using llama_tokens = std::vector<llama_token>;
@@ -115,6 +114,12 @@ enum dimre_method {
  DIMRE_METHOD_MEAN,
  };

+ enum common_conversation_mode {
+ COMMON_CONVERSATION_MODE_DISABLED = 0,
+ COMMON_CONVERSATION_MODE_ENABLED = 1,
+ COMMON_CONVERSATION_MODE_AUTO = 2,
+ };
+
  // sampling parameters
  struct common_params_sampling {
  uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
@@ -181,7 +186,11 @@ struct common_params_speculative {
  struct cpu_params cpuparams;
  struct cpu_params cpuparams_batch;

- std::string
+ std::string hf_repo = ""; // HF repo // NOLINT
+ std::string hf_file = ""; // HF file // NOLINT
+
+ std::string model = ""; // draft model for speculative decoding // NOLINT
+ std::string model_url = ""; // model url to download // NOLINT
  };

  struct common_params_vocoder {
@@ -190,6 +199,8 @@ struct common_params_vocoder {

  std::string model = ""; // model path // NOLINT
  std::string model_url = ""; // model url to download // NOLINT
+
+ bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy // NOLINT
  };

  struct common_params {
@@ -256,14 +267,13 @@ struct common_params {
  std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
  std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
  std::string logits_file = ""; // file for saving *all* logits // NOLINT
- std::string rpc_servers = ""; // comma separated list of RPC servers // NOLINT

  std::vector<std::string> in_files; // all input files
  std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
  std::vector<llama_model_kv_override> kv_overrides;

- bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using
- std::vector<
+ bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
+ std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale

  std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale

@@ -291,7 +301,6 @@ struct common_params {
  bool special = false; // enable special token output
  bool interactive = false; // interactive mode
  bool interactive_first = false; // wait for user input immediately
- bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
  bool prompt_cache_all = false; // save user input and generations to prompt cache
  bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it

@@ -317,6 +326,8 @@ struct common_params {
  lm_ggml_type cache_type_k = LM_GGML_TYPE_F16; // KV cache data type for the K
  lm_ggml_type cache_type_v = LM_GGML_TYPE_F16; // KV cache data type for the V

+ common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
+
  // multimodal models (see examples/llava)
  std::string mmproj = ""; // path to multimodal projector // NOLINT
  std::vector<std::string> image; // path to image file(s)
@@ -470,6 +481,11 @@ static bool string_starts_with(const std::string & str,
  return str.rfind(prefix, 0) == 0;
  }

+ static bool string_ends_with(const std::string & str,
+ const std::string & suffix) { // While we wait for C++20's std::string::ends_with...
+ return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
+ }
+
  bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
  void string_process_escapes(std::string & input);

@@ -497,7 +513,7 @@ struct common_init_result {
  llama_model_ptr model;
  llama_context_ptr context;

- std::vector<
+ std::vector<llama_adapter_lora_ptr> lora;
  };

  struct common_init_result common_init_from_params(common_params & params);
@@ -511,6 +527,7 @@ struct llama_model * common_load_model_from_url(
  const std::string & local_path,
  const std::string & hf_token,
  const struct llama_model_params & params);
+
  struct llama_model * common_load_model_from_hf(
  const std::string & repo,
  const std::string & remote_path,
@@ -518,8 +535,12 @@ struct llama_model * common_load_model_from_hf(
  const std::string & hf_token,
  const struct llama_model_params & params);

+ std::pair<std::string, std::string> common_get_hf_file(
+ const std::string & hf_repo_with_tag,
+ const std::string & hf_token);
+
  // clear LoRA adapters from context, then apply new list of adapters
- void
+ void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);

  //
  // Batch utils
@@ -557,7 +578,7 @@ std::vector<llama_token> common_tokenize(
  bool parse_special = false);

  std::vector<llama_token> common_tokenize(
- const struct
+ const struct llama_vocab * vocab,
  const std::string & text,
  bool add_special,
  bool parse_special = false);
@@ -569,11 +590,21 @@ std::string common_token_to_piece(
  llama_token token,
  bool special = true);

+ std::string common_token_to_piece(
+ const struct llama_vocab * vocab,
+ llama_token token,
+ bool special = true);
+
  // detokenizes a vector of tokens into a string
  // should work similar to Python's `tokenizer.decode`
  // optionally renders special/control tokens
  std::string common_detokenize(
-
+ const struct llama_context * ctx,
+ const std::vector<llama_token> & tokens,
+ bool special = true);
+
+ std::string common_detokenize(
+ const struct llama_vocab * vocab,
  const std::vector<llama_token> & tokens,
  bool special = true);
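
The header now declares llama_vocab-based overloads of the tokenizer helpers alongside the existing llama_context ones, so callers can tokenize without creating a context. A short sketch of the new overloads follows; it is illustrative only and the wrapper function is an assumption:

#include <string>
#include <vector>
#include "common.h"
#include "llama.h"

// Round-trip a string through the vocab-based overloads added in this release.
static std::string tokenize_roundtrip(const llama_model * model, const std::string & text) {
    const llama_vocab * vocab = llama_model_get_vocab(model);

    // tokenize directly against the vocab (no llama_context required)
    std::vector<llama_token> tokens =
        common_tokenize(vocab, text, /*add_special=*/true, /*parse_special=*/false);

    // and decode again with the matching common_detokenize overload
    return common_detokenize(vocab, tokens, /*special=*/false);
}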