cui-llama.rn 1.4.0 → 1.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -23
- package/android/build.gradle +12 -3
- package/android/src/main/CMakeLists.txt +13 -7
- package/android/src/main/java/com/rnllama/LlamaContext.java +27 -20
- package/android/src/main/java/com/rnllama/RNLlama.java +5 -1
- package/android/src/main/jni.cpp +15 -12
- package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
- package/cpp/README.md +1 -1
- package/cpp/common.cpp +158 -267
- package/cpp/common.h +46 -12
- package/cpp/ggml-alloc.c +1042 -1037
- package/cpp/ggml-backend-impl.h +255 -256
- package/cpp/ggml-backend-reg.cpp +582 -582
- package/cpp/ggml-backend.cpp +2002 -2002
- package/cpp/ggml-backend.h +354 -352
- package/cpp/ggml-common.h +1853 -1853
- package/cpp/ggml-cpp.h +39 -39
- package/cpp/ggml-cpu-aarch64.cpp +4247 -4247
- package/cpp/ggml-cpu-aarch64.h +8 -8
- package/cpp/ggml-cpu-impl.h +386 -386
- package/cpp/ggml-cpu-quants.c +10920 -10839
- package/cpp/ggml-cpu-traits.cpp +36 -36
- package/cpp/ggml-cpu-traits.h +38 -38
- package/cpp/ggml-cpu.c +329 -60
- package/cpp/ggml-cpu.cpp +10 -2
- package/cpp/ggml-cpu.h +135 -135
- package/cpp/ggml-impl.h +567 -567
- package/cpp/ggml-metal-impl.h +17 -17
- package/cpp/ggml-metal.m +4884 -4884
- package/cpp/ggml-quants.c +5238 -5238
- package/cpp/ggml-threading.h +14 -14
- package/cpp/ggml.c +6514 -6448
- package/cpp/ggml.h +2194 -2163
- package/cpp/gguf.cpp +1329 -1325
- package/cpp/gguf.h +202 -202
- package/cpp/json-schema-to-grammar.cpp +1045 -1045
- package/cpp/json-schema-to-grammar.h +8 -8
- package/cpp/json.hpp +24766 -24766
- package/cpp/llama-adapter.cpp +347 -346
- package/cpp/llama-adapter.h +74 -73
- package/cpp/llama-arch.cpp +1487 -1434
- package/cpp/llama-arch.h +400 -395
- package/cpp/llama-batch.cpp +368 -368
- package/cpp/llama-batch.h +88 -88
- package/cpp/llama-chat.cpp +578 -567
- package/cpp/llama-chat.h +52 -51
- package/cpp/llama-context.cpp +1775 -1771
- package/cpp/llama-context.h +128 -128
- package/cpp/llama-cparams.cpp +1 -1
- package/cpp/llama-cparams.h +37 -37
- package/cpp/llama-cpp.h +30 -30
- package/cpp/llama-grammar.cpp +1139 -1139
- package/cpp/llama-grammar.h +143 -143
- package/cpp/llama-hparams.cpp +71 -71
- package/cpp/llama-hparams.h +139 -140
- package/cpp/llama-impl.cpp +167 -167
- package/cpp/llama-impl.h +61 -61
- package/cpp/llama-kv-cache.cpp +718 -718
- package/cpp/llama-kv-cache.h +218 -218
- package/cpp/llama-mmap.cpp +2 -1
- package/cpp/llama-mmap.h +67 -67
- package/cpp/llama-model-loader.cpp +1124 -1011
- package/cpp/llama-model-loader.h +167 -158
- package/cpp/llama-model.cpp +3997 -2202
- package/cpp/llama-model.h +370 -391
- package/cpp/llama-sampling.cpp +2408 -2406
- package/cpp/llama-sampling.h +32 -48
- package/cpp/llama-vocab.cpp +3247 -1982
- package/cpp/llama-vocab.h +125 -182
- package/cpp/llama.cpp +416 -2886
- package/cpp/llama.h +1323 -1285
- package/cpp/log.cpp +401 -401
- package/cpp/log.h +121 -121
- package/cpp/rn-llama.cpp +822 -0
- package/cpp/rn-llama.h +123 -0
- package/cpp/rn-llama.hpp +18 -12
- package/cpp/sampling.cpp +505 -500
- package/cpp/sgemm.cpp +2597 -2597
- package/cpp/speculative.cpp +277 -274
- package/cpp/speculative.h +28 -28
- package/cpp/unicode.cpp +2 -3
- package/ios/CMakeLists.txt +99 -0
- package/ios/RNLlama.h +5 -1
- package/ios/RNLlama.mm +2 -2
- package/ios/RNLlamaContext.h +8 -1
- package/ios/RNLlamaContext.mm +15 -11
- package/ios/rnllama.xcframework/Info.plist +74 -0
- package/jest/mock.js +3 -2
- package/lib/commonjs/NativeRNLlama.js.map +1 -1
- package/lib/commonjs/index.js +4 -2
- package/lib/commonjs/index.js.map +1 -1
- package/lib/module/NativeRNLlama.js.map +1 -1
- package/lib/module/index.js +4 -2
- package/lib/module/index.js.map +1 -1
- package/lib/typescript/NativeRNLlama.d.ts +5 -1
- package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
- package/lib/typescript/index.d.ts.map +1 -1
- package/llama-rn.podspec +8 -2
- package/package.json +5 -2
- package/src/NativeRNLlama.ts +5 -1
- package/src/index.ts +9 -2
package/cpp/common.cpp
CHANGED
@@ -79,6 +79,22 @@ char const *LLAMA_BUILD_TARGET = "unknown";
 #include <sys/syslimits.h>
 #endif
 #define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
+
+//
+// CURL utils
+//
+
+using curl_ptr = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;
+
+// cannot use unique_ptr for curl_slist, because we cannot update without destroying the old one
+struct curl_slist_ptr {
+    struct curl_slist * ptr = nullptr;
+    ~curl_slist_ptr() {
+        if (ptr) {
+            curl_slist_free_all(ptr);
+        }
+    }
+};
 #endif // LLAMA_USE_CURL

 using json = nlohmann::ordered_json;
@@ -863,21 +879,23 @@ struct common_init_result common_init_from_params(common_params & params) {
        return iparams;
    }

+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
    if (params.reranking) {
        bool ok = true;

-        if (
-            LOG_WRN("%s: warning:
+        if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
+            LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
            ok = false;
        }

-        if (
-            LOG_WRN("%s: warning:
+        if (llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
+            LOG_WRN("%s: warning: vocab does not have an EOS token, reranking will not work\n", __func__);
            ok = false;
        }

-        if (
-            LOG_WRN("%s: warning:
+        if (llama_vocab_sep(vocab) == LLAMA_TOKEN_NULL) {
+            LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
            ok = false;
        }

@@ -890,7 +908,7 @@ struct common_init_result common_init_from_params(common_params & params) {

    auto cparams = common_context_params_to_llama(params);

-    llama_context * lctx =
+    llama_context * lctx = llama_init_from_model(model, cparams);
    if (lctx == NULL) {
        LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
        llama_model_free(model);
@@ -904,7 +922,7 @@ struct common_init_result common_init_from_params(common_params & params) {

    if (!params.control_vectors.empty()) {
        if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
-        if (params.control_vector_layer_end <= 0) params.control_vector_layer_end =
+        if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_model_n_layer(model);

        const auto cvec = common_control_vector_load(params.control_vectors);
        if (cvec.n_embd == -1) {
@@ -914,12 +932,13 @@ struct common_init_result common_init_from_params(common_params & params) {
            return iparams;
        }

-        int err =
-
-
-
-
-
+        int err = llama_apply_adapter_cvec(
+                lctx,
+                cvec.data.data(),
+                cvec.data.size(),
+                cvec.n_embd,
+                params.control_vector_layer_start,
+                params.control_vector_layer_end);
        if (err) {
            llama_free(lctx);
            llama_model_free(model);
@@ -930,8 +949,8 @@ struct common_init_result common_init_from_params(common_params & params) {

    // load and optionally apply lora adapters
    for (auto & la : params.lora_adapters) {
-
-        lora.reset(
+        llama_adapter_lora_ptr lora;
+        lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
        if (lora == nullptr) {
            LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
            llama_free(lctx);
@@ -944,17 +963,17 @@ struct common_init_result common_init_from_params(common_params & params) {
    }

    if (!params.lora_init_without_apply) {
-
+        common_set_adapter_lora(lctx, params.lora_adapters);
    }

-    if (params.sampling.ignore_eos &&
-        LOG_WRN("%s: warning:
+    if (params.sampling.ignore_eos && llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
+        LOG_WRN("%s: warning: vocab does not have an EOS token, ignoring --ignore-eos\n", __func__);
        params.sampling.ignore_eos = false;
    }

    if (params.sampling.ignore_eos) {
-        for (llama_token i = 0; i <
-            if (
+        for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
+            if (llama_vocab_is_eog(vocab, i)) {
                LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
                params.sampling.logit_bias.push_back({i, -INFINITY});
            }
@@ -975,8 +994,9 @@ struct common_init_result common_init_from_params(common_params & params) {
        LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);

        std::vector<llama_token> tmp;
-        llama_token bos =
-        llama_token eos =
+        llama_token bos = llama_vocab_bos(vocab);
+        llama_token eos = llama_vocab_eos(vocab);
+
        // some models (e.g. T5) don't have a BOS token
        if (bos != LLAMA_TOKEN_NULL) {
            tmp.push_back(bos);
@@ -1011,11 +1031,11 @@ struct common_init_result common_init_from_params(common_params & params) {
    return iparams;
}

-void
-
+void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora) {
+    llama_clear_adapter_lora(ctx);
    for (auto & la : lora) {
        if (la.scale != 0.0f) {
-
+            llama_set_adapter_lora(ctx, la.ptr, la.scale);
        }
    }
}
@@ -1033,7 +1053,6 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
    mparams.progress_callback_user_data = params.progress_callback_user_data;
    mparams.progress_callback = params.progress_callback;
    mparams.vocab_only = params.vocab_only;
-    mparams.rpc_servers = params.rpc_servers.c_str();
    mparams.main_gpu = params.main_gpu;
    mparams.split_mode = params.split_mode;
    mparams.tensor_split = params.tensor_split;
@@ -1134,219 +1153,6 @@ static bool curl_perform_with_retry(const std::string & url, CURL * curl, int ma
    return false;
}

-static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
-    // Initialize libcurl
-    std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
-    if (!curl) {
-        LOG_ERR("%s: error initializing libcurl\n", __func__);
-        return false;
-    }
-
-    bool force_download = false;
-
-    // Set the URL, allow to follow http redirection
-    curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
-    curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
-
-    // Check if hf-token or bearer-token was specified
-    if (!hf_token.empty()) {
-        std::string auth_header = "Authorization: Bearer ";
-        auth_header += hf_token.c_str();
-        struct curl_slist *http_headers = NULL;
-        http_headers = curl_slist_append(http_headers, auth_header.c_str());
-        curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers);
-    }
-
-#if defined(_WIN32)
-    // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
-    // operating system. Currently implemented under MS-Windows.
-    curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
-#endif
-
-    // Check if the file already exists locally
-    auto file_exists = std::filesystem::exists(path);
-
-    // If the file exists, check its JSON metadata companion file.
-    std::string metadata_path = path + ".json";
-    nlohmann::json metadata;
-    std::string etag;
-    std::string last_modified;
-
-    if (file_exists) {
-        // Try and read the JSON metadata file (note: stream autoclosed upon exiting this block).
-        std::ifstream metadata_in(metadata_path);
-        if (metadata_in.good()) {
-            try {
-                metadata_in >> metadata;
-                LOG_INF("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
-                if (metadata.contains("url") && metadata.at("url").is_string()) {
-                    auto previous_url = metadata.at("url").get<std::string>();
-                    if (previous_url != url) {
-                        LOG_ERR("%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str());
-                        return false;
-                    }
-                }
-                if (metadata.contains("etag") && metadata.at("etag").is_string()) {
-                    etag = metadata.at("etag");
-                }
-                if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) {
-                    last_modified = metadata.at("lastModified");
-                }
-            } catch (const nlohmann::json::exception & e) {
-                LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
-                return false;
-            }
-        }
-    } else {
-        LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
-    }
-
-    // Send a HEAD request to retrieve the etag and last-modified headers
-    struct common_load_model_from_url_headers {
-        std::string etag;
-        std::string last_modified;
-    };
-
-    common_load_model_from_url_headers headers;
-
-    {
-        typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
-        auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
-            common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
-
-            static std::regex header_regex("([^:]+): (.*)\r\n");
-            static std::regex etag_regex("ETag", std::regex_constants::icase);
-            static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase);
-
-            std::string header(buffer, n_items);
-            std::smatch match;
-            if (std::regex_match(header, match, header_regex)) {
-                const std::string & key = match[1];
-                const std::string & value = match[2];
-                if (std::regex_match(key, match, etag_regex)) {
-                    headers->etag = value;
-                } else if (std::regex_match(key, match, last_modified_regex)) {
-                    headers->last_modified = value;
-                }
-            }
-            return n_items;
-        };
-
-        curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
-        curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); // hide head request progress
-        curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
-        curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
-
-        bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
-        if (!was_perform_successful) {
-            return false;
-        }
-
-        long http_code = 0;
-        curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
-        if (http_code != 200) {
-            // HEAD not supported, we don't know if the file has changed
-            // force trigger downloading
-            force_download = true;
-            LOG_ERR("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
-        }
-    }
-
-    bool should_download = !file_exists || force_download;
-    if (!should_download) {
-        if (!etag.empty() && etag != headers.etag) {
-            LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str());
-            should_download = true;
-        } else if (!last_modified.empty() && last_modified != headers.last_modified) {
-            LOG_WRN("%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str());
-            should_download = true;
-        }
-    }
-    if (should_download) {
-        std::string path_temporary = path + ".downloadInProgress";
-        if (file_exists) {
-            LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
-            if (remove(path.c_str()) != 0) {
-                LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
-                return false;
-            }
-        }
-
-        // Set the output file
-
-        struct FILE_deleter {
-            void operator()(FILE * f) const {
-                fclose(f);
-            }
-        };
-
-        std::unique_ptr<FILE, FILE_deleter> outfile(fopen(path_temporary.c_str(), "wb"));
-        if (!outfile) {
-            LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path.c_str());
-            return false;
-        }
-
-        typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * data, size_t size, size_t nmemb, void * fd);
-        auto write_callback = [](void * data, size_t size, size_t nmemb, void * fd) -> size_t {
-            return fwrite(data, size, nmemb, (FILE *)fd);
-        };
-        curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 0L);
-        curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
-        curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, outfile.get());
-
-        // display download progress
-        curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 0L);
-
-        // helper function to hide password in URL
-        auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string {
-            std::size_t protocol_pos = url.find("://");
-            if (protocol_pos == std::string::npos) {
-                return url; // Malformed URL
-            }
-
-            std::size_t at_pos = url.find('@', protocol_pos + 3);
-            if (at_pos == std::string::npos) {
-                return url; // No password in URL
-            }
-
-            return url.substr(0, protocol_pos + 3) + "********" + url.substr(at_pos);
-        };
-
-        // start the download
-        LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
-            llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
-        bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
-        if (!was_perform_successful) {
-            return false;
-        }
-
-        long http_code = 0;
-        curl_easy_getinfo (curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
-        if (http_code < 200 || http_code >= 400) {
-            LOG_ERR("%s: invalid http status code received: %ld\n", __func__, http_code);
-            return false;
-        }
-
-        // Causes file to be closed explicitly here before we rename it.
-        outfile.reset();
-
-        // Write the updated JSON metadata file.
-        metadata.update({
-            {"url", url},
-            {"etag", headers.etag},
-            {"lastModified", headers.last_modified}
-        });
-        std::ofstream(metadata_path) << metadata.dump(4);
-        LOG_INF("%s: file metadata saved: %s\n", __func__, metadata_path.c_str());
-
-        if (rename(path_temporary.c_str(), path.c_str()) != 0) {
-            LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
-            return false;
-        }
-    }
-
-    return true;
-}

struct llama_model * common_load_model_from_url(
        const std::string & model_url,
@@ -1450,6 +1256,80 @@ struct llama_model * common_load_model_from_hf(
    return common_load_model_from_url(model_url, local_path, hf_token, params);
}

+/**
+ * Allow getting the HF file from the HF repo with tag (like ollama), for example:
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s
+ * Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo)
+ *
+ * Return pair of <repo, file> (with "repo" already having tag removed)
+ *
+ * Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
+ */
+std::pair<std::string, std::string> common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & hf_token) {
+    auto parts = string_split<std::string>(hf_repo_with_tag, ':');
+    std::string tag = parts.size() > 1 ? parts.back() : "latest";
+    std::string hf_repo = parts[0];
+    if (string_split<std::string>(hf_repo, '/').size() != 2) {
+        throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n");
+    }
+
+    // fetch model info from Hugging Face Hub API
+    json model_info;
+    curl_ptr       curl(curl_easy_init(), &curl_easy_cleanup);
+    curl_slist_ptr http_headers;
+    std::string res_str;
+    std::string url = "https://huggingface.co/v2/" + hf_repo + "/manifests/" + tag;
+    curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
+    curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
+    typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
+    auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t {
+        static_cast<std::string *>(data)->append((char * ) ptr, size * nmemb);
+        return size * nmemb;
+    };
+    curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
+    curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_str);
+#if defined(_WIN32)
+    curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
+#endif
+    if (!hf_token.empty()) {
+        std::string auth_header = "Authorization: Bearer " + hf_token;
+        http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
+    }
+    // Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
+    http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
+    http_headers.ptr = curl_slist_append(http_headers.ptr, "Accept: application/json");
+    curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
+
+    CURLcode res = curl_easy_perform(curl.get());
+
+    if (res != CURLE_OK) {
+        throw std::runtime_error("error: cannot make GET request to HF API");
+    }
+
+    long res_code;
+    curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code);
+    if (res_code == 200) {
+        model_info = json::parse(res_str);
+    } else if (res_code == 401) {
+        throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token");
+    } else {
+        throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str()));
+    }
+
+    // check response
+    if (!model_info.contains("ggufFile")) {
+        throw std::runtime_error("error: model does not have ggufFile");
+    }
+    json & lm_gguf_file = model_info.at("ggufFile");
+    if (!lm_gguf_file.contains("rfilename")) {
+        throw std::runtime_error("error: ggufFile does not have rfilename");
+    }
+
+    return std::make_pair(hf_repo, lm_gguf_file.at("rfilename"));
+}
+
 #else

 struct llama_model * common_load_model_from_url(
@@ -1471,6 +1351,11 @@ struct llama_model * common_load_model_from_hf(
    return nullptr;
}

+std::pair<std::string, std::string> common_get_hf_file(const std::string &, const std::string &) {
+    LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
+    return std::make_pair("", "");
+}
+
 #endif // LLAMA_USE_CURL

 //
@@ -1569,21 +1454,23 @@ std::vector<llama_token> common_tokenize(
        const std::string & text,
        bool add_special,
        bool parse_special) {
-
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+    return common_tokenize(vocab, text, add_special, parse_special);
}

 std::vector<llama_token> common_tokenize(
-        const struct
+        const struct llama_vocab * vocab,
        const std::string & text,
        bool add_special,
        bool parse_special) {
    // upper limit for the number of tokens
    int n_tokens = text.length() + 2 * add_special;
    std::vector<llama_token> result(n_tokens);
-    n_tokens = llama_tokenize(
+    n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
    if (n_tokens < 0) {
        result.resize(-n_tokens);
-        int check = llama_tokenize(
+        int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
        LM_GGML_ASSERT(check == -n_tokens);
    } else {
        result.resize(n_tokens);
@@ -1592,12 +1479,18 @@ std::vector<llama_token> common_tokenize(
}

 std::string common_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+    return common_token_to_piece(vocab, token, special);
+}
+
+std::string common_token_to_piece(const struct llama_vocab * vocab, llama_token token, bool special) {
    std::string piece;
    piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
-    const int n_chars = llama_token_to_piece(
+    const int n_chars = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
    if (n_chars < 0) {
        piece.resize(-n_chars);
-        int check = llama_token_to_piece(
+        int check = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
        LM_GGML_ASSERT(check == -n_chars);
    }
    else {
@@ -1607,13 +1500,19 @@ std::string common_token_to_piece(const struct llama_context * ctx, llama_token
    return piece;
}

-std::string common_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
+std::string common_detokenize(const struct llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+    return common_detokenize(vocab, tokens, special);
+}
+
+std::string common_detokenize(const struct llama_vocab * vocab, const std::vector<llama_token> & tokens, bool special) {
    std::string text;
    text.resize(std::max(text.capacity(), tokens.size()));
-    int32_t n_chars = llama_detokenize(
+    int32_t n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
    if (n_chars < 0) {
        text.resize(-n_chars);
-        n_chars = llama_detokenize(
+        n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
        LM_GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization
    }

@@ -1628,20 +1527,13 @@ std::string common_detokenize(llama_context * ctx, const std::vector<llama_token
 //

 std::string common_get_builtin_chat_template(const struct llama_model * model) {
-
-
-    int32_t res = llama_model_meta_val_str(model, template_key, NULL, 0);
-    if (res > 0) {
-        std::vector<char> model_template(res + 1, 0);
-        llama_model_meta_val_str(model, template_key, model_template.data(), model_template.size());
-        return std::string(model_template.data(), model_template.size() - 1);
-    }
-    return "";
+    const char * ptr_tmpl = llama_model_chat_template(model);
+    return ptr_tmpl == nullptr ? "" : ptr_tmpl;
}

 bool common_chat_verify_template(const std::string & tmpl) {
    llama_chat_message chat[] = {{"user", "test"}};
-    int res = llama_chat_apply_template(
+    const int res = llama_chat_apply_template(tmpl.c_str(), chat, 1, true, nullptr, 0);
    return res >= 0;
}

@@ -1652,16 +1544,16 @@ std::string common_chat_apply_template(const struct llama_model * model,
    int alloc_size = 0;
    bool fallback = false; // indicate if we must fallback to default chatml
    std::vector<llama_chat_message> chat;
-    for (auto & msg : msgs) {
+    for (const auto & msg : msgs) {
        chat.push_back({msg.role.c_str(), msg.content.c_str()});
        alloc_size += (msg.role.size() + msg.content.size()) * 1.25;
    }

-    const char * ptr_tmpl = tmpl.empty() ?
+    const char * ptr_tmpl = tmpl.empty() ? llama_model_chat_template(model) : tmpl.c_str();
    std::vector<char> buf(alloc_size);

    // run the first time to get the total output length
-    int32_t res = llama_chat_apply_template(
+    int32_t res = llama_chat_apply_template(ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());

    // error: chat template is not supported
    if (res < 0) {
@@ -1669,18 +1561,17 @@ std::string common_chat_apply_template(const struct llama_model * model,
            // if the custom "tmpl" is not supported, we throw an error
            // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
            throw std::runtime_error("this custom template is not supported");
-        } else {
-            // If the built-in template is not supported, we default to chatml
-            res = llama_chat_apply_template(nullptr, "chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
-            fallback = true;
        }
+
+        // If the built-in template is not supported, we default to chatml
+        res = llama_chat_apply_template("chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+        fallback = true;
    }

    // if it turns out that our buffer is too small, we resize it
    if ((size_t) res > buf.size()) {
        buf.resize(res);
        res = llama_chat_apply_template(
-            fallback ? nullptr : model,
            fallback ? "chatml" : ptr_tmpl,
            chat.data(), chat.size(), add_ass, buf.data(), buf.size());
    }