cui-llama.rn 1.4.0 → 1.4.2

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (108)
  1. package/README.md +4 -23
  2. package/android/build.gradle +12 -3
  3. package/android/src/main/CMakeLists.txt +13 -7
  4. package/android/src/main/java/com/rnllama/LlamaContext.java +27 -20
  5. package/android/src/main/java/com/rnllama/RNLlama.java +5 -1
  6. package/android/src/main/jni.cpp +15 -12
  7. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  9. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  10. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  11. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  12. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  13. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  14. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  15. package/cpp/README.md +1 -1
  16. package/cpp/common.cpp +158 -267
  17. package/cpp/common.h +46 -12
  18. package/cpp/ggml-alloc.c +1042 -1037
  19. package/cpp/ggml-backend-impl.h +255 -256
  20. package/cpp/ggml-backend-reg.cpp +582 -582
  21. package/cpp/ggml-backend.cpp +2002 -2002
  22. package/cpp/ggml-backend.h +354 -352
  23. package/cpp/ggml-common.h +1853 -1853
  24. package/cpp/ggml-cpp.h +39 -39
  25. package/cpp/ggml-cpu-aarch64.cpp +4247 -4247
  26. package/cpp/ggml-cpu-aarch64.h +8 -8
  27. package/cpp/ggml-cpu-impl.h +386 -386
  28. package/cpp/ggml-cpu-quants.c +10920 -10839
  29. package/cpp/ggml-cpu-traits.cpp +36 -36
  30. package/cpp/ggml-cpu-traits.h +38 -38
  31. package/cpp/ggml-cpu.c +329 -60
  32. package/cpp/ggml-cpu.cpp +10 -2
  33. package/cpp/ggml-cpu.h +135 -135
  34. package/cpp/ggml-impl.h +567 -567
  35. package/cpp/ggml-metal-impl.h +17 -17
  36. package/cpp/ggml-metal.m +4884 -4884
  37. package/cpp/ggml-quants.c +5238 -5238
  38. package/cpp/ggml-threading.h +14 -14
  39. package/cpp/ggml.c +6514 -6448
  40. package/cpp/ggml.h +2194 -2163
  41. package/cpp/gguf.cpp +1329 -1325
  42. package/cpp/gguf.h +202 -202
  43. package/cpp/json-schema-to-grammar.cpp +1045 -1045
  44. package/cpp/json-schema-to-grammar.h +8 -8
  45. package/cpp/json.hpp +24766 -24766
  46. package/cpp/llama-adapter.cpp +347 -346
  47. package/cpp/llama-adapter.h +74 -73
  48. package/cpp/llama-arch.cpp +1487 -1434
  49. package/cpp/llama-arch.h +400 -395
  50. package/cpp/llama-batch.cpp +368 -368
  51. package/cpp/llama-batch.h +88 -88
  52. package/cpp/llama-chat.cpp +578 -567
  53. package/cpp/llama-chat.h +52 -51
  54. package/cpp/llama-context.cpp +1775 -1771
  55. package/cpp/llama-context.h +128 -128
  56. package/cpp/llama-cparams.cpp +1 -1
  57. package/cpp/llama-cparams.h +37 -37
  58. package/cpp/llama-cpp.h +30 -30
  59. package/cpp/llama-grammar.cpp +1139 -1139
  60. package/cpp/llama-grammar.h +143 -143
  61. package/cpp/llama-hparams.cpp +71 -71
  62. package/cpp/llama-hparams.h +139 -140
  63. package/cpp/llama-impl.cpp +167 -167
  64. package/cpp/llama-impl.h +61 -61
  65. package/cpp/llama-kv-cache.cpp +718 -718
  66. package/cpp/llama-kv-cache.h +218 -218
  67. package/cpp/llama-mmap.cpp +2 -1
  68. package/cpp/llama-mmap.h +67 -67
  69. package/cpp/llama-model-loader.cpp +1124 -1011
  70. package/cpp/llama-model-loader.h +167 -158
  71. package/cpp/llama-model.cpp +3997 -2202
  72. package/cpp/llama-model.h +370 -391
  73. package/cpp/llama-sampling.cpp +2408 -2406
  74. package/cpp/llama-sampling.h +32 -48
  75. package/cpp/llama-vocab.cpp +3247 -1982
  76. package/cpp/llama-vocab.h +125 -182
  77. package/cpp/llama.cpp +416 -2886
  78. package/cpp/llama.h +1323 -1285
  79. package/cpp/log.cpp +401 -401
  80. package/cpp/log.h +121 -121
  81. package/cpp/rn-llama.cpp +822 -0
  82. package/cpp/rn-llama.h +123 -0
  83. package/cpp/rn-llama.hpp +18 -12
  84. package/cpp/sampling.cpp +505 -500
  85. package/cpp/sgemm.cpp +2597 -2597
  86. package/cpp/speculative.cpp +277 -274
  87. package/cpp/speculative.h +28 -28
  88. package/cpp/unicode.cpp +2 -3
  89. package/ios/CMakeLists.txt +99 -0
  90. package/ios/RNLlama.h +5 -1
  91. package/ios/RNLlama.mm +2 -2
  92. package/ios/RNLlamaContext.h +8 -1
  93. package/ios/RNLlamaContext.mm +15 -11
  94. package/ios/rnllama.xcframework/Info.plist +74 -0
  95. package/jest/mock.js +3 -2
  96. package/lib/commonjs/NativeRNLlama.js.map +1 -1
  97. package/lib/commonjs/index.js +4 -2
  98. package/lib/commonjs/index.js.map +1 -1
  99. package/lib/module/NativeRNLlama.js.map +1 -1
  100. package/lib/module/index.js +4 -2
  101. package/lib/module/index.js.map +1 -1
  102. package/lib/typescript/NativeRNLlama.d.ts +5 -1
  103. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  104. package/lib/typescript/index.d.ts.map +1 -1
  105. package/llama-rn.podspec +8 -2
  106. package/package.json +5 -2
  107. package/src/NativeRNLlama.ts +5 -1
  108. package/src/index.ts +9 -2
package/cpp/common.cpp CHANGED
@@ -79,6 +79,22 @@ char const *LLAMA_BUILD_TARGET = "unknown";
 #include <sys/syslimits.h>
 #endif
 #define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
+
+//
+// CURL utils
+//
+
+using curl_ptr = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;
+
+// cannot use unique_ptr for curl_slist, because we cannot update without destroying the old one
+struct curl_slist_ptr {
+    struct curl_slist * ptr = nullptr;
+    ~curl_slist_ptr() {
+        if (ptr) {
+            curl_slist_free_all(ptr);
+        }
+    }
+};
 #endif // LLAMA_USE_CURL
 
 using json = nlohmann::ordered_json;
@@ -863,21 +879,23 @@ struct common_init_result common_init_from_params(common_params & params) {
         return iparams;
     }
 
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
     if (params.reranking) {
         bool ok = true;
 
-        if (llama_token_bos(model) == LLAMA_TOKEN_NULL) {
-            LOG_WRN("%s: warning: model does not have a BOS token, reranking will not work\n", __func__);
+        if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
+            LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
             ok = false;
         }
 
-        if (llama_token_eos(model) == LLAMA_TOKEN_NULL) {
-            LOG_WRN("%s: warning: model does not have an EOS token, reranking will not work\n", __func__);
+        if (llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
+            LOG_WRN("%s: warning: vocab does not have an EOS token, reranking will not work\n", __func__);
             ok = false;
         }
 
-        if (llama_token_sep(model) == LLAMA_TOKEN_NULL) {
-            LOG_WRN("%s: warning: model does not have a SEP token, reranking will not work\n", __func__);
+        if (llama_vocab_sep(vocab) == LLAMA_TOKEN_NULL) {
+            LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
            ok = false;
         }
 
@@ -890,7 +908,7 @@ struct common_init_result common_init_from_params(common_params & params) {
 
     auto cparams = common_context_params_to_llama(params);
 
-    llama_context * lctx = llama_new_context_with_model(model, cparams);
+    llama_context * lctx = llama_init_from_model(model, cparams);
     if (lctx == NULL) {
         LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
         llama_model_free(model);
@@ -904,7 +922,7 @@ struct common_init_result common_init_from_params(common_params & params) {
 
     if (!params.control_vectors.empty()) {
         if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
-        if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model);
+        if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_model_n_layer(model);
 
         const auto cvec = common_control_vector_load(params.control_vectors);
         if (cvec.n_embd == -1) {
@@ -914,12 +932,13 @@ struct common_init_result common_init_from_params(common_params & params) {
             return iparams;
         }
 
-        int err = llama_control_vector_apply(lctx,
-                                             cvec.data.data(),
-                                             cvec.data.size(),
-                                             cvec.n_embd,
-                                             params.control_vector_layer_start,
-                                             params.control_vector_layer_end);
+        int err = llama_apply_adapter_cvec(
+                lctx,
+                cvec.data.data(),
+                cvec.data.size(),
+                cvec.n_embd,
+                params.control_vector_layer_start,
+                params.control_vector_layer_end);
         if (err) {
             llama_free(lctx);
             llama_model_free(model);
@@ -930,8 +949,8 @@ struct common_init_result common_init_from_params(common_params & params) {
 
     // load and optionally apply lora adapters
     for (auto & la : params.lora_adapters) {
-        llama_lora_adapter_ptr lora;
-        lora.reset(llama_lora_adapter_init(model, la.path.c_str()));
+        llama_adapter_lora_ptr lora;
+        lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
         if (lora == nullptr) {
             LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
             llama_free(lctx);
@@ -944,17 +963,17 @@ struct common_init_result common_init_from_params(common_params & params) {
     }
 
     if (!params.lora_init_without_apply) {
-        common_lora_adapters_apply(lctx, params.lora_adapters);
+        common_set_adapter_lora(lctx, params.lora_adapters);
     }
 
-    if (params.sampling.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
-        LOG_WRN("%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__);
+    if (params.sampling.ignore_eos && llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
+        LOG_WRN("%s: warning: vocab does not have an EOS token, ignoring --ignore-eos\n", __func__);
         params.sampling.ignore_eos = false;
     }
 
     if (params.sampling.ignore_eos) {
-        for (llama_token i = 0; i < llama_n_vocab(model); i++) {
-            if (llama_token_is_eog(model, i)) {
+        for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
+            if (llama_vocab_is_eog(vocab, i)) {
                 LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
                 params.sampling.logit_bias.push_back({i, -INFINITY});
             }
@@ -975,8 +994,9 @@ struct common_init_result common_init_from_params(common_params & params) {
         LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
 
         std::vector<llama_token> tmp;
-        llama_token bos = llama_token_bos(model);
-        llama_token eos = llama_token_eos(model);
+        llama_token bos = llama_vocab_bos(vocab);
+        llama_token eos = llama_vocab_eos(vocab);
+
         // some models (e.g. T5) don't have a BOS token
         if (bos != LLAMA_TOKEN_NULL) {
             tmp.push_back(bos);
@@ -1011,11 +1031,11 @@ struct common_init_result common_init_from_params(common_params & params) {
     return iparams;
 }
 
-void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_info> & lora) {
-    llama_lora_adapter_clear(ctx);
+void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora) {
+    llama_clear_adapter_lora(ctx);
     for (auto & la : lora) {
         if (la.scale != 0.0f) {
-            llama_lora_adapter_set(ctx, la.ptr, la.scale);
+            llama_set_adapter_lora(ctx, la.ptr, la.scale);
         }
     }
 }
@@ -1033,7 +1053,6 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
     mparams.progress_callback_user_data = params.progress_callback_user_data;
     mparams.progress_callback = params.progress_callback;
     mparams.vocab_only = params.vocab_only;
-    mparams.rpc_servers = params.rpc_servers.c_str();
     mparams.main_gpu = params.main_gpu;
     mparams.split_mode = params.split_mode;
     mparams.tensor_split = params.tensor_split;
@@ -1134,219 +1153,6 @@ static bool curl_perform_with_retry(const std::string & url, CURL * curl, int ma
     return false;
 }
 
-static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
-    // Initialize libcurl
-    std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
-    if (!curl) {
-        LOG_ERR("%s: error initializing libcurl\n", __func__);
-        return false;
-    }
-
-    bool force_download = false;
-
-    // Set the URL, allow to follow http redirection
-    curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
-    curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
-
-    // Check if hf-token or bearer-token was specified
-    if (!hf_token.empty()) {
-        std::string auth_header = "Authorization: Bearer ";
-        auth_header += hf_token.c_str();
-        struct curl_slist *http_headers = NULL;
-        http_headers = curl_slist_append(http_headers, auth_header.c_str());
-        curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers);
-    }
-
-#if defined(_WIN32)
-    // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
-    // operating system. Currently implemented under MS-Windows.
-    curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
-#endif
-
-    // Check if the file already exists locally
-    auto file_exists = std::filesystem::exists(path);
-
-    // If the file exists, check its JSON metadata companion file.
-    std::string metadata_path = path + ".json";
-    nlohmann::json metadata;
-    std::string etag;
-    std::string last_modified;
-
-    if (file_exists) {
-        // Try and read the JSON metadata file (note: stream autoclosed upon exiting this block).
-        std::ifstream metadata_in(metadata_path);
-        if (metadata_in.good()) {
-            try {
-                metadata_in >> metadata;
-                LOG_INF("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
-                if (metadata.contains("url") && metadata.at("url").is_string()) {
-                    auto previous_url = metadata.at("url").get<std::string>();
-                    if (previous_url != url) {
-                        LOG_ERR("%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str());
-                        return false;
-                    }
-                }
-                if (metadata.contains("etag") && metadata.at("etag").is_string()) {
-                    etag = metadata.at("etag");
-                }
-                if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) {
-                    last_modified = metadata.at("lastModified");
-                }
-            } catch (const nlohmann::json::exception & e) {
-                LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
-                return false;
-            }
-        }
-    } else {
-        LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
-    }
-
-    // Send a HEAD request to retrieve the etag and last-modified headers
-    struct common_load_model_from_url_headers {
-        std::string etag;
-        std::string last_modified;
-    };
-
-    common_load_model_from_url_headers headers;
-
-    {
-        typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
-        auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
-            common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
-
-            static std::regex header_regex("([^:]+): (.*)\r\n");
-            static std::regex etag_regex("ETag", std::regex_constants::icase);
-            static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase);
-
-            std::string header(buffer, n_items);
-            std::smatch match;
-            if (std::regex_match(header, match, header_regex)) {
-                const std::string & key = match[1];
-                const std::string & value = match[2];
-                if (std::regex_match(key, match, etag_regex)) {
-                    headers->etag = value;
-                } else if (std::regex_match(key, match, last_modified_regex)) {
-                    headers->last_modified = value;
-                }
-            }
-            return n_items;
-        };
-
-        curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
-        curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); // hide head request progress
-        curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
-        curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
-
-        bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
-        if (!was_perform_successful) {
-            return false;
-        }
-
-        long http_code = 0;
-        curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
-        if (http_code != 200) {
-            // HEAD not supported, we don't know if the file has changed
-            // force trigger downloading
-            force_download = true;
-            LOG_ERR("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
-        }
-    }
-
-    bool should_download = !file_exists || force_download;
-    if (!should_download) {
-        if (!etag.empty() && etag != headers.etag) {
-            LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str());
-            should_download = true;
-        } else if (!last_modified.empty() && last_modified != headers.last_modified) {
-            LOG_WRN("%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str());
-            should_download = true;
-        }
-    }
-    if (should_download) {
-        std::string path_temporary = path + ".downloadInProgress";
-        if (file_exists) {
-            LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
-            if (remove(path.c_str()) != 0) {
-                LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
-                return false;
-            }
-        }
-
-        // Set the output file
-
-        struct FILE_deleter {
-            void operator()(FILE * f) const {
-                fclose(f);
-            }
-        };
-
-        std::unique_ptr<FILE, FILE_deleter> outfile(fopen(path_temporary.c_str(), "wb"));
-        if (!outfile) {
-            LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path.c_str());
-            return false;
-        }
-
-        typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * data, size_t size, size_t nmemb, void * fd);
-        auto write_callback = [](void * data, size_t size, size_t nmemb, void * fd) -> size_t {
-            return fwrite(data, size, nmemb, (FILE *)fd);
-        };
-        curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 0L);
-        curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
-        curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, outfile.get());
-
-        // display download progress
-        curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 0L);
-
-        // helper function to hide password in URL
-        auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string {
-            std::size_t protocol_pos = url.find("://");
-            if (protocol_pos == std::string::npos) {
-                return url; // Malformed URL
-            }
-
-            std::size_t at_pos = url.find('@', protocol_pos + 3);
-            if (at_pos == std::string::npos) {
-                return url; // No password in URL
-            }
-
-            return url.substr(0, protocol_pos + 3) + "********" + url.substr(at_pos);
-        };
-
-        // start the download
-        LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
-            llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
-        bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
-        if (!was_perform_successful) {
-            return false;
-        }
-
-        long http_code = 0;
-        curl_easy_getinfo (curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
-        if (http_code < 200 || http_code >= 400) {
-            LOG_ERR("%s: invalid http status code received: %ld\n", __func__, http_code);
-            return false;
-        }
-
-        // Causes file to be closed explicitly here before we rename it.
-        outfile.reset();
-
-        // Write the updated JSON metadata file.
-        metadata.update({
-            {"url", url},
-            {"etag", headers.etag},
-            {"lastModified", headers.last_modified}
-        });
-        std::ofstream(metadata_path) << metadata.dump(4);
-        LOG_INF("%s: file metadata saved: %s\n", __func__, metadata_path.c_str());
-
-        if (rename(path_temporary.c_str(), path.c_str()) != 0) {
-            LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
-            return false;
-        }
-    }
-
-    return true;
-}
 
 struct llama_model * common_load_model_from_url(
         const std::string & model_url,
@@ -1450,6 +1256,80 @@ struct llama_model * common_load_model_from_hf(
     return common_load_model_from_url(model_url, local_path, hf_token, params);
 }
 
+/**
+ * Allow getting the HF file from the HF repo with tag (like ollama), for example:
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s
+ * Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo)
+ *
+ * Return pair of <repo, file> (with "repo" already having tag removed)
+ *
+ * Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
+ */
+std::pair<std::string, std::string> common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & hf_token) {
+    auto parts = string_split<std::string>(hf_repo_with_tag, ':');
+    std::string tag = parts.size() > 1 ? parts.back() : "latest";
+    std::string hf_repo = parts[0];
+    if (string_split<std::string>(hf_repo, '/').size() != 2) {
+        throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n");
+    }
+
+    // fetch model info from Hugging Face Hub API
+    json model_info;
+    curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
+    curl_slist_ptr http_headers;
+    std::string res_str;
+    std::string url = "https://huggingface.co/v2/" + hf_repo + "/manifests/" + tag;
+    curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
+    curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
+    typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
+    auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t {
+        static_cast<std::string *>(data)->append((char * ) ptr, size * nmemb);
+        return size * nmemb;
+    };
+    curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
+    curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_str);
+#if defined(_WIN32)
+    curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
+#endif
+    if (!hf_token.empty()) {
+        std::string auth_header = "Authorization: Bearer " + hf_token;
+        http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
+    }
+    // Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
+    http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
+    http_headers.ptr = curl_slist_append(http_headers.ptr, "Accept: application/json");
+    curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
+
+    CURLcode res = curl_easy_perform(curl.get());
+
+    if (res != CURLE_OK) {
+        throw std::runtime_error("error: cannot make GET request to HF API");
+    }
+
+    long res_code;
+    curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code);
+    if (res_code == 200) {
+        model_info = json::parse(res_str);
+    } else if (res_code == 401) {
+        throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token");
+    } else {
+        throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str()));
+    }
+
+    // check response
+    if (!model_info.contains("ggufFile")) {
+        throw std::runtime_error("error: model does not have ggufFile");
+    }
+    json & lm_gguf_file = model_info.at("ggufFile");
+    if (!lm_gguf_file.contains("rfilename")) {
+        throw std::runtime_error("error: ggufFile does not have rfilename");
+    }
+
+    return std::make_pair(hf_repo, lm_gguf_file.at("rfilename"));
+}
+
 #else
 
 struct llama_model * common_load_model_from_url(
@@ -1471,6 +1351,11 @@ struct llama_model * common_load_model_from_hf(
     return nullptr;
 }
 
+std::pair<std::string, std::string> common_get_hf_file(const std::string &, const std::string &) {
+    LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
+    return std::make_pair("", "");
+}
+
 #endif // LLAMA_USE_CURL
 
 //
@@ -1569,21 +1454,23 @@ std::vector<llama_token> common_tokenize(
         const std::string & text,
         bool add_special,
         bool parse_special) {
-    return common_tokenize(llama_get_model(ctx), text, add_special, parse_special);
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+    return common_tokenize(vocab, text, add_special, parse_special);
 }
 
 std::vector<llama_token> common_tokenize(
-        const struct llama_model * model,
+        const struct llama_vocab * vocab,
        const std::string & text,
        bool add_special,
        bool parse_special) {
    // upper limit for the number of tokens
    int n_tokens = text.length() + 2 * add_special;
    std::vector<llama_token> result(n_tokens);
-    n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+    n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
    if (n_tokens < 0) {
        result.resize(-n_tokens);
-        int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+        int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
        LM_GGML_ASSERT(check == -n_tokens);
    } else {
        result.resize(n_tokens);
@@ -1592,12 +1479,18 @@ std::vector<llama_token> common_tokenize(
 }
 
 std::string common_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+    return common_token_to_piece(vocab, token, special);
+}
+
+std::string common_token_to_piece(const struct llama_vocab * vocab, llama_token token, bool special) {
     std::string piece;
     piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
-    const int n_chars = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
+    const int n_chars = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
     if (n_chars < 0) {
         piece.resize(-n_chars);
-        int check = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
+        int check = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
         LM_GGML_ASSERT(check == -n_chars);
     }
     else {
@@ -1607,13 +1500,19 @@ std::string common_token_to_piece(const struct llama_context * ctx, llama_token
     return piece;
 }
 
-std::string common_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
+std::string common_detokenize(const struct llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+    return common_detokenize(vocab, tokens, special);
+}
+
+std::string common_detokenize(const struct llama_vocab * vocab, const std::vector<llama_token> & tokens, bool special) {
     std::string text;
     text.resize(std::max(text.capacity(), tokens.size()));
-    int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+    int32_t n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
     if (n_chars < 0) {
         text.resize(-n_chars);
-        n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+        n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
         LM_GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization
     }
 
@@ -1628,20 +1527,13 @@ std::string common_detokenize(llama_context * ctx, const std::vector<llama_token
 //
 
 std::string common_get_builtin_chat_template(const struct llama_model * model) {
-    static const char * template_key = "tokenizer.chat_template";
-    // call with NULL buffer to get the total size of the string
-    int32_t res = llama_model_meta_val_str(model, template_key, NULL, 0);
-    if (res > 0) {
-        std::vector<char> model_template(res + 1, 0);
-        llama_model_meta_val_str(model, template_key, model_template.data(), model_template.size());
-        return std::string(model_template.data(), model_template.size() - 1);
-    }
-    return "";
+    const char * ptr_tmpl = llama_model_chat_template(model);
+    return ptr_tmpl == nullptr ? "" : ptr_tmpl;
 }
 
 bool common_chat_verify_template(const std::string & tmpl) {
     llama_chat_message chat[] = {{"user", "test"}};
-    int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
+    const int res = llama_chat_apply_template(tmpl.c_str(), chat, 1, true, nullptr, 0);
     return res >= 0;
 }
 
@@ -1652,16 +1544,16 @@ std::string common_chat_apply_template(const struct llama_model * model,
     int alloc_size = 0;
     bool fallback = false; // indicate if we must fallback to default chatml
     std::vector<llama_chat_message> chat;
-    for (auto & msg : msgs) {
+    for (const auto & msg : msgs) {
         chat.push_back({msg.role.c_str(), msg.content.c_str()});
         alloc_size += (msg.role.size() + msg.content.size()) * 1.25;
     }
 
-    const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str();
+    const char * ptr_tmpl = tmpl.empty() ? llama_model_chat_template(model) : tmpl.c_str();
     std::vector<char> buf(alloc_size);
 
     // run the first time to get the total output length
-    int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+    int32_t res = llama_chat_apply_template(ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
 
     // error: chat template is not supported
     if (res < 0) {
@@ -1669,18 +1561,17 @@
             // if the custom "tmpl" is not supported, we throw an error
             // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
             throw std::runtime_error("this custom template is not supported");
-        } else {
-            // If the built-in template is not supported, we default to chatml
-            res = llama_chat_apply_template(nullptr, "chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
-            fallback = true;
         }
+
+        // If the built-in template is not supported, we default to chatml
+        res = llama_chat_apply_template("chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+        fallback = true;
    }
 
     // if it turns out that our buffer is too small, we resize it
     if ((size_t) res > buf.size()) {
         buf.resize(res);
         res = llama_chat_apply_template(
-            fallback ? nullptr : model,
             fallback ? "chatml" : ptr_tmpl,
             chat.data(), chat.size(), add_ass, buf.data(), buf.size());
     }
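
Most of the common.cpp changes above track the upstream llama.cpp API rework that this package vendors: token queries move from the model to a llama_vocab object obtained via llama_model_get_vocab, llama_new_context_with_model is replaced by llama_init_from_model, and llama_chat_apply_template no longer takes a model pointer. The following is a minimal sketch (not part of the package) of the resulting calling pattern, assuming the llama.h header bundled under package/cpp at this version; the model path is a placeholder, and llama_model_load_from_file plus the default-params helpers are assumed to match that header.

// Minimal sketch of the vocab-based API used throughout this diff (not from the package itself).
// "model.gguf" is a placeholder path; real code should check every return value.
#include "llama.h"

int main() {
    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_model_load_from_file("model.gguf", mparams);
    if (model == NULL) {
        return 1;
    }

    // token metadata is now queried from the vocab, not the model
    const llama_vocab * vocab = llama_model_get_vocab(model);
    const llama_token bos = llama_vocab_bos(vocab); // was llama_token_bos(model)
    const llama_token eos = llama_vocab_eos(vocab); // was llama_token_eos(model)
    (void) bos;
    (void) eos;

    // context creation: llama_new_context_with_model -> llama_init_from_model
    llama_context_params cparams = llama_context_default_params();
    llama_context * ctx = llama_init_from_model(model, cparams);
    if (ctx != NULL) {
        llama_free(ctx);
    }

    llama_model_free(model);
    return 0;
}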