cui-llama.rn 1.4.0 → 1.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. package/android/src/main/jni.cpp +9 -9
  2. package/cpp/common.cpp +163 -60
  3. package/cpp/common.h +43 -12
  4. package/cpp/ggml-alloc.c +1042 -1037
  5. package/cpp/ggml-backend-impl.h +255 -256
  6. package/cpp/ggml-backend-reg.cpp +582 -582
  7. package/cpp/ggml-backend.cpp +2002 -2002
  8. package/cpp/ggml-backend.h +354 -352
  9. package/cpp/ggml-common.h +1853 -1853
  10. package/cpp/ggml-cpp.h +39 -39
  11. package/cpp/ggml-cpu-aarch64.cpp +4247 -4247
  12. package/cpp/ggml-cpu-aarch64.h +8 -8
  13. package/cpp/ggml-cpu-impl.h +386 -386
  14. package/cpp/ggml-cpu-quants.c +10920 -10839
  15. package/cpp/ggml-cpu-traits.cpp +36 -36
  16. package/cpp/ggml-cpu-traits.h +38 -38
  17. package/cpp/ggml-cpu.c +329 -60
  18. package/cpp/ggml-cpu.cpp +10 -2
  19. package/cpp/ggml-cpu.h +135 -135
  20. package/cpp/ggml-impl.h +567 -567
  21. package/cpp/ggml-metal-impl.h +17 -17
  22. package/cpp/ggml-metal.m +4884 -4884
  23. package/cpp/ggml-quants.c +5238 -5238
  24. package/cpp/ggml-threading.h +14 -14
  25. package/cpp/ggml.c +6514 -6448
  26. package/cpp/ggml.h +2194 -2163
  27. package/cpp/gguf.cpp +1329 -1325
  28. package/cpp/gguf.h +202 -202
  29. package/cpp/json-schema-to-grammar.cpp +1045 -1045
  30. package/cpp/json-schema-to-grammar.h +8 -8
  31. package/cpp/json.hpp +24766 -24766
  32. package/cpp/llama-adapter.cpp +347 -346
  33. package/cpp/llama-adapter.h +74 -73
  34. package/cpp/llama-arch.cpp +1487 -1434
  35. package/cpp/llama-arch.h +400 -395
  36. package/cpp/llama-batch.cpp +368 -368
  37. package/cpp/llama-batch.h +88 -88
  38. package/cpp/llama-chat.cpp +578 -567
  39. package/cpp/llama-chat.h +52 -51
  40. package/cpp/llama-context.cpp +1775 -1771
  41. package/cpp/llama-context.h +128 -128
  42. package/cpp/llama-cparams.cpp +1 -1
  43. package/cpp/llama-cparams.h +37 -37
  44. package/cpp/llama-cpp.h +30 -30
  45. package/cpp/llama-grammar.cpp +1139 -1139
  46. package/cpp/llama-grammar.h +143 -143
  47. package/cpp/llama-hparams.cpp +71 -71
  48. package/cpp/llama-hparams.h +139 -140
  49. package/cpp/llama-impl.cpp +167 -167
  50. package/cpp/llama-impl.h +61 -61
  51. package/cpp/llama-kv-cache.cpp +718 -718
  52. package/cpp/llama-kv-cache.h +218 -218
  53. package/cpp/llama-mmap.cpp +2 -1
  54. package/cpp/llama-mmap.h +67 -67
  55. package/cpp/llama-model-loader.cpp +1124 -1011
  56. package/cpp/llama-model-loader.h +167 -158
  57. package/cpp/llama-model.cpp +3997 -2202
  58. package/cpp/llama-model.h +370 -391
  59. package/cpp/llama-sampling.cpp +2408 -2406
  60. package/cpp/llama-sampling.h +32 -48
  61. package/cpp/llama-vocab.cpp +3247 -1982
  62. package/cpp/llama-vocab.h +125 -182
  63. package/cpp/llama.cpp +416 -2886
  64. package/cpp/llama.h +1323 -1285
  65. package/cpp/log.cpp +401 -401
  66. package/cpp/log.h +121 -121
  67. package/cpp/rn-llama.hpp +18 -12
  68. package/cpp/sampling.cpp +505 -500
  69. package/cpp/sgemm.cpp +2597 -2597
  70. package/cpp/speculative.cpp +277 -274
  71. package/cpp/speculative.h +28 -28
  72. package/cpp/unicode.cpp +2 -3
  73. package/package.json +1 -1
package/android/src/main/jni.cpp CHANGED
@@ -345,10 +345,10 @@ Java_com_rnllama_LlamaContext_initContext(
  llama_free(llama->ctx);
  }
 
- std::vector<common_lora_adapter_info> lora;
+ std::vector<common_adapter_lora_info> lora;
  const char *lora_chars = env->GetStringUTFChars(lora_str, nullptr);
  if (lora_chars != nullptr && lora_chars[0] != '\0') {
- common_lora_adapter_info la;
+ common_adapter_lora_info la;
  la.path = lora_chars;
  la.scale = lora_scaled;
  lora.push_back(la);
@@ -362,7 +362,7 @@ Java_com_rnllama_LlamaContext_initContext(
  jstring path = readablemap::getString(env, lora_adapter, "path", nullptr);
  if (path != nullptr) {
  const char *path_chars = env->GetStringUTFChars(path, nullptr);
- common_lora_adapter_info la;
+ common_adapter_lora_info la;
  la.path = path_chars;
  la.scale = readablemap::getFloat(env, lora_adapter, "scaled", 1.0f);
  lora.push_back(la);
@@ -409,7 +409,7 @@ Java_com_rnllama_LlamaContext_loadModelDetails(
  for (int i = 0; i < count; i++) {
  char key[256];
  llama_model_meta_key_by_index(llama->model, i, key, sizeof(key));
- char val[2048];
+ char val[4096];
  llama_model_meta_val_str_by_index(llama->model, i, val, sizeof(val));
 
  putString(env, meta, key, val);
@@ -623,7 +623,7 @@ Java_com_rnllama_LlamaContext_doCompletion(
 
  sparams.logit_bias.clear();
  if (ignore_eos) {
- sparams.logit_bias[llama_token_eos(llama->model)].bias = -INFINITY;
+ sparams.logit_bias[llama_vocab_eos(llama_model_get_vocab(llama->model))].bias = -INFINITY;
  }
 
  // dry break seq
@@ -642,7 +642,7 @@ Java_com_rnllama_LlamaContext_doCompletion(
  sparams.dry_sequence_breakers = dry_sequence_breakers_vector;
 
  // logit bias
- const int n_vocab = llama_n_vocab(llama_get_model(llama->ctx));
+ const int n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(llama->model));
  jsize logit_bias_len = env->GetArrayLength(logit_bias);
 
  for (jsize i = 0; i < logit_bias_len; i++) {
@@ -921,7 +921,7 @@ Java_com_rnllama_LlamaContext_applyLoraAdapters(
  auto llama = context_map[(long) context_ptr];
 
  // lora_adapters: ReadableArray<ReadableMap>
- std::vector<common_lora_adapter_info> lora_adapters;
+ std::vector<common_adapter_lora_info> lora_adapters;
  int lora_adapters_size = readablearray::size(env, loraAdapters);
  for (int i = 0; i < lora_adapters_size; i++) {
  jobject lora_adapter = readablearray::getMap(env, loraAdapters, i);
@@ -930,7 +930,7 @@ Java_com_rnllama_LlamaContext_applyLoraAdapters(
  const char *path_chars = env->GetStringUTFChars(path, nullptr);
  env->ReleaseStringUTFChars(path, path_chars);
  float scaled = readablemap::getFloat(env, lora_adapter, "scaled", 1.0f);
- common_lora_adapter_info la;
+ common_adapter_lora_info la;
  la.path = path_chars;
  la.scale = scaled;
  lora_adapters.push_back(la);
@@ -955,7 +955,7 @@ Java_com_rnllama_LlamaContext_getLoadedLoraAdapters(
  auto llama = context_map[(long) context_ptr];
  auto loaded_lora_adapters = llama->getLoadedLoraAdapters();
  auto result = createWritableArray(env);
- for (common_lora_adapter_info &la : loaded_lora_adapters) {
+ for (common_adapter_lora_info &la : loaded_lora_adapters) {
  auto map = createWriteableMap(env);
  putString(env, map, "path", la.path.c_str());
  putDouble(env, map, "scaled", la.scale);
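
Migration note: the jni.cpp changes above track upstream llama.cpp renames — common_lora_adapter_info becomes common_adapter_lora_info, and token queries move from the model to the vocab handle. A minimal sketch of the pattern (not code from the package; the adapter path and scale are placeholders):

    // Assumes the llama.h / common.h headers bundled with this release.
    #include "common.h"
    #include "llama.h"
    #include <vector>

    static void lora_and_vocab_example(const llama_model * model) {
        std::vector<common_adapter_lora_info> lora;      // was: common_lora_adapter_info
        common_adapter_lora_info la;
        la.path  = "adapter.gguf";                       // hypothetical adapter path
        la.scale = 1.0f;
        lora.push_back(la);

        // token/vocab queries now go through the vocab handle, not the model:
        const llama_vocab * vocab = llama_model_get_vocab(model);
        const llama_token   eos   = llama_vocab_eos(vocab);      // was: llama_token_eos(model)
        const int         n_vocab = llama_vocab_n_tokens(vocab); // was: llama_n_vocab(model)
        (void) lora; (void) eos; (void) n_vocab;
    }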
package/cpp/common.cpp CHANGED
@@ -79,6 +79,22 @@ char const *LLAMA_BUILD_TARGET = "unknown";
  #include <sys/syslimits.h>
  #endif
  #define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
+
+ //
+ // CURL utils
+ //
+
+ using curl_ptr = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;
+
+ // cannot use unique_ptr for curl_slist, because we cannot update without destroying the old one
+ struct curl_slist_ptr {
+ struct curl_slist * ptr = nullptr;
+ ~curl_slist_ptr() {
+ if (ptr) {
+ curl_slist_free_all(ptr);
+ }
+ }
+ };
  #endif // LLAMA_USE_CURL
 
  using json = nlohmann::ordered_json;
@@ -863,21 +879,23 @@ struct common_init_result common_init_from_params(common_params & params) {
  return iparams;
  }
 
+ const llama_vocab * vocab = llama_model_get_vocab(model);
+
  if (params.reranking) {
  bool ok = true;
 
- if (llama_token_bos(model) == LLAMA_TOKEN_NULL) {
- LOG_WRN("%s: warning: model does not have a BOS token, reranking will not work\n", __func__);
+ if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
+ LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
  ok = false;
  }
 
- if (llama_token_eos(model) == LLAMA_TOKEN_NULL) {
- LOG_WRN("%s: warning: model does not have an EOS token, reranking will not work\n", __func__);
+ if (llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
+ LOG_WRN("%s: warning: vocab does not have an EOS token, reranking will not work\n", __func__);
  ok = false;
  }
 
- if (llama_token_sep(model) == LLAMA_TOKEN_NULL) {
- LOG_WRN("%s: warning: model does not have a SEP token, reranking will not work\n", __func__);
+ if (llama_vocab_sep(vocab) == LLAMA_TOKEN_NULL) {
+ LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
  ok = false;
  }
 
@@ -890,7 +908,7 @@ struct common_init_result common_init_from_params(common_params & params) {
 
  auto cparams = common_context_params_to_llama(params);
 
- llama_context * lctx = llama_new_context_with_model(model, cparams);
+ llama_context * lctx = llama_init_from_model(model, cparams);
  if (lctx == NULL) {
  LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
  llama_model_free(model);
@@ -904,7 +922,7 @@ struct common_init_result common_init_from_params(common_params & params) {
 
  if (!params.control_vectors.empty()) {
  if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
- if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model);
+ if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_model_n_layer(model);
 
  const auto cvec = common_control_vector_load(params.control_vectors);
  if (cvec.n_embd == -1) {
@@ -914,12 +932,13 @@ struct common_init_result common_init_from_params(common_params & params) {
  return iparams;
  }
 
- int err = llama_control_vector_apply(lctx,
- cvec.data.data(),
- cvec.data.size(),
- cvec.n_embd,
- params.control_vector_layer_start,
- params.control_vector_layer_end);
+ int err = llama_apply_adapter_cvec(
+ lctx,
+ cvec.data.data(),
+ cvec.data.size(),
+ cvec.n_embd,
+ params.control_vector_layer_start,
+ params.control_vector_layer_end);
  if (err) {
  llama_free(lctx);
  llama_model_free(model);
@@ -930,8 +949,8 @@ struct common_init_result common_init_from_params(common_params & params) {
 
  // load and optionally apply lora adapters
  for (auto & la : params.lora_adapters) {
- llama_lora_adapter_ptr lora;
- lora.reset(llama_lora_adapter_init(model, la.path.c_str()));
+ llama_adapter_lora_ptr lora;
+ lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
  if (lora == nullptr) {
  LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
  llama_free(lctx);
@@ -944,17 +963,17 @@ struct common_init_result common_init_from_params(common_params & params) {
  }
 
  if (!params.lora_init_without_apply) {
- common_lora_adapters_apply(lctx, params.lora_adapters);
+ common_set_adapter_lora(lctx, params.lora_adapters);
  }
 
- if (params.sampling.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
- LOG_WRN("%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__);
+ if (params.sampling.ignore_eos && llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
+ LOG_WRN("%s: warning: vocab does not have an EOS token, ignoring --ignore-eos\n", __func__);
  params.sampling.ignore_eos = false;
  }
 
  if (params.sampling.ignore_eos) {
- for (llama_token i = 0; i < llama_n_vocab(model); i++) {
- if (llama_token_is_eog(model, i)) {
+ for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
+ if (llama_vocab_is_eog(vocab, i)) {
  LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
  params.sampling.logit_bias.push_back({i, -INFINITY});
  }
@@ -975,8 +994,9 @@ struct common_init_result common_init_from_params(common_params & params) {
  LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
 
  std::vector<llama_token> tmp;
- llama_token bos = llama_token_bos(model);
- llama_token eos = llama_token_eos(model);
+ llama_token bos = llama_vocab_bos(vocab);
+ llama_token eos = llama_vocab_eos(vocab);
+
  // some models (e.g. T5) don't have a BOS token
  if (bos != LLAMA_TOKEN_NULL) {
  tmp.push_back(bos);
@@ -1011,11 +1031,11 @@ struct common_init_result common_init_from_params(common_params & params) {
  return iparams;
  }
 
- void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_info> & lora) {
- llama_lora_adapter_clear(ctx);
+ void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora) {
+ llama_clear_adapter_lora(ctx);
  for (auto & la : lora) {
  if (la.scale != 0.0f) {
- llama_lora_adapter_set(ctx, la.ptr, la.scale);
+ llama_set_adapter_lora(ctx, la.ptr, la.scale);
  }
  }
  }
@@ -1033,7 +1053,6 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
  mparams.progress_callback_user_data = params.progress_callback_user_data;
  mparams.progress_callback = params.progress_callback;
  mparams.vocab_only = params.vocab_only;
- mparams.rpc_servers = params.rpc_servers.c_str();
  mparams.main_gpu = params.main_gpu;
  mparams.split_mode = params.split_mode;
  mparams.tensor_split = params.tensor_split;
@@ -1136,7 +1155,8 @@ static bool curl_perform_with_retry(const std::string & url, CURL * curl, int ma
 
  static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
  // Initialize libcurl
- std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
+ curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
+ curl_slist_ptr http_headers;
  if (!curl) {
  LOG_ERR("%s: error initializing libcurl\n", __func__);
  return false;
@@ -1150,11 +1170,9 @@ static bool common_download_file(const std::string & url, const std::string & pa
 
  // Check if hf-token or bearer-token was specified
  if (!hf_token.empty()) {
- std::string auth_header = "Authorization: Bearer ";
- auth_header += hf_token.c_str();
- struct curl_slist *http_headers = NULL;
- http_headers = curl_slist_append(http_headers, auth_header.c_str());
- curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers);
+ std::string auth_header = "Authorization: Bearer " + hf_token;
+ http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
+ curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
  }
 
  #if defined(_WIN32)
@@ -1450,6 +1468,80 @@ struct llama_model * common_load_model_from_hf(
  return common_load_model_from_url(model_url, local_path, hf_token, params);
  }
 
+ /**
+ * Allow getting the HF file from the HF repo with tag (like ollama), for example:
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s
+ * Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo)
+ *
+ * Return pair of <repo, file> (with "repo" already having tag removed)
+ *
+ * Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
+ */
+ std::pair<std::string, std::string> common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & hf_token) {
+ auto parts = string_split<std::string>(hf_repo_with_tag, ':');
+ std::string tag = parts.size() > 1 ? parts.back() : "latest";
+ std::string hf_repo = parts[0];
+ if (string_split<std::string>(hf_repo, '/').size() != 2) {
+ throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n");
+ }
+
+ // fetch model info from Hugging Face Hub API
+ json model_info;
+ curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
+ curl_slist_ptr http_headers;
+ std::string res_str;
+ std::string url = "https://huggingface.co/v2/" + hf_repo + "/manifests/" + tag;
+ curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
+ curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
+ typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
+ auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t {
+ static_cast<std::string *>(data)->append((char * ) ptr, size * nmemb);
+ return size * nmemb;
+ };
+ curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
+ curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_str);
+ #if defined(_WIN32)
+ curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
+ #endif
+ if (!hf_token.empty()) {
+ std::string auth_header = "Authorization: Bearer " + hf_token;
+ http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
+ }
+ // Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
+ http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
+ http_headers.ptr = curl_slist_append(http_headers.ptr, "Accept: application/json");
+ curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
+
+ CURLcode res = curl_easy_perform(curl.get());
+
+ if (res != CURLE_OK) {
+ throw std::runtime_error("error: cannot make GET request to HF API");
+ }
+
+ long res_code;
+ curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code);
+ if (res_code == 200) {
+ model_info = json::parse(res_str);
+ } else if (res_code == 401) {
+ throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token");
+ } else {
+ throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str()));
+ }
+
+ // check response
+ if (!model_info.contains("ggufFile")) {
+ throw std::runtime_error("error: model does not have ggufFile");
+ }
+ json & lm_gguf_file = model_info.at("ggufFile");
+ if (!lm_gguf_file.contains("rfilename")) {
+ throw std::runtime_error("error: ggufFile does not have rfilename");
+ }
+
+ return std::make_pair(hf_repo, lm_gguf_file.at("rfilename"));
+ }
+
  #else
 
  struct llama_model * common_load_model_from_url(
@@ -1471,6 +1563,11 @@ struct llama_model * common_load_model_from_hf(
  return nullptr;
  }
 
+ std::pair<std::string, std::string> common_get_hf_file(const std::string &, const std::string &) {
+ LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
+ return std::make_pair("", "");
+ }
+
  #endif // LLAMA_USE_CURL
 
  //
@@ -1569,21 +1666,23 @@ std::vector<llama_token> common_tokenize(
  const std::string & text,
  bool add_special,
  bool parse_special) {
- return common_tokenize(llama_get_model(ctx), text, add_special, parse_special);
+ const llama_model * model = llama_get_model(ctx);
+ const llama_vocab * vocab = llama_model_get_vocab(model);
+ return common_tokenize(vocab, text, add_special, parse_special);
  }
 
  std::vector<llama_token> common_tokenize(
- const struct llama_model * model,
+ const struct llama_vocab * vocab,
  const std::string & text,
  bool add_special,
  bool parse_special) {
  // upper limit for the number of tokens
  int n_tokens = text.length() + 2 * add_special;
  std::vector<llama_token> result(n_tokens);
- n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+ n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
  if (n_tokens < 0) {
  result.resize(-n_tokens);
- int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+ int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
  LM_GGML_ASSERT(check == -n_tokens);
  } else {
  result.resize(n_tokens);
@@ -1592,12 +1691,18 @@ std::vector<llama_token> common_tokenize(
  }
 
  std::string common_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
+ const llama_model * model = llama_get_model(ctx);
+ const llama_vocab * vocab = llama_model_get_vocab(model);
+ return common_token_to_piece(vocab, token, special);
+ }
+
+ std::string common_token_to_piece(const struct llama_vocab * vocab, llama_token token, bool special) {
  std::string piece;
  piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
- const int n_chars = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
+ const int n_chars = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
  if (n_chars < 0) {
  piece.resize(-n_chars);
- int check = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
+ int check = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
  LM_GGML_ASSERT(check == -n_chars);
  }
  else {
@@ -1607,13 +1712,19 @@ std::string common_token_to_piece(const struct llama_context * ctx, llama_token
  return piece;
  }
 
- std::string common_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
+ std::string common_detokenize(const struct llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
+ const llama_model * model = llama_get_model(ctx);
+ const llama_vocab * vocab = llama_model_get_vocab(model);
+ return common_detokenize(vocab, tokens, special);
+ }
+
+ std::string common_detokenize(const struct llama_vocab * vocab, const std::vector<llama_token> & tokens, bool special) {
  std::string text;
  text.resize(std::max(text.capacity(), tokens.size()));
- int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+ int32_t n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
  if (n_chars < 0) {
  text.resize(-n_chars);
- n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+ n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
  LM_GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization
  }
 
@@ -1628,20 +1739,13 @@ std::string common_detokenize(llama_context * ctx, const std::vector<llama_token
 
  //
  std::string common_get_builtin_chat_template(const struct llama_model * model) {
- static const char * template_key = "tokenizer.chat_template";
- // call with NULL buffer to get the total size of the string
- int32_t res = llama_model_meta_val_str(model, template_key, NULL, 0);
- if (res > 0) {
- std::vector<char> model_template(res + 1, 0);
- llama_model_meta_val_str(model, template_key, model_template.data(), model_template.size());
- return std::string(model_template.data(), model_template.size() - 1);
- }
- return "";
+ const char * ptr_tmpl = llama_model_chat_template(model);
+ return ptr_tmpl == nullptr ? "" : ptr_tmpl;
  }
 
  bool common_chat_verify_template(const std::string & tmpl) {
  llama_chat_message chat[] = {{"user", "test"}};
- int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
+ const int res = llama_chat_apply_template(tmpl.c_str(), chat, 1, true, nullptr, 0);
  return res >= 0;
  }
 
@@ -1652,16 +1756,16 @@ std::string common_chat_apply_template(const struct llama_model * model,
  int alloc_size = 0;
  bool fallback = false; // indicate if we must fallback to default chatml
  std::vector<llama_chat_message> chat;
- for (auto & msg : msgs) {
+ for (const auto & msg : msgs) {
  chat.push_back({msg.role.c_str(), msg.content.c_str()});
  alloc_size += (msg.role.size() + msg.content.size()) * 1.25;
  }
 
- const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str();
+ const char * ptr_tmpl = tmpl.empty() ? llama_model_chat_template(model) : tmpl.c_str();
  std::vector<char> buf(alloc_size);
 
  // run the first time to get the total output length
- int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+ int32_t res = llama_chat_apply_template(ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
 
  // error: chat template is not supported
  if (res < 0) {
@@ -1669,18 +1773,17 @@ std::string common_chat_apply_template(const struct llama_model * model,
  // if the custom "tmpl" is not supported, we throw an error
  // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
  throw std::runtime_error("this custom template is not supported");
- } else {
- // If the built-in template is not supported, we default to chatml
- res = llama_chat_apply_template(nullptr, "chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
- fallback = true;
  }
+
+ // If the built-in template is not supported, we default to chatml
+ res = llama_chat_apply_template("chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+ fallback = true;
  }
 
  // if it turns out that our buffer is too small, we resize it
  if ((size_t) res > buf.size()) {
  buf.resize(res);
  res = llama_chat_apply_template(
- fallback ? nullptr : model,
  fallback ? "chatml" : ptr_tmpl,
  chat.data(), chat.size(), add_ass, buf.data(), buf.size());
  }
package/cpp/common.h CHANGED
@@ -2,7 +2,6 @@
 
  #pragma once
 
- #include "llama-cpp.h"
  #include "llama-cpp.h"
 
  #include <string>
@@ -25,11 +24,11 @@
 
  #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
 
- struct common_lora_adapter_info {
+ struct common_adapter_lora_info {
  std::string path;
  float scale;
 
- struct llama_lora_adapter * ptr;
+ struct llama_adapter_lora * ptr;
  };
 
  using llama_tokens = std::vector<llama_token>;
@@ -115,6 +114,12 @@ enum dimre_method {
  DIMRE_METHOD_MEAN,
  };
 
+ enum common_conversation_mode {
+ COMMON_CONVERSATION_MODE_DISABLED = 0,
+ COMMON_CONVERSATION_MODE_ENABLED = 1,
+ COMMON_CONVERSATION_MODE_AUTO = 2,
+ };
+
  // sampling parameters
  struct common_params_sampling {
  uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
@@ -181,7 +186,11 @@ struct common_params_speculative {
  struct cpu_params cpuparams;
  struct cpu_params cpuparams_batch;
 
- std::string model = ""; // draft model for speculative decoding // NOLINT
+ std::string hf_repo = ""; // HF repo // NOLINT
+ std::string hf_file = ""; // HF file // NOLINT
+
+ std::string model = ""; // draft model for speculative decoding // NOLINT
+ std::string model_url = ""; // model url to download // NOLINT
  };
 
  struct common_params_vocoder {
@@ -190,6 +199,8 @@ struct common_params_vocoder {
 
  std::string model = ""; // model path // NOLINT
  std::string model_url = ""; // model url to download // NOLINT
+
+ bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy // NOLINT
  };
 
  struct common_params {
@@ -256,14 +267,13 @@ struct common_params {
  std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
  std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
  std::string logits_file = ""; // file for saving *all* logits // NOLINT
- std::string rpc_servers = ""; // comma separated list of RPC servers // NOLINT
 
  std::vector<std::string> in_files; // all input files
  std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
  std::vector<llama_model_kv_override> kv_overrides;
 
- bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply)
- std::vector<common_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale
+ bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
+ std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale
 
  std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale
 
@@ -291,7 +301,6 @@ struct common_params {
  bool special = false; // enable special token output
  bool interactive = false; // interactive mode
  bool interactive_first = false; // wait for user input immediately
- bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
  bool prompt_cache_all = false; // save user input and generations to prompt cache
  bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
 
@@ -317,6 +326,8 @@ struct common_params {
  lm_ggml_type cache_type_k = LM_GGML_TYPE_F16; // KV cache data type for the K
  lm_ggml_type cache_type_v = LM_GGML_TYPE_F16; // KV cache data type for the V
 
+ common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
+
  // multimodal models (see examples/llava)
  std::string mmproj = ""; // path to multimodal projector // NOLINT
  std::vector<std::string> image; // path to image file(s)
@@ -470,6 +481,11 @@ static bool string_starts_with(const std::string & str,
  return str.rfind(prefix, 0) == 0;
  }
 
+ static bool string_ends_with(const std::string & str,
+ const std::string & suffix) { // While we wait for C++20's std::string::ends_with...
+ return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
+ }
+
  bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
  void string_process_escapes(std::string & input);
 
@@ -497,7 +513,7 @@ struct common_init_result {
  llama_model_ptr model;
  llama_context_ptr context;
 
- std::vector<llama_lora_adapter_ptr> lora;
+ std::vector<llama_adapter_lora_ptr> lora;
  };
 
  struct common_init_result common_init_from_params(common_params & params);
@@ -511,6 +527,7 @@ struct llama_model * common_load_model_from_url(
  const std::string & local_path,
  const std::string & hf_token,
  const struct llama_model_params & params);
+
  struct llama_model * common_load_model_from_hf(
  const std::string & repo,
  const std::string & remote_path,
@@ -518,8 +535,12 @@ struct llama_model * common_load_model_from_hf(
  const std::string & hf_token,
  const struct llama_model_params & params);
 
+ std::pair<std::string, std::string> common_get_hf_file(
+ const std::string & hf_repo_with_tag,
+ const std::string & hf_token);
+
  // clear LoRA adapters from context, then apply new list of adapters
- void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_info> & lora);
+ void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
 
  //
  // Batch utils
@@ -557,7 +578,7 @@ std::vector<llama_token> common_tokenize(
  bool parse_special = false);
 
  std::vector<llama_token> common_tokenize(
- const struct llama_model * model,
+ const struct llama_vocab * vocab,
  const std::string & text,
  bool add_special,
  bool parse_special = false);
@@ -569,11 +590,21 @@ std::string common_token_to_piece(
  llama_token token,
  bool special = true);
 
+ std::string common_token_to_piece(
+ const struct llama_vocab * vocab,
+ llama_token token,
+ bool special = true);
+
  // detokenizes a vector of tokens into a string
  // should work similar to Python's `tokenizer.decode`
  // optionally renders special/control tokens
  std::string common_detokenize(
- llama_context * ctx,
+ const struct llama_context * ctx,
+ const std::vector<llama_token> & tokens,
+ bool special = true);
+
+ std::string common_detokenize(
+ const struct llama_vocab * vocab,
  const std::vector<llama_token> & tokens,
  bool special = true);
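
As declared above, the tokenization helpers gain llama_vocab overloads next to the existing llama_context ones, so callers holding only a model can tokenize without a context. A small sketch, assuming a model already loaded through this package's common helpers:

    #include "common.h"
    #include "llama.h"
    #include <string>
    #include <vector>

    // Round-trips a string through the new vocab-based overloads.
    static std::string roundtrip(const llama_model * model, const std::string & text) {
        const llama_vocab * vocab = llama_model_get_vocab(model);
        const std::vector<llama_token> toks =
            common_tokenize(vocab, text, /*add_special=*/true, /*parse_special=*/true);
        return common_detokenize(vocab, toks, /*special=*/true);
    }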