cui-llama.rn 1.2.2 → 1.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -12,6 +12,7 @@ import android.util.Log;
  import android.os.Build;
  import android.os.ParcelFileDescriptor;
  import android.net.Uri;
+ import android.content.Intent;
  import android.content.res.AssetManager;

  import java.lang.StringBuilder;
@@ -39,11 +40,12 @@ public class LlamaContext {
  InputStream fis = null;
  try {
  if (filepath.startsWith("content")) {
- fis = reactContext.getApplicationContext().getContentResolver().openInputStream(Uri.parse(filepath));
+ Uri uri = Uri.parse(filepath);
+ reactContext.getApplicationContext().getContentResolver().takePersistableUriPermission(uri, Intent.FLAG_GRANT_READ_URI_PERMISSION);
+ fis = reactContext.getApplicationContext().getContentResolver().openInputStream(uri);
  } else {
  fis = new FileInputStream(filepath);
  }
-

  int bytesRead = fis.read(fileHeader);
  if(bytesRead < 4) {
@@ -55,6 +57,7 @@ public class LlamaContext {
  }
  return true;
  } catch (Exception e) {
+ Log.e(NAME, "Failed to check GGUF: " + e.getMessage());
  return false;
  }finally {
  if (fis != null) {
@@ -156,7 +156,7 @@ Java_com_rnllama_LlamaContext_initContext(
  ) {
  UNUSED(thiz);

- gpt_params defaultParams;
+ common_params defaultParams;

  defaultParams.vocab_only = vocab_only;
  if(vocab_only) {
@@ -268,7 +268,7 @@ Java_com_rnllama_LlamaContext_getFormattedChat(
  UNUSED(thiz);
  auto llama = context_map[(long) context_ptr];

- std::vector<llama_chat_msg> chat;
+ std::vector<common_chat_msg> chat;

  int messages_len = env->GetArrayLength(messages);
  for (int i = 0; i < messages_len; i++) {
@@ -292,7 +292,7 @@ Java_com_rnllama_LlamaContext_getFormattedChat(
  }

  const char *tmpl_chars = env->GetStringUTFChars(chat_template, nullptr);
- std::string formatted_chat = llama_chat_apply_template(llama->model, tmpl_chars, chat, true);
+ std::string formatted_chat = common_chat_apply_template(llama->model, tmpl_chars, chat, true);

  return env->NewStringUTF(formatted_chat.c_str());
  }
@@ -497,7 +497,7 @@ Java_com_rnllama_LlamaContext_doCompletion(
  if (token_with_probs.tok == -1 || llama->incomplete) {
  continue;
  }
- const std::string token_text = llama_token_to_piece(llama->ctx, token_with_probs.tok);
+ const std::string token_text = common_token_to_piece(llama->ctx, token_with_probs.tok);

  size_t pos = std::min(sent_count, llama->generated_text.size());

@@ -532,7 +532,7 @@ Java_com_rnllama_LlamaContext_doCompletion(
  putString(env, tokenResult, "token", to_send.c_str());

  if (llama->params.sparams.n_probs > 0) {
- const std::vector<llama_token> to_send_toks = llama_tokenize(llama->ctx, to_send, false);
+ const std::vector<llama_token> to_send_toks = common_tokenize(llama->ctx, to_send, false);
  size_t probs_pos = std::min(sent_token_probs_index, llama->generated_token_probs.size());
  size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama->generated_token_probs.size());
  if (probs_pos < probs_stop_pos) {
@@ -607,7 +607,7 @@ Java_com_rnllama_LlamaContext_tokenize(

  const char *text_chars = env->GetStringUTFChars(text, nullptr);

- const std::vector<llama_token> toks = llama_tokenize(
+ const std::vector<llama_token> toks = common_tokenize(
  llama->ctx,
  text_chars,
  false
@@ -719,7 +719,7 @@ Java_com_rnllama_LlamaContext_freeContext(
  }
  if (llama->ctx_sampling != nullptr)
  {
- gpt_sampler_free(llama->ctx_sampling);
+ common_sampler_free(llama->ctx_sampling);
  }
  context_map.erase((long) llama->ctx);
  }

package/cpp/common.cpp CHANGED
@@ -12,6 +12,7 @@

  #include <algorithm>
  #include <cinttypes>
+ #include <climits>
  #include <cmath>
  #include <codecvt>
  #include <cstdarg>
@@ -23,10 +24,10 @@
  #include <regex>
  #include <sstream>
  #include <string>
+ #include <thread>
  #include <unordered_map>
  #include <unordered_set>
  #include <vector>
- #include <thread>

  #if defined(__APPLE__) && defined(__MACH__)
  #include <sys/types.h>
@@ -368,10 +369,10 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[LM_GGML_MAX_N_THR
  return true;
  }

- void gpt_init() {
+ void common_init() {
  llama_log_set([](lm_ggml_log_level level, const char * text, void * /*user_data*/) {
- if (LOG_DEFAULT_LLAMA <= gpt_log_verbosity_thold) {
- gpt_log_add(gpt_log_main(), level, "%s", text);
+ if (LOG_DEFAULT_LLAMA <= common_log_verbosity_thold) {
+ common_log_add(common_log_main(), level, "%s", text);
  }
  }, NULL);

@@ -384,7 +385,7 @@ void gpt_init() {
  LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type);
  }

- std::string gpt_params_get_system_info(const gpt_params & params) {
+ std::string common_params_get_system_info(const common_params & params) {
  std::ostringstream os;

  os << "system_info: n_threads = " << params.cpuparams.n_threads;
@@ -406,6 +407,21 @@ std::string gpt_params_get_system_info(const gpt_params & params) {
  // String utils
  //

+ std::string string_format(const char * fmt, ...) {
+ va_list ap;
+ va_list ap2;
+ va_start(ap, fmt);
+ va_copy(ap2, ap);
+ int size = vsnprintf(NULL, 0, fmt, ap);
+ LM_GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
+ std::vector<char> buf(size + 1);
+ int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
+ LM_GGML_ASSERT(size2 == size);
+ va_end(ap2);
+ va_end(ap);
+ return std::string(buf.data(), size);
+ }
+
  std::vector<std::string> string_split(std::string input, char separator) {
  std::vector<std::string> parts;
  size_t separator_pos = input.find(separator);
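
Note: the string_format() helper added in the hunk above is a printf-style formatter that returns a std::string. As an illustration only (not code from this package), here is a minimal standalone sketch of the same two-pass vsnprintf technique, with the library's LM_GGML_ASSERT swapped for plain assert and the helper renamed to keep it clearly hypothetical:

    #include <cassert>
    #include <climits>
    #include <cstdarg>
    #include <cstdio>
    #include <string>
    #include <vector>

    // First vsnprintf call measures the required length; the second call
    // writes into a buffer of exactly that size.
    static std::string format_string(const char * fmt, ...) {
        va_list ap;
        va_list ap2;
        va_start(ap, fmt);
        va_copy(ap2, ap);
        int size = vsnprintf(nullptr, 0, fmt, ap);
        assert(size >= 0 && size < INT_MAX);
        std::vector<char> buf(size + 1);
        int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
        assert(size2 == size);
        va_end(ap2);
        va_end(ap);
        return std::string(buf.data(), size);
    }

    int main() {
        std::string msg = format_string("loaded %d/%d shards (%.1f%%)", 3, 7, 100.0 * 3 / 7);
        printf("%s\n", msg.c_str()); // prints: loaded 3/7 shards (42.9%)
        return 0;
    }
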
@@ -499,7 +515,7 @@ std::string string_from(const struct llama_context * ctx, const std::vector<llam
  first = false;
  }

- auto detokenized = llama_token_to_piece(ctx, token);
+ auto detokenized = common_token_to_piece(ctx, token);

  detokenized.erase(
  std::remove_if(
@@ -530,7 +546,7 @@ std::string string_from(const struct llama_context * ctx, const struct llama_bat
  first = false;
  }

- auto detokenized = llama_token_to_piece(ctx, batch.token[i]);
+ auto detokenized = common_token_to_piece(ctx, batch.token[i]);

  detokenized.erase(
  std::remove_if(
@@ -825,16 +841,16 @@ std::string fs_get_cache_file(const std::string & filename) {
  //
  // Model utils
  //
- struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
- llama_init_result iparams;
- auto mparams = llama_model_params_from_gpt_params(params);
+ struct common_init_result common_init_from_params(common_params & params) {
+ common_init_result iparams;
+ auto mparams = common_model_params_to_llama(params);

  llama_model * model = nullptr;

  if (!params.hf_repo.empty() && !params.hf_file.empty()) {
- model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
+ model = common_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
  } else if (!params.model_url.empty()) {
- model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
+ model = common_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
  } else {
  model = llama_load_model_from_file(params.model.c_str(), mparams);
  }
@@ -869,7 +885,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
  }
  }

- auto cparams = llama_context_params_from_gpt_params(params);
+ auto cparams = common_context_params_to_llama(params);

  llama_context * lctx = llama_new_context_with_model(model, cparams);
  if (lctx == NULL) {
@@ -882,7 +898,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
  if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
  if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model);

- const auto cvec = llama_control_vector_load(params.control_vectors);
+ const auto cvec = common_control_vector_load(params.control_vectors);
  if (cvec.n_embd == -1) {
  llama_free(lctx);
  llama_free_model(model);
@@ -906,7 +922,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {

  // load and optionally apply lora adapters
  for (auto & la : params.lora_adapters) {
- llama_lora_adapter_container loaded_la;
+ common_lora_adapter_container loaded_la;
  loaded_la.path = la.path;
  loaded_la.scale = la.scale;
  loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
@@ -919,7 +935,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
  iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
  }
  if (!params.lora_init_without_apply) {
- llama_lora_adapters_apply(lctx, iparams.lora_adapters);
+ common_lora_adapters_apply(lctx, iparams.lora_adapters);
  }

  if (params.sparams.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
@@ -945,7 +961,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
  }

  if (llama_model_has_encoder(model)) {
- llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size(), 0, 0));
+ llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size()));
  llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
  if (decoder_start_token_id == -1) {
  decoder_start_token_id = bos;
@@ -954,7 +970,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
  tmp.push_back(decoder_start_token_id);
  }
  if (llama_model_has_decoder(model)) {
- llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
+ llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
  }
  llama_kv_cache_clear(lctx);
  llama_synchronize(lctx);
@@ -967,7 +983,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
  return iparams;
  }

- void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lora_adapter_container> & lora_adapters) {
+ void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters) {
  llama_lora_adapter_clear(ctx);
  for (auto & la : lora_adapters) {
  if (la.scale != 0.0f) {
@@ -976,7 +992,7 @@ void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lor
  }
  }

- struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) {
+ struct llama_model_params common_model_params_to_llama(const common_params & params) {
  auto mparams = llama_model_default_params();

  if (params.n_gpu_layers != -1) {
@@ -1029,10 +1045,10 @@ static lm_ggml_type kv_cache_type_from_str(const std::string & s) {
  return LM_GGML_TYPE_Q5_1;
  }

- throw std::runtime_error("Invalid cache type: " + s);
+ throw std::runtime_error("Unsupported cache type: " + s);
  }

- struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
+ struct llama_context_params common_context_params_to_llama(const common_params & params) {
  auto cparams = llama_context_default_params();

  cparams.n_ctx = params.n_ctx;
@@ -1041,7 +1057,7 @@ struct llama_context_params llama_context_params_from_gpt_param(
  cparams.n_ubatch = params.n_ubatch;
  cparams.n_threads = params.cpuparams.n_threads;
  cparams.n_threads_batch = params.cpuparams_batch.n_threads == -1 ?
- params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
+ params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
  cparams.logits_all = params.logits_all;
  cparams.embeddings = params.embedding;
  cparams.rope_scaling_type = params.rope_scaling_type;
@@ -1122,7 +1138,7 @@ static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_
  return false;
  }

- static bool llama_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
+ static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {

  // Initialize libcurl
  std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
@@ -1192,15 +1208,15 @@ static bool llama_download_file(const std::string & url, const std::string & pat
  }

  // Send a HEAD request to retrieve the etag and last-modified headers
- struct llama_load_model_from_url_headers {
+ struct common_load_model_from_url_headers {
  std::string etag;
  std::string last_modified;
  };
- llama_load_model_from_url_headers headers;
+ common_load_model_from_url_headers headers;
  {
  typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
  auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
- llama_load_model_from_url_headers *headers = (llama_load_model_from_url_headers *) userdata;
+ common_load_model_from_url_headers *headers = (common_load_model_from_url_headers *) userdata;

  static std::regex header_regex("([^:]+): (.*)\r\n");
  static std::regex etag_regex("ETag", std::regex_constants::icase);
@@ -1336,7 +1352,7 @@ static bool llama_download_file(const std::string & url, const std::string & pat
  return true;
  }

- struct llama_model * llama_load_model_from_url(
+ struct llama_model * common_load_model_from_url(
  const char * model_url,
  const char * path_model,
  const char * hf_token,
@@ -1347,7 +1363,7 @@ struct llama_model * llama_load_model_from_url(
  return NULL;
  }

- if (!llama_download_file(model_url, path_model, hf_token)) {
+ if (!common_download_file(model_url, path_model, hf_token)) {
  return NULL;
  }

@@ -1400,7 +1416,7 @@ struct llama_model * llama_load_model_from_url(
  char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
  llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split);

- return llama_download_file(split_url, split_path, hf_token);
+ return common_download_file(split_url, split_path, hf_token);
  }, idx));
  }

@@ -1415,7 +1431,7 @@ struct llama_model * llama_load_model_from_url(
  return llama_load_model_from_file(path_model, params);
  }

- struct llama_model * llama_load_model_from_hf(
+ struct llama_model * common_load_model_from_hf(
  const char * repo,
  const char * model,
  const char * path_model,
@@ -1435,12 +1451,12 @@ struct llama_model * llama_load_model_from_hf(
  model_url += "/resolve/main/";
  model_url += model;

- return llama_load_model_from_url(model_url.c_str(), path_model, hf_token, params);
+ return common_load_model_from_url(model_url.c_str(), path_model, hf_token, params);
  }

  #else

- struct llama_model * llama_load_model_from_url(
+ struct llama_model * common_load_model_from_url(
  const char * /*model_url*/,
  const char * /*path_model*/,
  const char * /*hf_token*/,
@@ -1449,7 +1465,7 @@ struct llama_model * llama_load_model_from_url(
  return nullptr;
  }

- struct llama_model * llama_load_model_from_hf(
+ struct llama_model * common_load_model_from_hf(
  const char * /*repo*/,
  const char * /*model*/,
  const char * /*path_model*/,
@@ -1465,11 +1481,11 @@ struct llama_model * llama_load_model_from_hf(
  // Batch utils
  //

- void llama_batch_clear(struct llama_batch & batch) {
+ void common_batch_clear(struct llama_batch & batch) {
  batch.n_tokens = 0;
  }

- void llama_batch_add(
+ void common_batch_add(
  struct llama_batch & batch,
  llama_token id,
  llama_pos pos,
@@ -1492,15 +1508,15 @@ void llama_batch_add(
  // Vocab utils
  //

- std::vector<llama_token> llama_tokenize(
+ std::vector<llama_token> common_tokenize(
  const struct llama_context * ctx,
  const std::string & text,
  bool add_special,
  bool parse_special) {
- return llama_tokenize(llama_get_model(ctx), text, add_special, parse_special);
+ return common_tokenize(llama_get_model(ctx), text, add_special, parse_special);
  }

- std::vector<llama_token> llama_tokenize(
+ std::vector<llama_token> common_tokenize(
  const struct llama_model * model,
  const std::string & text,
  bool add_special,
@@ -1519,7 +1535,7 @@ std::vector<llama_token> llama_tokenize(
  return result;
  }

- std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
+ std::string common_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
  std::string piece;
  piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
  const int n_chars = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
@@ -1535,7 +1551,7 @@ std::string llama_token_to_piece(const struct llama_context * ctx, llama_token t
  return piece;
  }

- std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
+ std::string common_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
  std::string text;
  text.resize(std::max(text.capacity(), tokens.size()));
  int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
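
Note: the hunks above rename llama_init_from_gpt_params(), llama_tokenize(), llama_token_to_piece() and llama_detokenize() to common_-prefixed equivalents. The following is a hedged usage sketch only, not code from this package: the model path is a placeholder, and the `model`/`context` member names of common_init_result are assumed from upstream llama.cpp's common library rather than shown in this diff.

    #include <cstdio>
    #include <string>
    #include <vector>

    #include "common.h"
    #include "llama.h"

    int main() {
        common_params params;                 // formerly gpt_params
        params.model = "/path/to/model.gguf"; // placeholder model path

        // common_init_from_params() replaces llama_init_from_gpt_params();
        // the .model/.context fields are assumed from upstream llama.cpp.
        common_init_result init = common_init_from_params(params);
        if (init.model == nullptr || init.context == nullptr) {
            fprintf(stderr, "failed to load model\n");
            return 1;
        }

        // Round-trip through the renamed vocab helpers shown above.
        std::vector<llama_token> toks =
            common_tokenize(init.context, "Hello, world!", /*add_special=*/true, /*parse_special=*/false);
        std::string round_trip = common_detokenize(init.context, toks, /*special=*/false);
        printf("%zu tokens -> \"%s\"\n", toks.size(), round_trip.c_str());

        llama_free(init.context);
        llama_free_model(init.model);
        return 0;
    }
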
@@ -1555,15 +1571,15 @@ std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token>
  // Chat template utils
  //

- bool llama_chat_verify_template(const std::string & tmpl) {
+ bool common_chat_verify_template(const std::string & tmpl) {
  llama_chat_message chat[] = {{"user", "test"}};
  int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
  return res >= 0;
  }

- std::string llama_chat_apply_template(const struct llama_model * model,
+ std::string common_chat_apply_template(const struct llama_model * model,
  const std::string & tmpl,
- const std::vector<llama_chat_msg> & msgs,
+ const std::vector<common_chat_msg> & msgs,
  bool add_ass) {
  int alloc_size = 0;
  bool fallback = false; // indicate if we must fallback to default chatml
@@ -1605,42 +1621,42 @@ std::string llama_chat_apply_template(const struct llama_model * model,
  return formatted_chat;
  }

- std::string llama_chat_format_single(const struct llama_model * model,
+ std::string common_chat_format_single(const struct llama_model * model,
  const std::string & tmpl,
- const std::vector<llama_chat_msg> & past_msg,
- const llama_chat_msg & new_msg,
+ const std::vector<common_chat_msg> & past_msg,
+ const common_chat_msg & new_msg,
  bool add_ass) {
  std::ostringstream ss;
- auto fmt_past_msg = past_msg.empty() ? "" : llama_chat_apply_template(model, tmpl, past_msg, false);
- std::vector<llama_chat_msg> chat_new(past_msg);
+ auto fmt_past_msg = past_msg.empty() ? "" : common_chat_apply_template(model, tmpl, past_msg, false);
+ std::vector<common_chat_msg> chat_new(past_msg);
  // if the past_msg ends with a newline, we must preserve it in the formatted version
  if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') {
  ss << "\n";
  };
  // format chat with new_msg
  chat_new.push_back(new_msg);
- auto fmt_new_msg = llama_chat_apply_template(model, tmpl, chat_new, add_ass);
+ auto fmt_new_msg = common_chat_apply_template(model, tmpl, chat_new, add_ass);
  // get the diff part
  ss << fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
  return ss.str();
  }

- std::string llama_chat_format_example(const struct llama_model * model,
+ std::string common_chat_format_example(const struct llama_model * model,
  const std::string & tmpl) {
- std::vector<llama_chat_msg> msgs = {
+ std::vector<common_chat_msg> msgs = {
  {"system", "You are a helpful assistant"},
  {"user", "Hello"},
  {"assistant", "Hi there"},
  {"user", "How are you?"},
  };
- return llama_chat_apply_template(model, tmpl, msgs, true);
+ return common_chat_apply_template(model, tmpl, msgs, true);
  }

  //
  // KV cache utils
  //

- void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
+ void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
  static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+";

  printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d",
@@ -1663,7 +1679,7 @@ void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
  printf("\n=== Done dumping\n");
  }

- void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) {
+ void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) {
  static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";

  printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n",
@@ -1715,7 +1731,7 @@ void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_siz
  // Embedding utils
  //

- void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm) {
+ void common_embd_normalize(const float * inp, float * out, int n, int embd_norm) {
  double sum = 0.0;

  switch (embd_norm) {
@@ -1749,7 +1765,7 @@ void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm)
  }
  }

- float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n){
+ float common_embd_similarity_cos(const float * embd1, const float * embd2, int n){
  double sum = 0.0;
  double sum1 = 0.0;
  double sum2 = 0.0;
@@ -1775,8 +1791,8 @@ float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n)
  // Control vector utils
  //

- static llama_control_vector_data llama_control_vector_load_one(const llama_control_vector_load_info & load_info) {
- llama_control_vector_data result = { -1, {} };
+ static common_control_vector_data common_control_vector_load_one(const common_control_vector_load_info & load_info) {
+ common_control_vector_data result = { -1, {} };

  lm_ggml_context * ctx = nullptr;
  struct lm_gguf_init_params meta_lm_gguf_params = {
@@ -1860,11 +1876,11 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr
  return result;
  }

- llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos) {
- llama_control_vector_data result = { -1, {} };
+ common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos) {
+ common_control_vector_data result = { -1, {} };

  for (const auto & info : load_infos) {
- auto cur = llama_control_vector_load_one(info);
+ auto cur = common_control_vector_load_one(info);

  if (cur.n_embd == -1) {
  result.n_embd = -1;
@@ -1956,7 +1972,7 @@ void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const cha
  }
  }

- void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const llama_context * lctx,
+ void yaml_dump_non_result_info(FILE * stream, const common_params & params, const llama_context * lctx,
  const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
  const auto & sparams = params.sparams;

@@ -2098,6 +2114,8 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
  fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
  fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
  fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
+ fprintf(stream, "xtc_probability: %f # default: 0.0\n", sparams.xtc_probability);
+ fprintf(stream, "xtc_threshold: %f # default: 0.1\n", sparams.xtc_threshold);
  fprintf(stream, "typ_p: %f # default: 1.0\n", sparams.typ_p);
  fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
  fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false");