cui-llama.rn 1.2.3 → 1.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -11,8 +11,6 @@ The following features have been added for Android:
  - `vocab_only` mode: utilize the llama.cpp tokenizer
  - tokenizeSync: non-blocking, synchronous tokenizer function
  - Context Shift taken from [kobold.cpp](https://github.com/LostRuins/koboldcpp)
- - XTC sampling
- - Progress callback
  - Retrieving CPU Features to check for i8mm and dotprod flags

  Original repo README.md below.
@@ -18,6 +18,7 @@ set(
  ${RNLLAMA_LIB_DIR}/ggml-alloc.c
  ${RNLLAMA_LIB_DIR}/ggml-backend.cpp
  ${RNLLAMA_LIB_DIR}/ggml.c
+ ${RNLLAMA_LIB_DIR}/ggml-cpu.c
  ${RNLLAMA_LIB_DIR}/ggml-quants.c
  ${RNLLAMA_LIB_DIR}/common.cpp
  ${RNLLAMA_LIB_DIR}/json.hpp
@@ -248,8 +248,6 @@ public class LlamaContext {
  params.hasKey("xtc_t") ? (float) params.getDouble("xtc_t") : 0.00f,
  // float xtc_p,
  params.hasKey("xtc_p") ? (float) params.getDouble("xtc_p") : 0.00f,
- // float tfs_z,
- params.hasKey("tfs_z") ? (float) params.getDouble("tfs_z") : 1.00f,
  // float typical_p,
  params.hasKey("typical_p") ? (float) params.getDouble("typical_p") : 1.00f,
  // int seed,
@@ -438,7 +436,6 @@ public class LlamaContext {
  float min_p,
  float xtc_t,
  float xtc_p,
- float tfs_z,
  float typical_p,
  int seed,
  String[] stop,
@@ -156,7 +156,7 @@ Java_com_rnllama_LlamaContext_initContext(
  ) {
  UNUSED(thiz);

- gpt_params defaultParams;
+ common_params defaultParams;

  defaultParams.vocab_only = vocab_only;
  if(vocab_only) {
@@ -268,7 +268,7 @@ Java_com_rnllama_LlamaContext_getFormattedChat(
  UNUSED(thiz);
  auto llama = context_map[(long) context_ptr];

- std::vector<llama_chat_msg> chat;
+ std::vector<common_chat_msg> chat;

  int messages_len = env->GetArrayLength(messages);
  for (int i = 0; i < messages_len; i++) {
@@ -292,7 +292,7 @@ Java_com_rnllama_LlamaContext_getFormattedChat(
  }

  const char *tmpl_chars = env->GetStringUTFChars(chat_template, nullptr);
- std::string formatted_chat = llama_chat_apply_template(llama->model, tmpl_chars, chat, true);
+ std::string formatted_chat = common_chat_apply_template(llama->model, tmpl_chars, chat, true);

  return env->NewStringUTF(formatted_chat.c_str());
  }
@@ -399,7 +399,6 @@ Java_com_rnllama_LlamaContext_doCompletion(
  jfloat min_p,
  jfloat xtc_t,
  jfloat xtc_p,
- jfloat tfs_z,
  jfloat typical_p,
  jint seed,
  jobjectArray stop,
@@ -438,12 +437,11 @@ Java_com_rnllama_LlamaContext_doCompletion(
  sparams.top_k = top_k;
  sparams.top_p = top_p;
  sparams.min_p = min_p;
- sparams.tfs_z = tfs_z;
  sparams.typ_p = typical_p;
  sparams.n_probs = n_probs;
  sparams.grammar = env->GetStringUTFChars(grammar, nullptr);
- sparams.xtc_t = xtc_t;
- sparams.xtc_p = xtc_p;
+ sparams.xtc_threshold = xtc_t;
+ sparams.xtc_probability = xtc_p;

  sparams.logit_bias.clear();
  if (ignore_eos) {
@@ -497,7 +495,7 @@ Java_com_rnllama_LlamaContext_doCompletion(
  if (token_with_probs.tok == -1 || llama->incomplete) {
  continue;
  }
- const std::string token_text = llama_token_to_piece(llama->ctx, token_with_probs.tok);
+ const std::string token_text = common_token_to_piece(llama->ctx, token_with_probs.tok);

  size_t pos = std::min(sent_count, llama->generated_text.size());

@@ -532,7 +530,7 @@ Java_com_rnllama_LlamaContext_doCompletion(
  putString(env, tokenResult, "token", to_send.c_str());

  if (llama->params.sparams.n_probs > 0) {
- const std::vector<llama_token> to_send_toks = llama_tokenize(llama->ctx, to_send, false);
+ const std::vector<llama_token> to_send_toks = common_tokenize(llama->ctx, to_send, false);
  size_t probs_pos = std::min(sent_token_probs_index, llama->generated_token_probs.size());
  size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama->generated_token_probs.size());
  if (probs_pos < probs_stop_pos) {
@@ -607,7 +605,7 @@ Java_com_rnllama_LlamaContext_tokenize(

  const char *text_chars = env->GetStringUTFChars(text, nullptr);

- const std::vector<llama_token> toks = llama_tokenize(
+ const std::vector<llama_token> toks = common_tokenize(
  llama->ctx,
  text_chars,
  false
@@ -719,7 +717,7 @@ Java_com_rnllama_LlamaContext_freeContext(
  }
  if (llama->ctx_sampling != nullptr)
  {
- gpt_sampler_free(llama->ctx_sampling);
+ common_sampler_free(llama->ctx_sampling);
  }
  context_map.erase((long) llama->ctx);
  }
package/cpp/common.cpp CHANGED
@@ -12,6 +12,7 @@

  #include <algorithm>
  #include <cinttypes>
+ #include <climits>
  #include <cmath>
  #include <codecvt>
  #include <cstdarg>
@@ -23,10 +24,10 @@
  #include <regex>
  #include <sstream>
  #include <string>
+ #include <thread>
  #include <unordered_map>
  #include <unordered_set>
  #include <vector>
- #include <thread>

  #if defined(__APPLE__) && defined(__MACH__)
  #include <sys/types.h>
@@ -368,10 +369,10 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[LM_GGML_MAX_N_THR
  return true;
  }

- void gpt_init() {
+ void common_init() {
  llama_log_set([](lm_ggml_log_level level, const char * text, void * /*user_data*/) {
- if (LOG_DEFAULT_LLAMA <= gpt_log_verbosity_thold) {
- gpt_log_add(gpt_log_main(), level, "%s", text);
+ if (LOG_DEFAULT_LLAMA <= common_log_verbosity_thold) {
+ common_log_add(common_log_main(), level, "%s", text);
  }
  }, NULL);

@@ -384,7 +385,7 @@ void gpt_init() {
  LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type);
  }

- std::string gpt_params_get_system_info(const gpt_params & params) {
+ std::string common_params_get_system_info(const common_params & params) {
  std::ostringstream os;

  os << "system_info: n_threads = " << params.cpuparams.n_threads;
@@ -406,17 +407,19 @@ std::string gpt_params_get_system_info(const gpt_params & params) {
  // String utils
  //

- std::vector<std::string> string_split(std::string input, char separator) {
- std::vector<std::string> parts;
- size_t separator_pos = input.find(separator);
- while (separator_pos != std::string::npos) {
- std::string part = input.substr(0, separator_pos);
- parts.emplace_back(part);
- input = input.substr(separator_pos + 1);
- separator_pos = input.find(separator);
- }
- parts.emplace_back(input);
- return parts;
+ std::string string_format(const char * fmt, ...) {
+ va_list ap;
+ va_list ap2;
+ va_start(ap, fmt);
+ va_copy(ap2, ap);
+ int size = vsnprintf(NULL, 0, fmt, ap);
+ LM_GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
+ std::vector<char> buf(size + 1);
+ int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
+ LM_GGML_ASSERT(size2 == size);
+ va_end(ap2);
+ va_end(ap);
+ return std::string(buf.data(), size);
  }

  std::string string_strip(const std::string & str) {
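For reference, the new `string_format` helper (replacing `string_split` above) is a plain printf-style formatter. Below is a minimal standalone sketch of the same two-pass vsnprintf pattern, using `assert` in place of `LM_GGML_ASSERT`; the function name and example values are illustrative only:

```cpp
#include <cassert>
#include <cstdarg>
#include <cstdio>
#include <string>
#include <vector>

// Two-pass vsnprintf: the first call measures the output, the second writes it.
static std::string format_sketch(const char * fmt, ...) {
    va_list ap;
    va_list ap2;
    va_start(ap, fmt);
    va_copy(ap2, ap);
    const int size = vsnprintf(nullptr, 0, fmt, ap);              // measure
    assert(size >= 0);
    std::vector<char> buf(size + 1);
    const int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);  // write
    assert(size2 == size);
    va_end(ap2);
    va_end(ap);
    return std::string(buf.data(), size);
}

int main() {
    // Prints e.g. "loaded 291 tensors in 1.25 s"
    std::printf("%s\n", format_sketch("loaded %d tensors in %.2f s", 291, 1.25).c_str());
    return 0;
}
```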
@@ -499,7 +502,7 @@ std::string string_from(const struct llama_context * ctx, const std::vector<llam
  first = false;
  }

- auto detokenized = llama_token_to_piece(ctx, token);
+ auto detokenized = common_token_to_piece(ctx, token);

  detokenized.erase(
  std::remove_if(
@@ -530,7 +533,7 @@ std::string string_from(const struct llama_context * ctx, const struct llama_bat
  first = false;
  }

- auto detokenized = llama_token_to_piece(ctx, batch.token[i]);
+ auto detokenized = common_token_to_piece(ctx, batch.token[i]);

  detokenized.erase(
  std::remove_if(
@@ -825,16 +828,16 @@ std::string fs_get_cache_file(const std::string & filename) {
  //
  // Model utils
  //
- struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
- llama_init_result iparams;
- auto mparams = llama_model_params_from_gpt_params(params);
+ struct common_init_result common_init_from_params(common_params & params) {
+ common_init_result iparams;
+ auto mparams = common_model_params_to_llama(params);

  llama_model * model = nullptr;

  if (!params.hf_repo.empty() && !params.hf_file.empty()) {
- model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
+ model = common_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
  } else if (!params.model_url.empty()) {
- model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
+ model = common_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
  } else {
  model = llama_load_model_from_file(params.model.c_str(), mparams);
  }
@@ -869,7 +872,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
  }
  }

- auto cparams = llama_context_params_from_gpt_params(params);
+ auto cparams = common_context_params_to_llama(params);

  llama_context * lctx = llama_new_context_with_model(model, cparams);
  if (lctx == NULL) {
@@ -882,7 +885,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
  if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
  if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model);

- const auto cvec = llama_control_vector_load(params.control_vectors);
+ const auto cvec = common_control_vector_load(params.control_vectors);
  if (cvec.n_embd == -1) {
  llama_free(lctx);
  llama_free_model(model);
@@ -906,7 +909,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {

  // load and optionally apply lora adapters
  for (auto & la : params.lora_adapters) {
- llama_lora_adapter_container loaded_la;
+ common_lora_adapter_container loaded_la;
  loaded_la.path = la.path;
  loaded_la.scale = la.scale;
  loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
@@ -919,7 +922,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
  iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
  }
  if (!params.lora_init_without_apply) {
- llama_lora_adapters_apply(lctx, iparams.lora_adapters);
+ common_lora_adapters_apply(lctx, iparams.lora_adapters);
  }

  if (params.sparams.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
@@ -945,7 +948,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
  }

  if (llama_model_has_encoder(model)) {
- llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size(), 0, 0));
+ llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size()));
  llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
  if (decoder_start_token_id == -1) {
  decoder_start_token_id = bos;
@@ -954,7 +957,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
  tmp.push_back(decoder_start_token_id);
  }
  if (llama_model_has_decoder(model)) {
- llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
+ llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
  }
  llama_kv_cache_clear(lctx);
  llama_synchronize(lctx);
@@ -967,7 +970,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
  return iparams;
  }

- void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lora_adapter_container> & lora_adapters) {
+ void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters) {
  llama_lora_adapter_clear(ctx);
  for (auto & la : lora_adapters) {
  if (la.scale != 0.0f) {
@@ -976,7 +979,7 @@ void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lor
  }
  }

- struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) {
+ struct llama_model_params common_model_params_to_llama(const common_params & params) {
  auto mparams = llama_model_default_params();

  if (params.n_gpu_layers != -1) {
@@ -1029,10 +1032,10 @@ static lm_ggml_type kv_cache_type_from_str(const std::string & s) {
  return LM_GGML_TYPE_Q5_1;
  }

- throw std::runtime_error("Invalid cache type: " + s);
+ throw std::runtime_error("Unsupported cache type: " + s);
  }

- struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
+ struct llama_context_params common_context_params_to_llama(const common_params & params) {
  auto cparams = llama_context_default_params();

  cparams.n_ctx = params.n_ctx;
@@ -1041,7 +1044,7 @@ struct llama_context_params llama_context_params_from_gpt_param
  cparams.n_ubatch = params.n_ubatch;
  cparams.n_threads = params.cpuparams.n_threads;
  cparams.n_threads_batch = params.cpuparams_batch.n_threads == -1 ?
- params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
+ params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
  cparams.logits_all = params.logits_all;
  cparams.embeddings = params.embedding;
  cparams.rope_scaling_type = params.rope_scaling_type;
@@ -1122,7 +1125,7 @@ static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_
  return false;
  }

- static bool llama_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
+ static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {

  // Initialize libcurl
  std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
@@ -1192,15 +1195,15 @@ static bool llama_download_file(const std::string & url, const std::string & pat
  }

  // Send a HEAD request to retrieve the etag and last-modified headers
- struct llama_load_model_from_url_headers {
+ struct common_load_model_from_url_headers {
  std::string etag;
  std::string last_modified;
  };
- llama_load_model_from_url_headers headers;
+ common_load_model_from_url_headers headers;
  {
  typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
  auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
- llama_load_model_from_url_headers *headers = (llama_load_model_from_url_headers *) userdata;
+ common_load_model_from_url_headers *headers = (common_load_model_from_url_headers *) userdata;

  static std::regex header_regex("([^:]+): (.*)\r\n");
  static std::regex etag_regex("ETag", std::regex_constants::icase);
@@ -1336,7 +1339,7 @@ static bool llama_download_file(const std::string & url, const std::string & pat
  return true;
  }

- struct llama_model * llama_load_model_from_url(
+ struct llama_model * common_load_model_from_url(
  const char * model_url,
  const char * path_model,
  const char * hf_token,
@@ -1347,7 +1350,7 @@ struct llama_model * llama_load_model_from_url(
  return NULL;
  }

- if (!llama_download_file(model_url, path_model, hf_token)) {
+ if (!common_download_file(model_url, path_model, hf_token)) {
  return NULL;
  }

@@ -1400,7 +1403,7 @@ struct llama_model * llama_load_model_from_url(
  char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
  llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split);

- return llama_download_file(split_url, split_path, hf_token);
+ return common_download_file(split_url, split_path, hf_token);
  }, idx));
  }

@@ -1415,7 +1418,7 @@ struct llama_model * llama_load_model_from_url(
  return llama_load_model_from_file(path_model, params);
  }

- struct llama_model * llama_load_model_from_hf(
+ struct llama_model * common_load_model_from_hf(
  const char * repo,
  const char * model,
  const char * path_model,
@@ -1435,12 +1438,12 @@ struct llama_model * llama_load_model_from_hf(
  model_url += "/resolve/main/";
  model_url += model;

- return llama_load_model_from_url(model_url.c_str(), path_model, hf_token, params);
+ return common_load_model_from_url(model_url.c_str(), path_model, hf_token, params);
  }

  #else

- struct llama_model * llama_load_model_from_url(
+ struct llama_model * common_load_model_from_url(
  const char * /*model_url*/,
  const char * /*path_model*/,
  const char * /*hf_token*/,
@@ -1449,7 +1452,7 @@ struct llama_model * llama_load_model_from_url(
  return nullptr;
  }

- struct llama_model * llama_load_model_from_hf(
+ struct llama_model * common_load_model_from_hf(
  const char * /*repo*/,
  const char * /*model*/,
  const char * /*path_model*/,
@@ -1465,11 +1468,11 @@ struct llama_model * llama_load_model_from_hf(
  // Batch utils
  //

- void llama_batch_clear(struct llama_batch & batch) {
+ void common_batch_clear(struct llama_batch & batch) {
  batch.n_tokens = 0;
  }

- void llama_batch_add(
+ void common_batch_add(
  struct llama_batch & batch,
  llama_token id,
  llama_pos pos,
@@ -1492,15 +1495,15 @@ void llama_batch_add(
  // Vocab utils
  //

- std::vector<llama_token> llama_tokenize(
+ std::vector<llama_token> common_tokenize(
  const struct llama_context * ctx,
  const std::string & text,
  bool add_special,
  bool parse_special) {
- return llama_tokenize(llama_get_model(ctx), text, add_special, parse_special);
+ return common_tokenize(llama_get_model(ctx), text, add_special, parse_special);
  }

- std::vector<llama_token> llama_tokenize(
+ std::vector<llama_token> common_tokenize(
  const struct llama_model * model,
  const std::string & text,
  bool add_special,
@@ -1519,7 +1522,7 @@ std::vector<llama_token> llama_tokenize(
  return result;
  }

- std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
+ std::string common_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
  std::string piece;
  piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
  const int n_chars = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
@@ -1535,7 +1538,7 @@ std::string llama_token_to_piece(const struct llama_context * ctx, llama_token t
  return piece;
  }

- std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
+ std::string common_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
  std::string text;
  text.resize(std::max(text.capacity(), tokens.size()));
  int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
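The JNI bindings earlier in this diff call the renamed wrappers directly (`common_tokenize`, `common_token_to_piece`). Below is a hedged usage sketch against the signatures shown above, assuming a `llama_context` has already been created (for example via `common_init_from_params`) and that this package's `common.h` is on the include path:

```cpp
#include <cstdio>
#include <string>
#include <vector>

#include "common.h"   // declares common_tokenize / common_token_to_piece / common_detokenize
#include "llama.h"

// Tokenizes a prompt, prints the per-token pieces (streaming-style usage),
// then detokenizes the whole sequence in one call.
std::string roundtrip_prompt(llama_context * ctx, const std::string & prompt) {
    std::vector<llama_token> toks =
        common_tokenize(ctx, prompt, /*add_special=*/true, /*parse_special=*/false);

    for (llama_token tok : toks) {
        std::printf("piece: '%s'\n", common_token_to_piece(ctx, tok).c_str());
    }

    return common_detokenize(ctx, toks, /*special=*/false);
}
```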
@@ -1555,15 +1558,15 @@ std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token>
  // Chat template utils
  //

- bool llama_chat_verify_template(const std::string & tmpl) {
+ bool common_chat_verify_template(const std::string & tmpl) {
  llama_chat_message chat[] = {{"user", "test"}};
  int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
  return res >= 0;
  }

- std::string llama_chat_apply_template(const struct llama_model * model,
+ std::string common_chat_apply_template(const struct llama_model * model,
  const std::string & tmpl,
- const std::vector<llama_chat_msg> & msgs,
+ const std::vector<common_chat_msg> & msgs,
  bool add_ass) {
  int alloc_size = 0;
  bool fallback = false; // indicate if we must fallback to default chatml
@@ -1605,42 +1608,42 @@ std::string llama_chat_apply_template(const struct llama_model * model,
  return formatted_chat;
  }

- std::string llama_chat_format_single(const struct llama_model * model,
+ std::string common_chat_format_single(const struct llama_model * model,
  const std::string & tmpl,
- const std::vector<llama_chat_msg> & past_msg,
- const llama_chat_msg & new_msg,
+ const std::vector<common_chat_msg> & past_msg,
+ const common_chat_msg & new_msg,
  bool add_ass) {
  std::ostringstream ss;
- auto fmt_past_msg = past_msg.empty() ? "" : llama_chat_apply_template(model, tmpl, past_msg, false);
- std::vector<llama_chat_msg> chat_new(past_msg);
+ auto fmt_past_msg = past_msg.empty() ? "" : common_chat_apply_template(model, tmpl, past_msg, false);
+ std::vector<common_chat_msg> chat_new(past_msg);
  // if the past_msg ends with a newline, we must preserve it in the formatted version
  if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') {
  ss << "\n";
  };
  // format chat with new_msg
  chat_new.push_back(new_msg);
- auto fmt_new_msg = llama_chat_apply_template(model, tmpl, chat_new, add_ass);
+ auto fmt_new_msg = common_chat_apply_template(model, tmpl, chat_new, add_ass);
  // get the diff part
  ss << fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
  return ss.str();
  }

- std::string llama_chat_format_example(const struct llama_model * model,
+ std::string common_chat_format_example(const struct llama_model * model,
  const std::string & tmpl) {
- std::vector<llama_chat_msg> msgs = {
+ std::vector<common_chat_msg> msgs = {
  {"system", "You are a helpful assistant"},
  {"user", "Hello"},
  {"assistant", "Hi there"},
  {"user", "How are you?"},
  };
- return llama_chat_apply_template(model, tmpl, msgs, true);
+ return common_chat_apply_template(model, tmpl, msgs, true);
  }

  //
  // KV cache utils
  //

- void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
+ void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
  static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+";

  printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d",
@@ -1663,7 +1666,7 @@ void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
  printf("\n=== Done dumping\n");
  }

- void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) {
+ void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) {
  static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";

  printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n",
@@ -1715,7 +1718,7 @@ void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_siz
  // Embedding utils
  //

- void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm) {
+ void common_embd_normalize(const float * inp, float * out, int n, int embd_norm) {
  double sum = 0.0;

  switch (embd_norm) {
@@ -1749,7 +1752,7 @@ void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm)
  }
  }

- float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n){
+ float common_embd_similarity_cos(const float * embd1, const float * embd2, int n){
  double sum = 0.0;
  double sum1 = 0.0;
  double sum2 = 0.0;
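`common_embd_similarity_cos` is a plain cosine similarity over two embedding vectors. A small self-contained sketch of the same computation (illustrative only, not the package's code path):

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

// Cosine similarity: dot(a, b) / (||a|| * ||b||), with 0 for degenerate inputs.
static float cosine_similarity(const std::vector<float> & a, const std::vector<float> & b) {
    double dot = 0.0, na = 0.0, nb = 0.0;
    for (size_t i = 0; i < a.size() && i < b.size(); ++i) {
        dot += a[i] * b[i];
        na  += a[i] * a[i];
        nb  += b[i] * b[i];
    }
    if (na == 0.0 || nb == 0.0) {
        return 0.0f;
    }
    return (float) (dot / (std::sqrt(na) * std::sqrt(nb)));
}

int main() {
    std::vector<float> e1 = {0.1f, 0.7f, 0.2f};
    std::vector<float> e2 = {0.1f, 0.6f, 0.3f};
    std::printf("similarity: %.3f\n", cosine_similarity(e1, e2));
    return 0;
}
```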
@@ -1775,8 +1778,8 @@ float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n)
  // Control vector utils
  //

- static llama_control_vector_data llama_control_vector_load_one(const llama_control_vector_load_info & load_info) {
- llama_control_vector_data result = { -1, {} };
+ static common_control_vector_data common_control_vector_load_one(const common_control_vector_load_info & load_info) {
+ common_control_vector_data result = { -1, {} };

  lm_ggml_context * ctx = nullptr;
  struct lm_gguf_init_params meta_lm_gguf_params = {
@@ -1860,11 +1863,11 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr
  return result;
  }

- llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos) {
- llama_control_vector_data result = { -1, {} };
+ common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos) {
+ common_control_vector_data result = { -1, {} };

  for (const auto & info : load_infos) {
- auto cur = llama_control_vector_load_one(info);
+ auto cur = common_control_vector_load_one(info);

  if (cur.n_embd == -1) {
  result.n_embd = -1;
@@ -1956,8 +1959,10 @@ void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const cha
  }
  }

- void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const llama_context * lctx,
+ void yaml_dump_non_result_info(FILE * stream, const common_params & params, const llama_context * lctx,
  const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
+ lm_ggml_cpu_init(); // some ARM features are detected at runtime
+
  const auto & sparams = params.sparams;

  fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT);
@@ -2013,6 +2018,10 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
  fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
  fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
  fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
+ fprintf(stream, "dry_allowed_length: %d # default: 2\n", sparams.dry_allowed_length);
+ fprintf(stream, "dry_base: %.2f # default: 1.75\n", sparams.dry_base);
+ fprintf(stream, "dry_multiplier: %.1f # default: 0.0\n", sparams.dry_multiplier);
+ fprintf(stream, "dry_penalty_last_n: %d # default: -1 (0 = disable, -1 = context size)\n", sparams.dry_penalty_last_n);
  fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
  fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
  fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.penalty_freq);
@@ -2093,11 +2102,12 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
  const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices());
  yaml_dump_vector_float(stream, "tensor_split", tensor_split_vector);

- fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
  fprintf(stream, "threads: %d # default: %u\n", params.cpuparams.n_threads, std::thread::hardware_concurrency());
  fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
  fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
  fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
+ fprintf(stream, "xtc_probability: %f # default: 0.0\n", sparams.xtc_probability);
+ fprintf(stream, "xtc_threshold: %f # default: 0.1\n", sparams.xtc_threshold);
  fprintf(stream, "typ_p: %f # default: 1.0\n", sparams.typ_p);
  fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
  fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false");