cui-llama.rn 1.2.2 → 1.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/java/com/rnllama/LlamaContext.java +5 -2
- package/android/src/main/jni.cpp +7 -7
- package/cpp/common.cpp +81 -63
- package/cpp/common.h +79 -62
- package/cpp/ggml-alloc.c +17 -19
- package/cpp/ggml-backend.cpp +59 -24
- package/cpp/ggml-impl.h +8 -0
- package/cpp/ggml.c +65 -23
- package/cpp/ggml.h +1 -0
- package/cpp/json-schema-to-grammar.cpp +1 -1
- package/cpp/llama-sampling.cpp +366 -24
- package/cpp/llama-sampling.h +3 -2
- package/cpp/llama-vocab.cpp +33 -9
- package/cpp/llama-vocab.h +30 -11
- package/cpp/llama.cpp +471 -387
- package/cpp/llama.h +52 -21
- package/cpp/log.cpp +50 -50
- package/cpp/log.h +18 -18
- package/cpp/rn-llama.hpp +23 -22
- package/cpp/sampling.cpp +110 -119
- package/cpp/sampling.h +20 -20
- package/package.json +1 -1
@@ -12,6 +12,7 @@ import android.util.Log;
 import android.os.Build;
 import android.os.ParcelFileDescriptor;
 import android.net.Uri;
+import android.content.Intent;
 import android.content.res.AssetManager;

 import java.lang.StringBuilder;
@@ -39,11 +40,12 @@ public class LlamaContext {
 InputStream fis = null;
 try {
 if (filepath.startsWith("content")) {
-
+Uri uri = Uri.parse(filepath);
+reactContext.getApplicationContext().getContentResolver().takePersistableUriPermission(uri, Intent.FLAG_GRANT_READ_URI_PERMISSION);
+fis = reactContext.getApplicationContext().getContentResolver().openInputStream(uri);
 } else {
 fis = new FileInputStream(filepath);
 }
-

 int bytesRead = fis.read(fileHeader);
 if(bytesRead < 4) {
@@ -55,6 +57,7 @@ public class LlamaContext {
 }
 return true;
 } catch (Exception e) {
+Log.e(NAME, "Failed to check GGUF: " + e.getMessage());
 return false;
 }finally {
 if (fis != null) {
package/android/src/main/jni.cpp
CHANGED
@@ -156,7 +156,7 @@ Java_com_rnllama_LlamaContext_initContext(
 ) {
 UNUSED(thiz);

-
+common_params defaultParams;

 defaultParams.vocab_only = vocab_only;
 if(vocab_only) {
@@ -268,7 +268,7 @@ Java_com_rnllama_LlamaContext_getFormattedChat(
 UNUSED(thiz);
 auto llama = context_map[(long) context_ptr];

-std::vector<
+std::vector<common_chat_msg> chat;

 int messages_len = env->GetArrayLength(messages);
 for (int i = 0; i < messages_len; i++) {
@@ -292,7 +292,7 @@ Java_com_rnllama_LlamaContext_getFormattedChat(
 }

 const char *tmpl_chars = env->GetStringUTFChars(chat_template, nullptr);
-std::string formatted_chat =
+std::string formatted_chat = common_chat_apply_template(llama->model, tmpl_chars, chat, true);

 return env->NewStringUTF(formatted_chat.c_str());
 }
@@ -497,7 +497,7 @@ Java_com_rnllama_LlamaContext_doCompletion(
 if (token_with_probs.tok == -1 || llama->incomplete) {
 continue;
 }
-const std::string token_text =
+const std::string token_text = common_token_to_piece(llama->ctx, token_with_probs.tok);

 size_t pos = std::min(sent_count, llama->generated_text.size());

@@ -532,7 +532,7 @@ Java_com_rnllama_LlamaContext_doCompletion(
 putString(env, tokenResult, "token", to_send.c_str());

 if (llama->params.sparams.n_probs > 0) {
-const std::vector<llama_token> to_send_toks =
+const std::vector<llama_token> to_send_toks = common_tokenize(llama->ctx, to_send, false);
 size_t probs_pos = std::min(sent_token_probs_index, llama->generated_token_probs.size());
 size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama->generated_token_probs.size());
 if (probs_pos < probs_stop_pos) {
@@ -607,7 +607,7 @@ Java_com_rnllama_LlamaContext_tokenize(

 const char *text_chars = env->GetStringUTFChars(text, nullptr);

-const std::vector<llama_token> toks =
+const std::vector<llama_token> toks = common_tokenize(
 llama->ctx,
 text_chars,
 false
@@ -719,7 +719,7 @@ Java_com_rnllama_LlamaContext_freeContext(
 }
 if (llama->ctx_sampling != nullptr)
 {
-
+common_sampler_free(llama->ctx_sampling);
 }
 context_map.erase((long) llama->ctx);
 }
package/cpp/common.cpp
CHANGED
@@ -12,6 +12,7 @@

 #include <algorithm>
 #include <cinttypes>
+#include <climits>
 #include <cmath>
 #include <codecvt>
 #include <cstdarg>
@@ -23,10 +24,10 @@
 #include <regex>
 #include <sstream>
 #include <string>
+#include <thread>
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
-#include <thread>

 #if defined(__APPLE__) && defined(__MACH__)
 #include <sys/types.h>
@@ -368,10 +369,10 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[LM_GGML_MAX_N_THR
 return true;
 }

-void
+void common_init() {
 llama_log_set([](lm_ggml_log_level level, const char * text, void * /*user_data*/) {
-if (LOG_DEFAULT_LLAMA <=
-
+if (LOG_DEFAULT_LLAMA <= common_log_verbosity_thold) {
+common_log_add(common_log_main(), level, "%s", text);
 }
 }, NULL);

@@ -384,7 +385,7 @@ void gpt_init() {
 LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type);
 }

-std::string
+std::string common_params_get_system_info(const common_params & params) {
 std::ostringstream os;

 os << "system_info: n_threads = " << params.cpuparams.n_threads;
@@ -406,6 +407,21 @@ std::string gpt_params_get_system_info(const gpt_params & params) {
 // String utils
 //

+std::string string_format(const char * fmt, ...) {
+va_list ap;
+va_list ap2;
+va_start(ap, fmt);
+va_copy(ap2, ap);
+int size = vsnprintf(NULL, 0, fmt, ap);
+LM_GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
+std::vector<char> buf(size + 1);
+int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
+LM_GGML_ASSERT(size2 == size);
+va_end(ap2);
+va_end(ap);
+return std::string(buf.data(), size);
+}
+
 std::vector<std::string> string_split(std::string input, char separator) {
 std::vector<std::string> parts;
 size_t separator_pos = input.find(separator);
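The string_format helper added in this hunk is a printf-style formatter that returns an owned std::string. A minimal usage sketch, not taken from the package; it assumes the package's common.h is included:

    // Formats like printf but returns a std::string instead of writing to a caller buffer.
    int n_decoded = 128;
    double t_ms = 42.5;
    std::string msg = string_format("decoded %d tokens in %.2f ms", n_decoded, t_ms);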
@@ -499,7 +515,7 @@ std::string string_from(const struct llama_context * ctx, const std::vector<llam
 first = false;
 }

-auto detokenized =
+auto detokenized = common_token_to_piece(ctx, token);

 detokenized.erase(
 std::remove_if(
@@ -530,7 +546,7 @@ std::string string_from(const struct llama_context * ctx, const struct llama_bat
 first = false;
 }

-auto detokenized =
+auto detokenized = common_token_to_piece(ctx, batch.token[i]);

 detokenized.erase(
 std::remove_if(
@@ -825,16 +841,16 @@ std::string fs_get_cache_file(const std::string & filename) {
 //
 // Model utils
 //
-struct
-
-auto mparams =
+struct common_init_result common_init_from_params(common_params & params) {
+common_init_result iparams;
+auto mparams = common_model_params_to_llama(params);

 llama_model * model = nullptr;

 if (!params.hf_repo.empty() && !params.hf_file.empty()) {
-model =
+model = common_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
 } else if (!params.model_url.empty()) {
-model =
+model = common_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
 } else {
 model = llama_load_model_from_file(params.model.c_str(), mparams);
 }
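The renamed common_init_from_params entry point loads the model (from a local path, URL, or Hugging Face repo) and builds the context from a single common_params struct. A hedged sketch of a caller; the model path is a placeholder, and the model/context members of common_init_result are assumed from llama.cpp's common.h rather than shown in this diff:

    common_params params;
    params.model = "/path/to/model.gguf";        // placeholder path
    params.n_ctx = 2048;
    common_init_result res = common_init_from_params(params);
    llama_model   * model = res.model;           // assumed field name
    llama_context * lctx  = res.context;         // assumed field name
    if (model == nullptr || lctx == nullptr) {
        // model load or context creation failed
    }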
@@ -869,7 +885,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
 }
 }

-auto cparams =
+auto cparams = common_context_params_to_llama(params);

 llama_context * lctx = llama_new_context_with_model(model, cparams);
 if (lctx == NULL) {
@@ -882,7 +898,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
 if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
 if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model);

-const auto cvec =
+const auto cvec = common_control_vector_load(params.control_vectors);
 if (cvec.n_embd == -1) {
 llama_free(lctx);
 llama_free_model(model);
@@ -906,7 +922,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {

 // load and optionally apply lora adapters
 for (auto & la : params.lora_adapters) {
-
+common_lora_adapter_container loaded_la;
 loaded_la.path = la.path;
 loaded_la.scale = la.scale;
 loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
@@ -919,7 +935,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
 iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
 }
 if (!params.lora_init_without_apply) {
-
+common_lora_adapters_apply(lctx, iparams.lora_adapters);
 }

 if (params.sparams.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
@@ -945,7 +961,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
 }

 if (llama_model_has_encoder(model)) {
-llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size()
+llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size()));
 llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
 if (decoder_start_token_id == -1) {
 decoder_start_token_id = bos;
@@ -954,7 +970,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
 tmp.push_back(decoder_start_token_id);
 }
 if (llama_model_has_decoder(model)) {
-llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)
+llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
 }
 llama_kv_cache_clear(lctx);
 llama_synchronize(lctx);
@@ -967,7 +983,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
 return iparams;
 }

-void
+void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters) {
 llama_lora_adapter_clear(ctx);
 for (auto & la : lora_adapters) {
 if (la.scale != 0.0f) {
@@ -976,7 +992,7 @@ void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lor
 }
 }

-struct llama_model_params
+struct llama_model_params common_model_params_to_llama(const common_params & params) {
 auto mparams = llama_model_default_params();

 if (params.n_gpu_layers != -1) {
@@ -1029,10 +1045,10 @@ static lm_ggml_type kv_cache_type_from_str(const std::string & s) {
 return LM_GGML_TYPE_Q5_1;
 }

-throw std::runtime_error("
+throw std::runtime_error("Unsupported cache type: " + s);
 }

-struct llama_context_params
+struct llama_context_params common_context_params_to_llama(const common_params & params) {
 auto cparams = llama_context_default_params();

 cparams.n_ctx = params.n_ctx;
@@ -1041,7 +1057,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
 cparams.n_ubatch = params.n_ubatch;
 cparams.n_threads = params.cpuparams.n_threads;
 cparams.n_threads_batch = params.cpuparams_batch.n_threads == -1 ?
-
+params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
 cparams.logits_all = params.logits_all;
 cparams.embeddings = params.embedding;
 cparams.rope_scaling_type = params.rope_scaling_type;
@@ -1122,7 +1138,7 @@ static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_
 return false;
 }

-static bool
+static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {

 // Initialize libcurl
 std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
@@ -1192,15 +1208,15 @@ static bool llama_download_file(const std::string & url, const std::string & pat
 }

 // Send a HEAD request to retrieve the etag and last-modified headers
-struct
+struct common_load_model_from_url_headers {
 std::string etag;
 std::string last_modified;
 };
-
+common_load_model_from_url_headers headers;
 {
 typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
 auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
-
+common_load_model_from_url_headers *headers = (common_load_model_from_url_headers *) userdata;

 static std::regex header_regex("([^:]+): (.*)\r\n");
 static std::regex etag_regex("ETag", std::regex_constants::icase);
@@ -1336,7 +1352,7 @@ static bool llama_download_file(const std::string & url, const std::string & pat
 return true;
 }

-struct llama_model *
+struct llama_model * common_load_model_from_url(
 const char * model_url,
 const char * path_model,
 const char * hf_token,
@@ -1347,7 +1363,7 @@ struct llama_model * llama_load_model_from_url(
 return NULL;
 }

-if (!
+if (!common_download_file(model_url, path_model, hf_token)) {
 return NULL;
 }

@@ -1400,7 +1416,7 @@ struct llama_model * llama_load_model_from_url(
 char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
 llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split);

-return
+return common_download_file(split_url, split_path, hf_token);
 }, idx));
 }

@@ -1415,7 +1431,7 @@ struct llama_model * llama_load_model_from_url(
 return llama_load_model_from_file(path_model, params);
 }

-struct llama_model *
+struct llama_model * common_load_model_from_hf(
 const char * repo,
 const char * model,
 const char * path_model,
@@ -1435,12 +1451,12 @@ struct llama_model * llama_load_model_from_hf(
 model_url += "/resolve/main/";
 model_url += model;

-return
+return common_load_model_from_url(model_url.c_str(), path_model, hf_token, params);
 }

 #else

-struct llama_model *
+struct llama_model * common_load_model_from_url(
 const char * /*model_url*/,
 const char * /*path_model*/,
 const char * /*hf_token*/,
@@ -1449,7 +1465,7 @@ struct llama_model * llama_load_model_from_url(
 return nullptr;
 }

-struct llama_model *
+struct llama_model * common_load_model_from_hf(
 const char * /*repo*/,
 const char * /*model*/,
 const char * /*path_model*/,
@@ -1465,11 +1481,11 @@ struct llama_model * llama_load_model_from_hf(
 // Batch utils
 //

-void
+void common_batch_clear(struct llama_batch & batch) {
 batch.n_tokens = 0;
 }

-void
+void common_batch_add(
 struct llama_batch & batch,
 llama_token id,
 llama_pos pos,
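The download helpers above fetch a GGUF file from a direct URL or a Hugging Face repo before loading it; the stubs under the #else branch show they only do real work when the package is built with libcurl support. A hedged usage sketch with placeholder repo, file, and path values:

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = common_load_model_from_hf(
        "some-org/some-repo",          // placeholder HF repo
        "model-q4_k_m.gguf",           // placeholder GGUF file inside the repo
        "/tmp/model-q4_k_m.gguf",      // local path the download is written to
        "",                            // hf_token; empty string means no token
        mparams);
    // Returns NULL if the download or load fails, or when curl support is compiled out.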
@@ -1492,15 +1508,15 @@ void llama_batch_add(
 // Vocab utils
 //

-std::vector<llama_token>
+std::vector<llama_token> common_tokenize(
 const struct llama_context * ctx,
 const std::string & text,
 bool add_special,
 bool parse_special) {
-return
+return common_tokenize(llama_get_model(ctx), text, add_special, parse_special);
 }

-std::vector<llama_token>
+std::vector<llama_token> common_tokenize(
 const struct llama_model * model,
 const std::string & text,
 bool add_special,
@@ -1519,7 +1535,7 @@ std::vector<llama_token> llama_tokenize(
 return result;
 }

-std::string
+std::string common_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
 std::string piece;
 piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
 const int n_chars = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
@@ -1535,7 +1551,7 @@ std::string llama_token_to_piece(const struct llama_context * ctx, llama_token t
 return piece;
 }

-std::string
+std::string common_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
 std::string text;
 text.resize(std::max(text.capacity(), tokens.size()));
 int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
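These vocab helpers are the renamed tokenize/detokenize wrappers. A small round-trip sketch, assuming a valid llama_context * ctx is already in scope:

    // Tokenize without adding BOS/EOS and without parsing special-token markers.
    std::vector<llama_token> toks = common_tokenize(ctx, "Hello world", false, false);
    // Convert tokens back to text piece by piece ...
    for (llama_token tok : toks) {
        printf("%s", common_token_to_piece(ctx, tok, true).c_str());
    }
    // ... or detokenize the whole sequence in one call.
    std::string round_trip = common_detokenize(ctx, toks, true);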
@@ -1555,15 +1571,15 @@ std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token>
 // Chat template utils
 //

-bool
+bool common_chat_verify_template(const std::string & tmpl) {
 llama_chat_message chat[] = {{"user", "test"}};
 int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
 return res >= 0;
 }

-std::string
+std::string common_chat_apply_template(const struct llama_model * model,
 const std::string & tmpl,
-const std::vector<
+const std::vector<common_chat_msg> & msgs,
 bool add_ass) {
 int alloc_size = 0;
 bool fallback = false; // indicate if we must fallback to default chatml
@@ -1605,42 +1621,42 @@ std::string llama_chat_apply_template(const struct llama_model * model,
 return formatted_chat;
 }

-std::string
+std::string common_chat_format_single(const struct llama_model * model,
 const std::string & tmpl,
-const std::vector<
-const
+const std::vector<common_chat_msg> & past_msg,
+const common_chat_msg & new_msg,
 bool add_ass) {
 std::ostringstream ss;
-auto fmt_past_msg = past_msg.empty() ? "" :
-std::vector<
+auto fmt_past_msg = past_msg.empty() ? "" : common_chat_apply_template(model, tmpl, past_msg, false);
+std::vector<common_chat_msg> chat_new(past_msg);
 // if the past_msg ends with a newline, we must preserve it in the formatted version
 if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') {
 ss << "\n";
 };
 // format chat with new_msg
 chat_new.push_back(new_msg);
-auto fmt_new_msg =
+auto fmt_new_msg = common_chat_apply_template(model, tmpl, chat_new, add_ass);
 // get the diff part
 ss << fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
 return ss.str();
 }

-std::string
+std::string common_chat_format_example(const struct llama_model * model,
 const std::string & tmpl) {
-std::vector<
+std::vector<common_chat_msg> msgs = {
 {"system", "You are a helpful assistant"},
 {"user", "Hello"},
 {"assistant", "Hi there"},
 {"user", "How are you?"},
 };
-return
+return common_chat_apply_template(model, tmpl, msgs, true);
 }

 //
 // KV cache utils
 //

-void
+void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
 static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+";

 printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d",
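The chat-template helpers apply a model's template to a list of common_chat_msg entries. A hedged sketch of prompt construction, assuming model and a template string tmpl (empty to use the model's built-in template) are already available:

    std::vector<common_chat_msg> msgs = {
        {"system", "You are a helpful assistant"},
        {"user", "Hello"},
    };
    // Format the whole conversation and append the assistant prefix (add_ass = true).
    std::string prompt = common_chat_apply_template(model, tmpl, msgs, true);
    // For incremental chat, format only the delta introduced by a new message.
    common_chat_msg next = {"user", "How are you?"};
    std::string delta = common_chat_format_single(model, tmpl, msgs, next, true);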
@@ -1663,7 +1679,7 @@ void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
 printf("\n=== Done dumping\n");
 }

-void
+void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) {
 static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";

 printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n",
@@ -1715,7 +1731,7 @@ void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_siz
 // Embedding utils
 //

-void
+void common_embd_normalize(const float * inp, float * out, int n, int embd_norm) {
 double sum = 0.0;

 switch (embd_norm) {
@@ -1749,7 +1765,7 @@ void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm)
 }
 }

-float
+float common_embd_similarity_cos(const float * embd1, const float * embd2, int n){
 double sum = 0.0;
 double sum1 = 0.0;
 double sum2 = 0.0;
@@ -1775,8 +1791,8 @@ float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n)
 // Control vector utils
 //

-static
-
+static common_control_vector_data common_control_vector_load_one(const common_control_vector_load_info & load_info) {
+common_control_vector_data result = { -1, {} };

 lm_ggml_context * ctx = nullptr;
 struct lm_gguf_init_params meta_lm_gguf_params = {
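The embedding utilities keep their behaviour under the new common_ names: common_embd_normalize writes a normalized copy of a vector and common_embd_similarity_cos returns the cosine similarity of two vectors of length n. A short sketch; get_embedding is a hypothetical helper, and embd_norm = 2 is assumed to select Euclidean (L2) normalization:

    std::vector<float> a = get_embedding("first text");   // hypothetical helper
    std::vector<float> b = get_embedding("second text");  // hypothetical helper
    const int n = (int) a.size();
    std::vector<float> a_norm(n), b_norm(n);
    common_embd_normalize(a.data(), a_norm.data(), n, 2);  // 2: Euclidean norm (assumed)
    common_embd_normalize(b.data(), b_norm.data(), n, 2);
    float sim = common_embd_similarity_cos(a_norm.data(), b_norm.data(), n);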
@@ -1860,11 +1876,11 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr
 return result;
 }

-
-
+common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos) {
+common_control_vector_data result = { -1, {} };

 for (const auto & info : load_infos) {
-auto cur =
+auto cur = common_control_vector_load_one(info);

 if (cur.n_embd == -1) {
 result.n_embd = -1;
@@ -1956,7 +1972,7 @@ void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const cha
 }
 }

-void yaml_dump_non_result_info(FILE * stream, const
+void yaml_dump_non_result_info(FILE * stream, const common_params & params, const llama_context * lctx,
 const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
 const auto & sparams = params.sparams;

@@ -2098,6 +2114,8 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
 fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
 fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
 fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
+fprintf(stream, "xtc_probability: %f # default: 0.0\n", sparams.xtc_probability);
+fprintf(stream, "xtc_threshold: %f # default: 0.1\n", sparams.xtc_threshold);
 fprintf(stream, "typ_p: %f # default: 1.0\n", sparams.typ_p);
 fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
 fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false");