cui-llama.rn 1.2.3 → 1.2.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +0 -2
- package/android/src/main/CMakeLists.txt +1 -0
- package/android/src/main/java/com/rnllama/LlamaContext.java +0 -3
- package/android/src/main/jni.cpp +9 -11
- package/cpp/common.cpp +85 -75
- package/cpp/common.h +127 -91
- package/cpp/ggml-aarch64.c +269 -0
- package/cpp/ggml-alloc.c +17 -19
- package/cpp/ggml-backend-impl.h +4 -15
- package/cpp/ggml-backend.cpp +1697 -1626
- package/cpp/ggml-backend.h +13 -25
- package/cpp/ggml-cpp.h +38 -0
- package/cpp/ggml-cpu.c +13720 -0
- package/cpp/ggml-cpu.h +150 -0
- package/cpp/ggml-impl.h +95 -0
- package/cpp/ggml-metal.m +185 -71
- package/cpp/ggml-quants.c +38 -51
- package/cpp/ggml.c +4468 -19500
- package/cpp/ggml.h +26 -146
- package/cpp/json-schema-to-grammar.cpp +1 -1
- package/cpp/llama-sampling.cpp +742 -249
- package/cpp/llama-sampling.h +21 -2
- package/cpp/llama-vocab.cpp +49 -9
- package/cpp/llama-vocab.h +35 -11
- package/cpp/llama.cpp +2468 -2307
- package/cpp/llama.h +65 -32
- package/cpp/log.cpp +50 -50
- package/cpp/log.h +18 -18
- package/cpp/rn-llama.hpp +23 -22
- package/cpp/sampling.cpp +117 -118
- package/cpp/sampling.h +20 -20
- package/cpp/sgemm.cpp +57 -0
- package/lib/commonjs/NativeRNLlama.js.map +1 -1
- package/lib/module/NativeRNLlama.js.map +1 -1
- package/lib/typescript/NativeRNLlama.d.ts +0 -1
- package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
- package/package.json +1 -1
- package/src/NativeRNLlama.ts +0 -1
package/README.md
CHANGED
@@ -11,8 +11,6 @@ The following features have been added for Android:
 - `vocab_only` mode: utilize the llama.cpp tokenizer
 - tokenizeSync: non-blocking, synchronous tokenizer function
 - Context Shift taken from [kobold.cpp](https://github.com/LostRuins/koboldcpp)
-- XTC sampling
-- Progress callback
 - Retrieving CPU Features to check for i8mm and dotprod flags
 
 Original repo README.md below.
package/android/src/main/java/com/rnllama/LlamaContext.java
CHANGED
@@ -248,8 +248,6 @@ public class LlamaContext {
 params.hasKey("xtc_t") ? (float) params.getDouble("xtc_t") : 0.00f,
 // float xtc_p,
 params.hasKey("xtc_p") ? (float) params.getDouble("xtc_p") : 0.00f,
-// float tfs_z,
-params.hasKey("tfs_z") ? (float) params.getDouble("tfs_z") : 1.00f,
 // float typical_p,
 params.hasKey("typical_p") ? (float) params.getDouble("typical_p") : 1.00f,
 // int seed,
@@ -438,7 +436,6 @@ public class LlamaContext {
 float min_p,
 float xtc_t,
 float xtc_p,
-float tfs_z,
 float typical_p,
 int seed,
 String[] stop,
package/android/src/main/jni.cpp
CHANGED
@@ -156,7 +156,7 @@ Java_com_rnllama_LlamaContext_initContext(
 ) {
 UNUSED(thiz);
 
-
+common_params defaultParams;
 
 defaultParams.vocab_only = vocab_only;
 if(vocab_only) {
@@ -268,7 +268,7 @@ Java_com_rnllama_LlamaContext_getFormattedChat(
 UNUSED(thiz);
 auto llama = context_map[(long) context_ptr];
 
-std::vector<
+std::vector<common_chat_msg> chat;
 
 int messages_len = env->GetArrayLength(messages);
 for (int i = 0; i < messages_len; i++) {
@@ -292,7 +292,7 @@ Java_com_rnllama_LlamaContext_getFormattedChat(
 }
 
 const char *tmpl_chars = env->GetStringUTFChars(chat_template, nullptr);
-std::string formatted_chat =
+std::string formatted_chat = common_chat_apply_template(llama->model, tmpl_chars, chat, true);
 
 return env->NewStringUTF(formatted_chat.c_str());
 }
@@ -399,7 +399,6 @@ Java_com_rnllama_LlamaContext_doCompletion(
 jfloat min_p,
 jfloat xtc_t,
 jfloat xtc_p,
-jfloat tfs_z,
 jfloat typical_p,
 jint seed,
 jobjectArray stop,
@@ -438,12 +437,11 @@ Java_com_rnllama_LlamaContext_doCompletion(
 sparams.top_k = top_k;
 sparams.top_p = top_p;
 sparams.min_p = min_p;
-sparams.tfs_z = tfs_z;
 sparams.typ_p = typical_p;
 sparams.n_probs = n_probs;
 sparams.grammar = env->GetStringUTFChars(grammar, nullptr);
-sparams.
-sparams.
+sparams.xtc_threshold = xtc_t;
+sparams.xtc_probability = xtc_p;
 
 sparams.logit_bias.clear();
 if (ignore_eos) {
@@ -497,7 +495,7 @@ Java_com_rnllama_LlamaContext_doCompletion(
 if (token_with_probs.tok == -1 || llama->incomplete) {
 continue;
 }
-const std::string token_text =
+const std::string token_text = common_token_to_piece(llama->ctx, token_with_probs.tok);
 
 size_t pos = std::min(sent_count, llama->generated_text.size());
 
@@ -532,7 +530,7 @@ Java_com_rnllama_LlamaContext_doCompletion(
 putString(env, tokenResult, "token", to_send.c_str());
 
 if (llama->params.sparams.n_probs > 0) {
-const std::vector<llama_token> to_send_toks =
+const std::vector<llama_token> to_send_toks = common_tokenize(llama->ctx, to_send, false);
 size_t probs_pos = std::min(sent_token_probs_index, llama->generated_token_probs.size());
 size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama->generated_token_probs.size());
 if (probs_pos < probs_stop_pos) {
@@ -607,7 +605,7 @@ Java_com_rnllama_LlamaContext_tokenize(
 
 const char *text_chars = env->GetStringUTFChars(text, nullptr);
 
-const std::vector<llama_token> toks =
+const std::vector<llama_token> toks = common_tokenize(
 llama->ctx,
 text_chars,
 false
@@ -719,7 +717,7 @@ Java_com_rnllama_LlamaContext_freeContext(
 }
 if (llama->ctx_sampling != nullptr)
 {
-
+common_sampler_free(llama->ctx_sampling);
 }
 context_map.erase((long) llama->ctx);
 }
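The completion hunks above drop the tail-free sampling argument (`tfs_z`) and wire the two XTC knobs into the sampler as `sparams.xtc_threshold` and `sparams.xtc_probability`. As a rough illustration of what those two values control, here is a standalone sketch of an XTC-style cutoff over a toy candidate list; it is not this package's sampler code, and the `candidate` struct is invented for the example.

```cpp
#include <cstdio>
#include <random>
#include <vector>

// Toy stand-in for a sampler candidate: a token id plus its probability.
struct candidate { int id; float p; };

// XTC-style cutoff: with probability xtc_probability, drop every candidate
// whose probability is >= xtc_threshold except the least likely of them,
// nudging the sampler away from the most predictable continuations.
static void xtc_filter(std::vector<candidate> & cands,
                       float xtc_threshold, float xtc_probability,
                       std::mt19937 & rng) {
    std::uniform_real_distribution<float> coin(0.0f, 1.0f);
    if (coin(rng) >= xtc_probability) {
        return; // XTC not applied on this step
    }
    // candidates are assumed sorted by descending probability
    size_t above = 0;
    while (above < cands.size() && cands[above].p >= xtc_threshold) {
        above++;
    }
    if (above >= 2) {
        // erase all "too likely" candidates except the last one above the threshold
        cands.erase(cands.begin(), cands.begin() + (above - 1));
    }
}

int main() {
    std::mt19937 rng(42);
    std::vector<candidate> cands = {{7, 0.50f}, {3, 0.30f}, {9, 0.15f}, {1, 0.05f}};
    xtc_filter(cands, 0.10f, 1.0f, rng); // threshold 0.1, always apply for the demo
    for (const auto & c : cands) std::printf("token %d p=%.2f\n", c.id, c.p);
    return 0;
}
```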
package/cpp/common.cpp
CHANGED
@@ -12,6 +12,7 @@
 
 #include <algorithm>
 #include <cinttypes>
+#include <climits>
 #include <cmath>
 #include <codecvt>
 #include <cstdarg>
@@ -23,10 +24,10 @@
 #include <regex>
 #include <sstream>
 #include <string>
+#include <thread>
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
-#include <thread>
 
 #if defined(__APPLE__) && defined(__MACH__)
 #include <sys/types.h>
@@ -368,10 +369,10 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[LM_GGML_MAX_N_THR
 return true;
 }
 
-void
+void common_init() {
 llama_log_set([](lm_ggml_log_level level, const char * text, void * /*user_data*/) {
-if (LOG_DEFAULT_LLAMA <=
-
+if (LOG_DEFAULT_LLAMA <= common_log_verbosity_thold) {
+common_log_add(common_log_main(), level, "%s", text);
 }
 }, NULL);
 
@@ -384,7 +385,7 @@ void gpt_init() {
 LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type);
 }
 
-std::string
+std::string common_params_get_system_info(const common_params & params) {
 std::ostringstream os;
 
 os << "system_info: n_threads = " << params.cpuparams.n_threads;
@@ -406,17 +407,19 @@ std::string gpt_params_get_system_info(const gpt_params & params) {
 // String utils
 //
 
-std::
-
-
-
-
-
-
-
-
-
-
+std::string string_format(const char * fmt, ...) {
+va_list ap;
+va_list ap2;
+va_start(ap, fmt);
+va_copy(ap2, ap);
+int size = vsnprintf(NULL, 0, fmt, ap);
+LM_GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
+std::vector<char> buf(size + 1);
+int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
+LM_GGML_ASSERT(size2 == size);
+va_end(ap2);
+va_end(ap);
+return std::string(buf.data(), size);
 }
 
 std::string string_strip(const std::string & str) {
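The new `string_format` helper added above uses the classic two-pass `vsnprintf` idiom: measure the required length first, then format into an exactly-sized buffer. A self-contained sketch of the same pattern, re-implemented here for illustration rather than calling the package's helper:

```cpp
#include <cassert>
#include <climits>
#include <cstdarg>
#include <cstdio>
#include <string>
#include <vector>

// Two-pass vsnprintf: the first call only measures, the second writes
// into a buffer of exactly the measured size (plus the terminating NUL).
static std::string format_sketch(const char * fmt, ...) {
    va_list ap;
    va_list ap2;
    va_start(ap, fmt);
    va_copy(ap2, ap);
    const int size = vsnprintf(nullptr, 0, fmt, ap);              // measure only
    assert(size >= 0 && size < INT_MAX);
    std::vector<char> buf(size + 1);
    const int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);  // actual write
    assert(size2 == size);
    va_end(ap2);
    va_end(ap);
    return std::string(buf.data(), size);
}

int main() {
    // e.g. building a log line without a fixed-size stack buffer
    const std::string msg = format_sketch("loaded %d tensors in %.2f s", 291, 1.37);
    std::puts(msg.c_str());
    return 0;
}
```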
@@ -499,7 +502,7 @@ std::string string_from(const struct llama_context * ctx, const std::vector<llam
 first = false;
 }
 
-auto detokenized =
+auto detokenized = common_token_to_piece(ctx, token);
 
 detokenized.erase(
 std::remove_if(
@@ -530,7 +533,7 @@ std::string string_from(const struct llama_context * ctx, const struct llama_bat
 first = false;
 }
 
-auto detokenized =
+auto detokenized = common_token_to_piece(ctx, batch.token[i]);
 
 detokenized.erase(
 std::remove_if(
@@ -825,16 +828,16 @@ std::string fs_get_cache_file(const std::string & filename) {
 //
 // Model utils
 //
-struct
-
-auto mparams =
+struct common_init_result common_init_from_params(common_params & params) {
+common_init_result iparams;
+auto mparams = common_model_params_to_llama(params);
 
 llama_model * model = nullptr;
 
 if (!params.hf_repo.empty() && !params.hf_file.empty()) {
-model =
+model = common_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
 } else if (!params.model_url.empty()) {
-model =
+model = common_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
 } else {
 model = llama_load_model_from_file(params.model.c_str(), mparams);
 }
@@ -869,7 +872,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
 }
 }
 
-auto cparams =
+auto cparams = common_context_params_to_llama(params);
 
 llama_context * lctx = llama_new_context_with_model(model, cparams);
 if (lctx == NULL) {
@@ -882,7 +885,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
 if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
 if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model);
 
-const auto cvec =
+const auto cvec = common_control_vector_load(params.control_vectors);
 if (cvec.n_embd == -1) {
 llama_free(lctx);
 llama_free_model(model);
@@ -906,7 +909,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
 
 // load and optionally apply lora adapters
 for (auto & la : params.lora_adapters) {
-
+common_lora_adapter_container loaded_la;
 loaded_la.path = la.path;
 loaded_la.scale = la.scale;
 loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
@@ -919,7 +922,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
 iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
 }
 if (!params.lora_init_without_apply) {
-
+common_lora_adapters_apply(lctx, iparams.lora_adapters);
 }
 
 if (params.sparams.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
@@ -945,7 +948,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
 }
 
 if (llama_model_has_encoder(model)) {
-llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size()
+llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size()));
 llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
 if (decoder_start_token_id == -1) {
 decoder_start_token_id = bos;
@@ -954,7 +957,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
 tmp.push_back(decoder_start_token_id);
 }
 if (llama_model_has_decoder(model)) {
-llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)
+llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
 }
 llama_kv_cache_clear(lctx);
 llama_synchronize(lctx);
@@ -967,7 +970,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
 return iparams;
 }
 
-void
+void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters) {
 llama_lora_adapter_clear(ctx);
 for (auto & la : lora_adapters) {
 if (la.scale != 0.0f) {
@@ -976,7 +979,7 @@ void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lor
 }
 }
 
-struct llama_model_params
+struct llama_model_params common_model_params_to_llama(const common_params & params) {
 auto mparams = llama_model_default_params();
 
 if (params.n_gpu_layers != -1) {
@@ -1029,10 +1032,10 @@ static lm_ggml_type kv_cache_type_from_str(const std::string & s) {
 return LM_GGML_TYPE_Q5_1;
 }
 
-throw std::runtime_error("
+throw std::runtime_error("Unsupported cache type: " + s);
 }
 
-struct llama_context_params
+struct llama_context_params common_context_params_to_llama(const common_params & params) {
 auto cparams = llama_context_default_params();
 
 cparams.n_ctx = params.n_ctx;
@@ -1041,7 +1044,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
 cparams.n_ubatch = params.n_ubatch;
 cparams.n_threads = params.cpuparams.n_threads;
 cparams.n_threads_batch = params.cpuparams_batch.n_threads == -1 ?
-
+params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
 cparams.logits_all = params.logits_all;
 cparams.embeddings = params.embedding;
 cparams.rope_scaling_type = params.rope_scaling_type;
@@ -1122,7 +1125,7 @@ static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_
 return false;
 }
 
-static bool
+static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
 
 // Initialize libcurl
 std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
@@ -1192,15 +1195,15 @@ static bool llama_download_file(const std::string & url, const std::string & pat
 }
 
 // Send a HEAD request to retrieve the etag and last-modified headers
-struct
+struct common_load_model_from_url_headers {
 std::string etag;
 std::string last_modified;
 };
-
+common_load_model_from_url_headers headers;
 {
 typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
 auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
-
+common_load_model_from_url_headers *headers = (common_load_model_from_url_headers *) userdata;
 
 static std::regex header_regex("([^:]+): (.*)\r\n");
 static std::regex etag_regex("ETag", std::regex_constants::icase);
@@ -1336,7 +1339,7 @@ static bool llama_download_file(const std::string & url, const std::string & pat
 return true;
 }
 
-struct llama_model *
+struct llama_model * common_load_model_from_url(
 const char * model_url,
 const char * path_model,
 const char * hf_token,
@@ -1347,7 +1350,7 @@ struct llama_model * llama_load_model_from_url(
 return NULL;
 }
 
-if (!
+if (!common_download_file(model_url, path_model, hf_token)) {
 return NULL;
 }
 
@@ -1400,7 +1403,7 @@ struct llama_model * llama_load_model_from_url(
 char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
 llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split);
 
-return
+return common_download_file(split_url, split_path, hf_token);
 }, idx));
 }
 
@@ -1415,7 +1418,7 @@ struct llama_model * llama_load_model_from_url(
 return llama_load_model_from_file(path_model, params);
 }
 
-struct llama_model *
+struct llama_model * common_load_model_from_hf(
 const char * repo,
 const char * model,
 const char * path_model,
@@ -1435,12 +1438,12 @@ struct llama_model * llama_load_model_from_hf(
 model_url += "/resolve/main/";
 model_url += model;
 
-return
+return common_load_model_from_url(model_url.c_str(), path_model, hf_token, params);
 }
 
 #else
 
-struct llama_model *
+struct llama_model * common_load_model_from_url(
 const char * /*model_url*/,
 const char * /*path_model*/,
 const char * /*hf_token*/,
@@ -1449,7 +1452,7 @@ struct llama_model * llama_load_model_from_url(
 return nullptr;
 }
 
-struct llama_model *
+struct llama_model * common_load_model_from_hf(
 const char * /*repo*/,
 const char * /*model*/,
 const char * /*path_model*/,
@@ -1465,11 +1468,11 @@ struct llama_model * llama_load_model_from_hf(
 // Batch utils
 //
 
-void
+void common_batch_clear(struct llama_batch & batch) {
 batch.n_tokens = 0;
 }
 
-void
+void common_batch_add(
 struct llama_batch & batch,
 llama_token id,
 llama_pos pos,
@@ -1492,15 +1495,15 @@ void llama_batch_add(
 // Vocab utils
 //
 
-std::vector<llama_token>
+std::vector<llama_token> common_tokenize(
 const struct llama_context * ctx,
 const std::string & text,
 bool add_special,
 bool parse_special) {
-return
+return common_tokenize(llama_get_model(ctx), text, add_special, parse_special);
 }
 
-std::vector<llama_token>
+std::vector<llama_token> common_tokenize(
 const struct llama_model * model,
 const std::string & text,
 bool add_special,
@@ -1519,7 +1522,7 @@ std::vector<llama_token> llama_tokenize(
 return result;
 }
 
-std::string
+std::string common_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
 std::string piece;
 piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
 const int n_chars = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
@@ -1535,7 +1538,7 @@ std::string llama_token_to_piece(const struct llama_context * ctx, llama_token t
 return piece;
 }
 
-std::string
+std::string common_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
 std::string text;
 text.resize(std::max(text.capacity(), tokens.size()));
 int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
@@ -1555,15 +1558,15 @@ std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token>
 // Chat template utils
 //
 
-bool
+bool common_chat_verify_template(const std::string & tmpl) {
 llama_chat_message chat[] = {{"user", "test"}};
 int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
 return res >= 0;
 }
 
-std::string
+std::string common_chat_apply_template(const struct llama_model * model,
 const std::string & tmpl,
-const std::vector<
+const std::vector<common_chat_msg> & msgs,
 bool add_ass) {
 int alloc_size = 0;
 bool fallback = false; // indicate if we must fallback to default chatml
@@ -1605,42 +1608,42 @@ std::string llama_chat_apply_template(const struct llama_model * model,
 return formatted_chat;
 }
 
-std::string
+std::string common_chat_format_single(const struct llama_model * model,
 const std::string & tmpl,
-const std::vector<
-const
+const std::vector<common_chat_msg> & past_msg,
+const common_chat_msg & new_msg,
 bool add_ass) {
 std::ostringstream ss;
-auto fmt_past_msg = past_msg.empty() ? "" :
-std::vector<
+auto fmt_past_msg = past_msg.empty() ? "" : common_chat_apply_template(model, tmpl, past_msg, false);
+std::vector<common_chat_msg> chat_new(past_msg);
 // if the past_msg ends with a newline, we must preserve it in the formatted version
 if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') {
 ss << "\n";
 };
 // format chat with new_msg
 chat_new.push_back(new_msg);
-auto fmt_new_msg =
+auto fmt_new_msg = common_chat_apply_template(model, tmpl, chat_new, add_ass);
 // get the diff part
 ss << fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
 return ss.str();
 }
 
-std::string
+std::string common_chat_format_example(const struct llama_model * model,
 const std::string & tmpl) {
-std::vector<
+std::vector<common_chat_msg> msgs = {
 {"system", "You are a helpful assistant"},
 {"user", "Hello"},
 {"assistant", "Hi there"},
 {"user", "How are you?"},
 };
-return
+return common_chat_apply_template(model, tmpl, msgs, true);
 }
 
 //
 // KV cache utils
 //
 
-void
+void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
 static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+";
 
 printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d",
@@ -1663,7 +1666,7 @@ void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
 printf("\n=== Done dumping\n");
 }
 
-void
+void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) {
 static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
 
 printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n",
@@ -1715,7 +1718,7 @@ void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_siz
 // Embedding utils
 //
 
-void
+void common_embd_normalize(const float * inp, float * out, int n, int embd_norm) {
 double sum = 0.0;
 
 switch (embd_norm) {
@@ -1749,7 +1752,7 @@ void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm)
 }
 }
 
-float
+float common_embd_similarity_cos(const float * embd1, const float * embd2, int n){
 double sum = 0.0;
 double sum1 = 0.0;
 double sum2 = 0.0;
@@ -1775,8 +1778,8 @@ float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n)
 // Control vector utils
 //
 
-static
-
+static common_control_vector_data common_control_vector_load_one(const common_control_vector_load_info & load_info) {
+common_control_vector_data result = { -1, {} };
 
 lm_ggml_context * ctx = nullptr;
 struct lm_gguf_init_params meta_lm_gguf_params = {
@@ -1860,11 +1863,11 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr
 return result;
 }
 
-
-
+common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos) {
+common_control_vector_data result = { -1, {} };
 
 for (const auto & info : load_infos) {
-auto cur =
+auto cur = common_control_vector_load_one(info);
 
 if (cur.n_embd == -1) {
 result.n_embd = -1;
@@ -1956,8 +1959,10 @@ void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const cha
 }
 }
 
-void yaml_dump_non_result_info(FILE * stream, const
+void yaml_dump_non_result_info(FILE * stream, const common_params & params, const llama_context * lctx,
 const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
+lm_ggml_cpu_init(); // some ARM features are detected at runtime
+
 const auto & sparams = params.sparams;
 
 fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT);
@@ -2013,6 +2018,10 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
 fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
 fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
 fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
+fprintf(stream, "dry_allowed_length: %d # default: 2\n", sparams.dry_allowed_length);
+fprintf(stream, "dry_base: %.2f # default: 1.75\n", sparams.dry_base);
+fprintf(stream, "dry_multiplier: %.1f # default: 0.0\n", sparams.dry_multiplier);
+fprintf(stream, "dry_penalty_last_n: %d # default: -1 (0 = disable, -1 = context size)\n", sparams.dry_penalty_last_n);
 fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
 fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
 fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.penalty_freq);
@@ -2093,11 +2102,12 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
 const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices());
 yaml_dump_vector_float(stream, "tensor_split", tensor_split_vector);
 
-fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
 fprintf(stream, "threads: %d # default: %u\n", params.cpuparams.n_threads, std::thread::hardware_concurrency());
 fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
 fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
 fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
+fprintf(stream, "xtc_probability: %f # default: 0.0\n", sparams.xtc_probability);
+fprintf(stream, "xtc_threshold: %f # default: 0.1\n", sparams.xtc_threshold);
 fprintf(stream, "typ_p: %f # default: 1.0\n", sparams.typ_p);
 fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
 fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false");