cui-llama.rn 1.1.5 → 1.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/CMakeLists.txt +1 -0
- package/android/src/main/jni.cpp +1 -2
- package/cpp/common.cpp +157 -53
- package/cpp/common.h +11 -3
- package/cpp/ggml-metal.m +33 -22
- package/cpp/ggml-quants.c +33 -36
- package/cpp/ggml.h +5 -4
- package/cpp/llama-impl.h +1 -0
- package/cpp/llama-sampling.cpp +0 -8
- package/cpp/llama.cpp +519 -34
- package/cpp/llama.h +0 -17
- package/cpp/log.cpp +401 -0
- package/cpp/log.h +85 -703
- package/cpp/rn-llama.hpp +7 -10
- package/cpp/sampling.cpp +1 -5
- package/cpp/sgemm.cpp +38 -0
- package/package.json +1 -1
package/android/src/main/jni.cpp
CHANGED
@@ -538,7 +538,7 @@ Java_com_rnllama_LlamaContext_doCompletion(
     putString(env, result, "stopping_word", llama->stopping_word.c_str());
     putInt(env, result, "tokens_cached", llama->n_past);
 
-    const auto timings_token =
+    const auto timings_token = llama_perf_context(llama -> ctx);
 
     auto timingsResult = createWriteableMap(env);
     putInt(env, timingsResult, "prompt_n", timings_token.n_p_eval);
@@ -635,7 +635,6 @@ Java_com_rnllama_LlamaContext_embedding(
 
     llama->rewind();
 
-    // llama_reset_timings(llama->ctx);
     llama_perf_context_reset(llama->ctx);
     gpt_sampler_reset(llama->ctx_sampling);
 
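For context, the completion path now reads token timings through `llama_perf_context()` and the embedding path resets them with `llama_perf_context_reset()`, replacing the old timings API. A minimal sketch of how these counters can be read, assuming an already-initialized `llama_context * ctx`; the field names beyond `n_p_eval` follow the upstream `llama_perf_context_data` struct and are an assumption here:

```cpp
// Sketch only: read per-context performance counters after a completion.
// Assumes an initialized llama_context * ctx; t_p_eval_ms / t_eval_ms / n_eval
// follow upstream llama.cpp and are not shown in the hunk above.
#include "llama.h"
#include <cstdio>

void print_timings(llama_context * ctx) {
    const llama_perf_context_data timings = llama_perf_context(ctx);

    // Prompt tokens evaluated and time spent on them.
    std::printf("prompt_n: %d (%.2f ms)\n", timings.n_p_eval, timings.t_p_eval_ms);
    // Generated tokens evaluated and time spent on them.
    std::printf("predicted_n: %d (%.2f ms)\n", timings.n_eval, timings.t_eval_ms);

    // Clear the counters before the next request, as the embedding path does.
    llama_perf_context_reset(ctx);
}
```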
package/cpp/common.cpp
CHANGED
@@ -3,6 +3,7 @@
 #endif
 
 #include "common.h"
+#include "log.h"
 // Change JSON_ASSERT from assert() to LM_GGML_ASSERT:
 #define JSON_ASSERT LM_GGML_ASSERT
 #include "json.hpp"
@@ -25,6 +26,7 @@
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
+#include <thread>
 
 #if defined(__APPLE__) && defined(__MACH__)
 #include <sys/types.h>
@@ -48,7 +50,6 @@
 #if defined(LLAMA_USE_CURL)
 #include <curl/curl.h>
 #include <curl/easy.h>
-#include <thread>
 #include <future>
 #endif
 
@@ -232,7 +233,7 @@ bool set_process_priority(enum lm_ggml_sched_priority prio) {
     }
 
     if (!SetPriorityClass(GetCurrentProcess(), p)) {
-
+        LOG_WRN("failed to set process priority class %d : (%d)\n", prio, (int) GetLastError());
         return false;
     }
 
@@ -257,7 +258,7 @@ bool set_process_priority(enum lm_ggml_sched_priority prio) {
     }
 
     if (!setpriority(PRIO_PROCESS, 0, p)) {
-
+        LOG_WRN("failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno);
        return false;
    }
    return true;
@@ -290,14 +291,14 @@ void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model)
 
    if (n_set && n_set < cpuparams.n_threads) {
        // Not enough set bits, may experience performance issues.
-
+        LOG_WRN("Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads);
    }
}
 
bool parse_cpu_range(const std::string & range, bool (&boolmask)[LM_GGML_MAX_N_THREADS]) {
    size_t dash_loc = range.find('-');
    if (dash_loc == std::string::npos) {
-
+        LOG_ERR("Format of CPU range is invalid! Expected [<start>]-[<end>].\n");
        return false;
    }
 
@@ -309,7 +310,7 @@ bool parse_cpu_range(const std::string & range, bool (&boolmask)[LM_GGML_MAX_N_T
    } else {
        start_i = std::stoull(range.substr(0, dash_loc));
        if (start_i >= LM_GGML_MAX_N_THREADS) {
-
+            LOG_ERR("Start index out of bounds!\n");
            return false;
        }
    }
@@ -319,7 +320,7 @@ bool parse_cpu_range(const std::string & range, bool (&boolmask)[LM_GGML_MAX_N_T
    } else {
        end_i = std::stoull(range.substr(dash_loc + 1));
        if (end_i >= LM_GGML_MAX_N_THREADS) {
-
+            LOG_ERR("End index out of bounds!\n");
            return false;
        }
    }
@@ -354,7 +355,7 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[LM_GGML_MAX_N_THR
        } else if (c >= 'A' && c <= 'F') {
            id -= 'A' - 10;
        } else {
-
+            LOG_ERR("Invalid hex character '%c' at position %d\n", c, int32_t(i));
            return false;
        }
 
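For context, `parse_cpu_range()` expects a `[<start>]-[<end>]` range and `parse_cpu_mask()` a hexadecimal mask, both now reporting problems through `LOG_ERR`. A short usage sketch, assuming `LM_GGML_MAX_N_THREADS` comes in via the bundled ggml headers:

```cpp
// Sketch only: filling a thread-affinity boolmask with the parsing helpers above.
// LM_GGML_MAX_N_THREADS is assumed to be provided by the bundled ggml headers.
#include "common.h"
#include <string>

bool pick_cpus(const std::string & spec, bool (&mask)[LM_GGML_MAX_N_THREADS]) {
    // A dash means a range such as "0-7"; otherwise treat the input as a
    // hexadecimal mask such as "FF". Both helpers log errors via LOG_ERR.
    if (spec.find('-') != std::string::npos) {
        return parse_cpu_range(spec, mask);
    }
    return parse_cpu_mask(spec, mask);
}
```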
@@ -367,6 +368,22 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[LM_GGML_MAX_N_THR
     return true;
 }
 
+void gpt_init() {
+    llama_log_set([](lm_ggml_log_level level, const char * text, void * /*user_data*/) {
+        if (LOG_DEFAULT_LLAMA <= gpt_log_verbosity_thold) {
+            gpt_log_add(gpt_log_main(), level, "%s", text);
+        }
+    }, NULL);
+
+#ifdef NDEBUG
+    const char * build_type = "";
+#else
+    const char * build_type = " (debug)";
+#endif
+
+    LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type);
+}
+
 std::string gpt_params_get_system_info(const gpt_params & params) {
     std::ostringstream os;
 
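The new `gpt_init()` wires llama.cpp's logger into the common logging facility (added in log.cpp/log.h in this release) and prints the build banner. A minimal sketch of how a host program would typically start up after this refactor; the call order is an assumption, not something the diff mandates:

```cpp
// Sketch only: start-up sequence for a program using libcommon after the
// logging refactor. LOG_INF comes from the new log.h; gpt_init() and
// gpt_params_get_system_info() are declared in common.h (see below).
#include "common.h"
#include "log.h"

int main(int argc, char ** argv) {
    gpt_init(); // install the llama.cpp log callback and print build info

    // Messages below the verbosity threshold are dropped by the callback
    // installed in gpt_init().
    LOG_INF("starting up with %d arguments\n", argc);

    gpt_params params;
    // ... parse arguments into params ...

    LOG_INF("system info: %s\n", gpt_params_get_system_info(params).c_str());
    return 0;
}
```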
@@ -447,6 +464,94 @@ void string_replace_all(std::string & s, const std::string & search, const std::
     s = std::move(builder);
 }
 
+std::string string_from(bool value) {
+    return value ? "true" : "false";
+}
+
+std::string string_from(const std::vector<int> & values) {
+    std::stringstream buf;
+
+    buf << "[ ";
+    bool first = true;
+    for (auto e : values) {
+        if (first) {
+            first = false;
+        } else {
+            buf << ", ";
+        }
+        buf << std::to_string(e);
+    }
+    buf << " ]";
+
+    return buf.str();
+}
+
+std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens) {
+    std::stringstream buf;
+
+    buf << "[ ";
+
+    bool first = true;
+    for (const auto & token : tokens) {
+        if (!first) {
+            buf << ", ";
+        } else {
+            first = false;
+        }
+
+        auto detokenized = llama_token_to_piece(ctx, token);
+
+        detokenized.erase(
+            std::remove_if(
+                detokenized.begin(),
+                detokenized.end(),
+                [](const unsigned char c) { return !std::isprint(c); }),
+            detokenized.end());
+
+        buf << "'" << detokenized << "'"
+            << ":" << std::to_string(token);
+    }
+
+    buf << " ]";
+
+    return buf.str();
+}
+
+std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch) {
+    std::stringstream buf;
+
+    buf << "[ ";
+
+    bool first = true;
+    for (int i = 0; i < batch.n_tokens; ++i) {
+        if (!first) {
+            buf << ", ";
+        } else {
+            first = false;
+        }
+
+        auto detokenized = llama_token_to_piece(ctx, batch.token[i]);
+
+        detokenized.erase(
+            std::remove_if(
+                detokenized.begin(),
+                detokenized.end(),
+                [](const unsigned char c) { return !std::isprint(c); }),
+            detokenized.end());
+
+        buf << "\n" << std::to_string(i)
+            << ":token '" << detokenized << "'"
+            << ":pos " << std::to_string(batch.pos[i])
+            << ":n_seq_id " << std::to_string(batch.n_seq_id[i])
+            << ":seq_id " << std::to_string(batch.seq_id[i][0])
+            << ":logits " << std::to_string(batch.logits[i]);
+    }
+
+    buf << " ]";
+
+    return buf.str();
+}
+
 void string_process_escapes(std::string & input) {
     std::size_t input_len = input.length();
     std::size_t output_idx = 0;
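These `string_from()` overloads render values, token lists, and whole batches as printable strings for logging. A short usage sketch, assuming an initialized `llama_context * ctx` and a tokenized prompt produced elsewhere:

```cpp
// Sketch only: using the new string_from() helpers for debug logging.
// Model loading and tokenization are assumed to have happened elsewhere.
#include "common.h"
#include "log.h"
#include <vector>

void log_prompt(llama_context * ctx, const std::vector<llama_token> & tokens) {
    // Non-printable characters are stripped from each detokenized piece,
    // so the output stays readable in the log.
    LOG_INF("prompt tokens: %s\n", string_from(ctx, tokens).c_str());

    std::vector<int> top_ids = {1, 13, 29871};
    LOG_INF("ids: %s, add_bos: %s\n",
            string_from(top_ids).c_str(),
            string_from(true).c_str());
}
```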
@@ -487,7 +592,7 @@ void string_process_escapes(std::string & input) {
 bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
     const char * sep = strchr(data, '=');
     if (sep == nullptr || sep - data >= 128) {
-
+        LOG_ERR("%s: malformed KV override '%s'\n", __func__, data);
         return false;
     }
     llama_model_kv_override kvo;
@@ -510,20 +615,20 @@ bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_over
         } else if (std::strcmp(sep, "false") == 0) {
             kvo.val_bool = false;
         } else {
-
+            LOG_ERR("%s: invalid boolean value for KV override '%s'\n", __func__, data);
             return false;
         }
     } else if (strncmp(sep, "str:", 4) == 0) {
         sep += 4;
         kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
         if (strlen(sep) > 127) {
-
+            LOG_ERR("%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data);
             return false;
         }
         strncpy(kvo.val_str, sep, 127);
         kvo.val_str[127] = '\0';
     } else {
-
+        LOG_ERR("%s: invalid type for KV override '%s'\n", __func__, data);
         return false;
     }
     overrides.emplace_back(std::move(kvo));
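For context, `string_parse_kv_override()` accepts overrides of the form `key=type:value`, with the key limited to 127 characters and string values likewise capped at 127 characters. A hedged usage sketch; the `bool:` prefix and the example keys follow upstream llama.cpp and are assumptions, only `str:` and the boolean literals are visible in the hunk above:

```cpp
// Sketch only: collecting a few KV overrides with string_parse_kv_override().
// The override keys and the bool: prefix are illustrative assumptions.
#include "common.h"
#include <vector>

bool collect_overrides(std::vector<llama_model_kv_override> & overrides) {
    const char * inputs[] = {
        "tokenizer.ggml.add_bos_token=bool:false",
        "general.name=str:my-model",
    };
    for (const char * kv : inputs) {
        if (!string_parse_kv_override(kv, overrides)) {
            return false; // the parser already logged the reason via LOG_ERR
        }
    }
    return true;
}
```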
@@ -735,7 +840,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
     }
 
     if (model == NULL) {
-
+        LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.c_str());
         return iparams;
     }
 
@@ -743,7 +848,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
 
     llama_context * lctx = llama_new_context_with_model(model, cparams);
     if (lctx == NULL) {
-
+        LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
         llama_free_model(model);
         return iparams;
     }
@@ -779,7 +884,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
             loaded_la.scale = la.scale;
             loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
             if (loaded_la.adapter == nullptr) {
-
+                LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
                 llama_free(lctx);
                 llama_free_model(model);
                 return iparams;
@@ -791,12 +896,12 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
     }
 
     if (params.sparams.ignore_eos && llama_token_eos(model) == -1) {
-
+        LOG_WRN("%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__);
         params.sparams.ignore_eos = false;
     }
 
     if (params.warmup) {
-
+        LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
 
         std::vector<llama_token> tmp;
         llama_token bos = llama_token_bos(model);
@@ -962,7 +1067,7 @@ static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_
     int remaining_attempts = max_attempts;
 
     while (remaining_attempts > 0) {
-
+        LOG_INF("%s: Trying to download from %s (attempt %d of %d)...\n", __func__ , url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);
 
         CURLcode res = curl_easy_perform(curl);
         if (res == CURLE_OK) {
@@ -970,13 +1075,14 @@ static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_
         }
 
         int exponential_backoff_delay = std::pow(retry_delay_seconds, max_attempts - remaining_attempts) * 1000;
-
+        LOG_WRN("%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay);
 
         remaining_attempts--;
         std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
     }
 
-
+    LOG_ERR("%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts);
+
     return false;
 }
 
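A note on the retry loop above: the backoff delay is `pow(retry_delay_seconds, max_attempts - remaining_attempts) * 1000` milliseconds, so assuming the upstream defaults of `CURL_MAX_RETRY = 3` and `CURL_RETRY_DELAY_SECONDS = 2` (not shown in this hunk), the waits after successive failures are 1000 ms, 2000 ms, and 4000 ms before `curl_perform_with_retry()` gives up and logs the final `LOG_ERR`.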
@@ -985,7 +1091,7 @@ static bool llama_download_file(const std::string & url, const std::string & pat
     // Initialize libcurl
     std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
     if (!curl) {
-
+        LOG_ERR("%s: error initializing libcurl\n", __func__);
         return false;
     }
 
@@ -1026,11 +1132,11 @@ static bool llama_download_file(const std::string & url, const std::string & pat
         if (metadata_in.good()) {
             try {
                 metadata_in >> metadata;
-
+                LOG_INF("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
                 if (metadata.contains("url") && metadata.at("url").is_string()) {
                     auto previous_url = metadata.at("url").get<std::string>();
                     if (previous_url != url) {
-
+                        LOG_ERR("%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str());
                         return false;
                     }
                 }
@@ -1041,12 +1147,12 @@ static bool llama_download_file(const std::string & url, const std::string & pat
                     last_modified = metadata.at("lastModified");
                 }
             } catch (const nlohmann::json::exception & e) {
-
+                LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
                 return false;
             }
         }
     } else {
-
+        LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
     }
 
     // Send a HEAD request to retrieve the etag and last-modified headers
@@ -1094,26 +1200,26 @@ static bool llama_download_file(const std::string & url, const std::string & pat
             // HEAD not supported, we don't know if the file has changed
             // force trigger downloading
             force_download = true;
-
+            LOG_ERR("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
         }
     }
 
     bool should_download = !file_exists || force_download;
     if (!should_download) {
         if (!etag.empty() && etag != headers.etag) {
-
+            LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str());
             should_download = true;
         } else if (!last_modified.empty() && last_modified != headers.last_modified) {
-
+            LOG_WRN("%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str());
             should_download = true;
         }
     }
     if (should_download) {
         std::string path_temporary = path + ".downloadInProgress";
         if (file_exists) {
-
+            LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
             if (remove(path.c_str()) != 0) {
-
+                LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
                 return false;
             }
         }
@@ -1128,7 +1234,7 @@ static bool llama_download_file(const std::string & url, const std::string & pat
 
         std::unique_ptr<FILE, FILE_deleter> outfile(fopen(path_temporary.c_str(), "wb"));
         if (!outfile) {
-
+            LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path.c_str());
             return false;
         }
 
@@ -1159,7 +1265,7 @@ static bool llama_download_file(const std::string & url, const std::string & pat
         };
 
         // start the download
-
+        LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
             llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
         bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
         if (!was_perform_successful) {
@@ -1169,7 +1275,7 @@ static bool llama_download_file(const std::string & url, const std::string & pat
         long http_code = 0;
         curl_easy_getinfo (curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
         if (http_code < 200 || http_code >= 400) {
-
+            LOG_ERR("%s: invalid http status code received: %ld\n", __func__, http_code);
             return false;
         }
 
@@ -1183,10 +1289,10 @@ static bool llama_download_file(const std::string & url, const std::string & pat
             {"lastModified", headers.last_modified}
         });
         std::ofstream(metadata_path) << metadata.dump(4);
-
+        LOG_INF("%s: file metadata saved: %s\n", __func__, metadata_path.c_str());
 
         if (rename(path_temporary.c_str(), path.c_str()) != 0) {
-
+            LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
            return false;
        }
    }
@@ -1201,7 +1307,7 @@ struct llama_model * llama_load_model_from_url(
         const struct llama_model_params & params) {
     // Basic validation of the model_url
     if (!model_url || strlen(model_url) == 0) {
-
+        LOG_ERR("%s: invalid model_url\n", __func__);
         return NULL;
     }
 
@@ -1218,7 +1324,7 @@ struct llama_model * llama_load_model_from_url(
         };
         auto * ctx_gguf = lm_gguf_init_from_file(path_model, lm_gguf_params);
         if (!ctx_gguf) {
-
+            LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, path_model);
             return NULL;
         }
 
@@ -1238,14 +1344,12 @@ struct llama_model * llama_load_model_from_url(
     // and extract split URL and PATH prefixes
     {
         if (!llama_split_prefix(split_prefix, sizeof(split_prefix), path_model, 0, n_split)) {
-
-                " n_split=%d\n", __func__, path_model, n_split);
+            LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, path_model, n_split);
             return NULL;
         }
 
         if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url, 0, n_split)) {
-
-                " n_split=%d\n", __func__, model_url, n_split);
+            LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url, n_split);
             return NULL;
         }
     }
@@ -1305,7 +1409,7 @@ struct llama_model * llama_load_model_from_url(
         const char * /*path_model*/,
         const char * /*hf_token*/,
         const struct llama_model_params & /*params*/) {
-
+    LOG_WRN("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
     return nullptr;
 }
 
@@ -1315,7 +1419,7 @@ struct llama_model * llama_load_model_from_hf(
         const char * /*path_model*/,
         const char * /*hf_token*/,
         const struct llama_model_params & /*params*/) {
-
+    LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
     return nullptr;
 }
 
@@ -1643,13 +1747,13 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr
     };
     struct lm_gguf_context * ctx_gguf = lm_gguf_init_from_file(load_info.fname.c_str(), meta_lm_gguf_params);
     if (!ctx_gguf) {
-
+        LOG_ERR("%s: failed to load control vector file from %s\n", __func__, load_info.fname.c_str());
         return result;
     }
 
     int32_t n_tensors = lm_gguf_get_n_tensors(ctx_gguf);
     if (n_tensors == 0) {
-
+        LOG_WRN("%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str());
     }
 
     for (int i = 0; i < n_tensors; i++) {
@@ -1667,23 +1771,23 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr
             }
         }
         if (layer_idx < 0) {
-
+            LOG_ERR("%s: invalid/unparsable direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
             result.n_embd = -1;
             break;
         } else if (layer_idx == 0) {
-
+            LOG_ERR("%s: invalid (zero) direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
             result.n_embd = -1;
             break;
         }
 
         struct lm_ggml_tensor * tensor = lm_ggml_get_tensor(ctx, name.c_str());
         if (tensor->type != LM_GGML_TYPE_F32) {
-
+            LOG_ERR("%s: invalid (non-F32) direction tensor type in %s\n", __func__, load_info.fname.c_str());
             result.n_embd = -1;
             break;
         }
         if (lm_ggml_n_dims(tensor) != 1) {
-
+            LOG_ERR("%s: invalid (non-1D) direction tensor shape in %s\n", __func__, load_info.fname.c_str());
             result.n_embd = -1;
             break;
         }
@@ -1691,7 +1795,7 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr
         if (result.n_embd == -1) {
             result.n_embd = lm_ggml_nelements(tensor);
         } else if (lm_ggml_nelements(tensor) != result.n_embd) {
-
+            LOG_ERR("%s: direction tensor in %s does not match previous dimensions\n", __func__, load_info.fname.c_str());
             result.n_embd = -1;
             break;
         }
@@ -1708,7 +1812,7 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr
     }
 
     if (result.n_embd == -1) {
-
+        LOG_WRN("%s: skipping %s due to invalid direction tensors\n", __func__, load_info.fname.c_str());
         result.data.clear();
     }
 
@@ -1729,7 +1833,7 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
             break;
         }
         if (result.n_embd != -1 && result.n_embd != cur.n_embd) {
-
+            LOG_ERR("%s: control vectors in %s does not match previous dimensions\n", __func__, info.fname.c_str());
             result.n_embd = -1;
             break;
         }
@@ -1745,7 +1849,7 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
     }
 
     if (result.n_embd == -1) {
-
+        LOG_ERR("%s: no valid control vector files passed\n", __func__);
         result.data.clear();
     }
 
package/cpp/common.h
CHANGED
@@ -4,11 +4,9 @@
 
 #include "llama.h"
 
-#define LOG_NO_FILE_LINE_FUNCTION
-#include "log.h"
-
 #include <string>
 #include <vector>
+#include <sstream>
 
 #ifdef _WIN32
 #define DIRECTORY_SEPARATOR '\\'
@@ -265,6 +263,7 @@ struct gpt_params {
     bool cont_batching = true; // insert new sequences for decoding on-the-fly
     bool flash_attn = false; // flash attention
     bool no_perf = false; // disable performance metrics
+    bool ctx_shift = true; // context shift on inifinite text generation
 
     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
     bool logits_all = false; // return logits for all tokens in the batch
@@ -360,6 +359,10 @@ struct gpt_params {
     bool batched_bench_output_jsonl = false;
 };
 
+// call once at the start of a program if it uses libcommon
+// initializes the logging system and prints info about the build
+void gpt_init();
+
 std::string gpt_params_get_system_info(const gpt_params & params);
 
 bool parse_cpu_range(const std::string& range, bool(&boolmask)[LM_GGML_MAX_N_THREADS]);
@@ -395,6 +398,11 @@ static std::vector<T> string_split(const std::string & str, char delim) {
 bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
 void string_process_escapes(std::string & input);
 
+std::string string_from(bool value);
+std::string string_from(const std::vector<int> & values);
+std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens);
+std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch);
+
 //
 // Filesystem utils
 //