cui-llama.rn 1.3.4 → 1.3.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/CMakeLists.txt +14 -8
- package/android/src/main/jni.cpp +38 -37
- package/cpp/common.cpp +50 -30
- package/cpp/common.h +32 -13
- package/cpp/ggml-alloc.c +0 -1
- package/cpp/ggml-backend-reg.cpp +79 -49
- package/cpp/ggml-backend.cpp +5 -2
- package/cpp/ggml-cpp.h +1 -0
- package/cpp/ggml-cpu-aarch64.cpp +57 -72
- package/cpp/ggml-cpu-quants.c +5 -1
- package/cpp/ggml-cpu.c +6 -6
- package/cpp/ggml-cpu.cpp +9 -0
- package/cpp/ggml-impl.h +11 -0
- package/cpp/ggml-metal.m +2 -2
- package/cpp/ggml.c +129 -1388
- package/cpp/ggml.h +29 -152
- package/cpp/gguf.cpp +1325 -0
- package/cpp/gguf.h +202 -0
- package/cpp/llama-adapter.cpp +346 -0
- package/cpp/llama-adapter.h +73 -0
- package/cpp/llama-arch.cpp +1434 -0
- package/cpp/llama-arch.h +395 -0
- package/cpp/llama-batch.cpp +368 -0
- package/cpp/llama-batch.h +88 -0
- package/cpp/llama-chat.cpp +567 -0
- package/cpp/llama-chat.h +51 -0
- package/cpp/llama-context.cpp +1771 -0
- package/cpp/llama-context.h +128 -0
- package/cpp/llama-cparams.cpp +1 -0
- package/cpp/llama-cparams.h +37 -0
- package/cpp/llama-cpp.h +30 -0
- package/cpp/llama-grammar.cpp +16 -15
- package/cpp/llama-grammar.h +5 -6
- package/cpp/llama-hparams.cpp +71 -0
- package/cpp/llama-hparams.h +140 -0
- package/cpp/llama-impl.cpp +167 -0
- package/cpp/llama-impl.h +16 -136
- package/cpp/llama-kv-cache.cpp +718 -0
- package/cpp/llama-kv-cache.h +218 -0
- package/cpp/llama-mmap.cpp +589 -0
- package/cpp/llama-mmap.h +67 -0
- package/cpp/llama-model-loader.cpp +1011 -0
- package/cpp/llama-model-loader.h +158 -0
- package/cpp/llama-model.cpp +2202 -0
- package/cpp/llama-model.h +391 -0
- package/cpp/llama-sampling.cpp +117 -4
- package/cpp/llama-vocab.cpp +26 -29
- package/cpp/llama-vocab.h +14 -2
- package/cpp/llama.cpp +8839 -19131
- package/cpp/llama.cpp.rej +23 -0
- package/cpp/llama.h +31 -9
- package/cpp/rn-llama.hpp +39 -37
- package/cpp/sgemm.cpp +1091 -378
- package/cpp/sgemm.h +2 -2
- package/cpp/unicode.cpp +6 -0
- package/package.json +1 -1
package/android/src/main/CMakeLists.txt CHANGED
@@ -9,17 +9,23 @@ include_directories(${RNLLAMA_LIB_DIR})

 set(
 SOURCE_FILES
-${RNLLAMA_LIB_DIR}/llama-grammar.cpp
-${RNLLAMA_LIB_DIR}/llama-sampling.cpp
-${RNLLAMA_LIB_DIR}/llama-vocab.cpp
-${RNLLAMA_LIB_DIR}/log.cpp
-
-#${RNLLAMA_LIB_DIR}/amx/amx.cpp
-#${RNLLAMA_LIB_DIR}/amx/mmq.cpp

+${RNLLAMA_LIB_DIR}/common.cpp
 ${RNLLAMA_LIB_DIR}/llama-grammar.cpp
 ${RNLLAMA_LIB_DIR}/llama-sampling.cpp
 ${RNLLAMA_LIB_DIR}/llama-vocab.cpp
+${RNLLAMA_LIB_DIR}/llama-chat.cpp
+${RNLLAMA_LIB_DIR}/llama-mmap.cpp
+${RNLLAMA_LIB_DIR}/llama-context.cpp
+${RNLLAMA_LIB_DIR}/llama-kv-cache.cpp
+${RNLLAMA_LIB_DIR}/llama-model-loader.cpp
+${RNLLAMA_LIB_DIR}/llama-model.cpp
+${RNLLAMA_LIB_DIR}/llama-batch.cpp
+${RNLLAMA_LIB_DIR}/llama-arch.cpp
+${RNLLAMA_LIB_DIR}/llama-cparams.cpp
+${RNLLAMA_LIB_DIR}/llama-hparams.cpp
+${RNLLAMA_LIB_DIR}/llama-adapter.cpp
+${RNLLAMA_LIB_DIR}/llama-impl.cpp
 ${RNLLAMA_LIB_DIR}/log.cpp
 ${RNLLAMA_LIB_DIR}/json.hpp
 ${RNLLAMA_LIB_DIR}/json-schema-to-grammar.cpp
@@ -28,6 +34,7 @@ set(
 ${RNLLAMA_LIB_DIR}/ggml-backend.cpp
 ${RNLLAMA_LIB_DIR}/ggml-backend-reg.cpp
 ${RNLLAMA_LIB_DIR}/ggml.c
+${RNLLAMA_LIB_DIR}/gguf.cpp
 ${RNLLAMA_LIB_DIR}/ggml-cpu.c
 ${RNLLAMA_LIB_DIR}/ggml-cpu.cpp
 ${RNLLAMA_LIB_DIR}/ggml-cpu-aarch64.cpp
@@ -35,7 +42,6 @@ set(
 ${RNLLAMA_LIB_DIR}/ggml-cpu-quants.c
 ${RNLLAMA_LIB_DIR}/ggml-threading.cpp
 ${RNLLAMA_LIB_DIR}/ggml-quants.c
-${RNLLAMA_LIB_DIR}/common.cpp
 ${RNLLAMA_LIB_DIR}/sampling.cpp
 ${RNLLAMA_LIB_DIR}/unicode-data.cpp
 ${RNLLAMA_LIB_DIR}/unicode.cpp
package/android/src/main/jni.cpp CHANGED
@@ -11,7 +11,8 @@
 #include <unordered_map>
 #include "llama.h"
 #include "llama-impl.h"
-#include "
+#include "llama-context.h"
+#include "gguf.h"
 #include "rn-llama.hpp"

 #define UNUSED(x) (void)(x)
@@ -336,17 +337,17 @@ Java_com_rnllama_LlamaContext_initContext(

 LOGI("[RNLlama] is_model_loaded %s", (is_model_loaded ? "true" : "false"));
 if (is_model_loaded) {
-if (embedding && llama_model_has_encoder(llama->model) && llama_model_has_decoder(llama->model)) {
+if (embedding && llama_model_has_encoder(llama->model.get()) && llama_model_has_decoder(llama->model.get())) {
 LOGI("[RNLlama] computing embeddings in encoder-decoder models is not supported");
-llama_free(llama->ctx);
+llama_free(llama->ctx.get());
 return -1;
 }
-context_map[(long) llama->ctx] = llama;
+context_map[(long) llama->ctx.get()] = llama;
 } else {
-llama_free(llama->ctx);
+llama_free(llama->ctx.get());
 }

-return reinterpret_cast<jlong>(llama->ctx);
+return reinterpret_cast<jlong>(llama->ctx.get());
 }


@@ -372,13 +373,13 @@ Java_com_rnllama_LlamaContext_loadModelDetails(
 UNUSED(thiz);
 auto llama = context_map[(long) context_ptr];

-int count = llama_model_meta_count(llama->model);
+int count = llama_model_meta_count(llama->model.get());
 auto meta = createWriteableMap(env);
 for (int i = 0; i < count; i++) {
 char key[256];
-llama_model_meta_key_by_index(llama->model, i, key, sizeof(key));
+llama_model_meta_key_by_index(llama->model.get(), i, key, sizeof(key));
 char val[2048];
-llama_model_meta_val_str_by_index(llama->model, i, val, sizeof(val));
+llama_model_meta_val_str_by_index(llama->model.get(), i, val, sizeof(val));

 putString(env, meta, key, val);
 }
@@ -386,10 +387,10 @@ Java_com_rnllama_LlamaContext_loadModelDetails(
 auto result = createWriteableMap(env);

 char desc[1024];
-llama_model_desc(llama->model, desc, sizeof(desc));
+llama_model_desc(llama->model.get(), desc, sizeof(desc));
 putString(env, result, "desc", desc);
-putDouble(env, result, "size", llama_model_size(llama->model));
-putDouble(env, result, "nParams", llama_model_n_params(llama->model));
+putDouble(env, result, "size", llama_model_size(llama->model.get()));
+putDouble(env, result, "nParams", llama_model_n_params(llama->model.get()));
 putBoolean(env, result, "isChatTemplateSupported", llama->validateModelChatTemplate());
 putMap(env, result, "metadata", meta);

@@ -431,7 +432,7 @@ Java_com_rnllama_LlamaContext_getFormattedChat(
 }

 const char *tmpl_chars = env->GetStringUTFChars(chat_template, nullptr);
-std::string formatted_chat = common_chat_apply_template(llama->model, tmpl_chars, chat, true);
+std::string formatted_chat = common_chat_apply_template(llama->model.get(), tmpl_chars, chat, true);

 return env->NewStringUTF(formatted_chat.c_str());
 }
@@ -450,7 +451,7 @@ Java_com_rnllama_LlamaContext_loadSession(
 auto result = createWriteableMap(env);
 size_t n_token_count_out = 0;
 llama->embd.resize(llama->params.n_ctx);
-if (!llama_state_load_file(llama->ctx, path_chars, llama->embd.data(), llama->embd.capacity(), &n_token_count_out)) {
+if (!llama_state_load_file(llama->ctx.get(), path_chars, llama->embd.data(), llama->embd.capacity(), &n_token_count_out)) {
 env->ReleaseStringUTFChars(path, path_chars);

 putString(env, result, "error", "Failed to load session");
@@ -459,7 +460,7 @@ Java_com_rnllama_LlamaContext_loadSession(
 llama->embd.resize(n_token_count_out);
 env->ReleaseStringUTFChars(path, path_chars);

-const std::string text = rnllama::tokens_to_str(llama->ctx, llama->embd.cbegin(), llama->embd.cend());
+const std::string text = rnllama::tokens_to_str(llama->ctx.get(), llama->embd.cbegin(), llama->embd.cend());
 putInt(env, result, "tokens_loaded", n_token_count_out);
 putString(env, result, "prompt", text.c_str());
 return reinterpret_cast<jobject>(result);
@@ -481,7 +482,7 @@ Java_com_rnllama_LlamaContext_saveSession(
 std::vector<llama_token> session_tokens = llama->embd;
 int default_size = session_tokens.size();
 int save_size = size > 0 && size <= default_size ? size : default_size;
-if (!llama_state_save_file(llama->ctx, path_chars, session_tokens.data(), save_size)) {
+if (!llama_state_save_file(llama->ctx.get(), path_chars, session_tokens.data(), save_size)) {
 env->ReleaseStringUTFChars(path, path_chars);
 return -1;
 }
@@ -499,13 +500,13 @@ static inline jobject tokenProbsToMap(
 for (const auto &prob : probs) {
 auto probsForToken = createWritableArray(env);
 for (const auto &p : prob.probs) {
-std::string tokStr = rnllama::tokens_to_output_formatted_string(llama->ctx, p.tok);
+std::string tokStr = rnllama::tokens_to_output_formatted_string(llama->ctx.get(), p.tok);
 auto probResult = createWriteableMap(env);
 putString(env, probResult, "tok_str", tokStr.c_str());
 putDouble(env, probResult, "prob", p.prob);
 pushMap(env, probsForToken, probResult);
 }
-std::string tokStr = rnllama::tokens_to_output_formatted_string(llama->ctx, prob.tok);
+std::string tokStr = rnllama::tokens_to_output_formatted_string(llama->ctx.get(), prob.tok);
 auto tokenResult = createWriteableMap(env);
 putString(env, tokenResult, "content", tokStr.c_str());
 putArray(env, tokenResult, "probs", probsForToken);
@@ -555,7 +556,7 @@ Java_com_rnllama_LlamaContext_doCompletion(

 llama->rewind();

-//llama_reset_timings(llama->ctx);
+//llama_reset_timings(llama->ctx.get());

 llama->params.prompt = env->GetStringUTFChars(prompt, nullptr);
 llama->params.sampling.seed = (seed == -1) ? time(NULL) : seed;
@@ -593,7 +594,7 @@ Java_com_rnllama_LlamaContext_doCompletion(

 sparams.logit_bias.clear();
 if (ignore_eos) {
-sparams.logit_bias[llama_token_eos(llama->model)].bias = -INFINITY;
+sparams.logit_bias[llama_token_eos(llama->model.get())].bias = -INFINITY;
 }

 // dry break seq
@@ -612,7 +613,7 @@ Java_com_rnllama_LlamaContext_doCompletion(
 sparams.dry_sequence_breakers = dry_sequence_breakers_vector;

 // logit bias
-const int n_vocab = llama_n_vocab(llama_get_model(llama->ctx));
+const int n_vocab = llama_n_vocab(llama_get_model(llama->ctx.get()));
 jsize logit_bias_len = env->GetArrayLength(logit_bias);

 for (jsize i = 0; i < logit_bias_len; i++) {
@@ -659,7 +660,7 @@ Java_com_rnllama_LlamaContext_doCompletion(
 if (token_with_probs.tok == -1 || llama->incomplete) {
 continue;
 }
-const std::string token_text = common_token_to_piece(llama->ctx, token_with_probs.tok);
+const std::string token_text = common_token_to_piece(llama->ctx.get(), token_with_probs.tok);

 size_t pos = std::min(sent_count, llama->generated_text.size());

@@ -694,7 +695,7 @@ Java_com_rnllama_LlamaContext_doCompletion(
 putString(env, tokenResult, "token", to_send.c_str());

 if (llama->params.sampling.n_probs > 0) {
-const std::vector<llama_token> to_send_toks = common_tokenize(llama->ctx, to_send, false);
+const std::vector<llama_token> to_send_toks = common_tokenize(llama->ctx.get(), to_send, false);
 size_t probs_pos = std::min(sent_token_probs_index, llama->generated_token_probs.size());
 size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama->generated_token_probs.size());
 if (probs_pos < probs_stop_pos) {
@@ -711,7 +712,7 @@ Java_com_rnllama_LlamaContext_doCompletion(
 }
 }

-llama_perf_context_print(llama->ctx);
+llama_perf_context_print(llama->ctx.get());
 llama->is_predicting = false;

 auto result = createWriteableMap(env);
@@ -726,7 +727,7 @@ Java_com_rnllama_LlamaContext_doCompletion(
 putString(env, result, "stopping_word", llama->stopping_word.c_str());
 putInt(env, result, "tokens_cached", llama->n_past);

-const auto timings_token = llama_perf_context(llama -> ctx);
+const auto timings_token = llama_perf_context(llama -> ctx.get());

 auto timingsResult = createWriteableMap(env);
 putInt(env, timingsResult, "prompt_n", timings_token.n_p_eval);
@@ -770,7 +771,7 @@ Java_com_rnllama_LlamaContext_tokenize(
 const char *text_chars = env->GetStringUTFChars(text, nullptr);

 const std::vector<llama_token> toks = common_tokenize(
-llama->ctx,
+llama->ctx.get(),
 text_chars,
 false
 );
@@ -797,7 +798,7 @@ Java_com_rnllama_LlamaContext_detokenize(
 toks.push_back(tokens_ptr[i]);
 }

-auto text = rnllama::tokens_to_str(llama->ctx, toks.cbegin(), toks.cend());
+auto text = rnllama::tokens_to_str(llama->ctx.get(), toks.cbegin(), toks.cend());

 env->ReleaseIntArrayElements(tokens, tokens_ptr, 0);

@@ -834,7 +835,7 @@ Java_com_rnllama_LlamaContext_embedding(

 llama->rewind();

-llama_perf_context_reset(llama->ctx);
+llama_perf_context_reset(llama->ctx.get());

 llama->params.prompt = text_chars;

@@ -860,7 +861,7 @@ Java_com_rnllama_LlamaContext_embedding(

 auto promptTokens = createWritableArray(env);
 for (const auto &tok : llama->embd) {
-pushString(env, promptTokens, common_token_to_piece(llama->ctx, tok).c_str());
+pushString(env, promptTokens, common_token_to_piece(llama->ctx.get(), tok).c_str());
 }
 putArray(env, result, "prompt_tokens", promptTokens);

@@ -890,17 +891,17 @@ Java_com_rnllama_LlamaContext_freeContext(
 UNUSED(env);
 UNUSED(thiz);
 auto llama = context_map[(long) context_ptr];
-if (llama->model) {
-
+if (llama->model.get()) {
+llama_model_free(llama->model.get());
 }
-if (llama->ctx) {
-llama_free(llama->ctx);
+if (llama->ctx.get()) {
+llama_free(llama->ctx.get());
 }
-if (llama->
+/*if (llama->ctx.get()-> != nullptr)
 {
-common_sampler_free(llama->
-}
-context_map.erase((long) llama->ctx);
+common_sampler_free(llama->ctx.get() -> _sampling);
+}*/
+context_map.erase((long) llama->ctx.get());
 }

 JNIEXPORT void JNICALL
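Note: the recurring edit in jni.cpp is mechanical. The rn-llama context appears to now hold its llama.cpp objects through the smart-pointer wrappers shipped in the new llama-cpp.h (llama_model_ptr / llama_context_ptr), so every call into the C API passes the raw pointer via .get(). A minimal sketch of that pattern, assuming those wrapper types; the holder struct and helper below are hypothetical names for illustration, not the actual rn-llama.hpp definitions:

    #include "llama.h"
    #include "llama-cpp.h" // llama_model_ptr / llama_context_ptr (unique_ptr aliases with llama deleters)

    struct rn_ctx_sketch {            // hypothetical holder, stands in for the rn-llama context
        llama_model_ptr   model;      // owns the llama_model
        llama_context_ptr ctx;        // owns the llama_context
    };

    static int vocab_size(const rn_ctx_sketch & rn) {
        // the llama C API still takes raw pointers, hence .get(), exactly as in the hunks above
        return llama_n_vocab(llama_get_model(rn.ctx.get()));
    }

The raw pointer is only borrowed; ownership stays with the wrapper object.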
package/cpp/common.cpp CHANGED
@@ -2,6 +2,9 @@
 #define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
 #endif

+#include "ggml.h"
+#include "gguf.h"
+
 #include "common.h"
 #include "log.h"
 // Change JSON_ASSERT from assert() to LM_GGML_ASSERT:
@@ -18,6 +21,7 @@
 #include <cstdarg>
 #include <cstring>
 #include <ctime>
+#include <filesystem>
 #include <fstream>
 #include <iostream>
 #include <iterator>
@@ -68,7 +72,9 @@ char const *LLAMA_BUILD_TARGET = "unknown";
 #ifdef __linux__
 #include <linux/limits.h>
 #elif defined(_WIN32)
-#
+# if !defined(PATH_MAX)
+# define PATH_MAX MAX_PATH
+# endif
 #else
 #include <sys/syslimits.h>
 #endif
@@ -849,7 +855,7 @@ struct common_init_result common_init_from_params(common_params & params) {
 } else if (!params.model_url.empty()) {
 model = common_load_model_from_url(params.model_url, params.model, params.hf_token, mparams);
 } else {
-model =
+model = llama_model_load_from_file(params.model.c_str(), mparams);
 }

 if (model == NULL) {
@@ -876,7 +882,7 @@ struct common_init_result common_init_from_params(common_params & params) {
 }

 if (!ok) {
-
+llama_model_free(model);

 return iparams;
 }
@@ -887,14 +893,13 @@ struct common_init_result common_init_from_params(common_params & params) {
 llama_context * lctx = llama_new_context_with_model(model, cparams);
 if (lctx == NULL) {
 LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
-
+llama_model_free(model);
 return iparams;
 }

 if (params.ctx_shift && !llama_kv_cache_can_shift(lctx)) {
-
-
-return iparams;
+LOG_WRN("%s: KV cache shifting is not supported for this model, disabling KV cache shifting\n", __func__);
+params.ctx_shift = false;
 }

 if (!params.control_vectors.empty()) {
@@ -904,7 +909,7 @@ struct common_init_result common_init_from_params(common_params & params) {
 const auto cvec = common_control_vector_load(params.control_vectors);
 if (cvec.n_embd == -1) {
 llama_free(lctx);
-
+llama_model_free(model);

 return iparams;
 }
@@ -917,7 +922,7 @@ struct common_init_result common_init_from_params(common_params & params) {
 params.control_vector_layer_end);
 if (err) {
 llama_free(lctx);
-
+llama_model_free(model);

 return iparams;
 }
@@ -925,20 +930,21 @@ struct common_init_result common_init_from_params(common_params & params) {

 // load and optionally apply lora adapters
 for (auto & la : params.lora_adapters) {
-
-
-
-loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
-if (loaded_la.adapter == nullptr) {
+llama_lora_adapter_ptr lora;
+lora.reset(llama_lora_adapter_init(model, la.path.c_str()));
+if (lora == nullptr) {
 LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
 llama_free(lctx);
-
+llama_model_free(model);
 return iparams;
 }
-
+
+la.ptr = lora.get();
+iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
 }
+
 if (!params.lora_init_without_apply) {
-common_lora_adapters_apply(lctx,
+common_lora_adapters_apply(lctx, params.lora_adapters);
 }

 if (params.sampling.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
@@ -985,7 +991,7 @@ struct common_init_result common_init_from_params(common_params & params) {
 if (llama_model_has_encoder(model)) {
 llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size()));
 llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
-if (decoder_start_token_id ==
+if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
 decoder_start_token_id = bos;
 }
 tmp.clear();
@@ -999,17 +1005,17 @@ struct common_init_result common_init_from_params(common_params & params) {
 llama_perf_context_reset(lctx);
 }

-iparams.model
-iparams.context
+iparams.model.reset(model);
+iparams.context.reset(lctx);

 return iparams;
 }

-void common_lora_adapters_apply(struct llama_context * ctx, std::vector<
+void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_info> & lora) {
 llama_lora_adapter_clear(ctx);
-for (auto & la :
+for (auto & la : lora) {
 if (la.scale != 0.0f) {
-llama_lora_adapter_set(ctx, la.
+llama_lora_adapter_set(ctx, la.ptr, la.scale);
 }
 }
 }
@@ -1105,7 +1111,7 @@ struct lm_ggml_threadpool_params lm_ggml_threadpool_params_from_cpu_params(const
 #define CURL_MAX_RETRY 3
 #define CURL_RETRY_DELAY_SECONDS 2

-static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_attempts, int retry_delay_seconds) {
+static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds) {
 int remaining_attempts = max_attempts;

 while (remaining_attempts > 0) {
@@ -1129,7 +1135,6 @@ static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_
 }

 static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
-
 // Initialize libcurl
 std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
 if (!curl) {
@@ -1159,8 +1164,7 @@ static bool common_download_file(const std::string & url, const std::string & pa
 #endif

 // Check if the file already exists locally
-
-auto file_exists = (stat(path.c_str(), &model_file_info) == 0);
+auto file_exists = std::filesystem::exists(path);

 // If the file exists, check its JSON metadata companion file.
 std::string metadata_path = path + ".json";
@@ -1202,11 +1206,13 @@ static bool common_download_file(const std::string & url, const std::string & pa
 std::string etag;
 std::string last_modified;
 };
+
 common_load_model_from_url_headers headers;
+
 {
 typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
 auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
-common_load_model_from_url_headers *headers = (common_load_model_from_url_headers *) userdata;
+common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;

 static std::regex header_regex("([^:]+): (.*)\r\n");
 static std::regex etag_regex("ETag", std::regex_constants::icase);
@@ -1418,7 +1424,7 @@ struct llama_model * common_load_model_from_url(
 }
 }

-return
+return llama_model_load_from_file(local_path.c_str(), params);
 }

 struct llama_model * common_load_model_from_hf(
@@ -1621,6 +1627,18 @@ std::string common_detokenize(llama_context * ctx, const std::vector<llama_token
 // Chat template utils
 //

+std::string common_get_builtin_chat_template(const struct llama_model * model) {
+static const char * template_key = "tokenizer.chat_template";
+// call with NULL buffer to get the total size of the string
+int32_t res = llama_model_meta_val_str(model, template_key, NULL, 0);
+if (res > 0) {
+std::vector<char> model_template(res + 1, 0);
+llama_model_meta_val_str(model, template_key, model_template.data(), model_template.size());
+return std::string(model_template.data(), model_template.size() - 1);
+}
+return "";
+}
+
 bool common_chat_verify_template(const std::string & tmpl) {
 llama_chat_message chat[] = {{"user", "test"}};
 int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
@@ -1790,7 +1808,9 @@ void common_embd_normalize(const float * inp, float * out, int n, int embd_norm)
 break;
 case 0: // max absolute
 for (int i = 0; i < n; i++) {
-if (sum < std::abs(inp[i]))
+if (sum < std::abs(inp[i])) {
+sum = std::abs(inp[i]);
+}
 }
 sum /= 32760.0; // make an int16 range
 break;
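Note: common_get_builtin_chat_template(), added in the hunk above, reads the tokenizer.chat_template key from the model's GGUF metadata and returns an empty string when the key is absent. A hedged usage sketch that combines it with the existing common_chat_verify_template(); the function and variable names below (pick_chat_template, user_template) are hypothetical, not part of the package:

    #include <string>
    #include "common.h"

    // Prefer a caller-supplied template when it passes verification, otherwise
    // fall back to the template embedded in the model metadata (may be "").
    static std::string pick_chat_template(const llama_model * model, const std::string & user_template) {
        if (!user_template.empty() && common_chat_verify_template(user_template)) {
            return user_template;
        }
        return common_get_builtin_chat_template(model);
    }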
package/cpp/common.h CHANGED
@@ -2,7 +2,7 @@

 #pragma once

-#include "llama.h"
+#include "llama-cpp.h"

 #include <string>
 #include <vector>
@@ -27,10 +27,8 @@
 struct common_lora_adapter_info {
 std::string path;
 float scale;
-};

-struct
-struct llama_lora_adapter * adapter;
+struct llama_lora_adapter * ptr;
 };

 using llama_tokens = std::vector<llama_token>;
@@ -91,6 +89,7 @@ enum llama_example {
 LLAMA_EXAMPLE_LLAVA,
 LLAMA_EXAMPLE_LOOKUP,
 LLAMA_EXAMPLE_PARALLEL,
+LLAMA_EXAMPLE_TTS,

 LLAMA_EXAMPLE_COUNT,
 };
@@ -170,6 +169,7 @@ struct common_params_sampling {

 struct common_params_speculative {
 std::vector<lm_ggml_backend_dev_t> devices; // devices to use for offloading
+
 int32_t n_ctx = 0; // draft context size
 int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
 int32_t n_min = 5; // minimum number of draft tokens to use for speculative decoding
@@ -183,6 +183,14 @@ struct common_params_speculative {
 std::string model = ""; // draft model for speculative decoding // NOLINT
 };

+struct common_params_vocoder {
+std::string hf_repo = ""; // HF repo // NOLINT
+std::string hf_file = ""; // HF file // NOLINT
+
+std::string model = ""; // model path // NOLINT
+std::string model_url = ""; // model url to download // NOLINT
+};
+
 struct common_params {

 void * progress_callback_user_data = nullptr;
@@ -229,8 +237,9 @@ struct common_params {
 enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
 enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings

-struct common_params_sampling
+struct common_params_sampling sampling;
 struct common_params_speculative speculative;
+struct common_params_vocoder vocoder;

 std::string model = ""; // model path // NOLINT
 std::string model_alias = ""; // model alias // NOLINT
@@ -482,10 +491,12 @@ std::string fs_get_cache_file(const std::string & filename);
 // Model utils
 //

+// note: defines object's lifetime
 struct common_init_result {
-
-
-
+llama_model_ptr model;
+llama_context_ptr context;
+
+std::vector<llama_lora_adapter_ptr> lora;
 };

 struct common_init_result common_init_from_params(common_params & params);
@@ -507,7 +518,7 @@ struct llama_model * common_load_model_from_hf(
 const struct llama_model_params & params);

 // clear LoRA adapters from context, then apply new list of adapters
-void common_lora_adapters_apply(struct llama_context * ctx, std::vector<
+void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_info> & lora);

 //
 // Batch utils
@@ -575,6 +586,9 @@ struct common_chat_msg {
 std::string content;
 };

+// Get the built-in chat template for the model. Return empty string if not present.
+std::string common_get_builtin_chat_template(const struct llama_model * model);
+
 // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
 bool common_chat_verify_template(const std::string & tmpl);

@@ -611,7 +625,8 @@ void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_si
 // Embedding utils
 //

-
+// TODO: repace embd_norm with an enum
+void common_embd_normalize(const float * inp, float * out, int n, int embd_norm);

 float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);

@@ -640,6 +655,10 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
 // Split utils
 //

-
-
-
+namespace {
+
+const char * const LLM_KV_SPLIT_NO = "split.no";
+const char * const LLM_KV_SPLIT_COUNT = "split.count";
+const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
+
+}
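Note: common_init_result now owns the loaded model, context and LoRA adapters through llama_model_ptr, llama_context_ptr and llama_lora_adapter_ptr (the unique_ptr wrappers from llama-cpp.h), which is what the "note: defines object's lifetime" comment refers to. A minimal consumption sketch under that assumption; run_once is a hypothetical function and the common_params setup is elided:

    #include "common.h"

    static void run_once(common_params & params) {
        common_init_result init = common_init_from_params(params);

        // raw pointers are only borrowed; ownership stays with `init`
        llama_model   * model = init.model.get();
        llama_context * lctx  = init.context.get();
        if (model == nullptr || lctx == nullptr) {
            return; // loading failed; nothing to release by hand
        }

        // example borrows: query vocab/context size through the C API
        const auto n_vocab = llama_n_vocab(model);
        const auto n_ctx   = llama_n_ctx(lctx);
        (void) n_vocab; (void) n_ctx;
    }   // `init` going out of scope releases the adapters, then the context, then the model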
package/cpp/ggml-alloc.c CHANGED