cui-llama.rn 1.3.5 → 1.3.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/CMakeLists.txt +14 -8
- package/android/src/main/jni.cpp +38 -37
- package/cpp/common.cpp +43 -26
- package/cpp/common.h +18 -11
- package/cpp/ggml-backend-reg.cpp +5 -0
- package/cpp/ggml-backend.cpp +5 -2
- package/cpp/ggml-cpp.h +1 -0
- package/cpp/ggml-cpu-aarch64.cpp +6 -1
- package/cpp/ggml-cpu-quants.c +5 -1
- package/cpp/ggml-impl.h +11 -16
- package/cpp/ggml-metal.m +2 -2
- package/cpp/ggml.c +0 -1276
- package/cpp/ggml.h +0 -140
- package/cpp/gguf.cpp +1325 -0
- package/cpp/gguf.h +202 -0
- package/cpp/llama-adapter.cpp +346 -0
- package/cpp/llama-adapter.h +73 -0
- package/cpp/llama-arch.cpp +1434 -0
- package/cpp/llama-arch.h +395 -0
- package/cpp/llama-batch.cpp +368 -0
- package/cpp/llama-batch.h +88 -0
- package/cpp/llama-chat.cpp +567 -0
- package/cpp/llama-chat.h +51 -0
- package/cpp/llama-context.cpp +1771 -0
- package/cpp/llama-context.h +128 -0
- package/cpp/llama-cparams.cpp +1 -0
- package/cpp/llama-cparams.h +37 -0
- package/cpp/llama-cpp.h +30 -0
- package/cpp/llama-grammar.cpp +1 -0
- package/cpp/llama-grammar.h +3 -1
- package/cpp/llama-hparams.cpp +71 -0
- package/cpp/llama-hparams.h +140 -0
- package/cpp/llama-impl.cpp +167 -0
- package/cpp/llama-impl.h +16 -136
- package/cpp/llama-kv-cache.cpp +718 -0
- package/cpp/llama-kv-cache.h +218 -0
- package/cpp/llama-mmap.cpp +589 -0
- package/cpp/llama-mmap.h +67 -0
- package/cpp/llama-model-loader.cpp +1011 -0
- package/cpp/llama-model-loader.h +158 -0
- package/cpp/llama-model.cpp +2202 -0
- package/cpp/llama-model.h +391 -0
- package/cpp/llama-sampling.cpp +117 -4
- package/cpp/llama-vocab.cpp +21 -28
- package/cpp/llama-vocab.h +13 -1
- package/cpp/llama.cpp +8437 -19421
- package/cpp/llama.cpp.rej +23 -0
- package/cpp/llama.h +31 -6
- package/cpp/rn-llama.hpp +39 -37
- package/cpp/sgemm.cpp +776 -70
- package/cpp/unicode.cpp +6 -0
- package/package.json +1 -1
package/android/src/main/CMakeLists.txt
CHANGED
@@ -9,17 +9,23 @@ include_directories(${RNLLAMA_LIB_DIR})

  set(
  SOURCE_FILES
- ${RNLLAMA_LIB_DIR}/llama-grammar.cpp
- ${RNLLAMA_LIB_DIR}/llama-sampling.cpp
- ${RNLLAMA_LIB_DIR}/llama-vocab.cpp
- ${RNLLAMA_LIB_DIR}/log.cpp
-
- #${RNLLAMA_LIB_DIR}/amx/amx.cpp
- #${RNLLAMA_LIB_DIR}/amx/mmq.cpp

+ ${RNLLAMA_LIB_DIR}/common.cpp
  ${RNLLAMA_LIB_DIR}/llama-grammar.cpp
  ${RNLLAMA_LIB_DIR}/llama-sampling.cpp
  ${RNLLAMA_LIB_DIR}/llama-vocab.cpp
+ ${RNLLAMA_LIB_DIR}/llama-chat.cpp
+ ${RNLLAMA_LIB_DIR}/llama-mmap.cpp
+ ${RNLLAMA_LIB_DIR}/llama-context.cpp
+ ${RNLLAMA_LIB_DIR}/llama-kv-cache.cpp
+ ${RNLLAMA_LIB_DIR}/llama-model-loader.cpp
+ ${RNLLAMA_LIB_DIR}/llama-model.cpp
+ ${RNLLAMA_LIB_DIR}/llama-batch.cpp
+ ${RNLLAMA_LIB_DIR}/llama-arch.cpp
+ ${RNLLAMA_LIB_DIR}/llama-cparams.cpp
+ ${RNLLAMA_LIB_DIR}/llama-hparams.cpp
+ ${RNLLAMA_LIB_DIR}/llama-adapter.cpp
+ ${RNLLAMA_LIB_DIR}/llama-impl.cpp
  ${RNLLAMA_LIB_DIR}/log.cpp
  ${RNLLAMA_LIB_DIR}/json.hpp
  ${RNLLAMA_LIB_DIR}/json-schema-to-grammar.cpp
@@ -28,6 +34,7 @@ set(
  ${RNLLAMA_LIB_DIR}/ggml-backend.cpp
  ${RNLLAMA_LIB_DIR}/ggml-backend-reg.cpp
  ${RNLLAMA_LIB_DIR}/ggml.c
+ ${RNLLAMA_LIB_DIR}/gguf.cpp
  ${RNLLAMA_LIB_DIR}/ggml-cpu.c
  ${RNLLAMA_LIB_DIR}/ggml-cpu.cpp
  ${RNLLAMA_LIB_DIR}/ggml-cpu-aarch64.cpp
@@ -35,7 +42,6 @@ set(
  ${RNLLAMA_LIB_DIR}/ggml-cpu-quants.c
  ${RNLLAMA_LIB_DIR}/ggml-threading.cpp
  ${RNLLAMA_LIB_DIR}/ggml-quants.c
- ${RNLLAMA_LIB_DIR}/common.cpp
  ${RNLLAMA_LIB_DIR}/sampling.cpp
  ${RNLLAMA_LIB_DIR}/unicode-data.cpp
  ${RNLLAMA_LIB_DIR}/unicode.cpp
package/android/src/main/jni.cpp
CHANGED
@@ -11,7 +11,8 @@
  #include <unordered_map>
  #include "llama.h"
  #include "llama-impl.h"
- #include "
+ #include "llama-context.h"
+ #include "gguf.h"
  #include "rn-llama.hpp"

  #define UNUSED(x) (void)(x)
@@ -336,17 +337,17 @@ Java_com_rnllama_LlamaContext_initContext(

  LOGI("[RNLlama] is_model_loaded %s", (is_model_loaded ? "true" : "false"));
  if (is_model_loaded) {
- if (embedding && llama_model_has_encoder(llama->model) && llama_model_has_decoder(llama->model)) {
+ if (embedding && llama_model_has_encoder(llama->model.get()) && llama_model_has_decoder(llama->model.get())) {
  LOGI("[RNLlama] computing embeddings in encoder-decoder models is not supported");
- llama_free(llama->ctx);
+ llama_free(llama->ctx.get());
  return -1;
  }
- context_map[(long) llama->ctx] = llama;
+ context_map[(long) llama->ctx.get()] = llama;
  } else {
- llama_free(llama->ctx);
+ llama_free(llama->ctx.get());
  }

- return reinterpret_cast<jlong>(llama->ctx);
+ return reinterpret_cast<jlong>(llama->ctx.get());
  }


@@ -372,13 +373,13 @@ Java_com_rnllama_LlamaContext_loadModelDetails(
  UNUSED(thiz);
  auto llama = context_map[(long) context_ptr];

- int count = llama_model_meta_count(llama->model);
+ int count = llama_model_meta_count(llama->model.get());
  auto meta = createWriteableMap(env);
  for (int i = 0; i < count; i++) {
  char key[256];
- llama_model_meta_key_by_index(llama->model, i, key, sizeof(key));
+ llama_model_meta_key_by_index(llama->model.get(), i, key, sizeof(key));
  char val[2048];
- llama_model_meta_val_str_by_index(llama->model, i, val, sizeof(val));
+ llama_model_meta_val_str_by_index(llama->model.get(), i, val, sizeof(val));

  putString(env, meta, key, val);
  }
@@ -386,10 +387,10 @@ Java_com_rnllama_LlamaContext_loadModelDetails(
  auto result = createWriteableMap(env);

  char desc[1024];
- llama_model_desc(llama->model, desc, sizeof(desc));
+ llama_model_desc(llama->model.get(), desc, sizeof(desc));
  putString(env, result, "desc", desc);
- putDouble(env, result, "size", llama_model_size(llama->model));
- putDouble(env, result, "nParams", llama_model_n_params(llama->model));
+ putDouble(env, result, "size", llama_model_size(llama->model.get()));
+ putDouble(env, result, "nParams", llama_model_n_params(llama->model.get()));
  putBoolean(env, result, "isChatTemplateSupported", llama->validateModelChatTemplate());
  putMap(env, result, "metadata", meta);

@@ -431,7 +432,7 @@ Java_com_rnllama_LlamaContext_getFormattedChat(
  }

  const char *tmpl_chars = env->GetStringUTFChars(chat_template, nullptr);
- std::string formatted_chat = common_chat_apply_template(llama->model, tmpl_chars, chat, true);
+ std::string formatted_chat = common_chat_apply_template(llama->model.get(), tmpl_chars, chat, true);

  return env->NewStringUTF(formatted_chat.c_str());
  }
@@ -450,7 +451,7 @@ Java_com_rnllama_LlamaContext_loadSession(
  auto result = createWriteableMap(env);
  size_t n_token_count_out = 0;
  llama->embd.resize(llama->params.n_ctx);
- if (!llama_state_load_file(llama->ctx, path_chars, llama->embd.data(), llama->embd.capacity(), &n_token_count_out)) {
+ if (!llama_state_load_file(llama->ctx.get(), path_chars, llama->embd.data(), llama->embd.capacity(), &n_token_count_out)) {
  env->ReleaseStringUTFChars(path, path_chars);

  putString(env, result, "error", "Failed to load session");
@@ -459,7 +460,7 @@ Java_com_rnllama_LlamaContext_loadSession(
  llama->embd.resize(n_token_count_out);
  env->ReleaseStringUTFChars(path, path_chars);

- const std::string text = rnllama::tokens_to_str(llama->ctx, llama->embd.cbegin(), llama->embd.cend());
+ const std::string text = rnllama::tokens_to_str(llama->ctx.get(), llama->embd.cbegin(), llama->embd.cend());
  putInt(env, result, "tokens_loaded", n_token_count_out);
  putString(env, result, "prompt", text.c_str());
  return reinterpret_cast<jobject>(result);
@@ -481,7 +482,7 @@ Java_com_rnllama_LlamaContext_saveSession(
  std::vector<llama_token> session_tokens = llama->embd;
  int default_size = session_tokens.size();
  int save_size = size > 0 && size <= default_size ? size : default_size;
- if (!llama_state_save_file(llama->ctx, path_chars, session_tokens.data(), save_size)) {
+ if (!llama_state_save_file(llama->ctx.get(), path_chars, session_tokens.data(), save_size)) {
  env->ReleaseStringUTFChars(path, path_chars);
  return -1;
  }
@@ -499,13 +500,13 @@ static inline jobject tokenProbsToMap(
  for (const auto &prob : probs) {
  auto probsForToken = createWritableArray(env);
  for (const auto &p : prob.probs) {
- std::string tokStr = rnllama::tokens_to_output_formatted_string(llama->ctx, p.tok);
+ std::string tokStr = rnllama::tokens_to_output_formatted_string(llama->ctx.get(), p.tok);
  auto probResult = createWriteableMap(env);
  putString(env, probResult, "tok_str", tokStr.c_str());
  putDouble(env, probResult, "prob", p.prob);
  pushMap(env, probsForToken, probResult);
  }
- std::string tokStr = rnllama::tokens_to_output_formatted_string(llama->ctx, prob.tok);
+ std::string tokStr = rnllama::tokens_to_output_formatted_string(llama->ctx.get(), prob.tok);
  auto tokenResult = createWriteableMap(env);
  putString(env, tokenResult, "content", tokStr.c_str());
  putArray(env, tokenResult, "probs", probsForToken);
@@ -555,7 +556,7 @@ Java_com_rnllama_LlamaContext_doCompletion(

  llama->rewind();

- //llama_reset_timings(llama->ctx);
+ //llama_reset_timings(llama->ctx.get());

  llama->params.prompt = env->GetStringUTFChars(prompt, nullptr);
  llama->params.sampling.seed = (seed == -1) ? time(NULL) : seed;
@@ -593,7 +594,7 @@ Java_com_rnllama_LlamaContext_doCompletion(

  sparams.logit_bias.clear();
  if (ignore_eos) {
- sparams.logit_bias[llama_token_eos(llama->model)].bias = -INFINITY;
+ sparams.logit_bias[llama_token_eos(llama->model.get())].bias = -INFINITY;
  }

  // dry break seq
@@ -612,7 +613,7 @@ Java_com_rnllama_LlamaContext_doCompletion(
  sparams.dry_sequence_breakers = dry_sequence_breakers_vector;

  // logit bias
- const int n_vocab = llama_n_vocab(llama_get_model(llama->ctx));
+ const int n_vocab = llama_n_vocab(llama_get_model(llama->ctx.get()));
  jsize logit_bias_len = env->GetArrayLength(logit_bias);

  for (jsize i = 0; i < logit_bias_len; i++) {
@@ -659,7 +660,7 @@ Java_com_rnllama_LlamaContext_doCompletion(
  if (token_with_probs.tok == -1 || llama->incomplete) {
  continue;
  }
- const std::string token_text = common_token_to_piece(llama->ctx, token_with_probs.tok);
+ const std::string token_text = common_token_to_piece(llama->ctx.get(), token_with_probs.tok);

  size_t pos = std::min(sent_count, llama->generated_text.size());

@@ -694,7 +695,7 @@ Java_com_rnllama_LlamaContext_doCompletion(
  putString(env, tokenResult, "token", to_send.c_str());

  if (llama->params.sampling.n_probs > 0) {
- const std::vector<llama_token> to_send_toks = common_tokenize(llama->ctx, to_send, false);
+ const std::vector<llama_token> to_send_toks = common_tokenize(llama->ctx.get(), to_send, false);
  size_t probs_pos = std::min(sent_token_probs_index, llama->generated_token_probs.size());
  size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama->generated_token_probs.size());
  if (probs_pos < probs_stop_pos) {
@@ -711,7 +712,7 @@ Java_com_rnllama_LlamaContext_doCompletion(
  }
  }

- llama_perf_context_print(llama->ctx);
+ llama_perf_context_print(llama->ctx.get());
  llama->is_predicting = false;

  auto result = createWriteableMap(env);
@@ -726,7 +727,7 @@ Java_com_rnllama_LlamaContext_doCompletion(
  putString(env, result, "stopping_word", llama->stopping_word.c_str());
  putInt(env, result, "tokens_cached", llama->n_past);

- const auto timings_token = llama_perf_context(llama -> ctx);
+ const auto timings_token = llama_perf_context(llama -> ctx.get());

  auto timingsResult = createWriteableMap(env);
  putInt(env, timingsResult, "prompt_n", timings_token.n_p_eval);
@@ -770,7 +771,7 @@ Java_com_rnllama_LlamaContext_tokenize(
  const char *text_chars = env->GetStringUTFChars(text, nullptr);

  const std::vector<llama_token> toks = common_tokenize(
- llama->ctx,
+ llama->ctx.get(),
  text_chars,
  false
  );
@@ -797,7 +798,7 @@ Java_com_rnllama_LlamaContext_detokenize(
  toks.push_back(tokens_ptr[i]);
  }

- auto text = rnllama::tokens_to_str(llama->ctx, toks.cbegin(), toks.cend());
+ auto text = rnllama::tokens_to_str(llama->ctx.get(), toks.cbegin(), toks.cend());

  env->ReleaseIntArrayElements(tokens, tokens_ptr, 0);

@@ -834,7 +835,7 @@ Java_com_rnllama_LlamaContext_embedding(

  llama->rewind();

- llama_perf_context_reset(llama->ctx);
+ llama_perf_context_reset(llama->ctx.get());

  llama->params.prompt = text_chars;

@@ -860,7 +861,7 @@ Java_com_rnllama_LlamaContext_embedding(

  auto promptTokens = createWritableArray(env);
  for (const auto &tok : llama->embd) {
- pushString(env, promptTokens, common_token_to_piece(llama->ctx, tok).c_str());
+ pushString(env, promptTokens, common_token_to_piece(llama->ctx.get(), tok).c_str());
  }
  putArray(env, result, "prompt_tokens", promptTokens);

@@ -890,17 +891,17 @@ Java_com_rnllama_LlamaContext_freeContext(
  UNUSED(env);
  UNUSED(thiz);
  auto llama = context_map[(long) context_ptr];
- if (llama->model) {
-
+ if (llama->model.get()) {
+ llama_model_free(llama->model.get());
  }
- if (llama->ctx) {
- llama_free(llama->ctx);
+ if (llama->ctx.get()) {
+ llama_free(llama->ctx.get());
  }
- if (llama->
+ /*if (llama->ctx.get()-> != nullptr)
  {
- common_sampler_free(llama->
- }
- context_map.erase((long) llama->ctx);
+ common_sampler_free(llama->ctx.get() -> _sampling);
+ }*/
+ context_map.erase((long) llama->ctx.get());
  }

  JNIEXPORT void JNICALL
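The jni.cpp changes above all follow one pattern: llama->model and llama->ctx are now smart-pointer wrappers (the llama_model_ptr / llama_context_ptr aliases from llama-cpp.h, see the common.h diff below), so every call into the llama.cpp C API unwraps them with .get(). A minimal sketch of that boundary; the "session" holder below is illustrative and is not the struct rn-llama.hpp actually defines:

    // Sketch only: rn-llama.hpp's real context struct differs.
    #include "llama.h"
    #include "llama-cpp.h"   // llama_model_ptr / llama_context_ptr smart-pointer aliases

    struct session {
        llama_model_ptr   model;  // releases the model when the session is destroyed
        llama_context_ptr ctx;    // releases the context when the session is destroyed
    };

    static int metadata_count(const session & s) {
        // the C API still takes raw pointers, so unwrap at the call site
        return llama_model_meta_count(s.model.get());
    }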
package/cpp/common.cpp
CHANGED
@@ -2,6 +2,9 @@
  #define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
  #endif

+ #include "ggml.h"
+ #include "gguf.h"
+
  #include "common.h"
  #include "log.h"
  // Change JSON_ASSERT from assert() to LM_GGML_ASSERT:
@@ -18,6 +21,7 @@
  #include <cstdarg>
  #include <cstring>
  #include <ctime>
+ #include <filesystem>
  #include <fstream>
  #include <iostream>
  #include <iterator>
@@ -68,7 +72,9 @@ char const *LLAMA_BUILD_TARGET = "unknown";
  #ifdef __linux__
  #include <linux/limits.h>
  #elif defined(_WIN32)
- #
+ # if !defined(PATH_MAX)
+ # define PATH_MAX MAX_PATH
+ # endif
  #else
  #include <sys/syslimits.h>
  #endif
@@ -849,7 +855,7 @@ struct common_init_result common_init_from_params(common_params & params) {
  } else if (!params.model_url.empty()) {
  model = common_load_model_from_url(params.model_url, params.model, params.hf_token, mparams);
  } else {
- model =
+ model = llama_model_load_from_file(params.model.c_str(), mparams);
  }

  if (model == NULL) {
@@ -876,7 +882,7 @@ struct common_init_result common_init_from_params(common_params & params) {
  }

  if (!ok) {
-
+ llama_model_free(model);

  return iparams;
  }
@@ -887,14 +893,13 @@ struct common_init_result common_init_from_params(common_params & params) {
  llama_context * lctx = llama_new_context_with_model(model, cparams);
  if (lctx == NULL) {
  LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
-
+ llama_model_free(model);
  return iparams;
  }

  if (params.ctx_shift && !llama_kv_cache_can_shift(lctx)) {
-
-
- return iparams;
+ LOG_WRN("%s: KV cache shifting is not supported for this model, disabling KV cache shifting\n", __func__);
+ params.ctx_shift = false;
  }

  if (!params.control_vectors.empty()) {
@@ -904,7 +909,7 @@ struct common_init_result common_init_from_params(common_params & params) {
  const auto cvec = common_control_vector_load(params.control_vectors);
  if (cvec.n_embd == -1) {
  llama_free(lctx);
-
+ llama_model_free(model);

  return iparams;
  }
@@ -917,7 +922,7 @@ struct common_init_result common_init_from_params(common_params & params) {
  params.control_vector_layer_end);
  if (err) {
  llama_free(lctx);
-
+ llama_model_free(model);

  return iparams;
  }
@@ -925,20 +930,21 @@ struct common_init_result common_init_from_params(common_params & params) {

  // load and optionally apply lora adapters
  for (auto & la : params.lora_adapters) {
-
-
-
- loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
- if (loaded_la.adapter == nullptr) {
+ llama_lora_adapter_ptr lora;
+ lora.reset(llama_lora_adapter_init(model, la.path.c_str()));
+ if (lora == nullptr) {
  LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
  llama_free(lctx);
-
+ llama_model_free(model);
  return iparams;
  }
-
+
+ la.ptr = lora.get();
+ iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
  }
+
  if (!params.lora_init_without_apply) {
- common_lora_adapters_apply(lctx,
+ common_lora_adapters_apply(lctx, params.lora_adapters);
  }

  if (params.sampling.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
@@ -985,7 +991,7 @@ struct common_init_result common_init_from_params(common_params & params) {
  if (llama_model_has_encoder(model)) {
  llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size()));
  llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
- if (decoder_start_token_id ==
+ if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
  decoder_start_token_id = bos;
  }
  tmp.clear();
@@ -999,17 +1005,17 @@ struct common_init_result common_init_from_params(common_params & params) {
  llama_perf_context_reset(lctx);
  }

- iparams.model
- iparams.context
+ iparams.model.reset(model);
+ iparams.context.reset(lctx);

  return iparams;
  }

- void common_lora_adapters_apply(struct llama_context * ctx, std::vector<
+ void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_info> & lora) {
  llama_lora_adapter_clear(ctx);
- for (auto & la :
+ for (auto & la : lora) {
  if (la.scale != 0.0f) {
- llama_lora_adapter_set(ctx, la.
+ llama_lora_adapter_set(ctx, la.ptr, la.scale);
  }
  }
  }
@@ -1158,8 +1164,7 @@ static bool common_download_file(const std::string & url, const std::string & pa
  #endif

  // Check if the file already exists locally
-
- auto file_exists = (stat(path.c_str(), &model_file_info) == 0);
+ auto file_exists = std::filesystem::exists(path);

  // If the file exists, check its JSON metadata companion file.
  std::string metadata_path = path + ".json";
@@ -1419,7 +1424,7 @@ struct llama_model * common_load_model_from_url(
  }
  }

- return
+ return llama_model_load_from_file(local_path.c_str(), params);
  }

  struct llama_model * common_load_model_from_hf(
@@ -1622,6 +1627,18 @@ std::string common_detokenize(llama_context * ctx, const std::vector<llama_token
  // Chat template utils
  //

+ std::string common_get_builtin_chat_template(const struct llama_model * model) {
+ static const char * template_key = "tokenizer.chat_template";
+ // call with NULL buffer to get the total size of the string
+ int32_t res = llama_model_meta_val_str(model, template_key, NULL, 0);
+ if (res > 0) {
+ std::vector<char> model_template(res + 1, 0);
+ llama_model_meta_val_str(model, template_key, model_template.data(), model_template.size());
+ return std::string(model_template.data(), model_template.size() - 1);
+ }
+ return "";
+ }
+
  bool common_chat_verify_template(const std::string & tmpl) {
  llama_chat_message chat[] = {{"user", "test"}};
  int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
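The new common_get_builtin_chat_template() reads the tokenizer.chat_template key from the model metadata with the usual two-pass llama_model_meta_val_str() call: a NULL buffer first to learn the length, then a second call to fill it. A rough usage sketch, assuming a valid llama_model pointer obtained elsewhere:

    // Sketch only: how the model pointer is obtained is elided.
    #include "common.h"

    static bool has_builtin_template(const llama_model * model) {
        const std::string tmpl = common_get_builtin_chat_template(model);
        // an empty string means the GGUF metadata carries no tokenizer.chat_template entry
        return !tmpl.empty();
    }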
package/cpp/common.h
CHANGED
@@ -2,7 +2,7 @@

  #pragma once

- #include "llama.h"
+ #include "llama-cpp.h"

  #include <string>
  #include <vector>
@@ -27,10 +27,8 @@
  struct common_lora_adapter_info {
  std::string path;
  float scale;
- };

- struct
- struct llama_lora_adapter * adapter;
+ struct llama_lora_adapter * ptr;
  };

  using llama_tokens = std::vector<llama_token>;
@@ -493,10 +491,12 @@ std::string fs_get_cache_file(const std::string & filename);
  // Model utils
  //

+ // note: defines object's lifetime
  struct common_init_result {
-
-
-
+ llama_model_ptr model;
+ llama_context_ptr context;
+
+ std::vector<llama_lora_adapter_ptr> lora;
  };

  struct common_init_result common_init_from_params(common_params & params);
@@ -518,7 +518,7 @@ struct llama_model * common_load_model_from_hf(
  const struct llama_model_params & params);

  // clear LoRA adapters from context, then apply new list of adapters
- void common_lora_adapters_apply(struct llama_context * ctx, std::vector<
+ void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_info> & lora);

  //
  // Batch utils
@@ -586,6 +586,9 @@ struct common_chat_msg {
  std::string content;
  };

+ // Get the built-in chat template for the model. Return empty string if not present.
+ std::string common_get_builtin_chat_template(const struct llama_model * model);
+
  // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
  bool common_chat_verify_template(const std::string & tmpl);

@@ -652,6 +655,10 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
  // Split utils
  //

-
-
-
+ namespace {
+
+ const char * const LLM_KV_SPLIT_NO = "split.no";
+ const char * const LLM_KV_SPLIT_COUNT = "split.count";
+ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
+
+ }
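With common_init_result now holding llama_model_ptr, llama_context_ptr, and a vector of llama_lora_adapter_ptr, callers no longer free the model and context by hand; the result object owns them. A hedged sketch of what a call site looks like under this API (error handling trimmed, parameter setup assumed to happen elsewhere):

    // Sketch only: assumes params has already been filled in by the caller.
    #include "common.h"

    static void run_once(common_params & params) {
        common_init_result init = common_init_from_params(params);
        llama_model   * model = init.model.get();
        llama_context * lctx  = init.context.get();
        if (model == nullptr || lctx == nullptr) {
            return; // load failed; nothing to clean up manually
        }
        // ... use model / lctx through the raw pointers ...
    }   // init goes out of scope here; its smart pointers release the adapters, context, and model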
package/cpp/ggml-backend-reg.cpp
CHANGED
@@ -574,4 +574,9 @@ void lm_ggml_backend_load_all_from_path(const char * dir_path) {
  lm_ggml_backend_load_best("opencl", silent, dir_path);
  lm_ggml_backend_load_best("musa", silent, dir_path);
  lm_ggml_backend_load_best("cpu", silent, dir_path);
+ // check the environment variable LM_GGML_BACKEND_PATH to load an out-of-tree backend
+ const char * backend_path = std::getenv("LM_GGML_BACKEND_PATH");
+ if (backend_path) {
+ lm_ggml_backend_load(backend_path);
+ }
  }
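The added lines let an out-of-tree backend be pulled in through the LM_GGML_BACKEND_PATH environment variable on top of the built-in search. A hedged sketch of how a host process might use it; the library path is a placeholder, the header name and the use of a null dir_path (to keep the default search locations, as in upstream ggml) are assumptions:

    // Sketch only: POSIX setenv; the .so path below is illustrative.
    #include <cstdlib>
    #include "ggml-backend.h"

    static void load_backends_with_override(void) {
        setenv("LM_GGML_BACKEND_PATH", "/data/local/tmp/libggml-custom-backend.so", 1);
        lm_ggml_backend_load_all_from_path(nullptr); // default search plus the override above
    }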
package/cpp/ggml-backend.cpp
CHANGED
@@ -764,7 +764,7 @@ static int lm_ggml_backend_sched_backend_id_from_cur(lm_ggml_backend_sched_t sch
  if (tensor->op != LM_GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == LM_GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
  int src_backend_id = lm_ggml_backend_sched_backend_from_buffer(sched, src, tensor);
  // check if a backend with higher prio wants to offload the op
- if (src_backend_id == sched->n_backends - 1) {
+ if (src_backend_id == sched->n_backends - 1 && lm_ggml_backend_buffer_is_host(src->buffer)) {
  for (int b = 0; b < src_backend_id; b++) {
  if (lm_ggml_backend_supports_op(sched->backends[b], tensor) && lm_ggml_backend_offload_op(sched->backends[b], tensor)) {
  SET_CAUSE(tensor, "1.off");
@@ -795,9 +795,12 @@ static void lm_ggml_backend_sched_print_assignments(lm_ggml_backend_sched_t sche
  for (int i = 0; i < graph->n_nodes; i++) {
  if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
  lm_ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
- LM_GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs
+ LM_GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs", cur_split, lm_ggml_backend_name(split_backend),
  sched->splits[cur_split].n_inputs);
  for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
+ if (j == 0) {
+ LM_GGML_LOG_DEBUG(": ");
+ }
  LM_GGML_LOG_DEBUG("[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
  fmt_size(lm_ggml_nbytes(sched->splits[cur_split].inputs[j])));
  }
package/cpp/ggml-cpp.h
CHANGED
package/cpp/ggml-cpu-aarch64.cpp
CHANGED
@@ -194,9 +194,12 @@ static inline __m256i sum_i16_pairs_int32x8(const __m256i x) {
  }

  static inline __m256i mul_sum_us8_pairs_int32x8(const __m256i ax, const __m256i sy) {
- #if defined(
+ #if defined(__AVX512VNNI__) && defined(__AVX512VL__)
  const __m256i zero = _mm256_setzero_si256();
  return _mm256_dpbusd_epi32(zero, ax, sy);
+ #elif defined(__AVXVNNI__)
+ const __m256i zero = _mm256_setzero_si256();
+ return _mm256_dpbusd_avx_epi32(zero, ax, sy);
  #else
  // Perform multiplication and create 16-bit values
  const __m256i dot = _mm256_maddubs_epi16(ax, sy);
@@ -4166,6 +4169,8 @@ static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_aarch64_buffer_type_alloc_bu
  buffer->buft = buft;
  buffer->iface.init_tensor = lm_ggml_backend_cpu_aarch64_buffer_init_tensor;
  buffer->iface.set_tensor = lm_ggml_backend_cpu_aarch64_buffer_set_tensor;
+ buffer->iface.get_tensor = nullptr;
+ buffer->iface.cpy_tensor = nullptr;
  return buffer;
  }

package/cpp/ggml-cpu-quants.c
CHANGED
@@ -103,10 +103,14 @@ static inline __m256 sum_i16_pairs_float(const __m256i x) {
  }

  static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
- #if defined(
+ #if defined(__AVX512VNNI__) && defined(__AVX512VL__)
  const __m256i zero = _mm256_setzero_si256();
  const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);
  return _mm256_cvtepi32_ps(summed_pairs);
+ #elif defined(__AVXVNNI__)
+ const __m256i zero = _mm256_setzero_si256();
+ const __m256i summed_pairs = _mm256_dpbusd_avx_epi32(zero, ax, sy);
+ return _mm256_cvtepi32_ps(summed_pairs);
  #else
  // Perform multiplication and create 16-bit values
  const __m256i dot = _mm256_maddubs_epi16(ax, sy);
package/cpp/ggml-impl.h
CHANGED
@@ -3,6 +3,8 @@
  // GGML internal header

  #include "ggml.h"
+ #include "gguf.h"
+
  #include <assert.h>
  #include <math.h>
  #include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
@@ -551,22 +553,15 @@ static inline lm_ggml_bf16_t lm_ggml_compute_fp32_to_bf16(float s) {
  #define LM_GGML_FP32_TO_BF16(x) lm_ggml_compute_fp32_to_bf16(x)
  #define LM_GGML_BF16_TO_FP32(x) lm_ggml_compute_bf16_to_fp32(x)

- // expose GGUF internals for test code
-
- LM_GGML_API size_t lm_gguf_type_size(enum lm_gguf_type type);
-
- LM_GGML_API struct lm_gguf_context * lm_gguf_init_from_file_impl(FILE * file, struct lm_gguf_init_params params);
-
- struct lm_gguf_buf {
- void * data;
- size_t size;
- size_t offset;
- };
- LM_GGML_API struct lm_gguf_buf lm_gguf_buf_init(size_t size);
- LM_GGML_API void lm_gguf_buf_free(struct lm_gguf_buf buf);
-
- LM_GGML_API void lm_gguf_write_to_buf(const struct lm_gguf_context * ctx, struct lm_gguf_buf * buf, bool only_meta);
-
  #ifdef __cplusplus
  }
  #endif
+
+ #ifdef __cplusplus
+ #include <vector>
+
+ // expose GGUF internals for test code
+ LM_GGML_API size_t lm_gguf_type_size(enum lm_gguf_type type);
+ LM_GGML_API struct lm_gguf_context * lm_gguf_init_from_file_impl(FILE * file, struct lm_gguf_init_params params);
+ LM_GGML_API void lm_gguf_write_to_buf(const struct lm_gguf_context * ctx, std::vector<int8_t> & buf, bool only_meta);
+ #endif // __cplusplus
package/cpp/ggml-metal.m
CHANGED
@@ -2067,8 +2067,8 @@ static void lm_ggml_metal_encode_node(
  LM_GGML_ASSERT(ne12 % ne02 == 0);
  LM_GGML_ASSERT(ne13 % ne03 == 0);

- const
- const
+ const uint32_t r2 = ne12/ne02;
+ const uint32_t r3 = ne13/ne03;

  // find the break-even point where the matrix-matrix kernel becomes more efficient compared
  // to the matrix-vector kernel