cui-llama.rn 1.4.0 → 1.4.2
This diff shows the contents of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.
- package/README.md +4 -23
- package/android/build.gradle +12 -3
- package/android/src/main/CMakeLists.txt +13 -7
- package/android/src/main/java/com/rnllama/LlamaContext.java +27 -20
- package/android/src/main/java/com/rnllama/RNLlama.java +5 -1
- package/android/src/main/jni.cpp +15 -12
- package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
- package/cpp/README.md +1 -1
- package/cpp/common.cpp +158 -267
- package/cpp/common.h +46 -12
- package/cpp/ggml-alloc.c +1042 -1037
- package/cpp/ggml-backend-impl.h +255 -256
- package/cpp/ggml-backend-reg.cpp +582 -582
- package/cpp/ggml-backend.cpp +2002 -2002
- package/cpp/ggml-backend.h +354 -352
- package/cpp/ggml-common.h +1853 -1853
- package/cpp/ggml-cpp.h +39 -39
- package/cpp/ggml-cpu-aarch64.cpp +4247 -4247
- package/cpp/ggml-cpu-aarch64.h +8 -8
- package/cpp/ggml-cpu-impl.h +386 -386
- package/cpp/ggml-cpu-quants.c +10920 -10839
- package/cpp/ggml-cpu-traits.cpp +36 -36
- package/cpp/ggml-cpu-traits.h +38 -38
- package/cpp/ggml-cpu.c +329 -60
- package/cpp/ggml-cpu.cpp +10 -2
- package/cpp/ggml-cpu.h +135 -135
- package/cpp/ggml-impl.h +567 -567
- package/cpp/ggml-metal-impl.h +17 -17
- package/cpp/ggml-metal.m +4884 -4884
- package/cpp/ggml-quants.c +5238 -5238
- package/cpp/ggml-threading.h +14 -14
- package/cpp/ggml.c +6514 -6448
- package/cpp/ggml.h +2194 -2163
- package/cpp/gguf.cpp +1329 -1325
- package/cpp/gguf.h +202 -202
- package/cpp/json-schema-to-grammar.cpp +1045 -1045
- package/cpp/json-schema-to-grammar.h +8 -8
- package/cpp/json.hpp +24766 -24766
- package/cpp/llama-adapter.cpp +347 -346
- package/cpp/llama-adapter.h +74 -73
- package/cpp/llama-arch.cpp +1487 -1434
- package/cpp/llama-arch.h +400 -395
- package/cpp/llama-batch.cpp +368 -368
- package/cpp/llama-batch.h +88 -88
- package/cpp/llama-chat.cpp +578 -567
- package/cpp/llama-chat.h +52 -51
- package/cpp/llama-context.cpp +1775 -1771
- package/cpp/llama-context.h +128 -128
- package/cpp/llama-cparams.cpp +1 -1
- package/cpp/llama-cparams.h +37 -37
- package/cpp/llama-cpp.h +30 -30
- package/cpp/llama-grammar.cpp +1139 -1139
- package/cpp/llama-grammar.h +143 -143
- package/cpp/llama-hparams.cpp +71 -71
- package/cpp/llama-hparams.h +139 -140
- package/cpp/llama-impl.cpp +167 -167
- package/cpp/llama-impl.h +61 -61
- package/cpp/llama-kv-cache.cpp +718 -718
- package/cpp/llama-kv-cache.h +218 -218
- package/cpp/llama-mmap.cpp +2 -1
- package/cpp/llama-mmap.h +67 -67
- package/cpp/llama-model-loader.cpp +1124 -1011
- package/cpp/llama-model-loader.h +167 -158
- package/cpp/llama-model.cpp +3997 -2202
- package/cpp/llama-model.h +370 -391
- package/cpp/llama-sampling.cpp +2408 -2406
- package/cpp/llama-sampling.h +32 -48
- package/cpp/llama-vocab.cpp +3247 -1982
- package/cpp/llama-vocab.h +125 -182
- package/cpp/llama.cpp +416 -2886
- package/cpp/llama.h +1323 -1285
- package/cpp/log.cpp +401 -401
- package/cpp/log.h +121 -121
- package/cpp/rn-llama.cpp +822 -0
- package/cpp/rn-llama.h +123 -0
- package/cpp/rn-llama.hpp +18 -12
- package/cpp/sampling.cpp +505 -500
- package/cpp/sgemm.cpp +2597 -2597
- package/cpp/speculative.cpp +277 -274
- package/cpp/speculative.h +28 -28
- package/cpp/unicode.cpp +2 -3
- package/ios/CMakeLists.txt +99 -0
- package/ios/RNLlama.h +5 -1
- package/ios/RNLlama.mm +2 -2
- package/ios/RNLlamaContext.h +8 -1
- package/ios/RNLlamaContext.mm +15 -11
- package/ios/rnllama.xcframework/Info.plist +74 -0
- package/jest/mock.js +3 -2
- package/lib/commonjs/NativeRNLlama.js.map +1 -1
- package/lib/commonjs/index.js +4 -2
- package/lib/commonjs/index.js.map +1 -1
- package/lib/module/NativeRNLlama.js.map +1 -1
- package/lib/module/index.js +4 -2
- package/lib/module/index.js.map +1 -1
- package/lib/typescript/NativeRNLlama.d.ts +5 -1
- package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
- package/lib/typescript/index.d.ts.map +1 -1
- package/llama-rn.podspec +8 -2
- package/package.json +5 -2
- package/src/NativeRNLlama.ts +5 -1
- package/src/index.ts +9 -2
package/cpp/rn-llama.h
ADDED
@@ -0,0 +1,123 @@
+#ifndef RNLLAMA_H
+#define RNLLAMA_H
+
+#include <sstream>
+#include <iostream>
+#include "common.h"
+#include "ggml.h"
+#include "gguf.h"
+#include "llama.h"
+#include "llama-impl.h"
+#include "sampling.h"
+#if defined(__ANDROID__)
+#include <android/log.h>
+#endif
+
+namespace rnllama {
+
+std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token);
+
+std::string tokens_to_str(llama_context *ctx, const std::vector<llama_token>::const_iterator begin, const std::vector<llama_token>::const_iterator end);
+
+lm_ggml_type kv_cache_type_from_str(const std::string & s);
+
+enum stop_type
+{
+    STOP_FULL,
+    STOP_PARTIAL,
+};
+
+// completion token output with probabilities
+struct completion_token_output
+{
+    struct token_prob
+    {
+        llama_token tok;
+        float prob;
+    };
+
+    std::vector<token_prob> probs;
+    llama_token tok;
+};
+
+// Main context class
+struct llama_rn_context {
+    bool is_predicting = false;
+    bool is_interrupted = false;
+    bool has_next_token = false;
+    std::string generated_text;
+    std::vector<completion_token_output> generated_token_probs;
+
+    size_t num_prompt_tokens = 0;
+    size_t num_tokens_predicted = 0;
+    size_t n_past = 0;
+    size_t n_remain = 0;
+
+    std::vector<llama_token> embd;
+    common_params params;
+    common_init_result llama_init;
+
+    llama_model *model = nullptr;
+    float loading_progress = 0;
+    bool is_load_interrupted = false;
+
+    llama_context *ctx = nullptr;
+    common_sampler *ctx_sampling = nullptr;
+
+    int n_ctx;
+
+    bool truncated = false;
+    bool stopped_eos = false;
+    bool stopped_word = false;
+    bool stopped_limit = false;
+    std::string stopping_word;
+    bool incomplete = false;
+
+    std::vector<common_adapter_lora_info> lora;
+
+    ~llama_rn_context();
+
+    void rewind();
+    bool initSampling();
+    bool loadModel(common_params &params_);
+    bool validateModelChatTemplate() const;
+    void truncatePrompt(std::vector<llama_token> &prompt_tokens);
+    void loadPrompt();
+    void beginCompletion();
+    completion_token_output nextToken();
+    size_t findStoppingStrings(const std::string &text, const size_t last_token_size, const stop_type type);
+    completion_token_output doCompletion();
+    std::vector<float> getEmbedding(common_params &embd_params);
+    std::string bench(int pp, int tg, int pl, int nr);
+    int applyLoraAdapters(std::vector<common_adapter_lora_info> lora);
+    void removeLoraAdapters();
+    std::vector<common_adapter_lora_info> getLoadedLoraAdapters();
+    std::vector<int> longest_common_subseq(const std::vector<int> x, const std::vector<int> y);
+    bool arr_start_with(const std::vector<int> targetArray, const std::vector<int> searchSeq);
+    int arr_find_index_of(const std::vector<int> targetArray, const std::vector<int> searchSeq);
+    void purge_missing_tokens(llama_context * ctx, std::vector<int> &current_context_tokens, std::vector<int> &new_context_tokens, const int genamt, const int nctx);
+};
+
+// Logging macros
+extern bool rnllama_verbose;
+
+#if RNLLAMA_VERBOSE != 1
+#define LOG_VERBOSE(MSG, ...)
+#else
+#define LOG_VERBOSE(MSG, ...)                                       \
+    do                                                              \
+    {                                                               \
+        if (rnllama_verbose)                                        \
+        {                                                           \
+            log("VERBOSE", __func__, __LINE__, MSG, ##__VA_ARGS__); \
+        }                                                           \
+    } while (0)
+#endif
+
+#define LOG_ERROR(MSG, ...) log("ERROR", __func__, __LINE__, MSG, ##__VA_ARGS__)
+#define LOG_WARNING(MSG, ...) log("WARNING", __func__, __LINE__, MSG, ##__VA_ARGS__)
+#define LOG_INFO(MSG, ...) log("INFO", __func__, __LINE__, MSG, ##__VA_ARGS__)
+
+} // namespace rnllama
+
+#endif /* RNLLAMA_H */
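As a rough orientation for the newly split-out `rn-llama.h`/`rn-llama.cpp` pair, a minimal native driver could look like the sketch below. This is only a sketch under assumptions: it presumes `rn-llama.cpp` and the bundled llama.cpp sources are compiled into the same target, the model path is hypothetical, and the `common_params` field names (`model`, `prompt`, `n_predict`) follow the bundled `common.h` and may differ across llama.cpp vintages.

```cpp
// Hypothetical driver for the rnllama::llama_rn_context API declared above.
// Assumes rn-llama.cpp and the bundled llama.cpp sources are linked in.
#include <iostream>
#include "rn-llama.h"

int main() {
    rnllama::llama_rn_context rn_ctx;

    common_params params;
    params.model     = "/path/to/model.gguf"; // hypothetical path (assumed std::string field)
    params.prompt    = "Hello";
    params.n_predict = 32;

    // loadModel() copies params into the context and initializes model + ctx.
    if (!rn_ctx.loadModel(params)) {
        return 1;
    }

    rn_ctx.rewind();
    rn_ctx.initSampling();
    rn_ctx.loadPrompt();
    rn_ctx.beginCompletion();

    // doCompletion() samples one token at a time and appends to generated_text.
    while (rn_ctx.has_next_token && !rn_ctx.is_interrupted) {
        rn_ctx.doCompletion();
    }

    std::cout << rn_ctx.generated_text << std::endl;
    return 0;
}
```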
package/cpp/rn-llama.hpp
CHANGED
@@ -219,7 +219,7 @@ struct llama_rn_context
     std::string stopping_word;
     bool incomplete = false;
 
-    std::vector<
+    std::vector<common_adapter_lora_info> lora;
 
     ~llama_rn_context()
     {
@@ -279,7 +279,7 @@ struct llama_rn_context
 
     bool validateModelChatTemplate() const {
         llama_chat_message chat[] = {{"user", "test"}};
-        int32_t chat_res = llama_chat_apply_template(model,
+        int32_t chat_res = llama_chat_apply_template(llama_model_chat_template(model), chat, 1, true, nullptr, 0);
         return chat_res > 0;
     }
 
@@ -307,7 +307,7 @@ struct llama_rn_context
 
     void loadPrompt()
     {
-        std::vector<llama_token> prompt_tokens = ::common_tokenize(model, params.prompt, true, true);
+        std::vector<llama_token> prompt_tokens = ::common_tokenize(llama_model_get_vocab(model), params.prompt, true, true);
         num_prompt_tokens = prompt_tokens.size();
 
         // LOG tokens
@@ -439,14 +439,14 @@ struct llama_rn_context
         if (params.n_predict == 0)
         {
             has_next_token = false;
-            result.tok =
+            result.tok = llama_vocab_eos(llama_model_get_vocab(model));
             return result;
         }
 
         {
            // out of user input, sample next token
            std::vector<llama_token_data> candidates;
-           candidates.reserve(
+           candidates.reserve(llama_vocab_n_tokens(llama_model_get_vocab(model)));
 
            result.tok = common_sampler_sample(ctx_sampling, ctx, -1);
 
@@ -479,7 +479,7 @@ struct llama_rn_context
        // decrement remaining sampling budget
        --n_remain;
 
-       if (!embd.empty() && embd.back() ==
+       if (!embd.empty() && embd.back() == llama_vocab_eos(llama_model_get_vocab(model)))
        {
            // stopping_word = llama_token_to_piece(ctx, embd.back());
            has_next_token = false;
@@ -584,7 +584,7 @@ struct llama_rn_context
 
    std::vector<float> getEmbedding(common_params &embd_params)
    {
-       static const int n_embd =
+       static const int n_embd = llama_model_n_embd(llama_get_model(ctx));
        if (!embd_params.embedding)
        {
            LOG_WARNING("embedding disabled, embedding: %s", embd_params.embedding);
@@ -716,25 +716,31 @@ struct llama_rn_context
            std::string("]");
    }
 
-   int applyLoraAdapters(std::vector<
+   int applyLoraAdapters(std::vector<common_adapter_lora_info> lora) {
        for (auto &la : lora) {
-           la.ptr =
+           la.ptr = llama_adapter_lora_init(model, la.path.c_str());
            if (la.ptr == nullptr) {
                LOG_ERROR("failed to apply lora adapter '%s'\n", la.path.c_str());
                return -1;
            }
        }
        this->lora = lora;
-
+       for (auto &la : lora) {
+           llama_set_adapter_lora(ctx, la.ptr, 1);
+       }
+
        return 0;
    }
 
    void removeLoraAdapters() {
+       for (auto &la : this->lora) {
+           llama_adapter_lora_free(la.ptr);
+       }
        this->lora.clear();
-
+       llama_clear_adapter_lora(ctx);
    }
 
-   std::vector<
+   std::vector<common_adapter_lora_info> getLoadedLoraAdapters() {
        return this->lora;
    }
    // Context Shifting from KoboldCpp <https://github.com/LostRuins/koboldcpp>
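The LoRA hunks above track the renamed adapter API in upstream llama.cpp (`llama_adapter_lora_init`, `llama_set_adapter_lora`, `llama_adapter_lora_free`, `llama_clear_adapter_lora`). A minimal sketch of that call pattern outside the class, assuming an already-initialized `llama_model *` and `llama_context *` and a hypothetical adapter path:

```cpp
// Sketch of the renamed LoRA adapter calls; not the package's own wrapper code.
#include "llama.h"

static bool attach_lora(llama_model * model, llama_context * ctx) {
    // Load the adapter weights against the model; the path is hypothetical.
    llama_adapter_lora * adapter = llama_adapter_lora_init(model, "/path/to/adapter.gguf");
    if (adapter == nullptr) {
        return false; // failed to load the adapter file
    }
    // Attach to the context with full strength; scale is a float in the new API.
    llama_set_adapter_lora(ctx, adapter, 1.0f);
    return true;
}

static void detach_all_lora(llama_context * ctx) {
    // Detaches every adapter from the context; the adapters themselves still
    // need llama_adapter_lora_free() once they are no longer used anywhere.
    llama_clear_adapter_lora(ctx);
}
```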
|