cui-llama.rn 1.4.3 → 1.4.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +93 -114
- package/android/src/main/CMakeLists.txt +5 -0
- package/android/src/main/java/com/rnllama/LlamaContext.java +91 -17
- package/android/src/main/java/com/rnllama/RNLlama.java +37 -4
- package/android/src/main/jni-utils.h +6 -0
- package/android/src/main/jni.cpp +289 -31
- package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
- package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +7 -2
- package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +7 -2
- package/cpp/chat-template.hpp +529 -0
- package/cpp/chat.cpp +1779 -0
- package/cpp/chat.h +135 -0
- package/cpp/common.cpp +2064 -1873
- package/cpp/common.h +700 -699
- package/cpp/ggml-alloc.c +1039 -1042
- package/cpp/ggml-alloc.h +1 -1
- package/cpp/ggml-backend-impl.h +255 -255
- package/cpp/ggml-backend-reg.cpp +586 -582
- package/cpp/ggml-backend.cpp +2004 -2002
- package/cpp/ggml-backend.h +354 -354
- package/cpp/ggml-common.h +1851 -1853
- package/cpp/ggml-cpp.h +39 -39
- package/cpp/ggml-cpu-aarch64.cpp +4248 -4247
- package/cpp/ggml-cpu-aarch64.h +8 -8
- package/cpp/ggml-cpu-impl.h +531 -386
- package/cpp/ggml-cpu-quants.c +12527 -10920
- package/cpp/ggml-cpu-traits.cpp +36 -36
- package/cpp/ggml-cpu-traits.h +38 -38
- package/cpp/ggml-cpu.c +15766 -14391
- package/cpp/ggml-cpu.cpp +655 -635
- package/cpp/ggml-cpu.h +138 -135
- package/cpp/ggml-impl.h +567 -567
- package/cpp/ggml-metal-impl.h +235 -0
- package/cpp/ggml-metal.h +1 -1
- package/cpp/ggml-metal.m +5146 -4884
- package/cpp/ggml-opt.cpp +854 -854
- package/cpp/ggml-opt.h +216 -216
- package/cpp/ggml-quants.c +5238 -5238
- package/cpp/ggml-threading.h +14 -14
- package/cpp/ggml.c +6529 -6514
- package/cpp/ggml.h +2198 -2194
- package/cpp/gguf.cpp +1329 -1329
- package/cpp/gguf.h +202 -202
- package/cpp/json-schema-to-grammar.cpp +1024 -1045
- package/cpp/json-schema-to-grammar.h +21 -8
- package/cpp/json.hpp +24766 -24766
- package/cpp/llama-adapter.cpp +347 -347
- package/cpp/llama-adapter.h +74 -74
- package/cpp/llama-arch.cpp +1513 -1487
- package/cpp/llama-arch.h +403 -400
- package/cpp/llama-batch.cpp +368 -368
- package/cpp/llama-batch.h +88 -88
- package/cpp/llama-chat.cpp +588 -578
- package/cpp/llama-chat.h +53 -52
- package/cpp/llama-context.cpp +1775 -1775
- package/cpp/llama-context.h +128 -128
- package/cpp/llama-cparams.cpp +1 -1
- package/cpp/llama-cparams.h +37 -37
- package/cpp/llama-cpp.h +30 -30
- package/cpp/llama-grammar.cpp +1219 -1139
- package/cpp/llama-grammar.h +173 -143
- package/cpp/llama-hparams.cpp +71 -71
- package/cpp/llama-hparams.h +139 -139
- package/cpp/llama-impl.cpp +167 -167
- package/cpp/llama-impl.h +61 -61
- package/cpp/llama-kv-cache.cpp +718 -718
- package/cpp/llama-kv-cache.h +219 -218
- package/cpp/llama-mmap.cpp +600 -590
- package/cpp/llama-mmap.h +68 -67
- package/cpp/llama-model-loader.cpp +1124 -1124
- package/cpp/llama-model-loader.h +167 -167
- package/cpp/llama-model.cpp +4087 -3997
- package/cpp/llama-model.h +370 -370
- package/cpp/llama-sampling.cpp +2558 -2408
- package/cpp/llama-sampling.h +32 -32
- package/cpp/llama-vocab.cpp +3264 -3247
- package/cpp/llama-vocab.h +125 -125
- package/cpp/llama.cpp +10284 -10077
- package/cpp/llama.h +1354 -1323
- package/cpp/log.cpp +393 -401
- package/cpp/log.h +132 -121
- package/cpp/minja/chat-template.hpp +529 -0
- package/cpp/minja/minja.hpp +2915 -0
- package/cpp/minja.hpp +2915 -0
- package/cpp/rn-llama.cpp +66 -6
- package/cpp/rn-llama.h +26 -1
- package/cpp/sampling.cpp +570 -505
- package/cpp/sampling.h +3 -0
- package/cpp/sgemm.cpp +2598 -2597
- package/cpp/sgemm.h +14 -14
- package/cpp/speculative.cpp +278 -277
- package/cpp/speculative.h +28 -28
- package/cpp/unicode.cpp +9 -2
- package/ios/CMakeLists.txt +6 -0
- package/ios/RNLlama.h +0 -8
- package/ios/RNLlama.mm +27 -3
- package/ios/RNLlamaContext.h +10 -1
- package/ios/RNLlamaContext.mm +269 -57
- package/jest/mock.js +21 -2
- package/lib/commonjs/NativeRNLlama.js.map +1 -1
- package/lib/commonjs/grammar.js +3 -0
- package/lib/commonjs/grammar.js.map +1 -1
- package/lib/commonjs/index.js +87 -13
- package/lib/commonjs/index.js.map +1 -1
- package/lib/module/NativeRNLlama.js.map +1 -1
- package/lib/module/grammar.js +3 -0
- package/lib/module/grammar.js.map +1 -1
- package/lib/module/index.js +86 -13
- package/lib/module/index.js.map +1 -1
- package/lib/typescript/NativeRNLlama.d.ts +107 -2
- package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
- package/lib/typescript/grammar.d.ts.map +1 -1
- package/lib/typescript/index.d.ts +32 -7
- package/lib/typescript/index.d.ts.map +1 -1
- package/llama-rn.podspec +1 -1
- package/package.json +3 -2
- package/src/NativeRNLlama.ts +115 -3
- package/src/grammar.ts +3 -0
- package/src/index.ts +138 -21
- package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeCCompiler.cmake +0 -81
- package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeSystem.cmake +0 -15
- package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdC/CMakeCCompilerId.c +0 -904
- package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdC/CMakeCCompilerId.o +0 -0
- package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdCXX/CMakeCXXCompilerId.cpp +0 -919
- package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdCXX/CMakeCXXCompilerId.o +0 -0
- package/android/src/main/build-arm64/CMakeFiles/CMakeConfigureLog.yaml +0 -55
- package/cpp/rn-llama.hpp +0 -913
package/cpp/rn-llama.cpp
CHANGED
@@ -194,6 +194,8 @@ bool llama_rn_context::loadModel(common_params &params_)
         LOG_ERROR("unable to load model: %s", params_.model.c_str());
         return false;
     }
+
+    templates = common_chat_templates_init(model, params.chat_template);
     n_ctx = llama_n_ctx(ctx);
 
     // We can uncomment for debugging or after this fix: https://github.com/ggerganov/llama.cpp/pull/11101
@@ -202,11 +204,70 @@ bool llama_rn_context::loadModel(common_params &params_)
     return true;
 }
 
-bool llama_rn_context::validateModelChatTemplate() const {
-    const char * tmpl = llama_model_chat_template(model);
-
-
-
+bool llama_rn_context::validateModelChatTemplate(bool use_jinja, const char *name) const {
+    const char * tmpl = llama_model_chat_template(model, name);
+    if (tmpl == nullptr) {
+        return false;
+    }
+    return common_chat_verify_template(tmpl, use_jinja);
+}
+
+common_chat_params llama_rn_context::getFormattedChatWithJinja(
+  const std::string &messages,
+  const std::string &chat_template,
+  const std::string &json_schema,
+  const std::string &tools,
+  const bool &parallel_tool_calls,
+  const std::string &tool_choice
+) const {
+    common_chat_templates_inputs inputs;
+    inputs.messages = common_chat_msgs_parse_oaicompat(json::parse(messages));
+    auto useTools = !tools.empty();
+    if (useTools) {
+        inputs.tools = common_chat_tools_parse_oaicompat(json::parse(tools));
+    }
+    inputs.parallel_tool_calls = parallel_tool_calls;
+    if (!tool_choice.empty()) {
+        inputs.tool_choice = common_chat_tool_choice_parse_oaicompat(tool_choice);
+    }
+
+    if (!json_schema.empty()) {
+        inputs.json_schema = json_schema;
+    }
+    inputs.extract_reasoning = params.reasoning_format != COMMON_REASONING_FORMAT_NONE;
+
+    // If chat_template is provided, create new one and use it (probably slow)
+    if (!chat_template.empty()) {
+        auto tmp = common_chat_templates_init(model, chat_template);
+        return common_chat_templates_apply(tmp.get(), inputs);
+    } else {
+        return common_chat_templates_apply(templates.get(), inputs);
+    }
+}
+
+std::string llama_rn_context::getFormattedChat(
+  const std::string &messages,
+  const std::string &chat_template
+) const {
+    auto chat_json = json::parse(messages);
+    common_chat_templates_inputs inputs;
+    // Handle regular chat without tools
+    std::vector<common_chat_msg> chat_msgs;
+    for (const auto &msg : chat_json) {
+        chat_msgs.push_back({
+            msg["role"].get<std::string>(),
+            msg["content"].get<std::string>()
+        });
+    }
+    inputs.messages = chat_msgs;
+
+    // If chat_template is provided, create new one and use it (probably slow)
+    if (!chat_template.empty()) {
+        auto tmp = common_chat_templates_init(model, chat_template);
+        return common_chat_templates_apply(tmp.get(), inputs).prompt;
+    } else {
+        return common_chat_templates_apply(templates.get(), inputs).prompt;
+    }
 }
 
 void llama_rn_context::truncatePrompt(std::vector<llama_token> &prompt_tokens) {
@@ -518,7 +579,6 @@ std::vector<float> llama_rn_context::getEmbedding(common_params &embd_params)
     float *data;
 
     const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
-    printf("pooling_type: %d\n", pooling_type);
     if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
         data = llama_get_embeddings(ctx);
     } else {
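The new `getFormattedChatWithJinja` path mirrors llama.cpp's `common_chat` templating: OpenAI-style messages (plus optional tools, JSON schema, and tool-choice settings) are parsed into `common_chat_templates_inputs` and applied against either the cached model templates or an ad-hoc override. Below is a minimal caller-side sketch, not part of the diff, assuming a loaded `rnllama::llama_rn_context`; the helper name and example messages are illustrative only.

```cpp
#include <string>
#include "rn-llama.h"

// Sketch only: how a platform binding (e.g. jni.cpp / RNLlamaContext.mm) might
// drive the new Jinja formatting path after loadModel() has succeeded.
std::string renderPrompt(rnllama::llama_rn_context &llama) {
    // OpenAI-compatible messages, passed through as a JSON string.
    const std::string messages = R"([
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user",   "content": "What is the capital of France?"}
    ])";

    // Empty strings fall back to the model's built-in template and skip the
    // tools / JSON-schema / tool-choice branches shown in the diff above.
    common_chat_params chat = llama.getFormattedChatWithJinja(
        messages,
        /* chat_template       */ "",
        /* json_schema         */ "",
        /* tools               */ "",
        /* parallel_tool_calls */ false,
        /* tool_choice         */ ""
    );
    return chat.prompt;  // rendered prompt text, ready for tokenization
}
```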
package/cpp/rn-llama.h
CHANGED
@@ -3,7 +3,9 @@
 
 #include <sstream>
 #include <iostream>
+#include "chat-template.hpp"
 #include "common.h"
+#include "chat.h"
 #include "ggml.h"
 #include "gguf.h"
 #include "llama.h"
@@ -13,8 +15,18 @@
 #include <android/log.h>
 #endif
 
+using json = nlohmann::ordered_json;
+typedef minja::chat_template common_chat_template;
+
+struct common_chat_templates {
+    bool has_explicit_template;
+    std::unique_ptr<common_chat_template> template_default;
+    std::unique_ptr<common_chat_template> template_tool_use;
+};
+
 namespace rnllama {
 
+
 std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token);
 
 std::string tokens_to_str(llama_context *ctx, const std::vector<llama_token>::const_iterator begin, const std::vector<llama_token>::const_iterator end);
@@ -63,6 +75,7 @@ struct llama_rn_context {
 
     llama_context *ctx = nullptr;
     common_sampler *ctx_sampling = nullptr;
+    common_chat_templates_ptr templates = nullptr;
 
     int n_ctx;
 
@@ -80,7 +93,19 @@ struct llama_rn_context {
     void rewind();
    bool initSampling();
     bool loadModel(common_params &params_);
-    bool validateModelChatTemplate() const;
+    bool validateModelChatTemplate(bool use_jinja, const char *name) const;
+    common_chat_params getFormattedChatWithJinja(
+      const std::string &messages,
+      const std::string &chat_template,
+      const std::string &json_schema,
+      const std::string &tools,
+      const bool &parallel_tool_calls,
+      const std::string &tool_choice
+    ) const;
+    std::string getFormattedChat(
+      const std::string &messages,
+      const std::string &chat_template
+    ) const;
     void truncatePrompt(std::vector<llama_token> &prompt_tokens);
     void loadPrompt();
     void beginCompletion();
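The widened `validateModelChatTemplate(bool use_jinja, const char *name)` signature lets callers probe both the default template and a named variant before enabling Jinja features. A hedged sketch follows; the helper function is illustrative, and the "tool_use" name is an assumption based on llama.cpp's named-template convention rather than something stated in this diff.

```cpp
#include "rn-llama.h"

// Sketch only: check whether the loaded model ships a usable chat template.
// Passing nullptr asks for the default template; "tool_use" is the named
// variant tool-calling models commonly provide (assumption, not from the diff).
bool hasUsableChatTemplate(const rnllama::llama_rn_context &llama) {
    const bool default_ok  = llama.validateModelChatTemplate(/* use_jinja */ true, nullptr);
    const bool tool_use_ok = llama.validateModelChatTemplate(/* use_jinja */ true, "tool_use");
    return default_ok || tool_use_ok;
}
```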