cui-llama.rn 1.4.0 → 1.4.2
This diff compares two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- package/README.md +4 -23
- package/android/build.gradle +12 -3
- package/android/src/main/CMakeLists.txt +13 -7
- package/android/src/main/java/com/rnllama/LlamaContext.java +27 -20
- package/android/src/main/java/com/rnllama/RNLlama.java +5 -1
- package/android/src/main/jni.cpp +15 -12
- package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
- package/cpp/README.md +1 -1
- package/cpp/common.cpp +158 -267
- package/cpp/common.h +46 -12
- package/cpp/ggml-alloc.c +1042 -1037
- package/cpp/ggml-backend-impl.h +255 -256
- package/cpp/ggml-backend-reg.cpp +582 -582
- package/cpp/ggml-backend.cpp +2002 -2002
- package/cpp/ggml-backend.h +354 -352
- package/cpp/ggml-common.h +1853 -1853
- package/cpp/ggml-cpp.h +39 -39
- package/cpp/ggml-cpu-aarch64.cpp +4247 -4247
- package/cpp/ggml-cpu-aarch64.h +8 -8
- package/cpp/ggml-cpu-impl.h +386 -386
- package/cpp/ggml-cpu-quants.c +10920 -10839
- package/cpp/ggml-cpu-traits.cpp +36 -36
- package/cpp/ggml-cpu-traits.h +38 -38
- package/cpp/ggml-cpu.c +329 -60
- package/cpp/ggml-cpu.cpp +10 -2
- package/cpp/ggml-cpu.h +135 -135
- package/cpp/ggml-impl.h +567 -567
- package/cpp/ggml-metal-impl.h +17 -17
- package/cpp/ggml-metal.m +4884 -4884
- package/cpp/ggml-quants.c +5238 -5238
- package/cpp/ggml-threading.h +14 -14
- package/cpp/ggml.c +6514 -6448
- package/cpp/ggml.h +2194 -2163
- package/cpp/gguf.cpp +1329 -1325
- package/cpp/gguf.h +202 -202
- package/cpp/json-schema-to-grammar.cpp +1045 -1045
- package/cpp/json-schema-to-grammar.h +8 -8
- package/cpp/json.hpp +24766 -24766
- package/cpp/llama-adapter.cpp +347 -346
- package/cpp/llama-adapter.h +74 -73
- package/cpp/llama-arch.cpp +1487 -1434
- package/cpp/llama-arch.h +400 -395
- package/cpp/llama-batch.cpp +368 -368
- package/cpp/llama-batch.h +88 -88
- package/cpp/llama-chat.cpp +578 -567
- package/cpp/llama-chat.h +52 -51
- package/cpp/llama-context.cpp +1775 -1771
- package/cpp/llama-context.h +128 -128
- package/cpp/llama-cparams.cpp +1 -1
- package/cpp/llama-cparams.h +37 -37
- package/cpp/llama-cpp.h +30 -30
- package/cpp/llama-grammar.cpp +1139 -1139
- package/cpp/llama-grammar.h +143 -143
- package/cpp/llama-hparams.cpp +71 -71
- package/cpp/llama-hparams.h +139 -140
- package/cpp/llama-impl.cpp +167 -167
- package/cpp/llama-impl.h +61 -61
- package/cpp/llama-kv-cache.cpp +718 -718
- package/cpp/llama-kv-cache.h +218 -218
- package/cpp/llama-mmap.cpp +2 -1
- package/cpp/llama-mmap.h +67 -67
- package/cpp/llama-model-loader.cpp +1124 -1011
- package/cpp/llama-model-loader.h +167 -158
- package/cpp/llama-model.cpp +3997 -2202
- package/cpp/llama-model.h +370 -391
- package/cpp/llama-sampling.cpp +2408 -2406
- package/cpp/llama-sampling.h +32 -48
- package/cpp/llama-vocab.cpp +3247 -1982
- package/cpp/llama-vocab.h +125 -182
- package/cpp/llama.cpp +416 -2886
- package/cpp/llama.h +1323 -1285
- package/cpp/log.cpp +401 -401
- package/cpp/log.h +121 -121
- package/cpp/rn-llama.cpp +822 -0
- package/cpp/rn-llama.h +123 -0
- package/cpp/rn-llama.hpp +18 -12
- package/cpp/sampling.cpp +505 -500
- package/cpp/sgemm.cpp +2597 -2597
- package/cpp/speculative.cpp +277 -274
- package/cpp/speculative.h +28 -28
- package/cpp/unicode.cpp +2 -3
- package/ios/CMakeLists.txt +99 -0
- package/ios/RNLlama.h +5 -1
- package/ios/RNLlama.mm +2 -2
- package/ios/RNLlamaContext.h +8 -1
- package/ios/RNLlamaContext.mm +15 -11
- package/ios/rnllama.xcframework/Info.plist +74 -0
- package/jest/mock.js +3 -2
- package/lib/commonjs/NativeRNLlama.js.map +1 -1
- package/lib/commonjs/index.js +4 -2
- package/lib/commonjs/index.js.map +1 -1
- package/lib/module/NativeRNLlama.js.map +1 -1
- package/lib/module/index.js +4 -2
- package/lib/module/index.js.map +1 -1
- package/lib/typescript/NativeRNLlama.d.ts +5 -1
- package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
- package/lib/typescript/index.d.ts.map +1 -1
- package/llama-rn.podspec +8 -2
- package/package.json +5 -2
- package/src/NativeRNLlama.ts +5 -1
- package/src/index.ts +9 -2
package/cpp/common.h
CHANGED
@@ -2,7 +2,6 @@
 
 #pragma once
 
-#include "llama-cpp.h"
 #include "llama-cpp.h"
 
 #include <string>
@@ -25,11 +24,11 @@
 
 #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
 
-struct common_lora_adapter_info {
+struct common_adapter_lora_info {
     std::string path;
     float scale;
 
-    struct llama_lora_adapter * ptr;
+    struct llama_adapter_lora * ptr;
 };
 
 using llama_tokens = std::vector<llama_token>;
@@ -115,6 +114,12 @@ enum dimre_method {
     DIMRE_METHOD_MEAN,
 };
 
+enum common_conversation_mode {
+    COMMON_CONVERSATION_MODE_DISABLED = 0,
+    COMMON_CONVERSATION_MODE_ENABLED  = 1,
+    COMMON_CONVERSATION_MODE_AUTO     = 2,
+};
+
 // sampling parameters
 struct common_params_sampling {
     uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
@@ -181,7 +186,11 @@ struct common_params_speculative {
     struct cpu_params cpuparams;
     struct cpu_params cpuparams_batch;
 
-    std::string model = ""; // draft model for speculative decoding // NOLINT
+    std::string hf_repo = ""; // HF repo // NOLINT
+    std::string hf_file = ""; // HF file // NOLINT
+
+    std::string model = ""; // draft model for speculative decoding // NOLINT
+    std::string model_url = ""; // model url to download // NOLINT
 };
 
 struct common_params_vocoder {
@@ -190,6 +199,8 @@ struct common_params_vocoder {
 
     std::string model = ""; // model path // NOLINT
     std::string model_url = ""; // model url to download // NOLINT
+
+    bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy // NOLINT
 };
 
 struct common_params {
@@ -256,14 +267,13 @@ struct common_params {
     std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
     std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
     std::string logits_file = ""; // file for saving *all* logits // NOLINT
-    std::string rpc_servers = ""; // comma separated list of RPC servers // NOLINT
 
     std::vector<std::string> in_files; // all input files
     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
     std::vector<llama_model_kv_override> kv_overrides;
 
-    bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply)
-    std::vector<common_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale
+    bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
+    std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale
 
     std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale
 
@@ -291,7 +301,6 @@ struct common_params {
     bool special = false; // enable special token output
     bool interactive = false; // interactive mode
     bool interactive_first = false; // wait for user input immediately
-    bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
     bool prompt_cache_all = false; // save user input and generations to prompt cache
     bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
 
@@ -317,6 +326,8 @@ struct common_params {
     lm_ggml_type cache_type_k = LM_GGML_TYPE_F16; // KV cache data type for the K
     lm_ggml_type cache_type_v = LM_GGML_TYPE_F16; // KV cache data type for the V
 
+    common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
+
     // multimodal models (see examples/llava)
     std::string mmproj = ""; // path to multimodal projector // NOLINT
     std::vector<std::string> image; // path to image file(s)
@@ -470,6 +481,11 @@ static bool string_starts_with(const std::string & str,
     return str.rfind(prefix, 0) == 0;
 }
 
+static bool string_ends_with(const std::string & str,
+                             const std::string & suffix) { // While we wait for C++20's std::string::ends_with...
+    return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
+}
+
 bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
 void string_process_escapes(std::string & input);
 
@@ -497,7 +513,7 @@ struct common_init_result {
     llama_model_ptr model;
     llama_context_ptr context;
 
-    std::vector<llama_lora_adapter_ptr> lora;
+    std::vector<llama_adapter_lora_ptr> lora;
 };
 
 struct common_init_result common_init_from_params(common_params & params);
@@ -511,15 +527,23 @@ struct llama_model * common_load_model_from_url(
     const std::string & local_path,
     const std::string & hf_token,
     const struct llama_model_params & params);
+
 struct llama_model * common_load_model_from_hf(
     const std::string & repo,
     const std::string & remote_path,
     const std::string & local_path,
     const std::string & hf_token,
     const struct llama_model_params & params);
+std::pair<std::string, std::string> common_get_hf_file(
+    const std::string & hf_repo_with_tag,
+    const std::string & hf_token);
+
+std::pair<std::string, std::string> common_get_hf_file(
+    const std::string & hf_repo_with_tag,
+    const std::string & hf_token);
 
 // clear LoRA adapters from context, then apply new list of adapters
-void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_info> & lora);
+void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
 
 //
 // Batch utils
@@ -557,7 +581,7 @@ std::vector<llama_token> common_tokenize(
     bool parse_special = false);
 
 std::vector<llama_token> common_tokenize(
-    const struct llama_model * model,
+    const struct llama_vocab * vocab,
     const std::string & text,
     bool add_special,
     bool parse_special = false);
@@ -569,11 +593,21 @@ std::string common_token_to_piece(
     llama_token token,
     bool special = true);
 
+std::string common_token_to_piece(
+    const struct llama_vocab * vocab,
+    llama_token token,
+    bool special = true);
+
 // detokenizes a vector of tokens into a string
 // should work similar to Python's `tokenizer.decode`
 // optionally renders special/control tokens
 std::string common_detokenize(
-    llama_context * ctx,
+    const struct llama_context * ctx,
+    const std::vector<llama_token> & tokens,
+    bool special = true);
+
+std::string common_detokenize(
+    const struct llama_vocab * vocab,
+    const std::vector<llama_token> & tokens,
+    bool special = true);
 
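For callers, the common.h changes amount to the renamed LoRA adapter API (common_adapter_lora_info, llama_adapter_lora, common_set_adapter_lora), new llama_vocab-based overloads of common_tokenize / common_token_to_piece / common_detokenize, and the replacement of the boolean conversation flag with the tri-state common_conversation_mode enum. The sketch below is a hypothetical caller-side migration, not code from the package: apply_loras, wants_conversation, and has_chat_template are illustrative names, and only the types, struct members, and function signatures declared in the hunks above are taken from the diff.

```cpp
// Hypothetical migration sketch against the 1.4.2 common.h declarations shown above.
#include "common.h"

// Apply user-configured LoRA adapters to a context via the renamed helper.
static void apply_loras(llama_context * ctx, common_params & params) {
    for (common_adapter_lora_info & la : params.lora_adapters) {
        la.scale = 1.0f; // user-defined scale; 1.0f is only an example value
    }
    common_set_adapter_lora(ctx, params.lora_adapters);
}

// The boolean `conversation` flag is gone; conversation_mode is a tri-state enum
// defaulting to COMMON_CONVERSATION_MODE_AUTO.
static bool wants_conversation(const common_params & params, bool has_chat_template) {
    switch (params.conversation_mode) {
        case COMMON_CONVERSATION_MODE_ENABLED:  return true;
        case COMMON_CONVERSATION_MODE_DISABLED: return false;
        case COMMON_CONVERSATION_MODE_AUTO:     return has_chat_template; // assumed AUTO policy
    }
    return false;
}
```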