cui-llama.rn 1.4.0 → 1.4.2

This diff reflects the changes between publicly released versions of the package as published to a supported registry, and is provided for informational purposes only.
Files changed (108)
  1. package/README.md +4 -23
  2. package/android/build.gradle +12 -3
  3. package/android/src/main/CMakeLists.txt +13 -7
  4. package/android/src/main/java/com/rnllama/LlamaContext.java +27 -20
  5. package/android/src/main/java/com/rnllama/RNLlama.java +5 -1
  6. package/android/src/main/jni.cpp +15 -12
  7. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  9. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  10. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  11. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  12. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  13. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  14. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  15. package/cpp/README.md +1 -1
  16. package/cpp/common.cpp +158 -267
  17. package/cpp/common.h +46 -12
  18. package/cpp/ggml-alloc.c +1042 -1037
  19. package/cpp/ggml-backend-impl.h +255 -256
  20. package/cpp/ggml-backend-reg.cpp +582 -582
  21. package/cpp/ggml-backend.cpp +2002 -2002
  22. package/cpp/ggml-backend.h +354 -352
  23. package/cpp/ggml-common.h +1853 -1853
  24. package/cpp/ggml-cpp.h +39 -39
  25. package/cpp/ggml-cpu-aarch64.cpp +4247 -4247
  26. package/cpp/ggml-cpu-aarch64.h +8 -8
  27. package/cpp/ggml-cpu-impl.h +386 -386
  28. package/cpp/ggml-cpu-quants.c +10920 -10839
  29. package/cpp/ggml-cpu-traits.cpp +36 -36
  30. package/cpp/ggml-cpu-traits.h +38 -38
  31. package/cpp/ggml-cpu.c +329 -60
  32. package/cpp/ggml-cpu.cpp +10 -2
  33. package/cpp/ggml-cpu.h +135 -135
  34. package/cpp/ggml-impl.h +567 -567
  35. package/cpp/ggml-metal-impl.h +17 -17
  36. package/cpp/ggml-metal.m +4884 -4884
  37. package/cpp/ggml-quants.c +5238 -5238
  38. package/cpp/ggml-threading.h +14 -14
  39. package/cpp/ggml.c +6514 -6448
  40. package/cpp/ggml.h +2194 -2163
  41. package/cpp/gguf.cpp +1329 -1325
  42. package/cpp/gguf.h +202 -202
  43. package/cpp/json-schema-to-grammar.cpp +1045 -1045
  44. package/cpp/json-schema-to-grammar.h +8 -8
  45. package/cpp/json.hpp +24766 -24766
  46. package/cpp/llama-adapter.cpp +347 -346
  47. package/cpp/llama-adapter.h +74 -73
  48. package/cpp/llama-arch.cpp +1487 -1434
  49. package/cpp/llama-arch.h +400 -395
  50. package/cpp/llama-batch.cpp +368 -368
  51. package/cpp/llama-batch.h +88 -88
  52. package/cpp/llama-chat.cpp +578 -567
  53. package/cpp/llama-chat.h +52 -51
  54. package/cpp/llama-context.cpp +1775 -1771
  55. package/cpp/llama-context.h +128 -128
  56. package/cpp/llama-cparams.cpp +1 -1
  57. package/cpp/llama-cparams.h +37 -37
  58. package/cpp/llama-cpp.h +30 -30
  59. package/cpp/llama-grammar.cpp +1139 -1139
  60. package/cpp/llama-grammar.h +143 -143
  61. package/cpp/llama-hparams.cpp +71 -71
  62. package/cpp/llama-hparams.h +139 -140
  63. package/cpp/llama-impl.cpp +167 -167
  64. package/cpp/llama-impl.h +61 -61
  65. package/cpp/llama-kv-cache.cpp +718 -718
  66. package/cpp/llama-kv-cache.h +218 -218
  67. package/cpp/llama-mmap.cpp +2 -1
  68. package/cpp/llama-mmap.h +67 -67
  69. package/cpp/llama-model-loader.cpp +1124 -1011
  70. package/cpp/llama-model-loader.h +167 -158
  71. package/cpp/llama-model.cpp +3997 -2202
  72. package/cpp/llama-model.h +370 -391
  73. package/cpp/llama-sampling.cpp +2408 -2406
  74. package/cpp/llama-sampling.h +32 -48
  75. package/cpp/llama-vocab.cpp +3247 -1982
  76. package/cpp/llama-vocab.h +125 -182
  77. package/cpp/llama.cpp +416 -2886
  78. package/cpp/llama.h +1323 -1285
  79. package/cpp/log.cpp +401 -401
  80. package/cpp/log.h +121 -121
  81. package/cpp/rn-llama.cpp +822 -0
  82. package/cpp/rn-llama.h +123 -0
  83. package/cpp/rn-llama.hpp +18 -12
  84. package/cpp/sampling.cpp +505 -500
  85. package/cpp/sgemm.cpp +2597 -2597
  86. package/cpp/speculative.cpp +277 -274
  87. package/cpp/speculative.h +28 -28
  88. package/cpp/unicode.cpp +2 -3
  89. package/ios/CMakeLists.txt +99 -0
  90. package/ios/RNLlama.h +5 -1
  91. package/ios/RNLlama.mm +2 -2
  92. package/ios/RNLlamaContext.h +8 -1
  93. package/ios/RNLlamaContext.mm +15 -11
  94. package/ios/rnllama.xcframework/Info.plist +74 -0
  95. package/jest/mock.js +3 -2
  96. package/lib/commonjs/NativeRNLlama.js.map +1 -1
  97. package/lib/commonjs/index.js +4 -2
  98. package/lib/commonjs/index.js.map +1 -1
  99. package/lib/module/NativeRNLlama.js.map +1 -1
  100. package/lib/module/index.js +4 -2
  101. package/lib/module/index.js.map +1 -1
  102. package/lib/typescript/NativeRNLlama.d.ts +5 -1
  103. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  104. package/lib/typescript/index.d.ts.map +1 -1
  105. package/llama-rn.podspec +8 -2
  106. package/package.json +5 -2
  107. package/src/NativeRNLlama.ts +5 -1
  108. package/src/index.ts +9 -2
package/cpp/common.h CHANGED
@@ -2,7 +2,6 @@
 
 #pragma once
 
-#include "llama-cpp.h"
 #include "llama-cpp.h"
 
 #include <string>
@@ -25,11 +24,11 @@
 
 #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
 
-struct common_lora_adapter_info {
+struct common_adapter_lora_info {
     std::string path;
     float scale;
 
-    struct llama_lora_adapter * ptr;
+    struct llama_adapter_lora * ptr;
 };
 
 using llama_tokens = std::vector<llama_token>;
@@ -115,6 +114,12 @@ enum dimre_method {
     DIMRE_METHOD_MEAN,
 };
 
+enum common_conversation_mode {
+    COMMON_CONVERSATION_MODE_DISABLED = 0,
+    COMMON_CONVERSATION_MODE_ENABLED = 1,
+    COMMON_CONVERSATION_MODE_AUTO = 2,
+};
+
 // sampling parameters
 struct common_params_sampling {
     uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
@@ -181,7 +186,11 @@ struct common_params_speculative {
     struct cpu_params cpuparams;
     struct cpu_params cpuparams_batch;
 
-    std::string model = ""; // draft model for speculative decoding // NOLINT
+    std::string hf_repo = ""; // HF repo // NOLINT
+    std::string hf_file = ""; // HF file // NOLINT
+
+    std::string model = ""; // draft model for speculative decoding // NOLINT
+    std::string model_url = ""; // model url to download // NOLINT
 };
 
 struct common_params_vocoder {
@@ -190,6 +199,8 @@ struct common_params_vocoder {
 
     std::string model = ""; // model path // NOLINT
     std::string model_url = ""; // model url to download // NOLINT
+
+    bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy // NOLINT
 };
 
 struct common_params {
@@ -256,14 +267,13 @@ struct common_params {
     std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
     std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
     std::string logits_file = ""; // file for saving *all* logits // NOLINT
-    std::string rpc_servers = ""; // comma separated list of RPC servers // NOLINT
 
     std::vector<std::string> in_files; // all input files
     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
     std::vector<llama_model_kv_override> kv_overrides;
 
-    bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply)
-    std::vector<common_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale
+    bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
+    std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale
 
     std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale
 
@@ -291,7 +301,6 @@ struct common_params {
     bool special = false; // enable special token output
     bool interactive = false; // interactive mode
     bool interactive_first = false; // wait for user input immediately
-    bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
     bool prompt_cache_all = false; // save user input and generations to prompt cache
     bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
 
@@ -317,6 +326,8 @@ struct common_params {
     lm_ggml_type cache_type_k = LM_GGML_TYPE_F16; // KV cache data type for the K
     lm_ggml_type cache_type_v = LM_GGML_TYPE_F16; // KV cache data type for the V
 
+    common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
+
     // multimodal models (see examples/llava)
     std::string mmproj = ""; // path to multimodal projector // NOLINT
     std::vector<std::string> image; // path to image file(s)
@@ -470,6 +481,11 @@ static bool string_starts_with(const std::string & str,
     return str.rfind(prefix, 0) == 0;
 }
 
+static bool string_ends_with(const std::string & str,
+                             const std::string & suffix) { // While we wait for C++20's std::string::ends_with...
+    return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
+}
+
 bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
 void string_process_escapes(std::string & input);
 
@@ -497,7 +513,7 @@ struct common_init_result {
     llama_model_ptr model;
     llama_context_ptr context;
 
-    std::vector<llama_lora_adapter_ptr> lora;
+    std::vector<llama_adapter_lora_ptr> lora;
 };
 
 struct common_init_result common_init_from_params(common_params & params);
@@ -511,15 +527,23 @@ struct llama_model * common_load_model_from_url(
     const std::string & local_path,
     const std::string & hf_token,
     const struct llama_model_params & params);
+
 struct llama_model * common_load_model_from_hf(
     const std::string & repo,
     const std::string & remote_path,
     const std::string & local_path,
     const std::string & hf_token,
     const struct llama_model_params & params);
+std::pair<std::string, std::string> common_get_hf_file(
+    const std::string & hf_repo_with_tag,
+    const std::string & hf_token);
+
+std::pair<std::string, std::string> common_get_hf_file(
+    const std::string & hf_repo_with_tag,
+    const std::string & hf_token);
 
 // clear LoRA adapters from context, then apply new list of adapters
-void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_info> & lora);
+void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
 
 //
 // Batch utils
@@ -557,7 +581,7 @@ std::vector<llama_token> common_tokenize(
     bool parse_special = false);
 
 std::vector<llama_token> common_tokenize(
-    const struct llama_model * model,
+    const struct llama_vocab * vocab,
     const std::string & text,
     bool add_special,
     bool parse_special = false);
@@ -569,11 +593,21 @@ std::string common_token_to_piece(
     llama_token token,
     bool special = true);
 
+std::string common_token_to_piece(
+    const struct llama_vocab * vocab,
+    llama_token token,
+    bool special = true);
+
 // detokenizes a vector of tokens into a string
 // should work similar to Python's `tokenizer.decode`
 // optionally renders special/control tokens
 std::string common_detokenize(
-    llama_context * ctx,
+    const struct llama_context * ctx,
+    const std::vector<llama_token> & tokens,
+    bool special = true);
+
+std::string common_detokenize(
+    const struct llama_vocab * vocab,
     const std::vector<llama_token> & tokens,
     bool special = true);
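
The header diff above amounts to a rename of the LoRA adapter API (common_lora_adapter_info → common_adapter_lora_info, common_lora_adapters_apply → common_set_adapter_lora), a new common_conversation_mode setting, and vocab-based overloads of the tokenize/detokenize helpers. A minimal C++ sketch of how a downstream call site might adapt, assuming only the declarations shown in this diff; the wrapper function itself is hypothetical and not part of the package:

    // Hypothetical helper, shown only to illustrate the 1.4.0 -> 1.4.2 call-site change.
    #include <vector>
    #include "common.h"

    static void apply_lora(llama_context * ctx,
                           std::vector<common_adapter_lora_info> & adapters) {
        // 1.4.0: common_lora_adapters_apply(ctx, adapters);
        // 1.4.2: clears any adapters active on ctx, then applies the new list
        common_set_adapter_lora(ctx, adapters);
    }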