cui-llama.rn 1.3.5 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80)
  1. package/README.md +22 -1
  2. package/android/src/main/CMakeLists.txt +25 -20
  3. package/android/src/main/java/com/rnllama/LlamaContext.java +31 -9
  4. package/android/src/main/java/com/rnllama/RNLlama.java +98 -0
  5. package/android/src/main/jni-utils.h +94 -0
  6. package/android/src/main/jni.cpp +108 -37
  7. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +15 -0
  8. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +15 -0
  9. package/cpp/common.cpp +1982 -1965
  10. package/cpp/common.h +665 -657
  11. package/cpp/ggml-backend-reg.cpp +5 -0
  12. package/cpp/ggml-backend.cpp +5 -2
  13. package/cpp/ggml-cpp.h +1 -0
  14. package/cpp/ggml-cpu-aarch64.cpp +6 -1
  15. package/cpp/ggml-cpu-quants.c +5 -1
  16. package/cpp/ggml-cpu.c +14122 -14122
  17. package/cpp/ggml-cpu.cpp +627 -627
  18. package/cpp/ggml-impl.h +11 -16
  19. package/cpp/ggml-metal-impl.h +288 -0
  20. package/cpp/ggml-metal.m +2 -2
  21. package/cpp/ggml-opt.cpp +854 -0
  22. package/cpp/ggml-opt.h +216 -0
  23. package/cpp/ggml.c +0 -1276
  24. package/cpp/ggml.h +0 -140
  25. package/cpp/gguf.cpp +1325 -0
  26. package/cpp/gguf.h +202 -0
  27. package/cpp/llama-adapter.cpp +346 -0
  28. package/cpp/llama-adapter.h +73 -0
  29. package/cpp/llama-arch.cpp +1434 -0
  30. package/cpp/llama-arch.h +395 -0
  31. package/cpp/llama-batch.cpp +368 -0
  32. package/cpp/llama-batch.h +88 -0
  33. package/cpp/llama-chat.cpp +567 -0
  34. package/cpp/llama-chat.h +51 -0
  35. package/cpp/llama-context.cpp +1771 -0
  36. package/cpp/llama-context.h +128 -0
  37. package/cpp/llama-cparams.cpp +1 -0
  38. package/cpp/llama-cparams.h +37 -0
  39. package/cpp/llama-cpp.h +30 -0
  40. package/cpp/llama-grammar.cpp +1 -0
  41. package/cpp/llama-grammar.h +3 -1
  42. package/cpp/llama-hparams.cpp +71 -0
  43. package/cpp/llama-hparams.h +140 -0
  44. package/cpp/llama-impl.cpp +167 -0
  45. package/cpp/llama-impl.h +16 -136
  46. package/cpp/llama-kv-cache.cpp +718 -0
  47. package/cpp/llama-kv-cache.h +218 -0
  48. package/cpp/llama-mmap.cpp +589 -0
  49. package/cpp/llama-mmap.h +67 -0
  50. package/cpp/llama-model-loader.cpp +1011 -0
  51. package/cpp/llama-model-loader.h +158 -0
  52. package/cpp/llama-model.cpp +2202 -0
  53. package/cpp/llama-model.h +391 -0
  54. package/cpp/llama-sampling.cpp +117 -4
  55. package/cpp/llama-vocab.cpp +21 -28
  56. package/cpp/llama-vocab.h +13 -1
  57. package/cpp/llama.cpp +12547 -23528
  58. package/cpp/llama.h +31 -6
  59. package/cpp/rn-llama.hpp +90 -87
  60. package/cpp/sgemm.cpp +776 -70
  61. package/cpp/sgemm.h +14 -14
  62. package/cpp/unicode.cpp +6 -0
  63. package/ios/RNLlama.mm +47 -0
  64. package/ios/RNLlamaContext.h +3 -1
  65. package/ios/RNLlamaContext.mm +71 -14
  66. package/jest/mock.js +15 -3
  67. package/lib/commonjs/NativeRNLlama.js.map +1 -1
  68. package/lib/commonjs/index.js +33 -37
  69. package/lib/commonjs/index.js.map +1 -1
  70. package/lib/module/NativeRNLlama.js.map +1 -1
  71. package/lib/module/index.js +31 -35
  72. package/lib/module/index.js.map +1 -1
  73. package/lib/typescript/NativeRNLlama.d.ts +26 -6
  74. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  75. package/lib/typescript/index.d.ts +21 -36
  76. package/lib/typescript/index.d.ts.map +1 -1
  77. package/llama-rn.podspec +4 -18
  78. package/package.json +2 -3
  79. package/src/NativeRNLlama.ts +32 -13
  80. package/src/index.ts +52 -47
package/cpp/llama.h CHANGED
@@ -35,7 +35,6 @@
 
 #define LLAMA_DEFAULT_SEED 0xFFFFFFFF
 
-// TODO: use everywhere in the implementation
 #define LLAMA_TOKEN_NULL -1
 
 #define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
@@ -106,6 +105,7 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
         LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
         LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
+        LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28,
     };
 
     enum llama_rope_type {
@@ -386,6 +386,7 @@ extern "C" {
     } llama_chat_message;
 
     // lora adapter
+    // TODO: rename to llama_adapter_lora
     struct llama_lora_adapter;
 
     // Helpers for getting default parameters
@@ -413,11 +414,19 @@ extern "C" {
     // Call once at the end of the program - currently only used for MPI
     LLAMA_API void llama_backend_free(void);
 
-    LLAMA_API struct llama_model * llama_load_model_from_file(
+    DEPRECATED(LLAMA_API struct llama_model * llama_load_model_from_file(
+                             const char * path_model,
+                             struct llama_model_params params),
+            "use llama_model_load_from_file instead");
+
+    LLAMA_API struct llama_model * llama_model_load_from_file(
                              const char * path_model,
                              struct llama_model_params params);
 
-    LLAMA_API void llama_free_model(struct llama_model * model);
+    DEPRECATED(LLAMA_API void llama_free_model(struct llama_model * model),
+            "use llama_model_free instead");
+
+    LLAMA_API void llama_model_free(struct llama_model * model);
 
     // TODO: rename to llama_init_from_model
     LLAMA_API struct llama_context * llama_new_context_with_model(
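The hunk above deprecates llama_load_model_from_file / llama_free_model in favour of llama_model_load_from_file / llama_model_free. A minimal migration sketch for callers of the C API (the model path and the parameter tweak are illustrative, not taken from this package):

    #include "llama.h"

    int main(void) {
        struct llama_model_params mparams = llama_model_default_params();
        mparams.n_gpu_layers = 0; // illustrative: CPU-only load

        // new name in the vendored llama.cpp; replaces llama_load_model_from_file()
        struct llama_model * model = llama_model_load_from_file("model.gguf", mparams);
        if (model == NULL) {
            return 1;
        }

        // ... create a context with llama_new_context_with_model(), run inference ...

        llama_model_free(model); // replaces llama_free_model()
        return 0;
    }

The deprecated declarations are kept, so existing callers keep building against the old names while picking up deprecation warnings.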
@@ -502,14 +511,19 @@ extern "C" {
             const char * fname_out,
             const llama_model_quantize_params * params);
 
+    //
+    // Adapters
+    //
+
     // Load a LoRA adapter from file
-    // The loaded adapter will be associated to the given model, and will be free when the model is deleted
+    // TODO: rename to llama_adapter_lora_init
     LLAMA_API struct llama_lora_adapter * llama_lora_adapter_init(
             struct llama_model * model,
             const char * path_lora);
 
     // Add a loaded LoRA adapter to given context
     // This will not modify model's weight
+    // TODO: rename to llama_set_adapter_lora
     LLAMA_API int32_t llama_lora_adapter_set(
             struct llama_context * ctx,
             struct llama_lora_adapter * adapter,
@@ -517,16 +531,18 @@ extern "C" {
 
     // Remove a specific LoRA adapter from given context
     // Return -1 if the adapter is not present in the context
+    // TODO: rename to llama_rm_adapter_lora
     LLAMA_API int32_t llama_lora_adapter_remove(
             struct llama_context * ctx,
             struct llama_lora_adapter * adapter);
 
     // Remove all LoRA adapters from given context
-    LLAMA_API void llama_lora_adapter_clear(
-            struct llama_context * ctx);
+    // TODO: rename to llama_clear_adapter_lora
+    LLAMA_API void llama_lora_adapter_clear(struct llama_context * ctx);
 
     // Manually free a LoRA adapter
     // Note: loaded adapters will be free when the associated model is deleted
+    // TODO: rename to llama_adapter_lora_free
     LLAMA_API void llama_lora_adapter_free(struct llama_lora_adapter * adapter);
 
     // Apply a loaded control vector to a llama_context, or if data is NULL, clear
@@ -535,6 +551,7 @@ extern "C" {
     // to an n_embd x n_layers buffer starting from layer 1.
     // il_start and il_end are the layer range the vector should apply to (both inclusive)
     // See llama_control_vector_load in common to load a control vector.
+    // TODO: rename to llama_adapter_cvec_apply
     LLAMA_API int32_t llama_control_vector_apply(
             struct llama_context * lctx,
                      const float * data,
@@ -547,6 +564,8 @@ extern "C" {
     // KV cache
     //
 
+    // TODO: remove llama_kv_cache_view_* API
+
     // Information associated with an individual cell in the KV cache view.
     struct llama_kv_cache_view_cell {
         // The position for this cell. Takes KV cache shifts into account.
@@ -593,8 +612,11 @@ extern "C" {
     LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);
 
     // Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)
+    // TODO: change signature to llama_kv_cache_view_update(struct llama_kv_cache_view * view, const struct llama_context * ctx)
     LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view);
 
+    ///
+
     // Returns the number of tokens in the KV cache (slow, use only for debug)
     // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
     LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx);
@@ -664,6 +686,9 @@ extern "C" {
             struct llama_context * ctx,
                     llama_seq_id seq_id);
 
+    // TODO: the llama_kv_cache_defrag and llama_kv_cache_update API tightly couples llama_context with llama_kv_cache
+    //       how to avoid this?
+
     // Defragment the KV cache
     // This will be applied:
     //   - lazily on next llama_decode()
package/cpp/rn-llama.hpp CHANGED
@@ -5,64 +5,35 @@
 #include <iostream>
 #include "common.h"
 #include "ggml.h"
+#include "gguf.h"
 #include "llama.h"
 #include "llama-impl.h"
 #include "sampling.h"
+#if defined(__ANDROID__)
+#include <android/log.h>
+#endif
 
 namespace rnllama {
 
-static std::string lm_gguf_data_to_str(enum lm_gguf_type type, const void * data, int i) {
-    switch (type) {
-        case LM_GGUF_TYPE_UINT8: return std::to_string(((const uint8_t *)data)[i]);
-        case LM_GGUF_TYPE_INT8: return std::to_string(((const int8_t *)data)[i]);
-        case LM_GGUF_TYPE_UINT16: return std::to_string(((const uint16_t *)data)[i]);
-        case LM_GGUF_TYPE_INT16: return std::to_string(((const int16_t *)data)[i]);
-        case LM_GGUF_TYPE_UINT32: return std::to_string(((const uint32_t *)data)[i]);
-        case LM_GGUF_TYPE_INT32: return std::to_string(((const int32_t *)data)[i]);
-        case LM_GGUF_TYPE_UINT64: return std::to_string(((const uint64_t *)data)[i]);
-        case LM_GGUF_TYPE_INT64: return std::to_string(((const int64_t *)data)[i]);
-        case LM_GGUF_TYPE_FLOAT32: return std::to_string(((const float *)data)[i]);
-        case LM_GGUF_TYPE_FLOAT64: return std::to_string(((const double *)data)[i]);
-        case LM_GGUF_TYPE_BOOL: return ((const bool *)data)[i] ? "true" : "false";
-        default: return "unknown type: " + std::to_string(type);
-    }
-}
-
-static std::string lm_gguf_kv_to_str(const struct lm_gguf_context * ctx_gguf, int i) {
-    const enum lm_gguf_type type = lm_gguf_get_kv_type(ctx_gguf, i);
+const std::vector<lm_ggml_type> kv_cache_types = {
+    LM_GGML_TYPE_F32,
+    LM_GGML_TYPE_F16,
+    LM_GGML_TYPE_BF16,
+    LM_GGML_TYPE_Q8_0,
+    LM_GGML_TYPE_Q4_0,
+    LM_GGML_TYPE_Q4_1,
+    LM_GGML_TYPE_IQ4_NL,
+    LM_GGML_TYPE_Q5_0,
+    LM_GGML_TYPE_Q5_1,
+};
 
-    switch (type) {
-        case LM_GGUF_TYPE_STRING:
-            return lm_gguf_get_val_str(ctx_gguf, i);
-        case LM_GGUF_TYPE_ARRAY:
-            {
-                const enum lm_gguf_type arr_type = lm_gguf_get_arr_type(ctx_gguf, i);
-                int arr_n = lm_gguf_get_arr_n(ctx_gguf, i);
-                const void * data = lm_gguf_get_arr_data(ctx_gguf, i);
-                std::stringstream ss;
-                ss << "[";
-                for (int j = 0; j < arr_n; j++) {
-                    if (arr_type == LM_GGUF_TYPE_STRING) {
-                        std::string val = lm_gguf_get_arr_str(ctx_gguf, i, j);
-                        // escape quotes
-                        replace_all(val, "\\", "\\\\");
-                        replace_all(val, "\"", "\\\"");
-                        ss << '"' << val << '"';
-                    } else if (arr_type == LM_GGUF_TYPE_ARRAY) {
-                        ss << "???";
-                    } else {
-                        ss << lm_gguf_data_to_str(arr_type, data, j);
-                    }
-                    if (j < arr_n - 1) {
-                        ss << ", ";
-                    }
-                }
-                ss << "]";
-                return ss.str();
-            }
-        default:
-            return lm_gguf_data_to_str(type, lm_gguf_get_val_data(ctx_gguf, i), 0);
+static lm_ggml_type kv_cache_type_from_str(const std::string & s) {
+    for (const auto & type : kv_cache_types) {
+        if (lm_ggml_type_name(type) == s) {
+            return type;
+        }
     }
+    throw std::runtime_error("Unsupported cache type: " + s);
 }
 
 static void llama_batch_clear(llama_batch *batch) {
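The gguf pretty-printing helpers are gone (gguf.cpp now ships as its own source file; see the file list above), and in their place kv_cache_type_from_str maps a cache-type name to the matching lm_ggml_type by comparing against lm_ggml_type_name(), throwing for anything outside the whitelist. A small usage sketch (the "q8_0" string and the surrounding error handling are illustrative):

    #include <stdexcept>

    // sketch: resolving a user-supplied KV cache type, e.g. from JS-side options
    try {
        lm_ggml_type kt = rnllama::kv_cache_type_from_str("q8_0");
        // kt can then be assigned to the K/V cache type fields of the context params
    } catch (const std::runtime_error & e) {
        // unrecognized name: "Unsupported cache type: ..."
    }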
@@ -86,16 +57,32 @@ static void llama_batch_add(llama_batch *batch, llama_token id, llama_pos pos, s
 static void log(const char *level, const char *function, int line,
                 const char *format, ...)
 {
-    printf("[%s] %s:%d ", level, function, line);
-
     va_list args;
-    va_start(args, format);
-    vprintf(format, args);
-    va_end(args);
-
-    printf("\n");
+#if defined(__ANDROID__)
+    char prefix[256];
+    snprintf(prefix, sizeof(prefix), "%s:%d %s", function, line, format);
+
+    va_start(args, format);
+    android_LogPriority priority;
+    if (strcmp(level, "ERROR") == 0) {
+        priority = ANDROID_LOG_ERROR;
+    } else if (strcmp(level, "WARNING") == 0) {
+        priority = ANDROID_LOG_WARN;
+    } else if (strcmp(level, "INFO") == 0) {
+        priority = ANDROID_LOG_INFO;
+    } else {
+        priority = ANDROID_LOG_DEBUG;
+    }
+    __android_log_vprint(priority, "RNLlama", prefix, args);
+    va_end(args);
+#else
+    printf("[%s] %s:%d ", level, function, line);
+    va_start(args, format);
+    vprintf(format, args);
+    va_end(args);
+    printf("\n");
+#endif
 }
-
 static bool rnllama_verbose = false;
 
 #if RNLLAMA_VERBOSE != 1
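With this change a single log() call is routed to logcat (tag "RNLlama", priority derived from the level string) on Android and to stdout everywhere else. An illustrative call site, with a made-up message:

    log("INFO", __func__, __LINE__, "prompt has %d tokens", (int) num_prompt_tokens);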
@@ -187,7 +174,7 @@ static std::string tokens_to_output_formatted_string(const llama_context *ctx, c
 }
 
 template <class Iter>
-static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
+static std::string tokens_to_str(llama_context* ctx, Iter begin, Iter end)
 {
     std::string ret;
     for (; begin != end; ++begin)
@@ -214,6 +201,8 @@ struct llama_rn_context
 
     common_params params;
 
+    common_init_result llama_init;
+
     llama_model *model = nullptr;
     float loading_progress = 0;
     bool is_load_interrupted = false;
@@ -230,18 +219,10 @@ struct llama_rn_context
     std::string stopping_word;
     bool incomplete = false;
 
+    std::vector<common_lora_adapter_info> lora;
+
     ~llama_rn_context()
     {
-        if (ctx)
-        {
-            llama_free(ctx);
-            ctx = nullptr;
-        }
-        if (model)
-        {
-            llama_free_model(model);
-            model = nullptr;
-        }
         if (ctx_sampling != nullptr)
         {
             common_sampler_free(ctx_sampling);
@@ -280,30 +261,26 @@ struct llama_rn_context
     bool loadModel(common_params &params_)
     {
         params = params_;
-        common_init_result result = common_init_from_params(params);
-        model = result.model;
-        ctx = result.context;
+        llama_init = common_init_from_params(params);
+        model = llama_init.model.get();
+        ctx = llama_init.context.get();
         if (model == nullptr)
         {
            LOG_ERROR("unable to load model: %s", params_.model.c_str());
            return false;
         }
-        LOG_VERBOSE("getting n_ctx");
        n_ctx = llama_n_ctx(ctx);
+
+        // We can uncomment for debugging or after this fix: https://github.com/ggerganov/llama.cpp/pull/11101
+        // LOG_INFO("%s\n", common_params_get_system_info(params).c_str());
+
        return true;
     }
 
     bool validateModelChatTemplate() const {
-        std::vector<char> model_template(2048, 0); // longest known template is about 1200 bytes
-        std::string template_key = "tokenizer.chat_template";
-        int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
-        if (res >= 0) {
-            llama_chat_message chat[] = {{"user", "test"}};
-            std::string tmpl = std::string(model_template.data(), model_template.size());
-            int32_t chat_res = llama_chat_apply_template(model, tmpl.c_str(), chat, 1, true, nullptr, 0);
-            return chat_res > 0;
-        }
-        return res > 0;
+        llama_chat_message chat[] = {{"user", "test"}};
+        int32_t chat_res = llama_chat_apply_template(model, nullptr, chat, 1, true, nullptr, 0);
+        return chat_res > 0;
     }
 
     void truncatePrompt(std::vector<llama_token> &prompt_tokens) {
@@ -330,7 +307,7 @@ struct llama_rn_context
 
     void loadPrompt()
     {
-        std::vector<llama_token> prompt_tokens = ::common_tokenize(ctx, params.prompt, true, true);
+        std::vector<llama_token> prompt_tokens = ::common_tokenize(model, params.prompt, true, true);
         num_prompt_tokens = prompt_tokens.size();
 
         // LOG tokens
@@ -439,6 +416,7 @@ struct llama_rn_context
         }
         if (llama_decode(ctx, llama_batch_get_one(&embd[n_past], n_eval)))
         {
+
             LOG_ERROR("failed to eval, n_eval: %d, n_past: %d, n_threads: %d, embd: %s",
                 n_eval,
                 n_past,
@@ -477,7 +455,7 @@ struct llama_rn_context
         const int32_t n_probs = params.sampling.n_probs;
 
         // deprecated
-        /*if (params.sparams.temp <= 0 && n_probs > 0)
+        /*if (params.sampling.temp <= 0 && n_probs > 0)
         {
             // For llama_sample_token_greedy we need to sort candidates
             llama_sampler_init_softmax();
@@ -647,7 +625,11 @@ struct llama_rn_context
         double tg_std = 0;
 
         // TODO: move batch into llama_rn_context (related https://github.com/mybigday/llama.rn/issues/30)
-        llama_batch batch = llama_batch_init(512, 0, 1);
+        llama_batch batch = llama_batch_init(
+            std::min(pp, params.n_ubatch), // max n_tokens is limited by n_ubatch
+            0, // No embeddings
+            1 // Single sequence
+        );
 
         for (int i = 0; i < nr; i++)
         {
@@ -734,7 +716,27 @@ struct llama_rn_context
             std::string("]");
     }
 
-
+    int applyLoraAdapters(std::vector<common_lora_adapter_info> lora) {
+        for (auto &la : lora) {
+            la.ptr = llama_lora_adapter_init(model, la.path.c_str());
+            if (la.ptr == nullptr) {
+                LOG_ERROR("failed to apply lora adapter '%s'\n", la.path.c_str());
+                return -1;
+            }
+        }
+        this->lora = lora;
+        common_lora_adapters_apply(ctx, lora);
+        return 0;
+    }
+
+    void removeLoraAdapters() {
+        this->lora.clear();
+        common_lora_adapters_apply(ctx, this->lora); // apply empty list
+    }
+
+    std::vector<common_lora_adapter_info> getLoadedLoraAdapters() {
+        return this->lora;
+    }
     // Context Shifting from KoboldCpp <https://github.com/LostRuins/koboldcpp>
     // Implementation obtained with special permission from @concedo
 
@@ -897,6 +899,7 @@ void purge_missing_tokens(llama_context * ctx, std::vector<int> &current_context
     }
 
     // End Context Shifting
+
 };
 
 }
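The applyLoraAdapters / removeLoraAdapters / getLoadedLoraAdapters members added to llama_rn_context above presumably back the new native LoRA methods added in RNLlama.java and RNLlama.mm (see the file list). A rough C++-side usage sketch (the adapter path is made up, and the scale field is an assumption about common_lora_adapter_info, which this diff only shows via its path and ptr members):

    // sketch: hot-swapping a LoRA adapter on an already-loaded context
    rnllama::llama_rn_context rn_ctx;
    // ... assume rn_ctx.loadModel(params) succeeded ...

    std::vector<common_lora_adapter_info> adapters(1);
    adapters[0].path  = "/data/local/tmp/my-adapter.gguf"; // illustrative path
    adapters[0].scale = 1.0f;                              // assumed field: adapter scaling

    if (rn_ctx.applyLoraAdapters(adapters) != 0) {
        // llama_lora_adapter_init failed for at least one entry
    }
    auto loaded = rn_ctx.getLoadedLoraAdapters(); // adapters currently applied
    rn_ctx.removeLoraAdapters();                  // applies an empty list, clearing them

Note that model and context teardown now happens through the smart pointers held in llama_init (see the destructor change in the loadModel hunks), so the explicit llama_free / llama_free_model calls were removed from ~llama_rn_context().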