cui-llama.rn 1.2.6 → 1.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. package/README.md +3 -2
  2. package/android/src/main/CMakeLists.txt +26 -6
  3. package/android/src/main/java/com/rnllama/LlamaContext.java +115 -27
  4. package/android/src/main/java/com/rnllama/RNLlama.java +40 -7
  5. package/android/src/main/jni.cpp +228 -40
  6. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +9 -4
  7. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +9 -4
  8. package/cpp/amx/amx.cpp +196 -0
  9. package/cpp/amx/amx.h +20 -0
  10. package/cpp/amx/common.h +101 -0
  11. package/cpp/amx/mmq.cpp +2524 -0
  12. package/cpp/amx/mmq.h +16 -0
  13. package/cpp/common.cpp +118 -251
  14. package/cpp/common.h +53 -30
  15. package/cpp/ggml-aarch64.c +46 -3395
  16. package/cpp/ggml-aarch64.h +0 -20
  17. package/cpp/ggml-alloc.c +6 -8
  18. package/cpp/ggml-backend-impl.h +33 -11
  19. package/cpp/ggml-backend-reg.cpp +423 -0
  20. package/cpp/ggml-backend.cpp +14 -676
  21. package/cpp/ggml-backend.h +46 -9
  22. package/cpp/ggml-common.h +6 -0
  23. package/cpp/ggml-cpu-aarch64.c +3823 -0
  24. package/cpp/ggml-cpu-aarch64.h +32 -0
  25. package/cpp/ggml-cpu-impl.h +14 -242
  26. package/cpp/ggml-cpu-quants.c +10835 -0
  27. package/cpp/ggml-cpu-quants.h +63 -0
  28. package/cpp/ggml-cpu.c +13971 -13720
  29. package/cpp/ggml-cpu.cpp +715 -0
  30. package/cpp/ggml-cpu.h +65 -63
  31. package/cpp/ggml-impl.h +285 -25
  32. package/cpp/ggml-metal.h +8 -8
  33. package/cpp/ggml-metal.m +1221 -728
  34. package/cpp/ggml-quants.c +189 -10681
  35. package/cpp/ggml-quants.h +78 -125
  36. package/cpp/ggml-threading.cpp +12 -0
  37. package/cpp/ggml-threading.h +12 -0
  38. package/cpp/ggml.c +688 -1460
  39. package/cpp/ggml.h +58 -244
  40. package/cpp/json-schema-to-grammar.cpp +1045 -1045
  41. package/cpp/json.hpp +24766 -24766
  42. package/cpp/llama-sampling.cpp +5 -2
  43. package/cpp/llama.cpp +409 -123
  44. package/cpp/llama.h +8 -4
  45. package/cpp/rn-llama.hpp +89 -25
  46. package/cpp/sampling.cpp +42 -3
  47. package/cpp/sampling.h +22 -1
  48. package/cpp/sgemm.cpp +608 -0
  49. package/cpp/speculative.cpp +270 -0
  50. package/cpp/speculative.h +28 -0
  51. package/cpp/unicode.cpp +11 -0
  52. package/ios/RNLlama.mm +43 -20
  53. package/ios/RNLlamaContext.h +9 -3
  54. package/ios/RNLlamaContext.mm +146 -33
  55. package/jest/mock.js +0 -1
  56. package/lib/commonjs/NativeRNLlama.js.map +1 -1
  57. package/lib/commonjs/grammar.js +4 -2
  58. package/lib/commonjs/grammar.js.map +1 -1
  59. package/lib/commonjs/index.js +52 -15
  60. package/lib/commonjs/index.js.map +1 -1
  61. package/lib/module/NativeRNLlama.js.map +1 -1
  62. package/lib/module/grammar.js +2 -1
  63. package/lib/module/grammar.js.map +1 -1
  64. package/lib/module/index.js +51 -15
  65. package/lib/module/index.js.map +1 -1
  66. package/lib/typescript/NativeRNLlama.d.ts +122 -8
  67. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  68. package/lib/typescript/grammar.d.ts +5 -6
  69. package/lib/typescript/grammar.d.ts.map +1 -1
  70. package/lib/typescript/index.d.ts +15 -6
  71. package/lib/typescript/index.d.ts.map +1 -1
  72. package/package.json +2 -1
  73. package/src/NativeRNLlama.ts +135 -13
  74. package/src/grammar.ts +10 -8
  75. package/src/index.ts +104 -28
package/cpp/common.h CHANGED
@@ -33,6 +33,8 @@ struct common_lora_adapter_container : common_lora_adapter_info {
     struct llama_lora_adapter * adapter;
 };
 
+using llama_tokens = std::vector<llama_token>;
+
 // build info
 extern int LLAMA_BUILD_NUMBER;
 extern char const * LLAMA_COMMIT;
@@ -112,8 +114,8 @@ enum dimre_method {
     DIMRE_METHOD_MEAN,
 };
 
-// sampler parameters
-struct common_sampler_params {
+// sampling parameters
+struct common_params_sampling {
     uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
 
     int32_t n_prev = 64; // number of previous tokens to remember
@@ -164,6 +166,21 @@ struct common_sampler_params {
     std::string print() const;
 };
 
+struct common_params_speculative {
+    std::vector<lm_ggml_backend_dev_t> devices; // devices to use for offloading
+    int32_t n_ctx = 0;          // draft context size
+    int32_t n_max = 16;         // maximum number of tokens to draft during speculative decoding
+    int32_t n_min = 5;          // minimum number of draft tokens to use for speculative decoding
+    int32_t n_gpu_layers = -1;  // number of layers to store in VRAM for the draft model (-1 - use default)
+    float p_split = 0.1f;       // speculative decoding split probability
+    float p_min = 0.9f;         // minimum speculative decoding probability (greedy)
+
+    struct cpu_params cpuparams;
+    struct cpu_params cpuparams_batch;
+
+    std::string model = ""; // draft model for speculative decoding // NOLINT
+};
+
 struct common_params {
 
     void * progress_callback_user_data = nullptr;
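For orientation, a minimal sketch (not from the package) of how the new speculative block might be filled in; the field names are the ones declared in the hunk above, while the draft-model path and values are illustrative placeholders mirroring the defaults:

    #include "common.h"

    // illustrative values only; all field names come from common_params_speculative above
    common_params_speculative make_draft_config() {
        common_params_speculative spec;
        spec.model        = "/models/draft.gguf"; // hypothetical path to a small draft model
        spec.n_ctx        = 0;                    // 0 = default draft context size
        spec.n_max        = 16;                   // draft at most 16 tokens per step
        spec.n_min        = 5;                    // skip speculation below 5 draft tokens
        spec.n_gpu_layers = -1;                   // -1 = backend default for the draft model
        spec.p_min        = 0.9f;                 // minimum (greedy) acceptance probability
        return spec;
    }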
@@ -174,15 +191,9 @@ struct common_params {
     int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_keep = 0; // number of tokens to keep from initial prompt
-    int32_t n_draft = 5; // number of tokens to draft during speculative decoding
     int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
     int32_t n_parallel = 1; // number of parallel sequences to decode
     int32_t n_sequences = 1; // number of sequences to decode
-    float p_split = 0.1f; // speculative decoding split probability
-    int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
-    int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
-    int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
-    float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
     int32_t grp_attn_n = 1; // group-attention factor
     int32_t grp_attn_w = 512; // group-attention width
     int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
@@ -193,27 +204,31 @@ struct common_params {
     float yarn_beta_fast = 32.0f; // YaRN low correction dim
     float yarn_beta_slow = 1.0f; // YaRN high correction dim
     int32_t yarn_orig_ctx = 0; // YaRN original context length
-    float defrag_thold = -1.0f; // KV cache defragmentation threshold
+    float defrag_thold = 0.1f; // KV cache defragmentation threshold
+
+    // offload params
+    std::vector<lm_ggml_backend_dev_t> devices; // devices to use for offloading
+    int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
+    int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
+    float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
+    enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
 
     struct cpu_params cpuparams;
     struct cpu_params cpuparams_batch;
-    struct cpu_params draft_cpuparams;
-    struct cpu_params draft_cpuparams_batch;
 
     lm_ggml_backend_sched_eval_callback cb_eval = nullptr;
     void * cb_eval_user_data = nullptr;
 
     lm_ggml_numa_strategy numa = LM_GGML_NUMA_STRATEGY_DISABLED;
 
-    enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
     enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
     enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
 
-    struct common_sampler_params sparams;
+    struct common_params_sampling sampling;
+    struct common_params_speculative speculative;
 
     std::string model = ""; // model path // NOLINT
-    std::string model_draft = ""; // draft model for speculative decoding // NOLINT
     std::string model_alias = "unknown"; // model alias // NOLINT
     std::string model_url = ""; // model url to download // NOLINT
     std::string hf_token = ""; // HF token // NOLINT
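A hedged migration sketch (not from the package) showing how code that used the old member names maps onto the renamed and relocated members in this hunk; the numeric values and path are placeholders:

    #include "common.h"

    void migrate_example(common_params & params) {
        // before (old common.h):            params.sparams.seed, params.model_draft, params.n_gpu_layers_draft
        // after  (this version):
        params.sampling.seed            = 1234;      // common_sampler_params -> common_params_sampling
        params.speculative.model        = "d.gguf";  // model_draft now lives on the speculative block
        params.speculative.n_gpu_layers = 20;        // replaces the removed n_gpu_layers_draft
    }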
@@ -224,7 +239,6 @@ struct common_params {
     std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT
     std::string input_prefix = ""; // string to prefix user inputs with // NOLINT
     std::string input_suffix = ""; // string to suffix user inputs with // NOLINT
-    std::string logdir = ""; // directory in which to save YAML log files // NOLINT
     std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
     std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
     std::string logits_file = ""; // file for saving *all* logits // NOLINT
@@ -467,17 +481,28 @@ struct common_init_result {
 
 struct common_init_result common_init_from_params(common_params & params);
 
-struct llama_model_params common_model_params_to_llama (const common_params & params);
+struct llama_model_params common_model_params_to_llama ( common_params & params);
 struct llama_context_params common_context_params_to_llama(const common_params & params);
 struct lm_ggml_threadpool_params lm_ggml_threadpool_params_from_cpu_params(const cpu_params & params);
 
-struct llama_model * common_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
-struct llama_model * common_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
+struct llama_model * common_load_model_from_url(
+    const std::string & model_url,
+    const std::string & local_path,
+    const std::string & hf_token,
+    const struct llama_model_params & params);
+struct llama_model * common_load_model_from_hf(
+    const std::string & repo,
+    const std::string & remote_path,
+    const std::string & local_path,
+    const std::string & hf_token,
+    const struct llama_model_params & params);
 
 // clear LoRA adapters from context, then apply new list of adapters
 void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters);
 
+//
 // Batch utils
+//
 
 void common_batch_clear(struct llama_batch & batch);
 
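A hedged call sketch (not from the package) against the new std::string-based loader declared above; the repo, remote file, and cache path are illustrative, and llama_model_default_params() is the stock llama.h helper:

    #include "common.h"

    llama_model * load_draft_from_hf() {
        struct llama_model_params mparams = llama_model_default_params();
        // arguments follow the declaration order: repo, remote_path, local_path, hf_token, params
        return common_load_model_from_hf(
            "someone/some-model-GGUF",  // repo (illustrative)
            "model-q4_k_m.gguf",        // remote_path (illustrative)
            "/tmp/model-q4_k_m.gguf",   // local_path where the download is cached (illustrative)
            "",                         // hf_token: empty for public repos
            mparams);
    }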
@@ -488,6 +513,16 @@ void common_batch_add(
                       const std::vector<llama_seq_id> & seq_ids,
                       bool logits);
 
+//
+// Token utils
+//
+
+// longest common prefix
+size_t common_lcp(const llama_tokens & a, const llama_tokens & b);
+
+// longest common subsequence
+size_t common_lcs(const llama_tokens & a, const llama_tokens & b);
+
 //
 // Vocab utils
 //
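For clarity, a minimal sketch (not the package's implementation) of what the longest-common-prefix helper declared above computes over two token vectors:

    #include "common.h"

    // counts how many leading tokens two sequences share; common_lcp is declared with
    // the same shape above, but this body is only an illustration
    static size_t lcp_sketch(const llama_tokens & a, const llama_tokens & b) {
        size_t i = 0;
        while (i < a.size() && i < b.size() && a[i] == b[i]) {
            i++;
        }
        return i;
    }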
@@ -599,15 +634,3 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
 static const char * const LLM_KV_SPLIT_NO = "split.no";
 static const char * const LLM_KV_SPLIT_COUNT = "split.count";
 static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
-
-//
-// YAML utils
-//
-
-void yaml_dump_vector_float (FILE * stream, const char * prop_name, const std::vector<float> & data);
-void yaml_dump_vector_int (FILE * stream, const char * prop_name, const std::vector<int> & data);
-void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data);
-
-void yaml_dump_non_result_info(
-    FILE * stream, const common_params & params, const llama_context * lctx,
-    const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);