cui-llama.rn 1.2.6 → 1.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -2
- package/android/src/main/CMakeLists.txt +26 -6
- package/android/src/main/java/com/rnllama/LlamaContext.java +115 -27
- package/android/src/main/java/com/rnllama/RNLlama.java +40 -7
- package/android/src/main/jni.cpp +228 -40
- package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +9 -4
- package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +9 -4
- package/cpp/amx/amx.cpp +196 -0
- package/cpp/amx/amx.h +20 -0
- package/cpp/amx/common.h +101 -0
- package/cpp/amx/mmq.cpp +2524 -0
- package/cpp/amx/mmq.h +16 -0
- package/cpp/common.cpp +118 -251
- package/cpp/common.h +53 -30
- package/cpp/ggml-aarch64.c +46 -3395
- package/cpp/ggml-aarch64.h +0 -20
- package/cpp/ggml-alloc.c +6 -8
- package/cpp/ggml-backend-impl.h +33 -11
- package/cpp/ggml-backend-reg.cpp +423 -0
- package/cpp/ggml-backend.cpp +14 -676
- package/cpp/ggml-backend.h +46 -9
- package/cpp/ggml-common.h +6 -0
- package/cpp/ggml-cpu-aarch64.c +3823 -0
- package/cpp/ggml-cpu-aarch64.h +32 -0
- package/cpp/ggml-cpu-impl.h +14 -242
- package/cpp/ggml-cpu-quants.c +10835 -0
- package/cpp/ggml-cpu-quants.h +63 -0
- package/cpp/ggml-cpu.c +13971 -13720
- package/cpp/ggml-cpu.cpp +715 -0
- package/cpp/ggml-cpu.h +65 -63
- package/cpp/ggml-impl.h +285 -25
- package/cpp/ggml-metal.h +8 -8
- package/cpp/ggml-metal.m +1221 -728
- package/cpp/ggml-quants.c +189 -10681
- package/cpp/ggml-quants.h +78 -125
- package/cpp/ggml-threading.cpp +12 -0
- package/cpp/ggml-threading.h +12 -0
- package/cpp/ggml.c +688 -1460
- package/cpp/ggml.h +58 -244
- package/cpp/json-schema-to-grammar.cpp +1045 -1045
- package/cpp/json.hpp +24766 -24766
- package/cpp/llama-sampling.cpp +5 -2
- package/cpp/llama.cpp +409 -123
- package/cpp/llama.h +8 -4
- package/cpp/rn-llama.hpp +89 -25
- package/cpp/sampling.cpp +42 -3
- package/cpp/sampling.h +22 -1
- package/cpp/sgemm.cpp +608 -0
- package/cpp/speculative.cpp +270 -0
- package/cpp/speculative.h +28 -0
- package/cpp/unicode.cpp +11 -0
- package/ios/RNLlama.mm +43 -20
- package/ios/RNLlamaContext.h +9 -3
- package/ios/RNLlamaContext.mm +146 -33
- package/jest/mock.js +0 -1
- package/lib/commonjs/NativeRNLlama.js.map +1 -1
- package/lib/commonjs/grammar.js +4 -2
- package/lib/commonjs/grammar.js.map +1 -1
- package/lib/commonjs/index.js +52 -15
- package/lib/commonjs/index.js.map +1 -1
- package/lib/module/NativeRNLlama.js.map +1 -1
- package/lib/module/grammar.js +2 -1
- package/lib/module/grammar.js.map +1 -1
- package/lib/module/index.js +51 -15
- package/lib/module/index.js.map +1 -1
- package/lib/typescript/NativeRNLlama.d.ts +122 -8
- package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
- package/lib/typescript/grammar.d.ts +5 -6
- package/lib/typescript/grammar.d.ts.map +1 -1
- package/lib/typescript/index.d.ts +15 -6
- package/lib/typescript/index.d.ts.map +1 -1
- package/package.json +2 -1
- package/src/NativeRNLlama.ts +135 -13
- package/src/grammar.ts +10 -8
- package/src/index.ts +104 -28
package/cpp/common.h
CHANGED
```diff
@@ -33,6 +33,8 @@ struct common_lora_adapter_container : common_lora_adapter_info {
     struct llama_lora_adapter * adapter;
 };
 
+using llama_tokens = std::vector<llama_token>;
+
 // build info
 extern int LLAMA_BUILD_NUMBER;
 extern char const * LLAMA_COMMIT;
```
```diff
@@ -112,8 +114,8 @@ enum dimre_method {
     DIMRE_METHOD_MEAN,
 };
 
-// sampler parameters
-struct common_sampler_params {
+// sampling parameters
+struct common_params_sampling {
     uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
 
     int32_t n_prev = 64; // number of previous tokens to remember
```
```diff
@@ -164,6 +166,21 @@ struct common_sampler_params {
     std::string print() const;
 };
 
+struct common_params_speculative {
+    std::vector<lm_ggml_backend_dev_t> devices; // devices to use for offloading
+    int32_t n_ctx = 0;         // draft context size
+    int32_t n_max = 16;        // maximum number of tokens to draft during speculative decoding
+    int32_t n_min = 5;         // minimum number of draft tokens to use for speculative decoding
+    int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
+    float p_split = 0.1f;      // speculative decoding split probability
+    float p_min = 0.9f;        // minimum speculative decoding probability (greedy)
+
+    struct cpu_params cpuparams;
+    struct cpu_params cpuparams_batch;
+
+    std::string model = ""; // draft model for speculative decoding // NOLINT
+};
+
 struct common_params {
 
     void * progress_callback_user_data = nullptr;
```
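The new `common_params_speculative` struct gathers the draft-model settings into one place. Below is a minimal sketch of filling it in from embedder code; the model path and the chosen values are illustrative, not defaults shipped by the package.

```cpp
// Sketch only: field names and comments mirror the struct added above;
// the draft-model path and values are hypothetical.
#include "common.h"

common_params_speculative make_draft_config() {
    common_params_speculative spec;
    spec.model        = "models/draft-q4_0.gguf"; // hypothetical draft model path
    spec.n_max        = 16;   // cap on tokens drafted per speculation step
    spec.n_min        = 5;    // skip drafting when fewer tokens would be produced
    spec.p_min        = 0.9f; // only keep high-probability draft tokens
    spec.n_gpu_layers = -1;   // backend default for draft-model offload
    return spec;
}
```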
```diff
@@ -174,15 +191,9 @@ struct common_params {
     int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_keep = 0; // number of tokens to keep from initial prompt
-    int32_t n_draft = 5; // number of tokens to draft during speculative decoding
     int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
     int32_t n_parallel = 1; // number of parallel sequences to decode
     int32_t n_sequences = 1; // number of sequences to decode
-    float p_split = 0.1f; // speculative decoding split probability
-    int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
-    int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
-    int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
-    float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
     int32_t grp_attn_n = 1; // group-attention factor
     int32_t grp_attn_w = 512; // group-attention width
     int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
```
```diff
@@ -193,27 +204,31 @@ struct common_params {
     float yarn_beta_fast = 32.0f; // YaRN low correction dim
     float yarn_beta_slow = 1.0f; // YaRN high correction dim
     int32_t yarn_orig_ctx = 0; // YaRN original context length
-    float defrag_thold =
+    float defrag_thold = 0.1f; // KV cache defragmentation threshold
+
+    // offload params
+    std::vector<lm_ggml_backend_dev_t> devices; // devices to use for offloading
+    int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
+    int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
+    float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
+    enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
 
     struct cpu_params cpuparams;
     struct cpu_params cpuparams_batch;
-    struct cpu_params draft_cpuparams;
-    struct cpu_params draft_cpuparams_batch;
 
     lm_ggml_backend_sched_eval_callback cb_eval = nullptr;
     void * cb_eval_user_data = nullptr;
 
     lm_ggml_numa_strategy numa = LM_GGML_NUMA_STRATEGY_DISABLED;
 
-    enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
     enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
     enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
 
-    struct
+    struct common_params_sampling sampling;
+    struct common_params_speculative speculative;
 
     std::string model = ""; // model path // NOLINT
-    std::string model_draft = ""; // draft model for speculative decoding // NOLINT
     std::string model_alias = "unknown"; // model alias // NOLINT
     std::string model_url = ""; // model url to download // NOLINT
     std::string hf_token = ""; // HF token // NOLINT
```
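Taken together, the two hunks above regroup `common_params`: the offload fields move under a dedicated comment block, and sampling and speculative-decoding options become the nested `sampling` and `speculative` members. A rough migration sketch for downstream code follows; the function and the values are hypothetical, and member names follow the diff.

```cpp
// Sketch: adapting embedder code to the regrouped common_params layout.
#include "common.h"

void configure(common_params & params) {
    // offload params keep their old names but are now grouped together
    params.n_gpu_layers = 99;
    params.main_gpu     = 0;
    params.split_mode   = LLAMA_SPLIT_MODE_LAYER;

    // sampling options live behind the renamed common_params_sampling member
    params.sampling.seed   = LLAMA_DEFAULT_SEED;
    params.sampling.n_prev = 64;

    // draft-model settings replace the removed model_draft / n_gpu_layers_draft / p_split fields
    params.speculative.model = "models/draft-q4_0.gguf"; // hypothetical path
}
```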
```diff
@@ -224,7 +239,6 @@ struct common_params {
     std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT
     std::string input_prefix = ""; // string to prefix user inputs with // NOLINT
     std::string input_suffix = ""; // string to suffix user inputs with // NOLINT
-    std::string logdir = ""; // directory in which to save YAML log files // NOLINT
     std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
     std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
     std::string logits_file = ""; // file for saving *all* logits // NOLINT
```
```diff
@@ -467,17 +481,28 @@ struct common_init_result {
 
 struct common_init_result common_init_from_params(common_params & params);
 
-struct llama_model_params common_model_params_to_llama (
+struct llama_model_params common_model_params_to_llama ( common_params & params);
 struct llama_context_params common_context_params_to_llama(const common_params & params);
 struct lm_ggml_threadpool_params lm_ggml_threadpool_params_from_cpu_params(const cpu_params & params);
 
-struct llama_model * common_load_model_from_url(
-
+struct llama_model * common_load_model_from_url(
+    const std::string & model_url,
+    const std::string & local_path,
+    const std::string & hf_token,
+    const struct llama_model_params & params);
+struct llama_model * common_load_model_from_hf(
+    const std::string & repo,
+    const std::string & remote_path,
+    const std::string & local_path,
+    const std::string & hf_token,
+    const struct llama_model_params & params);
 
 // clear LoRA adapters from context, then apply new list of adapters
 void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters);
 
+//
 // Batch utils
+//
 
 void common_batch_clear(struct llama_batch & batch);
 
```
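The download helpers are now declared with `std::string` arguments, and `common_load_model_from_hf` takes the repo, remote path, local path, and token as separate parameters. A hedged usage sketch follows; the repo, file, and cache path are placeholders rather than values from the package.

```cpp
// Sketch: calling the declarations shown above; the helper returns nullptr on
// failure, so callers should check the result. All string values are placeholders.
#include "common.h"
#include "llama.h"

llama_model * load_model_from_hub() {
    llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = 0; // CPU-only load for this sketch

    return common_load_model_from_hf(
        "example-org/example-gguf", // hypothetical HF repo
        "model-q4_0.gguf",          // hypothetical file inside the repo
        "/tmp/model-q4_0.gguf",     // local cache destination
        "",                         // hf_token: empty for public repos
        mparams);
}
```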
```diff
@@ -488,6 +513,16 @@ void common_batch_add(
     const std::vector<llama_seq_id> & seq_ids,
     bool logits);
 
+//
+// Token utils
+//
+
+// longest common prefix
+size_t common_lcp(const llama_tokens & a, const llama_tokens & b);
+
+// longet common subsequence
+size_t common_lcs(const llama_tokens & a, const llama_tokens & b);
+
 //
 // Vocab utils
 //
```
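`common_lcp` and `common_lcs` are new token-level helpers declared under the "Token utils" heading; their implementations live in `package/cpp/common.cpp`. For illustration, a longest-common-prefix over two `llama_tokens` vectors could look like the sketch below (not the package's actual implementation).

```cpp
// Sketch of a longest-common-prefix helper matching the common_lcp declaration;
// llama_token is assumed to be int32_t, as typedef'd in llama.h.
#include <cstddef>
#include <cstdint>
#include <vector>

using llama_token  = int32_t;
using llama_tokens = std::vector<llama_token>; // introduced in the first hunk above

size_t lcp_sketch(const llama_tokens & a, const llama_tokens & b) {
    size_t i = 0;
    while (i < a.size() && i < b.size() && a[i] == b[i]) {
        i++;
    }
    return i;
}
```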
```diff
@@ -599,15 +634,3 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
 static const char * const LLM_KV_SPLIT_NO = "split.no";
 static const char * const LLM_KV_SPLIT_COUNT = "split.count";
 static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
-
-//
-// YAML utils
-//
-
-void yaml_dump_vector_float (FILE * stream, const char * prop_name, const std::vector<float> & data);
-void yaml_dump_vector_int (FILE * stream, const char * prop_name, const std::vector<int> & data);
-void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data);
-
-void yaml_dump_non_result_info(
-    FILE * stream, const common_params & params, const llama_context * lctx,
-    const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
```