cui-llama.rn 1.3.6 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. package/README.md +22 -1
  2. package/android/src/main/CMakeLists.txt +25 -26
  3. package/android/src/main/java/com/rnllama/LlamaContext.java +31 -9
  4. package/android/src/main/java/com/rnllama/RNLlama.java +98 -0
  5. package/android/src/main/jni-utils.h +94 -0
  6. package/android/src/main/jni.cpp +132 -62
  7. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +15 -0
  8. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +15 -0
  9. package/cpp/common.cpp +1982 -1982
  10. package/cpp/common.h +665 -664
  11. package/cpp/ggml-cpu.c +14122 -14122
  12. package/cpp/ggml-cpu.cpp +627 -627
  13. package/cpp/ggml-metal-impl.h +288 -0
  14. package/cpp/ggml-opt.cpp +854 -0
  15. package/cpp/ggml-opt.h +216 -0
  16. package/cpp/llama-mmap.cpp +589 -589
  17. package/cpp/llama.cpp +12547 -12544
  18. package/cpp/rn-llama.hpp +117 -116
  19. package/cpp/sgemm.h +14 -14
  20. package/ios/RNLlama.mm +47 -0
  21. package/ios/RNLlamaContext.h +3 -1
  22. package/ios/RNLlamaContext.mm +71 -14
  23. package/jest/mock.js +15 -3
  24. package/lib/commonjs/NativeRNLlama.js.map +1 -1
  25. package/lib/commonjs/index.js +33 -37
  26. package/lib/commonjs/index.js.map +1 -1
  27. package/lib/module/NativeRNLlama.js.map +1 -1
  28. package/lib/module/index.js +31 -35
  29. package/lib/module/index.js.map +1 -1
  30. package/lib/typescript/NativeRNLlama.d.ts +26 -6
  31. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  32. package/lib/typescript/index.d.ts +21 -36
  33. package/lib/typescript/index.d.ts.map +1 -1
  34. package/llama-rn.podspec +4 -18
  35. package/package.json +2 -3
  36. package/src/NativeRNLlama.ts +32 -13
  37. package/src/index.ts +52 -47
  38. package/cpp/llama.cpp.rej +0 -23
package/cpp/common.h CHANGED
@@ -1,664 +1,665 @@
1
- // Various helper functions and utilities
2
-
3
- #pragma once
4
-
5
- #include "llama-cpp.h"
6
-
7
- #include <string>
8
- #include <vector>
9
- #include <sstream>
10
-
11
- #ifdef _WIN32
12
- #define DIRECTORY_SEPARATOR '\\'
13
- #else
14
- #define DIRECTORY_SEPARATOR '/'
15
- #endif // _WIN32
16
-
17
- #define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
18
- #define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
19
-
20
- #define print_build_info() do { \
21
- fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
22
- fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
23
- } while(0)
24
-
25
- #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
26
-
27
- struct common_lora_adapter_info {
28
- std::string path;
29
- float scale;
30
-
31
- struct llama_lora_adapter * ptr;
32
- };
33
-
34
- using llama_tokens = std::vector<llama_token>;
35
-
36
- // build info
37
- extern int LLAMA_BUILD_NUMBER;
38
- extern const char * LLAMA_COMMIT;
39
- extern const char * LLAMA_COMPILER;
40
- extern const char * LLAMA_BUILD_TARGET;
41
-
42
- struct common_control_vector_load_info;
43
-
44
- #define print_build_info() do { \
45
- fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
46
- fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
47
- } while(0)
48
-
49
- // build info
50
- extern int LLAMA_BUILD_NUMBER;
51
- extern char const *LLAMA_COMMIT;
52
- extern char const *LLAMA_COMPILER;
53
- extern char const *LLAMA_BUILD_TARGET;
54
-
55
- //
56
- // CPU utils
57
- //
58
-
59
- struct cpu_params {
60
- int n_threads = -1;
61
- bool cpumask[LM_GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
62
- bool mask_valid = false; // Default: any CPU
63
- enum lm_ggml_sched_priority priority = LM_GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
64
- bool strict_cpu = false; // Use strict CPU placement
65
- uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
66
- };
67
-
68
- int32_t cpu_get_num_physical_cores();
69
- int32_t cpu_get_num_math();
70
-
71
- //
72
- // Common params
73
- //
74
-
75
- enum llama_example {
76
- LLAMA_EXAMPLE_COMMON,
77
- LLAMA_EXAMPLE_SPECULATIVE,
78
- LLAMA_EXAMPLE_MAIN,
79
- LLAMA_EXAMPLE_INFILL,
80
- LLAMA_EXAMPLE_EMBEDDING,
81
- LLAMA_EXAMPLE_PERPLEXITY,
82
- LLAMA_EXAMPLE_RETRIEVAL,
83
- LLAMA_EXAMPLE_PASSKEY,
84
- LLAMA_EXAMPLE_IMATRIX,
85
- LLAMA_EXAMPLE_BENCH,
86
- LLAMA_EXAMPLE_SERVER,
87
- LLAMA_EXAMPLE_CVECTOR_GENERATOR,
88
- LLAMA_EXAMPLE_EXPORT_LORA,
89
- LLAMA_EXAMPLE_LLAVA,
90
- LLAMA_EXAMPLE_LOOKUP,
91
- LLAMA_EXAMPLE_PARALLEL,
92
- LLAMA_EXAMPLE_TTS,
93
-
94
- LLAMA_EXAMPLE_COUNT,
95
- };
96
-
97
- enum common_sampler_type {
98
- COMMON_SAMPLER_TYPE_NONE = 0,
99
- COMMON_SAMPLER_TYPE_DRY = 1,
100
- COMMON_SAMPLER_TYPE_TOP_K = 2,
101
- COMMON_SAMPLER_TYPE_TOP_P = 3,
102
- COMMON_SAMPLER_TYPE_MIN_P = 4,
103
- //COMMON_SAMPLER_TYPE_TFS_Z = 5,
104
- COMMON_SAMPLER_TYPE_TYPICAL_P = 6,
105
- COMMON_SAMPLER_TYPE_TEMPERATURE = 7,
106
- COMMON_SAMPLER_TYPE_XTC = 8,
107
- COMMON_SAMPLER_TYPE_INFILL = 9,
108
- COMMON_SAMPLER_TYPE_PENALTIES = 10,
109
- };
110
-
111
- // dimensionality reduction methods, used by cvector-generator
112
- enum dimre_method {
113
- DIMRE_METHOD_PCA,
114
- DIMRE_METHOD_MEAN,
115
- };
116
-
117
- // sampling parameters
118
- struct common_params_sampling {
119
- uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
120
-
121
- int32_t n_prev = 64; // number of previous tokens to remember
122
- int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
123
- int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
124
- int32_t top_k = 40; // <= 0 to use vocab size
125
- float top_p = 0.95f; // 1.0 = disabled
126
- float min_p = 0.05f; // 0.0 = disabled
127
- float xtc_probability = 0.00f; // 0.0 = disabled
128
- float xtc_threshold = 0.10f; // > 0.5 disables XTC
129
- float typ_p = 1.00f; // typical_p, 1.0 = disabled
130
- float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
131
- float dynatemp_range = 0.00f; // 0.0 = disabled
132
- float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
133
- int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
134
- float penalty_repeat = 1.00f; // 1.0 = disabled
135
- float penalty_freq = 0.00f; // 0.0 = disabled
136
- float penalty_present = 0.00f; // 0.0 = disabled
137
- float dry_multiplier = 0.0f; // 0.0 = disabled; DRY repetition penalty for tokens extending repetition:
138
- float dry_base = 1.75f; // 0.0 = disabled; multiplier * base ^ (length of sequence before token - allowed length)
139
- int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty
140
- int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
141
- int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
142
- float mirostat_tau = 5.00f; // target entropy
143
- float mirostat_eta = 0.10f; // learning rate
144
- bool ignore_eos = false;
145
- bool no_perf = false; // disable performance metrics
146
- bool timing_per_token = false;
147
-
148
- std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY
149
-
150
-
151
- std::vector<enum common_sampler_type> samplers = {
152
- COMMON_SAMPLER_TYPE_PENALTIES,
153
- COMMON_SAMPLER_TYPE_DRY,
154
- COMMON_SAMPLER_TYPE_TOP_K,
155
- COMMON_SAMPLER_TYPE_TYPICAL_P,
156
- COMMON_SAMPLER_TYPE_TOP_P,
157
- COMMON_SAMPLER_TYPE_MIN_P,
158
- COMMON_SAMPLER_TYPE_XTC,
159
- COMMON_SAMPLER_TYPE_TEMPERATURE,
160
- };
161
-
162
- std::string grammar; // optional BNF-like grammar to constrain sampling
163
-
164
- std::vector<llama_logit_bias> logit_bias; // logit biases to apply
165
-
166
- // print the parameters into a string
167
- std::string print() const;
168
- };
169
-
170
- struct common_params_speculative {
171
- std::vector<lm_ggml_backend_dev_t> devices; // devices to use for offloading
172
-
173
- int32_t n_ctx = 0; // draft context size
174
- int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
175
- int32_t n_min = 5; // minimum number of draft tokens to use for speculative decoding
176
- int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
177
- float p_split = 0.1f; // speculative decoding split probability
178
- float p_min = 0.9f; // minimum speculative decoding probability (greedy)
179
-
180
- struct cpu_params cpuparams;
181
- struct cpu_params cpuparams_batch;
182
-
183
- std::string model = ""; // draft model for speculative decoding // NOLINT
184
- };
185
-
186
- struct common_params_vocoder {
187
- std::string hf_repo = ""; // HF repo // NOLINT
188
- std::string hf_file = ""; // HF file // NOLINT
189
-
190
- std::string model = ""; // model path // NOLINT
191
- std::string model_url = ""; // model url to download // NOLINT
192
- };
193
-
194
- struct common_params {
195
-
196
- void * progress_callback_user_data = nullptr;
197
- llama_progress_callback progress_callback = nullptr;
198
- bool vocab_only = false;
199
- int32_t n_predict = -1; // new tokens to predict
200
- int32_t n_ctx = 4096; // context size
201
- int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
202
- int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
203
- int32_t n_keep = 0; // number of tokens to keep from initial prompt
204
- int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
205
- int32_t n_parallel = 1; // number of parallel sequences to decode
206
- int32_t n_sequences = 1; // number of sequences to decode
207
- int32_t grp_attn_n = 1; // group-attention factor
208
- int32_t grp_attn_w = 512; // group-attention width
209
- int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
210
- float rope_freq_base = 0.0f; // RoPE base frequency
211
- float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
212
- float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
213
- float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor
214
- float yarn_beta_fast = 32.0f; // YaRN low correction dim
215
- float yarn_beta_slow = 1.0f; // YaRN high correction dim
216
- int32_t yarn_orig_ctx = 0; // YaRN original context length
217
- float defrag_thold = 0.1f; // KV cache defragmentation threshold
218
-
219
- // offload params
220
- std::vector<lm_ggml_backend_dev_t> devices; // devices to use for offloading
221
-
222
- int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
223
- int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
224
- float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
225
-
226
- enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
227
-
228
- struct cpu_params cpuparams;
229
- struct cpu_params cpuparams_batch;
230
-
231
- lm_ggml_backend_sched_eval_callback cb_eval = nullptr;
232
- void * cb_eval_user_data = nullptr;
233
-
234
- lm_ggml_numa_strategy numa = LM_GGML_NUMA_STRATEGY_DISABLED;
235
-
236
- enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
237
- enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
238
- enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
239
-
240
- struct common_params_sampling sampling;
241
- struct common_params_speculative speculative;
242
- struct common_params_vocoder vocoder;
243
-
244
- std::string model = ""; // model path // NOLINT
245
- std::string model_alias = ""; // model alias // NOLINT
246
- std::string model_url = ""; // model url to download // NOLINT
247
- std::string hf_token = ""; // HF token // NOLINT
248
- std::string hf_repo = ""; // HF repo // NOLINT
249
- std::string hf_file = ""; // HF file // NOLINT
250
- std::string prompt = ""; // NOLINT
251
- std::string prompt_file = ""; // store the external prompt file name // NOLINT
252
- std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT
253
- std::string input_prefix = ""; // string to prefix user inputs with // NOLINT
254
- std::string input_suffix = ""; // string to suffix user inputs with // NOLINT
255
- std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
256
- std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
257
- std::string logits_file = ""; // file for saving *all* logits // NOLINT
258
- std::string rpc_servers = ""; // comma separated list of RPC servers // NOLINT
259
-
260
- std::vector<std::string> in_files; // all input files
261
- std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
262
- std::vector<llama_model_kv_override> kv_overrides;
263
-
264
- bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply)
265
- std::vector<common_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale
266
-
267
- std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale
268
-
269
- int32_t verbosity = 0;
270
- int32_t control_vector_layer_start = -1; // layer range for control vector
271
- int32_t control_vector_layer_end = -1; // layer range for control vector
272
-
273
- int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
274
- int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
275
- // (which is more convenient to use for plotting)
276
- //
277
- bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
278
- size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
279
-
280
- bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt
281
- size_t winogrande_tasks = 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
282
-
283
- bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
284
- size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
285
-
286
- bool kl_divergence = false; // compute KL divergence
287
-
288
- bool usage = false; // print usage
289
- bool use_color = false; // use color to distinguish generations and inputs
290
- bool special = false; // enable special token output
291
- bool interactive = false; // interactive mode
292
- bool interactive_first = false; // wait for user input immediately
293
- bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
294
- bool prompt_cache_all = false; // save user input and generations to prompt cache
295
- bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
296
-
297
- bool escape = true; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
298
- bool multiline_input = false; // reverse the usage of `\`
299
- bool simple_io = false; // improves compatibility with subprocesses and limited consoles
300
- bool cont_batching = true; // insert new sequences for decoding on-the-fly
301
- bool flash_attn = false; // flash attention
302
- bool no_perf = false; // disable performance metrics
303
- bool ctx_shift = true; // context shift on inifinite text generation
304
-
305
- bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
306
- bool logits_all = false; // return logits for all tokens in the batch
307
- bool use_mmap = true; // use mmap for faster loads
308
- bool use_mlock = false; // use mlock to keep model in memory
309
- bool verbose_prompt = false; // print prompt tokens before generation
310
- bool display_prompt = true; // print prompt before generation
311
- bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
312
- bool no_kv_offload = false; // disable KV offloading
313
- bool warmup = true; // warmup run
314
- bool check_tensors = false; // validate tensor data
315
-
316
- lm_ggml_type cache_type_k = LM_GGML_TYPE_F16; // KV cache data type for the K
317
- lm_ggml_type cache_type_v = LM_GGML_TYPE_F16; // KV cache data type for the V
318
-
319
- // multimodal models (see examples/llava)
320
- std::string mmproj = ""; // path to multimodal projector // NOLINT
321
- std::vector<std::string> image; // path to image file(s)
322
-
323
- // embedding
324
- bool embedding = false; // get only sentence embedding
325
- int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
326
- std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
327
- std::string embd_sep = "\n"; // separator of embeddings
328
- bool reranking = false; // enable reranking support on server
329
-
330
- // server params
331
- int32_t port = 8080; // server listens on this network port
332
- int32_t timeout_read = 600; // http read timeout in seconds
333
- int32_t timeout_write = timeout_read; // http write timeout in seconds
334
- int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
335
- int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
336
-
337
- std::string hostname = "127.0.0.1";
338
- std::string public_path = ""; // NOLINT
339
- std::string chat_template = ""; // NOLINT
340
- bool enable_chat_template = true;
341
-
342
- std::vector<std::string> api_keys;
343
-
344
- std::string ssl_file_key = ""; // NOLINT
345
- std::string ssl_file_cert = ""; // NOLINT
346
-
347
- // "advanced" endpoints are disabled by default for better security
348
- bool webui = true;
349
- bool endpoint_slots = false;
350
- bool endpoint_props = false; // only control POST requests, not GET
351
- bool endpoint_metrics = false;
352
-
353
- bool log_json = false;
354
-
355
- std::string slot_save_path;
356
-
357
- float slot_prompt_similarity = 0.5f;
358
-
359
- // batched-bench params
360
- bool is_pp_shared = false;
361
-
362
- std::vector<int32_t> n_pp;
363
- std::vector<int32_t> n_tg;
364
- std::vector<int32_t> n_pl;
365
-
366
- // retrieval params
367
- std::vector<std::string> context_files; // context files to embed
368
-
369
- int32_t chunk_size = 64; // chunk size for context embedding
370
-
371
- std::string chunk_separator = "\n"; // chunk separator for context embedding
372
-
373
- // passkey params
374
- int32_t n_junk = 250; // number of times to repeat the junk text
375
- int32_t i_pos = -1; // position of the passkey in the junk text
376
-
377
- // imatrix params
378
- std::string out_file = "imatrix.dat"; // save the resulting imatrix to this file
379
-
380
- int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations
381
- int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
382
- int32_t i_chunk = 0; // start processing from this chunk
383
-
384
- bool process_output = false; // collect data for the output tensor
385
- bool compute_ppl = true; // whether to compute perplexity
386
-
387
- // cvector-generator params
388
- int n_pca_batch = 100;
389
- int n_pca_iterations = 1000;
390
- dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
391
- std::string cvector_outfile = "control_vector.gguf";
392
- std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
393
- std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
394
-
395
- bool spm_infill = false; // suffix/prefix/middle pattern for infill
396
-
397
- std::string lora_outfile = "ggml-lora-merged-f16.gguf";
398
-
399
- // batched-bench params
400
- bool batched_bench_output_jsonl = false;
401
- };
402
-
403
- // call once at the start of a program if it uses libcommon
404
- // initializes the logging system and prints info about the build
405
- void common_init();
406
-
407
- std::string common_params_get_system_info(const common_params & params);
408
-
409
- bool parse_cpu_range(const std::string & range, bool(&boolmask)[LM_GGML_MAX_N_THREADS]);
410
- bool parse_cpu_mask(const std::string & mask, bool(&boolmask)[LM_GGML_MAX_N_THREADS]);
411
- void postprocess_cpu_params(cpu_params & cpuparams, const cpu_params * role_model = nullptr);
412
- bool set_process_priority(enum lm_ggml_sched_priority prio);
413
-
414
- //
415
- // String utils
416
- //
417
-
418
- #ifdef __GNUC__
419
- #ifdef __MINGW32__
420
- #define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
421
- #else
422
- #define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
423
- #endif
424
- #else
425
- #define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
426
- #endif
427
-
428
- LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
429
- std::string string_format(const char * fmt, ...);
430
-
431
- std::string string_strip(const std::string & str);
432
- std::string string_get_sortable_timestamp();
433
-
434
- void string_replace_all(std::string & s, const std::string & search, const std::string & replace);
435
-
436
- template<class T>
437
- static std::vector<T> string_split(const std::string & str, char delim) {
438
- static_assert(!std::is_same<T, std::string>::value, "Please use the specialized version for std::string");
439
- std::vector<T> values;
440
- std::istringstream str_stream(str);
441
- std::string token;
442
- while (std::getline(str_stream, token, delim)) {
443
- T value;
444
- std::istringstream token_stream(token);
445
- token_stream >> value;
446
- values.push_back(value);
447
- }
448
- return values;
449
- }
450
-
451
- template<>
452
- std::vector<std::string> string_split<std::string>(const std::string & input, char separator)
453
- {
454
- std::vector<std::string> parts;
455
- size_t begin_pos = 0;
456
- size_t separator_pos = input.find(separator);
457
- while (separator_pos != std::string::npos) {
458
- std::string part = input.substr(begin_pos, separator_pos - begin_pos);
459
- parts.emplace_back(part);
460
- begin_pos = separator_pos + 1;
461
- separator_pos = input.find(separator, begin_pos);
462
- }
463
- parts.emplace_back(input.substr(begin_pos, separator_pos - begin_pos));
464
- return parts;
465
- }
466
-
467
- static bool string_starts_with(const std::string & str,
468
- const std::string & prefix) { // While we wait for C++20's std::string::starts_with...
469
- return str.rfind(prefix, 0) == 0;
470
- }
471
-
472
- bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
473
- void string_process_escapes(std::string & input);
474
-
475
- std::string string_from(bool value);
476
- std::string string_from(const std::vector<int> & values);
477
- std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens);
478
- std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch);
479
-
480
- //
481
- // Filesystem utils
482
- //
483
-
484
- bool fs_validate_filename(const std::string & filename);
485
- bool fs_create_directory_with_parents(const std::string & path);
486
-
487
- std::string fs_get_cache_directory();
488
- std::string fs_get_cache_file(const std::string & filename);
489
-
490
- //
491
- // Model utils
492
- //
493
-
494
- // note: defines object's lifetime
495
- struct common_init_result {
496
- llama_model_ptr model;
497
- llama_context_ptr context;
498
-
499
- std::vector<llama_lora_adapter_ptr> lora;
500
- };
501
-
502
- struct common_init_result common_init_from_params(common_params & params);
503
-
504
- struct llama_model_params common_model_params_to_llama ( common_params & params);
505
- struct llama_context_params common_context_params_to_llama(const common_params & params);
506
- struct lm_ggml_threadpool_params lm_ggml_threadpool_params_from_cpu_params(const cpu_params & params);
507
-
508
- struct llama_model * common_load_model_from_url(
509
- const std::string & model_url,
510
- const std::string & local_path,
511
- const std::string & hf_token,
512
- const struct llama_model_params & params);
513
- struct llama_model * common_load_model_from_hf(
514
- const std::string & repo,
515
- const std::string & remote_path,
516
- const std::string & local_path,
517
- const std::string & hf_token,
518
- const struct llama_model_params & params);
519
-
520
- // clear LoRA adapters from context, then apply new list of adapters
521
- void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_info> & lora);
522
-
523
- //
524
- // Batch utils
525
- //
526
-
527
- void common_batch_clear(struct llama_batch & batch);
528
-
529
- void common_batch_add(
530
- struct llama_batch & batch,
531
- llama_token id,
532
- llama_pos pos,
533
- const std::vector<llama_seq_id> & seq_ids,
534
- bool logits);
535
-
536
- //
537
- // Token utils
538
- //
539
-
540
- // longest common prefix
541
- size_t common_lcp(const llama_tokens & a, const llama_tokens & b);
542
-
543
- // longet common subsequence
544
- size_t common_lcs(const llama_tokens & a, const llama_tokens & b);
545
-
546
- //
547
- // Vocab utils
548
- //
549
-
550
- // tokenizes a string into a vector of tokens
551
- // should work similar to Python's `tokenizer.encode`
552
- std::vector<llama_token> common_tokenize(
553
- const struct llama_context * ctx,
554
- const std::string & text,
555
- bool add_special,
556
- bool parse_special = false);
557
-
558
- std::vector<llama_token> common_tokenize(
559
- const struct llama_model * model,
560
- const std::string & text,
561
- bool add_special,
562
- bool parse_special = false);
563
-
564
- // tokenizes a token into a piece, optionally renders special/control tokens
565
- // should work similar to Python's `tokenizer.id_to_piece`
566
- std::string common_token_to_piece(
567
- const struct llama_context * ctx,
568
- llama_token token,
569
- bool special = true);
570
-
571
- // detokenizes a vector of tokens into a string
572
- // should work similar to Python's `tokenizer.decode`
573
- // optionally renders special/control tokens
574
- std::string common_detokenize(
575
- llama_context * ctx,
576
- const std::vector<llama_token> & tokens,
577
- bool special = true);
578
-
579
- //
580
- // Chat template utils
581
- //
582
-
583
- // same with llama_chat_message, but uses std::string
584
- struct common_chat_msg {
585
- std::string role;
586
- std::string content;
587
- };
588
-
589
- // Get the built-in chat template for the model. Return empty string if not present.
590
- std::string common_get_builtin_chat_template(const struct llama_model * model);
591
-
592
- // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
593
- bool common_chat_verify_template(const std::string & tmpl);
594
-
595
- // CPP wrapper for llama_chat_apply_template
596
- // If the built-in template is not supported, we default to chatml
597
- // If the custom "tmpl" is not supported, we throw an error
598
- std::string common_chat_apply_template(const struct llama_model * model,
599
- const std::string & tmpl,
600
- const std::vector<common_chat_msg> & chat,
601
- bool add_ass);
602
-
603
- // Format single message, while taking into account the position of that message in chat history
604
- std::string common_chat_format_single(const struct llama_model * model,
605
- const std::string & tmpl,
606
- const std::vector<common_chat_msg> & past_msg,
607
- const common_chat_msg & new_msg,
608
- bool add_ass);
609
-
610
- // Returns an example of formatted chat
611
- std::string common_chat_format_example(const struct llama_model * model,
612
- const std::string & tmpl);
613
-
614
- //
615
- // KV cache utils
616
- //
617
-
618
- // Dump the KV cache view with the number of sequences per cell.
619
- void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
620
-
621
- // Dump the KV cache view showing individual sequences in each cell (long output).
622
- void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
623
-
624
- //
625
- // Embedding utils
626
- //
627
-
628
- // TODO: repace embd_norm with an enum
629
- void common_embd_normalize(const float * inp, float * out, int n, int embd_norm);
630
-
631
- float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);
632
-
633
- //
634
- // Control vector utils
635
- //
636
-
637
- struct common_control_vector_data {
638
- int n_embd;
639
-
640
- // stores data for layers [1, n_layer] where n_layer = data.size() / n_embd
641
- std::vector<float> data;
642
- };
643
-
644
- struct common_control_vector_load_info {
645
- float strength;
646
-
647
- std::string fname;
648
- };
649
-
650
- // Load control vectors, scale each by strength, and add them together.
651
- // On error, returns {-1, empty}
652
- common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos);
653
-
654
- //
655
- // Split utils
656
- //
657
-
658
- namespace {
659
-
660
- const char * const LLM_KV_SPLIT_NO = "split.no";
661
- const char * const LLM_KV_SPLIT_COUNT = "split.count";
662
- const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
663
-
664
- }
1
+ // Various helper functions and utilities
2
+
3
+ #pragma once
4
+
5
+ #include "llama-cpp.h"
6
+ #include "llama-cpp.h"
7
+
8
+ #include <string>
9
+ #include <vector>
10
+ #include <sstream>
11
+
12
+ #ifdef _WIN32
13
+ #define DIRECTORY_SEPARATOR '\\'
14
+ #else
15
+ #define DIRECTORY_SEPARATOR '/'
16
+ #endif // _WIN32
17
+
18
+ #define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
19
+ #define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
20
+
21
+ #define print_build_info() do { \
22
+ fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
23
+ fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
24
+ } while(0)
25
+
26
+ #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
27
+
28
+ struct common_lora_adapter_info {
29
+ std::string path;
30
+ float scale;
31
+
32
+ struct llama_lora_adapter * ptr;
33
+ };
34
+
35
+ using llama_tokens = std::vector<llama_token>;
36
+
37
+ // build info
38
+ extern int LLAMA_BUILD_NUMBER;
39
+ extern const char * LLAMA_COMMIT;
40
+ extern const char * LLAMA_COMPILER;
41
+ extern const char * LLAMA_BUILD_TARGET;
42
+
43
+ struct common_control_vector_load_info;
44
+
45
+ #define print_build_info() do { \
46
+ fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
47
+ fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
48
+ } while(0)
49
+
50
+ // build info
51
+ extern int LLAMA_BUILD_NUMBER;
52
+ extern char const *LLAMA_COMMIT;
53
+ extern char const *LLAMA_COMPILER;
54
+ extern char const *LLAMA_BUILD_TARGET;
55
+
56
+ //
57
+ // CPU utils
58
+ //
59
+
60
+ struct cpu_params { // CPU threading settings: thread count, affinity mask, scheduling priority and polling level
61
+ int n_threads = -1; // defaults to -1; NOTE(review): presumably means "choose automatically" — confirm in postprocess_cpu_params
62
+ bool cpumask[LM_GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
63
+ bool mask_valid = false; // Default: any CPU
64
+ enum lm_ggml_sched_priority priority = LM_GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
65
+ bool strict_cpu = false; // Use strict CPU placement
66
+ uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
67
+ };
68
+
69
+ int32_t cpu_get_num_physical_cores();
70
+ int32_t cpu_get_num_math();
71
+
72
+ //
73
+ // Common params
74
+ //
75
+
76
+ enum llama_example { // identifiers for the example programs; values are implicit and start at 0
77
+ LLAMA_EXAMPLE_COMMON,
78
+ LLAMA_EXAMPLE_SPECULATIVE,
79
+ LLAMA_EXAMPLE_MAIN,
80
+ LLAMA_EXAMPLE_INFILL,
81
+ LLAMA_EXAMPLE_EMBEDDING,
82
+ LLAMA_EXAMPLE_PERPLEXITY,
83
+ LLAMA_EXAMPLE_RETRIEVAL,
84
+ LLAMA_EXAMPLE_PASSKEY,
85
+ LLAMA_EXAMPLE_IMATRIX,
86
+ LLAMA_EXAMPLE_BENCH,
87
+ LLAMA_EXAMPLE_SERVER,
88
+ LLAMA_EXAMPLE_CVECTOR_GENERATOR,
89
+ LLAMA_EXAMPLE_EXPORT_LORA,
90
+ LLAMA_EXAMPLE_LLAVA,
91
+ LLAMA_EXAMPLE_LOOKUP,
92
+ LLAMA_EXAMPLE_PARALLEL,
93
+ LLAMA_EXAMPLE_TTS,
94
+
95
+ LLAMA_EXAMPLE_COUNT, // last enumerator: its value equals the number of entries above, so new entries belong before it
96
+ };
97
+
98
+ enum common_sampler_type { // sampler identifiers with explicit numeric values
99
+ COMMON_SAMPLER_TYPE_NONE = 0,
100
+ COMMON_SAMPLER_TYPE_DRY = 1,
101
+ COMMON_SAMPLER_TYPE_TOP_K = 2,
102
+ COMMON_SAMPLER_TYPE_TOP_P = 3,
103
+ COMMON_SAMPLER_TYPE_MIN_P = 4,
104
+ //COMMON_SAMPLER_TYPE_TFS_Z = 5, (disabled entry; value 5 is left unused so the later values keep their numbers)
105
+ COMMON_SAMPLER_TYPE_TYPICAL_P = 6,
106
+ COMMON_SAMPLER_TYPE_TEMPERATURE = 7,
107
+ COMMON_SAMPLER_TYPE_XTC = 8,
108
+ COMMON_SAMPLER_TYPE_INFILL = 9,
109
+ COMMON_SAMPLER_TYPE_PENALTIES = 10,
110
+ };
111
+
112
+ // dimensionality reduction methods, used by cvector-generator
113
+ enum dimre_method {
114
+ DIMRE_METHOD_PCA, // NOTE(review): presumably principal component analysis — confirm against cvector-generator
115
+ DIMRE_METHOD_MEAN, // NOTE(review): presumably a plain mean — confirm against cvector-generator
116
+ };
117
+
118
+ // sampling parameters (every member carries a default, so a default-constructed object is fully initialized)
118
+ struct common_params_sampling {
119
+ uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
120
+
121
+ int32_t n_prev = 64; // number of previous tokens to remember
122
+ int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
123
+ int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
124
+ int32_t top_k = 40; // <= 0 to use vocab size
125
+ float top_p = 0.95f; // 1.0 = disabled
126
+ float min_p = 0.05f; // 0.0 = disabled
127
+ float xtc_probability = 0.00f; // 0.0 = disabled
128
+ float xtc_threshold = 0.10f; // > 0.5 disables XTC
129
+ float typ_p = 1.00f; // typical_p, 1.0 = disabled
130
+ float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
131
+ float dynatemp_range = 0.00f; // 0.0 = disabled
132
+ float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
133
+ int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
134
+ float penalty_repeat = 1.00f; // 1.0 = disabled
135
+ float penalty_freq = 0.00f; // 0.0 = disabled
136
+ float penalty_present = 0.00f; // 0.0 = disabled
137
+ float dry_multiplier = 0.0f; // 0.0 = disabled; DRY repetition penalty for tokens extending repetition:
138
+ float dry_base = 1.75f; // 0.0 = disabled; multiplier * base ^ (length of sequence before token - allowed length)
139
+ int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty
140
+ int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
141
+ int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
142
+ float mirostat_tau = 5.00f; // target entropy
143
+ float mirostat_eta = 0.10f; // learning rate
144
+ bool ignore_eos = false;
145
+ bool no_perf = false; // disable performance metrics
146
+ bool timing_per_token = false;
147
+
148
+ std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY
149
+
150
+ // default contents of the sampler list, in the order written below
151
+ std::vector<enum common_sampler_type> samplers = {
152
+ COMMON_SAMPLER_TYPE_PENALTIES,
153
+ COMMON_SAMPLER_TYPE_DRY,
154
+ COMMON_SAMPLER_TYPE_TOP_K,
155
+ COMMON_SAMPLER_TYPE_TYPICAL_P,
156
+ COMMON_SAMPLER_TYPE_TOP_P,
157
+ COMMON_SAMPLER_TYPE_MIN_P,
158
+ COMMON_SAMPLER_TYPE_XTC,
159
+ COMMON_SAMPLER_TYPE_TEMPERATURE,
160
+ };
161
+
162
+ std::string grammar; // optional BNF-like grammar to constrain sampling
163
+
164
+ std::vector<llama_logit_bias> logit_bias; // logit biases to apply
165
+
166
+ // print the parameters into a string
167
+ std::string print() const;
168
+ };
170
+
171
+ struct common_params_speculative { // settings for speculative decoding with a draft model
171
+ std::vector<lm_ggml_backend_dev_t> devices; // devices to use for offloading
172
+
173
+ int32_t n_ctx = 0; // draft context size
174
+ int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
175
+ int32_t n_min = 5; // minimum number of draft tokens to use for speculative decoding
176
+ int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
177
+ float p_split = 0.1f; // speculative decoding split probability
178
+ float p_min = 0.9f; // minimum speculative decoding probability (greedy)
179
+
180
+ struct cpu_params cpuparams;
181
+ struct cpu_params cpuparams_batch;
182
+
183
+ std::string model = ""; // draft model for speculative decoding // NOLINT
184
+ };
186
+
187
+ struct common_params_vocoder { // where to find the vocoder model: HF repo/file, local path or download URL (all empty by default)
187
+ std::string hf_repo = ""; // HF repo // NOLINT
188
+ std::string hf_file = ""; // HF file // NOLINT
189
+
190
+ std::string model = ""; // model path // NOLINT
191
+ std::string model_url = ""; // model url to download // NOLINT
192
+ };
194
+
195
+ struct common_params { // aggregate of all settings shared by the tools: model/context sizing, offload, sampling, I/O paths and per-example options
196
+
197
+ void * progress_callback_user_data = nullptr;
198
+ llama_progress_callback progress_callback = nullptr;
199
+ bool vocab_only = false;
200
+ int32_t n_predict = -1; // new tokens to predict
201
+ int32_t n_ctx = 4096; // context size
202
+ int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
203
+ int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
204
+ int32_t n_keep = 0; // number of tokens to keep from initial prompt
205
+ int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
206
+ int32_t n_parallel = 1; // number of parallel sequences to decode
207
+ int32_t n_sequences = 1; // number of sequences to decode
208
+ int32_t grp_attn_n = 1; // group-attention factor
209
+ int32_t grp_attn_w = 512; // group-attention width
210
+ int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
211
+ float rope_freq_base = 0.0f; // RoPE base frequency
212
+ float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
213
+ float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
214
+ float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor
215
+ float yarn_beta_fast = 32.0f; // YaRN low correction dim
216
+ float yarn_beta_slow = 1.0f; // YaRN high correction dim
217
+ int32_t yarn_orig_ctx = 0; // YaRN original context length
218
+ float defrag_thold = 0.1f; // KV cache defragmentation threshold
219
+
220
+ // offload params
221
+ std::vector<lm_ggml_backend_dev_t> devices; // devices to use for offloading
222
+
223
+ int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
224
+ int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
225
+ float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
226
+
227
+ enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
228
+
229
+ struct cpu_params cpuparams;
230
+ struct cpu_params cpuparams_batch;
231
+
232
+ lm_ggml_backend_sched_eval_callback cb_eval = nullptr;
233
+ void * cb_eval_user_data = nullptr;
234
+
235
+ lm_ggml_numa_strategy numa = LM_GGML_NUMA_STRATEGY_DISABLED;
236
+
237
+ enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
238
+ enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
239
+ enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
240
+
241
+ struct common_params_sampling sampling;
242
+ struct common_params_speculative speculative;
243
+ struct common_params_vocoder vocoder;
244
+
245
+ std::string model = ""; // model path // NOLINT
246
+ std::string model_alias = ""; // model alias // NOLINT
247
+ std::string model_url = ""; // model url to download // NOLINT
248
+ std::string hf_token = ""; // HF token // NOLINT
249
+ std::string hf_repo = ""; // HF repo // NOLINT
250
+ std::string hf_file = ""; // HF file // NOLINT
251
+ std::string prompt = ""; // NOLINT
252
+ std::string prompt_file = ""; // store the external prompt file name // NOLINT
253
+ std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT
254
+ std::string input_prefix = ""; // string to prefix user inputs with // NOLINT
255
+ std::string input_suffix = ""; // string to suffix user inputs with // NOLINT
256
+ std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
257
+ std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
258
+ std::string logits_file = ""; // file for saving *all* logits // NOLINT
259
+ std::string rpc_servers = ""; // comma separated list of RPC servers // NOLINT
260
+
261
+ std::vector<std::string> in_files; // all input files
262
+ std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
263
+ std::vector<llama_model_kv_override> kv_overrides;
264
+
265
+ bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply)
266
+ std::vector<common_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale
267
+
268
+ std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale
269
+
270
+ int32_t verbosity = 0;
271
+ int32_t control_vector_layer_start = -1; // layer range for control vector
272
+ int32_t control_vector_layer_end = -1; // layer range for control vector
273
+
274
+ int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
275
+ int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
276
+ // (which is more convenient to use for plotting)
277
+ //
278
+ bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
279
+ size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
280
+
281
+ bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt
282
+ size_t winogrande_tasks = 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
283
+
284
+ bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
285
+ size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
286
+
287
+ bool kl_divergence = false; // compute KL divergence
288
+
289
+ bool usage = false; // print usage
290
+ bool use_color = false; // use color to distinguish generations and inputs
291
+ bool special = false; // enable special token output
292
+ bool interactive = false; // interactive mode
293
+ bool interactive_first = false; // wait for user input immediately
294
+ bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
295
+ bool prompt_cache_all = false; // save user input and generations to prompt cache
296
+ bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
297
+
298
+ bool escape = true; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
299
+ bool multiline_input = false; // reverse the usage of `\`
300
+ bool simple_io = false; // improves compatibility with subprocesses and limited consoles
301
+ bool cont_batching = true; // insert new sequences for decoding on-the-fly
302
+ bool flash_attn = false; // flash attention
303
+ bool no_perf = false; // disable performance metrics
304
+ bool ctx_shift = true; // context shift on infinite text generation
305
+
306
+ bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
307
+ bool logits_all = false; // return logits for all tokens in the batch
308
+ bool use_mmap = true; // use mmap for faster loads
309
+ bool use_mlock = false; // use mlock to keep model in memory
310
+ bool verbose_prompt = false; // print prompt tokens before generation
311
+ bool display_prompt = true; // print prompt before generation
312
+ bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
313
+ bool no_kv_offload = false; // disable KV offloading
314
+ bool warmup = true; // warmup run
315
+ bool check_tensors = false; // validate tensor data
316
+
317
+ lm_ggml_type cache_type_k = LM_GGML_TYPE_F16; // KV cache data type for the K
318
+ lm_ggml_type cache_type_v = LM_GGML_TYPE_F16; // KV cache data type for the V
319
+
320
+ // multimodal models (see examples/llava)
321
+ std::string mmproj = ""; // path to multimodal projector // NOLINT
322
+ std::vector<std::string> image; // path to image file(s)
323
+
324
+ // embedding
325
+ bool embedding = false; // get only sentence embedding
326
+ int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
327
+ std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
328
+ std::string embd_sep = "\n"; // separator of embeddings
329
+ bool reranking = false; // enable reranking support on server
330
+
331
+ // server params
332
+ int32_t port = 8080; // server listens on this network port
333
+ int32_t timeout_read = 600; // http read timeout in seconds
334
+ int32_t timeout_write = timeout_read; // http write timeout in seconds (default copies timeout_read)
335
+ int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
336
+ int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
337
+
338
+ std::string hostname = "127.0.0.1";
339
+ std::string public_path = ""; // NOLINT
340
+ std::string chat_template = ""; // NOLINT
341
+ bool enable_chat_template = true;
342
+
343
+ std::vector<std::string> api_keys;
344
+
345
+ std::string ssl_file_key = ""; // NOLINT
346
+ std::string ssl_file_cert = ""; // NOLINT
347
+
348
+ // "advanced" endpoints are disabled by default for better security
349
+ bool webui = true;
350
+ bool endpoint_slots = false;
351
+ bool endpoint_props = false; // only control POST requests, not GET
352
+ bool endpoint_metrics = false;
353
+
354
+ bool log_json = false;
355
+
356
+ std::string slot_save_path;
357
+
358
+ float slot_prompt_similarity = 0.5f;
359
+
360
+ // batched-bench params
361
+ bool is_pp_shared = false;
362
+
363
+ std::vector<int32_t> n_pp;
364
+ std::vector<int32_t> n_tg;
365
+ std::vector<int32_t> n_pl;
366
+
367
+ // retrieval params
368
+ std::vector<std::string> context_files; // context files to embed
369
+
370
+ int32_t chunk_size = 64; // chunk size for context embedding
371
+
372
+ std::string chunk_separator = "\n"; // chunk separator for context embedding
373
+
374
+ // passkey params
375
+ int32_t n_junk = 250; // number of times to repeat the junk text
376
+ int32_t i_pos = -1; // position of the passkey in the junk text
377
+
378
+ // imatrix params
379
+ std::string out_file = "imatrix.dat"; // save the resulting imatrix to this file
380
+
381
+ int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations
382
+ int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
383
+ int32_t i_chunk = 0; // start processing from this chunk
384
+
385
+ bool process_output = false; // collect data for the output tensor
386
+ bool compute_ppl = true; // whether to compute perplexity
387
+
388
+ // cvector-generator params
389
+ int n_pca_batch = 100;
390
+ int n_pca_iterations = 1000;
391
+ dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
392
+ std::string cvector_outfile = "control_vector.gguf";
393
+ std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
394
+ std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
395
+
396
+ bool spm_infill = false; // suffix/prefix/middle pattern for infill
397
+
398
+ std::string lora_outfile = "ggml-lora-merged-f16.gguf";
399
+
400
+ // batched-bench params
401
+ bool batched_bench_output_jsonl = false;
402
+ };
403
+
404
+ // call once at the start of a program if it uses libcommon
405
+ // initializes the logging system and prints info about the build
406
+ void common_init();
407
+
408
+ std::string common_params_get_system_info(const common_params & params);
409
+
410
+ bool parse_cpu_range(const std::string & range, bool(&boolmask)[LM_GGML_MAX_N_THREADS]);
411
+ bool parse_cpu_mask(const std::string & mask, bool(&boolmask)[LM_GGML_MAX_N_THREADS]);
412
+ void postprocess_cpu_params(cpu_params & cpuparams, const cpu_params * role_model = nullptr);
413
+ bool set_process_priority(enum lm_ggml_sched_priority prio);
414
+
415
+ //
416
+ // String utils
417
+ //
418
+
419
+ #ifdef __GNUC__ // LLAMA_COMMON_ATTRIBUTE_FORMAT(fmt_idx, first_arg_idx): expands to a format attribute when __GNUC__ is defined, to nothing otherwise
420
+ #ifdef __MINGW32__
421
+ #define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) // MinGW: request the gnu_printf archetype explicitly
422
+ #else
423
+ #define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
424
+ #endif
425
+ #else
426
+ #define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
427
+ #endif
428
+
429
+ LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
430
+ std::string string_format(const char * fmt, ...);
431
+
432
+ std::string string_strip(const std::string & str);
433
+ std::string string_get_sortable_timestamp();
434
+
435
+ void string_replace_all(std::string & s, const std::string & search, const std::string & replace);
436
+
437
// Splits `str` at every `delim` and converts each piece to T via operator>>.
//
// - Pieces are produced by std::getline, so a trailing delimiter does not
//   yield an extra (empty) piece and an empty `str` yields an empty vector.
// - A piece that is empty or cannot be parsed contributes a value-initialized
//   T (0 for arithmetic types) instead of an indeterminate value.
// - T = std::string is rejected here; use the specialization below.
template<class T>
static std::vector<T> string_split(const std::string & str, char delim) {
    static_assert(!std::is_same<T, std::string>::value, "Please use the specialized version for std::string");
    std::vector<T> values;
    std::istringstream str_stream(str);
    std::string token;
    while (std::getline(str_stream, token, delim)) {
        // Value-initialize: when the token is empty the stream sentry fails and
        // operator>> never writes to `value`, so `T value;` would be pushed
        // uninitialized.
        T value{};
        std::istringstream token_stream(token);
        token_stream >> value;
        values.push_back(value);
    }
    return values;
}

// Specialization for std::string: plain substring split, no parsing.
//
// Unlike the generic version, empty pieces are kept (also a trailing one), so
// the result always has exactly (number of separators + 1) elements; an empty
// `input` yields a single empty string.
template<>
std::vector<std::string> string_split<std::string>(const std::string & input, char separator)
{
    std::vector<std::string> parts;
    size_t begin_pos = 0;
    size_t separator_pos = input.find(separator);
    while (separator_pos != std::string::npos) {
        // push the temporary directly so it is moved rather than copied
        parts.push_back(input.substr(begin_pos, separator_pos - begin_pos));
        begin_pos = separator_pos + 1;
        separator_pos = input.find(separator, begin_pos);
    }
    // remainder after the last separator (the whole input if none was found)
    parts.push_back(input.substr(begin_pos));
    return parts;
}
467
+
468
// Reports whether `str` begins with `prefix` (an empty prefix always matches).
// Stand-in until C++20's std::string::starts_with can be used.
static bool string_starts_with(const std::string & str,
                               const std::string & prefix) {
    // compare() clamps the requested length to what str actually holds, so a
    // prefix longer than str simply compares unequal.
    const int head_vs_prefix = str.compare(0, prefix.size(), prefix);
    return head_vs_prefix == 0;
}
472
+
473
+ bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
474
+ void string_process_escapes(std::string & input);
475
+
476
+ std::string string_from(bool value);
477
+ std::string string_from(const std::vector<int> & values);
478
+ std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens);
479
+ std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch);
480
+
481
+ //
482
+ // Filesystem utils
483
+ //
484
+
485
+ bool fs_validate_filename(const std::string & filename);
486
+ bool fs_create_directory_with_parents(const std::string & path);
487
+
488
+ std::string fs_get_cache_directory();
489
+ std::string fs_get_cache_file(const std::string & filename);
490
+
491
+ //
492
+ // Model utils
493
+ //
494
+
495
+ // note: defines object's lifetime
496
+ struct common_init_result { // bundle returned by common_init_from_params(): model, context and the list of LoRA adapters
497
+ llama_model_ptr model; // NOTE(review): the *_ptr types are presumably owning smart pointers from llama-cpp.h — confirm
498
+ llama_context_ptr context;
499
+
500
+ std::vector<llama_lora_adapter_ptr> lora;
501
+ };
502
+
503
+ struct common_init_result common_init_from_params(common_params & params);
504
+
505
+ struct llama_model_params common_model_params_to_llama ( common_params & params);
506
+ struct llama_context_params common_context_params_to_llama(const common_params & params);
507
+ struct lm_ggml_threadpool_params lm_ggml_threadpool_params_from_cpu_params(const cpu_params & params);
508
+
509
+ struct llama_model * common_load_model_from_url(
510
+ const std::string & model_url,
511
+ const std::string & local_path,
512
+ const std::string & hf_token,
513
+ const struct llama_model_params & params);
514
+ struct llama_model * common_load_model_from_hf(
515
+ const std::string & repo,
516
+ const std::string & remote_path,
517
+ const std::string & local_path,
518
+ const std::string & hf_token,
519
+ const struct llama_model_params & params);
520
+
521
+ // clear LoRA adapters from context, then apply new list of adapters
522
+ void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_info> & lora);
523
+
524
+ //
525
+ // Batch utils
526
+ //
527
+
528
+ void common_batch_clear(struct llama_batch & batch);
529
+
530
+ void common_batch_add(
531
+ struct llama_batch & batch,
532
+ llama_token id,
533
+ llama_pos pos,
534
+ const std::vector<llama_seq_id> & seq_ids,
535
+ bool logits);
536
+
537
+ //
538
+ // Token utils
539
+ //
540
+
541
+ // longest common prefix
542
+ size_t common_lcp(const llama_tokens & a, const llama_tokens & b);
543
+
544
+ // longest common subsequence
545
+ size_t common_lcs(const llama_tokens & a, const llama_tokens & b);
546
+
547
+ //
548
+ // Vocab utils
549
+ //
550
+
551
+ // tokenizes a string into a vector of tokens
552
+ // should work similar to Python's `tokenizer.encode`
553
+ std::vector<llama_token> common_tokenize(
554
+ const struct llama_context * ctx,
555
+ const std::string & text,
556
+ bool add_special,
557
+ bool parse_special = false);
558
+
559
+ std::vector<llama_token> common_tokenize(
560
+ const struct llama_model * model,
561
+ const std::string & text,
562
+ bool add_special,
563
+ bool parse_special = false);
564
+
565
+ // tokenizes a token into a piece, optionally renders special/control tokens
566
+ // should work similar to Python's `tokenizer.id_to_piece`
567
+ std::string common_token_to_piece(
568
+ const struct llama_context * ctx,
569
+ llama_token token,
570
+ bool special = true);
571
+
572
+ // detokenizes a vector of tokens into a string
573
+ // should work similar to Python's `tokenizer.decode`
574
+ // optionally renders special/control tokens
575
+ std::string common_detokenize(
576
+ llama_context * ctx,
577
+ const std::vector<llama_token> & tokens,
578
+ bool special = true);
579
+
580
+ //
581
+ // Chat template utils
582
+ //
583
+
584
+ // same as llama_chat_message, but uses std::string
585
+ struct common_chat_msg {
586
+ std::string role;
587
+ std::string content;
588
+ };
589
+
590
+ // Get the built-in chat template for the model. Return empty string if not present.
591
+ std::string common_get_builtin_chat_template(const struct llama_model * model);
592
+
593
+ // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
594
+ bool common_chat_verify_template(const std::string & tmpl);
595
+
596
+ // CPP wrapper for llama_chat_apply_template
597
+ // If the built-in template is not supported, we default to chatml
598
+ // If the custom "tmpl" is not supported, we throw an error
599
+ std::string common_chat_apply_template(const struct llama_model * model,
600
+ const std::string & tmpl,
601
+ const std::vector<common_chat_msg> & chat,
602
+ bool add_ass);
603
+
604
+ // Format single message, while taking into account the position of that message in chat history
605
+ std::string common_chat_format_single(const struct llama_model * model,
606
+ const std::string & tmpl,
607
+ const std::vector<common_chat_msg> & past_msg,
608
+ const common_chat_msg & new_msg,
609
+ bool add_ass);
610
+
611
+ // Returns an example of formatted chat
612
+ std::string common_chat_format_example(const struct llama_model * model,
613
+ const std::string & tmpl);
614
+
615
+ //
616
+ // KV cache utils
617
+ //
618
+
619
+ // Dump the KV cache view with the number of sequences per cell.
620
+ void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
621
+
622
+ // Dump the KV cache view showing individual sequences in each cell (long output).
623
+ void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
624
+
625
+ //
626
+ // Embedding utils
627
+ //
628
+
629
+ // TODO: replace embd_norm with an enum
630
+ void common_embd_normalize(const float * inp, float * out, int n, int embd_norm);
631
+
632
+ float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);
633
+
634
+ //
635
+ // Control vector utils
636
+ //
637
+
638
+ struct common_control_vector_data { // result of common_control_vector_load(): embedding width plus the flattened per-layer values
639
+ int n_embd; // no default initializer; common_control_vector_load() is documented to return -1 here on error
640
+
641
+ // stores data for layers [1, n_layer] where n_layer = data.size() / n_embd
642
+ std::vector<float> data;
643
+ };
644
+
645
+ struct common_control_vector_load_info { // one control vector to load: scaling strength and file name
646
+ float strength; // scale applied to this vector by common_control_vector_load(); no default initializer
647
+
648
+ std::string fname; // NOTE(review): presumably the path of the control vector file — confirm in common.cpp
649
+ };
650
+
651
+ // Load control vectors, scale each by strength, and add them together.
652
+ // On error, returns {-1, empty}
653
+ common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos);
654
+
655
+ //
656
+ // Split utils
657
+ //
658
+
659
+ namespace { // NOTE(review): unnamed namespace inside a header (#pragma once above) — each translation unit that includes this file gets its own copy of these constants
660
+
661
+ const char * const LLM_KV_SPLIT_NO = "split.no"; // key strings for split metadata
662
+ const char * const LLM_KV_SPLIT_COUNT = "split.count";
663
+ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
664
+
665
+ }