cui-llama.rn 1.2.6 → 1.3.0

This diff compares the contents of two publicly released versions of the package as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (70)
  1. package/README.md +3 -2
  2. package/android/src/main/CMakeLists.txt +20 -5
  3. package/android/src/main/java/com/rnllama/LlamaContext.java +115 -27
  4. package/android/src/main/java/com/rnllama/RNLlama.java +40 -7
  5. package/android/src/main/jni.cpp +222 -34
  6. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +9 -4
  7. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +9 -4
  8. package/cpp/common.cpp +1682 -2114
  9. package/cpp/common.h +600 -613
  10. package/cpp/ggml-aarch64.c +129 -3478
  11. package/cpp/ggml-aarch64.h +19 -39
  12. package/cpp/ggml-alloc.c +1040 -1040
  13. package/cpp/ggml-alloc.h +76 -76
  14. package/cpp/ggml-backend-impl.h +216 -216
  15. package/cpp/ggml-backend-reg.cpp +195 -0
  16. package/cpp/ggml-backend.cpp +1997 -2661
  17. package/cpp/ggml-backend.h +328 -314
  18. package/cpp/ggml-common.h +1853 -1853
  19. package/cpp/ggml-cpp.h +38 -38
  20. package/cpp/ggml-cpu-aarch64.c +3560 -0
  21. package/cpp/ggml-cpu-aarch64.h +30 -0
  22. package/cpp/ggml-cpu-impl.h +371 -614
  23. package/cpp/ggml-cpu-quants.c +10822 -0
  24. package/cpp/ggml-cpu-quants.h +63 -0
  25. package/cpp/ggml-cpu.c +13975 -13720
  26. package/cpp/ggml-cpu.cpp +663 -0
  27. package/cpp/ggml-cpu.h +177 -150
  28. package/cpp/ggml-impl.h +550 -296
  29. package/cpp/ggml-metal.h +66 -66
  30. package/cpp/ggml-metal.m +4294 -3933
  31. package/cpp/ggml-quants.c +5247 -15739
  32. package/cpp/ggml-quants.h +100 -147
  33. package/cpp/ggml-threading.cpp +12 -0
  34. package/cpp/ggml-threading.h +12 -0
  35. package/cpp/ggml.c +8180 -8390
  36. package/cpp/ggml.h +2411 -2441
  37. package/cpp/llama-grammar.cpp +1138 -1138
  38. package/cpp/llama-grammar.h +144 -144
  39. package/cpp/llama-impl.h +181 -181
  40. package/cpp/llama-sampling.cpp +2348 -2345
  41. package/cpp/llama-sampling.h +48 -48
  42. package/cpp/llama-vocab.cpp +1984 -1984
  43. package/cpp/llama-vocab.h +170 -170
  44. package/cpp/llama.cpp +22132 -22046
  45. package/cpp/llama.h +1253 -1255
  46. package/cpp/log.cpp +401 -401
  47. package/cpp/log.h +121 -121
  48. package/cpp/rn-llama.hpp +83 -19
  49. package/cpp/sampling.cpp +466 -466
  50. package/cpp/sgemm.cpp +1884 -1276
  51. package/ios/RNLlama.mm +43 -20
  52. package/ios/RNLlamaContext.h +9 -3
  53. package/ios/RNLlamaContext.mm +133 -33
  54. package/jest/mock.js +0 -1
  55. package/lib/commonjs/NativeRNLlama.js.map +1 -1
  56. package/lib/commonjs/index.js +52 -15
  57. package/lib/commonjs/index.js.map +1 -1
  58. package/lib/module/NativeRNLlama.js.map +1 -1
  59. package/lib/module/index.js +51 -15
  60. package/lib/module/index.js.map +1 -1
  61. package/lib/typescript/NativeRNLlama.d.ts +29 -5
  62. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  63. package/lib/typescript/index.d.ts +12 -5
  64. package/lib/typescript/index.d.ts.map +1 -1
  65. package/package.json +1 -1
  66. package/src/NativeRNLlama.ts +41 -6
  67. package/src/index.ts +82 -27
  68. package/cpp/json-schema-to-grammar.cpp +0 -1045
  69. package/cpp/json-schema-to-grammar.h +0 -8
  70. package/cpp/json.hpp +0 -24766
package/cpp/common.h CHANGED
@@ -1,613 +1,600 @@
1
- // Various helper functions and utilities
2
-
3
- #pragma once
4
-
5
- #include "llama.h"
6
-
7
- #include <string>
8
- #include <vector>
9
- #include <sstream>
10
-
11
- #ifdef _WIN32
12
- #define DIRECTORY_SEPARATOR '\\'
13
- #else
14
- #define DIRECTORY_SEPARATOR '/'
15
- #endif // _WIN32
16
-
17
- #define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
18
- #define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
19
-
20
- #define print_build_info() do { \
21
- fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
22
- fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
23
- } while(0)
24
-
25
- #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
26
-
27
- struct common_lora_adapter_info {
28
- std::string path;
29
- float scale;
30
- };
31
-
32
- struct common_lora_adapter_container : common_lora_adapter_info {
33
- struct llama_lora_adapter * adapter;
34
- };
35
-
36
- // build info
37
- extern int LLAMA_BUILD_NUMBER;
38
- extern char const * LLAMA_COMMIT;
39
- extern char const * LLAMA_COMPILER;
40
- extern char const * LLAMA_BUILD_TARGET;
41
-
42
- struct common_control_vector_load_info;
43
-
44
- #define print_build_info() do { \
45
- fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
46
- fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
47
- } while(0)
48
-
49
- // build info
50
- extern int LLAMA_BUILD_NUMBER;
51
- extern char const *LLAMA_COMMIT;
52
- extern char const *LLAMA_COMPILER;
53
- extern char const *LLAMA_BUILD_TARGET;
54
-
55
- //
56
- // CPU utils
57
- //
58
-
59
- struct cpu_params {
60
- int n_threads = -1;
61
- bool cpumask[LM_GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
62
- bool mask_valid = false; // Default: any CPU
63
- enum lm_ggml_sched_priority priority = LM_GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
64
- bool strict_cpu = false; // Use strict CPU placement
65
- uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
66
- };
67
-
68
- int32_t cpu_get_num_physical_cores();
69
- int32_t cpu_get_num_math();
70
-
71
- //
72
- // Common params
73
- //
74
-
75
- enum llama_example {
76
- LLAMA_EXAMPLE_COMMON,
77
- LLAMA_EXAMPLE_SPECULATIVE,
78
- LLAMA_EXAMPLE_MAIN,
79
- LLAMA_EXAMPLE_INFILL,
80
- LLAMA_EXAMPLE_EMBEDDING,
81
- LLAMA_EXAMPLE_PERPLEXITY,
82
- LLAMA_EXAMPLE_RETRIEVAL,
83
- LLAMA_EXAMPLE_PASSKEY,
84
- LLAMA_EXAMPLE_IMATRIX,
85
- LLAMA_EXAMPLE_BENCH,
86
- LLAMA_EXAMPLE_SERVER,
87
- LLAMA_EXAMPLE_CVECTOR_GENERATOR,
88
- LLAMA_EXAMPLE_EXPORT_LORA,
89
- LLAMA_EXAMPLE_LLAVA,
90
- LLAMA_EXAMPLE_LOOKUP,
91
- LLAMA_EXAMPLE_PARALLEL,
92
-
93
- LLAMA_EXAMPLE_COUNT,
94
- };
95
-
96
- enum common_sampler_type {
97
- COMMON_SAMPLER_TYPE_NONE = 0,
98
- COMMON_SAMPLER_TYPE_DRY = 1,
99
- COMMON_SAMPLER_TYPE_TOP_K = 2,
100
- COMMON_SAMPLER_TYPE_TOP_P = 3,
101
- COMMON_SAMPLER_TYPE_MIN_P = 4,
102
- //COMMON_SAMPLER_TYPE_TFS_Z = 5,
103
- COMMON_SAMPLER_TYPE_TYPICAL_P = 6,
104
- COMMON_SAMPLER_TYPE_TEMPERATURE = 7,
105
- COMMON_SAMPLER_TYPE_XTC = 8,
106
- COMMON_SAMPLER_TYPE_INFILL = 9,
107
- };
108
-
109
- // dimensionality reduction methods, used by cvector-generator
110
- enum dimre_method {
111
- DIMRE_METHOD_PCA,
112
- DIMRE_METHOD_MEAN,
113
- };
114
-
115
- // sampler parameters
116
- struct common_sampler_params {
117
- uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
118
-
119
- int32_t n_prev = 64; // number of previous tokens to remember
120
- int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
121
- int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
122
- int32_t top_k = 40; // <= 0 to use vocab size
123
- float top_p = 0.95f; // 1.0 = disabled
124
- float min_p = 0.05f; // 0.0 = disabled
125
- float xtc_probability = 0.00f; // 0.0 = disabled
126
- float xtc_threshold = 0.10f; // > 0.5 disables XTC
127
- float typ_p = 1.00f; // typical_p, 1.0 = disabled
128
- float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
129
- float dynatemp_range = 0.00f; // 0.0 = disabled
130
- float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
131
- int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
132
- float penalty_repeat = 1.00f; // 1.0 = disabled
133
- float penalty_freq = 0.00f; // 0.0 = disabled
134
- float penalty_present = 0.00f; // 0.0 = disabled
135
- float dry_multiplier = 0.0f; // 0.0 = disabled; DRY repetition penalty for tokens extending repetition:
136
- float dry_base = 1.75f; // 0.0 = disabled; multiplier * base ^ (length of sequence before token - allowed length)
137
- int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty
138
- int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
139
- int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
140
- float mirostat_tau = 5.00f; // target entropy
141
- float mirostat_eta = 0.10f; // learning rate
142
- bool penalize_nl = false; // consider newlines as a repeatable token
143
- bool ignore_eos = false;
144
- bool no_perf = false; // disable performance metrics
145
-
146
- std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY
147
-
148
-
149
- std::vector<enum common_sampler_type> samplers = {
150
- COMMON_SAMPLER_TYPE_DRY,
151
- COMMON_SAMPLER_TYPE_TOP_K,
152
- COMMON_SAMPLER_TYPE_TYPICAL_P,
153
- COMMON_SAMPLER_TYPE_TOP_P,
154
- COMMON_SAMPLER_TYPE_MIN_P,
155
- COMMON_SAMPLER_TYPE_XTC,
156
- COMMON_SAMPLER_TYPE_TEMPERATURE,
157
- };
158
-
159
- std::string grammar; // optional BNF-like grammar to constrain sampling
160
-
161
- std::vector<llama_logit_bias> logit_bias; // logit biases to apply
162
-
163
- // print the parameters into a string
164
- std::string print() const;
165
- };
166
-
167
- struct common_params {
168
-
169
- void * progress_callback_user_data = nullptr;
170
- llama_progress_callback progress_callback = nullptr;
171
- bool vocab_only = false;
172
- int32_t n_predict = -1; // new tokens to predict
173
- int32_t n_ctx = 4096; // context size
174
- int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
175
- int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
176
- int32_t n_keep = 0; // number of tokens to keep from initial prompt
177
- int32_t n_draft = 5; // number of tokens to draft during speculative decoding
178
- int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
179
- int32_t n_parallel = 1; // number of parallel sequences to decode
180
- int32_t n_sequences = 1; // number of sequences to decode
181
- float p_split = 0.1f; // speculative decoding split probability
182
- int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
183
- int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
184
- int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
185
- float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
186
- int32_t grp_attn_n = 1; // group-attention factor
187
- int32_t grp_attn_w = 512; // group-attention width
188
- int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
189
- float rope_freq_base = 0.0f; // RoPE base frequency
190
- float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
191
- float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
192
- float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor
193
- float yarn_beta_fast = 32.0f; // YaRN low correction dim
194
- float yarn_beta_slow = 1.0f; // YaRN high correction dim
195
- int32_t yarn_orig_ctx = 0; // YaRN original context length
196
- float defrag_thold = -1.0f; // KV cache defragmentation threshold
197
-
198
- struct cpu_params cpuparams;
199
- struct cpu_params cpuparams_batch;
200
- struct cpu_params draft_cpuparams;
201
- struct cpu_params draft_cpuparams_batch;
202
-
203
- lm_ggml_backend_sched_eval_callback cb_eval = nullptr;
204
- void * cb_eval_user_data = nullptr;
205
-
206
- lm_ggml_numa_strategy numa = LM_GGML_NUMA_STRATEGY_DISABLED;
207
-
208
- enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
209
- enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
210
- enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
211
- enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
212
-
213
- struct common_sampler_params sparams;
214
-
215
- std::string model = ""; // model path // NOLINT
216
- std::string model_draft = ""; // draft model for speculative decoding // NOLINT
217
- std::string model_alias = "unknown"; // model alias // NOLINT
218
- std::string model_url = ""; // model url to download // NOLINT
219
- std::string hf_token = ""; // HF token // NOLINT
220
- std::string hf_repo = ""; // HF repo // NOLINT
221
- std::string hf_file = ""; // HF file // NOLINT
222
- std::string prompt = ""; // NOLINT
223
- std::string prompt_file = ""; // store the external prompt file name // NOLINT
224
- std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT
225
- std::string input_prefix = ""; // string to prefix user inputs with // NOLINT
226
- std::string input_suffix = ""; // string to suffix user inputs with // NOLINT
227
- std::string logdir = ""; // directory in which to save YAML log files // NOLINT
228
- std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
229
- std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
230
- std::string logits_file = ""; // file for saving *all* logits // NOLINT
231
- std::string rpc_servers = ""; // comma separated list of RPC servers // NOLINT
232
-
233
- std::vector<std::string> in_files; // all input files
234
- std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
235
- std::vector<llama_model_kv_override> kv_overrides;
236
-
237
- bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply)
238
- std::vector<common_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale
239
-
240
- std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale
241
-
242
- int32_t verbosity = 0;
243
- int32_t control_vector_layer_start = -1; // layer range for control vector
244
- int32_t control_vector_layer_end = -1; // layer range for control vector
245
-
246
- int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
247
- int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
248
- // (which is more convenient to use for plotting)
249
- //
250
- bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
251
- size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
252
-
253
- bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt
254
- size_t winogrande_tasks = 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
255
-
256
- bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
257
- size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
258
-
259
- bool kl_divergence = false; // compute KL divergence
260
-
261
- bool usage = false; // print usage
262
- bool use_color = false; // use color to distinguish generations and inputs
263
- bool special = false; // enable special token output
264
- bool interactive = false; // interactive mode
265
- bool interactive_first = false; // wait for user input immediately
266
- bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
267
- bool prompt_cache_all = false; // save user input and generations to prompt cache
268
- bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
269
-
270
- bool escape = true; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
271
- bool multiline_input = false; // reverse the usage of `\`
272
- bool simple_io = false; // improves compatibility with subprocesses and limited consoles
273
- bool cont_batching = true; // insert new sequences for decoding on-the-fly
274
- bool flash_attn = false; // flash attention
275
- bool no_perf = false; // disable performance metrics
276
- bool ctx_shift = true; // context shift on inifinite text generation
277
-
278
- bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
279
- bool logits_all = false; // return logits for all tokens in the batch
280
- bool use_mmap = true; // use mmap for faster loads
281
- bool use_mlock = false; // use mlock to keep model in memory
282
- bool verbose_prompt = false; // print prompt tokens before generation
283
- bool display_prompt = true; // print prompt before generation
284
- bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
285
- bool no_kv_offload = false; // disable KV offloading
286
- bool warmup = true; // warmup run
287
- bool check_tensors = false; // validate tensor data
288
-
289
- std::string cache_type_k = "f16"; // KV cache data type for the K
290
- std::string cache_type_v = "f16"; // KV cache data type for the V
291
-
292
- // multimodal models (see examples/llava)
293
- std::string mmproj = ""; // path to multimodal projector // NOLINT
294
- std::vector<std::string> image; // path to image file(s)
295
-
296
- // embedding
297
- bool embedding = false; // get only sentence embedding
298
- int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
299
- std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
300
- std::string embd_sep = "\n"; // separator of embeddings
301
- bool reranking = false; // enable reranking support on server
302
-
303
- // server params
304
- int32_t port = 8080; // server listens on this network port
305
- int32_t timeout_read = 600; // http read timeout in seconds
306
- int32_t timeout_write = timeout_read; // http write timeout in seconds
307
- int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
308
- int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
309
-
310
- std::string hostname = "127.0.0.1";
311
- std::string public_path = ""; // NOLINT
312
- std::string chat_template = ""; // NOLINT
313
- bool enable_chat_template = true;
314
-
315
- std::vector<std::string> api_keys;
316
-
317
- std::string ssl_file_key = ""; // NOLINT
318
- std::string ssl_file_cert = ""; // NOLINT
319
-
320
- // "advanced" endpoints are disabled by default for better security
321
- bool webui = true;
322
- bool endpoint_slots = false;
323
- bool endpoint_props = false; // only control POST requests, not GET
324
- bool endpoint_metrics = false;
325
-
326
- bool log_json = false;
327
-
328
- std::string slot_save_path;
329
-
330
- float slot_prompt_similarity = 0.5f;
331
-
332
- // batched-bench params
333
- bool is_pp_shared = false;
334
-
335
- std::vector<int32_t> n_pp;
336
- std::vector<int32_t> n_tg;
337
- std::vector<int32_t> n_pl;
338
-
339
- // retrieval params
340
- std::vector<std::string> context_files; // context files to embed
341
-
342
- int32_t chunk_size = 64; // chunk size for context embedding
343
-
344
- std::string chunk_separator = "\n"; // chunk separator for context embedding
345
-
346
- // passkey params
347
- int32_t n_junk = 250; // number of times to repeat the junk text
348
- int32_t i_pos = -1; // position of the passkey in the junk text
349
-
350
- // imatrix params
351
- std::string out_file = "imatrix.dat"; // save the resulting imatrix to this file
352
-
353
- int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations
354
- int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
355
- int32_t i_chunk = 0; // start processing from this chunk
356
-
357
- bool process_output = false; // collect data for the output tensor
358
- bool compute_ppl = true; // whether to compute perplexity
359
-
360
- // cvector-generator params
361
- int n_pca_batch = 100;
362
- int n_pca_iterations = 1000;
363
- dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
364
- std::string cvector_outfile = "control_vector.gguf";
365
- std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
366
- std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
367
-
368
- bool spm_infill = false; // suffix/prefix/middle pattern for infill
369
-
370
- std::string lora_outfile = "ggml-lora-merged-f16.gguf";
371
-
372
- // batched-bench params
373
- bool batched_bench_output_jsonl = false;
374
- };
375
-
376
- // call once at the start of a program if it uses libcommon
377
- // initializes the logging system and prints info about the build
378
- void common_init();
379
-
380
- std::string common_params_get_system_info(const common_params & params);
381
-
382
- bool parse_cpu_range(const std::string & range, bool(&boolmask)[LM_GGML_MAX_N_THREADS]);
383
- bool parse_cpu_mask(const std::string & mask, bool(&boolmask)[LM_GGML_MAX_N_THREADS]);
384
- void postprocess_cpu_params(cpu_params & cpuparams, const cpu_params * role_model = nullptr);
385
- bool set_process_priority(enum lm_ggml_sched_priority prio);
386
-
387
- //
388
- // String utils
389
- //
390
-
391
- #ifdef __GNUC__
392
- #ifdef __MINGW32__
393
- #define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
394
- #else
395
- #define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
396
- #endif
397
- #else
398
- #define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
399
- #endif
400
-
401
- LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
402
- std::string string_format(const char * fmt, ...);
403
-
404
- std::string string_strip(const std::string & str);
405
- std::string string_get_sortable_timestamp();
406
-
407
- void string_replace_all(std::string & s, const std::string & search, const std::string & replace);
408
-
409
- template<class T>
410
- static std::vector<T> string_split(const std::string & str, char delim) {
411
- static_assert(!std::is_same<T, std::string>::value, "Please use the specialized version for std::string");
412
- std::vector<T> values;
413
- std::istringstream str_stream(str);
414
- std::string token;
415
- while (std::getline(str_stream, token, delim)) {
416
- T value;
417
- std::istringstream token_stream(token);
418
- token_stream >> value;
419
- values.push_back(value);
420
- }
421
- return values;
422
- }
423
-
424
- template<>
425
- std::vector<std::string> string_split<std::string>(const std::string & input, char separator)
426
- {
427
- std::vector<std::string> parts;
428
- size_t begin_pos = 0;
429
- size_t separator_pos = input.find(separator);
430
- while (separator_pos != std::string::npos) {
431
- std::string part = input.substr(begin_pos, separator_pos - begin_pos);
432
- parts.emplace_back(part);
433
- begin_pos = separator_pos + 1;
434
- separator_pos = input.find(separator, begin_pos);
435
- }
436
- parts.emplace_back(input.substr(begin_pos, separator_pos - begin_pos));
437
- return parts;
438
- }
439
-
440
- bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
441
- void string_process_escapes(std::string & input);
442
-
443
- std::string string_from(bool value);
444
- std::string string_from(const std::vector<int> & values);
445
- std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens);
446
- std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch);
447
-
448
- //
449
- // Filesystem utils
450
- //
451
-
452
- bool fs_validate_filename(const std::string & filename);
453
- bool fs_create_directory_with_parents(const std::string & path);
454
-
455
- std::string fs_get_cache_directory();
456
- std::string fs_get_cache_file(const std::string & filename);
457
-
458
- //
459
- // Model utils
460
- //
461
-
462
- struct common_init_result {
463
- struct llama_model * model = nullptr;
464
- struct llama_context * context = nullptr;
465
- std::vector<common_lora_adapter_container> lora_adapters;
466
- };
467
-
468
- struct common_init_result common_init_from_params(common_params & params);
469
-
470
- struct llama_model_params common_model_params_to_llama (const common_params & params);
471
- struct llama_context_params common_context_params_to_llama(const common_params & params);
472
- struct lm_ggml_threadpool_params lm_ggml_threadpool_params_from_cpu_params(const cpu_params & params);
473
-
474
- struct llama_model * common_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
475
- struct llama_model * common_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
476
-
477
- // clear LoRA adapters from context, then apply new list of adapters
478
- void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters);
479
-
480
- // Batch utils
481
-
482
- void common_batch_clear(struct llama_batch & batch);
483
-
484
- void common_batch_add(
485
- struct llama_batch & batch,
486
- llama_token id,
487
- llama_pos pos,
488
- const std::vector<llama_seq_id> & seq_ids,
489
- bool logits);
490
-
491
- //
492
- // Vocab utils
493
- //
494
-
495
- // tokenizes a string into a vector of tokens
496
- // should work similar to Python's `tokenizer.encode`
497
- std::vector<llama_token> common_tokenize(
498
- const struct llama_context * ctx,
499
- const std::string & text,
500
- bool add_special,
501
- bool parse_special = false);
502
-
503
- std::vector<llama_token> common_tokenize(
504
- const struct llama_model * model,
505
- const std::string & text,
506
- bool add_special,
507
- bool parse_special = false);
508
-
509
- // tokenizes a token into a piece, optionally renders special/control tokens
510
- // should work similar to Python's `tokenizer.id_to_piece`
511
- std::string common_token_to_piece(
512
- const struct llama_context * ctx,
513
- llama_token token,
514
- bool special = true);
515
-
516
- // detokenizes a vector of tokens into a string
517
- // should work similar to Python's `tokenizer.decode`
518
- // optionally renders special/control tokens
519
- std::string common_detokenize(
520
- llama_context * ctx,
521
- const std::vector<llama_token> & tokens,
522
- bool special = true);
523
-
524
- //
525
- // Chat template utils
526
- //
527
-
528
- // same with llama_chat_message, but uses std::string
529
- struct common_chat_msg {
530
- std::string role;
531
- std::string content;
532
- };
533
-
534
- // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
535
- bool common_chat_verify_template(const std::string & tmpl);
536
-
537
- // CPP wrapper for llama_chat_apply_template
538
- // If the built-in template is not supported, we default to chatml
539
- // If the custom "tmpl" is not supported, we throw an error
540
- std::string common_chat_apply_template(const struct llama_model * model,
541
- const std::string & tmpl,
542
- const std::vector<common_chat_msg> & chat,
543
- bool add_ass);
544
-
545
- // Format single message, while taking into account the position of that message in chat history
546
- std::string common_chat_format_single(const struct llama_model * model,
547
- const std::string & tmpl,
548
- const std::vector<common_chat_msg> & past_msg,
549
- const common_chat_msg & new_msg,
550
- bool add_ass);
551
-
552
- // Returns an example of formatted chat
553
- std::string common_chat_format_example(const struct llama_model * model,
554
- const std::string & tmpl);
555
-
556
- //
557
- // KV cache utils
558
- //
559
-
560
- // Dump the KV cache view with the number of sequences per cell.
561
- void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
562
-
563
- // Dump the KV cache view showing individual sequences in each cell (long output).
564
- void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
565
-
566
- //
567
- // Embedding utils
568
- //
569
-
570
- void common_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);
571
-
572
- float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);
573
-
574
- //
575
- // Control vector utils
576
- //
577
-
578
- struct common_control_vector_data {
579
- int n_embd;
580
-
581
- // stores data for layers [1, n_layer] where n_layer = data.size() / n_embd
582
- std::vector<float> data;
583
- };
584
-
585
- struct common_control_vector_load_info {
586
- float strength;
587
-
588
- std::string fname;
589
- };
590
-
591
- // Load control vectors, scale each by strength, and add them together.
592
- // On error, returns {-1, empty}
593
- common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos);
594
-
595
- //
596
- // Split utils
597
- //
598
-
599
- static const char * const LLM_KV_SPLIT_NO = "split.no";
600
- static const char * const LLM_KV_SPLIT_COUNT = "split.count";
601
- static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
602
-
603
- //
604
- // YAML utils
605
- //
606
-
607
- void yaml_dump_vector_float (FILE * stream, const char * prop_name, const std::vector<float> & data);
608
- void yaml_dump_vector_int (FILE * stream, const char * prop_name, const std::vector<int> & data);
609
- void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data);
610
-
611
- void yaml_dump_non_result_info(
612
- FILE * stream, const common_params & params, const llama_context * lctx,
613
- const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
1
+ // Various helper functions and utilities
2
+
3
+ #pragma once
4
+
5
+ #include "llama.h"
6
+
7
+ #include <string>
8
+ #include <vector>
9
+ #include <sstream>
10
+
11
+ #ifdef _WIN32
12
+ #define DIRECTORY_SEPARATOR '\\'
13
+ #else
14
+ #define DIRECTORY_SEPARATOR '/'
15
+ #endif // _WIN32
16
+
17
+ #define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
18
+ #define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
19
+
20
+ #define print_build_info() do { \
21
+ fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
22
+ fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
23
+ } while(0)
24
+
25
+ #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
26
+
27
+ struct common_lora_adapter_info {
28
+ std::string path;
29
+ float scale;
30
+ };
31
+
32
+ struct common_lora_adapter_container : common_lora_adapter_info {
33
+ struct llama_lora_adapter * adapter;
34
+ };
35
+
36
+ // build info
37
+ extern int LLAMA_BUILD_NUMBER;
38
+ extern char const * LLAMA_COMMIT;
39
+ extern char const * LLAMA_COMPILER;
40
+ extern char const * LLAMA_BUILD_TARGET;
41
+
42
+ struct common_control_vector_load_info;
43
+
44
+ #define print_build_info() do { \
45
+ fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
46
+ fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
47
+ } while(0)
48
+
49
+ // build info
50
+ extern int LLAMA_BUILD_NUMBER;
51
+ extern char const *LLAMA_COMMIT;
52
+ extern char const *LLAMA_COMPILER;
53
+ extern char const *LLAMA_BUILD_TARGET;
54
+
55
+ //
56
+ // CPU utils
57
+ //
58
+
59
+ struct cpu_params {
60
+ int n_threads = -1;
61
+ bool cpumask[LM_GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
62
+ bool mask_valid = false; // Default: any CPU
63
+ enum lm_ggml_sched_priority priority = LM_GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
64
+ bool strict_cpu = false; // Use strict CPU placement
65
+ uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
66
+ };
67
+
68
+ int32_t cpu_get_num_physical_cores();
69
+ int32_t cpu_get_num_math();
70
+
71
+ //
72
+ // Common params
73
+ //
74
+
75
+ enum llama_example {
76
+ LLAMA_EXAMPLE_COMMON,
77
+ LLAMA_EXAMPLE_SPECULATIVE,
78
+ LLAMA_EXAMPLE_MAIN,
79
+ LLAMA_EXAMPLE_INFILL,
80
+ LLAMA_EXAMPLE_EMBEDDING,
81
+ LLAMA_EXAMPLE_PERPLEXITY,
82
+ LLAMA_EXAMPLE_RETRIEVAL,
83
+ LLAMA_EXAMPLE_PASSKEY,
84
+ LLAMA_EXAMPLE_IMATRIX,
85
+ LLAMA_EXAMPLE_BENCH,
86
+ LLAMA_EXAMPLE_SERVER,
87
+ LLAMA_EXAMPLE_CVECTOR_GENERATOR,
88
+ LLAMA_EXAMPLE_EXPORT_LORA,
89
+ LLAMA_EXAMPLE_LLAVA,
90
+ LLAMA_EXAMPLE_LOOKUP,
91
+ LLAMA_EXAMPLE_PARALLEL,
92
+
93
+ LLAMA_EXAMPLE_COUNT,
94
+ };
95
+
96
+ enum common_sampler_type {
97
+ COMMON_SAMPLER_TYPE_NONE = 0,
98
+ COMMON_SAMPLER_TYPE_DRY = 1,
99
+ COMMON_SAMPLER_TYPE_TOP_K = 2,
100
+ COMMON_SAMPLER_TYPE_TOP_P = 3,
101
+ COMMON_SAMPLER_TYPE_MIN_P = 4,
102
+ //COMMON_SAMPLER_TYPE_TFS_Z = 5,
103
+ COMMON_SAMPLER_TYPE_TYPICAL_P = 6,
104
+ COMMON_SAMPLER_TYPE_TEMPERATURE = 7,
105
+ COMMON_SAMPLER_TYPE_XTC = 8,
106
+ COMMON_SAMPLER_TYPE_INFILL = 9,
107
+ };
108
+
109
+ // dimensionality reduction methods, used by cvector-generator
110
+ enum dimre_method {
111
+ DIMRE_METHOD_PCA,
112
+ DIMRE_METHOD_MEAN,
113
+ };
114
+
115
+ // sampler parameters
116
+ struct common_sampler_params {
117
+ uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
118
+
119
+ int32_t n_prev = 64; // number of previous tokens to remember
120
+ int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
121
+ int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
122
+ int32_t top_k = 40; // <= 0 to use vocab size
123
+ float top_p = 0.95f; // 1.0 = disabled
124
+ float min_p = 0.05f; // 0.0 = disabled
125
+ float xtc_probability = 0.00f; // 0.0 = disabled
126
+ float xtc_threshold = 0.10f; // > 0.5 disables XTC
127
+ float typ_p = 1.00f; // typical_p, 1.0 = disabled
128
+ float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
129
+ float dynatemp_range = 0.00f; // 0.0 = disabled
130
+ float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
131
+ int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
132
+ float penalty_repeat = 1.00f; // 1.0 = disabled
133
+ float penalty_freq = 0.00f; // 0.0 = disabled
134
+ float penalty_present = 0.00f; // 0.0 = disabled
135
+ float dry_multiplier = 0.0f; // 0.0 = disabled; DRY repetition penalty for tokens extending repetition:
136
+ float dry_base = 1.75f; // 0.0 = disabled; multiplier * base ^ (length of sequence before token - allowed length)
137
+ int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty
138
+ int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
139
+ int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
140
+ float mirostat_tau = 5.00f; // target entropy
141
+ float mirostat_eta = 0.10f; // learning rate
142
+ bool penalize_nl = false; // consider newlines as a repeatable token
143
+ bool ignore_eos = false;
144
+ bool no_perf = false; // disable performance metrics
145
+
146
+ std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY
147
+
148
+
149
+ std::vector<enum common_sampler_type> samplers = {
150
+ COMMON_SAMPLER_TYPE_DRY,
151
+ COMMON_SAMPLER_TYPE_TOP_K,
152
+ COMMON_SAMPLER_TYPE_TYPICAL_P,
153
+ COMMON_SAMPLER_TYPE_TOP_P,
154
+ COMMON_SAMPLER_TYPE_MIN_P,
155
+ COMMON_SAMPLER_TYPE_XTC,
156
+ COMMON_SAMPLER_TYPE_TEMPERATURE,
157
+ };
158
+
159
+ std::string grammar; // optional BNF-like grammar to constrain sampling
160
+
161
+ std::vector<llama_logit_bias> logit_bias; // logit biases to apply
162
+
163
+ // print the parameters into a string
164
+ std::string print() const;
165
+ };
166
+
167
+ struct common_params {
168
+
169
+ void * progress_callback_user_data = nullptr;
170
+ llama_progress_callback progress_callback = nullptr;
171
+ bool vocab_only = false;
172
+ int32_t n_predict = -1; // new tokens to predict
173
+ int32_t n_ctx = 4096; // context size
174
+ int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
175
+ int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
176
+ int32_t n_keep = 0; // number of tokens to keep from initial prompt
177
+ int32_t n_draft = 5; // number of tokens to draft during speculative decoding
178
+ int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
179
+ int32_t n_parallel = 1; // number of parallel sequences to decode
180
+ int32_t n_sequences = 1; // number of sequences to decode
181
+ float p_split = 0.1f; // speculative decoding split probability
182
+ int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
183
+ int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
184
+ int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
185
+ float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
186
+ int32_t grp_attn_n = 1; // group-attention factor
187
+ int32_t grp_attn_w = 512; // group-attention width
188
+ int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
189
+ float rope_freq_base = 0.0f; // RoPE base frequency
190
+ float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
191
+ float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
192
+ float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor
193
+ float yarn_beta_fast = 32.0f; // YaRN low correction dim
194
+ float yarn_beta_slow = 1.0f; // YaRN high correction dim
195
+ int32_t yarn_orig_ctx = 0; // YaRN original context length
196
+ float defrag_thold = 0.1f; // KV cache defragmentation threshold
197
+
198
+ struct cpu_params cpuparams;
199
+ struct cpu_params cpuparams_batch;
200
+ struct cpu_params draft_cpuparams;
201
+ struct cpu_params draft_cpuparams_batch;
202
+
203
+ lm_ggml_backend_sched_eval_callback cb_eval = nullptr;
204
+ void * cb_eval_user_data = nullptr;
205
+
206
+ lm_ggml_numa_strategy numa = LM_GGML_NUMA_STRATEGY_DISABLED;
207
+
208
+ enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
209
+ enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
210
+ enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
211
+ enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
212
+
213
+ struct common_sampler_params sparams;
214
+
215
+ std::string model = ""; // model path // NOLINT
216
+ std::string model_draft = ""; // draft model for speculative decoding // NOLINT
217
+ std::string model_alias = "unknown"; // model alias // NOLINT
218
+ std::string model_url = ""; // model url to download // NOLINT
219
+ std::string hf_token = ""; // HF token // NOLINT
220
+ std::string hf_repo = ""; // HF repo // NOLINT
221
+ std::string hf_file = ""; // HF file // NOLINT
222
+ std::string prompt = ""; // NOLINT
223
+ std::string prompt_file = ""; // store the external prompt file name // NOLINT
224
+ std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT
225
+ std::string input_prefix = ""; // string to prefix user inputs with // NOLINT
226
+ std::string input_suffix = ""; // string to suffix user inputs with // NOLINT
227
+ std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
228
+ std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
229
+ std::string logits_file = ""; // file for saving *all* logits // NOLINT
230
+ std::string rpc_servers = ""; // comma separated list of RPC servers // NOLINT
231
+
232
+ std::vector<std::string> in_files; // all input files
233
+ std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
234
+ std::vector<llama_model_kv_override> kv_overrides;
235
+
236
+ bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply)
237
+ std::vector<common_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale
238
+
239
+ std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale
240
+
241
+ int32_t verbosity = 0;
242
+ int32_t control_vector_layer_start = -1; // layer range for control vector
243
+ int32_t control_vector_layer_end = -1; // layer range for control vector
244
+
245
+ int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
246
+ int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
247
+ // (which is more convenient to use for plotting)
248
+ //
249
+ bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
250
+ size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
251
+
252
+ bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt
253
+ size_t winogrande_tasks = 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
254
+
255
+ bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
256
+ size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
257
+
258
+ bool kl_divergence = false; // compute KL divergence
259
+
260
+ bool usage = false; // print usage
261
+ bool use_color = false; // use color to distinguish generations and inputs
262
+ bool special = false; // enable special token output
263
+ bool interactive = false; // interactive mode
264
+ bool interactive_first = false; // wait for user input immediately
265
+ bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
266
+ bool prompt_cache_all = false; // save user input and generations to prompt cache
267
+ bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
268
+
269
+ bool escape = true; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
270
+ bool multiline_input = false; // reverse the usage of `\`
271
+ bool simple_io = false; // improves compatibility with subprocesses and limited consoles
272
+ bool cont_batching = true; // insert new sequences for decoding on-the-fly
273
+ bool flash_attn = false; // flash attention
274
+ bool no_perf = false; // disable performance metrics
275
+ bool ctx_shift = true; // context shift on inifinite text generation
276
+
277
+ bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
278
+ bool logits_all = false; // return logits for all tokens in the batch
279
+ bool use_mmap = true; // use mmap for faster loads
280
+ bool use_mlock = false; // use mlock to keep model in memory
281
+ bool verbose_prompt = false; // print prompt tokens before generation
282
+ bool display_prompt = true; // print prompt before generation
283
+ bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
284
+ bool no_kv_offload = false; // disable KV offloading
285
+ bool warmup = true; // warmup run
286
+ bool check_tensors = false; // validate tensor data
287
+
288
+ std::string cache_type_k = "f16"; // KV cache data type for the K
289
+ std::string cache_type_v = "f16"; // KV cache data type for the V
290
+
291
+ // multimodal models (see examples/llava)
292
+ std::string mmproj = ""; // path to multimodal projector // NOLINT
293
+ std::vector<std::string> image; // path to image file(s)
294
+
295
+ // embedding
296
+ bool embedding = false; // get only sentence embedding
297
+ int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
298
+ std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
299
+ std::string embd_sep = "\n"; // separator of embeddings
300
+ bool reranking = false; // enable reranking support on server
301
+
302
+ // server params
303
+ int32_t port = 8080; // server listens on this network port
304
+ int32_t timeout_read = 600; // http read timeout in seconds
305
+ int32_t timeout_write = timeout_read; // http write timeout in seconds
306
+ int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
307
+ int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
308
+
309
+ std::string hostname = "127.0.0.1";
310
+ std::string public_path = ""; // NOLINT
311
+ std::string chat_template = ""; // NOLINT
312
+ bool enable_chat_template = true;
313
+
314
+ std::vector<std::string> api_keys;
315
+
316
+ std::string ssl_file_key = ""; // NOLINT
317
+ std::string ssl_file_cert = ""; // NOLINT
318
+
319
+ // "advanced" endpoints are disabled by default for better security
320
+ bool webui = true;
321
+ bool endpoint_slots = false;
322
+ bool endpoint_props = false; // only control POST requests, not GET
323
+ bool endpoint_metrics = false;
324
+
325
+ bool log_json = false;
326
+
327
+ std::string slot_save_path;
328
+
329
+ float slot_prompt_similarity = 0.5f;
330
+
331
+ // batched-bench params
332
+ bool is_pp_shared = false;
333
+
334
+ std::vector<int32_t> n_pp;
335
+ std::vector<int32_t> n_tg;
336
+ std::vector<int32_t> n_pl;
337
+
338
+ // retrieval params
339
+ std::vector<std::string> context_files; // context files to embed
340
+
341
+ int32_t chunk_size = 64; // chunk size for context embedding
342
+
343
+ std::string chunk_separator = "\n"; // chunk separator for context embedding
344
+
345
+ // passkey params
346
+ int32_t n_junk = 250; // number of times to repeat the junk text
347
+ int32_t i_pos = -1; // position of the passkey in the junk text
348
+
349
+ // imatrix params
350
+ std::string out_file = "imatrix.dat"; // save the resulting imatrix to this file
351
+
352
+ int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations
353
+ int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
354
+ int32_t i_chunk = 0; // start processing from this chunk
355
+
356
+ bool process_output = false; // collect data for the output tensor
357
+ bool compute_ppl = true; // whether to compute perplexity
358
+
359
+ // cvector-generator params
360
+ int n_pca_batch = 100;
361
+ int n_pca_iterations = 1000;
362
+ dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
363
+ std::string cvector_outfile = "control_vector.gguf";
364
+ std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
365
+ std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
366
+
367
+ bool spm_infill = false; // suffix/prefix/middle pattern for infill
368
+
369
+ std::string lora_outfile = "ggml-lora-merged-f16.gguf";
370
+
371
+ // batched-bench params
372
+ bool batched_bench_output_jsonl = false;
373
+ };
374
+
375
+ // call once at the start of a program if it uses libcommon
376
+ // initializes the logging system and prints info about the build
377
+ void common_init();
378
+
379
+ std::string common_params_get_system_info(const common_params & params);
380
+
381
+ bool parse_cpu_range(const std::string & range, bool(&boolmask)[LM_GGML_MAX_N_THREADS]);
382
+ bool parse_cpu_mask(const std::string & mask, bool(&boolmask)[LM_GGML_MAX_N_THREADS]);
383
+ void postprocess_cpu_params(cpu_params & cpuparams, const cpu_params * role_model = nullptr);
384
+ bool set_process_priority(enum lm_ggml_sched_priority prio);
385
+
386
+ //
387
+ // String utils
388
+ //
389
+
390
+ #ifdef __GNUC__
391
+ #ifdef __MINGW32__
392
+ #define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
393
+ #else
394
+ #define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
395
+ #endif
396
+ #else
397
+ #define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
398
+ #endif
399
+
400
+ LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
401
+ std::string string_format(const char * fmt, ...);
402
+
403
+ std::string string_strip(const std::string & str);
404
+ std::string string_get_sortable_timestamp();
405
+
406
+ void string_replace_all(std::string & s, const std::string & search, const std::string & replace);
407
+
408
+ template<class T>
409
+ static std::vector<T> string_split(const std::string & str, char delim) {
410
+ static_assert(!std::is_same<T, std::string>::value, "Please use the specialized version for std::string");
411
+ std::vector<T> values;
412
+ std::istringstream str_stream(str);
413
+ std::string token;
414
+ while (std::getline(str_stream, token, delim)) {
415
+ T value;
416
+ std::istringstream token_stream(token);
417
+ token_stream >> value;
418
+ values.push_back(value);
419
+ }
420
+ return values;
421
+ }
422
+
423
+ template<>
424
+ std::vector<std::string> string_split<std::string>(const std::string & input, char separator)
425
+ {
426
+ std::vector<std::string> parts;
427
+ size_t begin_pos = 0;
428
+ size_t separator_pos = input.find(separator);
429
+ while (separator_pos != std::string::npos) {
430
+ std::string part = input.substr(begin_pos, separator_pos - begin_pos);
431
+ parts.emplace_back(part);
432
+ begin_pos = separator_pos + 1;
433
+ separator_pos = input.find(separator, begin_pos);
434
+ }
435
+ parts.emplace_back(input.substr(begin_pos, separator_pos - begin_pos));
436
+ return parts;
437
+ }
438
+
439
+ bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
440
+ void string_process_escapes(std::string & input);
441
+
442
+ std::string string_from(bool value);
443
+ std::string string_from(const std::vector<int> & values);
444
+ std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens);
445
+ std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch);
446
+
447
+ //
448
+ // Filesystem utils
449
+ //
450
+
451
+ bool fs_validate_filename(const std::string & filename);
452
+ bool fs_create_directory_with_parents(const std::string & path);
453
+
454
+ std::string fs_get_cache_directory();
455
+ std::string fs_get_cache_file(const std::string & filename);
456
+
457
+ //
458
+ // Model utils
459
+ //
460
+
461
+ struct common_init_result {
462
+ struct llama_model * model = nullptr;
463
+ struct llama_context * context = nullptr;
464
+ std::vector<common_lora_adapter_container> lora_adapters;
465
+ };
466
+
467
+ struct common_init_result common_init_from_params(common_params & params);
468
+
469
+ struct llama_model_params common_model_params_to_llama (const common_params & params);
470
+ struct llama_context_params common_context_params_to_llama(const common_params & params);
471
+ struct lm_ggml_threadpool_params lm_ggml_threadpool_params_from_cpu_params(const cpu_params & params);
472
+
473
+ struct llama_model * common_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
474
+ struct llama_model * common_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
475
+
476
+ // clear LoRA adapters from context, then apply new list of adapters
477
+ void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters);
478
+
479
+ // Batch utils
480
+
481
+ void common_batch_clear(struct llama_batch & batch);
482
+
483
+ void common_batch_add(
484
+ struct llama_batch & batch,
485
+ llama_token id,
486
+ llama_pos pos,
487
+ const std::vector<llama_seq_id> & seq_ids,
488
+ bool logits);
489
+
490
+ //
491
+ // Vocab utils
492
+ //
493
+
494
+ // tokenizes a string into a vector of tokens
495
+ // should work similar to Python's `tokenizer.encode`
496
+ std::vector<llama_token> common_tokenize(
497
+ const struct llama_context * ctx,
498
+ const std::string & text,
499
+ bool add_special,
500
+ bool parse_special = false);
501
+
502
+ std::vector<llama_token> common_tokenize(
503
+ const struct llama_model * model,
504
+ const std::string & text,
505
+ bool add_special,
506
+ bool parse_special = false);
507
+
508
+ // tokenizes a token into a piece, optionally renders special/control tokens
509
+ // should work similar to Python's `tokenizer.id_to_piece`
510
+ std::string common_token_to_piece(
511
+ const struct llama_context * ctx,
512
+ llama_token token,
513
+ bool special = true);
514
+
515
+ // detokenizes a vector of tokens into a string
516
+ // should work similar to Python's `tokenizer.decode`
517
+ // optionally renders special/control tokens
518
+ std::string common_detokenize(
519
+ llama_context * ctx,
520
+ const std::vector<llama_token> & tokens,
521
+ bool special = true);
522
+
523
+ //
524
+ // Chat template utils
525
+ //
526
+
527
+ // same with llama_chat_message, but uses std::string
528
+ struct common_chat_msg {
529
+ std::string role;
530
+ std::string content;
531
+ };
532
+
533
+ // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
534
+ bool common_chat_verify_template(const std::string & tmpl);
535
+
536
+ // CPP wrapper for llama_chat_apply_template
537
+ // If the built-in template is not supported, we default to chatml
538
+ // If the custom "tmpl" is not supported, we throw an error
539
+ std::string common_chat_apply_template(const struct llama_model * model,
540
+ const std::string & tmpl,
541
+ const std::vector<common_chat_msg> & chat,
542
+ bool add_ass);
543
+
544
+ // Format single message, while taking into account the position of that message in chat history
545
+ std::string common_chat_format_single(const struct llama_model * model,
546
+ const std::string & tmpl,
547
+ const std::vector<common_chat_msg> & past_msg,
548
+ const common_chat_msg & new_msg,
549
+ bool add_ass);
550
+
551
+ // Returns an example of formatted chat
552
+ std::string common_chat_format_example(const struct llama_model * model,
553
+ const std::string & tmpl);
554
+
555
+ //
556
+ // KV cache utils
557
+ //
558
+
559
+ // Dump the KV cache view with the number of sequences per cell.
560
+ void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
561
+
562
+ // Dump the KV cache view showing individual sequences in each cell (long output).
563
+ void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
564
+
565
+ //
566
+ // Embedding utils
567
+ //
568
+
569
+ void common_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);
570
+
571
+ float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);
572
+
573
+ //
574
+ // Control vector utils
575
+ //
576
+
577
+ struct common_control_vector_data {
578
+ int n_embd;
579
+
580
+ // stores data for layers [1, n_layer] where n_layer = data.size() / n_embd
581
+ std::vector<float> data;
582
+ };
583
+
584
+ struct common_control_vector_load_info {
585
+ float strength;
586
+
587
+ std::string fname;
588
+ };
589
+
590
+ // Load control vectors, scale each by strength, and add them together.
591
+ // On error, returns {-1, empty}
592
+ common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos);
593
+
594
+ //
595
+ // Split utils
596
+ //
597
+
598
+ static const char * const LLM_KV_SPLIT_NO = "split.no";
599
+ static const char * const LLM_KV_SPLIT_COUNT = "split.count";
600
+ static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
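
For orientation only (this is not part of the published diff): the vocab and batch helpers declared in the new common.h above are typically combined along the following lines. This is a minimal sketch that assumes a llama_context * and a llama_batch have already been created through the regular llama.h APIs; the function name feed_prompt and the error handling are illustrative placeholders, not part of the package.

// Illustrative sketch only -- not part of the published package diff.
// Assumes ctx and batch were created beforehand via the llama.h APIs.
#include <string>
#include <vector>
#include "common.h"

static std::string feed_prompt(llama_context * ctx, llama_batch & batch, const std::string & prompt) {
    // tokenize the prompt, adding BOS/special tokens (analogous to tokenizer.encode)
    std::vector<llama_token> toks = common_tokenize(ctx, prompt, /*add_special=*/true);

    common_batch_clear(batch);
    for (size_t i = 0; i < toks.size(); ++i) {
        // place each token at position i in sequence 0; request logits only for the last token
        common_batch_add(batch, toks[i], (llama_pos) i, {0}, i == toks.size() - 1);
    }

    if (llama_decode(ctx, batch) != 0) {
        return ""; // decoding failed
    }

    // round-trip back to text (analogous to tokenizer.decode), rendering special tokens
    return common_detokenize(ctx, toks, /*special=*/true);
}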