cui-llama.rn 1.3.6 → 1.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101)
  1. package/README.md +22 -1
  2. package/android/src/main/CMakeLists.txt +25 -26
  3. package/android/src/main/java/com/rnllama/LlamaContext.java +31 -9
  4. package/android/src/main/java/com/rnllama/RNLlama.java +98 -0
  5. package/android/src/main/jni-utils.h +94 -0
  6. package/android/src/main/jni.cpp +133 -63
  7. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +15 -0
  8. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +15 -0
  9. package/cpp/common.cpp +2085 -1982
  10. package/cpp/common.h +696 -664
  11. package/cpp/ggml-alloc.c +1042 -1037
  12. package/cpp/ggml-backend-impl.h +255 -256
  13. package/cpp/ggml-backend-reg.cpp +582 -582
  14. package/cpp/ggml-backend.cpp +2002 -2002
  15. package/cpp/ggml-backend.h +354 -352
  16. package/cpp/ggml-common.h +1853 -1853
  17. package/cpp/ggml-cpp.h +39 -39
  18. package/cpp/ggml-cpu-aarch64.cpp +4247 -4247
  19. package/cpp/ggml-cpu-aarch64.h +8 -8
  20. package/cpp/ggml-cpu-impl.h +386 -386
  21. package/cpp/ggml-cpu-quants.c +10920 -10839
  22. package/cpp/ggml-cpu-traits.cpp +36 -36
  23. package/cpp/ggml-cpu-traits.h +38 -38
  24. package/cpp/ggml-cpu.c +14391 -14122
  25. package/cpp/ggml-cpu.cpp +635 -627
  26. package/cpp/ggml-cpu.h +135 -135
  27. package/cpp/ggml-impl.h +567 -567
  28. package/cpp/ggml-metal-impl.h +288 -0
  29. package/cpp/ggml-metal.m +4884 -4884
  30. package/cpp/ggml-opt.cpp +854 -0
  31. package/cpp/ggml-opt.h +216 -0
  32. package/cpp/ggml-quants.c +5238 -5238
  33. package/cpp/ggml-threading.h +14 -14
  34. package/cpp/ggml.c +6514 -6448
  35. package/cpp/ggml.h +2194 -2163
  36. package/cpp/gguf.cpp +1329 -1325
  37. package/cpp/gguf.h +202 -202
  38. package/cpp/json-schema-to-grammar.cpp +1045 -1045
  39. package/cpp/json-schema-to-grammar.h +8 -8
  40. package/cpp/json.hpp +24766 -24766
  41. package/cpp/llama-adapter.cpp +347 -346
  42. package/cpp/llama-adapter.h +74 -73
  43. package/cpp/llama-arch.cpp +1487 -1434
  44. package/cpp/llama-arch.h +400 -395
  45. package/cpp/llama-batch.cpp +368 -368
  46. package/cpp/llama-batch.h +88 -88
  47. package/cpp/llama-chat.cpp +578 -567
  48. package/cpp/llama-chat.h +52 -51
  49. package/cpp/llama-context.cpp +1775 -1771
  50. package/cpp/llama-context.h +128 -128
  51. package/cpp/llama-cparams.cpp +1 -1
  52. package/cpp/llama-cparams.h +37 -37
  53. package/cpp/llama-cpp.h +30 -30
  54. package/cpp/llama-grammar.cpp +1139 -1139
  55. package/cpp/llama-grammar.h +143 -143
  56. package/cpp/llama-hparams.cpp +71 -71
  57. package/cpp/llama-hparams.h +139 -140
  58. package/cpp/llama-impl.cpp +167 -167
  59. package/cpp/llama-impl.h +61 -61
  60. package/cpp/llama-kv-cache.cpp +718 -718
  61. package/cpp/llama-kv-cache.h +218 -218
  62. package/cpp/llama-mmap.cpp +590 -589
  63. package/cpp/llama-mmap.h +67 -67
  64. package/cpp/llama-model-loader.cpp +1124 -1011
  65. package/cpp/llama-model-loader.h +167 -158
  66. package/cpp/llama-model.cpp +3997 -2202
  67. package/cpp/llama-model.h +370 -391
  68. package/cpp/llama-sampling.cpp +2408 -2406
  69. package/cpp/llama-sampling.h +32 -48
  70. package/cpp/llama-vocab.cpp +3247 -1982
  71. package/cpp/llama-vocab.h +125 -182
  72. package/cpp/llama.cpp +10077 -12544
  73. package/cpp/llama.h +1323 -1285
  74. package/cpp/log.cpp +401 -401
  75. package/cpp/log.h +121 -121
  76. package/cpp/rn-llama.hpp +123 -116
  77. package/cpp/sampling.cpp +505 -500
  78. package/cpp/sgemm.cpp +2597 -2597
  79. package/cpp/sgemm.h +14 -14
  80. package/cpp/speculative.cpp +277 -274
  81. package/cpp/speculative.h +28 -28
  82. package/cpp/unicode.cpp +2 -3
  83. package/ios/RNLlama.mm +47 -0
  84. package/ios/RNLlamaContext.h +3 -1
  85. package/ios/RNLlamaContext.mm +71 -14
  86. package/jest/mock.js +15 -3
  87. package/lib/commonjs/NativeRNLlama.js.map +1 -1
  88. package/lib/commonjs/index.js +33 -37
  89. package/lib/commonjs/index.js.map +1 -1
  90. package/lib/module/NativeRNLlama.js.map +1 -1
  91. package/lib/module/index.js +31 -35
  92. package/lib/module/index.js.map +1 -1
  93. package/lib/typescript/NativeRNLlama.d.ts +26 -6
  94. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  95. package/lib/typescript/index.d.ts +21 -36
  96. package/lib/typescript/index.d.ts.map +1 -1
  97. package/llama-rn.podspec +4 -18
  98. package/package.json +2 -3
  99. package/src/NativeRNLlama.ts +32 -13
  100. package/src/index.ts +52 -47
  101. package/cpp/llama.cpp.rej +0 -23
package/cpp/common.h CHANGED
@@ -1,664 +1,696 @@
1
- // Various helper functions and utilities
2
-
3
- #pragma once
4
-
5
- #include "llama-cpp.h"
6
-
7
- #include <string>
8
- #include <vector>
9
- #include <sstream>
10
-
11
- #ifdef _WIN32
12
- #define DIRECTORY_SEPARATOR '\\'
13
- #else
14
- #define DIRECTORY_SEPARATOR '/'
15
- #endif // _WIN32
16
-
17
- #define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
18
- #define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
19
-
20
- #define print_build_info() do { \
21
- fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
22
- fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
23
- } while(0)
24
-
25
- #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
26
-
27
- struct common_lora_adapter_info {
28
- std::string path;
29
- float scale;
30
-
31
- struct llama_lora_adapter * ptr;
32
- };
33
-
34
- using llama_tokens = std::vector<llama_token>;
35
-
36
- // build info
37
- extern int LLAMA_BUILD_NUMBER;
38
- extern const char * LLAMA_COMMIT;
39
- extern const char * LLAMA_COMPILER;
40
- extern const char * LLAMA_BUILD_TARGET;
41
-
42
- struct common_control_vector_load_info;
43
-
44
- #define print_build_info() do { \
45
- fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
46
- fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
47
- } while(0)
48
-
49
- // build info
50
- extern int LLAMA_BUILD_NUMBER;
51
- extern char const *LLAMA_COMMIT;
52
- extern char const *LLAMA_COMPILER;
53
- extern char const *LLAMA_BUILD_TARGET;
54
-
55
- //
56
- // CPU utils
57
- //
58
-
59
- struct cpu_params {
60
- int n_threads = -1;
61
- bool cpumask[LM_GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
62
- bool mask_valid = false; // Default: any CPU
63
- enum lm_ggml_sched_priority priority = LM_GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
64
- bool strict_cpu = false; // Use strict CPU placement
65
- uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
66
- };
67
-
68
- int32_t cpu_get_num_physical_cores();
69
- int32_t cpu_get_num_math();
70
-
71
- //
72
- // Common params
73
- //
74
-
75
- enum llama_example {
76
- LLAMA_EXAMPLE_COMMON,
77
- LLAMA_EXAMPLE_SPECULATIVE,
78
- LLAMA_EXAMPLE_MAIN,
79
- LLAMA_EXAMPLE_INFILL,
80
- LLAMA_EXAMPLE_EMBEDDING,
81
- LLAMA_EXAMPLE_PERPLEXITY,
82
- LLAMA_EXAMPLE_RETRIEVAL,
83
- LLAMA_EXAMPLE_PASSKEY,
84
- LLAMA_EXAMPLE_IMATRIX,
85
- LLAMA_EXAMPLE_BENCH,
86
- LLAMA_EXAMPLE_SERVER,
87
- LLAMA_EXAMPLE_CVECTOR_GENERATOR,
88
- LLAMA_EXAMPLE_EXPORT_LORA,
89
- LLAMA_EXAMPLE_LLAVA,
90
- LLAMA_EXAMPLE_LOOKUP,
91
- LLAMA_EXAMPLE_PARALLEL,
92
- LLAMA_EXAMPLE_TTS,
93
-
94
- LLAMA_EXAMPLE_COUNT,
95
- };
96
-
97
- enum common_sampler_type {
98
- COMMON_SAMPLER_TYPE_NONE = 0,
99
- COMMON_SAMPLER_TYPE_DRY = 1,
100
- COMMON_SAMPLER_TYPE_TOP_K = 2,
101
- COMMON_SAMPLER_TYPE_TOP_P = 3,
102
- COMMON_SAMPLER_TYPE_MIN_P = 4,
103
- //COMMON_SAMPLER_TYPE_TFS_Z = 5,
104
- COMMON_SAMPLER_TYPE_TYPICAL_P = 6,
105
- COMMON_SAMPLER_TYPE_TEMPERATURE = 7,
106
- COMMON_SAMPLER_TYPE_XTC = 8,
107
- COMMON_SAMPLER_TYPE_INFILL = 9,
108
- COMMON_SAMPLER_TYPE_PENALTIES = 10,
109
- };
110
-
111
- // dimensionality reduction methods, used by cvector-generator
112
- enum dimre_method {
113
- DIMRE_METHOD_PCA,
114
- DIMRE_METHOD_MEAN,
115
- };
116
-
117
- // sampling parameters
118
- struct common_params_sampling {
119
- uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
120
-
121
- int32_t n_prev = 64; // number of previous tokens to remember
122
- int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
123
- int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
124
- int32_t top_k = 40; // <= 0 to use vocab size
125
- float top_p = 0.95f; // 1.0 = disabled
126
- float min_p = 0.05f; // 0.0 = disabled
127
- float xtc_probability = 0.00f; // 0.0 = disabled
128
- float xtc_threshold = 0.10f; // > 0.5 disables XTC
129
- float typ_p = 1.00f; // typical_p, 1.0 = disabled
130
- float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
131
- float dynatemp_range = 0.00f; // 0.0 = disabled
132
- float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
133
- int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
134
- float penalty_repeat = 1.00f; // 1.0 = disabled
135
- float penalty_freq = 0.00f; // 0.0 = disabled
136
- float penalty_present = 0.00f; // 0.0 = disabled
137
- float dry_multiplier = 0.0f; // 0.0 = disabled; DRY repetition penalty for tokens extending repetition:
138
- float dry_base = 1.75f; // 0.0 = disabled; multiplier * base ^ (length of sequence before token - allowed length)
139
- int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty
140
- int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
141
- int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
142
- float mirostat_tau = 5.00f; // target entropy
143
- float mirostat_eta = 0.10f; // learning rate
144
- bool ignore_eos = false;
145
- bool no_perf = false; // disable performance metrics
146
- bool timing_per_token = false;
147
-
148
- std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY
149
-
150
-
151
- std::vector<enum common_sampler_type> samplers = {
152
- COMMON_SAMPLER_TYPE_PENALTIES,
153
- COMMON_SAMPLER_TYPE_DRY,
154
- COMMON_SAMPLER_TYPE_TOP_K,
155
- COMMON_SAMPLER_TYPE_TYPICAL_P,
156
- COMMON_SAMPLER_TYPE_TOP_P,
157
- COMMON_SAMPLER_TYPE_MIN_P,
158
- COMMON_SAMPLER_TYPE_XTC,
159
- COMMON_SAMPLER_TYPE_TEMPERATURE,
160
- };
161
-
162
- std::string grammar; // optional BNF-like grammar to constrain sampling
163
-
164
- std::vector<llama_logit_bias> logit_bias; // logit biases to apply
165
-
166
- // print the parameters into a string
167
- std::string print() const;
168
- };
169
-
170
- struct common_params_speculative {
171
- std::vector<lm_ggml_backend_dev_t> devices; // devices to use for offloading
172
-
173
- int32_t n_ctx = 0; // draft context size
174
- int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
175
- int32_t n_min = 5; // minimum number of draft tokens to use for speculative decoding
176
- int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
177
- float p_split = 0.1f; // speculative decoding split probability
178
- float p_min = 0.9f; // minimum speculative decoding probability (greedy)
179
-
180
- struct cpu_params cpuparams;
181
- struct cpu_params cpuparams_batch;
182
-
183
- std::string model = ""; // draft model for speculative decoding // NOLINT
184
- };
185
-
186
- struct common_params_vocoder {
187
- std::string hf_repo = ""; // HF repo // NOLINT
188
- std::string hf_file = ""; // HF file // NOLINT
189
-
190
- std::string model = ""; // model path // NOLINT
191
- std::string model_url = ""; // model url to download // NOLINT
192
- };
193
-
194
- struct common_params {
195
-
196
- void * progress_callback_user_data = nullptr;
197
- llama_progress_callback progress_callback = nullptr;
198
- bool vocab_only = false;
199
- int32_t n_predict = -1; // new tokens to predict
200
- int32_t n_ctx = 4096; // context size
201
- int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
202
- int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
203
- int32_t n_keep = 0; // number of tokens to keep from initial prompt
204
- int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
205
- int32_t n_parallel = 1; // number of parallel sequences to decode
206
- int32_t n_sequences = 1; // number of sequences to decode
207
- int32_t grp_attn_n = 1; // group-attention factor
208
- int32_t grp_attn_w = 512; // group-attention width
209
- int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
210
- float rope_freq_base = 0.0f; // RoPE base frequency
211
- float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
212
- float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
213
- float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor
214
- float yarn_beta_fast = 32.0f; // YaRN low correction dim
215
- float yarn_beta_slow = 1.0f; // YaRN high correction dim
216
- int32_t yarn_orig_ctx = 0; // YaRN original context length
217
- float defrag_thold = 0.1f; // KV cache defragmentation threshold
218
-
219
- // offload params
220
- std::vector<lm_ggml_backend_dev_t> devices; // devices to use for offloading
221
-
222
- int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
223
- int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
224
- float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
225
-
226
- enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
227
-
228
- struct cpu_params cpuparams;
229
- struct cpu_params cpuparams_batch;
230
-
231
- lm_ggml_backend_sched_eval_callback cb_eval = nullptr;
232
- void * cb_eval_user_data = nullptr;
233
-
234
- lm_ggml_numa_strategy numa = LM_GGML_NUMA_STRATEGY_DISABLED;
235
-
236
- enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
237
- enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
238
- enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
239
-
240
- struct common_params_sampling sampling;
241
- struct common_params_speculative speculative;
242
- struct common_params_vocoder vocoder;
243
-
244
- std::string model = ""; // model path // NOLINT
245
- std::string model_alias = ""; // model alias // NOLINT
246
- std::string model_url = ""; // model url to download // NOLINT
247
- std::string hf_token = ""; // HF token // NOLINT
248
- std::string hf_repo = ""; // HF repo // NOLINT
249
- std::string hf_file = ""; // HF file // NOLINT
250
- std::string prompt = ""; // NOLINT
251
- std::string prompt_file = ""; // store the external prompt file name // NOLINT
252
- std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT
253
- std::string input_prefix = ""; // string to prefix user inputs with // NOLINT
254
- std::string input_suffix = ""; // string to suffix user inputs with // NOLINT
255
- std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
256
- std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
257
- std::string logits_file = ""; // file for saving *all* logits // NOLINT
258
- std::string rpc_servers = ""; // comma separated list of RPC servers // NOLINT
259
-
260
- std::vector<std::string> in_files; // all input files
261
- std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
262
- std::vector<llama_model_kv_override> kv_overrides;
263
-
264
- bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply)
265
- std::vector<common_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale
266
-
267
- std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale
268
-
269
- int32_t verbosity = 0;
270
- int32_t control_vector_layer_start = -1; // layer range for control vector
271
- int32_t control_vector_layer_end = -1; // layer range for control vector
272
-
273
- int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
274
- int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
275
- // (which is more convenient to use for plotting)
276
- //
277
- bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
278
- size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
279
-
280
- bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt
281
- size_t winogrande_tasks = 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
282
-
283
- bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
284
- size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
285
-
286
- bool kl_divergence = false; // compute KL divergence
287
-
288
- bool usage = false; // print usage
289
- bool use_color = false; // use color to distinguish generations and inputs
290
- bool special = false; // enable special token output
291
- bool interactive = false; // interactive mode
292
- bool interactive_first = false; // wait for user input immediately
293
- bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
294
- bool prompt_cache_all = false; // save user input and generations to prompt cache
295
- bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
296
-
297
- bool escape = true; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
298
- bool multiline_input = false; // reverse the usage of `\`
299
- bool simple_io = false; // improves compatibility with subprocesses and limited consoles
300
- bool cont_batching = true; // insert new sequences for decoding on-the-fly
301
- bool flash_attn = false; // flash attention
302
- bool no_perf = false; // disable performance metrics
303
- bool ctx_shift = true; // context shift on inifinite text generation
304
-
305
- bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
306
- bool logits_all = false; // return logits for all tokens in the batch
307
- bool use_mmap = true; // use mmap for faster loads
308
- bool use_mlock = false; // use mlock to keep model in memory
309
- bool verbose_prompt = false; // print prompt tokens before generation
310
- bool display_prompt = true; // print prompt before generation
311
- bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
312
- bool no_kv_offload = false; // disable KV offloading
313
- bool warmup = true; // warmup run
314
- bool check_tensors = false; // validate tensor data
315
-
316
- lm_ggml_type cache_type_k = LM_GGML_TYPE_F16; // KV cache data type for the K
317
- lm_ggml_type cache_type_v = LM_GGML_TYPE_F16; // KV cache data type for the V
318
-
319
- // multimodal models (see examples/llava)
320
- std::string mmproj = ""; // path to multimodal projector // NOLINT
321
- std::vector<std::string> image; // path to image file(s)
322
-
323
- // embedding
324
- bool embedding = false; // get only sentence embedding
325
- int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
326
- std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
327
- std::string embd_sep = "\n"; // separator of embeddings
328
- bool reranking = false; // enable reranking support on server
329
-
330
- // server params
331
- int32_t port = 8080; // server listens on this network port
332
- int32_t timeout_read = 600; // http read timeout in seconds
333
- int32_t timeout_write = timeout_read; // http write timeout in seconds
334
- int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
335
- int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
336
-
337
- std::string hostname = "127.0.0.1";
338
- std::string public_path = ""; // NOLINT
339
- std::string chat_template = ""; // NOLINT
340
- bool enable_chat_template = true;
341
-
342
- std::vector<std::string> api_keys;
343
-
344
- std::string ssl_file_key = ""; // NOLINT
345
- std::string ssl_file_cert = ""; // NOLINT
346
-
347
- // "advanced" endpoints are disabled by default for better security
348
- bool webui = true;
349
- bool endpoint_slots = false;
350
- bool endpoint_props = false; // only control POST requests, not GET
351
- bool endpoint_metrics = false;
352
-
353
- bool log_json = false;
354
-
355
- std::string slot_save_path;
356
-
357
- float slot_prompt_similarity = 0.5f;
358
-
359
- // batched-bench params
360
- bool is_pp_shared = false;
361
-
362
- std::vector<int32_t> n_pp;
363
- std::vector<int32_t> n_tg;
364
- std::vector<int32_t> n_pl;
365
-
366
- // retrieval params
367
- std::vector<std::string> context_files; // context files to embed
368
-
369
- int32_t chunk_size = 64; // chunk size for context embedding
370
-
371
- std::string chunk_separator = "\n"; // chunk separator for context embedding
372
-
373
- // passkey params
374
- int32_t n_junk = 250; // number of times to repeat the junk text
375
- int32_t i_pos = -1; // position of the passkey in the junk text
376
-
377
- // imatrix params
378
- std::string out_file = "imatrix.dat"; // save the resulting imatrix to this file
379
-
380
- int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations
381
- int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
382
- int32_t i_chunk = 0; // start processing from this chunk
383
-
384
- bool process_output = false; // collect data for the output tensor
385
- bool compute_ppl = true; // whether to compute perplexity
386
-
387
- // cvector-generator params
388
- int n_pca_batch = 100;
389
- int n_pca_iterations = 1000;
390
- dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
391
- std::string cvector_outfile = "control_vector.gguf";
392
- std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
393
- std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
394
-
395
- bool spm_infill = false; // suffix/prefix/middle pattern for infill
396
-
397
- std::string lora_outfile = "ggml-lora-merged-f16.gguf";
398
-
399
- // batched-bench params
400
- bool batched_bench_output_jsonl = false;
401
- };
402
-
403
- // call once at the start of a program if it uses libcommon
404
- // initializes the logging system and prints info about the build
405
- void common_init();
406
-
407
- std::string common_params_get_system_info(const common_params & params);
408
-
409
- bool parse_cpu_range(const std::string & range, bool(&boolmask)[LM_GGML_MAX_N_THREADS]);
410
- bool parse_cpu_mask(const std::string & mask, bool(&boolmask)[LM_GGML_MAX_N_THREADS]);
411
- void postprocess_cpu_params(cpu_params & cpuparams, const cpu_params * role_model = nullptr);
412
- bool set_process_priority(enum lm_ggml_sched_priority prio);
413
-
414
- //
415
- // String utils
416
- //
417
-
418
- #ifdef __GNUC__
419
- #ifdef __MINGW32__
420
- #define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
421
- #else
422
- #define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
423
- #endif
424
- #else
425
- #define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
426
- #endif
427
-
428
- LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
429
- std::string string_format(const char * fmt, ...);
430
-
431
- std::string string_strip(const std::string & str);
432
- std::string string_get_sortable_timestamp();
433
-
434
- void string_replace_all(std::string & s, const std::string & search, const std::string & replace);
435
-
436
- template<class T>
437
- static std::vector<T> string_split(const std::string & str, char delim) {
438
- static_assert(!std::is_same<T, std::string>::value, "Please use the specialized version for std::string");
439
- std::vector<T> values;
440
- std::istringstream str_stream(str);
441
- std::string token;
442
- while (std::getline(str_stream, token, delim)) {
443
- T value;
444
- std::istringstream token_stream(token);
445
- token_stream >> value;
446
- values.push_back(value);
447
- }
448
- return values;
449
- }
450
-
451
- template<>
452
- std::vector<std::string> string_split<std::string>(const std::string & input, char separator)
453
- {
454
- std::vector<std::string> parts;
455
- size_t begin_pos = 0;
456
- size_t separator_pos = input.find(separator);
457
- while (separator_pos != std::string::npos) {
458
- std::string part = input.substr(begin_pos, separator_pos - begin_pos);
459
- parts.emplace_back(part);
460
- begin_pos = separator_pos + 1;
461
- separator_pos = input.find(separator, begin_pos);
462
- }
463
- parts.emplace_back(input.substr(begin_pos, separator_pos - begin_pos));
464
- return parts;
465
- }
466
-
467
- static bool string_starts_with(const std::string & str,
468
- const std::string & prefix) { // While we wait for C++20's std::string::starts_with...
469
- return str.rfind(prefix, 0) == 0;
470
- }
471
-
472
- bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
473
- void string_process_escapes(std::string & input);
474
-
475
- std::string string_from(bool value);
476
- std::string string_from(const std::vector<int> & values);
477
- std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens);
478
- std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch);
479
-
480
- //
481
- // Filesystem utils
482
- //
483
-
484
- bool fs_validate_filename(const std::string & filename);
485
- bool fs_create_directory_with_parents(const std::string & path);
486
-
487
- std::string fs_get_cache_directory();
488
- std::string fs_get_cache_file(const std::string & filename);
489
-
490
- //
491
- // Model utils
492
- //
493
-
494
- // note: defines object's lifetime
495
- struct common_init_result {
496
- llama_model_ptr model;
497
- llama_context_ptr context;
498
-
499
- std::vector<llama_lora_adapter_ptr> lora;
500
- };
501
-
502
- struct common_init_result common_init_from_params(common_params & params);
503
-
504
- struct llama_model_params common_model_params_to_llama ( common_params & params);
505
- struct llama_context_params common_context_params_to_llama(const common_params & params);
506
- struct lm_ggml_threadpool_params lm_ggml_threadpool_params_from_cpu_params(const cpu_params & params);
507
-
508
- struct llama_model * common_load_model_from_url(
509
- const std::string & model_url,
510
- const std::string & local_path,
511
- const std::string & hf_token,
512
- const struct llama_model_params & params);
513
- struct llama_model * common_load_model_from_hf(
514
- const std::string & repo,
515
- const std::string & remote_path,
516
- const std::string & local_path,
517
- const std::string & hf_token,
518
- const struct llama_model_params & params);
519
-
520
- // clear LoRA adapters from context, then apply new list of adapters
521
- void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_info> & lora);
522
-
523
- //
524
- // Batch utils
525
- //
526
-
527
- void common_batch_clear(struct llama_batch & batch);
528
-
529
- void common_batch_add(
530
- struct llama_batch & batch,
531
- llama_token id,
532
- llama_pos pos,
533
- const std::vector<llama_seq_id> & seq_ids,
534
- bool logits);
535
-
536
- //
537
- // Token utils
538
- //
539
-
540
- // longest common prefix
541
- size_t common_lcp(const llama_tokens & a, const llama_tokens & b);
542
-
543
- // longet common subsequence
544
- size_t common_lcs(const llama_tokens & a, const llama_tokens & b);
545
-
546
- //
547
- // Vocab utils
548
- //
549
-
550
- // tokenizes a string into a vector of tokens
551
- // should work similar to Python's `tokenizer.encode`
552
- std::vector<llama_token> common_tokenize(
553
- const struct llama_context * ctx,
554
- const std::string & text,
555
- bool add_special,
556
- bool parse_special = false);
557
-
558
- std::vector<llama_token> common_tokenize(
559
- const struct llama_model * model,
560
- const std::string & text,
561
- bool add_special,
562
- bool parse_special = false);
563
-
564
- // tokenizes a token into a piece, optionally renders special/control tokens
565
- // should work similar to Python's `tokenizer.id_to_piece`
566
- std::string common_token_to_piece(
567
- const struct llama_context * ctx,
568
- llama_token token,
569
- bool special = true);
570
-
571
- // detokenizes a vector of tokens into a string
572
- // should work similar to Python's `tokenizer.decode`
573
- // optionally renders special/control tokens
574
- std::string common_detokenize(
575
- llama_context * ctx,
576
- const std::vector<llama_token> & tokens,
577
- bool special = true);
578
-
579
- //
580
- // Chat template utils
581
- //
582
-
583
- // same with llama_chat_message, but uses std::string
584
- struct common_chat_msg {
585
- std::string role;
586
- std::string content;
587
- };
588
-
589
- // Get the built-in chat template for the model. Return empty string if not present.
590
- std::string common_get_builtin_chat_template(const struct llama_model * model);
591
-
592
- // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
593
- bool common_chat_verify_template(const std::string & tmpl);
594
-
595
- // CPP wrapper for llama_chat_apply_template
596
- // If the built-in template is not supported, we default to chatml
597
- // If the custom "tmpl" is not supported, we throw an error
598
- std::string common_chat_apply_template(const struct llama_model * model,
599
- const std::string & tmpl,
600
- const std::vector<common_chat_msg> & chat,
601
- bool add_ass);
602
-
603
- // Format single message, while taking into account the position of that message in chat history
604
- std::string common_chat_format_single(const struct llama_model * model,
605
- const std::string & tmpl,
606
- const std::vector<common_chat_msg> & past_msg,
607
- const common_chat_msg & new_msg,
608
- bool add_ass);
609
-
610
- // Returns an example of formatted chat
611
- std::string common_chat_format_example(const struct llama_model * model,
612
- const std::string & tmpl);
613
-
614
- //
615
- // KV cache utils
616
- //
617
-
618
- // Dump the KV cache view with the number of sequences per cell.
619
- void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
620
-
621
- // Dump the KV cache view showing individual sequences in each cell (long output).
622
- void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
623
-
624
- //
625
- // Embedding utils
626
- //
627
-
628
- // TODO: repace embd_norm with an enum
629
- void common_embd_normalize(const float * inp, float * out, int n, int embd_norm);
630
-
631
- float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);
632
-
633
- //
634
- // Control vector utils
635
- //
636
-
637
- struct common_control_vector_data {
638
- int n_embd;
639
-
640
- // stores data for layers [1, n_layer] where n_layer = data.size() / n_embd
641
- std::vector<float> data;
642
- };
643
-
644
- struct common_control_vector_load_info {
645
- float strength;
646
-
647
- std::string fname;
648
- };
649
-
650
- // Load control vectors, scale each by strength, and add them together.
651
- // On error, returns {-1, empty}
652
- common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos);
653
-
654
- //
655
- // Split utils
656
- //
657
-
658
- namespace {
659
-
660
- const char * const LLM_KV_SPLIT_NO = "split.no";
661
- const char * const LLM_KV_SPLIT_COUNT = "split.count";
662
- const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
663
-
664
- }
1
+ // Various helper functions and utilities
2
+
3
+ #pragma once
4
+
5
+ #include "llama-cpp.h"
6
+
7
+ #include <string>
8
+ #include <vector>
9
+ #include <sstream>
10
+
11
+ #ifdef _WIN32
12
+ #define DIRECTORY_SEPARATOR '\\'
13
+ #else
14
+ #define DIRECTORY_SEPARATOR '/'
15
+ #endif // _WIN32
16
+
17
+ #define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0) // writes "error: <msg>" plus newline to stderr, then exit(1); msg must be a string literal because it is concatenated with the adjacent literals
18
+ #define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0) // printf-style variant of die(); fmt must be a string literal and at least one variadic argument has to be supplied (__VA_ARGS__ follows a comma)
19
+
20
+ #define print_build_info() do { \
21
+ fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
22
+ fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
23
+ } while(0)
24
+
25
+ #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
26
+
27
+ struct common_adapter_lora_info { // one LoRA adapter entry; stored in common_params::lora_adapters
28
+ std::string path; // adapter path
29
+ float scale; // user defined scale for this adapter
30
+
31
+ struct llama_adapter_lora * ptr; // NOTE(review): presumably the loaded adapter handle; who sets and who frees it is not visible in this header - confirm
32
+ };
33
+
34
+ using llama_tokens = std::vector<llama_token>;
35
+
36
+ // build info
37
+ extern int LLAMA_BUILD_NUMBER;
38
+ extern const char * LLAMA_COMMIT;
39
+ extern const char * LLAMA_COMPILER;
40
+ extern const char * LLAMA_BUILD_TARGET;
41
+
42
+ struct common_control_vector_load_info; // forward declaration: common_params holds a std::vector of this type, while the full definition only appears in the control vector utils section further down
43
+
44
+ #define print_build_info() do { \
45
+ fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
46
+ fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
47
+ } while(0) // NOTE(review): token-identical redefinition of the print_build_info() macro already defined earlier in this header; legal, but one of the two copies is redundant
48
+
49
+ // build info (NOTE(review): these four externs repeat the declarations made earlier in this header; `char const *` and `const char *` are the same type, so the redeclaration is harmless but redundant)
50
+ extern int LLAMA_BUILD_NUMBER;
51
+ extern char const *LLAMA_COMMIT;
52
+ extern char const *LLAMA_COMPILER;
53
+ extern char const *LLAMA_BUILD_TARGET;
54
+
55
+ //
56
+ // CPU utils
57
+ //
58
+
59
+ struct cpu_params {
60
+ int n_threads = -1;
61
+ bool cpumask[LM_GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
62
+ bool mask_valid = false; // Default: any CPU
63
+ enum lm_ggml_sched_priority priority = LM_GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
64
+ bool strict_cpu = false; // Use strict CPU placement
65
+ uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
66
+ };
67
+
68
+ int32_t cpu_get_num_physical_cores();
69
+ int32_t cpu_get_num_math();
70
+
71
+ //
72
+ // Common params
73
+ //
74
+
75
+ enum llama_example {
76
+ LLAMA_EXAMPLE_COMMON,
77
+ LLAMA_EXAMPLE_SPECULATIVE,
78
+ LLAMA_EXAMPLE_MAIN,
79
+ LLAMA_EXAMPLE_INFILL,
80
+ LLAMA_EXAMPLE_EMBEDDING,
81
+ LLAMA_EXAMPLE_PERPLEXITY,
82
+ LLAMA_EXAMPLE_RETRIEVAL,
83
+ LLAMA_EXAMPLE_PASSKEY,
84
+ LLAMA_EXAMPLE_IMATRIX,
85
+ LLAMA_EXAMPLE_BENCH,
86
+ LLAMA_EXAMPLE_SERVER,
87
+ LLAMA_EXAMPLE_CVECTOR_GENERATOR,
88
+ LLAMA_EXAMPLE_EXPORT_LORA,
89
+ LLAMA_EXAMPLE_LLAVA,
90
+ LLAMA_EXAMPLE_LOOKUP,
91
+ LLAMA_EXAMPLE_PARALLEL,
92
+ LLAMA_EXAMPLE_TTS,
93
+
94
+ LLAMA_EXAMPLE_COUNT,
95
+ };
96
+
97
+ enum common_sampler_type {
98
+ COMMON_SAMPLER_TYPE_NONE = 0,
99
+ COMMON_SAMPLER_TYPE_DRY = 1,
100
+ COMMON_SAMPLER_TYPE_TOP_K = 2,
101
+ COMMON_SAMPLER_TYPE_TOP_P = 3,
102
+ COMMON_SAMPLER_TYPE_MIN_P = 4,
103
+ //COMMON_SAMPLER_TYPE_TFS_Z = 5,
104
+ COMMON_SAMPLER_TYPE_TYPICAL_P = 6,
105
+ COMMON_SAMPLER_TYPE_TEMPERATURE = 7,
106
+ COMMON_SAMPLER_TYPE_XTC = 8,
107
+ COMMON_SAMPLER_TYPE_INFILL = 9,
108
+ COMMON_SAMPLER_TYPE_PENALTIES = 10,
109
+ };
110
+
111
+ // dimensionality reduction methods, used by cvector-generator
112
+ enum dimre_method {
113
+ DIMRE_METHOD_PCA,
114
+ DIMRE_METHOD_MEAN,
115
+ };
116
+
117
+ enum common_conversation_mode {
118
+ COMMON_CONVERSATION_MODE_DISABLED = 0,
119
+ COMMON_CONVERSATION_MODE_ENABLED = 1,
120
+ COMMON_CONVERSATION_MODE_AUTO = 2,
121
+ };
122
+
123
+ // sampling parameters
124
+ struct common_params_sampling {
125
+ uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
126
+
127
+ int32_t n_prev = 64; // number of previous tokens to remember
128
+ int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
129
+ int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
130
+ int32_t top_k = 40; // <= 0 to use vocab size
131
+ float top_p = 0.95f; // 1.0 = disabled
132
+ float min_p = 0.05f; // 0.0 = disabled
133
+ float xtc_probability = 0.00f; // 0.0 = disabled
134
+ float xtc_threshold = 0.10f; // > 0.5 disables XTC
135
+ float typ_p = 1.00f; // typical_p, 1.0 = disabled
136
+ float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
137
+ float dynatemp_range = 0.00f; // 0.0 = disabled
138
+ float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
139
+ int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
140
+ float penalty_repeat = 1.00f; // 1.0 = disabled
141
+ float penalty_freq = 0.00f; // 0.0 = disabled
142
+ float penalty_present = 0.00f; // 0.0 = disabled
143
+ float dry_multiplier = 0.0f; // 0.0 = disabled; DRY repetition penalty for tokens extending repetition:
144
+ float dry_base = 1.75f; // 0.0 = disabled; multiplier * base ^ (length of sequence before token - allowed length)
145
+ int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty
146
+ int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
147
+ int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
148
+ float mirostat_tau = 5.00f; // target entropy
149
+ float mirostat_eta = 0.10f; // learning rate
150
+ bool ignore_eos = false;
151
+ bool no_perf = false; // disable performance metrics
152
+ bool timing_per_token = false;
153
+
154
+ std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY
155
+
156
+
157
+ std::vector<enum common_sampler_type> samplers = {
158
+ COMMON_SAMPLER_TYPE_PENALTIES,
159
+ COMMON_SAMPLER_TYPE_DRY,
160
+ COMMON_SAMPLER_TYPE_TOP_K,
161
+ COMMON_SAMPLER_TYPE_TYPICAL_P,
162
+ COMMON_SAMPLER_TYPE_TOP_P,
163
+ COMMON_SAMPLER_TYPE_MIN_P,
164
+ COMMON_SAMPLER_TYPE_XTC,
165
+ COMMON_SAMPLER_TYPE_TEMPERATURE,
166
+ };
167
+
168
+ std::string grammar; // optional BNF-like grammar to constrain sampling
169
+
170
+ std::vector<llama_logit_bias> logit_bias; // logit biases to apply
171
+
172
+ // print the parameters into a string
173
+ std::string print() const;
174
+ };
175
+
176
+ struct common_params_speculative {
177
+ std::vector<lm_ggml_backend_dev_t> devices; // devices to use for offloading
178
+
179
+ int32_t n_ctx = 0; // draft context size
180
+ int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
181
+ int32_t n_min = 5; // minimum number of draft tokens to use for speculative decoding
182
+ int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
183
+ float p_split = 0.1f; // speculative decoding split probability
184
+ float p_min = 0.9f; // minimum speculative decoding probability (greedy)
185
+
186
+ struct cpu_params cpuparams;
187
+ struct cpu_params cpuparams_batch;
188
+
189
+ std::string hf_repo = ""; // HF repo // NOLINT
190
+ std::string hf_file = ""; // HF file // NOLINT
191
+
192
+ std::string model = ""; // draft model for speculative decoding // NOLINT
193
+ std::string model_url = ""; // model url to download // NOLINT
194
+ };
195
+
196
+ struct common_params_vocoder {
197
+ std::string hf_repo = ""; // HF repo // NOLINT
198
+ std::string hf_file = ""; // HF file // NOLINT
199
+
200
+ std::string model = ""; // model path // NOLINT
201
+ std::string model_url = ""; // model url to download // NOLINT
202
+
203
+ bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy // NOLINT
204
+ };
205
+
206
+ struct common_params {
207
+
208
+ void * progress_callback_user_data = nullptr;
209
+ llama_progress_callback progress_callback = nullptr;
210
+ bool vocab_only = false;
211
+ int32_t n_predict = -1; // new tokens to predict
212
+ int32_t n_ctx = 4096; // context size
213
+ int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
214
+ int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
215
+ int32_t n_keep = 0; // number of tokens to keep from initial prompt
216
+ int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
217
+ int32_t n_parallel = 1; // number of parallel sequences to decode
218
+ int32_t n_sequences = 1; // number of sequences to decode
219
+ int32_t grp_attn_n = 1; // group-attention factor
220
+ int32_t grp_attn_w = 512; // group-attention width
221
+ int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
222
+ float rope_freq_base = 0.0f; // RoPE base frequency
223
+ float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
224
+ float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
225
+ float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor
226
+ float yarn_beta_fast = 32.0f; // YaRN low correction dim
227
+ float yarn_beta_slow = 1.0f; // YaRN high correction dim
228
+ int32_t yarn_orig_ctx = 0; // YaRN original context length
229
+ float defrag_thold = 0.1f; // KV cache defragmentation threshold
230
+
231
+ // offload params
232
+ std::vector<lm_ggml_backend_dev_t> devices; // devices to use for offloading
233
+
234
+ int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
235
+ int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
236
+ float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
237
+
238
+ enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
239
+
240
+ struct cpu_params cpuparams;
241
+ struct cpu_params cpuparams_batch;
242
+
243
+ lm_ggml_backend_sched_eval_callback cb_eval = nullptr;
244
+ void * cb_eval_user_data = nullptr;
245
+
246
+ lm_ggml_numa_strategy numa = LM_GGML_NUMA_STRATEGY_DISABLED;
247
+
248
+ enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
249
+ enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
250
+ enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
251
+
252
+ struct common_params_sampling sampling;
253
+ struct common_params_speculative speculative;
254
+ struct common_params_vocoder vocoder;
255
+
256
+ std::string model = ""; // model path // NOLINT
257
+ std::string model_alias = ""; // model alias // NOLINT
258
+ std::string model_url = ""; // model url to download // NOLINT
259
+ std::string hf_token = ""; // HF token // NOLINT
260
+ std::string hf_repo = ""; // HF repo // NOLINT
261
+ std::string hf_file = ""; // HF file // NOLINT
262
+ std::string prompt = ""; // NOLINT
263
+ std::string prompt_file = ""; // store the external prompt file name // NOLINT
264
+ std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT
265
+ std::string input_prefix = ""; // string to prefix user inputs with // NOLINT
266
+ std::string input_suffix = ""; // string to suffix user inputs with // NOLINT
267
+ std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
268
+ std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
269
+ std::string logits_file = ""; // file for saving *all* logits // NOLINT
270
+
271
+ std::vector<std::string> in_files; // all input files
272
+ std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
273
+ std::vector<llama_model_kv_override> kv_overrides;
274
+
275
+ bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
276
+ std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale
277
+
278
+ std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale
279
+
280
+ int32_t verbosity = 0;
281
+ int32_t control_vector_layer_start = -1; // layer range for control vector
282
+ int32_t control_vector_layer_end = -1; // layer range for control vector
283
+
284
+ int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
285
+ int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
286
+ // (which is more convenient to use for plotting)
287
+ //
288
+ bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
289
+ size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
290
+
291
+ bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt
292
+ size_t winogrande_tasks = 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
293
+
294
+ bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
295
+ size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
296
+
297
+ bool kl_divergence = false; // compute KL divergence
298
+
299
+ bool usage = false; // print usage
300
+ bool use_color = false; // use color to distinguish generations and inputs
301
+ bool special = false; // enable special token output
302
+ bool interactive = false; // interactive mode
303
+ bool interactive_first = false; // wait for user input immediately
304
+ bool prompt_cache_all = false; // save user input and generations to prompt cache
305
+ bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
306
+
307
+ bool escape = true; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
308
+ bool multiline_input = false; // reverse the usage of `\`
309
+ bool simple_io = false; // improves compatibility with subprocesses and limited consoles
310
+ bool cont_batching = true; // insert new sequences for decoding on-the-fly
311
+ bool flash_attn = false; // flash attention
312
+ bool no_perf = false; // disable performance metrics
313
+ bool ctx_shift = true; // context shift on inifinite text generation
314
+
315
+ bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
316
+ bool logits_all = false; // return logits for all tokens in the batch
317
+ bool use_mmap = true; // use mmap for faster loads
318
+ bool use_mlock = false; // use mlock to keep model in memory
319
+ bool verbose_prompt = false; // print prompt tokens before generation
320
+ bool display_prompt = true; // print prompt before generation
321
+ bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
322
+ bool no_kv_offload = false; // disable KV offloading
323
+ bool warmup = true; // warmup run
324
+ bool check_tensors = false; // validate tensor data
325
+
326
+ lm_ggml_type cache_type_k = LM_GGML_TYPE_F16; // KV cache data type for the K
327
+ lm_ggml_type cache_type_v = LM_GGML_TYPE_F16; // KV cache data type for the V
328
+
329
+ common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
330
+
331
+ // multimodal models (see examples/llava)
332
+ std::string mmproj = ""; // path to multimodal projector // NOLINT
333
+ std::vector<std::string> image; // path to image file(s)
334
+
335
+ // embedding
336
+ bool embedding = false; // get only sentence embedding
337
+ int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
338
+ std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
339
+ std::string embd_sep = "\n"; // separator of embeddings
340
+ bool reranking = false; // enable reranking support on server
341
+
342
+ // server params
343
+ int32_t port = 8080; // server listens on this network port
344
+ int32_t timeout_read = 600; // http read timeout in seconds
345
+ int32_t timeout_write = timeout_read; // http write timeout in seconds
346
+ int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
347
+ int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
348
+
349
+ std::string hostname = "127.0.0.1";
350
+ std::string public_path = ""; // NOLINT
351
+ std::string chat_template = ""; // NOLINT
352
+ bool enable_chat_template = true;
353
+
354
+ std::vector<std::string> api_keys;
355
+
356
+ std::string ssl_file_key = ""; // NOLINT
357
+ std::string ssl_file_cert = ""; // NOLINT
358
+
359
+ // "advanced" endpoints are disabled by default for better security
360
+ bool webui = true;
361
+ bool endpoint_slots = false;
362
+ bool endpoint_props = false; // only control POST requests, not GET
363
+ bool endpoint_metrics = false;
364
+
365
+ bool log_json = false;
366
+
367
+ std::string slot_save_path;
368
+
369
+ float slot_prompt_similarity = 0.5f;
370
+
371
+ // batched-bench params
372
+ bool is_pp_shared = false;
373
+
374
+ std::vector<int32_t> n_pp;
375
+ std::vector<int32_t> n_tg;
376
+ std::vector<int32_t> n_pl;
377
+
378
+ // retrieval params
379
+ std::vector<std::string> context_files; // context files to embed
380
+
381
+ int32_t chunk_size = 64; // chunk size for context embedding
382
+
383
+ std::string chunk_separator = "\n"; // chunk separator for context embedding
384
+
385
+ // passkey params
386
+ int32_t n_junk = 250; // number of times to repeat the junk text
387
+ int32_t i_pos = -1; // position of the passkey in the junk text
388
+
389
+ // imatrix params
390
+ std::string out_file = "imatrix.dat"; // save the resulting imatrix to this file
391
+
392
+ int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations
393
+ int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
394
+ int32_t i_chunk = 0; // start processing from this chunk
395
+
396
+ bool process_output = false; // collect data for the output tensor
397
+ bool compute_ppl = true; // whether to compute perplexity
398
+
399
+ // cvector-generator params
400
+ int n_pca_batch = 100;
401
+ int n_pca_iterations = 1000;
402
+ dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
403
+ std::string cvector_outfile = "control_vector.gguf";
404
+ std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
405
+ std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
406
+
407
+ bool spm_infill = false; // suffix/prefix/middle pattern for infill
408
+
409
+ std::string lora_outfile = "ggml-lora-merged-f16.gguf";
410
+
411
+ // batched-bench params
412
+ bool batched_bench_output_jsonl = false;
413
+ };
414
+
415
+ // call once at the start of a program if it uses libcommon
416
+ // initializes the logging system and prints info about the build
417
+ void common_init();
418
+
419
+ std::string common_params_get_system_info(const common_params & params);
420
+
421
+ bool parse_cpu_range(const std::string & range, bool(&boolmask)[LM_GGML_MAX_N_THREADS]);
422
+ bool parse_cpu_mask(const std::string & mask, bool(&boolmask)[LM_GGML_MAX_N_THREADS]);
423
+ void postprocess_cpu_params(cpu_params & cpuparams, const cpu_params * role_model = nullptr);
424
+ bool set_process_priority(enum lm_ggml_sched_priority prio);
425
+
426
+ //
427
+ // String utils
428
+ //
429
+
430
+ #ifdef __GNUC__
431
+ #ifdef __MINGW32__
432
+ #define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
433
+ #else
434
+ #define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
435
+ #endif
436
+ #else
437
+ #define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
438
+ #endif
439
+
440
+ LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
441
+ std::string string_format(const char * fmt, ...);
442
+
443
+ std::string string_strip(const std::string & str);
444
+ std::string string_get_sortable_timestamp();
445
+
446
+ void string_replace_all(std::string & s, const std::string & search, const std::string & replace);
447
+
448
+ template<class T> // generic splitter: cuts str at every delim and converts each field to T with operator>>
449
+ static std::vector<T> string_split(const std::string & str, char delim) {
450
+ static_assert(!std::is_same<T, std::string>::value, "Please use the specialized version for std::string");
451
+ std::vector<T> values;
452
+ std::istringstream str_stream(str);
453
+ std::string token;
454
+ while (std::getline(str_stream, token, delim)) { // one iteration per field; an empty str yields no iterations and an empty result
455
+ T value; // NOTE(review): default-initialized; when the extraction below fails (e.g. an empty field) an arithmetic T may be pushed without ever being assigned - consider `T value{}`
456
+ std::istringstream token_stream(token);
457
+ token_stream >> value; // formatted extraction; the stream state is not checked afterwards
458
+ values.push_back(value);
459
+ }
460
+ return values;
461
+ }
462
+
463
+ template<> // std::string specialization: plain substring split, no stream parsing
464
+ std::vector<std::string> string_split<std::string>(const std::string & input, char separator)
465
+ {
466
+ std::vector<std::string> parts;
467
+ size_t begin_pos = 0; // start index of the field currently being cut out
468
+ size_t separator_pos = input.find(separator);
469
+ while (separator_pos != std::string::npos) {
470
+ std::string part = input.substr(begin_pos, separator_pos - begin_pos); // may be empty: adjacent separators produce empty fields
471
+ parts.emplace_back(part);
472
+ begin_pos = separator_pos + 1; // resume right after the separator
473
+ separator_pos = input.find(separator, begin_pos);
474
+ }
475
+ parts.emplace_back(input.substr(begin_pos, separator_pos - begin_pos)); // separator_pos is npos here, so this takes the remainder; the result therefore always has at least one element (an empty input gives one empty string)
476
+ return parts;
477
+ }
478
+
479
+ static bool string_starts_with(const std::string & str,
480
+ const std::string & prefix) { // While we wait for C++20's std::string::starts_with...
481
+ return str.rfind(prefix, 0) == 0; // rfind with start position 0 can only match at index 0, so this is a pure prefix test; an empty prefix matches any str
482
+ }
483
+
484
+ static bool string_ends_with(const std::string & str,
485
+ const std::string & suffix) { // While we wait for C++20's std::string::ends_with...
486
+ return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0; // the size check comes first so the unsigned subtraction cannot wrap; then the last suffix.size() characters are compared
487
+ }
488
+
489
+ bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
490
+ void string_process_escapes(std::string & input);
491
+
492
+ std::string string_from(bool value);
493
+ std::string string_from(const std::vector<int> & values);
494
+ std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens);
495
+ std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch);
496
+
497
+ //
498
+ // Filesystem utils
499
+ //
500
+
501
+ bool fs_validate_filename(const std::string & filename);
502
+ bool fs_create_directory_with_parents(const std::string & path);
503
+
504
+ std::string fs_get_cache_directory();
505
+ std::string fs_get_cache_file(const std::string & filename);
506
+
507
+ //
508
+ // Model utils
509
+ //
510
+
511
+ // note: defines object's lifetime (result bundle returned by common_init_from_params)
512
+ struct common_init_result {
513
+ llama_model_ptr model; // NOTE(review): llama_model_ptr is declared outside this header (llama-cpp.h is included above); presumably an owning handle - confirm
514
+ llama_context_ptr context; // NOTE(review): same assumption as for model - confirm
515
+
516
+ std::vector<llama_adapter_lora_ptr> lora; // LoRA adapter handles kept together with the model and context
517
+ };
518
+
519
+ struct common_init_result common_init_from_params(common_params & params);
520
+
521
+ struct llama_model_params common_model_params_to_llama ( common_params & params);
522
+ struct llama_context_params common_context_params_to_llama(const common_params & params);
523
+ struct lm_ggml_threadpool_params lm_ggml_threadpool_params_from_cpu_params(const cpu_params & params);
524
+
525
+ struct llama_model * common_load_model_from_url(
526
+ const std::string & model_url,
527
+ const std::string & local_path,
528
+ const std::string & hf_token,
529
+ const struct llama_model_params & params);
530
+
531
+ struct llama_model * common_load_model_from_hf(
532
+ const std::string & repo,
533
+ const std::string & remote_path,
534
+ const std::string & local_path,
535
+ const std::string & hf_token,
536
+ const struct llama_model_params & params);
537
+
538
+ std::pair<std::string, std::string> common_get_hf_file(
539
+ const std::string & hf_repo_with_tag,
540
+ const std::string & hf_token);
541
+
542
+ // clear LoRA adapters from context, then apply new list of adapters
543
+ void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
544
+
545
+ //
546
+ // Batch utils
547
+ //
548
+
549
+ void common_batch_clear(struct llama_batch & batch);
550
+
551
+ void common_batch_add(
552
+ struct llama_batch & batch,
553
+ llama_token id,
554
+ llama_pos pos,
555
+ const std::vector<llama_seq_id> & seq_ids,
556
+ bool logits);
557
+
558
+ //
559
+ // Token utils
560
+ //
561
+
562
+ // longest common prefix
563
+ size_t common_lcp(const llama_tokens & a, const llama_tokens & b);
564
+
565
+ // longest common subsequence
566
+ size_t common_lcs(const llama_tokens & a, const llama_tokens & b);
567
+
568
+ //
569
+ // Vocab utils
570
+ //
571
+
572
+ // tokenizes a string into a vector of tokens
573
+ // should work similar to Python's `tokenizer.encode`
574
+ std::vector<llama_token> common_tokenize(
575
+ const struct llama_context * ctx,
576
+ const std::string & text,
577
+ bool add_special,
578
+ bool parse_special = false);
579
+
580
+ std::vector<llama_token> common_tokenize(
581
+ const struct llama_vocab * vocab,
582
+ const std::string & text,
583
+ bool add_special,
584
+ bool parse_special = false);
585
+
586
+ // tokenizes a token into a piece, optionally renders special/control tokens
587
+ // should work similar to Python's `tokenizer.id_to_piece`
588
+ std::string common_token_to_piece(
589
+ const struct llama_context * ctx,
590
+ llama_token token,
591
+ bool special = true);
592
+
593
+ std::string common_token_to_piece(
594
+ const struct llama_vocab * vocab,
595
+ llama_token token,
596
+ bool special = true);
597
+
598
+ // detokenizes a vector of tokens into a string
599
+ // should work similar to Python's `tokenizer.decode`
600
+ // optionally renders special/control tokens
601
+ std::string common_detokenize(
602
+ const struct llama_context * ctx,
603
+ const std::vector<llama_token> & tokens,
604
+ bool special = true);
605
+
606
+ std::string common_detokenize(
607
+ const struct llama_vocab * vocab,
608
+ const std::vector<llama_token> & tokens,
609
+ bool special = true);
610
+
611
+ //
612
+ // Chat template utils
613
+ //
614
+
615
+ // same as llama_chat_message, but uses std::string
616
+ struct common_chat_msg {
617
+ std::string role; // role of the message author
618
+ std::string content; // message text
619
+ };
620
+
621
+ // Get the built-in chat template for the model. Return empty string if not present.
622
+ std::string common_get_builtin_chat_template(const struct llama_model * model);
623
+
624
+ // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
625
+ bool common_chat_verify_template(const std::string & tmpl);
626
+
627
+ // CPP wrapper for llama_chat_apply_template
628
+ // If the built-in template is not supported, we default to chatml
629
+ // If the custom "tmpl" is not supported, we throw an error
630
+ std::string common_chat_apply_template(const struct llama_model * model,
631
+ const std::string & tmpl,
632
+ const std::vector<common_chat_msg> & chat,
633
+ bool add_ass);
634
+
635
+ // Format single message, while taking into account the position of that message in chat history
636
+ std::string common_chat_format_single(const struct llama_model * model,
637
+ const std::string & tmpl,
638
+ const std::vector<common_chat_msg> & past_msg,
639
+ const common_chat_msg & new_msg,
640
+ bool add_ass);
641
+
642
+ // Returns an example of formatted chat
643
+ std::string common_chat_format_example(const struct llama_model * model,
644
+ const std::string & tmpl);
645
+
646
+ //
647
+ // KV cache utils
648
+ //
649
+
650
+ // Dump the KV cache view with the number of sequences per cell.
651
+ void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
652
+
653
+ // Dump the KV cache view showing individual sequences in each cell (long output).
654
+ void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
655
+
656
+ //
657
+ // Embedding utils
658
+ //
659
+
660
+ // TODO: replace embd_norm with an enum
661
+ void common_embd_normalize(const float * inp, float * out, int n, int embd_norm);
662
+
663
+ float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);
664
+
665
+ //
666
+ // Control vector utils
667
+ //
668
+
669
+ struct common_control_vector_data { // result type of common_control_vector_load
670
+ int n_embd; // number of values per layer in data; -1 when loading failed (the loader returns {-1, empty} on error)
671
+
672
+ // stores data for layers [1, n_layer] where n_layer = data.size() / n_embd
673
+ std::vector<float> data;
674
+ };
675
+
676
+ struct common_control_vector_load_info { // one input entry for common_control_vector_load
677
+ float strength; // scale applied to this control vector before the vectors are added together
678
+
679
+ std::string fname; // NOTE(review): presumably the file the control vector is read from - confirm against the loader
680
+ };
681
+
682
+ // Load control vectors, scale each by strength, and add them together.
683
+ // On error, returns {-1, empty}
684
+ common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos);
685
+
686
+ //
687
+ // Split utils
688
+ //
689
+
690
+ namespace { // NOTE(review): unnamed namespace inside a header - every translation unit that includes this file gets its own private copy of the constants below
691
+
692
+ const char * const LLM_KV_SPLIT_NO = "split.no"; // NOTE(review): presumably metadata key names used for split model files - confirm against the users of these constants
693
+ const char * const LLM_KV_SPLIT_COUNT = "split.count";
694
+ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
695
+
696
+ }