cui-llama.rn 1.4.3 → 1.4.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134) hide show
  1. package/README.md +93 -114
  2. package/android/src/main/CMakeLists.txt +5 -0
  3. package/android/src/main/java/com/rnllama/LlamaContext.java +91 -17
  4. package/android/src/main/java/com/rnllama/RNLlama.java +37 -4
  5. package/android/src/main/jni-utils.h +6 -0
  6. package/android/src/main/jni.cpp +289 -31
  7. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  9. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  10. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  11. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  12. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  13. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  14. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  15. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +7 -2
  16. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +7 -2
  17. package/cpp/chat-template.hpp +529 -0
  18. package/cpp/chat.cpp +1779 -0
  19. package/cpp/chat.h +135 -0
  20. package/cpp/common.cpp +2064 -1873
  21. package/cpp/common.h +700 -699
  22. package/cpp/ggml-alloc.c +1039 -1042
  23. package/cpp/ggml-alloc.h +1 -1
  24. package/cpp/ggml-backend-impl.h +255 -255
  25. package/cpp/ggml-backend-reg.cpp +586 -582
  26. package/cpp/ggml-backend.cpp +2004 -2002
  27. package/cpp/ggml-backend.h +354 -354
  28. package/cpp/ggml-common.h +1851 -1853
  29. package/cpp/ggml-cpp.h +39 -39
  30. package/cpp/ggml-cpu-aarch64.cpp +4248 -4247
  31. package/cpp/ggml-cpu-aarch64.h +8 -8
  32. package/cpp/ggml-cpu-impl.h +531 -386
  33. package/cpp/ggml-cpu-quants.c +12527 -10920
  34. package/cpp/ggml-cpu-traits.cpp +36 -36
  35. package/cpp/ggml-cpu-traits.h +38 -38
  36. package/cpp/ggml-cpu.c +15766 -14391
  37. package/cpp/ggml-cpu.cpp +655 -635
  38. package/cpp/ggml-cpu.h +138 -135
  39. package/cpp/ggml-impl.h +567 -567
  40. package/cpp/ggml-metal-impl.h +235 -0
  41. package/cpp/ggml-metal.h +1 -1
  42. package/cpp/ggml-metal.m +5146 -4884
  43. package/cpp/ggml-opt.cpp +854 -854
  44. package/cpp/ggml-opt.h +216 -216
  45. package/cpp/ggml-quants.c +5238 -5238
  46. package/cpp/ggml-threading.h +14 -14
  47. package/cpp/ggml.c +6529 -6514
  48. package/cpp/ggml.h +2198 -2194
  49. package/cpp/gguf.cpp +1329 -1329
  50. package/cpp/gguf.h +202 -202
  51. package/cpp/json-schema-to-grammar.cpp +1024 -1045
  52. package/cpp/json-schema-to-grammar.h +21 -8
  53. package/cpp/json.hpp +24766 -24766
  54. package/cpp/llama-adapter.cpp +347 -347
  55. package/cpp/llama-adapter.h +74 -74
  56. package/cpp/llama-arch.cpp +1513 -1487
  57. package/cpp/llama-arch.h +403 -400
  58. package/cpp/llama-batch.cpp +368 -368
  59. package/cpp/llama-batch.h +88 -88
  60. package/cpp/llama-chat.cpp +588 -578
  61. package/cpp/llama-chat.h +53 -52
  62. package/cpp/llama-context.cpp +1775 -1775
  63. package/cpp/llama-context.h +128 -128
  64. package/cpp/llama-cparams.cpp +1 -1
  65. package/cpp/llama-cparams.h +37 -37
  66. package/cpp/llama-cpp.h +30 -30
  67. package/cpp/llama-grammar.cpp +1219 -1139
  68. package/cpp/llama-grammar.h +173 -143
  69. package/cpp/llama-hparams.cpp +71 -71
  70. package/cpp/llama-hparams.h +139 -139
  71. package/cpp/llama-impl.cpp +167 -167
  72. package/cpp/llama-impl.h +61 -61
  73. package/cpp/llama-kv-cache.cpp +718 -718
  74. package/cpp/llama-kv-cache.h +219 -218
  75. package/cpp/llama-mmap.cpp +600 -590
  76. package/cpp/llama-mmap.h +68 -67
  77. package/cpp/llama-model-loader.cpp +1124 -1124
  78. package/cpp/llama-model-loader.h +167 -167
  79. package/cpp/llama-model.cpp +4087 -3997
  80. package/cpp/llama-model.h +370 -370
  81. package/cpp/llama-sampling.cpp +2558 -2408
  82. package/cpp/llama-sampling.h +32 -32
  83. package/cpp/llama-vocab.cpp +3264 -3247
  84. package/cpp/llama-vocab.h +125 -125
  85. package/cpp/llama.cpp +10284 -10077
  86. package/cpp/llama.h +1354 -1323
  87. package/cpp/log.cpp +393 -401
  88. package/cpp/log.h +132 -121
  89. package/cpp/minja/chat-template.hpp +529 -0
  90. package/cpp/minja/minja.hpp +2915 -0
  91. package/cpp/minja.hpp +2915 -0
  92. package/cpp/rn-llama.cpp +66 -6
  93. package/cpp/rn-llama.h +26 -1
  94. package/cpp/sampling.cpp +570 -505
  95. package/cpp/sampling.h +3 -0
  96. package/cpp/sgemm.cpp +2598 -2597
  97. package/cpp/sgemm.h +14 -14
  98. package/cpp/speculative.cpp +278 -277
  99. package/cpp/speculative.h +28 -28
  100. package/cpp/unicode.cpp +9 -2
  101. package/ios/CMakeLists.txt +6 -0
  102. package/ios/RNLlama.h +0 -8
  103. package/ios/RNLlama.mm +27 -3
  104. package/ios/RNLlamaContext.h +10 -1
  105. package/ios/RNLlamaContext.mm +269 -57
  106. package/jest/mock.js +21 -2
  107. package/lib/commonjs/NativeRNLlama.js.map +1 -1
  108. package/lib/commonjs/grammar.js +3 -0
  109. package/lib/commonjs/grammar.js.map +1 -1
  110. package/lib/commonjs/index.js +87 -13
  111. package/lib/commonjs/index.js.map +1 -1
  112. package/lib/module/NativeRNLlama.js.map +1 -1
  113. package/lib/module/grammar.js +3 -0
  114. package/lib/module/grammar.js.map +1 -1
  115. package/lib/module/index.js +86 -13
  116. package/lib/module/index.js.map +1 -1
  117. package/lib/typescript/NativeRNLlama.d.ts +107 -2
  118. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  119. package/lib/typescript/grammar.d.ts.map +1 -1
  120. package/lib/typescript/index.d.ts +32 -7
  121. package/lib/typescript/index.d.ts.map +1 -1
  122. package/llama-rn.podspec +1 -1
  123. package/package.json +3 -2
  124. package/src/NativeRNLlama.ts +115 -3
  125. package/src/grammar.ts +3 -0
  126. package/src/index.ts +138 -21
  127. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeCCompiler.cmake +0 -81
  128. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeSystem.cmake +0 -15
  129. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdC/CMakeCCompilerId.c +0 -904
  130. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdC/CMakeCCompilerId.o +0 -0
  131. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdCXX/CMakeCXXCompilerId.cpp +0 -919
  132. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdCXX/CMakeCXXCompilerId.o +0 -0
  133. package/android/src/main/build-arm64/CMakeFiles/CMakeConfigureLog.yaml +0 -55
  134. package/cpp/rn-llama.hpp +0 -913
package/cpp/common.h CHANGED
@@ -1,699 +1,700 @@
1
- // Various helper functions and utilities
2
-
3
- #pragma once
4
-
5
- #include "llama-cpp.h"
6
-
7
- #include <string>
8
- #include <vector>
9
- #include <sstream>
10
-
11
- #ifdef _WIN32
12
- #define DIRECTORY_SEPARATOR '\\'
13
- #else
14
- #define DIRECTORY_SEPARATOR '/'
15
- #endif // _WIN32
16
-
17
- #define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
18
- #define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
19
-
20
- #define print_build_info() do { \
21
- fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
22
- fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
23
- } while(0)
24
-
25
- #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
26
-
27
- struct common_adapter_lora_info {
28
- std::string path;
29
- float scale;
30
-
31
- struct llama_adapter_lora * ptr;
32
- };
33
-
34
- using llama_tokens = std::vector<llama_token>;
35
-
36
- // build info
37
- extern int LLAMA_BUILD_NUMBER;
38
- extern const char * LLAMA_COMMIT;
39
- extern const char * LLAMA_COMPILER;
40
- extern const char * LLAMA_BUILD_TARGET;
41
-
42
- struct common_control_vector_load_info;
43
-
44
- #define print_build_info() do { \
45
- fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
46
- fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
47
- } while(0)
48
-
49
- // build info
50
- extern int LLAMA_BUILD_NUMBER;
51
- extern char const *LLAMA_COMMIT;
52
- extern char const *LLAMA_COMPILER;
53
- extern char const *LLAMA_BUILD_TARGET;
54
-
55
- //
56
- // CPU utils
57
- //
58
-
59
- struct cpu_params {
60
- int n_threads = -1;
61
- bool cpumask[LM_GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
62
- bool mask_valid = false; // Default: any CPU
63
- enum lm_ggml_sched_priority priority = LM_GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
64
- bool strict_cpu = false; // Use strict CPU placement
65
- uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
66
- };
67
-
68
- int32_t cpu_get_num_physical_cores();
69
- int32_t cpu_get_num_math();
70
-
71
- //
72
- // Common params
73
- //
74
-
75
- enum llama_example {
76
- LLAMA_EXAMPLE_COMMON,
77
- LLAMA_EXAMPLE_SPECULATIVE,
78
- LLAMA_EXAMPLE_MAIN,
79
- LLAMA_EXAMPLE_INFILL,
80
- LLAMA_EXAMPLE_EMBEDDING,
81
- LLAMA_EXAMPLE_PERPLEXITY,
82
- LLAMA_EXAMPLE_RETRIEVAL,
83
- LLAMA_EXAMPLE_PASSKEY,
84
- LLAMA_EXAMPLE_IMATRIX,
85
- LLAMA_EXAMPLE_BENCH,
86
- LLAMA_EXAMPLE_SERVER,
87
- LLAMA_EXAMPLE_CVECTOR_GENERATOR,
88
- LLAMA_EXAMPLE_EXPORT_LORA,
89
- LLAMA_EXAMPLE_LLAVA,
90
- LLAMA_EXAMPLE_LOOKUP,
91
- LLAMA_EXAMPLE_PARALLEL,
92
- LLAMA_EXAMPLE_TTS,
93
-
94
- LLAMA_EXAMPLE_COUNT,
95
- };
96
-
97
- enum common_sampler_type {
98
- COMMON_SAMPLER_TYPE_NONE = 0,
99
- COMMON_SAMPLER_TYPE_DRY = 1,
100
- COMMON_SAMPLER_TYPE_TOP_K = 2,
101
- COMMON_SAMPLER_TYPE_TOP_P = 3,
102
- COMMON_SAMPLER_TYPE_MIN_P = 4,
103
- //COMMON_SAMPLER_TYPE_TFS_Z = 5,
104
- COMMON_SAMPLER_TYPE_TYPICAL_P = 6,
105
- COMMON_SAMPLER_TYPE_TEMPERATURE = 7,
106
- COMMON_SAMPLER_TYPE_XTC = 8,
107
- COMMON_SAMPLER_TYPE_INFILL = 9,
108
- COMMON_SAMPLER_TYPE_PENALTIES = 10,
109
- };
110
-
111
- // dimensionality reduction methods, used by cvector-generator
112
- enum dimre_method {
113
- DIMRE_METHOD_PCA,
114
- DIMRE_METHOD_MEAN,
115
- };
116
-
117
- enum common_conversation_mode {
118
- COMMON_CONVERSATION_MODE_DISABLED = 0,
119
- COMMON_CONVERSATION_MODE_ENABLED = 1,
120
- COMMON_CONVERSATION_MODE_AUTO = 2,
121
- };
122
-
123
- // sampling parameters
124
- struct common_params_sampling {
125
- uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
126
-
127
- int32_t n_prev = 64; // number of previous tokens to remember
128
- int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
129
- int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
130
- int32_t top_k = 40; // <= 0 to use vocab size
131
- float top_p = 0.95f; // 1.0 = disabled
132
- float min_p = 0.05f; // 0.0 = disabled
133
- float xtc_probability = 0.00f; // 0.0 = disabled
134
- float xtc_threshold = 0.10f; // > 0.5 disables XTC
135
- float typ_p = 1.00f; // typical_p, 1.0 = disabled
136
- float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
137
- float dynatemp_range = 0.00f; // 0.0 = disabled
138
- float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
139
- int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
140
- float penalty_repeat = 1.00f; // 1.0 = disabled
141
- float penalty_freq = 0.00f; // 0.0 = disabled
142
- float penalty_present = 0.00f; // 0.0 = disabled
143
- float dry_multiplier = 0.0f; // 0.0 = disabled; DRY repetition penalty for tokens extending repetition:
144
- float dry_base = 1.75f; // 0.0 = disabled; multiplier * base ^ (length of sequence before token - allowed length)
145
- int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty
146
- int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
147
- int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
148
- float mirostat_tau = 5.00f; // target entropy
149
- float mirostat_eta = 0.10f; // learning rate
150
- bool ignore_eos = false;
151
- bool no_perf = false; // disable performance metrics
152
- bool timing_per_token = false;
153
-
154
- std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY
155
-
156
-
157
- std::vector<enum common_sampler_type> samplers = {
158
- COMMON_SAMPLER_TYPE_PENALTIES,
159
- COMMON_SAMPLER_TYPE_DRY,
160
- COMMON_SAMPLER_TYPE_TOP_K,
161
- COMMON_SAMPLER_TYPE_TYPICAL_P,
162
- COMMON_SAMPLER_TYPE_TOP_P,
163
- COMMON_SAMPLER_TYPE_MIN_P,
164
- COMMON_SAMPLER_TYPE_XTC,
165
- COMMON_SAMPLER_TYPE_TEMPERATURE,
166
- };
167
-
168
- std::string grammar; // optional BNF-like grammar to constrain sampling
169
-
170
- std::vector<llama_logit_bias> logit_bias; // logit biases to apply
171
-
172
- // print the parameters into a string
173
- std::string print() const;
174
- };
175
-
176
- struct common_params_speculative {
177
- std::vector<lm_ggml_backend_dev_t> devices; // devices to use for offloading
178
-
179
- int32_t n_ctx = 0; // draft context size
180
- int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
181
- int32_t n_min = 5; // minimum number of draft tokens to use for speculative decoding
182
- int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
183
- float p_split = 0.1f; // speculative decoding split probability
184
- float p_min = 0.9f; // minimum speculative decoding probability (greedy)
185
-
186
- struct cpu_params cpuparams;
187
- struct cpu_params cpuparams_batch;
188
-
189
- std::string hf_repo = ""; // HF repo // NOLINT
190
- std::string hf_file = ""; // HF file // NOLINT
191
-
192
- std::string model = ""; // draft model for speculative decoding // NOLINT
193
- std::string model_url = ""; // model url to download // NOLINT
194
- };
195
-
196
- struct common_params_vocoder {
197
- std::string hf_repo = ""; // HF repo // NOLINT
198
- std::string hf_file = ""; // HF file // NOLINT
199
-
200
- std::string model = ""; // model path // NOLINT
201
- std::string model_url = ""; // model url to download // NOLINT
202
-
203
- bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy // NOLINT
204
- };
205
-
206
- struct common_params {
207
-
208
- void * progress_callback_user_data = nullptr;
209
- llama_progress_callback progress_callback = nullptr;
210
- bool vocab_only = false;
211
- int32_t n_predict = -1; // new tokens to predict
212
- int32_t n_ctx = 4096; // context size
213
- int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
214
- int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
215
- int32_t n_keep = 0; // number of tokens to keep from initial prompt
216
- int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
217
- int32_t n_parallel = 1; // number of parallel sequences to decode
218
- int32_t n_sequences = 1; // number of sequences to decode
219
- int32_t grp_attn_n = 1; // group-attention factor
220
- int32_t grp_attn_w = 512; // group-attention width
221
- int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
222
- float rope_freq_base = 0.0f; // RoPE base frequency
223
- float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
224
- float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
225
- float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor
226
- float yarn_beta_fast = 32.0f; // YaRN low correction dim
227
- float yarn_beta_slow = 1.0f; // YaRN high correction dim
228
- int32_t yarn_orig_ctx = 0; // YaRN original context length
229
- float defrag_thold = 0.1f; // KV cache defragmentation threshold
230
-
231
- // offload params
232
- std::vector<lm_ggml_backend_dev_t> devices; // devices to use for offloading
233
-
234
- int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
235
- int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
236
- float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
237
-
238
- enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
239
-
240
- struct cpu_params cpuparams;
241
- struct cpu_params cpuparams_batch;
242
-
243
- lm_ggml_backend_sched_eval_callback cb_eval = nullptr;
244
- void * cb_eval_user_data = nullptr;
245
-
246
- lm_ggml_numa_strategy numa = LM_GGML_NUMA_STRATEGY_DISABLED;
247
-
248
- enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
249
- enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
250
- enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
251
-
252
- struct common_params_sampling sampling;
253
- struct common_params_speculative speculative;
254
- struct common_params_vocoder vocoder;
255
-
256
- std::string model = ""; // model path // NOLINT
257
- std::string model_alias = ""; // model alias // NOLINT
258
- std::string model_url = ""; // model url to download // NOLINT
259
- std::string hf_token = ""; // HF token // NOLINT
260
- std::string hf_repo = ""; // HF repo // NOLINT
261
- std::string hf_file = ""; // HF file // NOLINT
262
- std::string prompt = ""; // NOLINT
263
- std::string prompt_file = ""; // store the external prompt file name // NOLINT
264
- std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT
265
- std::string input_prefix = ""; // string to prefix user inputs with // NOLINT
266
- std::string input_suffix = ""; // string to suffix user inputs with // NOLINT
267
- std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
268
- std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
269
- std::string logits_file = ""; // file for saving *all* logits // NOLINT
270
-
271
- std::vector<std::string> in_files; // all input files
272
- std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
273
- std::vector<llama_model_kv_override> kv_overrides;
274
-
275
- bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
276
- std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale
277
-
278
- std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale
279
-
280
- int32_t verbosity = 0;
281
- int32_t control_vector_layer_start = -1; // layer range for control vector
282
- int32_t control_vector_layer_end = -1; // layer range for control vector
283
-
284
- int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
285
- int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
286
- // (which is more convenient to use for plotting)
287
- //
288
- bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
289
- size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
290
-
291
- bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt
292
- size_t winogrande_tasks = 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
293
-
294
- bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
295
- size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
296
-
297
- bool kl_divergence = false; // compute KL divergence
298
-
299
- bool usage = false; // print usage
300
- bool use_color = false; // use color to distinguish generations and inputs
301
- bool special = false; // enable special token output
302
- bool interactive = false; // interactive mode
303
- bool interactive_first = false; // wait for user input immediately
304
- bool prompt_cache_all = false; // save user input and generations to prompt cache
305
- bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
306
-
307
- bool escape = true; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
308
- bool multiline_input = false; // reverse the usage of `\`
309
- bool simple_io = false; // improves compatibility with subprocesses and limited consoles
310
- bool cont_batching = true; // insert new sequences for decoding on-the-fly
311
- bool flash_attn = false; // flash attention
312
- bool no_perf = false; // disable performance metrics
313
- bool ctx_shift = true; // context shift on inifinite text generation
314
-
315
- bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
316
- bool logits_all = false; // return logits for all tokens in the batch
317
- bool use_mmap = true; // use mmap for faster loads
318
- bool use_mlock = false; // use mlock to keep model in memory
319
- bool verbose_prompt = false; // print prompt tokens before generation
320
- bool display_prompt = true; // print prompt before generation
321
- bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
322
- bool no_kv_offload = false; // disable KV offloading
323
- bool warmup = true; // warmup run
324
- bool check_tensors = false; // validate tensor data
325
-
326
- lm_ggml_type cache_type_k = LM_GGML_TYPE_F16; // KV cache data type for the K
327
- lm_ggml_type cache_type_v = LM_GGML_TYPE_F16; // KV cache data type for the V
328
-
329
- common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
330
-
331
- // multimodal models (see examples/llava)
332
- std::string mmproj = ""; // path to multimodal projector // NOLINT
333
- std::vector<std::string> image; // path to image file(s)
334
-
335
- // embedding
336
- bool embedding = false; // get only sentence embedding
337
- int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
338
- std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
339
- std::string embd_sep = "\n"; // separator of embeddings
340
- bool reranking = false; // enable reranking support on server
341
-
342
- // server params
343
- int32_t port = 8080; // server listens on this network port
344
- int32_t timeout_read = 600; // http read timeout in seconds
345
- int32_t timeout_write = timeout_read; // http write timeout in seconds
346
- int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
347
- int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
348
-
349
- std::string hostname = "127.0.0.1";
350
- std::string public_path = ""; // NOLINT
351
- std::string chat_template = ""; // NOLINT
352
- bool enable_chat_template = true;
353
-
354
- std::vector<std::string> api_keys;
355
-
356
- std::string ssl_file_key = ""; // NOLINT
357
- std::string ssl_file_cert = ""; // NOLINT
358
-
359
- // "advanced" endpoints are disabled by default for better security
360
- bool webui = true;
361
- bool endpoint_slots = false;
362
- bool endpoint_props = false; // only control POST requests, not GET
363
- bool endpoint_metrics = false;
364
-
365
- bool log_json = false;
366
-
367
- std::string slot_save_path;
368
-
369
- float slot_prompt_similarity = 0.5f;
370
-
371
- // batched-bench params
372
- bool is_pp_shared = false;
373
-
374
- std::vector<int32_t> n_pp;
375
- std::vector<int32_t> n_tg;
376
- std::vector<int32_t> n_pl;
377
-
378
- // retrieval params
379
- std::vector<std::string> context_files; // context files to embed
380
-
381
- int32_t chunk_size = 64; // chunk size for context embedding
382
-
383
- std::string chunk_separator = "\n"; // chunk separator for context embedding
384
-
385
- // passkey params
386
- int32_t n_junk = 250; // number of times to repeat the junk text
387
- int32_t i_pos = -1; // position of the passkey in the junk text
388
-
389
- // imatrix params
390
- std::string out_file = "imatrix.dat"; // save the resulting imatrix to this file
391
-
392
- int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations
393
- int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
394
- int32_t i_chunk = 0; // start processing from this chunk
395
-
396
- bool process_output = false; // collect data for the output tensor
397
- bool compute_ppl = true; // whether to compute perplexity
398
-
399
- // cvector-generator params
400
- int n_pca_batch = 100;
401
- int n_pca_iterations = 1000;
402
- dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
403
- std::string cvector_outfile = "control_vector.gguf";
404
- std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
405
- std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
406
-
407
- bool spm_infill = false; // suffix/prefix/middle pattern for infill
408
-
409
- std::string lora_outfile = "ggml-lora-merged-f16.gguf";
410
-
411
- // batched-bench params
412
- bool batched_bench_output_jsonl = false;
413
- };
414
-
415
- // call once at the start of a program if it uses libcommon
416
- // initializes the logging system and prints info about the build
417
- void common_init();
418
-
419
- std::string common_params_get_system_info(const common_params & params);
420
-
421
- bool parse_cpu_range(const std::string & range, bool(&boolmask)[LM_GGML_MAX_N_THREADS]);
422
- bool parse_cpu_mask(const std::string & mask, bool(&boolmask)[LM_GGML_MAX_N_THREADS]);
423
- void postprocess_cpu_params(cpu_params & cpuparams, const cpu_params * role_model = nullptr);
424
- bool set_process_priority(enum lm_ggml_sched_priority prio);
425
-
426
- //
427
- // String utils
428
- //
429
-
430
- #ifdef __GNUC__
431
- #ifdef __MINGW32__
432
- #define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
433
- #else
434
- #define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
435
- #endif
436
- #else
437
- #define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
438
- #endif
439
-
440
- LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
441
- std::string string_format(const char * fmt, ...);
442
-
443
- std::string string_strip(const std::string & str);
444
- std::string string_get_sortable_timestamp();
445
-
446
- void string_replace_all(std::string & s, const std::string & search, const std::string & replace);
447
-
448
- template<class T>
449
- static std::vector<T> string_split(const std::string & str, char delim) {
450
- static_assert(!std::is_same<T, std::string>::value, "Please use the specialized version for std::string");
451
- std::vector<T> values;
452
- std::istringstream str_stream(str);
453
- std::string token;
454
- while (std::getline(str_stream, token, delim)) {
455
- T value;
456
- std::istringstream token_stream(token);
457
- token_stream >> value;
458
- values.push_back(value);
459
- }
460
- return values;
461
- }
462
-
463
- template<>
464
- std::vector<std::string> string_split<std::string>(const std::string & input, char separator)
465
- {
466
- std::vector<std::string> parts;
467
- size_t begin_pos = 0;
468
- size_t separator_pos = input.find(separator);
469
- while (separator_pos != std::string::npos) {
470
- std::string part = input.substr(begin_pos, separator_pos - begin_pos);
471
- parts.emplace_back(part);
472
- begin_pos = separator_pos + 1;
473
- separator_pos = input.find(separator, begin_pos);
474
- }
475
- parts.emplace_back(input.substr(begin_pos, separator_pos - begin_pos));
476
- return parts;
477
- }
478
-
479
- static bool string_starts_with(const std::string & str,
480
- const std::string & prefix) { // While we wait for C++20's std::string::starts_with...
481
- return str.rfind(prefix, 0) == 0;
482
- }
483
-
484
- static bool string_ends_with(const std::string & str,
485
- const std::string & suffix) { // While we wait for C++20's std::string::ends_with...
486
- return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
487
- }
488
-
489
- bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
490
- void string_process_escapes(std::string & input);
491
-
492
- std::string string_from(bool value);
493
- std::string string_from(const std::vector<int> & values);
494
- std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens);
495
- std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch);
496
-
497
- //
498
- // Filesystem utils
499
- //
500
-
501
- bool fs_validate_filename(const std::string & filename);
502
- bool fs_create_directory_with_parents(const std::string & path);
503
-
504
- std::string fs_get_cache_directory();
505
- std::string fs_get_cache_file(const std::string & filename);
506
-
507
- //
508
- // Model utils
509
- //
510
-
511
- // note: defines object's lifetime
512
- struct common_init_result {
513
- llama_model_ptr model;
514
- llama_context_ptr context;
515
-
516
- std::vector<llama_adapter_lora_ptr> lora;
517
- };
518
-
519
- struct common_init_result common_init_from_params(common_params & params);
520
-
521
- struct llama_model_params common_model_params_to_llama ( common_params & params);
522
- struct llama_context_params common_context_params_to_llama(const common_params & params);
523
- struct lm_ggml_threadpool_params lm_ggml_threadpool_params_from_cpu_params(const cpu_params & params);
524
-
525
- struct llama_model * common_load_model_from_url(
526
- const std::string & model_url,
527
- const std::string & local_path,
528
- const std::string & hf_token,
529
- const struct llama_model_params & params);
530
-
531
- struct llama_model * common_load_model_from_hf(
532
- const std::string & repo,
533
- const std::string & remote_path,
534
- const std::string & local_path,
535
- const std::string & hf_token,
536
- const struct llama_model_params & params);
537
- std::pair<std::string, std::string> common_get_hf_file(
538
- const std::string & hf_repo_with_tag,
539
- const std::string & hf_token);
540
-
541
- std::pair<std::string, std::string> common_get_hf_file(
542
- const std::string & hf_repo_with_tag,
543
- const std::string & hf_token);
544
-
545
- // clear LoRA adapters from context, then apply new list of adapters
546
- void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
547
-
548
- //
549
- // Batch utils
550
- //
551
-
552
- void common_batch_clear(struct llama_batch & batch);
553
-
554
- void common_batch_add(
555
- struct llama_batch & batch,
556
- llama_token id,
557
- llama_pos pos,
558
- const std::vector<llama_seq_id> & seq_ids,
559
- bool logits);
560
-
561
- //
562
- // Token utils
563
- //
564
-
565
- // longest common prefix
566
- size_t common_lcp(const llama_tokens & a, const llama_tokens & b);
567
-
568
- // longet common subsequence
569
- size_t common_lcs(const llama_tokens & a, const llama_tokens & b);
570
-
571
- //
572
- // Vocab utils
573
- //
574
-
575
- // tokenizes a string into a vector of tokens
576
- // should work similar to Python's `tokenizer.encode`
577
- std::vector<llama_token> common_tokenize(
578
- const struct llama_context * ctx,
579
- const std::string & text,
580
- bool add_special,
581
- bool parse_special = false);
582
-
583
- std::vector<llama_token> common_tokenize(
584
- const struct llama_vocab * vocab,
585
- const std::string & text,
586
- bool add_special,
587
- bool parse_special = false);
588
-
589
- // tokenizes a token into a piece, optionally renders special/control tokens
590
- // should work similar to Python's `tokenizer.id_to_piece`
591
- std::string common_token_to_piece(
592
- const struct llama_context * ctx,
593
- llama_token token,
594
- bool special = true);
595
-
596
- std::string common_token_to_piece(
597
- const struct llama_vocab * vocab,
598
- llama_token token,
599
- bool special = true);
600
-
601
- // detokenizes a vector of tokens into a string
602
- // should work similar to Python's `tokenizer.decode`
603
- // optionally renders special/control tokens
604
- std::string common_detokenize(
605
- const struct llama_context * ctx,
606
- const std::vector<llama_token> & tokens,
607
- bool special = true);
608
-
609
- std::string common_detokenize(
610
- const struct llama_vocab * vocab,
611
- const std::vector<llama_token> & tokens,
612
- bool special = true);
613
-
614
- //
615
- // Chat template utils
616
- //
617
-
618
- // same with llama_chat_message, but uses std::string
619
- struct common_chat_msg {
620
- std::string role;
621
- std::string content;
622
- };
623
-
624
- // Get the built-in chat template for the model. Return empty string if not present.
625
- std::string common_get_builtin_chat_template(const struct llama_model * model);
626
-
627
- // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
628
- bool common_chat_verify_template(const std::string & tmpl);
629
-
630
- // CPP wrapper for llama_chat_apply_template
631
- // If the built-in template is not supported, we default to chatml
632
- // If the custom "tmpl" is not supported, we throw an error
633
- std::string common_chat_apply_template(const struct llama_model * model,
634
- const std::string & tmpl,
635
- const std::vector<common_chat_msg> & chat,
636
- bool add_ass);
637
-
638
- // Format single message, while taking into account the position of that message in chat history
639
- std::string common_chat_format_single(const struct llama_model * model,
640
- const std::string & tmpl,
641
- const std::vector<common_chat_msg> & past_msg,
642
- const common_chat_msg & new_msg,
643
- bool add_ass);
644
-
645
- // Returns an example of formatted chat
646
- std::string common_chat_format_example(const struct llama_model * model,
647
- const std::string & tmpl);
648
-
649
- //
650
- // KV cache utils
651
- //
652
-
653
- // Dump the KV cache view with the number of sequences per cell.
654
- void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
655
-
656
- // Dump the KV cache view showing individual sequences in each cell (long output).
657
- void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
658
-
659
- //
660
- // Embedding utils
661
- //
662
-
663
- // TODO: repace embd_norm with an enum
664
- void common_embd_normalize(const float * inp, float * out, int n, int embd_norm);
665
-
666
- float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);
667
-
668
- //
669
- // Control vector utils
670
- //
671
-
672
- struct common_control_vector_data {
673
- int n_embd;
674
-
675
- // stores data for layers [1, n_layer] where n_layer = data.size() / n_embd
676
- std::vector<float> data;
677
- };
678
-
679
- struct common_control_vector_load_info {
680
- float strength;
681
-
682
- std::string fname;
683
- };
684
-
685
- // Load control vectors, scale each by strength, and add them together.
686
- // On error, returns {-1, empty}
687
- common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos);
688
-
689
- //
690
- // Split utils
691
- //
692
-
693
- namespace {
694
-
695
- const char * const LLM_KV_SPLIT_NO = "split.no";
696
- const char * const LLM_KV_SPLIT_COUNT = "split.count";
697
- const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
698
-
699
- }
1
+ // Various helper functions and utilities
2
+
3
+ #pragma once
4
+
5
+ #include "llama-cpp.h"
6
+
7
+ #include <set>
8
+ #include <string>
9
+ #include <vector>
10
+ #include <sstream>
11
+
12
+ #ifdef _WIN32
13
+ #define DIRECTORY_SEPARATOR '\\'
14
+ #else
15
+ #define DIRECTORY_SEPARATOR '/'
16
+ #endif // _WIN32
17
+
18
+ #define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
19
+ #define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
20
+
21
+ #define print_build_info() do { \
22
+ fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
23
+ fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
24
+ } while(0)
25
+
26
+ #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
27
+
28
+ struct common_adapter_lora_info {
29
+ std::string path;
30
+ float scale;
31
+
32
+ struct llama_adapter_lora * ptr;
33
+ };
34
+
35
+ using llama_tokens = std::vector<llama_token>;
36
+
37
+ // build info
38
+ extern int LLAMA_BUILD_NUMBER;
39
+ extern const char * LLAMA_COMMIT;
40
+ extern const char * LLAMA_COMPILER;
41
+ extern const char * LLAMA_BUILD_TARGET;
42
+
43
+ struct common_control_vector_load_info;
44
+
45
+ #define print_build_info() do { \
46
+ fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
47
+ fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
48
+ } while(0)
49
+
50
+ // build info
51
+ extern int LLAMA_BUILD_NUMBER;
52
+ extern char const *LLAMA_COMMIT;
53
+ extern char const *LLAMA_COMPILER;
54
+ extern char const *LLAMA_BUILD_TARGET;
55
+
56
+ //
57
+ // CPU utils
58
+ //
59
+
60
+ struct cpu_params {
61
+ int n_threads = -1;
62
+ bool cpumask[LM_GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
63
+ bool mask_valid = false; // Default: any CPU
64
+ enum lm_ggml_sched_priority priority = LM_GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
65
+ bool strict_cpu = false; // Use strict CPU placement
66
+ uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
67
+ };
68
+
69
+ int32_t cpu_get_num_physical_cores();
70
+ int32_t cpu_get_num_math();
71
+
72
+ //
73
+ // Common params
74
+ //
75
+
76
+ enum llama_example {
77
+ LLAMA_EXAMPLE_COMMON,
78
+ LLAMA_EXAMPLE_SPECULATIVE,
79
+ LLAMA_EXAMPLE_MAIN,
80
+ LLAMA_EXAMPLE_INFILL,
81
+ LLAMA_EXAMPLE_EMBEDDING,
82
+ LLAMA_EXAMPLE_PERPLEXITY,
83
+ LLAMA_EXAMPLE_RETRIEVAL,
84
+ LLAMA_EXAMPLE_PASSKEY,
85
+ LLAMA_EXAMPLE_IMATRIX,
86
+ LLAMA_EXAMPLE_BENCH,
87
+ LLAMA_EXAMPLE_SERVER,
88
+ LLAMA_EXAMPLE_CVECTOR_GENERATOR,
89
+ LLAMA_EXAMPLE_EXPORT_LORA,
90
+ LLAMA_EXAMPLE_LLAVA,
91
+ LLAMA_EXAMPLE_LOOKUP,
92
+ LLAMA_EXAMPLE_PARALLEL,
93
+ LLAMA_EXAMPLE_TTS,
94
+
95
+ LLAMA_EXAMPLE_COUNT,
96
+ };
97
+
98
+ enum common_sampler_type {
99
+ COMMON_SAMPLER_TYPE_NONE = 0,
100
+ COMMON_SAMPLER_TYPE_DRY = 1,
101
+ COMMON_SAMPLER_TYPE_TOP_K = 2,
102
+ COMMON_SAMPLER_TYPE_TOP_P = 3,
103
+ COMMON_SAMPLER_TYPE_MIN_P = 4,
104
+ //COMMON_SAMPLER_TYPE_TFS_Z = 5,
105
+ COMMON_SAMPLER_TYPE_TYPICAL_P = 6,
106
+ COMMON_SAMPLER_TYPE_TEMPERATURE = 7,
107
+ COMMON_SAMPLER_TYPE_XTC = 8,
108
+ COMMON_SAMPLER_TYPE_INFILL = 9,
109
+ COMMON_SAMPLER_TYPE_PENALTIES = 10,
110
+ };
111
+
112
+ // dimensionality reduction methods, used by cvector-generator
113
+ enum dimre_method {
114
+ DIMRE_METHOD_PCA,
115
+ DIMRE_METHOD_MEAN,
116
+ };
117
+
118
+ enum common_conversation_mode {
119
+ COMMON_CONVERSATION_MODE_DISABLED = 0,
120
+ COMMON_CONVERSATION_MODE_ENABLED = 1,
121
+ COMMON_CONVERSATION_MODE_AUTO = 2,
122
+ };
123
+
124
+ enum common_grammar_trigger_type {
125
+ COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN,
126
+ COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
127
+ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
128
+ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START,
129
+ };
130
+
131
+ struct common_grammar_trigger {
132
+ common_grammar_trigger_type type;
133
+ std::string value;
134
+ llama_token token = LLAMA_TOKEN_NULL;
135
+
136
+ // T can only be nlohmann::ordered_json
137
+ template <class T> T to_json() const;
138
+ template <class T> static common_grammar_trigger from_json(const T & in);
139
+ };
140
+
141
+ // sampling parameters
142
+ struct common_params_sampling {
143
+ uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
144
+
145
+ int32_t n_prev = 64; // number of previous tokens to remember
146
+ int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
147
+ int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
148
+ int32_t top_k = 40; // <= 0 to use vocab size
149
+ float top_p = 0.95f; // 1.0 = disabled
150
+ float min_p = 0.05f; // 0.0 = disabled
151
+ float xtc_probability = 0.00f; // 0.0 = disabled
152
+ float xtc_threshold = 0.10f; // > 0.5 disables XTC
153
+ float typ_p = 1.00f; // typical_p, 1.0 = disabled
154
+ float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
155
+ float dynatemp_range = 0.00f; // 0.0 = disabled
156
+ float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
157
+ int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
158
+ float penalty_repeat = 1.00f; // 1.0 = disabled
159
+ float penalty_freq = 0.00f; // 0.0 = disabled
160
+ float penalty_present = 0.00f; // 0.0 = disabled
161
+ float dry_multiplier = 0.0f; // 0.0 = disabled; DRY repetition penalty for tokens extending repetition:
162
+ float dry_base = 1.75f; // 0.0 = disabled; multiplier * base ^ (length of sequence before token - allowed length)
163
+ int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty
164
+ int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
165
+ int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
166
+ float top_n_sigma = -1.00f;// -1.0 = disabled
167
+ float mirostat_tau = 5.00f; // target entropy
168
+ float mirostat_eta = 0.10f; // learning rate
169
+ bool ignore_eos = false;
170
+ bool no_perf = false; // disable performance metrics
171
+ bool timing_per_token = false;
172
+
173
+ std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY
174
+
175
+
176
+ std::vector<enum common_sampler_type> samplers = {
177
+ COMMON_SAMPLER_TYPE_PENALTIES,
178
+ COMMON_SAMPLER_TYPE_DRY,
179
+ COMMON_SAMPLER_TYPE_TOP_K,
180
+ COMMON_SAMPLER_TYPE_TYPICAL_P,
181
+ COMMON_SAMPLER_TYPE_TOP_P,
182
+ COMMON_SAMPLER_TYPE_MIN_P,
183
+ COMMON_SAMPLER_TYPE_XTC,
184
+ COMMON_SAMPLER_TYPE_TEMPERATURE,
185
+ };
186
+
187
+ std::string grammar; // optional BNF-like grammar to constrain sampling
188
+ bool grammar_lazy = false;
189
+ std::vector<common_grammar_trigger> grammar_triggers; // optional triggers (for lazy grammars)
190
+ std::set<llama_token> preserved_tokens;
191
+
192
+ std::vector<llama_logit_bias> logit_bias; // logit biases to apply
193
+
194
+ // print the parameters into a string
195
+ std::string print() const;
196
+ };
197
+
198
+ struct common_params_speculative {
199
+ std::vector<lm_ggml_backend_dev_t> devices; // devices to use for offloading
200
+
201
+ int32_t n_ctx = 0; // draft context size
202
+ int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
203
+ int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding
204
+ int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
205
+ float p_split = 0.1f; // speculative decoding split probability
206
+ float p_min = 0.75f; // minimum speculative decoding probability (greedy)
207
+
208
+ struct cpu_params cpuparams;
209
+ struct cpu_params cpuparams_batch;
210
+
211
+ std::string hf_repo = ""; // HF repo // NOLINT
212
+ std::string hf_file = ""; // HF file // NOLINT
213
+
214
+ std::string model = ""; // draft model for speculative decoding // NOLINT
215
+ std::string model_url = ""; // model url to download // NOLINT
216
+ };
217
+
218
+ struct common_params_vocoder {
219
+ std::string hf_repo = ""; // HF repo // NOLINT
220
+ std::string hf_file = ""; // HF file // NOLINT
221
+
222
+ std::string model = ""; // model path // NOLINT
223
+ std::string model_url = ""; // model url to download // NOLINT
224
+
225
+ std::string speaker_file = ""; // speaker file path // NOLINT
226
+
227
+ bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy // NOLINT
228
+ };
229
+
230
+ enum common_reasoning_format {
231
+ COMMON_REASONING_FORMAT_NONE,
232
+ COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`
233
+ };
234
+
235
+ struct common_params {
236
+
237
+ void * progress_callback_user_data = nullptr;
238
+ llama_progress_callback progress_callback = nullptr;
239
+ bool vocab_only = false;
240
+ int32_t n_predict = -1; // new tokens to predict
241
+ int32_t n_ctx = 4096; // context size
242
+ int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
243
+ int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
244
+ int32_t n_keep = 0; // number of tokens to keep from initial prompt
245
+ int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
246
+ int32_t n_parallel = 1; // number of parallel sequences to decode
247
+ int32_t n_sequences = 1; // number of sequences to decode
248
+ int32_t grp_attn_n = 1; // group-attention factor
249
+ int32_t grp_attn_w = 512; // group-attention width
250
+ int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
251
+ float rope_freq_base = 0.0f; // RoPE base frequency
252
+ float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
253
+ float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
254
+ float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor
255
+ float yarn_beta_fast = 32.0f; // YaRN low correction dim
256
+ float yarn_beta_slow = 1.0f; // YaRN high correction dim
257
+ int32_t yarn_orig_ctx = 0; // YaRN original context length
258
+ float defrag_thold = 0.1f; // KV cache defragmentation threshold
259
+
260
+ // offload params
261
+ std::vector<lm_ggml_backend_dev_t> devices; // devices to use for offloading
262
+
263
+ int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
264
+ int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
265
+ float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
266
+
267
+ enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
268
+
269
+ struct cpu_params cpuparams;
270
+ struct cpu_params cpuparams_batch;
271
+
272
+ lm_ggml_backend_sched_eval_callback cb_eval = nullptr;
273
+ void * cb_eval_user_data = nullptr;
274
+
275
+ lm_ggml_numa_strategy numa = LM_GGML_NUMA_STRATEGY_DISABLED;
276
+
277
+ enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
278
+ enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
279
+ enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
280
+
281
+ struct common_params_sampling sampling;
282
+ struct common_params_speculative speculative;
283
+ struct common_params_vocoder vocoder;
284
+
285
+ std::string model = ""; // model path // NOLINT
286
+ std::string model_alias = ""; // model alias // NOLINT
287
+ std::string model_url = ""; // model url to download // NOLINT
288
+ std::string hf_token = ""; // HF token // NOLINT
289
+ std::string hf_repo = ""; // HF repo // NOLINT
290
+ std::string hf_file = ""; // HF file // NOLINT
291
+ std::string prompt = ""; // NOLINT
292
+ std::string system_prompt = ""; // NOLINT
293
+ std::string prompt_file = ""; // store the external prompt file name // NOLINT
294
+ std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT
295
+ std::string input_prefix = ""; // string to prefix user inputs with // NOLINT
296
+ std::string input_suffix = ""; // string to suffix user inputs with // NOLINT
297
+ std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
298
+ std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
299
+ std::string logits_file = ""; // file for saving *all* logits // NOLINT
300
+
301
+ std::vector<std::string> in_files; // all input files
302
+ std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
303
+ std::vector<llama_model_kv_override> kv_overrides;
304
+
305
+ bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
306
+ std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale
307
+
308
+ std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale
309
+
310
+ int32_t verbosity = 0;
311
+ int32_t control_vector_layer_start = -1; // layer range for control vector
312
+ int32_t control_vector_layer_end = -1; // layer range for control vector
313
+
314
+ int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
315
+ int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
316
+ // (which is more convenient to use for plotting)
317
+ //
318
+ bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
319
+ size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
320
+
321
+ bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt
322
+ size_t winogrande_tasks = 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
323
+
324
+ bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
325
+ size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
326
+
327
+ bool kl_divergence = false; // compute KL divergence
328
+
329
+ bool usage = false; // print usage
330
+ bool completion = false; // print source-able completion script
331
+ bool use_color = false; // use color to distinguish generations and inputs
332
+ bool special = false; // enable special token output
333
+ bool interactive = false; // interactive mode
334
+ bool interactive_first = false; // wait for user input immediately
335
+ bool prompt_cache_all = false; // save user input and generations to prompt cache
336
+ bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
337
+
338
+ bool escape = true; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
339
+ bool multiline_input = false; // reverse the usage of `\`
340
+ bool simple_io = false; // improves compatibility with subprocesses and limited consoles
341
+ bool cont_batching = true; // insert new sequences for decoding on-the-fly
342
+ bool flash_attn = false; // flash attention
343
+ bool no_perf = false; // disable performance metrics
344
+ bool ctx_shift = true; // context shift on inifinite text generation
345
+
346
+ bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
347
+ bool logits_all = false; // return logits for all tokens in the batch
348
+ bool use_mmap = true; // use mmap for faster loads
349
+ bool use_mlock = false; // use mlock to keep model in memory
350
+ bool verbose_prompt = false; // print prompt tokens before generation
351
+ bool display_prompt = true; // print prompt before generation
352
+ bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
353
+ bool no_kv_offload = false; // disable KV offloading
354
+ bool warmup = true; // warmup run
355
+ bool check_tensors = false; // validate tensor data
356
+
357
+ bool single_turn = false; // single turn chat conversation
358
+
359
+ lm_ggml_type cache_type_k = LM_GGML_TYPE_F16; // KV cache data type for the K
360
+ lm_ggml_type cache_type_v = LM_GGML_TYPE_F16; // KV cache data type for the V
361
+
362
+ common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
363
+
364
+ // multimodal models (see examples/llava)
365
+ std::string mmproj = ""; // path to multimodal projector // NOLINT
366
+ std::vector<std::string> image; // path to image file(s)
367
+
368
+ // embedding
369
+ bool embedding = false; // get only sentence embedding
370
+ int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
371
+ std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
372
+ std::string embd_sep = "\n"; // separator of embeddings
373
+ bool reranking = false; // enable reranking support on server
374
+
375
+ // server params
376
+ int32_t port = 8080; // server listens on this network port
377
+ int32_t timeout_read = 600; // http read timeout in seconds
378
+ int32_t timeout_write = timeout_read; // http write timeout in seconds
379
+ int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
380
+ int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
381
+
382
+ std::string hostname = "127.0.0.1";
383
+ std::string public_path = ""; // NOLINT
384
+ std::string chat_template = ""; // NOLINT
385
+ bool use_jinja = false; // NOLINT
386
+ bool enable_chat_template = true;
387
+ common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
388
+
389
+ std::vector<std::string> api_keys;
390
+
391
+ std::string ssl_file_key = ""; // NOLINT
392
+ std::string ssl_file_cert = ""; // NOLINT
393
+
394
+ // "advanced" endpoints are disabled by default for better security
395
+ bool webui = true;
396
+ bool endpoint_slots = false;
397
+ bool endpoint_props = false; // only control POST requests, not GET
398
+ bool endpoint_metrics = false;
399
+
400
+ bool log_json = false;
401
+
402
+ std::string slot_save_path;
403
+
404
+ float slot_prompt_similarity = 0.5f;
405
+
406
+ // batched-bench params
407
+ bool is_pp_shared = false;
408
+
409
+ std::vector<int32_t> n_pp;
410
+ std::vector<int32_t> n_tg;
411
+ std::vector<int32_t> n_pl;
412
+
413
+ // retrieval params
414
+ std::vector<std::string> context_files; // context files to embed
415
+
416
+ int32_t chunk_size = 64; // chunk size for context embedding
417
+
418
+ std::string chunk_separator = "\n"; // chunk separator for context embedding
419
+
420
+ // passkey params
421
+ int32_t n_junk = 250; // number of times to repeat the junk text
422
+ int32_t i_pos = -1; // position of the passkey in the junk text
423
+
424
+ // imatrix params
425
+ int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations
426
+ int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
427
+ int32_t i_chunk = 0; // start processing from this chunk
428
+
429
+ bool process_output = false; // collect data for the output tensor
430
+ bool compute_ppl = true; // whether to compute perplexity
431
+
432
+ // cvector-generator params
433
+ int n_pca_batch = 100;
434
+ int n_pca_iterations = 1000;
435
+ dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
436
+ std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
437
+ std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
438
+
439
+ bool spm_infill = false; // suffix/prefix/middle pattern for infill
440
+
441
+ // batched-bench params
442
+ bool batched_bench_output_jsonl = false;
443
+
444
+ // common params
445
+ std::string out_file; // output filename for all example programs
446
+ };
447
+
+ // call once at the start of a program if it uses libcommon
+ // initializes the logging system and prints info about the build
+ void common_init();
+
+ std::string common_params_get_system_info(const common_params & params);
+
+ bool parse_cpu_range(const std::string & range, bool(&boolmask)[LM_GGML_MAX_N_THREADS]);
+ bool parse_cpu_mask(const std::string & mask, bool(&boolmask)[LM_GGML_MAX_N_THREADS]);
+ void postprocess_cpu_params(cpu_params & cpuparams, const cpu_params * role_model = nullptr);
+ bool set_process_priority(enum lm_ggml_sched_priority prio);
+
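As a rough sketch of the init and CPU helpers above (the `"0-3"` range syntax is an assumption for illustration, not taken from this header):

```cpp
// Sketch only: one-time initialization plus a CPU-range parse.
#include "common.h"
#include <cstdio>

int main() {
    common_init();                                    // logging + build info, once per program
    common_params params;                             // defaults as declared above
    printf("%s\n", common_params_get_system_info(params).c_str());

    bool boolmask[LM_GGML_MAX_N_THREADS] = { false }; // one flag per logical CPU
    bool ok = parse_cpu_range("0-3", boolmask);       // "start-end" syntax assumed here
    return ok ? 0 : 1;
}
```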
+ //
+ // String utils
+ //
+
+ #ifdef __GNUC__
+ # if defined(__MINGW32__) && !defined(__clang__)
+ # define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
+ # else
+ # define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
+ # endif
+ #else
+ # define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
+ #endif
+
+ LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
+ std::string string_format(const char * fmt, ...);
+
+ std::string string_strip(const std::string & str);
+ std::string string_get_sortable_timestamp();
+
+ std::string string_join(const std::vector<std::string> & values, const std::string & separator);
+ std::vector<std::string> string_split(const std::string & str, const std::string & delimiter);
+ std::string string_repeat(const std::string & str, size_t n);
+
+ void string_replace_all(std::string & s, const std::string & search, const std::string & replace);
+
+ std::string regex_escape(const std::string & s);
+
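A short, illustrative use of the string helpers declared above (nothing here goes beyond the declared signatures; assumes `common.h` is included and the code runs inside a function):

```cpp
// Sketch only: split, join, repeat, and printf-style formatting.
std::vector<std::string> parts = string_split(std::string("a, b, c"), std::string(", "));
std::string joined = string_join(parts, "|");    // "a|b|c"
std::string rule   = string_repeat("-", 10);     // "----------"
std::string line   = string_format("%zu parts -> %s", parts.size(), joined.c_str());
```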
+ template<class T>
+ static std::vector<T> string_split(const std::string & str, char delim) {
+ static_assert(!std::is_same<T, std::string>::value, "Please use the specialized version for std::string");
+ std::vector<T> values;
+ std::istringstream str_stream(str);
+ std::string token;
+ while (std::getline(str_stream, token, delim)) {
+ T value;
+ std::istringstream token_stream(token);
+ token_stream >> value;
+ values.push_back(value);
+ }
+ return values;
+ }
+
+ template<>
+ std::vector<std::string> string_split<std::string>(const std::string & input, char separator)
+ {
+ std::vector<std::string> parts;
+ size_t begin_pos = 0;
+ size_t separator_pos = input.find(separator);
+ while (separator_pos != std::string::npos) {
+ std::string part = input.substr(begin_pos, separator_pos - begin_pos);
+ parts.emplace_back(part);
+ begin_pos = separator_pos + 1;
+ separator_pos = input.find(separator, begin_pos);
+ }
+ parts.emplace_back(input.substr(begin_pos, separator_pos - begin_pos));
+ return parts;
+ }
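The generic template above parses each delimited token into `T` via stream extraction, while the `std::string` specialization copies substrings verbatim and therefore keeps empty fields. A small usage sketch with the expected results noted in comments:

```cpp
// Sketch only: the two overloads behave slightly differently.
std::vector<int>         ints  = string_split<int>("1,2,3", ',');        // {1, 2, 3} via operator>>
std::vector<std::string> words = string_split<std::string>("a,,b", ','); // {"a", "", "b"} (empty field kept)
```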
+
+ static bool string_starts_with(const std::string & str,
+ const std::string & prefix) { // While we wait for C++20's std::string::starts_with...
+ return str.rfind(prefix, 0) == 0;
+ }
+
+ static bool string_ends_with(const std::string & str,
+ const std::string & suffix) { // While we wait for C++20's std::string::ends_with...
+ return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
+ }
+
+ bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
+ void string_process_escapes(std::string & input);
+
+ std::string string_from(bool value);
+ std::string string_from(const std::vector<int> & values);
+ std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens);
+ std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch);
+
+ //
+ // Filesystem utils
+ //
+
+ bool fs_validate_filename(const std::string & filename);
+ bool fs_create_directory_with_parents(const std::string & path);
+
+ std::string fs_get_cache_directory();
+ std::string fs_get_cache_file(const std::string & filename);
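An illustrative combination of the filesystem helpers above (the adapter filename is a placeholder; assumes `common.h` and `<cstdio>` are included):

```cpp
// Sketch only: resolve a cache path and make sure its directory exists.
std::string cache_dir = fs_get_cache_directory();
if (!fs_create_directory_with_parents(cache_dir)) {
    fprintf(stderr, "failed to create cache dir: %s\n", cache_dir.c_str());
}
std::string cached = fs_get_cache_file("adapter.gguf"); // placeholder filename
```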
+
+ //
+ // Model utils
+ //
+
+ // note: defines object's lifetime
+ struct common_init_result {
+ llama_model_ptr model;
+ llama_context_ptr context;
+
+ std::vector<llama_adapter_lora_ptr> lora;
+ };
+
+ struct common_init_result common_init_from_params(common_params & params);
+
+ struct llama_model_params common_model_params_to_llama ( common_params & params);
+ struct llama_context_params common_context_params_to_llama(const common_params & params);
+ struct lm_ggml_threadpool_params lm_ggml_threadpool_params_from_cpu_params(const cpu_params & params);
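`common_init_result` owns the model, context, and any LoRA adapters through smart pointers, so the usual pattern is to keep the result alive for as long as the context is used. A sketch, assuming `params` was populated elsewhere (for example by the project's argument parsing):

```cpp
// Sketch only: load a model + context from an already-populated common_params.
common_init();
common_init_result init = common_init_from_params(params);
if (!init.model || !init.context) {
    // load failed; handle the error
}
llama_model   * model = init.model.get();
llama_context * lctx  = init.context.get();
// `init` keeps ownership; model/lctx stay valid only while `init` is alive
```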
+
+ struct llama_model * common_load_model_from_url(
+ const std::string & model_url,
+ const std::string & local_path,
+ const std::string & hf_token,
+ const struct llama_model_params & params);
+
+ struct llama_model * common_load_model_from_hf(
+ const std::string & repo,
+ const std::string & remote_path,
+ const std::string & local_path,
+ const std::string & hf_token,
+ const struct llama_model_params & params);
+
+ std::pair<std::string, std::string> common_get_hf_file(
+ const std::string & hf_repo_with_tag,
+ const std::string & hf_token);
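A sketch of the Hugging Face loader above; the repo, file, and local path are placeholders, and an empty token is assumed to be acceptable for public repositories:

```cpp
// Sketch only: download a GGUF file from Hugging Face, then open it.
llama_model_params mparams = common_model_params_to_llama(params);
llama_model * model = common_load_model_from_hf(
    "some-org/some-model-GGUF",  // repo (placeholder)
    "model-q4_k_m.gguf",         // file inside the repo (placeholder)
    "/tmp/model-q4_k_m.gguf",    // local path for the download (placeholder)
    "",                          // hf token; empty assumed fine for public repos
    mparams);
```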
+
+ // clear LoRA adapters from context, then apply new list of adapters
+ void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
+
+ //
+ // Batch utils
+ //
+
+ void common_batch_clear(struct llama_batch & batch);
+
+ void common_batch_add(
+ struct llama_batch & batch,
+ llama_token id,
+ llama_pos pos,
+ const std::vector<llama_seq_id> & seq_ids,
+ bool logits);
+
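A common pattern with the two batch helpers above, shown as a sketch; `tokens` is assumed to be an existing `std::vector<llama_token>`, and `llama_batch_init` / `llama_batch_free` come from `llama.h`:

```cpp
// Sketch only: stage `tokens` into a batch; request logits only for the last position.
llama_batch batch = llama_batch_init((int32_t) tokens.size(), 0, 1);
common_batch_clear(batch);
for (size_t i = 0; i < tokens.size(); ++i) {
    const bool last = (i == tokens.size() - 1);
    common_batch_add(batch, tokens[i], (llama_pos) i, { 0 }, last);
}
// ... llama_decode(ctx, batch) ...
llama_batch_free(batch);
```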
+ //
+ // Token utils
+ //
+
+ // longest common prefix
+ size_t common_lcp(const llama_tokens & a, const llama_tokens & b);
+
+ // longest common subsequence
+ size_t common_lcs(const llama_tokens & a, const llama_tokens & b);
+
+ //
+ // Vocab utils
+ //
+
+ // tokenizes a string into a vector of tokens
+ // should work similarly to Python's `tokenizer.encode`
+ std::vector<llama_token> common_tokenize(
+ const struct llama_context * ctx,
+ const std::string & text,
+ bool add_special,
+ bool parse_special = false);
+
+ std::vector<llama_token> common_tokenize(
+ const struct llama_vocab * vocab,
+ const std::string & text,
+ bool add_special,
+ bool parse_special = false);
+
+ // converts a token into a piece, optionally renders special/control tokens
+ // should work similarly to Python's `tokenizer.id_to_piece`
+ std::string common_token_to_piece(
+ const struct llama_context * ctx,
+ llama_token token,
+ bool special = true);
+
+ std::string common_token_to_piece(
+ const struct llama_vocab * vocab,
+ llama_token token,
+ bool special = true);
+
+ // detokenizes a vector of tokens into a string
+ // should work similarly to Python's `tokenizer.decode`
+ // optionally renders special/control tokens
+ std::string common_detokenize(
+ const struct llama_context * ctx,
+ const std::vector<llama_token> & tokens,
+ bool special = true);
+
+ std::string common_detokenize(
+ const struct llama_vocab * vocab,
+ const std::vector<llama_token> & tokens,
+ bool special = true);
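A round-trip sketch with the vocab helpers above, assuming an already initialized `llama_context * ctx` and `<cstdio>`:

```cpp
// Sketch only: encode, inspect each piece, decode back to text.
std::vector<llama_token> toks = common_tokenize(ctx, "Hello world", /*add_special=*/true);
for (llama_token t : toks) {
    printf("%6d -> '%s'\n", t, common_token_to_piece(ctx, t).c_str());
}
std::string text = common_detokenize(ctx, toks, /*special=*/false);
```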
+
+ //
+ // KV cache utils
+ //
+
+ // Dump the KV cache view with the number of sequences per cell.
+ void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
+
+ // Dump the KV cache view showing individual sequences in each cell (long output).
+ void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
+
+ //
+ // Embedding utils
+ //
+
+ // TODO: replace embd_norm with an enum
+ void common_embd_normalize(const float * inp, float * out, int n, int embd_norm);
+
+ float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);
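A sketch tying these two helpers to the `embd_normalize` values documented earlier in the struct; `a`, `b` (raw embeddings as `std::vector<float>`) and `n_embd` are assumed inputs:

```cpp
// Sketch only: euclidean-normalize two raw embeddings, then compare them.
std::vector<float> a_norm(n_embd), b_norm(n_embd);
common_embd_normalize(a.data(), a_norm.data(), n_embd, /*embd_norm=*/2); // 2 = euclidean
common_embd_normalize(b.data(), b_norm.data(), n_embd, /*embd_norm=*/2);
float sim = common_embd_similarity_cos(a_norm.data(), b_norm.data(), n_embd);
```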
+
+ //
+ // Control vector utils
+ //
+
+ struct common_control_vector_data {
+ int n_embd;
+
+ // stores data for layers [1, n_layer] where n_layer = data.size() / n_embd
+ std::vector<float> data;
+ };
+
+ struct common_control_vector_load_info {
+ float strength;
+
+ std::string fname;
+ };
+
+ // Load control vectors, scale each by strength, and add them together.
+ // On error, returns {-1, empty}
+ common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos);
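A sketch of the loader above; the GGUF filenames and strengths are placeholders, and the `-1` check follows the error convention stated in the comment:

```cpp
// Sketch only: blend two control vectors with signed strengths.
std::vector<common_control_vector_load_info> infos = {
    {  0.8f, "control-positive.gguf" }, // placeholder file
    { -0.4f, "control-negative.gguf" }, // placeholder file
};
common_control_vector_data cv = common_control_vector_load(infos);
if (cv.n_embd == -1) {
    // load failed (per the "{-1, empty}" convention above)
}
```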
+
+ //
+ // Split utils
+ //
+
+ namespace {
+
+ const char * const LLM_KV_SPLIT_NO = "split.no";
+ const char * const LLM_KV_SPLIT_COUNT = "split.count";
+ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
+
+ }