cui-llama.rn 0.2.0 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/cpp/common.h CHANGED
@@ -1,467 +1,468 @@
 // Various helper functions and utilities

 #pragma once

 #include "llama.h"

 #include "sampling.h"

 #define LOG_NO_FILE_LINE_FUNCTION
 #include "log.h"

 #include <cmath>
 #include <string>
 #include <vector>
 #include <random>
 #include <thread>
 #include <unordered_map>
 #include <tuple>

 #ifdef _WIN32
 #define DIRECTORY_SEPARATOR '\\'
 #else
 #define DIRECTORY_SEPARATOR '/'
 #endif // _WIN32

 #define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
 #define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)

 #define print_build_info() do { \
     fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
     fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
 } while(0)

 #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"

 // build info
 extern int LLAMA_BUILD_NUMBER;
 extern char const * LLAMA_COMMIT;
 extern char const * LLAMA_COMPILER;
 extern char const * LLAMA_BUILD_TARGET;

 struct llama_control_vector_load_info;

 #define print_build_info() do { \
     fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
     fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
 } while(0)

 // build info
 extern int LLAMA_BUILD_NUMBER;
 extern char const *LLAMA_COMMIT;
 extern char const *LLAMA_COMPILER;
 extern char const *LLAMA_BUILD_TARGET;

 //
 // CPU utils
 //

 int32_t cpu_get_num_physical_cores();
 int32_t cpu_get_num_math();

 //
 // CLI argument parsing
 //

 // dimensionality reduction methods, used by cvector-generator
 enum dimre_method {
     DIMRE_METHOD_PCA,
     DIMRE_METHOD_MEAN,
 };

 struct gpt_params {
     uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed

+    bool vocab_only = false;
     int32_t n_threads = cpu_get_num_math();
     int32_t n_threads_draft = -1;
     int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
     int32_t n_threads_batch_draft = -1;
     int32_t n_predict = -1; // new tokens to predict
     int32_t n_ctx = 0; // context size
     int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_keep = 0; // number of tokens to keep from initial prompt
     int32_t n_draft = 5; // number of tokens to draft during speculative decoding
     int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
     int32_t n_parallel = 1; // number of parallel sequences to decode
     int32_t n_sequences = 1; // number of sequences to decode
     float p_split = 0.1f; // speculative decoding split probability
     int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
     int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
     int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
     float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
     int32_t grp_attn_n = 1; // group-attention factor
     int32_t grp_attn_w = 512; // group-attention width
     int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
     float rope_freq_base = 0.0f; // RoPE base frequency
     float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
     float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
     float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor
     float yarn_beta_fast = 32.0f; // YaRN low correction dim
     float yarn_beta_slow = 1.0f; // YaRN high correction dim
     int32_t yarn_orig_ctx = 0; // YaRN original context length
     float defrag_thold = -1.0f; // KV cache defragmentation threshold

     lm_ggml_backend_sched_eval_callback cb_eval = nullptr;
     void * cb_eval_user_data = nullptr;

     lm_ggml_numa_strategy numa = LM_GGML_NUMA_STRATEGY_DISABLED;

     enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
     enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
     enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings

     // // sampling parameters
     struct llama_sampling_params sparams;

     std::string model = ""; // model path
     std::string model_draft = ""; // draft model for speculative decoding
     std::string model_alias = "unknown"; // model alias
     std::string model_url = ""; // model url to download
     std::string hf_token = ""; // HF token
     std::string hf_repo = ""; // HF repo
     std::string hf_file = ""; // HF file
     std::string prompt = "";
     std::string prompt_file = ""; // store the external prompt file name
     std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
     std::string input_prefix = ""; // string to prefix user inputs with
     std::string input_suffix = ""; // string to suffix user inputs with
     std::string logdir = ""; // directory in which to save YAML log files
     std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding
     std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding
     std::string logits_file = ""; // file for saving *all* logits
     std::string rpc_servers = ""; // comma separated list of RPC servers

     std::vector<std::string> in_files; // all input files
     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
     std::vector<llama_model_kv_override> kv_overrides;

     // TODO: avoid tuple, use struct
     std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
     std::string lora_base = ""; // base model path for the lora adapter

     std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale

     int32_t verbosity = 0;
     int32_t control_vector_layer_start = -1; // layer range for control vector
     int32_t control_vector_layer_end = -1; // layer range for control vector

     int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
     int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
                                  // (which is more convenient to use for plotting)
     //
     bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
     size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score

     bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt
     size_t winogrande_tasks = 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed

     bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
     size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed

     bool kl_divergence = false; // compute KL divergence

     bool usage = false; // print usage
     bool use_color = false; // use color to distinguish generations and inputs
     bool special = false; // enable special token output
     bool interactive = false; // interactive mode
     bool interactive_first = false; // wait for user input immediately
     bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
     bool prompt_cache_all = false; // save user input and generations to prompt cache
     bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it

     bool escape = true; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
     bool multiline_input = false; // reverse the usage of `\`
     bool simple_io = false; // improves compatibility with subprocesses and limited consoles
     bool cont_batching = true; // insert new sequences for decoding on-the-fly
     bool flash_attn = false; // flash attention

     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
     bool ignore_eos = false; // ignore generated EOS tokens
     bool logits_all = false; // return logits for all tokens in the batch
     bool use_mmap = true; // use mmap for faster loads
     bool use_mlock = false; // use mlock to keep model in memory
     bool verbose_prompt = false; // print prompt tokens before generation
     bool display_prompt = true; // print prompt before generation
     bool infill = false; // use infill mode
     bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
     bool no_kv_offload = false; // disable KV offloading
     bool warmup = true; // warmup run
     bool check_tensors = false; // validate tensor data

     std::string cache_type_k = "f16"; // KV cache data type for the K
     std::string cache_type_v = "f16"; // KV cache data type for the V

     // multimodal models (see examples/llava)
     std::string mmproj = ""; // path to multimodal projector
     std::vector<std::string> image; // path to image file(s)

     // embedding
     bool embedding = false; // get only sentence embedding
     int32_t embd_normalize = 2; // normalisation for embendings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
     std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
     std::string embd_sep = "\n"; // separator of embendings

     // server params
     int32_t port = 8080; // server listens on this network port
     int32_t timeout_read = 600; // http read timeout in seconds
     int32_t timeout_write = timeout_read; // http write timeout in seconds
     int32_t n_threads_http = -1; // number of threads to process HTTP requests

     std::string hostname = "127.0.0.1";
     std::string public_path = "";
     std::string chat_template = "";
     std::string system_prompt = "";
     bool enable_chat_template = true;

     std::vector<std::string> api_keys;

     std::string ssl_file_key = "";
     std::string ssl_file_cert = "";

     bool endpoint_slots = true;
     bool endpoint_metrics = false;

     bool log_json = false;

     std::string slot_save_path;

     float slot_prompt_similarity = 0.5f;

     // batched-bench params
     bool is_pp_shared = false;

     std::vector<int32_t> n_pp;
     std::vector<int32_t> n_tg;
     std::vector<int32_t> n_pl;

     // retrieval params
     std::vector<std::string> context_files; // context files to embed

     int32_t chunk_size = 64; // chunk size for context embedding

     std::string chunk_separator = "\n"; // chunk separator for context embedding

     // passkey params
     int32_t n_junk = 250; // number of times to repeat the junk text
     int32_t i_pos = -1; // position of the passkey in the junk text

     // imatrix params
     std::string out_file = "imatrix.dat"; // save the resulting imatrix to this file

     int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations
     int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
     int32_t i_chunk = 0; // start processing from this chunk

     bool process_output = false; // collect data for the output tensor
     bool compute_ppl = true; // whether to compute perplexity

     // cvector-generator params
     int n_pca_batch = 100;
     int n_pca_iterations = 1000;
     dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
     std::string cvector_outfile = "control_vector.gguf";
     std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
     std::string cvector_negative_file = "examples/cvector-generator/negative.txt";

     bool spm_infill = false; // suffix/prefix/middle pattern for infill
 };

 void gpt_params_handle_hf_token(gpt_params & params);
 void gpt_params_handle_model_default(gpt_params & params);

 bool gpt_params_parse_ex (int argc, char ** argv, gpt_params & params);
 bool gpt_params_parse (int argc, char ** argv, gpt_params & params);
 bool gpt_params_find_arg (int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param);
 void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params);

 std::string gpt_params_get_system_info(const gpt_params & params);

 //
 // String utils
 //

 std::vector<std::string> string_split(std::string input, char separator);

 std::string string_strip(const std::string & str);
 std::string string_get_sortable_timestamp();

 template<class T>
 static std::vector<T> string_split(const std::string & str, char delim) {
     std::vector<T> values;
     std::istringstream str_stream(str);
     std::string token;
     while (std::getline(str_stream, token, delim)) {
         T value;
         std::istringstream token_stream(token);
         token_stream >> value;
         values.push_back(value);
     }
     return values;
 }

 bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
 void string_process_escapes(std::string & input);

 //
 // Filesystem utils
 //

 bool fs_validate_filename(const std::string & filename);
 bool fs_create_directory_with_parents(const std::string & path);

 std::string fs_get_cache_directory();
 std::string fs_get_cache_file(const std::string & filename);

 //
 // Model utils
 //

 // TODO: avoid tuplue, use struct
 std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params);

 struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
 struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);

 struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
 struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);

 // Batch utils

 void llama_batch_clear(struct llama_batch & batch);

 void llama_batch_add(
         struct llama_batch & batch,
         llama_token id,
         llama_pos pos,
         const std::vector<llama_seq_id> & seq_ids,
         bool logits);

 //
 // Vocab utils
 //

 // tokenizes a string into a vector of tokens
 // should work similar to Python's `tokenizer.encode`
 std::vector<llama_token> llama_tokenize(
         const struct llama_context * ctx,
         const std::string & text,
         bool add_special,
         bool parse_special = false);

 std::vector<llama_token> llama_tokenize(
         const struct llama_model * model,
         const std::string & text,
         bool add_special,
         bool parse_special = false);

 // tokenizes a token into a piece, optionally renders special/control tokens
 // should work similar to Python's `tokenizer.id_to_piece`
 std::string llama_token_to_piece(
         const struct llama_context * ctx,
         llama_token token,
         bool special = true);

 // detokenizes a vector of tokens into a string
 // should work similar to Python's `tokenizer.decode`
 // optionally renders special/control tokens
 std::string llama_detokenize(
         llama_context * ctx,
         const std::vector<llama_token> & tokens,
         bool special = true);

 // Uses the value from the model metadata if possible, otherwise
 // defaults to true when model type is SPM, otherwise false.
 bool llama_should_add_bos_token(const llama_model * model);

 //
 // Chat template utils
 //

 // same with llama_chat_message, but uses std::string
 struct llama_chat_msg {
     std::string role;
     std::string content;
 };

 // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
 bool llama_chat_verify_template(const std::string & tmpl);

 // CPP wrapper for llama_chat_apply_template
 // If the built-in template is not supported, we default to chatml
 // If the custom "tmpl" is not supported, we throw an error
 std::string llama_chat_apply_template(const struct llama_model * model,
         const std::string & tmpl,
         const std::vector<llama_chat_msg> & chat,
         bool add_ass);

 // Format single message, while taking into account the position of that message in chat history
 std::string llama_chat_format_single(const struct llama_model * model,
         const std::string & tmpl,
         const std::vector<llama_chat_msg> & past_msg,
         const llama_chat_msg & new_msg,
         bool add_ass);

 // Returns an example of formatted chat
 std::string llama_chat_format_example(const struct llama_model * model,
         const std::string & tmpl);

 //
 // KV cache utils
 //

 // Dump the KV cache view with the number of sequences per cell.
 void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);

 // Dump the KV cache view showing individual sequences in each cell (long output).
 void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);

 //
 // Embedding utils
 //

 void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);

 float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n);

 //
 // Control vector utils
 //

 struct llama_control_vector_data {
     int n_embd;

     // stores data for layers [1, n_layer] where n_layer = data.size() / n_embd
     std::vector<float> data;
 };

 struct llama_control_vector_load_info {
     float strength;

     std::string fname;
 };

 // Load control vectors, scale each by strength, and add them together.
 // On error, returns {-1, empty}
 llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos);

 //
 // Split utils
 //

 static const char * const LLM_KV_SPLIT_NO = "split.no";
 static const char * const LLM_KV_SPLIT_COUNT = "split.count";
 static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";

 //
 // YAML utils
 //

 void yaml_dump_vector_float (FILE * stream, const char * prop_name, const std::vector<float> & data);
 void yaml_dump_vector_int (FILE * stream, const char * prop_name, const std::vector<int> & data);
 void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data);

 void yaml_dump_non_result_info(
     FILE * stream, const gpt_params & params, const llama_context * lctx,
     const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
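
The only functional change to common.h between 0.2.0 and 1.0.1 is the new vocab_only field added to struct gpt_params. Below is a minimal, hypothetical usage sketch, assuming the flag is forwarded by llama_model_params_from_gpt_params / llama_init_from_gpt_params to the underlying loader so that only the vocabulary is loaded; that plumbing is not visible in this header diff, and the model path and prompt are placeholders.

// Hypothetical sketch (not part of the package): tokenizer-only use of gpt_params.
#include <cstdio>
#include "common.h"

int main() {
    gpt_params params;
    params.model      = DEFAULT_MODEL_PATH;   // placeholder path from this header
    params.vocab_only = true;                 // new in 1.0.1: assumed to skip loading the full weights

    llama_model *   model = nullptr;
    llama_context * ctx   = nullptr;
    std::tie(model, ctx) = llama_init_from_gpt_params(params);
    if (model == nullptr) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    // With only the vocabulary available, tokenization should still work.
    std::vector<llama_token> tokens = llama_tokenize(model, "Hello, world!", /*add_special=*/true);
    fprintf(stderr, "token count: %zu\n", tokens.size());

    if (ctx) {
        llama_free(ctx);
    }
    llama_free_model(model);
    return 0;
}

Whether llama_init_from_gpt_params returns a usable context in vocab-only mode is an implementation detail of the package and is not determined by this header.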