cui-llama.rn 1.1.2 → 1.1.5

package/cpp/common.h CHANGED
@@ -4,18 +4,11 @@
 
 #include "llama.h"
 
- #include "sampling.h"
-
 #define LOG_NO_FILE_LINE_FUNCTION
 #include "log.h"
 
- #include <cmath>
 #include <string>
 #include <vector>
- #include <random>
- #include <thread>
- #include <unordered_map>
- #include <tuple>
 
 #ifdef _WIN32
 #define DIRECTORY_SEPARATOR '\\'
@@ -65,27 +58,109 @@ extern char const *LLAMA_BUILD_TARGET;
 // CPU utils
 //
 
+ struct cpu_params {
+     int n_threads = -1;
+     bool cpumask[LM_GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
+     bool mask_valid = false; // Default: any CPU
+     enum lm_ggml_sched_priority priority = LM_GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
+     bool strict_cpu = false; // Use strict CPU placement
+     uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
+ };
+
 int32_t cpu_get_num_physical_cores();
 int32_t cpu_get_num_math();
 
 //
- // CLI argument parsing
+ // Common params
 //
 
+ enum llama_example {
+     LLAMA_EXAMPLE_COMMON,
+     LLAMA_EXAMPLE_SPECULATIVE,
+     LLAMA_EXAMPLE_MAIN,
+     LLAMA_EXAMPLE_INFILL,
+     LLAMA_EXAMPLE_EMBEDDING,
+     LLAMA_EXAMPLE_PERPLEXITY,
+     LLAMA_EXAMPLE_RETRIEVAL,
+     LLAMA_EXAMPLE_PASSKEY,
+     LLAMA_EXAMPLE_IMATRIX,
+     LLAMA_EXAMPLE_BENCH,
+     LLAMA_EXAMPLE_SERVER,
+     LLAMA_EXAMPLE_CVECTOR_GENERATOR,
+     LLAMA_EXAMPLE_EXPORT_LORA,
+     LLAMA_EXAMPLE_LLAVA,
+     LLAMA_EXAMPLE_LOOKUP,
+     LLAMA_EXAMPLE_PARALLEL,
+
+     LLAMA_EXAMPLE_COUNT,
+ };
+
+ enum gpt_sampler_type {
+     GPT_SAMPLER_TYPE_NONE        = 0,
+     GPT_SAMPLER_TYPE_TOP_K       = 1,
+     GPT_SAMPLER_TYPE_TOP_P       = 2,
+     GPT_SAMPLER_TYPE_MIN_P       = 3,
+     GPT_SAMPLER_TYPE_TFS_Z       = 4,
+     GPT_SAMPLER_TYPE_TYPICAL_P   = 5,
+     GPT_SAMPLER_TYPE_TEMPERATURE = 6,
+     GPT_SAMPLER_TYPE_XTC         = 7,
+ };
+
 // dimensionality reduction methods, used by cvector-generator
 enum dimre_method {
     DIMRE_METHOD_PCA,
     DIMRE_METHOD_MEAN,
 };
 
+ // sampler parameters
+ struct gpt_sampler_params {
+     uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
+
+     int32_t n_prev = 64; // number of previous tokens to remember
+     int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
+     int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
+     int32_t top_k = 40; // <= 0 to use vocab size
+     float top_p = 0.95f; // 1.0 = disabled
+     float min_p = 0.05f; // 0.0 = disabled
+     float tfs_z = 1.00f; // 1.0 = disabled
+     float xtc_t = 0.0f; // 0.0 = disabled
+     float xtc_p = 0.0f;
+     float typ_p = 1.00f; // typical_p, 1.0 = disabled
+     float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
+     float dynatemp_range = 0.00f; // 0.0 = disabled
+     float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
+     int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
+     float penalty_repeat = 1.00f; // 1.0 = disabled
+     float penalty_freq = 0.00f; // 0.0 = disabled
+     float penalty_present = 0.00f; // 0.0 = disabled
+     int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
+     float mirostat_tau = 5.00f; // target entropy
+     float mirostat_eta = 0.10f; // learning rate
+     bool penalize_nl = false; // consider newlines as a repeatable token
+     bool ignore_eos = false;
+     bool no_perf = false; // disable performance metrics
+
+     std::vector<enum gpt_sampler_type> samplers = {
+         GPT_SAMPLER_TYPE_TOP_K,
+         GPT_SAMPLER_TYPE_TFS_Z,
+         GPT_SAMPLER_TYPE_TYPICAL_P,
+         GPT_SAMPLER_TYPE_TOP_P,
+         GPT_SAMPLER_TYPE_MIN_P,
+         GPT_SAMPLER_TYPE_TEMPERATURE,
+         GPT_SAMPLER_TYPE_XTC
+     };
+
+     std::string grammar; // optional BNF-like grammar to constrain sampling
+
+     std::vector<llama_logit_bias> logit_bias; // logit biases to apply
+
+     // print the parameters into a string
+     std::string print() const;
+ };
+
 struct gpt_params {
-     uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
 
     bool vocab_only = false;
-     int32_t n_threads = cpu_get_num_math();
-     int32_t n_threads_draft = -1;
-     int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
-     int32_t n_threads_batch_draft = -1;
     int32_t n_predict = -1; // new tokens to predict
     int32_t n_ctx = 0; // context size
     int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
@@ -112,6 +187,11 @@ struct gpt_params {
     int32_t yarn_orig_ctx = 0; // YaRN original context length
     float defrag_thold = -1.0f; // KV cache defragmentation threshold
 
+     struct cpu_params cpuparams;
+     struct cpu_params cpuparams_batch;
+     struct cpu_params draft_cpuparams;
+     struct cpu_params draft_cpuparams_batch;
+
     lm_ggml_backend_sched_eval_callback cb_eval = nullptr;
     void * cb_eval_user_data = nullptr;
 
@@ -122,26 +202,25 @@ struct gpt_params {
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
     enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
 
-     // // sampling parameters
-     struct llama_sampling_params sparams;
-
-     std::string model = ""; // model path
-     std::string model_draft = ""; // draft model for speculative decoding
-     std::string model_alias = "unknown"; // model alias
-     std::string model_url = ""; // model url to download
-     std::string hf_token = ""; // HF token
-     std::string hf_repo = ""; // HF repo
-     std::string hf_file = ""; // HF file
-     std::string prompt = "";
-     std::string prompt_file = ""; // store the external prompt file name
-     std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
-     std::string input_prefix = ""; // string to prefix user inputs with
-     std::string input_suffix = ""; // string to suffix user inputs with
-     std::string logdir = ""; // directory in which to save YAML log files
-     std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding
-     std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding
-     std::string logits_file = ""; // file for saving *all* logits
-     std::string rpc_servers = ""; // comma separated list of RPC servers
+     struct gpt_sampler_params sparams;
+
+     std::string model = ""; // model path // NOLINT
+     std::string model_draft = ""; // draft model for speculative decoding // NOLINT
+     std::string model_alias = "unknown"; // model alias // NOLINT
+     std::string model_url = ""; // model url to download // NOLINT
+     std::string hf_token = ""; // HF token // NOLINT
+     std::string hf_repo = ""; // HF repo // NOLINT
+     std::string hf_file = ""; // HF file // NOLINT
+     std::string prompt = ""; // NOLINT
+     std::string prompt_file = ""; // store the external prompt file name // NOLINT
+     std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT
+     std::string input_prefix = ""; // string to prefix user inputs with // NOLINT
+     std::string input_suffix = ""; // string to suffix user inputs with // NOLINT
+     std::string logdir = ""; // directory in which to save YAML log files // NOLINT
+     std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
+     std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
+     std::string logits_file = ""; // file for saving *all* logits // NOLINT
+     std::string rpc_servers = ""; // comma separated list of RPC servers // NOLINT
 
     std::vector<std::string> in_files; // all input files
     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
@@ -185,15 +264,14 @@ struct gpt_params {
     bool simple_io = false; // improves compatibility with subprocesses and limited consoles
     bool cont_batching = true; // insert new sequences for decoding on-the-fly
     bool flash_attn = false; // flash attention
+     bool no_perf = false; // disable performance metrics
 
     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
-     bool ignore_eos = false; // ignore generated EOS tokens
     bool logits_all = false; // return logits for all tokens in the batch
     bool use_mmap = true; // use mmap for faster loads
     bool use_mlock = false; // use mlock to keep model in memory
     bool verbose_prompt = false; // print prompt tokens before generation
     bool display_prompt = true; // print prompt before generation
-     bool infill = false; // use infill mode
     bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
     bool no_kv_offload = false; // disable KV offloading
     bool warmup = true; // warmup run
@@ -203,7 +281,7 @@ struct gpt_params {
     std::string cache_type_v = "f16"; // KV cache data type for the V
 
     // multimodal models (see examples/llava)
-     std::string mmproj = ""; // path to multimodal projector
+     std::string mmproj = ""; // path to multimodal projector // NOLINT
     std::vector<std::string> image; // path to image file(s)
 
     // embedding
@@ -216,18 +294,18 @@ struct gpt_params {
     int32_t port = 8080; // server listens on this network port
     int32_t timeout_read = 600; // http read timeout in seconds
     int32_t timeout_write = timeout_read; // http write timeout in seconds
-     int32_t n_threads_http = -1; // number of threads to process HTTP requests
+     int n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
 
     std::string hostname = "127.0.0.1";
-     std::string public_path = "";
-     std::string chat_template = "";
-     std::string system_prompt = "";
+     std::string public_path = ""; // NOLINT
+     std::string chat_template = ""; // NOLINT
+     std::string system_prompt = ""; // NOLINT
     bool enable_chat_template = true;
 
     std::vector<std::string> api_keys;
 
-     std::string ssl_file_key = "";
-     std::string ssl_file_cert = "";
+     std::string ssl_file_key = ""; // NOLINT
+     std::string ssl_file_cert = ""; // NOLINT
 
     bool endpoint_slots = true;
     bool endpoint_metrics = false;
@@ -277,18 +355,18 @@ struct gpt_params {
     bool spm_infill = false; // suffix/prefix/middle pattern for infill
 
     std::string lora_outfile = "ggml-lora-merged-f16.gguf";
- };
-
- void gpt_params_parse_from_env(gpt_params & params);
- void gpt_params_handle_model_default(gpt_params & params);
 
- bool gpt_params_parse_ex (int argc, char ** argv, gpt_params & params);
- bool gpt_params_parse (int argc, char ** argv, gpt_params & params);
- bool gpt_params_find_arg (int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param);
- void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params);
+     // batched-bench params
+     bool batched_bench_output_jsonl = false;
+ };
 
 std::string gpt_params_get_system_info(const gpt_params & params);
 
+ bool parse_cpu_range(const std::string& range, bool(&boolmask)[LM_GGML_MAX_N_THREADS]);
+ bool parse_cpu_mask(const std::string& mask, bool(&boolmask)[LM_GGML_MAX_N_THREADS]);
+ void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model = nullptr);
+ bool set_process_priority(enum lm_ggml_sched_priority prio);
+
 //
 // String utils
 //
@@ -339,8 +417,9 @@ struct llama_init_result {
 
 struct llama_init_result llama_init_from_gpt_params(gpt_params & params);
 
- struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
- struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
+ struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
+ struct llama_context_params llama_context_params_from_gpt_params (const gpt_params & params);
+ struct lm_ggml_threadpool_params lm_ggml_threadpool_params_from_cpu_params(const cpu_params & params);
 
 struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
 struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
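
A minimal, illustrative C++ sketch (not part of the package diff) of how the new CPU/threadpool plumbing declared above might be driven: fill a cpu_params, parse an affinity mask, post-process defaults, and convert to ggml threadpool parameters. Only symbols declared in this header are used; the runtime behavior described in the comments (e.g. how postprocess_cpu_params resolves unset fields, or that parse_cpu_mask accepts a 0x-prefixed hex string) is an assumption based on the declarations and comments in the diff.

    // sketch.cpp -- assumes the common library from this package is built and linked
    #include "common.h"

    int main() {
        gpt_params params;

        // Generation threadpool: 8 threads with an explicit affinity mask
        // ("0xFF" is assumed to mean CPUs 0-7).
        params.cpuparams.n_threads = 8;
        if (parse_cpu_mask("0xFF", params.cpuparams.cpumask)) {
            params.cpuparams.mask_valid = true;
        }
        params.cpuparams.strict_cpu = true; // strict CPU placement, per the struct comment
        params.cpuparams.poll       = 50;   // default busywait level

        // Fill remaining defaults; the batch threadpool presumably borrows unset
        // fields from the generation one via the role_model argument.
        postprocess_cpu_params(params.cpuparams);
        postprocess_cpu_params(params.cpuparams_batch, &params.cpuparams);

        set_process_priority(params.cpuparams.priority);

        // New in this version: convert to ggml's threadpool parameter struct.
        struct lm_ggml_threadpool_params tpp =
            lm_ggml_threadpool_params_from_cpu_params(params.cpuparams);
        (void) tpp; // would normally be passed on when creating the threadpool

        return 0;
    }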