cui-llama.rn 1.1.2 → 1.1.4

This diff shows the changes between publicly released versions of this package as they appear in the supported public registries, and is provided for informational purposes only.
package/cpp/common.h CHANGED
@@ -14,8 +14,10 @@
 #include <vector>
 #include <random>
 #include <thread>
+#include <set>
 #include <unordered_map>
 #include <tuple>
+#include <functional>
 
 #ifdef _WIN32
 #define DIRECTORY_SEPARATOR '\\'
@@ -72,20 +74,44 @@ int32_t cpu_get_num_math();
 // CLI argument parsing
 //
 
+enum llama_example {
+    LLAMA_EXAMPLE_COMMON,
+    LLAMA_EXAMPLE_SPECULATIVE,
+    LLAMA_EXAMPLE_MAIN,
+    LLAMA_EXAMPLE_INFILL,
+    LLAMA_EXAMPLE_EMBEDDING,
+    LLAMA_EXAMPLE_PERPLEXITY,
+    LLAMA_EXAMPLE_RETRIEVAL,
+    LLAMA_EXAMPLE_PASSKEY,
+    LLAMA_EXAMPLE_IMATRIX,
+    LLAMA_EXAMPLE_BENCH,
+    LLAMA_EXAMPLE_SERVER,
+    LLAMA_EXAMPLE_CVECTOR_GENERATOR,
+    LLAMA_EXAMPLE_EXPORT_LORA,
+    LLAMA_EXAMPLE_LLAVA,
+
+    LLAMA_EXAMPLE_COUNT,
+};
+
 // dimensionality reduction methods, used by cvector-generator
 enum dimre_method {
     DIMRE_METHOD_PCA,
     DIMRE_METHOD_MEAN,
 };
 
+struct cpu_params {
+    int      n_threads = -1;
+    bool     cpumask[LM_GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
+    bool     mask_valid = false;                       // Default: any CPU
+    enum lm_ggml_sched_priority priority = LM_GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
+    bool     strict_cpu = false;                       // Use strict CPU placement
+    uint32_t poll = 50;                                // Polling (busywait) level (0 - no polling, 100 - mostly polling)
+};
+
 struct gpt_params {
-    uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
+    enum llama_example curr_ex = LLAMA_EXAMPLE_COMMON;
 
     bool vocab_only = false;
-    int32_t n_threads = cpu_get_num_math();
-    int32_t n_threads_draft = -1;
-    int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
-    int32_t n_threads_batch_draft = -1;
     int32_t n_predict = -1; // new tokens to predict
     int32_t n_ctx = 0; // context size
     int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
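
The cpu_params struct above collects the per-role thread settings that replace the flat n_threads* fields removed from gpt_params. A minimal sketch of filling one in, assuming only the fields declared in this header; the values are illustrative, and LM_GGML_SCHED_PRIO_HIGH is assumed to exist alongside the LM_GGML_SCHED_PRIO_NORMAL default, per the priority comment:

    #include "common.h"

    // Illustrative values only; not part of the package.
    static cpu_params make_generation_cpu_params() {
        cpu_params cp;
        cp.n_threads  = 8;                       // fixed thread count instead of the -1 default
        cp.cpumask[0] = cp.cpumask[1] = true;    // pin to cores 0 and 1 ...
        cp.mask_valid = true;                    // ... and mark the affinity mask as set
        cp.strict_cpu = true;                    // honor the mask strictly
        cp.priority   = LM_GGML_SCHED_PRIO_HIGH; // assumed enum value (2 - high)
        cp.poll       = 100;                     // mostly busy-wait instead of the default 50
        return cp;
    }
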
@@ -112,6 +138,11 @@ struct gpt_params {
     int32_t yarn_orig_ctx = 0; // YaRN original context length
     float defrag_thold = -1.0f; // KV cache defragmentation threshold
 
+    struct cpu_params cpuparams;
+    struct cpu_params cpuparams_batch;
+    struct cpu_params draft_cpuparams;
+    struct cpu_params draft_cpuparams_batch;
+
     lm_ggml_backend_sched_eval_callback cb_eval = nullptr;
     void * cb_eval_user_data = nullptr;
 
@@ -122,8 +153,7 @@ struct gpt_params {
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
     enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
 
-    // // sampling parameters
-    struct llama_sampling_params sparams;
+    struct gpt_sampler_params sparams;
 
     std::string model = ""; // model path
     std::string model_draft = ""; // draft model for speculative decoding
@@ -171,6 +201,7 @@ struct gpt_params {
 
     bool kl_divergence = false; // compute KL divergence
 
+    std::function<void(int, char **)> print_usage = nullptr; // print example-specific usage and example
     bool usage = false; // print usage
     bool use_color = false; // use color to distinguish generations and inputs
     bool special = false; // enable special token output
@@ -187,13 +218,11 @@ struct gpt_params {
     bool flash_attn = false; // flash attention
 
     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
-    bool ignore_eos = false; // ignore generated EOS tokens
     bool logits_all = false; // return logits for all tokens in the batch
     bool use_mmap = true; // use mmap for faster loads
     bool use_mlock = false; // use mlock to keep model in memory
     bool verbose_prompt = false; // print prompt tokens before generation
     bool display_prompt = true; // print prompt before generation
-    bool infill = false; // use infill mode
     bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
     bool no_kv_offload = false; // disable KV offloading
     bool warmup = true; // warmup run
@@ -216,7 +245,7 @@ struct gpt_params {
     int32_t port = 8080; // server listens on this network port
     int32_t timeout_read = 600; // http read timeout in seconds
     int32_t timeout_write = timeout_read; // http write timeout in seconds
-    int32_t n_threads_http = -1; // number of threads to process HTTP requests
+    int     n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
 
     std::string hostname = "127.0.0.1";
     std::string public_path = "";
@@ -277,18 +306,104 @@ struct gpt_params {
     bool spm_infill = false; // suffix/prefix/middle pattern for infill
 
     std::string lora_outfile = "ggml-lora-merged-f16.gguf";
+
+    // batched-bench params
+    bool batched_bench_output_jsonl = false;
 };
 
-void gpt_params_parse_from_env(gpt_params & params);
-void gpt_params_handle_model_default(gpt_params & params);
+struct llama_arg {
+    std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
+    std::vector<const char *> args;
+    const char * value_hint   = nullptr; // help text or example for arg value
+    const char * value_hint_2 = nullptr; // for second arg value
+    const char * env          = nullptr;
+    std::string help;
+    void (*handler_void)   (gpt_params & params) = nullptr;
+    void (*handler_string) (gpt_params & params, const std::string &) = nullptr;
+    void (*handler_str_str)(gpt_params & params, const std::string &, const std::string &) = nullptr;
+    void (*handler_int)    (gpt_params & params, int) = nullptr;
+
+    llama_arg(
+        const std::initializer_list<const char *> & args,
+        const char * value_hint,
+        const std::string & help,
+        void (*handler)(gpt_params & params, const std::string &)
+    ) : args(args), value_hint(value_hint), help(help), handler_string(handler) {}
+
+    llama_arg(
+        const std::initializer_list<const char *> & args,
+        const char * value_hint,
+        const std::string & help,
+        void (*handler)(gpt_params & params, int)
+    ) : args(args), value_hint(value_hint), help(help), handler_int(handler) {}
+
+    llama_arg(
+        const std::initializer_list<const char *> & args,
+        const std::string & help,
+        void (*handler)(gpt_params & params)
+    ) : args(args), help(help), handler_void(handler) {}
+
+    // support 2 values for arg
+    llama_arg(
+        const std::initializer_list<const char *> & args,
+        const char * value_hint,
+        const char * value_hint_2,
+        const std::string & help,
+        void (*handler)(gpt_params & params, const std::string &, const std::string &)
+    ) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {}
+
+    llama_arg & set_examples(std::initializer_list<enum llama_example> examples) {
+        this->examples = std::move(examples);
+        return *this;
+    }
 
-bool gpt_params_parse_ex   (int argc, char ** argv, gpt_params & params);
-bool gpt_params_parse      (int argc, char ** argv, gpt_params & params);
-bool gpt_params_find_arg   (int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param);
-void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params);
+    llama_arg & set_env(const char * env) {
+        help = help + "\n(env: " + env + ")";
+        this->env = env;
+        return *this;
+    }
+
+    bool in_example(enum llama_example ex) {
+        return examples.find(ex) != examples.end();
+    }
+
+    bool get_value_from_env(std::string & output) const {
+        if (env == nullptr) return false;
+        char * value = std::getenv(env);
+        if (value) {
+            output = value;
+            return true;
+        }
+        return false;
+    }
+
+    bool has_value_from_env() const {
+        return env != nullptr && std::getenv(env);
+    }
+
+    std::string to_string();
+};
+
+// initialize list of options (arguments) that can be used by the current example
+std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example ex);
+// optionally, we can provide "print_usage" to print example usage
+std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example ex, std::function<void(int, char **)> print_usage);
+
+// parse input arguments from CLI
+// if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message)
+bool gpt_params_parse   (int argc, char ** argv, gpt_params & params, std::vector<llama_arg> & options);
+bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vector<llama_arg> & options);
+
+// print full usage message; it will be called internally by gpt_params_parse() if "-h" is set
+void gpt_params_print_usage(gpt_params & params, std::vector<llama_arg> & options);
 
 std::string gpt_params_get_system_info(const gpt_params & params);
 
+bool parse_cpu_range(const std::string & range, bool (&boolmask)[LM_GGML_MAX_N_THREADS]);
+bool parse_cpu_mask (const std::string & mask,  bool (&boolmask)[LM_GGML_MAX_N_THREADS]);
+void postprocess_cpu_params(cpu_params & cpuparams, const cpu_params * role_model = nullptr);
+bool set_process_priority(enum lm_ggml_sched_priority prio);
+
 //
 // String utils
 //
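
The llama_arg struct and the gpt_params_parser_init / gpt_params_parse declarations above replace the old free-function parser. A hedged sketch of the intended call flow, using only the declarations in this header; the example type, option name, and env var are illustrative, and the handler is a capture-less lambda so it converts to the plain function pointer the constructor expects:

    #include "common.h"

    int main(int argc, char ** argv) {
        gpt_params params;

        // Build the option list filtered for this example type.
        auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_MAIN);

        // Hypothetical extra option, limited to LLAMA_EXAMPLE_MAIN and bound to an env var.
        options.push_back(llama_arg(
            {"--color-output"},
            "force colored output (illustrative option, not in the package)",
            [](gpt_params & p) { p.use_color = true; }
        ).set_examples({LLAMA_EXAMPLE_MAIN}).set_env("MY_COLOR_OUTPUT"));

        if (!gpt_params_parse(argc, argv, params, options)) {
            // Invalid value: usage for the offending argument has already been printed.
            return 1;
        }
        return 0;
    }
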
@@ -339,8 +454,9 @@ struct llama_init_result {
 
 struct llama_init_result llama_init_from_gpt_params(gpt_params & params);
 
-struct llama_model_params   llama_model_params_from_gpt_params  (const gpt_params & params);
-struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
+struct llama_model_params        llama_model_params_from_gpt_params       (const gpt_params & params);
+struct llama_context_params      llama_context_params_from_gpt_params     (const gpt_params & params);
+struct lm_ggml_threadpool_params lm_ggml_threadpool_params_from_cpu_params(const cpu_params & params);
 
 struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
 struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
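
The new lm_ggml_threadpool_params_from_cpu_params helper ties the cpu_params settings above to the ggml threadpool machinery. A minimal sketch assuming only this declaration; creating and attaching the actual threadpool happens through the ggml API and is outside this header:

    #include "common.h"

    // Translate the user-facing generation thread settings (thread count, affinity
    // mask, priority, polling, strict placement) into ggml threadpool parameters.
    static struct lm_ggml_threadpool_params threadpool_params_for_generation(const gpt_params & params) {
        return lm_ggml_threadpool_params_from_cpu_params(params.cpuparams);
    }
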