cui-llama.rn 1.2.3 → 1.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/cpp/common.h CHANGED
@@ -24,12 +24,12 @@
 
 #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
 
-struct llama_lora_adapter_info {
+struct common_lora_adapter_info {
     std::string path;
     float scale;
 };
 
-struct llama_lora_adapter_container : llama_lora_adapter_info {
+struct common_lora_adapter_container : common_lora_adapter_info {
     struct llama_lora_adapter * adapter;
 };
 
@@ -39,7 +39,7 @@ extern char const * LLAMA_COMMIT;
 extern char const * LLAMA_COMPILER;
 extern char const * LLAMA_BUILD_TARGET;
 
-struct llama_control_vector_load_info;
+struct common_control_vector_load_info;
 
 #define print_build_info() do { \
     fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
@@ -93,15 +93,17 @@ enum llama_example {
     LLAMA_EXAMPLE_COUNT,
 };
 
-enum gpt_sampler_type {
-    GPT_SAMPLER_TYPE_NONE        = 0,
-    GPT_SAMPLER_TYPE_TOP_K       = 1,
-    GPT_SAMPLER_TYPE_TOP_P       = 2,
-    GPT_SAMPLER_TYPE_MIN_P       = 3,
-    GPT_SAMPLER_TYPE_TFS_Z       = 4,
-    GPT_SAMPLER_TYPE_TYPICAL_P   = 5,
-    GPT_SAMPLER_TYPE_TEMPERATURE = 6,
-    GPT_SAMPLER_TYPE_XTC         = 7,
+enum common_sampler_type {
+    COMMON_SAMPLER_TYPE_NONE        = 0,
+    COMMON_SAMPLER_TYPE_DRY         = 1,
+    COMMON_SAMPLER_TYPE_TOP_K       = 2,
+    COMMON_SAMPLER_TYPE_TOP_P       = 3,
+    COMMON_SAMPLER_TYPE_MIN_P       = 4,
+    //COMMON_SAMPLER_TYPE_TFS_Z     = 5,
+    COMMON_SAMPLER_TYPE_TYPICAL_P   = 6,
+    COMMON_SAMPLER_TYPE_TEMPERATURE = 7,
+    COMMON_SAMPLER_TYPE_XTC         = 8,
+    COMMON_SAMPLER_TYPE_INFILL      = 9,
 };
 
 // dimensionality reduction methods, used by cvector-generator
@@ -111,41 +113,47 @@ enum dimre_method {
 };
 
 // sampler parameters
-struct gpt_sampler_params {
+struct common_sampler_params {
     uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
-
-    int32_t n_prev = 64; // number of previous tokens to remember
-    int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
-    int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
-    int32_t top_k = 40; // <= 0 to use vocab size
-    float top_p = 0.95f; // 1.0 = disabled
-    float min_p = 0.05f; // 0.0 = disabled
-    float tfs_z = 1.00f; // 1.0 = disabled
-    float xtc_t = 0.0f; // 0.0 = disabled
-    float xtc_p = 0.0f;
-    float typ_p = 1.00f; // typical_p, 1.0 = disabled
-    float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
-    float dynatemp_range = 0.00f; // 0.0 = disabled
-    float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
-    int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
-    float penalty_repeat = 1.00f; // 1.0 = disabled
-    float penalty_freq = 0.00f; // 0.0 = disabled
-    float penalty_present = 0.00f; // 0.0 = disabled
-    int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
-    float mirostat_tau = 5.00f; // target entropy
-    float mirostat_eta = 0.10f; // learning rate
-    bool penalize_nl = false; // consider newlines as a repeatable token
-    bool ignore_eos = false;
-    bool no_perf = false; // disable performance metrics
-
-    std::vector<enum gpt_sampler_type> samplers = {
-        GPT_SAMPLER_TYPE_TOP_K,
-        GPT_SAMPLER_TYPE_TFS_Z,
-        GPT_SAMPLER_TYPE_TYPICAL_P,
-        GPT_SAMPLER_TYPE_TOP_P,
-        GPT_SAMPLER_TYPE_MIN_P,
-        GPT_SAMPLER_TYPE_TEMPERATURE,
-        GPT_SAMPLER_TYPE_XTC
+
+    int32_t n_prev = 64; // number of previous tokens to remember
+    int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
+    int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
+    int32_t top_k = 40; // <= 0 to use vocab size
+    float top_p = 0.95f; // 1.0 = disabled
+    float min_p = 0.05f; // 0.0 = disabled
+    float xtc_probability = 0.00f; // 0.0 = disabled
+    float xtc_threshold = 0.10f; // > 0.5 disables XTC
+    float typ_p = 1.00f; // typical_p, 1.0 = disabled
+    float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
+    float dynatemp_range = 0.00f; // 0.0 = disabled
+    float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
+    int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
+    float penalty_repeat = 1.00f; // 1.0 = disabled
+    float penalty_freq = 0.00f; // 0.0 = disabled
+    float penalty_present = 0.00f; // 0.0 = disabled
+    float dry_multiplier = 0.0f; // 0.0 = disabled; DRY repetition penalty for tokens extending repetition:
+    float dry_base = 1.75f; // 0.0 = disabled; multiplier * base ^ (length of sequence before token - allowed length)
+    int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty
+    int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
+    int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
+    float mirostat_tau = 5.00f; // target entropy
+    float mirostat_eta = 0.10f; // learning rate
+    bool penalize_nl = false; // consider newlines as a repeatable token
+    bool ignore_eos = false;
+    bool no_perf = false; // disable performance metrics
+
+    std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY
+
+
+    std::vector<enum common_sampler_type> samplers = {
+        COMMON_SAMPLER_TYPE_DRY,
+        COMMON_SAMPLER_TYPE_TOP_K,
+        COMMON_SAMPLER_TYPE_TYPICAL_P,
+        COMMON_SAMPLER_TYPE_TOP_P,
+        COMMON_SAMPLER_TYPE_MIN_P,
+        COMMON_SAMPLER_TYPE_XTC,
+        COMMON_SAMPLER_TYPE_TEMPERATURE,
     };
 
     std::string grammar; // optional BNF-like grammar to constrain sampling
@@ -156,13 +164,13 @@ struct gpt_sampler_params {
     std::string print() const;
 };
 
-struct gpt_params {
+struct common_params {
 
     void * progress_callback_user_data = nullptr;
     llama_progress_callback progress_callback = nullptr;
     bool vocab_only = false;
     int32_t n_predict = -1; // new tokens to predict
-    int32_t n_ctx = 0; // context size
+    int32_t n_ctx = 4096; // context size
     int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_keep = 0; // number of tokens to keep from initial prompt
@@ -202,7 +210,7 @@ struct gpt_params {
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
     enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
 
-    struct gpt_sampler_params sparams;
+    struct common_sampler_params sparams;
 
     std::string model = ""; // model path // NOLINT
     std::string model_draft = ""; // draft model for speculative decoding // NOLINT
@@ -227,9 +235,9 @@ struct gpt_params {
     std::vector<llama_model_kv_override> kv_overrides;
 
     bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply)
-    std::vector<llama_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale
+    std::vector<common_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale
 
-    std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale
+    std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale
 
     int32_t verbosity = 0;
     int32_t control_vector_layer_start = -1; // layer range for control vector
@@ -287,21 +295,21 @@ struct gpt_params {
 
     // embedding
     bool embedding = false; // get only sentence embedding
-    int32_t embd_normalize = 2; // normalisation for embendings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
+    int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
     std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
-    std::string embd_sep = "\n"; // separator of embendings
+    std::string embd_sep = "\n"; // separator of embeddings
     bool reranking = false; // enable reranking support on server
 
     // server params
     int32_t port = 8080; // server listens on this network port
     int32_t timeout_read = 600; // http read timeout in seconds
     int32_t timeout_write = timeout_read; // http write timeout in seconds
-    int n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
+    int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
+    int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
 
     std::string hostname = "127.0.0.1";
     std::string public_path = ""; // NOLINT
     std::string chat_template = ""; // NOLINT
-    std::string system_prompt = ""; // NOLINT
     bool enable_chat_template = true;
 
     std::vector<std::string> api_keys;
@@ -367,20 +375,31 @@ struct gpt_params {
 
 // call once at the start of a program if it uses libcommon
 // initializes the logging system and prints info about the build
-void gpt_init();
+void common_init();
 
-std::string gpt_params_get_system_info(const gpt_params & params);
+std::string common_params_get_system_info(const common_params & params);
 
-bool parse_cpu_range(const std::string& range, bool(&boolmask)[LM_GGML_MAX_N_THREADS]);
-bool parse_cpu_mask(const std::string& mask, bool(&boolmask)[LM_GGML_MAX_N_THREADS]);
-void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model = nullptr);
+bool parse_cpu_range(const std::string & range, bool(&boolmask)[LM_GGML_MAX_N_THREADS]);
+bool parse_cpu_mask(const std::string & mask, bool(&boolmask)[LM_GGML_MAX_N_THREADS]);
+void postprocess_cpu_params(cpu_params & cpuparams, const cpu_params * role_model = nullptr);
 bool set_process_priority(enum lm_ggml_sched_priority prio);
 
 //
 // String utils
 //
 
-std::vector<std::string> string_split(std::string input, char separator);
+#ifdef __GNUC__
+#ifdef __MINGW32__
+#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
+#else
+#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
+#endif
+#else
+#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
+#endif
+
+LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
+std::string string_format(const char * fmt, ...);
 
 std::string string_strip(const std::string & str);
 std::string string_get_sortable_timestamp();
@@ -389,6 +408,7 @@ void string_replace_all(std::string & s, const std::string & search, const std::
 
 template<class T>
 static std::vector<T> string_split(const std::string & str, char delim) {
+    static_assert(!std::is_same<T, std::string>::value, "Please use the specialized version for std::string");
     std::vector<T> values;
     std::istringstream str_stream(str);
     std::string token;
@@ -401,6 +421,22 @@ static std::vector<T> string_split(const std::string & str, char delim) {
     return values;
 }
 
+template<>
+std::vector<std::string> string_split<std::string>(const std::string & input, char separator)
+{
+    std::vector<std::string> parts;
+    size_t begin_pos = 0;
+    size_t separator_pos = input.find(separator);
+    while (separator_pos != std::string::npos) {
+        std::string part = input.substr(begin_pos, separator_pos - begin_pos);
+        parts.emplace_back(part);
+        begin_pos = separator_pos + 1;
+        separator_pos = input.find(separator, begin_pos);
+    }
+    parts.emplace_back(input.substr(begin_pos, separator_pos - begin_pos));
+    return parts;
+}
+
 bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
 void string_process_escapes(std::string & input);
 
@@ -423,29 +459,29 @@ std::string fs_get_cache_file(const std::string & filename);
 // Model utils
 //
 
-struct llama_init_result {
+struct common_init_result {
     struct llama_model * model = nullptr;
     struct llama_context * context = nullptr;
-    std::vector<llama_lora_adapter_container> lora_adapters;
+    std::vector<common_lora_adapter_container> lora_adapters;
 };
 
-struct llama_init_result llama_init_from_gpt_params(gpt_params & params);
+struct common_init_result common_init_from_params(common_params & params);
 
-struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
-struct llama_context_params llama_context_params_from_gpt_params (const gpt_params & params);
+struct llama_model_params common_model_params_to_llama (const common_params & params);
+struct llama_context_params common_context_params_to_llama(const common_params & params);
 struct lm_ggml_threadpool_params lm_ggml_threadpool_params_from_cpu_params(const cpu_params & params);
 
-struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
-struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
+struct llama_model * common_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
+struct llama_model * common_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
 
 // clear LoRA adapters from context, then apply new list of adapters
-void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lora_adapter_container> & lora_adapters);
+void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters);
 
 // Batch utils
 
-void llama_batch_clear(struct llama_batch & batch);
+void common_batch_clear(struct llama_batch & batch);
 
-void llama_batch_add(
+void common_batch_add(
     struct llama_batch & batch,
     llama_token id,
     llama_pos pos,
@@ -458,13 +494,13 @@ void llama_batch_add(
 
 // tokenizes a string into a vector of tokens
 // should work similar to Python's `tokenizer.encode`
-std::vector<llama_token> llama_tokenize(
+std::vector<llama_token> common_tokenize(
     const struct llama_context * ctx,
     const std::string & text,
     bool add_special,
     bool parse_special = false);
 
-std::vector<llama_token> llama_tokenize(
+std::vector<llama_token> common_tokenize(
     const struct llama_model * model,
     const std::string & text,
     bool add_special,
@@ -472,7 +508,7 @@ std::vector<llama_token> llama_tokenize(
 
 // tokenizes a token into a piece, optionally renders special/control tokens
 // should work similar to Python's `tokenizer.id_to_piece`
-std::string llama_token_to_piece(
+std::string common_token_to_piece(
     const struct llama_context * ctx,
     llama_token token,
     bool special = true);
@@ -480,7 +516,7 @@ std::string llama_token_to_piece(
 // detokenizes a vector of tokens into a string
 // should work similar to Python's `tokenizer.decode`
 // optionally renders special/control tokens
-std::string llama_detokenize(
+std::string common_detokenize(
     llama_context * ctx,
     const std::vector<llama_token> & tokens,
     bool special = true);
@@ -490,31 +526,31 @@ std::string llama_detokenize(
 //
 
 // same with llama_chat_message, but uses std::string
-struct llama_chat_msg {
+struct common_chat_msg {
     std::string role;
     std::string content;
 };
 
 // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
-bool llama_chat_verify_template(const std::string & tmpl);
+bool common_chat_verify_template(const std::string & tmpl);
 
 // CPP wrapper for llama_chat_apply_template
 // If the built-in template is not supported, we default to chatml
 // If the custom "tmpl" is not supported, we throw an error
-std::string llama_chat_apply_template(const struct llama_model * model,
+std::string common_chat_apply_template(const struct llama_model * model,
         const std::string & tmpl,
-        const std::vector<llama_chat_msg> & chat,
+        const std::vector<common_chat_msg> & chat,
         bool add_ass);
 
 // Format single message, while taking into account the position of that message in chat history
-std::string llama_chat_format_single(const struct llama_model * model,
+std::string common_chat_format_single(const struct llama_model * model,
         const std::string & tmpl,
-        const std::vector<llama_chat_msg> & past_msg,
-        const llama_chat_msg & new_msg,
+        const std::vector<common_chat_msg> & past_msg,
+        const common_chat_msg & new_msg,
        bool add_ass);
 
 // Returns an example of formatted chat
-std::string llama_chat_format_example(const struct llama_model * model,
+std::string common_chat_format_example(const struct llama_model * model,
         const std::string & tmpl);
 
 //
@@ -522,31 +558,31 @@ std::string llama_chat_format_example(const struct llama_model * model,
 //
 
 // Dump the KV cache view with the number of sequences per cell.
-void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
+void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
 
 // Dump the KV cache view showing individual sequences in each cell (long output).
-void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
+void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
 
 //
 // Embedding utils
 //
 
-void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);
+void common_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);
 
-float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n);
+float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);
 
 //
 // Control vector utils
 //
 
-struct llama_control_vector_data {
+struct common_control_vector_data {
     int n_embd;
 
     // stores data for layers [1, n_layer] where n_layer = data.size() / n_embd
     std::vector<float> data;
 };
 
-struct llama_control_vector_load_info {
+struct common_control_vector_load_info {
     float strength;
 
     std::string fname;
@@ -554,7 +590,7 @@ struct llama_control_vector_load_info {
 
 // Load control vectors, scale each by strength, and add them together.
 // On error, returns {-1, empty}
-llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos);
+common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos);
 
 //
 // Split utils
@@ -573,5 +609,5 @@ void yaml_dump_vector_int (FILE * stream, const char * prop_name, const std
 void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data);
 
 void yaml_dump_non_result_info(
-    FILE * stream, const gpt_params & params, const llama_context * lctx,
+    FILE * stream, const common_params & params, const llama_context * lctx,
     const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
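
For code that consumes cpp/common.h, the substance of this diff is a rename of the gpt_*/llama_* common-layer helpers to a common_* prefix, plus new DRY and XTC sampler parameters and a default n_ctx of 4096. The sketch below is a minimal, hypothetical migration example based only on the declarations shown above; it is not part of the package, the model path and sampler values are placeholders, and cleanup of the returned model/context is omitted.

// Hypothetical migration sketch (not shipped with the package): caller code updated from
// the old gpt_* / llama_* helper names to the common_* names declared in this common.h.
#include "common.h"

#include <cstdio>
#include <vector>

int main() {
    common_params params;                              // was: gpt_params
    params.model = "models/7B/ggml-model-f16.gguf";    // placeholder path (matches DEFAULT_MODEL_PATH)
    params.n_ctx = 4096;                               // context size now defaults to 4096 instead of 0

    // the new DRY / XTC knobs live in common_sampler_params (values here are arbitrary examples)
    params.sparams.dry_multiplier  = 0.8f;             // 0.0f keeps DRY disabled
    params.sparams.xtc_probability = 0.5f;
    params.sparams.xtc_threshold   = 0.1f;

    common_init_result init = common_init_from_params(params); // was: llama_init_from_gpt_params
    if (init.model == nullptr || init.context == nullptr) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    // the tokenization helpers were renamed as well
    std::vector<llama_token> tokens = common_tokenize(init.context, "Hello", /*add_special=*/true);
    for (llama_token t : tokens) {
        printf("%d -> '%s'\n", t, common_token_to_piece(init.context, t).c_str());
    }

    // freeing the context/model is omitted here for brevity
    return 0;
}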