cui-llama.rn 1.2.3 → 1.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/cpp/llama.h CHANGED
@@ -2,6 +2,7 @@
  #define LLAMA_H

  #include "ggml.h"
+ #include "ggml-cpu.h"
  #include "ggml-backend.h"

  #include <stddef.h>
@@ -206,7 +207,7 @@ extern "C" {
  enum llama_split_mode {
  LLAMA_SPLIT_MODE_NONE = 0, // single GPU
  LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
- LLAMA_SPLIT_MODE_ROW = 2, // split rows across GPUs
+ LLAMA_SPLIT_MODE_ROW = 2, // split layers and KV across GPUs, use tensor parallelism if supported
  };

  // TODO: simplify (https://github.com/ggerganov/llama.cpp/pull/9294#pullrequestreview-2286561979)
@@ -218,6 +219,7 @@ extern "C" {

  typedef struct llama_token_data_array {
  // TODO: consider SoA
+ // NOTE: this pointer can be modified by the samplers
  llama_token_data * data;
  size_t size;
  int64_t selected; // this is the index in the data array (i.e. not the token id)
@@ -233,8 +235,11 @@ extern "C" {
  // - token : the token ids of the input (used when embd is NULL)
  // - embd : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
  // - pos : the positions of the respective token in the sequence
+ // (if set to NULL, the token position will be tracked automatically by llama_decode)
  // - seq_id : the sequence to which the respective token belongs
+ // (if set to NULL, the sequence ID will be assumed to be 0)
  // - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output
+ // (if set to NULL, only the logits for last token will be returned)
  //
  typedef struct llama_batch {
  int32_t n_tokens;
@@ -245,15 +250,6 @@ extern "C" {
  int32_t * n_seq_id;
  llama_seq_id ** seq_id;
  int8_t * logits; // TODO: rename this to "output"
-
- // NOTE: helpers for smooth API transition - can be deprecated in the future
- // for future-proof code, use the above fields instead and ignore everything below
- //
- // pos[i] = all_pos_0 + i*all_pos_1
- //
- llama_pos all_pos_0; // used if pos == NULL
- llama_pos all_pos_1; // used if pos == NULL
- llama_seq_id all_seq_id; // used if seq_id == NULL
  } llama_batch;

  enum llama_model_kv_override_type {
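
Note: with the all_pos_0 / all_pos_1 / all_seq_id transition helpers removed from llama_batch, callers have to fill pos, seq_id and logits explicitly. A minimal sketch of what that looks like for a single sequence, assuming the llama_batch_init / llama_batch_free helpers declared further down in llama.h; tokens, n_tokens and n_past are hypothetical caller state:

    llama_batch batch = llama_batch_init(n_tokens, /*embd =*/ 0, /*n_seq_max =*/ 1);
    for (int32_t i = 0; i < n_tokens; ++i) {
        batch.token   [i]    = tokens[i];
        batch.pos     [i]    = n_past + i;            // explicit positions replace all_pos_0/all_pos_1
        batch.n_seq_id[i]    = 1;
        batch.seq_id  [i][0] = 0;                     // explicit sequence id replaces all_seq_id
        batch.logits  [i]    = (i == n_tokens - 1);   // request logits only for the last token
    }
    batch.n_tokens = n_tokens;
    // ... llama_decode(ctx, batch) ...
    llama_batch_free(batch);
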
@@ -280,10 +276,7 @@ extern "C" {
  int32_t n_gpu_layers; // number of layers to store in VRAM
  enum llama_split_mode split_mode; // how to split the model across multiple GPUs

- // main_gpu interpretation depends on split_mode:
- // LLAMA_SPLIT_MODE_NONE: the GPU that is used for the entire model
- // LLAMA_SPLIT_MODE_ROW: the GPU that is used for small tensors and intermediate results
- // LLAMA_SPLIT_MODE_LAYER: ignored
+ // the GPU that is used for the entire model when split_mode is LLAMA_SPLIT_MODE_NONE
  int32_t main_gpu;

  // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
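
Note: main_gpu now only matters when split_mode is LLAMA_SPLIT_MODE_NONE. A hedged sketch of setting these fields, assuming the llama_model_default_params / llama_load_model_from_file entry points from the rest of llama.h:

    llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = 99;                        // offload as many layers as fit
    mparams.split_mode   = LLAMA_SPLIT_MODE_NONE;     // keep the whole model on one device
    mparams.main_gpu     = 0;                         // only consulted for LLAMA_SPLIT_MODE_NONE
    llama_model * model = llama_load_model_from_file("model.gguf", mparams);
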
@@ -434,6 +427,7 @@ extern "C" {
  LLAMA_API bool llama_supports_mmap (void);
  LLAMA_API bool llama_supports_mlock (void);
  LLAMA_API bool llama_supports_gpu_offload(void);
+ LLAMA_API bool llama_supports_rpc (void);

  LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx);
  LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx);
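
Note: llama_supports_rpc() joins the existing capability probes. A small illustrative sketch of gating configuration on them; the n_gpu_layers variable is hypothetical host-app state:

    int32_t n_gpu_layers = 99;
    if (!llama_supports_gpu_offload()) {
        n_gpu_layers = 0;                             // fall back to CPU-only execution
    }
    const bool use_mmap = llama_supports_mmap();
    const bool have_rpc = llama_supports_rpc();       // new in this release: RPC backend availability
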
@@ -776,15 +770,15 @@ extern "C" {
  // Decoding
  //

- // Return batch for single sequence of tokens starting at pos_0
+ // Return batch for single sequence of tokens
+ // The sequence ID will be fixed to 0
+ // The position of the tokens will be tracked automatically by llama_decode
  //
  // NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it
  //
  LLAMA_API struct llama_batch llama_batch_get_one(
  llama_token * tokens,
- int32_t n_tokens,
- llama_pos pos_0,
- llama_seq_id seq_id);
+ int32_t n_tokens);

  // Allocates a batch of tokens on the heap that can hold a maximum of n_tokens
  // Each token can be assigned up to n_seq_max sequence ids
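
Note: migrating to the two-argument llama_batch_get_one means dropping the position and sequence arguments; llama_decode now tracks positions itself and the sequence id is fixed to 0. This mirrors the rn-llama.hpp change further down in this diff:

    // before (1.2.3): position and sequence id passed explicitly
    // llama_decode(ctx, llama_batch_get_one(tokens.data(), (int32_t) tokens.size(), n_past, 0));

    // after (1.2.6): positions tracked by llama_decode, sequence id fixed to 0
    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), (int32_t) tokens.size())) != 0) {
        // handle evaluation failure
    }
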
@@ -897,6 +891,7 @@ extern "C" {
  // Special tokens
  LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
  LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
+ LLAMA_API llama_token llama_token_eot(const struct llama_model * model); // end-of-turn
  LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification
  LLAMA_API llama_token llama_token_sep(const struct llama_model * model); // sentence separator
  LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
@@ -905,11 +900,17 @@ extern "C" {
  LLAMA_API bool llama_add_bos_token(const struct llama_model * model);
  LLAMA_API bool llama_add_eos_token(const struct llama_model * model);

- // Codellama infill tokens
- LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
- LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle
- LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix
- LLAMA_API llama_token llama_token_eot (const struct llama_model * model); // End of infill middle
+ // infill tokens
+ DEPRECATED(LLAMA_API llama_token llama_token_prefix(const struct llama_model * model), "use llama_token_fim_pre instead");
+ DEPRECATED(LLAMA_API llama_token llama_token_middle(const struct llama_model * model), "use llama_token_fim_mid instead");
+ DEPRECATED(LLAMA_API llama_token llama_token_suffix(const struct llama_model * model), "use llama_token_fim_suf instead");
+
+ LLAMA_API llama_token llama_token_fim_pre(const struct llama_model * model);
+ LLAMA_API llama_token llama_token_fim_suf(const struct llama_model * model);
+ LLAMA_API llama_token llama_token_fim_mid(const struct llama_model * model);
+ LLAMA_API llama_token llama_token_fim_pad(const struct llama_model * model);
+ LLAMA_API llama_token llama_token_fim_rep(const struct llama_model * model);
+ LLAMA_API llama_token llama_token_fim_sep(const struct llama_model * model);

  //
  // Tokenization
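
Note: the deprecated Codellama helpers map onto the generic FIM accessors (prefix -> fim_pre, suffix -> fim_suf, middle -> fim_mid). A hedged sketch of assembling a fill-in-the-middle prompt; common_tokenize is assumed from the package's common code, and prefix_text / suffix_text are illustrative:

    std::vector<llama_token> fim_prompt;
    fim_prompt.push_back(llama_token_fim_pre(model));
    for (llama_token t : common_tokenize(ctx, prefix_text, false, false)) fim_prompt.push_back(t);
    fim_prompt.push_back(llama_token_fim_suf(model));
    for (llama_token t : common_tokenize(ctx, suffix_text, false, false)) fim_prompt.push_back(t);
    fim_prompt.push_back(llama_token_fim_mid(model));
    // generation of the middle section stops when the model emits llama_token_eot(model)
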
@@ -1068,12 +1069,13 @@ extern "C" {

  // available samplers:

- LLAMA_API struct llama_sampler * llama_sampler_init_greedy (void);
- LLAMA_API struct llama_sampler * llama_sampler_init_dist (uint32_t seed);
+ LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void);
+ LLAMA_API struct llama_sampler * llama_sampler_init_dist (uint32_t seed);

  /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
  /// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first.
- LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void);
+ DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void),
+ "will be removed in the future (see https://github.com/ggerganov/llama.cpp/pull/9896#discussion_r1800920915)");

  /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
  LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k);
@@ -1084,19 +1086,18 @@ extern "C" {
  /// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
  LLAMA_API struct llama_sampler * llama_sampler_init_min_p (float p, size_t min_keep);

- /// @details XTC sampling as described in https://github.com/oobabooga/text-generation-webui/pull/6335
- LLAMA_API struct llama_sampler * llama_sampler_init_xtc (float xtc_p, float xtc_t, size_t min_keep, uint32_t seed);
-
- /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
- LLAMA_API struct llama_sampler * llama_sampler_init_tail_free (float z, size_t min_keep);
-
  /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
  LLAMA_API struct llama_sampler * llama_sampler_init_typical (float p, size_t min_keep);
+
+ /// #details Updates the logits l_i` = l_i/t. When t <= 0.0f, the maximum logit is kept at it's original value, the rest are set to -inf
  LLAMA_API struct llama_sampler * llama_sampler_init_temp (float t);

  /// @details Dynamic temperature implementation (a.k.a. entropy) described in the paper https://arxiv.org/abs/2309.02772.
  LLAMA_API struct llama_sampler * llama_sampler_init_temp_ext (float t, float delta, float exponent);

+ /// @details XTC sampler as described in https://github.com/oobabooga/text-generation-webui/pull/6335
+ LLAMA_API struct llama_sampler * llama_sampler_init_xtc (float p, float t, size_t min_keep, uint32_t seed);
+
  /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
  /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
  /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
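
Note: tail-free sampling is removed, softmax is deprecated, and XTC now sits after the truncation/temperature samplers with a (p, t, min_keep, seed) signature. A hedged sketch of a sampler chain in the new order, assuming the llama_sampler_chain_* API declared elsewhere in llama.h:

    struct llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
    llama_sampler_chain_add(chain, llama_sampler_init_top_k(40));
    llama_sampler_chain_add(chain, llama_sampler_init_min_p(0.05f, 1));
    llama_sampler_chain_add(chain, llama_sampler_init_temp (0.8f));
    llama_sampler_chain_add(chain, llama_sampler_init_xtc  (0.5f, 0.1f, 1, LLAMA_DEFAULT_SEED));
    llama_sampler_chain_add(chain, llama_sampler_init_dist (LLAMA_DEFAULT_SEED));
    // sample with llama_sampler_sample(chain, ctx, -1); free with llama_sampler_free(chain)
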
@@ -1136,11 +1137,43 @@ extern "C" {
  bool penalize_nl, // consider newlines as a repeatable token
  bool ignore_eos); // ignore the end-of-sequence token

+ /// @details DRY sampler, designed by p-e-w, as described in: https://github.com/oobabooga/text-generation-webui/pull/5677, porting Koboldcpp implementation authored by pi6am: https://github.com/LostRuins/koboldcpp/pull/982
+ LLAMA_API struct llama_sampler * llama_sampler_init_dry(
+ const struct llama_model * model,
+ float dry_multiplier,
+ float dry_base,
+ int32_t dry_allowed_length,
+ int32_t dry_penalty_last_n,
+ const char ** seq_breakers,
+ size_t num_breakers);
+
  LLAMA_API struct llama_sampler * llama_sampler_init_logit_bias(
  int32_t n_vocab,
  int32_t n_logit_bias,
  const llama_logit_bias * logit_bias);

+ // this sampler is meant to be used for fill-in-the-middle infilling
+ // it's supposed to be used after top_k + top_p sampling
+ //
+ // 1. if the sum of the EOG probs times the number of candidates is higher than the sum of the other probs -> pick EOG
+ // 2. combine probs of tokens that have the same prefix
+ //
+ // example:
+ //
+ // - before:
+ // "hel": 0.5
+ // "hell": 0.2
+ // "hello": 0.1
+ // "dummy": 0.1
+ //
+ // - after:
+ // "hel": 0.8
+ // "dummy": 0.1
+ //
+ // 3. discard non-EOG tokens with low prob
+ // 4. if no tokens are left -> pick EOT
+ //
+ LLAMA_API struct llama_sampler * llama_sampler_init_infill(const struct llama_model * model);

  // Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise
  LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl);
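
Note: both new samplers slot into the same chain mechanism as above. A hedged sketch, with the DRY parameters and sequence breakers purely illustrative, the chain API and top-k/top-p constructors assumed from elsewhere in llama.h, and the infill sampler placed after top-k/top-p as its comment recommends (it merges shared-prefix candidates, e.g. {"hel": 0.5, "hell": 0.2, "hello": 0.1} collapses to {"hel": 0.8}):

    const char * breakers[] = { "\n", ":" };          // illustrative sequence breakers
    llama_sampler_chain_add(chain, llama_sampler_init_dry(
        model,
        /*dry_multiplier     =*/ 0.8f,
        /*dry_base           =*/ 1.75f,
        /*dry_allowed_length =*/ 2,
        /*dry_penalty_last_n =*/ -1,
        breakers, /*num_breakers =*/ 2));

    llama_sampler_chain_add(chain, llama_sampler_init_top_k(40));
    llama_sampler_chain_add(chain, llama_sampler_init_top_p(0.9f, 1));
    llama_sampler_chain_add(chain, llama_sampler_init_infill(model));
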
package/cpp/log.cpp CHANGED
@@ -8,10 +8,10 @@
  #include <thread>
  #include <vector>

- int gpt_log_verbosity_thold = LOG_DEFAULT_LLAMA;
+ int common_log_verbosity_thold = LOG_DEFAULT_LLAMA;

- void gpt_log_set_verbosity_thold(int verbosity) {
- gpt_log_verbosity_thold = verbosity;
+ void common_log_set_verbosity_thold(int verbosity) {
+ common_log_verbosity_thold = verbosity;
  }

  #define LOG_COL_DEFAULT "\033[0m"
@@ -29,16 +29,16 @@ static int64_t t_us() {
  }

  // colors
- enum gpt_log_col : int {
- GPT_LOG_COL_DEFAULT = 0,
- GPT_LOG_COL_BOLD,
- GPT_LOG_COL_RED,
- GPT_LOG_COL_GREEN,
- GPT_LOG_COL_YELLOW,
- GPT_LOG_COL_BLUE,
- GPT_LOG_COL_MAGENTA,
- GPT_LOG_COL_CYAN,
- GPT_LOG_COL_WHITE,
+ enum common_log_col : int {
+ COMMON_LOG_COL_DEFAULT = 0,
+ COMMON_LOG_COL_BOLD,
+ COMMON_LOG_COL_RED,
+ COMMON_LOG_COL_GREEN,
+ COMMON_LOG_COL_YELLOW,
+ COMMON_LOG_COL_BLUE,
+ COMMON_LOG_COL_MAGENTA,
+ COMMON_LOG_COL_CYAN,
+ COMMON_LOG_COL_WHITE,
  };

  // disable colors by default
@@ -54,7 +54,7 @@ static std::vector<const char *> g_col = {
  "",
  };

- struct gpt_log_entry {
+ struct common_log_entry {
  enum lm_ggml_log_level level;

  bool prefix;
@@ -71,7 +71,7 @@ struct gpt_log_entry {
  if (!fcur) {
  // stderr displays DBG messages only when their verbosity level is not higher than the threshold
  // these messages will still be logged to a file
- if (level == LM_GGML_LOG_LEVEL_DEBUG && gpt_log_verbosity_thold < LOG_DEFAULT_DEBUG) {
+ if (level == LM_GGML_LOG_LEVEL_DEBUG && common_log_verbosity_thold < LOG_DEFAULT_DEBUG) {
  return;
  }

@@ -86,19 +86,19 @@ struct gpt_log_entry {
  if (timestamp) {
  // [M.s.ms.us]
  fprintf(fcur, "%s%d.%02d.%03d.%03d%s ",
- g_col[GPT_LOG_COL_BLUE],
+ g_col[COMMON_LOG_COL_BLUE],
  (int) (timestamp / 1000000 / 60),
  (int) (timestamp / 1000000 % 60),
  (int) (timestamp / 1000 % 1000),
  (int) (timestamp % 1000),
- g_col[GPT_LOG_COL_DEFAULT]);
+ g_col[COMMON_LOG_COL_DEFAULT]);
  }

  switch (level) {
- case LM_GGML_LOG_LEVEL_INFO: fprintf(fcur, "%sI %s", g_col[GPT_LOG_COL_GREEN], g_col[GPT_LOG_COL_DEFAULT]); break;
- case LM_GGML_LOG_LEVEL_WARN: fprintf(fcur, "%sW %s", g_col[GPT_LOG_COL_MAGENTA], "" ); break;
- case LM_GGML_LOG_LEVEL_ERROR: fprintf(fcur, "%sE %s", g_col[GPT_LOG_COL_RED], "" ); break;
- case LM_GGML_LOG_LEVEL_DEBUG: fprintf(fcur, "%sD %s", g_col[GPT_LOG_COL_YELLOW], "" ); break;
+ case LM_GGML_LOG_LEVEL_INFO: fprintf(fcur, "%sI %s", g_col[COMMON_LOG_COL_GREEN], g_col[COMMON_LOG_COL_DEFAULT]); break;
+ case LM_GGML_LOG_LEVEL_WARN: fprintf(fcur, "%sW %s", g_col[COMMON_LOG_COL_MAGENTA], "" ); break;
+ case LM_GGML_LOG_LEVEL_ERROR: fprintf(fcur, "%sE %s", g_col[COMMON_LOG_COL_RED], "" ); break;
+ case LM_GGML_LOG_LEVEL_DEBUG: fprintf(fcur, "%sD %s", g_col[COMMON_LOG_COL_YELLOW], "" ); break;
  default:
  break;
  }
@@ -107,18 +107,18 @@ struct gpt_log_entry {
  fprintf(fcur, "%s", msg.data());

  if (level == LM_GGML_LOG_LEVEL_WARN || level == LM_GGML_LOG_LEVEL_ERROR || level == LM_GGML_LOG_LEVEL_DEBUG) {
- fprintf(fcur, "%s", g_col[GPT_LOG_COL_DEFAULT]);
+ fprintf(fcur, "%s", g_col[COMMON_LOG_COL_DEFAULT]);
  }

  fflush(fcur);
  }
  };

- struct gpt_log {
+ struct common_log {
  // default capacity - will be expanded if needed
- gpt_log() : gpt_log(256) {}
+ common_log() : common_log(256) {}

- gpt_log(size_t capacity) {
+ common_log(size_t capacity) {
  file = nullptr;
  prefix = false;
  timestamps = false;
@@ -137,7 +137,7 @@ struct gpt_log {
  resume();
  }

- ~gpt_log() {
+ ~common_log() {
  pause();
  if (file) {
  fclose(file);
@@ -158,12 +158,12 @@ private:
  int64_t t_start;

  // ring buffer of entries
- std::vector<gpt_log_entry> entries;
+ std::vector<common_log_entry> entries;
  size_t head;
  size_t tail;

  // worker thread copies into this
- gpt_log_entry cur;
+ common_log_entry cur;

  public:
  void add(enum lm_ggml_log_level level, const char * fmt, va_list args) {
@@ -219,7 +219,7 @@ public:
  tail = (tail + 1) % entries.size();
  if (tail == head) {
  // expand the buffer
- std::vector<gpt_log_entry> new_entries(2*entries.size());
+ std::vector<common_log_entry> new_entries(2*entries.size());

  size_t new_tail = 0;

@@ -320,15 +320,15 @@ public:
  pause();

  if (colors) {
- g_col[GPT_LOG_COL_DEFAULT] = LOG_COL_DEFAULT;
- g_col[GPT_LOG_COL_BOLD] = LOG_COL_BOLD;
- g_col[GPT_LOG_COL_RED] = LOG_COL_RED;
- g_col[GPT_LOG_COL_GREEN] = LOG_COL_GREEN;
- g_col[GPT_LOG_COL_YELLOW] = LOG_COL_YELLOW;
- g_col[GPT_LOG_COL_BLUE] = LOG_COL_BLUE;
- g_col[GPT_LOG_COL_MAGENTA] = LOG_COL_MAGENTA;
- g_col[GPT_LOG_COL_CYAN] = LOG_COL_CYAN;
- g_col[GPT_LOG_COL_WHITE] = LOG_COL_WHITE;
+ g_col[COMMON_LOG_COL_DEFAULT] = LOG_COL_DEFAULT;
+ g_col[COMMON_LOG_COL_BOLD] = LOG_COL_BOLD;
+ g_col[COMMON_LOG_COL_RED] = LOG_COL_RED;
+ g_col[COMMON_LOG_COL_GREEN] = LOG_COL_GREEN;
+ g_col[COMMON_LOG_COL_YELLOW] = LOG_COL_YELLOW;
+ g_col[COMMON_LOG_COL_BLUE] = LOG_COL_BLUE;
+ g_col[COMMON_LOG_COL_MAGENTA] = LOG_COL_MAGENTA;
+ g_col[COMMON_LOG_COL_CYAN] = LOG_COL_CYAN;
+ g_col[COMMON_LOG_COL_WHITE] = LOG_COL_WHITE;
  } else {
  for (size_t i = 0; i < g_col.size(); i++) {
  g_col[i] = "";
@@ -355,47 +355,47 @@ public:
  // public API
  //

- struct gpt_log * gpt_log_init() {
- return new gpt_log;
+ struct common_log * common_log_init() {
+ return new common_log;
  }

- struct gpt_log * gpt_log_main() {
- static struct gpt_log log;
+ struct common_log * common_log_main() {
+ static struct common_log log;

  return &log;
  }

- void gpt_log_pause(struct gpt_log * log) {
+ void common_log_pause(struct common_log * log) {
  log->pause();
  }

- void gpt_log_resume(struct gpt_log * log) {
+ void common_log_resume(struct common_log * log) {
  log->resume();
  }

- void gpt_log_free(struct gpt_log * log) {
+ void common_log_free(struct common_log * log) {
  delete log;
  }

- void gpt_log_add(struct gpt_log * log, enum lm_ggml_log_level level, const char * fmt, ...) {
+ void common_log_add(struct common_log * log, enum lm_ggml_log_level level, const char * fmt, ...) {
  va_list args;
  va_start(args, fmt);
  log->add(level, fmt, args);
  va_end(args);
  }

- void gpt_log_set_file(struct gpt_log * log, const char * file) {
+ void common_log_set_file(struct common_log * log, const char * file) {
  log->set_file(file);
  }

- void gpt_log_set_colors(struct gpt_log * log, bool colors) {
+ void common_log_set_colors(struct common_log * log, bool colors) {
  log->set_colors(colors);
  }

- void gpt_log_set_prefix(struct gpt_log * log, bool prefix) {
+ void common_log_set_prefix(struct common_log * log, bool prefix) {
  log->set_prefix(prefix);
  }

- void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps) {
+ void common_log_set_timestamps(struct common_log * log, bool timestamps) {
  log->set_timestamps(timestamps);
  }
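
Note: every gpt_log_* entry point is renamed one-for-one to common_log_*, with unchanged behavior. A short sketch of driving the renamed API directly, using only the functions shown above; the file name and message are illustrative:

    struct common_log * log = common_log_init();
    common_log_set_file      (log, "session.log");    // optional file sink
    common_log_set_colors    (log, true);
    common_log_set_prefix    (log, true);
    common_log_set_timestamps(log, true);
    common_log_add(log, LM_GGML_LOG_LEVEL_INFO, "model loaded in %d ms\n", 1234);
    common_log_free(log);
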
package/cpp/log.h CHANGED
@@ -14,23 +14,23 @@
  #define LOG_DEFAULT_LLAMA 0

  // needed by the LOG_TMPL macro to avoid computing log arguments if the verbosity lower
- // set via gpt_log_set_verbosity()
- extern int gpt_log_verbosity_thold;
+ // set via common_log_set_verbosity()
+ extern int common_log_verbosity_thold;

- void gpt_log_set_verbosity_thold(int verbosity); // not thread-safe
+ void common_log_set_verbosity_thold(int verbosity); // not thread-safe

- // the gpt_log uses an internal worker thread to print/write log messages
+ // the common_log uses an internal worker thread to print/write log messages
  // when the worker thread is paused, incoming log messages are discarded
- struct gpt_log;
+ struct common_log;

- struct gpt_log * gpt_log_init();
- struct gpt_log * gpt_log_main(); // singleton, automatically destroys itself on exit
- void gpt_log_pause (struct gpt_log * log); // pause the worker thread, not thread-safe
- void gpt_log_resume(struct gpt_log * log); // resume the worker thread, not thread-safe
- void gpt_log_free (struct gpt_log * log);
+ struct common_log * common_log_init();
+ struct common_log * common_log_main(); // singleton, automatically destroys itself on exit
+ void common_log_pause (struct common_log * log); // pause the worker thread, not thread-safe
+ void common_log_resume(struct common_log * log); // resume the worker thread, not thread-safe
+ void common_log_free (struct common_log * log);

  LOG_ATTRIBUTE_FORMAT(3, 4)
- void gpt_log_add(struct gpt_log * log, enum lm_ggml_log_level level, const char * fmt, ...);
+ void common_log_add(struct common_log * log, enum lm_ggml_log_level level, const char * fmt, ...);

  // defaults: file = NULL, colors = false, prefix = false, timestamps = false
  //
@@ -54,10 +54,10 @@ void gpt_log_add(struct gpt_log * log, enum lm_ggml_log_level level, const char
  // D - debug (stderr, V = LOG_DEFAULT_DEBUG)
  //

- void gpt_log_set_file (struct gpt_log * log, const char * file); // not thread-safe
- void gpt_log_set_colors (struct gpt_log * log, bool colors); // not thread-safe
- void gpt_log_set_prefix (struct gpt_log * log, bool prefix); // whether to output prefix to each log
- void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps); // whether to output timestamps in the prefix
+ void common_log_set_file (struct common_log * log, const char * file); // not thread-safe
+ void common_log_set_colors (struct common_log * log, bool colors); // not thread-safe
+ void common_log_set_prefix (struct common_log * log, bool prefix); // whether to output prefix to each log
+ void common_log_set_timestamps(struct common_log * log, bool timestamps); // whether to output timestamps in the prefix

  // helper macros for logging
  // use these to avoid computing log arguments if the verbosity of the log is higher than the threshold
@@ -66,7 +66,7 @@ void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps); // w
  //
  // LOG_DBG("this is a debug message: %d\n", expensive_function());
  //
- // this will avoid calling expensive_function() if LOG_DEFAULT_DEBUG > gpt_log_verbosity_thold
+ // this will avoid calling expensive_function() if LOG_DEFAULT_DEBUG > common_log_verbosity_thold
  //

@@ -98,8 +98,8 @@ void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps); // w
  #define LOG_TMPL(level, verbosity, ...) \
  do { \
- if ((verbosity) <= gpt_log_verbosity_thold) { \
- gpt_log_add(gpt_log_main(), (level), __VA_ARGS__); \
+ if ((verbosity) <= common_log_verbosity_thold) { \
+ common_log_add(common_log_main(), (level), __VA_ARGS__); \
  } \
  } while (0)

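Note: LOG_TMPL only evaluates its arguments when the verbosity passes the renamed common_log_verbosity_thold. A hedged sketch of caller-side usage; LOG_DBG is assumed to expand to LOG_TMPL at LOG_DEFAULT_DEBUG verbosity as the header's own comment suggests, and expensive_function / n_prompt are illustrative:

    common_log_set_verbosity_thold(LOG_DEFAULT_DEBUG);               // let debug messages through
    common_log_set_timestamps(common_log_main(), true);

    LOG_TMPL(LM_GGML_LOG_LEVEL_INFO, 0, "prompt has %d tokens\n", n_prompt);
    LOG_DBG("this is a debug message: %d\n", expensive_function());  // arguments skipped below the threshold
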
package/cpp/rn-llama.hpp CHANGED
@@ -117,7 +117,7 @@ static size_t find_partial_stop_string(const std::string &stop,
  // format incomplete utf-8 multibyte character for output
  static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token)
  {
- std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token);
+ std::string out = token == -1 ? "" : common_token_to_piece(ctx, token);
  // if the size is 1 and first bit is 1, meaning it's a partial character
  // (size > 1 meaning it's already a known token)
  if (out.size() == 1 && (out[0] & 0x80) == 0x80)
@@ -136,7 +136,7 @@ static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
  std::string ret;
  for (; begin != end; ++begin)
  {
- ret += llama_token_to_piece(ctx, *begin);
+ ret += common_token_to_piece(ctx, *begin);
  }
  return ret;
  }
@@ -157,11 +157,11 @@ struct llama_rn_context

  std::vector<llama_token> embd;

- gpt_params params;
+ common_params params;

  llama_model *model = nullptr;
  llama_context *ctx = nullptr;
- gpt_sampler *ctx_sampling = nullptr;
+ common_sampler *ctx_sampling = nullptr;

  int n_ctx;
@@ -186,7 +186,7 @@ struct llama_rn_context
  }
  if (ctx_sampling != nullptr)
  {
- gpt_sampler_free(ctx_sampling);
+ common_sampler_free(ctx_sampling);
  }
  }

@@ -213,16 +213,16 @@ struct llama_rn_context

  bool initSampling() {
  if (ctx_sampling != nullptr) {
- gpt_sampler_free(ctx_sampling);
+ common_sampler_free(ctx_sampling);
  }
- ctx_sampling = gpt_sampler_init(model, params.sparams);
+ ctx_sampling = common_sampler_init(model, params.sparams);
  return ctx_sampling != nullptr;
  }

- bool loadModel(gpt_params &params_)
+ bool loadModel(common_params &params_)
  {
  params = params_;
- llama_init_result result = llama_init_from_gpt_params(params);
+ common_init_result result = common_init_from_params(params);
  model = result.model;
  ctx = result.context;
  if (model == nullptr)
@@ -268,7 +268,7 @@ struct llama_rn_context

  void loadPrompt()
  {
- std::vector<llama_token> prompt_tokens = ::llama_tokenize(ctx, params.prompt, true, true);
+ std::vector<llama_token> prompt_tokens = ::common_tokenize(ctx, params.prompt, true, true);
  num_prompt_tokens = prompt_tokens.size();

  // LOG tokens
@@ -302,7 +302,7 @@ struct llama_rn_context
  // push the prompt into the sampling context (do not apply grammar)
  for (auto & token : prompt_tokens)
  {
- gpt_sampler_accept(ctx_sampling, token, false);
+ common_sampler_accept(ctx_sampling, token, false);
  }
  // compare the evaluated prompt with the new prompt
  n_past = params.embedding? 0 : common_part(embd, prompt_tokens);
@@ -375,8 +375,8 @@ struct llama_rn_context
  {
  n_eval = params.n_batch;
  }
- if (llama_decode(ctx, llama_batch_get_one(&embd[n_past], n_eval, n_past, 0)))
- {
+ if (llama_decode(ctx, llama_batch_get_one(&embd[n_past], n_eval)))
+ {
  LOG_ERROR("failed to eval, n_eval: %d, n_past: %d, n_threads: %d, embd: %s",
  n_eval,
  n_past,
@@ -408,18 +408,19 @@ struct llama_rn_context
  std::vector<llama_token_data> candidates;
  candidates.reserve(llama_n_vocab(model));

- result.tok = gpt_sampler_sample(ctx_sampling, ctx, -1);
+ result.tok = common_sampler_sample(ctx_sampling, ctx, -1);

- llama_token_data_array cur_p = *gpt_sampler_get_candidates(ctx_sampling);
+ llama_token_data_array cur_p = *common_sampler_get_candidates(ctx_sampling);

  const int32_t n_probs = params.sparams.n_probs;

-
- if (params.sparams.temp <= 0 && n_probs > 0)
+ // deprecated
+ /*if (params.sparams.temp <= 0 && n_probs > 0)
  {
  // For llama_sample_token_greedy we need to sort candidates
  llama_sampler_init_softmax();
- }
+
+ }*/


  for (size_t i = 0; i < std::min(cur_p.size, (size_t)n_probs); ++i)
@@ -427,7 +428,7 @@ struct llama_rn_context
  result.probs.push_back({cur_p.data[i].id, cur_p.data[i].p});
  }

- gpt_sampler_accept(ctx_sampling, result.tok, true);
+ common_sampler_accept(ctx_sampling, result.tok, true);
  if (tg) {
  num_tokens_predicted++;
  }
@@ -487,7 +488,7 @@ struct llama_rn_context
  {
  const completion_token_output token_with_probs = nextToken();

- const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(ctx, token_with_probs.tok);
+ const std::string token_text = token_with_probs.tok == -1 ? "" : common_token_to_piece(ctx, token_with_probs.tok);
  generated_text += token_text;

  if (params.sparams.n_probs > 0)
@@ -528,7 +529,7 @@ struct llama_rn_context
  }

  LOG_VERBOSE("next token, token: %s, token_text: %s, has_next_token: %d, n_remain: %d, num_tokens_predicted: %d, stopped_eos: %d, stopped_word: %d, stopped_limit: %d, stopping_word: %s",
- llama_token_to_piece(ctx, token_with_probs.tok),
+ common_token_to_piece(ctx, token_with_probs.tok),
  tokens_to_output_formatted_string(ctx, token_with_probs.tok).c_str(),
  has_next_token,
  n_remain,
@@ -562,7 +563,7 @@ struct llama_rn_context
  return std::vector<float>(n_embd, 0.0f);
  }
  std::vector<float> embedding(data, data + n_embd), out(data, data + n_embd);
- llama_embd_normalize(embedding.data(), out.data(), n_embd, params.embd_normalize);
+ common_embd_normalize(embedding.data(), out.data(), n_embd, params.embd_normalize);
  return out;
  }
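
Note: for code embedding rn-llama, the renames are mechanical: the gpt_* common helpers become common_* with the same call shapes. A hedged sketch of a minimal load-and-sample path written against the renamed helpers, with parameter field names as they appear in this diff:

    common_params params;
    params.model  = "model.gguf";
    params.prompt = "Hello";

    common_init_result init = common_init_from_params(params);          // was llama_init_from_gpt_params
    llama_model   * model = init.model;
    llama_context * ctx   = init.context;

    common_sampler * smpl = common_sampler_init(model, params.sparams); // was gpt_sampler_init

    std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, true, true); // was ::llama_tokenize
    llama_decode(ctx, llama_batch_get_one(tokens.data(), (int32_t) tokens.size()));

    llama_token tok = common_sampler_sample(smpl, ctx, -1);             // was gpt_sampler_sample
    common_sampler_accept(smpl, tok, true);
    std::string piece = common_token_to_piece(ctx, tok);                // was llama_token_to_piece

    common_sampler_free(smpl);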