cui-llama.rn 1.2.2 → 1.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/cpp/llama.h CHANGED
@@ -218,6 +218,7 @@ extern "C" {
 
     typedef struct llama_token_data_array {
         // TODO: consider SoA
+        // NOTE: this pointer can be modified by the samplers
        llama_token_data * data;
        size_t size;
        int64_t selected; // this is the index in the data array (i.e. not the token id)
@@ -233,8 +234,11 @@ extern "C" {
    // - token  : the token ids of the input (used when embd is NULL)
    // - embd   : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
    // - pos    : the positions of the respective token in the sequence
+   //            (if set to NULL, the token position will be tracked automatically by llama_decode)
    // - seq_id : the sequence to which the respective token belongs
+   //            (if set to NULL, the sequence ID will be assumed to be 0)
    // - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output
+   //            (if set to NULL, only the logits for last token will be returned)
    //
    typedef struct llama_batch {
        int32_t n_tokens;
@@ -245,15 +249,6 @@ extern "C" {
        int32_t * n_seq_id;
        llama_seq_id ** seq_id;
        int8_t * logits; // TODO: rename this to "output"
-
-       // NOTE: helpers for smooth API transition - can be deprecated in the future
-       // for future-proof code, use the above fields instead and ignore everything below
-       //
-       // pos[i] = all_pos_0 + i*all_pos_1
-       //
-       llama_pos    all_pos_0;  // used if pos == NULL
-       llama_pos    all_pos_1;  // used if pos == NULL
-       llama_seq_id all_seq_id; // used if seq_id == NULL
    } llama_batch;
 
    enum llama_model_kv_override_type {
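With the all_pos_0 / all_pos_1 / all_seq_id transition helpers removed, code that fills llama_batch by hand can rely on the NULL defaults documented in the comments above. A minimal sketch follows; it assumes the token/embd/pos members of llama_batch declared in the unchanged part of this header, and the helper name is purely illustrative:

    // sketch: a single-sequence batch that leans on the new NULL defaults
    llama_batch make_simple_batch(llama_token * tokens, int32_t n_tokens) {
        llama_batch batch = {};
        batch.n_tokens = n_tokens;
        batch.token    = tokens;  // token ids are used because embd stays NULL
        batch.pos      = nullptr; // positions tracked automatically by llama_decode
        batch.n_seq_id = nullptr;
        batch.seq_id   = nullptr; // sequence ID assumed to be 0
        batch.logits   = nullptr; // only the logits for the last token are returned
        return batch;
    }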
@@ -434,6 +429,7 @@ extern "C" {
    LLAMA_API bool llama_supports_mmap       (void);
    LLAMA_API bool llama_supports_mlock      (void);
    LLAMA_API bool llama_supports_gpu_offload(void);
+   LLAMA_API bool llama_supports_rpc        (void);
 
    LLAMA_API uint32_t llama_n_ctx      (const struct llama_context * ctx);
    LLAMA_API uint32_t llama_n_batch    (const struct llama_context * ctx);
@@ -776,15 +772,15 @@ extern "C" {
    // Decoding
    //
 
-   // Return batch for single sequence of tokens starting at pos_0
+   // Return batch for single sequence of tokens
+   // The sequence ID will be fixed to 0
+   // The position of the tokens will be tracked automatically by llama_decode
    //
    // NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it
    //
    LLAMA_API struct llama_batch llama_batch_get_one(
            llama_token * tokens,
-           int32_t n_tokens,
-           llama_pos pos_0,
-           llama_seq_id seq_id);
+           int32_t n_tokens);
 
    // Allocates a batch of tokens on the heap that can hold a maximum of n_tokens
    // Each token can be assigned up to n_seq_max sequence ids
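Call sites migrating from 1.2.2 now pass only the token pointer and count; the bookkeeping that used to come from pos_0 and seq_id happens inside llama_decode. A hedged sketch of the updated decode call, mirroring the change made to rn-llama.hpp later in this diff (variable names are illustrative):

    // before (1.2.2): llama_batch_get_one(&embd[n_past], n_eval, n_past, 0)
    // after  (1.2.4): positions are tracked by llama_decode, sequence ID is fixed to 0
    if (llama_decode(ctx, llama_batch_get_one(&embd[n_past], n_eval)) != 0) {
        // handle the decode failure (application-specific)
    }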
@@ -897,6 +893,7 @@ extern "C" {
    // Special tokens
    LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
    LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
+   LLAMA_API llama_token llama_token_eot(const struct llama_model * model); // end-of-turn
    LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification
    LLAMA_API llama_token llama_token_sep(const struct llama_model * model); // sentence separator
    LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
@@ -905,11 +902,17 @@ extern "C" {
    LLAMA_API bool llama_add_bos_token(const struct llama_model * model);
    LLAMA_API bool llama_add_eos_token(const struct llama_model * model);
 
-   // Codellama infill tokens
-   LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
-   LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle
-   LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix
-   LLAMA_API llama_token llama_token_eot   (const struct llama_model * model); // End of infill middle
+   // infill tokens
+   DEPRECATED(LLAMA_API llama_token llama_token_prefix(const struct llama_model * model), "use llama_token_fim_pre instead");
+   DEPRECATED(LLAMA_API llama_token llama_token_middle(const struct llama_model * model), "use llama_token_fim_mid instead");
+   DEPRECATED(LLAMA_API llama_token llama_token_suffix(const struct llama_model * model), "use llama_token_fim_suf instead");
+
+   LLAMA_API llama_token llama_token_fim_pre(const struct llama_model * model);
+   LLAMA_API llama_token llama_token_fim_suf(const struct llama_model * model);
+   LLAMA_API llama_token llama_token_fim_mid(const struct llama_model * model);
+   LLAMA_API llama_token llama_token_fim_pad(const struct llama_model * model);
+   LLAMA_API llama_token llama_token_fim_rep(const struct llama_model * model);
+   LLAMA_API llama_token llama_token_fim_sep(const struct llama_model * model);
 
    //
    // Tokenization
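The deprecated prefix/middle/suffix getters are superseded by the llama_token_fim_* accessors. Below is a sketch of assembling a fill-in-the-middle prompt with the new getters; the prefix-suffix-middle layout and the use of common_tokenize (the helper this package uses elsewhere in this diff) are assumptions rather than something this header prescribes, and models without FIM support may not define these tokens:

    // sketch: "<fim_pre> prefix <fim_suf> suffix <fim_mid>" token sequence (illustrative layout)
    std::vector<llama_token> build_fim_prompt(llama_context * ctx, const llama_model * model,
                                              const std::string & prefix, const std::string & suffix) {
        std::vector<llama_token> out;
        out.push_back(llama_token_fim_pre(model));
        const auto pre = common_tokenize(ctx, prefix, /*add_special*/ false, /*parse_special*/ false);
        out.insert(out.end(), pre.begin(), pre.end());
        out.push_back(llama_token_fim_suf(model));
        const auto suf = common_tokenize(ctx, suffix, false, false);
        out.insert(out.end(), suf.begin(), suf.end());
        out.push_back(llama_token_fim_mid(model)); // generation then fills in the middle part
        return out;
    }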
@@ -1068,12 +1071,13 @@ extern "C" {
 
    // available samplers:
 
-   LLAMA_API struct llama_sampler * llama_sampler_init_greedy (void);
-   LLAMA_API struct llama_sampler * llama_sampler_init_dist   (uint32_t seed);
+   LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void);
+   LLAMA_API struct llama_sampler * llama_sampler_init_dist  (uint32_t seed);
 
    /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
    /// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first.
-   LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void);
+   DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void),
+       "will be removed in the future (see https://github.com/ggerganov/llama.cpp/pull/9896#discussion_r1800920915)");
 
    /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
    LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k);
@@ -1092,11 +1096,16 @@ extern "C" {
 
    /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
    LLAMA_API struct llama_sampler * llama_sampler_init_typical (float p, size_t min_keep);
+
+   /// #details Updates the logits l_i` = l_i/t. When t <= 0.0f, the maximum logit is kept at it's original value, the rest are set to -inf
    LLAMA_API struct llama_sampler * llama_sampler_init_temp (float t);
 
    /// @details Dynamic temperature implementation (a.k.a. entropy) described in the paper https://arxiv.org/abs/2309.02772.
    LLAMA_API struct llama_sampler * llama_sampler_init_temp_ext (float t, float delta, float exponent);
 
+   /// @details XTC sampler as described in https://github.com/oobabooga/text-generation-webui/pull/6335
+   LLAMA_API struct llama_sampler * llama_sampler_init_xtc (float p, float t, size_t min_keep, uint32_t seed);
+
    /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
    /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
    /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
@@ -1141,6 +1150,28 @@ extern "C" {
            int32_t n_logit_bias,
            const llama_logit_bias * logit_bias);
 
+   // this sampler is meant to be used for fill-in-the-middle infilling
+   // it's supposed to be used after top_k + top_p sampling
+   //
+   // 1. if the sum of the EOG probs times the number of candidates is higher than the sum of the other probs -> pick EOG
+   // 2. combine probs of tokens that have the same prefix
+   //
+   // example:
+   //
+   // - before:
+   //   "hel":   0.5
+   //   "hell":  0.2
+   //   "hello": 0.1
+   //   "dummy": 0.1
+   //
+   // - after:
+   //   "hel":   0.8
+   //   "dummy": 0.1
+   //
+   // 3. discard non-EOG tokens with low prob
+   // 4. if no tokens are left -> pick EOT
+   //
+   LLAMA_API struct llama_sampler * llama_sampler_init_infill(const struct llama_model * model);
 
    // Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise
    LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl);
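Per the comment above, llama_sampler_init_infill is intended to run after top-k/top-p. A minimal sketch of such a chain; the llama_sampler_chain_* helpers and llama_sampler_init_top_p are assumed from the unchanged parts of this header, and the parameter values are arbitrary:

    // sketch: top_k -> top_p -> infill -> dist, as suggested for fill-in-the-middle generation
    llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
    llama_sampler_chain_add(chain, llama_sampler_init_top_k(40));
    llama_sampler_chain_add(chain, llama_sampler_init_top_p(0.95f, /*min_keep*/ 1));
    llama_sampler_chain_add(chain, llama_sampler_init_infill(model));
    llama_sampler_chain_add(chain, llama_sampler_init_dist(/*seed*/ 1234));

    llama_token tok = llama_sampler_sample(chain, ctx, /*idx*/ -1);
    llama_sampler_free(chain);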
package/cpp/log.cpp CHANGED
@@ -8,10 +8,10 @@
 #include <thread>
 #include <vector>
 
-int gpt_log_verbosity_thold = LOG_DEFAULT_LLAMA;
+int common_log_verbosity_thold = LOG_DEFAULT_LLAMA;
 
-void gpt_log_set_verbosity_thold(int verbosity) {
-    gpt_log_verbosity_thold = verbosity;
+void common_log_set_verbosity_thold(int verbosity) {
+    common_log_verbosity_thold = verbosity;
 }
 
 #define LOG_COL_DEFAULT "\033[0m"
@@ -29,16 +29,16 @@ static int64_t t_us() {
 }
 
 // colors
-enum gpt_log_col : int {
-    GPT_LOG_COL_DEFAULT = 0,
-    GPT_LOG_COL_BOLD,
-    GPT_LOG_COL_RED,
-    GPT_LOG_COL_GREEN,
-    GPT_LOG_COL_YELLOW,
-    GPT_LOG_COL_BLUE,
-    GPT_LOG_COL_MAGENTA,
-    GPT_LOG_COL_CYAN,
-    GPT_LOG_COL_WHITE,
+enum common_log_col : int {
+    COMMON_LOG_COL_DEFAULT = 0,
+    COMMON_LOG_COL_BOLD,
+    COMMON_LOG_COL_RED,
+    COMMON_LOG_COL_GREEN,
+    COMMON_LOG_COL_YELLOW,
+    COMMON_LOG_COL_BLUE,
+    COMMON_LOG_COL_MAGENTA,
+    COMMON_LOG_COL_CYAN,
+    COMMON_LOG_COL_WHITE,
 };
 
 // disable colors by default
@@ -54,7 +54,7 @@ static std::vector<const char *> g_col = {
    "",
 };
 
-struct gpt_log_entry {
+struct common_log_entry {
    enum lm_ggml_log_level level;
 
    bool prefix;
@@ -71,7 +71,7 @@ struct gpt_log_entry {
        if (!fcur) {
            // stderr displays DBG messages only when their verbosity level is not higher than the threshold
            // these messages will still be logged to a file
-           if (level == LM_GGML_LOG_LEVEL_DEBUG && gpt_log_verbosity_thold < LOG_DEFAULT_DEBUG) {
+           if (level == LM_GGML_LOG_LEVEL_DEBUG && common_log_verbosity_thold < LOG_DEFAULT_DEBUG) {
                return;
            }
 
@@ -86,19 +86,19 @@ struct gpt_log_entry {
        if (timestamp) {
            // [M.s.ms.us]
            fprintf(fcur, "%s%d.%02d.%03d.%03d%s ",
-               g_col[GPT_LOG_COL_BLUE],
+               g_col[COMMON_LOG_COL_BLUE],
                (int) (timestamp / 1000000 / 60),
                (int) (timestamp / 1000000 % 60),
                (int) (timestamp / 1000 % 1000),
                (int) (timestamp % 1000),
-               g_col[GPT_LOG_COL_DEFAULT]);
+               g_col[COMMON_LOG_COL_DEFAULT]);
        }
 
        switch (level) {
-           case LM_GGML_LOG_LEVEL_INFO:  fprintf(fcur, "%sI %s", g_col[GPT_LOG_COL_GREEN],   g_col[GPT_LOG_COL_DEFAULT]); break;
-           case LM_GGML_LOG_LEVEL_WARN:  fprintf(fcur, "%sW %s", g_col[GPT_LOG_COL_MAGENTA], ""                        ); break;
-           case LM_GGML_LOG_LEVEL_ERROR: fprintf(fcur, "%sE %s", g_col[GPT_LOG_COL_RED],     ""                        ); break;
-           case LM_GGML_LOG_LEVEL_DEBUG: fprintf(fcur, "%sD %s", g_col[GPT_LOG_COL_YELLOW],  ""                        ); break;
+           case LM_GGML_LOG_LEVEL_INFO:  fprintf(fcur, "%sI %s", g_col[COMMON_LOG_COL_GREEN],   g_col[COMMON_LOG_COL_DEFAULT]); break;
+           case LM_GGML_LOG_LEVEL_WARN:  fprintf(fcur, "%sW %s", g_col[COMMON_LOG_COL_MAGENTA], ""                           ); break;
+           case LM_GGML_LOG_LEVEL_ERROR: fprintf(fcur, "%sE %s", g_col[COMMON_LOG_COL_RED],     ""                           ); break;
+           case LM_GGML_LOG_LEVEL_DEBUG: fprintf(fcur, "%sD %s", g_col[COMMON_LOG_COL_YELLOW],  ""                           ); break;
            default:
                break;
        }
@@ -107,18 +107,18 @@ struct gpt_log_entry {
        fprintf(fcur, "%s", msg.data());
 
        if (level == LM_GGML_LOG_LEVEL_WARN || level == LM_GGML_LOG_LEVEL_ERROR || level == LM_GGML_LOG_LEVEL_DEBUG) {
-           fprintf(fcur, "%s", g_col[GPT_LOG_COL_DEFAULT]);
+           fprintf(fcur, "%s", g_col[COMMON_LOG_COL_DEFAULT]);
        }
 
        fflush(fcur);
    }
 };
 
-struct gpt_log {
+struct common_log {
    // default capacity - will be expanded if needed
-   gpt_log() : gpt_log(256) {}
+   common_log() : common_log(256) {}
 
-   gpt_log(size_t capacity) {
+   common_log(size_t capacity) {
        file = nullptr;
        prefix = false;
        timestamps = false;
@@ -137,7 +137,7 @@ struct gpt_log {
        resume();
    }
 
-   ~gpt_log() {
+   ~common_log() {
        pause();
        if (file) {
            fclose(file);
@@ -158,12 +158,12 @@ private:
    int64_t t_start;
 
    // ring buffer of entries
-   std::vector<gpt_log_entry> entries;
+   std::vector<common_log_entry> entries;
    size_t head;
    size_t tail;
 
    // worker thread copies into this
-   gpt_log_entry cur;
+   common_log_entry cur;
 
 public:
    void add(enum lm_ggml_log_level level, const char * fmt, va_list args) {
@@ -219,7 +219,7 @@ public:
        tail = (tail + 1) % entries.size();
        if (tail == head) {
            // expand the buffer
-           std::vector<gpt_log_entry> new_entries(2*entries.size());
+           std::vector<common_log_entry> new_entries(2*entries.size());
 
            size_t new_tail = 0;
 
@@ -320,15 +320,15 @@ public:
        pause();
 
        if (colors) {
-           g_col[GPT_LOG_COL_DEFAULT] = LOG_COL_DEFAULT;
-           g_col[GPT_LOG_COL_BOLD]    = LOG_COL_BOLD;
-           g_col[GPT_LOG_COL_RED]     = LOG_COL_RED;
-           g_col[GPT_LOG_COL_GREEN]   = LOG_COL_GREEN;
-           g_col[GPT_LOG_COL_YELLOW]  = LOG_COL_YELLOW;
-           g_col[GPT_LOG_COL_BLUE]    = LOG_COL_BLUE;
-           g_col[GPT_LOG_COL_MAGENTA] = LOG_COL_MAGENTA;
-           g_col[GPT_LOG_COL_CYAN]    = LOG_COL_CYAN;
-           g_col[GPT_LOG_COL_WHITE]   = LOG_COL_WHITE;
+           g_col[COMMON_LOG_COL_DEFAULT] = LOG_COL_DEFAULT;
+           g_col[COMMON_LOG_COL_BOLD]    = LOG_COL_BOLD;
+           g_col[COMMON_LOG_COL_RED]     = LOG_COL_RED;
+           g_col[COMMON_LOG_COL_GREEN]   = LOG_COL_GREEN;
+           g_col[COMMON_LOG_COL_YELLOW]  = LOG_COL_YELLOW;
+           g_col[COMMON_LOG_COL_BLUE]    = LOG_COL_BLUE;
+           g_col[COMMON_LOG_COL_MAGENTA] = LOG_COL_MAGENTA;
+           g_col[COMMON_LOG_COL_CYAN]    = LOG_COL_CYAN;
+           g_col[COMMON_LOG_COL_WHITE]   = LOG_COL_WHITE;
        } else {
            for (size_t i = 0; i < g_col.size(); i++) {
                g_col[i] = "";
@@ -355,47 +355,47 @@ public:
 // public API
 //
 
-struct gpt_log * gpt_log_init() {
-    return new gpt_log;
+struct common_log * common_log_init() {
+    return new common_log;
 }
 
-struct gpt_log * gpt_log_main() {
-    static struct gpt_log log;
+struct common_log * common_log_main() {
+    static struct common_log log;
 
    return &log;
 }
 
-void gpt_log_pause(struct gpt_log * log) {
+void common_log_pause(struct common_log * log) {
    log->pause();
 }
 
-void gpt_log_resume(struct gpt_log * log) {
+void common_log_resume(struct common_log * log) {
    log->resume();
 }
 
-void gpt_log_free(struct gpt_log * log) {
+void common_log_free(struct common_log * log) {
    delete log;
 }
 
-void gpt_log_add(struct gpt_log * log, enum lm_ggml_log_level level, const char * fmt, ...) {
+void common_log_add(struct common_log * log, enum lm_ggml_log_level level, const char * fmt, ...) {
    va_list args;
    va_start(args, fmt);
    log->add(level, fmt, args);
    va_end(args);
 }
 
-void gpt_log_set_file(struct gpt_log * log, const char * file) {
+void common_log_set_file(struct common_log * log, const char * file) {
    log->set_file(file);
 }
 
-void gpt_log_set_colors(struct gpt_log * log, bool colors) {
+void common_log_set_colors(struct common_log * log, bool colors) {
    log->set_colors(colors);
 }
 
-void gpt_log_set_prefix(struct gpt_log * log, bool prefix) {
+void common_log_set_prefix(struct common_log * log, bool prefix) {
    log->set_prefix(prefix);
 }
 
-void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps) {
+void common_log_set_timestamps(struct common_log * log, bool timestamps) {
    log->set_timestamps(timestamps);
 }
package/cpp/log.h CHANGED
@@ -14,23 +14,23 @@
 #define LOG_DEFAULT_LLAMA 0
 
 // needed by the LOG_TMPL macro to avoid computing log arguments if the verbosity lower
-// set via gpt_log_set_verbosity()
-extern int gpt_log_verbosity_thold;
+// set via common_log_set_verbosity()
+extern int common_log_verbosity_thold;
 
-void gpt_log_set_verbosity_thold(int verbosity); // not thread-safe
+void common_log_set_verbosity_thold(int verbosity); // not thread-safe
 
-// the gpt_log uses an internal worker thread to print/write log messages
+// the common_log uses an internal worker thread to print/write log messages
 // when the worker thread is paused, incoming log messages are discarded
-struct gpt_log;
+struct common_log;
 
-struct gpt_log * gpt_log_init();
-struct gpt_log * gpt_log_main(); // singleton, automatically destroys itself on exit
-void gpt_log_pause (struct gpt_log * log); // pause the worker thread, not thread-safe
-void gpt_log_resume(struct gpt_log * log); // resume the worker thread, not thread-safe
-void gpt_log_free  (struct gpt_log * log);
+struct common_log * common_log_init();
+struct common_log * common_log_main(); // singleton, automatically destroys itself on exit
+void common_log_pause (struct common_log * log); // pause the worker thread, not thread-safe
+void common_log_resume(struct common_log * log); // resume the worker thread, not thread-safe
+void common_log_free  (struct common_log * log);
 
 LOG_ATTRIBUTE_FORMAT(3, 4)
-void gpt_log_add(struct gpt_log * log, enum lm_ggml_log_level level, const char * fmt, ...);
+void common_log_add(struct common_log * log, enum lm_ggml_log_level level, const char * fmt, ...);
 
 // defaults: file = NULL, colors = false, prefix = false, timestamps = false
 //
@@ -54,10 +54,10 @@ void gpt_log_add(struct gpt_log * log, enum lm_ggml_log_level level, const char
 // D - debug (stderr, V = LOG_DEFAULT_DEBUG)
 //
 
-void gpt_log_set_file      (struct gpt_log * log, const char * file); // not thread-safe
-void gpt_log_set_colors    (struct gpt_log * log, bool colors);       // not thread-safe
-void gpt_log_set_prefix    (struct gpt_log * log, bool prefix);       // whether to output prefix to each log
-void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps);   // whether to output timestamps in the prefix
+void common_log_set_file      (struct common_log * log, const char * file); // not thread-safe
+void common_log_set_colors    (struct common_log * log, bool colors);       // not thread-safe
+void common_log_set_prefix    (struct common_log * log, bool prefix);       // whether to output prefix to each log
+void common_log_set_timestamps(struct common_log * log, bool timestamps);   // whether to output timestamps in the prefix
 
 // helper macros for logging
 // use these to avoid computing log arguments if the verbosity of the log is higher than the threshold
@@ -66,7 +66,7 @@ void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps); // w
 //
 // LOG_DBG("this is a debug message: %d\n", expensive_function());
 //
-// this will avoid calling expensive_function() if LOG_DEFAULT_DEBUG > gpt_log_verbosity_thold
+// this will avoid calling expensive_function() if LOG_DEFAULT_DEBUG > common_log_verbosity_thold
 //
 
 
@@ -98,8 +98,8 @@ void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps); // w
 
 #define LOG_TMPL(level, verbosity, ...) \
    do { \
-       if ((verbosity) <= gpt_log_verbosity_thold) { \
-           gpt_log_add(gpt_log_main(), (level), __VA_ARGS__); \
+       if ((verbosity) <= common_log_verbosity_thold) { \
+           common_log_add(common_log_main(), (level), __VA_ARGS__); \
        } \
    } while (0)
 
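For downstream code affected by the gpt_log to common_log rename, a minimal usage sketch built only from the declarations above (values are illustrative):

    // sketch: configure the singleton logger through the renamed API
    common_log * log = common_log_main();              // singleton, destroyed on exit
    common_log_set_colors(log, true);                  // not thread-safe; configure before logging starts
    common_log_set_timestamps(log, true);
    common_log_set_verbosity_thold(LOG_DEFAULT_LLAMA);

    common_log_add(log, LM_GGML_LOG_LEVEL_INFO, "model loaded in %d ms\n", 1234);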
package/cpp/rn-llama.hpp CHANGED
@@ -117,7 +117,7 @@ static size_t find_partial_stop_string(const std::string &stop,
 // format incomplete utf-8 multibyte character for output
 static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token)
 {
-    std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token);
+    std::string out = token == -1 ? "" : common_token_to_piece(ctx, token);
    // if the size is 1 and first bit is 1, meaning it's a partial character
    // (size > 1 meaning it's already a known token)
    if (out.size() == 1 && (out[0] & 0x80) == 0x80)
@@ -136,7 +136,7 @@ static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
    std::string ret;
    for (; begin != end; ++begin)
    {
-       ret += llama_token_to_piece(ctx, *begin);
+       ret += common_token_to_piece(ctx, *begin);
    }
    return ret;
 }
@@ -157,11 +157,11 @@ struct llama_rn_context
 
    std::vector<llama_token> embd;
 
-   gpt_params params;
+   common_params params;
 
    llama_model *model = nullptr;
    llama_context *ctx = nullptr;
-   gpt_sampler *ctx_sampling = nullptr;
+   common_sampler *ctx_sampling = nullptr;
 
    int n_ctx;
 
@@ -186,7 +186,7 @@ struct llama_rn_context
        }
        if (ctx_sampling != nullptr)
        {
-           gpt_sampler_free(ctx_sampling);
+           common_sampler_free(ctx_sampling);
        }
    }
 
@@ -213,16 +213,16 @@ struct llama_rn_context
 
    bool initSampling() {
        if (ctx_sampling != nullptr) {
-           gpt_sampler_free(ctx_sampling);
+           common_sampler_free(ctx_sampling);
        }
-       ctx_sampling = gpt_sampler_init(model, params.sparams);
+       ctx_sampling = common_sampler_init(model, params.sparams);
        return ctx_sampling != nullptr;
    }
 
-   bool loadModel(gpt_params &params_)
+   bool loadModel(common_params &params_)
    {
        params = params_;
-       llama_init_result result = llama_init_from_gpt_params(params);
+       common_init_result result = common_init_from_params(params);
        model = result.model;
        ctx = result.context;
        if (model == nullptr)
@@ -268,7 +268,7 @@ struct llama_rn_context
 
    void loadPrompt()
    {
-       std::vector<llama_token> prompt_tokens = ::llama_tokenize(ctx, params.prompt, true, true);
+       std::vector<llama_token> prompt_tokens = ::common_tokenize(ctx, params.prompt, true, true);
        num_prompt_tokens = prompt_tokens.size();
 
        // LOG tokens
@@ -302,7 +302,7 @@ struct llama_rn_context
        // push the prompt into the sampling context (do not apply grammar)
        for (auto & token : prompt_tokens)
        {
-           gpt_sampler_accept(ctx_sampling, token, false);
+           common_sampler_accept(ctx_sampling, token, false);
        }
        // compare the evaluated prompt with the new prompt
        n_past = params.embedding? 0 : common_part(embd, prompt_tokens);
@@ -375,8 +375,8 @@ struct llama_rn_context
        {
            n_eval = params.n_batch;
        }
-       if (llama_decode(ctx, llama_batch_get_one(&embd[n_past], n_eval, n_past, 0)))
-       {
+       if (llama_decode(ctx, llama_batch_get_one(&embd[n_past], n_eval)))
+       {
            LOG_ERROR("failed to eval, n_eval: %d, n_past: %d, n_threads: %d, embd: %s",
                n_eval,
                n_past,
@@ -408,18 +408,19 @@ struct llama_rn_context
        std::vector<llama_token_data> candidates;
        candidates.reserve(llama_n_vocab(model));
 
-       result.tok = gpt_sampler_sample(ctx_sampling, ctx, -1);
+       result.tok = common_sampler_sample(ctx_sampling, ctx, -1);
 
-       llama_token_data_array cur_p = *gpt_sampler_get_candidates(ctx_sampling);
+       llama_token_data_array cur_p = *common_sampler_get_candidates(ctx_sampling);
 
        const int32_t n_probs = params.sparams.n_probs;
 
-
-       if (params.sparams.temp <= 0 && n_probs > 0)
+       // deprecated
+       /*if (params.sparams.temp <= 0 && n_probs > 0)
        {
            // For llama_sample_token_greedy we need to sort candidates
            llama_sampler_init_softmax();
-       }
+
+       }*/
 
 
        for (size_t i = 0; i < std::min(cur_p.size, (size_t)n_probs); ++i)
@@ -427,7 +428,7 @@ struct llama_rn_context
            result.probs.push_back({cur_p.data[i].id, cur_p.data[i].p});
        }
 
-       gpt_sampler_accept(ctx_sampling, result.tok, true);
+       common_sampler_accept(ctx_sampling, result.tok, true);
        if (tg) {
            num_tokens_predicted++;
        }
@@ -487,7 +488,7 @@ struct llama_rn_context
    {
        const completion_token_output token_with_probs = nextToken();
 
-       const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(ctx, token_with_probs.tok);
+       const std::string token_text = token_with_probs.tok == -1 ? "" : common_token_to_piece(ctx, token_with_probs.tok);
        generated_text += token_text;
 
        if (params.sparams.n_probs > 0)
@@ -528,7 +529,7 @@ struct llama_rn_context
    }
 
    LOG_VERBOSE("next token, token: %s, token_text: %s, has_next_token: %d, n_remain: %d, num_tokens_predicted: %d, stopped_eos: %d, stopped_word: %d, stopped_limit: %d, stopping_word: %s",
-       llama_token_to_piece(ctx, token_with_probs.tok),
+       common_token_to_piece(ctx, token_with_probs.tok),
        tokens_to_output_formatted_string(ctx, token_with_probs.tok).c_str(),
        has_next_token,
        n_remain,
@@ -562,7 +563,7 @@ struct llama_rn_context
            return std::vector<float>(n_embd, 0.0f);
        }
        std::vector<float> embedding(data, data + n_embd), out(data, data + n_embd);
-       llama_embd_normalize(embedding.data(), out.data(), n_embd, params.embd_normalize);
+       common_embd_normalize(embedding.data(), out.data(), n_embd, params.embd_normalize);
        return out;
    }
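Taken together, the rn-llama.hpp changes are a mechanical move to the renamed common_* helpers. A condensed sketch of the updated load/sample flow as it appears above (error handling and the surrounding struct omitted; values are illustrative):

    // sketch: the 1.2.4 spellings of the common helpers used by llama_rn_context
    common_params params;                                         // was gpt_params
    common_init_result result = common_init_from_params(params);  // was llama_init_from_gpt_params
    llama_model *   model = result.model;
    llama_context * ctx   = result.context;

    common_sampler * smpl = common_sampler_init(model, params.sparams); // was gpt_sampler_init
    const llama_token tok = common_sampler_sample(smpl, ctx, -1);       // was gpt_sampler_sample
    common_sampler_accept(smpl, tok, true);                             // was gpt_sampler_accept
    const std::string piece = common_token_to_piece(ctx, tok);          // was llama_token_to_piece
    common_sampler_free(smpl);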