cui-llama.rn 1.2.3 → 1.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/jni.cpp +7 -7
- package/cpp/common.cpp +81 -63
- package/cpp/common.h +79 -62
- package/cpp/ggml-alloc.c +17 -19
- package/cpp/ggml-backend.cpp +59 -24
- package/cpp/ggml-impl.h +8 -0
- package/cpp/ggml.c +65 -23
- package/cpp/ggml.h +1 -0
- package/cpp/json-schema-to-grammar.cpp +1 -1
- package/cpp/llama-sampling.cpp +366 -24
- package/cpp/llama-sampling.h +3 -2
- package/cpp/llama-vocab.cpp +33 -9
- package/cpp/llama-vocab.h +30 -11
- package/cpp/llama.cpp +471 -387
- package/cpp/llama.h +52 -21
- package/cpp/log.cpp +50 -50
- package/cpp/log.h +18 -18
- package/cpp/rn-llama.hpp +23 -22
- package/cpp/sampling.cpp +110 -119
- package/cpp/sampling.h +20 -20
- package/package.json +1 -1
package/cpp/llama.h
CHANGED
@@ -218,6 +218,7 @@ extern "C" {
 
     typedef struct llama_token_data_array {
         // TODO: consider SoA
+        // NOTE: this pointer can be modified by the samplers
         llama_token_data * data;
        size_t size;
        int64_t selected; // this is the index in the data array (i.e. not the token id)
@@ -233,8 +234,11 @@ extern "C" {
    // - token  : the token ids of the input (used when embd is NULL)
    // - embd   : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
    // - pos    : the positions of the respective token in the sequence
+    //            (if set to NULL, the token position will be tracked automatically by llama_decode)
    // - seq_id : the sequence to which the respective token belongs
+    //            (if set to NULL, the sequence ID will be assumed to be 0)
    // - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output
+    //            (if set to NULL, only the logits for last token will be returned)
    //
    typedef struct llama_batch {
        int32_t n_tokens;
@@ -245,15 +249,6 @@ extern "C" {
        int32_t      *  n_seq_id;
        llama_seq_id ** seq_id;
        int8_t       *  logits; // TODO: rename this to "output"
-
-        // NOTE: helpers for smooth API transition - can be deprecated in the future
-        // for future-proof code, use the above fields instead and ignore everything below
-        //
-        // pos[i] = all_pos_0 + i*all_pos_1
-        //
-        llama_pos    all_pos_0;  // used if pos == NULL
-        llama_pos    all_pos_1;  // used if pos == NULL
-        llama_seq_id all_seq_id; // used if seq_id == NULL
    } llama_batch;
 
    enum llama_model_kv_override_type {
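Note: with the transition helpers removed from llama_batch, callers have to fill pos, seq_id and logits themselves. A minimal caller-side sketch, not code from this package; prompt_tokens, n_prompt, n_past and ctx are assumed to exist in the caller:

    // Hypothetical sketch: fill the batch fields explicitly instead of relying
    // on the removed all_pos_0/all_pos_1/all_seq_id helpers.
    llama_batch batch = llama_batch_init(/*n_tokens_alloc*/ n_prompt, /*embd*/ 0, /*n_seq_max*/ 1);
    for (int32_t i = 0; i < n_prompt; ++i) {
        batch.token   [i]    = prompt_tokens[i];
        batch.pos     [i]    = n_past + i;            // positions are always explicit now
        batch.n_seq_id[i]    = 1;
        batch.seq_id  [i][0] = 0;                     // single sequence with id 0
        batch.logits  [i]    = (i == n_prompt - 1);   // request logits only for the last token
    }
    batch.n_tokens = n_prompt;
    // ... llama_decode(ctx, batch); ...
    llama_batch_free(batch);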
@@ -434,6 +429,7 @@ extern "C" {
    LLAMA_API bool llama_supports_mmap       (void);
    LLAMA_API bool llama_supports_mlock      (void);
    LLAMA_API bool llama_supports_gpu_offload(void);
+    LLAMA_API bool llama_supports_rpc        (void);
 
    LLAMA_API uint32_t llama_n_ctx      (const struct llama_context * ctx);
    LLAMA_API uint32_t llama_n_batch    (const struct llama_context * ctx);
@@ -776,15 +772,15 @@ extern "C" {
    // Decoding
    //
 
-    // Return batch for single sequence of tokens starting at pos_0
+    // Return batch for single sequence of tokens
+    // The sequence ID will be fixed to 0
+    // The position of the tokens will be tracked automatically by llama_decode
    //
    // NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it
    //
    LLAMA_API struct llama_batch llama_batch_get_one(
                  llama_token * tokens,
-                      int32_t   n_tokens,
-                    llama_pos   pos_0,
-                 llama_seq_id   seq_id);
+                      int32_t   n_tokens);
 
    // Allocates a batch of tokens on the heap that can hold a maximum of n_tokens
    // Each token can be assigned up to n_seq_max sequence ids
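Call sites that passed pos_0 and seq_id now only hand over the token pointer and count; positions and the sequence id (fixed to 0) are tracked by llama_decode. A hedged migration sketch, assuming tokens is a std::vector<llama_token> and ctx is a loaded llama_context:

    // Hypothetical call-site migration for the new two-argument signature.
    // before: llama_batch_get_one(tokens.data(), (int32_t) tokens.size(), n_past, 0)
    llama_batch batch = llama_batch_get_one(tokens.data(), (int32_t) tokens.size());
    if (llama_decode(ctx, batch) != 0) {
        // handle the error
    }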
@@ -897,6 +893,7 @@ extern "C" {
    // Special tokens
    LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
    LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
+    LLAMA_API llama_token llama_token_eot(const struct llama_model * model); // end-of-turn
    LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification
    LLAMA_API llama_token llama_token_sep(const struct llama_model * model); // sentence separator
    LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
@@ -905,11 +902,17 @@ extern "C" {
    LLAMA_API bool llama_add_bos_token(const struct llama_model * model);
    LLAMA_API bool llama_add_eos_token(const struct llama_model * model);
 
-    // Codellama infill tokens
-    LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
-    LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle
-    LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix
-    LLAMA_API llama_token llama_token_eot   (const struct llama_model * model); // End of infill middle
+    // infill tokens
+    DEPRECATED(LLAMA_API llama_token llama_token_prefix(const struct llama_model * model), "use llama_token_fim_pre instead");
+    DEPRECATED(LLAMA_API llama_token llama_token_middle(const struct llama_model * model), "use llama_token_fim_mid instead");
+    DEPRECATED(LLAMA_API llama_token llama_token_suffix(const struct llama_model * model), "use llama_token_fim_suf instead");
+
+    LLAMA_API llama_token llama_token_fim_pre(const struct llama_model * model);
+    LLAMA_API llama_token llama_token_fim_suf(const struct llama_model * model);
+    LLAMA_API llama_token llama_token_fim_mid(const struct llama_model * model);
+    LLAMA_API llama_token llama_token_fim_pad(const struct llama_model * model);
+    LLAMA_API llama_token llama_token_fim_rep(const struct llama_model * model);
+    LLAMA_API llama_token llama_token_fim_sep(const struct llama_model * model);
 
    //
    // Tokenization
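The deprecated prefix/middle/suffix getters are replaced by dedicated FIM token accessors. A rough sketch of assembling a fill-in-the-middle prompt with them; the exact token order is model-specific and the getters may return a negative id when the model does not define a given special token, so treat this as an illustration only:

    // Hypothetical sketch: build a FIM prompt using the new getters
    // (prefix, suffix, middle layout; check the returned ids before use).
    std::vector<llama_token> inp;
    inp.push_back(llama_token_fim_pre(model));
    // ... tokens for the text before the cursor ...
    inp.push_back(llama_token_fim_suf(model));
    // ... tokens for the text after the cursor ...
    inp.push_back(llama_token_fim_mid(model));  // generation continues from here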
@@ -1068,12 +1071,13 @@ extern "C" {
 
    // available samplers:
 
-    LLAMA_API struct llama_sampler * llama_sampler_init_greedy   (void);
-    LLAMA_API struct llama_sampler * llama_sampler_init_dist     (uint32_t seed);
+    LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void);
+    LLAMA_API struct llama_sampler * llama_sampler_init_dist  (uint32_t seed);
 
    /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
    /// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first.
-    LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void);
+    DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void),
+        "will be removed in the future (see https://github.com/ggerganov/llama.cpp/pull/9896#discussion_r1800920915)");
 
    /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
    LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k);
@@ -1092,11 +1096,16 @@ extern "C" {
 
    /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
    LLAMA_API struct llama_sampler * llama_sampler_init_typical (float p, size_t min_keep);
+
+    /// #details Updates the logits l_i` = l_i/t. When t <= 0.0f, the maximum logit is kept at it's original value, the rest are set to -inf
    LLAMA_API struct llama_sampler * llama_sampler_init_temp (float t);
 
    /// @details Dynamic temperature implementation (a.k.a. entropy) described in the paper https://arxiv.org/abs/2309.02772.
    LLAMA_API struct llama_sampler * llama_sampler_init_temp_ext (float t, float delta, float exponent);
 
+    /// @details XTC sampler as described in https://github.com/oobabooga/text-generation-webui/pull/6335
+    LLAMA_API struct llama_sampler * llama_sampler_init_xtc (float p, float t, size_t min_keep, uint32_t seed);
+
    /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
    /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
    /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
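A hedged sketch of plugging the new XTC sampler into a sampler chain; the chain API itself is unchanged, the parameter values are illustrative only, and ctx is assumed to be an existing llama_context:

    // Hypothetical sketch: sampler chain that includes the new XTC sampler.
    llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
    llama_sampler_chain_add(chain, llama_sampler_init_top_k(40));
    llama_sampler_chain_add(chain, llama_sampler_init_xtc(/*p*/ 0.50f, /*t*/ 0.10f, /*min_keep*/ 1, /*seed*/ 1234));
    llama_sampler_chain_add(chain, llama_sampler_init_temp(0.8f));
    llama_sampler_chain_add(chain, llama_sampler_init_dist(1234));
    const llama_token id = llama_sampler_sample(chain, ctx, -1);
    llama_sampler_free(chain);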
@@ -1141,6 +1150,28 @@ extern "C" {
                             int32_t   n_logit_bias,
                const llama_logit_bias * logit_bias);
 
+    // this sampler is meant to be used for fill-in-the-middle infilling
+    // it's supposed to be used after top_k + top_p sampling
+    //
+    // 1. if the sum of the EOG probs times the number of candidates is higher than the sum of the other probs -> pick EOG
+    // 2. combine probs of tokens that have the same prefix
+    //
+    // example:
+    //
+    // - before:
+    //   "hel":   0.5
+    //   "hell":  0.2
+    //   "hello": 0.1
+    //   "dummy": 0.1
+    //
+    // - after:
+    //   "hel":   0.8
+    //   "dummy": 0.1
+    //
+    // 3. discard non-EOG tokens with low prob
+    // 4. if no tokens are left -> pick EOT
+    //
+    LLAMA_API struct llama_sampler * llama_sampler_init_infill(const struct llama_model * model);
 
    // Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise
    LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl);
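Per the comment above, the infill sampler is intended to run after top_k + top_p. A minimal sketch under that assumption, with model and ctx assumed to be loaded elsewhere:

    // Hypothetical sketch: top_k + top_p, then the infill sampler, then sampling.
    llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
    llama_sampler_chain_add(chain, llama_sampler_init_top_k(40));
    llama_sampler_chain_add(chain, llama_sampler_init_top_p(0.90f, 1));
    llama_sampler_chain_add(chain, llama_sampler_init_infill(model));
    llama_sampler_chain_add(chain, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
    const llama_token id = llama_sampler_sample(chain, ctx, -1);
    llama_sampler_free(chain);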
package/cpp/log.cpp
CHANGED
@@ -8,10 +8,10 @@
 #include <thread>
 #include <vector>
 
-int gpt_log_verbosity_thold = LOG_DEFAULT_LLAMA;
+int common_log_verbosity_thold = LOG_DEFAULT_LLAMA;
 
-void gpt_log_set_verbosity_thold(int verbosity) {
-    gpt_log_verbosity_thold = verbosity;
+void common_log_set_verbosity_thold(int verbosity) {
+    common_log_verbosity_thold = verbosity;
 }
 
 #define LOG_COL_DEFAULT "\033[0m"
@@ -29,16 +29,16 @@ static int64_t t_us() {
 }
 
 // colors
-enum gpt_log_col : int {
-    GPT_LOG_COL_DEFAULT = 0,
-    GPT_LOG_COL_BOLD,
-    GPT_LOG_COL_RED,
-    GPT_LOG_COL_GREEN,
-    GPT_LOG_COL_YELLOW,
-    GPT_LOG_COL_BLUE,
-    GPT_LOG_COL_MAGENTA,
-    GPT_LOG_COL_CYAN,
-    GPT_LOG_COL_WHITE,
+enum common_log_col : int {
+    COMMON_LOG_COL_DEFAULT = 0,
+    COMMON_LOG_COL_BOLD,
+    COMMON_LOG_COL_RED,
+    COMMON_LOG_COL_GREEN,
+    COMMON_LOG_COL_YELLOW,
+    COMMON_LOG_COL_BLUE,
+    COMMON_LOG_COL_MAGENTA,
+    COMMON_LOG_COL_CYAN,
+    COMMON_LOG_COL_WHITE,
 };
 
 // disable colors by default
@@ -54,7 +54,7 @@ static std::vector<const char *> g_col = {
    "",
 };
 
-struct gpt_log_entry {
+struct common_log_entry {
    enum lm_ggml_log_level level;
 
    bool prefix;
@@ -71,7 +71,7 @@ struct gpt_log_entry {
        if (!fcur) {
            // stderr displays DBG messages only when their verbosity level is not higher than the threshold
            // these messages will still be logged to a file
-            if (level == LM_GGML_LOG_LEVEL_DEBUG && gpt_log_verbosity_thold < LOG_DEFAULT_DEBUG) {
+            if (level == LM_GGML_LOG_LEVEL_DEBUG && common_log_verbosity_thold < LOG_DEFAULT_DEBUG) {
                return;
            }
 
@@ -86,19 +86,19 @@ struct gpt_log_entry {
        if (timestamp) {
            // [M.s.ms.us]
            fprintf(fcur, "%s%d.%02d.%03d.%03d%s ",
-                    g_col[GPT_LOG_COL_BLUE],
+                    g_col[COMMON_LOG_COL_BLUE],
                    (int) (timestamp / 1000000 / 60),
                    (int) (timestamp / 1000000 % 60),
                    (int) (timestamp / 1000 % 1000),
                    (int) (timestamp % 1000),
-                    g_col[GPT_LOG_COL_DEFAULT]);
+                    g_col[COMMON_LOG_COL_DEFAULT]);
        }
 
        switch (level) {
-            case LM_GGML_LOG_LEVEL_INFO:  fprintf(fcur, "%sI %s", g_col[GPT_LOG_COL_GREEN],   g_col[GPT_LOG_COL_DEFAULT]); break;
-            case LM_GGML_LOG_LEVEL_WARN:  fprintf(fcur, "%sW %s", g_col[GPT_LOG_COL_MAGENTA], ""                        ); break;
-            case LM_GGML_LOG_LEVEL_ERROR: fprintf(fcur, "%sE %s", g_col[GPT_LOG_COL_RED],     ""                        ); break;
-            case LM_GGML_LOG_LEVEL_DEBUG: fprintf(fcur, "%sD %s", g_col[GPT_LOG_COL_YELLOW],  ""                        ); break;
+            case LM_GGML_LOG_LEVEL_INFO:  fprintf(fcur, "%sI %s", g_col[COMMON_LOG_COL_GREEN],   g_col[COMMON_LOG_COL_DEFAULT]); break;
+            case LM_GGML_LOG_LEVEL_WARN:  fprintf(fcur, "%sW %s", g_col[COMMON_LOG_COL_MAGENTA], ""                           ); break;
+            case LM_GGML_LOG_LEVEL_ERROR: fprintf(fcur, "%sE %s", g_col[COMMON_LOG_COL_RED],     ""                           ); break;
+            case LM_GGML_LOG_LEVEL_DEBUG: fprintf(fcur, "%sD %s", g_col[COMMON_LOG_COL_YELLOW],  ""                           ); break;
            default:
                break;
        }
@@ -107,18 +107,18 @@ struct gpt_log_entry {
        fprintf(fcur, "%s", msg.data());
 
        if (level == LM_GGML_LOG_LEVEL_WARN || level == LM_GGML_LOG_LEVEL_ERROR || level == LM_GGML_LOG_LEVEL_DEBUG) {
-            fprintf(fcur, "%s", g_col[GPT_LOG_COL_DEFAULT]);
+            fprintf(fcur, "%s", g_col[COMMON_LOG_COL_DEFAULT]);
        }
 
        fflush(fcur);
    }
 };
 
-struct gpt_log {
+struct common_log {
    // default capacity - will be expanded if needed
-    gpt_log() : gpt_log(256) {}
+    common_log() : common_log(256) {}
 
-    gpt_log(size_t capacity) {
+    common_log(size_t capacity) {
        file = nullptr;
        prefix = false;
        timestamps = false;
@@ -137,7 +137,7 @@ struct gpt_log {
        resume();
    }
 
-    ~gpt_log() {
+    ~common_log() {
        pause();
        if (file) {
            fclose(file);
@@ -158,12 +158,12 @@ private:
    int64_t t_start;
 
    // ring buffer of entries
-    std::vector<gpt_log_entry> entries;
+    std::vector<common_log_entry> entries;
    size_t head;
    size_t tail;
 
    // worker thread copies into this
-    gpt_log_entry cur;
+    common_log_entry cur;
 
 public:
    void add(enum lm_ggml_log_level level, const char * fmt, va_list args) {
@@ -219,7 +219,7 @@ public:
            tail = (tail + 1) % entries.size();
            if (tail == head) {
                // expand the buffer
-                std::vector<gpt_log_entry> new_entries(2*entries.size());
+                std::vector<common_log_entry> new_entries(2*entries.size());
 
                size_t new_tail = 0;
 
@@ -320,15 +320,15 @@ public:
        pause();
 
        if (colors) {
-            g_col[GPT_LOG_COL_DEFAULT] = LOG_COL_DEFAULT;
-            g_col[GPT_LOG_COL_BOLD]    = LOG_COL_BOLD;
-            g_col[GPT_LOG_COL_RED]     = LOG_COL_RED;
-            g_col[GPT_LOG_COL_GREEN]   = LOG_COL_GREEN;
-            g_col[GPT_LOG_COL_YELLOW]  = LOG_COL_YELLOW;
-            g_col[GPT_LOG_COL_BLUE]    = LOG_COL_BLUE;
-            g_col[GPT_LOG_COL_MAGENTA] = LOG_COL_MAGENTA;
-            g_col[GPT_LOG_COL_CYAN]    = LOG_COL_CYAN;
-            g_col[GPT_LOG_COL_WHITE]   = LOG_COL_WHITE;
+            g_col[COMMON_LOG_COL_DEFAULT] = LOG_COL_DEFAULT;
+            g_col[COMMON_LOG_COL_BOLD]    = LOG_COL_BOLD;
+            g_col[COMMON_LOG_COL_RED]     = LOG_COL_RED;
+            g_col[COMMON_LOG_COL_GREEN]   = LOG_COL_GREEN;
+            g_col[COMMON_LOG_COL_YELLOW]  = LOG_COL_YELLOW;
+            g_col[COMMON_LOG_COL_BLUE]    = LOG_COL_BLUE;
+            g_col[COMMON_LOG_COL_MAGENTA] = LOG_COL_MAGENTA;
+            g_col[COMMON_LOG_COL_CYAN]    = LOG_COL_CYAN;
+            g_col[COMMON_LOG_COL_WHITE]   = LOG_COL_WHITE;
        } else {
            for (size_t i = 0; i < g_col.size(); i++) {
                g_col[i] = "";
@@ -355,47 +355,47 @@ public:
 // public API
 //
 
-struct gpt_log * gpt_log_init() {
-    return new gpt_log;
+struct common_log * common_log_init() {
+    return new common_log;
 }
 
-struct gpt_log * gpt_log_main() {
-    static struct gpt_log log;
+struct common_log * common_log_main() {
+    static struct common_log log;
 
    return &log;
 }
 
-void gpt_log_pause(struct gpt_log * log) {
+void common_log_pause(struct common_log * log) {
    log->pause();
 }
 
-void gpt_log_resume(struct gpt_log * log) {
+void common_log_resume(struct common_log * log) {
    log->resume();
 }
 
-void gpt_log_free(struct gpt_log * log) {
+void common_log_free(struct common_log * log) {
    delete log;
 }
 
-void gpt_log_add(struct gpt_log * log, enum lm_ggml_log_level level, const char * fmt, ...) {
+void common_log_add(struct common_log * log, enum lm_ggml_log_level level, const char * fmt, ...) {
    va_list args;
    va_start(args, fmt);
    log->add(level, fmt, args);
    va_end(args);
 }
 
-void gpt_log_set_file(struct gpt_log * log, const char * file) {
+void common_log_set_file(struct common_log * log, const char * file) {
    log->set_file(file);
 }
 
-void gpt_log_set_colors(struct gpt_log * log, bool colors) {
+void common_log_set_colors(struct common_log * log, bool colors) {
    log->set_colors(colors);
 }
 
-void gpt_log_set_prefix(struct gpt_log * log, bool prefix) {
+void common_log_set_prefix(struct common_log * log, bool prefix) {
    log->set_prefix(prefix);
 }
 
-void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps) {
+void common_log_set_timestamps(struct common_log * log, bool timestamps) {
    log->set_timestamps(timestamps);
 }
CHANGED
@@ -14,23 +14,23 @@
|
|
14
14
|
#define LOG_DEFAULT_LLAMA 0
|
15
15
|
|
16
16
|
// needed by the LOG_TMPL macro to avoid computing log arguments if the verbosity lower
|
17
|
-
// set via
|
18
|
-
extern int
|
17
|
+
// set via common_log_set_verbosity()
|
18
|
+
extern int common_log_verbosity_thold;
|
19
19
|
|
20
|
-
void
|
20
|
+
void common_log_set_verbosity_thold(int verbosity); // not thread-safe
|
21
21
|
|
22
|
-
// the
|
22
|
+
// the common_log uses an internal worker thread to print/write log messages
|
23
23
|
// when the worker thread is paused, incoming log messages are discarded
|
24
|
-
struct
|
24
|
+
struct common_log;
|
25
25
|
|
26
|
-
struct
|
27
|
-
struct
|
28
|
-
void
|
29
|
-
void
|
30
|
-
void
|
26
|
+
struct common_log * common_log_init();
|
27
|
+
struct common_log * common_log_main(); // singleton, automatically destroys itself on exit
|
28
|
+
void common_log_pause (struct common_log * log); // pause the worker thread, not thread-safe
|
29
|
+
void common_log_resume(struct common_log * log); // resume the worker thread, not thread-safe
|
30
|
+
void common_log_free (struct common_log * log);
|
31
31
|
|
32
32
|
LOG_ATTRIBUTE_FORMAT(3, 4)
|
33
|
-
void
|
33
|
+
void common_log_add(struct common_log * log, enum lm_ggml_log_level level, const char * fmt, ...);
|
34
34
|
|
35
35
|
// defaults: file = NULL, colors = false, prefix = false, timestamps = false
|
36
36
|
//
|
@@ -54,10 +54,10 @@ void gpt_log_add(struct gpt_log * log, enum lm_ggml_log_level level, const char
|
|
54
54
|
// D - debug (stderr, V = LOG_DEFAULT_DEBUG)
|
55
55
|
//
|
56
56
|
|
57
|
-
void
|
58
|
-
void
|
59
|
-
void
|
60
|
-
void
|
57
|
+
void common_log_set_file (struct common_log * log, const char * file); // not thread-safe
|
58
|
+
void common_log_set_colors (struct common_log * log, bool colors); // not thread-safe
|
59
|
+
void common_log_set_prefix (struct common_log * log, bool prefix); // whether to output prefix to each log
|
60
|
+
void common_log_set_timestamps(struct common_log * log, bool timestamps); // whether to output timestamps in the prefix
|
61
61
|
|
62
62
|
// helper macros for logging
|
63
63
|
// use these to avoid computing log arguments if the verbosity of the log is higher than the threshold
|
@@ -66,7 +66,7 @@ void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps); // w
|
|
66
66
|
//
|
67
67
|
// LOG_DBG("this is a debug message: %d\n", expensive_function());
|
68
68
|
//
|
69
|
-
// this will avoid calling expensive_function() if LOG_DEFAULT_DEBUG >
|
69
|
+
// this will avoid calling expensive_function() if LOG_DEFAULT_DEBUG > common_log_verbosity_thold
|
70
70
|
//
|
71
71
|
|
72
72
|
|
@@ -98,8 +98,8 @@ void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps); // w
|
|
98
98
|
|
99
99
|
#define LOG_TMPL(level, verbosity, ...) \
|
100
100
|
do { \
|
101
|
-
if ((verbosity) <=
|
102
|
-
|
101
|
+
if ((verbosity) <= common_log_verbosity_thold) { \
|
102
|
+
common_log_add(common_log_main(), (level), __VA_ARGS__); \
|
103
103
|
} \
|
104
104
|
} while (0)
|
105
105
|
|
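LOG_TMPL now gates on common_log_verbosity_thold. A hedged sketch of the effect, assuming LOG_DEFAULT_DEBUG is defined earlier in this header as the debug verbosity level and n_past exists at the call site:

    // Hypothetical sketch: raise the threshold so debug-level messages get through,
    // then log via LOG_TMPL (the LOG_DBG/LOG_INF wrappers build on the same macro).
    common_log_set_verbosity_thold(LOG_DEFAULT_DEBUG);
    LOG_TMPL(LM_GGML_LOG_LEVEL_DEBUG, LOG_DEFAULT_DEBUG, "n_past = %d\n", n_past);
    common_log_set_verbosity_thold(LOG_DEFAULT_LLAMA); // back to the default threshold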
package/cpp/rn-llama.hpp
CHANGED
@@ -117,7 +117,7 @@ static size_t find_partial_stop_string(const std::string &stop,
 // format incomplete utf-8 multibyte character for output
 static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token)
 {
-    std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token);
+    std::string out = token == -1 ? "" : common_token_to_piece(ctx, token);
    // if the size is 1 and first bit is 1, meaning it's a partial character
    // (size > 1 meaning it's already a known token)
    if (out.size() == 1 && (out[0] & 0x80) == 0x80)
@@ -136,7 +136,7 @@ static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
    std::string ret;
    for (; begin != end; ++begin)
    {
-        ret += llama_token_to_piece(ctx, *begin);
+        ret += common_token_to_piece(ctx, *begin);
    }
    return ret;
 }
@@ -157,11 +157,11 @@ struct llama_rn_context
 
    std::vector<llama_token> embd;
 
-    gpt_params params;
+    common_params params;
 
    llama_model *model = nullptr;
    llama_context *ctx = nullptr;
-    gpt_sampler *ctx_sampling = nullptr;
+    common_sampler *ctx_sampling = nullptr;
 
    int n_ctx;
 
@@ -186,7 +186,7 @@ struct llama_rn_context
    }
    if (ctx_sampling != nullptr)
    {
-        gpt_sampler_free(ctx_sampling);
+        common_sampler_free(ctx_sampling);
    }
 }
 
@@ -213,16 +213,16 @@ struct llama_rn_context
 
    bool initSampling() {
        if (ctx_sampling != nullptr) {
-            gpt_sampler_free(ctx_sampling);
+            common_sampler_free(ctx_sampling);
        }
-        ctx_sampling = gpt_sampler_init(model, params.sparams);
+        ctx_sampling = common_sampler_init(model, params.sparams);
        return ctx_sampling != nullptr;
    }
 
-    bool loadModel(gpt_params &params_)
+    bool loadModel(common_params &params_)
    {
        params = params_;
-        llama_init_result result = llama_init_from_gpt_params(params);
+        common_init_result result = common_init_from_params(params);
        model = result.model;
        ctx = result.context;
        if (model == nullptr)
@@ -268,7 +268,7 @@ struct llama_rn_context
 
    void loadPrompt()
    {
-        std::vector<llama_token> prompt_tokens = ::llama_tokenize(ctx, params.prompt, true, true);
+        std::vector<llama_token> prompt_tokens = ::common_tokenize(ctx, params.prompt, true, true);
        num_prompt_tokens = prompt_tokens.size();
 
        // LOG tokens
@@ -302,7 +302,7 @@ struct llama_rn_context
        // push the prompt into the sampling context (do not apply grammar)
        for (auto & token : prompt_tokens)
        {
-            gpt_sampler_accept(ctx_sampling, token, false);
+            common_sampler_accept(ctx_sampling, token, false);
        }
        // compare the evaluated prompt with the new prompt
        n_past = params.embedding? 0 : common_part(embd, prompt_tokens);
@@ -375,8 +375,8 @@ struct llama_rn_context
        {
            n_eval = params.n_batch;
        }
-        if (llama_decode(ctx, llama_batch_get_one(&embd[n_past], n_eval, n_past, 0)))
-        {
+        if (llama_decode(ctx, llama_batch_get_one(&embd[n_past], n_eval)))
+        {
            LOG_ERROR("failed to eval, n_eval: %d, n_past: %d, n_threads: %d, embd: %s",
                n_eval,
                n_past,
@@ -408,18 +408,19 @@ struct llama_rn_context
        std::vector<llama_token_data> candidates;
        candidates.reserve(llama_n_vocab(model));
 
-        result.tok = gpt_sampler_sample(ctx_sampling, ctx, -1);
+        result.tok = common_sampler_sample(ctx_sampling, ctx, -1);
 
-        llama_token_data_array cur_p = *gpt_sampler_get_candidates(ctx_sampling);
+        llama_token_data_array cur_p = *common_sampler_get_candidates(ctx_sampling);
 
        const int32_t n_probs = params.sparams.n_probs;
 
-
-        if (params.sparams.temp <= 0 && n_probs > 0)
+        // deprecated
+        /*if (params.sparams.temp <= 0 && n_probs > 0)
        {
            // For llama_sample_token_greedy we need to sort candidates
            llama_sampler_init_softmax();
-        }
+
+        }*/
 
 
        for (size_t i = 0; i < std::min(cur_p.size, (size_t)n_probs); ++i)
@@ -427,7 +428,7 @@ struct llama_rn_context
            result.probs.push_back({cur_p.data[i].id, cur_p.data[i].p});
        }
 
-        gpt_sampler_accept(ctx_sampling, result.tok, true);
+        common_sampler_accept(ctx_sampling, result.tok, true);
        if (tg) {
            num_tokens_predicted++;
        }
@@ -487,7 +488,7 @@ struct llama_rn_context
    {
        const completion_token_output token_with_probs = nextToken();
 
-        const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(ctx, token_with_probs.tok);
+        const std::string token_text = token_with_probs.tok == -1 ? "" : common_token_to_piece(ctx, token_with_probs.tok);
        generated_text += token_text;
 
        if (params.sparams.n_probs > 0)
@@ -528,7 +529,7 @@ struct llama_rn_context
        }
 
        LOG_VERBOSE("next token, token: %s, token_text: %s, has_next_token: %d, n_remain: %d, num_tokens_predicted: %d, stopped_eos: %d, stopped_word: %d, stopped_limit: %d, stopping_word: %s",
-            llama_token_to_piece(ctx, token_with_probs.tok),
+            common_token_to_piece(ctx, token_with_probs.tok),
            tokens_to_output_formatted_string(ctx, token_with_probs.tok).c_str(),
            has_next_token,
            n_remain,
@@ -562,7 +563,7 @@ struct llama_rn_context
            return std::vector<float>(n_embd, 0.0f);
        }
        std::vector<float> embedding(data, data + n_embd), out(data, data + n_embd);
-        llama_embd_normalize(embedding.data(), out.data(), n_embd, params.embd_normalize);
+        common_embd_normalize(embedding.data(), out.data(), n_embd, params.embd_normalize);
        return out;
    }
 
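Taken together, the React Native wrapper now routes through the renamed common_* helpers (common_params, common_sampler_*, common_tokenize, common_token_to_piece, common_embd_normalize) and the new two-argument llama_batch_get_one. A rough usage sketch limited to the members visible in the hunks above; the model path and other fields on params still need to be set by the caller:

    // Hypothetical sketch of driving the updated wrapper.
    common_params params;                 // was gpt_params in 1.2.3
    params.prompt = "Hello";              // model path etc. must also be set by the caller
    llama_rn_context rn;
    if (rn.loadModel(params) && rn.initSampling()) {
        rn.loadPrompt();
        // generation then runs through common_sampler_sample()/_accept()
        // exactly as in the nextToken() hunk above
    }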