cui-llama.rn 1.2.3 → 1.2.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +0 -2
- package/android/src/main/CMakeLists.txt +1 -0
- package/android/src/main/java/com/rnllama/LlamaContext.java +0 -3
- package/android/src/main/jni.cpp +9 -11
- package/cpp/common.cpp +85 -75
- package/cpp/common.h +127 -91
- package/cpp/ggml-aarch64.c +269 -0
- package/cpp/ggml-alloc.c +17 -19
- package/cpp/ggml-backend-impl.h +4 -15
- package/cpp/ggml-backend.cpp +1697 -1626
- package/cpp/ggml-backend.h +13 -25
- package/cpp/ggml-cpp.h +38 -0
- package/cpp/ggml-cpu.c +13720 -0
- package/cpp/ggml-cpu.h +150 -0
- package/cpp/ggml-impl.h +95 -0
- package/cpp/ggml-metal.m +185 -71
- package/cpp/ggml-quants.c +38 -51
- package/cpp/ggml.c +4468 -19500
- package/cpp/ggml.h +26 -146
- package/cpp/json-schema-to-grammar.cpp +1 -1
- package/cpp/llama-sampling.cpp +742 -249
- package/cpp/llama-sampling.h +21 -2
- package/cpp/llama-vocab.cpp +49 -9
- package/cpp/llama-vocab.h +35 -11
- package/cpp/llama.cpp +2468 -2307
- package/cpp/llama.h +65 -32
- package/cpp/log.cpp +50 -50
- package/cpp/log.h +18 -18
- package/cpp/rn-llama.hpp +23 -22
- package/cpp/sampling.cpp +117 -118
- package/cpp/sampling.h +20 -20
- package/cpp/sgemm.cpp +57 -0
- package/lib/commonjs/NativeRNLlama.js.map +1 -1
- package/lib/module/NativeRNLlama.js.map +1 -1
- package/lib/typescript/NativeRNLlama.d.ts +0 -1
- package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
- package/package.json +1 -1
- package/src/NativeRNLlama.ts +0 -1
package/cpp/llama.h
CHANGED
@@ -2,6 +2,7 @@
 #define LLAMA_H

 #include "ggml.h"
+#include "ggml-cpu.h"
 #include "ggml-backend.h"

 #include <stddef.h>
@@ -206,7 +207,7 @@ extern "C" {
     enum llama_split_mode {
         LLAMA_SPLIT_MODE_NONE  = 0, // single GPU
         LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
-        LLAMA_SPLIT_MODE_ROW   = 2, // split
+        LLAMA_SPLIT_MODE_ROW   = 2, // split layers and KV across GPUs, use tensor parallelism if supported
     };

     // TODO: simplify (https://github.com/ggerganov/llama.cpp/pull/9294#pullrequestreview-2286561979)
@@ -218,6 +219,7 @@ extern "C" {

     typedef struct llama_token_data_array {
         // TODO: consider SoA
+        // NOTE: this pointer can be modified by the samplers
         llama_token_data * data;
         size_t size;
         int64_t selected; // this is the index in the data array (i.e. not the token id)
@@ -233,8 +235,11 @@ extern "C" {
     // - token  : the token ids of the input (used when embd is NULL)
     // - embd   : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
     // - pos    : the positions of the respective token in the sequence
+    //            (if set to NULL, the token position will be tracked automatically by llama_decode)
     // - seq_id : the sequence to which the respective token belongs
+    //            (if set to NULL, the sequence ID will be assumed to be 0)
     // - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output
+    //            (if set to NULL, only the logits for last token will be returned)
     //
     typedef struct llama_batch {
         int32_t n_tokens;
@@ -245,15 +250,6 @@ extern "C" {
         int32_t      *  n_seq_id;
         llama_seq_id ** seq_id;
         int8_t       *  logits; // TODO: rename this to "output"
-
-        // NOTE: helpers for smooth API transition - can be deprecated in the future
-        // for future-proof code, use the above fields instead and ignore everything below
-        //
-        // pos[i] = all_pos_0 + i*all_pos_1
-        //
-        llama_pos    all_pos_0;  // used if pos == NULL
-        llama_pos    all_pos_1;  // used if pos == NULL
-        llama_seq_id all_seq_id; // used if seq_id == NULL
     } llama_batch;

     enum llama_model_kv_override_type {
@@ -280,10 +276,7 @@ extern "C" {
         int32_t n_gpu_layers; // number of layers to store in VRAM
         enum llama_split_mode split_mode; // how to split the model across multiple GPUs

-        //
-        // LLAMA_SPLIT_MODE_NONE: the GPU that is used for the entire model
-        // LLAMA_SPLIT_MODE_ROW: the GPU that is used for small tensors and intermediate results
-        // LLAMA_SPLIT_MODE_LAYER: ignored
+        // the GPU that is used for the entire model when split_mode is LLAMA_SPLIT_MODE_NONE
         int32_t main_gpu;

         // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
@@ -434,6 +427,7 @@ extern "C" {
     LLAMA_API bool llama_supports_mmap       (void);
     LLAMA_API bool llama_supports_mlock      (void);
     LLAMA_API bool llama_supports_gpu_offload(void);
+    LLAMA_API bool llama_supports_rpc        (void);

     LLAMA_API uint32_t llama_n_ctx      (const struct llama_context * ctx);
     LLAMA_API uint32_t llama_n_batch    (const struct llama_context * ctx);
@@ -776,15 +770,15 @@ extern "C" {
     // Decoding
     //

-    // Return batch for single sequence of tokens
+    // Return batch for single sequence of tokens
+    // The sequence ID will be fixed to 0
+    // The position of the tokens will be tracked automatically by llama_decode
     //
     // NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it
     //
     LLAMA_API struct llama_batch llama_batch_get_one(
                   llama_token * tokens,
-                      int32_t   n_tokens
-                    llama_pos   pos_0,
-                 llama_seq_id   seq_id);
+                      int32_t   n_tokens);

     // Allocates a batch of tokens on the heap that can hold a maximum of n_tokens
     // Each token can be assigned up to n_seq_max sequence ids
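The `llama_batch_get_one` hunk drops the explicit position and sequence-id arguments; per the new comments, `llama_decode` now tracks positions itself and the sequence id is fixed to 0. A minimal migration sketch for callers, assuming a loaded `llama_context`; the helper name `decode_chunk` is illustrative and not part of the package:

```cpp
// Sketch: adapting a decode call to the two-argument llama_batch_get_one().
// 1.2.3-era call: llama_decode(ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0));
// 1.2.6-era call: positions and the sequence id are no longer passed explicitly.
#include "llama.h"
#include <vector>

// Hypothetical helper: feeds one chunk of tokens and reports success.
static bool decode_chunk(llama_context * ctx, std::vector<llama_token> & tokens) {
    llama_batch batch = llama_batch_get_one(tokens.data(), (int32_t) tokens.size());
    return llama_decode(ctx, batch) == 0; // 0 indicates success
}
```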
@@ -897,6 +891,7 @@ extern "C" {
     // Special tokens
     LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
     LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
+    LLAMA_API llama_token llama_token_eot(const struct llama_model * model); // end-of-turn
     LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification
     LLAMA_API llama_token llama_token_sep(const struct llama_model * model); // sentence separator
     LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
@@ -905,11 +900,17 @@ extern "C" {
     LLAMA_API bool llama_add_bos_token(const struct llama_model * model);
     LLAMA_API bool llama_add_eos_token(const struct llama_model * model);

-    //
-    LLAMA_API llama_token llama_token_prefix(const struct llama_model * model)
-    LLAMA_API llama_token llama_token_middle(const struct llama_model * model)
-    LLAMA_API llama_token llama_token_suffix(const struct llama_model * model)
-
+    // infill tokens
+    DEPRECATED(LLAMA_API llama_token llama_token_prefix(const struct llama_model * model), "use llama_token_fim_pre instead");
+    DEPRECATED(LLAMA_API llama_token llama_token_middle(const struct llama_model * model), "use llama_token_fim_mid instead");
+    DEPRECATED(LLAMA_API llama_token llama_token_suffix(const struct llama_model * model), "use llama_token_fim_suf instead");
+
+    LLAMA_API llama_token llama_token_fim_pre(const struct llama_model * model);
+    LLAMA_API llama_token llama_token_fim_suf(const struct llama_model * model);
+    LLAMA_API llama_token llama_token_fim_mid(const struct llama_model * model);
+    LLAMA_API llama_token llama_token_fim_pad(const struct llama_model * model);
+    LLAMA_API llama_token llama_token_fim_rep(const struct llama_model * model);
+    LLAMA_API llama_token llama_token_fim_sep(const struct llama_model * model);

     //
     // Tokenization
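The deprecated `llama_token_prefix/middle/suffix` getters are superseded by the `llama_token_fim_*` family. A sketch of assembling a fill-in-the-middle prompt with the new getters; the prefix/suffix/middle ordering and treating a negative return value as "token not defined" are assumptions for illustration, not guarantees made by this diff:

```cpp
// Sketch: building a FIM prompt with the 1.2.6 token getters (ordering assumed).
#include "llama.h"
#include <vector>

static std::vector<llama_token> make_fim_prompt(const llama_model * model,
                                                const std::vector<llama_token> & prefix,
                                                const std::vector<llama_token> & suffix) {
    const llama_token pre = llama_token_fim_pre(model);
    const llama_token suf = llama_token_fim_suf(model);
    const llama_token mid = llama_token_fim_mid(model);
    if (pre < 0 || suf < 0 || mid < 0) {
        return prefix; // assumed fallback when the model defines no FIM vocabulary
    }
    std::vector<llama_token> out;
    out.push_back(pre);
    out.insert(out.end(), prefix.begin(), prefix.end());
    out.push_back(suf);
    out.insert(out.end(), suffix.begin(), suffix.end());
    out.push_back(mid); // generation continues after the middle marker
    return out;
}
```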
@@ -1068,12 +1069,13 @@ extern "C" {

     // available samplers:

-    LLAMA_API struct llama_sampler * llama_sampler_init_greedy
-    LLAMA_API struct llama_sampler * llama_sampler_init_dist
+    LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void);
+    LLAMA_API struct llama_sampler * llama_sampler_init_dist  (uint32_t seed);

     /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
     /// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first.
-    LLAMA_API struct llama_sampler * llama_sampler_init_softmax    (void)
+    DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_softmax    (void),
+        "will be removed in the future (see https://github.com/ggerganov/llama.cpp/pull/9896#discussion_r1800920915)");

     /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
     LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k);
@@ -1084,19 +1086,18 @@ extern "C" {
     /// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
     LLAMA_API struct llama_sampler * llama_sampler_init_min_p (float p, size_t min_keep);

-    /// @details XTC sampling as described in https://github.com/oobabooga/text-generation-webui/pull/6335
-    LLAMA_API struct llama_sampler * llama_sampler_init_xtc (float xtc_p, float xtc_t, size_t min_keep, uint32_t seed);
-
-    /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
-    LLAMA_API struct llama_sampler * llama_sampler_init_tail_free (float z, size_t min_keep);
-
     /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
     LLAMA_API struct llama_sampler * llama_sampler_init_typical (float p, size_t min_keep);
+
+    /// #details Updates the logits l_i` = l_i/t. When t <= 0.0f, the maximum logit is kept at it's original value, the rest are set to -inf
     LLAMA_API struct llama_sampler * llama_sampler_init_temp (float t);

     /// @details Dynamic temperature implementation (a.k.a. entropy) described in the paper https://arxiv.org/abs/2309.02772.
     LLAMA_API struct llama_sampler * llama_sampler_init_temp_ext (float t, float delta, float exponent);

+    /// @details XTC sampler as described in https://github.com/oobabooga/text-generation-webui/pull/6335
+    LLAMA_API struct llama_sampler * llama_sampler_init_xtc (float p, float t, size_t min_keep, uint32_t seed);
+
     /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
     /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
     /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
@@ -1136,11 +1137,43 @@ extern "C" {
                                 bool   penalize_nl,   // consider newlines as a repeatable token
                                 bool   ignore_eos);   // ignore the end-of-sequence token

+    /// @details DRY sampler, designed by p-e-w, as described in: https://github.com/oobabooga/text-generation-webui/pull/5677, porting Koboldcpp implementation authored by pi6am: https://github.com/LostRuins/koboldcpp/pull/982
+    LLAMA_API struct llama_sampler * llama_sampler_init_dry(
+            const struct llama_model *  model,
+                               float    dry_multiplier,
+                               float    dry_base,
+                             int32_t    dry_allowed_length,
+                             int32_t    dry_penalty_last_n,
+                          const char ** seq_breakers,
+                              size_t    num_breakers);
+
     LLAMA_API struct llama_sampler * llama_sampler_init_logit_bias(
                              int32_t   n_vocab,
                              int32_t   n_logit_bias,
              const llama_logit_bias * logit_bias);

+    // this sampler is meant to be used for fill-in-the-middle infilling
+    // it's supposed to be used after top_k + top_p sampling
+    //
+    // 1. if the sum of the EOG probs times the number of candidates is higher than the sum of the other probs -> pick EOG
+    // 2. combine probs of tokens that have the same prefix
+    //
+    // example:
+    //
+    // - before:
+    //   "hel":   0.5
+    //   "hell":  0.2
+    //   "hello": 0.1
+    //   "dummy": 0.1
+    //
+    // - after:
+    //   "hel":   0.8
+    //   "dummy": 0.1
+    //
+    // 3. discard non-EOG tokens with low prob
+    // 4. if no tokens are left -> pick EOT
+    //
+    LLAMA_API struct llama_sampler * llama_sampler_init_infill(const struct llama_model * model);
+
     // Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise
     LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl);
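This release adds the DRY, XTC and infill samplers, removes tail-free sampling, and gives `llama_sampler_init_dist` an explicit seed. A sketch of wiring the new samplers into a chain, assuming the `llama_sampler_chain_*` helpers that ship alongside these declarations in `llama.h`; all parameter values below are illustrative, not recommended defaults:

```cpp
// Sketch: composing the 1.2.6 samplers into a chain (values illustrative only).
#include "llama.h"

static llama_sampler * make_chain(const llama_model * model) {
    llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());

    // DRY repetition suppression, with a few common sequence breakers
    const char * breakers[] = { "\n", ":", "\"", "*" };
    llama_sampler_chain_add(chain, llama_sampler_init_dry(model,
        /*dry_multiplier*/ 0.8f, /*dry_base*/ 1.75f,
        /*dry_allowed_length*/ 2, /*dry_penalty_last_n*/ -1,
        breakers, 4));

    llama_sampler_chain_add(chain, llama_sampler_init_top_k(40));
    llama_sampler_chain_add(chain, llama_sampler_init_xtc(/*p*/ 0.5f, /*t*/ 0.1f,
                                                          /*min_keep*/ 1, /*seed*/ 1234));
    llama_sampler_chain_add(chain, llama_sampler_init_dist(/*seed*/ 1234)); // final pick

    return chain; // caller releases it with llama_sampler_free(chain)
}
```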
package/cpp/log.cpp
CHANGED
@@ -8,10 +8,10 @@
 #include <thread>
 #include <vector>

-int
+int common_log_verbosity_thold = LOG_DEFAULT_LLAMA;

-void
-
+void common_log_set_verbosity_thold(int verbosity) {
+    common_log_verbosity_thold = verbosity;
 }

 #define LOG_COL_DEFAULT "\033[0m"
@@ -29,16 +29,16 @@ static int64_t t_us() {
 }

 // colors
-enum
-
-
-
-
-
-
-
-
-
+enum common_log_col : int {
+    COMMON_LOG_COL_DEFAULT = 0,
+    COMMON_LOG_COL_BOLD,
+    COMMON_LOG_COL_RED,
+    COMMON_LOG_COL_GREEN,
+    COMMON_LOG_COL_YELLOW,
+    COMMON_LOG_COL_BLUE,
+    COMMON_LOG_COL_MAGENTA,
+    COMMON_LOG_COL_CYAN,
+    COMMON_LOG_COL_WHITE,
 };

 // disable colors by default
@@ -54,7 +54,7 @@ static std::vector<const char *> g_col = {
     "",
 };

-struct
+struct common_log_entry {
     enum lm_ggml_log_level level;

     bool prefix;
@@ -71,7 +71,7 @@ struct gpt_log_entry {
         if (!fcur) {
             // stderr displays DBG messages only when their verbosity level is not higher than the threshold
             // these messages will still be logged to a file
-            if (level == LM_GGML_LOG_LEVEL_DEBUG &&
+            if (level == LM_GGML_LOG_LEVEL_DEBUG && common_log_verbosity_thold < LOG_DEFAULT_DEBUG) {
                 return;
             }

@@ -86,19 +86,19 @@ struct gpt_log_entry {
         if (timestamp) {
             // [M.s.ms.us]
             fprintf(fcur, "%s%d.%02d.%03d.%03d%s ",
-                g_col[
+                g_col[COMMON_LOG_COL_BLUE],
                 (int) (timestamp / 1000000 / 60),
                 (int) (timestamp / 1000000 % 60),
                 (int) (timestamp / 1000 % 1000),
                 (int) (timestamp % 1000),
-                g_col[
+                g_col[COMMON_LOG_COL_DEFAULT]);
         }

         switch (level) {
-            case LM_GGML_LOG_LEVEL_INFO:  fprintf(fcur, "%sI %s", g_col[
-            case LM_GGML_LOG_LEVEL_WARN:  fprintf(fcur, "%sW %s", g_col[
-            case LM_GGML_LOG_LEVEL_ERROR: fprintf(fcur, "%sE %s", g_col[
-            case LM_GGML_LOG_LEVEL_DEBUG: fprintf(fcur, "%sD %s", g_col[
+            case LM_GGML_LOG_LEVEL_INFO:  fprintf(fcur, "%sI %s", g_col[COMMON_LOG_COL_GREEN],   g_col[COMMON_LOG_COL_DEFAULT]); break;
+            case LM_GGML_LOG_LEVEL_WARN:  fprintf(fcur, "%sW %s", g_col[COMMON_LOG_COL_MAGENTA], ""                          ); break;
+            case LM_GGML_LOG_LEVEL_ERROR: fprintf(fcur, "%sE %s", g_col[COMMON_LOG_COL_RED],     ""                          ); break;
+            case LM_GGML_LOG_LEVEL_DEBUG: fprintf(fcur, "%sD %s", g_col[COMMON_LOG_COL_YELLOW],  ""                          ); break;
             default:
                 break;
         }
@@ -107,18 +107,18 @@ struct gpt_log_entry {
         fprintf(fcur, "%s", msg.data());

         if (level == LM_GGML_LOG_LEVEL_WARN || level == LM_GGML_LOG_LEVEL_ERROR || level == LM_GGML_LOG_LEVEL_DEBUG) {
-            fprintf(fcur, "%s", g_col[
+            fprintf(fcur, "%s", g_col[COMMON_LOG_COL_DEFAULT]);
         }

         fflush(fcur);
     }
 };

-struct
+struct common_log {
     // default capacity - will be expanded if needed
-
+    common_log() : common_log(256) {}

-
+    common_log(size_t capacity) {
         file = nullptr;
         prefix = false;
         timestamps = false;
@@ -137,7 +137,7 @@ struct gpt_log {
         resume();
     }

-    ~
+    ~common_log() {
         pause();
         if (file) {
             fclose(file);
@@ -158,12 +158,12 @@ private:
     int64_t t_start;

     // ring buffer of entries
-    std::vector<
+    std::vector<common_log_entry> entries;
     size_t head;
     size_t tail;

     // worker thread copies into this
-
+    common_log_entry cur;

 public:
     void add(enum lm_ggml_log_level level, const char * fmt, va_list args) {
@@ -219,7 +219,7 @@ public:
         tail = (tail + 1) % entries.size();
         if (tail == head) {
             // expand the buffer
-            std::vector<
+            std::vector<common_log_entry> new_entries(2*entries.size());

             size_t new_tail = 0;

@@ -320,15 +320,15 @@ public:
         pause();

         if (colors) {
-            g_col[
-            g_col[
-            g_col[
-            g_col[
-            g_col[
-            g_col[
-            g_col[
-            g_col[
-            g_col[
+            g_col[COMMON_LOG_COL_DEFAULT] = LOG_COL_DEFAULT;
+            g_col[COMMON_LOG_COL_BOLD]    = LOG_COL_BOLD;
+            g_col[COMMON_LOG_COL_RED]     = LOG_COL_RED;
+            g_col[COMMON_LOG_COL_GREEN]   = LOG_COL_GREEN;
+            g_col[COMMON_LOG_COL_YELLOW]  = LOG_COL_YELLOW;
+            g_col[COMMON_LOG_COL_BLUE]    = LOG_COL_BLUE;
+            g_col[COMMON_LOG_COL_MAGENTA] = LOG_COL_MAGENTA;
+            g_col[COMMON_LOG_COL_CYAN]    = LOG_COL_CYAN;
+            g_col[COMMON_LOG_COL_WHITE]   = LOG_COL_WHITE;
         } else {
             for (size_t i = 0; i < g_col.size(); i++) {
                 g_col[i] = "";
@@ -355,47 +355,47 @@ public:
 // public API
 //

-struct
-    return new
+struct common_log * common_log_init() {
+    return new common_log;
 }

-struct
-    static struct
+struct common_log * common_log_main() {
+    static struct common_log log;

     return &log;
 }

-void
+void common_log_pause(struct common_log * log) {
     log->pause();
 }

-void
+void common_log_resume(struct common_log * log) {
     log->resume();
 }

-void
+void common_log_free(struct common_log * log) {
     delete log;
 }

-void
+void common_log_add(struct common_log * log, enum lm_ggml_log_level level, const char * fmt, ...) {
     va_list args;
     va_start(args, fmt);
     log->add(level, fmt, args);
     va_end(args);
 }

-void
+void common_log_set_file(struct common_log * log, const char * file) {
     log->set_file(file);
 }

-void
+void common_log_set_colors(struct common_log * log, bool colors) {
     log->set_colors(colors);
 }

-void
+void common_log_set_prefix(struct common_log * log, bool prefix) {
     log->set_prefix(prefix);
 }

-void
+void common_log_set_timestamps(struct common_log * log, bool timestamps) {
     log->set_timestamps(timestamps);
 }
package/cpp/log.h
CHANGED
@@ -14,23 +14,23 @@
 #define LOG_DEFAULT_LLAMA 0

 // needed by the LOG_TMPL macro to avoid computing log arguments if the verbosity lower
-// set via
-extern int
+// set via common_log_set_verbosity()
+extern int common_log_verbosity_thold;

-void
+void common_log_set_verbosity_thold(int verbosity); // not thread-safe

-// the
+// the common_log uses an internal worker thread to print/write log messages
 // when the worker thread is paused, incoming log messages are discarded
-struct
+struct common_log;

-struct
-struct
-void
-void
-void
+struct common_log * common_log_init();
+struct common_log * common_log_main(); // singleton, automatically destroys itself on exit
+void common_log_pause (struct common_log * log); // pause  the worker thread, not thread-safe
+void common_log_resume(struct common_log * log); // resume the worker thread, not thread-safe
+void common_log_free  (struct common_log * log);

 LOG_ATTRIBUTE_FORMAT(3, 4)
-void
+void common_log_add(struct common_log * log, enum lm_ggml_log_level level, const char * fmt, ...);

 // defaults: file = NULL, colors = false, prefix = false, timestamps = false
 //
@@ -54,10 +54,10 @@ void gpt_log_add(struct gpt_log * log, enum lm_ggml_log_level level, const char
 // D - debug (stderr, V = LOG_DEFAULT_DEBUG)
 //

-void
-void
-void
-void
+void common_log_set_file      (struct common_log * log, const char * file); // not thread-safe
+void common_log_set_colors    (struct common_log * log, bool colors);       // not thread-safe
+void common_log_set_prefix    (struct common_log * log, bool prefix);       // whether to output prefix to each log
+void common_log_set_timestamps(struct common_log * log, bool timestamps);   // whether to output timestamps in the prefix

 // helper macros for logging
 // use these to avoid computing log arguments if the verbosity of the log is higher than the threshold
@@ -66,7 +66,7 @@ void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps); // w
 //
 //   LOG_DBG("this is a debug message: %d\n", expensive_function());
 //
-// this will avoid calling expensive_function() if LOG_DEFAULT_DEBUG >
+// this will avoid calling expensive_function() if LOG_DEFAULT_DEBUG > common_log_verbosity_thold
 //


@@ -98,8 +98,8 @@ void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps); // w

 #define LOG_TMPL(level, verbosity, ...) \
     do { \
-        if ((verbosity) <=
-
+        if ((verbosity) <= common_log_verbosity_thold) { \
+            common_log_add(common_log_main(), (level), __VA_ARGS__); \
         } \
     } while (0)

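The `LOG_TMPL` macro compares the requested verbosity against `common_log_verbosity_thold` before evaluating its arguments, which is what keeps expensive argument expressions out of disabled log calls. A sketch of that behaviour through the convenience macros; `LOG_INF`/`LOG_DBG` are assumed to expand via `LOG_TMPL` as in upstream llama.cpp's `common/log.h`:

```cpp
// Sketch: log calls guarded by the verbosity threshold (macro names assumed).
#include "log.h"

static int expensive_function() { return 42; } // stand-in for costly work

static void report(int n_tokens) {
    common_log_set_verbosity_thold(LOG_DEFAULT_LLAMA);    // default verbosity
    LOG_INF("processed %d tokens\n", n_tokens);           // always evaluated
    LOG_DBG("debug detail: %d\n", expensive_function());  // argument not computed
                                                           // unless the threshold
                                                           // reaches LOG_DEFAULT_DEBUG
}
```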
package/cpp/rn-llama.hpp
CHANGED
@@ -117,7 +117,7 @@ static size_t find_partial_stop_string(const std::string &stop,
 // format incomplete utf-8 multibyte character for output
 static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token)
 {
-    std::string out = token == -1 ? "" :
+    std::string out = token == -1 ? "" : common_token_to_piece(ctx, token);
     // if the size is 1 and first bit is 1, meaning it's a partial character
     // (size > 1 meaning it's already a known token)
     if (out.size() == 1 && (out[0] & 0x80) == 0x80)
@@ -136,7 +136,7 @@ static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
     std::string ret;
     for (; begin != end; ++begin)
     {
-        ret +=
+        ret += common_token_to_piece(ctx, *begin);
     }
     return ret;
 }
@@ -157,11 +157,11 @@ struct llama_rn_context

     std::vector<llama_token> embd;

-
+    common_params params;

     llama_model *model = nullptr;
     llama_context *ctx = nullptr;
-
+    common_sampler *ctx_sampling = nullptr;

     int n_ctx;

@@ -186,7 +186,7 @@ struct llama_rn_context
         }
         if (ctx_sampling != nullptr)
         {
-
+            common_sampler_free(ctx_sampling);
         }
     }

@@ -213,16 +213,16 @@ struct llama_rn_context

     bool initSampling() {
         if (ctx_sampling != nullptr) {
-
+            common_sampler_free(ctx_sampling);
         }
-        ctx_sampling =
+        ctx_sampling = common_sampler_init(model, params.sparams);
         return ctx_sampling != nullptr;
     }

-    bool loadModel(
+    bool loadModel(common_params &params_)
     {
         params = params_;
-
+        common_init_result result = common_init_from_params(params);
         model = result.model;
         ctx = result.context;
         if (model == nullptr)
@@ -268,7 +268,7 @@ struct llama_rn_context

     void loadPrompt()
     {
-        std::vector<llama_token> prompt_tokens = ::
+        std::vector<llama_token> prompt_tokens = ::common_tokenize(ctx, params.prompt, true, true);
         num_prompt_tokens = prompt_tokens.size();

         // LOG tokens
@@ -302,7 +302,7 @@ struct llama_rn_context
         // push the prompt into the sampling context (do not apply grammar)
         for (auto & token : prompt_tokens)
         {
-
+            common_sampler_accept(ctx_sampling, token, false);
         }
         // compare the evaluated prompt with the new prompt
         n_past = params.embedding? 0 : common_part(embd, prompt_tokens);
@@ -375,8 +375,8 @@ struct llama_rn_context
         {
             n_eval = params.n_batch;
         }
-        if (llama_decode(ctx, llama_batch_get_one(&embd[n_past], n_eval
-        {
+        if (llama_decode(ctx, llama_batch_get_one(&embd[n_past], n_eval)))
+        {
             LOG_ERROR("failed to eval, n_eval: %d, n_past: %d, n_threads: %d, embd: %s",
                 n_eval,
                 n_past,
@@ -408,18 +408,19 @@ struct llama_rn_context
         std::vector<llama_token_data> candidates;
         candidates.reserve(llama_n_vocab(model));

-        result.tok =
+        result.tok = common_sampler_sample(ctx_sampling, ctx, -1);

-        llama_token_data_array cur_p = *
+        llama_token_data_array cur_p = *common_sampler_get_candidates(ctx_sampling);

         const int32_t n_probs = params.sparams.n_probs;

-
-        if (params.sparams.temp <= 0 && n_probs > 0)
+        // deprecated
+        /*if (params.sparams.temp <= 0 && n_probs > 0)
         {
             // For llama_sample_token_greedy we need to sort candidates
             llama_sampler_init_softmax();
-
+
+        }*/


         for (size_t i = 0; i < std::min(cur_p.size, (size_t)n_probs); ++i)
@@ -427,7 +428,7 @@ struct llama_rn_context
             result.probs.push_back({cur_p.data[i].id, cur_p.data[i].p});
         }

-
+        common_sampler_accept(ctx_sampling, result.tok, true);
         if (tg) {
             num_tokens_predicted++;
         }
@@ -487,7 +488,7 @@ struct llama_rn_context
     {
         const completion_token_output token_with_probs = nextToken();

-        const std::string token_text = token_with_probs.tok == -1 ? "" :
+        const std::string token_text = token_with_probs.tok == -1 ? "" : common_token_to_piece(ctx, token_with_probs.tok);
         generated_text += token_text;

         if (params.sparams.n_probs > 0)
@@ -528,7 +529,7 @@ struct llama_rn_context
         }

         LOG_VERBOSE("next token, token: %s, token_text: %s, has_next_token: %d, n_remain: %d, num_tokens_predicted: %d, stopped_eos: %d, stopped_word: %d, stopped_limit: %d, stopping_word: %s",
-
+            common_token_to_piece(ctx, token_with_probs.tok),
             tokens_to_output_formatted_string(ctx, token_with_probs.tok).c_str(),
             has_next_token,
             n_remain,
@@ -562,7 +563,7 @@ struct llama_rn_context
             return std::vector<float>(n_embd, 0.0f);
         }
         std::vector<float> embedding(data, data + n_embd), out(data, data + n_embd);
-
+        common_embd_normalize(embedding.data(), out.data(), n_embd, params.embd_normalize);
         return out;
     }

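Throughout `rn-llama.hpp` the old helpers are replaced by their `common_*` counterparts (`common_tokenize`, `common_sampler_init`, `common_sampler_sample`, `common_sampler_accept`, `common_token_to_piece`, `common_embd_normalize`). A condensed sketch of one generation step using that sample-then-accept pattern, outside the `llama_rn_context` struct; it assumes the helpers behave exactly as called in the diff and omits error handling:

```cpp
// Sketch: one generation step with the renamed common_* sampling helpers,
// mirroring the pattern in llama_rn_context::nextToken() above.
#include "rn-llama.hpp"
#include <string>

static std::string generate_one(llama_context * ctx, common_sampler * smpl) {
    // sample from the logits of the last decoded token (index -1)
    const llama_token tok = common_sampler_sample(smpl, ctx, -1);

    // record the sampled token; the diff passes false when replaying the prompt
    // in loadPrompt() and true after sampling in nextToken()
    common_sampler_accept(smpl, tok, true);

    // convert the token id back to text for the caller
    return common_token_to_piece(ctx, tok);
}
```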