cui-llama.rn 1.1.4 → 1.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/CMakeLists.txt +1 -0
- package/android/src/main/jni.cpp +3 -4
- package/cpp/common.cpp +183 -1990
- package/cpp/common.h +101 -130
- package/cpp/ggml-impl.h +32 -0
- package/cpp/ggml-metal.m +38 -28
- package/cpp/ggml-quants.c +275 -84
- package/cpp/ggml.c +89 -35
- package/cpp/ggml.h +30 -67
- package/cpp/llama-impl.h +1 -0
- package/cpp/llama-sampling.cpp +218 -102
- package/cpp/llama.cpp +599 -120
- package/cpp/llama.h +33 -25
- package/cpp/log.cpp +401 -0
- package/cpp/log.h +85 -703
- package/cpp/rn-llama.hpp +9 -11
- package/cpp/sampling.cpp +12 -9
- package/cpp/sampling.h +4 -56
- package/cpp/sgemm.cpp +38 -0
- package/package.json +1 -1
package/cpp/llama.h
CHANGED
@@ -344,7 +344,7 @@ extern "C" {
|
|
344
344
|
bool embeddings; // if true, extract embeddings (together with logits)
|
345
345
|
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
|
346
346
|
bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
|
347
|
-
|
347
|
+
bool no_perf; // whether to measure performance timings
|
348
348
|
|
349
349
|
// Abort callback
|
350
350
|
// if it returns true, execution of llama_decode() will be aborted
|
@@ -1057,6 +1057,9 @@ extern "C" {
|
|
1057
1057
|
LLAMA_API struct llama_sampler * llama_sampler_chain_get(const struct llama_sampler * chain, int32_t i);
|
1058
1058
|
LLAMA_API int llama_sampler_chain_n (const struct llama_sampler * chain);
|
1059
1059
|
|
1060
|
+
// after removing a sampler, the chain will no longer own it, and it will not be freed when the chain is freed
|
1061
|
+
LLAMA_API struct llama_sampler * llama_sampler_chain_remove( struct llama_sampler * chain, int32_t i);
|
1062
|
+
|
1060
1063
|
// available samplers:
|
1061
1064
|
|
1062
1065
|
LLAMA_API struct llama_sampler * llama_sampler_init_greedy (void);
|
@@ -1131,15 +1134,20 @@ extern "C" {
|
|
1131
1134
|
int32_t n_logit_bias,
|
1132
1135
|
const llama_logit_bias * logit_bias);
|
1133
1136
|
|
1134
|
-
|
1137
|
+
|
1138
|
+
// Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise
|
1139
|
+
LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl);
|
1140
|
+
|
1141
|
+
/// @details Sample and accept a token from the idx-th output of the last evaluation
|
1135
1142
|
//
|
1143
|
+
// Shorthand for:
|
1136
1144
|
// const auto * logits = llama_get_logits_ith(ctx, idx);
|
1137
1145
|
// llama_token_data_array cur_p = { ... init from logits ... };
|
1138
1146
|
// llama_sampler_apply(smpl, &cur_p);
|
1139
|
-
//
|
1140
|
-
//
|
1141
|
-
//
|
1142
|
-
//
|
1147
|
+
// auto token = cur_p.data[cur_p.selected].id;
|
1148
|
+
// llama_sampler_accept(smpl, token);
|
1149
|
+
// return token;
|
1150
|
+
// Returns the sampled token
|
1143
1151
|
LLAMA_API llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_context * ctx, int32_t idx);
|
1144
1152
|
|
1145
1153
|
// TODO: extend in the future
|
@@ -1172,21 +1180,8 @@ extern "C" {
|
|
1172
1180
|
// NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements.
|
1173
1181
|
//
|
1174
1182
|
|
1175
|
-
|
1176
|
-
LLAMA_PERF_TYPE_CONTEXT = 0,
|
1177
|
-
LLAMA_PERF_TYPE_SAMPLER_CHAIN = 1,
|
1178
|
-
};
|
1179
|
-
|
1180
|
-
LLAMA_API void llama_perf_print(const void * ctx, enum llama_perf_type type);
|
1181
|
-
LLAMA_API void llama_perf_reset( void * ctx, enum llama_perf_type type);
|
1182
|
-
|
1183
|
-
LLAMA_API void llama_perf_dump_yaml(FILE * stream, const struct llama_context * ctx);
|
1184
|
-
|
1185
|
-
// Keeps timings of samplers
|
1186
|
-
LLAMA_API struct llama_sampler_timings {int64_t t_sample_us; int32_t n_sample;};
|
1187
|
-
LLAMA_API struct llama_token_timings {
|
1183
|
+
struct llama_perf_context_data {
|
1188
1184
|
double t_start_ms;
|
1189
|
-
double t_end_ms;
|
1190
1185
|
double t_load_ms;
|
1191
1186
|
double t_p_eval_ms;
|
1192
1187
|
double t_eval_ms;
|
@@ -1194,11 +1189,24 @@ extern "C" {
|
|
1194
1189
|
int32_t n_p_eval;
|
1195
1190
|
int32_t n_eval;
|
1196
1191
|
};
|
1197
|
-
|
1198
|
-
|
1199
|
-
|
1200
|
-
|
1201
|
-
|
1192
|
+
|
1193
|
+
struct llama_perf_sampler_data {
|
1194
|
+
double t_sample_ms;
|
1195
|
+
|
1196
|
+
int32_t n_sample;
|
1197
|
+
};
|
1198
|
+
|
1199
|
+
LLAMA_API struct llama_perf_context_data llama_perf_context (const struct llama_context * ctx);
|
1200
|
+
LLAMA_API void llama_perf_context_print(const struct llama_context * ctx);
|
1201
|
+
LLAMA_API void llama_perf_context_reset( struct llama_context * ctx);
|
1202
|
+
|
1203
|
+
// NOTE: the following work only with samplers constructed via llama_sampler_chain_init
|
1204
|
+
LLAMA_API struct llama_perf_sampler_data llama_perf_sampler (const struct llama_sampler * chain);
|
1205
|
+
LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain);
|
1206
|
+
LLAMA_API void llama_perf_sampler_reset( struct llama_sampler * chain);
|
1207
|
+
|
1208
|
+
LLAMA_API void llama_perf_dump_yaml(FILE * stream, const struct llama_context * ctx);
|
1209
|
+
|
1202
1210
|
#ifdef __cplusplus
|
1203
1211
|
}
|
1204
1212
|
#endif
|
package/cpp/log.cpp
ADDED
@@ -0,0 +1,401 @@
|
|
1
|
+
#include "log.h"

#include <chrono>
#include <condition_variable>
#include <cstdarg>
#include <cstdio>
#include <mutex>
#include <sstream>
#include <thread>
#include <vector>
|
10
|
+
|
11
|
+
// global verbosity threshold
// while it is below LOG_DEFAULT_DEBUG, debug messages are not written to the console
int gpt_log_verbosity_thold = LOG_DEFAULT_LLAMA;

// replaces the global verbosity threshold with `verbosity`
void gpt_log_set_verbosity_thold(int verbosity) {
    gpt_log_verbosity_thold = verbosity;
}
|
16
|
+
|
17
|
+
// ANSI escape sequences used to colorize the log output
#define LOG_COL_DEFAULT "\033[0m"
#define LOG_COL_BOLD    "\033[1m"
#define LOG_COL_RED     "\033[31m"
#define LOG_COL_GREEN   "\033[32m"
#define LOG_COL_YELLOW  "\033[33m"
#define LOG_COL_BLUE    "\033[34m"
#define LOG_COL_MAGENTA "\033[35m"
#define LOG_COL_CYAN    "\033[36m"
#define LOG_COL_WHITE   "\033[37m"
|
26
|
+
|
27
|
+
// Returns a monotonic time reading in microseconds.
//
// The absolute value has no meaning on its own: the readings are only ever subtracted from each
// other to obtain elapsed time. A steady clock is used so that the difference cannot jump or go
// negative when the wall clock is adjusted while the program is running.
static int64_t t_us() {
    const auto since_epoch = std::chrono::steady_clock::now().time_since_epoch();

    return std::chrono::duration_cast<std::chrono::microseconds>(since_epoch).count();
}
|
30
|
+
|
31
|
+
// color slots - each value is an index into the g_col table
enum gpt_log_col : int {
    GPT_LOG_COL_DEFAULT = 0,
    GPT_LOG_COL_BOLD    = 1,
    GPT_LOG_COL_RED     = 2,
    GPT_LOG_COL_GREEN   = 3,
    GPT_LOG_COL_YELLOW  = 4,
    GPT_LOG_COL_BLUE    = 5,
    GPT_LOG_COL_MAGENTA = 6,
    GPT_LOG_COL_CYAN    = 7,
    GPT_LOG_COL_WHITE   = 8,
};
|
43
|
+
|
44
|
+
// escape sequence to emit for each of the 9 color slots
// every slot starts out as an empty string, i.e. colors are disabled by default
static std::vector<const char *> g_col(9, "");
|
56
|
+
|
57
|
+
struct gpt_log_entry {
|
58
|
+
enum lm_ggml_log_level level;
|
59
|
+
|
60
|
+
bool prefix;
|
61
|
+
|
62
|
+
int64_t timestamp;
|
63
|
+
|
64
|
+
std::vector<char> msg;
|
65
|
+
|
66
|
+
// signals the worker thread to stop
|
67
|
+
bool is_end;
|
68
|
+
|
69
|
+
void print(FILE * file = nullptr) const {
|
70
|
+
FILE * fcur = file;
|
71
|
+
if (!fcur) {
|
72
|
+
// stderr displays DBG messages only when their verbosity level is not higher than the threshold
|
73
|
+
// these messages will still be logged to a file
|
74
|
+
if (level == LM_GGML_LOG_LEVEL_DEBUG && gpt_log_verbosity_thold < LOG_DEFAULT_DEBUG) {
|
75
|
+
return;
|
76
|
+
}
|
77
|
+
|
78
|
+
fcur = stdout;
|
79
|
+
|
80
|
+
if (level != LM_GGML_LOG_LEVEL_NONE) {
|
81
|
+
fcur = stderr;
|
82
|
+
}
|
83
|
+
}
|
84
|
+
|
85
|
+
if (level != LM_GGML_LOG_LEVEL_NONE && prefix) {
|
86
|
+
if (timestamp) {
|
87
|
+
// [M.s.ms.us]
|
88
|
+
fprintf(fcur, "%s%d.%02d.%03d.%03d%s ",
|
89
|
+
g_col[GPT_LOG_COL_BLUE],
|
90
|
+
(int) (timestamp / 1000000 / 60),
|
91
|
+
(int) (timestamp / 1000000 % 60),
|
92
|
+
(int) (timestamp / 1000 % 1000),
|
93
|
+
(int) (timestamp % 1000),
|
94
|
+
g_col[GPT_LOG_COL_DEFAULT]);
|
95
|
+
}
|
96
|
+
|
97
|
+
switch (level) {
|
98
|
+
case LM_GGML_LOG_LEVEL_INFO: fprintf(fcur, "%sI %s", g_col[GPT_LOG_COL_GREEN], g_col[GPT_LOG_COL_DEFAULT]); break;
|
99
|
+
case LM_GGML_LOG_LEVEL_WARN: fprintf(fcur, "%sW %s", g_col[GPT_LOG_COL_MAGENTA], "" ); break;
|
100
|
+
case LM_GGML_LOG_LEVEL_ERROR: fprintf(fcur, "%sE %s", g_col[GPT_LOG_COL_RED], "" ); break;
|
101
|
+
case LM_GGML_LOG_LEVEL_DEBUG: fprintf(fcur, "%sD %s", g_col[GPT_LOG_COL_YELLOW], "" ); break;
|
102
|
+
default:
|
103
|
+
break;
|
104
|
+
}
|
105
|
+
}
|
106
|
+
|
107
|
+
fprintf(fcur, "%s", msg.data());
|
108
|
+
|
109
|
+
if (level == LM_GGML_LOG_LEVEL_WARN || level == LM_GGML_LOG_LEVEL_ERROR || level == LM_GGML_LOG_LEVEL_DEBUG) {
|
110
|
+
fprintf(fcur, "%s", g_col[GPT_LOG_COL_DEFAULT]);
|
111
|
+
}
|
112
|
+
|
113
|
+
fflush(fcur);
|
114
|
+
}
|
115
|
+
};
|
116
|
+
|
117
|
+
struct gpt_log {
|
118
|
+
// default capacity - will be expanded if needed
|
119
|
+
gpt_log() : gpt_log(256) {}
|
120
|
+
|
121
|
+
gpt_log(size_t capacity) {
|
122
|
+
file = nullptr;
|
123
|
+
prefix = false;
|
124
|
+
timestamps = false;
|
125
|
+
running = false;
|
126
|
+
t_start = t_us();
|
127
|
+
|
128
|
+
// initial message size - will be expanded if longer messages arrive
|
129
|
+
entries.resize(capacity);
|
130
|
+
for (auto & entry : entries) {
|
131
|
+
entry.msg.resize(256);
|
132
|
+
}
|
133
|
+
|
134
|
+
head = 0;
|
135
|
+
tail = 0;
|
136
|
+
|
137
|
+
resume();
|
138
|
+
}
|
139
|
+
|
140
|
+
~gpt_log() {
|
141
|
+
pause();
|
142
|
+
if (file) {
|
143
|
+
fclose(file);
|
144
|
+
}
|
145
|
+
}
|
146
|
+
|
147
|
+
private:
|
148
|
+
std::mutex mtx;
|
149
|
+
std::thread thrd;
|
150
|
+
std::condition_variable cv;
|
151
|
+
|
152
|
+
FILE * file;
|
153
|
+
|
154
|
+
bool prefix;
|
155
|
+
bool timestamps;
|
156
|
+
bool running;
|
157
|
+
|
158
|
+
int64_t t_start;
|
159
|
+
|
160
|
+
// ring buffer of entries
|
161
|
+
std::vector<gpt_log_entry> entries;
|
162
|
+
size_t head;
|
163
|
+
size_t tail;
|
164
|
+
|
165
|
+
// worker thread copies into this
|
166
|
+
gpt_log_entry cur;
|
167
|
+
|
168
|
+
public:
|
169
|
+
void add(enum lm_ggml_log_level level, const char * fmt, va_list args) {
|
170
|
+
std::lock_guard<std::mutex> lock(mtx);
|
171
|
+
|
172
|
+
if (!running) {
|
173
|
+
// discard messages while the worker thread is paused
|
174
|
+
return;
|
175
|
+
}
|
176
|
+
|
177
|
+
auto & entry = entries[tail];
|
178
|
+
|
179
|
+
{
|
180
|
+
// cannot use args twice, so make a copy in case we need to expand the buffer
|
181
|
+
va_list args_copy;
|
182
|
+
va_copy(args_copy, args);
|
183
|
+
|
184
|
+
#if 1
|
185
|
+
const size_t n = vsnprintf(entry.msg.data(), entry.msg.size(), fmt, args);
|
186
|
+
if (n >= entry.msg.size()) {
|
187
|
+
entry.msg.resize(n + 1);
|
188
|
+
vsnprintf(entry.msg.data(), entry.msg.size(), fmt, args_copy);
|
189
|
+
}
|
190
|
+
#else
|
191
|
+
// hack for bolding arguments
|
192
|
+
|
193
|
+
std::stringstream ss;
|
194
|
+
for (int i = 0; fmt[i] != 0; i++) {
|
195
|
+
if (fmt[i] == '%') {
|
196
|
+
ss << LOG_COL_BOLD;
|
197
|
+
while (fmt[i] != ' ' && fmt[i] != ')' && fmt[i] != ']' && fmt[i] != 0) ss << fmt[i++];
|
198
|
+
ss << LOG_COL_DEFAULT;
|
199
|
+
if (fmt[i] == 0) break;
|
200
|
+
}
|
201
|
+
ss << fmt[i];
|
202
|
+
}
|
203
|
+
const size_t n = vsnprintf(entry.msg.data(), entry.msg.size(), ss.str().c_str(), args);
|
204
|
+
if (n >= entry.msg.size()) {
|
205
|
+
entry.msg.resize(n + 1);
|
206
|
+
vsnprintf(entry.msg.data(), entry.msg.size(), ss.str().c_str(), args_copy);
|
207
|
+
}
|
208
|
+
#endif
|
209
|
+
}
|
210
|
+
|
211
|
+
entry.level = level;
|
212
|
+
entry.prefix = prefix;
|
213
|
+
entry.timestamp = 0;
|
214
|
+
if (timestamps) {
|
215
|
+
entry.timestamp = t_us() - t_start;
|
216
|
+
}
|
217
|
+
entry.is_end = false;
|
218
|
+
|
219
|
+
tail = (tail + 1) % entries.size();
|
220
|
+
if (tail == head) {
|
221
|
+
// expand the buffer
|
222
|
+
std::vector<gpt_log_entry> new_entries(2*entries.size());
|
223
|
+
|
224
|
+
size_t new_tail = 0;
|
225
|
+
|
226
|
+
do {
|
227
|
+
new_entries[new_tail] = std::move(entries[head]);
|
228
|
+
|
229
|
+
head = (head + 1) % entries.size();
|
230
|
+
new_tail = (new_tail + 1);
|
231
|
+
} while (head != tail);
|
232
|
+
|
233
|
+
head = 0;
|
234
|
+
tail = new_tail;
|
235
|
+
|
236
|
+
for (size_t i = tail; i < new_entries.size(); i++) {
|
237
|
+
new_entries[i].msg.resize(256);
|
238
|
+
}
|
239
|
+
|
240
|
+
entries = std::move(new_entries);
|
241
|
+
}
|
242
|
+
|
243
|
+
cv.notify_one();
|
244
|
+
}
|
245
|
+
|
246
|
+
void resume() {
|
247
|
+
std::lock_guard<std::mutex> lock(mtx);
|
248
|
+
|
249
|
+
if (running) {
|
250
|
+
return;
|
251
|
+
}
|
252
|
+
|
253
|
+
running = true;
|
254
|
+
|
255
|
+
thrd = std::thread([this]() {
|
256
|
+
while (true) {
|
257
|
+
{
|
258
|
+
std::unique_lock<std::mutex> lock(mtx);
|
259
|
+
cv.wait(lock, [this]() { return head != tail; });
|
260
|
+
|
261
|
+
cur = entries[head];
|
262
|
+
|
263
|
+
head = (head + 1) % entries.size();
|
264
|
+
}
|
265
|
+
|
266
|
+
if (cur.is_end) {
|
267
|
+
break;
|
268
|
+
}
|
269
|
+
|
270
|
+
cur.print(); // stdout and stderr
|
271
|
+
|
272
|
+
if (file) {
|
273
|
+
cur.print(file);
|
274
|
+
}
|
275
|
+
}
|
276
|
+
});
|
277
|
+
}
|
278
|
+
|
279
|
+
void pause() {
|
280
|
+
{
|
281
|
+
std::lock_guard<std::mutex> lock(mtx);
|
282
|
+
|
283
|
+
if (!running) {
|
284
|
+
return;
|
285
|
+
}
|
286
|
+
|
287
|
+
running = false;
|
288
|
+
|
289
|
+
// push an entry to signal the worker thread to stop
|
290
|
+
{
|
291
|
+
auto & entry = entries[tail];
|
292
|
+
entry.is_end = true;
|
293
|
+
|
294
|
+
tail = (tail + 1) % entries.size();
|
295
|
+
}
|
296
|
+
|
297
|
+
cv.notify_one();
|
298
|
+
}
|
299
|
+
|
300
|
+
thrd.join();
|
301
|
+
}
|
302
|
+
|
303
|
+
void set_file(const char * path) {
|
304
|
+
pause();
|
305
|
+
|
306
|
+
if (file) {
|
307
|
+
fclose(file);
|
308
|
+
}
|
309
|
+
|
310
|
+
if (path) {
|
311
|
+
file = fopen(path, "w");
|
312
|
+
} else {
|
313
|
+
file = nullptr;
|
314
|
+
}
|
315
|
+
|
316
|
+
resume();
|
317
|
+
}
|
318
|
+
|
319
|
+
void set_colors(bool colors) {
|
320
|
+
pause();
|
321
|
+
|
322
|
+
if (colors) {
|
323
|
+
g_col[GPT_LOG_COL_DEFAULT] = LOG_COL_DEFAULT;
|
324
|
+
g_col[GPT_LOG_COL_BOLD] = LOG_COL_BOLD;
|
325
|
+
g_col[GPT_LOG_COL_RED] = LOG_COL_RED;
|
326
|
+
g_col[GPT_LOG_COL_GREEN] = LOG_COL_GREEN;
|
327
|
+
g_col[GPT_LOG_COL_YELLOW] = LOG_COL_YELLOW;
|
328
|
+
g_col[GPT_LOG_COL_BLUE] = LOG_COL_BLUE;
|
329
|
+
g_col[GPT_LOG_COL_MAGENTA] = LOG_COL_MAGENTA;
|
330
|
+
g_col[GPT_LOG_COL_CYAN] = LOG_COL_CYAN;
|
331
|
+
g_col[GPT_LOG_COL_WHITE] = LOG_COL_WHITE;
|
332
|
+
} else {
|
333
|
+
for (size_t i = 0; i < g_col.size(); i++) {
|
334
|
+
g_col[i] = "";
|
335
|
+
}
|
336
|
+
}
|
337
|
+
|
338
|
+
resume();
|
339
|
+
}
|
340
|
+
|
341
|
+
void set_prefix(bool prefix) {
|
342
|
+
std::lock_guard<std::mutex> lock(mtx);
|
343
|
+
|
344
|
+
this->prefix = prefix;
|
345
|
+
}
|
346
|
+
|
347
|
+
void set_timestamps(bool timestamps) {
|
348
|
+
std::lock_guard<std::mutex> lock(mtx);
|
349
|
+
|
350
|
+
this->timestamps = timestamps;
|
351
|
+
}
|
352
|
+
};
|
353
|
+
|
354
|
+
//
|
355
|
+
// public API
|
356
|
+
//
|
357
|
+
|
358
|
+
struct gpt_log * gpt_log_init() {
|
359
|
+
return new gpt_log;
|
360
|
+
}
|
361
|
+
|
362
|
+
struct gpt_log * gpt_log_main() {
|
363
|
+
static struct gpt_log log;
|
364
|
+
|
365
|
+
return &log;
|
366
|
+
}
|
367
|
+
|
368
|
+
// stops the worker thread of `log`; messages added while it is paused are discarded
void gpt_log_pause(struct gpt_log * log) {
    log->pause();
}
|
371
|
+
|
372
|
+
// restarts the worker thread of `log` if it is not running
void gpt_log_resume(struct gpt_log * log) {
    log->resume();
}
|
375
|
+
|
376
|
+
// destroys a logger created with gpt_log_init
// the destructor stops the worker thread and closes the log file
void gpt_log_free(struct gpt_log * log) {
    delete log;
}
|
379
|
+
|
380
|
+
// printf-style entry point: forwards the format string and its variadic arguments to `log`
void gpt_log_add(struct gpt_log * log, enum lm_ggml_log_level level, const char * fmt, ...) {
    va_list ap;

    va_start(ap, fmt);
    log->add(level, fmt, ap);
    va_end(ap);
}
|
386
|
+
|
387
|
+
// makes `log` write to the file at path `file` as well; a null path turns file logging off
void gpt_log_set_file(struct gpt_log * log, const char * file) {
    log->set_file(file);
}
|
390
|
+
|
391
|
+
// turns colored output on or off
void gpt_log_set_colors(struct gpt_log * log, bool colors) {
    log->set_colors(colors);
}
|
394
|
+
|
395
|
+
// turns the message prefix on or off for messages added to `log` from now on
void gpt_log_set_prefix(struct gpt_log * log, bool prefix) {
    log->set_prefix(prefix);
}
|
398
|
+
|
399
|
+
// turns timestamps on or off for messages added to `log` from now on
void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps) {
    log->set_timestamps(timestamps);
}
|