cui-llama.rn 1.1.4 → 1.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/cpp/llama.h CHANGED
@@ -344,7 +344,7 @@ extern "C" {
344
344
  bool embeddings; // if true, extract embeddings (together with logits)
345
345
  bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
346
346
  bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
347
- //bool no_perf; // whether to measure performance timings, TODO: implement
347
+ bool no_perf; // whether to measure performance timings
348
348
 
349
349
  // Abort callback
350
350
  // if it returns true, execution of llama_decode() will be aborted
@@ -1057,6 +1057,9 @@ extern "C" {
1057
1057
  LLAMA_API struct llama_sampler * llama_sampler_chain_get(const struct llama_sampler * chain, int32_t i);
1058
1058
  LLAMA_API int llama_sampler_chain_n (const struct llama_sampler * chain);
1059
1059
 
1060
+ // after removing a sampler, the chain will no longer own it, and it will not be freed when the chain is freed
1061
+ LLAMA_API struct llama_sampler * llama_sampler_chain_remove( struct llama_sampler * chain, int32_t i);
1062
+
1060
1063
  // available samplers:
1061
1064
 
1062
1065
  LLAMA_API struct llama_sampler * llama_sampler_init_greedy (void);
@@ -1131,15 +1134,20 @@ extern "C" {
1131
1134
  int32_t n_logit_bias,
1132
1135
  const llama_logit_bias * logit_bias);
1133
1136
 
1134
- // Shorthand for:
1137
+
1138
+ // Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise
1139
+ LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl);
1140
+
1141
+ /// @details Sample and accept a token from the idx-th output of the last evaluation
1135
1142
  //
1143
+ // Shorthand for:
1136
1144
  // const auto * logits = llama_get_logits_ith(ctx, idx);
1137
1145
  // llama_token_data_array cur_p = { ... init from logits ... };
1138
1146
  // llama_sampler_apply(smpl, &cur_p);
1139
- // return cur_p.data[cur_p.selected].id;
1140
- //
1141
- // At this point, this is mostly a convenience function.
1142
- //
1147
+ // auto token = cur_p.data[cur_p.selected].id;
1148
+ // llama_sampler_accept(smpl, token);
1149
+ // return token;
1150
+ // Returns the sampled token
1143
1151
  LLAMA_API llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_context * ctx, int32_t idx);
1144
1152
 
1145
1153
  // TODO: extend in the future
@@ -1172,21 +1180,8 @@ extern "C" {
1172
1180
  // NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements.
1173
1181
  //
1174
1182
 
1175
- enum llama_perf_type {
1176
- LLAMA_PERF_TYPE_CONTEXT = 0,
1177
- LLAMA_PERF_TYPE_SAMPLER_CHAIN = 1,
1178
- };
1179
-
1180
- LLAMA_API void llama_perf_print(const void * ctx, enum llama_perf_type type);
1181
- LLAMA_API void llama_perf_reset( void * ctx, enum llama_perf_type type);
1182
-
1183
- LLAMA_API void llama_perf_dump_yaml(FILE * stream, const struct llama_context * ctx);
1184
-
1185
- // Keeps timings of samplers
1186
- LLAMA_API struct llama_sampler_timings {int64_t t_sample_us; int32_t n_sample;};
1187
- LLAMA_API struct llama_token_timings {
1183
+ struct llama_perf_context_data {
1188
1184
  double t_start_ms;
1189
- double t_end_ms;
1190
1185
  double t_load_ms;
1191
1186
  double t_p_eval_ms;
1192
1187
  double t_eval_ms;
@@ -1194,11 +1189,24 @@ extern "C" {
1194
1189
  int32_t n_p_eval;
1195
1190
  int32_t n_eval;
1196
1191
  };
1197
-
1198
- // helper function for getting timings
1199
- LLAMA_API struct llama_token_timings llama_get_token_timings(const void * v_ctx) ;
1200
- LLAMA_API struct llama_sampler_timings llama_sampler_chain_timings(struct llama_sampler * chain);
1201
- LLAMA_API struct llama_sampler_timings gpt_sampler_get_timigs(const struct gpt_sampler * gsmpl);
1192
+
1193
+ struct llama_perf_sampler_data {
1194
+ double t_sample_ms;
1195
+
1196
+ int32_t n_sample;
1197
+ };
1198
+
1199
+ LLAMA_API struct llama_perf_context_data llama_perf_context (const struct llama_context * ctx);
1200
+ LLAMA_API void llama_perf_context_print(const struct llama_context * ctx);
1201
+ LLAMA_API void llama_perf_context_reset( struct llama_context * ctx);
1202
+
1203
+ // NOTE: the following work only with samplers constructed via llama_sampler_chain_init
1204
+ LLAMA_API struct llama_perf_sampler_data llama_perf_sampler (const struct llama_sampler * chain);
1205
+ LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain);
1206
+ LLAMA_API void llama_perf_sampler_reset( struct llama_sampler * chain);
1207
+
1208
+ LLAMA_API void llama_perf_dump_yaml(FILE * stream, const struct llama_context * ctx);
1209
+
1202
1210
  #ifdef __cplusplus
1203
1211
  }
1204
1212
  #endif
package/cpp/log.cpp ADDED
@@ -0,0 +1,401 @@
1
#include "log.h"

#include <chrono>
#include <condition_variable>
#include <cstdarg>
#include <cstdint>
#include <cstdio>
#include <mutex>
#include <sstream>
#include <thread>
#include <vector>
10
+
11
+ int gpt_log_verbosity_thold = LOG_DEFAULT_LLAMA;
12
+
13
+ void gpt_log_set_verbosity_thold(int verbosity) {
14
+ gpt_log_verbosity_thold = verbosity;
15
+ }
16
+
17
// ANSI escape sequences used when colored output is enabled.
#define LOG_COL_DEFAULT "\033[0m"
#define LOG_COL_BOLD    "\033[1m"
#define LOG_COL_RED     "\033[31m"
#define LOG_COL_GREEN   "\033[32m"
#define LOG_COL_YELLOW  "\033[33m"
#define LOG_COL_BLUE    "\033[34m"
#define LOG_COL_MAGENTA "\033[35m"
#define LOG_COL_CYAN    "\033[36m"
#define LOG_COL_WHITE   "\033[37m"

// Current wall-clock time in microseconds since the Unix epoch.
static int64_t t_us() {
    using namespace std::chrono;
    return duration_cast<microseconds>(system_clock::now().time_since_epoch()).count();
}
30
+
31
// Color slots; each indexes the matching entry of g_col below.
enum gpt_log_col : int {
    GPT_LOG_COL_DEFAULT = 0,
    GPT_LOG_COL_BOLD,
    GPT_LOG_COL_RED,
    GPT_LOG_COL_GREEN,
    GPT_LOG_COL_YELLOW,
    GPT_LOG_COL_BLUE,
    GPT_LOG_COL_MAGENTA,
    GPT_LOG_COL_CYAN,
    GPT_LOG_COL_WHITE,
};

// Active color table — one entry per gpt_log_col slot.
// All entries start as empty strings (colors disabled by default);
// gpt_log::set_colors() swaps in the ANSI escapes.
static std::vector<const char *> g_col = {
    "", "", "", "", "", "", "", "", "",
};
56
+
57
+ struct gpt_log_entry {
58
+ enum lm_ggml_log_level level;
59
+
60
+ bool prefix;
61
+
62
+ int64_t timestamp;
63
+
64
+ std::vector<char> msg;
65
+
66
+ // signals the worker thread to stop
67
+ bool is_end;
68
+
69
+ void print(FILE * file = nullptr) const {
70
+ FILE * fcur = file;
71
+ if (!fcur) {
72
+ // stderr displays DBG messages only when their verbosity level is not higher than the threshold
73
+ // these messages will still be logged to a file
74
+ if (level == LM_GGML_LOG_LEVEL_DEBUG && gpt_log_verbosity_thold < LOG_DEFAULT_DEBUG) {
75
+ return;
76
+ }
77
+
78
+ fcur = stdout;
79
+
80
+ if (level != LM_GGML_LOG_LEVEL_NONE) {
81
+ fcur = stderr;
82
+ }
83
+ }
84
+
85
+ if (level != LM_GGML_LOG_LEVEL_NONE && prefix) {
86
+ if (timestamp) {
87
+ // [M.s.ms.us]
88
+ fprintf(fcur, "%s%d.%02d.%03d.%03d%s ",
89
+ g_col[GPT_LOG_COL_BLUE],
90
+ (int) (timestamp / 1000000 / 60),
91
+ (int) (timestamp / 1000000 % 60),
92
+ (int) (timestamp / 1000 % 1000),
93
+ (int) (timestamp % 1000),
94
+ g_col[GPT_LOG_COL_DEFAULT]);
95
+ }
96
+
97
+ switch (level) {
98
+ case LM_GGML_LOG_LEVEL_INFO: fprintf(fcur, "%sI %s", g_col[GPT_LOG_COL_GREEN], g_col[GPT_LOG_COL_DEFAULT]); break;
99
+ case LM_GGML_LOG_LEVEL_WARN: fprintf(fcur, "%sW %s", g_col[GPT_LOG_COL_MAGENTA], "" ); break;
100
+ case LM_GGML_LOG_LEVEL_ERROR: fprintf(fcur, "%sE %s", g_col[GPT_LOG_COL_RED], "" ); break;
101
+ case LM_GGML_LOG_LEVEL_DEBUG: fprintf(fcur, "%sD %s", g_col[GPT_LOG_COL_YELLOW], "" ); break;
102
+ default:
103
+ break;
104
+ }
105
+ }
106
+
107
+ fprintf(fcur, "%s", msg.data());
108
+
109
+ if (level == LM_GGML_LOG_LEVEL_WARN || level == LM_GGML_LOG_LEVEL_ERROR || level == LM_GGML_LOG_LEVEL_DEBUG) {
110
+ fprintf(fcur, "%s", g_col[GPT_LOG_COL_DEFAULT]);
111
+ }
112
+
113
+ fflush(fcur);
114
+ }
115
+ };
116
+
117
+ struct gpt_log {
118
+ // default capacity - will be expanded if needed
119
+ gpt_log() : gpt_log(256) {}
120
+
121
+ gpt_log(size_t capacity) {
122
+ file = nullptr;
123
+ prefix = false;
124
+ timestamps = false;
125
+ running = false;
126
+ t_start = t_us();
127
+
128
+ // initial message size - will be expanded if longer messages arrive
129
+ entries.resize(capacity);
130
+ for (auto & entry : entries) {
131
+ entry.msg.resize(256);
132
+ }
133
+
134
+ head = 0;
135
+ tail = 0;
136
+
137
+ resume();
138
+ }
139
+
140
+ ~gpt_log() {
141
+ pause();
142
+ if (file) {
143
+ fclose(file);
144
+ }
145
+ }
146
+
147
+ private:
148
+ std::mutex mtx;
149
+ std::thread thrd;
150
+ std::condition_variable cv;
151
+
152
+ FILE * file;
153
+
154
+ bool prefix;
155
+ bool timestamps;
156
+ bool running;
157
+
158
+ int64_t t_start;
159
+
160
+ // ring buffer of entries
161
+ std::vector<gpt_log_entry> entries;
162
+ size_t head;
163
+ size_t tail;
164
+
165
+ // worker thread copies into this
166
+ gpt_log_entry cur;
167
+
168
+ public:
169
+ void add(enum lm_ggml_log_level level, const char * fmt, va_list args) {
170
+ std::lock_guard<std::mutex> lock(mtx);
171
+
172
+ if (!running) {
173
+ // discard messages while the worker thread is paused
174
+ return;
175
+ }
176
+
177
+ auto & entry = entries[tail];
178
+
179
+ {
180
+ // cannot use args twice, so make a copy in case we need to expand the buffer
181
+ va_list args_copy;
182
+ va_copy(args_copy, args);
183
+
184
+ #if 1
185
+ const size_t n = vsnprintf(entry.msg.data(), entry.msg.size(), fmt, args);
186
+ if (n >= entry.msg.size()) {
187
+ entry.msg.resize(n + 1);
188
+ vsnprintf(entry.msg.data(), entry.msg.size(), fmt, args_copy);
189
+ }
190
+ #else
191
+ // hack for bolding arguments
192
+
193
+ std::stringstream ss;
194
+ for (int i = 0; fmt[i] != 0; i++) {
195
+ if (fmt[i] == '%') {
196
+ ss << LOG_COL_BOLD;
197
+ while (fmt[i] != ' ' && fmt[i] != ')' && fmt[i] != ']' && fmt[i] != 0) ss << fmt[i++];
198
+ ss << LOG_COL_DEFAULT;
199
+ if (fmt[i] == 0) break;
200
+ }
201
+ ss << fmt[i];
202
+ }
203
+ const size_t n = vsnprintf(entry.msg.data(), entry.msg.size(), ss.str().c_str(), args);
204
+ if (n >= entry.msg.size()) {
205
+ entry.msg.resize(n + 1);
206
+ vsnprintf(entry.msg.data(), entry.msg.size(), ss.str().c_str(), args_copy);
207
+ }
208
+ #endif
209
+ }
210
+
211
+ entry.level = level;
212
+ entry.prefix = prefix;
213
+ entry.timestamp = 0;
214
+ if (timestamps) {
215
+ entry.timestamp = t_us() - t_start;
216
+ }
217
+ entry.is_end = false;
218
+
219
+ tail = (tail + 1) % entries.size();
220
+ if (tail == head) {
221
+ // expand the buffer
222
+ std::vector<gpt_log_entry> new_entries(2*entries.size());
223
+
224
+ size_t new_tail = 0;
225
+
226
+ do {
227
+ new_entries[new_tail] = std::move(entries[head]);
228
+
229
+ head = (head + 1) % entries.size();
230
+ new_tail = (new_tail + 1);
231
+ } while (head != tail);
232
+
233
+ head = 0;
234
+ tail = new_tail;
235
+
236
+ for (size_t i = tail; i < new_entries.size(); i++) {
237
+ new_entries[i].msg.resize(256);
238
+ }
239
+
240
+ entries = std::move(new_entries);
241
+ }
242
+
243
+ cv.notify_one();
244
+ }
245
+
246
+ void resume() {
247
+ std::lock_guard<std::mutex> lock(mtx);
248
+
249
+ if (running) {
250
+ return;
251
+ }
252
+
253
+ running = true;
254
+
255
+ thrd = std::thread([this]() {
256
+ while (true) {
257
+ {
258
+ std::unique_lock<std::mutex> lock(mtx);
259
+ cv.wait(lock, [this]() { return head != tail; });
260
+
261
+ cur = entries[head];
262
+
263
+ head = (head + 1) % entries.size();
264
+ }
265
+
266
+ if (cur.is_end) {
267
+ break;
268
+ }
269
+
270
+ cur.print(); // stdout and stderr
271
+
272
+ if (file) {
273
+ cur.print(file);
274
+ }
275
+ }
276
+ });
277
+ }
278
+
279
+ void pause() {
280
+ {
281
+ std::lock_guard<std::mutex> lock(mtx);
282
+
283
+ if (!running) {
284
+ return;
285
+ }
286
+
287
+ running = false;
288
+
289
+ // push an entry to signal the worker thread to stop
290
+ {
291
+ auto & entry = entries[tail];
292
+ entry.is_end = true;
293
+
294
+ tail = (tail + 1) % entries.size();
295
+ }
296
+
297
+ cv.notify_one();
298
+ }
299
+
300
+ thrd.join();
301
+ }
302
+
303
+ void set_file(const char * path) {
304
+ pause();
305
+
306
+ if (file) {
307
+ fclose(file);
308
+ }
309
+
310
+ if (path) {
311
+ file = fopen(path, "w");
312
+ } else {
313
+ file = nullptr;
314
+ }
315
+
316
+ resume();
317
+ }
318
+
319
+ void set_colors(bool colors) {
320
+ pause();
321
+
322
+ if (colors) {
323
+ g_col[GPT_LOG_COL_DEFAULT] = LOG_COL_DEFAULT;
324
+ g_col[GPT_LOG_COL_BOLD] = LOG_COL_BOLD;
325
+ g_col[GPT_LOG_COL_RED] = LOG_COL_RED;
326
+ g_col[GPT_LOG_COL_GREEN] = LOG_COL_GREEN;
327
+ g_col[GPT_LOG_COL_YELLOW] = LOG_COL_YELLOW;
328
+ g_col[GPT_LOG_COL_BLUE] = LOG_COL_BLUE;
329
+ g_col[GPT_LOG_COL_MAGENTA] = LOG_COL_MAGENTA;
330
+ g_col[GPT_LOG_COL_CYAN] = LOG_COL_CYAN;
331
+ g_col[GPT_LOG_COL_WHITE] = LOG_COL_WHITE;
332
+ } else {
333
+ for (size_t i = 0; i < g_col.size(); i++) {
334
+ g_col[i] = "";
335
+ }
336
+ }
337
+
338
+ resume();
339
+ }
340
+
341
+ void set_prefix(bool prefix) {
342
+ std::lock_guard<std::mutex> lock(mtx);
343
+
344
+ this->prefix = prefix;
345
+ }
346
+
347
+ void set_timestamps(bool timestamps) {
348
+ std::lock_guard<std::mutex> lock(mtx);
349
+
350
+ this->timestamps = timestamps;
351
+ }
352
+ };
353
+
354
+ //
355
+ // public API
356
+ //
357
+
358
+ struct gpt_log * gpt_log_init() {
359
+ return new gpt_log;
360
+ }
361
+
362
+ struct gpt_log * gpt_log_main() {
363
+ static struct gpt_log log;
364
+
365
+ return &log;
366
+ }
367
+
368
+ void gpt_log_pause(struct gpt_log * log) {
369
+ log->pause();
370
+ }
371
+
372
+ void gpt_log_resume(struct gpt_log * log) {
373
+ log->resume();
374
+ }
375
+
376
+ void gpt_log_free(struct gpt_log * log) {
377
+ delete log;
378
+ }
379
+
380
+ void gpt_log_add(struct gpt_log * log, enum lm_ggml_log_level level, const char * fmt, ...) {
381
+ va_list args;
382
+ va_start(args, fmt);
383
+ log->add(level, fmt, args);
384
+ va_end(args);
385
+ }
386
+
387
+ void gpt_log_set_file(struct gpt_log * log, const char * file) {
388
+ log->set_file(file);
389
+ }
390
+
391
+ void gpt_log_set_colors(struct gpt_log * log, bool colors) {
392
+ log->set_colors(colors);
393
+ }
394
+
395
+ void gpt_log_set_prefix(struct gpt_log * log, bool prefix) {
396
+ log->set_prefix(prefix);
397
+ }
398
+
399
+ void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps) {
400
+ log->set_timestamps(timestamps);
401
+ }