llama-rb 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1815 @@
1
+ #include "llama.h"
2
+
3
+ #include "ggml.h"
4
+
5
+ #include <cinttypes>
6
+ #include <fstream>
7
+ #include <random>
8
+ #include <map>
9
+ #include <unordered_map>
10
+ #include <queue>
11
+ #include <regex>
12
+ #include <cassert>
13
+ #include <cstring>
14
+
15
+ #if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
16
+ #define WIN32_LEAN_AND_MEAN
17
+ #include <Windows.h>
18
+ #else
19
+ #include <sys/types.h>
20
+ #include <sys/mman.h>
21
+ #include <unistd.h>
22
+ #include <fcntl.h>
23
+ #endif
24
+
25
+ #define Min(X, Y) ((Y) > (X) ? (X) : (Y))
26
+ #define Max(X, Y) ((Y) < (X) ? (X) : (Y))
27
+
28
+ #define LLAMA_USE_SCRATCH
29
+ #define LLAMA_MAX_SCRATCH_BUFFERS 16
30
+
31
+ #define LLAMA_ASSERT(x) \
32
+ do { \
33
+ if (!(x)) { \
34
+ fprintf(stderr, "LLAMA_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
35
+ abort(); \
36
+ } \
37
+ } while (0)
38
+
39
+
40
+ // determine number of model parts based on the embedding dimension (n_embd)
41
+ static const std::unordered_map<int, int> LLAMA_N_PARTS = {
42
+ { 4096, 1 },
43
+ { 5120, 2 },
44
+ { 6656, 4 },
45
+ { 8192, 8 },
46
+ };
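// note: the map keys are n_embd values (4096 -> 7B, 5120 -> 13B, 6656 -> 30B, 8192 -> 65B)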
47
+
48
+ // available llama models
49
+ enum e_model {
50
+ MODEL_UNKNOWN,
51
+ MODEL_7B,
52
+ MODEL_13B,
53
+ MODEL_30B,
54
+ MODEL_65B,
55
+ };
56
+
57
+ static const size_t MB = 1024*1024;
58
+
59
+ // computed for n_ctx == 2048
60
+ // TODO: dynamically determine these sizes
61
+ // needs modifications in ggml
62
+
63
+ static const std::map<e_model, size_t> MEM_REQ_SCRATCH0 = {
64
+ { MODEL_7B, 512ull*MB },
65
+ { MODEL_13B, 512ull*MB },
66
+ { MODEL_30B, 512ull*MB },
67
+ { MODEL_65B, 512ull*MB },
68
+ };
69
+
70
+ static const std::map<e_model, size_t> MEM_REQ_SCRATCH1 = {
71
+ { MODEL_7B, 512ull*MB },
72
+ { MODEL_13B, 512ull*MB },
73
+ { MODEL_30B, 512ull*MB },
74
+ { MODEL_65B, 512ull*MB },
75
+ };
76
+
77
+ // 2*n_embd*n_ctx*n_layer*sizeof(float16)
78
+ static const std::map<e_model, size_t> MEM_REQ_KV_SELF = {
79
+ { MODEL_7B, 1026ull*MB },
80
+ { MODEL_13B, 1608ull*MB },
81
+ { MODEL_30B, 3124ull*MB },
82
+ { MODEL_65B, 5120ull*MB },
83
+ };
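// worked example of the formula above for the 7B model (n_embd = 4096, n_ctx = 2048, n_layer = 32):
//   2 * 4096 * 2048 * 32 * 2 bytes = 1'073'741'824 bytes = 1024 MB,
//   plus the small allocation overhead added in kv_cache_init() below, which gives the 1026 MB entry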
84
+
85
+ // this is mostly needed for temporary mul_mat buffers to dequantize the data
86
+ // not actually needed if BLAS is disabled
87
+ static const std::map<e_model, size_t> MEM_REQ_EVAL = {
88
+ { MODEL_7B, 768ull*MB },
89
+ { MODEL_13B, 1024ull*MB },
90
+ { MODEL_30B, 1280ull*MB },
91
+ { MODEL_65B, 1536ull*MB },
92
+ };
93
+
94
+ // default hparams (LLaMA 7B)
95
+ struct llama_hparams {
96
+ int32_t n_vocab = 32000;
97
+ int32_t n_ctx = 512; // overridden by the user-provided context size at load time
98
+ int32_t n_embd = 4096;
99
+ int32_t n_mult = 256;
100
+ int32_t n_head = 32;
101
+ int32_t n_layer = 32;
102
+ int32_t n_rot = 64;
103
+ int32_t f16 = 1;
104
+ };
105
+
106
+ struct llama_layer {
107
+ // normalization
108
+ struct ggml_tensor * attention_norm;
109
+
110
+ // attention
111
+ struct ggml_tensor * wq;
112
+ struct ggml_tensor * wk;
113
+ struct ggml_tensor * wv;
114
+ struct ggml_tensor * wo;
115
+
116
+ // normalization
117
+ struct ggml_tensor * ffn_norm;
118
+
119
+ // ff
120
+ struct ggml_tensor * w1;
121
+ struct ggml_tensor * w2;
122
+ struct ggml_tensor * w3;
123
+ };
124
+
125
+ struct llama_kv_cache {
126
+ struct ggml_tensor * k;
127
+ struct ggml_tensor * v;
128
+
129
+ struct ggml_context * ctx;
130
+
131
+ std::vector<uint8_t> buf;
132
+
133
+ int n; // number of tokens currently in the cache
134
+ };
135
+
136
+ struct llama_model {
137
+ e_model type = MODEL_UNKNOWN;
138
+
139
+ llama_hparams hparams;
140
+
141
+ struct ggml_tensor * tok_embeddings;
142
+
143
+ struct ggml_tensor * norm;
144
+ struct ggml_tensor * output;
145
+
146
+ std::vector<llama_layer> layers;
147
+
148
+ // context
149
+ struct ggml_context * ctx;
150
+
151
+ // key + value cache for the self attention
152
+ // TODO: move to llama_state
153
+ struct llama_kv_cache kv_self;
154
+
155
+ // the model memory buffer
156
+ std::vector<uint8_t> buf;
157
+
158
+ // model memory mapped file
159
+ void * mm_addr = NULL;
160
+ uint64_t mm_length = 0;
161
+
162
+ // tensors
163
+ int n_loaded;
164
+ std::unordered_map<std::string, struct ggml_tensor *> tensors;
165
+ };
166
+
167
+ struct llama_vocab {
168
+ using id = int32_t;
169
+ using token = std::string;
170
+
171
+ struct token_score {
172
+ token tok;
173
+ float score;
174
+ };
175
+
176
+ std::unordered_map<token, id> token_to_id;
177
+ std::vector<token_score> id_to_token;
178
+ };
179
+
180
+ struct llama_context {
181
+ std::mt19937 rng;
182
+
183
+ int64_t t_load_us = 0;
184
+ int64_t t_start_us = 0;
185
+ bool has_evaluated_once = false;
186
+
187
+ int64_t t_sample_us = 0;
188
+ int64_t t_eval_us = 0;
189
+ int64_t t_p_eval_us = 0;
190
+
191
+ int32_t n_sample = 0; // number of tokens sampled
192
+ int32_t n_eval = 0; // number of eval calls
193
+ int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
194
+
195
+ llama_model model;
196
+ llama_vocab vocab;
197
+
198
+ size_t mem_per_token = 0;
199
+
200
+ // decode output (2-dimensional array: [n_tokens][n_vocab])
201
+ std::vector<float> logits;
202
+ bool logits_all = false;
203
+
204
+ // input embedding (1-dimensional array: [n_embd])
205
+ std::vector<float> embedding;
206
+
207
+ // memory buffers used to evaluate the model
208
+ // TODO: move to llama_state
209
+ std::vector<uint8_t> buf_compute;
210
+ std::vector<uint8_t> buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
211
+
212
+ int buf_last = 0;
213
+ size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
214
+
215
+ void use_buf(struct ggml_context * ctx, int i) {
216
+ #if defined(LLAMA_USE_SCRATCH)
217
+ size_t last_size = 0;
218
+
219
+ if (i == -1) {
220
+ last_size = ggml_set_scratch(ctx, { 0, 0, nullptr, });
221
+ } else {
222
+ auto & buf = buf_scratch[i];
223
+ last_size = ggml_set_scratch(ctx, { 0, buf.size(), buf.data(), });
224
+ }
225
+
226
+ if (buf_last >= 0) {
227
+ buf_max_size[buf_last] = Max(buf_max_size[buf_last], last_size);
228
+ }
229
+
230
+ buf_last = i;
231
+ #else
232
+ (void) i;
233
+ (void) ctx;
234
+ #endif
235
+ }
236
+
237
+ size_t get_buf_max_mem(int i) const {
238
+ #if defined(LLAMA_USE_SCRATCH)
239
+ return buf_max_size[i];
240
+ #else
241
+ (void) i;
242
+ return 0;
243
+ #endif
244
+ }
245
+ };
246
+
247
+ //
248
+ // kv cache
249
+ //
250
+
251
+ static bool kv_cache_init(
252
+ const struct llama_hparams & hparams,
253
+ struct llama_kv_cache & cache,
254
+ ggml_type wtype,
255
+ int n_ctx) {
256
+ const int n_embd = hparams.n_embd;
257
+ const int n_layer = hparams.n_layer;
258
+
259
+ const int n_mem = n_layer*n_ctx;
260
+ const int n_elements = n_embd*n_mem;
261
+
262
+ cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
263
+
264
+ struct ggml_init_params params;
265
+ params.mem_size = cache.buf.size();
266
+ params.mem_buffer = cache.buf.data();
267
+ params.no_alloc = false;
268
+
269
+ cache.ctx = ggml_init(params);
270
+
271
+ if (!cache.ctx) {
272
+ fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__);
273
+ return false;
274
+ }
275
+
276
+ cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
277
+ cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
278
+
279
+ return true;
280
+ }
281
+
282
+ static void kv_cache_free(struct llama_kv_cache & cache) {
283
+ if (cache.ctx) {
284
+ ggml_free(cache.ctx);
285
+ cache.ctx = nullptr;
286
+ }
287
+ }
288
+
289
+ struct llama_context_params llama_context_default_params() {
290
+ struct llama_context_params result = {
291
+ /*.n_ctx =*/ 512,
292
+ /*.n_parts =*/ -1,
293
+ /*.seed =*/ 0,
294
+ /*.f16_kv =*/ false,
295
+ /*.logits_all =*/ false,
296
+ /*.vocab_only =*/ false,
297
+ /*.use_mlock =*/ false,
298
+ /*.embedding =*/ false,
299
+ /*.progress_callback =*/ nullptr,
300
+ /*.progress_callback_user_data =*/ nullptr,
301
+ };
302
+
303
+ return result;
304
+ }
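A minimal sketch of how these defaults are typically adjusted before loading a model (illustrative only; it uses the fields initialized above and llama_init_from_file(), which is defined later in this file; the model path is hypothetical):

struct llama_context_params params = llama_context_default_params();
params.n_ctx  = 2048;  // request a larger context window
params.f16_kv = true;  // keep the KV cache in 16-bit floats to halve its size
params.seed   = 42;    // values <= 0 fall back to time(NULL) in llama_init_from_file()
struct llama_context * ctx = llama_init_from_file("models/7B/ggml-model-q4_0.bin", params);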
305
+
306
+ //
307
+ // model loading
308
+ //
309
+
310
+ static void *mmap_file(const char *fname, uint64_t *mm_length) {
311
+ #if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
312
+ HANDLE hFile = CreateFileA(fname,
313
+ GENERIC_READ,
314
+ FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
315
+ NULL,
316
+ OPEN_EXISTING,
317
+ FILE_ATTRIBUTE_NORMAL | FILE_ATTRIBUTE_NOT_CONTENT_INDEXED,
318
+ NULL);
319
+ if (hFile == INVALID_HANDLE_VALUE) return 0;
320
+ LARGE_INTEGER fileSize;
321
+ fileSize.QuadPart = -1;
322
+ GetFileSizeEx(hFile, &fileSize);
323
+ int64_t length = fileSize.QuadPart;
324
+ HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
325
+ CloseHandle(hFile);
326
+ if (!hMapping) return 0;
327
+ void *addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
328
+ CloseHandle(hMapping);
329
+ if (!addr) return 0;
330
+ #else
331
+ int fd = open(fname, O_RDONLY);
332
+ if (fd == -1) return 0;
333
+ int64_t length = lseek(fd, 0, SEEK_END);
334
+ void *addr = mmap(NULL, length, PROT_READ, MAP_SHARED, fd, 0);
335
+ close(fd);
336
+ if (addr == MAP_FAILED) return 0;
337
+ #endif
338
+ *mm_length = length;
339
+ return addr;
340
+ }
341
+
342
+ static void munmap_file(void * addr, size_t length) {
343
+ #if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
344
+ UnmapViewOfFile(addr);
345
+ #else
346
+ munmap(addr, length);
347
+ #endif
348
+ }
349
+
350
+ static bool report_bad_magic(const char *path, uint32_t got, uint32_t want) {
351
+ fprintf(stderr,
352
+ "%s: invalid model file (bad magic [got %#x want %#x])\n"
353
+ "\tyou most likely need to regenerate your ggml files\n"
354
+ "\tthe benefit is you'll get 10-100x faster load times\n"
355
+ "\tsee https://github.com/ggerganov/llama.cpp/issues/91\n"
356
+ "\tuse convert-pth-to-ggml.py to regenerate from original pth\n"
357
+ "\tuse migrate-ggml-2023-03-30-pr613.py if you deleted originals\n",
358
+ path, got, want);
359
+ return false;
360
+ }
361
+
362
+ static bool llama_model_load(
363
+ const std::string & fname,
364
+ llama_context & lctx,
365
+ int n_ctx,
366
+ int n_parts,
367
+ ggml_type memory_type,
368
+ bool vocab_only,
369
+ llama_progress_callback progress_callback,
370
+ void *progress_callback_user_data) {
371
+ fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
372
+
373
+ lctx.t_start_us = ggml_time_us();
374
+
375
+ auto & model = lctx.model;
376
+ auto & vocab = lctx.vocab;
377
+
378
+ auto fin = std::ifstream(fname, std::ios::binary);
379
+ if (!fin) {
380
+ fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
381
+ return false;
382
+ }
383
+
384
+ std::vector<char> f_buf(1024*1024);
385
+ fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
386
+
387
+ fin.seekg(0, fin.end);
388
+ const size_t file_size = fin.tellg();
389
+ fin.seekg(0);
390
+
391
+ // verify magic
392
+ {
393
+ uint32_t magic;
394
+ fin.read((char *) &magic, sizeof(magic));
395
+ if (magic == LLAMA_FILE_MAGIC_UNVERSIONED) {
396
+ fprintf(stderr, "%s: invalid model file '%s' (too old, regenerate your model files or convert them with convert-unversioned-ggml-to-ggml.py!)\n",
397
+ __func__, fname.c_str());
398
+ return false;
399
+ }
400
+ if (magic != LLAMA_FILE_MAGIC) {
401
+ return report_bad_magic(fname.c_str(), magic, LLAMA_FILE_MAGIC);
402
+ }
403
+
404
+ uint32_t format_version;
405
+ fin.read((char *) &format_version, sizeof(format_version));
406
+
407
+ if (format_version != LLAMA_FILE_VERSION) {
408
+ fprintf(stderr, "%s: invalid model file '%s' (unsupported format version %" PRIu32 ", expected %d)\n",
409
+ __func__, fname.c_str(), format_version, LLAMA_FILE_VERSION);
410
+ return false;
411
+ }
412
+ }
413
+
414
+ int n_ff = 0;
415
+
416
+ // load hparams
417
+ {
418
+ auto & hparams = model.hparams;
419
+
420
+ fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
421
+ //fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
422
+ fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
423
+ fin.read((char *) &hparams.n_mult, sizeof(hparams.n_mult));
424
+ fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
425
+ fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
426
+ fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
427
+ fin.read((char *) &hparams.f16, sizeof(hparams.f16));
428
+
429
+ hparams.n_ctx = n_ctx;
430
+
431
+ n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
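// worked example with the default 7B hyperparameters (n_embd = 4096, n_mult = 256):
//   2*(4*4096)/3 = 10922 (integer division), rounded up to the next multiple of 256 -> n_ff = 11008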
432
+
433
+ if (n_parts < 1) {
434
+ n_parts = LLAMA_N_PARTS.at(hparams.n_embd);
435
+ }
436
+
437
+ // temporary warning telling the user to use "--n_parts"
438
+ if (hparams.f16 == 4 && n_parts != 1) {
439
+ fprintf(stderr, "%s: GPTQ model detected - are you sure n_parts should be %d? we normally expect it to be 1\n", __func__, n_parts);
440
+ fprintf(stderr, "%s: use '--n_parts 1' if necessary\n", __func__);
441
+ }
442
+
443
+ if (hparams.n_layer == 32) {
444
+ model.type = e_model::MODEL_7B;
445
+ }
446
+
447
+ if (hparams.n_layer == 40) {
448
+ model.type = e_model::MODEL_13B;
449
+ }
450
+
451
+ if (hparams.n_layer == 60) {
452
+ model.type = e_model::MODEL_30B;
453
+ }
454
+
455
+ if (hparams.n_layer == 80) {
456
+ model.type = e_model::MODEL_65B;
457
+ }
458
+
459
+ fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab);
460
+ fprintf(stderr, "%s: n_ctx = %d\n", __func__, hparams.n_ctx);
461
+ fprintf(stderr, "%s: n_embd = %d\n", __func__, hparams.n_embd);
462
+ fprintf(stderr, "%s: n_mult = %d\n", __func__, hparams.n_mult);
463
+ fprintf(stderr, "%s: n_head = %d\n", __func__, hparams.n_head);
464
+ fprintf(stderr, "%s: n_layer = %d\n", __func__, hparams.n_layer);
465
+ fprintf(stderr, "%s: n_rot = %d\n", __func__, hparams.n_rot);
466
+ fprintf(stderr, "%s: f16 = %d\n", __func__, hparams.f16);
467
+ fprintf(stderr, "%s: n_ff = %d\n", __func__, n_ff);
468
+ fprintf(stderr, "%s: n_parts = %d\n", __func__, n_parts);
469
+ fprintf(stderr, "%s: type = %d\n", __func__, model.type);
470
+ }
471
+
472
+ // load vocab
473
+ {
474
+ std::string word;
475
+ vocab.id_to_token.resize(model.hparams.n_vocab);
476
+ std::vector<char> tmp(64);
477
+
478
+ for (int i = 0; i < model.hparams.n_vocab; i++) {
479
+ uint32_t len;
480
+ fin.read((char *) &len, sizeof(len));
481
+
482
+ word.resize(len);
483
+ if (len > 0) {
484
+ tmp.resize(len);
485
+ fin.read(tmp.data(), len);
486
+ word.assign(tmp.data(), len);
487
+ } else {
488
+ word.clear();
489
+ }
490
+
491
+ float score;
492
+ fin.read((char *) &score, sizeof(score));
493
+
494
+ vocab.token_to_id[word] = i;
495
+
496
+ auto &tok_score = vocab.id_to_token[i];
497
+ tok_score.tok = word;
498
+ tok_score.score = score;
499
+ }
500
+ }
501
+
502
+ if (vocab_only) {
503
+ return true;
504
+ }
505
+
506
+ // for the big tensors, we have the option to store the data in 16-bit floats or quantized form
507
+ // in order to save memory and also to speed up the computation
508
+ // wtype is for per-layer weights, while vtype is for other weights
509
+ ggml_type wtype, vtype;
510
+ switch (model.hparams.f16) {
511
+ case 0: wtype = vtype = GGML_TYPE_F32; break;
512
+ case 1: wtype = vtype = GGML_TYPE_F16; break;
513
+ case 2: wtype = vtype = GGML_TYPE_Q4_0; break;
514
+ case 3: wtype = vtype = GGML_TYPE_Q4_1; break;
515
+ case 4: wtype = GGML_TYPE_Q4_1; vtype = GGML_TYPE_F16; break;
516
+ default:
517
+ {
518
+ fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n",
519
+ __func__, fname.c_str(), model.hparams.f16);
520
+ return false;
521
+ }
522
+ }
523
+
524
+ // map model into memory
525
+ char *mm_addr = NULL;
526
+ model.mm_addr = mmap_file(fname.c_str(), &model.mm_length);
527
+ if (model.mm_addr == NULL) {
528
+ fprintf(stderr, "%s: failed to mmap '%s'\n", __func__, fname.c_str());
529
+ return false;
530
+ }
531
+ mm_addr = (char *)model.mm_addr;
532
+ fprintf(stderr, "%s: ggml map size = %6.2f MB\n", __func__, model.mm_length/(1024.0*1024.0));
533
+
534
+ auto & ctx = model.ctx;
535
+
536
+ size_t ctx_size = 0;
537
+ {
538
+ const auto &hparams = model.hparams;
539
+ const int n_layer = hparams.n_layer;
540
+ ctx_size += (5 + 10*n_layer)*256; // object overhead
541
+ fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0);
542
+ }
543
+
544
+ // print memory requirements
545
+ {
546
+ const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
547
+
548
+ // this is the total memory required to run the inference
549
+ const size_t mem_required =
550
+ ctx_size +
551
+ model.mm_length +
552
+ MEM_REQ_SCRATCH0.at(model.type) +
553
+ MEM_REQ_SCRATCH1.at(model.type) +
554
+ MEM_REQ_EVAL.at (model.type);
555
+
556
+ // this is the memory required by one llama_state
557
+ const size_t mem_required_state =
558
+ scale*MEM_REQ_KV_SELF.at(model.type);
559
+
560
+ fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
561
+ mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
562
+ }
563
+
564
+ // create the ggml context
565
+ {
566
+ lctx.model.buf.resize(ctx_size);
567
+
568
+ struct ggml_init_params params = {
569
+ /*.mem_size =*/ lctx.model.buf.size(),
570
+ /*.mem_buffer =*/ lctx.model.buf.data(),
571
+ /*.no_alloc =*/ true,
572
+ };
573
+
574
+ model.ctx = ggml_init(params);
575
+ if (!model.ctx) {
576
+ fprintf(stderr, "%s: ggml_init() failed\n", __func__);
577
+ return false;
578
+ }
579
+ }
580
+
581
+ // prepare memory for the weights
582
+ {
583
+ const auto & hparams = model.hparams;
584
+
585
+ const int n_embd = hparams.n_embd;
586
+ const int n_layer = hparams.n_layer;
587
+ const int n_vocab = hparams.n_vocab;
588
+
589
+ model.layers.resize(n_layer);
590
+
591
+ model.tok_embeddings = ggml_new_tensor_2d(ctx, vtype, n_embd, n_vocab);
592
+
593
+ model.norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
594
+ model.output = ggml_new_tensor_2d(ctx, vtype, n_embd, n_vocab);
595
+
596
+ // map by name
597
+ model.tensors["tok_embeddings.weight"] = model.tok_embeddings;
598
+
599
+ model.tensors["norm.weight"] = model.norm;
600
+ model.tensors["output.weight"] = model.output;
601
+
602
+ for (int i = 0; i < n_layer; ++i) {
603
+ auto & layer = model.layers[i];
604
+
605
+ layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
606
+
607
+ layer.wq = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
608
+ layer.wk = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
609
+ layer.wv = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
610
+ layer.wo = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
611
+
612
+ layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
613
+
614
+ layer.w1 = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ff);
615
+ layer.w2 = ggml_new_tensor_2d(ctx, wtype, n_ff, n_embd);
616
+ layer.w3 = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ff);
617
+
618
+ // map by name
619
+ model.tensors["layers." + std::to_string(i) + ".attention_norm.weight"] = layer.attention_norm;
620
+
621
+ model.tensors["layers." + std::to_string(i) + ".attention.wq.weight"] = layer.wq;
622
+ model.tensors["layers." + std::to_string(i) + ".attention.wk.weight"] = layer.wk;
623
+ model.tensors["layers." + std::to_string(i) + ".attention.wv.weight"] = layer.wv;
624
+ model.tensors["layers." + std::to_string(i) + ".attention.wo.weight"] = layer.wo;
625
+
626
+ model.tensors["layers." + std::to_string(i) + ".ffn_norm.weight"] = layer.ffn_norm;
627
+
628
+ model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.weight"] = layer.w1;
629
+ model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight"] = layer.w2;
630
+ model.tensors["layers." + std::to_string(i) + ".feed_forward.w3.weight"] = layer.w3;
631
+ }
632
+ }
633
+
634
+ std::vector<uint8_t> tmp;
635
+
636
+ if (progress_callback) {
637
+ progress_callback(0.0, progress_callback_user_data);
638
+ }
639
+
640
+ fprintf(stderr, "%s: loading tensors from '%s'\n", __func__, fname.c_str());
641
+
642
+ // load weights
643
+ {
644
+ size_t total_size = 0;
645
+ model.n_loaded = 0;
646
+
647
+ while (true) {
648
+ int32_t n_dims;
649
+ int32_t length;
650
+ int32_t ftype;
651
+
652
+ fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
653
+ fin.read(reinterpret_cast<char *>(&length), sizeof(length));
654
+ fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
655
+
656
+ if (fin.eof()) {
657
+ break;
658
+ }
659
+
660
+ int32_t nelements = 1;
661
+ int32_t ne[2] = { 1, 1 };
662
+ for (int i = 0; i < n_dims; ++i) {
663
+ fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
664
+ nelements *= ne[i];
665
+ }
666
+
667
+ std::string name(length, 0);
668
+ fin.read(&name[0], length);
669
+
670
+ if (model.tensors.find(name.data()) == model.tensors.end()) {
671
+ fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
672
+ return false;
673
+ }
674
+
675
+ auto tensor = model.tensors[name.data()];
676
+
677
+ if (ggml_nelements(tensor) != nelements) {
678
+ fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
679
+ return false;
680
+ }
681
+ if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
682
+ fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
683
+ __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
684
+ return false;
685
+ }
686
+ if (0) {
687
+ static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
688
+ fprintf(stderr, "%24s - [%5d, %5d], type = %6s\n", name.data(), ne[0], ne[1], ftype_str[ftype]);
689
+ }
690
+
691
+ switch (ftype) {
692
+ case 0: // f32
693
+ case 1: // f16
694
+ break;
695
+ case 2: // q4_0
696
+ case 3: // q4_1
697
+ assert(ne[0] % 64 == 0);
698
+ break;
699
+ default:
700
+ fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
701
+ return false;
702
+ };
703
+
704
+ // point the tensor data at the memory-mapped file instead of copying or reading it
705
+ size_t offset = fin.tellg();
706
+ size_t tensor_data_size = ggml_nbytes(tensor);
707
+ offset = (offset + 31) & -32;
708
+ tensor->data = mm_addr + offset;
709
+ fin.seekg(offset + tensor_data_size);
710
+ total_size += tensor_data_size;
711
+ model.n_loaded++;
712
+
713
+ // progress
714
+ if (progress_callback) {
715
+ double current_progress = size_t(fin.tellg()) / double(file_size);
716
+ progress_callback(current_progress, progress_callback_user_data);
717
+ }
718
+ }
719
+
720
+ fin.close();
721
+
722
+ fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, model.n_loaded);
723
+ if (model.n_loaded == 0) {
724
+ fprintf(stderr, "%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__);
725
+ } else if (model.n_loaded != (int) model.tensors.size()) {
726
+ fprintf(stderr, "%s: ERROR not all tensors loaded from model file - expected %zu, got %d\n", __func__, model.tensors.size(), model.n_loaded);
727
+ return false;
728
+ }
729
+ }
730
+
731
+ // loading time will be recalculated after the first eval, so
732
+ // we take page faults deferred by mmap() into consideration
733
+ lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
734
+
735
+ if (progress_callback) {
736
+ progress_callback(1.0, progress_callback_user_data);
737
+ }
738
+
739
+ return true;
740
+ }
741
+
742
+ // evaluate the transformer
743
+ //
744
+ // - lctx: llama context
745
+ // - tokens: new batch of tokens to process
746
+ // - n_past: the context size so far
747
+ // - n_threads: number of threads to use
748
+ //
749
+ static bool llama_eval_internal(
750
+ llama_context & lctx,
751
+ const llama_token * tokens,
752
+ const int n_tokens,
753
+ const int n_past,
754
+ const int n_threads) {
755
+ const int64_t t_start_us = ggml_time_us();
756
+
757
+ const int N = n_tokens;
758
+
759
+ const auto & model = lctx.model;
760
+ const auto & hparams = model.hparams;
761
+
762
+ auto & kv_self = model.kv_self;
763
+
764
+ LLAMA_ASSERT(!!kv_self.ctx);
765
+
766
+ const int n_embd = hparams.n_embd;
767
+ const int n_layer = hparams.n_layer;
768
+ const int n_ctx = hparams.n_ctx;
769
+ const int n_head = hparams.n_head;
770
+ const int n_vocab = hparams.n_vocab;
771
+ const int n_rot = hparams.n_embd/hparams.n_head;
772
+
773
+ auto & mem_per_token = lctx.mem_per_token;
774
+ auto & buf_compute = lctx.buf_compute;
775
+
776
+ struct ggml_init_params params = {
777
+ /*.mem_size =*/ buf_compute.size(),
778
+ /*.mem_buffer =*/ buf_compute.data(),
779
+ /*.no_alloc =*/ false,
780
+ };
781
+
782
+ struct ggml_context * ctx0 = ggml_init(params);
783
+
784
+ // for big prompts, if BLAS is enabled, it is better to use only one thread
785
+ // otherwise, the threads spin-lock waiting for the BLAS calls and degrade the performance
786
+ ggml_cgraph gf = {};
787
+ gf.n_threads = N >= 32 && ggml_cpu_has_blas() ? 1 : n_threads;
788
+
789
+ struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
790
+ memcpy(embd->data, tokens, N*ggml_element_size(embd));
791
+
792
+ struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
793
+
794
+ for (int il = 0; il < n_layer; ++il) {
795
+ struct ggml_tensor * inpSA = inpL;
796
+
797
+ struct ggml_tensor * cur;
798
+
799
+ lctx.use_buf(ctx0, 0);
800
+
801
+ // norm
802
+ {
803
+ cur = ggml_rms_norm(ctx0, inpL);
804
+
805
+ // cur = attention_norm*cur
806
+ cur = ggml_mul(ctx0,
807
+ ggml_repeat(ctx0, model.layers[il].attention_norm, cur),
808
+ cur);
809
+ }
810
+
811
+ // self-attention
812
+ {
813
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
814
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
815
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
816
+
817
+ // store key and value to memory
818
+ if (N >= 1) {
819
+ struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
820
+ struct ggml_tensor * v = ggml_view_1d(ctx0, kv_self.v, N*n_embd, (ggml_element_size(kv_self.v)*n_embd)*(il*n_ctx + n_past));
821
+
822
+ ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
823
+ ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
824
+ }
825
+
826
+ // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
827
+ struct ggml_tensor * Q =
828
+ ggml_permute(ctx0,
829
+ ggml_rope(ctx0,
830
+ ggml_cpy(ctx0,
831
+ Qcur,
832
+ ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
833
+ n_past, n_rot, 0),
834
+ 0, 2, 1, 3);
835
+
836
+ // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
837
+ struct ggml_tensor * K =
838
+ ggml_permute(ctx0,
839
+ ggml_rope(ctx0,
840
+ ggml_reshape_3d(ctx0,
841
+ ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
842
+ n_embd/n_head, n_head, n_past + N),
843
+ n_past, n_rot, 1),
844
+ 0, 2, 1, 3);
845
+
846
+ // K * Q
847
+ struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
848
+
849
+ // KQ_scaled = KQ / sqrt(n_embd/n_head)
850
+ struct ggml_tensor * KQ_scaled =
851
+ ggml_scale(ctx0,
852
+ KQ,
853
+ ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));
854
+
855
+ // KQ_masked = mask_past(KQ_scaled)
856
+ struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
857
+
858
+ // KQ = soft_max(KQ_masked)
859
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
860
+
861
+ // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
862
+ struct ggml_tensor * V_trans =
863
+ ggml_cpy(ctx0,
864
+ ggml_permute(ctx0,
865
+ ggml_reshape_3d(ctx0,
866
+ ggml_view_1d(ctx0, kv_self.v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.v)*n_embd),
867
+ n_embd/n_head, n_head, n_past + N),
868
+ 1, 2, 0, 3),
869
+ ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd/n_head, n_head));
870
+
871
+ // KQV = transpose(V) * KQ_soft_max
872
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
873
+
874
+ // KQV_merged = KQV.permute(0, 2, 1, 3)
875
+ struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
876
+
877
+ // cur = KQV_merged.contiguous().view(n_embd, N)
878
+ cur = ggml_cpy(ctx0,
879
+ KQV_merged,
880
+ ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
881
+
882
+ // projection (no bias)
883
+ cur = ggml_mul_mat(ctx0,
884
+ model.layers[il].wo,
885
+ cur);
886
+ }
887
+
888
+ lctx.use_buf(ctx0, 1);
889
+
890
+ struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
891
+
892
+ // feed-forward network
893
+ {
894
+ // norm
895
+ {
896
+ cur = ggml_rms_norm(ctx0, inpFF);
897
+
898
+ // cur = ffn_norm*cur
899
+ cur = ggml_mul(ctx0,
900
+ ggml_repeat(ctx0, model.layers[il].ffn_norm, cur),
901
+ cur);
902
+ }
903
+
904
+ struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
905
+ model.layers[il].w3,
906
+ cur);
907
+
908
+ cur = ggml_mul_mat(ctx0,
909
+ model.layers[il].w1,
910
+ cur);
911
+
912
+ // SILU activation
913
+ cur = ggml_silu(ctx0, cur);
914
+
915
+ cur = ggml_mul(ctx0, cur, tmp);
916
+
917
+ cur = ggml_mul_mat(ctx0,
918
+ model.layers[il].w2,
919
+ cur);
920
+ }
921
+
922
+ cur = ggml_add(ctx0, cur, inpFF);
923
+
924
+ // input for next layer
925
+ inpL = cur;
926
+ }
927
+
928
+ lctx.use_buf(ctx0, 0);
929
+
930
+ // used at the end to optionally extract the embeddings
931
+ struct ggml_tensor * embeddings = NULL;
932
+
933
+ // norm
934
+ {
935
+
936
+ inpL = ggml_rms_norm(ctx0, inpL);
937
+
938
+ // inpL = norm*inpL
939
+ inpL = ggml_mul(ctx0,
940
+ ggml_repeat(ctx0, model.norm, inpL),
941
+ inpL);
942
+
943
+ embeddings = inpL;
944
+ }
945
+
946
+ // lm_head
947
+ inpL = ggml_mul_mat(ctx0, model.output, inpL);
948
+
949
+ lctx.use_buf(ctx0, -1);
950
+
951
+ // logits -> probs
952
+ //inpL = ggml_soft_max(ctx0, inpL);
953
+
954
+ // run the computation
955
+ ggml_build_forward_expand(&gf, inpL);
956
+ ggml_graph_compute (ctx0, &gf);
957
+
958
+ //if (n_past%100 == 0) {
959
+ // ggml_graph_print (&gf);
960
+ // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
961
+ //}
962
+
963
+ //embd_w.resize(n_vocab*N);
964
+ //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
965
+
966
+ // extract logits
967
+ {
968
+ auto & logits_out = lctx.logits;
969
+
970
+ if (lctx.logits_all) {
971
+ logits_out.resize(n_vocab * N);
972
+ memcpy(logits_out.data(), (float *) ggml_get_data(inpL), sizeof(float)*n_vocab*N);
973
+ } else {
974
+ // return result for just the last token
975
+ logits_out.resize(n_vocab);
976
+ memcpy(logits_out.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
977
+ }
978
+ }
979
+
980
+ // extract embeddings
981
+ if (lctx.embedding.size()) {
982
+ auto & embedding_out = lctx.embedding;
983
+
984
+ embedding_out.resize(n_embd);
985
+ memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd);
986
+ }
987
+
988
+ if (mem_per_token == 0) {
989
+ mem_per_token = ggml_used_mem(ctx0)/N;
990
+ }
991
+
992
+ #if 0
993
+ printf("\n%s: used_mem = %.3f MB, scratch -- %.3f MB %.3f MB\n", __func__,
994
+ ggml_used_mem(ctx0)/1024.0/1024.0,
995
+ lctx.get_buf_max_mem(0)/1024.0/1024.0,
996
+ lctx.get_buf_max_mem(1)/1024.0/1024.0);
997
+ #endif
998
+
999
+ ggml_free(ctx0);
1000
+
1001
+ // measure the performance only for the single-token evals
1002
+ if (N == 1) {
1003
+ lctx.t_eval_us += ggml_time_us() - t_start_us;
1004
+ lctx.n_eval++;
1005
+ }
1006
+ else if (N > 1) {
1007
+ lctx.t_p_eval_us += ggml_time_us() - t_start_us;
1008
+ lctx.n_p_eval += N;
1009
+ }
1010
+
1011
+ return true;
1012
+ }
1013
+
1014
+ //
1015
+ // tokenizer
1016
+ //
1017
+
1018
+ static size_t utf8_len(char src) {
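// the lookup table below maps the high nibble of the lead byte to the sequence length:
//   0x0-0xB (ASCII and continuation bytes) -> 1, 0xC-0xD (110xxxxx) -> 2,
//   0xE (1110xxxx) -> 3, 0xF (11110xxx) -> 4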
1019
+ const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
1020
+ uint8_t highbits = static_cast<uint8_t>(src) >> 4;
1021
+ return lookup[highbits];
1022
+ }
1023
+
1024
+ struct llama_sp_symbol {
1025
+ using index = int;
1026
+ index prev;
1027
+ index next;
1028
+ const char * text;
1029
+ size_t n;
1030
+ };
1031
+
1032
+ struct llama_sp_bigram {
1033
+ struct comparator {
1034
+ bool operator()(llama_sp_bigram & l, llama_sp_bigram & r) {
1035
+ return (l.score < r.score) || (l.score == r.score && l.left > r.left);
1036
+ }
1037
+ };
1038
+ using queue_storage = std::vector<llama_sp_bigram>;
1039
+ using queue = std::priority_queue<llama_sp_bigram, queue_storage, comparator>;
1040
+ llama_sp_symbol::index left;
1041
+ llama_sp_symbol::index right;
1042
+ float score;
1043
+ size_t size;
1044
+ };
1045
+
1046
+ // original implementation:
1047
+ // https://github.com/ggerganov/llama.cpp/commit/074bea2eb1f1349a0118239c4152914aecaa1be4
1048
+ struct llama_tokenizer {
1049
+ llama_tokenizer(const llama_vocab & vocab): vocab_(vocab) {}
1050
+
1051
+ void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
1052
+ // split string into utf8 chars
1053
+ int index = 0;
1054
+ size_t offs = 0;
1055
+ while (offs < text.size()) {
1056
+ llama_sp_symbol sym;
1057
+ size_t char_len = Min(text.size() - offs, utf8_len(text[offs]));
1058
+ sym.text = text.c_str() + offs;
1059
+ sym.n = char_len;
1060
+ offs += char_len;
1061
+ sym.prev = index - 1;
1062
+ sym.next = offs == text.size() ? -1 : index + 1;
1063
+ index++;
1064
+ symbols_.emplace_back(std::move(sym));
1065
+ }
1066
+
1067
+ // seed the work queue with all possible 2-character tokens.
1068
+ for (size_t i = 1; i < symbols_.size(); ++i) {
1069
+ try_add_bigram(i - 1, i);
1070
+ }
1071
+
1072
+ // keep substituting the highest frequency pairs for as long as we can.
1073
+ while (!work_queue_.empty()) {
1074
+ auto bigram = work_queue_.top();
1075
+ work_queue_.pop();
1076
+
1077
+ auto & left_sym = symbols_[bigram.left];
1078
+ auto & right_sym = symbols_[bigram.right];
1079
+
1080
+ // if one of the symbols already got merged, skip it.
1081
+ if (left_sym.n == 0 || right_sym.n == 0 ||
1082
+ left_sym.n + right_sym.n != bigram.size) {
1083
+ continue;
1084
+ }
1085
+
1086
+ // merge the right sym into the left one
1087
+ left_sym.n += right_sym.n;
1088
+ right_sym.n = 0;
1089
+
1090
+ //printf("left = '%*s' size = %zu\n", (int) left_sym.n, left_sym.text, bigram.size);
1091
+
1092
+ // remove the right sym from the chain
1093
+ left_sym.next = right_sym.next;
1094
+ if (right_sym.next >= 0) {
1095
+ symbols_[right_sym.next].prev = bigram.left;
1096
+ }
1097
+
1098
+ // find more substitutions
1099
+ try_add_bigram(left_sym.prev, bigram.left);
1100
+ try_add_bigram(bigram.left, left_sym.next);
1101
+ }
1102
+
1103
+ for (int i = 0; i != -1; i = symbols_[i].next) {
1104
+ auto & symbol = symbols_[i];
1105
+ auto token = vocab_.token_to_id.find(std::string(symbol.text, symbol.n));
1106
+
1107
+ if (token == vocab_.token_to_id.end()) {
1108
+ // output any symbols that did not form tokens as bytes.
1109
+ for (int j = 0; j < (int) symbol.n; ++j) {
1110
+ llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
1111
+ output.push_back(token_id);
1112
+ }
1113
+ } else {
1114
+ output.push_back((*token).second);
1115
+ }
1116
+ }
1117
+ }
1118
+
1119
+ private:
1120
+ void try_add_bigram(int left, int right) {
1121
+ if (left == -1 || right == -1) {
1122
+ return;
1123
+ }
1124
+
1125
+ const std::string text = std::string(symbols_[left].text, symbols_[left].n + symbols_[right].n);
1126
+ auto token = vocab_.token_to_id.find(text);
1127
+
1128
+ if (token == vocab_.token_to_id.end()) {
1129
+ return;
1130
+ }
1131
+
1132
+ if (static_cast<size_t>((*token).second) >= vocab_.id_to_token.size()) {
1133
+ return;
1134
+ }
1135
+
1136
+ const auto &tok_score = vocab_.id_to_token[(*token).second];
1137
+
1138
+ llama_sp_bigram bigram;
1139
+ bigram.left = left;
1140
+ bigram.right = right;
1141
+ bigram.score = tok_score.score;
1142
+ bigram.size = text.size();
1143
+ work_queue_.push(bigram);
1144
+ }
1145
+
1146
+ const llama_vocab & vocab_;
1147
+ std::vector<llama_sp_symbol> symbols_;
1148
+ llama_sp_bigram::queue work_queue_;
1149
+ };
1150
+
1151
+ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, const std::string & text, bool bos) {
1152
+ llama_tokenizer tokenizer(vocab);
1153
+ std::vector<llama_vocab::id> output;
1154
+
1155
+ if (text.size() == 0) {
1156
+ return output;
1157
+ }
1158
+
1159
+ if (bos) {
1160
+ output.push_back(1);
1161
+ }
1162
+
1163
+ tokenizer.tokenize(text, output);
1164
+ return output;
1165
+ }
1166
+
1167
+ //
1168
+ // sampling
1169
+ //
1170
+
1171
+ static void sample_top_k(std::vector<std::pair<float, llama_vocab::id>> & logits_id, int top_k) {
1172
+ // find the top k tokens
1173
+ std::partial_sort(
1174
+ logits_id.begin(),
1175
+ logits_id.begin() + top_k, logits_id.end(),
1176
+ [](const std::pair<float, llama_vocab::id> & a, const std::pair<float, llama_vocab::id> & b) {
1177
+ return a.first > b.first;
1178
+ });
1179
+
1180
+ logits_id.resize(top_k);
1181
+ }
1182
+
1183
+ static llama_vocab::id llama_sample_top_p_top_k(
1184
+ llama_context & lctx,
1185
+ const std::vector<llama_vocab::id> & last_n_tokens,
1186
+ int top_k,
1187
+ float top_p,
1188
+ float temp,
1189
+ float repeat_penalty) {
1190
+ auto & rng = lctx.rng;
1191
+
1192
+ const int n_logits = lctx.model.hparams.n_vocab;
1193
+
1194
+ const auto & logits = lctx.logits;
1195
+ const auto * plogits = logits.data() + logits.size() - n_logits;
1196
+
1197
+ std::vector<std::pair<float, llama_vocab::id>> logits_id;
1198
+ logits_id.reserve(n_logits);
1199
+
1200
+ {
1201
+ const float scale = 1.0f/temp;
1202
+ for (int i = 0; i < n_logits; ++i) {
1203
+ // repetition penalty from ctrl paper (https://arxiv.org/abs/1909.05858)
1204
+ // credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main
1205
+ if (std::find(last_n_tokens.begin(), last_n_tokens.end(), i) != last_n_tokens.end()) {
1206
+ // if score < 0 then the repetition penalty has to be multiplied to reduce the previous token probability
1207
+ if (plogits[i] < 0.0f) {
1208
+ logits_id.push_back(std::make_pair(plogits[i]*scale*repeat_penalty, i));
1209
+ } else {
1210
+ logits_id.push_back(std::make_pair(plogits[i]*scale/repeat_penalty, i));
1211
+ }
1212
+ } else {
1213
+ logits_id.push_back(std::make_pair(plogits[i]*scale, i));
1214
+ }
1215
+ }
1216
+ }
1217
+
1218
+ sample_top_k(logits_id, top_k);
1219
+
1220
+ float maxl = -std::numeric_limits<float>::infinity();
1221
+ for (const auto & kv : logits_id) {
1222
+ maxl = Max(maxl, kv.first);
1223
+ }
1224
+
1225
+ // compute probs for the top k tokens
1226
+ std::vector<float> probs;
1227
+ probs.reserve(logits_id.size());
1228
+
1229
+ double sum = 0.0;
1230
+ for (const auto & kv : logits_id) {
1231
+ const float p = expf(kv.first - maxl);
1232
+ probs.push_back(p);
1233
+ sum += p;
1234
+ }
1235
+
1236
+ // normalize the probs
1237
+ for (auto & p : probs) {
1238
+ p /= sum;
1239
+ }
1240
+
1241
+ if (top_p < 1.0) {
1242
+ double cumsum = 0.0;
1243
+ for (int i = 0; i < (int) probs.size(); i++) {
1244
+ cumsum += probs[i];
1245
+ if (cumsum >= top_p) {
1246
+ probs.resize(i + 1);
1247
+ logits_id.resize(i + 1);
1248
+ break;
1249
+ }
1250
+ }
1251
+
1252
+ cumsum = 1.0/cumsum;
1253
+ for (int i = 0; i < (int) probs.size(); i++) {
1254
+ probs[i] *= cumsum;
1255
+ }
1256
+ }
1257
+
1258
+ //printf("\n");
1259
+ //for (int i = 0; i < (int) 10; i++) {
1260
+ // printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]);
1261
+ //}
1262
+ //printf("\n\n");
1263
+ //exit(0);
1264
+
1265
+ std::discrete_distribution<> dist(probs.begin(), probs.end());
1266
+ int idx = dist(rng);
1267
+
1268
+ return logits_id[idx].second;
1269
+ }
1270
+
1271
+ //
1272
+ // quantization
1273
+ //
1274
+
1275
+ // TODO: reuse code from the llama_model_load() somehow
1276
+ static bool llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, int itype) {
1277
+ ggml_type type = GGML_TYPE_Q4_1;
1278
+
1279
+ switch (itype) {
1280
+ case 2: type = GGML_TYPE_Q4_0; break;
1281
+ case 3: type = GGML_TYPE_Q4_1; break;
1282
+ default: fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype); return false;
1283
+ };
1284
+
1285
+ if (type != GGML_TYPE_Q4_0 && type != GGML_TYPE_Q4_1) {
1286
+ fprintf(stderr, "%s: invalid quantization type %d\n", __func__, type);
1287
+ return false;
1288
+ }
1289
+
1290
+ llama_vocab vocab;
1291
+
1292
+ printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
1293
+
1294
+ auto finp = std::ifstream(fname_inp, std::ios::binary);
1295
+ if (!finp) {
1296
+ fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__, fname_inp.c_str());
1297
+ return false;
1298
+ }
1299
+
1300
+ auto fout = std::ofstream(fname_out, std::ios::binary);
1301
+ if (!fout) {
1302
+ fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname_out.c_str());
1303
+ return false;
1304
+ }
1305
+
1306
+ // verify magic
1307
+ {
1308
+ uint32_t magic;
1309
+ finp.read((char *) &magic, sizeof(magic));
1310
+ if (magic == LLAMA_FILE_MAGIC_UNVERSIONED) {
1311
+ fprintf(stderr, "%s: invalid model file '%s' (too old, regenerate your model files!)\n",
1312
+ __func__, fname_inp.c_str());
1313
+ return false;
1314
+ }
1315
+ if (magic != LLAMA_FILE_MAGIC) {
1316
+ return report_bad_magic(fname_inp.c_str(), magic, LLAMA_FILE_MAGIC);
1317
+ }
1318
+
1319
+ fout.write((char *) &magic, sizeof(magic));
1320
+
1321
+ uint32_t format_version;
1322
+ finp.read((char *) &format_version, sizeof(format_version));
1323
+
1324
+ if (format_version != LLAMA_FILE_VERSION) {
1325
+ fprintf(stderr, "%s: invalid model file '%s' (unsupported format version %" PRIu32 ", expected %d)\n",
1326
+ __func__, fname_inp.c_str(), format_version, LLAMA_FILE_VERSION);
1327
+ return false;
1328
+ }
1329
+
1330
+ fout.write((char *) &format_version, sizeof(format_version));
1331
+ }
1332
+
1333
+ llama_hparams hparams;
1334
+
1335
+ // load hparams
1336
+ {
1337
+ finp.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
1338
+ //finp.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
1339
+ finp.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
1340
+ finp.read((char *) &hparams.n_mult, sizeof(hparams.n_mult));
1341
+ finp.read((char *) &hparams.n_head, sizeof(hparams.n_head));
1342
+ finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
1343
+ finp.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
1344
+ finp.read((char *) &hparams.f16, sizeof(hparams.f16));
1345
+
1346
+ printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
1347
+ printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
1348
+ printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
1349
+ printf("%s: n_mult = %d\n", __func__, hparams.n_mult);
1350
+ printf("%s: n_head = %d\n", __func__, hparams.n_head);
1351
+ printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
1352
+ printf("%s: f16 = %d\n", __func__, hparams.f16);
1353
+
1354
+ fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
1355
+ //fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
1356
+ fout.write((char *) &hparams.n_embd, sizeof(hparams.n_embd));
1357
+ fout.write((char *) &hparams.n_mult, sizeof(hparams.n_mult));
1358
+ fout.write((char *) &hparams.n_head, sizeof(hparams.n_head));
1359
+ fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
1360
+ fout.write((char *) &hparams.n_rot, sizeof(hparams.n_rot));
1361
+ fout.write((char *) &itype, sizeof(hparams.f16));
1362
+ }
1363
+
1364
+ // load vocab
1365
+ {
1366
+ const int32_t n_vocab = hparams.n_vocab;
1367
+
1368
+ if (n_vocab != hparams.n_vocab) {
1369
+ fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
1370
+ __func__, fname_inp.c_str(), n_vocab, hparams.n_vocab);
1371
+ return false;
1372
+ }
1373
+
1374
+ std::vector<char> word(32);
1375
+ vocab.id_to_token.resize(n_vocab);
1376
+ for (int i = 0; i < n_vocab; i++) {
1377
+ uint32_t len;
1378
+ finp.read ((char *) &len, sizeof(len));
1379
+ fout.write((char *) &len, sizeof(len));
1380
+
1381
+ word.resize(len);
1382
+ finp.read ((char *) &word[0], len);
1383
+ fout.write((char *) &word[0], len);
1384
+
1385
+ float score;
1386
+ finp.read ((char *) &score, sizeof(score));
1387
+ fout.write((char *) &score, sizeof(score));
1388
+
1389
+ vocab.token_to_id[word.data()] = i;
1390
+
1391
+ auto &tok_score = vocab.id_to_token[i];
1392
+ tok_score.tok = word.data();
1393
+ tok_score.score = score;
1394
+ }
1395
+ }
1396
+
1397
+ // load weights
1398
+ {
1399
+ size_t total_size_org = 0;
1400
+ size_t total_size_new = 0;
1401
+
1402
+ std::vector<float> work;
1403
+
1404
+ std::vector<uint8_t> data_u8;
1405
+ std::vector<ggml_fp16_t> data_f16;
1406
+ std::vector<float> data_f32;
1407
+
1408
+ std::vector<int64_t> hist_all(1 << 4, 0);
1409
+
1410
+ while (true) {
1411
+ int32_t n_dims;
1412
+ int32_t length;
1413
+ int32_t ftype;
1414
+
1415
+ finp.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
1416
+ finp.read(reinterpret_cast<char *>(&length), sizeof(length));
1417
+ finp.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
1418
+
1419
+ if (finp.eof()) {
1420
+ break;
1421
+ }
1422
+
1423
+ int32_t nelements = 1;
1424
+ int32_t ne[2] = { 1, 1 };
1425
+ for (int i = 0; i < n_dims; ++i) {
1426
+ finp.read (reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
1427
+ nelements *= ne[i];
1428
+ }
1429
+
1430
+ std::string name(length, 0);
1431
+ finp.read (&name[0], length);
1432
+
1433
+ {
1434
+ // ensure tensor data is aligned
1435
+ uint64_t offset = finp.tellg();
1436
+ offset = (offset + 31) & -32;
1437
+ finp.seekg(offset);
1438
+ }
1439
+
1440
+ {
1441
+ static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
1442
+ printf("%48s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ftype_str[ftype]);
1443
+ }
1444
+
1445
+ // regexes of tensor names to be quantized
1446
+ const std::vector<std::string> k_names = {
1447
+ ".*weight",
1448
+ };
1449
+
1450
+ bool quantize = false;
1451
+ for (const auto & s : k_names) {
1452
+ if (std::regex_match(name, std::regex(s))) {
1453
+ quantize = true;
1454
+ break;
1455
+ }
1456
+ }
1457
+
1458
+ // quantize only 2D tensors
1459
+ quantize &= (n_dims == 2);
1460
+
1461
+ if (quantize) {
1462
+ if (ftype != 0 && ftype != 1) {
1463
+ fprintf(stderr, "%s: unsupported ftype %d for integer quantization\n", __func__, ftype);
1464
+ return false;
1465
+ }
1466
+
1467
+ if (ftype == 1) {
1468
+ data_f16.resize(nelements);
1469
+ finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
1470
+ data_f32.resize(nelements);
1471
+ for (int i = 0; i < nelements; ++i) {
1472
+ data_f32[i] = ggml_fp16_to_fp32(data_f16[i]);
1473
+ }
1474
+ } else {
1475
+ data_f32.resize(nelements);
1476
+ finp.read(reinterpret_cast<char *>(data_f32.data()), nelements * sizeof(float));
1477
+ }
1478
+
1479
+ ftype = itype;
1480
+ } else {
1481
+ const int bpe = (ftype == 0) ? sizeof(float) : sizeof(uint16_t);
1482
+
1483
+ data_u8.resize(nelements*bpe);
1484
+ finp.read(reinterpret_cast<char *>(data_u8.data()), nelements * bpe);
1485
+ }
1486
+
1487
+ fout.write(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
1488
+ fout.write(reinterpret_cast<char *>(&length), sizeof(length));
1489
+ fout.write(reinterpret_cast<char *>(&ftype), sizeof(ftype));
1490
+ for (int i = 0; i < n_dims; ++i) {
1491
+ fout.write(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
1492
+ }
1493
+ fout.write(&name[0], length);
1494
+
1495
+ {
1496
+ // ensure tensor data is aligned
1497
+ uint64_t offset = fout.tellp();
1498
+ offset = (offset + 31) & -32;
1499
+ fout.seekp(offset);
1500
+ }
1501
+
1502
+ if (quantize) {
1503
+ printf("quantizing .. ");
1504
+ work.resize(nelements); // for quantization
1505
+
1506
+ size_t cur_size = 0;
1507
+ std::vector<int64_t> hist_cur(1 << 4, 0);
1508
+
1509
+ switch (type) {
1510
+ case GGML_TYPE_Q4_0:
1511
+ {
1512
+ cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
1513
+ } break;
1514
+ case GGML_TYPE_Q4_1:
1515
+ {
1516
+ cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
1517
+ } break;
1518
+ default:
1519
+ {
1520
+ fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, type);
1521
+ return false;
1522
+ }
1523
+ }
1524
+
1525
+ fout.write(reinterpret_cast<char *>(work.data()), cur_size);
1526
+ total_size_new += cur_size;
1527
+
1528
+ printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
1529
+ for (int i = 0; i < (int) hist_cur.size(); ++i) {
1530
+ hist_all[i] += hist_cur[i];
1531
+ }
1532
+
1533
+ for (int i = 0; i < (int) hist_cur.size(); ++i) {
1534
+ printf("%5.3f ", hist_cur[i] / float(nelements));
1535
+ }
1536
+ printf("\n");
1537
+ } else {
1538
+ printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0);
1539
+ fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size());
1540
+ total_size_new += data_u8.size();
1541
+ }
1542
+
1543
+ total_size_org += nelements * sizeof(float);
1544
+ }
1545
+
1546
+ printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
1547
+ printf("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
1548
+
1549
+ {
1550
+ int64_t sum_all = 0;
1551
+ for (int i = 0; i < (int) hist_all.size(); ++i) {
1552
+ sum_all += hist_all[i];
1553
+ }
1554
+
1555
+ printf("%s: hist: ", __func__);
1556
+ for (int i = 0; i < (int) hist_all.size(); ++i) {
1557
+ printf("%5.3f ", hist_all[i] / float(sum_all));
1558
+ }
1559
+ printf("\n");
1560
+ }
1561
+ }
1562
+
1563
+ finp.close();
1564
+ fout.close();
1565
+
1566
+ return true;
1567
+ }
1568
+
1569
+ //
1570
+ // interface implementation
1571
+ //
1572
+
1573
+ struct llama_context * llama_init_from_file(
1574
+ const char * path_model,
1575
+ struct llama_context_params params) {
1576
+ ggml_time_init();
1577
+
1578
+ llama_context * ctx = new llama_context;
1579
+
1580
+ if (params.seed <= 0) {
1581
+ params.seed = time(NULL);
1582
+ }
1583
+
1584
+ ctx->rng = std::mt19937(params.seed);
1585
+ ctx->logits_all = params.logits_all;
1586
+
1587
+ ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
1588
+
1589
+ if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_parts, memory_type,
1590
+ params.vocab_only, params.progress_callback,
1591
+ params.progress_callback_user_data)) {
1592
+ fprintf(stderr, "%s: failed to load model\n", __func__);
1593
+ llama_free(ctx);
1594
+ return nullptr;
1595
+ }
1596
+
1597
+ if (params.use_mlock) {
1598
+ char *err;
1599
+ if (!ggml_mlock(ctx->model.ctx,
1600
+ ctx->model.mm_addr,
1601
+ ctx->model.mm_length,
1602
+ &err)) {
1603
+ fprintf(stderr, "%s\n", err);
1604
+ free(err);
1605
+ llama_free(ctx);
1606
+ return nullptr;
1607
+ }
1608
+ }
1609
+
1610
+ // reserve memory for context buffers
1611
+ {
1612
+ if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx)) {
1613
+ fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
1614
+ llama_free(ctx);
1615
+ return nullptr;
1616
+ }
1617
+
1618
+ {
1619
+ const size_t memory_size = ggml_nbytes(ctx->model.kv_self.k) + ggml_nbytes(ctx->model.kv_self.v);
1620
+ fprintf(stderr, "%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
1621
+ }
1622
+
1623
+ const auto & hparams = ctx->model.hparams;
1624
+
1625
+ // resized during inference
1626
+ if (params.logits_all) {
1627
+ ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab);
1628
+ } else {
1629
+ ctx->logits.reserve(hparams.n_ctx);
1630
+ }
1631
+
1632
+ if (params.embedding){
1633
+ ctx->embedding.resize(hparams.n_embd);
1634
+ }
1635
+
1636
+ ctx->buf_compute.resize(MEM_REQ_EVAL.at(ctx->model.type));
1637
+
1638
+ ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0.at(ctx->model.type));
1639
+ ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1.at(ctx->model.type));
1640
+ }
1641
+
1642
+ return ctx;
1643
+ }
1644
+
1645
+ void llama_free(struct llama_context * ctx) {
1646
+ kv_cache_free(ctx->model.kv_self);
1647
+
1648
+ if (ctx->model.ctx) {
1649
+ ggml_free(ctx->model.ctx);
1650
+ }
1651
+
1652
+ if (ctx->model.mm_addr) {
1653
+ munmap_file(ctx->model.mm_addr, ctx->model.mm_length);
1654
+ }
1655
+
1656
+ delete ctx;
1657
+ }
1658
+
1659
+ int llama_model_quantize(
1660
+ const char * fname_inp,
1661
+ const char * fname_out,
1662
+ int itype) {
1663
+ if (!llama_model_quantize_internal(fname_inp, fname_out, itype)) {
1664
+ fprintf(stderr, "%s: failed to quantize\n", __func__);
1665
+ return 1;
1666
+ }
1667
+
1668
+ return 0;
1669
+ }
1670
+
1671
+ int llama_eval(
1672
+ struct llama_context * ctx,
1673
+ const llama_token * tokens,
1674
+ int n_tokens,
1675
+ int n_past,
1676
+ int n_threads) {
1677
+ if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads)) {
1678
+ fprintf(stderr, "%s: failed to eval\n", __func__);
1679
+ return 1;
1680
+ }
1681
+ // get a more accurate load time upon the first eval
1682
+ if (!ctx->has_evaluated_once) {
1683
+ ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
1684
+ ctx->has_evaluated_once = true;
1685
+ }
1686
+ return 0;
1687
+ }
1688
+
1689
+ int llama_tokenize(
1690
+ struct llama_context * ctx,
1691
+ const char * text,
1692
+ llama_token * tokens,
1693
+ int n_max_tokens,
1694
+ bool add_bos) {
1695
+ auto res = llama_tokenize(ctx->vocab, text, add_bos);
1696
+
1697
+ if (n_max_tokens < (int) res.size()) {
1698
+ fprintf(stderr, "%s: too many tokens\n", __func__);
1699
+ return -((int) res.size());
1700
+ }
1701
+
1702
+ for (size_t i = 0; i < res.size(); i++) {
1703
+ tokens[i] = res[i];
1704
+ }
1705
+
1706
+ return res.size();
1707
+ }
1708
+
1709
+ int llama_n_vocab(struct llama_context * ctx) {
1710
+ return ctx->vocab.id_to_token.size();
1711
+ }
1712
+
1713
+ int llama_n_ctx(struct llama_context * ctx) {
1714
+ return ctx->model.hparams.n_ctx;
1715
+ }
1716
+
1717
+ int llama_n_embd(struct llama_context * ctx) {
1718
+ return ctx->model.hparams.n_embd;
1719
+ }
1720
+
1721
+ float * llama_get_logits(struct llama_context * ctx) {
1722
+ return ctx->logits.data();
1723
+ }
1724
+
1725
+ float * llama_get_embeddings(struct llama_context * ctx) {
1726
+ return ctx->embedding.data();
1727
+ }
1728
+
1729
+ const char * llama_token_to_str(struct llama_context * ctx, llama_token token) {
1730
+ if (token >= llama_n_vocab(ctx)) {
1731
+ return nullptr;
1732
+ }
1733
+
1734
+ return ctx->vocab.id_to_token[token].tok.c_str();
1735
+ }
1736
+
1737
+ llama_token llama_token_bos() {
1738
+ return 1;
1739
+ }
1740
+
1741
+ llama_token llama_token_eos() {
1742
+ return 2;
1743
+ }
1744
+
1745
+ llama_token llama_sample_top_p_top_k(
1746
+ llama_context * ctx,
1747
+ const llama_token * last_n_tokens_data,
1748
+ int last_n_tokens_size,
1749
+ int top_k,
1750
+ float top_p,
1751
+ float temp,
1752
+ float repeat_penalty) {
1753
+ const int64_t t_start_sample_us = ggml_time_us();
1754
+
1755
+ llama_token result = 0;
1756
+
1757
+ // TODO: avoid this ...
1758
+ const auto last_n_tokens = std::vector<llama_token>(last_n_tokens_data, last_n_tokens_data + last_n_tokens_size);
1759
+
1760
+ result = llama_sample_top_p_top_k(
1761
+ *ctx,
1762
+ last_n_tokens,
1763
+ top_k,
1764
+ top_p,
1765
+ temp,
1766
+ repeat_penalty);
1767
+
1768
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
1769
+ ctx->n_sample++;
1770
+
1771
+ return result;
1772
+ }
1773
+
1774
+
1775
+ void llama_print_timings(struct llama_context * ctx) {
1776
+ const int64_t t_end_us = ggml_time_us();
1777
+
1778
+ const int32_t n_sample = Max(1, ctx->n_sample);
1779
+ const int32_t n_eval = Max(1, ctx->n_eval);
1780
+ const int32_t n_p_eval = Max(1, ctx->n_p_eval);
1781
+
1782
+ fprintf(stderr, "\n");
1783
+ fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
1784
+ fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample);
1785
+ fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval);
1786
+ fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval);
1787
+ fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0);
1788
+ }
1789
+
1790
+ void llama_reset_timings(struct llama_context * ctx) {
1791
+ ctx->t_start_us = ggml_time_us();
1792
+ ctx->t_sample_us = ctx->n_sample = 0;
1793
+ ctx->t_eval_us = ctx->n_eval = 0;
1794
+ ctx->t_p_eval_us = ctx->n_p_eval = 0;
1795
+ }
1796
+
1797
+ const char * llama_print_system_info(void) {
1798
+ static std::string s;
1799
+
1800
+ s = "";
1801
+ s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
1802
+ s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
1803
+ s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
1804
+ s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
1805
+ s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
1806
+ s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
1807
+ s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
1808
+ s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
1809
+ s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
1810
+ s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
1811
+ s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
1812
+ s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
1813
+
1814
+ return s.c_str();
1815
+ }
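For orientation, a minimal end-to-end sketch of the C API implemented above (the model path and generation settings are illustrative, and error handling is kept to a minimum):

#include "llama.h"

#include <cstdio>
#include <string>
#include <vector>

int main(int argc, char ** argv) {
    const char * model_path = argc > 1 ? argv[1] : "models/7B/ggml-model-q4_0.bin"; // hypothetical path

    struct llama_context_params params = llama_context_default_params();
    params.n_ctx = 512;

    struct llama_context * ctx = llama_init_from_file(model_path, params);
    if (ctx == nullptr) {
        fprintf(stderr, "failed to load '%s'\n", model_path);
        return 1;
    }

    // tokenize the prompt (add_bos = true prepends the BOS token, id 1)
    const std::string prompt = "The quick brown fox";
    std::vector<llama_token> tokens(params.n_ctx);
    const int n_prompt = llama_tokenize(ctx, prompt.c_str(), tokens.data(), (int) tokens.size(), true);
    if (n_prompt < 0) {
        fprintf(stderr, "prompt does not fit in the context\n");
        return 1;
    }
    tokens.resize(n_prompt);

    std::vector<llama_token> last_n(tokens); // history used for the repetition penalty
    int n_past = 0;

    printf("%s", prompt.c_str());
    for (int i = 0; i < 32; i++) {
        // evaluate the pending tokens (the whole prompt first, then one token at a time)
        if (llama_eval(ctx, tokens.data(), (int) tokens.size(), n_past, 4) != 0) {
            fprintf(stderr, "llama_eval failed\n");
            return 1;
        }
        n_past += (int) tokens.size();

        const llama_token id = llama_sample_top_p_top_k(ctx, last_n.data(), (int) last_n.size(),
                /*top_k*/ 40, /*top_p*/ 0.95f, /*temp*/ 0.80f, /*repeat_penalty*/ 1.10f);
        if (id == llama_token_eos()) {
            break;
        }

        printf("%s", llama_token_to_str(ctx, id));
        fflush(stdout);

        last_n.push_back(id);
        tokens = { id };
    }
    printf("\n");

    llama_print_timings(ctx);
    llama_free(ctx);
    return 0;
}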