llama-rb 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,1815 @@
1
+ #include "llama.h"
2
+
3
+ #include "ggml.h"
4
+
5
+ #include <cinttypes>
6
+ #include <fstream>
7
+ #include <random>
8
+ #include <map>
9
+ #include <unordered_map>
10
+ #include <queue>
11
+ #include <regex>
12
+ #include <cassert>
13
+ #include <cstring>
14
+
15
+ #if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
16
+ #define WIN32_LEAN_AND_MEAN
17
+ #include <Windows.h>
18
+ #else
19
+ #include <sys/types.h>
20
+ #include <sys/mman.h>
21
+ #include <unistd.h>
22
+ #include <fcntl.h>
23
+ #endif
24
+
25
+ #define Min(X, Y) ((Y) > (X) ? (X) : (Y))
26
+ #define Max(X, Y) ((Y) < (X) ? (X) : (Y))
27
+
28
+ #define LLAMA_USE_SCRATCH
29
+ #define LLAMA_MAX_SCRATCH_BUFFERS 16
30
+
31
+ #define LLAMA_ASSERT(x) \
32
+ do { \
33
+ if (!(x)) { \
34
+ fprintf(stderr, "LLAMA_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
35
+ abort(); \
36
+ } \
37
+ } while (0)
38
+
39
+
40
+ // determine number of model parts based on the dimension
41
// Number of file parts a converted model is split across, keyed by n_embd.
// Used by the loader when the caller passes n_parts < 1 (auto-detect).
static const std::unordered_map<int, int> LLAMA_N_PARTS = {
    {4096, 1},  // 7B
    {5120, 2},  // 13B
    {6656, 4},  // 30B
    {8192, 8},  // 65B
};
47
+
48
+ // available llama models
49
// Known llama model sizes; values are spelled out explicitly so the
// numeric encoding (also printed by the loader) is obvious at a glance.
enum e_model {
    MODEL_UNKNOWN = 0,
    MODEL_7B      = 1,
    MODEL_13B     = 2,
    MODEL_30B     = 3,
    MODEL_65B     = 4,
};
56
+
57
+ static const size_t MB = 1024*1024;
58
+
59
+ // computed for n_ctx == 2048
60
+ // TODO: dynamically determine these sizes
61
+ // needs modifications in ggml
62
+
63
+ static const std::map<e_model, size_t> MEM_REQ_SCRATCH0 = {
64
+ { MODEL_7B, 512ull*MB },
65
+ { MODEL_13B, 512ull*MB },
66
+ { MODEL_30B, 512ull*MB },
67
+ { MODEL_65B, 512ull*MB },
68
+ };
69
+
70
+ static const std::map<e_model, size_t> MEM_REQ_SCRATCH1 = {
71
+ { MODEL_7B, 512ull*MB },
72
+ { MODEL_13B, 512ull*MB },
73
+ { MODEL_30B, 512ull*MB },
74
+ { MODEL_65B, 512ull*MB },
75
+ };
76
+
77
+ // 2*n_embd*n_ctx*n_layer*sizeof(float16)
78
+ static const std::map<e_model, size_t> MEM_REQ_KV_SELF = {
79
+ { MODEL_7B, 1026ull*MB },
80
+ { MODEL_13B, 1608ull*MB },
81
+ { MODEL_30B, 3124ull*MB },
82
+ { MODEL_65B, 5120ull*MB },
83
+ };
84
+
85
+ // this is mostly needed for temporary mul_mat buffers to dequantize the data
86
+ // not actually needed if BLAS is disabled
87
+ static const std::map<e_model, size_t> MEM_REQ_EVAL = {
88
+ { MODEL_7B, 768ull*MB },
89
+ { MODEL_13B, 1024ull*MB },
90
+ { MODEL_30B, 1280ull*MB },
91
+ { MODEL_65B, 1536ull*MB },
92
+ };
93
+
94
+ // default hparams (LLaMA 7B)
95
// default hparams (LLaMA 7B)
// All fields except n_ctx are read directly from the model file header by
// llama_model_load; the defaults only apply until the file is parsed.
struct llama_hparams {
    int32_t n_vocab = 32000;  // vocabulary size
    int32_t n_ctx   = 512;    // this is provided as user input? (overwritten with the caller's n_ctx in llama_model_load)
    int32_t n_embd  = 4096;   // embedding dimension
    int32_t n_mult  = 256;    // rounding multiple used to derive the feed-forward size n_ff
    int32_t n_head  = 32;     // number of attention heads
    int32_t n_layer = 32;     // number of transformer layers
    int32_t n_rot   = 64;     // rotary embedding dimension stored in the file
    int32_t f16     = 1;      // weight format: 0=f32, 1=f16, 2=q4_0, 3=q4_1, 4=mixed q4_1/f16 (see loader switch)
};
105
+
106
// Weight tensors for one transformer layer. All pointers are owned by the
// model's ggml_context; the data itself lives in the mmap'd model file.
struct llama_layer {
    // normalization
    struct ggml_tensor * attention_norm;  // RMS-norm weights applied before self-attention

    // attention
    struct ggml_tensor * wq;  // query projection
    struct ggml_tensor * wk;  // key projection
    struct ggml_tensor * wv;  // value projection
    struct ggml_tensor * wo;  // attention output projection

    // normalization
    struct ggml_tensor * ffn_norm;  // RMS-norm weights applied before the feed-forward block

    // ff
    // In llama_eval_internal: cur = w2 * (silu(w1 * x) * (w3 * x)),
    // i.e. w1 feeds the SILU branch, w3 is the parallel branch, w2 projects back down.
    struct ggml_tensor * w1;
    struct ggml_tensor * w2;
    struct ggml_tensor * w3;
};
124
+
125
// Self-attention key/value cache. k and v are flat 1-D tensors holding
// n_layer*n_ctx*n_embd elements each (see kv_cache_init), allocated inside
// their own ggml context backed by `buf`.
struct llama_kv_cache {
    struct ggml_tensor * k;  // cached keys
    struct ggml_tensor * v;  // cached values

    struct ggml_context * ctx;  // owns k and v; freed by kv_cache_free

    std::vector<uint8_t> buf;  // backing memory for ctx

    int n; // number of tokens currently in the cache
};
135
+
136
// A loaded llama model: hyperparameters, weight tensors and the memory that
// backs them (a ggml context plus the mmap'd model file).
struct llama_model {
    e_model type = MODEL_UNKNOWN;  // size class, inferred from n_layer at load time

    llama_hparams hparams;

    struct ggml_tensor * tok_embeddings;  // token embedding matrix [n_embd, n_vocab]

    struct ggml_tensor * norm;    // final RMS-norm weights
    struct ggml_tensor * output;  // lm_head projection [n_embd, n_vocab]

    std::vector<llama_layer> layers;  // one entry per transformer layer

    // context
    struct ggml_context * ctx;  // owns the tensor metadata (no_alloc: data points into the mmap)

    // key + value cache for the self attention
    // TODO: move to llama_state
    struct llama_kv_cache kv_self;

    // the model memory buffer
    std::vector<uint8_t> buf;  // backing memory for ctx

    // model memory mapped file
    void * mm_addr = NULL;     // base address returned by mmap_file
    uint64_t mm_length = 0;    // mapped length in bytes

    // tensors
    int n_loaded;  // number of tensors actually read from the file
    std::unordered_map<std::string, struct ggml_tensor *> tensors;  // name -> tensor lookup used by the loader
};
166
+
167
// SentencePiece-style vocabulary: bidirectional token <-> id mapping plus a
// per-token merge score used by the tokenizer's bigram queue.
struct llama_vocab {
    using id    = int32_t;      // token id
    using token = std::string;  // token text (raw bytes)

    struct token_score {
        token tok;    // token text
        float score;  // merge score read from the model file
    };

    std::unordered_map<token, id> token_to_id;  // text -> id
    std::vector<token_score> id_to_token;       // id -> text + score, sized n_vocab
};
179
+
180
// Per-inference-session state: the model, vocab, timing counters, output
// buffers and the scratch memory used while evaluating the graph.
struct llama_context {
    std::mt19937 rng;  // RNG for sampling

    int64_t t_load_us = 0;           // model load time (recomputed after first eval to account for mmap page faults)
    int64_t t_start_us = 0;          // timestamp taken at load start
    bool has_evaluated_once = false;

    int64_t t_sample_us = 0;  // cumulative sampling time
    int64_t t_eval_us   = 0;  // cumulative single-token eval time
    int64_t t_p_eval_us = 0;  // cumulative prompt (batch) eval time

    int32_t n_sample = 0; // number of tokens sampled
    int32_t n_eval   = 0; // number of eval calls
    int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)

    llama_model model;
    llama_vocab vocab;

    size_t mem_per_token = 0;  // measured ggml memory per token, set on the first eval

    // decode output (2-dimensional array: [n_tokens][n_vocab])
    std::vector<float> logits;
    bool logits_all = false;  // when true, keep logits for every token, not just the last

    // input embedding (1-dimensional array: [n_embd])
    std::vector<float> embedding;

    // memory buffers used to evaluate the model
    // TODO: move in llama_state
    std::vector<uint8_t> buf_compute;                            // main compute buffer for ggml graph building
    std::vector<uint8_t> buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS]; // scratch buffers cycled via use_buf

    int    buf_last = 0;                                    // index of the scratch buffer currently active (-1 = none)
    size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 }; // high-water mark of each scratch buffer

    // Switch the ggml context to scratch buffer i (i == -1 disables scratch
    // allocation). Also records the high-water mark of the buffer being
    // switched away from, as reported by ggml_set_scratch.
    void use_buf(struct ggml_context * ctx, int i) {
#if defined(LLAMA_USE_SCRATCH)
        size_t last_size = 0;

        if (i == -1) {
            // detach scratch memory: subsequent allocations come from the main buffer
            last_size = ggml_set_scratch(ctx, { 0, 0, nullptr, });
        } else {
            auto & buf = buf_scratch[i];
            last_size = ggml_set_scratch(ctx, { 0, buf.size(), buf.data(), });
        }

        if (buf_last >= 0) {
            // remember the most memory the previous buffer ever needed
            buf_max_size[buf_last] = Max(buf_max_size[buf_last], last_size);
        }

        buf_last = i;
#else
        (void) i;
        (void) ctx;
#endif
    }

    // Peak usage observed for scratch buffer i (0 when scratch is disabled).
    size_t get_buf_max_mem(int i) const {
#if defined(LLAMA_USE_SCRATCH)
        return buf_max_size[i];
#else
        (void) i;
        return 0;
#endif
    }
};
246
+
247
+ //
248
+ // kv cache
249
+ //
250
+
251
+ static bool kv_cache_init(
252
+ const struct llama_hparams & hparams,
253
+ struct llama_kv_cache & cache,
254
+ ggml_type wtype,
255
+ int n_ctx) {
256
+ const int n_embd = hparams.n_embd;
257
+ const int n_layer = hparams.n_layer;
258
+
259
+ const int n_mem = n_layer*n_ctx;
260
+ const int n_elements = n_embd*n_mem;
261
+
262
+ cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
263
+
264
+ struct ggml_init_params params;
265
+ params.mem_size = cache.buf.size();
266
+ params.mem_buffer = cache.buf.data();
267
+ params.no_alloc = false;
268
+
269
+ cache.ctx = ggml_init(params);
270
+
271
+ if (!cache.ctx) {
272
+ fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__);
273
+ return false;
274
+ }
275
+
276
+ cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
277
+ cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
278
+
279
+ return true;
280
+ }
281
+
282
+ static void kv_cache_free(struct llama_kv_cache & cache) {
283
+ if (cache.ctx) {
284
+ ggml_free(cache.ctx);
285
+ cache.ctx = nullptr;
286
+ }
287
+ }
288
+
289
+ struct llama_context_params llama_context_default_params() {
290
+ struct llama_context_params result = {
291
+ /*.n_ctx =*/ 512,
292
+ /*.n_parts =*/ -1,
293
+ /*.seed =*/ 0,
294
+ /*.f16_kv =*/ false,
295
+ /*.logits_all =*/ false,
296
+ /*.vocab_only =*/ false,
297
+ /*.use_mlock =*/ false,
298
+ /*.embedding =*/ false,
299
+ /*.progress_callback =*/ nullptr,
300
+ /*.progress_callback_user_data =*/ nullptr,
301
+ };
302
+
303
+ return result;
304
+ }
305
+
306
+ //
307
+ // model loading
308
+ //
309
+
310
// Map the file `fname` read-only into memory.
//
// On success returns the base address and stores the file length in
// *mm_length; on any failure returns 0 and leaves *mm_length untouched.
//
// Fix vs. original: the file-size query is now checked on both platforms.
// Previously a failed lseek() (POSIX) or GetFileSizeEx() (Windows) left
// length == -1, which was then passed to mmap()/used as the mapping length.
static void *mmap_file(const char *fname, uint64_t *mm_length) {
#if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
    HANDLE hFile = CreateFileA(fname,
                               GENERIC_READ,
                               FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
                               NULL,
                               OPEN_EXISTING,
                               FILE_ATTRIBUTE_NORMAL | FILE_ATTRIBUTE_NOT_CONTENT_INDEXED,
                               NULL);
    if (hFile == INVALID_HANDLE_VALUE) return 0;
    LARGE_INTEGER fileSize;
    fileSize.QuadPart = -1;
    if (!GetFileSizeEx(hFile, &fileSize) || fileSize.QuadPart <= 0) {
        // size query failed or file is empty - nothing to map
        CloseHandle(hFile);
        return 0;
    }
    int64_t length = fileSize.QuadPart;
    HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
    CloseHandle(hFile);
    if (!hMapping) return 0;
    void *addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
    CloseHandle(hMapping);
    if (!addr) return 0;
#else
    int fd = open(fname, O_RDONLY);
    if (fd == -1) return 0;
    int64_t length = lseek(fd, 0, SEEK_END);
    if (length <= 0) {
        // lseek failed or the file is empty - mmap would be invalid either way
        close(fd);
        return 0;
    }
    void *addr = mmap(NULL, length, PROT_READ, MAP_SHARED, fd, 0);
    close(fd);
    if (addr == MAP_FAILED) return 0;
#endif
    *mm_length = length;
    return addr;
}
341
+
342
// Undo a mapping created by mmap_file. `length` is ignored on Windows,
// where the view is identified by its base address alone.
static void munmap_file(void * addr, size_t length) {
#if !defined(_WIN32) || defined(_POSIX_MAPPED_FILES)
    munmap(addr, length);
#else
    UnmapViewOfFile(addr);
#endif
}
349
+
350
// Print a diagnostic for a model file whose magic number does not match.
// Always returns false so callers can `return report_bad_magic(...);`.
// The output is split into the one-line error and the multi-line advice;
// the emitted bytes are identical to a single combined fprintf.
static bool report_bad_magic(const char *path, uint32_t got, uint32_t want) {
    fprintf(stderr, "%s: invalid model file (bad magic [got %#x want %#x])\n", path, got, want);
    fprintf(stderr,
            "\tyou most likely need to regenerate your ggml files\n"
            "\tthe benefit is you'll get 10-100x faster load times\n"
            "\tsee https://github.com/ggerganov/llama.cpp/issues/91\n"
            "\tuse convert-pth-to-ggml.py to regenerate from original pth\n"
            "\tuse migrate-ggml-2023-03-30-pr613.py if you deleted originals\n");
    return false;
}
361
+
362
// Load a ggml-format llama model from `fname` into `lctx`.
//
// The file layout parsed here is: magic, format version, hparams, vocabulary
// (length-prefixed tokens with scores), then a sequence of tensors. Tensor
// *data* is never copied: the whole file is mmap'd and each tensor's data
// pointer is aimed at its (32-byte aligned) offset inside the mapping.
//
// Parameters:
//   fname        - path to the model file
//   lctx         - destination context (model, vocab, timings)
//   n_ctx        - context size requested by the user (overrides the file)
//   n_parts      - number of model parts; < 1 means auto-detect from n_embd
//   memory_type  - element type for the kv cache (only used for the memory
//                  requirement printout here)
//   vocab_only   - stop after reading the vocabulary
//   progress_callback / progress_callback_user_data - optional load progress hook
//
// Returns false on any I/O or format error (diagnostics go to stderr).
static bool llama_model_load(
        const std::string & fname,
        llama_context & lctx,
        int n_ctx,
        int n_parts,
        ggml_type memory_type,
        bool vocab_only,
        llama_progress_callback progress_callback,
        void *progress_callback_user_data) {
    fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());

    lctx.t_start_us = ggml_time_us();

    auto & model = lctx.model;
    auto & vocab = lctx.vocab;

    auto fin = std::ifstream(fname, std::ios::binary);
    if (!fin) {
        fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
        return false;
    }

    // large stream buffer to cut down on read syscalls while parsing headers
    std::vector<char> f_buf(1024*1024);
    fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());

    // total file size, used for progress reporting
    fin.seekg(0, fin.end);
    const size_t file_size = fin.tellg();
    fin.seekg(0);

    // verify magic
    {
        uint32_t magic;
        fin.read((char *) &magic, sizeof(magic));
        if (magic == LLAMA_FILE_MAGIC_UNVERSIONED) {
            fprintf(stderr, "%s: invalid model file '%s' (too old, regenerate your model files or convert them with convert-unversioned-ggml-to-ggml.py!)\n",
                    __func__, fname.c_str());
            return false;
        }
        if (magic != LLAMA_FILE_MAGIC) {
            return report_bad_magic(fname.c_str(), magic, LLAMA_FILE_MAGIC);
        }

        uint32_t format_version;
        fin.read((char *) &format_version, sizeof(format_version));

        if (format_version != LLAMA_FILE_VERSION) {
            fprintf(stderr, "%s: invalid model file '%s' (unsupported format version %" PRIu32 ", expected %d)\n",
                    __func__, fname.c_str(), format_version, LLAMA_FILE_VERSION);
            return false;
        }
    }

    int n_ff = 0;  // feed-forward hidden size, derived from n_embd/n_mult below

    // load hparams
    {
        auto & hparams = model.hparams;

        fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
        //fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
        fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
        fin.read((char *) &hparams.n_mult, sizeof(hparams.n_mult));
        fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
        fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
        fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
        fin.read((char *) &hparams.f16, sizeof(hparams.f16));

        // the context size comes from the caller, not the file
        hparams.n_ctx = n_ctx;

        // ff hidden size = 2/3 * 4*n_embd, rounded up to a multiple of n_mult
        n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;

        if (n_parts < 1) {
            n_parts = LLAMA_N_PARTS.at(hparams.n_embd);
        }

        // temp warning to tell the user to use "--n_parts"
        if (hparams.f16 == 4 && n_parts != 1) {
            fprintf(stderr, "%s: GPTQ model detected - are you sure n_parts should be %d? we normally expect it to be 1\n", __func__, n_parts);
            fprintf(stderr, "%s: use '--n_parts 1' if necessary\n", __func__);
        }

        // classify the model size from the layer count
        if (hparams.n_layer == 32) {
            model.type = e_model::MODEL_7B;
        }

        if (hparams.n_layer == 40) {
            model.type = e_model::MODEL_13B;
        }

        if (hparams.n_layer == 60) {
            model.type = e_model::MODEL_30B;
        }

        if (hparams.n_layer == 80) {
            model.type = e_model::MODEL_65B;
        }

        fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab);
        fprintf(stderr, "%s: n_ctx = %d\n", __func__, hparams.n_ctx);
        fprintf(stderr, "%s: n_embd = %d\n", __func__, hparams.n_embd);
        fprintf(stderr, "%s: n_mult = %d\n", __func__, hparams.n_mult);
        fprintf(stderr, "%s: n_head = %d\n", __func__, hparams.n_head);
        fprintf(stderr, "%s: n_layer = %d\n", __func__, hparams.n_layer);
        fprintf(stderr, "%s: n_rot = %d\n", __func__, hparams.n_rot);
        fprintf(stderr, "%s: f16 = %d\n", __func__, hparams.f16);
        fprintf(stderr, "%s: n_ff = %d\n", __func__, n_ff);
        fprintf(stderr, "%s: n_parts = %d\n", __func__, n_parts);
        fprintf(stderr, "%s: type = %d\n", __func__, model.type);
    }

    // load vocab: n_vocab entries of (u32 length, bytes, f32 score)
    {
        std::string word;
        vocab.id_to_token.resize(model.hparams.n_vocab);
        std::vector<char> tmp(64);

        for (int i = 0; i < model.hparams.n_vocab; i++) {
            uint32_t len;
            fin.read((char *) &len, sizeof(len));

            word.resize(len);
            if (len > 0) {
                tmp.resize(len);
                fin.read(tmp.data(), len);
                word.assign(tmp.data(), len);
            } else {
                word.clear();
            }

            float score;
            fin.read((char *) &score, sizeof(score));

            vocab.token_to_id[word] = i;

            auto &tok_score = vocab.id_to_token[i];
            tok_score.tok = word;
            tok_score.score = score;
        }
    }

    if (vocab_only) {
        return true;
    }

    // for the big tensors, we have the option to store the data in 16-bit floats or quantized
    // in order to save memory and also to speed up the computation
    // wtype is for per-layer weights, while vtype is for other weights
    ggml_type wtype, vtype;
    switch (model.hparams.f16) {
        case 0: wtype = vtype = GGML_TYPE_F32; break;
        case 1: wtype = vtype = GGML_TYPE_F16; break;
        case 2: wtype = vtype = GGML_TYPE_Q4_0; break;
        case 3: wtype = vtype = GGML_TYPE_Q4_1; break;
        case 4: wtype = GGML_TYPE_Q4_1; vtype = GGML_TYPE_F16; break; // GPTQ-converted: quantized layers, f16 embeddings/output
        default:
            {
                fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n",
                        __func__, fname.c_str(), model.hparams.f16);
                return false;
            }
    }

    // map model into memory; tensor data pointers will point into this mapping
    char *mm_addr = NULL;
    model.mm_addr = mmap_file(fname.c_str(), &model.mm_length);
    if (model.mm_addr == NULL) {
        fprintf(stderr, "%s: failed to mmap '%s'\n", __func__, fname.c_str());
        return false;
    }
    mm_addr = (char *)model.mm_addr;
    fprintf(stderr, "%s: ggml map size = %6.2f MB\n", __func__, model.mm_length/(1024.0*1024.0));

    auto & ctx = model.ctx;

    // only tensor *metadata* lives in the ggml context (no_alloc below),
    // so the context just needs room for the object headers
    size_t ctx_size = 0;
    {
        const auto &hparams = model.hparams;
        const int n_layer = hparams.n_layer;
        ctx_size += (5 + 10*n_layer)*256; // object overhead
        fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0);
    }

    // print memory requirements
    {
        // f32 kv cache needs twice the space of the f16 baseline tables
        const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;

        // this is the total memory required to run the inference
        const size_t mem_required =
            ctx_size +
            model.mm_length +
            MEM_REQ_SCRATCH0.at(model.type) +
            MEM_REQ_SCRATCH1.at(model.type) +
            MEM_REQ_EVAL.at (model.type);

        // this is the memory required by one llama_state
        const size_t mem_required_state =
            scale*MEM_REQ_KV_SELF.at(model.type);

        fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
                mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
    }

    // create the ggml context
    {
        lctx.model.buf.resize(ctx_size);

        struct ggml_init_params params = {
            /*.mem_size =*/ lctx.model.buf.size(),
            /*.mem_buffer =*/ lctx.model.buf.data(),
            /*.no_alloc =*/ true,  // data pointers are patched to the mmap below
        };

        model.ctx = ggml_init(params);
        if (!model.ctx) {
            fprintf(stderr, "%s: ggml_init() failed\n", __func__);
            return false;
        }
    }

    // prepare memory for the weights: create every expected tensor and
    // register it by its serialized name for lookup during loading
    {
        const auto & hparams = model.hparams;

        const int n_embd = hparams.n_embd;
        const int n_layer = hparams.n_layer;
        const int n_vocab = hparams.n_vocab;

        model.layers.resize(n_layer);

        model.tok_embeddings = ggml_new_tensor_2d(ctx, vtype, n_embd, n_vocab);

        model.norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
        model.output = ggml_new_tensor_2d(ctx, vtype, n_embd, n_vocab);

        // map by name
        model.tensors["tok_embeddings.weight"] = model.tok_embeddings;

        model.tensors["norm.weight"] = model.norm;
        model.tensors["output.weight"] = model.output;

        for (int i = 0; i < n_layer; ++i) {
            auto & layer = model.layers[i];

            layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);

            layer.wq = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
            layer.wk = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
            layer.wv = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
            layer.wo = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);

            layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);

            layer.w1 = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ff);
            layer.w2 = ggml_new_tensor_2d(ctx, wtype, n_ff, n_embd);
            layer.w3 = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ff);

            // map by name
            model.tensors["layers." + std::to_string(i) + ".attention_norm.weight"] = layer.attention_norm;

            model.tensors["layers." + std::to_string(i) + ".attention.wq.weight"] = layer.wq;
            model.tensors["layers." + std::to_string(i) + ".attention.wk.weight"] = layer.wk;
            model.tensors["layers." + std::to_string(i) + ".attention.wv.weight"] = layer.wv;
            model.tensors["layers." + std::to_string(i) + ".attention.wo.weight"] = layer.wo;

            model.tensors["layers." + std::to_string(i) + ".ffn_norm.weight"] = layer.ffn_norm;

            model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.weight"] = layer.w1;
            model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight"] = layer.w2;
            model.tensors["layers." + std::to_string(i) + ".feed_forward.w3.weight"] = layer.w3;
        }
    }

    std::vector<uint8_t> tmp;

    if (progress_callback) {
        progress_callback(0.0, progress_callback_user_data);
    }

    fprintf(stderr, "%s: loading tensors from '%s'\n", __func__, fname.c_str());

    // load weights: walk the tensor records, validate each against the
    // pre-created tensor of the same name, and point its data at the mmap
    {
        size_t total_size = 0;
        model.n_loaded = 0;

        while (true) {
            int32_t n_dims;
            int32_t length;
            int32_t ftype;

            fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
            fin.read(reinterpret_cast<char *>(&length), sizeof(length));
            fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));

            // eof is only detectable after the reads above fail
            if (fin.eof()) {
                break;
            }

            int32_t nelements = 1;
            int32_t ne[2] = { 1, 1 };
            for (int i = 0; i < n_dims; ++i) {
                fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
                nelements *= ne[i];
            }

            std::string name(length, 0);
            fin.read(&name[0], length);

            if (model.tensors.find(name.data()) == model.tensors.end()) {
                fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
                return false;
            }

            auto tensor = model.tensors[name.data()];

            if (ggml_nelements(tensor) != nelements) {
                fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
                return false;
            }
            if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
                        __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
                return false;
            }
            // debugging aid: flip to if (1) to dump every tensor record
            if (0) {
                static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
                fprintf(stderr, "%24s - [%5d, %5d], type = %6s\n", name.data(), ne[0], ne[1], ftype_str[ftype]);
            }

            switch (ftype) {
                case 0: // f32
                case 1: // f16
                    break;
                case 2: // q4_0
                case 3: // q4_1
                    assert(ne[0] % 64 == 0);
                    break;
                default:
                    fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
                    return false;
            };

            // load the tensor data into memory without copying or reading it
            size_t offset = fin.tellg();
            size_t tensor_data_size = ggml_nbytes(tensor);
            offset = (offset + 31) & -32;  // round up to the next 32-byte boundary
            tensor->data = mm_addr + offset;
            fin.seekg(offset + tensor_data_size);  // skip past the data; it is never read through the stream
            total_size += tensor_data_size;
            model.n_loaded++;

            // progress
            if (progress_callback) {
                double current_progress = size_t(fin.tellg()) / double(file_size);
                progress_callback(current_progress, progress_callback_user_data);
            }
        }

        fin.close();

        fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, model.n_loaded);
        if (model.n_loaded == 0) {
            fprintf(stderr, "%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__);
        } else if (model.n_loaded != (int) model.tensors.size()) {
            fprintf(stderr, "%s: ERROR not all tensors loaded from model file - expected %zu, got %d\n", __func__, model.tensors.size(), model.n_loaded);
            return false;
        }
    }

    // loading time will be recalculate after the first eval, so
    // we take page faults deferred by mmap() into consideration
    lctx.t_load_us = ggml_time_us() - lctx.t_start_us;

    if (progress_callback) {
        progress_callback(1.0, progress_callback_user_data);
    }

    return true;
}
741
+
742
+ // evaluate the transformer
743
+ //
744
+ // - lctx: llama context
745
+ // - tokens: new batch of tokens to process
746
+ // - n_past: the context size so far
747
+ // - n_threads: number of threads to use
748
+ //
749
// evaluate the transformer
//
//   - lctx:      llama context
//   - tokens:    new batch of tokens to process
//   - n_past:    the context size so far
//   - n_threads: number of threads to use
//
// Builds one ggml graph for the whole forward pass (embedding lookup,
// n_layer attention+FFN blocks with the kv cache, final norm and lm_head),
// runs it, then copies logits (and optionally the last token's embedding)
// into lctx. Returns true; the only hard failure mode is the LLAMA_ASSERT
// on a missing kv cache.
static bool llama_eval_internal(
        llama_context & lctx,
    const llama_token * tokens,
            const int   n_tokens,
            const int   n_past,
            const int   n_threads) {
    const int64_t t_start_us = ggml_time_us();

    const int N = n_tokens;

    const auto & model = lctx.model;
    const auto & hparams = model.hparams;

    auto & kv_self = model.kv_self;

    LLAMA_ASSERT(!!kv_self.ctx);

    const int n_embd = hparams.n_embd;
    const int n_layer = hparams.n_layer;
    const int n_ctx = hparams.n_ctx;
    const int n_head = hparams.n_head;
    const int n_vocab = hparams.n_vocab;
    const int n_rot = hparams.n_embd/hparams.n_head;  // rotary dims = head size (note: not hparams.n_rot)

    auto & mem_per_token = lctx.mem_per_token;
    auto & buf_compute = lctx.buf_compute;

    struct ggml_init_params params = {
        /*.mem_size =*/ buf_compute.size(),
        /*.mem_buffer =*/ buf_compute.data(),
        /*.no_alloc =*/ false,
    };

    struct ggml_context * ctx0 = ggml_init(params);

    // for big prompts, if BLAS is enabled, it is better to use only one thread
    // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
    ggml_cgraph gf = {};
    gf.n_threads = N >= 32 && ggml_cpu_has_blas() ? 1 : n_threads;

    // token ids go into an I32 tensor that seeds the embedding lookup
    struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
    memcpy(embd->data, tokens, N*ggml_element_size(embd));

    struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);

    for (int il = 0; il < n_layer; ++il) {
        struct ggml_tensor * inpSA = inpL;  // saved for the residual connection

        struct ggml_tensor * cur;

        lctx.use_buf(ctx0, 0);  // attention work goes to scratch buffer 0

        // norm
        {
            cur = ggml_rms_norm(ctx0, inpL);

            // cur = attention_norm*cur
            cur = ggml_mul(ctx0,
                        ggml_repeat(ctx0, model.layers[il].attention_norm, cur),
                        cur);
        }

        // self-attention
        {
            struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
            struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
            struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);

            // store key and value to memory
            // (each layer owns an n_ctx-slot region of the flat k/v tensors;
            // the new N tokens land at offset il*n_ctx + n_past)
            if (N >= 1) {
                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
                struct ggml_tensor * v = ggml_view_1d(ctx0, kv_self.v, N*n_embd, (ggml_element_size(kv_self.v)*n_embd)*(il*n_ctx + n_past));

                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
            }

            // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
            struct ggml_tensor * Q =
                ggml_permute(ctx0,
                        ggml_rope(ctx0,
                            ggml_cpy(ctx0,
                                Qcur,
                                ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
                            n_past, n_rot, 0),
                        0, 2, 1, 3);

            // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
            struct ggml_tensor * K =
                ggml_permute(ctx0,
                        ggml_rope(ctx0,
                            ggml_reshape_3d(ctx0,
                                ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
                                n_embd/n_head, n_head, n_past + N),
                            n_past, n_rot, 1),
                        0, 2, 1, 3);

            // K * Q
            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);

            // KQ_scaled = KQ / sqrt(n_embd/n_head)
            struct ggml_tensor * KQ_scaled =
                ggml_scale(ctx0,
                        KQ,
                        ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));

            // KQ_masked = mask_past(KQ_scaled)
            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);

            // KQ = soft_max(KQ_masked)
            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);

            // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
            struct ggml_tensor * V_trans =
                ggml_cpy(ctx0,
                    ggml_permute(ctx0,
                            ggml_reshape_3d(ctx0,
                                ggml_view_1d(ctx0, kv_self.v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.v)*n_embd),
                                n_embd/n_head, n_head, n_past + N),
                            1, 2, 0, 3),
                    ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd/n_head, n_head));

            // KQV = transpose(V) * KQ_soft_max
            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);

            // KQV_merged = KQV.permute(0, 2, 1, 3)
            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);

            // cur = KQV_merged.contiguous().view(n_embd, N)
            cur = ggml_cpy(ctx0,
                    KQV_merged,
                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));

            // projection (no bias)
            cur = ggml_mul_mat(ctx0,
                    model.layers[il].wo,
                    cur);
        }

        lctx.use_buf(ctx0, 1);  // feed-forward work goes to scratch buffer 1

        // first residual: attention output + layer input
        struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);

        // feed-forward network
        {
            // norm
            {
                cur = ggml_rms_norm(ctx0, inpFF);

                // cur = ffn_norm*cur
                cur = ggml_mul(ctx0,
                        ggml_repeat(ctx0, model.layers[il].ffn_norm, cur),
                        cur);
            }

            // gated FFN: w2 * (silu(w1*x) * (w3*x))
            struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
                    model.layers[il].w3,
                    cur);

            cur = ggml_mul_mat(ctx0,
                    model.layers[il].w1,
                    cur);

            // SILU activation
            cur = ggml_silu(ctx0, cur);

            cur = ggml_mul(ctx0, cur, tmp);

            cur = ggml_mul_mat(ctx0,
                    model.layers[il].w2,
                    cur);
        }

        // second residual
        cur = ggml_add(ctx0, cur, inpFF);

        // input for next layer
        inpL = cur;
    }

    lctx.use_buf(ctx0, 0);

    // used at the end to optionally extract the embeddings
    struct ggml_tensor * embeddings = NULL;

    // norm
    {

        inpL = ggml_rms_norm(ctx0, inpL);

        // inpL = norm*inpL
        inpL = ggml_mul(ctx0,
                    ggml_repeat(ctx0, model.norm, inpL),
                    inpL);

        embeddings = inpL;
    }

    // lm_head
    inpL = ggml_mul_mat(ctx0, model.output, inpL);

    lctx.use_buf(ctx0, -1);  // final results must not live in scratch memory

    // logits -> probs
    //inpL = ggml_soft_max(ctx0, inpL);

    // run the computation
    ggml_build_forward_expand(&gf, inpL);
    ggml_graph_compute (ctx0, &gf);

    //if (n_past%100 == 0) {
    //    ggml_graph_print (&gf);
    //    ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
    //}

    //embd_w.resize(n_vocab*N);
    //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);

    // extract logits
    {
        auto & logits_out = lctx.logits;

        if (lctx.logits_all) {
            logits_out.resize(n_vocab * N);
            memcpy(logits_out.data(), (float *) ggml_get_data(inpL), sizeof(float)*n_vocab*N);
        } else {
            // return result for just the last token
            logits_out.resize(n_vocab);
            memcpy(logits_out.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
        }
    }

    // extract embeddings (only when the caller enabled the embedding buffer)
    if (lctx.embedding.size()) {
        auto & embedding_out = lctx.embedding;

        embedding_out.resize(n_embd);
        memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd);
    }

    // first eval establishes the per-token memory estimate
    if (mem_per_token == 0) {
        mem_per_token = ggml_used_mem(ctx0)/N;
    }

#if 0
    printf("\n%s: used_mem = %.3f MB, scratch -- %.3f MB %.3f MB\n", __func__,
            ggml_used_mem(ctx0)/1024.0/1024.0,
            lctx.get_buf_max_mem(0)/1024.0/1024.0,
            lctx.get_buf_max_mem(1)/1024.0/1024.0);
#endif

    ggml_free(ctx0);

    // measure the performance only for the single-token evals
    if (N == 1) {
        lctx.t_eval_us += ggml_time_us() - t_start_us;
        lctx.n_eval++;
    }
    else if (N > 1) {
        lctx.t_p_eval_us += ggml_time_us() - t_start_us;
        lctx.n_p_eval += N;
    }

    return true;
}
1013
+
1014
+ //
1015
+ // tokenizer
1016
+ //
1017
+
1018
// Number of bytes in the UTF-8 sequence that starts with byte `src`.
// Continuation bytes (0x80-0xBF) and other invalid lead bytes map to 1 so
// the tokenizer always makes forward progress on malformed input.
static size_t utf8_len(char src) {
    // indexed by the top 4 bits of the lead byte; static so the table is
    // materialized once instead of being rebuilt on every call
    static const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
    uint8_t highbits = static_cast<uint8_t>(src) >> 4;
    return lookup[highbits];
}
1023
+
1024
// One symbol (initially a single UTF-8 character) in the tokenizer's
// doubly-linked merge list. Symbols are never removed: a merged-away symbol
// keeps its node but gets n == 0 (see the tokenizer's merge loop).
struct llama_sp_symbol {
    using index = int;
    index prev;         // index of the previous live symbol, -1 at the front
    index next;         // index of the next live symbol, -1 at the end
    const char * text;  // pointer into the input string (not owned)
    size_t n;           // byte length; 0 means this symbol was merged away
};
1031
+
1032
+ struct llama_sp_bigram {
1033
+ struct comparator {
1034
+ bool operator()(llama_sp_bigram & l, llama_sp_bigram & r) {
1035
+ return (l.score < r.score) || (l.score == r.score && l.left > r.left);
1036
+ }
1037
+ };
1038
+ using queue_storage = std::vector<llama_sp_bigram>;
1039
+ using queue = std::priority_queue<llama_sp_bigram, queue_storage, comparator>;
1040
+ llama_sp_symbol::index left;
1041
+ llama_sp_symbol::index right;
1042
+ float score;
1043
+ size_t size;
1044
+ };
1045
+
1046
// SentencePiece-style greedy merge tokenizer.
//
// original implementation:
// https://github.com/ggerganov/llama.cpp/commit/074bea2eb1f1349a0118239c4152914aecaa1be4
struct llama_tokenizer {
    llama_tokenizer(const llama_vocab & vocab): vocab_(vocab) {}

    // Append the token ids for `text` to `output`.
    //
    // Algorithm: split the input into single UTF-8 characters linked in a
    // chain, then repeatedly merge the adjacent pair with the best vocab
    // score (via a priority queue) until no mergeable pair remains, and
    // finally emit the surviving symbols — falling back to byte tokens for
    // pieces that are not in the vocab.
    //
    // NOTE(review): assumes `text` is non-empty — the output loop below
    // starts at symbols_[0]; llama_tokenize() guards the empty case.
    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
        // split string into utf8 chars
        int index = 0;
        size_t offs = 0;
        while (offs < text.size()) {
            llama_sp_symbol sym;
            // Min() guards against a truncated multi-byte sequence at the
            // end of the input
            size_t char_len = Min(text.size() - offs, utf8_len(text[offs]));
            sym.text = text.c_str() + offs;
            sym.n = char_len;
            offs += char_len;
            sym.prev = index - 1;
            sym.next = offs == text.size() ? -1 : index + 1;
            index++;
            symbols_.emplace_back(std::move(sym));
        }

        // seed the work queue with all possible 2-character tokens.
        for (size_t i = 1; i < symbols_.size(); ++i) {
            try_add_bigram(i - 1, i);
        }

        // keep substituting the highest frequency pairs for as long as we can.
        while (!work_queue_.empty()) {
            auto bigram = work_queue_.top();
            work_queue_.pop();

            auto & left_sym = symbols_[bigram.left];
            auto & right_sym = symbols_[bigram.right];

            // if one of the symbols already got merged, skip it.
            // (stale queue entries are detected by the combined-length check)
            if (left_sym.n == 0 || right_sym.n == 0 ||
                left_sym.n + right_sym.n != bigram.size) {
                continue;
            }

            // merge the right sym into the left one
            left_sym.n += right_sym.n;
            right_sym.n = 0;

            //printf("left = '%*s' size = %zu\n", (int) left_sym.n, left_sym.text, bigram.size);

            // remove the right sym from the chain
            left_sym.next = right_sym.next;
            if (right_sym.next >= 0) {
                symbols_[right_sym.next].prev = bigram.left;
            }

            // find more substitutions
            try_add_bigram(left_sym.prev, bigram.left);
            try_add_bigram(bigram.left, left_sym.next);
        }

        // walk the remaining chain and emit token ids
        for (int i = 0; i != -1; i = symbols_[i].next) {
            auto & symbol = symbols_[i];
            auto token = vocab_.token_to_id.find(std::string(symbol.text, symbol.n));

            if (token == vocab_.token_to_id.end()) {
                // output any symbols that did not form tokens as bytes.
                for (int j = 0; j < (int) symbol.n; ++j) {
                    // byte value b maps to id b + 3 (ids 0..2 presumably
                    // reserved for special tokens — TODO confirm)
                    llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
                    output.push_back(token_id);
                }
            } else {
                output.push_back((*token).second);
            }
        }
    }

private:
    // Push (left, right) onto the work queue if their concatenated text is
    // a known, in-range vocab token; otherwise do nothing. A -1 index means
    // "no neighbor" and is ignored.
    void try_add_bigram(int left, int right) {
        if (left == -1 || right == -1) {
            return;
        }

        // the merged piece starts at the left symbol and spans both symbols
        const std::string text = std::string(symbols_[left].text, symbols_[left].n + symbols_[right].n);
        auto token = vocab_.token_to_id.find(text);

        if (token == vocab_.token_to_id.end()) {
            return;
        }

        // defensive: a vocab id beyond the score table would be unusable
        if (static_cast<size_t>((*token).second) >= vocab_.id_to_token.size()) {
            return;
        }

        const auto &tok_score = vocab_.id_to_token[(*token).second];

        llama_sp_bigram bigram;
        bigram.left = left;
        bigram.right = right;
        bigram.score = tok_score.score;
        bigram.size = text.size();
        work_queue_.push(bigram);
    }

    const llama_vocab & vocab_;          // vocabulary used for lookups (non-owning)
    std::vector<llama_sp_symbol> symbols_; // merge chain over the input text
    llama_sp_bigram::queue work_queue_;  // candidate merges, best score first
};
1150
+
1151
+ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, const std::string & text, bool bos) {
1152
+ llama_tokenizer tokenizer(vocab);
1153
+ std::vector<llama_vocab::id> output;
1154
+
1155
+ if (text.size() == 0) {
1156
+ return output;
1157
+ }
1158
+
1159
+ if (bos) {
1160
+ output.push_back(1);
1161
+ }
1162
+
1163
+ tokenizer.tokenize(text, output);
1164
+ return output;
1165
+ }
1166
+
1167
+ //
1168
+ // sampling
1169
+ //
1170
+
1171
+ static void sample_top_k(std::vector<std::pair<float, llama_vocab::id>> & logits_id, int top_k) {
1172
+ // find the top k tokens
1173
+ std::partial_sort(
1174
+ logits_id.begin(),
1175
+ logits_id.begin() + top_k, logits_id.end(),
1176
+ [](const std::pair<float, llama_vocab::id> & a, const std::pair<float, llama_vocab::id> & b) {
1177
+ return a.first > b.first;
1178
+ });
1179
+
1180
+ logits_id.resize(top_k);
1181
+ }
1182
+
1183
// Sample one token id from the logits of the most recently evaluated
// token. Pipeline: repetition penalty -> temperature scaling -> top-k
// filter -> top-p (nucleus) filter -> draw from the renormalized
// distribution with the context RNG.
//
// NOTE(review): assumes temp > 0 (scale below divides by it) and that
// lctx.logits holds at least n_vocab entries — callers are expected to
// guarantee both.
static llama_vocab::id llama_sample_top_p_top_k(
        llama_context & lctx,
        const std::vector<llama_vocab::id> & last_n_tokens,
        int top_k,
        float top_p,
        float temp,
        float repeat_penalty) {
    auto & rng = lctx.rng;

    const int n_logits = lctx.model.hparams.n_vocab;

    const auto & logits = lctx.logits;
    // logits for the last evaluated token sit at the tail of the buffer
    const auto * plogits = logits.data() + logits.size() - n_logits;

    std::vector<std::pair<float, llama_vocab::id>> logits_id;
    logits_id.reserve(n_logits);

    {
        const float scale = 1.0f/temp;
        for (int i = 0; i < n_logits; ++i) {
            // repetition penalty from ctrl paper (https://arxiv.org/abs/1909.05858)
            // credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main
            if (std::find(last_n_tokens.begin(), last_n_tokens.end(), i) != last_n_tokens.end()) {
                // if score < 0 then repetition penalty has to multiplied to reduce the previous token probability
                if (plogits[i] < 0.0f) {
                    logits_id.push_back(std::make_pair(plogits[i]*scale*repeat_penalty, i));
                } else {
                    logits_id.push_back(std::make_pair(plogits[i]*scale/repeat_penalty, i));
                }
            } else {
                logits_id.push_back(std::make_pair(plogits[i]*scale, i));
            }
        }
    }

    sample_top_k(logits_id, top_k);

    // softmax over the surviving logits, numerically stabilized by
    // subtracting the maximum before exponentiating
    float maxl = -std::numeric_limits<float>::infinity();
    for (const auto & kv : logits_id) {
        maxl = Max(maxl, kv.first);
    }

    // compute probs for the top k tokens
    std::vector<float> probs;
    probs.reserve(logits_id.size());

    double sum = 0.0;
    for (const auto & kv : logits_id) {
        const float p = expf(kv.first - maxl);
        probs.push_back(p);
        sum += p;
    }

    // normalize the probs
    for (auto & p : probs) {
        p /= sum;
    }

    // nucleus filtering: keep the smallest prefix of tokens whose
    // cumulative probability reaches top_p, then renormalize
    if (top_p < 1.0) {
        double cumsum = 0.0;
        for (int i = 0; i < (int) probs.size(); i++) {
            cumsum += probs[i];
            if (cumsum >= top_p) {
                probs.resize(i + 1);
                logits_id.resize(i + 1);
                break;
            }
        }

        cumsum = 1.0/cumsum;
        for (int i = 0; i < (int) probs.size(); i++) {
            probs[i] *= cumsum;
        }
    }

    //printf("\n");
    //for (int i = 0; i < (int) 10; i++) {
    //    printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]);
    //}
    //printf("\n\n");
    //exit(0);

    // draw an index proportionally to the filtered probabilities
    std::discrete_distribution<> dist(probs.begin(), probs.end());
    int idx = dist(rng);

    return logits_id[idx].second;
}
1270
+
1271
+ //
1272
+ // quantization
1273
+ //
1274
+
1275
+ // TODO: reuse code from the llama_model_load() somehow
1276
+ static bool llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, int itype) {
1277
+ ggml_type type = GGML_TYPE_Q4_1;
1278
+
1279
+ switch (itype) {
1280
+ case 2: type = GGML_TYPE_Q4_0; break;
1281
+ case 3: type = GGML_TYPE_Q4_1; break;
1282
+ default: fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype); return 1;
1283
+ };
1284
+
1285
+ if (type != GGML_TYPE_Q4_0 && type != GGML_TYPE_Q4_1) {
1286
+ fprintf(stderr, "%s: invalid quantization type %d\n", __func__, type);
1287
+ return false;
1288
+ }
1289
+
1290
+ llama_vocab vocab;
1291
+
1292
+ printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
1293
+
1294
+ auto finp = std::ifstream(fname_inp, std::ios::binary);
1295
+ if (!finp) {
1296
+ fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__, fname_inp.c_str());
1297
+ return false;
1298
+ }
1299
+
1300
+ auto fout = std::ofstream(fname_out, std::ios::binary);
1301
+ if (!fout) {
1302
+ fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname_out.c_str());
1303
+ return false;
1304
+ }
1305
+
1306
+ // verify magic
1307
+ {
1308
+ uint32_t magic;
1309
+ finp.read((char *) &magic, sizeof(magic));
1310
+ if (magic == LLAMA_FILE_MAGIC_UNVERSIONED) {
1311
+ fprintf(stderr, "%s: invalid model file '%s' (too old, regenerate your model files!)\n",
1312
+ __func__, fname_inp.c_str());
1313
+ return false;
1314
+ }
1315
+ if (magic != LLAMA_FILE_MAGIC) {
1316
+ return report_bad_magic(fname_inp.c_str(), magic, LLAMA_FILE_MAGIC);
1317
+ }
1318
+
1319
+ fout.write((char *) &magic, sizeof(magic));
1320
+
1321
+ uint32_t format_version;
1322
+ finp.read((char *) &format_version, sizeof(format_version));
1323
+
1324
+ if (format_version != LLAMA_FILE_VERSION) {
1325
+ fprintf(stderr, "%s: invalid model file '%s' (unsupported format version %" PRIu32 ", expected %d)\n",
1326
+ __func__, fname_inp.c_str(), format_version, LLAMA_FILE_VERSION);
1327
+ return false;
1328
+ }
1329
+
1330
+ fout.write((char *) &format_version, sizeof(format_version));
1331
+ }
1332
+
1333
+ llama_hparams hparams;
1334
+
1335
+ // load hparams
1336
+ {
1337
+ finp.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
1338
+ //finp.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
1339
+ finp.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
1340
+ finp.read((char *) &hparams.n_mult, sizeof(hparams.n_mult));
1341
+ finp.read((char *) &hparams.n_head, sizeof(hparams.n_head));
1342
+ finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
1343
+ finp.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
1344
+ finp.read((char *) &hparams.f16, sizeof(hparams.f16));
1345
+
1346
+ printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
1347
+ printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
1348
+ printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
1349
+ printf("%s: n_mult = %d\n", __func__, hparams.n_mult);
1350
+ printf("%s: n_head = %d\n", __func__, hparams.n_head);
1351
+ printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
1352
+ printf("%s: f16 = %d\n", __func__, hparams.f16);
1353
+
1354
+ fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
1355
+ //fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
1356
+ fout.write((char *) &hparams.n_embd, sizeof(hparams.n_embd));
1357
+ fout.write((char *) &hparams.n_mult, sizeof(hparams.n_mult));
1358
+ fout.write((char *) &hparams.n_head, sizeof(hparams.n_head));
1359
+ fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
1360
+ fout.write((char *) &hparams.n_rot, sizeof(hparams.n_rot));
1361
+ fout.write((char *) &itype, sizeof(hparams.f16));
1362
+ }
1363
+
1364
+ // load vocab
1365
+ {
1366
+ const int32_t n_vocab = hparams.n_vocab;
1367
+
1368
+ if (n_vocab != hparams.n_vocab) {
1369
+ fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
1370
+ __func__, fname_inp.c_str(), n_vocab, hparams.n_vocab);
1371
+ return false;
1372
+ }
1373
+
1374
+ std::vector<char> word(32);
1375
+ vocab.id_to_token.resize(n_vocab);
1376
+ for (int i = 0; i < n_vocab; i++) {
1377
+ uint32_t len;
1378
+ finp.read ((char *) &len, sizeof(len));
1379
+ fout.write((char *) &len, sizeof(len));
1380
+
1381
+ word.resize(len);
1382
+ finp.read ((char *) &word[0], len);
1383
+ fout.write((char *) &word[0], len);
1384
+
1385
+ float score;
1386
+ finp.read ((char *) &score, sizeof(score));
1387
+ fout.write((char *) &score, sizeof(score));
1388
+
1389
+ vocab.token_to_id[word.data()] = i;
1390
+
1391
+ auto &tok_score = vocab.id_to_token[i];
1392
+ tok_score.tok = word.data();
1393
+ tok_score.score = score;
1394
+ }
1395
+ }
1396
+
1397
+ // load weights
1398
+ {
1399
+ size_t total_size_org = 0;
1400
+ size_t total_size_new = 0;
1401
+
1402
+ std::vector<float> work;
1403
+
1404
+ std::vector<uint8_t> data_u8;
1405
+ std::vector<ggml_fp16_t> data_f16;
1406
+ std::vector<float> data_f32;
1407
+
1408
+ std::vector<int64_t> hist_all(1 << 4, 0);
1409
+
1410
+ while (true) {
1411
+ int32_t n_dims;
1412
+ int32_t length;
1413
+ int32_t ftype;
1414
+
1415
+ finp.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
1416
+ finp.read(reinterpret_cast<char *>(&length), sizeof(length));
1417
+ finp.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
1418
+
1419
+ if (finp.eof()) {
1420
+ break;
1421
+ }
1422
+
1423
+ int32_t nelements = 1;
1424
+ int32_t ne[2] = { 1, 1 };
1425
+ for (int i = 0; i < n_dims; ++i) {
1426
+ finp.read (reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
1427
+ nelements *= ne[i];
1428
+ }
1429
+
1430
+ std::string name(length, 0);
1431
+ finp.read (&name[0], length);
1432
+
1433
+ {
1434
+ // ensure tensor data is aligned
1435
+ uint64_t offset = finp.tellg();
1436
+ offset = (offset + 31) & -32;
1437
+ finp.seekg(offset);
1438
+ }
1439
+
1440
+ {
1441
+ static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
1442
+ printf("%48s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ftype_str[ftype]);
1443
+ }
1444
+
1445
+ // regexes of tensor names to be quantized
1446
+ const std::vector<std::string> k_names = {
1447
+ ".*weight",
1448
+ };
1449
+
1450
+ bool quantize = false;
1451
+ for (const auto & s : k_names) {
1452
+ if (std::regex_match(name, std::regex(s))) {
1453
+ quantize = true;
1454
+ break;
1455
+ }
1456
+ }
1457
+
1458
+ // quantize only 2D tensors
1459
+ quantize &= (n_dims == 2);
1460
+
1461
+ if (quantize) {
1462
+ if (ftype != 0 && ftype != 1) {
1463
+ fprintf(stderr, "%s: unsupported ftype %d for integer quantization\n", __func__, ftype);
1464
+ return false;
1465
+ }
1466
+
1467
+ if (ftype == 1) {
1468
+ data_f16.resize(nelements);
1469
+ finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
1470
+ data_f32.resize(nelements);
1471
+ for (int i = 0; i < nelements; ++i) {
1472
+ data_f32[i] = ggml_fp16_to_fp32(data_f16[i]);
1473
+ }
1474
+ } else {
1475
+ data_f32.resize(nelements);
1476
+ finp.read(reinterpret_cast<char *>(data_f32.data()), nelements * sizeof(float));
1477
+ }
1478
+
1479
+ ftype = itype;
1480
+ } else {
1481
+ const int bpe = (ftype == 0) ? sizeof(float) : sizeof(uint16_t);
1482
+
1483
+ data_u8.resize(nelements*bpe);
1484
+ finp.read(reinterpret_cast<char *>(data_u8.data()), nelements * bpe);
1485
+ }
1486
+
1487
+ fout.write(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
1488
+ fout.write(reinterpret_cast<char *>(&length), sizeof(length));
1489
+ fout.write(reinterpret_cast<char *>(&ftype), sizeof(ftype));
1490
+ for (int i = 0; i < n_dims; ++i) {
1491
+ fout.write(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
1492
+ }
1493
+ fout.write(&name[0], length);
1494
+
1495
+ {
1496
+ // ensure tensor data is aligned
1497
+ uint64_t offset = fout.tellp();
1498
+ offset = (offset + 31) & -32;
1499
+ fout.seekp(offset);
1500
+ }
1501
+
1502
+ if (quantize) {
1503
+ printf("quantizing .. ");
1504
+ work.resize(nelements); // for quantization
1505
+
1506
+ size_t cur_size = 0;
1507
+ std::vector<int64_t> hist_cur(1 << 4, 0);
1508
+
1509
+ switch (type) {
1510
+ case GGML_TYPE_Q4_0:
1511
+ {
1512
+ cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
1513
+ } break;
1514
+ case GGML_TYPE_Q4_1:
1515
+ {
1516
+ cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
1517
+ } break;
1518
+ default:
1519
+ {
1520
+ fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, type);
1521
+ return false;
1522
+ }
1523
+ }
1524
+
1525
+ fout.write(reinterpret_cast<char *>(work.data()), cur_size);
1526
+ total_size_new += cur_size;
1527
+
1528
+ printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
1529
+ for (int i = 0; i < (int) hist_cur.size(); ++i) {
1530
+ hist_all[i] += hist_cur[i];
1531
+ }
1532
+
1533
+ for (int i = 0; i < (int) hist_cur.size(); ++i) {
1534
+ printf("%5.3f ", hist_cur[i] / float(nelements));
1535
+ }
1536
+ printf("\n");
1537
+ } else {
1538
+ printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0);
1539
+ fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size());
1540
+ total_size_new += data_u8.size();
1541
+ }
1542
+
1543
+ total_size_org += nelements * sizeof(float);
1544
+ }
1545
+
1546
+ printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
1547
+ printf("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
1548
+
1549
+ {
1550
+ int64_t sum_all = 0;
1551
+ for (int i = 0; i < (int) hist_all.size(); ++i) {
1552
+ sum_all += hist_all[i];
1553
+ }
1554
+
1555
+ printf("%s: hist: ", __func__);
1556
+ for (int i = 0; i < (int) hist_all.size(); ++i) {
1557
+ printf("%5.3f ", hist_all[i] / float(sum_all));
1558
+ }
1559
+ printf("\n");
1560
+ }
1561
+ }
1562
+
1563
+ finp.close();
1564
+ fout.close();
1565
+
1566
+ return true;
1567
+ }
1568
+
1569
+ //
1570
+ // interface implementation
1571
+ //
1572
+
1573
// Load the model at `path_model` and build a ready-to-use llama_context:
// RNG, KV self-attention cache, logits/embedding buffers and scratch
// buffers. Returns nullptr on any failure (the partial context is freed).
struct llama_context * llama_init_from_file(
        const char * path_model,
        struct llama_context_params params) {
    ggml_time_init();

    llama_context * ctx = new llama_context;

    // seed <= 0 means "pick one from the clock"
    if (params.seed <= 0) {
        params.seed = time(NULL);
    }

    ctx->rng = std::mt19937(params.seed);
    ctx->logits_all = params.logits_all;

    // KV cache entries can be stored as f16 to halve their memory use
    ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;

    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_parts, memory_type,
                          params.vocab_only, params.progress_callback,
                          params.progress_callback_user_data)) {
        fprintf(stderr, "%s: failed to load model\n", __func__);
        llama_free(ctx);
        return nullptr;
    }

    // optionally pin the mapped model pages so they cannot be swapped out
    if (params.use_mlock) {
        char *err;
        if (!ggml_mlock(ctx->model.ctx,
                        ctx->model.mm_addr,
                        ctx->model.mm_length,
                        &err)) {
            fprintf(stderr, "%s\n", err);
            free(err); // error string is heap-allocated by ggml_mlock
            llama_free(ctx);
            return nullptr;
        }
    }

    // reserve memory for context buffers
    {
        if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx)) {
            fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
            llama_free(ctx);
            return nullptr;
        }

        {
            const size_t memory_size = ggml_nbytes(ctx->model.kv_self.k) + ggml_nbytes(ctx->model.kv_self.v);
            fprintf(stderr, "%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
        }

        const auto & hparams = ctx->model.hparams;

        // resized during inference
        // NOTE(review): the else-branch reserves only n_ctx floats although
        // one token's logits occupy n_vocab — harmless since reserve() is a
        // hint and eval resizes to the real size, but looks unintended
        if (params.logits_all) {
            ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab);
        } else {
            ctx->logits.reserve(hparams.n_ctx);
        }

        // a non-empty embedding buffer is what enables embedding extraction
        // during eval
        if (params.embedding) {
            ctx->embedding.resize(hparams.n_embd);
        }

        // buffer sizes are keyed by model size (7B/13B/...), tuned for
        // n_ctx == 2048
        ctx->buf_compute.resize(MEM_REQ_EVAL.at(ctx->model.type));

        ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0.at(ctx->model.type));
        ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1.at(ctx->model.type));
    }

    return ctx;
}
1644
+
1645
+ void llama_free(struct llama_context * ctx) {
1646
+ kv_cache_free(ctx->model.kv_self);
1647
+
1648
+ if (ctx->model.ctx) {
1649
+ ggml_free(ctx->model.ctx);
1650
+ }
1651
+
1652
+ if (ctx->model.mm_addr) {
1653
+ munmap_file(ctx->model.mm_addr, ctx->model.mm_length);
1654
+ }
1655
+
1656
+ delete ctx;
1657
+ }
1658
+
1659
+ int llama_model_quantize(
1660
+ const char * fname_inp,
1661
+ const char * fname_out,
1662
+ int itype) {
1663
+ if (!llama_model_quantize_internal(fname_inp, fname_out, itype)) {
1664
+ fprintf(stderr, "%s: failed to quantize\n", __func__);
1665
+ return 1;
1666
+ }
1667
+
1668
+ return 0;
1669
+ }
1670
+
1671
+ int llama_eval(
1672
+ struct llama_context * ctx,
1673
+ const llama_token * tokens,
1674
+ int n_tokens,
1675
+ int n_past,
1676
+ int n_threads) {
1677
+ if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads)) {
1678
+ fprintf(stderr, "%s: failed to eval\n", __func__);
1679
+ return 1;
1680
+ }
1681
+ // get a more accurate load time, upon first eval
1682
+ if (!ctx->has_evaluated_once) {
1683
+ ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
1684
+ ctx->has_evaluated_once = true;
1685
+ }
1686
+ return 0;
1687
+ }
1688
+
1689
+ int llama_tokenize(
1690
+ struct llama_context * ctx,
1691
+ const char * text,
1692
+ llama_token * tokens,
1693
+ int n_max_tokens,
1694
+ bool add_bos) {
1695
+ auto res = llama_tokenize(ctx->vocab, text, add_bos);
1696
+
1697
+ if (n_max_tokens < (int) res.size()) {
1698
+ fprintf(stderr, "%s: too many tokens\n", __func__);
1699
+ return -((int) res.size());
1700
+ }
1701
+
1702
+ for (size_t i = 0; i < res.size(); i++) {
1703
+ tokens[i] = res[i];
1704
+ }
1705
+
1706
+ return res.size();
1707
+ }
1708
+
1709
// Number of tokens in the loaded vocabulary.
int llama_n_vocab(struct llama_context * ctx) {
    return ctx->vocab.id_to_token.size();
}
1712
+
1713
// Context length (in tokens) the model was initialized with.
int llama_n_ctx(struct llama_context * ctx) {
    return ctx->model.hparams.n_ctx;
}
1716
+
1717
// Embedding dimension of the loaded model.
int llama_n_embd(struct llama_context * ctx) {
    return ctx->model.hparams.n_embd;
}
1720
+
1721
// Pointer to the logits produced by the last llama_eval() call.
// Owned by the context; invalidated by the next eval.
float * llama_get_logits(struct llama_context * ctx) {
    return ctx->logits.data();
}
1724
+
1725
// Pointer to the embedding vector from the last llama_eval() call.
// Owned by the context; may point at an empty buffer if the context was
// created without params.embedding.
float * llama_get_embeddings(struct llama_context * ctx) {
    return ctx->embedding.data();
}
1728
+
1729
+ const char * llama_token_to_str(struct llama_context * ctx, llama_token token) {
1730
+ if (token >= llama_n_vocab(ctx)) {
1731
+ return nullptr;
1732
+ }
1733
+
1734
+ return ctx->vocab.id_to_token[token].tok.c_str();
1735
+ }
1736
+
1737
// Id of the beginning-of-sentence token (fixed by the llama vocab layout).
llama_token llama_token_bos() {
    return 1;
}
1740
+
1741
// Id of the end-of-sentence token (fixed by the llama vocab layout).
llama_token llama_token_eos() {
    return 2;
}
1744
+
1745
+ llama_token llama_sample_top_p_top_k(
1746
+ llama_context * ctx,
1747
+ const llama_token * last_n_tokens_data,
1748
+ int last_n_tokens_size,
1749
+ int top_k,
1750
+ float top_p,
1751
+ float temp,
1752
+ float repeat_penalty) {
1753
+ const int64_t t_start_sample_us = ggml_time_us();
1754
+
1755
+ llama_token result = 0;
1756
+
1757
+ // TODO: avoid this ...
1758
+ const auto last_n_tokens = std::vector<llama_token>(last_n_tokens_data, last_n_tokens_data + last_n_tokens_size);
1759
+
1760
+ result = llama_sample_top_p_top_k(
1761
+ *ctx,
1762
+ last_n_tokens,
1763
+ top_k,
1764
+ top_p,
1765
+ temp,
1766
+ repeat_penalty);
1767
+
1768
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
1769
+ ctx->n_sample++;
1770
+
1771
+ return result;
1772
+ }
1773
+
1774
+
1775
+ void llama_print_timings(struct llama_context * ctx) {
1776
+ const int64_t t_end_us = ggml_time_us();
1777
+
1778
+ const int32_t n_sample = Max(1, ctx->n_sample);
1779
+ const int32_t n_eval = Max(1, ctx->n_eval);
1780
+ const int32_t n_p_eval = Max(1, ctx->n_p_eval);
1781
+
1782
+ fprintf(stderr, "\n");
1783
+ fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
1784
+ fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample);
1785
+ fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval);
1786
+ fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval);
1787
+ fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0);
1788
+ }
1789
+
1790
+ void llama_reset_timings(struct llama_context * ctx) {
1791
+ ctx->t_start_us = ggml_time_us();
1792
+ ctx->t_sample_us = ctx->n_sample = 0;
1793
+ ctx->t_eval_us = ctx->n_eval = 0;
1794
+ ctx->t_p_eval_us = ctx->n_p_eval = 0;
1795
+ }
1796
+
1797
+ const char * llama_print_system_info(void) {
1798
+ static std::string s;
1799
+
1800
+ s = "";
1801
+ s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
1802
+ s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
1803
+ s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
1804
+ s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
1805
+ s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
1806
+ s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
1807
+ s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
1808
+ s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
1809
+ s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
1810
+ s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
1811
+ s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
1812
+ s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
1813
+
1814
+ return s.c_str();
1815
+ }