llama_cpp 0.0.1 → 0.0.3

This diff shows the changes between publicly released versions of this package as they appear in its public registry. It is provided for informational purposes only.
@@ -12,6 +12,19 @@
  #include <cassert>
  #include <cstring>
 
+ #if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
+ #define WIN32_LEAN_AND_MEAN
+ #include <Windows.h>
+ #else
+ #include <sys/types.h>
+ #include <sys/mman.h>
+ #include <unistd.h>
+ #include <fcntl.h>
+ #endif
+
+ #define Min(X, Y) ((Y) > (X) ? (X) : (Y))
+ #define Max(X, Y) ((Y) < (X) ? (X) : (Y))
+
  #define LLAMA_USE_SCRATCH
  #define LLAMA_MAX_SCRATCH_BUFFERS 16
 
@@ -142,6 +155,10 @@ struct llama_model {
  // the model memory buffer
  std::vector<uint8_t> buf;
 
+ // model memory mapped file
+ void * mm_addr = NULL;
+ uint64_t mm_length = 0;
+
  // tensors
  int n_loaded;
  std::unordered_map<std::string, struct ggml_tensor *> tensors;
@@ -165,6 +182,7 @@ struct llama_context {
 
  int64_t t_load_us = 0;
  int64_t t_start_us = 0;
+ bool has_evaluated_once = false;
 
  int64_t t_sample_us = 0;
  int64_t t_eval_us = 0;
@@ -206,7 +224,7 @@ struct llama_context {
  }
 
  if (buf_last >= 0) {
- buf_max_size[buf_last] = std::max(buf_max_size[buf_last], last_size);
+ buf_max_size[buf_last] = Max(buf_max_size[buf_last], last_size);
  }
 
  buf_last = i;
@@ -238,14 +256,15 @@ static bool kv_cache_init(
  const int n_embd = hparams.n_embd;
  const int n_layer = hparams.n_layer;
 
- const int n_mem = n_layer*n_ctx;
- const int n_elements = n_embd*n_mem;
+ const int64_t n_mem = (int64_t)n_layer*n_ctx;
+ const int64_t n_elements = n_embd*n_mem;
 
  cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
 
  struct ggml_init_params params;
  params.mem_size = cache.buf.size();
  params.mem_buffer = cache.buf.data();
+ params.no_alloc = false;
 
  cache.ctx = ggml_init(params);
 
@@ -288,6 +307,58 @@ struct llama_context_params llama_context_default_params() {
  // model loading
  //
 
+ static void *mmap_file(const char *fname, uint64_t *mm_length) {
+ #if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
+ HANDLE hFile = CreateFileA(fname,
+ GENERIC_READ,
+ FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
+ NULL,
+ OPEN_EXISTING,
+ FILE_ATTRIBUTE_NORMAL | FILE_ATTRIBUTE_NOT_CONTENT_INDEXED,
+ NULL);
+ if (hFile == INVALID_HANDLE_VALUE) return 0;
+ LARGE_INTEGER fileSize;
+ fileSize.QuadPart = -1;
+ GetFileSizeEx(hFile, &fileSize);
+ int64_t length = fileSize.QuadPart;
+ HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
+ CloseHandle(hFile);
+ if (!hMapping) return 0;
+ void *addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
+ CloseHandle(hMapping);
+ if (!addr) return 0;
+ #else
+ int fd = open(fname, O_RDONLY);
+ if (fd == -1) return 0;
+ int64_t length = lseek(fd, 0, SEEK_END);
+ void *addr = mmap(NULL, length, PROT_READ, MAP_SHARED, fd, 0);
+ close(fd);
+ if (addr == MAP_FAILED) return 0;
+ #endif
+ *mm_length = length;
+ return addr;
+ }
+
+ static void munmap_file(void * addr, size_t length) {
+ #if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
+ UnmapViewOfFile(addr);
+ #else
+ munmap(addr, length);
+ #endif
+ }
+
+ static bool report_bad_magic(const char *path, uint32_t got, uint32_t want) {
+ fprintf(stderr,
+ "%s: invalid model file (bad magic [got %#x want %#x])\n"
+ "\tyou most likely need to regenerate your ggml files\n"
+ "\tthe benefit is you'll get 10-100x faster load times\n"
+ "\tsee https://github.com/ggerganov/llama.cpp/issues/91\n"
+ "\tuse convert-pth-to-ggml.py to regenerate from original pth\n"
+ "\tuse migrate-ggml-2023-03-30-pr613.py if you deleted originals\n",
+ path, got, want);
+ return false;
+ }
+
  static bool llama_model_load(
  const std::string & fname,
  llama_context & lctx,
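
For orientation, the mmap_file/munmap_file helpers added in the hunk above map the whole model file read-only and later release the mapping. A minimal usage sketch, assuming the two helpers defined above; the path "model.bin" is a placeholder and the snippet is illustrative only, not part of the package:

    #include <cstdint>
    #include <cstdio>

    // assumes mmap_file()/munmap_file() as defined in the diff above
    int main() {
        uint64_t mm_length = 0;
        void * mm_addr = mmap_file("model.bin", &mm_length); // placeholder path; returns 0/NULL on failure
        if (mm_addr == NULL) {
            fprintf(stderr, "failed to mmap model file\n");
            return 1;
        }
        // the loader keeps the mapping for the lifetime of the model and points
        // each tensor's data pointer into it instead of copying tensor data
        munmap_file(mm_addr, mm_length);
        return 0;
    }
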
@@ -299,22 +370,24 @@ static bool llama_model_load(
  void *progress_callback_user_data) {
  fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
 
- const int64_t t_start_us = ggml_time_us();
-
- lctx.t_start_us = t_start_us;
-
- std::vector<char> f_buf(1024*1024);
+ lctx.t_start_us = ggml_time_us();
 
  auto & model = lctx.model;
  auto & vocab = lctx.vocab;
 
  auto fin = std::ifstream(fname, std::ios::binary);
- fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
  if (!fin) {
  fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
  return false;
  }
 
+ std::vector<char> f_buf(1024*1024);
+ fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
+
+ fin.seekg(0, fin.end);
+ const size_t file_size = fin.tellg();
+ fin.seekg(0);
+
  // verify magic
  {
  uint32_t magic;
@@ -325,8 +398,7 @@ static bool llama_model_load(
  return false;
  }
  if (magic != LLAMA_FILE_MAGIC) {
- fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
- return false;
+ return report_bad_magic(fname.c_str(), magic, LLAMA_FILE_MAGIC);
  }
 
  uint32_t format_version;
@@ -449,43 +521,24 @@ static bool llama_model_load(
  }
  }
 
+ // map model into memory
+ char *mm_addr = NULL;
+ model.mm_addr = mmap_file(fname.c_str(), &model.mm_length);
+ if (model.mm_addr == NULL) {
+ fprintf(stderr, "%s: failed to mmap '%s'\n", __func__, fname.c_str());
+ return false;
+ }
+ mm_addr = (char *)model.mm_addr;
+ fprintf(stderr, "%s: ggml map size = %6.2f MB\n", __func__, model.mm_length/(1024.0*1024.0));
+
  auto & ctx = model.ctx;
 
  size_t ctx_size = 0;
-
  {
- const auto & hparams = model.hparams;
-
- const int n_embd = hparams.n_embd;
+ const auto &hparams = model.hparams;
  const int n_layer = hparams.n_layer;
- const int n_ctx = hparams.n_ctx;
- const int n_vocab = hparams.n_vocab;
-
- ctx_size += n_embd*n_vocab*ggml_type_sizef(vtype); // tok_embeddings
-
- ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // norm
-
- ctx_size += n_embd*n_vocab*ggml_type_sizef(vtype); // output
-
- ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // attention_norm
-
- ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wq
- ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wk
- ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wv
- ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wo
-
- ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ffn_norm
-
- ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w1
- ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w2
- ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w3
-
- ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_k
- ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_v
-
  ctx_size += (5 + 10*n_layer)*256; // object overhead
-
- fprintf(stderr, "%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
+ fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0);
  }
 
  // print memory requirements
@@ -495,6 +548,7 @@ static bool llama_model_load(
  // this is the total memory required to run the inference
  const size_t mem_required =
  ctx_size +
+ model.mm_length +
  MEM_REQ_SCRATCH0.at(model.type) +
  MEM_REQ_SCRATCH1.at(model.type) +
  MEM_REQ_EVAL.at (model.type);
@@ -514,6 +568,7 @@ static bool llama_model_load(
  struct ggml_init_params params = {
  /*.mem_size =*/ lctx.model.buf.size(),
  /*.mem_buffer =*/ lctx.model.buf.data(),
+ /*.no_alloc =*/ true,
  };
 
  model.ctx = ggml_init(params);
@@ -576,234 +631,106 @@ static bool llama_model_load(
  }
  }
 
- const size_t file_offset = fin.tellg();
-
- fin.close();
-
  std::vector<uint8_t> tmp;
 
  if (progress_callback) {
  progress_callback(0.0, progress_callback_user_data);
  }
 
- for (int i = 0; i < n_parts; ++i) {
- const int part_id = i;
- //const int part_id = n_parts - i - 1;
+ fprintf(stderr, "%s: loading tensors from '%s'\n", __func__, fname.c_str());
 
- std::string fname_part = fname;
- if (i > 0) {
- fname_part += "." + std::to_string(i);
- }
-
- fprintf(stderr, "%s: loading model part %d/%d from '%s'\n", __func__, i+1, n_parts, fname_part.c_str());
+ // load weights
+ {
+ size_t total_size = 0;
+ model.n_loaded = 0;
 
- fin = std::ifstream(fname_part, std::ios::binary);
- fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
+ while (true) {
+ int32_t n_dims;
+ int32_t length;
+ int32_t ftype;
 
- fin.seekg(0, fin.end);
- const size_t file_size = fin.tellg();
+ fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
+ fin.read(reinterpret_cast<char *>(&length), sizeof(length));
+ fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
 
- fin.seekg(file_offset);
+ if (fin.eof()) {
+ break;
+ }
 
- // load weights
- {
- size_t total_size = 0;
+ int32_t nelements = 1;
+ int32_t ne[2] = { 1, 1 };
+ for (int i = 0; i < n_dims; ++i) {
+ fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
+ nelements *= ne[i];
+ }
 
- model.n_loaded = 0;
+ std::string name(length, 0);
+ fin.read(&name[0], length);
 
- fprintf(stderr, "%s: ", __func__);
+ if (model.tensors.find(name.data()) == model.tensors.end()) {
+ fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
+ return false;
+ }
 
- while (true) {
- int32_t n_dims;
- int32_t length;
- int32_t ftype;
+ auto tensor = model.tensors[name.data()];
 
- fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
- fin.read(reinterpret_cast<char *>(&length), sizeof(length));
- fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
+ if (ggml_nelements(tensor) != nelements) {
+ fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
+ return false;
+ }
+ if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
+ fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%" PRId64 ", %" PRId64 "], expected [%d, %d]\n",
+ __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
+ return false;
+ }
+ if (0) {
+ static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
+ fprintf(stderr, "%24s - [%5d, %5d], type = %6s\n", name.data(), ne[0], ne[1], ftype_str[ftype]);
+ }
 
- if (fin.eof()) {
+ switch (ftype) {
+ case 0: // f32
+ case 1: // f16
  break;
- }
-
- int32_t nelements = 1;
- int32_t ne[2] = { 1, 1 };
- for (int i = 0; i < n_dims; ++i) {
- fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
- nelements *= ne[i];
- }
-
- std::string name(length, 0);
- fin.read(&name[0], length);
-
- if (model.tensors.find(name.data()) == model.tensors.end()) {
- fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
+ case 2: // q4_0
+ case 3: // q4_1
+ assert(ne[0] % 64 == 0);
+ break;
+ default:
+ fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
  return false;
- }
-
- // split_type = 0: split by columns
- // split_type = 1: split by rows
- int split_type = 0;
-
- // split_type = 0:
- // regex:
- // - tok_embeddings.*
- // - layers.*.attention.wo.weight
- // - layers.*.feed_forward.w2.weight
-
- // split_type = 1:
- // regex:
- // - output.*
- // - layers.*.attention.wq.weight
- // - layers.*.attention.wk.weight
- // - layers.*.attention.wv.weight
- // - layers.*.feed_forward.w1.weight
- // - layers.*.feed_forward.w3.weight
- if (name.find("tok_embeddings") != std::string::npos) {
- split_type = 0;
- } else if (name.find("layers") != std::string::npos) {
- if (name.find("attention.wo.weight") != std::string::npos) {
- split_type = 0;
- } else if (name.find("feed_forward.w2.weight") != std::string::npos) {
- split_type = 0;
- } else {
- split_type = 1;
- }
- } else if (name.find("output") != std::string::npos) {
- split_type = 1;
- }
-
- auto tensor = model.tensors[name.data()];
-
- if (n_dims == 1) {
- if (ggml_nelements(tensor) != nelements) {
- fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
- return false;
- }
- } else {
- if (ggml_nelements(tensor)/n_parts != nelements) {
- fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
- return false;
- }
- }
-
- if (n_dims == 1) {
- if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
- fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
- __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
- return false;
- }
- } else {
- if (split_type == 0) {
- if (tensor->ne[0]/n_parts != ne[0] || tensor->ne[1] != ne[1]) {
- fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
- __func__, name.data(), tensor->ne[0]/n_parts, tensor->ne[1], ne[0], ne[1]);
- return false;
- }
- } else {
- if (tensor->ne[0] != ne[0] || tensor->ne[1]/n_parts != ne[1]) {
- fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
- __func__, name.data(), tensor->ne[0], tensor->ne[1]/n_parts, ne[0], ne[1]);
- return false;
- }
- }
- }
-
- if (0) {
- static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
- fprintf(stderr, "%24s - [%5d, %5d], type = %6s, split = %d\n", name.data(), ne[0], ne[1], ftype_str[ftype], split_type);
- }
-
- size_t bpe = 0;
-
- switch (ftype) {
- case 0: bpe = ggml_type_size(GGML_TYPE_F32); break;
- case 1: bpe = ggml_type_size(GGML_TYPE_F16); break;
- case 2: bpe = ggml_type_size(GGML_TYPE_Q4_0); assert(ne[0] % 64 == 0); break;
- case 3: bpe = ggml_type_size(GGML_TYPE_Q4_1); assert(ne[0] % 64 == 0); break;
- default:
- {
- fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
- return false;
- }
- };
-
- if (n_dims == 1 || n_parts == 1) {
- if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
- fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
- __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
- return false;
- }
-
- if (part_id == 0) {
- fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
- } else {
- fin.seekg(ggml_nbytes(tensor), std::ios::cur);
- }
-
- total_size += ggml_nbytes(tensor);
- } else {
- if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)/n_parts) {
- fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
- __func__, name.data(), ggml_nbytes(tensor)/n_parts, nelements*bpe);
- return false;
- }
-
- if (split_type == 0) {
- const int np0 = ne[0];
-
- const size_t row_size = (tensor->ne[0]/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);
- assert(row_size == tensor->nb[1]);
-
- for (int i1 = 0; i1 < ne[1]; ++i1) {
- const size_t offset_row = i1*row_size;
- const size_t offset = offset_row + ((part_id*np0)/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);
- fin.read(reinterpret_cast<char *>(tensor->data) + offset, row_size/n_parts);
- }
- } else {
- const int np1 = ne[1];
-
- const size_t row_size = (tensor->ne[0]/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);
-
- for (int i1 = 0; i1 < ne[1]; ++i1) {
- const size_t offset_row = (i1 + part_id*np1)*row_size;
- fin.read(reinterpret_cast<char *>(tensor->data) + offset_row, row_size);
- }
- }
-
- total_size += ggml_nbytes(tensor)/n_parts;
- }
-
- //fprintf(stderr, "%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
- model.n_loaded++;
-
- // progress
- if (progress_callback) {
- float current_file_progress = float(size_t(fin.tellg()) - file_offset) / float(file_size - file_offset);
- float current_progress = (float(i) + current_file_progress) / float(n_parts);
- progress_callback(current_progress, progress_callback_user_data);
- }
- if (model.n_loaded % 8 == 0) {
- fprintf(stderr, ".");
- fflush(stderr);
- }
- }
-
- fprintf(stderr, " done\n");
+ };
 
- fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, model.n_loaded);
- if (model.n_loaded == 0) {
- fprintf(stderr, "%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__);
- } else if (model.n_loaded != (int) model.tensors.size()) {
- fprintf(stderr, "%s: ERROR not all tensors loaded from model file - expected %zu, got %d\n", __func__, model.tensors.size(), model.n_loaded);
- return false;
+ // load the tensor data into memory without copying or reading it
+ size_t offset = fin.tellg();
+ size_t tensor_data_size = ggml_nbytes(tensor);
+ offset = (offset + 31) & -32;
+ tensor->data = mm_addr + offset;
+ fin.seekg(offset + tensor_data_size);
+ total_size += tensor_data_size;
+ model.n_loaded++;
+
+ // progress
+ if (progress_callback) {
+ double current_progress = size_t(fin.tellg()) / double(file_size);
+ progress_callback(current_progress, progress_callback_user_data);
  }
  }
 
  fin.close();
+
+ fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, model.n_loaded);
+ if (model.n_loaded == 0) {
+ fprintf(stderr, "%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__);
+ } else if (model.n_loaded != (int) model.tensors.size()) {
+ fprintf(stderr, "%s: ERROR not all tensors loaded from model file - expected %zu, got %d\n", __func__, model.tensors.size(), model.n_loaded);
+ return false;
+ }
  }
 
- lctx.t_load_us = ggml_time_us() - t_start_us;
+ // loading time will be recalculate after the first eval, so
+ // we take page faults deferred by mmap() into consideration
+ lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
 
  if (progress_callback) {
  progress_callback(1.0, progress_callback_user_data);
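
Note on the alignment step in the hunk above: each tensor's file offset is rounded up to a 32-byte boundary via (offset + 31) & -32 before tensor->data is pointed into the mapping. A small, self-contained illustration of that rounding expression (not part of the diff):

    #include <cstdio>

    int main() {
        // (offset + 31) & -32 rounds an offset up to the next multiple of 32,
        // because -32 in two's complement equals ~31 (all bits set except the low five)
        unsigned long long offsets[] = { 0, 1, 31, 32, 1000 };
        for (unsigned long long offset : offsets) {
            unsigned long long aligned = (offset + 31) & ~31ULL; // same mask as & -32
            printf("%llu -> %llu\n", offset, aligned);           // 0->0, 1->32, 31->32, 32->32, 1000->1024
        }
        return 0;
    }
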
@@ -849,6 +776,7 @@ static bool llama_eval_internal(
  struct ggml_init_params params = {
  /*.mem_size =*/ buf_compute.size(),
  /*.mem_buffer =*/ buf_compute.data(),
+ /*.no_alloc =*/ false,
  };
 
  struct ggml_context * ctx0 = ggml_init(params);
@@ -856,7 +784,7 @@ static bool llama_eval_internal(
  // for big prompts, if BLAS is enabled, it is better to use only one thread
  // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
  ggml_cgraph gf = {};
- gf.n_threads = N > 255 && ggml_cpu_has_blas() ? 1 : n_threads;
+ gf.n_threads = N >= 32 && ggml_cpu_has_blas() ? 1 : n_threads;
 
  struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
  memcpy(embd->data, tokens, N*ggml_element_size(embd));
@@ -882,37 +810,35 @@ static bool llama_eval_internal(
 
  // self-attention
  {
- struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
- struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
- struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+ // compute Q and K and RoPE them
+ struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
+ struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
 
  // store key and value to memory
- if (N >= 1) {
+ {
+ // compute the transposed [N, n_embd] V matrix
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), n_embd, N));
+
  struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
- struct ggml_tensor * v = ggml_view_1d(ctx0, kv_self.v, N*n_embd, (ggml_element_size(kv_self.v)*n_embd)*(il*n_ctx + n_past));
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
+ ( n_ctx)*ggml_element_size(kv_self.v),
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
 
+ // important: storing RoPE-ed version of K in the KV cache!
  ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
  ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
  }
 
- // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
  struct ggml_tensor * Q =
  ggml_permute(ctx0,
- ggml_rope(ctx0,
- ggml_cpy(ctx0,
- Qcur,
- ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
- n_past, n_rot, 0),
+ Qcur,
  0, 2, 1, 3);
 
- // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
  struct ggml_tensor * K =
  ggml_permute(ctx0,
- ggml_rope(ctx0,
- ggml_reshape_3d(ctx0,
- ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
- n_embd/n_head, n_head, n_past + N),
- n_past, n_rot, 1),
+ ggml_reshape_3d(ctx0,
+ ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
+ n_embd/n_head, n_head, n_past + N),
  0, 2, 1, 3);
 
  // K * Q
@@ -930,18 +856,23 @@ static bool llama_eval_internal(
  // KQ = soft_max(KQ_masked)
  struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
 
- // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
- struct ggml_tensor * V_trans =
- ggml_cpy(ctx0,
- ggml_permute(ctx0,
- ggml_reshape_3d(ctx0,
- ggml_view_1d(ctx0, kv_self.v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.v)*n_embd),
- n_embd/n_head, n_head, n_past + N),
- 1, 2, 0, 3),
- ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd/n_head, n_head));
+ // split cached V into n_head heads
+ struct ggml_tensor * V =
+ ggml_view_3d(ctx0, kv_self.v,
+ n_past + N, n_embd/n_head, n_head,
+ n_ctx*ggml_element_size(kv_self.v),
+ n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head,
+ il*n_ctx*ggml_element_size(kv_self.v)*n_embd);
 
- // KQV = transpose(V) * KQ_soft_max
- struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
+ #if 1
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+ #else
+ // make V contiguous in memory to speed up the matmul, however we waste time on the copy
+ // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
+ // is there a better way?
+ struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd/n_head, n_head));
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
+ #endif
 
  // KQV_merged = KQV.permute(0, 2, 1, 3)
  struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
@@ -1027,9 +958,13 @@ static bool llama_eval_internal(
  ggml_build_forward_expand(&gf, inpL);
  ggml_graph_compute (ctx0, &gf);
 
+ // print timing information per ggml operation (for debugging purposes)
+ // requires GGML_PERF to be defined
+ //ggml_graph_print(&gf);
+
+ // plot the computation graph in dot format (for debugging purposes)
  //if (n_past%100 == 0) {
- // ggml_graph_print (&gf);
- // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
+ // ggml_graph_dump_dot(&gf, NULL, "llama.dot");
  //}
 
  //embd_w.resize(n_vocab*N);
@@ -1126,7 +1061,7 @@ struct llama_tokenizer {
  size_t offs = 0;
  while (offs < text.size()) {
  llama_sp_symbol sym;
- size_t char_len = std::min(text.size() - offs, utf8_len(text[offs]));
+ size_t char_len = Min(text.size() - offs, utf8_len(text[offs]));
  sym.text = text.c_str() + offs;
  sym.n = char_len;
  offs += char_len;
@@ -1266,6 +1201,20 @@ static llama_vocab::id llama_sample_top_p_top_k(
  const auto & logits = lctx.logits;
  const auto * plogits = logits.data() + logits.size() - n_logits;
 
+ if (temp <= 0) {
+ // select the token with the highest logit directly
+ float max_logit = plogits[0];
+ llama_vocab::id max_id = 0;
+
+ for (int i = 1; i < n_logits; ++i) {
+ if (plogits[i] > max_logit) {
+ max_logit = plogits[i];
+ max_id = i;
+ }
+ }
+ return max_id;
+ }
+
  std::vector<std::pair<float, llama_vocab::id>> logits_id;
  logits_id.reserve(n_logits);
 
@@ -1287,17 +1236,13 @@ static llama_vocab::id llama_sample_top_p_top_k(
  }
  }
 
- sample_top_k(logits_id, top_k);
-
- float maxl = -std::numeric_limits<float>::infinity();
- for (const auto & kv : logits_id) {
- maxl = std::max(maxl, kv.first);
- }
+ sample_top_k(logits_id, top_k > 0 ? Min(top_k, n_logits) : n_logits);
 
  // compute probs for the top k tokens
  std::vector<float> probs;
  probs.reserve(logits_id.size());
 
+ float maxl = logits_id[0].first;
  double sum = 0.0;
  for (const auto & kv : logits_id) {
  const float p = expf(kv.first - maxl);
@@ -1320,16 +1265,11 @@ static llama_vocab::id llama_sample_top_p_top_k(
  break;
  }
  }
-
- cumsum = 1.0/cumsum;
- for (int i = 0; i < (int) probs.size(); i++) {
- probs[i] *= cumsum;
- }
  }
 
  //printf("\n");
  //for (int i = 0; i < (int) 10; i++) {
- // printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]);
+ // printf("%d: '%s' %f\n", i, lctx.vocab.id_to_token.at(logits_id[i].second).tok.c_str(), probs[i]);
  //}
  //printf("\n\n");
  //exit(0);
@@ -1385,8 +1325,7 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
  return false;
  }
  if (magic != LLAMA_FILE_MAGIC) {
- fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str());
- return false;
+ return report_bad_magic(fname_inp.c_str(), magic, LLAMA_FILE_MAGIC);
  }
 
  fout.write((char *) &magic, sizeof(magic));
@@ -1444,7 +1383,7 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
  return false;
  }
 
- std::string word;
+ std::vector<char> word(32);
  vocab.id_to_token.resize(n_vocab);
  for (int i = 0; i < n_vocab; i++) {
  uint32_t len;
@@ -1452,17 +1391,17 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
  fout.write((char *) &len, sizeof(len));
 
  word.resize(len);
- finp.read ((char *) word.data(), len);
- fout.write((char *) word.data(), len);
+ finp.read ((char *) &word[0], len);
+ fout.write((char *) &word[0], len);
 
  float score;
  finp.read ((char *) &score, sizeof(score));
  fout.write((char *) &score, sizeof(score));
 
- vocab.token_to_id[word] = i;
+ vocab.token_to_id[word.data()] = i;
 
  auto &tok_score = vocab.id_to_token[i];
- tok_score.tok = word;
+ tok_score.tok = word.data();
  tok_score.score = score;
  }
  }
@@ -1503,6 +1442,13 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
  std::string name(length, 0);
  finp.read (&name[0], length);
 
+ {
+ // ensure tensor data is aligned
+ uint64_t offset = finp.tellg();
+ offset = (offset + 31) & -32;
+ finp.seekg(offset);
+ }
+
  {
  static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
  printf("%48s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ftype_str[ftype]);
@@ -1558,6 +1504,13 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
  }
  fout.write(&name[0], length);
 
+ {
+ // ensure tensor data is aligned
+ uint64_t offset = fout.tellp();
+ offset = (offset + 31) & -32;
+ fout.seekp(offset);
+ }
+
  if (quantize) {
  printf("quantizing .. ");
  work.resize(nelements); // for quantization
@@ -1655,7 +1608,10 @@ struct llama_context * llama_init_from_file(
 
  if (params.use_mlock) {
  char *err;
- if (!ggml_mlock(ctx->model.ctx, &err)) {
+ if (!ggml_mlock(ctx->model.ctx,
+ ctx->model.mm_addr,
+ ctx->model.mm_length,
+ &err)) {
  fprintf(stderr, "%s\n", err);
  free(err);
  llama_free(ctx);
@@ -1664,7 +1620,7 @@ struct llama_context * llama_init_from_file(
  }
 
  // reserve memory for context buffers
- {
+ if (!params.vocab_only) {
  if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx)) {
  fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
  llama_free(ctx);
@@ -1705,6 +1661,10 @@ void llama_free(struct llama_context * ctx) {
  ggml_free(ctx->model.ctx);
  }
 
+ if (ctx->model.mm_addr) {
+ munmap_file(ctx->model.mm_addr, ctx->model.mm_length);
+ }
+
  delete ctx;
  }
 
@@ -1720,6 +1680,33 @@ int llama_model_quantize(
  return 0;
  }
 
+ // Returns the KV cache that will contain the context for the
+ // ongoing prediction with the model.
+ const uint8_t * llama_get_kv_cache(struct llama_context * ctx) {
+ return ctx->model.kv_self.buf.data();
+ }
+
+ // Returns the size of the KV cache
+ size_t llama_get_kv_cache_size(struct llama_context * ctx) {
+ return ctx->model.kv_self.buf.size();
+ }
+
+ int llama_get_kv_cache_token_count(struct llama_context * ctx) {
+ return ctx->model.kv_self.n;
+ }
+
+ // Sets the KV cache containing the current context for the model
+ void llama_set_kv_cache(
+ struct llama_context * ctx,
+ const uint8_t * kv_cache,
+ size_t n_size,
+ int n_token_count) {
+ // Make sure we have the same kv cache setup
+ LLAMA_ASSERT(ctx->model.kv_self.buf.size() == n_size);
+ memcpy(ctx->model.kv_self.buf.data(), kv_cache, n_size);
+ ctx->model.kv_self.n = n_token_count;
+ }
+
  int llama_eval(
  struct llama_context * ctx,
  const llama_token * tokens,
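
The four functions added in the hunk above expose the self-attention KV cache so callers can snapshot and later restore the context of an ongoing prediction. A hedged usage sketch, assuming a valid llama_context * obtained from llama_init_from_file and the declarations added in this diff (illustrative only):

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // assumes the llama_get_kv_cache* / llama_set_kv_cache functions added in this diff
    void snapshot_and_restore(struct llama_context * ctx) {
        // snapshot the current cache contents and token count
        const size_t    n_size   = llama_get_kv_cache_size(ctx);
        const int       n_tokens = llama_get_kv_cache_token_count(ctx);
        const uint8_t * src      = llama_get_kv_cache(ctx);
        std::vector<uint8_t> saved(src, src + n_size);

        // ... further llama_eval() calls would modify the cache here ...

        // restore the earlier state; the buffer size must match the existing cache
        llama_set_kv_cache(ctx, saved.data(), n_size, n_tokens);
    }
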
@@ -1730,7 +1717,11 @@ int llama_eval(
  fprintf(stderr, "%s: failed to eval\n", __func__);
  return 1;
  }
-
+ // get a more accurate load time, upon first eval
+ if (!ctx->has_evaluated_once) {
+ ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
+ ctx->has_evaluated_once = true;
+ }
  return 0;
  }
 
@@ -1823,9 +1814,9 @@ llama_token llama_sample_top_p_top_k(
  void llama_print_timings(struct llama_context * ctx) {
  const int64_t t_end_us = ggml_time_us();
 
- const int32_t n_sample = std::max(1, ctx->n_sample);
- const int32_t n_eval = std::max(1, ctx->n_eval);
- const int32_t n_p_eval = std::max(1, ctx->n_p_eval);
+ const int32_t n_sample = Max(1, ctx->n_sample);
+ const int32_t n_eval = Max(1, ctx->n_eval);
+ const int32_t n_p_eval = Max(1, ctx->n_p_eval);
 
  fprintf(stderr, "\n");
  fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
@@ -1837,7 +1828,6 @@ void llama_print_timings(struct llama_context * ctx) {
 
  void llama_reset_timings(struct llama_context * ctx) {
  ctx->t_start_us = ggml_time_us();
-
  ctx->t_sample_us = ctx->n_sample = 0;
  ctx->t_eval_us = ctx->n_eval = 0;
  ctx->t_p_eval_us = ctx->n_p_eval = 0;