llama_cpp 0.0.1 → 0.0.3

@@ -12,6 +12,19 @@
  #include <cassert>
  #include <cstring>

+ #if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
+ #define WIN32_LEAN_AND_MEAN
+ #include <Windows.h>
+ #else
+ #include <sys/types.h>
+ #include <sys/mman.h>
+ #include <unistd.h>
+ #include <fcntl.h>
+ #endif
+
+ #define Min(X, Y) ((Y) > (X) ? (X) : (Y))
+ #define Max(X, Y) ((Y) < (X) ? (X) : (Y))
+
  #define LLAMA_USE_SCRATCH
  #define LLAMA_MAX_SCRATCH_BUFFERS 16

@@ -142,6 +155,10 @@ struct llama_model {
  // the model memory buffer
  std::vector<uint8_t> buf;

+ // model memory mapped file
+ void * mm_addr = NULL;
+ uint64_t mm_length = 0;
+
  // tensors
  int n_loaded;
  std::unordered_map<std::string, struct ggml_tensor *> tensors;
@@ -165,6 +182,7 @@ struct llama_context {

  int64_t t_load_us = 0;
  int64_t t_start_us = 0;
+ bool has_evaluated_once = false;

  int64_t t_sample_us = 0;
  int64_t t_eval_us = 0;
@@ -206,7 +224,7 @@ struct llama_context {
  }

  if (buf_last >= 0) {
- buf_max_size[buf_last] = std::max(buf_max_size[buf_last], last_size);
+ buf_max_size[buf_last] = Max(buf_max_size[buf_last], last_size);
  }

  buf_last = i;
@@ -238,14 +256,15 @@ static bool kv_cache_init(
  const int n_embd = hparams.n_embd;
  const int n_layer = hparams.n_layer;

- const int n_mem = n_layer*n_ctx;
- const int n_elements = n_embd*n_mem;
+ const int64_t n_mem = (int64_t)n_layer*n_ctx;
+ const int64_t n_elements = n_embd*n_mem;

  cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);

  struct ggml_init_params params;
  params.mem_size = cache.buf.size();
  params.mem_buffer = cache.buf.data();
+ params.no_alloc = false;

  cache.ctx = ggml_init(params);

@@ -288,6 +307,58 @@ struct llama_context_params llama_context_default_params() {
  // model loading
  //

+ static void *mmap_file(const char *fname, uint64_t *mm_length) {
+ #if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
+ HANDLE hFile = CreateFileA(fname,
+ GENERIC_READ,
+ FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
+ NULL,
+ OPEN_EXISTING,
+ FILE_ATTRIBUTE_NORMAL | FILE_ATTRIBUTE_NOT_CONTENT_INDEXED,
+ NULL);
+ if (hFile == INVALID_HANDLE_VALUE) return 0;
+ LARGE_INTEGER fileSize;
+ fileSize.QuadPart = -1;
+ GetFileSizeEx(hFile, &fileSize);
+ int64_t length = fileSize.QuadPart;
+ HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
+ CloseHandle(hFile);
+ if (!hMapping) return 0;
+ void *addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
+ CloseHandle(hMapping);
+ if (!addr) return 0;
+ #else
+ int fd = open(fname, O_RDONLY);
+ if (fd == -1) return 0;
+ int64_t length = lseek(fd, 0, SEEK_END);
+ void *addr = mmap(NULL, length, PROT_READ, MAP_SHARED, fd, 0);
+ close(fd);
+ if (addr == MAP_FAILED) return 0;
+ #endif
+ *mm_length = length;
+ return addr;
+ }
+
+ static void munmap_file(void * addr, size_t length) {
+ #if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
+ UnmapViewOfFile(addr);
+ #else
+ munmap(addr, length);
+ #endif
+ }
+
+ static bool report_bad_magic(const char *path, uint32_t got, uint32_t want) {
+ fprintf(stderr,
+ "%s: invalid model file (bad magic [got %#x want %#x])\n"
+ "\tyou most likely need to regenerate your ggml files\n"
+ "\tthe benefit is you'll get 10-100x faster load times\n"
+ "\tsee https://github.com/ggerganov/llama.cpp/issues/91\n"
+ "\tuse convert-pth-to-ggml.py to regenerate from original pth\n"
+ "\tuse migrate-ggml-2023-03-30-pr613.py if you deleted originals\n",
+ path, got, want);
+ return false;
+ }
+
  static bool llama_model_load(
  const std::string & fname,
  llama_context & lctx,
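A note on the helpers added above: mmap_file() returns a read-only mapping of the whole file (or 0 on failure) and writes the mapped length through mm_length, while munmap_file() releases it. A minimal, hypothetical sketch of calling them directly, assuming the includes at the top of this file; the file name is a placeholder, and in the library itself they are only used from llama_model_load and llama_free:

    // hypothetical standalone use of the helpers added above
    uint64_t length = 0;
    void * addr = mmap_file("ggml-model-q4_0.bin", &length);  // placeholder path
    if (addr != NULL) {
        // tensor data can now be pointed directly into [addr, addr + length)
        munmap_file(addr, length);
    }
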
@@ -299,22 +370,24 @@ static bool llama_model_load(
  void *progress_callback_user_data) {
  fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());

- const int64_t t_start_us = ggml_time_us();
-
- lctx.t_start_us = t_start_us;
-
- std::vector<char> f_buf(1024*1024);
+ lctx.t_start_us = ggml_time_us();

  auto & model = lctx.model;
  auto & vocab = lctx.vocab;

  auto fin = std::ifstream(fname, std::ios::binary);
- fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
  if (!fin) {
  fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
  return false;
  }

+ std::vector<char> f_buf(1024*1024);
+ fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
+
+ fin.seekg(0, fin.end);
+ const size_t file_size = fin.tellg();
+ fin.seekg(0);
+
  // verify magic
  {
  uint32_t magic;
@@ -325,8 +398,7 @@ static bool llama_model_load(
  return false;
  }
  if (magic != LLAMA_FILE_MAGIC) {
- fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
- return false;
+ return report_bad_magic(fname.c_str(), magic, LLAMA_FILE_MAGIC);
  }

  uint32_t format_version;
@@ -449,43 +521,24 @@ static bool llama_model_load(
  }
  }

+ // map model into memory
+ char *mm_addr = NULL;
+ model.mm_addr = mmap_file(fname.c_str(), &model.mm_length);
+ if (model.mm_addr == NULL) {
+ fprintf(stderr, "%s: failed to mmap '%s'\n", __func__, fname.c_str());
+ return false;
+ }
+ mm_addr = (char *)model.mm_addr;
+ fprintf(stderr, "%s: ggml map size = %6.2f MB\n", __func__, model.mm_length/(1024.0*1024.0));
+
  auto & ctx = model.ctx;

  size_t ctx_size = 0;
-
  {
- const auto & hparams = model.hparams;
-
- const int n_embd = hparams.n_embd;
+ const auto &hparams = model.hparams;
  const int n_layer = hparams.n_layer;
- const int n_ctx = hparams.n_ctx;
- const int n_vocab = hparams.n_vocab;
-
- ctx_size += n_embd*n_vocab*ggml_type_sizef(vtype); // tok_embeddings
-
- ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // norm
-
- ctx_size += n_embd*n_vocab*ggml_type_sizef(vtype); // output
-
- ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // attention_norm
-
- ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wq
- ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wk
- ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wv
- ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wo
-
- ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ffn_norm
-
- ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w1
- ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w2
- ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w3
-
- ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_k
- ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_v
-
  ctx_size += (5 + 10*n_layer)*256; // object overhead
-
- fprintf(stderr, "%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
+ fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0);
  }

  // print memory requirements
@@ -495,6 +548,7 @@ static bool llama_model_load(
  // this is the total memory required to run the inference
  const size_t mem_required =
  ctx_size +
+ model.mm_length +
  MEM_REQ_SCRATCH0.at(model.type) +
  MEM_REQ_SCRATCH1.at(model.type) +
  MEM_REQ_EVAL.at (model.type);
@@ -514,6 +568,7 @@ static bool llama_model_load(
  struct ggml_init_params params = {
  /*.mem_size =*/ lctx.model.buf.size(),
  /*.mem_buffer =*/ lctx.model.buf.data(),
+ /*.no_alloc =*/ true,
  };

  model.ctx = ggml_init(params);
@@ -576,234 +631,106 @@ static bool llama_model_load(
  }
  }

- const size_t file_offset = fin.tellg();
-
- fin.close();
-
  std::vector<uint8_t> tmp;

  if (progress_callback) {
  progress_callback(0.0, progress_callback_user_data);
  }

- for (int i = 0; i < n_parts; ++i) {
- const int part_id = i;
- //const int part_id = n_parts - i - 1;
+ fprintf(stderr, "%s: loading tensors from '%s'\n", __func__, fname.c_str());

- std::string fname_part = fname;
- if (i > 0) {
- fname_part += "." + std::to_string(i);
- }
-
- fprintf(stderr, "%s: loading model part %d/%d from '%s'\n", __func__, i+1, n_parts, fname_part.c_str());
+ // load weights
+ {
+ size_t total_size = 0;
+ model.n_loaded = 0;

- fin = std::ifstream(fname_part, std::ios::binary);
- fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
+ while (true) {
+ int32_t n_dims;
+ int32_t length;
+ int32_t ftype;

- fin.seekg(0, fin.end);
- const size_t file_size = fin.tellg();
+ fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
+ fin.read(reinterpret_cast<char *>(&length), sizeof(length));
+ fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));

- fin.seekg(file_offset);
+ if (fin.eof()) {
+ break;
+ }

- // load weights
- {
- size_t total_size = 0;
+ int32_t nelements = 1;
+ int32_t ne[2] = { 1, 1 };
+ for (int i = 0; i < n_dims; ++i) {
+ fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
+ nelements *= ne[i];
+ }

- model.n_loaded = 0;
+ std::string name(length, 0);
+ fin.read(&name[0], length);

- fprintf(stderr, "%s: ", __func__);
+ if (model.tensors.find(name.data()) == model.tensors.end()) {
+ fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
+ return false;
+ }

- while (true) {
- int32_t n_dims;
- int32_t length;
- int32_t ftype;
+ auto tensor = model.tensors[name.data()];

- fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
- fin.read(reinterpret_cast<char *>(&length), sizeof(length));
- fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
+ if (ggml_nelements(tensor) != nelements) {
+ fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
+ return false;
+ }
+ if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
+ fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%" PRId64 ", %" PRId64 "], expected [%d, %d]\n",
+ __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
+ return false;
+ }
+ if (0) {
+ static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
+ fprintf(stderr, "%24s - [%5d, %5d], type = %6s\n", name.data(), ne[0], ne[1], ftype_str[ftype]);
+ }

- if (fin.eof()) {
+ switch (ftype) {
+ case 0: // f32
+ case 1: // f16
  break;
- }
-
- int32_t nelements = 1;
- int32_t ne[2] = { 1, 1 };
- for (int i = 0; i < n_dims; ++i) {
- fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
- nelements *= ne[i];
- }
-
- std::string name(length, 0);
- fin.read(&name[0], length);
-
- if (model.tensors.find(name.data()) == model.tensors.end()) {
- fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
+ case 2: // q4_0
+ case 3: // q4_1
+ assert(ne[0] % 64 == 0);
+ break;
+ default:
+ fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
  return false;
- }
-
- // split_type = 0: split by columns
- // split_type = 1: split by rows
- int split_type = 0;
-
- // split_type = 0:
- // regex:
- //   - tok_embeddings.*
- //   - layers.*.attention.wo.weight
- //   - layers.*.feed_forward.w2.weight
-
- // split_type = 1:
- // regex:
- //   - output.*
- //   - layers.*.attention.wq.weight
- //   - layers.*.attention.wk.weight
- //   - layers.*.attention.wv.weight
- //   - layers.*.feed_forward.w1.weight
- //   - layers.*.feed_forward.w3.weight
- if (name.find("tok_embeddings") != std::string::npos) {
- split_type = 0;
- } else if (name.find("layers") != std::string::npos) {
- if (name.find("attention.wo.weight") != std::string::npos) {
- split_type = 0;
- } else if (name.find("feed_forward.w2.weight") != std::string::npos) {
- split_type = 0;
- } else {
- split_type = 1;
- }
- } else if (name.find("output") != std::string::npos) {
- split_type = 1;
- }
-
- auto tensor = model.tensors[name.data()];
-
- if (n_dims == 1) {
- if (ggml_nelements(tensor) != nelements) {
- fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
- return false;
- }
- } else {
- if (ggml_nelements(tensor)/n_parts != nelements) {
- fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
- return false;
- }
- }
-
- if (n_dims == 1) {
- if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
- fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
- __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
- return false;
- }
- } else {
- if (split_type == 0) {
- if (tensor->ne[0]/n_parts != ne[0] || tensor->ne[1] != ne[1]) {
- fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
- __func__, name.data(), tensor->ne[0]/n_parts, tensor->ne[1], ne[0], ne[1]);
- return false;
- }
- } else {
- if (tensor->ne[0] != ne[0] || tensor->ne[1]/n_parts != ne[1]) {
- fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
- __func__, name.data(), tensor->ne[0], tensor->ne[1]/n_parts, ne[0], ne[1]);
- return false;
- }
- }
- }
-
- if (0) {
- static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
- fprintf(stderr, "%24s - [%5d, %5d], type = %6s, split = %d\n", name.data(), ne[0], ne[1], ftype_str[ftype], split_type);
- }
-
- size_t bpe = 0;
-
- switch (ftype) {
- case 0: bpe = ggml_type_size(GGML_TYPE_F32); break;
- case 1: bpe = ggml_type_size(GGML_TYPE_F16); break;
- case 2: bpe = ggml_type_size(GGML_TYPE_Q4_0); assert(ne[0] % 64 == 0); break;
- case 3: bpe = ggml_type_size(GGML_TYPE_Q4_1); assert(ne[0] % 64 == 0); break;
- default:
- {
- fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
- return false;
- }
- };
-
- if (n_dims == 1 || n_parts == 1) {
- if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
- fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
- __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
- return false;
- }
-
- if (part_id == 0) {
- fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
- } else {
- fin.seekg(ggml_nbytes(tensor), std::ios::cur);
- }
-
- total_size += ggml_nbytes(tensor);
- } else {
- if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)/n_parts) {
- fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
- __func__, name.data(), ggml_nbytes(tensor)/n_parts, nelements*bpe);
- return false;
- }
-
- if (split_type == 0) {
- const int np0 = ne[0];
-
- const size_t row_size = (tensor->ne[0]/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);
- assert(row_size == tensor->nb[1]);
-
- for (int i1 = 0; i1 < ne[1]; ++i1) {
- const size_t offset_row = i1*row_size;
- const size_t offset = offset_row + ((part_id*np0)/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);
- fin.read(reinterpret_cast<char *>(tensor->data) + offset, row_size/n_parts);
- }
- } else {
- const int np1 = ne[1];
-
- const size_t row_size = (tensor->ne[0]/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);
-
- for (int i1 = 0; i1 < ne[1]; ++i1) {
- const size_t offset_row = (i1 + part_id*np1)*row_size;
- fin.read(reinterpret_cast<char *>(tensor->data) + offset_row, row_size);
- }
- }
-
- total_size += ggml_nbytes(tensor)/n_parts;
- }
-
- //fprintf(stderr, "%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
- model.n_loaded++;
-
- // progress
- if (progress_callback) {
- float current_file_progress = float(size_t(fin.tellg()) - file_offset) / float(file_size - file_offset);
- float current_progress = (float(i) + current_file_progress) / float(n_parts);
- progress_callback(current_progress, progress_callback_user_data);
- }
- if (model.n_loaded % 8 == 0) {
- fprintf(stderr, ".");
- fflush(stderr);
- }
- }
-
- fprintf(stderr, " done\n");
+ };

- fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, model.n_loaded);
- if (model.n_loaded == 0) {
- fprintf(stderr, "%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__);
- } else if (model.n_loaded != (int) model.tensors.size()) {
- fprintf(stderr, "%s: ERROR not all tensors loaded from model file - expected %zu, got %d\n", __func__, model.tensors.size(), model.n_loaded);
- return false;
+ // load the tensor data into memory without copying or reading it
+ size_t offset = fin.tellg();
+ size_t tensor_data_size = ggml_nbytes(tensor);
+ offset = (offset + 31) & -32;
+ tensor->data = mm_addr + offset;
+ fin.seekg(offset + tensor_data_size);
+ total_size += tensor_data_size;
+ model.n_loaded++;
+
+ // progress
+ if (progress_callback) {
+ double current_progress = size_t(fin.tellg()) / double(file_size);
+ progress_callback(current_progress, progress_callback_user_data);
  }
  }

  fin.close();
+
+ fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, model.n_loaded);
+ if (model.n_loaded == 0) {
+ fprintf(stderr, "%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__);
+ } else if (model.n_loaded != (int) model.tensors.size()) {
+ fprintf(stderr, "%s: ERROR not all tensors loaded from model file - expected %zu, got %d\n", __func__, model.tensors.size(), model.n_loaded);
+ return false;
+ }
  }

- lctx.t_load_us = ggml_time_us() - t_start_us;
+ // loading time will be recalculated after the first eval, so
+ // we take page faults deferred by mmap() into consideration
+ lctx.t_load_us = ggml_time_us() - lctx.t_start_us;

  if (progress_callback) {
  progress_callback(1.0, progress_callback_user_data);
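
One detail worth noting in the loader above: instead of reading tensor data from the stream, it rounds the current offset up to a 32-byte boundary and points tensor->data straight into the memory-mapped file. A small sketch of that rounding, with illustrative values only:

    // (offset + 31) & -32 rounds an offset up to the next multiple of 32;
    // -32 has the same bit pattern as ~31 in two's complement
    static size_t align32(size_t offset) {
        return (offset + 31) & ~(size_t)31;
    }
    // align32(0) == 0, align32(1) == 32, align32(32) == 32, align32(33) == 64
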
@@ -849,6 +776,7 @@ static bool llama_eval_internal(
  struct ggml_init_params params = {
  /*.mem_size =*/ buf_compute.size(),
  /*.mem_buffer =*/ buf_compute.data(),
+ /*.no_alloc =*/ false,
  };

  struct ggml_context * ctx0 = ggml_init(params);
@@ -856,7 +784,7 @@ static bool llama_eval_internal(
  // for big prompts, if BLAS is enabled, it is better to use only one thread
  // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
  ggml_cgraph gf = {};
- gf.n_threads = N > 255 && ggml_cpu_has_blas() ? 1 : n_threads;
+ gf.n_threads = N >= 32 && ggml_cpu_has_blas() ? 1 : n_threads;

  struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
  memcpy(embd->data, tokens, N*ggml_element_size(embd));
@@ -882,37 +810,35 @@ static bool llama_eval_internal(

  // self-attention
  {
- struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
- struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
- struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+ // compute Q and K and RoPE them
+ struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
+ struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);

  // store key and value to memory
- if (N >= 1) {
+ {
+ // compute the transposed [N, n_embd] V matrix
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), n_embd, N));
+
  struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
- struct ggml_tensor * v = ggml_view_1d(ctx0, kv_self.v, N*n_embd, (ggml_element_size(kv_self.v)*n_embd)*(il*n_ctx + n_past));
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
+ ( n_ctx)*ggml_element_size(kv_self.v),
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));

+ // important: storing RoPE-ed version of K in the KV cache!
  ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
  ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
  }

- // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
  struct ggml_tensor * Q =
  ggml_permute(ctx0,
- ggml_rope(ctx0,
- ggml_cpy(ctx0,
- Qcur,
- ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
- n_past, n_rot, 0),
+ Qcur,
  0, 2, 1, 3);

- // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
  struct ggml_tensor * K =
  ggml_permute(ctx0,
- ggml_rope(ctx0,
- ggml_reshape_3d(ctx0,
- ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
- n_embd/n_head, n_head, n_past + N),
- n_past, n_rot, 1),
+ ggml_reshape_3d(ctx0,
+ ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
+ n_embd/n_head, n_head, n_past + N),
  0, 2, 1, 3);

  // K * Q
@@ -930,18 +856,23 @@ static bool llama_eval_internal(
  // KQ = soft_max(KQ_masked)
  struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);

- // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
- struct ggml_tensor * V_trans =
- ggml_cpy(ctx0,
- ggml_permute(ctx0,
- ggml_reshape_3d(ctx0,
- ggml_view_1d(ctx0, kv_self.v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.v)*n_embd),
- n_embd/n_head, n_head, n_past + N),
- 1, 2, 0, 3),
- ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd/n_head, n_head));
+ // split cached V into n_head heads
+ struct ggml_tensor * V =
+ ggml_view_3d(ctx0, kv_self.v,
+ n_past + N, n_embd/n_head, n_head,
+ n_ctx*ggml_element_size(kv_self.v),
+ n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head,
+ il*n_ctx*ggml_element_size(kv_self.v)*n_embd);

- // KQV = transpose(V) * KQ_soft_max
- struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
+ #if 1
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+ #else
+ // make V contiguous in memory to speed up the matmul, however we waste time on the copy
+ // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
+ // is there a better way?
+ struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd/n_head, n_head));
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
+ #endif

  // KQV_merged = KQV.permute(0, 2, 1, 3)
  struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
@@ -1027,9 +958,13 @@ static bool llama_eval_internal(
  ggml_build_forward_expand(&gf, inpL);
  ggml_graph_compute (ctx0, &gf);

+ // print timing information per ggml operation (for debugging purposes)
+ // requires GGML_PERF to be defined
+ //ggml_graph_print(&gf);
+
+ // plot the computation graph in dot format (for debugging purposes)
  //if (n_past%100 == 0) {
- // ggml_graph_print (&gf);
- // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
+ // ggml_graph_dump_dot(&gf, NULL, "llama.dot");
  //}

  //embd_w.resize(n_vocab*N);
@@ -1126,7 +1061,7 @@ struct llama_tokenizer {
  size_t offs = 0;
  while (offs < text.size()) {
  llama_sp_symbol sym;
- size_t char_len = std::min(text.size() - offs, utf8_len(text[offs]));
+ size_t char_len = Min(text.size() - offs, utf8_len(text[offs]));
  sym.text = text.c_str() + offs;
  sym.n = char_len;
  offs += char_len;
@@ -1266,6 +1201,20 @@ static llama_vocab::id llama_sample_top_p_top_k(
  const auto & logits = lctx.logits;
  const auto * plogits = logits.data() + logits.size() - n_logits;

+ if (temp <= 0) {
+ // select the token with the highest logit directly
+ float max_logit = plogits[0];
+ llama_vocab::id max_id = 0;
+
+ for (int i = 1; i < n_logits; ++i) {
+ if (plogits[i] > max_logit) {
+ max_logit = plogits[i];
+ max_id = i;
+ }
+ }
+ return max_id;
+ }
+
  std::vector<std::pair<float, llama_vocab::id>> logits_id;
  logits_id.reserve(n_logits);

@@ -1287,17 +1236,13 @@ static llama_vocab::id llama_sample_top_p_top_k(
  }
  }

- sample_top_k(logits_id, top_k);
-
- float maxl = -std::numeric_limits<float>::infinity();
- for (const auto & kv : logits_id) {
- maxl = std::max(maxl, kv.first);
- }
+ sample_top_k(logits_id, top_k > 0 ? Min(top_k, n_logits) : n_logits);

  // compute probs for the top k tokens
  std::vector<float> probs;
  probs.reserve(logits_id.size());

+ float maxl = logits_id[0].first;
  double sum = 0.0;
  for (const auto & kv : logits_id) {
  const float p = expf(kv.first - maxl);
@@ -1320,16 +1265,11 @@ static llama_vocab::id llama_sample_top_p_top_k(
  break;
  }
  }
-
- cumsum = 1.0/cumsum;
- for (int i = 0; i < (int) probs.size(); i++) {
- probs[i] *= cumsum;
- }
  }

  //printf("\n");
  //for (int i = 0; i < (int) 10; i++) {
- // printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]);
+ // printf("%d: '%s' %f\n", i, lctx.vocab.id_to_token.at(logits_id[i].second).tok.c_str(), probs[i]);
  //}
  //printf("\n\n");
  //exit(0);
@@ -1385,8 +1325,7 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
  return false;
  }
  if (magic != LLAMA_FILE_MAGIC) {
- fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str());
- return false;
+ return report_bad_magic(fname_inp.c_str(), magic, LLAMA_FILE_MAGIC);
  }

  fout.write((char *) &magic, sizeof(magic));
@@ -1444,7 +1383,7 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
  return false;
  }

- std::string word;
+ std::vector<char> word(32);
  vocab.id_to_token.resize(n_vocab);
  for (int i = 0; i < n_vocab; i++) {
  uint32_t len;
@@ -1452,17 +1391,17 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
  fout.write((char *) &len, sizeof(len));

  word.resize(len);
- finp.read ((char *) word.data(), len);
- fout.write((char *) word.data(), len);
+ finp.read ((char *) &word[0], len);
+ fout.write((char *) &word[0], len);

  float score;
  finp.read ((char *) &score, sizeof(score));
  fout.write((char *) &score, sizeof(score));

- vocab.token_to_id[word] = i;
+ vocab.token_to_id[word.data()] = i;

  auto &tok_score = vocab.id_to_token[i];
- tok_score.tok = word;
+ tok_score.tok = word.data();
  tok_score.score = score;
  }
  }
@@ -1503,6 +1442,13 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
  std::string name(length, 0);
  finp.read (&name[0], length);

+ {
+ // ensure tensor data is aligned
+ uint64_t offset = finp.tellg();
+ offset = (offset + 31) & -32;
+ finp.seekg(offset);
+ }
+
  {
  static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
  printf("%48s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ftype_str[ftype]);
@@ -1558,6 +1504,13 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
  }
  fout.write(&name[0], length);

+ {
+ // ensure tensor data is aligned
+ uint64_t offset = fout.tellp();
+ offset = (offset + 31) & -32;
+ fout.seekp(offset);
+ }
+
  if (quantize) {
  printf("quantizing .. ");
  work.resize(nelements); // for quantization
@@ -1655,7 +1608,10 @@ struct llama_context * llama_init_from_file(

  if (params.use_mlock) {
  char *err;
- if (!ggml_mlock(ctx->model.ctx, &err)) {
+ if (!ggml_mlock(ctx->model.ctx,
+ ctx->model.mm_addr,
+ ctx->model.mm_length,
+ &err)) {
  fprintf(stderr, "%s\n", err);
  free(err);
  llama_free(ctx);
@@ -1664,7 +1620,7 @@ struct llama_context * llama_init_from_file(
  }

  // reserve memory for context buffers
- {
+ if (!params.vocab_only) {
  if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx)) {
  fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
  llama_free(ctx);
@@ -1705,6 +1661,10 @@ void llama_free(struct llama_context * ctx) {
  ggml_free(ctx->model.ctx);
  }

+ if (ctx->model.mm_addr) {
+ munmap_file(ctx->model.mm_addr, ctx->model.mm_length);
+ }
+
  delete ctx;
  }

@@ -1720,6 +1680,33 @@ int llama_model_quantize(
  return 0;
  }

+ // Returns the KV cache that will contain the context for the
+ // ongoing prediction with the model.
+ const uint8_t * llama_get_kv_cache(struct llama_context * ctx) {
+ return ctx->model.kv_self.buf.data();
+ }
+
+ // Returns the size of the KV cache
+ size_t llama_get_kv_cache_size(struct llama_context * ctx) {
+ return ctx->model.kv_self.buf.size();
+ }
+
+ int llama_get_kv_cache_token_count(struct llama_context * ctx) {
+ return ctx->model.kv_self.n;
+ }
+
+ // Sets the KV cache containing the current context for the model
+ void llama_set_kv_cache(
+ struct llama_context * ctx,
+ const uint8_t * kv_cache,
+ size_t n_size,
+ int n_token_count) {
+ // Make sure we have the same kv cache setup
+ LLAMA_ASSERT(ctx->model.kv_self.buf.size() == n_size);
+ memcpy(ctx->model.kv_self.buf.data(), kv_cache, n_size);
+ ctx->model.kv_self.n = n_token_count;
+ }
+
  int llama_eval(
  struct llama_context * ctx,
  const llama_token * tokens,
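
The new KV-cache accessors above make it possible to snapshot the evaluation state and restore it later. A hedged usage sketch, assuming llama.h and <vector> are included, ctx is an initialized llama_context, and error handling is omitted:

    // take a copy of the current cache and remember how many tokens it covers
    std::vector<uint8_t> saved(llama_get_kv_cache(ctx),
                               llama_get_kv_cache(ctx) + llama_get_kv_cache_size(ctx));
    const int saved_tokens = llama_get_kv_cache_token_count(ctx);

    // ... evaluate further tokens, then roll the context back
    llama_set_kv_cache(ctx, saved.data(), saved.size(), saved_tokens);
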
@@ -1730,7 +1717,11 @@ int llama_eval(
  fprintf(stderr, "%s: failed to eval\n", __func__);
  return 1;
  }
-
+ // get a more accurate load time, upon first eval
+ if (!ctx->has_evaluated_once) {
+ ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
+ ctx->has_evaluated_once = true;
+ }
  return 0;
  }

@@ -1823,9 +1814,9 @@ llama_token llama_sample_top_p_top_k(
  void llama_print_timings(struct llama_context * ctx) {
  const int64_t t_end_us = ggml_time_us();

- const int32_t n_sample = std::max(1, ctx->n_sample);
- const int32_t n_eval = std::max(1, ctx->n_eval);
- const int32_t n_p_eval = std::max(1, ctx->n_p_eval);
+ const int32_t n_sample = Max(1, ctx->n_sample);
+ const int32_t n_eval = Max(1, ctx->n_eval);
+ const int32_t n_p_eval = Max(1, ctx->n_p_eval);

  fprintf(stderr, "\n");
  fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
@@ -1837,7 +1828,6 @@ void llama_print_timings(struct llama_context * ctx) {

  void llama_reset_timings(struct llama_context * ctx) {
  ctx->t_start_us = ggml_time_us();
-
  ctx->t_sample_us = ctx->n_sample = 0;
  ctx->t_eval_us = ctx->n_eval = 0;
  ctx->t_p_eval_us = ctx->n_p_eval = 0;