llama_cpp 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -12,6 +12,19 @@
12
12
  #include <cassert>
13
13
  #include <cstring>
14
14
 
15
+ #if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
16
+ #define WIN32_LEAN_AND_MEAN
17
+ #include <Windows.h>
18
+ #else
19
+ #include <sys/types.h>
20
+ #include <sys/mman.h>
21
+ #include <unistd.h>
22
+ #include <fcntl.h>
23
+ #endif
24
+
25
+ #define Min(X, Y) ((Y) > (X) ? (X) : (Y))
26
+ #define Max(X, Y) ((Y) < (X) ? (X) : (Y))
27
+
15
28
  #define LLAMA_USE_SCRATCH
16
29
  #define LLAMA_MAX_SCRATCH_BUFFERS 16
17
30
 
@@ -142,6 +155,10 @@ struct llama_model {
142
155
  // the model memory buffer
143
156
  std::vector<uint8_t> buf;
144
157
 
158
+ // model memory mapped file
159
+ void * mm_addr = NULL;
160
+ uint64_t mm_length = 0;
161
+
145
162
  // tensors
146
163
  int n_loaded;
147
164
  std::unordered_map<std::string, struct ggml_tensor *> tensors;
@@ -165,6 +182,7 @@ struct llama_context {
165
182
 
166
183
  int64_t t_load_us = 0;
167
184
  int64_t t_start_us = 0;
185
+ bool has_evaluated_once = false;
168
186
 
169
187
  int64_t t_sample_us = 0;
170
188
  int64_t t_eval_us = 0;
@@ -206,7 +224,7 @@ struct llama_context {
206
224
  }
207
225
 
208
226
  if (buf_last >= 0) {
209
- buf_max_size[buf_last] = std::max(buf_max_size[buf_last], last_size);
227
+ buf_max_size[buf_last] = Max(buf_max_size[buf_last], last_size);
210
228
  }
211
229
 
212
230
  buf_last = i;
@@ -246,6 +264,7 @@ static bool kv_cache_init(
246
264
  struct ggml_init_params params;
247
265
  params.mem_size = cache.buf.size();
248
266
  params.mem_buffer = cache.buf.data();
267
+ params.no_alloc = false;
249
268
 
250
269
  cache.ctx = ggml_init(params);
251
270
 
@@ -288,6 +307,58 @@ struct llama_context_params llama_context_default_params() {
288
307
  // model loading
289
308
  //
290
309
 
310
// Maps the whole file `fname` read-only into memory.
//
// fname     - path of the file to map
// mm_length - out: receives the file size in bytes on success
//
// Returns the base address of the mapping, or NULL when the file cannot be
// opened, its size cannot be determined, it is empty, or the mapping cannot
// be created. *mm_length is written only on success.
static void *mmap_file(const char *fname, uint64_t *mm_length) {
#if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
    HANDLE hFile = CreateFileA(fname,
                               GENERIC_READ,
                               FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
                               NULL,
                               OPEN_EXISTING,
                               FILE_ATTRIBUTE_NORMAL | FILE_ATTRIBUTE_NOT_CONTENT_INDEXED,
                               NULL);
    if (hFile == INVALID_HANDLE_VALUE) return NULL;

    // a failed size query must not leak through as a bogus mapping length
    LARGE_INTEGER fileSize;
    if (!GetFileSizeEx(hFile, &fileSize) || fileSize.QuadPart <= 0) {
        CloseHandle(hFile);
        return NULL;
    }
    const int64_t length = fileSize.QuadPart;

    HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
    CloseHandle(hFile);
    if (!hMapping) return NULL;

    void *addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
    CloseHandle(hMapping);
    if (!addr) return NULL;
#else
    const int fd = open(fname, O_RDONLY);
    if (fd == -1) return NULL;

    // lseek() reports failure as -1; an empty file cannot be mapped either
    const int64_t length = lseek(fd, 0, SEEK_END);
    if (length <= 0) {
        close(fd);
        return NULL;
    }

    void *addr = mmap(NULL, static_cast<size_t>(length), PROT_READ, MAP_SHARED, fd, 0);
    close(fd);
    if (addr == MAP_FAILED) return NULL;
#endif
    *mm_length = static_cast<uint64_t>(length);
    return addr;
}
341
+
342
// Releases the memory mapping that starts at `addr`.
// `length` is consumed only by the munmap() path; the Windows path
// unmaps the view by address alone.
static void munmap_file(void * addr, size_t length) {
#if !defined(_WIN32) || defined(_POSIX_MAPPED_FILES)
    munmap(addr, length);
#else
    (void) length;
    UnmapViewOfFile(addr);
#endif
}
349
+
350
// Writes the "bad magic" diagnostic for the model file `path` to stderr:
// the magic value that was read (`got`) next to the expected one (`want`),
// followed by hints on how to regenerate the file.
// Always returns false, so a caller can `return report_bad_magic(...)`.
static bool report_bad_magic(const char *path, uint32_t got, uint32_t want) {
    static const char kBadMagicFmt[] =
        "%s: invalid model file (bad magic [got %#x want %#x])\n"
        "\tyou most likely need to regenerate your ggml files\n"
        "\tthe benefit is you'll get 10-100x faster load times\n"
        "\tsee https://github.com/ggerganov/llama.cpp/issues/91\n"
        "\tuse convert-pth-to-ggml.py to regenerate from original pth\n"
        "\tuse migrate-ggml-2023-03-30-pr613.py if you deleted originals\n";

    fprintf(stderr, kBadMagicFmt, path, got, want);
    return false;
}
361
+
291
362
  static bool llama_model_load(
292
363
  const std::string & fname,
293
364
  llama_context & lctx,
@@ -299,22 +370,24 @@ static bool llama_model_load(
299
370
  void *progress_callback_user_data) {
300
371
  fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
301
372
 
302
- const int64_t t_start_us = ggml_time_us();
303
-
304
- lctx.t_start_us = t_start_us;
305
-
306
- std::vector<char> f_buf(1024*1024);
373
+ lctx.t_start_us = ggml_time_us();
307
374
 
308
375
  auto & model = lctx.model;
309
376
  auto & vocab = lctx.vocab;
310
377
 
311
378
  auto fin = std::ifstream(fname, std::ios::binary);
312
- fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
313
379
  if (!fin) {
314
380
  fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
315
381
  return false;
316
382
  }
317
383
 
384
+ std::vector<char> f_buf(1024*1024);
385
+ fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
386
+
387
+ fin.seekg(0, fin.end);
388
+ const size_t file_size = fin.tellg();
389
+ fin.seekg(0);
390
+
318
391
  // verify magic
319
392
  {
320
393
  uint32_t magic;
@@ -325,8 +398,7 @@ static bool llama_model_load(
325
398
  return false;
326
399
  }
327
400
  if (magic != LLAMA_FILE_MAGIC) {
328
- fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
329
- return false;
401
+ return report_bad_magic(fname.c_str(), magic, LLAMA_FILE_MAGIC);
330
402
  }
331
403
 
332
404
  uint32_t format_version;
@@ -449,43 +521,24 @@ static bool llama_model_load(
449
521
  }
450
522
  }
451
523
 
524
+ // map model into memory
525
+ char *mm_addr = NULL;
526
+ model.mm_addr = mmap_file(fname.c_str(), &model.mm_length);
527
+ if (model.mm_addr == NULL) {
528
+ fprintf(stderr, "%s: failed to mmap '%s'\n", __func__, fname.c_str());
529
+ return false;
530
+ }
531
+ mm_addr = (char *)model.mm_addr;
532
+ fprintf(stderr, "%s: ggml map size = %6.2f MB\n", __func__, model.mm_length/(1024.0*1024.0));
533
+
452
534
  auto & ctx = model.ctx;
453
535
 
454
536
  size_t ctx_size = 0;
455
-
456
537
  {
457
- const auto & hparams = model.hparams;
458
-
459
- const int n_embd = hparams.n_embd;
538
+ const auto &hparams = model.hparams;
460
539
  const int n_layer = hparams.n_layer;
461
- const int n_ctx = hparams.n_ctx;
462
- const int n_vocab = hparams.n_vocab;
463
-
464
- ctx_size += n_embd*n_vocab*ggml_type_sizef(vtype); // tok_embeddings
465
-
466
- ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // norm
467
-
468
- ctx_size += n_embd*n_vocab*ggml_type_sizef(vtype); // output
469
-
470
- ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // attention_norm
471
-
472
- ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wq
473
- ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wk
474
- ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wv
475
- ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wo
476
-
477
- ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ffn_norm
478
-
479
- ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w1
480
- ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w2
481
- ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w3
482
-
483
- ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_k
484
- ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_v
485
-
486
540
  ctx_size += (5 + 10*n_layer)*256; // object overhead
487
-
488
- fprintf(stderr, "%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
541
+ fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0);
489
542
  }
490
543
 
491
544
  // print memory requirements
@@ -495,6 +548,7 @@ static bool llama_model_load(
495
548
  // this is the total memory required to run the inference
496
549
  const size_t mem_required =
497
550
  ctx_size +
551
+ model.mm_length +
498
552
  MEM_REQ_SCRATCH0.at(model.type) +
499
553
  MEM_REQ_SCRATCH1.at(model.type) +
500
554
  MEM_REQ_EVAL.at (model.type);
@@ -514,6 +568,7 @@ static bool llama_model_load(
514
568
  struct ggml_init_params params = {
515
569
  /*.mem_size =*/ lctx.model.buf.size(),
516
570
  /*.mem_buffer =*/ lctx.model.buf.data(),
571
+ /*.no_alloc =*/ true,
517
572
  };
518
573
 
519
574
  model.ctx = ggml_init(params);
@@ -576,234 +631,106 @@ static bool llama_model_load(
576
631
  }
577
632
  }
578
633
 
579
- const size_t file_offset = fin.tellg();
580
-
581
- fin.close();
582
-
583
634
  std::vector<uint8_t> tmp;
584
635
 
585
636
  if (progress_callback) {
586
637
  progress_callback(0.0, progress_callback_user_data);
587
638
  }
588
639
 
589
- for (int i = 0; i < n_parts; ++i) {
590
- const int part_id = i;
591
- //const int part_id = n_parts - i - 1;
640
+ fprintf(stderr, "%s: loading tensors from '%s'\n", __func__, fname.c_str());
592
641
 
593
- std::string fname_part = fname;
594
- if (i > 0) {
595
- fname_part += "." + std::to_string(i);
596
- }
597
-
598
- fprintf(stderr, "%s: loading model part %d/%d from '%s'\n", __func__, i+1, n_parts, fname_part.c_str());
642
+ // load weights
643
+ {
644
+ size_t total_size = 0;
645
+ model.n_loaded = 0;
599
646
 
600
- fin = std::ifstream(fname_part, std::ios::binary);
601
- fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
647
+ while (true) {
648
+ int32_t n_dims;
649
+ int32_t length;
650
+ int32_t ftype;
602
651
 
603
- fin.seekg(0, fin.end);
604
- const size_t file_size = fin.tellg();
652
+ fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
653
+ fin.read(reinterpret_cast<char *>(&length), sizeof(length));
654
+ fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
605
655
 
606
- fin.seekg(file_offset);
656
+ if (fin.eof()) {
657
+ break;
658
+ }
607
659
 
608
- // load weights
609
- {
610
- size_t total_size = 0;
660
+ int32_t nelements = 1;
661
+ int32_t ne[2] = { 1, 1 };
662
+ for (int i = 0; i < n_dims; ++i) {
663
+ fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
664
+ nelements *= ne[i];
665
+ }
611
666
 
612
- model.n_loaded = 0;
667
+ std::string name(length, 0);
668
+ fin.read(&name[0], length);
613
669
 
614
- fprintf(stderr, "%s: ", __func__);
670
+ if (model.tensors.find(name.data()) == model.tensors.end()) {
671
+ fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
672
+ return false;
673
+ }
615
674
 
616
- while (true) {
617
- int32_t n_dims;
618
- int32_t length;
619
- int32_t ftype;
675
+ auto tensor = model.tensors[name.data()];
620
676
 
621
- fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
622
- fin.read(reinterpret_cast<char *>(&length), sizeof(length));
623
- fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
677
+ if (ggml_nelements(tensor) != nelements) {
678
+ fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
679
+ return false;
680
+ }
681
+ if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
682
+ fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
683
+ __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
684
+ return false;
685
+ }
686
+ if (0) {
687
+ static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
688
+ fprintf(stderr, "%24s - [%5d, %5d], type = %6s\n", name.data(), ne[0], ne[1], ftype_str[ftype]);
689
+ }
624
690
 
625
- if (fin.eof()) {
691
+ switch (ftype) {
692
+ case 0: // f32
693
+ case 1: // f16
626
694
  break;
627
- }
628
-
629
- int32_t nelements = 1;
630
- int32_t ne[2] = { 1, 1 };
631
- for (int i = 0; i < n_dims; ++i) {
632
- fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
633
- nelements *= ne[i];
634
- }
635
-
636
- std::string name(length, 0);
637
- fin.read(&name[0], length);
638
-
639
- if (model.tensors.find(name.data()) == model.tensors.end()) {
640
- fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
695
+ case 2: // q4_0
696
+ case 3: // q4_1
697
+ assert(ne[0] % 64 == 0);
698
+ break;
699
+ default:
700
+ fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
641
701
  return false;
642
- }
643
-
644
- // split_type = 0: split by columns
645
- // split_type = 1: split by rows
646
- int split_type = 0;
647
-
648
- // split_type = 0:
649
- // regex:
650
- // - tok_embeddings.*
651
- // - layers.*.attention.wo.weight
652
- // - layers.*.feed_forward.w2.weight
653
-
654
- // split_type = 1:
655
- // regex:
656
- // - output.*
657
- // - layers.*.attention.wq.weight
658
- // - layers.*.attention.wk.weight
659
- // - layers.*.attention.wv.weight
660
- // - layers.*.feed_forward.w1.weight
661
- // - layers.*.feed_forward.w3.weight
662
- if (name.find("tok_embeddings") != std::string::npos) {
663
- split_type = 0;
664
- } else if (name.find("layers") != std::string::npos) {
665
- if (name.find("attention.wo.weight") != std::string::npos) {
666
- split_type = 0;
667
- } else if (name.find("feed_forward.w2.weight") != std::string::npos) {
668
- split_type = 0;
669
- } else {
670
- split_type = 1;
671
- }
672
- } else if (name.find("output") != std::string::npos) {
673
- split_type = 1;
674
- }
675
-
676
- auto tensor = model.tensors[name.data()];
677
-
678
- if (n_dims == 1) {
679
- if (ggml_nelements(tensor) != nelements) {
680
- fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
681
- return false;
682
- }
683
- } else {
684
- if (ggml_nelements(tensor)/n_parts != nelements) {
685
- fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
686
- return false;
687
- }
688
- }
689
-
690
- if (n_dims == 1) {
691
- if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
692
- fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
693
- __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
694
- return false;
695
- }
696
- } else {
697
- if (split_type == 0) {
698
- if (tensor->ne[0]/n_parts != ne[0] || tensor->ne[1] != ne[1]) {
699
- fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
700
- __func__, name.data(), tensor->ne[0]/n_parts, tensor->ne[1], ne[0], ne[1]);
701
- return false;
702
- }
703
- } else {
704
- if (tensor->ne[0] != ne[0] || tensor->ne[1]/n_parts != ne[1]) {
705
- fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
706
- __func__, name.data(), tensor->ne[0], tensor->ne[1]/n_parts, ne[0], ne[1]);
707
- return false;
708
- }
709
- }
710
- }
711
-
712
- if (0) {
713
- static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
714
- fprintf(stderr, "%24s - [%5d, %5d], type = %6s, split = %d\n", name.data(), ne[0], ne[1], ftype_str[ftype], split_type);
715
- }
716
-
717
- size_t bpe = 0;
718
-
719
- switch (ftype) {
720
- case 0: bpe = ggml_type_size(GGML_TYPE_F32); break;
721
- case 1: bpe = ggml_type_size(GGML_TYPE_F16); break;
722
- case 2: bpe = ggml_type_size(GGML_TYPE_Q4_0); assert(ne[0] % 64 == 0); break;
723
- case 3: bpe = ggml_type_size(GGML_TYPE_Q4_1); assert(ne[0] % 64 == 0); break;
724
- default:
725
- {
726
- fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
727
- return false;
728
- }
729
- };
730
-
731
- if (n_dims == 1 || n_parts == 1) {
732
- if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
733
- fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
734
- __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
735
- return false;
736
- }
737
-
738
- if (part_id == 0) {
739
- fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
740
- } else {
741
- fin.seekg(ggml_nbytes(tensor), std::ios::cur);
742
- }
743
-
744
- total_size += ggml_nbytes(tensor);
745
- } else {
746
- if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)/n_parts) {
747
- fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
748
- __func__, name.data(), ggml_nbytes(tensor)/n_parts, nelements*bpe);
749
- return false;
750
- }
751
-
752
- if (split_type == 0) {
753
- const int np0 = ne[0];
754
-
755
- const size_t row_size = (tensor->ne[0]/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);
756
- assert(row_size == tensor->nb[1]);
757
-
758
- for (int i1 = 0; i1 < ne[1]; ++i1) {
759
- const size_t offset_row = i1*row_size;
760
- const size_t offset = offset_row + ((part_id*np0)/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);
761
- fin.read(reinterpret_cast<char *>(tensor->data) + offset, row_size/n_parts);
762
- }
763
- } else {
764
- const int np1 = ne[1];
765
-
766
- const size_t row_size = (tensor->ne[0]/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);
767
-
768
- for (int i1 = 0; i1 < ne[1]; ++i1) {
769
- const size_t offset_row = (i1 + part_id*np1)*row_size;
770
- fin.read(reinterpret_cast<char *>(tensor->data) + offset_row, row_size);
771
- }
772
- }
773
-
774
- total_size += ggml_nbytes(tensor)/n_parts;
775
- }
776
-
777
- //fprintf(stderr, "%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
778
- model.n_loaded++;
779
-
780
- // progress
781
- if (progress_callback) {
782
- float current_file_progress = float(size_t(fin.tellg()) - file_offset) / float(file_size - file_offset);
783
- float current_progress = (float(i) + current_file_progress) / float(n_parts);
784
- progress_callback(current_progress, progress_callback_user_data);
785
- }
786
- if (model.n_loaded % 8 == 0) {
787
- fprintf(stderr, ".");
788
- fflush(stderr);
789
- }
790
- }
791
-
792
- fprintf(stderr, " done\n");
702
+ };
793
703
 
794
- fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, model.n_loaded);
795
- if (model.n_loaded == 0) {
796
- fprintf(stderr, "%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__);
797
- } else if (model.n_loaded != (int) model.tensors.size()) {
798
- fprintf(stderr, "%s: ERROR not all tensors loaded from model file - expected %zu, got %d\n", __func__, model.tensors.size(), model.n_loaded);
799
- return false;
704
+ // load the tensor data into memory without copying or reading it
705
+ size_t offset = fin.tellg();
706
+ size_t tensor_data_size = ggml_nbytes(tensor);
707
+ offset = (offset + 31) & -32;
708
+ tensor->data = mm_addr + offset;
709
+ fin.seekg(offset + tensor_data_size);
710
+ total_size += tensor_data_size;
711
+ model.n_loaded++;
712
+
713
+ // progress
714
+ if (progress_callback) {
715
+ double current_progress = size_t(fin.tellg()) / double(file_size);
716
+ progress_callback(current_progress, progress_callback_user_data);
800
717
  }
801
718
  }
802
719
 
803
720
  fin.close();
721
+
722
+ fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, model.n_loaded);
723
+ if (model.n_loaded == 0) {
724
+ fprintf(stderr, "%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__);
725
+ } else if (model.n_loaded != (int) model.tensors.size()) {
726
+ fprintf(stderr, "%s: ERROR not all tensors loaded from model file - expected %zu, got %d\n", __func__, model.tensors.size(), model.n_loaded);
727
+ return false;
728
+ }
804
729
  }
805
730
 
806
- lctx.t_load_us = ggml_time_us() - t_start_us;
731
+ // loading time will be recalculated after the first eval, so
732
+ // we take page faults deferred by mmap() into consideration
733
+ lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
807
734
 
808
735
  if (progress_callback) {
809
736
  progress_callback(1.0, progress_callback_user_data);
@@ -849,6 +776,7 @@ static bool llama_eval_internal(
849
776
  struct ggml_init_params params = {
850
777
  /*.mem_size =*/ buf_compute.size(),
851
778
  /*.mem_buffer =*/ buf_compute.data(),
779
+ /*.no_alloc =*/ false,
852
780
  };
853
781
 
854
782
  struct ggml_context * ctx0 = ggml_init(params);
@@ -856,7 +784,7 @@ static bool llama_eval_internal(
856
784
  // for big prompts, if BLAS is enabled, it is better to use only one thread
857
785
  // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
858
786
  ggml_cgraph gf = {};
859
- gf.n_threads = N > 255 && ggml_cpu_has_blas() ? 1 : n_threads;
787
+ gf.n_threads = N >= 32 && ggml_cpu_has_blas() ? 1 : n_threads;
860
788
 
861
789
  struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
862
790
  memcpy(embd->data, tokens, N*ggml_element_size(embd));
@@ -1126,7 +1054,7 @@ struct llama_tokenizer {
1126
1054
  size_t offs = 0;
1127
1055
  while (offs < text.size()) {
1128
1056
  llama_sp_symbol sym;
1129
- size_t char_len = std::min(text.size() - offs, utf8_len(text[offs]));
1057
+ size_t char_len = Min(text.size() - offs, utf8_len(text[offs]));
1130
1058
  sym.text = text.c_str() + offs;
1131
1059
  sym.n = char_len;
1132
1060
  offs += char_len;
@@ -1291,7 +1219,7 @@ static llama_vocab::id llama_sample_top_p_top_k(
1291
1219
 
1292
1220
  float maxl = -std::numeric_limits<float>::infinity();
1293
1221
  for (const auto & kv : logits_id) {
1294
- maxl = std::max(maxl, kv.first);
1222
+ maxl = Max(maxl, kv.first);
1295
1223
  }
1296
1224
 
1297
1225
  // compute probs for the top k tokens
@@ -1385,8 +1313,7 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
1385
1313
  return false;
1386
1314
  }
1387
1315
  if (magic != LLAMA_FILE_MAGIC) {
1388
- fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str());
1389
- return false;
1316
+ return report_bad_magic(fname_inp.c_str(), magic, LLAMA_FILE_MAGIC);
1390
1317
  }
1391
1318
 
1392
1319
  fout.write((char *) &magic, sizeof(magic));
@@ -1444,7 +1371,7 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
1444
1371
  return false;
1445
1372
  }
1446
1373
 
1447
- std::string word;
1374
+ std::vector<char> word(32);
1448
1375
  vocab.id_to_token.resize(n_vocab);
1449
1376
  for (int i = 0; i < n_vocab; i++) {
1450
1377
  uint32_t len;
@@ -1452,17 +1379,17 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
1452
1379
  fout.write((char *) &len, sizeof(len));
1453
1380
 
1454
1381
  word.resize(len);
1455
- finp.read ((char *) word.data(), len);
1456
- fout.write((char *) word.data(), len);
1382
+ finp.read ((char *) &word[0], len);
1383
+ fout.write((char *) &word[0], len);
1457
1384
 
1458
1385
  float score;
1459
1386
  finp.read ((char *) &score, sizeof(score));
1460
1387
  fout.write((char *) &score, sizeof(score));
1461
1388
 
1462
- vocab.token_to_id[word] = i;
1389
+ vocab.token_to_id[word.data()] = i;
1463
1390
 
1464
1391
  auto &tok_score = vocab.id_to_token[i];
1465
- tok_score.tok = word;
1392
+ tok_score.tok = word.data();
1466
1393
  tok_score.score = score;
1467
1394
  }
1468
1395
  }
@@ -1503,6 +1430,13 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
1503
1430
  std::string name(length, 0);
1504
1431
  finp.read (&name[0], length);
1505
1432
 
1433
+ {
1434
+ // ensure tensor data is aligned
1435
+ uint64_t offset = finp.tellg();
1436
+ offset = (offset + 31) & -32;
1437
+ finp.seekg(offset);
1438
+ }
1439
+
1506
1440
  {
1507
1441
  static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
1508
1442
  printf("%48s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ftype_str[ftype]);
@@ -1558,6 +1492,13 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
1558
1492
  }
1559
1493
  fout.write(&name[0], length);
1560
1494
 
1495
+ {
1496
+ // ensure tensor data is aligned
1497
+ uint64_t offset = fout.tellp();
1498
+ offset = (offset + 31) & -32;
1499
+ fout.seekp(offset);
1500
+ }
1501
+
1561
1502
  if (quantize) {
1562
1503
  printf("quantizing .. ");
1563
1504
  work.resize(nelements); // for quantization
@@ -1655,7 +1596,10 @@ struct llama_context * llama_init_from_file(
1655
1596
 
1656
1597
  if (params.use_mlock) {
1657
1598
  char *err;
1658
- if (!ggml_mlock(ctx->model.ctx, &err)) {
1599
+ if (!ggml_mlock(ctx->model.ctx,
1600
+ ctx->model.mm_addr,
1601
+ ctx->model.mm_length,
1602
+ &err)) {
1659
1603
  fprintf(stderr, "%s\n", err);
1660
1604
  free(err);
1661
1605
  llama_free(ctx);
@@ -1705,6 +1649,10 @@ void llama_free(struct llama_context * ctx) {
1705
1649
  ggml_free(ctx->model.ctx);
1706
1650
  }
1707
1651
 
1652
+ if (ctx->model.mm_addr) {
1653
+ munmap_file(ctx->model.mm_addr, ctx->model.mm_length);
1654
+ }
1655
+
1708
1656
  delete ctx;
1709
1657
  }
1710
1658
 
@@ -1730,7 +1678,11 @@ int llama_eval(
1730
1678
  fprintf(stderr, "%s: failed to eval\n", __func__);
1731
1679
  return 1;
1732
1680
  }
1733
-
1681
+ // get a more accurate load time, upon first eval
1682
+ if (!ctx->has_evaluated_once) {
1683
+ ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
1684
+ ctx->has_evaluated_once = true;
1685
+ }
1734
1686
  return 0;
1735
1687
  }
1736
1688
 
@@ -1823,9 +1775,9 @@ llama_token llama_sample_top_p_top_k(
1823
1775
  void llama_print_timings(struct llama_context * ctx) {
1824
1776
  const int64_t t_end_us = ggml_time_us();
1825
1777
 
1826
- const int32_t n_sample = std::max(1, ctx->n_sample);
1827
- const int32_t n_eval = std::max(1, ctx->n_eval);
1828
- const int32_t n_p_eval = std::max(1, ctx->n_p_eval);
1778
+ const int32_t n_sample = Max(1, ctx->n_sample);
1779
+ const int32_t n_eval = Max(1, ctx->n_eval);
1780
+ const int32_t n_p_eval = Max(1, ctx->n_p_eval);
1829
1781
 
1830
1782
  fprintf(stderr, "\n");
1831
1783
  fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
@@ -1837,7 +1789,6 @@ void llama_print_timings(struct llama_context * ctx) {
1837
1789
 
1838
1790
  void llama_reset_timings(struct llama_context * ctx) {
1839
1791
  ctx->t_start_us = ggml_time_us();
1840
-
1841
1792
  ctx->t_sample_us = ctx->n_sample = 0;
1842
1793
  ctx->t_eval_us = ctx->n_eval = 0;
1843
1794
  ctx->t_p_eval_us = ctx->n_p_eval = 0;