llama_cpp 0.1.0 → 0.1.2

@@ -1,6 +1,7 @@
  // Defines fileno on msys:
  #ifndef _GNU_SOURCE
  #define _GNU_SOURCE
+ #include <cstddef>
  #include <cstdint>
  #include <cstdio>
  #endif
@@ -9,6 +10,9 @@
  #include "llama.h"

  #include "ggml.h"
+ #ifdef GGML_USE_CUBLAS
+ #include "ggml-cuda.h"
+ #endif

  #include <array>
  #include <ctime>
@@ -42,6 +46,7 @@ enum e_model {
  MODEL_65B,
  };

+
  static const size_t MB = 1024*1024;

  // computed for n_ctx == 2048
@@ -50,49 +55,49 @@ static const size_t MB = 1024*1024;

  static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
  {
- static std::map<e_model, size_t> _MEM_REQ_SCRATCH0 = {
+ static std::map<e_model, size_t> k_sizes = {
  { MODEL_7B, 512ull * MB },
  { MODEL_13B, 512ull * MB },
  { MODEL_30B, 512ull * MB },
  { MODEL_65B, 1024ull * MB },
  };
- return _MEM_REQ_SCRATCH0;
+ return k_sizes;
  }

  static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
  {
- static std::map<e_model, size_t> _MEM_REQ_SCRATCH1 = {
+ static std::map<e_model, size_t> k_sizes = {
  { MODEL_7B, 512ull * MB },
  { MODEL_13B, 512ull * MB },
  { MODEL_30B, 512ull * MB },
  { MODEL_65B, 1024ull * MB },
  };
- return _MEM_REQ_SCRATCH1;
+ return k_sizes;
  }

  // 2*n_embd*n_ctx*n_layer*sizeof(float16)
  static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
  {
- static std::map<e_model, size_t> _MEM_REQ_KV_SELF = {
+ static std::map<e_model, size_t> k_sizes = {
  { MODEL_7B, 1026ull * MB },
  { MODEL_13B, 1608ull * MB },
  { MODEL_30B, 3124ull * MB },
  { MODEL_65B, 5120ull * MB },
  };
- return _MEM_REQ_KV_SELF;
+ return k_sizes;
  }

  // this is mostly needed for temporary mul_mat buffers to dequantize the data
  // not actually needed if BLAS is disabled
  static const std::map<e_model, size_t> & MEM_REQ_EVAL()
  {
- static std::map<e_model, size_t> _MEM_REQ_EVAL = {
+ static std::map<e_model, size_t> k_sizes = {
  { MODEL_7B, 768ull * MB },
  { MODEL_13B, 1024ull * MB },
  { MODEL_30B, 1280ull * MB },
  { MODEL_65B, 1536ull * MB },
  };
- return _MEM_REQ_EVAL;
+ return k_sizes;
  }

  // default hparams (LLaMA 7B)
@@ -107,7 +112,7 @@ struct llama_hparams {
  enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;

  bool operator!=(const llama_hparams & other) const {
- return memcmp(this, &other, sizeof(llama_hparams));
+ return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams)));
  }
  };

@@ -402,6 +407,8 @@ enum llama_file_version {
  LLAMA_FILE_VERSION_GGML,
  LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab
  LLAMA_FILE_VERSION_GGJT_V1, // added padding
+ LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format
+ LLAMA_FILE_VERSION_GGJT_V3, // changed Q4 and Q8 quantization format
  };

  struct llama_file_loader {
@@ -420,22 +427,30 @@ struct llama_file_loader {
  }
  void read_magic() {
  uint32_t magic = file.read_u32();
- uint32_t version = 0;

- if (magic != 'ggml') {
- version = file.read_u32();
+ if (magic == LLAMA_FILE_MAGIC_GGML) {
+ file_version = LLAMA_FILE_VERSION_GGML;
+ return;
  }

- if (magic == 'ggml' && version == 0) {
- file_version = LLAMA_FILE_VERSION_GGML;
- } else if (magic == 'ggmf' && version == 1) {
- file_version = LLAMA_FILE_VERSION_GGMF_V1;
- } else if (magic == 'ggjt' && version == 1) {
- file_version = LLAMA_FILE_VERSION_GGJT_V1;
- } else {
- throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
- magic, version);
+ uint32_t version = file.read_u32();
+
+ switch (magic) {
+ case LLAMA_FILE_MAGIC_GGMF:
+ switch (version) {
+ case 1: file_version = LLAMA_FILE_VERSION_GGMF_V1; return;
+ }
+ break;
+ case LLAMA_FILE_MAGIC_GGJT:
+ switch (version) {
+ case 1: file_version = LLAMA_FILE_VERSION_GGJT_V1; return;
+ case 2: file_version = LLAMA_FILE_VERSION_GGJT_V2; return;
+ case 3: file_version = LLAMA_FILE_VERSION_GGJT_V3; return;
+ }
  }
+
+ throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
+ magic, version);
  }
  void read_hparams() {
  hparams.n_vocab = file.read_u32();
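Note: the rewritten read_magic() dispatches on named magic constants from llama.h instead of multi-character literals. A minimal sketch of what those constants presumably look like (each is the four ASCII bytes of the tag packed into a uint32_t; the values below are assumptions, not copied from this gem):

    // presumed llama.h definitions (hex = ASCII bytes of the tag)
    #define LLAMA_FILE_MAGIC_GGML 0x67676d6cu // 'ggml' -- unversioned legacy files
    #define LLAMA_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf' -- versioned, pre-mmap layout
    #define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt' -- versioned, 32-byte aligned tensors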
@@ -482,7 +497,6 @@ struct llama_file_loader {
  case GGML_TYPE_F16:
  case GGML_TYPE_Q4_0:
  case GGML_TYPE_Q4_1:
- case GGML_TYPE_Q4_2:
  case GGML_TYPE_Q5_0:
  case GGML_TYPE_Q5_1:
  case GGML_TYPE_Q8_0:
@@ -494,7 +508,7 @@

  if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
  // skip to the next multiple of 32 bytes
- file.seek(-file.tell() & 31, SEEK_CUR);
+ file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
  }
  shard.file_idx = file_idx;
  shard.file_off = file.tell();
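Note: the `-file.tell() & 31` idiom computes how many bytes to skip so the next tensor starts on a 32-byte boundary; the new static_cast only avoids negating an unsigned value. A small standalone illustration of the arithmetic (not part of the gem):

    #include <cstddef>
    #include <cstdio>

    // bytes needed to advance `off` to the next multiple of 32 (0 if already aligned)
    static std::ptrdiff_t pad32(std::ptrdiff_t off) {
        return -off & 31;
    }

    int main() {
        std::printf("%td %td %td\n", pad32(0), pad32(1), pad32(37)); // prints: 0 31 27
        return 0;
    }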
@@ -527,8 +541,8 @@ struct llama_file_saver {
  write_vocab();
  }
  void write_magic() {
- file.write_u32('ggjt'); // magic
- file.write_u32(1); // version
+ file.write_u32(LLAMA_FILE_MAGIC); // magic
+ file.write_u32(LLAMA_FILE_VERSION); // version
  }
  void write_hparams(enum llama_ftype new_ftype) {
  const llama_hparams & hparams = any_file_loader->hparams;
@@ -558,7 +572,6 @@ struct llama_file_saver {
  case GGML_TYPE_F16:
  case GGML_TYPE_Q4_0:
  case GGML_TYPE_Q4_1:
- case GGML_TYPE_Q4_2:
  case GGML_TYPE_Q5_0:
  case GGML_TYPE_Q5_1:
  case GGML_TYPE_Q8_0:
@@ -570,7 +583,7 @@
  file.write_u32(new_type);
  file.write_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * tensor.ne.size());
  file.write_raw(tensor.name.data(), tensor.name.size());
- file.seek(-file.tell() & 31, SEEK_CUR);
+ file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
  LLAMA_ASSERT(new_size == llama_calc_tensor_size(tensor.ne, new_type));
  file.write_raw(new_data, new_size);
  }
@@ -585,12 +598,12 @@ struct llama_model_loader {
  std::unique_ptr<llama_mmap> mapping;

  llama_model_loader(const std::string & fname_base, bool use_mmap, bool vocab_only) {
- auto first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
+ auto * first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
  file_loaders.emplace_back(first_file);
  uint32_t n_parts = vocab_only ? 1 : guess_n_parts();
  for (uint32_t i = 1; i < n_parts; i++) {
  std::string fname = fname_base + "." + std::to_string(i);
- auto ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
+ auto * ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
  file_loaders.emplace_back(ith_file);
  if (ith_file->hparams != first_file->hparams) {
  throw format("llama.cpp: hparams inconsistent between files");
@@ -637,7 +650,7 @@ struct llama_model_loader {
  }
  }

- struct ggml_tensor * get_tensor(const std::string & name, std::vector<uint32_t> ne) {
+ struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
  auto it = tensors_map.name_to_idx.find(name);
  if (it == tensors_map.name_to_idx.end()) {
  throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
@@ -648,10 +661,10 @@ struct llama_model_loader {
  name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
  }

- return get_tensor_for(lt);
+ return get_tensor_for(lt, backend);
  }

- struct ggml_tensor * get_tensor_for(llama_load_tensor & lt) {
+ struct ggml_tensor * get_tensor_for(llama_load_tensor & lt, ggml_backend backend) {
  struct ggml_tensor * tensor;
  if (lt.ne.size() == 2) {
  tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
@@ -661,12 +674,13 @@ struct llama_model_loader {
  }
  ggml_set_name(tensor, lt.name.c_str());
  LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
+ tensor->backend = backend;
  lt.ggml_tensor = tensor;
  num_ggml_tensors_created++;
  return tensor;
  }

- void done_getting_tensors() {
+ void done_getting_tensors() const {
  if (num_ggml_tensors_created != tensors_map.tensors.size()) {
  throw std::string("llama.cpp: file contained more tensors than expected");
  }
@@ -674,12 +688,16 @@ struct llama_model_loader {

  void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
  size_t data_size = 0;
+ size_t prefetch_size = 0;
  for (const llama_load_tensor & lt : tensors_map.tensors) {
  data_size += lt.size;
+ if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
+ prefetch_size += lt.size;
+ }
  }

  if (use_mmap) {
- mapping.reset(new llama_mmap(&file_loaders.at(0)->file));
+ mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
  if (!lmlock) {
  // Don't call the callback since the actual loading will be lazy
  // and we can't measure it.
@@ -692,6 +710,9 @@

  size_t done_size = 0;
  for (llama_load_tensor & lt : tensors_map.tensors) {
+ if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) {
+ continue;
+ }
  if (progress_callback) {
  progress_callback((float) done_size / data_size, progress_callback_user_data);
  }
@@ -704,9 +725,6 @@
  lmlock->grow_to(done_size);
  }
  }
- if (progress_callback) {
- progress_callback(1.0f, progress_callback_user_data);
- }
  }

  void load_data_for(llama_load_tensor & lt) {
@@ -808,9 +826,9 @@ static bool kv_cache_init(
  struct llama_context_params llama_context_default_params() {
  struct llama_context_params result = {
  /*.n_ctx =*/ 512,
- /*.n_parts =*/ -1,
+ /*.gpu_layers =*/ 0,
  /*.seed =*/ -1,
- /*.f16_kv =*/ false,
+ /*.f16_kv =*/ true,
  /*.logits_all =*/ false,
  /*.vocab_only =*/ false,
  /*.use_mmap =*/ true,
@@ -831,6 +849,21 @@ bool llama_mlock_supported() {
  return llama_mlock::SUPPORTED;
  }

+ void llama_init_backend() {
+ ggml_time_init();
+
+ // needed to initialize f16 tables
+ {
+ struct ggml_init_params params = { 0, NULL, false };
+ struct ggml_context * ctx = ggml_init(params);
+ ggml_free(ctx);
+ }
+ }
+
+ int64_t llama_time_us() {
+ return ggml_time_us();
+ }
+
  //
  // model loading
  //
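Note: this version adds llama_init_backend() (one-time ggml setup) and a GPU layer count in the context parameters, read later as params.n_gpu_layers. A minimal caller-side sketch, assuming the public llama.h API bundled with this gem; the model path and layer count are placeholders:

    #include "llama.h"

    int main() {
        llama_init_backend(); // one-time ggml/f16 table initialization, new in this release

        llama_context_params params = llama_context_default_params();
        params.n_ctx        = 512;
        params.n_gpu_layers = 20; // 0 keeps all layers on the CPU; only honored in cuBLAS builds

        struct llama_context * ctx = llama_init_from_file("models/7B/ggml-model-q4_0.bin", params);
        if (ctx == NULL) {
            return 1;
        }
        llama_free(ctx);
        return 0;
    }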
@@ -839,9 +872,12 @@ static const char *llama_file_version_name(llama_file_version version) {
  switch (version) {
  case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)";
  case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
- case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (latest)";
- default: LLAMA_ASSERT(false);
+ case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1405)";
+ case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (pre #1508)";
+ case LLAMA_FILE_VERSION_GGJT_V3: return "ggjt v3 (latest)";
  }
+
+ return "unknown";
  }

  static const char *llama_ftype_name(enum llama_ftype ftype) {
@@ -852,7 +888,6 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
  case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
  case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
  return "mostly Q4_1, some F16";
- case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2";
  case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
  case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
  case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
@@ -874,6 +909,7 @@ static void llama_model_load_internal(
  const std::string & fname,
  llama_context & lctx,
  int n_ctx,
+ int n_gpu_layers,
  ggml_type memory_type,
  bool use_mmap,
  bool use_mlock,
@@ -918,35 +954,32 @@ static void llama_model_load_internal(
  fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
  }

+ if (file_version < LLAMA_FILE_VERSION_GGJT_V2) {
+ if (hparams.ftype != LLAMA_FTYPE_ALL_F32 &&
+ hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 &&
+ hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) {
+ throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)");
+ }
+ }
+
+ if (file_version < LLAMA_FILE_VERSION_GGJT_V3) {
+ if (hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
+ hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_1 ||
+ hparams.ftype == LLAMA_FTYPE_MOSTLY_Q8_0) {
+ throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)");
+ }
+ }
+
  if (vocab_only) {
  return;
  }

  auto & ctx = model.ctx;

- size_t ctx_size, mmapped_size;
+ size_t ctx_size;
+ size_t mmapped_size;
  ml->calc_sizes(&ctx_size, &mmapped_size);
- fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0);
-
- // print memory requirements
- {
- const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
-
- // this is the total memory required to run the inference
- const size_t mem_required =
- ctx_size +
- mmapped_size +
- MEM_REQ_SCRATCH0().at(model.type) +
- MEM_REQ_SCRATCH1().at(model.type) +
- MEM_REQ_EVAL().at(model.type);
-
- // this is the memory required by one llama_state
- const size_t mem_required_state =
- scale*MEM_REQ_KV_SELF().at(model.type);
-
- fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
- mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
- }
+ fprintf(stderr, "%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0);

  // create the ggml context
  {
@@ -968,43 +1001,102 @@ static void llama_model_load_internal(
  }
  }

+ #ifdef GGML_USE_CUBLAS
+ #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CUDA
+ #else
+ #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
+ #endif
+
  // prepare memory for the weights
+ size_t vram_total = 0;
  {
- const auto & hparams = model.hparams;
-
  const uint32_t n_embd = hparams.n_embd;
  const uint32_t n_layer = hparams.n_layer;
  const uint32_t n_vocab = hparams.n_vocab;

  ml->ggml_ctx = ctx;

- model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab});
- model.norm = ml->get_tensor("norm.weight", {n_embd});
- model.output = ml->get_tensor("output.weight", {n_embd, n_vocab});
+ model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU);
+ model.norm = ml->get_tensor("norm.weight", {n_embd}, GGML_BACKEND_CPU);
+
+ // "output" tensor
+ {
+ ggml_backend backend_output;
+ if (n_gpu_layers > int(n_layer)) { // NOLINT
+ backend_output = LLAMA_BACKEND_OFFLOAD;
+ } else {
+ backend_output = GGML_BACKEND_CPU;
+ }
+
+ model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, backend_output);
+ }
+
+ const int i_gpu_start = n_layer - n_gpu_layers;

  model.layers.resize(n_layer);
  for (uint32_t i = 0; i < n_layer; ++i) {
+ const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+
  auto & layer = model.layers[i];

  std::string layers_i = "layers." + std::to_string(i);

- layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd});
+ layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);
+
+ layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend);
+ layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend);
+ layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend);
+ layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend);

- layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd});
- layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd});
- layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd});
- layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd});
+ layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);

- layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd});
+ layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend);
+ layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend);
+ layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend);

- layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff});
- layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd});
- layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff});
+ if (backend == GGML_BACKEND_CUDA) {
+ vram_total +=
+ ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
+ ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.attention_norm) +
+ ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
+ }
  }
  }

  ml->done_getting_tensors();

+ // print memory requirements
+ {
+ const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
+
+ // this is the total memory required to run the inference
+ const size_t mem_required =
+ ctx_size +
+ mmapped_size - vram_total + // weights in VRAM not in memory
+ MEM_REQ_SCRATCH0().at(model.type) +
+ MEM_REQ_SCRATCH1().at(model.type) +
+ MEM_REQ_EVAL().at(model.type);
+
+ // this is the memory required by one llama_state
+ const size_t mem_required_state =
+ scale*MEM_REQ_KV_SELF().at(model.type);
+
+ fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
+ mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
+
+ #ifdef GGML_USE_CUBLAS
+ const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
+
+ fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
+ if (n_gpu_layers > (int) hparams.n_layer) {
+ fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
+ }
+ fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+ #else
+ (void) n_gpu_layers;
+ #endif
+ }
+
  // populate `tensors_by_name`
  for (llama_load_tensor & lt : ml->tensors_map.tensors) {
  model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
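Note: the CPU/GPU split is i_gpu_start = n_layer - n_gpu_layers, so the last n_gpu_layers blocks (plus the output tensor once n_gpu_layers exceeds n_layer) get the offload backend. A standalone illustration of that arithmetic with made-up numbers:

    #include <cstdio>

    int main() {
        const int n_layer      = 32; // e.g. a 7B model
        const int n_gpu_layers = 20; // requested through llama_context_params

        const int i_gpu_start = n_layer - n_gpu_layers; // first offloaded layer index
        std::printf("layers 0..%d stay on the CPU, layers %d..%d go to the GPU\n",
                    i_gpu_start - 1, i_gpu_start, n_layer - 1);
        // prints: layers 0..11 stay on the CPU, layers 12..31 go to the GPU
        return 0;
    }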
@@ -1012,6 +1104,33 @@ static void llama_model_load_internal(

  ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);

+ #ifdef GGML_USE_CUBLAS
+ {
+ size_t done_size = 0;
+ size_t data_size = 0;
+ for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+ data_size += lt.size;
+ if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
+ done_size += lt.size;
+ }
+ }
+ for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+ if (lt.ggml_tensor->backend != GGML_BACKEND_CUDA) {
+ continue;
+ }
+ if (progress_callback) {
+ progress_callback((float) done_size / data_size, progress_callback_user_data);
+ }
+ ggml_cuda_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
+ done_size += lt.size;
+ }
+ }
+ #endif // GGML_USE_CUBLAS
+
+ if (progress_callback) {
+ progress_callback(1.0f, progress_callback_user_data);
+ }
+
  model.mapping = std::move(ml->mapping);

  // loading time will be recalculate after the first eval, so
@@ -1023,6 +1142,7 @@ static bool llama_model_load(
  const std::string & fname,
  llama_context & lctx,
  int n_ctx,
+ int n_gpu_layers,
  ggml_type memory_type,
  bool use_mmap,
  bool use_mlock,
@@ -1030,7 +1150,7 @@ static bool llama_model_load(
  llama_progress_callback progress_callback,
  void *progress_callback_user_data) {
  try {
- llama_model_load_internal(fname, lctx, n_ctx, memory_type, use_mmap, use_mlock,
+ llama_model_load_internal(fname, lctx, n_ctx, n_gpu_layers, memory_type, use_mmap, use_mlock,
  vocab_only, progress_callback, progress_callback_user_data);
  return true;
  } catch (const std::string & err) {
@@ -1052,6 +1172,13 @@ static bool llama_eval_internal(
  const int n_tokens,
  const int n_past,
  const int n_threads) {
+
+ // enforce that the first token is BOS
+ if (n_past == 0 && tokens[0] != llama_token_bos()) {
+ fprintf(stderr, "%s: first token must be BOS\n", __func__);
+ return false;
+ }
+
  const int64_t t_start_us = ggml_time_us();

  const int N = n_tokens;
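Note: evaluation now rejects a fresh context whose first token is not BOS, so callers should tokenize with the BOS flag set. A minimal sketch, assuming the llama_tokenize() signature from this release's llama.h (the buffer size is an arbitrary choice for the example):

    #include <vector>
    #include "llama.h"

    // tokenize `text` with a leading BOS token so llama_eval() accepts it at n_past == 0
    static std::vector<llama_token> tokenize_with_bos(llama_context * ctx, const char * text) {
        std::vector<llama_token> tokens(512); // assumed upper bound for this sketch
        const int n = llama_tokenize(ctx, text, tokens.data(), (int) tokens.size(), /*add_bos=*/ true);
        tokens.resize(n > 0 ? n : 0); // a negative return means the buffer was too small
        return tokens;
    }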
@@ -1059,7 +1186,7 @@ static bool llama_eval_internal(
  const auto & model = lctx.model;
  const auto & hparams = model.hparams;

- auto & kv_self = model.kv_self;
+ const auto & kv_self = model.kv_self;

  LLAMA_ASSERT(!!kv_self.ctx);

@@ -1103,17 +1230,15 @@ static bool llama_eval_internal(
  {
  cur = ggml_rms_norm(ctx0, inpL);

- // cur = attention_norm*cur
- cur = ggml_mul(ctx0,
- ggml_repeat(ctx0, model.layers[il].attention_norm, cur),
- cur);
+ // cur = cur*attention_norm(broadcasted)
+ cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm);
  }

  // self-attention
  {
  // compute Q and K and RoPE them
- struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
- struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
+ struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
+ struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
  ggml_set_name(Qcur, "Qcur");
  ggml_set_name(Kcur, "Kcur");

@@ -1154,17 +1279,19 @@ static bool llama_eval_internal(
  struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head));
  ggml_set_name(KQ_scale, "1/sqrt(n_embd/n_head)");

- struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
+ // KQ_scaled shape [n_past + N, N, n_head, 1]
+ struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
  ggml_set_name(KQ_scaled, "KQ_scaled");

  // KQ_masked = mask_past(KQ_scaled)
- struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
+ struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
  ggml_set_name(KQ_masked, "KQ_masked");

  // KQ = soft_max(KQ_masked)
- struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
  ggml_set_name(KQ_soft_max, "KQ_soft_max");

+
  // split cached V into n_head heads
  struct ggml_tensor * V =
  ggml_view_3d(ctx0, kv_self.v,
@@ -1211,10 +1338,8 @@ static bool llama_eval_internal(
  {
  cur = ggml_rms_norm(ctx0, inpFF);

- // cur = ffn_norm*cur
- cur = ggml_mul(ctx0,
- ggml_repeat(ctx0, model.layers[il].ffn_norm, cur),
- cur);
+ // cur = cur*ffn_norm(broadcasted)
+ cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
  }

  struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
@@ -1251,10 +1376,8 @@ static bool llama_eval_internal(

  inpL = ggml_rms_norm(ctx0, inpL);

- // inpL = norm*inpL
- inpL = ggml_mul(ctx0,
- ggml_repeat(ctx0, model.norm, inpL),
- inpL);
+ // inpL = inpL*norm(broadcasted)
+ inpL = ggml_mul(ctx0, inpL, model.norm);

  embeddings = inpL;
  }
@@ -1265,7 +1388,7 @@ static bool llama_eval_internal(
  lctx.use_buf(ctx0, -1);

  // logits -> probs
- //inpL = ggml_soft_max(ctx0, inpL);
+ //inpL = ggml_soft_max_inplace(ctx0, inpL);

  // run the computation
  ggml_build_forward_expand(&gf, inpL);
@@ -1303,7 +1426,7 @@ static bool llama_eval_internal(
  }

  // extract embeddings
- if (lctx.embedding.size()) {
+ if (!lctx.embedding.empty()) {
  auto & embedding_out = lctx.embedding;

  embedding_out.resize(n_embd);
@@ -1354,6 +1477,8 @@ struct llama_sp_symbol {
  size_t n;
  };

+ static_assert(std::is_trivially_copyable<llama_sp_symbol>::value, "llama_sp_symbol is not trivially copyable");
+
  struct llama_sp_bigram {
  struct comparator {
  bool operator()(llama_sp_bigram & l, llama_sp_bigram & r) {
@@ -1386,7 +1511,7 @@ struct llama_tokenizer {
  sym.prev = index - 1;
  sym.next = offs == text.size() ? -1 : index + 1;
  index++;
- symbols_.emplace_back(std::move(sym));
+ symbols_.emplace_back(sym);
  }

  // seed the work queue with all possible 2-character tokens.
@@ -1477,12 +1602,12 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
  llama_tokenizer tokenizer(vocab);
  std::vector<llama_vocab::id> output;

- if (text.size() == 0) {
+ if (text.empty()) {
  return output;
  }

  if (bos) {
- output.push_back(1);
+ output.push_back(llama_token_bos());
  }

  tokenizer.tokenize(text, output);
@@ -1713,7 +1838,7 @@ void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_dat
  const int64_t t_start_sample_us = ggml_time_us();

  for (size_t i = 0; i < candidates->size; ++i) {
- auto token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id);
+ const auto * token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id);
  if (token_iter == last_tokens + last_tokens_size) {
  continue;
  }
@@ -1791,7 +1916,7 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_
  float k = powf((epsilon_hat * powf(2, *mu)) / (1 - powf(N, -epsilon_hat)), 1 / s_hat);

  // Sample the next word X using top-k sampling
- llama_sample_top_k(nullptr, candidates, int(k));
+ llama_sample_top_k(nullptr, candidates, int(k), 1);
  if (ctx) {
  ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
  }
@@ -1857,7 +1982,7 @@ llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_da
  const int64_t t_start_sample_us = ggml_time_us();

  // Find max element
- auto max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
+ auto * max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
  return a.logit < b.logit;
  });

@@ -1900,7 +2025,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  switch (ftype) {
  case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
  case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
- case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
  case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
  case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
  case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
@@ -1911,7 +2035,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  nthread = std::thread::hardware_concurrency();
  }

- std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false,
+ std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false,
  /*vocab_only*/ false));
  llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);

@@ -1965,7 +2089,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  } else if (tensor.type == GGML_TYPE_F16) {
  f32_conv_buf.resize(nelements * sizeof(float));
  f32_data = (float *) f32_conv_buf.addr;
- auto f16_data = (const ggml_fp16_t *) tensor.data;
+ const auto * f16_data = (const ggml_fp16_t *) tensor.data;
  for (size_t i = 0; i < nelements; i++) {
  f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
  }
@@ -1996,21 +2120,31 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  size_t first = counter; counter += chunk_size;
  if (first >= nelements) {
  if (!local_hist.empty()) {
- for (int j=0; j<int(local_hist.size()); ++j) hist_cur[j] += local_hist[j];
+ for (int j=0; j<int(local_hist.size()); ++j) {
+ hist_cur[j] += local_hist[j];
+ }
  new_size += local_size;
  }
  break;
  }
  lock.unlock();
  size_t last = std::min(nelements, first + chunk_size);
- if (local_hist.empty()) local_hist.resize(hist_cur.size(), 0);
+ if (local_hist.empty()) {
+ local_hist.resize(hist_cur.size(), 0);
+ }
  local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
  }
  };
- if (int(workers.size()) < nthread_use - 1) workers.resize(nthread_use - 1);
- for (int it = 0; it < nthread_use - 1; ++it) workers[it] = std::thread(compute);
+ if ((int) workers.size() < nthread_use - 1) {
+ workers.resize(nthread_use - 1);
+ }
+ for (int it = 0; it < nthread_use - 1; ++it) {
+ workers[it] = std::thread(compute);
+ }
  compute();
- for (int it = 0; it < nthread_use - 1; ++it) workers[it].join();
+ for (int it = 0; it < nthread_use - 1; ++it) {
+ workers[it].join();
+ }
  }

  printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
@@ -2067,7 +2201,7 @@ struct llama_context * llama_init_from_file(
  unsigned * cur_percentage_p = (unsigned *) ctx;
  unsigned percentage = (unsigned) (100 * progress);
  while (percentage > *cur_percentage_p) {
- ++*cur_percentage_p;
+ *cur_percentage_p = percentage;
  fprintf(stderr, ".");
  fflush(stderr);
  if (percentage >= 100) {
@@ -2082,7 +2216,7 @@

  ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;

- if (!llama_model_load(path_model, *ctx, params.n_ctx, memory_type,
+ if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_gpu_layers, memory_type,
  params.use_mmap, params.use_mlock, params.vocab_only,
  params.progress_callback, params.progress_callback_user_data)) {
  fprintf(stderr, "%s: failed to load model\n", __func__);
@@ -2160,7 +2294,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
  {
  uint32_t magic;
  fin.read((char *) &magic, sizeof(magic));
- if (magic != 'ggla') {
+ if (magic != LLAMA_FILE_MAGIC_GGLA) {
  fprintf(stderr, "%s: bad file magic\n", __func__);
  return 1;
  }
@@ -2208,7 +2342,8 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
  fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
  model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false));

- size_t ctx_size, mmapped_size;
+ size_t ctx_size;
+ size_t mmapped_size;
  model_loader->calc_sizes(&ctx_size, &mmapped_size);
  base_buf.resize(ctx_size);

@@ -2223,7 +2358,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *

  // maybe this should in llama_model_loader
  if (model_loader->use_mmap) {
- model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ false));
+ model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0));
  }
  }

@@ -2247,8 +2382,12 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
  fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
  }

- std::string name(length, 0);
- fin.read(&name[0], length);
+ std::string name;
+ {
+ char buf[1024];
+ fin.read(buf, length);
+ name = std::string(buf, length);
+ }

  // check for lora suffix and get the type of tensor
  const std::string lora_suffix = ".lora";
@@ -2263,7 +2402,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
  base_name.erase(pos);
  // fprintf(stderr, "%s: %s => %s (lora type %s) ", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());

- if (model_tensors.find(base_name.data()) == model_tensors.end()) {
+ if (model_tensors.find(base_name) == model_tensors.end()) {
  fprintf(stderr, "%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
  return 1;
  }
@@ -2312,7 +2451,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
  }
  size_t idx = model_loader->tensors_map.name_to_idx[base_name];
  llama_load_tensor & lt = model_loader->tensors_map.tensors[idx];
- base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] });
+ base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU);
  lt.data = (uint8_t *) lt.ggml_tensor->data;
  model_loader->load_data_for(lt);
  lt.ggml_tensor->data = lt.data;
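Note: the LoRA loader now requests base tensors through the backend-aware get_tensor() and checks the file magic against a named constant. For orientation, an adapter is applied through the public entry point; a minimal sketch, assuming llama_apply_lora_from_file() keeps its usual signature (both paths are placeholders):

    #include "llama.h"

    // apply a LoRA adapter on top of an already loaded context;
    // a base model path lets the loader read unmodified weights, or pass NULL to patch in place
    static bool apply_adapter(llama_context * ctx) {
        const int rc = llama_apply_lora_from_file(ctx,
                                                  "lora/adapter.bin",              // placeholder
                                                  "models/7B/ggml-model-f16.bin",  // placeholder, may be NULL
                                                  /*n_threads=*/ 4);
        return rc == 0;
    }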
@@ -2343,7 +2482,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *

  if (scaling != 1.0f) {
  ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
- BA = ggml_scale(lora_ctx, BA, scale_tensor);
+ BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor);
  }

  ggml_tensor * r;
@@ -2365,8 +2504,9 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
  lora_tensors.clear();

  n_tensors++;
- if (n_tensors % 4 == 0)
+ if (n_tensors % 4 == 0) {
  fprintf(stderr, ".");
+ }
  }
  }

@@ -2395,7 +2535,7 @@ int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
  return ctx->model.kv_self.n;
  }

- #define LLAMA_MAX_RNG_STATE 64*1024
+ #define LLAMA_MAX_RNG_STATE (64*1024)

  void llama_set_rng_seed(struct llama_context * ctx, int seed) {
  if (seed < 0) {
@@ -2436,8 +2576,8 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
  }

  // Copies the state to the specified destination address
- size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
- uint8_t * out = dest;
+ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
+ uint8_t * out = dst;

  // copy rng
  {
@@ -2497,7 +2637,9 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {

  if (kv_size) {
  const size_t elt_size = ggml_element_size(kv_self.k);
+
  char buffer[4096];
+
  ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
  ggml_cgraph gf{};
  gf.n_threads = 1;
@@ -2521,10 +2663,12 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
  ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
  ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
  ggml_graph_compute(cpy_ctx, &gf);
+
+ ggml_free(cpy_ctx);
  }
  }

- const size_t written = out - dest;
+ const size_t written = out - dst;
  const size_t max_size = llama_get_state_size(ctx);

  LLAMA_ASSERT(written <= max_size);
@@ -2533,16 +2677,16 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
  }

  // Sets the state reading from the specified source address
- size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
- const uint8_t * in = src;
+ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
+ uint8_t * inp = src;

  // set rng
  {
  size_t rng_size;
  char rng_buf[LLAMA_MAX_RNG_STATE];

- memcpy(&rng_size, in, sizeof(rng_size)); in += sizeof(rng_size);
- memcpy(&rng_buf[0], in, LLAMA_MAX_RNG_STATE); in += LLAMA_MAX_RNG_STATE;
+ memcpy(&rng_size, inp, sizeof(rng_size)); inp += sizeof(rng_size);
+ memcpy(&rng_buf[0], inp, LLAMA_MAX_RNG_STATE); inp += LLAMA_MAX_RNG_STATE;

  std::stringstream rng_ss;
  rng_ss.str(std::string(&rng_buf[0], rng_size));
@@ -2556,30 +2700,30 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
  size_t logits_cap;
  size_t logits_size;

- memcpy(&logits_cap, in, sizeof(logits_cap)); in += sizeof(logits_cap);
- memcpy(&logits_size, in, sizeof(logits_size)); in += sizeof(logits_size);
+ memcpy(&logits_cap, inp, sizeof(logits_cap)); inp += sizeof(logits_cap);
+ memcpy(&logits_size, inp, sizeof(logits_size)); inp += sizeof(logits_size);

  LLAMA_ASSERT(ctx->logits.capacity() == logits_cap);

  if (logits_size) {
  ctx->logits.resize(logits_size);
- memcpy(ctx->logits.data(), in, logits_size * sizeof(float));
+ memcpy(ctx->logits.data(), inp, logits_size * sizeof(float));
  }

- in += logits_cap * sizeof(float);
+ inp += logits_cap * sizeof(float);
  }

  // set embeddings
  {
  size_t embedding_size;

- memcpy(&embedding_size, in, sizeof(embedding_size)); in += sizeof(embedding_size);
+ memcpy(&embedding_size, inp, sizeof(embedding_size)); inp += sizeof(embedding_size);

  LLAMA_ASSERT(ctx->embedding.capacity() == embedding_size);

  if (embedding_size) {
- memcpy(ctx->embedding.data(), in, embedding_size * sizeof(float));
- in += embedding_size * sizeof(float);
+ memcpy(ctx->embedding.data(), inp, embedding_size * sizeof(float));
+ inp += embedding_size * sizeof(float);
  }
  }

@@ -2594,25 +2738,27 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
  size_t kv_size;
  int kv_ntok;

- memcpy(&kv_size, in, sizeof(kv_size)); in += sizeof(kv_size);
- memcpy(&kv_ntok, in, sizeof(kv_ntok)); in += sizeof(kv_ntok);
+ memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
+ memcpy(&kv_ntok, inp, sizeof(kv_ntok)); inp += sizeof(kv_ntok);

  if (kv_size) {
  LLAMA_ASSERT(kv_self.buf.size == kv_size);

  const size_t elt_size = ggml_element_size(kv_self.k);
+
  char buffer[4096];
+
  ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
  ggml_cgraph gf{};
  gf.n_threads = 1;

  ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
- kin3d->data = (void *) in;
- in += ggml_nbytes(kin3d);
+ kin3d->data = (void *) inp;
+ inp += ggml_nbytes(kin3d);

  ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
- vin3d->data = (void *) in;
- in += ggml_nbytes(vin3d);
+ vin3d->data = (void *) inp;
+ inp += ggml_nbytes(vin3d);

  ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
  n_embd, kv_ntok, n_layer,
@@ -2625,12 +2771,14 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
  ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
  ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
  ggml_graph_compute(cpy_ctx, &gf);
+
+ ggml_free(cpy_ctx);
  }

  ctx->model.kv_self.n = kv_ntok;
  }

- const size_t nread = in - src;
+ const size_t nread = inp - src;
  const size_t max_size = llama_get_state_size(ctx);

  LLAMA_ASSERT(nread <= max_size);
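Note: llama_copy_state_data() and llama_set_state_data() now take plain uint8_t buffers named dst/src and free their temporary copy contexts. A minimal round-trip sketch using the state API as declared in this release's llama.h:

    #include <cstdint>
    #include <vector>
    #include "llama.h"

    // snapshot the full context state (RNG, logits, embeddings, KV cache) and restore it later
    static void snapshot_and_restore(llama_context * ctx) {
        std::vector<uint8_t> state(llama_get_state_size(ctx)); // upper bound on the serialized size

        const size_t written = llama_copy_state_data(ctx, state.data()); // serialize
        // ... run evaluations that mutate the context ...
        const size_t read = llama_set_state_data(ctx, state.data());     // roll back

        (void) written;
        (void) read;
    }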
@@ -2646,7 +2794,7 @@ bool llama_load_session_file(struct llama_context * ctx, const char * path_sessi
  const uint32_t magic = file.read_u32();
  const uint32_t version = file.read_u32();

- if (!(magic == LLAMA_SESSION_MAGIC && version == LLAMA_SESSION_VERSION)) {
+ if (magic != LLAMA_SESSION_MAGIC || version != LLAMA_SESSION_VERSION) {
  fprintf(stderr, "%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
  return false;
  }
@@ -2727,11 +2875,14 @@ int llama_eval(
  fprintf(stderr, "%s: failed to eval\n", __func__);
  return 1;
  }
+
  // get a more accurate load time, upon first eval
+ // TODO: fix this
  if (!ctx->has_evaluated_once) {
  ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
  ctx->has_evaluated_once = true;
  }
+
  return 0;
  }

@@ -2805,9 +2956,9 @@ void llama_print_timings(struct llama_context * ctx) {

  fprintf(stderr, "\n");
  fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
- fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample);
+ fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample);
  fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval);
- fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval);
+ fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval);
  fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0);
  }