llama_cpp 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
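Before the diff itself, a quick orientation on the API-visible changes in this range: the vendored llama.cpp adds llama_init_backend() and llama_time_us(), replaces the n_parts field of llama_context_params with n_gpu_layers (cuBLAS layer offloading), flips the f16_kv default to true, and introduces the GGJT v2/v3 file versions, whose checks reject older quantized models. The snippet below is a rough, hypothetical sketch of a caller exercising the updated C API, not code from the package; the model path and layer count are placeholders and error handling is minimal.

    #include "llama.h"

    #include <cstdint>
    #include <cstdio>

    int main() {
        // New in this range: one-time backend initialization (ggml timing + f16 tables).
        llama_init_backend();

        llama_context_params params = llama_context_default_params();
        params.n_gpu_layers = 32; // placeholder value; only takes effect in GGML_USE_CUBLAS builds

        // "ggml-model.bin" is a placeholder path; note that pre-GGJT-v3 quantized
        // models are rejected by the new file-version checks.
        const int64_t t_start = llama_time_us();
        llama_context * ctx = llama_init_from_file("ggml-model.bin", params);
        if (ctx == NULL) {
            fprintf(stderr, "failed to load model\n");
            return 1;
        }
        fprintf(stderr, "load took %lld us\n", (long long) (llama_time_us() - t_start));

        llama_free(ctx);
        return 0;
    }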
@@ -1,6 +1,7 @@
  // Defines fileno on msys:
  #ifndef _GNU_SOURCE
  #define _GNU_SOURCE
+ #include <cstddef>
  #include <cstdint>
  #include <cstdio>
  #endif
@@ -9,6 +10,9 @@
  #include "llama.h"

  #include "ggml.h"
+ #ifdef GGML_USE_CUBLAS
+ #include "ggml-cuda.h"
+ #endif

  #include <array>
  #include <ctime>
@@ -42,6 +46,7 @@ enum e_model {
      MODEL_65B,
  };

+
  static const size_t MB = 1024*1024;

  // computed for n_ctx == 2048
@@ -50,49 +55,49 @@ static const size_t MB = 1024*1024;

  static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
  {
-     static std::map<e_model, size_t> _MEM_REQ_SCRATCH0 = {
+     static std::map<e_model, size_t> k_sizes = {
          { MODEL_7B, 512ull * MB },
          { MODEL_13B, 512ull * MB },
          { MODEL_30B, 512ull * MB },
          { MODEL_65B, 1024ull * MB },
      };
-     return _MEM_REQ_SCRATCH0;
+     return k_sizes;
  }

  static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
  {
-     static std::map<e_model, size_t> _MEM_REQ_SCRATCH1 = {
+     static std::map<e_model, size_t> k_sizes = {
          { MODEL_7B, 512ull * MB },
          { MODEL_13B, 512ull * MB },
          { MODEL_30B, 512ull * MB },
          { MODEL_65B, 1024ull * MB },
      };
-     return _MEM_REQ_SCRATCH1;
+     return k_sizes;
  }

  // 2*n_embd*n_ctx*n_layer*sizeof(float16)
  static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
  {
-     static std::map<e_model, size_t> _MEM_REQ_KV_SELF = {
+     static std::map<e_model, size_t> k_sizes = {
          { MODEL_7B, 1026ull * MB },
          { MODEL_13B, 1608ull * MB },
          { MODEL_30B, 3124ull * MB },
          { MODEL_65B, 5120ull * MB },
      };
-     return _MEM_REQ_KV_SELF;
+     return k_sizes;
  }

  // this is mostly needed for temporary mul_mat buffers to dequantize the data
  // not actually needed if BLAS is disabled
  static const std::map<e_model, size_t> & MEM_REQ_EVAL()
  {
-     static std::map<e_model, size_t> _MEM_REQ_EVAL = {
+     static std::map<e_model, size_t> k_sizes = {
          { MODEL_7B, 768ull * MB },
          { MODEL_13B, 1024ull * MB },
          { MODEL_30B, 1280ull * MB },
          { MODEL_65B, 1536ull * MB },
      };
-     return _MEM_REQ_EVAL;
+     return k_sizes;
  }

  // default hparams (LLaMA 7B)
@@ -107,7 +112,7 @@ struct llama_hparams {
      enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;

      bool operator!=(const llama_hparams & other) const {
-         return memcmp(this, &other, sizeof(llama_hparams));
+         return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams)));
      }
  };

@@ -402,6 +407,8 @@ enum llama_file_version {
      LLAMA_FILE_VERSION_GGML,
      LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab
      LLAMA_FILE_VERSION_GGJT_V1, // added padding
+     LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format
+     LLAMA_FILE_VERSION_GGJT_V3, // changed Q4 and Q8 quantization format
  };

  struct llama_file_loader {
@@ -420,22 +427,30 @@ struct llama_file_loader {
      }
      void read_magic() {
          uint32_t magic = file.read_u32();
-         uint32_t version = 0;

-         if (magic != 'ggml') {
-             version = file.read_u32();
+         if (magic == LLAMA_FILE_MAGIC_GGML) {
+             file_version = LLAMA_FILE_VERSION_GGML;
+             return;
          }

-         if (magic == 'ggml' && version == 0) {
-             file_version = LLAMA_FILE_VERSION_GGML;
-         } else if (magic == 'ggmf' && version == 1) {
-             file_version = LLAMA_FILE_VERSION_GGMF_V1;
-         } else if (magic == 'ggjt' && version == 1) {
-             file_version = LLAMA_FILE_VERSION_GGJT_V1;
-         } else {
-             throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
-                          magic, version);
+         uint32_t version = file.read_u32();
+
+         switch (magic) {
+             case LLAMA_FILE_MAGIC_GGMF:
+                 switch (version) {
+                     case 1: file_version = LLAMA_FILE_VERSION_GGMF_V1; return;
+                 }
+                 break;
+             case LLAMA_FILE_MAGIC_GGJT:
+                 switch (version) {
+                     case 1: file_version = LLAMA_FILE_VERSION_GGJT_V1; return;
+                     case 2: file_version = LLAMA_FILE_VERSION_GGJT_V2; return;
+                     case 3: file_version = LLAMA_FILE_VERSION_GGJT_V3; return;
+                 }
          }
+
+         throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
+                      magic, version);
      }
      void read_hparams() {
          hparams.n_vocab = file.read_u32();
@@ -482,7 +497,6 @@ struct llama_file_loader {
              case GGML_TYPE_F16:
              case GGML_TYPE_Q4_0:
              case GGML_TYPE_Q4_1:
-             case GGML_TYPE_Q4_2:
              case GGML_TYPE_Q5_0:
              case GGML_TYPE_Q5_1:
              case GGML_TYPE_Q8_0:
@@ -494,7 +508,7 @@ struct llama_file_loader {

          if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
              // skip to the next multiple of 32 bytes
-             file.seek(-file.tell() & 31, SEEK_CUR);
+             file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
          }
          shard.file_idx = file_idx;
          shard.file_off = file.tell();
@@ -527,8 +541,8 @@ struct llama_file_saver {
          write_vocab();
      }
      void write_magic() {
-         file.write_u32('ggjt'); // magic
-         file.write_u32(1); // version
+         file.write_u32(LLAMA_FILE_MAGIC); // magic
+         file.write_u32(LLAMA_FILE_VERSION); // version
      }
      void write_hparams(enum llama_ftype new_ftype) {
          const llama_hparams & hparams = any_file_loader->hparams;
@@ -558,7 +572,6 @@ struct llama_file_saver {
              case GGML_TYPE_F16:
              case GGML_TYPE_Q4_0:
              case GGML_TYPE_Q4_1:
-             case GGML_TYPE_Q4_2:
              case GGML_TYPE_Q5_0:
              case GGML_TYPE_Q5_1:
              case GGML_TYPE_Q8_0:
@@ -570,7 +583,7 @@ struct llama_file_saver {
          file.write_u32(new_type);
          file.write_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * tensor.ne.size());
          file.write_raw(tensor.name.data(), tensor.name.size());
-         file.seek(-file.tell() & 31, SEEK_CUR);
+         file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
          LLAMA_ASSERT(new_size == llama_calc_tensor_size(tensor.ne, new_type));
          file.write_raw(new_data, new_size);
      }
@@ -585,12 +598,12 @@ struct llama_model_loader {
      std::unique_ptr<llama_mmap> mapping;

      llama_model_loader(const std::string & fname_base, bool use_mmap, bool vocab_only) {
-         auto first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
+         auto * first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
          file_loaders.emplace_back(first_file);
          uint32_t n_parts = vocab_only ? 1 : guess_n_parts();
          for (uint32_t i = 1; i < n_parts; i++) {
              std::string fname = fname_base + "." + std::to_string(i);
-             auto ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
+             auto * ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
              file_loaders.emplace_back(ith_file);
              if (ith_file->hparams != first_file->hparams) {
                  throw format("llama.cpp: hparams inconsistent between files");
@@ -637,7 +650,7 @@ struct llama_model_loader {
          }
      }

-     struct ggml_tensor * get_tensor(const std::string & name, std::vector<uint32_t> ne) {
+     struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
          auto it = tensors_map.name_to_idx.find(name);
          if (it == tensors_map.name_to_idx.end()) {
              throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
@@ -648,10 +661,10 @@ struct llama_model_loader {
                  name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
          }

-         return get_tensor_for(lt);
+         return get_tensor_for(lt, backend);
      }

-     struct ggml_tensor * get_tensor_for(llama_load_tensor & lt) {
+     struct ggml_tensor * get_tensor_for(llama_load_tensor & lt, ggml_backend backend) {
          struct ggml_tensor * tensor;
          if (lt.ne.size() == 2) {
              tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
@@ -661,12 +674,13 @@ struct llama_model_loader {
          }
          ggml_set_name(tensor, lt.name.c_str());
          LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
+         tensor->backend = backend;
          lt.ggml_tensor = tensor;
          num_ggml_tensors_created++;
          return tensor;
      }

-     void done_getting_tensors() {
+     void done_getting_tensors() const {
          if (num_ggml_tensors_created != tensors_map.tensors.size()) {
              throw std::string("llama.cpp: file contained more tensors than expected");
          }
@@ -674,12 +688,16 @@ struct llama_model_loader {

      void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
          size_t data_size = 0;
+         size_t prefetch_size = 0;
          for (const llama_load_tensor & lt : tensors_map.tensors) {
              data_size += lt.size;
+             if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
+                 prefetch_size += lt.size;
+             }
          }

          if (use_mmap) {
-             mapping.reset(new llama_mmap(&file_loaders.at(0)->file));
+             mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
              if (!lmlock) {
                  // Don't call the callback since the actual loading will be lazy
                  // and we can't measure it.
@@ -692,6 +710,9 @@ struct llama_model_loader {

          size_t done_size = 0;
          for (llama_load_tensor & lt : tensors_map.tensors) {
+             if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) {
+                 continue;
+             }
              if (progress_callback) {
                  progress_callback((float) done_size / data_size, progress_callback_user_data);
              }
@@ -704,9 +725,6 @@ struct llama_model_loader {
                  lmlock->grow_to(done_size);
              }
          }
-         if (progress_callback) {
-             progress_callback(1.0f, progress_callback_user_data);
-         }
      }

      void load_data_for(llama_load_tensor & lt) {
@@ -808,9 +826,9 @@ static bool kv_cache_init(
  struct llama_context_params llama_context_default_params() {
      struct llama_context_params result = {
          /*.n_ctx =*/ 512,
-         /*.n_parts =*/ -1,
+         /*.gpu_layers =*/ 0,
          /*.seed =*/ -1,
-         /*.f16_kv =*/ false,
+         /*.f16_kv =*/ true,
          /*.logits_all =*/ false,
          /*.vocab_only =*/ false,
          /*.use_mmap =*/ true,
@@ -831,6 +849,21 @@ bool llama_mlock_supported() {
      return llama_mlock::SUPPORTED;
  }

+ void llama_init_backend() {
+     ggml_time_init();
+
+     // needed to initialize f16 tables
+     {
+         struct ggml_init_params params = { 0, NULL, false };
+         struct ggml_context * ctx = ggml_init(params);
+         ggml_free(ctx);
+     }
+ }
+
+ int64_t llama_time_us() {
+     return ggml_time_us();
+ }
+
  //
  // model loading
  //
@@ -839,9 +872,12 @@ static const char *llama_file_version_name(llama_file_version version) {
      switch (version) {
          case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)";
          case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
-         case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (latest)";
-         default: LLAMA_ASSERT(false);
+         case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1405)";
+         case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (pre #1508)";
+         case LLAMA_FILE_VERSION_GGJT_V3: return "ggjt v3 (latest)";
      }
+
+     return "unknown";
  }

  static const char *llama_ftype_name(enum llama_ftype ftype) {
@@ -852,7 +888,6 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
          case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
          case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
              return "mostly Q4_1, some F16";
-         case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2";
          case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
          case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
          case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
@@ -874,6 +909,7 @@ static void llama_model_load_internal(
          const std::string & fname,
          llama_context & lctx,
          int n_ctx,
+         int n_gpu_layers,
          ggml_type memory_type,
          bool use_mmap,
          bool use_mlock,
@@ -918,35 +954,32 @@ static void llama_model_load_internal(
          fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
      }

+     if (file_version < LLAMA_FILE_VERSION_GGJT_V2) {
+         if (hparams.ftype != LLAMA_FTYPE_ALL_F32 &&
+             hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 &&
+             hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) {
+             throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)");
+         }
+     }
+
+     if (file_version < LLAMA_FILE_VERSION_GGJT_V3) {
+         if (hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
+             hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_1 ||
+             hparams.ftype == LLAMA_FTYPE_MOSTLY_Q8_0) {
+             throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)");
+         }
+     }
+
      if (vocab_only) {
          return;
      }

      auto & ctx = model.ctx;

-     size_t ctx_size, mmapped_size;
+     size_t ctx_size;
+     size_t mmapped_size;
      ml->calc_sizes(&ctx_size, &mmapped_size);
-     fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0);
-
-     // print memory requirements
-     {
-         const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
-
-         // this is the total memory required to run the inference
-         const size_t mem_required =
-             ctx_size +
-             mmapped_size +
-             MEM_REQ_SCRATCH0().at(model.type) +
-             MEM_REQ_SCRATCH1().at(model.type) +
-             MEM_REQ_EVAL().at(model.type);
-
-         // this is the memory required by one llama_state
-         const size_t mem_required_state =
-             scale*MEM_REQ_KV_SELF().at(model.type);
-
-         fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
-                 mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
-     }
+     fprintf(stderr, "%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0);

      // create the ggml context
      {
@@ -968,43 +1001,102 @@ static void llama_model_load_internal(
          }
      }

+ #ifdef GGML_USE_CUBLAS
+ #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CUDA
+ #else
+ #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
+ #endif
+
      // prepare memory for the weights
+     size_t vram_total = 0;
      {
-         const auto & hparams = model.hparams;
-
          const uint32_t n_embd = hparams.n_embd;
          const uint32_t n_layer = hparams.n_layer;
          const uint32_t n_vocab = hparams.n_vocab;

          ml->ggml_ctx = ctx;

-         model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab});
-         model.norm = ml->get_tensor("norm.weight", {n_embd});
-         model.output = ml->get_tensor("output.weight", {n_embd, n_vocab});
+         model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU);
+         model.norm = ml->get_tensor("norm.weight", {n_embd}, GGML_BACKEND_CPU);
+
+         // "output" tensor
+         {
+             ggml_backend backend_output;
+             if (n_gpu_layers > int(n_layer)) { // NOLINT
+                 backend_output = LLAMA_BACKEND_OFFLOAD;
+             } else {
+                 backend_output = GGML_BACKEND_CPU;
+             }
+
+             model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, backend_output);
+         }
+
+         const int i_gpu_start = n_layer - n_gpu_layers;

          model.layers.resize(n_layer);
          for (uint32_t i = 0; i < n_layer; ++i) {
+             const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+
              auto & layer = model.layers[i];

              std::string layers_i = "layers." + std::to_string(i);

-             layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd});
+             layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);
+
+             layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend);
+             layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend);
+             layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend);
+             layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend);

-             layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd});
-             layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd});
-             layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd});
-             layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd});
+             layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);

-             layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd});
+             layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend);
+             layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend);
+             layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend);

-             layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff});
-             layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd});
-             layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff});
+             if (backend == GGML_BACKEND_CUDA) {
+                 vram_total +=
+                     ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
+                     ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.attention_norm) +
+                     ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
+             }
          }
      }

      ml->done_getting_tensors();

+     // print memory requirements
+     {
+         const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
+
+         // this is the total memory required to run the inference
+         const size_t mem_required =
+             ctx_size +
+             mmapped_size - vram_total + // weights in VRAM not in memory
+             MEM_REQ_SCRATCH0().at(model.type) +
+             MEM_REQ_SCRATCH1().at(model.type) +
+             MEM_REQ_EVAL().at(model.type);
+
+         // this is the memory required by one llama_state
+         const size_t mem_required_state =
+             scale*MEM_REQ_KV_SELF().at(model.type);
+
+         fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
+                 mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
+
+ #ifdef GGML_USE_CUBLAS
+         const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
+
+         fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
+         if (n_gpu_layers > (int) hparams.n_layer) {
+             fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
+         }
+         fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+ #else
+         (void) n_gpu_layers;
+ #endif
+     }
+
      // populate `tensors_by_name`
      for (llama_load_tensor & lt : ml->tensors_map.tensors) {
          model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
@@ -1012,6 +1104,33 @@ static void llama_model_load_internal(

      ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);

+ #ifdef GGML_USE_CUBLAS
+     {
+         size_t done_size = 0;
+         size_t data_size = 0;
+         for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+             data_size += lt.size;
+             if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
+                 done_size += lt.size;
+             }
+         }
+         for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+             if (lt.ggml_tensor->backend != GGML_BACKEND_CUDA) {
+                 continue;
+             }
+             if (progress_callback) {
+                 progress_callback((float) done_size / data_size, progress_callback_user_data);
+             }
+             ggml_cuda_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
+             done_size += lt.size;
+         }
+     }
+ #endif // GGML_USE_CUBLAS
+
+     if (progress_callback) {
+         progress_callback(1.0f, progress_callback_user_data);
+     }
+
      model.mapping = std::move(ml->mapping);

      // loading time will be recalculate after the first eval, so
@@ -1023,6 +1142,7 @@ static bool llama_model_load(
          const std::string & fname,
          llama_context & lctx,
          int n_ctx,
+         int n_gpu_layers,
          ggml_type memory_type,
          bool use_mmap,
          bool use_mlock,
@@ -1030,7 +1150,7 @@ static bool llama_model_load(
          llama_progress_callback progress_callback,
          void *progress_callback_user_data) {
      try {
-         llama_model_load_internal(fname, lctx, n_ctx, memory_type, use_mmap, use_mlock,
+         llama_model_load_internal(fname, lctx, n_ctx, n_gpu_layers, memory_type, use_mmap, use_mlock,
                                    vocab_only, progress_callback, progress_callback_user_data);
          return true;
      } catch (const std::string & err) {
@@ -1052,6 +1172,13 @@ static bool llama_eval_internal(
          const int n_tokens,
          const int n_past,
          const int n_threads) {
+
+     // enforce that the first token is BOS
+     if (n_past == 0 && tokens[0] != llama_token_bos()) {
+         fprintf(stderr, "%s: first token must be BOS\n", __func__);
+         return false;
+     }
+
      const int64_t t_start_us = ggml_time_us();

      const int N = n_tokens;
@@ -1059,7 +1186,7 @@ static bool llama_eval_internal(
      const auto & model = lctx.model;
      const auto & hparams = model.hparams;

-     auto & kv_self = model.kv_self;
+     const auto & kv_self = model.kv_self;

      LLAMA_ASSERT(!!kv_self.ctx);

@@ -1103,17 +1230,15 @@ static bool llama_eval_internal(
          {
              cur = ggml_rms_norm(ctx0, inpL);

-             // cur = attention_norm*cur
-             cur = ggml_mul(ctx0,
-                 ggml_repeat(ctx0, model.layers[il].attention_norm, cur),
-                 cur);
+             // cur = cur*attention_norm(broadcasted)
+             cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm);
          }

          // self-attention
          {
              // compute Q and K and RoPE them
-             struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
-             struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
+             struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
+             struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
              ggml_set_name(Qcur, "Qcur");
              ggml_set_name(Kcur, "Kcur");

@@ -1154,17 +1279,19 @@ static bool llama_eval_internal(
              struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head));
              ggml_set_name(KQ_scale, "1/sqrt(n_embd/n_head)");

-             struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
+             // KQ_scaled shape [n_past + N, N, n_head, 1]
+             struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
              ggml_set_name(KQ_scaled, "KQ_scaled");

              // KQ_masked = mask_past(KQ_scaled)
-             struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
+             struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
              ggml_set_name(KQ_masked, "KQ_masked");

              // KQ = soft_max(KQ_masked)
-             struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
+             struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
              ggml_set_name(KQ_soft_max, "KQ_soft_max");

+
              // split cached V into n_head heads
              struct ggml_tensor * V =
                  ggml_view_3d(ctx0, kv_self.v,
@@ -1211,10 +1338,8 @@ static bool llama_eval_internal(
          {
              cur = ggml_rms_norm(ctx0, inpFF);

-             // cur = ffn_norm*cur
-             cur = ggml_mul(ctx0,
-                 ggml_repeat(ctx0, model.layers[il].ffn_norm, cur),
-                 cur);
+             // cur = cur*ffn_norm(broadcasted)
+             cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
          }

          struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
@@ -1251,10 +1376,8 @@ static bool llama_eval_internal(

          inpL = ggml_rms_norm(ctx0, inpL);

-         // inpL = norm*inpL
-         inpL = ggml_mul(ctx0,
-             ggml_repeat(ctx0, model.norm, inpL),
-             inpL);
+         // inpL = inpL*norm(broadcasted)
+         inpL = ggml_mul(ctx0, inpL, model.norm);

          embeddings = inpL;
      }
@@ -1265,7 +1388,7 @@ static bool llama_eval_internal(
      lctx.use_buf(ctx0, -1);

      // logits -> probs
-     //inpL = ggml_soft_max(ctx0, inpL);
+     //inpL = ggml_soft_max_inplace(ctx0, inpL);

      // run the computation
      ggml_build_forward_expand(&gf, inpL);
@@ -1303,7 +1426,7 @@ static bool llama_eval_internal(
      }

      // extract embeddings
-     if (lctx.embedding.size()) {
+     if (!lctx.embedding.empty()) {
          auto & embedding_out = lctx.embedding;

          embedding_out.resize(n_embd);
@@ -1354,6 +1477,8 @@ struct llama_sp_symbol {
      size_t n;
  };

+ static_assert(std::is_trivially_copyable<llama_sp_symbol>::value, "llama_sp_symbol is not trivially copyable");
+
  struct llama_sp_bigram {
      struct comparator {
          bool operator()(llama_sp_bigram & l, llama_sp_bigram & r) {
@@ -1386,7 +1511,7 @@ struct llama_tokenizer {
              sym.prev = index - 1;
              sym.next = offs == text.size() ? -1 : index + 1;
              index++;
-             symbols_.emplace_back(std::move(sym));
+             symbols_.emplace_back(sym);
          }

          // seed the work queue with all possible 2-character tokens.
@@ -1477,12 +1602,12 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
      llama_tokenizer tokenizer(vocab);
      std::vector<llama_vocab::id> output;

-     if (text.size() == 0) {
+     if (text.empty()) {
          return output;
      }

      if (bos) {
-         output.push_back(1);
+         output.push_back(llama_token_bos());
      }

      tokenizer.tokenize(text, output);
@@ -1713,7 +1838,7 @@ void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_dat
      const int64_t t_start_sample_us = ggml_time_us();

      for (size_t i = 0; i < candidates->size; ++i) {
-         auto token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id);
+         const auto * token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id);
          if (token_iter == last_tokens + last_tokens_size) {
              continue;
          }
@@ -1791,7 +1916,7 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_
      float k = powf((epsilon_hat * powf(2, *mu)) / (1 - powf(N, -epsilon_hat)), 1 / s_hat);

      // Sample the next word X using top-k sampling
-     llama_sample_top_k(nullptr, candidates, int(k));
+     llama_sample_top_k(nullptr, candidates, int(k), 1);
      if (ctx) {
          ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
      }
@@ -1857,7 +1982,7 @@ llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_da
      const int64_t t_start_sample_us = ggml_time_us();

      // Find max element
-     auto max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
+     auto * max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
          return a.logit < b.logit;
      });

@@ -1900,7 +2025,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
      switch (ftype) {
          case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
          case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
-         case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
          case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
          case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
          case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
@@ -1911,7 +2035,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
          nthread = std::thread::hardware_concurrency();
      }

-     std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false,
+     std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false,
                                                                              /*vocab_only*/ false));
      llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);

@@ -1965,7 +2089,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
          } else if (tensor.type == GGML_TYPE_F16) {
              f32_conv_buf.resize(nelements * sizeof(float));
              f32_data = (float *) f32_conv_buf.addr;
-             auto f16_data = (const ggml_fp16_t *) tensor.data;
+             const auto * f16_data = (const ggml_fp16_t *) tensor.data;
              for (size_t i = 0; i < nelements; i++) {
                  f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
              }
@@ -1996,21 +2120,31 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                  size_t first = counter; counter += chunk_size;
                  if (first >= nelements) {
                      if (!local_hist.empty()) {
-                         for (int j=0; j<int(local_hist.size()); ++j) hist_cur[j] += local_hist[j];
+                         for (int j=0; j<int(local_hist.size()); ++j) {
+                             hist_cur[j] += local_hist[j];
+                         }
                          new_size += local_size;
                      }
                      break;
                  }
                  lock.unlock();
                  size_t last = std::min(nelements, first + chunk_size);
-                 if (local_hist.empty()) local_hist.resize(hist_cur.size(), 0);
+                 if (local_hist.empty()) {
+                     local_hist.resize(hist_cur.size(), 0);
+                 }
                  local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
              }
          };
-         if (int(workers.size()) < nthread_use - 1) workers.resize(nthread_use - 1);
-         for (int it = 0; it < nthread_use - 1; ++it) workers[it] = std::thread(compute);
+         if ((int) workers.size() < nthread_use - 1) {
+             workers.resize(nthread_use - 1);
+         }
+         for (int it = 0; it < nthread_use - 1; ++it) {
+             workers[it] = std::thread(compute);
+         }
          compute();
-         for (int it = 0; it < nthread_use - 1; ++it) workers[it].join();
+         for (int it = 0; it < nthread_use - 1; ++it) {
+             workers[it].join();
+         }
      }

      printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
@@ -2067,7 +2201,7 @@ struct llama_context * llama_init_from_file(
              unsigned * cur_percentage_p = (unsigned *) ctx;
              unsigned percentage = (unsigned) (100 * progress);
              while (percentage > *cur_percentage_p) {
-                 ++*cur_percentage_p;
+                 *cur_percentage_p = percentage;
                  fprintf(stderr, ".");
                  fflush(stderr);
                  if (percentage >= 100) {
@@ -2082,7 +2216,7 @@ struct llama_context * llama_init_from_file(

      ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;

-     if (!llama_model_load(path_model, *ctx, params.n_ctx, memory_type,
+     if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_gpu_layers, memory_type,
                            params.use_mmap, params.use_mlock, params.vocab_only,
                            params.progress_callback, params.progress_callback_user_data)) {
          fprintf(stderr, "%s: failed to load model\n", __func__);
@@ -2160,7 +2294,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
      {
          uint32_t magic;
          fin.read((char *) &magic, sizeof(magic));
-         if (magic != 'ggla') {
+         if (magic != LLAMA_FILE_MAGIC_GGLA) {
              fprintf(stderr, "%s: bad file magic\n", __func__);
              return 1;
          }
@@ -2208,7 +2342,8 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
          fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
          model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false));

-         size_t ctx_size, mmapped_size;
+         size_t ctx_size;
+         size_t mmapped_size;
          model_loader->calc_sizes(&ctx_size, &mmapped_size);
          base_buf.resize(ctx_size);

@@ -2223,7 +2358,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *

          // maybe this should in llama_model_loader
          if (model_loader->use_mmap) {
-             model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ false));
+             model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0));
          }
      }

@@ -2247,8 +2382,12 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
              fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
          }

-         std::string name(length, 0);
-         fin.read(&name[0], length);
+         std::string name;
+         {
+             char buf[1024];
+             fin.read(buf, length);
+             name = std::string(buf, length);
+         }

          // check for lora suffix and get the type of tensor
          const std::string lora_suffix = ".lora";
@@ -2263,7 +2402,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
          base_name.erase(pos);
          // fprintf(stderr, "%s: %s => %s (lora type %s) ", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());

-         if (model_tensors.find(base_name.data()) == model_tensors.end()) {
+         if (model_tensors.find(base_name) == model_tensors.end()) {
              fprintf(stderr, "%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
              return 1;
          }
@@ -2312,7 +2451,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
              }
              size_t idx = model_loader->tensors_map.name_to_idx[base_name];
              llama_load_tensor & lt = model_loader->tensors_map.tensors[idx];
-             base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] });
+             base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU);
              lt.data = (uint8_t *) lt.ggml_tensor->data;
              model_loader->load_data_for(lt);
              lt.ggml_tensor->data = lt.data;
@@ -2343,7 +2482,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *

              if (scaling != 1.0f) {
                  ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
-                 BA = ggml_scale(lora_ctx, BA, scale_tensor);
+                 BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor);
              }

              ggml_tensor * r;
@@ -2365,8 +2504,9 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
              lora_tensors.clear();

              n_tensors++;
-             if (n_tensors % 4 == 0)
+             if (n_tensors % 4 == 0) {
                  fprintf(stderr, ".");
+             }
          }
      }

@@ -2395,7 +2535,7 @@ int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
      return ctx->model.kv_self.n;
  }

- #define LLAMA_MAX_RNG_STATE 64*1024
+ #define LLAMA_MAX_RNG_STATE (64*1024)

  void llama_set_rng_seed(struct llama_context * ctx, int seed) {
      if (seed < 0) {
@@ -2436,8 +2576,8 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
  }

  // Copies the state to the specified destination address
- size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
-     uint8_t * out = dest;
+ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
+     uint8_t * out = dst;

      // copy rng
      {
@@ -2497,7 +2637,9 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {

          if (kv_size) {
              const size_t elt_size = ggml_element_size(kv_self.k);
+
              char buffer[4096];
+
              ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
              ggml_cgraph gf{};
              gf.n_threads = 1;
@@ -2521,10 +2663,12 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
              ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
              ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
              ggml_graph_compute(cpy_ctx, &gf);
+
+             ggml_free(cpy_ctx);
          }
      }

-     const size_t written = out - dest;
+     const size_t written = out - dst;
      const size_t max_size = llama_get_state_size(ctx);

      LLAMA_ASSERT(written <= max_size);
@@ -2533,16 +2677,16 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
  }

  // Sets the state reading from the specified source address
- size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
-     const uint8_t * in = src;
+ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
+     uint8_t * inp = src;

      // set rng
      {
          size_t rng_size;
          char rng_buf[LLAMA_MAX_RNG_STATE];

-         memcpy(&rng_size, in, sizeof(rng_size)); in += sizeof(rng_size);
-         memcpy(&rng_buf[0], in, LLAMA_MAX_RNG_STATE); in += LLAMA_MAX_RNG_STATE;
+         memcpy(&rng_size, inp, sizeof(rng_size)); inp += sizeof(rng_size);
+         memcpy(&rng_buf[0], inp, LLAMA_MAX_RNG_STATE); inp += LLAMA_MAX_RNG_STATE;

          std::stringstream rng_ss;
          rng_ss.str(std::string(&rng_buf[0], rng_size));
@@ -2556,30 +2700,30 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
          size_t logits_cap;
          size_t logits_size;

-         memcpy(&logits_cap, in, sizeof(logits_cap)); in += sizeof(logits_cap);
-         memcpy(&logits_size, in, sizeof(logits_size)); in += sizeof(logits_size);
+         memcpy(&logits_cap, inp, sizeof(logits_cap)); inp += sizeof(logits_cap);
+         memcpy(&logits_size, inp, sizeof(logits_size)); inp += sizeof(logits_size);

          LLAMA_ASSERT(ctx->logits.capacity() == logits_cap);

          if (logits_size) {
              ctx->logits.resize(logits_size);
-             memcpy(ctx->logits.data(), in, logits_size * sizeof(float));
+             memcpy(ctx->logits.data(), inp, logits_size * sizeof(float));
          }

-         in += logits_cap * sizeof(float);
+         inp += logits_cap * sizeof(float);
      }

      // set embeddings
      {
          size_t embedding_size;

-         memcpy(&embedding_size, in, sizeof(embedding_size)); in += sizeof(embedding_size);
+         memcpy(&embedding_size, inp, sizeof(embedding_size)); inp += sizeof(embedding_size);

          LLAMA_ASSERT(ctx->embedding.capacity() == embedding_size);

          if (embedding_size) {
-             memcpy(ctx->embedding.data(), in, embedding_size * sizeof(float));
-             in += embedding_size * sizeof(float);
+             memcpy(ctx->embedding.data(), inp, embedding_size * sizeof(float));
+             inp += embedding_size * sizeof(float);
          }
      }

@@ -2594,25 +2738,27 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
          size_t kv_size;
          int kv_ntok;

-         memcpy(&kv_size, in, sizeof(kv_size)); in += sizeof(kv_size);
-         memcpy(&kv_ntok, in, sizeof(kv_ntok)); in += sizeof(kv_ntok);
+         memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
+         memcpy(&kv_ntok, inp, sizeof(kv_ntok)); inp += sizeof(kv_ntok);

          if (kv_size) {
              LLAMA_ASSERT(kv_self.buf.size == kv_size);

              const size_t elt_size = ggml_element_size(kv_self.k);
+
              char buffer[4096];
+
              ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
              ggml_cgraph gf{};
              gf.n_threads = 1;

              ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
-             kin3d->data = (void *) in;
-             in += ggml_nbytes(kin3d);
+             kin3d->data = (void *) inp;
+             inp += ggml_nbytes(kin3d);

              ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
-             vin3d->data = (void *) in;
-             in += ggml_nbytes(vin3d);
+             vin3d->data = (void *) inp;
+             inp += ggml_nbytes(vin3d);

              ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
                  n_embd, kv_ntok, n_layer,
@@ -2625,12 +2771,14 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
              ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
              ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
              ggml_graph_compute(cpy_ctx, &gf);
+
+             ggml_free(cpy_ctx);
          }

          ctx->model.kv_self.n = kv_ntok;
      }

-     const size_t nread = in - src;
+     const size_t nread = inp - src;
      const size_t max_size = llama_get_state_size(ctx);

      LLAMA_ASSERT(nread <= max_size);
@@ -2646,7 +2794,7 @@ bool llama_load_session_file(struct llama_context * ctx, const char * path_sessi
      const uint32_t magic = file.read_u32();
      const uint32_t version = file.read_u32();

-     if (!(magic == LLAMA_SESSION_MAGIC && version == LLAMA_SESSION_VERSION)) {
+     if (magic != LLAMA_SESSION_MAGIC || version != LLAMA_SESSION_VERSION) {
          fprintf(stderr, "%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
          return false;
      }
@@ -2727,11 +2875,14 @@ int llama_eval(
          fprintf(stderr, "%s: failed to eval\n", __func__);
          return 1;
      }
+
      // get a more accurate load time, upon first eval
+     // TODO: fix this
      if (!ctx->has_evaluated_once) {
          ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
          ctx->has_evaluated_once = true;
      }
+
      return 0;
  }

@@ -2805,9 +2956,9 @@ void llama_print_timings(struct llama_context * ctx) {

      fprintf(stderr, "\n");
      fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
-     fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample);
+     fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample);
      fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval);
-     fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval);
+     fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval);
      fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0);
  }
