llama_cpp 0.1.0 → 0.1.1 (diff of the bundled llama.cpp source)

@@ -9,6 +9,9 @@
  #include "llama.h"

  #include "ggml.h"
+ #ifdef GGML_USE_CUBLAS
+ #include "ggml-cuda.h"
+ #endif

  #include <array>
  #include <ctime>
@@ -50,49 +53,49 @@ static const size_t MB = 1024*1024;

  static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
  {
- static std::map<e_model, size_t> _MEM_REQ_SCRATCH0 = {
+ static std::map<e_model, size_t> k_sizes = {
  { MODEL_7B, 512ull * MB },
  { MODEL_13B, 512ull * MB },
  { MODEL_30B, 512ull * MB },
  { MODEL_65B, 1024ull * MB },
  };
- return _MEM_REQ_SCRATCH0;
+ return k_sizes;
  }

  static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
  {
- static std::map<e_model, size_t> _MEM_REQ_SCRATCH1 = {
+ static std::map<e_model, size_t> k_sizes = {
  { MODEL_7B, 512ull * MB },
  { MODEL_13B, 512ull * MB },
  { MODEL_30B, 512ull * MB },
  { MODEL_65B, 1024ull * MB },
  };
- return _MEM_REQ_SCRATCH1;
+ return k_sizes;
  }

  // 2*n_embd*n_ctx*n_layer*sizeof(float16)
  static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
  {
- static std::map<e_model, size_t> _MEM_REQ_KV_SELF = {
+ static std::map<e_model, size_t> k_sizes = {
  { MODEL_7B, 1026ull * MB },
  { MODEL_13B, 1608ull * MB },
  { MODEL_30B, 3124ull * MB },
  { MODEL_65B, 5120ull * MB },
  };
- return _MEM_REQ_KV_SELF;
+ return k_sizes;
  }

  // this is mostly needed for temporary mul_mat buffers to dequantize the data
  // not actually needed if BLAS is disabled
  static const std::map<e_model, size_t> & MEM_REQ_EVAL()
  {
- static std::map<e_model, size_t> _MEM_REQ_EVAL = {
+ static std::map<e_model, size_t> k_sizes = {
  { MODEL_7B, 768ull * MB },
  { MODEL_13B, 1024ull * MB },
  { MODEL_30B, 1280ull * MB },
  { MODEL_65B, 1536ull * MB },
  };
- return _MEM_REQ_EVAL;
+ return k_sizes;
  }

  // default hparams (LLaMA 7B)
@@ -402,6 +405,7 @@ enum llama_file_version {
  LLAMA_FILE_VERSION_GGML,
  LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab
  LLAMA_FILE_VERSION_GGJT_V1, // added padding
+ LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format
  };

  struct llama_file_loader {
@@ -432,6 +436,8 @@ struct llama_file_loader {
  file_version = LLAMA_FILE_VERSION_GGMF_V1;
  } else if (magic == 'ggjt' && version == 1) {
  file_version = LLAMA_FILE_VERSION_GGJT_V1;
+ } else if (magic == 'ggjt' && version == 2) {
+ file_version = LLAMA_FILE_VERSION_GGJT_V2;
  } else {
  throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
  magic, version);
@@ -482,7 +488,6 @@ struct llama_file_loader {
  case GGML_TYPE_F16:
  case GGML_TYPE_Q4_0:
  case GGML_TYPE_Q4_1:
- case GGML_TYPE_Q4_2:
  case GGML_TYPE_Q5_0:
  case GGML_TYPE_Q5_1:
  case GGML_TYPE_Q8_0:
@@ -527,8 +532,8 @@ struct llama_file_saver {
  write_vocab();
  }
  void write_magic() {
- file.write_u32('ggjt'); // magic
- file.write_u32(1); // version
+ file.write_u32(LLAMA_FILE_MAGIC); // magic
+ file.write_u32(LLAMA_FILE_VERSION); // version
  }
  void write_hparams(enum llama_ftype new_ftype) {
  const llama_hparams & hparams = any_file_loader->hparams;
@@ -558,7 +563,6 @@ struct llama_file_saver {
  case GGML_TYPE_F16:
  case GGML_TYPE_Q4_0:
  case GGML_TYPE_Q4_1:
- case GGML_TYPE_Q4_2:
  case GGML_TYPE_Q5_0:
  case GGML_TYPE_Q5_1:
  case GGML_TYPE_Q8_0:
@@ -585,12 +589,12 @@ struct llama_model_loader {
  std::unique_ptr<llama_mmap> mapping;

  llama_model_loader(const std::string & fname_base, bool use_mmap, bool vocab_only) {
- auto first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
+ auto * first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
  file_loaders.emplace_back(first_file);
  uint32_t n_parts = vocab_only ? 1 : guess_n_parts();
  for (uint32_t i = 1; i < n_parts; i++) {
  std::string fname = fname_base + "." + std::to_string(i);
- auto ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
+ auto * ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
  file_loaders.emplace_back(ith_file);
  if (ith_file->hparams != first_file->hparams) {
  throw format("llama.cpp: hparams inconsistent between files");
@@ -637,7 +641,7 @@ struct llama_model_loader {
  }
  }

- struct ggml_tensor * get_tensor(const std::string & name, std::vector<uint32_t> ne) {
+ struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne) {
  auto it = tensors_map.name_to_idx.find(name);
  if (it == tensors_map.name_to_idx.end()) {
  throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
@@ -666,7 +670,7 @@ struct llama_model_loader {
  return tensor;
  }

- void done_getting_tensors() {
+ void done_getting_tensors() const {
  if (num_ggml_tensors_created != tensors_map.tensors.size()) {
  throw std::string("llama.cpp: file contained more tensors than expected");
  }
@@ -808,9 +812,9 @@ static bool kv_cache_init(
  struct llama_context_params llama_context_default_params() {
  struct llama_context_params result = {
  /*.n_ctx =*/ 512,
- /*.n_parts =*/ -1,
+ /*.gpu_layers =*/ 0,
  /*.seed =*/ -1,
- /*.f16_kv =*/ false,
+ /*.f16_kv =*/ true,
  /*.logits_all =*/ false,
  /*.vocab_only =*/ false,
  /*.use_mmap =*/ true,
@@ -839,9 +843,11 @@ static const char *llama_file_version_name(llama_file_version version) {
  switch (version) {
  case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)";
  case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
- case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (latest)";
- default: LLAMA_ASSERT(false);
+ case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1405)";
+ case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (latest)";
  }
+
+ return "unknown";
  }

  static const char *llama_ftype_name(enum llama_ftype ftype) {
@@ -852,7 +858,6 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
  case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
  case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
  return "mostly Q4_1, some F16";
- case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2";
  case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
  case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
  case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
@@ -874,6 +879,7 @@ static void llama_model_load_internal(
  const std::string & fname,
  llama_context & lctx,
  int n_ctx,
+ int n_gpu_layers,
  ggml_type memory_type,
  bool use_mmap,
  bool use_mlock,
@@ -918,15 +924,24 @@ static void llama_model_load_internal(
  fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
  }

+ if (file_version != LLAMA_FILE_VERSION_GGJT_V2) {
+ if (hparams.ftype != LLAMA_FTYPE_ALL_F32 &&
+ hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 &&
+ hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) {
+ throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1305)");
+ }
+ }
+
  if (vocab_only) {
  return;
  }

  auto & ctx = model.ctx;

- size_t ctx_size, mmapped_size;
+ size_t ctx_size;
+ size_t mmapped_size;
  ml->calc_sizes(&ctx_size, &mmapped_size);
- fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0);
+ fprintf(stderr, "%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/1024.0/1024.0);

  // print memory requirements
  {
@@ -970,8 +985,6 @@ static void llama_model_load_internal(

  // prepare memory for the weights
  {
- const auto & hparams = model.hparams;
-
  const uint32_t n_embd = hparams.n_embd;
  const uint32_t n_layer = hparams.n_layer;
  const uint32_t n_vocab = hparams.n_vocab;
@@ -1013,6 +1026,35 @@ static void llama_model_load_internal(
  ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);

  model.mapping = std::move(ml->mapping);
+ #ifdef GGML_USE_CUBLAS
+ {
+ const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
+
+ fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
+
+ size_t vram_total = 0;
+
+ for (int i = 0; i < n_gpu; ++i) {
+ const auto & layer = model.layers[i];
+
+ ggml_cuda_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
+ ggml_cuda_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
+ ggml_cuda_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
+ ggml_cuda_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
+ ggml_cuda_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
+ ggml_cuda_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
+ ggml_cuda_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
+ }
+ if (n_gpu_layers > (int) hparams.n_layer) {
+ fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
+ ggml_cuda_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
+ }
+
+ fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+ }
+ #else
+ (void) n_gpu_layers;
+ #endif

  // loading time will be recalculate after the first eval, so
  // we take page faults deferred by mmap() into consideration
@@ -1023,6 +1065,7 @@ static bool llama_model_load(
  const std::string & fname,
  llama_context & lctx,
  int n_ctx,
+ int n_gpu_layers,
  ggml_type memory_type,
  bool use_mmap,
  bool use_mlock,
@@ -1030,7 +1073,7 @@ static bool llama_model_load(
  llama_progress_callback progress_callback,
  void *progress_callback_user_data) {
  try {
- llama_model_load_internal(fname, lctx, n_ctx, memory_type, use_mmap, use_mlock,
+ llama_model_load_internal(fname, lctx, n_ctx, n_gpu_layers, memory_type, use_mmap, use_mlock,
  vocab_only, progress_callback, progress_callback_user_data);
  return true;
  } catch (const std::string & err) {
@@ -1052,6 +1095,13 @@ static bool llama_eval_internal(
  const int n_tokens,
  const int n_past,
  const int n_threads) {
+
+ // enforce that the first token is BOS
+ if (n_past == 0 && tokens[0] != llama_token_bos()) {
+ fprintf(stderr, "%s: first token must be BOS\n", __func__);
+ return false;
+ }
+
  const int64_t t_start_us = ggml_time_us();

  const int N = n_tokens;
@@ -1059,7 +1109,7 @@ static bool llama_eval_internal(
  const auto & model = lctx.model;
  const auto & hparams = model.hparams;

- auto & kv_self = model.kv_self;
+ const auto & kv_self = model.kv_self;

  LLAMA_ASSERT(!!kv_self.ctx);

@@ -1112,8 +1162,8 @@ static bool llama_eval_internal(
  // self-attention
  {
  // compute Q and K and RoPE them
- struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
- struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
+ struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
+ struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
  ggml_set_name(Qcur, "Qcur");
  ggml_set_name(Kcur, "Kcur");

@@ -1154,17 +1204,19 @@ static bool llama_eval_internal(
  struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head));
  ggml_set_name(KQ_scale, "1/sqrt(n_embd/n_head)");

- struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
+ // KQ_scaled shape [n_past + N, N, n_head, 1]
+ struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
  ggml_set_name(KQ_scaled, "KQ_scaled");

  // KQ_masked = mask_past(KQ_scaled)
- struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
+ struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
  ggml_set_name(KQ_masked, "KQ_masked");

  // KQ = soft_max(KQ_masked)
- struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
  ggml_set_name(KQ_soft_max, "KQ_soft_max");

+
  // split cached V into n_head heads
  struct ggml_tensor * V =
  ggml_view_3d(ctx0, kv_self.v,
@@ -1265,7 +1317,7 @@ static bool llama_eval_internal(
  lctx.use_buf(ctx0, -1);

  // logits -> probs
- //inpL = ggml_soft_max(ctx0, inpL);
+ //inpL = ggml_soft_max_inplace(ctx0, inpL);

  // run the computation
  ggml_build_forward_expand(&gf, inpL);
@@ -1303,7 +1355,7 @@ static bool llama_eval_internal(
  }

  // extract embeddings
- if (lctx.embedding.size()) {
+ if (!lctx.embedding.empty()) {
  auto & embedding_out = lctx.embedding;

  embedding_out.resize(n_embd);
@@ -1354,6 +1406,8 @@ struct llama_sp_symbol {
  size_t n;
  };

+ static_assert(std::is_trivially_copyable<llama_sp_symbol>::value, "llama_sp_symbol is not trivially copyable");
+
  struct llama_sp_bigram {
  struct comparator {
  bool operator()(llama_sp_bigram & l, llama_sp_bigram & r) {
@@ -1386,7 +1440,7 @@ struct llama_tokenizer {
  sym.prev = index - 1;
  sym.next = offs == text.size() ? -1 : index + 1;
  index++;
- symbols_.emplace_back(std::move(sym));
+ symbols_.emplace_back(sym);
  }

  // seed the work queue with all possible 2-character tokens.
@@ -1477,12 +1531,12 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
  llama_tokenizer tokenizer(vocab);
  std::vector<llama_vocab::id> output;

- if (text.size() == 0) {
+ if (text.empty()) {
  return output;
  }

  if (bos) {
- output.push_back(1);
+ output.push_back(llama_token_bos());
  }

  tokenizer.tokenize(text, output);
@@ -1713,7 +1767,7 @@ void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_dat
  const int64_t t_start_sample_us = ggml_time_us();

  for (size_t i = 0; i < candidates->size; ++i) {
- auto token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id);
+ const auto * token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id);
  if (token_iter == last_tokens + last_tokens_size) {
  continue;
  }
@@ -1791,7 +1845,7 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_
  float k = powf((epsilon_hat * powf(2, *mu)) / (1 - powf(N, -epsilon_hat)), 1 / s_hat);

  // Sample the next word X using top-k sampling
- llama_sample_top_k(nullptr, candidates, int(k));
+ llama_sample_top_k(nullptr, candidates, int(k), 1);
  if (ctx) {
  ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
  }
@@ -1857,7 +1911,7 @@ llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_da
  const int64_t t_start_sample_us = ggml_time_us();

  // Find max element
- auto max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
+ auto * max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
  return a.logit < b.logit;
  });

@@ -1900,7 +1954,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  switch (ftype) {
  case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
  case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
- case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
  case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
  case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
  case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
@@ -1911,7 +1964,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  nthread = std::thread::hardware_concurrency();
  }

- std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false,
+ std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false,
  /*vocab_only*/ false));
  llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);

@@ -1965,7 +2018,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  } else if (tensor.type == GGML_TYPE_F16) {
  f32_conv_buf.resize(nelements * sizeof(float));
  f32_data = (float *) f32_conv_buf.addr;
- auto f16_data = (const ggml_fp16_t *) tensor.data;
+ const auto * f16_data = (const ggml_fp16_t *) tensor.data;
  for (size_t i = 0; i < nelements; i++) {
  f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
  }
@@ -1996,21 +2049,31 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  size_t first = counter; counter += chunk_size;
  if (first >= nelements) {
  if (!local_hist.empty()) {
- for (int j=0; j<int(local_hist.size()); ++j) hist_cur[j] += local_hist[j];
+ for (int j=0; j<int(local_hist.size()); ++j) {
+ hist_cur[j] += local_hist[j];
+ }
  new_size += local_size;
  }
  break;
  }
  lock.unlock();
  size_t last = std::min(nelements, first + chunk_size);
- if (local_hist.empty()) local_hist.resize(hist_cur.size(), 0);
+ if (local_hist.empty()) {
+ local_hist.resize(hist_cur.size(), 0);
+ }
  local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
  }
  };
- if (int(workers.size()) < nthread_use - 1) workers.resize(nthread_use - 1);
- for (int it = 0; it < nthread_use - 1; ++it) workers[it] = std::thread(compute);
+ if ((int) workers.size() < nthread_use - 1) {
+ workers.resize(nthread_use - 1);
+ }
+ for (int it = 0; it < nthread_use - 1; ++it) {
+ workers[it] = std::thread(compute);
+ }
  compute();
- for (int it = 0; it < nthread_use - 1; ++it) workers[it].join();
+ for (int it = 0; it < nthread_use - 1; ++it) {
+ workers[it].join();
+ }
  }

  printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
@@ -2082,7 +2145,7 @@ struct llama_context * llama_init_from_file(

  ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;

- if (!llama_model_load(path_model, *ctx, params.n_ctx, memory_type,
+ if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_gpu_layers, memory_type,
  params.use_mmap, params.use_mlock, params.vocab_only,
  params.progress_callback, params.progress_callback_user_data)) {
  fprintf(stderr, "%s: failed to load model\n", __func__);
@@ -2208,7 +2271,8 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
  fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
  model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false));

- size_t ctx_size, mmapped_size;
+ size_t ctx_size;
+ size_t mmapped_size;
  model_loader->calc_sizes(&ctx_size, &mmapped_size);
  base_buf.resize(ctx_size);

@@ -2247,8 +2311,12 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
  fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
  }

- std::string name(length, 0);
- fin.read(&name[0], length);
+ std::string name;
+ {
+ char buf[1024];
+ fin.read(buf, length);
+ name = std::string(buf, length);
+ }

  // check for lora suffix and get the type of tensor
  const std::string lora_suffix = ".lora";
@@ -2263,7 +2331,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
  base_name.erase(pos);
  // fprintf(stderr, "%s: %s => %s (lora type %s) ", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());

- if (model_tensors.find(base_name.data()) == model_tensors.end()) {
+ if (model_tensors.find(base_name) == model_tensors.end()) {
  fprintf(stderr, "%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
  return 1;
  }
@@ -2343,7 +2411,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *

  if (scaling != 1.0f) {
  ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
- BA = ggml_scale(lora_ctx, BA, scale_tensor);
+ BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor);
  }

  ggml_tensor * r;
@@ -2365,8 +2433,9 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
  lora_tensors.clear();

  n_tensors++;
- if (n_tensors % 4 == 0)
+ if (n_tensors % 4 == 0) {
  fprintf(stderr, ".");
+ }
  }
  }

@@ -2395,7 +2464,7 @@ int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
  return ctx->model.kv_self.n;
  }

- #define LLAMA_MAX_RNG_STATE 64*1024
+ #define LLAMA_MAX_RNG_STATE (64*1024)

  void llama_set_rng_seed(struct llama_context * ctx, int seed) {
  if (seed < 0) {
@@ -2436,8 +2505,8 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
  }

  // Copies the state to the specified destination address
- size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
- uint8_t * out = dest;
+ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
+ uint8_t * out = dst;

  // copy rng
  {
@@ -2497,7 +2566,9 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {

  if (kv_size) {
  const size_t elt_size = ggml_element_size(kv_self.k);
+
  char buffer[4096];
+
  ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
  ggml_cgraph gf{};
  gf.n_threads = 1;
@@ -2521,10 +2592,12 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
  ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
  ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
  ggml_graph_compute(cpy_ctx, &gf);
+
+ ggml_free(cpy_ctx);
  }
  }

- const size_t written = out - dest;
+ const size_t written = out - dst;
  const size_t max_size = llama_get_state_size(ctx);

  LLAMA_ASSERT(written <= max_size);
@@ -2534,15 +2607,15 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {

  // Sets the state reading from the specified source address
  size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
- const uint8_t * in = src;
+ const uint8_t * inp = src;

  // set rng
  {
  size_t rng_size;
  char rng_buf[LLAMA_MAX_RNG_STATE];

- memcpy(&rng_size, in, sizeof(rng_size)); in += sizeof(rng_size);
- memcpy(&rng_buf[0], in, LLAMA_MAX_RNG_STATE); in += LLAMA_MAX_RNG_STATE;
+ memcpy(&rng_size, inp, sizeof(rng_size)); inp += sizeof(rng_size);
+ memcpy(&rng_buf[0], inp, LLAMA_MAX_RNG_STATE); inp += LLAMA_MAX_RNG_STATE;

  std::stringstream rng_ss;
  rng_ss.str(std::string(&rng_buf[0], rng_size));
@@ -2556,30 +2629,30 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
  size_t logits_cap;
  size_t logits_size;

- memcpy(&logits_cap, in, sizeof(logits_cap)); in += sizeof(logits_cap);
- memcpy(&logits_size, in, sizeof(logits_size)); in += sizeof(logits_size);
+ memcpy(&logits_cap, inp, sizeof(logits_cap)); inp += sizeof(logits_cap);
+ memcpy(&logits_size, inp, sizeof(logits_size)); inp += sizeof(logits_size);

  LLAMA_ASSERT(ctx->logits.capacity() == logits_cap);

  if (logits_size) {
  ctx->logits.resize(logits_size);
- memcpy(ctx->logits.data(), in, logits_size * sizeof(float));
+ memcpy(ctx->logits.data(), inp, logits_size * sizeof(float));
  }

- in += logits_cap * sizeof(float);
+ inp += logits_cap * sizeof(float);
  }

  // set embeddings
  {
  size_t embedding_size;

- memcpy(&embedding_size, in, sizeof(embedding_size)); in += sizeof(embedding_size);
+ memcpy(&embedding_size, inp, sizeof(embedding_size)); inp += sizeof(embedding_size);

  LLAMA_ASSERT(ctx->embedding.capacity() == embedding_size);

  if (embedding_size) {
- memcpy(ctx->embedding.data(), in, embedding_size * sizeof(float));
- in += embedding_size * sizeof(float);
+ memcpy(ctx->embedding.data(), inp, embedding_size * sizeof(float));
+ inp += embedding_size * sizeof(float);
  }
  }

@@ -2594,25 +2667,27 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
  size_t kv_size;
  int kv_ntok;

- memcpy(&kv_size, in, sizeof(kv_size)); in += sizeof(kv_size);
- memcpy(&kv_ntok, in, sizeof(kv_ntok)); in += sizeof(kv_ntok);
+ memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
+ memcpy(&kv_ntok, inp, sizeof(kv_ntok)); inp += sizeof(kv_ntok);

  if (kv_size) {
  LLAMA_ASSERT(kv_self.buf.size == kv_size);

  const size_t elt_size = ggml_element_size(kv_self.k);
+
  char buffer[4096];
+
  ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
  ggml_cgraph gf{};
  gf.n_threads = 1;

  ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
- kin3d->data = (void *) in;
- in += ggml_nbytes(kin3d);
+ kin3d->data = (void *) inp;
+ inp += ggml_nbytes(kin3d);

  ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
- vin3d->data = (void *) in;
- in += ggml_nbytes(vin3d);
+ vin3d->data = (void *) inp;
+ inp += ggml_nbytes(vin3d);

  ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
  n_embd, kv_ntok, n_layer,
@@ -2625,12 +2700,14 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
  ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
  ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
  ggml_graph_compute(cpy_ctx, &gf);
+
+ ggml_free(cpy_ctx);
  }

  ctx->model.kv_self.n = kv_ntok;
  }

- const size_t nread = in - src;
+ const size_t nread = inp - src;
  const size_t max_size = llama_get_state_size(ctx);

  LLAMA_ASSERT(nread <= max_size);
@@ -2646,7 +2723,7 @@ bool llama_load_session_file(struct llama_context * ctx, const char * path_sessi
  const uint32_t magic = file.read_u32();
  const uint32_t version = file.read_u32();

- if (!(magic == LLAMA_SESSION_MAGIC && version == LLAMA_SESSION_VERSION)) {
+ if (magic != LLAMA_SESSION_MAGIC || version != LLAMA_SESSION_VERSION) {
  fprintf(stderr, "%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
  return false;
  }
@@ -2727,11 +2804,14 @@ int llama_eval(
  fprintf(stderr, "%s: failed to eval\n", __func__);
  return 1;
  }
+
  // get a more accurate load time, upon first eval
+ // TODO: fix this
  if (!ctx->has_evaluated_once) {
  ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
  ctx->has_evaluated_once = true;
  }
+
  return 0;
  }

@@ -2805,9 +2885,9 @@ void llama_print_timings(struct llama_context * ctx) {

  fprintf(stderr, "\n");
  fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
- fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample);
+ fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample);
  fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval);
- fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval);
+ fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval);
  fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0);
  }
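
Note: the hunks above also change the public C API that this gem binds. llama_context_default_params() now defaults f16_kv to true and replaces n_parts with an n_gpu_layers field that llama_init_from_file forwards to the loader, and llama_eval now fails when a fresh evaluation (n_past == 0) does not start with the BOS token. A minimal C++ sketch of driving the updated API follows; the model path is a placeholder and error handling is trimmed, so treat it as an illustration rather than part of the gem:

#include "llama.h"

#include <cstdio>
#include <vector>

int main() {
    // Defaults changed in this release: f16_kv is now true, and n_gpu_layers
    // (left at 0 here) replaces the old n_parts field.
    llama_context_params params = llama_context_default_params();
    params.n_ctx        = 512;
    params.n_gpu_layers = 0; // layers to offload when built with GGML_USE_CUBLAS

    // "model.bin" is a placeholder path to a ggjt v2 model file.
    llama_context * ctx = llama_init_from_file("model.bin", params);
    if (ctx == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    // Evaluation of a fresh context now requires the first token to be BOS,
    // so seed the token list with llama_token_bos() explicitly.
    std::vector<llama_token> tokens = { llama_token_bos() };
    if (llama_eval(ctx, tokens.data(), (int) tokens.size(), /*n_past*/ 0, /*n_threads*/ 4) != 0) {
        fprintf(stderr, "failed to eval\n");
    }

    llama_print_timings(ctx);
    llama_free(ctx);
    return 0;
}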