llama_cpp 0.1.0 → 0.1.1

This diff shows the changes between two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
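The diff below is of the bundled llama.cpp sources. The most visible API change is in llama_context_params: the n_parts field is replaced by n_gpu_layers (used for cuBLAS layer offloading) and f16_kv now defaults to true; see the hunks around @@ -808 and @@ -874. As a rough illustration only, here is a minimal sketch of how the new field might be set, assuming the llama.h C API shipped with this version; the model path is a placeholder, not part of the package.

```cpp
// Minimal usage sketch (not part of the diff). Assumes the llama.h C API bundled
// with this release; the model path below is a placeholder.
#include "llama.h"

#include <cstdio>

int main() {
    llama_context_params params = llama_context_default_params();

    params.n_ctx        = 512; // unchanged default context size
    params.n_gpu_layers = 32;  // new field replacing n_parts; 0 (the default) keeps all layers on the CPU
    // params.f16_kv now defaults to true, so the KV cache is stored as F16 unless overridden here

    llama_context * ctx = llama_init_from_file("models/7B/ggml-model-q4_0.bin", params);
    if (ctx == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    // ... tokenize, evaluate, sample ...

    llama_free(ctx);
    return 0;
}
```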
@@ -9,6 +9,9 @@
  #include "llama.h"

  #include "ggml.h"
+ #ifdef GGML_USE_CUBLAS
+ #include "ggml-cuda.h"
+ #endif

  #include <array>
  #include <ctime>
@@ -50,49 +53,49 @@ static const size_t MB = 1024*1024;

  static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
  {
- static std::map<e_model, size_t> _MEM_REQ_SCRATCH0 = {
+ static std::map<e_model, size_t> k_sizes = {
  { MODEL_7B, 512ull * MB },
  { MODEL_13B, 512ull * MB },
  { MODEL_30B, 512ull * MB },
  { MODEL_65B, 1024ull * MB },
  };
- return _MEM_REQ_SCRATCH0;
+ return k_sizes;
  }

  static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
  {
- static std::map<e_model, size_t> _MEM_REQ_SCRATCH1 = {
+ static std::map<e_model, size_t> k_sizes = {
  { MODEL_7B, 512ull * MB },
  { MODEL_13B, 512ull * MB },
  { MODEL_30B, 512ull * MB },
  { MODEL_65B, 1024ull * MB },
  };
- return _MEM_REQ_SCRATCH1;
+ return k_sizes;
  }

  // 2*n_embd*n_ctx*n_layer*sizeof(float16)
  static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
  {
- static std::map<e_model, size_t> _MEM_REQ_KV_SELF = {
+ static std::map<e_model, size_t> k_sizes = {
  { MODEL_7B, 1026ull * MB },
  { MODEL_13B, 1608ull * MB },
  { MODEL_30B, 3124ull * MB },
  { MODEL_65B, 5120ull * MB },
  };
- return _MEM_REQ_KV_SELF;
+ return k_sizes;
  }

  // this is mostly needed for temporary mul_mat buffers to dequantize the data
  // not actually needed if BLAS is disabled
  static const std::map<e_model, size_t> & MEM_REQ_EVAL()
  {
- static std::map<e_model, size_t> _MEM_REQ_EVAL = {
+ static std::map<e_model, size_t> k_sizes = {
  { MODEL_7B, 768ull * MB },
  { MODEL_13B, 1024ull * MB },
  { MODEL_30B, 1280ull * MB },
  { MODEL_65B, 1536ull * MB },
  };
- return _MEM_REQ_EVAL;
+ return k_sizes;
  }

  // default hparams (LLaMA 7B)
@@ -402,6 +405,7 @@ enum llama_file_version {
  LLAMA_FILE_VERSION_GGML,
  LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab
  LLAMA_FILE_VERSION_GGJT_V1, // added padding
+ LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format
  };

  struct llama_file_loader {
@@ -432,6 +436,8 @@ struct llama_file_loader {
  file_version = LLAMA_FILE_VERSION_GGMF_V1;
  } else if (magic == 'ggjt' && version == 1) {
  file_version = LLAMA_FILE_VERSION_GGJT_V1;
+ } else if (magic == 'ggjt' && version == 2) {
+ file_version = LLAMA_FILE_VERSION_GGJT_V2;
  } else {
  throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
  magic, version);
@@ -482,7 +488,6 @@ struct llama_file_loader {
  case GGML_TYPE_F16:
  case GGML_TYPE_Q4_0:
  case GGML_TYPE_Q4_1:
- case GGML_TYPE_Q4_2:
  case GGML_TYPE_Q5_0:
  case GGML_TYPE_Q5_1:
  case GGML_TYPE_Q8_0:
@@ -527,8 +532,8 @@ struct llama_file_saver {
  write_vocab();
  }
  void write_magic() {
- file.write_u32('ggjt'); // magic
- file.write_u32(1); // version
+ file.write_u32(LLAMA_FILE_MAGIC); // magic
+ file.write_u32(LLAMA_FILE_VERSION); // version
  }
  void write_hparams(enum llama_ftype new_ftype) {
  const llama_hparams & hparams = any_file_loader->hparams;
@@ -558,7 +563,6 @@ struct llama_file_saver {
  case GGML_TYPE_F16:
  case GGML_TYPE_Q4_0:
  case GGML_TYPE_Q4_1:
- case GGML_TYPE_Q4_2:
  case GGML_TYPE_Q5_0:
  case GGML_TYPE_Q5_1:
  case GGML_TYPE_Q8_0:
@@ -585,12 +589,12 @@ struct llama_model_loader {
  std::unique_ptr<llama_mmap> mapping;

  llama_model_loader(const std::string & fname_base, bool use_mmap, bool vocab_only) {
- auto first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
+ auto * first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
  file_loaders.emplace_back(first_file);
  uint32_t n_parts = vocab_only ? 1 : guess_n_parts();
  for (uint32_t i = 1; i < n_parts; i++) {
  std::string fname = fname_base + "." + std::to_string(i);
- auto ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
+ auto * ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
  file_loaders.emplace_back(ith_file);
  if (ith_file->hparams != first_file->hparams) {
  throw format("llama.cpp: hparams inconsistent between files");
@@ -637,7 +641,7 @@ struct llama_model_loader {
  }
  }

- struct ggml_tensor * get_tensor(const std::string & name, std::vector<uint32_t> ne) {
+ struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne) {
  auto it = tensors_map.name_to_idx.find(name);
  if (it == tensors_map.name_to_idx.end()) {
  throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
@@ -666,7 +670,7 @@ struct llama_model_loader {
  return tensor;
  }

- void done_getting_tensors() {
+ void done_getting_tensors() const {
  if (num_ggml_tensors_created != tensors_map.tensors.size()) {
  throw std::string("llama.cpp: file contained more tensors than expected");
  }
@@ -808,9 +812,9 @@ static bool kv_cache_init(
  struct llama_context_params llama_context_default_params() {
  struct llama_context_params result = {
  /*.n_ctx =*/ 512,
- /*.n_parts =*/ -1,
+ /*.gpu_layers =*/ 0,
  /*.seed =*/ -1,
- /*.f16_kv =*/ false,
+ /*.f16_kv =*/ true,
  /*.logits_all =*/ false,
  /*.vocab_only =*/ false,
  /*.use_mmap =*/ true,
@@ -839,9 +843,11 @@ static const char *llama_file_version_name(llama_file_version version) {
  switch (version) {
  case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)";
  case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
- case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (latest)";
- default: LLAMA_ASSERT(false);
+ case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1405)";
+ case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (latest)";
  }
+
+ return "unknown";
  }

  static const char *llama_ftype_name(enum llama_ftype ftype) {
@@ -852,7 +858,6 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
  case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
  case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
  return "mostly Q4_1, some F16";
- case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2";
  case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
  case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
  case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
@@ -874,6 +879,7 @@ static void llama_model_load_internal(
  const std::string & fname,
  llama_context & lctx,
  int n_ctx,
+ int n_gpu_layers,
  ggml_type memory_type,
  bool use_mmap,
  bool use_mlock,
@@ -918,15 +924,24 @@ static void llama_model_load_internal(
  fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
  }

+ if (file_version != LLAMA_FILE_VERSION_GGJT_V2) {
+ if (hparams.ftype != LLAMA_FTYPE_ALL_F32 &&
+ hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 &&
+ hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) {
+ throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1305)");
+ }
+ }
+
  if (vocab_only) {
  return;
  }

  auto & ctx = model.ctx;

- size_t ctx_size, mmapped_size;
+ size_t ctx_size;
+ size_t mmapped_size;
  ml->calc_sizes(&ctx_size, &mmapped_size);
- fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0);
+ fprintf(stderr, "%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/1024.0/1024.0);

  // print memory requirements
  {
@@ -970,8 +985,6 @@ static void llama_model_load_internal(

  // prepare memory for the weights
  {
- const auto & hparams = model.hparams;
-
  const uint32_t n_embd = hparams.n_embd;
  const uint32_t n_layer = hparams.n_layer;
  const uint32_t n_vocab = hparams.n_vocab;
@@ -1013,6 +1026,35 @@ static void llama_model_load_internal(
  ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);

  model.mapping = std::move(ml->mapping);
+ #ifdef GGML_USE_CUBLAS
+ {
+ const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
+
+ fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
+
+ size_t vram_total = 0;
+
+ for (int i = 0; i < n_gpu; ++i) {
+ const auto & layer = model.layers[i];
+
+ ggml_cuda_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
+ ggml_cuda_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
+ ggml_cuda_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
+ ggml_cuda_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
+ ggml_cuda_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
+ ggml_cuda_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
+ ggml_cuda_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
+ }
+ if (n_gpu_layers > (int) hparams.n_layer) {
+ fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
+ ggml_cuda_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
+ }
+
+ fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+ }
+ #else
+ (void) n_gpu_layers;
+ #endif

  // loading time will be recalculate after the first eval, so
  // we take page faults deferred by mmap() into consideration
@@ -1023,6 +1065,7 @@ static bool llama_model_load(
  const std::string & fname,
  llama_context & lctx,
  int n_ctx,
+ int n_gpu_layers,
  ggml_type memory_type,
  bool use_mmap,
  bool use_mlock,
@@ -1030,7 +1073,7 @@ static bool llama_model_load(
  llama_progress_callback progress_callback,
  void *progress_callback_user_data) {
  try {
- llama_model_load_internal(fname, lctx, n_ctx, memory_type, use_mmap, use_mlock,
+ llama_model_load_internal(fname, lctx, n_ctx, n_gpu_layers, memory_type, use_mmap, use_mlock,
  vocab_only, progress_callback, progress_callback_user_data);
  return true;
  } catch (const std::string & err) {
@@ -1052,6 +1095,13 @@ static bool llama_eval_internal(
  const int n_tokens,
  const int n_past,
  const int n_threads) {
+
+ // enforce that the first token is BOS
+ if (n_past == 0 && tokens[0] != llama_token_bos()) {
+ fprintf(stderr, "%s: first token must be BOS\n", __func__);
+ return false;
+ }
+
  const int64_t t_start_us = ggml_time_us();

  const int N = n_tokens;
@@ -1059,7 +1109,7 @@ static bool llama_eval_internal(
  const auto & model = lctx.model;
  const auto & hparams = model.hparams;

- auto & kv_self = model.kv_self;
+ const auto & kv_self = model.kv_self;

  LLAMA_ASSERT(!!kv_self.ctx);

@@ -1112,8 +1162,8 @@ static bool llama_eval_internal(
  // self-attention
  {
  // compute Q and K and RoPE them
- struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
- struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
+ struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
+ struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
  ggml_set_name(Qcur, "Qcur");
  ggml_set_name(Kcur, "Kcur");

@@ -1154,17 +1204,19 @@ static bool llama_eval_internal(
  struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head));
  ggml_set_name(KQ_scale, "1/sqrt(n_embd/n_head)");

- struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
+ // KQ_scaled shape [n_past + N, N, n_head, 1]
+ struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
  ggml_set_name(KQ_scaled, "KQ_scaled");

  // KQ_masked = mask_past(KQ_scaled)
- struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
+ struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
  ggml_set_name(KQ_masked, "KQ_masked");

  // KQ = soft_max(KQ_masked)
- struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
  ggml_set_name(KQ_soft_max, "KQ_soft_max");

+
  // split cached V into n_head heads
  struct ggml_tensor * V =
  ggml_view_3d(ctx0, kv_self.v,
@@ -1265,7 +1317,7 @@ static bool llama_eval_internal(
  lctx.use_buf(ctx0, -1);

  // logits -> probs
- //inpL = ggml_soft_max(ctx0, inpL);
+ //inpL = ggml_soft_max_inplace(ctx0, inpL);

  // run the computation
  ggml_build_forward_expand(&gf, inpL);
@@ -1303,7 +1355,7 @@ static bool llama_eval_internal(
  }

  // extract embeddings
- if (lctx.embedding.size()) {
+ if (!lctx.embedding.empty()) {
  auto & embedding_out = lctx.embedding;

  embedding_out.resize(n_embd);
@@ -1354,6 +1406,8 @@ struct llama_sp_symbol {
  size_t n;
  };

+ static_assert(std::is_trivially_copyable<llama_sp_symbol>::value, "llama_sp_symbol is not trivially copyable");
+
  struct llama_sp_bigram {
  struct comparator {
  bool operator()(llama_sp_bigram & l, llama_sp_bigram & r) {
@@ -1386,7 +1440,7 @@ struct llama_tokenizer {
  sym.prev = index - 1;
  sym.next = offs == text.size() ? -1 : index + 1;
  index++;
- symbols_.emplace_back(std::move(sym));
+ symbols_.emplace_back(sym);
  }

  // seed the work queue with all possible 2-character tokens.
@@ -1477,12 +1531,12 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
  llama_tokenizer tokenizer(vocab);
  std::vector<llama_vocab::id> output;

- if (text.size() == 0) {
+ if (text.empty()) {
  return output;
  }

  if (bos) {
- output.push_back(1);
+ output.push_back(llama_token_bos());
  }

  tokenizer.tokenize(text, output);
@@ -1713,7 +1767,7 @@ void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_dat
  const int64_t t_start_sample_us = ggml_time_us();

  for (size_t i = 0; i < candidates->size; ++i) {
- auto token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id);
+ const auto * token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id);
  if (token_iter == last_tokens + last_tokens_size) {
  continue;
  }
@@ -1791,7 +1845,7 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_
  float k = powf((epsilon_hat * powf(2, *mu)) / (1 - powf(N, -epsilon_hat)), 1 / s_hat);

  // Sample the next word X using top-k sampling
- llama_sample_top_k(nullptr, candidates, int(k));
+ llama_sample_top_k(nullptr, candidates, int(k), 1);
  if (ctx) {
  ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
  }
@@ -1857,7 +1911,7 @@ llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_da
  const int64_t t_start_sample_us = ggml_time_us();

  // Find max element
- auto max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
+ auto * max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
  return a.logit < b.logit;
  });

@@ -1900,7 +1954,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  switch (ftype) {
  case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
  case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
- case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
  case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
  case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
  case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
@@ -1911,7 +1964,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  nthread = std::thread::hardware_concurrency();
  }

- std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false,
+ std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false,
  /*vocab_only*/ false));
  llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);

@@ -1965,7 +2018,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  } else if (tensor.type == GGML_TYPE_F16) {
  f32_conv_buf.resize(nelements * sizeof(float));
  f32_data = (float *) f32_conv_buf.addr;
- auto f16_data = (const ggml_fp16_t *) tensor.data;
+ const auto * f16_data = (const ggml_fp16_t *) tensor.data;
  for (size_t i = 0; i < nelements; i++) {
  f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
  }
@@ -1996,21 +2049,31 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  size_t first = counter; counter += chunk_size;
  if (first >= nelements) {
  if (!local_hist.empty()) {
- for (int j=0; j<int(local_hist.size()); ++j) hist_cur[j] += local_hist[j];
+ for (int j=0; j<int(local_hist.size()); ++j) {
+ hist_cur[j] += local_hist[j];
+ }
  new_size += local_size;
  }
  break;
  }
  lock.unlock();
  size_t last = std::min(nelements, first + chunk_size);
- if (local_hist.empty()) local_hist.resize(hist_cur.size(), 0);
+ if (local_hist.empty()) {
+ local_hist.resize(hist_cur.size(), 0);
+ }
  local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
  }
  };
- if (int(workers.size()) < nthread_use - 1) workers.resize(nthread_use - 1);
- for (int it = 0; it < nthread_use - 1; ++it) workers[it] = std::thread(compute);
+ if ((int) workers.size() < nthread_use - 1) {
+ workers.resize(nthread_use - 1);
+ }
+ for (int it = 0; it < nthread_use - 1; ++it) {
+ workers[it] = std::thread(compute);
+ }
  compute();
- for (int it = 0; it < nthread_use - 1; ++it) workers[it].join();
+ for (int it = 0; it < nthread_use - 1; ++it) {
+ workers[it].join();
+ }
  }

  printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
@@ -2082,7 +2145,7 @@ struct llama_context * llama_init_from_file(

  ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;

- if (!llama_model_load(path_model, *ctx, params.n_ctx, memory_type,
+ if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_gpu_layers, memory_type,
  params.use_mmap, params.use_mlock, params.vocab_only,
  params.progress_callback, params.progress_callback_user_data)) {
  fprintf(stderr, "%s: failed to load model\n", __func__);
@@ -2208,7 +2271,8 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
  fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
  model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false));

- size_t ctx_size, mmapped_size;
+ size_t ctx_size;
+ size_t mmapped_size;
  model_loader->calc_sizes(&ctx_size, &mmapped_size);
  base_buf.resize(ctx_size);

@@ -2247,8 +2311,12 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
  fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
  }

- std::string name(length, 0);
- fin.read(&name[0], length);
+ std::string name;
+ {
+ char buf[1024];
+ fin.read(buf, length);
+ name = std::string(buf, length);
+ }

  // check for lora suffix and get the type of tensor
  const std::string lora_suffix = ".lora";
@@ -2263,7 +2331,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
  base_name.erase(pos);
  // fprintf(stderr, "%s: %s => %s (lora type %s) ", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());

- if (model_tensors.find(base_name.data()) == model_tensors.end()) {
+ if (model_tensors.find(base_name) == model_tensors.end()) {
  fprintf(stderr, "%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
  return 1;
  }
@@ -2343,7 +2411,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *

  if (scaling != 1.0f) {
  ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
- BA = ggml_scale(lora_ctx, BA, scale_tensor);
+ BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor);
  }

  ggml_tensor * r;
@@ -2365,8 +2433,9 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
  lora_tensors.clear();

  n_tensors++;
- if (n_tensors % 4 == 0)
+ if (n_tensors % 4 == 0) {
  fprintf(stderr, ".");
+ }
  }
  }

@@ -2395,7 +2464,7 @@ int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
  return ctx->model.kv_self.n;
  }

- #define LLAMA_MAX_RNG_STATE 64*1024
+ #define LLAMA_MAX_RNG_STATE (64*1024)

  void llama_set_rng_seed(struct llama_context * ctx, int seed) {
  if (seed < 0) {
@@ -2436,8 +2505,8 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
  }

  // Copies the state to the specified destination address
- size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
- uint8_t * out = dest;
+ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
+ uint8_t * out = dst;

  // copy rng
  {
@@ -2497,7 +2566,9 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {

  if (kv_size) {
  const size_t elt_size = ggml_element_size(kv_self.k);
+
  char buffer[4096];
+
  ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
  ggml_cgraph gf{};
  gf.n_threads = 1;
@@ -2521,10 +2592,12 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
  ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
  ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
  ggml_graph_compute(cpy_ctx, &gf);
+
+ ggml_free(cpy_ctx);
  }
  }

- const size_t written = out - dest;
+ const size_t written = out - dst;
  const size_t max_size = llama_get_state_size(ctx);

  LLAMA_ASSERT(written <= max_size);
@@ -2534,15 +2607,15 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {

  // Sets the state reading from the specified source address
  size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
- const uint8_t * in = src;
+ const uint8_t * inp = src;

  // set rng
  {
  size_t rng_size;
  char rng_buf[LLAMA_MAX_RNG_STATE];

- memcpy(&rng_size, in, sizeof(rng_size)); in += sizeof(rng_size);
- memcpy(&rng_buf[0], in, LLAMA_MAX_RNG_STATE); in += LLAMA_MAX_RNG_STATE;
+ memcpy(&rng_size, inp, sizeof(rng_size)); inp += sizeof(rng_size);
+ memcpy(&rng_buf[0], inp, LLAMA_MAX_RNG_STATE); inp += LLAMA_MAX_RNG_STATE;

  std::stringstream rng_ss;
  rng_ss.str(std::string(&rng_buf[0], rng_size));
@@ -2556,30 +2629,30 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
  size_t logits_cap;
  size_t logits_size;

- memcpy(&logits_cap, in, sizeof(logits_cap)); in += sizeof(logits_cap);
- memcpy(&logits_size, in, sizeof(logits_size)); in += sizeof(logits_size);
+ memcpy(&logits_cap, inp, sizeof(logits_cap)); inp += sizeof(logits_cap);
+ memcpy(&logits_size, inp, sizeof(logits_size)); inp += sizeof(logits_size);

  LLAMA_ASSERT(ctx->logits.capacity() == logits_cap);

  if (logits_size) {
  ctx->logits.resize(logits_size);
- memcpy(ctx->logits.data(), in, logits_size * sizeof(float));
+ memcpy(ctx->logits.data(), inp, logits_size * sizeof(float));
  }

- in += logits_cap * sizeof(float);
+ inp += logits_cap * sizeof(float);
  }

  // set embeddings
  {
  size_t embedding_size;

- memcpy(&embedding_size, in, sizeof(embedding_size)); in += sizeof(embedding_size);
+ memcpy(&embedding_size, inp, sizeof(embedding_size)); inp += sizeof(embedding_size);

  LLAMA_ASSERT(ctx->embedding.capacity() == embedding_size);

  if (embedding_size) {
- memcpy(ctx->embedding.data(), in, embedding_size * sizeof(float));
- in += embedding_size * sizeof(float);
+ memcpy(ctx->embedding.data(), inp, embedding_size * sizeof(float));
+ inp += embedding_size * sizeof(float);
  }
  }

@@ -2594,25 +2667,27 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
  size_t kv_size;
  int kv_ntok;

- memcpy(&kv_size, in, sizeof(kv_size)); in += sizeof(kv_size);
- memcpy(&kv_ntok, in, sizeof(kv_ntok)); in += sizeof(kv_ntok);
+ memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
+ memcpy(&kv_ntok, inp, sizeof(kv_ntok)); inp += sizeof(kv_ntok);

  if (kv_size) {
  LLAMA_ASSERT(kv_self.buf.size == kv_size);

  const size_t elt_size = ggml_element_size(kv_self.k);
+
  char buffer[4096];
+
  ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
  ggml_cgraph gf{};
  gf.n_threads = 1;

  ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
- kin3d->data = (void *) in;
- in += ggml_nbytes(kin3d);
+ kin3d->data = (void *) inp;
+ inp += ggml_nbytes(kin3d);

  ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
- vin3d->data = (void *) in;
- in += ggml_nbytes(vin3d);
+ vin3d->data = (void *) inp;
+ inp += ggml_nbytes(vin3d);

  ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
  n_embd, kv_ntok, n_layer,
@@ -2625,12 +2700,14 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
  ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
  ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
  ggml_graph_compute(cpy_ctx, &gf);
+
+ ggml_free(cpy_ctx);
  }

  ctx->model.kv_self.n = kv_ntok;
  }

- const size_t nread = in - src;
+ const size_t nread = inp - src;
  const size_t max_size = llama_get_state_size(ctx);

  LLAMA_ASSERT(nread <= max_size);
@@ -2646,7 +2723,7 @@ bool llama_load_session_file(struct llama_context * ctx, const char * path_sessi
  const uint32_t magic = file.read_u32();
  const uint32_t version = file.read_u32();

- if (!(magic == LLAMA_SESSION_MAGIC && version == LLAMA_SESSION_VERSION)) {
+ if (magic != LLAMA_SESSION_MAGIC || version != LLAMA_SESSION_VERSION) {
  fprintf(stderr, "%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
  return false;
  }
@@ -2727,11 +2804,14 @@ int llama_eval(
  fprintf(stderr, "%s: failed to eval\n", __func__);
  return 1;
  }
+
  // get a more accurate load time, upon first eval
+ // TODO: fix this
  if (!ctx->has_evaluated_once) {
  ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
  ctx->has_evaluated_once = true;
  }
+
  return 0;
  }

@@ -2805,9 +2885,9 @@ void llama_print_timings(struct llama_context * ctx) {

  fprintf(stderr, "\n");
  fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
- fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample);
+ fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample);
  fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval);
- fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval);
+ fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval);
  fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0);
  }
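The hunk at @@ -1052 also tightens the eval entry point: when n_past == 0, the first token must now be llama_token_bos(), otherwise evaluation fails. Callers that build token buffers by hand should tokenize with add_bos set to true (or prepend the BOS token themselves). A minimal sketch, assuming the public llama.h tokenizer and eval API of this release; the helper name and buffer size are illustrative only.

```cpp
// Hypothetical helper (not from the package): evaluate a prompt so that the new
// BOS check in llama_eval_internal passes on the first call.
#include "llama.h"

#include <cstdio>
#include <vector>

int eval_prompt(llama_context * ctx, const char * prompt, int n_threads) {
    std::vector<llama_token> tokens(512); // illustrative buffer size

    // add_bos = true prepends llama_token_bos(), which the updated eval path
    // requires when n_past == 0
    const int n = llama_tokenize(ctx, prompt, tokens.data(), (int) tokens.size(), /*add_bos*/ true);
    if (n < 0) {
        fprintf(stderr, "prompt does not fit into the token buffer\n");
        return 1;
    }

    // llama_eval returns 0 on success, non-zero on failure (including a missing BOS token)
    if (llama_eval(ctx, tokens.data(), n, /*n_past*/ 0, n_threads)) {
        fprintf(stderr, "llama_eval failed\n");
        return 1;
    }

    return 0;
}
```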