llama_cpp 0.0.7 → 0.1.1: diff of the gem's bundled llama.cpp source

@@ -5,10 +5,13 @@
5
5
  #include <cstdio>
6
6
  #endif
7
7
 
8
- #include "llama_util.h"
8
+ #include "llama-util.h"
9
9
  #include "llama.h"
10
10
 
11
11
  #include "ggml.h"
12
+ #ifdef GGML_USE_CUBLAS
13
+ #include "ggml-cuda.h"
14
+ #endif
12
15
 
13
16
  #include <array>
14
17
  #include <ctime>
@@ -28,11 +31,11 @@
28
31
  #include <atomic>
29
32
  #include <mutex>
30
33
  #include <sstream>
34
+ #include <numeric>
31
35
 
32
36
  #define LLAMA_USE_SCRATCH
33
37
  #define LLAMA_MAX_SCRATCH_BUFFERS 16
34
38
 
35
-
36
39
  // available llama models
37
40
  enum e_model {
38
41
  MODEL_UNKNOWN,
@@ -50,49 +53,49 @@ static const size_t MB = 1024*1024;
50
53
 
51
54
  static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
52
55
  {
53
- static std::map<e_model, size_t> _MEM_REQ_SCRATCH0 = {
56
+ static std::map<e_model, size_t> k_sizes = {
54
57
  { MODEL_7B, 512ull * MB },
55
58
  { MODEL_13B, 512ull * MB },
56
59
  { MODEL_30B, 512ull * MB },
57
60
  { MODEL_65B, 1024ull * MB },
58
61
  };
59
- return _MEM_REQ_SCRATCH0;
62
+ return k_sizes;
60
63
  }
61
64
 
62
65
  static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
63
66
  {
64
- static std::map<e_model, size_t> _MEM_REQ_SCRATCH1 = {
67
+ static std::map<e_model, size_t> k_sizes = {
65
68
  { MODEL_7B, 512ull * MB },
66
69
  { MODEL_13B, 512ull * MB },
67
70
  { MODEL_30B, 512ull * MB },
68
71
  { MODEL_65B, 1024ull * MB },
69
72
  };
70
- return _MEM_REQ_SCRATCH1;
73
+ return k_sizes;
71
74
  }
72
75
 
73
76
  // 2*n_embd*n_ctx*n_layer*sizeof(float16)
74
77
  static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
75
78
  {
76
- static std::map<e_model, size_t> _MEM_REQ_KV_SELF = {
79
+ static std::map<e_model, size_t> k_sizes = {
77
80
  { MODEL_7B, 1026ull * MB },
78
81
  { MODEL_13B, 1608ull * MB },
79
82
  { MODEL_30B, 3124ull * MB },
80
83
  { MODEL_65B, 5120ull * MB },
81
84
  };
82
- return _MEM_REQ_KV_SELF;
85
+ return k_sizes;
83
86
  }
84
87
 
85
88
  // this is mostly needed for temporary mul_mat buffers to dequantize the data
86
89
  // not actually needed if BLAS is disabled
87
90
  static const std::map<e_model, size_t> & MEM_REQ_EVAL()
88
91
  {
89
- static std::map<e_model, size_t> _MEM_REQ_EVAL = {
92
+ static std::map<e_model, size_t> k_sizes = {
90
93
  { MODEL_7B, 768ull * MB },
91
94
  { MODEL_13B, 1024ull * MB },
92
95
  { MODEL_30B, 1280ull * MB },
93
96
  { MODEL_65B, 1536ull * MB },
94
97
  };
95
- return _MEM_REQ_EVAL;
98
+ return k_sizes;
96
99
  }
97
100
 
98
101
  // default hparams (LLaMA 7B)
@@ -136,7 +139,7 @@ struct llama_kv_cache {
136
139
 
137
140
  struct ggml_context * ctx = NULL;
138
141
 
139
- llama_buffer buf;
142
+ llama_ctx_buffer buf;
140
143
 
141
144
  int n; // number of tokens currently in the cache
142
145
 
@@ -167,7 +170,7 @@ struct llama_model {
167
170
  struct llama_kv_cache kv_self;
168
171
 
169
172
  // the model memory buffer
170
- llama_buffer buf;
173
+ llama_ctx_buffer buf;
171
174
 
172
175
  // model memory mapped file
173
176
  std::unique_ptr<llama_mmap> mapping;
@@ -228,8 +231,8 @@ struct llama_context {
228
231
 
229
232
  // memory buffers used to evaluate the model
230
233
  // TODO: move in llama_state
231
- llama_buffer buf_compute;
232
- llama_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
234
+ llama_ctx_buffer buf_compute;
235
+ llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
233
236
 
234
237
  int buf_last = 0;
235
238
  size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
@@ -402,6 +405,7 @@ enum llama_file_version {
402
405
  LLAMA_FILE_VERSION_GGML,
403
406
  LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab
404
407
  LLAMA_FILE_VERSION_GGJT_V1, // added padding
408
+ LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format
405
409
  };
406
410
 
407
411
  struct llama_file_loader {
@@ -432,6 +436,8 @@ struct llama_file_loader {
432
436
  file_version = LLAMA_FILE_VERSION_GGMF_V1;
433
437
  } else if (magic == 'ggjt' && version == 1) {
434
438
  file_version = LLAMA_FILE_VERSION_GGJT_V1;
439
+ } else if (magic == 'ggjt' && version == 2) {
440
+ file_version = LLAMA_FILE_VERSION_GGJT_V2;
435
441
  } else {
436
442
  throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
437
443
  magic, version);
@@ -482,8 +488,6 @@ struct llama_file_loader {
482
488
  case GGML_TYPE_F16:
483
489
  case GGML_TYPE_Q4_0:
484
490
  case GGML_TYPE_Q4_1:
485
- case GGML_TYPE_Q4_2:
486
- case GGML_TYPE_Q4_3:
487
491
  case GGML_TYPE_Q5_0:
488
492
  case GGML_TYPE_Q5_1:
489
493
  case GGML_TYPE_Q8_0:
@@ -528,8 +532,8 @@ struct llama_file_saver {
528
532
  write_vocab();
529
533
  }
530
534
  void write_magic() {
531
- file.write_u32('ggjt'); // magic
532
- file.write_u32(1); // version
535
+ file.write_u32(LLAMA_FILE_MAGIC); // magic
536
+ file.write_u32(LLAMA_FILE_VERSION); // version
533
537
  }
534
538
  void write_hparams(enum llama_ftype new_ftype) {
535
539
  const llama_hparams & hparams = any_file_loader->hparams;
@@ -559,8 +563,6 @@ struct llama_file_saver {
559
563
  case GGML_TYPE_F16:
560
564
  case GGML_TYPE_Q4_0:
561
565
  case GGML_TYPE_Q4_1:
562
- case GGML_TYPE_Q4_2:
563
- case GGML_TYPE_Q4_3:
564
566
  case GGML_TYPE_Q5_0:
565
567
  case GGML_TYPE_Q5_1:
566
568
  case GGML_TYPE_Q8_0:
@@ -587,12 +589,12 @@ struct llama_model_loader {
587
589
  std::unique_ptr<llama_mmap> mapping;
588
590
 
589
591
  llama_model_loader(const std::string & fname_base, bool use_mmap, bool vocab_only) {
590
- auto first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
592
+ auto * first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
591
593
  file_loaders.emplace_back(first_file);
592
594
  uint32_t n_parts = vocab_only ? 1 : guess_n_parts();
593
595
  for (uint32_t i = 1; i < n_parts; i++) {
594
596
  std::string fname = fname_base + "." + std::to_string(i);
595
- auto ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
597
+ auto * ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
596
598
  file_loaders.emplace_back(ith_file);
597
599
  if (ith_file->hparams != first_file->hparams) {
598
600
  throw format("llama.cpp: hparams inconsistent between files");
@@ -639,7 +641,7 @@ struct llama_model_loader {
639
641
  }
640
642
  }
641
643
 
642
- struct ggml_tensor * get_tensor(const std::string & name, std::vector<uint32_t> ne) {
644
+ struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne) {
643
645
  auto it = tensors_map.name_to_idx.find(name);
644
646
  if (it == tensors_map.name_to_idx.end()) {
645
647
  throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
@@ -661,13 +663,14 @@ struct llama_model_loader {
661
663
  LLAMA_ASSERT(lt.ne.size() == 1);
662
664
  tensor = ggml_new_tensor_1d(ggml_ctx, lt.type, lt.ne.at(0));
663
665
  }
666
+ ggml_set_name(tensor, lt.name.c_str());
664
667
  LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
665
668
  lt.ggml_tensor = tensor;
666
669
  num_ggml_tensors_created++;
667
670
  return tensor;
668
671
  }
669
672
 
670
- void done_getting_tensors() {
673
+ void done_getting_tensors() const {
671
674
  if (num_ggml_tensors_created != tensors_map.tensors.size()) {
672
675
  throw std::string("llama.cpp: file contained more tensors than expected");
673
676
  }
@@ -729,8 +732,7 @@ struct llama_model_loader {
729
732
  LLAMA_ASSERT(offset == lt.size);
730
733
  } else if (lt.split_type == SPLIT_BY_COLUMNS) {
731
734
  // Let's load the data into temporary buffers to ensure the OS performs large loads.
732
- std::vector<llama_buffer> tmp_bufs;
733
- tmp_bufs.resize(lt.shards.size());
735
+ std::vector<llama_buffer> tmp_bufs(lt.shards.size());
734
736
  for (size_t i = 0; i < lt.shards.size(); i++) {
735
737
  llama_load_tensor_shard & shard = lt.shards.at(i);
736
738
  llama_file & file = file_loaders.at(shard.file_idx)->file;
@@ -782,7 +784,7 @@ static bool kv_cache_init(
782
784
  const int n_embd = hparams.n_embd;
783
785
  const int n_layer = hparams.n_layer;
784
786
 
785
- const int64_t n_mem = (int64_t)n_layer*n_ctx;
787
+ const int64_t n_mem = n_layer*n_ctx;
786
788
  const int64_t n_elements = n_embd*n_mem;
787
789
 
788
790
  cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
@@ -801,6 +803,8 @@ static bool kv_cache_init(
801
803
 
802
804
  cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
803
805
  cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
806
+ ggml_set_name(cache.k, "cache_k");
807
+ ggml_set_name(cache.v, "cache_v");
804
808
 
805
809
  return true;
806
810
  }
@@ -808,9 +812,9 @@ static bool kv_cache_init(
808
812
  struct llama_context_params llama_context_default_params() {
809
813
  struct llama_context_params result = {
810
814
  /*.n_ctx =*/ 512,
811
- /*.n_parts =*/ -1,
812
- /*.seed =*/ 0,
813
- /*.f16_kv =*/ false,
815
+ /*.gpu_layers =*/ 0,
816
+ /*.seed =*/ -1,
817
+ /*.f16_kv =*/ true,
814
818
  /*.logits_all =*/ false,
815
819
  /*.vocab_only =*/ false,
816
820
  /*.use_mmap =*/ true,
@@ -839,9 +843,11 @@ static const char *llama_file_version_name(llama_file_version version) {
839
843
  switch (version) {
840
844
  case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)";
841
845
  case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
842
- case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (latest)";
843
- default: LLAMA_ASSERT(false);
846
+ case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1405)";
847
+ case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (latest)";
844
848
  }
849
+
850
+ return "unknown";
845
851
  }
846
852
 
847
853
  static const char *llama_ftype_name(enum llama_ftype ftype) {
@@ -852,8 +858,6 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
852
858
  case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
853
859
  case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
854
860
  return "mostly Q4_1, some F16";
855
- case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2";
856
- case LLAMA_FTYPE_MOSTLY_Q4_3: return "mostly Q4_3";
857
861
  case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
858
862
  case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
859
863
  case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
@@ -875,6 +879,7 @@ static void llama_model_load_internal(
875
879
  const std::string & fname,
876
880
  llama_context & lctx,
877
881
  int n_ctx,
882
+ int n_gpu_layers,
878
883
  ggml_type memory_type,
879
884
  bool use_mmap,
880
885
  bool use_mlock,
@@ -919,15 +924,24 @@ static void llama_model_load_internal(
919
924
  fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
920
925
  }
921
926
 
927
+ if (file_version != LLAMA_FILE_VERSION_GGJT_V2) {
928
+ if (hparams.ftype != LLAMA_FTYPE_ALL_F32 &&
929
+ hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 &&
930
+ hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) {
931
+ throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1305)");
932
+ }
933
+ }
934
+
922
935
  if (vocab_only) {
923
936
  return;
924
937
  }
925
938
 
926
939
  auto & ctx = model.ctx;
927
940
 
928
- size_t ctx_size, mmapped_size;
941
+ size_t ctx_size;
942
+ size_t mmapped_size;
929
943
  ml->calc_sizes(&ctx_size, &mmapped_size);
930
- fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0);
944
+ fprintf(stderr, "%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/1024.0/1024.0);
931
945
 
932
946
  // print memory requirements
933
947
  {
@@ -971,8 +985,6 @@ static void llama_model_load_internal(
971
985
 
972
986
  // prepare memory for the weights
973
987
  {
974
- const auto & hparams = model.hparams;
975
-
976
988
  const uint32_t n_embd = hparams.n_embd;
977
989
  const uint32_t n_layer = hparams.n_layer;
978
990
  const uint32_t n_vocab = hparams.n_vocab;
@@ -1014,6 +1026,35 @@ static void llama_model_load_internal(
1014
1026
  ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
1015
1027
 
1016
1028
  model.mapping = std::move(ml->mapping);
1029
+ #ifdef GGML_USE_CUBLAS
1030
+ {
1031
+ const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
1032
+
1033
+ fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
1034
+
1035
+ size_t vram_total = 0;
1036
+
1037
+ for (int i = 0; i < n_gpu; ++i) {
1038
+ const auto & layer = model.layers[i];
1039
+
1040
+ ggml_cuda_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
1041
+ ggml_cuda_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
1042
+ ggml_cuda_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
1043
+ ggml_cuda_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
1044
+ ggml_cuda_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
1045
+ ggml_cuda_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
1046
+ ggml_cuda_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
1047
+ }
1048
+ if (n_gpu_layers > (int) hparams.n_layer) {
1049
+ fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
1050
+ ggml_cuda_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
1051
+ }
1052
+
1053
+ fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
1054
+ }
1055
+ #else
1056
+ (void) n_gpu_layers;
1057
+ #endif
1017
1058
 
1018
1059
  // loading time will be recalculate after the first eval, so
1019
1060
  // we take page faults deferred by mmap() into consideration
@@ -1024,6 +1065,7 @@ static bool llama_model_load(
1024
1065
  const std::string & fname,
1025
1066
  llama_context & lctx,
1026
1067
  int n_ctx,
1068
+ int n_gpu_layers,
1027
1069
  ggml_type memory_type,
1028
1070
  bool use_mmap,
1029
1071
  bool use_mlock,
@@ -1031,7 +1073,7 @@ static bool llama_model_load(
1031
1073
  llama_progress_callback progress_callback,
1032
1074
  void *progress_callback_user_data) {
1033
1075
  try {
1034
- llama_model_load_internal(fname, lctx, n_ctx, memory_type, use_mmap, use_mlock,
1076
+ llama_model_load_internal(fname, lctx, n_ctx, n_gpu_layers, memory_type, use_mmap, use_mlock,
1035
1077
  vocab_only, progress_callback, progress_callback_user_data);
1036
1078
  return true;
1037
1079
  } catch (const std::string & err) {
@@ -1053,6 +1095,13 @@ static bool llama_eval_internal(
1053
1095
  const int n_tokens,
1054
1096
  const int n_past,
1055
1097
  const int n_threads) {
1098
+
1099
+ // enforce that the first token is BOS
1100
+ if (n_past == 0 && tokens[0] != llama_token_bos()) {
1101
+ fprintf(stderr, "%s: first token must be BOS\n", __func__);
1102
+ return false;
1103
+ }
1104
+
1056
1105
  const int64_t t_start_us = ggml_time_us();
1057
1106
 
1058
1107
  const int N = n_tokens;
@@ -1060,7 +1109,7 @@ static bool llama_eval_internal(
1060
1109
  const auto & model = lctx.model;
1061
1110
  const auto & hparams = model.hparams;
1062
1111
 
1063
- auto & kv_self = model.kv_self;
1112
+ const auto & kv_self = model.kv_self;
1064
1113
 
1065
1114
  LLAMA_ASSERT(!!kv_self.ctx);
1066
1115
 
@@ -1088,6 +1137,7 @@ static bool llama_eval_internal(
1088
1137
  gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
1089
1138
 
1090
1139
  struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
1140
+ ggml_set_name(embd, "embd");
1091
1141
  memcpy(embd->data, tokens, N*ggml_element_size(embd));
1092
1142
 
1093
1143
  struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
@@ -1112,8 +1162,10 @@ static bool llama_eval_internal(
1112
1162
  // self-attention
1113
1163
  {
1114
1164
  // compute Q and K and RoPE them
1115
- struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
1116
- struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
1165
+ struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
1166
+ struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
1167
+ ggml_set_name(Qcur, "Qcur");
1168
+ ggml_set_name(Kcur, "Kcur");
1117
1169
 
1118
1170
  // store key and value to memory
1119
1171
  {
@@ -1134,6 +1186,7 @@ static bool llama_eval_internal(
1134
1186
  ggml_permute(ctx0,
1135
1187
  Qcur,
1136
1188
  0, 2, 1, 3);
1189
+ ggml_set_name(Q, "Q");
1137
1190
 
1138
1191
  struct ggml_tensor * K =
1139
1192
  ggml_permute(ctx0,
@@ -1141,21 +1194,28 @@ static bool llama_eval_internal(
1141
1194
  ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
1142
1195
  n_embd/n_head, n_head, n_past + N),
1143
1196
  0, 2, 1, 3);
1197
+ ggml_set_name(K, "K");
1144
1198
 
1145
1199
  // K * Q
1146
1200
  struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
1201
+ ggml_set_name(KQ, "KQ");
1147
1202
 
1148
1203
  // KQ_scaled = KQ / sqrt(n_embd/n_head)
1149
- struct ggml_tensor * KQ_scaled =
1150
- ggml_scale(ctx0,
1151
- KQ,
1152
- ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));
1204
+ struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head));
1205
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd/n_head)");
1206
+
1207
+ // KQ_scaled shape [n_past + N, N, n_head, 1]
1208
+ struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
1209
+ ggml_set_name(KQ_scaled, "KQ_scaled");
1153
1210
 
1154
1211
  // KQ_masked = mask_past(KQ_scaled)
1155
- struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
1212
+ struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
1213
+ ggml_set_name(KQ_masked, "KQ_masked");
1156
1214
 
1157
1215
  // KQ = soft_max(KQ_masked)
1158
- struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
1216
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
1217
+ ggml_set_name(KQ_soft_max, "KQ_soft_max");
1218
+
1159
1219
 
1160
1220
  // split cached V into n_head heads
1161
1221
  struct ggml_tensor * V =
@@ -1164,9 +1224,11 @@ static bool llama_eval_internal(
1164
1224
  n_ctx*ggml_element_size(kv_self.v),
1165
1225
  n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head,
1166
1226
  il*n_ctx*ggml_element_size(kv_self.v)*n_embd);
1227
+ ggml_set_name(V, "V");
1167
1228
 
1168
1229
  #if 1
1169
1230
  struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
1231
+ ggml_set_name(KQV, "KQV");
1170
1232
  #else
1171
1233
  // make V contiguous in memory to speed up the matmul, however we waste time on the copy
1172
1234
  // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
@@ -1177,11 +1239,13 @@ static bool llama_eval_internal(
1177
1239
 
1178
1240
  // KQV_merged = KQV.permute(0, 2, 1, 3)
1179
1241
  struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
1242
+ ggml_set_name(KQV_merged, "KQV_merged");
1180
1243
 
1181
1244
  // cur = KQV_merged.contiguous().view(n_embd, N)
1182
1245
  cur = ggml_cpy(ctx0,
1183
1246
  KQV_merged,
1184
1247
  ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
1248
+ ggml_set_name(cur, "KQV_merged_contiguous");
1185
1249
 
1186
1250
  // projection (no bias)
1187
1251
  cur = ggml_mul_mat(ctx0,
@@ -1253,7 +1317,7 @@ static bool llama_eval_internal(
1253
1317
  lctx.use_buf(ctx0, -1);
1254
1318
 
1255
1319
  // logits -> probs
1256
- //inpL = ggml_soft_max(ctx0, inpL);
1320
+ //inpL = ggml_soft_max_inplace(ctx0, inpL);
1257
1321
 
1258
1322
  // run the computation
1259
1323
  ggml_build_forward_expand(&gf, inpL);
@@ -1273,6 +1337,9 @@ static bool llama_eval_internal(
1273
1337
  //embd_w.resize(n_vocab*N);
1274
1338
  //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
1275
1339
 
1340
+ // update kv token count
1341
+ lctx.model.kv_self.n = n_past + N;
1342
+
1276
1343
  // extract logits
1277
1344
  {
1278
1345
  auto & logits_out = lctx.logits;
@@ -1288,7 +1355,7 @@ static bool llama_eval_internal(
1288
1355
  }
1289
1356
 
1290
1357
  // extract embeddings
1291
- if (lctx.embedding.size()) {
1358
+ if (!lctx.embedding.empty()) {
1292
1359
  auto & embedding_out = lctx.embedding;
1293
1360
 
1294
1361
  embedding_out.resize(n_embd);
@@ -1339,6 +1406,8 @@ struct llama_sp_symbol {
1339
1406
  size_t n;
1340
1407
  };
1341
1408
 
1409
+ static_assert(std::is_trivially_copyable<llama_sp_symbol>::value, "llama_sp_symbol is not trivially copyable");
1410
+
1342
1411
  struct llama_sp_bigram {
1343
1412
  struct comparator {
1344
1413
  bool operator()(llama_sp_bigram & l, llama_sp_bigram & r) {
@@ -1371,7 +1440,7 @@ struct llama_tokenizer {
1371
1440
  sym.prev = index - 1;
1372
1441
  sym.next = offs == text.size() ? -1 : index + 1;
1373
1442
  index++;
1374
- symbols_.emplace_back(std::move(sym));
1443
+ symbols_.emplace_back(sym);
1375
1444
  }
1376
1445
 
1377
1446
  // seed the work queue with all possible 2-character tokens.
@@ -1462,12 +1531,12 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
1462
1531
  llama_tokenizer tokenizer(vocab);
1463
1532
  std::vector<llama_vocab::id> output;
1464
1533
 
1465
- if (text.size() == 0) {
1534
+ if (text.empty()) {
1466
1535
  return output;
1467
1536
  }
1468
1537
 
1469
1538
  if (bos) {
1470
- output.push_back(1);
1539
+ output.push_back(llama_token_bos());
1471
1540
  }
1472
1541
 
1473
1542
  tokenizer.tokenize(text, output);
@@ -1478,109 +1547,402 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
1478
1547
  // sampling
1479
1548
  //
1480
1549
 
1481
- static void sample_top_k(std::vector<std::pair<float, llama_vocab::id>> & logits_id, int top_k) {
1482
- // find the top k tokens
1483
- std::partial_sort(
1484
- logits_id.begin(),
1485
- logits_id.begin() + top_k, logits_id.end(),
1486
- [](const std::pair<float, llama_vocab::id> & a, const std::pair<float, llama_vocab::id> & b) {
1487
- return a.first > b.first;
1488
- });
1550
+ void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates) {
1551
+ assert(candidates->size > 0);
1552
+
1553
+ const int64_t t_start_sample_us = ggml_time_us();
1554
+
1555
+ // Sort the logits in descending order
1556
+ if (!candidates->sorted) {
1557
+ std::sort(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
1558
+ return a.logit > b.logit;
1559
+ });
1560
+ candidates->sorted = true;
1561
+ }
1489
1562
 
1490
- logits_id.resize(top_k);
1563
+ float max_l = candidates->data[0].logit;
1564
+ float cum_sum = 0.0f;
1565
+ for (size_t i = 0; i < candidates->size; ++i) {
1566
+ float p = expf(candidates->data[i].logit - max_l);
1567
+ candidates->data[i].p = p;
1568
+ cum_sum += p;
1569
+ }
1570
+ for (size_t i = 0; i < candidates->size; ++i) {
1571
+ candidates->data[i].p /= cum_sum;
1572
+ }
1573
+
1574
+ if (ctx) {
1575
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
1576
+ }
1491
1577
  }
1492
1578
 
1493
- static llama_vocab::id llama_sample_top_p_top_k(
1494
- llama_context & lctx,
1495
- const std::vector<llama_vocab::id> & last_n_tokens,
1496
- int top_k,
1497
- float top_p,
1498
- float temp,
1499
- float repeat_penalty) {
1500
- auto & rng = lctx.rng;
1501
-
1502
- const int n_logits = lctx.model.hparams.n_vocab;
1503
-
1504
- const auto & logits = lctx.logits;
1505
- const auto * plogits = logits.data() + logits.size() - n_logits;
1506
-
1507
- if (temp <= 0) {
1508
- // select the token with the highest logit directly
1509
- float max_logit = plogits[0];
1510
- llama_vocab::id max_id = 0;
1511
-
1512
- for (int i = 1; i < n_logits; ++i) {
1513
- if (plogits[i] > max_logit) {
1514
- max_logit = plogits[i];
1515
- max_id = i;
1516
- }
1579
+ void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep) {
1580
+ const int64_t t_start_sample_us = ggml_time_us();
1581
+
1582
+ k = std::max(k, (int) min_keep);
1583
+ k = std::min(k, (int) candidates->size);
1584
+
1585
+ // Sort scores in descending order
1586
+ if (!candidates->sorted) {
1587
+ auto comp = [](const llama_token_data & a, const llama_token_data & b) {
1588
+ return a.logit > b.logit;
1589
+ };
1590
+ if (k == (int) candidates->size) {
1591
+ std::sort(candidates->data, candidates->data + candidates->size, comp);
1592
+ } else {
1593
+ std::partial_sort(candidates->data, candidates->data + k, candidates->data + candidates->size, comp);
1517
1594
  }
1518
- return max_id;
1595
+ candidates->sorted = true;
1519
1596
  }
1597
+ candidates->size = k;
1520
1598
 
1521
- std::vector<std::pair<float, llama_vocab::id>> logits_id;
1522
- logits_id.reserve(n_logits);
1599
+ if (ctx) {
1600
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
1601
+ }
1602
+ }
1523
1603
 
1524
- {
1525
- const float scale = 1.0f/temp;
1526
- for (int i = 0; i < n_logits; ++i) {
1527
- // repetition penalty from ctrl paper (https://arxiv.org/abs/1909.05858)
1528
- // credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main
1529
- if (std::find(last_n_tokens.begin(), last_n_tokens.end(), i) != last_n_tokens.end()) {
1530
- // if score < 0 then repetition penalty has to multiplied to reduce the previous token probability
1531
- if (plogits[i] < 0.0f) {
1532
- logits_id.push_back(std::make_pair(plogits[i]*scale*repeat_penalty, i));
1533
- } else {
1534
- logits_id.push_back(std::make_pair(plogits[i]*scale/repeat_penalty, i));
1535
- }
1536
- } else {
1537
- logits_id.push_back(std::make_pair(plogits[i]*scale, i));
1538
- }
1604
+ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
1605
+ if (p >= 1.0f) {
1606
+ return;
1607
+ }
1608
+
1609
+ const int64_t t_start_sample_us = ggml_time_us();
1610
+
1611
+ llama_sample_softmax(ctx, candidates);
1612
+
1613
+ // Compute the cumulative probabilities
1614
+ float cum_sum = 0.0f;
1615
+ size_t last_idx = candidates->size;
1616
+
1617
+ for (size_t i = 0; i < candidates->size; ++i) {
1618
+ cum_sum += candidates->data[i].p;
1619
+
1620
+ // Check if the running sum is greater than p or if we have kept at least min_keep tokens
1621
+ if (cum_sum > p && i >= min_keep) {
1622
+ last_idx = i;
1623
+ break;
1539
1624
  }
1540
1625
  }
1541
1626
 
1542
- sample_top_k(logits_id, top_k > 0 ? std::min(top_k, n_logits) : n_logits);
1627
+ // Resize the output vector to keep only the top-p tokens
1628
+ candidates->size = last_idx;
1543
1629
 
1544
- // compute probs for the top k tokens
1545
- std::vector<float> probs;
1546
- probs.reserve(logits_id.size());
1630
+ if (ctx) {
1631
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
1632
+ }
1633
+ }
1634
+
1635
+ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep) {
1636
+ if (z >= 1.0f || candidates->size <= 2) {
1637
+ return;
1638
+ }
1547
1639
 
1548
- float maxl = logits_id[0].first;
1549
- double sum = 0.0;
1550
- for (const auto & kv : logits_id) {
1551
- const float p = expf(kv.first - maxl);
1552
- probs.push_back(p);
1553
- sum += p;
1640
+ const int64_t t_start_sample_us = ggml_time_us();
1641
+
1642
+ llama_sample_softmax(nullptr, candidates);
1643
+
1644
+ // Compute the first and second derivatives
1645
+ std::vector<float> first_derivatives(candidates->size - 1);
1646
+ std::vector<float> second_derivatives(candidates->size - 2);
1647
+
1648
+ for (size_t i = 0; i < first_derivatives.size(); ++i) {
1649
+ first_derivatives[i] = candidates->data[i].p - candidates->data[i + 1].p;
1650
+ }
1651
+ for (size_t i = 0; i < second_derivatives.size(); ++i) {
1652
+ second_derivatives[i] = first_derivatives[i] - first_derivatives[i + 1];
1554
1653
  }
1555
1654
 
1556
- // normalize the probs
1557
- for (auto & p : probs) {
1558
- p /= sum;
1655
+ // Calculate absolute value of second derivatives
1656
+ for (size_t i = 0; i < second_derivatives.size(); ++i) {
1657
+ second_derivatives[i] = abs(second_derivatives[i]);
1559
1658
  }
1560
1659
 
1561
- if (top_p < 1.0) {
1562
- double cumsum = 0.0;
1563
- for (int i = 0; i < (int) probs.size(); i++) {
1564
- cumsum += probs[i];
1565
- if (cumsum >= top_p) {
1566
- probs.resize(i + 1);
1567
- logits_id.resize(i + 1);
1568
- break;
1569
- }
1660
+ // Normalize the second derivatives
1661
+ float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
1662
+ for (float & value : second_derivatives) {
1663
+ value /= second_derivatives_sum;
1664
+ }
1665
+
1666
+ float cum_sum = 0.0f;
1667
+ size_t last_idx = candidates->size;
1668
+ for (size_t i = 0; i < second_derivatives.size(); ++i) {
1669
+ cum_sum += second_derivatives[i];
1670
+
1671
+ // Check if the running sum is greater than z or if we have kept at least min_keep tokens
1672
+ if (cum_sum > z && i >= min_keep) {
1673
+ last_idx = i;
1674
+ break;
1570
1675
  }
1571
1676
  }
1572
1677
 
1573
- //printf("\n");
1574
- //for (int i = 0; i < (int) 10; i++) {
1575
- // printf("%d: '%s' %f\n", i, lctx.vocab.id_to_token.at(logits_id[i].second).tok.c_str(), probs[i]);
1576
- //}
1577
- //printf("\n\n");
1578
- //exit(0);
1678
+ // Resize the output vector to keep only the tokens above the tail location
1679
+ candidates->size = last_idx;
1680
+
1681
+ if (ctx) {
1682
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
1683
+ }
1684
+ }
1685
+
1686
+
1687
+ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
1688
+ // Reference implementation:
1689
+ // https://github.com/huggingface/transformers/compare/main...cimeister:typical-sampling:typical-pr
1690
+ if (p >= 1.0f) {
1691
+ return;
1692
+ }
1693
+
1694
+ const int64_t t_start_sample_us = ggml_time_us();
1695
+
1696
+ // Compute the softmax of logits and calculate entropy
1697
+ llama_sample_softmax(nullptr, candidates);
1698
+
1699
+ float entropy = 0.0f;
1700
+ for (size_t i = 0; i < candidates->size; ++i) {
1701
+ entropy += -candidates->data[i].p * logf(candidates->data[i].p);
1702
+ }
1703
+
1704
+ // Compute the absolute difference between negative log probability and entropy for each candidate
1705
+ std::vector<float> shifted_scores;
1706
+ for (size_t i = 0; i < candidates->size; ++i) {
1707
+ float shifted_score = fabsf(-logf(candidates->data[i].p) - entropy);
1708
+ shifted_scores.push_back(shifted_score);
1709
+ }
1710
+
1711
+ // Sort tokens based on the shifted_scores and their corresponding indices
1712
+ std::vector<size_t> indices(candidates->size);
1713
+ std::iota(indices.begin(), indices.end(), 0);
1714
+
1715
+ std::sort(indices.begin(), indices.end(), [&](size_t a, size_t b) {
1716
+ return shifted_scores[a] < shifted_scores[b];
1717
+ });
1718
+
1719
+ // Compute the cumulative probabilities
1720
+ float cum_sum = 0.0f;
1721
+ size_t last_idx = indices.size();
1722
+
1723
+ for (size_t i = 0; i < indices.size(); ++i) {
1724
+ size_t idx = indices[i];
1725
+ cum_sum += candidates->data[idx].p;
1726
+
1727
+ // Check if the running sum is greater than typical or if we have kept at least min_keep tokens
1728
+ if (cum_sum > p && i >= min_keep - 1) {
1729
+ last_idx = i + 1;
1730
+ break;
1731
+ }
1732
+ }
1733
+
1734
+ // Resize the output vector to keep only the locally typical tokens
1735
+ std::vector<llama_token_data> new_candidates;
1736
+ for (size_t i = 0; i < last_idx; ++i) {
1737
+ size_t idx = indices[i];
1738
+ new_candidates.push_back(candidates->data[idx]);
1739
+ }
1740
+
1741
+ // Replace the data in candidates with the new_candidates data
1742
+ std::copy(new_candidates.begin(), new_candidates.end(), candidates->data);
1743
+ candidates->size = new_candidates.size();
1744
+
1745
+ if (ctx) {
1746
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
1747
+ }
1748
+ }
1749
+
1750
+ void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
1751
+ const int64_t t_start_sample_us = ggml_time_us();
1752
+
1753
+ for (size_t i = 0; i < candidates_p->size; ++i) {
1754
+ candidates_p->data[i].logit /= temp;
1755
+ }
1756
+
1757
+ if (ctx) {
1758
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
1759
+ }
1760
+ }
1761
+
1762
+ void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty) {
1763
+ if (last_tokens_size == 0 || penalty == 1.0f) {
1764
+ return;
1765
+ }
1766
+
1767
+ const int64_t t_start_sample_us = ggml_time_us();
1768
+
1769
+ for (size_t i = 0; i < candidates->size; ++i) {
1770
+ const auto * token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id);
1771
+ if (token_iter == last_tokens + last_tokens_size) {
1772
+ continue;
1773
+ }
1774
+
1775
+ // The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong.
1776
+ // This is common fix for this problem, which is to multiply by the penalty instead of dividing.
1777
+ if (candidates->data[i].logit <= 0) {
1778
+ candidates->data[i].logit *= penalty;
1779
+ } else {
1780
+ candidates->data[i].logit /= penalty;
1781
+ }
1782
+ }
1783
+
1784
+ candidates->sorted = false;
1785
+
1786
+ if (ctx) {
1787
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
1788
+ }
1789
+ }
1790
+
1791
+ void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens_p, size_t last_tokens_size, float alpha_frequency, float alpha_presence) {
1792
+ if (last_tokens_size == 0 || (alpha_frequency == 0.0f && alpha_presence == 0.0f)) {
1793
+ return;
1794
+ }
1795
+
1796
+ const int64_t t_start_sample_us = ggml_time_us();
1797
+
1798
+ // Create a frequency map to count occurrences of each token in last_tokens
1799
+ std::unordered_map<llama_token, int> token_count;
1800
+ for (size_t i = 0; i < last_tokens_size; ++i) {
1801
+ token_count[last_tokens_p[i]]++;
1802
+ }
1803
+
1804
+ // Apply frequency and presence penalties to the candidates
1805
+ for (size_t i = 0; i < candidates->size; ++i) {
1806
+ auto token_iter = token_count.find(candidates->data[i].id);
1807
+ if (token_iter == token_count.end()) {
1808
+ continue;
1809
+ }
1810
+
1811
+ int count = token_iter->second;
1812
+ candidates->data[i].logit -= float(count) * alpha_frequency + float(count > 0) * alpha_presence;
1813
+ }
1814
+
1815
+ candidates->sorted = false;
1816
+
1817
+ if (ctx) {
1818
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
1819
+ }
1820
+ }
1821
+
1822
+
1823
+ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu) {
1824
+ assert(ctx);
1825
+ auto N = float(llama_n_vocab(ctx));
1826
+ int64_t t_start_sample_us;
1827
+ t_start_sample_us = ggml_time_us();
1828
+
1829
+ llama_sample_softmax(nullptr, candidates);
1830
+
1831
+ // Estimate s_hat using the most probable m tokens
1832
+ float s_hat = 0.0;
1833
+ float sum_ti_bi = 0.0;
1834
+ float sum_ti_sq = 0.0;
1835
+ for (size_t i = 0; i < size_t(m - 1) && i < candidates->size - 1; ++i) {
1836
+ float t_i = logf(float(i + 2) / float(i + 1));
1837
+ float b_i = logf(candidates->data[i].p / candidates->data[i + 1].p);
1838
+ sum_ti_bi += t_i * b_i;
1839
+ sum_ti_sq += t_i * t_i;
1840
+ }
1841
+ s_hat = sum_ti_bi / sum_ti_sq;
1842
+
1843
+ // Compute k from the estimated s_hat and target surprise value
1844
+ float epsilon_hat = s_hat - 1;
1845
+ float k = powf((epsilon_hat * powf(2, *mu)) / (1 - powf(N, -epsilon_hat)), 1 / s_hat);
1846
+
1847
+ // Sample the next word X using top-k sampling
1848
+ llama_sample_top_k(nullptr, candidates, int(k), 1);
1849
+ if (ctx) {
1850
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
1851
+ }
1852
+ llama_token X = llama_sample_token(ctx, candidates);
1853
+ t_start_sample_us = ggml_time_us();
1854
+
1855
+ // Compute error as the difference between observed surprise and target surprise value
1856
+ size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
1857
+ return candidate.id == X;
1858
+ }));
1859
+ float observed_surprise = -log2f(candidates->data[X_idx].p);
1860
+ float e = observed_surprise - tau;
1861
+
1862
+ // Update mu using the learning rate and error
1863
+ *mu = *mu - eta * e;
1864
+
1865
+ if (ctx) {
1866
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
1867
+ ctx->n_sample++;
1868
+ }
1869
+ return X;
1870
+ }
1871
+
1872
+ llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu) {
1873
+ assert(ctx);
1874
+ int64_t t_start_sample_us;
1875
+ t_start_sample_us = ggml_time_us();
1876
+
1877
+ llama_sample_softmax(ctx, candidates);
1878
+
1879
+ // Truncate the words with surprise values greater than mu
1880
+ candidates->size = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
1881
+ return -log2f(candidate.p) > *mu;
1882
+ }));
1883
+
1884
+ // Normalize the probabilities of the remaining words
1885
+ llama_sample_softmax(ctx, candidates);
1886
+
1887
+ // Sample the next word X from the remaining words
1888
+ if (ctx) {
1889
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
1890
+ }
1891
+ llama_token X = llama_sample_token(ctx, candidates);
1892
+ t_start_sample_us = ggml_time_us();
1893
+
1894
+ // Compute error as the difference between observed surprise and target surprise value
1895
+ size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
1896
+ return candidate.id == X;
1897
+ }));
1898
+ float observed_surprise = -log2f(candidates->data[X_idx].p);
1899
+ float e = observed_surprise - tau;
1900
+
1901
+ // Update mu using the learning rate and error
1902
+ *mu = *mu - eta * e;
1903
+
1904
+ if (ctx) {
1905
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
1906
+ }
1907
+ return X;
1908
+ }
1909
+
1910
+ llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates) {
1911
+ const int64_t t_start_sample_us = ggml_time_us();
1912
+
1913
+ // Find max element
1914
+ auto * max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
1915
+ return a.logit < b.logit;
1916
+ });
1917
+
1918
+ llama_token result = max_iter->id;
1919
+ if (ctx) {
1920
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
1921
+ ctx->n_sample++;
1922
+ }
1923
+ return result;
1924
+ }
1925
+
1926
+ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) {
1927
+ assert(ctx);
1928
+ const int64_t t_start_sample_us = ggml_time_us();
1929
+ llama_sample_softmax(nullptr, candidates);
1930
+
1931
+ std::vector<float> probs;
1932
+ probs.reserve(candidates->size);
1933
+ for (size_t i = 0; i < candidates->size; ++i) {
1934
+ probs.push_back(candidates->data[i].p);
1935
+ }
1579
1936
 
1580
1937
  std::discrete_distribution<> dist(probs.begin(), probs.end());
1938
+ auto & rng = ctx->rng;
1581
1939
  int idx = dist(rng);
1582
1940
 
1583
- return logits_id[idx].second;
1941
+ llama_token result = candidates->data[idx].id;
1942
+
1943
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
1944
+ ctx->n_sample++;
1945
+ return result;
1584
1946
  }
1585
1947
 
1586
1948
  //
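The hunk above replaces the old built-in llama_sample_top_p_top_k path with a set of composable sampling primitives that operate on a llama_token_data_array. A sketch of the typical chain, not part of the diff; it assumes a freshly evaluated llama_context * ctx and a std::vector<llama_token> last_tokens of recent history, and the penalty/top-k/top-p/temperature values are illustrative:

    const int n_vocab = llama_n_vocab(ctx);
    float * logits = llama_get_logits(ctx);

    // build the candidate list from the last set of logits
    std::vector<llama_token_data> candidates;
    candidates.reserve(n_vocab);
    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
        candidates.push_back(llama_token_data{token_id, logits[token_id], 0.0f});
    }
    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

    // apply penalties and truncation filters, then pick a token
    llama_sample_repetition_penalty(ctx, &candidates_p, last_tokens.data(), last_tokens.size(), 1.10f);
    llama_sample_top_k(ctx, &candidates_p, 40, 1);
    llama_sample_top_p(ctx, &candidates_p, 0.95f, 1);
    llama_sample_temperature(ctx, &candidates_p, 0.80f);

    const llama_token id = llama_sample_token(ctx, &candidates_p);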
@@ -1592,8 +1954,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
1592
1954
  switch (ftype) {
1593
1955
  case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
1594
1956
  case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
1595
- case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
1596
- case LLAMA_FTYPE_MOSTLY_Q4_3: quantized_type = GGML_TYPE_Q4_3; break;
1597
1957
  case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
1598
1958
  case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
1599
1959
  case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
@@ -1604,7 +1964,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
1604
1964
  nthread = std::thread::hardware_concurrency();
1605
1965
  }
1606
1966
 
1607
- std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false,
1967
+ std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false,
1608
1968
  /*vocab_only*/ false));
1609
1969
  llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
1610
1970
 
@@ -1658,7 +2018,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
1658
2018
  } else if (tensor.type == GGML_TYPE_F16) {
1659
2019
  f32_conv_buf.resize(nelements * sizeof(float));
1660
2020
  f32_data = (float *) f32_conv_buf.addr;
1661
- auto f16_data = (const ggml_fp16_t *) tensor.data;
2021
+ const auto * f16_data = (const ggml_fp16_t *) tensor.data;
1662
2022
  for (size_t i = 0; i < nelements; i++) {
1663
2023
  f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
1664
2024
  }
@@ -1689,21 +2049,31 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
1689
2049
  size_t first = counter; counter += chunk_size;
1690
2050
  if (first >= nelements) {
1691
2051
  if (!local_hist.empty()) {
1692
- for (int j=0; j<int(local_hist.size()); ++j) hist_cur[j] += local_hist[j];
2052
+ for (int j=0; j<int(local_hist.size()); ++j) {
2053
+ hist_cur[j] += local_hist[j];
2054
+ }
1693
2055
  new_size += local_size;
1694
2056
  }
1695
2057
  break;
1696
2058
  }
1697
2059
  lock.unlock();
1698
2060
  size_t last = std::min(nelements, first + chunk_size);
1699
- if (local_hist.empty()) local_hist.resize(hist_cur.size(), 0);
2061
+ if (local_hist.empty()) {
2062
+ local_hist.resize(hist_cur.size(), 0);
2063
+ }
1700
2064
  local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
1701
2065
  }
1702
2066
  };
1703
- if (int(workers.size()) < nthread_use - 1) workers.resize(nthread_use - 1);
1704
- for (int it = 0; it < nthread_use - 1; ++it) workers[it] = std::thread(compute);
2067
+ if ((int) workers.size() < nthread_use - 1) {
2068
+ workers.resize(nthread_use - 1);
2069
+ }
2070
+ for (int it = 0; it < nthread_use - 1; ++it) {
2071
+ workers[it] = std::thread(compute);
2072
+ }
1705
2073
  compute();
1706
- for (int it = 0; it < nthread_use - 1; ++it) workers[it].join();
2074
+ for (int it = 0; it < nthread_use - 1; ++it) {
2075
+ workers[it].join();
2076
+ }
1707
2077
  }
1708
2078
 
1709
2079
  printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
@@ -1749,7 +2119,7 @@ struct llama_context * llama_init_from_file(
1749
2119
 
1750
2120
  llama_context * ctx = new llama_context;
1751
2121
 
1752
- if (params.seed <= 0) {
2122
+ if (params.seed < 0) {
1753
2123
  params.seed = time(NULL);
1754
2124
  }
1755
2125
 
@@ -1775,7 +2145,7 @@ struct llama_context * llama_init_from_file(
1775
2145
 
1776
2146
  ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
1777
2147
 
1778
- if (!llama_model_load(path_model, *ctx, params.n_ctx, memory_type,
2148
+ if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_gpu_layers, memory_type,
1779
2149
  params.use_mmap, params.use_mlock, params.vocab_only,
1780
2150
  params.progress_callback, params.progress_callback_user_data)) {
1781
2151
  fprintf(stderr, "%s: failed to load model\n", __func__);
@@ -1901,7 +2271,8 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
1901
2271
  fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
1902
2272
  model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false));
1903
2273
 
1904
- size_t ctx_size, mmapped_size;
2274
+ size_t ctx_size;
2275
+ size_t mmapped_size;
1905
2276
  model_loader->calc_sizes(&ctx_size, &mmapped_size);
1906
2277
  base_buf.resize(ctx_size);
1907
2278
 
@@ -1940,8 +2311,12 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
1940
2311
  fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
1941
2312
  }
1942
2313
 
1943
- std::string name(length, 0);
1944
- fin.read(&name[0], length);
2314
+ std::string name;
2315
+ {
2316
+ char buf[1024];
2317
+ fin.read(buf, length);
2318
+ name = std::string(buf, length);
2319
+ }
1945
2320
 
1946
2321
  // check for lora suffix and get the type of tensor
1947
2322
  const std::string lora_suffix = ".lora";
@@ -1956,7 +2331,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
1956
2331
  base_name.erase(pos);
1957
2332
  // fprintf(stderr, "%s: %s => %s (lora type %s) ", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());
1958
2333
 
1959
- if (model_tensors.find(base_name.data()) == model_tensors.end()) {
2334
+ if (model_tensors.find(base_name) == model_tensors.end()) {
1960
2335
  fprintf(stderr, "%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
1961
2336
  return 1;
1962
2337
  }
@@ -2036,7 +2411,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
2036
2411
 
2037
2412
  if (scaling != 1.0f) {
2038
2413
  ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
2039
- BA = ggml_scale(lora_ctx, BA, scale_tensor);
2414
+ BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor);
2040
2415
  }
2041
2416
 
2042
2417
  ggml_tensor * r;
@@ -2058,8 +2433,9 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
2058
2433
  lora_tensors.clear();
2059
2434
 
2060
2435
  n_tensors++;
2061
- if (n_tensors % 4 == 0)
2436
+ if (n_tensors % 4 == 0) {
2062
2437
  fprintf(stderr, ".");
2438
+ }
2063
2439
  }
2064
2440
  }
2065
2441
 
@@ -2084,21 +2460,21 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
2084
2460
  }
2085
2461
  }
2086
2462
 
2087
- int llama_get_kv_cache_token_count(struct llama_context * ctx) {
2463
+ int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
2088
2464
  return ctx->model.kv_self.n;
2089
2465
  }
2090
2466
 
2091
- #define LLAMA_MAX_RNG_STATE 64*1024
2467
+ #define LLAMA_MAX_RNG_STATE (64*1024)
2092
2468
 
2093
2469
  void llama_set_rng_seed(struct llama_context * ctx, int seed) {
2094
- if (seed <= 0) {
2470
+ if (seed < 0) {
2095
2471
  seed = time(NULL);
2096
2472
  }
2097
2473
  ctx->rng.seed(seed);
2098
2474
  }
2099
2475
 
2100
- // Returns the size of the state
2101
- size_t llama_get_state_size(struct llama_context * ctx) {
2476
+ // Returns the *maximum* size of the state
2477
+ size_t llama_get_state_size(const struct llama_context * ctx) {
2102
2478
  // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
2103
2479
  // for reference, std::mt19937(1337) serializes to 6701 bytes.
2104
2480
  const size_t s_rng_size = sizeof(size_t);
@@ -2129,8 +2505,8 @@ size_t llama_get_state_size(struct llama_context * ctx) {
2129
2505
  }
2130
2506
 
2131
2507
  // Copies the state to the specified destination address
2132
- size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
2133
- uint8_t * out = dest;
2508
+ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
2509
+ uint8_t * out = dst;
2134
2510
 
2135
2511
  // copy rng
2136
2512
  {
@@ -2176,36 +2552,70 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
2176
2552
 
2177
2553
  // copy kv cache
2178
2554
  {
2179
- const size_t kv_size = ctx->model.kv_self.buf.size;
2555
+ const auto & kv_self = ctx->model.kv_self;
2556
+ const auto & hparams = ctx->model.hparams;
2557
+ const int n_layer = hparams.n_layer;
2558
+ const int n_embd = hparams.n_embd;
2559
+ const int n_ctx = hparams.n_ctx;
2560
+
2561
+ const size_t kv_size = kv_self.buf.size;
2180
2562
  const int kv_ntok = llama_get_kv_cache_token_count(ctx);
2181
2563
 
2182
2564
  memcpy(out, &kv_size, sizeof(kv_size)); out += sizeof(kv_size);
2183
2565
  memcpy(out, &kv_ntok, sizeof(kv_ntok)); out += sizeof(kv_ntok);
2184
2566
 
2185
2567
  if (kv_size) {
2186
- memcpy(out, ctx->model.kv_self.buf.addr, kv_size); out += kv_size;
2568
+ const size_t elt_size = ggml_element_size(kv_self.k);
2569
+
2570
+ char buffer[4096];
2571
+
2572
+ ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
2573
+ ggml_cgraph gf{};
2574
+ gf.n_threads = 1;
2575
+
2576
+ ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
2577
+ kout3d->data = out;
2578
+ out += ggml_nbytes(kout3d);
2579
+
2580
+ ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
2581
+ vout3d->data = out;
2582
+ out += ggml_nbytes(vout3d);
2583
+
2584
+ ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
2585
+ n_embd, kv_ntok, n_layer,
2586
+ elt_size*n_embd, elt_size*n_embd*n_ctx, 0);
2587
+
2588
+ ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
2589
+ kv_ntok, n_embd, n_layer,
2590
+ elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
2591
+
2592
+ ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
2593
+ ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
2594
+ ggml_graph_compute(cpy_ctx, &gf);
2595
+
2596
+ ggml_free(cpy_ctx);
2187
2597
  }
2188
2598
  }
2189
2599
 
2190
- const size_t written = out - dest;
2191
- const size_t expected = llama_get_state_size(ctx);
2600
+ const size_t written = out - dst;
2601
+ const size_t max_size = llama_get_state_size(ctx);
2192
2602
 
2193
- LLAMA_ASSERT(written == expected);
2603
+ LLAMA_ASSERT(written <= max_size);
2194
2604
 
2195
2605
  return written;
2196
2606
  }
2197
2607
 
2198
2608
  // Sets the state reading from the specified source address
2199
2609
  size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
2200
- const uint8_t * in = src;
2610
+ const uint8_t * inp = src;
2201
2611
 
2202
2612
  // set rng
2203
2613
  {
2204
2614
  size_t rng_size;
2205
2615
  char rng_buf[LLAMA_MAX_RNG_STATE];
2206
2616
 
2207
- memcpy(&rng_size, in, sizeof(rng_size)); in += sizeof(rng_size);
2208
- memcpy(&rng_buf[0], in, LLAMA_MAX_RNG_STATE); in += LLAMA_MAX_RNG_STATE;
2617
+ memcpy(&rng_size, inp, sizeof(rng_size)); inp += sizeof(rng_size);
2618
+ memcpy(&rng_buf[0], inp, LLAMA_MAX_RNG_STATE); inp += LLAMA_MAX_RNG_STATE;
2209
2619
 
2210
2620
  std::stringstream rng_ss;
2211
2621
  rng_ss.str(std::string(&rng_buf[0], rng_size));
@@ -2219,65 +2629,171 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
2219
2629
  size_t logits_cap;
2220
2630
  size_t logits_size;
2221
2631
 
2222
- memcpy(&logits_cap, in, sizeof(logits_cap)); in += sizeof(logits_cap);
2223
- memcpy(&logits_size, in, sizeof(logits_size)); in += sizeof(logits_size);
2632
+ memcpy(&logits_cap, inp, sizeof(logits_cap)); inp += sizeof(logits_cap);
2633
+ memcpy(&logits_size, inp, sizeof(logits_size)); inp += sizeof(logits_size);
2224
2634
 
2225
2635
  LLAMA_ASSERT(ctx->logits.capacity() == logits_cap);
2226
2636
 
2227
2637
  if (logits_size) {
2228
2638
  ctx->logits.resize(logits_size);
2229
- memcpy(ctx->logits.data(), in, logits_size * sizeof(float));
2639
+ memcpy(ctx->logits.data(), inp, logits_size * sizeof(float));
2230
2640
  }
2231
2641
 
2232
- in += logits_cap * sizeof(float);
2642
+ inp += logits_cap * sizeof(float);
2233
2643
  }
2234
2644
 
2235
2645
  // set embeddings
2236
2646
  {
2237
2647
  size_t embedding_size;
2238
2648
 
2239
- memcpy(&embedding_size, in, sizeof(embedding_size)); in += sizeof(embedding_size);
2649
+ memcpy(&embedding_size, inp, sizeof(embedding_size)); inp += sizeof(embedding_size);
2240
2650
 
2241
2651
  LLAMA_ASSERT(ctx->embedding.capacity() == embedding_size);
2242
2652
 
2243
2653
  if (embedding_size) {
2244
- memcpy(ctx->embedding.data(), in, embedding_size * sizeof(float));
2245
- in += embedding_size * sizeof(float);
2654
+ memcpy(ctx->embedding.data(), inp, embedding_size * sizeof(float));
2655
+ inp += embedding_size * sizeof(float);
2246
2656
  }
2247
2657
  }
2248
2658
 
2249
2659
  // set kv cache
2250
2660
  {
2661
+ const auto & kv_self = ctx->model.kv_self;
2662
+ const auto & hparams = ctx->model.hparams;
2663
+ const int n_layer = hparams.n_layer;
2664
+ const int n_embd = hparams.n_embd;
2665
+ const int n_ctx = hparams.n_ctx;
2666
+
2251
2667
  size_t kv_size;
2252
2668
  int kv_ntok;
2253
2669
 
2254
- memcpy(&kv_size, in, sizeof(kv_size)); in += sizeof(kv_size);
2255
- memcpy(&kv_ntok, in, sizeof(kv_ntok)); in += sizeof(kv_ntok);
2670
+ memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
2671
+ memcpy(&kv_ntok, inp, sizeof(kv_ntok)); inp += sizeof(kv_ntok);
2256
2672
 
2257
2673
  if (kv_size) {
2258
- LLAMA_ASSERT(ctx->model.kv_self.buf.size == kv_size);
2674
+ LLAMA_ASSERT(kv_self.buf.size == kv_size);
2675
+
2676
+ const size_t elt_size = ggml_element_size(kv_self.k);
2677
+
2678
+ char buffer[4096];
2259
2679
 
2260
- void * k_data = ctx->model.kv_self.k->data; // remember data pointers
2261
- void * v_data = ctx->model.kv_self.v->data; // because their value is stored in buf and overwritten by memcpy
2680
+ ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
2681
+ ggml_cgraph gf{};
2682
+ gf.n_threads = 1;
2262
2683
 
2263
- memcpy(ctx->model.kv_self.buf.addr, in, kv_size); in += kv_size;
2684
+ ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
2685
+ kin3d->data = (void *) inp;
2686
+ inp += ggml_nbytes(kin3d);
2264
2687
 
2265
- ctx->model.kv_self.k->data = k_data; // restore correct data pointers
2266
- ctx->model.kv_self.v->data = v_data;
2688
+ ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
2689
+ vin3d->data = (void *) inp;
2690
+ inp += ggml_nbytes(vin3d);
2267
2691
 
2692
+ ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
2693
+ n_embd, kv_ntok, n_layer,
2694
+ elt_size*n_embd, elt_size*n_embd*n_ctx, 0);
2695
+
2696
+ ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
2697
+ kv_ntok, n_embd, n_layer,
2698
+ elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
2699
+
2700
+ ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
2701
+ ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
2702
+ ggml_graph_compute(cpy_ctx, &gf);
2703
+
2704
+ ggml_free(cpy_ctx);
2268
2705
  }
2269
2706
 
2270
2707
  ctx->model.kv_self.n = kv_ntok;
2271
2708
  }
2272
2709
 
2273
- const size_t nread = in - src;
2274
- const size_t expected = llama_get_state_size(ctx);
2710
+ const size_t nread = inp - src;
2711
+ const size_t max_size = llama_get_state_size(ctx);
2275
2712
 
2276
- LLAMA_ASSERT(nread == expected);
2713
+ LLAMA_ASSERT(nread <= max_size);
2277
2714
 
2278
2715
  return nread;
2279
2716
  }
2280
2717
 
2718
+ bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
2719
+ llama_file file(path_session, "rb");
2720
+
2721
+ // sanity checks
2722
+ {
2723
+ const uint32_t magic = file.read_u32();
2724
+ const uint32_t version = file.read_u32();
2725
+
2726
+ if (magic != LLAMA_SESSION_MAGIC || version != LLAMA_SESSION_VERSION) {
2727
+ fprintf(stderr, "%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
2728
+ return false;
2729
+ }
2730
+
2731
+ llama_hparams session_hparams;
2732
+ file.read_raw(&session_hparams, sizeof(llama_hparams));
2733
+
2734
+ if (session_hparams != ctx->model.hparams) {
2735
+ fprintf(stderr, "%s : model hparams didn't match from session file!\n", __func__);
2736
+ return false;
2737
+ }
2738
+ }
2739
+
2740
+ // load the prompt
2741
+ {
2742
+ const uint32_t n_token_count = file.read_u32();
2743
+
2744
+ if (n_token_count > n_token_capacity) {
2745
+ fprintf(stderr, "%s : token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
2746
+ return false;
2747
+ }
2748
+
2749
+ file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
2750
+ *n_token_count_out = n_token_count;
2751
+ }
2752
+
2753
+ // restore the context state
2754
+ {
2755
+ const size_t n_state_size_cur = file.size - file.tell();
2756
+ const size_t n_state_size_max = llama_get_state_size(ctx);
2757
+
2758
+ if (n_state_size_cur > n_state_size_max) {
2759
+ fprintf(stderr, "%s : the state size in session file is too big! max %zu, got %zu\n", __func__, n_state_size_max, n_state_size_cur);
2760
+ return false;
2761
+ }
2762
+
2763
+ std::vector<uint8_t> state_data(n_state_size_max);
2764
+ file.read_raw(state_data.data(), n_state_size_cur);
2765
+
2766
+ llama_set_state_data(ctx, state_data.data());
2767
+ }
2768
+
2769
+ return true;
2770
+ }
2771
+
2772
+ bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
2773
+ llama_file file(path_session, "wb");
2774
+
2775
+ file.write_u32(LLAMA_SESSION_MAGIC);
2776
+ file.write_u32(LLAMA_SESSION_VERSION);
2777
+
2778
+ file.write_raw(&ctx->model.hparams, sizeof(llama_hparams));
2779
+
2780
+ // save the prompt
2781
+ file.write_u32((uint32_t) n_token_count);
2782
+ file.write_raw(tokens, sizeof(llama_token) * n_token_count);
2783
+
2784
+ // save the context state
2785
+ {
2786
+ const size_t n_state_size_max = llama_get_state_size(ctx);
2787
+
2788
+ std::vector<uint8_t> state_data(n_state_size_max);
2789
+ const size_t n_state_size_cur = llama_copy_state_data(ctx, state_data.data());
2790
+
2791
+ file.write_raw(state_data.data(), n_state_size_cur);
2792
+ }
2793
+
2794
+ return true;
2795
+ }
2796
+
2281
2797
  int llama_eval(
2282
2798
  struct llama_context * ctx,
2283
2799
  const llama_token * tokens,
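The hunk above introduces llama_load_session_file and llama_save_session_file, which persist the evaluated prompt tokens together with the serialized context state (RNG, logits, embeddings, and KV cache). A usage sketch, not part of the diff; the file name is illustrative and tokens is assumed to be the std::vector<llama_token> already fed to llama_eval:

    // save the current prompt and context state
    llama_save_session_file(ctx, "prompt.session", tokens.data(), tokens.size());

    // later: restore into a context created from the same model
    std::vector<llama_token> session_tokens(llama_n_ctx(ctx));
    size_t n_restored = 0;
    if (llama_load_session_file(ctx, "prompt.session", session_tokens.data(),
                                session_tokens.size(), &n_restored)) {
        session_tokens.resize(n_restored);
        // continue generation with n_past = n_restored
    }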
@@ -2288,11 +2804,14 @@ int llama_eval(
2288
2804
  fprintf(stderr, "%s: failed to eval\n", __func__);
2289
2805
  return 1;
2290
2806
  }
2807
+
2291
2808
  // get a more accurate load time, upon first eval
2809
+ // TODO: fix this
2292
2810
  if (!ctx->has_evaluated_once) {
2293
2811
  ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
2294
2812
  ctx->has_evaluated_once = true;
2295
2813
  }
2814
+
2296
2815
  return 0;
2297
2816
  }
2298
2817
 
@@ -2316,15 +2835,15 @@ int llama_tokenize(
2316
2835
  return res.size();
2317
2836
  }
2318
2837
 
2319
- int llama_n_vocab(struct llama_context * ctx) {
2838
+ int llama_n_vocab(const struct llama_context * ctx) {
2320
2839
  return ctx->vocab.id_to_token.size();
2321
2840
  }
2322
2841
 
2323
- int llama_n_ctx(struct llama_context * ctx) {
2842
+ int llama_n_ctx(const struct llama_context * ctx) {
2324
2843
  return ctx->model.hparams.n_ctx;
2325
2844
  }
2326
2845
 
2327
- int llama_n_embd(struct llama_context * ctx) {
2846
+ int llama_n_embd(const struct llama_context * ctx) {
2328
2847
  return ctx->model.hparams.n_embd;
2329
2848
  }
2330
2849
 
@@ -2336,7 +2855,7 @@ float * llama_get_embeddings(struct llama_context * ctx) {
2336
2855
  return ctx->embedding.data();
2337
2856
  }
2338
2857
 
2339
- const char * llama_token_to_str(struct llama_context * ctx, llama_token token) {
2858
+ const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) {
2340
2859
  if (token >= llama_n_vocab(ctx)) {
2341
2860
  return nullptr;
2342
2861
  }
@@ -2352,33 +2871,8 @@ llama_token llama_token_eos() {
2352
2871
  return 2;
2353
2872
  }
2354
2873
 
2355
- llama_token llama_sample_top_p_top_k(
2356
- llama_context * ctx,
2357
- const llama_token * last_n_tokens_data,
2358
- int last_n_tokens_size,
2359
- int top_k,
2360
- float top_p,
2361
- float temp,
2362
- float repeat_penalty) {
2363
- const int64_t t_start_sample_us = ggml_time_us();
2364
-
2365
- llama_token result = 0;
2366
-
2367
- // TODO: avoid this ...
2368
- const auto last_n_tokens = std::vector<llama_token>(last_n_tokens_data, last_n_tokens_data + last_n_tokens_size);
2369
-
2370
- result = llama_sample_top_p_top_k(
2371
- *ctx,
2372
- last_n_tokens,
2373
- top_k,
2374
- top_p,
2375
- temp,
2376
- repeat_penalty);
2377
-
2378
- ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
2379
- ctx->n_sample++;
2380
-
2381
- return result;
2874
+ llama_token llama_token_nl() {
2875
+ return 13;
2382
2876
  }
2383
2877
 
2384
2878
 
@@ -2391,9 +2885,9 @@ void llama_print_timings(struct llama_context * ctx) {
2391
2885
 
2392
2886
  fprintf(stderr, "\n");
2393
2887
  fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
2394
- fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample);
2888
+ fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample);
2395
2889
  fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval);
2396
- fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval);
2890
+ fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval);
2397
2891
  fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0);
2398
2892
  }
2399
2893
 
@@ -2430,4 +2924,3 @@ const char * llama_print_system_info(void) {
2430
2924
  std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
2431
2925
  return ctx->model.tensors_by_name;
2432
2926
  }
2433
-