llama_cpp 0.0.7 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,10 +5,13 @@
  #include <cstdio>
  #endif

- #include "llama_util.h"
+ #include "llama-util.h"
  #include "llama.h"

  #include "ggml.h"
+ #ifdef GGML_USE_CUBLAS
+ #include "ggml-cuda.h"
+ #endif

  #include <array>
  #include <ctime>
@@ -28,11 +31,11 @@
  #include <atomic>
  #include <mutex>
  #include <sstream>
+ #include <numeric>

  #define LLAMA_USE_SCRATCH
  #define LLAMA_MAX_SCRATCH_BUFFERS 16

-
  // available llama models
  enum e_model {
  MODEL_UNKNOWN,
@@ -50,49 +53,49 @@ static const size_t MB = 1024*1024;

  static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
  {
- static std::map<e_model, size_t> _MEM_REQ_SCRATCH0 = {
+ static std::map<e_model, size_t> k_sizes = {
  { MODEL_7B, 512ull * MB },
  { MODEL_13B, 512ull * MB },
  { MODEL_30B, 512ull * MB },
  { MODEL_65B, 1024ull * MB },
  };
- return _MEM_REQ_SCRATCH0;
+ return k_sizes;
  }

  static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
  {
- static std::map<e_model, size_t> _MEM_REQ_SCRATCH1 = {
+ static std::map<e_model, size_t> k_sizes = {
  { MODEL_7B, 512ull * MB },
  { MODEL_13B, 512ull * MB },
  { MODEL_30B, 512ull * MB },
  { MODEL_65B, 1024ull * MB },
  };
- return _MEM_REQ_SCRATCH1;
+ return k_sizes;
  }

  // 2*n_embd*n_ctx*n_layer*sizeof(float16)
  static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
  {
- static std::map<e_model, size_t> _MEM_REQ_KV_SELF = {
+ static std::map<e_model, size_t> k_sizes = {
  { MODEL_7B, 1026ull * MB },
  { MODEL_13B, 1608ull * MB },
  { MODEL_30B, 3124ull * MB },
  { MODEL_65B, 5120ull * MB },
  };
- return _MEM_REQ_KV_SELF;
+ return k_sizes;
  }

  // this is mostly needed for temporary mul_mat buffers to dequantize the data
  // not actually needed if BLAS is disabled
  static const std::map<e_model, size_t> & MEM_REQ_EVAL()
  {
- static std::map<e_model, size_t> _MEM_REQ_EVAL = {
+ static std::map<e_model, size_t> k_sizes = {
  { MODEL_7B, 768ull * MB },
  { MODEL_13B, 1024ull * MB },
  { MODEL_30B, 1280ull * MB },
  { MODEL_65B, 1536ull * MB },
  };
- return _MEM_REQ_EVAL;
+ return k_sizes;
  }

  // default hparams (LLaMA 7B)
@@ -136,7 +139,7 @@ struct llama_kv_cache {

  struct ggml_context * ctx = NULL;

- llama_buffer buf;
+ llama_ctx_buffer buf;

  int n; // number of tokens currently in the cache

@@ -167,7 +170,7 @@ struct llama_model {
  struct llama_kv_cache kv_self;

  // the model memory buffer
- llama_buffer buf;
+ llama_ctx_buffer buf;

  // model memory mapped file
  std::unique_ptr<llama_mmap> mapping;
@@ -228,8 +231,8 @@ struct llama_context {

  // memory buffers used to evaluate the model
  // TODO: move in llama_state
- llama_buffer buf_compute;
- llama_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
+ llama_ctx_buffer buf_compute;
+ llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];

  int buf_last = 0;
  size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
@@ -402,6 +405,7 @@ enum llama_file_version {
  LLAMA_FILE_VERSION_GGML,
  LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab
  LLAMA_FILE_VERSION_GGJT_V1, // added padding
+ LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format
  };

  struct llama_file_loader {
@@ -432,6 +436,8 @@ struct llama_file_loader {
  file_version = LLAMA_FILE_VERSION_GGMF_V1;
  } else if (magic == 'ggjt' && version == 1) {
  file_version = LLAMA_FILE_VERSION_GGJT_V1;
+ } else if (magic == 'ggjt' && version == 2) {
+ file_version = LLAMA_FILE_VERSION_GGJT_V2;
  } else {
  throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
  magic, version);
@@ -482,8 +488,6 @@ struct llama_file_loader {
  case GGML_TYPE_F16:
  case GGML_TYPE_Q4_0:
  case GGML_TYPE_Q4_1:
- case GGML_TYPE_Q4_2:
- case GGML_TYPE_Q4_3:
  case GGML_TYPE_Q5_0:
  case GGML_TYPE_Q5_1:
  case GGML_TYPE_Q8_0:
@@ -528,8 +532,8 @@ struct llama_file_saver {
  write_vocab();
  }
  void write_magic() {
- file.write_u32('ggjt'); // magic
- file.write_u32(1); // version
+ file.write_u32(LLAMA_FILE_MAGIC); // magic
+ file.write_u32(LLAMA_FILE_VERSION); // version
  }
  void write_hparams(enum llama_ftype new_ftype) {
  const llama_hparams & hparams = any_file_loader->hparams;
@@ -559,8 +563,6 @@ struct llama_file_saver {
  case GGML_TYPE_F16:
  case GGML_TYPE_Q4_0:
  case GGML_TYPE_Q4_1:
- case GGML_TYPE_Q4_2:
- case GGML_TYPE_Q4_3:
  case GGML_TYPE_Q5_0:
  case GGML_TYPE_Q5_1:
  case GGML_TYPE_Q8_0:
@@ -587,12 +589,12 @@ struct llama_model_loader {
  std::unique_ptr<llama_mmap> mapping;

  llama_model_loader(const std::string & fname_base, bool use_mmap, bool vocab_only) {
- auto first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
+ auto * first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
  file_loaders.emplace_back(first_file);
  uint32_t n_parts = vocab_only ? 1 : guess_n_parts();
  for (uint32_t i = 1; i < n_parts; i++) {
  std::string fname = fname_base + "." + std::to_string(i);
- auto ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
+ auto * ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
  file_loaders.emplace_back(ith_file);
  if (ith_file->hparams != first_file->hparams) {
  throw format("llama.cpp: hparams inconsistent between files");
@@ -639,7 +641,7 @@ struct llama_model_loader {
  }
  }

- struct ggml_tensor * get_tensor(const std::string & name, std::vector<uint32_t> ne) {
+ struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne) {
  auto it = tensors_map.name_to_idx.find(name);
  if (it == tensors_map.name_to_idx.end()) {
  throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
@@ -661,13 +663,14 @@ struct llama_model_loader {
  LLAMA_ASSERT(lt.ne.size() == 1);
  tensor = ggml_new_tensor_1d(ggml_ctx, lt.type, lt.ne.at(0));
  }
+ ggml_set_name(tensor, lt.name.c_str());
  LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
  lt.ggml_tensor = tensor;
  num_ggml_tensors_created++;
  return tensor;
  }

- void done_getting_tensors() {
+ void done_getting_tensors() const {
  if (num_ggml_tensors_created != tensors_map.tensors.size()) {
  throw std::string("llama.cpp: file contained more tensors than expected");
  }
@@ -729,8 +732,7 @@ struct llama_model_loader {
  LLAMA_ASSERT(offset == lt.size);
  } else if (lt.split_type == SPLIT_BY_COLUMNS) {
  // Let's load the data into temporary buffers to ensure the OS performs large loads.
- std::vector<llama_buffer> tmp_bufs;
- tmp_bufs.resize(lt.shards.size());
+ std::vector<llama_buffer> tmp_bufs(lt.shards.size());
  for (size_t i = 0; i < lt.shards.size(); i++) {
  llama_load_tensor_shard & shard = lt.shards.at(i);
  llama_file & file = file_loaders.at(shard.file_idx)->file;
@@ -782,7 +784,7 @@ static bool kv_cache_init(
  const int n_embd = hparams.n_embd;
  const int n_layer = hparams.n_layer;

- const int64_t n_mem = (int64_t)n_layer*n_ctx;
+ const int64_t n_mem = n_layer*n_ctx;
  const int64_t n_elements = n_embd*n_mem;

  cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
@@ -801,6 +803,8 @@ static bool kv_cache_init(

  cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
  cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
+ ggml_set_name(cache.k, "cache_k");
+ ggml_set_name(cache.v, "cache_v");

  return true;
  }
@@ -808,9 +812,9 @@ static bool kv_cache_init(
  struct llama_context_params llama_context_default_params() {
  struct llama_context_params result = {
  /*.n_ctx =*/ 512,
- /*.n_parts =*/ -1,
- /*.seed =*/ 0,
- /*.f16_kv =*/ false,
+ /*.gpu_layers =*/ 0,
+ /*.seed =*/ -1,
+ /*.f16_kv =*/ true,
  /*.logits_all =*/ false,
  /*.vocab_only =*/ false,
  /*.use_mmap =*/ true,
@@ -839,9 +843,11 @@ static const char *llama_file_version_name(llama_file_version version) {
  switch (version) {
  case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)";
  case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
- case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (latest)";
- default: LLAMA_ASSERT(false);
+ case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1405)";
+ case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (latest)";
  }
+
+ return "unknown";
  }

  static const char *llama_ftype_name(enum llama_ftype ftype) {
@@ -852,8 +858,6 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
  case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
  case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
  return "mostly Q4_1, some F16";
- case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2";
- case LLAMA_FTYPE_MOSTLY_Q4_3: return "mostly Q4_3";
  case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
  case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
  case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
@@ -875,6 +879,7 @@ static void llama_model_load_internal(
  const std::string & fname,
  llama_context & lctx,
  int n_ctx,
+ int n_gpu_layers,
  ggml_type memory_type,
  bool use_mmap,
  bool use_mlock,
@@ -919,15 +924,24 @@ static void llama_model_load_internal(
  fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
  }

+ if (file_version != LLAMA_FILE_VERSION_GGJT_V2) {
+ if (hparams.ftype != LLAMA_FTYPE_ALL_F32 &&
+ hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 &&
+ hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) {
+ throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1305)");
+ }
+ }
+
  if (vocab_only) {
  return;
  }

  auto & ctx = model.ctx;

- size_t ctx_size, mmapped_size;
+ size_t ctx_size;
+ size_t mmapped_size;
  ml->calc_sizes(&ctx_size, &mmapped_size);
- fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0);
+ fprintf(stderr, "%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/1024.0/1024.0);

  // print memory requirements
  {
@@ -971,8 +985,6 @@ static void llama_model_load_internal(

  // prepare memory for the weights
  {
- const auto & hparams = model.hparams;
-
  const uint32_t n_embd = hparams.n_embd;
  const uint32_t n_layer = hparams.n_layer;
  const uint32_t n_vocab = hparams.n_vocab;
@@ -1014,6 +1026,35 @@ static void llama_model_load_internal(
  ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);

  model.mapping = std::move(ml->mapping);
+ #ifdef GGML_USE_CUBLAS
+ {
+ const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
+
+ fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
+
+ size_t vram_total = 0;
+
+ for (int i = 0; i < n_gpu; ++i) {
+ const auto & layer = model.layers[i];
+
+ ggml_cuda_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
+ ggml_cuda_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
+ ggml_cuda_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
+ ggml_cuda_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
+ ggml_cuda_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
+ ggml_cuda_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
+ ggml_cuda_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
+ }
+ if (n_gpu_layers > (int) hparams.n_layer) {
+ fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
+ ggml_cuda_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
+ }
+
+ fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+ }
+ #else
+ (void) n_gpu_layers;
+ #endif

  // loading time will be recalculate after the first eval, so
  // we take page faults deferred by mmap() into consideration
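The hunk above adds cuBLAS layer offloading controlled by the new n_gpu_layers context parameter. A minimal caller-side sketch of how this would be used, assuming the public llama_init_from_file/llama_free entry points from llama.h; the model path is a placeholder:

#include "llama.h"
#include <cstdio>

int main() {
    llama_context_params params = llama_context_default_params();
    params.n_gpu_layers = 20;  // layers offloaded via ggml-cuda when built with GGML_USE_CUBLAS
    params.seed         = -1;  // a negative seed now means "seed from time(NULL)"

    // placeholder path to a GGJT v2 model file
    llama_context * ctx = llama_init_from_file("models/7B/ggml-model-q4_0.bin", params);
    if (ctx == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    llama_free(ctx);
    return 0;
}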
@@ -1024,6 +1065,7 @@ static bool llama_model_load(
  const std::string & fname,
  llama_context & lctx,
  int n_ctx,
+ int n_gpu_layers,
  ggml_type memory_type,
  bool use_mmap,
  bool use_mlock,
@@ -1031,7 +1073,7 @@ static bool llama_model_load(
  llama_progress_callback progress_callback,
  void *progress_callback_user_data) {
  try {
- llama_model_load_internal(fname, lctx, n_ctx, memory_type, use_mmap, use_mlock,
+ llama_model_load_internal(fname, lctx, n_ctx, n_gpu_layers, memory_type, use_mmap, use_mlock,
  vocab_only, progress_callback, progress_callback_user_data);
  return true;
  } catch (const std::string & err) {
@@ -1053,6 +1095,13 @@ static bool llama_eval_internal(
  const int n_tokens,
  const int n_past,
  const int n_threads) {
+
+ // enforce that the first token is BOS
+ if (n_past == 0 && tokens[0] != llama_token_bos()) {
+ fprintf(stderr, "%s: first token must be BOS\n", __func__);
+ return false;
+ }
+
  const int64_t t_start_us = ggml_time_us();

  const int N = n_tokens;
@@ -1060,7 +1109,7 @@ static bool llama_eval_internal(
1060
1109
  const auto & model = lctx.model;
1061
1110
  const auto & hparams = model.hparams;
1062
1111
 
1063
- auto & kv_self = model.kv_self;
1112
+ const auto & kv_self = model.kv_self;
1064
1113
 
1065
1114
  LLAMA_ASSERT(!!kv_self.ctx);
1066
1115
 
@@ -1088,6 +1137,7 @@ static bool llama_eval_internal(
1088
1137
  gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
1089
1138
 
1090
1139
  struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
1140
+ ggml_set_name(embd, "embd");
1091
1141
  memcpy(embd->data, tokens, N*ggml_element_size(embd));
1092
1142
 
1093
1143
  struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
@@ -1112,8 +1162,10 @@ static bool llama_eval_internal(
1112
1162
  // self-attention
1113
1163
  {
1114
1164
  // compute Q and K and RoPE them
1115
- struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
1116
- struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
1165
+ struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
1166
+ struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
1167
+ ggml_set_name(Qcur, "Qcur");
1168
+ ggml_set_name(Kcur, "Kcur");
1117
1169
 
1118
1170
  // store key and value to memory
1119
1171
  {
@@ -1134,6 +1186,7 @@ static bool llama_eval_internal(
1134
1186
  ggml_permute(ctx0,
1135
1187
  Qcur,
1136
1188
  0, 2, 1, 3);
1189
+ ggml_set_name(Q, "Q");
1137
1190
 
1138
1191
  struct ggml_tensor * K =
1139
1192
  ggml_permute(ctx0,
@@ -1141,21 +1194,28 @@ static bool llama_eval_internal(
1141
1194
  ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
1142
1195
  n_embd/n_head, n_head, n_past + N),
1143
1196
  0, 2, 1, 3);
1197
+ ggml_set_name(K, "K");
1144
1198
 
1145
1199
  // K * Q
1146
1200
  struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
1201
+ ggml_set_name(KQ, "KQ");
1147
1202
 
1148
1203
  // KQ_scaled = KQ / sqrt(n_embd/n_head)
1149
- struct ggml_tensor * KQ_scaled =
1150
- ggml_scale(ctx0,
1151
- KQ,
1152
- ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));
1204
+ struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head));
1205
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd/n_head)");
1206
+
1207
+ // KQ_scaled shape [n_past + N, N, n_head, 1]
1208
+ struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
1209
+ ggml_set_name(KQ_scaled, "KQ_scaled");
1153
1210
 
1154
1211
  // KQ_masked = mask_past(KQ_scaled)
1155
- struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
1212
+ struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
1213
+ ggml_set_name(KQ_masked, "KQ_masked");
1156
1214
 
1157
1215
  // KQ = soft_max(KQ_masked)
1158
- struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
1216
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
1217
+ ggml_set_name(KQ_soft_max, "KQ_soft_max");
1218
+
1159
1219
 
1160
1220
  // split cached V into n_head heads
1161
1221
  struct ggml_tensor * V =
@@ -1164,9 +1224,11 @@ static bool llama_eval_internal(
1164
1224
  n_ctx*ggml_element_size(kv_self.v),
1165
1225
  n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head,
1166
1226
  il*n_ctx*ggml_element_size(kv_self.v)*n_embd);
1227
+ ggml_set_name(V, "V");
1167
1228
 
1168
1229
  #if 1
1169
1230
  struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
1231
+ ggml_set_name(KQV, "KQV");
1170
1232
  #else
1171
1233
  // make V contiguous in memory to speed up the matmul, however we waste time on the copy
1172
1234
  // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
@@ -1177,11 +1239,13 @@ static bool llama_eval_internal(
1177
1239
 
1178
1240
  // KQV_merged = KQV.permute(0, 2, 1, 3)
1179
1241
  struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
1242
+ ggml_set_name(KQV_merged, "KQV_merged");
1180
1243
 
1181
1244
  // cur = KQV_merged.contiguous().view(n_embd, N)
1182
1245
  cur = ggml_cpy(ctx0,
1183
1246
  KQV_merged,
1184
1247
  ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
1248
+ ggml_set_name(cur, "KQV_merged_contiguous");
1185
1249
 
1186
1250
  // projection (no bias)
1187
1251
  cur = ggml_mul_mat(ctx0,
@@ -1253,7 +1317,7 @@ static bool llama_eval_internal(
1253
1317
  lctx.use_buf(ctx0, -1);
1254
1318
 
1255
1319
  // logits -> probs
1256
- //inpL = ggml_soft_max(ctx0, inpL);
1320
+ //inpL = ggml_soft_max_inplace(ctx0, inpL);
1257
1321
 
1258
1322
  // run the computation
1259
1323
  ggml_build_forward_expand(&gf, inpL);
@@ -1273,6 +1337,9 @@ static bool llama_eval_internal(
1273
1337
  //embd_w.resize(n_vocab*N);
1274
1338
  //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
1275
1339
 
1340
+ // update kv token count
1341
+ lctx.model.kv_self.n = n_past + N;
1342
+
1276
1343
  // extract logits
1277
1344
  {
1278
1345
  auto & logits_out = lctx.logits;
@@ -1288,7 +1355,7 @@ static bool llama_eval_internal(
1288
1355
  }
1289
1356
 
1290
1357
  // extract embeddings
1291
- if (lctx.embedding.size()) {
1358
+ if (!lctx.embedding.empty()) {
1292
1359
  auto & embedding_out = lctx.embedding;
1293
1360
 
1294
1361
  embedding_out.resize(n_embd);
@@ -1339,6 +1406,8 @@ struct llama_sp_symbol {
1339
1406
  size_t n;
1340
1407
  };
1341
1408
 
1409
+ static_assert(std::is_trivially_copyable<llama_sp_symbol>::value, "llama_sp_symbol is not trivially copyable");
1410
+
1342
1411
  struct llama_sp_bigram {
1343
1412
  struct comparator {
1344
1413
  bool operator()(llama_sp_bigram & l, llama_sp_bigram & r) {
@@ -1371,7 +1440,7 @@ struct llama_tokenizer {
1371
1440
  sym.prev = index - 1;
1372
1441
  sym.next = offs == text.size() ? -1 : index + 1;
1373
1442
  index++;
1374
- symbols_.emplace_back(std::move(sym));
1443
+ symbols_.emplace_back(sym);
1375
1444
  }
1376
1445
 
1377
1446
  // seed the work queue with all possible 2-character tokens.
@@ -1462,12 +1531,12 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
1462
1531
  llama_tokenizer tokenizer(vocab);
1463
1532
  std::vector<llama_vocab::id> output;
1464
1533
 
1465
- if (text.size() == 0) {
1534
+ if (text.empty()) {
1466
1535
  return output;
1467
1536
  }
1468
1537
 
1469
1538
  if (bos) {
1470
- output.push_back(1);
1539
+ output.push_back(llama_token_bos());
1471
1540
  }
1472
1541
 
1473
1542
  tokenizer.tokenize(text, output);
@@ -1478,109 +1547,402 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
1478
1547
  // sampling
1479
1548
  //
1480
1549
 
1481
- static void sample_top_k(std::vector<std::pair<float, llama_vocab::id>> & logits_id, int top_k) {
1482
- // find the top k tokens
1483
- std::partial_sort(
1484
- logits_id.begin(),
1485
- logits_id.begin() + top_k, logits_id.end(),
1486
- [](const std::pair<float, llama_vocab::id> & a, const std::pair<float, llama_vocab::id> & b) {
1487
- return a.first > b.first;
1488
- });
1550
+ void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates) {
1551
+ assert(candidates->size > 0);
1552
+
1553
+ const int64_t t_start_sample_us = ggml_time_us();
1554
+
1555
+ // Sort the logits in descending order
1556
+ if (!candidates->sorted) {
1557
+ std::sort(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
1558
+ return a.logit > b.logit;
1559
+ });
1560
+ candidates->sorted = true;
1561
+ }
1489
1562
 
1490
- logits_id.resize(top_k);
1563
+ float max_l = candidates->data[0].logit;
1564
+ float cum_sum = 0.0f;
1565
+ for (size_t i = 0; i < candidates->size; ++i) {
1566
+ float p = expf(candidates->data[i].logit - max_l);
1567
+ candidates->data[i].p = p;
1568
+ cum_sum += p;
1569
+ }
1570
+ for (size_t i = 0; i < candidates->size; ++i) {
1571
+ candidates->data[i].p /= cum_sum;
1572
+ }
1573
+
1574
+ if (ctx) {
1575
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
1576
+ }
1491
1577
  }
1492
1578
 
1493
- static llama_vocab::id llama_sample_top_p_top_k(
1494
- llama_context & lctx,
1495
- const std::vector<llama_vocab::id> & last_n_tokens,
1496
- int top_k,
1497
- float top_p,
1498
- float temp,
1499
- float repeat_penalty) {
1500
- auto & rng = lctx.rng;
1501
-
1502
- const int n_logits = lctx.model.hparams.n_vocab;
1503
-
1504
- const auto & logits = lctx.logits;
1505
- const auto * plogits = logits.data() + logits.size() - n_logits;
1506
-
1507
- if (temp <= 0) {
1508
- // select the token with the highest logit directly
1509
- float max_logit = plogits[0];
1510
- llama_vocab::id max_id = 0;
1511
-
1512
- for (int i = 1; i < n_logits; ++i) {
1513
- if (plogits[i] > max_logit) {
1514
- max_logit = plogits[i];
1515
- max_id = i;
1516
- }
1579
+ void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep) {
1580
+ const int64_t t_start_sample_us = ggml_time_us();
1581
+
1582
+ k = std::max(k, (int) min_keep);
1583
+ k = std::min(k, (int) candidates->size);
1584
+
1585
+ // Sort scores in descending order
1586
+ if (!candidates->sorted) {
1587
+ auto comp = [](const llama_token_data & a, const llama_token_data & b) {
1588
+ return a.logit > b.logit;
1589
+ };
1590
+ if (k == (int) candidates->size) {
1591
+ std::sort(candidates->data, candidates->data + candidates->size, comp);
1592
+ } else {
1593
+ std::partial_sort(candidates->data, candidates->data + k, candidates->data + candidates->size, comp);
1517
1594
  }
1518
- return max_id;
1595
+ candidates->sorted = true;
1519
1596
  }
1597
+ candidates->size = k;
1520
1598
 
1521
- std::vector<std::pair<float, llama_vocab::id>> logits_id;
1522
- logits_id.reserve(n_logits);
1599
+ if (ctx) {
1600
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
1601
+ }
1602
+ }
1523
1603
 
1524
- {
1525
- const float scale = 1.0f/temp;
1526
- for (int i = 0; i < n_logits; ++i) {
1527
- // repetition penalty from ctrl paper (https://arxiv.org/abs/1909.05858)
1528
- // credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main
1529
- if (std::find(last_n_tokens.begin(), last_n_tokens.end(), i) != last_n_tokens.end()) {
1530
- // if score < 0 then repetition penalty has to multiplied to reduce the previous token probability
1531
- if (plogits[i] < 0.0f) {
1532
- logits_id.push_back(std::make_pair(plogits[i]*scale*repeat_penalty, i));
1533
- } else {
1534
- logits_id.push_back(std::make_pair(plogits[i]*scale/repeat_penalty, i));
1535
- }
1536
- } else {
1537
- logits_id.push_back(std::make_pair(plogits[i]*scale, i));
1538
- }
1604
+ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
1605
+ if (p >= 1.0f) {
1606
+ return;
1607
+ }
1608
+
1609
+ const int64_t t_start_sample_us = ggml_time_us();
1610
+
1611
+ llama_sample_softmax(ctx, candidates);
1612
+
1613
+ // Compute the cumulative probabilities
1614
+ float cum_sum = 0.0f;
1615
+ size_t last_idx = candidates->size;
1616
+
1617
+ for (size_t i = 0; i < candidates->size; ++i) {
1618
+ cum_sum += candidates->data[i].p;
1619
+
1620
+ // Check if the running sum is greater than p or if we have kept at least min_keep tokens
1621
+ if (cum_sum > p && i >= min_keep) {
1622
+ last_idx = i;
1623
+ break;
1539
1624
  }
1540
1625
  }
1541
1626
 
1542
- sample_top_k(logits_id, top_k > 0 ? std::min(top_k, n_logits) : n_logits);
1627
+ // Resize the output vector to keep only the top-p tokens
1628
+ candidates->size = last_idx;
1543
1629
 
1544
- // compute probs for the top k tokens
1545
- std::vector<float> probs;
1546
- probs.reserve(logits_id.size());
1630
+ if (ctx) {
1631
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
1632
+ }
1633
+ }
1634
+
1635
+ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep) {
1636
+ if (z >= 1.0f || candidates->size <= 2) {
1637
+ return;
1638
+ }
1547
1639
 
1548
- float maxl = logits_id[0].first;
1549
- double sum = 0.0;
1550
- for (const auto & kv : logits_id) {
1551
- const float p = expf(kv.first - maxl);
1552
- probs.push_back(p);
1553
- sum += p;
1640
+ const int64_t t_start_sample_us = ggml_time_us();
1641
+
1642
+ llama_sample_softmax(nullptr, candidates);
1643
+
1644
+ // Compute the first and second derivatives
1645
+ std::vector<float> first_derivatives(candidates->size - 1);
1646
+ std::vector<float> second_derivatives(candidates->size - 2);
1647
+
1648
+ for (size_t i = 0; i < first_derivatives.size(); ++i) {
1649
+ first_derivatives[i] = candidates->data[i].p - candidates->data[i + 1].p;
1650
+ }
1651
+ for (size_t i = 0; i < second_derivatives.size(); ++i) {
1652
+ second_derivatives[i] = first_derivatives[i] - first_derivatives[i + 1];
1554
1653
  }
1555
1654
 
1556
- // normalize the probs
1557
- for (auto & p : probs) {
1558
- p /= sum;
1655
+ // Calculate absolute value of second derivatives
1656
+ for (size_t i = 0; i < second_derivatives.size(); ++i) {
1657
+ second_derivatives[i] = abs(second_derivatives[i]);
1559
1658
  }
1560
1659
 
1561
- if (top_p < 1.0) {
1562
- double cumsum = 0.0;
1563
- for (int i = 0; i < (int) probs.size(); i++) {
1564
- cumsum += probs[i];
1565
- if (cumsum >= top_p) {
1566
- probs.resize(i + 1);
1567
- logits_id.resize(i + 1);
1568
- break;
1569
- }
1660
+ // Normalize the second derivatives
1661
+ float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
1662
+ for (float & value : second_derivatives) {
1663
+ value /= second_derivatives_sum;
1664
+ }
1665
+
1666
+ float cum_sum = 0.0f;
1667
+ size_t last_idx = candidates->size;
1668
+ for (size_t i = 0; i < second_derivatives.size(); ++i) {
1669
+ cum_sum += second_derivatives[i];
1670
+
1671
+ // Check if the running sum is greater than z or if we have kept at least min_keep tokens
1672
+ if (cum_sum > z && i >= min_keep) {
1673
+ last_idx = i;
1674
+ break;
1570
1675
  }
1571
1676
  }
1572
1677
 
1573
- //printf("\n");
1574
- //for (int i = 0; i < (int) 10; i++) {
1575
- // printf("%d: '%s' %f\n", i, lctx.vocab.id_to_token.at(logits_id[i].second).tok.c_str(), probs[i]);
1576
- //}
1577
- //printf("\n\n");
1578
- //exit(0);
1678
+ // Resize the output vector to keep only the tokens above the tail location
1679
+ candidates->size = last_idx;
1680
+
1681
+ if (ctx) {
1682
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
1683
+ }
1684
+ }
1685
+
1686
+
1687
+ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
1688
+ // Reference implementation:
1689
+ // https://github.com/huggingface/transformers/compare/main...cimeister:typical-sampling:typical-pr
1690
+ if (p >= 1.0f) {
1691
+ return;
1692
+ }
1693
+
1694
+ const int64_t t_start_sample_us = ggml_time_us();
1695
+
1696
+ // Compute the softmax of logits and calculate entropy
1697
+ llama_sample_softmax(nullptr, candidates);
1698
+
1699
+ float entropy = 0.0f;
1700
+ for (size_t i = 0; i < candidates->size; ++i) {
1701
+ entropy += -candidates->data[i].p * logf(candidates->data[i].p);
1702
+ }
1703
+
1704
+ // Compute the absolute difference between negative log probability and entropy for each candidate
1705
+ std::vector<float> shifted_scores;
1706
+ for (size_t i = 0; i < candidates->size; ++i) {
1707
+ float shifted_score = fabsf(-logf(candidates->data[i].p) - entropy);
1708
+ shifted_scores.push_back(shifted_score);
1709
+ }
1710
+
1711
+ // Sort tokens based on the shifted_scores and their corresponding indices
1712
+ std::vector<size_t> indices(candidates->size);
1713
+ std::iota(indices.begin(), indices.end(), 0);
1714
+
1715
+ std::sort(indices.begin(), indices.end(), [&](size_t a, size_t b) {
1716
+ return shifted_scores[a] < shifted_scores[b];
1717
+ });
1718
+
1719
+ // Compute the cumulative probabilities
1720
+ float cum_sum = 0.0f;
1721
+ size_t last_idx = indices.size();
1722
+
1723
+ for (size_t i = 0; i < indices.size(); ++i) {
1724
+ size_t idx = indices[i];
1725
+ cum_sum += candidates->data[idx].p;
1726
+
1727
+ // Check if the running sum is greater than typical or if we have kept at least min_keep tokens
1728
+ if (cum_sum > p && i >= min_keep - 1) {
1729
+ last_idx = i + 1;
1730
+ break;
1731
+ }
1732
+ }
1733
+
1734
+ // Resize the output vector to keep only the locally typical tokens
1735
+ std::vector<llama_token_data> new_candidates;
1736
+ for (size_t i = 0; i < last_idx; ++i) {
1737
+ size_t idx = indices[i];
1738
+ new_candidates.push_back(candidates->data[idx]);
1739
+ }
1740
+
1741
+ // Replace the data in candidates with the new_candidates data
1742
+ std::copy(new_candidates.begin(), new_candidates.end(), candidates->data);
1743
+ candidates->size = new_candidates.size();
1744
+
1745
+ if (ctx) {
1746
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
1747
+ }
1748
+ }
1749
+
1750
+ void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
1751
+ const int64_t t_start_sample_us = ggml_time_us();
1752
+
1753
+ for (size_t i = 0; i < candidates_p->size; ++i) {
1754
+ candidates_p->data[i].logit /= temp;
1755
+ }
1756
+
1757
+ if (ctx) {
1758
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
1759
+ }
1760
+ }
1761
+
1762
+ void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty) {
1763
+ if (last_tokens_size == 0 || penalty == 1.0f) {
1764
+ return;
1765
+ }
1766
+
1767
+ const int64_t t_start_sample_us = ggml_time_us();
1768
+
1769
+ for (size_t i = 0; i < candidates->size; ++i) {
1770
+ const auto * token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id);
1771
+ if (token_iter == last_tokens + last_tokens_size) {
1772
+ continue;
1773
+ }
1774
+
1775
+ // The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong.
1776
+ // This is common fix for this problem, which is to multiply by the penalty instead of dividing.
1777
+ if (candidates->data[i].logit <= 0) {
1778
+ candidates->data[i].logit *= penalty;
1779
+ } else {
1780
+ candidates->data[i].logit /= penalty;
1781
+ }
1782
+ }
1783
+
1784
+ candidates->sorted = false;
1785
+
1786
+ if (ctx) {
1787
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
1788
+ }
1789
+ }
1790
+
1791
+ void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens_p, size_t last_tokens_size, float alpha_frequency, float alpha_presence) {
1792
+ if (last_tokens_size == 0 || (alpha_frequency == 0.0f && alpha_presence == 0.0f)) {
1793
+ return;
1794
+ }
1795
+
1796
+ const int64_t t_start_sample_us = ggml_time_us();
1797
+
1798
+ // Create a frequency map to count occurrences of each token in last_tokens
1799
+ std::unordered_map<llama_token, int> token_count;
1800
+ for (size_t i = 0; i < last_tokens_size; ++i) {
1801
+ token_count[last_tokens_p[i]]++;
1802
+ }
1803
+
1804
+ // Apply frequency and presence penalties to the candidates
1805
+ for (size_t i = 0; i < candidates->size; ++i) {
1806
+ auto token_iter = token_count.find(candidates->data[i].id);
1807
+ if (token_iter == token_count.end()) {
1808
+ continue;
1809
+ }
1810
+
1811
+ int count = token_iter->second;
1812
+ candidates->data[i].logit -= float(count) * alpha_frequency + float(count > 0) * alpha_presence;
1813
+ }
1814
+
1815
+ candidates->sorted = false;
1816
+
1817
+ if (ctx) {
1818
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
1819
+ }
1820
+ }
1821
+
1822
+
1823
+ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu) {
1824
+ assert(ctx);
1825
+ auto N = float(llama_n_vocab(ctx));
1826
+ int64_t t_start_sample_us;
1827
+ t_start_sample_us = ggml_time_us();
1828
+
1829
+ llama_sample_softmax(nullptr, candidates);
1830
+
1831
+ // Estimate s_hat using the most probable m tokens
1832
+ float s_hat = 0.0;
1833
+ float sum_ti_bi = 0.0;
1834
+ float sum_ti_sq = 0.0;
1835
+ for (size_t i = 0; i < size_t(m - 1) && i < candidates->size - 1; ++i) {
1836
+ float t_i = logf(float(i + 2) / float(i + 1));
1837
+ float b_i = logf(candidates->data[i].p / candidates->data[i + 1].p);
1838
+ sum_ti_bi += t_i * b_i;
1839
+ sum_ti_sq += t_i * t_i;
1840
+ }
1841
+ s_hat = sum_ti_bi / sum_ti_sq;
1842
+
1843
+ // Compute k from the estimated s_hat and target surprise value
1844
+ float epsilon_hat = s_hat - 1;
1845
+ float k = powf((epsilon_hat * powf(2, *mu)) / (1 - powf(N, -epsilon_hat)), 1 / s_hat);
1846
+
1847
+ // Sample the next word X using top-k sampling
1848
+ llama_sample_top_k(nullptr, candidates, int(k), 1);
1849
+ if (ctx) {
1850
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
1851
+ }
1852
+ llama_token X = llama_sample_token(ctx, candidates);
1853
+ t_start_sample_us = ggml_time_us();
1854
+
1855
+ // Compute error as the difference between observed surprise and target surprise value
1856
+ size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
1857
+ return candidate.id == X;
1858
+ }));
1859
+ float observed_surprise = -log2f(candidates->data[X_idx].p);
1860
+ float e = observed_surprise - tau;
1861
+
1862
+ // Update mu using the learning rate and error
1863
+ *mu = *mu - eta * e;
1864
+
1865
+ if (ctx) {
1866
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
1867
+ ctx->n_sample++;
1868
+ }
1869
+ return X;
1870
+ }
1871
+
1872
+ llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu) {
1873
+ assert(ctx);
1874
+ int64_t t_start_sample_us;
1875
+ t_start_sample_us = ggml_time_us();
1876
+
1877
+ llama_sample_softmax(ctx, candidates);
1878
+
1879
+ // Truncate the words with surprise values greater than mu
1880
+ candidates->size = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
1881
+ return -log2f(candidate.p) > *mu;
1882
+ }));
1883
+
1884
+ // Normalize the probabilities of the remaining words
1885
+ llama_sample_softmax(ctx, candidates);
1886
+
1887
+ // Sample the next word X from the remaining words
1888
+ if (ctx) {
1889
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
1890
+ }
1891
+ llama_token X = llama_sample_token(ctx, candidates);
1892
+ t_start_sample_us = ggml_time_us();
1893
+
1894
+ // Compute error as the difference between observed surprise and target surprise value
1895
+ size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
1896
+ return candidate.id == X;
1897
+ }));
1898
+ float observed_surprise = -log2f(candidates->data[X_idx].p);
1899
+ float e = observed_surprise - tau;
1900
+
1901
+ // Update mu using the learning rate and error
1902
+ *mu = *mu - eta * e;
1903
+
1904
+ if (ctx) {
1905
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
1906
+ }
1907
+ return X;
1908
+ }
1909
+
1910
+ llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates) {
1911
+ const int64_t t_start_sample_us = ggml_time_us();
1912
+
1913
+ // Find max element
1914
+ auto * max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
1915
+ return a.logit < b.logit;
1916
+ });
1917
+
1918
+ llama_token result = max_iter->id;
1919
+ if (ctx) {
1920
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
1921
+ ctx->n_sample++;
1922
+ }
1923
+ return result;
1924
+ }
1925
+
1926
+ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) {
1927
+ assert(ctx);
1928
+ const int64_t t_start_sample_us = ggml_time_us();
1929
+ llama_sample_softmax(nullptr, candidates);
1930
+
1931
+ std::vector<float> probs;
1932
+ probs.reserve(candidates->size);
1933
+ for (size_t i = 0; i < candidates->size; ++i) {
1934
+ probs.push_back(candidates->data[i].p);
1935
+ }
1579
1936
 
1580
1937
  std::discrete_distribution<> dist(probs.begin(), probs.end());
1938
+ auto & rng = ctx->rng;
1581
1939
  int idx = dist(rng);
1582
1940
 
1583
- return logits_id[idx].second;
1941
+ llama_token result = candidates->data[idx].id;
1942
+
1943
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
1944
+ ctx->n_sample++;
1945
+ return result;
1584
1946
  }
1585
1947
 
1586
1948
  //
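
The rewritten sampling section above replaces the single llama_sample_top_p_top_k entry point with composable llama_sample_* passes over a llama_token_data_array. A hedged sketch of how a caller might chain them, assuming llama_get_logits() from the existing llama.h API and a caller-maintained last_tokens history (std::vector<llama_token>):

// build the candidate array from the logits of the last evaluated token
const int n_vocab = llama_n_vocab(ctx);
const float * logits = llama_get_logits(ctx);

std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
for (llama_token id = 0; id < n_vocab; ++id) {
    candidates.push_back(llama_token_data{ id, logits[id], 0.0f });
}
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

// penalties first, then truncation, then temperature, then the final draw
llama_sample_repetition_penalty(ctx, &candidates_p, last_tokens.data(), last_tokens.size(), 1.10f);
llama_sample_top_k(ctx, &candidates_p, 40, 1);
llama_sample_top_p(ctx, &candidates_p, 0.95f, 1);
llama_sample_temperature(ctx, &candidates_p, 0.80f);

const llama_token next_token = llama_sample_token(ctx, &candidates_p);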
@@ -1592,8 +1954,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
1592
1954
  switch (ftype) {
1593
1955
  case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
1594
1956
  case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
1595
- case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
1596
- case LLAMA_FTYPE_MOSTLY_Q4_3: quantized_type = GGML_TYPE_Q4_3; break;
1597
1957
  case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
1598
1958
  case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
1599
1959
  case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
@@ -1604,7 +1964,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
1604
1964
  nthread = std::thread::hardware_concurrency();
1605
1965
  }
1606
1966
 
1607
- std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false,
1967
+ std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false,
1608
1968
  /*vocab_only*/ false));
1609
1969
  llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
1610
1970
 
@@ -1658,7 +2018,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
1658
2018
  } else if (tensor.type == GGML_TYPE_F16) {
1659
2019
  f32_conv_buf.resize(nelements * sizeof(float));
1660
2020
  f32_data = (float *) f32_conv_buf.addr;
1661
- auto f16_data = (const ggml_fp16_t *) tensor.data;
2021
+ const auto * f16_data = (const ggml_fp16_t *) tensor.data;
1662
2022
  for (size_t i = 0; i < nelements; i++) {
1663
2023
  f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
1664
2024
  }
@@ -1689,21 +2049,31 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
1689
2049
  size_t first = counter; counter += chunk_size;
1690
2050
  if (first >= nelements) {
1691
2051
  if (!local_hist.empty()) {
1692
- for (int j=0; j<int(local_hist.size()); ++j) hist_cur[j] += local_hist[j];
2052
+ for (int j=0; j<int(local_hist.size()); ++j) {
2053
+ hist_cur[j] += local_hist[j];
2054
+ }
1693
2055
  new_size += local_size;
1694
2056
  }
1695
2057
  break;
1696
2058
  }
1697
2059
  lock.unlock();
1698
2060
  size_t last = std::min(nelements, first + chunk_size);
1699
- if (local_hist.empty()) local_hist.resize(hist_cur.size(), 0);
2061
+ if (local_hist.empty()) {
2062
+ local_hist.resize(hist_cur.size(), 0);
2063
+ }
1700
2064
  local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
1701
2065
  }
1702
2066
  };
1703
- if (int(workers.size()) < nthread_use - 1) workers.resize(nthread_use - 1);
1704
- for (int it = 0; it < nthread_use - 1; ++it) workers[it] = std::thread(compute);
2067
+ if ((int) workers.size() < nthread_use - 1) {
2068
+ workers.resize(nthread_use - 1);
2069
+ }
2070
+ for (int it = 0; it < nthread_use - 1; ++it) {
2071
+ workers[it] = std::thread(compute);
2072
+ }
1705
2073
  compute();
1706
- for (int it = 0; it < nthread_use - 1; ++it) workers[it].join();
2074
+ for (int it = 0; it < nthread_use - 1; ++it) {
2075
+ workers[it].join();
2076
+ }
1707
2077
  }
1708
2078
 
1709
2079
  printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
@@ -1749,7 +2119,7 @@ struct llama_context * llama_init_from_file(
1749
2119
 
1750
2120
  llama_context * ctx = new llama_context;
1751
2121
 
1752
- if (params.seed <= 0) {
2122
+ if (params.seed < 0) {
1753
2123
  params.seed = time(NULL);
1754
2124
  }
1755
2125
 
@@ -1775,7 +2145,7 @@ struct llama_context * llama_init_from_file(
1775
2145
 
1776
2146
  ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
1777
2147
 
1778
- if (!llama_model_load(path_model, *ctx, params.n_ctx, memory_type,
2148
+ if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_gpu_layers, memory_type,
1779
2149
  params.use_mmap, params.use_mlock, params.vocab_only,
1780
2150
  params.progress_callback, params.progress_callback_user_data)) {
1781
2151
  fprintf(stderr, "%s: failed to load model\n", __func__);
@@ -1901,7 +2271,8 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
1901
2271
  fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
1902
2272
  model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false));
1903
2273
 
1904
- size_t ctx_size, mmapped_size;
2274
+ size_t ctx_size;
2275
+ size_t mmapped_size;
1905
2276
  model_loader->calc_sizes(&ctx_size, &mmapped_size);
1906
2277
  base_buf.resize(ctx_size);
1907
2278
 
@@ -1940,8 +2311,12 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
1940
2311
  fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
1941
2312
  }
1942
2313
 
1943
- std::string name(length, 0);
1944
- fin.read(&name[0], length);
2314
+ std::string name;
2315
+ {
2316
+ char buf[1024];
2317
+ fin.read(buf, length);
2318
+ name = std::string(buf, length);
2319
+ }
1945
2320
 
1946
2321
  // check for lora suffix and get the type of tensor
1947
2322
  const std::string lora_suffix = ".lora";
@@ -1956,7 +2331,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
1956
2331
  base_name.erase(pos);
1957
2332
  // fprintf(stderr, "%s: %s => %s (lora type %s) ", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());
1958
2333
 
1959
- if (model_tensors.find(base_name.data()) == model_tensors.end()) {
2334
+ if (model_tensors.find(base_name) == model_tensors.end()) {
1960
2335
  fprintf(stderr, "%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
1961
2336
  return 1;
1962
2337
  }
@@ -2036,7 +2411,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
2036
2411
 
2037
2412
  if (scaling != 1.0f) {
2038
2413
  ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
2039
- BA = ggml_scale(lora_ctx, BA, scale_tensor);
2414
+ BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor);
2040
2415
  }
2041
2416
 
2042
2417
  ggml_tensor * r;
@@ -2058,8 +2433,9 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
2058
2433
  lora_tensors.clear();
2059
2434
 
2060
2435
  n_tensors++;
2061
- if (n_tensors % 4 == 0)
2436
+ if (n_tensors % 4 == 0) {
2062
2437
  fprintf(stderr, ".");
2438
+ }
2063
2439
  }
2064
2440
  }
2065
2441
 
@@ -2084,21 +2460,21 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
2084
2460
  }
2085
2461
  }
2086
2462
 
2087
- int llama_get_kv_cache_token_count(struct llama_context * ctx) {
2463
+ int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
2088
2464
  return ctx->model.kv_self.n;
2089
2465
  }
2090
2466
 
2091
- #define LLAMA_MAX_RNG_STATE 64*1024
2467
+ #define LLAMA_MAX_RNG_STATE (64*1024)
2092
2468
 
2093
2469
  void llama_set_rng_seed(struct llama_context * ctx, int seed) {
2094
- if (seed <= 0) {
2470
+ if (seed < 0) {
2095
2471
  seed = time(NULL);
2096
2472
  }
2097
2473
  ctx->rng.seed(seed);
2098
2474
  }
2099
2475
 
2100
- // Returns the size of the state
2101
- size_t llama_get_state_size(struct llama_context * ctx) {
2476
+ // Returns the *maximum* size of the state
2477
+ size_t llama_get_state_size(const struct llama_context * ctx) {
2102
2478
  // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
2103
2479
  // for reference, std::mt19937(1337) serializes to 6701 bytes.
2104
2480
  const size_t s_rng_size = sizeof(size_t);
@@ -2129,8 +2505,8 @@ size_t llama_get_state_size(struct llama_context * ctx) {
2129
2505
  }
2130
2506
 
2131
2507
  // Copies the state to the specified destination address
2132
- size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
2133
- uint8_t * out = dest;
2508
+ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
2509
+ uint8_t * out = dst;
2134
2510
 
2135
2511
  // copy rng
2136
2512
  {
@@ -2176,36 +2552,70 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
2176
2552
 
2177
2553
  // copy kv cache
2178
2554
  {
2179
- const size_t kv_size = ctx->model.kv_self.buf.size;
2555
+ const auto & kv_self = ctx->model.kv_self;
2556
+ const auto & hparams = ctx->model.hparams;
2557
+ const int n_layer = hparams.n_layer;
2558
+ const int n_embd = hparams.n_embd;
2559
+ const int n_ctx = hparams.n_ctx;
2560
+
2561
+ const size_t kv_size = kv_self.buf.size;
2180
2562
  const int kv_ntok = llama_get_kv_cache_token_count(ctx);
2181
2563
 
2182
2564
  memcpy(out, &kv_size, sizeof(kv_size)); out += sizeof(kv_size);
2183
2565
  memcpy(out, &kv_ntok, sizeof(kv_ntok)); out += sizeof(kv_ntok);
2184
2566
 
2185
2567
  if (kv_size) {
2186
- memcpy(out, ctx->model.kv_self.buf.addr, kv_size); out += kv_size;
2568
+ const size_t elt_size = ggml_element_size(kv_self.k);
2569
+
2570
+ char buffer[4096];
2571
+
2572
+ ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
2573
+ ggml_cgraph gf{};
2574
+ gf.n_threads = 1;
2575
+
2576
+ ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
2577
+ kout3d->data = out;
2578
+ out += ggml_nbytes(kout3d);
2579
+
2580
+ ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
2581
+ vout3d->data = out;
2582
+ out += ggml_nbytes(vout3d);
2583
+
2584
+ ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
2585
+ n_embd, kv_ntok, n_layer,
2586
+ elt_size*n_embd, elt_size*n_embd*n_ctx, 0);
2587
+
2588
+ ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
2589
+ kv_ntok, n_embd, n_layer,
2590
+ elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
2591
+
2592
+ ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
2593
+ ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
2594
+ ggml_graph_compute(cpy_ctx, &gf);
2595
+
2596
+ ggml_free(cpy_ctx);
2187
2597
  }
2188
2598
  }
2189
2599
 
2190
- const size_t written = out - dest;
2191
- const size_t expected = llama_get_state_size(ctx);
2600
+ const size_t written = out - dst;
2601
+ const size_t max_size = llama_get_state_size(ctx);
2192
2602
 
2193
- LLAMA_ASSERT(written == expected);
2603
+ LLAMA_ASSERT(written <= max_size);
2194
2604
 
2195
2605
  return written;
2196
2606
  }
2197
2607
 
2198
2608
  // Sets the state reading from the specified source address
2199
2609
  size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
2200
- const uint8_t * in = src;
2610
+ const uint8_t * inp = src;
2201
2611
 
2202
2612
  // set rng
2203
2613
  {
2204
2614
  size_t rng_size;
2205
2615
  char rng_buf[LLAMA_MAX_RNG_STATE];
2206
2616
 
2207
- memcpy(&rng_size, in, sizeof(rng_size)); in += sizeof(rng_size);
2208
- memcpy(&rng_buf[0], in, LLAMA_MAX_RNG_STATE); in += LLAMA_MAX_RNG_STATE;
2617
+ memcpy(&rng_size, inp, sizeof(rng_size)); inp += sizeof(rng_size);
2618
+ memcpy(&rng_buf[0], inp, LLAMA_MAX_RNG_STATE); inp += LLAMA_MAX_RNG_STATE;
2209
2619
 
2210
2620
  std::stringstream rng_ss;
2211
2621
  rng_ss.str(std::string(&rng_buf[0], rng_size));
@@ -2219,65 +2629,171 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
2219
2629
  size_t logits_cap;
2220
2630
  size_t logits_size;
2221
2631
 
2222
- memcpy(&logits_cap, in, sizeof(logits_cap)); in += sizeof(logits_cap);
2223
- memcpy(&logits_size, in, sizeof(logits_size)); in += sizeof(logits_size);
2632
+ memcpy(&logits_cap, inp, sizeof(logits_cap)); inp += sizeof(logits_cap);
2633
+ memcpy(&logits_size, inp, sizeof(logits_size)); inp += sizeof(logits_size);
2224
2634
 
2225
2635
  LLAMA_ASSERT(ctx->logits.capacity() == logits_cap);
2226
2636
 
2227
2637
  if (logits_size) {
2228
2638
  ctx->logits.resize(logits_size);
2229
- memcpy(ctx->logits.data(), in, logits_size * sizeof(float));
2639
+ memcpy(ctx->logits.data(), inp, logits_size * sizeof(float));
2230
2640
  }
2231
2641
 
2232
- in += logits_cap * sizeof(float);
2642
+ inp += logits_cap * sizeof(float);
2233
2643
  }
2234
2644
 
2235
2645
  // set embeddings
2236
2646
  {
2237
2647
  size_t embedding_size;
2238
2648
 
2239
- memcpy(&embedding_size, in, sizeof(embedding_size)); in += sizeof(embedding_size);
2649
+ memcpy(&embedding_size, inp, sizeof(embedding_size)); inp += sizeof(embedding_size);
2240
2650
 
2241
2651
  LLAMA_ASSERT(ctx->embedding.capacity() == embedding_size);
2242
2652
 
2243
2653
  if (embedding_size) {
2244
- memcpy(ctx->embedding.data(), in, embedding_size * sizeof(float));
2245
- in += embedding_size * sizeof(float);
2654
+ memcpy(ctx->embedding.data(), inp, embedding_size * sizeof(float));
2655
+ inp += embedding_size * sizeof(float);
2246
2656
  }
2247
2657
  }
2248
2658
 
2249
2659
  // set kv cache
2250
2660
  {
2661
+ const auto & kv_self = ctx->model.kv_self;
2662
+ const auto & hparams = ctx->model.hparams;
2663
+ const int n_layer = hparams.n_layer;
2664
+ const int n_embd = hparams.n_embd;
2665
+ const int n_ctx = hparams.n_ctx;
2666
+
2251
2667
  size_t kv_size;
2252
2668
  int kv_ntok;
2253
2669
 
2254
- memcpy(&kv_size, in, sizeof(kv_size)); in += sizeof(kv_size);
2255
- memcpy(&kv_ntok, in, sizeof(kv_ntok)); in += sizeof(kv_ntok);
2670
+ memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
2671
+ memcpy(&kv_ntok, inp, sizeof(kv_ntok)); inp += sizeof(kv_ntok);
2256
2672
 
2257
2673
  if (kv_size) {
2258
- LLAMA_ASSERT(ctx->model.kv_self.buf.size == kv_size);
2674
+ LLAMA_ASSERT(kv_self.buf.size == kv_size);
2675
+
2676
+ const size_t elt_size = ggml_element_size(kv_self.k);
2677
+
2678
+ char buffer[4096];
2259
2679
 
2260
- void * k_data = ctx->model.kv_self.k->data; // remember data pointers
2261
- void * v_data = ctx->model.kv_self.v->data; // because their value is stored in buf and overwritten by memcpy
2680
+ ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
2681
+ ggml_cgraph gf{};
2682
+ gf.n_threads = 1;
2262
2683
 
2263
- memcpy(ctx->model.kv_self.buf.addr, in, kv_size); in += kv_size;
2684
+ ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
2685
+ kin3d->data = (void *) inp;
2686
+ inp += ggml_nbytes(kin3d);
2264
2687
 
2265
- ctx->model.kv_self.k->data = k_data; // restore correct data pointers
2266
- ctx->model.kv_self.v->data = v_data;
2688
+ ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
2689
+ vin3d->data = (void *) inp;
2690
+ inp += ggml_nbytes(vin3d);
2267
2691
 
2692
+ ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
2693
+ n_embd, kv_ntok, n_layer,
2694
+ elt_size*n_embd, elt_size*n_embd*n_ctx, 0);
2695
+
2696
+ ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
2697
+ kv_ntok, n_embd, n_layer,
2698
+ elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
2699
+
2700
+ ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
2701
+ ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
2702
+ ggml_graph_compute(cpy_ctx, &gf);
2703
+
2704
+ ggml_free(cpy_ctx);
2268
2705
  }
2269
2706
 
2270
2707
  ctx->model.kv_self.n = kv_ntok;
2271
2708
  }
2272
2709
 
2273
- const size_t nread = in - src;
2274
- const size_t expected = llama_get_state_size(ctx);
2710
+ const size_t nread = inp - src;
2711
+ const size_t max_size = llama_get_state_size(ctx);
2275
2712
 
2276
- LLAMA_ASSERT(nread == expected);
2713
+ LLAMA_ASSERT(nread <= max_size);
2277
2714
 
2278
2715
  return nread;
2279
2716
  }
2280
2717
 
2718
+ bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
2719
+ llama_file file(path_session, "rb");
2720
+
2721
+ // sanity checks
2722
+ {
2723
+ const uint32_t magic = file.read_u32();
2724
+ const uint32_t version = file.read_u32();
2725
+
2726
+ if (magic != LLAMA_SESSION_MAGIC || version != LLAMA_SESSION_VERSION) {
2727
+ fprintf(stderr, "%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
2728
+ return false;
2729
+ }
2730
+
2731
+ llama_hparams session_hparams;
2732
+ file.read_raw(&session_hparams, sizeof(llama_hparams));
2733
+
2734
+ if (session_hparams != ctx->model.hparams) {
2735
+ fprintf(stderr, "%s : model hparams didn't match from session file!\n", __func__);
2736
+ return false;
2737
+ }
2738
+ }
2739
+
2740
+ // load the prompt
2741
+ {
2742
+ const uint32_t n_token_count = file.read_u32();
2743
+
2744
+ if (n_token_count > n_token_capacity) {
2745
+ fprintf(stderr, "%s : token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
2746
+ return false;
2747
+ }
2748
+
2749
+ file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
2750
+ *n_token_count_out = n_token_count;
2751
+ }
2752
+
2753
+ // restore the context state
2754
+ {
2755
+ const size_t n_state_size_cur = file.size - file.tell();
2756
+ const size_t n_state_size_max = llama_get_state_size(ctx);
2757
+
2758
+ if (n_state_size_cur > n_state_size_max) {
2759
+ fprintf(stderr, "%s : the state size in session file is too big! max %zu, got %zu\n", __func__, n_state_size_max, n_state_size_cur);
2760
+ return false;
2761
+ }
2762
+
2763
+ std::vector<uint8_t> state_data(n_state_size_max);
2764
+ file.read_raw(state_data.data(), n_state_size_cur);
2765
+
2766
+ llama_set_state_data(ctx, state_data.data());
2767
+ }
2768
+
2769
+ return true;
2770
+ }
2771
+
2772
+ bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
2773
+ llama_file file(path_session, "wb");
2774
+
2775
+ file.write_u32(LLAMA_SESSION_MAGIC);
2776
+ file.write_u32(LLAMA_SESSION_VERSION);
2777
+
2778
+ file.write_raw(&ctx->model.hparams, sizeof(llama_hparams));
2779
+
2780
+ // save the prompt
2781
+ file.write_u32((uint32_t) n_token_count);
2782
+ file.write_raw(tokens, sizeof(llama_token) * n_token_count);
2783
+
2784
+ // save the context state
2785
+ {
2786
+ const size_t n_state_size_max = llama_get_state_size(ctx);
2787
+
2788
+ std::vector<uint8_t> state_data(n_state_size_max);
2789
+ const size_t n_state_size_cur = llama_copy_state_data(ctx, state_data.data());
2790
+
2791
+ file.write_raw(state_data.data(), n_state_size_cur);
2792
+ }
2793
+
2794
+ return true;
2795
+ }
2796
+
2281
2797
  int llama_eval(
2282
2798
  struct llama_context * ctx,
2283
2799
  const llama_token * tokens,
@@ -2288,11 +2804,14 @@ int llama_eval(
2288
2804
  fprintf(stderr, "%s: failed to eval\n", __func__);
2289
2805
  return 1;
2290
2806
  }
2807
+
2291
2808
  // get a more accurate load time, upon first eval
2809
+ // TODO: fix this
2292
2810
  if (!ctx->has_evaluated_once) {
2293
2811
  ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
2294
2812
  ctx->has_evaluated_once = true;
2295
2813
  }
2814
+
2296
2815
  return 0;
2297
2816
  }
2298
2817
 
@@ -2316,15 +2835,15 @@ int llama_tokenize(
2316
2835
  return res.size();
2317
2836
  }
2318
2837
 
2319
- int llama_n_vocab(struct llama_context * ctx) {
2838
+ int llama_n_vocab(const struct llama_context * ctx) {
2320
2839
  return ctx->vocab.id_to_token.size();
2321
2840
  }
2322
2841
 
2323
- int llama_n_ctx(struct llama_context * ctx) {
2842
+ int llama_n_ctx(const struct llama_context * ctx) {
2324
2843
  return ctx->model.hparams.n_ctx;
2325
2844
  }
2326
2845
 
2327
- int llama_n_embd(struct llama_context * ctx) {
2846
+ int llama_n_embd(const struct llama_context * ctx) {
2328
2847
  return ctx->model.hparams.n_embd;
2329
2848
  }
2330
2849
 
@@ -2336,7 +2855,7 @@ float * llama_get_embeddings(struct llama_context * ctx) {
2336
2855
  return ctx->embedding.data();
2337
2856
  }
2338
2857
 
2339
- const char * llama_token_to_str(struct llama_context * ctx, llama_token token) {
2858
+ const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) {
2340
2859
  if (token >= llama_n_vocab(ctx)) {
2341
2860
  return nullptr;
2342
2861
  }
@@ -2352,33 +2871,8 @@ llama_token llama_token_eos() {
2352
2871
  return 2;
2353
2872
  }
2354
2873
 
2355
- llama_token llama_sample_top_p_top_k(
2356
- llama_context * ctx,
2357
- const llama_token * last_n_tokens_data,
2358
- int last_n_tokens_size,
2359
- int top_k,
2360
- float top_p,
2361
- float temp,
2362
- float repeat_penalty) {
2363
- const int64_t t_start_sample_us = ggml_time_us();
2364
-
2365
- llama_token result = 0;
2366
-
2367
- // TODO: avoid this ...
2368
- const auto last_n_tokens = std::vector<llama_token>(last_n_tokens_data, last_n_tokens_data + last_n_tokens_size);
2369
-
2370
- result = llama_sample_top_p_top_k(
2371
- *ctx,
2372
- last_n_tokens,
2373
- top_k,
2374
- top_p,
2375
- temp,
2376
- repeat_penalty);
2377
-
2378
- ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
2379
- ctx->n_sample++;
2380
-
2381
- return result;
2874
+ llama_token llama_token_nl() {
2875
+ return 13;
2382
2876
  }
2383
2877
 
2384
2878
 
@@ -2391,9 +2885,9 @@ void llama_print_timings(struct llama_context * ctx) {

  fprintf(stderr, "\n");
  fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
- fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample);
+ fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample);
  fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval);
- fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval);
+ fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval);
  fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0);
  }

@@ -2430,4 +2924,3 @@ const char * llama_print_system_info(void) {
  std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
  return ctx->model.tensors_by_name;
  }
-
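
This release also adds prompt/session persistence via llama_load_session_file and llama_save_session_file (see the state-handling hunks above). A minimal round-trip sketch; the session path and token buffer size are placeholders, and the prompt evaluation step is elided:

std::vector<llama_token> tokens(512);  // capacity for the saved prompt
size_t n_tokens = 0;

if (llama_load_session_file(ctx, "prompt.session", tokens.data(), tokens.size(), &n_tokens)) {
    // ctx's KV cache and RNG state now match the first n_tokens prompt tokens
} else {
    // tokenize and llama_eval() the prompt here, filling tokens/n_tokens, then persist it:
    llama_save_session_file(ctx, "prompt.session", tokens.data(), n_tokens);
}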