llama_cpp 0.2.2 → 0.3.1

@@ -21,9 +21,13 @@
  #endif
  #ifdef GGML_USE_K_QUANTS
  #ifndef QK_K
+ #ifdef GGML_QKK_64
+ #define QK_K 64
+ #else
  #define QK_K 256
  #endif
  #endif
+ #endif

  #include <array>
  #include <ctime>
@@ -182,6 +186,19 @@ struct llama_kv_cache {
  }
  };

+ struct llama_vocab {
+ using id = int32_t;
+ using token = std::string;
+
+ struct token_score {
+ token tok;
+ float score;
+ };
+
+ std::unordered_map<token, id> token_to_id;
+ std::vector<token_score> id_to_token;
+ };
+
  struct llama_model {
  e_model type = MODEL_UNKNOWN;

@@ -198,10 +215,6 @@ struct llama_model {
  // context
  struct ggml_context * ctx = NULL;

- // key + value cache for the self attention
- // TODO: move to llama_state
- struct llama_kv_cache kv_self;
-
  // the model memory buffer
  llama_ctx_buffer buf;

@@ -215,6 +228,11 @@ struct llama_model {
  // for quantize-stats only
  std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;

+ int64_t t_load_us = 0;
+ int64_t t_start_us = 0;
+
+ llama_vocab vocab;
+
  ~llama_model() {
  if (ctx) {
  ggml_free(ctx);
@@ -233,24 +251,11 @@ struct llama_model {
  }
  };

- struct llama_vocab {
- using id = int32_t;
- using token = std::string;
-
- struct token_score {
- token tok;
- float score;
- };
-
- std::unordered_map<token, id> token_to_id;
- std::vector<token_score> id_to_token;
- };
-
  struct llama_context {
+ llama_context(const llama_model & model, const llama_vocab & vocab) : model(model), vocab(vocab), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
+
  std::mt19937 rng;

- int64_t t_load_us = 0;
- int64_t t_start_us = 0;
  bool has_evaluated_once = false;

  int64_t t_sample_us = 0;
@@ -261,8 +266,16 @@ struct llama_context {
  int32_t n_eval = 0; // number of eval calls
  int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)

- llama_model model;
- llama_vocab vocab;
+ const llama_model & model;
+ const llama_vocab & vocab;
+
+ bool model_owner = false;
+
+ int64_t t_load_us;
+ int64_t t_start_us;
+
+ // key + value cache for the self attention
+ struct llama_kv_cache kv_self;

  size_t mem_per_token = 0;

@@ -351,96 +364,14 @@ static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml
  return size / ggml_blck_size(type);
  }

- struct llama_load_tensor_shard {
- std::vector<uint32_t> ne;
- size_t size;
- enum ggml_type type;
- size_t file_idx;
- size_t file_off;
-
- void calc_size() {
- size = llama_calc_tensor_size(ne, type);
- }
- };
-
- enum llama_split_type {
- SPLIT_NONE,
- SPLIT_BY_COLUMNS,
- SPLIT_BY_ROWS
- };
-
  struct llama_load_tensor {
- std::vector<llama_load_tensor_shard> shards;
-
  std::string name;
  enum ggml_type type = GGML_TYPE_F32;
- llama_split_type split_type = SPLIT_NONE;
  std::vector<uint32_t> ne;
+ size_t file_off;
  size_t size;
  struct ggml_tensor * ggml_tensor = NULL;
  uint8_t * data;
-
- llama_load_tensor(const std::string & name) : name(name) {}
-
- void calc_all() {
- calc_type();
- calc_split_type();
- calc_ne();
- calc_size();
- }
-
- void calc_type() {
- const auto & first_shard = shards.at(0);
- for (const auto & shard : shards) {
- if (shard.type != first_shard.type) {
- throw std::runtime_error(format("inconsistent tensor shard type in '%s'", name.c_str()));
- }
- }
- type = first_shard.type;
- }
-
- void calc_split_type() {
- if (shards.at(0).ne.size() == 1 || // 1D tensors are just duplicated in every file
- shards.size() == 1) { // only one file?
- split_type = SPLIT_NONE;
- } else if (name.find("tok_embeddings.") == 0 ||
- name.find(".attention.wo.weight") != std::string::npos ||
- name.find(".feed_forward.w2.weight") != std::string::npos) {
- split_type = SPLIT_BY_COLUMNS;
- } else {
- split_type = SPLIT_BY_ROWS;
- }
- }
-
- void calc_ne() {
- const auto & first_shard = shards.at(0);
- for (const auto & shard : shards) {
- if (shard.ne != first_shard.ne) {
- throw std::runtime_error(format("inconsistent tensor shard shape in '%s': first was %s, other was %s",
- name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str()));
- }
- }
- ne = first_shard.ne;
- LLAMA_ASSERT(shards.size() <= UINT32_MAX);
- uint32_t n_shards = (uint32_t) shards.size();
- switch (split_type) {
- case SPLIT_NONE:
- ne = first_shard.ne;
- break;
- case SPLIT_BY_COLUMNS:
- ne = {checked_mul<uint32_t>(first_shard.ne[0], n_shards),
- first_shard.ne[1]};
- break;
- case SPLIT_BY_ROWS:
- ne = {first_shard.ne[0],
- checked_mul<uint32_t>(first_shard.ne[1], n_shards)};
- break;
- }
- }
-
- void calc_size() {
- size = llama_calc_tensor_size(ne, type);
- }
  };

  struct llama_load_tensors_map {
@@ -463,13 +394,13 @@ struct llama_file_loader {
  llama_hparams hparams;
  llama_vocab vocab;

- llama_file_loader(const char * fname, size_t file_idx, llama_load_tensors_map & tensors_map)
+ llama_file_loader(const char * fname, llama_load_tensors_map & tensors_map)
  : file(fname, "rb") {
  fprintf(stderr, "llama.cpp: loading model from %s\n", fname);
  read_magic();
  read_hparams();
  read_vocab();
- read_tensor_metadata(file_idx, tensors_map);
+ read_tensor_metadata(tensors_map);
  }
  void read_magic() {
  uint32_t magic = file.read_u32();
@@ -526,19 +457,19 @@ struct llama_file_loader {
  tok_score.score = score;
  }
  }
- void read_tensor_metadata(size_t file_idx, llama_load_tensors_map & tensors_map) {
+ void read_tensor_metadata(llama_load_tensors_map & tensors_map) {
  while (file.tell() < file.size) {
- llama_load_tensor_shard shard;
+ llama_load_tensor tensor;
  uint32_t n_dims = file.read_u32();
  uint32_t name_len = file.read_u32();
- shard.type = (enum ggml_type) file.read_u32();
- shard.ne.resize(n_dims);
- file.read_raw(shard.ne.data(), sizeof(shard.ne[0]) * n_dims);
+ tensor.type = (enum ggml_type) file.read_u32();
+ tensor.ne.resize(n_dims);
+ file.read_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * n_dims);
  std::string name = file.read_string(name_len);
  if (n_dims < 1 || n_dims > 2) {
  throw std::runtime_error(format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims));
  }
- switch (shard.type) {
+ switch (tensor.type) {
  case GGML_TYPE_F32:
  case GGML_TYPE_F16:
  case GGML_TYPE_Q4_0:
@@ -553,30 +484,20 @@ struct llama_file_loader {
  case GGML_TYPE_Q6_K:
  break;
  default: {
- throw std::runtime_error(format("unrecognized tensor type %u\n", shard.type));
+ throw std::runtime_error(format("unrecognized tensor type %u\n", tensor.type));
  }
  }

- if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
- // skip to the next multiple of 32 bytes
- file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
- }
- shard.file_idx = file_idx;
- shard.file_off = file.tell();
+ // skip to the next multiple of 32 bytes
+ file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);

- shard.calc_size();
- file.seek(shard.size, SEEK_CUR);
+ tensor.file_off = file.tell();
+ tensor.name = name;
+ tensor.size = llama_calc_tensor_size(tensor.ne, tensor.type);
+ file.seek(tensor.size, SEEK_CUR);

- auto it = tensors_map.name_to_idx.find(name);
- size_t idx;
- if (it != tensors_map.name_to_idx.end()) {
- idx = it->second;
- } else {
- tensors_map.tensors.emplace_back(name);
- idx = tensors_map.tensors.size() - 1;
- tensors_map.name_to_idx.emplace(name, idx);
- }
- tensors_map.tensors.at(idx).shards.push_back(shard);
+ tensors_map.tensors.push_back(tensor);
+ tensors_map.name_to_idx[name] = tensors_map.tensors.size() - 1;
  }
  }
  };
@@ -646,56 +567,19 @@ struct llama_file_saver {
  };

  struct llama_model_loader {
- std::vector<std::unique_ptr<llama_file_loader>> file_loaders;
+ std::unique_ptr<llama_file_loader> file_loader;
  llama_load_tensors_map tensors_map;
  bool use_mmap;
  size_t num_ggml_tensors_created = 0;
  struct ggml_context * ggml_ctx = NULL;
  std::unique_ptr<llama_mmap> mapping;

- llama_model_loader(const std::string & fname_base, bool use_mmap, bool vocab_only) {
- auto * first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
- file_loaders.emplace_back(first_file);
- uint32_t n_parts = vocab_only ? 1 : guess_n_parts();
- for (uint32_t i = 1; i < n_parts; i++) {
- std::string fname = fname_base + "." + std::to_string(i);
- auto * ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
- file_loaders.emplace_back(ith_file);
- if (ith_file->hparams != first_file->hparams) {
- throw std::runtime_error(format("llama.cpp: hparams inconsistent between files"));
- }
- }
+ llama_model_loader(const std::string & fname_base, bool use_mmap) {
+ file_loader = std::unique_ptr<llama_file_loader>(new llama_file_loader(fname_base.c_str(), tensors_map));
  if (!llama_mmap::SUPPORTED) {
  use_mmap = false;
  }
- if (use_mmap && alignment_prevents_mmap()) {
- fprintf(stderr, "llama.cpp: can't use mmap because tensors are not aligned; convert to new format to avoid this\n");
- use_mmap = false;
- }
  this->use_mmap = use_mmap;
- for (llama_load_tensor & lt : tensors_map.tensors) {
- lt.calc_all();
- }
- }
-
- bool alignment_prevents_mmap() {
- for (const llama_load_tensor & lt : tensors_map.tensors) {
- for (const llama_load_tensor_shard & shard : lt.shards) {
- if (shard.file_off & 3) {
- return true;
- }
- }
- }
- return false;
- }
-
- uint32_t guess_n_parts() const {
- auto it = tensors_map.name_to_idx.find("tok_embeddings.weight");
- if (it == tensors_map.name_to_idx.end()) {
- throw std::runtime_error(std::string("missing tok_embeddings.weight"));
- }
- const llama_load_tensor & lt = tensors_map.tensors.at(it->second);
- return file_loaders.at(0)->hparams.n_embd / lt.shards.at(0).ne.at(0);
  }

  void calc_sizes(size_t * ctx_size_p, size_t * mmapped_size_p) const {
@@ -761,7 +645,7 @@ struct llama_model_loader {
  }

  if (use_mmap) {
- mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
+ mapping.reset(new llama_mmap(&file_loader->file, prefetch_size, ggml_is_numa()));
  if (lmlock) {
  lmlock->init(mapping->addr);
  }
@@ -817,45 +701,13 @@ struct llama_model_loader {

  void load_data_for(llama_load_tensor & lt) {
  if (use_mmap) {
- LLAMA_ASSERT(lt.shards.size() == 1);
- lt.data = (uint8_t *) mapping->addr + lt.shards.at(0).file_off;
- } else if (lt.split_type == SPLIT_NONE) {
- llama_file & file = file_loaders.at(lt.shards.at(0).file_idx)->file;
- file.seek(lt.shards.at(0).file_off, SEEK_SET);
+ lt.data = (uint8_t *) mapping->addr + lt.file_off;
+ } else {
+ llama_file & file = file_loader->file;
+ file.seek(lt.file_off, SEEK_SET);
  file.read_raw(lt.data, lt.size);
- } else if (lt.split_type == SPLIT_BY_ROWS) {
- size_t offset = 0;
- for (llama_load_tensor_shard & shard : lt.shards) {
- llama_file & file = file_loaders.at(shard.file_idx)->file;
- file.seek(shard.file_off, SEEK_SET);
- file.read_raw(lt.data + offset, shard.size);
- offset += shard.size;
- }
- LLAMA_ASSERT(offset == lt.size);
- } else if (lt.split_type == SPLIT_BY_COLUMNS) {
- // Let's load the data into temporary buffers to ensure the OS performs large loads.
- std::vector<llama_buffer> tmp_bufs(lt.shards.size());
- for (size_t i = 0; i < lt.shards.size(); i++) {
- llama_load_tensor_shard & shard = lt.shards.at(i);
- llama_file & file = file_loaders.at(shard.file_idx)->file;
- file.seek(shard.file_off, SEEK_SET);
- tmp_bufs.at(i).resize(shard.size);
- file.read_raw(tmp_bufs.at(i).addr, shard.size);
- }
- // Then reshape.
- size_t num_rows = lt.ne.at(1);
- size_t per_shard_row_size = lt.shards.at(0).size / num_rows;
- size_t out_offset = 0;
- for (size_t row = 0; row < num_rows; row++) {
- for (llama_buffer & tmp_buf : tmp_bufs) {
- memcpy(lt.data + out_offset,
- tmp_buf.addr + row * per_shard_row_size,
- per_shard_row_size);
- out_offset += per_shard_row_size;
- }
- }
- LLAMA_ASSERT(out_offset == lt.size);
  }
+
  if (0) {
  print_checksum(lt);
  }
@@ -925,7 +777,7 @@ static bool kv_cache_init(

  struct llama_context_params llama_context_default_params() {
  struct llama_context_params result = {
- /*.seed =*/ -1,
+ /*.seed =*/ LLAMA_DEFAULT_SEED,
  /*.n_ctx =*/ 512,
  /*.n_batch =*/ 512,
  /*.gpu_layers =*/ 0,
@@ -964,7 +816,7 @@ bool llama_mlock_supported() {
  return llama_mlock::SUPPORTED;
  }

- void llama_init_backend() {
+ void llama_init_backend(bool numa) {
  ggml_time_init();

  // needed to initialize f16 tables
@@ -973,6 +825,10 @@ void llama_init_backend() {
  struct ggml_context * ctx = ggml_init(params);
  ggml_free(ctx);
  }
+
+ if (numa) {
+ ggml_numa_init();
+ }
  }

  int64_t llama_time_us() {
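llama_init_backend() now takes a NUMA flag and calls ggml_numa_init() when it is set. A minimal caller-side sketch (the argument value is only illustrative):

    // Sketch: opt in to ggml's NUMA support at startup; pass false to keep
    // the previous behaviour. Call this once before loading any model.
    #include "llama.h"

    int main() {
        llama_init_backend(/*numa=*/ true);
        return 0;
    }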
@@ -1033,7 +889,8 @@ static const char *llama_model_type_name(e_model type) {

  static void llama_model_load_internal(
  const std::string & fname,
- llama_context & lctx,
+ llama_model & model,
+ llama_vocab & vocab,
  int n_ctx,
  int n_batch,
  int n_gpu_layers,
@@ -1047,15 +904,14 @@ static void llama_model_load_internal(
  llama_progress_callback progress_callback,
  void * progress_callback_user_data) {

- lctx.t_start_us = ggml_time_us();
+ model.t_start_us = ggml_time_us();

- std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap, vocab_only));
+ std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap));

- lctx.vocab = std::move(ml->file_loaders.at(0)->vocab);
- auto & model = lctx.model;
- model.hparams = ml->file_loaders.at(0)->hparams;
+ vocab = std::move(ml->file_loader->vocab);
+ model.hparams = ml->file_loader->hparams;
  model.n_gpu_layers = n_gpu_layers;
- llama_file_version file_version = ml->file_loaders.at(0)->file_version;
+ llama_file_version file_version = ml->file_loader->file_version;
  auto & hparams = model.hparams;

  {
@@ -1089,7 +945,6 @@ static void llama_model_load_internal(
  fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot);
  fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
  fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
- fprintf(stderr, "%s: n_parts = %zu\n", __func__, ml->file_loaders.size());
  fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
  }

@@ -1122,15 +977,15 @@ static void llama_model_load_internal(

  // create the ggml context
  {
- lctx.model.buf.resize(ctx_size);
+ model.buf.resize(ctx_size);
  if (use_mlock) {
- lctx.model.mlock_buf.init(lctx.model.buf.addr);
- lctx.model.mlock_buf.grow_to(lctx.model.buf.size);
+ model.mlock_buf.init(model.buf.addr);
+ model.mlock_buf.grow_to(model.buf.size);
  }

  struct ggml_init_params params = {
- /*.mem_size =*/ lctx.model.buf.size,
- /*.mem_buffer =*/ lctx.model.buf.addr,
+ /*.mem_size =*/ model.buf.size,
+ /*.mem_buffer =*/ model.buf.addr,
  /*.no_alloc =*/ ml->use_mmap,
  };

@@ -1311,7 +1166,7 @@ static void llama_model_load_internal(
  }
  #endif

- ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
+ ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL);

  if (progress_callback) {
  progress_callback(1.0f, progress_callback_user_data);
@@ -1321,12 +1176,13 @@ static void llama_model_load_internal(

  // loading time will be recalculate after the first eval, so
  // we take page faults deferred by mmap() into consideration
- lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
+ model.t_load_us = ggml_time_us() - model.t_start_us;
  }

  static bool llama_model_load(
  const std::string & fname,
- llama_context & lctx,
+ llama_model & model,
+ llama_vocab & vocab,
  int n_ctx,
  int n_batch,
  int n_gpu_layers,
@@ -1340,7 +1196,7 @@ static bool llama_model_load(
  llama_progress_callback progress_callback,
  void *progress_callback_user_data) {
  try {
- llama_model_load_internal(fname, lctx, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
+ llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
  use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
  return true;
  } catch (const std::exception & err) {
@@ -1351,22 +1207,26 @@ static bool llama_model_load(

  // evaluate the transformer
  //
- // - lctx: llama context
- // - tokens: new batch of tokens to process
- // - n_past: the context size so far
- // - n_threads: number of threads to use
- // - cgraph_fname: filename of the exported computation graph
+ // - lctx: llama context
+ // - tokens: new batch of tokens to process
+ // - embd embeddings input
+ // - n_tokens number of tokens
+ // - n_past: the context size so far
+ // - n_threads: number of threads to use
  //
  static bool llama_eval_internal(
- llama_context & lctx,
- const llama_token * tokens,
- const int n_tokens,
- const int n_past,
- const int n_threads,
+ llama_context & lctx,
+ const llama_token * tokens,
+ const float * embd,
+ const int n_tokens,
+ const int n_past,
+ const int n_threads,
  const char * cgraph_fname) {

+ LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
+
  // enforce that the first token is BOS
- if (n_past == 0 && tokens[0] != llama_token_bos()) {
+ if (tokens && n_past == 0 && tokens[0] != llama_token_bos()) {
  fprintf(stderr, "%s: first token must be BOS\n", __func__);
  return false;
  }
@@ -1378,7 +1238,7 @@ static bool llama_eval_internal(
  const auto & model = lctx.model;
  const auto & hparams = model.hparams;

- const auto & kv_self = model.kv_self;
+ const auto & kv_self = lctx.kv_self;

  LLAMA_ASSERT(!!kv_self.ctx);

@@ -1406,12 +1266,18 @@ static bool llama_eval_internal(
  ggml_cgraph gf = {};
  gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;

- struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
- ggml_set_name(embd, "embd");
- memcpy(embd->data, tokens, N*ggml_element_size(embd));
-
  struct ggml_tensor * cur;
- struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
+ struct ggml_tensor * inpL;
+
+ if (tokens) {
+ struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+ ggml_set_name(embd, "embd");
+ memcpy(embd->data, tokens, N*ggml_element_size(embd));
+ inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
+ } else {
+ inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
+ memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
+ }

  const int i_gpu_start = n_layer - n_gpu_layers;
  (void) i_gpu_start;
@@ -1473,11 +1339,11 @@ static bool llama_eval_internal(
  offload_func_kq(tmpq);
  ggml_set_name(tmpq, "tmpq");

- struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0);
+ struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
  offload_func_kq(Kcur);
  ggml_set_name(Kcur, "Kcur");

- struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0);
+ struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
  offload_func_kq(Qcur);
  ggml_set_name(Qcur, "Qcur");

@@ -1726,7 +1592,7 @@ static bool llama_eval_internal(
  //memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N);

  // update kv token count
- lctx.model.kv_self.n = n_past + N;
+ lctx.kv_self.n = n_past + N;

  // extract logits
  {
@@ -2005,9 +1871,10 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can
  for (size_t i = 0; i < candidates->size; ++i) {
  cum_sum += candidates->data[i].p;

- // Check if the running sum is greater than p or if we have kept at least min_keep tokens
- if (cum_sum > p && i >= min_keep) {
- last_idx = i;
+ // Check if the running sum is at least p or if we have kept at least min_keep tokens
+ // we set the last index to i+1 to indicate that the current iterate should be included in the set
+ if (cum_sum >= p && i + 1 >= min_keep) {
+ last_idx = i + 1;
  break;
  }
  }
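The top-p cutoff now keeps the token that pushes the cumulative probability up to p (cum_sum >= p, storing i + 1) instead of stopping one token short. A standalone sketch of the new rule, using made-up probabilities:

    // Sketch (not library code): keep the smallest prefix whose cumulative
    // probability reaches p.
    #include <cstdio>
    #include <vector>

    int main() {
        const std::vector<float> probs = {0.5f, 0.4f, 0.1f}; // sorted, sums to 1
        const float  p        = 0.7f;
        const size_t min_keep = 1;

        size_t last_idx = probs.size();
        float  cum_sum  = 0.0f;
        for (size_t i = 0; i < probs.size(); ++i) {
            cum_sum += probs[i];
            if (cum_sum >= p && i + 1 >= min_keep) {
                last_idx = i + 1; // include the token that crossed the threshold
                break;
            }
        }
        // Prints 2: {0.5, 0.4} are kept, total mass 0.9 >= p.
        // The old check (cum_sum > p && i >= min_keep) would have kept only {0.5}.
        std::printf("kept %zu tokens\n", last_idx);
        return 0;
    }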
@@ -2432,9 +2299,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  nthread = std::thread::hardware_concurrency();
  }

- std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false,
- /*vocab_only*/ false));
- llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), params->ftype);
+ std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false));
+ llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loader.get(), params->ftype);

  #ifdef GGML_USE_K_QUANTS
  int n_attention_wv = 0;
@@ -2459,6 +2325,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  std::vector<std::thread> workers;
  std::mutex mutex;

+ auto use_more_bits = [] (int i_layer, int num_layers) -> bool {
+ return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
+ };
+
  size_t idx = 0;
  for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) {
  llama_buffer read_data;
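The per-layer bump to Q6_K for attention.wv and feed_forward.w2 is now factored into the use_more_bits() helper: the first eighth of the layers, the last eighth, and every third layer in between get the extra bits. A small sketch showing which layers qualify for a hypothetical 32-layer model (the layer count is illustrative):

    // Sketch: enumerate the layers use_more_bits() upgrades for num_layers = 32.
    #include <cstdio>

    static bool use_more_bits(int i_layer, int num_layers) {
        return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
    }

    int main() {
        const int n = 32;
        for (int i = 0; i < n; ++i) {
            if (use_more_bits(i, n)) {
                std::printf("%d ", i); // prints 0 1 2 3 6 9 12 15 18 21 24 27 28 29 30 31
            }
        }
        std::printf("\n");
        return 0;
    }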
@@ -2513,15 +2383,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
  else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
- (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8 ||
- (i_attention_wv - n_attention_wv/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
+ use_more_bits(i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
+ else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
+ (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
  ++i_attention_wv;
  } else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
  if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
  else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
- (i_feed_forward_w2 < n_feed_forward_w2/8 || i_feed_forward_w2 >= 7*n_feed_forward_w2/8 ||
- (i_feed_forward_w2 - n_feed_forward_w2/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
+ use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+ //else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_feed_forward_w2 < n_feed_forward_w2/8) new_type = GGML_TYPE_Q6_K;
  ++i_feed_forward_w2;
  } else if (tensor.name.find("attention.wo.weight") != std::string::npos) {
  if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
@@ -2630,18 +2501,47 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  }
  }

+
+
  //
  // interface implementation
  //

- struct llama_context * llama_init_from_file(
+ struct llama_model * llama_load_model_from_file(
  const char * path_model,
  struct llama_context_params params) {
  ggml_time_init();

- llama_context * ctx = new llama_context;
+ llama_model * model = new llama_model;
+
+ ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
+
+ if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
+ params.main_gpu, params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
+ params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
+ delete model;
+ fprintf(stderr, "%s: failed to load model\n", __func__);
+ return nullptr;
+ }
+
+ return model;
+ }

- if (params.seed < 0) {
+ void llama_free_model(struct llama_model * model) {
+ delete model;
+ }
+
+ struct llama_context * llama_new_context_with_model(
+ struct llama_model * model,
+ struct llama_context_params params) {
+
+ if (!model) {
+ return nullptr;
+ }
+
+ llama_context * ctx = new llama_context(*model, model->vocab);
+
+ if (params.seed == LLAMA_DEFAULT_SEED) {
  params.seed = time(NULL);
  }

@@ -2667,24 +2567,16 @@ struct llama_context * llama_init_from_file(

  ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;

- if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers, params.main_gpu,
- params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
- params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
- fprintf(stderr, "%s: failed to load model\n", __func__);
- llama_free(ctx);
- return nullptr;
- }
-
  // reserve memory for context buffers
  if (!params.vocab_only) {
- if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
+ if (!kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
  fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
  llama_free(ctx);
  return nullptr;
  }

  {
- const size_t memory_size = ggml_nbytes(ctx->model.kv_self.k) + ggml_nbytes(ctx->model.kv_self.v);
+ const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
  fprintf(stderr, "%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
  }

@@ -2736,8 +2628,8 @@ struct llama_context * llama_init_from_file(

  LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));

- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0));
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size, 0));
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0));
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.addr, ctx->kv_self.buf.size, 0));

  LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0));
  LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0));
@@ -2748,7 +2640,23 @@ struct llama_context * llama_init_from_file(
  return ctx;
  }

+ struct llama_context * llama_init_from_file(
+ const char * path_model,
+ struct llama_context_params params) {
+
+ struct llama_model * model = llama_load_model_from_file(path_model, params);
+ if (!model) {
+ return nullptr;
+ }
+ struct llama_context * ctx = llama_new_context_with_model(model, params);
+ ctx->model_owner = true;
+ return ctx;
+ }
+
  void llama_free(struct llama_context * ctx) {
+ if (ctx->model_owner) {
+ delete &ctx->model;
+ }
  delete ctx;
  }
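The loader is now split into llama_load_model_from_file() / llama_new_context_with_model(), with llama_init_from_file() kept as a wrapper whose context owns its model (model_owner). A hedged usage sketch; the file name and parameter values are placeholders:

    // Sketch: load weights once, attach a context with its own KV cache.
    #include "llama.h"

    int main() {
        llama_init_backend(false /* numa */);

        llama_context_params params = llama_context_default_params();

        llama_model * model = llama_load_model_from_file("ggml-model-q4_0.bin", params);
        if (!model) return 1;

        llama_context * ctx = llama_new_context_with_model(model, params);
        if (!ctx) { llama_free_model(model); return 1; }

        // ... evaluate and sample with ctx ...

        llama_free(ctx);          // does not free the model: model_owner is false here
        llama_free_model(model);  // llama_init_from_file() would instead hand ownership to the context
        return 0;
    }

Because the context now holds only a const reference to the model, several contexts can be created from the same llama_model without reloading the weights.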
2754
2662
 
@@ -2765,11 +2673,9 @@ int llama_model_quantize(
  }
  }

- int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
+ int llama_apply_lora_from_file_internal(const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads) {
  fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);

- auto & model = ctx->model;
-
  const int64_t t_start_lora_us = ggml_time_us();

  auto fin = std::ifstream(path_lora, std::ios::binary);
@@ -2817,7 +2723,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *

  // create a name -> tensor map of the model to accelerate lookups
  std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
- for (auto & kv: model.tensors_by_name) {
+ for (const auto & kv: model.tensors_by_name) {
  model_tensors.insert(kv);
  }

@@ -2828,7 +2734,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
  llama_buffer base_buf;
  if (path_base_model) {
  fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
- model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false));
+ model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true));

  size_t ctx_size;
  size_t mmapped_size;
@@ -2846,7 +2752,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *

  // maybe this should in llama_model_loader
  if (model_loader->use_mmap) {
- model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0));
+ model_loader->mapping.reset(new llama_mmap(&model_loader->file_loader->file, /* prefetch */ 0, ggml_is_numa()));
  }
  }

@@ -2907,7 +2813,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
  return false;
  }
  }
- ggml_tensor* lora_tensor;
+ ggml_tensor * lora_tensor;
  if (n_dims == 2) {
  lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]);
  }
@@ -2915,6 +2821,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
  fprintf(stderr, "%s: unsupported tensor dimension %d\n", __func__, n_dims);
  return 1;
  }
+ ggml_set_name(lora_tensor, "lora_tensor");

  // load tensor data
  size_t offset = fin.tellg();
@@ -2930,6 +2837,21 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
  lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) {

  ggml_tensor * dest_t = model_tensors[base_name];
+
+ offload_func_t offload_func = llama_nop;
+ offload_func_t offload_func_force_inplace = llama_nop;
+
+ #ifdef GGML_USE_CUBLAS
+ if (dest_t->backend == GGML_BACKEND_GPU || dest_t->backend == GGML_BACKEND_GPU_SPLIT) {
+ if (dest_t->type != GGML_TYPE_F16) {
+ throw std::runtime_error(format(
+ "%s: error: the simultaneous use of LoRAs and GPU acceleration is only supported for f16 models", __func__));
+ }
+ offload_func = ggml_cuda_assign_buffers;
+ offload_func_force_inplace = ggml_cuda_assign_buffers_force_inplace;
+ }
+ #endif // GGML_USE_CUBLAS
+
  ggml_tensor * base_t;
  if (model_loader) {
  // load from base model
@@ -2957,7 +2879,12 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
  }

  ggml_tensor * loraA = lora_tensors[base_name + ".loraA"];
+ GGML_ASSERT(loraA->type == GGML_TYPE_F32);
+ ggml_set_name(loraA, "loraA");
+
  ggml_tensor * loraB = lora_tensors[base_name + ".loraB"];
+ GGML_ASSERT(loraB->type == GGML_TYPE_F32);
+ ggml_set_name(loraB, "loraB");

  if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
  fprintf(stderr, "%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
@@ -2967,19 +2894,32 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *

  // w = w + BA*s
  ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
+ offload_func(BA);
+ ggml_set_name(BA, "BA");

  if (scaling != 1.0f) {
  ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
+ ggml_set_name(scale_tensor, "scale_tensor");
+
  BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor);
+ offload_func(BA);
+ ggml_set_name(BA, "BA_scaled");
  }

  ggml_tensor * r;
  if (base_t == dest_t) {
  r = ggml_add_inplace(lora_ctx, dest_t, BA);
+ offload_func_force_inplace(r);
+ ggml_set_name(r, "r_add_inplace");
  }
  else {
  r = ggml_add(lora_ctx, base_t, BA);
+ offload_func(r);
+ ggml_set_name(r, "r_add");
+
  r = ggml_cpy(lora_ctx, r, dest_t);
+ offload_func(r);
+ ggml_set_name(r, "r_cpy");
  }

  struct ggml_cgraph gf = ggml_build_forward(r);
@@ -3012,7 +2952,16 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *

  int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
  try {
- return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads);
+ return llama_apply_lora_from_file_internal(ctx->model, path_lora, path_base_model, n_threads);
+ } catch (const std::exception & err) {
+ fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
+ return 1;
+ }
+ }
+
+ int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, const char * path_base_model, int n_threads) {
+ try {
+ return llama_apply_lora_from_file_internal(*model, path_lora, path_base_model, n_threads);
  } catch (const std::exception & err) {
  fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
  return 1;
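LoRA application now works at the model level: llama_apply_lora_from_file_internal() takes a llama_model, and the new llama_model_apply_lora_from_file() patches the shared weights before any context exists. A hedged sketch; file names and thread count are placeholders:

    // Sketch: apply a LoRA adapter to a model, then create a context on top of it.
    #include "llama.h"

    int main() {
        llama_context_params params = llama_context_default_params();

        llama_model * model = llama_load_model_from_file("ggml-model-f16.bin", params);
        if (!model) return 1;

        if (llama_model_apply_lora_from_file(model, "lora-adapter.bin",
                                             /*path_base_model=*/ nullptr,
                                             /*n_threads=*/ 4) != 0) {
            llama_free_model(model);
            return 1;
        }

        llama_context * ctx = llama_new_context_with_model(model, params);
        // llama_apply_lora_from_file(ctx, ...) is still available and forwards to ctx->model.

        llama_free(ctx);
        llama_free_model(model);
        return 0;
    }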
@@ -3020,13 +2969,13 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
  }

  int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
- return ctx->model.kv_self.n;
+ return ctx->kv_self.n;
  }

  #define LLAMA_MAX_RNG_STATE (64*1024)

- void llama_set_rng_seed(struct llama_context * ctx, int seed) {
- if (seed < 0) {
+ void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed) {
+ if (seed == LLAMA_DEFAULT_SEED) {
  seed = time(NULL);
  }
  ctx->rng.seed(seed);
@@ -3045,7 +2994,7 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
  const size_t s_embedding = ctx->embedding.size() * sizeof(float);
  const size_t s_kv_size = sizeof(size_t);
  const size_t s_kv_ntok = sizeof(int);
- const size_t s_kv = ctx->model.kv_self.buf.size;
+ const size_t s_kv = ctx->kv_self.buf.size;

  const size_t s_total = (
  + s_rng_size
@@ -3111,7 +3060,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {

  // copy kv cache
  {
- const auto & kv_self = ctx->model.kv_self;
+ const auto & kv_self = ctx->kv_self;
  const auto & hparams = ctx->model.hparams;
  const int n_layer = hparams.n_layer;
  const int n_embd = hparams.n_embd;
@@ -3215,7 +3164,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {

  // set kv cache
  {
- const auto & kv_self = ctx->model.kv_self;
+ const auto & kv_self = ctx->kv_self;
  const auto & hparams = ctx->model.hparams;
  const int n_layer = hparams.n_layer;
  const int n_embd = hparams.n_embd;
@@ -3259,7 +3208,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
  ggml_free(cpy_ctx);
  }

- ctx->model.kv_self.n = kv_ntok;
+ ctx->kv_self.n = kv_ntok;
  }

  const size_t nread = inp - src;
@@ -3355,7 +3304,29 @@ int llama_eval(
  int n_tokens,
  int n_past,
  int n_threads) {
- if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads, nullptr)) {
+ if (!llama_eval_internal(*ctx, tokens, nullptr, n_tokens, n_past, n_threads, nullptr)) {
+ fprintf(stderr, "%s: failed to eval\n", __func__);
+ return 1;
+ }
+
+ // get a more accurate load time, upon first eval
+ // TODO: fix this
+ if (!ctx->has_evaluated_once) {
+ ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
+ ctx->has_evaluated_once = true;
+ }
+
+ return 0;
+ }
+
+
+ int llama_eval_embd(
+ struct llama_context * ctx,
+ const float * embd,
+ int n_tokens,
+ int n_past,
+ int n_threads) {
+ if (!llama_eval_internal(*ctx, nullptr, embd, n_tokens, n_past, n_threads, nullptr)) {
  fprintf(stderr, "%s: failed to eval\n", __func__);
  return 1;
  }
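llama_eval_internal() now accepts either token ids or a raw embedding matrix, and the embedding path is exposed as llama_eval_embd(). A hedged sketch; the embedding values are dummies and n_embd must match the loaded model:

    // Sketch: feed precomputed embeddings (one n_embd-sized row per position)
    // instead of token ids. tokens and embd are mutually exclusive inputs.
    #include "llama.h"
    #include <vector>

    int feed_embeddings(llama_context * ctx, int n_embd) {
        const int n_tokens = 4;
        std::vector<float> embd(n_tokens * n_embd, 0.0f);

        return llama_eval_embd(ctx, embd.data(), n_tokens, /*n_past=*/ 0, /*n_threads=*/ 4);
    }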
@@ -3376,7 +3347,7 @@ int llama_eval_export(struct llama_context * ctx, const char * fname) {

  const std::vector<llama_token> tmp(n_batch, llama_token_bos());

- if (!llama_eval_internal(*ctx, tmp.data(), tmp.size(), n_ctx, 1, fname)) {
+ if (!llama_eval_internal(*ctx, tmp.data(), nullptr, tmp.size(), n_ctx, 1, fname)) {
  fprintf(stderr, "%s: failed to eval\n", __func__);
  return 1;
  }
@@ -3506,6 +3477,6 @@ const char * llama_print_system_info(void) {
  }

  // For internal test use
- std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
+ const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
  return ctx->model.tensors_by_name;
  }