llama_cpp 0.2.2 → 0.3.1

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
@@ -21,9 +21,13 @@
  #endif
  #ifdef GGML_USE_K_QUANTS
  #ifndef QK_K
+ #ifdef GGML_QKK_64
+ #define QK_K 64
+ #else
  #define QK_K 256
  #endif
  #endif
+ #endif

  #include <array>
  #include <ctime>
@@ -182,6 +186,19 @@ struct llama_kv_cache {
  }
  };

+ struct llama_vocab {
+ using id = int32_t;
+ using token = std::string;
+
+ struct token_score {
+ token tok;
+ float score;
+ };
+
+ std::unordered_map<token, id> token_to_id;
+ std::vector<token_score> id_to_token;
+ };
+
  struct llama_model {
  e_model type = MODEL_UNKNOWN;

@@ -198,10 +215,6 @@ struct llama_model {
  // context
  struct ggml_context * ctx = NULL;

- // key + value cache for the self attention
- // TODO: move to llama_state
- struct llama_kv_cache kv_self;
-
  // the model memory buffer
  llama_ctx_buffer buf;

@@ -215,6 +228,11 @@ struct llama_model {
  // for quantize-stats only
  std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;

+ int64_t t_load_us = 0;
+ int64_t t_start_us = 0;
+
+ llama_vocab vocab;
+
  ~llama_model() {
  if (ctx) {
  ggml_free(ctx);
@@ -233,24 +251,11 @@ struct llama_model {
  }
  };

- struct llama_vocab {
- using id = int32_t;
- using token = std::string;
-
- struct token_score {
- token tok;
- float score;
- };
-
- std::unordered_map<token, id> token_to_id;
- std::vector<token_score> id_to_token;
- };
-
  struct llama_context {
+ llama_context(const llama_model & model, const llama_vocab & vocab) : model(model), vocab(vocab), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
+
  std::mt19937 rng;

- int64_t t_load_us = 0;
- int64_t t_start_us = 0;
  bool has_evaluated_once = false;

  int64_t t_sample_us = 0;
@@ -261,8 +266,16 @@ struct llama_context {
  int32_t n_eval = 0; // number of eval calls
  int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)

- llama_model model;
- llama_vocab vocab;
+ const llama_model & model;
+ const llama_vocab & vocab;
+
+ bool model_owner = false;
+
+ int64_t t_load_us;
+ int64_t t_start_us;
+
+ // key + value cache for the self attention
+ struct llama_kv_cache kv_self;

  size_t mem_per_token = 0;

@@ -351,96 +364,14 @@ static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml
  return size / ggml_blck_size(type);
  }

- struct llama_load_tensor_shard {
- std::vector<uint32_t> ne;
- size_t size;
- enum ggml_type type;
- size_t file_idx;
- size_t file_off;
-
- void calc_size() {
- size = llama_calc_tensor_size(ne, type);
- }
- };
-
- enum llama_split_type {
- SPLIT_NONE,
- SPLIT_BY_COLUMNS,
- SPLIT_BY_ROWS
- };
-
  struct llama_load_tensor {
- std::vector<llama_load_tensor_shard> shards;
-
  std::string name;
  enum ggml_type type = GGML_TYPE_F32;
- llama_split_type split_type = SPLIT_NONE;
  std::vector<uint32_t> ne;
+ size_t file_off;
  size_t size;
  struct ggml_tensor * ggml_tensor = NULL;
  uint8_t * data;
-
- llama_load_tensor(const std::string & name) : name(name) {}
-
- void calc_all() {
- calc_type();
- calc_split_type();
- calc_ne();
- calc_size();
- }
-
- void calc_type() {
- const auto & first_shard = shards.at(0);
- for (const auto & shard : shards) {
- if (shard.type != first_shard.type) {
- throw std::runtime_error(format("inconsistent tensor shard type in '%s'", name.c_str()));
- }
- }
- type = first_shard.type;
- }
-
- void calc_split_type() {
- if (shards.at(0).ne.size() == 1 || // 1D tensors are just duplicated in every file
- shards.size() == 1) { // only one file?
- split_type = SPLIT_NONE;
- } else if (name.find("tok_embeddings.") == 0 ||
- name.find(".attention.wo.weight") != std::string::npos ||
- name.find(".feed_forward.w2.weight") != std::string::npos) {
- split_type = SPLIT_BY_COLUMNS;
- } else {
- split_type = SPLIT_BY_ROWS;
- }
- }
-
- void calc_ne() {
- const auto & first_shard = shards.at(0);
- for (const auto & shard : shards) {
- if (shard.ne != first_shard.ne) {
- throw std::runtime_error(format("inconsistent tensor shard shape in '%s': first was %s, other was %s",
- name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str()));
- }
- }
- ne = first_shard.ne;
- LLAMA_ASSERT(shards.size() <= UINT32_MAX);
- uint32_t n_shards = (uint32_t) shards.size();
- switch (split_type) {
- case SPLIT_NONE:
- ne = first_shard.ne;
- break;
- case SPLIT_BY_COLUMNS:
- ne = {checked_mul<uint32_t>(first_shard.ne[0], n_shards),
- first_shard.ne[1]};
- break;
- case SPLIT_BY_ROWS:
- ne = {first_shard.ne[0],
- checked_mul<uint32_t>(first_shard.ne[1], n_shards)};
- break;
- }
- }
-
- void calc_size() {
- size = llama_calc_tensor_size(ne, type);
- }
  };

  struct llama_load_tensors_map {
@@ -463,13 +394,13 @@ struct llama_file_loader {
  llama_hparams hparams;
  llama_vocab vocab;

- llama_file_loader(const char * fname, size_t file_idx, llama_load_tensors_map & tensors_map)
+ llama_file_loader(const char * fname, llama_load_tensors_map & tensors_map)
  : file(fname, "rb") {
  fprintf(stderr, "llama.cpp: loading model from %s\n", fname);
  read_magic();
  read_hparams();
  read_vocab();
- read_tensor_metadata(file_idx, tensors_map);
+ read_tensor_metadata(tensors_map);
  }
  void read_magic() {
  uint32_t magic = file.read_u32();
@@ -526,19 +457,19 @@ struct llama_file_loader {
  tok_score.score = score;
  }
  }
- void read_tensor_metadata(size_t file_idx, llama_load_tensors_map & tensors_map) {
+ void read_tensor_metadata(llama_load_tensors_map & tensors_map) {
  while (file.tell() < file.size) {
- llama_load_tensor_shard shard;
+ llama_load_tensor tensor;
  uint32_t n_dims = file.read_u32();
  uint32_t name_len = file.read_u32();
- shard.type = (enum ggml_type) file.read_u32();
- shard.ne.resize(n_dims);
- file.read_raw(shard.ne.data(), sizeof(shard.ne[0]) * n_dims);
+ tensor.type = (enum ggml_type) file.read_u32();
+ tensor.ne.resize(n_dims);
+ file.read_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * n_dims);
  std::string name = file.read_string(name_len);
  if (n_dims < 1 || n_dims > 2) {
  throw std::runtime_error(format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims));
  }
- switch (shard.type) {
+ switch (tensor.type) {
  case GGML_TYPE_F32:
  case GGML_TYPE_F16:
  case GGML_TYPE_Q4_0:
@@ -553,30 +484,20 @@ struct llama_file_loader {
  case GGML_TYPE_Q6_K:
  break;
  default: {
- throw std::runtime_error(format("unrecognized tensor type %u\n", shard.type));
+ throw std::runtime_error(format("unrecognized tensor type %u\n", tensor.type));
  }
  }

- if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
- // skip to the next multiple of 32 bytes
- file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
- }
- shard.file_idx = file_idx;
- shard.file_off = file.tell();
+ // skip to the next multiple of 32 bytes
+ file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);

- shard.calc_size();
- file.seek(shard.size, SEEK_CUR);
+ tensor.file_off = file.tell();
+ tensor.name = name;
+ tensor.size = llama_calc_tensor_size(tensor.ne, tensor.type);
+ file.seek(tensor.size, SEEK_CUR);

- auto it = tensors_map.name_to_idx.find(name);
- size_t idx;
- if (it != tensors_map.name_to_idx.end()) {
- idx = it->second;
- } else {
- tensors_map.tensors.emplace_back(name);
- idx = tensors_map.tensors.size() - 1;
- tensors_map.name_to_idx.emplace(name, idx);
- }
- tensors_map.tensors.at(idx).shards.push_back(shard);
+ tensors_map.tensors.push_back(tensor);
+ tensors_map.name_to_idx[name] = tensors_map.tensors.size() - 1;
  }
  }
  };
@@ -646,56 +567,19 @@ struct llama_file_saver {
  };

  struct llama_model_loader {
- std::vector<std::unique_ptr<llama_file_loader>> file_loaders;
+ std::unique_ptr<llama_file_loader> file_loader;
  llama_load_tensors_map tensors_map;
  bool use_mmap;
  size_t num_ggml_tensors_created = 0;
  struct ggml_context * ggml_ctx = NULL;
  std::unique_ptr<llama_mmap> mapping;

- llama_model_loader(const std::string & fname_base, bool use_mmap, bool vocab_only) {
- auto * first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
- file_loaders.emplace_back(first_file);
- uint32_t n_parts = vocab_only ? 1 : guess_n_parts();
- for (uint32_t i = 1; i < n_parts; i++) {
- std::string fname = fname_base + "." + std::to_string(i);
- auto * ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
- file_loaders.emplace_back(ith_file);
- if (ith_file->hparams != first_file->hparams) {
- throw std::runtime_error(format("llama.cpp: hparams inconsistent between files"));
- }
- }
+ llama_model_loader(const std::string & fname_base, bool use_mmap) {
+ file_loader = std::unique_ptr<llama_file_loader>(new llama_file_loader(fname_base.c_str(), tensors_map));
  if (!llama_mmap::SUPPORTED) {
  use_mmap = false;
  }
- if (use_mmap && alignment_prevents_mmap()) {
- fprintf(stderr, "llama.cpp: can't use mmap because tensors are not aligned; convert to new format to avoid this\n");
- use_mmap = false;
- }
  this->use_mmap = use_mmap;
- for (llama_load_tensor & lt : tensors_map.tensors) {
- lt.calc_all();
- }
- }
-
- bool alignment_prevents_mmap() {
- for (const llama_load_tensor & lt : tensors_map.tensors) {
- for (const llama_load_tensor_shard & shard : lt.shards) {
- if (shard.file_off & 3) {
- return true;
- }
- }
- }
- return false;
- }
-
- uint32_t guess_n_parts() const {
- auto it = tensors_map.name_to_idx.find("tok_embeddings.weight");
- if (it == tensors_map.name_to_idx.end()) {
- throw std::runtime_error(std::string("missing tok_embeddings.weight"));
- }
- const llama_load_tensor & lt = tensors_map.tensors.at(it->second);
- return file_loaders.at(0)->hparams.n_embd / lt.shards.at(0).ne.at(0);
  }

  void calc_sizes(size_t * ctx_size_p, size_t * mmapped_size_p) const {
@@ -761,7 +645,7 @@ struct llama_model_loader {
  }

  if (use_mmap) {
- mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
+ mapping.reset(new llama_mmap(&file_loader->file, prefetch_size, ggml_is_numa()));
  if (lmlock) {
  lmlock->init(mapping->addr);
  }
@@ -817,45 +701,13 @@ struct llama_model_loader {

  void load_data_for(llama_load_tensor & lt) {
  if (use_mmap) {
- LLAMA_ASSERT(lt.shards.size() == 1);
- lt.data = (uint8_t *) mapping->addr + lt.shards.at(0).file_off;
- } else if (lt.split_type == SPLIT_NONE) {
- llama_file & file = file_loaders.at(lt.shards.at(0).file_idx)->file;
- file.seek(lt.shards.at(0).file_off, SEEK_SET);
+ lt.data = (uint8_t *) mapping->addr + lt.file_off;
+ } else {
+ llama_file & file = file_loader->file;
+ file.seek(lt.file_off, SEEK_SET);
  file.read_raw(lt.data, lt.size);
- } else if (lt.split_type == SPLIT_BY_ROWS) {
- size_t offset = 0;
- for (llama_load_tensor_shard & shard : lt.shards) {
- llama_file & file = file_loaders.at(shard.file_idx)->file;
- file.seek(shard.file_off, SEEK_SET);
- file.read_raw(lt.data + offset, shard.size);
- offset += shard.size;
- }
- LLAMA_ASSERT(offset == lt.size);
- } else if (lt.split_type == SPLIT_BY_COLUMNS) {
- // Let's load the data into temporary buffers to ensure the OS performs large loads.
- std::vector<llama_buffer> tmp_bufs(lt.shards.size());
- for (size_t i = 0; i < lt.shards.size(); i++) {
- llama_load_tensor_shard & shard = lt.shards.at(i);
- llama_file & file = file_loaders.at(shard.file_idx)->file;
- file.seek(shard.file_off, SEEK_SET);
- tmp_bufs.at(i).resize(shard.size);
- file.read_raw(tmp_bufs.at(i).addr, shard.size);
- }
- // Then reshape.
- size_t num_rows = lt.ne.at(1);
- size_t per_shard_row_size = lt.shards.at(0).size / num_rows;
- size_t out_offset = 0;
- for (size_t row = 0; row < num_rows; row++) {
- for (llama_buffer & tmp_buf : tmp_bufs) {
- memcpy(lt.data + out_offset,
- tmp_buf.addr + row * per_shard_row_size,
- per_shard_row_size);
- out_offset += per_shard_row_size;
- }
- }
- LLAMA_ASSERT(out_offset == lt.size);
  }
+
  if (0) {
  print_checksum(lt);
  }
@@ -925,7 +777,7 @@ static bool kv_cache_init(

  struct llama_context_params llama_context_default_params() {
  struct llama_context_params result = {
- /*.seed =*/ -1,
+ /*.seed =*/ LLAMA_DEFAULT_SEED,
  /*.n_ctx =*/ 512,
  /*.n_batch =*/ 512,
  /*.gpu_layers =*/ 0,
@@ -964,7 +816,7 @@ bool llama_mlock_supported() {
  return llama_mlock::SUPPORTED;
  }

- void llama_init_backend() {
+ void llama_init_backend(bool numa) {
  ggml_time_init();

  // needed to initialize f16 tables
@@ -973,6 +825,10 @@ void llama_init_backend() {
  struct ggml_context * ctx = ggml_init(params);
  ggml_free(ctx);
  }
+
+ if (numa) {
+ ggml_numa_init();
+ }
  }

  int64_t llama_time_us() {
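The two hunks above extend llama_init_backend with a numa flag that forwards to ggml_numa_init(). A minimal caller-side sketch, not part of the diff, assuming the 0.3.1 llama.h:

    #include "llama.h"

    int main() {
        // Pass true only on NUMA machines; inside llama_init_backend()
        // the flag simply triggers ggml_numa_init().
        llama_init_backend(/*numa =*/ false);
        // ... load a model and run inference ...
        return 0;
    }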
@@ -1033,7 +889,8 @@ static const char *llama_model_type_name(e_model type) {

  static void llama_model_load_internal(
  const std::string & fname,
- llama_context & lctx,
+ llama_model & model,
+ llama_vocab & vocab,
  int n_ctx,
  int n_batch,
  int n_gpu_layers,
@@ -1047,15 +904,14 @@ static void llama_model_load_internal(
  llama_progress_callback progress_callback,
  void * progress_callback_user_data) {

- lctx.t_start_us = ggml_time_us();
+ model.t_start_us = ggml_time_us();

- std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap, vocab_only));
+ std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap));

- lctx.vocab = std::move(ml->file_loaders.at(0)->vocab);
- auto & model = lctx.model;
- model.hparams = ml->file_loaders.at(0)->hparams;
+ vocab = std::move(ml->file_loader->vocab);
+ model.hparams = ml->file_loader->hparams;
  model.n_gpu_layers = n_gpu_layers;
- llama_file_version file_version = ml->file_loaders.at(0)->file_version;
+ llama_file_version file_version = ml->file_loader->file_version;
  auto & hparams = model.hparams;

  {
@@ -1089,7 +945,6 @@ static void llama_model_load_internal(
  fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot);
  fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
  fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
- fprintf(stderr, "%s: n_parts = %zu\n", __func__, ml->file_loaders.size());
  fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
  }

@@ -1122,15 +977,15 @@ static void llama_model_load_internal(

  // create the ggml context
  {
- lctx.model.buf.resize(ctx_size);
+ model.buf.resize(ctx_size);
  if (use_mlock) {
- lctx.model.mlock_buf.init(lctx.model.buf.addr);
- lctx.model.mlock_buf.grow_to(lctx.model.buf.size);
+ model.mlock_buf.init(model.buf.addr);
+ model.mlock_buf.grow_to(model.buf.size);
  }

  struct ggml_init_params params = {
- /*.mem_size =*/ lctx.model.buf.size,
- /*.mem_buffer =*/ lctx.model.buf.addr,
+ /*.mem_size =*/ model.buf.size,
+ /*.mem_buffer =*/ model.buf.addr,
  /*.no_alloc =*/ ml->use_mmap,
  };

@@ -1311,7 +1166,7 @@ static void llama_model_load_internal(
  }
  #endif

- ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
+ ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL);

  if (progress_callback) {
  progress_callback(1.0f, progress_callback_user_data);
@@ -1321,12 +1176,13 @@ static void llama_model_load_internal(

  // loading time will be recalculate after the first eval, so
  // we take page faults deferred by mmap() into consideration
- lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
+ model.t_load_us = ggml_time_us() - model.t_start_us;
  }

  static bool llama_model_load(
  const std::string & fname,
- llama_context & lctx,
+ llama_model & model,
+ llama_vocab & vocab,
  int n_ctx,
  int n_batch,
  int n_gpu_layers,
@@ -1340,7 +1196,7 @@ static bool llama_model_load(
  llama_progress_callback progress_callback,
  void *progress_callback_user_data) {
  try {
- llama_model_load_internal(fname, lctx, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
+ llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
  use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
  return true;
  } catch (const std::exception & err) {
@@ -1351,22 +1207,26 @@ static bool llama_model_load(

  // evaluate the transformer
  //
- // - lctx: llama context
- // - tokens: new batch of tokens to process
- // - n_past: the context size so far
- // - n_threads: number of threads to use
- // - cgraph_fname: filename of the exported computation graph
+ // - lctx: llama context
+ // - tokens: new batch of tokens to process
+ // - embd embeddings input
+ // - n_tokens number of tokens
+ // - n_past: the context size so far
+ // - n_threads: number of threads to use
  //
  static bool llama_eval_internal(
- llama_context & lctx,
- const llama_token * tokens,
- const int n_tokens,
- const int n_past,
- const int n_threads,
+ llama_context & lctx,
+ const llama_token * tokens,
+ const float * embd,
+ const int n_tokens,
+ const int n_past,
+ const int n_threads,
  const char * cgraph_fname) {

+ LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
+
  // enforce that the first token is BOS
- if (n_past == 0 && tokens[0] != llama_token_bos()) {
+ if (tokens && n_past == 0 && tokens[0] != llama_token_bos()) {
  fprintf(stderr, "%s: first token must be BOS\n", __func__);
  return false;
  }
@@ -1378,7 +1238,7 @@ static bool llama_eval_internal(
  const auto & model = lctx.model;
  const auto & hparams = model.hparams;

- const auto & kv_self = model.kv_self;
+ const auto & kv_self = lctx.kv_self;

  LLAMA_ASSERT(!!kv_self.ctx);

@@ -1406,12 +1266,18 @@ static bool llama_eval_internal(
  ggml_cgraph gf = {};
  gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;

- struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
- ggml_set_name(embd, "embd");
- memcpy(embd->data, tokens, N*ggml_element_size(embd));
-
  struct ggml_tensor * cur;
- struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
+ struct ggml_tensor * inpL;
+
+ if (tokens) {
+ struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+ ggml_set_name(embd, "embd");
+ memcpy(embd->data, tokens, N*ggml_element_size(embd));
+ inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
+ } else {
+ inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
+ memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
+ }

  const int i_gpu_start = n_layer - n_gpu_layers;
  (void) i_gpu_start;
@@ -1473,11 +1339,11 @@ static bool llama_eval_internal(
  offload_func_kq(tmpq);
  ggml_set_name(tmpq, "tmpq");

- struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0);
+ struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
  offload_func_kq(Kcur);
  ggml_set_name(Kcur, "Kcur");

- struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0);
+ struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
  offload_func_kq(Qcur);
  ggml_set_name(Qcur, "Qcur");

@@ -1726,7 +1592,7 @@ static bool llama_eval_internal(
  //memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N);

  // update kv token count
- lctx.model.kv_self.n = n_past + N;
+ lctx.kv_self.n = n_past + N;

  // extract logits
  {
@@ -2005,9 +1871,10 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can
  for (size_t i = 0; i < candidates->size; ++i) {
  cum_sum += candidates->data[i].p;

- // Check if the running sum is greater than p or if we have kept at least min_keep tokens
- if (cum_sum > p && i >= min_keep) {
- last_idx = i;
+ // Check if the running sum is at least p or if we have kept at least min_keep tokens
+ // we set the last index to i+1 to indicate that the current iterate should be included in the set
+ if (cum_sum >= p && i + 1 >= min_keep) {
+ last_idx = i + 1;
  break;
  }
  }
@@ -2432,9 +2299,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  nthread = std::thread::hardware_concurrency();
  }

- std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false,
- /*vocab_only*/ false));
- llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), params->ftype);
+ std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false));
+ llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loader.get(), params->ftype);

  #ifdef GGML_USE_K_QUANTS
  int n_attention_wv = 0;
@@ -2459,6 +2325,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  std::vector<std::thread> workers;
  std::mutex mutex;

+ auto use_more_bits = [] (int i_layer, int num_layers) -> bool {
+ return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
+ };
+
  size_t idx = 0;
  for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) {
  llama_buffer read_data;
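As a worked example of the use_more_bits heuristic above (not part of the diff): with num_layers = 32, num_layers/8 = 4 and 7*num_layers/8 = 28, so the lambda returns true for layers 0-3, 28-31, and every third layer in between starting at 6 (6, 9, ..., 27). Under Q4_K_M / Q5_K_M the corresponding attention.wv and feed_forward.w2 tensors are bumped to Q6_K in the hunk that follows.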
@@ -2513,15 +2383,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
  else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
- (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8 ||
- (i_attention_wv - n_attention_wv/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
+ use_more_bits(i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
+ else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
+ (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
  ++i_attention_wv;
  } else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
  if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
  else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
- (i_feed_forward_w2 < n_feed_forward_w2/8 || i_feed_forward_w2 >= 7*n_feed_forward_w2/8 ||
- (i_feed_forward_w2 - n_feed_forward_w2/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
+ use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+ //else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_feed_forward_w2 < n_feed_forward_w2/8) new_type = GGML_TYPE_Q6_K;
  ++i_feed_forward_w2;
  } else if (tensor.name.find("attention.wo.weight") != std::string::npos) {
  if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
@@ -2630,18 +2501,47 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  }
  }

+
+
  //
  // interface implementation
  //

- struct llama_context * llama_init_from_file(
+ struct llama_model * llama_load_model_from_file(
  const char * path_model,
  struct llama_context_params params) {
  ggml_time_init();

- llama_context * ctx = new llama_context;
+ llama_model * model = new llama_model;
+
+ ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
+
+ if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
+ params.main_gpu, params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
+ params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
+ delete model;
+ fprintf(stderr, "%s: failed to load model\n", __func__);
+ return nullptr;
+ }
+
+ return model;
+ }

- if (params.seed < 0) {
+ void llama_free_model(struct llama_model * model) {
+ delete model;
+ }
+
+ struct llama_context * llama_new_context_with_model(
+ struct llama_model * model,
+ struct llama_context_params params) {
+
+ if (!model) {
+ return nullptr;
+ }
+
+ llama_context * ctx = new llama_context(*model, model->vocab);
+
+ if (params.seed == LLAMA_DEFAULT_SEED) {
  params.seed = time(NULL);
  }

@@ -2667,24 +2567,16 @@ struct llama_context * llama_init_from_file(

  ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;

- if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers, params.main_gpu,
- params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
- params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
- fprintf(stderr, "%s: failed to load model\n", __func__);
- llama_free(ctx);
- return nullptr;
- }
-
  // reserve memory for context buffers
  if (!params.vocab_only) {
- if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
+ if (!kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
  fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
  llama_free(ctx);
  return nullptr;
  }

  {
- const size_t memory_size = ggml_nbytes(ctx->model.kv_self.k) + ggml_nbytes(ctx->model.kv_self.v);
+ const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
  fprintf(stderr, "%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
  }

@@ -2736,8 +2628,8 @@ struct llama_context * llama_init_from_file(

  LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));

- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0));
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size, 0));
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0));
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.addr, ctx->kv_self.buf.size, 0));

  LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0));
  LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0));
@@ -2748,7 +2640,23 @@ struct llama_context * llama_init_from_file(
  return ctx;
  }

+ struct llama_context * llama_init_from_file(
+ const char * path_model,
+ struct llama_context_params params) {
+
+ struct llama_model * model = llama_load_model_from_file(path_model, params);
+ if (!model) {
+ return nullptr;
+ }
+ struct llama_context * ctx = llama_new_context_with_model(model, params);
+ ctx->model_owner = true;
+ return ctx;
+ }
+
  void llama_free(struct llama_context * ctx) {
+ if (ctx->model_owner) {
+ delete &ctx->model;
+ }
  delete ctx;
  }

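The hunks above split llama_init_from_file into llama_load_model_from_file plus llama_new_context_with_model, keeping llama_init_from_file as a wrapper whose context owns the model (model_owner). A minimal caller-side sketch of the new API, not part of the diff, assuming the 0.3.1 llama.h and a placeholder model path:

    #include "llama.h"
    #include <cstdio>

    int main() {
        llama_init_backend(/*numa =*/ false);

        llama_context_params params = llama_context_default_params();

        // Load the weights once; several contexts can now share one llama_model.
        llama_model * model = llama_load_model_from_file("ggml-model.bin", params);
        if (!model) {
            fprintf(stderr, "failed to load model\n");
            return 1;
        }

        llama_context * ctx = llama_new_context_with_model(model, params);

        // ... tokenize, llama_eval(), sample ...

        // A context created this way does not own the model, so free both.
        llama_free(ctx);
        llama_free_model(model);
        return 0;
    }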
@@ -2765,11 +2673,9 @@ int llama_model_quantize(
  }
  }

- int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
+ int llama_apply_lora_from_file_internal(const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads) {
  fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);

- auto & model = ctx->model;
-
  const int64_t t_start_lora_us = ggml_time_us();

  auto fin = std::ifstream(path_lora, std::ios::binary);
@@ -2817,7 +2723,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *

  // create a name -> tensor map of the model to accelerate lookups
  std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
- for (auto & kv: model.tensors_by_name) {
+ for (const auto & kv: model.tensors_by_name) {
  model_tensors.insert(kv);
  }

@@ -2828,7 +2734,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
  llama_buffer base_buf;
  if (path_base_model) {
  fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
- model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false));
+ model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true));

  size_t ctx_size;
  size_t mmapped_size;
@@ -2846,7 +2752,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *

  // maybe this should in llama_model_loader
  if (model_loader->use_mmap) {
- model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0));
+ model_loader->mapping.reset(new llama_mmap(&model_loader->file_loader->file, /* prefetch */ 0, ggml_is_numa()));
  }
  }

@@ -2907,7 +2813,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
  return false;
  }
  }
- ggml_tensor* lora_tensor;
+ ggml_tensor * lora_tensor;
  if (n_dims == 2) {
  lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]);
  }
@@ -2915,6 +2821,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
  fprintf(stderr, "%s: unsupported tensor dimension %d\n", __func__, n_dims);
  return 1;
  }
+ ggml_set_name(lora_tensor, "lora_tensor");

  // load tensor data
  size_t offset = fin.tellg();
@@ -2930,6 +2837,21 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
  lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) {

  ggml_tensor * dest_t = model_tensors[base_name];
+
+ offload_func_t offload_func = llama_nop;
+ offload_func_t offload_func_force_inplace = llama_nop;
+
+ #ifdef GGML_USE_CUBLAS
+ if (dest_t->backend == GGML_BACKEND_GPU || dest_t->backend == GGML_BACKEND_GPU_SPLIT) {
+ if (dest_t->type != GGML_TYPE_F16) {
+ throw std::runtime_error(format(
+ "%s: error: the simultaneous use of LoRAs and GPU acceleration is only supported for f16 models", __func__));
+ }
+ offload_func = ggml_cuda_assign_buffers;
+ offload_func_force_inplace = ggml_cuda_assign_buffers_force_inplace;
+ }
+ #endif // GGML_USE_CUBLAS
+
  ggml_tensor * base_t;
  if (model_loader) {
  // load from base model
@@ -2957,7 +2879,12 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
  }

  ggml_tensor * loraA = lora_tensors[base_name + ".loraA"];
+ GGML_ASSERT(loraA->type == GGML_TYPE_F32);
+ ggml_set_name(loraA, "loraA");
+
  ggml_tensor * loraB = lora_tensors[base_name + ".loraB"];
+ GGML_ASSERT(loraB->type == GGML_TYPE_F32);
+ ggml_set_name(loraB, "loraB");

  if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
  fprintf(stderr, "%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
@@ -2967,19 +2894,32 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *

  // w = w + BA*s
  ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
+ offload_func(BA);
+ ggml_set_name(BA, "BA");

  if (scaling != 1.0f) {
  ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
+ ggml_set_name(scale_tensor, "scale_tensor");
+
  BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor);
+ offload_func(BA);
+ ggml_set_name(BA, "BA_scaled");
  }

  ggml_tensor * r;
  if (base_t == dest_t) {
  r = ggml_add_inplace(lora_ctx, dest_t, BA);
+ offload_func_force_inplace(r);
+ ggml_set_name(r, "r_add_inplace");
  }
  else {
  r = ggml_add(lora_ctx, base_t, BA);
+ offload_func(r);
+ ggml_set_name(r, "r_add");
+
  r = ggml_cpy(lora_ctx, r, dest_t);
+ offload_func(r);
+ ggml_set_name(r, "r_cpy");
  }

  struct ggml_cgraph gf = ggml_build_forward(r);
@@ -3012,7 +2952,16 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *

  int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
  try {
- return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads);
+ return llama_apply_lora_from_file_internal(ctx->model, path_lora, path_base_model, n_threads);
+ } catch (const std::exception & err) {
+ fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
+ return 1;
+ }
+ }
+
+ int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, const char * path_base_model, int n_threads) {
+ try {
+ return llama_apply_lora_from_file_internal(*model, path_lora, path_base_model, n_threads);
  } catch (const std::exception & err) {
  fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
  return 1;
@@ -3020,13 +2969,13 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
  }

  int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
- return ctx->model.kv_self.n;
+ return ctx->kv_self.n;
  }

  #define LLAMA_MAX_RNG_STATE (64*1024)

- void llama_set_rng_seed(struct llama_context * ctx, int seed) {
- if (seed < 0) {
+ void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed) {
+ if (seed == LLAMA_DEFAULT_SEED) {
  seed = time(NULL);
  }
  ctx->rng.seed(seed);
@@ -3045,7 +2994,7 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
  const size_t s_embedding = ctx->embedding.size() * sizeof(float);
  const size_t s_kv_size = sizeof(size_t);
  const size_t s_kv_ntok = sizeof(int);
- const size_t s_kv = ctx->model.kv_self.buf.size;
+ const size_t s_kv = ctx->kv_self.buf.size;

  const size_t s_total = (
  + s_rng_size
@@ -3111,7 +3060,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {

  // copy kv cache
  {
- const auto & kv_self = ctx->model.kv_self;
+ const auto & kv_self = ctx->kv_self;
  const auto & hparams = ctx->model.hparams;
  const int n_layer = hparams.n_layer;
  const int n_embd = hparams.n_embd;
@@ -3215,7 +3164,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {

  // set kv cache
  {
- const auto & kv_self = ctx->model.kv_self;
+ const auto & kv_self = ctx->kv_self;
  const auto & hparams = ctx->model.hparams;
  const int n_layer = hparams.n_layer;
  const int n_embd = hparams.n_embd;
@@ -3259,7 +3208,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
  ggml_free(cpy_ctx);
  }

- ctx->model.kv_self.n = kv_ntok;
+ ctx->kv_self.n = kv_ntok;
  }

  const size_t nread = inp - src;
@@ -3355,7 +3304,29 @@ int llama_eval(
  int n_tokens,
  int n_past,
  int n_threads) {
- if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads, nullptr)) {
+ if (!llama_eval_internal(*ctx, tokens, nullptr, n_tokens, n_past, n_threads, nullptr)) {
+ fprintf(stderr, "%s: failed to eval\n", __func__);
+ return 1;
+ }
+
+ // get a more accurate load time, upon first eval
+ // TODO: fix this
+ if (!ctx->has_evaluated_once) {
+ ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
+ ctx->has_evaluated_once = true;
+ }
+
+ return 0;
+ }
+
+
+ int llama_eval_embd(
+ struct llama_context * ctx,
+ const float * embd,
+ int n_tokens,
+ int n_past,
+ int n_threads) {
+ if (!llama_eval_internal(*ctx, nullptr, embd, n_tokens, n_past, n_threads, nullptr)) {
  fprintf(stderr, "%s: failed to eval\n", __func__);
  return 1;
  }
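The hunk above adds llama_eval_embd, which feeds pre-computed embeddings (n_tokens rows of n_embd floats) instead of token ids into llama_eval_internal. A minimal caller-side sketch, not part of the diff, assuming the 0.3.1 llama.h and a context created as in the earlier example:

    #include "llama.h"
    #include <cstdio>
    #include <vector>

    // Evaluate externally computed embeddings; n_past and n_threads behave as in llama_eval().
    static bool eval_external_embeddings(llama_context * ctx, int n_threads) {
        const int n_embd   = llama_n_embd(ctx);
        const int n_tokens = 4;                           // hypothetical number of positions
        std::vector<float> embd(n_tokens * n_embd, 0.0f); // fill from an external encoder

        if (llama_eval_embd(ctx, embd.data(), n_tokens, /*n_past =*/ 0, n_threads) != 0) {
            fprintf(stderr, "llama_eval_embd failed\n");
            return false;
        }
        return true;
    }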
@@ -3376,7 +3347,7 @@ int llama_eval_export(struct llama_context * ctx, const char * fname) {

  const std::vector<llama_token> tmp(n_batch, llama_token_bos());

- if (!llama_eval_internal(*ctx, tmp.data(), tmp.size(), n_ctx, 1, fname)) {
+ if (!llama_eval_internal(*ctx, tmp.data(), nullptr, tmp.size(), n_ctx, 1, fname)) {
  fprintf(stderr, "%s: failed to eval\n", __func__);
  return 1;
  }
@@ -3506,6 +3477,6 @@ const char * llama_print_system_info(void) {
  }

  // For internal test use
- std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
+ const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
  return ctx->model.tensors_by_name;
  }