llama_cpp 0.3.0 → 0.3.2

@@ -66,6 +66,7 @@ enum e_model {
66
66
  MODEL_65B,
67
67
  };
68
68
 
69
+ static const size_t kB = 1024;
69
70
  static const size_t MB = 1024*1024;
70
71
 
71
72
  // computed for n_ctx == 2048
@@ -129,6 +130,34 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
129
130
  return k_sizes;
130
131
  }
131
132
 
133
+ // amount of VRAM needed per batch size to hold temporary results
134
+ // the values for 3b and 65b are not derived from testing but instead chosen conservatively
135
+ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
136
+ {
137
+ static std::map<e_model, size_t> k_sizes = {
138
+ { MODEL_3B, 512ull * kB },
139
+ { MODEL_7B, 512ull * kB },
140
+ { MODEL_13B, 640ull * kB },
141
+ { MODEL_30B, 768ull * kB },
142
+ { MODEL_65B, 1536ull * kB },
143
+ };
144
+ return k_sizes;
145
+ }
146
+
147
+ // amount of VRAM needed per batch size and context to hold temporary results
148
+ // the values for 3b and 65b are not derived from testing but instead chosen conservatively
149
+ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
150
+ {
151
+ static std::map<e_model, size_t> k_sizes = {
152
+ { MODEL_3B, 128ull },
153
+ { MODEL_7B, 128ull },
154
+ { MODEL_13B, 160ull },
155
+ { MODEL_30B, 208ull },
156
+ { MODEL_65B, 416ull },
157
+ };
158
+ return k_sizes;
159
+ }
160
+
132
161
  // default hparams (LLaMA 7B)
133
162
  struct llama_hparams {
134
163
  uint32_t n_vocab = 32000;
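The two tables above are keyed by model size: VRAM_REQ_SCRATCH_BASE is a per-batch-element base cost (in bytes, built from the new kB constant) and VRAM_REQ_SCRATCH_PER_CONTEXT is an additional per-batch-element cost for every context token. They feed the scratch-buffer formula introduced further down in this diff; a minimal sketch (the helper name estimate_vram_scratch is made up for illustration):

```cpp
// Sketch only: restates the vram_scratch computation added later in this diff.
static size_t estimate_vram_scratch(e_model type, size_t n_batch, size_t n_ctx) {
    const size_t base    = VRAM_REQ_SCRATCH_BASE().at(type);        // e.g. 512 kB for MODEL_7B
    const size_t per_ctx = VRAM_REQ_SCRATCH_PER_CONTEXT().at(type); // e.g. 128 B  for MODEL_7B
    return n_batch * (base + n_ctx * per_ctx);
}
```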
@@ -165,8 +194,8 @@ struct llama_layer {
165
194
  };
166
195
 
167
196
  struct llama_kv_cache {
168
- struct ggml_tensor * k;
169
- struct ggml_tensor * v;
197
+ struct ggml_tensor * k = NULL;
198
+ struct ggml_tensor * v = NULL;
170
199
 
171
200
  struct ggml_context * ctx = NULL;
172
201
 
@@ -253,7 +282,13 @@ struct llama_model {
253
282
 
254
283
  struct llama_context {
255
284
  llama_context(const llama_model & model, const llama_vocab & vocab) : model(model), vocab(vocab), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
256
-
285
+ #ifdef GGML_USE_METAL
286
+ ~llama_context() {
287
+ if (ctx_metal) {
288
+ ggml_metal_free(ctx_metal);
289
+ }
290
+ }
291
+ #endif
257
292
  std::mt19937 rng;
258
293
 
259
294
  bool has_evaluated_once = false;
@@ -364,96 +399,14 @@ static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml
364
399
  return size / ggml_blck_size(type);
365
400
  }
366
401
 
367
- struct llama_load_tensor_shard {
368
- std::vector<uint32_t> ne;
369
- size_t size;
370
- enum ggml_type type;
371
- size_t file_idx;
372
- size_t file_off;
373
-
374
- void calc_size() {
375
- size = llama_calc_tensor_size(ne, type);
376
- }
377
- };
378
-
379
- enum llama_split_type {
380
- SPLIT_NONE,
381
- SPLIT_BY_COLUMNS,
382
- SPLIT_BY_ROWS
383
- };
384
-
385
402
  struct llama_load_tensor {
386
- std::vector<llama_load_tensor_shard> shards;
387
-
388
403
  std::string name;
389
404
  enum ggml_type type = GGML_TYPE_F32;
390
- llama_split_type split_type = SPLIT_NONE;
391
405
  std::vector<uint32_t> ne;
406
+ size_t file_off;
392
407
  size_t size;
393
408
  struct ggml_tensor * ggml_tensor = NULL;
394
409
  uint8_t * data;
395
-
396
- llama_load_tensor(const std::string & name) : name(name) {}
397
-
398
- void calc_all() {
399
- calc_type();
400
- calc_split_type();
401
- calc_ne();
402
- calc_size();
403
- }
404
-
405
- void calc_type() {
406
- const auto & first_shard = shards.at(0);
407
- for (const auto & shard : shards) {
408
- if (shard.type != first_shard.type) {
409
- throw std::runtime_error(format("inconsistent tensor shard type in '%s'", name.c_str()));
410
- }
411
- }
412
- type = first_shard.type;
413
- }
414
-
415
- void calc_split_type() {
416
- if (shards.at(0).ne.size() == 1 || // 1D tensors are just duplicated in every file
417
- shards.size() == 1) { // only one file?
418
- split_type = SPLIT_NONE;
419
- } else if (name.find("tok_embeddings.") == 0 ||
420
- name.find(".attention.wo.weight") != std::string::npos ||
421
- name.find(".feed_forward.w2.weight") != std::string::npos) {
422
- split_type = SPLIT_BY_COLUMNS;
423
- } else {
424
- split_type = SPLIT_BY_ROWS;
425
- }
426
- }
427
-
428
- void calc_ne() {
429
- const auto & first_shard = shards.at(0);
430
- for (const auto & shard : shards) {
431
- if (shard.ne != first_shard.ne) {
432
- throw std::runtime_error(format("inconsistent tensor shard shape in '%s': first was %s, other was %s",
433
- name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str()));
434
- }
435
- }
436
- ne = first_shard.ne;
437
- LLAMA_ASSERT(shards.size() <= UINT32_MAX);
438
- uint32_t n_shards = (uint32_t) shards.size();
439
- switch (split_type) {
440
- case SPLIT_NONE:
441
- ne = first_shard.ne;
442
- break;
443
- case SPLIT_BY_COLUMNS:
444
- ne = {checked_mul<uint32_t>(first_shard.ne[0], n_shards),
445
- first_shard.ne[1]};
446
- break;
447
- case SPLIT_BY_ROWS:
448
- ne = {first_shard.ne[0],
449
- checked_mul<uint32_t>(first_shard.ne[1], n_shards)};
450
- break;
451
- }
452
- }
453
-
454
- void calc_size() {
455
- size = llama_calc_tensor_size(ne, type);
456
- }
457
410
  };
458
411
 
459
412
  struct llama_load_tensors_map {
@@ -476,13 +429,13 @@ struct llama_file_loader {
476
429
  llama_hparams hparams;
477
430
  llama_vocab vocab;
478
431
 
479
- llama_file_loader(const char * fname, size_t file_idx, llama_load_tensors_map & tensors_map)
432
+ llama_file_loader(const char * fname, llama_load_tensors_map & tensors_map)
480
433
  : file(fname, "rb") {
481
434
  fprintf(stderr, "llama.cpp: loading model from %s\n", fname);
482
435
  read_magic();
483
436
  read_hparams();
484
437
  read_vocab();
485
- read_tensor_metadata(file_idx, tensors_map);
438
+ read_tensor_metadata(tensors_map);
486
439
  }
487
440
  void read_magic() {
488
441
  uint32_t magic = file.read_u32();
@@ -528,9 +481,7 @@ struct llama_file_loader {
528
481
  std::string word = file.read_string(len);
529
482
 
530
483
  float score = 0.0f;
531
- if (file_version >= LLAMA_FILE_VERSION_GGMF_V1) {
532
- file.read_raw(&score, sizeof(score));
533
- }
484
+ file.read_raw(&score, sizeof(score));
534
485
 
535
486
  vocab.token_to_id[word] = i;
536
487
 
@@ -539,19 +490,19 @@ struct llama_file_loader {
539
490
  tok_score.score = score;
540
491
  }
541
492
  }
542
- void read_tensor_metadata(size_t file_idx, llama_load_tensors_map & tensors_map) {
493
+ void read_tensor_metadata(llama_load_tensors_map & tensors_map) {
543
494
  while (file.tell() < file.size) {
544
- llama_load_tensor_shard shard;
495
+ llama_load_tensor tensor;
545
496
  uint32_t n_dims = file.read_u32();
546
497
  uint32_t name_len = file.read_u32();
547
- shard.type = (enum ggml_type) file.read_u32();
548
- shard.ne.resize(n_dims);
549
- file.read_raw(shard.ne.data(), sizeof(shard.ne[0]) * n_dims);
498
+ tensor.type = (enum ggml_type) file.read_u32();
499
+ tensor.ne.resize(n_dims);
500
+ file.read_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * n_dims);
550
501
  std::string name = file.read_string(name_len);
551
502
  if (n_dims < 1 || n_dims > 2) {
552
503
  throw std::runtime_error(format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims));
553
504
  }
554
- switch (shard.type) {
505
+ switch (tensor.type) {
555
506
  case GGML_TYPE_F32:
556
507
  case GGML_TYPE_F16:
557
508
  case GGML_TYPE_Q4_0:
@@ -566,30 +517,20 @@ struct llama_file_loader {
566
517
  case GGML_TYPE_Q6_K:
567
518
  break;
568
519
  default: {
569
- throw std::runtime_error(format("unrecognized tensor type %u\n", shard.type));
520
+ throw std::runtime_error(format("unrecognized tensor type %u\n", tensor.type));
570
521
  }
571
522
  }
572
523
 
573
- if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
574
- // skip to the next multiple of 32 bytes
575
- file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
576
- }
577
- shard.file_idx = file_idx;
578
- shard.file_off = file.tell();
524
+ // skip to the next multiple of 32 bytes
525
+ file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
579
526
 
580
- shard.calc_size();
581
- file.seek(shard.size, SEEK_CUR);
527
+ tensor.file_off = file.tell();
528
+ tensor.name = name;
529
+ tensor.size = llama_calc_tensor_size(tensor.ne, tensor.type);
530
+ file.seek(tensor.size, SEEK_CUR);
582
531
 
583
- auto it = tensors_map.name_to_idx.find(name);
584
- size_t idx;
585
- if (it != tensors_map.name_to_idx.end()) {
586
- idx = it->second;
587
- } else {
588
- tensors_map.tensors.emplace_back(name);
589
- idx = tensors_map.tensors.size() - 1;
590
- tensors_map.name_to_idx.emplace(name, idx);
591
- }
592
- tensors_map.tensors.at(idx).shards.push_back(shard);
532
+ tensors_map.tensors.push_back(tensor);
533
+ tensors_map.name_to_idx[name] = tensors_map.tensors.size() - 1;
593
534
  }
594
535
  }
595
536
  };
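The "skip to the next multiple of 32 bytes" seek above relies on a two's-complement identity: for any offset t, `-t & 31` is the padding needed to reach the next 32-byte boundary (and 0 when already aligned). A small self-contained check, assuming a current offset of 100:

```cpp
#include <cassert>
#include <cstddef>

int main() {
    ptrdiff_t tell = 100;            // pretend file.tell() returned 100
    ptrdiff_t pad  = -tell & 31;     // 28: distance to the next multiple of 32
    assert((tell + pad) % 32 == 0);  // 100 + 28 == 128
    return 0;
}
```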
@@ -659,56 +600,19 @@ struct llama_file_saver {
659
600
  };
660
601
 
661
602
  struct llama_model_loader {
662
- std::vector<std::unique_ptr<llama_file_loader>> file_loaders;
603
+ std::unique_ptr<llama_file_loader> file_loader;
663
604
  llama_load_tensors_map tensors_map;
664
605
  bool use_mmap;
665
606
  size_t num_ggml_tensors_created = 0;
666
607
  struct ggml_context * ggml_ctx = NULL;
667
608
  std::unique_ptr<llama_mmap> mapping;
668
609
 
669
- llama_model_loader(const std::string & fname_base, bool use_mmap, bool vocab_only) {
670
- auto * first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
671
- file_loaders.emplace_back(first_file);
672
- uint32_t n_parts = vocab_only ? 1 : guess_n_parts();
673
- for (uint32_t i = 1; i < n_parts; i++) {
674
- std::string fname = fname_base + "." + std::to_string(i);
675
- auto * ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
676
- file_loaders.emplace_back(ith_file);
677
- if (ith_file->hparams != first_file->hparams) {
678
- throw std::runtime_error(format("llama.cpp: hparams inconsistent between files"));
679
- }
680
- }
610
+ llama_model_loader(const std::string & fname_base, bool use_mmap) {
611
+ file_loader = std::unique_ptr<llama_file_loader>(new llama_file_loader(fname_base.c_str(), tensors_map));
681
612
  if (!llama_mmap::SUPPORTED) {
682
613
  use_mmap = false;
683
614
  }
684
- if (use_mmap && alignment_prevents_mmap()) {
685
- fprintf(stderr, "llama.cpp: can't use mmap because tensors are not aligned; convert to new format to avoid this\n");
686
- use_mmap = false;
687
- }
688
615
  this->use_mmap = use_mmap;
689
- for (llama_load_tensor & lt : tensors_map.tensors) {
690
- lt.calc_all();
691
- }
692
- }
693
-
694
- bool alignment_prevents_mmap() {
695
- for (const llama_load_tensor & lt : tensors_map.tensors) {
696
- for (const llama_load_tensor_shard & shard : lt.shards) {
697
- if (shard.file_off & 3) {
698
- return true;
699
- }
700
- }
701
- }
702
- return false;
703
- }
704
-
705
- uint32_t guess_n_parts() const {
706
- auto it = tensors_map.name_to_idx.find("tok_embeddings.weight");
707
- if (it == tensors_map.name_to_idx.end()) {
708
- throw std::runtime_error(std::string("missing tok_embeddings.weight"));
709
- }
710
- const llama_load_tensor & lt = tensors_map.tensors.at(it->second);
711
- return file_loaders.at(0)->hparams.n_embd / lt.shards.at(0).ne.at(0);
712
616
  }
713
617
 
714
618
  void calc_sizes(size_t * ctx_size_p, size_t * mmapped_size_p) const {
@@ -774,7 +678,7 @@ struct llama_model_loader {
774
678
  }
775
679
 
776
680
  if (use_mmap) {
777
- mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size, ggml_is_numa()));
681
+ mapping.reset(new llama_mmap(&file_loader->file, prefetch_size, ggml_is_numa()));
778
682
  if (lmlock) {
779
683
  lmlock->init(mapping->addr);
780
684
  }
@@ -830,45 +734,13 @@ struct llama_model_loader {
830
734
 
831
735
  void load_data_for(llama_load_tensor & lt) {
832
736
  if (use_mmap) {
833
- LLAMA_ASSERT(lt.shards.size() == 1);
834
- lt.data = (uint8_t *) mapping->addr + lt.shards.at(0).file_off;
835
- } else if (lt.split_type == SPLIT_NONE) {
836
- llama_file & file = file_loaders.at(lt.shards.at(0).file_idx)->file;
837
- file.seek(lt.shards.at(0).file_off, SEEK_SET);
737
+ lt.data = (uint8_t *) mapping->addr + lt.file_off;
738
+ } else {
739
+ llama_file & file = file_loader->file;
740
+ file.seek(lt.file_off, SEEK_SET);
838
741
  file.read_raw(lt.data, lt.size);
839
- } else if (lt.split_type == SPLIT_BY_ROWS) {
840
- size_t offset = 0;
841
- for (llama_load_tensor_shard & shard : lt.shards) {
842
- llama_file & file = file_loaders.at(shard.file_idx)->file;
843
- file.seek(shard.file_off, SEEK_SET);
844
- file.read_raw(lt.data + offset, shard.size);
845
- offset += shard.size;
846
- }
847
- LLAMA_ASSERT(offset == lt.size);
848
- } else if (lt.split_type == SPLIT_BY_COLUMNS) {
849
- // Let's load the data into temporary buffers to ensure the OS performs large loads.
850
- std::vector<llama_buffer> tmp_bufs(lt.shards.size());
851
- for (size_t i = 0; i < lt.shards.size(); i++) {
852
- llama_load_tensor_shard & shard = lt.shards.at(i);
853
- llama_file & file = file_loaders.at(shard.file_idx)->file;
854
- file.seek(shard.file_off, SEEK_SET);
855
- tmp_bufs.at(i).resize(shard.size);
856
- file.read_raw(tmp_bufs.at(i).addr, shard.size);
857
- }
858
- // Then reshape.
859
- size_t num_rows = lt.ne.at(1);
860
- size_t per_shard_row_size = lt.shards.at(0).size / num_rows;
861
- size_t out_offset = 0;
862
- for (size_t row = 0; row < num_rows; row++) {
863
- for (llama_buffer & tmp_buf : tmp_bufs) {
864
- memcpy(lt.data + out_offset,
865
- tmp_buf.addr + row * per_shard_row_size,
866
- per_shard_row_size);
867
- out_offset += per_shard_row_size;
868
- }
869
- }
870
- LLAMA_ASSERT(out_offset == lt.size);
871
742
  }
743
+
872
744
  if (0) {
873
745
  print_checksum(lt);
874
746
  }
@@ -938,7 +810,7 @@ static bool kv_cache_init(
938
810
 
939
811
  struct llama_context_params llama_context_default_params() {
940
812
  struct llama_context_params result = {
941
- /*.seed =*/ -1,
813
+ /*.seed =*/ LLAMA_DEFAULT_SEED,
942
814
  /*.n_ctx =*/ 512,
943
815
  /*.n_batch =*/ 512,
944
816
  /*.gpu_layers =*/ 0,
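The default seed switches from -1 to the unsigned sentinel LLAMA_DEFAULT_SEED (declared in the accompanying header), matching the uint32_t seed handling later in this diff: the sentinel gets replaced with time(NULL), anything else gives a reproducible run. A minimal caller-side sketch, assuming that behaviour:

```cpp
llama_context_params params = llama_context_default_params();
// params.seed == LLAMA_DEFAULT_SEED here, so the context would seed its RNG from time(NULL).
params.seed = 42;  // fixed seed -> deterministic sampling for identical inputs
```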
@@ -1067,12 +939,12 @@ static void llama_model_load_internal(
1067
939
 
1068
940
  model.t_start_us = ggml_time_us();
1069
941
 
1070
- std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap, vocab_only));
942
+ std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap));
1071
943
 
1072
- vocab = std::move(ml->file_loaders.at(0)->vocab);
1073
- model.hparams = ml->file_loaders.at(0)->hparams;
944
+ vocab = std::move(ml->file_loader->vocab);
945
+ model.hparams = ml->file_loader->hparams;
1074
946
  model.n_gpu_layers = n_gpu_layers;
1075
- llama_file_version file_version = ml->file_loaders.at(0)->file_version;
947
+ llama_file_version file_version = ml->file_loader->file_version;
1076
948
  auto & hparams = model.hparams;
1077
949
 
1078
950
  {
@@ -1106,7 +978,6 @@ static void llama_model_load_internal(
1106
978
  fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot);
1107
979
  fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
1108
980
  fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
1109
- fprintf(stderr, "%s: n_parts = %zu\n", __func__, ml->file_loaders.size());
1110
981
  fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
1111
982
  }
1112
983
 
@@ -1274,14 +1145,18 @@ static void llama_model_load_internal(
1274
1145
  fprintf(stderr, "%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
1275
1146
  ggml_cuda_set_scratch_size(0); // disable scratch
1276
1147
  } else {
1277
- vram_scratch = n_batch * MB;
1148
+ const size_t vram_scratch_base = VRAM_REQ_SCRATCH_BASE().at(model.type);
1149
+ const size_t vram_scratch_per_context = VRAM_REQ_SCRATCH_PER_CONTEXT().at(model.type);
1150
+ vram_scratch = n_batch * (vram_scratch_base + n_ctx * vram_scratch_per_context);
1278
1151
  ggml_cuda_set_scratch_size(vram_scratch);
1279
1152
  if (n_gpu_layers > 0) {
1280
- fprintf(stderr, "%s: allocating batch_size x 1 MB = %zd MB VRAM for the scratch buffer\n",
1281
- __func__, vram_scratch / MB);
1153
+ fprintf(stderr, "%s: allocating batch_size x (%zd kB + n_ctx x %zd B) = %zd MB VRAM for the scratch buffer\n",
1154
+ __func__, vram_scratch_base / kB, vram_scratch_per_context,
1155
+ (vram_scratch + MB - 1) / MB); // round up
1282
1156
  }
1283
1157
  }
1284
1158
  #endif // GGML_USE_CUBLAS
1159
+
1285
1160
  #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
1286
1161
  const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
1287
1162
 
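Putting numbers to the new formula: for a 7B model with n_batch = 512 and n_ctx = 2048 the scratch buffer comes to 384 MB, whereas the removed code always allocated n_batch * 1 MB = 512 MB regardless of context size. A back-of-the-envelope check:

```cpp
// MODEL_7B, n_batch = 512, n_ctx = 2048 (values from the VRAM_REQ_SCRATCH_* tables above)
const size_t base    = 512ull * 1024;                  // 512 kB
const size_t per_ctx = 128ull;                         // 128 B per context token
const size_t scratch = 512 * (base + 2048 * per_ctx);  // 512 * 786432 = 402653184 B = 384 MB
```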
@@ -1290,6 +1165,10 @@ static void llama_model_load_internal(
1290
1165
  fprintf(stderr, "%s: offloading non-repeating layers to GPU\n", __func__);
1291
1166
  }
1292
1167
  size_t vram_kv_cache = 0;
1168
+
1169
+ #ifdef GGML_USE_CUBLAS
1170
+ const int max_backend_supported_layers = hparams.n_layer + 3;
1171
+ const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
1293
1172
  if (n_gpu_layers > (int) hparams.n_layer + 1) {
1294
1173
  if (low_vram) {
1295
1174
  fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
@@ -1306,14 +1185,18 @@ static void llama_model_load_internal(
1306
1185
  vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
1307
1186
  }
1308
1187
  }
1309
- const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
1188
+ #elif defined(GGML_USE_CLBLAST)
1189
+ const int max_backend_supported_layers = hparams.n_layer + 1;
1190
+ const int max_offloadable_layers = hparams.n_layer + 1;
1191
+ #endif // GGML_USE_CUBLAS
1192
+
1310
1193
  fprintf(stderr, "%s: offloaded %d/%d layers to GPU\n",
1311
- __func__, std::min(n_gpu_layers, max_offloadable_layers), hparams.n_layer + 3);
1194
+ __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
1312
1195
  fprintf(stderr, "%s: total VRAM used: %zu MB\n",
1313
1196
  __func__, (vram_weights + vram_scratch + vram_kv_cache + MB - 1) / MB); // round up
1314
1197
  #else
1315
1198
  (void) n_gpu_layers;
1316
- #endif
1199
+ #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
1317
1200
  }
1318
1201
 
1319
1202
  // populate `tensors_by_name`
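The offload accounting is now backend-aware: with CUDA up to n_layer + 3 "layers" can be offloaded (the repeating layers, the non-repeating tensors, and the two KV-cache halves unless low_vram is set), while the CLBlast path tops out at n_layer + 1. For LLaMA 7B (n_layer = 32) the "offloaded X/Y layers" line therefore reports against 35 on CUDA and 33 on OpenCL, instead of the previously hard-coded n_layer + 3.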
@@ -1369,22 +1252,26 @@ static bool llama_model_load(
1369
1252
 
1370
1253
  // evaluate the transformer
1371
1254
  //
1372
- // - lctx: llama context
1373
- // - tokens: new batch of tokens to process
1374
- // - n_past: the context size so far
1375
- // - n_threads: number of threads to use
1376
- // - cgraph_fname: filename of the exported computation graph
1255
+ // - lctx: llama context
1256
+ // - tokens: new batch of tokens to process
1257
+ // - embd embeddings input
1258
+ // - n_tokens number of tokens
1259
+ // - n_past: the context size so far
1260
+ // - n_threads: number of threads to use
1377
1261
  //
1378
1262
  static bool llama_eval_internal(
1379
- llama_context & lctx,
1380
- const llama_token * tokens,
1381
- const int n_tokens,
1382
- const int n_past,
1383
- const int n_threads,
1263
+ llama_context & lctx,
1264
+ const llama_token * tokens,
1265
+ const float * embd,
1266
+ const int n_tokens,
1267
+ const int n_past,
1268
+ const int n_threads,
1384
1269
  const char * cgraph_fname) {
1385
1270
 
1271
+ LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
1272
+
1386
1273
  // enforce that the first token is BOS
1387
- if (n_past == 0 && tokens[0] != llama_token_bos()) {
1274
+ if (tokens && n_past == 0 && tokens[0] != llama_token_bos()) {
1388
1275
  fprintf(stderr, "%s: first token must be BOS\n", __func__);
1389
1276
  return false;
1390
1277
  }
@@ -1424,12 +1311,18 @@ static bool llama_eval_internal(
1424
1311
  ggml_cgraph gf = {};
1425
1312
  gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
1426
1313
 
1427
- struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
1428
- ggml_set_name(embd, "embd");
1429
- memcpy(embd->data, tokens, N*ggml_element_size(embd));
1430
-
1431
1314
  struct ggml_tensor * cur;
1432
- struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
1315
+ struct ggml_tensor * inpL;
1316
+
1317
+ if (tokens) {
1318
+ struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
1319
+ ggml_set_name(embd, "embd");
1320
+ memcpy(embd->data, tokens, N*ggml_element_size(embd));
1321
+ inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
1322
+ } else {
1323
+ inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
1324
+ memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
1325
+ }
1433
1326
 
1434
1327
  const int i_gpu_start = n_layer - n_gpu_layers;
1435
1328
  (void) i_gpu_start;
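llama_eval_internal now takes either token ids or pre-computed embeddings, never both (hence the exclusive-or assertion added above), and builds the graph input accordingly: the token path copies N int32 ids and looks their rows up in tok_embeddings, while the embedding path copies an n_embd x N float matrix straight into inpL. For a 7B model (n_embd = 4096) and N = 8, that is 32 bytes of ids versus 128 KiB of embedding data.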
@@ -2012,10 +1905,10 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can
2012
1905
  return;
2013
1906
  }
2014
1907
 
2015
- const int64_t t_start_sample_us = ggml_time_us();
2016
-
2017
1908
  llama_sample_softmax(ctx, candidates);
2018
1909
 
1910
+ const int64_t t_start_sample_us = ggml_time_us();
1911
+
2019
1912
  // Compute the cumulative probabilities
2020
1913
  float cum_sum = 0.0f;
2021
1914
  size_t last_idx = candidates->size;
@@ -2044,9 +1937,8 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *
2044
1937
  return;
2045
1938
  }
2046
1939
 
2047
- const int64_t t_start_sample_us = ggml_time_us();
2048
-
2049
1940
  llama_sample_softmax(nullptr, candidates);
1941
+ const int64_t t_start_sample_us = ggml_time_us();
2050
1942
 
2051
1943
  // Compute the first and second derivatives
2052
1944
  std::vector<float> first_derivatives(candidates->size - 1);
@@ -2098,11 +1990,11 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
2098
1990
  return;
2099
1991
  }
2100
1992
 
2101
- const int64_t t_start_sample_us = ggml_time_us();
2102
-
2103
1993
  // Compute the softmax of logits and calculate entropy
2104
1994
  llama_sample_softmax(nullptr, candidates);
2105
1995
 
1996
+ const int64_t t_start_sample_us = ggml_time_us();
1997
+
2106
1998
  float entropy = 0.0f;
2107
1999
  for (size_t i = 0; i < candidates->size; ++i) {
2108
2000
  entropy += -candidates->data[i].p * logf(candidates->data[i].p);
@@ -2271,13 +2163,11 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_
2271
2163
 
2272
2164
  if (ctx) {
2273
2165
  ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
2274
- ctx->n_sample++;
2275
2166
  }
2276
2167
  return X;
2277
2168
  }
2278
2169
 
2279
2170
  llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu) {
2280
- assert(ctx);
2281
2171
  int64_t t_start_sample_us;
2282
2172
  t_start_sample_us = ggml_time_us();
2283
2173
 
@@ -2292,13 +2182,14 @@ llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_tok
2292
2182
  candidates->size = 1;
2293
2183
  }
2294
2184
 
2185
+ if (ctx) {
2186
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
2187
+ }
2188
+
2295
2189
  // Normalize the probabilities of the remaining words
2296
2190
  llama_sample_softmax(ctx, candidates);
2297
2191
 
2298
2192
  // Sample the next word X from the remaining words
2299
- if (ctx) {
2300
- ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
2301
- }
2302
2193
  llama_token X = llama_sample_token(ctx, candidates);
2303
2194
  t_start_sample_us = ggml_time_us();
2304
2195
 
@@ -2366,10 +2257,10 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llam
2366
2257
  }
2367
2258
  float * f32_output = (float *) output.addr;
2368
2259
 
2369
- quantize_fns_t qtype;
2260
+ ggml_type_traits_t qtype;
2370
2261
  if (ggml_is_quantized(tensor.type)) {
2371
- qtype = ggml_internal_get_quantize_fn(tensor.type);
2372
- if (qtype.dequantize_row_q == NULL) {
2262
+ qtype = ggml_internal_get_type_traits(tensor.type);
2263
+ if (qtype.to_float == NULL) {
2373
2264
  throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor.type)));
2374
2265
  }
2375
2266
  } else if (tensor.type != GGML_TYPE_F16) {
@@ -2380,7 +2271,7 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llam
2380
2271
  if (tensor.type == GGML_TYPE_F16) {
2381
2272
  ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor.data, f32_output, nelements);
2382
2273
  } else if (ggml_is_quantized(tensor.type)) {
2383
- qtype.dequantize_row_q(tensor.data, f32_output, nelements);
2274
+ qtype.to_float(tensor.data, f32_output, nelements);
2384
2275
  } else {
2385
2276
  LLAMA_ASSERT(false); // unreachable
2386
2277
  }
@@ -2405,7 +2296,7 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llam
2405
2296
  if (typ == GGML_TYPE_F16) {
2406
2297
  ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
2407
2298
  } else {
2408
- qtype.dequantize_row_q(inbuf, outbuf, nels);
2299
+ qtype.to_float(inbuf, outbuf, nels);
2409
2300
  }
2410
2301
  };
2411
2302
  workers.push_back(std::thread(compute, tensor.type, tensor.data + in_buff_offs, f32_output + out_buff_offs, thr_elems));
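The dequantization helpers migrate from the removed quantize_fns_t / dequantize_row_q API to ggml_type_traits_t / to_float. A standalone sketch of the new call pattern, assuming the bundled ggml.h declares ggml_internal_get_type_traits as used above (the helper name dequantize_to_f32 is made up):

```cpp
#include <cstddef>
#include <stdexcept>
#include <vector>
#include "ggml.h"

// Sketch: expand an F16 or quantized buffer of nelements values into float32.
static std::vector<float> dequantize_to_f32(void * data, ggml_type type, size_t nelements) {
    std::vector<float> out(nelements);
    if (type == GGML_TYPE_F16) {
        ggml_fp16_to_fp32_row((ggml_fp16_t *) data, out.data(), nelements);
    } else if (ggml_is_quantized(type)) {
        ggml_type_traits_t traits = ggml_internal_get_type_traits(type);
        if (traits.to_float == NULL) {
            throw std::runtime_error("no dequantizer available for this type");
        }
        traits.to_float(data, out.data(), nelements);  // was qtype.dequantize_row_q(...)
    } else {
        throw std::runtime_error("unsupported tensor type");
    }
    return out;
}
```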
@@ -2451,9 +2342,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
2451
2342
  nthread = std::thread::hardware_concurrency();
2452
2343
  }
2453
2344
 
2454
- std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false,
2455
- /*vocab_only*/ false));
2456
- llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), params->ftype);
2345
+ std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false));
2346
+ llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loader.get(), params->ftype);
2457
2347
 
2458
2348
  #ifdef GGML_USE_K_QUANTS
2459
2349
  int n_attention_wv = 0;
@@ -2654,6 +2544,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
2654
2544
  }
2655
2545
  }
2656
2546
 
2547
+
2548
+
2657
2549
  //
2658
2550
  // interface implementation
2659
2551
  //
@@ -2692,7 +2584,7 @@ struct llama_context * llama_new_context_with_model(
2692
2584
 
2693
2585
  llama_context * ctx = new llama_context(*model, model->vocab);
2694
2586
 
2695
- if (params.seed < 0) {
2587
+ if (params.seed == LLAMA_DEFAULT_SEED) {
2696
2588
  params.seed = time(NULL);
2697
2589
  }
2698
2590
 
@@ -2874,7 +2766,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
2874
2766
 
2875
2767
  // create a name -> tensor map of the model to accelerate lookups
2876
2768
  std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
2877
- for (auto & kv: model.tensors_by_name) {
2769
+ for (const auto & kv: model.tensors_by_name) {
2878
2770
  model_tensors.insert(kv);
2879
2771
  }
2880
2772
 
@@ -2885,7 +2777,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
2885
2777
  llama_buffer base_buf;
2886
2778
  if (path_base_model) {
2887
2779
  fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
2888
- model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false));
2780
+ model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true));
2889
2781
 
2890
2782
  size_t ctx_size;
2891
2783
  size_t mmapped_size;
@@ -2903,7 +2795,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
2903
2795
 
2904
2796
  // maybe this should in llama_model_loader
2905
2797
  if (model_loader->use_mmap) {
2906
- model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0, ggml_is_numa()));
2798
+ model_loader->mapping.reset(new llama_mmap(&model_loader->file_loader->file, /* prefetch */ 0, ggml_is_numa()));
2907
2799
  }
2908
2800
  }
2909
2801
 
@@ -2964,7 +2856,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
2964
2856
  return false;
2965
2857
  }
2966
2858
  }
2967
- ggml_tensor* lora_tensor;
2859
+ ggml_tensor * lora_tensor;
2968
2860
  if (n_dims == 2) {
2969
2861
  lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]);
2970
2862
  }
@@ -2972,6 +2864,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
2972
2864
  fprintf(stderr, "%s: unsupported tensor dimension %d\n", __func__, n_dims);
2973
2865
  return 1;
2974
2866
  }
2867
+ ggml_set_name(lora_tensor, "lora_tensor");
2975
2868
 
2976
2869
  // load tensor data
2977
2870
  size_t offset = fin.tellg();
@@ -2987,6 +2880,21 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
2987
2880
  lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) {
2988
2881
 
2989
2882
  ggml_tensor * dest_t = model_tensors[base_name];
2883
+
2884
+ offload_func_t offload_func = llama_nop;
2885
+ offload_func_t offload_func_force_inplace = llama_nop;
2886
+
2887
+ #ifdef GGML_USE_CUBLAS
2888
+ if (dest_t->backend == GGML_BACKEND_GPU || dest_t->backend == GGML_BACKEND_GPU_SPLIT) {
2889
+ if (dest_t->type != GGML_TYPE_F16) {
2890
+ throw std::runtime_error(format(
2891
+ "%s: error: the simultaneous use of LoRAs and GPU acceleration is only supported for f16 models", __func__));
2892
+ }
2893
+ offload_func = ggml_cuda_assign_buffers;
2894
+ offload_func_force_inplace = ggml_cuda_assign_buffers_force_inplace;
2895
+ }
2896
+ #endif // GGML_USE_CUBLAS
2897
+
2990
2898
  ggml_tensor * base_t;
2991
2899
  if (model_loader) {
2992
2900
  // load from base model
@@ -3014,7 +2922,12 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
3014
2922
  }
3015
2923
 
3016
2924
  ggml_tensor * loraA = lora_tensors[base_name + ".loraA"];
2925
+ GGML_ASSERT(loraA->type == GGML_TYPE_F32);
2926
+ ggml_set_name(loraA, "loraA");
2927
+
3017
2928
  ggml_tensor * loraB = lora_tensors[base_name + ".loraB"];
2929
+ GGML_ASSERT(loraB->type == GGML_TYPE_F32);
2930
+ ggml_set_name(loraB, "loraB");
3018
2931
 
3019
2932
  if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
3020
2933
  fprintf(stderr, "%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
@@ -3024,19 +2937,32 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
3024
2937
 
3025
2938
  // w = w + BA*s
3026
2939
  ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
2940
+ offload_func(BA);
2941
+ ggml_set_name(BA, "BA");
3027
2942
 
3028
2943
  if (scaling != 1.0f) {
3029
2944
  ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
2945
+ ggml_set_name(scale_tensor, "scale_tensor");
2946
+
3030
2947
  BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor);
2948
+ offload_func(BA);
2949
+ ggml_set_name(BA, "BA_scaled");
3031
2950
  }
3032
2951
 
3033
2952
  ggml_tensor * r;
3034
2953
  if (base_t == dest_t) {
3035
2954
  r = ggml_add_inplace(lora_ctx, dest_t, BA);
2955
+ offload_func_force_inplace(r);
2956
+ ggml_set_name(r, "r_add_inplace");
3036
2957
  }
3037
2958
  else {
3038
2959
  r = ggml_add(lora_ctx, base_t, BA);
2960
+ offload_func(r);
2961
+ ggml_set_name(r, "r_add");
2962
+
3039
2963
  r = ggml_cpy(lora_ctx, r, dest_t);
2964
+ offload_func(r);
2965
+ ggml_set_name(r, "r_cpy");
3040
2966
  }
3041
2967
 
3042
2968
  struct ggml_cgraph gf = ggml_build_forward(r);
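For reference, the graph assembled here applies the usual LoRA update in place: W' = W + s * (B A), where s is the scaling taken from the adapter file (conventionally alpha / r). The behavioural additions in this hunk are the tensor names (useful when dumping graphs) and the offload_func calls, which keep every intermediate on the GPU when the destination weight lives there; as the new check above notes, that combination is only supported for f16 models.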
@@ -3091,8 +3017,8 @@ int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
3091
3017
 
3092
3018
  #define LLAMA_MAX_RNG_STATE (64*1024)
3093
3019
 
3094
- void llama_set_rng_seed(struct llama_context * ctx, int seed) {
3095
- if (seed < 0) {
3020
+ void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed) {
3021
+ if (seed == LLAMA_DEFAULT_SEED) {
3096
3022
  seed = time(NULL);
3097
3023
  }
3098
3024
  ctx->rng.seed(seed);
@@ -3336,7 +3262,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
3336
3262
  return nread;
3337
3263
  }
3338
3264
 
3339
- bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
3265
+ static bool llama_load_session_file_internal(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
3340
3266
  llama_file file(path_session, "rb");
3341
3267
 
3342
3268
  // sanity checks
@@ -3390,6 +3316,15 @@ bool llama_load_session_file(struct llama_context * ctx, const char * path_sessi
3390
3316
  return true;
3391
3317
  }
3392
3318
 
3319
+ bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
3320
+ try {
3321
+ return llama_load_session_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
3322
+ } catch (const std::exception & err) {
3323
+ fprintf(stderr, "error loading session file: %s\n", err.what());
3324
+ return false;
3325
+ }
3326
+ }
3327
+
3393
3328
  bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
3394
3329
  llama_file file(path_session, "wb");
3395
3330
 
@@ -3421,7 +3356,29 @@ int llama_eval(
3421
3356
  int n_tokens,
3422
3357
  int n_past,
3423
3358
  int n_threads) {
3424
- if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads, nullptr)) {
3359
+ if (!llama_eval_internal(*ctx, tokens, nullptr, n_tokens, n_past, n_threads, nullptr)) {
3360
+ fprintf(stderr, "%s: failed to eval\n", __func__);
3361
+ return 1;
3362
+ }
3363
+
3364
+ // get a more accurate load time, upon first eval
3365
+ // TODO: fix this
3366
+ if (!ctx->has_evaluated_once) {
3367
+ ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
3368
+ ctx->has_evaluated_once = true;
3369
+ }
3370
+
3371
+ return 0;
3372
+ }
3373
+
3374
+
3375
+ int llama_eval_embd(
3376
+ struct llama_context * ctx,
3377
+ const float * embd,
3378
+ int n_tokens,
3379
+ int n_past,
3380
+ int n_threads) {
3381
+ if (!llama_eval_internal(*ctx, nullptr, embd, n_tokens, n_past, n_threads, nullptr)) {
3425
3382
  fprintf(stderr, "%s: failed to eval\n", __func__);
3426
3383
  return 1;
3427
3384
  }
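The new llama_eval_embd entry point mirrors llama_eval but feeds float embeddings straight into the graph, routing through the embd parameter added to llama_eval_internal earlier in this diff. A minimal usage sketch; the buffer size assumes a 7B model (n_embd = 4096, i.e. what llama_n_embd(ctx) would return), and error handling is omitted:

```cpp
// Sketch: evaluate 4 externally produced embedding vectors instead of token ids.
const int n_tokens = 4;
const int n_embd   = 4096;                   // query with llama_n_embd(ctx) in real code
std::vector<float> embd(n_tokens * n_embd);  // filled by the caller, one row per position
if (llama_eval_embd(ctx, embd.data(), n_tokens, /*n_past=*/0, /*n_threads=*/8) != 0) {
    fprintf(stderr, "llama_eval_embd failed\n");
}
```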
@@ -3442,7 +3399,7 @@ int llama_eval_export(struct llama_context * ctx, const char * fname) {
3442
3399
 
3443
3400
  const std::vector<llama_token> tmp(n_batch, llama_token_bos());
3444
3401
 
3445
- if (!llama_eval_internal(*ctx, tmp.data(), tmp.size(), n_ctx, 1, fname)) {
3402
+ if (!llama_eval_internal(*ctx, tmp.data(), nullptr, tmp.size(), n_ctx, 1, fname)) {
3446
3403
  fprintf(stderr, "%s: failed to eval\n", __func__);
3447
3404
  return 1;
3448
3405
  }
@@ -3523,23 +3480,35 @@ llama_token llama_token_nl() {
3523
3480
  return 13;
3524
3481
  }
3525
3482
 
3483
+ struct llama_timings llama_get_timings(struct llama_context * ctx) {
3484
+ struct llama_timings result = {
3485
+ /*.t_start_ms =*/ 1e-3 * ctx->t_start_us,
3486
+ /*.t_end_ms =*/ 1.00 * ggml_time_ms(),
3487
+ /*.t_load_ms =*/ 1e-3 * ctx->t_load_us,
3488
+ /*.t_sample_ms =*/ 1e-3 * ctx->t_sample_us,
3489
+ /*.t_p_eval_ms =*/ 1e-3 * ctx->t_p_eval_us,
3490
+ /*.t_eval_ms =*/ 1e-3 * ctx->t_eval_us,
3526
3491
 
3527
- void llama_print_timings(struct llama_context * ctx) {
3528
- const int64_t t_end_us = ggml_time_us();
3492
+ /*.n_sample =*/ std::max(1, ctx->n_sample),
3493
+ /*.n_p_eval =*/ std::max(1, ctx->n_p_eval),
3494
+ /*.n_eval =*/ std::max(1, ctx->n_eval),
3495
+ };
3529
3496
 
3530
- const int32_t n_sample = std::max(1, ctx->n_sample);
3531
- const int32_t n_eval = std::max(1, ctx->n_eval);
3532
- const int32_t n_p_eval = std::max(1, ctx->n_p_eval);
3497
+ return result;
3498
+ }
3499
+
3500
+ void llama_print_timings(struct llama_context * ctx) {
3501
+ const llama_timings timings = llama_get_timings(ctx);
3533
3502
 
3534
3503
  fprintf(stderr, "\n");
3535
- fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
3504
+ fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, timings.t_load_ms);
3536
3505
  fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
3537
- __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample, 1e6 / ctx->t_sample_us * n_sample);
3506
+ __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
3538
3507
  fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
3539
- __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval, 1e6 / ctx->t_p_eval_us * n_p_eval);
3508
+ __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
3540
3509
  fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
3541
- __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval, 1e6 / ctx->t_eval_us * n_eval);
3542
- fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0);
3510
+ __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
3511
+ fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
3543
3512
  }
3544
3513
 
3545
3514
  void llama_reset_timings(struct llama_context * ctx) {
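llama_print_timings is now a thin wrapper around the new llama_get_timings(), so callers can consume the same counters programmatically instead of parsing stderr. A small sketch:

```cpp
// Sketch: report decode speed directly from the timings struct.
const llama_timings t = llama_get_timings(ctx);
fprintf(stderr, "eval: %.2f ms per token (%.2f tokens per second)\n",
        t.t_eval_ms / t.n_eval, 1e3 / t.t_eval_ms * t.n_eval);
```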