llama_cpp 0.3.0 → 0.3.2

This diff shows the changes between the publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
@@ -66,6 +66,7 @@ enum e_model {
  MODEL_65B,
  };

+ static const size_t kB = 1024;
  static const size_t MB = 1024*1024;

  // computed for n_ctx == 2048
@@ -129,6 +130,34 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
  return k_sizes;
  }

+ // amount of VRAM needed per batch size to hold temporary results
+ // the values for 3b and 65b are not derived from testing but instead chosen conservatively
+ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
+ {
+ static std::map<e_model, size_t> k_sizes = {
+ { MODEL_3B, 512ull * kB },
+ { MODEL_7B, 512ull * kB },
+ { MODEL_13B, 640ull * kB },
+ { MODEL_30B, 768ull * kB },
+ { MODEL_65B, 1536ull * kB },
+ };
+ return k_sizes;
+ }
+
+ // amount of VRAM needed per batch size and context to hold temporary results
+ // the values for 3b and 65b are not derived from testing but instead chosen conservatively
+ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
+ {
+ static std::map<e_model, size_t> k_sizes = {
+ { MODEL_3B, 128ull },
+ { MODEL_7B, 128ull },
+ { MODEL_13B, 160ull },
+ { MODEL_30B, 208ull },
+ { MODEL_65B, 416ull },
+ };
+ return k_sizes;
+ }
+
  // default hparams (LLaMA 7B)
  struct llama_hparams {
  uint32_t n_vocab = 32000;
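Note: the two tables above are keyed by model size; the base value is bytes of scratch per batch token and the per-context value is additional bytes per batch token per context token. Together with the later change in llama_model_load_internal they determine the CUDA scratch size. A minimal standalone sketch of that calculation (hypothetical helper, constants copied from the tables above; not part of the diff):

    #include <cstddef>

    // bytes of VRAM scratch: n_batch * (base + n_ctx * per_context)
    static size_t vram_scratch_estimate(size_t base, size_t per_context,
                                        size_t n_batch, size_t n_ctx) {
        return n_batch * (base + n_ctx * per_context);
    }
    // e.g. MODEL_7B: vram_scratch_estimate(512 * 1024, 128, 512, 2048)
    //      == 402653184 bytes (384 MB)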
@@ -165,8 +194,8 @@ struct llama_layer {
  };

  struct llama_kv_cache {
- struct ggml_tensor * k;
- struct ggml_tensor * v;
+ struct ggml_tensor * k = NULL;
+ struct ggml_tensor * v = NULL;

  struct ggml_context * ctx = NULL;

@@ -253,7 +282,13 @@ struct llama_model {

  struct llama_context {
  llama_context(const llama_model & model, const llama_vocab & vocab) : model(model), vocab(vocab), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
-
+ #ifdef GGML_USE_METAL
+ ~llama_context() {
+ if (ctx_metal) {
+ ggml_metal_free(ctx_metal);
+ }
+ }
+ #endif
  std::mt19937 rng;

  bool has_evaluated_once = false;
@@ -364,96 +399,14 @@ static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml
  return size / ggml_blck_size(type);
  }

- struct llama_load_tensor_shard {
- std::vector<uint32_t> ne;
- size_t size;
- enum ggml_type type;
- size_t file_idx;
- size_t file_off;
-
- void calc_size() {
- size = llama_calc_tensor_size(ne, type);
- }
- };
-
- enum llama_split_type {
- SPLIT_NONE,
- SPLIT_BY_COLUMNS,
- SPLIT_BY_ROWS
- };
-
  struct llama_load_tensor {
- std::vector<llama_load_tensor_shard> shards;
-
  std::string name;
  enum ggml_type type = GGML_TYPE_F32;
- llama_split_type split_type = SPLIT_NONE;
  std::vector<uint32_t> ne;
+ size_t file_off;
  size_t size;
  struct ggml_tensor * ggml_tensor = NULL;
  uint8_t * data;
-
- llama_load_tensor(const std::string & name) : name(name) {}
-
- void calc_all() {
- calc_type();
- calc_split_type();
- calc_ne();
- calc_size();
- }
-
- void calc_type() {
- const auto & first_shard = shards.at(0);
- for (const auto & shard : shards) {
- if (shard.type != first_shard.type) {
- throw std::runtime_error(format("inconsistent tensor shard type in '%s'", name.c_str()));
- }
- }
- type = first_shard.type;
- }
-
- void calc_split_type() {
- if (shards.at(0).ne.size() == 1 || // 1D tensors are just duplicated in every file
- shards.size() == 1) { // only one file?
- split_type = SPLIT_NONE;
- } else if (name.find("tok_embeddings.") == 0 ||
- name.find(".attention.wo.weight") != std::string::npos ||
- name.find(".feed_forward.w2.weight") != std::string::npos) {
- split_type = SPLIT_BY_COLUMNS;
- } else {
- split_type = SPLIT_BY_ROWS;
- }
- }
-
- void calc_ne() {
- const auto & first_shard = shards.at(0);
- for (const auto & shard : shards) {
- if (shard.ne != first_shard.ne) {
- throw std::runtime_error(format("inconsistent tensor shard shape in '%s': first was %s, other was %s",
- name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str()));
- }
- }
- ne = first_shard.ne;
- LLAMA_ASSERT(shards.size() <= UINT32_MAX);
- uint32_t n_shards = (uint32_t) shards.size();
- switch (split_type) {
- case SPLIT_NONE:
- ne = first_shard.ne;
- break;
- case SPLIT_BY_COLUMNS:
- ne = {checked_mul<uint32_t>(first_shard.ne[0], n_shards),
- first_shard.ne[1]};
- break;
- case SPLIT_BY_ROWS:
- ne = {first_shard.ne[0],
- checked_mul<uint32_t>(first_shard.ne[1], n_shards)};
- break;
- }
- }
-
- void calc_size() {
- size = llama_calc_tensor_size(ne, type);
- }
  };

  struct llama_load_tensors_map {
@@ -476,13 +429,13 @@ struct llama_file_loader {
  llama_hparams hparams;
  llama_vocab vocab;

- llama_file_loader(const char * fname, size_t file_idx, llama_load_tensors_map & tensors_map)
+ llama_file_loader(const char * fname, llama_load_tensors_map & tensors_map)
  : file(fname, "rb") {
  fprintf(stderr, "llama.cpp: loading model from %s\n", fname);
  read_magic();
  read_hparams();
  read_vocab();
- read_tensor_metadata(file_idx, tensors_map);
+ read_tensor_metadata(tensors_map);
  }
  void read_magic() {
  uint32_t magic = file.read_u32();
@@ -528,9 +481,7 @@ struct llama_file_loader {
  std::string word = file.read_string(len);

  float score = 0.0f;
- if (file_version >= LLAMA_FILE_VERSION_GGMF_V1) {
- file.read_raw(&score, sizeof(score));
- }
+ file.read_raw(&score, sizeof(score));

  vocab.token_to_id[word] = i;

@@ -539,19 +490,19 @@ struct llama_file_loader {
  tok_score.score = score;
  }
  }
- void read_tensor_metadata(size_t file_idx, llama_load_tensors_map & tensors_map) {
+ void read_tensor_metadata(llama_load_tensors_map & tensors_map) {
  while (file.tell() < file.size) {
- llama_load_tensor_shard shard;
+ llama_load_tensor tensor;
  uint32_t n_dims = file.read_u32();
  uint32_t name_len = file.read_u32();
- shard.type = (enum ggml_type) file.read_u32();
- shard.ne.resize(n_dims);
- file.read_raw(shard.ne.data(), sizeof(shard.ne[0]) * n_dims);
+ tensor.type = (enum ggml_type) file.read_u32();
+ tensor.ne.resize(n_dims);
+ file.read_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * n_dims);
  std::string name = file.read_string(name_len);
  if (n_dims < 1 || n_dims > 2) {
  throw std::runtime_error(format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims));
  }
- switch (shard.type) {
+ switch (tensor.type) {
  case GGML_TYPE_F32:
  case GGML_TYPE_F16:
  case GGML_TYPE_Q4_0:
@@ -566,30 +517,20 @@ struct llama_file_loader {
  case GGML_TYPE_Q6_K:
  break;
  default: {
- throw std::runtime_error(format("unrecognized tensor type %u\n", shard.type));
+ throw std::runtime_error(format("unrecognized tensor type %u\n", tensor.type));
  }
  }

- if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
- // skip to the next multiple of 32 bytes
- file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
- }
- shard.file_idx = file_idx;
- shard.file_off = file.tell();
+ // skip to the next multiple of 32 bytes
+ file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);

- shard.calc_size();
- file.seek(shard.size, SEEK_CUR);
+ tensor.file_off = file.tell();
+ tensor.name = name;
+ tensor.size = llama_calc_tensor_size(tensor.ne, tensor.type);
+ file.seek(tensor.size, SEEK_CUR);

- auto it = tensors_map.name_to_idx.find(name);
- size_t idx;
- if (it != tensors_map.name_to_idx.end()) {
- idx = it->second;
- } else {
- tensors_map.tensors.emplace_back(name);
- idx = tensors_map.tensors.size() - 1;
- tensors_map.name_to_idx.emplace(name, idx);
- }
- tensors_map.tensors.at(idx).shards.push_back(shard);
+ tensors_map.tensors.push_back(tensor);
+ tensors_map.name_to_idx[name] = tensors_map.tensors.size() - 1;
  }
  }
  };
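Note: the metadata loop above keeps the 32-byte alignment seek, -static_cast<ptrdiff_t>(file.tell()) & 31. A small self-contained check of that idiom (illustrative only, not part of llama.cpp), which shows that (-pos) & 31 is exactly the number of bytes needed to land on the next multiple of 32:

    #include <cstddef>
    #include <cstdio>

    int main() {
        for (ptrdiff_t pos : {0, 1, 31, 32, 33, 100}) {
            ptrdiff_t skip = -pos & 31;   // same expression as in the loader
            std::printf("pos=%td skip=%td aligned=%td\n", pos, skip, pos + skip);
        }
        return 0;
    }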
@@ -659,56 +600,19 @@ struct llama_file_saver {
  };

  struct llama_model_loader {
- std::vector<std::unique_ptr<llama_file_loader>> file_loaders;
+ std::unique_ptr<llama_file_loader> file_loader;
  llama_load_tensors_map tensors_map;
  bool use_mmap;
  size_t num_ggml_tensors_created = 0;
  struct ggml_context * ggml_ctx = NULL;
  std::unique_ptr<llama_mmap> mapping;

- llama_model_loader(const std::string & fname_base, bool use_mmap, bool vocab_only) {
- auto * first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
- file_loaders.emplace_back(first_file);
- uint32_t n_parts = vocab_only ? 1 : guess_n_parts();
- for (uint32_t i = 1; i < n_parts; i++) {
- std::string fname = fname_base + "." + std::to_string(i);
- auto * ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
- file_loaders.emplace_back(ith_file);
- if (ith_file->hparams != first_file->hparams) {
- throw std::runtime_error(format("llama.cpp: hparams inconsistent between files"));
- }
- }
+ llama_model_loader(const std::string & fname_base, bool use_mmap) {
+ file_loader = std::unique_ptr<llama_file_loader>(new llama_file_loader(fname_base.c_str(), tensors_map));
  if (!llama_mmap::SUPPORTED) {
  use_mmap = false;
  }
- if (use_mmap && alignment_prevents_mmap()) {
- fprintf(stderr, "llama.cpp: can't use mmap because tensors are not aligned; convert to new format to avoid this\n");
- use_mmap = false;
- }
  this->use_mmap = use_mmap;
- for (llama_load_tensor & lt : tensors_map.tensors) {
- lt.calc_all();
- }
- }
-
- bool alignment_prevents_mmap() {
- for (const llama_load_tensor & lt : tensors_map.tensors) {
- for (const llama_load_tensor_shard & shard : lt.shards) {
- if (shard.file_off & 3) {
- return true;
- }
- }
- }
- return false;
- }
-
- uint32_t guess_n_parts() const {
- auto it = tensors_map.name_to_idx.find("tok_embeddings.weight");
- if (it == tensors_map.name_to_idx.end()) {
- throw std::runtime_error(std::string("missing tok_embeddings.weight"));
- }
- const llama_load_tensor & lt = tensors_map.tensors.at(it->second);
- return file_loaders.at(0)->hparams.n_embd / lt.shards.at(0).ne.at(0);
  }

  void calc_sizes(size_t * ctx_size_p, size_t * mmapped_size_p) const {
@@ -774,7 +678,7 @@ struct llama_model_loader {
  }

  if (use_mmap) {
- mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size, ggml_is_numa()));
+ mapping.reset(new llama_mmap(&file_loader->file, prefetch_size, ggml_is_numa()));
  if (lmlock) {
  lmlock->init(mapping->addr);
  }
@@ -830,45 +734,13 @@ struct llama_model_loader {

  void load_data_for(llama_load_tensor & lt) {
  if (use_mmap) {
- LLAMA_ASSERT(lt.shards.size() == 1);
- lt.data = (uint8_t *) mapping->addr + lt.shards.at(0).file_off;
- } else if (lt.split_type == SPLIT_NONE) {
- llama_file & file = file_loaders.at(lt.shards.at(0).file_idx)->file;
- file.seek(lt.shards.at(0).file_off, SEEK_SET);
+ lt.data = (uint8_t *) mapping->addr + lt.file_off;
+ } else {
+ llama_file & file = file_loader->file;
+ file.seek(lt.file_off, SEEK_SET);
  file.read_raw(lt.data, lt.size);
- } else if (lt.split_type == SPLIT_BY_ROWS) {
- size_t offset = 0;
- for (llama_load_tensor_shard & shard : lt.shards) {
- llama_file & file = file_loaders.at(shard.file_idx)->file;
- file.seek(shard.file_off, SEEK_SET);
- file.read_raw(lt.data + offset, shard.size);
- offset += shard.size;
- }
- LLAMA_ASSERT(offset == lt.size);
- } else if (lt.split_type == SPLIT_BY_COLUMNS) {
- // Let's load the data into temporary buffers to ensure the OS performs large loads.
- std::vector<llama_buffer> tmp_bufs(lt.shards.size());
- for (size_t i = 0; i < lt.shards.size(); i++) {
- llama_load_tensor_shard & shard = lt.shards.at(i);
- llama_file & file = file_loaders.at(shard.file_idx)->file;
- file.seek(shard.file_off, SEEK_SET);
- tmp_bufs.at(i).resize(shard.size);
- file.read_raw(tmp_bufs.at(i).addr, shard.size);
- }
- // Then reshape.
- size_t num_rows = lt.ne.at(1);
- size_t per_shard_row_size = lt.shards.at(0).size / num_rows;
- size_t out_offset = 0;
- for (size_t row = 0; row < num_rows; row++) {
- for (llama_buffer & tmp_buf : tmp_bufs) {
- memcpy(lt.data + out_offset,
- tmp_buf.addr + row * per_shard_row_size,
- per_shard_row_size);
- out_offset += per_shard_row_size;
- }
- }
- LLAMA_ASSERT(out_offset == lt.size);
  }
+
  if (0) {
  print_checksum(lt);
  }
@@ -938,7 +810,7 @@ static bool kv_cache_init(

  struct llama_context_params llama_context_default_params() {
  struct llama_context_params result = {
- /*.seed =*/ -1,
+ /*.seed =*/ LLAMA_DEFAULT_SEED,
  /*.n_ctx =*/ 512,
  /*.n_batch =*/ 512,
  /*.gpu_layers =*/ 0,
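Note: the default seed is now the unsigned LLAMA_DEFAULT_SEED sentinel from llama.h rather than -1, so callers that used a negative seed to request time-based seeding should switch to the sentinel. A hedged usage sketch (assumes #include "llama.h"; not part of the diff):

    struct llama_context_params params = llama_context_default_params();
    // LLAMA_DEFAULT_SEED asks llama.cpp to seed the RNG from time(NULL);
    // any other uint32_t value is used verbatim.
    params.seed = 42;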
@@ -1067,12 +939,12 @@ static void llama_model_load_internal(

  model.t_start_us = ggml_time_us();

- std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap, vocab_only));
+ std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap));

- vocab = std::move(ml->file_loaders.at(0)->vocab);
- model.hparams = ml->file_loaders.at(0)->hparams;
+ vocab = std::move(ml->file_loader->vocab);
+ model.hparams = ml->file_loader->hparams;
  model.n_gpu_layers = n_gpu_layers;
- llama_file_version file_version = ml->file_loaders.at(0)->file_version;
+ llama_file_version file_version = ml->file_loader->file_version;
  auto & hparams = model.hparams;

  {
@@ -1106,7 +978,6 @@ static void llama_model_load_internal(
  fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot);
  fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
  fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
- fprintf(stderr, "%s: n_parts = %zu\n", __func__, ml->file_loaders.size());
  fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
  }

@@ -1274,14 +1145,18 @@ static void llama_model_load_internal(
  fprintf(stderr, "%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
  ggml_cuda_set_scratch_size(0); // disable scratch
  } else {
- vram_scratch = n_batch * MB;
+ const size_t vram_scratch_base = VRAM_REQ_SCRATCH_BASE().at(model.type);
+ const size_t vram_scratch_per_context = VRAM_REQ_SCRATCH_PER_CONTEXT().at(model.type);
+ vram_scratch = n_batch * (vram_scratch_base + n_ctx * vram_scratch_per_context);
  ggml_cuda_set_scratch_size(vram_scratch);
  if (n_gpu_layers > 0) {
- fprintf(stderr, "%s: allocating batch_size x 1 MB = %zd MB VRAM for the scratch buffer\n",
- __func__, vram_scratch / MB);
+ fprintf(stderr, "%s: allocating batch_size x (%zd kB + n_ctx x %zd B) = %zd MB VRAM for the scratch buffer\n",
+ __func__, vram_scratch_base / kB, vram_scratch_per_context,
+ (vram_scratch + MB - 1) / MB); // round up
  }
  }
  #endif // GGML_USE_CUBLAS
+
  #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
  const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));

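Note: the old code reserved a flat n_batch * 1 MB of scratch; the new formula scales with model size and context. A worked comparison for MODEL_7B, assuming n_batch = 512 and n_ctx = 2048 (table values from earlier in the diff):

    // old: vram_scratch = n_batch * MB
    //      = 512 * 1048576              = 536870912 bytes (512 MB)
    // new: vram_scratch = n_batch * (base + n_ctx * per_context)
    //      = 512 * (512*1024 + 2048*128)
    //      = 512 * 786432               = 402653184 bytes (384 MB)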
@@ -1290,6 +1165,10 @@ static void llama_model_load_internal(
  fprintf(stderr, "%s: offloading non-repeating layers to GPU\n", __func__);
  }
  size_t vram_kv_cache = 0;
+
+ #ifdef GGML_USE_CUBLAS
+ const int max_backend_supported_layers = hparams.n_layer + 3;
+ const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
  if (n_gpu_layers > (int) hparams.n_layer + 1) {
  if (low_vram) {
  fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
@@ -1306,14 +1185,18 @@ static void llama_model_load_internal(
  vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
  }
  }
- const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
+ #elif defined(GGML_USE_CLBLAST)
+ const int max_backend_supported_layers = hparams.n_layer + 1;
+ const int max_offloadable_layers = hparams.n_layer + 1;
+ #endif // GGML_USE_CUBLAS
+
  fprintf(stderr, "%s: offloaded %d/%d layers to GPU\n",
- __func__, std::min(n_gpu_layers, max_offloadable_layers), hparams.n_layer + 3);
+ __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
  fprintf(stderr, "%s: total VRAM used: %zu MB\n",
  __func__, (vram_weights + vram_scratch + vram_kv_cache + MB - 1) / MB); // round up
  #else
  (void) n_gpu_layers;
- #endif
+ #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
  }

  // populate `tensors_by_name`
@@ -1369,22 +1252,26 @@ static bool llama_model_load(

  // evaluate the transformer
  //
- // - lctx: llama context
- // - tokens: new batch of tokens to process
- // - n_past: the context size so far
- // - n_threads: number of threads to use
- // - cgraph_fname: filename of the exported computation graph
+ // - lctx: llama context
+ // - tokens: new batch of tokens to process
+ // - embd embeddings input
+ // - n_tokens number of tokens
+ // - n_past: the context size so far
+ // - n_threads: number of threads to use
  //
  static bool llama_eval_internal(
- llama_context & lctx,
- const llama_token * tokens,
- const int n_tokens,
- const int n_past,
- const int n_threads,
+ llama_context & lctx,
+ const llama_token * tokens,
+ const float * embd,
+ const int n_tokens,
+ const int n_past,
+ const int n_threads,
  const char * cgraph_fname) {

+ LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
+
  // enforce that the first token is BOS
- if (n_past == 0 && tokens[0] != llama_token_bos()) {
+ if (tokens && n_past == 0 && tokens[0] != llama_token_bos()) {
  fprintf(stderr, "%s: first token must be BOS\n", __func__);
  return false;
  }
@@ -1424,12 +1311,18 @@ static bool llama_eval_internal(
  ggml_cgraph gf = {};
  gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;

- struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
- ggml_set_name(embd, "embd");
- memcpy(embd->data, tokens, N*ggml_element_size(embd));
-
  struct ggml_tensor * cur;
- struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
+ struct ggml_tensor * inpL;
+
+ if (tokens) {
+ struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+ ggml_set_name(embd, "embd");
+ memcpy(embd->data, tokens, N*ggml_element_size(embd));
+ inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
+ } else {
+ inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
+ memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
+ }

  const int i_gpu_start = n_layer - n_gpu_layers;
  (void) i_gpu_start;
@@ -2012,10 +1905,10 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can
  return;
  }

- const int64_t t_start_sample_us = ggml_time_us();
-
  llama_sample_softmax(ctx, candidates);

+ const int64_t t_start_sample_us = ggml_time_us();
+
  // Compute the cumulative probabilities
  float cum_sum = 0.0f;
  size_t last_idx = candidates->size;
@@ -2044,9 +1937,8 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *
  return;
  }

- const int64_t t_start_sample_us = ggml_time_us();
-
  llama_sample_softmax(nullptr, candidates);
+ const int64_t t_start_sample_us = ggml_time_us();

  // Compute the first and second derivatives
  std::vector<float> first_derivatives(candidates->size - 1);
@@ -2098,11 +1990,11 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
  return;
  }

- const int64_t t_start_sample_us = ggml_time_us();
-
  // Compute the softmax of logits and calculate entropy
  llama_sample_softmax(nullptr, candidates);

+ const int64_t t_start_sample_us = ggml_time_us();
+
  float entropy = 0.0f;
  for (size_t i = 0; i < candidates->size; ++i) {
  entropy += -candidates->data[i].p * logf(candidates->data[i].p);
@@ -2271,13 +2163,11 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_

  if (ctx) {
  ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
- ctx->n_sample++;
  }
  return X;
  }

  llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu) {
- assert(ctx);
  int64_t t_start_sample_us;
  t_start_sample_us = ggml_time_us();

@@ -2292,13 +2182,14 @@ llama_token llama_sample_token_mirostat_v2(struct llama_tok
  candidates->size = 1;
  }

+ if (ctx) {
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+ }
+
  // Normalize the probabilities of the remaining words
  llama_sample_softmax(ctx, candidates);

  // Sample the next word X from the remaining words
- if (ctx) {
- ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
- }
  llama_token X = llama_sample_token(ctx, candidates);
  t_start_sample_us = ggml_time_us();

@@ -2366,10 +2257,10 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llam
  }
  float * f32_output = (float *) output.addr;

- quantize_fns_t qtype;
+ ggml_type_traits_t qtype;
  if (ggml_is_quantized(tensor.type)) {
- qtype = ggml_internal_get_quantize_fn(tensor.type);
- if (qtype.dequantize_row_q == NULL) {
+ qtype = ggml_internal_get_type_traits(tensor.type);
+ if (qtype.to_float == NULL) {
  throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor.type)));
  }
  } else if (tensor.type != GGML_TYPE_F16) {
@@ -2380,7 +2271,7 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llam
  if (tensor.type == GGML_TYPE_F16) {
  ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor.data, f32_output, nelements);
  } else if (ggml_is_quantized(tensor.type)) {
- qtype.dequantize_row_q(tensor.data, f32_output, nelements);
+ qtype.to_float(tensor.data, f32_output, nelements);
  } else {
  LLAMA_ASSERT(false); // unreachable
  }
@@ -2405,7 +2296,7 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llam
  if (typ == GGML_TYPE_F16) {
  ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
  } else {
- qtype.dequantize_row_q(inbuf, outbuf, nels);
+ qtype.to_float(inbuf, outbuf, nels);
  }
  };
  workers.push_back(std::thread(compute, tensor.type, tensor.data + in_buff_offs, f32_output + out_buff_offs, thr_elems));
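Note: the quantization hooks were renamed in ggml: ggml_internal_get_quantize_fn / dequantize_row_q became ggml_internal_get_type_traits / to_float, as used above. A hedged sketch of dequantizing one buffer with the new names (illustrative; `src`, `dst_f32` and `nelements` are hypothetical caller-provided values):

    // dst_f32 must hold nelements floats; src holds the quantized data.
    ggml_type_traits_t traits = ggml_internal_get_type_traits(GGML_TYPE_Q4_0);
    if (traits.to_float != NULL) {
        traits.to_float(src, dst_f32, nelements);
    }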
@@ -2451,9 +2342,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  nthread = std::thread::hardware_concurrency();
  }

- std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false,
- /*vocab_only*/ false));
- llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), params->ftype);
+ std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false));
+ llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loader.get(), params->ftype);

  #ifdef GGML_USE_K_QUANTS
  int n_attention_wv = 0;
@@ -2654,6 +2544,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  }
  }

+
+
  //
  // interface implementation
  //
@@ -2692,7 +2584,7 @@ struct llama_context * llama_new_context_with_model(

  llama_context * ctx = new llama_context(*model, model->vocab);

- if (params.seed < 0) {
+ if (params.seed == LLAMA_DEFAULT_SEED) {
  params.seed = time(NULL);
  }

@@ -2874,7 +2766,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const

  // create a name -> tensor map of the model to accelerate lookups
  std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
- for (auto & kv: model.tensors_by_name) {
+ for (const auto & kv: model.tensors_by_name) {
  model_tensors.insert(kv);
  }

@@ -2885,7 +2777,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
  llama_buffer base_buf;
  if (path_base_model) {
  fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
- model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false));
+ model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true));

  size_t ctx_size;
  size_t mmapped_size;
@@ -2903,7 +2795,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const

  // maybe this should in llama_model_loader
  if (model_loader->use_mmap) {
- model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0, ggml_is_numa()));
+ model_loader->mapping.reset(new llama_mmap(&model_loader->file_loader->file, /* prefetch */ 0, ggml_is_numa()));
  }
  }

@@ -2964,7 +2856,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
  return false;
  }
  }
- ggml_tensor* lora_tensor;
+ ggml_tensor * lora_tensor;
  if (n_dims == 2) {
  lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]);
  }
@@ -2972,6 +2864,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
  fprintf(stderr, "%s: unsupported tensor dimension %d\n", __func__, n_dims);
  return 1;
  }
+ ggml_set_name(lora_tensor, "lora_tensor");

  // load tensor data
  size_t offset = fin.tellg();
@@ -2987,6 +2880,21 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
  lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) {

  ggml_tensor * dest_t = model_tensors[base_name];
+
+ offload_func_t offload_func = llama_nop;
+ offload_func_t offload_func_force_inplace = llama_nop;
+
+ #ifdef GGML_USE_CUBLAS
+ if (dest_t->backend == GGML_BACKEND_GPU || dest_t->backend == GGML_BACKEND_GPU_SPLIT) {
+ if (dest_t->type != GGML_TYPE_F16) {
+ throw std::runtime_error(format(
+ "%s: error: the simultaneous use of LoRAs and GPU acceleration is only supported for f16 models", __func__));
+ }
+ offload_func = ggml_cuda_assign_buffers;
+ offload_func_force_inplace = ggml_cuda_assign_buffers_force_inplace;
+ }
+ #endif // GGML_USE_CUBLAS
+
  ggml_tensor * base_t;
  if (model_loader) {
  // load from base model
@@ -3014,7 +2922,12 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
  }

  ggml_tensor * loraA = lora_tensors[base_name + ".loraA"];
+ GGML_ASSERT(loraA->type == GGML_TYPE_F32);
+ ggml_set_name(loraA, "loraA");
+
  ggml_tensor * loraB = lora_tensors[base_name + ".loraB"];
+ GGML_ASSERT(loraB->type == GGML_TYPE_F32);
+ ggml_set_name(loraB, "loraB");

  if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
  fprintf(stderr, "%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
@@ -3024,19 +2937,32 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const

  // w = w + BA*s
  ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
+ offload_func(BA);
+ ggml_set_name(BA, "BA");

  if (scaling != 1.0f) {
  ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
+ ggml_set_name(scale_tensor, "scale_tensor");
+
  BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor);
+ offload_func(BA);
+ ggml_set_name(BA, "BA_scaled");
  }

  ggml_tensor * r;
  if (base_t == dest_t) {
  r = ggml_add_inplace(lora_ctx, dest_t, BA);
+ offload_func_force_inplace(r);
+ ggml_set_name(r, "r_add_inplace");
  }
  else {
  r = ggml_add(lora_ctx, base_t, BA);
+ offload_func(r);
+ ggml_set_name(r, "r_add");
+
  r = ggml_cpy(lora_ctx, r, dest_t);
+ offload_func(r);
+ ggml_set_name(r, "r_cpy");
  }

  struct ggml_cgraph gf = ggml_build_forward(r);
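Note: the block above is the LoRA merge itself. In the notation of the `// w = w + BA*s` comment it applies

    w  <-  w + scaling * BA,   where BA is the ggml_mul_mat product of the loraA and loraB factors

and the new offload_func / ggml_set_name calls keep the intermediate tensors on the GPU (when CUBLAS is enabled and the weight is f16) and give them debuggable names.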
@@ -3091,8 +3017,8 @@ int llama_get_kv_cache_token_count(const struct llama_context * ctx) {

  #define LLAMA_MAX_RNG_STATE (64*1024)

- void llama_set_rng_seed(struct llama_context * ctx, int seed) {
- if (seed < 0) {
+ void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed) {
+ if (seed == LLAMA_DEFAULT_SEED) {
  seed = time(NULL);
  }
  ctx->rng.seed(seed);
@@ -3336,7 +3262,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
  return nread;
  }

- bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+ static bool llama_load_session_file_internal(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
  llama_file file(path_session, "rb");

  // sanity checks
@@ -3390,6 +3316,15 @@ bool llama_load_session_file(struct llama_context * ctx, const char * path_sessi
  return true;
  }

+ bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+ try {
+ return llama_load_session_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
+ } catch (const std::exception & err) {
+ fprintf(stderr, "error loading session file: %s\n", err.what());
+ return false;
+ }
+ }
+
  bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
  llama_file file(path_session, "wb");

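Note: llama_load_session_file is now a thin wrapper that converts loader exceptions into a false return, so callers only need to check the boolean. A hedged usage sketch (buffer sizing is the caller's choice; n_ctx tokens is a common upper bound; not part of the diff):

    std::vector<llama_token> tokens(llama_n_ctx(ctx));
    size_t n_token_count = 0;
    if (!llama_load_session_file(ctx, "session.bin", tokens.data(), tokens.size(), &n_token_count)) {
        fprintf(stderr, "no usable session at session.bin, starting fresh\n");
    }
    tokens.resize(n_token_count);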
@@ -3421,7 +3356,29 @@ int llama_eval(
  int n_tokens,
  int n_past,
  int n_threads) {
- if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads, nullptr)) {
+ if (!llama_eval_internal(*ctx, tokens, nullptr, n_tokens, n_past, n_threads, nullptr)) {
+ fprintf(stderr, "%s: failed to eval\n", __func__);
+ return 1;
+ }
+
+ // get a more accurate load time, upon first eval
+ // TODO: fix this
+ if (!ctx->has_evaluated_once) {
+ ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
+ ctx->has_evaluated_once = true;
+ }
+
+ return 0;
+ }
+
+
+ int llama_eval_embd(
+ struct llama_context * ctx,
+ const float * embd,
+ int n_tokens,
+ int n_past,
+ int n_threads) {
+ if (!llama_eval_internal(*ctx, nullptr, embd, n_tokens, n_past, n_threads, nullptr)) {
  fprintf(stderr, "%s: failed to eval\n", __func__);
  return 1;
  }
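Note: llama_eval_embd mirrors llama_eval but takes pre-computed input embeddings instead of token ids; exactly one of the two may be passed to llama_eval_internal, per the assertion added earlier in the diff. A hedged usage sketch (illustrative caller, not part of the diff):

    const int n_tokens = 4;
    std::vector<float> embd(n_tokens * llama_n_embd(ctx));
    // ... fill embd with one llama_n_embd(ctx)-sized row per token ...
    if (llama_eval_embd(ctx, embd.data(), n_tokens, /*n_past=*/0, /*n_threads=*/4) != 0) {
        fprintf(stderr, "llama_eval_embd failed\n");
    }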
@@ -3442,7 +3399,7 @@ int llama_eval_export(struct llama_context * ctx, const char * fname) {

  const std::vector<llama_token> tmp(n_batch, llama_token_bos());

- if (!llama_eval_internal(*ctx, tmp.data(), tmp.size(), n_ctx, 1, fname)) {
+ if (!llama_eval_internal(*ctx, tmp.data(), nullptr, tmp.size(), n_ctx, 1, fname)) {
  fprintf(stderr, "%s: failed to eval\n", __func__);
  return 1;
  }
@@ -3523,23 +3480,35 @@ llama_token llama_token_nl() {
  return 13;
  }

+ struct llama_timings llama_get_timings(struct llama_context * ctx) {
+ struct llama_timings result = {
+ /*.t_start_ms =*/ 1e-3 * ctx->t_start_us,
+ /*.t_end_ms =*/ 1.00 * ggml_time_ms(),
+ /*.t_load_ms =*/ 1e-3 * ctx->t_load_us,
+ /*.t_sample_ms =*/ 1e-3 * ctx->t_sample_us,
+ /*.t_p_eval_ms =*/ 1e-3 * ctx->t_p_eval_us,
+ /*.t_eval_ms =*/ 1e-3 * ctx->t_eval_us,

- void llama_print_timings(struct llama_context * ctx) {
- const int64_t t_end_us = ggml_time_us();
+ /*.n_sample =*/ std::max(1, ctx->n_sample),
+ /*.n_p_eval =*/ std::max(1, ctx->n_p_eval),
+ /*.n_eval =*/ std::max(1, ctx->n_eval),
+ };

- const int32_t n_sample = std::max(1, ctx->n_sample);
- const int32_t n_eval = std::max(1, ctx->n_eval);
- const int32_t n_p_eval = std::max(1, ctx->n_p_eval);
+ return result;
+ }
+
+ void llama_print_timings(struct llama_context * ctx) {
+ const llama_timings timings = llama_get_timings(ctx);

  fprintf(stderr, "\n");
- fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
+ fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, timings.t_load_ms);
  fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
- __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample, 1e6 / ctx->t_sample_us * n_sample);
+ __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
  fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
- __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval, 1e6 / ctx->t_p_eval_us * n_p_eval);
+ __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
  fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
- __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval, 1e6 / ctx->t_eval_us * n_eval);
- fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0);
+ __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
+ fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
  }

  void llama_reset_timings(struct llama_context * ctx) {
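Note: llama_get_timings exposes the counters that llama_print_timings previously formatted internally, so throughput can be reported programmatically. A hedged usage sketch using only the fields shown in the hunk above (illustrative, not part of the diff):

    const struct llama_timings timings = llama_get_timings(ctx);
    fprintf(stderr, "eval: %.2f ms/token, %.2f tokens/s\n",
            timings.t_eval_ms / timings.n_eval,
            1e3 / timings.t_eval_ms * timings.n_eval);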