llama_cpp 0.1.4 → 0.2.1

This diff shows the content changes between publicly available package versions released to one of the supported registries, as they appear in those public registries. It is provided for informational purposes only.
@@ -16,6 +16,10 @@
16
16
  #include "ggml-opencl.h"
17
17
  #endif
18
18
 
19
+ #ifdef GGML_USE_METAL
20
+ #include "ggml-metal.h"
21
+ #endif
22
+
19
23
  #include <array>
20
24
  #include <ctime>
21
25
  #include <cinttypes>
@@ -49,17 +53,22 @@ enum e_model {
49
53
  MODEL_65B,
50
54
  };
51
55
 
52
-
53
56
  static const size_t MB = 1024*1024;
54
57
 
55
58
  // computed for n_ctx == 2048
56
59
  // TODO: dynamically determine these sizes
57
60
  // needs modifications in ggml
58
61
 
62
+ typedef void (*offload_func_t)(struct ggml_tensor * tensor);
63
+
64
+ void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
65
+ (void) tensor;
66
+ }
67
+
59
68
  static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
60
69
  {
61
70
  static std::map<e_model, size_t> k_sizes = {
62
- { MODEL_3B, 128ull * MB },
71
+ { MODEL_3B, 256ull * MB },
63
72
  { MODEL_7B, 512ull * MB },
64
73
  { MODEL_13B, 512ull * MB },
65
74
  { MODEL_30B, 512ull * MB },
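
The hunk above introduces offload_func_t and llama_nop: tensors are handed to a function pointer that defaults to a no-op and is only swapped for a real offload routine when a GPU backend is active. Below is a minimal standalone sketch of that pattern; it is not part of the package, and tensor, offload_gpu and gpu_enabled are hypothetical stand-ins.

// Editorial sketch of the offload-hook pattern; not code from llama_cpp.
#include <cstdio>

struct tensor { const char * name; };          // stand-in for ggml_tensor

typedef void (*offload_func_t)(tensor * t);

void offload_nop(tensor * t) { (void) t; }     // default: leave the tensor on the CPU
void offload_gpu(tensor * t) { std::printf("offloading %s\n", t->name); }

int main() {
    const bool gpu_enabled = true;             // e.g. a GPU backend was compiled in
    offload_func_t offload = gpu_enabled ? offload_gpu : offload_nop;

    tensor wq = { "wq" };
    offload(&wq);                              // call sites stay free of #ifdefs
    return 0;
}
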
@@ -71,7 +80,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
71
80
  static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
72
81
  {
73
82
  static std::map<e_model, size_t> k_sizes = {
74
- { MODEL_3B, 128ull * MB },
83
+ { MODEL_3B, 256ull * MB },
75
84
  { MODEL_7B, 512ull * MB },
76
85
  { MODEL_13B, 512ull * MB },
77
86
  { MODEL_30B, 512ull * MB },
@@ -156,6 +165,11 @@ struct llama_kv_cache {
156
165
  if (ctx) {
157
166
  ggml_free(ctx);
158
167
  }
168
+
169
+ #ifdef GGML_USE_CUBLAS
170
+ ggml_cuda_free_data(k);
171
+ ggml_cuda_free_data(v);
172
+ #endif // GGML_USE_CUBLAS
159
173
  }
160
174
  };
161
175
 
@@ -170,6 +184,7 @@ struct llama_model {
170
184
  struct ggml_tensor * output;
171
185
 
172
186
  std::vector<llama_layer> layers;
187
+ int n_gpu_layers;
173
188
 
174
189
  // context
175
190
  struct ggml_context * ctx = NULL;
@@ -195,6 +210,17 @@ struct llama_model {
195
210
  if (ctx) {
196
211
  ggml_free(ctx);
197
212
  }
213
+
214
+ #ifdef GGML_USE_CUBLAS
215
+ for (size_t i = 0; i < tensors_by_name.size(); ++i) {
216
+ ggml_cuda_free_data(tensors_by_name[i].second);
217
+ }
218
+ ggml_cuda_free_scratch();
219
+ #elif defined(GGML_USE_CLBLAST)
220
+ for (size_t i = 0; i < tensors_by_name.size(); ++i) {
221
+ ggml_cl_free_data(tensors_by_name[i].second);
222
+ }
223
+ #endif
198
224
  }
199
225
  };
200
226
 
@@ -243,6 +269,10 @@ struct llama_context {
243
269
  llama_ctx_buffer buf_compute;
244
270
  llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
245
271
 
272
+ #ifdef GGML_USE_METAL
273
+ ggml_metal_context * ctx_metal = NULL;
274
+ #endif
275
+
246
276
  int buf_last = 0;
247
277
  size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
248
278
 
@@ -282,15 +312,15 @@ template <typename T>
282
312
  static T checked_mul(T a, T b) {
283
313
  T ret = a * b;
284
314
  if (a != 0 && ret / a != b) {
285
- throw format("overflow multiplying %llu * %llu",
286
- (unsigned long long) a, (unsigned long long) b);
315
+ throw std::runtime_error(format("overflow multiplying %llu * %llu",
316
+ (unsigned long long) a, (unsigned long long) b));
287
317
  }
288
318
  return ret;
289
319
  }
290
320
 
291
321
  static size_t checked_div(size_t a, size_t b) {
292
322
  if (b == 0 || a % b != 0) {
293
- throw format("error dividing %zu / %zu", a, b);
323
+ throw std::runtime_error(format("error dividing %zu / %zu", a, b));
294
324
  }
295
325
  return a / b;
296
326
  }
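
checked_mul keeps its existing overflow test (a != 0 && ret / a != b) and now reports the failure as std::runtime_error instead of a bare std::string. As a standalone illustration of why the division round-trip catches wrap-around, a small sketch that is not package code:

// Editorial sketch: detecting unsigned multiplication overflow by dividing back.
#include <cstdint>
#include <cstdio>

static bool mul_overflows(uint64_t a, uint64_t b, uint64_t & out) {
    out = a * b;                       // wraps modulo 2^64 on overflow
    return a != 0 && out / a != b;     // the round-trip fails exactly when it wrapped
}

int main() {
    uint64_t r;
    std::printf("%d\n", mul_overflows(1ull << 32, 1ull << 31, r)); // 0: 2^63 fits
    std::printf("%d\n", mul_overflows(1ull << 33, 1ull << 31, r)); // 1: 2^64 wrapped to 0
    return 0;
}
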
@@ -354,7 +384,7 @@ struct llama_load_tensor {
354
384
  const auto & first_shard = shards.at(0);
355
385
  for (const auto & shard : shards) {
356
386
  if (shard.type != first_shard.type) {
357
- throw format("inconsistent tensor shard type in '%s'", name.c_str());
387
+ throw std::runtime_error(format("inconsistent tensor shard type in '%s'", name.c_str()));
358
388
  }
359
389
  }
360
390
  type = first_shard.type;
@@ -377,8 +407,8 @@ struct llama_load_tensor {
377
407
  const auto & first_shard = shards.at(0);
378
408
  for (const auto & shard : shards) {
379
409
  if (shard.ne != first_shard.ne) {
380
- throw format("inconsistent tensor shard shape in '%s': first was %s, other was %s",
381
- name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str());
410
+ throw std::runtime_error(format("inconsistent tensor shard shape in '%s': first was %s, other was %s",
411
+ name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str()));
382
412
  }
383
413
  }
384
414
  ne = first_shard.ne;
@@ -456,8 +486,8 @@ struct llama_file_loader {
456
486
  }
457
487
  }
458
488
 
459
- throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
460
- magic, version);
489
+ throw std::runtime_error(format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
490
+ magic, version));
461
491
  }
462
492
  void read_hparams() {
463
493
  hparams.n_vocab = file.read_u32();
@@ -497,7 +527,7 @@ struct llama_file_loader {
497
527
  file.read_raw(shard.ne.data(), sizeof(shard.ne[0]) * n_dims);
498
528
  std::string name = file.read_string(name_len);
499
529
  if (n_dims < 1 || n_dims > 2) {
500
- throw format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims);
530
+ throw std::runtime_error(format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims));
501
531
  }
502
532
  switch (shard.type) {
503
533
  case GGML_TYPE_F32:
@@ -507,9 +537,14 @@ struct llama_file_loader {
507
537
  case GGML_TYPE_Q5_0:
508
538
  case GGML_TYPE_Q5_1:
509
539
  case GGML_TYPE_Q8_0:
540
+ case GGML_TYPE_Q2_K:
541
+ case GGML_TYPE_Q3_K:
542
+ case GGML_TYPE_Q4_K:
543
+ case GGML_TYPE_Q5_K:
544
+ case GGML_TYPE_Q6_K:
510
545
  break;
511
546
  default: {
512
- throw format("unrecognized tensor type %u\n", shard.type);
547
+ throw std::runtime_error(format("unrecognized tensor type %u\n", shard.type));
513
548
  }
514
549
  }
515
550
 
@@ -582,6 +617,11 @@ struct llama_file_saver {
582
617
  case GGML_TYPE_Q5_0:
583
618
  case GGML_TYPE_Q5_1:
584
619
  case GGML_TYPE_Q8_0:
620
+ case GGML_TYPE_Q2_K:
621
+ case GGML_TYPE_Q3_K:
622
+ case GGML_TYPE_Q4_K:
623
+ case GGML_TYPE_Q5_K:
624
+ case GGML_TYPE_Q6_K:
585
625
  break;
586
626
  default: LLAMA_ASSERT(false);
587
627
  }
@@ -613,7 +653,7 @@ struct llama_model_loader {
613
653
  auto * ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
614
654
  file_loaders.emplace_back(ith_file);
615
655
  if (ith_file->hparams != first_file->hparams) {
616
- throw format("llama.cpp: hparams inconsistent between files");
656
+ throw std::runtime_error(format("llama.cpp: hparams inconsistent between files"));
617
657
  }
618
658
  }
619
659
  if (!llama_mmap::SUPPORTED) {
@@ -643,7 +683,7 @@ struct llama_model_loader {
643
683
  uint32_t guess_n_parts() const {
644
684
  auto it = tensors_map.name_to_idx.find("tok_embeddings.weight");
645
685
  if (it == tensors_map.name_to_idx.end()) {
646
- throw std::string("missing tok_embeddings.weight");
686
+ throw std::runtime_error(std::string("missing tok_embeddings.weight"));
647
687
  }
648
688
  const llama_load_tensor & lt = tensors_map.tensors.at(it->second);
649
689
  return file_loaders.at(0)->hparams.n_embd / lt.shards.at(0).ne.at(0);
@@ -660,12 +700,12 @@ struct llama_model_loader {
660
700
  struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
661
701
  auto it = tensors_map.name_to_idx.find(name);
662
702
  if (it == tensors_map.name_to_idx.end()) {
663
- throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
703
+ throw std::runtime_error(std::runtime_error(format("llama.cpp: tensor '%s' is missing from model", name.c_str())));
664
704
  }
665
705
  llama_load_tensor & lt = tensors_map.tensors.at(it->second);
666
706
  if (lt.ne != ne) {
667
- throw format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
668
- name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
707
+ throw std::runtime_error(format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
708
+ name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str()));
669
709
  }
670
710
 
671
711
  return get_tensor_for(lt, backend);
@@ -673,6 +713,9 @@ struct llama_model_loader {
673
713
 
674
714
  struct ggml_tensor * get_tensor_for(llama_load_tensor & lt, ggml_backend backend) {
675
715
  struct ggml_tensor * tensor;
716
+ if (backend != GGML_BACKEND_CPU) {
717
+ ggml_set_no_alloc(ggml_ctx, true);
718
+ }
676
719
  if (lt.ne.size() == 2) {
677
720
  tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
678
721
  } else {
@@ -681,6 +724,10 @@ struct llama_model_loader {
681
724
  }
682
725
  ggml_set_name(tensor, lt.name.c_str());
683
726
  LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
727
+
728
+ if (backend != GGML_BACKEND_CPU) {
729
+ ggml_set_no_alloc(ggml_ctx, use_mmap);
730
+ }
684
731
  tensor->backend = backend;
685
732
  lt.ggml_tensor = tensor;
686
733
  num_ggml_tensors_created++;
@@ -689,13 +736,14 @@ struct llama_model_loader {
689
736
 
690
737
  void done_getting_tensors() const {
691
738
  if (num_ggml_tensors_created != tensors_map.tensors.size()) {
692
- throw std::string("llama.cpp: file contained more tensors than expected");
739
+ throw std::runtime_error(std::string("llama.cpp: file contained more tensors than expected"));
693
740
  }
694
741
  }
695
742
 
696
743
  void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
697
744
  size_t data_size = 0;
698
745
  size_t prefetch_size = 0;
746
+ size_t lock_size = 0;
699
747
  for (const llama_load_tensor & lt : tensors_map.tensors) {
700
748
  data_size += lt.size;
701
749
  if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
@@ -705,11 +753,6 @@ struct llama_model_loader {
705
753
 
706
754
  if (use_mmap) {
707
755
  mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
708
- if (!lmlock) {
709
- // Don't call the callback since the actual loading will be lazy
710
- // and we can't measure it.
711
- progress_callback = NULL;
712
- }
713
756
  if (lmlock) {
714
757
  lmlock->init(mapping->addr);
715
758
  }
@@ -717,20 +760,49 @@ struct llama_model_loader {
717
760
 
718
761
  size_t done_size = 0;
719
762
  for (llama_load_tensor & lt : tensors_map.tensors) {
720
- if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) {
721
- continue;
722
- }
723
763
  if (progress_callback) {
724
764
  progress_callback((float) done_size / data_size, progress_callback_user_data);
725
765
  }
726
766
  LLAMA_ASSERT(lt.ggml_tensor); // unused tensors should have been caught by load_data already
727
767
  lt.data = (uint8_t *) lt.ggml_tensor->data;
768
+
769
+ // allocate temp buffer if not using mmap
770
+ if (!use_mmap && lt.data == NULL) {
771
+ GGML_ASSERT(lt.ggml_tensor->backend != GGML_BACKEND_CPU);
772
+ lt.data = (uint8_t*)malloc(ggml_nbytes(lt.ggml_tensor));
773
+ }
774
+
728
775
  load_data_for(lt);
729
- lt.ggml_tensor->data = lt.data;
730
- done_size += lt.size;
731
- if (use_mmap && lmlock) {
732
- lmlock->grow_to(done_size);
776
+
777
+ switch(lt.ggml_tensor->backend) {
778
+ case GGML_BACKEND_CPU:
779
+ lt.ggml_tensor->data = lt.data;
780
+ if (use_mmap && lmlock) {
781
+ lock_size += lt.size;
782
+ lmlock->grow_to(lock_size);
783
+ }
784
+ break;
785
+ #if defined(GGML_USE_CUBLAS)
786
+ case GGML_BACKEND_GPU:
787
+ case GGML_BACKEND_GPU_SPLIT:
788
+ ggml_cuda_transform_tensor(lt.data, lt.ggml_tensor);
789
+ if (!use_mmap) {
790
+ free(lt.data);
791
+ }
792
+ break;
793
+ #elif defined(GGML_USE_CLBLAST)
794
+ case GGML_BACKEND_GPU:
795
+ ggml_cl_transform_tensor(lt.data, lt.ggml_tensor);
796
+ if (!use_mmap) {
797
+ free(lt.data);
798
+ }
799
+ break;
800
+ #endif
801
+ default:
802
+ continue;
733
803
  }
804
+
805
+ done_size += lt.size;
734
806
  }
735
807
  }
736
808
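
The reworked load_all_data loop above dispatches each tensor on its backend: CPU tensors keep pointing at the mmapped or malloc'd data (growing the mlock region as they go), while GPU tensors are handed to ggml_cuda_transform_tensor or ggml_cl_transform_tensor and, when mmap is not used, their temporary staging buffer is freed afterwards. A reduced sketch of that ownership rule follows; it is not package code, and upload_to_gpu is a hypothetical stand-in for the backend transfer call.

// Editorial sketch of the staging-buffer ownership rule in the new loader loop.
#include <cstdlib>
#include <cstring>

enum backend_t { BACKEND_CPU, BACKEND_GPU };

struct loaded_tensor {
    backend_t backend;
    size_t    nbytes;
    void *    data;      // CPU-side pointer: mmapped file or malloc'd staging buffer
};

// hypothetical stand-in for ggml_cuda_transform_tensor / ggml_cl_transform_tensor
static void upload_to_gpu(const void * src, size_t n) { (void) src; (void) n; }

static void finish_load(loaded_tensor & lt, bool use_mmap) {
    switch (lt.backend) {
        case BACKEND_CPU:
            // nothing to do: the tensor keeps pointing at lt.data
            break;
        case BACKEND_GPU:
            upload_to_gpu(lt.data, lt.nbytes);
            if (!use_mmap) {
                std::free(lt.data);   // staging copy is no longer needed after the upload
                lt.data = nullptr;
            }
            break;
    }
}

int main() {
    loaded_tensor lt{BACKEND_GPU, 16, std::malloc(16)};
    std::memset(lt.data, 0, lt.nbytes);
    finish_load(lt, /*use_mmap=*/false);
    return 0;
}
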
 
@@ -801,7 +873,8 @@ static bool kv_cache_init(
801
873
  const struct llama_hparams & hparams,
802
874
  struct llama_kv_cache & cache,
803
875
  ggml_type wtype,
804
- int n_ctx) {
876
+ int n_ctx,
877
+ int n_gpu_layers) {
805
878
  const int n_embd = hparams.n_embd;
806
879
  const int n_layer = hparams.n_layer;
807
880
 
@@ -827,13 +900,26 @@ static bool kv_cache_init(
827
900
  ggml_set_name(cache.k, "cache_k");
828
901
  ggml_set_name(cache.v, "cache_v");
829
902
 
903
+ #ifdef GGML_USE_CUBLAS
904
+ if (n_gpu_layers > n_layer + 1) {
905
+ ggml_cuda_assign_buffers_no_scratch(cache.v);
906
+ }
907
+ if (n_gpu_layers > n_layer + 2) {
908
+ ggml_cuda_assign_buffers_no_scratch(cache.k);
909
+ }
910
+ #endif // GGML_USE_CUBLAS
911
+
830
912
  return true;
831
913
  }
832
914
 
833
915
  struct llama_context_params llama_context_default_params() {
834
916
  struct llama_context_params result = {
835
917
  /*.n_ctx =*/ 512,
918
+ /*.n_batch =*/ 512,
836
919
  /*.gpu_layers =*/ 0,
920
+ /*.main_gpu =*/ 0,
921
+ /*.tensor_split =*/ {0},
922
+ /*.low_vram =*/ false,
837
923
  /*.seed =*/ -1,
838
924
  /*.f16_kv =*/ true,
839
925
  /*.logits_all =*/ false,
@@ -848,6 +934,17 @@ struct llama_context_params llama_context_default_params() {
848
934
  return result;
849
935
  }
850
936
 
937
+ struct llama_model_quantize_params llama_model_quantize_default_params() {
938
+ struct llama_model_quantize_params result = {
939
+ /*.nthread =*/ 0,
940
+ /*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
941
+ /*.allow_requantize =*/ false,
942
+ /*.quantize_output_tensor =*/ true,
943
+ };
944
+
945
+ return result;
946
+ }
947
+
851
948
  bool llama_mmap_supported() {
852
949
  return llama_mmap::SUPPORTED;
853
950
  }
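
With llama_model_quantize_default_params added above, and llama_model_quantize now taking a pointer to the params struct (see the signature change near the end of this diff), a caller would do roughly the following. This is an editorial usage sketch rather than an example shipped with the package; the file names are placeholders and the K-quant ftype assumes a build with GGML_USE_K_QUANTS.

// Editorial usage sketch of the new quantization parameter struct.
#include "llama.h"
#include <cstdio>

int main() {
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.nthread                = 8;                        // 0 would use hardware_concurrency()
    params.ftype                  = LLAMA_FTYPE_MOSTLY_Q4_K_M;
    params.allow_requantize       = false;                    // refuse already-quantized inputs
    params.quantize_output_tensor = true;

    // placeholder file names, not from the package
    if (llama_model_quantize("ggml-model-f16.bin", "ggml-model-q4_k_m.bin", &params) != 0) {
        std::fprintf(stderr, "quantization failed\n");
        return 1;
    }
    return 0;
}
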
@@ -898,6 +995,16 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
898
995
  case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
899
996
  case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
900
997
  case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
998
+ // K-quants
999
+ case LLAMA_FTYPE_MOSTLY_Q2_K: return "mostly Q2_K";
1000
+ case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "mostly Q3_K - Small";
1001
+ case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "mostly Q3_K - Medium";
1002
+ case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "mostly Q3_K - Large";
1003
+ case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "mostly Q4_K - Small";
1004
+ case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "mostly Q4_K - Medium";
1005
+ case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "mostly Q5_K - Small";
1006
+ case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "mostly Q5_K - Medium";
1007
+ case LLAMA_FTYPE_MOSTLY_Q6_K: return "mostly Q6_K";
901
1008
  default: return "unknown, may not work";
902
1009
  }
903
1010
  }
@@ -917,7 +1024,11 @@ static void llama_model_load_internal(
917
1024
  const std::string & fname,
918
1025
  llama_context & lctx,
919
1026
  int n_ctx,
1027
+ int n_batch,
920
1028
  int n_gpu_layers,
1029
+ int main_gpu,
1030
+ const float * tensor_split,
1031
+ bool low_vram,
921
1032
  ggml_type memory_type,
922
1033
  bool use_mmap,
923
1034
  bool use_mlock,
@@ -932,9 +1043,9 @@ static void llama_model_load_internal(
932
1043
  lctx.vocab = std::move(ml->file_loaders.at(0)->vocab);
933
1044
  auto & model = lctx.model;
934
1045
  model.hparams = ml->file_loaders.at(0)->hparams;
1046
+ model.n_gpu_layers = n_gpu_layers;
935
1047
  llama_file_version file_version = ml->file_loaders.at(0)->file_version;
936
1048
  auto & hparams = model.hparams;
937
- uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
938
1049
 
939
1050
  {
940
1051
  switch (hparams.n_layer) {
@@ -943,11 +1054,19 @@ static void llama_model_load_internal(
943
1054
  case 40: model.type = e_model::MODEL_13B; break;
944
1055
  case 60: model.type = e_model::MODEL_30B; break;
945
1056
  case 80: model.type = e_model::MODEL_65B; break;
1057
+ default:
1058
+ {
1059
+ if (hparams.n_layer < 32) {
1060
+ model.type = e_model::MODEL_7B;
1061
+ }
1062
+ } break;
946
1063
  }
947
1064
 
948
1065
  hparams.n_ctx = n_ctx;
949
1066
  }
950
1067
 
1068
+ const uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
1069
+
951
1070
  {
952
1071
  fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version));
953
1072
  fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab);
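
The n_ff line re-added above rounds 2*(4*n_embd)/3 up to the next multiple of n_mult. A quick standalone check, not package code, using the LLaMA-7B values n_embd = 4096 and n_mult = 256:

// Editorial worked check of the n_ff formula.
#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t n_embd = 4096, n_mult = 256;
    const uint32_t n_ff = ((2*(4*n_embd)/3 + n_mult - 1)/n_mult)*n_mult;
    std::printf("n_ff = %u\n", n_ff);   // prints 11008, the 7B feed-forward width
    return 0;
}
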
@@ -967,7 +1086,7 @@ static void llama_model_load_internal(
967
1086
  if (hparams.ftype != LLAMA_FTYPE_ALL_F32 &&
968
1087
  hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 &&
969
1088
  hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) {
970
- throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)");
1089
+ throw std::runtime_error(format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)"));
971
1090
  }
972
1091
  }
973
1092
 
@@ -975,7 +1094,7 @@ static void llama_model_load_internal(
975
1094
  if (hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
976
1095
  hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_1 ||
977
1096
  hparams.ftype == LLAMA_FTYPE_MOSTLY_Q8_0) {
978
- throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)");
1097
+ throw std::runtime_error(format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)"));
979
1098
  }
980
1099
  }
981
1100
 
@@ -1006,18 +1125,28 @@ static void llama_model_load_internal(
1006
1125
 
1007
1126
  model.ctx = ggml_init(params);
1008
1127
  if (!model.ctx) {
1009
- throw format("ggml_init() failed");
1128
+ throw std::runtime_error(format("ggml_init() failed"));
1010
1129
  }
1011
1130
  }
1012
1131
 
1013
- #ifdef GGML_USE_CUBLAS
1014
- #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CUDA
1132
+ (void) main_gpu;
1133
+ #if defined(GGML_USE_CUBLAS)
1134
+ fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__);
1135
+ ggml_cuda_set_main_device(main_gpu);
1136
+ #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
1137
+ #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
1138
+ #elif defined(GGML_USE_CLBLAST)
1139
+ fprintf(stderr, "%s: using OpenCL for GPU acceleration\n", __func__);
1140
+ #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
1141
+ #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU
1015
1142
  #else
1016
- #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
1143
+ #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
1144
+ #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_CPU
1017
1145
  #endif
1018
1146
 
1019
1147
  // prepare memory for the weights
1020
- size_t vram_total = 0;
1148
+ size_t vram_weights = 0;
1149
+ size_t vram_scratch = 0;
1021
1150
  {
1022
1151
  const uint32_t n_embd = hparams.n_embd;
1023
1152
  const uint32_t n_layer = hparams.n_layer;
@@ -1026,25 +1155,42 @@ static void llama_model_load_internal(
1026
1155
  ml->ggml_ctx = ctx;
1027
1156
 
1028
1157
  model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU);
1029
- model.norm = ml->get_tensor("norm.weight", {n_embd}, GGML_BACKEND_CPU);
1030
1158
 
1031
1159
  // "output" tensor
1032
1160
  {
1161
+ ggml_backend backend_norm;
1033
1162
  ggml_backend backend_output;
1034
1163
  if (n_gpu_layers > int(n_layer)) { // NOLINT
1035
- backend_output = LLAMA_BACKEND_OFFLOAD;
1164
+ // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
1165
+ // on Windows however this is detrimental unless everything is on the GPU
1166
+ #ifndef _WIN32
1167
+ backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
1168
+ #else
1169
+ backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
1170
+ #endif // _WIN32
1171
+
1172
+ backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
1036
1173
  } else {
1174
+ backend_norm = GGML_BACKEND_CPU;
1037
1175
  backend_output = GGML_BACKEND_CPU;
1038
1176
  }
1039
1177
 
1178
+ model.norm = ml->get_tensor("norm.weight", {n_embd}, backend_norm);
1040
1179
  model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, backend_output);
1180
+ if (backend_norm == GGML_BACKEND_GPU) {
1181
+ vram_weights += ggml_nbytes(model.norm);
1182
+ }
1183
+ if (backend_output == GGML_BACKEND_GPU_SPLIT) {
1184
+ vram_weights += ggml_nbytes(model.output);
1185
+ }
1041
1186
  }
1042
1187
 
1043
1188
  const int i_gpu_start = n_layer - n_gpu_layers;
1044
1189
 
1045
1190
  model.layers.resize(n_layer);
1046
1191
  for (uint32_t i = 0; i < n_layer; ++i) {
1047
- const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
1192
+ const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
1193
+ const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
1048
1194
 
1049
1195
  auto & layer = model.layers[i];
1050
1196
 
@@ -1052,21 +1198,21 @@ static void llama_model_load_internal(
1052
1198
 
1053
1199
  layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);
1054
1200
 
1055
- layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend);
1056
- layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend);
1057
- layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend);
1058
- layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend);
1201
+ layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend_split);
1202
+ layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend_split);
1203
+ layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend_split);
1204
+ layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend_split);
1059
1205
 
1060
1206
  layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);
1061
1207
 
1062
- layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend);
1063
- layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend);
1064
- layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend);
1208
+ layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend_split);
1209
+ layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend_split);
1210
+ layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend_split);
1065
1211
 
1066
- if (backend == GGML_BACKEND_CUDA) {
1067
- vram_total +=
1212
+ if (backend == GGML_BACKEND_GPU) {
1213
+ vram_weights +=
1068
1214
  ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
1069
- ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.attention_norm) +
1215
+ ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
1070
1216
  ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
1071
1217
  }
1072
1218
  }
@@ -1081,10 +1227,10 @@ static void llama_model_load_internal(
1081
1227
  // this is the total memory required to run the inference
1082
1228
  const size_t mem_required =
1083
1229
  ctx_size +
1084
- mmapped_size - vram_total + // weights in VRAM not in memory
1230
+ mmapped_size - vram_weights + // weights in VRAM not in memory
1085
1231
  MEM_REQ_SCRATCH0().at(model.type) +
1086
1232
  MEM_REQ_SCRATCH1().at(model.type) +
1087
- MEM_REQ_EVAL().at(model.type);
1233
+ MEM_REQ_EVAL().at (model.type);
1088
1234
 
1089
1235
  // this is the memory required by one llama_state
1090
1236
  const size_t mem_required_state =
@@ -1093,15 +1239,51 @@ static void llama_model_load_internal(
1093
1239
  fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
1094
1240
  mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
1095
1241
 
1242
+ (void) vram_scratch;
1243
+ (void) n_batch;
1096
1244
  #ifdef GGML_USE_CUBLAS
1245
+ if (low_vram) {
1246
+ fprintf(stderr, "%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
1247
+ ggml_cuda_set_scratch_size(0); // disable scratch
1248
+ } else {
1249
+ vram_scratch = n_batch * MB;
1250
+ ggml_cuda_set_scratch_size(vram_scratch);
1251
+ if (n_gpu_layers > 0) {
1252
+ fprintf(stderr, "%s: allocating batch_size x 1 MB = %ld MB VRAM for the scratch buffer\n",
1253
+ __func__, vram_scratch / MB);
1254
+ }
1255
+ }
1256
+ #endif // GGML_USE_CUBLAS
1257
+ #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
1097
1258
  const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
1098
1259
 
1099
- fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
1260
+ fprintf(stderr, "%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
1100
1261
  if (n_gpu_layers > (int) hparams.n_layer) {
1101
- fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
1262
+ fprintf(stderr, "%s: offloading non-repeating layers to GPU\n", __func__);
1263
+ }
1264
+ size_t vram_kv_cache = 0;
1265
+ if (n_gpu_layers > (int) hparams.n_layer + 1) {
1266
+ if (low_vram) {
1267
+ fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
1268
+ } else {
1269
+ fprintf(stderr, "%s: offloading v cache to GPU\n", __func__);
1270
+ vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
1271
+ }
1102
1272
  }
1103
- fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
1104
- #elif !defined(GGML_USE_CLBLAST)
1273
+ if (n_gpu_layers > (int) hparams.n_layer + 2) {
1274
+ if (low_vram) {
1275
+ fprintf(stderr, "%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
1276
+ } else {
1277
+ fprintf(stderr, "%s: offloading k cache to GPU\n", __func__);
1278
+ vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
1279
+ }
1280
+ }
1281
+ const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
1282
+ fprintf(stderr, "%s: offloaded %d/%d layers to GPU\n",
1283
+ __func__, std::min(n_gpu_layers, max_offloadable_layers), hparams.n_layer + 3);
1284
+ fprintf(stderr, "%s: total VRAM used: %zu MB\n",
1285
+ __func__, (vram_weights + vram_scratch + vram_kv_cache + MB - 1) / MB); // round up
1286
+ #else
1105
1287
  (void) n_gpu_layers;
1106
1288
  #endif
1107
1289
  }
@@ -1111,57 +1293,15 @@ static void llama_model_load_internal(
1111
1293
  model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
1112
1294
  }
1113
1295
 
1114
- ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
1115
-
1116
- #ifdef GGML_USE_CUBLAS
1296
+ (void) tensor_split;
1297
+ #if defined(GGML_USE_CUBLAS)
1117
1298
  {
1118
- size_t done_size = 0;
1119
- size_t data_size = 0;
1120
- for (llama_load_tensor & lt : ml->tensors_map.tensors) {
1121
- data_size += lt.size;
1122
- if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
1123
- done_size += lt.size;
1124
- }
1125
- }
1126
- for (llama_load_tensor & lt : ml->tensors_map.tensors) {
1127
- if (lt.ggml_tensor->backend != GGML_BACKEND_CUDA) {
1128
- continue;
1129
- }
1130
- if (progress_callback) {
1131
- progress_callback((float) done_size / data_size, progress_callback_user_data);
1132
- }
1133
- ggml_cuda_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
1134
- done_size += lt.size;
1135
- }
1136
- }
1137
- #elif defined(GGML_USE_CLBLAST)
1138
- {
1139
- const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
1140
-
1141
- fprintf(stderr, "ggml_opencl: offloading %d layers to GPU\n", n_gpu);
1142
-
1143
- size_t vram_total = 0;
1144
-
1145
- for (int i = 0; i < n_gpu; ++i) {
1146
- const auto & layer = model.layers[i];
1147
-
1148
- ggml_cl_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
1149
- ggml_cl_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
1150
- ggml_cl_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
1151
- ggml_cl_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
1152
- ggml_cl_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
1153
- ggml_cl_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
1154
- ggml_cl_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
1155
- }
1156
- if (n_gpu_layers > (int) hparams.n_layer) {
1157
- fprintf(stderr, "ggml_opencl: offloading output layer to GPU\n");
1158
- ggml_cl_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
1159
- }
1160
-
1161
- fprintf(stderr, "ggml_opencl: total VRAM used: %zu MB\n", vram_total / 1024 / 1024);
1299
+ ggml_cuda_set_tensor_split(tensor_split);
1162
1300
  }
1163
1301
  #endif
1164
1302
 
1303
+ ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
1304
+
1165
1305
  if (progress_callback) {
1166
1306
  progress_callback(1.0f, progress_callback_user_data);
1167
1307
  }
@@ -1177,7 +1317,11 @@ static bool llama_model_load(
1177
1317
  const std::string & fname,
1178
1318
  llama_context & lctx,
1179
1319
  int n_ctx,
1320
+ int n_batch,
1180
1321
  int n_gpu_layers,
1322
+ int main_gpu,
1323
+ float * tensor_split,
1324
+ bool low_vram,
1181
1325
  ggml_type memory_type,
1182
1326
  bool use_mmap,
1183
1327
  bool use_mlock,
@@ -1185,28 +1329,30 @@ static bool llama_model_load(
1185
1329
  llama_progress_callback progress_callback,
1186
1330
  void *progress_callback_user_data) {
1187
1331
  try {
1188
- llama_model_load_internal(fname, lctx, n_ctx, n_gpu_layers, memory_type, use_mmap, use_mlock,
1189
- vocab_only, progress_callback, progress_callback_user_data);
1332
+ llama_model_load_internal(fname, lctx, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
1333
+ use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
1190
1334
  return true;
1191
- } catch (const std::string & err) {
1192
- fprintf(stderr, "error loading model: %s\n", err.c_str());
1335
+ } catch (const std::exception & err) {
1336
+ fprintf(stderr, "error loading model: %s\n", err.what());
1193
1337
  return false;
1194
1338
  }
1195
1339
  }
1196
1340
 
1197
1341
  // evaluate the transformer
1198
1342
  //
1199
- // - lctx: llama context
1200
- // - tokens: new batch of tokens to process
1201
- // - n_past: the context size so far
1202
- // - n_threads: number of threads to use
1343
+ // - lctx: llama context
1344
+ // - tokens: new batch of tokens to process
1345
+ // - n_past: the context size so far
1346
+ // - n_threads: number of threads to use
1347
+ // - cgraph_fname: filename of the exported computation graph
1203
1348
  //
1204
1349
  static bool llama_eval_internal(
1205
- llama_context & lctx,
1206
- const llama_token * tokens,
1207
- const int n_tokens,
1208
- const int n_past,
1209
- const int n_threads) {
1350
+ llama_context & lctx,
1351
+ const llama_token * tokens,
1352
+ const int n_tokens,
1353
+ const int n_past,
1354
+ const int n_threads,
1355
+ const char * cgraph_fname) {
1210
1356
 
1211
1357
  // enforce that the first token is BOS
1212
1358
  if (n_past == 0 && tokens[0] != llama_token_bos()) {
@@ -1225,12 +1371,13 @@ static bool llama_eval_internal(
1225
1371
 
1226
1372
  LLAMA_ASSERT(!!kv_self.ctx);
1227
1373
 
1228
- const int n_embd = hparams.n_embd;
1229
- const int n_layer = hparams.n_layer;
1230
- const int n_ctx = hparams.n_ctx;
1231
- const int n_head = hparams.n_head;
1232
- const int n_vocab = hparams.n_vocab;
1233
- const int n_rot = hparams.n_embd/hparams.n_head;
1374
+ const int n_embd = hparams.n_embd;
1375
+ const int n_layer = hparams.n_layer;
1376
+ const int n_ctx = hparams.n_ctx;
1377
+ const int n_head = hparams.n_head;
1378
+ const int n_vocab = hparams.n_vocab;
1379
+ const int n_rot = hparams.n_embd/hparams.n_head;
1380
+ const int n_gpu_layers = model.n_gpu_layers;
1234
1381
 
1235
1382
  auto & mem_per_token = lctx.mem_per_token;
1236
1383
  auto & buf_compute = lctx.buf_compute;
@@ -1252,40 +1399,98 @@ static bool llama_eval_internal(
1252
1399
  ggml_set_name(embd, "embd");
1253
1400
  memcpy(embd->data, tokens, N*ggml_element_size(embd));
1254
1401
 
1402
+ struct ggml_tensor * cur;
1255
1403
  struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
1256
1404
 
1405
+ const int i_gpu_start = n_layer - n_gpu_layers;
1406
+ (void) i_gpu_start;
1407
+
1408
+ // offload functions set the tensor output backend to GPU
1409
+ // tensors are GPU-accelerated if any input or the output has been offloaded
1410
+ //
1411
+ // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
1412
+ // in that case ggml_cuda_assign_buffers has no effect
1413
+ offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
1414
+ offload_func_t offload_func_kq = llama_nop;
1415
+ offload_func_t offload_func_v = llama_nop;
1416
+
1417
+ #ifdef GGML_USE_CUBLAS
1418
+ if (n_gpu_layers > n_layer) {
1419
+ offload_func_nr = ggml_cuda_assign_buffers;
1420
+ }
1421
+ if (n_gpu_layers > n_layer + 1) {
1422
+ offload_func_v = ggml_cuda_assign_buffers;
1423
+ }
1424
+ if (n_gpu_layers > n_layer + 2) {
1425
+ offload_func_kq = ggml_cuda_assign_buffers;
1426
+ }
1427
+ #endif // GGML_USE_CUBLAS
1428
+
1257
1429
  for (int il = 0; il < n_layer; ++il) {
1258
- struct ggml_tensor * inpSA = inpL;
1430
+ offload_func_t offload_func = llama_nop;
1431
+
1432
+ #ifdef GGML_USE_CUBLAS
1433
+ if (il >= i_gpu_start) {
1434
+ offload_func = ggml_cuda_assign_buffers;
1435
+ }
1436
+ #endif // GGML_USE_CUBLAS
1259
1437
 
1260
- struct ggml_tensor * cur;
1438
+ struct ggml_tensor * inpSA = inpL;
1261
1439
 
1262
1440
  lctx.use_buf(ctx0, 0);
1263
1441
 
1264
1442
  // norm
1265
1443
  {
1266
1444
  cur = ggml_rms_norm(ctx0, inpL);
1445
+ offload_func(cur);
1446
+ ggml_set_name(cur, "rms_norm_0");
1267
1447
 
1268
1448
  // cur = cur*attention_norm(broadcasted)
1269
1449
  cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm);
1450
+ offload_func(cur);
1451
+ ggml_set_name(cur, "attention_norm_0");
1270
1452
  }
1271
1453
 
1272
1454
  // self-attention
1273
1455
  {
1274
1456
  // compute Q and K and RoPE them
1275
- struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
1276
- struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
1277
- ggml_set_name(Qcur, "Qcur");
1457
+ struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
1458
+ offload_func_kq(tmpk);
1459
+ ggml_set_name(tmpk, "tmpk");
1460
+
1461
+ struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
1462
+ offload_func_kq(tmpq);
1463
+ ggml_set_name(tmpq, "tmpq");
1464
+
1465
+ struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0);
1466
+ offload_func_kq(Kcur);
1278
1467
  ggml_set_name(Kcur, "Kcur");
1279
1468
 
1469
+ struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0);
1470
+ offload_func_kq(Qcur);
1471
+ ggml_set_name(Qcur, "Qcur");
1472
+
1280
1473
  // store key and value to memory
1281
1474
  {
1282
1475
  // compute the transposed [N, n_embd] V matrix
1283
- struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), n_embd, N));
1476
+
1477
+ struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
1478
+ offload_func_v(tmpv);
1479
+ ggml_set_name(tmpv, "tmpv");
1480
+
1481
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd, N));
1482
+ offload_func_v(Vcur);
1483
+ ggml_set_name(Vcur, "Vcur");
1284
1484
 
1285
1485
  struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
1486
+ offload_func_kq(k);
1487
+ ggml_set_name(k, "k");
1488
+
1286
1489
  struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
1287
1490
  ( n_ctx)*ggml_element_size(kv_self.v),
1288
1491
  (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
1492
+ offload_func_v(v);
1493
+ ggml_set_name(v, "v");
1289
1494
 
1290
1495
  // important: storing RoPE-ed version of K in the KV cache!
1291
1496
  ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
@@ -1296,6 +1501,7 @@ static bool llama_eval_internal(
1296
1501
  ggml_permute(ctx0,
1297
1502
  Qcur,
1298
1503
  0, 2, 1, 3);
1504
+ offload_func_kq(Q);
1299
1505
  ggml_set_name(Q, "Q");
1300
1506
 
1301
1507
  struct ggml_tensor * K =
@@ -1304,10 +1510,12 @@ static bool llama_eval_internal(
1304
1510
  ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
1305
1511
  n_embd/n_head, n_head, n_past + N),
1306
1512
  0, 2, 1, 3);
1513
+ offload_func_kq(K);
1307
1514
  ggml_set_name(K, "K");
1308
1515
 
1309
1516
  // K * Q
1310
1517
  struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
1518
+ offload_func_kq(KQ);
1311
1519
  ggml_set_name(KQ, "KQ");
1312
1520
 
1313
1521
  // KQ_scaled = KQ / sqrt(n_embd/n_head)
@@ -1316,17 +1524,19 @@ static bool llama_eval_internal(
1316
1524
 
1317
1525
  // KQ_scaled shape [n_past + N, N, n_head, 1]
1318
1526
  struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
1527
+ offload_func_kq(KQ_scaled);
1319
1528
  ggml_set_name(KQ_scaled, "KQ_scaled");
1320
1529
 
1321
1530
  // KQ_masked = mask_past(KQ_scaled)
1322
1531
  struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
1532
+ offload_func_kq(KQ_masked);
1323
1533
  ggml_set_name(KQ_masked, "KQ_masked");
1324
1534
 
1325
1535
  // KQ = soft_max(KQ_masked)
1326
1536
  struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
1537
+ offload_func_v(KQ_soft_max);
1327
1538
  ggml_set_name(KQ_soft_max, "KQ_soft_max");
1328
1539
 
1329
-
1330
1540
  // split cached V into n_head heads
1331
1541
  struct ggml_tensor * V =
1332
1542
  ggml_view_3d(ctx0, kv_self.v,
@@ -1334,10 +1544,12 @@ static bool llama_eval_internal(
1334
1544
  n_ctx*ggml_element_size(kv_self.v),
1335
1545
  n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head,
1336
1546
  il*n_ctx*ggml_element_size(kv_self.v)*n_embd);
1547
+ offload_func_v(V);
1337
1548
  ggml_set_name(V, "V");
1338
1549
 
1339
1550
  #if 1
1340
1551
  struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
1552
+ offload_func_v(KQV);
1341
1553
  ggml_set_name(KQV, "KQV");
1342
1554
  #else
1343
1555
  // make V contiguous in memory to speed up the matmul, however we waste time on the copy
@@ -1349,56 +1561,79 @@ static bool llama_eval_internal(
1349
1561
 
1350
1562
  // KQV_merged = KQV.permute(0, 2, 1, 3)
1351
1563
  struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
1564
+ offload_func_v(KQV_merged);
1352
1565
  ggml_set_name(KQV_merged, "KQV_merged");
1353
1566
 
1354
1567
  // cur = KQV_merged.contiguous().view(n_embd, N)
1355
1568
  cur = ggml_cpy(ctx0,
1356
1569
  KQV_merged,
1357
1570
  ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
1571
+ offload_func_v(cur);
1358
1572
  ggml_set_name(cur, "KQV_merged_contiguous");
1359
1573
 
1360
1574
  // projection (no bias)
1361
1575
  cur = ggml_mul_mat(ctx0,
1362
1576
  model.layers[il].wo,
1363
1577
  cur);
1578
+ offload_func(cur);
1579
+ ggml_set_name(cur, "result_wo");
1364
1580
  }
1365
1581
 
1366
1582
  lctx.use_buf(ctx0, 1);
1367
1583
 
1368
1584
  struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
1585
+ offload_func(inpFF);
1586
+ ggml_set_name(inpFF, "inpFF");
1369
1587
 
1370
1588
  // feed-forward network
1371
1589
  {
1372
1590
  // norm
1373
1591
  {
1374
1592
  cur = ggml_rms_norm(ctx0, inpFF);
1593
+ offload_func(cur);
1594
+ ggml_set_name(cur, "rms_norm_1");
1375
1595
 
1376
1596
  // cur = cur*ffn_norm(broadcasted)
1377
1597
  cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
1598
+ offload_func(cur);
1599
+ ggml_set_name(cur, "ffn_norm");
1378
1600
  }
1379
1601
 
1380
1602
  struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
1381
1603
  model.layers[il].w3,
1382
1604
  cur);
1605
+ offload_func(tmp);
1606
+ ggml_set_name(tmp, "result_w3");
1383
1607
 
1384
1608
  cur = ggml_mul_mat(ctx0,
1385
1609
  model.layers[il].w1,
1386
1610
  cur);
1611
+ offload_func(cur);
1612
+ ggml_set_name(cur, "result_w2");
1387
1613
 
1388
1614
  // SILU activation
1389
1615
  cur = ggml_silu(ctx0, cur);
1616
+ offload_func(cur);
1617
+ ggml_set_name(cur, "silu");
1390
1618
 
1391
1619
  cur = ggml_mul(ctx0, cur, tmp);
1620
+ offload_func(cur);
1621
+ ggml_set_name(cur, "silu_x_result_w3");
1392
1622
 
1393
1623
  cur = ggml_mul_mat(ctx0,
1394
1624
  model.layers[il].w2,
1395
1625
  cur);
1626
+ offload_func(cur);
1627
+ ggml_set_name(cur, "result_w2");
1396
1628
  }
1397
1629
 
1398
1630
  cur = ggml_add(ctx0, cur, inpFF);
1631
+ offload_func(cur);
1632
+ ggml_set_name(cur, "inpFF_+_result_w2");
1399
1633
 
1400
1634
  // input for next layer
1401
1635
  inpL = cur;
1636
+
1402
1637
  }
1403
1638
 
1404
1639
  lctx.use_buf(ctx0, 0);
@@ -1406,28 +1641,68 @@ static bool llama_eval_internal(
1406
1641
  // used at the end to optionally extract the embeddings
1407
1642
  struct ggml_tensor * embeddings = NULL;
1408
1643
 
1644
+
1409
1645
  // norm
1410
1646
  {
1647
+ cur = ggml_rms_norm(ctx0, inpL);
1648
+ offload_func_nr(cur);
1649
+ ggml_set_name(cur, "rms_norm_inpL");
1411
1650
 
1412
- inpL = ggml_rms_norm(ctx0, inpL);
1651
+ cur = ggml_rms_norm(ctx0, cur);
1652
+ offload_func_nr(cur);
1653
+ ggml_set_name(cur, "rms_norm_after");
1413
1654
 
1414
- // inpL = inpL*norm(broadcasted)
1415
- inpL = ggml_mul(ctx0, inpL, model.norm);
1655
+ // cur = cur*norm(broadcasted)
1656
+ cur = ggml_mul(ctx0, cur, model.norm);
1657
+ offload_func_nr(cur);
1658
+ ggml_set_name(cur, "result_norm");
1416
1659
 
1417
- embeddings = inpL;
1660
+ embeddings = cur;
1418
1661
  }
1419
1662
 
1663
+
1420
1664
  // lm_head
1421
- inpL = ggml_mul_mat(ctx0, model.output, inpL);
1665
+ cur = ggml_mul_mat(ctx0, model.output, cur);
1666
+ ggml_set_name(cur, "result_output");
1422
1667
 
1423
1668
  lctx.use_buf(ctx0, -1);
1424
1669
 
1425
1670
  // logits -> probs
1426
- //inpL = ggml_soft_max_inplace(ctx0, inpL);
1671
+ //cur = ggml_soft_max_inplace(ctx0, cur);
1427
1672
 
1428
1673
  // run the computation
1429
- ggml_build_forward_expand(&gf, inpL);
1430
- ggml_graph_compute (ctx0, &gf);
1674
+ ggml_build_forward_expand(&gf, cur);
1675
+
1676
+ #ifdef GGML_USE_METAL
1677
+ if (lctx.ctx_metal && N == 1) {
1678
+ ggml_metal_graph_compute(lctx.ctx_metal, &gf);
1679
+ ggml_metal_get_tensor (lctx.ctx_metal, cur);
1680
+ } else {
1681
+ // IMPORTANT:
1682
+ // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla
1683
+ // ggml_graph_compute(). It uses Apple's Accelerate CBLAS API which takes advantage of the ANE or the AMX
1684
+ // coprocessor.
1685
+ //
1686
+ // When we implement Matrix x Matrix Metal multiplication, we can avoid this branch.
1687
+ // But for now, we have focused only on Matrix x Vector Metal multiplication.
1688
+ //
1689
+ // TODO: avoid these syncs via shared memory (ref #1696)
1690
+ //
1691
+ if (lctx.ctx_metal) {
1692
+ // We need to sync the GPU KV cache with the CPU KV cache
1693
+ ggml_metal_get_tensor(lctx.ctx_metal, kv_self.k);
1694
+ ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
1695
+ }
1696
+
1697
+ ggml_graph_compute(ctx0, &gf);
1698
+ }
1699
+ #else
1700
+ ggml_graph_compute(ctx0, &gf);
1701
+ #endif
1702
+
1703
+ if (cgraph_fname) {
1704
+ ggml_graph_export(&gf, cgraph_fname);
1705
+ }
1431
1706
 
1432
1707
  #ifdef GGML_PERF
1433
1708
  // print timing information per ggml operation (for debugging purposes)
@@ -1441,7 +1716,7 @@ static bool llama_eval_internal(
1441
1716
  //}
1442
1717
 
1443
1718
  //embd_w.resize(n_vocab*N);
1444
- //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
1719
+ //memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N);
1445
1720
 
1446
1721
  // update kv token count
1447
1722
  lctx.model.kv_self.n = n_past + N;
@@ -1452,11 +1727,11 @@ static bool llama_eval_internal(
1452
1727
 
1453
1728
  if (lctx.logits_all) {
1454
1729
  logits_out.resize(n_vocab * N);
1455
- memcpy(logits_out.data(), (float *) ggml_get_data(inpL), sizeof(float)*n_vocab*N);
1730
+ memcpy(logits_out.data(), (float *) ggml_get_data(cur), sizeof(float)*n_vocab*N);
1456
1731
  } else {
1457
1732
  // return result for just the last token
1458
1733
  logits_out.resize(n_vocab);
1459
- memcpy(logits_out.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
1734
+ memcpy(logits_out.data(), (float *) ggml_get_data(cur) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
1460
1735
  }
1461
1736
  }
1462
1737
 
@@ -1987,6 +2262,10 @@ llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_tok
1987
2262
  return -log2f(candidate.p) > *mu;
1988
2263
  }));
1989
2264
 
2265
+ if (candidates->size == 0) {
2266
+ candidates->size = 1;
2267
+ }
2268
+
1990
2269
  // Normalize the probabilities of the remaining words
1991
2270
  llama_sample_softmax(ctx, candidates);
1992
2271
 
@@ -2055,16 +2334,92 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
2055
2334
  // quantization
2056
2335
  //
2057
2336
 
2058
- static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype, int nthread) {
2337
+ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llama_buffer & output, const int nelements, const int nthread) {
2338
+ if (output.size < nelements * sizeof(float)) {
2339
+ output.resize(nelements * sizeof(float));
2340
+ }
2341
+ float * f32_output = (float *) output.addr;
2342
+
2343
+ quantize_fns_t qtype;
2344
+ if (ggml_is_quantized(tensor.type)) {
2345
+ qtype = ggml_internal_get_quantize_fn(tensor.type);
2346
+ if (qtype.dequantize_row_q == NULL) {
2347
+ throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor.type)));
2348
+ }
2349
+ } else if (tensor.type != GGML_TYPE_F16) {
2350
+ throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor.type)));
2351
+ }
2352
+
2353
+ if (nthread < 2) {
2354
+ if (tensor.type == GGML_TYPE_F16) {
2355
+ ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor.data, f32_output, nelements);
2356
+ } else if (ggml_is_quantized(tensor.type)) {
2357
+ qtype.dequantize_row_q(tensor.data, f32_output, nelements);
2358
+ } else {
2359
+ LLAMA_ASSERT(false); // unreachable
2360
+ }
2361
+ return;
2362
+ }
2363
+
2364
+ auto block_size = tensor.type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor.type);
2365
+ auto block_size_bytes = ggml_type_size(tensor.type);
2366
+
2367
+ LLAMA_ASSERT(nelements % block_size == 0);
2368
+ auto nblocks = nelements / block_size;
2369
+ auto blocks_per_thread = nblocks / nthread;
2370
+ auto spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count
2371
+
2372
+ std::vector<std::thread> workers;
2373
+ for (auto tnum = 0, in_buff_offs = 0, out_buff_offs = 0; tnum < nthread; tnum++) {
2374
+ auto thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
2375
+ auto thr_elems = thr_blocks * block_size; // number of elements for this thread
2376
+ auto thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread
2377
+
2378
+ auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
2379
+ if (typ == GGML_TYPE_F16) {
2380
+ ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
2381
+ } else {
2382
+ qtype.dequantize_row_q(inbuf, outbuf, nels);
2383
+ }
2384
+ };
2385
+ workers.push_back(std::thread(compute, tensor.type, tensor.data + in_buff_offs, f32_output + out_buff_offs, thr_elems));
2386
+ in_buff_offs += thr_block_bytes;
2387
+ out_buff_offs += thr_elems;
2388
+ }
2389
+ for (auto & worker : workers) {
2390
+ worker.join();
2391
+ }
2392
+
2393
+ }
2394
+
2395
+ static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
2059
2396
  ggml_type quantized_type;
2060
- switch (ftype) {
2397
+ llama_ftype ftype = params->ftype;
2398
+ int nthread = params->nthread;
2399
+
2400
+ switch (params->ftype) {
2061
2401
  case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
2062
2402
  case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
2063
2403
  case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
2064
2404
  case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
2065
2405
  case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
2066
- default: throw format("invalid output file type %d\n", ftype);
2067
- };
2406
+ case LLAMA_FTYPE_MOSTLY_F16: quantized_type = GGML_TYPE_F16; break;
2407
+ case LLAMA_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break;
2408
+
2409
+ #ifdef GGML_USE_K_QUANTS
2410
+ // K-quants
2411
+ case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
2412
+ case LLAMA_FTYPE_MOSTLY_Q3_K_S:
2413
+ case LLAMA_FTYPE_MOSTLY_Q3_K_M:
2414
+ case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
2415
+ case LLAMA_FTYPE_MOSTLY_Q4_K_S:
2416
+ case LLAMA_FTYPE_MOSTLY_Q4_K_M: quantized_type = GGML_TYPE_Q4_K; break;
2417
+ case LLAMA_FTYPE_MOSTLY_Q5_K_S:
2418
+ case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
2419
+ case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
2420
+ #endif
2421
+ default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
2422
+ }
2068
2423
 
2069
2424
  if (nthread <= 0) {
2070
2425
  nthread = std::thread::hardware_concurrency();
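
llama_convert_tensor_internal above splits a tensor's blocks evenly across nthread workers and gives any remainder (the "spare blocks") to the last worker. The partitioning on its own, as a standalone sketch rather than package code:

// Editorial sketch of the block partitioning used for multithreaded dequantization.
#include <cstdio>
#include <thread>
#include <vector>

int main() {
    const int nblocks = 1000003;   // hypothetical block count
    const int nthread = 8;

    const int blocks_per_thread = nblocks / nthread;
    const int spare_blocks      = nblocks - blocks_per_thread * nthread;

    std::vector<std::thread> workers;
    int first = 0;
    for (int t = 0; t < nthread; ++t) {
        // the last worker absorbs the remainder
        const int count = blocks_per_thread + (t == nthread - 1 ? spare_blocks : 0);
        workers.emplace_back([t, first, count] {
            std::printf("thread %d: blocks [%d, %d)\n", t, first, first + count);
            // dequantize blocks [first, first + count) here
        });
        first += count;
    }
    for (auto & w : workers) {
        w.join();
    }
    return 0;
}
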
@@ -2072,7 +2427,23 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
2072
2427
 
2073
2428
  std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false,
2074
2429
  /*vocab_only*/ false));
2075
- llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
2430
+ llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), params->ftype);
2431
+
2432
+ #ifdef GGML_USE_K_QUANTS
2433
+ int n_attention_wv = 0;
2434
+ int n_feed_forward_w2 = 0;
2435
+ for (auto& tensor : model_loader->tensors_map.tensors) {
2436
+ if (tensor.name.find("attention.wv.weight") != std::string::npos) {
2437
+ ++n_attention_wv;
2438
+ }
2439
+ else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
2440
+ ++n_feed_forward_w2;
2441
+ }
2442
+ }
2443
+
2444
+ int i_attention_wv = 0;
2445
+ int i_feed_forward_w2 = 0;
2446
+ #endif
2076
2447
 
2077
2448
  size_t total_size_org = 0;
2078
2449
  size_t total_size_new = 0;
@@ -2098,11 +2469,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
2098
2469
 
2099
2470
  // quantize only 2D tensors
2100
2471
  quantize &= (tensor.ne.size() == 2);
2101
-
2102
- // uncomment this to keep the output layer in FP16
2103
- //if (tensor.name == "output.weight") {
2104
- // quantize = false;
2105
- //}
2472
+ quantize &= params->quantize_output_tensor || tensor.name != "output.weight";
2473
+ quantize &= quantized_type != tensor.type;
2106
2474
 
2107
2475
  enum ggml_type new_type;
2108
2476
  void * new_data;
@@ -2116,20 +2484,40 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
2116
2484
  printf("size = %8.3f MB\n", tensor.size/1024.0/1024.0);
2117
2485
  } else {
2118
2486
  new_type = quantized_type;
2487
+ #ifdef GGML_USE_K_QUANTS
2488
+ if (tensor.name == "output.weight") {
2489
+ new_type = GGML_TYPE_Q6_K;
2490
+ } else if (tensor.name.find("attention.wv.weight") != std::string::npos) {
2491
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
2492
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
2493
+ else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
2494
+ (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8 ||
2495
+ (i_attention_wv - n_attention_wv/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
2496
+ ++i_attention_wv;
2497
+ } else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
2498
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
2499
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
2500
+ else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
2501
+ (i_feed_forward_w2 < n_feed_forward_w2/8 || i_feed_forward_w2 >= 7*n_feed_forward_w2/8 ||
2502
+ (i_feed_forward_w2 - n_feed_forward_w2/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
2503
+ ++i_feed_forward_w2;
2504
+ } else if (tensor.name.find("attention.wo.weight") != std::string::npos) {
2505
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
2506
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
2507
+ }
2508
+ #endif
2509
+
2119
2510
  float * f32_data;
2120
2511
  size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
2121
2512
  llama_buffer f32_conv_buf;
2513
+
2122
2514
  if (tensor.type == GGML_TYPE_F32) {
2123
2515
  f32_data = (float *) tensor.data;
2124
- } else if (tensor.type == GGML_TYPE_F16) {
2125
- f32_conv_buf.resize(nelements * sizeof(float));
2126
- f32_data = (float *) f32_conv_buf.addr;
2127
- const auto * f16_data = (const ggml_fp16_t *) tensor.data;
2128
- for (size_t i = 0; i < nelements; i++) {
2129
- f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
2130
- }
2516
+ } else if (ggml_is_quantized(tensor.type) && !params->allow_requantize) {
2517
+ throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor.type)));
2131
2518
  } else {
2132
- throw format("type %s unsupported for integer quantization", ggml_type_name(tensor.type));
2519
+ llama_convert_tensor_internal(tensor, f32_conv_buf, nelements, nthread);
2520
+ f32_data = (float *) f32_conv_buf.addr;
2133
2521
  }
2134
2522
 
2135
2523
  printf("quantizing .. ");
@@ -2183,12 +2571,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
2183
2571
  }
2184
2572
 
2185
2573
  printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
2574
+ int64_t tot_count = 0;
2186
2575
  for (size_t i = 0; i < hist_cur.size(); i++) {
2187
2576
  hist_all[i] += hist_cur[i];
2577
+ tot_count += hist_cur[i];
2188
2578
  }
2189
2579
 
2190
- for (size_t i = 0; i < hist_cur.size(); i++) {
2191
- printf("%5.3f ", hist_cur[i] / float(nelements));
2580
+ if (tot_count > 0) {
2581
+ for (size_t i = 0; i < hist_cur.size(); i++) {
2582
+ printf("%5.3f ", hist_cur[i] / float(nelements));
2583
+ }
2192
2584
  }
2193
2585
  printf("\n");
2194
2586
  }
@@ -2206,11 +2598,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
2206
2598
  sum_all += hist_all[i];
2207
2599
  }
2208
2600
 
2209
- printf("%s: hist: ", __func__);
2210
- for (size_t i = 0; i < hist_all.size(); i++) {
2211
- printf("%5.3f ", hist_all[i] / float(sum_all));
2601
+ if (sum_all > 0) {
2602
+ printf("%s: hist: ", __func__);
2603
+ for (size_t i = 0; i < hist_all.size(); i++) {
2604
+ printf("%5.3f ", hist_all[i] / float(sum_all));
2605
+ }
2606
+ printf("\n");
2212
2607
  }
2213
- printf("\n");
2214
2608
  }
2215
2609
  }
2216
2610
 
@@ -2251,9 +2645,9 @@ struct llama_context * llama_init_from_file(
2251
2645
 
2252
2646
  ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
2253
2647
 
2254
- if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_gpu_layers, memory_type,
2255
- params.use_mmap, params.use_mlock, params.vocab_only,
2256
- params.progress_callback, params.progress_callback_user_data)) {
2648
+ if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers, params.main_gpu,
2649
+ params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
2650
+ params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
2257
2651
  fprintf(stderr, "%s: failed to load model\n", __func__);
2258
2652
  llama_free(ctx);
2259
2653
  return nullptr;
@@ -2261,7 +2655,7 @@ struct llama_context * llama_init_from_file(
2261
2655
 
2262
2656
  // reserve memory for context buffers
2263
2657
  if (!params.vocab_only) {
2264
- if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx)) {
2658
+ if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
2265
2659
  fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
2266
2660
  llama_free(ctx);
2267
2661
  return nullptr;
@@ -2291,6 +2685,38 @@ struct llama_context * llama_init_from_file(
2291
2685
  ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
2292
2686
  }
2293
2687
 
2688
+ #ifdef GGML_USE_METAL
2689
+ if (params.n_gpu_layers > 0) {
2690
+ // this allocates all Metal resources and memory buffers
2691
+ ctx->ctx_metal = ggml_metal_init();
2692
+
2693
+ void *data_ptr = NULL;
2694
+ size_t data_size = 0;
2695
+ if (params.use_mmap) {
2696
+ data_ptr = ctx->model.mapping->addr;
2697
+ data_size= ctx->model.mapping->size;
2698
+ } else {
2699
+ data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
2700
+ data_size= ggml_get_mem_size(ctx->model.ctx);
2701
+ }
2702
+
2703
+ #define LLAMA_METAL_CHECK_BUF(result) \
2704
+ if (!(result)) { \
2705
+ fprintf(stderr, "%s: failed to add buffer\n", __func__); \
2706
+ llama_free(ctx); \
2707
+ return NULL; \
2708
+ }
2709
+
2710
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size));
2711
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size));
2712
+
2713
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size));
2714
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size));
2715
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size));
2716
+ #undef LLAMA_METAL_CHECK_BUF
2717
+ }
2718
+ #endif
2719
+
2294
2720
  return ctx;
2295
2721
  }
2296
2722
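
The Metal setup above wraps each ggml_metal_add_buffer call in a local LLAMA_METAL_CHECK_BUF macro so that any failure logs, frees the context and returns early. The shape of that pattern in isolation, as an editorial sketch with add_buffer as a hypothetical stand-in:

// Editorial sketch of the check-and-bail macro pattern; not package code.
#include <cstdio>

static bool add_buffer(const char * name) { return name != nullptr; }

static int init_buffers() {
#define CHECK_BUF(result)                               \
    if (!(result)) {                                    \
        std::fprintf(stderr, "failed to add buffer\n"); \
        return 1; /* clean up and bail out */           \
    }

    CHECK_BUF(add_buffer("data"));
    CHECK_BUF(add_buffer("eval"));
    CHECK_BUF(add_buffer("kv"));
#undef CHECK_BUF

    return 0;
}

int main() { return init_buffers(); }
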
 
@@ -2301,13 +2727,12 @@ void llama_free(struct llama_context * ctx) {
2301
2727
  int llama_model_quantize(
2302
2728
  const char * fname_inp,
2303
2729
  const char * fname_out,
2304
- enum llama_ftype ftype,
2305
- int nthread) {
2730
+ const llama_model_quantize_params *params) {
2306
2731
  try {
2307
- llama_model_quantize_internal(fname_inp, fname_out, ftype, nthread);
2732
+ llama_model_quantize_internal(fname_inp, fname_out, params);
2308
2733
  return 0;
2309
- } catch (const std::string & err) {
2310
- fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str());
2734
+ } catch (const std::exception & err) {
2735
+ fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.what());
2311
2736
  return 1;
2312
2737
  }
2313
2738
  }
@@ -2560,8 +2985,8 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
2560
2985
  int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
2561
2986
  try {
2562
2987
  return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads);
2563
- } catch (const std::string & err) {
2564
- fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.c_str());
2988
+ } catch (const std::exception & err) {
2989
+ fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
2565
2990
  return 1;
2566
2991
  }
2567
2992
  }
@@ -2906,7 +3331,7 @@ int llama_eval(
2906
3331
  int n_tokens,
2907
3332
  int n_past,
2908
3333
  int n_threads) {
2909
- if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads)) {
3334
+ if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads, nullptr)) {
2910
3335
  fprintf(stderr, "%s: failed to eval\n", __func__);
2911
3336
  return 1;
2912
3337
  }
@@ -2921,6 +3346,20 @@ int llama_eval(
2921
3346
  return 0;
2922
3347
  }
2923
3348
 
3349
+ int llama_eval_export(struct llama_context * ctx, const char * fname) {
3350
+ const int n_batch = 1;
3351
+ const int n_ctx = 512 - n_batch;
3352
+
3353
+ const std::vector<llama_token> tmp(n_batch, llama_token_bos());
3354
+
3355
+ if (!llama_eval_internal(*ctx, tmp.data(), tmp.size(), n_ctx, 1, fname)) {
3356
+ fprintf(stderr, "%s: failed to eval\n", __func__);
3357
+ return 1;
3358
+ }
3359
+
3360
+ return 0;
3361
+ }
3362
+
2924
3363
  int llama_tokenize(
2925
3364
  struct llama_context * ctx,
2926
3365
  const char * text,
@@ -2953,6 +3392,19 @@ int llama_n_embd(const struct llama_context * ctx) {
2953
3392
  return ctx->model.hparams.n_embd;
2954
3393
  }
2955
3394
 
3395
+ int llama_get_vocab(
3396
+ const struct llama_context * ctx,
3397
+ const char * * strings,
3398
+ float * scores,
3399
+ int capacity) {
3400
+ int n = std::min(capacity, (int) ctx->vocab.id_to_token.size());
3401
+ for (int i = 0; i<n; ++i) {
3402
+ strings[i] = ctx->vocab.id_to_token[i].tok.c_str();
3403
+ scores[i] = ctx->vocab.id_to_token[i].score;
3404
+ }
3405
+ return n;
3406
+ }
3407
+
2956
3408
  float * llama_get_logits(struct llama_context * ctx) {
2957
3409
  return ctx->logits.data();
2958
3410
  }
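
Finally, a hedged usage sketch for the llama_get_vocab accessor added above. It is not taken from the package's own examples; it assumes a context created with llama_init_from_file, a placeholder model path, and that llama_n_vocab reports the vocabulary size as in earlier releases.

// Editorial usage sketch of llama_get_vocab.
#include "llama.h"
#include <cstdio>
#include <vector>

int main() {
    llama_context_params cparams = llama_context_default_params();
    llama_context * ctx = llama_init_from_file("model.bin", cparams);  // placeholder path
    if (ctx == nullptr) {
        return 1;
    }

    const int n_vocab = llama_n_vocab(ctx);
    std::vector<const char *> strings(n_vocab);
    std::vector<float>        scores(n_vocab);

    // fills at most n_vocab entries and returns how many were written
    const int n = llama_get_vocab(ctx, strings.data(), scores.data(), n_vocab);
    for (int i = 0; i < 5 && i < n; ++i) {
        std::printf("%d: %s (%.3f)\n", i, strings[i], scores[i]);
    }

    llama_free(ctx);
    return 0;
}
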