llama_cpp 0.1.4 → 0.2.1
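This page shows the diff of the bundled llama.cpp sources between gem versions 0.1.4 and 0.2.1. The main changes visible below: GPU offloading reworked around a generic offload_func_t hook with CUDA, OpenCL, and new Metal backends; k-quant tensor types (Q2_K through Q6_K) added to loading, saving, and quantization; a llama_model_quantize_params struct replacing the old (ftype, nthread) arguments; new context parameters n_batch, main_gpu, tensor_split, and low_vram; and error handling switched from throwing std::string to std::runtime_error.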

@@ -16,6 +16,10 @@
16
16
  #include "ggml-opencl.h"
17
17
  #endif
18
18
 
19
+ #ifdef GGML_USE_METAL
20
+ #include "ggml-metal.h"
21
+ #endif
22
+
19
23
  #include <array>
20
24
  #include <ctime>
21
25
  #include <cinttypes>
@@ -49,17 +53,22 @@ enum e_model {
49
53
  MODEL_65B,
50
54
  };
51
55
 
52
-
53
56
  static const size_t MB = 1024*1024;
54
57
 
55
58
  // computed for n_ctx == 2048
56
59
  // TODO: dynamically determine these sizes
57
60
  // needs modifications in ggml
58
61
 
62
+ typedef void (*offload_func_t)(struct ggml_tensor * tensor);
63
+
64
+ void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
65
+ (void) tensor;
66
+ }
67
+
59
68
  static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
60
69
  {
61
70
  static std::map<e_model, size_t> k_sizes = {
62
- { MODEL_3B, 128ull * MB },
71
+ { MODEL_3B, 256ull * MB },
63
72
  { MODEL_7B, 512ull * MB },
64
73
  { MODEL_13B, 512ull * MB },
65
74
  { MODEL_30B, 512ull * MB },
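
The offload_func_t typedef and llama_nop default added above are the core of the reworked offloading scheme: the graph-building code calls a per-layer function pointer on each intermediate tensor, and that pointer is either the no-op or ggml_cuda_assign_buffers. A minimal sketch of the pattern, mirroring how llama_eval_internal() uses it later in this diff (the helper name build_layer_norm is illustrative, not part of the change):

  static void build_layer_norm(struct ggml_context * ctx0, struct ggml_tensor ** cur,
                               const llama_layer & layer, int il, int i_gpu_start) {
      offload_func_t offload_func = llama_nop;       // default: keep the tensor on the CPU
  #ifdef GGML_USE_CUBLAS
      if (il >= i_gpu_start) {
          offload_func = ggml_cuda_assign_buffers;   // mark this layer's outputs for the GPU
      }
  #endif
      *cur = ggml_rms_norm(ctx0, *cur);
      offload_func(*cur);                            // no-op on CPU-only builds
      *cur = ggml_mul(ctx0, *cur, layer.attention_norm);
      offload_func(*cur);
  }
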
@@ -71,7 +80,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
71
80
  static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
72
81
  {
73
82
  static std::map<e_model, size_t> k_sizes = {
74
- { MODEL_3B, 128ull * MB },
83
+ { MODEL_3B, 256ull * MB },
75
84
  { MODEL_7B, 512ull * MB },
76
85
  { MODEL_13B, 512ull * MB },
77
86
  { MODEL_30B, 512ull * MB },
@@ -156,6 +165,11 @@ struct llama_kv_cache {
156
165
  if (ctx) {
157
166
  ggml_free(ctx);
158
167
  }
168
+
169
+ #ifdef GGML_USE_CUBLAS
170
+ ggml_cuda_free_data(k);
171
+ ggml_cuda_free_data(v);
172
+ #endif // GGML_USE_CUBLAS
159
173
  }
160
174
  };
161
175
 
@@ -170,6 +184,7 @@ struct llama_model {
170
184
  struct ggml_tensor * output;
171
185
 
172
186
  std::vector<llama_layer> layers;
187
+ int n_gpu_layers;
173
188
 
174
189
  // context
175
190
  struct ggml_context * ctx = NULL;
@@ -195,6 +210,17 @@ struct llama_model {
195
210
  if (ctx) {
196
211
  ggml_free(ctx);
197
212
  }
213
+
214
+ #ifdef GGML_USE_CUBLAS
215
+ for (size_t i = 0; i < tensors_by_name.size(); ++i) {
216
+ ggml_cuda_free_data(tensors_by_name[i].second);
217
+ }
218
+ ggml_cuda_free_scratch();
219
+ #elif defined(GGML_USE_CLBLAST)
220
+ for (size_t i = 0; i < tensors_by_name.size(); ++i) {
221
+ ggml_cl_free_data(tensors_by_name[i].second);
222
+ }
223
+ #endif
198
224
  }
199
225
  };
200
226
 
@@ -243,6 +269,10 @@ struct llama_context {
243
269
  llama_ctx_buffer buf_compute;
244
270
  llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
245
271
 
272
+ #ifdef GGML_USE_METAL
273
+ ggml_metal_context * ctx_metal = NULL;
274
+ #endif
275
+
246
276
  int buf_last = 0;
247
277
  size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
248
278
 
@@ -282,15 +312,15 @@ template <typename T>
282
312
  static T checked_mul(T a, T b) {
283
313
  T ret = a * b;
284
314
  if (a != 0 && ret / a != b) {
285
- throw format("overflow multiplying %llu * %llu",
286
- (unsigned long long) a, (unsigned long long) b);
315
+ throw std::runtime_error(format("overflow multiplying %llu * %llu",
316
+ (unsigned long long) a, (unsigned long long) b));
287
317
  }
288
318
  return ret;
289
319
  }
290
320
 
291
321
  static size_t checked_div(size_t a, size_t b) {
292
322
  if (b == 0 || a % b != 0) {
293
- throw format("error dividing %zu / %zu", a, b);
323
+ throw std::runtime_error(format("error dividing %zu / %zu", a, b));
294
324
  }
295
325
  return a / b;
296
326
  }
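
Throwing std::runtime_error instead of std::string (here and throughout the rest of this diff) lets every caller catch a single std::exception base and log err.what(), which is exactly what the updated llama_model_load, llama_model_quantize, and llama_apply_lora_from_file wrappers below do. A small usage sketch:

  try {
      const size_t n = checked_div(10, 3);   // throws: 10 is not divisible by 3
      (void) n;
  } catch (const std::exception & err) {
      fprintf(stderr, "error: %s\n", err.what());
  }
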
@@ -354,7 +384,7 @@ struct llama_load_tensor {
354
384
  const auto & first_shard = shards.at(0);
355
385
  for (const auto & shard : shards) {
356
386
  if (shard.type != first_shard.type) {
357
- throw format("inconsistent tensor shard type in '%s'", name.c_str());
387
+ throw std::runtime_error(format("inconsistent tensor shard type in '%s'", name.c_str()));
358
388
  }
359
389
  }
360
390
  type = first_shard.type;
@@ -377,8 +407,8 @@ struct llama_load_tensor {
377
407
  const auto & first_shard = shards.at(0);
378
408
  for (const auto & shard : shards) {
379
409
  if (shard.ne != first_shard.ne) {
380
- throw format("inconsistent tensor shard shape in '%s': first was %s, other was %s",
381
- name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str());
410
+ throw std::runtime_error(format("inconsistent tensor shard shape in '%s': first was %s, other was %s",
411
+ name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str()));
382
412
  }
383
413
  }
384
414
  ne = first_shard.ne;
@@ -456,8 +486,8 @@ struct llama_file_loader {
456
486
  }
457
487
  }
458
488
 
459
- throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
460
- magic, version);
489
+ throw std::runtime_error(format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
490
+ magic, version));
461
491
  }
462
492
  void read_hparams() {
463
493
  hparams.n_vocab = file.read_u32();
@@ -497,7 +527,7 @@ struct llama_file_loader {
497
527
  file.read_raw(shard.ne.data(), sizeof(shard.ne[0]) * n_dims);
498
528
  std::string name = file.read_string(name_len);
499
529
  if (n_dims < 1 || n_dims > 2) {
500
- throw format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims);
530
+ throw std::runtime_error(format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims));
501
531
  }
502
532
  switch (shard.type) {
503
533
  case GGML_TYPE_F32:
@@ -507,9 +537,14 @@ struct llama_file_loader {
507
537
  case GGML_TYPE_Q5_0:
508
538
  case GGML_TYPE_Q5_1:
509
539
  case GGML_TYPE_Q8_0:
540
+ case GGML_TYPE_Q2_K:
541
+ case GGML_TYPE_Q3_K:
542
+ case GGML_TYPE_Q4_K:
543
+ case GGML_TYPE_Q5_K:
544
+ case GGML_TYPE_Q6_K:
510
545
  break;
511
546
  default: {
512
- throw format("unrecognized tensor type %u\n", shard.type);
547
+ throw std::runtime_error(format("unrecognized tensor type %u\n", shard.type));
513
548
  }
514
549
  }
515
550
 
@@ -582,6 +617,11 @@ struct llama_file_saver {
582
617
  case GGML_TYPE_Q5_0:
583
618
  case GGML_TYPE_Q5_1:
584
619
  case GGML_TYPE_Q8_0:
620
+ case GGML_TYPE_Q2_K:
621
+ case GGML_TYPE_Q3_K:
622
+ case GGML_TYPE_Q4_K:
623
+ case GGML_TYPE_Q5_K:
624
+ case GGML_TYPE_Q6_K:
585
625
  break;
586
626
  default: LLAMA_ASSERT(false);
587
627
  }
@@ -613,7 +653,7 @@ struct llama_model_loader {
613
653
  auto * ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
614
654
  file_loaders.emplace_back(ith_file);
615
655
  if (ith_file->hparams != first_file->hparams) {
616
- throw format("llama.cpp: hparams inconsistent between files");
656
+ throw std::runtime_error(format("llama.cpp: hparams inconsistent between files"));
617
657
  }
618
658
  }
619
659
  if (!llama_mmap::SUPPORTED) {
@@ -643,7 +683,7 @@ struct llama_model_loader {
643
683
  uint32_t guess_n_parts() const {
644
684
  auto it = tensors_map.name_to_idx.find("tok_embeddings.weight");
645
685
  if (it == tensors_map.name_to_idx.end()) {
646
- throw std::string("missing tok_embeddings.weight");
686
+ throw std::runtime_error(std::string("missing tok_embeddings.weight"));
647
687
  }
648
688
  const llama_load_tensor & lt = tensors_map.tensors.at(it->second);
649
689
  return file_loaders.at(0)->hparams.n_embd / lt.shards.at(0).ne.at(0);
@@ -660,12 +700,12 @@ struct llama_model_loader {
660
700
  struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
661
701
  auto it = tensors_map.name_to_idx.find(name);
662
702
  if (it == tensors_map.name_to_idx.end()) {
663
- throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
703
+ throw std::runtime_error(std::runtime_error(format("llama.cpp: tensor '%s' is missing from model", name.c_str())));
664
704
  }
665
705
  llama_load_tensor & lt = tensors_map.tensors.at(it->second);
666
706
  if (lt.ne != ne) {
667
- throw format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
668
- name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
707
+ throw std::runtime_error(format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
708
+ name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str()));
669
709
  }
670
710
 
671
711
  return get_tensor_for(lt, backend);
@@ -673,6 +713,9 @@ struct llama_model_loader {
673
713
 
674
714
  struct ggml_tensor * get_tensor_for(llama_load_tensor & lt, ggml_backend backend) {
675
715
  struct ggml_tensor * tensor;
716
+ if (backend != GGML_BACKEND_CPU) {
717
+ ggml_set_no_alloc(ggml_ctx, true);
718
+ }
676
719
  if (lt.ne.size() == 2) {
677
720
  tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
678
721
  } else {
@@ -681,6 +724,10 @@ struct llama_model_loader {
681
724
  }
682
725
  ggml_set_name(tensor, lt.name.c_str());
683
726
  LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
727
+
728
+ if (backend != GGML_BACKEND_CPU) {
729
+ ggml_set_no_alloc(ggml_ctx, use_mmap);
730
+ }
684
731
  tensor->backend = backend;
685
732
  lt.ggml_tensor = tensor;
686
733
  num_ggml_tensors_created++;
@@ -689,13 +736,14 @@ struct llama_model_loader {
689
736
 
690
737
  void done_getting_tensors() const {
691
738
  if (num_ggml_tensors_created != tensors_map.tensors.size()) {
692
- throw std::string("llama.cpp: file contained more tensors than expected");
739
+ throw std::runtime_error(std::string("llama.cpp: file contained more tensors than expected"));
693
740
  }
694
741
  }
695
742
 
696
743
  void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
697
744
  size_t data_size = 0;
698
745
  size_t prefetch_size = 0;
746
+ size_t lock_size = 0;
699
747
  for (const llama_load_tensor & lt : tensors_map.tensors) {
700
748
  data_size += lt.size;
701
749
  if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
@@ -705,11 +753,6 @@ struct llama_model_loader {
705
753
 
706
754
  if (use_mmap) {
707
755
  mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
708
- if (!lmlock) {
709
- // Don't call the callback since the actual loading will be lazy
710
- // and we can't measure it.
711
- progress_callback = NULL;
712
- }
713
756
  if (lmlock) {
714
757
  lmlock->init(mapping->addr);
715
758
  }
@@ -717,20 +760,49 @@ struct llama_model_loader {
717
760
 
718
761
  size_t done_size = 0;
719
762
  for (llama_load_tensor & lt : tensors_map.tensors) {
720
- if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) {
721
- continue;
722
- }
723
763
  if (progress_callback) {
724
764
  progress_callback((float) done_size / data_size, progress_callback_user_data);
725
765
  }
726
766
  LLAMA_ASSERT(lt.ggml_tensor); // unused tensors should have been caught by load_data already
727
767
  lt.data = (uint8_t *) lt.ggml_tensor->data;
768
+
769
+ // allocate temp buffer if not using mmap
770
+ if (!use_mmap && lt.data == NULL) {
771
+ GGML_ASSERT(lt.ggml_tensor->backend != GGML_BACKEND_CPU);
772
+ lt.data = (uint8_t*)malloc(ggml_nbytes(lt.ggml_tensor));
773
+ }
774
+
728
775
  load_data_for(lt);
729
- lt.ggml_tensor->data = lt.data;
730
- done_size += lt.size;
731
- if (use_mmap && lmlock) {
732
- lmlock->grow_to(done_size);
776
+
777
+ switch(lt.ggml_tensor->backend) {
778
+ case GGML_BACKEND_CPU:
779
+ lt.ggml_tensor->data = lt.data;
780
+ if (use_mmap && lmlock) {
781
+ lock_size += lt.size;
782
+ lmlock->grow_to(lock_size);
783
+ }
784
+ break;
785
+ #if defined(GGML_USE_CUBLAS)
786
+ case GGML_BACKEND_GPU:
787
+ case GGML_BACKEND_GPU_SPLIT:
788
+ ggml_cuda_transform_tensor(lt.data, lt.ggml_tensor);
789
+ if (!use_mmap) {
790
+ free(lt.data);
791
+ }
792
+ break;
793
+ #elif defined(GGML_USE_CLBLAST)
794
+ case GGML_BACKEND_GPU:
795
+ ggml_cl_transform_tensor(lt.data, lt.ggml_tensor);
796
+ if (!use_mmap) {
797
+ free(lt.data);
798
+ }
799
+ break;
800
+ #endif
801
+ default:
802
+ continue;
733
803
  }
804
+
805
+ done_size += lt.size;
734
806
  }
735
807
  }
736
808
 
@@ -801,7 +873,8 @@ static bool kv_cache_init(
801
873
  const struct llama_hparams & hparams,
802
874
  struct llama_kv_cache & cache,
803
875
  ggml_type wtype,
804
- int n_ctx) {
876
+ int n_ctx,
877
+ int n_gpu_layers) {
805
878
  const int n_embd = hparams.n_embd;
806
879
  const int n_layer = hparams.n_layer;
807
880
 
@@ -827,13 +900,26 @@ static bool kv_cache_init(
827
900
  ggml_set_name(cache.k, "cache_k");
828
901
  ggml_set_name(cache.v, "cache_v");
829
902
 
903
+ #ifdef GGML_USE_CUBLAS
904
+ if (n_gpu_layers > n_layer + 1) {
905
+ ggml_cuda_assign_buffers_no_scratch(cache.v);
906
+ }
907
+ if (n_gpu_layers > n_layer + 2) {
908
+ ggml_cuda_assign_buffers_no_scratch(cache.k);
909
+ }
910
+ #endif // GGML_USE_CUBLAS
911
+
830
912
  return true;
831
913
  }
832
914
 
833
915
  struct llama_context_params llama_context_default_params() {
834
916
  struct llama_context_params result = {
835
917
  /*.n_ctx =*/ 512,
918
+ /*.n_batch =*/ 512,
836
919
  /*.gpu_layers =*/ 0,
920
+ /*.main_gpu =*/ 0,
921
+ /*.tensor_split =*/ {0},
922
+ /*.low_vram =*/ false,
837
923
  /*.seed =*/ -1,
838
924
  /*.f16_kv =*/ true,
839
925
  /*.logits_all =*/ false,
@@ -848,6 +934,17 @@ struct llama_context_params llama_context_default_params() {
848
934
  return result;
849
935
  }
850
936
 
937
+ struct llama_model_quantize_params llama_model_quantize_default_params() {
938
+ struct llama_model_quantize_params result = {
939
+ /*.nthread =*/ 0,
940
+ /*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
941
+ /*.allow_requantize =*/ false,
942
+ /*.quantize_output_tensor =*/ true,
943
+ };
944
+
945
+ return result;
946
+ }
947
+
851
948
  bool llama_mmap_supported() {
852
949
  return llama_mmap::SUPPORTED;
853
950
  }
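
The llama_model_quantize_params struct above replaces the old (ftype, nthread) arguments of llama_model_quantize(); the updated C API signature appears near the end of this diff. A sketch of the intended call pattern (file names are illustrative):

  llama_model_quantize_params qparams = llama_model_quantize_default_params();
  qparams.ftype   = LLAMA_FTYPE_MOSTLY_Q4_K_M;   // one of the new k-quant output types
  qparams.nthread = 4;                           // <= 0 falls back to hardware_concurrency()
  if (llama_model_quantize("ggml-model-f16.bin", "ggml-model-q4_K_M.bin", &qparams) != 0) {
      fprintf(stderr, "quantization failed\n");
  }
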
@@ -898,6 +995,16 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
898
995
  case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
899
996
  case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
900
997
  case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
998
+ // K-quants
999
+ case LLAMA_FTYPE_MOSTLY_Q2_K: return "mostly Q2_K";
1000
+ case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "mostly Q3_K - Small";
1001
+ case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "mostly Q3_K - Medium";
1002
+ case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "mostly Q3_K - Large";
1003
+ case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "mostly Q4_K - Small";
1004
+ case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "mostly Q4_K - Medium";
1005
+ case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "mostly Q5_K - Small";
1006
+ case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "mostly Q5_K - Medium";
1007
+ case LLAMA_FTYPE_MOSTLY_Q6_K: return "mostly Q6_K";
901
1008
  default: return "unknown, may not work";
902
1009
  }
903
1010
  }
@@ -917,7 +1024,11 @@ static void llama_model_load_internal(
917
1024
  const std::string & fname,
918
1025
  llama_context & lctx,
919
1026
  int n_ctx,
1027
+ int n_batch,
920
1028
  int n_gpu_layers,
1029
+ int main_gpu,
1030
+ const float * tensor_split,
1031
+ bool low_vram,
921
1032
  ggml_type memory_type,
922
1033
  bool use_mmap,
923
1034
  bool use_mlock,
@@ -932,9 +1043,9 @@ static void llama_model_load_internal(
932
1043
  lctx.vocab = std::move(ml->file_loaders.at(0)->vocab);
933
1044
  auto & model = lctx.model;
934
1045
  model.hparams = ml->file_loaders.at(0)->hparams;
1046
+ model.n_gpu_layers = n_gpu_layers;
935
1047
  llama_file_version file_version = ml->file_loaders.at(0)->file_version;
936
1048
  auto & hparams = model.hparams;
937
- uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
938
1049
 
939
1050
  {
940
1051
  switch (hparams.n_layer) {
@@ -943,11 +1054,19 @@ static void llama_model_load_internal(
943
1054
  case 40: model.type = e_model::MODEL_13B; break;
944
1055
  case 60: model.type = e_model::MODEL_30B; break;
945
1056
  case 80: model.type = e_model::MODEL_65B; break;
1057
+ default:
1058
+ {
1059
+ if (hparams.n_layer < 32) {
1060
+ model.type = e_model::MODEL_7B;
1061
+ }
1062
+ } break;
946
1063
  }
947
1064
 
948
1065
  hparams.n_ctx = n_ctx;
949
1066
  }
950
1067
 
1068
+ const uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
1069
+
951
1070
  {
952
1071
  fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version));
953
1072
  fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab);
@@ -967,7 +1086,7 @@ static void llama_model_load_internal(
967
1086
  if (hparams.ftype != LLAMA_FTYPE_ALL_F32 &&
968
1087
  hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 &&
969
1088
  hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) {
970
- throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)");
1089
+ throw std::runtime_error(format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)"));
971
1090
  }
972
1091
  }
973
1092
 
@@ -975,7 +1094,7 @@ static void llama_model_load_internal(
975
1094
  if (hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
976
1095
  hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_1 ||
977
1096
  hparams.ftype == LLAMA_FTYPE_MOSTLY_Q8_0) {
978
- throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)");
1097
+ throw std::runtime_error(format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)"));
979
1098
  }
980
1099
  }
981
1100
 
@@ -1006,18 +1125,28 @@ static void llama_model_load_internal(
1006
1125
 
1007
1126
  model.ctx = ggml_init(params);
1008
1127
  if (!model.ctx) {
1009
- throw format("ggml_init() failed");
1128
+ throw std::runtime_error(format("ggml_init() failed"));
1010
1129
  }
1011
1130
  }
1012
1131
 
1013
- #ifdef GGML_USE_CUBLAS
1014
- #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CUDA
1132
+ (void) main_gpu;
1133
+ #if defined(GGML_USE_CUBLAS)
1134
+ fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__);
1135
+ ggml_cuda_set_main_device(main_gpu);
1136
+ #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
1137
+ #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
1138
+ #elif defined(GGML_USE_CLBLAST)
1139
+ fprintf(stderr, "%s: using OpenCL for GPU acceleration\n", __func__);
1140
+ #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
1141
+ #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU
1015
1142
  #else
1016
- #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
1143
+ #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
1144
+ #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_CPU
1017
1145
  #endif
1018
1146
 
1019
1147
  // prepare memory for the weights
1020
- size_t vram_total = 0;
1148
+ size_t vram_weights = 0;
1149
+ size_t vram_scratch = 0;
1021
1150
  {
1022
1151
  const uint32_t n_embd = hparams.n_embd;
1023
1152
  const uint32_t n_layer = hparams.n_layer;
@@ -1026,25 +1155,42 @@ static void llama_model_load_internal(
1026
1155
  ml->ggml_ctx = ctx;
1027
1156
 
1028
1157
  model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU);
1029
- model.norm = ml->get_tensor("norm.weight", {n_embd}, GGML_BACKEND_CPU);
1030
1158
 
1031
1159
  // "output" tensor
1032
1160
  {
1161
+ ggml_backend backend_norm;
1033
1162
  ggml_backend backend_output;
1034
1163
  if (n_gpu_layers > int(n_layer)) { // NOLINT
1035
- backend_output = LLAMA_BACKEND_OFFLOAD;
1164
+ // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
1165
+ // on Windows however this is detrimental unless everything is on the GPU
1166
+ #ifndef _WIN32
1167
+ backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
1168
+ #else
1169
+ backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
1170
+ #endif // _WIN32
1171
+
1172
+ backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
1036
1173
  } else {
1174
+ backend_norm = GGML_BACKEND_CPU;
1037
1175
  backend_output = GGML_BACKEND_CPU;
1038
1176
  }
1039
1177
 
1178
+ model.norm = ml->get_tensor("norm.weight", {n_embd}, backend_norm);
1040
1179
  model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, backend_output);
1180
+ if (backend_norm == GGML_BACKEND_GPU) {
1181
+ vram_weights += ggml_nbytes(model.norm);
1182
+ }
1183
+ if (backend_output == GGML_BACKEND_GPU_SPLIT) {
1184
+ vram_weights += ggml_nbytes(model.output);
1185
+ }
1041
1186
  }
1042
1187
 
1043
1188
  const int i_gpu_start = n_layer - n_gpu_layers;
1044
1189
 
1045
1190
  model.layers.resize(n_layer);
1046
1191
  for (uint32_t i = 0; i < n_layer; ++i) {
1047
- const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
1192
+ const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
1193
+ const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
1048
1194
 
1049
1195
  auto & layer = model.layers[i];
1050
1196
 
@@ -1052,21 +1198,21 @@ static void llama_model_load_internal(
1052
1198
 
1053
1199
  layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);
1054
1200
 
1055
- layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend);
1056
- layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend);
1057
- layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend);
1058
- layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend);
1201
+ layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend_split);
1202
+ layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend_split);
1203
+ layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend_split);
1204
+ layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend_split);
1059
1205
 
1060
1206
  layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);
1061
1207
 
1062
- layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend);
1063
- layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend);
1064
- layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend);
1208
+ layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend_split);
1209
+ layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend_split);
1210
+ layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend_split);
1065
1211
 
1066
- if (backend == GGML_BACKEND_CUDA) {
1067
- vram_total +=
1212
+ if (backend == GGML_BACKEND_GPU) {
1213
+ vram_weights +=
1068
1214
  ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
1069
- ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.attention_norm) +
1215
+ ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
1070
1216
  ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
1071
1217
  }
1072
1218
  }
@@ -1081,10 +1227,10 @@ static void llama_model_load_internal(
1081
1227
  // this is the total memory required to run the inference
1082
1228
  const size_t mem_required =
1083
1229
  ctx_size +
1084
- mmapped_size - vram_total + // weights in VRAM not in memory
1230
+ mmapped_size - vram_weights + // weights in VRAM not in memory
1085
1231
  MEM_REQ_SCRATCH0().at(model.type) +
1086
1232
  MEM_REQ_SCRATCH1().at(model.type) +
1087
- MEM_REQ_EVAL().at(model.type);
1233
+ MEM_REQ_EVAL().at (model.type);
1088
1234
 
1089
1235
  // this is the memory required by one llama_state
1090
1236
  const size_t mem_required_state =
@@ -1093,15 +1239,51 @@ static void llama_model_load_internal(
1093
1239
  fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
1094
1240
  mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
1095
1241
 
1242
+ (void) vram_scratch;
1243
+ (void) n_batch;
1096
1244
  #ifdef GGML_USE_CUBLAS
1245
+ if (low_vram) {
1246
+ fprintf(stderr, "%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
1247
+ ggml_cuda_set_scratch_size(0); // disable scratch
1248
+ } else {
1249
+ vram_scratch = n_batch * MB;
1250
+ ggml_cuda_set_scratch_size(vram_scratch);
1251
+ if (n_gpu_layers > 0) {
1252
+ fprintf(stderr, "%s: allocating batch_size x 1 MB = %ld MB VRAM for the scratch buffer\n",
1253
+ __func__, vram_scratch / MB);
1254
+ }
1255
+ }
1256
+ #endif // GGML_USE_CUBLAS
1257
+ #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
1097
1258
  const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
1098
1259
 
1099
- fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
1260
+ fprintf(stderr, "%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
1100
1261
  if (n_gpu_layers > (int) hparams.n_layer) {
1101
- fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
1262
+ fprintf(stderr, "%s: offloading non-repeating layers to GPU\n", __func__);
1263
+ }
1264
+ size_t vram_kv_cache = 0;
1265
+ if (n_gpu_layers > (int) hparams.n_layer + 1) {
1266
+ if (low_vram) {
1267
+ fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
1268
+ } else {
1269
+ fprintf(stderr, "%s: offloading v cache to GPU\n", __func__);
1270
+ vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
1271
+ }
1102
1272
  }
1103
- fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
1104
- #elif !defined(GGML_USE_CLBLAST)
1273
+ if (n_gpu_layers > (int) hparams.n_layer + 2) {
1274
+ if (low_vram) {
1275
+ fprintf(stderr, "%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
1276
+ } else {
1277
+ fprintf(stderr, "%s: offloading k cache to GPU\n", __func__);
1278
+ vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
1279
+ }
1280
+ }
1281
+ const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
1282
+ fprintf(stderr, "%s: offloaded %d/%d layers to GPU\n",
1283
+ __func__, std::min(n_gpu_layers, max_offloadable_layers), hparams.n_layer + 3);
1284
+ fprintf(stderr, "%s: total VRAM used: %zu MB\n",
1285
+ __func__, (vram_weights + vram_scratch + vram_kv_cache + MB - 1) / MB); // round up
1286
+ #else
1105
1287
  (void) n_gpu_layers;
1106
1288
  #endif
1107
1289
  }
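
Note on the scratch sizing above: vram_scratch = n_batch * MB with MB = 1024*1024, so with the new default n_batch = 512 the CUDA scratch buffer is 512 MB unless low_vram disables it; passing a smaller n_batch in llama_context_params shrinks it proportionally.
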
@@ -1111,57 +1293,15 @@ static void llama_model_load_internal(
1111
1293
  model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
1112
1294
  }
1113
1295
 
1114
- ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
1115
-
1116
- #ifdef GGML_USE_CUBLAS
1296
+ (void) tensor_split;
1297
+ #if defined(GGML_USE_CUBLAS)
1117
1298
  {
1118
- size_t done_size = 0;
1119
- size_t data_size = 0;
1120
- for (llama_load_tensor & lt : ml->tensors_map.tensors) {
1121
- data_size += lt.size;
1122
- if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
1123
- done_size += lt.size;
1124
- }
1125
- }
1126
- for (llama_load_tensor & lt : ml->tensors_map.tensors) {
1127
- if (lt.ggml_tensor->backend != GGML_BACKEND_CUDA) {
1128
- continue;
1129
- }
1130
- if (progress_callback) {
1131
- progress_callback((float) done_size / data_size, progress_callback_user_data);
1132
- }
1133
- ggml_cuda_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
1134
- done_size += lt.size;
1135
- }
1136
- }
1137
- #elif defined(GGML_USE_CLBLAST)
1138
- {
1139
- const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
1140
-
1141
- fprintf(stderr, "ggml_opencl: offloading %d layers to GPU\n", n_gpu);
1142
-
1143
- size_t vram_total = 0;
1144
-
1145
- for (int i = 0; i < n_gpu; ++i) {
1146
- const auto & layer = model.layers[i];
1147
-
1148
- ggml_cl_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
1149
- ggml_cl_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
1150
- ggml_cl_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
1151
- ggml_cl_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
1152
- ggml_cl_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
1153
- ggml_cl_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
1154
- ggml_cl_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
1155
- }
1156
- if (n_gpu_layers > (int) hparams.n_layer) {
1157
- fprintf(stderr, "ggml_opencl: offloading output layer to GPU\n");
1158
- ggml_cl_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
1159
- }
1160
-
1161
- fprintf(stderr, "ggml_opencl: total VRAM used: %zu MB\n", vram_total / 1024 / 1024);
1299
+ ggml_cuda_set_tensor_split(tensor_split);
1162
1300
  }
1163
1301
  #endif
1164
1302
 
1303
+ ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
1304
+
1165
1305
  if (progress_callback) {
1166
1306
  progress_callback(1.0f, progress_callback_user_data);
1167
1307
  }
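
ggml_cuda_set_tensor_split() above consumes the new tensor_split field of llama_context_params, alongside main_gpu and low_vram. A sketch of how a caller fills these fields (names taken from the llama_model_load() call later in this diff; the split ratios and model path are illustrative, and tensor_split is assumed to be a fixed-size float array as declared in llama.h):

  llama_context_params cparams = llama_context_default_params();
  cparams.n_gpu_layers    = 35;      // number of layers to offload
  cparams.main_gpu        = 0;       // device holding the scratch buffer and small tensors
  cparams.tensor_split[0] = 0.6f;    // fraction of split tensors placed on device 0
  cparams.tensor_split[1] = 0.4f;    // fraction placed on device 1
  cparams.low_vram        = false;   // true disables the VRAM scratch buffer
  struct llama_context * ctx = llama_init_from_file("ggml-model-q4_K_M.bin", cparams);
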
@@ -1177,7 +1317,11 @@ static bool llama_model_load(
1177
1317
  const std::string & fname,
1178
1318
  llama_context & lctx,
1179
1319
  int n_ctx,
1320
+ int n_batch,
1180
1321
  int n_gpu_layers,
1322
+ int main_gpu,
1323
+ float * tensor_split,
1324
+ bool low_vram,
1181
1325
  ggml_type memory_type,
1182
1326
  bool use_mmap,
1183
1327
  bool use_mlock,
@@ -1185,28 +1329,30 @@ static bool llama_model_load(
1185
1329
  llama_progress_callback progress_callback,
1186
1330
  void *progress_callback_user_data) {
1187
1331
  try {
1188
- llama_model_load_internal(fname, lctx, n_ctx, n_gpu_layers, memory_type, use_mmap, use_mlock,
1189
- vocab_only, progress_callback, progress_callback_user_data);
1332
+ llama_model_load_internal(fname, lctx, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
1333
+ use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
1190
1334
  return true;
1191
- } catch (const std::string & err) {
1192
- fprintf(stderr, "error loading model: %s\n", err.c_str());
1335
+ } catch (const std::exception & err) {
1336
+ fprintf(stderr, "error loading model: %s\n", err.what());
1193
1337
  return false;
1194
1338
  }
1195
1339
  }
1196
1340
 
1197
1341
  // evaluate the transformer
1198
1342
  //
1199
- // - lctx: llama context
1200
- // - tokens: new batch of tokens to process
1201
- // - n_past: the context size so far
1202
- // - n_threads: number of threads to use
1343
+ // - lctx: llama context
1344
+ // - tokens: new batch of tokens to process
1345
+ // - n_past: the context size so far
1346
+ // - n_threads: number of threads to use
1347
+ // - cgraph_fname: filename of the exported computation graph
1203
1348
  //
1204
1349
  static bool llama_eval_internal(
1205
- llama_context & lctx,
1206
- const llama_token * tokens,
1207
- const int n_tokens,
1208
- const int n_past,
1209
- const int n_threads) {
1350
+ llama_context & lctx,
1351
+ const llama_token * tokens,
1352
+ const int n_tokens,
1353
+ const int n_past,
1354
+ const int n_threads,
1355
+ const char * cgraph_fname) {
1210
1356
 
1211
1357
  // enforce that the first token is BOS
1212
1358
  if (n_past == 0 && tokens[0] != llama_token_bos()) {
@@ -1225,12 +1371,13 @@ static bool llama_eval_internal(
1225
1371
 
1226
1372
  LLAMA_ASSERT(!!kv_self.ctx);
1227
1373
 
1228
- const int n_embd = hparams.n_embd;
1229
- const int n_layer = hparams.n_layer;
1230
- const int n_ctx = hparams.n_ctx;
1231
- const int n_head = hparams.n_head;
1232
- const int n_vocab = hparams.n_vocab;
1233
- const int n_rot = hparams.n_embd/hparams.n_head;
1374
+ const int n_embd = hparams.n_embd;
1375
+ const int n_layer = hparams.n_layer;
1376
+ const int n_ctx = hparams.n_ctx;
1377
+ const int n_head = hparams.n_head;
1378
+ const int n_vocab = hparams.n_vocab;
1379
+ const int n_rot = hparams.n_embd/hparams.n_head;
1380
+ const int n_gpu_layers = model.n_gpu_layers;
1234
1381
 
1235
1382
  auto & mem_per_token = lctx.mem_per_token;
1236
1383
  auto & buf_compute = lctx.buf_compute;
@@ -1252,40 +1399,98 @@ static bool llama_eval_internal(
1252
1399
  ggml_set_name(embd, "embd");
1253
1400
  memcpy(embd->data, tokens, N*ggml_element_size(embd));
1254
1401
 
1402
+ struct ggml_tensor * cur;
1255
1403
  struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
1256
1404
 
1405
+ const int i_gpu_start = n_layer - n_gpu_layers;
1406
+ (void) i_gpu_start;
1407
+
1408
+ // offload functions set the tensor output backend to GPU
1409
+ // tensors are GPU-accelerated if any input or the output has been offloaded
1410
+ //
1411
+ // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
1412
+ // in that case ggml_cuda_assign_buffers has no effect
1413
+ offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
1414
+ offload_func_t offload_func_kq = llama_nop;
1415
+ offload_func_t offload_func_v = llama_nop;
1416
+
1417
+ #ifdef GGML_USE_CUBLAS
1418
+ if (n_gpu_layers > n_layer) {
1419
+ offload_func_nr = ggml_cuda_assign_buffers;
1420
+ }
1421
+ if (n_gpu_layers > n_layer + 1) {
1422
+ offload_func_v = ggml_cuda_assign_buffers;
1423
+ }
1424
+ if (n_gpu_layers > n_layer + 2) {
1425
+ offload_func_kq = ggml_cuda_assign_buffers;
1426
+ }
1427
+ #endif // GGML_USE_CUBLAS
1428
+
1257
1429
  for (int il = 0; il < n_layer; ++il) {
1258
- struct ggml_tensor * inpSA = inpL;
1430
+ offload_func_t offload_func = llama_nop;
1431
+
1432
+ #ifdef GGML_USE_CUBLAS
1433
+ if (il >= i_gpu_start) {
1434
+ offload_func = ggml_cuda_assign_buffers;
1435
+ }
1436
+ #endif // GGML_USE_CUBLAS
1259
1437
 
1260
- struct ggml_tensor * cur;
1438
+ struct ggml_tensor * inpSA = inpL;
1261
1439
 
1262
1440
  lctx.use_buf(ctx0, 0);
1263
1441
 
1264
1442
  // norm
1265
1443
  {
1266
1444
  cur = ggml_rms_norm(ctx0, inpL);
1445
+ offload_func(cur);
1446
+ ggml_set_name(cur, "rms_norm_0");
1267
1447
 
1268
1448
  // cur = cur*attention_norm(broadcasted)
1269
1449
  cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm);
1450
+ offload_func(cur);
1451
+ ggml_set_name(cur, "attention_norm_0");
1270
1452
  }
1271
1453
 
1272
1454
  // self-attention
1273
1455
  {
1274
1456
  // compute Q and K and RoPE them
1275
- struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
1276
- struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
1277
- ggml_set_name(Qcur, "Qcur");
1457
+ struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
1458
+ offload_func_kq(tmpk);
1459
+ ggml_set_name(tmpk, "tmpk");
1460
+
1461
+ struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
1462
+ offload_func_kq(tmpq);
1463
+ ggml_set_name(tmpq, "tmpq");
1464
+
1465
+ struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0);
1466
+ offload_func_kq(Kcur);
1278
1467
  ggml_set_name(Kcur, "Kcur");
1279
1468
 
1469
+ struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0);
1470
+ offload_func_kq(Qcur);
1471
+ ggml_set_name(Qcur, "Qcur");
1472
+
1280
1473
  // store key and value to memory
1281
1474
  {
1282
1475
  // compute the transposed [N, n_embd] V matrix
1283
- struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), n_embd, N));
1476
+
1477
+ struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
1478
+ offload_func_v(tmpv);
1479
+ ggml_set_name(tmpv, "tmpv");
1480
+
1481
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd, N));
1482
+ offload_func_v(Vcur);
1483
+ ggml_set_name(Vcur, "Vcur");
1284
1484
 
1285
1485
  struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
1486
+ offload_func_kq(k);
1487
+ ggml_set_name(k, "k");
1488
+
1286
1489
  struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
1287
1490
  ( n_ctx)*ggml_element_size(kv_self.v),
1288
1491
  (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
1492
+ offload_func_v(v);
1493
+ ggml_set_name(v, "v");
1289
1494
 
1290
1495
  // important: storing RoPE-ed version of K in the KV cache!
1291
1496
  ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
@@ -1296,6 +1501,7 @@ static bool llama_eval_internal(
1296
1501
  ggml_permute(ctx0,
1297
1502
  Qcur,
1298
1503
  0, 2, 1, 3);
1504
+ offload_func_kq(Q);
1299
1505
  ggml_set_name(Q, "Q");
1300
1506
 
1301
1507
  struct ggml_tensor * K =
@@ -1304,10 +1510,12 @@ static bool llama_eval_internal(
1304
1510
  ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
1305
1511
  n_embd/n_head, n_head, n_past + N),
1306
1512
  0, 2, 1, 3);
1513
+ offload_func_kq(K);
1307
1514
  ggml_set_name(K, "K");
1308
1515
 
1309
1516
  // K * Q
1310
1517
  struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
1518
+ offload_func_kq(KQ);
1311
1519
  ggml_set_name(KQ, "KQ");
1312
1520
 
1313
1521
  // KQ_scaled = KQ / sqrt(n_embd/n_head)
@@ -1316,17 +1524,19 @@ static bool llama_eval_internal(
1316
1524
 
1317
1525
  // KQ_scaled shape [n_past + N, N, n_head, 1]
1318
1526
  struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
1527
+ offload_func_kq(KQ_scaled);
1319
1528
  ggml_set_name(KQ_scaled, "KQ_scaled");
1320
1529
 
1321
1530
  // KQ_masked = mask_past(KQ_scaled)
1322
1531
  struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
1532
+ offload_func_kq(KQ_masked);
1323
1533
  ggml_set_name(KQ_masked, "KQ_masked");
1324
1534
 
1325
1535
  // KQ = soft_max(KQ_masked)
1326
1536
  struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
1537
+ offload_func_v(KQ_soft_max);
1327
1538
  ggml_set_name(KQ_soft_max, "KQ_soft_max");
1328
1539
 
1329
-
1330
1540
  // split cached V into n_head heads
1331
1541
  struct ggml_tensor * V =
1332
1542
  ggml_view_3d(ctx0, kv_self.v,
@@ -1334,10 +1544,12 @@ static bool llama_eval_internal(
1334
1544
  n_ctx*ggml_element_size(kv_self.v),
1335
1545
  n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head,
1336
1546
  il*n_ctx*ggml_element_size(kv_self.v)*n_embd);
1547
+ offload_func_v(V);
1337
1548
  ggml_set_name(V, "V");
1338
1549
 
1339
1550
  #if 1
1340
1551
  struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
1552
+ offload_func_v(KQV);
1341
1553
  ggml_set_name(KQV, "KQV");
1342
1554
  #else
1343
1555
  // make V contiguous in memory to speed up the matmul, however we waste time on the copy
@@ -1349,56 +1561,79 @@ static bool llama_eval_internal(
1349
1561
 
1350
1562
  // KQV_merged = KQV.permute(0, 2, 1, 3)
1351
1563
  struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
1564
+ offload_func_v(KQV_merged);
1352
1565
  ggml_set_name(KQV_merged, "KQV_merged");
1353
1566
 
1354
1567
  // cur = KQV_merged.contiguous().view(n_embd, N)
1355
1568
  cur = ggml_cpy(ctx0,
1356
1569
  KQV_merged,
1357
1570
  ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
1571
+ offload_func_v(cur);
1358
1572
  ggml_set_name(cur, "KQV_merged_contiguous");
1359
1573
 
1360
1574
  // projection (no bias)
1361
1575
  cur = ggml_mul_mat(ctx0,
1362
1576
  model.layers[il].wo,
1363
1577
  cur);
1578
+ offload_func(cur);
1579
+ ggml_set_name(cur, "result_wo");
1364
1580
  }
1365
1581
 
1366
1582
  lctx.use_buf(ctx0, 1);
1367
1583
 
1368
1584
  struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
1585
+ offload_func(inpFF);
1586
+ ggml_set_name(inpFF, "inpFF");
1369
1587
 
1370
1588
  // feed-forward network
1371
1589
  {
1372
1590
  // norm
1373
1591
  {
1374
1592
  cur = ggml_rms_norm(ctx0, inpFF);
1593
+ offload_func(cur);
1594
+ ggml_set_name(cur, "rms_norm_1");
1375
1595
 
1376
1596
  // cur = cur*ffn_norm(broadcasted)
1377
1597
  cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
1598
+ offload_func(cur);
1599
+ ggml_set_name(cur, "ffn_norm");
1378
1600
  }
1379
1601
 
1380
1602
  struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
1381
1603
  model.layers[il].w3,
1382
1604
  cur);
1605
+ offload_func(tmp);
1606
+ ggml_set_name(tmp, "result_w3");
1383
1607
 
1384
1608
  cur = ggml_mul_mat(ctx0,
1385
1609
  model.layers[il].w1,
1386
1610
  cur);
1611
+ offload_func(cur);
1612
+ ggml_set_name(cur, "result_w2");
1387
1613
 
1388
1614
  // SILU activation
1389
1615
  cur = ggml_silu(ctx0, cur);
1616
+ offload_func(cur);
1617
+ ggml_set_name(cur, "silu");
1390
1618
 
1391
1619
  cur = ggml_mul(ctx0, cur, tmp);
1620
+ offload_func(cur);
1621
+ ggml_set_name(cur, "silu_x_result_w3");
1392
1622
 
1393
1623
  cur = ggml_mul_mat(ctx0,
1394
1624
  model.layers[il].w2,
1395
1625
  cur);
1626
+ offload_func(cur);
1627
+ ggml_set_name(cur, "result_w2");
1396
1628
  }
1397
1629
 
1398
1630
  cur = ggml_add(ctx0, cur, inpFF);
1631
+ offload_func(cur);
1632
+ ggml_set_name(cur, "inpFF_+_result_w2");
1399
1633
 
1400
1634
  // input for next layer
1401
1635
  inpL = cur;
1636
+
1402
1637
  }
1403
1638
 
1404
1639
  lctx.use_buf(ctx0, 0);
@@ -1406,28 +1641,68 @@ static bool llama_eval_internal(
1406
1641
  // used at the end to optionally extract the embeddings
1407
1642
  struct ggml_tensor * embeddings = NULL;
1408
1643
 
1644
+
1409
1645
  // norm
1410
1646
  {
1647
+ cur = ggml_rms_norm(ctx0, inpL);
1648
+ offload_func_nr(cur);
1649
+ ggml_set_name(cur, "rms_norm_inpL");
1411
1650
 
1412
- inpL = ggml_rms_norm(ctx0, inpL);
1651
+ cur = ggml_rms_norm(ctx0, cur);
1652
+ offload_func_nr(cur);
1653
+ ggml_set_name(cur, "rms_norm_after");
1413
1654
 
1414
- // inpL = inpL*norm(broadcasted)
1415
- inpL = ggml_mul(ctx0, inpL, model.norm);
1655
+ // cur = cur*norm(broadcasted)
1656
+ cur = ggml_mul(ctx0, cur, model.norm);
1657
+ offload_func_nr(cur);
1658
+ ggml_set_name(cur, "result_norm");
1416
1659
 
1417
- embeddings = inpL;
1660
+ embeddings = cur;
1418
1661
  }
1419
1662
 
1663
+
1420
1664
  // lm_head
1421
- inpL = ggml_mul_mat(ctx0, model.output, inpL);
1665
+ cur = ggml_mul_mat(ctx0, model.output, cur);
1666
+ ggml_set_name(cur, "result_output");
1422
1667
 
1423
1668
  lctx.use_buf(ctx0, -1);
1424
1669
 
1425
1670
  // logits -> probs
1426
- //inpL = ggml_soft_max_inplace(ctx0, inpL);
1671
+ //cur = ggml_soft_max_inplace(ctx0, cur);
1427
1672
 
1428
1673
  // run the computation
1429
- ggml_build_forward_expand(&gf, inpL);
1430
- ggml_graph_compute (ctx0, &gf);
1674
+ ggml_build_forward_expand(&gf, cur);
1675
+
1676
+ #ifdef GGML_USE_METAL
1677
+ if (lctx.ctx_metal && N == 1) {
1678
+ ggml_metal_graph_compute(lctx.ctx_metal, &gf);
1679
+ ggml_metal_get_tensor (lctx.ctx_metal, cur);
1680
+ } else {
1681
+ // IMPORTANT:
1682
+ // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla
1683
+ // ggml_graph_compute(). It uses Apple's Accelerate CBLAS API which takes advantage of the ANE or the AMX
1684
+ // coprocessor.
1685
+ //
1686
+ // When we implement Matrix x Matrix Metal multiplication, we can avoid this branch.
1687
+ // But for now, we have focused only on Matrix x Vector Metal multiplication.
1688
+ //
1689
+ // TODO: avoid these syncs via shared memory (ref #1696)
1690
+ //
1691
+ if (lctx.ctx_metal) {
1692
+ // We need to sync the GPU KV cache with the CPU KV cache
1693
+ ggml_metal_get_tensor(lctx.ctx_metal, kv_self.k);
1694
+ ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
1695
+ }
1696
+
1697
+ ggml_graph_compute(ctx0, &gf);
1698
+ }
1699
+ #else
1700
+ ggml_graph_compute(ctx0, &gf);
1701
+ #endif
1702
+
1703
+ if (cgraph_fname) {
1704
+ ggml_graph_export(&gf, cgraph_fname);
1705
+ }
1431
1706
 
1432
1707
  #ifdef GGML_PERF
1433
1708
  // print timing information per ggml operation (for debugging purposes)
@@ -1441,7 +1716,7 @@ static bool llama_eval_internal(
1441
1716
  //}
1442
1717
 
1443
1718
  //embd_w.resize(n_vocab*N);
1444
- //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
1719
+ //memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N);
1445
1720
 
1446
1721
  // update kv token count
1447
1722
  lctx.model.kv_self.n = n_past + N;
@@ -1452,11 +1727,11 @@ static bool llama_eval_internal(
1452
1727
 
1453
1728
  if (lctx.logits_all) {
1454
1729
  logits_out.resize(n_vocab * N);
1455
- memcpy(logits_out.data(), (float *) ggml_get_data(inpL), sizeof(float)*n_vocab*N);
1730
+ memcpy(logits_out.data(), (float *) ggml_get_data(cur), sizeof(float)*n_vocab*N);
1456
1731
  } else {
1457
1732
  // return result for just the last token
1458
1733
  logits_out.resize(n_vocab);
1459
- memcpy(logits_out.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
1734
+ memcpy(logits_out.data(), (float *) ggml_get_data(cur) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
1460
1735
  }
1461
1736
  }
1462
1737
 
@@ -1987,6 +2262,10 @@ llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_tok
1987
2262
  return -log2f(candidate.p) > *mu;
1988
2263
  }));
1989
2264
 
2265
+ if (candidates->size == 0) {
2266
+ candidates->size = 1;
2267
+ }
2268
+
1990
2269
  // Normalize the probabilities of the remaining words
1991
2270
  llama_sample_softmax(ctx, candidates);
1992
2271
 
@@ -2055,16 +2334,92 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
2055
2334
  // quantization
2056
2335
  //
2057
2336
 
2058
- static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype, int nthread) {
2337
+ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llama_buffer & output, const int nelements, const int nthread) {
2338
+ if (output.size < nelements * sizeof(float)) {
2339
+ output.resize(nelements * sizeof(float));
2340
+ }
2341
+ float * f32_output = (float *) output.addr;
2342
+
2343
+ quantize_fns_t qtype;
2344
+ if (ggml_is_quantized(tensor.type)) {
2345
+ qtype = ggml_internal_get_quantize_fn(tensor.type);
2346
+ if (qtype.dequantize_row_q == NULL) {
2347
+ throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor.type)));
2348
+ }
2349
+ } else if (tensor.type != GGML_TYPE_F16) {
2350
+ throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor.type)));
2351
+ }
2352
+
2353
+ if (nthread < 2) {
2354
+ if (tensor.type == GGML_TYPE_F16) {
2355
+ ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor.data, f32_output, nelements);
2356
+ } else if (ggml_is_quantized(tensor.type)) {
2357
+ qtype.dequantize_row_q(tensor.data, f32_output, nelements);
2358
+ } else {
2359
+ LLAMA_ASSERT(false); // unreachable
2360
+ }
2361
+ return;
2362
+ }
2363
+
2364
+ auto block_size = tensor.type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor.type);
2365
+ auto block_size_bytes = ggml_type_size(tensor.type);
2366
+
2367
+ LLAMA_ASSERT(nelements % block_size == 0);
2368
+ auto nblocks = nelements / block_size;
2369
+ auto blocks_per_thread = nblocks / nthread;
2370
+ auto spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count
2371
+
2372
+ std::vector<std::thread> workers;
2373
+ for (auto tnum = 0, in_buff_offs = 0, out_buff_offs = 0; tnum < nthread; tnum++) {
2374
+ auto thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
2375
+ auto thr_elems = thr_blocks * block_size; // number of elements for this thread
2376
+ auto thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread
2377
+
2378
+ auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
2379
+ if (typ == GGML_TYPE_F16) {
2380
+ ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
2381
+ } else {
2382
+ qtype.dequantize_row_q(inbuf, outbuf, nels);
2383
+ }
2384
+ };
2385
+ workers.push_back(std::thread(compute, tensor.type, tensor.data + in_buff_offs, f32_output + out_buff_offs, thr_elems));
2386
+ in_buff_offs += thr_block_bytes;
2387
+ out_buff_offs += thr_elems;
2388
+ }
2389
+ for (auto & worker : workers) {
2390
+ worker.join();
2391
+ }
2392
+
2393
+ }
2394
+
2395
+ static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
2059
2396
  ggml_type quantized_type;
2060
- switch (ftype) {
2397
+ llama_ftype ftype = params->ftype;
2398
+ int nthread = params->nthread;
2399
+
2400
+ switch (params->ftype) {
2061
2401
  case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
2062
2402
  case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
2063
2403
  case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
2064
2404
  case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
2065
2405
  case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
2066
- default: throw format("invalid output file type %d\n", ftype);
2067
- };
2406
+ case LLAMA_FTYPE_MOSTLY_F16: quantized_type = GGML_TYPE_F16; break;
2407
+ case LLAMA_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break;
2408
+
2409
+ #ifdef GGML_USE_K_QUANTS
2410
+ // K-quants
2411
+ case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
2412
+ case LLAMA_FTYPE_MOSTLY_Q3_K_S:
2413
+ case LLAMA_FTYPE_MOSTLY_Q3_K_M:
2414
+ case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
2415
+ case LLAMA_FTYPE_MOSTLY_Q4_K_S:
2416
+ case LLAMA_FTYPE_MOSTLY_Q4_K_M: quantized_type = GGML_TYPE_Q4_K; break;
2417
+ case LLAMA_FTYPE_MOSTLY_Q5_K_S:
2418
+ case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
2419
+ case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
2420
+ #endif
2421
+ default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
2422
+ }
2068
2423
 
2069
2424
  if (nthread <= 0) {
2070
2425
  nthread = std::thread::hardware_concurrency();
@@ -2072,7 +2427,23 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
2072
2427
 
2073
2428
  std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false,
2074
2429
  /*vocab_only*/ false));
2075
- llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
2430
+ llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), params->ftype);
2431
+
2432
+ #ifdef GGML_USE_K_QUANTS
2433
+ int n_attention_wv = 0;
2434
+ int n_feed_forward_w2 = 0;
2435
+ for (auto& tensor : model_loader->tensors_map.tensors) {
2436
+ if (tensor.name.find("attention.wv.weight") != std::string::npos) {
2437
+ ++n_attention_wv;
2438
+ }
2439
+ else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
2440
+ ++n_feed_forward_w2;
2441
+ }
2442
+ }
2443
+
2444
+ int i_attention_wv = 0;
2445
+ int i_feed_forward_w2 = 0;
2446
+ #endif
2076
2447
 
2077
2448
  size_t total_size_org = 0;
2078
2449
  size_t total_size_new = 0;
@@ -2098,11 +2469,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
2098
2469
 
2099
2470
  // quantize only 2D tensors
2100
2471
  quantize &= (tensor.ne.size() == 2);
2101
-
2102
- // uncomment this to keep the output layer in FP16
2103
- //if (tensor.name == "output.weight") {
2104
- // quantize = false;
2105
- //}
2472
+ quantize &= params->quantize_output_tensor || tensor.name != "output.weight";
2473
+ quantize &= quantized_type != tensor.type;
2106
2474
 
2107
2475
  enum ggml_type new_type;
2108
2476
  void * new_data;
@@ -2116,20 +2484,40 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
2116
2484
  printf("size = %8.3f MB\n", tensor.size/1024.0/1024.0);
2117
2485
  } else {
2118
2486
  new_type = quantized_type;
2487
+ #ifdef GGML_USE_K_QUANTS
2488
+ if (tensor.name == "output.weight") {
2489
+ new_type = GGML_TYPE_Q6_K;
2490
+ } else if (tensor.name.find("attention.wv.weight") != std::string::npos) {
2491
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
2492
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
2493
+ else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
2494
+ (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8 ||
2495
+ (i_attention_wv - n_attention_wv/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
2496
+ ++i_attention_wv;
2497
+ } else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
2498
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
2499
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
2500
+ else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
2501
+ (i_feed_forward_w2 < n_feed_forward_w2/8 || i_feed_forward_w2 >= 7*n_feed_forward_w2/8 ||
2502
+ (i_feed_forward_w2 - n_feed_forward_w2/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
2503
+ ++i_feed_forward_w2;
2504
+ } else if (tensor.name.find("attention.wo.weight") != std::string::npos) {
2505
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
2506
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
2507
+ }
2508
+ #endif
2509
+
2119
2510
  float * f32_data;
2120
2511
  size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
2121
2512
  llama_buffer f32_conv_buf;
2513
+
2122
2514
  if (tensor.type == GGML_TYPE_F32) {
2123
2515
  f32_data = (float *) tensor.data;
2124
- } else if (tensor.type == GGML_TYPE_F16) {
2125
- f32_conv_buf.resize(nelements * sizeof(float));
2126
- f32_data = (float *) f32_conv_buf.addr;
2127
- const auto * f16_data = (const ggml_fp16_t *) tensor.data;
2128
- for (size_t i = 0; i < nelements; i++) {
2129
- f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
2130
- }
2516
+ } else if (ggml_is_quantized(tensor.type) && !params->allow_requantize) {
2517
+ throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor.type)));
2131
2518
  } else {
2132
- throw format("type %s unsupported for integer quantization", ggml_type_name(tensor.type));
2519
+ llama_convert_tensor_internal(tensor, f32_conv_buf, nelements, nthread);
2520
+ f32_data = (float *) f32_conv_buf.addr;
2133
2521
  }
2134
2522
 
2135
2523
  printf("quantizing .. ");
@@ -2183,12 +2571,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
2183
2571
  }
2184
2572
 
2185
2573
  printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
2574
+ int64_t tot_count = 0;
2186
2575
  for (size_t i = 0; i < hist_cur.size(); i++) {
2187
2576
  hist_all[i] += hist_cur[i];
2577
+ tot_count += hist_cur[i];
2188
2578
  }
2189
2579
 
2190
- for (size_t i = 0; i < hist_cur.size(); i++) {
2191
- printf("%5.3f ", hist_cur[i] / float(nelements));
2580
+ if (tot_count > 0) {
2581
+ for (size_t i = 0; i < hist_cur.size(); i++) {
2582
+ printf("%5.3f ", hist_cur[i] / float(nelements));
2583
+ }
2192
2584
  }
2193
2585
  printf("\n");
2194
2586
  }
@@ -2206,11 +2598,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
2206
2598
  sum_all += hist_all[i];
2207
2599
  }
2208
2600
 
2209
- printf("%s: hist: ", __func__);
2210
- for (size_t i = 0; i < hist_all.size(); i++) {
2211
- printf("%5.3f ", hist_all[i] / float(sum_all));
2601
+ if (sum_all > 0) {
2602
+ printf("%s: hist: ", __func__);
2603
+ for (size_t i = 0; i < hist_all.size(); i++) {
2604
+ printf("%5.3f ", hist_all[i] / float(sum_all));
2605
+ }
2606
+ printf("\n");
2212
2607
  }
2213
- printf("\n");
2214
2608
  }
2215
2609
  }
2216
2610
 
@@ -2251,9 +2645,9 @@ struct llama_context * llama_init_from_file(
2251
2645
 
2252
2646
  ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
2253
2647
 
2254
- if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_gpu_layers, memory_type,
2255
- params.use_mmap, params.use_mlock, params.vocab_only,
2256
- params.progress_callback, params.progress_callback_user_data)) {
2648
+ if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers, params.main_gpu,
2649
+ params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
2650
+ params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
2257
2651
  fprintf(stderr, "%s: failed to load model\n", __func__);
2258
2652
  llama_free(ctx);
2259
2653
  return nullptr;
@@ -2261,7 +2655,7 @@ struct llama_context * llama_init_from_file(
2261
2655
 
2262
2656
  // reserve memory for context buffers
2263
2657
  if (!params.vocab_only) {
2264
- if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx)) {
2658
+ if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
2265
2659
  fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
2266
2660
  llama_free(ctx);
2267
2661
  return nullptr;
@@ -2291,6 +2685,38 @@ struct llama_context * llama_init_from_file(
2291
2685
  ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
2292
2686
  }
2293
2687
 
2688
+ #ifdef GGML_USE_METAL
2689
+ if (params.n_gpu_layers > 0) {
2690
+ // this allocates all Metal resources and memory buffers
2691
+ ctx->ctx_metal = ggml_metal_init();
2692
+
2693
+ void *data_ptr = NULL;
2694
+ size_t data_size = 0;
2695
+ if (params.use_mmap) {
2696
+ data_ptr = ctx->model.mapping->addr;
2697
+ data_size= ctx->model.mapping->size;
2698
+ } else {
2699
+ data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
2700
+ data_size= ggml_get_mem_size(ctx->model.ctx);
2701
+ }
2702
+
2703
+ #define LLAMA_METAL_CHECK_BUF(result) \
2704
+ if (!(result)) { \
2705
+ fprintf(stderr, "%s: failed to add buffer\n", __func__); \
2706
+ llama_free(ctx); \
2707
+ return NULL; \
2708
+ }
2709
+
2710
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size));
2711
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size));
2712
+
2713
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size));
2714
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size));
2715
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size));
2716
+ #undef LLAMA_METAL_CHECK_BUF
2717
+ }
2718
+ #endif
2719
+
2294
2720
  return ctx;
2295
2721
  }
2296
2722
 
@@ -2301,13 +2727,12 @@ void llama_free(struct llama_context * ctx) {
2301
2727
  int llama_model_quantize(
2302
2728
  const char * fname_inp,
2303
2729
  const char * fname_out,
2304
- enum llama_ftype ftype,
2305
- int nthread) {
2730
+ const llama_model_quantize_params *params) {
2306
2731
  try {
2307
- llama_model_quantize_internal(fname_inp, fname_out, ftype, nthread);
2732
+ llama_model_quantize_internal(fname_inp, fname_out, params);
2308
2733
  return 0;
2309
- } catch (const std::string & err) {
2310
- fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str());
2734
+ } catch (const std::exception & err) {
2735
+ fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.what());
2311
2736
  return 1;
2312
2737
  }
2313
2738
  }
@@ -2560,8 +2985,8 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
2560
2985
  int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
2561
2986
  try {
2562
2987
  return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads);
2563
- } catch (const std::string & err) {
2564
- fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.c_str());
2988
+ } catch (const std::exception & err) {
2989
+ fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
2565
2990
  return 1;
2566
2991
  }
2567
2992
  }
@@ -2906,7 +3331,7 @@ int llama_eval(
2906
3331
  int n_tokens,
2907
3332
  int n_past,
2908
3333
  int n_threads) {
2909
- if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads)) {
3334
+ if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads, nullptr)) {
2910
3335
  fprintf(stderr, "%s: failed to eval\n", __func__);
2911
3336
  return 1;
2912
3337
  }
@@ -2921,6 +3346,20 @@ int llama_eval(
2921
3346
  return 0;
2922
3347
  }
2923
3348
 
3349
+ int llama_eval_export(struct llama_context * ctx, const char * fname) {
3350
+ const int n_batch = 1;
3351
+ const int n_ctx = 512 - n_batch;
3352
+
3353
+ const std::vector<llama_token> tmp(n_batch, llama_token_bos());
3354
+
3355
+ if (!llama_eval_internal(*ctx, tmp.data(), tmp.size(), n_ctx, 1, fname)) {
3356
+ fprintf(stderr, "%s: failed to eval\n", __func__);
3357
+ return 1;
3358
+ }
3359
+
3360
+ return 0;
3361
+ }
3362
+
2924
3363
  int llama_tokenize(
2925
3364
  struct llama_context * ctx,
2926
3365
  const char * text,
@@ -2953,6 +3392,19 @@ int llama_n_embd(const struct llama_context * ctx) {
2953
3392
  return ctx->model.hparams.n_embd;
2954
3393
  }
2955
3394
 
3395
+ int llama_get_vocab(
3396
+ const struct llama_context * ctx,
3397
+ const char * * strings,
3398
+ float * scores,
3399
+ int capacity) {
3400
+ int n = std::min(capacity, (int) ctx->vocab.id_to_token.size());
3401
+ for (int i = 0; i<n; ++i) {
3402
+ strings[i] = ctx->vocab.id_to_token[i].tok.c_str();
3403
+ scores[i] = ctx->vocab.id_to_token[i].score;
3404
+ }
3405
+ return n;
3406
+ }
3407
+
2956
3408
  float * llama_get_logits(struct llama_context * ctx) {
2957
3409
  return ctx->logits.data();
2958
3410
  }
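
The llama_get_vocab() helper added above copies up to capacity token strings and scores into caller-provided arrays and returns the number of entries written. A minimal usage sketch (the fixed buffer size is illustrative; a real caller would size the arrays from the vocabulary size reported by the context):

  std::vector<const char *> strings(32000);
  std::vector<float>        scores(32000);
  const int n = llama_get_vocab(ctx, strings.data(), scores.data(), (int) strings.size());
  for (int i = 0; i < n && i < 10; ++i) {
      printf("%5d  %-16s  score = %.3f\n", i, strings[i], scores[i]);
  }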