llama_cpp 0.1.3 → 0.2.0

@@ -16,6 +16,10 @@
 #include "ggml-opencl.h"
 #endif
 
+#ifdef GGML_USE_METAL
+#include "ggml-metal.h"
+#endif
+
 #include <array>
 #include <ctime>
 #include <cinttypes>
@@ -42,22 +46,29 @@
 // available llama models
 enum e_model {
     MODEL_UNKNOWN,
+    MODEL_3B,
     MODEL_7B,
     MODEL_13B,
     MODEL_30B,
     MODEL_65B,
 };
 
-
 static const size_t MB = 1024*1024;
 
 // computed for n_ctx == 2048
 // TODO: dynamically determine these sizes
 // needs modifications in ggml
 
+typedef void (*offload_func_t)(struct ggml_tensor * tensor);
+
+void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
+    (void) tensor;
+}
+
 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
 {
     static std::map<e_model, size_t> k_sizes = {
+        { MODEL_3B, 256ull * MB },
         { MODEL_7B, 512ull * MB },
         { MODEL_13B, 512ull * MB },
         { MODEL_30B, 512ull * MB },
@@ -69,6 +80,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
 {
     static std::map<e_model, size_t> k_sizes = {
+        { MODEL_3B, 256ull * MB },
         { MODEL_7B, 512ull * MB },
         { MODEL_13B, 512ull * MB },
         { MODEL_30B, 512ull * MB },
@@ -81,6 +93,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
 static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
 {
     static std::map<e_model, size_t> k_sizes = {
+        { MODEL_3B, 682ull * MB },
         { MODEL_7B, 1026ull * MB },
         { MODEL_13B, 1608ull * MB },
         { MODEL_30B, 3124ull * MB },
@@ -94,6 +107,7 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
 static const std::map<e_model, size_t> & MEM_REQ_EVAL()
 {
     static std::map<e_model, size_t> k_sizes = {
+        { MODEL_3B, 512ull * MB },
         { MODEL_7B, 768ull * MB },
         { MODEL_13B, 1024ull * MB },
         { MODEL_30B, 1280ull * MB },
@@ -165,6 +179,7 @@ struct llama_model {
     struct ggml_tensor * output;
 
     std::vector<llama_layer> layers;
+    int n_gpu_layers;
 
     // context
     struct ggml_context * ctx = NULL;
@@ -190,6 +205,16 @@ struct llama_model {
         if (ctx) {
             ggml_free(ctx);
         }
+
+#ifdef GGML_USE_CUBLAS
+        for (size_t i = 0; i < tensors_by_name.size(); ++i) {
+            ggml_cuda_free_data(tensors_by_name[i].second);
+        }
+#elif defined(GGML_USE_CLBLAST)
+        for (size_t i = 0; i < tensors_by_name.size(); ++i) {
+            ggml_cl_free_data(tensors_by_name[i].second);
+        }
+#endif
     }
 };
 
@@ -238,6 +263,10 @@ struct llama_context {
     llama_ctx_buffer buf_compute;
     llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
 
+#ifdef GGML_USE_METAL
+    ggml_metal_context * ctx_metal = NULL;
+#endif
+
     int buf_last = 0;
     size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
 
@@ -277,15 +306,15 @@ template <typename T>
 static T checked_mul(T a, T b) {
     T ret = a * b;
     if (a != 0 && ret / a != b) {
-        throw format("overflow multiplying %llu * %llu",
-                     (unsigned long long) a, (unsigned long long) b);
+        throw std::runtime_error(format("overflow multiplying %llu * %llu",
+                     (unsigned long long) a, (unsigned long long) b));
     }
     return ret;
 }
 
 static size_t checked_div(size_t a, size_t b) {
     if (b == 0 || a % b != 0) {
-        throw format("error dividing %zu / %zu", a, b);
+        throw std::runtime_error(format("error dividing %zu / %zu", a, b));
     }
     return a / b;
 }
@@ -349,7 +378,7 @@ struct llama_load_tensor {
         const auto & first_shard = shards.at(0);
         for (const auto & shard : shards) {
             if (shard.type != first_shard.type) {
-                throw format("inconsistent tensor shard type in '%s'", name.c_str());
+                throw std::runtime_error(format("inconsistent tensor shard type in '%s'", name.c_str()));
             }
         }
         type = first_shard.type;
@@ -372,8 +401,8 @@ struct llama_load_tensor {
         const auto & first_shard = shards.at(0);
         for (const auto & shard : shards) {
             if (shard.ne != first_shard.ne) {
-                throw format("inconsistent tensor shard shape in '%s': first was %s, other was %s",
-                             name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str());
+                throw std::runtime_error(format("inconsistent tensor shard shape in '%s': first was %s, other was %s",
+                             name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str()));
             }
         }
         ne = first_shard.ne;
@@ -451,8 +480,8 @@ struct llama_file_loader {
             }
         }
 
-        throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
-                     magic, version);
+        throw std::runtime_error(format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
+                     magic, version));
     }
     void read_hparams() {
         hparams.n_vocab = file.read_u32();
@@ -492,7 +521,7 @@ struct llama_file_loader {
             file.read_raw(shard.ne.data(), sizeof(shard.ne[0]) * n_dims);
             std::string name = file.read_string(name_len);
             if (n_dims < 1 || n_dims > 2) {
-                throw format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims);
+                throw std::runtime_error(format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims));
             }
             switch (shard.type) {
                 case GGML_TYPE_F32:
@@ -502,9 +531,14 @@ struct llama_file_loader {
                 case GGML_TYPE_Q5_0:
                 case GGML_TYPE_Q5_1:
                 case GGML_TYPE_Q8_0:
+                case GGML_TYPE_Q2_K:
+                case GGML_TYPE_Q3_K:
+                case GGML_TYPE_Q4_K:
+                case GGML_TYPE_Q5_K:
+                case GGML_TYPE_Q6_K:
                     break;
                 default: {
-                    throw format("unrecognized tensor type %u\n", shard.type);
+                    throw std::runtime_error(format("unrecognized tensor type %u\n", shard.type));
                 }
             }
 
@@ -577,6 +611,11 @@ struct llama_file_saver {
             case GGML_TYPE_Q5_0:
             case GGML_TYPE_Q5_1:
             case GGML_TYPE_Q8_0:
+            case GGML_TYPE_Q2_K:
+            case GGML_TYPE_Q3_K:
+            case GGML_TYPE_Q4_K:
+            case GGML_TYPE_Q5_K:
+            case GGML_TYPE_Q6_K:
                 break;
             default: LLAMA_ASSERT(false);
         }
@@ -608,7 +647,7 @@ struct llama_model_loader {
             auto * ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
             file_loaders.emplace_back(ith_file);
             if (ith_file->hparams != first_file->hparams) {
-                throw format("llama.cpp: hparams inconsistent between files");
+                throw std::runtime_error(format("llama.cpp: hparams inconsistent between files"));
             }
         }
         if (!llama_mmap::SUPPORTED) {
@@ -638,7 +677,7 @@ struct llama_model_loader {
     uint32_t guess_n_parts() const {
         auto it = tensors_map.name_to_idx.find("tok_embeddings.weight");
         if (it == tensors_map.name_to_idx.end()) {
-            throw std::string("missing tok_embeddings.weight");
+            throw std::runtime_error(std::string("missing tok_embeddings.weight"));
         }
         const llama_load_tensor & lt = tensors_map.tensors.at(it->second);
         return file_loaders.at(0)->hparams.n_embd / lt.shards.at(0).ne.at(0);
@@ -655,12 +694,12 @@ struct llama_model_loader {
     struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
         auto it = tensors_map.name_to_idx.find(name);
         if (it == tensors_map.name_to_idx.end()) {
-            throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
+            throw std::runtime_error(std::runtime_error(format("llama.cpp: tensor '%s' is missing from model", name.c_str())));
         }
         llama_load_tensor & lt = tensors_map.tensors.at(it->second);
         if (lt.ne != ne) {
-            throw format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
-                         name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
+            throw std::runtime_error(format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
+                         name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str()));
         }
 
         return get_tensor_for(lt, backend);
@@ -676,6 +715,7 @@ struct llama_model_loader {
         }
         ggml_set_name(tensor, lt.name.c_str());
         LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
+
         tensor->backend = backend;
         lt.ggml_tensor = tensor;
         num_ggml_tensors_created++;
@@ -684,7 +724,7 @@ struct llama_model_loader {
 
     void done_getting_tensors() const {
         if (num_ggml_tensors_created != tensors_map.tensors.size()) {
-            throw std::string("llama.cpp: file contained more tensors than expected");
+            throw std::runtime_error(std::string("llama.cpp: file contained more tensors than expected"));
         }
     }
 
@@ -828,7 +868,10 @@ static bool kv_cache_init(
 struct llama_context_params llama_context_default_params() {
     struct llama_context_params result = {
         /*.n_ctx =*/ 512,
+        /*.n_batch =*/ 512,
         /*.gpu_layers =*/ 0,
+        /*.main_gpu =*/ 0,
+        /*.tensor_split =*/ {0},
         /*.seed =*/ -1,
         /*.f16_kv =*/ true,
         /*.logits_all =*/ false,
@@ -843,6 +886,17 @@ struct llama_context_params llama_context_default_params() {
     return result;
 }
 
+struct llama_model_quantize_params llama_model_quantize_default_params() {
+    struct llama_model_quantize_params result = {
+        /*.nthread =*/ 0,
+        /*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
+        /*.allow_requantize =*/ false,
+        /*.quantize_output_tensor =*/ true,
+    };
+
+    return result;
+}
+
 bool llama_mmap_supported() {
     return llama_mmap::SUPPORTED;
 }
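Note: the hunk above introduces llama_model_quantize_default_params(), which pairs with the reworked llama_model_quantize() further down in this diff. A minimal usage sketch against the llama.h bundled with 0.2.0 (the file paths are placeholders, not part of this diff):

#include "llama.h"

int main() {
    // start from the library defaults: nthread = 0 (auto), ftype = Q5_1,
    // allow_requantize = false, quantize_output_tensor = true
    llama_model_quantize_params qparams = llama_model_quantize_default_params();
    qparams.ftype   = LLAMA_FTYPE_MOSTLY_Q4_K_M; // one of the new k-quant file types
    qparams.nthread = 4;                         // 0 falls back to hardware_concurrency()

    // returns 0 on success, non-zero on failure (the error is printed to stderr)
    return llama_model_quantize("ggml-model-f16.bin", "ggml-model-q4_k_m.bin", &qparams);
}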
@@ -893,12 +947,23 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
         case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
         case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
+        // K-quants
+        case LLAMA_FTYPE_MOSTLY_Q2_K: return "mostly Q2_K";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "mostly Q3_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "mostly Q3_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "mostly Q3_K - Large";
+        case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "mostly Q4_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "mostly Q4_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "mostly Q5_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "mostly Q5_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q6_K: return "mostly Q6_K";
         default: return "unknown, may not work";
     }
 }
 
 static const char *llama_model_type_name(e_model type) {
     switch (type) {
+        case MODEL_3B: return "3B";
         case MODEL_7B: return "7B";
         case MODEL_13B: return "13B";
         case MODEL_30B: return "30B";
@@ -911,7 +976,10 @@ static void llama_model_load_internal(
         const std::string & fname,
         llama_context & lctx,
         int n_ctx,
+        int n_batch,
         int n_gpu_layers,
+        int main_gpu,
+        const float * tensor_split,
         ggml_type memory_type,
         bool use_mmap,
         bool use_mlock,
@@ -926,12 +994,13 @@ static void llama_model_load_internal(
     lctx.vocab = std::move(ml->file_loaders.at(0)->vocab);
     auto & model = lctx.model;
     model.hparams = ml->file_loaders.at(0)->hparams;
+    model.n_gpu_layers = n_gpu_layers;
     llama_file_version file_version = ml->file_loaders.at(0)->file_version;
     auto & hparams = model.hparams;
-    uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
 
     {
         switch (hparams.n_layer) {
+            case 26: model.type = e_model::MODEL_3B; break;
             case 32: model.type = e_model::MODEL_7B; break;
             case 40: model.type = e_model::MODEL_13B; break;
             case 60: model.type = e_model::MODEL_30B; break;
@@ -941,6 +1010,8 @@ static void llama_model_load_internal(
         hparams.n_ctx = n_ctx;
     }
 
+    const uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
+
     {
         fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version));
         fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab);
@@ -960,7 +1031,7 @@ static void llama_model_load_internal(
         if (hparams.ftype != LLAMA_FTYPE_ALL_F32 &&
             hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 &&
             hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) {
-            throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)");
+            throw std::runtime_error(format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)"));
         }
     }
 
@@ -968,7 +1039,7 @@ static void llama_model_load_internal(
         if (hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
             hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_1 ||
             hparams.ftype == LLAMA_FTYPE_MOSTLY_Q8_0) {
-            throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)");
+            throw std::runtime_error(format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)"));
         }
     }
 
@@ -999,18 +1070,28 @@ static void llama_model_load_internal(
 
         model.ctx = ggml_init(params);
         if (!model.ctx) {
-            throw format("ggml_init() failed");
+            throw std::runtime_error(format("ggml_init() failed"));
         }
     }
 
-#ifdef GGML_USE_CUBLAS
-#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CUDA
+    (void) main_gpu;
+#if defined(GGML_USE_CUBLAS)
+    fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__);
+    ggml_cuda_set_main_device(main_gpu);
+#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
+#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
+#elif defined(GGML_USE_CLBLAST)
+    fprintf(stderr, "%s: using OpenCL for GPU acceleration\n", __func__);
+#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
+#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU
 #else
-#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
+#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
+#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_CPU
 #endif
 
     // prepare memory for the weights
-    size_t vram_total = 0;
+    size_t vram_weights = 0;
+    size_t vram_scratch = 0;
     {
         const uint32_t n_embd = hparams.n_embd;
         const uint32_t n_layer = hparams.n_layer;
@@ -1025,7 +1106,7 @@ static void llama_model_load_internal(
         {
             ggml_backend backend_output;
             if (n_gpu_layers > int(n_layer)) { // NOLINT
-                backend_output = LLAMA_BACKEND_OFFLOAD;
+                backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
             } else {
                 backend_output = GGML_BACKEND_CPU;
             }
@@ -1037,7 +1118,8 @@ static void llama_model_load_internal(
 
         model.layers.resize(n_layer);
         for (uint32_t i = 0; i < n_layer; ++i) {
-            const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+            const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+            const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
 
             auto & layer = model.layers[i];
 
@@ -1045,19 +1127,19 @@ static void llama_model_load_internal(
 
             layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);
 
-            layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend);
-            layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend);
-            layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend);
-            layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend);
+            layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend_split);
+            layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend_split);
+            layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend_split);
+            layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend_split);
 
             layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);
 
-            layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend);
-            layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend);
-            layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend);
+            layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend_split);
+            layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend_split);
+            layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend_split);
 
-            if (backend == GGML_BACKEND_CUDA) {
-                vram_total +=
+            if (backend == GGML_BACKEND_GPU) {
+                vram_weights +=
                     ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
                     ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.attention_norm) +
                     ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
@@ -1074,10 +1156,10 @@ static void llama_model_load_internal(
         // this is the total memory required to run the inference
         const size_t mem_required =
             ctx_size +
-            mmapped_size - vram_total + // weights in VRAM not in memory
+            mmapped_size - vram_weights + // weights in VRAM not in memory
             MEM_REQ_SCRATCH0().at(model.type) +
             MEM_REQ_SCRATCH1().at(model.type) +
-            MEM_REQ_EVAL().at(model.type);
+            MEM_REQ_EVAL().at (model.type);
 
         // this is the memory required by one llama_state
         const size_t mem_required_state =
@@ -1086,15 +1168,25 @@ static void llama_model_load_internal(
         fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
                 mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
 
+        (void) vram_scratch;
 #ifdef GGML_USE_CUBLAS
+        vram_scratch = n_batch * MB;
+        ggml_cuda_set_scratch_size(vram_scratch);
+        if (n_gpu_layers > 0) {
+            fprintf(stderr, "%s: allocating batch_size x 1 MB = %ld MB VRAM for the scratch buffer\n",
+                    __func__, vram_scratch / MB);
+        }
+#endif // GGML_USE_CUBLAS
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
         const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
 
-        fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
+        fprintf(stderr, "%s: offloading %d layers to GPU\n", __func__, n_gpu);
         if (n_gpu_layers > (int) hparams.n_layer) {
-            fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
+            fprintf(stderr, "%s: offloading output layer to GPU\n", __func__);
         }
-        fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
-#elif !defined(GGML_USE_CLBLAST)
+        fprintf(stderr, "%s: total VRAM used: %zu MB\n",
+                __func__, (vram_weights + vram_scratch + MB - 1) / MB); // round up
+#else
         (void) n_gpu_layers;
 #endif
     }
@@ -1106,8 +1198,10 @@ static void llama_model_load_internal(
 
     ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
 
-#ifdef GGML_USE_CUBLAS
+#if defined(GGML_USE_CUBLAS)
     {
+        ggml_cuda_set_tensor_split(tensor_split);
+
         size_t done_size = 0;
         size_t data_size = 0;
         for (llama_load_tensor & lt : ml->tensors_map.tensors) {
1117
1211
  }
1118
1212
  }
1119
1213
  for (llama_load_tensor & lt : ml->tensors_map.tensors) {
1120
- if (lt.ggml_tensor->backend != GGML_BACKEND_CUDA) {
1214
+ ggml_backend backend = lt.ggml_tensor->backend;
1215
+ if (backend != GGML_BACKEND_GPU && backend != GGML_BACKEND_GPU_SPLIT) {
1121
1216
  continue;
1122
1217
  }
1123
1218
  if (progress_callback) {
@@ -1129,30 +1224,28 @@ static void llama_model_load_internal(
1129
1224
  }
1130
1225
  #elif defined(GGML_USE_CLBLAST)
1131
1226
  {
1132
- const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
1133
-
1134
- fprintf(stderr, "ggml_opencl: offloading %d layers to GPU\n", n_gpu);
1135
-
1136
- size_t vram_total = 0;
1137
-
1138
- for (int i = 0; i < n_gpu; ++i) {
1139
- const auto & layer = model.layers[i];
1140
-
1141
- ggml_cl_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
1142
- ggml_cl_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
1143
- ggml_cl_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
1144
- ggml_cl_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
1145
- ggml_cl_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
1146
- ggml_cl_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
1147
- ggml_cl_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
1227
+ size_t done_size = 0;
1228
+ size_t data_size = 0;
1229
+ for (llama_load_tensor & lt : ml->tensors_map.tensors) {
1230
+ data_size += lt.size;
1231
+ if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
1232
+ done_size += lt.size;
1233
+ }
1148
1234
  }
1149
- if (n_gpu_layers > (int) hparams.n_layer) {
1150
- fprintf(stderr, "ggml_opencl: offloading output layer to GPU\n");
1151
- ggml_cl_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
1235
+ for (llama_load_tensor & lt : ml->tensors_map.tensors) {
1236
+ if (lt.ggml_tensor->backend != GGML_BACKEND_GPU) {
1237
+ continue;
1238
+ }
1239
+ if (progress_callback) {
1240
+ progress_callback((float) done_size / data_size, progress_callback_user_data);
1241
+ }
1242
+ ggml_cl_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
1243
+ done_size += lt.size;
1152
1244
  }
1153
-
1154
- fprintf(stderr, "ggml_opencl: total VRAM used: %zu MB\n", vram_total / 1024 / 1024);
1155
1245
  }
1246
+ #else
1247
+ (void) n_batch;
1248
+ (void) tensor_split;
1156
1249
  #endif
1157
1250
 
1158
1251
  if (progress_callback) {
@@ -1170,7 +1263,10 @@ static bool llama_model_load(
         const std::string & fname,
         llama_context & lctx,
         int n_ctx,
+        int n_batch,
         int n_gpu_layers,
+        int main_gpu,
+        float * tensor_split,
         ggml_type memory_type,
         bool use_mmap,
         bool use_mlock,
@@ -1178,28 +1274,30 @@ static bool llama_model_load(
         llama_progress_callback progress_callback,
         void *progress_callback_user_data) {
     try {
-        llama_model_load_internal(fname, lctx, n_ctx, n_gpu_layers, memory_type, use_mmap, use_mlock,
-                                  vocab_only, progress_callback, progress_callback_user_data);
+        llama_model_load_internal(fname, lctx, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, memory_type,
+                                  use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
         return true;
-    } catch (const std::string & err) {
-        fprintf(stderr, "error loading model: %s\n", err.c_str());
+    } catch (const std::exception & err) {
+        fprintf(stderr, "error loading model: %s\n", err.what());
        return false;
     }
 }
 
 // evaluate the transformer
 //
-// - lctx: llama context
-// - tokens: new batch of tokens to process
-// - n_past: the context size so far
-// - n_threads: number of threads to use
+// - lctx: llama context
+// - tokens: new batch of tokens to process
+// - n_past: the context size so far
+// - n_threads: number of threads to use
+// - cgraph_fname: filename of the exported computation graph
 //
 static bool llama_eval_internal(
-        llama_context & lctx,
-        const llama_token * tokens,
-        const int n_tokens,
-        const int n_past,
-        const int n_threads) {
+        llama_context & lctx,
+        const llama_token * tokens,
+        const int n_tokens,
+        const int n_past,
+        const int n_threads,
+        const char * cgraph_fname) {
 
     // enforce that the first token is BOS
     if (n_past == 0 && tokens[0] != llama_token_bos()) {
@@ -1218,12 +1316,13 @@ static bool llama_eval_internal(
 
     LLAMA_ASSERT(!!kv_self.ctx);
 
-    const int n_embd = hparams.n_embd;
-    const int n_layer = hparams.n_layer;
-    const int n_ctx = hparams.n_ctx;
-    const int n_head = hparams.n_head;
-    const int n_vocab = hparams.n_vocab;
-    const int n_rot = hparams.n_embd/hparams.n_head;
+    const int n_embd = hparams.n_embd;
+    const int n_layer = hparams.n_layer;
+    const int n_ctx = hparams.n_ctx;
+    const int n_head = hparams.n_head;
+    const int n_vocab = hparams.n_vocab;
+    const int n_rot = hparams.n_embd/hparams.n_head;
+    const int n_gpu_layers = model.n_gpu_layers;
 
     auto & mem_per_token = lctx.mem_per_token;
     auto & buf_compute = lctx.buf_compute;
@@ -1245,40 +1344,66 @@ static bool llama_eval_internal(
     ggml_set_name(embd, "embd");
     memcpy(embd->data, tokens, N*ggml_element_size(embd));
 
+    struct ggml_tensor * cur;
     struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
 
+    const int i_gpu_start = n_layer - n_gpu_layers;
+    (void) i_gpu_start;
+
     for (int il = 0; il < n_layer; ++il) {
-        struct ggml_tensor * inpSA = inpL;
+        offload_func_t offload_func = llama_nop;
 
-        struct ggml_tensor * cur;
+#ifdef GGML_USE_CUBLAS
+        if (il >= i_gpu_start) {
+            offload_func = ggml_cuda_assign_buffers; // sets the output backend to GPU
+        }
+#endif // GGML_USE_CUBLAS
+
+        struct ggml_tensor * inpSA = inpL;
 
         lctx.use_buf(ctx0, 0);
 
         // norm
         {
             cur = ggml_rms_norm(ctx0, inpL);
+            offload_func(cur);
+            ggml_set_name(cur, "rms_norm_0");
 
             // cur = cur*attention_norm(broadcasted)
             cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm);
+            offload_func(cur);
+            ggml_set_name(cur, "attention_norm_0");
         }
 
         // self-attention
         {
             // compute Q and K and RoPE them
-            struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
-            struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
-            ggml_set_name(Qcur, "Qcur");
+            struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+            // offload_func(tmpq);
+            ggml_set_name(tmpq, "tmpq");
+
+            struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+            // offload_func(tmpk);
+            ggml_set_name(tmpk, "tmpk");
+
+            struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0);
             ggml_set_name(Kcur, "Kcur");
 
+            struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0);
+            ggml_set_name(Qcur, "Qcur");
+
             // store key and value to memory
             {
                 // compute the transposed [N, n_embd] V matrix
                 struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), n_embd, N));
+                ggml_set_name(Vcur, "Vcur");
 
                 struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
+                ggml_set_name(k, "k");
                 struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
                         ( n_ctx)*ggml_element_size(kv_self.v),
                         (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
+                ggml_set_name(v, "v");
 
                 // important: storing RoPE-ed version of K in the KV cache!
                 ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
@@ -1319,7 +1444,6 @@ static bool llama_eval_internal(
             struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
             ggml_set_name(KQ_soft_max, "KQ_soft_max");
 
-
             // split cached V into n_head heads
             struct ggml_tensor * V =
                 ggml_view_3d(ctx0, kv_self.v,
@@ -1354,73 +1478,143 @@ static bool llama_eval_internal(
             cur = ggml_mul_mat(ctx0,
                     model.layers[il].wo,
                     cur);
+            offload_func(cur);
+            ggml_set_name(cur, "result_wo");
         }
 
         lctx.use_buf(ctx0, 1);
+        //ggml_cuda_set_scratch(1);
 
         struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
+        offload_func(inpFF);
+        ggml_set_name(inpFF, "inpFF");
 
         // feed-forward network
         {
             // norm
             {
                 cur = ggml_rms_norm(ctx0, inpFF);
+                offload_func(cur);
+                ggml_set_name(cur, "rms_norm_1");
 
                 // cur = cur*ffn_norm(broadcasted)
                 cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
+                offload_func(cur);
+                ggml_set_name(cur, "ffn_norm");
             }
 
             struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
                     model.layers[il].w3,
                     cur);
+            offload_func(tmp);
+            ggml_set_name(tmp, "result_w3");
 
             cur = ggml_mul_mat(ctx0,
                     model.layers[il].w1,
                     cur);
+            offload_func(cur);
+            ggml_set_name(cur, "result_w2");
 
             // SILU activation
             cur = ggml_silu(ctx0, cur);
+            offload_func(cur);
+            ggml_set_name(cur, "silu");
 
             cur = ggml_mul(ctx0, cur, tmp);
+            offload_func(cur);
+            ggml_set_name(cur, "silu_x_result_w3");
 
             cur = ggml_mul_mat(ctx0,
                     model.layers[il].w2,
                     cur);
+            offload_func(cur);
+            ggml_set_name(cur, "result_w2");
         }
 
         cur = ggml_add(ctx0, cur, inpFF);
+        offload_func(cur);
+        ggml_set_name(cur, "inpFF_+_result_w2");
 
         // input for next layer
         inpL = cur;
+
     }
 
     lctx.use_buf(ctx0, 0);
+    //ggml_cuda_set_scratch(0);
 
     // used at the end to optionally extract the embeddings
     struct ggml_tensor * embeddings = NULL;
 
+    offload_func_t offload_func = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+    if (n_gpu_layers > n_layer) {
+        offload_func = ggml_cuda_assign_buffers; // sets the output backend to GPU
+    }
+#endif // GGML_USE_CUBLAS
+
     // norm
     {
+        cur = ggml_rms_norm(ctx0, inpL);
+        offload_func(cur);
+        ggml_set_name(cur, "rms_norm_inpL");
 
-        inpL = ggml_rms_norm(ctx0, inpL);
+        cur = ggml_rms_norm(ctx0, cur);
+        offload_func(cur);
+        ggml_set_name(cur, "rms_norm_after");
 
-        // inpL = inpL*norm(broadcasted)
-        inpL = ggml_mul(ctx0, inpL, model.norm);
+        // cur = cur*norm(broadcasted)
+        cur = ggml_mul(ctx0, cur, model.norm);
+        offload_func(cur);
+        ggml_set_name(cur, "result_norm");
 
-        embeddings = inpL;
+        embeddings = cur;
     }
 
+
     // lm_head
-    inpL = ggml_mul_mat(ctx0, model.output, inpL);
+    cur = ggml_mul_mat(ctx0, model.output, cur);
+    ggml_set_name(cur, "result_output");
 
     lctx.use_buf(ctx0, -1);
 
     // logits -> probs
-    //inpL = ggml_soft_max_inplace(ctx0, inpL);
+    //cur = ggml_soft_max_inplace(ctx0, cur);
 
     // run the computation
-    ggml_build_forward_expand(&gf, inpL);
-    ggml_graph_compute (ctx0, &gf);
+    ggml_build_forward_expand(&gf, cur);
+
+#ifdef GGML_USE_METAL
+    if (lctx.ctx_metal && N == 1) {
+        ggml_metal_graph_compute(lctx.ctx_metal, &gf);
+        ggml_metal_get_tensor (lctx.ctx_metal, cur);
+    } else {
+        // IMPORTANT:
+        // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla
+        // ggml_graph_compute(). It uses Apple's Accelerate CBLAS API which takes advantage of the ANE or the AMX
+        // coprocessor.
+        //
+        // When we implement Matrix x Matrix Metal multiplication, we can avoid this branch.
+        // But for now, we have focused only on Matrix x Vector Metal multiplication.
+        //
+        // TODO: avoid these syncs via shared memory (ref #1696)
+        //
+        if (lctx.ctx_metal) {
+            // We need to sync the GPU KV cache with the CPU KV cache
+            ggml_metal_get_tensor(lctx.ctx_metal, kv_self.k);
+            ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
+        }
+
+        ggml_graph_compute(ctx0, &gf);
+    }
+#else
+    ggml_graph_compute(ctx0, &gf);
+#endif
+
+    if (cgraph_fname) {
+        ggml_graph_export(&gf, cgraph_fname);
+    }
 
 #ifdef GGML_PERF
     // print timing information per ggml operation (for debugging purposes)
@@ -1434,7 +1628,7 @@ static bool llama_eval_internal(
     //}
 
     //embd_w.resize(n_vocab*N);
-    //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
+    //memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N);
 
     // update kv token count
     lctx.model.kv_self.n = n_past + N;
@@ -1445,11 +1639,11 @@ static bool llama_eval_internal(
 
         if (lctx.logits_all) {
             logits_out.resize(n_vocab * N);
-            memcpy(logits_out.data(), (float *) ggml_get_data(inpL), sizeof(float)*n_vocab*N);
+            memcpy(logits_out.data(), (float *) ggml_get_data(cur), sizeof(float)*n_vocab*N);
         } else {
             // return result for just the last token
             logits_out.resize(n_vocab);
-            memcpy(logits_out.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
+            memcpy(logits_out.data(), (float *) ggml_get_data(cur) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
         }
     }
 
@@ -2048,16 +2242,88 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
 // quantization
 //
 
-static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype, int nthread) {
+static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llama_buffer & output, const int nelements, const int nthread) {
+    if (output.size < nelements * sizeof(float)) {
+        output.resize(nelements * sizeof(float));
+    }
+    float * f32_output = (float *) output.addr;
+
+    quantize_fns_t qtype;
+    if (ggml_is_quantized(tensor.type)) {
+        qtype = ggml_internal_get_quantize_fn(tensor.type);
+        if (qtype.dequantize_row_q == NULL) {
+            throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor.type)));
+        }
+    } else if (tensor.type != GGML_TYPE_F16) {
+        throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor.type)));
+    }
+
+    if (nthread < 2) {
+        if (tensor.type == GGML_TYPE_F16) {
+            ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor.data, f32_output, nelements);
+        } else if (ggml_is_quantized(tensor.type)) {
+            qtype.dequantize_row_q(tensor.data, f32_output, nelements);
+        } else {
+            LLAMA_ASSERT(false); // unreachable
+        }
+        return;
+    }
+
+    auto block_size = tensor.type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor.type);
+    auto block_size_bytes = ggml_type_size(tensor.type);
+
+    LLAMA_ASSERT(nelements % block_size == 0);
+    auto nblocks = nelements / block_size;
+    auto blocks_per_thread = nblocks / nthread;
+    auto spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count
+
+    std::vector<std::thread> workers;
+    for (auto tnum = 0, in_buff_offs = 0, out_buff_offs = 0; tnum < nthread; tnum++) {
+        auto thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
+        auto thr_elems = thr_blocks * block_size; // number of elements for this thread
+        auto thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread
+
+        auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
+            if (typ == GGML_TYPE_F16) {
+                ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
+            } else {
+                qtype.dequantize_row_q(inbuf, outbuf, nels);
+            }
+        };
+        workers.push_back(std::thread(compute, tensor.type, tensor.data + in_buff_offs, f32_output + out_buff_offs, thr_elems));
+        in_buff_offs += thr_block_bytes;
+        out_buff_offs += thr_elems;
+    }
+    for (auto & worker : workers) {
+        worker.join();
+    }
+
+}
+
+static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
     ggml_type quantized_type;
-    switch (ftype) {
+    llama_ftype ftype = params->ftype;
+    int nthread = params->nthread;
+
+    switch (params->ftype) {
         case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
         case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
         case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
         case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
         case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
-        default: throw format("invalid output file type %d\n", ftype);
-    };
+
+        // K-quants
+        case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
+        case LLAMA_FTYPE_MOSTLY_Q3_K_S:
+        case LLAMA_FTYPE_MOSTLY_Q3_K_M:
+        case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
+        case LLAMA_FTYPE_MOSTLY_Q4_K_S:
+        case LLAMA_FTYPE_MOSTLY_Q4_K_M: quantized_type = GGML_TYPE_Q4_K; break;
+        case LLAMA_FTYPE_MOSTLY_Q5_K_S:
+        case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
+        case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
+        default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
+    }
 
     if (nthread <= 0) {
         nthread = std::thread::hardware_concurrency();
@@ -2065,7 +2331,21 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
     std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false,
                                                      /*vocab_only*/ false));
-    llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
+    llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), params->ftype);
+
+    int n_attention_wv = 0;
+    int n_feed_forward_w2 = 0;
+    for (auto& tensor : model_loader->tensors_map.tensors) {
+        if (tensor.name.find("attention.wv.weight") != std::string::npos) {
+            ++n_attention_wv;
+        }
+        else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
+            ++n_feed_forward_w2;
+        }
+    }
+
+    int i_attention_wv = 0;
+    int i_feed_forward_w2 = 0;
 
     size_t total_size_org = 0;
     size_t total_size_new = 0;
@@ -2093,9 +2373,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         quantize &= (tensor.ne.size() == 2);
 
         // uncomment this to keep the output layer in FP16
-        //if (tensor.name == "output.weight") {
-        // quantize = false;
-        //}
+        if (!params->quantize_output_tensor && tensor.name == "output.weight") {
+            quantize = false;
+        }
+        quantize = quantize && quantized_type != tensor.type;
 
         enum ggml_type new_type;
         void * new_data;
@@ -2109,20 +2390,43 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             printf("size = %8.3f MB\n", tensor.size/1024.0/1024.0);
         } else {
             new_type = quantized_type;
+            // TODO: temporary disabled until Metal / OpenCL support is available
+            // ref: https://github.com/ggerganov/llama.cpp/issues/1711
+            //if (tensor.name == "output.weight") {
+            // new_type = GGML_TYPE_Q6_K;
+            //}
+            if (tensor.name.find("attention.wv.weight") != std::string::npos) {
+                if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+                else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
+                        (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8 ||
+                        (i_attention_wv - n_attention_wv/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
+                ++i_attention_wv;
+            }
+            if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
+                if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+                else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
+                        (i_feed_forward_w2 < n_feed_forward_w2/8 || i_feed_forward_w2 >= 7*n_feed_forward_w2/8 ||
+                        (i_feed_forward_w2 - n_feed_forward_w2/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
+                ++i_feed_forward_w2;
+            }
+            if (tensor.name.find("attention.wo.weight") != std::string::npos) {
+                if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+            }
+
             float * f32_data;
             size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
             llama_buffer f32_conv_buf;
+
             if (tensor.type == GGML_TYPE_F32) {
                 f32_data = (float *) tensor.data;
-            } else if (tensor.type == GGML_TYPE_F16) {
-                f32_conv_buf.resize(nelements * sizeof(float));
-                f32_data = (float *) f32_conv_buf.addr;
-                const auto * f16_data = (const ggml_fp16_t *) tensor.data;
-                for (size_t i = 0; i < nelements; i++) {
-                    f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
-                }
+            } else if (ggml_is_quantized(tensor.type) && !params->allow_requantize) {
+                throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor.type)));
             } else {
-                throw format("type %s unsupported for integer quantization", ggml_type_name(tensor.type));
+                llama_convert_tensor_internal(tensor, f32_conv_buf, nelements, nthread);
+                f32_data = (float *) f32_conv_buf.addr;
             }
 
             printf("quantizing .. ");
@@ -2176,12 +2480,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             }
 
             printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
+            int64_t tot_count = 0;
             for (size_t i = 0; i < hist_cur.size(); i++) {
                 hist_all[i] += hist_cur[i];
+                tot_count += hist_cur[i];
             }
 
-            for (size_t i = 0; i < hist_cur.size(); i++) {
-                printf("%5.3f ", hist_cur[i] / float(nelements));
+            if (tot_count > 0) {
+                for (size_t i = 0; i < hist_cur.size(); i++) {
+                    printf("%5.3f ", hist_cur[i] / float(nelements));
+                }
             }
             printf("\n");
         }
@@ -2199,11 +2507,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             sum_all += hist_all[i];
         }
 
-        printf("%s: hist: ", __func__);
-        for (size_t i = 0; i < hist_all.size(); i++) {
-            printf("%5.3f ", hist_all[i] / float(sum_all));
+        if (sum_all > 0) {
+            printf("%s: hist: ", __func__);
+            for (size_t i = 0; i < hist_all.size(); i++) {
+                printf("%5.3f ", hist_all[i] / float(sum_all));
+            }
+            printf("\n");
         }
-        printf("\n");
     }
 }
 
@@ -2244,9 +2554,9 @@ struct llama_context * llama_init_from_file(
 
     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
 
-    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_gpu_layers, memory_type,
-                          params.use_mmap, params.use_mlock, params.vocab_only,
-                          params.progress_callback, params.progress_callback_user_data)) {
+    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers,
+                          params.main_gpu, params.tensor_split, memory_type, params.use_mmap, params.use_mlock,
+                          params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
         fprintf(stderr, "%s: failed to load model\n", __func__);
        llama_free(ctx);
        return nullptr;
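The llama_model_load() call above now threads the new n_batch, main_gpu and tensor_split values through from llama_context_params. A hedged sketch of how a caller might fill those fields (field names as in the 0.2.0 llama.h; the model path and layer count are illustrative, not part of this diff):

#include "llama.h"

int main() {
    llama_context_params params = llama_context_default_params();
    params.n_ctx        = 2048;
    params.n_batch      = 512; // new field: prompt-processing batch size
    params.n_gpu_layers = 32;  // offload this many layers when built with CUDA/OpenCL/Metal
    params.main_gpu     = 0;   // new field: device used for scratch and small tensors
    // params.tensor_split is left zeroed for a single-GPU setup

    llama_context * ctx = llama_init_from_file("ggml-model-q4_K_M.bin", params);
    if (ctx == NULL) {
        return 1;
    }
    // ... tokenize and call llama_eval() here ...
    llama_free(ctx);
    return 0;
}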
@@ -2284,6 +2594,38 @@ struct llama_context * llama_init_from_file(
         ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
     }
 
+#ifdef GGML_USE_METAL
+    if (params.n_gpu_layers > 0) {
+        // this allocates all Metal resources and memory buffers
+        ctx->ctx_metal = ggml_metal_init();
+
+        void *data_ptr = NULL;
+        size_t data_size = 0;
+        if (params.use_mmap) {
+            data_ptr = ctx->model.mapping->addr;
+            data_size= ctx->model.mapping->size;
+        } else {
+            data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
+            data_size= ggml_get_mem_size(ctx->model.ctx);
+        }
+
+#define LLAMA_METAL_CHECK_BUF(result) \
+        if (!(result)) { \
+            fprintf(stderr, "%s: failed to add buffer\n", __func__); \
+            llama_free(ctx); \
+            return NULL; \
+        }
+
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size));
+
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size));
+#undef LLAMA_METAL_CHECK_BUF
+    }
+#endif
+
     return ctx;
 }
 
@@ -2294,13 +2636,12 @@ void llama_free(struct llama_context * ctx) {
 int llama_model_quantize(
         const char * fname_inp,
         const char * fname_out,
-        enum llama_ftype ftype,
-        int nthread) {
+        const llama_model_quantize_params *params) {
     try {
-        llama_model_quantize_internal(fname_inp, fname_out, ftype, nthread);
+        llama_model_quantize_internal(fname_inp, fname_out, params);
         return 0;
-    } catch (const std::string & err) {
-        fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str());
+    } catch (const std::exception & err) {
+        fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.what());
         return 1;
     }
 }
@@ -2553,8 +2894,8 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
 int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
     try {
         return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads);
-    } catch (const std::string & err) {
-        fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.c_str());
+    } catch (const std::exception & err) {
+        fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
         return 1;
     }
 }
@@ -2899,7 +3240,7 @@ int llama_eval(
         int n_tokens,
         int n_past,
         int n_threads) {
-    if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads)) {
+    if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads, nullptr)) {
         fprintf(stderr, "%s: failed to eval\n", __func__);
         return 1;
     }
@@ -2914,6 +3255,20 @@ int llama_eval(
     return 0;
 }
 
+int llama_eval_export(struct llama_context * ctx, const char * fname) {
+    const int n_batch = 1;
+    const int n_ctx = 512 - n_batch;
+
+    const std::vector<llama_token> tmp(n_batch, llama_token_bos());
+
+    if (!llama_eval_internal(*ctx, tmp.data(), tmp.size(), n_ctx, 1, fname)) {
+        fprintf(stderr, "%s: failed to eval\n", __func__);
+        return 1;
+    }
+
+    return 0;
+}
+
 int llama_tokenize(
         struct llama_context * ctx,
         const char * text,
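The new llama_eval_export() added in this final hunk runs a single-token eval and writes the resulting computation graph via ggml_graph_export(). A small, hedged example of calling it (the helper name and output filename are placeholders):

#include "llama.h"

// assumes `ctx` was created with llama_init_from_file(); returns 0 on success
int export_eval_graph(struct llama_context * ctx) {
    return llama_eval_export(ctx, "llama.ggml");
}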