llama_cpp 0.1.4 → 0.2.0

@@ -16,6 +16,10 @@
16
16
  #include "ggml-opencl.h"
17
17
  #endif
18
18
 
19
+ #ifdef GGML_USE_METAL
20
+ #include "ggml-metal.h"
21
+ #endif
22
+
19
23
  #include <array>
20
24
  #include <ctime>
21
25
  #include <cinttypes>
@@ -49,17 +53,22 @@ enum e_model {
49
53
  MODEL_65B,
50
54
  };
51
55
 
52
-
53
56
  static const size_t MB = 1024*1024;
54
57
 
55
58
  // computed for n_ctx == 2048
56
59
  // TODO: dynamically determine these sizes
57
60
  // needs modifications in ggml
58
61
 
62
+ typedef void (*offload_func_t)(struct ggml_tensor * tensor);
63
+
64
+ void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
65
+ (void) tensor;
66
+ }
67
+
59
68
  static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
60
69
  {
61
70
  static std::map<e_model, size_t> k_sizes = {
62
- { MODEL_3B, 128ull * MB },
71
+ { MODEL_3B, 256ull * MB },
63
72
  { MODEL_7B, 512ull * MB },
64
73
  { MODEL_13B, 512ull * MB },
65
74
  { MODEL_30B, 512ull * MB },
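
The hunk above introduces offload_func_t and the llama_nop default. Later in this diff each graph node is passed through such a callback, which is either a no-op (CPU layers) or ggml_cuda_assign_buffers (offloaded layers). A minimal sketch of that dispatch, assuming the ggml / ggml-cuda headers bundled with this release; pick_offload_func is an illustrative helper, not part of the library:

    #include "ggml.h"
    #ifdef GGML_USE_CUBLAS
    #include "ggml-cuda.h"
    #endif

    typedef void (*offload_func_t)(struct ggml_tensor * tensor);

    static void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
        (void) tensor;
    }

    // Hypothetical helper mirroring the per-layer choice made in llama_eval_internal:
    // layers with index below i_gpu_start stay on the CPU, the rest are offloaded.
    static offload_func_t pick_offload_func(int il, int i_gpu_start) {
    #ifdef GGML_USE_CUBLAS
        if (il >= i_gpu_start) {
            return ggml_cuda_assign_buffers; // marks the node's output as GPU resident
        }
    #else
        (void) il; (void) i_gpu_start;
    #endif
        return llama_nop;
    }

    int main() {
        offload_func_t f = pick_offload_func(/*il=*/10, /*i_gpu_start=*/20);
        return f == llama_nop ? 0 : 1;   // layer 10 stays on the CPU in this example
    }
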
@@ -71,7 +80,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
71
80
  static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
72
81
  {
73
82
  static std::map<e_model, size_t> k_sizes = {
74
- { MODEL_3B, 128ull * MB },
83
+ { MODEL_3B, 256ull * MB },
75
84
  { MODEL_7B, 512ull * MB },
76
85
  { MODEL_13B, 512ull * MB },
77
86
  { MODEL_30B, 512ull * MB },
@@ -170,6 +179,7 @@ struct llama_model {
170
179
  struct ggml_tensor * output;
171
180
 
172
181
  std::vector<llama_layer> layers;
182
+ int n_gpu_layers;
173
183
 
174
184
  // context
175
185
  struct ggml_context * ctx = NULL;
@@ -195,6 +205,16 @@ struct llama_model {
195
205
  if (ctx) {
196
206
  ggml_free(ctx);
197
207
  }
208
+
209
+ #ifdef GGML_USE_CUBLAS
210
+ for (size_t i = 0; i < tensors_by_name.size(); ++i) {
211
+ ggml_cuda_free_data(tensors_by_name[i].second);
212
+ }
213
+ #elif defined(GGML_USE_CLBLAST)
214
+ for (size_t i = 0; i < tensors_by_name.size(); ++i) {
215
+ ggml_cl_free_data(tensors_by_name[i].second);
216
+ }
217
+ #endif
198
218
  }
199
219
  };
200
220
 
@@ -243,6 +263,10 @@ struct llama_context {
243
263
  llama_ctx_buffer buf_compute;
244
264
  llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
245
265
 
266
+ #ifdef GGML_USE_METAL
267
+ ggml_metal_context * ctx_metal = NULL;
268
+ #endif
269
+
246
270
  int buf_last = 0;
247
271
  size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
248
272
 
@@ -282,15 +306,15 @@ template <typename T>
282
306
  static T checked_mul(T a, T b) {
283
307
  T ret = a * b;
284
308
  if (a != 0 && ret / a != b) {
285
- throw format("overflow multiplying %llu * %llu",
286
- (unsigned long long) a, (unsigned long long) b);
309
+ throw std::runtime_error(format("overflow multiplying %llu * %llu",
310
+ (unsigned long long) a, (unsigned long long) b));
287
311
  }
288
312
  return ret;
289
313
  }
290
314
 
291
315
  static size_t checked_div(size_t a, size_t b) {
292
316
  if (b == 0 || a % b != 0) {
293
- throw format("error dividing %zu / %zu", a, b);
317
+ throw std::runtime_error(format("error dividing %zu / %zu", a, b));
294
318
  }
295
319
  return a / b;
296
320
  }
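
Throughout this release the error paths switch from throw format(...) (which threw a std::string) to throw std::runtime_error(format(...)), so callers can rely on a single catch (const std::exception &) clause, as the updated llama_model_load and llama_model_quantize wrappers later in this diff do. A small sketch of the caller-side pattern; load_model_or_report and the thrown message are illustrative stand-ins:

    #include <cstdio>
    #include <stdexcept>

    static bool load_model_or_report(const char * path) {
        try {
            // 0.1.x code threw std::string from deep inside the loader;
            // in 0.2.0 every failure path is a std::runtime_error, caught below
            throw std::runtime_error("demo: unknown (magic, version) combination");
        } catch (const std::exception & err) {
            fprintf(stderr, "error loading model '%s': %s\n", path, err.what());
            return false;
        }
    }

    int main() {
        return load_model_or_report("models/7B/ggml-model-q4_0.bin") ? 0 : 1;
    }
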
@@ -354,7 +378,7 @@ struct llama_load_tensor {
354
378
  const auto & first_shard = shards.at(0);
355
379
  for (const auto & shard : shards) {
356
380
  if (shard.type != first_shard.type) {
357
- throw format("inconsistent tensor shard type in '%s'", name.c_str());
381
+ throw std::runtime_error(format("inconsistent tensor shard type in '%s'", name.c_str()));
358
382
  }
359
383
  }
360
384
  type = first_shard.type;
@@ -377,8 +401,8 @@ struct llama_load_tensor {
377
401
  const auto & first_shard = shards.at(0);
378
402
  for (const auto & shard : shards) {
379
403
  if (shard.ne != first_shard.ne) {
380
- throw format("inconsistent tensor shard shape in '%s': first was %s, other was %s",
381
- name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str());
404
+ throw std::runtime_error(format("inconsistent tensor shard shape in '%s': first was %s, other was %s",
405
+ name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str()));
382
406
  }
383
407
  }
384
408
  ne = first_shard.ne;
@@ -456,8 +480,8 @@ struct llama_file_loader {
456
480
  }
457
481
  }
458
482
 
459
- throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
460
- magic, version);
483
+ throw std::runtime_error(format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
484
+ magic, version));
461
485
  }
462
486
  void read_hparams() {
463
487
  hparams.n_vocab = file.read_u32();
@@ -497,7 +521,7 @@ struct llama_file_loader {
497
521
  file.read_raw(shard.ne.data(), sizeof(shard.ne[0]) * n_dims);
498
522
  std::string name = file.read_string(name_len);
499
523
  if (n_dims < 1 || n_dims > 2) {
500
- throw format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims);
524
+ throw std::runtime_error(format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims));
501
525
  }
502
526
  switch (shard.type) {
503
527
  case GGML_TYPE_F32:
@@ -507,9 +531,14 @@ struct llama_file_loader {
507
531
  case GGML_TYPE_Q5_0:
508
532
  case GGML_TYPE_Q5_1:
509
533
  case GGML_TYPE_Q8_0:
534
+ case GGML_TYPE_Q2_K:
535
+ case GGML_TYPE_Q3_K:
536
+ case GGML_TYPE_Q4_K:
537
+ case GGML_TYPE_Q5_K:
538
+ case GGML_TYPE_Q6_K:
510
539
  break;
511
540
  default: {
512
- throw format("unrecognized tensor type %u\n", shard.type);
541
+ throw std::runtime_error(format("unrecognized tensor type %u\n", shard.type));
513
542
  }
514
543
  }
515
544
 
@@ -582,6 +611,11 @@ struct llama_file_saver {
582
611
  case GGML_TYPE_Q5_0:
583
612
  case GGML_TYPE_Q5_1:
584
613
  case GGML_TYPE_Q8_0:
614
+ case GGML_TYPE_Q2_K:
615
+ case GGML_TYPE_Q3_K:
616
+ case GGML_TYPE_Q4_K:
617
+ case GGML_TYPE_Q5_K:
618
+ case GGML_TYPE_Q6_K:
585
619
  break;
586
620
  default: LLAMA_ASSERT(false);
587
621
  }
@@ -613,7 +647,7 @@ struct llama_model_loader {
613
647
  auto * ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
614
648
  file_loaders.emplace_back(ith_file);
615
649
  if (ith_file->hparams != first_file->hparams) {
616
- throw format("llama.cpp: hparams inconsistent between files");
650
+ throw std::runtime_error(format("llama.cpp: hparams inconsistent between files"));
617
651
  }
618
652
  }
619
653
  if (!llama_mmap::SUPPORTED) {
@@ -643,7 +677,7 @@ struct llama_model_loader {
643
677
  uint32_t guess_n_parts() const {
644
678
  auto it = tensors_map.name_to_idx.find("tok_embeddings.weight");
645
679
  if (it == tensors_map.name_to_idx.end()) {
646
- throw std::string("missing tok_embeddings.weight");
680
+ throw std::runtime_error(std::string("missing tok_embeddings.weight"));
647
681
  }
648
682
  const llama_load_tensor & lt = tensors_map.tensors.at(it->second);
649
683
  return file_loaders.at(0)->hparams.n_embd / lt.shards.at(0).ne.at(0);
@@ -660,12 +694,12 @@ struct llama_model_loader {
660
694
  struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
661
695
  auto it = tensors_map.name_to_idx.find(name);
662
696
  if (it == tensors_map.name_to_idx.end()) {
663
- throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
697
+ throw std::runtime_error(std::runtime_error(format("llama.cpp: tensor '%s' is missing from model", name.c_str())));
664
698
  }
665
699
  llama_load_tensor & lt = tensors_map.tensors.at(it->second);
666
700
  if (lt.ne != ne) {
667
- throw format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
668
- name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
701
+ throw std::runtime_error(format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
702
+ name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str()));
669
703
  }
670
704
 
671
705
  return get_tensor_for(lt, backend);
@@ -681,6 +715,7 @@ struct llama_model_loader {
681
715
  }
682
716
  ggml_set_name(tensor, lt.name.c_str());
683
717
  LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
718
+
684
719
  tensor->backend = backend;
685
720
  lt.ggml_tensor = tensor;
686
721
  num_ggml_tensors_created++;
@@ -689,7 +724,7 @@ struct llama_model_loader {
689
724
 
690
725
  void done_getting_tensors() const {
691
726
  if (num_ggml_tensors_created != tensors_map.tensors.size()) {
692
- throw std::string("llama.cpp: file contained more tensors than expected");
727
+ throw std::runtime_error(std::string("llama.cpp: file contained more tensors than expected"));
693
728
  }
694
729
  }
695
730
 
@@ -833,7 +868,10 @@ static bool kv_cache_init(
833
868
  struct llama_context_params llama_context_default_params() {
834
869
  struct llama_context_params result = {
835
870
  /*.n_ctx =*/ 512,
871
+ /*.n_batch =*/ 512,
836
872
  /*.gpu_layers =*/ 0,
873
+ /*.main_gpu =*/ 0,
874
+ /*.tensor_split =*/ {0},
837
875
  /*.seed =*/ -1,
838
876
  /*.f16_kv =*/ true,
839
877
  /*.logits_all =*/ false,
@@ -848,6 +886,17 @@ struct llama_context_params llama_context_default_params() {
848
886
  return result;
849
887
  }
850
888
 
889
+ struct llama_model_quantize_params llama_model_quantize_default_params() {
890
+ struct llama_model_quantize_params result = {
891
+ /*.nthread =*/ 0,
892
+ /*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
893
+ /*.allow_requantize =*/ false,
894
+ /*.quantize_output_tensor =*/ true,
895
+ };
896
+
897
+ return result;
898
+ }
899
+
851
900
  bool llama_mmap_supported() {
852
901
  return llama_mmap::SUPPORTED;
853
902
  }
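
With the new n_batch / main_gpu / tensor_split defaults and the llama_model_quantize_default_params helper above, a caller can start from library defaults and override only what it needs. A minimal sketch against the llama.h shipped with 0.2.0 (the model path is a placeholder; field names are as exposed by that header):

    #include "llama.h"

    int main() {
        // context parameters now carry batch size and multi-GPU settings
        llama_context_params cparams = llama_context_default_params();
        cparams.n_ctx        = 2048;
        cparams.n_batch      = 512;   // new in 0.2.0
        cparams.n_gpu_layers = 32;
        cparams.main_gpu     = 0;     // new in 0.2.0

        llama_context * ctx = llama_init_from_file("models/7B/ggml-model-q4_0.bin", cparams);
        if (ctx == NULL) {
            return 1;
        }
        llama_free(ctx);
        return 0;
    }
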
@@ -898,6 +947,16 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
898
947
  case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
899
948
  case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
900
949
  case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
950
+ // K-quants
951
+ case LLAMA_FTYPE_MOSTLY_Q2_K: return "mostly Q2_K";
952
+ case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "mostly Q3_K - Small";
953
+ case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "mostly Q3_K - Medium";
954
+ case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "mostly Q3_K - Large";
955
+ case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "mostly Q4_K - Small";
956
+ case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "mostly Q4_K - Medium";
957
+ case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "mostly Q5_K - Small";
958
+ case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "mostly Q5_K - Medium";
959
+ case LLAMA_FTYPE_MOSTLY_Q6_K: return "mostly Q6_K";
901
960
  default: return "unknown, may not work";
902
961
  }
903
962
  }
@@ -917,7 +976,10 @@ static void llama_model_load_internal(
917
976
  const std::string & fname,
918
977
  llama_context & lctx,
919
978
  int n_ctx,
979
+ int n_batch,
920
980
  int n_gpu_layers,
981
+ int main_gpu,
982
+ const float * tensor_split,
921
983
  ggml_type memory_type,
922
984
  bool use_mmap,
923
985
  bool use_mlock,
@@ -932,9 +994,9 @@ static void llama_model_load_internal(
932
994
  lctx.vocab = std::move(ml->file_loaders.at(0)->vocab);
933
995
  auto & model = lctx.model;
934
996
  model.hparams = ml->file_loaders.at(0)->hparams;
997
+ model.n_gpu_layers = n_gpu_layers;
935
998
  llama_file_version file_version = ml->file_loaders.at(0)->file_version;
936
999
  auto & hparams = model.hparams;
937
- uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
938
1000
 
939
1001
  {
940
1002
  switch (hparams.n_layer) {
@@ -948,6 +1010,8 @@ static void llama_model_load_internal(
948
1010
  hparams.n_ctx = n_ctx;
949
1011
  }
950
1012
 
1013
+ const uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
1014
+
951
1015
  {
952
1016
  fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version));
953
1017
  fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab);
@@ -967,7 +1031,7 @@ static void llama_model_load_internal(
967
1031
  if (hparams.ftype != LLAMA_FTYPE_ALL_F32 &&
968
1032
  hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 &&
969
1033
  hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) {
970
- throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)");
1034
+ throw std::runtime_error(format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)"));
971
1035
  }
972
1036
  }
973
1037
 
@@ -975,7 +1039,7 @@ static void llama_model_load_internal(
975
1039
  if (hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
976
1040
  hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_1 ||
977
1041
  hparams.ftype == LLAMA_FTYPE_MOSTLY_Q8_0) {
978
- throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)");
1042
+ throw std::runtime_error(format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)"));
979
1043
  }
980
1044
  }
981
1045
 
@@ -1006,18 +1070,28 @@ static void llama_model_load_internal(
1006
1070
 
1007
1071
  model.ctx = ggml_init(params);
1008
1072
  if (!model.ctx) {
1009
- throw format("ggml_init() failed");
1073
+ throw std::runtime_error(format("ggml_init() failed"));
1010
1074
  }
1011
1075
  }
1012
1076
 
1013
- #ifdef GGML_USE_CUBLAS
1014
- #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CUDA
1077
+ (void) main_gpu;
1078
+ #if defined(GGML_USE_CUBLAS)
1079
+ fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__);
1080
+ ggml_cuda_set_main_device(main_gpu);
1081
+ #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
1082
+ #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
1083
+ #elif defined(GGML_USE_CLBLAST)
1084
+ fprintf(stderr, "%s: using OpenCL for GPU acceleration\n", __func__);
1085
+ #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
1086
+ #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU
1015
1087
  #else
1016
- #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
1088
+ #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
1089
+ #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_CPU
1017
1090
  #endif
1018
1091
 
1019
1092
  // prepare memory for the weights
1020
- size_t vram_total = 0;
1093
+ size_t vram_weights = 0;
1094
+ size_t vram_scratch = 0;
1021
1095
  {
1022
1096
  const uint32_t n_embd = hparams.n_embd;
1023
1097
  const uint32_t n_layer = hparams.n_layer;
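
The backend macros introduced above resolve differently per build: with cuBLAS, offloaded tensors can be GPU resident or row-split across devices; with CLBlast both map to the single GPU backend; otherwise everything stays on the CPU. A compile-time sketch of the same selection (k_offload / k_offload_split are illustrative names):

    #include <cstdio>
    #include "ggml.h"

    #if defined(GGML_USE_CUBLAS)
    static const enum ggml_backend k_offload       = GGML_BACKEND_GPU;
    static const enum ggml_backend k_offload_split = GGML_BACKEND_GPU_SPLIT; // rows split across devices
    #elif defined(GGML_USE_CLBLAST)
    static const enum ggml_backend k_offload       = GGML_BACKEND_GPU;
    static const enum ggml_backend k_offload_split = GGML_BACKEND_GPU;       // OpenCL has no split mode
    #else
    static const enum ggml_backend k_offload       = GGML_BACKEND_CPU;
    static const enum ggml_backend k_offload_split = GGML_BACKEND_CPU;
    #endif

    int main() {
        printf("offload backend: %d, split backend: %d\n", (int) k_offload, (int) k_offload_split);
        return 0;
    }
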
@@ -1032,7 +1106,7 @@ static void llama_model_load_internal(
1032
1106
  {
1033
1107
  ggml_backend backend_output;
1034
1108
  if (n_gpu_layers > int(n_layer)) { // NOLINT
1035
- backend_output = LLAMA_BACKEND_OFFLOAD;
1109
+ backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
1036
1110
  } else {
1037
1111
  backend_output = GGML_BACKEND_CPU;
1038
1112
  }
@@ -1044,7 +1118,8 @@ static void llama_model_load_internal(
1044
1118
 
1045
1119
  model.layers.resize(n_layer);
1046
1120
  for (uint32_t i = 0; i < n_layer; ++i) {
1047
- const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
1121
+ const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
1122
+ const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
1048
1123
 
1049
1124
  auto & layer = model.layers[i];
1050
1125
 
@@ -1052,19 +1127,19 @@ static void llama_model_load_internal(
1052
1127
 
1053
1128
  layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);
1054
1129
 
1055
- layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend);
1056
- layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend);
1057
- layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend);
1058
- layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend);
1130
+ layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend_split);
1131
+ layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend_split);
1132
+ layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend_split);
1133
+ layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend_split);
1059
1134
 
1060
1135
  layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);
1061
1136
 
1062
- layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend);
1063
- layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend);
1064
- layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend);
1137
+ layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend_split);
1138
+ layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend_split);
1139
+ layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend_split);
1065
1140
 
1066
- if (backend == GGML_BACKEND_CUDA) {
1067
- vram_total +=
1141
+ if (backend == GGML_BACKEND_GPU) {
1142
+ vram_weights +=
1068
1143
  ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
1069
1144
  ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.attention_norm) +
1070
1145
  ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
@@ -1081,10 +1156,10 @@ static void llama_model_load_internal(
1081
1156
  // this is the total memory required to run the inference
1082
1157
  const size_t mem_required =
1083
1158
  ctx_size +
1084
- mmapped_size - vram_total + // weights in VRAM not in memory
1159
+ mmapped_size - vram_weights + // weights in VRAM not in memory
1085
1160
  MEM_REQ_SCRATCH0().at(model.type) +
1086
1161
  MEM_REQ_SCRATCH1().at(model.type) +
1087
- MEM_REQ_EVAL().at(model.type);
1162
+ MEM_REQ_EVAL().at (model.type);
1088
1163
 
1089
1164
  // this is the memory required by one llama_state
1090
1165
  const size_t mem_required_state =
@@ -1093,15 +1168,25 @@ static void llama_model_load_internal(
1093
1168
  fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
1094
1169
  mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
1095
1170
 
1171
+ (void) vram_scratch;
1096
1172
  #ifdef GGML_USE_CUBLAS
1173
+ vram_scratch = n_batch * MB;
1174
+ ggml_cuda_set_scratch_size(vram_scratch);
1175
+ if (n_gpu_layers > 0) {
1176
+ fprintf(stderr, "%s: allocating batch_size x 1 MB = %ld MB VRAM for the scratch buffer\n",
1177
+ __func__, vram_scratch / MB);
1178
+ }
1179
+ #endif // GGML_USE_CUBLAS
1180
+ #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
1097
1181
  const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
1098
1182
 
1099
- fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
1183
+ fprintf(stderr, "%s: offloading %d layers to GPU\n", __func__, n_gpu);
1100
1184
  if (n_gpu_layers > (int) hparams.n_layer) {
1101
- fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
1185
+ fprintf(stderr, "%s: offloading output layer to GPU\n", __func__);
1102
1186
  }
1103
- fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
1104
- #elif !defined(GGML_USE_CLBLAST)
1187
+ fprintf(stderr, "%s: total VRAM used: %zu MB\n",
1188
+ __func__, (vram_weights + vram_scratch + MB - 1) / MB); // round up
1189
+ #else
1105
1190
  (void) n_gpu_layers;
1106
1191
  #endif
1107
1192
  }
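
The cuBLAS scratch buffer is sized at one MB per batch element, and the reported VRAM total is weights plus scratch, rounded up to whole MB. A self-contained sketch of that arithmetic; the weight size below is made up, only the formula comes from the hunk above:

    #include <cstddef>
    #include <cstdio>

    int main() {
        const size_t MB      = 1024*1024;
        const int    n_batch = 512;                    // default from llama_context_default_params()

        const size_t vram_scratch = n_batch * MB;      // 512 MB scratch for cuBLAS
        const size_t vram_weights = 3500ull * MB;      // illustrative weight size, not from the diff

        // same rounding as the new log line: round up to whole MB
        printf("total VRAM used: %zu MB\n", (vram_weights + vram_scratch + MB - 1) / MB);
        return 0;
    }
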
@@ -1113,8 +1198,10 @@ static void llama_model_load_internal(
1113
1198
 
1114
1199
  ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
1115
1200
 
1116
- #ifdef GGML_USE_CUBLAS
1201
+ #if defined(GGML_USE_CUBLAS)
1117
1202
  {
1203
+ ggml_cuda_set_tensor_split(tensor_split);
1204
+
1118
1205
  size_t done_size = 0;
1119
1206
  size_t data_size = 0;
1120
1207
  for (llama_load_tensor & lt : ml->tensors_map.tensors) {
@@ -1124,7 +1211,8 @@ static void llama_model_load_internal(
1124
1211
  }
1125
1212
  }
1126
1213
  for (llama_load_tensor & lt : ml->tensors_map.tensors) {
1127
- if (lt.ggml_tensor->backend != GGML_BACKEND_CUDA) {
1214
+ ggml_backend backend = lt.ggml_tensor->backend;
1215
+ if (backend != GGML_BACKEND_GPU && backend != GGML_BACKEND_GPU_SPLIT) {
1128
1216
  continue;
1129
1217
  }
1130
1218
  if (progress_callback) {
@@ -1136,30 +1224,28 @@ static void llama_model_load_internal(
1136
1224
  }
1137
1225
  #elif defined(GGML_USE_CLBLAST)
1138
1226
  {
1139
- const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
1140
-
1141
- fprintf(stderr, "ggml_opencl: offloading %d layers to GPU\n", n_gpu);
1142
-
1143
- size_t vram_total = 0;
1144
-
1145
- for (int i = 0; i < n_gpu; ++i) {
1146
- const auto & layer = model.layers[i];
1147
-
1148
- ggml_cl_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
1149
- ggml_cl_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
1150
- ggml_cl_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
1151
- ggml_cl_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
1152
- ggml_cl_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
1153
- ggml_cl_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
1154
- ggml_cl_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
1227
+ size_t done_size = 0;
1228
+ size_t data_size = 0;
1229
+ for (llama_load_tensor & lt : ml->tensors_map.tensors) {
1230
+ data_size += lt.size;
1231
+ if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
1232
+ done_size += lt.size;
1233
+ }
1155
1234
  }
1156
- if (n_gpu_layers > (int) hparams.n_layer) {
1157
- fprintf(stderr, "ggml_opencl: offloading output layer to GPU\n");
1158
- ggml_cl_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
1235
+ for (llama_load_tensor & lt : ml->tensors_map.tensors) {
1236
+ if (lt.ggml_tensor->backend != GGML_BACKEND_GPU) {
1237
+ continue;
1238
+ }
1239
+ if (progress_callback) {
1240
+ progress_callback((float) done_size / data_size, progress_callback_user_data);
1241
+ }
1242
+ ggml_cl_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
1243
+ done_size += lt.size;
1159
1244
  }
1160
-
1161
- fprintf(stderr, "ggml_opencl: total VRAM used: %zu MB\n", vram_total / 1024 / 1024);
1162
1245
  }
1246
+ #else
1247
+ (void) n_batch;
1248
+ (void) tensor_split;
1163
1249
  #endif
1164
1250
 
1165
1251
  if (progress_callback) {
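
Both GPU load paths now report progress as bytes-already-handled over total bytes, counting CPU-resident tensors as done up front before uploading the rest. A self-contained sketch of that accounting (the tensor sizes and the fake_tensor type are made up for illustration):

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    struct fake_tensor { size_t size; bool on_cpu; };   // illustrative stand-in for llama_load_tensor

    int main() {
        std::vector<fake_tensor> tensors = { {100, true}, {400, false}, {500, false} };

        size_t done_size = 0, data_size = 0;
        for (const auto & t : tensors) {
            data_size += t.size;
            if (t.on_cpu) done_size += t.size;          // CPU tensors need no upload
        }
        for (const auto & t : tensors) {
            if (t.on_cpu) continue;
            printf("progress before upload: %.2f\n", (float) done_size / data_size);
            // ... ggml_cl_load_data or its CUDA counterpart would run here ...
            done_size += t.size;
        }
        return 0;
    }
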
@@ -1177,7 +1263,10 @@ static bool llama_model_load(
1177
1263
  const std::string & fname,
1178
1264
  llama_context & lctx,
1179
1265
  int n_ctx,
1266
+ int n_batch,
1180
1267
  int n_gpu_layers,
1268
+ int main_gpu,
1269
+ float * tensor_split,
1181
1270
  ggml_type memory_type,
1182
1271
  bool use_mmap,
1183
1272
  bool use_mlock,
@@ -1185,28 +1274,30 @@ static bool llama_model_load(
1185
1274
  llama_progress_callback progress_callback,
1186
1275
  void *progress_callback_user_data) {
1187
1276
  try {
1188
- llama_model_load_internal(fname, lctx, n_ctx, n_gpu_layers, memory_type, use_mmap, use_mlock,
1189
- vocab_only, progress_callback, progress_callback_user_data);
1277
+ llama_model_load_internal(fname, lctx, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, memory_type,
1278
+ use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
1190
1279
  return true;
1191
- } catch (const std::string & err) {
1192
- fprintf(stderr, "error loading model: %s\n", err.c_str());
1280
+ } catch (const std::exception & err) {
1281
+ fprintf(stderr, "error loading model: %s\n", err.what());
1193
1282
  return false;
1194
1283
  }
1195
1284
  }
1196
1285
 
1197
1286
  // evaluate the transformer
1198
1287
  //
1199
- // - lctx: llama context
1200
- // - tokens: new batch of tokens to process
1201
- // - n_past: the context size so far
1202
- // - n_threads: number of threads to use
1288
+ // - lctx: llama context
1289
+ // - tokens: new batch of tokens to process
1290
+ // - n_past: the context size so far
1291
+ // - n_threads: number of threads to use
1292
+ // - cgraph_fname: filename of the exported computation graph
1203
1293
  //
1204
1294
  static bool llama_eval_internal(
1205
- llama_context & lctx,
1206
- const llama_token * tokens,
1207
- const int n_tokens,
1208
- const int n_past,
1209
- const int n_threads) {
1295
+ llama_context & lctx,
1296
+ const llama_token * tokens,
1297
+ const int n_tokens,
1298
+ const int n_past,
1299
+ const int n_threads,
1300
+ const char * cgraph_fname) {
1210
1301
 
1211
1302
  // enforce that the first token is BOS
1212
1303
  if (n_past == 0 && tokens[0] != llama_token_bos()) {
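
llama_eval_internal now takes an optional cgraph_fname: the public llama_eval passes nullptr, while the new llama_eval_export (added near the end of this diff) passes a file name to dump the computation graph. A minimal caller sketch, assuming a context created as in the earlier example; the output file name is arbitrary:

    #include "llama.h"
    #include <vector>

    // ctx is assumed to come from llama_init_from_file(), as sketched earlier.
    static int run_one_batch(llama_context * ctx) {
        std::vector<llama_token> tokens = { llama_token_bos() };

        // normal evaluation path: cgraph_fname is nullptr internally
        if (llama_eval(ctx, tokens.data(), (int) tokens.size(), /*n_past=*/0, /*n_threads=*/4)) {
            return 1;
        }

        // new in 0.2.0: evaluate a dummy batch and export the ggml graph to disk
        return llama_eval_export(ctx, "llama.ggml");
    }
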
@@ -1225,12 +1316,13 @@ static bool llama_eval_internal(
1225
1316
 
1226
1317
  LLAMA_ASSERT(!!kv_self.ctx);
1227
1318
 
1228
- const int n_embd = hparams.n_embd;
1229
- const int n_layer = hparams.n_layer;
1230
- const int n_ctx = hparams.n_ctx;
1231
- const int n_head = hparams.n_head;
1232
- const int n_vocab = hparams.n_vocab;
1233
- const int n_rot = hparams.n_embd/hparams.n_head;
1319
+ const int n_embd = hparams.n_embd;
1320
+ const int n_layer = hparams.n_layer;
1321
+ const int n_ctx = hparams.n_ctx;
1322
+ const int n_head = hparams.n_head;
1323
+ const int n_vocab = hparams.n_vocab;
1324
+ const int n_rot = hparams.n_embd/hparams.n_head;
1325
+ const int n_gpu_layers = model.n_gpu_layers;
1234
1326
 
1235
1327
  auto & mem_per_token = lctx.mem_per_token;
1236
1328
  auto & buf_compute = lctx.buf_compute;
@@ -1252,40 +1344,66 @@ static bool llama_eval_internal(
1252
1344
  ggml_set_name(embd, "embd");
1253
1345
  memcpy(embd->data, tokens, N*ggml_element_size(embd));
1254
1346
 
1347
+ struct ggml_tensor * cur;
1255
1348
  struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
1256
1349
 
1350
+ const int i_gpu_start = n_layer - n_gpu_layers;
1351
+ (void) i_gpu_start;
1352
+
1257
1353
  for (int il = 0; il < n_layer; ++il) {
1258
- struct ggml_tensor * inpSA = inpL;
1354
+ offload_func_t offload_func = llama_nop;
1259
1355
 
1260
- struct ggml_tensor * cur;
1356
+ #ifdef GGML_USE_CUBLAS
1357
+ if (il >= i_gpu_start) {
1358
+ offload_func = ggml_cuda_assign_buffers; // sets the output backend to GPU
1359
+ }
1360
+ #endif // GGML_USE_CUBLAS
1361
+
1362
+ struct ggml_tensor * inpSA = inpL;
1261
1363
 
1262
1364
  lctx.use_buf(ctx0, 0);
1263
1365
 
1264
1366
  // norm
1265
1367
  {
1266
1368
  cur = ggml_rms_norm(ctx0, inpL);
1369
+ offload_func(cur);
1370
+ ggml_set_name(cur, "rms_norm_0");
1267
1371
 
1268
1372
  // cur = cur*attention_norm(broadcasted)
1269
1373
  cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm);
1374
+ offload_func(cur);
1375
+ ggml_set_name(cur, "attention_norm_0");
1270
1376
  }
1271
1377
 
1272
1378
  // self-attention
1273
1379
  {
1274
1380
  // compute Q and K and RoPE them
1275
- struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
1276
- struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
1277
- ggml_set_name(Qcur, "Qcur");
1381
+ struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
1382
+ // offload_func(tmpq);
1383
+ ggml_set_name(tmpq, "tmpq");
1384
+
1385
+ struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
1386
+ // offload_func(tmpk);
1387
+ ggml_set_name(tmpk, "tmpk");
1388
+
1389
+ struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0);
1278
1390
  ggml_set_name(Kcur, "Kcur");
1279
1391
 
1392
+ struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0);
1393
+ ggml_set_name(Qcur, "Qcur");
1394
+
1280
1395
  // store key and value to memory
1281
1396
  {
1282
1397
  // compute the transposed [N, n_embd] V matrix
1283
1398
  struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), n_embd, N));
1399
+ ggml_set_name(Vcur, "Vcur");
1284
1400
 
1285
1401
  struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
1402
+ ggml_set_name(k, "k");
1286
1403
  struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
1287
1404
  ( n_ctx)*ggml_element_size(kv_self.v),
1288
1405
  (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
1406
+ ggml_set_name(v, "v");
1289
1407
 
1290
1408
  // important: storing RoPE-ed version of K in the KV cache!
1291
1409
  ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
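
Every intermediate node in the graph now gets a ggml_set_name tag ("rms_norm_0", "Vcur", "result_wo", ...), which makes the graph self-describing when it is exported or inspected. A tiny standalone sketch of the same convention on a trivial ggml graph, assuming the ggml bundled with this gem; buffer size and tensor shape are arbitrary:

    #include "ggml.h"

    int main() {
        struct ggml_init_params params = { 16*1024*1024, NULL, false };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
        ggml_set_f32(x, 1.0f);
        struct ggml_tensor * y = ggml_rms_norm(ctx, x);
        ggml_set_name(x, "x");
        ggml_set_name(y, "rms_norm_x");   // same convention as the diff: name every node

        struct ggml_cgraph gf = ggml_build_forward(y);
        ggml_graph_compute(ctx, &gf);

        ggml_free(ctx);
        return 0;
    }
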
@@ -1326,7 +1444,6 @@ static bool llama_eval_internal(
1326
1444
  struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
1327
1445
  ggml_set_name(KQ_soft_max, "KQ_soft_max");
1328
1446
 
1329
-
1330
1447
  // split cached V into n_head heads
1331
1448
  struct ggml_tensor * V =
1332
1449
  ggml_view_3d(ctx0, kv_self.v,
@@ -1361,73 +1478,143 @@ static bool llama_eval_internal(
1361
1478
  cur = ggml_mul_mat(ctx0,
1362
1479
  model.layers[il].wo,
1363
1480
  cur);
1481
+ offload_func(cur);
1482
+ ggml_set_name(cur, "result_wo");
1364
1483
  }
1365
1484
 
1366
1485
  lctx.use_buf(ctx0, 1);
1486
+ //ggml_cuda_set_scratch(1);
1367
1487
 
1368
1488
  struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
1489
+ offload_func(inpFF);
1490
+ ggml_set_name(inpFF, "inpFF");
1369
1491
 
1370
1492
  // feed-forward network
1371
1493
  {
1372
1494
  // norm
1373
1495
  {
1374
1496
  cur = ggml_rms_norm(ctx0, inpFF);
1497
+ offload_func(cur);
1498
+ ggml_set_name(cur, "rms_norm_1");
1375
1499
 
1376
1500
  // cur = cur*ffn_norm(broadcasted)
1377
1501
  cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
1502
+ offload_func(cur);
1503
+ ggml_set_name(cur, "ffn_norm");
1378
1504
  }
1379
1505
 
1380
1506
  struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
1381
1507
  model.layers[il].w3,
1382
1508
  cur);
1509
+ offload_func(tmp);
1510
+ ggml_set_name(tmp, "result_w3");
1383
1511
 
1384
1512
  cur = ggml_mul_mat(ctx0,
1385
1513
  model.layers[il].w1,
1386
1514
  cur);
1515
+ offload_func(cur);
1516
+ ggml_set_name(cur, "result_w2");
1387
1517
 
1388
1518
  // SILU activation
1389
1519
  cur = ggml_silu(ctx0, cur);
1520
+ offload_func(cur);
1521
+ ggml_set_name(cur, "silu");
1390
1522
 
1391
1523
  cur = ggml_mul(ctx0, cur, tmp);
1524
+ offload_func(cur);
1525
+ ggml_set_name(cur, "silu_x_result_w3");
1392
1526
 
1393
1527
  cur = ggml_mul_mat(ctx0,
1394
1528
  model.layers[il].w2,
1395
1529
  cur);
1530
+ offload_func(cur);
1531
+ ggml_set_name(cur, "result_w2");
1396
1532
  }
1397
1533
 
1398
1534
  cur = ggml_add(ctx0, cur, inpFF);
1535
+ offload_func(cur);
1536
+ ggml_set_name(cur, "inpFF_+_result_w2");
1399
1537
 
1400
1538
  // input for next layer
1401
1539
  inpL = cur;
1540
+
1402
1541
  }
1403
1542
 
1404
1543
  lctx.use_buf(ctx0, 0);
1544
+ //ggml_cuda_set_scratch(0);
1405
1545
 
1406
1546
  // used at the end to optionally extract the embeddings
1407
1547
  struct ggml_tensor * embeddings = NULL;
1408
1548
 
1549
+ offload_func_t offload_func = llama_nop;
1550
+
1551
+ #ifdef GGML_USE_CUBLAS
1552
+ if (n_gpu_layers > n_layer) {
1553
+ offload_func = ggml_cuda_assign_buffers; // sets the output backend to GPU
1554
+ }
1555
+ #endif // GGML_USE_CUBLAS
1556
+
1409
1557
  // norm
1410
1558
  {
1559
+ cur = ggml_rms_norm(ctx0, inpL);
1560
+ offload_func(cur);
1561
+ ggml_set_name(cur, "rms_norm_inpL");
1411
1562
 
1412
- inpL = ggml_rms_norm(ctx0, inpL);
1563
+ cur = ggml_rms_norm(ctx0, cur);
1564
+ offload_func(cur);
1565
+ ggml_set_name(cur, "rms_norm_after");
1413
1566
 
1414
- // inpL = inpL*norm(broadcasted)
1415
- inpL = ggml_mul(ctx0, inpL, model.norm);
1567
+ // cur = cur*norm(broadcasted)
1568
+ cur = ggml_mul(ctx0, cur, model.norm);
1569
+ offload_func(cur);
1570
+ ggml_set_name(cur, "result_norm");
1416
1571
 
1417
- embeddings = inpL;
1572
+ embeddings = cur;
1418
1573
  }
1419
1574
 
1575
+
1420
1576
  // lm_head
1421
- inpL = ggml_mul_mat(ctx0, model.output, inpL);
1577
+ cur = ggml_mul_mat(ctx0, model.output, cur);
1578
+ ggml_set_name(cur, "result_output");
1422
1579
 
1423
1580
  lctx.use_buf(ctx0, -1);
1424
1581
 
1425
1582
  // logits -> probs
1426
- //inpL = ggml_soft_max_inplace(ctx0, inpL);
1583
+ //cur = ggml_soft_max_inplace(ctx0, cur);
1427
1584
 
1428
1585
  // run the computation
1429
- ggml_build_forward_expand(&gf, inpL);
1430
- ggml_graph_compute (ctx0, &gf);
1586
+ ggml_build_forward_expand(&gf, cur);
1587
+
1588
+ #ifdef GGML_USE_METAL
1589
+ if (lctx.ctx_metal && N == 1) {
1590
+ ggml_metal_graph_compute(lctx.ctx_metal, &gf);
1591
+ ggml_metal_get_tensor (lctx.ctx_metal, cur);
1592
+ } else {
1593
+ // IMPORTANT:
1594
+ // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla
1595
+ // ggml_graph_compute(). It uses Apple's Accelerate CBLAS API which takes advantage of the ANE or the AMX
1596
+ // coprocessor.
1597
+ //
1598
+ // When we implement Matrix x Matrix Metal multiplication, we can avoid this branch.
1599
+ // But for now, we have focused only on Matrix x Vector Metal multiplication.
1600
+ //
1601
+ // TODO: avoid these syncs via shared memory (ref #1696)
1602
+ //
1603
+ if (lctx.ctx_metal) {
1604
+ // We need to sync the GPU KV cache with the CPU KV cache
1605
+ ggml_metal_get_tensor(lctx.ctx_metal, kv_self.k);
1606
+ ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
1607
+ }
1608
+
1609
+ ggml_graph_compute(ctx0, &gf);
1610
+ }
1611
+ #else
1612
+ ggml_graph_compute(ctx0, &gf);
1613
+ #endif
1614
+
1615
+ if (cgraph_fname) {
1616
+ ggml_graph_export(&gf, cgraph_fname);
1617
+ }
1431
1618
 
1432
1619
  #ifdef GGML_PERF
1433
1620
  // print timing information per ggml operation (for debugging purposes)
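
The Metal branch above only runs when a Metal context exists and the batch is a single token; prompt batches still fall back to ggml_graph_compute on the CPU after syncing the KV cache. The init hunk further down gates ggml_metal_init() on params.n_gpu_layers > 0, so from the caller's side the whole path is enabled by one field. A minimal sketch (model path is a placeholder):

    #include "llama.h"

    int main() {
        llama_context_params params = llama_context_default_params();
        // On a GGML_USE_METAL build, any positive value switches single-token
        // evaluation to the Metal path added above; larger batches still fall
        // back to ggml_graph_compute on the CPU.
        params.n_gpu_layers = 1;

        llama_context * ctx = llama_init_from_file("models/7B/ggml-model-q4_0.bin", params);
        if (ctx == NULL) return 1;
        llama_free(ctx);
        return 0;
    }
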
@@ -1441,7 +1628,7 @@ static bool llama_eval_internal(
1441
1628
  //}
1442
1629
 
1443
1630
  //embd_w.resize(n_vocab*N);
1444
- //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
1631
+ //memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N);
1445
1632
 
1446
1633
  // update kv token count
1447
1634
  lctx.model.kv_self.n = n_past + N;
@@ -1452,11 +1639,11 @@ static bool llama_eval_internal(
1452
1639
 
1453
1640
  if (lctx.logits_all) {
1454
1641
  logits_out.resize(n_vocab * N);
1455
- memcpy(logits_out.data(), (float *) ggml_get_data(inpL), sizeof(float)*n_vocab*N);
1642
+ memcpy(logits_out.data(), (float *) ggml_get_data(cur), sizeof(float)*n_vocab*N);
1456
1643
  } else {
1457
1644
  // return result for just the last token
1458
1645
  logits_out.resize(n_vocab);
1459
- memcpy(logits_out.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
1646
+ memcpy(logits_out.data(), (float *) ggml_get_data(cur) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
1460
1647
  }
1461
1648
  }
1462
1649
 
@@ -2055,16 +2242,88 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
2055
2242
  // quantization
2056
2243
  //
2057
2244
 
2058
- static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype, int nthread) {
2245
+ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llama_buffer & output, const int nelements, const int nthread) {
2246
+ if (output.size < nelements * sizeof(float)) {
2247
+ output.resize(nelements * sizeof(float));
2248
+ }
2249
+ float * f32_output = (float *) output.addr;
2250
+
2251
+ quantize_fns_t qtype;
2252
+ if (ggml_is_quantized(tensor.type)) {
2253
+ qtype = ggml_internal_get_quantize_fn(tensor.type);
2254
+ if (qtype.dequantize_row_q == NULL) {
2255
+ throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor.type)));
2256
+ }
2257
+ } else if (tensor.type != GGML_TYPE_F16) {
2258
+ throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor.type)));
2259
+ }
2260
+
2261
+ if (nthread < 2) {
2262
+ if (tensor.type == GGML_TYPE_F16) {
2263
+ ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor.data, f32_output, nelements);
2264
+ } else if (ggml_is_quantized(tensor.type)) {
2265
+ qtype.dequantize_row_q(tensor.data, f32_output, nelements);
2266
+ } else {
2267
+ LLAMA_ASSERT(false); // unreachable
2268
+ }
2269
+ return;
2270
+ }
2271
+
2272
+ auto block_size = tensor.type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor.type);
2273
+ auto block_size_bytes = ggml_type_size(tensor.type);
2274
+
2275
+ LLAMA_ASSERT(nelements % block_size == 0);
2276
+ auto nblocks = nelements / block_size;
2277
+ auto blocks_per_thread = nblocks / nthread;
2278
+ auto spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count
2279
+
2280
+ std::vector<std::thread> workers;
2281
+ for (auto tnum = 0, in_buff_offs = 0, out_buff_offs = 0; tnum < nthread; tnum++) {
2282
+ auto thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
2283
+ auto thr_elems = thr_blocks * block_size; // number of elements for this thread
2284
+ auto thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread
2285
+
2286
+ auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
2287
+ if (typ == GGML_TYPE_F16) {
2288
+ ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
2289
+ } else {
2290
+ qtype.dequantize_row_q(inbuf, outbuf, nels);
2291
+ }
2292
+ };
2293
+ workers.push_back(std::thread(compute, tensor.type, tensor.data + in_buff_offs, f32_output + out_buff_offs, thr_elems));
2294
+ in_buff_offs += thr_block_bytes;
2295
+ out_buff_offs += thr_elems;
2296
+ }
2297
+ for (auto & worker : workers) {
2298
+ worker.join();
2299
+ }
2300
+
2301
+ }
2302
+
2303
+ static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
2059
2304
  ggml_type quantized_type;
2060
- switch (ftype) {
2305
+ llama_ftype ftype = params->ftype;
2306
+ int nthread = params->nthread;
2307
+
2308
+ switch (params->ftype) {
2061
2309
  case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
2062
2310
  case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
2063
2311
  case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
2064
2312
  case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
2065
2313
  case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
2066
- default: throw format("invalid output file type %d\n", ftype);
2067
- };
2314
+
2315
+ // K-quants
2316
+ case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
2317
+ case LLAMA_FTYPE_MOSTLY_Q3_K_S:
2318
+ case LLAMA_FTYPE_MOSTLY_Q3_K_M:
2319
+ case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
2320
+ case LLAMA_FTYPE_MOSTLY_Q4_K_S:
2321
+ case LLAMA_FTYPE_MOSTLY_Q4_K_M: quantized_type = GGML_TYPE_Q4_K; break;
2322
+ case LLAMA_FTYPE_MOSTLY_Q5_K_S:
2323
+ case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
2324
+ case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
2325
+ default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
2326
+ }
2068
2327
 
2069
2328
  if (nthread <= 0) {
2070
2329
  nthread = std::thread::hardware_concurrency();
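
llama_convert_tensor_internal above splits a tensor into whole quantization blocks per thread and gives the last thread the remainder. A self-contained sketch of just that partitioning arithmetic (the tensor size, block size, and thread count are made up):

    #include <cstdio>

    int main() {
        const int nelements  = 11008 * 4096;  // illustrative tensor size
        const int block_size = 256;           // e.g. a k-quant block
        const int nthread    = 6;

        const int nblocks           = nelements / block_size;
        const int blocks_per_thread = nblocks / nthread;
        const int spare_blocks      = nblocks - blocks_per_thread * nthread; // remainder

        for (int tnum = 0; tnum < nthread; tnum++) {
            const int thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0);
            printf("thread %d: %d blocks (%d elements)\n", tnum, thr_blocks, thr_blocks * block_size);
        }
        return 0;
    }
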
@@ -2072,7 +2331,21 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
2072
2331
 
2073
2332
  std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false,
2074
2333
  /*vocab_only*/ false));
2075
- llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
2334
+ llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), params->ftype);
2335
+
2336
+ int n_attention_wv = 0;
2337
+ int n_feed_forward_w2 = 0;
2338
+ for (auto& tensor : model_loader->tensors_map.tensors) {
2339
+ if (tensor.name.find("attention.wv.weight") != std::string::npos) {
2340
+ ++n_attention_wv;
2341
+ }
2342
+ else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
2343
+ ++n_feed_forward_w2;
2344
+ }
2345
+ }
2346
+
2347
+ int i_attention_wv = 0;
2348
+ int i_feed_forward_w2 = 0;
2076
2349
 
2077
2350
  size_t total_size_org = 0;
2078
2351
  size_t total_size_new = 0;
@@ -2100,9 +2373,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
2100
2373
  quantize &= (tensor.ne.size() == 2);
2101
2374
 
2102
2375
  // uncomment this to keep the output layer in FP16
2103
- //if (tensor.name == "output.weight") {
2104
- // quantize = false;
2105
- //}
2376
+ if (!params->quantize_output_tensor && tensor.name == "output.weight") {
2377
+ quantize = false;
2378
+ }
2379
+ quantize = quantize && quantized_type != tensor.type;
2106
2380
 
2107
2381
  enum ggml_type new_type;
2108
2382
  void * new_data;
@@ -2116,20 +2390,43 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
2116
2390
  printf("size = %8.3f MB\n", tensor.size/1024.0/1024.0);
2117
2391
  } else {
2118
2392
  new_type = quantized_type;
2393
+ // TODO: temporary disabled until Metal / OpenCL support is available
2394
+ // ref: https://github.com/ggerganov/llama.cpp/issues/1711
2395
+ //if (tensor.name == "output.weight") {
2396
+ // new_type = GGML_TYPE_Q6_K;
2397
+ //}
2398
+ if (tensor.name.find("attention.wv.weight") != std::string::npos) {
2399
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
2400
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
2401
+ else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
2402
+ (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8 ||
2403
+ (i_attention_wv - n_attention_wv/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
2404
+ ++i_attention_wv;
2405
+ }
2406
+ if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
2407
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
2408
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
2409
+ else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
2410
+ (i_feed_forward_w2 < n_feed_forward_w2/8 || i_feed_forward_w2 >= 7*n_feed_forward_w2/8 ||
2411
+ (i_feed_forward_w2 - n_feed_forward_w2/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
2412
+ ++i_feed_forward_w2;
2413
+ }
2414
+ if (tensor.name.find("attention.wo.weight") != std::string::npos) {
2415
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
2416
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
2417
+ }
2418
+
2119
2419
  float * f32_data;
2120
2420
  size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
2121
2421
  llama_buffer f32_conv_buf;
2422
+
2122
2423
  if (tensor.type == GGML_TYPE_F32) {
2123
2424
  f32_data = (float *) tensor.data;
2124
- } else if (tensor.type == GGML_TYPE_F16) {
2125
- f32_conv_buf.resize(nelements * sizeof(float));
2126
- f32_data = (float *) f32_conv_buf.addr;
2127
- const auto * f16_data = (const ggml_fp16_t *) tensor.data;
2128
- for (size_t i = 0; i < nelements; i++) {
2129
- f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
2130
- }
2425
+ } else if (ggml_is_quantized(tensor.type) && !params->allow_requantize) {
2426
+ throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor.type)));
2131
2427
  } else {
2132
- throw format("type %s unsupported for integer quantization", ggml_type_name(tensor.type));
2428
+ llama_convert_tensor_internal(tensor, f32_conv_buf, nelements, nthread);
2429
+ f32_data = (float *) f32_conv_buf.addr;
2133
2430
  }
2134
2431
 
2135
2432
  printf("quantizing .. ");
@@ -2183,12 +2480,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
2183
2480
  }
2184
2481
 
2185
2482
  printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
2483
+ int64_t tot_count = 0;
2186
2484
  for (size_t i = 0; i < hist_cur.size(); i++) {
2187
2485
  hist_all[i] += hist_cur[i];
2486
+ tot_count += hist_cur[i];
2188
2487
  }
2189
2488
 
2190
- for (size_t i = 0; i < hist_cur.size(); i++) {
2191
- printf("%5.3f ", hist_cur[i] / float(nelements));
2489
+ if (tot_count > 0) {
2490
+ for (size_t i = 0; i < hist_cur.size(); i++) {
2491
+ printf("%5.3f ", hist_cur[i] / float(nelements));
2492
+ }
2192
2493
  }
2193
2494
  printf("\n");
2194
2495
  }
@@ -2206,11 +2507,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
2206
2507
  sum_all += hist_all[i];
2207
2508
  }
2208
2509
 
2209
- printf("%s: hist: ", __func__);
2210
- for (size_t i = 0; i < hist_all.size(); i++) {
2211
- printf("%5.3f ", hist_all[i] / float(sum_all));
2510
+ if (sum_all > 0) {
2511
+ printf("%s: hist: ", __func__);
2512
+ for (size_t i = 0; i < hist_all.size(); i++) {
2513
+ printf("%5.3f ", hist_all[i] / float(sum_all));
2514
+ }
2515
+ printf("\n");
2212
2516
  }
2213
- printf("\n");
2214
2517
  }
2215
2518
  }
2216
2519
 
@@ -2251,9 +2554,9 @@ struct llama_context * llama_init_from_file(
2251
2554
 
2252
2555
  ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
2253
2556
 
2254
- if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_gpu_layers, memory_type,
2255
- params.use_mmap, params.use_mlock, params.vocab_only,
2256
- params.progress_callback, params.progress_callback_user_data)) {
2557
+ if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers,
2558
+ params.main_gpu, params.tensor_split, memory_type, params.use_mmap, params.use_mlock,
2559
+ params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
2257
2560
  fprintf(stderr, "%s: failed to load model\n", __func__);
2258
2561
  llama_free(ctx);
2259
2562
  return nullptr;
@@ -2291,6 +2594,38 @@ struct llama_context * llama_init_from_file(
2291
2594
  ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
2292
2595
  }
2293
2596
 
2597
+ #ifdef GGML_USE_METAL
2598
+ if (params.n_gpu_layers > 0) {
2599
+ // this allocates all Metal resources and memory buffers
2600
+ ctx->ctx_metal = ggml_metal_init();
2601
+
2602
+ void *data_ptr = NULL;
2603
+ size_t data_size = 0;
2604
+ if (params.use_mmap) {
2605
+ data_ptr = ctx->model.mapping->addr;
2606
+ data_size= ctx->model.mapping->size;
2607
+ } else {
2608
+ data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
2609
+ data_size= ggml_get_mem_size(ctx->model.ctx);
2610
+ }
2611
+
2612
+ #define LLAMA_METAL_CHECK_BUF(result) \
2613
+ if (!(result)) { \
2614
+ fprintf(stderr, "%s: failed to add buffer\n", __func__); \
2615
+ llama_free(ctx); \
2616
+ return NULL; \
2617
+ }
2618
+
2619
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size));
2620
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size));
2621
+
2622
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size));
2623
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size));
2624
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size));
2625
+ #undef LLAMA_METAL_CHECK_BUF
2626
+ }
2627
+ #endif
2628
+
2294
2629
  return ctx;
2295
2630
  }
2296
2631
 
@@ -2301,13 +2636,12 @@ void llama_free(struct llama_context * ctx) {
2301
2636
  int llama_model_quantize(
2302
2637
  const char * fname_inp,
2303
2638
  const char * fname_out,
2304
- enum llama_ftype ftype,
2305
- int nthread) {
2639
+ const llama_model_quantize_params *params) {
2306
2640
  try {
2307
- llama_model_quantize_internal(fname_inp, fname_out, ftype, nthread);
2641
+ llama_model_quantize_internal(fname_inp, fname_out, params);
2308
2642
  return 0;
2309
- } catch (const std::string & err) {
2310
- fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str());
2643
+ } catch (const std::exception & err) {
2644
+ fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.what());
2311
2645
  return 1;
2312
2646
  }
2313
2647
  }
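
llama_model_quantize now takes a params struct instead of (ftype, nthread), so new options such as allow_requantize and quantize_output_tensor can be added without breaking the signature again. A minimal sketch using the defaults helper and requesting one of the new k-quant formats (file paths are placeholders):

    #include "llama.h"
    #include <cstdio>

    int main() {
        llama_model_quantize_params qparams = llama_model_quantize_default_params();
        qparams.ftype   = LLAMA_FTYPE_MOSTLY_Q4_K_M;  // new k-quant type in 0.2.0
        qparams.nthread = 4;                          // 0 means "use all hardware threads"

        if (llama_model_quantize("ggml-model-f16.bin", "ggml-model-q4_k_m.bin", &qparams)) {
            fprintf(stderr, "quantization failed\n");
            return 1;
        }
        return 0;
    }
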
@@ -2560,8 +2894,8 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
2560
2894
  int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
2561
2895
  try {
2562
2896
  return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads);
2563
- } catch (const std::string & err) {
2564
- fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.c_str());
2897
+ } catch (const std::exception & err) {
2898
+ fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
2565
2899
  return 1;
2566
2900
  }
2567
2901
  }
@@ -2906,7 +3240,7 @@ int llama_eval(
2906
3240
  int n_tokens,
2907
3241
  int n_past,
2908
3242
  int n_threads) {
2909
- if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads)) {
3243
+ if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads, nullptr)) {
2910
3244
  fprintf(stderr, "%s: failed to eval\n", __func__);
2911
3245
  return 1;
2912
3246
  }
@@ -2921,6 +3255,20 @@ int llama_eval(
2921
3255
  return 0;
2922
3256
  }
2923
3257
 
3258
+ int llama_eval_export(struct llama_context * ctx, const char * fname) {
3259
+ const int n_batch = 1;
3260
+ const int n_ctx = 512 - n_batch;
3261
+
3262
+ const std::vector<llama_token> tmp(n_batch, llama_token_bos());
3263
+
3264
+ if (!llama_eval_internal(*ctx, tmp.data(), tmp.size(), n_ctx, 1, fname)) {
3265
+ fprintf(stderr, "%s: failed to eval\n", __func__);
3266
+ return 1;
3267
+ }
3268
+
3269
+ return 0;
3270
+ }
3271
+
2924
3272
  int llama_tokenize(
2925
3273
  struct llama_context * ctx,
2926
3274
  const char * text,