llama_cpp 0.1.4 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -16,6 +16,10 @@
  #include "ggml-opencl.h"
  #endif

+ #ifdef GGML_USE_METAL
+ #include "ggml-metal.h"
+ #endif
+
  #include <array>
  #include <ctime>
  #include <cinttypes>
@@ -49,17 +53,22 @@ enum e_model {
  MODEL_65B,
  };

-
  static const size_t MB = 1024*1024;

  // computed for n_ctx == 2048
  // TODO: dynamically determine these sizes
  // needs modifications in ggml

+ typedef void (*offload_func_t)(struct ggml_tensor * tensor);
+
+ void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
+ (void) tensor;
+ }
+
  static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
  {
  static std::map<e_model, size_t> k_sizes = {
- { MODEL_3B, 128ull * MB },
+ { MODEL_3B, 256ull * MB },
  { MODEL_7B, 512ull * MB },
  { MODEL_13B, 512ull * MB },
  { MODEL_30B, 512ull * MB },
@@ -71,7 +80,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
  static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
  {
  static std::map<e_model, size_t> k_sizes = {
- { MODEL_3B, 128ull * MB },
+ { MODEL_3B, 256ull * MB },
  { MODEL_7B, 512ull * MB },
  { MODEL_13B, 512ull * MB },
  { MODEL_30B, 512ull * MB },
@@ -170,6 +179,7 @@ struct llama_model {
  struct ggml_tensor * output;

  std::vector<llama_layer> layers;
+ int n_gpu_layers;

  // context
  struct ggml_context * ctx = NULL;
@@ -195,6 +205,16 @@ struct llama_model {
  if (ctx) {
  ggml_free(ctx);
  }
+
+ #ifdef GGML_USE_CUBLAS
+ for (size_t i = 0; i < tensors_by_name.size(); ++i) {
+ ggml_cuda_free_data(tensors_by_name[i].second);
+ }
+ #elif defined(GGML_USE_CLBLAST)
+ for (size_t i = 0; i < tensors_by_name.size(); ++i) {
+ ggml_cl_free_data(tensors_by_name[i].second);
+ }
+ #endif
  }
  };

@@ -243,6 +263,10 @@ struct llama_context {
  llama_ctx_buffer buf_compute;
  llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];

+ #ifdef GGML_USE_METAL
+ ggml_metal_context * ctx_metal = NULL;
+ #endif
+
  int buf_last = 0;
  size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };

@@ -282,15 +306,15 @@ template <typename T>
  static T checked_mul(T a, T b) {
  T ret = a * b;
  if (a != 0 && ret / a != b) {
- throw format("overflow multiplying %llu * %llu",
- (unsigned long long) a, (unsigned long long) b);
+ throw std::runtime_error(format("overflow multiplying %llu * %llu",
+ (unsigned long long) a, (unsigned long long) b));
  }
  return ret;
  }

  static size_t checked_div(size_t a, size_t b) {
  if (b == 0 || a % b != 0) {
- throw format("error dividing %zu / %zu", a, b);
+ throw std::runtime_error(format("error dividing %zu / %zu", a, b));
  }
  return a / b;
  }
@@ -354,7 +378,7 @@ struct llama_load_tensor {
  const auto & first_shard = shards.at(0);
  for (const auto & shard : shards) {
  if (shard.type != first_shard.type) {
- throw format("inconsistent tensor shard type in '%s'", name.c_str());
+ throw std::runtime_error(format("inconsistent tensor shard type in '%s'", name.c_str()));
  }
  }
  type = first_shard.type;
@@ -377,8 +401,8 @@ struct llama_load_tensor {
  const auto & first_shard = shards.at(0);
  for (const auto & shard : shards) {
  if (shard.ne != first_shard.ne) {
- throw format("inconsistent tensor shard shape in '%s': first was %s, other was %s",
- name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str());
+ throw std::runtime_error(format("inconsistent tensor shard shape in '%s': first was %s, other was %s",
+ name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str()));
  }
  }
  ne = first_shard.ne;
@@ -456,8 +480,8 @@ struct llama_file_loader {
  }
  }

- throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
- magic, version);
+ throw std::runtime_error(format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
+ magic, version));
  }
  void read_hparams() {
  hparams.n_vocab = file.read_u32();
@@ -497,7 +521,7 @@ struct llama_file_loader {
  file.read_raw(shard.ne.data(), sizeof(shard.ne[0]) * n_dims);
  std::string name = file.read_string(name_len);
  if (n_dims < 1 || n_dims > 2) {
- throw format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims);
+ throw std::runtime_error(format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims));
  }
  switch (shard.type) {
  case GGML_TYPE_F32:
@@ -507,9 +531,14 @@
  case GGML_TYPE_Q5_0:
  case GGML_TYPE_Q5_1:
  case GGML_TYPE_Q8_0:
+ case GGML_TYPE_Q2_K:
+ case GGML_TYPE_Q3_K:
+ case GGML_TYPE_Q4_K:
+ case GGML_TYPE_Q5_K:
+ case GGML_TYPE_Q6_K:
  break;
  default: {
- throw format("unrecognized tensor type %u\n", shard.type);
+ throw std::runtime_error(format("unrecognized tensor type %u\n", shard.type));
  }
  }

@@ -582,6 +611,11 @@ struct llama_file_saver {
  case GGML_TYPE_Q5_0:
  case GGML_TYPE_Q5_1:
  case GGML_TYPE_Q8_0:
+ case GGML_TYPE_Q2_K:
+ case GGML_TYPE_Q3_K:
+ case GGML_TYPE_Q4_K:
+ case GGML_TYPE_Q5_K:
+ case GGML_TYPE_Q6_K:
  break;
  default: LLAMA_ASSERT(false);
  }
@@ -613,7 +647,7 @@ struct llama_model_loader {
  auto * ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
  file_loaders.emplace_back(ith_file);
  if (ith_file->hparams != first_file->hparams) {
- throw format("llama.cpp: hparams inconsistent between files");
+ throw std::runtime_error(format("llama.cpp: hparams inconsistent between files"));
  }
  }
  if (!llama_mmap::SUPPORTED) {
@@ -643,7 +677,7 @@ struct llama_model_loader {
  uint32_t guess_n_parts() const {
  auto it = tensors_map.name_to_idx.find("tok_embeddings.weight");
  if (it == tensors_map.name_to_idx.end()) {
- throw std::string("missing tok_embeddings.weight");
+ throw std::runtime_error(std::string("missing tok_embeddings.weight"));
  }
  const llama_load_tensor & lt = tensors_map.tensors.at(it->second);
  return file_loaders.at(0)->hparams.n_embd / lt.shards.at(0).ne.at(0);
@@ -660,12 +694,12 @@ struct llama_model_loader {
  struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
  auto it = tensors_map.name_to_idx.find(name);
  if (it == tensors_map.name_to_idx.end()) {
- throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
+ throw std::runtime_error(std::runtime_error(format("llama.cpp: tensor '%s' is missing from model", name.c_str())));
  }
  llama_load_tensor & lt = tensors_map.tensors.at(it->second);
  if (lt.ne != ne) {
- throw format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
- name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
+ throw std::runtime_error(format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
+ name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str()));
  }

  return get_tensor_for(lt, backend);
@@ -681,6 +715,7 @@ struct llama_model_loader {
  }
  ggml_set_name(tensor, lt.name.c_str());
  LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
+
  tensor->backend = backend;
  lt.ggml_tensor = tensor;
  num_ggml_tensors_created++;
@@ -689,7 +724,7 @@

  void done_getting_tensors() const {
  if (num_ggml_tensors_created != tensors_map.tensors.size()) {
- throw std::string("llama.cpp: file contained more tensors than expected");
+ throw std::runtime_error(std::string("llama.cpp: file contained more tensors than expected"));
  }
  }

@@ -833,7 +868,10 @@ static bool kv_cache_init(
  struct llama_context_params llama_context_default_params() {
  struct llama_context_params result = {
  /*.n_ctx =*/ 512,
+ /*.n_batch =*/ 512,
  /*.gpu_layers =*/ 0,
+ /*.main_gpu =*/ 0,
+ /*.tensor_split =*/ {0},
  /*.seed =*/ -1,
  /*.f16_kv =*/ true,
  /*.logits_all =*/ false,
@@ -848,6 +886,17 @@ struct llama_context_params llama_context_default_params() {
  return result;
  }

+ struct llama_model_quantize_params llama_model_quantize_default_params() {
+ struct llama_model_quantize_params result = {
+ /*.nthread =*/ 0,
+ /*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
+ /*.allow_requantize =*/ false,
+ /*.quantize_output_tensor =*/ true,
+ };
+
+ return result;
+ }
+
  bool llama_mmap_supported() {
  return llama_mmap::SUPPORTED;
  }
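Note on the two hunks directly above: llama_context_params gains n_batch, main_gpu, and tensor_split, and a new llama_model_quantize_default_params() helper is introduced. A minimal, hedged sketch of how a 0.2.0 caller might pick up these defaults (field names follow the initializer comments shown in the diff; anything not shown there is an assumption):

    // Sketch only, not part of the diff.
    struct llama_context_params cparams = llama_context_default_params();
    cparams.n_batch  = 512;   // new: prompt-processing batch size (also sizes the CUDA scratch buffer)
    cparams.main_gpu = 0;     // new: primary device, passed to ggml_cuda_set_main_device() in cuBLAS builds
    // cparams.tensor_split[] (new) controls how weights are split across multiple GPUs

    struct llama_model_quantize_params qparams = llama_model_quantize_default_params();
    // defaults shown in the diff: nthread = 0, ftype = LLAMA_FTYPE_MOSTLY_Q5_1,
    // allow_requantize = false, quantize_output_tensor = true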
@@ -898,6 +947,16 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
  case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
  case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
  case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
+ // K-quants
+ case LLAMA_FTYPE_MOSTLY_Q2_K: return "mostly Q2_K";
+ case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "mostly Q3_K - Small";
+ case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "mostly Q3_K - Medium";
+ case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "mostly Q3_K - Large";
+ case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "mostly Q4_K - Small";
+ case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "mostly Q4_K - Medium";
+ case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "mostly Q5_K - Small";
+ case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "mostly Q5_K - Medium";
+ case LLAMA_FTYPE_MOSTLY_Q6_K: return "mostly Q6_K";
  default: return "unknown, may not work";
  }
  }
@@ -917,7 +976,10 @@ static void llama_model_load_internal(
  const std::string & fname,
  llama_context & lctx,
  int n_ctx,
+ int n_batch,
  int n_gpu_layers,
+ int main_gpu,
+ const float * tensor_split,
  ggml_type memory_type,
  bool use_mmap,
  bool use_mlock,
@@ -932,9 +994,9 @@
  lctx.vocab = std::move(ml->file_loaders.at(0)->vocab);
  auto & model = lctx.model;
  model.hparams = ml->file_loaders.at(0)->hparams;
+ model.n_gpu_layers = n_gpu_layers;
  llama_file_version file_version = ml->file_loaders.at(0)->file_version;
  auto & hparams = model.hparams;
- uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;

  {
  switch (hparams.n_layer) {
@@ -948,6 +1010,8 @@
  hparams.n_ctx = n_ctx;
  }

+ const uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
+
  {
  fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version));
  fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab);
@@ -967,7 +1031,7 @@
  if (hparams.ftype != LLAMA_FTYPE_ALL_F32 &&
  hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 &&
  hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) {
- throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)");
+ throw std::runtime_error(format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)"));
  }
  }

@@ -975,7 +1039,7 @@
  if (hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
  hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_1 ||
  hparams.ftype == LLAMA_FTYPE_MOSTLY_Q8_0) {
- throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)");
+ throw std::runtime_error(format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)"));
  }
  }

@@ -1006,18 +1070,28 @@

  model.ctx = ggml_init(params);
  if (!model.ctx) {
- throw format("ggml_init() failed");
+ throw std::runtime_error(format("ggml_init() failed"));
  }
  }

- #ifdef GGML_USE_CUBLAS
- #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CUDA
+ (void) main_gpu;
+ #if defined(GGML_USE_CUBLAS)
+ fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__);
+ ggml_cuda_set_main_device(main_gpu);
+ #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
+ #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
+ #elif defined(GGML_USE_CLBLAST)
+ fprintf(stderr, "%s: using OpenCL for GPU acceleration\n", __func__);
+ #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
+ #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU
  #else
- #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
+ #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
+ #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_CPU
  #endif

  // prepare memory for the weights
- size_t vram_total = 0;
+ size_t vram_weights = 0;
+ size_t vram_scratch = 0;
  {
  const uint32_t n_embd = hparams.n_embd;
  const uint32_t n_layer = hparams.n_layer;
@@ -1032,7 +1106,7 @@
  {
  ggml_backend backend_output;
  if (n_gpu_layers > int(n_layer)) { // NOLINT
- backend_output = LLAMA_BACKEND_OFFLOAD;
+ backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
  } else {
  backend_output = GGML_BACKEND_CPU;
  }
@@ -1044,7 +1118,8 @@

  model.layers.resize(n_layer);
  for (uint32_t i = 0; i < n_layer; ++i) {
- const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+ const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+ const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT

  auto & layer = model.layers[i];

@@ -1052,19 +1127,19 @@

  layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);

- layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend);
- layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend);
- layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend);
- layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend);
+ layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend_split);
+ layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend_split);
+ layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend_split);
+ layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend_split);

  layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);

- layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend);
- layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend);
- layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend);
+ layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend_split);
+ layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend_split);
+ layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend_split);

- if (backend == GGML_BACKEND_CUDA) {
- vram_total +=
+ if (backend == GGML_BACKEND_GPU) {
+ vram_weights +=
  ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
  ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.attention_norm) +
  ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
@@ -1081,10 +1156,10 @@
  // this is the total memory required to run the inference
  const size_t mem_required =
  ctx_size +
- mmapped_size - vram_total + // weights in VRAM not in memory
+ mmapped_size - vram_weights + // weights in VRAM not in memory
  MEM_REQ_SCRATCH0().at(model.type) +
  MEM_REQ_SCRATCH1().at(model.type) +
- MEM_REQ_EVAL().at(model.type);
+ MEM_REQ_EVAL().at (model.type);

  // this is the memory required by one llama_state
  const size_t mem_required_state =
@@ -1093,15 +1168,25 @@
  fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
  mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);

+ (void) vram_scratch;
  #ifdef GGML_USE_CUBLAS
+ vram_scratch = n_batch * MB;
+ ggml_cuda_set_scratch_size(vram_scratch);
+ if (n_gpu_layers > 0) {
+ fprintf(stderr, "%s: allocating batch_size x 1 MB = %ld MB VRAM for the scratch buffer\n",
+ __func__, vram_scratch / MB);
+ }
+ #endif // GGML_USE_CUBLAS
+ #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
  const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));

- fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
+ fprintf(stderr, "%s: offloading %d layers to GPU\n", __func__, n_gpu);
  if (n_gpu_layers > (int) hparams.n_layer) {
- fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
+ fprintf(stderr, "%s: offloading output layer to GPU\n", __func__);
  }
- fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
- #elif !defined(GGML_USE_CLBLAST)
+ fprintf(stderr, "%s: total VRAM used: %zu MB\n",
+ __func__, (vram_weights + vram_scratch + MB - 1) / MB); // round up
+ #else
  (void) n_gpu_layers;
  #endif
  }
@@ -1113,8 +1198,10 @@

  ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);

- #ifdef GGML_USE_CUBLAS
+ #if defined(GGML_USE_CUBLAS)
  {
+ ggml_cuda_set_tensor_split(tensor_split);
+
  size_t done_size = 0;
  size_t data_size = 0;
  for (llama_load_tensor & lt : ml->tensors_map.tensors) {
@@ -1124,7 +1211,8 @@
  }
  }
  for (llama_load_tensor & lt : ml->tensors_map.tensors) {
- if (lt.ggml_tensor->backend != GGML_BACKEND_CUDA) {
+ ggml_backend backend = lt.ggml_tensor->backend;
+ if (backend != GGML_BACKEND_GPU && backend != GGML_BACKEND_GPU_SPLIT) {
  continue;
  }
  if (progress_callback) {
@@ -1136,30 +1224,28 @@
  }
  #elif defined(GGML_USE_CLBLAST)
  {
- const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
-
- fprintf(stderr, "ggml_opencl: offloading %d layers to GPU\n", n_gpu);
-
- size_t vram_total = 0;
-
- for (int i = 0; i < n_gpu; ++i) {
- const auto & layer = model.layers[i];
-
- ggml_cl_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
- ggml_cl_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
- ggml_cl_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
- ggml_cl_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
- ggml_cl_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
- ggml_cl_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
- ggml_cl_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
+ size_t done_size = 0;
+ size_t data_size = 0;
+ for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+ data_size += lt.size;
+ if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
+ done_size += lt.size;
+ }
  }
- if (n_gpu_layers > (int) hparams.n_layer) {
- fprintf(stderr, "ggml_opencl: offloading output layer to GPU\n");
- ggml_cl_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
+ for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+ if (lt.ggml_tensor->backend != GGML_BACKEND_GPU) {
+ continue;
+ }
+ if (progress_callback) {
+ progress_callback((float) done_size / data_size, progress_callback_user_data);
+ }
+ ggml_cl_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
+ done_size += lt.size;
  }
-
- fprintf(stderr, "ggml_opencl: total VRAM used: %zu MB\n", vram_total / 1024 / 1024);
  }
+ #else
+ (void) n_batch;
+ (void) tensor_split;
  #endif

  if (progress_callback) {
@@ -1177,7 +1263,10 @@ static bool llama_model_load(
  const std::string & fname,
  llama_context & lctx,
  int n_ctx,
+ int n_batch,
  int n_gpu_layers,
+ int main_gpu,
+ float * tensor_split,
  ggml_type memory_type,
  bool use_mmap,
  bool use_mlock,
@@ -1185,28 +1274,30 @@
  llama_progress_callback progress_callback,
  void *progress_callback_user_data) {
  try {
- llama_model_load_internal(fname, lctx, n_ctx, n_gpu_layers, memory_type, use_mmap, use_mlock,
- vocab_only, progress_callback, progress_callback_user_data);
+ llama_model_load_internal(fname, lctx, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, memory_type,
+ use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
  return true;
- } catch (const std::string & err) {
- fprintf(stderr, "error loading model: %s\n", err.c_str());
+ } catch (const std::exception & err) {
+ fprintf(stderr, "error loading model: %s\n", err.what());
  return false;
  }
  }

  // evaluate the transformer
  //
- // - lctx: llama context
- // - tokens: new batch of tokens to process
- // - n_past: the context size so far
- // - n_threads: number of threads to use
+ // - lctx: llama context
+ // - tokens: new batch of tokens to process
+ // - n_past: the context size so far
+ // - n_threads: number of threads to use
+ // - cgraph_fname: filename of the exported computation graph
  //
  static bool llama_eval_internal(
- llama_context & lctx,
- const llama_token * tokens,
- const int n_tokens,
- const int n_past,
- const int n_threads) {
+ llama_context & lctx,
+ const llama_token * tokens,
+ const int n_tokens,
+ const int n_past,
+ const int n_threads,
+ const char * cgraph_fname) {

  // enforce that the first token is BOS
  if (n_past == 0 && tokens[0] != llama_token_bos()) {
@@ -1225,12 +1316,13 @@

  LLAMA_ASSERT(!!kv_self.ctx);

- const int n_embd = hparams.n_embd;
- const int n_layer = hparams.n_layer;
- const int n_ctx = hparams.n_ctx;
- const int n_head = hparams.n_head;
- const int n_vocab = hparams.n_vocab;
- const int n_rot = hparams.n_embd/hparams.n_head;
+ const int n_embd = hparams.n_embd;
+ const int n_layer = hparams.n_layer;
+ const int n_ctx = hparams.n_ctx;
+ const int n_head = hparams.n_head;
+ const int n_vocab = hparams.n_vocab;
+ const int n_rot = hparams.n_embd/hparams.n_head;
+ const int n_gpu_layers = model.n_gpu_layers;

  auto & mem_per_token = lctx.mem_per_token;
  auto & buf_compute = lctx.buf_compute;
@@ -1252,40 +1344,66 @@
  ggml_set_name(embd, "embd");
  memcpy(embd->data, tokens, N*ggml_element_size(embd));

+ struct ggml_tensor * cur;
  struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);

+ const int i_gpu_start = n_layer - n_gpu_layers;
+ (void) i_gpu_start;
+
  for (int il = 0; il < n_layer; ++il) {
- struct ggml_tensor * inpSA = inpL;
+ offload_func_t offload_func = llama_nop;

- struct ggml_tensor * cur;
+ #ifdef GGML_USE_CUBLAS
+ if (il >= i_gpu_start) {
+ offload_func = ggml_cuda_assign_buffers; // sets the output backend to GPU
+ }
+ #endif // GGML_USE_CUBLAS
+
+ struct ggml_tensor * inpSA = inpL;

  lctx.use_buf(ctx0, 0);

  // norm
  {
  cur = ggml_rms_norm(ctx0, inpL);
+ offload_func(cur);
+ ggml_set_name(cur, "rms_norm_0");

  // cur = cur*attention_norm(broadcasted)
  cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm);
+ offload_func(cur);
+ ggml_set_name(cur, "attention_norm_0");
  }

  // self-attention
  {
  // compute Q and K and RoPE them
- struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
- struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
- ggml_set_name(Qcur, "Qcur");
+ struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+ // offload_func(tmpq);
+ ggml_set_name(tmpq, "tmpq");
+
+ struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+ // offload_func(tmpk);
+ ggml_set_name(tmpk, "tmpk");
+
+ struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0);
  ggml_set_name(Kcur, "Kcur");

+ struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0);
+ ggml_set_name(Qcur, "Qcur");
+
  // store key and value to memory
  {
  // compute the transposed [N, n_embd] V matrix
  struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), n_embd, N));
+ ggml_set_name(Vcur, "Vcur");

  struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
+ ggml_set_name(k, "k");
  struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
  ( n_ctx)*ggml_element_size(kv_self.v),
  (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
+ ggml_set_name(v, "v");

  // important: storing RoPE-ed version of K in the KV cache!
  ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
@@ -1326,7 +1444,6 @@
  struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
  ggml_set_name(KQ_soft_max, "KQ_soft_max");

-
  // split cached V into n_head heads
  struct ggml_tensor * V =
  ggml_view_3d(ctx0, kv_self.v,
@@ -1361,73 +1478,143 @@
  cur = ggml_mul_mat(ctx0,
  model.layers[il].wo,
  cur);
+ offload_func(cur);
+ ggml_set_name(cur, "result_wo");
  }

  lctx.use_buf(ctx0, 1);
+ //ggml_cuda_set_scratch(1);

  struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
+ offload_func(inpFF);
+ ggml_set_name(inpFF, "inpFF");

  // feed-forward network
  {
  // norm
  {
  cur = ggml_rms_norm(ctx0, inpFF);
+ offload_func(cur);
+ ggml_set_name(cur, "rms_norm_1");

  // cur = cur*ffn_norm(broadcasted)
  cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
+ offload_func(cur);
+ ggml_set_name(cur, "ffn_norm");
  }

  struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
  model.layers[il].w3,
  cur);
+ offload_func(tmp);
+ ggml_set_name(tmp, "result_w3");

  cur = ggml_mul_mat(ctx0,
  model.layers[il].w1,
  cur);
+ offload_func(cur);
+ ggml_set_name(cur, "result_w2");

  // SILU activation
  cur = ggml_silu(ctx0, cur);
+ offload_func(cur);
+ ggml_set_name(cur, "silu");

  cur = ggml_mul(ctx0, cur, tmp);
+ offload_func(cur);
+ ggml_set_name(cur, "silu_x_result_w3");

  cur = ggml_mul_mat(ctx0,
  model.layers[il].w2,
  cur);
+ offload_func(cur);
+ ggml_set_name(cur, "result_w2");
  }

  cur = ggml_add(ctx0, cur, inpFF);
+ offload_func(cur);
+ ggml_set_name(cur, "inpFF_+_result_w2");

  // input for next layer
  inpL = cur;
+
  }

  lctx.use_buf(ctx0, 0);
+ //ggml_cuda_set_scratch(0);

  // used at the end to optionally extract the embeddings
  struct ggml_tensor * embeddings = NULL;

+ offload_func_t offload_func = llama_nop;
+
+ #ifdef GGML_USE_CUBLAS
+ if (n_gpu_layers > n_layer) {
+ offload_func = ggml_cuda_assign_buffers; // sets the output backend to GPU
+ }
+ #endif // GGML_USE_CUBLAS
+
  // norm
  {
+ cur = ggml_rms_norm(ctx0, inpL);
+ offload_func(cur);
+ ggml_set_name(cur, "rms_norm_inpL");

- inpL = ggml_rms_norm(ctx0, inpL);
+ cur = ggml_rms_norm(ctx0, cur);
+ offload_func(cur);
+ ggml_set_name(cur, "rms_norm_after");

- // inpL = inpL*norm(broadcasted)
- inpL = ggml_mul(ctx0, inpL, model.norm);
+ // cur = cur*norm(broadcasted)
+ cur = ggml_mul(ctx0, cur, model.norm);
+ offload_func(cur);
+ ggml_set_name(cur, "result_norm");

- embeddings = inpL;
+ embeddings = cur;
  }

+
  // lm_head
- inpL = ggml_mul_mat(ctx0, model.output, inpL);
+ cur = ggml_mul_mat(ctx0, model.output, cur);
+ ggml_set_name(cur, "result_output");

  lctx.use_buf(ctx0, -1);

  // logits -> probs
- //inpL = ggml_soft_max_inplace(ctx0, inpL);
+ //cur = ggml_soft_max_inplace(ctx0, cur);

  // run the computation
- ggml_build_forward_expand(&gf, inpL);
- ggml_graph_compute (ctx0, &gf);
+ ggml_build_forward_expand(&gf, cur);
+
+ #ifdef GGML_USE_METAL
+ if (lctx.ctx_metal && N == 1) {
+ ggml_metal_graph_compute(lctx.ctx_metal, &gf);
+ ggml_metal_get_tensor (lctx.ctx_metal, cur);
+ } else {
+ // IMPORTANT:
+ // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla
+ // ggml_graph_compute(). It uses Apple's Accelerate CBLAS API which takes advantage of the ANE or the AMX
+ // coprocessor.
+ //
+ // When we implement Matrix x Matrix Metal multiplication, we can avoid this branch.
+ // But for now, we have focused only on Matrix x Vector Metal multiplication.
+ //
+ // TODO: avoid these syncs via shared memory (ref #1696)
+ //
+ if (lctx.ctx_metal) {
+ // We need to sync the GPU KV cache with the CPU KV cache
+ ggml_metal_get_tensor(lctx.ctx_metal, kv_self.k);
+ ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
+ }
+
+ ggml_graph_compute(ctx0, &gf);
+ }
+ #else
+ ggml_graph_compute(ctx0, &gf);
+ #endif
+
+ if (cgraph_fname) {
+ ggml_graph_export(&gf, cgraph_fname);
+ }

  #ifdef GGML_PERF
  // print timing information per ggml operation (for debugging purposes)
@@ -1441,7 +1628,7 @@
  //}

  //embd_w.resize(n_vocab*N);
- //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
+ //memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N);

  // update kv token count
  lctx.model.kv_self.n = n_past + N;
@@ -1452,11 +1639,11 @@

  if (lctx.logits_all) {
  logits_out.resize(n_vocab * N);
- memcpy(logits_out.data(), (float *) ggml_get_data(inpL), sizeof(float)*n_vocab*N);
+ memcpy(logits_out.data(), (float *) ggml_get_data(cur), sizeof(float)*n_vocab*N);
  } else {
  // return result for just the last token
  logits_out.resize(n_vocab);
- memcpy(logits_out.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
+ memcpy(logits_out.data(), (float *) ggml_get_data(cur) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
  }
  }

@@ -2055,16 +2242,88 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
  // quantization
  //

- static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype, int nthread) {
+ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llama_buffer & output, const int nelements, const int nthread) {
+ if (output.size < nelements * sizeof(float)) {
+ output.resize(nelements * sizeof(float));
+ }
+ float * f32_output = (float *) output.addr;
+
+ quantize_fns_t qtype;
+ if (ggml_is_quantized(tensor.type)) {
+ qtype = ggml_internal_get_quantize_fn(tensor.type);
+ if (qtype.dequantize_row_q == NULL) {
+ throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor.type)));
+ }
+ } else if (tensor.type != GGML_TYPE_F16) {
+ throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor.type)));
+ }
+
+ if (nthread < 2) {
+ if (tensor.type == GGML_TYPE_F16) {
+ ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor.data, f32_output, nelements);
+ } else if (ggml_is_quantized(tensor.type)) {
+ qtype.dequantize_row_q(tensor.data, f32_output, nelements);
+ } else {
+ LLAMA_ASSERT(false); // unreachable
+ }
+ return;
+ }
+
+ auto block_size = tensor.type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor.type);
+ auto block_size_bytes = ggml_type_size(tensor.type);
+
+ LLAMA_ASSERT(nelements % block_size == 0);
+ auto nblocks = nelements / block_size;
+ auto blocks_per_thread = nblocks / nthread;
+ auto spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count
+
+ std::vector<std::thread> workers;
+ for (auto tnum = 0, in_buff_offs = 0, out_buff_offs = 0; tnum < nthread; tnum++) {
+ auto thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
+ auto thr_elems = thr_blocks * block_size; // number of elements for this thread
+ auto thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread
+
+ auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
+ if (typ == GGML_TYPE_F16) {
+ ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
+ } else {
+ qtype.dequantize_row_q(inbuf, outbuf, nels);
+ }
+ };
+ workers.push_back(std::thread(compute, tensor.type, tensor.data + in_buff_offs, f32_output + out_buff_offs, thr_elems));
+ in_buff_offs += thr_block_bytes;
+ out_buff_offs += thr_elems;
+ }
+ for (auto & worker : workers) {
+ worker.join();
+ }
+
+ }
+
+ static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
  ggml_type quantized_type;
- switch (ftype) {
+ llama_ftype ftype = params->ftype;
+ int nthread = params->nthread;
+
+ switch (params->ftype) {
  case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
  case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
  case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
  case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
  case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
- default: throw format("invalid output file type %d\n", ftype);
- };
+
+ // K-quants
+ case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
+ case LLAMA_FTYPE_MOSTLY_Q3_K_S:
+ case LLAMA_FTYPE_MOSTLY_Q3_K_M:
+ case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
+ case LLAMA_FTYPE_MOSTLY_Q4_K_S:
+ case LLAMA_FTYPE_MOSTLY_Q4_K_M: quantized_type = GGML_TYPE_Q4_K; break;
+ case LLAMA_FTYPE_MOSTLY_Q5_K_S:
+ case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
+ case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
+ default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
+ }

  if (nthread <= 0) {
  nthread = std::thread::hardware_concurrency();
@@ -2072,7 +2331,21 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

  std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false,
  /*vocab_only*/ false));
- llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
+ llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), params->ftype);
+
+ int n_attention_wv = 0;
+ int n_feed_forward_w2 = 0;
+ for (auto& tensor : model_loader->tensors_map.tensors) {
+ if (tensor.name.find("attention.wv.weight") != std::string::npos) {
+ ++n_attention_wv;
+ }
+ else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
+ ++n_feed_forward_w2;
+ }
+ }
+
+ int i_attention_wv = 0;
+ int i_feed_forward_w2 = 0;

  size_t total_size_org = 0;
  size_t total_size_new = 0;
@@ -2100,9 +2373,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  quantize &= (tensor.ne.size() == 2);

  // uncomment this to keep the output layer in FP16
- //if (tensor.name == "output.weight") {
- // quantize = false;
- //}
+ if (!params->quantize_output_tensor && tensor.name == "output.weight") {
+ quantize = false;
+ }
+ quantize = quantize && quantized_type != tensor.type;

  enum ggml_type new_type;
  void * new_data;
@@ -2116,20 +2390,43 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  printf("size = %8.3f MB\n", tensor.size/1024.0/1024.0);
  } else {
  new_type = quantized_type;
+ // TODO: temporary disabled until Metal / OpenCL support is available
+ // ref: https://github.com/ggerganov/llama.cpp/issues/1711
+ //if (tensor.name == "output.weight") {
+ // new_type = GGML_TYPE_Q6_K;
+ //}
+ if (tensor.name.find("attention.wv.weight") != std::string::npos) {
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+ else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
+ (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8 ||
+ (i_attention_wv - n_attention_wv/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
+ ++i_attention_wv;
+ }
+ if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+ else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
+ (i_feed_forward_w2 < n_feed_forward_w2/8 || i_feed_forward_w2 >= 7*n_feed_forward_w2/8 ||
+ (i_feed_forward_w2 - n_feed_forward_w2/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
+ ++i_feed_forward_w2;
+ }
+ if (tensor.name.find("attention.wo.weight") != std::string::npos) {
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+ }
+
  float * f32_data;
  size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
  llama_buffer f32_conv_buf;
+
  if (tensor.type == GGML_TYPE_F32) {
  f32_data = (float *) tensor.data;
- } else if (tensor.type == GGML_TYPE_F16) {
- f32_conv_buf.resize(nelements * sizeof(float));
- f32_data = (float *) f32_conv_buf.addr;
- const auto * f16_data = (const ggml_fp16_t *) tensor.data;
- for (size_t i = 0; i < nelements; i++) {
- f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
- }
+ } else if (ggml_is_quantized(tensor.type) && !params->allow_requantize) {
+ throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor.type)));
  } else {
- throw format("type %s unsupported for integer quantization", ggml_type_name(tensor.type));
+ llama_convert_tensor_internal(tensor, f32_conv_buf, nelements, nthread);
+ f32_data = (float *) f32_conv_buf.addr;
  }

  printf("quantizing .. ");
@@ -2183,12 +2480,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  }

  printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
+ int64_t tot_count = 0;
  for (size_t i = 0; i < hist_cur.size(); i++) {
  hist_all[i] += hist_cur[i];
+ tot_count += hist_cur[i];
  }

- for (size_t i = 0; i < hist_cur.size(); i++) {
- printf("%5.3f ", hist_cur[i] / float(nelements));
+ if (tot_count > 0) {
+ for (size_t i = 0; i < hist_cur.size(); i++) {
+ printf("%5.3f ", hist_cur[i] / float(nelements));
+ }
  }
  printf("\n");
  }
@@ -2206,11 +2507,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  sum_all += hist_all[i];
  }

- printf("%s: hist: ", __func__);
- for (size_t i = 0; i < hist_all.size(); i++) {
- printf("%5.3f ", hist_all[i] / float(sum_all));
+ if (sum_all > 0) {
+ printf("%s: hist: ", __func__);
+ for (size_t i = 0; i < hist_all.size(); i++) {
+ printf("%5.3f ", hist_all[i] / float(sum_all));
+ }
+ printf("\n");
  }
- printf("\n");
  }
  }

@@ -2251,9 +2554,9 @@ struct llama_context * llama_init_from_file(

  ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;

- if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_gpu_layers, memory_type,
- params.use_mmap, params.use_mlock, params.vocab_only,
- params.progress_callback, params.progress_callback_user_data)) {
+ if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers,
+ params.main_gpu, params.tensor_split, memory_type, params.use_mmap, params.use_mlock,
+ params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
  fprintf(stderr, "%s: failed to load model\n", __func__);
  llama_free(ctx);
  return nullptr;
@@ -2291,6 +2594,38 @@ struct llama_context * llama_init_from_file(
  ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
  }

+ #ifdef GGML_USE_METAL
+ if (params.n_gpu_layers > 0) {
+ // this allocates all Metal resources and memory buffers
+ ctx->ctx_metal = ggml_metal_init();
+
+ void *data_ptr = NULL;
+ size_t data_size = 0;
+ if (params.use_mmap) {
+ data_ptr = ctx->model.mapping->addr;
+ data_size= ctx->model.mapping->size;
+ } else {
+ data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
+ data_size= ggml_get_mem_size(ctx->model.ctx);
+ }
+
+ #define LLAMA_METAL_CHECK_BUF(result) \
+ if (!(result)) { \
+ fprintf(stderr, "%s: failed to add buffer\n", __func__); \
+ llama_free(ctx); \
+ return NULL; \
+ }
+
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size));
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size));
+
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size));
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size));
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size));
+ #undef LLAMA_METAL_CHECK_BUF
+ }
+ #endif
+
  return ctx;
  }
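The hunk above makes llama_init_from_file set up a Metal context whenever the library is built with GGML_USE_METAL and n_gpu_layers is non-zero, registering the weight, eval, KV-cache, and scratch buffers with it. A hedged usage sketch (the build flag and model path are assumptions, not part of the diff):

    // Sketch only, not part of the diff: opting into Metal on an Apple-silicon build.
    struct llama_context_params params = llama_context_default_params();
    params.n_gpu_layers = 1; // any value > 0 triggers ggml_metal_init() inside llama_init_from_file
    struct llama_context * ctx = llama_init_from_file("ggml-model-q4_0.bin", params);
    if (ctx == NULL) {
        // initialization failed (for example, a Metal buffer could not be added)
    }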

@@ -2301,13 +2636,12 @@ void llama_free(struct llama_context * ctx) {
  int llama_model_quantize(
  const char * fname_inp,
  const char * fname_out,
- enum llama_ftype ftype,
- int nthread) {
+ const llama_model_quantize_params *params) {
  try {
- llama_model_quantize_internal(fname_inp, fname_out, ftype, nthread);
+ llama_model_quantize_internal(fname_inp, fname_out, params);
  return 0;
- } catch (const std::string & err) {
- fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str());
+ } catch (const std::exception & err) {
+ fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.what());
  return 1;
  }
  }
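With this hunk the public llama_model_quantize entry point takes a llama_model_quantize_params pointer instead of the old (ftype, nthread) pair, so 0.1.x callers need a small update. A hedged sketch of the new call (file names are placeholders):

    // Sketch only, not part of the diff: quantizing with the new params struct.
    llama_model_quantize_params qparams = llama_model_quantize_default_params();
    qparams.ftype   = LLAMA_FTYPE_MOSTLY_Q4_K_M; // one of the K-quant types added in this release
    qparams.nthread = 4;                         // <= 0 falls back to std::thread::hardware_concurrency()
    if (llama_model_quantize("ggml-model-f16.bin", "ggml-model-q4_k_m.bin", &qparams) != 0) {
        // returns 1 on failure (see the catch block above)
    }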
@@ -2560,8 +2894,8 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
  int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
  try {
  return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads);
- } catch (const std::string & err) {
- fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.c_str());
+ } catch (const std::exception & err) {
+ fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
  return 1;
  }
  }
@@ -2906,7 +3240,7 @@ int llama_eval(
  int n_tokens,
  int n_past,
  int n_threads) {
- if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads)) {
+ if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads, nullptr)) {
  fprintf(stderr, "%s: failed to eval\n", __func__);
  return 1;
  }
@@ -2921,6 +3255,20 @@
  return 0;
  }

+ int llama_eval_export(struct llama_context * ctx, const char * fname) {
+ const int n_batch = 1;
+ const int n_ctx = 512 - n_batch;
+
+ const std::vector<llama_token> tmp(n_batch, llama_token_bos());
+
+ if (!llama_eval_internal(*ctx, tmp.data(), tmp.size(), n_ctx, 1, fname)) {
+ fprintf(stderr, "%s: failed to eval\n", __func__);
+ return 1;
+ }
+
+ return 0;
+ }
+
  int llama_tokenize(
  struct llama_context * ctx,
  const char * text,
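The last complete hunk above adds llama_eval_export, which runs a single-token eval and writes the computation graph through ggml_graph_export. A hedged sketch of calling it on an already-initialized context (the output file name is a placeholder):

    // Sketch only, not part of the diff: exporting the eval graph of a loaded context.
    if (llama_eval_export(ctx, "llama.ggml") != 0) {
        // export failed
    }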