llama_cpp 0.1.3 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -16,6 +16,10 @@
  #include "ggml-opencl.h"
  #endif

+ #ifdef GGML_USE_METAL
+ #include "ggml-metal.h"
+ #endif
+
  #include <array>
  #include <ctime>
  #include <cinttypes>
@@ -42,22 +46,29 @@
  // available llama models
  enum e_model {
  MODEL_UNKNOWN,
+ MODEL_3B,
  MODEL_7B,
  MODEL_13B,
  MODEL_30B,
  MODEL_65B,
  };

-
  static const size_t MB = 1024*1024;

  // computed for n_ctx == 2048
  // TODO: dynamically determine these sizes
  // needs modifications in ggml

+ typedef void (*offload_func_t)(struct ggml_tensor * tensor);
+
+ void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
+ (void) tensor;
+ }
+
  static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
  {
  static std::map<e_model, size_t> k_sizes = {
+ { MODEL_3B, 256ull * MB },
  { MODEL_7B, 512ull * MB },
  { MODEL_13B, 512ull * MB },
  { MODEL_30B, 512ull * MB },
@@ -69,6 +80,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
  static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
  {
  static std::map<e_model, size_t> k_sizes = {
+ { MODEL_3B, 256ull * MB },
  { MODEL_7B, 512ull * MB },
  { MODEL_13B, 512ull * MB },
  { MODEL_30B, 512ull * MB },
@@ -81,6 +93,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
  static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
  {
  static std::map<e_model, size_t> k_sizes = {
+ { MODEL_3B, 682ull * MB },
  { MODEL_7B, 1026ull * MB },
  { MODEL_13B, 1608ull * MB },
  { MODEL_30B, 3124ull * MB },
@@ -94,6 +107,7 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
  static const std::map<e_model, size_t> & MEM_REQ_EVAL()
  {
  static std::map<e_model, size_t> k_sizes = {
+ { MODEL_3B, 512ull * MB },
  { MODEL_7B, 768ull * MB },
  { MODEL_13B, 1024ull * MB },
  { MODEL_30B, 1280ull * MB },
@@ -165,6 +179,7 @@ struct llama_model {
  struct ggml_tensor * output;

  std::vector<llama_layer> layers;
+ int n_gpu_layers;

  // context
  struct ggml_context * ctx = NULL;
@@ -190,6 +205,16 @@ struct llama_model {
  if (ctx) {
  ggml_free(ctx);
  }
+
+ #ifdef GGML_USE_CUBLAS
+ for (size_t i = 0; i < tensors_by_name.size(); ++i) {
+ ggml_cuda_free_data(tensors_by_name[i].second);
+ }
+ #elif defined(GGML_USE_CLBLAST)
+ for (size_t i = 0; i < tensors_by_name.size(); ++i) {
+ ggml_cl_free_data(tensors_by_name[i].second);
+ }
+ #endif
  }
  };

@@ -238,6 +263,10 @@ struct llama_context {
  llama_ctx_buffer buf_compute;
  llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];

+ #ifdef GGML_USE_METAL
+ ggml_metal_context * ctx_metal = NULL;
+ #endif
+
  int buf_last = 0;
  size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };

@@ -277,15 +306,15 @@ template <typename T>
  static T checked_mul(T a, T b) {
  T ret = a * b;
  if (a != 0 && ret / a != b) {
- throw format("overflow multiplying %llu * %llu",
- (unsigned long long) a, (unsigned long long) b);
+ throw std::runtime_error(format("overflow multiplying %llu * %llu",
+ (unsigned long long) a, (unsigned long long) b));
  }
  return ret;
  }

  static size_t checked_div(size_t a, size_t b) {
  if (b == 0 || a % b != 0) {
- throw format("error dividing %zu / %zu", a, b);
+ throw std::runtime_error(format("error dividing %zu / %zu", a, b));
  }
  return a / b;
  }
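The two helpers above show the error-handling change that runs through this whole release: loader failures now throw std::runtime_error instead of bare std::string values, and the catch sites later in this diff switch from catch (const std::string &) to catch (const std::exception &). A minimal hedged sketch of the resulting caller pattern (rows and cols are illustrative values, not identifiers from the diff):

try {
    const size_t n = checked_mul<size_t>(rows, cols); // throws std::runtime_error on overflow
    // ... use n ...
} catch (const std::exception & err) {
    fprintf(stderr, "error: %s\n", err.what());
}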
@@ -349,7 +378,7 @@ struct llama_load_tensor {
  const auto & first_shard = shards.at(0);
  for (const auto & shard : shards) {
  if (shard.type != first_shard.type) {
- throw format("inconsistent tensor shard type in '%s'", name.c_str());
+ throw std::runtime_error(format("inconsistent tensor shard type in '%s'", name.c_str()));
  }
  }
  type = first_shard.type;
@@ -372,8 +401,8 @@ struct llama_load_tensor {
  const auto & first_shard = shards.at(0);
  for (const auto & shard : shards) {
  if (shard.ne != first_shard.ne) {
- throw format("inconsistent tensor shard shape in '%s': first was %s, other was %s",
- name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str());
+ throw std::runtime_error(format("inconsistent tensor shard shape in '%s': first was %s, other was %s",
+ name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str()));
  }
  }
  ne = first_shard.ne;
@@ -451,8 +480,8 @@ struct llama_file_loader {
  }
  }

- throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
- magic, version);
+ throw std::runtime_error(format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
+ magic, version));
  }
  void read_hparams() {
  hparams.n_vocab = file.read_u32();
@@ -492,7 +521,7 @@ struct llama_file_loader {
  file.read_raw(shard.ne.data(), sizeof(shard.ne[0]) * n_dims);
  std::string name = file.read_string(name_len);
  if (n_dims < 1 || n_dims > 2) {
- throw format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims);
+ throw std::runtime_error(format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims));
  }
  switch (shard.type) {
  case GGML_TYPE_F32:
@@ -502,9 +531,14 @@ struct llama_file_loader {
  case GGML_TYPE_Q5_0:
  case GGML_TYPE_Q5_1:
  case GGML_TYPE_Q8_0:
+ case GGML_TYPE_Q2_K:
+ case GGML_TYPE_Q3_K:
+ case GGML_TYPE_Q4_K:
+ case GGML_TYPE_Q5_K:
+ case GGML_TYPE_Q6_K:
  break;
  default: {
- throw format("unrecognized tensor type %u\n", shard.type);
+ throw std::runtime_error(format("unrecognized tensor type %u\n", shard.type));
  }
  }

@@ -577,6 +611,11 @@ struct llama_file_saver {
  case GGML_TYPE_Q5_0:
  case GGML_TYPE_Q5_1:
  case GGML_TYPE_Q8_0:
+ case GGML_TYPE_Q2_K:
+ case GGML_TYPE_Q3_K:
+ case GGML_TYPE_Q4_K:
+ case GGML_TYPE_Q5_K:
+ case GGML_TYPE_Q6_K:
  break;
  default: LLAMA_ASSERT(false);
  }
@@ -608,7 +647,7 @@ struct llama_model_loader {
  auto * ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
  file_loaders.emplace_back(ith_file);
  if (ith_file->hparams != first_file->hparams) {
- throw format("llama.cpp: hparams inconsistent between files");
+ throw std::runtime_error(format("llama.cpp: hparams inconsistent between files"));
  }
  }
  if (!llama_mmap::SUPPORTED) {
@@ -638,7 +677,7 @@ struct llama_model_loader {
  uint32_t guess_n_parts() const {
  auto it = tensors_map.name_to_idx.find("tok_embeddings.weight");
  if (it == tensors_map.name_to_idx.end()) {
- throw std::string("missing tok_embeddings.weight");
+ throw std::runtime_error(std::string("missing tok_embeddings.weight"));
  }
  const llama_load_tensor & lt = tensors_map.tensors.at(it->second);
  return file_loaders.at(0)->hparams.n_embd / lt.shards.at(0).ne.at(0);
@@ -655,12 +694,12 @@ struct llama_model_loader {
  struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
  auto it = tensors_map.name_to_idx.find(name);
  if (it == tensors_map.name_to_idx.end()) {
- throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
+ throw std::runtime_error(std::runtime_error(format("llama.cpp: tensor '%s' is missing from model", name.c_str())));
  }
  llama_load_tensor & lt = tensors_map.tensors.at(it->second);
  if (lt.ne != ne) {
- throw format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
- name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
+ throw std::runtime_error(format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
+ name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str()));
  }

  return get_tensor_for(lt, backend);
@@ -676,6 +715,7 @@ struct llama_model_loader {
  }
  ggml_set_name(tensor, lt.name.c_str());
  LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
+
  tensor->backend = backend;
  lt.ggml_tensor = tensor;
  num_ggml_tensors_created++;
@@ -684,7 +724,7 @@ struct llama_model_loader {

  void done_getting_tensors() const {
  if (num_ggml_tensors_created != tensors_map.tensors.size()) {
- throw std::string("llama.cpp: file contained more tensors than expected");
+ throw std::runtime_error(std::string("llama.cpp: file contained more tensors than expected"));
  }
  }

@@ -828,7 +868,10 @@ static bool kv_cache_init(
  struct llama_context_params llama_context_default_params() {
  struct llama_context_params result = {
  /*.n_ctx =*/ 512,
+ /*.n_batch =*/ 512,
  /*.gpu_layers =*/ 0,
+ /*.main_gpu =*/ 0,
+ /*.tensor_split =*/ {0},
  /*.seed =*/ -1,
  /*.f16_kv =*/ true,
  /*.logits_all =*/ false,
@@ -843,6 +886,17 @@ struct llama_context_params llama_context_default_params() {
  return result;
  }

+ struct llama_model_quantize_params llama_model_quantize_default_params() {
+ struct llama_model_quantize_params result = {
+ /*.nthread =*/ 0,
+ /*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
+ /*.allow_requantize =*/ false,
+ /*.quantize_output_tensor =*/ true,
+ };
+
+ return result;
+ }
+
  bool llama_mmap_supported() {
  return llama_mmap::SUPPORTED;
  }
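The default-parameter changes above add n_batch, main_gpu, and tensor_split next to the existing n_ctx and GPU-layer settings, and introduce llama_model_quantize_default_params() for the reworked quantization API. A hedged usage sketch of the new context fields (the model path and concrete values are placeholders; the field names match those used later in this diff):

#include "llama.h"

int load_example(void) {
    llama_context_params params = llama_context_default_params();
    params.n_ctx        = 2048;
    params.n_batch      = 512;  // new in 0.2.0: also sizes the CUDA VRAM scratch buffer (n_batch * 1 MB)
    params.n_gpu_layers = 32;   // e.g. offload every layer of a 7B model
    params.main_gpu     = 0;    // new: main CUDA device (see ggml_cuda_set_main_device below)
    // params.tensor_split[...] = ...; // new: per-device split ratios for GGML_BACKEND_GPU_SPLIT tensors

    llama_context * ctx = llama_init_from_file("models/7B/ggml-model-q4_K_M.bin", params);
    if (ctx == NULL) {
        return 1;
    }
    // ... tokenize and llama_eval() as before ...
    llama_free(ctx);
    return 0;
}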
@@ -893,12 +947,23 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
  case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
  case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
  case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
+ // K-quants
+ case LLAMA_FTYPE_MOSTLY_Q2_K: return "mostly Q2_K";
+ case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "mostly Q3_K - Small";
+ case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "mostly Q3_K - Medium";
+ case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "mostly Q3_K - Large";
+ case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "mostly Q4_K - Small";
+ case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "mostly Q4_K - Medium";
+ case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "mostly Q5_K - Small";
+ case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "mostly Q5_K - Medium";
+ case LLAMA_FTYPE_MOSTLY_Q6_K: return "mostly Q6_K";
  default: return "unknown, may not work";
  }
  }

  static const char *llama_model_type_name(e_model type) {
  switch (type) {
+ case MODEL_3B: return "3B";
  case MODEL_7B: return "7B";
  case MODEL_13B: return "13B";
  case MODEL_30B: return "30B";
@@ -911,7 +976,10 @@ static void llama_model_load_internal(
  const std::string & fname,
  llama_context & lctx,
  int n_ctx,
+ int n_batch,
  int n_gpu_layers,
+ int main_gpu,
+ const float * tensor_split,
  ggml_type memory_type,
  bool use_mmap,
  bool use_mlock,
@@ -926,12 +994,13 @@ static void llama_model_load_internal(
  lctx.vocab = std::move(ml->file_loaders.at(0)->vocab);
  auto & model = lctx.model;
  model.hparams = ml->file_loaders.at(0)->hparams;
+ model.n_gpu_layers = n_gpu_layers;
  llama_file_version file_version = ml->file_loaders.at(0)->file_version;
  auto & hparams = model.hparams;
- uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;

  {
  switch (hparams.n_layer) {
+ case 26: model.type = e_model::MODEL_3B; break;
  case 32: model.type = e_model::MODEL_7B; break;
  case 40: model.type = e_model::MODEL_13B; break;
  case 60: model.type = e_model::MODEL_30B; break;
@@ -941,6 +1010,8 @@ static void llama_model_load_internal(
  hparams.n_ctx = n_ctx;
  }

+ const uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
+
  {
  fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version));
  fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab);
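The n_ff computation above moves below the hyperparameter fix-ups and becomes const; the formula rounds 2*(4*n_embd)/3 up to the next multiple of n_mult. A quick worked check, assuming the 7B hyperparameters n_embd = 4096 and n_mult = 256 (an assumption; those values are not stated in this diff):

const uint32_t n_embd = 4096, n_mult = 256;
const uint32_t n_ff = ((2*(4*n_embd)/3 + n_mult - 1)/n_mult)*n_mult;
// 2*(4*4096)/3 = 10922; 10922 + 255 = 11177; 11177/256 = 43; 43*256 = 11008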
@@ -960,7 +1031,7 @@ static void llama_model_load_internal(
  if (hparams.ftype != LLAMA_FTYPE_ALL_F32 &&
  hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 &&
  hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) {
- throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)");
+ throw std::runtime_error(format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)"));
  }
  }

@@ -968,7 +1039,7 @@ static void llama_model_load_internal(
  if (hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
  hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_1 ||
  hparams.ftype == LLAMA_FTYPE_MOSTLY_Q8_0) {
- throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)");
+ throw std::runtime_error(format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)"));
  }
  }

@@ -999,18 +1070,28 @@ static void llama_model_load_internal(

  model.ctx = ggml_init(params);
  if (!model.ctx) {
- throw format("ggml_init() failed");
+ throw std::runtime_error(format("ggml_init() failed"));
  }
  }

- #ifdef GGML_USE_CUBLAS
- #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CUDA
+ (void) main_gpu;
+ #if defined(GGML_USE_CUBLAS)
+ fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__);
+ ggml_cuda_set_main_device(main_gpu);
+ #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
+ #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
+ #elif defined(GGML_USE_CLBLAST)
+ fprintf(stderr, "%s: using OpenCL for GPU acceleration\n", __func__);
+ #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
+ #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU
  #else
- #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
+ #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
+ #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_CPU
  #endif

  // prepare memory for the weights
- size_t vram_total = 0;
+ size_t vram_weights = 0;
+ size_t vram_scratch = 0;
  {
  const uint32_t n_embd = hparams.n_embd;
  const uint32_t n_layer = hparams.n_layer;
@@ -1025,7 +1106,7 @@ static void llama_model_load_internal(
  {
  ggml_backend backend_output;
  if (n_gpu_layers > int(n_layer)) { // NOLINT
- backend_output = LLAMA_BACKEND_OFFLOAD;
+ backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
  } else {
  backend_output = GGML_BACKEND_CPU;
  }
@@ -1037,7 +1118,8 @@ static void llama_model_load_internal(

  model.layers.resize(n_layer);
  for (uint32_t i = 0; i < n_layer; ++i) {
- const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+ const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+ const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT

  auto & layer = model.layers[i];

@@ -1045,19 +1127,19 @@ static void llama_model_load_internal(

  layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);

- layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend);
- layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend);
- layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend);
- layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend);
+ layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend_split);
+ layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend_split);
+ layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend_split);
+ layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend_split);

  layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);

- layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend);
- layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend);
- layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend);
+ layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend_split);
+ layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend_split);
+ layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend_split);

- if (backend == GGML_BACKEND_CUDA) {
- vram_total +=
+ if (backend == GGML_BACKEND_GPU) {
+ vram_weights +=
  ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
  ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.attention_norm) +
  ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
@@ -1074,10 +1156,10 @@ static void llama_model_load_internal(
  // this is the total memory required to run the inference
  const size_t mem_required =
  ctx_size +
- mmapped_size - vram_total + // weights in VRAM not in memory
+ mmapped_size - vram_weights + // weights in VRAM not in memory
  MEM_REQ_SCRATCH0().at(model.type) +
  MEM_REQ_SCRATCH1().at(model.type) +
- MEM_REQ_EVAL().at(model.type);
+ MEM_REQ_EVAL().at (model.type);

  // this is the memory required by one llama_state
  const size_t mem_required_state =
@@ -1086,15 +1168,25 @@ static void llama_model_load_internal(
  fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
  mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);

+ (void) vram_scratch;
  #ifdef GGML_USE_CUBLAS
+ vram_scratch = n_batch * MB;
+ ggml_cuda_set_scratch_size(vram_scratch);
+ if (n_gpu_layers > 0) {
+ fprintf(stderr, "%s: allocating batch_size x 1 MB = %ld MB VRAM for the scratch buffer\n",
+ __func__, vram_scratch / MB);
+ }
+ #endif // GGML_USE_CUBLAS
+ #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
  const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));

- fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
+ fprintf(stderr, "%s: offloading %d layers to GPU\n", __func__, n_gpu);
  if (n_gpu_layers > (int) hparams.n_layer) {
- fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
+ fprintf(stderr, "%s: offloading output layer to GPU\n", __func__);
  }
- fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
- #elif !defined(GGML_USE_CLBLAST)
+ fprintf(stderr, "%s: total VRAM used: %zu MB\n",
+ __func__, (vram_weights + vram_scratch + MB - 1) / MB); // round up
+ #else
  (void) n_gpu_layers;
  #endif
  }
@@ -1106,8 +1198,10 @@ static void llama_model_load_internal(

  ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);

- #ifdef GGML_USE_CUBLAS
+ #if defined(GGML_USE_CUBLAS)
  {
+ ggml_cuda_set_tensor_split(tensor_split);
+
  size_t done_size = 0;
  size_t data_size = 0;
  for (llama_load_tensor & lt : ml->tensors_map.tensors) {
@@ -1117,7 +1211,8 @@ static void llama_model_load_internal(
  }
  }
  for (llama_load_tensor & lt : ml->tensors_map.tensors) {
- if (lt.ggml_tensor->backend != GGML_BACKEND_CUDA) {
+ ggml_backend backend = lt.ggml_tensor->backend;
+ if (backend != GGML_BACKEND_GPU && backend != GGML_BACKEND_GPU_SPLIT) {
  continue;
  }
  if (progress_callback) {
@@ -1129,30 +1224,28 @@ static void llama_model_load_internal(
  }
  #elif defined(GGML_USE_CLBLAST)
  {
- const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
-
- fprintf(stderr, "ggml_opencl: offloading %d layers to GPU\n", n_gpu);
-
- size_t vram_total = 0;
-
- for (int i = 0; i < n_gpu; ++i) {
- const auto & layer = model.layers[i];
-
- ggml_cl_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
- ggml_cl_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
- ggml_cl_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
- ggml_cl_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
- ggml_cl_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
- ggml_cl_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
- ggml_cl_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
+ size_t done_size = 0;
+ size_t data_size = 0;
+ for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+ data_size += lt.size;
+ if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
+ done_size += lt.size;
+ }
  }
- if (n_gpu_layers > (int) hparams.n_layer) {
- fprintf(stderr, "ggml_opencl: offloading output layer to GPU\n");
- ggml_cl_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
+ for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+ if (lt.ggml_tensor->backend != GGML_BACKEND_GPU) {
+ continue;
+ }
+ if (progress_callback) {
+ progress_callback((float) done_size / data_size, progress_callback_user_data);
+ }
+ ggml_cl_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
+ done_size += lt.size;
  }
-
- fprintf(stderr, "ggml_opencl: total VRAM used: %zu MB\n", vram_total / 1024 / 1024);
  }
+ #else
+ (void) n_batch;
+ (void) tensor_split;
  #endif

  if (progress_callback) {
@@ -1170,7 +1263,10 @@ static bool llama_model_load(
  const std::string & fname,
  llama_context & lctx,
  int n_ctx,
+ int n_batch,
  int n_gpu_layers,
+ int main_gpu,
+ float * tensor_split,
  ggml_type memory_type,
  bool use_mmap,
  bool use_mlock,
@@ -1178,28 +1274,30 @@ static bool llama_model_load(
  llama_progress_callback progress_callback,
  void *progress_callback_user_data) {
  try {
- llama_model_load_internal(fname, lctx, n_ctx, n_gpu_layers, memory_type, use_mmap, use_mlock,
- vocab_only, progress_callback, progress_callback_user_data);
+ llama_model_load_internal(fname, lctx, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, memory_type,
+ use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
  return true;
- } catch (const std::string & err) {
- fprintf(stderr, "error loading model: %s\n", err.c_str());
+ } catch (const std::exception & err) {
+ fprintf(stderr, "error loading model: %s\n", err.what());
  return false;
  }
  }

  // evaluate the transformer
  //
- // - lctx: llama context
- // - tokens: new batch of tokens to process
- // - n_past: the context size so far
- // - n_threads: number of threads to use
+ // - lctx: llama context
+ // - tokens: new batch of tokens to process
+ // - n_past: the context size so far
+ // - n_threads: number of threads to use
+ // - cgraph_fname: filename of the exported computation graph
  //
  static bool llama_eval_internal(
- llama_context & lctx,
- const llama_token * tokens,
- const int n_tokens,
- const int n_past,
- const int n_threads) {
+ llama_context & lctx,
+ const llama_token * tokens,
+ const int n_tokens,
+ const int n_past,
+ const int n_threads,
+ const char * cgraph_fname) {

  // enforce that the first token is BOS
  if (n_past == 0 && tokens[0] != llama_token_bos()) {
@@ -1218,12 +1316,13 @@ static bool llama_eval_internal(

  LLAMA_ASSERT(!!kv_self.ctx);

- const int n_embd = hparams.n_embd;
- const int n_layer = hparams.n_layer;
- const int n_ctx = hparams.n_ctx;
- const int n_head = hparams.n_head;
- const int n_vocab = hparams.n_vocab;
- const int n_rot = hparams.n_embd/hparams.n_head;
+ const int n_embd = hparams.n_embd;
+ const int n_layer = hparams.n_layer;
+ const int n_ctx = hparams.n_ctx;
+ const int n_head = hparams.n_head;
+ const int n_vocab = hparams.n_vocab;
+ const int n_rot = hparams.n_embd/hparams.n_head;
+ const int n_gpu_layers = model.n_gpu_layers;

  auto & mem_per_token = lctx.mem_per_token;
  auto & buf_compute = lctx.buf_compute;
@@ -1245,40 +1344,66 @@ static bool llama_eval_internal(
  ggml_set_name(embd, "embd");
  memcpy(embd->data, tokens, N*ggml_element_size(embd));

+ struct ggml_tensor * cur;
  struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);

+ const int i_gpu_start = n_layer - n_gpu_layers;
+ (void) i_gpu_start;
+
  for (int il = 0; il < n_layer; ++il) {
- struct ggml_tensor * inpSA = inpL;
+ offload_func_t offload_func = llama_nop;

- struct ggml_tensor * cur;
+ #ifdef GGML_USE_CUBLAS
+ if (il >= i_gpu_start) {
+ offload_func = ggml_cuda_assign_buffers; // sets the output backend to GPU
+ }
+ #endif // GGML_USE_CUBLAS
+
+ struct ggml_tensor * inpSA = inpL;

  lctx.use_buf(ctx0, 0);

  // norm
  {
  cur = ggml_rms_norm(ctx0, inpL);
+ offload_func(cur);
+ ggml_set_name(cur, "rms_norm_0");

  // cur = cur*attention_norm(broadcasted)
  cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm);
+ offload_func(cur);
+ ggml_set_name(cur, "attention_norm_0");
  }

  // self-attention
  {
  // compute Q and K and RoPE them
- struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
- struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
- ggml_set_name(Qcur, "Qcur");
+ struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+ // offload_func(tmpq);
+ ggml_set_name(tmpq, "tmpq");
+
+ struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+ // offload_func(tmpk);
+ ggml_set_name(tmpk, "tmpk");
+
+ struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0);
  ggml_set_name(Kcur, "Kcur");

+ struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0);
+ ggml_set_name(Qcur, "Qcur");
+
  // store key and value to memory
  {
  // compute the transposed [N, n_embd] V matrix
  struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), n_embd, N));
+ ggml_set_name(Vcur, "Vcur");

  struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
+ ggml_set_name(k, "k");
  struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
  ( n_ctx)*ggml_element_size(kv_self.v),
  (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
+ ggml_set_name(v, "v");

  // important: storing RoPE-ed version of K in the KV cache!
  ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
@@ -1319,7 +1444,6 @@ static bool llama_eval_internal(
  struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
  ggml_set_name(KQ_soft_max, "KQ_soft_max");

-
  // split cached V into n_head heads
  struct ggml_tensor * V =
  ggml_view_3d(ctx0, kv_self.v,
@@ -1354,73 +1478,143 @@ static bool llama_eval_internal(
  cur = ggml_mul_mat(ctx0,
  model.layers[il].wo,
  cur);
+ offload_func(cur);
+ ggml_set_name(cur, "result_wo");
  }

  lctx.use_buf(ctx0, 1);
+ //ggml_cuda_set_scratch(1);

  struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
+ offload_func(inpFF);
+ ggml_set_name(inpFF, "inpFF");

  // feed-forward network
  {
  // norm
  {
  cur = ggml_rms_norm(ctx0, inpFF);
+ offload_func(cur);
+ ggml_set_name(cur, "rms_norm_1");

  // cur = cur*ffn_norm(broadcasted)
  cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
+ offload_func(cur);
+ ggml_set_name(cur, "ffn_norm");
  }

  struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
  model.layers[il].w3,
  cur);
+ offload_func(tmp);
+ ggml_set_name(tmp, "result_w3");

  cur = ggml_mul_mat(ctx0,
  model.layers[il].w1,
  cur);
+ offload_func(cur);
+ ggml_set_name(cur, "result_w2");

  // SILU activation
  cur = ggml_silu(ctx0, cur);
+ offload_func(cur);
+ ggml_set_name(cur, "silu");

  cur = ggml_mul(ctx0, cur, tmp);
+ offload_func(cur);
+ ggml_set_name(cur, "silu_x_result_w3");

  cur = ggml_mul_mat(ctx0,
  model.layers[il].w2,
  cur);
+ offload_func(cur);
+ ggml_set_name(cur, "result_w2");
  }

  cur = ggml_add(ctx0, cur, inpFF);
+ offload_func(cur);
+ ggml_set_name(cur, "inpFF_+_result_w2");

  // input for next layer
  inpL = cur;
+
  }

  lctx.use_buf(ctx0, 0);
+ //ggml_cuda_set_scratch(0);

  // used at the end to optionally extract the embeddings
  struct ggml_tensor * embeddings = NULL;

+ offload_func_t offload_func = llama_nop;
+
+ #ifdef GGML_USE_CUBLAS
+ if (n_gpu_layers > n_layer) {
+ offload_func = ggml_cuda_assign_buffers; // sets the output backend to GPU
+ }
+ #endif // GGML_USE_CUBLAS
+
  // norm
  {
+ cur = ggml_rms_norm(ctx0, inpL);
+ offload_func(cur);
+ ggml_set_name(cur, "rms_norm_inpL");

- inpL = ggml_rms_norm(ctx0, inpL);
+ cur = ggml_rms_norm(ctx0, cur);
+ offload_func(cur);
+ ggml_set_name(cur, "rms_norm_after");

- // inpL = inpL*norm(broadcasted)
- inpL = ggml_mul(ctx0, inpL, model.norm);
+ // cur = cur*norm(broadcasted)
+ cur = ggml_mul(ctx0, cur, model.norm);
+ offload_func(cur);
+ ggml_set_name(cur, "result_norm");

- embeddings = inpL;
+ embeddings = cur;
  }

+
  // lm_head
- inpL = ggml_mul_mat(ctx0, model.output, inpL);
+ cur = ggml_mul_mat(ctx0, model.output, cur);
+ ggml_set_name(cur, "result_output");

  lctx.use_buf(ctx0, -1);

  // logits -> probs
- //inpL = ggml_soft_max_inplace(ctx0, inpL);
+ //cur = ggml_soft_max_inplace(ctx0, cur);

  // run the computation
- ggml_build_forward_expand(&gf, inpL);
- ggml_graph_compute (ctx0, &gf);
+ ggml_build_forward_expand(&gf, cur);
+
+ #ifdef GGML_USE_METAL
+ if (lctx.ctx_metal && N == 1) {
+ ggml_metal_graph_compute(lctx.ctx_metal, &gf);
+ ggml_metal_get_tensor (lctx.ctx_metal, cur);
+ } else {
+ // IMPORTANT:
+ // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla
+ // ggml_graph_compute(). It uses Apple's Accelerate CBLAS API which takes advantage of the ANE or the AMX
+ // coprocessor.
+ //
+ // When we implement Matrix x Matrix Metal multiplication, we can avoid this branch.
+ // But for now, we have focused only on Matrix x Vector Metal multiplication.
+ //
+ // TODO: avoid these syncs via shared memory (ref #1696)
+ //
+ if (lctx.ctx_metal) {
+ // We need to sync the GPU KV cache with the CPU KV cache
+ ggml_metal_get_tensor(lctx.ctx_metal, kv_self.k);
+ ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
+ }
+
+ ggml_graph_compute(ctx0, &gf);
+ }
+ #else
+ ggml_graph_compute(ctx0, &gf);
+ #endif
+
+ if (cgraph_fname) {
+ ggml_graph_export(&gf, cgraph_fname);
+ }

  #ifdef GGML_PERF
  // print timing information per ggml operation (for debugging purposes)
@@ -1434,7 +1628,7 @@ static bool llama_eval_internal(
  //}

  //embd_w.resize(n_vocab*N);
- //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
+ //memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N);

  // update kv token count
  lctx.model.kv_self.n = n_past + N;
@@ -1445,11 +1639,11 @@ static bool llama_eval_internal(

  if (lctx.logits_all) {
  logits_out.resize(n_vocab * N);
- memcpy(logits_out.data(), (float *) ggml_get_data(inpL), sizeof(float)*n_vocab*N);
+ memcpy(logits_out.data(), (float *) ggml_get_data(cur), sizeof(float)*n_vocab*N);
  } else {
  // return result for just the last token
  logits_out.resize(n_vocab);
- memcpy(logits_out.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
+ memcpy(logits_out.data(), (float *) ggml_get_data(cur) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
  }
  }

@@ -2048,16 +2242,88 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
  // quantization
  //

- static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype, int nthread) {
+ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llama_buffer & output, const int nelements, const int nthread) {
+ if (output.size < nelements * sizeof(float)) {
+ output.resize(nelements * sizeof(float));
+ }
+ float * f32_output = (float *) output.addr;
+
+ quantize_fns_t qtype;
+ if (ggml_is_quantized(tensor.type)) {
+ qtype = ggml_internal_get_quantize_fn(tensor.type);
+ if (qtype.dequantize_row_q == NULL) {
+ throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor.type)));
+ }
+ } else if (tensor.type != GGML_TYPE_F16) {
+ throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor.type)));
+ }
+
+ if (nthread < 2) {
+ if (tensor.type == GGML_TYPE_F16) {
+ ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor.data, f32_output, nelements);
+ } else if (ggml_is_quantized(tensor.type)) {
+ qtype.dequantize_row_q(tensor.data, f32_output, nelements);
+ } else {
+ LLAMA_ASSERT(false); // unreachable
+ }
+ return;
+ }
+
+ auto block_size = tensor.type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor.type);
+ auto block_size_bytes = ggml_type_size(tensor.type);
+
+ LLAMA_ASSERT(nelements % block_size == 0);
+ auto nblocks = nelements / block_size;
+ auto blocks_per_thread = nblocks / nthread;
+ auto spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count
+
+ std::vector<std::thread> workers;
+ for (auto tnum = 0, in_buff_offs = 0, out_buff_offs = 0; tnum < nthread; tnum++) {
+ auto thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
+ auto thr_elems = thr_blocks * block_size; // number of elements for this thread
+ auto thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread
+
+ auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
+ if (typ == GGML_TYPE_F16) {
+ ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
+ } else {
+ qtype.dequantize_row_q(inbuf, outbuf, nels);
+ }
+ };
+ workers.push_back(std::thread(compute, tensor.type, tensor.data + in_buff_offs, f32_output + out_buff_offs, thr_elems));
+ in_buff_offs += thr_block_bytes;
+ out_buff_offs += thr_elems;
+ }
+ for (auto & worker : workers) {
+ worker.join();
+ }
+
+ }
+
+ static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
  ggml_type quantized_type;
- switch (ftype) {
+ llama_ftype ftype = params->ftype;
+ int nthread = params->nthread;
+
+ switch (params->ftype) {
  case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
  case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
  case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
  case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
  case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
- default: throw format("invalid output file type %d\n", ftype);
- };
+
+ // K-quants
+ case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
+ case LLAMA_FTYPE_MOSTLY_Q3_K_S:
+ case LLAMA_FTYPE_MOSTLY_Q3_K_M:
+ case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
+ case LLAMA_FTYPE_MOSTLY_Q4_K_S:
+ case LLAMA_FTYPE_MOSTLY_Q4_K_M: quantized_type = GGML_TYPE_Q4_K; break;
+ case LLAMA_FTYPE_MOSTLY_Q5_K_S:
+ case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
+ case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
+ default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
+ }

  if (nthread <= 0) {
  nthread = std::thread::hardware_concurrency();
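llama_convert_tensor_internal() above dequantizes a tensor to F32 on several threads by splitting the work on block boundaries: each thread gets nblocks/nthread blocks, and the last thread also takes the remainder. A hedged worked example with illustrative numbers (the block size of 32 is assumed, as for Q4_0):

const size_t block_size = 32;          // assumed quantization block size
const size_t nelements  = 32064;
const int    nthread    = 8;
const size_t nblocks           = nelements / block_size;              // 1002
const size_t blocks_per_thread = nblocks / nthread;                   // 125
const size_t spare_blocks      = nblocks - blocks_per_thread*nthread; // 2
// threads 0..6 each dequantize 125 blocks; thread 7 takes 125 + 2 = 127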
@@ -2065,7 +2331,21 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

  std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false,
  /*vocab_only*/ false));
- llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
+ llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), params->ftype);
+
+ int n_attention_wv = 0;
+ int n_feed_forward_w2 = 0;
+ for (auto& tensor : model_loader->tensors_map.tensors) {
+ if (tensor.name.find("attention.wv.weight") != std::string::npos) {
+ ++n_attention_wv;
+ }
+ else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
+ ++n_feed_forward_w2;
+ }
+ }
+
+ int i_attention_wv = 0;
+ int i_feed_forward_w2 = 0;

  size_t total_size_org = 0;
  size_t total_size_new = 0;
@@ -2093,9 +2373,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  quantize &= (tensor.ne.size() == 2);

  // uncomment this to keep the output layer in FP16
- //if (tensor.name == "output.weight") {
- // quantize = false;
- //}
+ if (!params->quantize_output_tensor && tensor.name == "output.weight") {
+ quantize = false;
+ }
+ quantize = quantize && quantized_type != tensor.type;

  enum ggml_type new_type;
  void * new_data;
@@ -2109,20 +2390,43 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  printf("size = %8.3f MB\n", tensor.size/1024.0/1024.0);
  } else {
  new_type = quantized_type;
+ // TODO: temporary disabled until Metal / OpenCL support is available
+ // ref: https://github.com/ggerganov/llama.cpp/issues/1711
+ //if (tensor.name == "output.weight") {
+ // new_type = GGML_TYPE_Q6_K;
+ //}
+ if (tensor.name.find("attention.wv.weight") != std::string::npos) {
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+ else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
+ (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8 ||
+ (i_attention_wv - n_attention_wv/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
+ ++i_attention_wv;
+ }
+ if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+ else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
+ (i_feed_forward_w2 < n_feed_forward_w2/8 || i_feed_forward_w2 >= 7*n_feed_forward_w2/8 ||
+ (i_feed_forward_w2 - n_feed_forward_w2/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
+ ++i_feed_forward_w2;
+ }
+ if (tensor.name.find("attention.wo.weight") != std::string::npos) {
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+ }
+
  float * f32_data;
  size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
  llama_buffer f32_conv_buf;
+
  if (tensor.type == GGML_TYPE_F32) {
  f32_data = (float *) tensor.data;
- } else if (tensor.type == GGML_TYPE_F16) {
- f32_conv_buf.resize(nelements * sizeof(float));
- f32_data = (float *) f32_conv_buf.addr;
- const auto * f16_data = (const ggml_fp16_t *) tensor.data;
- for (size_t i = 0; i < nelements; i++) {
- f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
- }
+ } else if (ggml_is_quantized(tensor.type) && !params->allow_requantize) {
+ throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor.type)));
  } else {
- throw format("type %s unsupported for integer quantization", ggml_type_name(tensor.type));
+ llama_convert_tensor_internal(tensor, f32_conv_buf, nelements, nthread);
+ f32_data = (float *) f32_conv_buf.addr;
  }

  printf("quantizing .. ");
@@ -2176,12 +2480,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  }

  printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
+ int64_t tot_count = 0;
  for (size_t i = 0; i < hist_cur.size(); i++) {
  hist_all[i] += hist_cur[i];
+ tot_count += hist_cur[i];
  }

- for (size_t i = 0; i < hist_cur.size(); i++) {
- printf("%5.3f ", hist_cur[i] / float(nelements));
+ if (tot_count > 0) {
+ for (size_t i = 0; i < hist_cur.size(); i++) {
+ printf("%5.3f ", hist_cur[i] / float(nelements));
+ }
  }
  printf("\n");
  }
@@ -2199,11 +2507,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  sum_all += hist_all[i];
  }

- printf("%s: hist: ", __func__);
- for (size_t i = 0; i < hist_all.size(); i++) {
- printf("%5.3f ", hist_all[i] / float(sum_all));
+ if (sum_all > 0) {
+ printf("%s: hist: ", __func__);
+ for (size_t i = 0; i < hist_all.size(); i++) {
+ printf("%5.3f ", hist_all[i] / float(sum_all));
+ }
+ printf("\n");
  }
- printf("\n");
  }
  }

@@ -2244,9 +2554,9 @@ struct llama_context * llama_init_from_file(

  ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;

- if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_gpu_layers, memory_type,
- params.use_mmap, params.use_mlock, params.vocab_only,
- params.progress_callback, params.progress_callback_user_data)) {
+ if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers,
+ params.main_gpu, params.tensor_split, memory_type, params.use_mmap, params.use_mlock,
+ params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
  fprintf(stderr, "%s: failed to load model\n", __func__);
  llama_free(ctx);
  return nullptr;
@@ -2284,6 +2594,38 @@ struct llama_context * llama_init_from_file(
  ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
  }

+ #ifdef GGML_USE_METAL
+ if (params.n_gpu_layers > 0) {
+ // this allocates all Metal resources and memory buffers
+ ctx->ctx_metal = ggml_metal_init();
+
+ void *data_ptr = NULL;
+ size_t data_size = 0;
+ if (params.use_mmap) {
+ data_ptr = ctx->model.mapping->addr;
+ data_size= ctx->model.mapping->size;
+ } else {
+ data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
+ data_size= ggml_get_mem_size(ctx->model.ctx);
+ }
+
+ #define LLAMA_METAL_CHECK_BUF(result) \
+ if (!(result)) { \
+ fprintf(stderr, "%s: failed to add buffer\n", __func__); \
+ llama_free(ctx); \
+ return NULL; \
+ }
+
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size));
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size));
+
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size));
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size));
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size));
+ #undef LLAMA_METAL_CHECK_BUF
+ }
+ #endif
+
  return ctx;
  }

@@ -2294,13 +2636,12 @@ void llama_free(struct llama_context * ctx) {
  int llama_model_quantize(
  const char * fname_inp,
  const char * fname_out,
- enum llama_ftype ftype,
- int nthread) {
+ const llama_model_quantize_params *params) {
  try {
- llama_model_quantize_internal(fname_inp, fname_out, ftype, nthread);
+ llama_model_quantize_internal(fname_inp, fname_out, params);
  return 0;
- } catch (const std::string & err) {
- fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str());
+ } catch (const std::exception & err) {
+ fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.what());
  return 1;
  }
  }
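llama_model_quantize() now takes a llama_model_quantize_params pointer instead of the old (ftype, nthread) pair. A hedged sketch of calling the new API (file names are placeholders; the fields come from llama_model_quantize_default_params() earlier in this diff):

#include "llama.h"

int quantize_to_q4_k_m(const char * fname_inp, const char * fname_out) {
    llama_model_quantize_params qparams = llama_model_quantize_default_params();
    qparams.ftype                  = LLAMA_FTYPE_MOSTLY_Q4_K_M; // one of the new K-quant file types
    qparams.nthread                = 8;                         // <= 0 falls back to hardware_concurrency()
    qparams.allow_requantize       = false;                     // refuse quantized -> quantized conversions
    qparams.quantize_output_tensor = true;                      // set false to keep output.weight unquantized

    return llama_model_quantize(fname_inp, fname_out, &qparams); // 0 on success, 1 on failure
}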
@@ -2553,8 +2894,8 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
  int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
  try {
  return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads);
- } catch (const std::string & err) {
- fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.c_str());
+ } catch (const std::exception & err) {
+ fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
  return 1;
  }
  }
@@ -2899,7 +3240,7 @@ int llama_eval(
  int n_tokens,
  int n_past,
  int n_threads) {
- if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads)) {
+ if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads, nullptr)) {
  fprintf(stderr, "%s: failed to eval\n", __func__);
  return 1;
  }
@@ -2914,6 +3255,20 @@ int llama_eval(
  return 0;
  }

+ int llama_eval_export(struct llama_context * ctx, const char * fname) {
+ const int n_batch = 1;
+ const int n_ctx = 512 - n_batch;
+
+ const std::vector<llama_token> tmp(n_batch, llama_token_bos());
+
+ if (!llama_eval_internal(*ctx, tmp.data(), tmp.size(), n_ctx, 1, fname)) {
+ fprintf(stderr, "%s: failed to eval\n", __func__);
+ return 1;
+ }
+
+ return 0;
+ }
+
  int llama_tokenize(
  struct llama_context * ctx,
  const char * text,
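The new llama_eval_export() above runs a single-token evaluation (n_batch = 1 at position 511) solely to build the computation graph and write it out via ggml_graph_export(). A hedged usage sketch (the model and output paths are placeholders):

llama_context * ctx = llama_init_from_file("models/7B/ggml-model-f16.bin", llama_context_default_params());
if (ctx) {
    if (llama_eval_export(ctx, "llama.ggml") != 0) {
        fprintf(stderr, "graph export failed\n");
    }
    llama_free(ctx);
}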