llama_cpp 0.1.1 → 0.1.3

data/ext/llama_cpp/src/llama.cpp CHANGED
@@ -1,6 +1,7 @@
  // Defines fileno on msys:
  #ifndef _GNU_SOURCE
  #define _GNU_SOURCE
+ #include <cstddef>
  #include <cstdint>
  #include <cstdio>
  #endif
@@ -11,6 +12,8 @@
  #include "ggml.h"
  #ifdef GGML_USE_CUBLAS
  #include "ggml-cuda.h"
+ #elif defined(GGML_USE_CLBLAST)
+ #include "ggml-opencl.h"
  #endif

  #include <array>
@@ -45,6 +48,7 @@ enum e_model {
  MODEL_65B,
  };

+
  static const size_t MB = 1024*1024;

  // computed for n_ctx == 2048
@@ -110,7 +114,7 @@ struct llama_hparams {
  enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;

  bool operator!=(const llama_hparams & other) const {
- return memcmp(this, &other, sizeof(llama_hparams));
+ return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams)));
  }
  };

@@ -406,6 +410,7 @@ enum llama_file_version {
  LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab
  LLAMA_FILE_VERSION_GGJT_V1, // added padding
  LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format
+ LLAMA_FILE_VERSION_GGJT_V3, // changed Q4 and Q8 quantization format
  };

  struct llama_file_loader {
@@ -424,24 +429,30 @@ struct llama_file_loader {
  }
  void read_magic() {
  uint32_t magic = file.read_u32();
- uint32_t version = 0;

- if (magic != 'ggml') {
- version = file.read_u32();
+ if (magic == LLAMA_FILE_MAGIC_GGML) {
+ file_version = LLAMA_FILE_VERSION_GGML;
+ return;
  }

- if (magic == 'ggml' && version == 0) {
- file_version = LLAMA_FILE_VERSION_GGML;
- } else if (magic == 'ggmf' && version == 1) {
- file_version = LLAMA_FILE_VERSION_GGMF_V1;
- } else if (magic == 'ggjt' && version == 1) {
- file_version = LLAMA_FILE_VERSION_GGJT_V1;
- } else if (magic == 'ggjt' && version == 2) {
- file_version = LLAMA_FILE_VERSION_GGJT_V2;
- } else {
- throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
- magic, version);
+ uint32_t version = file.read_u32();
+
+ switch (magic) {
+ case LLAMA_FILE_MAGIC_GGMF:
+ switch (version) {
+ case 1: file_version = LLAMA_FILE_VERSION_GGMF_V1; return;
+ }
+ break;
+ case LLAMA_FILE_MAGIC_GGJT:
+ switch (version) {
+ case 1: file_version = LLAMA_FILE_VERSION_GGJT_V1; return;
+ case 2: file_version = LLAMA_FILE_VERSION_GGJT_V2; return;
+ case 3: file_version = LLAMA_FILE_VERSION_GGJT_V3; return;
+ }
  }
+
+ throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
+ magic, version);
  }
  void read_hparams() {
  hparams.n_vocab = file.read_u32();
@@ -499,7 +510,7 @@ struct llama_file_loader {

  if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
  // skip to the next multiple of 32 bytes
- file.seek(-file.tell() & 31, SEEK_CUR);
+ file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
  }
  shard.file_idx = file_idx;
  shard.file_off = file.tell();
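The two seek changes above keep the existing 32-byte alignment trick but cast the offset before negating it, so the compiler no longer warns about negating an unsigned value. A minimal standalone sketch of the arithmetic (illustration only, not code from llama.cpp) showing why `-offset & 31` is exactly the padding needed to reach the next 32-byte boundary:

#include <cassert>
#include <cstddef>
#include <initializer_list>

int main() {
    for (size_t offset : {0u, 1u, 31u, 32u, 33u, 100u}) {
        // same expression shape as the seek calls above
        const size_t pad = static_cast<size_t>(-static_cast<ptrdiff_t>(offset) & 31);
        assert((offset + pad) % 32 == 0); // lands exactly on a 32-byte boundary
        assert(pad < 32);                 // and never skips a full block
    }
    return 0;
}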
@@ -574,7 +585,7 @@ struct llama_file_saver {
  file.write_u32(new_type);
  file.write_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * tensor.ne.size());
  file.write_raw(tensor.name.data(), tensor.name.size());
- file.seek(-file.tell() & 31, SEEK_CUR);
+ file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
  LLAMA_ASSERT(new_size == llama_calc_tensor_size(tensor.ne, new_type));
  file.write_raw(new_data, new_size);
  }
@@ -641,7 +652,7 @@ struct llama_model_loader {
  }
  }

- struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne) {
+ struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
  auto it = tensors_map.name_to_idx.find(name);
  if (it == tensors_map.name_to_idx.end()) {
  throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
@@ -652,10 +663,10 @@ struct llama_model_loader {
  name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
  }

- return get_tensor_for(lt);
+ return get_tensor_for(lt, backend);
  }

- struct ggml_tensor * get_tensor_for(llama_load_tensor & lt) {
+ struct ggml_tensor * get_tensor_for(llama_load_tensor & lt, ggml_backend backend) {
  struct ggml_tensor * tensor;
  if (lt.ne.size() == 2) {
  tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
@@ -665,6 +676,7 @@ struct llama_model_loader {
  }
  ggml_set_name(tensor, lt.name.c_str());
  LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
+ tensor->backend = backend;
  lt.ggml_tensor = tensor;
  num_ggml_tensors_created++;
  return tensor;
@@ -678,12 +690,16 @@ struct llama_model_loader {

  void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
  size_t data_size = 0;
+ size_t prefetch_size = 0;
  for (const llama_load_tensor & lt : tensors_map.tensors) {
  data_size += lt.size;
+ if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
+ prefetch_size += lt.size;
+ }
  }

  if (use_mmap) {
- mapping.reset(new llama_mmap(&file_loaders.at(0)->file));
+ mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
  if (!lmlock) {
  // Don't call the callback since the actual loading will be lazy
  // and we can't measure it.
@@ -696,6 +712,9 @@ struct llama_model_loader {

  size_t done_size = 0;
  for (llama_load_tensor & lt : tensors_map.tensors) {
+ if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) {
+ continue;
+ }
  if (progress_callback) {
  progress_callback((float) done_size / data_size, progress_callback_user_data);
  }
@@ -708,9 +727,6 @@ struct llama_model_loader {
  lmlock->grow_to(done_size);
  }
  }
- if (progress_callback) {
- progress_callback(1.0f, progress_callback_user_data);
- }
  }

  void load_data_for(llama_load_tensor & lt) {
@@ -835,6 +851,21 @@ bool llama_mlock_supported() {
  return llama_mlock::SUPPORTED;
  }

+ void llama_init_backend() {
+ ggml_time_init();
+
+ // needed to initialize f16 tables
+ {
+ struct ggml_init_params params = { 0, NULL, false };
+ struct ggml_context * ctx = ggml_init(params);
+ ggml_free(ctx);
+ }
+ }
+
+ int64_t llama_time_us() {
+ return ggml_time_us();
+ }
+
  //
  // model loading
  //
@@ -844,7 +875,8 @@ static const char *llama_file_version_name(llama_file_version version) {
  case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)";
  case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
  case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1405)";
- case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (latest)";
+ case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (pre #1508)";
+ case LLAMA_FILE_VERSION_GGJT_V3: return "ggjt v3 (latest)";
  }

  return "unknown";
@@ -924,11 +956,19 @@ static void llama_model_load_internal(
  fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
  }

- if (file_version != LLAMA_FILE_VERSION_GGJT_V2) {
+ if (file_version < LLAMA_FILE_VERSION_GGJT_V2) {
  if (hparams.ftype != LLAMA_FTYPE_ALL_F32 &&
  hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 &&
  hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) {
- throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1305)");
+ throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)");
+ }
+ }
+
+ if (file_version < LLAMA_FILE_VERSION_GGJT_V3) {
+ if (hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
+ hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_1 ||
+ hparams.ftype == LLAMA_FTYPE_MOSTLY_Q8_0) {
+ throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)");
  }
  }

@@ -941,27 +981,7 @@ static void llama_model_load_internal(
  size_t ctx_size;
  size_t mmapped_size;
  ml->calc_sizes(&ctx_size, &mmapped_size);
- fprintf(stderr, "%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/1024.0/1024.0);
-
- // print memory requirements
- {
- const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
-
- // this is the total memory required to run the inference
- const size_t mem_required =
- ctx_size +
- mmapped_size +
- MEM_REQ_SCRATCH0().at(model.type) +
- MEM_REQ_SCRATCH1().at(model.type) +
- MEM_REQ_EVAL().at(model.type);
-
- // this is the memory required by one llama_state
- const size_t mem_required_state =
- scale*MEM_REQ_KV_SELF().at(model.type);
-
- fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
- mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
- }
+ fprintf(stderr, "%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0);

  // create the ggml context
  {
@@ -983,7 +1003,14 @@ static void llama_model_load_internal(
  }
  }

+ #ifdef GGML_USE_CUBLAS
+ #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CUDA
+ #else
+ #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
+ #endif
+
  // prepare memory for the weights
+ size_t vram_total = 0;
  {
  const uint32_t n_embd = hparams.n_embd;
  const uint32_t n_layer = hparams.n_layer;
@@ -991,33 +1018,87 @@ static void llama_model_load_internal(

  ml->ggml_ctx = ctx;

- model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab});
- model.norm = ml->get_tensor("norm.weight", {n_embd});
- model.output = ml->get_tensor("output.weight", {n_embd, n_vocab});
+ model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU);
+ model.norm = ml->get_tensor("norm.weight", {n_embd}, GGML_BACKEND_CPU);
+
+ // "output" tensor
+ {
+ ggml_backend backend_output;
+ if (n_gpu_layers > int(n_layer)) { // NOLINT
+ backend_output = LLAMA_BACKEND_OFFLOAD;
+ } else {
+ backend_output = GGML_BACKEND_CPU;
+ }
+
+ model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, backend_output);
+ }
+
+ const int i_gpu_start = n_layer - n_gpu_layers;

  model.layers.resize(n_layer);
  for (uint32_t i = 0; i < n_layer; ++i) {
+ const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+
  auto & layer = model.layers[i];

  std::string layers_i = "layers." + std::to_string(i);

- layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd});
+ layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);
+
+ layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend);
+ layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend);
+ layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend);
+ layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend);

- layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd});
- layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd});
- layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd});
- layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd});
+ layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);

- layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd});
+ layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend);
+ layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend);
+ layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend);

- layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff});
- layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd});
- layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff});
+ if (backend == GGML_BACKEND_CUDA) {
+ vram_total +=
+ ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
+ ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.attention_norm) +
+ ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
+ }
  }
  }

  ml->done_getting_tensors();

+ // print memory requirements
+ {
+ const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
+
+ // this is the total memory required to run the inference
+ const size_t mem_required =
+ ctx_size +
+ mmapped_size - vram_total + // weights in VRAM not in memory
+ MEM_REQ_SCRATCH0().at(model.type) +
+ MEM_REQ_SCRATCH1().at(model.type) +
+ MEM_REQ_EVAL().at(model.type);
+
+ // this is the memory required by one llama_state
+ const size_t mem_required_state =
+ scale*MEM_REQ_KV_SELF().at(model.type);
+
+ fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
+ mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
+
+ #ifdef GGML_USE_CUBLAS
+ const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
+
+ fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
+ if (n_gpu_layers > (int) hparams.n_layer) {
+ fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
+ }
+ fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+ #elif !defined(GGML_USE_CLBLAST)
+ (void) n_gpu_layers;
+ #endif
+ }
+
  // populate `tensors_by_name`
  for (llama_load_tensor & lt : ml->tensors_map.tensors) {
  model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
@@ -1025,37 +1106,61 @@ static void llama_model_load_internal(

  ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);

- model.mapping = std::move(ml->mapping);
  #ifdef GGML_USE_CUBLAS
+ {
+ size_t done_size = 0;
+ size_t data_size = 0;
+ for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+ data_size += lt.size;
+ if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
+ done_size += lt.size;
+ }
+ }
+ for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+ if (lt.ggml_tensor->backend != GGML_BACKEND_CUDA) {
+ continue;
+ }
+ if (progress_callback) {
+ progress_callback((float) done_size / data_size, progress_callback_user_data);
+ }
+ ggml_cuda_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
+ done_size += lt.size;
+ }
+ }
+ #elif defined(GGML_USE_CLBLAST)
  {
  const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));

- fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
+ fprintf(stderr, "ggml_opencl: offloading %d layers to GPU\n", n_gpu);

  size_t vram_total = 0;

  for (int i = 0; i < n_gpu; ++i) {
  const auto & layer = model.layers[i];

- ggml_cuda_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
- ggml_cuda_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
- ggml_cuda_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
- ggml_cuda_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
- ggml_cuda_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
- ggml_cuda_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
- ggml_cuda_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
+ ggml_cl_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
+ ggml_cl_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
+ ggml_cl_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
+ ggml_cl_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
+ ggml_cl_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
+ ggml_cl_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
+ ggml_cl_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
  }
  if (n_gpu_layers > (int) hparams.n_layer) {
- fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
- ggml_cuda_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
+ fprintf(stderr, "ggml_opencl: offloading output layer to GPU\n");
+ ggml_cl_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
  }

- fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+ fprintf(stderr, "ggml_opencl: total VRAM used: %zu MB\n", vram_total / 1024 / 1024);
  }
- #else
- (void) n_gpu_layers;
  #endif

+ if (progress_callback) {
+ progress_callback(1.0f, progress_callback_user_data);
+ }
+
+ model.mapping = std::move(ml->mapping);
+
  // loading time will be recalculate after the first eval, so
  // we take page faults deferred by mmap() into consideration
  lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
@@ -1153,10 +1258,8 @@ static bool llama_eval_internal(
  {
  cur = ggml_rms_norm(ctx0, inpL);

- // cur = attention_norm*cur
- cur = ggml_mul(ctx0,
- ggml_repeat(ctx0, model.layers[il].attention_norm, cur),
- cur);
+ // cur = cur*attention_norm(broadcasted)
+ cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm);
  }

  // self-attention
@@ -1263,10 +1366,8 @@ static bool llama_eval_internal(
  {
  cur = ggml_rms_norm(ctx0, inpFF);

- // cur = ffn_norm*cur
- cur = ggml_mul(ctx0,
- ggml_repeat(ctx0, model.layers[il].ffn_norm, cur),
- cur);
+ // cur = cur*ffn_norm(broadcasted)
+ cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
  }

  struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
@@ -1303,10 +1404,8 @@ static bool llama_eval_internal(

  inpL = ggml_rms_norm(ctx0, inpL);

- // inpL = norm*inpL
- inpL = ggml_mul(ctx0,
- ggml_repeat(ctx0, model.norm, inpL),
- inpL);
+ // inpL = inpL*norm(broadcasted)
+ inpL = ggml_mul(ctx0, inpL, model.norm);

  embeddings = inpL;
  }
@@ -2130,7 +2229,7 @@ struct llama_context * llama_init_from_file(
  unsigned * cur_percentage_p = (unsigned *) ctx;
  unsigned percentage = (unsigned) (100 * progress);
  while (percentage > *cur_percentage_p) {
- ++*cur_percentage_p;
+ *cur_percentage_p = percentage;
  fprintf(stderr, ".");
  fflush(stderr);
  if (percentage >= 100) {
@@ -2223,7 +2322,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
  {
  uint32_t magic;
  fin.read((char *) &magic, sizeof(magic));
- if (magic != 'ggla') {
+ if (magic != LLAMA_FILE_MAGIC_GGLA) {
  fprintf(stderr, "%s: bad file magic\n", __func__);
  return 1;
  }
@@ -2287,7 +2386,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *

  // maybe this should in llama_model_loader
  if (model_loader->use_mmap) {
- model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ false));
+ model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0));
  }
  }

@@ -2380,7 +2479,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
  }
  size_t idx = model_loader->tensors_map.name_to_idx[base_name];
  llama_load_tensor & lt = model_loader->tensors_map.tensors[idx];
- base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] });
+ base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU);
  lt.data = (uint8_t *) lt.ggml_tensor->data;
  model_loader->load_data_for(lt);
  lt.ggml_tensor->data = lt.data;
@@ -2606,8 +2705,8 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
  }

  // Sets the state reading from the specified source address
- size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
- const uint8_t * inp = src;
+ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
+ uint8_t * inp = src;

  // set rng
  {
data/ext/llama_cpp/src/llama.h CHANGED
@@ -19,10 +19,16 @@
  # define LLAMA_API
  #endif

- #define LLAMA_FILE_VERSION 2
- #define LLAMA_FILE_MAGIC 'ggjt'
- #define LLAMA_FILE_MAGIC_UNVERSIONED 'ggml'
- #define LLAMA_SESSION_MAGIC 'ggsn'
+ #define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt'
+ #define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
+ #define LLAMA_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf'
+ #define LLAMA_FILE_MAGIC_GGML 0x67676d6cu // 'ggml'
+ #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
+
+ #define LLAMA_FILE_VERSION 3
+ #define LLAMA_FILE_MAGIC LLAMA_FILE_MAGIC_GGJT
+ #define LLAMA_FILE_MAGIC_UNVERSIONED LLAMA_FILE_MAGIC_GGML
+ #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
  #define LLAMA_SESSION_VERSION 1

  #ifdef __cplusplus
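The file magics are now spelled out as unsigned hex constants instead of multi-character literals, whose value is implementation-defined and warned about by most compilers. Each hex value is just the four ASCII bytes of the tag; a small illustrative sketch that double-checks the correspondence (the fourcc helper is hypothetical and not part of llama.h, and it assumes the byte order GCC/Clang use for such literals):

#include <cstdint>

// Pack four ASCII characters in the order the old 'ggjt'-style literals
// produced on GCC/Clang.
constexpr uint32_t fourcc(char a, char b, char c, char d) {
    return (static_cast<uint32_t>(static_cast<unsigned char>(a)) << 24) |
           (static_cast<uint32_t>(static_cast<unsigned char>(b)) << 16) |
           (static_cast<uint32_t>(static_cast<unsigned char>(c)) <<  8) |
            static_cast<uint32_t>(static_cast<unsigned char>(d));
}

static_assert(fourcc('g', 'g', 'j', 't') == 0x67676a74u, "LLAMA_FILE_MAGIC_GGJT");
static_assert(fourcc('g', 'g', 'l', 'a') == 0x67676c61u, "LLAMA_FILE_MAGIC_GGLA");
static_assert(fourcc('g', 'g', 'm', 'f') == 0x67676d66u, "LLAMA_FILE_MAGIC_GGMF");
static_assert(fourcc('g', 'g', 'm', 'l') == 0x67676d6cu, "LLAMA_FILE_MAGIC_GGML");
static_assert(fourcc('g', 'g', 's', 'n') == 0x6767736eu, "LLAMA_FILE_MAGIC_GGSN");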
@@ -40,9 +46,9 @@ extern "C" {
  typedef int llama_token;

  typedef struct llama_token_data {
- llama_token id; // token id
- float logit; // log-odds of the token
- float p; // probability of the token
+ llama_token id; // token id
+ float logit; // log-odds of the token
+ float p; // probability of the token
  } llama_token_data;

  typedef struct llama_token_data_array {
@@ -73,16 +79,16 @@ extern "C" {

  // model file types
  enum llama_ftype {
- LLAMA_FTYPE_ALL_F32 = 0,
- LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
+ LLAMA_FTYPE_ALL_F32 = 0,
+ LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
  LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
- // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
- // LLAMA_FTYPE_MOSTLY_Q4_3 (6) support has been removed
- LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
+ // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
+ // LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed
+ LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
  };

  LLAMA_API struct llama_context_params llama_context_default_params();
@@ -90,6 +96,13 @@ extern "C" {
  LLAMA_API bool llama_mmap_supported();
  LLAMA_API bool llama_mlock_supported();

+ // TODO: not great API - very likely to change
+ // Initialize the llama + ggml backend
+ // Call once at the start of the program
+ LLAMA_API void llama_init_backend();
+
+ LLAMA_API int64_t llama_time_us();
+
  // Various functions for loading a ggml llama model.
  // Allocate (almost) all memory needed for the model.
  // Return NULL on failure
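Callers are now expected to run llama_init_backend() once per process before loading a model (the Ruby binding does this automatically at require time, see data/lib/llama_cpp.rb below). A minimal sketch of the intended call order when using the C API directly; the model path and error handling here are placeholders, not part of the gem or of llama.h:

#include "llama.h"
#include <cstdint>
#include <cstdio>

int main() {
    llama_init_backend();   // once per process, before any model is loaded

    llama_context_params params = llama_context_default_params();
    llama_context * ctx = llama_init_from_file("model.ggjt.bin", params);   // placeholder path
    if (ctx == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    const int64_t t_start_us = llama_time_us();   // the new timing helper
    // ... tokenize, llama_eval(), sample ...
    fprintf(stderr, "elapsed: %lld us\n", (long long) (llama_time_us() - t_start_us));

    llama_free(ctx);
    return 0;
}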
@@ -138,7 +151,7 @@ extern "C" {

  // Set the state reading from the specified address
  // Returns the number of bytes read
- LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src);
+ LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src);

  // Save/load session file
  LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
data/lib/llama_cpp/version.rb CHANGED
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.1.1'
+ VERSION = '0.1.3'

  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'master-6986c78'
+ LLAMA_CPP_VERSION = 'master-66874d4'
  end
data/lib/llama_cpp.rb CHANGED
@@ -106,3 +106,5 @@ module LLaMACpp
  output.join.delete_prefix(spaced_prompt).strip
  end
  end
+
+ LLaMACpp.init_backend
data/sig/llama_cpp.rbs CHANGED
@@ -14,6 +14,7 @@ module LLaMACpp
  LLAMA_FTYPE_MOSTLY_Q5_0: Integer
  LLAMA_FTYPE_MOSTLY_Q5_1: Integer

+ def self?.init_backend: () -> void
  def self?.model_quantize: (input_path: String, output_path: String, ftype: Integer, ?n_threads: Integer) -> void
  def self?.generate: (::LLaMACpp::Context, String, ?n_predict: Integer, ?n_threads: Integer) -> String
  def self?.print_system_info: () -> void
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: llama_cpp
  version: !ruby/object:Gem::Version
- version: 0.1.1
+ version: 0.1.3
  platform: ruby
  authors:
  - yoshoku
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2023-05-21 00:00:00.000000000 Z
+ date: 2023-05-26 00:00:00.000000000 Z
  dependencies: []
  description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
  email:
@@ -27,7 +27,7 @@ files:
  - ext/llama_cpp/llama_cpp.h
  - ext/llama_cpp/src/LICENSE
  - ext/llama_cpp/src/ggml-cuda.h
- - ext/llama_cpp/src/ggml-opencl.c
+ - ext/llama_cpp/src/ggml-opencl.cpp
  - ext/llama_cpp/src/ggml-opencl.h
  - ext/llama_cpp/src/ggml.c
  - ext/llama_cpp/src/ggml.h