llama_cpp 0.1.1 → 0.1.3

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
@@ -1,6 +1,7 @@
  // Defines fileno on msys:
  #ifndef _GNU_SOURCE
  #define _GNU_SOURCE
+ #include <cstddef>
  #include <cstdint>
  #include <cstdio>
  #endif
@@ -11,6 +12,8 @@
  #include "ggml.h"
  #ifdef GGML_USE_CUBLAS
  #include "ggml-cuda.h"
+ #elif defined(GGML_USE_CLBLAST)
+ #include "ggml-opencl.h"
  #endif

  #include <array>
@@ -45,6 +48,7 @@ enum e_model {
  MODEL_65B,
  };

+
  static const size_t MB = 1024*1024;

  // computed for n_ctx == 2048
@@ -110,7 +114,7 @@ struct llama_hparams {
  enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;

  bool operator!=(const llama_hparams & other) const {
- return memcmp(this, &other, sizeof(llama_hparams));
+ return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams)));
  }
  };

@@ -406,6 +410,7 @@ enum llama_file_version {
  LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab
  LLAMA_FILE_VERSION_GGJT_V1, // added padding
  LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format
+ LLAMA_FILE_VERSION_GGJT_V3, // changed Q4 and Q8 quantization format
  };

  struct llama_file_loader {
@@ -424,24 +429,30 @@ struct llama_file_loader {
  }
  void read_magic() {
  uint32_t magic = file.read_u32();
- uint32_t version = 0;

- if (magic != 'ggml') {
- version = file.read_u32();
+ if (magic == LLAMA_FILE_MAGIC_GGML) {
+ file_version = LLAMA_FILE_VERSION_GGML;
+ return;
  }

- if (magic == 'ggml' && version == 0) {
- file_version = LLAMA_FILE_VERSION_GGML;
- } else if (magic == 'ggmf' && version == 1) {
- file_version = LLAMA_FILE_VERSION_GGMF_V1;
- } else if (magic == 'ggjt' && version == 1) {
- file_version = LLAMA_FILE_VERSION_GGJT_V1;
- } else if (magic == 'ggjt' && version == 2) {
- file_version = LLAMA_FILE_VERSION_GGJT_V2;
- } else {
- throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
- magic, version);
+ uint32_t version = file.read_u32();
+
+ switch (magic) {
+ case LLAMA_FILE_MAGIC_GGMF:
+ switch (version) {
+ case 1: file_version = LLAMA_FILE_VERSION_GGMF_V1; return;
+ }
+ break;
+ case LLAMA_FILE_MAGIC_GGJT:
+ switch (version) {
+ case 1: file_version = LLAMA_FILE_VERSION_GGJT_V1; return;
+ case 2: file_version = LLAMA_FILE_VERSION_GGJT_V2; return;
+ case 3: file_version = LLAMA_FILE_VERSION_GGJT_V3; return;
+ }
  }
+
+ throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
+ magic, version);
  }
  void read_hparams() {
  hparams.n_vocab = file.read_u32();
@@ -499,7 +510,7 @@ struct llama_file_loader {

  if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
  // skip to the next multiple of 32 bytes
- file.seek(-file.tell() & 31, SEEK_CUR);
+ file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
  }
  shard.file_idx = file_idx;
  shard.file_off = file.tell();
@@ -574,7 +585,7 @@ struct llama_file_saver {
  file.write_u32(new_type);
  file.write_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * tensor.ne.size());
  file.write_raw(tensor.name.data(), tensor.name.size());
- file.seek(-file.tell() & 31, SEEK_CUR);
+ file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
  LLAMA_ASSERT(new_size == llama_calc_tensor_size(tensor.ne, new_type));
  file.write_raw(new_data, new_size);
  }
@@ -641,7 +652,7 @@ struct llama_model_loader {
  }
  }

- struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne) {
+ struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
  auto it = tensors_map.name_to_idx.find(name);
  if (it == tensors_map.name_to_idx.end()) {
  throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
@@ -652,10 +663,10 @@ struct llama_model_loader {
  name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
  }

- return get_tensor_for(lt);
+ return get_tensor_for(lt, backend);
  }

- struct ggml_tensor * get_tensor_for(llama_load_tensor & lt) {
+ struct ggml_tensor * get_tensor_for(llama_load_tensor & lt, ggml_backend backend) {
  struct ggml_tensor * tensor;
  if (lt.ne.size() == 2) {
  tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
@@ -665,6 +676,7 @@ struct llama_model_loader {
  }
  ggml_set_name(tensor, lt.name.c_str());
  LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
+ tensor->backend = backend;
  lt.ggml_tensor = tensor;
  num_ggml_tensors_created++;
  return tensor;
@@ -678,12 +690,16 @@ struct llama_model_loader {

  void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
  size_t data_size = 0;
+ size_t prefetch_size = 0;
  for (const llama_load_tensor & lt : tensors_map.tensors) {
  data_size += lt.size;
+ if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
+ prefetch_size += lt.size;
+ }
  }

  if (use_mmap) {
- mapping.reset(new llama_mmap(&file_loaders.at(0)->file));
+ mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
  if (!lmlock) {
  // Don't call the callback since the actual loading will be lazy
  // and we can't measure it.
@@ -696,6 +712,9 @@ struct llama_model_loader {

  size_t done_size = 0;
  for (llama_load_tensor & lt : tensors_map.tensors) {
+ if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) {
+ continue;
+ }
  if (progress_callback) {
  progress_callback((float) done_size / data_size, progress_callback_user_data);
  }
@@ -708,9 +727,6 @@ struct llama_model_loader {
  lmlock->grow_to(done_size);
  }
  }
- if (progress_callback) {
- progress_callback(1.0f, progress_callback_user_data);
- }
  }

  void load_data_for(llama_load_tensor & lt) {
@@ -835,6 +851,21 @@ bool llama_mlock_supported() {
  return llama_mlock::SUPPORTED;
  }

+ void llama_init_backend() {
+ ggml_time_init();
+
+ // needed to initialize f16 tables
+ {
+ struct ggml_init_params params = { 0, NULL, false };
+ struct ggml_context * ctx = ggml_init(params);
+ ggml_free(ctx);
+ }
+ }
+
+ int64_t llama_time_us() {
+ return ggml_time_us();
+ }
+
  //
  // model loading
  //
@@ -844,7 +875,8 @@ static const char *llama_file_version_name(llama_file_version version) {
  case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)";
  case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
  case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1405)";
- case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (latest)";
+ case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (pre #1508)";
+ case LLAMA_FILE_VERSION_GGJT_V3: return "ggjt v3 (latest)";
  }

  return "unknown";
@@ -924,11 +956,19 @@ static void llama_model_load_internal(
  fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
  }

- if (file_version != LLAMA_FILE_VERSION_GGJT_V2) {
+ if (file_version < LLAMA_FILE_VERSION_GGJT_V2) {
  if (hparams.ftype != LLAMA_FTYPE_ALL_F32 &&
  hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 &&
  hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) {
- throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1305)");
+ throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)");
+ }
+ }
+
+ if (file_version < LLAMA_FILE_VERSION_GGJT_V3) {
+ if (hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
+ hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_1 ||
+ hparams.ftype == LLAMA_FTYPE_MOSTLY_Q8_0) {
+ throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)");
  }
  }

@@ -941,27 +981,7 @@ static void llama_model_load_internal(
  size_t ctx_size;
  size_t mmapped_size;
  ml->calc_sizes(&ctx_size, &mmapped_size);
- fprintf(stderr, "%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/1024.0/1024.0);
-
- // print memory requirements
- {
- const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
-
- // this is the total memory required to run the inference
- const size_t mem_required =
- ctx_size +
- mmapped_size +
- MEM_REQ_SCRATCH0().at(model.type) +
- MEM_REQ_SCRATCH1().at(model.type) +
- MEM_REQ_EVAL().at(model.type);
-
- // this is the memory required by one llama_state
- const size_t mem_required_state =
- scale*MEM_REQ_KV_SELF().at(model.type);
-
- fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
- mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
- }
+ fprintf(stderr, "%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0);

  // create the ggml context
  {
@@ -983,7 +1003,14 @@ static void llama_model_load_internal(
  }
  }

+ #ifdef GGML_USE_CUBLAS
+ #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CUDA
+ #else
+ #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
+ #endif
+
  // prepare memory for the weights
+ size_t vram_total = 0;
  {
  const uint32_t n_embd = hparams.n_embd;
  const uint32_t n_layer = hparams.n_layer;
@@ -991,33 +1018,87 @@ static void llama_model_load_internal(

  ml->ggml_ctx = ctx;

- model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab});
- model.norm = ml->get_tensor("norm.weight", {n_embd});
- model.output = ml->get_tensor("output.weight", {n_embd, n_vocab});
+ model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU);
+ model.norm = ml->get_tensor("norm.weight", {n_embd}, GGML_BACKEND_CPU);
+
+ // "output" tensor
+ {
+ ggml_backend backend_output;
+ if (n_gpu_layers > int(n_layer)) { // NOLINT
+ backend_output = LLAMA_BACKEND_OFFLOAD;
+ } else {
+ backend_output = GGML_BACKEND_CPU;
+ }
+
+ model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, backend_output);
+ }
+
+ const int i_gpu_start = n_layer - n_gpu_layers;

  model.layers.resize(n_layer);
  for (uint32_t i = 0; i < n_layer; ++i) {
+ const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+
  auto & layer = model.layers[i];

  std::string layers_i = "layers." + std::to_string(i);

- layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd});
+ layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);
+
+ layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend);
+ layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend);
+ layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend);
+ layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend);

- layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd});
- layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd});
- layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd});
- layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd});
+ layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);

- layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd});
+ layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend);
+ layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend);
+ layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend);

- layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff});
- layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd});
- layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff});
+ if (backend == GGML_BACKEND_CUDA) {
+ vram_total +=
+ ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
+ ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.attention_norm) +
+ ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
+ }
  }
  }

  ml->done_getting_tensors();

+ // print memory requirements
+ {
+ const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
+
+ // this is the total memory required to run the inference
+ const size_t mem_required =
+ ctx_size +
+ mmapped_size - vram_total + // weights in VRAM not in memory
+ MEM_REQ_SCRATCH0().at(model.type) +
+ MEM_REQ_SCRATCH1().at(model.type) +
+ MEM_REQ_EVAL().at(model.type);
+
+ // this is the memory required by one llama_state
+ const size_t mem_required_state =
+ scale*MEM_REQ_KV_SELF().at(model.type);
+
+ fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
+ mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
+
+ #ifdef GGML_USE_CUBLAS
+ const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
+
+ fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
+ if (n_gpu_layers > (int) hparams.n_layer) {
+ fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
+ }
+ fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+ #elif !defined(GGML_USE_CLBLAST)
+ (void) n_gpu_layers;
+ #endif
+ }
+
  // populate `tensors_by_name`
  for (llama_load_tensor & lt : ml->tensors_map.tensors) {
  model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
@@ -1025,37 +1106,61 @@ static void llama_model_load_internal(

  ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);

- model.mapping = std::move(ml->mapping);
  #ifdef GGML_USE_CUBLAS
+ {
+ size_t done_size = 0;
+ size_t data_size = 0;
+ for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+ data_size += lt.size;
+ if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
+ done_size += lt.size;
+ }
+ }
+ for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+ if (lt.ggml_tensor->backend != GGML_BACKEND_CUDA) {
+ continue;
+ }
+ if (progress_callback) {
+ progress_callback((float) done_size / data_size, progress_callback_user_data);
+ }
+ ggml_cuda_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
+ done_size += lt.size;
+ }
+ }
+ #elif defined(GGML_USE_CLBLAST)
  {
  const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));

- fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
+ fprintf(stderr, "ggml_opencl: offloading %d layers to GPU\n", n_gpu);

  size_t vram_total = 0;

  for (int i = 0; i < n_gpu; ++i) {
  const auto & layer = model.layers[i];

- ggml_cuda_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
- ggml_cuda_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
- ggml_cuda_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
- ggml_cuda_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
- ggml_cuda_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
- ggml_cuda_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
- ggml_cuda_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
+ ggml_cl_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
+ ggml_cl_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
+ ggml_cl_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
+ ggml_cl_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
+ ggml_cl_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
+ ggml_cl_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
+ ggml_cl_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
  }
  if (n_gpu_layers > (int) hparams.n_layer) {
- fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
- ggml_cuda_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
+ fprintf(stderr, "ggml_opencl: offloading output layer to GPU\n");
+ ggml_cl_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
  }

- fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+ fprintf(stderr, "ggml_opencl: total VRAM used: %zu MB\n", vram_total / 1024 / 1024);
  }
- #else
- (void) n_gpu_layers;
  #endif

+ if (progress_callback) {
+ progress_callback(1.0f, progress_callback_user_data);
+ }
+
+ model.mapping = std::move(ml->mapping);
+
  // loading time will be recalculate after the first eval, so
  // we take page faults deferred by mmap() into consideration
  lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
@@ -1153,10 +1258,8 @@ static bool llama_eval_internal(
  {
  cur = ggml_rms_norm(ctx0, inpL);

- // cur = attention_norm*cur
- cur = ggml_mul(ctx0,
- ggml_repeat(ctx0, model.layers[il].attention_norm, cur),
- cur);
+ // cur = cur*attention_norm(broadcasted)
+ cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm);
  }

  // self-attention
@@ -1263,10 +1366,8 @@ static bool llama_eval_internal(
  {
  cur = ggml_rms_norm(ctx0, inpFF);

- // cur = ffn_norm*cur
- cur = ggml_mul(ctx0,
- ggml_repeat(ctx0, model.layers[il].ffn_norm, cur),
- cur);
+ // cur = cur*ffn_norm(broadcasted)
+ cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
  }

  struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
@@ -1303,10 +1404,8 @@ static bool llama_eval_internal(

  inpL = ggml_rms_norm(ctx0, inpL);

- // inpL = norm*inpL
- inpL = ggml_mul(ctx0,
- ggml_repeat(ctx0, model.norm, inpL),
- inpL);
+ // inpL = inpL*norm(broadcasted)
+ inpL = ggml_mul(ctx0, inpL, model.norm);

  embeddings = inpL;
  }
@@ -2130,7 +2229,7 @@ struct llama_context * llama_init_from_file(
  unsigned * cur_percentage_p = (unsigned *) ctx;
  unsigned percentage = (unsigned) (100 * progress);
  while (percentage > *cur_percentage_p) {
- ++*cur_percentage_p;
+ *cur_percentage_p = percentage;
  fprintf(stderr, ".");
  fflush(stderr);
  if (percentage >= 100) {
@@ -2223,7 +2322,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
  {
  uint32_t magic;
  fin.read((char *) &magic, sizeof(magic));
- if (magic != 'ggla') {
+ if (magic != LLAMA_FILE_MAGIC_GGLA) {
  fprintf(stderr, "%s: bad file magic\n", __func__);
  return 1;
  }
@@ -2287,7 +2386,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *

  // maybe this should in llama_model_loader
  if (model_loader->use_mmap) {
- model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ false));
+ model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0));
  }
  }

@@ -2380,7 +2479,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
  }
  size_t idx = model_loader->tensors_map.name_to_idx[base_name];
  llama_load_tensor & lt = model_loader->tensors_map.tensors[idx];
- base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] });
+ base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU);
  lt.data = (uint8_t *) lt.ggml_tensor->data;
  model_loader->load_data_for(lt);
  lt.ggml_tensor->data = lt.data;
@@ -2606,8 +2705,8 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
  }

  // Sets the state reading from the specified source address
- size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
- const uint8_t * inp = src;
+ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
+ uint8_t * inp = src;

  // set rng
  {
@@ -19,10 +19,16 @@
  # define LLAMA_API
  #endif

- #define LLAMA_FILE_VERSION 2
- #define LLAMA_FILE_MAGIC 'ggjt'
- #define LLAMA_FILE_MAGIC_UNVERSIONED 'ggml'
- #define LLAMA_SESSION_MAGIC 'ggsn'
+ #define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt'
+ #define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
+ #define LLAMA_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf'
+ #define LLAMA_FILE_MAGIC_GGML 0x67676d6cu // 'ggml'
+ #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
+
+ #define LLAMA_FILE_VERSION 3
+ #define LLAMA_FILE_MAGIC LLAMA_FILE_MAGIC_GGJT
+ #define LLAMA_FILE_MAGIC_UNVERSIONED LLAMA_FILE_MAGIC_GGML
+ #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
  #define LLAMA_SESSION_VERSION 1

  #ifdef __cplusplus
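Aside, not part of the package: the new hex magics in this hunk encode the same four ASCII bytes as the old multi-character literals (e.g. 'ggjt'), whose values are implementation-defined in C and C++. A minimal standalone sketch that checks this for the GGJT magic, assuming only the constants shown above:

```cpp
// Illustrative only: 0x67676a74 is the big-endian ASCII encoding of "ggjt".
#include <cassert>
#include <cstdint>

int main() {
    const uint32_t ggjt = (uint32_t('g') << 24) | (uint32_t('g') << 16) |
                          (uint32_t('j') <<  8) |  uint32_t('t');
    assert(ggjt == 0x67676a74u); // matches LLAMA_FILE_MAGIC_GGJT above
    return 0;
}
```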
@@ -40,9 +46,9 @@ extern "C" {
  typedef int llama_token;

  typedef struct llama_token_data {
- llama_token id; // token id
- float logit; // log-odds of the token
- float p; // probability of the token
+ llama_token id; // token id
+ float logit; // log-odds of the token
+ float p; // probability of the token
  } llama_token_data;

  typedef struct llama_token_data_array {
@@ -73,16 +79,16 @@ extern "C" {

  // model file types
  enum llama_ftype {
- LLAMA_FTYPE_ALL_F32 = 0,
- LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
+ LLAMA_FTYPE_ALL_F32 = 0,
+ LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
  LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
- // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
- // LLAMA_FTYPE_MOSTLY_Q4_3 (6) support has been removed
- LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
+ // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
+ // LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed
+ LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
  };

  LLAMA_API struct llama_context_params llama_context_default_params();
@@ -90,6 +96,13 @@ extern "C" {
  LLAMA_API bool llama_mmap_supported();
  LLAMA_API bool llama_mlock_supported();

+ // TODO: not great API - very likely to change
+ // Initialize the llama + ggml backend
+ // Call once at the start of the program
+ LLAMA_API void llama_init_backend();
+
+ LLAMA_API int64_t llama_time_us();
+
  // Various functions for loading a ggml llama model.
  // Allocate (almost) all memory needed for the model.
  // Return NULL on failure
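A minimal caller sketch for the new entry points declared above, assuming the pre-existing llama_init_from_file / llama_free functions from the same header; the model path is a placeholder:

```cpp
// Illustrative usage only, not shipped with the gem.
#include "llama.h"
#include <cstdint>
#include <cstdio>

int main() {
    llama_init_backend();                 // one-time llama + ggml setup (timers, f16 tables)
    const int64_t t0 = llama_time_us();   // microsecond timestamp from ggml

    llama_context_params params = llama_context_default_params();
    llama_context * ctx = llama_init_from_file("model.ggjt.bin", params); // placeholder path
    if (ctx == NULL) {
        std::fprintf(stderr, "failed to load model\n");
        return 1;
    }
    std::fprintf(stderr, "load took %lld us\n", (long long) (llama_time_us() - t0));
    llama_free(ctx);
    return 0;
}
```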
@@ -138,7 +151,7 @@ extern "C" {

  // Set the state reading from the specified address
  // Returns the number of bytes read
- LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src);
+ LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src);

  // Save/load session file
  LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.1.1'
+ VERSION = '0.1.3'

  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'master-6986c78'
+ LLAMA_CPP_VERSION = 'master-66874d4'
  end
data/lib/llama_cpp.rb CHANGED
@@ -106,3 +106,5 @@ module LLaMACpp
  output.join.delete_prefix(spaced_prompt).strip
  end
  end
+
+ LLaMACpp.init_backend
data/sig/llama_cpp.rbs CHANGED
@@ -14,6 +14,7 @@ module LLaMACpp
  LLAMA_FTYPE_MOSTLY_Q5_0: Integer
  LLAMA_FTYPE_MOSTLY_Q5_1: Integer

+ def self?.init_backend: () -> void
  def self?.model_quantize: (input_path: String, output_path: String, ftype: Integer, ?n_threads: Integer) -> void
  def self?.generate: (::LLaMACpp::Context, String, ?n_predict: Integer, ?n_threads: Integer) -> String
  def self?.print_system_info: () -> void
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: llama_cpp
  version: !ruby/object:Gem::Version
- version: 0.1.1
+ version: 0.1.3
  platform: ruby
  authors:
  - yoshoku
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2023-05-21 00:00:00.000000000 Z
+ date: 2023-05-26 00:00:00.000000000 Z
  dependencies: []
  description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
  email:
@@ -27,7 +27,7 @@ files:
  - ext/llama_cpp/llama_cpp.h
  - ext/llama_cpp/src/LICENSE
  - ext/llama_cpp/src/ggml-cuda.h
- - ext/llama_cpp/src/ggml-opencl.c
+ - ext/llama_cpp/src/ggml-opencl.cpp
  - ext/llama_cpp/src/ggml-opencl.h
  - ext/llama_cpp/src/ggml.c
  - ext/llama_cpp/src/ggml.h