llama_cpp 0.1.1 → 0.1.2

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -190,7 +190,7 @@
 #define GGML_FILE_MAGIC 0x67676d6c // "ggml"
 #define GGML_FILE_VERSION 1

-#define GGML_QNT_VERSION 1 // bump this on quantization format changes
+#define GGML_QNT_VERSION 2 // bump this on quantization format changes
 #define GGML_QNT_VERSION_FACTOR 1000 // do not change this

 #define GGML_MAX_DIMS 4
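The quantization-format bump above is the reason for most of the loader changes later in this diff. As a hedged sketch of the usual ggml convention for these two macros (my reading of the ggml headers, not something shown in this diff), the quantization version is folded into the stored ftype field via GGML_QNT_VERSION_FACTOR:

    // hedged sketch of the GGML_QNT_VERSION / GGML_QNT_VERSION_FACTOR convention
    uint32_t stored      = ftype + GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR; // writer side
    uint32_t qnt_version = stored / GGML_QNT_VERSION_FACTOR;                   // reader side
    uint32_t ftype_only  = stored % GGML_QNT_VERSION_FACTOR;                   // reader side
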
@@ -313,6 +313,7 @@ extern "C" {
         GGML_OP_ROPE,
         GGML_OP_ROPE_BACK,
         GGML_OP_ALIBI,
+        GGML_OP_CLAMP,
         GGML_OP_CONV_1D_1S,
         GGML_OP_CONV_1D_2S,

@@ -849,7 +850,7 @@ extern "C" {
         int n_past);

     // in-place, returns view(a)
-    GGML_API struct ggml_tensor * gml_diag_mask_zero_inplace(
+    GGML_API struct ggml_tensor * ggml_diag_mask_zero_inplace(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         int n_past);
@@ -897,7 +898,16 @@ extern "C" {
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         int n_past,
-        int n_head);
+        int n_head,
+        float bias_max);
+
+    // clamp
+    // in-place, returns view(a)
+    struct ggml_tensor * ggml_clamp(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        float min,
+        float max);

     // padding = 1
     // TODO: we don't support extra parameters for now
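For orientation, a hedged usage sketch of the two operators touched above: ggml_alibi gains a bias_max argument and ggml_clamp is new. The context, tensor shape, and constants here are placeholders of mine, not taken from the diff.

    // sketch only: ctx is a ggml_context, scores an f32 tensor built elsewhere
    struct ggml_tensor * scores = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_past + n_tokens, n_head);
    scores = ggml_alibi(ctx, scores, n_past, n_head, /* bias_max */ 8.0f); // in-place, returns view(a)
    scores = ggml_clamp(ctx, scores, -30.0f, 30.0f);                       // in-place, returns view(a)
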
@@ -101,12 +101,12 @@ struct llama_file {
         LLAMA_ASSERT(ret == 0); // same
     }

-    void read_raw(void * ptr, size_t size) {
-        if (size == 0) {
+    void read_raw(void * ptr, size_t len) const {
+        if (len == 0) {
             return;
         }
         errno = 0;
-        std::size_t ret = std::fread(ptr, size, 1, fp);
+        std::size_t ret = std::fread(ptr, len, 1, fp);
         if (ferror(fp)) {
             throw std::runtime_error(format("read error: %s", strerror(errno)));
         }
@@ -127,12 +127,12 @@ struct llama_file {
         return std::string(chars.data(), len);
     }

-    void write_raw(const void * ptr, size_t size) {
-        if (size == 0) {
+    void write_raw(const void * ptr, size_t len) const {
+        if (len == 0) {
             return;
         }
         errno = 0;
-        size_t ret = std::fwrite(ptr, size, 1, fp);
+        size_t ret = std::fwrite(ptr, len, 1, fp);
         if (ret != 1) {
             throw std::runtime_error(format("write error: %s", strerror(errno)));
         }
@@ -172,7 +172,7 @@ struct llama_mmap {
 #ifdef _POSIX_MAPPED_FILES
     static constexpr bool SUPPORTED = true;

-    llama_mmap(struct llama_file * file, bool prefetch = true) {
+    llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */) {
         size = file->size;
         int fd = fileno(file->fp);
         int flags = MAP_SHARED;
@@ -184,9 +184,9 @@ struct llama_mmap {
             throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
         }

-        if (prefetch) {
+        if (prefetch > 0) {
             // Advise the kernel to preload the mapped memory
-            if (madvise(addr, file->size, MADV_WILLNEED)) {
+            if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) {
                 fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
                         strerror(errno));
             }
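The llama_mmap constructor now takes a prefetch size in bytes instead of a bool, so callers can ask for partial readahead or none at all. A caller-side sketch with a hypothetical file name and sizes, not taken from the diff:

    llama_file file("ggml-model.bin", "rb");
    llama_mmap eager(&file);                      // default (size_t) -1: MADV_WILLNEED on the whole mapping
    llama_mmap partial(&file, 64u * 1024 * 1024); // advise only the first 64 MB
    llama_mmap lazy(&file, /* prefetch */ 0);     // 0 bytes: map only, no madvise
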
@@ -267,9 +267,9 @@ struct llama_mlock {
         }
     }

-    void init(void * addr) {
-        LLAMA_ASSERT(this->addr == NULL && this->size == 0);
-        this->addr = addr;
+    void init(void * ptr) {
+        LLAMA_ASSERT(addr == NULL && size == 0);
+        addr = ptr;
     }

     void grow_to(size_t target_size) {
@@ -340,14 +340,14 @@ struct llama_mlock {
         return (size_t) si.dwPageSize;
     }

-    bool raw_lock(void * addr, size_t size) {
+    bool raw_lock(void * ptr, size_t len) {
         for (int tries = 1; ; tries++) {
-            if (VirtualLock(addr, size)) {
+            if (VirtualLock(ptr, len)) {
                 return true;
             }
             if (tries == 2) {
                 fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
-                    size, this->size, llama_format_win_err(GetLastError()).c_str());
+                    len, size, llama_format_win_err(GetLastError()).c_str());
                 return false;
             }

@@ -363,7 +363,7 @@
         // is equal to the number of pages in its minimum working set minus
         // a small overhead."
         // Hopefully a megabyte is enough overhead:
-        size_t increment = size + 1048576;
+        size_t increment = len + 1048576;
         // The minimum must be <= the maximum, so we need to increase both:
         min_ws_size += increment;
         max_ws_size += increment;
@@ -375,8 +375,8 @@
         }
     }

-    void raw_unlock(void * addr, size_t size) {
-        if (!VirtualUnlock(addr, size)) {
+    void raw_unlock(void * ptr, size_t len) {
+        if (!VirtualUnlock(ptr, len)) {
             fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
                     llama_format_win_err(GetLastError()).c_str());
         }
@@ -388,12 +388,12 @@
         return (size_t) 65536;
     }

-    bool raw_lock(const void * addr, size_t size) {
+    bool raw_lock(const void * addr, size_t len) {
         fprintf(stderr, "warning: mlock not supported on this system\n");
         return false;
     }

-    void raw_unlock(const void * addr, size_t size) {}
+    void raw_unlock(const void * addr, size_t len) {}
 #endif
 };

@@ -404,10 +404,10 @@ struct llama_buffer {

     llama_buffer() = default;

-    void resize(size_t size) {
+    void resize(size_t len) {
         delete[] addr;
-        addr = new uint8_t[size];
-        this->size = size;
+        addr = new uint8_t[len];
+        size = len;
     }

     ~llama_buffer() {
@@ -1,6 +1,7 @@
 // Defines fileno on msys:
 #ifndef _GNU_SOURCE
 #define _GNU_SOURCE
+#include <cstddef>
 #include <cstdint>
 #include <cstdio>
 #endif
@@ -45,6 +46,7 @@ enum e_model {
     MODEL_65B,
 };

+
 static const size_t MB = 1024*1024;

 // computed for n_ctx == 2048
@@ -110,7 +112,7 @@ struct llama_hparams {
     enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;

     bool operator!=(const llama_hparams & other) const {
-        return memcmp(this, &other, sizeof(llama_hparams));
+        return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams)));
     }
 };

@@ -406,6 +408,7 @@ enum llama_file_version {
     LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab
     LLAMA_FILE_VERSION_GGJT_V1, // added padding
     LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format
+    LLAMA_FILE_VERSION_GGJT_V3, // changed Q4 and Q8 quantization format
 };

 struct llama_file_loader {
@@ -424,24 +427,30 @@ struct llama_file_loader {
     }
     void read_magic() {
         uint32_t magic = file.read_u32();
-        uint32_t version = 0;

-        if (magic != 'ggml') {
-            version = file.read_u32();
+        if (magic == LLAMA_FILE_MAGIC_GGML) {
+            file_version = LLAMA_FILE_VERSION_GGML;
+            return;
         }

-        if (magic == 'ggml' && version == 0) {
-            file_version = LLAMA_FILE_VERSION_GGML;
-        } else if (magic == 'ggmf' && version == 1) {
-            file_version = LLAMA_FILE_VERSION_GGMF_V1;
-        } else if (magic == 'ggjt' && version == 1) {
-            file_version = LLAMA_FILE_VERSION_GGJT_V1;
-        } else if (magic == 'ggjt' && version == 2) {
-            file_version = LLAMA_FILE_VERSION_GGJT_V2;
-        } else {
-            throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
-                magic, version);
+        uint32_t version = file.read_u32();
+
+        switch (magic) {
+            case LLAMA_FILE_MAGIC_GGMF:
+                switch (version) {
+                    case 1: file_version = LLAMA_FILE_VERSION_GGMF_V1; return;
+                }
+                break;
+            case LLAMA_FILE_MAGIC_GGJT:
+                switch (version) {
+                    case 1: file_version = LLAMA_FILE_VERSION_GGJT_V1; return;
+                    case 2: file_version = LLAMA_FILE_VERSION_GGJT_V2; return;
+                    case 3: file_version = LLAMA_FILE_VERSION_GGJT_V3; return;
+                }
         }
+
+        throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
+            magic, version);
     }
     void read_hparams() {
         hparams.n_vocab = file.read_u32();
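The rewritten read_magic compares against LLAMA_FILE_MAGIC_* constants rather than multi-character literals. Those constants are defined elsewhere in the package, not in this hunk; presumably they are simply the four-character codes written as integers, along these lines:

    #define LLAMA_FILE_MAGIC_GGML 0x67676d6cu // 'ggml'
    #define LLAMA_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf'
    #define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt'
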
@@ -499,7 +508,7 @@ struct llama_file_loader {

         if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
             // skip to the next multiple of 32 bytes
-            file.seek(-file.tell() & 31, SEEK_CUR);
+            file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
         }
         shard.file_idx = file_idx;
         shard.file_off = file.tell();
@@ -574,7 +583,7 @@ struct llama_file_saver {
         file.write_u32(new_type);
         file.write_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * tensor.ne.size());
         file.write_raw(tensor.name.data(), tensor.name.size());
-        file.seek(-file.tell() & 31, SEEK_CUR);
+        file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
         LLAMA_ASSERT(new_size == llama_calc_tensor_size(tensor.ne, new_type));
         file.write_raw(new_data, new_size);
     }
@@ -641,7 +650,7 @@ struct llama_model_loader {
         }
     }

-    struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne) {
+    struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
         auto it = tensors_map.name_to_idx.find(name);
         if (it == tensors_map.name_to_idx.end()) {
             throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
@@ -652,10 +661,10 @@
                 name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
         }

-        return get_tensor_for(lt);
+        return get_tensor_for(lt, backend);
     }

-    struct ggml_tensor * get_tensor_for(llama_load_tensor & lt) {
+    struct ggml_tensor * get_tensor_for(llama_load_tensor & lt, ggml_backend backend) {
         struct ggml_tensor * tensor;
         if (lt.ne.size() == 2) {
             tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
@@ -665,6 +674,7 @@ struct llama_model_loader {
         }
         ggml_set_name(tensor, lt.name.c_str());
         LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
+        tensor->backend = backend;
         lt.ggml_tensor = tensor;
         num_ggml_tensors_created++;
         return tensor;
@@ -678,12 +688,16 @@ struct llama_model_loader {

     void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
         size_t data_size = 0;
+        size_t prefetch_size = 0;
         for (const llama_load_tensor & lt : tensors_map.tensors) {
             data_size += lt.size;
+            if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
+                prefetch_size += lt.size;
+            }
         }

         if (use_mmap) {
-            mapping.reset(new llama_mmap(&file_loaders.at(0)->file));
+            mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
             if (!lmlock) {
                 // Don't call the callback since the actual loading will be lazy
                 // and we can't measure it.
@@ -696,6 +710,9 @@

         size_t done_size = 0;
         for (llama_load_tensor & lt : tensors_map.tensors) {
+            if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) {
+                continue;
+            }
             if (progress_callback) {
                 progress_callback((float) done_size / data_size, progress_callback_user_data);
             }
@@ -708,9 +725,6 @@
                 lmlock->grow_to(done_size);
             }
         }
-        if (progress_callback) {
-            progress_callback(1.0f, progress_callback_user_data);
-        }
     }

     void load_data_for(llama_load_tensor & lt) {
@@ -835,6 +849,21 @@ bool llama_mlock_supported() {
     return llama_mlock::SUPPORTED;
 }

+void llama_init_backend() {
+    ggml_time_init();
+
+    // needed to initialize f16 tables
+    {
+        struct ggml_init_params params = { 0, NULL, false };
+        struct ggml_context * ctx = ggml_init(params);
+        ggml_free(ctx);
+    }
+}
+
+int64_t llama_time_us() {
+    return ggml_time_us();
+}
+
 //
 // model loading
 //
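The two new public entry points are thin wrappers over ggml. A hedged caller-side sketch follows; llama_init_from_file appears later in this diff, while llama_context_default_params and the file name are assumptions of mine rather than part of this hunk:

    llama_init_backend(); // one-time ggml / f16-table initialization, before loading any model

    const int64_t t_start_us = llama_time_us();
    struct llama_context * lctx = llama_init_from_file("ggml-model-q5_1.bin",
                                                       llama_context_default_params());
    fprintf(stderr, "load time: %.2f ms\n", (llama_time_us() - t_start_us) / 1000.0);
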
@@ -844,7 +873,8 @@ static const char *llama_file_version_name(llama_file_version version) {
         case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)";
         case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
         case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1405)";
-        case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (latest)";
+        case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (pre #1508)";
+        case LLAMA_FILE_VERSION_GGJT_V3: return "ggjt v3 (latest)";
     }

     return "unknown";
@@ -924,11 +954,19 @@ static void llama_model_load_internal(
         fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
     }

-    if (file_version != LLAMA_FILE_VERSION_GGJT_V2) {
+    if (file_version < LLAMA_FILE_VERSION_GGJT_V2) {
         if (hparams.ftype != LLAMA_FTYPE_ALL_F32 &&
             hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 &&
             hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) {
-            throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1305)");
+            throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)");
+        }
+    }
+
+    if (file_version < LLAMA_FILE_VERSION_GGJT_V3) {
+        if (hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
+            hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_1 ||
+            hparams.ftype == LLAMA_FTYPE_MOSTLY_Q8_0) {
+            throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)");
         }
     }

@@ -941,27 +979,7 @@
     size_t ctx_size;
     size_t mmapped_size;
     ml->calc_sizes(&ctx_size, &mmapped_size);
-    fprintf(stderr, "%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/1024.0/1024.0);
-
-    // print memory requirements
-    {
-        const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
-
-        // this is the total memory required to run the inference
-        const size_t mem_required =
-            ctx_size +
-            mmapped_size +
-            MEM_REQ_SCRATCH0().at(model.type) +
-            MEM_REQ_SCRATCH1().at(model.type) +
-            MEM_REQ_EVAL().at(model.type);
-
-        // this is the memory required by one llama_state
-        const size_t mem_required_state =
-            scale*MEM_REQ_KV_SELF().at(model.type);
-
-        fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
-                mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
-    }
+    fprintf(stderr, "%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0);

     // create the ggml context
     {
@@ -983,7 +1001,14 @@
         }
     }

+#ifdef GGML_USE_CUBLAS
+#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CUDA
+#else
+#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
+#endif
+
     // prepare memory for the weights
+    size_t vram_total = 0;
     {
         const uint32_t n_embd = hparams.n_embd;
         const uint32_t n_layer = hparams.n_layer;
@@ -991,70 +1016,122 @@

         ml->ggml_ctx = ctx;

-        model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab});
-        model.norm = ml->get_tensor("norm.weight", {n_embd});
-        model.output = ml->get_tensor("output.weight", {n_embd, n_vocab});
+        model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU);
+        model.norm = ml->get_tensor("norm.weight", {n_embd}, GGML_BACKEND_CPU);
+
+        // "output" tensor
+        {
+            ggml_backend backend_output;
+            if (n_gpu_layers > int(n_layer)) { // NOLINT
+                backend_output = LLAMA_BACKEND_OFFLOAD;
+            } else {
+                backend_output = GGML_BACKEND_CPU;
+            }
+
+            model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, backend_output);
+        }
+
+        const int i_gpu_start = n_layer - n_gpu_layers;

         model.layers.resize(n_layer);
         for (uint32_t i = 0; i < n_layer; ++i) {
+            const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+
             auto & layer = model.layers[i];

             std::string layers_i = "layers." + std::to_string(i);

-            layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd});
+            layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);
+
+            layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend);
+            layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend);
+            layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend);
+            layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend);

-            layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd});
-            layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd});
-            layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd});
-            layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd});
+            layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);

-            layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd});
+            layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend);
+            layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend);
+            layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend);

-            layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff});
-            layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd});
-            layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff});
+            if (backend == GGML_BACKEND_CUDA) {
+                vram_total +=
+                    ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
+                    ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.attention_norm) +
+                    ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
+            }
         }
     }

     ml->done_getting_tensors();

-    // populate `tensors_by_name`
-    for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-        model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
-    }
+    // print memory requirements
+    {
+        const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;

-    ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
+        // this is the total memory required to run the inference
+        const size_t mem_required =
+            ctx_size +
+            mmapped_size - vram_total + // weights in VRAM not in memory
+            MEM_REQ_SCRATCH0().at(model.type) +
+            MEM_REQ_SCRATCH1().at(model.type) +
+            MEM_REQ_EVAL().at(model.type);
+
+        // this is the memory required by one llama_state
+        const size_t mem_required_state =
+            scale*MEM_REQ_KV_SELF().at(model.type);
+
+        fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
+                mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);

-    model.mapping = std::move(ml->mapping);
 #ifdef GGML_USE_CUBLAS
-    {
         const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));

         fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
+        if (n_gpu_layers > (int) hparams.n_layer) {
+            fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
+        }
+        fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+#else
+        (void) n_gpu_layers;
+#endif
+    }

-        size_t vram_total = 0;
+    // populate `tensors_by_name`
+    for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+        model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
+    }

-        for (int i = 0; i < n_gpu; ++i) {
-            const auto & layer = model.layers[i];
+    ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);

-            ggml_cuda_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
-            ggml_cuda_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
-            ggml_cuda_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
-            ggml_cuda_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
-            ggml_cuda_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
-            ggml_cuda_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
-            ggml_cuda_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
+#ifdef GGML_USE_CUBLAS
+    {
+        size_t done_size = 0;
+        size_t data_size = 0;
+        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+            data_size += lt.size;
+            if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
+                done_size += lt.size;
+            }
         }
-        if (n_gpu_layers > (int) hparams.n_layer) {
-            fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
-            ggml_cuda_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
+        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+            if (lt.ggml_tensor->backend != GGML_BACKEND_CUDA) {
+                continue;
+            }
+            if (progress_callback) {
+                progress_callback((float) done_size / data_size, progress_callback_user_data);
+            }
+            ggml_cuda_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
+            done_size += lt.size;
         }
+    }
+#endif // GGML_USE_CUBLAS

-        fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+    if (progress_callback) {
+        progress_callback(1.0f, progress_callback_user_data);
     }
-#else
-    (void) n_gpu_layers;
-#endif
+
+    model.mapping = std::move(ml->mapping);

     // loading time will be recalculate after the first eval, so
     // we take page faults deferred by mmap() into consideration
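To make the layer-offload split above concrete, a toy calculation with assumed numbers, not values from the diff:

    // illustration only
    const uint32_t n_layer      = 32;
    const int      n_gpu_layers = 20;
    const int      i_gpu_start  = n_layer - n_gpu_layers; // 12
    // layer i is offloaded iff int(i) >= i_gpu_start, i.e. layers 12..31 here;
    // the "output" tensor is offloaded only when n_gpu_layers > int(n_layer).
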
@@ -1153,10 +1230,8 @@ static bool llama_eval_internal(
         {
             cur = ggml_rms_norm(ctx0, inpL);

-            // cur = attention_norm*cur
-            cur = ggml_mul(ctx0,
-                        ggml_repeat(ctx0, model.layers[il].attention_norm, cur),
-                        cur);
+            // cur = cur*attention_norm(broadcasted)
+            cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm);
         }

         // self-attention
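The same rewrite appears twice more below: ggml_mul now broadcasts its second operand across the rows of the first, so the explicit ggml_repeat of the norm weights is no longer needed. A shape-level sketch, with tensor names of my choosing:

    // a: {n_embd, n_tokens}, b: {n_embd}  ->  ggml_mul result: {n_embd, n_tokens}
    // old: cur = ggml_mul(ctx0, ggml_repeat(ctx0, b, a), a);
    // new: cur = ggml_mul(ctx0, a, b); // b is broadcast over every row of a
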
@@ -1263,10 +1338,8 @@ static bool llama_eval_internal(
         {
             cur = ggml_rms_norm(ctx0, inpFF);

-            // cur = ffn_norm*cur
-            cur = ggml_mul(ctx0,
-                        ggml_repeat(ctx0, model.layers[il].ffn_norm, cur),
-                        cur);
+            // cur = cur*ffn_norm(broadcasted)
+            cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
         }

         struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
@@ -1303,10 +1376,8 @@

         inpL = ggml_rms_norm(ctx0, inpL);

-        // inpL = norm*inpL
-        inpL = ggml_mul(ctx0,
-                    ggml_repeat(ctx0, model.norm, inpL),
-                    inpL);
+        // inpL = inpL*norm(broadcasted)
+        inpL = ggml_mul(ctx0, inpL, model.norm);

         embeddings = inpL;
     }
@@ -2130,7 +2201,7 @@ struct llama_context * llama_init_from_file(
         unsigned * cur_percentage_p = (unsigned *) ctx;
         unsigned percentage = (unsigned) (100 * progress);
         while (percentage > *cur_percentage_p) {
-            ++*cur_percentage_p;
+            *cur_percentage_p = percentage;
             fprintf(stderr, ".");
             fflush(stderr);
             if (percentage >= 100) {
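The default progress printer now jumps straight to the reported percentage instead of stepping one percent per dot. Callers that want different output can pass their own callback; a hedged sketch matching the (float progress, void * user_data) shape used by the callback calls elsewhere in this diff:

    // illustrative only: print a percentage on one line instead of dots
    static void my_progress(float progress, void * user_data) {
        (void) user_data;
        fprintf(stderr, "\rloading: %3d%%", (int) (100 * progress));
        if (progress >= 1.0f) {
            fprintf(stderr, "\n");
        }
    }
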
@@ -2223,7 +2294,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
     {
         uint32_t magic;
         fin.read((char *) &magic, sizeof(magic));
-        if (magic != 'ggla') {
+        if (magic != LLAMA_FILE_MAGIC_GGLA) {
             fprintf(stderr, "%s: bad file magic\n", __func__);
             return 1;
         }
@@ -2287,7 +2358,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *

     // maybe this should in llama_model_loader
     if (model_loader->use_mmap) {
-        model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ false));
+        model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0));
     }
 }

@@ -2380,7 +2451,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
         }
         size_t idx = model_loader->tensors_map.name_to_idx[base_name];
         llama_load_tensor & lt = model_loader->tensors_map.tensors[idx];
-        base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] });
+        base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU);
         lt.data = (uint8_t *) lt.ggml_tensor->data;
         model_loader->load_data_for(lt);
         lt.ggml_tensor->data = lt.data;
@@ -2606,8 +2677,8 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
 }

 // Sets the state reading from the specified source address
-size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
-    const uint8_t * inp = src;
+size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
+    uint8_t * inp = src;

     // set rng
     {