llama_cpp 0.1.1 → 0.1.2

@@ -190,7 +190,7 @@
  #define GGML_FILE_MAGIC 0x67676d6c // "ggml"
  #define GGML_FILE_VERSION 1

- #define GGML_QNT_VERSION 1 // bump this on quantization format changes
+ #define GGML_QNT_VERSION 2 // bump this on quantization format changes
  #define GGML_QNT_VERSION_FACTOR 1000 // do not change this

  #define GGML_MAX_DIMS 4
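The quantization format version bumps from 1 to 2, which is why older Q4/Q8 files are rejected later in this diff (see the GGJT v3 check in llama.cpp). In the ggml example tooling the quantization version is conventionally folded into the ftype field via GGML_QNT_VERSION_FACTOR; the following is a hedged sketch of that convention, not code shipped in this gem:

    // Hedged sketch: decoding a combined ftype as used by the ggml example
    // models (an assumption about the surrounding tooling, not part of this diff).
    #include <cstdint>
    #include <cstdio>

    int main() {
        const int32_t qnt_version = 2;    // GGML_QNT_VERSION
        const int32_t factor      = 1000; // GGML_QNT_VERSION_FACTOR

        const int32_t ftype_on_disk = 2 + qnt_version * factor; // base ftype 2 plus encoded version

        const int32_t qntvr = ftype_on_disk / factor;           // -> 2 (quantization version)
        const int32_t ftype = ftype_on_disk % factor;           // -> 2 (base ftype)
        std::printf("qntvr=%d ftype=%d\n", qntvr, ftype);
        return 0;
    }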
@@ -313,6 +313,7 @@ extern "C" {
  GGML_OP_ROPE,
  GGML_OP_ROPE_BACK,
  GGML_OP_ALIBI,
+ GGML_OP_CLAMP,
  GGML_OP_CONV_1D_1S,
  GGML_OP_CONV_1D_2S,

@@ -849,7 +850,7 @@ extern "C" {
  int n_past);

  // in-place, returns view(a)
- GGML_API struct ggml_tensor * gml_diag_mask_zero_inplace(
+ GGML_API struct ggml_tensor * ggml_diag_mask_zero_inplace(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  int n_past);
@@ -897,7 +898,16 @@ extern "C" {
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  int n_past,
- int n_head);
+ int n_head,
+ float bias_max);
+
+ // clamp
+ // in-place, returns view(a)
+ struct ggml_tensor * ggml_clamp(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ float min,
+ float max);

  // padding = 1
  // TODO: we don't support extra parameters for now
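Two API changes in ggml.h here: ggml_alibi() takes an explicit bias_max argument, and a new in-place ggml_clamp() operator (backed by GGML_OP_CLAMP above) is exposed. A hedged usage sketch follows; the graph-building calls are assumed to match the ggml version bundled with this gem:

    // Hedged sketch: clamp an F32 tensor to [-1, 1] with the new operator.
    #include "ggml.h"

    void clamp_example(void) {
        struct ggml_init_params params = { /*mem_size =*/ 16*1024*1024, /*mem_buffer =*/ NULL, /*no_alloc =*/ false };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
        for (int i = 0; i < 8; ++i) {
            ((float *) a->data)[i] = (float) (i - 4); // -4 .. 3
        }

        struct ggml_tensor * b = ggml_clamp(ctx, a, -1.0f, 1.0f); // in-place, returns view(a)

        struct ggml_cgraph gf = ggml_build_forward(b);
        ggml_graph_compute(ctx, &gf);

        ggml_free(ctx);
    }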
@@ -101,12 +101,12 @@ struct llama_file {
  LLAMA_ASSERT(ret == 0); // same
  }

- void read_raw(void * ptr, size_t size) {
- if (size == 0) {
+ void read_raw(void * ptr, size_t len) const {
+ if (len == 0) {
  return;
  }
  errno = 0;
- std::size_t ret = std::fread(ptr, size, 1, fp);
+ std::size_t ret = std::fread(ptr, len, 1, fp);
  if (ferror(fp)) {
  throw std::runtime_error(format("read error: %s", strerror(errno)));
  }
@@ -127,12 +127,12 @@ struct llama_file {
  return std::string(chars.data(), len);
  }

- void write_raw(const void * ptr, size_t size) {
- if (size == 0) {
+ void write_raw(const void * ptr, size_t len) const {
+ if (len == 0) {
  return;
  }
  errno = 0;
- size_t ret = std::fwrite(ptr, size, 1, fp);
+ size_t ret = std::fwrite(ptr, len, 1, fp);
  if (ret != 1) {
  throw std::runtime_error(format("write error: %s", strerror(errno)));
  }
@@ -172,7 +172,7 @@ struct llama_mmap {
  #ifdef _POSIX_MAPPED_FILES
  static constexpr bool SUPPORTED = true;

- llama_mmap(struct llama_file * file, bool prefetch = true) {
+ llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */) {
  size = file->size;
  int fd = fileno(file->fp);
  int flags = MAP_SHARED;
@@ -184,9 +184,9 @@ struct llama_mmap {
  throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
  }

- if (prefetch) {
+ if (prefetch > 0) {
  // Advise the kernel to preload the mapped memory
- if (madvise(addr, file->size, MADV_WILLNEED)) {
+ if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) {
  fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
  strerror(errno));
  }
@@ -267,9 +267,9 @@ struct llama_mlock {
  }
  }

- void init(void * addr) {
- LLAMA_ASSERT(this->addr == NULL && this->size == 0);
- this->addr = addr;
+ void init(void * ptr) {
+ LLAMA_ASSERT(addr == NULL && size == 0);
+ addr = ptr;
  }

  void grow_to(size_t target_size) {
@@ -340,14 +340,14 @@ struct llama_mlock {
  return (size_t) si.dwPageSize;
  }

- bool raw_lock(void * addr, size_t size) {
+ bool raw_lock(void * ptr, size_t len) {
  for (int tries = 1; ; tries++) {
- if (VirtualLock(addr, size)) {
+ if (VirtualLock(ptr, len)) {
  return true;
  }
  if (tries == 2) {
  fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
- size, this->size, llama_format_win_err(GetLastError()).c_str());
+ len, size, llama_format_win_err(GetLastError()).c_str());
  return false;
  }

@@ -363,7 +363,7 @@ struct llama_mlock {
  // is equal to the number of pages in its minimum working set minus
  // a small overhead."
  // Hopefully a megabyte is enough overhead:
- size_t increment = size + 1048576;
+ size_t increment = len + 1048576;
  // The minimum must be <= the maximum, so we need to increase both:
  min_ws_size += increment;
  max_ws_size += increment;
@@ -375,8 +375,8 @@ struct llama_mlock {
  }
  }

- void raw_unlock(void * addr, size_t size) {
- if (!VirtualUnlock(addr, size)) {
+ void raw_unlock(void * ptr, size_t len) {
+ if (!VirtualUnlock(ptr, len)) {
  fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
  llama_format_win_err(GetLastError()).c_str());
  }
@@ -388,12 +388,12 @@ struct llama_mlock {
  return (size_t) 65536;
  }

- bool raw_lock(const void * addr, size_t size) {
+ bool raw_lock(const void * addr, size_t len) {
  fprintf(stderr, "warning: mlock not supported on this system\n");
  return false;
  }

- void raw_unlock(const void * addr, size_t size) {}
+ void raw_unlock(const void * addr, size_t len) {}
  #endif
  };

@@ -404,10 +404,10 @@ struct llama_buffer {

  llama_buffer() = default;

- void resize(size_t size) {
+ void resize(size_t len) {
  delete[] addr;
- addr = new uint8_t[size];
- this->size = size;
+ addr = new uint8_t[len];
+ size = len;
  }

  ~llama_buffer() {
@@ -1,6 +1,7 @@
  // Defines fileno on msys:
  #ifndef _GNU_SOURCE
  #define _GNU_SOURCE
+ #include <cstddef>
  #include <cstdint>
  #include <cstdio>
  #endif
@@ -45,6 +46,7 @@ enum e_model {
  MODEL_65B,
  };

+
  static const size_t MB = 1024*1024;

  // computed for n_ctx == 2048
@@ -110,7 +112,7 @@ struct llama_hparams {
  enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;

  bool operator!=(const llama_hparams & other) const {
- return memcmp(this, &other, sizeof(llama_hparams));
+ return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams)));
  }
  };

@@ -406,6 +408,7 @@ enum llama_file_version {
  LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab
  LLAMA_FILE_VERSION_GGJT_V1, // added padding
  LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format
+ LLAMA_FILE_VERSION_GGJT_V3, // changed Q4 and Q8 quantization format
  };

  struct llama_file_loader {
@@ -424,24 +427,30 @@ struct llama_file_loader {
  }
  void read_magic() {
  uint32_t magic = file.read_u32();
- uint32_t version = 0;

- if (magic != 'ggml') {
- version = file.read_u32();
+ if (magic == LLAMA_FILE_MAGIC_GGML) {
+ file_version = LLAMA_FILE_VERSION_GGML;
+ return;
  }

- if (magic == 'ggml' && version == 0) {
- file_version = LLAMA_FILE_VERSION_GGML;
- } else if (magic == 'ggmf' && version == 1) {
- file_version = LLAMA_FILE_VERSION_GGMF_V1;
- } else if (magic == 'ggjt' && version == 1) {
- file_version = LLAMA_FILE_VERSION_GGJT_V1;
- } else if (magic == 'ggjt' && version == 2) {
- file_version = LLAMA_FILE_VERSION_GGJT_V2;
- } else {
- throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
- magic, version);
+ uint32_t version = file.read_u32();
+
+ switch (magic) {
+ case LLAMA_FILE_MAGIC_GGMF:
+ switch (version) {
+ case 1: file_version = LLAMA_FILE_VERSION_GGMF_V1; return;
+ }
+ break;
+ case LLAMA_FILE_MAGIC_GGJT:
+ switch (version) {
+ case 1: file_version = LLAMA_FILE_VERSION_GGJT_V1; return;
+ case 2: file_version = LLAMA_FILE_VERSION_GGJT_V2; return;
+ case 3: file_version = LLAMA_FILE_VERSION_GGJT_V3; return;
+ }
  }
+
+ throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
+ magic, version);
  }
  void read_hparams() {
  hparams.n_vocab = file.read_u32();
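The multi-character literals ('ggml', 'ggmf', 'ggjt', 'ggla') are replaced with named LLAMA_FILE_MAGIC_* constants. A hedged sketch of the assumed definitions follows (each tag's four ASCII bytes read as one 32-bit word; check llama.h in this gem for the authoritative values):

    // Hedged sketch: assumed values of the magic constants referenced above.
    #define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt'
    #define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
    #define LLAMA_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf'
    #define LLAMA_FILE_MAGIC_GGML 0x67676d6cu // 'ggml'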
@@ -499,7 +508,7 @@ struct llama_file_loader {

  if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
  // skip to the next multiple of 32 bytes
- file.seek(-file.tell() & 31, SEEK_CUR);
+ file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
  }
  shard.file_idx = file_idx;
  shard.file_off = file.tell();
@@ -574,7 +583,7 @@ struct llama_file_saver {
  file.write_u32(new_type);
  file.write_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * tensor.ne.size());
  file.write_raw(tensor.name.data(), tensor.name.size());
- file.seek(-file.tell() & 31, SEEK_CUR);
+ file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
  LLAMA_ASSERT(new_size == llama_calc_tensor_size(tensor.ne, new_type));
  file.write_raw(new_data, new_size);
  }
@@ -641,7 +650,7 @@ struct llama_model_loader {
  }
  }

- struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne) {
+ struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
  auto it = tensors_map.name_to_idx.find(name);
  if (it == tensors_map.name_to_idx.end()) {
  throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
@@ -652,10 +661,10 @@ struct llama_model_loader {
  name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
  }

- return get_tensor_for(lt);
+ return get_tensor_for(lt, backend);
  }

- struct ggml_tensor * get_tensor_for(llama_load_tensor & lt) {
+ struct ggml_tensor * get_tensor_for(llama_load_tensor & lt, ggml_backend backend) {
  struct ggml_tensor * tensor;
  if (lt.ne.size() == 2) {
  tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
@@ -665,6 +674,7 @@ struct llama_model_loader {
  }
  ggml_set_name(tensor, lt.name.c_str());
  LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
+ tensor->backend = backend;
  lt.ggml_tensor = tensor;
  num_ggml_tensors_created++;
  return tensor;
@@ -678,12 +688,16 @@ struct llama_model_loader {

  void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
  size_t data_size = 0;
+ size_t prefetch_size = 0;
  for (const llama_load_tensor & lt : tensors_map.tensors) {
  data_size += lt.size;
+ if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
+ prefetch_size += lt.size;
+ }
  }

  if (use_mmap) {
- mapping.reset(new llama_mmap(&file_loaders.at(0)->file));
+ mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
  if (!lmlock) {
  // Don't call the callback since the actual loading will be lazy
  // and we can't measure it.
@@ -696,6 +710,9 @@ struct llama_model_loader {

  size_t done_size = 0;
  for (llama_load_tensor & lt : tensors_map.tensors) {
+ if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) {
+ continue;
+ }
  if (progress_callback) {
  progress_callback((float) done_size / data_size, progress_callback_user_data);
  }
@@ -708,9 +725,6 @@ struct llama_model_loader {
  lmlock->grow_to(done_size);
  }
  }
- if (progress_callback) {
- progress_callback(1.0f, progress_callback_user_data);
- }
  }

  void load_data_for(llama_load_tensor & lt) {
@@ -835,6 +849,21 @@ bool llama_mlock_supported() {
  return llama_mlock::SUPPORTED;
  }

+ void llama_init_backend() {
+ ggml_time_init();
+
+ // needed to initialize f16 tables
+ {
+ struct ggml_init_params params = { 0, NULL, false };
+ struct ggml_context * ctx = ggml_init(params);
+ ggml_free(ctx);
+ }
+ }
+
+ int64_t llama_time_us() {
+ return ggml_time_us();
+ }
+
  //
  // model loading
  //
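llama_init_backend() and llama_time_us() are new public entry points. A hedged usage sketch follows; llama_context_default_params() and llama_init_from_file() are assumed from the llama.h bundled with this gem, and the model path is a placeholder:

    // Hedged sketch: one-time backend initialization before loading a model,
    // plus timing with the new llama_time_us().
    #include "llama.h"
    #include <cstdio>

    int main() {
        llama_init_backend(); // call once per process, before other llama_* calls

        const int64_t t_start = llama_time_us();

        struct llama_context_params params = llama_context_default_params();
        struct llama_context * ctx = llama_init_from_file("model.ggjt.bin", params);
        if (ctx == NULL) {
            std::fprintf(stderr, "failed to load model\n");
            return 1;
        }

        std::fprintf(stderr, "load time: %.2f ms\n", (llama_time_us() - t_start) / 1000.0);

        llama_free(ctx);
        return 0;
    }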
@@ -844,7 +873,8 @@ static const char *llama_file_version_name(llama_file_version version) {
  case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)";
  case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
  case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1405)";
- case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (latest)";
+ case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (pre #1508)";
+ case LLAMA_FILE_VERSION_GGJT_V3: return "ggjt v3 (latest)";
  }

  return "unknown";
@@ -924,11 +954,19 @@ static void llama_model_load_internal(
  fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
  }

- if (file_version != LLAMA_FILE_VERSION_GGJT_V2) {
+ if (file_version < LLAMA_FILE_VERSION_GGJT_V2) {
  if (hparams.ftype != LLAMA_FTYPE_ALL_F32 &&
  hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 &&
  hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) {
- throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1305)");
+ throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)");
+ }
+ }
+
+ if (file_version < LLAMA_FILE_VERSION_GGJT_V3) {
+ if (hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
+ hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_1 ||
+ hparams.ftype == LLAMA_FTYPE_MOSTLY_Q8_0) {
+ throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)");
  }
  }

@@ -941,27 +979,7 @@ static void llama_model_load_internal(
  size_t ctx_size;
  size_t mmapped_size;
  ml->calc_sizes(&ctx_size, &mmapped_size);
- fprintf(stderr, "%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/1024.0/1024.0);
-
- // print memory requirements
- {
- const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
-
- // this is the total memory required to run the inference
- const size_t mem_required =
- ctx_size +
- mmapped_size +
- MEM_REQ_SCRATCH0().at(model.type) +
- MEM_REQ_SCRATCH1().at(model.type) +
- MEM_REQ_EVAL().at(model.type);
-
- // this is the memory required by one llama_state
- const size_t mem_required_state =
- scale*MEM_REQ_KV_SELF().at(model.type);
-
- fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
- mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
- }
+ fprintf(stderr, "%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0);

  // create the ggml context
  {
@@ -983,7 +1001,14 @@ static void llama_model_load_internal(
  }
  }

+ #ifdef GGML_USE_CUBLAS
+ #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CUDA
+ #else
+ #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
+ #endif
+
  // prepare memory for the weights
+ size_t vram_total = 0;
  {
  const uint32_t n_embd = hparams.n_embd;
  const uint32_t n_layer = hparams.n_layer;
@@ -991,70 +1016,122 @@ static void llama_model_load_internal(

  ml->ggml_ctx = ctx;

- model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab});
- model.norm = ml->get_tensor("norm.weight", {n_embd});
- model.output = ml->get_tensor("output.weight", {n_embd, n_vocab});
+ model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU);
+ model.norm = ml->get_tensor("norm.weight", {n_embd}, GGML_BACKEND_CPU);
+
+ // "output" tensor
+ {
+ ggml_backend backend_output;
+ if (n_gpu_layers > int(n_layer)) { // NOLINT
+ backend_output = LLAMA_BACKEND_OFFLOAD;
+ } else {
+ backend_output = GGML_BACKEND_CPU;
+ }
+
+ model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, backend_output);
+ }
+
+ const int i_gpu_start = n_layer - n_gpu_layers;

  model.layers.resize(n_layer);
  for (uint32_t i = 0; i < n_layer; ++i) {
+ const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+
  auto & layer = model.layers[i];

  std::string layers_i = "layers." + std::to_string(i);

- layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd});
+ layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);
+
+ layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend);
+ layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend);
+ layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend);
+ layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend);

- layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd});
- layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd});
- layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd});
- layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd});
+ layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);

- layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd});
+ layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend);
+ layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend);
+ layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend);

- layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff});
- layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd});
- layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff});
+ if (backend == GGML_BACKEND_CUDA) {
+ vram_total +=
+ ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
+ ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.attention_norm) +
+ ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
+ }
  }
  }

  ml->done_getting_tensors();

- // populate `tensors_by_name`
- for (llama_load_tensor & lt : ml->tensors_map.tensors) {
- model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
- }
+ // print memory requirements
+ {
+ const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;

- ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
+ // this is the total memory required to run the inference
+ const size_t mem_required =
+ ctx_size +
+ mmapped_size - vram_total + // weights in VRAM not in memory
+ MEM_REQ_SCRATCH0().at(model.type) +
+ MEM_REQ_SCRATCH1().at(model.type) +
+ MEM_REQ_EVAL().at(model.type);
+
+ // this is the memory required by one llama_state
+ const size_t mem_required_state =
+ scale*MEM_REQ_KV_SELF().at(model.type);
+
+ fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
+ mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);

- model.mapping = std::move(ml->mapping);
  #ifdef GGML_USE_CUBLAS
- {
  const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));

  fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
+ if (n_gpu_layers > (int) hparams.n_layer) {
+ fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
+ }
+ fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+ #else
+ (void) n_gpu_layers;
+ #endif
+ }

- size_t vram_total = 0;
+ // populate `tensors_by_name`
+ for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+ model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
+ }

- for (int i = 0; i < n_gpu; ++i) {
- const auto & layer = model.layers[i];
+ ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);

- ggml_cuda_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
- ggml_cuda_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
- ggml_cuda_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
- ggml_cuda_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
- ggml_cuda_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
- ggml_cuda_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
- ggml_cuda_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
+ #ifdef GGML_USE_CUBLAS
+ {
+ size_t done_size = 0;
+ size_t data_size = 0;
+ for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+ data_size += lt.size;
+ if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
+ done_size += lt.size;
+ }
  }
- if (n_gpu_layers > (int) hparams.n_layer) {
- fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
- ggml_cuda_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
+ for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+ if (lt.ggml_tensor->backend != GGML_BACKEND_CUDA) {
+ continue;
+ }
+ if (progress_callback) {
+ progress_callback((float) done_size / data_size, progress_callback_user_data);
+ }
+ ggml_cuda_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
+ done_size += lt.size;
  }
+ }
+ #endif // GGML_USE_CUBLAS

- fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+ if (progress_callback) {
+ progress_callback(1.0f, progress_callback_user_data);
  }
- #else
- (void) n_gpu_layers;
- #endif
+
+ model.mapping = std::move(ml->mapping);

  // loading time will be recalculate after the first eval, so
  // we take page faults deferred by mmap() into consideration
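Offloading is now decided per tensor at load time via tensor->backend, and cuBLAS weights are streamed with ggml_cuda_load_data() instead of being transformed after the fact. A hedged sketch of how a caller requests offload; the n_gpu_layers field is assumed from this gem's llama_context_params:

    // Hedged sketch: request cuBLAS offload of the last 32 transformer layers.
    // Only effective when the library is built with GGML_USE_CUBLAS; otherwise
    // LLAMA_BACKEND_OFFLOAD falls back to GGML_BACKEND_CPU.
    #include "llama.h"

    struct llama_context * load_with_offload(const char * path) {
        struct llama_context_params params = llama_context_default_params();
        params.n_gpu_layers = 32; // layers with index >= n_layer - 32 get GGML_BACKEND_CUDA
        return llama_init_from_file(path, params);
    }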
@@ -1153,10 +1230,8 @@ static bool llama_eval_internal(
  {
  cur = ggml_rms_norm(ctx0, inpL);

- // cur = attention_norm*cur
- cur = ggml_mul(ctx0,
- ggml_repeat(ctx0, model.layers[il].attention_norm, cur),
- cur);
+ // cur = cur*attention_norm(broadcasted)
+ cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm);
  }

  // self-attention
@@ -1263,10 +1338,8 @@ static bool llama_eval_internal(
  {
  cur = ggml_rms_norm(ctx0, inpFF);

- // cur = ffn_norm*cur
- cur = ggml_mul(ctx0,
- ggml_repeat(ctx0, model.layers[il].ffn_norm, cur),
- cur);
+ // cur = cur*ffn_norm(broadcasted)
+ cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
  }

  struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
@@ -1303,10 +1376,8 @@ static bool llama_eval_internal(

  inpL = ggml_rms_norm(ctx0, inpL);

- // inpL = norm*inpL
- inpL = ggml_mul(ctx0,
- ggml_repeat(ctx0, model.norm, inpL),
- inpL);
+ // inpL = inpL*norm(broadcasted)
+ inpL = ggml_mul(ctx0, inpL, model.norm);

  embeddings = inpL;
  }
@@ -2130,7 +2201,7 @@ struct llama_context * llama_init_from_file(
  unsigned * cur_percentage_p = (unsigned *) ctx;
  unsigned percentage = (unsigned) (100 * progress);
  while (percentage > *cur_percentage_p) {
- ++*cur_percentage_p;
+ *cur_percentage_p = percentage;
  fprintf(stderr, ".");
  fflush(stderr);
  if (percentage >= 100) {
@@ -2223,7 +2294,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
  {
  uint32_t magic;
  fin.read((char *) &magic, sizeof(magic));
- if (magic != 'ggla') {
+ if (magic != LLAMA_FILE_MAGIC_GGLA) {
  fprintf(stderr, "%s: bad file magic\n", __func__);
  return 1;
  }
@@ -2287,7 +2358,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *

  // maybe this should in llama_model_loader
  if (model_loader->use_mmap) {
- model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ false));
+ model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0));
  }
  }

@@ -2380,7 +2451,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
  }
  size_t idx = model_loader->tensors_map.name_to_idx[base_name];
  llama_load_tensor & lt = model_loader->tensors_map.tensors[idx];
- base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] });
+ base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU);
  lt.data = (uint8_t *) lt.ggml_tensor->data;
  model_loader->load_data_for(lt);
  lt.ggml_tensor->data = lt.data;
@@ -2606,8 +2677,8 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
  }

  // Sets the state reading from the specified source address
- size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
- const uint8_t * inp = src;
+ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
+ uint8_t * inp = src;

  // set rng
  {
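llama_set_state_data() now takes a non-const pointer. A hedged sketch of a state save/restore round trip; llama_get_state_size() is assumed from this gem's llama.h, and real code would persist the buffer to disk:

    // Hedged sketch: snapshot and restore the context state (RNG, KV cache,
    // logits, ...) with the state API touched above.
    #include "llama.h"
    #include <vector>

    void snapshot_and_restore(struct llama_context * ctx) {
        const size_t n_state = llama_get_state_size(ctx);

        std::vector<uint8_t> buf(n_state);
        const size_t n_copied = llama_copy_state_data(ctx, buf.data()); // save

        // ... evaluate more tokens, then roll back:
        llama_set_state_data(ctx, buf.data()); // note: non-const pointer after this change
        (void) n_copied;
    }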