llama_cpp 0.10.0 → 0.10.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,11 +1,12 @@
  #define LLAMA_API_INTERNAL
+ //#define LLAMA_GGML_BACKEND_CUDA_TEST // for testing only - enables ggml-cuda through ggml-backend, disables partial offloading
  #include "llama.h"

  #include "unicode.h"

  #include "ggml.h"
-
  #include "ggml-alloc.h"
+ #include "ggml-backend.h"

  #ifdef GGML_USE_CUBLAS
  # include "ggml-cuda.h"
@@ -32,6 +33,7 @@
  #include <unistd.h>
  #if defined(_POSIX_MAPPED_FILES)
  #include <sys/mman.h>
+ #include <fcntl.h>
  #endif
  #if defined(_POSIX_MEMLOCK_RANGE)
  #include <sys/resource.h>
@@ -91,7 +93,8 @@
  #define LLAMA_ATTRIBUTE_FORMAT(...)
  #endif

- #define LLAMA_MAX_NODES 8192
+ #define LLAMA_MAX_NODES 8192
+ #define LLAMA_MAX_EXPERTS 8

  //
  // logging
@@ -194,6 +197,7 @@ enum llm_arch {
  LLM_ARCH_BLOOM,
  LLM_ARCH_STABLELM,
  LLM_ARCH_QWEN,
+ LLM_ARCH_PHI2,
  LLM_ARCH_UNKNOWN,
  };

@@ -211,6 +215,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
  { LLM_ARCH_BLOOM, "bloom" },
  { LLM_ARCH_STABLELM, "stablelm" },
  { LLM_ARCH_QWEN, "qwen" },
+ { LLM_ARCH_PHI2, "phi2" },
  };

  enum llm_kv {
@@ -231,6 +236,8 @@ enum llm_kv {
  LLM_KV_FEED_FORWARD_LENGTH,
  LLM_KV_USE_PARALLEL_RESIDUAL,
  LLM_KV_TENSOR_DATA_LAYOUT,
+ LLM_KV_EXPERT_COUNT,
+ LLM_KV_EXPERT_USED_COUNT,

  LLM_KV_ATTENTION_HEAD_COUNT,
  LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -281,6 +288,8 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
  { LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" },
  { LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" },
  { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
+ { LLM_KV_EXPERT_COUNT, "%s.expert_count" },
+ { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },

  { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
  { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -338,10 +347,14 @@ enum llm_tensor {
  LLM_TENSOR_ATTN_NORM,
  LLM_TENSOR_ATTN_NORM_2,
  LLM_TENSOR_ATTN_ROT_EMBD,
+ LLM_TENSOR_FFN_GATE_INP,
+ LLM_TENSOR_FFN_NORM,
  LLM_TENSOR_FFN_GATE,
  LLM_TENSOR_FFN_DOWN,
  LLM_TENSOR_FFN_UP,
- LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_DOWN_EXP,
+ LLM_TENSOR_FFN_GATE_EXP,
+ LLM_TENSOR_FFN_UP_EXP,
  LLM_TENSOR_ATTN_Q_NORM,
  LLM_TENSOR_ATTN_K_NORM,
  };
@@ -360,10 +373,14 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
  { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
  { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
  { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
  { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
  { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
  { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
+ { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
+ { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
  },
  },
  {
@@ -537,6 +554,19 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
+ {
+ LLM_ARCH_PHI2,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },

  {
  LLM_ARCH_UNKNOWN,
@@ -585,6 +615,10 @@ struct LLM_TN {
  std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const {
  return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid) + "." + suffix;
  }
+
+ std::string operator()(llm_tensor tensor, const std::string & suffix, int bid, int xid) const {
+ return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid, xid) + "." + suffix;
+ }
  };

  //
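
Note on the hunk above: the new two-index LLM_TN overload expands the per-expert MoE name templates ("blk.%d.ffn_gate.%d" and friends) into names such as blk.7.ffn_gate.3.weight. A minimal standalone sketch of that expansion, assuming ::format behaves like snprintf into a std::string (format_name below is a hypothetical stand-in, not the library function):

#include <cstdio>
#include <string>

// Hypothetical stand-in for ::format(), shown only to illustrate the naming scheme.
static std::string format_name(const char * fmt, int bid, int xid) {
    char buf[128];
    std::snprintf(buf, sizeof(buf), fmt, bid, xid);
    return std::string(buf);
}

int main() {
    // template taken from LLM_TENSOR_NAMES above: "blk.%d.ffn_gate.%d"
    const std::string name = format_name("blk.%d.ffn_gate.%d", 7, 3) + "." + "weight";
    std::printf("%s\n", name.c_str()); // prints: blk.7.ffn_gate.3.weight
    return 0;
}
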
@@ -680,38 +714,6 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
  // llama helpers
  //

- inline void * llama_host_malloc(size_t n) {
- #ifdef GGML_USE_CUBLAS
- if (ggml_cublas_loaded()) {
- return ggml_cuda_host_malloc(n);
- } else {
- return malloc(n);
- }
- #elif GGML_USE_METAL
- return ggml_metal_host_malloc(n);
- #elif GGML_USE_CPU_HBM
- return hbw_malloc(n);
- #else
- return malloc(n);
- #endif
- }
-
- inline void llama_host_free(void * ptr) {
- #ifdef GGML_USE_CUBLAS
- if (ggml_cublas_loaded()) {
- return ggml_cuda_host_free(ptr);
- } else {
- return free(ptr);
- }
- #elif GGML_USE_METAL
- return ggml_metal_host_free(ptr);
- #elif GGML_USE_CPU_HBM
- return hbw_free(ptr);
- #else
- return free(ptr);
- #endif
- }
-
  #if defined(_WIN32)
  static std::string llama_format_win_err(DWORD err) {
  LPSTR buf;
@@ -726,40 +728,10 @@ static std::string llama_format_win_err(DWORD err) {
  }
  #endif

- struct llama_buffer {
- void * data = NULL;
- size_t size = 0;
-
- // fallback to malloc / free
- // useful in cases where CUDA can try to allocate PINNED memory
- bool fallback = false;
-
- void resize(size_t n) {
- llama_host_free(data);
-
- data = llama_host_malloc(n);
- if (!data) {
- fallback = true;
- data = malloc(n);
- } else {
- fallback = false;
- }
-
- GGML_ASSERT(data);
- size = n;
- }
-
- ~llama_buffer() {
- if (data) {
- if (fallback) { // NOLINT
- free(data);
- } else {
- llama_host_free(data);
- }
- }
-
- data = NULL;
- }
+ template <typename T>
+ struct no_init {
+ T value;
+ no_init() { /* do nothing */ }
  };

  struct llama_file {
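
Note on the hunk above: no_init<T> takes over the scratch-buffer role of the removed llama_buffer. std::vector<T>::resize value-initializes every new element, which for a large read buffer is a pointless zero-fill; wrapping the element type in a struct whose default constructor does nothing leaves the storage uninitialized. A minimal standalone sketch of the idea (not the loader code itself):

#include <cstdint>
#include <cstdio>
#include <vector>

// Mirrors the no_init wrapper added in this diff: the empty default constructor
// means vector::resize grows the buffer without writing zeros into it.
template <typename T>
struct no_init {
    T value;
    no_init() { /* do nothing */ }
};

int main() {
    std::vector<no_init<uint8_t>> read_buf;
    read_buf.resize(1 << 20); // grows without zero-filling
    // in the loader, the buffer is then overwritten by file.read_raw(read_buf.data(), ...)
    std::printf("allocated %zu bytes without initialization\n", read_buf.size());
    return 0;
}
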
@@ -847,6 +819,9 @@ struct llama_mmap {
  #ifdef _POSIX_MAPPED_FILES
  static constexpr bool SUPPORTED = true;

+ // list of mapped fragments (first_offset, last_offset)
+ std::vector<std::pair<size_t, size_t>> mapped_fragments;
+
  llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) {
  size = file->size;
  int fd = fileno(file->fp);
@@ -854,17 +829,22 @@ struct llama_mmap {
  // prefetch/readahead impairs performance on NUMA systems
  if (numa) { prefetch = 0; }
  #ifdef __linux__
+ // advise the kernel to read the file sequentially (increases readahead)
+ if (posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL)) {
+ LLAMA_LOG_WARN("warning: posix_fadvise(.., POSIX_FADV_SEQUENTIAL) failed: %s\n",
+ strerror(errno));
+ }
  if (prefetch) { flags |= MAP_POPULATE; }
  #endif
  addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
- if (addr == MAP_FAILED) {
+ if (addr == MAP_FAILED) { // NOLINT
  throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
  }

  if (prefetch > 0) {
- // Advise the kernel to preload the mapped memory
+ // advise the kernel to preload the mapped memory
  if (posix_madvise(addr, std::min(file->size, prefetch), POSIX_MADV_WILLNEED)) {
- fprintf(stderr, "warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s\n",
+ LLAMA_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s\n",
  strerror(errno));
  }
  }
@@ -872,14 +852,81 @@ struct llama_mmap {
  // advise the kernel not to use readahead
  // (because the next page might not belong on the same node)
  if (posix_madvise(addr, file->size, POSIX_MADV_RANDOM)) {
- fprintf(stderr, "warning: posix_madvise(.., POSIX_MADV_RANDOM) failed: %s\n",
+ LLAMA_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_RANDOM) failed: %s\n",
  strerror(errno));
  }
  }
+
+ // initialize list of mapped_fragments
+ mapped_fragments.emplace_back(0, file->size);
+ }
+
+ static void align_range(size_t * first, size_t * last, size_t page_size) {
+ // align first to the next page
+ size_t offset_in_page = *first & (page_size - 1);
+ size_t offset_to_page = offset_in_page == 0 ? 0 : page_size - offset_in_page;
+ *first += offset_to_page;
+
+ // align last to the previous page
+ *last = *last & ~(page_size - 1);
+
+ if (*last <= *first) {
+ *last = *first;
+ }
+ }
+
+ // partially unmap the file in the range [first, last)
+ void unmap_fragment(size_t first, size_t last) {
+ // note: this function must not be called multiple times with overlapping ranges
+ // otherwise, there is a risk of invalidating addresses that have been repurposed for other mappings
+ int page_size = sysconf(_SC_PAGESIZE);
+ align_range(&first, &last, page_size);
+ size_t len = last - first;
+
+ if (len == 0) {
+ return;
+ }
+
+ GGML_ASSERT(first % page_size == 0);
+ GGML_ASSERT(last % page_size == 0);
+ GGML_ASSERT(last > first);
+
+ void * next_page_start = (uint8_t *) addr + first;
+
+ // unmap the range
+ if (munmap(next_page_start, len)) {
+ LLAMA_LOG_WARN("warning: munmap failed: %s\n", strerror(errno));
+ }
+
+ // update the list of mapped fragments to avoid unmapping the same range again in the destructor
+ std::vector<std::pair<size_t, size_t>> new_mapped_fragments;
+ for (const auto & frag : mapped_fragments) {
+ if (frag.first < first && frag.second > last) {
+ // the range is in the middle of the fragment, split it
+ new_mapped_fragments.emplace_back(frag.first, first);
+ new_mapped_fragments.emplace_back(last, frag.second);
+ } else if (frag.first < first && frag.second > first) {
+ // the range starts in the middle of the fragment
+ new_mapped_fragments.emplace_back(frag.first, first);
+ } else if (frag.first < last && frag.second > last) {
+ // the range ends in the middle of the fragment
+ new_mapped_fragments.emplace_back(last, frag.second);
+ } else if (frag.first >= first && frag.second <= last) {
+ // the range covers the entire fragment
+ } else {
+ // the range is outside the fragment
+ new_mapped_fragments.push_back(frag);
+ }
+ }
+ mapped_fragments = std::move(new_mapped_fragments);
  }

  ~llama_mmap() {
- munmap(addr, size);
+ for (const auto & frag : mapped_fragments) {
+ if (munmap((char *) addr + frag.first, frag.second - frag.first)) {
+ LLAMA_LOG_WARN("warning: munmap failed: %s\n", strerror(errno));
+ }
+ }
  }
  #elif defined(_WIN32)
  static constexpr bool SUPPORTED = true;
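
Note on the hunk above: munmap only accepts page-aligned ranges, so unmap_fragment first shrinks the requested [first, last) range inward with align_range (first rounds up to the next page boundary, last rounds down). A standalone sketch of just that alignment step, assuming a power-of-two page size as sysconf(_SC_PAGESIZE) returns on POSIX; the input values in main are made-up illustration data:

#include <cstddef>
#include <cstdio>

// Same arithmetic as the align_range helper added to llama_mmap above.
static void align_range(size_t * first, size_t * last, size_t page_size) {
    size_t offset_in_page = *first & (page_size - 1);
    size_t offset_to_page = offset_in_page == 0 ? 0 : page_size - offset_in_page;
    *first += offset_to_page;           // round first up to the next page
    *last   = *last & ~(page_size - 1); // round last down to its page start
    if (*last <= *first) {
        *last = *first;                 // nothing left to unmap
    }
}

int main() {
    size_t first = 100, last = 9000;
    align_range(&first, &last, 4096);
    std::printf("aligned range: [%zu, %zu)\n", first, last); // prints [4096, 8192)
    return 0;
}
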
@@ -922,6 +969,12 @@ struct llama_mmap {
  #endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8
  }

+ void unmap_fragment(size_t first, size_t last) {
+ // not supported
+ GGML_UNUSED(first);
+ GGML_UNUSED(last);
+ }
+
  ~llama_mmap() {
  if (!UnmapViewOfFile(addr)) {
  fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n",
@@ -938,6 +991,13 @@ struct llama_mmap {

  throw std::runtime_error(std::string("mmap not supported"));
  }
+
+ void unmap(size_t offset, size_t len) {
+ (void) offset;
+ (void) len;
+
+ throw std::runtime_error(std::string("mmap not supported"));
+ }
  #endif
  };

@@ -1111,6 +1171,26 @@ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_
  return std::string(result.data(), result.size());
  }

+ static ggml_backend_buffer_type_t llama_default_buffer_type(int n_gpu_layers) {
+ #ifdef GGML_USE_METAL
+ if (n_gpu_layers > 0) {
+ return ggml_backend_metal_buffer_type();
+ }
+ #elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST)
+ if (n_gpu_layers > 0) {
+ return ggml_backend_cuda_buffer_type(0);
+ }
+ #elif defined(GGML_USE_CUBLAS)
+ return ggml_backend_cuda_host_buffer_type();
+ #elif defined(GGML_USE_CPU_HBM)
+ return ggml_backend_cpu_hbm_buffer_type();
+ #endif
+
+ return ggml_backend_cpu_buffer_type();
+
+ GGML_UNUSED(n_gpu_layers);
+ }
+
  //
  // globals
  //
@@ -1159,6 +1239,8 @@ struct llama_hparams {
  uint32_t n_layer;
  uint32_t n_rot;
  uint32_t n_ff;
+ uint32_t n_expert = 0;
+ uint32_t n_expert_used = 0;

  float f_norm_eps;
  float f_norm_rms_eps;
@@ -1173,15 +1255,18 @@ struct llama_hparams {
  float f_max_alibi_bias;

  bool operator!=(const llama_hparams & other) const {
- if (this->vocab_only != other.vocab_only) return true;
- if (this->n_vocab != other.n_vocab) return true;
- if (this->n_ctx_train != other.n_ctx_train) return true;
- if (this->n_embd != other.n_embd) return true;
- if (this->n_head != other.n_head) return true;
- if (this->n_head_kv != other.n_head_kv) return true;
- if (this->n_layer != other.n_layer) return true;
- if (this->n_rot != other.n_rot) return true;
- if (this->n_ff != other.n_ff) return true;
+ if (this->vocab_only != other.vocab_only) return true;
+ if (this->n_vocab != other.n_vocab) return true;
+ if (this->n_ctx_train != other.n_ctx_train) return true;
+ if (this->n_embd != other.n_embd) return true;
+ if (this->n_head != other.n_head) return true;
+ if (this->n_head_kv != other.n_head_kv) return true;
+ if (this->n_layer != other.n_layer) return true;
+ if (this->n_rot != other.n_rot) return true;
+ if (this->n_ff != other.n_ff) return true;
+ if (this->n_expert != other.n_expert) return true;
+ if (this->n_expert_used != other.n_expert_used) return true;
+
  if (this->rope_finetuned != other.rope_finetuned) return true;
  if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true;

@@ -1263,6 +1348,12 @@ struct llama_layer {
  struct ggml_tensor * ffn_down; // w2
  struct ggml_tensor * ffn_up; // w3

+ // ff MoE
+ struct ggml_tensor * ffn_gate_inp;
+ struct ggml_tensor * ffn_gate_exp[LLAMA_MAX_EXPERTS];
+ struct ggml_tensor * ffn_down_exp[LLAMA_MAX_EXPERTS];
+ struct ggml_tensor * ffn_up_exp [LLAMA_MAX_EXPERTS];
+
  // ff bias
  struct ggml_tensor * ffn_down_b; // b2
  struct ggml_tensor * ffn_up_b; // b3
@@ -1300,14 +1391,10 @@ struct llama_kv_cache {

  struct ggml_context * ctx = NULL;

- llama_buffer buf;
+ ggml_backend_buffer_t buf = NULL;

  ~llama_kv_cache() {
- if (ctx) {
- ggml_free(ctx);
- }
-
- #ifdef GGML_USE_CUBLAS
+ #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
  if (ggml_cublas_loaded()) {
  for (size_t i = 0; i < k_l.size(); ++i) {
  ggml_cuda_free_data(k_l[i]);
@@ -1315,6 +1402,11 @@ struct llama_kv_cache {
  }
  }
  #endif
+ if (ctx) {
+ ggml_free(ctx);
+ }
+
+ ggml_backend_buffer_free(buf);
  }
  };

@@ -1354,11 +1446,11 @@ struct llama_vocab {
  id special_suffix_id = 32008;
  id special_eot_id = 32010;

- int find_bpe_rank(std::string token_left, std::string token_right) const {
- GGML_ASSERT(token_left.find(" ") == std::string::npos);
- GGML_ASSERT(token_left.find("\n") == std::string::npos);
- GGML_ASSERT(token_right.find(" ") == std::string::npos);
- GGML_ASSERT(token_right.find("\n") == std::string::npos);
+ int find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
+ GGML_ASSERT(token_left.find(' ') == std::string::npos);
+ GGML_ASSERT(token_left.find('\n') == std::string::npos);
+ GGML_ASSERT(token_right.find(' ') == std::string::npos);
+ GGML_ASSERT(token_right.find('\n') == std::string::npos);

  auto it = bpe_ranks.find(std::make_pair(token_left, token_right));
  if (it == bpe_ranks.end()) {
@@ -1387,6 +1479,7 @@ struct llama_model {
  struct ggml_tensor * output_norm;
  struct ggml_tensor * output_norm_b;
  struct ggml_tensor * output;
+ struct ggml_tensor * output_b;

  std::vector<llama_layer> layers;

@@ -1399,7 +1492,7 @@ struct llama_model {
  struct ggml_context * ctx = NULL;

  // the model memory buffer
- llama_buffer buf;
+ ggml_backend_buffer_t buf = NULL;

  // model memory mapped file
  std::unique_ptr<llama_mmap> mapping;
@@ -1415,11 +1508,7 @@ struct llama_model {
  int64_t t_start_us = 0;

  ~llama_model() {
- if (ctx) {
- ggml_free(ctx);
- }
-
- #ifdef GGML_USE_CUBLAS
+ #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
  if (ggml_cublas_loaded()) {
  for (size_t i = 0; i < tensors_by_name.size(); ++i) {
  ggml_cuda_free_data(tensors_by_name[i].second);
@@ -1433,24 +1522,26 @@ struct llama_model {
  ggml_cl_free_data(tensors_by_name[i].second);
  }
  #endif
+ if (ctx) {
+ ggml_free(ctx);
+ }
+
+ ggml_backend_buffer_free(buf);
  }
  };

  struct llama_context {
  llama_context(const llama_model & model) : model(model), t_start_us(model.t_start_us), t_load_us(model.t_load_us) {}
  ~llama_context() {
- #ifdef GGML_USE_METAL
- if (ctx_metal) {
- ggml_metal_free(ctx_metal);
- }
- #endif
- if (alloc) {
- ggml_allocr_free(alloc);
- }
+ ggml_allocr_free(alloc);
+ ggml_backend_buffer_free(buf_alloc);
+ ggml_backend_free(backend);
  }

  llama_cparams cparams;

+ ggml_backend_t backend = nullptr;
+
  const llama_model & model;

  // key + value cache for the self attention
@@ -1472,23 +1563,22 @@ struct llama_context {

  // decode output (2-dimensional array: [n_tokens][n_vocab])
  std::vector<float> logits;
+ #ifndef NDEBUG
+ // guard against access to unset logits
+ std::vector<bool> logits_valid;
+ #endif
  bool logits_all = false;

  // input embedding (1-dimensional array: [n_embd])
  std::vector<float> embedding;

- // reusable buffer for `struct ggml_graph_plan.work_data`
- std::vector<uint8_t> work_buffer;
-
  // memory buffers used to evaluate the model
- llama_buffer buf_compute;
-
- llama_buffer buf_alloc;
+ std::vector<uint8_t> buf_compute_meta;
+ ggml_backend_buffer_t buf_alloc = NULL;
  ggml_allocr * alloc = NULL;

- #ifdef GGML_USE_METAL
- ggml_metal_context * ctx_metal = NULL;
- #endif
+ // temporary buffer for copying data to/from the backend
+ std::vector<no_init<uint8_t>> buf_copy;

  #ifdef GGML_USE_MPI
  ggml_mpi_context * ctx_mpi = NULL;
@@ -1510,9 +1600,6 @@ static bool llama_kv_cache_init(
  const uint32_t n_embd = hparams.n_embd_gqa();
  const uint32_t n_layer = hparams.n_layer;

- const int64_t n_mem = n_layer*n_ctx;
- const int64_t n_elements = n_embd*n_mem;
-
  cache.has_shift = false;

  cache.head = 0;
@@ -1522,13 +1609,10 @@ static bool llama_kv_cache_init(
  cache.cells.clear();
  cache.cells.resize(n_ctx);

- cache.buf.resize(n_elements*(ggml_type_sizef(ktype) + ggml_type_sizef(vtype)) + 2u*n_layer*ggml_tensor_overhead());
- memset(cache.buf.data, 0, cache.buf.size);
-
  struct ggml_init_params params;
- params.mem_size = cache.buf.size;
- params.mem_buffer = cache.buf.data;
- params.no_alloc = false;
+ params.mem_size = 2u*n_layer*ggml_tensor_overhead();
+ params.mem_buffer = NULL;
+ params.no_alloc = true;

  cache.ctx = ggml_init(params);

@@ -1542,9 +1626,7 @@ static bool llama_kv_cache_init(
  cache.k_l.reserve(n_layer);
  cache.v_l.reserve(n_layer);

- const int i_gpu_start = (int) n_layer - n_gpu_layers; GGML_UNUSED(i_gpu_start);
-
- GGML_UNUSED(offload);
+ const int i_gpu_start = (int) n_layer - n_gpu_layers;

  for (int i = 0; i < (int) n_layer; i++) {
  ggml_tensor * k = ggml_new_tensor_1d(cache.ctx, ktype, n_embd*n_ctx);
@@ -1553,23 +1635,35 @@ static bool llama_kv_cache_init(
  ggml_format_name(v, "cache_v_l%d", i);
  cache.k_l.push_back(k);
  cache.v_l.push_back(v);
- #ifdef GGML_USE_CUBLAS
+ #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
  if (i >= i_gpu_start) {
  if (offload) {
  ggml_cuda_assign_buffers_no_scratch(k);
- vram_kv_cache += ggml_nbytes(k);
  ggml_cuda_assign_buffers_no_scratch(v);
+ vram_kv_cache += ggml_nbytes(k);
  vram_kv_cache += ggml_nbytes(v);
+ // HACK: mark tensor as allocated
+ k->data = v->data = (void *)(uintptr_t)1;
  }
  }
  #endif // GGML_USE_CUBLAS
  }

+ // allocate tensors
+ cache.buf = ggml_backend_alloc_ctx_tensors_from_buft(cache.ctx, llama_default_buffer_type(n_gpu_layers));
+
+ // buf may be NULL with full offload
+ if (cache.buf) {
+ // initialize the buffer to avoid NaNs in the padding
+ ggml_backend_buffer_clear(cache.buf, 0);
+ }
+
  if (vram_kv_cache > 0) {
  LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
  }

- GGML_UNUSED(n_gpu_layers);
+ GGML_UNUSED(i_gpu_start);
+ GGML_UNUSED(offload);

  return true;
  }
@@ -1900,7 +1994,7 @@ namespace GGUFMeta {
  target = override->bool_value;
  return true;
  }
- return true;
+ return false;
  }

  template<typename OT>
@@ -2020,17 +2114,16 @@ struct llama_model_loader {
  enum ggml_type type_max = GGML_TYPE_F32;

  for (int i = 0; i < n_tensors; i++) {
- const char * name = gguf_get_tensor_name(ctx_gguf, i);
- struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, name);
+ enum ggml_type type = gguf_get_tensor_type(ctx_gguf, i);

- n_type[meta->type]++;
+ n_type[type]++;

- if (n_type_max < n_type[meta->type]) {
- n_type_max = n_type[meta->type];
- type_max = meta->type;
+ if (n_type_max < n_type[type]) {
+ n_type_max = n_type[type];
+ type_max = type;
  }

- LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, name, ggml_type_name(meta->type), llama_format_tensor_shape(meta).c_str());
+ // LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, name, ggml_type_name(meta->type), llama_format_tensor_shape(meta).c_str());
  }

  switch (type_max) {
@@ -2168,34 +2261,19 @@ struct llama_model_loader {
  return gguf_get_tensor_name(ctx_gguf, i);
  }

- struct ggml_tensor * get_tensor_meta(int i) const {
- return ggml_get_tensor(ctx_meta, get_tensor_name(i));
+ struct ggml_tensor * get_tensor_meta(const char * name) const {
+ return ggml_get_tensor(ctx_meta, name);
  }

- void calc_sizes(size_t & ctx_size_p, size_t & mmapped_size_p) const {
- ctx_size_p = 0;
- mmapped_size_p = 0;
-
- for (int i = 0; i < n_tensors; i++) {
- struct ggml_tensor * meta = get_tensor_meta(i);
- ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE;
- (use_mmap ? mmapped_size_p : ctx_size_p) += ggml_nbytes_pad(meta);
- }
+ struct ggml_tensor * get_tensor_meta(int i) const {
+ return get_tensor_meta(get_tensor_name(i));
  }

  struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend_type backend) {
- if (backend != GGML_BACKEND_CPU) {
- ggml_set_no_alloc(ctx, true);
- }
-
  struct ggml_tensor * tensor = ggml_dup_tensor(ctx, meta);
  tensor->backend = backend; // TODO: ggml_set_backend
  ggml_set_name(tensor, ggml_get_name(meta));

- if (backend != GGML_BACKEND_CPU) {
- ggml_set_no_alloc(ctx, use_mmap);
- }
-
  n_created++;

  return tensor;
@@ -2253,91 +2331,144 @@ struct llama_model_loader {
  return gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, idx);
  }

+ void init_mapping(bool prefetch = true) {
+ /*
+ // prefetch only CPU tensors
+ if (use_mmap) {
+ size_t size_pref = 0; // prefetch
+
+ for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
+ struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i));
+ if (cur->backend == GGML_BACKEND_CPU) {
+ size_t tensor_end = gguf_get_tensor_offset(ctx_gguf, i) + ggml_nbytes(cur);
+ size_pref = std::max(size_pref, tensor_end);
+ }
+ }
+ mapping.reset(new llama_mmap(&file, gguf_get_data_offset(ctx_gguf) + size_pref, ggml_is_numa()));
+ }
+ */
+ // prefetch the whole file - all the data is needed anyway
+ if (use_mmap) {
+ mapping.reset(new llama_mmap(&file, prefetch ? -1 : 0, ggml_is_numa()));
+ }
+ }
+
+ // for backwards compatibility, does not support ggml-backend
  void load_data_for(struct ggml_tensor * cur) const {
  const size_t offs = file_offset(ggml_get_name(cur));

- if (use_mmap) {
- cur->data = (uint8_t *) mapping->addr + offs;
+ if (use_mmap && mapping) {
+ GGML_ASSERT(cur->data == nullptr);
+ cur->data = (uint8_t *)mapping->addr + offs;
  } else {
+ GGML_ASSERT(cur->data != nullptr);
  file.seek(offs, SEEK_SET);
  file.read_raw(cur->data, ggml_nbytes(cur));
  }
  }

- void load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
+ // Returns false if cancelled by progress_callback
+ bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) const {
  size_t size_data = 0;
- size_t size_lock = 0;
- size_t size_pref = 0; // prefetch

  for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
  struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i));
  size_data += ggml_nbytes(cur);
- if (cur->backend == GGML_BACKEND_CPU) {
- size_pref += ggml_nbytes(cur);
- }
  }

- if (use_mmap) {
- mapping.reset(new llama_mmap(&file, size_pref, ggml_is_numa()));
+ if (use_mmap && buf_mmap) {
  if (lmlock) {
  lmlock->init(mapping->addr);
  }
  }

- size_t done_size = 0;
+ #if (defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)) || defined(GGML_USE_CLBLAST)
+ const bool legacy_offload = true;
+ #else
+ const bool legacy_offload = false;
+ #endif
+
+ std::vector<no_init<uint8_t>> read_buf;
+
+ size_t size_done = 0;
+
+ size_t mmap_first = -1;
+ size_t mmap_last = 0;
+
  for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
  struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i));
  GGML_ASSERT(cur); // unused tensors should have been caught by load_data already

  if (progress_callback) {
- progress_callback((float) done_size / size_data, progress_callback_user_data);
- }
-
- // allocate temp buffer if not using mmap
- if (!use_mmap && cur->data == NULL) {
- GGML_ASSERT(cur->backend != GGML_BACKEND_CPU);
- #ifdef GGML_USE_CPU_HBM
- cur->data = (uint8_t*)hbw_malloc(ggml_nbytes(cur));
- #else
- cur->data = (uint8_t*)malloc(ggml_nbytes(cur));
- #endif
+ if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
+ return false;
+ }
  }

- load_data_for(cur);
+ const size_t offs = file_offset(ggml_get_name(cur));

- switch (cur->backend) {
- case GGML_BACKEND_CPU:
- if (use_mmap && lmlock) {
- size_lock += ggml_nbytes(cur);
- lmlock->grow_to(size_lock);
+ if (!legacy_offload || cur->backend == GGML_BACKEND_CPU) {
+ if (use_mmap && mapping) {
+ if (buf_mmap) {
+ ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + offs);
+ if (lmlock) {
+ lmlock->grow_to(offs + ggml_nbytes(cur));
+ }
+ mmap_first = std::min(mmap_first, offs);
+ mmap_last = std::max(mmap_last, offs + ggml_nbytes(cur));
+ } else {
+ ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + offs, 0, ggml_nbytes(cur));
  }
- break;
- #ifdef GGML_USE_CUBLAS
- case GGML_BACKEND_GPU:
- case GGML_BACKEND_GPU_SPLIT:
- // old code:
- //ggml_cuda_transform_tensor(lt.data, lt.ggml_tensor);
-
- // TODO: test if this works !!
- ggml_cuda_transform_tensor(cur->data, cur);
- if (!use_mmap) {
- free(cur->data);
+ } else {
+ if (ggml_backend_buffer_is_host(cur->buffer)) {
+ file.seek(offs, SEEK_SET);
+ file.read_raw(cur->data, ggml_nbytes(cur));
+ } else {
+ read_buf.resize(ggml_nbytes(cur));
+ file.seek(offs, SEEK_SET);
+ file.read_raw(read_buf.data(), ggml_nbytes(cur));
+ ggml_backend_tensor_set(cur, read_buf.data(), 0, ggml_nbytes(cur));
  }
- break;
+ }
+ } else {
+ // HACK: mark tensor as allocated
+ cur->data = (void *)(uintptr_t)1;
+ void * data;
+ if (use_mmap && mapping) {
+ data = (uint8_t *) mapping->addr + offs;
+ } else {
+ read_buf.resize(ggml_nbytes(cur));
+ file.seek(offs, SEEK_SET);
+ file.read_raw(read_buf.data(), ggml_nbytes(cur));
+ data = read_buf.data();
+ }
+
+ #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
+ ggml_cuda_transform_tensor(data, cur);
  #elif defined(GGML_USE_CLBLAST)
- case GGML_BACKEND_GPU:
- ggml_cl_transform_tensor(cur->data, cur);
- if (!use_mmap) {
- free(cur->data);
- }
- break;
+ GGML_ASSERT(cur->backend == GGML_BACKEND_GPU);
+ ggml_cl_transform_tensor(data, cur);
+ #else
+ GGML_ASSERT(!"GPU tensor without a GPU backend");
+ GGML_UNUSED(data);
  #endif
- default:
- continue;
  }

- done_size += ggml_nbytes(cur);
+ size_done += ggml_nbytes(cur);
  }
+
+ // unmap offloaded tensors and metadata
+ if (use_mmap && mapping) {
+ mapping->unmap_fragment(0, mmap_first);
+ mapping->unmap_fragment(mmap_last, mapping->size);
+ }
+
+ if (progress_callback) {
+ // Even though the model is done loading, we still honor
+ // cancellation since we need to free allocations.
+ return progress_callback(1.0f, progress_callback_user_data);
+ }
+ return true;
  }
  };

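Note on the hunk above: while copying tensors, load_all_data records the lowest and highest file offsets of the tensors that stay memory-mapped (mmap_first / mmap_last) and afterwards unmaps everything outside that window via unmap_fragment. A standalone sketch of that bookkeeping; the offsets and sizes below are made-up illustration values, not real model data:

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
    struct span { size_t offs, size; };
    // hypothetical tensors that remain mapped
    std::vector<span> kept = { {4096, 1024}, {131072, 2048}, {65536, 512} };

    size_t mmap_first = (size_t) -1; // lowest offset still needed
    size_t mmap_last  = 0;           // one past the highest byte still needed

    for (const auto & t : kept) {
        mmap_first = std::min(mmap_first, t.offs);
        mmap_last  = std::max(mmap_last,  t.offs + t.size);
    }

    // the loader then calls:
    //   mapping->unmap_fragment(0, mmap_first);
    //   mapping->unmap_fragment(mmap_last, mapping->size);
    std::printf("keep mapped: [%zu, %zu)\n", mmap_first, mmap_last);
    return 0;
}
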
@@ -2360,25 +2491,25 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {

  switch (ftype) {
  case LLAMA_FTYPE_ALL_F32: return "all F32";
- case LLAMA_FTYPE_MOSTLY_F16: return "mostly F16";
- case LLAMA_FTYPE_MOSTLY_Q4_0: return "mostly Q4_0";
- case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
+ case LLAMA_FTYPE_MOSTLY_F16: return "F16";
+ case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
+ case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
  case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
- return "mostly Q4_1, some F16";
- case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
- case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
- case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
+ return "Q4_1, some F16";
+ case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0";
+ case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1";
+ case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";

  // K-quants
- case LLAMA_FTYPE_MOSTLY_Q2_K: return "mostly Q2_K";
- case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "mostly Q3_K - Small";
- case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "mostly Q3_K - Medium";
- case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "mostly Q3_K - Large";
- case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "mostly Q4_K - Small";
- case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "mostly Q4_K - Medium";
- case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "mostly Q5_K - Small";
- case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "mostly Q5_K - Medium";
- case LLAMA_FTYPE_MOSTLY_Q6_K: return "mostly Q6_K";
+ case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K";
+ case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
+ case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium";
+ case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large";
+ case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "Q4_K - Small";
+ case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "Q4_K - Medium";
+ case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
+ case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
+ case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";

  default: return "unknown, may not work";
  }
@@ -2435,6 +2566,16 @@ static void llm_load_hparams(
  ml.get_key (LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff);
  ml.get_key (LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head);
  ml.get_key (LLM_KV_BLOCK_COUNT, hparams.n_layer);
+ ml.get_key (LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
+ ml.get_key (LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
+
+ GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
+ GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
+ if (hparams.n_expert > 0) {
+ GGML_ASSERT(hparams.n_expert_used > 0);
+ } else {
+ GGML_ASSERT(hparams.n_expert_used == 0);
+ }

  // n_head_kv is optional, default to n_head
  hparams.n_head_kv = hparams.n_head;
@@ -2486,6 +2627,7 @@ static void llm_load_hparams(
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

  switch (hparams.n_layer) {
+ case 22: model.type = e_model::MODEL_1B; break;
  case 26: model.type = e_model::MODEL_3B; break;
  case 32: model.type = e_model::MODEL_7B; break;
  case 40: model.type = e_model::MODEL_13B; break;
@@ -2587,6 +2729,15 @@ static void llm_load_hparams(
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_PHI2:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+
+ switch (hparams.n_layer) {
+ case 32: model.type = e_model::MODEL_3B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;

  default: (void)0;
  }
@@ -2753,7 +2904,7 @@ static void llm_load_vocab(
  // The assumption is, since special tokens aren't meant to be exposed to end user, they are designed
  // to be unmatchable by the tokenizer, therefore tokens from the vocab, which are unmatchable by the tokenizer
  // are special tokens.
- // From testing, this appears to corelate 1:1 with special tokens.
+ // From testing, this appears to correlate 1:1 with special tokens.
  //

  // Counting special tokens and verifying in only one direction
@@ -2866,6 +3017,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
  LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
  LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
  LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
+ LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
+ LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
  LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
  LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
  LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
@@ -2892,7 +3045,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
  if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
  }

- static void llm_load_tensors(
+ // Returns false if cancelled by progress_callback
+ static bool llm_load_tensors(
  llama_model_loader & ml,
  llama_model & model,
  int n_gpu_layers,
@@ -2908,25 +3062,16 @@ static void llm_load_tensors(

  model.n_gpu_layers = n_gpu_layers;

- size_t ctx_size;
- size_t mmapped_size;
-
- ml.calc_sizes(ctx_size, mmapped_size);
+ size_t ctx_size = ggml_tensor_overhead() * ml.n_tensors;

- LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, ctx_size/1024.0/1024.0);
+ LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, ctx_size/1024.0/1024.0);

  // create the ggml context
  {
- model.buf.resize(ctx_size);
- if (use_mlock) {
- model.mlock_buf.init (model.buf.data);
- model.mlock_buf.grow_to(model.buf.size);
- }
-
  struct ggml_init_params params = {
- /*.mem_size =*/ model.buf.size,
- /*.mem_buffer =*/ model.buf.data,
- /*.no_alloc =*/ ml.use_mmap,
+ /*.mem_size =*/ ctx_size,
+ /*.mem_buffer =*/ NULL,
+ /*.no_alloc =*/ true,
  };

  model.ctx = ggml_init(params);
@@ -2937,25 +3082,24 @@ static void llm_load_tensors(

  (void) main_gpu;

- enum ggml_backend_type llama_backend_offload = GGML_BACKEND_CPU;
+ enum ggml_backend_type llama_backend_offload = GGML_BACKEND_CPU;
  enum ggml_backend_type llama_backend_offload_split = GGML_BACKEND_CPU;

- #ifdef GGML_USE_CUBLAS
+ #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
  if (ggml_cublas_loaded()) {
  LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__);
  ggml_cuda_set_main_device(main_gpu);

- llama_backend_offload = GGML_BACKEND_GPU;
+ llama_backend_offload = GGML_BACKEND_GPU;
  llama_backend_offload_split = GGML_BACKEND_GPU_SPLIT;
  }
  #elif defined(GGML_USE_CLBLAST)
  LLAMA_LOG_INFO("%s: using OpenCL for GPU acceleration\n", __func__);
- llama_backend_offload = GGML_BACKEND_GPU;
+ llama_backend_offload = GGML_BACKEND_GPU;
  llama_backend_offload_split = GGML_BACKEND_GPU;
  #endif

- // prepare memory for the weights
- size_t vram_weights = 0;
+ // create tensors for the weights
  {
  const int64_t n_embd = hparams.n_embd;
  const int64_t n_embd_gqa = hparams.n_embd_gqa();
@@ -2984,13 +3128,6 @@ static void llm_load_tensors(

  model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
  model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
-
- if (backend_norm == GGML_BACKEND_GPU) {
- vram_weights += ggml_nbytes(model.output_norm);
- }
- if (backend_output == GGML_BACKEND_GPU_SPLIT) {
- vram_weights += ggml_nbytes(model.output);
- }
  }

  const uint32_t n_ff = hparams.n_ff;
@@ -3020,20 +3157,25 @@ static void llm_load_tensors(

  layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);

- layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
- layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
- layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+ layer.ffn_gate_inp = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd}, backend, false);

- if (backend == GGML_BACKEND_GPU) {
- vram_weights +=
- ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
- ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) +
- (layer.bq ? ggml_nbytes(layer.bq) : 0) +
- (layer.bk ? ggml_nbytes(layer.bk) : 0) +
- (layer.bv ? ggml_nbytes(layer.bv) : 0) +
- (layer.bo ? ggml_nbytes(layer.bo) : 0) +
- ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_gate) +
- ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
+ if (layer.ffn_gate_inp == nullptr) {
+ GGML_ASSERT(hparams.n_expert == 0);
+ GGML_ASSERT(hparams.n_expert_used == 0);
+
+ layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
+ layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
+ layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+ } else {
+ GGML_ASSERT(hparams.n_expert > 0);
+ GGML_ASSERT(hparams.n_expert_used > 0);
+
+ // MoE branch
+ for (uint32_t x = 0; x < hparams.n_expert; ++x) {
+ layer.ffn_gate_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), {n_embd, n_ff}, backend_split);
+ layer.ffn_down_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd}, backend_split);
+ layer.ffn_up_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), {n_embd, n_ff}, backend_split);
+ }
  }
  }
  } break;
@@ -3054,13 +3196,6 @@ static void llm_load_tensors(

  model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
  model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
-
- if (backend_norm == GGML_BACKEND_GPU) {
- vram_weights += ggml_nbytes(model.output_norm);
- }
- if (backend_output == GGML_BACKEND_GPU_SPLIT) {
- vram_weights += ggml_nbytes(model.output);
- }
  }

  const uint32_t n_ff = hparams.n_ff;
@@ -3087,19 +3222,10 @@ static void llm_load_tensors(
  layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
  layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
  layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
-
- if (backend == GGML_BACKEND_GPU) {
- vram_weights +=
- ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
- ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
- ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
- }
  }
  } break;
  case LLM_ARCH_FALCON:
  {
- // TODO: CPU-only for now
-
  model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);

  // output
@@ -3118,14 +3244,6 @@ static void llm_load_tensors(
  model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
  model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
  model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
-
- if (backend_norm == GGML_BACKEND_GPU) {
- vram_weights += ggml_nbytes(model.output_norm);
- vram_weights += ggml_nbytes(model.output_norm_b);
- }
- if (backend_output == GGML_BACKEND_GPU_SPLIT) {
- vram_weights += ggml_nbytes(model.output);
- }
  }

  const uint32_t n_ff = hparams.n_ff;
@@ -3146,11 +3264,6 @@ static void llm_load_tensors(
  if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str()) >= 0) {
  layer.attn_norm_2 = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, backend);
  layer.attn_norm_2_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, backend);
-
- if (backend == GGML_BACKEND_GPU) {
- vram_weights += ggml_nbytes(layer.attn_norm_2);
- vram_weights += ggml_nbytes(layer.attn_norm_2_b);
- }
  }

  layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
@@ -3158,13 +3271,6 @@ static void llm_load_tensors(

  layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
  layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
-
- if (backend == GGML_BACKEND_GPU) {
- vram_weights +=
- ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
- ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.wo) +
- ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
- }
  }
  } break;
  case LLM_ARCH_STARCODER:
@@ -3188,14 +3294,6 @@ static void llm_load_tensors(
  model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
  model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
  model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
-
- if (backend_norm == GGML_BACKEND_GPU) {
- vram_weights += ggml_nbytes(model.output_norm);
- vram_weights += ggml_nbytes(model.output_norm_b);
- }
- if (backend_output == GGML_BACKEND_GPU_SPLIT) {
- vram_weights += ggml_nbytes(model.output);
- }
  }

  const uint32_t n_ff = hparams.n_ff;
@@ -3227,16 +3325,6 @@ static void llm_load_tensors(

  layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
  layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
-
- if (backend == GGML_BACKEND_GPU) {
- vram_weights +=
- ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
- ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) +
- ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) +
- ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) +
- ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_down_b) +
- ggml_nbytes(layer.ffn_up) + ggml_nbytes(layer.ffn_up_b);
- }
  }
  } break;
  case LLM_ARCH_PERSIMMON:
@@ -3258,14 +3346,6 @@ static void llm_load_tensors(
  model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
  model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
  model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
-
- if (backend_norm == GGML_BACKEND_GPU) {
- vram_weights += ggml_nbytes(model.output_norm);
- vram_weights += ggml_nbytes(model.output_norm_b);
- }
- if (backend_output == GGML_BACKEND_GPU_SPLIT) {
- vram_weights += ggml_nbytes(model.output);
- }
  }

  const uint32_t n_ff = hparams.n_ff;
@@ -3295,8 +3375,6 @@ static void llm_load_tensors(
  } break;
  case LLM_ARCH_BLOOM:
  {
- // TODO: CPU-only for now
-
  model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
  model.tok_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, GGML_BACKEND_CPU);
  model.tok_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, GGML_BACKEND_CPU);
@@ -3317,14 +3395,6 @@ static void llm_load_tensors(
  model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
  model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
  model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
-
- if (backend_norm == GGML_BACKEND_GPU) {
- vram_weights += ggml_nbytes(model.output_norm);
- vram_weights += ggml_nbytes(model.output_norm_b);
- }
- if (backend_output == GGML_BACKEND_GPU_SPLIT) {
- vram_weights += ggml_nbytes(model.output);
- }
  }

  const uint32_t n_ff = hparams.n_ff;
@@ -3356,16 +3426,6 @@ static void llm_load_tensors(

  layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
  layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
-
- if (backend == GGML_BACKEND_GPU) {
- vram_weights +=
- ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
- ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) +
- ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) +
- ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) +
- ggml_nbytes(layer.ffn_up) + ggml_nbytes(layer.ffn_up_b) +
- ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_down_b);
- }
  }
  } break;
  case LLM_ARCH_MPT:
@@ -3387,13 +3447,6 @@ static void llm_load_tensors(

  model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
  model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
-
- if (backend_norm == GGML_BACKEND_GPU) {
- vram_weights += ggml_nbytes(model.output_norm);
- }
- if (backend_output == GGML_BACKEND_GPU_SPLIT) {
- vram_weights += ggml_nbytes(model.output);
- }
  }

  const uint32_t n_ff = hparams.n_ff;
@@ -3416,16 +3469,6 @@ static void llm_load_tensors(

  layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
  layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
-
- if (backend == GGML_BACKEND_GPU) {
- vram_weights +=
- ggml_nbytes(layer.attn_norm) +
- ggml_nbytes(layer.wqkv) +
- ggml_nbytes(layer.wo) +
- ggml_nbytes(layer.ffn_norm) +
- ggml_nbytes(layer.ffn_down) +
- ggml_nbytes(layer.ffn_up);
- }
  }
  } break;
  case LLM_ARCH_STABLELM:
@@ -3448,13 +3491,6 @@ static void llm_load_tensors(
  model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
  model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
  model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
-
- if (backend_norm == GGML_BACKEND_GPU) {
- vram_weights += ggml_nbytes(model.output_norm);
- }
- if (backend_output == GGML_BACKEND_GPU_SPLIT) {
- vram_weights += ggml_nbytes(model.output);
- }
  }

  const uint32_t n_ff = hparams.n_ff;
@@ -3486,13 +3522,6 @@ static void llm_load_tensors(
  layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
  layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
  layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
-
- if (backend == GGML_BACKEND_GPU) {
- vram_weights +=
- ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
- ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
- ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
- }
  }
  } break;
  case LLM_ARCH_QWEN:
@@ -3512,14 +3541,7 @@ static void llm_load_tensors(

  model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
  model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
-
- if (backend_norm == GGML_BACKEND_GPU) {
- vram_weights += ggml_nbytes(model.output_norm);
- }
- if (backend_output == GGML_BACKEND_GPU_SPLIT) {
- vram_weights += ggml_nbytes(model.output);
- }
- }
+ }

  const uint32_t n_ff = hparams.n_ff / 2;

@@ -3544,16 +3566,59 @@ static void llm_load_tensors(
3544
3566
  layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
3545
3567
  layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
3546
3568
  layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
3569
+ }
3570
+ } break;
3571
+ case LLM_ARCH_PHI2:
3572
+ {
3573
+ model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
3547
3574
 
3548
- if (backend == GGML_BACKEND_GPU) {
3549
- vram_weights +=
3550
- ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) +
3551
- ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_gate) +
3552
- ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
3575
+ // output
3576
+ {
3577
+ ggml_backend_type backend_norm;
3578
+ ggml_backend_type backend_output;
3579
+
3580
+ if (n_gpu_layers > int(n_layer)) {
3581
+ backend_norm = llama_backend_offload;
3582
+ backend_output = llama_backend_offload;
3583
+ } else {
3584
+ backend_norm = GGML_BACKEND_CPU;
3585
+ backend_output = GGML_BACKEND_CPU;
3553
3586
  }
3587
+
3588
+ model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
3589
+ model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
3590
+ model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
3591
+ model.output_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, backend_output);
3554
3592
  }
3555
- } break;
3556
3593
 
3594
+ const uint32_t n_ff = hparams.n_ff;
3595
+
3596
+ const int i_gpu_start = n_layer - n_gpu_layers;
3597
+
3598
+ model.layers.resize(n_layer);
3599
+
3600
+ for (uint32_t i = 0; i < n_layer; ++i) {
3601
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
3602
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
3603
+
3604
+ auto & layer = model.layers[i];
3605
+
3606
+ layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
3607
+ layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
3608
+
3609
+ layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
3610
+ layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend);
3611
+
3612
+ layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
3613
+ layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend);
3614
+
3615
+ layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
3616
+ layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend);
3617
+
3618
+ layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
3619
+ layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
3620
+ }
3621
+ } break;
3557
3622
  default:
3558
3623
  throw std::runtime_error("unknown architecture");
3559
3624
  }
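Editor's note: the Phi-2 loader added above follows the same per-layer placement rule as the other architectures in this function: layers below i_gpu_start = n_layer - n_gpu_layers stay on the CPU, the rest go to the offload backend, and the output norm/head are offloaded only when n_gpu_layers > n_layer. A minimal standalone sketch of that split (plain C++, hypothetical layer counts, simplified backend names):

    #include <algorithm>
    #include <cstdio>

    // sketch: which layers end up on the GPU for a given -ngl value (illustrative only)
    int main() {
        const int n_layer      = 32;   // hypothetical model depth
        const int n_gpu_layers = 20;   // hypothetical -ngl setting
        const int i_gpu_start  = std::max(0, n_layer - n_gpu_layers);

        for (int i = 0; i < n_layer; ++i) {
            const bool on_gpu = i >= i_gpu_start;
            std::printf("layer %2d -> %s\n", i, on_gpu ? "offload" : "CPU");
        }
        // the output head / final norm are offloaded only when every layer is offloaded
        std::printf("output   -> %s\n", n_gpu_layers > n_layer ? "offload" : "CPU");
        return 0;
    }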
@@ -3561,16 +3626,78 @@ static void llm_load_tensors(
3561
3626
 
3562
3627
  ml.done_getting_tensors();
3563
3628
 
3629
+ ml.init_mapping();
3630
+
3631
+ // allocate tensors
3632
+ size_t vram_weights = 0;
3633
+ size_t buf_size = 0;
3634
+
3635
+ ggml_backend_buffer_type_t buft = llama_default_buffer_type(n_gpu_layers);
3636
+
3637
+ for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
3638
+ // GGML_BACKEND_GPU tensors are for CUDA and OpenCL only, which are handled separately without ggml-backend
3639
+ if (t->backend == GGML_BACKEND_CPU) {
3640
+ buf_size += GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), ggml_backend_buft_get_alignment(buft));
3641
+ } else {
3642
+ vram_weights += ggml_nbytes(t);
3643
+ }
3644
+ }
3645
+
3646
+ // create backend buffer
3647
+ ggml_backend_buffer_t buf_mmap = nullptr;
3648
+
3649
+ #ifdef GGML_USE_METAL
3650
+ if (n_gpu_layers > 0) {
3651
+ if (ml.use_mmap) {
3652
+ const size_t max_size = ggml_get_max_tensor_size(ctx);
3653
+ model.buf = ggml_backend_metal_buffer_from_ptr(ml.mapping->addr, ml.mapping->size, max_size);
3654
+ buf_mmap = model.buf;
3655
+ } else {
3656
+ model.buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_metal_buffer_type());
3657
+ }
3658
+ }
3659
+ #elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST)
3660
+ // for testing only
3661
+ if (n_gpu_layers > 0) {
3662
+ model.buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cuda_buffer_type(0));
3663
+ }
3664
+ #endif
3665
+
3666
+ if (model.buf == nullptr) {
3667
+ // CPU backend, and indirectly CUDA and OpenCL
3668
+ if (ml.use_mmap) {
3669
+ model.buf = ggml_backend_cpu_buffer_from_ptr(ml.mapping->addr, ml.mapping->size);
3670
+ buf_mmap = model.buf;
3671
+ } else {
3672
+ // allocate only CPU tensors
3673
+ model.buf = ggml_backend_buft_alloc_buffer(buft, buf_size);
3674
+ ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(model.buf);
3675
+ for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
3676
+ if (t->backend == GGML_BACKEND_CPU) {
3677
+ ggml_tallocr_alloc(alloc, t);
3678
+ }
3679
+ }
3680
+ ggml_tallocr_free(alloc);
3681
+ }
3682
+ }
3683
+
3684
+ if (use_mlock && ggml_backend_buffer_is_host(model.buf)) {
3685
+ model.mlock_buf.init (ggml_backend_buffer_get_base(model.buf));
3686
+ model.mlock_buf.grow_to(ggml_backend_buffer_get_size(model.buf));
3687
+ }
3688
+
3564
3689
  // print memory requirements
3565
3690
  {
3566
- // this is the total memory required to run the inference
3567
- size_t mem_required =
3568
- ctx_size +
3569
- mmapped_size - vram_weights; // weights in VRAM not in memory
3691
+ size_t sys_mem_required = ctx_size + buf_size;
3570
3692
 
3571
- LLAMA_LOG_INFO("%s: mem required = %7.2f MiB\n", __func__, mem_required / 1024.0 / 1024.0);
3693
+ if (sys_mem_required > 0) {
3694
+ LLAMA_LOG_INFO("%s: system memory used = %7.2f MiB\n", __func__, sys_mem_required / 1024.0 / 1024.0);
3695
+ }
3696
+ if (vram_weights > 0) {
3697
+ LLAMA_LOG_INFO("%s: VRAM used = %7.2f MiB\n", __func__, vram_weights / 1024.0 / 1024.0);
3698
+ }
3572
3699
 
3573
- #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
3700
+ #if (defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)) || defined(GGML_USE_CLBLAST)
3574
3701
  const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
3575
3702
 
3576
3703
  LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
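Editor's note: with ggml-backend, the loader above sizes the host buffer by summing the backend-specific allocation size of every CPU tensor, rounded up to the buffer type's alignment, while offloaded tensors only contribute to the vram_weights counter; the "system memory used" line is then ctx_size + buf_size and "VRAM used" is the sum of ggml_nbytes over offloaded tensors. A rough sketch of the round-up arithmetic, assuming GGML_PAD behaves like the usual align-up macro:

    #include <cstddef>

    // sketch of the round-up used when accumulating buf_size
    // (GGML_PAD in ggml.h is assumed to behave like this; alignment is a power of two)
    static size_t pad_to(size_t size, size_t alignment) {
        return (size + alignment - 1) & ~(alignment - 1);
    }

    // e.g. a 6000-byte tensor with 64-byte buffer alignment reserves pad_to(6000, 64) == 6016 bytes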
@@ -3578,38 +3705,27 @@ static void llm_load_tensors(
3578
3705
  LLAMA_LOG_INFO("%s: offloading non-repeating layers to GPU\n", __func__);
3579
3706
  }
3580
3707
 
3581
- #ifdef GGML_USE_CUBLAS
3582
3708
  const int max_backend_supported_layers = hparams.n_layer + 1;
3583
3709
  const int max_offloadable_layers = hparams.n_layer + 1;
3584
- #elif GGML_USE_CLBLAST
3585
- const int max_backend_supported_layers = hparams.n_layer + 1;
3586
- const int max_offloadable_layers = hparams.n_layer + 1;
3587
- #endif // GGML_USE_CUBLAS
3588
3710
 
3589
3711
  LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
3590
- LLAMA_LOG_INFO("%s: VRAM used: %.2f MiB\n", __func__, vram_weights / 1024.0 / 1024.0);
3591
- #else
3592
- (void) n_gpu_layers;
3593
3712
  #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
3594
3713
  }
3595
3714
 
3596
- // populate `tensors_by_name`
3715
+ #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
3716
+ ggml_cuda_set_tensor_split(tensor_split);
3717
+ #else
3718
+ GGML_UNUSED(tensor_split);
3719
+ #endif // GGML_USE_CUBLAS
3720
+
3721
+ // populate tensors_by_name
3597
3722
  for (int i = 0; i < ml.n_tensors; ++i) {
3598
3723
  struct ggml_tensor * cur = ggml_get_tensor(ctx, ml.get_tensor_name(i));
3599
3724
  model.tensors_by_name.emplace_back(ggml_get_name(cur), cur);
3600
3725
  }
3601
3726
 
3602
- (void) tensor_split;
3603
- #ifdef GGML_USE_CUBLAS
3604
- {
3605
- ggml_cuda_set_tensor_split(tensor_split);
3606
- }
3607
- #endif
3608
-
3609
- ml.load_all_data(ctx, progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL);
3610
-
3611
- if (progress_callback) {
3612
- progress_callback(1.0f, progress_callback_user_data);
3727
+ if (!ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf_mmap, use_mlock ? &model.mlock_mmap : NULL)) {
3728
+ return false;
3613
3729
  }
3614
3730
 
3615
3731
  model.mapping = std::move(ml.mapping);
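Editor's note: when mmap is enabled, the whole mapped model file is wrapped as a single host backend buffer, so CPU tensors reference the mapping directly instead of being copied, and mlock can pin exactly that region. A minimal sketch of wrapping an existing host allocation the same way, using only the ggml-backend calls that appear above (a plain std::vector stands in for the mmap-ed file):

    #include <cstdint>
    #include <vector>
    #include "ggml-backend.h"

    // sketch: expose an existing host region as a ggml backend buffer
    void wrap_host_region() {
        std::vector<uint8_t> region(16u * 1024 * 1024);

        ggml_backend_buffer_t buf = ggml_backend_cpu_buffer_from_ptr(region.data(), region.size());

        // tensors created with no_alloc can now be placed inside `buf` with a tallocr,
        // much like the non-mmap path above does with ggml_tallocr_new_from_buffer

        ggml_backend_buffer_free(buf); // frees the wrapper, not `region` itself
    }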
@@ -3617,9 +3733,11 @@ static void llm_load_tensors(
3617
3733
  // loading time will be recalculate after the first eval, so
3618
3734
  // we take page faults deferred by mmap() into consideration
3619
3735
  model.t_load_us = ggml_time_us() - model.t_start_us;
3736
+ return true;
3620
3737
  }
3621
3738
 
3622
- static bool llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) {
3739
+ // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
3740
+ static int llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) {
3623
3741
  try {
3624
3742
  llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);
3625
3743
 
@@ -3637,19 +3755,21 @@ static bool llama_model_load(const std::string & fname, llama_model & model, con
3637
3755
 
3638
3756
  if (params.vocab_only) {
3639
3757
  LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
3640
- return true;
3758
+ return 0;
3641
3759
  }
3642
3760
 
3643
- llm_load_tensors(
3761
+ if (!llm_load_tensors(
3644
3762
  ml, model, params.n_gpu_layers, params.main_gpu, params.tensor_split, params.use_mlock,
3645
3763
  params.progress_callback, params.progress_callback_user_data
3646
- );
3764
+ )) {
3765
+ return -2;
3766
+ }
3647
3767
  } catch (const std::exception & err) {
3648
3768
  LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
3649
- return false;
3769
+ return -1;
3650
3770
  }
3651
3771
 
3652
- return true;
3772
+ return 0;
3653
3773
  }
3654
3774
 
3655
3775
  //
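Editor's note: llama_model_load now distinguishes an error from a user-initiated cancel: 0 on success, -1 on an exception during loading, and -2 when the progress callback aborts the load (which is why llm_load_tensors and load_all_data now return bool). A hedged sketch of a cancel-capable callback, assuming the bool-returning llama_progress_callback that this change relies on, where returning false stops the load:

    // sketch only: abort loading once progress passes 50 %
    static bool my_progress_cb(float progress, void * user_data) {
        (void) user_data;
        return progress < 0.5f;   // false => cancel; llama_model_load then reports -2
    }

    // usage (assumed field names, as referenced by params.progress_callback above):
    //   llama_model_params mparams = llama_model_default_params();
    //   mparams.progress_callback           = my_progress_cb;
    //   mparams.progress_callback_user_data = nullptr;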
@@ -3750,8 +3870,8 @@ static void llm_build_k_shift(
3750
3870
  ggml_rope_custom_inplace(ctx,
3751
3871
  ggml_view_3d(ctx, kv.k_l[il],
3752
3872
  n_embd_head, n_head_kv, n_ctx,
3753
- ggml_type_sizef(kv.k_l[il]->type)*n_embd_head,
3754
- ggml_type_sizef(kv.k_l[il]->type)*n_embd_gqa,
3873
+ ggml_row_size(kv.k_l[il]->type, n_embd_head),
3874
+ ggml_row_size(kv.k_l[il]->type, n_embd_gqa),
3755
3875
  0),
3756
3876
  K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
3757
3877
  ext_factor, attn_factor, beta_fast, beta_slow);
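Editor's note: the K-cache views above switch from ggml_type_sizef(type)*n, a floating-point bytes-per-element estimate, to ggml_row_size(type, n), which returns the exact byte size of a row of n elements and therefore stays correct for block-quantized K-cache types. A small sketch of the difference, assuming the usual ggml type helpers:

    #include <cstdio>
    #include "ggml.h"

    // sketch: exact row size vs. floating-point estimate for a block-quantized type
    void compare_row_sizes() {
        const ggml_type type = GGML_TYPE_Q4_0; // 32 elements per block
        const int64_t   n    = 4096;           // must be a multiple of the block size

        const size_t exact  = ggml_row_size(type, n);              // integer, block-aware
        const double approx = (double) ggml_type_sizef(type) * n;  // per-element estimate, can drift

        std::printf("exact = %zu bytes, approx = %.2f bytes\n", exact, approx);
    }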
@@ -3780,7 +3900,7 @@ static void llm_build_kv_store(
3780
3900
  cb(v_cur_t, "v_cur_t", il);
3781
3901
 
3782
3902
  struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_gqa,
3783
- (ggml_type_sizef(kv.k_l[il]->type)*n_embd_gqa)*kv_head);
3903
+ (ggml_row_size(kv.k_l[il]->type, n_embd_gqa))*kv_head);
3784
3904
  cb(k_cache_view, "k_cache_view", il);
3785
3905
 
3786
3906
  struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_gqa,
@@ -3914,17 +4034,18 @@ static struct ggml_tensor * llm_build_ffn(
3914
4034
  // if max_alibi_bias > 0 then apply ALiBi
3915
4035
  static struct ggml_tensor * llm_build_kqv(
3916
4036
  struct ggml_context * ctx,
4037
+ const llama_model & model,
3917
4038
  const llama_hparams & hparams,
3918
4039
  const llama_kv_cache & kv,
3919
4040
  struct ggml_tensor * wo,
3920
4041
  struct ggml_tensor * wo_b,
3921
4042
  struct ggml_tensor * q_cur,
3922
- struct ggml_tensor * kq_scale,
3923
4043
  struct ggml_tensor * kq_mask,
3924
4044
  int64_t n_ctx,
3925
4045
  int32_t n_tokens,
3926
4046
  int32_t n_kv,
3927
4047
  float max_alibi_bias,
4048
+ float kq_scale,
3928
4049
  const llm_build_cb & cb,
3929
4050
  int il) {
3930
4051
  const int64_t n_embd = hparams.n_embd;
@@ -3939,14 +4060,20 @@ static struct ggml_tensor * llm_build_kqv(
3939
4060
  struct ggml_tensor * k =
3940
4061
  ggml_view_3d(ctx, kv.k_l[il],
3941
4062
  n_embd_head, n_kv, n_head_kv,
3942
- ggml_type_sizef(kv.k_l[il]->type)*n_embd_gqa,
3943
- ggml_type_sizef(kv.k_l[il]->type)*n_embd_head,
4063
+ ggml_row_size(kv.k_l[il]->type, n_embd_gqa),
4064
+ ggml_row_size(kv.k_l[il]->type, n_embd_head),
3944
4065
  0);
3945
4066
  cb(k, "k", il);
3946
4067
 
3947
4068
  struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
3948
4069
  cb(kq, "kq", il);
3949
4070
 
4071
+ if (model.arch == LLM_ARCH_PHI2) {
4072
+ // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
4073
+ // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
4074
+ ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
4075
+ }
4076
+
3950
4077
  if (max_alibi_bias > 0.0f) {
3951
4078
  // temporary branch until we figure out how to handle ggml_alibi through ggml_add
3952
4079
  kq = ggml_scale(ctx, kq, kq_scale);
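Editor's note: llm_build_kqv now takes the scale as a plain float and, outside the ALiBi branch, folds it into ggml_soft_max_ext together with the mask, i.e. it computes standard scaled dot-product attention; Phi-2 additionally forces the KQ matmul to F32 because the raw scores overflow F16. In formula form:

    \mathrm{Attn}(Q, K, V) = \mathrm{softmax}\!\left( \frac{Q K^{\top}}{\sqrt{d_{\mathrm{head}}}} + M \right) V

where M is the 0 / -inf mask and d_head = n_embd_head; for Phi-2 the 1/sqrt(d_head) factor is applied to Q ahead of time, so kq_scale is passed as 1.0f.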
@@ -3966,7 +4093,7 @@ static struct ggml_tensor * llm_build_kqv(
3966
4093
  kq = ggml_soft_max(ctx, kq);
3967
4094
  cb(kq, "kq_soft_max", il);
3968
4095
  } else {
3969
- kq = ggml_soft_max_ext(ctx, kq, kq_mask, 1.0f/sqrtf(float(n_embd_head)));
4096
+ kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale);
3970
4097
  cb(kq, "kq_soft_max_ext", il);
3971
4098
  }
3972
4099
 
@@ -4014,6 +4141,8 @@ struct llm_build_context {
4014
4141
  const int64_t n_head_kv;
4015
4142
  const int64_t n_embd_head;
4016
4143
  const int64_t n_embd_gqa;
4144
+ const int64_t n_expert;
4145
+ const int64_t n_expert_used;
4017
4146
 
4018
4147
  const float freq_base;
4019
4148
  const float freq_scale;
@@ -4033,7 +4162,7 @@ struct llm_build_context {
4033
4162
 
4034
4163
  const llm_build_cb & cb;
4035
4164
 
4036
- llama_buffer & buf_compute;
4165
+ std::vector<uint8_t> & buf_compute_meta;
4037
4166
 
4038
4167
  struct ggml_context * ctx0 = nullptr;
4039
4168
 
@@ -4043,33 +4172,35 @@ struct llm_build_context {
4043
4172
  const llama_batch & batch,
4044
4173
  const llm_build_cb & cb,
4045
4174
  bool worst_case) :
4046
- model (lctx.model),
4047
- hparams (model.hparams),
4048
- cparams (lctx.cparams),
4049
- batch (batch),
4050
- kv_self (lctx.kv_self),
4051
- n_embd (hparams.n_embd),
4052
- n_layer (hparams.n_layer),
4053
- n_ctx (cparams.n_ctx),
4054
- n_head (hparams.n_head),
4055
- n_head_kv (hparams.n_head_kv),
4056
- n_embd_head (hparams.n_embd_head()),
4057
- n_embd_gqa (hparams.n_embd_gqa()),
4058
- freq_base (cparams.rope_freq_base),
4059
- freq_scale (cparams.rope_freq_scale),
4060
- ext_factor (cparams.yarn_ext_factor),
4061
- attn_factor (cparams.yarn_attn_factor),
4062
- beta_fast (cparams.yarn_beta_fast),
4063
- beta_slow (cparams.yarn_beta_slow),
4064
- norm_eps (hparams.f_norm_eps),
4065
- norm_rms_eps (hparams.f_norm_rms_eps),
4066
- n_tokens (batch.n_tokens),
4067
- n_kv (worst_case ? n_ctx : kv_self.n),
4068
- kv_head (worst_case ? n_ctx - n_tokens : kv_self.head),
4069
- n_orig_ctx (cparams.n_yarn_orig_ctx),
4070
- do_rope_shift (worst_case || kv_self.has_shift),
4071
- cb (cb),
4072
- buf_compute (lctx.buf_compute) {
4175
+ model (lctx.model),
4176
+ hparams (model.hparams),
4177
+ cparams (lctx.cparams),
4178
+ batch (batch),
4179
+ kv_self (lctx.kv_self),
4180
+ n_embd (hparams.n_embd),
4181
+ n_layer (hparams.n_layer),
4182
+ n_ctx (cparams.n_ctx),
4183
+ n_head (hparams.n_head),
4184
+ n_head_kv (hparams.n_head_kv),
4185
+ n_embd_head (hparams.n_embd_head()),
4186
+ n_embd_gqa (hparams.n_embd_gqa()),
4187
+ n_expert (hparams.n_expert),
4188
+ n_expert_used (hparams.n_expert_used),
4189
+ freq_base (cparams.rope_freq_base),
4190
+ freq_scale (cparams.rope_freq_scale),
4191
+ ext_factor (cparams.yarn_ext_factor),
4192
+ attn_factor (cparams.yarn_attn_factor),
4193
+ beta_fast (cparams.yarn_beta_fast),
4194
+ beta_slow (cparams.yarn_beta_slow),
4195
+ norm_eps (hparams.f_norm_eps),
4196
+ norm_rms_eps (hparams.f_norm_rms_eps),
4197
+ n_tokens (batch.n_tokens),
4198
+ n_kv (worst_case ? n_ctx : kv_self.n),
4199
+ kv_head (worst_case ? n_ctx - n_tokens : kv_self.head),
4200
+ n_orig_ctx (cparams.n_yarn_orig_ctx),
4201
+ do_rope_shift (worst_case || kv_self.has_shift),
4202
+ cb (cb),
4203
+ buf_compute_meta (lctx.buf_compute_meta) {
4073
4204
  GGML_ASSERT(!!kv_self.ctx);
4074
4205
 
4075
4206
  // all initializations should be done in init()
@@ -4077,8 +4208,8 @@ struct llm_build_context {
4077
4208
 
4078
4209
  void init() {
4079
4210
  struct ggml_init_params params = {
4080
- /*.mem_size =*/ buf_compute.size,
4081
- /*.mem_buffer =*/ buf_compute.data,
4211
+ /*.mem_size =*/ buf_compute_meta.size(),
4212
+ /*.mem_buffer =*/ buf_compute_meta.data(),
4082
4213
  /*.no_alloc =*/ true,
4083
4214
  };
4084
4215
 
@@ -4107,10 +4238,6 @@ struct llm_build_context {
4107
4238
  struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
4108
4239
  cb(inp_pos, "inp_pos", -1);
4109
4240
 
4110
- // KQ_scale
4111
- struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
4112
- cb(KQ_scale, "KQ_scale", -1);
4113
-
4114
4241
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
4115
4242
  struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
4116
4243
  cb(KQ_mask, "KQ_mask", -1);
@@ -4169,9 +4296,9 @@ struct llm_build_context {
4169
4296
 
4170
4297
  llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
4171
4298
 
4172
- cur = llm_build_kqv(ctx0, hparams, kv_self,
4299
+ cur = llm_build_kqv(ctx0, model, hparams, kv_self,
4173
4300
  model.layers[il].wo, model.layers[il].bo,
4174
- Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
4301
+ Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
4175
4302
  cb(cur, "kqv_out", il);
4176
4303
  }
4177
4304
 
@@ -4179,7 +4306,7 @@ struct llm_build_context {
4179
4306
  cb(ffn_inp, "ffn_inp", il);
4180
4307
 
4181
4308
  // feed-forward network
4182
- {
4309
+ if (model.layers[il].ffn_gate_inp == nullptr) {
4183
4310
  cur = llm_build_norm(ctx0, ffn_inp, hparams,
4184
4311
  model.layers[il].ffn_norm, NULL,
4185
4312
  LLM_NORM_RMS, cb, il);
@@ -4191,6 +4318,69 @@ struct llm_build_context {
4191
4318
  model.layers[il].ffn_down, NULL,
4192
4319
  LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
4193
4320
  cb(cur, "ffn_out", il);
4321
+ } else {
4322
+ // MoE branch
4323
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
4324
+ model.layers[il].ffn_norm, NULL,
4325
+ LLM_NORM_RMS, cb, il);
4326
+ cb(cur, "ffn_norm", il);
4327
+
4328
+ ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts]
4329
+ cb(logits, "ffn_moe_logits", il);
4330
+
4331
+ ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts]
4332
+ cb(probs, "ffn_moe_probs", il);
4333
+
4334
+ // select experts
4335
+ ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used); // [n_tokens, num_experts_per_tok]
4336
+ cb(selected_experts->src[0], "ffn_moe_argsort", il);
4337
+
4338
+ ggml_tensor * weights = ggml_get_rows(ctx0,
4339
+ ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts);
4340
+ cb(weights, "ffn_moe_weights", il);
4341
+
4342
+ weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok]
4343
+
4344
+ ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights);
4345
+ cb(weights_sum, "ffn_moe_weights_sum", il);
4346
+
4347
+ weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok]
4348
+ cb(weights, "ffn_moe_weights_norm", il);
4349
+
4350
+ // compute expert outputs
4351
+ ggml_tensor * moe_out = nullptr;
4352
+
4353
+ for (int i = 0; i < n_expert_used; ++i) {
4354
+ ggml_tensor * cur_expert;
4355
+
4356
+ ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exp, n_expert, selected_experts, i, cur);
4357
+ cb(cur_up, "ffn_moe_up", il);
4358
+
4359
+ ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exp, n_expert, selected_experts, i, cur);
4360
+ cb(cur_gate, "ffn_moe_gate", il);
4361
+
4362
+ cur_gate = ggml_silu(ctx0, cur_gate);
4363
+ cb(cur_gate, "ffn_moe_silu", il);
4364
+
4365
+ cur_expert = ggml_mul(ctx0, cur_up, cur_gate); // [n_tokens, n_embd]
4366
+ cb(cur_expert, "ffn_moe_gate_par", il);
4367
+
4368
+ cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exp, n_expert, selected_experts, i, cur_expert); // [n_tokens, n_embd]
4369
+ cb(cur_expert, "ffn_moe_down", il);
4370
+
4371
+ cur_expert = ggml_mul(ctx0, cur_expert,
4372
+ ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
4373
+ cb(cur_expert, "ffn_moe_weighted", il);
4374
+
4375
+ if (i == 0) {
4376
+ moe_out = cur_expert;
4377
+ } else {
4378
+ moe_out = ggml_add(ctx0, moe_out, cur_expert);
4379
+ cb(moe_out, "ffn_moe_out", il);
4380
+ }
4381
+ }
4382
+
4383
+ cur = moe_out;
4194
4384
  }
4195
4385
 
4196
4386
  cur = ggml_add(ctx0, cur, ffn_inp);
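Editor's note: the MoE branch above implements Mixtral-style routing: the gate projection produces per-expert logits, softmax turns them into probabilities, the top n_expert_used experts are selected (ggml_top_k is an argsort plus truncation), their probabilities are renormalized to sum to 1, and the layer output is the weighted sum of the selected experts' SwiGLU FFN outputs. The routing arithmetic on its own, as a plain C++ sketch with toy numbers (no ggml, names illustrative):

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <numeric>
    #include <vector>

    // sketch: top-k expert routing for a single token (n_expert = 8, n_expert_used = 2)
    int main() {
        std::vector<float> logits = { 0.1f, 2.0f, -1.0f, 0.5f, 1.5f, -0.3f, 0.0f, 0.7f };
        const int n_expert_used = 2;

        // softmax over the gate logits
        const float max_l = *std::max_element(logits.begin(), logits.end());
        std::vector<float> probs(logits.size());
        float sum = 0.0f;
        for (size_t e = 0; e < logits.size(); ++e) {
            probs[e] = std::exp(logits[e] - max_l);
            sum += probs[e];
        }
        for (float & p : probs) { p /= sum; }

        // pick the n_expert_used most probable experts
        std::vector<int> idx(probs.size());
        std::iota(idx.begin(), idx.end(), 0);
        std::partial_sort(idx.begin(), idx.begin() + n_expert_used, idx.end(),
            [&](int a, int b) { return probs[a] > probs[b]; });
        idx.resize(n_expert_used);

        // renormalize the selected weights so they sum to 1 (the ggml_div by weights_sum above)
        float w_sum = 0.0f;
        for (int e : idx) { w_sum += probs[e]; }
        for (int e : idx) {
            std::printf("expert %d weight %.3f\n", e, probs[e] / w_sum);
            // the final FFN output is sum_e weight_e * SwiGLU_e(x), accumulated as moe_out above
        }
        return 0;
    }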
@@ -4229,10 +4419,6 @@ struct llm_build_context {
4229
4419
  struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
4230
4420
  cb(inp_pos, "inp_pos", -1);
4231
4421
 
4232
- // KQ_scale
4233
- struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
4234
- cb(KQ_scale, "KQ_scale", -1);
4235
-
4236
4422
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
4237
4423
  struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
4238
4424
  cb(KQ_mask, "KQ_mask", -1);
@@ -4289,9 +4475,9 @@ struct llm_build_context {
4289
4475
  // apply ALiBi for 13B model
4290
4476
  const float max_alibi_bias = model.type == MODEL_13B ? 8.0f : -1.0f;
4291
4477
 
4292
- cur = llm_build_kqv(ctx0, hparams, kv_self,
4478
+ cur = llm_build_kqv(ctx0, model, hparams, kv_self,
4293
4479
  model.layers[il].wo, NULL,
4294
- Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, cb, il);
4480
+ Qcur, KQ_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
4295
4481
  cb(cur, "kqv_out", il);
4296
4482
  }
4297
4483
 
@@ -4349,10 +4535,6 @@ struct llm_build_context {
4349
4535
  struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
4350
4536
  cb(inp_pos, "inp_pos", -1);
4351
4537
 
4352
- // KQ_scale
4353
- struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
4354
- cb(KQ_scale, "KQ_scale", -1);
4355
-
4356
4538
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
4357
4539
  struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
4358
4540
  cb(KQ_mask, "KQ_mask", -1);
@@ -4413,9 +4595,9 @@ struct llm_build_context {
4413
4595
 
4414
4596
  llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
4415
4597
 
4416
- cur = llm_build_kqv(ctx0, hparams, kv_self,
4598
+ cur = llm_build_kqv(ctx0, model, hparams, kv_self,
4417
4599
  model.layers[il].wo, NULL,
4418
- Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
4600
+ Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
4419
4601
  cb(cur, "kqv_out", il);
4420
4602
  }
4421
4603
 
@@ -4472,10 +4654,6 @@ struct llm_build_context {
4472
4654
  struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
4473
4655
  cb(inp_pos, "inp_pos", -1);
4474
4656
 
4475
- // KQ_scale
4476
- struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
4477
- cb(KQ_scale, "KQ_scale", -1);
4478
-
4479
4657
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
4480
4658
  struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
4481
4659
  cb(KQ_mask, "KQ_mask", -1);
@@ -4513,9 +4691,9 @@ struct llm_build_context {
4513
4691
 
4514
4692
  llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
4515
4693
 
4516
- cur = llm_build_kqv(ctx0, hparams, kv_self,
4694
+ cur = llm_build_kqv(ctx0, model, hparams, kv_self,
4517
4695
  model.layers[il].wo, model.layers[il].bo,
4518
- Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
4696
+ Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
4519
4697
  cb(cur, "kqv_out", il);
4520
4698
  }
4521
4699
 
@@ -4572,10 +4750,6 @@ struct llm_build_context {
4572
4750
  struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
4573
4751
  cb(inp_pos, "inp_pos", -1);
4574
4752
 
4575
- // KQ_scale
4576
- struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
4577
- cb(KQ_scale, "KQ_scale", -1);
4578
-
4579
4753
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
4580
4754
  struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
4581
4755
  cb(KQ_mask, "KQ_mask", -1);
@@ -4722,9 +4896,9 @@ struct llm_build_context {
4722
4896
  llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
4723
4897
 
4724
4898
  // TODO: not tested, could be broken
4725
- cur = llm_build_kqv(ctx0, hparams, kv_self,
4899
+ cur = llm_build_kqv(ctx0, model, hparams, kv_self,
4726
4900
  model.layers[il].wo, model.layers[il].bo,
4727
- Q, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
4901
+ Q, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
4728
4902
  cb(cur, "kqv_out", il);
4729
4903
  }
4730
4904
 
@@ -4778,10 +4952,6 @@ struct llm_build_context {
4778
4952
  inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
4779
4953
  cb(inpL, "inp_embd", -1);
4780
4954
 
4781
- // KQ_scale
4782
- struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
4783
- cb(KQ_scale, "KQ_scale", -1);
4784
-
4785
4955
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
4786
4956
  struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
4787
4957
  cb(KQ_mask, "KQ_mask", -1);
@@ -4813,9 +4983,9 @@ struct llm_build_context {
4813
4983
 
4814
4984
  llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
4815
4985
 
4816
- cur = llm_build_kqv(ctx0, hparams, kv_self,
4986
+ cur = llm_build_kqv(ctx0, model, hparams, kv_self,
4817
4987
  model.layers[il].wo, NULL,
4818
- Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, cb, il);
4988
+ Qcur, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
4819
4989
  cb(cur, "kqv_out", il);
4820
4990
  }
4821
4991
 
@@ -4869,10 +5039,6 @@ struct llm_build_context {
4869
5039
  inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
4870
5040
  cb(inpL, "inp_embd", -1);
4871
5041
 
4872
- // KQ_scale
4873
- struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
4874
- cb(KQ_scale, "KQ_scale", -1);
4875
-
4876
5042
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
4877
5043
  struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
4878
5044
  cb(KQ_mask, "KQ_mask", -1);
@@ -4910,9 +5076,9 @@ struct llm_build_context {
4910
5076
 
4911
5077
  llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
4912
5078
 
4913
- cur = llm_build_kqv(ctx0, hparams, kv_self,
5079
+ cur = llm_build_kqv(ctx0, model, hparams, kv_self,
4914
5080
  model.layers[il].wo, model.layers[il].bo,
4915
- Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, cb, il);
5081
+ Qcur, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
4916
5082
  cb(cur, "kqv_out", il);
4917
5083
  }
4918
5084
 
@@ -4963,10 +5129,6 @@ struct llm_build_context {
4963
5129
  inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
4964
5130
  cb(inpL, "inp_embd", -1);
4965
5131
 
4966
- // KQ_scale
4967
- struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
4968
- cb(KQ_scale, "KQ_scale", -1);
4969
-
4970
5132
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
4971
5133
  struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
4972
5134
  cb(KQ_mask, "KQ_mask", -1);
@@ -5004,9 +5166,9 @@ struct llm_build_context {
5004
5166
 
5005
5167
  llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
5006
5168
 
5007
- cur = llm_build_kqv(ctx0, hparams, kv_self,
5169
+ cur = llm_build_kqv(ctx0, model, hparams, kv_self,
5008
5170
  model.layers[il].wo, NULL,
5009
- Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, hparams.f_max_alibi_bias, cb, il);
5171
+ Qcur, KQ_mask, n_ctx, n_tokens, n_kv, hparams.f_max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5010
5172
  cb(cur, "kqv_out", il);
5011
5173
  }
5012
5174
 
@@ -5066,10 +5228,6 @@ struct llm_build_context {
5066
5228
  struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
5067
5229
  cb(inp_pos, "inp_pos", -1);
5068
5230
 
5069
- // KQ_scale
5070
- struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
5071
- cb(KQ_scale, "KQ_scale", -1);
5072
-
5073
5231
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5074
5232
  struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
5075
5233
  cb(KQ_mask, "KQ_mask", -1);
@@ -5098,40 +5256,152 @@ struct llm_build_context {
5098
5256
  struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
5099
5257
  cb(Kcur, "Kcur", il);
5100
5258
 
5101
- struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
5259
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
5260
+ cb(Vcur, "Vcur", il);
5261
+
5262
+ Qcur = ggml_rope_custom(
5263
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
5264
+ hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
5265
+ ext_factor, attn_factor, beta_fast, beta_slow
5266
+ );
5267
+ cb(Qcur, "Qcur", il);
5268
+
5269
+ Kcur = ggml_rope_custom(
5270
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
5271
+ hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
5272
+ ext_factor, attn_factor, beta_fast, beta_slow
5273
+ );
5274
+ cb(Kcur, "Kcur", il);
5275
+
5276
+ llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
5277
+
5278
+ cur = llm_build_kqv(ctx0, model, hparams, kv_self,
5279
+ model.layers[il].wo, NULL,
5280
+ Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5281
+ cb(cur, "kqv_out", il);
5282
+ }
5283
+
5284
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
5285
+ cb(ffn_inp, "ffn_inp", il);
5286
+
5287
+ // feed-forward network
5288
+ {
5289
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
5290
+ model.layers[il].ffn_norm,
5291
+ model.layers[il].ffn_norm_b,
5292
+ LLM_NORM, cb, il);
5293
+ cb(cur, "ffn_norm", il);
5294
+
5295
+ cur = llm_build_ffn(ctx0, cur,
5296
+ model.layers[il].ffn_up, NULL,
5297
+ model.layers[il].ffn_gate, NULL,
5298
+ model.layers[il].ffn_down, NULL,
5299
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
5300
+ cb(cur, "ffn_out", il);
5301
+ }
5302
+
5303
+ cur = ggml_add(ctx0, cur, ffn_inp);
5304
+ cb(cur, "l_out", il);
5305
+
5306
+ // input for next layer
5307
+ inpL = cur;
5308
+ }
5309
+
5310
+ cur = inpL;
5311
+
5312
+ cur = llm_build_norm(ctx0, cur, hparams,
5313
+ model.output_norm,
5314
+ model.output_norm_b,
5315
+ LLM_NORM, cb, -1);
5316
+ cb(cur, "result_norm", -1);
5317
+
5318
+ // lm_head
5319
+ cur = ggml_mul_mat(ctx0, model.output, cur);
5320
+ cb(cur, "result_output", -1);
5321
+
5322
+ ggml_build_forward_expand(gf, cur);
5323
+
5324
+ return gf;
5325
+ }
5326
+
5327
+ struct ggml_cgraph * build_qwen() {
5328
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
5329
+
5330
+ struct ggml_tensor * cur;
5331
+ struct ggml_tensor * inpL;
5332
+
5333
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
5334
+ cb(inpL, "inp_embd", -1);
5335
+
5336
+ // inp_pos - contains the positions
5337
+ struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
5338
+ cb(inp_pos, "inp_pos", -1);
5339
+
5340
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5341
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
5342
+ cb(KQ_mask, "KQ_mask", -1);
5343
+
5344
+ // shift the entire K-cache if needed
5345
+ if (do_rope_shift) {
5346
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb);
5347
+ }
5348
+
5349
+ for (int il = 0; il < n_layer; ++il) {
5350
+ struct ggml_tensor * inpSA = inpL;
5351
+
5352
+ cur = llm_build_norm(ctx0, inpL, hparams,
5353
+ model.layers[il].attn_norm, NULL,
5354
+ LLM_NORM_RMS, cb, il);
5355
+ cb(cur, "attn_norm", il);
5356
+
5357
+ // self-attention
5358
+ {
5359
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
5360
+ cb(cur, "wqkv", il);
5361
+
5362
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
5363
+ cb(cur, "bqkv", il);
5364
+
5365
+ struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
5366
+ struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
5367
+ struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*(n_embd)));
5368
+
5369
+ cb(Qcur, "Qcur", il);
5370
+ cb(Kcur, "Kcur", il);
5102
5371
  cb(Vcur, "Vcur", il);
5103
5372
 
5373
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
5374
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
5375
+
5376
+ // using mode = 2 for neox mode
5104
5377
  Qcur = ggml_rope_custom(
5105
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
5106
- hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
5107
- ext_factor, attn_factor, beta_fast, beta_slow
5378
+ ctx0, Qcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx,
5379
+ freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
5108
5380
  );
5109
5381
  cb(Qcur, "Qcur", il);
5110
5382
 
5111
5383
  Kcur = ggml_rope_custom(
5112
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
5113
- hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
5114
- ext_factor, attn_factor, beta_fast, beta_slow
5384
+ ctx0, Kcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx,
5385
+ freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
5115
5386
  );
5116
5387
  cb(Kcur, "Kcur", il);
5117
5388
 
5118
5389
  llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
5119
5390
 
5120
- cur = llm_build_kqv(ctx0, hparams, kv_self,
5391
+ cur = llm_build_kqv(ctx0, model, hparams, kv_self,
5121
5392
  model.layers[il].wo, NULL,
5122
- Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
5393
+ Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5123
5394
  cb(cur, "kqv_out", il);
5124
5395
  }
5125
5396
 
5126
5397
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
5127
5398
  cb(ffn_inp, "ffn_inp", il);
5128
5399
 
5129
- // feed-forward network
5400
+ // feed-forward forward
5130
5401
  {
5131
5402
  cur = llm_build_norm(ctx0, ffn_inp, hparams,
5132
- model.layers[il].ffn_norm,
5133
- model.layers[il].ffn_norm_b,
5134
- LLM_NORM, cb, il);
5403
+ model.layers[il].ffn_norm, NULL,
5404
+ LLM_NORM_RMS, cb, il);
5135
5405
  cb(cur, "ffn_norm", il);
5136
5406
 
5137
5407
  cur = llm_build_ffn(ctx0, cur,
@@ -5152,9 +5422,8 @@ struct llm_build_context {
5152
5422
  cur = inpL;
5153
5423
 
5154
5424
  cur = llm_build_norm(ctx0, cur, hparams,
5155
- model.output_norm,
5156
- model.output_norm_b,
5157
- LLM_NORM, cb, -1);
5425
+ model.output_norm, NULL,
5426
+ LLM_NORM_RMS, cb, -1);
5158
5427
  cb(cur, "result_norm", -1);
5159
5428
 
5160
5429
  // lm_head
@@ -5165,26 +5434,23 @@ struct llm_build_context {
5165
5434
 
5166
5435
  return gf;
5167
5436
  }
5168
-
5169
- struct ggml_cgraph * build_qwen() {
5437
+ struct ggml_cgraph * build_phi2() {
5170
5438
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
5171
5439
 
5172
5440
  struct ggml_tensor * cur;
5441
+ struct ggml_tensor * attn_norm_output;
5442
+ struct ggml_tensor * ffn_output;
5173
5443
  struct ggml_tensor * inpL;
5174
5444
 
5175
5445
  inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
5176
5446
  cb(inpL, "inp_embd", -1);
5177
5447
 
5178
5448
  // inp_pos - contains the positions
5179
- struct ggml_tensor * inp_pos= ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
5449
+ struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
5180
5450
  cb(inp_pos, "inp_pos", -1);
5181
5451
 
5182
- // KQ_scale
5183
- struct ggml_tensor * KQ_scale= ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
5184
- cb(KQ_scale, "KQ_scale", -1);
5185
-
5186
5452
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5187
- struct ggml_tensor * KQ_mask= ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
5453
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
5188
5454
  cb(KQ_mask, "KQ_mask", -1);
5189
5455
 
5190
5456
  // shift the entire K-cache if needed
@@ -5193,24 +5459,23 @@ struct llm_build_context {
5193
5459
  }
5194
5460
 
5195
5461
  for (int il = 0; il < n_layer; ++il) {
5196
- struct ggml_tensor * inpSA = inpL;
5197
-
5198
- cur = llm_build_norm(ctx0, inpL, hparams,
5199
- model.layers[il].attn_norm, NULL,
5200
- LLM_NORM_RMS, cb, il);
5201
- cb(cur, "attn_norm", il);
5462
+ attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
5463
+ model.layers[il].attn_norm,
5464
+ model.layers[il].attn_norm_b,
5465
+ LLM_NORM, cb, il);
5466
+ cb(attn_norm_output, "attn_norm", il);
5202
5467
 
5203
5468
  // self-attention
5204
5469
  {
5205
- cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
5470
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output);
5206
5471
  cb(cur, "wqkv", il);
5207
5472
 
5208
5473
  cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
5209
5474
  cb(cur, "bqkv", il);
5210
5475
 
5211
- struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
5212
- struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
5213
- struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*(n_embd)));
5476
+ struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
5477
+ struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
5478
+ struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
5214
5479
 
5215
5480
  cb(Qcur, "Qcur", il);
5216
5481
  cb(Kcur, "Kcur", il);
@@ -5219,61 +5484,60 @@ struct llm_build_context {
5219
5484
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
5220
5485
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
5221
5486
 
5222
- // using mode = 2 for neox mode
5223
5487
  Qcur = ggml_rope_custom(
5224
- ctx0, Qcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx,
5488
+ ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
5225
5489
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
5226
5490
  );
5227
5491
  cb(Qcur, "Qcur", il);
5228
5492
 
5493
+ // with phi2, we scale the Q to avoid precision issues
5494
+ // ref: https://github.com/ml-explore/mlx-examples/blob/08e862336ade809bc37d1035f94b359e7d1a5152/phi2/phi2.py#L64-L66
5495
+ Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head)));
5496
+ cb(Qcur, "Qcur", il);
5497
+
5229
5498
  Kcur = ggml_rope_custom(
5230
- ctx0, Kcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx,
5499
+ ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
5231
5500
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
5232
5501
  );
5233
5502
  cb(Kcur, "Kcur", il);
5234
5503
 
5235
5504
  llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
5236
5505
 
5237
- cur = llm_build_kqv(ctx0, hparams, kv_self,
5238
- model.layers[il].wo, NULL,
5239
- Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
5506
+ cur = llm_build_kqv(ctx0, model, hparams, kv_self,
5507
+ model.layers[il].wo, model.layers[il].bo,
5508
+ Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f, cb, il);
5240
5509
  cb(cur, "kqv_out", il);
5241
5510
  }
5242
5511
 
5243
- struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
5244
- cb(ffn_inp, "ffn_inp", il);
5245
-
5246
- // feed-forward forward
5512
+ // FF
5247
5513
  {
5248
- cur = llm_build_norm(ctx0, ffn_inp, hparams,
5249
- model.layers[il].ffn_norm, NULL,
5250
- LLM_NORM_RMS, cb, il);
5251
- cb(cur, "ffn_norm", il);
5252
-
5253
- cur = llm_build_ffn(ctx0, cur,
5254
- model.layers[il].ffn_up, NULL,
5255
- model.layers[il].ffn_gate, NULL,
5256
- model.layers[il].ffn_down, NULL,
5257
- LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
5258
- cb(cur, "ffn_out", il);
5514
+ ffn_output = llm_build_ffn(ctx0, attn_norm_output,
5515
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b,
5516
+ NULL, NULL,
5517
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b,
5518
+ LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
5519
+ cb(ffn_output, "ffn_out", il);
5259
5520
  }
5260
5521
 
5261
- cur = ggml_add(ctx0, cur, ffn_inp);
5522
+ cur = ggml_add(ctx0, cur, ffn_output);
5523
+ cb(cur, "l_out", il);
5524
+
5525
+ cur = ggml_add(ctx0, cur, inpL);
5262
5526
  cb(cur, "l_out", il);
5263
5527
 
5264
- // input for next layer
5265
5528
  inpL = cur;
5266
5529
  }
5267
5530
 
5268
- cur = inpL;
5269
-
5270
- cur = llm_build_norm(ctx0, cur, hparams,
5271
- model.output_norm, NULL,
5272
- LLM_NORM_RMS, cb, -1);
5531
+ cur = llm_build_norm(ctx0, inpL, hparams,
5532
+ model.output_norm,
5533
+ model.output_norm_b,
5534
+ LLM_NORM, cb, -1);
5273
5535
  cb(cur, "result_norm", -1);
5274
5536
 
5275
- // lm_head
5276
5537
  cur = ggml_mul_mat(ctx0, model.output, cur);
5538
+ cb(cur, "result_output_no_bias", -1);
5539
+
5540
+ cur = ggml_add(ctx0, cur, model.output_b);
5277
5541
  cb(cur, "result_output", -1);
5278
5542
 
5279
5543
  ggml_build_forward_expand(gf, cur);
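Editor's note: build_phi2 uses a parallel residual block: attention and the feed-forward path both read the same pre-norm activation attn_norm_output, and both outputs are added back to the raw layer input, unlike the sequential pre-norm blocks used by the LLaMA-style graphs above. Schematically:

    h_{l+1} = h_l + \mathrm{Attn}(\mathrm{LN}(h_l)) + \mathrm{FFN}(\mathrm{LN}(h_l))

with a GELU MLP for the FFN, Q pre-scaled by 1/\sqrt{d_{\mathrm{head}}} (so llm_build_kqv is called with kq_scale = 1.0f), and a final output bias added after the lm_head matmul.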
@@ -5293,7 +5557,7 @@ enum llm_offload_func_e {
5293
5557
  OFFLOAD_FUNC_FRC, // force offload
5294
5558
  OFFLOAD_FUNC_KQV,
5295
5559
  OFFLOAD_FUNC_NR,
5296
- OFFLOAD_FUNC_EMB,
5560
+ OFFLOAD_FUNC_EMB, // embeddings
5297
5561
  OFFLOAD_FUNC_OUT,
5298
5562
  };
5299
5563
 
@@ -5378,7 +5642,6 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
5378
5642
  { "pos_embd", OFFLOAD_FUNC_NR },
5379
5643
 
5380
5644
  { "inp_pos", OFFLOAD_FUNC_FRC }, // this is often used for KQ ops (e.g. rope)
5381
- { "KQ_scale", OFFLOAD_FUNC_FRC },
5382
5645
  { "KQ_mask", OFFLOAD_FUNC_FRC },
5383
5646
  { "K_shift", OFFLOAD_FUNC_FRC },
5384
5647
 
@@ -5445,9 +5708,24 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
5445
5708
  { "ffn_relu", OFFLOAD_FUNC },
5446
5709
  { "ffn_sqr(relu)", OFFLOAD_FUNC },
5447
5710
 
5711
+ { "ffn_moe_logits", OFFLOAD_FUNC },
5712
+ { "ffn_moe_probs", OFFLOAD_FUNC },
5713
+ { "ffn_moe_argsort", OFFLOAD_FUNC },
5714
+ { "ffn_moe_weights", OFFLOAD_FUNC },
5715
+ { "ffn_moe_weights_sum", OFFLOAD_FUNC },
5716
+ { "ffn_moe_weights_norm", OFFLOAD_FUNC },
5717
+ { "ffn_moe_weighted", OFFLOAD_FUNC },
5718
+ { "ffn_moe_up", OFFLOAD_FUNC },
5719
+ { "ffn_moe_gate", OFFLOAD_FUNC },
5720
+ { "ffn_moe_silu", OFFLOAD_FUNC },
5721
+ { "ffn_moe_gate_par", OFFLOAD_FUNC },
5722
+ { "ffn_moe_down", OFFLOAD_FUNC },
5723
+ { "ffn_moe_out", OFFLOAD_FUNC },
5724
+
5448
5725
  { "l_out", OFFLOAD_FUNC },
5449
5726
 
5450
5727
  { "result_norm", OFFLOAD_FUNC_EMB },
5728
+ { "result_output_no_bias", OFFLOAD_FUNC_EMB },
5451
5729
  { "result_output", OFFLOAD_FUNC_OUT },
5452
5730
  };
5453
5731
 
@@ -5465,11 +5743,10 @@ static struct ggml_cgraph * llama_build_graph(
5465
5743
  bool alloc_inp_tokens = false;
5466
5744
  bool alloc_inp_embd = false;
5467
5745
  bool alloc_inp_pos = false;
5468
- bool alloc_inp_KQ_scale = false;
5469
5746
  bool alloc_inp_KQ_mask = false;
5470
5747
  bool alloc_inp_K_shift = false;
5471
5748
 
5472
- #ifdef GGML_USE_CUBLAS
5749
+ #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
5473
5750
  const bool do_offload = true;
5474
5751
  #else
5475
5752
  const bool do_offload = true; // TODO: set to false after finishing refactoring
@@ -5497,7 +5774,7 @@ static struct ggml_cgraph * llama_build_graph(
5497
5774
  if (!ggml_allocr_is_measure(lctx.alloc) && batch.token) {
5498
5775
  const int64_t n_tokens = cur->ne[0];
5499
5776
 
5500
- memcpy(cur->data, batch.token, n_tokens*ggml_element_size(cur));
5777
+ ggml_backend_tensor_set(cur, batch.token, 0, n_tokens*ggml_element_size(cur));
5501
5778
  }
5502
5779
 
5503
5780
  alloc_inp_tokens = true;
@@ -5510,7 +5787,7 @@ static struct ggml_cgraph * llama_build_graph(
5510
5787
  const int64_t n_embd = cur->ne[0];
5511
5788
  const int64_t n_tokens = cur->ne[1];
5512
5789
 
5513
- memcpy(cur->data, batch.embd, n_tokens*n_embd*ggml_element_size(cur));
5790
+ ggml_backend_tensor_set(cur, batch.embd, 0, n_tokens*n_embd*ggml_element_size(cur));
5514
5791
  }
5515
5792
 
5516
5793
  alloc_inp_embd = true;
@@ -5522,27 +5799,13 @@ static struct ggml_cgraph * llama_build_graph(
5522
5799
  if (!ggml_allocr_is_measure(lctx.alloc) && batch.pos) {
5523
5800
  const int64_t n_tokens = cur->ne[0];
5524
5801
 
5525
- int32_t * data = (int32_t *) cur->data;
5526
-
5527
- for (int i = 0; i < n_tokens; ++i) {
5528
- data[i] = batch.pos[i];
5529
- }
5802
+ static_assert(std::is_same<llama_pos, int32_t>::value, "llama_pos must be int32_t");
5803
+ ggml_backend_tensor_set(cur, batch.pos, 0, n_tokens*ggml_element_size(cur));
5530
5804
  }
5531
5805
 
5532
5806
  alloc_inp_pos = true;
5533
5807
  }
5534
5808
 
5535
- if (!alloc_inp_KQ_scale && strcmp(name, "KQ_scale") == 0) {
5536
- ggml_allocr_alloc(lctx.alloc, cur);
5537
-
5538
- if (!ggml_allocr_is_measure(lctx.alloc)) {
5539
- const int64_t n_embd_head = model.hparams.n_embd_head();
5540
- ggml_set_f32(cur, 1.0f/sqrtf(float(n_embd_head)));
5541
- }
5542
-
5543
- alloc_inp_KQ_scale = true;
5544
- }
5545
-
5546
5809
  if (!alloc_inp_KQ_mask && strcmp(name, "KQ_mask") == 0) {
5547
5810
  ggml_allocr_alloc(lctx.alloc, cur);
5548
5811
 
@@ -5550,8 +5813,13 @@ static struct ggml_cgraph * llama_build_graph(
5550
5813
  const int64_t n_kv = cur->ne[0];
5551
5814
  const int64_t n_tokens = cur->ne[1];
5552
5815
 
5553
- float * data = (float *) cur->data;
5554
- memset(data, 0, ggml_nbytes(cur));
5816
+ float * data;
5817
+ if (ggml_backend_buffer_is_host(cur->buffer)) {
5818
+ data = (float *) cur->data;
5819
+ } else {
5820
+ lctx.buf_copy.resize(ggml_nbytes(cur));
5821
+ data = (float *) lctx.buf_copy.data();
5822
+ }
5555
5823
 
5556
5824
  for (int h = 0; h < 1; ++h) {
5557
5825
  for (int j = 0; j < n_tokens; ++j) {
@@ -5559,12 +5827,20 @@ static struct ggml_cgraph * llama_build_graph(
5559
5827
  const llama_seq_id seq_id = batch.seq_id[j][0];
5560
5828
 
5561
5829
  for (int i = 0; i < n_kv; ++i) {
5830
+ float f;
5562
5831
  if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
5563
- data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
5832
+ f = -INFINITY;
5833
+ } else {
5834
+ f = 0;
5564
5835
  }
5836
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
5565
5837
  }
5566
5838
  }
5567
5839
  }
5840
+
5841
+ if (data != cur->data) {
5842
+ ggml_backend_tensor_set(cur, data, 0, ggml_nbytes(cur));
5843
+ }
5568
5844
  }
5569
5845
 
5570
5846
  alloc_inp_KQ_mask = true;
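Editor's note: the KQ_mask is now written into a scratch vector (lctx.buf_copy) when the tensor lives in non-host memory and uploaded afterwards with ggml_backend_tensor_set; the values themselves are unchanged: 0 for cache cells that belong to the token's sequence and are not in its future, -INFINITY otherwise. A self-contained sketch of that mask for a toy single-sequence batch (illustrative, no llama types):

    #include <cmath>
    #include <cstdio>
    #include <vector>

    // sketch: causal mask for n_tokens new tokens appended after n_past cached ones
    int main() {
        const int n_past = 3, n_tokens = 2;
        const int n_kv = n_past + n_tokens;

        std::vector<float> mask(n_kv * n_tokens);
        for (int j = 0; j < n_tokens; ++j) {          // batch token j has position n_past + j
            const int pos = n_past + j;
            for (int i = 0; i < n_kv; ++i) {          // cache cell i holds position i here
                mask[j*n_kv + i] = (i > pos) ? -INFINITY : 0.0f;
            }
        }
        for (int j = 0; j < n_tokens; ++j) {
            for (int i = 0; i < n_kv; ++i) {
                std::printf("%5s ", std::isinf(mask[j*n_kv + i]) ? "-inf" : "0");
            }
            std::printf("\n");
        }
        return 0;
    }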
@@ -5576,11 +5852,21 @@ static struct ggml_cgraph * llama_build_graph(
5576
5852
  if (!ggml_allocr_is_measure(lctx.alloc)) {
5577
5853
  const int64_t n_ctx = cur->ne[0];
5578
5854
 
5579
- int32_t * data = (int32_t *) cur->data;
5855
+ int32_t * data;
5856
+ if (ggml_backend_buffer_is_host(cur->buffer)) {
5857
+ data = (int32_t *) cur->data;
5858
+ } else {
5859
+ lctx.buf_copy.resize(ggml_nbytes(cur));
5860
+ data = (int32_t *) lctx.buf_copy.data();
5861
+ }
5580
5862
 
5581
5863
  for (int i = 0; i < n_ctx; ++i) {
5582
5864
  data[i] = lctx.kv_self.cells[i].delta;
5583
5865
  }
5866
+
5867
+ if (data != cur->data) {
5868
+ ggml_backend_tensor_set(cur, data, 0, ggml_nbytes(cur));
5869
+ }
5584
5870
  }
5585
5871
 
5586
5872
  alloc_inp_K_shift = true;
@@ -5617,7 +5903,7 @@ static struct ggml_cgraph * llama_build_graph(
5617
5903
  static const std::unordered_map<llm_offload_func_e, std::string, std::hash<int>> k_offload_func_name = {
5618
5904
  { OFFLOAD_FUNC_NOP, "CPU" },
5619
5905
  { OFFLOAD_FUNC_OUT, "CPU" },
5620
- #ifdef GGML_USE_CUBLAS
5906
+ #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
5621
5907
  { OFFLOAD_FUNC, "GPU (CUDA)" },
5622
5908
  { OFFLOAD_FUNC_FRC, "GPU (CUDA) FRC" },
5623
5909
  { OFFLOAD_FUNC_KQV, "GPU (CUDA) KQV" },
@@ -5690,7 +5976,7 @@ static struct ggml_cgraph * llama_build_graph(
5690
5976
  offload_func_t func = ggml_offload_nop;
5691
5977
 
5692
5978
  // this is needed for compatibility with Metal for example
5693
- #ifdef GGML_USE_CUBLAS
5979
+ #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
5694
5980
  static offload_func_t ggml_offload_gpu = ggml_cuda_assign_buffers_no_alloc;
5695
5981
  #else
5696
5982
  static offload_func_t ggml_offload_gpu = ggml_offload_nop;
@@ -5764,6 +6050,10 @@ static struct ggml_cgraph * llama_build_graph(
5764
6050
  {
5765
6051
  result = llm.build_qwen();
5766
6052
  } break;
6053
+ case LLM_ARCH_PHI2:
6054
+ {
6055
+ result = llm.build_phi2();
6056
+ } break;
5767
6057
  default:
5768
6058
  GGML_ASSERT(false);
5769
6059
  }
@@ -5841,7 +6131,7 @@ static int llama_decode_internal(
5841
6131
  const int64_t n_embd = hparams.n_embd;
5842
6132
  const int64_t n_vocab = hparams.n_vocab;
5843
6133
 
5844
- // helpers for smoother batch API transistion
6134
+ // helpers for smoother batch API transition
5845
6135
  // after deprecating the llama_eval calls, these will be removed
5846
6136
  std::vector<llama_pos> pos;
5847
6137
 
@@ -5897,18 +6187,23 @@ static int llama_decode_internal(
5897
6187
 
5898
6188
  ggml_allocr_alloc_graph(lctx.alloc, gf);
5899
6189
 
5900
- struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
5901
- struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
5902
-
5903
- GGML_ASSERT(strcmp(res->name, "result_output") == 0);
5904
- GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
6190
+ // the output is always the last tensor in the graph
6191
+ struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
6192
+ GGML_ASSERT(strcmp(res->name, "result_output") == 0);
5905
6193
 
6194
+ // the embeddings could be the second to last tensor, or the third to last tensor
6195
+ struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
6196
+ if (strcmp(embeddings->name, "result_norm") != 0) {
6197
+ embeddings = gf->nodes[gf->n_nodes - 3];
6198
+ GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
6199
+ }
5906
6200
 
5907
- #ifdef GGML_USE_CUBLAS
6201
+ #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
6202
+ char * buf_alloc_base = (char *)ggml_backend_buffer_get_base(lctx.buf_alloc);
5908
6203
  for (int i = 0; i < gf->n_leafs; i++) {
5909
6204
  ggml_tensor * node = gf->leafs[i];
5910
6205
  if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) {
5911
- ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) lctx.buf_alloc.data);
6206
+ ggml_cuda_assign_scratch_offset(node, (char *)node->data - buf_alloc_base);
5912
6207
  ggml_cuda_copy_to_device(node);
5913
6208
  }
5914
6209
  }
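Editor's note: because build_phi2 appends a bias add after result_output_no_bias, "result_norm" is no longer guaranteed to be the second-to-last graph node, so the code above probes the last two candidates. If future architectures push more trailing nodes, one option would be a short backward scan by name; a sketch of that idea (not what the code does, just an alternative):

    #include <cstring>
    #include "ggml.h"

    // sketch: locate "result_norm" by scanning the graph tail (bounded, illustrative)
    static struct ggml_tensor * find_result_norm(struct ggml_cgraph * gf) {
        for (int i = gf->n_nodes - 1; i >= 0 && i >= gf->n_nodes - 4; --i) {
            if (strcmp(ggml_get_name(gf->nodes[i]), "result_norm") == 0) {
                return gf->nodes[i];
            }
        }
        return nullptr;
    }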
@@ -5916,7 +6211,7 @@ static int llama_decode_internal(
5916
6211
  for (int i = 0; i < gf->n_nodes; i++) {
5917
6212
  ggml_tensor * node = gf->nodes[i];
5918
6213
  if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) {
5919
- ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) lctx.buf_alloc.data);
6214
+ ggml_cuda_assign_scratch_offset(node, (char *)node->data - buf_alloc_base);
5920
6215
  }
5921
6216
  }
5922
6217
 
@@ -5943,23 +6238,23 @@ static int llama_decode_internal(
5943
6238
  n_threads = 1;
5944
6239
  }
5945
6240
 
5946
- #if GGML_USE_MPI
6241
+ #ifdef GGML_USE_MPI
5947
6242
  const int64_t n_layer = hparams.n_layer;
5948
6243
  ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
5949
6244
  #endif
5950
6245
 
5951
6246
  #ifdef GGML_USE_METAL
5952
- if (lctx.ctx_metal) {
5953
- ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
5954
- ggml_metal_graph_compute(lctx.ctx_metal, gf);
5955
- } else {
5956
- ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
6247
+ if (ggml_backend_is_metal(lctx.backend)) {
6248
+ ggml_backend_metal_set_n_cb(lctx.backend, n_threads);
5957
6249
  }
5958
- #else
5959
- ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
5960
6250
  #endif
5961
6251
 
5962
- #if GGML_USE_MPI
6252
+ if (ggml_backend_is_cpu(lctx.backend)) {
6253
+ ggml_backend_cpu_set_n_threads(lctx.backend, n_threads);
6254
+ }
6255
+ ggml_backend_graph_compute(lctx.backend, gf);
6256
+
6257
+ #ifdef GGML_USE_MPI
5963
6258
  ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
5964
6259
  #endif
5965
6260
 
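Editor's note: graph execution now goes through a single ggml_backend_graph_compute call; backend-specific tuning (CPU thread count, Metal command-buffer count) is applied beforehand via the ggml_backend_is_* predicates instead of keeping separate compute paths. A minimal sketch of the dispatch pattern, using only the calls visible above:

    #include "ggml.h"
    #include "ggml-backend.h"
    #ifdef GGML_USE_METAL
    #include "ggml-metal.h"
    #endif

    // sketch: per-backend knobs first, then one unified compute call
    static void compute_graph(ggml_backend_t backend, struct ggml_cgraph * gf, int n_threads) {
        if (ggml_backend_is_cpu(backend)) {
            ggml_backend_cpu_set_n_threads(backend, n_threads);
        }
    #ifdef GGML_USE_METAL
        if (ggml_backend_is_metal(backend)) {
            ggml_backend_metal_set_n_cb(backend, n_threads);
        }
    #endif
        ggml_backend_graph_compute(backend, gf);
    }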
@@ -5997,20 +6292,37 @@ static int llama_decode_internal(
5997
6292
  {
5998
6293
  auto & logits_out = lctx.logits;
5999
6294
 
6295
+ #ifndef NDEBUG
6296
+ auto & logits_valid = lctx.logits_valid;
6297
+ logits_valid.clear();
6298
+ logits_valid.resize(n_tokens);
6299
+
6300
+ logits_out.clear();
6301
+ #endif
6302
+
6000
6303
  if (batch.logits) {
6001
6304
  logits_out.resize(n_vocab * n_tokens);
6002
6305
  for (uint32_t i = 0; i < n_tokens; i++) {
6003
6306
  if (batch.logits[i] == 0) {
6004
6307
  continue;
6005
6308
  }
6006
- memcpy(logits_out.data() + (n_vocab*i), (float *) ggml_get_data(res) + (n_vocab*i), sizeof(float)*n_vocab);
6309
+ ggml_backend_tensor_get(res, logits_out.data() + (n_vocab*i), (n_vocab*i)*sizeof(float), n_vocab*sizeof(float));
6310
+ #ifndef NDEBUG
6311
+ logits_valid[i] = true;
6312
+ #endif
6007
6313
  }
6008
6314
  } else if (lctx.logits_all) {
6009
6315
  logits_out.resize(n_vocab * n_tokens);
6010
- memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*n_tokens);
6316
+ ggml_backend_tensor_get(res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float));
6317
+ #ifndef NDEBUG
6318
+ std::fill(logits_valid.begin(), logits_valid.end(), true);
6319
+ #endif
6011
6320
  } else {
6012
6321
  logits_out.resize(n_vocab);
6013
- memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(n_tokens - 1)), sizeof(float)*n_vocab);
6322
+ ggml_backend_tensor_get(res, logits_out.data(), (n_vocab*(n_tokens - 1))*sizeof(float), n_vocab*sizeof(float));
6323
+ #ifndef NDEBUG
6324
+ logits_valid[0] = true;
6325
+ #endif
6014
6326
  }
6015
6327
  }
6016
6328
 
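Editor's note: logits are copied out with ggml_backend_tensor_get, which takes a byte offset and size, so the read works whether result_output lives in host or device memory; row i of the [n_vocab, n_tokens] output starts at byte i*n_vocab*sizeof(float). A small usage sketch (assumes res is the graph's output tensor and n_vocab its row length):

    #include <vector>
    #include "ggml.h"
    #include "ggml-backend.h"

    // sketch: read the logits row for batch position i out of a possibly non-host tensor
    static std::vector<float> read_logits_row(const struct ggml_tensor * res, int64_t n_vocab, int64_t i) {
        std::vector<float> row(n_vocab);
        ggml_backend_tensor_get(res, row.data(),
            /*offset =*/ (size_t) (i*n_vocab)*sizeof(float),
            /*size   =*/ (size_t)  n_vocab   *sizeof(float));
        return row;
    }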
@@ -6019,7 +6331,7 @@ static int llama_decode_internal(
6019
6331
  auto & embedding_out = lctx.embedding;
6020
6332
 
6021
6333
  embedding_out.resize(n_embd);
6022
- memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(n_tokens - 1)), sizeof(float)*n_embd);
6334
+ ggml_backend_tensor_get(embeddings, embedding_out.data(), (n_embd*(n_tokens - 1))*sizeof(float), n_embd*sizeof(float));
6023
6335
  }
6024
6336
 
6025
6337
  // measure the performance only for the single-token evals
@@ -6620,12 +6932,12 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
6620
6932
 
6621
6933
  // loop over the text
6622
6934
  while (true) {
6623
- // find the first occurence of a given special token in this fragment
6935
+ // find the first occurrence of a given special token in this fragment
6624
6936
  // passing offset argument only limit the "search area" but match coordinates
6625
6937
  // are still relative to the source full raw_text
6626
6938
  auto match = raw_text->find(special_token, raw_text_base_offset);
6627
6939
 
6628
- // no occurences found, stop processing this fragment for a given special token
6940
+ // no occurrences found, stop processing this fragment for a given special token
6629
6941
  if (match == std::string::npos) break;
6630
6942
 
6631
6943
  // check if match is within bounds of offset <-> length
@@ -7498,7 +7810,7 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c

  for (size_t i = 0; i < candidates->size; ++i) {
  const llama_token id = candidates->data[i].id;
- const std::string & piece = ctx->model.vocab.id_to_token[id].text;
+ const std::string piece = llama_token_to_piece(ctx, id);
  if (id == eos) {
  if (!allow_eos) {
  candidates->data[i].logit = -INFINITY;
@@ -7710,7 +8022,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
  GGML_ASSERT(false);
  }

- const std::string & piece = ctx->model.vocab.id_to_token[token].text;
+ const std::string piece = llama_token_to_piece(ctx, token);

  // Note terminating 0 in decoded string
  const auto decoded = decode_utf8(piece, grammar->partial_utf8);
@@ -7824,7 +8136,7 @@ struct llama_beam_search_data {
  }

  // Min-heaps are used to efficiently collect the top-k elements (k=n_beams).
- // The repetative patterns below reflect the 2 stages of heaps:
+ // The repetitive patterns below reflect the 2 stages of heaps:
  // * Gather elements until the vector is full, then call std::make_heap() on it.
  // * If the heap is full and a new element is found that should be included, pop the
  // least element to the back(), replace it with the new, then push it into the heap.
@@ -7977,12 +8289,6 @@ void llama_beam_search(llama_context * ctx,
  // quantization
  //

- template <typename T>
- struct no_init {
- T value;
- no_init() { /* do nothing */ }
- };
-
  struct quantize_state_internal {
  const llama_model & model;
  const llama_model_quantize_params * params;
@@ -8062,11 +8368,9 @@ static void llama_convert_tensor_internal(
  workers.clear();
  }

- static ggml_type get_k_quant_type(
- quantize_state_internal & qs,
- ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype
- ) {
+ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
  const std::string name = ggml_get_name(tensor);
+
  // TODO: avoid hardcoded tensor names - use the TN_* constants
  const llm_arch arch = qs.model.arch;
  const auto tn = LLM_TN(arch);
@@ -8100,7 +8404,18 @@ static ggml_type get_k_quant_type(
  // nearly negligible increase in model size by quantizing this tensor with more bits:
  if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
  }
+ if (qs.model.hparams.n_expert == 8) {
+ // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
+ // TODO: explore better strategies
+ new_type = GGML_TYPE_Q8_0;
+ }
  ++qs.i_attention_wv;
+ } else if (name.find("attn_k.weight") != std::string::npos) {
+ if (qs.model.hparams.n_expert == 8) {
+ // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
+ // TODO: explore better strategies
+ new_type = GGML_TYPE_Q8_0;
+ }
  } else if (name.find("ffn_down.weight") != std::string::npos) {
  if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
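The expert-count branch added above keeps attn_v.weight and attn_k.weight at Q8_0 when the model has 8 experts, at a cost of roughly 128 MB. Read in isolation, the rule amounts to the following paraphrase (not a separate API, just the same condition lifted out):

    #include <string>
    #include "ggml.h"

    // Paraphrase of the override above: keep the K/V projection weights at Q8_0
    // for the 8-expert (Mixtral-style) model, where the size cost is small.
    static ggml_type maybe_bump_for_experts(const std::string & name, int n_expert, ggml_type new_type) {
        const bool is_kv_proj = name.find("attn_v.weight") != std::string::npos ||
                                name.find("attn_k.weight") != std::string::npos;
        return (is_kv_proj && n_expert == 8) ? GGML_TYPE_Q8_0 : new_type;
    }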
@@ -8216,9 +8531,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  #endif

  llama_model_loader ml(fname_inp, use_mmap, NULL);
- if (ml.use_mmap) {
- ml.mapping.reset(new llama_mmap(&ml.file, /* prefetch */ 0, ggml_is_numa()));
- }
+ ml.init_mapping(false); // no prefetching?

  llama_model model;
  llm_load_arch(ml, model);
@@ -8309,10 +8622,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?

  // quantize only 2D tensors
- quantize &= (tensor->n_dims == 2);
+ quantize &= (ggml_n_dims(tensor) == 2);
  quantize &= params->quantize_output_tensor || name != "output.weight";
  quantize &= !params->only_copy;

+ // do not quantize expert gating tensors
+ quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
+
  enum ggml_type new_type;
  void * new_data;
  size_t new_size;
@@ -8461,74 +8777,63 @@ static int llama_apply_lora_from_file_internal(

  const int64_t t_start_lora_us = ggml_time_us();

- auto fin = std::ifstream(path_lora, std::ios::binary);
- if (!fin) {
- LLAMA_LOG_ERROR("%s: failed to open '%s'\n", __func__, path_lora);
- return 1;
- }
+ llama_file fin(path_lora, "rb");

  // verify magic and version
  {
- uint32_t magic;
- fin.read((char *) &magic, sizeof(magic));
- uint32_t format_version;
- fin.read((char *) &format_version, sizeof(format_version));
+ uint32_t magic = fin.read_u32();
+ if (magic != LLAMA_FILE_MAGIC_GGLA) {
+ LLAMA_LOG_ERROR("%s: bad file magic\n", __func__);
+ return 1;
+ }

+ uint32_t format_version = fin.read_u32();
  if (format_version != 1) {
  LLAMA_LOG_ERROR("%s: unsupported file version\n", __func__ );
  return 1;
  }
  }

- int32_t lora_r;
- int32_t lora_alpha;
- fin.read((char *) &lora_r, sizeof(lora_r));
- fin.read((char *) &lora_alpha, sizeof(lora_alpha));
+ int32_t lora_r = fin.read_u32();
+ int32_t lora_alpha = fin.read_u32();
  float scaling = scale * (float)lora_alpha / (float)lora_r;

  LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);

+ // create a name -> tensor map of the model to accelerate lookups
+ // find the max tensor size to estimate the required temporary buffer size
+ size_t max_tensor_size = 0;
+ std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
+ for (const auto & kv : model.tensors_by_name) {
+ model_tensors.insert(kv);
+ size_t f32_size = ggml_nelements(kv.second) * sizeof(float);
+ max_tensor_size = std::max(max_tensor_size, f32_size);
+ }
+
  // create a temporary ggml context to store the lora tensors
- // todo: calculate size from biggest possible tensor
- std::vector<uint8_t> lora_buf(1024ull * 1024ull * 1024ull);
+ // TODO: use ggml-alloc
+ size_t lora_ctx_size = max_tensor_size * 3;
+ LLAMA_LOG_INFO("%s: allocating %.f MB for lora temporary buffer\n", __func__, lora_ctx_size / 1024.0 / 1024.0);
+ std::vector<uint8_t> lora_buf(lora_ctx_size);
+
  struct ggml_init_params params;
  params.mem_size = lora_buf.size();
  params.mem_buffer = lora_buf.data();
  params.no_alloc = false;

- ggml_context * lora_ctx = ggml_init(params);
- std::unordered_map<std::string, struct ggml_tensor *> lora_tensors;
+ using unique_context = std::unique_ptr<ggml_context, decltype(&ggml_free)>;

- // create a name -> tensor map of the model to accelerate lookups
- std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
- for (const auto & kv : model.tensors_by_name) {
- model_tensors.insert(kv);
- }
+ unique_context lora_ctx(nullptr, ggml_free);
+ lora_ctx.reset(ggml_init(params));
+ std::unordered_map<std::string, struct ggml_tensor *> lora_tensors;

  // load base model
  std::unique_ptr<llama_model_loader> ml;
- ggml_context * base_ctx = NULL;
- std::vector<uint8_t> base_buf;
- if (path_base_model) {
- LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
- ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ NULL));
-
- size_t ctx_size;
- size_t mmapped_size;
- ml->calc_sizes(ctx_size, mmapped_size);
- base_buf.resize(ctx_size);
-
- ggml_init_params base_params;
- base_params.mem_size = base_buf.size();
- base_params.mem_buffer = base_buf.data();
- base_params.no_alloc = ml->use_mmap;

- base_ctx = ggml_init(base_params);
-
- // maybe this should in llama_model_loader
- if (ml->use_mmap) {
- ml->mapping.reset(new llama_mmap(&ml->file, /* prefetch */ 0, ggml_is_numa()));
- }
+ if (path_base_model) {
+ LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
+ ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ nullptr));
+ ml->init_mapping(false); // no prefetching
  }

  // read tensors and apply
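The adapter is now read with the internal llama_file helper (read_u32, read_raw, tell, seek) instead of std::ifstream, and the temporary context is sized from the largest model tensor rather than a fixed 1 GB. As a rough standalone illustration of the header layout implied by the reads above, using plain stdio; the magic value mirrors LLAMA_FILE_MAGIC_GGLA, everything else is an assumption:

    #include <cstdio>
    #include <cstdint>

    // Sketch: read the GGLA LoRA header fields in the order used above
    // (magic, format version, r, alpha). Error handling is minimal.
    static bool read_lora_header(const char * path, int32_t & lora_r, int32_t & lora_alpha) {
        FILE * f = std::fopen(path, "rb");
        if (!f) return false;
        uint32_t magic = 0, version = 0;
        bool ok = std::fread(&magic,      sizeof(magic),      1, f) == 1 &&
                  std::fread(&version,    sizeof(version),    1, f) == 1 &&
                  std::fread(&lora_r,     sizeof(lora_r),     1, f) == 1 &&
                  std::fread(&lora_alpha, sizeof(lora_alpha), 1, f) == 1;
        ok = ok && magic == 0x67676C61u /* LLAMA_FILE_MAGIC_GGLA */ && version == 1;
        std::fclose(f);
        return ok;
    }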
@@ -8538,27 +8843,35 @@ static int llama_apply_lora_from_file_internal(
  std::vector<uint8_t> work_buffer;

  while (true) {
+ if (fin.tell() == fin.size) {
+ // eof
+ break;
+ }
+
  int32_t n_dims;
- int32_t length;
+ int32_t name_len;
  int32_t ftype;

- fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
- fin.read(reinterpret_cast<char *>(&length), sizeof(length));
- fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
- if (fin.eof()) {
- break;
+ fin.read_raw(&n_dims, sizeof(n_dims));
+ fin.read_raw(&name_len, sizeof(name_len));
+ fin.read_raw(&ftype, sizeof(ftype));
+
+ if (n_dims != 1 && n_dims != 2) {
+ LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims);
+ return 1;
  }

  int32_t ne[2] = { 1, 1 };
  for (int i = 0; i < n_dims; ++i) {
- fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
+ fin.read_raw(&ne[i], sizeof(ne[i]));
  }

  std::string name;
  {
+ GGML_ASSERT(name_len <= 1024);
  char buf[1024];
- fin.read(buf, length);
- name = std::string(buf, length);
+ fin.read_raw(buf, name_len);
+ name = std::string(buf, name_len);
  }

  // check for lora suffix and get the type of tensor
@@ -8572,7 +8885,7 @@ static int llama_apply_lora_from_file_internal(
  std::string lora_type = name.substr(pos + lora_suffix.length());
  std::string base_name = name;
  base_name.erase(pos);
- // LLAMA_LOG_INFO("%s: %s => %s (lora type %s) \n", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());
+ // LLAMA_LOG_INFO("%s: %s => %s (lora type %s) \n", __func__, name.c_str(), base_name.c_str(), lora_type.c_str());

  if (model_tensors.find(base_name) == model_tensors.end()) {
  LLAMA_LOG_ERROR("%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
@@ -8591,22 +8904,15 @@ static int llama_apply_lora_from_file_internal(
  return false;
  }
  }
- ggml_tensor * lora_tensor;
- if (n_dims == 2) {
- lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]);
- }
- else {
- LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims);
- return 1;
- }
- ggml_set_name(lora_tensor, "lora_tensor");
+ ggml_tensor * lora_tensor = ggml_new_tensor_2d(lora_ctx.get(), wtype, ne[0], ne[1]);
+ ggml_set_name(lora_tensor, name.c_str());

  // load tensor data
- size_t offset = fin.tellg();
+ size_t offset = fin.tell();
  size_t tensor_data_size = ggml_nbytes(lora_tensor);
  offset = (offset + 31) & -32;
- fin.seekg(offset);
- fin.read((char*)lora_tensor->data, tensor_data_size);
+ fin.seek(offset, SEEK_SET);
+ fin.read_raw(lora_tensor->data, tensor_data_size);

  lora_tensors[name] = lora_tensor;

@@ -8619,7 +8925,7 @@ static int llama_apply_lora_from_file_internal(
  offload_func_t offload_func = ggml_offload_nop;
  offload_func_t offload_func_force_inplace = ggml_offload_nop;

- #ifdef GGML_USE_CUBLAS
+ #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
  if (dest_t->backend == GGML_BACKEND_GPU || dest_t->backend == GGML_BACKEND_GPU_SPLIT) {
  if (dest_t->type != GGML_TYPE_F16) {
  throw std::runtime_error(format(
@@ -8636,13 +8942,11 @@ static int llama_apply_lora_from_file_internal(

  // load from base model
  if (gguf_find_tensor(ctx_gguf, base_name.c_str()) < 0) {
- // TODO: throw
  LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
  return 1;
  }

- // TODO: not tested!! maybe not working!
- base_t = ml->create_tensor(base_ctx, base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU);
+ base_t = ml->get_tensor_meta(base_name.c_str());
  ml->load_data_for(base_t);
  } else {
  base_t = dest_t;
@@ -8671,43 +8975,42 @@ static int llama_apply_lora_from_file_internal(
  }

  // w = w + BA*s
- ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
+ ggml_tensor * BA = ggml_mul_mat(lora_ctx.get(), loraA, loraB);
  offload_func(BA);
  ggml_set_name(BA, "BA");

  if (scaling != 1.0f) {
- ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
- ggml_set_name(scale_tensor, "scale_tensor");
-
- BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor);
+ BA = ggml_scale_inplace(lora_ctx.get(), BA, scaling);
  offload_func(BA);
  ggml_set_name(BA, "BA_scaled");
  }

  ggml_tensor * r;
  if (base_t == dest_t) {
- r = ggml_add_inplace(lora_ctx, dest_t, BA);
+ r = ggml_add_inplace(lora_ctx.get(), dest_t, BA);
  offload_func_force_inplace(r);
  ggml_set_name(r, "r_add_inplace");
  }
  else {
- r = ggml_add(lora_ctx, base_t, BA);
+ r = ggml_add(lora_ctx.get(), base_t, BA);
  offload_func(r);
  ggml_set_name(r, "r_add");

- r = ggml_cpy(lora_ctx, r, dest_t);
+ r = ggml_cpy(lora_ctx.get(), r, dest_t);
  offload_func(r);
  ggml_set_name(r, "r_cpy");
  }

- struct ggml_cgraph * gf = ggml_new_graph(lora_ctx);
+ struct ggml_cgraph * gf = ggml_new_graph(lora_ctx.get());
  ggml_build_forward_expand(gf, r);

  ggml_graph_compute_helper(work_buffer, gf, n_threads);

+ // the tensors in the adapter must be sorted such that loraA and loraB of the same tensor are next to each other
+ GGML_ASSERT(lora_tensors.size() == 2);
+
  // we won't need these tensors again, reset the context to save memory
- ggml_free(lora_ctx);
- lora_ctx = ggml_init(params);
+ lora_ctx.reset(ggml_init(params));
  lora_tensors.clear();

  n_tensors++;
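Conceptually the merge is still W' = W + s * (B * A) with s = scale * alpha / r; what changes above is that ggml_scale_inplace now takes the scale as a plain float and the temporary context is owned by a unique_ptr. A reduced sketch of building that graph, with the context and the four tensors assumed to be prepared by the caller:

    #include "ggml.h"

    // Sketch: build the LoRA merge graph W' = W + s * (B*A), mirroring the hunk above.
    static struct ggml_cgraph * build_lora_merge(struct ggml_context * ctx,
                                                 struct ggml_tensor * loraA, struct ggml_tensor * loraB,
                                                 struct ggml_tensor * base_w, struct ggml_tensor * dest_w,
                                                 float scaling) {
        struct ggml_tensor * BA = ggml_mul_mat(ctx, loraA, loraB); // low-rank update B*A
        if (scaling != 1.0f) {
            BA = ggml_scale_inplace(ctx, BA, scaling);             // scale is now a plain float
        }
        struct ggml_tensor * r = ggml_add(ctx, base_w, BA);        // W + s*BA
        r = ggml_cpy(ctx, r, dest_w);                              // write the result back into the model weight
        struct ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, r);
        return gf;
    }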
@@ -8717,12 +9020,6 @@ static int llama_apply_lora_from_file_internal(
  }
  }

- // TODO: this should be in a destructor, it will leak on failure
- ggml_free(lora_ctx);
- if (base_ctx) {
- ggml_free(base_ctx);
- }
-
  const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
  LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0);

@@ -8852,11 +9149,18 @@ struct llama_model * llama_load_model_from_file(
  LLAMA_LOG_INFO("\n");
  }
  }
+ return true;
  };
  }

- if (!llama_model_load(path_model, *model, params)) {
- LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
+ int status = llama_model_load(path_model, *model, params);
+ GGML_ASSERT(status <= 0);
+ if (status < 0) {
+ if (status == -1) {
+ LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
+ } else if (status == -2) {
+ LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
+ }
  delete model;
  return nullptr;
  }
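The `return true;` added to the default progress callback and the new -1/-2 status codes reflect that the progress callback can now cancel a load: returning false aborts it, and llama_load_model_from_file reports that as a cancelled load rather than an error. A hedged usage sketch with a hypothetical user callback:

    #include <cstdio>
    #include "llama.h"

    // Returning false from the progress callback cancels the load; the loader
    // then returns nullptr and logs "cancelled model load".
    static bool my_progress(float progress, void * user_data) {
        (void) user_data;
        std::fprintf(stderr, "\rloading: %3.0f%%", progress * 100.0f);
        return true; // return false here to cancel loading
    }

    // usage sketch (model path is hypothetical):
    //   llama_model_params mparams = llama_model_default_params();
    //   mparams.progress_callback = my_progress;
    //   llama_model * model = llama_load_model_from_file("model.gguf", mparams);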
@@ -8931,7 +9235,39 @@ struct llama_context * llama_new_context_with_model(

  // reserve memory for context buffers
  if (!hparams.vocab_only) {
- if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, type_k, type_v, cparams.n_ctx, model->n_gpu_layers, cparams.offload_kqv)) {
+ // initialize backend
+ #ifdef GGML_USE_METAL
+ if (model->n_gpu_layers > 0) {
+ ctx->backend = ggml_backend_metal_init();
+ if (ctx->backend == nullptr) {
+ LLAMA_LOG_ERROR("%s: failed to initialize Metal backend\n", __func__);
+ }
+ }
+ #elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST)
+ // for testing only
+ if (model->n_gpu_layers > 0) {
+ ctx->backend = ggml_backend_cuda_init(0);
+ if (ctx->backend == nullptr) {
+ LLAMA_LOG_ERROR("%s: failed to initialize CUDA backend\n", __func__);
+ }
+ }
+ #endif
+
+ if (ctx->backend == nullptr && ggml_backend_buffer_is_host(model->buf)) {
+ ctx->backend = ggml_backend_cpu_init();
+ if (ctx->backend == nullptr) {
+ LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
+ }
+ }
+
+ if (ctx->backend == nullptr) {
+ LLAMA_LOG_ERROR("%s: failed to initialize a backend\n", __func__);
+ delete ctx;
+ return nullptr;
+ }
+
+ if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, type_k, type_v,
+ cparams.n_ctx, model->n_gpu_layers, cparams.offload_kqv)) {
  LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
  llama_free(ctx);
  return nullptr;
@@ -8967,12 +9303,11 @@ struct llama_context * llama_new_context_with_model(
  }

  {
- static const size_t tensor_alignment = 32;
  // the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data
- ctx->buf_compute.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead());
+ ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead());

  // create measure allocator
- ctx->alloc = ggml_allocr_new_measure(tensor_alignment);
+ ctx->alloc = ggml_allocr_new_measure_from_backend(ctx->backend);

  // build worst-case graph
  int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch);
@@ -8980,98 +9315,50 @@ struct llama_context * llama_new_context_with_model(
  llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
  ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0));

- #ifdef GGML_USE_METAL
- if (model->n_gpu_layers > 0) {
- ctx->ctx_metal = ggml_metal_init(1);
- if (!ctx->ctx_metal) {
- LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
- llama_free(ctx);
- return NULL;
- }
- //ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false);
- //ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
- }
- #endif
  // measure memory requirements for the graph
- size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
+ size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf);

- LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MiB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
+ LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MiB\n", __func__, (ctx->buf_compute_meta.size() + alloc_size) / 1024.0 / 1024.0);

- // recreate allocator with exact memory requirements
+ // create allocator again with exact memory requirements
  ggml_allocr_free(ctx->alloc);

- ctx->buf_alloc.resize(alloc_size);
- ctx->alloc = ggml_allocr_new(ctx->buf_alloc.data, ctx->buf_alloc.size, tensor_alignment);
- #ifdef GGML_USE_METAL
- if (ctx->ctx_metal) {
- //ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
- }
- #endif
- #ifdef GGML_USE_CUBLAS
- ggml_cuda_set_scratch_size(alloc_size);
- LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0);
+ ctx->buf_alloc = ggml_backend_alloc_buffer(ctx->backend, alloc_size);
+ ctx->alloc = ggml_allocr_new_from_buffer(ctx->buf_alloc);
+ #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
+ if (model->n_gpu_layers > 0) {
+ ggml_cuda_set_scratch_size(alloc_size);
+ LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0);

- // calculate total VRAM usage
- auto add_tensor = [](const ggml_tensor * t, size_t & size) {
- if (t->backend == GGML_BACKEND_GPU || t->backend == GGML_BACKEND_GPU_SPLIT) {
- size += ggml_nbytes(t);
+ // calculate total VRAM usage
+ auto add_tensor = [](const ggml_tensor * t, size_t & size) {
+ if (t->backend == GGML_BACKEND_GPU || t->backend == GGML_BACKEND_GPU_SPLIT) {
+ size += ggml_nbytes(t);
+ }
+ };
+ size_t model_vram_size = 0;
+ for (const auto & kv : model->tensors_by_name) {
+ add_tensor(kv.second, model_vram_size);
  }
- };
- size_t model_vram_size = 0;
- for (const auto & kv : model->tensors_by_name) {
- add_tensor(kv.second, model_vram_size);
- }
-
- size_t kv_vram_size = 0;
- for (auto & k : ctx->kv_self.k_l) {
- add_tensor(k, kv_vram_size);
- }
- for (auto & v : ctx->kv_self.v_l) {
- add_tensor(v, kv_vram_size);
- }
-
- size_t ctx_vram_size = alloc_size + kv_vram_size;
- size_t total_vram_size = model_vram_size + ctx_vram_size;

- LLAMA_LOG_INFO("%s: total VRAM used: %.2f MiB (model: %.2f MiB, context: %.2f MiB)\n", __func__,
- total_vram_size / 1024.0 / 1024.0,
- model_vram_size / 1024.0 / 1024.0,
- ctx_vram_size / 1024.0 / 1024.0);
- #endif
- }
-
- #ifdef GGML_USE_METAL
- if (model->n_gpu_layers > 0) {
- // this allocates all Metal resources and memory buffers
-
- void * data_ptr = NULL;
- size_t data_size = 0;
-
- if (ctx->model.mapping) {
- data_ptr = ctx->model.mapping->addr;
- data_size = ctx->model.mapping->size;
- } else {
- data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
- data_size = ggml_get_mem_size (ctx->model.ctx);
- }
-
- const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
+ size_t kv_vram_size = 0;
+ for (auto & k : ctx->kv_self.k_l) {
+ add_tensor(k, kv_vram_size);
+ }
+ for (auto & v : ctx->kv_self.v_l) {
+ add_tensor(v, kv_vram_size);
+ }

- LLAMA_LOG_INFO("%s: max tensor size = %8.2f MiB\n", __func__, max_size/1024.0/1024.0);
+ size_t ctx_vram_size = alloc_size + kv_vram_size;
+ size_t total_vram_size = model_vram_size + ctx_vram_size;

- #define LLAMA_METAL_CHECK_BUF(result) \
- if (!(result)) { \
- LLAMA_LOG_ERROR("%s: failed to add buffer\n", __func__); \
- llama_free(ctx); \
- return NULL; \
+ LLAMA_LOG_INFO("%s: total VRAM used: %.2f MiB (model: %.2f MiB, context: %.2f MiB)\n", __func__,
+ total_vram_size / 1024.0 / 1024.0,
+ model_vram_size / 1024.0 / 1024.0,
+ ctx_vram_size / 1024.0 / 1024.0);
  }
-
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0));
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.data, ctx->buf_alloc.size, 0));
- #undef LLAMA_METAL_CHECK_BUF
- }
  #endif
+ }
  }

  #ifdef GGML_USE_MPI
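The compute-buffer sizing above now goes through ggml-backend: measure the worst-case graph with an allocator created from the backend, free it, then allocate a real backend buffer of the measured size and build the final allocator on top of it. A compressed sketch of that sequence, with the backend and the worst-case graph assumed to exist:

    #include "ggml.h"
    #include "ggml-alloc.h"
    #include "ggml-backend.h"

    // Sketch of the measure-then-allocate sequence used above.
    static struct ggml_allocr * make_compute_allocator(ggml_backend_t backend,
                                                       struct ggml_cgraph * worst_case_graph,
                                                       ggml_backend_buffer_t & buf_alloc) {
        // 1. measure: only offsets are computed, no real memory is reserved
        struct ggml_allocr * alloc = ggml_allocr_new_measure_from_backend(backend);
        const size_t alloc_size = ggml_allocr_alloc_graph(alloc, worst_case_graph);
        ggml_allocr_free(alloc);

        // 2. allocate a backend buffer of that size and a real allocator on top of it
        buf_alloc = ggml_backend_alloc_buffer(backend, alloc_size);
        return ggml_allocr_new_from_buffer(buf_alloc);
    }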
@@ -9099,10 +9386,14 @@ const llama_model * llama_get_model(const struct llama_context * ctx) {
  return &ctx->model;
  }

- int llama_n_ctx(const struct llama_context * ctx) {
+ uint32_t llama_n_ctx(const struct llama_context * ctx) {
  return ctx->cparams.n_ctx;
  }

+ uint32_t llama_n_batch(const struct llama_context * ctx) {
+ return ctx->cparams.n_batch;
+ }
+
  enum llama_vocab_type llama_vocab_type(const struct llama_model * model) {
  return model->vocab.type;
  }
@@ -9359,7 +9650,7 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
  const size_t s_embedding = ctx->embedding.size() * sizeof(float);
  const size_t s_kv_size = sizeof(size_t);
  const size_t s_kv_ntok = sizeof(int);
- const size_t s_kv = ctx->kv_self.buf.size;
+ const size_t s_kv = ggml_backend_buffer_get_size(ctx->kv_self.buf);

  const size_t s_total = (
  + s_rng_size
@@ -9487,7 +9778,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
  const auto n_embd = hparams.n_embd_gqa();
  const auto n_ctx = cparams.n_ctx;

- const size_t kv_buf_size = kv_self.buf.size;
+ const size_t kv_buf_size = ggml_backend_buffer_get_size(kv_self.buf);
  const uint32_t kv_head = kv_self.head;
  const uint32_t kv_size = kv_self.size;
  const uint32_t kv_used = kv_self.used;
@@ -9503,17 +9794,12 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
  ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
  ggml_cgraph * gf = ggml_new_graph(cpy_ctx);

- std::vector<std::vector<uint8_t>> kout2d_data(n_layer);
- std::vector<std::vector<uint8_t>> vout2d_data(n_layer);
+ std::vector<struct ggml_tensor *> kout2d(n_layer);
+ std::vector<struct ggml_tensor *> vout2d(n_layer);

  for (int il = 0; il < (int) n_layer; ++il) {
- ggml_tensor * kout2d = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
- kout2d_data[il].resize(ggml_nbytes(kout2d));
- kout2d->data = kout2d_data[il].data();
-
- ggml_tensor * vout2d = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
- vout2d_data[il].resize(ggml_nbytes(vout2d));
- vout2d->data = vout2d_data[il].data();
+ kout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
+ vout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);

  ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
  n_embd, kv_head,
@@ -9523,20 +9809,28 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
  kv_head, n_embd,
  elt_size*n_ctx, 0);

- ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k2d, kout2d));
- ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v2d, vout2d));
+ ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k2d, kout2d[il]));
+ ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v2d, vout2d[il]));
  }

- ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1);
+ ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(cpy_ctx, ctx->backend);

- ggml_free(cpy_ctx);
+ ggml_backend_graph_compute(ctx->backend, gf);
+
+ std::vector<uint8_t> tmp_buf;
+ for (int il = 0; il < (int) n_layer; ++il) {
+ tmp_buf.resize(ggml_nbytes(kout2d[il]));
+ ggml_backend_tensor_get(kout2d[il], tmp_buf.data(), 0, tmp_buf.size());
+ data_ctx->write(tmp_buf.data(), tmp_buf.size());

- // our data is now in the kout2d_data and vout2d_data buffers
- // write them to file
- for (uint32_t il = 0; il < n_layer; ++il) {
- data_ctx->write(kout2d_data[il].data(), kout2d_data[il].size());
- data_ctx->write(vout2d_data[il].data(), vout2d_data[il].size());
+ tmp_buf.resize(ggml_nbytes(vout2d[il]));
+ ggml_backend_tensor_get(vout2d[il], tmp_buf.data(), 0, tmp_buf.size());
+ data_ctx->write(tmp_buf.data(), tmp_buf.size());
  }
+
+ ggml_free(cpy_ctx);
+
+ ggml_backend_buffer_free(buf);
  }

  for (uint32_t i = 0; i < kv_size; ++i) {
@@ -9634,21 +9928,19 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
  memcpy(&kv_used, inp, sizeof(kv_used)); inp += sizeof(kv_used);

  if (kv_buf_size) {
- GGML_ASSERT(kv_self.buf.size == kv_buf_size);
+ GGML_ASSERT(ggml_backend_buffer_get_size(kv_self.buf) == kv_buf_size);

  const size_t elt_size = ggml_element_size(kv_self.k_l[0]);

  ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
  ggml_cgraph * gf = ggml_new_graph(cpy_ctx);

- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * kin2d = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
- kin2d->data = (void *) inp;
- inp += ggml_nbytes(kin2d);
+ std::vector<struct ggml_tensor *> kin2d(n_layer);
+ std::vector<struct ggml_tensor *> vin2d(n_layer);

- ggml_tensor * vin2d = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
- vin2d->data = (void *) inp;
- inp += ggml_nbytes(vin2d);
+ for (int il = 0; il < n_layer; ++il) {
+ kin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
+ vin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);

  ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
  n_embd, kv_head,
@@ -9658,13 +9950,26 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
  kv_head, n_embd,
  elt_size*n_ctx, 0);

- ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin2d, k2d));
- ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin2d, v2d));
+ ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin2d[il], k2d));
+ ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin2d[il], v2d));
+ }
+
+ ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(cpy_ctx, ctx->backend);
+
+ // load data into the tensors
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_backend_tensor_set(kin2d[il], inp, 0, ggml_nbytes(kin2d[il]));
+ inp += ggml_nbytes(kin2d[il]);
+
+ ggml_backend_tensor_set(vin2d[il], inp, 0, ggml_nbytes(vin2d[il]));
+ inp += ggml_nbytes(vin2d[il]);
  }

- ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1);
+ ggml_backend_graph_compute(ctx->backend, gf);

  ggml_free(cpy_ctx);
+
+ ggml_backend_buffer_free(buf);
  }

  ctx->kv_self.head = kv_head;
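Both the save path further up and the load path above now follow the same pattern, since the KV cache lives in a backend buffer: build a small copy graph in a no_alloc ggml context, allocate its tensors with ggml_backend_alloc_ctx_tensors, run it with ggml_backend_graph_compute, and move the raw bytes with ggml_backend_tensor_get/set. A reduced sketch of the read direction for a single 2D tensor, with the backend and source tensor assumed to come from the context:

    #include <vector>
    #include <cstdint>
    #include "ggml.h"
    #include "ggml-alloc.h"
    #include "ggml-backend.h"

    // Sketch: stage a backend-resident tensor into host memory via a copy graph.
    static std::vector<uint8_t> read_tensor_bytes(ggml_backend_t backend, struct ggml_tensor * src) {
        struct ggml_init_params params = {
            /*mem_size  =*/ 4*ggml_tensor_overhead() + ggml_graph_overhead(),
            /*mem_buffer=*/ nullptr,
            /*no_alloc  =*/ true,
        };
        struct ggml_context * cpy_ctx = ggml_init(params);

        struct ggml_tensor * dst = ggml_new_tensor_2d(cpy_ctx, src->type, src->ne[0], src->ne[1]);
        struct ggml_cgraph  * gf  = ggml_new_graph(cpy_ctx);
        ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, src, dst));

        // allocate the staging tensor in a backend buffer, then run the copy there
        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(cpy_ctx, backend);
        ggml_backend_graph_compute(backend, gf);

        std::vector<uint8_t> out(ggml_nbytes(dst));
        ggml_backend_tensor_get(dst, out.data(), 0, out.size());

        ggml_free(cpy_ctx);
        ggml_backend_buffer_free(buf);
        return out;
    }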
@@ -9887,6 +10192,7 @@ float * llama_get_logits(struct llama_context * ctx) {
  }

  float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
+ assert(ctx->logits_valid.at(i));
  return ctx->logits.data() + i*ctx->model.hparams.n_vocab;
  }