llama_cpp 0.10.1 → 0.10.2

@@ -1,11 +1,12 @@
  #define LLAMA_API_INTERNAL
+ //#define LLAMA_GGML_BACKEND_CUDA_TEST // for testing only - enables ggml-cuda through ggml-backend, disables partial offloading
  #include "llama.h"

  #include "unicode.h"

  #include "ggml.h"
-
  #include "ggml-alloc.h"
+ #include "ggml-backend.h"

  #ifdef GGML_USE_CUBLAS
  # include "ggml-cuda.h"
@@ -32,6 +33,7 @@
  #include <unistd.h>
  #if defined(_POSIX_MAPPED_FILES)
  #include <sys/mman.h>
+ #include <fcntl.h>
  #endif
  #if defined(_POSIX_MEMLOCK_RANGE)
  #include <sys/resource.h>
@@ -195,6 +197,7 @@ enum llm_arch {
  LLM_ARCH_BLOOM,
  LLM_ARCH_STABLELM,
  LLM_ARCH_QWEN,
+ LLM_ARCH_PHI2,
  LLM_ARCH_UNKNOWN,
  };

@@ -212,6 +215,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
  { LLM_ARCH_BLOOM, "bloom" },
  { LLM_ARCH_STABLELM, "stablelm" },
  { LLM_ARCH_QWEN, "qwen" },
+ { LLM_ARCH_PHI2, "phi2" },
  };

  enum llm_kv {
@@ -550,6 +554,19 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
+ {
+ LLM_ARCH_PHI2,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },

  {
  LLM_ARCH_UNKNOWN,
@@ -697,38 +714,6 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
  // llama helpers
  //

- inline void * llama_host_malloc(size_t n) {
- #ifdef GGML_USE_CUBLAS
- if (ggml_cublas_loaded()) {
- return ggml_cuda_host_malloc(n);
- } else {
- return malloc(n);
- }
- #elif GGML_USE_METAL
- return ggml_metal_host_malloc(n);
- #elif GGML_USE_CPU_HBM
- return hbw_malloc(n);
- #else
- return malloc(n);
- #endif
- }
-
- inline void llama_host_free(void * ptr) {
- #ifdef GGML_USE_CUBLAS
- if (ggml_cublas_loaded()) {
- return ggml_cuda_host_free(ptr);
- } else {
- return free(ptr);
- }
- #elif GGML_USE_METAL
- return ggml_metal_host_free(ptr);
- #elif GGML_USE_CPU_HBM
- return hbw_free(ptr);
- #else
- return free(ptr);
- #endif
- }
-
  #if defined(_WIN32)
  static std::string llama_format_win_err(DWORD err) {
  LPSTR buf;
@@ -743,40 +728,10 @@ static std::string llama_format_win_err(DWORD err) {
  }
  #endif

- struct llama_buffer {
- void * data = NULL;
- size_t size = 0;
-
- // fallback to malloc / free
- // useful in cases where CUDA can try to allocate PINNED memory
- bool fallback = false;
-
- void resize(size_t n) {
- llama_host_free(data);
-
- data = llama_host_malloc(n);
- if (!data) {
- fallback = true;
- data = malloc(n);
- } else {
- fallback = false;
- }
-
- GGML_ASSERT(data);
- size = n;
- }
-
- ~llama_buffer() {
- if (data) {
- if (fallback) { // NOLINT
- free(data);
- } else {
- llama_host_free(data);
- }
- }
-
- data = NULL;
- }
+ template <typename T>
+ struct no_init {
+ T value;
+ no_init() { /* do nothing */ }
  };

  struct llama_file {
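The hunk above removes the old llama_buffer helper (pinned-host allocation with a malloc fallback) and introduces no_init<T>, whose empty default constructor lets std::vector grow without zero-filling elements that are about to be overwritten anyway. A minimal standalone sketch of the effect; only the no_init template itself comes from the diff:

// Sketch: why std::vector<no_init<uint8_t>> avoids zero-initialization.
#include <cstdint>
#include <cstdio>
#include <vector>

template <typename T>
struct no_init {
    T value;
    no_init() { /* do nothing */ } // leaves `value` uninitialized on purpose
};

int main() {
    // plain uint8_t elements are value-initialized (zero-filled) on construction/resize
    std::vector<uint8_t> zeroed(1 << 20);

    // no_init<uint8_t> elements are default-constructed by the empty ctor, so the
    // bytes are left untouched - useful for scratch buffers like read_buf/buf_copy
    // later in this diff, which are always fully overwritten before being read
    std::vector<no_init<uint8_t>> scratch(1 << 20);

    printf("zeroed: %zu bytes, scratch: %zu bytes\n", zeroed.size(), scratch.size());
    return 0;
}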
@@ -864,6 +819,9 @@ struct llama_mmap {
  #ifdef _POSIX_MAPPED_FILES
  static constexpr bool SUPPORTED = true;

+ // list of mapped fragments (first_offset, last_offset)
+ std::vector<std::pair<size_t, size_t>> mapped_fragments;
+
  llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) {
  size = file->size;
  int fd = fileno(file->fp);
@@ -871,17 +829,22 @@ struct llama_mmap {
  // prefetch/readahead impairs performance on NUMA systems
  if (numa) { prefetch = 0; }
  #ifdef __linux__
+ // advise the kernel to read the file sequentially (increases readahead)
+ if (posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL)) {
+ LLAMA_LOG_WARN("warning: posix_fadvise(.., POSIX_FADV_SEQUENTIAL) failed: %s\n",
+ strerror(errno));
+ }
  if (prefetch) { flags |= MAP_POPULATE; }
  #endif
  addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
- if (addr == MAP_FAILED) {
+ if (addr == MAP_FAILED) { // NOLINT
  throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
  }

  if (prefetch > 0) {
- // Advise the kernel to preload the mapped memory
+ // advise the kernel to preload the mapped memory
  if (posix_madvise(addr, std::min(file->size, prefetch), POSIX_MADV_WILLNEED)) {
- fprintf(stderr, "warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s\n",
+ LLAMA_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s\n",
  strerror(errno));
  }
  }
@@ -889,14 +852,81 @@ struct llama_mmap {
  // advise the kernel not to use readahead
  // (because the next page might not belong on the same node)
  if (posix_madvise(addr, file->size, POSIX_MADV_RANDOM)) {
- fprintf(stderr, "warning: posix_madvise(.., POSIX_MADV_RANDOM) failed: %s\n",
+ LLAMA_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_RANDOM) failed: %s\n",
  strerror(errno));
  }
  }
+
+ // initialize list of mapped_fragments
+ mapped_fragments.emplace_back(0, file->size);
+ }
+
+ static void align_range(size_t * first, size_t * last, size_t page_size) {
+ // align first to the next page
+ size_t offset_in_page = *first & (page_size - 1);
+ size_t offset_to_page = offset_in_page == 0 ? 0 : page_size - offset_in_page;
+ *first += offset_to_page;
+
+ // align last to the previous page
+ *last = *last & ~(page_size - 1);
+
+ if (*last <= *first) {
+ *last = *first;
+ }
+ }
+
+ // partially unmap the file in the range [first, last)
+ void unmap_fragment(size_t first, size_t last) {
+ // note: this function must not be called multiple times with overlapping ranges
+ // otherwise, there is a risk of invalidating addresses that have been repurposed for other mappings
+ int page_size = sysconf(_SC_PAGESIZE);
+ align_range(&first, &last, page_size);
+ size_t len = last - first;
+
+ if (len == 0) {
+ return;
+ }
+
+ GGML_ASSERT(first % page_size == 0);
+ GGML_ASSERT(last % page_size == 0);
+ GGML_ASSERT(last > first);
+
+ void * next_page_start = (uint8_t *) addr + first;
+
+ // unmap the range
+ if (munmap(next_page_start, len)) {
+ LLAMA_LOG_WARN("warning: munmap failed: %s\n", strerror(errno));
+ }
+
+ // update the list of mapped fragments to avoid unmapping the same range again in the destructor
+ std::vector<std::pair<size_t, size_t>> new_mapped_fragments;
+ for (const auto & frag : mapped_fragments) {
+ if (frag.first < first && frag.second > last) {
+ // the range is in the middle of the fragment, split it
+ new_mapped_fragments.emplace_back(frag.first, first);
+ new_mapped_fragments.emplace_back(last, frag.second);
+ } else if (frag.first < first && frag.second > first) {
+ // the range starts in the middle of the fragment
+ new_mapped_fragments.emplace_back(frag.first, first);
+ } else if (frag.first < last && frag.second > last) {
+ // the range ends in the middle of the fragment
+ new_mapped_fragments.emplace_back(last, frag.second);
+ } else if (frag.first >= first && frag.second <= last) {
+ // the range covers the entire fragment
+ } else {
+ // the range is outside the fragment
+ new_mapped_fragments.push_back(frag);
+ }
+ }
+ mapped_fragments = std::move(new_mapped_fragments);
  }

  ~llama_mmap() {
- munmap(addr, size);
+ for (const auto & frag : mapped_fragments) {
+ if (munmap((char *) addr + frag.first, frag.second - frag.first)) {
+ LLAMA_LOG_WARN("warning: munmap failed: %s\n", strerror(errno));
+ }
+ }
  }
  #elif defined(_WIN32)
  static constexpr bool SUPPORTED = true;
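munmap can only release whole pages, so align_range above rounds first up and last down to page boundaries before unmap_fragment calls munmap; ranges smaller than one page collapse to zero length and are skipped. A standalone check of that arithmetic (the 4096-byte page size is assumed for illustration only; the function body is copied from the hunk above):

// Sketch of the page-alignment arithmetic used by llama_mmap::align_range.
#include <cassert>
#include <cstddef>
#include <cstdio>

static void align_range(size_t * first, size_t * last, size_t page_size) {
    // round `first` up to the next page boundary
    size_t offset_in_page = *first & (page_size - 1);
    size_t offset_to_page = offset_in_page == 0 ? 0 : page_size - offset_in_page;
    *first += offset_to_page;

    // round `last` down to the previous page boundary
    *last = *last & ~(page_size - 1);

    if (*last <= *first) {
        *last = *first; // nothing left to unmap
    }
}

int main() {
    size_t first = 100, last = 10000;
    align_range(&first, &last, 4096);
    assert(first == 4096 && last == 8192); // only fully covered pages are released

    first = 100; last = 4000;              // range smaller than one page
    align_range(&first, &last, 4096);
    assert(first == last);                 // len == 0, so unmap_fragment returns early

    printf("ok\n");
    return 0;
}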
@@ -939,6 +969,12 @@ struct llama_mmap {
  #endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8
  }

+ void unmap_fragment(size_t first, size_t last) {
+ // not supported
+ GGML_UNUSED(first);
+ GGML_UNUSED(last);
+ }
+
  ~llama_mmap() {
  if (!UnmapViewOfFile(addr)) {
  fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n",
@@ -955,6 +991,13 @@ struct llama_mmap {

  throw std::runtime_error(std::string("mmap not supported"));
  }
+
+ void unmap(size_t offset, size_t len) {
+ (void) offset;
+ (void) len;
+
+ throw std::runtime_error(std::string("mmap not supported"));
+ }
  #endif
  };

@@ -1128,6 +1171,26 @@ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_
  return std::string(result.data(), result.size());
  }

+ static ggml_backend_buffer_type_t llama_default_buffer_type(int n_gpu_layers) {
+ #ifdef GGML_USE_METAL
+ if (n_gpu_layers > 0) {
+ return ggml_backend_metal_buffer_type();
+ }
+ #elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST)
+ if (n_gpu_layers > 0) {
+ return ggml_backend_cuda_buffer_type(0);
+ }
+ #elif defined(GGML_USE_CUBLAS)
+ return ggml_backend_cuda_host_buffer_type();
+ #elif defined(GGML_USE_CPU_HBM)
+ return ggml_backend_cpu_hbm_buffer_type();
+ #endif
+
+ return ggml_backend_cpu_buffer_type();
+
+ GGML_UNUSED(n_gpu_layers);
+ }
+
  //
  // globals
  //
@@ -1328,14 +1391,10 @@ struct llama_kv_cache {
1328
1391
 
1329
1392
  struct ggml_context * ctx = NULL;
1330
1393
 
1331
- llama_buffer buf;
1394
+ ggml_backend_buffer_t buf = NULL;
1332
1395
 
1333
1396
  ~llama_kv_cache() {
1334
- if (ctx) {
1335
- ggml_free(ctx);
1336
- }
1337
-
1338
- #ifdef GGML_USE_CUBLAS
1397
+ #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
1339
1398
  if (ggml_cublas_loaded()) {
1340
1399
  for (size_t i = 0; i < k_l.size(); ++i) {
1341
1400
  ggml_cuda_free_data(k_l[i]);
@@ -1343,6 +1402,11 @@ struct llama_kv_cache {
1343
1402
  }
1344
1403
  }
1345
1404
  #endif
1405
+ if (ctx) {
1406
+ ggml_free(ctx);
1407
+ }
1408
+
1409
+ ggml_backend_buffer_free(buf);
1346
1410
  }
1347
1411
  };
1348
1412
 
@@ -1382,11 +1446,11 @@ struct llama_vocab {
1382
1446
  id special_suffix_id = 32008;
1383
1447
  id special_eot_id = 32010;
1384
1448
 
1385
- int find_bpe_rank(std::string token_left, std::string token_right) const {
1386
- GGML_ASSERT(token_left.find(" ") == std::string::npos);
1387
- GGML_ASSERT(token_left.find("\n") == std::string::npos);
1388
- GGML_ASSERT(token_right.find(" ") == std::string::npos);
1389
- GGML_ASSERT(token_right.find("\n") == std::string::npos);
1449
+ int find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
1450
+ GGML_ASSERT(token_left.find(' ') == std::string::npos);
1451
+ GGML_ASSERT(token_left.find('\n') == std::string::npos);
1452
+ GGML_ASSERT(token_right.find(' ') == std::string::npos);
1453
+ GGML_ASSERT(token_right.find('\n') == std::string::npos);
1390
1454
 
1391
1455
  auto it = bpe_ranks.find(std::make_pair(token_left, token_right));
1392
1456
  if (it == bpe_ranks.end()) {
@@ -1415,6 +1479,7 @@ struct llama_model {
1415
1479
  struct ggml_tensor * output_norm;
1416
1480
  struct ggml_tensor * output_norm_b;
1417
1481
  struct ggml_tensor * output;
1482
+ struct ggml_tensor * output_b;
1418
1483
 
1419
1484
  std::vector<llama_layer> layers;
1420
1485
 
@@ -1427,7 +1492,7 @@ struct llama_model {
1427
1492
  struct ggml_context * ctx = NULL;
1428
1493
 
1429
1494
  // the model memory buffer
1430
- llama_buffer buf;
1495
+ ggml_backend_buffer_t buf = NULL;
1431
1496
 
1432
1497
  // model memory mapped file
1433
1498
  std::unique_ptr<llama_mmap> mapping;
@@ -1443,11 +1508,7 @@ struct llama_model {
1443
1508
  int64_t t_start_us = 0;
1444
1509
 
1445
1510
  ~llama_model() {
1446
- if (ctx) {
1447
- ggml_free(ctx);
1448
- }
1449
-
1450
- #ifdef GGML_USE_CUBLAS
1511
+ #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
1451
1512
  if (ggml_cublas_loaded()) {
1452
1513
  for (size_t i = 0; i < tensors_by_name.size(); ++i) {
1453
1514
  ggml_cuda_free_data(tensors_by_name[i].second);
@@ -1461,24 +1522,26 @@ struct llama_model {
1461
1522
  ggml_cl_free_data(tensors_by_name[i].second);
1462
1523
  }
1463
1524
  #endif
1525
+ if (ctx) {
1526
+ ggml_free(ctx);
1527
+ }
1528
+
1529
+ ggml_backend_buffer_free(buf);
1464
1530
  }
1465
1531
  };
1466
1532
 
1467
1533
  struct llama_context {
1468
1534
  llama_context(const llama_model & model) : model(model), t_start_us(model.t_start_us), t_load_us(model.t_load_us) {}
1469
1535
  ~llama_context() {
1470
- #ifdef GGML_USE_METAL
1471
- if (ctx_metal) {
1472
- ggml_metal_free(ctx_metal);
1473
- }
1474
- #endif
1475
- if (alloc) {
1476
- ggml_allocr_free(alloc);
1477
- }
1536
+ ggml_allocr_free(alloc);
1537
+ ggml_backend_buffer_free(buf_alloc);
1538
+ ggml_backend_free(backend);
1478
1539
  }
1479
1540
 
1480
1541
  llama_cparams cparams;
1481
1542
 
1543
+ ggml_backend_t backend = nullptr;
1544
+
1482
1545
  const llama_model & model;
1483
1546
 
1484
1547
  // key + value cache for the self attention
@@ -1500,23 +1563,22 @@ struct llama_context {
1500
1563
 
1501
1564
  // decode output (2-dimensional array: [n_tokens][n_vocab])
1502
1565
  std::vector<float> logits;
1566
+ #ifndef NDEBUG
1567
+ // guard against access to unset logits
1568
+ std::vector<bool> logits_valid;
1569
+ #endif
1503
1570
  bool logits_all = false;
1504
1571
 
1505
1572
  // input embedding (1-dimensional array: [n_embd])
1506
1573
  std::vector<float> embedding;
1507
1574
 
1508
- // reusable buffer for `struct ggml_graph_plan.work_data`
1509
- std::vector<uint8_t> work_buffer;
1510
-
1511
1575
  // memory buffers used to evaluate the model
1512
- llama_buffer buf_compute;
1513
-
1514
- llama_buffer buf_alloc;
1576
+ std::vector<uint8_t> buf_compute_meta;
1577
+ ggml_backend_buffer_t buf_alloc = NULL;
1515
1578
  ggml_allocr * alloc = NULL;
1516
1579
 
1517
- #ifdef GGML_USE_METAL
1518
- ggml_metal_context * ctx_metal = NULL;
1519
- #endif
1580
+ // temporary buffer for copying data to/from the backend
1581
+ std::vector<no_init<uint8_t>> buf_copy;
1520
1582
 
1521
1583
  #ifdef GGML_USE_MPI
1522
1584
  ggml_mpi_context * ctx_mpi = NULL;
@@ -1538,9 +1600,6 @@ static bool llama_kv_cache_init(
1538
1600
  const uint32_t n_embd = hparams.n_embd_gqa();
1539
1601
  const uint32_t n_layer = hparams.n_layer;
1540
1602
 
1541
- const int64_t n_mem = n_layer*n_ctx;
1542
- const int64_t n_elements = n_embd*n_mem;
1543
-
1544
1603
  cache.has_shift = false;
1545
1604
 
1546
1605
  cache.head = 0;
@@ -1550,13 +1609,10 @@ static bool llama_kv_cache_init(
1550
1609
  cache.cells.clear();
1551
1610
  cache.cells.resize(n_ctx);
1552
1611
 
1553
- cache.buf.resize(ggml_row_size(ktype, n_elements) + ggml_row_size(vtype, n_elements) + 2u*n_layer*ggml_tensor_overhead());
1554
- memset(cache.buf.data, 0, cache.buf.size);
1555
-
1556
1612
  struct ggml_init_params params;
1557
- params.mem_size = cache.buf.size;
1558
- params.mem_buffer = cache.buf.data;
1559
- params.no_alloc = false;
1613
+ params.mem_size = 2u*n_layer*ggml_tensor_overhead();
1614
+ params.mem_buffer = NULL;
1615
+ params.no_alloc = true;
1560
1616
 
1561
1617
  cache.ctx = ggml_init(params);
1562
1618
 
@@ -1570,9 +1626,7 @@ static bool llama_kv_cache_init(
1570
1626
  cache.k_l.reserve(n_layer);
1571
1627
  cache.v_l.reserve(n_layer);
1572
1628
 
1573
- const int i_gpu_start = (int) n_layer - n_gpu_layers; GGML_UNUSED(i_gpu_start);
1574
-
1575
- GGML_UNUSED(offload);
1629
+ const int i_gpu_start = (int) n_layer - n_gpu_layers;
1576
1630
 
1577
1631
  for (int i = 0; i < (int) n_layer; i++) {
1578
1632
  ggml_tensor * k = ggml_new_tensor_1d(cache.ctx, ktype, n_embd*n_ctx);
@@ -1581,23 +1635,35 @@ static bool llama_kv_cache_init(
1581
1635
  ggml_format_name(v, "cache_v_l%d", i);
1582
1636
  cache.k_l.push_back(k);
1583
1637
  cache.v_l.push_back(v);
1584
- #ifdef GGML_USE_CUBLAS
1638
+ #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
1585
1639
  if (i >= i_gpu_start) {
1586
1640
  if (offload) {
1587
1641
  ggml_cuda_assign_buffers_no_scratch(k);
1588
- vram_kv_cache += ggml_nbytes(k);
1589
1642
  ggml_cuda_assign_buffers_no_scratch(v);
1643
+ vram_kv_cache += ggml_nbytes(k);
1590
1644
  vram_kv_cache += ggml_nbytes(v);
1645
+ // HACK: mark tensor as allocated
1646
+ k->data = v->data = (void *)(uintptr_t)1;
1591
1647
  }
1592
1648
  }
1593
1649
  #endif // GGML_USE_CUBLAS
1594
1650
  }
1595
1651
 
1652
+ // allocate tensors
1653
+ cache.buf = ggml_backend_alloc_ctx_tensors_from_buft(cache.ctx, llama_default_buffer_type(n_gpu_layers));
1654
+
1655
+ // buf may be NULL with full offload
1656
+ if (cache.buf) {
1657
+ // initialize the buffer to avoid NaNs in the padding
1658
+ ggml_backend_buffer_clear(cache.buf, 0);
1659
+ }
1660
+
1596
1661
  if (vram_kv_cache > 0) {
1597
1662
  LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
1598
1663
  }
1599
1664
 
1600
- GGML_UNUSED(n_gpu_layers);
1665
+ GGML_UNUSED(i_gpu_start);
1666
+ GGML_UNUSED(offload);
1601
1667
 
1602
1668
  return true;
1603
1669
  }
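In the hunk above, llama_kv_cache_init now builds a metadata-only ggml context (no_alloc = true) and lets ggml-backend place the K/V tensors in a single buffer chosen by llama_default_buffer_type. A condensed sketch of that pattern, using only calls that appear in this diff; it assumes it lives inside llama.cpp next to the static llama_default_buffer_type helper, and the layer count, element count, and F16 type are placeholders:

// Sketch: metadata-only context plus one backend buffer for all KV tensors.
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

static ggml_backend_buffer_t alloc_kv_example(int n_layer, int64_t n_elements, int n_gpu_layers) {
    struct ggml_init_params params;
    params.mem_size   = 2u*n_layer*ggml_tensor_overhead(); // room for K and V metadata only
    params.mem_buffer = NULL;
    params.no_alloc   = true;                              // no tensor data lives in this context

    struct ggml_context * ctx = ggml_init(params);

    for (int i = 0; i < n_layer; ++i) {
        struct ggml_tensor * k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
        struct ggml_tensor * v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
        ggml_format_name(k, "cache_k_l%d", i);
        ggml_format_name(v, "cache_v_l%d", i);
    }

    // one backend buffer (CPU, Metal, or CUDA depending on build flags) backs every tensor
    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, llama_default_buffer_type(n_gpu_layers));
    if (buf) {
        ggml_backend_buffer_clear(buf, 0); // avoid NaNs in any padding
    }

    // in llama.cpp both ctx and buf are stored in llama_kv_cache and released in its
    // destructor via ggml_free(ctx) and ggml_backend_buffer_free(buf)
    return buf;
}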
@@ -1928,7 +1994,7 @@ namespace GGUFMeta {
1928
1994
  target = override->bool_value;
1929
1995
  return true;
1930
1996
  }
1931
- return true;
1997
+ return false;
1932
1998
  }
1933
1999
 
1934
2000
  template<typename OT>
@@ -2048,17 +2114,16 @@ struct llama_model_loader {
2048
2114
  enum ggml_type type_max = GGML_TYPE_F32;
2049
2115
 
2050
2116
  for (int i = 0; i < n_tensors; i++) {
2051
- const char * name = gguf_get_tensor_name(ctx_gguf, i);
2052
- struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, name);
2117
+ enum ggml_type type = gguf_get_tensor_type(ctx_gguf, i);
2053
2118
 
2054
- n_type[meta->type]++;
2119
+ n_type[type]++;
2055
2120
 
2056
- if (n_type_max < n_type[meta->type]) {
2057
- n_type_max = n_type[meta->type];
2058
- type_max = meta->type;
2121
+ if (n_type_max < n_type[type]) {
2122
+ n_type_max = n_type[type];
2123
+ type_max = type;
2059
2124
  }
2060
2125
 
2061
- LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, name, ggml_type_name(meta->type), llama_format_tensor_shape(meta).c_str());
2126
+ // LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, name, ggml_type_name(meta->type), llama_format_tensor_shape(meta).c_str());
2062
2127
  }
2063
2128
 
2064
2129
  switch (type_max) {
@@ -2196,34 +2261,19 @@ struct llama_model_loader {
2196
2261
  return gguf_get_tensor_name(ctx_gguf, i);
2197
2262
  }
2198
2263
 
2199
- struct ggml_tensor * get_tensor_meta(int i) const {
2200
- return ggml_get_tensor(ctx_meta, get_tensor_name(i));
2264
+ struct ggml_tensor * get_tensor_meta(const char * name) const {
2265
+ return ggml_get_tensor(ctx_meta, name);
2201
2266
  }
2202
2267
 
2203
- void calc_sizes(size_t & ctx_size_p, size_t & mmapped_size_p) const {
2204
- ctx_size_p = 0;
2205
- mmapped_size_p = 0;
2206
-
2207
- for (int i = 0; i < n_tensors; i++) {
2208
- struct ggml_tensor * meta = get_tensor_meta(i);
2209
- ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE;
2210
- (use_mmap ? mmapped_size_p : ctx_size_p) += ggml_nbytes_pad(meta);
2211
- }
2268
+ struct ggml_tensor * get_tensor_meta(int i) const {
2269
+ return get_tensor_meta(get_tensor_name(i));
2212
2270
  }
2213
2271
 
2214
2272
  struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend_type backend) {
2215
- if (backend != GGML_BACKEND_CPU) {
2216
- ggml_set_no_alloc(ctx, true);
2217
- }
2218
-
2219
2273
  struct ggml_tensor * tensor = ggml_dup_tensor(ctx, meta);
2220
2274
  tensor->backend = backend; // TODO: ggml_set_backend
2221
2275
  ggml_set_name(tensor, ggml_get_name(meta));
2222
2276
 
2223
- if (backend != GGML_BACKEND_CPU) {
2224
- ggml_set_no_alloc(ctx, use_mmap);
2225
- }
2226
-
2227
2277
  n_created++;
2228
2278
 
2229
2279
  return tensor;
@@ -2281,91 +2331,144 @@ struct llama_model_loader {
2281
2331
  return gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, idx);
2282
2332
  }
2283
2333
 
2334
+ void init_mapping(bool prefetch = true) {
2335
+ /*
2336
+ // prefetch only CPU tensors
2337
+ if (use_mmap) {
2338
+ size_t size_pref = 0; // prefetch
2339
+
2340
+ for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
2341
+ struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i));
2342
+ if (cur->backend == GGML_BACKEND_CPU) {
2343
+ size_t tensor_end = gguf_get_tensor_offset(ctx_gguf, i) + ggml_nbytes(cur);
2344
+ size_pref = std::max(size_pref, tensor_end);
2345
+ }
2346
+ }
2347
+ mapping.reset(new llama_mmap(&file, gguf_get_data_offset(ctx_gguf) + size_pref, ggml_is_numa()));
2348
+ }
2349
+ */
2350
+ // prefetch the whole file - all the data is needed anyway
2351
+ if (use_mmap) {
2352
+ mapping.reset(new llama_mmap(&file, prefetch ? -1 : 0, ggml_is_numa()));
2353
+ }
2354
+ }
2355
+
2356
+ // for backwards compatibility, does not support ggml-backend
2284
2357
  void load_data_for(struct ggml_tensor * cur) const {
2285
2358
  const size_t offs = file_offset(ggml_get_name(cur));
2286
2359
 
2287
- if (use_mmap) {
2288
- cur->data = (uint8_t *) mapping->addr + offs;
2360
+ if (use_mmap && mapping) {
2361
+ GGML_ASSERT(cur->data == nullptr);
2362
+ cur->data = (uint8_t *)mapping->addr + offs;
2289
2363
  } else {
2364
+ GGML_ASSERT(cur->data != nullptr);
2290
2365
  file.seek(offs, SEEK_SET);
2291
2366
  file.read_raw(cur->data, ggml_nbytes(cur));
2292
2367
  }
2293
2368
  }
2294
2369
 
2295
- void load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
2370
+ // Returns false if cancelled by progress_callback
2371
+ bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) const {
2296
2372
  size_t size_data = 0;
2297
- size_t size_lock = 0;
2298
- size_t size_pref = 0; // prefetch
2299
2373
 
2300
2374
  for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
2301
2375
  struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i));
2302
2376
  size_data += ggml_nbytes(cur);
2303
- if (cur->backend == GGML_BACKEND_CPU) {
2304
- size_pref += ggml_nbytes(cur);
2305
- }
2306
2377
  }
2307
2378
 
2308
- if (use_mmap) {
2309
- mapping.reset(new llama_mmap(&file, size_pref, ggml_is_numa()));
2379
+ if (use_mmap && buf_mmap) {
2310
2380
  if (lmlock) {
2311
2381
  lmlock->init(mapping->addr);
2312
2382
  }
2313
2383
  }
2314
2384
 
2315
- size_t done_size = 0;
2385
+ #if (defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)) || defined(GGML_USE_CLBLAST)
2386
+ const bool legacy_offload = true;
2387
+ #else
2388
+ const bool legacy_offload = false;
2389
+ #endif
2390
+
2391
+ std::vector<no_init<uint8_t>> read_buf;
2392
+
2393
+ size_t size_done = 0;
2394
+
2395
+ size_t mmap_first = -1;
2396
+ size_t mmap_last = 0;
2397
+
2316
2398
  for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
2317
2399
  struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i));
2318
2400
  GGML_ASSERT(cur); // unused tensors should have been caught by load_data already
2319
2401
 
2320
2402
  if (progress_callback) {
2321
- progress_callback((float) done_size / size_data, progress_callback_user_data);
2322
- }
2323
-
2324
- // allocate temp buffer if not using mmap
2325
- if (!use_mmap && cur->data == NULL) {
2326
- GGML_ASSERT(cur->backend != GGML_BACKEND_CPU);
2327
- #ifdef GGML_USE_CPU_HBM
2328
- cur->data = (uint8_t*)hbw_malloc(ggml_nbytes(cur));
2329
- #else
2330
- cur->data = (uint8_t*)malloc(ggml_nbytes(cur));
2331
- #endif
2403
+ if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
2404
+ return false;
2405
+ }
2332
2406
  }
2333
2407
 
2334
- load_data_for(cur);
2408
+ const size_t offs = file_offset(ggml_get_name(cur));
2335
2409
 
2336
- switch (cur->backend) {
2337
- case GGML_BACKEND_CPU:
2338
- if (use_mmap && lmlock) {
2339
- size_lock += ggml_nbytes(cur);
2340
- lmlock->grow_to(size_lock);
2410
+ if (!legacy_offload || cur->backend == GGML_BACKEND_CPU) {
2411
+ if (use_mmap && mapping) {
2412
+ if (buf_mmap) {
2413
+ ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + offs);
2414
+ if (lmlock) {
2415
+ lmlock->grow_to(offs + ggml_nbytes(cur));
2416
+ }
2417
+ mmap_first = std::min(mmap_first, offs);
2418
+ mmap_last = std::max(mmap_last, offs + ggml_nbytes(cur));
2419
+ } else {
2420
+ ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + offs, 0, ggml_nbytes(cur));
2341
2421
  }
2342
- break;
2343
- #ifdef GGML_USE_CUBLAS
2344
- case GGML_BACKEND_GPU:
2345
- case GGML_BACKEND_GPU_SPLIT:
2346
- // old code:
2347
- //ggml_cuda_transform_tensor(lt.data, lt.ggml_tensor);
2348
-
2349
- // TODO: test if this works !!
2350
- ggml_cuda_transform_tensor(cur->data, cur);
2351
- if (!use_mmap) {
2352
- free(cur->data);
2422
+ } else {
2423
+ if (ggml_backend_buffer_is_host(cur->buffer)) {
2424
+ file.seek(offs, SEEK_SET);
2425
+ file.read_raw(cur->data, ggml_nbytes(cur));
2426
+ } else {
2427
+ read_buf.resize(ggml_nbytes(cur));
2428
+ file.seek(offs, SEEK_SET);
2429
+ file.read_raw(read_buf.data(), ggml_nbytes(cur));
2430
+ ggml_backend_tensor_set(cur, read_buf.data(), 0, ggml_nbytes(cur));
2353
2431
  }
2354
- break;
2432
+ }
2433
+ } else {
2434
+ // HACK: mark tensor as allocated
2435
+ cur->data = (void *)(uintptr_t)1;
2436
+ void * data;
2437
+ if (use_mmap && mapping) {
2438
+ data = (uint8_t *) mapping->addr + offs;
2439
+ } else {
2440
+ read_buf.resize(ggml_nbytes(cur));
2441
+ file.seek(offs, SEEK_SET);
2442
+ file.read_raw(read_buf.data(), ggml_nbytes(cur));
2443
+ data = read_buf.data();
2444
+ }
2445
+
2446
+ #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
2447
+ ggml_cuda_transform_tensor(data, cur);
2355
2448
  #elif defined(GGML_USE_CLBLAST)
2356
- case GGML_BACKEND_GPU:
2357
- ggml_cl_transform_tensor(cur->data, cur);
2358
- if (!use_mmap) {
2359
- free(cur->data);
2360
- }
2361
- break;
2449
+ GGML_ASSERT(cur->backend == GGML_BACKEND_GPU);
2450
+ ggml_cl_transform_tensor(data, cur);
2451
+ #else
2452
+ GGML_ASSERT(!"GPU tensor without a GPU backend");
2453
+ GGML_UNUSED(data);
2362
2454
  #endif
2363
- default:
2364
- continue;
2365
2455
  }
2366
2456
 
2367
- done_size += ggml_nbytes(cur);
2457
+ size_done += ggml_nbytes(cur);
2368
2458
  }
2459
+
2460
+ // unmap offloaded tensors and metadata
2461
+ if (use_mmap && mapping) {
2462
+ mapping->unmap_fragment(0, mmap_first);
2463
+ mapping->unmap_fragment(mmap_last, mapping->size);
2464
+ }
2465
+
2466
+ if (progress_callback) {
2467
+ // Even though the model is done loading, we still honor
2468
+ // cancellation since we need to free allocations.
2469
+ return progress_callback(1.0f, progress_callback_user_data);
2470
+ }
2471
+ return true;
2369
2472
  }
2370
2473
  };
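load_all_data above (and llm_load_tensors / llama_model_load further down) now treats llama_progress_callback as returning a bool: returning false cancels loading, and the final progress_callback(1.0f, ...) is still invoked so the caller can cancel even after all tensors were read. A standalone, library-agnostic sketch of the same pattern:

// Sketch of the cancellable progress-callback pattern used by load_all_data.
#include <cstdio>

typedef bool (*progress_cb)(float progress, void * user_data);

static bool load_chunks(int n_chunks, progress_cb cb, void * ud) {
    for (int i = 0; i < n_chunks; ++i) {
        if (cb && !cb((float) i / n_chunks, ud)) {
            return false; // cancelled: the caller is expected to free what was allocated
        }
        // ... read / upload chunk i here ...
    }
    // even when done, honor one last callback so the caller may still cancel
    return cb ? cb(1.0f, ud) : true;
}

int main() {
    int budget = 3; // allow a few updates, then cancel
    progress_cb cb = [](float p, void * ud) -> bool {
        int * left = (int *) ud;
        printf("progress: %.2f\n", p);
        return (*left)-- > 0;
    };
    printf("loaded: %s\n", load_chunks(10, cb, &budget) ? "yes" : "cancelled");
    return 0;
}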
2371
2474
 
@@ -2388,25 +2491,25 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
2388
2491
 
2389
2492
  switch (ftype) {
2390
2493
  case LLAMA_FTYPE_ALL_F32: return "all F32";
2391
- case LLAMA_FTYPE_MOSTLY_F16: return "mostly F16";
2392
- case LLAMA_FTYPE_MOSTLY_Q4_0: return "mostly Q4_0";
2393
- case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
2494
+ case LLAMA_FTYPE_MOSTLY_F16: return "F16";
2495
+ case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
2496
+ case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
2394
2497
  case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
2395
- return "mostly Q4_1, some F16";
2396
- case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
2397
- case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
2398
- case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
2498
+ return "Q4_1, some F16";
2499
+ case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0";
2500
+ case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1";
2501
+ case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
2399
2502
 
2400
2503
  // K-quants
2401
- case LLAMA_FTYPE_MOSTLY_Q2_K: return "mostly Q2_K";
2402
- case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "mostly Q3_K - Small";
2403
- case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "mostly Q3_K - Medium";
2404
- case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "mostly Q3_K - Large";
2405
- case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "mostly Q4_K - Small";
2406
- case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "mostly Q4_K - Medium";
2407
- case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "mostly Q5_K - Small";
2408
- case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "mostly Q5_K - Medium";
2409
- case LLAMA_FTYPE_MOSTLY_Q6_K: return "mostly Q6_K";
2504
+ case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K";
2505
+ case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
2506
+ case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium";
2507
+ case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large";
2508
+ case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "Q4_K - Small";
2509
+ case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "Q4_K - Medium";
2510
+ case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
2511
+ case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
2512
+ case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
2410
2513
 
2411
2514
  default: return "unknown, may not work";
2412
2515
  }
@@ -2524,6 +2627,7 @@ static void llm_load_hparams(
2524
2627
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2525
2628
 
2526
2629
  switch (hparams.n_layer) {
2630
+ case 22: model.type = e_model::MODEL_1B; break;
2527
2631
  case 26: model.type = e_model::MODEL_3B; break;
2528
2632
  case 32: model.type = e_model::MODEL_7B; break;
2529
2633
  case 40: model.type = e_model::MODEL_13B; break;
@@ -2625,6 +2729,15 @@ static void llm_load_hparams(
2625
2729
  default: model.type = e_model::MODEL_UNKNOWN;
2626
2730
  }
2627
2731
  } break;
2732
+ case LLM_ARCH_PHI2:
2733
+ {
2734
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
2735
+
2736
+ switch (hparams.n_layer) {
2737
+ case 32: model.type = e_model::MODEL_3B; break;
2738
+ default: model.type = e_model::MODEL_UNKNOWN;
2739
+ }
2740
+ } break;
2628
2741
 
2629
2742
  default: (void)0;
2630
2743
  }
@@ -2932,7 +3045,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
2932
3045
  if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
2933
3046
  }
2934
3047
 
2935
- static void llm_load_tensors(
3048
+ // Returns false if cancelled by progress_callback
3049
+ static bool llm_load_tensors(
2936
3050
  llama_model_loader & ml,
2937
3051
  llama_model & model,
2938
3052
  int n_gpu_layers,
@@ -2948,25 +3062,16 @@ static void llm_load_tensors(
2948
3062
 
2949
3063
  model.n_gpu_layers = n_gpu_layers;
2950
3064
 
2951
- size_t ctx_size;
2952
- size_t mmapped_size;
2953
-
2954
- ml.calc_sizes(ctx_size, mmapped_size);
3065
+ size_t ctx_size = ggml_tensor_overhead() * ml.n_tensors;
2955
3066
 
2956
- LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, ctx_size/1024.0/1024.0);
3067
+ LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, ctx_size/1024.0/1024.0);
2957
3068
 
2958
3069
  // create the ggml context
2959
3070
  {
2960
- model.buf.resize(ctx_size);
2961
- if (use_mlock) {
2962
- model.mlock_buf.init (model.buf.data);
2963
- model.mlock_buf.grow_to(model.buf.size);
2964
- }
2965
-
2966
3071
  struct ggml_init_params params = {
2967
- /*.mem_size =*/ model.buf.size,
2968
- /*.mem_buffer =*/ model.buf.data,
2969
- /*.no_alloc =*/ ml.use_mmap,
3072
+ /*.mem_size =*/ ctx_size,
3073
+ /*.mem_buffer =*/ NULL,
3074
+ /*.no_alloc =*/ true,
2970
3075
  };
2971
3076
 
2972
3077
  model.ctx = ggml_init(params);
@@ -2977,25 +3082,24 @@ static void llm_load_tensors(
2977
3082
 
2978
3083
  (void) main_gpu;
2979
3084
 
2980
- enum ggml_backend_type llama_backend_offload = GGML_BACKEND_CPU;
3085
+ enum ggml_backend_type llama_backend_offload = GGML_BACKEND_CPU;
2981
3086
  enum ggml_backend_type llama_backend_offload_split = GGML_BACKEND_CPU;
2982
3087
 
2983
- #ifdef GGML_USE_CUBLAS
3088
+ #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
2984
3089
  if (ggml_cublas_loaded()) {
2985
3090
  LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__);
2986
3091
  ggml_cuda_set_main_device(main_gpu);
2987
3092
 
2988
- llama_backend_offload = GGML_BACKEND_GPU;
3093
+ llama_backend_offload = GGML_BACKEND_GPU;
2989
3094
  llama_backend_offload_split = GGML_BACKEND_GPU_SPLIT;
2990
3095
  }
2991
3096
  #elif defined(GGML_USE_CLBLAST)
2992
3097
  LLAMA_LOG_INFO("%s: using OpenCL for GPU acceleration\n", __func__);
2993
- llama_backend_offload = GGML_BACKEND_GPU;
3098
+ llama_backend_offload = GGML_BACKEND_GPU;
2994
3099
  llama_backend_offload_split = GGML_BACKEND_GPU;
2995
3100
  #endif
2996
3101
 
2997
- // prepare memory for the weights
2998
- size_t vram_weights = 0;
3102
+ // create tensors for the weights
2999
3103
  {
3000
3104
  const int64_t n_embd = hparams.n_embd;
3001
3105
  const int64_t n_embd_gqa = hparams.n_embd_gqa();
@@ -3024,13 +3128,6 @@ static void llm_load_tensors(
3024
3128
 
3025
3129
  model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
3026
3130
  model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
3027
-
3028
- if (backend_norm == GGML_BACKEND_GPU) {
3029
- vram_weights += ggml_nbytes(model.output_norm);
3030
- }
3031
- if (backend_output == GGML_BACKEND_GPU_SPLIT) {
3032
- vram_weights += ggml_nbytes(model.output);
3033
- }
3034
3131
  }
3035
3132
 
3036
3133
  const uint32_t n_ff = hparams.n_ff;
@@ -3080,28 +3177,6 @@ static void llm_load_tensors(
3080
3177
  layer.ffn_up_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), {n_embd, n_ff}, backend_split);
3081
3178
  }
3082
3179
  }
3083
-
3084
- if (backend == GGML_BACKEND_GPU) {
3085
- vram_weights +=
3086
- ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
3087
- ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) +
3088
- (layer.bq ? ggml_nbytes(layer.bq) : 0) +
3089
- (layer.bk ? ggml_nbytes(layer.bk) : 0) +
3090
- (layer.bv ? ggml_nbytes(layer.bv) : 0) +
3091
- (layer.bo ? ggml_nbytes(layer.bo) : 0) +
3092
- ggml_nbytes(layer.ffn_norm);
3093
-
3094
- if (layer.ffn_gate_inp == nullptr) {
3095
- vram_weights +=
3096
- ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
3097
- } else {
3098
- vram_weights += ggml_nbytes(layer.ffn_gate_inp);
3099
- for (uint32_t x = 0; x < hparams.n_expert; ++x) {
3100
- vram_weights +=
3101
- ggml_nbytes(layer.ffn_gate_exp[x]) + ggml_nbytes(layer.ffn_down_exp[x]) + ggml_nbytes(layer.ffn_up_exp[x]);
3102
- }
3103
- }
3104
- }
3105
3180
  }
3106
3181
  } break;
3107
3182
  case LLM_ARCH_BAICHUAN:
@@ -3121,13 +3196,6 @@ static void llm_load_tensors(
3121
3196
 
3122
3197
  model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
3123
3198
  model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
3124
-
3125
- if (backend_norm == GGML_BACKEND_GPU) {
3126
- vram_weights += ggml_nbytes(model.output_norm);
3127
- }
3128
- if (backend_output == GGML_BACKEND_GPU_SPLIT) {
3129
- vram_weights += ggml_nbytes(model.output);
3130
- }
3131
3199
  }
3132
3200
 
3133
3201
  const uint32_t n_ff = hparams.n_ff;
@@ -3154,19 +3222,10 @@ static void llm_load_tensors(
3154
3222
  layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
3155
3223
  layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
3156
3224
  layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
3157
-
3158
- if (backend == GGML_BACKEND_GPU) {
3159
- vram_weights +=
3160
- ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
3161
- ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
3162
- ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
3163
- }
3164
3225
  }
3165
3226
  } break;
3166
3227
  case LLM_ARCH_FALCON:
3167
3228
  {
3168
- // TODO: CPU-only for now
3169
-
3170
3229
  model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
3171
3230
 
3172
3231
  // output
@@ -3185,14 +3244,6 @@ static void llm_load_tensors(
3185
3244
  model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
3186
3245
  model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
3187
3246
  model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
3188
-
3189
- if (backend_norm == GGML_BACKEND_GPU) {
3190
- vram_weights += ggml_nbytes(model.output_norm);
3191
- vram_weights += ggml_nbytes(model.output_norm_b);
3192
- }
3193
- if (backend_output == GGML_BACKEND_GPU_SPLIT) {
3194
- vram_weights += ggml_nbytes(model.output);
3195
- }
3196
3247
  }
3197
3248
 
3198
3249
  const uint32_t n_ff = hparams.n_ff;
@@ -3213,11 +3264,6 @@ static void llm_load_tensors(
3213
3264
  if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str()) >= 0) {
3214
3265
  layer.attn_norm_2 = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, backend);
3215
3266
  layer.attn_norm_2_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, backend);
3216
-
3217
- if (backend == GGML_BACKEND_GPU) {
3218
- vram_weights += ggml_nbytes(layer.attn_norm_2);
3219
- vram_weights += ggml_nbytes(layer.attn_norm_2_b);
3220
- }
3221
3267
  }
3222
3268
 
3223
3269
  layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
@@ -3225,13 +3271,6 @@ static void llm_load_tensors(
3225
3271
 
3226
3272
  layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
3227
3273
  layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
3228
-
3229
- if (backend == GGML_BACKEND_GPU) {
3230
- vram_weights +=
3231
- ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
3232
- ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.wo) +
3233
- ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
3234
- }
3235
3274
  }
3236
3275
  } break;
3237
3276
  case LLM_ARCH_STARCODER:
@@ -3255,14 +3294,6 @@ static void llm_load_tensors(
3255
3294
  model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
3256
3295
  model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
3257
3296
  model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
3258
-
3259
- if (backend_norm == GGML_BACKEND_GPU) {
3260
- vram_weights += ggml_nbytes(model.output_norm);
3261
- vram_weights += ggml_nbytes(model.output_norm_b);
3262
- }
3263
- if (backend_output == GGML_BACKEND_GPU_SPLIT) {
3264
- vram_weights += ggml_nbytes(model.output);
3265
- }
3266
3297
  }
3267
3298
 
3268
3299
  const uint32_t n_ff = hparams.n_ff;
@@ -3294,16 +3325,6 @@ static void llm_load_tensors(
3294
3325
 
3295
3326
  layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
3296
3327
  layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
3297
-
3298
- if (backend == GGML_BACKEND_GPU) {
3299
- vram_weights +=
3300
- ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
3301
- ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) +
3302
- ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) +
3303
- ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) +
3304
- ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_down_b) +
3305
- ggml_nbytes(layer.ffn_up) + ggml_nbytes(layer.ffn_up_b);
3306
- }
3307
3328
  }
3308
3329
  } break;
3309
3330
  case LLM_ARCH_PERSIMMON:
@@ -3325,14 +3346,6 @@ static void llm_load_tensors(
3325
3346
  model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
3326
3347
  model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
3327
3348
  model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
3328
-
3329
- if (backend_norm == GGML_BACKEND_GPU) {
3330
- vram_weights += ggml_nbytes(model.output_norm);
3331
- vram_weights += ggml_nbytes(model.output_norm_b);
3332
- }
3333
- if (backend_output == GGML_BACKEND_GPU_SPLIT) {
3334
- vram_weights += ggml_nbytes(model.output);
3335
- }
3336
3349
  }
3337
3350
 
3338
3351
  const uint32_t n_ff = hparams.n_ff;
@@ -3362,8 +3375,6 @@ static void llm_load_tensors(
3362
3375
  } break;
3363
3376
  case LLM_ARCH_BLOOM:
3364
3377
  {
3365
- // TODO: CPU-only for now
3366
-
3367
3378
  model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
3368
3379
  model.tok_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, GGML_BACKEND_CPU);
3369
3380
  model.tok_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, GGML_BACKEND_CPU);
@@ -3384,14 +3395,6 @@ static void llm_load_tensors(
3384
3395
  model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
3385
3396
  model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
3386
3397
  model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
3387
-
3388
- if (backend_norm == GGML_BACKEND_GPU) {
3389
- vram_weights += ggml_nbytes(model.output_norm);
3390
- vram_weights += ggml_nbytes(model.output_norm_b);
3391
- }
3392
- if (backend_output == GGML_BACKEND_GPU_SPLIT) {
3393
- vram_weights += ggml_nbytes(model.output);
3394
- }
3395
3398
  }
3396
3399
 
3397
3400
  const uint32_t n_ff = hparams.n_ff;
@@ -3423,16 +3426,6 @@ static void llm_load_tensors(
3423
3426
 
3424
3427
  layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
3425
3428
  layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
3426
-
3427
- if (backend == GGML_BACKEND_GPU) {
3428
- vram_weights +=
3429
- ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
3430
- ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) +
3431
- ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) +
3432
- ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) +
3433
- ggml_nbytes(layer.ffn_up) + ggml_nbytes(layer.ffn_up_b) +
3434
- ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_down_b);
3435
- }
3436
3429
  }
3437
3430
  } break;
3438
3431
  case LLM_ARCH_MPT:
@@ -3454,13 +3447,6 @@ static void llm_load_tensors(
3454
3447
 
3455
3448
  model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
3456
3449
  model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
3457
-
3458
- if (backend_norm == GGML_BACKEND_GPU) {
3459
- vram_weights += ggml_nbytes(model.output_norm);
3460
- }
3461
- if (backend_output == GGML_BACKEND_GPU_SPLIT) {
3462
- vram_weights += ggml_nbytes(model.output);
3463
- }
3464
3450
  }
3465
3451
 
3466
3452
  const uint32_t n_ff = hparams.n_ff;
@@ -3483,16 +3469,6 @@ static void llm_load_tensors(
3483
3469
 
3484
3470
  layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
3485
3471
  layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
3486
-
3487
- if (backend == GGML_BACKEND_GPU) {
3488
- vram_weights +=
3489
- ggml_nbytes(layer.attn_norm) +
3490
- ggml_nbytes(layer.wqkv) +
3491
- ggml_nbytes(layer.wo) +
3492
- ggml_nbytes(layer.ffn_norm) +
3493
- ggml_nbytes(layer.ffn_down) +
3494
- ggml_nbytes(layer.ffn_up);
3495
- }
3496
3472
  }
3497
3473
  } break;
3498
3474
  case LLM_ARCH_STABLELM:
@@ -3515,13 +3491,6 @@ static void llm_load_tensors(
3515
3491
  model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
3516
3492
  model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
3517
3493
  model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
3518
-
3519
- if (backend_norm == GGML_BACKEND_GPU) {
3520
- vram_weights += ggml_nbytes(model.output_norm);
3521
- }
3522
- if (backend_output == GGML_BACKEND_GPU_SPLIT) {
3523
- vram_weights += ggml_nbytes(model.output);
3524
- }
3525
3494
  }
3526
3495
 
3527
3496
  const uint32_t n_ff = hparams.n_ff;
@@ -3553,13 +3522,6 @@ static void llm_load_tensors(
3553
3522
  layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
3554
3523
  layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
3555
3524
  layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
3556
-
3557
- if (backend == GGML_BACKEND_GPU) {
3558
- vram_weights +=
3559
- ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
3560
- ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
3561
- ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
3562
- }
3563
3525
  }
3564
3526
  } break;
3565
3527
  case LLM_ARCH_QWEN:
@@ -3579,14 +3541,7 @@ static void llm_load_tensors(
3579
3541
 
3580
3542
  model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
3581
3543
  model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
3582
-
3583
- if (backend_norm == GGML_BACKEND_GPU) {
3584
- vram_weights += ggml_nbytes(model.output_norm);
3585
- }
3586
- if (backend_output == GGML_BACKEND_GPU_SPLIT) {
3587
- vram_weights += ggml_nbytes(model.output);
3588
- }
3589
- }
3544
+ }
3590
3545
 
3591
3546
  const uint32_t n_ff = hparams.n_ff / 2;
3592
3547
 
@@ -3611,16 +3566,59 @@ static void llm_load_tensors(
3611
3566
  layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
3612
3567
  layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
3613
3568
  layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
3569
+ }
3570
+ } break;
3571
+ case LLM_ARCH_PHI2:
3572
+ {
3573
+ model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
3574
+
3575
+ // output
3576
+ {
3577
+ ggml_backend_type backend_norm;
3578
+ ggml_backend_type backend_output;
3614
3579
 
3615
- if (backend == GGML_BACKEND_GPU) {
3616
- vram_weights +=
3617
- ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) +
3618
- ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_gate) +
3619
- ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
3580
+ if (n_gpu_layers > int(n_layer)) {
3581
+ backend_norm = llama_backend_offload;
3582
+ backend_output = llama_backend_offload;
3583
+ } else {
3584
+ backend_norm = GGML_BACKEND_CPU;
3585
+ backend_output = GGML_BACKEND_CPU;
3620
3586
  }
3587
+
3588
+ model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
3589
+ model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
3590
+ model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
3591
+ model.output_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, backend_output);
3621
3592
  }
3622
- } break;
3623
3593
 
3594
+ const uint32_t n_ff = hparams.n_ff;
3595
+
3596
+ const int i_gpu_start = n_layer - n_gpu_layers;
3597
+
3598
+ model.layers.resize(n_layer);
3599
+
3600
+ for (uint32_t i = 0; i < n_layer; ++i) {
3601
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
3602
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
3603
+
3604
+ auto & layer = model.layers[i];
3605
+
3606
+ layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
3607
+ layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
3608
+
3609
+ layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
3610
+ layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend);
3611
+
3612
+ layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
3613
+ layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend);
3614
+
3615
+ layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
3616
+ layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend);
3617
+
3618
+ layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
3619
+ layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
3620
+ }
3621
+ } break;
3624
3622
  default:
3625
3623
  throw std::runtime_error("unknown architecture");
3626
3624
  }
@@ -3628,16 +3626,78 @@ static void llm_load_tensors(
3628
3626
 
3629
3627
  ml.done_getting_tensors();
3630
3628
 
3629
+ ml.init_mapping();
3630
+
3631
+ // allocate tensors
3632
+ size_t vram_weights = 0;
3633
+ size_t buf_size = 0;
3634
+
3635
+ ggml_backend_buffer_type_t buft = llama_default_buffer_type(n_gpu_layers);
3636
+
3637
+ for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
3638
+ // GGML_BACKEND_GPU tensors are for CUDA and OpenCL only, which are handled separately without ggml-backend
3639
+ if (t->backend == GGML_BACKEND_CPU) {
3640
+ buf_size += GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), ggml_backend_buft_get_alignment(buft));
3641
+ } else {
3642
+ vram_weights += ggml_nbytes(t);
3643
+ }
3644
+ }
3645
+
3646
+ // create backend buffer
3647
+ ggml_backend_buffer_t buf_mmap = nullptr;
3648
+
3649
+ #ifdef GGML_USE_METAL
3650
+ if (n_gpu_layers > 0) {
3651
+ if (ml.use_mmap) {
3652
+ const size_t max_size = ggml_get_max_tensor_size(ctx);
3653
+ model.buf = ggml_backend_metal_buffer_from_ptr(ml.mapping->addr, ml.mapping->size, max_size);
3654
+ buf_mmap = model.buf;
3655
+ } else {
3656
+ model.buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_metal_buffer_type());
3657
+ }
3658
+ }
3659
+ #elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST)
3660
+ // for testing only
3661
+ if (n_gpu_layers > 0) {
3662
+ model.buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cuda_buffer_type(0));
3663
+ }
3664
+ #endif
3665
+
3666
+ if (model.buf == nullptr) {
3667
+ // CPU backend, and indirectly CUDA and OpenCL
3668
+ if (ml.use_mmap) {
3669
+ model.buf = ggml_backend_cpu_buffer_from_ptr(ml.mapping->addr, ml.mapping->size);
3670
+ buf_mmap = model.buf;
3671
+ } else {
3672
+ // allocate only CPU tensors
3673
+ model.buf = ggml_backend_buft_alloc_buffer(buft, buf_size);
3674
+ ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(model.buf);
3675
+ for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
3676
+ if (t->backend == GGML_BACKEND_CPU) {
3677
+ ggml_tallocr_alloc(alloc, t);
3678
+ }
3679
+ }
3680
+ ggml_tallocr_free(alloc);
3681
+ }
3682
+ }
3683
+
3684
+ if (use_mlock && ggml_backend_buffer_is_host(model.buf)) {
3685
+ model.mlock_buf.init (ggml_backend_buffer_get_base(model.buf));
3686
+ model.mlock_buf.grow_to(ggml_backend_buffer_get_size(model.buf));
3687
+ }
3688
+
3631
3689
  // print memory requirements
3632
3690
  {
3633
- // this is the total memory required to run the inference
3634
- size_t mem_required =
3635
- ctx_size +
3636
- mmapped_size - vram_weights; // weights in VRAM not in memory
3691
+ size_t sys_mem_required = ctx_size + buf_size;
3637
3692
 
3638
- LLAMA_LOG_INFO("%s: mem required = %7.2f MiB\n", __func__, mem_required / 1024.0 / 1024.0);
3693
+ if (sys_mem_required > 0) {
3694
+ LLAMA_LOG_INFO("%s: system memory used = %7.2f MiB\n", __func__, sys_mem_required / 1024.0 / 1024.0);
3695
+ }
3696
+ if (vram_weights > 0) {
3697
+ LLAMA_LOG_INFO("%s: VRAM used = %7.2f MiB\n", __func__, vram_weights / 1024.0 / 1024.0);
3698
+ }
3639
3699
 
3640
- #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
3700
+ #if (defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)) || defined(GGML_USE_CLBLAST)
3641
3701
  const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
3642
3702
 
3643
3703
  LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
@@ -3645,38 +3705,27 @@ static void llm_load_tensors(
3645
3705
  LLAMA_LOG_INFO("%s: offloading non-repeating layers to GPU\n", __func__);
3646
3706
  }
3647
3707
 
3648
- #ifdef GGML_USE_CUBLAS
3649
- const int max_backend_supported_layers = hparams.n_layer + 1;
3650
- const int max_offloadable_layers = hparams.n_layer + 1;
3651
- #elif GGML_USE_CLBLAST
3652
3708
  const int max_backend_supported_layers = hparams.n_layer + 1;
3653
3709
  const int max_offloadable_layers = hparams.n_layer + 1;
3654
- #endif // GGML_USE_CUBLAS
3655
3710
 
3656
3711
  LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
3657
- LLAMA_LOG_INFO("%s: VRAM used: %.2f MiB\n", __func__, vram_weights / 1024.0 / 1024.0);
3658
- #else
3659
- (void) n_gpu_layers;
3660
3712
  #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
3661
3713
  }
3662
3714
 
3663
- // populate `tensors_by_name`
3715
+ #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
3716
+ ggml_cuda_set_tensor_split(tensor_split);
3717
+ #else
3718
+ GGML_UNUSED(tensor_split);
3719
+ #endif // GGML_USE_CUBLAS
3720
+
3721
+ // populate tensors_by_name
3664
3722
  for (int i = 0; i < ml.n_tensors; ++i) {
3665
3723
  struct ggml_tensor * cur = ggml_get_tensor(ctx, ml.get_tensor_name(i));
3666
3724
  model.tensors_by_name.emplace_back(ggml_get_name(cur), cur);
3667
3725
  }
3668
3726
 
3669
- (void) tensor_split;
3670
- #ifdef GGML_USE_CUBLAS
3671
- {
3672
- ggml_cuda_set_tensor_split(tensor_split);
3673
- }
3674
- #endif
3675
-
3676
- ml.load_all_data(ctx, progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL);
3677
-
3678
- if (progress_callback) {
3679
- progress_callback(1.0f, progress_callback_user_data);
3727
+ if (!ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf_mmap, use_mlock ? &model.mlock_mmap : NULL)) {
3728
+ return false;
3680
3729
  }
3681
3730
 
3682
3731
  model.mapping = std::move(ml.mapping);
@@ -3684,9 +3733,11 @@ static void llm_load_tensors(
3684
3733
  // loading time will be recalculated after the first eval, so
3685
3734
  // we take page faults deferred by mmap() into consideration
3686
3735
  model.t_load_us = ggml_time_us() - model.t_start_us;
3736
+ return true;
3687
3737
  }
3688
3738
 
3689
- static bool llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) {
3739
+ // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
3740
+ static int llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) {
3690
3741
  try {
3691
3742
  llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);
3692
3743
 
@@ -3704,19 +3755,21 @@ static bool llama_model_load(const std::string & fname, llama_model & model, con
3704
3755
 
3705
3756
  if (params.vocab_only) {
3706
3757
  LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
3707
- return true;
3758
+ return 0;
3708
3759
  }
3709
3760
 
3710
- llm_load_tensors(
3761
+ if (!llm_load_tensors(
3711
3762
  ml, model, params.n_gpu_layers, params.main_gpu, params.tensor_split, params.use_mlock,
3712
3763
  params.progress_callback, params.progress_callback_user_data
3713
- );
3764
+ )) {
3765
+ return -2;
3766
+ }
3714
3767
  } catch (const std::exception & err) {
3715
3768
  LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
3716
- return false;
3769
+ return -1;
3717
3770
  }
3718
3771
 
3719
- return true;
3772
+ return 0;
3720
3773
  }
3721
3774
 
3722
3775
  //
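
Note on the hunk above: llama_model_load now returns an int status (0 success, -1 error, -2 cancelled) instead of a bool, and llm_load_tensors can abort early when the progress callback requests cancellation. A minimal stand-alone sketch of how an application could use this, assuming the llama_progress_callback typedef now returns bool (the `return true;` added to the default callback later in this diff suggests so); this is illustrative, not library code:

    // sketch: cancelling a model load from the progress callback
    #include "llama.h"
    #include <cstdio>

    // assumption: returning false from the callback cancels the load (-2 internally)
    static bool stop_half_way(float progress, void * /*user_data*/) {
        return progress < 0.5f;
    }

    int main(int argc, char ** argv) {
        if (argc < 2) { return 1; }
        llama_backend_init(false);
        llama_model_params mparams = llama_model_default_params();
        mparams.progress_callback = stop_half_way;
        llama_model * model = llama_load_model_from_file(argv[1], mparams);
        int ret = 0;
        if (model == nullptr) {
            // both -1 (error) and -2 (cancelled) surface here as a null model
            std::fprintf(stderr, "model load failed or was cancelled\n");
            ret = 1;
        } else {
            llama_free_model(model);
        }
        llama_backend_free();
        return ret;
    }
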
@@ -3981,17 +4034,18 @@ static struct ggml_tensor * llm_build_ffn(
3981
4034
  // if max_alibi_bias > 0 then apply ALiBi
3982
4035
  static struct ggml_tensor * llm_build_kqv(
3983
4036
  struct ggml_context * ctx,
4037
+ const llama_model & model,
3984
4038
  const llama_hparams & hparams,
3985
4039
  const llama_kv_cache & kv,
3986
4040
  struct ggml_tensor * wo,
3987
4041
  struct ggml_tensor * wo_b,
3988
4042
  struct ggml_tensor * q_cur,
3989
- struct ggml_tensor * kq_scale,
3990
4043
  struct ggml_tensor * kq_mask,
3991
4044
  int64_t n_ctx,
3992
4045
  int32_t n_tokens,
3993
4046
  int32_t n_kv,
3994
4047
  float max_alibi_bias,
4048
+ float kq_scale,
3995
4049
  const llm_build_cb & cb,
3996
4050
  int il) {
3997
4051
  const int64_t n_embd = hparams.n_embd;
@@ -4014,6 +4068,12 @@ static struct ggml_tensor * llm_build_kqv(
4014
4068
  struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
4015
4069
  cb(kq, "kq", il);
4016
4070
 
4071
+ if (model.arch == LLM_ARCH_PHI2) {
4072
+ // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
4073
+ // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
4074
+ ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
4075
+ }
4076
+
4017
4077
  if (max_alibi_bias > 0.0f) {
4018
4078
  // temporary branch until we figure out how to handle ggml_alibi through ggml_add
4019
4079
  kq = ggml_scale(ctx, kq, kq_scale);
@@ -4033,7 +4093,7 @@ static struct ggml_tensor * llm_build_kqv(
4033
4093
  kq = ggml_soft_max(ctx, kq);
4034
4094
  cb(kq, "kq_soft_max", il);
4035
4095
  } else {
4036
- kq = ggml_soft_max_ext(ctx, kq, kq_mask, 1.0f/sqrtf(float(n_embd_head)));
4096
+ kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale);
4037
4097
  cb(kq, "kq_soft_max_ext", il);
4038
4098
  }
4039
4099
 
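
Note: the kq_scale float that callers now pass in place of the old KQ_scale tensor is the standard scaled-dot-product attention factor, 1/sqrt(n_embd_head), folded directly into ggml_soft_max_ext (or applied via ggml_scale on the ALiBi path). A tiny stand-alone sketch of the value, not library code:

    // sketch: the scalar that replaces the KQ_scale input tensor
    #include <cmath>
    #include <cstdio>

    int main() {
        const int   n_embd_head = 128;  // e.g. a 4096/32 head layout, purely illustrative
        const float kq_scale    = 1.0f / std::sqrt((float) n_embd_head);
        std::printf("kq_scale = %f\n", kq_scale);  // ~0.088388
        return 0;
    }
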
@@ -4102,7 +4162,7 @@ struct llm_build_context {
4102
4162
 
4103
4163
  const llm_build_cb & cb;
4104
4164
 
4105
- llama_buffer & buf_compute;
4165
+ std::vector<uint8_t> & buf_compute_meta;
4106
4166
 
4107
4167
  struct ggml_context * ctx0 = nullptr;
4108
4168
 
@@ -4112,35 +4172,35 @@ struct llm_build_context {
4112
4172
  const llama_batch & batch,
4113
4173
  const llm_build_cb & cb,
4114
4174
  bool worst_case) :
4115
- model (lctx.model),
4116
- hparams (model.hparams),
4117
- cparams (lctx.cparams),
4118
- batch (batch),
4119
- kv_self (lctx.kv_self),
4120
- n_embd (hparams.n_embd),
4121
- n_layer (hparams.n_layer),
4122
- n_ctx (cparams.n_ctx),
4123
- n_head (hparams.n_head),
4124
- n_head_kv (hparams.n_head_kv),
4125
- n_embd_head (hparams.n_embd_head()),
4126
- n_embd_gqa (hparams.n_embd_gqa()),
4127
- n_expert (hparams.n_expert),
4128
- n_expert_used (hparams.n_expert_used),
4129
- freq_base (cparams.rope_freq_base),
4130
- freq_scale (cparams.rope_freq_scale),
4131
- ext_factor (cparams.yarn_ext_factor),
4132
- attn_factor (cparams.yarn_attn_factor),
4133
- beta_fast (cparams.yarn_beta_fast),
4134
- beta_slow (cparams.yarn_beta_slow),
4135
- norm_eps (hparams.f_norm_eps),
4136
- norm_rms_eps (hparams.f_norm_rms_eps),
4137
- n_tokens (batch.n_tokens),
4138
- n_kv (worst_case ? n_ctx : kv_self.n),
4139
- kv_head (worst_case ? n_ctx - n_tokens : kv_self.head),
4140
- n_orig_ctx (cparams.n_yarn_orig_ctx),
4141
- do_rope_shift (worst_case || kv_self.has_shift),
4142
- cb (cb),
4143
- buf_compute (lctx.buf_compute) {
4175
+ model (lctx.model),
4176
+ hparams (model.hparams),
4177
+ cparams (lctx.cparams),
4178
+ batch (batch),
4179
+ kv_self (lctx.kv_self),
4180
+ n_embd (hparams.n_embd),
4181
+ n_layer (hparams.n_layer),
4182
+ n_ctx (cparams.n_ctx),
4183
+ n_head (hparams.n_head),
4184
+ n_head_kv (hparams.n_head_kv),
4185
+ n_embd_head (hparams.n_embd_head()),
4186
+ n_embd_gqa (hparams.n_embd_gqa()),
4187
+ n_expert (hparams.n_expert),
4188
+ n_expert_used (hparams.n_expert_used),
4189
+ freq_base (cparams.rope_freq_base),
4190
+ freq_scale (cparams.rope_freq_scale),
4191
+ ext_factor (cparams.yarn_ext_factor),
4192
+ attn_factor (cparams.yarn_attn_factor),
4193
+ beta_fast (cparams.yarn_beta_fast),
4194
+ beta_slow (cparams.yarn_beta_slow),
4195
+ norm_eps (hparams.f_norm_eps),
4196
+ norm_rms_eps (hparams.f_norm_rms_eps),
4197
+ n_tokens (batch.n_tokens),
4198
+ n_kv (worst_case ? n_ctx : kv_self.n),
4199
+ kv_head (worst_case ? n_ctx - n_tokens : kv_self.head),
4200
+ n_orig_ctx (cparams.n_yarn_orig_ctx),
4201
+ do_rope_shift (worst_case || kv_self.has_shift),
4202
+ cb (cb),
4203
+ buf_compute_meta (lctx.buf_compute_meta) {
4144
4204
  GGML_ASSERT(!!kv_self.ctx);
4145
4205
 
4146
4206
  // all initializations should be done in init()
@@ -4148,8 +4208,8 @@ struct llm_build_context {
4148
4208
 
4149
4209
  void init() {
4150
4210
  struct ggml_init_params params = {
4151
- /*.mem_size =*/ buf_compute.size,
4152
- /*.mem_buffer =*/ buf_compute.data,
4211
+ /*.mem_size =*/ buf_compute_meta.size(),
4212
+ /*.mem_buffer =*/ buf_compute_meta.data(),
4153
4213
  /*.no_alloc =*/ true,
4154
4214
  };
4155
4215
 
@@ -4178,10 +4238,6 @@ struct llm_build_context {
4178
4238
  struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
4179
4239
  cb(inp_pos, "inp_pos", -1);
4180
4240
 
4181
- // KQ_scale
4182
- struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
4183
- cb(KQ_scale, "KQ_scale", -1);
4184
-
4185
4241
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
4186
4242
  struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
4187
4243
  cb(KQ_mask, "KQ_mask", -1);
@@ -4240,9 +4296,9 @@ struct llm_build_context {
4240
4296
 
4241
4297
  llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
4242
4298
 
4243
- cur = llm_build_kqv(ctx0, hparams, kv_self,
4299
+ cur = llm_build_kqv(ctx0, model, hparams, kv_self,
4244
4300
  model.layers[il].wo, model.layers[il].bo,
4245
- Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
4301
+ Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
4246
4302
  cb(cur, "kqv_out", il);
4247
4303
  }
4248
4304
 
@@ -4363,10 +4419,6 @@ struct llm_build_context {
4363
4419
  struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
4364
4420
  cb(inp_pos, "inp_pos", -1);
4365
4421
 
4366
- // KQ_scale
4367
- struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
4368
- cb(KQ_scale, "KQ_scale", -1);
4369
-
4370
4422
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
4371
4423
  struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
4372
4424
  cb(KQ_mask, "KQ_mask", -1);
@@ -4423,9 +4475,9 @@ struct llm_build_context {
4423
4475
  // apply ALiBi for 13B model
4424
4476
  const float max_alibi_bias = model.type == MODEL_13B ? 8.0f : -1.0f;
4425
4477
 
4426
- cur = llm_build_kqv(ctx0, hparams, kv_self,
4478
+ cur = llm_build_kqv(ctx0, model, hparams, kv_self,
4427
4479
  model.layers[il].wo, NULL,
4428
- Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, cb, il);
4480
+ Qcur, KQ_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
4429
4481
  cb(cur, "kqv_out", il);
4430
4482
  }
4431
4483
 
@@ -4483,10 +4535,6 @@ struct llm_build_context {
4483
4535
  struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
4484
4536
  cb(inp_pos, "inp_pos", -1);
4485
4537
 
4486
- // KQ_scale
4487
- struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
4488
- cb(KQ_scale, "KQ_scale", -1);
4489
-
4490
4538
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
4491
4539
  struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
4492
4540
  cb(KQ_mask, "KQ_mask", -1);
@@ -4547,9 +4595,9 @@ struct llm_build_context {
4547
4595
 
4548
4596
  llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
4549
4597
 
4550
- cur = llm_build_kqv(ctx0, hparams, kv_self,
4598
+ cur = llm_build_kqv(ctx0, model, hparams, kv_self,
4551
4599
  model.layers[il].wo, NULL,
4552
- Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
4600
+ Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
4553
4601
  cb(cur, "kqv_out", il);
4554
4602
  }
4555
4603
 
@@ -4606,10 +4654,6 @@ struct llm_build_context {
4606
4654
  struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
4607
4655
  cb(inp_pos, "inp_pos", -1);
4608
4656
 
4609
- // KQ_scale
4610
- struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
4611
- cb(KQ_scale, "KQ_scale", -1);
4612
-
4613
4657
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
4614
4658
  struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
4615
4659
  cb(KQ_mask, "KQ_mask", -1);
@@ -4647,9 +4691,9 @@ struct llm_build_context {
4647
4691
 
4648
4692
  llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
4649
4693
 
4650
- cur = llm_build_kqv(ctx0, hparams, kv_self,
4694
+ cur = llm_build_kqv(ctx0, model, hparams, kv_self,
4651
4695
  model.layers[il].wo, model.layers[il].bo,
4652
- Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
4696
+ Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
4653
4697
  cb(cur, "kqv_out", il);
4654
4698
  }
4655
4699
 
@@ -4706,10 +4750,6 @@ struct llm_build_context {
4706
4750
  struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
4707
4751
  cb(inp_pos, "inp_pos", -1);
4708
4752
 
4709
- // KQ_scale
4710
- struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
4711
- cb(KQ_scale, "KQ_scale", -1);
4712
-
4713
4753
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
4714
4754
  struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
4715
4755
  cb(KQ_mask, "KQ_mask", -1);
@@ -4856,9 +4896,9 @@ struct llm_build_context {
4856
4896
  llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
4857
4897
 
4858
4898
  // TODO: not tested, could be broken
4859
- cur = llm_build_kqv(ctx0, hparams, kv_self,
4899
+ cur = llm_build_kqv(ctx0, model, hparams, kv_self,
4860
4900
  model.layers[il].wo, model.layers[il].bo,
4861
- Q, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
4901
+ Q, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
4862
4902
  cb(cur, "kqv_out", il);
4863
4903
  }
4864
4904
 
@@ -4912,10 +4952,6 @@ struct llm_build_context {
4912
4952
  inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
4913
4953
  cb(inpL, "inp_embd", -1);
4914
4954
 
4915
- // KQ_scale
4916
- struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
4917
- cb(KQ_scale, "KQ_scale", -1);
4918
-
4919
4955
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
4920
4956
  struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
4921
4957
  cb(KQ_mask, "KQ_mask", -1);
@@ -4947,9 +4983,9 @@ struct llm_build_context {
4947
4983
 
4948
4984
  llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
4949
4985
 
4950
- cur = llm_build_kqv(ctx0, hparams, kv_self,
4986
+ cur = llm_build_kqv(ctx0, model, hparams, kv_self,
4951
4987
  model.layers[il].wo, NULL,
4952
- Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, cb, il);
4988
+ Qcur, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
4953
4989
  cb(cur, "kqv_out", il);
4954
4990
  }
4955
4991
 
@@ -5003,10 +5039,6 @@ struct llm_build_context {
5003
5039
  inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
5004
5040
  cb(inpL, "inp_embd", -1);
5005
5041
 
5006
- // KQ_scale
5007
- struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
5008
- cb(KQ_scale, "KQ_scale", -1);
5009
-
5010
5042
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5011
5043
  struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
5012
5044
  cb(KQ_mask, "KQ_mask", -1);
@@ -5044,9 +5076,9 @@ struct llm_build_context {
5044
5076
 
5045
5077
  llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
5046
5078
 
5047
- cur = llm_build_kqv(ctx0, hparams, kv_self,
5079
+ cur = llm_build_kqv(ctx0, model, hparams, kv_self,
5048
5080
  model.layers[il].wo, model.layers[il].bo,
5049
- Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, cb, il);
5081
+ Qcur, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5050
5082
  cb(cur, "kqv_out", il);
5051
5083
  }
5052
5084
 
@@ -5097,10 +5129,6 @@ struct llm_build_context {
5097
5129
  inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
5098
5130
  cb(inpL, "inp_embd", -1);
5099
5131
 
5100
- // KQ_scale
5101
- struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
5102
- cb(KQ_scale, "KQ_scale", -1);
5103
-
5104
5132
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5105
5133
  struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
5106
5134
  cb(KQ_mask, "KQ_mask", -1);
@@ -5138,9 +5166,9 @@ struct llm_build_context {
5138
5166
 
5139
5167
  llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
5140
5168
 
5141
- cur = llm_build_kqv(ctx0, hparams, kv_self,
5169
+ cur = llm_build_kqv(ctx0, model, hparams, kv_self,
5142
5170
  model.layers[il].wo, NULL,
5143
- Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, hparams.f_max_alibi_bias, cb, il);
5171
+ Qcur, KQ_mask, n_ctx, n_tokens, n_kv, hparams.f_max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5144
5172
  cb(cur, "kqv_out", il);
5145
5173
  }
5146
5174
 
@@ -5200,10 +5228,6 @@ struct llm_build_context {
5200
5228
  struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
5201
5229
  cb(inp_pos, "inp_pos", -1);
5202
5230
 
5203
- // KQ_scale
5204
- struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
5205
- cb(KQ_scale, "KQ_scale", -1);
5206
-
5207
5231
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5208
5232
  struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
5209
5233
  cb(KQ_mask, "KQ_mask", -1);
@@ -5251,9 +5275,9 @@ struct llm_build_context {
5251
5275
 
5252
5276
  llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
5253
5277
 
5254
- cur = llm_build_kqv(ctx0, hparams, kv_self,
5278
+ cur = llm_build_kqv(ctx0, model, hparams, kv_self,
5255
5279
  model.layers[il].wo, NULL,
5256
- Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
5280
+ Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5257
5281
  cb(cur, "kqv_out", il);
5258
5282
  }
5259
5283
 
@@ -5310,15 +5334,11 @@ struct llm_build_context {
5310
5334
  cb(inpL, "inp_embd", -1);
5311
5335
 
5312
5336
  // inp_pos - contains the positions
5313
- struct ggml_tensor * inp_pos= ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
5337
+ struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
5314
5338
  cb(inp_pos, "inp_pos", -1);
5315
5339
 
5316
- // KQ_scale
5317
- struct ggml_tensor * KQ_scale= ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
5318
- cb(KQ_scale, "KQ_scale", -1);
5319
-
5320
5340
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5321
- struct ggml_tensor * KQ_mask= ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
5341
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
5322
5342
  cb(KQ_mask, "KQ_mask", -1);
5323
5343
 
5324
5344
  // shift the entire K-cache if needed
@@ -5368,9 +5388,9 @@ struct llm_build_context {
5368
5388
 
5369
5389
  llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
5370
5390
 
5371
- cur = llm_build_kqv(ctx0, hparams, kv_self,
5391
+ cur = llm_build_kqv(ctx0, model, hparams, kv_self,
5372
5392
  model.layers[il].wo, NULL,
5373
- Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
5393
+ Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5374
5394
  cb(cur, "kqv_out", il);
5375
5395
  }
5376
5396
 
@@ -5412,6 +5432,116 @@ struct llm_build_context {
5412
5432
 
5413
5433
  ggml_build_forward_expand(gf, cur);
5414
5434
 
5435
+ return gf;
5436
+ }
5437
+ struct ggml_cgraph * build_phi2() {
5438
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
5439
+
5440
+ struct ggml_tensor * cur;
5441
+ struct ggml_tensor * attn_norm_output;
5442
+ struct ggml_tensor * ffn_output;
5443
+ struct ggml_tensor * inpL;
5444
+
5445
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
5446
+ cb(inpL, "inp_embd", -1);
5447
+
5448
+ // inp_pos - contains the positions
5449
+ struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
5450
+ cb(inp_pos, "inp_pos", -1);
5451
+
5452
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5453
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
5454
+ cb(KQ_mask, "KQ_mask", -1);
5455
+
5456
+ // shift the entire K-cache if needed
5457
+ if (do_rope_shift) {
5458
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb);
5459
+ }
5460
+
5461
+ for (int il = 0; il < n_layer; ++il) {
5462
+ attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
5463
+ model.layers[il].attn_norm,
5464
+ model.layers[il].attn_norm_b,
5465
+ LLM_NORM, cb, il);
5466
+ cb(attn_norm_output, "attn_norm", il);
5467
+
5468
+ // self-attention
5469
+ {
5470
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output);
5471
+ cb(cur, "wqkv", il);
5472
+
5473
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
5474
+ cb(cur, "bqkv", il);
5475
+
5476
+ struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
5477
+ struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
5478
+ struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
5479
+
5480
+ cb(Qcur, "Qcur", il);
5481
+ cb(Kcur, "Kcur", il);
5482
+ cb(Vcur, "Vcur", il);
5483
+
5484
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
5485
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
5486
+
5487
+ Qcur = ggml_rope_custom(
5488
+ ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
5489
+ freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
5490
+ );
5491
+ cb(Qcur, "Qcur", il);
5492
+
5493
+ // with phi2, we scale the Q to avoid precision issues
5494
+ // ref: https://github.com/ml-explore/mlx-examples/blob/08e862336ade809bc37d1035f94b359e7d1a5152/phi2/phi2.py#L64-L66
5495
+ Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head)));
5496
+ cb(Qcur, "Qcur", il);
5497
+
5498
+ Kcur = ggml_rope_custom(
5499
+ ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
5500
+ freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
5501
+ );
5502
+ cb(Kcur, "Kcur", il);
5503
+
5504
+ llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
5505
+
5506
+ cur = llm_build_kqv(ctx0, model, hparams, kv_self,
5507
+ model.layers[il].wo, model.layers[il].bo,
5508
+ Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f, cb, il);
5509
+ cb(cur, "kqv_out", il);
5510
+ }
5511
+
5512
+ // FF
5513
+ {
5514
+ ffn_output = llm_build_ffn(ctx0, attn_norm_output,
5515
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b,
5516
+ NULL, NULL,
5517
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b,
5518
+ LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
5519
+ cb(ffn_output, "ffn_out", il);
5520
+ }
5521
+
5522
+ cur = ggml_add(ctx0, cur, ffn_output);
5523
+ cb(cur, "l_out", il);
5524
+
5525
+ cur = ggml_add(ctx0, cur, inpL);
5526
+ cb(cur, "l_out", il);
5527
+
5528
+ inpL = cur;
5529
+ }
5530
+
5531
+ cur = llm_build_norm(ctx0, inpL, hparams,
5532
+ model.output_norm,
5533
+ model.output_norm_b,
5534
+ LLM_NORM, cb, -1);
5535
+ cb(cur, "result_norm", -1);
5536
+
5537
+ cur = ggml_mul_mat(ctx0, model.output, cur);
5538
+ cb(cur, "result_output_no_bias", -1);
5539
+
5540
+ cur = ggml_add(ctx0, cur, model.output_b);
5541
+ cb(cur, "result_output", -1);
5542
+
5543
+ ggml_build_forward_expand(gf, cur);
5544
+
5415
5545
  return gf;
5416
5546
  }
5417
5547
  };
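
Note on build_phi2 above: Qcur is pre-scaled by 1/sqrt(n_embd_head) right after RoPE, so llm_build_kqv is invoked with kq_scale = 1.0f; the KQ product is therefore scaled exactly once, just earlier in the graph, which is the precision workaround referenced in the comments. A small numeric sketch of that equivalence (the head size 80 assumes phi-2's 2560/32 layout and is illustrative only):

    // sketch: scaling Q before the matmul vs scaling the KQ product afterwards
    #include <cmath>
    #include <cstdio>

    int main() {
        const int   n_embd_head = 80;  // assumed phi-2 head size (2560 / 32)
        const float scale       = 1.0f / std::sqrt((float) n_embd_head);

        const float q = 0.7f, k = -1.3f;
        const float pre_scaled  = (q * scale) * k;  // phi-2 path: Q scaled up front, kq_scale = 1.0f
        const float post_scaled = (q * k) * scale;  // other archs: kq_scale = 1/sqrt(n_embd_head)

        std::printf("%f vs %f\n", pre_scaled, post_scaled);  // identical up to float rounding
        return 0;
    }
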
@@ -5427,7 +5557,7 @@ enum llm_offload_func_e {
5427
5557
  OFFLOAD_FUNC_FRC, // force offload
5428
5558
  OFFLOAD_FUNC_KQV,
5429
5559
  OFFLOAD_FUNC_NR,
5430
- OFFLOAD_FUNC_EMB,
5560
+ OFFLOAD_FUNC_EMB, // embeddings
5431
5561
  OFFLOAD_FUNC_OUT,
5432
5562
  };
5433
5563
 
@@ -5512,7 +5642,6 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
5512
5642
  { "pos_embd", OFFLOAD_FUNC_NR },
5513
5643
 
5514
5644
  { "inp_pos", OFFLOAD_FUNC_FRC }, // this is often used for KQ ops (e.g. rope)
5515
- { "KQ_scale", OFFLOAD_FUNC_FRC },
5516
5645
  { "KQ_mask", OFFLOAD_FUNC_FRC },
5517
5646
  { "K_shift", OFFLOAD_FUNC_FRC },
5518
5647
 
@@ -5596,6 +5725,7 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
5596
5725
  { "l_out", OFFLOAD_FUNC },
5597
5726
 
5598
5727
  { "result_norm", OFFLOAD_FUNC_EMB },
5728
+ { "result_output_no_bias", OFFLOAD_FUNC_EMB },
5599
5729
  { "result_output", OFFLOAD_FUNC_OUT },
5600
5730
  };
5601
5731
 
@@ -5613,11 +5743,10 @@ static struct ggml_cgraph * llama_build_graph(
5613
5743
  bool alloc_inp_tokens = false;
5614
5744
  bool alloc_inp_embd = false;
5615
5745
  bool alloc_inp_pos = false;
5616
- bool alloc_inp_KQ_scale = false;
5617
5746
  bool alloc_inp_KQ_mask = false;
5618
5747
  bool alloc_inp_K_shift = false;
5619
5748
 
5620
- #ifdef GGML_USE_CUBLAS
5749
+ #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
5621
5750
  const bool do_offload = true;
5622
5751
  #else
5623
5752
  const bool do_offload = true; // TODO: set to false after finishing refactoring
@@ -5645,7 +5774,7 @@ static struct ggml_cgraph * llama_build_graph(
5645
5774
  if (!ggml_allocr_is_measure(lctx.alloc) && batch.token) {
5646
5775
  const int64_t n_tokens = cur->ne[0];
5647
5776
 
5648
- memcpy(cur->data, batch.token, n_tokens*ggml_element_size(cur));
5777
+ ggml_backend_tensor_set(cur, batch.token, 0, n_tokens*ggml_element_size(cur));
5649
5778
  }
5650
5779
 
5651
5780
  alloc_inp_tokens = true;
@@ -5658,7 +5787,7 @@ static struct ggml_cgraph * llama_build_graph(
5658
5787
  const int64_t n_embd = cur->ne[0];
5659
5788
  const int64_t n_tokens = cur->ne[1];
5660
5789
 
5661
- memcpy(cur->data, batch.embd, n_tokens*n_embd*ggml_element_size(cur));
5790
+ ggml_backend_tensor_set(cur, batch.embd, 0, n_tokens*n_embd*ggml_element_size(cur));
5662
5791
  }
5663
5792
 
5664
5793
  alloc_inp_embd = true;
@@ -5670,27 +5799,13 @@ static struct ggml_cgraph * llama_build_graph(
5670
5799
  if (!ggml_allocr_is_measure(lctx.alloc) && batch.pos) {
5671
5800
  const int64_t n_tokens = cur->ne[0];
5672
5801
 
5673
- int32_t * data = (int32_t *) cur->data;
5674
-
5675
- for (int i = 0; i < n_tokens; ++i) {
5676
- data[i] = batch.pos[i];
5677
- }
5802
+ static_assert(std::is_same<llama_pos, int32_t>::value, "llama_pos must be int32_t");
5803
+ ggml_backend_tensor_set(cur, batch.pos, 0, n_tokens*ggml_element_size(cur));
5678
5804
  }
5679
5805
 
5680
5806
  alloc_inp_pos = true;
5681
5807
  }
5682
5808
 
5683
- if (!alloc_inp_KQ_scale && strcmp(name, "KQ_scale") == 0) {
5684
- ggml_allocr_alloc(lctx.alloc, cur);
5685
-
5686
- if (!ggml_allocr_is_measure(lctx.alloc)) {
5687
- const int64_t n_embd_head = model.hparams.n_embd_head();
5688
- ggml_set_f32(cur, 1.0f/sqrtf(float(n_embd_head)));
5689
- }
5690
-
5691
- alloc_inp_KQ_scale = true;
5692
- }
5693
-
5694
5809
  if (!alloc_inp_KQ_mask && strcmp(name, "KQ_mask") == 0) {
5695
5810
  ggml_allocr_alloc(lctx.alloc, cur);
5696
5811
 
@@ -5698,8 +5813,13 @@ static struct ggml_cgraph * llama_build_graph(
5698
5813
  const int64_t n_kv = cur->ne[0];
5699
5814
  const int64_t n_tokens = cur->ne[1];
5700
5815
 
5701
- float * data = (float *) cur->data;
5702
- memset(data, 0, ggml_nbytes(cur));
5816
+ float * data;
5817
+ if (ggml_backend_buffer_is_host(cur->buffer)) {
5818
+ data = (float *) cur->data;
5819
+ } else {
5820
+ lctx.buf_copy.resize(ggml_nbytes(cur));
5821
+ data = (float *) lctx.buf_copy.data();
5822
+ }
5703
5823
 
5704
5824
  for (int h = 0; h < 1; ++h) {
5705
5825
  for (int j = 0; j < n_tokens; ++j) {
@@ -5707,12 +5827,20 @@ static struct ggml_cgraph * llama_build_graph(
5707
5827
  const llama_seq_id seq_id = batch.seq_id[j][0];
5708
5828
 
5709
5829
  for (int i = 0; i < n_kv; ++i) {
5830
+ float f;
5710
5831
  if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
5711
- data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
5832
+ f = -INFINITY;
5833
+ } else {
5834
+ f = 0;
5712
5835
  }
5836
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
5713
5837
  }
5714
5838
  }
5715
5839
  }
5840
+
5841
+ if (data != cur->data) {
5842
+ ggml_backend_tensor_set(cur, data, 0, ggml_nbytes(cur));
5843
+ }
5716
5844
  }
5717
5845
 
5718
5846
  alloc_inp_KQ_mask = true;
@@ -5724,11 +5852,21 @@ static struct ggml_cgraph * llama_build_graph(
5724
5852
  if (!ggml_allocr_is_measure(lctx.alloc)) {
5725
5853
  const int64_t n_ctx = cur->ne[0];
5726
5854
 
5727
- int32_t * data = (int32_t *) cur->data;
5855
+ int32_t * data;
5856
+ if (ggml_backend_buffer_is_host(cur->buffer)) {
5857
+ data = (int32_t *) cur->data;
5858
+ } else {
5859
+ lctx.buf_copy.resize(ggml_nbytes(cur));
5860
+ data = (int32_t *) lctx.buf_copy.data();
5861
+ }
5728
5862
 
5729
5863
  for (int i = 0; i < n_ctx; ++i) {
5730
5864
  data[i] = lctx.kv_self.cells[i].delta;
5731
5865
  }
5866
+
5867
+ if (data != cur->data) {
5868
+ ggml_backend_tensor_set(cur, data, 0, ggml_nbytes(cur));
5869
+ }
5732
5870
  }
5733
5871
 
5734
5872
  alloc_inp_K_shift = true;
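
Note: the KQ_mask and K_shift branches above share one pattern for filling graph inputs now that the tensor may live in device memory: write straight into cur->data when the backend buffer is host-visible, otherwise fill a temporary host buffer (lctx.buf_copy) and upload it with ggml_backend_tensor_set. A condensed sketch of the pattern; the helper name is invented, the ggml-backend calls are the ones used above:

    // sketch: host-or-staging write for graph input tensors
    #include <cstdint>
    #include <vector>
    #include <functional>
    #include "ggml.h"
    #include "ggml-backend.h"

    // hypothetical helper, not part of llama.cpp
    static void set_input(struct ggml_tensor * t, std::vector<uint8_t> & staging,
                          const std::function<void(void * dst)> & fill) {
        void * dst;
        if (ggml_backend_buffer_is_host(t->buffer)) {
            dst = t->data;                    // backend memory is directly addressable
        } else {
            staging.resize(ggml_nbytes(t));   // e.g. a CUDA buffer: stage on the host first
            dst = staging.data();
        }
        fill(dst);
        if (dst != t->data) {
            ggml_backend_tensor_set(t, dst, 0, ggml_nbytes(t));  // upload the staged copy
        }
    }
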
@@ -5765,7 +5903,7 @@ static struct ggml_cgraph * llama_build_graph(
5765
5903
  static const std::unordered_map<llm_offload_func_e, std::string, std::hash<int>> k_offload_func_name = {
5766
5904
  { OFFLOAD_FUNC_NOP, "CPU" },
5767
5905
  { OFFLOAD_FUNC_OUT, "CPU" },
5768
- #ifdef GGML_USE_CUBLAS
5906
+ #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
5769
5907
  { OFFLOAD_FUNC, "GPU (CUDA)" },
5770
5908
  { OFFLOAD_FUNC_FRC, "GPU (CUDA) FRC" },
5771
5909
  { OFFLOAD_FUNC_KQV, "GPU (CUDA) KQV" },
@@ -5838,7 +5976,7 @@ static struct ggml_cgraph * llama_build_graph(
5838
5976
  offload_func_t func = ggml_offload_nop;
5839
5977
 
5840
5978
  // this is needed for compatibility with Metal for example
5841
- #ifdef GGML_USE_CUBLAS
5979
+ #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
5842
5980
  static offload_func_t ggml_offload_gpu = ggml_cuda_assign_buffers_no_alloc;
5843
5981
  #else
5844
5982
  static offload_func_t ggml_offload_gpu = ggml_offload_nop;
@@ -5912,6 +6050,10 @@ static struct ggml_cgraph * llama_build_graph(
5912
6050
  {
5913
6051
  result = llm.build_qwen();
5914
6052
  } break;
6053
+ case LLM_ARCH_PHI2:
6054
+ {
6055
+ result = llm.build_phi2();
6056
+ } break;
5915
6057
  default:
5916
6058
  GGML_ASSERT(false);
5917
6059
  }
@@ -6045,18 +6187,23 @@ static int llama_decode_internal(
6045
6187
 
6046
6188
  ggml_allocr_alloc_graph(lctx.alloc, gf);
6047
6189
 
6048
- struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
6049
- struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
6050
-
6051
- GGML_ASSERT(strcmp(res->name, "result_output") == 0);
6052
- GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
6190
+ // the output is always the last tensor in the graph
6191
+ struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
6192
+ GGML_ASSERT(strcmp(res->name, "result_output") == 0);
6053
6193
 
6194
+ // the embeddings could be the second to last tensor, or the third to last tensor
6195
+ struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
6196
+ if (strcmp(embeddings->name, "result_norm") != 0) {
6197
+ embeddings = gf->nodes[gf->n_nodes - 3];
6198
+ GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
6199
+ }
6054
6200
 
6055
- #ifdef GGML_USE_CUBLAS
6201
+ #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
6202
+ char * buf_alloc_base = (char *)ggml_backend_buffer_get_base(lctx.buf_alloc);
6056
6203
  for (int i = 0; i < gf->n_leafs; i++) {
6057
6204
  ggml_tensor * node = gf->leafs[i];
6058
6205
  if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) {
6059
- ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) lctx.buf_alloc.data);
6206
+ ggml_cuda_assign_scratch_offset(node, (char *)node->data - buf_alloc_base);
6060
6207
  ggml_cuda_copy_to_device(node);
6061
6208
  }
6062
6209
  }
@@ -6064,7 +6211,7 @@ static int llama_decode_internal(
6064
6211
  for (int i = 0; i < gf->n_nodes; i++) {
6065
6212
  ggml_tensor * node = gf->nodes[i];
6066
6213
  if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) {
6067
- ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) lctx.buf_alloc.data);
6214
+ ggml_cuda_assign_scratch_offset(node, (char *)node->data - buf_alloc_base);
6068
6215
  }
6069
6216
  }
6070
6217
 
@@ -6091,23 +6238,23 @@ static int llama_decode_internal(
6091
6238
  n_threads = 1;
6092
6239
  }
6093
6240
 
6094
- #if GGML_USE_MPI
6241
+ #ifdef GGML_USE_MPI
6095
6242
  const int64_t n_layer = hparams.n_layer;
6096
6243
  ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
6097
6244
  #endif
6098
6245
 
6099
6246
  #ifdef GGML_USE_METAL
6100
- if (lctx.ctx_metal) {
6101
- ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
6102
- ggml_metal_graph_compute(lctx.ctx_metal, gf);
6103
- } else {
6104
- ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
6247
+ if (ggml_backend_is_metal(lctx.backend)) {
6248
+ ggml_backend_metal_set_n_cb(lctx.backend, n_threads);
6105
6249
  }
6106
- #else
6107
- ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
6108
6250
  #endif
6109
6251
 
6110
- #if GGML_USE_MPI
6252
+ if (ggml_backend_is_cpu(lctx.backend)) {
6253
+ ggml_backend_cpu_set_n_threads(lctx.backend, n_threads);
6254
+ }
6255
+ ggml_backend_graph_compute(lctx.backend, gf);
6256
+
6257
+ #ifdef GGML_USE_MPI
6111
6258
  ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
6112
6259
  #endif
6113
6260
 
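
Note: graph execution above is now a thin dispatch through ggml-backend: set the thread count on a CPU backend (or the command-buffer count on Metal) and hand the whole graph to ggml_backend_graph_compute. The same dispatch in isolation, assuming the ggml-backend API exactly as called above:

    // sketch: backend-agnostic graph execution
    #include "ggml.h"
    #include "ggml-backend.h"
    #ifdef GGML_USE_METAL
    #include "ggml-metal.h"
    #endif

    static void compute(ggml_backend_t backend, struct ggml_cgraph * gf, int n_threads) {
        if (ggml_backend_is_cpu(backend)) {
            ggml_backend_cpu_set_n_threads(backend, n_threads);
        }
    #ifdef GGML_USE_METAL
        if (ggml_backend_is_metal(backend)) {
            ggml_backend_metal_set_n_cb(backend, n_threads);
        }
    #endif
        ggml_backend_graph_compute(backend, gf);
    }
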
@@ -6145,20 +6292,37 @@ static int llama_decode_internal(
6145
6292
  {
6146
6293
  auto & logits_out = lctx.logits;
6147
6294
 
6295
+ #ifndef NDEBUG
6296
+ auto & logits_valid = lctx.logits_valid;
6297
+ logits_valid.clear();
6298
+ logits_valid.resize(n_tokens);
6299
+
6300
+ logits_out.clear();
6301
+ #endif
6302
+
6148
6303
  if (batch.logits) {
6149
6304
  logits_out.resize(n_vocab * n_tokens);
6150
6305
  for (uint32_t i = 0; i < n_tokens; i++) {
6151
6306
  if (batch.logits[i] == 0) {
6152
6307
  continue;
6153
6308
  }
6154
- memcpy(logits_out.data() + (n_vocab*i), (float *) ggml_get_data(res) + (n_vocab*i), sizeof(float)*n_vocab);
6309
+ ggml_backend_tensor_get(res, logits_out.data() + (n_vocab*i), (n_vocab*i)*sizeof(float), n_vocab*sizeof(float));
6310
+ #ifndef NDEBUG
6311
+ logits_valid[i] = true;
6312
+ #endif
6155
6313
  }
6156
6314
  } else if (lctx.logits_all) {
6157
6315
  logits_out.resize(n_vocab * n_tokens);
6158
- memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*n_tokens);
6316
+ ggml_backend_tensor_get(res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float));
6317
+ #ifndef NDEBUG
6318
+ std::fill(logits_valid.begin(), logits_valid.end(), true);
6319
+ #endif
6159
6320
  } else {
6160
6321
  logits_out.resize(n_vocab);
6161
- memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(n_tokens - 1)), sizeof(float)*n_vocab);
6322
+ ggml_backend_tensor_get(res, logits_out.data(), (n_vocab*(n_tokens - 1))*sizeof(float), n_vocab*sizeof(float));
6323
+ #ifndef NDEBUG
6324
+ logits_valid[0] = true;
6325
+ #endif
6162
6326
  }
6163
6327
  }
6164
6328
 
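
Note: because the result tensors may now sit in device memory, logits and embeddings are copied out with ggml_backend_tensor_get, whose offset and size arguments are byte counts rather than element counts. A one-function sketch of reading a single row of logits; the helper is illustrative, not library code:

    // sketch: copy row i of the [n_vocab, n_tokens] logits tensor to the host
    #include <cstdint>
    #include <vector>
    #include "ggml.h"
    #include "ggml-backend.h"

    static void get_logits_row(struct ggml_tensor * res, int64_t n_vocab, int64_t i,
                               std::vector<float> & out) {
        out.resize(n_vocab);
        // offsets and sizes in bytes, matching llama_decode_internal above
        ggml_backend_tensor_get(res, out.data(), i*n_vocab*sizeof(float), n_vocab*sizeof(float));
    }
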
@@ -6167,7 +6331,7 @@ static int llama_decode_internal(
6167
6331
  auto & embedding_out = lctx.embedding;
6168
6332
 
6169
6333
  embedding_out.resize(n_embd);
6170
- memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(n_tokens - 1)), sizeof(float)*n_embd);
6334
+ ggml_backend_tensor_get(embeddings, embedding_out.data(), (n_embd*(n_tokens - 1))*sizeof(float), n_embd*sizeof(float));
6171
6335
  }
6172
6336
 
6173
6337
  // measure the performance only for the single-token evals
@@ -8125,12 +8289,6 @@ void llama_beam_search(llama_context * ctx,
8125
8289
  // quantization
8126
8290
  //
8127
8291
 
8128
- template <typename T>
8129
- struct no_init {
8130
- T value;
8131
- no_init() { /* do nothing */ }
8132
- };
8133
-
8134
8292
  struct quantize_state_internal {
8135
8293
  const llama_model & model;
8136
8294
  const llama_model_quantize_params * params;
@@ -8373,9 +8531,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
8373
8531
  #endif
8374
8532
 
8375
8533
  llama_model_loader ml(fname_inp, use_mmap, NULL);
8376
- if (ml.use_mmap) {
8377
- ml.mapping.reset(new llama_mmap(&ml.file, /* prefetch */ 0, ggml_is_numa()));
8378
- }
8534
+ ml.init_mapping(false); // no prefetching
8379
8535
 
8380
8536
  llama_model model;
8381
8537
  llm_load_arch(ml, model);
@@ -8621,74 +8777,63 @@ static int llama_apply_lora_from_file_internal(
8621
8777
 
8622
8778
  const int64_t t_start_lora_us = ggml_time_us();
8623
8779
 
8624
- auto fin = std::ifstream(path_lora, std::ios::binary);
8625
- if (!fin) {
8626
- LLAMA_LOG_ERROR("%s: failed to open '%s'\n", __func__, path_lora);
8627
- return 1;
8628
- }
8780
+ llama_file fin(path_lora, "rb");
8629
8781
 
8630
8782
  // verify magic and version
8631
8783
  {
8632
- uint32_t magic;
8633
- fin.read((char *) &magic, sizeof(magic));
8634
- uint32_t format_version;
8635
- fin.read((char *) &format_version, sizeof(format_version));
8784
+ uint32_t magic = fin.read_u32();
8785
+ if (magic != LLAMA_FILE_MAGIC_GGLA) {
8786
+ LLAMA_LOG_ERROR("%s: bad file magic\n", __func__);
8787
+ return 1;
8788
+ }
8636
8789
 
8790
+ uint32_t format_version = fin.read_u32();
8637
8791
  if (format_version != 1) {
8638
8792
  LLAMA_LOG_ERROR("%s: unsupported file version\n", __func__ );
8639
8793
  return 1;
8640
8794
  }
8641
8795
  }
8642
8796
 
8643
- int32_t lora_r;
8644
- int32_t lora_alpha;
8645
- fin.read((char *) &lora_r, sizeof(lora_r));
8646
- fin.read((char *) &lora_alpha, sizeof(lora_alpha));
8797
+ int32_t lora_r = fin.read_u32();
8798
+ int32_t lora_alpha = fin.read_u32();
8647
8799
  float scaling = scale * (float)lora_alpha / (float)lora_r;
8648
8800
 
8649
8801
  LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
8650
8802
 
8803
+ // create a name -> tensor map of the model to accelerate lookups
8804
+ // find the max tensor size to estimate the required temporary buffer size
8805
+ size_t max_tensor_size = 0;
8806
+ std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
8807
+ for (const auto & kv : model.tensors_by_name) {
8808
+ model_tensors.insert(kv);
8809
+ size_t f32_size = ggml_nelements(kv.second) * sizeof(float);
8810
+ max_tensor_size = std::max(max_tensor_size, f32_size);
8811
+ }
8812
+
8651
8813
  // create a temporary ggml context to store the lora tensors
8652
- // todo: calculate size from biggest possible tensor
8653
- std::vector<uint8_t> lora_buf(1024ull * 1024ull * 1024ull);
8814
+ // TODO: use ggml-alloc
8815
+ size_t lora_ctx_size = max_tensor_size * 3;
8816
+ LLAMA_LOG_INFO("%s: allocating %.f MB for lora temporary buffer\n", __func__, lora_ctx_size / 1024.0 / 1024.0);
8817
+ std::vector<uint8_t> lora_buf(lora_ctx_size);
8818
+
8654
8819
  struct ggml_init_params params;
8655
8820
  params.mem_size = lora_buf.size();
8656
8821
  params.mem_buffer = lora_buf.data();
8657
8822
  params.no_alloc = false;
8658
8823
 
8659
- ggml_context * lora_ctx = ggml_init(params);
8660
- std::unordered_map<std::string, struct ggml_tensor *> lora_tensors;
8824
+ using unique_context = std::unique_ptr<ggml_context, decltype(&ggml_free)>;
8661
8825
 
8662
- // create a name -> tensor map of the model to accelerate lookups
8663
- std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
8664
- for (const auto & kv : model.tensors_by_name) {
8665
- model_tensors.insert(kv);
8666
- }
8826
+ unique_context lora_ctx(nullptr, ggml_free);
8827
+ lora_ctx.reset(ggml_init(params));
8828
+ std::unordered_map<std::string, struct ggml_tensor *> lora_tensors;
8667
8829
 
8668
8830
  // load base model
8669
8831
  std::unique_ptr<llama_model_loader> ml;
8670
- ggml_context * base_ctx = NULL;
8671
- std::vector<uint8_t> base_buf;
8672
- if (path_base_model) {
8673
- LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
8674
- ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ NULL));
8675
-
8676
- size_t ctx_size;
8677
- size_t mmapped_size;
8678
- ml->calc_sizes(ctx_size, mmapped_size);
8679
- base_buf.resize(ctx_size);
8680
-
8681
- ggml_init_params base_params;
8682
- base_params.mem_size = base_buf.size();
8683
- base_params.mem_buffer = base_buf.data();
8684
- base_params.no_alloc = ml->use_mmap;
8685
8832
 
8686
- base_ctx = ggml_init(base_params);
8687
-
8688
- // maybe this should in llama_model_loader
8689
- if (ml->use_mmap) {
8690
- ml->mapping.reset(new llama_mmap(&ml->file, /* prefetch */ 0, ggml_is_numa()));
8691
- }
8833
+ if (path_base_model) {
8834
+ LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
8835
+ ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ nullptr));
8836
+ ml->init_mapping(false); // no prefetching
8692
8837
  }
8693
8838
 
8694
8839
  // read tensors and apply
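
Note on the hunk above: the adapter is applied as W = W + s*(B*A), where s is the scaling value computed from the file header as scale * lora_alpha / lora_r. A trivial stand-alone check of that arithmetic, with illustrative values only:

    // sketch: the LoRA scaling factor derived from the adapter header
    #include <cstdio>

    int main() {
        const int   lora_r     = 8;     // illustrative
        const int   lora_alpha = 16;    // illustrative
        const float scale      = 1.0f;  // user-supplied multiplier
        const float scaling    = scale * (float) lora_alpha / (float) lora_r;
        std::printf("scaling = %.2f\n", scaling);  // 2.00
        return 0;
    }
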
@@ -8698,27 +8843,35 @@ static int llama_apply_lora_from_file_internal(
8698
8843
  std::vector<uint8_t> work_buffer;
8699
8844
 
8700
8845
  while (true) {
8846
+ if (fin.tell() == fin.size) {
8847
+ // eof
8848
+ break;
8849
+ }
8850
+
8701
8851
  int32_t n_dims;
8702
- int32_t length;
8852
+ int32_t name_len;
8703
8853
  int32_t ftype;
8704
8854
 
8705
- fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
8706
- fin.read(reinterpret_cast<char *>(&length), sizeof(length));
8707
- fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
8708
- if (fin.eof()) {
8709
- break;
8855
+ fin.read_raw(&n_dims, sizeof(n_dims));
8856
+ fin.read_raw(&name_len, sizeof(name_len));
8857
+ fin.read_raw(&ftype, sizeof(ftype));
8858
+
8859
+ if (n_dims != 1 && n_dims != 2) {
8860
+ LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims);
8861
+ return 1;
8710
8862
  }
8711
8863
 
8712
8864
  int32_t ne[2] = { 1, 1 };
8713
8865
  for (int i = 0; i < n_dims; ++i) {
8714
- fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
8866
+ fin.read_raw(&ne[i], sizeof(ne[i]));
8715
8867
  }
8716
8868
 
8717
8869
  std::string name;
8718
8870
  {
8871
+ GGML_ASSERT(name_len <= 1024);
8719
8872
  char buf[1024];
8720
- fin.read(buf, length);
8721
- name = std::string(buf, length);
8873
+ fin.read_raw(buf, name_len);
8874
+ name = std::string(buf, name_len);
8722
8875
  }
8723
8876
 
8724
8877
  // check for lora suffix and get the type of tensor
@@ -8732,7 +8885,7 @@ static int llama_apply_lora_from_file_internal(
8732
8885
  std::string lora_type = name.substr(pos + lora_suffix.length());
8733
8886
  std::string base_name = name;
8734
8887
  base_name.erase(pos);
8735
- // LLAMA_LOG_INFO("%s: %s => %s (lora type %s) \n", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());
8888
+ // LLAMA_LOG_INFO("%s: %s => %s (lora type %s) \n", __func__, name.c_str(), base_name.c_str(), lora_type.c_str());
8736
8889
 
8737
8890
  if (model_tensors.find(base_name) == model_tensors.end()) {
8738
8891
  LLAMA_LOG_ERROR("%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
@@ -8751,22 +8904,15 @@ static int llama_apply_lora_from_file_internal(
8751
8904
  return false;
8752
8905
  }
8753
8906
  }
8754
- ggml_tensor * lora_tensor;
8755
- if (n_dims == 2) {
8756
- lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]);
8757
- }
8758
- else {
8759
- LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims);
8760
- return 1;
8761
- }
8762
- ggml_set_name(lora_tensor, "lora_tensor");
8907
+ ggml_tensor * lora_tensor = ggml_new_tensor_2d(lora_ctx.get(), wtype, ne[0], ne[1]);
8908
+ ggml_set_name(lora_tensor, name.c_str());
8763
8909
 
8764
8910
  // load tensor data
8765
- size_t offset = fin.tellg();
8911
+ size_t offset = fin.tell();
8766
8912
  size_t tensor_data_size = ggml_nbytes(lora_tensor);
8767
8913
  offset = (offset + 31) & -32;
8768
- fin.seekg(offset);
8769
- fin.read((char*)lora_tensor->data, tensor_data_size);
8914
+ fin.seek(offset, SEEK_SET);
8915
+ fin.read_raw(lora_tensor->data, tensor_data_size);
8770
8916
 
8771
8917
  lora_tensors[name] = lora_tensor;
8772
8918
 
@@ -8779,7 +8925,7 @@ static int llama_apply_lora_from_file_internal(
8779
8925
  offload_func_t offload_func = ggml_offload_nop;
8780
8926
  offload_func_t offload_func_force_inplace = ggml_offload_nop;
8781
8927
 
8782
- #ifdef GGML_USE_CUBLAS
8928
+ #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
8783
8929
  if (dest_t->backend == GGML_BACKEND_GPU || dest_t->backend == GGML_BACKEND_GPU_SPLIT) {
8784
8930
  if (dest_t->type != GGML_TYPE_F16) {
8785
8931
  throw std::runtime_error(format(
@@ -8796,13 +8942,11 @@ static int llama_apply_lora_from_file_internal(
8796
8942
 
8797
8943
  // load from base model
8798
8944
  if (gguf_find_tensor(ctx_gguf, base_name.c_str()) < 0) {
8799
- // TODO: throw
8800
8945
  LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
8801
8946
  return 1;
8802
8947
  }
8803
8948
 
8804
- // TODO: not tested!! maybe not working!
8805
- base_t = ml->create_tensor(base_ctx, base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU);
8949
+ base_t = ml->get_tensor_meta(base_name.c_str());
8806
8950
  ml->load_data_for(base_t);
8807
8951
  } else {
8808
8952
  base_t = dest_t;
@@ -8831,43 +8975,42 @@ static int llama_apply_lora_from_file_internal(
8831
8975
  }
8832
8976
 
8833
8977
  // w = w + BA*s
8834
- ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
8978
+ ggml_tensor * BA = ggml_mul_mat(lora_ctx.get(), loraA, loraB);
8835
8979
  offload_func(BA);
8836
8980
  ggml_set_name(BA, "BA");
8837
8981
 
8838
8982
  if (scaling != 1.0f) {
8839
- ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
8840
- ggml_set_name(scale_tensor, "scale_tensor");
8841
-
8842
- BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor);
8983
+ BA = ggml_scale_inplace(lora_ctx.get(), BA, scaling);
8843
8984
  offload_func(BA);
8844
8985
  ggml_set_name(BA, "BA_scaled");
8845
8986
  }
8846
8987
 
8847
8988
  ggml_tensor * r;
8848
8989
  if (base_t == dest_t) {
8849
- r = ggml_add_inplace(lora_ctx, dest_t, BA);
8990
+ r = ggml_add_inplace(lora_ctx.get(), dest_t, BA);
8850
8991
  offload_func_force_inplace(r);
8851
8992
  ggml_set_name(r, "r_add_inplace");
8852
8993
  }
8853
8994
  else {
8854
- r = ggml_add(lora_ctx, base_t, BA);
8995
+ r = ggml_add(lora_ctx.get(), base_t, BA);
8855
8996
  offload_func(r);
8856
8997
  ggml_set_name(r, "r_add");
8857
8998
 
8858
- r = ggml_cpy(lora_ctx, r, dest_t);
8999
+ r = ggml_cpy(lora_ctx.get(), r, dest_t);
8859
9000
  offload_func(r);
8860
9001
  ggml_set_name(r, "r_cpy");
8861
9002
  }
8862
9003
 
8863
- struct ggml_cgraph * gf = ggml_new_graph(lora_ctx);
9004
+ struct ggml_cgraph * gf = ggml_new_graph(lora_ctx.get());
8864
9005
  ggml_build_forward_expand(gf, r);
8865
9006
 
8866
9007
  ggml_graph_compute_helper(work_buffer, gf, n_threads);
8867
9008
 
9009
+ // the tensors in the adapter must be sorted such that loraA and loraB of the same tensor are next to each other
9010
+ GGML_ASSERT(lora_tensors.size() == 2);
9011
+
8868
9012
  // we won't need these tensors again, reset the context to save memory
8869
- ggml_free(lora_ctx);
8870
- lora_ctx = ggml_init(params);
9013
+ lora_ctx.reset(ggml_init(params));
8871
9014
  lora_tensors.clear();
8872
9015
 
8873
9016
  n_tensors++;
@@ -8877,12 +9020,6 @@ static int llama_apply_lora_from_file_internal(
8877
9020
  }
8878
9021
  }
8879
9022
 
8880
- // TODO: this should be in a destructor, it will leak on failure
8881
- ggml_free(lora_ctx);
8882
- if (base_ctx) {
8883
- ggml_free(base_ctx);
8884
- }
8885
-
8886
9023
  const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
8887
9024
  LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0);
8888
9025
 
@@ -9012,11 +9149,18 @@ struct llama_model * llama_load_model_from_file(
9012
9149
  LLAMA_LOG_INFO("\n");
9013
9150
  }
9014
9151
  }
9152
+ return true;
9015
9153
  };
9016
9154
  }
9017
9155
 
9018
- if (!llama_model_load(path_model, *model, params)) {
9019
- LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
9156
+ int status = llama_model_load(path_model, *model, params);
9157
+ GGML_ASSERT(status <= 0);
9158
+ if (status < 0) {
9159
+ if (status == -1) {
9160
+ LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
9161
+ } else if (status == -2) {
9162
+ LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
9163
+ }
9020
9164
  delete model;
9021
9165
  return nullptr;
9022
9166
  }
@@ -9091,7 +9235,39 @@ struct llama_context * llama_new_context_with_model(
9091
9235
 
9092
9236
  // reserve memory for context buffers
9093
9237
  if (!hparams.vocab_only) {
9094
- if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, type_k, type_v, cparams.n_ctx, model->n_gpu_layers, cparams.offload_kqv)) {
9238
+ // initialize backend
9239
+ #ifdef GGML_USE_METAL
9240
+ if (model->n_gpu_layers > 0) {
9241
+ ctx->backend = ggml_backend_metal_init();
9242
+ if (ctx->backend == nullptr) {
9243
+ LLAMA_LOG_ERROR("%s: failed to initialize Metal backend\n", __func__);
9244
+ }
9245
+ }
9246
+ #elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST)
9247
+ // for testing only
9248
+ if (model->n_gpu_layers > 0) {
9249
+ ctx->backend = ggml_backend_cuda_init(0);
9250
+ if (ctx->backend == nullptr) {
9251
+ LLAMA_LOG_ERROR("%s: failed to initialize CUDA backend\n", __func__);
9252
+ }
9253
+ }
9254
+ #endif
9255
+
9256
+ if (ctx->backend == nullptr && ggml_backend_buffer_is_host(model->buf)) {
9257
+ ctx->backend = ggml_backend_cpu_init();
9258
+ if (ctx->backend == nullptr) {
9259
+ LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
9260
+ }
9261
+ }
9262
+
9263
+ if (ctx->backend == nullptr) {
9264
+ LLAMA_LOG_ERROR("%s: failed to initialize a backend\n", __func__);
9265
+ delete ctx;
9266
+ return nullptr;
9267
+ }
9268
+
9269
+ if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, type_k, type_v,
9270
+ cparams.n_ctx, model->n_gpu_layers, cparams.offload_kqv)) {
9095
9271
  LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
9096
9272
  llama_free(ctx);
9097
9273
  return nullptr;
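
Note: the context now owns a ggml-backend object, chosen in the order shown above: a GPU backend when n_gpu_layers > 0 (Metal, or CUDA in the LLAMA_GGML_BACKEND_CUDA_TEST configuration), then a CPU backend as a fallback if the model weights live in host memory, and failure otherwise. A compact sketch of that selection; the wrapper function is hypothetical, the init calls are the ones used above:

    // sketch: backend selection for a new llama context
    #include "ggml-backend.h"
    #ifdef GGML_USE_METAL
    #include "ggml-metal.h"
    #endif

    static ggml_backend_t pick_backend(int n_gpu_layers, bool weights_on_host) {
        ggml_backend_t backend = nullptr;
        (void) n_gpu_layers;  // only consulted when a GPU backend is compiled in
    #ifdef GGML_USE_METAL
        if (n_gpu_layers > 0) {
            backend = ggml_backend_metal_init();
        }
    #endif
        if (backend == nullptr && weights_on_host) {
            backend = ggml_backend_cpu_init();
        }
        return backend;  // nullptr means the context cannot be created
    }
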
@@ -9127,12 +9303,11 @@ struct llama_context * llama_new_context_with_model(
9127
9303
  }
9128
9304
 
9129
9305
  {
9130
- static const size_t tensor_alignment = 32;
9131
9306
  // the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data
9132
- ctx->buf_compute.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead());
9307
+ ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead());
9133
9308
 
9134
9309
  // create measure allocator
9135
- ctx->alloc = ggml_allocr_new_measure(tensor_alignment);
9310
+ ctx->alloc = ggml_allocr_new_measure_from_backend(ctx->backend);
9136
9311
 
9137
9312
  // build worst-case graph
9138
9313
  int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch);
@@ -9140,98 +9315,50 @@ struct llama_context * llama_new_context_with_model(
9140
9315
  llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
9141
9316
  ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0));
9142
9317
 
9143
- #ifdef GGML_USE_METAL
9144
- if (model->n_gpu_layers > 0) {
9145
- ctx->ctx_metal = ggml_metal_init(1);
9146
- if (!ctx->ctx_metal) {
9147
- LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
9148
- llama_free(ctx);
9149
- return NULL;
9150
- }
9151
- //ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false);
9152
- //ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
9153
- }
9154
- #endif
9155
9318
  // measure memory requirements for the graph
9156
- size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
9319
+ size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf);
9157
9320
 
9158
- LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MiB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
9321
+ LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MiB\n", __func__, (ctx->buf_compute_meta.size() + alloc_size) / 1024.0 / 1024.0);
9159
9322
 
9160
- // recreate allocator with exact memory requirements
9323
+ // create allocator again with exact memory requirements
9161
9324
  ggml_allocr_free(ctx->alloc);
9162
9325
 
9163
- ctx->buf_alloc.resize(alloc_size);
9164
- ctx->alloc = ggml_allocr_new(ctx->buf_alloc.data, ctx->buf_alloc.size, tensor_alignment);
9165
- #ifdef GGML_USE_METAL
9166
- if (ctx->ctx_metal) {
9167
- //ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
9168
- }
9169
- #endif
9170
- #ifdef GGML_USE_CUBLAS
9171
- ggml_cuda_set_scratch_size(alloc_size);
9172
- LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0);
9326
+ ctx->buf_alloc = ggml_backend_alloc_buffer(ctx->backend, alloc_size);
9327
+ ctx->alloc = ggml_allocr_new_from_buffer(ctx->buf_alloc);
9328
+ #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
9329
+ if (model->n_gpu_layers > 0) {
9330
+ ggml_cuda_set_scratch_size(alloc_size);
9331
+ LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0);
9173
9332
 
9174
- // calculate total VRAM usage
9175
- auto add_tensor = [](const ggml_tensor * t, size_t & size) {
9176
- if (t->backend == GGML_BACKEND_GPU || t->backend == GGML_BACKEND_GPU_SPLIT) {
9177
- size += ggml_nbytes(t);
9333
+ // calculate total VRAM usage
9334
+ auto add_tensor = [](const ggml_tensor * t, size_t & size) {
9335
+ if (t->backend == GGML_BACKEND_GPU || t->backend == GGML_BACKEND_GPU_SPLIT) {
9336
+ size += ggml_nbytes(t);
9337
+ }
9338
+ };
9339
+ size_t model_vram_size = 0;
9340
+ for (const auto & kv : model->tensors_by_name) {
9341
+ add_tensor(kv.second, model_vram_size);
9178
9342
  }
9179
- };
9180
- size_t model_vram_size = 0;
9181
- for (const auto & kv : model->tensors_by_name) {
9182
- add_tensor(kv.second, model_vram_size);
9183
- }
9184
-
9185
- size_t kv_vram_size = 0;
9186
- for (auto & k : ctx->kv_self.k_l) {
9187
- add_tensor(k, kv_vram_size);
9188
- }
9189
- for (auto & v : ctx->kv_self.v_l) {
9190
- add_tensor(v, kv_vram_size);
9191
- }
9192
-
9193
- size_t ctx_vram_size = alloc_size + kv_vram_size;
9194
- size_t total_vram_size = model_vram_size + ctx_vram_size;
9195
9343
 
9196
- LLAMA_LOG_INFO("%s: total VRAM used: %.2f MiB (model: %.2f MiB, context: %.2f MiB)\n", __func__,
9197
- total_vram_size / 1024.0 / 1024.0,
9198
- model_vram_size / 1024.0 / 1024.0,
9199
- ctx_vram_size / 1024.0 / 1024.0);
9200
- #endif
9201
- }
9202
-
9203
- #ifdef GGML_USE_METAL
9204
- if (model->n_gpu_layers > 0) {
9205
- // this allocates all Metal resources and memory buffers
9206
-
9207
- void * data_ptr = NULL;
9208
- size_t data_size = 0;
9209
-
9210
- if (ctx->model.mapping) {
9211
- data_ptr = ctx->model.mapping->addr;
9212
- data_size = ctx->model.mapping->size;
9213
- } else {
9214
- data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
9215
- data_size = ggml_get_mem_size (ctx->model.ctx);
9216
- }
9217
-
9218
- const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
9344
+ size_t kv_vram_size = 0;
9345
+ for (auto & k : ctx->kv_self.k_l) {
9346
+ add_tensor(k, kv_vram_size);
9347
+ }
9348
+ for (auto & v : ctx->kv_self.v_l) {
9349
+ add_tensor(v, kv_vram_size);
9350
+ }
9219
9351
 
9220
- LLAMA_LOG_INFO("%s: max tensor size = %8.2f MiB\n", __func__, max_size/1024.0/1024.0);
9352
+ size_t ctx_vram_size = alloc_size + kv_vram_size;
9353
+ size_t total_vram_size = model_vram_size + ctx_vram_size;
9221
9354
 
9222
- #define LLAMA_METAL_CHECK_BUF(result) \
9223
- if (!(result)) { \
9224
- LLAMA_LOG_ERROR("%s: failed to add buffer\n", __func__); \
9225
- llama_free(ctx); \
9226
- return NULL; \
9355
+ LLAMA_LOG_INFO("%s: total VRAM used: %.2f MiB (model: %.2f MiB, context: %.2f MiB)\n", __func__,
9356
+ total_vram_size / 1024.0 / 1024.0,
9357
+ model_vram_size / 1024.0 / 1024.0,
9358
+ ctx_vram_size / 1024.0 / 1024.0);
9227
9359
  }
9228
-
9229
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
9230
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0));
9231
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.data, ctx->buf_alloc.size, 0));
9232
- #undef LLAMA_METAL_CHECK_BUF
9233
- }
9234
9360
  #endif
9361
+ }
9235
9362
  }
9236
9363
 
9237
9364
  #ifdef GGML_USE_MPI
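
Note: the compute-buffer setup above is a measure-then-allocate flow: a measuring allocator created from the backend sizes the worst-case graph, then a backend buffer of exactly that size backs the real allocator. The same flow in isolation, using the ggml-alloc / ggml-backend calls as above; the wrapper function itself is invented:

    // sketch: sizing and creating the compute allocator for one backend
    #include "ggml.h"
    #include "ggml-alloc.h"
    #include "ggml-backend.h"

    static struct ggml_allocr * make_compute_allocr(ggml_backend_t backend, struct ggml_cgraph * gf,
                                                    ggml_backend_buffer_t & buf_out) {
        // pass 1: measure the worst-case graph
        struct ggml_allocr * measure = ggml_allocr_new_measure_from_backend(backend);
        const size_t alloc_size = ggml_allocr_alloc_graph(measure, gf);
        ggml_allocr_free(measure);

        // pass 2: allocate exactly that much backend memory and build the real allocator
        buf_out = ggml_backend_alloc_buffer(backend, alloc_size);
        return ggml_allocr_new_from_buffer(buf_out);
    }
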
@@ -9259,10 +9386,14 @@ const llama_model * llama_get_model(const struct llama_context * ctx) {
9259
9386
  return &ctx->model;
9260
9387
  }
9261
9388
 
9262
- int llama_n_ctx(const struct llama_context * ctx) {
9389
+ uint32_t llama_n_ctx(const struct llama_context * ctx) {
9263
9390
  return ctx->cparams.n_ctx;
9264
9391
  }
9265
9392
 
9393
+ uint32_t llama_n_batch(const struct llama_context * ctx) {
9394
+ return ctx->cparams.n_batch;
9395
+ }
9396
+
9266
9397
  enum llama_vocab_type llama_vocab_type(const struct llama_model * model) {
9267
9398
  return model->vocab.type;
9268
9399
  }
@@ -9519,7 +9650,7 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
     const size_t s_embedding = ctx->embedding.size() * sizeof(float);
     const size_t s_kv_size   = sizeof(size_t);
     const size_t s_kv_ntok   = sizeof(int);
-    const size_t s_kv        = ctx->kv_self.buf.size;
+    const size_t s_kv        = ggml_backend_buffer_get_size(ctx->kv_self.buf);
 
     const size_t s_total = (
         + s_rng_size
@@ -9647,7 +9778,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
     const auto n_embd = hparams.n_embd_gqa();
     const auto n_ctx  = cparams.n_ctx;
 
-    const size_t kv_buf_size = kv_self.buf.size;
+    const size_t kv_buf_size = ggml_backend_buffer_get_size(kv_self.buf);
     const uint32_t kv_head = kv_self.head;
     const uint32_t kv_size = kv_self.size;
     const uint32_t kv_used = kv_self.used;
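In both places above the KV cache size now comes from ggml_backend_buffer_get_size(), because the cache is held in a ggml-backend buffer rather than a llama_buffer. The caller-facing state API is unchanged; a hypothetical snapshot helper (the matching restore sketch follows the llama_set_state_data() hunk further down):

#include "llama.h"
#include <vector>

// Hypothetical helper, not part of llama.cpp: serialize the full context state
// (RNG, logits, embeddings, KV cache) into a caller-owned buffer.
static std::vector<uint8_t> snapshot_state(llama_context * ctx) {
    std::vector<uint8_t> state(llama_get_state_size(ctx));
    llama_copy_state_data(ctx, state.data());
    return state;
}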
@@ -9663,17 +9794,12 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
     ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
     ggml_cgraph * gf = ggml_new_graph(cpy_ctx);
 
-    std::vector<std::vector<uint8_t>> kout2d_data(n_layer);
-    std::vector<std::vector<uint8_t>> vout2d_data(n_layer);
+    std::vector<struct ggml_tensor *> kout2d(n_layer);
+    std::vector<struct ggml_tensor *> vout2d(n_layer);
 
     for (int il = 0; il < (int) n_layer; ++il) {
-        ggml_tensor * kout2d = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
-        kout2d_data[il].resize(ggml_nbytes(kout2d));
-        kout2d->data = kout2d_data[il].data();
-
-        ggml_tensor * vout2d = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
-        vout2d_data[il].resize(ggml_nbytes(vout2d));
-        vout2d->data = vout2d_data[il].data();
+        kout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
+        vout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
 
         ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
             n_embd, kv_head,
@@ -9683,20 +9809,28 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
             kv_head, n_embd,
             elt_size*n_ctx, 0);
 
-        ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k2d, kout2d));
-        ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v2d, vout2d));
+        ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k2d, kout2d[il]));
+        ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v2d, vout2d[il]));
     }
 
-    ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1);
+    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(cpy_ctx, ctx->backend);
 
-    ggml_free(cpy_ctx);
+    ggml_backend_graph_compute(ctx->backend, gf);
+
+    std::vector<uint8_t> tmp_buf;
+    for (int il = 0; il < (int) n_layer; ++il) {
+        tmp_buf.resize(ggml_nbytes(kout2d[il]));
+        ggml_backend_tensor_get(kout2d[il], tmp_buf.data(), 0, tmp_buf.size());
+        data_ctx->write(tmp_buf.data(), tmp_buf.size());
 
-    // our data is now in the kout2d_data and vout2d_data buffers
-    // write them to file
-    for (uint32_t il = 0; il < n_layer; ++il) {
-        data_ctx->write(kout2d_data[il].data(), kout2d_data[il].size());
-        data_ctx->write(vout2d_data[il].data(), vout2d_data[il].size());
+        tmp_buf.resize(ggml_nbytes(vout2d[il]));
+        ggml_backend_tensor_get(vout2d[il], tmp_buf.data(), 0, tmp_buf.size());
+        data_ctx->write(tmp_buf.data(), tmp_buf.size());
     }
+
+    ggml_free(cpy_ctx);
+
+    ggml_backend_buffer_free(buf);
 }
 
 for (uint32_t i = 0; i < kv_size; ++i) {
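The save path above now builds the K/V gather graph in a no_alloc ggml context, allocates the staging tensors with ggml_backend_alloc_ctx_tensors(), runs it with ggml_backend_graph_compute(), and reads the results back through ggml_backend_tensor_get(), instead of pointing tensor->data at host vectors. A self-contained sketch of that pattern on the CPU backend (the shapes and the ggml_sqr op are arbitrary, not taken from the diff):

// Sketch of the ggml-backend round trip: no_alloc context -> backend buffer ->
// compute on the backend -> copy results back to host memory.
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include <cstdio>

int main() {
    ggml_backend_t backend = ggml_backend_cpu_init();

    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*ggml_tensor_overhead() + ggml_graph_overhead(),
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,   // tensor data will live in the backend buffer
    };
    ggml_context * ctx = ggml_init(params);
    ggml_cgraph  * gf  = ggml_new_graph(ctx);

    ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    ggml_tensor * y = ggml_sqr(ctx, x);
    ggml_build_forward_expand(gf, y);

    // allocate every tensor of the context in one backend buffer
    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);

    const float xs[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
    ggml_backend_tensor_set(x, xs, 0, sizeof(xs));

    ggml_backend_graph_compute(backend, gf);

    float ys[4];
    ggml_backend_tensor_get(y, ys, 0, sizeof(ys));
    printf("%.1f %.1f %.1f %.1f\n", ys[0], ys[1], ys[2], ys[3]);

    ggml_free(ctx);
    ggml_backend_buffer_free(buf);
    ggml_backend_free(backend);
    return 0;
}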
@@ -9794,21 +9928,19 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
     memcpy(&kv_used, inp, sizeof(kv_used)); inp += sizeof(kv_used);
 
     if (kv_buf_size) {
-        GGML_ASSERT(kv_self.buf.size == kv_buf_size);
+        GGML_ASSERT(ggml_backend_buffer_get_size(kv_self.buf) == kv_buf_size);
 
         const size_t elt_size = ggml_element_size(kv_self.k_l[0]);
 
         ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
         ggml_cgraph * gf = ggml_new_graph(cpy_ctx);
 
-        for (int il = 0; il < n_layer; ++il) {
-            ggml_tensor * kin2d = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
-            kin2d->data = (void *) inp;
-            inp += ggml_nbytes(kin2d);
+        std::vector<struct ggml_tensor *> kin2d(n_layer);
+        std::vector<struct ggml_tensor *> vin2d(n_layer);
 
-            ggml_tensor * vin2d = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
-            vin2d->data = (void *) inp;
-            inp += ggml_nbytes(vin2d);
+        for (int il = 0; il < n_layer; ++il) {
+            kin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
+            vin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
 
             ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
                 n_embd, kv_head,
@@ -9818,13 +9950,26 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
                 kv_head, n_embd,
                 elt_size*n_ctx, 0);
 
-            ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin2d, k2d));
-            ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin2d, v2d));
+            ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin2d[il], k2d));
+            ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin2d[il], v2d));
+        }
+
+        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(cpy_ctx, ctx->backend);
+
+        // load data into the tensors
+        for (int il = 0; il < n_layer; ++il) {
+            ggml_backend_tensor_set(kin2d[il], inp, 0, ggml_nbytes(kin2d[il]));
+            inp += ggml_nbytes(kin2d[il]);
+
+            ggml_backend_tensor_set(vin2d[il], inp, 0, ggml_nbytes(vin2d[il]));
+            inp += ggml_nbytes(vin2d[il]);
         }
 
-        ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1);
+        ggml_backend_graph_compute(ctx->backend, gf);
 
         ggml_free(cpy_ctx);
+
+        ggml_backend_buffer_free(buf);
     }
 
     ctx->kv_self.head = kv_head;
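On the load side, the serialized K/V blocks are now pushed into backend-resident staging tensors with ggml_backend_tensor_set() instead of aliasing host pointers via tensor->data. From the caller's perspective, llama_set_state_data() still just consumes the byte blob produced by llama_copy_state_data(); a hypothetical restore helper matching the snapshot sketch shown earlier:

#include "llama.h"
#include <vector>

// Hypothetical helper, not part of llama.cpp: rewind a context to a snapshot
// taken with llama_copy_state_data(). The blob must come from a context built
// with the same model and context parameters.
static bool restore_state(llama_context * ctx, std::vector<uint8_t> & state) {
    if (state.size() < llama_get_state_size(ctx)) {
        return false; // blob too small for this context
    }
    llama_set_state_data(ctx, state.data());
    return true;
}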
@@ -10047,6 +10192,7 @@ float * llama_get_logits(struct llama_context * ctx) {
 }
 
 float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
+    assert(ctx->logits_valid.at(i));
     return ctx->logits.data() + i*ctx->model.hparams.n_vocab;
 }
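llama_get_logits_ith() now asserts, in builds with assertions enabled, that logits were actually computed for position i, using the internal logits_valid bookkeeping. A caller-side sketch with placeholder token ids; with the llama_batch_get_one() helper only the last position of the batch carries logits, so that is the only index it queries:

#include "llama.h"
#include <cstdio>
#include <vector>

// Sketch: query logits only for the position that requested them.
static void print_top_logit(llama_context * ctx, std::vector<llama_token> & tokens) {
    // pos_0 = 0 and seq_id = 0 assume an empty KV cache; adjust for your use case
    llama_batch batch = llama_batch_get_one(tokens.data(), (int32_t) tokens.size(), 0, 0);
    if (llama_decode(ctx, batch) != 0) {
        fprintf(stderr, "llama_decode failed\n");
        return;
    }

    // valid: logits for the last token of this batch
    const float * logits  = llama_get_logits_ith(ctx, batch.n_tokens - 1);
    const int     n_vocab = llama_n_vocab(llama_get_model(ctx));

    int   best_id = 0;
    float best    = logits[0];
    for (int i = 1; i < n_vocab; ++i) {
        if (logits[i] > best) { best = logits[i]; best_id = i; }
    }
    printf("argmax token id: %d (logit %.3f)\n", best_id, best);
}

Asking for an index that did not request logits (or is out of range) now trips the assert in debug builds instead of silently returning stale values.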