llama_cpp 0.10.1 → 0.10.2

This diff shows the contents of publicly released package versions as published to a supported public registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.
@@ -1,11 +1,12 @@
 #define LLAMA_API_INTERNAL
+//#define LLAMA_GGML_BACKEND_CUDA_TEST // for testing only - enables ggml-cuda through ggml-backend, disables partial offloading
 #include "llama.h"
 
 #include "unicode.h"
 
 #include "ggml.h"
-
 #include "ggml-alloc.h"
+#include "ggml-backend.h"
 
 #ifdef GGML_USE_CUBLAS
 # include "ggml-cuda.h"
@@ -32,6 +33,7 @@
 #include <unistd.h>
 #if defined(_POSIX_MAPPED_FILES)
 #include <sys/mman.h>
+#include <fcntl.h>
 #endif
 #if defined(_POSIX_MEMLOCK_RANGE)
 #include <sys/resource.h>
@@ -195,6 +197,7 @@ enum llm_arch {
     LLM_ARCH_BLOOM,
     LLM_ARCH_STABLELM,
     LLM_ARCH_QWEN,
+    LLM_ARCH_PHI2,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -212,6 +215,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
     { LLM_ARCH_BLOOM, "bloom" },
     { LLM_ARCH_STABLELM, "stablelm" },
     { LLM_ARCH_QWEN, "qwen" },
+    { LLM_ARCH_PHI2, "phi2" },
 };
 
 enum llm_kv {
@@ -550,6 +554,19 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_PHI2,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
 
     {
         LLM_ARCH_UNKNOWN,
@@ -697,38 +714,6 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
 // llama helpers
 //
 
-inline void * llama_host_malloc(size_t n) {
-#ifdef GGML_USE_CUBLAS
-    if (ggml_cublas_loaded()) {
-        return ggml_cuda_host_malloc(n);
-    } else {
-        return malloc(n);
-    }
-#elif GGML_USE_METAL
-    return ggml_metal_host_malloc(n);
-#elif GGML_USE_CPU_HBM
-    return hbw_malloc(n);
-#else
-    return malloc(n);
-#endif
-}
-
-inline void llama_host_free(void * ptr) {
-#ifdef GGML_USE_CUBLAS
-    if (ggml_cublas_loaded()) {
-        return ggml_cuda_host_free(ptr);
-    } else {
-        return free(ptr);
-    }
-#elif GGML_USE_METAL
-    return ggml_metal_host_free(ptr);
-#elif GGML_USE_CPU_HBM
-    return hbw_free(ptr);
-#else
-    return free(ptr);
-#endif
-}
-
 #if defined(_WIN32)
 static std::string llama_format_win_err(DWORD err) {
     LPSTR buf;
@@ -743,40 +728,10 @@ static std::string llama_format_win_err(DWORD err) {
 }
 #endif
 
-struct llama_buffer {
-    void * data = NULL;
-    size_t size = 0;
-
-    // fallback to malloc / free
-    // useful in cases where CUDA can try to allocate PINNED memory
-    bool fallback = false;
-
-    void resize(size_t n) {
-        llama_host_free(data);
-
-        data = llama_host_malloc(n);
-        if (!data) {
-            fallback = true;
-            data = malloc(n);
-        } else {
-            fallback = false;
-        }
-
-        GGML_ASSERT(data);
-        size = n;
-    }
-
-    ~llama_buffer() {
-        if (data) {
-            if (fallback) { // NOLINT
-                free(data);
-            } else {
-                llama_host_free(data);
-            }
-        }
-
-        data = NULL;
-    }
+template <typename T>
+struct no_init {
+    T value;
+    no_init() { /* do nothing */ }
 };
 
  struct llama_file {
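
The `no_init<T>` wrapper that replaces `llama_buffer` above is small but deliberate: a `std::vector<uint8_t>` zero-fills every element on construction or `resize()`, which is wasted work for buffers that are about to be overwritten with tensor data anyway. Wrapping the element in a type whose default constructor does nothing opts out of that initialization. A minimal standalone sketch of the effect:

    // std::vector<uint8_t> memsets its storage; std::vector<no_init<uint8_t>>
    // leaves the bytes uninitialized because no_init's default constructor
    // does nothing.
    #include <cstdint>
    #include <vector>

    template <typename T>
    struct no_init {
        T value;
        no_init() { /* do nothing */ }
    };

    int main() {
        std::vector<uint8_t>          zeroed(1 << 20); // 1 MiB, zero-filled
        std::vector<no_init<uint8_t>> raw(1 << 20);    // 1 MiB, left uninitialized
        (void) zeroed.data();
        (void) raw.data();
        return 0;
    }

Later in this diff, `llama_context::buf_copy` and the loader's `read_buf` are both declared as `std::vector<no_init<uint8_t>>` for exactly this reason.
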
@@ -864,6 +819,9 @@ struct llama_mmap {
 #ifdef _POSIX_MAPPED_FILES
     static constexpr bool SUPPORTED = true;
 
+    // list of mapped fragments (first_offset, last_offset)
+    std::vector<std::pair<size_t, size_t>> mapped_fragments;
+
     llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) {
         size = file->size;
         int fd = fileno(file->fp);
@@ -871,17 +829,22 @@ struct llama_mmap {
         // prefetch/readahead impairs performance on NUMA systems
         if (numa) { prefetch = 0; }
 #ifdef __linux__
+        // advise the kernel to read the file sequentially (increases readahead)
+        if (posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL)) {
+            LLAMA_LOG_WARN("warning: posix_fadvise(.., POSIX_FADV_SEQUENTIAL) failed: %s\n",
+                    strerror(errno));
+        }
         if (prefetch) { flags |= MAP_POPULATE; }
 #endif
         addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
-        if (addr == MAP_FAILED) {
+        if (addr == MAP_FAILED) { // NOLINT
             throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
         }
 
         if (prefetch > 0) {
-            // Advise the kernel to preload the mapped memory
+            // advise the kernel to preload the mapped memory
             if (posix_madvise(addr, std::min(file->size, prefetch), POSIX_MADV_WILLNEED)) {
-                fprintf(stderr, "warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s\n",
+                LLAMA_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s\n",
                         strerror(errno));
             }
         }
@@ -889,14 +852,81 @@ struct llama_mmap {
             // advise the kernel not to use readahead
             // (because the next page might not belong on the same node)
             if (posix_madvise(addr, file->size, POSIX_MADV_RANDOM)) {
-                fprintf(stderr, "warning: posix_madvise(.., POSIX_MADV_RANDOM) failed: %s\n",
+                LLAMA_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_RANDOM) failed: %s\n",
                         strerror(errno));
             }
         }
+
+        // initialize list of mapped_fragments
+        mapped_fragments.emplace_back(0, file->size);
+    }
+
+    static void align_range(size_t * first, size_t * last, size_t page_size) {
+        // align first to the next page
+        size_t offset_in_page = *first & (page_size - 1);
+        size_t offset_to_page = offset_in_page == 0 ? 0 : page_size - offset_in_page;
+        *first += offset_to_page;
+
+        // align last to the previous page
+        *last = *last & ~(page_size - 1);
+
+        if (*last <= *first) {
+            *last = *first;
+        }
+    }
+
+    // partially unmap the file in the range [first, last)
+    void unmap_fragment(size_t first, size_t last) {
+        // note: this function must not be called multiple times with overlapping ranges
+        // otherwise, there is a risk of invalidating addresses that have been repurposed for other mappings
+        int page_size = sysconf(_SC_PAGESIZE);
+        align_range(&first, &last, page_size);
+        size_t len = last - first;
+
+        if (len == 0) {
+            return;
+        }
+
+        GGML_ASSERT(first % page_size == 0);
+        GGML_ASSERT(last % page_size == 0);
+        GGML_ASSERT(last > first);
+
+        void * next_page_start = (uint8_t *) addr + first;
+
+        // unmap the range
+        if (munmap(next_page_start, len)) {
+            LLAMA_LOG_WARN("warning: munmap failed: %s\n", strerror(errno));
+        }
+
+        // update the list of mapped fragments to avoid unmapping the same range again in the destructor
+        std::vector<std::pair<size_t, size_t>> new_mapped_fragments;
+        for (const auto & frag : mapped_fragments) {
+            if (frag.first < first && frag.second > last) {
+                // the range is in the middle of the fragment, split it
+                new_mapped_fragments.emplace_back(frag.first, first);
+                new_mapped_fragments.emplace_back(last, frag.second);
+            } else if (frag.first < first && frag.second > first) {
+                // the range starts in the middle of the fragment
+                new_mapped_fragments.emplace_back(frag.first, first);
+            } else if (frag.first < last && frag.second > last) {
+                // the range ends in the middle of the fragment
+                new_mapped_fragments.emplace_back(last, frag.second);
+            } else if (frag.first >= first && frag.second <= last) {
+                // the range covers the entire fragment
+            } else {
+                // the range is outside the fragment
+                new_mapped_fragments.push_back(frag);
+            }
+        }
+        mapped_fragments = std::move(new_mapped_fragments);
     }
 
     ~llama_mmap() {
-        munmap(addr, size);
+        for (const auto & frag : mapped_fragments) {
+            if (munmap((char *) addr + frag.first, frag.second - frag.first)) {
+                LLAMA_LOG_WARN("warning: munmap failed: %s\n", strerror(errno));
+            }
+        }
     }
 #elif defined(_WIN32)
  static constexpr bool SUPPORTED = true;
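
The new `align_range` helper shrinks the requested range inward — `first` rounds up to the next page boundary, `last` rounds down to the previous one — so that `unmap_fragment` only ever releases pages that lie entirely inside the range; pages shared with data that must stay mapped are left alone, and a sub-page range collapses to empty. A standalone check of the arithmetic, assuming a 4096-byte page:

    #include <cassert>
    #include <cstddef>

    static void align_range(size_t * first, size_t * last, size_t page_size) {
        size_t offset_in_page = *first & (page_size - 1);
        *first += offset_in_page == 0 ? 0 : page_size - offset_in_page; // round up
        *last  &= ~(page_size - 1);                                     // round down
        if (*last <= *first) {
            *last = *first; // empty range: nothing can be unmapped safely
        }
    }

    int main() {
        size_t first = 100, last = 9000;
        align_range(&first, &last, 4096);
        assert(first == 4096 && last == 8192); // only the fully covered page survives

        first = 100; last = 4000; // range smaller than one page
        align_range(&first, &last, 4096);
        assert(first == last);    // collapses to empty, munmap is skipped
        return 0;
    }
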
@@ -939,6 +969,12 @@ struct llama_mmap {
 #endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8
     }
 
+    void unmap_fragment(size_t first, size_t last) {
+        // not supported
+        GGML_UNUSED(first);
+        GGML_UNUSED(last);
+    }
+
     ~llama_mmap() {
         if (!UnmapViewOfFile(addr)) {
             fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n",
@@ -955,6 +991,13 @@ struct llama_mmap {
 
         throw std::runtime_error(std::string("mmap not supported"));
     }
+
+    void unmap(size_t offset, size_t len) {
+        (void) offset;
+        (void) len;
+
+        throw std::runtime_error(std::string("mmap not supported"));
+    }
 #endif
 };
 
@@ -1128,6 +1171,26 @@ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_
     return std::string(result.data(), result.size());
 }
 
+static ggml_backend_buffer_type_t llama_default_buffer_type(int n_gpu_layers) {
+#ifdef GGML_USE_METAL
+    if (n_gpu_layers > 0) {
+        return ggml_backend_metal_buffer_type();
+    }
+#elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST)
+    if (n_gpu_layers > 0) {
+        return ggml_backend_cuda_buffer_type(0);
+    }
+#elif defined(GGML_USE_CUBLAS)
+    return ggml_backend_cuda_host_buffer_type();
+#elif defined(GGML_USE_CPU_HBM)
+    return ggml_backend_cpu_hbm_buffer_type();
+#endif
+
+    return ggml_backend_cpu_buffer_type();
+
+    GGML_UNUSED(n_gpu_layers);
+}
+
 //
 // globals
  //
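
Two details of `llama_default_buffer_type` are easy to misread. The unconditional `return ggml_backend_cpu_buffer_type();` is the fallthrough for every configuration that did not return earlier, and the `GGML_UNUSED(n_gpu_layers)` placed after it exists only to silence unused-parameter warnings in builds where the preprocessor strips every use of the parameter. (Separately, note that the mmap-unsupported branch above declares `unmap(size_t, size_t)` rather than `unmap_fragment`; since that branch only throws, the naming mismatch is harmless.) The dispatch pattern in isolation, with illustrative stand-in names rather than the ggml API:

    #include <cstdio>

    enum class buft { cpu, metal, cuda };

    static buft default_buffer_type(int n_gpu_layers) {
    #if defined(DEMO_USE_METAL)
        if (n_gpu_layers > 0) {
            return buft::metal;
        }
    #elif defined(DEMO_USE_CUDA)
        if (n_gpu_layers > 0) {
            return buft::cuda;
        }
    #endif
        return buft::cpu;    // fallthrough: CPU-only builds and n_gpu_layers == 0

        (void) n_gpu_layers; // unreachable on purpose; quiets -Wunused-parameter
                             // when neither DEMO_USE_* macro is defined
    }

    int main() {
        printf("%d\n", (int) default_buffer_type(0)); // prints 0 (cpu)
        return 0;
    }
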
@@ -1328,14 +1391,10 @@ struct llama_kv_cache {
 
     struct ggml_context * ctx = NULL;
 
-    llama_buffer buf;
+    ggml_backend_buffer_t buf = NULL;
 
     ~llama_kv_cache() {
-        if (ctx) {
-            ggml_free(ctx);
-        }
-
-#ifdef GGML_USE_CUBLAS
+#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
         if (ggml_cublas_loaded()) {
             for (size_t i = 0; i < k_l.size(); ++i) {
                 ggml_cuda_free_data(k_l[i]);
@@ -1343,6 +1402,11 @@ struct llama_kv_cache {
             }
         }
 #endif
+        if (ctx) {
+            ggml_free(ctx);
+        }
+
+        ggml_backend_buffer_free(buf);
     }
 };
 
@@ -1382,11 +1446,11 @@ struct llama_vocab {
     id special_suffix_id = 32008;
     id special_eot_id = 32010;
 
-    int find_bpe_rank(std::string token_left, std::string token_right) const {
-        GGML_ASSERT(token_left.find(" ") == std::string::npos);
-        GGML_ASSERT(token_left.find("\n") == std::string::npos);
-        GGML_ASSERT(token_right.find(" ") == std::string::npos);
-        GGML_ASSERT(token_right.find("\n") == std::string::npos);
+    int find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
+        GGML_ASSERT(token_left.find(' ') == std::string::npos);
+        GGML_ASSERT(token_left.find('\n') == std::string::npos);
+        GGML_ASSERT(token_right.find(' ') == std::string::npos);
+        GGML_ASSERT(token_right.find('\n') == std::string::npos);
 
         auto it = bpe_ranks.find(std::make_pair(token_left, token_right));
@@ -1415,6 +1479,7 @@ struct llama_model {
1415
1479
  struct ggml_tensor * output_norm;
1416
1480
  struct ggml_tensor * output_norm_b;
1417
1481
  struct ggml_tensor * output;
1482
+ struct ggml_tensor * output_b;
1418
1483
 
1419
1484
  std::vector<llama_layer> layers;
1420
1485
 
@@ -1427,7 +1492,7 @@ struct llama_model {
1427
1492
  struct ggml_context * ctx = NULL;
1428
1493
 
1429
1494
  // the model memory buffer
1430
- llama_buffer buf;
1495
+ ggml_backend_buffer_t buf = NULL;
1431
1496
 
1432
1497
  // model memory mapped file
1433
1498
  std::unique_ptr<llama_mmap> mapping;
@@ -1443,11 +1508,7 @@ struct llama_model {
1443
1508
  int64_t t_start_us = 0;
1444
1509
 
1445
1510
  ~llama_model() {
1446
- if (ctx) {
1447
- ggml_free(ctx);
1448
- }
1449
-
1450
- #ifdef GGML_USE_CUBLAS
1511
+ #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
1451
1512
  if (ggml_cublas_loaded()) {
1452
1513
  for (size_t i = 0; i < tensors_by_name.size(); ++i) {
1453
1514
  ggml_cuda_free_data(tensors_by_name[i].second);
@@ -1461,24 +1522,26 @@ struct llama_model {
1461
1522
  ggml_cl_free_data(tensors_by_name[i].second);
1462
1523
  }
1463
1524
  #endif
1525
+ if (ctx) {
1526
+ ggml_free(ctx);
1527
+ }
1528
+
1529
+ ggml_backend_buffer_free(buf);
1464
1530
  }
1465
1531
  };
1466
1532
 
1467
1533
  struct llama_context {
1468
1534
  llama_context(const llama_model & model) : model(model), t_start_us(model.t_start_us), t_load_us(model.t_load_us) {}
1469
1535
  ~llama_context() {
1470
- #ifdef GGML_USE_METAL
1471
- if (ctx_metal) {
1472
- ggml_metal_free(ctx_metal);
1473
- }
1474
- #endif
1475
- if (alloc) {
1476
- ggml_allocr_free(alloc);
1477
- }
1536
+ ggml_allocr_free(alloc);
1537
+ ggml_backend_buffer_free(buf_alloc);
1538
+ ggml_backend_free(backend);
1478
1539
  }
1479
1540
 
1480
1541
  llama_cparams cparams;
1481
1542
 
1543
+ ggml_backend_t backend = nullptr;
1544
+
1482
1545
  const llama_model & model;
1483
1546
 
1484
1547
  // key + value cache for the self attention
@@ -1500,23 +1563,22 @@ struct llama_context {
1500
1563
 
1501
1564
  // decode output (2-dimensional array: [n_tokens][n_vocab])
1502
1565
  std::vector<float> logits;
1566
+ #ifndef NDEBUG
1567
+ // guard against access to unset logits
1568
+ std::vector<bool> logits_valid;
1569
+ #endif
1503
1570
  bool logits_all = false;
1504
1571
 
1505
1572
  // input embedding (1-dimensional array: [n_embd])
1506
1573
  std::vector<float> embedding;
1507
1574
 
1508
- // reusable buffer for `struct ggml_graph_plan.work_data`
1509
- std::vector<uint8_t> work_buffer;
1510
-
1511
1575
  // memory buffers used to evaluate the model
1512
- llama_buffer buf_compute;
1513
-
1514
- llama_buffer buf_alloc;
1576
+ std::vector<uint8_t> buf_compute_meta;
1577
+ ggml_backend_buffer_t buf_alloc = NULL;
1515
1578
  ggml_allocr * alloc = NULL;
1516
1579
 
1517
- #ifdef GGML_USE_METAL
1518
- ggml_metal_context * ctx_metal = NULL;
1519
- #endif
1580
+ // temporary buffer for copying data to/from the backend
1581
+ std::vector<no_init<uint8_t>> buf_copy;
1520
1582
 
1521
1583
  #ifdef GGML_USE_MPI
1522
1584
  ggml_mpi_context * ctx_mpi = NULL;
@@ -1538,9 +1600,6 @@ static bool llama_kv_cache_init(
     const uint32_t n_embd = hparams.n_embd_gqa();
     const uint32_t n_layer = hparams.n_layer;
 
-    const int64_t n_mem = n_layer*n_ctx;
-    const int64_t n_elements = n_embd*n_mem;
-
     cache.has_shift = false;
 
     cache.head = 0;
@@ -1550,13 +1609,10 @@ static bool llama_kv_cache_init(
     cache.cells.clear();
     cache.cells.resize(n_ctx);
 
-    cache.buf.resize(ggml_row_size(ktype, n_elements) + ggml_row_size(vtype, n_elements) + 2u*n_layer*ggml_tensor_overhead());
-    memset(cache.buf.data, 0, cache.buf.size);
-
     struct ggml_init_params params;
-    params.mem_size   = cache.buf.size;
-    params.mem_buffer = cache.buf.data;
-    params.no_alloc   = false;
+    params.mem_size   = 2u*n_layer*ggml_tensor_overhead();
+    params.mem_buffer = NULL;
+    params.no_alloc   = true;
 
     cache.ctx = ggml_init(params);
 
@@ -1570,9 +1626,7 @@ static bool llama_kv_cache_init(
     cache.k_l.reserve(n_layer);
     cache.v_l.reserve(n_layer);
 
-    const int i_gpu_start = (int) n_layer - n_gpu_layers; GGML_UNUSED(i_gpu_start);
-
-    GGML_UNUSED(offload);
+    const int i_gpu_start = (int) n_layer - n_gpu_layers;
 
     for (int i = 0; i < (int) n_layer; i++) {
         ggml_tensor * k = ggml_new_tensor_1d(cache.ctx, ktype, n_embd*n_ctx);
@@ -1581,23 +1635,35 @@ static bool llama_kv_cache_init(
         ggml_format_name(v, "cache_v_l%d", i);
         cache.k_l.push_back(k);
         cache.v_l.push_back(v);
-#ifdef GGML_USE_CUBLAS
+#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
         if (i >= i_gpu_start) {
            if (offload) {
                ggml_cuda_assign_buffers_no_scratch(k);
-                vram_kv_cache += ggml_nbytes(k);
                ggml_cuda_assign_buffers_no_scratch(v);
+                vram_kv_cache += ggml_nbytes(k);
                vram_kv_cache += ggml_nbytes(v);
+                // HACK: mark tensor as allocated
+                k->data = v->data = (void *)(uintptr_t)1;
            }
        }
 #endif // GGML_USE_CUBLAS
     }
 
+    // allocate tensors
+    cache.buf = ggml_backend_alloc_ctx_tensors_from_buft(cache.ctx, llama_default_buffer_type(n_gpu_layers));
+
+    // buf may be NULL with full offload
+    if (cache.buf) {
+        // initialize the buffer to avoid NaNs in the padding
+        ggml_backend_buffer_clear(cache.buf, 0);
+    }
+
     if (vram_kv_cache > 0) {
         LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
     }
 
-    GGML_UNUSED(n_gpu_layers);
+    GGML_UNUSED(i_gpu_start);
+    GGML_UNUSED(offload);
 
     return true;
  }
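
The KV-cache initialization above is the first user of the new ggml-backend allocation idiom in this file: the `ggml_context` is sized for tensor metadata only (`no_alloc = true`), tensors are created unallocated, and `ggml_backend_alloc_ctx_tensors_from_buft` then places all of them into a single backend buffer. A hedged standalone sketch, assuming the ggml headers from this same checkout:

    #include "ggml.h"
    #include "ggml-alloc.h"
    #include "ggml-backend.h"

    int main() {
        const int n_tensors = 4;

        struct ggml_init_params params = {
            /*.mem_size   =*/ n_tensors * ggml_tensor_overhead(), // metadata only
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ true,                               // data lives elsewhere
        };
        struct ggml_context * ctx = ggml_init(params);

        for (int i = 0; i < n_tensors; ++i) {
            ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1024); // no data allocated yet
        }

        // one buffer holds the data of every tensor created in ctx
        ggml_backend_buffer_t buf =
            ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cpu_buffer_type());
        ggml_backend_buffer_clear(buf, 0); // same padding-NaN precaution as the KV cache

        ggml_backend_buffer_free(buf);
        ggml_free(ctx);
        return 0;
    }
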
@@ -1928,7 +1994,7 @@ namespace GGUFMeta {
                 target = override->bool_value;
                 return true;
             }
-            return true;
+            return false;
         }
 
  template<typename OT>
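
This one-line change in `GGUFMeta` is a genuine bug fix, not a cleanup: the override helper previously returned `true` even when no override was present, which told callers the target had been set and let them skip reading the real value from the GGUF file. The shape of the fix, with stand-in types:

    #include <cassert>
    #include <optional>

    // "apply an override if one exists" - must report false when it did nothing
    static bool try_override(bool & target, const std::optional<bool> & override_val) {
        if (override_val) {
            target = *override_val;
            return true;
        }
        return false; // was "return true": callers then skipped the file value
    }

    int main() {
        bool value = false;
        if (!try_override(value, std::nullopt)) {
            value = true; // pretend this is the value read from the GGUF file
        }
        assert(value == true); // with the old behavior, value stayed false
        return 0;
    }
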
@@ -2048,17 +2114,16 @@ struct llama_model_loader {
         enum ggml_type type_max = GGML_TYPE_F32;
 
         for (int i = 0; i < n_tensors; i++) {
-            const char * name = gguf_get_tensor_name(ctx_gguf, i);
-            struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, name);
+            enum ggml_type type = gguf_get_tensor_type(ctx_gguf, i);
 
-            n_type[meta->type]++;
+            n_type[type]++;
 
-            if (n_type_max < n_type[meta->type]) {
-                n_type_max = n_type[meta->type];
-                type_max   = meta->type;
+            if (n_type_max < n_type[type]) {
+                n_type_max = n_type[type];
+                type_max   = type;
             }
 
-            LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, name, ggml_type_name(meta->type), llama_format_tensor_shape(meta).c_str());
+            // LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, name, ggml_type_name(meta->type), llama_format_tensor_shape(meta).c_str());
         }
 
         switch (type_max) {
@@ -2196,34 +2261,19 @@ struct llama_model_loader {
         return gguf_get_tensor_name(ctx_gguf, i);
     }
 
-    struct ggml_tensor * get_tensor_meta(int i) const {
-        return ggml_get_tensor(ctx_meta, get_tensor_name(i));
+    struct ggml_tensor * get_tensor_meta(const char * name) const {
+        return ggml_get_tensor(ctx_meta, name);
     }
 
-    void calc_sizes(size_t & ctx_size_p, size_t & mmapped_size_p) const {
-        ctx_size_p = 0;
-        mmapped_size_p = 0;
-
-        for (int i = 0; i < n_tensors; i++) {
-            struct ggml_tensor * meta = get_tensor_meta(i);
-            ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE;
-            (use_mmap ? mmapped_size_p : ctx_size_p) += ggml_nbytes_pad(meta);
-        }
+    struct ggml_tensor * get_tensor_meta(int i) const {
+        return get_tensor_meta(get_tensor_name(i));
     }
 
     struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend_type backend) {
-        if (backend != GGML_BACKEND_CPU) {
-            ggml_set_no_alloc(ctx, true);
-        }
-
         struct ggml_tensor * tensor = ggml_dup_tensor(ctx, meta);
         tensor->backend = backend; // TODO: ggml_set_backend
         ggml_set_name(tensor, ggml_get_name(meta));
 
-        if (backend != GGML_BACKEND_CPU) {
-            ggml_set_no_alloc(ctx, use_mmap);
-        }
-
         n_created++;
 
         return tensor;
2281
2331
  return gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, idx);
2282
2332
  }
2283
2333
 
2334
+ void init_mapping(bool prefetch = true) {
2335
+ /*
2336
+ // prefetch only CPU tensors
2337
+ if (use_mmap) {
2338
+ size_t size_pref = 0; // prefetch
2339
+
2340
+ for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
2341
+ struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i));
2342
+ if (cur->backend == GGML_BACKEND_CPU) {
2343
+ size_t tensor_end = gguf_get_tensor_offset(ctx_gguf, i) + ggml_nbytes(cur);
2344
+ size_pref = std::max(size_pref, tensor_end);
2345
+ }
2346
+ }
2347
+ mapping.reset(new llama_mmap(&file, gguf_get_data_offset(ctx_gguf) + size_pref, ggml_is_numa()));
2348
+ }
2349
+ */
2350
+ // prefetch the whole file - all the data is needed anyway
2351
+ if (use_mmap) {
2352
+ mapping.reset(new llama_mmap(&file, prefetch ? -1 : 0, ggml_is_numa()));
2353
+ }
2354
+ }
2355
+
2356
+ // for backwards compatibility, does not support ggml-backend
2284
2357
  void load_data_for(struct ggml_tensor * cur) const {
2285
2358
  const size_t offs = file_offset(ggml_get_name(cur));
2286
2359
 
2287
- if (use_mmap) {
2288
- cur->data = (uint8_t *) mapping->addr + offs;
2360
+ if (use_mmap && mapping) {
2361
+ GGML_ASSERT(cur->data == nullptr);
2362
+ cur->data = (uint8_t *)mapping->addr + offs;
2289
2363
  } else {
2364
+ GGML_ASSERT(cur->data != nullptr);
2290
2365
  file.seek(offs, SEEK_SET);
2291
2366
  file.read_raw(cur->data, ggml_nbytes(cur));
2292
2367
  }
2293
2368
  }
2294
2369
 
2295
- void load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
2370
+ // Returns false if cancelled by progress_callback
2371
+ bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) const {
2296
2372
  size_t size_data = 0;
2297
- size_t size_lock = 0;
2298
- size_t size_pref = 0; // prefetch
2299
2373
 
2300
2374
  for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
2301
2375
  struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i));
2302
2376
  size_data += ggml_nbytes(cur);
2303
- if (cur->backend == GGML_BACKEND_CPU) {
2304
- size_pref += ggml_nbytes(cur);
2305
- }
2306
2377
  }
2307
2378
 
2308
- if (use_mmap) {
2309
- mapping.reset(new llama_mmap(&file, size_pref, ggml_is_numa()));
2379
+ if (use_mmap && buf_mmap) {
2310
2380
  if (lmlock) {
2311
2381
  lmlock->init(mapping->addr);
2312
2382
  }
2313
2383
  }
2314
2384
 
2315
- size_t done_size = 0;
2385
+ #if (defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)) || defined(GGML_USE_CLBLAST)
2386
+ const bool legacy_offload = true;
2387
+ #else
2388
+ const bool legacy_offload = false;
2389
+ #endif
2390
+
2391
+ std::vector<no_init<uint8_t>> read_buf;
2392
+
2393
+ size_t size_done = 0;
2394
+
2395
+ size_t mmap_first = -1;
2396
+ size_t mmap_last = 0;
2397
+
2316
2398
  for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
2317
2399
  struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i));
2318
2400
  GGML_ASSERT(cur); // unused tensors should have been caught by load_data already
2319
2401
 
2320
2402
  if (progress_callback) {
2321
- progress_callback((float) done_size / size_data, progress_callback_user_data);
2322
- }
2323
-
2324
- // allocate temp buffer if not using mmap
2325
- if (!use_mmap && cur->data == NULL) {
2326
- GGML_ASSERT(cur->backend != GGML_BACKEND_CPU);
2327
- #ifdef GGML_USE_CPU_HBM
2328
- cur->data = (uint8_t*)hbw_malloc(ggml_nbytes(cur));
2329
- #else
2330
- cur->data = (uint8_t*)malloc(ggml_nbytes(cur));
2331
- #endif
2403
+ if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
2404
+ return false;
2405
+ }
2332
2406
  }
2333
2407
 
2334
- load_data_for(cur);
2408
+ const size_t offs = file_offset(ggml_get_name(cur));
2335
2409
 
2336
- switch (cur->backend) {
2337
- case GGML_BACKEND_CPU:
2338
- if (use_mmap && lmlock) {
2339
- size_lock += ggml_nbytes(cur);
2340
- lmlock->grow_to(size_lock);
2410
+ if (!legacy_offload || cur->backend == GGML_BACKEND_CPU) {
2411
+ if (use_mmap && mapping) {
2412
+ if (buf_mmap) {
2413
+ ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + offs);
2414
+ if (lmlock) {
2415
+ lmlock->grow_to(offs + ggml_nbytes(cur));
2416
+ }
2417
+ mmap_first = std::min(mmap_first, offs);
2418
+ mmap_last = std::max(mmap_last, offs + ggml_nbytes(cur));
2419
+ } else {
2420
+ ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + offs, 0, ggml_nbytes(cur));
2341
2421
  }
2342
- break;
2343
- #ifdef GGML_USE_CUBLAS
2344
- case GGML_BACKEND_GPU:
2345
- case GGML_BACKEND_GPU_SPLIT:
2346
- // old code:
2347
- //ggml_cuda_transform_tensor(lt.data, lt.ggml_tensor);
2348
-
2349
- // TODO: test if this works !!
2350
- ggml_cuda_transform_tensor(cur->data, cur);
2351
- if (!use_mmap) {
2352
- free(cur->data);
2422
+ } else {
2423
+ if (ggml_backend_buffer_is_host(cur->buffer)) {
2424
+ file.seek(offs, SEEK_SET);
2425
+ file.read_raw(cur->data, ggml_nbytes(cur));
2426
+ } else {
2427
+ read_buf.resize(ggml_nbytes(cur));
2428
+ file.seek(offs, SEEK_SET);
2429
+ file.read_raw(read_buf.data(), ggml_nbytes(cur));
2430
+ ggml_backend_tensor_set(cur, read_buf.data(), 0, ggml_nbytes(cur));
2353
2431
  }
2354
- break;
2432
+ }
2433
+ } else {
2434
+ // HACK: mark tensor as allocated
2435
+ cur->data = (void *)(uintptr_t)1;
2436
+ void * data;
2437
+ if (use_mmap && mapping) {
2438
+ data = (uint8_t *) mapping->addr + offs;
2439
+ } else {
2440
+ read_buf.resize(ggml_nbytes(cur));
2441
+ file.seek(offs, SEEK_SET);
2442
+ file.read_raw(read_buf.data(), ggml_nbytes(cur));
2443
+ data = read_buf.data();
2444
+ }
2445
+
2446
+ #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
2447
+ ggml_cuda_transform_tensor(data, cur);
2355
2448
  #elif defined(GGML_USE_CLBLAST)
2356
- case GGML_BACKEND_GPU:
2357
- ggml_cl_transform_tensor(cur->data, cur);
2358
- if (!use_mmap) {
2359
- free(cur->data);
2360
- }
2361
- break;
2449
+ GGML_ASSERT(cur->backend == GGML_BACKEND_GPU);
2450
+ ggml_cl_transform_tensor(data, cur);
2451
+ #else
2452
+ GGML_ASSERT(!"GPU tensor without a GPU backend");
2453
+ GGML_UNUSED(data);
2362
2454
  #endif
2363
- default:
2364
- continue;
2365
2455
  }
2366
2456
 
2367
- done_size += ggml_nbytes(cur);
2457
+ size_done += ggml_nbytes(cur);
2368
2458
  }
2459
+
2460
+ // unmap offloaded tensors and metadata
2461
+ if (use_mmap && mapping) {
2462
+ mapping->unmap_fragment(0, mmap_first);
2463
+ mapping->unmap_fragment(mmap_last, mapping->size);
2464
+ }
2465
+
2466
+ if (progress_callback) {
2467
+ // Even though the model is done loading, we still honor
2468
+ // cancellation since we need to free allocations.
2469
+ return progress_callback(1.0f, progress_callback_user_data);
2470
+ }
2471
+ return true;
2369
2472
  }
2370
2473
  };
2371
2474
 
@@ -2388,25 +2491,25 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
 
     switch (ftype) {
         case LLAMA_FTYPE_ALL_F32: return "all F32";
-        case LLAMA_FTYPE_MOSTLY_F16: return "mostly F16";
-        case LLAMA_FTYPE_MOSTLY_Q4_0: return "mostly Q4_0";
-        case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
+        case LLAMA_FTYPE_MOSTLY_F16: return "F16";
+        case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
+        case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
         case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
-            return "mostly Q4_1, some F16";
-        case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
-        case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
-        case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
+            return "Q4_1, some F16";
+        case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0";
+        case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1";
+        case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
 
         // K-quants
-        case LLAMA_FTYPE_MOSTLY_Q2_K: return "mostly Q2_K";
-        case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "mostly Q3_K - Small";
-        case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "mostly Q3_K - Medium";
-        case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "mostly Q3_K - Large";
-        case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "mostly Q4_K - Small";
-        case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "mostly Q4_K - Medium";
-        case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "mostly Q5_K - Small";
-        case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "mostly Q5_K - Medium";
-        case LLAMA_FTYPE_MOSTLY_Q6_K: return "mostly Q6_K";
+        case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large";
+        case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "Q4_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "Q4_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
 
         default: return "unknown, may not work";
     }
@@ -2524,6 +2627,7 @@ static void llm_load_hparams(
             ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
             switch (hparams.n_layer) {
+                case 22: model.type = e_model::MODEL_1B; break;
                 case 26: model.type = e_model::MODEL_3B; break;
                 case 32: model.type = e_model::MODEL_7B; break;
                 case 40: model.type = e_model::MODEL_13B; break;
@@ -2625,6 +2729,15 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_PHI2:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+
+                switch (hparams.n_layer) {
+                    case 32: model.type = e_model::MODEL_3B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
 
         default: (void)0;
     }
@@ -2932,7 +3045,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
 }
 
-static void llm_load_tensors(
+// Returns false if cancelled by progress_callback
+static bool llm_load_tensors(
         llama_model_loader & ml,
         llama_model & model,
         int n_gpu_layers,
@@ -2948,25 +3062,16 @@ static void llm_load_tensors(
 
     model.n_gpu_layers = n_gpu_layers;
 
-    size_t ctx_size;
-    size_t mmapped_size;
-
-    ml.calc_sizes(ctx_size, mmapped_size);
+    size_t ctx_size = ggml_tensor_overhead() * ml.n_tensors;
 
-    LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, ctx_size/1024.0/1024.0);
+    LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, ctx_size/1024.0/1024.0);
 
     // create the ggml context
     {
-        model.buf.resize(ctx_size);
-        if (use_mlock) {
-            model.mlock_buf.init (model.buf.data);
-            model.mlock_buf.grow_to(model.buf.size);
-        }
-
         struct ggml_init_params params = {
-            /*.mem_size   =*/ model.buf.size,
-            /*.mem_buffer =*/ model.buf.data,
-            /*.no_alloc   =*/ ml.use_mmap,
+            /*.mem_size   =*/ ctx_size,
+            /*.mem_buffer =*/ NULL,
+            /*.no_alloc   =*/ true,
        };
 
        model.ctx = ggml_init(params);
@@ -2977,25 +3082,24 @@ static void llm_load_tensors(
 
     (void) main_gpu;
 
-    enum ggml_backend_type llama_backend_offload = GGML_BACKEND_CPU;
+    enum ggml_backend_type llama_backend_offload       = GGML_BACKEND_CPU;
     enum ggml_backend_type llama_backend_offload_split = GGML_BACKEND_CPU;
 
-#ifdef GGML_USE_CUBLAS
+#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
     if (ggml_cublas_loaded()) {
         LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__);
         ggml_cuda_set_main_device(main_gpu);
 
-        llama_backend_offload = GGML_BACKEND_GPU;
+        llama_backend_offload       = GGML_BACKEND_GPU;
        llama_backend_offload_split = GGML_BACKEND_GPU_SPLIT;
     }
 #elif defined(GGML_USE_CLBLAST)
     LLAMA_LOG_INFO("%s: using OpenCL for GPU acceleration\n", __func__);
-    llama_backend_offload = GGML_BACKEND_GPU;
+    llama_backend_offload       = GGML_BACKEND_GPU;
     llama_backend_offload_split = GGML_BACKEND_GPU;
 #endif
 
-    // prepare memory for the weights
-    size_t vram_weights = 0;
+    // create tensors for the weights
     {
         const int64_t n_embd = hparams.n_embd;
         const int64_t n_embd_gqa = hparams.n_embd_gqa();
@@ -3024,13 +3128,6 @@ static void llm_load_tensors(
 
                     model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
                     model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
-
-                    if (backend_norm == GGML_BACKEND_GPU) {
-                        vram_weights += ggml_nbytes(model.output_norm);
-                    }
-                    if (backend_output == GGML_BACKEND_GPU_SPLIT) {
-                        vram_weights += ggml_nbytes(model.output);
-                    }
                 }
 
                 const uint32_t n_ff = hparams.n_ff;
@@ -3080,28 +3177,6 @@ static void llm_load_tensors(
                             layer.ffn_up_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), {n_embd, n_ff}, backend_split);
                         }
                     }
-
-                    if (backend == GGML_BACKEND_GPU) {
-                        vram_weights +=
-                            ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
-                            ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) +
-                            (layer.bq ? ggml_nbytes(layer.bq) : 0) +
-                            (layer.bk ? ggml_nbytes(layer.bk) : 0) +
-                            (layer.bv ? ggml_nbytes(layer.bv) : 0) +
-                            (layer.bo ? ggml_nbytes(layer.bo) : 0) +
-                            ggml_nbytes(layer.ffn_norm);
-
-                        if (layer.ffn_gate_inp == nullptr) {
-                            vram_weights +=
-                                ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
-                        } else {
-                            vram_weights += ggml_nbytes(layer.ffn_gate_inp);
-                            for (uint32_t x = 0; x < hparams.n_expert; ++x) {
-                                vram_weights +=
-                                    ggml_nbytes(layer.ffn_gate_exp[x]) + ggml_nbytes(layer.ffn_down_exp[x]) + ggml_nbytes(layer.ffn_up_exp[x]);
-                            }
-                        }
-                    }
                 }
             } break;
         case LLM_ARCH_BAICHUAN:
@@ -3121,13 +3196,6 @@ static void llm_load_tensors(
 
                     model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
                     model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
-
-                    if (backend_norm == GGML_BACKEND_GPU) {
-                        vram_weights += ggml_nbytes(model.output_norm);
-                    }
-                    if (backend_output == GGML_BACKEND_GPU_SPLIT) {
-                        vram_weights += ggml_nbytes(model.output);
-                    }
                 }
 
                 const uint32_t n_ff = hparams.n_ff;
@@ -3154,19 +3222,10 @@ static void llm_load_tensors(
                     layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
                     layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
                     layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
-
-                    if (backend == GGML_BACKEND_GPU) {
-                        vram_weights +=
-                            ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
-                            ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
-                            ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
-                    }
                 }
             } break;
         case LLM_ARCH_FALCON:
             {
-                // TODO: CPU-only for now
-
                 model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
 
                 // output
@@ -3185,14 +3244,6 @@ static void llm_load_tensors(
                     model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
                     model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
                     model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
-
-                    if (backend_norm == GGML_BACKEND_GPU) {
-                        vram_weights += ggml_nbytes(model.output_norm);
-                        vram_weights += ggml_nbytes(model.output_norm_b);
-                    }
-                    if (backend_output == GGML_BACKEND_GPU_SPLIT) {
-                        vram_weights += ggml_nbytes(model.output);
-                    }
                 }
 
                 const uint32_t n_ff = hparams.n_ff;
@@ -3213,11 +3264,6 @@ static void llm_load_tensors(
                     if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str()) >= 0) {
                         layer.attn_norm_2 = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, backend);
                         layer.attn_norm_2_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, backend);
-
-                        if (backend == GGML_BACKEND_GPU) {
-                            vram_weights += ggml_nbytes(layer.attn_norm_2);
-                            vram_weights += ggml_nbytes(layer.attn_norm_2_b);
-                        }
                     }
 
                     layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
@@ -3225,13 +3271,6 @@ static void llm_load_tensors(
 
                     layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
                     layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
-
-                    if (backend == GGML_BACKEND_GPU) {
-                        vram_weights +=
-                            ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
-                            ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.wo) +
-                            ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
-                    }
                 }
             } break;
         case LLM_ARCH_STARCODER:
@@ -3255,14 +3294,6 @@ static void llm_load_tensors(
                     model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
                     model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
                     model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
-
-                    if (backend_norm == GGML_BACKEND_GPU) {
-                        vram_weights += ggml_nbytes(model.output_norm);
-                        vram_weights += ggml_nbytes(model.output_norm_b);
-                    }
-                    if (backend_output == GGML_BACKEND_GPU_SPLIT) {
-                        vram_weights += ggml_nbytes(model.output);
-                    }
                 }
 
                 const uint32_t n_ff = hparams.n_ff;
@@ -3294,16 +3325,6 @@ static void llm_load_tensors(
 
                     layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
                     layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
-
-                    if (backend == GGML_BACKEND_GPU) {
-                        vram_weights +=
-                            ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
-                            ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) +
-                            ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) +
-                            ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) +
-                            ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_down_b) +
-                            ggml_nbytes(layer.ffn_up) + ggml_nbytes(layer.ffn_up_b);
-                    }
                 }
             } break;
         case LLM_ARCH_PERSIMMON:
@@ -3325,14 +3346,6 @@ static void llm_load_tensors(
                     model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
                     model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
                     model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
-
-                    if (backend_norm == GGML_BACKEND_GPU) {
-                        vram_weights += ggml_nbytes(model.output_norm);
-                        vram_weights += ggml_nbytes(model.output_norm_b);
-                    }
-                    if (backend_output == GGML_BACKEND_GPU_SPLIT) {
-                        vram_weights += ggml_nbytes(model.output);
-                    }
                 }
 
                 const uint32_t n_ff = hparams.n_ff;
@@ -3362,8 +3375,6 @@ static void llm_load_tensors(
             } break;
         case LLM_ARCH_BLOOM:
             {
-                // TODO: CPU-only for now
-
                 model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
                 model.tok_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, GGML_BACKEND_CPU);
                 model.tok_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, GGML_BACKEND_CPU);
@@ -3384,14 +3395,6 @@ static void llm_load_tensors(
                     model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
                     model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
                     model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
-
-                    if (backend_norm == GGML_BACKEND_GPU) {
-                        vram_weights += ggml_nbytes(model.output_norm);
-                        vram_weights += ggml_nbytes(model.output_norm_b);
-                    }
-                    if (backend_output == GGML_BACKEND_GPU_SPLIT) {
-                        vram_weights += ggml_nbytes(model.output);
-                    }
                 }
 
                 const uint32_t n_ff = hparams.n_ff;
@@ -3423,16 +3426,6 @@ static void llm_load_tensors(
 
                     layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
                     layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
-
-                    if (backend == GGML_BACKEND_GPU) {
-                        vram_weights +=
-                            ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
-                            ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) +
-                            ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) +
-                            ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) +
-                            ggml_nbytes(layer.ffn_up) + ggml_nbytes(layer.ffn_up_b) +
-                            ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_down_b);
-                    }
                 }
             } break;
         case LLM_ARCH_MPT:
@@ -3454,13 +3447,6 @@ static void llm_load_tensors(
 
                     model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
                     model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
-
-                    if (backend_norm == GGML_BACKEND_GPU) {
-                        vram_weights += ggml_nbytes(model.output_norm);
-                    }
-                    if (backend_output == GGML_BACKEND_GPU_SPLIT) {
-                        vram_weights += ggml_nbytes(model.output);
-                    }
                 }
 
                 const uint32_t n_ff = hparams.n_ff;
@@ -3483,16 +3469,6 @@ static void llm_load_tensors(
 
                     layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
                     layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
-
-                    if (backend == GGML_BACKEND_GPU) {
-                        vram_weights +=
-                            ggml_nbytes(layer.attn_norm) +
-                            ggml_nbytes(layer.wqkv) +
-                            ggml_nbytes(layer.wo) +
-                            ggml_nbytes(layer.ffn_norm) +
-                            ggml_nbytes(layer.ffn_down) +
-                            ggml_nbytes(layer.ffn_up);
-                    }
                 }
             } break;
         case LLM_ARCH_STABLELM:
@@ -3515,13 +3491,6 @@ static void llm_load_tensors(
                     model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
                     model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
                     model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
-
-                    if (backend_norm == GGML_BACKEND_GPU) {
-                        vram_weights += ggml_nbytes(model.output_norm);
-                    }
-                    if (backend_output == GGML_BACKEND_GPU_SPLIT) {
-                        vram_weights += ggml_nbytes(model.output);
-                    }
                 }
 
                 const uint32_t n_ff = hparams.n_ff;
@@ -3553,13 +3522,6 @@ static void llm_load_tensors(
                     layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
                     layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
                     layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
-
-                    if (backend == GGML_BACKEND_GPU) {
-                        vram_weights +=
-                            ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
-                            ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
-                            ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
-                    }
                 }
             } break;
         case LLM_ARCH_QWEN:
@@ -3579,14 +3541,7 @@ static void llm_load_tensors(
 
                     model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
                     model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
-
-                    if (backend_norm == GGML_BACKEND_GPU) {
-                        vram_weights += ggml_nbytes(model.output_norm);
-                    }
-                    if (backend_output == GGML_BACKEND_GPU_SPLIT) {
-                        vram_weights += ggml_nbytes(model.output);
-                    }
-                }
+                }
 
                 const uint32_t n_ff = hparams.n_ff / 2;
 
@@ -3611,16 +3566,59 @@ static void llm_load_tensors(
                     layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
                     layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
                     layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+                }
+            } break;
+        case LLM_ARCH_PHI2:
+            {
+                model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+
+                // output
+                {
+                    ggml_backend_type backend_norm;
+                    ggml_backend_type backend_output;
 
-                    if (backend == GGML_BACKEND_GPU) {
-                        vram_weights +=
-                            ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) +
-                            ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_gate) +
-                            ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
+                    if (n_gpu_layers > int(n_layer)) {
+                        backend_norm = llama_backend_offload;
+                        backend_output = llama_backend_offload;
+                    } else {
+                        backend_norm = GGML_BACKEND_CPU;
+                        backend_output = GGML_BACKEND_CPU;
                     }
+
+                    model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
+                    model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
+                    model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+                    model.output_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, backend_output);
                 }
-            } break;
 
+                const uint32_t n_ff = hparams.n_ff;
+
+                const int i_gpu_start = n_layer - n_gpu_layers;
+
+                model.layers.resize(n_layer);
+
+                for (uint32_t i = 0; i < n_layer; ++i) {
+                    const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+                    const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
+
+                    auto & layer = model.layers[i];
+
+                    layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+                    layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
+
+                    layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
+                    layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend);
+
+                    layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+                    layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend);
+
+                    layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
+                    layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend);
+
+                    layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+                    layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
+                }
+            } break;
         default:
             throw std::runtime_error("unknown architecture");
     }
@@ -3628,16 +3626,78 @@ static void llm_load_tensors(
 
     ml.done_getting_tensors();
 
+    ml.init_mapping();
+
+    // allocate tensors
+    size_t vram_weights = 0;
+    size_t buf_size = 0;
+
+    ggml_backend_buffer_type_t buft = llama_default_buffer_type(n_gpu_layers);
+
+    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
+        // GGML_BACKEND_GPU tensors are for CUDA and OpenCL only, which are handled separately without ggml-backend
+        if (t->backend == GGML_BACKEND_CPU) {
+            buf_size += GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), ggml_backend_buft_get_alignment(buft));
+        } else {
+            vram_weights += ggml_nbytes(t);
+        }
+    }
+
+    // create backend buffer
+    ggml_backend_buffer_t buf_mmap = nullptr;
+
+#ifdef GGML_USE_METAL
+    if (n_gpu_layers > 0) {
+        if (ml.use_mmap) {
+            const size_t max_size = ggml_get_max_tensor_size(ctx);
+            model.buf = ggml_backend_metal_buffer_from_ptr(ml.mapping->addr, ml.mapping->size, max_size);
+            buf_mmap = model.buf;
+        } else {
+            model.buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_metal_buffer_type());
+        }
+    }
+#elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST)
+    // for testing only
+    if (n_gpu_layers > 0) {
+        model.buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cuda_buffer_type(0));
+    }
+#endif
+
+    if (model.buf == nullptr) {
+        // CPU backend, and indirectly CUDA and OpenCL
+        if (ml.use_mmap) {
+            model.buf = ggml_backend_cpu_buffer_from_ptr(ml.mapping->addr, ml.mapping->size);
+            buf_mmap = model.buf;
+        } else {
+            // allocate only CPU tensors
+            model.buf = ggml_backend_buft_alloc_buffer(buft, buf_size);
+            ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(model.buf);
+            for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
+                if (t->backend == GGML_BACKEND_CPU) {
+                    ggml_tallocr_alloc(alloc, t);
+                }
+            }
+            ggml_tallocr_free(alloc);
+        }
+    }
+
+    if (use_mlock && ggml_backend_buffer_is_host(model.buf)) {
+        model.mlock_buf.init (ggml_backend_buffer_get_base(model.buf));
+        model.mlock_buf.grow_to(ggml_backend_buffer_get_size(model.buf));
+    }
+
     // print memory requirements
     {
-        // this is the total memory required to run the inference
-        size_t mem_required =
-            ctx_size +
-            mmapped_size - vram_weights; // weights in VRAM not in memory
+        size_t sys_mem_required = ctx_size + buf_size;
 
-        LLAMA_LOG_INFO("%s: mem required = %7.2f MiB\n", __func__, mem_required / 1024.0 / 1024.0);
+        if (sys_mem_required > 0) {
+            LLAMA_LOG_INFO("%s: system memory used = %7.2f MiB\n", __func__, sys_mem_required / 1024.0 / 1024.0);
+        }
+        if (vram_weights > 0) {
+            LLAMA_LOG_INFO("%s: VRAM used = %7.2f MiB\n", __func__, vram_weights / 1024.0 / 1024.0);
+        }
 
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+#if (defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)) || defined(GGML_USE_CLBLAST)
         const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
 
  LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
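
The `buf_size` accumulation above is worth spelling out: each CPU tensor contributes its backend-specific allocation size rounded up to the buffer type's alignment, so the sum is a safe upper bound for the single buffer later reserved with `ggml_backend_buft_alloc_buffer`. The padding arithmetic in isolation (`DEMO_PAD` mirrors the shape of `GGML_PAD`; sizes are stand-ins):

    #include <cassert>
    #include <cstddef>

    // round x up to a multiple of n (n must be a power of two)
    #define DEMO_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))

    int main() {
        const size_t alignment = 32;              // e.g. ggml_backend_buft_get_alignment(buft)
        const size_t sizes[] = { 100, 4096, 33 }; // e.g. ggml_backend_buft_get_alloc_size(buft, t)

        size_t buf_size = 0;
        for (size_t s : sizes) {
            buf_size += DEMO_PAD(s, alignment);
        }
        assert(buf_size == 128 + 4096 + 64); // 100 -> 128, 4096 -> 4096, 33 -> 64
        return 0;
    }
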
@@ -3645,38 +3705,27 @@ static void llm_load_tensors(
             LLAMA_LOG_INFO("%s: offloading non-repeating layers to GPU\n", __func__);
         }
 
-#ifdef GGML_USE_CUBLAS
-        const int max_backend_supported_layers = hparams.n_layer + 1;
-        const int max_offloadable_layers = hparams.n_layer + 1;
-#elif GGML_USE_CLBLAST
         const int max_backend_supported_layers = hparams.n_layer + 1;
         const int max_offloadable_layers = hparams.n_layer + 1;
-#endif // GGML_USE_CUBLAS
 
         LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
-        LLAMA_LOG_INFO("%s: VRAM used: %.2f MiB\n", __func__, vram_weights / 1024.0 / 1024.0);
-#else
-        (void) n_gpu_layers;
 #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
     }
 
-    // populate `tensors_by_name`
+#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
+    ggml_cuda_set_tensor_split(tensor_split);
+#else
+    GGML_UNUSED(tensor_split);
+#endif // GGML_USE_CUBLAS
+
+    // populate tensors_by_name
     for (int i = 0; i < ml.n_tensors; ++i) {
         struct ggml_tensor * cur = ggml_get_tensor(ctx, ml.get_tensor_name(i));
         model.tensors_by_name.emplace_back(ggml_get_name(cur), cur);
     }
 
-    (void) tensor_split;
-#ifdef GGML_USE_CUBLAS
-    {
-        ggml_cuda_set_tensor_split(tensor_split);
-    }
-#endif
-
-    ml.load_all_data(ctx, progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL);
-
-    if (progress_callback) {
-        progress_callback(1.0f, progress_callback_user_data);
+    if (!ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf_mmap, use_mlock ? &model.mlock_mmap : NULL)) {
+        return false;
     }
 
     model.mapping = std::move(ml.mapping);
@@ -3684,9 +3733,11 @@ static void llm_load_tensors(
     // loading time will be recalculate after the first eval, so
     // we take page faults deferred by mmap() into consideration
     model.t_load_us = ggml_time_us() - model.t_start_us;
+    return true;
 }
 
-static bool llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) {
+// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
+static int llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) {
     try {
         llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);
 
@@ -3704,19 +3755,21 @@ static bool llama_model_load(const std::string & fname, llama_model & model, con
 
         if (params.vocab_only) {
             LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
-            return true;
+            return 0;
         }
 
-        llm_load_tensors(
+        if (!llm_load_tensors(
             ml, model, params.n_gpu_layers, params.main_gpu, params.tensor_split, params.use_mlock,
             params.progress_callback, params.progress_callback_user_data
-        );
+        )) {
+            return -2;
+        }
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
-        return false;
+        return -1;
     }
 
-    return true;
+    return 0;
 }
 
 //
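Note: llama_model_load now reports three outcomes: 0 (success), -1 (error), -2 (cancelled). Cancellation is driven by the progress callback, which in this version returns bool (see the `return true;` added to the default callback later in this diff). A minimal sketch of a cancelling callback; the name and threshold are illustrative:

    // returning false makes llm_load_tensors abort, which callers see as -2
    static bool stop_after_half(float progress, void * user_data) {
        (void) user_data;
        return progress < 0.5f; // false => cancel the load
    }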
@@ -3981,17 +4034,18 @@ static struct ggml_tensor * llm_build_ffn(
 // if max_alibi_bias > 0 then apply ALiBi
 static struct ggml_tensor * llm_build_kqv(
         struct ggml_context * ctx,
+        const llama_model & model,
         const llama_hparams & hparams,
         const llama_kv_cache & kv,
         struct ggml_tensor * wo,
         struct ggml_tensor * wo_b,
         struct ggml_tensor * q_cur,
-        struct ggml_tensor * kq_scale,
         struct ggml_tensor * kq_mask,
         int64_t n_ctx,
         int32_t n_tokens,
         int32_t n_kv,
         float max_alibi_bias,
+        float kq_scale,
         const llm_build_cb & cb,
         int il) {
     const int64_t n_embd = hparams.n_embd;
@@ -4014,6 +4068,12 @@ static struct ggml_tensor * llm_build_kqv(
     struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
     cb(kq, "kq", il);
 
+    if (model.arch == LLM_ARCH_PHI2) {
+        // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
+        // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
+        ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
+    }
+
     if (max_alibi_bias > 0.0f) {
         // temporary branch until we figure out how to handle ggml_alibi through ggml_add
         kq = ggml_scale(ctx, kq, kq_scale);
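Note: ggml_mul_mat_set_prec raises the accumulation precision of one matmul node without touching the rest of the graph. The same pattern in isolation (tensor names illustrative):

    struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
    // accumulate this product in F32 even for F16 inputs, avoiding the NaNs noted above
    ggml_mul_mat_set_prec(kq, GGML_PREC_F32);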
@@ -4033,7 +4093,7 @@ static struct ggml_tensor * llm_build_kqv(
         kq = ggml_soft_max(ctx, kq);
         cb(kq, "kq_soft_max", il);
     } else {
-        kq = ggml_soft_max_ext(ctx, kq, kq_mask, 1.0f/sqrtf(float(n_embd_head)));
+        kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale);
         cb(kq, "kq_soft_max_ext", il);
     }
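Note: with kq_scale now a plain float folded into the fused soft-max, the branch above computes standard scaled dot-product attention; in LaTeX, with $d$ = n_embd_head and $M$ the KQ_mask:

    \mathrm{attn} = \mathrm{softmax}\!\left( s \, Q K^\top + M \right) V, \qquad s = \frac{1}{\sqrt{d}}

The callers below pass kq_scale = 1.0f/sqrtf(float(n_embd_head)); the phi2 graph passes 1.0f because it pre-scales Q instead.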
 
@@ -4102,7 +4162,7 @@ struct llm_build_context {
 
     const llm_build_cb & cb;
 
-    llama_buffer & buf_compute;
+    std::vector<uint8_t> & buf_compute_meta;
 
     struct ggml_context * ctx0 = nullptr;
 
@@ -4112,35 +4172,35 @@ struct llm_build_context {
         const llama_batch & batch,
         const llm_build_cb & cb,
         bool worst_case) :
-        model         (lctx.model),
-        hparams       (model.hparams),
-        cparams       (lctx.cparams),
-        batch         (batch),
-        kv_self       (lctx.kv_self),
-        n_embd        (hparams.n_embd),
-        n_layer       (hparams.n_layer),
-        n_ctx         (cparams.n_ctx),
-        n_head        (hparams.n_head),
-        n_head_kv     (hparams.n_head_kv),
-        n_embd_head   (hparams.n_embd_head()),
-        n_embd_gqa    (hparams.n_embd_gqa()),
-        n_expert      (hparams.n_expert),
-        n_expert_used (hparams.n_expert_used),
-        freq_base     (cparams.rope_freq_base),
-        freq_scale    (cparams.rope_freq_scale),
-        ext_factor    (cparams.yarn_ext_factor),
-        attn_factor   (cparams.yarn_attn_factor),
-        beta_fast     (cparams.yarn_beta_fast),
-        beta_slow     (cparams.yarn_beta_slow),
-        norm_eps      (hparams.f_norm_eps),
-        norm_rms_eps  (hparams.f_norm_rms_eps),
-        n_tokens      (batch.n_tokens),
-        n_kv          (worst_case ? n_ctx : kv_self.n),
-        kv_head       (worst_case ? n_ctx - n_tokens : kv_self.head),
-        n_orig_ctx    (cparams.n_yarn_orig_ctx),
-        do_rope_shift (worst_case || kv_self.has_shift),
-        cb            (cb),
-        buf_compute   (lctx.buf_compute) {
+        model            (lctx.model),
+        hparams          (model.hparams),
+        cparams          (lctx.cparams),
+        batch            (batch),
+        kv_self          (lctx.kv_self),
+        n_embd           (hparams.n_embd),
+        n_layer          (hparams.n_layer),
+        n_ctx            (cparams.n_ctx),
+        n_head           (hparams.n_head),
+        n_head_kv        (hparams.n_head_kv),
+        n_embd_head      (hparams.n_embd_head()),
+        n_embd_gqa       (hparams.n_embd_gqa()),
+        n_expert         (hparams.n_expert),
+        n_expert_used    (hparams.n_expert_used),
+        freq_base        (cparams.rope_freq_base),
+        freq_scale       (cparams.rope_freq_scale),
+        ext_factor       (cparams.yarn_ext_factor),
+        attn_factor      (cparams.yarn_attn_factor),
+        beta_fast        (cparams.yarn_beta_fast),
+        beta_slow        (cparams.yarn_beta_slow),
+        norm_eps         (hparams.f_norm_eps),
+        norm_rms_eps     (hparams.f_norm_rms_eps),
+        n_tokens         (batch.n_tokens),
+        n_kv             (worst_case ? n_ctx : kv_self.n),
+        kv_head          (worst_case ? n_ctx - n_tokens : kv_self.head),
+        n_orig_ctx       (cparams.n_yarn_orig_ctx),
+        do_rope_shift    (worst_case || kv_self.has_shift),
+        cb               (cb),
+        buf_compute_meta (lctx.buf_compute_meta) {
         GGML_ASSERT(!!kv_self.ctx);
 
         // all initializations should be done in init()
@@ -4148,8 +4208,8 @@ struct llm_build_context {
 
     void init() {
         struct ggml_init_params params = {
-            /*.mem_size =*/ buf_compute.size,
-            /*.mem_buffer =*/ buf_compute.data,
+            /*.mem_size =*/ buf_compute_meta.size(),
+            /*.mem_buffer =*/ buf_compute_meta.data(),
             /*.no_alloc =*/ true,
         };
 
@@ -4178,10 +4238,6 @@ struct llm_build_context {
         struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
         cb(inp_pos, "inp_pos", -1);
 
-        // KQ_scale
-        struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
-        cb(KQ_scale, "KQ_scale", -1);
-
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
         struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
         cb(KQ_mask, "KQ_mask", -1);
@@ -4240,9 +4296,9 @@ struct llm_build_context {
 
                 llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
 
-                cur = llm_build_kqv(ctx0, hparams, kv_self,
+                cur = llm_build_kqv(ctx0, model, hparams, kv_self,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
+                        Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }
 
@@ -4363,10 +4419,6 @@ struct llm_build_context {
         struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
         cb(inp_pos, "inp_pos", -1);
 
-        // KQ_scale
-        struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
-        cb(KQ_scale, "KQ_scale", -1);
-
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
         struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
         cb(KQ_mask, "KQ_mask", -1);
@@ -4423,9 +4475,9 @@ struct llm_build_context {
                 // apply ALiBi for 13B model
                 const float max_alibi_bias = model.type == MODEL_13B ? 8.0f : -1.0f;
 
-                cur = llm_build_kqv(ctx0, hparams, kv_self,
+                cur = llm_build_kqv(ctx0, model, hparams, kv_self,
                         model.layers[il].wo, NULL,
-                        Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, cb, il);
+                        Qcur, KQ_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }
 
@@ -4483,10 +4535,6 @@ struct llm_build_context {
         struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
         cb(inp_pos, "inp_pos", -1);
 
-        // KQ_scale
-        struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
-        cb(KQ_scale, "KQ_scale", -1);
-
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
         struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
         cb(KQ_mask, "KQ_mask", -1);
@@ -4547,9 +4595,9 @@ struct llm_build_context {
 
                 llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
 
-                cur = llm_build_kqv(ctx0, hparams, kv_self,
+                cur = llm_build_kqv(ctx0, model, hparams, kv_self,
                         model.layers[il].wo, NULL,
-                        Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
+                        Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }
 
@@ -4606,10 +4654,6 @@ struct llm_build_context {
         struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
         cb(inp_pos, "inp_pos", -1);
 
-        // KQ_scale
-        struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
-        cb(KQ_scale, "KQ_scale", -1);
-
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
         struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
         cb(KQ_mask, "KQ_mask", -1);
@@ -4647,9 +4691,9 @@ struct llm_build_context {
 
                 llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
 
-                cur = llm_build_kqv(ctx0, hparams, kv_self,
+                cur = llm_build_kqv(ctx0, model, hparams, kv_self,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
+                        Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }
 
@@ -4706,10 +4750,6 @@ struct llm_build_context {
         struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
         cb(inp_pos, "inp_pos", -1);
 
-        // KQ_scale
-        struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
-        cb(KQ_scale, "KQ_scale", -1);
-
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
         struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
         cb(KQ_mask, "KQ_mask", -1);
@@ -4856,9 +4896,9 @@ struct llm_build_context {
                 llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
 
                 // TODO: not tested, could be broken
-                cur = llm_build_kqv(ctx0, hparams, kv_self,
+                cur = llm_build_kqv(ctx0, model, hparams, kv_self,
                         model.layers[il].wo, model.layers[il].bo,
-                        Q, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
+                        Q, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }
 
@@ -4912,10 +4952,6 @@ struct llm_build_context {
         inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
         cb(inpL, "inp_embd", -1);
 
-        // KQ_scale
-        struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
-        cb(KQ_scale, "KQ_scale", -1);
-
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
         struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
         cb(KQ_mask, "KQ_mask", -1);
@@ -4947,9 +4983,9 @@ struct llm_build_context {
 
                 llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
 
-                cur = llm_build_kqv(ctx0, hparams, kv_self,
+                cur = llm_build_kqv(ctx0, model, hparams, kv_self,
                         model.layers[il].wo, NULL,
-                        Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, cb, il);
+                        Qcur, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }
 
@@ -5003,10 +5039,6 @@ struct llm_build_context {
         inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
         cb(inpL, "inp_embd", -1);
 
-        // KQ_scale
-        struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
-        cb(KQ_scale, "KQ_scale", -1);
-
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
         struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
         cb(KQ_mask, "KQ_mask", -1);
@@ -5044,9 +5076,9 @@ struct llm_build_context {
 
                 llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
 
-                cur = llm_build_kqv(ctx0, hparams, kv_self,
+                cur = llm_build_kqv(ctx0, model, hparams, kv_self,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, cb, il);
+                        Qcur, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }
 
@@ -5097,10 +5129,6 @@ struct llm_build_context {
         inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
         cb(inpL, "inp_embd", -1);
 
-        // KQ_scale
-        struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
-        cb(KQ_scale, "KQ_scale", -1);
-
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
         struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
         cb(KQ_mask, "KQ_mask", -1);
@@ -5138,9 +5166,9 @@ struct llm_build_context {
 
                 llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
 
-                cur = llm_build_kqv(ctx0, hparams, kv_self,
+                cur = llm_build_kqv(ctx0, model, hparams, kv_self,
                         model.layers[il].wo, NULL,
-                        Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, hparams.f_max_alibi_bias, cb, il);
+                        Qcur, KQ_mask, n_ctx, n_tokens, n_kv, hparams.f_max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }
 
@@ -5200,10 +5228,6 @@ struct llm_build_context {
         struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
         cb(inp_pos, "inp_pos", -1);
 
-        // KQ_scale
-        struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
-        cb(KQ_scale, "KQ_scale", -1);
-
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
         struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
         cb(KQ_mask, "KQ_mask", -1);
@@ -5251,9 +5275,9 @@ struct llm_build_context {
 
                 llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
 
-                cur = llm_build_kqv(ctx0, hparams, kv_self,
+                cur = llm_build_kqv(ctx0, model, hparams, kv_self,
                         model.layers[il].wo, NULL,
-                        Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
+                        Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }
 
@@ -5310,15 +5334,11 @@ struct llm_build_context {
         cb(inpL, "inp_embd", -1);
 
         // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos= ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+        struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
         cb(inp_pos, "inp_pos", -1);
 
-        // KQ_scale
-        struct ggml_tensor * KQ_scale= ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
-        cb(KQ_scale, "KQ_scale", -1);
-
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask= ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+        struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
         cb(KQ_mask, "KQ_mask", -1);
 
         // shift the entire K-cache if needed
@@ -5368,9 +5388,9 @@ struct llm_build_context {
 
                 llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
 
-                cur = llm_build_kqv(ctx0, hparams, kv_self,
+                cur = llm_build_kqv(ctx0, model, hparams, kv_self,
                         model.layers[il].wo, NULL,
-                        Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
+                        Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }
 
@@ -5412,6 +5432,116 @@ struct llm_build_context {
 
         ggml_build_forward_expand(gf, cur);
 
+        return gf;
+    }
+    struct ggml_cgraph * build_phi2() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * attn_norm_output;
+        struct ggml_tensor * ffn_output;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+        cb(inpL, "inp_embd", -1);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+        cb(inp_pos, "inp_pos", -1);
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+        cb(KQ_mask, "KQ_mask", -1);
+
+        // shift the entire K-cache if needed
+        if (do_rope_shift) {
+            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb);
+        }
+
+        for (int il = 0; il < n_layer; ++il) {
+            attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm,
+                    model.layers[il].attn_norm_b,
+                    LLM_NORM, cb, il);
+            cb(attn_norm_output, "attn_norm", il);
+
+            // self-attention
+            {
+                cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output);
+                cb(cur, "wqkv", il);
+
+                cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+                cb(cur, "bqkv", il);
+
+                struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+                struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+                struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+
+                Qcur = ggml_rope_custom(
+                    ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
+                    freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                // with phi2, we scale the Q to avoid precision issues
+                // ref: https://github.com/ml-explore/mlx-examples/blob/08e862336ade809bc37d1035f94b359e7d1a5152/phi2/phi2.py#L64-L66
+                Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head)));
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_custom(
+                    ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
+                    freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
+
+                cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f, cb, il);
+                cb(cur, "kqv_out", il);
+            }
+
+            // FF
+            {
+                ffn_output = llm_build_ffn(ctx0, attn_norm_output,
+                        model.layers[il].ffn_up, model.layers[il].ffn_up_b,
+                        NULL, NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+                        LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+                cb(ffn_output, "ffn_out", il);
+            }
+
+            cur = ggml_add(ctx0, cur, ffn_output);
+            cb(cur, "l_out", il);
+
+            cur = ggml_add(ctx0, cur, inpL);
+            cb(cur, "l_out", il);
+
+            inpL = cur;
+        }
+
+        cur = llm_build_norm(ctx0, inpL, hparams,
+                model.output_norm,
+                model.output_norm_b,
+                LLM_NORM, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cb(cur, "result_output_no_bias", -1);
+
+        cur = ggml_add(ctx0, cur, model.output_b);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
         return gf;
     }
 };
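Note: unlike the sequential blocks above, build_phi2 feeds the same normalized input to both the attention and the FFN and sums the two with the residual; per layer, in LaTeX:

    x_{l+1} = x_l + \mathrm{Attn}(\mathrm{LN}(x_l)) + \mathrm{FFN}(\mathrm{LN}(x_l))

which is exactly the pair of ggml_add calls on cur, ffn_output and inpL in the loop.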
@@ -5427,7 +5557,7 @@ enum llm_offload_func_e {
     OFFLOAD_FUNC_FRC, // force offload
     OFFLOAD_FUNC_KQV,
     OFFLOAD_FUNC_NR,
-    OFFLOAD_FUNC_EMB,
+    OFFLOAD_FUNC_EMB, // embeddings
     OFFLOAD_FUNC_OUT,
 };
 
@@ -5512,7 +5642,6 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
     { "pos_embd", OFFLOAD_FUNC_NR },
 
     { "inp_pos", OFFLOAD_FUNC_FRC }, // this is often used for KQ ops (e.g. rope)
-    { "KQ_scale", OFFLOAD_FUNC_FRC },
     { "KQ_mask", OFFLOAD_FUNC_FRC },
     { "K_shift", OFFLOAD_FUNC_FRC },
 
@@ -5596,6 +5725,7 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
     { "l_out", OFFLOAD_FUNC },
 
     { "result_norm", OFFLOAD_FUNC_EMB },
+    { "result_output_no_bias", OFFLOAD_FUNC_EMB },
     { "result_output", OFFLOAD_FUNC_OUT },
 };
 
@@ -5613,11 +5743,10 @@ static struct ggml_cgraph * llama_build_graph(
     bool alloc_inp_tokens = false;
     bool alloc_inp_embd = false;
     bool alloc_inp_pos = false;
-    bool alloc_inp_KQ_scale = false;
     bool alloc_inp_KQ_mask = false;
     bool alloc_inp_K_shift = false;
 
-#ifdef GGML_USE_CUBLAS
+#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
     const bool do_offload = true;
 #else
     const bool do_offload = true; // TODO: set to false after finishing refactoring
@@ -5645,7 +5774,7 @@ static struct ggml_cgraph * llama_build_graph(
         if (!ggml_allocr_is_measure(lctx.alloc) && batch.token) {
             const int64_t n_tokens = cur->ne[0];
 
-            memcpy(cur->data, batch.token, n_tokens*ggml_element_size(cur));
+            ggml_backend_tensor_set(cur, batch.token, 0, n_tokens*ggml_element_size(cur));
         }
 
         alloc_inp_tokens = true;
@@ -5658,7 +5787,7 @@ static struct ggml_cgraph * llama_build_graph(
             const int64_t n_embd = cur->ne[0];
             const int64_t n_tokens = cur->ne[1];
 
-            memcpy(cur->data, batch.embd, n_tokens*n_embd*ggml_element_size(cur));
+            ggml_backend_tensor_set(cur, batch.embd, 0, n_tokens*n_embd*ggml_element_size(cur));
         }
 
         alloc_inp_embd = true;
@@ -5670,27 +5799,13 @@ static struct ggml_cgraph * llama_build_graph(
         if (!ggml_allocr_is_measure(lctx.alloc) && batch.pos) {
             const int64_t n_tokens = cur->ne[0];
 
-            int32_t * data = (int32_t *) cur->data;
-
-            for (int i = 0; i < n_tokens; ++i) {
-                data[i] = batch.pos[i];
-            }
+            static_assert(std::is_same<llama_pos, int32_t>::value, "llama_pos must be int32_t");
+            ggml_backend_tensor_set(cur, batch.pos, 0, n_tokens*ggml_element_size(cur));
         }
 
         alloc_inp_pos = true;
     }
 
-    if (!alloc_inp_KQ_scale && strcmp(name, "KQ_scale") == 0) {
-        ggml_allocr_alloc(lctx.alloc, cur);
-
-        if (!ggml_allocr_is_measure(lctx.alloc)) {
-            const int64_t n_embd_head = model.hparams.n_embd_head();
-            ggml_set_f32(cur, 1.0f/sqrtf(float(n_embd_head)));
-        }
-
-        alloc_inp_KQ_scale = true;
-    }
-
     if (!alloc_inp_KQ_mask && strcmp(name, "KQ_mask") == 0) {
         ggml_allocr_alloc(lctx.alloc, cur);
 
@@ -5698,8 +5813,13 @@ static struct ggml_cgraph * llama_build_graph(
             const int64_t n_kv = cur->ne[0];
             const int64_t n_tokens = cur->ne[1];
 
-            float * data = (float *) cur->data;
-            memset(data, 0, ggml_nbytes(cur));
+            float * data;
+            if (ggml_backend_buffer_is_host(cur->buffer)) {
+                data = (float *) cur->data;
+            } else {
+                lctx.buf_copy.resize(ggml_nbytes(cur));
+                data = (float *) lctx.buf_copy.data();
+            }
 
             for (int h = 0; h < 1; ++h) {
                 for (int j = 0; j < n_tokens; ++j) {
@@ -5707,12 +5827,20 @@ static struct ggml_cgraph * llama_build_graph(
                     const llama_seq_id seq_id = batch.seq_id[j][0];
 
                     for (int i = 0; i < n_kv; ++i) {
+                        float f;
                         if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
-                            data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
+                            f = -INFINITY;
+                        } else {
+                            f = 0;
                         }
+                        data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
                     }
                 }
             }
+
+            if (data != cur->data) {
+                ggml_backend_tensor_set(cur, data, 0, ggml_nbytes(cur));
+            }
         }
 
         alloc_inp_KQ_mask = true;
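Note: the KQ_mask branch above is the general recipe for writing inputs into a tensor that may live on a non-host backend: build the values in host memory, then upload with ggml_backend_tensor_set only when the buffer is not host-visible. A standalone sketch of the pattern (the staging vector is illustrative; the diff reuses lctx.buf_copy):

    std::vector<uint8_t> staging;
    float * data;
    if (ggml_backend_buffer_is_host(cur->buffer)) {
        data = (float *) cur->data;        // write in place
    } else {
        staging.resize(ggml_nbytes(cur));  // write to a host-side copy
        data = (float *) staging.data();
    }
    // ... fill data[...] here ...
    if (data != cur->data) {
        ggml_backend_tensor_set(cur, data, 0, ggml_nbytes(cur)); // upload once
    }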
@@ -5724,11 +5852,21 @@ static struct ggml_cgraph * llama_build_graph(
         if (!ggml_allocr_is_measure(lctx.alloc)) {
             const int64_t n_ctx = cur->ne[0];
 
-            int32_t * data = (int32_t *) cur->data;
+            int32_t * data;
+            if (ggml_backend_buffer_is_host(cur->buffer)) {
+                data = (int32_t *) cur->data;
+            } else {
+                lctx.buf_copy.resize(ggml_nbytes(cur));
+                data = (int32_t *) lctx.buf_copy.data();
+            }
 
             for (int i = 0; i < n_ctx; ++i) {
                 data[i] = lctx.kv_self.cells[i].delta;
             }
+
+            if (data != cur->data) {
+                ggml_backend_tensor_set(cur, data, 0, ggml_nbytes(cur));
+            }
         }
 
         alloc_inp_K_shift = true;
@@ -5765,7 +5903,7 @@ static struct ggml_cgraph * llama_build_graph(
     static const std::unordered_map<llm_offload_func_e, std::string, std::hash<int>> k_offload_func_name = {
         { OFFLOAD_FUNC_NOP, "CPU" },
         { OFFLOAD_FUNC_OUT, "CPU" },
-#ifdef GGML_USE_CUBLAS
+#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
         { OFFLOAD_FUNC, "GPU (CUDA)" },
         { OFFLOAD_FUNC_FRC, "GPU (CUDA) FRC" },
         { OFFLOAD_FUNC_KQV, "GPU (CUDA) KQV" },
@@ -5838,7 +5976,7 @@ static struct ggml_cgraph * llama_build_graph(
         offload_func_t func = ggml_offload_nop;
 
         // this is needed for compatibility with Metal for example
-#ifdef GGML_USE_CUBLAS
+#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
         static offload_func_t ggml_offload_gpu = ggml_cuda_assign_buffers_no_alloc;
 #else
         static offload_func_t ggml_offload_gpu = ggml_offload_nop;
@@ -5912,6 +6050,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_qwen();
             } break;
+        case LLM_ARCH_PHI2:
+            {
+                result = llm.build_phi2();
+            } break;
         default:
             GGML_ASSERT(false);
     }
@@ -6045,18 +6187,23 @@ static int llama_decode_internal(
 
     ggml_allocr_alloc_graph(lctx.alloc, gf);
 
-    struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
-    struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
-
-    GGML_ASSERT(strcmp(res->name, "result_output") == 0);
-    GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
+    // the output is always the last tensor in the graph
+    struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
+    GGML_ASSERT(strcmp(res->name, "result_output") == 0);
 
+    // the embeddings could be the second to last tensor, or the third to last tensor
+    struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
+    if (strcmp(embeddings->name, "result_norm") != 0) {
+        embeddings = gf->nodes[gf->n_nodes - 3];
+        GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
+    }
 
-#ifdef GGML_USE_CUBLAS
+#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
+    char * buf_alloc_base = (char *)ggml_backend_buffer_get_base(lctx.buf_alloc);
     for (int i = 0; i < gf->n_leafs; i++) {
         ggml_tensor * node = gf->leafs[i];
         if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) {
-            ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) lctx.buf_alloc.data);
+            ggml_cuda_assign_scratch_offset(node, (char *)node->data - buf_alloc_base);
            ggml_cuda_copy_to_device(node);
         }
     }
@@ -6064,7 +6211,7 @@ static int llama_decode_internal(
     for (int i = 0; i < gf->n_nodes; i++) {
         ggml_tensor * node = gf->nodes[i];
         if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) {
-            ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) lctx.buf_alloc.data);
+            ggml_cuda_assign_scratch_offset(node, (char *)node->data - buf_alloc_base);
         }
     }
 
@@ -6091,23 +6238,23 @@ static int llama_decode_internal(
         n_threads = 1;
     }
 
-#if GGML_USE_MPI
+#ifdef GGML_USE_MPI
     const int64_t n_layer = hparams.n_layer;
     ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
 #endif
 
 #ifdef GGML_USE_METAL
-    if (lctx.ctx_metal) {
-        ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
-        ggml_metal_graph_compute(lctx.ctx_metal, gf);
-    } else {
-        ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
+    if (ggml_backend_is_metal(lctx.backend)) {
+        ggml_backend_metal_set_n_cb(lctx.backend, n_threads);
     }
-#else
-    ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
 #endif
 
-#if GGML_USE_MPI
+    if (ggml_backend_is_cpu(lctx.backend)) {
+        ggml_backend_cpu_set_n_threads(lctx.backend, n_threads);
+    }
+    ggml_backend_graph_compute(lctx.backend, gf);
+
+#ifdef GGML_USE_MPI
     ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
 #endif
 
@@ -6145,20 +6292,37 @@ static int llama_decode_internal(
     {
         auto & logits_out = lctx.logits;
 
+#ifndef NDEBUG
+        auto & logits_valid = lctx.logits_valid;
+        logits_valid.clear();
+        logits_valid.resize(n_tokens);
+
+        logits_out.clear();
+#endif
+
         if (batch.logits) {
             logits_out.resize(n_vocab * n_tokens);
             for (uint32_t i = 0; i < n_tokens; i++) {
                 if (batch.logits[i] == 0) {
                     continue;
                 }
-                memcpy(logits_out.data() + (n_vocab*i), (float *) ggml_get_data(res) + (n_vocab*i), sizeof(float)*n_vocab);
+                ggml_backend_tensor_get(res, logits_out.data() + (n_vocab*i), (n_vocab*i)*sizeof(float), n_vocab*sizeof(float));
+#ifndef NDEBUG
+                logits_valid[i] = true;
+#endif
             }
         } else if (lctx.logits_all) {
             logits_out.resize(n_vocab * n_tokens);
-            memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*n_tokens);
+            ggml_backend_tensor_get(res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float));
+#ifndef NDEBUG
+            std::fill(logits_valid.begin(), logits_valid.end(), true);
+#endif
         } else {
             logits_out.resize(n_vocab);
-            memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(n_tokens - 1)), sizeof(float)*n_vocab);
+            ggml_backend_tensor_get(res, logits_out.data(), (n_vocab*(n_tokens - 1))*sizeof(float), n_vocab*sizeof(float));
+#ifndef NDEBUG
+            logits_valid[0] = true;
+#endif
        }
     }
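Note: ggml_backend_tensor_get(tensor, dst, offset, size) copies size bytes starting at a byte offset into the tensor's storage, which is what lets the loop above fetch one row of the [n_vocab, n_tokens] logits at a time instead of the whole tensor. For example, row i alone (sketch, reusing the names above):

    std::vector<float> row(n_vocab);
    ggml_backend_tensor_get(res, row.data(), i*n_vocab*sizeof(float), n_vocab*sizeof(float));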
 
@@ -6167,7 +6331,7 @@ static int llama_decode_internal(
         auto & embedding_out = lctx.embedding;
 
         embedding_out.resize(n_embd);
-        memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(n_tokens - 1)), sizeof(float)*n_embd);
+        ggml_backend_tensor_get(embeddings, embedding_out.data(), (n_embd*(n_tokens - 1))*sizeof(float), n_embd*sizeof(float));
     }
 
     // measure the performance only for the single-token evals
@@ -8125,12 +8289,6 @@ void llama_beam_search(llama_context * ctx,
 // quantization
 //
 
-template <typename T>
-struct no_init {
-    T value;
-    no_init() { /* do nothing */ }
-};
-
 struct quantize_state_internal {
     const llama_model & model;
     const llama_model_quantize_params * params;
@@ -8373,9 +8531,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 #endif
 
     llama_model_loader ml(fname_inp, use_mmap, NULL);
-    if (ml.use_mmap) {
-        ml.mapping.reset(new llama_mmap(&ml.file, /* prefetch */ 0, ggml_is_numa()));
-    }
+    ml.init_mapping(false); // no prefetching?
 
     llama_model model;
     llm_load_arch(ml, model);
@@ -8621,74 +8777,63 @@ static int llama_apply_lora_from_file_internal(
 
     const int64_t t_start_lora_us = ggml_time_us();
 
-    auto fin = std::ifstream(path_lora, std::ios::binary);
-    if (!fin) {
-        LLAMA_LOG_ERROR("%s: failed to open '%s'\n", __func__, path_lora);
-        return 1;
-    }
+    llama_file fin(path_lora, "rb");
 
     // verify magic and version
     {
-        uint32_t magic;
-        fin.read((char *) &magic, sizeof(magic));
-        uint32_t format_version;
-        fin.read((char *) &format_version, sizeof(format_version));
+        uint32_t magic = fin.read_u32();
+        if (magic != LLAMA_FILE_MAGIC_GGLA) {
+            LLAMA_LOG_ERROR("%s: bad file magic\n", __func__);
+            return 1;
+        }
 
+        uint32_t format_version = fin.read_u32();
         if (format_version != 1) {
             LLAMA_LOG_ERROR("%s: unsupported file version\n", __func__ );
             return 1;
         }
     }
 
-    int32_t lora_r;
-    int32_t lora_alpha;
-    fin.read((char *) &lora_r, sizeof(lora_r));
-    fin.read((char *) &lora_alpha, sizeof(lora_alpha));
+    int32_t lora_r = fin.read_u32();
+    int32_t lora_alpha = fin.read_u32();
     float scaling = scale * (float)lora_alpha / (float)lora_r;
 
     LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
 
+    // create a name -> tensor map of the model to accelerate lookups
+    // find the max tensor size to estimate the required temporary buffer size
+    size_t max_tensor_size = 0;
+    std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
+    for (const auto & kv : model.tensors_by_name) {
+        model_tensors.insert(kv);
+        size_t f32_size = ggml_nelements(kv.second) * sizeof(float);
+        max_tensor_size = std::max(max_tensor_size, f32_size);
+    }
+
     // create a temporary ggml context to store the lora tensors
-    // todo: calculate size from biggest possible tensor
-    std::vector<uint8_t> lora_buf(1024ull * 1024ull * 1024ull);
+    // TODO: use ggml-alloc
+    size_t lora_ctx_size = max_tensor_size * 3;
+    LLAMA_LOG_INFO("%s: allocating %.f MB for lora temporary buffer\n", __func__, lora_ctx_size / 1024.0 / 1024.0);
+    std::vector<uint8_t> lora_buf(lora_ctx_size);
+
     struct ggml_init_params params;
     params.mem_size = lora_buf.size();
     params.mem_buffer = lora_buf.data();
     params.no_alloc = false;
 
-    ggml_context * lora_ctx = ggml_init(params);
-    std::unordered_map<std::string, struct ggml_tensor *> lora_tensors;
+    using unique_context = std::unique_ptr<ggml_context, decltype(&ggml_free)>;
 
-    // create a name -> tensor map of the model to accelerate lookups
-    std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
-    for (const auto & kv : model.tensors_by_name) {
-        model_tensors.insert(kv);
-    }
+    unique_context lora_ctx(nullptr, ggml_free);
+    lora_ctx.reset(ggml_init(params));
+    std::unordered_map<std::string, struct ggml_tensor *> lora_tensors;
 
     // load base model
     std::unique_ptr<llama_model_loader> ml;
-    ggml_context * base_ctx = NULL;
-    std::vector<uint8_t> base_buf;
-    if (path_base_model) {
-        LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
-        ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ NULL));
-
-        size_t ctx_size;
-        size_t mmapped_size;
-        ml->calc_sizes(ctx_size, mmapped_size);
-        base_buf.resize(ctx_size);
-
-        ggml_init_params base_params;
-        base_params.mem_size = base_buf.size();
-        base_params.mem_buffer = base_buf.data();
-        base_params.no_alloc = ml->use_mmap;
 
-        base_ctx = ggml_init(base_params);
-
-        // maybe this should in llama_model_loader
-        if (ml->use_mmap) {
-            ml->mapping.reset(new llama_mmap(&ml->file, /* prefetch */ 0, ggml_is_numa()));
-        }
+    if (path_base_model) {
+        LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
+        ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ nullptr));
+        ml->init_mapping(false); // no prefetching
     }
 
     // read tensors and apply
@@ -8698,27 +8843,35 @@ static int llama_apply_lora_from_file_internal(
     std::vector<uint8_t> work_buffer;
 
     while (true) {
+        if (fin.tell() == fin.size) {
+            // eof
+            break;
+        }
+
         int32_t n_dims;
-        int32_t length;
+        int32_t name_len;
         int32_t ftype;
 
-        fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
-        fin.read(reinterpret_cast<char *>(&length), sizeof(length));
-        fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
-        if (fin.eof()) {
-            break;
+        fin.read_raw(&n_dims, sizeof(n_dims));
+        fin.read_raw(&name_len, sizeof(name_len));
+        fin.read_raw(&ftype, sizeof(ftype));
+
+        if (n_dims != 1 && n_dims != 2) {
+            LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims);
+            return 1;
         }
 
         int32_t ne[2] = { 1, 1 };
        for (int i = 0; i < n_dims; ++i) {
-            fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
+            fin.read_raw(&ne[i], sizeof(ne[i]));
         }
 
         std::string name;
         {
+            GGML_ASSERT(name_len <= 1024);
             char buf[1024];
-            fin.read(buf, length);
-            name = std::string(buf, length);
+            fin.read_raw(buf, name_len);
+            name = std::string(buf, name_len);
         }
 
         // check for lora suffix and get the type of tensor
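Note: pieced together from the reads in this loop, the GGLA adapter layout it expects is: u32 magic (LLAMA_FILE_MAGIC_GGLA), u32 version (1), i32 r, i32 alpha, then per-tensor records of i32 n_dims (1 or 2), i32 name_len (at most 1024), i32 ftype, i32 ne[n_dims], the name bytes, and tensor data aligned to a 32-byte file offset (see the offset rounding further below). A hypothetical header struct for illustration only; the code reads the fields one by one:

    struct ggla_tensor_header {          // not an actual type in this file
        int32_t n_dims;                  // 1 or 2
        int32_t name_len;                // at most 1024
        int32_t ftype;                   // ggml type of the stored data
        // followed by int32_t ne[n_dims], char name[name_len],
        // then data padded to a 32-byte boundary
    };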
@@ -8732,7 +8885,7 @@ static int llama_apply_lora_from_file_internal(
         std::string lora_type = name.substr(pos + lora_suffix.length());
         std::string base_name = name;
         base_name.erase(pos);
-        // LLAMA_LOG_INFO("%s: %s => %s (lora type %s) \n", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());
+        // LLAMA_LOG_INFO("%s: %s => %s (lora type %s) \n", __func__, name.c_str(), base_name.c_str(), lora_type.c_str());
 
         if (model_tensors.find(base_name) == model_tensors.end()) {
             LLAMA_LOG_ERROR("%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
@@ -8751,22 +8904,15 @@ static int llama_apply_lora_from_file_internal(
                 return false;
             }
         }
-        ggml_tensor * lora_tensor;
-        if (n_dims == 2) {
-            lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]);
-        }
-        else {
-            LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims);
-            return 1;
-        }
-        ggml_set_name(lora_tensor, "lora_tensor");
+        ggml_tensor * lora_tensor = ggml_new_tensor_2d(lora_ctx.get(), wtype, ne[0], ne[1]);
+        ggml_set_name(lora_tensor, name.c_str());
 
         // load tensor data
-        size_t offset = fin.tellg();
+        size_t offset = fin.tell();
         size_t tensor_data_size = ggml_nbytes(lora_tensor);
         offset = (offset + 31) & -32;
-        fin.seekg(offset);
-        fin.read((char*)lora_tensor->data, tensor_data_size);
+        fin.seek(offset, SEEK_SET);
+        fin.read_raw(lora_tensor->data, tensor_data_size);
 
         lora_tensors[name] = lora_tensor;
 
@@ -8779,7 +8925,7 @@ static int llama_apply_lora_from_file_internal(
         offload_func_t offload_func = ggml_offload_nop;
         offload_func_t offload_func_force_inplace = ggml_offload_nop;
 
-#ifdef GGML_USE_CUBLAS
+#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
         if (dest_t->backend == GGML_BACKEND_GPU || dest_t->backend == GGML_BACKEND_GPU_SPLIT) {
             if (dest_t->type != GGML_TYPE_F16) {
                 throw std::runtime_error(format(
@@ -8796,13 +8942,11 @@ static int llama_apply_lora_from_file_internal(
 
             // load from base model
             if (gguf_find_tensor(ctx_gguf, base_name.c_str()) < 0) {
-                // TODO: throw
                 LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
                 return 1;
             }
 
-            // TODO: not tested!! maybe not working!
-            base_t = ml->create_tensor(base_ctx, base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU);
+            base_t = ml->get_tensor_meta(base_name.c_str());
             ml->load_data_for(base_t);
         } else {
             base_t = dest_t;
@@ -8831,43 +8975,42 @@ static int llama_apply_lora_from_file_internal(
         }
 
         // w = w + BA*s
-        ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
+        ggml_tensor * BA = ggml_mul_mat(lora_ctx.get(), loraA, loraB);
         offload_func(BA);
         ggml_set_name(BA, "BA");
 
         if (scaling != 1.0f) {
-            ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
-            ggml_set_name(scale_tensor, "scale_tensor");
-
-            BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor);
+            BA = ggml_scale_inplace(lora_ctx.get(), BA, scaling);
             offload_func(BA);
             ggml_set_name(BA, "BA_scaled");
         }
 
         ggml_tensor * r;
         if (base_t == dest_t) {
-            r = ggml_add_inplace(lora_ctx, dest_t, BA);
+            r = ggml_add_inplace(lora_ctx.get(), dest_t, BA);
             offload_func_force_inplace(r);
             ggml_set_name(r, "r_add_inplace");
         }
         else {
-            r = ggml_add(lora_ctx, base_t, BA);
+            r = ggml_add(lora_ctx.get(), base_t, BA);
             offload_func(r);
             ggml_set_name(r, "r_add");
 
-            r = ggml_cpy(lora_ctx, r, dest_t);
+            r = ggml_cpy(lora_ctx.get(), r, dest_t);
             offload_func(r);
             ggml_set_name(r, "r_cpy");
         }
 
-        struct ggml_cgraph * gf = ggml_new_graph(lora_ctx);
+        struct ggml_cgraph * gf = ggml_new_graph(lora_ctx.get());
         ggml_build_forward_expand(gf, r);
 
         ggml_graph_compute_helper(work_buffer, gf, n_threads);
 
+        // the tensors in the adapter must be sorted such that loraA and loraB of the same tensor are next to each other
+        GGML_ASSERT(lora_tensors.size() == 2);
+
         // we won't need these tensors again, reset the context to save memory
-        ggml_free(lora_ctx);
-        lora_ctx = ggml_init(params);
+        lora_ctx.reset(ggml_init(params));
         lora_tensors.clear();
 
         n_tensors++;
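Note: the graph built above is the standard LoRA weight merge; in LaTeX, with r and alpha from the file header and scale from the caller:

    W' = W + s \cdot B A, \qquad s = \mathrm{scale} \cdot \frac{\alpha}{r}

The in-place ggml_add path is taken when no separate base model was given, so the merge accumulates directly into the destination weight.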
@@ -8877,12 +9020,6 @@ static int llama_apply_lora_from_file_internal(
         }
     }
 
-    // TODO: this should be in a destructor, it will leak on failure
-    ggml_free(lora_ctx);
-    if (base_ctx) {
-        ggml_free(base_ctx);
-    }
-
     const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
     LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0);
 
@@ -9012,11 +9149,18 @@ struct llama_model * llama_load_model_from_file(
                     LLAMA_LOG_INFO("\n");
                 }
             }
+            return true;
         };
     }
 
-    if (!llama_model_load(path_model, *model, params)) {
-        LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
+    int status = llama_model_load(path_model, *model, params);
+    GGML_ASSERT(status <= 0);
+    if (status < 0) {
+        if (status == -1) {
+            LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
+        } else if (status == -2) {
+            LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
+        }
         delete model;
         return nullptr;
     }
@@ -9091,7 +9235,39 @@ struct llama_context * llama_new_context_with_model(
 
     // reserve memory for context buffers
     if (!hparams.vocab_only) {
-        if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, type_k, type_v, cparams.n_ctx, model->n_gpu_layers, cparams.offload_kqv)) {
+        // initialize backend
+#ifdef GGML_USE_METAL
+        if (model->n_gpu_layers > 0) {
+            ctx->backend = ggml_backend_metal_init();
+            if (ctx->backend == nullptr) {
+                LLAMA_LOG_ERROR("%s: failed to initialize Metal backend\n", __func__);
+            }
+        }
+#elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST)
+        // for testing only
+        if (model->n_gpu_layers > 0) {
+            ctx->backend = ggml_backend_cuda_init(0);
+            if (ctx->backend == nullptr) {
+                LLAMA_LOG_ERROR("%s: failed to initialize CUDA backend\n", __func__);
+            }
+        }
+#endif
+
+        if (ctx->backend == nullptr && ggml_backend_buffer_is_host(model->buf)) {
+            ctx->backend = ggml_backend_cpu_init();
+            if (ctx->backend == nullptr) {
+                LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
+            }
+        }
+
+        if (ctx->backend == nullptr) {
+            LLAMA_LOG_ERROR("%s: failed to initialize a backend\n", __func__);
+            delete ctx;
+            return nullptr;
+        }
+
+        if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, type_k, type_v,
+                cparams.n_ctx, model->n_gpu_layers, cparams.offload_kqv)) {
             LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
             llama_free(ctx);
             return nullptr;
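Note: the block above establishes the backend preference order: Metal (or CUDA under LLAMA_GGML_BACKEND_CUDA_TEST) when layers are offloaded, then CPU, where the CPU fallback is only taken when the model buffer is host memory; no usable backend is fatal. A condensed sketch of that selection, assuming the same ggml-backend API (error logging elided):

    ggml_backend_t backend = nullptr;
    #ifdef GGML_USE_METAL
    if (n_gpu_layers > 0) {
        backend = ggml_backend_metal_init(); // may return nullptr on failure
    }
    #endif
    if (backend == nullptr) {
        backend = ggml_backend_cpu_init();   // host fallback
    }
    if (backend == nullptr) {
        return nullptr;                      // fatal: nothing to compute on
    }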
@@ -9127,12 +9303,11 @@ struct llama_context * llama_new_context_with_model(
        }
 
         {
-            static const size_t tensor_alignment = 32;
             // the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data
-            ctx->buf_compute.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead());
+            ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead());
 
             // create measure allocator
-            ctx->alloc = ggml_allocr_new_measure(tensor_alignment);
+            ctx->alloc = ggml_allocr_new_measure_from_backend(ctx->backend);
 
             // build worst-case graph
             int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch);
@@ -9140,98 +9315,50 @@ struct llama_context * llama_new_context_with_model(
             llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
             ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0));
 
-#ifdef GGML_USE_METAL
-            if (model->n_gpu_layers > 0) {
-                ctx->ctx_metal = ggml_metal_init(1);
-                if (!ctx->ctx_metal) {
-                    LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
-                    llama_free(ctx);
-                    return NULL;
-                }
-                //ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false);
-                //ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
-            }
-#endif
             // measure memory requirements for the graph
-            size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
+            size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf);
 
-            LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MiB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
+            LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MiB\n", __func__, (ctx->buf_compute_meta.size() + alloc_size) / 1024.0 / 1024.0);
 
-            // recreate allocator with exact memory requirements
+            // create allocator again with exact memory requirements
             ggml_allocr_free(ctx->alloc);
 
-            ctx->buf_alloc.resize(alloc_size);
-            ctx->alloc = ggml_allocr_new(ctx->buf_alloc.data, ctx->buf_alloc.size, tensor_alignment);
-#ifdef GGML_USE_METAL
-            if (ctx->ctx_metal) {
-                //ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
-            }
-#endif
-#ifdef GGML_USE_CUBLAS
-            ggml_cuda_set_scratch_size(alloc_size);
-            LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0);
+            ctx->buf_alloc = ggml_backend_alloc_buffer(ctx->backend, alloc_size);
+            ctx->alloc = ggml_allocr_new_from_buffer(ctx->buf_alloc);
+#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
+            if (model->n_gpu_layers > 0) {
+                ggml_cuda_set_scratch_size(alloc_size);
+                LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0);
 
-            // calculate total VRAM usage
-            auto add_tensor = [](const ggml_tensor * t, size_t & size) {
-                if (t->backend == GGML_BACKEND_GPU || t->backend == GGML_BACKEND_GPU_SPLIT) {
-                    size += ggml_nbytes(t);
+                // calculate total VRAM usage
+                auto add_tensor = [](const ggml_tensor * t, size_t & size) {
+                    if (t->backend == GGML_BACKEND_GPU || t->backend == GGML_BACKEND_GPU_SPLIT) {
+                        size += ggml_nbytes(t);
+                    }
+                };
+                size_t model_vram_size = 0;
+                for (const auto & kv : model->tensors_by_name) {
+                    add_tensor(kv.second, model_vram_size);
                 }
-            };
-            size_t model_vram_size = 0;
-            for (const auto & kv : model->tensors_by_name) {
-                add_tensor(kv.second, model_vram_size);
-            }
-
-            size_t kv_vram_size = 0;
-            for (auto & k : ctx->kv_self.k_l) {
-                add_tensor(k, kv_vram_size);
-            }
-            for (auto & v : ctx->kv_self.v_l) {
-                add_tensor(v, kv_vram_size);
-            }
-
-            size_t ctx_vram_size = alloc_size + kv_vram_size;
-            size_t total_vram_size = model_vram_size + ctx_vram_size;
 
-            LLAMA_LOG_INFO("%s: total VRAM used: %.2f MiB (model: %.2f MiB, context: %.2f MiB)\n", __func__,
-                total_vram_size / 1024.0 / 1024.0,
-                model_vram_size / 1024.0 / 1024.0,
-                ctx_vram_size / 1024.0 / 1024.0);
-#endif
-        }
-
-#ifdef GGML_USE_METAL
-        if (model->n_gpu_layers > 0) {
-            // this allocates all Metal resources and memory buffers
-
-            void * data_ptr = NULL;
-            size_t data_size = 0;
-
-            if (ctx->model.mapping) {
-                data_ptr = ctx->model.mapping->addr;
-                data_size = ctx->model.mapping->size;
-            } else {
-                data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
-                data_size = ggml_get_mem_size (ctx->model.ctx);
-            }
-
-            const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
+                size_t kv_vram_size = 0;
+                for (auto & k : ctx->kv_self.k_l) {
+                    add_tensor(k, kv_vram_size);
+                }
+                for (auto & v : ctx->kv_self.v_l) {
+                    add_tensor(v, kv_vram_size);
+                }
 
-            LLAMA_LOG_INFO("%s: max tensor size = %8.2f MiB\n", __func__, max_size/1024.0/1024.0);
+                size_t ctx_vram_size = alloc_size + kv_vram_size;
+                size_t total_vram_size = model_vram_size + ctx_vram_size;
 
-#define LLAMA_METAL_CHECK_BUF(result) \
-            if (!(result)) { \
-                LLAMA_LOG_ERROR("%s: failed to add buffer\n", __func__); \
-                llama_free(ctx); \
-                return NULL; \
+                LLAMA_LOG_INFO("%s: total VRAM used: %.2f MiB (model: %.2f MiB, context: %.2f MiB)\n", __func__,
+                    total_vram_size / 1024.0 / 1024.0,
+                    model_vram_size / 1024.0 / 1024.0,
+                    ctx_vram_size / 1024.0 / 1024.0);
             }
-
-            LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
-            LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0));
-            LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.data, ctx->buf_alloc.size, 0));
-#undef LLAMA_METAL_CHECK_BUF
-        }
 #endif
+        }
     }
 
 #ifdef GGML_USE_MPI
@@ -9259,10 +9386,14 @@ const llama_model * llama_get_model(const struct llama_context * ctx) {
     return &ctx->model;
 }
 
-int llama_n_ctx(const struct llama_context * ctx) {
+uint32_t llama_n_ctx(const struct llama_context * ctx) {
     return ctx->cparams.n_ctx;
 }
 
+uint32_t llama_n_batch(const struct llama_context * ctx) {
+    return ctx->cparams.n_batch;
+}
+
 enum llama_vocab_type llama_vocab_type(const struct llama_model * model) {
     return model->vocab.type;
 }
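Note: llama_n_batch exposes the configured micro-batch size alongside llama_n_ctx. A typical caller clamps its submission size with it (sketch; n_remaining is hypothetical):

    const uint32_t n_batch = llama_n_batch(ctx);
    const uint32_t n_eval = std::min(n_remaining, n_batch); // never feed more than n_batch tokens per decode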
@@ -9519,7 +9650,7 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
      const size_t s_embedding = ctx->embedding.size() * sizeof(float);
      const size_t s_kv_size   = sizeof(size_t);
      const size_t s_kv_ntok   = sizeof(int);
-     const size_t s_kv        = ctx->kv_self.buf.size;
+     const size_t s_kv        = ggml_backend_buffer_get_size(ctx->kv_self.buf);

      const size_t s_total = (
          + s_rng_size
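
llama_get_state_size now sizes the KV cache through ggml_backend_buffer_get_size instead of the removed llama_buffer field; from the caller's side the contract is unchanged: it still bounds the blob produced by llama_copy_state_data. A sketch of the usual save pattern, built only on these two public calls:

    #include "llama.h"

    #include <cstdint>
    #include <vector>

    // Sketch: snapshot the full context state (RNG, logits, embedding, KV cache).
    // llama_get_state_size is an upper bound; llama_copy_state_data returns the
    // number of bytes actually written, so the vector is trimmed afterwards.
    std::vector<uint8_t> snapshot_state(struct llama_context * ctx) {
        std::vector<uint8_t> state(llama_get_state_size(ctx));
        const size_t written = llama_copy_state_data(ctx, state.data());
        state.resize(written);
        return state;
    }
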
@@ -9647,7 +9778,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
      const auto n_embd = hparams.n_embd_gqa();
      const auto n_ctx  = cparams.n_ctx;

-     const size_t   kv_buf_size = kv_self.buf.size;
+     const size_t   kv_buf_size = ggml_backend_buffer_get_size(kv_self.buf);
      const uint32_t kv_head     = kv_self.head;
      const uint32_t kv_size     = kv_self.size;
      const uint32_t kv_used     = kv_self.used;
@@ -9663,17 +9794,12 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
          ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
          ggml_cgraph * gf = ggml_new_graph(cpy_ctx);

-         std::vector<std::vector<uint8_t>> kout2d_data(n_layer);
-         std::vector<std::vector<uint8_t>> vout2d_data(n_layer);
+         std::vector<struct ggml_tensor *> kout2d(n_layer);
+         std::vector<struct ggml_tensor *> vout2d(n_layer);

          for (int il = 0; il < (int) n_layer; ++il) {
-             ggml_tensor * kout2d = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
-             kout2d_data[il].resize(ggml_nbytes(kout2d));
-             kout2d->data = kout2d_data[il].data();
-
-             ggml_tensor * vout2d = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
-             vout2d_data[il].resize(ggml_nbytes(vout2d));
-             vout2d->data = vout2d_data[il].data();
+             kout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
+             vout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);

              ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
                      n_embd, kv_head,
@@ -9683,20 +9809,28 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
                      kv_head, n_embd,
                      elt_size*n_ctx, 0);

-             ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k2d, kout2d));
-             ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v2d, vout2d));
+             ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k2d, kout2d[il]));
+             ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v2d, vout2d[il]));
          }

-         ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1);
+         ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(cpy_ctx, ctx->backend);

-         ggml_free(cpy_ctx);
+         ggml_backend_graph_compute(ctx->backend, gf);
+
+         std::vector<uint8_t> tmp_buf;
+         for (int il = 0; il < (int) n_layer; ++il) {
+             tmp_buf.resize(ggml_nbytes(kout2d[il]));
+             ggml_backend_tensor_get(kout2d[il], tmp_buf.data(), 0, tmp_buf.size());
+             data_ctx->write(tmp_buf.data(), tmp_buf.size());

-         // our data is now in the kout2d_data and vout2d_data buffers
-         // write them to file
-         for (uint32_t il = 0; il < n_layer; ++il) {
-             data_ctx->write(kout2d_data[il].data(), kout2d_data[il].size());
-             data_ctx->write(vout2d_data[il].data(), vout2d_data[il].size());
+             tmp_buf.resize(ggml_nbytes(vout2d[il]));
+             ggml_backend_tensor_get(vout2d[il], tmp_buf.data(), 0, tmp_buf.size());
+             data_ctx->write(tmp_buf.data(), tmp_buf.size());
          }
+
+         ggml_free(cpy_ctx);
+
+         ggml_backend_buffer_free(buf);
      }

      for (uint32_t i = 0; i < kv_size; ++i) {
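
The serialization path no longer aliases tensor data onto host vectors; tensors are declared in a no_alloc context, materialized on the context's backend with ggml_backend_alloc_ctx_tensors, computed there, and read back with ggml_backend_tensor_get. The same four-step pattern in isolation, reduced to a single copy op (dup_on_backend and its one-op graph are illustrative, not part of the library):

    #include "ggml.h"
    #include "ggml-alloc.h"
    #include "ggml-backend.h"

    #include <vector>

    // Sketch of the ggml-backend pattern used above:
    // 1) declare tensors and a graph in a no_alloc context,
    // 2) allocate them in a backend buffer,
    // 3) run the graph on the backend,
    // 4) copy the result back to host memory.
    std::vector<float> dup_on_backend(ggml_backend_t backend, const float * src, int n) {
        ggml_context * ctx = ggml_init({ 2*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
        ggml_cgraph  * gf  = ggml_new_graph(ctx);

        ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n);
        ggml_tensor * b = ggml_dup(ctx, a); // stand-in op; the diff copies into KV cache views
        ggml_build_forward_expand(gf, b);

        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);

        ggml_backend_tensor_set(a, src, 0, n*sizeof(float));
        ggml_backend_graph_compute(backend, gf);

        std::vector<float> out(n);
        ggml_backend_tensor_get(b, out.data(), 0, n*sizeof(float));

        ggml_free(ctx);
        ggml_backend_buffer_free(buf);
        return out;
    }
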
@@ -9794,21 +9928,19 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
      memcpy(&kv_used, inp, sizeof(kv_used)); inp += sizeof(kv_used);

      if (kv_buf_size) {
-         GGML_ASSERT(kv_self.buf.size == kv_buf_size);
+         GGML_ASSERT(ggml_backend_buffer_get_size(kv_self.buf) == kv_buf_size);

          const size_t elt_size = ggml_element_size(kv_self.k_l[0]);

          ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
          ggml_cgraph * gf = ggml_new_graph(cpy_ctx);

-         for (int il = 0; il < n_layer; ++il) {
-             ggml_tensor * kin2d = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
-             kin2d->data = (void *) inp;
-             inp += ggml_nbytes(kin2d);
+         std::vector<struct ggml_tensor *> kin2d(n_layer);
+         std::vector<struct ggml_tensor *> vin2d(n_layer);

-             ggml_tensor * vin2d = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
-             vin2d->data = (void *) inp;
-             inp += ggml_nbytes(vin2d);
+         for (int il = 0; il < n_layer; ++il) {
+             kin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
+             vin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);

              ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
                      n_embd, kv_head,
@@ -9818,13 +9950,26 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
                      kv_head, n_embd,
                      elt_size*n_ctx, 0);

-             ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin2d, k2d));
-             ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin2d, v2d));
+             ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin2d[il], k2d));
+             ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin2d[il], v2d));
+         }
+
+         ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(cpy_ctx, ctx->backend);
+
+         // load data into the tensors
+         for (int il = 0; il < n_layer; ++il) {
+             ggml_backend_tensor_set(kin2d[il], inp, 0, ggml_nbytes(kin2d[il]));
+             inp += ggml_nbytes(kin2d[il]);
+
+             ggml_backend_tensor_set(vin2d[il], inp, 0, ggml_nbytes(vin2d[il]));
+             inp += ggml_nbytes(vin2d[il]);
          }

-         ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1);
+         ggml_backend_graph_compute(ctx->backend, gf);

          ggml_free(cpy_ctx);
+
+         ggml_backend_buffer_free(buf);
      }

      ctx->kv_self.head = kv_head;
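
llama_set_state_data is the mirror of the copy path: it checks the stored KV size against the backend buffer, stages the incoming bytes with ggml_backend_tensor_set, and runs the copy graph on the backend. Restoring a blob taken with the snapshot sketch above (same model and build assumed, or the GGML_ASSERT fires):

    #include "llama.h"

    #include <cstdint>
    #include <vector>

    // Sketch: restore a context from a blob produced by llama_copy_state_data.
    // The return value is the number of bytes consumed from the buffer.
    size_t restore_state(struct llama_context * ctx, std::vector<uint8_t> & state) {
        return llama_set_state_data(ctx, state.data());
    }
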
@@ -10047,6 +10192,7 @@ float * llama_get_logits(struct llama_context * ctx) {
  }

  float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
+     assert(ctx->logits_valid.at(i));
      return ctx->logits.data() + i*ctx->model.hparams.n_vocab;
  }
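
The new assert documents a real contract: llama_get_logits_ith(ctx, i) is only valid for batch positions whose logits flag was set during the last llama_decode. A sketch of the compliant calling pattern, requesting logits for the final position only (greedy_next is an illustrative helper and error handling is omitted):

    #include "llama.h"

    #include <vector>

    // Sketch: mark only the last token of the batch for logits, then read
    // them back with the matching index. Any other index would now trip
    // the assert in llama_get_logits_ith.
    llama_token greedy_next(struct llama_context * ctx, const std::vector<llama_token> & tokens) {
        llama_batch batch = llama_batch_init((int32_t) tokens.size(), 0, 1);
        for (size_t i = 0; i < tokens.size(); ++i) {
            batch.token[i]     = tokens[i];
            batch.pos[i]       = (llama_pos) i;
            batch.n_seq_id[i]  = 1;
            batch.seq_id[i][0] = 0;
            batch.logits[i]    = i == tokens.size() - 1; // logits only for the last position
        }
        batch.n_tokens = (int32_t) tokens.size();

        llama_decode(ctx, batch);

        const float * logits  = llama_get_logits_ith(ctx, batch.n_tokens - 1);
        const int     n_vocab = llama_n_vocab(llama_get_model(ctx));

        llama_token best = 0;
        for (llama_token t = 1; t < n_vocab; ++t) {
            if (logits[t] > logits[best]) {
                best = t;
            }
        }

        llama_batch_free(batch);
        return best;
    }
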