llama_cpp 0.10.1 → 0.10.2
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/ext/llama_cpp/llama_cpp.cpp +16 -1
- data/ext/llama_cpp/src/ggml-alloc.c +12 -4
- data/ext/llama_cpp/src/ggml-backend-impl.h +12 -8
- data/ext/llama_cpp/src/ggml-backend.c +75 -5
- data/ext/llama_cpp/src/ggml-backend.h +7 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +284 -162
- data/ext/llama_cpp/src/ggml-metal.h +3 -0
- data/ext/llama_cpp/src/ggml-metal.m +190 -44
- data/ext/llama_cpp/src/ggml-metal.metal +11 -2
- data/ext/llama_cpp/src/ggml.c +262 -89
- data/ext/llama_cpp/src/ggml.h +24 -10
- data/ext/llama_cpp/src/llama.cpp +926 -780
- data/ext/llama_cpp/src/llama.h +8 -3
- data/lib/llama_cpp/version.rb +2 -2
- metadata +2 -2
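The headline change in the bundled llama.cpp (shown in the diff below) is the move to ggml-backend: model weights and the KV cache are placed in ggml_backend_buffer_t buffers, llm_load_tensors/load_all_data now return bool, and the loading progress callback can abort a load by returning false, which llama_model_load reports as a cancellation. The following is a minimal, hedged sketch of how a caller might use that through the public C API; it assumes the llama.h typedef updated in this release makes llama_progress_callback return bool, and the on_progress/g_cancel_load names are illustrative only, not part of the gem.

#include <atomic>
#include <cstdio>

#include "llama.h"

// Illustrative cancellation flag, e.g. set by a signal handler or another thread.
static std::atomic<bool> g_cancel_load{false};

// Returning false asks the loader to stop; llama_load_model_from_file then
// returns NULL instead of a partially loaded model.
static bool on_progress(float progress, void * /*user_data*/) {
    std::fprintf(stderr, "\rloading: %5.1f%%", progress * 100.0f);
    return !g_cancel_load.load();
}

int main(int argc, char ** argv) {
    if (argc < 2) {
        return 1;
    }

    llama_backend_init(false);

    llama_model_params mparams = llama_model_default_params();
    mparams.progress_callback           = on_progress;   // assumed field names from llama_model_params
    mparams.progress_callback_user_data = nullptr;

    llama_model * model = llama_load_model_from_file(argv[1], mparams);
    if (model == nullptr) {
        std::fprintf(stderr, "\nload failed or was cancelled\n");
    } else {
        llama_free_model(model);
    }

    llama_backend_free();
    return model == nullptr ? 1 : 0;
}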
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -1,11 +1,12 @@
 #define LLAMA_API_INTERNAL
+//#define LLAMA_GGML_BACKEND_CUDA_TEST // for testing only - enables ggml-cuda through ggml-backend, disables partial offloading
 #include "llama.h"
 
 #include "unicode.h"
 
 #include "ggml.h"
-
 #include "ggml-alloc.h"
+#include "ggml-backend.h"
 
 #ifdef GGML_USE_CUBLAS
 # include "ggml-cuda.h"
@@ -32,6 +33,7 @@
 #include <unistd.h>
 #if defined(_POSIX_MAPPED_FILES)
 #include <sys/mman.h>
+#include <fcntl.h>
 #endif
 #if defined(_POSIX_MEMLOCK_RANGE)
 #include <sys/resource.h>
@@ -195,6 +197,7 @@ enum llm_arch {
 LLM_ARCH_BLOOM,
 LLM_ARCH_STABLELM,
 LLM_ARCH_QWEN,
+LLM_ARCH_PHI2,
 LLM_ARCH_UNKNOWN,
 };
 
@@ -212,6 +215,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
 { LLM_ARCH_BLOOM, "bloom" },
 { LLM_ARCH_STABLELM, "stablelm" },
 { LLM_ARCH_QWEN, "qwen" },
+{ LLM_ARCH_PHI2, "phi2" },
 };
 
 enum llm_kv {
@@ -550,6 +554,19 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
 { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
 },
 },
+{
+LLM_ARCH_PHI2,
+{
+{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+{ LLM_TENSOR_OUTPUT, "output" },
+{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+},
+},
 
 {
 LLM_ARCH_UNKNOWN,
@@ -697,38 +714,6 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
 // llama helpers
 //
 
-inline void * llama_host_malloc(size_t n) {
-#ifdef GGML_USE_CUBLAS
-if (ggml_cublas_loaded()) {
-return ggml_cuda_host_malloc(n);
-} else {
-return malloc(n);
-}
-#elif GGML_USE_METAL
-return ggml_metal_host_malloc(n);
-#elif GGML_USE_CPU_HBM
-return hbw_malloc(n);
-#else
-return malloc(n);
-#endif
-}
-
-inline void llama_host_free(void * ptr) {
-#ifdef GGML_USE_CUBLAS
-if (ggml_cublas_loaded()) {
-return ggml_cuda_host_free(ptr);
-} else {
-return free(ptr);
-}
-#elif GGML_USE_METAL
-return ggml_metal_host_free(ptr);
-#elif GGML_USE_CPU_HBM
-return hbw_free(ptr);
-#else
-return free(ptr);
-#endif
-}
-
 #if defined(_WIN32)
 static std::string llama_format_win_err(DWORD err) {
 LPSTR buf;
@@ -743,40 +728,10 @@ static std::string llama_format_win_err(DWORD err) {
 }
 #endif
 
-
-
-
-
-// fallback to malloc / free
-// useful in cases where CUDA can try to allocate PINNED memory
-bool fallback = false;
-
-void resize(size_t n) {
-llama_host_free(data);
-
-data = llama_host_malloc(n);
-if (!data) {
-fallback = true;
-data = malloc(n);
-} else {
-fallback = false;
-}
-
-GGML_ASSERT(data);
-size = n;
-}
-
-~llama_buffer() {
-if (data) {
-if (fallback) { // NOLINT
-free(data);
-} else {
-llama_host_free(data);
-}
-}
-
-data = NULL;
-}
+template <typename T>
+struct no_init {
+T value;
+no_init() { /* do nothing */ }
 };
 
 struct llama_file {
@@ -864,6 +819,9 @@ struct llama_mmap {
 #ifdef _POSIX_MAPPED_FILES
 static constexpr bool SUPPORTED = true;
 
+// list of mapped fragments (first_offset, last_offset)
+std::vector<std::pair<size_t, size_t>> mapped_fragments;
+
 llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) {
 size = file->size;
 int fd = fileno(file->fp);
@@ -871,17 +829,22 @@ struct llama_mmap {
 // prefetch/readahead impairs performance on NUMA systems
 if (numa) { prefetch = 0; }
 #ifdef __linux__
+// advise the kernel to read the file sequentially (increases readahead)
+if (posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL)) {
+LLAMA_LOG_WARN("warning: posix_fadvise(.., POSIX_FADV_SEQUENTIAL) failed: %s\n",
+strerror(errno));
+}
 if (prefetch) { flags |= MAP_POPULATE; }
 #endif
 addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
-if (addr == MAP_FAILED) {
+if (addr == MAP_FAILED) { // NOLINT
 throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
 }
 
 if (prefetch > 0) {
-//
+// advise the kernel to preload the mapped memory
 if (posix_madvise(addr, std::min(file->size, prefetch), POSIX_MADV_WILLNEED)) {
-
+LLAMA_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s\n",
 strerror(errno));
 }
 }
@@ -889,14 +852,81 @@ struct llama_mmap {
 // advise the kernel not to use readahead
 // (because the next page might not belong on the same node)
 if (posix_madvise(addr, file->size, POSIX_MADV_RANDOM)) {
-
+LLAMA_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_RANDOM) failed: %s\n",
 strerror(errno));
 }
 }
+
+// initialize list of mapped_fragments
+mapped_fragments.emplace_back(0, file->size);
+}
+
+static void align_range(size_t * first, size_t * last, size_t page_size) {
+// align first to the next page
+size_t offset_in_page = *first & (page_size - 1);
+size_t offset_to_page = offset_in_page == 0 ? 0 : page_size - offset_in_page;
+*first += offset_to_page;
+
+// align last to the previous page
+*last = *last & ~(page_size - 1);
+
+if (*last <= *first) {
+*last = *first;
+}
+}
+
+// partially unmap the file in the range [first, last)
+void unmap_fragment(size_t first, size_t last) {
+// note: this function must not be called multiple times with overlapping ranges
+// otherwise, there is a risk of invalidating addresses that have been repurposed for other mappings
+int page_size = sysconf(_SC_PAGESIZE);
+align_range(&first, &last, page_size);
+size_t len = last - first;
+
+if (len == 0) {
+return;
+}
+
+GGML_ASSERT(first % page_size == 0);
+GGML_ASSERT(last % page_size == 0);
+GGML_ASSERT(last > first);
+
+void * next_page_start = (uint8_t *) addr + first;
+
+// unmap the range
+if (munmap(next_page_start, len)) {
+LLAMA_LOG_WARN("warning: munmap failed: %s\n", strerror(errno));
+}
+
+// update the list of mapped fragments to avoid unmapping the same range again in the destructor
+std::vector<std::pair<size_t, size_t>> new_mapped_fragments;
+for (const auto & frag : mapped_fragments) {
+if (frag.first < first && frag.second > last) {
+// the range is in the middle of the fragment, split it
+new_mapped_fragments.emplace_back(frag.first, first);
+new_mapped_fragments.emplace_back(last, frag.second);
+} else if (frag.first < first && frag.second > first) {
+// the range starts in the middle of the fragment
+new_mapped_fragments.emplace_back(frag.first, first);
+} else if (frag.first < last && frag.second > last) {
+// the range ends in the middle of the fragment
+new_mapped_fragments.emplace_back(last, frag.second);
+} else if (frag.first >= first && frag.second <= last) {
+// the range covers the entire fragment
+} else {
+// the range is outside the fragment
+new_mapped_fragments.push_back(frag);
+}
+}
+mapped_fragments = std::move(new_mapped_fragments);
 }
 
 ~llama_mmap() {
-
+for (const auto & frag : mapped_fragments) {
+if (munmap((char *) addr + frag.first, frag.second - frag.first)) {
+LLAMA_LOG_WARN("warning: munmap failed: %s\n", strerror(errno));
+}
+}
 }
 #elif defined(_WIN32)
 static constexpr bool SUPPORTED = true;
@@ -939,6 +969,12 @@ struct llama_mmap {
 #endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8
 }
 
+void unmap_fragment(size_t first, size_t last) {
+// not supported
+GGML_UNUSED(first);
+GGML_UNUSED(last);
+}
+
 ~llama_mmap() {
 if (!UnmapViewOfFile(addr)) {
 fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n",
@@ -955,6 +991,13 @@ struct llama_mmap {
 
 throw std::runtime_error(std::string("mmap not supported"));
 }
+
+void unmap(size_t offset, size_t len) {
+(void) offset;
+(void) len;
+
+throw std::runtime_error(std::string("mmap not supported"));
+}
 #endif
 };
 
@@ -1128,6 +1171,26 @@ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_
 return std::string(result.data(), result.size());
 }
 
+static ggml_backend_buffer_type_t llama_default_buffer_type(int n_gpu_layers) {
+#ifdef GGML_USE_METAL
+if (n_gpu_layers > 0) {
+return ggml_backend_metal_buffer_type();
+}
+#elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST)
+if (n_gpu_layers > 0) {
+return ggml_backend_cuda_buffer_type(0);
+}
+#elif defined(GGML_USE_CUBLAS)
+return ggml_backend_cuda_host_buffer_type();
+#elif defined(GGML_USE_CPU_HBM)
+return ggml_backend_cpu_hbm_buffer_type();
+#endif
+
+return ggml_backend_cpu_buffer_type();
+
+GGML_UNUSED(n_gpu_layers);
+}
+
 //
 // globals
 //
@@ -1328,14 +1391,10 @@ struct llama_kv_cache {
 
 struct ggml_context * ctx = NULL;
 
-
+ggml_backend_buffer_t buf = NULL;
 
 ~llama_kv_cache() {
-
-ggml_free(ctx);
-}
-
-#ifdef GGML_USE_CUBLAS
+#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
 if (ggml_cublas_loaded()) {
 for (size_t i = 0; i < k_l.size(); ++i) {
 ggml_cuda_free_data(k_l[i]);
@@ -1343,6 +1402,11 @@ struct llama_kv_cache {
 }
 }
 #endif
+if (ctx) {
+ggml_free(ctx);
+}
+
+ggml_backend_buffer_free(buf);
 }
 };
 
@@ -1382,11 +1446,11 @@ struct llama_vocab {
 id special_suffix_id = 32008;
 id special_eot_id = 32010;
 
-int find_bpe_rank(std::string token_left, std::string token_right) const {
-GGML_ASSERT(token_left.find(
-GGML_ASSERT(token_left.find(
-GGML_ASSERT(token_right.find(
-GGML_ASSERT(token_right.find(
+int find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
+GGML_ASSERT(token_left.find(' ') == std::string::npos);
+GGML_ASSERT(token_left.find('\n') == std::string::npos);
+GGML_ASSERT(token_right.find(' ') == std::string::npos);
+GGML_ASSERT(token_right.find('\n') == std::string::npos);
 
 auto it = bpe_ranks.find(std::make_pair(token_left, token_right));
 if (it == bpe_ranks.end()) {
@@ -1415,6 +1479,7 @@ struct llama_model {
 struct ggml_tensor * output_norm;
 struct ggml_tensor * output_norm_b;
 struct ggml_tensor * output;
+struct ggml_tensor * output_b;
 
 std::vector<llama_layer> layers;
 
@@ -1427,7 +1492,7 @@ struct llama_model {
 struct ggml_context * ctx = NULL;
 
 // the model memory buffer
-
+ggml_backend_buffer_t buf = NULL;
 
 // model memory mapped file
 std::unique_ptr<llama_mmap> mapping;
@@ -1443,11 +1508,7 @@ struct llama_model {
 int64_t t_start_us = 0;
 
 ~llama_model() {
-
-ggml_free(ctx);
-}
-
-#ifdef GGML_USE_CUBLAS
+#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
 if (ggml_cublas_loaded()) {
 for (size_t i = 0; i < tensors_by_name.size(); ++i) {
 ggml_cuda_free_data(tensors_by_name[i].second);
@@ -1461,24 +1522,26 @@ struct llama_model {
 ggml_cl_free_data(tensors_by_name[i].second);
 }
 #endif
+if (ctx) {
+ggml_free(ctx);
+}
+
+ggml_backend_buffer_free(buf);
 }
 };
 
 struct llama_context {
 llama_context(const llama_model & model) : model(model), t_start_us(model.t_start_us), t_load_us(model.t_load_us) {}
 ~llama_context() {
-
-
-
-}
-#endif
-if (alloc) {
-ggml_allocr_free(alloc);
-}
+ggml_allocr_free(alloc);
+ggml_backend_buffer_free(buf_alloc);
+ggml_backend_free(backend);
 }
 
 llama_cparams cparams;
 
+ggml_backend_t backend = nullptr;
+
 const llama_model & model;
 
 // key + value cache for the self attention
@@ -1500,23 +1563,22 @@ struct llama_context {
 
 // decode output (2-dimensional array: [n_tokens][n_vocab])
 std::vector<float> logits;
+#ifndef NDEBUG
+// guard against access to unset logits
+std::vector<bool> logits_valid;
+#endif
 bool logits_all = false;
 
 // input embedding (1-dimensional array: [n_embd])
 std::vector<float> embedding;
 
-// reusable buffer for `struct ggml_graph_plan.work_data`
-std::vector<uint8_t> work_buffer;
-
 // memory buffers used to evaluate the model
-
-
-llama_buffer buf_alloc;
+std::vector<uint8_t> buf_compute_meta;
+ggml_backend_buffer_t buf_alloc = NULL;
 ggml_allocr * alloc = NULL;
 
-
-
-#endif
+// temporary buffer for copying data to/from the backend
+std::vector<no_init<uint8_t>> buf_copy;
 
 #ifdef GGML_USE_MPI
 ggml_mpi_context * ctx_mpi = NULL;
@@ -1538,9 +1600,6 @@ static bool llama_kv_cache_init(
 const uint32_t n_embd = hparams.n_embd_gqa();
 const uint32_t n_layer = hparams.n_layer;
 
-const int64_t n_mem = n_layer*n_ctx;
-const int64_t n_elements = n_embd*n_mem;
-
 cache.has_shift = false;
 
 cache.head = 0;
@@ -1550,13 +1609,10 @@ static bool llama_kv_cache_init(
 cache.cells.clear();
 cache.cells.resize(n_ctx);
 
-cache.buf.resize(ggml_row_size(ktype, n_elements) + ggml_row_size(vtype, n_elements) + 2u*n_layer*ggml_tensor_overhead());
-memset(cache.buf.data, 0, cache.buf.size);
-
 struct ggml_init_params params;
-params.mem_size =
-params.mem_buffer =
-params.no_alloc =
+params.mem_size = 2u*n_layer*ggml_tensor_overhead();
+params.mem_buffer = NULL;
+params.no_alloc = true;
 
 cache.ctx = ggml_init(params);
 
@@ -1570,9 +1626,7 @@ static bool llama_kv_cache_init(
 cache.k_l.reserve(n_layer);
 cache.v_l.reserve(n_layer);
 
-const int i_gpu_start = (int) n_layer - n_gpu_layers;
-
-GGML_UNUSED(offload);
+const int i_gpu_start = (int) n_layer - n_gpu_layers;
 
 for (int i = 0; i < (int) n_layer; i++) {
 ggml_tensor * k = ggml_new_tensor_1d(cache.ctx, ktype, n_embd*n_ctx);
@@ -1581,23 +1635,35 @@ static bool llama_kv_cache_init(
 ggml_format_name(v, "cache_v_l%d", i);
 cache.k_l.push_back(k);
 cache.v_l.push_back(v);
-#
+#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
 if (i >= i_gpu_start) {
 if (offload) {
 ggml_cuda_assign_buffers_no_scratch(k);
-vram_kv_cache += ggml_nbytes(k);
 ggml_cuda_assign_buffers_no_scratch(v);
+vram_kv_cache += ggml_nbytes(k);
 vram_kv_cache += ggml_nbytes(v);
+// HACK: mark tensor as allocated
+k->data = v->data = (void *)(uintptr_t)1;
 }
 }
 #endif // GGML_USE_CUBLAS
 }
 
+// allocate tensors
+cache.buf = ggml_backend_alloc_ctx_tensors_from_buft(cache.ctx, llama_default_buffer_type(n_gpu_layers));
+
+// buf may be NULL with full offload
+if (cache.buf) {
+// initialize the buffer to avoid NaNs in the padding
+ggml_backend_buffer_clear(cache.buf, 0);
+}
+
 if (vram_kv_cache > 0) {
 LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
 }
 
-GGML_UNUSED(
+GGML_UNUSED(i_gpu_start);
+GGML_UNUSED(offload);
 
 return true;
 }
@@ -1928,7 +1994,7 @@ namespace GGUFMeta {
 target = override->bool_value;
 return true;
 }
-return
+return false;
 }
 
 template<typename OT>
@@ -2048,17 +2114,16 @@ struct llama_model_loader {
 enum ggml_type type_max = GGML_TYPE_F32;
 
 for (int i = 0; i < n_tensors; i++) {
-
-struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, name);
+enum ggml_type type = gguf_get_tensor_type(ctx_gguf, i);
 
-n_type[
+n_type[type]++;
 
-if (n_type_max < n_type[
-n_type_max = n_type[
-type_max =
+if (n_type_max < n_type[type]) {
+n_type_max = n_type[type];
+type_max = type;
 }
 
-LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, name, ggml_type_name(meta->type), llama_format_tensor_shape(meta).c_str());
+// LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, name, ggml_type_name(meta->type), llama_format_tensor_shape(meta).c_str());
 }
 
 switch (type_max) {
@@ -2196,34 +2261,19 @@ struct llama_model_loader {
 return gguf_get_tensor_name(ctx_gguf, i);
 }
 
-struct ggml_tensor * get_tensor_meta(
-return ggml_get_tensor(ctx_meta,
+struct ggml_tensor * get_tensor_meta(const char * name) const {
+return ggml_get_tensor(ctx_meta, name);
 }
 
-
-
-mmapped_size_p = 0;
-
-for (int i = 0; i < n_tensors; i++) {
-struct ggml_tensor * meta = get_tensor_meta(i);
-ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE;
-(use_mmap ? mmapped_size_p : ctx_size_p) += ggml_nbytes_pad(meta);
-}
+struct ggml_tensor * get_tensor_meta(int i) const {
+return get_tensor_meta(get_tensor_name(i));
 }
 
 struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend_type backend) {
-if (backend != GGML_BACKEND_CPU) {
-ggml_set_no_alloc(ctx, true);
-}
-
 struct ggml_tensor * tensor = ggml_dup_tensor(ctx, meta);
 tensor->backend = backend; // TODO: ggml_set_backend
 ggml_set_name(tensor, ggml_get_name(meta));
 
-if (backend != GGML_BACKEND_CPU) {
-ggml_set_no_alloc(ctx, use_mmap);
-}
-
 n_created++;
 
 return tensor;
@@ -2281,91 +2331,144 @@ struct llama_model_loader {
 return gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, idx);
 }
 
+void init_mapping(bool prefetch = true) {
+/*
+// prefetch only CPU tensors
+if (use_mmap) {
+size_t size_pref = 0; // prefetch
+
+for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
+struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i));
+if (cur->backend == GGML_BACKEND_CPU) {
+size_t tensor_end = gguf_get_tensor_offset(ctx_gguf, i) + ggml_nbytes(cur);
+size_pref = std::max(size_pref, tensor_end);
+}
+}
+mapping.reset(new llama_mmap(&file, gguf_get_data_offset(ctx_gguf) + size_pref, ggml_is_numa()));
+}
+*/
+// prefetch the whole file - all the data is needed anyway
+if (use_mmap) {
+mapping.reset(new llama_mmap(&file, prefetch ? -1 : 0, ggml_is_numa()));
+}
+}
+
+// for backwards compatibility, does not support ggml-backend
 void load_data_for(struct ggml_tensor * cur) const {
 const size_t offs = file_offset(ggml_get_name(cur));
 
-if (use_mmap) {
-cur->data
+if (use_mmap && mapping) {
+GGML_ASSERT(cur->data == nullptr);
+cur->data = (uint8_t *)mapping->addr + offs;
 } else {
+GGML_ASSERT(cur->data != nullptr);
 file.seek(offs, SEEK_SET);
 file.read_raw(cur->data, ggml_nbytes(cur));
 }
 }
 
-
+// Returns false if cancelled by progress_callback
+bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) const {
 size_t size_data = 0;
-size_t size_lock = 0;
-size_t size_pref = 0; // prefetch
 
 for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
 struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i));
 size_data += ggml_nbytes(cur);
-if (cur->backend == GGML_BACKEND_CPU) {
-size_pref += ggml_nbytes(cur);
-}
 }
 
-if (use_mmap) {
-mapping.reset(new llama_mmap(&file, size_pref, ggml_is_numa()));
+if (use_mmap && buf_mmap) {
 if (lmlock) {
 lmlock->init(mapping->addr);
 }
 }
 
-
+#if (defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)) || defined(GGML_USE_CLBLAST)
+const bool legacy_offload = true;
+#else
+const bool legacy_offload = false;
+#endif
+
+std::vector<no_init<uint8_t>> read_buf;
+
+size_t size_done = 0;
+
+size_t mmap_first = -1;
+size_t mmap_last = 0;
+
 for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
 struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i));
 GGML_ASSERT(cur); // unused tensors should have been caught by load_data already
 
 if (progress_callback) {
-progress_callback((float)
-
-
-// allocate temp buffer if not using mmap
-if (!use_mmap && cur->data == NULL) {
-GGML_ASSERT(cur->backend != GGML_BACKEND_CPU);
-#ifdef GGML_USE_CPU_HBM
-cur->data = (uint8_t*)hbw_malloc(ggml_nbytes(cur));
-#else
-cur->data = (uint8_t*)malloc(ggml_nbytes(cur));
-#endif
+if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
+return false;
+}
 }
 
-
+const size_t offs = file_offset(ggml_get_name(cur));
 
-
-
-if (
-
-lmlock
+if (!legacy_offload || cur->backend == GGML_BACKEND_CPU) {
+if (use_mmap && mapping) {
+if (buf_mmap) {
+ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + offs);
+if (lmlock) {
+lmlock->grow_to(offs + ggml_nbytes(cur));
+}
+mmap_first = std::min(mmap_first, offs);
+mmap_last = std::max(mmap_last, offs + ggml_nbytes(cur));
+} else {
+ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + offs, 0, ggml_nbytes(cur));
 }
-
-
-
-
-
-
-
-
-
-if (!use_mmap) {
-free(cur->data);
+} else {
+if (ggml_backend_buffer_is_host(cur->buffer)) {
+file.seek(offs, SEEK_SET);
+file.read_raw(cur->data, ggml_nbytes(cur));
+} else {
+read_buf.resize(ggml_nbytes(cur));
+file.seek(offs, SEEK_SET);
+file.read_raw(read_buf.data(), ggml_nbytes(cur));
+ggml_backend_tensor_set(cur, read_buf.data(), 0, ggml_nbytes(cur));
 }
-
+}
+} else {
+// HACK: mark tensor as allocated
+cur->data = (void *)(uintptr_t)1;
+void * data;
+if (use_mmap && mapping) {
+data = (uint8_t *) mapping->addr + offs;
+} else {
+read_buf.resize(ggml_nbytes(cur));
+file.seek(offs, SEEK_SET);
+file.read_raw(read_buf.data(), ggml_nbytes(cur));
+data = read_buf.data();
+}
+
+#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
+ggml_cuda_transform_tensor(data, cur);
 #elif defined(GGML_USE_CLBLAST)
-
-
-
-
-
-break;
+GGML_ASSERT(cur->backend == GGML_BACKEND_GPU);
+ggml_cl_transform_tensor(data, cur);
+#else
+GGML_ASSERT(!"GPU tensor without a GPU backend");
+GGML_UNUSED(data);
 #endif
-default:
-continue;
 }
 
-
+size_done += ggml_nbytes(cur);
 }
+
+// unmap offloaded tensors and metadata
+if (use_mmap && mapping) {
+mapping->unmap_fragment(0, mmap_first);
+mapping->unmap_fragment(mmap_last, mapping->size);
+}
+
+if (progress_callback) {
+// Even though the model is done loading, we still honor
+// cancellation since we need to free allocations.
+return progress_callback(1.0f, progress_callback_user_data);
+}
+return true;
 }
 };
 
@@ -2388,25 +2491,25 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
 
 switch (ftype) {
 case LLAMA_FTYPE_ALL_F32: return "all F32";
-case LLAMA_FTYPE_MOSTLY_F16: return "
-case LLAMA_FTYPE_MOSTLY_Q4_0: return "
-case LLAMA_FTYPE_MOSTLY_Q4_1: return "
+case LLAMA_FTYPE_MOSTLY_F16: return "F16";
+case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
+case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
 case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
-return "
-case LLAMA_FTYPE_MOSTLY_Q5_0: return "
-case LLAMA_FTYPE_MOSTLY_Q5_1: return "
-case LLAMA_FTYPE_MOSTLY_Q8_0: return "
+return "Q4_1, some F16";
+case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0";
+case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1";
+case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
 
 // K-quants
-case LLAMA_FTYPE_MOSTLY_Q2_K: return "
-case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "
-case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "
-case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "
-case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "
-case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "
-case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "
-case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "
-case LLAMA_FTYPE_MOSTLY_Q6_K: return "
+case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K";
+case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
+case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium";
+case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large";
+case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "Q4_K - Small";
+case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "Q4_K - Medium";
+case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
+case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
+case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
 
 default: return "unknown, may not work";
 }
@@ -2524,6 +2627,7 @@ static void llm_load_hparams(
 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
 switch (hparams.n_layer) {
+case 22: model.type = e_model::MODEL_1B; break;
 case 26: model.type = e_model::MODEL_3B; break;
 case 32: model.type = e_model::MODEL_7B; break;
 case 40: model.type = e_model::MODEL_13B; break;
@@ -2625,6 +2729,15 @@ static void llm_load_hparams(
 default: model.type = e_model::MODEL_UNKNOWN;
 }
 } break;
+case LLM_ARCH_PHI2:
+{
+ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+
+switch (hparams.n_layer) {
+case 32: model.type = e_model::MODEL_3B; break;
+default: model.type = e_model::MODEL_UNKNOWN;
+}
+} break;
 
 default: (void)0;
 }
@@ -2932,7 +3045,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
 if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
 }
 
-
+// Returns false if cancelled by progress_callback
+static bool llm_load_tensors(
 llama_model_loader & ml,
 llama_model & model,
 int n_gpu_layers,
@@ -2948,25 +3062,16 @@ static void llm_load_tensors(
 
 model.n_gpu_layers = n_gpu_layers;
 
-size_t ctx_size;
-size_t mmapped_size;
-
-ml.calc_sizes(ctx_size, mmapped_size);
+size_t ctx_size = ggml_tensor_overhead() * ml.n_tensors;
 
-LLAMA_LOG_INFO("%s: ggml ctx size
+LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, ctx_size/1024.0/1024.0);
 
 // create the ggml context
 {
-model.buf.resize(ctx_size);
-if (use_mlock) {
-model.mlock_buf.init (model.buf.data);
-model.mlock_buf.grow_to(model.buf.size);
-}
-
 struct ggml_init_params params = {
-/*.mem_size =*/
-/*.mem_buffer =*/
-/*.no_alloc =*/
+/*.mem_size =*/ ctx_size,
+/*.mem_buffer =*/ NULL,
+/*.no_alloc =*/ true,
 };
 
 model.ctx = ggml_init(params);
@@ -2977,25 +3082,24 @@ static void llm_load_tensors(
 
 (void) main_gpu;
 
-enum ggml_backend_type llama_backend_offload
+enum ggml_backend_type llama_backend_offload = GGML_BACKEND_CPU;
 enum ggml_backend_type llama_backend_offload_split = GGML_BACKEND_CPU;
 
-#
+#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
 if (ggml_cublas_loaded()) {
 LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__);
 ggml_cuda_set_main_device(main_gpu);
 
-llama_backend_offload
+llama_backend_offload = GGML_BACKEND_GPU;
 llama_backend_offload_split = GGML_BACKEND_GPU_SPLIT;
 }
 #elif defined(GGML_USE_CLBLAST)
 LLAMA_LOG_INFO("%s: using OpenCL for GPU acceleration\n", __func__);
-llama_backend_offload
+llama_backend_offload = GGML_BACKEND_GPU;
 llama_backend_offload_split = GGML_BACKEND_GPU;
 #endif
 
-//
-size_t vram_weights = 0;
+// create tensors for the weights
 {
 const int64_t n_embd = hparams.n_embd;
 const int64_t n_embd_gqa = hparams.n_embd_gqa();
@@ -3024,13 +3128,6 @@ static void llm_load_tensors(
 
 model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
 model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
-
-if (backend_norm == GGML_BACKEND_GPU) {
-vram_weights += ggml_nbytes(model.output_norm);
-}
-if (backend_output == GGML_BACKEND_GPU_SPLIT) {
-vram_weights += ggml_nbytes(model.output);
-}
 }
 
 const uint32_t n_ff = hparams.n_ff;
@@ -3080,28 +3177,6 @@ static void llm_load_tensors(
 layer.ffn_up_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), {n_embd, n_ff}, backend_split);
 }
 }
-
-if (backend == GGML_BACKEND_GPU) {
-vram_weights +=
-ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
-ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) +
-(layer.bq ? ggml_nbytes(layer.bq) : 0) +
-(layer.bk ? ggml_nbytes(layer.bk) : 0) +
-(layer.bv ? ggml_nbytes(layer.bv) : 0) +
-(layer.bo ? ggml_nbytes(layer.bo) : 0) +
-ggml_nbytes(layer.ffn_norm);
-
-if (layer.ffn_gate_inp == nullptr) {
-vram_weights +=
-ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
-} else {
-vram_weights += ggml_nbytes(layer.ffn_gate_inp);
-for (uint32_t x = 0; x < hparams.n_expert; ++x) {
-vram_weights +=
-ggml_nbytes(layer.ffn_gate_exp[x]) + ggml_nbytes(layer.ffn_down_exp[x]) + ggml_nbytes(layer.ffn_up_exp[x]);
-}
-}
-}
 }
 } break;
 case LLM_ARCH_BAICHUAN:
@@ -3121,13 +3196,6 @@ static void llm_load_tensors(
 
 model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
 model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
-
-if (backend_norm == GGML_BACKEND_GPU) {
-vram_weights += ggml_nbytes(model.output_norm);
-}
-if (backend_output == GGML_BACKEND_GPU_SPLIT) {
-vram_weights += ggml_nbytes(model.output);
-}
 }
 
 const uint32_t n_ff = hparams.n_ff;
@@ -3154,19 +3222,10 @@ static void llm_load_tensors(
 layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
 layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
 layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
-
-if (backend == GGML_BACKEND_GPU) {
-vram_weights +=
-ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
-ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
-ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
-}
 }
 } break;
 case LLM_ARCH_FALCON:
 {
-// TODO: CPU-only for now
-
 model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
 
 // output
@@ -3185,14 +3244,6 @@ static void llm_load_tensors(
 model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
 model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
 model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
-
-if (backend_norm == GGML_BACKEND_GPU) {
-vram_weights += ggml_nbytes(model.output_norm);
-vram_weights += ggml_nbytes(model.output_norm_b);
-}
-if (backend_output == GGML_BACKEND_GPU_SPLIT) {
-vram_weights += ggml_nbytes(model.output);
-}
 }
 
 const uint32_t n_ff = hparams.n_ff;
@@ -3213,11 +3264,6 @@ static void llm_load_tensors(
 if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str()) >= 0) {
 layer.attn_norm_2 = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, backend);
 layer.attn_norm_2_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, backend);
-
-if (backend == GGML_BACKEND_GPU) {
-vram_weights += ggml_nbytes(layer.attn_norm_2);
-vram_weights += ggml_nbytes(layer.attn_norm_2_b);
-}
 }
 
 layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
@@ -3225,13 +3271,6 @@ static void llm_load_tensors(
 
 layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
 layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
-
-if (backend == GGML_BACKEND_GPU) {
-vram_weights +=
-ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
-ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.wo) +
-ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
-}
 }
 } break;
 case LLM_ARCH_STARCODER:
@@ -3255,14 +3294,6 @@ static void llm_load_tensors(
 model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
 model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
 model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
-
-if (backend_norm == GGML_BACKEND_GPU) {
-vram_weights += ggml_nbytes(model.output_norm);
-vram_weights += ggml_nbytes(model.output_norm_b);
-}
-if (backend_output == GGML_BACKEND_GPU_SPLIT) {
-vram_weights += ggml_nbytes(model.output);
-}
 }
 
 const uint32_t n_ff = hparams.n_ff;
@@ -3294,16 +3325,6 @@ static void llm_load_tensors(
 
 layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
 layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
-
-if (backend == GGML_BACKEND_GPU) {
-vram_weights +=
-ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
-ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) +
-ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) +
-ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) +
-ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_down_b) +
-ggml_nbytes(layer.ffn_up) + ggml_nbytes(layer.ffn_up_b);
-}
 }
 } break;
 case LLM_ARCH_PERSIMMON:
@@ -3325,14 +3346,6 @@ static void llm_load_tensors(
 model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
 model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
 model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
-
-if (backend_norm == GGML_BACKEND_GPU) {
-vram_weights += ggml_nbytes(model.output_norm);
-vram_weights += ggml_nbytes(model.output_norm_b);
-}
-if (backend_output == GGML_BACKEND_GPU_SPLIT) {
-vram_weights += ggml_nbytes(model.output);
-}
 }
 
 const uint32_t n_ff = hparams.n_ff;
@@ -3362,8 +3375,6 @@ static void llm_load_tensors(
 } break;
 case LLM_ARCH_BLOOM:
 {
-// TODO: CPU-only for now
-
 model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
 model.tok_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, GGML_BACKEND_CPU);
 model.tok_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, GGML_BACKEND_CPU);
@@ -3384,14 +3395,6 @@ static void llm_load_tensors(
 model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
 model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
 model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
-
-if (backend_norm == GGML_BACKEND_GPU) {
-vram_weights += ggml_nbytes(model.output_norm);
-vram_weights += ggml_nbytes(model.output_norm_b);
-}
-if (backend_output == GGML_BACKEND_GPU_SPLIT) {
-vram_weights += ggml_nbytes(model.output);
-}
 }
 
 const uint32_t n_ff = hparams.n_ff;
@@ -3423,16 +3426,6 @@ static void llm_load_tensors(
 
 layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
 layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
-
-if (backend == GGML_BACKEND_GPU) {
-vram_weights +=
-ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
-ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) +
-ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) +
-ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) +
-ggml_nbytes(layer.ffn_up) + ggml_nbytes(layer.ffn_up_b) +
-ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_down_b);
-}
 }
 } break;
 case LLM_ARCH_MPT:
@@ -3454,13 +3447,6 @@ static void llm_load_tensors(
 
 model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
 model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
-
-if (backend_norm == GGML_BACKEND_GPU) {
-vram_weights += ggml_nbytes(model.output_norm);
-}
-if (backend_output == GGML_BACKEND_GPU_SPLIT) {
-vram_weights += ggml_nbytes(model.output);
-}
 }
 
 const uint32_t n_ff = hparams.n_ff;
@@ -3483,16 +3469,6 @@ static void llm_load_tensors(
 
 layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
 layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
-
-if (backend == GGML_BACKEND_GPU) {
-vram_weights +=
-ggml_nbytes(layer.attn_norm) +
-ggml_nbytes(layer.wqkv) +
-ggml_nbytes(layer.wo) +
-ggml_nbytes(layer.ffn_norm) +
-ggml_nbytes(layer.ffn_down) +
-ggml_nbytes(layer.ffn_up);
-}
 }
 } break;
 case LLM_ARCH_STABLELM:
@@ -3515,13 +3491,6 @@ static void llm_load_tensors(
 model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
 model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
 model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
-
-if (backend_norm == GGML_BACKEND_GPU) {
-vram_weights += ggml_nbytes(model.output_norm);
-}
-if (backend_output == GGML_BACKEND_GPU_SPLIT) {
-vram_weights += ggml_nbytes(model.output);
-}
 }
 
 const uint32_t n_ff = hparams.n_ff;
@@ -3553,13 +3522,6 @@ static void llm_load_tensors(
 layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
 layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
 layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
-
-if (backend == GGML_BACKEND_GPU) {
-vram_weights +=
-ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
-ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
-ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
-}
 }
 } break;
 case LLM_ARCH_QWEN:
@@ -3579,14 +3541,7 @@ static void llm_load_tensors(
 
 model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
 model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
-
-if (backend_norm == GGML_BACKEND_GPU) {
-vram_weights += ggml_nbytes(model.output_norm);
-}
-if (backend_output == GGML_BACKEND_GPU_SPLIT) {
-vram_weights += ggml_nbytes(model.output);
-}
-}
+}
 
 const uint32_t n_ff = hparams.n_ff / 2;
 
@@ -3611,16 +3566,59 @@ static void llm_load_tensors(
 layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
 layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
 layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+}
+} break;
+case LLM_ARCH_PHI2:
+{
+model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+
+// output
+{
+ggml_backend_type backend_norm;
+ggml_backend_type backend_output;
 
-if (
-
-
-
-
+if (n_gpu_layers > int(n_layer)) {
+backend_norm = llama_backend_offload;
+backend_output = llama_backend_offload;
+} else {
+backend_norm = GGML_BACKEND_CPU;
+backend_output = GGML_BACKEND_CPU;
 }
+
+model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
+model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
+model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+model.output_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, backend_output);
 }
-} break;
 
+const uint32_t n_ff = hparams.n_ff;
+
+const int i_gpu_start = n_layer - n_gpu_layers;
+
+model.layers.resize(n_layer);
+
+for (uint32_t i = 0; i < n_layer; ++i) {
+const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
+
+auto & layer = model.layers[i];
+
+layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
+
+layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
+layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend);
+
+layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend);
+
+layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
+layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend);
+
+layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
+}
+} break;
 default:
 throw std::runtime_error("unknown architecture");
 }
@@ -3628,16 +3626,78 @@ static void llm_load_tensors(
|
|
3628
3626
|
|
3629
3627
|
ml.done_getting_tensors();
|
3630
3628
|
|
3629
|
+
ml.init_mapping();
|
3630
|
+
|
3631
|
+
// allocate tensors
|
3632
|
+
size_t vram_weights = 0;
|
3633
|
+
size_t buf_size = 0;
|
3634
|
+
|
3635
|
+
ggml_backend_buffer_type_t buft = llama_default_buffer_type(n_gpu_layers);
|
3636
|
+
|
3637
|
+
for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
|
3638
|
+
// GGML_BACKEND_GPU tensors are for CUDA and OpenCL only, which are handled separately without ggml-backend
|
3639
|
+
if (t->backend == GGML_BACKEND_CPU) {
|
3640
|
+
buf_size += GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), ggml_backend_buft_get_alignment(buft));
|
3641
|
+
} else {
|
3642
|
+
vram_weights += ggml_nbytes(t);
|
3643
|
+
}
|
3644
|
+
}
|
3645
|
+
|
3646
|
+
// create backend buffer
|
3647
|
+
ggml_backend_buffer_t buf_mmap = nullptr;
|
3648
|
+
|
3649
|
+
#ifdef GGML_USE_METAL
|
3650
|
+
if (n_gpu_layers > 0) {
|
3651
|
+
if (ml.use_mmap) {
|
3652
|
+
const size_t max_size = ggml_get_max_tensor_size(ctx);
|
3653
|
+
model.buf = ggml_backend_metal_buffer_from_ptr(ml.mapping->addr, ml.mapping->size, max_size);
|
3654
|
+
buf_mmap = model.buf;
|
3655
|
+
} else {
|
3656
|
+
model.buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_metal_buffer_type());
|
3657
|
+
}
|
3658
|
+
}
|
3659
|
+
#elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST)
|
3660
|
+
// for testing only
|
3661
|
+
if (n_gpu_layers > 0) {
|
3662
|
+
model.buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cuda_buffer_type(0));
|
3663
|
+
}
|
3664
|
+
#endif
|
3665
|
+
|
3666
|
+
if (model.buf == nullptr) {
|
3667
|
+
// CPU backend, and indirectly CUDA and OpenCL
|
3668
|
+
if (ml.use_mmap) {
|
3669
|
+
model.buf = ggml_backend_cpu_buffer_from_ptr(ml.mapping->addr, ml.mapping->size);
|
3670
|
+
buf_mmap = model.buf;
|
3671
|
+
} else {
|
3672
|
+
// allocate only CPU tensors
|
3673
|
+
model.buf = ggml_backend_buft_alloc_buffer(buft, buf_size);
|
3674
|
+
ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(model.buf);
|
3675
|
+
for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
|
3676
|
+
if (t->backend == GGML_BACKEND_CPU) {
|
3677
|
+
ggml_tallocr_alloc(alloc, t);
|
3678
|
+
}
|
3679
|
+
}
|
3680
|
+
ggml_tallocr_free(alloc);
|
3681
|
+
}
|
3682
|
+
}
|
3683
|
+
|
3684
|
+
if (use_mlock && ggml_backend_buffer_is_host(model.buf)) {
|
3685
|
+
model.mlock_buf.init (ggml_backend_buffer_get_base(model.buf));
|
3686
|
+
model.mlock_buf.grow_to(ggml_backend_buffer_get_size(model.buf));
|
3687
|
+
}
|
3688
|
+
|
3631
3689
|
// print memory requirements
|
3632
3690
|
{
|
3633
|
-
|
3634
|
-
size_t mem_required =
|
3635
|
-
ctx_size +
|
3636
|
-
mmapped_size - vram_weights; // weights in VRAM not in memory
|
3691
|
+
size_t sys_mem_required = ctx_size + buf_size;
|
3637
3692
|
|
3638
|
-
|
3693
|
+
if (sys_mem_required > 0) {
|
3694
|
+
LLAMA_LOG_INFO("%s: system memory used = %7.2f MiB\n", __func__, sys_mem_required / 1024.0 / 1024.0);
|
3695
|
+
}
|
3696
|
+
if (vram_weights > 0) {
|
3697
|
+
LLAMA_LOG_INFO("%s: VRAM used = %7.2f MiB\n", __func__, vram_weights / 1024.0 / 1024.0);
|
3698
|
+
}
|
3639
3699
|
|
3640
|
-
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
|
3700
|
+
#if (defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)) || defined(GGML_USE_CLBLAST)
|
3641
3701
|
const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
|
3642
3702
|
|
3643
3703
|
LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
|
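A note on the accounting loop added in this hunk: CPU-resident tensors are summed into buf_size with the buffer type's alignment padding, while GPU/GPU_SPLIT tensors only count toward vram_weights. A minimal sketch of that logic, assuming the ggml-backend headers from this tree (the helper name cpu_weights_size is illustrative, not part of the diff):

static size_t cpu_weights_size(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
    size_t size = 0;
    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
        if (t->backend == GGML_BACKEND_CPU) {
            // pad each allocation to the buffer type's alignment, as the loader does
            size += GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), ggml_backend_buft_get_alignment(buft));
        }
    }
    return size;
}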
@@ -3645,38 +3705,27 @@ static void llm_load_tensors(
|
|
3645
3705
|
LLAMA_LOG_INFO("%s: offloading non-repeating layers to GPU\n", __func__);
|
3646
3706
|
}
|
3647
3707
|
|
3648
|
-
#ifdef GGML_USE_CUBLAS
|
3649
|
-
const int max_backend_supported_layers = hparams.n_layer + 1;
|
3650
|
-
const int max_offloadable_layers = hparams.n_layer + 1;
|
3651
|
-
#elif GGML_USE_CLBLAST
|
3652
3708
|
const int max_backend_supported_layers = hparams.n_layer + 1;
|
3653
3709
|
const int max_offloadable_layers = hparams.n_layer + 1;
|
3654
|
-
#endif // GGML_USE_CUBLAS
|
3655
3710
|
|
3656
3711
|
LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
|
3657
|
-
LLAMA_LOG_INFO("%s: VRAM used: %.2f MiB\n", __func__, vram_weights / 1024.0 / 1024.0);
|
3658
|
-
#else
|
3659
|
-
(void) n_gpu_layers;
|
3660
3712
|
#endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
|
3661
3713
|
}
|
3662
3714
|
|
3663
|
-
|
3715
|
+
#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
|
3716
|
+
ggml_cuda_set_tensor_split(tensor_split);
|
3717
|
+
#else
|
3718
|
+
GGML_UNUSED(tensor_split);
|
3719
|
+
#endif // GGML_USE_CUBLAS
|
3720
|
+
|
3721
|
+
// populate tensors_by_name
|
3664
3722
|
for (int i = 0; i < ml.n_tensors; ++i) {
|
3665
3723
|
struct ggml_tensor * cur = ggml_get_tensor(ctx, ml.get_tensor_name(i));
|
3666
3724
|
model.tensors_by_name.emplace_back(ggml_get_name(cur), cur);
|
3667
3725
|
}
|
3668
3726
|
|
3669
|
-
(void) tensor_split;
|
3670
|
-
|
3671
|
-
{
|
3672
|
-
ggml_cuda_set_tensor_split(tensor_split);
|
3673
|
-
}
|
3674
|
-
#endif
|
3675
|
-
|
3676
|
-
ml.load_all_data(ctx, progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL);
|
3677
|
-
|
3678
|
-
if (progress_callback) {
|
3679
|
-
progress_callback(1.0f, progress_callback_user_data);
|
3727
|
+
if (!ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf_mmap, use_mlock ? &model.mlock_mmap : NULL)) {
|
3728
|
+
return false;
|
3680
3729
|
}
|
3681
3730
|
|
3682
3731
|
model.mapping = std::move(ml.mapping);
|
@@ -3684,9 +3733,11 @@ static void llm_load_tensors(
|
|
3684
3733
|
// loading time will be recalculate after the first eval, so
|
3685
3734
|
// we take page faults deferred by mmap() into consideration
|
3686
3735
|
model.t_load_us = ggml_time_us() - model.t_start_us;
|
3736
|
+
return true;
|
3687
3737
|
}
|
3688
3738
|
|
3689
|
-
|
3739
|
+
// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
|
3740
|
+
static int llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) {
|
3690
3741
|
try {
|
3691
3742
|
llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);
|
3692
3743
|
|
@@ -3704,19 +3755,21 @@ static bool llama_model_load(const std::string & fname, llama_model & model, con
|
|
3704
3755
|
|
3705
3756
|
if (params.vocab_only) {
|
3706
3757
|
LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
|
3707
|
-
return true;
|
3758
|
+
return 0;
|
3708
3759
|
}
|
3709
3760
|
|
3710
|
-
llm_load_tensors(
|
3761
|
+
if (!llm_load_tensors(
|
3711
3762
|
ml, model, params.n_gpu_layers, params.main_gpu, params.tensor_split, params.use_mlock,
|
3712
3763
|
params.progress_callback, params.progress_callback_user_data
|
3713
|
-
);
|
3764
|
+
)) {
|
3765
|
+
return -2;
|
3766
|
+
}
|
3714
3767
|
} catch (const std::exception & err) {
|
3715
3768
|
LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
|
3716
|
-
return false;
|
3769
|
+
return -1;
|
3717
3770
|
}
|
3718
3771
|
|
3719
|
-
return true;
|
3772
|
+
return 0;
|
3720
3773
|
}
|
3721
3774
|
|
3722
3775
|
//
|
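A note on the new return convention introduced above: llama_model_load now reports 0 on success, -1 on a loader exception and -2 when the progress callback cancels the load. A hedged sketch of how a caller can branch on it (the wrapper name try_load_model is hypothetical):

static bool try_load_model(const std::string & fname, llama_model & model, const llama_model_params & params) {
    const int status = llama_model_load(fname, model, params);
    if (status == -2) {
        // cancelled via llama_progress_callback returning false
        return false;
    }
    return status == 0;
}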
@@ -3981,17 +4034,18 @@ static struct ggml_tensor * llm_build_ffn(
|
|
3981
4034
|
// if max_alibi_bias > 0 then apply ALiBi
|
3982
4035
|
static struct ggml_tensor * llm_build_kqv(
|
3983
4036
|
struct ggml_context * ctx,
|
4037
|
+
const llama_model & model,
|
3984
4038
|
const llama_hparams & hparams,
|
3985
4039
|
const llama_kv_cache & kv,
|
3986
4040
|
struct ggml_tensor * wo,
|
3987
4041
|
struct ggml_tensor * wo_b,
|
3988
4042
|
struct ggml_tensor * q_cur,
|
3989
|
-
struct ggml_tensor * kq_scale,
|
3990
4043
|
struct ggml_tensor * kq_mask,
|
3991
4044
|
int64_t n_ctx,
|
3992
4045
|
int32_t n_tokens,
|
3993
4046
|
int32_t n_kv,
|
3994
4047
|
float max_alibi_bias,
|
4048
|
+
float kq_scale,
|
3995
4049
|
const llm_build_cb & cb,
|
3996
4050
|
int il) {
|
3997
4051
|
const int64_t n_embd = hparams.n_embd;
|
@@ -4014,6 +4068,12 @@ static struct ggml_tensor * llm_build_kqv(
|
|
4014
4068
|
struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
|
4015
4069
|
cb(kq, "kq", il);
|
4016
4070
|
|
4071
|
+
if (model.arch == LLM_ARCH_PHI2) {
|
4072
|
+
// for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
|
4073
|
+
// ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
|
4074
|
+
ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
|
4075
|
+
}
|
4076
|
+
|
4017
4077
|
if (max_alibi_bias > 0.0f) {
|
4018
4078
|
// temporary branch until we figure out how to handle ggml_alibi through ggml_add
|
4019
4079
|
kq = ggml_scale(ctx, kq, kq_scale);
|
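The PHI2 branch above relies on the new per-node precision override in ggml. A minimal sketch of the call pattern, assuming ggml.h from this tree (build_kq_f32 is an illustrative name, not a function in the diff):

static struct ggml_tensor * build_kq_f32(struct ggml_context * ctx, struct ggml_tensor * k, struct ggml_tensor * q) {
    struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
    // request F32 accumulation for this node only; other mul_mat nodes keep the default precision
    ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
    return kq;
}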
@@ -4033,7 +4093,7 @@ static struct ggml_tensor * llm_build_kqv(
|
|
4033
4093
|
kq = ggml_soft_max(ctx, kq);
|
4034
4094
|
cb(kq, "kq_soft_max", il);
|
4035
4095
|
} else {
|
4036
|
-
kq = ggml_soft_max_ext(ctx, kq, kq_mask,
|
4096
|
+
kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale);
|
4037
4097
|
cb(kq, "kq_soft_max_ext", il);
|
4038
4098
|
}
|
4039
4099
|
|
@@ -4102,7 +4162,7 @@ struct llm_build_context {
|
|
4102
4162
|
|
4103
4163
|
const llm_build_cb & cb;
|
4104
4164
|
|
4105
|
-
|
4165
|
+
std::vector<uint8_t> & buf_compute_meta;
|
4106
4166
|
|
4107
4167
|
struct ggml_context * ctx0 = nullptr;
|
4108
4168
|
|
@@ -4112,35 +4172,35 @@ struct llm_build_context {
|
|
4112
4172
|
const llama_batch & batch,
|
4113
4173
|
const llm_build_cb & cb,
|
4114
4174
|
bool worst_case) :
|
4115
|
-
model
|
4116
|
-
hparams
|
4117
|
-
cparams
|
4118
|
-
batch
|
4119
|
-
kv_self
|
4120
|
-
n_embd
|
4121
|
-
n_layer
|
4122
|
-
n_ctx
|
4123
|
-
n_head
|
4124
|
-
n_head_kv
|
4125
|
-
n_embd_head
|
4126
|
-
n_embd_gqa
|
4127
|
-
n_expert
|
4128
|
-
n_expert_used
|
4129
|
-
freq_base
|
4130
|
-
freq_scale
|
4131
|
-
ext_factor
|
4132
|
-
attn_factor
|
4133
|
-
beta_fast
|
4134
|
-
beta_slow
|
4135
|
-
norm_eps
|
4136
|
-
norm_rms_eps
|
4137
|
-
n_tokens
|
4138
|
-
n_kv
|
4139
|
-
kv_head
|
4140
|
-
n_orig_ctx
|
4141
|
-
do_rope_shift
|
4142
|
-
cb
|
4143
|
-
|
4175
|
+
model (lctx.model),
|
4176
|
+
hparams (model.hparams),
|
4177
|
+
cparams (lctx.cparams),
|
4178
|
+
batch (batch),
|
4179
|
+
kv_self (lctx.kv_self),
|
4180
|
+
n_embd (hparams.n_embd),
|
4181
|
+
n_layer (hparams.n_layer),
|
4182
|
+
n_ctx (cparams.n_ctx),
|
4183
|
+
n_head (hparams.n_head),
|
4184
|
+
n_head_kv (hparams.n_head_kv),
|
4185
|
+
n_embd_head (hparams.n_embd_head()),
|
4186
|
+
n_embd_gqa (hparams.n_embd_gqa()),
|
4187
|
+
n_expert (hparams.n_expert),
|
4188
|
+
n_expert_used (hparams.n_expert_used),
|
4189
|
+
freq_base (cparams.rope_freq_base),
|
4190
|
+
freq_scale (cparams.rope_freq_scale),
|
4191
|
+
ext_factor (cparams.yarn_ext_factor),
|
4192
|
+
attn_factor (cparams.yarn_attn_factor),
|
4193
|
+
beta_fast (cparams.yarn_beta_fast),
|
4194
|
+
beta_slow (cparams.yarn_beta_slow),
|
4195
|
+
norm_eps (hparams.f_norm_eps),
|
4196
|
+
norm_rms_eps (hparams.f_norm_rms_eps),
|
4197
|
+
n_tokens (batch.n_tokens),
|
4198
|
+
n_kv (worst_case ? n_ctx : kv_self.n),
|
4199
|
+
kv_head (worst_case ? n_ctx - n_tokens : kv_self.head),
|
4200
|
+
n_orig_ctx (cparams.n_yarn_orig_ctx),
|
4201
|
+
do_rope_shift (worst_case || kv_self.has_shift),
|
4202
|
+
cb (cb),
|
4203
|
+
buf_compute_meta (lctx.buf_compute_meta) {
|
4144
4204
|
GGML_ASSERT(!!kv_self.ctx);
|
4145
4205
|
|
4146
4206
|
// all initializations should be done in init()
|
@@ -4148,8 +4208,8 @@ struct llm_build_context {
|
|
4148
4208
|
|
4149
4209
|
void init() {
|
4150
4210
|
struct ggml_init_params params = {
|
4151
|
-
/*.mem_size   =*/ buf_compute.size,
|
4152
|
-
/*.mem_buffer =*/ buf_compute.data,
|
4211
|
+
/*.mem_size =*/ buf_compute_meta.size(),
|
4212
|
+
/*.mem_buffer =*/ buf_compute_meta.data(),
|
4153
4213
|
/*.no_alloc =*/ true,
|
4154
4214
|
};
|
4155
4215
|
|
@@ -4178,10 +4238,6 @@ struct llm_build_context {
|
|
4178
4238
|
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
4179
4239
|
cb(inp_pos, "inp_pos", -1);
|
4180
4240
|
|
4181
|
-
// KQ_scale
|
4182
|
-
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
4183
|
-
cb(KQ_scale, "KQ_scale", -1);
|
4184
|
-
|
4185
4241
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
4186
4242
|
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
4187
4243
|
cb(KQ_mask, "KQ_mask", -1);
|
@@ -4240,9 +4296,9 @@ struct llm_build_context {
|
|
4240
4296
|
|
4241
4297
|
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
|
4242
4298
|
|
4243
|
-
cur = llm_build_kqv(ctx0, hparams, kv_self,
|
4299
|
+
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
|
4244
4300
|
model.layers[il].wo, model.layers[il].bo,
|
4245
|
-
Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
|
4301
|
+
Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
4246
4302
|
cb(cur, "kqv_out", il);
|
4247
4303
|
}
|
4248
4304
|
|
@@ -4363,10 +4419,6 @@ struct llm_build_context {
|
|
4363
4419
|
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
4364
4420
|
cb(inp_pos, "inp_pos", -1);
|
4365
4421
|
|
4366
|
-
// KQ_scale
|
4367
|
-
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
4368
|
-
cb(KQ_scale, "KQ_scale", -1);
|
4369
|
-
|
4370
4422
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
4371
4423
|
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
4372
4424
|
cb(KQ_mask, "KQ_mask", -1);
|
@@ -4423,9 +4475,9 @@ struct llm_build_context {
|
|
4423
4475
|
// apply ALiBi for 13B model
|
4424
4476
|
const float max_alibi_bias = model.type == MODEL_13B ? 8.0f : -1.0f;
|
4425
4477
|
|
4426
|
-
cur = llm_build_kqv(ctx0, hparams, kv_self,
|
4478
|
+
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
|
4427
4479
|
model.layers[il].wo, NULL,
|
4428
|
-
Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, cb, il);
|
4480
|
+
Qcur, KQ_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
4429
4481
|
cb(cur, "kqv_out", il);
|
4430
4482
|
}
|
4431
4483
|
|
@@ -4483,10 +4535,6 @@ struct llm_build_context {
|
|
4483
4535
|
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
4484
4536
|
cb(inp_pos, "inp_pos", -1);
|
4485
4537
|
|
4486
|
-
// KQ_scale
|
4487
|
-
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
4488
|
-
cb(KQ_scale, "KQ_scale", -1);
|
4489
|
-
|
4490
4538
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
4491
4539
|
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
4492
4540
|
cb(KQ_mask, "KQ_mask", -1);
|
@@ -4547,9 +4595,9 @@ struct llm_build_context {
|
|
4547
4595
|
|
4548
4596
|
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
|
4549
4597
|
|
4550
|
-
cur = llm_build_kqv(ctx0, hparams, kv_self,
|
4598
|
+
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
|
4551
4599
|
model.layers[il].wo, NULL,
|
4552
|
-
Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
|
4600
|
+
Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
4553
4601
|
cb(cur, "kqv_out", il);
|
4554
4602
|
}
|
4555
4603
|
|
@@ -4606,10 +4654,6 @@ struct llm_build_context {
|
|
4606
4654
|
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
4607
4655
|
cb(inp_pos, "inp_pos", -1);
|
4608
4656
|
|
4609
|
-
// KQ_scale
|
4610
|
-
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
4611
|
-
cb(KQ_scale, "KQ_scale", -1);
|
4612
|
-
|
4613
4657
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
4614
4658
|
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
4615
4659
|
cb(KQ_mask, "KQ_mask", -1);
|
@@ -4647,9 +4691,9 @@ struct llm_build_context {
|
|
4647
4691
|
|
4648
4692
|
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
|
4649
4693
|
|
4650
|
-
cur = llm_build_kqv(ctx0, hparams, kv_self,
|
4694
|
+
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
|
4651
4695
|
model.layers[il].wo, model.layers[il].bo,
|
4652
|
-
Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
|
4696
|
+
Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
4653
4697
|
cb(cur, "kqv_out", il);
|
4654
4698
|
}
|
4655
4699
|
|
@@ -4706,10 +4750,6 @@ struct llm_build_context {
|
|
4706
4750
|
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
4707
4751
|
cb(inp_pos, "inp_pos", -1);
|
4708
4752
|
|
4709
|
-
// KQ_scale
|
4710
|
-
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
4711
|
-
cb(KQ_scale, "KQ_scale", -1);
|
4712
|
-
|
4713
4753
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
4714
4754
|
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
4715
4755
|
cb(KQ_mask, "KQ_mask", -1);
|
@@ -4856,9 +4896,9 @@ struct llm_build_context {
|
|
4856
4896
|
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
|
4857
4897
|
|
4858
4898
|
// TODO: not tested, could be broken
|
4859
|
-
cur = llm_build_kqv(ctx0, hparams, kv_self,
|
4899
|
+
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
|
4860
4900
|
model.layers[il].wo, model.layers[il].bo,
|
4861
|
-
Q, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
|
4901
|
+
Q, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
4862
4902
|
cb(cur, "kqv_out", il);
|
4863
4903
|
}
|
4864
4904
|
|
@@ -4912,10 +4952,6 @@ struct llm_build_context {
|
|
4912
4952
|
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
|
4913
4953
|
cb(inpL, "inp_embd", -1);
|
4914
4954
|
|
4915
|
-
// KQ_scale
|
4916
|
-
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
4917
|
-
cb(KQ_scale, "KQ_scale", -1);
|
4918
|
-
|
4919
4955
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
4920
4956
|
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
4921
4957
|
cb(KQ_mask, "KQ_mask", -1);
|
@@ -4947,9 +4983,9 @@ struct llm_build_context {
|
|
4947
4983
|
|
4948
4984
|
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
|
4949
4985
|
|
4950
|
-
cur = llm_build_kqv(ctx0, hparams, kv_self,
|
4986
|
+
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
|
4951
4987
|
model.layers[il].wo, NULL,
|
4952
|
-
Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, cb, il);
|
4988
|
+
Qcur, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
4953
4989
|
cb(cur, "kqv_out", il);
|
4954
4990
|
}
|
4955
4991
|
|
@@ -5003,10 +5039,6 @@ struct llm_build_context {
|
|
5003
5039
|
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
|
5004
5040
|
cb(inpL, "inp_embd", -1);
|
5005
5041
|
|
5006
|
-
// KQ_scale
|
5007
|
-
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
5008
|
-
cb(KQ_scale, "KQ_scale", -1);
|
5009
|
-
|
5010
5042
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
5011
5043
|
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
5012
5044
|
cb(KQ_mask, "KQ_mask", -1);
|
@@ -5044,9 +5076,9 @@ struct llm_build_context {
|
|
5044
5076
|
|
5045
5077
|
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
|
5046
5078
|
|
5047
|
-
cur = llm_build_kqv(ctx0, hparams, kv_self,
|
5079
|
+
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
|
5048
5080
|
model.layers[il].wo, model.layers[il].bo,
|
5049
|
-
Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, cb, il);
|
5081
|
+
Qcur, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
5050
5082
|
cb(cur, "kqv_out", il);
|
5051
5083
|
}
|
5052
5084
|
|
@@ -5097,10 +5129,6 @@ struct llm_build_context {
|
|
5097
5129
|
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
|
5098
5130
|
cb(inpL, "inp_embd", -1);
|
5099
5131
|
|
5100
|
-
// KQ_scale
|
5101
|
-
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
5102
|
-
cb(KQ_scale, "KQ_scale", -1);
|
5103
|
-
|
5104
5132
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
5105
5133
|
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
5106
5134
|
cb(KQ_mask, "KQ_mask", -1);
|
@@ -5138,9 +5166,9 @@ struct llm_build_context {
|
|
5138
5166
|
|
5139
5167
|
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
|
5140
5168
|
|
5141
|
-
cur = llm_build_kqv(ctx0, hparams, kv_self,
|
5169
|
+
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
|
5142
5170
|
model.layers[il].wo, NULL,
|
5143
|
-
Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, hparams.f_max_alibi_bias, cb, il);
|
5171
|
+
Qcur, KQ_mask, n_ctx, n_tokens, n_kv, hparams.f_max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
5144
5172
|
cb(cur, "kqv_out", il);
|
5145
5173
|
}
|
5146
5174
|
|
@@ -5200,10 +5228,6 @@ struct llm_build_context {
|
|
5200
5228
|
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
5201
5229
|
cb(inp_pos, "inp_pos", -1);
|
5202
5230
|
|
5203
|
-
// KQ_scale
|
5204
|
-
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
5205
|
-
cb(KQ_scale, "KQ_scale", -1);
|
5206
|
-
|
5207
5231
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
5208
5232
|
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
5209
5233
|
cb(KQ_mask, "KQ_mask", -1);
|
@@ -5251,9 +5275,9 @@ struct llm_build_context {
|
|
5251
5275
|
|
5252
5276
|
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
|
5253
5277
|
|
5254
|
-
cur = llm_build_kqv(ctx0, hparams, kv_self,
|
5278
|
+
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
|
5255
5279
|
model.layers[il].wo, NULL,
|
5256
|
-
Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
|
5280
|
+
Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
5257
5281
|
cb(cur, "kqv_out", il);
|
5258
5282
|
}
|
5259
5283
|
|
@@ -5310,15 +5334,11 @@ struct llm_build_context {
|
|
5310
5334
|
cb(inpL, "inp_embd", -1);
|
5311
5335
|
|
5312
5336
|
// inp_pos - contains the positions
|
5313
|
-
struct ggml_tensor * inp_pos= ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
5337
|
+
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
5314
5338
|
cb(inp_pos, "inp_pos", -1);
|
5315
5339
|
|
5316
|
-
// KQ_scale
|
5317
|
-
struct ggml_tensor * KQ_scale= ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
5318
|
-
cb(KQ_scale, "KQ_scale", -1);
|
5319
|
-
|
5320
5340
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
5321
|
-
struct ggml_tensor * KQ_mask= ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
5341
|
+
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
5322
5342
|
cb(KQ_mask, "KQ_mask", -1);
|
5323
5343
|
|
5324
5344
|
// shift the entire K-cache if needed
|
@@ -5368,9 +5388,9 @@ struct llm_build_context {
|
|
5368
5388
|
|
5369
5389
|
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
|
5370
5390
|
|
5371
|
-
cur = llm_build_kqv(ctx0, hparams, kv_self,
|
5391
|
+
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
|
5372
5392
|
model.layers[il].wo, NULL,
|
5373
|
-
Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
|
5393
|
+
Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
5374
5394
|
cb(cur, "kqv_out", il);
|
5375
5395
|
}
|
5376
5396
|
|
@@ -5412,6 +5432,116 @@ struct llm_build_context {
|
|
5412
5432
|
|
5413
5433
|
ggml_build_forward_expand(gf, cur);
|
5414
5434
|
|
5435
|
+
return gf;
|
5436
|
+
}
|
5437
|
+
struct ggml_cgraph * build_phi2() {
|
5438
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
5439
|
+
|
5440
|
+
struct ggml_tensor * cur;
|
5441
|
+
struct ggml_tensor * attn_norm_output;
|
5442
|
+
struct ggml_tensor * ffn_output;
|
5443
|
+
struct ggml_tensor * inpL;
|
5444
|
+
|
5445
|
+
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
|
5446
|
+
cb(inpL, "inp_embd", -1);
|
5447
|
+
|
5448
|
+
// inp_pos - contains the positions
|
5449
|
+
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
5450
|
+
cb(inp_pos, "inp_pos", -1);
|
5451
|
+
|
5452
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
5453
|
+
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
5454
|
+
cb(KQ_mask, "KQ_mask", -1);
|
5455
|
+
|
5456
|
+
// shift the entire K-cache if needed
|
5457
|
+
if (do_rope_shift) {
|
5458
|
+
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb);
|
5459
|
+
}
|
5460
|
+
|
5461
|
+
for (int il = 0; il < n_layer; ++il) {
|
5462
|
+
attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
|
5463
|
+
model.layers[il].attn_norm,
|
5464
|
+
model.layers[il].attn_norm_b,
|
5465
|
+
LLM_NORM, cb, il);
|
5466
|
+
cb(attn_norm_output, "attn_norm", il);
|
5467
|
+
|
5468
|
+
// self-attention
|
5469
|
+
{
|
5470
|
+
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output);
|
5471
|
+
cb(cur, "wqkv", il);
|
5472
|
+
|
5473
|
+
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
5474
|
+
cb(cur, "bqkv", il);
|
5475
|
+
|
5476
|
+
struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
|
5477
|
+
struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
|
5478
|
+
struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
|
5479
|
+
|
5480
|
+
cb(Qcur, "Qcur", il);
|
5481
|
+
cb(Kcur, "Kcur", il);
|
5482
|
+
cb(Vcur, "Vcur", il);
|
5483
|
+
|
5484
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
5485
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
5486
|
+
|
5487
|
+
Qcur = ggml_rope_custom(
|
5488
|
+
ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
|
5489
|
+
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
5490
|
+
);
|
5491
|
+
cb(Qcur, "Qcur", il);
|
5492
|
+
|
5493
|
+
// with phi2, we scale the Q to avoid precision issues
|
5494
|
+
// ref: https://github.com/ml-explore/mlx-examples/blob/08e862336ade809bc37d1035f94b359e7d1a5152/phi2/phi2.py#L64-L66
|
5495
|
+
Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head)));
|
5496
|
+
cb(Qcur, "Qcur", il);
|
5497
|
+
|
5498
|
+
Kcur = ggml_rope_custom(
|
5499
|
+
ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
|
5500
|
+
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
5501
|
+
);
|
5502
|
+
cb(Kcur, "Kcur", il);
|
5503
|
+
|
5504
|
+
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
|
5505
|
+
|
5506
|
+
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
|
5507
|
+
model.layers[il].wo, model.layers[il].bo,
|
5508
|
+
Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f, cb, il);
|
5509
|
+
cb(cur, "kqv_out", il);
|
5510
|
+
}
|
5511
|
+
|
5512
|
+
// FF
|
5513
|
+
{
|
5514
|
+
ffn_output = llm_build_ffn(ctx0, attn_norm_output,
|
5515
|
+
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
|
5516
|
+
NULL, NULL,
|
5517
|
+
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
5518
|
+
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
5519
|
+
cb(ffn_output, "ffn_out", il);
|
5520
|
+
}
|
5521
|
+
|
5522
|
+
cur = ggml_add(ctx0, cur, ffn_output);
|
5523
|
+
cb(cur, "l_out", il);
|
5524
|
+
|
5525
|
+
cur = ggml_add(ctx0, cur, inpL);
|
5526
|
+
cb(cur, "l_out", il);
|
5527
|
+
|
5528
|
+
inpL = cur;
|
5529
|
+
}
|
5530
|
+
|
5531
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
5532
|
+
model.output_norm,
|
5533
|
+
model.output_norm_b,
|
5534
|
+
LLM_NORM, cb, -1);
|
5535
|
+
cb(cur, "result_norm", -1);
|
5536
|
+
|
5537
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
5538
|
+
cb(cur, "result_output_no_bias", -1);
|
5539
|
+
|
5540
|
+
cur = ggml_add(ctx0, cur, model.output_b);
|
5541
|
+
cb(cur, "result_output", -1);
|
5542
|
+
|
5543
|
+
ggml_build_forward_expand(gf, cur);
|
5544
|
+
|
5415
5545
|
return gf;
|
5416
5546
|
}
|
5417
5547
|
};
|
@@ -5427,7 +5557,7 @@ enum llm_offload_func_e {
|
|
5427
5557
|
OFFLOAD_FUNC_FRC, // force offload
|
5428
5558
|
OFFLOAD_FUNC_KQV,
|
5429
5559
|
OFFLOAD_FUNC_NR,
|
5430
|
-
OFFLOAD_FUNC_EMB,
|
5560
|
+
OFFLOAD_FUNC_EMB, // embeddings
|
5431
5561
|
OFFLOAD_FUNC_OUT,
|
5432
5562
|
};
|
5433
5563
|
|
@@ -5512,7 +5642,6 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
|
|
5512
5642
|
{ "pos_embd", OFFLOAD_FUNC_NR },
|
5513
5643
|
|
5514
5644
|
{ "inp_pos", OFFLOAD_FUNC_FRC }, // this is often used for KQ ops (e.g. rope)
|
5515
|
-
{ "KQ_scale", OFFLOAD_FUNC_FRC },
|
5516
5645
|
{ "KQ_mask", OFFLOAD_FUNC_FRC },
|
5517
5646
|
{ "K_shift", OFFLOAD_FUNC_FRC },
|
5518
5647
|
|
@@ -5596,6 +5725,7 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
|
|
5596
5725
|
{ "l_out", OFFLOAD_FUNC },
|
5597
5726
|
|
5598
5727
|
{ "result_norm", OFFLOAD_FUNC_EMB },
|
5728
|
+
{ "result_output_no_bias", OFFLOAD_FUNC_EMB },
|
5599
5729
|
{ "result_output", OFFLOAD_FUNC_OUT },
|
5600
5730
|
};
|
5601
5731
|
|
@@ -5613,11 +5743,10 @@ static struct ggml_cgraph * llama_build_graph(
|
|
5613
5743
|
bool alloc_inp_tokens = false;
|
5614
5744
|
bool alloc_inp_embd = false;
|
5615
5745
|
bool alloc_inp_pos = false;
|
5616
|
-
bool alloc_inp_KQ_scale = false;
|
5617
5746
|
bool alloc_inp_KQ_mask = false;
|
5618
5747
|
bool alloc_inp_K_shift = false;
|
5619
5748
|
|
5620
|
-
#ifdef GGML_USE_CUBLAS
|
5749
|
+
#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
|
5621
5750
|
const bool do_offload = true;
|
5622
5751
|
#else
|
5623
5752
|
const bool do_offload = true; // TODO: set to false after finishing refactoring
|
@@ -5645,7 +5774,7 @@ static struct ggml_cgraph * llama_build_graph(
|
|
5645
5774
|
if (!ggml_allocr_is_measure(lctx.alloc) && batch.token) {
|
5646
5775
|
const int64_t n_tokens = cur->ne[0];
|
5647
5776
|
|
5648
|
-
|
5777
|
+
ggml_backend_tensor_set(cur, batch.token, 0, n_tokens*ggml_element_size(cur));
|
5649
5778
|
}
|
5650
5779
|
|
5651
5780
|
alloc_inp_tokens = true;
|
@@ -5658,7 +5787,7 @@ static struct ggml_cgraph * llama_build_graph(
|
|
5658
5787
|
const int64_t n_embd = cur->ne[0];
|
5659
5788
|
const int64_t n_tokens = cur->ne[1];
|
5660
5789
|
|
5661
|
-
|
5790
|
+
ggml_backend_tensor_set(cur, batch.embd, 0, n_tokens*n_embd*ggml_element_size(cur));
|
5662
5791
|
}
|
5663
5792
|
|
5664
5793
|
alloc_inp_embd = true;
|
@@ -5670,27 +5799,13 @@ static struct ggml_cgraph * llama_build_graph(
|
|
5670
5799
|
if (!ggml_allocr_is_measure(lctx.alloc) && batch.pos) {
|
5671
5800
|
const int64_t n_tokens = cur->ne[0];
|
5672
5801
|
|
5673
|
-
int32_t * data = (int32_t *) cur->data;
|
5674
|
-
|
5675
|
-
for (int i = 0; i < n_tokens; ++i) {
|
5676
|
-
data[i] = batch.pos[i];
|
5677
|
-
}
|
5802
|
+
static_assert(std::is_same<llama_pos, int32_t>::value, "llama_pos must be int32_t");
|
5803
|
+
ggml_backend_tensor_set(cur, batch.pos, 0, n_tokens*ggml_element_size(cur));
|
5678
5804
|
}
|
5679
5805
|
|
5680
5806
|
alloc_inp_pos = true;
|
5681
5807
|
}
|
5682
5808
|
|
5683
|
-
if (!alloc_inp_KQ_scale && strcmp(name, "KQ_scale") == 0) {
|
5684
|
-
ggml_allocr_alloc(lctx.alloc, cur);
|
5685
|
-
|
5686
|
-
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
5687
|
-
const int64_t n_embd_head = model.hparams.n_embd_head();
|
5688
|
-
ggml_set_f32(cur, 1.0f/sqrtf(float(n_embd_head)));
|
5689
|
-
}
|
5690
|
-
|
5691
|
-
alloc_inp_KQ_scale = true;
|
5692
|
-
}
|
5693
|
-
|
5694
5809
|
if (!alloc_inp_KQ_mask && strcmp(name, "KQ_mask") == 0) {
|
5695
5810
|
ggml_allocr_alloc(lctx.alloc, cur);
|
5696
5811
|
|
@@ -5698,8 +5813,13 @@ static struct ggml_cgraph * llama_build_graph(
|
|
5698
5813
|
const int64_t n_kv = cur->ne[0];
|
5699
5814
|
const int64_t n_tokens = cur->ne[1];
|
5700
5815
|
|
5701
|
-
float * data = (float *) cur->data;
|
5702
|
-
|
5816
|
+
float * data;
|
5817
|
+
if (ggml_backend_buffer_is_host(cur->buffer)) {
|
5818
|
+
data = (float *) cur->data;
|
5819
|
+
} else {
|
5820
|
+
lctx.buf_copy.resize(ggml_nbytes(cur));
|
5821
|
+
data = (float *) lctx.buf_copy.data();
|
5822
|
+
}
|
5703
5823
|
|
5704
5824
|
for (int h = 0; h < 1; ++h) {
|
5705
5825
|
for (int j = 0; j < n_tokens; ++j) {
|
@@ -5707,12 +5827,20 @@ static struct ggml_cgraph * llama_build_graph(
|
|
5707
5827
|
const llama_seq_id seq_id = batch.seq_id[j][0];
|
5708
5828
|
|
5709
5829
|
for (int i = 0; i < n_kv; ++i) {
|
5830
|
+
float f;
|
5710
5831
|
if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
|
5711
|
-
|
5832
|
+
f = -INFINITY;
|
5833
|
+
} else {
|
5834
|
+
f = 0;
|
5712
5835
|
}
|
5836
|
+
data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
|
5713
5837
|
}
|
5714
5838
|
}
|
5715
5839
|
}
|
5840
|
+
|
5841
|
+
if (data != cur->data) {
|
5842
|
+
ggml_backend_tensor_set(cur, data, 0, ggml_nbytes(cur));
|
5843
|
+
}
|
5716
5844
|
}
|
5717
5845
|
|
5718
5846
|
alloc_inp_KQ_mask = true;
|
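The KQ_mask handling above follows a general pattern for graph inputs with ggml-backend: write in place when the tensor's buffer is host-visible, otherwise fill a staging vector and upload it. A sketch under those assumptions (set_input_f32 and the std::function callback are illustrative, not part of the diff):

#include <cstdint>
#include <functional>
#include <vector>
#include "ggml.h"
#include "ggml-backend.h"

static void set_input_f32(struct ggml_tensor * t, std::vector<uint8_t> & scratch, const std::function<void(float *)> & fill) {
    float * data;
    if (ggml_backend_buffer_is_host(t->buffer)) {
        data = (float *) t->data;       // write directly into the mapped buffer
    } else {
        scratch.resize(ggml_nbytes(t)); // stage in host memory first
        data = (float *) scratch.data();
    }
    fill(data);
    if ((void *) data != t->data) {
        ggml_backend_tensor_set(t, data, 0, ggml_nbytes(t)); // upload to the device buffer
    }
}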
@@ -5724,11 +5852,21 @@ static struct ggml_cgraph * llama_build_graph(
|
|
5724
5852
|
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
5725
5853
|
const int64_t n_ctx = cur->ne[0];
|
5726
5854
|
|
5727
|
-
int32_t * data = (int32_t *) cur->data;
|
5855
|
+
int32_t * data;
|
5856
|
+
if (ggml_backend_buffer_is_host(cur->buffer)) {
|
5857
|
+
data = (int32_t *) cur->data;
|
5858
|
+
} else {
|
5859
|
+
lctx.buf_copy.resize(ggml_nbytes(cur));
|
5860
|
+
data = (int32_t *) lctx.buf_copy.data();
|
5861
|
+
}
|
5728
5862
|
|
5729
5863
|
for (int i = 0; i < n_ctx; ++i) {
|
5730
5864
|
data[i] = lctx.kv_self.cells[i].delta;
|
5731
5865
|
}
|
5866
|
+
|
5867
|
+
if (data != cur->data) {
|
5868
|
+
ggml_backend_tensor_set(cur, data, 0, ggml_nbytes(cur));
|
5869
|
+
}
|
5732
5870
|
}
|
5733
5871
|
|
5734
5872
|
alloc_inp_K_shift = true;
|
@@ -5765,7 +5903,7 @@ static struct ggml_cgraph * llama_build_graph(
|
|
5765
5903
|
static const std::unordered_map<llm_offload_func_e, std::string, std::hash<int>> k_offload_func_name = {
|
5766
5904
|
{ OFFLOAD_FUNC_NOP, "CPU" },
|
5767
5905
|
{ OFFLOAD_FUNC_OUT, "CPU" },
|
5768
|
-
#ifdef GGML_USE_CUBLAS
|
5906
|
+
#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
|
5769
5907
|
{ OFFLOAD_FUNC, "GPU (CUDA)" },
|
5770
5908
|
{ OFFLOAD_FUNC_FRC, "GPU (CUDA) FRC" },
|
5771
5909
|
{ OFFLOAD_FUNC_KQV, "GPU (CUDA) KQV" },
|
@@ -5838,7 +5976,7 @@ static struct ggml_cgraph * llama_build_graph(
|
|
5838
5976
|
offload_func_t func = ggml_offload_nop;
|
5839
5977
|
|
5840
5978
|
// this is needed for compatibility with Metal for example
|
5841
|
-
#ifdef GGML_USE_CUBLAS
|
5979
|
+
#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
|
5842
5980
|
static offload_func_t ggml_offload_gpu = ggml_cuda_assign_buffers_no_alloc;
|
5843
5981
|
#else
|
5844
5982
|
static offload_func_t ggml_offload_gpu = ggml_offload_nop;
|
@@ -5912,6 +6050,10 @@ static struct ggml_cgraph * llama_build_graph(
|
|
5912
6050
|
{
|
5913
6051
|
result = llm.build_qwen();
|
5914
6052
|
} break;
|
6053
|
+
case LLM_ARCH_PHI2:
|
6054
|
+
{
|
6055
|
+
result = llm.build_phi2();
|
6056
|
+
} break;
|
5915
6057
|
default:
|
5916
6058
|
GGML_ASSERT(false);
|
5917
6059
|
}
|
@@ -6045,18 +6187,23 @@ static int llama_decode_internal(
|
|
6045
6187
|
|
6046
6188
|
ggml_allocr_alloc_graph(lctx.alloc, gf);
|
6047
6189
|
|
6048
|
-
|
6049
|
-
struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
|
6050
|
-
|
6051
|
-
GGML_ASSERT(strcmp(res->name, "result_output") == 0);
|
6052
|
-
GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
|
6190
|
+
// the output is always the last tensor in the graph
|
6191
|
+
struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
|
6192
|
+
GGML_ASSERT(strcmp(res->name, "result_output") == 0);
|
6053
6193
|
|
6194
|
+
// the embeddings could be the second to last tensor, or the third to last tensor
|
6195
|
+
struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
|
6196
|
+
if (strcmp(embeddings->name, "result_norm") != 0) {
|
6197
|
+
embeddings = gf->nodes[gf->n_nodes - 3];
|
6198
|
+
GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
|
6199
|
+
}
|
6054
6200
|
|
6055
|
-
#ifdef GGML_USE_CUBLAS
|
6201
|
+
#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
|
6202
|
+
char * buf_alloc_base = (char *)ggml_backend_buffer_get_base(lctx.buf_alloc);
|
6056
6203
|
for (int i = 0; i < gf->n_leafs; i++) {
|
6057
6204
|
ggml_tensor * node = gf->leafs[i];
|
6058
6205
|
if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) {
|
6059
|
-
ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) lctx.buf_alloc.data);
|
6206
|
+
ggml_cuda_assign_scratch_offset(node, (char *)node->data - buf_alloc_base);
|
6060
6207
|
ggml_cuda_copy_to_device(node);
|
6061
6208
|
}
|
6062
6209
|
}
|
@@ -6064,7 +6211,7 @@ static int llama_decode_internal(
|
|
6064
6211
|
for (int i = 0; i < gf->n_nodes; i++) {
|
6065
6212
|
ggml_tensor * node = gf->nodes[i];
|
6066
6213
|
if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) {
|
6067
|
-
ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) lctx.buf_alloc.data);
|
6214
|
+
ggml_cuda_assign_scratch_offset(node, (char *)node->data - buf_alloc_base);
|
6068
6215
|
}
|
6069
6216
|
}
|
6070
6217
|
|
@@ -6091,23 +6238,23 @@ static int llama_decode_internal(
|
|
6091
6238
|
n_threads = 1;
|
6092
6239
|
}
|
6093
6240
|
|
6094
|
-
#if GGML_USE_MPI
|
6241
|
+
#ifdef GGML_USE_MPI
|
6095
6242
|
const int64_t n_layer = hparams.n_layer;
|
6096
6243
|
ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
|
6097
6244
|
#endif
|
6098
6245
|
|
6099
6246
|
#ifdef GGML_USE_METAL
|
6100
|
-
if (lctx.ctx_metal) {
|
6101
|
-
|
6102
|
-
ggml_metal_graph_compute(lctx.ctx_metal, gf);
|
6103
|
-
} else {
|
6104
|
-
ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
|
6247
|
+
if (ggml_backend_is_metal(lctx.backend)) {
|
6248
|
+
ggml_backend_metal_set_n_cb(lctx.backend, n_threads);
|
6105
6249
|
}
|
6106
|
-
#else
|
6107
|
-
ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
|
6108
6250
|
#endif
|
6109
6251
|
|
6110
|
-
|
6252
|
+
if (ggml_backend_is_cpu(lctx.backend)) {
|
6253
|
+
ggml_backend_cpu_set_n_threads(lctx.backend, n_threads);
|
6254
|
+
}
|
6255
|
+
ggml_backend_graph_compute(lctx.backend, gf);
|
6256
|
+
|
6257
|
+
#ifdef GGML_USE_MPI
|
6111
6258
|
ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
|
6112
6259
|
#endif
|
6113
6260
|
|
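The compute path above replaces the Metal/CPU special cases with a single backend call. A sketch of that dispatch, assuming ggml-backend.h (plus ggml-metal.h when GGML_USE_METAL is defined); compute_graph is an illustrative name:

static void compute_graph(ggml_backend_t backend, struct ggml_cgraph * gf, int n_threads) {
#ifdef GGML_USE_METAL
    if (ggml_backend_is_metal(backend)) {
        ggml_backend_metal_set_n_cb(backend, n_threads); // Metal uses command buffers, not threads
    }
#endif
    if (ggml_backend_is_cpu(backend)) {
        ggml_backend_cpu_set_n_threads(backend, n_threads);
    }
    ggml_backend_graph_compute(backend, gf);
}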
@@ -6145,20 +6292,37 @@ static int llama_decode_internal(
|
|
6145
6292
|
{
|
6146
6293
|
auto & logits_out = lctx.logits;
|
6147
6294
|
|
6295
|
+
#ifndef NDEBUG
|
6296
|
+
auto & logits_valid = lctx.logits_valid;
|
6297
|
+
logits_valid.clear();
|
6298
|
+
logits_valid.resize(n_tokens);
|
6299
|
+
|
6300
|
+
logits_out.clear();
|
6301
|
+
#endif
|
6302
|
+
|
6148
6303
|
if (batch.logits) {
|
6149
6304
|
logits_out.resize(n_vocab * n_tokens);
|
6150
6305
|
for (uint32_t i = 0; i < n_tokens; i++) {
|
6151
6306
|
if (batch.logits[i] == 0) {
|
6152
6307
|
continue;
|
6153
6308
|
}
|
6154
|
-
|
6309
|
+
ggml_backend_tensor_get(res, logits_out.data() + (n_vocab*i), (n_vocab*i)*sizeof(float), n_vocab*sizeof(float));
|
6310
|
+
#ifndef NDEBUG
|
6311
|
+
logits_valid[i] = true;
|
6312
|
+
#endif
|
6155
6313
|
}
|
6156
6314
|
} else if (lctx.logits_all) {
|
6157
6315
|
logits_out.resize(n_vocab * n_tokens);
|
6158
|
-
|
6316
|
+
ggml_backend_tensor_get(res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float));
|
6317
|
+
#ifndef NDEBUG
|
6318
|
+
std::fill(logits_valid.begin(), logits_valid.end(), true);
|
6319
|
+
#endif
|
6159
6320
|
} else {
|
6160
6321
|
logits_out.resize(n_vocab);
|
6161
|
-
|
6322
|
+
ggml_backend_tensor_get(res, logits_out.data(), (n_vocab*(n_tokens - 1))*sizeof(float), n_vocab*sizeof(float));
|
6323
|
+
#ifndef NDEBUG
|
6324
|
+
logits_valid[0] = true;
|
6325
|
+
#endif
|
6162
6326
|
}
|
6163
6327
|
}
|
6164
6328
|
|
@@ -6167,7 +6331,7 @@ static int llama_decode_internal(
|
|
6167
6331
|
auto & embedding_out = lctx.embedding;
|
6168
6332
|
|
6169
6333
|
embedding_out.resize(n_embd);
|
6170
|
-
|
6334
|
+
ggml_backend_tensor_get(embeddings, embedding_out.data(), (n_embd*(n_tokens - 1))*sizeof(float), n_embd*sizeof(float));
|
6171
6335
|
}
|
6172
6336
|
|
6173
6337
|
// measure the performance only for the single-token evals
|
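Logits and embeddings are now read back through ggml_backend_tensor_get rather than a memcpy from the tensor's data pointer. A small sketch of extracting one row of an [n_vocab, n_tokens] F32 result (read_logits_row is an illustrative helper, not in the diff):

static void read_logits_row(const struct ggml_tensor * res, float * dst, int64_t n_vocab, int64_t row) {
    // byte offset of the requested row, then copy exactly one row's worth of floats
    ggml_backend_tensor_get(res, dst, row*n_vocab*sizeof(float), n_vocab*sizeof(float));
}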
@@ -8125,12 +8289,6 @@ void llama_beam_search(llama_context * ctx,
|
|
8125
8289
|
// quantization
|
8126
8290
|
//
|
8127
8291
|
|
8128
|
-
template <typename T>
|
8129
|
-
struct no_init {
|
8130
|
-
T value;
|
8131
|
-
no_init() { /* do nothing */ }
|
8132
|
-
};
|
8133
|
-
|
8134
8292
|
struct quantize_state_internal {
|
8135
8293
|
const llama_model & model;
|
8136
8294
|
const llama_model_quantize_params * params;
|
@@ -8373,9 +8531,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
8373
8531
|
#endif
|
8374
8532
|
|
8375
8533
|
llama_model_loader ml(fname_inp, use_mmap, NULL);
|
8376
|
-
|
8377
|
-
ml.mapping.reset(new llama_mmap(&ml.file, /* prefetch */ 0, ggml_is_numa()));
|
8378
|
-
}
|
8534
|
+
ml.init_mapping(false); // no prefetching?
|
8379
8535
|
|
8380
8536
|
llama_model model;
|
8381
8537
|
llm_load_arch(ml, model);
|
@@ -8621,74 +8777,63 @@ static int llama_apply_lora_from_file_internal(
|
|
8621
8777
|
|
8622
8778
|
const int64_t t_start_lora_us = ggml_time_us();
|
8623
8779
|
|
8624
|
-
|
8625
|
-
if (!fin) {
|
8626
|
-
LLAMA_LOG_ERROR("%s: failed to open '%s'\n", __func__, path_lora);
|
8627
|
-
return 1;
|
8628
|
-
}
|
8780
|
+
llama_file fin(path_lora, "rb");
|
8629
8781
|
|
8630
8782
|
// verify magic and version
|
8631
8783
|
{
|
8632
|
-
uint32_t magic;
|
8633
|
-
|
8634
|
-
|
8635
|
-
|
8784
|
+
uint32_t magic = fin.read_u32();
|
8785
|
+
if (magic != LLAMA_FILE_MAGIC_GGLA) {
|
8786
|
+
LLAMA_LOG_ERROR("%s: bad file magic\n", __func__);
|
8787
|
+
return 1;
|
8788
|
+
}
|
8636
8789
|
|
8790
|
+
uint32_t format_version = fin.read_u32();
|
8637
8791
|
if (format_version != 1) {
|
8638
8792
|
LLAMA_LOG_ERROR("%s: unsupported file version\n", __func__ );
|
8639
8793
|
return 1;
|
8640
8794
|
}
|
8641
8795
|
}
|
8642
8796
|
|
8643
|
-
int32_t lora_r;
|
8644
|
-
int32_t lora_alpha;
|
8645
|
-
fin.read((char *) &lora_r, sizeof(lora_r));
|
8646
|
-
fin.read((char *) &lora_alpha, sizeof(lora_alpha));
|
8797
|
+
int32_t lora_r = fin.read_u32();
|
8798
|
+
int32_t lora_alpha = fin.read_u32();
|
8647
8799
|
float scaling = scale * (float)lora_alpha / (float)lora_r;
|
8648
8800
|
|
8649
8801
|
LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
|
8650
8802
|
|
8803
|
+
// create a name -> tensor map of the model to accelerate lookups
|
8804
|
+
// find the max tensor size to estimate the required temporary buffer size
|
8805
|
+
size_t max_tensor_size = 0;
|
8806
|
+
std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
|
8807
|
+
for (const auto & kv : model.tensors_by_name) {
|
8808
|
+
model_tensors.insert(kv);
|
8809
|
+
size_t f32_size = ggml_nelements(kv.second) * sizeof(float);
|
8810
|
+
max_tensor_size = std::max(max_tensor_size, f32_size);
|
8811
|
+
}
|
8812
|
+
|
8651
8813
|
// create a temporary ggml context to store the lora tensors
|
8652
|
-
//
|
8653
|
-
|
8814
|
+
// TODO: use ggml-alloc
|
8815
|
+
size_t lora_ctx_size = max_tensor_size * 3;
|
8816
|
+
LLAMA_LOG_INFO("%s: allocating %.f MB for lora temporary buffer\n", __func__, lora_ctx_size / 1024.0 / 1024.0);
|
8817
|
+
std::vector<uint8_t> lora_buf(lora_ctx_size);
|
8818
|
+
|
8654
8819
|
struct ggml_init_params params;
|
8655
8820
|
params.mem_size = lora_buf.size();
|
8656
8821
|
params.mem_buffer = lora_buf.data();
|
8657
8822
|
params.no_alloc = false;
|
8658
8823
|
|
8659
|
-
|
8660
|
-
std::unordered_map<std::string, struct ggml_tensor *> lora_tensors;
|
8824
|
+
using unique_context = std::unique_ptr<ggml_context, decltype(&ggml_free)>;
|
8661
8825
|
|
8662
|
-
|
8663
|
-
|
8664
|
-
|
8665
|
-
model_tensors.insert(kv);
|
8666
|
-
}
|
8826
|
+
unique_context lora_ctx(nullptr, ggml_free);
|
8827
|
+
lora_ctx.reset(ggml_init(params));
|
8828
|
+
std::unordered_map<std::string, struct ggml_tensor *> lora_tensors;
|
8667
8829
|
|
8668
8830
|
// load base model
|
8669
8831
|
std::unique_ptr<llama_model_loader> ml;
|
8670
|
-
ggml_context * base_ctx = NULL;
|
8671
|
-
std::vector<uint8_t> base_buf;
|
8672
|
-
if (path_base_model) {
|
8673
|
-
LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
|
8674
|
-
ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ NULL));
|
8675
|
-
|
8676
|
-
size_t ctx_size;
|
8677
|
-
size_t mmapped_size;
|
8678
|
-
ml->calc_sizes(ctx_size, mmapped_size);
|
8679
|
-
base_buf.resize(ctx_size);
|
8680
|
-
|
8681
|
-
ggml_init_params base_params;
|
8682
|
-
base_params.mem_size = base_buf.size();
|
8683
|
-
base_params.mem_buffer = base_buf.data();
|
8684
|
-
base_params.no_alloc = ml->use_mmap;
|
8685
8832
|
|
8686
|
-
|
8687
|
-
|
8688
|
-
|
8689
|
-
|
8690
|
-
ml->mapping.reset(new llama_mmap(&ml->file, /* prefetch */ 0, ggml_is_numa()));
|
8691
|
-
}
|
8833
|
+
if (path_base_model) {
|
8834
|
+
LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
|
8835
|
+
ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ nullptr));
|
8836
|
+
ml->init_mapping(false); // no prefetching
|
8692
8837
|
}
|
8693
8838
|
|
8694
8839
|
// read tensors and apply
|
@@ -8698,27 +8843,35 @@ static int llama_apply_lora_from_file_internal(
|
|
8698
8843
|
std::vector<uint8_t> work_buffer;
|
8699
8844
|
|
8700
8845
|
while (true) {
|
8846
|
+
if (fin.tell() == fin.size) {
|
8847
|
+
// eof
|
8848
|
+
break;
|
8849
|
+
}
|
8850
|
+
|
8701
8851
|
int32_t n_dims;
|
8702
|
-
int32_t
|
8852
|
+
int32_t name_len;
|
8703
8853
|
int32_t ftype;
|
8704
8854
|
|
8705
|
-
fin.
|
8706
|
-
fin.
|
8707
|
-
fin.
|
8708
|
-
|
8709
|
-
|
8855
|
+
fin.read_raw(&n_dims, sizeof(n_dims));
|
8856
|
+
fin.read_raw(&name_len, sizeof(name_len));
|
8857
|
+
fin.read_raw(&ftype, sizeof(ftype));
|
8858
|
+
|
8859
|
+
if (n_dims != 1 && n_dims != 2) {
|
8860
|
+
LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims);
|
8861
|
+
return 1;
|
8710
8862
|
}
|
8711
8863
|
|
8712
8864
|
int32_t ne[2] = { 1, 1 };
|
8713
8865
|
for (int i = 0; i < n_dims; ++i) {
|
8714
|
-
fin.
|
8866
|
+
fin.read_raw(&ne[i], sizeof(ne[i]));
|
8715
8867
|
}
|
8716
8868
|
|
8717
8869
|
std::string name;
|
8718
8870
|
{
|
8871
|
+
GGML_ASSERT(name_len <= 1024);
|
8719
8872
|
char buf[1024];
|
8720
|
-
fin.
|
8721
|
-
name = std::string(buf,
|
8873
|
+
fin.read_raw(buf, name_len);
|
8874
|
+
name = std::string(buf, name_len);
|
8722
8875
|
}
|
8723
8876
|
|
8724
8877
|
// check for lora suffix and get the type of tensor
|
@@ -8732,7 +8885,7 @@ static int llama_apply_lora_from_file_internal(
|
|
8732
8885
|
std::string lora_type = name.substr(pos + lora_suffix.length());
|
8733
8886
|
std::string base_name = name;
|
8734
8887
|
base_name.erase(pos);
|
8735
|
-
// LLAMA_LOG_INFO("%s: %s => %s (lora type %s) \n", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());
|
8888
|
+
// LLAMA_LOG_INFO("%s: %s => %s (lora type %s) \n", __func__, name.c_str(), base_name.c_str(), lora_type.c_str());
|
8736
8889
|
|
8737
8890
|
if (model_tensors.find(base_name) == model_tensors.end()) {
|
8738
8891
|
LLAMA_LOG_ERROR("%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
|
@@ -8751,22 +8904,15 @@ static int llama_apply_lora_from_file_internal(
|
|
8751
8904
|
return false;
|
8752
8905
|
}
|
8753
8906
|
}
|
8754
|
-
ggml_tensor * lora_tensor;
|
8755
|
-
|
8756
|
-
lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]);
|
8757
|
-
}
|
8758
|
-
else {
|
8759
|
-
LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims);
|
8760
|
-
return 1;
|
8761
|
-
}
|
8762
|
-
ggml_set_name(lora_tensor, "lora_tensor");
|
8907
|
+
ggml_tensor * lora_tensor = ggml_new_tensor_2d(lora_ctx.get(), wtype, ne[0], ne[1]);
|
8908
|
+
ggml_set_name(lora_tensor, name.c_str());
|
8763
8909
|
|
8764
8910
|
// load tensor data
|
8765
|
-
size_t offset = fin.tellg();
|
8911
|
+
size_t offset = fin.tell();
|
8766
8912
|
size_t tensor_data_size = ggml_nbytes(lora_tensor);
|
8767
8913
|
offset = (offset + 31) & -32;
|
8768
|
-
fin.seekg(offset);
|
8769
|
-
fin.read((char *) lora_tensor->data, tensor_data_size);
|
8914
|
+
fin.seek(offset, SEEK_SET);
|
8915
|
+
fin.read_raw(lora_tensor->data, tensor_data_size);
|
8770
8916
|
|
8771
8917
|
lora_tensors[name] = lora_tensor;
|
8772
8918
|
|
@@ -8779,7 +8925,7 @@ static int llama_apply_lora_from_file_internal(
|
|
8779
8925
|
offload_func_t offload_func = ggml_offload_nop;
|
8780
8926
|
offload_func_t offload_func_force_inplace = ggml_offload_nop;
|
8781
8927
|
|
8782
|
-
#ifdef GGML_USE_CUBLAS
|
8928
|
+
#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
|
8783
8929
|
if (dest_t->backend == GGML_BACKEND_GPU || dest_t->backend == GGML_BACKEND_GPU_SPLIT) {
|
8784
8930
|
if (dest_t->type != GGML_TYPE_F16) {
|
8785
8931
|
throw std::runtime_error(format(
|
@@ -8796,13 +8942,11 @@ static int llama_apply_lora_from_file_internal(
|
|
8796
8942
|
|
8797
8943
|
// load from base model
|
8798
8944
|
if (gguf_find_tensor(ctx_gguf, base_name.c_str()) < 0) {
|
8799
|
-
// TODO: throw
|
8800
8945
|
LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
|
8801
8946
|
return 1;
|
8802
8947
|
}
|
8803
8948
|
|
8804
|
-
|
8805
|
-
base_t = ml->create_tensor(base_ctx, base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU);
|
8949
|
+
base_t = ml->get_tensor_meta(base_name.c_str());
|
8806
8950
|
ml->load_data_for(base_t);
|
8807
8951
|
} else {
|
8808
8952
|
base_t = dest_t;
|
@@ -8831,43 +8975,42 @@ static int llama_apply_lora_from_file_internal(
|
|
8831
8975
|
}
|
8832
8976
|
|
8833
8977
|
// w = w + BA*s
|
8834
|
-
ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
|
8978
|
+
ggml_tensor * BA = ggml_mul_mat(lora_ctx.get(), loraA, loraB);
|
8835
8979
|
offload_func(BA);
|
8836
8980
|
ggml_set_name(BA, "BA");
|
8837
8981
|
|
8838
8982
|
if (scaling != 1.0f) {
|
8839
|
-
|
8840
|
-
ggml_set_name(scale_tensor, "scale_tensor");
|
8841
|
-
|
8842
|
-
BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor);
|
8983
|
+
BA = ggml_scale_inplace(lora_ctx.get(), BA, scaling);
|
8843
8984
|
offload_func(BA);
|
8844
8985
|
ggml_set_name(BA, "BA_scaled");
|
8845
8986
|
}
|
8846
8987
|
|
8847
8988
|
ggml_tensor * r;
|
8848
8989
|
if (base_t == dest_t) {
|
8849
|
-
r = ggml_add_inplace(lora_ctx, dest_t, BA);
|
8990
|
+
r = ggml_add_inplace(lora_ctx.get(), dest_t, BA);
|
8850
8991
|
offload_func_force_inplace(r);
|
8851
8992
|
ggml_set_name(r, "r_add_inplace");
|
8852
8993
|
}
|
8853
8994
|
else {
|
8854
|
-
r = ggml_add(lora_ctx, base_t, BA);
|
8995
|
+
r = ggml_add(lora_ctx.get(), base_t, BA);
|
8855
8996
|
offload_func(r);
|
8856
8997
|
ggml_set_name(r, "r_add");
|
8857
8998
|
|
8858
|
-
r = ggml_cpy(lora_ctx, r, dest_t);
|
8999
|
+
r = ggml_cpy(lora_ctx.get(), r, dest_t);
|
8859
9000
|
offload_func(r);
|
8860
9001
|
ggml_set_name(r, "r_cpy");
|
8861
9002
|
}
|
8862
9003
|
|
8863
|
-
struct ggml_cgraph * gf = ggml_new_graph(lora_ctx);
|
9004
|
+
struct ggml_cgraph * gf = ggml_new_graph(lora_ctx.get());
|
8864
9005
|
ggml_build_forward_expand(gf, r);
|
8865
9006
|
|
8866
9007
|
ggml_graph_compute_helper(work_buffer, gf, n_threads);
|
8867
9008
|
|
9009
|
+
// the tensors in the adapter must be sorted such that loraA and loraB of the same tensor are next to each other
|
9010
|
+
GGML_ASSERT(lora_tensors.size() == 2);
|
9011
|
+
|
8868
9012
|
// we won't need these tensors again, reset the context to save memory
|
8869
|
-
|
8870
|
-
lora_ctx = ggml_init(params);
|
9013
|
+
lora_ctx.reset(ggml_init(params));
|
8871
9014
|
lora_tensors.clear();
|
8872
9015
|
|
8873
9016
|
n_tensors++;
|
@@ -8877,12 +9020,6 @@ static int llama_apply_lora_from_file_internal(
|
|
8877
9020
|
}
|
8878
9021
|
}
|
8879
9022
|
|
8880
|
-
// TODO: this should be in a destructor, it will leak on failure
|
8881
|
-
ggml_free(lora_ctx);
|
8882
|
-
if (base_ctx) {
|
8883
|
-
ggml_free(base_ctx);
|
8884
|
-
}
|
8885
|
-
|
8886
9023
|
const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
|
8887
9024
|
LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0);
|
8888
9025
|
|
@@ -9012,11 +9149,18 @@ struct llama_model * llama_load_model_from_file(
|
|
9012
9149
|
LLAMA_LOG_INFO("\n");
|
9013
9150
|
}
|
9014
9151
|
}
|
9152
|
+
return true;
|
9015
9153
|
};
|
9016
9154
|
}
|
9017
9155
|
|
9018
|
-
|
9019
|
-
|
9156
|
+
int status = llama_model_load(path_model, *model, params);
|
9157
|
+
GGML_ASSERT(status <= 0);
|
9158
|
+
if (status < 0) {
|
9159
|
+
if (status == -1) {
|
9160
|
+
LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
|
9161
|
+
} else if (status == -2) {
|
9162
|
+
LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
|
9163
|
+
}
|
9020
9164
|
delete model;
|
9021
9165
|
return nullptr;
|
9022
9166
|
}
|
@@ -9091,7 +9235,39 @@ struct llama_context * llama_new_context_with_model(
|
|
9091
9235
|
|
9092
9236
|
// reserve memory for context buffers
|
9093
9237
|
if (!hparams.vocab_only) {
|
9094
|
-
|
9238
|
+
// initialize backend
|
9239
|
+
#ifdef GGML_USE_METAL
|
9240
|
+
if (model->n_gpu_layers > 0) {
|
9241
|
+
ctx->backend = ggml_backend_metal_init();
|
9242
|
+
if (ctx->backend == nullptr) {
|
9243
|
+
LLAMA_LOG_ERROR("%s: failed to initialize Metal backend\n", __func__);
|
9244
|
+
}
|
9245
|
+
}
|
9246
|
+
#elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST)
|
9247
|
+
// for testing only
|
9248
|
+
if (model->n_gpu_layers > 0) {
|
9249
|
+
ctx->backend = ggml_backend_cuda_init(0);
|
9250
|
+
if (ctx->backend == nullptr) {
|
9251
|
+
LLAMA_LOG_ERROR("%s: failed to initialize CUDA backend\n", __func__);
|
9252
|
+
}
|
9253
|
+
}
|
9254
|
+
#endif
|
9255
|
+
|
9256
|
+
if (ctx->backend == nullptr && ggml_backend_buffer_is_host(model->buf)) {
|
9257
|
+
ctx->backend = ggml_backend_cpu_init();
|
9258
|
+
if (ctx->backend == nullptr) {
|
9259
|
+
LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
|
9260
|
+
}
|
9261
|
+
}
|
9262
|
+
|
9263
|
+
if (ctx->backend == nullptr) {
|
9264
|
+
LLAMA_LOG_ERROR("%s: failed to initialize a backend\n", __func__);
|
9265
|
+
delete ctx;
|
9266
|
+
return nullptr;
|
9267
|
+
}
|
9268
|
+
|
9269
|
+
if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, type_k, type_v,
|
9270
|
+
cparams.n_ctx, model->n_gpu_layers, cparams.offload_kqv)) {
|
9095
9271
|
LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
|
9096
9272
|
llama_free(ctx);
|
9097
9273
|
return nullptr;
|
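The backend selection above tries a GPU backend first (Metal, or CUDA under the LLAMA_GGML_BACKEND_CUDA_TEST define) and falls back to the CPU backend when the model weights live in a host buffer. A condensed sketch of that order (init_backend is an illustrative name; the CUDA test branch and error logging are omitted):

static ggml_backend_t init_backend(int n_gpu_layers, bool weights_on_host) {
    ggml_backend_t backend = nullptr;
#ifdef GGML_USE_METAL
    if (n_gpu_layers > 0) {
        backend = ggml_backend_metal_init();
    }
#endif
    if (backend == nullptr && weights_on_host) {
        backend = ggml_backend_cpu_init();
    }
    return backend; // nullptr means no usable backend, mirroring the error path above
}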
@@ -9127,12 +9303,11 @@ struct llama_context * llama_new_context_with_model(
|
|
9127
9303
|
}
|
9128
9304
|
|
9129
9305
|
{
|
9130
|
-
static const size_t tensor_alignment = 32;
|
9131
9306
|
// the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data
|
9132
|
-
ctx->buf_compute.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead());
|
9307
|
+
ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead());
|
9133
9308
|
|
9134
9309
|
// create measure allocator
|
9135
|
-
ctx->alloc = ggml_allocr_new_measure(tensor_alignment);
|
9310
|
+
ctx->alloc = ggml_allocr_new_measure_from_backend(ctx->backend);
|
9136
9311
|
|
9137
9312
|
// build worst-case graph
|
9138
9313
|
int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch);
|
@@ -9140,98 +9315,50 @@ struct llama_context * llama_new_context_with_model(
|
|
9140
9315
|
llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
|
9141
9316
|
ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0));
|
9142
9317
|
|
9143
|
-
#ifdef GGML_USE_METAL
|
9144
|
-
if (model->n_gpu_layers > 0) {
|
9145
|
-
ctx->ctx_metal = ggml_metal_init(1);
|
9146
|
-
if (!ctx->ctx_metal) {
|
9147
|
-
LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
|
9148
|
-
llama_free(ctx);
|
9149
|
-
return NULL;
|
9150
|
-
}
|
9151
|
-
//ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false);
|
9152
|
-
//ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
|
9153
|
-
}
|
9154
|
-
#endif
|
9155
9318
|
// measure memory requirements for the graph
|
9156
|
-
size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
|
9319
|
+
size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf);
|
9157
9320
|
|
9158
|
-
LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MiB\n", __func__, (ctx->
|
9321
|
+
LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MiB\n", __func__, (ctx->buf_compute_meta.size() + alloc_size) / 1024.0 / 1024.0);
|
9159
9322
|
|
9160
|
-
//
|
9323
|
+
// create allocator again with exact memory requirements
|
9161
9324
|
ggml_allocr_free(ctx->alloc);
|
9162
9325
|
|
9163
|
-
ctx->buf_alloc.resize(alloc_size);
|
9164
|
-
ctx->alloc = ggml_allocr_new(ctx->buf_alloc.data, ctx->buf_alloc.size, tensor_alignment);
|
9165
|
-
#ifdef GGML_USE_METAL
|
9166
|
-
if (
|
9167
|
-
|
9168
|
-
|
9169
|
-
#endif
|
9170
|
-
#ifdef GGML_USE_CUBLAS
|
9171
|
-
ggml_cuda_set_scratch_size(alloc_size);
|
9172
|
-
LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0);
|
9326
|
+
ctx->buf_alloc = ggml_backend_alloc_buffer(ctx->backend, alloc_size);
|
9327
|
+
ctx->alloc = ggml_allocr_new_from_buffer(ctx->buf_alloc);
|
9328
|
+
#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
|
9329
|
+
if (model->n_gpu_layers > 0) {
|
9330
|
+
ggml_cuda_set_scratch_size(alloc_size);
|
9331
|
+
LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0);
|
9173
9332
|
|
9174
|
-
|
9175
|
-
|
9176
|
-
|
9177
|
-
|
9333
|
+
// calculate total VRAM usage
|
9334
|
+
auto add_tensor = [](const ggml_tensor * t, size_t & size) {
|
9335
|
+
if (t->backend == GGML_BACKEND_GPU || t->backend == GGML_BACKEND_GPU_SPLIT) {
|
9336
|
+
size += ggml_nbytes(t);
|
9337
|
+
}
|
9338
|
+
};
|
9339
|
+
size_t model_vram_size = 0;
|
9340
|
+
for (const auto & kv : model->tensors_by_name) {
|
9341
|
+
add_tensor(kv.second, model_vram_size);
|
9178
9342
|
}
|
9179
|
-
};
|
9180
|
-
size_t model_vram_size = 0;
|
9181
|
-
for (const auto & kv : model->tensors_by_name) {
|
9182
|
-
add_tensor(kv.second, model_vram_size);
|
9183
|
-
}
|
9184
|
-
|
9185
|
-
size_t kv_vram_size = 0;
|
9186
|
-
for (auto & k : ctx->kv_self.k_l) {
|
9187
|
-
add_tensor(k, kv_vram_size);
|
9188
|
-
}
|
9189
|
-
for (auto & v : ctx->kv_self.v_l) {
|
9190
|
-
add_tensor(v, kv_vram_size);
|
9191
|
-
}
|
9192
|
-
|
9193
|
-
size_t ctx_vram_size = alloc_size + kv_vram_size;
|
9194
|
-
size_t total_vram_size = model_vram_size + ctx_vram_size;
|
9195
9343
|
|
9196
|
-
|
9197
|
-
|
9198
|
-
|
9199
|
-
|
9200
|
-
|
9201
|
-
|
9202
|
-
|
9203
|
-
#ifdef GGML_USE_METAL
|
9204
|
-
if (model->n_gpu_layers > 0) {
|
9205
|
-
// this allocates all Metal resources and memory buffers
|
9206
|
-
|
9207
|
-
void * data_ptr = NULL;
|
9208
|
-
size_t data_size = 0;
|
9209
|
-
|
9210
|
-
if (ctx->model.mapping) {
|
9211
|
-
data_ptr = ctx->model.mapping->addr;
|
9212
|
-
data_size = ctx->model.mapping->size;
|
9213
|
-
} else {
|
9214
|
-
data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
|
9215
|
-
data_size = ggml_get_mem_size (ctx->model.ctx);
|
9216
|
-
}
|
9217
|
-
|
9218
|
-
const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
|
9344
|
+
size_t kv_vram_size = 0;
|
9345
|
+
for (auto & k : ctx->kv_self.k_l) {
|
9346
|
+
add_tensor(k, kv_vram_size);
|
9347
|
+
}
|
9348
|
+
for (auto & v : ctx->kv_self.v_l) {
|
9349
|
+
add_tensor(v, kv_vram_size);
|
9350
|
+
}
|
9219
9351
|
|
9220
|
-
|
9352
|
+
size_t ctx_vram_size = alloc_size + kv_vram_size;
|
9353
|
+
size_t total_vram_size = model_vram_size + ctx_vram_size;
|
9221
9354
|
|
9222
|
-
|
9223
|
-
|
9224
|
-
|
9225
|
-
|
9226
|
-
return NULL; \
|
9355
|
+
LLAMA_LOG_INFO("%s: total VRAM used: %.2f MiB (model: %.2f MiB, context: %.2f MiB)\n", __func__,
|
9356
|
+
total_vram_size / 1024.0 / 1024.0,
|
9357
|
+
model_vram_size / 1024.0 / 1024.0,
|
9358
|
+
ctx_vram_size / 1024.0 / 1024.0);
|
9227
9359
|
}
|
9228
|
-
|
9229
|
-
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
|
9230
|
-
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0));
|
9231
|
-
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.data, ctx->buf_alloc.size, 0));
|
9232
|
-
#undef LLAMA_METAL_CHECK_BUF
|
9233
|
-
}
|
9234
9360
|
#endif
|
9361
|
+
}
|
9235
9362
|
}
|
9236
9363
|
|
9237
9364
|
#ifdef GGML_USE_MPI
|
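In the hunk above, the compute scratch buffer moves from a host allocation to a ggml-backend buffer: the graph is measured with ggml_allocr_alloc_graph, the measuring allocator is freed, and a new allocator is created on top of a backend buffer of exactly the measured size; on CUBLAS builds without LLAMA_GGML_BACKEND_CUDA_TEST the same size also feeds ggml_cuda_set_scratch_size and the per-tensor VRAM totals that get logged. A minimal sketch of the measure-then-allocate pattern, assuming backend is an initialized ggml_backend_t, gf is a graph built for measurement, and measure_alloc is the measuring allocator; recreate_exact_allocator is an illustrative helper, not a llama.cpp function:

// Sketch only: shows the measure-then-allocate flow used above; in the real code the
// results land in ctx->alloc and ctx->buf_alloc.
static ggml_allocr * recreate_exact_allocator(ggml_backend_t backend,
                                              ggml_cgraph * gf,
                                              ggml_allocr * measure_alloc,
                                              ggml_backend_buffer_t * out_buf) {
    // measuring pass: reports exactly how many bytes the graph needs
    const size_t alloc_size = ggml_allocr_alloc_graph(measure_alloc, gf);
    ggml_allocr_free(measure_alloc);

    // allocate a backend buffer of that size and rebuild the allocator on top of it
    *out_buf = ggml_backend_alloc_buffer(backend, alloc_size);
    return ggml_allocr_new_from_buffer(*out_buf);
}
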
@@ -9259,10 +9386,14 @@ const llama_model * llama_get_model(const struct llama_context * ctx) {
|
|
9259
9386
|
return &ctx->model;
|
9260
9387
|
}
|
9261
9388
|
|
9262
|
-
|
9389
|
+
uint32_t llama_n_ctx(const struct llama_context * ctx) {
|
9263
9390
|
return ctx->cparams.n_ctx;
|
9264
9391
|
}
|
9265
9392
|
|
9393
|
+
uint32_t llama_n_batch(const struct llama_context * ctx) {
|
9394
|
+
return ctx->cparams.n_batch;
|
9395
|
+
}
|
9396
|
+
|
9266
9397
|
enum llama_vocab_type llama_vocab_type(const struct llama_model * model) {
|
9267
9398
|
return model->vocab.type;
|
9268
9399
|
}
|
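The hunk above adds llama_n_batch beside llama_n_ctx; both return uint32_t and simply expose the corresponding cparams fields. A minimal usage sketch, assuming ctx is a valid llama_context and n_tokens stands for the length of the prompt being fed in:

const uint32_t n_ctx   = llama_n_ctx(ctx);   // cparams.n_ctx
const uint32_t n_batch = llama_n_batch(ctx); // cparams.n_batch

// keep each llama_decode call within n_batch tokens
for (uint32_t i = 0; i < n_tokens; i += n_batch) {
    // decode tokens [i, min(i + n_batch, n_tokens)) here
}
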
@@ -9519,7 +9650,7 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
|
|
9519
9650
|
const size_t s_embedding = ctx->embedding.size() * sizeof(float);
|
9520
9651
|
const size_t s_kv_size = sizeof(size_t);
|
9521
9652
|
const size_t s_kv_ntok = sizeof(int);
|
9522
|
-
const size_t s_kv = ctx->kv_self.buf
|
9653
|
+
const size_t s_kv = ggml_backend_buffer_get_size(ctx->kv_self.buf);
|
9523
9654
|
|
9524
9655
|
const size_t s_total = (
|
9525
9656
|
+ s_rng_size
|
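With the KV cache now held in a ggml-backend buffer, its contribution to the state size is queried through ggml_backend_buffer_get_size instead of a host buffer's size field; the public save/restore flow itself is unchanged. A usage sketch (requires <vector>), assuming ctx is a valid llama_context:

// serialize and later restore the context state through the public API
std::vector<uint8_t> state(llama_get_state_size(ctx));
const size_t n_written = llama_copy_state_data(ctx, state.data());
state.resize(n_written);
// ... on a context created with the same model and parameters:
llama_set_state_data(ctx, state.data());
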
@@ -9647,7 +9778,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
|
|
9647
9778
|
const auto n_embd = hparams.n_embd_gqa();
|
9648
9779
|
const auto n_ctx = cparams.n_ctx;
|
9649
9780
|
|
9650
|
-
const size_t kv_buf_size = kv_self.buf
|
9781
|
+
const size_t kv_buf_size = ggml_backend_buffer_get_size(kv_self.buf);
|
9651
9782
|
const uint32_t kv_head = kv_self.head;
|
9652
9783
|
const uint32_t kv_size = kv_self.size;
|
9653
9784
|
const uint32_t kv_used = kv_self.used;
|
@@ -9663,17 +9794,12 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
|
|
9663
9794
|
ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
|
9664
9795
|
ggml_cgraph * gf = ggml_new_graph(cpy_ctx);
|
9665
9796
|
|
9666
|
-
std::vector<
|
9667
|
-
std::vector<
|
9797
|
+
std::vector<struct ggml_tensor *> kout2d(n_layer);
|
9798
|
+
std::vector<struct ggml_tensor *> vout2d(n_layer);
|
9668
9799
|
|
9669
9800
|
for (int il = 0; il < (int) n_layer; ++il) {
|
9670
|
-
|
9671
|
-
|
9672
|
-
kout2d->data = kout2d_data[il].data();
|
9673
|
-
|
9674
|
-
ggml_tensor * vout2d = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
|
9675
|
-
vout2d_data[il].resize(ggml_nbytes(vout2d));
|
9676
|
-
vout2d->data = vout2d_data[il].data();
|
9801
|
+
kout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
|
9802
|
+
vout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
|
9677
9803
|
|
9678
9804
|
ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
|
9679
9805
|
n_embd, kv_head,
|
@@ -9683,20 +9809,28 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
|
|
9683
9809
|
kv_head, n_embd,
|
9684
9810
|
elt_size*n_ctx, 0);
|
9685
9811
|
|
9686
|
-
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k2d, kout2d));
|
9687
|
-
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v2d, vout2d));
|
9812
|
+
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k2d, kout2d[il]));
|
9813
|
+
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v2d, vout2d[il]));
|
9688
9814
|
}
|
9689
9815
|
|
9690
|
-
|
9816
|
+
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(cpy_ctx, ctx->backend);
|
9691
9817
|
|
9692
|
-
|
9818
|
+
ggml_backend_graph_compute(ctx->backend, gf);
|
9819
|
+
|
9820
|
+
std::vector<uint8_t> tmp_buf;
|
9821
|
+
for (int il = 0; il < (int) n_layer; ++il) {
|
9822
|
+
tmp_buf.resize(ggml_nbytes(kout2d[il]));
|
9823
|
+
ggml_backend_tensor_get(kout2d[il], tmp_buf.data(), 0, tmp_buf.size());
|
9824
|
+
data_ctx->write(tmp_buf.data(), tmp_buf.size());
|
9693
9825
|
|
9694
|
-
|
9695
|
-
|
9696
|
-
|
9697
|
-
data_ctx->write(kout2d_data[il].data(), kout2d_data[il].size());
|
9698
|
-
data_ctx->write(vout2d_data[il].data(), vout2d_data[il].size());
|
9826
|
+
tmp_buf.resize(ggml_nbytes(vout2d[il]));
|
9827
|
+
ggml_backend_tensor_get(vout2d[il], tmp_buf.data(), 0, tmp_buf.size());
|
9828
|
+
data_ctx->write(tmp_buf.data(), tmp_buf.size());
|
9699
9829
|
}
|
9830
|
+
|
9831
|
+
ggml_free(cpy_ctx);
|
9832
|
+
|
9833
|
+
ggml_backend_buffer_free(buf);
|
9700
9834
|
}
|
9701
9835
|
|
9702
9836
|
for (uint32_t i = 0; i < kv_size; ++i) {
|
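In the rewritten copy path above, the per-layer staging tensors are no longer backed by host std::vectors: kout2d/vout2d are created in a no_alloc context, placed in a backend buffer by ggml_backend_alloc_ctx_tensors, the copy graph runs on the backend, and each result is read back with ggml_backend_tensor_get before being written to the output stream. The same stage/compute/read pattern boiled down to one tensor; read_tensor_to_host is an illustrative helper, not a llama.cpp function, and src is assumed to be a 2D tensor already resident on backend:

#include <vector>
#include "ggml.h"
#include "ggml-backend.h"

// Copies a backend-resident 2D tensor back to host memory, mirroring the pattern above.
static std::vector<uint8_t> read_tensor_to_host(ggml_backend_t backend, ggml_tensor * src) {
    ggml_init_params ip = {
        /*.mem_size   =*/ 4*ggml_tensor_overhead() + ggml_graph_overhead(), // dst + cpy node + slack
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true, // tensors get their memory from the backend buffer below
    };
    ggml_context * cpy_ctx = ggml_init(ip);
    ggml_cgraph  * gf      = ggml_new_graph(cpy_ctx);

    ggml_tensor * dst = ggml_new_tensor_2d(cpy_ctx, src->type, src->ne[0], src->ne[1]);
    ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, src, dst));

    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(cpy_ctx, backend); // places dst
    ggml_backend_graph_compute(backend, gf);

    std::vector<uint8_t> host(ggml_nbytes(dst));
    ggml_backend_tensor_get(dst, host.data(), 0, host.size()); // backend -> host

    ggml_free(cpy_ctx);
    ggml_backend_buffer_free(buf);
    return host;
}
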
@@ -9794,21 +9928,19 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
|
|
9794
9928
|
memcpy(&kv_used, inp, sizeof(kv_used)); inp += sizeof(kv_used);
|
9795
9929
|
|
9796
9930
|
if (kv_buf_size) {
|
9797
|
-
GGML_ASSERT(kv_self.buf
|
9931
|
+
GGML_ASSERT(ggml_backend_buffer_get_size(kv_self.buf) == kv_buf_size);
|
9798
9932
|
|
9799
9933
|
const size_t elt_size = ggml_element_size(kv_self.k_l[0]);
|
9800
9934
|
|
9801
9935
|
ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
|
9802
9936
|
ggml_cgraph * gf = ggml_new_graph(cpy_ctx);
|
9803
9937
|
|
9804
|
-
|
9805
|
-
|
9806
|
-
kin2d->data = (void *) inp;
|
9807
|
-
inp += ggml_nbytes(kin2d);
|
9938
|
+
std::vector<struct ggml_tensor *> kin2d(n_layer);
|
9939
|
+
std::vector<struct ggml_tensor *> vin2d(n_layer);
|
9808
9940
|
|
9809
|
-
|
9810
|
-
|
9811
|
-
|
9941
|
+
for (int il = 0; il < n_layer; ++il) {
|
9942
|
+
kin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
|
9943
|
+
vin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
|
9812
9944
|
|
9813
9945
|
ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
|
9814
9946
|
n_embd, kv_head,
|
@@ -9818,13 +9950,26 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
|
|
9818
9950
|
kv_head, n_embd,
|
9819
9951
|
elt_size*n_ctx, 0);
|
9820
9952
|
|
9821
|
-
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin2d, k2d));
|
9822
|
-
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin2d, v2d));
|
9953
|
+
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin2d[il], k2d));
|
9954
|
+
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin2d[il], v2d));
|
9955
|
+
}
|
9956
|
+
|
9957
|
+
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(cpy_ctx, ctx->backend);
|
9958
|
+
|
9959
|
+
// load data into the tensors
|
9960
|
+
for (int il = 0; il < n_layer; ++il) {
|
9961
|
+
ggml_backend_tensor_set(kin2d[il], inp, 0, ggml_nbytes(kin2d[il]));
|
9962
|
+
inp += ggml_nbytes(kin2d[il]);
|
9963
|
+
|
9964
|
+
ggml_backend_tensor_set(vin2d[il], inp, 0, ggml_nbytes(vin2d[il]));
|
9965
|
+
inp += ggml_nbytes(vin2d[il]);
|
9823
9966
|
}
|
9824
9967
|
|
9825
|
-
|
9968
|
+
ggml_backend_graph_compute(ctx->backend, gf);
|
9826
9969
|
|
9827
9970
|
ggml_free(cpy_ctx);
|
9971
|
+
|
9972
|
+
ggml_backend_buffer_free(buf);
|
9828
9973
|
}
|
9829
9974
|
|
9830
9975
|
ctx->kv_self.head = kv_head;
|
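Restoring the cache is the mirror image: kin2d/vin2d staging tensors are created in a no_alloc context and allocated on the backend, each is filled straight from the serialized blob with ggml_backend_tensor_set, and the copy graph then writes them into the KV-cache views on the device. Condensed to a single tensor, assuming cpy_ctx, gf and backend are set up as in the read sketch above, inp points into the serialized state, and dst is a KV-cache view already living on the backend:

// upload direction: stage on the backend, fill from host, then copy inside the graph
ggml_tensor * staging = ggml_new_tensor_2d(cpy_ctx, dst->type, dst->ne[0], dst->ne[1]);
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, staging, dst));

ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(cpy_ctx, backend);
ggml_backend_tensor_set(staging, inp, 0, ggml_nbytes(staging)); // host -> backend
ggml_backend_graph_compute(backend, gf);                        // staging -> KV cache view

ggml_free(cpy_ctx);
ggml_backend_buffer_free(buf);
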
@@ -10047,6 +10192,7 @@ float * llama_get_logits(struct llama_context * ctx) {
|
|
10047
10192
|
}
|
10048
10193
|
|
10049
10194
|
float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
|
10195
|
+
assert(ctx->logits_valid.at(i));
|
10050
10196
|
return ctx->logits.data() + i*ctx->model.hparams.n_vocab;
|
10051
10197
|
}
|
10052
10198
|
|
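Finally, llama_get_logits_ith now checks ctx->logits_valid before handing out a row, so callers must only index tokens whose logits were actually requested in the last decoded batch; since this is a plain assert, the check is active only in builds without NDEBUG. A sketch, assuming batch is the llama_batch that was just decoded and its last token had its logits flag set:

// only rows requested in the batch are valid to read back
const int32_t i_last = batch.n_tokens - 1;
float * logits = llama_get_logits_ith(ctx, i_last); // fine: logits were requested for i_last
// llama_get_logits_ith(ctx, 0) would trip the assert if token 0 was not flagged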