llama_cpp 0.10.1 → 0.10.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/ext/llama_cpp/llama_cpp.cpp +16 -1
- data/ext/llama_cpp/src/ggml-alloc.c +12 -4
- data/ext/llama_cpp/src/ggml-backend-impl.h +12 -8
- data/ext/llama_cpp/src/ggml-backend.c +75 -5
- data/ext/llama_cpp/src/ggml-backend.h +7 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +284 -162
- data/ext/llama_cpp/src/ggml-metal.h +3 -0
- data/ext/llama_cpp/src/ggml-metal.m +190 -44
- data/ext/llama_cpp/src/ggml-metal.metal +11 -2
- data/ext/llama_cpp/src/ggml.c +262 -89
- data/ext/llama_cpp/src/ggml.h +24 -10
- data/ext/llama_cpp/src/llama.cpp +926 -780
- data/ext/llama_cpp/src/llama.h +8 -3
- data/lib/llama_cpp/version.rb +2 -2
- metadata +2 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -1,11 +1,12 @@
|
|
1
1
|
#define LLAMA_API_INTERNAL
|
2
|
+
//#define LLAMA_GGML_BACKEND_CUDA_TEST // for testing only - enables ggml-cuda through ggml-backend, disables partial offloading
|
2
3
|
#include "llama.h"
|
3
4
|
|
4
5
|
#include "unicode.h"
|
5
6
|
|
6
7
|
#include "ggml.h"
|
7
|
-
|
8
8
|
#include "ggml-alloc.h"
|
9
|
+
#include "ggml-backend.h"
|
9
10
|
|
10
11
|
#ifdef GGML_USE_CUBLAS
|
11
12
|
# include "ggml-cuda.h"
|
@@ -32,6 +33,7 @@
|
|
32
33
|
#include <unistd.h>
|
33
34
|
#if defined(_POSIX_MAPPED_FILES)
|
34
35
|
#include <sys/mman.h>
|
36
|
+
#include <fcntl.h>
|
35
37
|
#endif
|
36
38
|
#if defined(_POSIX_MEMLOCK_RANGE)
|
37
39
|
#include <sys/resource.h>
|
@@ -195,6 +197,7 @@ enum llm_arch {
|
|
195
197
|
LLM_ARCH_BLOOM,
|
196
198
|
LLM_ARCH_STABLELM,
|
197
199
|
LLM_ARCH_QWEN,
|
200
|
+
LLM_ARCH_PHI2,
|
198
201
|
LLM_ARCH_UNKNOWN,
|
199
202
|
};
|
200
203
|
|
@@ -212,6 +215,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
|
|
212
215
|
{ LLM_ARCH_BLOOM, "bloom" },
|
213
216
|
{ LLM_ARCH_STABLELM, "stablelm" },
|
214
217
|
{ LLM_ARCH_QWEN, "qwen" },
|
218
|
+
{ LLM_ARCH_PHI2, "phi2" },
|
215
219
|
};
|
216
220
|
|
217
221
|
enum llm_kv {
|
@@ -550,6 +554,19 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
|
|
550
554
|
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
551
555
|
},
|
552
556
|
},
|
557
|
+
{
|
558
|
+
LLM_ARCH_PHI2,
|
559
|
+
{
|
560
|
+
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
561
|
+
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
562
|
+
{ LLM_TENSOR_OUTPUT, "output" },
|
563
|
+
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
564
|
+
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
|
565
|
+
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
566
|
+
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
567
|
+
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
568
|
+
},
|
569
|
+
},
|
553
570
|
|
554
571
|
{
|
555
572
|
LLM_ARCH_UNKNOWN,
|
@@ -697,38 +714,6 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
|
|
697
714
|
// llama helpers
|
698
715
|
//
|
699
716
|
|
700
|
-
inline void * llama_host_malloc(size_t n) {
|
701
|
-
#ifdef GGML_USE_CUBLAS
|
702
|
-
if (ggml_cublas_loaded()) {
|
703
|
-
return ggml_cuda_host_malloc(n);
|
704
|
-
} else {
|
705
|
-
return malloc(n);
|
706
|
-
}
|
707
|
-
#elif GGML_USE_METAL
|
708
|
-
return ggml_metal_host_malloc(n);
|
709
|
-
#elif GGML_USE_CPU_HBM
|
710
|
-
return hbw_malloc(n);
|
711
|
-
#else
|
712
|
-
return malloc(n);
|
713
|
-
#endif
|
714
|
-
}
|
715
|
-
|
716
|
-
inline void llama_host_free(void * ptr) {
|
717
|
-
#ifdef GGML_USE_CUBLAS
|
718
|
-
if (ggml_cublas_loaded()) {
|
719
|
-
return ggml_cuda_host_free(ptr);
|
720
|
-
} else {
|
721
|
-
return free(ptr);
|
722
|
-
}
|
723
|
-
#elif GGML_USE_METAL
|
724
|
-
return ggml_metal_host_free(ptr);
|
725
|
-
#elif GGML_USE_CPU_HBM
|
726
|
-
return hbw_free(ptr);
|
727
|
-
#else
|
728
|
-
return free(ptr);
|
729
|
-
#endif
|
730
|
-
}
|
731
|
-
|
732
717
|
#if defined(_WIN32)
|
733
718
|
static std::string llama_format_win_err(DWORD err) {
|
734
719
|
LPSTR buf;
|
@@ -743,40 +728,10 @@ static std::string llama_format_win_err(DWORD err) {
|
|
743
728
|
}
|
744
729
|
#endif
|
745
730
|
|
746
|
-
|
747
|
-
|
748
|
-
|
749
|
-
|
750
|
-
// fallback to malloc / free
|
751
|
-
// useful in cases where CUDA can try to allocate PINNED memory
|
752
|
-
bool fallback = false;
|
753
|
-
|
754
|
-
void resize(size_t n) {
|
755
|
-
llama_host_free(data);
|
756
|
-
|
757
|
-
data = llama_host_malloc(n);
|
758
|
-
if (!data) {
|
759
|
-
fallback = true;
|
760
|
-
data = malloc(n);
|
761
|
-
} else {
|
762
|
-
fallback = false;
|
763
|
-
}
|
764
|
-
|
765
|
-
GGML_ASSERT(data);
|
766
|
-
size = n;
|
767
|
-
}
|
768
|
-
|
769
|
-
~llama_buffer() {
|
770
|
-
if (data) {
|
771
|
-
if (fallback) { // NOLINT
|
772
|
-
free(data);
|
773
|
-
} else {
|
774
|
-
llama_host_free(data);
|
775
|
-
}
|
776
|
-
}
|
777
|
-
|
778
|
-
data = NULL;
|
779
|
-
}
|
731
|
+
template <typename T>
|
732
|
+
struct no_init {
|
733
|
+
T value;
|
734
|
+
no_init() { /* do nothing */ }
|
780
735
|
};
|
781
736
|
|
782
737
|
struct llama_file {
|
@@ -864,6 +819,9 @@ struct llama_mmap {
|
|
864
819
|
#ifdef _POSIX_MAPPED_FILES
|
865
820
|
static constexpr bool SUPPORTED = true;
|
866
821
|
|
822
|
+
// list of mapped fragments (first_offset, last_offset)
|
823
|
+
std::vector<std::pair<size_t, size_t>> mapped_fragments;
|
824
|
+
|
867
825
|
llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) {
|
868
826
|
size = file->size;
|
869
827
|
int fd = fileno(file->fp);
|
@@ -871,17 +829,22 @@ struct llama_mmap {
|
|
871
829
|
// prefetch/readahead impairs performance on NUMA systems
|
872
830
|
if (numa) { prefetch = 0; }
|
873
831
|
#ifdef __linux__
|
832
|
+
// advise the kernel to read the file sequentially (increases readahead)
|
833
|
+
if (posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL)) {
|
834
|
+
LLAMA_LOG_WARN("warning: posix_fadvise(.., POSIX_FADV_SEQUENTIAL) failed: %s\n",
|
835
|
+
strerror(errno));
|
836
|
+
}
|
874
837
|
if (prefetch) { flags |= MAP_POPULATE; }
|
875
838
|
#endif
|
876
839
|
addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
|
877
|
-
if (addr == MAP_FAILED) {
|
840
|
+
if (addr == MAP_FAILED) { // NOLINT
|
878
841
|
throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
|
879
842
|
}
|
880
843
|
|
881
844
|
if (prefetch > 0) {
|
882
|
-
//
|
845
|
+
// advise the kernel to preload the mapped memory
|
883
846
|
if (posix_madvise(addr, std::min(file->size, prefetch), POSIX_MADV_WILLNEED)) {
|
884
|
-
|
847
|
+
LLAMA_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s\n",
|
885
848
|
strerror(errno));
|
886
849
|
}
|
887
850
|
}
|
@@ -889,14 +852,81 @@ struct llama_mmap {
|
|
889
852
|
// advise the kernel not to use readahead
|
890
853
|
// (because the next page might not belong on the same node)
|
891
854
|
if (posix_madvise(addr, file->size, POSIX_MADV_RANDOM)) {
|
892
|
-
|
855
|
+
LLAMA_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_RANDOM) failed: %s\n",
|
893
856
|
strerror(errno));
|
894
857
|
}
|
895
858
|
}
|
859
|
+
|
860
|
+
// initialize list of mapped_fragments
|
861
|
+
mapped_fragments.emplace_back(0, file->size);
|
862
|
+
}
|
863
|
+
|
864
|
+
static void align_range(size_t * first, size_t * last, size_t page_size) {
|
865
|
+
// align first to the next page
|
866
|
+
size_t offset_in_page = *first & (page_size - 1);
|
867
|
+
size_t offset_to_page = offset_in_page == 0 ? 0 : page_size - offset_in_page;
|
868
|
+
*first += offset_to_page;
|
869
|
+
|
870
|
+
// align last to the previous page
|
871
|
+
*last = *last & ~(page_size - 1);
|
872
|
+
|
873
|
+
if (*last <= *first) {
|
874
|
+
*last = *first;
|
875
|
+
}
|
876
|
+
}
|
877
|
+
|
878
|
+
// partially unmap the file in the range [first, last)
|
879
|
+
void unmap_fragment(size_t first, size_t last) {
|
880
|
+
// note: this function must not be called multiple times with overlapping ranges
|
881
|
+
// otherwise, there is a risk of invalidating addresses that have been repurposed for other mappings
|
882
|
+
int page_size = sysconf(_SC_PAGESIZE);
|
883
|
+
align_range(&first, &last, page_size);
|
884
|
+
size_t len = last - first;
|
885
|
+
|
886
|
+
if (len == 0) {
|
887
|
+
return;
|
888
|
+
}
|
889
|
+
|
890
|
+
GGML_ASSERT(first % page_size == 0);
|
891
|
+
GGML_ASSERT(last % page_size == 0);
|
892
|
+
GGML_ASSERT(last > first);
|
893
|
+
|
894
|
+
void * next_page_start = (uint8_t *) addr + first;
|
895
|
+
|
896
|
+
// unmap the range
|
897
|
+
if (munmap(next_page_start, len)) {
|
898
|
+
LLAMA_LOG_WARN("warning: munmap failed: %s\n", strerror(errno));
|
899
|
+
}
|
900
|
+
|
901
|
+
// update the list of mapped fragments to avoid unmapping the same range again in the destructor
|
902
|
+
std::vector<std::pair<size_t, size_t>> new_mapped_fragments;
|
903
|
+
for (const auto & frag : mapped_fragments) {
|
904
|
+
if (frag.first < first && frag.second > last) {
|
905
|
+
// the range is in the middle of the fragment, split it
|
906
|
+
new_mapped_fragments.emplace_back(frag.first, first);
|
907
|
+
new_mapped_fragments.emplace_back(last, frag.second);
|
908
|
+
} else if (frag.first < first && frag.second > first) {
|
909
|
+
// the range starts in the middle of the fragment
|
910
|
+
new_mapped_fragments.emplace_back(frag.first, first);
|
911
|
+
} else if (frag.first < last && frag.second > last) {
|
912
|
+
// the range ends in the middle of the fragment
|
913
|
+
new_mapped_fragments.emplace_back(last, frag.second);
|
914
|
+
} else if (frag.first >= first && frag.second <= last) {
|
915
|
+
// the range covers the entire fragment
|
916
|
+
} else {
|
917
|
+
// the range is outside the fragment
|
918
|
+
new_mapped_fragments.push_back(frag);
|
919
|
+
}
|
920
|
+
}
|
921
|
+
mapped_fragments = std::move(new_mapped_fragments);
|
896
922
|
}
|
897
923
|
|
898
924
|
~llama_mmap() {
|
899
|
-
|
925
|
+
for (const auto & frag : mapped_fragments) {
|
926
|
+
if (munmap((char *) addr + frag.first, frag.second - frag.first)) {
|
927
|
+
LLAMA_LOG_WARN("warning: munmap failed: %s\n", strerror(errno));
|
928
|
+
}
|
929
|
+
}
|
900
930
|
}
|
901
931
|
#elif defined(_WIN32)
|
902
932
|
static constexpr bool SUPPORTED = true;
|
@@ -939,6 +969,12 @@ struct llama_mmap {
|
|
939
969
|
#endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8
|
940
970
|
}
|
941
971
|
|
972
|
+
void unmap_fragment(size_t first, size_t last) {
|
973
|
+
// not supported
|
974
|
+
GGML_UNUSED(first);
|
975
|
+
GGML_UNUSED(last);
|
976
|
+
}
|
977
|
+
|
942
978
|
~llama_mmap() {
|
943
979
|
if (!UnmapViewOfFile(addr)) {
|
944
980
|
fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n",
|
@@ -955,6 +991,13 @@ struct llama_mmap {
|
|
955
991
|
|
956
992
|
throw std::runtime_error(std::string("mmap not supported"));
|
957
993
|
}
|
994
|
+
|
995
|
+
void unmap(size_t offset, size_t len) {
|
996
|
+
(void) offset;
|
997
|
+
(void) len;
|
998
|
+
|
999
|
+
throw std::runtime_error(std::string("mmap not supported"));
|
1000
|
+
}
|
958
1001
|
#endif
|
959
1002
|
};
|
960
1003
|
|
@@ -1128,6 +1171,26 @@ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_
|
|
1128
1171
|
return std::string(result.data(), result.size());
|
1129
1172
|
}
|
1130
1173
|
|
1174
|
+
static ggml_backend_buffer_type_t llama_default_buffer_type(int n_gpu_layers) {
|
1175
|
+
#ifdef GGML_USE_METAL
|
1176
|
+
if (n_gpu_layers > 0) {
|
1177
|
+
return ggml_backend_metal_buffer_type();
|
1178
|
+
}
|
1179
|
+
#elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST)
|
1180
|
+
if (n_gpu_layers > 0) {
|
1181
|
+
return ggml_backend_cuda_buffer_type(0);
|
1182
|
+
}
|
1183
|
+
#elif defined(GGML_USE_CUBLAS)
|
1184
|
+
return ggml_backend_cuda_host_buffer_type();
|
1185
|
+
#elif defined(GGML_USE_CPU_HBM)
|
1186
|
+
return ggml_backend_cpu_hbm_buffer_type();
|
1187
|
+
#endif
|
1188
|
+
|
1189
|
+
return ggml_backend_cpu_buffer_type();
|
1190
|
+
|
1191
|
+
GGML_UNUSED(n_gpu_layers);
|
1192
|
+
}
|
1193
|
+
|
1131
1194
|
//
|
1132
1195
|
// globals
|
1133
1196
|
//
|
@@ -1328,14 +1391,10 @@ struct llama_kv_cache {
|
|
1328
1391
|
|
1329
1392
|
struct ggml_context * ctx = NULL;
|
1330
1393
|
|
1331
|
-
|
1394
|
+
ggml_backend_buffer_t buf = NULL;
|
1332
1395
|
|
1333
1396
|
~llama_kv_cache() {
|
1334
|
-
|
1335
|
-
ggml_free(ctx);
|
1336
|
-
}
|
1337
|
-
|
1338
|
-
#ifdef GGML_USE_CUBLAS
|
1397
|
+
#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
|
1339
1398
|
if (ggml_cublas_loaded()) {
|
1340
1399
|
for (size_t i = 0; i < k_l.size(); ++i) {
|
1341
1400
|
ggml_cuda_free_data(k_l[i]);
|
@@ -1343,6 +1402,11 @@ struct llama_kv_cache {
|
|
1343
1402
|
}
|
1344
1403
|
}
|
1345
1404
|
#endif
|
1405
|
+
if (ctx) {
|
1406
|
+
ggml_free(ctx);
|
1407
|
+
}
|
1408
|
+
|
1409
|
+
ggml_backend_buffer_free(buf);
|
1346
1410
|
}
|
1347
1411
|
};
|
1348
1412
|
|
@@ -1382,11 +1446,11 @@ struct llama_vocab {
|
|
1382
1446
|
id special_suffix_id = 32008;
|
1383
1447
|
id special_eot_id = 32010;
|
1384
1448
|
|
1385
|
-
int find_bpe_rank(std::string token_left, std::string token_right) const {
|
1386
|
-
GGML_ASSERT(token_left.find(
|
1387
|
-
GGML_ASSERT(token_left.find(
|
1388
|
-
GGML_ASSERT(token_right.find(
|
1389
|
-
GGML_ASSERT(token_right.find(
|
1449
|
+
int find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
|
1450
|
+
GGML_ASSERT(token_left.find(' ') == std::string::npos);
|
1451
|
+
GGML_ASSERT(token_left.find('\n') == std::string::npos);
|
1452
|
+
GGML_ASSERT(token_right.find(' ') == std::string::npos);
|
1453
|
+
GGML_ASSERT(token_right.find('\n') == std::string::npos);
|
1390
1454
|
|
1391
1455
|
auto it = bpe_ranks.find(std::make_pair(token_left, token_right));
|
1392
1456
|
if (it == bpe_ranks.end()) {
|
@@ -1415,6 +1479,7 @@ struct llama_model {
|
|
1415
1479
|
struct ggml_tensor * output_norm;
|
1416
1480
|
struct ggml_tensor * output_norm_b;
|
1417
1481
|
struct ggml_tensor * output;
|
1482
|
+
struct ggml_tensor * output_b;
|
1418
1483
|
|
1419
1484
|
std::vector<llama_layer> layers;
|
1420
1485
|
|
@@ -1427,7 +1492,7 @@ struct llama_model {
|
|
1427
1492
|
struct ggml_context * ctx = NULL;
|
1428
1493
|
|
1429
1494
|
// the model memory buffer
|
1430
|
-
|
1495
|
+
ggml_backend_buffer_t buf = NULL;
|
1431
1496
|
|
1432
1497
|
// model memory mapped file
|
1433
1498
|
std::unique_ptr<llama_mmap> mapping;
|
@@ -1443,11 +1508,7 @@ struct llama_model {
|
|
1443
1508
|
int64_t t_start_us = 0;
|
1444
1509
|
|
1445
1510
|
~llama_model() {
|
1446
|
-
|
1447
|
-
ggml_free(ctx);
|
1448
|
-
}
|
1449
|
-
|
1450
|
-
#ifdef GGML_USE_CUBLAS
|
1511
|
+
#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
|
1451
1512
|
if (ggml_cublas_loaded()) {
|
1452
1513
|
for (size_t i = 0; i < tensors_by_name.size(); ++i) {
|
1453
1514
|
ggml_cuda_free_data(tensors_by_name[i].second);
|
@@ -1461,24 +1522,26 @@ struct llama_model {
|
|
1461
1522
|
ggml_cl_free_data(tensors_by_name[i].second);
|
1462
1523
|
}
|
1463
1524
|
#endif
|
1525
|
+
if (ctx) {
|
1526
|
+
ggml_free(ctx);
|
1527
|
+
}
|
1528
|
+
|
1529
|
+
ggml_backend_buffer_free(buf);
|
1464
1530
|
}
|
1465
1531
|
};
|
1466
1532
|
|
1467
1533
|
struct llama_context {
|
1468
1534
|
llama_context(const llama_model & model) : model(model), t_start_us(model.t_start_us), t_load_us(model.t_load_us) {}
|
1469
1535
|
~llama_context() {
|
1470
|
-
|
1471
|
-
|
1472
|
-
|
1473
|
-
}
|
1474
|
-
#endif
|
1475
|
-
if (alloc) {
|
1476
|
-
ggml_allocr_free(alloc);
|
1477
|
-
}
|
1536
|
+
ggml_allocr_free(alloc);
|
1537
|
+
ggml_backend_buffer_free(buf_alloc);
|
1538
|
+
ggml_backend_free(backend);
|
1478
1539
|
}
|
1479
1540
|
|
1480
1541
|
llama_cparams cparams;
|
1481
1542
|
|
1543
|
+
ggml_backend_t backend = nullptr;
|
1544
|
+
|
1482
1545
|
const llama_model & model;
|
1483
1546
|
|
1484
1547
|
// key + value cache for the self attention
|
@@ -1500,23 +1563,22 @@ struct llama_context {
|
|
1500
1563
|
|
1501
1564
|
// decode output (2-dimensional array: [n_tokens][n_vocab])
|
1502
1565
|
std::vector<float> logits;
|
1566
|
+
#ifndef NDEBUG
|
1567
|
+
// guard against access to unset logits
|
1568
|
+
std::vector<bool> logits_valid;
|
1569
|
+
#endif
|
1503
1570
|
bool logits_all = false;
|
1504
1571
|
|
1505
1572
|
// input embedding (1-dimensional array: [n_embd])
|
1506
1573
|
std::vector<float> embedding;
|
1507
1574
|
|
1508
|
-
// reusable buffer for `struct ggml_graph_plan.work_data`
|
1509
|
-
std::vector<uint8_t> work_buffer;
|
1510
|
-
|
1511
1575
|
// memory buffers used to evaluate the model
|
1512
|
-
|
1513
|
-
|
1514
|
-
llama_buffer buf_alloc;
|
1576
|
+
std::vector<uint8_t> buf_compute_meta;
|
1577
|
+
ggml_backend_buffer_t buf_alloc = NULL;
|
1515
1578
|
ggml_allocr * alloc = NULL;
|
1516
1579
|
|
1517
|
-
|
1518
|
-
|
1519
|
-
#endif
|
1580
|
+
// temporary buffer for copying data to/from the backend
|
1581
|
+
std::vector<no_init<uint8_t>> buf_copy;
|
1520
1582
|
|
1521
1583
|
#ifdef GGML_USE_MPI
|
1522
1584
|
ggml_mpi_context * ctx_mpi = NULL;
|
@@ -1538,9 +1600,6 @@ static bool llama_kv_cache_init(
|
|
1538
1600
|
const uint32_t n_embd = hparams.n_embd_gqa();
|
1539
1601
|
const uint32_t n_layer = hparams.n_layer;
|
1540
1602
|
|
1541
|
-
const int64_t n_mem = n_layer*n_ctx;
|
1542
|
-
const int64_t n_elements = n_embd*n_mem;
|
1543
|
-
|
1544
1603
|
cache.has_shift = false;
|
1545
1604
|
|
1546
1605
|
cache.head = 0;
|
@@ -1550,13 +1609,10 @@ static bool llama_kv_cache_init(
|
|
1550
1609
|
cache.cells.clear();
|
1551
1610
|
cache.cells.resize(n_ctx);
|
1552
1611
|
|
1553
|
-
cache.buf.resize(ggml_row_size(ktype, n_elements) + ggml_row_size(vtype, n_elements) + 2u*n_layer*ggml_tensor_overhead());
|
1554
|
-
memset(cache.buf.data, 0, cache.buf.size);
|
1555
|
-
|
1556
1612
|
struct ggml_init_params params;
|
1557
|
-
params.mem_size =
|
1558
|
-
params.mem_buffer =
|
1559
|
-
params.no_alloc =
|
1613
|
+
params.mem_size = 2u*n_layer*ggml_tensor_overhead();
|
1614
|
+
params.mem_buffer = NULL;
|
1615
|
+
params.no_alloc = true;
|
1560
1616
|
|
1561
1617
|
cache.ctx = ggml_init(params);
|
1562
1618
|
|
@@ -1570,9 +1626,7 @@ static bool llama_kv_cache_init(
|
|
1570
1626
|
cache.k_l.reserve(n_layer);
|
1571
1627
|
cache.v_l.reserve(n_layer);
|
1572
1628
|
|
1573
|
-
const int i_gpu_start = (int) n_layer - n_gpu_layers;
|
1574
|
-
|
1575
|
-
GGML_UNUSED(offload);
|
1629
|
+
const int i_gpu_start = (int) n_layer - n_gpu_layers;
|
1576
1630
|
|
1577
1631
|
for (int i = 0; i < (int) n_layer; i++) {
|
1578
1632
|
ggml_tensor * k = ggml_new_tensor_1d(cache.ctx, ktype, n_embd*n_ctx);
|
@@ -1581,23 +1635,35 @@ static bool llama_kv_cache_init(
|
|
1581
1635
|
ggml_format_name(v, "cache_v_l%d", i);
|
1582
1636
|
cache.k_l.push_back(k);
|
1583
1637
|
cache.v_l.push_back(v);
|
1584
|
-
#
|
1638
|
+
#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
|
1585
1639
|
if (i >= i_gpu_start) {
|
1586
1640
|
if (offload) {
|
1587
1641
|
ggml_cuda_assign_buffers_no_scratch(k);
|
1588
|
-
vram_kv_cache += ggml_nbytes(k);
|
1589
1642
|
ggml_cuda_assign_buffers_no_scratch(v);
|
1643
|
+
vram_kv_cache += ggml_nbytes(k);
|
1590
1644
|
vram_kv_cache += ggml_nbytes(v);
|
1645
|
+
// HACK: mark tensor as allocated
|
1646
|
+
k->data = v->data = (void *)(uintptr_t)1;
|
1591
1647
|
}
|
1592
1648
|
}
|
1593
1649
|
#endif // GGML_USE_CUBLAS
|
1594
1650
|
}
|
1595
1651
|
|
1652
|
+
// allocate tensors
|
1653
|
+
cache.buf = ggml_backend_alloc_ctx_tensors_from_buft(cache.ctx, llama_default_buffer_type(n_gpu_layers));
|
1654
|
+
|
1655
|
+
// buf may be NULL with full offload
|
1656
|
+
if (cache.buf) {
|
1657
|
+
// initialize the buffer to avoid NaNs in the padding
|
1658
|
+
ggml_backend_buffer_clear(cache.buf, 0);
|
1659
|
+
}
|
1660
|
+
|
1596
1661
|
if (vram_kv_cache > 0) {
|
1597
1662
|
LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
|
1598
1663
|
}
|
1599
1664
|
|
1600
|
-
GGML_UNUSED(
|
1665
|
+
GGML_UNUSED(i_gpu_start);
|
1666
|
+
GGML_UNUSED(offload);
|
1601
1667
|
|
1602
1668
|
return true;
|
1603
1669
|
}
|
@@ -1928,7 +1994,7 @@ namespace GGUFMeta {
|
|
1928
1994
|
target = override->bool_value;
|
1929
1995
|
return true;
|
1930
1996
|
}
|
1931
|
-
return
|
1997
|
+
return false;
|
1932
1998
|
}
|
1933
1999
|
|
1934
2000
|
template<typename OT>
|
@@ -2048,17 +2114,16 @@ struct llama_model_loader {
|
|
2048
2114
|
enum ggml_type type_max = GGML_TYPE_F32;
|
2049
2115
|
|
2050
2116
|
for (int i = 0; i < n_tensors; i++) {
|
2051
|
-
|
2052
|
-
struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, name);
|
2117
|
+
enum ggml_type type = gguf_get_tensor_type(ctx_gguf, i);
|
2053
2118
|
|
2054
|
-
n_type[
|
2119
|
+
n_type[type]++;
|
2055
2120
|
|
2056
|
-
if (n_type_max < n_type[
|
2057
|
-
n_type_max = n_type[
|
2058
|
-
type_max =
|
2121
|
+
if (n_type_max < n_type[type]) {
|
2122
|
+
n_type_max = n_type[type];
|
2123
|
+
type_max = type;
|
2059
2124
|
}
|
2060
2125
|
|
2061
|
-
LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, name, ggml_type_name(meta->type), llama_format_tensor_shape(meta).c_str());
|
2126
|
+
// LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, name, ggml_type_name(meta->type), llama_format_tensor_shape(meta).c_str());
|
2062
2127
|
}
|
2063
2128
|
|
2064
2129
|
switch (type_max) {
|
@@ -2196,34 +2261,19 @@ struct llama_model_loader {
|
|
2196
2261
|
return gguf_get_tensor_name(ctx_gguf, i);
|
2197
2262
|
}
|
2198
2263
|
|
2199
|
-
struct ggml_tensor * get_tensor_meta(
|
2200
|
-
return ggml_get_tensor(ctx_meta,
|
2264
|
+
struct ggml_tensor * get_tensor_meta(const char * name) const {
|
2265
|
+
return ggml_get_tensor(ctx_meta, name);
|
2201
2266
|
}
|
2202
2267
|
|
2203
|
-
|
2204
|
-
|
2205
|
-
mmapped_size_p = 0;
|
2206
|
-
|
2207
|
-
for (int i = 0; i < n_tensors; i++) {
|
2208
|
-
struct ggml_tensor * meta = get_tensor_meta(i);
|
2209
|
-
ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE;
|
2210
|
-
(use_mmap ? mmapped_size_p : ctx_size_p) += ggml_nbytes_pad(meta);
|
2211
|
-
}
|
2268
|
+
struct ggml_tensor * get_tensor_meta(int i) const {
|
2269
|
+
return get_tensor_meta(get_tensor_name(i));
|
2212
2270
|
}
|
2213
2271
|
|
2214
2272
|
struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend_type backend) {
|
2215
|
-
if (backend != GGML_BACKEND_CPU) {
|
2216
|
-
ggml_set_no_alloc(ctx, true);
|
2217
|
-
}
|
2218
|
-
|
2219
2273
|
struct ggml_tensor * tensor = ggml_dup_tensor(ctx, meta);
|
2220
2274
|
tensor->backend = backend; // TODO: ggml_set_backend
|
2221
2275
|
ggml_set_name(tensor, ggml_get_name(meta));
|
2222
2276
|
|
2223
|
-
if (backend != GGML_BACKEND_CPU) {
|
2224
|
-
ggml_set_no_alloc(ctx, use_mmap);
|
2225
|
-
}
|
2226
|
-
|
2227
2277
|
n_created++;
|
2228
2278
|
|
2229
2279
|
return tensor;
|
@@ -2281,91 +2331,144 @@ struct llama_model_loader {
|
|
2281
2331
|
return gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, idx);
|
2282
2332
|
}
|
2283
2333
|
|
2334
|
+
void init_mapping(bool prefetch = true) {
|
2335
|
+
/*
|
2336
|
+
// prefetch only CPU tensors
|
2337
|
+
if (use_mmap) {
|
2338
|
+
size_t size_pref = 0; // prefetch
|
2339
|
+
|
2340
|
+
for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
|
2341
|
+
struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i));
|
2342
|
+
if (cur->backend == GGML_BACKEND_CPU) {
|
2343
|
+
size_t tensor_end = gguf_get_tensor_offset(ctx_gguf, i) + ggml_nbytes(cur);
|
2344
|
+
size_pref = std::max(size_pref, tensor_end);
|
2345
|
+
}
|
2346
|
+
}
|
2347
|
+
mapping.reset(new llama_mmap(&file, gguf_get_data_offset(ctx_gguf) + size_pref, ggml_is_numa()));
|
2348
|
+
}
|
2349
|
+
*/
|
2350
|
+
// prefetch the whole file - all the data is needed anyway
|
2351
|
+
if (use_mmap) {
|
2352
|
+
mapping.reset(new llama_mmap(&file, prefetch ? -1 : 0, ggml_is_numa()));
|
2353
|
+
}
|
2354
|
+
}
|
2355
|
+
|
2356
|
+
// for backwards compatibility, does not support ggml-backend
|
2284
2357
|
void load_data_for(struct ggml_tensor * cur) const {
|
2285
2358
|
const size_t offs = file_offset(ggml_get_name(cur));
|
2286
2359
|
|
2287
|
-
if (use_mmap) {
|
2288
|
-
cur->data
|
2360
|
+
if (use_mmap && mapping) {
|
2361
|
+
GGML_ASSERT(cur->data == nullptr);
|
2362
|
+
cur->data = (uint8_t *)mapping->addr + offs;
|
2289
2363
|
} else {
|
2364
|
+
GGML_ASSERT(cur->data != nullptr);
|
2290
2365
|
file.seek(offs, SEEK_SET);
|
2291
2366
|
file.read_raw(cur->data, ggml_nbytes(cur));
|
2292
2367
|
}
|
2293
2368
|
}
|
2294
2369
|
|
2295
|
-
|
2370
|
+
// Returns false if cancelled by progress_callback
|
2371
|
+
bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) const {
|
2296
2372
|
size_t size_data = 0;
|
2297
|
-
size_t size_lock = 0;
|
2298
|
-
size_t size_pref = 0; // prefetch
|
2299
2373
|
|
2300
2374
|
for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
|
2301
2375
|
struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i));
|
2302
2376
|
size_data += ggml_nbytes(cur);
|
2303
|
-
if (cur->backend == GGML_BACKEND_CPU) {
|
2304
|
-
size_pref += ggml_nbytes(cur);
|
2305
|
-
}
|
2306
2377
|
}
|
2307
2378
|
|
2308
|
-
if (use_mmap) {
|
2309
|
-
mapping.reset(new llama_mmap(&file, size_pref, ggml_is_numa()));
|
2379
|
+
if (use_mmap && buf_mmap) {
|
2310
2380
|
if (lmlock) {
|
2311
2381
|
lmlock->init(mapping->addr);
|
2312
2382
|
}
|
2313
2383
|
}
|
2314
2384
|
|
2315
|
-
|
2385
|
+
#if (defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)) || defined(GGML_USE_CLBLAST)
|
2386
|
+
const bool legacy_offload = true;
|
2387
|
+
#else
|
2388
|
+
const bool legacy_offload = false;
|
2389
|
+
#endif
|
2390
|
+
|
2391
|
+
std::vector<no_init<uint8_t>> read_buf;
|
2392
|
+
|
2393
|
+
size_t size_done = 0;
|
2394
|
+
|
2395
|
+
size_t mmap_first = -1;
|
2396
|
+
size_t mmap_last = 0;
|
2397
|
+
|
2316
2398
|
for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
|
2317
2399
|
struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i));
|
2318
2400
|
GGML_ASSERT(cur); // unused tensors should have been caught by load_data already
|
2319
2401
|
|
2320
2402
|
if (progress_callback) {
|
2321
|
-
progress_callback((float)
|
2322
|
-
|
2323
|
-
|
2324
|
-
// allocate temp buffer if not using mmap
|
2325
|
-
if (!use_mmap && cur->data == NULL) {
|
2326
|
-
GGML_ASSERT(cur->backend != GGML_BACKEND_CPU);
|
2327
|
-
#ifdef GGML_USE_CPU_HBM
|
2328
|
-
cur->data = (uint8_t*)hbw_malloc(ggml_nbytes(cur));
|
2329
|
-
#else
|
2330
|
-
cur->data = (uint8_t*)malloc(ggml_nbytes(cur));
|
2331
|
-
#endif
|
2403
|
+
if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
|
2404
|
+
return false;
|
2405
|
+
}
|
2332
2406
|
}
|
2333
2407
|
|
2334
|
-
|
2408
|
+
const size_t offs = file_offset(ggml_get_name(cur));
|
2335
2409
|
|
2336
|
-
|
2337
|
-
|
2338
|
-
if (
|
2339
|
-
|
2340
|
-
lmlock
|
2410
|
+
if (!legacy_offload || cur->backend == GGML_BACKEND_CPU) {
|
2411
|
+
if (use_mmap && mapping) {
|
2412
|
+
if (buf_mmap) {
|
2413
|
+
ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + offs);
|
2414
|
+
if (lmlock) {
|
2415
|
+
lmlock->grow_to(offs + ggml_nbytes(cur));
|
2416
|
+
}
|
2417
|
+
mmap_first = std::min(mmap_first, offs);
|
2418
|
+
mmap_last = std::max(mmap_last, offs + ggml_nbytes(cur));
|
2419
|
+
} else {
|
2420
|
+
ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + offs, 0, ggml_nbytes(cur));
|
2341
2421
|
}
|
2342
|
-
|
2343
|
-
|
2344
|
-
|
2345
|
-
|
2346
|
-
|
2347
|
-
|
2348
|
-
|
2349
|
-
|
2350
|
-
|
2351
|
-
if (!use_mmap) {
|
2352
|
-
free(cur->data);
|
2422
|
+
} else {
|
2423
|
+
if (ggml_backend_buffer_is_host(cur->buffer)) {
|
2424
|
+
file.seek(offs, SEEK_SET);
|
2425
|
+
file.read_raw(cur->data, ggml_nbytes(cur));
|
2426
|
+
} else {
|
2427
|
+
read_buf.resize(ggml_nbytes(cur));
|
2428
|
+
file.seek(offs, SEEK_SET);
|
2429
|
+
file.read_raw(read_buf.data(), ggml_nbytes(cur));
|
2430
|
+
ggml_backend_tensor_set(cur, read_buf.data(), 0, ggml_nbytes(cur));
|
2353
2431
|
}
|
2354
|
-
|
2432
|
+
}
|
2433
|
+
} else {
|
2434
|
+
// HACK: mark tensor as allocated
|
2435
|
+
cur->data = (void *)(uintptr_t)1;
|
2436
|
+
void * data;
|
2437
|
+
if (use_mmap && mapping) {
|
2438
|
+
data = (uint8_t *) mapping->addr + offs;
|
2439
|
+
} else {
|
2440
|
+
read_buf.resize(ggml_nbytes(cur));
|
2441
|
+
file.seek(offs, SEEK_SET);
|
2442
|
+
file.read_raw(read_buf.data(), ggml_nbytes(cur));
|
2443
|
+
data = read_buf.data();
|
2444
|
+
}
|
2445
|
+
|
2446
|
+
#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
|
2447
|
+
ggml_cuda_transform_tensor(data, cur);
|
2355
2448
|
#elif defined(GGML_USE_CLBLAST)
|
2356
|
-
|
2357
|
-
|
2358
|
-
|
2359
|
-
|
2360
|
-
|
2361
|
-
break;
|
2449
|
+
GGML_ASSERT(cur->backend == GGML_BACKEND_GPU);
|
2450
|
+
ggml_cl_transform_tensor(data, cur);
|
2451
|
+
#else
|
2452
|
+
GGML_ASSERT(!"GPU tensor without a GPU backend");
|
2453
|
+
GGML_UNUSED(data);
|
2362
2454
|
#endif
|
2363
|
-
default:
|
2364
|
-
continue;
|
2365
2455
|
}
|
2366
2456
|
|
2367
|
-
|
2457
|
+
size_done += ggml_nbytes(cur);
|
2368
2458
|
}
|
2459
|
+
|
2460
|
+
// unmap offloaded tensors and metadata
|
2461
|
+
if (use_mmap && mapping) {
|
2462
|
+
mapping->unmap_fragment(0, mmap_first);
|
2463
|
+
mapping->unmap_fragment(mmap_last, mapping->size);
|
2464
|
+
}
|
2465
|
+
|
2466
|
+
if (progress_callback) {
|
2467
|
+
// Even though the model is done loading, we still honor
|
2468
|
+
// cancellation since we need to free allocations.
|
2469
|
+
return progress_callback(1.0f, progress_callback_user_data);
|
2470
|
+
}
|
2471
|
+
return true;
|
2369
2472
|
}
|
2370
2473
|
};
|
2371
2474
|
|
@@ -2388,25 +2491,25 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
|
|
2388
2491
|
|
2389
2492
|
switch (ftype) {
|
2390
2493
|
case LLAMA_FTYPE_ALL_F32: return "all F32";
|
2391
|
-
case LLAMA_FTYPE_MOSTLY_F16: return "
|
2392
|
-
case LLAMA_FTYPE_MOSTLY_Q4_0: return "
|
2393
|
-
case LLAMA_FTYPE_MOSTLY_Q4_1: return "
|
2494
|
+
case LLAMA_FTYPE_MOSTLY_F16: return "F16";
|
2495
|
+
case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
|
2496
|
+
case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
|
2394
2497
|
case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
|
2395
|
-
return "
|
2396
|
-
case LLAMA_FTYPE_MOSTLY_Q5_0: return "
|
2397
|
-
case LLAMA_FTYPE_MOSTLY_Q5_1: return "
|
2398
|
-
case LLAMA_FTYPE_MOSTLY_Q8_0: return "
|
2498
|
+
return "Q4_1, some F16";
|
2499
|
+
case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0";
|
2500
|
+
case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1";
|
2501
|
+
case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
|
2399
2502
|
|
2400
2503
|
// K-quants
|
2401
|
-
case LLAMA_FTYPE_MOSTLY_Q2_K: return "
|
2402
|
-
case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "
|
2403
|
-
case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "
|
2404
|
-
case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "
|
2405
|
-
case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "
|
2406
|
-
case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "
|
2407
|
-
case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "
|
2408
|
-
case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "
|
2409
|
-
case LLAMA_FTYPE_MOSTLY_Q6_K: return "
|
2504
|
+
case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K";
|
2505
|
+
case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
|
2506
|
+
case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium";
|
2507
|
+
case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large";
|
2508
|
+
case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "Q4_K - Small";
|
2509
|
+
case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "Q4_K - Medium";
|
2510
|
+
case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
|
2511
|
+
case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
|
2512
|
+
case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
|
2410
2513
|
|
2411
2514
|
default: return "unknown, may not work";
|
2412
2515
|
}
|
@@ -2524,6 +2627,7 @@ static void llm_load_hparams(
|
|
2524
2627
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
2525
2628
|
|
2526
2629
|
switch (hparams.n_layer) {
|
2630
|
+
case 22: model.type = e_model::MODEL_1B; break;
|
2527
2631
|
case 26: model.type = e_model::MODEL_3B; break;
|
2528
2632
|
case 32: model.type = e_model::MODEL_7B; break;
|
2529
2633
|
case 40: model.type = e_model::MODEL_13B; break;
|
@@ -2625,6 +2729,15 @@ static void llm_load_hparams(
|
|
2625
2729
|
default: model.type = e_model::MODEL_UNKNOWN;
|
2626
2730
|
}
|
2627
2731
|
} break;
|
2732
|
+
case LLM_ARCH_PHI2:
|
2733
|
+
{
|
2734
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
2735
|
+
|
2736
|
+
switch (hparams.n_layer) {
|
2737
|
+
case 32: model.type = e_model::MODEL_3B; break;
|
2738
|
+
default: model.type = e_model::MODEL_UNKNOWN;
|
2739
|
+
}
|
2740
|
+
} break;
|
2628
2741
|
|
2629
2742
|
default: (void)0;
|
2630
2743
|
}
|
@@ -2932,7 +3045,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
|
2932
3045
|
if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
|
2933
3046
|
}
|
2934
3047
|
|
2935
|
-
|
3048
|
+
// Returns false if cancelled by progress_callback
|
3049
|
+
static bool llm_load_tensors(
|
2936
3050
|
llama_model_loader & ml,
|
2937
3051
|
llama_model & model,
|
2938
3052
|
int n_gpu_layers,
|
@@ -2948,25 +3062,16 @@ static void llm_load_tensors(
|
|
2948
3062
|
|
2949
3063
|
model.n_gpu_layers = n_gpu_layers;
|
2950
3064
|
|
2951
|
-
size_t ctx_size;
|
2952
|
-
size_t mmapped_size;
|
2953
|
-
|
2954
|
-
ml.calc_sizes(ctx_size, mmapped_size);
|
3065
|
+
size_t ctx_size = ggml_tensor_overhead() * ml.n_tensors;
|
2955
3066
|
|
2956
|
-
LLAMA_LOG_INFO("%s: ggml ctx size
|
3067
|
+
LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, ctx_size/1024.0/1024.0);
|
2957
3068
|
|
2958
3069
|
// create the ggml context
|
2959
3070
|
{
|
2960
|
-
model.buf.resize(ctx_size);
|
2961
|
-
if (use_mlock) {
|
2962
|
-
model.mlock_buf.init (model.buf.data);
|
2963
|
-
model.mlock_buf.grow_to(model.buf.size);
|
2964
|
-
}
|
2965
|
-
|
2966
3071
|
struct ggml_init_params params = {
|
2967
|
-
/*.mem_size =*/
|
2968
|
-
/*.mem_buffer =*/
|
2969
|
-
/*.no_alloc =*/
|
3072
|
+
/*.mem_size =*/ ctx_size,
|
3073
|
+
/*.mem_buffer =*/ NULL,
|
3074
|
+
/*.no_alloc =*/ true,
|
2970
3075
|
};
|
2971
3076
|
|
2972
3077
|
model.ctx = ggml_init(params);
|
@@ -2977,25 +3082,24 @@ static void llm_load_tensors(
|
|
2977
3082
|
|
2978
3083
|
(void) main_gpu;
|
2979
3084
|
|
2980
|
-
enum ggml_backend_type llama_backend_offload
|
3085
|
+
enum ggml_backend_type llama_backend_offload = GGML_BACKEND_CPU;
|
2981
3086
|
enum ggml_backend_type llama_backend_offload_split = GGML_BACKEND_CPU;
|
2982
3087
|
|
2983
|
-
#
|
3088
|
+
#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
|
2984
3089
|
if (ggml_cublas_loaded()) {
|
2985
3090
|
LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__);
|
2986
3091
|
ggml_cuda_set_main_device(main_gpu);
|
2987
3092
|
|
2988
|
-
llama_backend_offload
|
3093
|
+
llama_backend_offload = GGML_BACKEND_GPU;
|
2989
3094
|
llama_backend_offload_split = GGML_BACKEND_GPU_SPLIT;
|
2990
3095
|
}
|
2991
3096
|
#elif defined(GGML_USE_CLBLAST)
|
2992
3097
|
LLAMA_LOG_INFO("%s: using OpenCL for GPU acceleration\n", __func__);
|
2993
|
-
llama_backend_offload
|
3098
|
+
llama_backend_offload = GGML_BACKEND_GPU;
|
2994
3099
|
llama_backend_offload_split = GGML_BACKEND_GPU;
|
2995
3100
|
#endif
|
2996
3101
|
|
2997
|
-
//
|
2998
|
-
size_t vram_weights = 0;
|
3102
|
+
// create tensors for the weights
|
2999
3103
|
{
|
3000
3104
|
const int64_t n_embd = hparams.n_embd;
|
3001
3105
|
const int64_t n_embd_gqa = hparams.n_embd_gqa();
|
@@ -3024,13 +3128,6 @@ static void llm_load_tensors(
|
|
3024
3128
|
|
3025
3129
|
model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
|
3026
3130
|
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
|
3027
|
-
|
3028
|
-
if (backend_norm == GGML_BACKEND_GPU) {
|
3029
|
-
vram_weights += ggml_nbytes(model.output_norm);
|
3030
|
-
}
|
3031
|
-
if (backend_output == GGML_BACKEND_GPU_SPLIT) {
|
3032
|
-
vram_weights += ggml_nbytes(model.output);
|
3033
|
-
}
|
3034
3131
|
}
|
3035
3132
|
|
3036
3133
|
const uint32_t n_ff = hparams.n_ff;
|
@@ -3080,28 +3177,6 @@ static void llm_load_tensors(
|
|
3080
3177
|
layer.ffn_up_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), {n_embd, n_ff}, backend_split);
|
3081
3178
|
}
|
3082
3179
|
}
|
3083
|
-
|
3084
|
-
if (backend == GGML_BACKEND_GPU) {
|
3085
|
-
vram_weights +=
|
3086
|
-
ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
|
3087
|
-
ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) +
|
3088
|
-
(layer.bq ? ggml_nbytes(layer.bq) : 0) +
|
3089
|
-
(layer.bk ? ggml_nbytes(layer.bk) : 0) +
|
3090
|
-
(layer.bv ? ggml_nbytes(layer.bv) : 0) +
|
3091
|
-
(layer.bo ? ggml_nbytes(layer.bo) : 0) +
|
3092
|
-
ggml_nbytes(layer.ffn_norm);
|
3093
|
-
|
3094
|
-
if (layer.ffn_gate_inp == nullptr) {
|
3095
|
-
vram_weights +=
|
3096
|
-
ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
|
3097
|
-
} else {
|
3098
|
-
vram_weights += ggml_nbytes(layer.ffn_gate_inp);
|
3099
|
-
for (uint32_t x = 0; x < hparams.n_expert; ++x) {
|
3100
|
-
vram_weights +=
|
3101
|
-
ggml_nbytes(layer.ffn_gate_exp[x]) + ggml_nbytes(layer.ffn_down_exp[x]) + ggml_nbytes(layer.ffn_up_exp[x]);
|
3102
|
-
}
|
3103
|
-
}
|
3104
|
-
}
|
3105
3180
|
}
|
3106
3181
|
} break;
|
3107
3182
|
case LLM_ARCH_BAICHUAN:
|
@@ -3121,13 +3196,6 @@ static void llm_load_tensors(
|
|
3121
3196
|
|
3122
3197
|
model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
|
3123
3198
|
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
|
3124
|
-
|
3125
|
-
if (backend_norm == GGML_BACKEND_GPU) {
|
3126
|
-
vram_weights += ggml_nbytes(model.output_norm);
|
3127
|
-
}
|
3128
|
-
if (backend_output == GGML_BACKEND_GPU_SPLIT) {
|
3129
|
-
vram_weights += ggml_nbytes(model.output);
|
3130
|
-
}
|
3131
3199
|
}
|
3132
3200
|
|
3133
3201
|
const uint32_t n_ff = hparams.n_ff;
|
@@ -3154,19 +3222,10 @@ static void llm_load_tensors(
|
|
3154
3222
|
layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
|
3155
3223
|
layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
|
3156
3224
|
layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
|
3157
|
-
|
3158
|
-
if (backend == GGML_BACKEND_GPU) {
|
3159
|
-
vram_weights +=
|
3160
|
-
ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
|
3161
|
-
ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
|
3162
|
-
ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
|
3163
|
-
}
|
3164
3225
|
}
|
3165
3226
|
} break;
|
3166
3227
|
case LLM_ARCH_FALCON:
|
3167
3228
|
{
|
3168
|
-
// TODO: CPU-only for now
|
3169
|
-
|
3170
3229
|
model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
|
3171
3230
|
|
3172
3231
|
// output
|
@@ -3185,14 +3244,6 @@ static void llm_load_tensors(
|
|
3185
3244
|
model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
|
3186
3245
|
model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
|
3187
3246
|
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
|
3188
|
-
|
3189
|
-
if (backend_norm == GGML_BACKEND_GPU) {
|
3190
|
-
vram_weights += ggml_nbytes(model.output_norm);
|
3191
|
-
vram_weights += ggml_nbytes(model.output_norm_b);
|
3192
|
-
}
|
3193
|
-
if (backend_output == GGML_BACKEND_GPU_SPLIT) {
|
3194
|
-
vram_weights += ggml_nbytes(model.output);
|
3195
|
-
}
|
3196
3247
|
}
|
3197
3248
|
|
3198
3249
|
const uint32_t n_ff = hparams.n_ff;
|
@@ -3213,11 +3264,6 @@ static void llm_load_tensors(
|
|
3213
3264
|
if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str()) >= 0) {
|
3214
3265
|
layer.attn_norm_2 = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, backend);
|
3215
3266
|
layer.attn_norm_2_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, backend);
|
3216
|
-
|
3217
|
-
if (backend == GGML_BACKEND_GPU) {
|
3218
|
-
vram_weights += ggml_nbytes(layer.attn_norm_2);
|
3219
|
-
vram_weights += ggml_nbytes(layer.attn_norm_2_b);
|
3220
|
-
}
|
3221
3267
|
}
|
3222
3268
|
|
3223
3269
|
layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
|
@@ -3225,13 +3271,6 @@ static void llm_load_tensors(
|
|
3225
3271
|
|
3226
3272
|
layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
|
3227
3273
|
layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
|
3228
|
-
|
3229
|
-
if (backend == GGML_BACKEND_GPU) {
|
3230
|
-
vram_weights +=
|
3231
|
-
ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
|
3232
|
-
ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.wo) +
|
3233
|
-
ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
|
3234
|
-
}
|
3235
3274
|
}
|
3236
3275
|
} break;
|
3237
3276
|
case LLM_ARCH_STARCODER:
|
@@ -3255,14 +3294,6 @@ static void llm_load_tensors(
|
|
3255
3294
|
model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
|
3256
3295
|
model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
|
3257
3296
|
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
|
3258
|
-
|
3259
|
-
if (backend_norm == GGML_BACKEND_GPU) {
|
3260
|
-
vram_weights += ggml_nbytes(model.output_norm);
|
3261
|
-
vram_weights += ggml_nbytes(model.output_norm_b);
|
3262
|
-
}
|
3263
|
-
if (backend_output == GGML_BACKEND_GPU_SPLIT) {
|
3264
|
-
vram_weights += ggml_nbytes(model.output);
|
3265
|
-
}
|
3266
3297
|
}
|
3267
3298
|
|
3268
3299
|
const uint32_t n_ff = hparams.n_ff;
|
@@ -3294,16 +3325,6 @@ static void llm_load_tensors(
|
|
3294
3325
|
|
3295
3326
|
layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
|
3296
3327
|
layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
|
3297
|
-
|
3298
|
-
if (backend == GGML_BACKEND_GPU) {
|
3299
|
-
vram_weights +=
|
3300
|
-
ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
|
3301
|
-
ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) +
|
3302
|
-
ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) +
|
3303
|
-
ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) +
|
3304
|
-
ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_down_b) +
|
3305
|
-
ggml_nbytes(layer.ffn_up) + ggml_nbytes(layer.ffn_up_b);
|
3306
|
-
}
|
3307
3328
|
}
|
3308
3329
|
} break;
|
3309
3330
|
case LLM_ARCH_PERSIMMON:
|
@@ -3325,14 +3346,6 @@ static void llm_load_tensors(
|
|
3325
3346
|
model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
|
3326
3347
|
model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
|
3327
3348
|
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
|
3328
|
-
|
3329
|
-
if (backend_norm == GGML_BACKEND_GPU) {
|
3330
|
-
vram_weights += ggml_nbytes(model.output_norm);
|
3331
|
-
vram_weights += ggml_nbytes(model.output_norm_b);
|
3332
|
-
}
|
3333
|
-
if (backend_output == GGML_BACKEND_GPU_SPLIT) {
|
3334
|
-
vram_weights += ggml_nbytes(model.output);
|
3335
|
-
}
|
3336
3349
|
}
|
3337
3350
|
|
3338
3351
|
const uint32_t n_ff = hparams.n_ff;
|
@@ -3362,8 +3375,6 @@ static void llm_load_tensors(
|
|
3362
3375
|
} break;
|
3363
3376
|
case LLM_ARCH_BLOOM:
|
3364
3377
|
{
|
3365
|
-
// TODO: CPU-only for now
|
3366
|
-
|
3367
3378
|
model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
|
3368
3379
|
model.tok_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, GGML_BACKEND_CPU);
|
3369
3380
|
model.tok_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, GGML_BACKEND_CPU);
|
@@ -3384,14 +3395,6 @@ static void llm_load_tensors(
|
|
3384
3395
|
model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
|
3385
3396
|
model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
|
3386
3397
|
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
|
3387
|
-
|
3388
|
-
if (backend_norm == GGML_BACKEND_GPU) {
|
3389
|
-
vram_weights += ggml_nbytes(model.output_norm);
|
3390
|
-
vram_weights += ggml_nbytes(model.output_norm_b);
|
3391
|
-
}
|
3392
|
-
if (backend_output == GGML_BACKEND_GPU_SPLIT) {
|
3393
|
-
vram_weights += ggml_nbytes(model.output);
|
3394
|
-
}
|
3395
3398
|
}
|
3396
3399
|
|
3397
3400
|
const uint32_t n_ff = hparams.n_ff;
|
@@ -3423,16 +3426,6 @@ static void llm_load_tensors(
|
|
3423
3426
|
|
3424
3427
|
layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
|
3425
3428
|
layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
|
3426
|
-
|
3427
|
-
if (backend == GGML_BACKEND_GPU) {
|
3428
|
-
vram_weights +=
|
3429
|
-
ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
|
3430
|
-
ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) +
|
3431
|
-
ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) +
|
3432
|
-
ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) +
|
3433
|
-
ggml_nbytes(layer.ffn_up) + ggml_nbytes(layer.ffn_up_b) +
|
3434
|
-
ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_down_b);
|
3435
|
-
}
|
3436
3429
|
}
|
3437
3430
|
} break;
|
3438
3431
|
case LLM_ARCH_MPT:
|
@@ -3454,13 +3447,6 @@ static void llm_load_tensors(
|
|
3454
3447
|
|
3455
3448
|
model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
|
3456
3449
|
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
|
3457
|
-
|
3458
|
-
if (backend_norm == GGML_BACKEND_GPU) {
|
3459
|
-
vram_weights += ggml_nbytes(model.output_norm);
|
3460
|
-
}
|
3461
|
-
if (backend_output == GGML_BACKEND_GPU_SPLIT) {
|
3462
|
-
vram_weights += ggml_nbytes(model.output);
|
3463
|
-
}
|
3464
3450
|
}
|
3465
3451
|
|
3466
3452
|
const uint32_t n_ff = hparams.n_ff;
|
@@ -3483,16 +3469,6 @@ static void llm_load_tensors(
|
|
3483
3469
|
|
3484
3470
|
layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
|
3485
3471
|
layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
|
3486
|
-
|
3487
|
-
if (backend == GGML_BACKEND_GPU) {
|
3488
|
-
vram_weights +=
|
3489
|
-
ggml_nbytes(layer.attn_norm) +
|
3490
|
-
ggml_nbytes(layer.wqkv) +
|
3491
|
-
ggml_nbytes(layer.wo) +
|
3492
|
-
ggml_nbytes(layer.ffn_norm) +
|
3493
|
-
ggml_nbytes(layer.ffn_down) +
|
3494
|
-
ggml_nbytes(layer.ffn_up);
|
3495
|
-
}
|
3496
3472
|
}
|
3497
3473
|
} break;
|
3498
3474
|
case LLM_ARCH_STABLELM:
|
@@ -3515,13 +3491,6 @@ static void llm_load_tensors(
|
|
3515
3491
|
model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
|
3516
3492
|
model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
|
3517
3493
|
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
|
3518
|
-
|
3519
|
-
if (backend_norm == GGML_BACKEND_GPU) {
|
3520
|
-
vram_weights += ggml_nbytes(model.output_norm);
|
3521
|
-
}
|
3522
|
-
if (backend_output == GGML_BACKEND_GPU_SPLIT) {
|
3523
|
-
vram_weights += ggml_nbytes(model.output);
|
3524
|
-
}
|
3525
3494
|
}
|
3526
3495
|
|
3527
3496
|
const uint32_t n_ff = hparams.n_ff;
|
@@ -3553,13 +3522,6 @@ static void llm_load_tensors(
|
|
3553
3522
|
layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
|
3554
3523
|
layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
|
3555
3524
|
layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
|
3556
|
-
|
3557
|
-
if (backend == GGML_BACKEND_GPU) {
|
3558
|
-
vram_weights +=
|
3559
|
-
ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
|
3560
|
-
ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
|
3561
|
-
ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
|
3562
|
-
}
|
3563
3525
|
}
|
3564
3526
|
} break;
|
3565
3527
|
case LLM_ARCH_QWEN:
|
@@ -3579,14 +3541,7 @@ static void llm_load_tensors(
|
|
3579
3541
|
|
3580
3542
|
model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
|
3581
3543
|
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
|
3582
|
-
|
3583
|
-
if (backend_norm == GGML_BACKEND_GPU) {
|
3584
|
-
vram_weights += ggml_nbytes(model.output_norm);
|
3585
|
-
}
|
3586
|
-
if (backend_output == GGML_BACKEND_GPU_SPLIT) {
|
3587
|
-
vram_weights += ggml_nbytes(model.output);
|
3588
|
-
}
|
3589
|
-
}
|
3544
|
+
}
|
3590
3545
|
|
3591
3546
|
const uint32_t n_ff = hparams.n_ff / 2;
|
3592
3547
|
|
@@ -3611,16 +3566,59 @@ static void llm_load_tensors(
|
|
3611
3566
|
layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
|
3612
3567
|
layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
|
3613
3568
|
layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
|
3569
|
+
}
|
3570
|
+
} break;
|
3571
|
+
case LLM_ARCH_PHI2:
|
3572
|
+
{
|
3573
|
+
model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
|
3574
|
+
|
3575
|
+
// output
|
3576
|
+
{
|
3577
|
+
ggml_backend_type backend_norm;
|
3578
|
+
ggml_backend_type backend_output;
|
3614
3579
|
|
3615
|
-
if (
|
3616
|
-
|
3617
|
-
|
3618
|
-
|
3619
|
-
|
3580
|
+
if (n_gpu_layers > int(n_layer)) {
|
3581
|
+
backend_norm = llama_backend_offload;
|
3582
|
+
backend_output = llama_backend_offload;
|
3583
|
+
} else {
|
3584
|
+
backend_norm = GGML_BACKEND_CPU;
|
3585
|
+
backend_output = GGML_BACKEND_CPU;
|
3620
3586
|
}
|
3587
|
+
|
3588
|
+
model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
|
3589
|
+
model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
|
3590
|
+
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
|
3591
|
+
model.output_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, backend_output);
|
3621
3592
|
}
|
3622
|
-
} break;
|
3623
3593
|
|
3594
|
+
const uint32_t n_ff = hparams.n_ff;
|
3595
|
+
|
3596
|
+
const int i_gpu_start = n_layer - n_gpu_layers;
|
3597
|
+
|
3598
|
+
model.layers.resize(n_layer);
|
3599
|
+
|
3600
|
+
for (uint32_t i = 0; i < n_layer; ++i) {
|
3601
|
+
const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
|
3602
|
+
const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
|
3603
|
+
|
3604
|
+
auto & layer = model.layers[i];
|
3605
|
+
|
3606
|
+
layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
|
3607
|
+
layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
|
3608
|
+
|
3609
|
+
layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
|
3610
|
+
layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend);
|
3611
|
+
|
3612
|
+
layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
|
3613
|
+
layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend);
|
3614
|
+
|
3615
|
+
layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
|
3616
|
+
layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend);
|
3617
|
+
|
3618
|
+
layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
|
3619
|
+
layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
|
3620
|
+
}
|
3621
|
+
} break;
|
3624
3622
|
default:
|
3625
3623
|
throw std::runtime_error("unknown architecture");
|
3626
3624
|
}
|
@@ -3628,16 +3626,78 @@ static void llm_load_tensors(

  ml.done_getting_tensors();

+ ml.init_mapping();
+
+ // allocate tensors
+ size_t vram_weights = 0;
+ size_t buf_size = 0;
+
+ ggml_backend_buffer_type_t buft = llama_default_buffer_type(n_gpu_layers);
+
+ for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
+ // GGML_BACKEND_GPU tensors are for CUDA and OpenCL only, which are handled separately without ggml-backend
+ if (t->backend == GGML_BACKEND_CPU) {
+ buf_size += GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), ggml_backend_buft_get_alignment(buft));
+ } else {
+ vram_weights += ggml_nbytes(t);
+ }
+ }
+
+ // create backend buffer
+ ggml_backend_buffer_t buf_mmap = nullptr;
+
+ #ifdef GGML_USE_METAL
+ if (n_gpu_layers > 0) {
+ if (ml.use_mmap) {
+ const size_t max_size = ggml_get_max_tensor_size(ctx);
+ model.buf = ggml_backend_metal_buffer_from_ptr(ml.mapping->addr, ml.mapping->size, max_size);
+ buf_mmap = model.buf;
+ } else {
+ model.buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_metal_buffer_type());
+ }
+ }
+ #elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST)
+ // for testing only
+ if (n_gpu_layers > 0) {
+ model.buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cuda_buffer_type(0));
+ }
+ #endif
+
+ if (model.buf == nullptr) {
+ // CPU backend, and indirectly CUDA and OpenCL
+ if (ml.use_mmap) {
+ model.buf = ggml_backend_cpu_buffer_from_ptr(ml.mapping->addr, ml.mapping->size);
+ buf_mmap = model.buf;
+ } else {
+ // allocate only CPU tensors
+ model.buf = ggml_backend_buft_alloc_buffer(buft, buf_size);
+ ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(model.buf);
+ for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
+ if (t->backend == GGML_BACKEND_CPU) {
+ ggml_tallocr_alloc(alloc, t);
+ }
+ }
+ ggml_tallocr_free(alloc);
+ }
+ }
+
+ if (use_mlock && ggml_backend_buffer_is_host(model.buf)) {
+ model.mlock_buf.init (ggml_backend_buffer_get_base(model.buf));
+ model.mlock_buf.grow_to(ggml_backend_buffer_get_size(model.buf));
+ }
+
  // print memory requirements
  {
-
- size_t mem_required =
- ctx_size +
- mmapped_size - vram_weights; // weights in VRAM not in memory
+ size_t sys_mem_required = ctx_size + buf_size;

-
+ if (sys_mem_required > 0) {
+ LLAMA_LOG_INFO("%s: system memory used = %7.2f MiB\n", __func__, sys_mem_required / 1024.0 / 1024.0);
+ }
+ if (vram_weights > 0) {
+ LLAMA_LOG_INFO("%s: VRAM used = %7.2f MiB\n", __func__, vram_weights / 1024.0 / 1024.0);
+ }

- #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+ #if (defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)) || defined(GGML_USE_CLBLAST)
  const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));

  LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
@@ -3645,38 +3705,27 @@ static void llm_load_tensors(
  LLAMA_LOG_INFO("%s: offloading non-repeating layers to GPU\n", __func__);
  }

- #ifdef GGML_USE_CUBLAS
- const int max_backend_supported_layers = hparams.n_layer + 1;
- const int max_offloadable_layers = hparams.n_layer + 1;
- #elif GGML_USE_CLBLAST
  const int max_backend_supported_layers = hparams.n_layer + 1;
  const int max_offloadable_layers = hparams.n_layer + 1;
- #endif // GGML_USE_CUBLAS

  LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
- LLAMA_LOG_INFO("%s: VRAM used: %.2f MiB\n", __func__, vram_weights / 1024.0 / 1024.0);
- #else
- (void) n_gpu_layers;
  #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
  }

-
+ #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
+ ggml_cuda_set_tensor_split(tensor_split);
+ #else
+ GGML_UNUSED(tensor_split);
+ #endif // GGML_USE_CUBLAS
+
+ // populate tensors_by_name
  for (int i = 0; i < ml.n_tensors; ++i) {
  struct ggml_tensor * cur = ggml_get_tensor(ctx, ml.get_tensor_name(i));
  model.tensors_by_name.emplace_back(ggml_get_name(cur), cur);
  }

- (
-
- {
- ggml_cuda_set_tensor_split(tensor_split);
- }
- #endif
-
- ml.load_all_data(ctx, progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL);
-
- if (progress_callback) {
- progress_callback(1.0f, progress_callback_user_data);
+ if (!ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf_mmap, use_mlock ? &model.mlock_mmap : NULL)) {
+ return false;
  }

  model.mapping = std::move(ml.mapping);
@@ -3684,9 +3733,11 @@ static void llm_load_tensors(
  // loading time will be recalculate after the first eval, so
  // we take page faults deferred by mmap() into consideration
  model.t_load_us = ggml_time_us() - model.t_start_us;
+ return true;
  }

-
+ // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
+ static int llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) {
  try {
  llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);

@@ -3704,19 +3755,21 @@ static bool llama_model_load(const std::string & fname, llama_model & model, con

  if (params.vocab_only) {
  LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
- return
+ return 0;
  }

- llm_load_tensors(
+ if (!llm_load_tensors(
  ml, model, params.n_gpu_layers, params.main_gpu, params.tensor_split, params.use_mlock,
  params.progress_callback, params.progress_callback_user_data
- )
+ )) {
+ return -2;
+ }
  } catch (const std::exception & err) {
  LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
- return
+ return -1;
  }

- return
+ return 0;
  }

  //
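Note on the new return convention above: llm_load_tensors now returns a bool and llama_model_load returns an int (0 on success, -1 on error, -2 when the load is cancelled through llama_progress_callback). As a rough illustration only (hypothetical caller code, not part of this diff; it uses only llama_load_model_from_file and related calls from the public llama.h), both failure paths surface to an application as a null model pointer:

    #include "llama.h"
    #include <cstdio>

    int main(int argc, char ** argv) {
        if (argc < 2) { return 1; }
        llama_backend_init(false);
        llama_model_params mparams = llama_model_default_params();
        // A load error (-1 internally) and a cancelling progress callback
        // (-2 internally) both end up here as a null pointer.
        llama_model * model = llama_load_model_from_file(argv[1], mparams);
        if (model == NULL) {
            fprintf(stderr, "failed to load (or cancelled): %s\n", argv[1]);
            llama_backend_free();
            return 1;
        }
        llama_free_model(model);
        llama_backend_free();
        return 0;
    }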
@@ -3981,17 +4034,18 @@ static struct ggml_tensor * llm_build_ffn(
  // if max_alibi_bias > 0 then apply ALiBi
  static struct ggml_tensor * llm_build_kqv(
  struct ggml_context * ctx,
+ const llama_model & model,
  const llama_hparams & hparams,
  const llama_kv_cache & kv,
  struct ggml_tensor * wo,
  struct ggml_tensor * wo_b,
  struct ggml_tensor * q_cur,
- struct ggml_tensor * kq_scale,
  struct ggml_tensor * kq_mask,
  int64_t n_ctx,
  int32_t n_tokens,
  int32_t n_kv,
  float max_alibi_bias,
+ float kq_scale,
  const llm_build_cb & cb,
  int il) {
  const int64_t n_embd = hparams.n_embd;
@@ -4014,6 +4068,12 @@ static struct ggml_tensor * llm_build_kqv(
  struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
  cb(kq, "kq", il);

+ if (model.arch == LLM_ARCH_PHI2) {
+ // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
+ // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
+ ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
+ }
+
  if (max_alibi_bias > 0.0f) {
  // temporary branch until we figure out how to handle ggml_alibi through ggml_add
  kq = ggml_scale(ctx, kq, kq_scale);
@@ -4033,7 +4093,7 @@ static struct ggml_tensor * llm_build_kqv(
  kq = ggml_soft_max(ctx, kq);
  cb(kq, "kq_soft_max", il);
  } else {
- kq = ggml_soft_max_ext(ctx, kq, kq_mask,
+ kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale);
  cb(kq, "kq_soft_max_ext", il);
  }

@@ -4102,7 +4162,7 @@ struct llm_build_context {

  const llm_build_cb & cb;

-
+ std::vector<uint8_t> & buf_compute_meta;

  struct ggml_context * ctx0 = nullptr;

@@ -4112,35 +4172,35 @@ struct llm_build_context {
  const llama_batch & batch,
  const llm_build_cb & cb,
  bool worst_case) :
- model
- hparams
- cparams
- batch
- kv_self
- n_embd
- n_layer
- n_ctx
- n_head
- n_head_kv
- n_embd_head
- n_embd_gqa
- n_expert
- n_expert_used
- freq_base
- freq_scale
- ext_factor
- attn_factor
- beta_fast
- beta_slow
- norm_eps
- norm_rms_eps
- n_tokens
- n_kv
- kv_head
- n_orig_ctx
- do_rope_shift
- cb
-
+ model (lctx.model),
+ hparams (model.hparams),
+ cparams (lctx.cparams),
+ batch (batch),
+ kv_self (lctx.kv_self),
+ n_embd (hparams.n_embd),
+ n_layer (hparams.n_layer),
+ n_ctx (cparams.n_ctx),
+ n_head (hparams.n_head),
+ n_head_kv (hparams.n_head_kv),
+ n_embd_head (hparams.n_embd_head()),
+ n_embd_gqa (hparams.n_embd_gqa()),
+ n_expert (hparams.n_expert),
+ n_expert_used (hparams.n_expert_used),
+ freq_base (cparams.rope_freq_base),
+ freq_scale (cparams.rope_freq_scale),
+ ext_factor (cparams.yarn_ext_factor),
+ attn_factor (cparams.yarn_attn_factor),
+ beta_fast (cparams.yarn_beta_fast),
+ beta_slow (cparams.yarn_beta_slow),
+ norm_eps (hparams.f_norm_eps),
+ norm_rms_eps (hparams.f_norm_rms_eps),
+ n_tokens (batch.n_tokens),
+ n_kv (worst_case ? n_ctx : kv_self.n),
+ kv_head (worst_case ? n_ctx - n_tokens : kv_self.head),
+ n_orig_ctx (cparams.n_yarn_orig_ctx),
+ do_rope_shift (worst_case || kv_self.has_shift),
+ cb (cb),
+ buf_compute_meta (lctx.buf_compute_meta) {
  GGML_ASSERT(!!kv_self.ctx);

  // all initializations should be done in init()
@@ -4148,8 +4208,8 @@ struct llm_build_context {

  void init() {
  struct ggml_init_params params = {
- /*.mem_size =*/
- /*.mem_buffer =*/
+ /*.mem_size =*/ buf_compute_meta.size(),
+ /*.mem_buffer =*/ buf_compute_meta.data(),
  /*.no_alloc =*/ true,
  };

@@ -4178,10 +4238,6 @@ struct llm_build_context {
  struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
  cb(inp_pos, "inp_pos", -1);

- // KQ_scale
- struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
- cb(KQ_scale, "KQ_scale", -1);
-
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
  cb(KQ_mask, "KQ_mask", -1);
@@ -4240,9 +4296,9 @@ struct llm_build_context {

  llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);

- cur = llm_build_kqv(ctx0, hparams, kv_self,
+ cur = llm_build_kqv(ctx0, model, hparams, kv_self,
  model.layers[il].wo, model.layers[il].bo,
- Qcur,
+ Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  cb(cur, "kqv_out", il);
  }

@@ -4363,10 +4419,6 @@ struct llm_build_context {
  struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
  cb(inp_pos, "inp_pos", -1);

- // KQ_scale
- struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
- cb(KQ_scale, "KQ_scale", -1);
-
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
  cb(KQ_mask, "KQ_mask", -1);
@@ -4423,9 +4475,9 @@ struct llm_build_context {
  // apply ALiBi for 13B model
  const float max_alibi_bias = model.type == MODEL_13B ? 8.0f : -1.0f;

- cur = llm_build_kqv(ctx0, hparams, kv_self,
+ cur = llm_build_kqv(ctx0, model, hparams, kv_self,
  model.layers[il].wo, NULL,
- Qcur,
+ Qcur, KQ_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  cb(cur, "kqv_out", il);
  }

@@ -4483,10 +4535,6 @@ struct llm_build_context {
  struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
  cb(inp_pos, "inp_pos", -1);

- // KQ_scale
- struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
- cb(KQ_scale, "KQ_scale", -1);
-
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
  cb(KQ_mask, "KQ_mask", -1);
@@ -4547,9 +4595,9 @@ struct llm_build_context {

  llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);

- cur = llm_build_kqv(ctx0, hparams, kv_self,
+ cur = llm_build_kqv(ctx0, model, hparams, kv_self,
  model.layers[il].wo, NULL,
- Qcur,
+ Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  cb(cur, "kqv_out", il);
  }

@@ -4606,10 +4654,6 @@ struct llm_build_context {
  struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
  cb(inp_pos, "inp_pos", -1);

- // KQ_scale
- struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
- cb(KQ_scale, "KQ_scale", -1);
-
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
  cb(KQ_mask, "KQ_mask", -1);
@@ -4647,9 +4691,9 @@ struct llm_build_context {

  llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);

- cur = llm_build_kqv(ctx0, hparams, kv_self,
+ cur = llm_build_kqv(ctx0, model, hparams, kv_self,
  model.layers[il].wo, model.layers[il].bo,
- Qcur,
+ Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  cb(cur, "kqv_out", il);
  }

@@ -4706,10 +4750,6 @@ struct llm_build_context {
  struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
  cb(inp_pos, "inp_pos", -1);

- // KQ_scale
- struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
- cb(KQ_scale, "KQ_scale", -1);
-
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
  cb(KQ_mask, "KQ_mask", -1);
@@ -4856,9 +4896,9 @@ struct llm_build_context {
  llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);

  // TODO: not tested, could be broken
- cur = llm_build_kqv(ctx0, hparams, kv_self,
+ cur = llm_build_kqv(ctx0, model, hparams, kv_self,
  model.layers[il].wo, model.layers[il].bo,
- Q,
+ Q, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  cb(cur, "kqv_out", il);
  }

@@ -4912,10 +4952,6 @@ struct llm_build_context {
  inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
  cb(inpL, "inp_embd", -1);

- // KQ_scale
- struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
- cb(KQ_scale, "KQ_scale", -1);
-
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
  cb(KQ_mask, "KQ_mask", -1);
@@ -4947,9 +4983,9 @@ struct llm_build_context {

  llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);

- cur = llm_build_kqv(ctx0, hparams, kv_self,
+ cur = llm_build_kqv(ctx0, model, hparams, kv_self,
  model.layers[il].wo, NULL,
- Qcur,
+ Qcur, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  cb(cur, "kqv_out", il);
  }

@@ -5003,10 +5039,6 @@ struct llm_build_context {
  inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
  cb(inpL, "inp_embd", -1);

- // KQ_scale
- struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
- cb(KQ_scale, "KQ_scale", -1);
-
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
  cb(KQ_mask, "KQ_mask", -1);
@@ -5044,9 +5076,9 @@ struct llm_build_context {

  llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);

- cur = llm_build_kqv(ctx0, hparams, kv_self,
+ cur = llm_build_kqv(ctx0, model, hparams, kv_self,
  model.layers[il].wo, model.layers[il].bo,
- Qcur,
+ Qcur, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  cb(cur, "kqv_out", il);
  }

@@ -5097,10 +5129,6 @@ struct llm_build_context {
  inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
  cb(inpL, "inp_embd", -1);

- // KQ_scale
- struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
- cb(KQ_scale, "KQ_scale", -1);
-
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
  cb(KQ_mask, "KQ_mask", -1);
@@ -5138,9 +5166,9 @@ struct llm_build_context {

  llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);

- cur = llm_build_kqv(ctx0, hparams, kv_self,
+ cur = llm_build_kqv(ctx0, model, hparams, kv_self,
  model.layers[il].wo, NULL,
- Qcur,
+ Qcur, KQ_mask, n_ctx, n_tokens, n_kv, hparams.f_max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  cb(cur, "kqv_out", il);
  }

@@ -5200,10 +5228,6 @@ struct llm_build_context {
  struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
  cb(inp_pos, "inp_pos", -1);

- // KQ_scale
- struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
- cb(KQ_scale, "KQ_scale", -1);
-
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
  cb(KQ_mask, "KQ_mask", -1);
@@ -5251,9 +5275,9 @@ struct llm_build_context {

  llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);

- cur = llm_build_kqv(ctx0, hparams, kv_self,
+ cur = llm_build_kqv(ctx0, model, hparams, kv_self,
  model.layers[il].wo, NULL,
- Qcur,
+ Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  cb(cur, "kqv_out", il);
  }

@@ -5310,15 +5334,11 @@ struct llm_build_context {
  cb(inpL, "inp_embd", -1);

  // inp_pos - contains the positions
- struct ggml_tensor * inp_pos= ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+ struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
  cb(inp_pos, "inp_pos", -1);

- // KQ_scale
- struct ggml_tensor * KQ_scale= ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
- cb(KQ_scale, "KQ_scale", -1);
-
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
- struct ggml_tensor * KQ_mask= ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
  cb(KQ_mask, "KQ_mask", -1);

  // shift the entire K-cache if needed
@@ -5368,9 +5388,9 @@ struct llm_build_context {

  llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);

- cur = llm_build_kqv(ctx0, hparams, kv_self,
+ cur = llm_build_kqv(ctx0, model, hparams, kv_self,
  model.layers[il].wo, NULL,
- Qcur,
+ Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  cb(cur, "kqv_out", il);
  }

@@ -5412,6 +5432,116 @@ struct llm_build_context {

  ggml_build_forward_expand(gf, cur);

+ return gf;
+ }
+ struct ggml_cgraph * build_phi2() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+ struct ggml_tensor * cur;
+ struct ggml_tensor * attn_norm_output;
+ struct ggml_tensor * ffn_output;
+ struct ggml_tensor * inpL;
+
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+ cb(inpL, "inp_embd", -1);
+
+ // inp_pos - contains the positions
+ struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+ cb(inp_pos, "inp_pos", -1);
+
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+ cb(KQ_mask, "KQ_mask", -1);
+
+ // shift the entire K-cache if needed
+ if (do_rope_shift) {
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb);
+ }
+
+ for (int il = 0; il < n_layer; ++il) {
+ attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].attn_norm,
+ model.layers[il].attn_norm_b,
+ LLM_NORM, cb, il);
+ cb(attn_norm_output, "attn_norm", il);
+
+ // self-attention
+ {
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output);
+ cb(cur, "wqkv", il);
+
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+ cb(cur, "bqkv", il);
+
+ struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+ struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+ struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_custom(
+ ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
+ freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Qcur, "Qcur", il);
+
+ // with phi2, we scale the Q to avoid precision issues
+ // ref: https://github.com/ml-explore/mlx-examples/blob/08e862336ade809bc37d1035f94b359e7d1a5152/phi2/phi2.py#L64-L66
+ Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head)));
+ cb(Qcur, "Qcur", il);
+
+ Kcur = ggml_rope_custom(
+ ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
+ freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Kcur, "Kcur", il);
+
+ llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
+
+ cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f, cb, il);
+ cb(cur, "kqv_out", il);
+ }
+
+ // FF
+ {
+ ffn_output = llm_build_ffn(ctx0, attn_norm_output,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b,
+ NULL, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+ LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+ cb(ffn_output, "ffn_out", il);
+ }
+
+ cur = ggml_add(ctx0, cur, ffn_output);
+ cb(cur, "l_out", il);
+
+ cur = ggml_add(ctx0, cur, inpL);
+ cb(cur, "l_out", il);
+
+ inpL = cur;
+ }
+
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.output_norm,
+ model.output_norm_b,
+ LLM_NORM, cb, -1);
+ cb(cur, "result_norm", -1);
+
+ cur = ggml_mul_mat(ctx0, model.output, cur);
+ cb(cur, "result_output_no_bias", -1);
+
+ cur = ggml_add(ctx0, cur, model.output_b);
+ cb(cur, "result_output", -1);
+
+ ggml_build_forward_expand(gf, cur);
+
  return gf;
  }
  };
@@ -5427,7 +5557,7 @@ enum llm_offload_func_e {
  OFFLOAD_FUNC_FRC, // force offload
  OFFLOAD_FUNC_KQV,
  OFFLOAD_FUNC_NR,
- OFFLOAD_FUNC_EMB,
+ OFFLOAD_FUNC_EMB, // embeddings
  OFFLOAD_FUNC_OUT,
  };

@@ -5512,7 +5642,6 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
  { "pos_embd", OFFLOAD_FUNC_NR },

  { "inp_pos", OFFLOAD_FUNC_FRC }, // this is often used for KQ ops (e.g. rope)
- { "KQ_scale", OFFLOAD_FUNC_FRC },
  { "KQ_mask", OFFLOAD_FUNC_FRC },
  { "K_shift", OFFLOAD_FUNC_FRC },

@@ -5596,6 +5725,7 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
  { "l_out", OFFLOAD_FUNC },

  { "result_norm", OFFLOAD_FUNC_EMB },
+ { "result_output_no_bias", OFFLOAD_FUNC_EMB },
  { "result_output", OFFLOAD_FUNC_OUT },
  };

@@ -5613,11 +5743,10 @@ static struct ggml_cgraph * llama_build_graph(
  bool alloc_inp_tokens = false;
  bool alloc_inp_embd = false;
  bool alloc_inp_pos = false;
- bool alloc_inp_KQ_scale = false;
  bool alloc_inp_KQ_mask = false;
  bool alloc_inp_K_shift = false;

- #
+ #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
  const bool do_offload = true;
  #else
  const bool do_offload = true; // TODO: set to false after finishing refactoring
@@ -5645,7 +5774,7 @@ static struct ggml_cgraph * llama_build_graph(
  if (!ggml_allocr_is_measure(lctx.alloc) && batch.token) {
  const int64_t n_tokens = cur->ne[0];

-
+ ggml_backend_tensor_set(cur, batch.token, 0, n_tokens*ggml_element_size(cur));
  }

  alloc_inp_tokens = true;
@@ -5658,7 +5787,7 @@ static struct ggml_cgraph * llama_build_graph(
  const int64_t n_embd = cur->ne[0];
  const int64_t n_tokens = cur->ne[1];

-
+ ggml_backend_tensor_set(cur, batch.embd, 0, n_tokens*n_embd*ggml_element_size(cur));
  }

  alloc_inp_embd = true;
@@ -5670,27 +5799,13 @@ static struct ggml_cgraph * llama_build_graph(
  if (!ggml_allocr_is_measure(lctx.alloc) && batch.pos) {
  const int64_t n_tokens = cur->ne[0];

- int32_t
-
- for (int i = 0; i < n_tokens; ++i) {
- data[i] = batch.pos[i];
- }
+ static_assert(std::is_same<llama_pos, int32_t>::value, "llama_pos must be int32_t");
+ ggml_backend_tensor_set(cur, batch.pos, 0, n_tokens*ggml_element_size(cur));
  }

  alloc_inp_pos = true;
  }

- if (!alloc_inp_KQ_scale && strcmp(name, "KQ_scale") == 0) {
- ggml_allocr_alloc(lctx.alloc, cur);
-
- if (!ggml_allocr_is_measure(lctx.alloc)) {
- const int64_t n_embd_head = model.hparams.n_embd_head();
- ggml_set_f32(cur, 1.0f/sqrtf(float(n_embd_head)));
- }
-
- alloc_inp_KQ_scale = true;
- }
-
  if (!alloc_inp_KQ_mask && strcmp(name, "KQ_mask") == 0) {
  ggml_allocr_alloc(lctx.alloc, cur);

@@ -5698,8 +5813,13 @@ static struct ggml_cgraph * llama_build_graph(
  const int64_t n_kv = cur->ne[0];
  const int64_t n_tokens = cur->ne[1];

- float * data
-
+ float * data;
+ if (ggml_backend_buffer_is_host(cur->buffer)) {
+ data = (float *) cur->data;
+ } else {
+ lctx.buf_copy.resize(ggml_nbytes(cur));
+ data = (float *) lctx.buf_copy.data();
+ }

  for (int h = 0; h < 1; ++h) {
  for (int j = 0; j < n_tokens; ++j) {
@@ -5707,12 +5827,20 @@ static struct ggml_cgraph * llama_build_graph(
  const llama_seq_id seq_id = batch.seq_id[j][0];

  for (int i = 0; i < n_kv; ++i) {
+ float f;
  if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
-
+ f = -INFINITY;
+ } else {
+ f = 0;
  }
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
  }
  }
  }
+
+ if (data != cur->data) {
+ ggml_backend_tensor_set(cur, data, 0, ggml_nbytes(cur));
+ }
  }

  alloc_inp_KQ_mask = true;
@@ -5724,11 +5852,21 @@ static struct ggml_cgraph * llama_build_graph(
  if (!ggml_allocr_is_measure(lctx.alloc)) {
  const int64_t n_ctx = cur->ne[0];

- int32_t * data
+ int32_t * data;
+ if (ggml_backend_buffer_is_host(cur->buffer)) {
+ data = (int32_t *) cur->data;
+ } else {
+ lctx.buf_copy.resize(ggml_nbytes(cur));
+ data = (int32_t *) lctx.buf_copy.data();
+ }

  for (int i = 0; i < n_ctx; ++i) {
  data[i] = lctx.kv_self.cells[i].delta;
  }
+
+ if (data != cur->data) {
+ ggml_backend_tensor_set(cur, data, 0, ggml_nbytes(cur));
+ }
  }

  alloc_inp_K_shift = true;
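The KQ_mask and K_shift inputs above both follow the same host/device pattern: write directly into the tensor when its buffer is host memory, otherwise build the values in lctx.buf_copy and upload them with ggml_backend_tensor_set. A condensed sketch of that pattern (illustrative only, not from the patch; the helper name fill_i32 and the staging vector are assumptions, the ggml-backend calls are the ones used above):

    #include <cstdint>
    #include <vector>
    #include "ggml.h"
    #include "ggml-backend.h"

    // Fill an I32 input tensor with a constant, whether or not its buffer is host-visible.
    static void fill_i32(struct ggml_tensor * t, std::vector<uint8_t> & staging, int32_t value) {
        int32_t * data;
        if (ggml_backend_buffer_is_host(t->buffer)) {
            data = (int32_t *) t->data;              // write in place
        } else {
            staging.resize(ggml_nbytes(t));          // build a host-side copy
            data = (int32_t *) staging.data();
        }
        for (int64_t i = 0; i < ggml_nelements(t); ++i) {
            data[i] = value;
        }
        if ((void *) data != t->data) {
            ggml_backend_tensor_set(t, data, 0, ggml_nbytes(t)); // upload the copy
        }
    }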
@@ -5765,7 +5903,7 @@ static struct ggml_cgraph * llama_build_graph(
  static const std::unordered_map<llm_offload_func_e, std::string, std::hash<int>> k_offload_func_name = {
  { OFFLOAD_FUNC_NOP, "CPU" },
  { OFFLOAD_FUNC_OUT, "CPU" },
- #
+ #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
  { OFFLOAD_FUNC, "GPU (CUDA)" },
  { OFFLOAD_FUNC_FRC, "GPU (CUDA) FRC" },
  { OFFLOAD_FUNC_KQV, "GPU (CUDA) KQV" },
@@ -5838,7 +5976,7 @@ static struct ggml_cgraph * llama_build_graph(
  offload_func_t func = ggml_offload_nop;

  // this is needed for compatibility with Metal for example
- #
+ #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
  static offload_func_t ggml_offload_gpu = ggml_cuda_assign_buffers_no_alloc;
  #else
  static offload_func_t ggml_offload_gpu = ggml_offload_nop;
@@ -5912,6 +6050,10 @@ static struct ggml_cgraph * llama_build_graph(
  {
  result = llm.build_qwen();
  } break;
+ case LLM_ARCH_PHI2:
+ {
+ result = llm.build_phi2();
+ } break;
  default:
  GGML_ASSERT(false);
  }
@@ -6045,18 +6187,23 @@ static int llama_decode_internal(

  ggml_allocr_alloc_graph(lctx.alloc, gf);

-
- struct ggml_tensor *
-
- GGML_ASSERT(strcmp(res->name, "result_output") == 0);
- GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
+ // the output is always the last tensor in the graph
+ struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
+ GGML_ASSERT(strcmp(res->name, "result_output") == 0);

+ // the embeddings could be the second to last tensor, or the third to last tensor
+ struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
+ if (strcmp(embeddings->name, "result_norm") != 0) {
+ embeddings = gf->nodes[gf->n_nodes - 3];
+ GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
+ }

- #
+ #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
+ char * buf_alloc_base = (char *)ggml_backend_buffer_get_base(lctx.buf_alloc);
  for (int i = 0; i < gf->n_leafs; i++) {
  ggml_tensor * node = gf->leafs[i];
  if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) {
- ggml_cuda_assign_scratch_offset(node, (char*)node->data -
+ ggml_cuda_assign_scratch_offset(node, (char *)node->data - buf_alloc_base);
  ggml_cuda_copy_to_device(node);
  }
  }
@@ -6064,7 +6211,7 @@ static int llama_decode_internal(
  for (int i = 0; i < gf->n_nodes; i++) {
  ggml_tensor * node = gf->nodes[i];
  if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) {
- ggml_cuda_assign_scratch_offset(node, (char*)node->data -
+ ggml_cuda_assign_scratch_offset(node, (char *)node->data - buf_alloc_base);
  }
  }

@@ -6091,23 +6238,23 @@ static int llama_decode_internal(
  n_threads = 1;
  }

- #
+ #ifdef GGML_USE_MPI
  const int64_t n_layer = hparams.n_layer;
  ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
  #endif

  #ifdef GGML_USE_METAL
- if (lctx.
-
- ggml_metal_graph_compute(lctx.ctx_metal, gf);
- } else {
- ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
+ if (ggml_backend_is_metal(lctx.backend)) {
+ ggml_backend_metal_set_n_cb(lctx.backend, n_threads);
  }
- #else
- ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
  #endif

-
+ if (ggml_backend_is_cpu(lctx.backend)) {
+ ggml_backend_cpu_set_n_threads(lctx.backend, n_threads);
+ }
+ ggml_backend_graph_compute(lctx.backend, gf);
+
+ #ifdef GGML_USE_MPI
  ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
  #endif

@@ -6145,20 +6292,37 @@ static int llama_decode_internal(
  {
  auto & logits_out = lctx.logits;

+ #ifndef NDEBUG
+ auto & logits_valid = lctx.logits_valid;
+ logits_valid.clear();
+ logits_valid.resize(n_tokens);
+
+ logits_out.clear();
+ #endif
+
  if (batch.logits) {
  logits_out.resize(n_vocab * n_tokens);
  for (uint32_t i = 0; i < n_tokens; i++) {
  if (batch.logits[i] == 0) {
  continue;
  }
-
+ ggml_backend_tensor_get(res, logits_out.data() + (n_vocab*i), (n_vocab*i)*sizeof(float), n_vocab*sizeof(float));
+ #ifndef NDEBUG
+ logits_valid[i] = true;
+ #endif
  }
  } else if (lctx.logits_all) {
  logits_out.resize(n_vocab * n_tokens);
-
+ ggml_backend_tensor_get(res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float));
+ #ifndef NDEBUG
+ std::fill(logits_valid.begin(), logits_valid.end(), true);
+ #endif
  } else {
  logits_out.resize(n_vocab);
-
+ ggml_backend_tensor_get(res, logits_out.data(), (n_vocab*(n_tokens - 1))*sizeof(float), n_vocab*sizeof(float));
+ #ifndef NDEBUG
+ logits_valid[0] = true;
+ #endif
  }
  }

@@ -6167,7 +6331,7 @@ static int llama_decode_internal(
  auto & embedding_out = lctx.embedding;

  embedding_out.resize(n_embd);
-
+ ggml_backend_tensor_get(embeddings, embedding_out.data(), (n_embd*(n_tokens - 1))*sizeof(float), n_embd*sizeof(float));
  }

  // measure the performance only for the single-token evals
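Logits and embeddings are now copied out of the graph with ggml_backend_tensor_get rather than direct host memcpys, since the result tensors may live in a device buffer. A small helper in the same spirit (illustrative only, not from the patch; the function name and includes are assumptions):

    #include <vector>
    #include "ggml.h"
    #include "ggml-backend.h"

    // Copy the logits of the last token out of the "result_output" tensor.
    // Row i of res holds the logits of token i; offset and size are in bytes.
    static std::vector<float> read_last_logits(struct ggml_tensor * res, int64_t n_vocab, int64_t n_tokens) {
        std::vector<float> out((size_t) n_vocab);
        ggml_backend_tensor_get(res, out.data(),
                (size_t) (n_vocab*(n_tokens - 1))*sizeof(float),
                (size_t) n_vocab*sizeof(float));
        return out;
    }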
@@ -8125,12 +8289,6 @@ void llama_beam_search(llama_context * ctx,
  // quantization
  //

- template <typename T>
- struct no_init {
- T value;
- no_init() { /* do nothing */ }
- };
-
  struct quantize_state_internal {
  const llama_model & model;
  const llama_model_quantize_params * params;
@@ -8373,9 +8531,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  #endif

  llama_model_loader ml(fname_inp, use_mmap, NULL);
-
- ml.mapping.reset(new llama_mmap(&ml.file, /* prefetch */ 0, ggml_is_numa()));
- }
+ ml.init_mapping(false); // no prefetching?

  llama_model model;
  llm_load_arch(ml, model);
@@ -8621,74 +8777,63 @@ static int llama_apply_lora_from_file_internal(

  const int64_t t_start_lora_us = ggml_time_us();

-
- if (!fin) {
- LLAMA_LOG_ERROR("%s: failed to open '%s'\n", __func__, path_lora);
- return 1;
- }
+ llama_file fin(path_lora, "rb");

  // verify magic and version
  {
- uint32_t magic;
-
-
-
+ uint32_t magic = fin.read_u32();
+ if (magic != LLAMA_FILE_MAGIC_GGLA) {
+ LLAMA_LOG_ERROR("%s: bad file magic\n", __func__);
+ return 1;
+ }

+ uint32_t format_version = fin.read_u32();
  if (format_version != 1) {
  LLAMA_LOG_ERROR("%s: unsupported file version\n", __func__ );
  return 1;
  }
  }

- int32_t lora_r;
- int32_t lora_alpha;
- fin.read((char *) &lora_r, sizeof(lora_r));
- fin.read((char *) &lora_alpha, sizeof(lora_alpha));
+ int32_t lora_r = fin.read_u32();
+ int32_t lora_alpha = fin.read_u32();
  float scaling = scale * (float)lora_alpha / (float)lora_r;

  LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);

+ // create a name -> tensor map of the model to accelerate lookups
+ // find the max tensor size to estimate the required temporary buffer size
+ size_t max_tensor_size = 0;
+ std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
+ for (const auto & kv : model.tensors_by_name) {
+ model_tensors.insert(kv);
+ size_t f32_size = ggml_nelements(kv.second) * sizeof(float);
+ max_tensor_size = std::max(max_tensor_size, f32_size);
+ }
+
  // create a temporary ggml context to store the lora tensors
- //
-
+ // TODO: use ggml-alloc
+ size_t lora_ctx_size = max_tensor_size * 3;
+ LLAMA_LOG_INFO("%s: allocating %.f MB for lora temporary buffer\n", __func__, lora_ctx_size / 1024.0 / 1024.0);
+ std::vector<uint8_t> lora_buf(lora_ctx_size);
+
  struct ggml_init_params params;
  params.mem_size = lora_buf.size();
  params.mem_buffer = lora_buf.data();
  params.no_alloc = false;

-
- std::unordered_map<std::string, struct ggml_tensor *> lora_tensors;
+ using unique_context = std::unique_ptr<ggml_context, decltype(&ggml_free)>;

-
-
-
- model_tensors.insert(kv);
- }
+ unique_context lora_ctx(nullptr, ggml_free);
+ lora_ctx.reset(ggml_init(params));
+ std::unordered_map<std::string, struct ggml_tensor *> lora_tensors;

  // load base model
  std::unique_ptr<llama_model_loader> ml;
- ggml_context * base_ctx = NULL;
- std::vector<uint8_t> base_buf;
- if (path_base_model) {
- LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
- ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ NULL));
-
- size_t ctx_size;
- size_t mmapped_size;
- ml->calc_sizes(ctx_size, mmapped_size);
- base_buf.resize(ctx_size);
-
- ggml_init_params base_params;
- base_params.mem_size = base_buf.size();
- base_params.mem_buffer = base_buf.data();
- base_params.no_alloc = ml->use_mmap;

-
-
-
-
- ml->mapping.reset(new llama_mmap(&ml->file, /* prefetch */ 0, ggml_is_numa()));
- }
+ if (path_base_model) {
+ LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
+ ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ nullptr));
+ ml->init_mapping(false); // no prefetching
  }

  // read tensors and apply
@@ -8698,27 +8843,35 @@ static int llama_apply_lora_from_file_internal(
  std::vector<uint8_t> work_buffer;

  while (true) {
+ if (fin.tell() == fin.size) {
+ // eof
+ break;
+ }
+
  int32_t n_dims;
- int32_t
+ int32_t name_len;
  int32_t ftype;

- fin.
- fin.
- fin.
-
-
+ fin.read_raw(&n_dims, sizeof(n_dims));
+ fin.read_raw(&name_len, sizeof(name_len));
+ fin.read_raw(&ftype, sizeof(ftype));
+
+ if (n_dims != 1 && n_dims != 2) {
+ LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims);
+ return 1;
  }

  int32_t ne[2] = { 1, 1 };
  for (int i = 0; i < n_dims; ++i) {
- fin.
+ fin.read_raw(&ne[i], sizeof(ne[i]));
  }

  std::string name;
  {
+ GGML_ASSERT(name_len <= 1024);
  char buf[1024];
- fin.
- name = std::string(buf,
+ fin.read_raw(buf, name_len);
+ name = std::string(buf, name_len);
  }

  // check for lora suffix and get the type of tensor
@@ -8732,7 +8885,7 @@ static int llama_apply_lora_from_file_internal(
  std::string lora_type = name.substr(pos + lora_suffix.length());
  std::string base_name = name;
  base_name.erase(pos);
- // LLAMA_LOG_INFO("%s: %s => %s (lora type %s) \n", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());
+ // LLAMA_LOG_INFO("%s: %s => %s (lora type %s) \n", __func__, name.c_str(), base_name.c_str(), lora_type.c_str());

  if (model_tensors.find(base_name) == model_tensors.end()) {
  LLAMA_LOG_ERROR("%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
@@ -8751,22 +8904,15 @@ static int llama_apply_lora_from_file_internal(
  return false;
  }
  }
- ggml_tensor * lora_tensor;
-
- lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]);
- }
- else {
- LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims);
- return 1;
- }
- ggml_set_name(lora_tensor, "lora_tensor");
+ ggml_tensor * lora_tensor = ggml_new_tensor_2d(lora_ctx.get(), wtype, ne[0], ne[1]);
+ ggml_set_name(lora_tensor, name.c_str());

  // load tensor data
- size_t offset = fin.
+ size_t offset = fin.tell();
  size_t tensor_data_size = ggml_nbytes(lora_tensor);
  offset = (offset + 31) & -32;
- fin.
- fin.
+ fin.seek(offset, SEEK_SET);
+ fin.read_raw(lora_tensor->data, tensor_data_size);

  lora_tensors[name] = lora_tensor;

@@ -8779,7 +8925,7 @@ static int llama_apply_lora_from_file_internal(
  offload_func_t offload_func = ggml_offload_nop;
  offload_func_t offload_func_force_inplace = ggml_offload_nop;

- #
+ #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
  if (dest_t->backend == GGML_BACKEND_GPU || dest_t->backend == GGML_BACKEND_GPU_SPLIT) {
  if (dest_t->type != GGML_TYPE_F16) {
  throw std::runtime_error(format(
@@ -8796,13 +8942,11 @@ static int llama_apply_lora_from_file_internal(

  // load from base model
  if (gguf_find_tensor(ctx_gguf, base_name.c_str()) < 0) {
- // TODO: throw
  LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
  return 1;
  }

-
- base_t = ml->create_tensor(base_ctx, base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU);
+ base_t = ml->get_tensor_meta(base_name.c_str());
  ml->load_data_for(base_t);
  } else {
  base_t = dest_t;
@@ -8831,43 +8975,42 @@ static int llama_apply_lora_from_file_internal(
  }

  // w = w + BA*s
- ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
+ ggml_tensor * BA = ggml_mul_mat(lora_ctx.get(), loraA, loraB);
  offload_func(BA);
  ggml_set_name(BA, "BA");

  if (scaling != 1.0f) {
-
- ggml_set_name(scale_tensor, "scale_tensor");
-
- BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor);
+ BA = ggml_scale_inplace(lora_ctx.get(), BA, scaling);
  offload_func(BA);
  ggml_set_name(BA, "BA_scaled");
  }

  ggml_tensor * r;
  if (base_t == dest_t) {
- r = ggml_add_inplace(lora_ctx, dest_t, BA);
+ r = ggml_add_inplace(lora_ctx.get(), dest_t, BA);
  offload_func_force_inplace(r);
  ggml_set_name(r, "r_add_inplace");
  }
  else {
- r = ggml_add(lora_ctx, base_t, BA);
+ r = ggml_add(lora_ctx.get(), base_t, BA);
  offload_func(r);
  ggml_set_name(r, "r_add");

- r = ggml_cpy(lora_ctx, r, dest_t);
+ r = ggml_cpy(lora_ctx.get(), r, dest_t);
  offload_func(r);
  ggml_set_name(r, "r_cpy");
  }

- struct ggml_cgraph * gf = ggml_new_graph(lora_ctx);
+ struct ggml_cgraph * gf = ggml_new_graph(lora_ctx.get());
  ggml_build_forward_expand(gf, r);

  ggml_graph_compute_helper(work_buffer, gf, n_threads);

+ // the tensors in the adapter must be sorted such that loraA and loraB of the same tensor are next to each other
+ GGML_ASSERT(lora_tensors.size() == 2);
+
  // we won't need these tensors again, reset the context to save memory
-
- lora_ctx = ggml_init(params);
+ lora_ctx.reset(ggml_init(params));
  lora_tensors.clear();

  n_tensors++;
@@ -8877,12 +9020,6 @@ static int llama_apply_lora_from_file_internal(
  }
  }

- // TODO: this should be in a destructor, it will leak on failure
- ggml_free(lora_ctx);
- if (base_ctx) {
- ggml_free(base_ctx);
- }
-
  const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
  LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0);

@@ -9012,11 +9149,18 @@ struct llama_model * llama_load_model_from_file(
  LLAMA_LOG_INFO("\n");
  }
  }
+ return true;
  };
  }

-
-
+ int status = llama_model_load(path_model, *model, params);
+ GGML_ASSERT(status <= 0);
+ if (status < 0) {
+ if (status == -1) {
+ LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
+ } else if (status == -2) {
+ LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
+ }
  delete model;
  return nullptr;
  }
@@ -9091,7 +9235,39 @@ struct llama_context * llama_new_context_with_model(

  // reserve memory for context buffers
  if (!hparams.vocab_only) {
-
+ // initialize backend
+ #ifdef GGML_USE_METAL
+ if (model->n_gpu_layers > 0) {
+ ctx->backend = ggml_backend_metal_init();
+ if (ctx->backend == nullptr) {
+ LLAMA_LOG_ERROR("%s: failed to initialize Metal backend\n", __func__);
+ }
+ }
+ #elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST)
+ // for testing only
+ if (model->n_gpu_layers > 0) {
+ ctx->backend = ggml_backend_cuda_init(0);
+ if (ctx->backend == nullptr) {
+ LLAMA_LOG_ERROR("%s: failed to initialize CUDA backend\n", __func__);
+ }
+ }
+ #endif
+
+ if (ctx->backend == nullptr && ggml_backend_buffer_is_host(model->buf)) {
+ ctx->backend = ggml_backend_cpu_init();
+ if (ctx->backend == nullptr) {
+ LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
+ }
+ }
+
+ if (ctx->backend == nullptr) {
+ LLAMA_LOG_ERROR("%s: failed to initialize a backend\n", __func__);
+ delete ctx;
+ return nullptr;
+ }
+
+ if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, type_k, type_v,
+ cparams.n_ctx, model->n_gpu_layers, cparams.offload_kqv)) {
  LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
  llama_free(ctx);
  return nullptr;
@@ -9127,12 +9303,11 @@ struct llama_context * llama_new_context_with_model(
  }

  {
- static const size_t tensor_alignment = 32;
  // the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data
- ctx->
+ ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead());

  // create measure allocator
- ctx->alloc =
+ ctx->alloc = ggml_allocr_new_measure_from_backend(ctx->backend);

  // build worst-case graph
  int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch);
@@ -9140,98 +9315,50 @@ struct llama_context * llama_new_context_with_model(
|
|
9140
9315
|
llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
|
9141
9316
|
ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0));
|
9142
9317
|
|
9143
|
-
#ifdef GGML_USE_METAL
|
9144
|
-
if (model->n_gpu_layers > 0) {
|
9145
|
-
ctx->ctx_metal = ggml_metal_init(1);
|
9146
|
-
if (!ctx->ctx_metal) {
|
9147
|
-
LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
|
9148
|
-
llama_free(ctx);
|
9149
|
-
return NULL;
|
9150
|
-
}
|
9151
|
-
//ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false);
|
9152
|
-
//ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
|
9153
|
-
}
|
9154
|
-
#endif
|
9155
9318
|
// measure memory requirements for the graph
|
9156
|
-
size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf)
|
9319
|
+
size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf);
|
9157
9320
|
|
9158
|
-
LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MiB\n", __func__, (ctx->
|
9321
|
+
LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MiB\n", __func__, (ctx->buf_compute_meta.size() + alloc_size) / 1024.0 / 1024.0);
|
9159
9322
|
|
9160
|
-
//
|
9323
|
+
// create allocator again with exact memory requirements
|
9161
9324
|
ggml_allocr_free(ctx->alloc);
|
9162
9325
|
|
9163
|
-
ctx->buf_alloc
|
9164
|
-
ctx->alloc =
|
9165
|
-
#
|
9166
|
-
if (
|
9167
|
-
|
9168
|
-
|
9169
|
-
#endif
|
9170
|
-
#ifdef GGML_USE_CUBLAS
|
9171
|
-
ggml_cuda_set_scratch_size(alloc_size);
|
9172
|
-
LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0);
|
9326
|
+
ctx->buf_alloc = ggml_backend_alloc_buffer(ctx->backend, alloc_size);
|
9327
|
+
ctx->alloc = ggml_allocr_new_from_buffer(ctx->buf_alloc);
|
9328
|
+
#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
|
9329
|
+
if (model->n_gpu_layers > 0) {
|
9330
|
+
ggml_cuda_set_scratch_size(alloc_size);
|
9331
|
+
LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0);
|
9173
9332
|
|
9174
|
-
|
9175
|
-
|
9176
|
-
|
9177
|
-
|
9333
|
+
// calculate total VRAM usage
|
9334
|
+
auto add_tensor = [](const ggml_tensor * t, size_t & size) {
|
9335
|
+
if (t->backend == GGML_BACKEND_GPU || t->backend == GGML_BACKEND_GPU_SPLIT) {
|
9336
|
+
size += ggml_nbytes(t);
|
9337
|
+
}
|
9338
|
+
};
|
9339
|
+
size_t model_vram_size = 0;
|
9340
|
+
for (const auto & kv : model->tensors_by_name) {
|
9341
|
+
add_tensor(kv.second, model_vram_size);
|
9178
9342
|
}
|
9179
|
-
};
|
9180
|
-
size_t model_vram_size = 0;
|
9181
|
-
for (const auto & kv : model->tensors_by_name) {
|
9182
|
-
add_tensor(kv.second, model_vram_size);
|
9183
|
-
}
|
9184
|
-
|
9185
|
-
size_t kv_vram_size = 0;
|
9186
|
-
for (auto & k : ctx->kv_self.k_l) {
|
9187
|
-
add_tensor(k, kv_vram_size);
|
9188
|
-
}
|
9189
|
-
for (auto & v : ctx->kv_self.v_l) {
|
9190
|
-
add_tensor(v, kv_vram_size);
|
9191
|
-
}
|
9192
|
-
|
9193
|
-
size_t ctx_vram_size = alloc_size + kv_vram_size;
|
9194
|
-
size_t total_vram_size = model_vram_size + ctx_vram_size;
|
9195
9343
|
|
9196
|
-
|
9197
|
-
|
9198
|
-
|
9199
|
-
|
9200
|
-
|
9201
|
-
|
9202
|
-
|
9203
|
-
#ifdef GGML_USE_METAL
|
9204
|
-
if (model->n_gpu_layers > 0) {
|
9205
|
-
// this allocates all Metal resources and memory buffers
|
9206
|
-
|
9207
|
-
void * data_ptr = NULL;
|
9208
|
-
size_t data_size = 0;
|
9209
|
-
|
9210
|
-
if (ctx->model.mapping) {
|
9211
|
-
data_ptr = ctx->model.mapping->addr;
|
9212
|
-
data_size = ctx->model.mapping->size;
|
9213
|
-
} else {
|
9214
|
-
data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
|
9215
|
-
data_size = ggml_get_mem_size (ctx->model.ctx);
|
9216
|
-
}
|
9217
|
-
|
9218
|
-
const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
|
9344
|
+
size_t kv_vram_size = 0;
|
9345
|
+
for (auto & k : ctx->kv_self.k_l) {
|
9346
|
+
add_tensor(k, kv_vram_size);
|
9347
|
+
}
|
9348
|
+
for (auto & v : ctx->kv_self.v_l) {
|
9349
|
+
add_tensor(v, kv_vram_size);
|
9350
|
+
}
|
9219
9351
|
|
9220
|
-
|
9352
|
+
size_t ctx_vram_size = alloc_size + kv_vram_size;
|
9353
|
+
size_t total_vram_size = model_vram_size + ctx_vram_size;
|
9221
9354
|
|
9222
|
-
|
9223
|
-
|
9224
|
-
|
9225
|
-
|
9226
|
-
return NULL; \
|
9355
|
+
LLAMA_LOG_INFO("%s: total VRAM used: %.2f MiB (model: %.2f MiB, context: %.2f MiB)\n", __func__,
|
9356
|
+
total_vram_size / 1024.0 / 1024.0,
|
9357
|
+
model_vram_size / 1024.0 / 1024.0,
|
9358
|
+
ctx_vram_size / 1024.0 / 1024.0);
|
9227
9359
|
}
|
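Note: when layers are offloaded (model->n_gpu_layers > 0), the added code above replaces the old per-backend VRAM logging with a small lambda that sums ggml_nbytes() over every tensor resident on the GPU, once for the model weights and once for the KV cache. Because the diff interleaves it with removed lines, here is the same accounting as a consolidated sketch; `tensors_by_name`, `k_l`, `v_l` and `alloc_size` are the names used above, the rest is illustrative rather than a verbatim copy:

    // Count only tensors that actually live in VRAM.
    auto add_tensor = [](const ggml_tensor * t, size_t & size) {
        if (t->backend == GGML_BACKEND_GPU || t->backend == GGML_BACKEND_GPU_SPLIT) {
            size += ggml_nbytes(t);
        }
    };

    size_t model_vram_size = 0;
    for (const auto & kv : model->tensors_by_name) {   // (name, tensor) pairs
        add_tensor(kv.second, model_vram_size);
    }

    size_t kv_vram_size = 0;
    for (auto & k : ctx->kv_self.k_l) { add_tensor(k, kv_vram_size); }
    for (auto & v : ctx->kv_self.v_l) { add_tensor(v, kv_vram_size); }

    const size_t ctx_vram_size   = alloc_size + kv_vram_size;        // scratch + KV cache
    const size_t total_vram_size = model_vram_size + ctx_vram_size;  // logged below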
9228
|
-
|
9229
|
-
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
|
9230
|
-
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0));
|
9231
|
-
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.data, ctx->buf_alloc.size, 0));
|
9232
|
-
#undef LLAMA_METAL_CHECK_BUF
|
9233
|
-
}
|
9234
9360
|
#endif
|
9361
|
+
}
|
9235
9362
|
}
|
9236
9363
|
|
9237
9364
|
#ifdef GGML_USE_MPI
|
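Net effect of this hunk: the compute allocator is first run in measuring mode to learn the graph's exact buffer size, then recreated on top of a backend buffer of exactly that size, replacing the backend-specific (Metal/CUDA) buffer setup that is removed above. A minimal sketch of that measure-then-reallocate pattern, assuming an initialized `ggml_backend_t backend`, a measuring allocator and a built graph `gf`; the helper name is illustrative and not part of llama.cpp:

    #include "ggml.h"
    #include "ggml-alloc.h"
    #include "ggml-backend.h"

    // Pass 1 measures the graph, pass 2 creates the real allocator over an
    // exact-size backend buffer (same calls as in the hunk above).
    static struct ggml_allocr * make_exact_allocator(ggml_backend_t backend,
                                                     struct ggml_allocr * measure_alloc,
                                                     struct ggml_cgraph * gf,
                                                     ggml_backend_buffer_t & buf_out) {
        const size_t alloc_size = ggml_allocr_alloc_graph(measure_alloc, gf); // measure
        ggml_allocr_free(measure_alloc);                                      // drop measuring allocator
        buf_out = ggml_backend_alloc_buffer(backend, alloc_size);             // exact-size buffer
        return ggml_allocr_new_from_buffer(buf_out);                          // real allocator
    }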
@@ -9259,10 +9386,14 @@ const llama_model * llama_get_model(const struct llama_context * ctx) {
|
|
9259
9386
|
return &ctx->model;
|
9260
9387
|
}
|
9261
9388
|
|
9262
|
-
|
9389
|
+
uint32_t llama_n_ctx(const struct llama_context * ctx) {
|
9263
9390
|
return ctx->cparams.n_ctx;
|
9264
9391
|
}
|
9265
9392
|
|
9393
|
+
uint32_t llama_n_batch(const struct llama_context * ctx) {
|
9394
|
+
return ctx->cparams.n_batch;
|
9395
|
+
}
|
9396
|
+
|
9266
9397
|
enum llama_vocab_type llama_vocab_type(const struct llama_model * model) {
|
9267
9398
|
return model->vocab.type;
|
9268
9399
|
}
|
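llama_n_ctx() is redeclared here returning uint32_t and gains a companion llama_n_batch() accessor, both reading from the context's cparams. A hedged usage sketch, assuming a valid `llama_context * ctx` obtained elsewhere:

    // Query the effective context and batch sizes of an existing context.
    const uint32_t n_ctx   = llama_n_ctx(ctx);
    const uint32_t n_batch = llama_n_batch(ctx);   // new accessor in this version
    fprintf(stderr, "n_ctx = %u, n_batch = %u\n", n_ctx, n_batch);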
@@ -9519,7 +9650,7 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
|
|
9519
9650
|
const size_t s_embedding = ctx->embedding.size() * sizeof(float);
|
9520
9651
|
const size_t s_kv_size = sizeof(size_t);
|
9521
9652
|
const size_t s_kv_ntok = sizeof(int);
|
9522
|
-
const size_t s_kv = ctx->kv_self.buf.size;
|
9653
|
+
const size_t s_kv = ggml_backend_buffer_get_size(ctx->kv_self.buf);
|
9523
9654
|
|
9524
9655
|
const size_t s_total = (
|
9525
9656
|
+ s_rng_size
|
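With the KV cache now held in a ggml-backend buffer, its contribution to the state size is obtained via ggml_backend_buffer_get_size() instead of a field on the old buffer struct. A hedged usage sketch of the function being patched (assumes a valid `ctx` and `#include <vector>`); llama_copy_state_data() is the companion call that fills the buffer sized here:

    // llama_get_state_size() returns an upper bound; the copy reports the actual size.
    std::vector<uint8_t> state(llama_get_state_size(ctx));
    const size_t n_written = llama_copy_state_data(ctx, state.data());
    state.resize(n_written);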
@@ -9647,7 +9778,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
|
|
9647
9778
|
const auto n_embd = hparams.n_embd_gqa();
|
9648
9779
|
const auto n_ctx = cparams.n_ctx;
|
9649
9780
|
|
9650
|
-
const size_t kv_buf_size = kv_self.buf.size;
|
9781
|
+
const size_t kv_buf_size = ggml_backend_buffer_get_size(kv_self.buf);
|
9651
9782
|
const uint32_t kv_head = kv_self.head;
|
9652
9783
|
const uint32_t kv_size = kv_self.size;
|
9653
9784
|
const uint32_t kv_used = kv_self.used;
|
@@ -9663,17 +9794,12 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
|
|
9663
9794
|
ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
|
9664
9795
|
ggml_cgraph * gf = ggml_new_graph(cpy_ctx);
|
9665
9796
|
|
9666
|
-
std::vector<std::vector<uint8_t>> kout2d_data(n_layer);
|
9667
|
-
std::vector<std::vector<uint8_t>> vout2d_data(n_layer);
|
9797
|
+
std::vector<struct ggml_tensor *> kout2d(n_layer);
|
9798
|
+
std::vector<struct ggml_tensor *> vout2d(n_layer);
|
9668
9799
|
|
9669
9800
|
for (int il = 0; il < (int) n_layer; ++il) {
|
9670
|
-
|
9671
|
-
|
9672
|
-
kout2d->data = kout2d_data[il].data();
|
9673
|
-
|
9674
|
-
ggml_tensor * vout2d = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
|
9675
|
-
vout2d_data[il].resize(ggml_nbytes(vout2d));
|
9676
|
-
vout2d->data = vout2d_data[il].data();
|
9801
|
+
kout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
|
9802
|
+
vout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
|
9677
9803
|
|
9678
9804
|
ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
|
9679
9805
|
n_embd, kv_head,
|
@@ -9683,20 +9809,28 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
|
|
9683
9809
|
kv_head, n_embd,
|
9684
9810
|
elt_size*n_ctx, 0);
|
9685
9811
|
|
9686
|
-
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k2d, kout2d));
|
9687
|
-
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v2d, vout2d));
|
9812
|
+
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k2d, kout2d[il]));
|
9813
|
+
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v2d, vout2d[il]));
|
9688
9814
|
}
|
9689
9815
|
|
9690
|
-
|
9816
|
+
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(cpy_ctx, ctx->backend);
|
9691
9817
|
|
9692
|
-
|
9818
|
+
ggml_backend_graph_compute(ctx->backend, gf);
|
9819
|
+
|
9820
|
+
std::vector<uint8_t> tmp_buf;
|
9821
|
+
for (int il = 0; il < (int) n_layer; ++il) {
|
9822
|
+
tmp_buf.resize(ggml_nbytes(kout2d[il]));
|
9823
|
+
ggml_backend_tensor_get(kout2d[il], tmp_buf.data(), 0, tmp_buf.size());
|
9824
|
+
data_ctx->write(tmp_buf.data(), tmp_buf.size());
|
9693
9825
|
|
9694
|
-
|
9695
|
-
|
9696
|
-
|
9697
|
-
data_ctx->write(kout2d_data[il].data(), kout2d_data[il].size());
|
9698
|
-
data_ctx->write(vout2d_data[il].data(), vout2d_data[il].size());
|
9826
|
+
tmp_buf.resize(ggml_nbytes(vout2d[il]));
|
9827
|
+
ggml_backend_tensor_get(vout2d[il], tmp_buf.data(), 0, tmp_buf.size());
|
9828
|
+
data_ctx->write(tmp_buf.data(), tmp_buf.size());
|
9699
9829
|
}
|
9830
|
+
|
9831
|
+
ggml_free(cpy_ctx);
|
9832
|
+
|
9833
|
+
ggml_backend_buffer_free(buf);
|
9700
9834
|
}
|
9701
9835
|
|
9702
9836
|
for (uint32_t i = 0; i < kv_size; ++i) {
|
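The KV-cache serialization is reworked above: per-layer kout2d/vout2d staging tensors are created in a no_alloc context, backed all at once by ggml_backend_alloc_ctx_tensors(), filled by running the copy graph on the context's backend, and then read back into host memory with ggml_backend_tensor_get() before being handed to data_ctx->write(). A minimal sketch of that read-back step in isolation; the helper name is illustrative:

    #include <vector>
    #include <cstdint>
    #include "ggml.h"
    #include "ggml-backend.h"

    // Copy a computed backend tensor into host memory; the loop above does the
    // same with a reused tmp_buf before writing the bytes out.
    static std::vector<uint8_t> read_backend_tensor(const struct ggml_tensor * t) {
        std::vector<uint8_t> host(ggml_nbytes(t));
        ggml_backend_tensor_get(t, host.data(), 0, host.size());   // device -> host
        return host;
    }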
@@ -9794,21 +9928,19 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
|
|
9794
9928
|
memcpy(&kv_used, inp, sizeof(kv_used)); inp += sizeof(kv_used);
|
9795
9929
|
|
9796
9930
|
if (kv_buf_size) {
|
9797
|
-
GGML_ASSERT(kv_self.buf.size == kv_buf_size);
|
9931
|
+
GGML_ASSERT(ggml_backend_buffer_get_size(kv_self.buf) == kv_buf_size);
|
9798
9932
|
|
9799
9933
|
const size_t elt_size = ggml_element_size(kv_self.k_l[0]);
|
9800
9934
|
|
9801
9935
|
ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
|
9802
9936
|
ggml_cgraph * gf = ggml_new_graph(cpy_ctx);
|
9803
9937
|
|
9804
|
-
|
9805
|
-
|
9806
|
-
kin2d->data = (void *) inp;
|
9807
|
-
inp += ggml_nbytes(kin2d);
|
9938
|
+
std::vector<struct ggml_tensor *> kin2d(n_layer);
|
9939
|
+
std::vector<struct ggml_tensor *> vin2d(n_layer);
|
9808
9940
|
|
9809
|
-
|
9810
|
-
|
9811
|
-
|
9941
|
+
for (int il = 0; il < n_layer; ++il) {
|
9942
|
+
kin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
|
9943
|
+
vin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
|
9812
9944
|
|
9813
9945
|
ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
|
9814
9946
|
n_embd, kv_head,
|
@@ -9818,13 +9950,26 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
|
|
9818
9950
|
kv_head, n_embd,
|
9819
9951
|
elt_size*n_ctx, 0);
|
9820
9952
|
|
9821
|
-
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin2d, k2d));
|
9822
|
-
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin2d, v2d));
|
9953
|
+
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin2d[il], k2d));
|
9954
|
+
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin2d[il], v2d));
|
9955
|
+
}
|
9956
|
+
|
9957
|
+
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(cpy_ctx, ctx->backend);
|
9958
|
+
|
9959
|
+
// load data into the tensors
|
9960
|
+
for (int il = 0; il < n_layer; ++il) {
|
9961
|
+
ggml_backend_tensor_set(kin2d[il], inp, 0, ggml_nbytes(kin2d[il]));
|
9962
|
+
inp += ggml_nbytes(kin2d[il]);
|
9963
|
+
|
9964
|
+
ggml_backend_tensor_set(vin2d[il], inp, 0, ggml_nbytes(vin2d[il]));
|
9965
|
+
inp += ggml_nbytes(vin2d[il]);
|
9823
9966
|
}
|
9824
9967
|
|
9825
|
-
|
9968
|
+
ggml_backend_graph_compute(ctx->backend, gf);
|
9826
9969
|
|
9827
9970
|
ggml_free(cpy_ctx);
|
9971
|
+
|
9972
|
+
ggml_backend_buffer_free(buf);
|
9828
9973
|
}
|
9829
9974
|
|
9830
9975
|
ctx->kv_self.head = kv_head;
|
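Restoring the state follows the mirror-image path: kin2d/vin2d staging tensors are allocated on the backend, loaded from the state blob with ggml_backend_tensor_set(), and the copy graph writes them into the KV-cache views before the temporary context and buffer are freed. A sketch of the upload step; the helper name is illustrative:

    // Upload raw state bytes into a backend tensor and advance the input cursor,
    // mirroring the ggml_backend_tensor_set() calls in the loop above.
    static const uint8_t * write_backend_tensor(struct ggml_tensor * t, const uint8_t * inp) {
        const size_t n = ggml_nbytes(t);
        ggml_backend_tensor_set(t, inp, 0, n);   // host -> device
        return inp + n;
    }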
@@ -10047,6 +10192,7 @@ float * llama_get_logits(struct llama_context * ctx) {
|
|
10047
10192
|
}
|
10048
10193
|
|
10049
10194
|
float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
|
10195
|
+
assert(ctx->logits_valid.at(i));
|
10050
10196
|
return ctx->logits.data() + i*ctx->model.hparams.n_vocab;
|
10051
10197
|
}
|
10052
10198
|
|
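The new assert in llama_get_logits_ith() means the index must correspond to a batch position for which logits were actually requested during llama_decode(). A hedged usage sketch, assuming `ctx`, a token buffer `tokens` and a position offset `n_past` defined elsewhere; llama_batch_get_one() leaves batch.logits unset, in which case only the last position's logits are typically valid:

    // Decode a batch and read logits only for the last position.
    llama_batch batch = llama_batch_get_one(tokens.data(), (int32_t) tokens.size(), n_past, 0);
    if (llama_decode(ctx, batch) == 0) {
        const float * logits = llama_get_logits_ith(ctx, batch.n_tokens - 1);
        // ... sample the next token from logits (n_vocab floats) ...
    }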