llama_cpp 0.10.0 → 0.10.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/ext/llama_cpp/llama_cpp.cpp +18 -1
- data/ext/llama_cpp/src/ggml-alloc.c +12 -4
- data/ext/llama_cpp/src/ggml-alloc.h +1 -1
- data/ext/llama_cpp/src/ggml-backend-impl.h +12 -8
- data/ext/llama_cpp/src/ggml-backend.c +75 -5
- data/ext/llama_cpp/src/ggml-backend.h +7 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +952 -232
- data/ext/llama_cpp/src/ggml-metal.h +3 -0
- data/ext/llama_cpp/src/ggml-metal.m +725 -98
- data/ext/llama_cpp/src/ggml-metal.metal +1508 -171
- data/ext/llama_cpp/src/ggml-quants.c +2 -2
- data/ext/llama_cpp/src/ggml.c +554 -215
- data/ext/llama_cpp/src/ggml.h +58 -23
- data/ext/llama_cpp/src/llama.cpp +1157 -851
- data/ext/llama_cpp/src/llama.h +9 -4
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- metadata +2 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -1,11 +1,12 @@
 #define LLAMA_API_INTERNAL
+//#define LLAMA_GGML_BACKEND_CUDA_TEST // for testing only - enables ggml-cuda through ggml-backend, disables partial offloading
 #include "llama.h"
 
 #include "unicode.h"
 
 #include "ggml.h"
-
 #include "ggml-alloc.h"
+#include "ggml-backend.h"
 
 #ifdef GGML_USE_CUBLAS
 #  include "ggml-cuda.h"
@@ -32,6 +33,7 @@
 #include <unistd.h>
 #if defined(_POSIX_MAPPED_FILES)
 #include <sys/mman.h>
+#include <fcntl.h>
 #endif
 #if defined(_POSIX_MEMLOCK_RANGE)
 #include <sys/resource.h>
@@ -91,7 +93,8 @@
 #define LLAMA_ATTRIBUTE_FORMAT(...)
 #endif
 
-#define LLAMA_MAX_NODES
+#define LLAMA_MAX_NODES 8192
+#define LLAMA_MAX_EXPERTS 8
 
 //
 // logging
@@ -194,6 +197,7 @@ enum llm_arch {
     LLM_ARCH_BLOOM,
     LLM_ARCH_STABLELM,
     LLM_ARCH_QWEN,
+    LLM_ARCH_PHI2,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -211,6 +215,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
     { LLM_ARCH_BLOOM, "bloom" },
     { LLM_ARCH_STABLELM, "stablelm" },
     { LLM_ARCH_QWEN, "qwen" },
+    { LLM_ARCH_PHI2, "phi2" },
 };
 
 enum llm_kv {
@@ -231,6 +236,8 @@ enum llm_kv {
     LLM_KV_FEED_FORWARD_LENGTH,
     LLM_KV_USE_PARALLEL_RESIDUAL,
     LLM_KV_TENSOR_DATA_LAYOUT,
+    LLM_KV_EXPERT_COUNT,
+    LLM_KV_EXPERT_USED_COUNT,
 
     LLM_KV_ATTENTION_HEAD_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -281,6 +288,8 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
     { LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" },
     { LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" },
     { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
+    { LLM_KV_EXPERT_COUNT, "%s.expert_count" },
+    { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
 
     { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -338,10 +347,14 @@ enum llm_tensor {
     LLM_TENSOR_ATTN_NORM,
     LLM_TENSOR_ATTN_NORM_2,
     LLM_TENSOR_ATTN_ROT_EMBD,
+    LLM_TENSOR_FFN_GATE_INP,
+    LLM_TENSOR_FFN_NORM,
     LLM_TENSOR_FFN_GATE,
     LLM_TENSOR_FFN_DOWN,
     LLM_TENSOR_FFN_UP,
-
+    LLM_TENSOR_FFN_DOWN_EXP,
+    LLM_TENSOR_FFN_GATE_EXP,
+    LLM_TENSOR_FFN_UP_EXP,
     LLM_TENSOR_ATTN_Q_NORM,
     LLM_TENSOR_ATTN_K_NORM,
 };
@@ -360,10 +373,14 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
             { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
             { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
             { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
             { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
             { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
+            { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
+            { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
         },
     },
     {
@@ -537,6 +554,19 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_PHI2,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
 
     {
         LLM_ARCH_UNKNOWN,
@@ -585,6 +615,10 @@ struct LLM_TN {
     std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const {
         return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid) + "." + suffix;
     }
+
+    std::string operator()(llm_tensor tensor, const std::string & suffix, int bid, int xid) const {
+        return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid, xid) + "." + suffix;
+    }
 };
 
 //
@@ -680,38 +714,6 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
 // llama helpers
 //
 
-inline void * llama_host_malloc(size_t n) {
-#ifdef GGML_USE_CUBLAS
-    if (ggml_cublas_loaded()) {
-        return ggml_cuda_host_malloc(n);
-    } else {
-        return malloc(n);
-    }
-#elif GGML_USE_METAL
-    return ggml_metal_host_malloc(n);
-#elif GGML_USE_CPU_HBM
-    return hbw_malloc(n);
-#else
-    return malloc(n);
-#endif
-}
-
-inline void llama_host_free(void * ptr) {
-#ifdef GGML_USE_CUBLAS
-    if (ggml_cublas_loaded()) {
-        return ggml_cuda_host_free(ptr);
-    } else {
-        return free(ptr);
-    }
-#elif GGML_USE_METAL
-    return ggml_metal_host_free(ptr);
-#elif GGML_USE_CPU_HBM
-    return hbw_free(ptr);
-#else
-    return free(ptr);
-#endif
-}
-
 #if defined(_WIN32)
 static std::string llama_format_win_err(DWORD err) {
     LPSTR buf;
@@ -726,40 +728,10 @@ static std::string llama_format_win_err(DWORD err) {
 }
 #endif
 
-
-
-
-
-    // fallback to malloc / free
-    // useful in cases where CUDA can try to allocate PINNED memory
-    bool fallback = false;
-
-    void resize(size_t n) {
-        llama_host_free(data);
-
-        data = llama_host_malloc(n);
-        if (!data) {
-            fallback = true;
-            data = malloc(n);
-        } else {
-            fallback = false;
-        }
-
-        GGML_ASSERT(data);
-        size = n;
-    }
-
-    ~llama_buffer() {
-        if (data) {
-            if (fallback) { // NOLINT
-                free(data);
-            } else {
-                llama_host_free(data);
-            }
-        }
-
-        data = NULL;
-    }
+template <typename T>
+struct no_init {
+    T value;
+    no_init() { /* do nothing */ }
 };
 
 struct llama_file {
@@ -847,6 +819,9 @@ struct llama_mmap {
 #ifdef _POSIX_MAPPED_FILES
     static constexpr bool SUPPORTED = true;
 
+    // list of mapped fragments (first_offset, last_offset)
+    std::vector<std::pair<size_t, size_t>> mapped_fragments;
+
     llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) {
         size = file->size;
         int fd = fileno(file->fp);
@@ -854,17 +829,22 @@ struct llama_mmap {
         // prefetch/readahead impairs performance on NUMA systems
        if (numa) { prefetch = 0; }
 #ifdef __linux__
+        // advise the kernel to read the file sequentially (increases readahead)
+        if (posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL)) {
+            LLAMA_LOG_WARN("warning: posix_fadvise(.., POSIX_FADV_SEQUENTIAL) failed: %s\n",
+                    strerror(errno));
+        }
         if (prefetch) { flags |= MAP_POPULATE; }
 #endif
         addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
-        if (addr == MAP_FAILED) {
+        if (addr == MAP_FAILED) { // NOLINT
             throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
         }
 
         if (prefetch > 0) {
-            //
+            // advise the kernel to preload the mapped memory
             if (posix_madvise(addr, std::min(file->size, prefetch), POSIX_MADV_WILLNEED)) {
-
+                LLAMA_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s\n",
                         strerror(errno));
             }
         }
@@ -872,14 +852,81 @@ struct llama_mmap {
             // advise the kernel not to use readahead
             // (because the next page might not belong on the same node)
             if (posix_madvise(addr, file->size, POSIX_MADV_RANDOM)) {
-
+                LLAMA_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_RANDOM) failed: %s\n",
                         strerror(errno));
             }
         }
+
+        // initialize list of mapped_fragments
+        mapped_fragments.emplace_back(0, file->size);
+    }
+
+    static void align_range(size_t * first, size_t * last, size_t page_size) {
+        // align first to the next page
+        size_t offset_in_page = *first & (page_size - 1);
+        size_t offset_to_page = offset_in_page == 0 ? 0 : page_size - offset_in_page;
+        *first += offset_to_page;
+
+        // align last to the previous page
+        *last = *last & ~(page_size - 1);
+
+        if (*last <= *first) {
+            *last = *first;
+        }
+    }
+
+    // partially unmap the file in the range [first, last)
+    void unmap_fragment(size_t first, size_t last) {
+        // note: this function must not be called multiple times with overlapping ranges
+        // otherwise, there is a risk of invalidating addresses that have been repurposed for other mappings
+        int page_size = sysconf(_SC_PAGESIZE);
+        align_range(&first, &last, page_size);
+        size_t len = last - first;
+
+        if (len == 0) {
+            return;
+        }
+
+        GGML_ASSERT(first % page_size == 0);
+        GGML_ASSERT(last % page_size == 0);
+        GGML_ASSERT(last > first);
+
+        void * next_page_start = (uint8_t *) addr + first;
+
+        // unmap the range
+        if (munmap(next_page_start, len)) {
+            LLAMA_LOG_WARN("warning: munmap failed: %s\n", strerror(errno));
+        }
+
+        // update the list of mapped fragments to avoid unmapping the same range again in the destructor
+        std::vector<std::pair<size_t, size_t>> new_mapped_fragments;
+        for (const auto & frag : mapped_fragments) {
+            if (frag.first < first && frag.second > last) {
+                // the range is in the middle of the fragment, split it
+                new_mapped_fragments.emplace_back(frag.first, first);
+                new_mapped_fragments.emplace_back(last, frag.second);
+            } else if (frag.first < first && frag.second > first) {
+                // the range starts in the middle of the fragment
+                new_mapped_fragments.emplace_back(frag.first, first);
+            } else if (frag.first < last && frag.second > last) {
+                // the range ends in the middle of the fragment
+                new_mapped_fragments.emplace_back(last, frag.second);
+            } else if (frag.first >= first && frag.second <= last) {
+                // the range covers the entire fragment
+            } else {
+                // the range is outside the fragment
+                new_mapped_fragments.push_back(frag);
+            }
+        }
+        mapped_fragments = std::move(new_mapped_fragments);
     }
 
     ~llama_mmap() {
-
+        for (const auto & frag : mapped_fragments) {
+            if (munmap((char *) addr + frag.first, frag.second - frag.first)) {
+                LLAMA_LOG_WARN("warning: munmap failed: %s\n", strerror(errno));
+            }
+        }
     }
 #elif defined(_WIN32)
     static constexpr bool SUPPORTED = true;
@@ -922,6 +969,12 @@ struct llama_mmap {
 #endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8
     }
 
+    void unmap_fragment(size_t first, size_t last) {
+        // not supported
+        GGML_UNUSED(first);
+        GGML_UNUSED(last);
+    }
+
     ~llama_mmap() {
         if (!UnmapViewOfFile(addr)) {
             fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n",
@@ -938,6 +991,13 @@ struct llama_mmap {
 
         throw std::runtime_error(std::string("mmap not supported"));
     }
+
+    void unmap(size_t offset, size_t len) {
+        (void) offset;
+        (void) len;
+
+        throw std::runtime_error(std::string("mmap not supported"));
+    }
 #endif
 };
 
@@ -1111,6 +1171,26 @@ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_
     return std::string(result.data(), result.size());
 }
 
+static ggml_backend_buffer_type_t llama_default_buffer_type(int n_gpu_layers) {
+#ifdef GGML_USE_METAL
+    if (n_gpu_layers > 0) {
+        return ggml_backend_metal_buffer_type();
+    }
+#elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST)
+    if (n_gpu_layers > 0) {
+        return ggml_backend_cuda_buffer_type(0);
+    }
+#elif defined(GGML_USE_CUBLAS)
+    return ggml_backend_cuda_host_buffer_type();
+#elif defined(GGML_USE_CPU_HBM)
+    return ggml_backend_cpu_hbm_buffer_type();
+#endif
+
+    return ggml_backend_cpu_buffer_type();
+
+    GGML_UNUSED(n_gpu_layers);
+}
+
 //
 // globals
 //
@@ -1159,6 +1239,8 @@ struct llama_hparams {
     uint32_t n_layer;
     uint32_t n_rot;
     uint32_t n_ff;
+    uint32_t n_expert = 0;
+    uint32_t n_expert_used = 0;
 
     float f_norm_eps;
     float f_norm_rms_eps;
@@ -1173,15 +1255,18 @@ struct llama_hparams {
     float f_max_alibi_bias;
 
     bool operator!=(const llama_hparams & other) const {
-        if (this->vocab_only
-        if (this->n_vocab
-        if (this->n_ctx_train
-        if (this->n_embd
-        if (this->n_head
-        if (this->n_head_kv
-        if (this->n_layer
-        if (this->n_rot
-        if (this->n_ff
+        if (this->vocab_only != other.vocab_only) return true;
+        if (this->n_vocab != other.n_vocab) return true;
+        if (this->n_ctx_train != other.n_ctx_train) return true;
+        if (this->n_embd != other.n_embd) return true;
+        if (this->n_head != other.n_head) return true;
+        if (this->n_head_kv != other.n_head_kv) return true;
+        if (this->n_layer != other.n_layer) return true;
+        if (this->n_rot != other.n_rot) return true;
+        if (this->n_ff != other.n_ff) return true;
+        if (this->n_expert != other.n_expert) return true;
+        if (this->n_expert_used != other.n_expert_used) return true;
+
         if (this->rope_finetuned != other.rope_finetuned) return true;
         if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true;
 
@@ -1263,6 +1348,12 @@ struct llama_layer {
     struct ggml_tensor * ffn_down; // w2
     struct ggml_tensor * ffn_up; // w3
 
+    // ff MoE
+    struct ggml_tensor * ffn_gate_inp;
+    struct ggml_tensor * ffn_gate_exp[LLAMA_MAX_EXPERTS];
+    struct ggml_tensor * ffn_down_exp[LLAMA_MAX_EXPERTS];
+    struct ggml_tensor * ffn_up_exp [LLAMA_MAX_EXPERTS];
+
     // ff bias
     struct ggml_tensor * ffn_down_b; // b2
     struct ggml_tensor * ffn_up_b; // b3
@@ -1300,14 +1391,10 @@ struct llama_kv_cache {
 
     struct ggml_context * ctx = NULL;
 
-
+    ggml_backend_buffer_t buf = NULL;
 
     ~llama_kv_cache() {
-
-            ggml_free(ctx);
-        }
-
-#ifdef GGML_USE_CUBLAS
+#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
         if (ggml_cublas_loaded()) {
             for (size_t i = 0; i < k_l.size(); ++i) {
                 ggml_cuda_free_data(k_l[i]);
@@ -1315,6 +1402,11 @@ struct llama_kv_cache {
             }
         }
 #endif
+        if (ctx) {
+            ggml_free(ctx);
+        }
+
+        ggml_backend_buffer_free(buf);
     }
 };
 
@@ -1354,11 +1446,11 @@ struct llama_vocab {
     id special_suffix_id = 32008;
     id special_eot_id = 32010;
 
-    int find_bpe_rank(std::string token_left, std::string token_right) const {
-        GGML_ASSERT(token_left.find(
-        GGML_ASSERT(token_left.find(
-        GGML_ASSERT(token_right.find(
-        GGML_ASSERT(token_right.find(
+    int find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
+        GGML_ASSERT(token_left.find(' ') == std::string::npos);
+        GGML_ASSERT(token_left.find('\n') == std::string::npos);
+        GGML_ASSERT(token_right.find(' ') == std::string::npos);
+        GGML_ASSERT(token_right.find('\n') == std::string::npos);
 
         auto it = bpe_ranks.find(std::make_pair(token_left, token_right));
         if (it == bpe_ranks.end()) {
@@ -1387,6 +1479,7 @@ struct llama_model {
     struct ggml_tensor * output_norm;
     struct ggml_tensor * output_norm_b;
     struct ggml_tensor * output;
+    struct ggml_tensor * output_b;
 
     std::vector<llama_layer> layers;
 
@@ -1399,7 +1492,7 @@ struct llama_model {
     struct ggml_context * ctx = NULL;
 
     // the model memory buffer
-
+    ggml_backend_buffer_t buf = NULL;
 
     // model memory mapped file
     std::unique_ptr<llama_mmap> mapping;
@@ -1415,11 +1508,7 @@ struct llama_model {
     int64_t t_start_us = 0;
 
     ~llama_model() {
-
-            ggml_free(ctx);
-        }
-
-#ifdef GGML_USE_CUBLAS
+#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
         if (ggml_cublas_loaded()) {
             for (size_t i = 0; i < tensors_by_name.size(); ++i) {
                 ggml_cuda_free_data(tensors_by_name[i].second);
@@ -1433,24 +1522,26 @@ struct llama_model {
             ggml_cl_free_data(tensors_by_name[i].second);
         }
 #endif
+        if (ctx) {
+            ggml_free(ctx);
+        }
+
+        ggml_backend_buffer_free(buf);
     }
 };
 
 struct llama_context {
     llama_context(const llama_model & model) : model(model), t_start_us(model.t_start_us), t_load_us(model.t_load_us) {}
     ~llama_context() {
-
-
-
-        }
-#endif
-        if (alloc) {
-            ggml_allocr_free(alloc);
-        }
+        ggml_allocr_free(alloc);
+        ggml_backend_buffer_free(buf_alloc);
+        ggml_backend_free(backend);
     }
 
     llama_cparams cparams;
 
+    ggml_backend_t backend = nullptr;
+
     const llama_model & model;
 
     // key + value cache for the self attention
@@ -1472,23 +1563,22 @@ struct llama_context {
 
     // decode output (2-dimensional array: [n_tokens][n_vocab])
     std::vector<float> logits;
+#ifndef NDEBUG
+    // guard against access to unset logits
+    std::vector<bool> logits_valid;
+#endif
     bool logits_all = false;
 
     // input embedding (1-dimensional array: [n_embd])
     std::vector<float> embedding;
 
-    // reusable buffer for `struct ggml_graph_plan.work_data`
-    std::vector<uint8_t> work_buffer;
-
     // memory buffers used to evaluate the model
-
-
-    llama_buffer buf_alloc;
+    std::vector<uint8_t> buf_compute_meta;
+    ggml_backend_buffer_t buf_alloc = NULL;
     ggml_allocr * alloc = NULL;
 
-
-
-#endif
+    // temporary buffer for copying data to/from the backend
+    std::vector<no_init<uint8_t>> buf_copy;
 
 #ifdef GGML_USE_MPI
     ggml_mpi_context * ctx_mpi = NULL;
@@ -1510,9 +1600,6 @@ static bool llama_kv_cache_init(
     const uint32_t n_embd = hparams.n_embd_gqa();
     const uint32_t n_layer = hparams.n_layer;
 
-    const int64_t n_mem = n_layer*n_ctx;
-    const int64_t n_elements = n_embd*n_mem;
-
     cache.has_shift = false;
 
     cache.head = 0;
@@ -1522,13 +1609,10 @@ static bool llama_kv_cache_init(
     cache.cells.clear();
     cache.cells.resize(n_ctx);
 
-    cache.buf.resize(n_elements*(ggml_type_sizef(ktype) + ggml_type_sizef(vtype)) + 2u*n_layer*ggml_tensor_overhead());
-    memset(cache.buf.data, 0, cache.buf.size);
-
     struct ggml_init_params params;
-    params.mem_size =
-    params.mem_buffer =
-    params.no_alloc =
+    params.mem_size = 2u*n_layer*ggml_tensor_overhead();
+    params.mem_buffer = NULL;
+    params.no_alloc = true;
 
     cache.ctx = ggml_init(params);
 
@@ -1542,9 +1626,7 @@ static bool llama_kv_cache_init(
     cache.k_l.reserve(n_layer);
     cache.v_l.reserve(n_layer);
 
-    const int i_gpu_start = (int) n_layer - n_gpu_layers;
-
-    GGML_UNUSED(offload);
+    const int i_gpu_start = (int) n_layer - n_gpu_layers;
 
     for (int i = 0; i < (int) n_layer; i++) {
         ggml_tensor * k = ggml_new_tensor_1d(cache.ctx, ktype, n_embd*n_ctx);
@@ -1553,23 +1635,35 @@ static bool llama_kv_cache_init(
         ggml_format_name(v, "cache_v_l%d", i);
         cache.k_l.push_back(k);
         cache.v_l.push_back(v);
-#
+#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
         if (i >= i_gpu_start) {
             if (offload) {
                 ggml_cuda_assign_buffers_no_scratch(k);
-                vram_kv_cache += ggml_nbytes(k);
                 ggml_cuda_assign_buffers_no_scratch(v);
+                vram_kv_cache += ggml_nbytes(k);
                 vram_kv_cache += ggml_nbytes(v);
+                // HACK: mark tensor as allocated
+                k->data = v->data = (void *)(uintptr_t)1;
             }
         }
 #endif // GGML_USE_CUBLAS
     }
 
+    // allocate tensors
+    cache.buf = ggml_backend_alloc_ctx_tensors_from_buft(cache.ctx, llama_default_buffer_type(n_gpu_layers));
+
+    // buf may be NULL with full offload
+    if (cache.buf) {
+        // initialize the buffer to avoid NaNs in the padding
+        ggml_backend_buffer_clear(cache.buf, 0);
+    }
+
     if (vram_kv_cache > 0) {
         LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
     }
 
-    GGML_UNUSED(
+    GGML_UNUSED(i_gpu_start);
+    GGML_UNUSED(offload);
 
     return true;
 }
@@ -1900,7 +1994,7 @@ namespace GGUFMeta {
                 target = override->bool_value;
                 return true;
             }
-            return
+            return false;
         }
 
         template<typename OT>
@@ -2020,17 +2114,16 @@ struct llama_model_loader {
         enum ggml_type type_max = GGML_TYPE_F32;
 
         for (int i = 0; i < n_tensors; i++) {
-
-            struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, name);
+            enum ggml_type type = gguf_get_tensor_type(ctx_gguf, i);
 
-            n_type[
+            n_type[type]++;
 
-            if (n_type_max < n_type[
-                n_type_max = n_type[
-                type_max =
+            if (n_type_max < n_type[type]) {
+                n_type_max = n_type[type];
+                type_max = type;
             }
 
-            LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, name, ggml_type_name(meta->type), llama_format_tensor_shape(meta).c_str());
+            // LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, name, ggml_type_name(meta->type), llama_format_tensor_shape(meta).c_str());
         }
 
         switch (type_max) {
@@ -2168,34 +2261,19 @@ struct llama_model_loader {
         return gguf_get_tensor_name(ctx_gguf, i);
     }
 
-    struct ggml_tensor * get_tensor_meta(
-        return ggml_get_tensor(ctx_meta,
+    struct ggml_tensor * get_tensor_meta(const char * name) const {
+        return ggml_get_tensor(ctx_meta, name);
     }
 
-
-
-        mmapped_size_p = 0;
-
-        for (int i = 0; i < n_tensors; i++) {
-            struct ggml_tensor * meta = get_tensor_meta(i);
-            ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE;
-            (use_mmap ? mmapped_size_p : ctx_size_p) += ggml_nbytes_pad(meta);
-        }
+    struct ggml_tensor * get_tensor_meta(int i) const {
+        return get_tensor_meta(get_tensor_name(i));
     }
 
     struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend_type backend) {
-        if (backend != GGML_BACKEND_CPU) {
-            ggml_set_no_alloc(ctx, true);
-        }
-
         struct ggml_tensor * tensor = ggml_dup_tensor(ctx, meta);
         tensor->backend = backend; // TODO: ggml_set_backend
         ggml_set_name(tensor, ggml_get_name(meta));
 
-        if (backend != GGML_BACKEND_CPU) {
-            ggml_set_no_alloc(ctx, use_mmap);
-        }
-
         n_created++;
 
         return tensor;
@@ -2253,91 +2331,144 @@ struct llama_model_loader {
         return gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, idx);
     }
 
+    void init_mapping(bool prefetch = true) {
+        /*
+        // prefetch only CPU tensors
+        if (use_mmap) {
+            size_t size_pref = 0; // prefetch
+
+            for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
+                struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i));
+                if (cur->backend == GGML_BACKEND_CPU) {
+                    size_t tensor_end = gguf_get_tensor_offset(ctx_gguf, i) + ggml_nbytes(cur);
+                    size_pref = std::max(size_pref, tensor_end);
+                }
+            }
+            mapping.reset(new llama_mmap(&file, gguf_get_data_offset(ctx_gguf) + size_pref, ggml_is_numa()));
+        }
+        */
+        // prefetch the whole file - all the data is needed anyway
+        if (use_mmap) {
+            mapping.reset(new llama_mmap(&file, prefetch ? -1 : 0, ggml_is_numa()));
+        }
+    }
+
+    // for backwards compatibility, does not support ggml-backend
     void load_data_for(struct ggml_tensor * cur) const {
         const size_t offs = file_offset(ggml_get_name(cur));
 
-        if (use_mmap) {
-            cur->data
+        if (use_mmap && mapping) {
+            GGML_ASSERT(cur->data == nullptr);
+            cur->data = (uint8_t *)mapping->addr + offs;
         } else {
+            GGML_ASSERT(cur->data != nullptr);
             file.seek(offs, SEEK_SET);
             file.read_raw(cur->data, ggml_nbytes(cur));
         }
     }
 
-
+    // Returns false if cancelled by progress_callback
+    bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) const {
         size_t size_data = 0;
-        size_t size_lock = 0;
-        size_t size_pref = 0; // prefetch
 
         for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
             struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i));
             size_data += ggml_nbytes(cur);
-            if (cur->backend == GGML_BACKEND_CPU) {
-                size_pref += ggml_nbytes(cur);
-            }
         }
 
-        if (use_mmap) {
-            mapping.reset(new llama_mmap(&file, size_pref, ggml_is_numa()));
+        if (use_mmap && buf_mmap) {
            if (lmlock) {
                lmlock->init(mapping->addr);
            }
        }

-
+#if (defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)) || defined(GGML_USE_CLBLAST)
+        const bool legacy_offload = true;
+#else
+        const bool legacy_offload = false;
+#endif
+
+        std::vector<no_init<uint8_t>> read_buf;
+
+        size_t size_done = 0;
+
+        size_t mmap_first = -1;
+        size_t mmap_last = 0;
+
         for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
             struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i));
             GGML_ASSERT(cur); // unused tensors should have been caught by load_data already
 
             if (progress_callback) {
-                progress_callback((float)
-
-
-            // allocate temp buffer if not using mmap
-            if (!use_mmap && cur->data == NULL) {
-                GGML_ASSERT(cur->backend != GGML_BACKEND_CPU);
-                #ifdef GGML_USE_CPU_HBM
-                cur->data = (uint8_t*)hbw_malloc(ggml_nbytes(cur));
-                #else
-                cur->data = (uint8_t*)malloc(ggml_nbytes(cur));
-                #endif
+                if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
+                    return false;
+                }
             }
 
-
+            const size_t offs = file_offset(ggml_get_name(cur));
 
-
-
-            if (
-
-                lmlock
+            if (!legacy_offload || cur->backend == GGML_BACKEND_CPU) {
+                if (use_mmap && mapping) {
+                    if (buf_mmap) {
+                        ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + offs);
+                        if (lmlock) {
+                            lmlock->grow_to(offs + ggml_nbytes(cur));
+                        }
+                        mmap_first = std::min(mmap_first, offs);
+                        mmap_last = std::max(mmap_last, offs + ggml_nbytes(cur));
+                    } else {
+                        ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + offs, 0, ggml_nbytes(cur));
                    }
-
-
-
-
-
-
-
-
-
-                if (!use_mmap) {
-                    free(cur->data);
+                } else {
+                    if (ggml_backend_buffer_is_host(cur->buffer)) {
+                        file.seek(offs, SEEK_SET);
+                        file.read_raw(cur->data, ggml_nbytes(cur));
+                    } else {
+                        read_buf.resize(ggml_nbytes(cur));
+                        file.seek(offs, SEEK_SET);
+                        file.read_raw(read_buf.data(), ggml_nbytes(cur));
+                        ggml_backend_tensor_set(cur, read_buf.data(), 0, ggml_nbytes(cur));
                    }
-
+                }
+            } else {
+                // HACK: mark tensor as allocated
+                cur->data = (void *)(uintptr_t)1;
+                void * data;
+                if (use_mmap && mapping) {
+                    data = (uint8_t *) mapping->addr + offs;
+                } else {
+                    read_buf.resize(ggml_nbytes(cur));
+                    file.seek(offs, SEEK_SET);
+                    file.read_raw(read_buf.data(), ggml_nbytes(cur));
+                    data = read_buf.data();
+                }
+
+#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
+                ggml_cuda_transform_tensor(data, cur);
 #elif defined(GGML_USE_CLBLAST)
-
-
-
-
-
-                    break;
+                GGML_ASSERT(cur->backend == GGML_BACKEND_GPU);
+                ggml_cl_transform_tensor(data, cur);
+#else
+                GGML_ASSERT(!"GPU tensor without a GPU backend");
+                GGML_UNUSED(data);
 #endif
-                default:
-                    continue;
             }
 
-
+            size_done += ggml_nbytes(cur);
         }
+
+        // unmap offloaded tensors and metadata
+        if (use_mmap && mapping) {
+            mapping->unmap_fragment(0, mmap_first);
+            mapping->unmap_fragment(mmap_last, mapping->size);
+        }
+
+        if (progress_callback) {
+            // Even though the model is done loading, we still honor
+            // cancellation since we need to free allocations.
+            return progress_callback(1.0f, progress_callback_user_data);
+        }
+        return true;
     }
 };
 
@@ -2360,25 +2491,25 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
 
     switch (ftype) {
         case LLAMA_FTYPE_ALL_F32: return "all F32";
-        case LLAMA_FTYPE_MOSTLY_F16: return "
-        case LLAMA_FTYPE_MOSTLY_Q4_0: return "
-        case LLAMA_FTYPE_MOSTLY_Q4_1: return "
+        case LLAMA_FTYPE_MOSTLY_F16: return "F16";
+        case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
+        case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
         case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
-            return "
-        case LLAMA_FTYPE_MOSTLY_Q5_0: return "
-        case LLAMA_FTYPE_MOSTLY_Q5_1: return "
-        case LLAMA_FTYPE_MOSTLY_Q8_0: return "
+            return "Q4_1, some F16";
+        case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0";
+        case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1";
+        case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
 
         // K-quants
-        case LLAMA_FTYPE_MOSTLY_Q2_K: return "
-        case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "
-        case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "
-        case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "
-        case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "
-        case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "
-        case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "
-        case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "
-        case LLAMA_FTYPE_MOSTLY_Q6_K: return "
+        case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large";
+        case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "Q4_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "Q4_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
 
         default: return "unknown, may not work";
     }
@@ -2435,6 +2566,16 @@ static void llm_load_hparams(
     ml.get_key (LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff);
     ml.get_key (LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head);
     ml.get_key (LLM_KV_BLOCK_COUNT, hparams.n_layer);
+    ml.get_key (LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
+    ml.get_key (LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
+
+    GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
+    GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
+    if (hparams.n_expert > 0) {
+        GGML_ASSERT(hparams.n_expert_used > 0);
+    } else {
+        GGML_ASSERT(hparams.n_expert_used == 0);
+    }
 
     // n_head_kv is optional, default to n_head
     hparams.n_head_kv = hparams.n_head;
@@ -2486,6 +2627,7 @@ static void llm_load_hparams(
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
                 switch (hparams.n_layer) {
+                    case 22: model.type = e_model::MODEL_1B; break;
                     case 26: model.type = e_model::MODEL_3B; break;
                    case 32: model.type = e_model::MODEL_7B; break;
                    case 40: model.type = e_model::MODEL_13B; break;
@@ -2587,6 +2729,15 @@ static void llm_load_hparams(
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
+        case LLM_ARCH_PHI2:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+
+                switch (hparams.n_layer) {
+                    case 32: model.type = e_model::MODEL_3B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
 
        default: (void)0;
    }
@@ -2753,7 +2904,7 @@ static void llm_load_vocab(
     // The assumption is, since special tokens aren't meant to be exposed to end user, they are designed
     // to be unmatchable by the tokenizer, therefore tokens from the vocab, which are unmatchable by the tokenizer
     // are special tokens.
-    // From testing, this appears to
+    // From testing, this appears to correlate 1:1 with special tokens.
     //
 
     // Counting special tokens and verifying in only one direction
@@ -2866,6 +3017,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
     LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
     LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
+    LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
+    LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
     LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
     LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
     LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
@@ -2892,7 +3045,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
 }
 
-
+// Returns false if cancelled by progress_callback
+static bool llm_load_tensors(
         llama_model_loader & ml,
         llama_model & model,
         int n_gpu_layers,
@@ -2908,25 +3062,16 @@ static void llm_load_tensors(
 
     model.n_gpu_layers = n_gpu_layers;
 
-    size_t ctx_size;
-    size_t mmapped_size;
-
-    ml.calc_sizes(ctx_size, mmapped_size);
+    size_t ctx_size = ggml_tensor_overhead() * ml.n_tensors;
 
-    LLAMA_LOG_INFO("%s: ggml ctx size
+    LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, ctx_size/1024.0/1024.0);
 
     // create the ggml context
     {
-        model.buf.resize(ctx_size);
-        if (use_mlock) {
-            model.mlock_buf.init (model.buf.data);
-            model.mlock_buf.grow_to(model.buf.size);
-        }
-
         struct ggml_init_params params = {
-            /*.mem_size =*/
-            /*.mem_buffer =*/
-            /*.no_alloc =*/
+            /*.mem_size =*/ ctx_size,
+            /*.mem_buffer =*/ NULL,
+            /*.no_alloc =*/ true,
        };
 
        model.ctx = ggml_init(params);
@@ -2937,25 +3082,24 @@ static void llm_load_tensors(
 
     (void) main_gpu;
 
-    enum ggml_backend_type llama_backend_offload
+    enum ggml_backend_type llama_backend_offload = GGML_BACKEND_CPU;
     enum ggml_backend_type llama_backend_offload_split = GGML_BACKEND_CPU;
 
-#
+#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
     if (ggml_cublas_loaded()) {
         LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__);
         ggml_cuda_set_main_device(main_gpu);
 
-        llama_backend_offload
+        llama_backend_offload = GGML_BACKEND_GPU;
         llama_backend_offload_split = GGML_BACKEND_GPU_SPLIT;
     }
 #elif defined(GGML_USE_CLBLAST)
     LLAMA_LOG_INFO("%s: using OpenCL for GPU acceleration\n", __func__);
-    llama_backend_offload
+    llama_backend_offload = GGML_BACKEND_GPU;
     llama_backend_offload_split = GGML_BACKEND_GPU;
 #endif
 
-    //
-    size_t vram_weights = 0;
+    // create tensors for the weights
     {
         const int64_t n_embd = hparams.n_embd;
         const int64_t n_embd_gqa = hparams.n_embd_gqa();
@@ -2984,13 +3128,6 @@ static void llm_load_tensors(
 
                         model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
                         model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
-
-                        if (backend_norm == GGML_BACKEND_GPU) {
-                            vram_weights += ggml_nbytes(model.output_norm);
-                        }
-                        if (backend_output == GGML_BACKEND_GPU_SPLIT) {
-                            vram_weights += ggml_nbytes(model.output);
-                        }
                     }
 
                     const uint32_t n_ff = hparams.n_ff;
@@ -3020,20 +3157,25 @@ static void llm_load_tensors(
 
                         layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
 
-                        layer.
-                        layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
-                        layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+                        layer.ffn_gate_inp = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd}, backend, false);
 
-                        if (
-
-
-
-
-
-
-
-
-
+                        if (layer.ffn_gate_inp == nullptr) {
+                            GGML_ASSERT(hparams.n_expert == 0);
+                            GGML_ASSERT(hparams.n_expert_used == 0);
+
+                            layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
+                            layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
+                            layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+                        } else {
+                            GGML_ASSERT(hparams.n_expert > 0);
+                            GGML_ASSERT(hparams.n_expert_used > 0);
+
+                            // MoE branch
+                            for (uint32_t x = 0; x < hparams.n_expert; ++x) {
+                                layer.ffn_gate_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), {n_embd, n_ff}, backend_split);
+                                layer.ffn_down_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd}, backend_split);
+                                layer.ffn_up_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), {n_embd, n_ff}, backend_split);
+                            }
                         }
                     }
                 } break;
@@ -3054,13 +3196,6 @@ static void llm_load_tensors(
 
                         model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
                         model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
-
-                        if (backend_norm == GGML_BACKEND_GPU) {
-                            vram_weights += ggml_nbytes(model.output_norm);
-                        }
-                        if (backend_output == GGML_BACKEND_GPU_SPLIT) {
-                            vram_weights += ggml_nbytes(model.output);
-                        }
                     }
 
                     const uint32_t n_ff = hparams.n_ff;
@@ -3087,19 +3222,10 @@ static void llm_load_tensors(
                         layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
                         layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
                         layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
-
-                        if (backend == GGML_BACKEND_GPU) {
-                            vram_weights +=
-                                ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
-                                ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
-                                ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
-                        }
                     }
                 } break;
             case LLM_ARCH_FALCON:
                 {
-                    // TODO: CPU-only for now
-
                     model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
 
                     // output
@@ -3118,14 +3244,6 @@ static void llm_load_tensors(
                         model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
                         model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
                         model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
-
-                        if (backend_norm == GGML_BACKEND_GPU) {
-                            vram_weights += ggml_nbytes(model.output_norm);
-                            vram_weights += ggml_nbytes(model.output_norm_b);
-                        }
-                        if (backend_output == GGML_BACKEND_GPU_SPLIT) {
-                            vram_weights += ggml_nbytes(model.output);
-                        }
                     }
 
                     const uint32_t n_ff = hparams.n_ff;
@@ -3146,11 +3264,6 @@ static void llm_load_tensors(
                         if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str()) >= 0) {
                            layer.attn_norm_2 = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, backend);
                            layer.attn_norm_2_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, backend);
-
-                            if (backend == GGML_BACKEND_GPU) {
-                                vram_weights += ggml_nbytes(layer.attn_norm_2);
-                                vram_weights += ggml_nbytes(layer.attn_norm_2_b);
-                            }
                         }
 
                         layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
@@ -3158,13 +3271,6 @@ static void llm_load_tensors(
 
                         layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
                         layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
-
-                        if (backend == GGML_BACKEND_GPU) {
-                            vram_weights +=
-                                ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
-                                ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.wo) +
-                                ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
-                        }
                     }
                 } break;
             case LLM_ARCH_STARCODER:
@@ -3188,14 +3294,6 @@ static void llm_load_tensors(
                         model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
                         model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
                         model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
-
-                        if (backend_norm == GGML_BACKEND_GPU) {
-                            vram_weights += ggml_nbytes(model.output_norm);
-                            vram_weights += ggml_nbytes(model.output_norm_b);
-                        }
-                        if (backend_output == GGML_BACKEND_GPU_SPLIT) {
-                            vram_weights += ggml_nbytes(model.output);
-                        }
                     }
 
                     const uint32_t n_ff = hparams.n_ff;
@@ -3227,16 +3325,6 @@ static void llm_load_tensors(
 
                         layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
                        layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
-
-                        if (backend == GGML_BACKEND_GPU) {
-                            vram_weights +=
-                                ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
-                                ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) +
-                                ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) +
-                                ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) +
-                                ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_down_b) +
-                                ggml_nbytes(layer.ffn_up) + ggml_nbytes(layer.ffn_up_b);
-                        }
                     }
                 } break;
             case LLM_ARCH_PERSIMMON:
@@ -3258,14 +3346,6 @@ static void llm_load_tensors(
                         model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
                         model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
                         model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
-
-                        if (backend_norm == GGML_BACKEND_GPU) {
-                            vram_weights += ggml_nbytes(model.output_norm);
-                            vram_weights += ggml_nbytes(model.output_norm_b);
-                        }
-                        if (backend_output == GGML_BACKEND_GPU_SPLIT) {
-                            vram_weights += ggml_nbytes(model.output);
-                        }
                     }
 
                     const uint32_t n_ff = hparams.n_ff;
@@ -3295,8 +3375,6 @@ static void llm_load_tensors(
                 } break;
             case LLM_ARCH_BLOOM:
                 {
-                    // TODO: CPU-only for now
-
                     model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
                     model.tok_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, GGML_BACKEND_CPU);
                     model.tok_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, GGML_BACKEND_CPU);
@@ -3317,14 +3395,6 @@ static void llm_load_tensors(
                         model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
                         model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
                         model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
-
-                        if (backend_norm == GGML_BACKEND_GPU) {
-                            vram_weights += ggml_nbytes(model.output_norm);
-                            vram_weights += ggml_nbytes(model.output_norm_b);
-                        }
-                        if (backend_output == GGML_BACKEND_GPU_SPLIT) {
-                            vram_weights += ggml_nbytes(model.output);
-                        }
                     }
 
                     const uint32_t n_ff = hparams.n_ff;
@@ -3356,16 +3426,6 @@ static void llm_load_tensors(
 
                         layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
                        layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
-
-                        if (backend == GGML_BACKEND_GPU) {
-                            vram_weights +=
-                                ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
-                                ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) +
-                                ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) +
-                                ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) +
-                                ggml_nbytes(layer.ffn_up) + ggml_nbytes(layer.ffn_up_b) +
-                                ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_down_b);
-                        }
                     }
                 } break;
             case LLM_ARCH_MPT:
@@ -3387,13 +3447,6 @@ static void llm_load_tensors(
 
                         model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
                         model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
-
-                        if (backend_norm == GGML_BACKEND_GPU) {
-                            vram_weights += ggml_nbytes(model.output_norm);
-                        }
-                        if (backend_output == GGML_BACKEND_GPU_SPLIT) {
-                            vram_weights += ggml_nbytes(model.output);
-                        }
                     }
 
                     const uint32_t n_ff = hparams.n_ff;
@@ -3416,16 +3469,6 @@ static void llm_load_tensors(
 
                         layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
                        layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
-
-                        if (backend == GGML_BACKEND_GPU) {
-                            vram_weights +=
-                                ggml_nbytes(layer.attn_norm) +
-                                ggml_nbytes(layer.wqkv) +
-                                ggml_nbytes(layer.wo) +
-                                ggml_nbytes(layer.ffn_norm) +
-                                ggml_nbytes(layer.ffn_down) +
-                                ggml_nbytes(layer.ffn_up);
-                        }
                     }
                 } break;
             case LLM_ARCH_STABLELM:
@@ -3448,13 +3491,6 @@ static void llm_load_tensors(
                         model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
                         model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
                         model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
-
-                        if (backend_norm == GGML_BACKEND_GPU) {
-                            vram_weights += ggml_nbytes(model.output_norm);
-                        }
-                        if (backend_output == GGML_BACKEND_GPU_SPLIT) {
-                            vram_weights += ggml_nbytes(model.output);
-                        }
                     }
 
                     const uint32_t n_ff = hparams.n_ff;
@@ -3486,13 +3522,6 @@ static void llm_load_tensors(
                         layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
                         layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
                         layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
-
-                        if (backend == GGML_BACKEND_GPU) {
-                            vram_weights +=
-                                ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
-                                ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
-                                ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
-                        }
                     }
                 } break;
             case LLM_ARCH_QWEN:
@@ -3512,14 +3541,7 @@ static void llm_load_tensors(
|
|
3512
3541
|
|
3513
3542
|
model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
|
3514
3543
|
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
|
3515
|
-
|
3516
|
-
if (backend_norm == GGML_BACKEND_GPU) {
|
3517
|
-
vram_weights += ggml_nbytes(model.output_norm);
|
3518
|
-
}
|
3519
|
-
if (backend_output == GGML_BACKEND_GPU_SPLIT) {
|
3520
|
-
vram_weights += ggml_nbytes(model.output);
|
3521
|
-
}
|
3522
|
-
}
|
3544
|
+
}
|
3523
3545
|
|
3524
3546
|
const uint32_t n_ff = hparams.n_ff / 2;
|
3525
3547
|
|
@@ -3544,16 +3566,59 @@ static void llm_load_tensors(
|
|
3544
3566
|
layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
|
3545
3567
|
layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
|
3546
3568
|
layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
|
3569
|
+
}
|
3570
|
+
} break;
|
3571
|
+
case LLM_ARCH_PHI2:
|
3572
|
+
{
|
3573
|
+
model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
|
3547
3574
|
|
3548
|
-
|
3549
|
-
|
3550
|
-
|
3551
|
-
|
3552
|
-
|
3575
|
+
// output
|
3576
|
+
{
|
3577
|
+
ggml_backend_type backend_norm;
|
3578
|
+
ggml_backend_type backend_output;
|
3579
|
+
|
3580
|
+
if (n_gpu_layers > int(n_layer)) {
|
3581
|
+
backend_norm = llama_backend_offload;
|
3582
|
+
backend_output = llama_backend_offload;
|
3583
|
+
} else {
|
3584
|
+
backend_norm = GGML_BACKEND_CPU;
|
3585
|
+
backend_output = GGML_BACKEND_CPU;
|
3553
3586
|
}
|
3587
|
+
|
3588
|
+
model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
|
3589
|
+
model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
|
3590
|
+
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
|
3591
|
+
model.output_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, backend_output);
|
3554
3592
|
}
|
3555
|
-
} break;
|
3556
3593
|
|
3594
|
+
const uint32_t n_ff = hparams.n_ff;
|
3595
|
+
|
3596
|
+
const int i_gpu_start = n_layer - n_gpu_layers;
|
3597
|
+
|
3598
|
+
model.layers.resize(n_layer);
|
3599
|
+
|
3600
|
+
for (uint32_t i = 0; i < n_layer; ++i) {
|
3601
|
+
const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
|
3602
|
+
const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
|
3603
|
+
|
3604
|
+
auto & layer = model.layers[i];
|
3605
|
+
|
3606
|
+
layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
|
3607
|
+
layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
|
3608
|
+
|
3609
|
+
layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
|
3610
|
+
layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend);
|
3611
|
+
|
3612
|
+
layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
|
3613
|
+
layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend);
|
3614
|
+
|
3615
|
+
layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
|
3616
|
+
layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend);
|
3617
|
+
|
3618
|
+
layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
|
3619
|
+
layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
|
3620
|
+
}
|
3621
|
+
} break;
|
3557
3622
|
default:
|
3558
3623
|
throw std::runtime_error("unknown architecture");
|
3559
3624
|
}
|
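For context on the new phi2 branch above: layers below `i_gpu_start = n_layer - n_gpu_layers` stay on the CPU backend and the rest are assigned to the offload backend, exactly as in the other architectures. A minimal standalone sketch of that split (plain C++, not part of the diff; the enum and values are illustrative):

    #include <cstdio>

    enum backend_type { BACKEND_CPU, BACKEND_GPU };

    int main() {
        // illustrative values; the real ones come from hparams and the model params
        const int n_layer      = 32;
        const int n_gpu_layers = 20;

        const int i_gpu_start = n_layer - n_gpu_layers;

        for (int i = 0; i < n_layer; ++i) {
            const backend_type backend = i < i_gpu_start ? BACKEND_CPU : BACKEND_GPU;
            std::printf("layer %2d -> %s\n", i, backend == BACKEND_CPU ? "CPU" : "GPU");
        }
        return 0;
    }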
@@ -3561,16 +3626,78 @@ static void llm_load_tensors(
|
|
3561
3626
|
|
3562
3627
|
ml.done_getting_tensors();
|
3563
3628
|
|
3629
|
+
ml.init_mapping();
|
3630
|
+
|
3631
|
+
// allocate tensors
|
3632
|
+
size_t vram_weights = 0;
|
3633
|
+
size_t buf_size = 0;
|
3634
|
+
|
3635
|
+
ggml_backend_buffer_type_t buft = llama_default_buffer_type(n_gpu_layers);
|
3636
|
+
|
3637
|
+
for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
|
3638
|
+
// GGML_BACKEND_GPU tensors are for CUDA and OpenCL only, which are handled separately without ggml-backend
|
3639
|
+
if (t->backend == GGML_BACKEND_CPU) {
|
3640
|
+
buf_size += GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), ggml_backend_buft_get_alignment(buft));
|
3641
|
+
} else {
|
3642
|
+
vram_weights += ggml_nbytes(t);
|
3643
|
+
}
|
3644
|
+
}
|
3645
|
+
|
3646
|
+
// create backend buffer
|
3647
|
+
ggml_backend_buffer_t buf_mmap = nullptr;
|
3648
|
+
|
3649
|
+
#ifdef GGML_USE_METAL
|
3650
|
+
if (n_gpu_layers > 0) {
|
3651
|
+
if (ml.use_mmap) {
|
3652
|
+
const size_t max_size = ggml_get_max_tensor_size(ctx);
|
3653
|
+
model.buf = ggml_backend_metal_buffer_from_ptr(ml.mapping->addr, ml.mapping->size, max_size);
|
3654
|
+
buf_mmap = model.buf;
|
3655
|
+
} else {
|
3656
|
+
model.buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_metal_buffer_type());
|
3657
|
+
}
|
3658
|
+
}
|
3659
|
+
#elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST)
|
3660
|
+
// for testing only
|
3661
|
+
if (n_gpu_layers > 0) {
|
3662
|
+
model.buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cuda_buffer_type(0));
|
3663
|
+
}
|
3664
|
+
#endif
|
3665
|
+
|
3666
|
+
if (model.buf == nullptr) {
|
3667
|
+
// CPU backend, and indirectly CUDA and OpenCL
|
3668
|
+
if (ml.use_mmap) {
|
3669
|
+
model.buf = ggml_backend_cpu_buffer_from_ptr(ml.mapping->addr, ml.mapping->size);
|
3670
|
+
buf_mmap = model.buf;
|
3671
|
+
} else {
|
3672
|
+
// allocate only CPU tensors
|
3673
|
+
model.buf = ggml_backend_buft_alloc_buffer(buft, buf_size);
|
3674
|
+
ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(model.buf);
|
3675
|
+
for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
|
3676
|
+
if (t->backend == GGML_BACKEND_CPU) {
|
3677
|
+
ggml_tallocr_alloc(alloc, t);
|
3678
|
+
}
|
3679
|
+
}
|
3680
|
+
ggml_tallocr_free(alloc);
|
3681
|
+
}
|
3682
|
+
}
|
3683
|
+
|
3684
|
+
if (use_mlock && ggml_backend_buffer_is_host(model.buf)) {
|
3685
|
+
model.mlock_buf.init (ggml_backend_buffer_get_base(model.buf));
|
3686
|
+
model.mlock_buf.grow_to(ggml_backend_buffer_get_size(model.buf));
|
3687
|
+
}
|
3688
|
+
|
3564
3689
|
// print memory requirements
|
3565
3690
|
{
|
3566
|
-
|
3567
|
-
size_t mem_required =
|
3568
|
-
ctx_size +
|
3569
|
-
mmapped_size - vram_weights; // weights in VRAM not in memory
|
3691
|
+
size_t sys_mem_required = ctx_size + buf_size;
|
3570
3692
|
|
3571
|
-
|
3693
|
+
if (sys_mem_required > 0) {
|
3694
|
+
LLAMA_LOG_INFO("%s: system memory used = %7.2f MiB\n", __func__, sys_mem_required / 1024.0 / 1024.0);
|
3695
|
+
}
|
3696
|
+
if (vram_weights > 0) {
|
3697
|
+
LLAMA_LOG_INFO("%s: VRAM used = %7.2f MiB\n", __func__, vram_weights / 1024.0 / 1024.0);
|
3698
|
+
}
|
3572
3699
|
|
3573
|
-
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
|
3700
|
+
#if (defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)) || defined(GGML_USE_CLBLAST)
|
3574
3701
|
const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
|
3575
3702
|
|
3576
3703
|
LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
|
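The allocation code added above replaces the per-architecture `vram_weights` bookkeeping: it walks every tensor once, pads CPU-resident tensors to the buffer alignment and sums them into `buf_size`, and counts everything else as VRAM. A standalone sketch of that accounting, assuming made-up tensor sizes (the helper mirrors what GGML_PAD does; it is not the ggml API):

    #include <cstdint>
    #include <cstdio>
    #include <utility>
    #include <vector>

    // round x up to a multiple of n, as GGML_PAD does
    static size_t pad_to(size_t x, size_t n) { return (x + n - 1) / n * n; }

    int main() {
        // illustrative tensor byte sizes and whether each stays on the CPU
        std::vector<std::pair<size_t, bool>> tensors = {
            {131072, true}, {262144, false}, {65536, true},
        };
        const size_t alignment = 32;  // assumed buffer alignment

        size_t buf_size     = 0;      // host buffer for CPU tensors
        size_t vram_weights = 0;      // bytes accounted to the GPU

        for (const auto & [nbytes, on_cpu] : tensors) {
            if (on_cpu) {
                buf_size += pad_to(nbytes, alignment);
            } else {
                vram_weights += nbytes;
            }
        }

        std::printf("system memory: %zu bytes, VRAM: %zu bytes\n", buf_size, vram_weights);
        return 0;
    }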
@@ -3578,38 +3705,27 @@ static void llm_load_tensors(
|
|
3578
3705
|
LLAMA_LOG_INFO("%s: offloading non-repeating layers to GPU\n", __func__);
|
3579
3706
|
}
|
3580
3707
|
|
3581
|
-
#ifdef GGML_USE_CUBLAS
|
3582
3708
|
const int max_backend_supported_layers = hparams.n_layer + 1;
|
3583
3709
|
const int max_offloadable_layers = hparams.n_layer + 1;
|
3584
|
-
#elif GGML_USE_CLBLAST
|
3585
|
-
const int max_backend_supported_layers = hparams.n_layer + 1;
|
3586
|
-
const int max_offloadable_layers = hparams.n_layer + 1;
|
3587
|
-
#endif // GGML_USE_CUBLAS
|
3588
3710
|
|
3589
3711
|
LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
|
3590
|
-
LLAMA_LOG_INFO("%s: VRAM used: %.2f MiB\n", __func__, vram_weights / 1024.0 / 1024.0);
|
3591
|
-
#else
|
3592
|
-
(void) n_gpu_layers;
|
3593
3712
|
#endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
|
3594
3713
|
}
|
3595
3714
|
|
3596
|
-
|
3715
|
+
#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
|
3716
|
+
ggml_cuda_set_tensor_split(tensor_split);
|
3717
|
+
#else
|
3718
|
+
GGML_UNUSED(tensor_split);
|
3719
|
+
#endif // GGML_USE_CUBLAS
|
3720
|
+
|
3721
|
+
// populate tensors_by_name
|
3597
3722
|
for (int i = 0; i < ml.n_tensors; ++i) {
|
3598
3723
|
struct ggml_tensor * cur = ggml_get_tensor(ctx, ml.get_tensor_name(i));
|
3599
3724
|
model.tensors_by_name.emplace_back(ggml_get_name(cur), cur);
|
3600
3725
|
}
|
3601
3726
|
|
3602
|
-
(
|
3603
|
-
|
3604
|
-
{
|
3605
|
-
ggml_cuda_set_tensor_split(tensor_split);
|
3606
|
-
}
|
3607
|
-
#endif
|
3608
|
-
|
3609
|
-
ml.load_all_data(ctx, progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL);
|
3610
|
-
|
3611
|
-
if (progress_callback) {
|
3612
|
-
progress_callback(1.0f, progress_callback_user_data);
|
3727
|
+
if (!ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf_mmap, use_mlock ? &model.mlock_mmap : NULL)) {
|
3728
|
+
return false;
|
3613
3729
|
}
|
3614
3730
|
|
3615
3731
|
model.mapping = std::move(ml.mapping);
|
@@ -3617,9 +3733,11 @@ static void llm_load_tensors(
|
|
3617
3733
|
// loading time will be recalculate after the first eval, so
|
3618
3734
|
// we take page faults deferred by mmap() into consideration
|
3619
3735
|
model.t_load_us = ggml_time_us() - model.t_start_us;
|
3736
|
+
return true;
|
3620
3737
|
}
|
3621
3738
|
|
3622
|
-
|
3739
|
+
// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
|
3740
|
+
static int llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) {
|
3623
3741
|
try {
|
3624
3742
|
llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);
|
3625
3743
|
|
@@ -3637,19 +3755,21 @@ static bool llama_model_load(const std::string & fname, llama_model & model, con
|
|
3637
3755
|
|
3638
3756
|
if (params.vocab_only) {
|
3639
3757
|
LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
|
3640
|
-
return
|
3758
|
+
return 0;
|
3641
3759
|
}
|
3642
3760
|
|
3643
|
-
llm_load_tensors(
|
3761
|
+
if (!llm_load_tensors(
|
3644
3762
|
ml, model, params.n_gpu_layers, params.main_gpu, params.tensor_split, params.use_mlock,
|
3645
3763
|
params.progress_callback, params.progress_callback_user_data
|
3646
|
-
)
|
3764
|
+
)) {
|
3765
|
+
return -2;
|
3766
|
+
}
|
3647
3767
|
} catch (const std::exception & err) {
|
3648
3768
|
LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
|
3649
|
-
return
|
3769
|
+
return -1;
|
3650
3770
|
}
|
3651
3771
|
|
3652
|
-
return
|
3772
|
+
return 0;
|
3653
3773
|
}
|
3654
3774
|
|
3655
3775
|
//
|
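As the new comment above states, llama_model_load now returns 0 on success, -1 on error and -2 when the progress callback cancels the load. A minimal sketch of how a caller could branch on that convention (the loader here is a stand-in, not the real function):

    #include <cstdio>

    // stand-in loader following the 0 / -1 / -2 convention described above
    static int fake_model_load(bool fail, bool cancel) {
        if (cancel) return -2;
        if (fail)   return -1;
        return 0;
    }

    int main() {
        switch (fake_model_load(/*fail=*/false, /*cancel=*/false)) {
            case  0: std::puts("model loaded");               break;
            case -2: std::puts("load cancelled by callback"); break;
            default: std::puts("load failed");                break;
        }
        return 0;
    }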
@@ -3750,8 +3870,8 @@ static void llm_build_k_shift(
|
|
3750
3870
|
ggml_rope_custom_inplace(ctx,
|
3751
3871
|
ggml_view_3d(ctx, kv.k_l[il],
|
3752
3872
|
n_embd_head, n_head_kv, n_ctx,
|
3753
|
-
|
3754
|
-
|
3873
|
+
ggml_row_size(kv.k_l[il]->type, n_embd_head),
|
3874
|
+
ggml_row_size(kv.k_l[il]->type, n_embd_gqa),
|
3755
3875
|
0),
|
3756
3876
|
K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
3757
3877
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
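The K-shift change above swaps hand-written stride arithmetic for ggml_row_size, which gives the byte size of a row of n elements and therefore also works for block-quantized KV caches. A standalone sketch of that computation (type sizes here are assumptions for illustration, not read from ggml):

    #include <cstdint>
    #include <cstdio>

    // A row of n elements occupies n/block_size blocks of type_size bytes each;
    // block_size is 1 for f16/f32 and 32 for the block-quantized formats.
    static size_t row_size(size_t type_size, int64_t block_size, int64_t n) {
        return type_size * (n / block_size);
    }

    int main() {
        const int64_t n_embd_head = 128;
        // f16 K-cache: 2-byte elements, block size 1
        std::printf("f16 row: %zu bytes\n", row_size(2, 1, n_embd_head));
        // an assumed 4-bit block format: 18-byte blocks of 32 elements
        std::printf("q4 row:  %zu bytes\n", row_size(18, 32, n_embd_head));
        return 0;
    }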
@@ -3780,7 +3900,7 @@ static void llm_build_kv_store(
|
|
3780
3900
|
cb(v_cur_t, "v_cur_t", il);
|
3781
3901
|
|
3782
3902
|
struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_gqa,
|
3783
|
-
(
|
3903
|
+
(ggml_row_size(kv.k_l[il]->type, n_embd_gqa))*kv_head);
|
3784
3904
|
cb(k_cache_view, "k_cache_view", il);
|
3785
3905
|
|
3786
3906
|
struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_gqa,
|
@@ -3914,17 +4034,18 @@ static struct ggml_tensor * llm_build_ffn(
|
|
3914
4034
|
// if max_alibi_bias > 0 then apply ALiBi
|
3915
4035
|
static struct ggml_tensor * llm_build_kqv(
|
3916
4036
|
struct ggml_context * ctx,
|
4037
|
+
const llama_model & model,
|
3917
4038
|
const llama_hparams & hparams,
|
3918
4039
|
const llama_kv_cache & kv,
|
3919
4040
|
struct ggml_tensor * wo,
|
3920
4041
|
struct ggml_tensor * wo_b,
|
3921
4042
|
struct ggml_tensor * q_cur,
|
3922
|
-
struct ggml_tensor * kq_scale,
|
3923
4043
|
struct ggml_tensor * kq_mask,
|
3924
4044
|
int64_t n_ctx,
|
3925
4045
|
int32_t n_tokens,
|
3926
4046
|
int32_t n_kv,
|
3927
4047
|
float max_alibi_bias,
|
4048
|
+
float kq_scale,
|
3928
4049
|
const llm_build_cb & cb,
|
3929
4050
|
int il) {
|
3930
4051
|
const int64_t n_embd = hparams.n_embd;
|
@@ -3939,14 +4060,20 @@ static struct ggml_tensor * llm_build_kqv(
|
|
3939
4060
|
struct ggml_tensor * k =
|
3940
4061
|
ggml_view_3d(ctx, kv.k_l[il],
|
3941
4062
|
n_embd_head, n_kv, n_head_kv,
|
3942
|
-
|
3943
|
-
|
4063
|
+
ggml_row_size(kv.k_l[il]->type, n_embd_gqa),
|
4064
|
+
ggml_row_size(kv.k_l[il]->type, n_embd_head),
|
3944
4065
|
0);
|
3945
4066
|
cb(k, "k", il);
|
3946
4067
|
|
3947
4068
|
struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
|
3948
4069
|
cb(kq, "kq", il);
|
3949
4070
|
|
4071
|
+
if (model.arch == LLM_ARCH_PHI2) {
|
4072
|
+
// for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
|
4073
|
+
// ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
|
4074
|
+
ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
|
4075
|
+
}
|
4076
|
+
|
3950
4077
|
if (max_alibi_bias > 0.0f) {
|
3951
4078
|
// temporary branch until we figure out how to handle ggml_alibi through ggml_add
|
3952
4079
|
kq = ggml_scale(ctx, kq, kq_scale);
|
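The new PHI2 special case above forces the KQ matrix multiplication to f32 precision because, per the referenced discussion, half-precision accumulation produces NaNs for this model. A tiny standalone illustration of the underlying limit (the logit magnitude is an assumption chosen only to show the f16 range, not a measured value):

    #include <cmath>
    #include <cstdio>

    int main() {
        const float big_logit = 80000.0f;   // illustrative magnitude
        const float f16_max   = 65504.0f;   // largest finite half-precision value

        std::printf("fits in f16? %s\n", std::fabs(big_logit) <= f16_max ? "yes" : "no");
        return 0;
    }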
@@ -3966,7 +4093,7 @@ static struct ggml_tensor * llm_build_kqv(
|
|
3966
4093
|
kq = ggml_soft_max(ctx, kq);
|
3967
4094
|
cb(kq, "kq_soft_max", il);
|
3968
4095
|
} else {
|
3969
|
-
kq = ggml_soft_max_ext(ctx, kq, kq_mask,
|
4096
|
+
kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale);
|
3970
4097
|
cb(kq, "kq_soft_max_ext", il);
|
3971
4098
|
}
|
3972
4099
|
|
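With the change above, the per-head scale is no longer a one-element KQ_scale tensor; it is passed as a plain float and folded into ggml_soft_max_ext together with the mask. A standalone sketch of what that fused path computes for one row, softmax(kq*scale + mask), using only the standard library:

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    static void soft_max_ext(std::vector<float> & kq, const std::vector<float> & mask, float scale) {
        float max_val = -INFINITY;
        for (size_t i = 0; i < kq.size(); ++i) {
            kq[i]   = kq[i]*scale + mask[i];     // scale and mask in one pass
            max_val = std::max(max_val, kq[i]);
        }
        float sum = 0.0f;
        for (float & x : kq) { x = std::exp(x - max_val); sum += x; }
        for (float & x : kq) { x /= sum; }
    }

    int main() {
        std::vector<float> kq   = {1.0f, 2.0f, 3.0f, 4.0f};
        std::vector<float> mask = {0.0f, 0.0f, -INFINITY, -INFINITY}; // causal mask, 2 visible positions
        soft_max_ext(kq, mask, 1.0f/std::sqrt(64.0f));                // scale = 1/sqrt(n_embd_head)
        for (float x : kq) std::printf("%.4f ", x);
        std::printf("\n");
        return 0;
    }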
@@ -4014,6 +4141,8 @@ struct llm_build_context {
|
|
4014
4141
|
const int64_t n_head_kv;
|
4015
4142
|
const int64_t n_embd_head;
|
4016
4143
|
const int64_t n_embd_gqa;
|
4144
|
+
const int64_t n_expert;
|
4145
|
+
const int64_t n_expert_used;
|
4017
4146
|
|
4018
4147
|
const float freq_base;
|
4019
4148
|
const float freq_scale;
|
@@ -4033,7 +4162,7 @@ struct llm_build_context {
|
|
4033
4162
|
|
4034
4163
|
const llm_build_cb & cb;
|
4035
4164
|
|
4036
|
-
|
4165
|
+
std::vector<uint8_t> & buf_compute_meta;
|
4037
4166
|
|
4038
4167
|
struct ggml_context * ctx0 = nullptr;
|
4039
4168
|
|
@@ -4043,33 +4172,35 @@ struct llm_build_context {
|
|
4043
4172
|
const llama_batch & batch,
|
4044
4173
|
const llm_build_cb & cb,
|
4045
4174
|
bool worst_case) :
|
4046
|
-
model
|
4047
|
-
hparams
|
4048
|
-
cparams
|
4049
|
-
batch
|
4050
|
-
kv_self
|
4051
|
-
n_embd
|
4052
|
-
n_layer
|
4053
|
-
n_ctx
|
4054
|
-
n_head
|
4055
|
-
n_head_kv
|
4056
|
-
n_embd_head
|
4057
|
-
n_embd_gqa
|
4058
|
-
|
4059
|
-
|
4060
|
-
|
4061
|
-
|
4062
|
-
|
4063
|
-
|
4064
|
-
|
4065
|
-
|
4066
|
-
|
4067
|
-
|
4068
|
-
|
4069
|
-
|
4070
|
-
|
4071
|
-
|
4072
|
-
|
4175
|
+
model (lctx.model),
|
4176
|
+
hparams (model.hparams),
|
4177
|
+
cparams (lctx.cparams),
|
4178
|
+
batch (batch),
|
4179
|
+
kv_self (lctx.kv_self),
|
4180
|
+
n_embd (hparams.n_embd),
|
4181
|
+
n_layer (hparams.n_layer),
|
4182
|
+
n_ctx (cparams.n_ctx),
|
4183
|
+
n_head (hparams.n_head),
|
4184
|
+
n_head_kv (hparams.n_head_kv),
|
4185
|
+
n_embd_head (hparams.n_embd_head()),
|
4186
|
+
n_embd_gqa (hparams.n_embd_gqa()),
|
4187
|
+
n_expert (hparams.n_expert),
|
4188
|
+
n_expert_used (hparams.n_expert_used),
|
4189
|
+
freq_base (cparams.rope_freq_base),
|
4190
|
+
freq_scale (cparams.rope_freq_scale),
|
4191
|
+
ext_factor (cparams.yarn_ext_factor),
|
4192
|
+
attn_factor (cparams.yarn_attn_factor),
|
4193
|
+
beta_fast (cparams.yarn_beta_fast),
|
4194
|
+
beta_slow (cparams.yarn_beta_slow),
|
4195
|
+
norm_eps (hparams.f_norm_eps),
|
4196
|
+
norm_rms_eps (hparams.f_norm_rms_eps),
|
4197
|
+
n_tokens (batch.n_tokens),
|
4198
|
+
n_kv (worst_case ? n_ctx : kv_self.n),
|
4199
|
+
kv_head (worst_case ? n_ctx - n_tokens : kv_self.head),
|
4200
|
+
n_orig_ctx (cparams.n_yarn_orig_ctx),
|
4201
|
+
do_rope_shift (worst_case || kv_self.has_shift),
|
4202
|
+
cb (cb),
|
4203
|
+
buf_compute_meta (lctx.buf_compute_meta) {
|
4073
4204
|
GGML_ASSERT(!!kv_self.ctx);
|
4074
4205
|
|
4075
4206
|
// all initializations should be done in init()
|
@@ -4077,8 +4208,8 @@ struct llm_build_context {
|
|
4077
4208
|
|
4078
4209
|
void init() {
|
4079
4210
|
struct ggml_init_params params = {
|
4080
|
-
/*.mem_size =*/
|
4081
|
-
/*.mem_buffer =*/
|
4211
|
+
/*.mem_size =*/ buf_compute_meta.size(),
|
4212
|
+
/*.mem_buffer =*/ buf_compute_meta.data(),
|
4082
4213
|
/*.no_alloc =*/ true,
|
4083
4214
|
};
|
4084
4215
|
|
@@ -4107,10 +4238,6 @@ struct llm_build_context {
|
|
4107
4238
|
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
4108
4239
|
cb(inp_pos, "inp_pos", -1);
|
4109
4240
|
|
4110
|
-
// KQ_scale
|
4111
|
-
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
4112
|
-
cb(KQ_scale, "KQ_scale", -1);
|
4113
|
-
|
4114
4241
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
4115
4242
|
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
4116
4243
|
cb(KQ_mask, "KQ_mask", -1);
|
@@ -4169,9 +4296,9 @@ struct llm_build_context {
|
|
4169
4296
|
|
4170
4297
|
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
|
4171
4298
|
|
4172
|
-
cur = llm_build_kqv(ctx0, hparams, kv_self,
|
4299
|
+
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
|
4173
4300
|
model.layers[il].wo, model.layers[il].bo,
|
4174
|
-
Qcur,
|
4301
|
+
Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
4175
4302
|
cb(cur, "kqv_out", il);
|
4176
4303
|
}
|
4177
4304
|
|
@@ -4179,7 +4306,7 @@ struct llm_build_context {
|
|
4179
4306
|
cb(ffn_inp, "ffn_inp", il);
|
4180
4307
|
|
4181
4308
|
// feed-forward network
|
4182
|
-
{
|
4309
|
+
if (model.layers[il].ffn_gate_inp == nullptr) {
|
4183
4310
|
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
4184
4311
|
model.layers[il].ffn_norm, NULL,
|
4185
4312
|
LLM_NORM_RMS, cb, il);
|
@@ -4191,6 +4318,69 @@ struct llm_build_context {
|
|
4191
4318
|
model.layers[il].ffn_down, NULL,
|
4192
4319
|
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
4193
4320
|
cb(cur, "ffn_out", il);
|
4321
|
+
} else {
|
4322
|
+
// MoE branch
|
4323
|
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
4324
|
+
model.layers[il].ffn_norm, NULL,
|
4325
|
+
LLM_NORM_RMS, cb, il);
|
4326
|
+
cb(cur, "ffn_norm", il);
|
4327
|
+
|
4328
|
+
ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts]
|
4329
|
+
cb(logits, "ffn_moe_logits", il);
|
4330
|
+
|
4331
|
+
ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts]
|
4332
|
+
cb(probs, "ffn_moe_probs", il);
|
4333
|
+
|
4334
|
+
// select experts
|
4335
|
+
ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used); // [n_tokens, num_experts_per_tok]
|
4336
|
+
cb(selected_experts->src[0], "ffn_moe_argsort", il);
|
4337
|
+
|
4338
|
+
ggml_tensor * weights = ggml_get_rows(ctx0,
|
4339
|
+
ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts);
|
4340
|
+
cb(weights, "ffn_moe_weights", il);
|
4341
|
+
|
4342
|
+
weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok]
|
4343
|
+
|
4344
|
+
ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights);
|
4345
|
+
cb(weights_sum, "ffn_moe_weights_sum", il);
|
4346
|
+
|
4347
|
+
weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok]
|
4348
|
+
cb(weights, "ffn_moe_weights_norm", il);
|
4349
|
+
|
4350
|
+
// compute expert outputs
|
4351
|
+
ggml_tensor * moe_out = nullptr;
|
4352
|
+
|
4353
|
+
for (int i = 0; i < n_expert_used; ++i) {
|
4354
|
+
ggml_tensor * cur_expert;
|
4355
|
+
|
4356
|
+
ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exp, n_expert, selected_experts, i, cur);
|
4357
|
+
cb(cur_up, "ffn_moe_up", il);
|
4358
|
+
|
4359
|
+
ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exp, n_expert, selected_experts, i, cur);
|
4360
|
+
cb(cur_gate, "ffn_moe_gate", il);
|
4361
|
+
|
4362
|
+
cur_gate = ggml_silu(ctx0, cur_gate);
|
4363
|
+
cb(cur_gate, "ffn_moe_silu", il);
|
4364
|
+
|
4365
|
+
cur_expert = ggml_mul(ctx0, cur_up, cur_gate); // [n_tokens, n_embd]
|
4366
|
+
cb(cur_expert, "ffn_moe_gate_par", il);
|
4367
|
+
|
4368
|
+
cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exp, n_expert, selected_experts, i, cur_expert); // [n_tokens, n_embd]
|
4369
|
+
cb(cur_expert, "ffn_moe_down", il);
|
4370
|
+
|
4371
|
+
cur_expert = ggml_mul(ctx0, cur_expert,
|
4372
|
+
ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
|
4373
|
+
cb(cur_expert, "ffn_moe_weighted", il);
|
4374
|
+
|
4375
|
+
if (i == 0) {
|
4376
|
+
moe_out = cur_expert;
|
4377
|
+
} else {
|
4378
|
+
moe_out = ggml_add(ctx0, moe_out, cur_expert);
|
4379
|
+
cb(moe_out, "ffn_moe_out", il);
|
4380
|
+
}
|
4381
|
+
}
|
4382
|
+
|
4383
|
+
cur = moe_out;
|
4194
4384
|
}
|
4195
4385
|
|
4196
4386
|
cur = ggml_add(ctx0, cur, ffn_inp);
|
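The MoE branch added above routes each token through a subset of experts: softmax over the gate logits, top-k selection, renormalization of the selected weights, then a weighted sum of the per-expert up/gate/down projections (ggml_mul_mat_id). A standalone sketch of that routing math for a single token, with made-up logits:

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <numeric>
    #include <vector>

    int main() {
        const int n_expert      = 8;
        const int n_expert_used = 2;

        std::vector<float> logits = {0.1f, 2.0f, -1.0f, 0.5f, 1.5f, -0.3f, 0.0f, 0.7f};

        // softmax -> routing probabilities
        const float max_l = *std::max_element(logits.begin(), logits.end());
        std::vector<float> probs(n_expert);
        float sum = 0.0f;
        for (int e = 0; e < n_expert; ++e) { probs[e] = std::exp(logits[e] - max_l); sum += probs[e]; }
        for (float & p : probs) p /= sum;

        // top-k expert ids (the graph uses argsort + get_rows for this)
        std::vector<int> ids(n_expert);
        std::iota(ids.begin(), ids.end(), 0);
        std::partial_sort(ids.begin(), ids.begin() + n_expert_used, ids.end(),
                          [&](int a, int b) { return probs[a] > probs[b]; });

        // renormalise the selected weights so they sum to 1
        float sel_sum = 0.0f;
        for (int i = 0; i < n_expert_used; ++i) sel_sum += probs[ids[i]];

        for (int i = 0; i < n_expert_used; ++i) {
            // the real graph then runs the chosen expert's up/gate/down projections
            // and accumulates weight * expert_output into moe_out
            std::printf("expert %d weight %.3f\n", ids[i], probs[ids[i]] / sel_sum);
        }
        return 0;
    }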
@@ -4229,10 +4419,6 @@ struct llm_build_context {
|
|
4229
4419
|
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
4230
4420
|
cb(inp_pos, "inp_pos", -1);
|
4231
4421
|
|
4232
|
-
// KQ_scale
|
4233
|
-
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
4234
|
-
cb(KQ_scale, "KQ_scale", -1);
|
4235
|
-
|
4236
4422
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
4237
4423
|
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
4238
4424
|
cb(KQ_mask, "KQ_mask", -1);
|
@@ -4289,9 +4475,9 @@ struct llm_build_context {
|
|
4289
4475
|
// apply ALiBi for 13B model
|
4290
4476
|
const float max_alibi_bias = model.type == MODEL_13B ? 8.0f : -1.0f;
|
4291
4477
|
|
4292
|
-
cur = llm_build_kqv(ctx0, hparams, kv_self,
|
4478
|
+
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
|
4293
4479
|
model.layers[il].wo, NULL,
|
4294
|
-
Qcur,
|
4480
|
+
Qcur, KQ_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
4295
4481
|
cb(cur, "kqv_out", il);
|
4296
4482
|
}
|
4297
4483
|
|
@@ -4349,10 +4535,6 @@ struct llm_build_context {
|
|
4349
4535
|
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
4350
4536
|
cb(inp_pos, "inp_pos", -1);
|
4351
4537
|
|
4352
|
-
// KQ_scale
|
4353
|
-
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
4354
|
-
cb(KQ_scale, "KQ_scale", -1);
|
4355
|
-
|
4356
4538
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
4357
4539
|
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
4358
4540
|
cb(KQ_mask, "KQ_mask", -1);
|
@@ -4413,9 +4595,9 @@ struct llm_build_context {
|
|
4413
4595
|
|
4414
4596
|
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
|
4415
4597
|
|
4416
|
-
cur = llm_build_kqv(ctx0, hparams, kv_self,
|
4598
|
+
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
|
4417
4599
|
model.layers[il].wo, NULL,
|
4418
|
-
Qcur,
|
4600
|
+
Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
4419
4601
|
cb(cur, "kqv_out", il);
|
4420
4602
|
}
|
4421
4603
|
|
@@ -4472,10 +4654,6 @@ struct llm_build_context {
|
|
4472
4654
|
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
4473
4655
|
cb(inp_pos, "inp_pos", -1);
|
4474
4656
|
|
4475
|
-
// KQ_scale
|
4476
|
-
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
4477
|
-
cb(KQ_scale, "KQ_scale", -1);
|
4478
|
-
|
4479
4657
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
4480
4658
|
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
4481
4659
|
cb(KQ_mask, "KQ_mask", -1);
|
@@ -4513,9 +4691,9 @@ struct llm_build_context {
|
|
4513
4691
|
|
4514
4692
|
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
|
4515
4693
|
|
4516
|
-
cur = llm_build_kqv(ctx0, hparams, kv_self,
|
4694
|
+
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
|
4517
4695
|
model.layers[il].wo, model.layers[il].bo,
|
4518
|
-
Qcur,
|
4696
|
+
Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
4519
4697
|
cb(cur, "kqv_out", il);
|
4520
4698
|
}
|
4521
4699
|
|
@@ -4572,10 +4750,6 @@ struct llm_build_context {
|
|
4572
4750
|
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
4573
4751
|
cb(inp_pos, "inp_pos", -1);
|
4574
4752
|
|
4575
|
-
// KQ_scale
|
4576
|
-
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
4577
|
-
cb(KQ_scale, "KQ_scale", -1);
|
4578
|
-
|
4579
4753
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
4580
4754
|
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
4581
4755
|
cb(KQ_mask, "KQ_mask", -1);
|
@@ -4722,9 +4896,9 @@ struct llm_build_context {
|
|
4722
4896
|
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
|
4723
4897
|
|
4724
4898
|
// TODO: not tested, could be broken
|
4725
|
-
cur = llm_build_kqv(ctx0, hparams, kv_self,
|
4899
|
+
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
|
4726
4900
|
model.layers[il].wo, model.layers[il].bo,
|
4727
|
-
Q,
|
4901
|
+
Q, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
4728
4902
|
cb(cur, "kqv_out", il);
|
4729
4903
|
}
|
4730
4904
|
|
@@ -4778,10 +4952,6 @@ struct llm_build_context {
|
|
4778
4952
|
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
|
4779
4953
|
cb(inpL, "inp_embd", -1);
|
4780
4954
|
|
4781
|
-
// KQ_scale
|
4782
|
-
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
4783
|
-
cb(KQ_scale, "KQ_scale", -1);
|
4784
|
-
|
4785
4955
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
4786
4956
|
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
4787
4957
|
cb(KQ_mask, "KQ_mask", -1);
|
@@ -4813,9 +4983,9 @@ struct llm_build_context {
|
|
4813
4983
|
|
4814
4984
|
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
|
4815
4985
|
|
4816
|
-
cur = llm_build_kqv(ctx0, hparams, kv_self,
|
4986
|
+
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
|
4817
4987
|
model.layers[il].wo, NULL,
|
4818
|
-
Qcur,
|
4988
|
+
Qcur, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
4819
4989
|
cb(cur, "kqv_out", il);
|
4820
4990
|
}
|
4821
4991
|
|
@@ -4869,10 +5039,6 @@ struct llm_build_context {
|
|
4869
5039
|
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
|
4870
5040
|
cb(inpL, "inp_embd", -1);
|
4871
5041
|
|
4872
|
-
// KQ_scale
|
4873
|
-
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
4874
|
-
cb(KQ_scale, "KQ_scale", -1);
|
4875
|
-
|
4876
5042
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
4877
5043
|
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
4878
5044
|
cb(KQ_mask, "KQ_mask", -1);
|
@@ -4910,9 +5076,9 @@ struct llm_build_context {
|
|
4910
5076
|
|
4911
5077
|
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
|
4912
5078
|
|
4913
|
-
cur = llm_build_kqv(ctx0, hparams, kv_self,
|
5079
|
+
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
|
4914
5080
|
model.layers[il].wo, model.layers[il].bo,
|
4915
|
-
Qcur,
|
5081
|
+
Qcur, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
4916
5082
|
cb(cur, "kqv_out", il);
|
4917
5083
|
}
|
4918
5084
|
|
@@ -4963,10 +5129,6 @@ struct llm_build_context {
|
|
4963
5129
|
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
|
4964
5130
|
cb(inpL, "inp_embd", -1);
|
4965
5131
|
|
4966
|
-
// KQ_scale
|
4967
|
-
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
4968
|
-
cb(KQ_scale, "KQ_scale", -1);
|
4969
|
-
|
4970
5132
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
4971
5133
|
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
4972
5134
|
cb(KQ_mask, "KQ_mask", -1);
|
@@ -5004,9 +5166,9 @@ struct llm_build_context {
|
|
5004
5166
|
|
5005
5167
|
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
|
5006
5168
|
|
5007
|
-
cur = llm_build_kqv(ctx0, hparams, kv_self,
|
5169
|
+
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
|
5008
5170
|
model.layers[il].wo, NULL,
|
5009
|
-
Qcur,
|
5171
|
+
Qcur, KQ_mask, n_ctx, n_tokens, n_kv, hparams.f_max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
5010
5172
|
cb(cur, "kqv_out", il);
|
5011
5173
|
}
|
5012
5174
|
|
@@ -5066,10 +5228,6 @@ struct llm_build_context {
|
|
5066
5228
|
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
5067
5229
|
cb(inp_pos, "inp_pos", -1);
|
5068
5230
|
|
5069
|
-
// KQ_scale
|
5070
|
-
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
5071
|
-
cb(KQ_scale, "KQ_scale", -1);
|
5072
|
-
|
5073
5231
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
5074
5232
|
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
5075
5233
|
cb(KQ_mask, "KQ_mask", -1);
|
@@ -5098,40 +5256,152 @@ struct llm_build_context {
|
|
5098
5256
|
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
5099
5257
|
cb(Kcur, "Kcur", il);
|
5100
5258
|
|
5101
|
-
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
5259
|
+
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
5260
|
+
cb(Vcur, "Vcur", il);
|
5261
|
+
|
5262
|
+
Qcur = ggml_rope_custom(
|
5263
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
5264
|
+
hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
5265
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
5266
|
+
);
|
5267
|
+
cb(Qcur, "Qcur", il);
|
5268
|
+
|
5269
|
+
Kcur = ggml_rope_custom(
|
5270
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
5271
|
+
hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
5272
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
5273
|
+
);
|
5274
|
+
cb(Kcur, "Kcur", il);
|
5275
|
+
|
5276
|
+
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
|
5277
|
+
|
5278
|
+
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
|
5279
|
+
model.layers[il].wo, NULL,
|
5280
|
+
Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
5281
|
+
cb(cur, "kqv_out", il);
|
5282
|
+
}
|
5283
|
+
|
5284
|
+
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
5285
|
+
cb(ffn_inp, "ffn_inp", il);
|
5286
|
+
|
5287
|
+
// feed-forward network
|
5288
|
+
{
|
5289
|
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
5290
|
+
model.layers[il].ffn_norm,
|
5291
|
+
model.layers[il].ffn_norm_b,
|
5292
|
+
LLM_NORM, cb, il);
|
5293
|
+
cb(cur, "ffn_norm", il);
|
5294
|
+
|
5295
|
+
cur = llm_build_ffn(ctx0, cur,
|
5296
|
+
model.layers[il].ffn_up, NULL,
|
5297
|
+
model.layers[il].ffn_gate, NULL,
|
5298
|
+
model.layers[il].ffn_down, NULL,
|
5299
|
+
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
5300
|
+
cb(cur, "ffn_out", il);
|
5301
|
+
}
|
5302
|
+
|
5303
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
5304
|
+
cb(cur, "l_out", il);
|
5305
|
+
|
5306
|
+
// input for next layer
|
5307
|
+
inpL = cur;
|
5308
|
+
}
|
5309
|
+
|
5310
|
+
cur = inpL;
|
5311
|
+
|
5312
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
5313
|
+
model.output_norm,
|
5314
|
+
model.output_norm_b,
|
5315
|
+
LLM_NORM, cb, -1);
|
5316
|
+
cb(cur, "result_norm", -1);
|
5317
|
+
|
5318
|
+
// lm_head
|
5319
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
5320
|
+
cb(cur, "result_output", -1);
|
5321
|
+
|
5322
|
+
ggml_build_forward_expand(gf, cur);
|
5323
|
+
|
5324
|
+
return gf;
|
5325
|
+
}
|
5326
|
+
|
5327
|
+
struct ggml_cgraph * build_qwen() {
|
5328
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
5329
|
+
|
5330
|
+
struct ggml_tensor * cur;
|
5331
|
+
struct ggml_tensor * inpL;
|
5332
|
+
|
5333
|
+
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
|
5334
|
+
cb(inpL, "inp_embd", -1);
|
5335
|
+
|
5336
|
+
// inp_pos - contains the positions
|
5337
|
+
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
5338
|
+
cb(inp_pos, "inp_pos", -1);
|
5339
|
+
|
5340
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
5341
|
+
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
5342
|
+
cb(KQ_mask, "KQ_mask", -1);
|
5343
|
+
|
5344
|
+
// shift the entire K-cache if needed
|
5345
|
+
if (do_rope_shift) {
|
5346
|
+
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb);
|
5347
|
+
}
|
5348
|
+
|
5349
|
+
for (int il = 0; il < n_layer; ++il) {
|
5350
|
+
struct ggml_tensor * inpSA = inpL;
|
5351
|
+
|
5352
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
5353
|
+
model.layers[il].attn_norm, NULL,
|
5354
|
+
LLM_NORM_RMS, cb, il);
|
5355
|
+
cb(cur, "attn_norm", il);
|
5356
|
+
|
5357
|
+
// self-attention
|
5358
|
+
{
|
5359
|
+
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
|
5360
|
+
cb(cur, "wqkv", il);
|
5361
|
+
|
5362
|
+
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
5363
|
+
cb(cur, "bqkv", il);
|
5364
|
+
|
5365
|
+
struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
|
5366
|
+
struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
|
5367
|
+
struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*(n_embd)));
|
5368
|
+
|
5369
|
+
cb(Qcur, "Qcur", il);
|
5370
|
+
cb(Kcur, "Kcur", il);
|
5102
5371
|
cb(Vcur, "Vcur", il);
|
5103
5372
|
|
5373
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
5374
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
5375
|
+
|
5376
|
+
// using mode = 2 for neox mode
|
5104
5377
|
Qcur = ggml_rope_custom(
|
5105
|
-
ctx0,
|
5106
|
-
|
5107
|
-
ext_factor, attn_factor, beta_fast, beta_slow
|
5378
|
+
ctx0, Qcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx,
|
5379
|
+
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
5108
5380
|
);
|
5109
5381
|
cb(Qcur, "Qcur", il);
|
5110
5382
|
|
5111
5383
|
Kcur = ggml_rope_custom(
|
5112
|
-
ctx0,
|
5113
|
-
|
5114
|
-
ext_factor, attn_factor, beta_fast, beta_slow
|
5384
|
+
ctx0, Kcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx,
|
5385
|
+
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
5115
5386
|
);
|
5116
5387
|
cb(Kcur, "Kcur", il);
|
5117
5388
|
|
5118
5389
|
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
|
5119
5390
|
|
5120
|
-
cur = llm_build_kqv(ctx0, hparams, kv_self,
|
5391
|
+
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
|
5121
5392
|
model.layers[il].wo, NULL,
|
5122
|
-
Qcur,
|
5393
|
+
Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
5123
5394
|
cb(cur, "kqv_out", il);
|
5124
5395
|
}
|
5125
5396
|
|
5126
5397
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
5127
5398
|
cb(ffn_inp, "ffn_inp", il);
|
5128
5399
|
|
5129
|
-
// feed-forward
|
5400
|
+
// feed-forward forward
|
5130
5401
|
{
|
5131
5402
|
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
5132
|
-
model.layers[il].ffn_norm,
|
5133
|
-
|
5134
|
-
LLM_NORM, cb, il);
|
5403
|
+
model.layers[il].ffn_norm, NULL,
|
5404
|
+
LLM_NORM_RMS, cb, il);
|
5135
5405
|
cb(cur, "ffn_norm", il);
|
5136
5406
|
|
5137
5407
|
cur = llm_build_ffn(ctx0, cur,
|
@@ -5152,9 +5422,8 @@ struct llm_build_context {
|
|
5152
5422
|
cur = inpL;
|
5153
5423
|
|
5154
5424
|
cur = llm_build_norm(ctx0, cur, hparams,
|
5155
|
-
model.output_norm,
|
5156
|
-
|
5157
|
-
LLM_NORM, cb, -1);
|
5425
|
+
model.output_norm, NULL,
|
5426
|
+
LLM_NORM_RMS, cb, -1);
|
5158
5427
|
cb(cur, "result_norm", -1);
|
5159
5428
|
|
5160
5429
|
// lm_head
|
@@ -5165,26 +5434,23 @@ struct llm_build_context {
|
|
5165
5434
|
|
5166
5435
|
return gf;
|
5167
5436
|
}
|
5168
|
-
|
5169
|
-
struct ggml_cgraph * build_qwen() {
|
5437
|
+
struct ggml_cgraph * build_phi2() {
|
5170
5438
|
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
5171
5439
|
|
5172
5440
|
struct ggml_tensor * cur;
|
5441
|
+
struct ggml_tensor * attn_norm_output;
|
5442
|
+
struct ggml_tensor * ffn_output;
|
5173
5443
|
struct ggml_tensor * inpL;
|
5174
5444
|
|
5175
5445
|
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
|
5176
5446
|
cb(inpL, "inp_embd", -1);
|
5177
5447
|
|
5178
5448
|
// inp_pos - contains the positions
|
5179
|
-
struct ggml_tensor * inp_pos= ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
5449
|
+
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
5180
5450
|
cb(inp_pos, "inp_pos", -1);
|
5181
5451
|
|
5182
|
-
// KQ_scale
|
5183
|
-
struct ggml_tensor * KQ_scale= ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
5184
|
-
cb(KQ_scale, "KQ_scale", -1);
|
5185
|
-
|
5186
5452
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
5187
|
-
struct ggml_tensor * KQ_mask= ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
5453
|
+
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
5188
5454
|
cb(KQ_mask, "KQ_mask", -1);
|
5189
5455
|
|
5190
5456
|
// shift the entire K-cache if needed
|
@@ -5193,24 +5459,23 @@ struct llm_build_context {
|
|
5193
5459
|
}
|
5194
5460
|
|
5195
5461
|
for (int il = 0; il < n_layer; ++il) {
|
5196
|
-
|
5197
|
-
|
5198
|
-
|
5199
|
-
|
5200
|
-
|
5201
|
-
cb(cur, "attn_norm", il);
|
5462
|
+
attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
|
5463
|
+
model.layers[il].attn_norm,
|
5464
|
+
model.layers[il].attn_norm_b,
|
5465
|
+
LLM_NORM, cb, il);
|
5466
|
+
cb(attn_norm_output, "attn_norm", il);
|
5202
5467
|
|
5203
5468
|
// self-attention
|
5204
5469
|
{
|
5205
|
-
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv,
|
5470
|
+
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output);
|
5206
5471
|
cb(cur, "wqkv", il);
|
5207
5472
|
|
5208
5473
|
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
5209
5474
|
cb(cur, "bqkv", il);
|
5210
5475
|
|
5211
|
-
struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,
|
5212
|
-
struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur,
|
5213
|
-
struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur,
|
5476
|
+
struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
|
5477
|
+
struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
|
5478
|
+
struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
|
5214
5479
|
|
5215
5480
|
cb(Qcur, "Qcur", il);
|
5216
5481
|
cb(Kcur, "Kcur", il);
|
@@ -5219,61 +5484,60 @@ struct llm_build_context {
|
|
5219
5484
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
5220
5485
|
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
5221
5486
|
|
5222
|
-
// using mode = 2 for neox mode
|
5223
5487
|
Qcur = ggml_rope_custom(
|
5224
|
-
ctx0, Qcur, inp_pos,
|
5488
|
+
ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
|
5225
5489
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
5226
5490
|
);
|
5227
5491
|
cb(Qcur, "Qcur", il);
|
5228
5492
|
|
5493
|
+
// with phi2, we scale the Q to avoid precision issues
|
5494
|
+
// ref: https://github.com/ml-explore/mlx-examples/blob/08e862336ade809bc37d1035f94b359e7d1a5152/phi2/phi2.py#L64-L66
|
5495
|
+
Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head)));
|
5496
|
+
cb(Qcur, "Qcur", il);
|
5497
|
+
|
5229
5498
|
Kcur = ggml_rope_custom(
|
5230
|
-
ctx0, Kcur, inp_pos,
|
5499
|
+
ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
|
5231
5500
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
5232
5501
|
);
|
5233
5502
|
cb(Kcur, "Kcur", il);
|
5234
5503
|
|
5235
5504
|
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
|
5236
5505
|
|
5237
|
-
cur = llm_build_kqv(ctx0, hparams, kv_self,
|
5238
|
-
model.layers[il].wo,
|
5239
|
-
Qcur,
|
5506
|
+
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
|
5507
|
+
model.layers[il].wo, model.layers[il].bo,
|
5508
|
+
Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f, cb, il);
|
5240
5509
|
cb(cur, "kqv_out", il);
|
5241
5510
|
}
|
5242
5511
|
|
5243
|
-
|
5244
|
-
cb(ffn_inp, "ffn_inp", il);
|
5245
|
-
|
5246
|
-
// feed-forward forward
|
5512
|
+
// FF
|
5247
5513
|
{
|
5248
|
-
|
5249
|
-
model.layers[il].
|
5250
|
-
|
5251
|
-
|
5252
|
-
|
5253
|
-
|
5254
|
-
model.layers[il].ffn_up, NULL,
|
5255
|
-
model.layers[il].ffn_gate, NULL,
|
5256
|
-
model.layers[il].ffn_down, NULL,
|
5257
|
-
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
5258
|
-
cb(cur, "ffn_out", il);
|
5514
|
+
ffn_output = llm_build_ffn(ctx0, attn_norm_output,
|
5515
|
+
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
|
5516
|
+
NULL, NULL,
|
5517
|
+
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
5518
|
+
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
5519
|
+
cb(ffn_output, "ffn_out", il);
|
5259
5520
|
}
|
5260
5521
|
|
5261
|
-
cur = ggml_add(ctx0, cur,
|
5522
|
+
cur = ggml_add(ctx0, cur, ffn_output);
|
5523
|
+
cb(cur, "l_out", il);
|
5524
|
+
|
5525
|
+
cur = ggml_add(ctx0, cur, inpL);
|
5262
5526
|
cb(cur, "l_out", il);
|
5263
5527
|
|
5264
|
-
// input for next layer
|
5265
5528
|
inpL = cur;
|
5266
5529
|
}
|
5267
5530
|
|
5268
|
-
cur = inpL
|
5269
|
-
|
5270
|
-
|
5271
|
-
|
5272
|
-
LLM_NORM_RMS, cb, -1);
|
5531
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
5532
|
+
model.output_norm,
|
5533
|
+
model.output_norm_b,
|
5534
|
+
LLM_NORM, cb, -1);
|
5273
5535
|
cb(cur, "result_norm", -1);
|
5274
5536
|
|
5275
|
-
// lm_head
|
5276
5537
|
cur = ggml_mul_mat(ctx0, model.output, cur);
|
5538
|
+
cb(cur, "result_output_no_bias", -1);
|
5539
|
+
|
5540
|
+
cur = ggml_add(ctx0, cur, model.output_b);
|
5277
5541
|
cb(cur, "result_output", -1);
|
5278
5542
|
|
5279
5543
|
ggml_build_forward_expand(gf, cur);
|
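The build_phi2 graph above differs from the llama-style layers in two ways: the query is pre-scaled by 1/sqrt(n_embd_head) (so kq_scale is 1.0f), and the residual is parallel, i.e. the attention output and the FFN output (both computed from the same attn_norm_output) are added back to the layer input. A scalar sketch of that dataflow, with toy stand-ins for the attention and FFN blocks:

    #include <cstdio>

    // scalars stand in for tensors; the norm is omitted for brevity
    static float phi2_layer(float inp, float (*attn)(float), float (*ffn)(float)) {
        const float normed   = inp;           // attn_norm(inp) in the real graph
        const float attn_out = attn(normed);
        const float ffn_out  = ffn(normed);   // FFN sees the norm output, not attn_out
        return attn_out + ffn_out + inp;      // cur = cur + ffn_output; cur = cur + inpL
    }

    static float toy_attn(float x) { return 0.5f  * x; }
    static float toy_ffn (float x) { return 0.25f * x; }

    int main() {
        std::printf("layer out: %f\n", phi2_layer(1.0f, toy_attn, toy_ffn));
        return 0;
    }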
@@ -5293,7 +5557,7 @@ enum llm_offload_func_e {
|
|
5293
5557
|
OFFLOAD_FUNC_FRC, // force offload
|
5294
5558
|
OFFLOAD_FUNC_KQV,
|
5295
5559
|
OFFLOAD_FUNC_NR,
|
5296
|
-
OFFLOAD_FUNC_EMB,
|
5560
|
+
OFFLOAD_FUNC_EMB, // embeddings
|
5297
5561
|
OFFLOAD_FUNC_OUT,
|
5298
5562
|
};
|
5299
5563
|
|
@@ -5378,7 +5642,6 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
|
|
5378
5642
|
{ "pos_embd", OFFLOAD_FUNC_NR },
|
5379
5643
|
|
5380
5644
|
{ "inp_pos", OFFLOAD_FUNC_FRC }, // this is often used for KQ ops (e.g. rope)
|
5381
|
-
{ "KQ_scale", OFFLOAD_FUNC_FRC },
|
5382
5645
|
{ "KQ_mask", OFFLOAD_FUNC_FRC },
|
5383
5646
|
{ "K_shift", OFFLOAD_FUNC_FRC },
|
5384
5647
|
|
@@ -5445,9 +5708,24 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
|
|
5445
5708
|
{ "ffn_relu", OFFLOAD_FUNC },
|
5446
5709
|
{ "ffn_sqr(relu)", OFFLOAD_FUNC },
|
5447
5710
|
|
5711
|
+
{ "ffn_moe_logits", OFFLOAD_FUNC },
|
5712
|
+
{ "ffn_moe_probs", OFFLOAD_FUNC },
|
5713
|
+
{ "ffn_moe_argsort", OFFLOAD_FUNC },
|
5714
|
+
{ "ffn_moe_weights", OFFLOAD_FUNC },
|
5715
|
+
{ "ffn_moe_weights_sum", OFFLOAD_FUNC },
|
5716
|
+
{ "ffn_moe_weights_norm", OFFLOAD_FUNC },
|
5717
|
+
{ "ffn_moe_weighted", OFFLOAD_FUNC },
|
5718
|
+
{ "ffn_moe_up", OFFLOAD_FUNC },
|
5719
|
+
{ "ffn_moe_gate", OFFLOAD_FUNC },
|
5720
|
+
{ "ffn_moe_silu", OFFLOAD_FUNC },
|
5721
|
+
{ "ffn_moe_gate_par", OFFLOAD_FUNC },
|
5722
|
+
{ "ffn_moe_down", OFFLOAD_FUNC },
|
5723
|
+
{ "ffn_moe_out", OFFLOAD_FUNC },
|
5724
|
+
|
5448
5725
|
{ "l_out", OFFLOAD_FUNC },
|
5449
5726
|
|
5450
5727
|
{ "result_norm", OFFLOAD_FUNC_EMB },
|
5728
|
+
{ "result_output_no_bias", OFFLOAD_FUNC_EMB },
|
5451
5729
|
{ "result_output", OFFLOAD_FUNC_OUT },
|
5452
5730
|
};
|
5453
5731
|
|
@@ -5465,11 +5743,10 @@ static struct ggml_cgraph * llama_build_graph(
|
|
5465
5743
|
bool alloc_inp_tokens = false;
|
5466
5744
|
bool alloc_inp_embd = false;
|
5467
5745
|
bool alloc_inp_pos = false;
|
5468
|
-
bool alloc_inp_KQ_scale = false;
|
5469
5746
|
bool alloc_inp_KQ_mask = false;
|
5470
5747
|
bool alloc_inp_K_shift = false;
|
5471
5748
|
|
5472
|
-
#
|
5749
|
+
#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
|
5473
5750
|
const bool do_offload = true;
|
5474
5751
|
#else
|
5475
5752
|
const bool do_offload = true; // TODO: set to false after finishing refactoring
|
@@ -5497,7 +5774,7 @@ static struct ggml_cgraph * llama_build_graph(
|
|
5497
5774
|
if (!ggml_allocr_is_measure(lctx.alloc) && batch.token) {
|
5498
5775
|
const int64_t n_tokens = cur->ne[0];
|
5499
5776
|
|
5500
|
-
|
5777
|
+
ggml_backend_tensor_set(cur, batch.token, 0, n_tokens*ggml_element_size(cur));
|
5501
5778
|
}
|
5502
5779
|
|
5503
5780
|
alloc_inp_tokens = true;
|
@@ -5510,7 +5787,7 @@ static struct ggml_cgraph * llama_build_graph(
|
|
5510
5787
|
const int64_t n_embd = cur->ne[0];
|
5511
5788
|
const int64_t n_tokens = cur->ne[1];
|
5512
5789
|
|
5513
|
-
|
5790
|
+
ggml_backend_tensor_set(cur, batch.embd, 0, n_tokens*n_embd*ggml_element_size(cur));
|
5514
5791
|
}
|
5515
5792
|
|
5516
5793
|
alloc_inp_embd = true;
|
@@ -5522,27 +5799,13 @@ static struct ggml_cgraph * llama_build_graph(
|
|
5522
5799
|
if (!ggml_allocr_is_measure(lctx.alloc) && batch.pos) {
|
5523
5800
|
const int64_t n_tokens = cur->ne[0];
|
5524
5801
|
|
5525
|
-
int32_t
|
5526
|
-
|
5527
|
-
for (int i = 0; i < n_tokens; ++i) {
|
5528
|
-
data[i] = batch.pos[i];
|
5529
|
-
}
|
5802
|
+
static_assert(std::is_same<llama_pos, int32_t>::value, "llama_pos must be int32_t");
|
5803
|
+
ggml_backend_tensor_set(cur, batch.pos, 0, n_tokens*ggml_element_size(cur));
|
5530
5804
|
}
|
5531
5805
|
|
5532
5806
|
alloc_inp_pos = true;
|
5533
5807
|
}
|
5534
5808
|
|
5535
|
-
if (!alloc_inp_KQ_scale && strcmp(name, "KQ_scale") == 0) {
|
5536
|
-
ggml_allocr_alloc(lctx.alloc, cur);
|
5537
|
-
|
5538
|
-
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
5539
|
-
const int64_t n_embd_head = model.hparams.n_embd_head();
|
5540
|
-
ggml_set_f32(cur, 1.0f/sqrtf(float(n_embd_head)));
|
5541
|
-
}
|
5542
|
-
|
5543
|
-
alloc_inp_KQ_scale = true;
|
5544
|
-
}
|
5545
|
-
|
5546
5809
|
if (!alloc_inp_KQ_mask && strcmp(name, "KQ_mask") == 0) {
|
5547
5810
|
ggml_allocr_alloc(lctx.alloc, cur);
|
5548
5811
|
|
@@ -5550,8 +5813,13 @@ static struct ggml_cgraph * llama_build_graph(
|
|
5550
5813
|
const int64_t n_kv = cur->ne[0];
|
5551
5814
|
const int64_t n_tokens = cur->ne[1];
|
5552
5815
|
|
5553
|
-
float * data
|
5554
|
-
|
5816
|
+
float * data;
|
5817
|
+
if (ggml_backend_buffer_is_host(cur->buffer)) {
|
5818
|
+
data = (float *) cur->data;
|
5819
|
+
} else {
|
5820
|
+
lctx.buf_copy.resize(ggml_nbytes(cur));
|
5821
|
+
data = (float *) lctx.buf_copy.data();
|
5822
|
+
}
|
5555
5823
|
|
5556
5824
|
for (int h = 0; h < 1; ++h) {
|
5557
5825
|
for (int j = 0; j < n_tokens; ++j) {
|
@@ -5559,12 +5827,20 @@ static struct ggml_cgraph * llama_build_graph(
|
|
5559
5827
|
const llama_seq_id seq_id = batch.seq_id[j][0];
|
5560
5828
|
|
5561
5829
|
for (int i = 0; i < n_kv; ++i) {
|
5830
|
+
float f;
|
5562
5831
|
if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
|
5563
|
-
|
5832
|
+
f = -INFINITY;
|
5833
|
+
} else {
|
5834
|
+
f = 0;
|
5564
5835
|
}
|
5836
|
+
data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
|
5565
5837
|
}
|
5566
5838
|
}
|
5567
5839
|
}
|
5840
|
+
|
5841
|
+
if (data != cur->data) {
|
5842
|
+
ggml_backend_tensor_set(cur, data, 0, ggml_nbytes(cur));
|
5843
|
+
}
|
5568
5844
|
}
|
5569
5845
|
|
5570
5846
|
alloc_inp_KQ_mask = true;
|
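The KQ_mask and K_shift paths above now handle backends whose buffers are not CPU-addressable: the values are written either directly into the tensor data (host buffer) or into lctx.buf_copy and then uploaded with ggml_backend_tensor_set. A standalone sketch of that staging pattern; the types and the backend_tensor_set helper are stand-ins, not the ggml-backend API:

    #include <cstddef>
    #include <cstdio>
    #include <cstring>
    #include <vector>

    struct fake_tensor { std::vector<float> storage; bool host_buffer; };

    // models a device upload
    static void backend_tensor_set(fake_tensor & t, const float * data, size_t n) {
        std::memcpy(t.storage.data(), data, n * sizeof(float));
    }

    int main() {
        fake_tensor mask{std::vector<float>(16), /*host_buffer=*/false};
        std::vector<float> staging;

        float * data = nullptr;
        if (mask.host_buffer) {
            data = mask.storage.data();           // write in place
        } else {
            staging.resize(mask.storage.size());  // lctx.buf_copy in the real code
            data = staging.data();
        }
        for (size_t i = 0; i < mask.storage.size(); ++i) data[i] = 0.0f;

        if (data != mask.storage.data()) {
            backend_tensor_set(mask, data, mask.storage.size());
        }
        std::puts("mask uploaded");
        return 0;
    }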
@@ -5576,11 +5852,21 @@ static struct ggml_cgraph * llama_build_graph(
|
|
5576
5852
|
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
5577
5853
|
const int64_t n_ctx = cur->ne[0];
|
5578
5854
|
|
5579
|
-
int32_t * data
|
5855
|
+
int32_t * data;
|
5856
|
+
if (ggml_backend_buffer_is_host(cur->buffer)) {
|
5857
|
+
data = (int32_t *) cur->data;
|
5858
|
+
} else {
|
5859
|
+
lctx.buf_copy.resize(ggml_nbytes(cur));
|
5860
|
+
data = (int32_t *) lctx.buf_copy.data();
|
5861
|
+
}
|
5580
5862
|
|
5581
5863
|
for (int i = 0; i < n_ctx; ++i) {
|
5582
5864
|
data[i] = lctx.kv_self.cells[i].delta;
|
5583
5865
|
}
|
5866
|
+
|
5867
|
+
if (data != cur->data) {
|
5868
|
+
ggml_backend_tensor_set(cur, data, 0, ggml_nbytes(cur));
|
5869
|
+
}
|
5584
5870
|
}
|
5585
5871
|
|
5586
5872
|
alloc_inp_K_shift = true;
|
@@ -5617,7 +5903,7 @@ static struct ggml_cgraph * llama_build_graph(
|
|
5617
5903
|
static const std::unordered_map<llm_offload_func_e, std::string, std::hash<int>> k_offload_func_name = {
|
5618
5904
|
{ OFFLOAD_FUNC_NOP, "CPU" },
|
5619
5905
|
{ OFFLOAD_FUNC_OUT, "CPU" },
|
5620
|
-
#
|
5906
|
+
#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
|
5621
5907
|
{ OFFLOAD_FUNC, "GPU (CUDA)" },
|
5622
5908
|
{ OFFLOAD_FUNC_FRC, "GPU (CUDA) FRC" },
|
5623
5909
|
{ OFFLOAD_FUNC_KQV, "GPU (CUDA) KQV" },
|
@@ -5690,7 +5976,7 @@ static struct ggml_cgraph * llama_build_graph(
|
|
5690
5976
|
offload_func_t func = ggml_offload_nop;
|
5691
5977
|
|
5692
5978
|
// this is needed for compatibility with Metal for example
|
5693
|
-
#
|
5979
|
+
#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
|
5694
5980
|
static offload_func_t ggml_offload_gpu = ggml_cuda_assign_buffers_no_alloc;
|
5695
5981
|
#else
|
5696
5982
|
static offload_func_t ggml_offload_gpu = ggml_offload_nop;
|
@@ -5764,6 +6050,10 @@ static struct ggml_cgraph * llama_build_graph(
|
|
5764
6050
|
{
|
5765
6051
|
result = llm.build_qwen();
|
5766
6052
|
} break;
|
6053
|
+
case LLM_ARCH_PHI2:
|
6054
|
+
{
|
6055
|
+
result = llm.build_phi2();
|
6056
|
+
} break;
|
5767
6057
|
default:
|
5768
6058
|
GGML_ASSERT(false);
|
5769
6059
|
}
|
@@ -5841,7 +6131,7 @@ static int llama_decode_internal(
|
|
5841
6131
|
const int64_t n_embd = hparams.n_embd;
|
5842
6132
|
const int64_t n_vocab = hparams.n_vocab;
|
5843
6133
|
|
5844
|
-
// helpers for smoother batch API
|
6134
|
+
// helpers for smoother batch API transition
|
5845
6135
|
// after deprecating the llama_eval calls, these will be removed
|
5846
6136
|
std::vector<llama_pos> pos;
|
5847
6137
|
|
@@ -5897,18 +6187,23 @@ static int llama_decode_internal(
|
|
5897
6187
|
|
5898
6188
|
ggml_allocr_alloc_graph(lctx.alloc, gf);
|
5899
6189
|
|
5900
|
-
|
5901
|
-
struct ggml_tensor *
|
5902
|
-
|
5903
|
-
GGML_ASSERT(strcmp(res->name, "result_output") == 0);
|
5904
|
-
GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
|
6190
|
+
// the output is always the last tensor in the graph
|
6191
|
+
struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
|
6192
|
+
GGML_ASSERT(strcmp(res->name, "result_output") == 0);
|
5905
6193
|
|
6194
|
+
// the embeddings could be the second to last tensor, or the third to last tensor
|
6195
|
+
struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
|
6196
|
+
if (strcmp(embeddings->name, "result_norm") != 0) {
|
6197
|
+
embeddings = gf->nodes[gf->n_nodes - 3];
|
6198
|
+
GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
|
6199
|
+
}
|
5906
6200
|
|
5907
|
-
#
|
6201
|
+
#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
|
6202
|
+
char * buf_alloc_base = (char *)ggml_backend_buffer_get_base(lctx.buf_alloc);
|
5908
6203
|
for (int i = 0; i < gf->n_leafs; i++) {
|
5909
6204
|
ggml_tensor * node = gf->leafs[i];
|
5910
6205
|
if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) {
|
5911
|
-
ggml_cuda_assign_scratch_offset(node, (char*)node->data -
|
6206
|
+
ggml_cuda_assign_scratch_offset(node, (char *)node->data - buf_alloc_base);
|
5912
6207
|
ggml_cuda_copy_to_device(node);
|
5913
6208
|
}
|
5914
6209
|
}
|
@@ -5916,7 +6211,7 @@ static int llama_decode_internal(
|
|
5916
6211
|
for (int i = 0; i < gf->n_nodes; i++) {
|
5917
6212
|
ggml_tensor * node = gf->nodes[i];
|
5918
6213
|
if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) {
|
5919
|
-
ggml_cuda_assign_scratch_offset(node, (char*)node->data -
|
6214
|
+
ggml_cuda_assign_scratch_offset(node, (char *)node->data - buf_alloc_base);
|
5920
6215
|
}
|
5921
6216
|
}
|
5922
6217
|
|
@@ -5943,23 +6238,23 @@ static int llama_decode_internal(
|
|
5943
6238
|
n_threads = 1;
|
5944
6239
|
}
|
5945
6240
|
|
5946
|
-
#
|
6241
|
+
#ifdef GGML_USE_MPI
|
5947
6242
|
const int64_t n_layer = hparams.n_layer;
|
5948
6243
|
ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
|
5949
6244
|
#endif
|
5950
6245
|
|
5951
6246
|
#ifdef GGML_USE_METAL
|
5952
|
-
if (lctx.
|
5953
|
-
|
5954
|
-
ggml_metal_graph_compute(lctx.ctx_metal, gf);
|
5955
|
-
} else {
|
5956
|
-
ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
|
6247
|
+
if (ggml_backend_is_metal(lctx.backend)) {
|
6248
|
+
ggml_backend_metal_set_n_cb(lctx.backend, n_threads);
|
5957
6249
|
}
|
5958
|
-
#else
|
5959
|
-
ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
|
5960
6250
|
#endif
|
5961
6251
|
|
5962
|
-
|
6252
|
+
if (ggml_backend_is_cpu(lctx.backend)) {
|
6253
|
+
ggml_backend_cpu_set_n_threads(lctx.backend, n_threads);
|
6254
|
+
}
|
6255
|
+
ggml_backend_graph_compute(lctx.backend, gf);
|
6256
|
+
|
6257
|
+
#ifdef GGML_USE_MPI
|
5963
6258
|
ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
|
5964
6259
|
#endif
|
5965
6260
|
|
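The compute path above collapses the old Metal-vs-CPU branch into one generic call: backend-specific knobs (Metal command buffers, CPU threads) are set first, then ggml_backend_graph_compute runs the graph regardless of backend. A small sketch of that dispatch shape; the backend struct and graph_compute here are stand-ins, not the ggml-backend API:

    #include <cstdio>
    #include <string>

    struct backend { std::string kind; int n_threads; };

    static void graph_compute(backend & b) {
        std::printf("computing graph on %s with %d threads/cbs\n", b.kind.c_str(), b.n_threads);
    }

    int main() {
        backend b{"cpu", 1};
        const int n_threads = 8;

        if (b.kind == "metal") b.n_threads = n_threads;  // ggml_backend_metal_set_n_cb
        if (b.kind == "cpu")   b.n_threads = n_threads;  // ggml_backend_cpu_set_n_threads

        graph_compute(b);                                // ggml_backend_graph_compute
        return 0;
    }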
@@ -5997,20 +6292,37 @@ static int llama_decode_internal(
|
|
5997
6292
|
{
|
5998
6293
|
auto & logits_out = lctx.logits;
|
5999
6294
|
|
6295
|
+
#ifndef NDEBUG
|
6296
|
+
auto & logits_valid = lctx.logits_valid;
|
6297
|
+
logits_valid.clear();
|
6298
|
+
logits_valid.resize(n_tokens);
|
6299
|
+
|
6300
|
+
logits_out.clear();
|
6301
|
+
#endif
|
6302
|
+
|
6000
6303
|
if (batch.logits) {
|
6001
6304
|
logits_out.resize(n_vocab * n_tokens);
|
6002
6305
|
for (uint32_t i = 0; i < n_tokens; i++) {
|
6003
6306
|
if (batch.logits[i] == 0) {
|
6004
6307
|
continue;
|
6005
6308
|
}
|
6006
|
-
|
6309
|
+
ggml_backend_tensor_get(res, logits_out.data() + (n_vocab*i), (n_vocab*i)*sizeof(float), n_vocab*sizeof(float));
|
6310
|
+
#ifndef NDEBUG
|
6311
|
+
logits_valid[i] = true;
|
6312
|
+
#endif
|
6007
6313
|
}
|
6008
6314
|
} else if (lctx.logits_all) {
|
6009
6315
|
logits_out.resize(n_vocab * n_tokens);
|
6010
|
-
|
6316
|
+
ggml_backend_tensor_get(res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float));
|
6317
|
+
#ifndef NDEBUG
|
6318
|
+
std::fill(logits_valid.begin(), logits_valid.end(), true);
|
6319
|
+
#endif
|
6011
6320
|
} else {
|
6012
6321
|
logits_out.resize(n_vocab);
|
6013
|
-
|
6322
|
+
ggml_backend_tensor_get(res, logits_out.data(), (n_vocab*(n_tokens - 1))*sizeof(float), n_vocab*sizeof(float));
|
6323
|
+
#ifndef NDEBUG
|
6324
|
+
logits_valid[0] = true;
|
6325
|
+
#endif
|
6014
6326
|
}
|
6015
6327
|
}
|
6016
6328
|
|
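The three ggml_backend_tensor_get calls above differ only in the byte offset and size they read from the logits tensor. A worked example of that arithmetic with illustrative numbers (n_vocab = 32000 and n_tokens = 4 are not values taken from the diff):

    #include <cstddef>
    #include <cstdio>

    int main() {
        const size_t n_vocab  = 32000;   // illustrative
        const size_t n_tokens = 4;       // illustrative

        // per-token rows of the logits tensor (the batch.logits branch)
        for (size_t i = 0; i < n_tokens; i++) {
            printf("row %zu: offset %zu bytes, size %zu bytes\n",
                   i, (n_vocab*i)*sizeof(float), n_vocab*sizeof(float));
        }
        // all rows at once (the logits_all branch): offset 0
        printf("all rows: %zu bytes\n", n_vocab*n_tokens*sizeof(float));
        // last row only (the final branch)
        printf("last row offset: %zu bytes\n", (n_vocab*(n_tokens - 1))*sizeof(float));
        return 0;
    }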
@@ -6019,7 +6331,7 @@ static int llama_decode_internal(
|
|
6019
6331
|
auto & embedding_out = lctx.embedding;
|
6020
6332
|
|
6021
6333
|
embedding_out.resize(n_embd);
|
6022
|
-
|
6334
|
+
ggml_backend_tensor_get(embeddings, embedding_out.data(), (n_embd*(n_tokens - 1))*sizeof(float), n_embd*sizeof(float));
|
6023
6335
|
}
|
6024
6336
|
|
6025
6337
|
// measure the performance only for the single-token evals
|
@@ -6620,12 +6932,12 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
|
|
6620
6932
|
|
6621
6933
|
// loop over the text
|
6622
6934
|
while (true) {
|
6623
|
-
// find the first
|
6935
|
+
// find the first occurrence of a given special token in this fragment
|
6624
6936
|
// passing offset argument only limits the "search area" but match coordinates
|
6625
6937
|
// are still relative to the source full raw_text
|
6626
6938
|
auto match = raw_text->find(special_token, raw_text_base_offset);
|
6627
6939
|
|
6628
|
-
// no
|
6940
|
+
// no occurrences found, stop processing this fragment for a given special token
|
6629
6941
|
if (match == std::string::npos) break;
|
6630
6942
|
|
6631
6943
|
// check if match is within bounds of offset <-> length
|
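The comment above notes that the offset passed to find() only limits where the search starts, while the returned index stays relative to the full string. A tiny self-contained example of that behaviour:

    #include <cassert>
    #include <string>

    int main() {
        const std::string raw_text = "foo <s> bar <s> baz";   // illustrative text
        const std::string special  = "<s>";

        // searching from offset 5 skips the occurrence at index 4, but the
        // returned index (12) is still relative to the start of raw_text
        const size_t match = raw_text.find(special, 5);
        assert(match == 12);
        return 0;
    }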
@@ -7498,7 +7810,7 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
|
|
7498
7810
|
|
7499
7811
|
for (size_t i = 0; i < candidates->size; ++i) {
|
7500
7812
|
const llama_token id = candidates->data[i].id;
|
7501
|
-
const std::string
|
7813
|
+
const std::string piece = llama_token_to_piece(ctx, id);
|
7502
7814
|
if (id == eos) {
|
7503
7815
|
if (!allow_eos) {
|
7504
7816
|
candidates->data[i].logit = -INFINITY;
|
@@ -7710,7 +8022,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
|
|
7710
8022
|
GGML_ASSERT(false);
|
7711
8023
|
}
|
7712
8024
|
|
7713
|
-
const std::string
|
8025
|
+
const std::string piece = llama_token_to_piece(ctx, token);
|
7714
8026
|
|
7715
8027
|
// Note terminating 0 in decoded string
|
7716
8028
|
const auto decoded = decode_utf8(piece, grammar->partial_utf8);
|
@@ -7824,7 +8136,7 @@ struct llama_beam_search_data {
|
|
7824
8136
|
}
|
7825
8137
|
|
7826
8138
|
// Min-heaps are used to efficiently collect the top-k elements (k=n_beams).
|
7827
|
-
// The
|
8139
|
+
// The repetitive patterns below reflect the 2 stages of heaps:
|
7828
8140
|
// * Gather elements until the vector is full, then call std::make_heap() on it.
|
7829
8141
|
// * If the heap is full and a new element is found that should be included, pop the
|
7830
8142
|
// least element to the back(), replace it with the new, then push it into the heap.
|
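The comment block above describes a two-stage min-heap pattern for collecting the top-k elements. A standalone sketch of the same pattern on plain floats (top_k is a hypothetical helper, not the beam-search code itself):

    #include <algorithm>
    #include <vector>

    static std::vector<float> top_k(const std::vector<float> & values, size_t k) {
        if (k == 0) return {};
        auto min_first = [](float a, float b) { return a > b; };   // min-heap: smallest at front
        std::vector<float> heap;
        heap.reserve(k);
        for (const float v : values) {
            if (heap.size() < k) {
                // stage 1: gather elements until the vector is full, then heapify
                heap.push_back(v);
                if (heap.size() == k) {
                    std::make_heap(heap.begin(), heap.end(), min_first);
                }
            } else if (v > heap.front()) {
                // stage 2: move the least element to the back, replace it, re-push
                std::pop_heap(heap.begin(), heap.end(), min_first);
                heap.back() = v;
                std::push_heap(heap.begin(), heap.end(), min_first);
            }
        }
        return heap;
    }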
@@ -7977,12 +8289,6 @@ void llama_beam_search(llama_context * ctx,
|
|
7977
8289
|
// quantization
|
7978
8290
|
//
|
7979
8291
|
|
7980
|
-
template <typename T>
|
7981
|
-
struct no_init {
|
7982
|
-
T value;
|
7983
|
-
no_init() { /* do nothing */ }
|
7984
|
-
};
|
7985
|
-
|
7986
8292
|
struct quantize_state_internal {
|
7987
8293
|
const llama_model & model;
|
7988
8294
|
const llama_model_quantize_params * params;
|
@@ -8062,11 +8368,9 @@ static void llama_convert_tensor_internal(
|
|
8062
8368
|
workers.clear();
|
8063
8369
|
}
|
8064
8370
|
|
8065
|
-
static ggml_type get_k_quant_type(
|
8066
|
-
quantize_state_internal & qs,
|
8067
|
-
ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype
|
8068
|
-
) {
|
8371
|
+
static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
|
8069
8372
|
const std::string name = ggml_get_name(tensor);
|
8373
|
+
|
8070
8374
|
// TODO: avoid hardcoded tensor names - use the TN_* constants
|
8071
8375
|
const llm_arch arch = qs.model.arch;
|
8072
8376
|
const auto tn = LLM_TN(arch);
|
@@ -8100,7 +8404,18 @@ static ggml_type get_k_quant_type(
|
|
8100
8404
|
// nearly negligible increase in model size by quantizing this tensor with more bits:
|
8101
8405
|
if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
|
8102
8406
|
}
|
8407
|
+
if (qs.model.hparams.n_expert == 8) {
|
8408
|
+
// for the 8-expert model, bumping this to Q8_0 trades just ~128MB
|
8409
|
+
// TODO: explore better strategies
|
8410
|
+
new_type = GGML_TYPE_Q8_0;
|
8411
|
+
}
|
8103
8412
|
++qs.i_attention_wv;
|
8413
|
+
} else if (name.find("attn_k.weight") != std::string::npos) {
|
8414
|
+
if (qs.model.hparams.n_expert == 8) {
|
8415
|
+
// for the 8-expert model, bumping this to Q8_0 trades just ~128MB
|
8416
|
+
// TODO: explore better strategies
|
8417
|
+
new_type = GGML_TYPE_Q8_0;
|
8418
|
+
}
|
8104
8419
|
} else if (name.find("ffn_down.weight") != std::string::npos) {
|
8105
8420
|
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
|
8106
8421
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
|
@@ -8216,9 +8531,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
8216
8531
|
#endif
|
8217
8532
|
|
8218
8533
|
llama_model_loader ml(fname_inp, use_mmap, NULL);
|
8219
|
-
|
8220
|
-
ml.mapping.reset(new llama_mmap(&ml.file, /* prefetch */ 0, ggml_is_numa()));
|
8221
|
-
}
|
8534
|
+
ml.init_mapping(false); // no prefetching?
|
8222
8535
|
|
8223
8536
|
llama_model model;
|
8224
8537
|
llm_load_arch(ml, model);
|
@@ -8309,10 +8622,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
8309
8622
|
bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?
|
8310
8623
|
|
8311
8624
|
// quantize only 2D tensors
|
8312
|
-
quantize &= (tensor
|
8625
|
+
quantize &= (ggml_n_dims(tensor) == 2);
|
8313
8626
|
quantize &= params->quantize_output_tensor || name != "output.weight";
|
8314
8627
|
quantize &= !params->only_copy;
|
8315
8628
|
|
8629
|
+
// do not quantize expert gating tensors
|
8630
|
+
quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
|
8631
|
+
|
8316
8632
|
enum ggml_type new_type;
|
8317
8633
|
void * new_data;
|
8318
8634
|
size_t new_size;
|
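The checks above keep only 2-D "weight" tensors, optionally skip the output tensor, skip everything when only copying, and never quantize the MoE gating tensor. The same filter gathered into a single predicate (should_quantize is a hypothetical helper name):

    #include <string>

    static bool should_quantize(const std::string & name, int n_dims,
                                bool quantize_output_tensor, bool only_copy) {
        bool quantize = name.rfind("weight") == name.size() - 6;            // ends with 'weight'?
        quantize &= (n_dims == 2);                                          // quantize only 2D tensors
        quantize &= quantize_output_tensor || name != "output.weight";
        quantize &= !only_copy;
        quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;  // do not quantize expert gating tensors
        return quantize;
    }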
@@ -8461,74 +8777,63 @@ static int llama_apply_lora_from_file_internal(
|
|
8461
8777
|
|
8462
8778
|
const int64_t t_start_lora_us = ggml_time_us();
|
8463
8779
|
|
8464
|
-
|
8465
|
-
if (!fin) {
|
8466
|
-
LLAMA_LOG_ERROR("%s: failed to open '%s'\n", __func__, path_lora);
|
8467
|
-
return 1;
|
8468
|
-
}
|
8780
|
+
llama_file fin(path_lora, "rb");
|
8469
8781
|
|
8470
8782
|
// verify magic and version
|
8471
8783
|
{
|
8472
|
-
uint32_t magic;
|
8473
|
-
|
8474
|
-
|
8475
|
-
|
8784
|
+
uint32_t magic = fin.read_u32();
|
8785
|
+
if (magic != LLAMA_FILE_MAGIC_GGLA) {
|
8786
|
+
LLAMA_LOG_ERROR("%s: bad file magic\n", __func__);
|
8787
|
+
return 1;
|
8788
|
+
}
|
8476
8789
|
|
8790
|
+
uint32_t format_version = fin.read_u32();
|
8477
8791
|
if (format_version != 1) {
|
8478
8792
|
LLAMA_LOG_ERROR("%s: unsupported file version\n", __func__ );
|
8479
8793
|
return 1;
|
8480
8794
|
}
|
8481
8795
|
}
|
8482
8796
|
|
8483
|
-
int32_t lora_r;
|
8484
|
-
int32_t lora_alpha;
|
8485
|
-
fin.read((char *) &lora_r, sizeof(lora_r));
|
8486
|
-
fin.read((char *) &lora_alpha, sizeof(lora_alpha));
|
8797
|
+
int32_t lora_r = fin.read_u32();
|
8798
|
+
int32_t lora_alpha = fin.read_u32();
|
8487
8799
|
float scaling = scale * (float)lora_alpha / (float)lora_r;
|
8488
8800
|
|
8489
8801
|
LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
|
8490
8802
|
|
8803
|
+
// create a name -> tensor map of the model to accelerate lookups
|
8804
|
+
// find the max tensor size to estimate the required temporary buffer size
|
8805
|
+
size_t max_tensor_size = 0;
|
8806
|
+
std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
|
8807
|
+
for (const auto & kv : model.tensors_by_name) {
|
8808
|
+
model_tensors.insert(kv);
|
8809
|
+
size_t f32_size = ggml_nelements(kv.second) * sizeof(float);
|
8810
|
+
max_tensor_size = std::max(max_tensor_size, f32_size);
|
8811
|
+
}
|
8812
|
+
|
8491
8813
|
// create a temporary ggml context to store the lora tensors
|
8492
|
-
//
|
8493
|
-
|
8814
|
+
// TODO: use ggml-alloc
|
8815
|
+
size_t lora_ctx_size = max_tensor_size * 3;
|
8816
|
+
LLAMA_LOG_INFO("%s: allocating %.f MB for lora temporary buffer\n", __func__, lora_ctx_size / 1024.0 / 1024.0);
|
8817
|
+
std::vector<uint8_t> lora_buf(lora_ctx_size);
|
8818
|
+
|
8494
8819
|
struct ggml_init_params params;
|
8495
8820
|
params.mem_size = lora_buf.size();
|
8496
8821
|
params.mem_buffer = lora_buf.data();
|
8497
8822
|
params.no_alloc = false;
|
8498
8823
|
|
8499
|
-
|
8500
|
-
std::unordered_map<std::string, struct ggml_tensor *> lora_tensors;
|
8824
|
+
using unique_context = std::unique_ptr<ggml_context, decltype(&ggml_free)>;
|
8501
8825
|
|
8502
|
-
|
8503
|
-
|
8504
|
-
|
8505
|
-
model_tensors.insert(kv);
|
8506
|
-
}
|
8826
|
+
unique_context lora_ctx(nullptr, ggml_free);
|
8827
|
+
lora_ctx.reset(ggml_init(params));
|
8828
|
+
std::unordered_map<std::string, struct ggml_tensor *> lora_tensors;
|
8507
8829
|
|
8508
8830
|
// load base model
|
8509
8831
|
std::unique_ptr<llama_model_loader> ml;
|
8510
|
-
ggml_context * base_ctx = NULL;
|
8511
|
-
std::vector<uint8_t> base_buf;
|
8512
|
-
if (path_base_model) {
|
8513
|
-
LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
|
8514
|
-
ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ NULL));
|
8515
|
-
|
8516
|
-
size_t ctx_size;
|
8517
|
-
size_t mmapped_size;
|
8518
|
-
ml->calc_sizes(ctx_size, mmapped_size);
|
8519
|
-
base_buf.resize(ctx_size);
|
8520
|
-
|
8521
|
-
ggml_init_params base_params;
|
8522
|
-
base_params.mem_size = base_buf.size();
|
8523
|
-
base_params.mem_buffer = base_buf.data();
|
8524
|
-
base_params.no_alloc = ml->use_mmap;
|
8525
8832
|
|
8526
|
-
|
8527
|
-
|
8528
|
-
|
8529
|
-
|
8530
|
-
ml->mapping.reset(new llama_mmap(&ml->file, /* prefetch */ 0, ggml_is_numa()));
|
8531
|
-
}
|
8833
|
+
if (path_base_model) {
|
8834
|
+
LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
|
8835
|
+
ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ nullptr));
|
8836
|
+
ml->init_mapping(false); // no prefetching
|
8532
8837
|
}
|
8533
8838
|
|
8534
8839
|
// read tensors and apply
|
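The scaling applied above is scale * lora_alpha / lora_r. A worked example with illustrative adapter parameters (r = 16 and alpha = 32 are not values from the diff):

    #include <cstdint>
    #include <cstdio>

    int main() {
        const float   scale      = 1.0f;   // user-supplied strength
        const int32_t lora_r     = 16;     // illustrative
        const int32_t lora_alpha = 32;     // illustrative

        const float scaling = scale * (float)lora_alpha / (float)lora_r;
        printf("scaling = %.2f\n", scaling);   // prints 2.00
        return 0;
    }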
@@ -8538,27 +8843,35 @@ static int llama_apply_lora_from_file_internal(
|
|
8538
8843
|
std::vector<uint8_t> work_buffer;
|
8539
8844
|
|
8540
8845
|
while (true) {
|
8846
|
+
if (fin.tell() == fin.size) {
|
8847
|
+
// eof
|
8848
|
+
break;
|
8849
|
+
}
|
8850
|
+
|
8541
8851
|
int32_t n_dims;
|
8542
|
-
int32_t
|
8852
|
+
int32_t name_len;
|
8543
8853
|
int32_t ftype;
|
8544
8854
|
|
8545
|
-
fin.
|
8546
|
-
fin.
|
8547
|
-
fin.
|
8548
|
-
|
8549
|
-
|
8855
|
+
fin.read_raw(&n_dims, sizeof(n_dims));
|
8856
|
+
fin.read_raw(&name_len, sizeof(name_len));
|
8857
|
+
fin.read_raw(&ftype, sizeof(ftype));
|
8858
|
+
|
8859
|
+
if (n_dims != 1 && n_dims != 2) {
|
8860
|
+
LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims);
|
8861
|
+
return 1;
|
8550
8862
|
}
|
8551
8863
|
|
8552
8864
|
int32_t ne[2] = { 1, 1 };
|
8553
8865
|
for (int i = 0; i < n_dims; ++i) {
|
8554
|
-
fin.
|
8866
|
+
fin.read_raw(&ne[i], sizeof(ne[i]));
|
8555
8867
|
}
|
8556
8868
|
|
8557
8869
|
std::string name;
|
8558
8870
|
{
|
8871
|
+
GGML_ASSERT(name_len <= 1024);
|
8559
8872
|
char buf[1024];
|
8560
|
-
fin.
|
8561
|
-
name = std::string(buf,
|
8873
|
+
fin.read_raw(buf, name_len);
|
8874
|
+
name = std::string(buf, name_len);
|
8562
8875
|
}
|
8563
8876
|
|
8564
8877
|
// check for lora suffix and get the type of tensor
|
@@ -8572,7 +8885,7 @@ static int llama_apply_lora_from_file_internal(
|
|
8572
8885
|
std::string lora_type = name.substr(pos + lora_suffix.length());
|
8573
8886
|
std::string base_name = name;
|
8574
8887
|
base_name.erase(pos);
|
8575
|
-
// LLAMA_LOG_INFO("%s: %s => %s (lora type %s) \n", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());
|
8888
|
+
// LLAMA_LOG_INFO("%s: %s => %s (lora type %s) \n", __func__, name.c_str(), base_name.c_str(), lora_type.c_str());
|
8576
8889
|
|
8577
8890
|
if (model_tensors.find(base_name) == model_tensors.end()) {
|
8578
8891
|
LLAMA_LOG_ERROR("%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
|
@@ -8591,22 +8904,15 @@ static int llama_apply_lora_from_file_internal(
|
|
8591
8904
|
return false;
|
8592
8905
|
}
|
8593
8906
|
}
|
8594
|
-
ggml_tensor * lora_tensor;
|
8595
|
-
|
8596
|
-
lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]);
|
8597
|
-
}
|
8598
|
-
else {
|
8599
|
-
LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims);
|
8600
|
-
return 1;
|
8601
|
-
}
|
8602
|
-
ggml_set_name(lora_tensor, "lora_tensor");
|
8907
|
+
ggml_tensor * lora_tensor = ggml_new_tensor_2d(lora_ctx.get(), wtype, ne[0], ne[1]);
|
8908
|
+
ggml_set_name(lora_tensor, name.c_str());
|
8603
8909
|
|
8604
8910
|
// load tensor data
|
8605
|
-
size_t offset = fin.
|
8911
|
+
size_t offset = fin.tell();
|
8606
8912
|
size_t tensor_data_size = ggml_nbytes(lora_tensor);
|
8607
8913
|
offset = (offset + 31) & -32;
|
8608
|
-
fin.
|
8609
|
-
fin.
|
8914
|
+
fin.seek(offset, SEEK_SET);
|
8915
|
+
fin.read_raw(lora_tensor->data, tensor_data_size);
|
8610
8916
|
|
8611
8917
|
lora_tensors[name] = lora_tensor;
|
8612
8918
|
|
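The expression offset = (offset + 31) & -32 above rounds the read position up to the next 32-byte boundary. A small self-check of that trick (align32 is a hypothetical helper):

    #include <cassert>
    #include <cstddef>

    static size_t align32(size_t offset) {
        // adding 31 crosses the boundary unless already aligned; & -32 clears the low 5 bits
        return (offset + 31) & -32;
    }

    int main() {
        assert(align32(0)  == 0);
        assert(align32(1)  == 32);
        assert(align32(32) == 32);
        assert(align32(33) == 64);
        return 0;
    }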
@@ -8619,7 +8925,7 @@ static int llama_apply_lora_from_file_internal(
|
|
8619
8925
|
offload_func_t offload_func = ggml_offload_nop;
|
8620
8926
|
offload_func_t offload_func_force_inplace = ggml_offload_nop;
|
8621
8927
|
|
8622
|
-
#
|
8928
|
+
#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
|
8623
8929
|
if (dest_t->backend == GGML_BACKEND_GPU || dest_t->backend == GGML_BACKEND_GPU_SPLIT) {
|
8624
8930
|
if (dest_t->type != GGML_TYPE_F16) {
|
8625
8931
|
throw std::runtime_error(format(
|
@@ -8636,13 +8942,11 @@ static int llama_apply_lora_from_file_internal(
|
|
8636
8942
|
|
8637
8943
|
// load from base model
|
8638
8944
|
if (gguf_find_tensor(ctx_gguf, base_name.c_str()) < 0) {
|
8639
|
-
// TODO: throw
|
8640
8945
|
LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
|
8641
8946
|
return 1;
|
8642
8947
|
}
|
8643
8948
|
|
8644
|
-
|
8645
|
-
base_t = ml->create_tensor(base_ctx, base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU);
|
8949
|
+
base_t = ml->get_tensor_meta(base_name.c_str());
|
8646
8950
|
ml->load_data_for(base_t);
|
8647
8951
|
} else {
|
8648
8952
|
base_t = dest_t;
|
@@ -8671,43 +8975,42 @@ static int llama_apply_lora_from_file_internal(
|
|
8671
8975
|
}
|
8672
8976
|
|
8673
8977
|
// w = w + BA*s
|
8674
|
-
ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
|
8978
|
+
ggml_tensor * BA = ggml_mul_mat(lora_ctx.get(), loraA, loraB);
|
8675
8979
|
offload_func(BA);
|
8676
8980
|
ggml_set_name(BA, "BA");
|
8677
8981
|
|
8678
8982
|
if (scaling != 1.0f) {
|
8679
|
-
|
8680
|
-
ggml_set_name(scale_tensor, "scale_tensor");
|
8681
|
-
|
8682
|
-
BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor);
|
8983
|
+
BA = ggml_scale_inplace(lora_ctx.get(), BA, scaling);
|
8683
8984
|
offload_func(BA);
|
8684
8985
|
ggml_set_name(BA, "BA_scaled");
|
8685
8986
|
}
|
8686
8987
|
|
8687
8988
|
ggml_tensor * r;
|
8688
8989
|
if (base_t == dest_t) {
|
8689
|
-
r = ggml_add_inplace(lora_ctx, dest_t, BA);
|
8990
|
+
r = ggml_add_inplace(lora_ctx.get(), dest_t, BA);
|
8690
8991
|
offload_func_force_inplace(r);
|
8691
8992
|
ggml_set_name(r, "r_add_inplace");
|
8692
8993
|
}
|
8693
8994
|
else {
|
8694
|
-
r = ggml_add(lora_ctx, base_t, BA);
|
8995
|
+
r = ggml_add(lora_ctx.get(), base_t, BA);
|
8695
8996
|
offload_func(r);
|
8696
8997
|
ggml_set_name(r, "r_add");
|
8697
8998
|
|
8698
|
-
r = ggml_cpy(lora_ctx, r, dest_t);
|
8999
|
+
r = ggml_cpy(lora_ctx.get(), r, dest_t);
|
8699
9000
|
offload_func(r);
|
8700
9001
|
ggml_set_name(r, "r_cpy");
|
8701
9002
|
}
|
8702
9003
|
|
8703
|
-
struct ggml_cgraph * gf = ggml_new_graph(lora_ctx);
|
9004
|
+
struct ggml_cgraph * gf = ggml_new_graph(lora_ctx.get());
|
8704
9005
|
ggml_build_forward_expand(gf, r);
|
8705
9006
|
|
8706
9007
|
ggml_graph_compute_helper(work_buffer, gf, n_threads);
|
8707
9008
|
|
9009
|
+
// the tensors in the adapter must be sorted such that loraA and loraB of the same tensor are next to each other
|
9010
|
+
GGML_ASSERT(lora_tensors.size() == 2);
|
9011
|
+
|
8708
9012
|
// we won't need these tensors again, reset the context to save memory
|
8709
|
-
|
8710
|
-
lora_ctx = ggml_init(params);
|
9013
|
+
lora_ctx.reset(ggml_init(params));
|
8711
9014
|
lora_tensors.clear();
|
8712
9015
|
|
8713
9016
|
n_tensors++;
|
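The graph built above computes w = w + scaling * (B·A) with ggml operations. A conceptual restatement in plain loops (not ggml; apply_lora is a hypothetical helper), assuming row-major matrices:

    #include <vector>

    // W is n_out x n_in, A is r x n_in, B is n_out x r; all row-major.
    static void apply_lora(std::vector<float> & W,
                           const std::vector<float> & A,
                           const std::vector<float> & B,
                           int n_out, int n_in, int r, float scaling) {
        for (int i = 0; i < n_out; ++i) {
            for (int j = 0; j < n_in; ++j) {
                float ba = 0.0f;
                for (int k = 0; k < r; ++k) {
                    ba += B[i*r + k] * A[k*n_in + j];   // (B·A)[i][j]
                }
                W[i*n_in + j] += scaling * ba;          // w = w + BA*s
            }
        }
    }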
@@ -8717,12 +9020,6 @@ static int llama_apply_lora_from_file_internal(
|
|
8717
9020
|
}
|
8718
9021
|
}
|
8719
9022
|
|
8720
|
-
// TODO: this should be in a destructor, it will leak on failure
|
8721
|
-
ggml_free(lora_ctx);
|
8722
|
-
if (base_ctx) {
|
8723
|
-
ggml_free(base_ctx);
|
8724
|
-
}
|
8725
|
-
|
8726
9023
|
const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
|
8727
9024
|
LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0);
|
8728
9025
|
|
@@ -8852,11 +9149,18 @@ struct llama_model * llama_load_model_from_file(
|
|
8852
9149
|
LLAMA_LOG_INFO("\n");
|
8853
9150
|
}
|
8854
9151
|
}
|
9152
|
+
return true;
|
8855
9153
|
};
|
8856
9154
|
}
|
8857
9155
|
|
8858
|
-
|
8859
|
-
|
9156
|
+
int status = llama_model_load(path_model, *model, params);
|
9157
|
+
GGML_ASSERT(status <= 0);
|
9158
|
+
if (status < 0) {
|
9159
|
+
if (status == -1) {
|
9160
|
+
LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
|
9161
|
+
} else if (status == -2) {
|
9162
|
+
LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
|
9163
|
+
}
|
8860
9164
|
delete model;
|
8861
9165
|
return nullptr;
|
8862
9166
|
}
|
@@ -8931,7 +9235,39 @@ struct llama_context * llama_new_context_with_model(
|
|
8931
9235
|
|
8932
9236
|
// reserve memory for context buffers
|
8933
9237
|
if (!hparams.vocab_only) {
|
8934
|
-
|
9238
|
+
// initialize backend
|
9239
|
+
#ifdef GGML_USE_METAL
|
9240
|
+
if (model->n_gpu_layers > 0) {
|
9241
|
+
ctx->backend = ggml_backend_metal_init();
|
9242
|
+
if (ctx->backend == nullptr) {
|
9243
|
+
LLAMA_LOG_ERROR("%s: failed to initialize Metal backend\n", __func__);
|
9244
|
+
}
|
9245
|
+
}
|
9246
|
+
#elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST)
|
9247
|
+
// for testing only
|
9248
|
+
if (model->n_gpu_layers > 0) {
|
9249
|
+
ctx->backend = ggml_backend_cuda_init(0);
|
9250
|
+
if (ctx->backend == nullptr) {
|
9251
|
+
LLAMA_LOG_ERROR("%s: failed to initialize CUDA backend\n", __func__);
|
9252
|
+
}
|
9253
|
+
}
|
9254
|
+
#endif
|
9255
|
+
|
9256
|
+
if (ctx->backend == nullptr && ggml_backend_buffer_is_host(model->buf)) {
|
9257
|
+
ctx->backend = ggml_backend_cpu_init();
|
9258
|
+
if (ctx->backend == nullptr) {
|
9259
|
+
LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
|
9260
|
+
}
|
9261
|
+
}
|
9262
|
+
|
9263
|
+
if (ctx->backend == nullptr) {
|
9264
|
+
LLAMA_LOG_ERROR("%s: failed to initialize a backend\n", __func__);
|
9265
|
+
delete ctx;
|
9266
|
+
return nullptr;
|
9267
|
+
}
|
9268
|
+
|
9269
|
+
if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, type_k, type_v,
|
9270
|
+
cparams.n_ctx, model->n_gpu_layers, cparams.offload_kqv)) {
|
8935
9271
|
LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
|
8936
9272
|
llama_free(ctx);
|
8937
9273
|
return nullptr;
|
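The block above tries a GPU backend first (Metal, or CUDA in the test configuration) and falls back to the CPU backend only when the model weights live in host memory, failing context creation if nothing initializes. The same fallback order factored into a helper (init_best_backend and the weights_on_host flag are hypothetical; the ggml_backend_* calls are the ones used above, with the CUDA test path omitted):

    #include "ggml-backend.h"
    #ifdef GGML_USE_METAL
    #include "ggml-metal.h"
    #endif

    static ggml_backend_t init_best_backend(int n_gpu_layers, bool weights_on_host) {
        ggml_backend_t backend = nullptr;
    #ifdef GGML_USE_METAL
        if (n_gpu_layers > 0) {
            backend = ggml_backend_metal_init();   // may return nullptr on failure
        }
    #endif
        if (backend == nullptr && weights_on_host) {
            backend = ggml_backend_cpu_init();     // CPU fallback needs host-resident weights
        }
        return backend;                            // nullptr: caller must give up
    }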
@@ -8967,12 +9303,11 @@ struct llama_context * llama_new_context_with_model(
|
|
8967
9303
|
}
|
8968
9304
|
|
8969
9305
|
{
|
8970
|
-
static const size_t tensor_alignment = 32;
|
8971
9306
|
// the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data
|
8972
|
-
ctx->
|
9307
|
+
ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead());
|
8973
9308
|
|
8974
9309
|
// create measure allocator
|
8975
|
-
ctx->alloc =
|
9310
|
+
ctx->alloc = ggml_allocr_new_measure_from_backend(ctx->backend);
|
8976
9311
|
|
8977
9312
|
// build worst-case graph
|
8978
9313
|
int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch);
|
@@ -8980,98 +9315,50 @@ struct llama_context * llama_new_context_with_model(
|
|
8980
9315
|
llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
|
8981
9316
|
ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0));
|
8982
9317
|
|
8983
|
-
#ifdef GGML_USE_METAL
|
8984
|
-
if (model->n_gpu_layers > 0) {
|
8985
|
-
ctx->ctx_metal = ggml_metal_init(1);
|
8986
|
-
if (!ctx->ctx_metal) {
|
8987
|
-
LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
|
8988
|
-
llama_free(ctx);
|
8989
|
-
return NULL;
|
8990
|
-
}
|
8991
|
-
//ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false);
|
8992
|
-
//ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
|
8993
|
-
}
|
8994
|
-
#endif
|
8995
9318
|
// measure memory requirements for the graph
|
8996
|
-
size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf)
|
9319
|
+
size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf);
|
8997
9320
|
|
8998
|
-
LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MiB\n", __func__, (ctx->
|
9321
|
+
LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MiB\n", __func__, (ctx->buf_compute_meta.size() + alloc_size) / 1024.0 / 1024.0);
|
8999
9322
|
|
9000
|
-
//
|
9323
|
+
// create allocator again with exact memory requirements
|
9001
9324
|
ggml_allocr_free(ctx->alloc);
|
9002
9325
|
|
9003
|
-
ctx->buf_alloc
|
9004
|
-
ctx->alloc =
|
9005
|
-
#
|
9006
|
-
if (
|
9007
|
-
|
9008
|
-
|
9009
|
-
#endif
|
9010
|
-
#ifdef GGML_USE_CUBLAS
|
9011
|
-
ggml_cuda_set_scratch_size(alloc_size);
|
9012
|
-
LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0);
|
9326
|
+
ctx->buf_alloc = ggml_backend_alloc_buffer(ctx->backend, alloc_size);
|
9327
|
+
ctx->alloc = ggml_allocr_new_from_buffer(ctx->buf_alloc);
|
9328
|
+
#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
|
9329
|
+
if (model->n_gpu_layers > 0) {
|
9330
|
+
ggml_cuda_set_scratch_size(alloc_size);
|
9331
|
+
LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0);
|
9013
9332
|
|
9014
|
-
|
9015
|
-
|
9016
|
-
|
9017
|
-
|
9333
|
+
// calculate total VRAM usage
|
9334
|
+
auto add_tensor = [](const ggml_tensor * t, size_t & size) {
|
9335
|
+
if (t->backend == GGML_BACKEND_GPU || t->backend == GGML_BACKEND_GPU_SPLIT) {
|
9336
|
+
size += ggml_nbytes(t);
|
9337
|
+
}
|
9338
|
+
};
|
9339
|
+
size_t model_vram_size = 0;
|
9340
|
+
for (const auto & kv : model->tensors_by_name) {
|
9341
|
+
add_tensor(kv.second, model_vram_size);
|
9018
9342
|
}
|
9019
|
-
};
|
9020
|
-
size_t model_vram_size = 0;
|
9021
|
-
for (const auto & kv : model->tensors_by_name) {
|
9022
|
-
add_tensor(kv.second, model_vram_size);
|
9023
|
-
}
|
9024
|
-
|
9025
|
-
size_t kv_vram_size = 0;
|
9026
|
-
for (auto & k : ctx->kv_self.k_l) {
|
9027
|
-
add_tensor(k, kv_vram_size);
|
9028
|
-
}
|
9029
|
-
for (auto & v : ctx->kv_self.v_l) {
|
9030
|
-
add_tensor(v, kv_vram_size);
|
9031
|
-
}
|
9032
|
-
|
9033
|
-
size_t ctx_vram_size = alloc_size + kv_vram_size;
|
9034
|
-
size_t total_vram_size = model_vram_size + ctx_vram_size;
|
9035
9343
|
|
9036
|
-
|
9037
|
-
|
9038
|
-
|
9039
|
-
|
9040
|
-
|
9041
|
-
|
9042
|
-
|
9043
|
-
#ifdef GGML_USE_METAL
|
9044
|
-
if (model->n_gpu_layers > 0) {
|
9045
|
-
// this allocates all Metal resources and memory buffers
|
9046
|
-
|
9047
|
-
void * data_ptr = NULL;
|
9048
|
-
size_t data_size = 0;
|
9049
|
-
|
9050
|
-
if (ctx->model.mapping) {
|
9051
|
-
data_ptr = ctx->model.mapping->addr;
|
9052
|
-
data_size = ctx->model.mapping->size;
|
9053
|
-
} else {
|
9054
|
-
data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
|
9055
|
-
data_size = ggml_get_mem_size (ctx->model.ctx);
|
9056
|
-
}
|
9057
|
-
|
9058
|
-
const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
|
9344
|
+
size_t kv_vram_size = 0;
|
9345
|
+
for (auto & k : ctx->kv_self.k_l) {
|
9346
|
+
add_tensor(k, kv_vram_size);
|
9347
|
+
}
|
9348
|
+
for (auto & v : ctx->kv_self.v_l) {
|
9349
|
+
add_tensor(v, kv_vram_size);
|
9350
|
+
}
|
9059
9351
|
|
9060
|
-
|
9352
|
+
size_t ctx_vram_size = alloc_size + kv_vram_size;
|
9353
|
+
size_t total_vram_size = model_vram_size + ctx_vram_size;
|
9061
9354
|
|
9062
|
-
|
9063
|
-
|
9064
|
-
|
9065
|
-
|
9066
|
-
return NULL; \
|
9355
|
+
LLAMA_LOG_INFO("%s: total VRAM used: %.2f MiB (model: %.2f MiB, context: %.2f MiB)\n", __func__,
|
9356
|
+
total_vram_size / 1024.0 / 1024.0,
|
9357
|
+
model_vram_size / 1024.0 / 1024.0,
|
9358
|
+
ctx_vram_size / 1024.0 / 1024.0);
|
9067
9359
|
}
|
9068
|
-
|
9069
|
-
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
|
9070
|
-
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0));
|
9071
|
-
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.data, ctx->buf_alloc.size, 0));
|
9072
|
-
#undef LLAMA_METAL_CHECK_BUF
|
9073
|
-
}
|
9074
9360
|
#endif
|
9361
|
+
}
|
9075
9362
|
}
|
9076
9363
|
|
9077
9364
|
#ifdef GGML_USE_MPI
|
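The add_tensor lambda above sums ggml_nbytes() for every tensor resident on the GPU, and the totals are then reported separately for the model weights and the KV cache. The counting step in isolation (gpu_bytes is a hypothetical helper; the enums and ggml_nbytes are the real APIs used above):

    #include <cstddef>
    #include <vector>
    #include "ggml.h"

    static size_t gpu_bytes(const std::vector<ggml_tensor *> & tensors) {
        size_t size = 0;
        for (const ggml_tensor * t : tensors) {
            if (t->backend == GGML_BACKEND_GPU || t->backend == GGML_BACKEND_GPU_SPLIT) {
                size += ggml_nbytes(t);   // byte size of the tensor data
            }
        }
        return size;
    }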
@@ -9099,10 +9386,14 @@ const llama_model * llama_get_model(const struct llama_context * ctx) {
|
|
9099
9386
|
return &ctx->model;
|
9100
9387
|
}
|
9101
9388
|
|
9102
|
-
|
9389
|
+
uint32_t llama_n_ctx(const struct llama_context * ctx) {
|
9103
9390
|
return ctx->cparams.n_ctx;
|
9104
9391
|
}
|
9105
9392
|
|
9393
|
+
uint32_t llama_n_batch(const struct llama_context * ctx) {
|
9394
|
+
return ctx->cparams.n_batch;
|
9395
|
+
}
|
9396
|
+
|
9106
9397
|
enum llama_vocab_type llama_vocab_type(const struct llama_model * model) {
|
9107
9398
|
return model->vocab.type;
|
9108
9399
|
}
|
@@ -9359,7 +9650,7 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
|
|
9359
9650
|
const size_t s_embedding = ctx->embedding.size() * sizeof(float);
|
9360
9651
|
const size_t s_kv_size = sizeof(size_t);
|
9361
9652
|
const size_t s_kv_ntok = sizeof(int);
|
9362
|
-
const size_t s_kv = ctx->kv_self.buf
|
9653
|
+
const size_t s_kv = ggml_backend_buffer_get_size(ctx->kv_self.buf);
|
9363
9654
|
|
9364
9655
|
const size_t s_total = (
|
9365
9656
|
+ s_rng_size
|
@@ -9487,7 +9778,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
|
|
9487
9778
|
const auto n_embd = hparams.n_embd_gqa();
|
9488
9779
|
const auto n_ctx = cparams.n_ctx;
|
9489
9780
|
|
9490
|
-
const size_t kv_buf_size = kv_self.buf
|
9781
|
+
const size_t kv_buf_size = ggml_backend_buffer_get_size(kv_self.buf);
|
9491
9782
|
const uint32_t kv_head = kv_self.head;
|
9492
9783
|
const uint32_t kv_size = kv_self.size;
|
9493
9784
|
const uint32_t kv_used = kv_self.used;
|
@@ -9503,17 +9794,12 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
|
|
9503
9794
|
ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
|
9504
9795
|
ggml_cgraph * gf = ggml_new_graph(cpy_ctx);
|
9505
9796
|
|
9506
|
-
std::vector<
|
9507
|
-
std::vector<
|
9797
|
+
std::vector<struct ggml_tensor *> kout2d(n_layer);
|
9798
|
+
std::vector<struct ggml_tensor *> vout2d(n_layer);
|
9508
9799
|
|
9509
9800
|
for (int il = 0; il < (int) n_layer; ++il) {
|
9510
|
-
|
9511
|
-
|
9512
|
-
kout2d->data = kout2d_data[il].data();
|
9513
|
-
|
9514
|
-
ggml_tensor * vout2d = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
|
9515
|
-
vout2d_data[il].resize(ggml_nbytes(vout2d));
|
9516
|
-
vout2d->data = vout2d_data[il].data();
|
9801
|
+
kout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
|
9802
|
+
vout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
|
9517
9803
|
|
9518
9804
|
ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
|
9519
9805
|
n_embd, kv_head,
|
@@ -9523,20 +9809,28 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
|
|
9523
9809
|
kv_head, n_embd,
|
9524
9810
|
elt_size*n_ctx, 0);
|
9525
9811
|
|
9526
|
-
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k2d, kout2d));
|
9527
|
-
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v2d, vout2d));
|
9812
|
+
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k2d, kout2d[il]));
|
9813
|
+
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v2d, vout2d[il]));
|
9528
9814
|
}
|
9529
9815
|
|
9530
|
-
|
9816
|
+
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(cpy_ctx, ctx->backend);
|
9531
9817
|
|
9532
|
-
|
9818
|
+
ggml_backend_graph_compute(ctx->backend, gf);
|
9819
|
+
|
9820
|
+
std::vector<uint8_t> tmp_buf;
|
9821
|
+
for (int il = 0; il < (int) n_layer; ++il) {
|
9822
|
+
tmp_buf.resize(ggml_nbytes(kout2d[il]));
|
9823
|
+
ggml_backend_tensor_get(kout2d[il], tmp_buf.data(), 0, tmp_buf.size());
|
9824
|
+
data_ctx->write(tmp_buf.data(), tmp_buf.size());
|
9533
9825
|
|
9534
|
-
|
9535
|
-
|
9536
|
-
|
9537
|
-
data_ctx->write(kout2d_data[il].data(), kout2d_data[il].size());
|
9538
|
-
data_ctx->write(vout2d_data[il].data(), vout2d_data[il].size());
|
9826
|
+
tmp_buf.resize(ggml_nbytes(vout2d[il]));
|
9827
|
+
ggml_backend_tensor_get(vout2d[il], tmp_buf.data(), 0, tmp_buf.size());
|
9828
|
+
data_ctx->write(tmp_buf.data(), tmp_buf.size());
|
9539
9829
|
}
|
9830
|
+
|
9831
|
+
ggml_free(cpy_ctx);
|
9832
|
+
|
9833
|
+
ggml_backend_buffer_free(buf);
|
9540
9834
|
}
|
9541
9835
|
|
9542
9836
|
for (uint32_t i = 0; i < kv_size; ++i) {
|
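The rewritten save path above builds kout2d/vout2d tensors, runs the copy graph on the backend, and then pulls each tensor back into host memory with ggml_backend_tensor_get before writing it out. The read-back step in isolation (read_tensor is a hypothetical helper):

    #include <cstdint>
    #include <vector>
    #include "ggml.h"
    #include "ggml-backend.h"

    static std::vector<uint8_t> read_tensor(const ggml_tensor * t) {
        std::vector<uint8_t> buf(ggml_nbytes(t));
        // copies from (possibly device) backend memory into the host buffer
        ggml_backend_tensor_get(t, buf.data(), 0, buf.size());
        return buf;
    }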
@@ -9634,21 +9928,19 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
|
|
9634
9928
|
memcpy(&kv_used, inp, sizeof(kv_used)); inp += sizeof(kv_used);
|
9635
9929
|
|
9636
9930
|
if (kv_buf_size) {
|
9637
|
-
GGML_ASSERT(kv_self.buf
|
9931
|
+
GGML_ASSERT(ggml_backend_buffer_get_size(kv_self.buf) == kv_buf_size);
|
9638
9932
|
|
9639
9933
|
const size_t elt_size = ggml_element_size(kv_self.k_l[0]);
|
9640
9934
|
|
9641
9935
|
ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
|
9642
9936
|
ggml_cgraph * gf = ggml_new_graph(cpy_ctx);
|
9643
9937
|
|
9644
|
-
|
9645
|
-
|
9646
|
-
kin2d->data = (void *) inp;
|
9647
|
-
inp += ggml_nbytes(kin2d);
|
9938
|
+
std::vector<struct ggml_tensor *> kin2d(n_layer);
|
9939
|
+
std::vector<struct ggml_tensor *> vin2d(n_layer);
|
9648
9940
|
|
9649
|
-
|
9650
|
-
|
9651
|
-
|
9941
|
+
for (int il = 0; il < n_layer; ++il) {
|
9942
|
+
kin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
|
9943
|
+
vin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
|
9652
9944
|
|
9653
9945
|
ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
|
9654
9946
|
n_embd, kv_head,
|
@@ -9658,13 +9950,26 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
|
|
9658
9950
|
kv_head, n_embd,
|
9659
9951
|
elt_size*n_ctx, 0);
|
9660
9952
|
|
9661
|
-
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin2d, k2d));
|
9662
|
-
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin2d, v2d));
|
9953
|
+
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin2d[il], k2d));
|
9954
|
+
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin2d[il], v2d));
|
9955
|
+
}
|
9956
|
+
|
9957
|
+
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(cpy_ctx, ctx->backend);
|
9958
|
+
|
9959
|
+
// load data into the tensors
|
9960
|
+
for (int il = 0; il < n_layer; ++il) {
|
9961
|
+
ggml_backend_tensor_set(kin2d[il], inp, 0, ggml_nbytes(kin2d[il]));
|
9962
|
+
inp += ggml_nbytes(kin2d[il]);
|
9963
|
+
|
9964
|
+
ggml_backend_tensor_set(vin2d[il], inp, 0, ggml_nbytes(vin2d[il]));
|
9965
|
+
inp += ggml_nbytes(vin2d[il]);
|
9663
9966
|
}
|
9664
9967
|
|
9665
|
-
|
9968
|
+
ggml_backend_graph_compute(ctx->backend, gf);
|
9666
9969
|
|
9667
9970
|
ggml_free(cpy_ctx);
|
9971
|
+
|
9972
|
+
ggml_backend_buffer_free(buf);
|
9668
9973
|
}
|
9669
9974
|
|
9670
9975
|
ctx->kv_self.head = kv_head;
|
@@ -9887,6 +10192,7 @@ float * llama_get_logits(struct llama_context * ctx) {
|
|
9887
10192
|
}
|
9888
10193
|
|
9889
10194
|
float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
|
10195
|
+
assert(ctx->logits_valid.at(i));
|
9890
10196
|
return ctx->logits.data() + i*ctx->model.hparams.n_vocab;
|
9891
10197
|
}
|
9892
10198
|
|