llama_cpp 0.0.7 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +17 -0
- data/ext/llama_cpp/llama_cpp.cpp +829 -51
- data/ext/llama_cpp/src/ggml-cuda.h +9 -32
- data/ext/llama_cpp/src/ggml-opencl.c +169 -24
- data/ext/llama_cpp/src/ggml.c +6672 -4376
- data/ext/llama_cpp/src/ggml.h +250 -15
- data/ext/llama_cpp/src/{llama_util.h → llama-util.h} +76 -10
- data/ext/llama_cpp/src/llama.cpp +710 -217
- data/ext/llama_cpp/src/llama.h +75 -28
- data/lib/llama_cpp/client.rb +30 -9
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +27 -3
- data/sig/llama_cpp.rbs +41 -7
- metadata +3 -3
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -5,10 +5,13 @@
 #include <cstdio>
 #endif
 
-#include "llama_util.h"
+#include "llama-util.h"
 #include "llama.h"
 
 #include "ggml.h"
+#ifdef GGML_USE_CUBLAS
+#include "ggml-cuda.h"
+#endif
 
 #include <array>
 #include <ctime>
@@ -28,11 +31,11 @@
 #include <atomic>
 #include <mutex>
 #include <sstream>
+#include <numeric>
 
 #define LLAMA_USE_SCRATCH
 #define LLAMA_MAX_SCRATCH_BUFFERS 16
 
-
 // available llama models
 enum e_model {
     MODEL_UNKNOWN,
@@ -50,49 +53,49 @@ static const size_t MB = 1024*1024;
 
 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
 {
-    static std::map<e_model, size_t>
+    static std::map<e_model, size_t> k_sizes = {
         { MODEL_7B,    512ull * MB },
         { MODEL_13B,   512ull * MB },
         { MODEL_30B,   512ull * MB },
         { MODEL_65B,  1024ull * MB },
     };
-    return
+    return k_sizes;
 }
 
 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
 {
-    static std::map<e_model, size_t>
+    static std::map<e_model, size_t> k_sizes = {
         { MODEL_7B,    512ull * MB },
         { MODEL_13B,   512ull * MB },
         { MODEL_30B,   512ull * MB },
         { MODEL_65B,  1024ull * MB },
     };
-    return
+    return k_sizes;
 }
 
 // 2*n_embd*n_ctx*n_layer*sizeof(float16)
 static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
 {
-    static std::map<e_model, size_t>
+    static std::map<e_model, size_t> k_sizes = {
         { MODEL_7B,   1026ull * MB },
         { MODEL_13B,  1608ull * MB },
         { MODEL_30B,  3124ull * MB },
         { MODEL_65B,  5120ull * MB },
     };
-    return
+    return k_sizes;
 }
 
 // this is mostly needed for temporary mul_mat buffers to dequantize the data
 // not actually needed if BLAS is disabled
 static const std::map<e_model, size_t> & MEM_REQ_EVAL()
 {
-    static std::map<e_model, size_t>
+    static std::map<e_model, size_t> k_sizes = {
         { MODEL_7B,   768ull * MB },
         { MODEL_13B, 1024ull * MB },
         { MODEL_30B, 1280ull * MB },
         { MODEL_65B, 1536ull * MB },
     };
-    return
+    return k_sizes;
 }
 
 // default hparams (LLaMA 7B)
@@ -136,7 +139,7 @@ struct llama_kv_cache {
 
     struct ggml_context * ctx = NULL;
 
-
+    llama_ctx_buffer buf;
 
     int n; // number of tokens currently in the cache
 
@@ -167,7 +170,7 @@ struct llama_model {
     struct llama_kv_cache kv_self;
 
     // the model memory buffer
-
+    llama_ctx_buffer buf;
 
     // model memory mapped file
     std::unique_ptr<llama_mmap> mapping;
@@ -228,8 +231,8 @@ struct llama_context {
 
     // memory buffers used to evaluate the model
     // TODO: move in llama_state
-
-
+    llama_ctx_buffer buf_compute;
+    llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
 
     int buf_last = 0;
     size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
@@ -402,6 +405,7 @@ enum llama_file_version {
     LLAMA_FILE_VERSION_GGML,
     LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab
     LLAMA_FILE_VERSION_GGJT_V1, // added padding
+    LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format
 };
 
 struct llama_file_loader {
@@ -432,6 +436,8 @@ struct llama_file_loader {
             file_version = LLAMA_FILE_VERSION_GGMF_V1;
         } else if (magic == 'ggjt' && version == 1) {
             file_version = LLAMA_FILE_VERSION_GGJT_V1;
+        } else if (magic == 'ggjt' && version == 2) {
+            file_version = LLAMA_FILE_VERSION_GGJT_V2;
         } else {
             throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
                          magic, version);
@@ -482,8 +488,6 @@ struct llama_file_loader {
             case GGML_TYPE_F16:
             case GGML_TYPE_Q4_0:
             case GGML_TYPE_Q4_1:
-            case GGML_TYPE_Q4_2:
-            case GGML_TYPE_Q4_3:
             case GGML_TYPE_Q5_0:
             case GGML_TYPE_Q5_1:
             case GGML_TYPE_Q8_0:
@@ -528,8 +532,8 @@ struct llama_file_saver {
         write_vocab();
     }
     void write_magic() {
-        file.write_u32(
-        file.write_u32(
+        file.write_u32(LLAMA_FILE_MAGIC);   // magic
+        file.write_u32(LLAMA_FILE_VERSION); // version
     }
     void write_hparams(enum llama_ftype new_ftype) {
         const llama_hparams & hparams = any_file_loader->hparams;
@@ -559,8 +563,6 @@ struct llama_file_saver {
             case GGML_TYPE_F16:
             case GGML_TYPE_Q4_0:
             case GGML_TYPE_Q4_1:
-            case GGML_TYPE_Q4_2:
-            case GGML_TYPE_Q4_3:
             case GGML_TYPE_Q5_0:
             case GGML_TYPE_Q5_1:
             case GGML_TYPE_Q8_0:
@@ -587,12 +589,12 @@ struct llama_model_loader {
     std::unique_ptr<llama_mmap> mapping;
 
     llama_model_loader(const std::string & fname_base, bool use_mmap, bool vocab_only) {
-        auto first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
+        auto * first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
         file_loaders.emplace_back(first_file);
         uint32_t n_parts = vocab_only ? 1 : guess_n_parts();
         for (uint32_t i = 1; i < n_parts; i++) {
             std::string fname = fname_base + "." + std::to_string(i);
-            auto ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
+            auto * ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
             file_loaders.emplace_back(ith_file);
             if (ith_file->hparams != first_file->hparams) {
                 throw format("llama.cpp: hparams inconsistent between files");
@@ -639,7 +641,7 @@ struct llama_model_loader {
         }
     }
 
-    struct ggml_tensor * get_tensor(const std::string & name, std::vector<uint32_t> ne) {
+    struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne) {
         auto it = tensors_map.name_to_idx.find(name);
         if (it == tensors_map.name_to_idx.end()) {
             throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
@@ -661,13 +663,14 @@ struct llama_model_loader {
             LLAMA_ASSERT(lt.ne.size() == 1);
             tensor = ggml_new_tensor_1d(ggml_ctx, lt.type, lt.ne.at(0));
         }
+        ggml_set_name(tensor, lt.name.c_str());
         LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
         lt.ggml_tensor = tensor;
         num_ggml_tensors_created++;
         return tensor;
     }
 
-    void done_getting_tensors() {
+    void done_getting_tensors() const {
         if (num_ggml_tensors_created != tensors_map.tensors.size()) {
             throw std::string("llama.cpp: file contained more tensors than expected");
         }
@@ -729,8 +732,7 @@ struct llama_model_loader {
             LLAMA_ASSERT(offset == lt.size);
         } else if (lt.split_type == SPLIT_BY_COLUMNS) {
             // Let's load the data into temporary buffers to ensure the OS performs large loads.
-            std::vector<llama_buffer> tmp_bufs;
-            tmp_bufs.resize(lt.shards.size());
+            std::vector<llama_buffer> tmp_bufs(lt.shards.size());
             for (size_t i = 0; i < lt.shards.size(); i++) {
                 llama_load_tensor_shard & shard = lt.shards.at(i);
                 llama_file & file = file_loaders.at(shard.file_idx)->file;
@@ -782,7 +784,7 @@ static bool kv_cache_init(
     const int n_embd  = hparams.n_embd;
     const int n_layer = hparams.n_layer;
 
-    const int64_t n_mem      =
+    const int64_t n_mem      = n_layer*n_ctx;
     const int64_t n_elements = n_embd*n_mem;
 
     cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
@@ -801,6 +803,8 @@ static bool kv_cache_init(
 
     cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
     cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
+    ggml_set_name(cache.k, "cache_k");
+    ggml_set_name(cache.v, "cache_v");
 
     return true;
 }
@@ -808,9 +812,9 @@ static bool kv_cache_init(
 struct llama_context_params llama_context_default_params() {
     struct llama_context_params result = {
         /*.n_ctx =*/ 512,
-        /*.
-        /*.seed =*/
-        /*.f16_kv =*/
+        /*.gpu_layers =*/ 0,
+        /*.seed =*/ -1,
+        /*.f16_kv =*/ true,
         /*.logits_all =*/ false,
         /*.vocab_only =*/ false,
         /*.use_mmap =*/ true,
@@ -839,9 +843,11 @@ static const char *llama_file_version_name(llama_file_version version) {
     switch (version) {
         case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)";
         case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
-        case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (
-
+        case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1405)";
+        case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (latest)";
     }
+
+    return "unknown";
 }
 
 static const char *llama_ftype_name(enum llama_ftype ftype) {
@@ -852,8 +858,6 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
         case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
                                       return "mostly Q4_1, some F16";
-        case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2";
-        case LLAMA_FTYPE_MOSTLY_Q4_3: return "mostly Q4_3";
         case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
         case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
         case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
@@ -875,6 +879,7 @@ static void llama_model_load_internal(
         const std::string & fname,
         llama_context & lctx,
         int n_ctx,
+        int n_gpu_layers,
         ggml_type memory_type,
         bool use_mmap,
         bool use_mlock,
@@ -919,15 +924,24 @@ static void llama_model_load_internal(
         fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
     }
 
+    if (file_version != LLAMA_FILE_VERSION_GGJT_V2) {
+        if (hparams.ftype != LLAMA_FTYPE_ALL_F32 &&
+            hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 &&
+            hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) {
+            throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1305)");
+        }
+    }
+
     if (vocab_only) {
         return;
     }
 
     auto & ctx = model.ctx;
 
-    size_t ctx_size
+    size_t ctx_size;
+    size_t mmapped_size;
     ml->calc_sizes(&ctx_size, &mmapped_size);
-    fprintf(stderr, "%s: ggml ctx size = %6.2f
+    fprintf(stderr, "%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/1024.0/1024.0);
 
     // print memory requirements
     {
@@ -971,8 +985,6 @@ static void llama_model_load_internal(
 
     // prepare memory for the weights
     {
-        const auto & hparams = model.hparams;
-
         const uint32_t n_embd  = hparams.n_embd;
         const uint32_t n_layer = hparams.n_layer;
         const uint32_t n_vocab = hparams.n_vocab;
@@ -1014,6 +1026,35 @@ static void llama_model_load_internal(
     ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
 
     model.mapping = std::move(ml->mapping);
+#ifdef GGML_USE_CUBLAS
+    {
+        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
+
+        fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
+
+        size_t vram_total = 0;
+
+        for (int i = 0; i < n_gpu; ++i) {
+            const auto & layer = model.layers[i];
+
+            ggml_cuda_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
+            ggml_cuda_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
+            ggml_cuda_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
+            ggml_cuda_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
+            ggml_cuda_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
+            ggml_cuda_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
+            ggml_cuda_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
+        }
+        if (n_gpu_layers > (int) hparams.n_layer) {
+            fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
+            ggml_cuda_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
+        }
+
+        fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+    }
+#else
+    (void) n_gpu_layers;
+#endif
 
     // loading time will be recalculate after the first eval, so
     // we take page faults deferred by mmap() into consideration
@@ -1024,6 +1065,7 @@ static bool llama_model_load(
         const std::string & fname,
         llama_context & lctx,
         int n_ctx,
+        int n_gpu_layers,
        ggml_type memory_type,
        bool use_mmap,
        bool use_mlock,
@@ -1031,7 +1073,7 @@ static bool llama_model_load(
        llama_progress_callback progress_callback,
        void *progress_callback_user_data) {
    try {
-        llama_model_load_internal(fname, lctx, n_ctx, memory_type, use_mmap, use_mlock,
+        llama_model_load_internal(fname, lctx, n_ctx, n_gpu_layers, memory_type, use_mmap, use_mlock,
                                  vocab_only, progress_callback, progress_callback_user_data);
        return true;
    } catch (const std::string & err) {
@@ -1053,6 +1095,13 @@ static bool llama_eval_internal(
         const int   n_tokens,
         const int   n_past,
         const int   n_threads) {
+
+    // enforce that the first token is BOS
+    if (n_past == 0 && tokens[0] != llama_token_bos()) {
+        fprintf(stderr, "%s: first token must be BOS\n", __func__);
+        return false;
+    }
+
     const int64_t t_start_us = ggml_time_us();
 
     const int N = n_tokens;
@@ -1060,7 +1109,7 @@ static bool llama_eval_internal(
     const auto & model   = lctx.model;
     const auto & hparams = model.hparams;
 
-    auto & kv_self = model.kv_self;
+    const auto & kv_self = model.kv_self;
 
     LLAMA_ASSERT(!!kv_self.ctx);
 
@@ -1088,6 +1137,7 @@ static bool llama_eval_internal(
     gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
 
     struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+    ggml_set_name(embd, "embd");
     memcpy(embd->data, tokens, N*ggml_element_size(embd));
 
     struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
@@ -1112,8 +1162,10 @@ static bool llama_eval_internal(
         // self-attention
         {
             // compute Q and K and RoPE them
-            struct ggml_tensor * Qcur =
-            struct ggml_tensor * Kcur =
+            struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
+            struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
+            ggml_set_name(Qcur, "Qcur");
+            ggml_set_name(Kcur, "Kcur");
 
             // store key and value to memory
             {
@@ -1134,6 +1186,7 @@ static bool llama_eval_internal(
                         ggml_permute(ctx0,
                             Qcur,
                             0, 2, 1, 3);
+            ggml_set_name(Q, "Q");
 
             struct ggml_tensor * K =
                 ggml_permute(ctx0,
@@ -1141,21 +1194,28 @@ static bool llama_eval_internal(
                         ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
                         n_embd/n_head, n_head, n_past + N),
                     0, 2, 1, 3);
+            ggml_set_name(K, "K");
 
             // K * Q
             struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            ggml_set_name(KQ, "KQ");
 
             // KQ_scaled = KQ / sqrt(n_embd/n_head)
-            struct ggml_tensor *
-
-
-
+            struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head));
+            ggml_set_name(KQ_scale, "1/sqrt(n_embd/n_head)");
+
+            // KQ_scaled shape [n_past + N, N, n_head, 1]
+            struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
+            ggml_set_name(KQ_scaled, "KQ_scaled");
 
             // KQ_masked = mask_past(KQ_scaled)
-            struct ggml_tensor * KQ_masked =
+            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
+            ggml_set_name(KQ_masked, "KQ_masked");
 
             // KQ = soft_max(KQ_masked)
-            struct ggml_tensor * KQ_soft_max =
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
+            ggml_set_name(KQ_soft_max, "KQ_soft_max");
+
 
             // split cached V into n_head heads
             struct ggml_tensor * V =
@@ -1164,9 +1224,11 @@ static bool llama_eval_internal(
                         n_ctx*ggml_element_size(kv_self.v),
                         n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head,
                         il*n_ctx*ggml_element_size(kv_self.v)*n_embd);
+            ggml_set_name(V, "V");
 
 #if 1
             struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+            ggml_set_name(KQV, "KQV");
 #else
             // make V contiguous in memory to speed up the matmul, however we waste time on the copy
             // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
@@ -1177,11 +1239,13 @@ static bool llama_eval_internal(
 
             // KQV_merged = KQV.permute(0, 2, 1, 3)
             struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            ggml_set_name(KQV_merged, "KQV_merged");
 
             // cur = KQV_merged.contiguous().view(n_embd, N)
             cur = ggml_cpy(ctx0,
                     KQV_merged,
                     ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+            ggml_set_name(cur, "KQV_merged_contiguous");
 
             // projection (no bias)
             cur = ggml_mul_mat(ctx0,
@@ -1253,7 +1317,7 @@ static bool llama_eval_internal(
         lctx.use_buf(ctx0, -1);
 
         // logits -> probs
-        //inpL =
+        //inpL = ggml_soft_max_inplace(ctx0, inpL);
 
         // run the computation
         ggml_build_forward_expand(&gf, inpL);
@@ -1273,6 +1337,9 @@ static bool llama_eval_internal(
     //embd_w.resize(n_vocab*N);
     //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
 
+    // update kv token count
+    lctx.model.kv_self.n = n_past + N;
+
     // extract logits
     {
         auto & logits_out = lctx.logits;
@@ -1288,7 +1355,7 @@ static bool llama_eval_internal(
     }
 
     // extract embeddings
-    if (lctx.embedding.
+    if (!lctx.embedding.empty()) {
         auto & embedding_out = lctx.embedding;
 
         embedding_out.resize(n_embd);
@@ -1339,6 +1406,8 @@ struct llama_sp_symbol {
     size_t n;
 };
 
+static_assert(std::is_trivially_copyable<llama_sp_symbol>::value, "llama_sp_symbol is not trivially copyable");
+
 struct llama_sp_bigram {
     struct comparator {
         bool operator()(llama_sp_bigram & l, llama_sp_bigram & r) {
@@ -1371,7 +1440,7 @@ struct llama_tokenizer {
             sym.prev = index - 1;
             sym.next = offs == text.size() ? -1 : index + 1;
             index++;
-            symbols_.emplace_back(
+            symbols_.emplace_back(sym);
         }
 
         // seed the work queue with all possible 2-character tokens.
@@ -1462,12 +1531,12 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
     llama_tokenizer tokenizer(vocab);
     std::vector<llama_vocab::id> output;
 
-    if (text.
+    if (text.empty()) {
         return output;
     }
 
     if (bos) {
-        output.push_back(
+        output.push_back(llama_token_bos());
     }
 
     tokenizer.tokenize(text, output);
@@ -1478,109 +1547,402 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
|
|
1478
1547
|
// sampling
|
1479
1548
|
//
|
1480
1549
|
|
1481
|
-
|
1482
|
-
|
1483
|
-
|
1484
|
-
|
1485
|
-
|
1486
|
-
|
1487
|
-
|
1488
|
-
|
1550
|
+
void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates) {
|
1551
|
+
assert(candidates->size > 0);
|
1552
|
+
|
1553
|
+
const int64_t t_start_sample_us = ggml_time_us();
|
1554
|
+
|
1555
|
+
// Sort the logits in descending order
|
1556
|
+
if (!candidates->sorted) {
|
1557
|
+
std::sort(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
|
1558
|
+
return a.logit > b.logit;
|
1559
|
+
});
|
1560
|
+
candidates->sorted = true;
|
1561
|
+
}
|
1489
1562
|
|
1490
|
-
|
1563
|
+
float max_l = candidates->data[0].logit;
|
1564
|
+
float cum_sum = 0.0f;
|
1565
|
+
for (size_t i = 0; i < candidates->size; ++i) {
|
1566
|
+
float p = expf(candidates->data[i].logit - max_l);
|
1567
|
+
candidates->data[i].p = p;
|
1568
|
+
cum_sum += p;
|
1569
|
+
}
|
1570
|
+
for (size_t i = 0; i < candidates->size; ++i) {
|
1571
|
+
candidates->data[i].p /= cum_sum;
|
1572
|
+
}
|
1573
|
+
|
1574
|
+
if (ctx) {
|
1575
|
+
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
1576
|
+
}
|
1491
1577
|
}
|
1492
1578
|
|
1493
|
-
|
1494
|
-
|
1495
|
-
|
1496
|
-
|
1497
|
-
|
1498
|
-
|
1499
|
-
|
1500
|
-
|
1501
|
-
|
1502
|
-
|
1503
|
-
|
1504
|
-
|
1505
|
-
|
1506
|
-
|
1507
|
-
|
1508
|
-
// select the token with the highest logit directly
|
1509
|
-
float max_logit = plogits[0];
|
1510
|
-
llama_vocab::id max_id = 0;
|
1511
|
-
|
1512
|
-
for (int i = 1; i < n_logits; ++i) {
|
1513
|
-
if (plogits[i] > max_logit) {
|
1514
|
-
max_logit = plogits[i];
|
1515
|
-
max_id = i;
|
1516
|
-
}
|
1579
|
+
void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep) {
|
1580
|
+
const int64_t t_start_sample_us = ggml_time_us();
|
1581
|
+
|
1582
|
+
k = std::max(k, (int) min_keep);
|
1583
|
+
k = std::min(k, (int) candidates->size);
|
1584
|
+
|
1585
|
+
// Sort scores in descending order
|
1586
|
+
if (!candidates->sorted) {
|
1587
|
+
auto comp = [](const llama_token_data & a, const llama_token_data & b) {
|
1588
|
+
return a.logit > b.logit;
|
1589
|
+
};
|
1590
|
+
if (k == (int) candidates->size) {
|
1591
|
+
std::sort(candidates->data, candidates->data + candidates->size, comp);
|
1592
|
+
} else {
|
1593
|
+
std::partial_sort(candidates->data, candidates->data + k, candidates->data + candidates->size, comp);
|
1517
1594
|
}
|
1518
|
-
|
1595
|
+
candidates->sorted = true;
|
1519
1596
|
}
|
1597
|
+
candidates->size = k;
|
1520
1598
|
|
1521
|
-
|
1522
|
-
|
1599
|
+
if (ctx) {
|
1600
|
+
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
1601
|
+
}
|
1602
|
+
}
|
1523
1603
|
|
1524
|
-
|
1525
|
-
|
1526
|
-
|
1527
|
-
|
1528
|
-
|
1529
|
-
|
1530
|
-
|
1531
|
-
|
1532
|
-
|
1533
|
-
|
1534
|
-
|
1535
|
-
|
1536
|
-
|
1537
|
-
|
1538
|
-
|
1604
|
+
void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
|
1605
|
+
if (p >= 1.0f) {
|
1606
|
+
return;
|
1607
|
+
}
|
1608
|
+
|
1609
|
+
const int64_t t_start_sample_us = ggml_time_us();
|
1610
|
+
|
1611
|
+
llama_sample_softmax(ctx, candidates);
|
1612
|
+
|
1613
|
+
// Compute the cumulative probabilities
|
1614
|
+
float cum_sum = 0.0f;
|
1615
|
+
size_t last_idx = candidates->size;
|
1616
|
+
|
1617
|
+
for (size_t i = 0; i < candidates->size; ++i) {
|
1618
|
+
cum_sum += candidates->data[i].p;
|
1619
|
+
|
1620
|
+
// Check if the running sum is greater than p or if we have kept at least min_keep tokens
|
1621
|
+
if (cum_sum > p && i >= min_keep) {
|
1622
|
+
last_idx = i;
|
1623
|
+
break;
|
1539
1624
|
}
|
1540
1625
|
}
|
1541
1626
|
|
1542
|
-
|
1627
|
+
// Resize the output vector to keep only the top-p tokens
|
1628
|
+
candidates->size = last_idx;
|
1543
1629
|
|
1544
|
-
|
1545
|
-
|
1546
|
-
|
1630
|
+
if (ctx) {
|
1631
|
+
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
1632
|
+
}
|
1633
|
+
}
|
1634
|
+
|
1635
|
+
void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep) {
|
1636
|
+
if (z >= 1.0f || candidates->size <= 2) {
|
1637
|
+
return;
|
1638
|
+
}
|
1547
1639
|
|
1548
|
-
|
1549
|
-
|
1550
|
-
|
1551
|
-
|
1552
|
-
|
1553
|
-
|
1640
|
+
const int64_t t_start_sample_us = ggml_time_us();
|
1641
|
+
|
1642
|
+
llama_sample_softmax(nullptr, candidates);
|
1643
|
+
|
1644
|
+
// Compute the first and second derivatives
|
1645
|
+
std::vector<float> first_derivatives(candidates->size - 1);
|
1646
|
+
std::vector<float> second_derivatives(candidates->size - 2);
|
1647
|
+
|
1648
|
+
for (size_t i = 0; i < first_derivatives.size(); ++i) {
|
1649
|
+
first_derivatives[i] = candidates->data[i].p - candidates->data[i + 1].p;
|
1650
|
+
}
|
1651
|
+
for (size_t i = 0; i < second_derivatives.size(); ++i) {
|
1652
|
+
second_derivatives[i] = first_derivatives[i] - first_derivatives[i + 1];
|
1554
1653
|
}
|
1555
1654
|
|
1556
|
-
//
|
1557
|
-
for (
|
1558
|
-
|
1655
|
+
// Calculate absolute value of second derivatives
|
1656
|
+
for (size_t i = 0; i < second_derivatives.size(); ++i) {
|
1657
|
+
second_derivatives[i] = abs(second_derivatives[i]);
|
1559
1658
|
}
|
1560
1659
|
|
1561
|
-
|
1562
|
-
|
1563
|
-
|
1564
|
-
|
1565
|
-
|
1566
|
-
|
1567
|
-
|
1568
|
-
|
1569
|
-
|
1660
|
+
// Normalize the second derivatives
|
1661
|
+
float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
|
1662
|
+
for (float & value : second_derivatives) {
|
1663
|
+
value /= second_derivatives_sum;
|
1664
|
+
}
|
1665
|
+
|
1666
|
+
float cum_sum = 0.0f;
|
1667
|
+
size_t last_idx = candidates->size;
|
1668
|
+
for (size_t i = 0; i < second_derivatives.size(); ++i) {
|
1669
|
+
cum_sum += second_derivatives[i];
|
1670
|
+
|
1671
|
+
// Check if the running sum is greater than z or if we have kept at least min_keep tokens
|
1672
|
+
if (cum_sum > z && i >= min_keep) {
|
1673
|
+
last_idx = i;
|
1674
|
+
break;
|
1570
1675
|
}
|
1571
1676
|
}
|
1572
1677
|
|
1573
|
-
//
|
1574
|
-
|
1575
|
-
|
1576
|
-
|
1577
|
-
|
1578
|
-
|
1678
|
+
// Resize the output vector to keep only the tokens above the tail location
|
1679
|
+
candidates->size = last_idx;
|
1680
|
+
|
1681
|
+
if (ctx) {
|
1682
|
+
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
1683
|
+
}
|
1684
|
+
}
|
1685
|
+
|
1686
|
+
|
1687
|
+
void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
|
1688
|
+
// Reference implementation:
|
1689
|
+
// https://github.com/huggingface/transformers/compare/main...cimeister:typical-sampling:typical-pr
|
1690
|
+
if (p >= 1.0f) {
|
1691
|
+
return;
|
1692
|
+
}
|
1693
|
+
|
1694
|
+
const int64_t t_start_sample_us = ggml_time_us();
|
1695
|
+
|
1696
|
+
// Compute the softmax of logits and calculate entropy
|
1697
|
+
llama_sample_softmax(nullptr, candidates);
|
1698
|
+
|
1699
|
+
float entropy = 0.0f;
|
1700
|
+
for (size_t i = 0; i < candidates->size; ++i) {
|
1701
|
+
entropy += -candidates->data[i].p * logf(candidates->data[i].p);
|
1702
|
+
}
|
1703
|
+
|
1704
|
+
// Compute the absolute difference between negative log probability and entropy for each candidate
|
1705
|
+
std::vector<float> shifted_scores;
|
1706
|
+
for (size_t i = 0; i < candidates->size; ++i) {
|
1707
|
+
float shifted_score = fabsf(-logf(candidates->data[i].p) - entropy);
|
1708
|
+
shifted_scores.push_back(shifted_score);
|
1709
|
+
}
|
1710
|
+
|
1711
|
+
// Sort tokens based on the shifted_scores and their corresponding indices
|
1712
|
+
std::vector<size_t> indices(candidates->size);
|
1713
|
+
std::iota(indices.begin(), indices.end(), 0);
|
1714
|
+
|
1715
|
+
std::sort(indices.begin(), indices.end(), [&](size_t a, size_t b) {
|
1716
|
+
return shifted_scores[a] < shifted_scores[b];
|
1717
|
+
});
|
1718
|
+
|
1719
|
+
// Compute the cumulative probabilities
|
1720
|
+
float cum_sum = 0.0f;
|
1721
|
+
size_t last_idx = indices.size();
|
1722
|
+
|
1723
|
+
for (size_t i = 0; i < indices.size(); ++i) {
|
1724
|
+
size_t idx = indices[i];
|
1725
|
+
cum_sum += candidates->data[idx].p;
|
1726
|
+
|
1727
|
+
// Check if the running sum is greater than typical or if we have kept at least min_keep tokens
|
1728
|
+
if (cum_sum > p && i >= min_keep - 1) {
|
1729
|
+
last_idx = i + 1;
|
1730
|
+
break;
|
1731
|
+
}
|
1732
|
+
}
|
1733
|
+
|
1734
|
+
// Resize the output vector to keep only the locally typical tokens
|
1735
|
+
std::vector<llama_token_data> new_candidates;
|
1736
|
+
for (size_t i = 0; i < last_idx; ++i) {
|
1737
|
+
size_t idx = indices[i];
|
1738
|
+
new_candidates.push_back(candidates->data[idx]);
|
1739
|
+
}
|
1740
|
+
|
1741
|
+
// Replace the data in candidates with the new_candidates data
|
1742
|
+
std::copy(new_candidates.begin(), new_candidates.end(), candidates->data);
|
1743
|
+
candidates->size = new_candidates.size();
|
1744
|
+
|
1745
|
+
if (ctx) {
|
1746
|
+
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
1747
|
+
}
|
1748
|
+
}
|
1749
|
+
|
1750
|
+
void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
|
1751
|
+
const int64_t t_start_sample_us = ggml_time_us();
|
1752
|
+
|
1753
|
+
for (size_t i = 0; i < candidates_p->size; ++i) {
|
1754
|
+
candidates_p->data[i].logit /= temp;
|
1755
|
+
}
|
1756
|
+
|
1757
|
+
if (ctx) {
|
1758
|
+
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
1759
|
+
}
|
1760
|
+
}
|
1761
|
+
|
1762
|
+
void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty) {
|
1763
|
+
if (last_tokens_size == 0 || penalty == 1.0f) {
|
1764
|
+
return;
|
1765
|
+
}
|
1766
|
+
|
1767
|
+
const int64_t t_start_sample_us = ggml_time_us();
|
1768
|
+
|
1769
|
+
for (size_t i = 0; i < candidates->size; ++i) {
|
1770
|
+
const auto * token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id);
|
1771
|
+
if (token_iter == last_tokens + last_tokens_size) {
|
1772
|
+
continue;
|
1773
|
+
}
|
1774
|
+
|
1775
|
+
// The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong.
|
1776
|
+
// This is common fix for this problem, which is to multiply by the penalty instead of dividing.
|
1777
|
+
if (candidates->data[i].logit <= 0) {
|
1778
|
+
candidates->data[i].logit *= penalty;
|
1779
|
+
} else {
|
1780
|
+
candidates->data[i].logit /= penalty;
|
1781
|
+
}
|
1782
|
+
}
|
1783
|
+
|
1784
|
+
candidates->sorted = false;
|
1785
|
+
|
1786
|
+
if (ctx) {
|
1787
|
+
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
1788
|
+
}
|
1789
|
+
}
|
1790
|
+
|
1791
|
+
void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens_p, size_t last_tokens_size, float alpha_frequency, float alpha_presence) {
|
1792
|
+
if (last_tokens_size == 0 || (alpha_frequency == 0.0f && alpha_presence == 0.0f)) {
|
1793
|
+
return;
|
1794
|
+
}
|
1795
|
+
|
1796
|
+
const int64_t t_start_sample_us = ggml_time_us();
|
1797
|
+
|
1798
|
+
// Create a frequency map to count occurrences of each token in last_tokens
|
1799
|
+
std::unordered_map<llama_token, int> token_count;
|
1800
|
+
for (size_t i = 0; i < last_tokens_size; ++i) {
|
1801
|
+
token_count[last_tokens_p[i]]++;
|
1802
|
+
}
|
1803
|
+
|
1804
|
+
// Apply frequency and presence penalties to the candidates
|
1805
|
+
for (size_t i = 0; i < candidates->size; ++i) {
|
1806
|
+
auto token_iter = token_count.find(candidates->data[i].id);
|
1807
|
+
if (token_iter == token_count.end()) {
|
1808
|
+
continue;
|
1809
|
+
}
|
1810
|
+
|
1811
|
+
int count = token_iter->second;
|
1812
|
+
candidates->data[i].logit -= float(count) * alpha_frequency + float(count > 0) * alpha_presence;
|
1813
|
+
}
|
1814
|
+
|
1815
|
+
candidates->sorted = false;
|
1816
|
+
|
1817
|
+
if (ctx) {
|
1818
|
+
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
1819
|
+
}
|
1820
|
+
}
|
1821
|
+
|
1822
|
+
|
1823
|
+
llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu) {
|
1824
|
+
assert(ctx);
|
1825
|
+
auto N = float(llama_n_vocab(ctx));
|
1826
|
+
int64_t t_start_sample_us;
|
1827
|
+
t_start_sample_us = ggml_time_us();
|
1828
|
+
|
1829
|
+
llama_sample_softmax(nullptr, candidates);
|
1830
|
+
|
1831
|
+
// Estimate s_hat using the most probable m tokens
|
1832
|
+
float s_hat = 0.0;
|
1833
|
+
float sum_ti_bi = 0.0;
|
1834
|
+
float sum_ti_sq = 0.0;
|
1835
|
+
for (size_t i = 0; i < size_t(m - 1) && i < candidates->size - 1; ++i) {
|
1836
|
+
float t_i = logf(float(i + 2) / float(i + 1));
|
1837
|
+
float b_i = logf(candidates->data[i].p / candidates->data[i + 1].p);
|
1838
|
+
sum_ti_bi += t_i * b_i;
|
1839
|
+
sum_ti_sq += t_i * t_i;
|
1840
|
+
}
|
1841
|
+
s_hat = sum_ti_bi / sum_ti_sq;
|
1842
|
+
|
1843
|
+
// Compute k from the estimated s_hat and target surprise value
|
1844
|
+
float epsilon_hat = s_hat - 1;
|
1845
|
+
float k = powf((epsilon_hat * powf(2, *mu)) / (1 - powf(N, -epsilon_hat)), 1 / s_hat);
|
1846
|
+
|
1847
|
+
// Sample the next word X using top-k sampling
|
1848
|
+
llama_sample_top_k(nullptr, candidates, int(k), 1);
|
1849
|
+
if (ctx) {
|
1850
|
+
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
1851
|
+
}
|
1852
|
+
llama_token X = llama_sample_token(ctx, candidates);
|
1853
|
+
t_start_sample_us = ggml_time_us();
|
1854
|
+
|
1855
|
+
// Compute error as the difference between observed surprise and target surprise value
|
1856
|
+
size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
|
1857
|
+
return candidate.id == X;
|
1858
|
+
}));
|
1859
|
+
float observed_surprise = -log2f(candidates->data[X_idx].p);
|
1860
|
+
float e = observed_surprise - tau;
|
1861
|
+
|
1862
|
+
// Update mu using the learning rate and error
|
1863
|
+
*mu = *mu - eta * e;
|
1864
|
+
|
1865
|
+
if (ctx) {
|
1866
|
+
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
1867
|
+
ctx->n_sample++;
|
1868
|
+
}
|
1869
|
+
return X;
|
1870
|
+
}
|
1871
|
+
|
1872
|
+
llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu) {
|
1873
|
+
assert(ctx);
|
1874
|
+
int64_t t_start_sample_us;
|
1875
|
+
t_start_sample_us = ggml_time_us();
|
1876
|
+
|
1877
|
+
llama_sample_softmax(ctx, candidates);
|
1878
|
+
|
1879
|
+
// Truncate the words with surprise values greater than mu
|
1880
|
+
candidates->size = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
|
1881
|
+
return -log2f(candidate.p) > *mu;
|
1882
|
+
}));
|
1883
|
+
|
1884
|
+
// Normalize the probabilities of the remaining words
|
1885
|
+
llama_sample_softmax(ctx, candidates);
|
1886
|
+
|
1887
|
+
// Sample the next word X from the remaining words
|
1888
|
+
if (ctx) {
|
1889
|
+
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
1890
|
+
}
|
1891
|
+
llama_token X = llama_sample_token(ctx, candidates);
|
1892
|
+
t_start_sample_us = ggml_time_us();
|
1893
|
+
|
1894
|
+
// Compute error as the difference between observed surprise and target surprise value
|
1895
|
+
size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
|
1896
|
+
return candidate.id == X;
|
1897
|
+
}));
|
1898
|
+
float observed_surprise = -log2f(candidates->data[X_idx].p);
|
1899
|
+
float e = observed_surprise - tau;
|
1900
|
+
|
1901
|
+
// Update mu using the learning rate and error
|
1902
|
+
*mu = *mu - eta * e;
|
1903
|
+
|
1904
|
+
if (ctx) {
|
1905
|
+
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
1906
|
+
}
|
1907
|
+
return X;
|
1908
|
+
}
|
1909
|
+
|
1910
|
+
llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates) {
|
1911
|
+
const int64_t t_start_sample_us = ggml_time_us();
|
1912
|
+
|
1913
|
+
// Find max element
|
1914
|
+
auto * max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
|
1915
|
+
return a.logit < b.logit;
|
1916
|
+
});
|
1917
|
+
|
1918
|
+
llama_token result = max_iter->id;
|
1919
|
+
if (ctx) {
|
1920
|
+
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
1921
|
+
ctx->n_sample++;
|
1922
|
+
}
|
1923
|
+
return result;
|
1924
|
+
}
|
1925
|
+
|
1926
|
+
llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) {
|
1927
|
+
assert(ctx);
|
1928
|
+
const int64_t t_start_sample_us = ggml_time_us();
|
1929
|
+
llama_sample_softmax(nullptr, candidates);
|
1930
|
+
|
1931
|
+
std::vector<float> probs;
|
1932
|
+
probs.reserve(candidates->size);
|
1933
|
+
for (size_t i = 0; i < candidates->size; ++i) {
|
1934
|
+
probs.push_back(candidates->data[i].p);
|
1935
|
+
}
|
1579
1936
|
|
1580
1937
|
std::discrete_distribution<> dist(probs.begin(), probs.end());
|
1938
|
+
auto & rng = ctx->rng;
|
1581
1939
|
int idx = dist(rng);
|
1582
1940
|
|
1583
|
-
|
1941
|
+
llama_token result = candidates->data[idx].id;
|
1942
|
+
|
1943
|
+
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
1944
|
+
ctx->n_sample++;
|
1945
|
+
return result;
|
1584
1946
|
}
|
1585
1947
|
|
1586
1948
|
//
|
@@ -1592,8 +1954,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
1592
1954
|
switch (ftype) {
|
1593
1955
|
case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
|
1594
1956
|
case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
|
1595
|
-
case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
|
1596
|
-
case LLAMA_FTYPE_MOSTLY_Q4_3: quantized_type = GGML_TYPE_Q4_3; break;
|
1597
1957
|
case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
|
1598
1958
|
case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
|
1599
1959
|
case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
|
@@ -1604,7 +1964,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
1604
1964
|
nthread = std::thread::hardware_concurrency();
|
1605
1965
|
}
|
1606
1966
|
|
1607
|
-
std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp
|
1967
|
+
std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false,
|
1608
1968
|
/*vocab_only*/ false));
|
1609
1969
|
llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
|
1610
1970
|
|
@@ -1658,7 +2018,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
1658
2018
|
} else if (tensor.type == GGML_TYPE_F16) {
|
1659
2019
|
f32_conv_buf.resize(nelements * sizeof(float));
|
1660
2020
|
f32_data = (float *) f32_conv_buf.addr;
|
1661
|
-
auto f16_data = (const ggml_fp16_t *) tensor.data;
|
2021
|
+
const auto * f16_data = (const ggml_fp16_t *) tensor.data;
|
1662
2022
|
for (size_t i = 0; i < nelements; i++) {
|
1663
2023
|
f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
|
1664
2024
|
}
|
@@ -1689,21 +2049,31 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
1689
2049
|
size_t first = counter; counter += chunk_size;
|
1690
2050
|
if (first >= nelements) {
|
1691
2051
|
if (!local_hist.empty()) {
|
1692
|
-
for (int j=0; j<int(local_hist.size()); ++j)
|
2052
|
+
for (int j=0; j<int(local_hist.size()); ++j) {
|
2053
|
+
hist_cur[j] += local_hist[j];
|
2054
|
+
}
|
1693
2055
|
new_size += local_size;
|
1694
2056
|
}
|
1695
2057
|
break;
|
1696
2058
|
}
|
1697
2059
|
lock.unlock();
|
1698
2060
|
size_t last = std::min(nelements, first + chunk_size);
|
1699
|
-
if (local_hist.empty())
|
2061
|
+
if (local_hist.empty()) {
|
2062
|
+
local_hist.resize(hist_cur.size(), 0);
|
2063
|
+
}
|
1700
2064
|
local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
|
1701
2065
|
}
|
1702
2066
|
};
|
1703
|
-
if (int
|
1704
|
-
|
2067
|
+
if ((int) workers.size() < nthread_use - 1) {
|
2068
|
+
workers.resize(nthread_use - 1);
|
2069
|
+
}
|
2070
|
+
for (int it = 0; it < nthread_use - 1; ++it) {
|
2071
|
+
workers[it] = std::thread(compute);
|
2072
|
+
}
|
1705
2073
|
compute();
|
1706
|
-
for (int it = 0; it < nthread_use - 1; ++it)
|
2074
|
+
for (int it = 0; it < nthread_use - 1; ++it) {
|
2075
|
+
workers[it].join();
|
2076
|
+
}
|
1707
2077
|
}
|
1708
2078
|
|
1709
2079
|
printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
|
@@ -1749,7 +2119,7 @@ struct llama_context * llama_init_from_file(
|
|
1749
2119
|
|
1750
2120
|
llama_context * ctx = new llama_context;
|
1751
2121
|
|
1752
|
-
if (params.seed
|
2122
|
+
if (params.seed < 0) {
|
1753
2123
|
params.seed = time(NULL);
|
1754
2124
|
}
|
1755
2125
|
|
@@ -1775,7 +2145,7 @@ struct llama_context * llama_init_from_file(
|
|
1775
2145
|
|
1776
2146
|
ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
|
1777
2147
|
|
1778
|
-
if (!llama_model_load(path_model, *ctx, params.n_ctx, memory_type,
|
2148
|
+
if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_gpu_layers, memory_type,
|
1779
2149
|
params.use_mmap, params.use_mlock, params.vocab_only,
|
1780
2150
|
params.progress_callback, params.progress_callback_user_data)) {
|
1781
2151
|
fprintf(stderr, "%s: failed to load model\n", __func__);
|
@@ -1901,7 +2271,8 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
|
|
1901
2271
|
fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
|
1902
2272
|
model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false));
|
1903
2273
|
|
1904
|
-
size_t ctx_size
|
2274
|
+
size_t ctx_size;
|
2275
|
+
size_t mmapped_size;
|
1905
2276
|
model_loader->calc_sizes(&ctx_size, &mmapped_size);
|
1906
2277
|
base_buf.resize(ctx_size);
|
1907
2278
|
|
@@ -1940,8 +2311,12 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
|
|
1940
2311
|
fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
|
1941
2312
|
}
|
1942
2313
|
|
1943
|
-
std::string name
|
1944
|
-
|
2314
|
+
std::string name;
|
2315
|
+
{
|
2316
|
+
char buf[1024];
|
2317
|
+
fin.read(buf, length);
|
2318
|
+
name = std::string(buf, length);
|
2319
|
+
}
|
1945
2320
|
|
1946
2321
|
// check for lora suffix and get the type of tensor
|
1947
2322
|
const std::string lora_suffix = ".lora";
|
@@ -1956,7 +2331,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
|
|
1956
2331
|
base_name.erase(pos);
|
1957
2332
|
// fprintf(stderr, "%s: %s => %s (lora type %s) ", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());
|
1958
2333
|
|
1959
|
-
if (model_tensors.find(base_name
|
2334
|
+
if (model_tensors.find(base_name) == model_tensors.end()) {
|
1960
2335
|
fprintf(stderr, "%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
|
1961
2336
|
return 1;
|
1962
2337
|
}
|
@@ -2036,7 +2411,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
|
|
2036
2411
|
|
2037
2412
|
if (scaling != 1.0f) {
|
2038
2413
|
ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
|
2039
|
-
BA =
|
2414
|
+
BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor);
|
2040
2415
|
}
|
2041
2416
|
|
2042
2417
|
ggml_tensor * r;
|
@@ -2058,8 +2433,9 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
|
|
2058
2433
|
lora_tensors.clear();
|
2059
2434
|
|
2060
2435
|
n_tensors++;
|
2061
|
-
if (n_tensors % 4 == 0)
|
2436
|
+
if (n_tensors % 4 == 0) {
|
2062
2437
|
fprintf(stderr, ".");
|
2438
|
+
}
|
2063
2439
|
}
|
2064
2440
|
}
|
2065
2441
|
|
@@ -2084,21 +2460,21 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
|
|
2084
2460
|
}
|
2085
2461
|
}
|
2086
2462
|
|
2087
|
-
int llama_get_kv_cache_token_count(struct llama_context * ctx) {
|
2463
|
+
int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
|
2088
2464
|
return ctx->model.kv_self.n;
|
2089
2465
|
}
|
2090
2466
|
|
2091
|
-
#define LLAMA_MAX_RNG_STATE 64*1024
|
2467
|
+
#define LLAMA_MAX_RNG_STATE (64*1024)
|
2092
2468
|
|
2093
2469
|
void llama_set_rng_seed(struct llama_context * ctx, int seed) {
|
2094
|
-
if (seed
|
2470
|
+
if (seed < 0) {
|
2095
2471
|
seed = time(NULL);
|
2096
2472
|
}
|
2097
2473
|
ctx->rng.seed(seed);
|
2098
2474
|
}
|
2099
2475
|
|
2100
|
-
// Returns the size of the state
|
2101
|
-
size_t llama_get_state_size(struct llama_context * ctx) {
|
2476
|
+
// Returns the *maximum* size of the state
|
2477
|
+
size_t llama_get_state_size(const struct llama_context * ctx) {
|
2102
2478
|
// we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
|
2103
2479
|
// for reference, std::mt19937(1337) serializes to 6701 bytes.
|
2104
2480
|
const size_t s_rng_size = sizeof(size_t);
|
@@ -2129,8 +2505,8 @@ size_t llama_get_state_size(struct llama_context * ctx) {
|
|
2129
2505
|
}
|
2130
2506
|
|
2131
2507
|
// Copies the state to the specified destination address
|
2132
|
-
size_t llama_copy_state_data(struct llama_context * ctx, uint8_t *
|
2133
|
-
uint8_t * out =
|
2508
|
+
size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
|
2509
|
+
uint8_t * out = dst;
|
2134
2510
|
|
2135
2511
|
// copy rng
|
2136
2512
|
{
|
@@ -2176,36 +2552,70 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
|
|
2176
2552
|
|
2177
2553
|
// copy kv cache
|
2178
2554
|
{
|
2179
|
-
const
|
2555
|
+
const auto & kv_self = ctx->model.kv_self;
|
2556
|
+
const auto & hparams = ctx->model.hparams;
|
2557
|
+
const int n_layer = hparams.n_layer;
|
2558
|
+
const int n_embd = hparams.n_embd;
|
2559
|
+
const int n_ctx = hparams.n_ctx;
|
2560
|
+
|
2561
|
+
const size_t kv_size = kv_self.buf.size;
|
2180
2562
|
const int kv_ntok = llama_get_kv_cache_token_count(ctx);
|
2181
2563
|
|
2182
2564
|
memcpy(out, &kv_size, sizeof(kv_size)); out += sizeof(kv_size);
|
2183
2565
|
memcpy(out, &kv_ntok, sizeof(kv_ntok)); out += sizeof(kv_ntok);
|
2184
2566
|
|
2185
2567
|
if (kv_size) {
|
2186
|
-
|
2568
|
+
const size_t elt_size = ggml_element_size(kv_self.k);
|
2569
|
+
|
2570
|
+
char buffer[4096];
|
2571
|
+
|
2572
|
+
ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
|
2573
|
+
ggml_cgraph gf{};
|
2574
|
+
gf.n_threads = 1;
|
2575
|
+
|
2576
|
+
ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
|
2577
|
+
kout3d->data = out;
|
2578
|
+
out += ggml_nbytes(kout3d);
|
2579
|
+
|
2580
|
+
ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
|
2581
|
+
vout3d->data = out;
|
2582
|
+
out += ggml_nbytes(vout3d);
|
2583
|
+
|
2584
|
+
ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
|
2585
|
+
n_embd, kv_ntok, n_layer,
|
2586
|
+
elt_size*n_embd, elt_size*n_embd*n_ctx, 0);
|
2587
|
+
|
2588
|
+
ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
|
2589
|
+
kv_ntok, n_embd, n_layer,
|
2590
|
+
elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
|
2591
|
+
|
2592
|
+
ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
|
2593
|
+
ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
|
2594
|
+
ggml_graph_compute(cpy_ctx, &gf);
|
2595
|
+
|
2596
|
+
ggml_free(cpy_ctx);
|
2187
2597
|
}
|
2188
2598
|
}
|
2189
2599
|
|
2190
|
-
const size_t written = out -
|
2191
|
-
const size_t
|
2600
|
+
const size_t written = out - dst;
|
2601
|
+
const size_t max_size = llama_get_state_size(ctx);
|
2192
2602
|
|
2193
|
-
LLAMA_ASSERT(written
|
2603
|
+
LLAMA_ASSERT(written <= max_size);
|
2194
2604
|
|
2195
2605
|
return written;
|
2196
2606
|
}
|
2197
2607
|
|
2198
2608
|
// Sets the state reading from the specified source address
|
2199
2609
|
size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
|
2200
|
-
const uint8_t *
|
2610
|
+
const uint8_t * inp = src;
|
2201
2611
|
|
2202
2612
|
// set rng
|
2203
2613
|
{
|
2204
2614
|
size_t rng_size;
|
2205
2615
|
char rng_buf[LLAMA_MAX_RNG_STATE];
|
2206
2616
|
|
2207
|
-
memcpy(&rng_size,
|
2208
|
-
memcpy(&rng_buf[0],
|
2617
|
+
memcpy(&rng_size, inp, sizeof(rng_size)); inp += sizeof(rng_size);
|
2618
|
+
memcpy(&rng_buf[0], inp, LLAMA_MAX_RNG_STATE); inp += LLAMA_MAX_RNG_STATE;
|
2209
2619
|
|
2210
2620
|
std::stringstream rng_ss;
|
2211
2621
|
rng_ss.str(std::string(&rng_buf[0], rng_size));
|
@@ -2219,65 +2629,171 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
|
|
2219
2629
|
size_t logits_cap;
|
2220
2630
|
size_t logits_size;
|
2221
2631
|
|
2222
|
-
memcpy(&logits_cap,
|
2223
|
-
memcpy(&logits_size,
|
2632
|
+
memcpy(&logits_cap, inp, sizeof(logits_cap)); inp += sizeof(logits_cap);
|
2633
|
+
memcpy(&logits_size, inp, sizeof(logits_size)); inp += sizeof(logits_size);
|
2224
2634
|
|
2225
2635
|
LLAMA_ASSERT(ctx->logits.capacity() == logits_cap);
|
2226
2636
|
|
2227
2637
|
if (logits_size) {
|
2228
2638
|
ctx->logits.resize(logits_size);
|
2229
|
-
memcpy(ctx->logits.data(),
|
2639
|
+
memcpy(ctx->logits.data(), inp, logits_size * sizeof(float));
|
2230
2640
|
}
|
2231
2641
|
|
2232
|
-
|
2642
|
+
inp += logits_cap * sizeof(float);
|
2233
2643
|
}
|
2234
2644
|
|
2235
2645
|
// set embeddings
|
2236
2646
|
{
|
2237
2647
|
size_t embedding_size;
|
2238
2648
|
|
2239
|
-
memcpy(&embedding_size,
|
2649
|
+
memcpy(&embedding_size, inp, sizeof(embedding_size)); inp += sizeof(embedding_size);
|
2240
2650
|
|
2241
2651
|
LLAMA_ASSERT(ctx->embedding.capacity() == embedding_size);
|
2242
2652
|
|
2243
2653
|
if (embedding_size) {
|
2244
|
-
memcpy(ctx->embedding.data(),
|
2245
|
-
|
2654
|
+
memcpy(ctx->embedding.data(), inp, embedding_size * sizeof(float));
|
2655
|
+
inp += embedding_size * sizeof(float);
|
2246
2656
|
}
|
2247
2657
|
}
|
2248
2658
|
|
2249
2659
|
// set kv cache
|
2250
2660
|
{
|
2661
|
+
const auto & kv_self = ctx->model.kv_self;
|
2662
|
+
const auto & hparams = ctx->model.hparams;
|
2663
|
+
const int n_layer = hparams.n_layer;
|
2664
|
+
const int n_embd = hparams.n_embd;
|
2665
|
+
const int n_ctx = hparams.n_ctx;
|
2666
|
+
|
2251
2667
|
size_t kv_size;
|
2252
2668
|
int kv_ntok;
|
2253
2669
|
|
2254
|
-
memcpy(&kv_size,
|
2255
|
-
memcpy(&kv_ntok,
|
2670
|
+
memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
|
2671
|
+
memcpy(&kv_ntok, inp, sizeof(kv_ntok)); inp += sizeof(kv_ntok);
|
2256
2672
|
|
2257
2673
|
if (kv_size) {
|
2258
|
-
LLAMA_ASSERT(
|
2674
|
+
LLAMA_ASSERT(kv_self.buf.size == kv_size);
|
2675
|
+
|
2676
|
+
const size_t elt_size = ggml_element_size(kv_self.k);
|
2677
|
+
|
2678
|
+
char buffer[4096];
|
2259
2679
|
|
2260
|
-
|
2261
|
-
|
2680
|
+
ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
|
2681
|
+
ggml_cgraph gf{};
|
2682
|
+
gf.n_threads = 1;
|
2262
2683
|
|
2263
|
-
|
2684
|
+
ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
|
2685
|
+
kin3d->data = (void *) inp;
|
2686
|
+
inp += ggml_nbytes(kin3d);
|
2264
2687
|
|
2265
|
-
|
2266
|
-
|
2688
|
+
ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
|
2689
|
+
vin3d->data = (void *) inp;
|
2690
|
+
inp += ggml_nbytes(vin3d);
|
2267
2691
|
|
2692
|
+
ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
|
2693
|
+
n_embd, kv_ntok, n_layer,
|
2694
|
+
elt_size*n_embd, elt_size*n_embd*n_ctx, 0);
|
2695
|
+
|
2696
|
+
ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
|
2697
|
+
kv_ntok, n_embd, n_layer,
|
2698
|
+
elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
|
2699
|
+
|
2700
|
+
ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
|
2701
|
+
ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
|
2702
|
+
ggml_graph_compute(cpy_ctx, &gf);
|
2703
|
+
|
2704
|
+
ggml_free(cpy_ctx);
|
2268
2705
|
}
|
2269
2706
|
|
2270
2707
|
ctx->model.kv_self.n = kv_ntok;
|
2271
2708
|
}
|
2272
2709
|
|
2273
|
-
const size_t nread =
|
2274
|
-
const size_t
|
2710
|
+
const size_t nread = inp - src;
|
2711
|
+
const size_t max_size = llama_get_state_size(ctx);
|
2275
2712
|
|
2276
|
-
LLAMA_ASSERT(nread
|
2713
|
+
LLAMA_ASSERT(nread <= max_size);
|
2277
2714
|
|
2278
2715
|
return nread;
|
2279
2716
|
}
|
2280
2717
|
|
2718
|
+
bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
|
2719
|
+
llama_file file(path_session, "rb");
|
2720
|
+
|
2721
|
+
// sanity checks
|
2722
|
+
{
|
2723
|
+
const uint32_t magic = file.read_u32();
|
2724
|
+
const uint32_t version = file.read_u32();
|
2725
|
+
|
2726
|
+
if (magic != LLAMA_SESSION_MAGIC || version != LLAMA_SESSION_VERSION) {
|
2727
|
+
fprintf(stderr, "%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
|
2728
|
+
return false;
|
2729
|
+
}
|
2730
|
+
|
2731
|
+
llama_hparams session_hparams;
|
2732
|
+
file.read_raw(&session_hparams, sizeof(llama_hparams));
|
2733
|
+
|
2734
|
+
if (session_hparams != ctx->model.hparams) {
|
2735
|
+
fprintf(stderr, "%s : model hparams didn't match from session file!\n", __func__);
|
2736
|
+
return false;
|
2737
|
+
}
|
2738
|
+
}
|
2739
|
+
|
2740
|
+
// load the prompt
|
2741
|
+
{
|
2742
|
+
const uint32_t n_token_count = file.read_u32();
|
2743
|
+
|
2744
|
+
if (n_token_count > n_token_capacity) {
|
2745
|
+
fprintf(stderr, "%s : token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
|
2746
|
+
return false;
|
2747
|
+
}
|
2748
|
+
|
2749
|
+
file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
|
2750
|
+
*n_token_count_out = n_token_count;
|
2751
|
+
}
|
2752
|
+
|
2753
|
+
// restore the context state
|
2754
|
+
{
|
2755
|
+
const size_t n_state_size_cur = file.size - file.tell();
|
2756
|
+
const size_t n_state_size_max = llama_get_state_size(ctx);
|
2757
|
+
|
2758
|
+
if (n_state_size_cur > n_state_size_max) {
|
2759
|
+
fprintf(stderr, "%s : the state size in session file is too big! max %zu, got %zu\n", __func__, n_state_size_max, n_state_size_cur);
|
2760
|
+
return false;
|
2761
|
+
}
|
2762
|
+
|
2763
|
+
std::vector<uint8_t> state_data(n_state_size_max);
|
2764
|
+
file.read_raw(state_data.data(), n_state_size_cur);
|
2765
|
+
|
2766
|
+
llama_set_state_data(ctx, state_data.data());
|
2767
|
+
}
|
2768
|
+
|
2769
|
+
return true;
|
2770
|
+
}
|
2771
|
+
|
2772
|
+
bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
|
2773
|
+
llama_file file(path_session, "wb");
|
2774
|
+
|
2775
|
+
file.write_u32(LLAMA_SESSION_MAGIC);
|
2776
|
+
file.write_u32(LLAMA_SESSION_VERSION);
|
2777
|
+
|
2778
|
+
file.write_raw(&ctx->model.hparams, sizeof(llama_hparams));
|
2779
|
+
|
2780
|
+
// save the prompt
|
2781
|
+
file.write_u32((uint32_t) n_token_count);
|
2782
|
+
file.write_raw(tokens, sizeof(llama_token) * n_token_count);
|
2783
|
+
|
2784
|
+
// save the context state
|
2785
|
+
{
|
2786
|
+
const size_t n_state_size_max = llama_get_state_size(ctx);
|
2787
|
+
|
2788
|
+
std::vector<uint8_t> state_data(n_state_size_max);
|
2789
|
+
const size_t n_state_size_cur = llama_copy_state_data(ctx, state_data.data());
|
2790
|
+
|
2791
|
+
file.write_raw(state_data.data(), n_state_size_cur);
|
2792
|
+
}
|
2793
|
+
|
2794
|
+
return true;
|
2795
|
+
}
|
2796
|
+
|
2281
2797
|
int llama_eval(
|
2282
2798
|
struct llama_context * ctx,
|
2283
2799
|
const llama_token * tokens,
|
@@ -2288,11 +2804,14 @@ int llama_eval(
         fprintf(stderr, "%s: failed to eval\n", __func__);
         return 1;
     }
+
     // get a more accurate load time, upon first eval
+    // TODO: fix this
     if (!ctx->has_evaluated_once) {
         ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
         ctx->has_evaluated_once = true;
     }
+
     return 0;
 }
 
@@ -2316,15 +2835,15 @@ int llama_tokenize(
     return res.size();
 }
 
-int llama_n_vocab(struct llama_context * ctx) {
+int llama_n_vocab(const struct llama_context * ctx) {
     return ctx->vocab.id_to_token.size();
 }
 
-int llama_n_ctx(struct llama_context * ctx) {
+int llama_n_ctx(const struct llama_context * ctx) {
     return ctx->model.hparams.n_ctx;
 }
 
-int llama_n_embd(struct llama_context * ctx) {
+int llama_n_embd(const struct llama_context * ctx) {
     return ctx->model.hparams.n_embd;
 }
 
@@ -2336,7 +2855,7 @@ float * llama_get_embeddings(struct llama_context * ctx) {
     return ctx->embedding.data();
 }
 
-const char * llama_token_to_str(struct llama_context * ctx, llama_token token) {
+const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) {
     if (token >= llama_n_vocab(ctx)) {
         return nullptr;
     }
@@ -2352,33 +2871,8 @@ llama_token llama_token_eos() {
     return 2;
 }
 
-llama_token
-
-          const llama_token * last_n_tokens_data,
-          int last_n_tokens_size,
-          int top_k,
-          float top_p,
-          float temp,
-          float repeat_penalty) {
-    const int64_t t_start_sample_us = ggml_time_us();
-
-    llama_token result = 0;
-
-    // TODO: avoid this ...
-    const auto last_n_tokens = std::vector<llama_token>(last_n_tokens_data, last_n_tokens_data + last_n_tokens_size);
-
-    result = llama_sample_top_p_top_k(
-            *ctx,
-            last_n_tokens,
-            top_k,
-            top_p,
-            temp,
-            repeat_penalty);
-
-    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    ctx->n_sample++;
-
-    return result;
+llama_token llama_token_nl() {
+    return 13;
 }
 
@@ -2391,9 +2885,9 @@ void llama_print_timings(struct llama_context * ctx) {
 
     fprintf(stderr, "\n");
     fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
-    fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per
+    fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample);
     fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval);
-    fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per
+    fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval);
    fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0);
 }
 
@@ -2430,4 +2924,3 @@ const char * llama_print_system_info(void) {
 std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
     return ctx->model.tensors_by_name;
 }
-
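Note: the largest change in this diff is the removal of the single llama_sample_top_p_top_k helper in favor of the composable llama_sample_* functions (softmax, top-k, top-p, tail-free, typical, temperature, repetition/frequency penalties, mirostat, greedy) that operate on a caller-owned candidate array. As a rough, hedged C++ sketch only of how a caller might drive that new API: the variable names (ctx, last_tokens), the parameter values, and the use of llama_get_logits are illustrative assumptions, not code shipped in this package.

// Illustrative sketch: build a candidate array from the current logits and
// chain the samplers whose signatures appear in this diff. Assumes an
// initialized llama_context * ctx and a std::vector<llama_token> last_tokens
// of recently generated tokens; the cutoff values are arbitrary examples.
std::vector<llama_token_data> candidates;
candidates.reserve(llama_n_vocab(ctx));

const float * logits = llama_get_logits(ctx);
for (llama_token id = 0; id < llama_n_vocab(ctx); ++id) {
    // llama_token_data carries { id, logit, p } for each vocabulary entry
    candidates.push_back(llama_token_data{ id, logits[id], 0.0f });
}

llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

// penalize recently generated tokens, then truncate the distribution and sample
llama_sample_repetition_penalty(ctx, &candidates_p, last_tokens.data(), last_tokens.size(), 1.10f);
llama_sample_top_k(ctx, &candidates_p, 40, 1);
llama_sample_top_p(ctx, &candidates_p, 0.95f, 1);
llama_sample_temperature(ctx, &candidates_p, 0.80f);
llama_token next_token = llama_sample_token(ctx, &candidates_p);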