llama_cpp 0.3.3 → 0.3.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +31 -0
- data/ext/llama_cpp/extconf.rb +1 -0
- data/ext/llama_cpp/llama_cpp.cpp +439 -9
- data/ext/llama_cpp/src/ggml-cuda.cu +759 -136
- data/ext/llama_cpp/src/ggml-metal.h +7 -0
- data/ext/llama_cpp/src/ggml-metal.m +250 -111
- data/ext/llama_cpp/src/ggml-metal.metal +614 -483
- data/ext/llama_cpp/src/ggml.c +793 -1032
- data/ext/llama_cpp/src/ggml.h +95 -18
- data/ext/llama_cpp/src/k_quants.c +327 -3
- data/ext/llama_cpp/src/k_quants.h +8 -0
- data/ext/llama_cpp/src/llama.cpp +626 -166
- data/ext/llama_cpp/src/llama.h +94 -10
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -0
- data/sig/llama_cpp.rbs +36 -1
- metadata +2 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -67,6 +67,7 @@ enum e_model {
|
|
67
67
|
MODEL_13B,
|
68
68
|
MODEL_30B,
|
69
69
|
MODEL_65B,
|
70
|
+
MODEL_70B,
|
70
71
|
};
|
71
72
|
|
72
73
|
static const size_t kB = 1024;
|
@@ -98,17 +99,18 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
|
|
98
99
|
}
|
99
100
|
|
100
101
|
//
|
101
|
-
// memory sizes
|
102
|
+
// memory sizes (calculated for n_batch == 512)
|
102
103
|
//
|
103
104
|
|
104
|
-
static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
|
105
|
+
static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0(int n_ctx)
|
105
106
|
{
|
106
107
|
static std::map<e_model, size_t> k_sizes = {
|
107
|
-
{ MODEL_3B,
|
108
|
-
{ MODEL_7B,
|
109
|
-
{ MODEL_13B,
|
110
|
-
{ MODEL_30B,
|
111
|
-
{ MODEL_65B,
|
108
|
+
{ MODEL_3B, ((size_t) n_ctx / 16ull + 92ull) * MB },
|
109
|
+
{ MODEL_7B, ((size_t) n_ctx / 16ull + 100ull) * MB },
|
110
|
+
{ MODEL_13B, ((size_t) n_ctx / 12ull + 120ull) * MB },
|
111
|
+
{ MODEL_30B, ((size_t) n_ctx / 9ull + 160ull) * MB },
|
112
|
+
{ MODEL_65B, ((size_t) n_ctx / 6ull + 256ull) * MB }, // guess
|
113
|
+
{ MODEL_70B, ((size_t) n_ctx / 7ull + 164ull) * MB },
|
112
114
|
};
|
113
115
|
return k_sizes;
|
114
116
|
}
|
@@ -116,38 +118,26 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
|
|
116
118
|
static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
|
117
119
|
{
|
118
120
|
static std::map<e_model, size_t> k_sizes = {
|
119
|
-
{ MODEL_3B,
|
120
|
-
{ MODEL_7B,
|
121
|
-
{ MODEL_13B,
|
122
|
-
{ MODEL_30B,
|
123
|
-
{ MODEL_65B,
|
121
|
+
{ MODEL_3B, 128ull * MB },
|
122
|
+
{ MODEL_7B, 160ull * MB },
|
123
|
+
{ MODEL_13B, 192ull * MB },
|
124
|
+
{ MODEL_30B, 256ull * MB },
|
125
|
+
{ MODEL_65B, 384ull * MB }, // guess
|
126
|
+
{ MODEL_70B, 304ull * MB },
|
124
127
|
};
|
125
128
|
return k_sizes;
|
126
129
|
}
|
127
130
|
|
128
|
-
//
|
129
|
-
static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
|
130
|
-
{
|
131
|
-
static std::map<e_model, size_t> k_sizes = {
|
132
|
-
{ MODEL_3B, 682ull * MB },
|
133
|
-
{ MODEL_7B, 1026ull * MB },
|
134
|
-
{ MODEL_13B, 1608ull * MB },
|
135
|
-
{ MODEL_30B, 3124ull * MB },
|
136
|
-
{ MODEL_65B, 5120ull * MB },
|
137
|
-
};
|
138
|
-
return k_sizes;
|
139
|
-
}
|
140
|
-
|
141
|
-
// this is mostly needed for temporary mul_mat buffers to dequantize the data
|
142
|
-
// not actually needed if BLAS is disabled
|
131
|
+
// used to store the compute graph tensors + non-scratch data
|
143
132
|
static const std::map<e_model, size_t> & MEM_REQ_EVAL()
|
144
133
|
{
|
145
134
|
static std::map<e_model, size_t> k_sizes = {
|
146
|
-
{ MODEL_3B,
|
147
|
-
{ MODEL_7B,
|
148
|
-
{ MODEL_13B,
|
149
|
-
{ MODEL_30B,
|
150
|
-
{ MODEL_65B,
|
135
|
+
{ MODEL_3B, 8ull * MB },
|
136
|
+
{ MODEL_7B, 10ull * MB },
|
137
|
+
{ MODEL_13B, 12ull * MB },
|
138
|
+
{ MODEL_30B, 16ull * MB },
|
139
|
+
{ MODEL_65B, 24ull * MB }, // guess
|
140
|
+
{ MODEL_70B, 24ull * MB },
|
151
141
|
};
|
152
142
|
return k_sizes;
|
153
143
|
}
|
@@ -162,6 +152,7 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
|
|
162
152
|
{ MODEL_13B, 640ull * kB },
|
163
153
|
{ MODEL_30B, 768ull * kB },
|
164
154
|
{ MODEL_65B, 1536ull * kB },
|
155
|
+
{ MODEL_70B, 1536ull * kB }, // TODO (likely can be reduced)
|
165
156
|
};
|
166
157
|
return k_sizes;
|
167
158
|
}
|
@@ -176,23 +167,55 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
|
|
176
167
|
{ MODEL_13B, 160ull },
|
177
168
|
{ MODEL_30B, 208ull },
|
178
169
|
{ MODEL_65B, 416ull },
|
170
|
+
{ MODEL_70B, 416ull }, // TODO (likely can be reduced)
|
179
171
|
};
|
180
172
|
return k_sizes;
|
181
173
|
}
|
182
174
|
|
183
175
|
// default hparams (LLaMA 7B)
|
184
176
|
struct llama_hparams {
|
185
|
-
uint32_t n_vocab
|
186
|
-
uint32_t n_ctx
|
187
|
-
uint32_t n_embd
|
188
|
-
uint32_t n_mult
|
189
|
-
uint32_t n_head
|
190
|
-
uint32_t
|
191
|
-
uint32_t
|
177
|
+
uint32_t n_vocab = 32000;
|
178
|
+
uint32_t n_ctx = 512; // this is provided as user input?
|
179
|
+
uint32_t n_embd = 4096;
|
180
|
+
uint32_t n_mult = 256;
|
181
|
+
uint32_t n_head = 32;
|
182
|
+
uint32_t n_head_kv = 32;
|
183
|
+
uint32_t n_layer = 32;
|
184
|
+
uint32_t n_rot = 64;
|
185
|
+
|
186
|
+
// LLaMAv2
|
187
|
+
// TODO: load from model data hparams
|
188
|
+
float f_ffn_mult = 1.0f;
|
189
|
+
float f_rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
|
190
|
+
|
191
|
+
float rope_freq_base = 10000.0f;
|
192
|
+
float rope_freq_scale = 1.0f;
|
193
|
+
|
192
194
|
enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
|
193
195
|
|
194
196
|
bool operator!=(const llama_hparams & other) const {
|
195
|
-
return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams)));
|
197
|
+
return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams))); // NOLINT
|
198
|
+
}
|
199
|
+
|
200
|
+
uint32_t n_gqa() const {
|
201
|
+
return n_head/n_head_kv;
|
202
|
+
}
|
203
|
+
|
204
|
+
uint32_t n_embd_head() const {
|
205
|
+
return n_embd/n_head;
|
206
|
+
}
|
207
|
+
|
208
|
+
uint32_t n_embd_gqa() const {
|
209
|
+
return n_embd/n_gqa();
|
210
|
+
}
|
211
|
+
|
212
|
+
size_t kv_size() const {
|
213
|
+
size_t result = 2ull;
|
214
|
+
result *= (size_t) n_embd_gqa();
|
215
|
+
result *= (size_t) n_ctx;
|
216
|
+
result *= (size_t) n_layer;
|
217
|
+
result *= sizeof(ggml_fp16_t);
|
218
|
+
return result;
|
196
219
|
}
|
197
220
|
};
|
198
221
|
|
@@ -303,7 +326,7 @@ struct llama_model {
|
|
303
326
|
};
|
304
327
|
|
305
328
|
struct llama_context {
|
306
|
-
llama_context(const llama_model & model
|
329
|
+
llama_context(const llama_model & model) : model(model), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
|
307
330
|
#ifdef GGML_USE_METAL
|
308
331
|
~llama_context() {
|
309
332
|
if (ctx_metal) {
|
@@ -324,7 +347,6 @@ struct llama_context {
|
|
324
347
|
int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
|
325
348
|
|
326
349
|
const llama_model & model;
|
327
|
-
const llama_vocab & vocab;
|
328
350
|
|
329
351
|
bool model_owner = false;
|
330
352
|
|
@@ -495,12 +517,16 @@ struct llama_file_loader {
|
|
495
517
|
}
|
496
518
|
void read_hparams() {
|
497
519
|
hparams.n_vocab = file.read_u32();
|
498
|
-
hparams.n_embd
|
499
|
-
hparams.n_mult
|
500
|
-
hparams.n_head
|
520
|
+
hparams.n_embd = file.read_u32();
|
521
|
+
hparams.n_mult = file.read_u32();
|
522
|
+
hparams.n_head = file.read_u32();
|
501
523
|
hparams.n_layer = file.read_u32();
|
502
|
-
hparams.n_rot
|
503
|
-
hparams.ftype
|
524
|
+
hparams.n_rot = file.read_u32();
|
525
|
+
hparams.ftype = (enum llama_ftype) file.read_u32();
|
526
|
+
|
527
|
+
// LLaMAv2
|
528
|
+
// TODO: read from header
|
529
|
+
hparams.n_head_kv = hparams.n_head;
|
504
530
|
}
|
505
531
|
void read_vocab() {
|
506
532
|
vocab.id_to_token.resize(hparams.n_vocab);
|
@@ -551,7 +577,9 @@ struct llama_file_loader {
|
|
551
577
|
}
|
552
578
|
|
553
579
|
// skip to the next multiple of 32 bytes
|
554
|
-
|
580
|
+
if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
|
581
|
+
file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
|
582
|
+
}
|
555
583
|
|
556
584
|
tensor.file_off = file.tell();
|
557
585
|
tensor.name = name;
|
@@ -648,7 +676,7 @@ struct llama_model_loader {
|
|
648
676
|
*ctx_size_p = *mmapped_size_p = 0;
|
649
677
|
for (const llama_load_tensor & lt : tensors_map.tensors) {
|
650
678
|
*ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE;
|
651
|
-
*(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size;
|
679
|
+
*(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size + 16;
|
652
680
|
}
|
653
681
|
}
|
654
682
|
|
@@ -797,7 +825,7 @@ static bool kv_cache_init(
|
|
797
825
|
ggml_type wtype,
|
798
826
|
int n_ctx,
|
799
827
|
int n_gpu_layers) {
|
800
|
-
const int n_embd = hparams.
|
828
|
+
const int n_embd = hparams.n_embd_gqa();
|
801
829
|
const int n_layer = hparams.n_layer;
|
802
830
|
|
803
831
|
const int64_t n_mem = n_layer*n_ctx;
|
@@ -841,9 +869,13 @@ struct llama_context_params llama_context_default_params() {
|
|
841
869
|
/*.seed =*/ LLAMA_DEFAULT_SEED,
|
842
870
|
/*.n_ctx =*/ 512,
|
843
871
|
/*.n_batch =*/ 512,
|
872
|
+
/*.n_gqa =*/ 1,
|
873
|
+
/*.rms_norm_eps =*/ LLAMA_DEFAULT_RMS_EPS,
|
844
874
|
/*.gpu_layers =*/ 0,
|
845
875
|
/*.main_gpu =*/ 0,
|
846
|
-
/*.tensor_split =*/
|
876
|
+
/*.tensor_split =*/ nullptr,
|
877
|
+
/*.rope_freq_base =*/ 10000.0f,
|
878
|
+
/*.rope_freq_scale =*/ 1.0f,
|
847
879
|
/*.progress_callback =*/ nullptr,
|
848
880
|
/*.progress_callback_user_data =*/ nullptr,
|
849
881
|
/*.low_vram =*/ false,
|
@@ -869,6 +901,10 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
|
|
869
901
|
return result;
|
870
902
|
}
|
871
903
|
|
904
|
+
int llama_max_devices() {
|
905
|
+
return LLAMA_MAX_DEVICES;
|
906
|
+
}
|
907
|
+
|
872
908
|
bool llama_mmap_supported() {
|
873
909
|
return llama_mmap::SUPPORTED;
|
874
910
|
}
|
@@ -954,6 +990,7 @@ static const char *llama_model_type_name(e_model type) {
|
|
954
990
|
case MODEL_13B: return "13B";
|
955
991
|
case MODEL_30B: return "30B";
|
956
992
|
case MODEL_65B: return "65B";
|
993
|
+
case MODEL_70B: return "70B";
|
957
994
|
default: LLAMA_ASSERT(false);
|
958
995
|
}
|
959
996
|
}
|
@@ -964,9 +1001,13 @@ static void llama_model_load_internal(
|
|
964
1001
|
llama_vocab & vocab,
|
965
1002
|
int n_ctx,
|
966
1003
|
int n_batch,
|
1004
|
+
int n_gqa,
|
1005
|
+
float rms_norm_eps,
|
967
1006
|
int n_gpu_layers,
|
968
1007
|
int main_gpu,
|
969
1008
|
const float * tensor_split,
|
1009
|
+
float rope_freq_base,
|
1010
|
+
float rope_freq_scale,
|
970
1011
|
bool low_vram,
|
971
1012
|
ggml_type memory_type,
|
972
1013
|
bool use_mmap,
|
@@ -983,8 +1024,12 @@ static void llama_model_load_internal(
|
|
983
1024
|
model.hparams = ml->file_loader->hparams;
|
984
1025
|
model.n_gpu_layers = n_gpu_layers;
|
985
1026
|
llama_file_version file_version = ml->file_loader->file_version;
|
1027
|
+
|
986
1028
|
auto & hparams = model.hparams;
|
987
1029
|
|
1030
|
+
// TODO: read from file
|
1031
|
+
hparams.f_rms_norm_eps = rms_norm_eps;
|
1032
|
+
|
988
1033
|
{
|
989
1034
|
switch (hparams.n_layer) {
|
990
1035
|
case 26: model.type = e_model::MODEL_3B; break;
|
@@ -1001,22 +1046,44 @@ static void llama_model_load_internal(
|
|
1001
1046
|
}
|
1002
1047
|
|
1003
1048
|
hparams.n_ctx = n_ctx;
|
1049
|
+
|
1050
|
+
// LLaMAv2
|
1051
|
+
// TODO: temporary until GGUF
|
1052
|
+
LLAMA_ASSERT(hparams.n_head % n_gqa == 0);
|
1053
|
+
hparams.n_head_kv = hparams.n_head / n_gqa;
|
1054
|
+
if (model.type == e_model::MODEL_65B && n_gqa == 8) {
|
1055
|
+
fprintf(stderr, "%s: warning: assuming 70B model based on GQA == %d\n", __func__, n_gqa);
|
1056
|
+
model.type = e_model::MODEL_70B;
|
1057
|
+
hparams.f_ffn_mult = 1.3f; // from the params.json of the 70B model
|
1058
|
+
}
|
1059
|
+
|
1060
|
+
hparams.rope_freq_base = rope_freq_base;
|
1061
|
+
hparams.rope_freq_scale = rope_freq_scale;
|
1004
1062
|
}
|
1005
1063
|
|
1006
|
-
|
1064
|
+
// ref: https://github.com/facebookresearch/llama/blob/6c7fe276574e78057f917549435a2554000a876d/llama/model.py#L194-L199
|
1065
|
+
const uint32_t n_ff_raw = 2*(4*hparams.n_embd)/3;
|
1066
|
+
const uint32_t n_ff_mult = hparams.f_ffn_mult*n_ff_raw;
|
1067
|
+
const uint32_t n_ff = ((n_ff_mult + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
|
1068
|
+
//const uint32_t n_ff = 28672;
|
1007
1069
|
|
1008
1070
|
{
|
1009
|
-
fprintf(stderr, "%s: format = %s\n",
|
1010
|
-
fprintf(stderr, "%s: n_vocab = %u\n",
|
1011
|
-
fprintf(stderr, "%s: n_ctx = %u\n",
|
1012
|
-
fprintf(stderr, "%s: n_embd = %u\n",
|
1013
|
-
fprintf(stderr, "%s: n_mult = %u\n",
|
1014
|
-
fprintf(stderr, "%s: n_head = %u\n",
|
1015
|
-
fprintf(stderr, "%s:
|
1016
|
-
fprintf(stderr, "%s:
|
1071
|
+
fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version));
|
1072
|
+
fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab);
|
1073
|
+
fprintf(stderr, "%s: n_ctx = %u\n", __func__, hparams.n_ctx);
|
1074
|
+
fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd);
|
1075
|
+
fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult);
|
1076
|
+
fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head);
|
1077
|
+
fprintf(stderr, "%s: n_head_kv = %u\n", __func__, hparams.n_head_kv);
|
1078
|
+
fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer);
|
1079
|
+
fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
|
1080
|
+
fprintf(stderr, "%s: n_gqa = %u\n", __func__, hparams.n_gqa());
|
1081
|
+
fprintf(stderr, "%s: rnorm_eps = %.1e\n", __func__, hparams.f_rms_norm_eps);
|
1082
|
+
fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
|
1083
|
+
fprintf(stderr, "%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base);
|
1084
|
+
fprintf(stderr, "%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale);
|
1017
1085
|
fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
|
1018
|
-
fprintf(stderr, "%s:
|
1019
|
-
fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
|
1086
|
+
fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
|
1020
1087
|
}
|
1021
1088
|
|
1022
1089
|
if (file_version < LLAMA_FILE_VERSION_GGJT_V2) {
|
@@ -1050,7 +1117,7 @@ static void llama_model_load_internal(
|
|
1050
1117
|
{
|
1051
1118
|
model.buf.resize(ctx_size);
|
1052
1119
|
if (use_mlock) {
|
1053
|
-
model.mlock_buf.init(model.buf.addr);
|
1120
|
+
model.mlock_buf.init (model.buf.addr);
|
1054
1121
|
model.mlock_buf.grow_to(model.buf.size);
|
1055
1122
|
}
|
1056
1123
|
|
@@ -1085,9 +1152,10 @@ static void llama_model_load_internal(
|
|
1085
1152
|
size_t vram_weights = 0;
|
1086
1153
|
size_t vram_scratch = 0;
|
1087
1154
|
{
|
1088
|
-
const uint32_t n_embd
|
1089
|
-
const uint32_t
|
1090
|
-
const uint32_t
|
1155
|
+
const uint32_t n_embd = hparams.n_embd;
|
1156
|
+
const uint32_t n_embd_gqa = hparams.n_embd_gqa();
|
1157
|
+
const uint32_t n_layer = hparams.n_layer;
|
1158
|
+
const uint32_t n_vocab = hparams.n_vocab;
|
1091
1159
|
|
1092
1160
|
ml->ggml_ctx = ctx;
|
1093
1161
|
|
@@ -1135,16 +1203,16 @@ static void llama_model_load_internal(
|
|
1135
1203
|
|
1136
1204
|
layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);
|
1137
1205
|
|
1138
|
-
layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd},
|
1139
|
-
layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd,
|
1140
|
-
layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd,
|
1141
|
-
layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd},
|
1206
|
+
layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend_split);
|
1207
|
+
layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd_gqa}, backend_split);
|
1208
|
+
layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd_gqa}, backend_split);
|
1209
|
+
layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend_split);
|
1142
1210
|
|
1143
1211
|
layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);
|
1144
1212
|
|
1145
|
-
layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff},
|
1146
|
-
layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff,
|
1147
|
-
layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff},
|
1213
|
+
layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend_split);
|
1214
|
+
layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend_split);
|
1215
|
+
layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend_split);
|
1148
1216
|
|
1149
1217
|
if (backend == GGML_BACKEND_GPU) {
|
1150
1218
|
vram_weights +=
|
@@ -1165,13 +1233,13 @@ static void llama_model_load_internal(
|
|
1165
1233
|
const size_t mem_required =
|
1166
1234
|
ctx_size +
|
1167
1235
|
mmapped_size - vram_weights + // weights in VRAM not in memory
|
1168
|
-
MEM_REQ_SCRATCH0().at(model.type) +
|
1236
|
+
MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) +
|
1169
1237
|
MEM_REQ_SCRATCH1().at(model.type) +
|
1170
|
-
MEM_REQ_EVAL().at
|
1238
|
+
MEM_REQ_EVAL().at(model.type);
|
1171
1239
|
|
1172
1240
|
// this is the memory required by one llama_state
|
1173
1241
|
const size_t mem_required_state =
|
1174
|
-
scale*
|
1242
|
+
scale*hparams.kv_size();
|
1175
1243
|
|
1176
1244
|
fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
|
1177
1245
|
mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
|
@@ -1212,7 +1280,7 @@ static void llama_model_load_internal(
|
|
1212
1280
|
fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
|
1213
1281
|
} else {
|
1214
1282
|
fprintf(stderr, "%s: offloading v cache to GPU\n", __func__);
|
1215
|
-
vram_kv_cache +=
|
1283
|
+
vram_kv_cache += hparams.kv_size() / 2;
|
1216
1284
|
}
|
1217
1285
|
}
|
1218
1286
|
if (n_gpu_layers > (int) hparams.n_layer + 2) {
|
@@ -1220,7 +1288,7 @@ static void llama_model_load_internal(
|
|
1220
1288
|
fprintf(stderr, "%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
|
1221
1289
|
} else {
|
1222
1290
|
fprintf(stderr, "%s: offloading k cache to GPU\n", __func__);
|
1223
|
-
vram_kv_cache +=
|
1291
|
+
vram_kv_cache += hparams.kv_size() / 2;
|
1224
1292
|
}
|
1225
1293
|
}
|
1226
1294
|
#elif defined(GGML_USE_CLBLAST)
|
@@ -1268,9 +1336,13 @@ static bool llama_model_load(
|
|
1268
1336
|
llama_vocab & vocab,
|
1269
1337
|
int n_ctx,
|
1270
1338
|
int n_batch,
|
1339
|
+
int n_gqa,
|
1340
|
+
float rms_norm_eps,
|
1271
1341
|
int n_gpu_layers,
|
1272
1342
|
int main_gpu,
|
1273
|
-
float * tensor_split,
|
1343
|
+
const float * tensor_split,
|
1344
|
+
float rope_freq_base,
|
1345
|
+
float rope_freq_scale,
|
1274
1346
|
bool low_vram,
|
1275
1347
|
ggml_type memory_type,
|
1276
1348
|
bool use_mmap,
|
@@ -1279,7 +1351,7 @@ static bool llama_model_load(
|
|
1279
1351
|
llama_progress_callback progress_callback,
|
1280
1352
|
void *progress_callback_user_data) {
|
1281
1353
|
try {
|
1282
|
-
llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
|
1354
|
+
llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, rms_norm_eps, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type,
|
1283
1355
|
use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
|
1284
1356
|
return true;
|
1285
1357
|
} catch (const std::exception & err) {
|
@@ -1323,12 +1395,22 @@ static bool llama_eval_internal(
|
|
1323
1395
|
|
1324
1396
|
LLAMA_ASSERT(!!kv_self.ctx);
|
1325
1397
|
|
1326
|
-
const
|
1327
|
-
const
|
1328
|
-
const
|
1329
|
-
const
|
1330
|
-
const
|
1331
|
-
const
|
1398
|
+
const int64_t n_embd = hparams.n_embd;
|
1399
|
+
const int64_t n_layer = hparams.n_layer;
|
1400
|
+
const int64_t n_ctx = hparams.n_ctx;
|
1401
|
+
const int64_t n_head = hparams.n_head;
|
1402
|
+
const int64_t n_head_kv = hparams.n_head_kv;
|
1403
|
+
const int64_t n_embd_head = hparams.n_embd_head();
|
1404
|
+
const int64_t n_vocab = hparams.n_vocab;
|
1405
|
+
const int64_t n_embd_gqa = hparams.n_embd_gqa();
|
1406
|
+
|
1407
|
+
|
1408
|
+
LLAMA_ASSERT(n_embd_head == hparams.n_rot);
|
1409
|
+
|
1410
|
+
const float freq_base = hparams.rope_freq_base;
|
1411
|
+
const float freq_scale = hparams.rope_freq_scale;
|
1412
|
+
const float rms_norm_eps = hparams.f_rms_norm_eps;
|
1413
|
+
|
1332
1414
|
const int n_gpu_layers = model.n_gpu_layers;
|
1333
1415
|
|
1334
1416
|
auto & mem_per_token = lctx.mem_per_token;
|
@@ -1342,7 +1424,7 @@ static bool llama_eval_internal(
|
|
1342
1424
|
|
1343
1425
|
struct ggml_context * ctx0 = ggml_init(params);
|
1344
1426
|
|
1345
|
-
ggml_cgraph gf =
|
1427
|
+
ggml_cgraph * gf = ggml_new_graph(ctx0);
|
1346
1428
|
|
1347
1429
|
// for big prompts, if BLAS is enabled, it is better to use only one thread
|
1348
1430
|
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
|
@@ -1407,7 +1489,7 @@ static bool llama_eval_internal(
|
|
1407
1489
|
|
1408
1490
|
// norm
|
1409
1491
|
{
|
1410
|
-
cur = ggml_rms_norm(ctx0, inpL);
|
1492
|
+
cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
|
1411
1493
|
offload_func(cur);
|
1412
1494
|
ggml_set_name(cur, "rms_norm_0");
|
1413
1495
|
|
@@ -1428,11 +1510,11 @@ static bool llama_eval_internal(
|
|
1428
1510
|
offload_func_kq(tmpq);
|
1429
1511
|
ggml_set_name(tmpq, "tmpq");
|
1430
1512
|
|
1431
|
-
struct ggml_tensor * Kcur =
|
1513
|
+
struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
|
1432
1514
|
offload_func_kq(Kcur);
|
1433
1515
|
ggml_set_name(Kcur, "Kcur");
|
1434
1516
|
|
1435
|
-
struct ggml_tensor * Qcur =
|
1517
|
+
struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
|
1436
1518
|
offload_func_kq(Qcur);
|
1437
1519
|
ggml_set_name(Qcur, "Qcur");
|
1438
1520
|
|
@@ -1444,23 +1526,23 @@ static bool llama_eval_internal(
|
|
1444
1526
|
offload_func_v(tmpv);
|
1445
1527
|
ggml_set_name(tmpv, "tmpv");
|
1446
1528
|
|
1447
|
-
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv,
|
1529
|
+
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, N));
|
1448
1530
|
offload_func_v(Vcur);
|
1449
1531
|
ggml_set_name(Vcur, "Vcur");
|
1450
1532
|
|
1451
|
-
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*
|
1533
|
+
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
|
1452
1534
|
offload_func_kq(k);
|
1453
1535
|
ggml_set_name(k, "k");
|
1454
1536
|
|
1455
|
-
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N,
|
1537
|
+
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
|
1456
1538
|
( n_ctx)*ggml_element_size(kv_self.v),
|
1457
|
-
(il*n_ctx)*ggml_element_size(kv_self.v)*
|
1539
|
+
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
|
1458
1540
|
offload_func_v(v);
|
1459
1541
|
ggml_set_name(v, "v");
|
1460
1542
|
|
1461
1543
|
// important: storing RoPE-ed version of K in the KV cache!
|
1462
|
-
ggml_build_forward_expand(
|
1463
|
-
ggml_build_forward_expand(
|
1544
|
+
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
|
1545
|
+
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
|
1464
1546
|
}
|
1465
1547
|
|
1466
1548
|
struct ggml_tensor * Q =
|
@@ -1473,8 +1555,8 @@ static bool llama_eval_internal(
|
|
1473
1555
|
struct ggml_tensor * K =
|
1474
1556
|
ggml_permute(ctx0,
|
1475
1557
|
ggml_reshape_3d(ctx0,
|
1476
|
-
ggml_view_1d(ctx0, kv_self.k, (n_past + N)*
|
1477
|
-
|
1558
|
+
ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd_gqa, il*n_ctx*ggml_element_size(kv_self.k)*n_embd_gqa),
|
1559
|
+
n_embd_head, n_head_kv, n_past + N),
|
1478
1560
|
0, 2, 1, 3);
|
1479
1561
|
offload_func_kq(K);
|
1480
1562
|
ggml_set_name(K, "K");
|
@@ -1484,9 +1566,9 @@ static bool llama_eval_internal(
|
|
1484
1566
|
offload_func_kq(KQ);
|
1485
1567
|
ggml_set_name(KQ, "KQ");
|
1486
1568
|
|
1487
|
-
// KQ_scaled = KQ / sqrt(
|
1569
|
+
// KQ_scaled = KQ / sqrt(n_embd_head)
|
1488
1570
|
struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head));
|
1489
|
-
ggml_set_name(KQ_scale, "1/sqrt(
|
1571
|
+
ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
|
1490
1572
|
|
1491
1573
|
// KQ_scaled shape [n_past + N, N, n_head, 1]
|
1492
1574
|
struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
|
@@ -1506,10 +1588,10 @@ static bool llama_eval_internal(
|
|
1506
1588
|
// split cached V into n_head heads
|
1507
1589
|
struct ggml_tensor * V =
|
1508
1590
|
ggml_view_3d(ctx0, kv_self.v,
|
1509
|
-
n_past + N,
|
1591
|
+
n_past + N, n_embd_head, n_head_kv,
|
1510
1592
|
n_ctx*ggml_element_size(kv_self.v),
|
1511
|
-
n_ctx*ggml_element_size(kv_self.v)*
|
1512
|
-
|
1593
|
+
n_ctx*ggml_element_size(kv_self.v)*n_embd_head,
|
1594
|
+
n_ctx*ggml_element_size(kv_self.v)*n_embd_gqa*il);
|
1513
1595
|
offload_func_v(V);
|
1514
1596
|
ggml_set_name(V, "V");
|
1515
1597
|
|
@@ -1521,7 +1603,7 @@ static bool llama_eval_internal(
|
|
1521
1603
|
// make V contiguous in memory to speed up the matmul, however we waste time on the copy
|
1522
1604
|
// on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
|
1523
1605
|
// is there a better way?
|
1524
|
-
struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N,
|
1606
|
+
struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head));
|
1525
1607
|
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
|
1526
1608
|
#endif
|
1527
1609
|
|
@@ -1555,7 +1637,7 @@ static bool llama_eval_internal(
|
|
1555
1637
|
{
|
1556
1638
|
// norm
|
1557
1639
|
{
|
1558
|
-
cur = ggml_rms_norm(ctx0, inpFF);
|
1640
|
+
cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
|
1559
1641
|
offload_func(cur);
|
1560
1642
|
ggml_set_name(cur, "rms_norm_1");
|
1561
1643
|
|
@@ -1608,7 +1690,7 @@ static bool llama_eval_internal(
|
|
1608
1690
|
|
1609
1691
|
// norm
|
1610
1692
|
{
|
1611
|
-
cur = ggml_rms_norm(ctx0, inpL);
|
1693
|
+
cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
|
1612
1694
|
offload_func_nr(cur);
|
1613
1695
|
ggml_set_name(cur, "rms_norm_2");
|
1614
1696
|
|
@@ -1630,16 +1712,22 @@ static bool llama_eval_internal(
|
|
1630
1712
|
//cur = ggml_soft_max_inplace(ctx0, cur);
|
1631
1713
|
|
1632
1714
|
// run the computation
|
1633
|
-
ggml_build_forward_expand(
|
1715
|
+
ggml_build_forward_expand(gf, cur);
|
1716
|
+
|
1717
|
+
// fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf.n_nodes, gf.n_leafs);
|
1634
1718
|
|
1635
1719
|
#if GGML_USE_MPI
|
1636
|
-
ggml_mpi_graph_compute_pre(lctx.ctx_mpi,
|
1720
|
+
ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
|
1637
1721
|
#endif
|
1638
1722
|
|
1639
1723
|
#ifdef GGML_USE_METAL
|
1640
1724
|
if (lctx.ctx_metal && N == 1) {
|
1725
|
+
// TODO: disabled until #2413 is resolved
|
1726
|
+
//if (!ggml_metal_if_optimized(lctx.ctx_metal)) {
|
1727
|
+
// ggml_metal_graph_find_concurrency(lctx.ctx_metal, gf);
|
1728
|
+
//}
|
1641
1729
|
ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
|
1642
|
-
ggml_metal_graph_compute(lctx.ctx_metal,
|
1730
|
+
ggml_metal_graph_compute(lctx.ctx_metal, gf);
|
1643
1731
|
ggml_metal_get_tensor (lctx.ctx_metal, cur);
|
1644
1732
|
} else {
|
1645
1733
|
// IMPORTANT:
|
@@ -1658,34 +1746,34 @@ static bool llama_eval_internal(
|
|
1658
1746
|
ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
|
1659
1747
|
}
|
1660
1748
|
|
1661
|
-
ggml_graph_compute_helper(lctx.work_buffer,
|
1749
|
+
ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
|
1662
1750
|
}
|
1663
1751
|
#else
|
1664
|
-
ggml_graph_compute_helper(lctx.work_buffer,
|
1752
|
+
ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
|
1665
1753
|
#endif
|
1666
1754
|
|
1667
1755
|
#if GGML_USE_MPI
|
1668
|
-
ggml_mpi_graph_compute_post(lctx.ctx_mpi,
|
1756
|
+
ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
|
1669
1757
|
#endif
|
1670
1758
|
|
1671
1759
|
// update kv token count
|
1672
1760
|
lctx.kv_self.n = n_past + N;
|
1673
1761
|
|
1674
|
-
struct ggml_tensor * res = gf
|
1762
|
+
struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
|
1675
1763
|
|
1676
1764
|
if (cgraph_fname) {
|
1677
|
-
ggml_graph_export(
|
1765
|
+
ggml_graph_export(gf, cgraph_fname);
|
1678
1766
|
}
|
1679
1767
|
|
1680
1768
|
#ifdef GGML_PERF
|
1681
1769
|
// print timing information per ggml operation (for debugging purposes)
|
1682
1770
|
// requires GGML_PERF to be defined
|
1683
|
-
ggml_graph_print(
|
1771
|
+
ggml_graph_print(gf);
|
1684
1772
|
#endif
|
1685
1773
|
|
1686
1774
|
// plot the computation graph in dot format (for debugging purposes)
|
1687
1775
|
//if (n_past%100 == 0) {
|
1688
|
-
// ggml_graph_dump_dot(
|
1776
|
+
// ggml_graph_dump_dot(gf, NULL, "llama.dot");
|
1689
1777
|
//}
|
1690
1778
|
|
1691
1779
|
// extract logits
|
@@ -1715,10 +1803,12 @@ static bool llama_eval_internal(
|
|
1715
1803
|
}
|
1716
1804
|
|
1717
1805
|
#if 0
|
1718
|
-
printf("\n%s: used_mem
|
1806
|
+
printf("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__,
|
1719
1807
|
ggml_used_mem(ctx0)/1024.0/1024.0,
|
1720
1808
|
lctx.get_buf_max_mem(0)/1024.0/1024.0,
|
1721
|
-
lctx.get_buf_max_mem(1)/1024.0/1024.0
|
1809
|
+
lctx.get_buf_max_mem(1)/1024.0/1024.0,
|
1810
|
+
lctx.work_buffer.size()/1024.0/1024.0,
|
1811
|
+
n_past, N);
|
1722
1812
|
#endif
|
1723
1813
|
|
1724
1814
|
ggml_free(ctx0);
|
@@ -1891,6 +1981,279 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
|
|
1891
1981
|
return output;
|
1892
1982
|
}
|
1893
1983
|
|
1984
|
+
//
|
1985
|
+
// grammar - internal
|
1986
|
+
//
|
1987
|
+
|
1988
|
+
struct llama_grammar {
|
1989
|
+
const std::vector<std::vector<llama_grammar_element>> rules;
|
1990
|
+
std::vector<std::vector<const llama_grammar_element *>> stacks;
|
1991
|
+
};
|
1992
|
+
|
1993
|
+
struct llama_grammar_candidate {
|
1994
|
+
size_t index;
|
1995
|
+
const uint32_t * code_points;
|
1996
|
+
};
|
1997
|
+
|
1998
|
+
// NOTE: assumes valid utf8 (but checks for overrun)
|
1999
|
+
// adds a terminating 0 for use as pointer
|
2000
|
+
std::vector<uint32_t> decode_utf8(const char * src) {
|
2001
|
+
static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
|
2002
|
+
const char * pos = src;
|
2003
|
+
std::vector<uint32_t> code_points;
|
2004
|
+
while (*pos != 0) {
|
2005
|
+
uint8_t first_byte = static_cast<uint8_t>(*pos);
|
2006
|
+
uint8_t highbits = first_byte >> 4;
|
2007
|
+
int len = lookup[highbits];
|
2008
|
+
uint8_t mask = (1 << (8 - len)) - 1;
|
2009
|
+
uint32_t value = first_byte & mask;
|
2010
|
+
const char * end = pos + len; // may overrun!
|
2011
|
+
++pos;
|
2012
|
+
for ( ; pos < end && *pos != 0; ++pos) {
|
2013
|
+
value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
|
2014
|
+
}
|
2015
|
+
code_points.push_back(value);
|
2016
|
+
}
|
2017
|
+
code_points.push_back(0);
|
2018
|
+
return code_points;
|
2019
|
+
}
|
2020
|
+
|
2021
|
+
// returns true iff pos points to the end of one of the definitions of a rule
|
2022
|
+
static bool llama_grammar_is_end_of_sequence(const llama_grammar_element * pos) {
|
2023
|
+
switch (pos->type) {
|
2024
|
+
case LLAMA_GRETYPE_END: return true;
|
2025
|
+
case LLAMA_GRETYPE_ALT: return true;
|
2026
|
+
default: return false;
|
2027
|
+
}
|
2028
|
+
}
|
2029
|
+
|
2030
|
+
// returns true iff chr satisfies the char range at pos (regular or inverse range)
|
2031
|
+
// asserts that pos is pointing to a char range element
|
2032
|
+
static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
|
2033
|
+
const llama_grammar_element * pos,
|
2034
|
+
const uint32_t chr) {
|
2035
|
+
|
2036
|
+
bool found = false;
|
2037
|
+
bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
|
2038
|
+
LLAMA_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT);
|
2039
|
+
|
2040
|
+
do {
|
2041
|
+
if (pos[1].type == LLAMA_GRETYPE_CHAR_RNG_UPPER) {
|
2042
|
+
// inclusive range, e.g. [a-z]
|
2043
|
+
found = found || (pos->value <= chr && chr <= pos[1].value);
|
2044
|
+
pos += 2;
|
2045
|
+
} else {
|
2046
|
+
// exact char match, e.g. [a] or "a"
|
2047
|
+
found = found || pos->value == chr;
|
2048
|
+
pos += 1;
|
2049
|
+
}
|
2050
|
+
} while (pos->type == LLAMA_GRETYPE_CHAR_ALT);
|
2051
|
+
|
2052
|
+
return std::make_pair(found == is_positive_char, pos);
|
2053
|
+
}
|
2054
|
+
|
2055
|
+
// transforms a grammar pushdown stack into N possible stacks, all ending
|
2056
|
+
// at a character range (terminal element)
|
2057
|
+
static void llama_grammar_advance_stack(
|
2058
|
+
const std::vector<std::vector<llama_grammar_element>> & rules,
|
2059
|
+
const std::vector<const llama_grammar_element *> & stack,
|
2060
|
+
std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {
|
2061
|
+
|
2062
|
+
if (stack.empty()) {
|
2063
|
+
new_stacks.push_back(stack);
|
2064
|
+
return;
|
2065
|
+
}
|
2066
|
+
|
2067
|
+
const llama_grammar_element * pos = stack.back();
|
2068
|
+
|
2069
|
+
switch (pos->type) {
|
2070
|
+
case LLAMA_GRETYPE_RULE_REF: {
|
2071
|
+
const size_t rule_id = static_cast<size_t>(pos->value);
|
2072
|
+
const llama_grammar_element * subpos = rules[rule_id].data();
|
2073
|
+
do {
|
2074
|
+
// init new stack without the top (pos)
|
2075
|
+
std::vector<const llama_grammar_element *> new_stack(stack.begin(), stack.end() - 1);
|
2076
|
+
if (!llama_grammar_is_end_of_sequence(pos + 1)) {
|
2077
|
+
// if this rule ref is followed by another element, add that to stack
|
2078
|
+
new_stack.push_back(pos + 1);
|
2079
|
+
}
|
2080
|
+
if (!llama_grammar_is_end_of_sequence(subpos)) {
|
2081
|
+
// if alternate is nonempty, add to stack
|
2082
|
+
new_stack.push_back(subpos);
|
2083
|
+
}
|
2084
|
+
llama_grammar_advance_stack(rules, new_stack, new_stacks);
|
2085
|
+
while (!llama_grammar_is_end_of_sequence(subpos)) {
|
2086
|
+
// scan to end of alternate def
|
2087
|
+
subpos++;
|
2088
|
+
}
|
2089
|
+
if (subpos->type == LLAMA_GRETYPE_ALT) {
|
2090
|
+
// there's another alternate def of this rule to process
|
2091
|
+
subpos++;
|
2092
|
+
} else {
|
2093
|
+
break;
|
2094
|
+
}
|
2095
|
+
} while (true);
|
2096
|
+
break;
|
2097
|
+
}
|
2098
|
+
case LLAMA_GRETYPE_CHAR:
|
2099
|
+
case LLAMA_GRETYPE_CHAR_NOT:
|
2100
|
+
new_stacks.push_back(stack);
|
2101
|
+
break;
|
2102
|
+
default:
|
2103
|
+
// end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
|
2104
|
+
// (LLAMA_GRETYPE_CHAR_ALT, LLAMA_GRETYPE_CHAR_RNG_UPPER); stack should never be left on
|
2105
|
+
// those
|
2106
|
+
LLAMA_ASSERT(false);
|
2107
|
+
}
|
2108
|
+
}
|
2109
|
+
|
2110
|
+
// takes a set of possible pushdown stacks on a grammar, which are required to
|
2111
|
+
// be positioned at a character range (see `llama_grammar_advance_stack`), and
|
2112
|
+
// produces the N possible stacks if the given char is accepted at those
|
2113
|
+
// positions
|
2114
|
+
static std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
|
2115
|
+
const std::vector<std::vector<llama_grammar_element>> & rules,
|
2116
|
+
const std::vector<std::vector<const llama_grammar_element *>> & stacks,
|
2117
|
+
const uint32_t chr) {
|
2118
|
+
|
2119
|
+
std::vector<std::vector<const llama_grammar_element *>> new_stacks;
|
2120
|
+
|
2121
|
+
for (const auto & stack : stacks) {
|
2122
|
+
if (stack.empty()) {
|
2123
|
+
continue;
|
2124
|
+
}
|
2125
|
+
|
2126
|
+
auto match = llama_grammar_match_char(stack.back(), chr);
|
2127
|
+
if (match.first) {
|
2128
|
+
const llama_grammar_element * pos = match.second;
|
2129
|
+
|
2130
|
+
// update top of stack to next element, if any
|
2131
|
+
std::vector<const llama_grammar_element *> new_stack(stack.begin(), stack.end() - 1);
|
2132
|
+
if (!llama_grammar_is_end_of_sequence(pos)) {
|
2133
|
+
new_stack.push_back(pos);
|
2134
|
+
}
|
2135
|
+
llama_grammar_advance_stack(rules, new_stack, new_stacks);
|
2136
|
+
}
|
2137
|
+
}
|
2138
|
+
|
2139
|
+
return new_stacks;
|
2140
|
+
}
|
2141
|
+
|
2142
|
+
static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
|
2143
|
+
const std::vector<std::vector<llama_grammar_element>> & rules,
|
2144
|
+
const std::vector<std::vector<const llama_grammar_element *>> & stacks,
|
2145
|
+
const std::vector<llama_grammar_candidate> & candidates);
|
2146
|
+
|
2147
|
+
static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_stack(
|
2148
|
+
const std::vector<std::vector<llama_grammar_element>> & rules,
|
2149
|
+
const std::vector<const llama_grammar_element *> & stack,
|
2150
|
+
const std::vector<llama_grammar_candidate> & candidates) {
|
2151
|
+
|
2152
|
+
std::vector<llama_grammar_candidate> rejects;
|
2153
|
+
|
2154
|
+
if (stack.empty()) {
|
2155
|
+
// accept nothing; EOS is handled elsewhere
|
2156
|
+
rejects.insert(rejects.end(), candidates.begin(), candidates.end());
|
2157
|
+
return rejects;
|
2158
|
+
}
|
2159
|
+
|
2160
|
+
const llama_grammar_element * stack_pos = stack.back();
|
2161
|
+
|
2162
|
+
std::vector<llama_grammar_candidate> next_candidates;
|
2163
|
+
for (auto tok : candidates) {
|
2164
|
+
if (llama_grammar_match_char(stack_pos, tok.code_points[0]).first) {
|
2165
|
+
if (tok.code_points[1] != 0) {
|
2166
|
+
next_candidates.push_back({ tok.index, tok.code_points + 1 });
|
2167
|
+
}
|
2168
|
+
} else {
|
2169
|
+
rejects.push_back(tok);
|
2170
|
+
}
|
2171
|
+
}
|
2172
|
+
|
2173
|
+
auto stack_pos_after = llama_grammar_match_char(stack_pos, 0).second;
|
2174
|
+
|
2175
|
+
// update top of stack to next element, if any
|
2176
|
+
std::vector<const llama_grammar_element *> stack_after(stack.begin(), stack.end() - 1);
|
2177
|
+
if (!llama_grammar_is_end_of_sequence(stack_pos_after)) {
|
2178
|
+
stack_after.push_back(stack_pos_after);
|
2179
|
+
}
|
2180
|
+
std::vector<std::vector<const llama_grammar_element *>> next_stacks;
|
2181
|
+
llama_grammar_advance_stack(rules, stack_after, next_stacks);
|
2182
|
+
|
2183
|
+
auto next_rejects = llama_grammar_reject_candidates(rules, next_stacks, next_candidates);
|
2184
|
+
for (auto tok : next_rejects) {
|
2185
|
+
rejects.push_back({ tok.index, tok.code_points - 1 });
|
2186
|
+
}
|
2187
|
+
|
2188
|
+
return rejects;
|
2189
|
+
}
|
2190
|
+
|
2191
|
+
static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
|
2192
|
+
const std::vector<std::vector<llama_grammar_element>> & rules,
|
2193
|
+
const std::vector<std::vector<const llama_grammar_element *>> & stacks,
|
2194
|
+
const std::vector<llama_grammar_candidate> & candidates) {
|
2195
|
+
LLAMA_ASSERT(!stacks.empty()); // REVIEW
|
2196
|
+
|
2197
|
+
if (candidates.empty()) {
|
2198
|
+
return std::vector<llama_grammar_candidate>();
|
2199
|
+
}
|
2200
|
+
|
2201
|
+
auto rejects = llama_grammar_reject_candidates_for_stack(rules, stacks.front(), candidates);
|
2202
|
+
|
2203
|
+
for (size_t i = 1, size = stacks.size(); i < size; ++i) {
|
2204
|
+
rejects = llama_grammar_reject_candidates_for_stack(rules, stacks[i], rejects);
|
2205
|
+
}
|
2206
|
+
return rejects;
|
2207
|
+
}
|
2208
|
+
|
2209
|
+
//
|
2210
|
+
// grammar - external
|
2211
|
+
//
|
2212
|
+
|
2213
|
+
struct llama_grammar * llama_grammar_init(
|
2214
|
+
const llama_grammar_element ** rules,
|
2215
|
+
size_t n_rules,
|
2216
|
+
size_t start_rule_index) {
|
2217
|
+
const llama_grammar_element * pos;
|
2218
|
+
|
2219
|
+
// copy rule definitions into vectors
|
2220
|
+
std::vector<std::vector<llama_grammar_element>> vec_rules(n_rules);
|
2221
|
+
for (size_t i = 0; i < n_rules; i++) {
|
2222
|
+
for (pos = rules[i]; pos->type != LLAMA_GRETYPE_END; pos++) {
|
2223
|
+
vec_rules[i].push_back(*pos);
|
2224
|
+
}
|
2225
|
+
vec_rules[i].push_back({LLAMA_GRETYPE_END, 0});
|
2226
|
+
}
|
2227
|
+
|
2228
|
+
// loop over alternates of start rule to build initial stacks
|
2229
|
+
std::vector<std::vector<const llama_grammar_element *>> stacks;
|
2230
|
+
pos = rules[start_rule_index];
|
2231
|
+
do {
|
2232
|
+
std::vector<const llama_grammar_element *> stack;
|
2233
|
+
if (!llama_grammar_is_end_of_sequence(pos)) {
|
2234
|
+
// if alternate is nonempty, add to stack
|
2235
|
+
stack.push_back(pos);
|
2236
|
+
}
|
2237
|
+
llama_grammar_advance_stack(vec_rules, stack, stacks);
|
2238
|
+
while (!llama_grammar_is_end_of_sequence(pos)) {
|
2239
|
+
// scan to end of alternate def
|
2240
|
+
pos++;
|
2241
|
+
}
|
2242
|
+
if (pos->type == LLAMA_GRETYPE_ALT) {
|
2243
|
+
// there's another alternate def of this rule to process
|
2244
|
+
pos++;
|
2245
|
+
} else {
|
2246
|
+
break;
|
2247
|
+
}
|
2248
|
+
} while (true);
|
2249
|
+
|
2250
|
+
return new llama_grammar{ std::move(vec_rules), std::move(stacks) };
|
2251
|
+
}
|
2252
|
+
|
2253
|
+
// Releases a grammar previously allocated by llama_grammar_init.
void llama_grammar_free(struct llama_grammar * grammar) {
    delete grammar;
}
|
2256
|
+
|
1894
2257
|
//
|
1895
2258
|
// sampling
|
1896
2259
|
//
|
@@ -2006,9 +2369,18 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *
|
|
2006
2369
|
}
|
2007
2370
|
|
2008
2371
|
// Normalize the second derivatives
|
2009
|
-
|
2010
|
-
|
2011
|
-
|
2372
|
+
{
|
2373
|
+
const float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
|
2374
|
+
|
2375
|
+
if (second_derivatives_sum > 1e-6f) {
|
2376
|
+
for (float & value : second_derivatives) {
|
2377
|
+
value /= second_derivatives_sum;
|
2378
|
+
}
|
2379
|
+
} else {
|
2380
|
+
for (float & value : second_derivatives) {
|
2381
|
+
value = 1.0f / second_derivatives.size();
|
2382
|
+
}
|
2383
|
+
}
|
2012
2384
|
}
|
2013
2385
|
|
2014
2386
|
float cum_sum = 0.0f;
|
@@ -2167,6 +2539,47 @@ void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, l
|
|
2167
2539
|
}
|
2168
2540
|
}
|
2169
2541
|
|
2542
|
+
void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar) {
|
2543
|
+
assert(ctx);
|
2544
|
+
const int64_t t_start_sample_us = ggml_time_us();
|
2545
|
+
|
2546
|
+
bool allow_eos = false;
|
2547
|
+
for (const auto & stack : grammar->stacks) {
|
2548
|
+
if (stack.empty()) {
|
2549
|
+
allow_eos = true;
|
2550
|
+
break;
|
2551
|
+
}
|
2552
|
+
}
|
2553
|
+
|
2554
|
+
const llama_token eos = llama_token_eos();
|
2555
|
+
|
2556
|
+
std::vector<std::vector<uint32_t>> candidates_decoded;
|
2557
|
+
std::vector<llama_grammar_candidate> candidates_grammar;
|
2558
|
+
|
2559
|
+
for (size_t i = 0; i < candidates->size; ++i) {
|
2560
|
+
const llama_token id = candidates->data[i].id;
|
2561
|
+
const char * str = llama_token_to_str(ctx, id);
|
2562
|
+
if (id == eos) {
|
2563
|
+
if (!allow_eos) {
|
2564
|
+
candidates->data[i].logit = -INFINITY;
|
2565
|
+
}
|
2566
|
+
} else if (*str == 0) {
|
2567
|
+
candidates->data[i].logit = -INFINITY;
|
2568
|
+
} else {
|
2569
|
+
candidates_decoded.push_back(decode_utf8(str));
|
2570
|
+
candidates_grammar.push_back({ i, candidates_decoded.back().data() });
|
2571
|
+
}
|
2572
|
+
}
|
2573
|
+
|
2574
|
+
const auto rejects =
|
2575
|
+
llama_grammar_reject_candidates(grammar->rules, grammar->stacks, candidates_grammar);
|
2576
|
+
for (auto & reject : rejects) {
|
2577
|
+
candidates->data[reject.index].logit = -INFINITY;
|
2578
|
+
}
|
2579
|
+
|
2580
|
+
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
2581
|
+
}
|
2582
|
+
|
2170
2583
|
static void llama_log_softmax(float * array, size_t size) {
|
2171
2584
|
float max_l = *std::max_element(array, array + size);
|
2172
2585
|
float sum = 0.f;
|
@@ -2185,9 +2598,8 @@ void llama_sample_classifier_free_guidance(
|
|
2185
2598
|
struct llama_context * ctx,
|
2186
2599
|
llama_token_data_array * candidates,
|
2187
2600
|
struct llama_context * guidance_ctx,
|
2188
|
-
float scale
|
2189
|
-
|
2190
|
-
int64_t t_start_sample_us = t_start_sample_us = ggml_time_us();
|
2601
|
+
float scale) {
|
2602
|
+
int64_t t_start_sample_us = ggml_time_us();
|
2191
2603
|
|
2192
2604
|
assert(ctx);
|
2193
2605
|
auto n_vocab = llama_n_vocab(ctx);
|
@@ -2207,16 +2619,7 @@ void llama_sample_classifier_free_guidance(
|
|
2207
2619
|
for (int i = 0; i < n_vocab; ++i) {
|
2208
2620
|
float logit_guidance = logits_guidance[i];
|
2209
2621
|
float logit_base = logits_base[i];
|
2210
|
-
|
2211
|
-
}
|
2212
|
-
|
2213
|
-
llama_log_softmax(logits_guidance, n_vocab);
|
2214
|
-
|
2215
|
-
for (int i = 0; i < n_vocab; ++i) {
|
2216
|
-
float logit_base = logits_base[i];
|
2217
|
-
float logit_guidance = logits_guidance[i];
|
2218
|
-
|
2219
|
-
candidates->data[i].logit = smooth_factor * logit_guidance + (1.f - smooth_factor) * logit_base;
|
2622
|
+
candidates->data[i].logit = scale * (logit_base - logit_guidance) + logit_guidance;
|
2220
2623
|
}
|
2221
2624
|
|
2222
2625
|
if (ctx) {
|
@@ -2352,6 +2755,29 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
|
|
2352
2755
|
return result;
|
2353
2756
|
}
|
2354
2757
|
|
2758
|
+
void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token) {
|
2759
|
+
const int64_t t_start_sample_us = ggml_time_us();
|
2760
|
+
|
2761
|
+
if (token == llama_token_eos()) {
|
2762
|
+
for (const auto & stack : grammar->stacks) {
|
2763
|
+
if (stack.empty()) {
|
2764
|
+
return;
|
2765
|
+
}
|
2766
|
+
}
|
2767
|
+
LLAMA_ASSERT(false);
|
2768
|
+
}
|
2769
|
+
|
2770
|
+
const char * str = llama_token_to_str(ctx, token);
|
2771
|
+
// Note terminating 0 in decoded string
|
2772
|
+
auto code_points = decode_utf8(str);
|
2773
|
+
for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
|
2774
|
+
grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
|
2775
|
+
}
|
2776
|
+
LLAMA_ASSERT(!grammar->stacks.empty());
|
2777
|
+
|
2778
|
+
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
2779
|
+
}
|
2780
|
+
|
2355
2781
|
//
|
2356
2782
|
// quantization
|
2357
2783
|
//
|
@@ -2425,8 +2851,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
2425
2851
|
case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
|
2426
2852
|
case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
|
2427
2853
|
case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
|
2428
|
-
case LLAMA_FTYPE_MOSTLY_F16:
|
2429
|
-
case LLAMA_FTYPE_ALL_F32:
|
2854
|
+
case LLAMA_FTYPE_MOSTLY_F16: quantized_type = GGML_TYPE_F16; break;
|
2855
|
+
case LLAMA_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break;
|
2430
2856
|
|
2431
2857
|
#ifdef GGML_USE_K_QUANTS
|
2432
2858
|
// K-quants
|
@@ -2510,16 +2936,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
2510
2936
|
} else {
|
2511
2937
|
new_type = quantized_type;
|
2512
2938
|
#ifdef GGML_USE_K_QUANTS
|
2513
|
-
bool convert_incompatible_tensor = false;
|
2514
|
-
if (quantized_type == GGML_TYPE_Q2_K || quantized_type == GGML_TYPE_Q3_K || quantized_type == GGML_TYPE_Q4_K ||
|
2515
|
-
quantized_type == GGML_TYPE_Q5_K || quantized_type == GGML_TYPE_Q6_K) {
|
2516
|
-
int nx = tensor.ne.at(0);
|
2517
|
-
int ny = tensor.ne.at(1);
|
2518
|
-
if (nx % QK_K != 0 || ny % QK_K != 0) {
|
2519
|
-
fprintf(stderr, "\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
|
2520
|
-
convert_incompatible_tensor = true;
|
2521
|
-
}
|
2522
|
-
}
|
2523
2939
|
if (tensor.name == "output.weight") {
|
2524
2940
|
int nx = tensor.ne.at(0);
|
2525
2941
|
int ny = tensor.ne.at(1);
|
@@ -2545,6 +2961,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
2545
2961
|
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
|
2546
2962
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
|
2547
2963
|
}
|
2964
|
+
bool convert_incompatible_tensor = false;
|
2965
|
+
if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
|
2966
|
+
new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
|
2967
|
+
int nx = tensor.ne.at(0);
|
2968
|
+
int ny = tensor.ne.at(1);
|
2969
|
+
if (nx % QK_K != 0 || ny % QK_K != 0) {
|
2970
|
+
fprintf(stderr, "\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
|
2971
|
+
convert_incompatible_tensor = true;
|
2972
|
+
}
|
2973
|
+
}
|
2548
2974
|
if (convert_incompatible_tensor) {
|
2549
2975
|
if (tensor.name == "output.weight") {
|
2550
2976
|
new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
|
@@ -2571,7 +2997,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
2571
2997
|
f32_data = (float *) f32_conv_buf.addr;
|
2572
2998
|
}
|
2573
2999
|
|
2574
|
-
printf("quantizing .. ");
|
3000
|
+
printf("quantizing to %s .. ", ggml_type_name(new_type));
|
2575
3001
|
fflush(stdout);
|
2576
3002
|
|
2577
3003
|
work.resize(nelements * 4); // upper bound on size
|
@@ -2674,9 +3100,10 @@ struct llama_model * llama_load_model_from_file(
|
|
2674
3100
|
|
2675
3101
|
ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
|
2676
3102
|
|
2677
|
-
if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
|
2678
|
-
params.main_gpu, params.tensor_split, params.
|
2679
|
-
params.
|
3103
|
+
if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gqa, params.rms_norm_eps, params.n_gpu_layers,
|
3104
|
+
params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
|
3105
|
+
memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback,
|
3106
|
+
params.progress_callback_user_data)) {
|
2680
3107
|
delete model;
|
2681
3108
|
fprintf(stderr, "%s: failed to load model\n", __func__);
|
2682
3109
|
return nullptr;
|
@@ -2697,7 +3124,7 @@ struct llama_context * llama_new_context_with_model(
|
|
2697
3124
|
return nullptr;
|
2698
3125
|
}
|
2699
3126
|
|
2700
|
-
llama_context * ctx = new llama_context(*model
|
3127
|
+
llama_context * ctx = new llama_context(*model);
|
2701
3128
|
|
2702
3129
|
if (params.seed == LLAMA_DEFAULT_SEED) {
|
2703
3130
|
params.seed = time(NULL);
|
@@ -2751,9 +3178,9 @@ struct llama_context * llama_new_context_with_model(
|
|
2751
3178
|
ctx->embedding.resize(hparams.n_embd);
|
2752
3179
|
}
|
2753
3180
|
|
2754
|
-
ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));
|
3181
|
+
ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead());
|
2755
3182
|
|
2756
|
-
ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type));
|
3183
|
+
ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type));
|
2757
3184
|
ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
|
2758
3185
|
}
|
2759
3186
|
|
@@ -2775,7 +3202,7 @@ struct llama_context * llama_new_context_with_model(
|
|
2775
3202
|
|
2776
3203
|
const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
|
2777
3204
|
|
2778
|
-
|
3205
|
+
fprintf(stderr, "%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
|
2779
3206
|
|
2780
3207
|
#define LLAMA_METAL_CHECK_BUF(result) \
|
2781
3208
|
if (!(result)) { \
|
@@ -3535,13 +3962,13 @@ int llama_eval_export(struct llama_context * ctx, const char * fname) {
|
|
3535
3962
|
return 0;
|
3536
3963
|
}
|
3537
3964
|
|
3538
|
-
int
|
3539
|
-
|
3965
|
+
int llama_tokenize_with_model(
|
3966
|
+
const struct llama_model * model,
|
3540
3967
|
const char * text,
|
3541
3968
|
llama_token * tokens,
|
3542
3969
|
int n_max_tokens,
|
3543
3970
|
bool add_bos) {
|
3544
|
-
auto res = llama_tokenize(
|
3971
|
+
auto res = llama_tokenize(model->vocab, text, add_bos);
|
3545
3972
|
|
3546
3973
|
if (n_max_tokens < (int) res.size()) {
|
3547
3974
|
fprintf(stderr, "%s: too many tokens\n", __func__);
|
@@ -3555,8 +3982,29 @@ int llama_tokenize(
|
|
3555
3982
|
return res.size();
|
3556
3983
|
}
|
3557
3984
|
|
3985
|
+
int llama_tokenize(
|
3986
|
+
struct llama_context * ctx,
|
3987
|
+
const char * text,
|
3988
|
+
llama_token * tokens,
|
3989
|
+
int n_max_tokens,
|
3990
|
+
bool add_bos) {
|
3991
|
+
return llama_tokenize_with_model(&ctx->model, text, tokens, n_max_tokens, add_bos);
|
3992
|
+
}
|
3993
|
+
|
3994
|
+
int llama_n_vocab_from_model(const struct llama_model * model) {
|
3995
|
+
return model->vocab.id_to_token.size();
|
3996
|
+
}
|
3997
|
+
|
3998
|
+
int llama_n_ctx_from_model(const struct llama_model * model) {
|
3999
|
+
return model->hparams.n_ctx;
|
4000
|
+
}
|
4001
|
+
|
4002
|
+
int llama_n_embd_from_model(const struct llama_model * model) {
|
4003
|
+
return model->hparams.n_embd;
|
4004
|
+
}
|
4005
|
+
|
3558
4006
|
int llama_n_vocab(const struct llama_context * ctx) {
|
3559
|
-
return ctx->vocab.id_to_token.size();
|
4007
|
+
return ctx->model.vocab.id_to_token.size();
|
3560
4008
|
}
|
3561
4009
|
|
3562
4010
|
int llama_n_ctx(const struct llama_context * ctx) {
|
@@ -3567,19 +4015,27 @@ int llama_n_embd(const struct llama_context * ctx) {
|
|
3567
4015
|
return ctx->model.hparams.n_embd;
|
3568
4016
|
}
|
3569
4017
|
|
3570
|
-
int
|
3571
|
-
const struct
|
4018
|
+
int llama_get_vocab_from_model(
|
4019
|
+
const struct llama_model * model,
|
3572
4020
|
const char * * strings,
|
3573
4021
|
float * scores,
|
3574
4022
|
int capacity) {
|
3575
|
-
int n = std::min(capacity, (int)
|
4023
|
+
int n = std::min(capacity, (int) model->vocab.id_to_token.size());
|
3576
4024
|
for (int i = 0; i<n; ++i) {
|
3577
|
-
strings[i] =
|
3578
|
-
scores[i] =
|
4025
|
+
strings[i] = model->vocab.id_to_token[i].tok.c_str();
|
4026
|
+
scores[i] = model->vocab.id_to_token[i].score;
|
3579
4027
|
}
|
3580
4028
|
return n;
|
3581
4029
|
}
|
3582
4030
|
|
4031
|
+
int llama_get_vocab(
|
4032
|
+
const struct llama_context * ctx,
|
4033
|
+
const char * * strings,
|
4034
|
+
float * scores,
|
4035
|
+
int capacity) {
|
4036
|
+
return llama_get_vocab_from_model(&ctx->model, strings, scores, capacity);
|
4037
|
+
}
|
4038
|
+
|
3583
4039
|
float * llama_get_logits(struct llama_context * ctx) {
|
3584
4040
|
return ctx->logits.data();
|
3585
4041
|
}
|
@@ -3588,12 +4044,16 @@ float * llama_get_embeddings(struct llama_context * ctx) {
|
|
3588
4044
|
return ctx->embedding.data();
|
3589
4045
|
}
|
3590
4046
|
|
3591
|
-
const char *
|
3592
|
-
if (token >=
|
4047
|
+
const char * llama_token_to_str_with_model(const struct llama_model * model, llama_token token) {
|
4048
|
+
if (token >= llama_n_vocab_from_model(model)) {
|
3593
4049
|
return nullptr;
|
3594
4050
|
}
|
3595
4051
|
|
3596
|
-
return
|
4052
|
+
return model->vocab.id_to_token[token].tok.c_str();
|
4053
|
+
}
|
4054
|
+
|
4055
|
+
const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) {
|
4056
|
+
return llama_token_to_str_with_model(&ctx->model, token);
|
3597
4057
|
}
|
3598
4058
|
|
3599
4059
|
llama_token llama_token_bos() {
|