llama_cpp 0.3.3 → 0.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +31 -0
- data/ext/llama_cpp/extconf.rb +1 -0
- data/ext/llama_cpp/llama_cpp.cpp +439 -9
- data/ext/llama_cpp/src/ggml-cuda.cu +759 -136
- data/ext/llama_cpp/src/ggml-metal.h +7 -0
- data/ext/llama_cpp/src/ggml-metal.m +250 -111
- data/ext/llama_cpp/src/ggml-metal.metal +614 -483
- data/ext/llama_cpp/src/ggml.c +793 -1032
- data/ext/llama_cpp/src/ggml.h +95 -18
- data/ext/llama_cpp/src/k_quants.c +327 -3
- data/ext/llama_cpp/src/k_quants.h +8 -0
- data/ext/llama_cpp/src/llama.cpp +626 -166
- data/ext/llama_cpp/src/llama.h +94 -10
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -0
- data/sig/llama_cpp.rbs +36 -1
- metadata +2 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -67,6 +67,7 @@ enum e_model {
|
|
67
67
|
MODEL_13B,
|
68
68
|
MODEL_30B,
|
69
69
|
MODEL_65B,
|
70
|
+
MODEL_70B,
|
70
71
|
};
|
71
72
|
|
72
73
|
static const size_t kB = 1024;
|
@@ -98,17 +99,18 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
|
|
98
99
|
}
|
99
100
|
|
100
101
|
//
|
101
|
-
// memory sizes
|
102
|
+
// memory sizes (calculated for n_batch == 512)
|
102
103
|
//
|
103
104
|
|
104
|
-
static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
|
105
|
+
static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0(int n_ctx)
|
105
106
|
{
|
106
107
|
static std::map<e_model, size_t> k_sizes = {
|
107
|
-
{ MODEL_3B,
|
108
|
-
{ MODEL_7B,
|
109
|
-
{ MODEL_13B,
|
110
|
-
{ MODEL_30B,
|
111
|
-
{ MODEL_65B,
|
108
|
+
{ MODEL_3B, ((size_t) n_ctx / 16ull + 92ull) * MB },
|
109
|
+
{ MODEL_7B, ((size_t) n_ctx / 16ull + 100ull) * MB },
|
110
|
+
{ MODEL_13B, ((size_t) n_ctx / 12ull + 120ull) * MB },
|
111
|
+
{ MODEL_30B, ((size_t) n_ctx / 9ull + 160ull) * MB },
|
112
|
+
{ MODEL_65B, ((size_t) n_ctx / 6ull + 256ull) * MB }, // guess
|
113
|
+
{ MODEL_70B, ((size_t) n_ctx / 7ull + 164ull) * MB },
|
112
114
|
};
|
113
115
|
return k_sizes;
|
114
116
|
}
|
@@ -116,38 +118,26 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
|
|
116
118
|
static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
|
117
119
|
{
|
118
120
|
static std::map<e_model, size_t> k_sizes = {
|
119
|
-
{ MODEL_3B,
|
120
|
-
{ MODEL_7B,
|
121
|
-
{ MODEL_13B,
|
122
|
-
{ MODEL_30B,
|
123
|
-
{ MODEL_65B,
|
121
|
+
{ MODEL_3B, 128ull * MB },
|
122
|
+
{ MODEL_7B, 160ull * MB },
|
123
|
+
{ MODEL_13B, 192ull * MB },
|
124
|
+
{ MODEL_30B, 256ull * MB },
|
125
|
+
{ MODEL_65B, 384ull * MB }, // guess
|
126
|
+
{ MODEL_70B, 304ull * MB },
|
124
127
|
};
|
125
128
|
return k_sizes;
|
126
129
|
}
|
127
130
|
|
128
|
-
//
|
129
|
-
static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
|
130
|
-
{
|
131
|
-
static std::map<e_model, size_t> k_sizes = {
|
132
|
-
{ MODEL_3B, 682ull * MB },
|
133
|
-
{ MODEL_7B, 1026ull * MB },
|
134
|
-
{ MODEL_13B, 1608ull * MB },
|
135
|
-
{ MODEL_30B, 3124ull * MB },
|
136
|
-
{ MODEL_65B, 5120ull * MB },
|
137
|
-
};
|
138
|
-
return k_sizes;
|
139
|
-
}
|
140
|
-
|
141
|
-
// this is mostly needed for temporary mul_mat buffers to dequantize the data
|
142
|
-
// not actually needed if BLAS is disabled
|
131
|
+
// used to store the compute graph tensors + non-scratch data
|
143
132
|
static const std::map<e_model, size_t> & MEM_REQ_EVAL()
|
144
133
|
{
|
145
134
|
static std::map<e_model, size_t> k_sizes = {
|
146
|
-
{ MODEL_3B,
|
147
|
-
{ MODEL_7B,
|
148
|
-
{ MODEL_13B,
|
149
|
-
{ MODEL_30B,
|
150
|
-
{ MODEL_65B,
|
135
|
+
{ MODEL_3B, 8ull * MB },
|
136
|
+
{ MODEL_7B, 10ull * MB },
|
137
|
+
{ MODEL_13B, 12ull * MB },
|
138
|
+
{ MODEL_30B, 16ull * MB },
|
139
|
+
{ MODEL_65B, 24ull * MB }, // guess
|
140
|
+
{ MODEL_70B, 24ull * MB },
|
151
141
|
};
|
152
142
|
return k_sizes;
|
153
143
|
}
|
@@ -162,6 +152,7 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
|
|
162
152
|
{ MODEL_13B, 640ull * kB },
|
163
153
|
{ MODEL_30B, 768ull * kB },
|
164
154
|
{ MODEL_65B, 1536ull * kB },
|
155
|
+
{ MODEL_70B, 1536ull * kB }, // TODO (likely can be reduced)
|
165
156
|
};
|
166
157
|
return k_sizes;
|
167
158
|
}
|
@@ -176,23 +167,55 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
|
|
176
167
|
{ MODEL_13B, 160ull },
|
177
168
|
{ MODEL_30B, 208ull },
|
178
169
|
{ MODEL_65B, 416ull },
|
170
|
+
{ MODEL_70B, 416ull }, // TODO (likely can be reduced)
|
179
171
|
};
|
180
172
|
return k_sizes;
|
181
173
|
}
|
182
174
|
|
183
175
|
// default hparams (LLaMA 7B)
|
184
176
|
struct llama_hparams {
|
185
|
-
uint32_t n_vocab
|
186
|
-
uint32_t n_ctx
|
187
|
-
uint32_t n_embd
|
188
|
-
uint32_t n_mult
|
189
|
-
uint32_t n_head
|
190
|
-
uint32_t
|
191
|
-
uint32_t
|
177
|
+
uint32_t n_vocab = 32000;
|
178
|
+
uint32_t n_ctx = 512; // this is provided as user input?
|
179
|
+
uint32_t n_embd = 4096;
|
180
|
+
uint32_t n_mult = 256;
|
181
|
+
uint32_t n_head = 32;
|
182
|
+
uint32_t n_head_kv = 32;
|
183
|
+
uint32_t n_layer = 32;
|
184
|
+
uint32_t n_rot = 64;
|
185
|
+
|
186
|
+
// LLaMAv2
|
187
|
+
// TODO: load from model data hparams
|
188
|
+
float f_ffn_mult = 1.0f;
|
189
|
+
float f_rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
|
190
|
+
|
191
|
+
float rope_freq_base = 10000.0f;
|
192
|
+
float rope_freq_scale = 1.0f;
|
193
|
+
|
192
194
|
enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
|
193
195
|
|
194
196
|
bool operator!=(const llama_hparams & other) const {
|
195
|
-
return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams)));
|
197
|
+
return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams))); // NOLINT
|
198
|
+
}
|
199
|
+
|
200
|
+
uint32_t n_gqa() const {
|
201
|
+
return n_head/n_head_kv;
|
202
|
+
}
|
203
|
+
|
204
|
+
uint32_t n_embd_head() const {
|
205
|
+
return n_embd/n_head;
|
206
|
+
}
|
207
|
+
|
208
|
+
uint32_t n_embd_gqa() const {
|
209
|
+
return n_embd/n_gqa();
|
210
|
+
}
|
211
|
+
|
212
|
+
size_t kv_size() const {
|
213
|
+
size_t result = 2ull;
|
214
|
+
result *= (size_t) n_embd_gqa();
|
215
|
+
result *= (size_t) n_ctx;
|
216
|
+
result *= (size_t) n_layer;
|
217
|
+
result *= sizeof(ggml_fp16_t);
|
218
|
+
return result;
|
196
219
|
}
|
197
220
|
};
|
198
221
|
|
@@ -303,7 +326,7 @@ struct llama_model {
|
|
303
326
|
};
|
304
327
|
|
305
328
|
struct llama_context {
|
306
|
-
llama_context(const llama_model & model
|
329
|
+
llama_context(const llama_model & model) : model(model), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
|
307
330
|
#ifdef GGML_USE_METAL
|
308
331
|
~llama_context() {
|
309
332
|
if (ctx_metal) {
|
@@ -324,7 +347,6 @@ struct llama_context {
|
|
324
347
|
int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
|
325
348
|
|
326
349
|
const llama_model & model;
|
327
|
-
const llama_vocab & vocab;
|
328
350
|
|
329
351
|
bool model_owner = false;
|
330
352
|
|
@@ -495,12 +517,16 @@ struct llama_file_loader {
|
|
495
517
|
}
|
496
518
|
void read_hparams() {
|
497
519
|
hparams.n_vocab = file.read_u32();
|
498
|
-
hparams.n_embd
|
499
|
-
hparams.n_mult
|
500
|
-
hparams.n_head
|
520
|
+
hparams.n_embd = file.read_u32();
|
521
|
+
hparams.n_mult = file.read_u32();
|
522
|
+
hparams.n_head = file.read_u32();
|
501
523
|
hparams.n_layer = file.read_u32();
|
502
|
-
hparams.n_rot
|
503
|
-
hparams.ftype
|
524
|
+
hparams.n_rot = file.read_u32();
|
525
|
+
hparams.ftype = (enum llama_ftype) file.read_u32();
|
526
|
+
|
527
|
+
// LLaMAv2
|
528
|
+
// TODO: read from header
|
529
|
+
hparams.n_head_kv = hparams.n_head;
|
504
530
|
}
|
505
531
|
void read_vocab() {
|
506
532
|
vocab.id_to_token.resize(hparams.n_vocab);
|
@@ -551,7 +577,9 @@ struct llama_file_loader {
|
|
551
577
|
}
|
552
578
|
|
553
579
|
// skip to the next multiple of 32 bytes
|
554
|
-
|
580
|
+
if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
|
581
|
+
file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
|
582
|
+
}
|
555
583
|
|
556
584
|
tensor.file_off = file.tell();
|
557
585
|
tensor.name = name;
|
@@ -648,7 +676,7 @@ struct llama_model_loader {
|
|
648
676
|
*ctx_size_p = *mmapped_size_p = 0;
|
649
677
|
for (const llama_load_tensor & lt : tensors_map.tensors) {
|
650
678
|
*ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE;
|
651
|
-
*(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size;
|
679
|
+
*(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size + 16;
|
652
680
|
}
|
653
681
|
}
|
654
682
|
|
@@ -797,7 +825,7 @@ static bool kv_cache_init(
|
|
797
825
|
ggml_type wtype,
|
798
826
|
int n_ctx,
|
799
827
|
int n_gpu_layers) {
|
800
|
-
const int n_embd = hparams.
|
828
|
+
const int n_embd = hparams.n_embd_gqa();
|
801
829
|
const int n_layer = hparams.n_layer;
|
802
830
|
|
803
831
|
const int64_t n_mem = n_layer*n_ctx;
|
@@ -841,9 +869,13 @@ struct llama_context_params llama_context_default_params() {
|
|
841
869
|
/*.seed =*/ LLAMA_DEFAULT_SEED,
|
842
870
|
/*.n_ctx =*/ 512,
|
843
871
|
/*.n_batch =*/ 512,
|
872
|
+
/*.n_gqa =*/ 1,
|
873
|
+
/*.rms_norm_eps =*/ LLAMA_DEFAULT_RMS_EPS,
|
844
874
|
/*.gpu_layers =*/ 0,
|
845
875
|
/*.main_gpu =*/ 0,
|
846
|
-
/*.tensor_split =*/
|
876
|
+
/*.tensor_split =*/ nullptr,
|
877
|
+
/*.rope_freq_base =*/ 10000.0f,
|
878
|
+
/*.rope_freq_scale =*/ 1.0f,
|
847
879
|
/*.progress_callback =*/ nullptr,
|
848
880
|
/*.progress_callback_user_data =*/ nullptr,
|
849
881
|
/*.low_vram =*/ false,
|
@@ -869,6 +901,10 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
|
|
869
901
|
return result;
|
870
902
|
}
|
871
903
|
|
904
|
+
int llama_max_devices() {
|
905
|
+
return LLAMA_MAX_DEVICES;
|
906
|
+
}
|
907
|
+
|
872
908
|
bool llama_mmap_supported() {
|
873
909
|
return llama_mmap::SUPPORTED;
|
874
910
|
}
|
@@ -954,6 +990,7 @@ static const char *llama_model_type_name(e_model type) {
|
|
954
990
|
case MODEL_13B: return "13B";
|
955
991
|
case MODEL_30B: return "30B";
|
956
992
|
case MODEL_65B: return "65B";
|
993
|
+
case MODEL_70B: return "70B";
|
957
994
|
default: LLAMA_ASSERT(false);
|
958
995
|
}
|
959
996
|
}
|
@@ -964,9 +1001,13 @@ static void llama_model_load_internal(
|
|
964
1001
|
llama_vocab & vocab,
|
965
1002
|
int n_ctx,
|
966
1003
|
int n_batch,
|
1004
|
+
int n_gqa,
|
1005
|
+
float rms_norm_eps,
|
967
1006
|
int n_gpu_layers,
|
968
1007
|
int main_gpu,
|
969
1008
|
const float * tensor_split,
|
1009
|
+
float rope_freq_base,
|
1010
|
+
float rope_freq_scale,
|
970
1011
|
bool low_vram,
|
971
1012
|
ggml_type memory_type,
|
972
1013
|
bool use_mmap,
|
@@ -983,8 +1024,12 @@ static void llama_model_load_internal(
|
|
983
1024
|
model.hparams = ml->file_loader->hparams;
|
984
1025
|
model.n_gpu_layers = n_gpu_layers;
|
985
1026
|
llama_file_version file_version = ml->file_loader->file_version;
|
1027
|
+
|
986
1028
|
auto & hparams = model.hparams;
|
987
1029
|
|
1030
|
+
// TODO: read from file
|
1031
|
+
hparams.f_rms_norm_eps = rms_norm_eps;
|
1032
|
+
|
988
1033
|
{
|
989
1034
|
switch (hparams.n_layer) {
|
990
1035
|
case 26: model.type = e_model::MODEL_3B; break;
|
@@ -1001,22 +1046,44 @@ static void llama_model_load_internal(
|
|
1001
1046
|
}
|
1002
1047
|
|
1003
1048
|
hparams.n_ctx = n_ctx;
|
1049
|
+
|
1050
|
+
// LLaMAv2
|
1051
|
+
// TODO: temporary until GGUF
|
1052
|
+
LLAMA_ASSERT(hparams.n_head % n_gqa == 0);
|
1053
|
+
hparams.n_head_kv = hparams.n_head / n_gqa;
|
1054
|
+
if (model.type == e_model::MODEL_65B && n_gqa == 8) {
|
1055
|
+
fprintf(stderr, "%s: warning: assuming 70B model based on GQA == %d\n", __func__, n_gqa);
|
1056
|
+
model.type = e_model::MODEL_70B;
|
1057
|
+
hparams.f_ffn_mult = 1.3f; // from the params.json of the 70B model
|
1058
|
+
}
|
1059
|
+
|
1060
|
+
hparams.rope_freq_base = rope_freq_base;
|
1061
|
+
hparams.rope_freq_scale = rope_freq_scale;
|
1004
1062
|
}
|
1005
1063
|
|
1006
|
-
|
1064
|
+
// ref: https://github.com/facebookresearch/llama/blob/6c7fe276574e78057f917549435a2554000a876d/llama/model.py#L194-L199
|
1065
|
+
const uint32_t n_ff_raw = 2*(4*hparams.n_embd)/3;
|
1066
|
+
const uint32_t n_ff_mult = hparams.f_ffn_mult*n_ff_raw;
|
1067
|
+
const uint32_t n_ff = ((n_ff_mult + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
|
1068
|
+
//const uint32_t n_ff = 28672;
|
1007
1069
|
|
1008
1070
|
{
|
1009
|
-
fprintf(stderr, "%s: format = %s\n",
|
1010
|
-
fprintf(stderr, "%s: n_vocab = %u\n",
|
1011
|
-
fprintf(stderr, "%s: n_ctx = %u\n",
|
1012
|
-
fprintf(stderr, "%s: n_embd = %u\n",
|
1013
|
-
fprintf(stderr, "%s: n_mult = %u\n",
|
1014
|
-
fprintf(stderr, "%s: n_head = %u\n",
|
1015
|
-
fprintf(stderr, "%s:
|
1016
|
-
fprintf(stderr, "%s:
|
1071
|
+
fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version));
|
1072
|
+
fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab);
|
1073
|
+
fprintf(stderr, "%s: n_ctx = %u\n", __func__, hparams.n_ctx);
|
1074
|
+
fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd);
|
1075
|
+
fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult);
|
1076
|
+
fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head);
|
1077
|
+
fprintf(stderr, "%s: n_head_kv = %u\n", __func__, hparams.n_head_kv);
|
1078
|
+
fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer);
|
1079
|
+
fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
|
1080
|
+
fprintf(stderr, "%s: n_gqa = %u\n", __func__, hparams.n_gqa());
|
1081
|
+
fprintf(stderr, "%s: rnorm_eps = %.1e\n", __func__, hparams.f_rms_norm_eps);
|
1082
|
+
fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
|
1083
|
+
fprintf(stderr, "%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base);
|
1084
|
+
fprintf(stderr, "%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale);
|
1017
1085
|
fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
|
1018
|
-
fprintf(stderr, "%s:
|
1019
|
-
fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
|
1086
|
+
fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
|
1020
1087
|
}
|
1021
1088
|
|
1022
1089
|
if (file_version < LLAMA_FILE_VERSION_GGJT_V2) {
|
@@ -1050,7 +1117,7 @@ static void llama_model_load_internal(
|
|
1050
1117
|
{
|
1051
1118
|
model.buf.resize(ctx_size);
|
1052
1119
|
if (use_mlock) {
|
1053
|
-
model.mlock_buf.init(model.buf.addr);
|
1120
|
+
model.mlock_buf.init (model.buf.addr);
|
1054
1121
|
model.mlock_buf.grow_to(model.buf.size);
|
1055
1122
|
}
|
1056
1123
|
|
@@ -1085,9 +1152,10 @@ static void llama_model_load_internal(
|
|
1085
1152
|
size_t vram_weights = 0;
|
1086
1153
|
size_t vram_scratch = 0;
|
1087
1154
|
{
|
1088
|
-
const uint32_t n_embd
|
1089
|
-
const uint32_t
|
1090
|
-
const uint32_t
|
1155
|
+
const uint32_t n_embd = hparams.n_embd;
|
1156
|
+
const uint32_t n_embd_gqa = hparams.n_embd_gqa();
|
1157
|
+
const uint32_t n_layer = hparams.n_layer;
|
1158
|
+
const uint32_t n_vocab = hparams.n_vocab;
|
1091
1159
|
|
1092
1160
|
ml->ggml_ctx = ctx;
|
1093
1161
|
|
@@ -1135,16 +1203,16 @@ static void llama_model_load_internal(
|
|
1135
1203
|
|
1136
1204
|
layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);
|
1137
1205
|
|
1138
|
-
layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd},
|
1139
|
-
layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd,
|
1140
|
-
layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd,
|
1141
|
-
layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd},
|
1206
|
+
layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend_split);
|
1207
|
+
layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd_gqa}, backend_split);
|
1208
|
+
layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd_gqa}, backend_split);
|
1209
|
+
layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend_split);
|
1142
1210
|
|
1143
1211
|
layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);
|
1144
1212
|
|
1145
|
-
layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff},
|
1146
|
-
layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff,
|
1147
|
-
layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff},
|
1213
|
+
layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend_split);
|
1214
|
+
layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend_split);
|
1215
|
+
layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend_split);
|
1148
1216
|
|
1149
1217
|
if (backend == GGML_BACKEND_GPU) {
|
1150
1218
|
vram_weights +=
|
@@ -1165,13 +1233,13 @@ static void llama_model_load_internal(
|
|
1165
1233
|
const size_t mem_required =
|
1166
1234
|
ctx_size +
|
1167
1235
|
mmapped_size - vram_weights + // weights in VRAM not in memory
|
1168
|
-
MEM_REQ_SCRATCH0().at(model.type) +
|
1236
|
+
MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) +
|
1169
1237
|
MEM_REQ_SCRATCH1().at(model.type) +
|
1170
|
-
MEM_REQ_EVAL().at
|
1238
|
+
MEM_REQ_EVAL().at(model.type);
|
1171
1239
|
|
1172
1240
|
// this is the memory required by one llama_state
|
1173
1241
|
const size_t mem_required_state =
|
1174
|
-
scale*
|
1242
|
+
scale*hparams.kv_size();
|
1175
1243
|
|
1176
1244
|
fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
|
1177
1245
|
mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
|
@@ -1212,7 +1280,7 @@ static void llama_model_load_internal(
|
|
1212
1280
|
fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
|
1213
1281
|
} else {
|
1214
1282
|
fprintf(stderr, "%s: offloading v cache to GPU\n", __func__);
|
1215
|
-
vram_kv_cache +=
|
1283
|
+
vram_kv_cache += hparams.kv_size() / 2;
|
1216
1284
|
}
|
1217
1285
|
}
|
1218
1286
|
if (n_gpu_layers > (int) hparams.n_layer + 2) {
|
@@ -1220,7 +1288,7 @@ static void llama_model_load_internal(
|
|
1220
1288
|
fprintf(stderr, "%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
|
1221
1289
|
} else {
|
1222
1290
|
fprintf(stderr, "%s: offloading k cache to GPU\n", __func__);
|
1223
|
-
vram_kv_cache +=
|
1291
|
+
vram_kv_cache += hparams.kv_size() / 2;
|
1224
1292
|
}
|
1225
1293
|
}
|
1226
1294
|
#elif defined(GGML_USE_CLBLAST)
|
@@ -1268,9 +1336,13 @@ static bool llama_model_load(
|
|
1268
1336
|
llama_vocab & vocab,
|
1269
1337
|
int n_ctx,
|
1270
1338
|
int n_batch,
|
1339
|
+
int n_gqa,
|
1340
|
+
float rms_norm_eps,
|
1271
1341
|
int n_gpu_layers,
|
1272
1342
|
int main_gpu,
|
1273
|
-
float * tensor_split,
|
1343
|
+
const float * tensor_split,
|
1344
|
+
float rope_freq_base,
|
1345
|
+
float rope_freq_scale,
|
1274
1346
|
bool low_vram,
|
1275
1347
|
ggml_type memory_type,
|
1276
1348
|
bool use_mmap,
|
@@ -1279,7 +1351,7 @@ static bool llama_model_load(
|
|
1279
1351
|
llama_progress_callback progress_callback,
|
1280
1352
|
void *progress_callback_user_data) {
|
1281
1353
|
try {
|
1282
|
-
llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
|
1354
|
+
llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, rms_norm_eps, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type,
|
1283
1355
|
use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
|
1284
1356
|
return true;
|
1285
1357
|
} catch (const std::exception & err) {
|
@@ -1323,12 +1395,22 @@ static bool llama_eval_internal(
|
|
1323
1395
|
|
1324
1396
|
LLAMA_ASSERT(!!kv_self.ctx);
|
1325
1397
|
|
1326
|
-
const
|
1327
|
-
const
|
1328
|
-
const
|
1329
|
-
const
|
1330
|
-
const
|
1331
|
-
const
|
1398
|
+
const int64_t n_embd = hparams.n_embd;
|
1399
|
+
const int64_t n_layer = hparams.n_layer;
|
1400
|
+
const int64_t n_ctx = hparams.n_ctx;
|
1401
|
+
const int64_t n_head = hparams.n_head;
|
1402
|
+
const int64_t n_head_kv = hparams.n_head_kv;
|
1403
|
+
const int64_t n_embd_head = hparams.n_embd_head();
|
1404
|
+
const int64_t n_vocab = hparams.n_vocab;
|
1405
|
+
const int64_t n_embd_gqa = hparams.n_embd_gqa();
|
1406
|
+
|
1407
|
+
|
1408
|
+
LLAMA_ASSERT(n_embd_head == hparams.n_rot);
|
1409
|
+
|
1410
|
+
const float freq_base = hparams.rope_freq_base;
|
1411
|
+
const float freq_scale = hparams.rope_freq_scale;
|
1412
|
+
const float rms_norm_eps = hparams.f_rms_norm_eps;
|
1413
|
+
|
1332
1414
|
const int n_gpu_layers = model.n_gpu_layers;
|
1333
1415
|
|
1334
1416
|
auto & mem_per_token = lctx.mem_per_token;
|
@@ -1342,7 +1424,7 @@ static bool llama_eval_internal(
|
|
1342
1424
|
|
1343
1425
|
struct ggml_context * ctx0 = ggml_init(params);
|
1344
1426
|
|
1345
|
-
ggml_cgraph gf =
|
1427
|
+
ggml_cgraph * gf = ggml_new_graph(ctx0);
|
1346
1428
|
|
1347
1429
|
// for big prompts, if BLAS is enabled, it is better to use only one thread
|
1348
1430
|
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
|
@@ -1407,7 +1489,7 @@ static bool llama_eval_internal(
|
|
1407
1489
|
|
1408
1490
|
// norm
|
1409
1491
|
{
|
1410
|
-
cur = ggml_rms_norm(ctx0, inpL);
|
1492
|
+
cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
|
1411
1493
|
offload_func(cur);
|
1412
1494
|
ggml_set_name(cur, "rms_norm_0");
|
1413
1495
|
|
@@ -1428,11 +1510,11 @@ static bool llama_eval_internal(
|
|
1428
1510
|
offload_func_kq(tmpq);
|
1429
1511
|
ggml_set_name(tmpq, "tmpq");
|
1430
1512
|
|
1431
|
-
struct ggml_tensor * Kcur =
|
1513
|
+
struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
|
1432
1514
|
offload_func_kq(Kcur);
|
1433
1515
|
ggml_set_name(Kcur, "Kcur");
|
1434
1516
|
|
1435
|
-
struct ggml_tensor * Qcur =
|
1517
|
+
struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
|
1436
1518
|
offload_func_kq(Qcur);
|
1437
1519
|
ggml_set_name(Qcur, "Qcur");
|
1438
1520
|
|
@@ -1444,23 +1526,23 @@ static bool llama_eval_internal(
|
|
1444
1526
|
offload_func_v(tmpv);
|
1445
1527
|
ggml_set_name(tmpv, "tmpv");
|
1446
1528
|
|
1447
|
-
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv,
|
1529
|
+
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, N));
|
1448
1530
|
offload_func_v(Vcur);
|
1449
1531
|
ggml_set_name(Vcur, "Vcur");
|
1450
1532
|
|
1451
|
-
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*
|
1533
|
+
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
|
1452
1534
|
offload_func_kq(k);
|
1453
1535
|
ggml_set_name(k, "k");
|
1454
1536
|
|
1455
|
-
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N,
|
1537
|
+
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
|
1456
1538
|
( n_ctx)*ggml_element_size(kv_self.v),
|
1457
|
-
(il*n_ctx)*ggml_element_size(kv_self.v)*
|
1539
|
+
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
|
1458
1540
|
offload_func_v(v);
|
1459
1541
|
ggml_set_name(v, "v");
|
1460
1542
|
|
1461
1543
|
// important: storing RoPE-ed version of K in the KV cache!
|
1462
|
-
ggml_build_forward_expand(
|
1463
|
-
ggml_build_forward_expand(
|
1544
|
+
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
|
1545
|
+
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
|
1464
1546
|
}
|
1465
1547
|
|
1466
1548
|
struct ggml_tensor * Q =
|
@@ -1473,8 +1555,8 @@ static bool llama_eval_internal(
|
|
1473
1555
|
struct ggml_tensor * K =
|
1474
1556
|
ggml_permute(ctx0,
|
1475
1557
|
ggml_reshape_3d(ctx0,
|
1476
|
-
ggml_view_1d(ctx0, kv_self.k, (n_past + N)*
|
1477
|
-
|
1558
|
+
ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd_gqa, il*n_ctx*ggml_element_size(kv_self.k)*n_embd_gqa),
|
1559
|
+
n_embd_head, n_head_kv, n_past + N),
|
1478
1560
|
0, 2, 1, 3);
|
1479
1561
|
offload_func_kq(K);
|
1480
1562
|
ggml_set_name(K, "K");
|
@@ -1484,9 +1566,9 @@ static bool llama_eval_internal(
|
|
1484
1566
|
offload_func_kq(KQ);
|
1485
1567
|
ggml_set_name(KQ, "KQ");
|
1486
1568
|
|
1487
|
-
// KQ_scaled = KQ / sqrt(
|
1569
|
+
// KQ_scaled = KQ / sqrt(n_embd_head)
|
1488
1570
|
struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head));
|
1489
|
-
ggml_set_name(KQ_scale, "1/sqrt(
|
1571
|
+
ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
|
1490
1572
|
|
1491
1573
|
// KQ_scaled shape [n_past + N, N, n_head, 1]
|
1492
1574
|
struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
|
@@ -1506,10 +1588,10 @@ static bool llama_eval_internal(
|
|
1506
1588
|
// split cached V into n_head heads
|
1507
1589
|
struct ggml_tensor * V =
|
1508
1590
|
ggml_view_3d(ctx0, kv_self.v,
|
1509
|
-
n_past + N,
|
1591
|
+
n_past + N, n_embd_head, n_head_kv,
|
1510
1592
|
n_ctx*ggml_element_size(kv_self.v),
|
1511
|
-
n_ctx*ggml_element_size(kv_self.v)*
|
1512
|
-
|
1593
|
+
n_ctx*ggml_element_size(kv_self.v)*n_embd_head,
|
1594
|
+
n_ctx*ggml_element_size(kv_self.v)*n_embd_gqa*il);
|
1513
1595
|
offload_func_v(V);
|
1514
1596
|
ggml_set_name(V, "V");
|
1515
1597
|
|
@@ -1521,7 +1603,7 @@ static bool llama_eval_internal(
|
|
1521
1603
|
// make V contiguous in memory to speed up the matmul, however we waste time on the copy
|
1522
1604
|
// on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
|
1523
1605
|
// is there a better way?
|
1524
|
-
struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N,
|
1606
|
+
struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head));
|
1525
1607
|
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
|
1526
1608
|
#endif
|
1527
1609
|
|
@@ -1555,7 +1637,7 @@ static bool llama_eval_internal(
|
|
1555
1637
|
{
|
1556
1638
|
// norm
|
1557
1639
|
{
|
1558
|
-
cur = ggml_rms_norm(ctx0, inpFF);
|
1640
|
+
cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
|
1559
1641
|
offload_func(cur);
|
1560
1642
|
ggml_set_name(cur, "rms_norm_1");
|
1561
1643
|
|
@@ -1608,7 +1690,7 @@ static bool llama_eval_internal(
|
|
1608
1690
|
|
1609
1691
|
// norm
|
1610
1692
|
{
|
1611
|
-
cur = ggml_rms_norm(ctx0, inpL);
|
1693
|
+
cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
|
1612
1694
|
offload_func_nr(cur);
|
1613
1695
|
ggml_set_name(cur, "rms_norm_2");
|
1614
1696
|
|
@@ -1630,16 +1712,22 @@ static bool llama_eval_internal(
|
|
1630
1712
|
//cur = ggml_soft_max_inplace(ctx0, cur);
|
1631
1713
|
|
1632
1714
|
// run the computation
|
1633
|
-
ggml_build_forward_expand(
|
1715
|
+
ggml_build_forward_expand(gf, cur);
|
1716
|
+
|
1717
|
+
// fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf.n_nodes, gf.n_leafs);
|
1634
1718
|
|
1635
1719
|
#if GGML_USE_MPI
|
1636
|
-
ggml_mpi_graph_compute_pre(lctx.ctx_mpi,
|
1720
|
+
ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
|
1637
1721
|
#endif
|
1638
1722
|
|
1639
1723
|
#ifdef GGML_USE_METAL
|
1640
1724
|
if (lctx.ctx_metal && N == 1) {
|
1725
|
+
// TODO: disabled until #2413 is resolved
|
1726
|
+
//if (!ggml_metal_if_optimized(lctx.ctx_metal)) {
|
1727
|
+
// ggml_metal_graph_find_concurrency(lctx.ctx_metal, gf);
|
1728
|
+
//}
|
1641
1729
|
ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
|
1642
|
-
ggml_metal_graph_compute(lctx.ctx_metal,
|
1730
|
+
ggml_metal_graph_compute(lctx.ctx_metal, gf);
|
1643
1731
|
ggml_metal_get_tensor (lctx.ctx_metal, cur);
|
1644
1732
|
} else {
|
1645
1733
|
// IMPORTANT:
|
@@ -1658,34 +1746,34 @@ static bool llama_eval_internal(
|
|
1658
1746
|
ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
|
1659
1747
|
}
|
1660
1748
|
|
1661
|
-
ggml_graph_compute_helper(lctx.work_buffer,
|
1749
|
+
ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
|
1662
1750
|
}
|
1663
1751
|
#else
|
1664
|
-
ggml_graph_compute_helper(lctx.work_buffer,
|
1752
|
+
ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
|
1665
1753
|
#endif
|
1666
1754
|
|
1667
1755
|
#if GGML_USE_MPI
|
1668
|
-
ggml_mpi_graph_compute_post(lctx.ctx_mpi,
|
1756
|
+
ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
|
1669
1757
|
#endif
|
1670
1758
|
|
1671
1759
|
// update kv token count
|
1672
1760
|
lctx.kv_self.n = n_past + N;
|
1673
1761
|
|
1674
|
-
struct ggml_tensor * res = gf
|
1762
|
+
struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
|
1675
1763
|
|
1676
1764
|
if (cgraph_fname) {
|
1677
|
-
ggml_graph_export(
|
1765
|
+
ggml_graph_export(gf, cgraph_fname);
|
1678
1766
|
}
|
1679
1767
|
|
1680
1768
|
#ifdef GGML_PERF
|
1681
1769
|
// print timing information per ggml operation (for debugging purposes)
|
1682
1770
|
// requires GGML_PERF to be defined
|
1683
|
-
ggml_graph_print(
|
1771
|
+
ggml_graph_print(gf);
|
1684
1772
|
#endif
|
1685
1773
|
|
1686
1774
|
// plot the computation graph in dot format (for debugging purposes)
|
1687
1775
|
//if (n_past%100 == 0) {
|
1688
|
-
// ggml_graph_dump_dot(
|
1776
|
+
// ggml_graph_dump_dot(gf, NULL, "llama.dot");
|
1689
1777
|
//}
|
1690
1778
|
|
1691
1779
|
// extract logits
|
@@ -1715,10 +1803,12 @@ static bool llama_eval_internal(
|
|
1715
1803
|
}
|
1716
1804
|
|
1717
1805
|
#if 0
|
1718
|
-
printf("\n%s: used_mem
|
1806
|
+
printf("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__,
|
1719
1807
|
ggml_used_mem(ctx0)/1024.0/1024.0,
|
1720
1808
|
lctx.get_buf_max_mem(0)/1024.0/1024.0,
|
1721
|
-
lctx.get_buf_max_mem(1)/1024.0/1024.0
|
1809
|
+
lctx.get_buf_max_mem(1)/1024.0/1024.0,
|
1810
|
+
lctx.work_buffer.size()/1024.0/1024.0,
|
1811
|
+
n_past, N);
|
1722
1812
|
#endif
|
1723
1813
|
|
1724
1814
|
ggml_free(ctx0);
|
@@ -1891,6 +1981,279 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
|
|
1891
1981
|
return output;
|
1892
1982
|
}
|
1893
1983
|
|
1984
|
+
//
|
1985
|
+
// grammar - internal
|
1986
|
+
//
|
1987
|
+
|
1988
|
+
struct llama_grammar {
|
1989
|
+
const std::vector<std::vector<llama_grammar_element>> rules;
|
1990
|
+
std::vector<std::vector<const llama_grammar_element *>> stacks;
|
1991
|
+
};
|
1992
|
+
|
1993
|
+
struct llama_grammar_candidate {
|
1994
|
+
size_t index;
|
1995
|
+
const uint32_t * code_points;
|
1996
|
+
};
|
1997
|
+
|
1998
|
+
// NOTE: assumes valid utf8 (but checks for overrun)
|
1999
|
+
// adds a terminating 0 for use as pointer
|
2000
|
+
std::vector<uint32_t> decode_utf8(const char * src) {
|
2001
|
+
static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
|
2002
|
+
const char * pos = src;
|
2003
|
+
std::vector<uint32_t> code_points;
|
2004
|
+
while (*pos != 0) {
|
2005
|
+
uint8_t first_byte = static_cast<uint8_t>(*pos);
|
2006
|
+
uint8_t highbits = first_byte >> 4;
|
2007
|
+
int len = lookup[highbits];
|
2008
|
+
uint8_t mask = (1 << (8 - len)) - 1;
|
2009
|
+
uint32_t value = first_byte & mask;
|
2010
|
+
const char * end = pos + len; // may overrun!
|
2011
|
+
++pos;
|
2012
|
+
for ( ; pos < end && *pos != 0; ++pos) {
|
2013
|
+
value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
|
2014
|
+
}
|
2015
|
+
code_points.push_back(value);
|
2016
|
+
}
|
2017
|
+
code_points.push_back(0);
|
2018
|
+
return code_points;
|
2019
|
+
}
|
2020
|
+
|
2021
|
+
// returns true iff pos points to the end of one of the definitions of a rule
|
2022
|
+
static bool llama_grammar_is_end_of_sequence(const llama_grammar_element * pos) {
|
2023
|
+
switch (pos->type) {
|
2024
|
+
case LLAMA_GRETYPE_END: return true;
|
2025
|
+
case LLAMA_GRETYPE_ALT: return true;
|
2026
|
+
default: return false;
|
2027
|
+
}
|
2028
|
+
}
|
2029
|
+
|
2030
|
+
// returns true iff chr satisfies the char range at pos (regular or inverse range)
|
2031
|
+
// asserts that pos is pointing to a char range element
|
2032
|
+
static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
|
2033
|
+
const llama_grammar_element * pos,
|
2034
|
+
const uint32_t chr) {
|
2035
|
+
|
2036
|
+
bool found = false;
|
2037
|
+
bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
|
2038
|
+
LLAMA_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT);
|
2039
|
+
|
2040
|
+
do {
|
2041
|
+
if (pos[1].type == LLAMA_GRETYPE_CHAR_RNG_UPPER) {
|
2042
|
+
// inclusive range, e.g. [a-z]
|
2043
|
+
found = found || (pos->value <= chr && chr <= pos[1].value);
|
2044
|
+
pos += 2;
|
2045
|
+
} else {
|
2046
|
+
// exact char match, e.g. [a] or "a"
|
2047
|
+
found = found || pos->value == chr;
|
2048
|
+
pos += 1;
|
2049
|
+
}
|
2050
|
+
} while (pos->type == LLAMA_GRETYPE_CHAR_ALT);
|
2051
|
+
|
2052
|
+
return std::make_pair(found == is_positive_char, pos);
|
2053
|
+
}
|
2054
|
+
|
2055
|
+
// transforms a grammar pushdown stack into N possible stacks, all ending
|
2056
|
+
// at a character range (terminal element)
|
2057
|
+
static void llama_grammar_advance_stack(
|
2058
|
+
const std::vector<std::vector<llama_grammar_element>> & rules,
|
2059
|
+
const std::vector<const llama_grammar_element *> & stack,
|
2060
|
+
std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {
|
2061
|
+
|
2062
|
+
if (stack.empty()) {
|
2063
|
+
new_stacks.push_back(stack);
|
2064
|
+
return;
|
2065
|
+
}
|
2066
|
+
|
2067
|
+
const llama_grammar_element * pos = stack.back();
|
2068
|
+
|
2069
|
+
switch (pos->type) {
|
2070
|
+
case LLAMA_GRETYPE_RULE_REF: {
|
2071
|
+
const size_t rule_id = static_cast<size_t>(pos->value);
|
2072
|
+
const llama_grammar_element * subpos = rules[rule_id].data();
|
2073
|
+
do {
|
2074
|
+
// init new stack without the top (pos)
|
2075
|
+
std::vector<const llama_grammar_element *> new_stack(stack.begin(), stack.end() - 1);
|
2076
|
+
if (!llama_grammar_is_end_of_sequence(pos + 1)) {
|
2077
|
+
// if this rule ref is followed by another element, add that to stack
|
2078
|
+
new_stack.push_back(pos + 1);
|
2079
|
+
}
|
2080
|
+
if (!llama_grammar_is_end_of_sequence(subpos)) {
|
2081
|
+
// if alternate is nonempty, add to stack
|
2082
|
+
new_stack.push_back(subpos);
|
2083
|
+
}
|
2084
|
+
llama_grammar_advance_stack(rules, new_stack, new_stacks);
|
2085
|
+
while (!llama_grammar_is_end_of_sequence(subpos)) {
|
2086
|
+
// scan to end of alternate def
|
2087
|
+
subpos++;
|
2088
|
+
}
|
2089
|
+
if (subpos->type == LLAMA_GRETYPE_ALT) {
|
2090
|
+
// there's another alternate def of this rule to process
|
2091
|
+
subpos++;
|
2092
|
+
} else {
|
2093
|
+
break;
|
2094
|
+
}
|
2095
|
+
} while (true);
|
2096
|
+
break;
|
2097
|
+
}
|
2098
|
+
case LLAMA_GRETYPE_CHAR:
|
2099
|
+
case LLAMA_GRETYPE_CHAR_NOT:
|
2100
|
+
new_stacks.push_back(stack);
|
2101
|
+
break;
|
2102
|
+
default:
|
2103
|
+
// end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
|
2104
|
+
// (LLAMA_GRETYPE_CHAR_ALT, LLAMA_GRETYPE_CHAR_RNG_UPPER); stack should never be left on
|
2105
|
+
// those
|
2106
|
+
LLAMA_ASSERT(false);
|
2107
|
+
}
|
2108
|
+
}
|
2109
|
+
|
2110
|
+
// takes a set of possible pushdown stacks on a grammar, which are required to
|
2111
|
+
// be positioned at a character range (see `llama_grammar_advance_stack`), and
|
2112
|
+
// produces the N possible stacks if the given char is accepted at those
|
2113
|
+
// positions
|
2114
|
+
static std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
|
2115
|
+
const std::vector<std::vector<llama_grammar_element>> & rules,
|
2116
|
+
const std::vector<std::vector<const llama_grammar_element *>> & stacks,
|
2117
|
+
const uint32_t chr) {
|
2118
|
+
|
2119
|
+
std::vector<std::vector<const llama_grammar_element *>> new_stacks;
|
2120
|
+
|
2121
|
+
for (const auto & stack : stacks) {
|
2122
|
+
if (stack.empty()) {
|
2123
|
+
continue;
|
2124
|
+
}
|
2125
|
+
|
2126
|
+
auto match = llama_grammar_match_char(stack.back(), chr);
|
2127
|
+
if (match.first) {
|
2128
|
+
const llama_grammar_element * pos = match.second;
|
2129
|
+
|
2130
|
+
// update top of stack to next element, if any
|
2131
|
+
std::vector<const llama_grammar_element *> new_stack(stack.begin(), stack.end() - 1);
|
2132
|
+
if (!llama_grammar_is_end_of_sequence(pos)) {
|
2133
|
+
new_stack.push_back(pos);
|
2134
|
+
}
|
2135
|
+
llama_grammar_advance_stack(rules, new_stack, new_stacks);
|
2136
|
+
}
|
2137
|
+
}
|
2138
|
+
|
2139
|
+
return new_stacks;
|
2140
|
+
}
|
2141
|
+
|
2142
|
+
static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
|
2143
|
+
const std::vector<std::vector<llama_grammar_element>> & rules,
|
2144
|
+
const std::vector<std::vector<const llama_grammar_element *>> & stacks,
|
2145
|
+
const std::vector<llama_grammar_candidate> & candidates);
|
2146
|
+
|
2147
|
+
static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_stack(
|
2148
|
+
const std::vector<std::vector<llama_grammar_element>> & rules,
|
2149
|
+
const std::vector<const llama_grammar_element *> & stack,
|
2150
|
+
const std::vector<llama_grammar_candidate> & candidates) {
|
2151
|
+
|
2152
|
+
std::vector<llama_grammar_candidate> rejects;
|
2153
|
+
|
2154
|
+
if (stack.empty()) {
|
2155
|
+
// accept nothing; EOS is handled elsewhere
|
2156
|
+
rejects.insert(rejects.end(), candidates.begin(), candidates.end());
|
2157
|
+
return rejects;
|
2158
|
+
}
|
2159
|
+
|
2160
|
+
const llama_grammar_element * stack_pos = stack.back();
|
2161
|
+
|
2162
|
+
std::vector<llama_grammar_candidate> next_candidates;
|
2163
|
+
for (auto tok : candidates) {
|
2164
|
+
if (llama_grammar_match_char(stack_pos, tok.code_points[0]).first) {
|
2165
|
+
if (tok.code_points[1] != 0) {
|
2166
|
+
next_candidates.push_back({ tok.index, tok.code_points + 1 });
|
2167
|
+
}
|
2168
|
+
} else {
|
2169
|
+
rejects.push_back(tok);
|
2170
|
+
}
|
2171
|
+
}
|
2172
|
+
|
2173
|
+
auto stack_pos_after = llama_grammar_match_char(stack_pos, 0).second;
|
2174
|
+
|
2175
|
+
// update top of stack to next element, if any
|
2176
|
+
std::vector<const llama_grammar_element *> stack_after(stack.begin(), stack.end() - 1);
|
2177
|
+
if (!llama_grammar_is_end_of_sequence(stack_pos_after)) {
|
2178
|
+
stack_after.push_back(stack_pos_after);
|
2179
|
+
}
|
2180
|
+
std::vector<std::vector<const llama_grammar_element *>> next_stacks;
|
2181
|
+
llama_grammar_advance_stack(rules, stack_after, next_stacks);
|
2182
|
+
|
2183
|
+
auto next_rejects = llama_grammar_reject_candidates(rules, next_stacks, next_candidates);
|
2184
|
+
for (auto tok : next_rejects) {
|
2185
|
+
rejects.push_back({ tok.index, tok.code_points - 1 });
|
2186
|
+
}
|
2187
|
+
|
2188
|
+
return rejects;
|
2189
|
+
}
|
2190
|
+
|
2191
|
+
static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
|
2192
|
+
const std::vector<std::vector<llama_grammar_element>> & rules,
|
2193
|
+
const std::vector<std::vector<const llama_grammar_element *>> & stacks,
|
2194
|
+
const std::vector<llama_grammar_candidate> & candidates) {
|
2195
|
+
LLAMA_ASSERT(!stacks.empty()); // REVIEW
|
2196
|
+
|
2197
|
+
if (candidates.empty()) {
|
2198
|
+
return std::vector<llama_grammar_candidate>();
|
2199
|
+
}
|
2200
|
+
|
2201
|
+
auto rejects = llama_grammar_reject_candidates_for_stack(rules, stacks.front(), candidates);
|
2202
|
+
|
2203
|
+
for (size_t i = 1, size = stacks.size(); i < size; ++i) {
|
2204
|
+
rejects = llama_grammar_reject_candidates_for_stack(rules, stacks[i], rejects);
|
2205
|
+
}
|
2206
|
+
return rejects;
|
2207
|
+
}
|
2208
|
+
|
2209
|
+
//
|
2210
|
+
// grammar - external
|
2211
|
+
//
|
2212
|
+
|
2213
|
+
struct llama_grammar * llama_grammar_init(
|
2214
|
+
const llama_grammar_element ** rules,
|
2215
|
+
size_t n_rules,
|
2216
|
+
size_t start_rule_index) {
|
2217
|
+
const llama_grammar_element * pos;
|
2218
|
+
|
2219
|
+
// copy rule definitions into vectors
|
2220
|
+
std::vector<std::vector<llama_grammar_element>> vec_rules(n_rules);
|
2221
|
+
for (size_t i = 0; i < n_rules; i++) {
|
2222
|
+
for (pos = rules[i]; pos->type != LLAMA_GRETYPE_END; pos++) {
|
2223
|
+
vec_rules[i].push_back(*pos);
|
2224
|
+
}
|
2225
|
+
vec_rules[i].push_back({LLAMA_GRETYPE_END, 0});
|
2226
|
+
}
|
2227
|
+
|
2228
|
+
// loop over alternates of start rule to build initial stacks
|
2229
|
+
std::vector<std::vector<const llama_grammar_element *>> stacks;
|
2230
|
+
pos = rules[start_rule_index];
|
2231
|
+
do {
|
2232
|
+
std::vector<const llama_grammar_element *> stack;
|
2233
|
+
if (!llama_grammar_is_end_of_sequence(pos)) {
|
2234
|
+
// if alternate is nonempty, add to stack
|
2235
|
+
stack.push_back(pos);
|
2236
|
+
}
|
2237
|
+
llama_grammar_advance_stack(vec_rules, stack, stacks);
|
2238
|
+
while (!llama_grammar_is_end_of_sequence(pos)) {
|
2239
|
+
// scan to end of alternate def
|
2240
|
+
pos++;
|
2241
|
+
}
|
2242
|
+
if (pos->type == LLAMA_GRETYPE_ALT) {
|
2243
|
+
// there's another alternate def of this rule to process
|
2244
|
+
pos++;
|
2245
|
+
} else {
|
2246
|
+
break;
|
2247
|
+
}
|
2248
|
+
} while (true);
|
2249
|
+
|
2250
|
+
return new llama_grammar{ std::move(vec_rules), std::move(stacks) };
|
2251
|
+
}
|
2252
|
+
|
2253
|
+
// Destroys a grammar created by llama_grammar_init.
void llama_grammar_free(struct llama_grammar * grammar) {
    // `delete` on a null pointer is a no-op, so no explicit check is needed
    delete grammar;
}
|
2256
|
+
|
1894
2257
|
//
|
1895
2258
|
// sampling
|
1896
2259
|
//
|
@@ -2006,9 +2369,18 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *
|
|
2006
2369
|
}
|
2007
2370
|
|
2008
2371
|
// Normalize the second derivatives
|
2009
|
-
|
2010
|
-
|
2011
|
-
|
2372
|
+
{
|
2373
|
+
const float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
|
2374
|
+
|
2375
|
+
if (second_derivatives_sum > 1e-6f) {
|
2376
|
+
for (float & value : second_derivatives) {
|
2377
|
+
value /= second_derivatives_sum;
|
2378
|
+
}
|
2379
|
+
} else {
|
2380
|
+
for (float & value : second_derivatives) {
|
2381
|
+
value = 1.0f / second_derivatives.size();
|
2382
|
+
}
|
2383
|
+
}
|
2012
2384
|
}
|
2013
2385
|
|
2014
2386
|
float cum_sum = 0.0f;
|
@@ -2167,6 +2539,47 @@ void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, l
|
|
2167
2539
|
}
|
2168
2540
|
}
|
2169
2541
|
|
2542
|
+
void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar) {
|
2543
|
+
assert(ctx);
|
2544
|
+
const int64_t t_start_sample_us = ggml_time_us();
|
2545
|
+
|
2546
|
+
bool allow_eos = false;
|
2547
|
+
for (const auto & stack : grammar->stacks) {
|
2548
|
+
if (stack.empty()) {
|
2549
|
+
allow_eos = true;
|
2550
|
+
break;
|
2551
|
+
}
|
2552
|
+
}
|
2553
|
+
|
2554
|
+
const llama_token eos = llama_token_eos();
|
2555
|
+
|
2556
|
+
std::vector<std::vector<uint32_t>> candidates_decoded;
|
2557
|
+
std::vector<llama_grammar_candidate> candidates_grammar;
|
2558
|
+
|
2559
|
+
for (size_t i = 0; i < candidates->size; ++i) {
|
2560
|
+
const llama_token id = candidates->data[i].id;
|
2561
|
+
const char * str = llama_token_to_str(ctx, id);
|
2562
|
+
if (id == eos) {
|
2563
|
+
if (!allow_eos) {
|
2564
|
+
candidates->data[i].logit = -INFINITY;
|
2565
|
+
}
|
2566
|
+
} else if (*str == 0) {
|
2567
|
+
candidates->data[i].logit = -INFINITY;
|
2568
|
+
} else {
|
2569
|
+
candidates_decoded.push_back(decode_utf8(str));
|
2570
|
+
candidates_grammar.push_back({ i, candidates_decoded.back().data() });
|
2571
|
+
}
|
2572
|
+
}
|
2573
|
+
|
2574
|
+
const auto rejects =
|
2575
|
+
llama_grammar_reject_candidates(grammar->rules, grammar->stacks, candidates_grammar);
|
2576
|
+
for (auto & reject : rejects) {
|
2577
|
+
candidates->data[reject.index].logit = -INFINITY;
|
2578
|
+
}
|
2579
|
+
|
2580
|
+
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
2581
|
+
}
|
2582
|
+
|
2170
2583
|
static void llama_log_softmax(float * array, size_t size) {
|
2171
2584
|
float max_l = *std::max_element(array, array + size);
|
2172
2585
|
float sum = 0.f;
|
@@ -2185,9 +2598,8 @@ void llama_sample_classifier_free_guidance(
|
|
2185
2598
|
struct llama_context * ctx,
|
2186
2599
|
llama_token_data_array * candidates,
|
2187
2600
|
struct llama_context * guidance_ctx,
|
2188
|
-
float scale
|
2189
|
-
|
2190
|
-
int64_t t_start_sample_us = t_start_sample_us = ggml_time_us();
|
2601
|
+
float scale) {
|
2602
|
+
int64_t t_start_sample_us = ggml_time_us();
|
2191
2603
|
|
2192
2604
|
assert(ctx);
|
2193
2605
|
auto n_vocab = llama_n_vocab(ctx);
|
@@ -2207,16 +2619,7 @@ void llama_sample_classifier_free_guidance(
|
|
2207
2619
|
for (int i = 0; i < n_vocab; ++i) {
|
2208
2620
|
float logit_guidance = logits_guidance[i];
|
2209
2621
|
float logit_base = logits_base[i];
|
2210
|
-
|
2211
|
-
}
|
2212
|
-
|
2213
|
-
llama_log_softmax(logits_guidance, n_vocab);
|
2214
|
-
|
2215
|
-
for (int i = 0; i < n_vocab; ++i) {
|
2216
|
-
float logit_base = logits_base[i];
|
2217
|
-
float logit_guidance = logits_guidance[i];
|
2218
|
-
|
2219
|
-
candidates->data[i].logit = smooth_factor * logit_guidance + (1.f - smooth_factor) * logit_base;
|
2622
|
+
candidates->data[i].logit = scale * (logit_base - logit_guidance) + logit_guidance;
|
2220
2623
|
}
|
2221
2624
|
|
2222
2625
|
if (ctx) {
|
@@ -2352,6 +2755,29 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
|
|
2352
2755
|
return result;
|
2353
2756
|
}
|
2354
2757
|
|
2758
|
+
void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token) {
|
2759
|
+
const int64_t t_start_sample_us = ggml_time_us();
|
2760
|
+
|
2761
|
+
if (token == llama_token_eos()) {
|
2762
|
+
for (const auto & stack : grammar->stacks) {
|
2763
|
+
if (stack.empty()) {
|
2764
|
+
return;
|
2765
|
+
}
|
2766
|
+
}
|
2767
|
+
LLAMA_ASSERT(false);
|
2768
|
+
}
|
2769
|
+
|
2770
|
+
const char * str = llama_token_to_str(ctx, token);
|
2771
|
+
// Note terminating 0 in decoded string
|
2772
|
+
auto code_points = decode_utf8(str);
|
2773
|
+
for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
|
2774
|
+
grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
|
2775
|
+
}
|
2776
|
+
LLAMA_ASSERT(!grammar->stacks.empty());
|
2777
|
+
|
2778
|
+
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
2779
|
+
}
|
2780
|
+
|
2355
2781
|
//
|
2356
2782
|
// quantization
|
2357
2783
|
//
|
@@ -2425,8 +2851,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
2425
2851
|
case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
|
2426
2852
|
case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
|
2427
2853
|
case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
|
2428
|
-
case LLAMA_FTYPE_MOSTLY_F16:
|
2429
|
-
case LLAMA_FTYPE_ALL_F32:
|
2854
|
+
case LLAMA_FTYPE_MOSTLY_F16: quantized_type = GGML_TYPE_F16; break;
|
2855
|
+
case LLAMA_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break;
|
2430
2856
|
|
2431
2857
|
#ifdef GGML_USE_K_QUANTS
|
2432
2858
|
// K-quants
|
@@ -2510,16 +2936,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
2510
2936
|
} else {
|
2511
2937
|
new_type = quantized_type;
|
2512
2938
|
#ifdef GGML_USE_K_QUANTS
|
2513
|
-
bool convert_incompatible_tensor = false;
|
2514
|
-
if (quantized_type == GGML_TYPE_Q2_K || quantized_type == GGML_TYPE_Q3_K || quantized_type == GGML_TYPE_Q4_K ||
|
2515
|
-
quantized_type == GGML_TYPE_Q5_K || quantized_type == GGML_TYPE_Q6_K) {
|
2516
|
-
int nx = tensor.ne.at(0);
|
2517
|
-
int ny = tensor.ne.at(1);
|
2518
|
-
if (nx % QK_K != 0 || ny % QK_K != 0) {
|
2519
|
-
fprintf(stderr, "\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
|
2520
|
-
convert_incompatible_tensor = true;
|
2521
|
-
}
|
2522
|
-
}
|
2523
2939
|
if (tensor.name == "output.weight") {
|
2524
2940
|
int nx = tensor.ne.at(0);
|
2525
2941
|
int ny = tensor.ne.at(1);
|
@@ -2545,6 +2961,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
2545
2961
|
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
|
2546
2962
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
|
2547
2963
|
}
|
2964
|
+
bool convert_incompatible_tensor = false;
|
2965
|
+
if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
|
2966
|
+
new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
|
2967
|
+
int nx = tensor.ne.at(0);
|
2968
|
+
int ny = tensor.ne.at(1);
|
2969
|
+
if (nx % QK_K != 0 || ny % QK_K != 0) {
|
2970
|
+
fprintf(stderr, "\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
|
2971
|
+
convert_incompatible_tensor = true;
|
2972
|
+
}
|
2973
|
+
}
|
2548
2974
|
if (convert_incompatible_tensor) {
|
2549
2975
|
if (tensor.name == "output.weight") {
|
2550
2976
|
new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
|
@@ -2571,7 +2997,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
2571
2997
|
f32_data = (float *) f32_conv_buf.addr;
|
2572
2998
|
}
|
2573
2999
|
|
2574
|
-
printf("quantizing .. ");
|
3000
|
+
printf("quantizing to %s .. ", ggml_type_name(new_type));
|
2575
3001
|
fflush(stdout);
|
2576
3002
|
|
2577
3003
|
work.resize(nelements * 4); // upper bound on size
|
@@ -2674,9 +3100,10 @@ struct llama_model * llama_load_model_from_file(
|
|
2674
3100
|
|
2675
3101
|
ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
|
2676
3102
|
|
2677
|
-
if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
|
2678
|
-
params.main_gpu, params.tensor_split, params.
|
2679
|
-
params.
|
3103
|
+
if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gqa, params.rms_norm_eps, params.n_gpu_layers,
|
3104
|
+
params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
|
3105
|
+
memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback,
|
3106
|
+
params.progress_callback_user_data)) {
|
2680
3107
|
delete model;
|
2681
3108
|
fprintf(stderr, "%s: failed to load model\n", __func__);
|
2682
3109
|
return nullptr;
|
@@ -2697,7 +3124,7 @@ struct llama_context * llama_new_context_with_model(
|
|
2697
3124
|
return nullptr;
|
2698
3125
|
}
|
2699
3126
|
|
2700
|
-
llama_context * ctx = new llama_context(*model
|
3127
|
+
llama_context * ctx = new llama_context(*model);
|
2701
3128
|
|
2702
3129
|
if (params.seed == LLAMA_DEFAULT_SEED) {
|
2703
3130
|
params.seed = time(NULL);
|
@@ -2751,9 +3178,9 @@ struct llama_context * llama_new_context_with_model(
|
|
2751
3178
|
ctx->embedding.resize(hparams.n_embd);
|
2752
3179
|
}
|
2753
3180
|
|
2754
|
-
ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));
|
3181
|
+
ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead());
|
2755
3182
|
|
2756
|
-
ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type));
|
3183
|
+
ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type));
|
2757
3184
|
ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
|
2758
3185
|
}
|
2759
3186
|
|
@@ -2775,7 +3202,7 @@ struct llama_context * llama_new_context_with_model(
|
|
2775
3202
|
|
2776
3203
|
const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
|
2777
3204
|
|
2778
|
-
|
3205
|
+
fprintf(stderr, "%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
|
2779
3206
|
|
2780
3207
|
#define LLAMA_METAL_CHECK_BUF(result) \
|
2781
3208
|
if (!(result)) { \
|
@@ -3535,13 +3962,13 @@ int llama_eval_export(struct llama_context * ctx, const char * fname) {
|
|
3535
3962
|
return 0;
|
3536
3963
|
}
|
3537
3964
|
|
3538
|
-
int
|
3539
|
-
|
3965
|
+
int llama_tokenize_with_model(
|
3966
|
+
const struct llama_model * model,
|
3540
3967
|
const char * text,
|
3541
3968
|
llama_token * tokens,
|
3542
3969
|
int n_max_tokens,
|
3543
3970
|
bool add_bos) {
|
3544
|
-
auto res = llama_tokenize(
|
3971
|
+
auto res = llama_tokenize(model->vocab, text, add_bos);
|
3545
3972
|
|
3546
3973
|
if (n_max_tokens < (int) res.size()) {
|
3547
3974
|
fprintf(stderr, "%s: too many tokens\n", __func__);
|
@@ -3555,8 +3982,29 @@ int llama_tokenize(
|
|
3555
3982
|
return res.size();
|
3556
3983
|
}
|
3557
3984
|
|
3985
|
+
int llama_tokenize(
|
3986
|
+
struct llama_context * ctx,
|
3987
|
+
const char * text,
|
3988
|
+
llama_token * tokens,
|
3989
|
+
int n_max_tokens,
|
3990
|
+
bool add_bos) {
|
3991
|
+
return llama_tokenize_with_model(&ctx->model, text, tokens, n_max_tokens, add_bos);
|
3992
|
+
}
|
3993
|
+
|
3994
|
+
int llama_n_vocab_from_model(const struct llama_model * model) {
|
3995
|
+
return model->vocab.id_to_token.size();
|
3996
|
+
}
|
3997
|
+
|
3998
|
+
int llama_n_ctx_from_model(const struct llama_model * model) {
|
3999
|
+
return model->hparams.n_ctx;
|
4000
|
+
}
|
4001
|
+
|
4002
|
+
int llama_n_embd_from_model(const struct llama_model * model) {
|
4003
|
+
return model->hparams.n_embd;
|
4004
|
+
}
|
4005
|
+
|
3558
4006
|
int llama_n_vocab(const struct llama_context * ctx) {
|
3559
|
-
return ctx->vocab.id_to_token.size();
|
4007
|
+
return ctx->model.vocab.id_to_token.size();
|
3560
4008
|
}
|
3561
4009
|
|
3562
4010
|
int llama_n_ctx(const struct llama_context * ctx) {
|
@@ -3567,19 +4015,27 @@ int llama_n_embd(const struct llama_context * ctx) {
|
|
3567
4015
|
return ctx->model.hparams.n_embd;
|
3568
4016
|
}
|
3569
4017
|
|
3570
|
-
int
|
3571
|
-
const struct
|
4018
|
+
int llama_get_vocab_from_model(
|
4019
|
+
const struct llama_model * model,
|
3572
4020
|
const char * * strings,
|
3573
4021
|
float * scores,
|
3574
4022
|
int capacity) {
|
3575
|
-
int n = std::min(capacity, (int)
|
4023
|
+
int n = std::min(capacity, (int) model->vocab.id_to_token.size());
|
3576
4024
|
for (int i = 0; i<n; ++i) {
|
3577
|
-
strings[i] =
|
3578
|
-
scores[i] =
|
4025
|
+
strings[i] = model->vocab.id_to_token[i].tok.c_str();
|
4026
|
+
scores[i] = model->vocab.id_to_token[i].score;
|
3579
4027
|
}
|
3580
4028
|
return n;
|
3581
4029
|
}
|
3582
4030
|
|
4031
|
+
int llama_get_vocab(
|
4032
|
+
const struct llama_context * ctx,
|
4033
|
+
const char * * strings,
|
4034
|
+
float * scores,
|
4035
|
+
int capacity) {
|
4036
|
+
return llama_get_vocab_from_model(&ctx->model, strings, scores, capacity);
|
4037
|
+
}
|
4038
|
+
|
3583
4039
|
float * llama_get_logits(struct llama_context * ctx) {
|
3584
4040
|
return ctx->logits.data();
|
3585
4041
|
}
|
@@ -3588,12 +4044,16 @@ float * llama_get_embeddings(struct llama_context * ctx) {
|
|
3588
4044
|
return ctx->embedding.data();
|
3589
4045
|
}
|
3590
4046
|
|
3591
|
-
const char *
|
3592
|
-
if (token >=
|
4047
|
+
const char * llama_token_to_str_with_model(const struct llama_model * model, llama_token token) {
|
4048
|
+
if (token >= llama_n_vocab_from_model(model)) {
|
3593
4049
|
return nullptr;
|
3594
4050
|
}
|
3595
4051
|
|
3596
|
-
return
|
4052
|
+
return model->vocab.id_to_token[token].tok.c_str();
|
4053
|
+
}
|
4054
|
+
|
4055
|
+
const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) {
|
4056
|
+
return llama_token_to_str_with_model(&ctx->model, token);
|
3597
4057
|
}
|
3598
4058
|
|
3599
4059
|
llama_token llama_token_bos() {
|