llama_cpp 0.3.4 → 0.3.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/ext/llama_cpp/extconf.rb +1 -0
- data/ext/llama_cpp/llama_cpp.cpp +293 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +304 -99
- data/ext/llama_cpp/src/ggml-metal.h +7 -0
- data/ext/llama_cpp/src/ggml-metal.m +201 -71
- data/ext/llama_cpp/src/ggml-metal.metal +68 -54
- data/ext/llama_cpp/src/ggml.c +713 -978
- data/ext/llama_cpp/src/ggml.h +82 -17
- data/ext/llama_cpp/src/k_quants.c +327 -3
- data/ext/llama_cpp/src/llama.cpp +524 -121
- data/ext/llama_cpp/src/llama.h +60 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +24 -0
- metadata +2 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -67,6 +67,7 @@ enum e_model {
|
|
67
67
|
MODEL_13B,
|
68
68
|
MODEL_30B,
|
69
69
|
MODEL_65B,
|
70
|
+
MODEL_70B,
|
70
71
|
};
|
71
72
|
|
72
73
|
static const size_t kB = 1024;
|
@@ -98,18 +99,18 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
|
|
98
99
|
}
|
99
100
|
|
100
101
|
//
|
101
|
-
// memory sizes
|
102
|
+
// memory sizes (calculated for n_batch == 512)
|
102
103
|
//
|
103
104
|
|
104
105
|
static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0(int n_ctx)
|
105
106
|
{
|
106
107
|
static std::map<e_model, size_t> k_sizes = {
|
107
|
-
|
108
|
-
{
|
109
|
-
{
|
110
|
-
{
|
111
|
-
{
|
112
|
-
{
|
108
|
+
{ MODEL_3B, ((size_t) n_ctx / 16ull + 92ull) * MB },
|
109
|
+
{ MODEL_7B, ((size_t) n_ctx / 16ull + 100ull) * MB },
|
110
|
+
{ MODEL_13B, ((size_t) n_ctx / 12ull + 120ull) * MB },
|
111
|
+
{ MODEL_30B, ((size_t) n_ctx / 9ull + 160ull) * MB },
|
112
|
+
{ MODEL_65B, ((size_t) n_ctx / 6ull + 256ull) * MB }, // guess
|
113
|
+
{ MODEL_70B, ((size_t) n_ctx / 7ull + 164ull) * MB },
|
113
114
|
};
|
114
115
|
return k_sizes;
|
115
116
|
}
|
@@ -117,38 +118,26 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0(int n_ctx)
|
|
117
118
|
static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
|
118
119
|
{
|
119
120
|
static std::map<e_model, size_t> k_sizes = {
|
120
|
-
{ MODEL_3B,
|
121
|
-
{ MODEL_7B,
|
122
|
-
{ MODEL_13B,
|
123
|
-
{ MODEL_30B,
|
124
|
-
{ MODEL_65B,
|
121
|
+
{ MODEL_3B, 128ull * MB },
|
122
|
+
{ MODEL_7B, 160ull * MB },
|
123
|
+
{ MODEL_13B, 192ull * MB },
|
124
|
+
{ MODEL_30B, 256ull * MB },
|
125
|
+
{ MODEL_65B, 384ull * MB }, // guess
|
126
|
+
{ MODEL_70B, 304ull * MB },
|
125
127
|
};
|
126
128
|
return k_sizes;
|
127
129
|
}
|
128
130
|
|
129
|
-
//
|
130
|
-
static const std::map<e_model, size_t> &
|
131
|
+
// used to store the compute graph tensors + non-scratch data
|
132
|
+
static const std::map<e_model, size_t> & MEM_REQ_EVAL()
|
131
133
|
{
|
132
134
|
static std::map<e_model, size_t> k_sizes = {
|
133
|
-
{ MODEL_3B,
|
134
|
-
{ MODEL_7B,
|
135
|
-
{ MODEL_13B,
|
136
|
-
{ MODEL_30B,
|
137
|
-
{ MODEL_65B,
|
138
|
-
|
139
|
-
return k_sizes;
|
140
|
-
}
|
141
|
-
|
142
|
-
// this is mostly needed for temporary mul_mat buffers to dequantize the data
|
143
|
-
// not actually needed if BLAS is disabled
|
144
|
-
static const std::map<e_model, size_t> & MEM_REQ_EVAL(int n_ctx)
|
145
|
-
{
|
146
|
-
static std::map<e_model, size_t> k_sizes = {
|
147
|
-
{ MODEL_3B, ((size_t) n_ctx / 256ull + 512ull) * MB },
|
148
|
-
{ MODEL_7B, ((size_t) n_ctx / 256ull + 768ull) * MB },
|
149
|
-
{ MODEL_13B, ((size_t) n_ctx / 256ull + 1024ull) * MB },
|
150
|
-
{ MODEL_30B, ((size_t) n_ctx / 256ull + 1280ull) * MB },
|
151
|
-
{ MODEL_65B, ((size_t) n_ctx / 256ull + 1536ull) * MB },
|
135
|
+
{ MODEL_3B, 8ull * MB },
|
136
|
+
{ MODEL_7B, 10ull * MB },
|
137
|
+
{ MODEL_13B, 12ull * MB },
|
138
|
+
{ MODEL_30B, 16ull * MB },
|
139
|
+
{ MODEL_65B, 24ull * MB }, // guess
|
140
|
+
{ MODEL_70B, 24ull * MB },
|
152
141
|
};
|
153
142
|
return k_sizes;
|
154
143
|
}
|
@@ -163,6 +152,7 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
|
|
163
152
|
{ MODEL_13B, 640ull * kB },
|
164
153
|
{ MODEL_30B, 768ull * kB },
|
165
154
|
{ MODEL_65B, 1536ull * kB },
|
155
|
+
{ MODEL_70B, 1536ull * kB }, // TODO (likely can be reduced)
|
166
156
|
};
|
167
157
|
return k_sizes;
|
168
158
|
}
|
@@ -177,19 +167,26 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
|
|
177
167
|
{ MODEL_13B, 160ull },
|
178
168
|
{ MODEL_30B, 208ull },
|
179
169
|
{ MODEL_65B, 416ull },
|
170
|
+
{ MODEL_70B, 416ull }, // TODO (likely can be reduced)
|
180
171
|
};
|
181
172
|
return k_sizes;
|
182
173
|
}
|
183
174
|
|
184
175
|
// default hparams (LLaMA 7B)
|
185
176
|
struct llama_hparams {
|
186
|
-
uint32_t n_vocab
|
187
|
-
uint32_t n_ctx
|
188
|
-
uint32_t n_embd
|
189
|
-
uint32_t n_mult
|
190
|
-
uint32_t n_head
|
191
|
-
uint32_t
|
192
|
-
uint32_t
|
177
|
+
uint32_t n_vocab = 32000;
|
178
|
+
uint32_t n_ctx = 512; // this is provided as user input?
|
179
|
+
uint32_t n_embd = 4096;
|
180
|
+
uint32_t n_mult = 256;
|
181
|
+
uint32_t n_head = 32;
|
182
|
+
uint32_t n_head_kv = 32;
|
183
|
+
uint32_t n_layer = 32;
|
184
|
+
uint32_t n_rot = 64;
|
185
|
+
|
186
|
+
// LLaMAv2
|
187
|
+
// TODO: load from model data hparams
|
188
|
+
float f_ffn_mult = 1.0f;
|
189
|
+
float f_rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
|
193
190
|
|
194
191
|
float rope_freq_base = 10000.0f;
|
195
192
|
float rope_freq_scale = 1.0f;
|
@@ -197,7 +194,28 @@ struct llama_hparams {
|
|
197
194
|
enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
|
198
195
|
|
199
196
|
bool operator!=(const llama_hparams & other) const {
|
200
|
-
return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams)));
|
197
|
+
return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams))); // NOLINT
|
198
|
+
}
|
199
|
+
|
200
|
+
uint32_t n_gqa() const {
|
201
|
+
return n_head/n_head_kv;
|
202
|
+
}
|
203
|
+
|
204
|
+
uint32_t n_embd_head() const {
|
205
|
+
return n_embd/n_head;
|
206
|
+
}
|
207
|
+
|
208
|
+
uint32_t n_embd_gqa() const {
|
209
|
+
return n_embd/n_gqa();
|
210
|
+
}
|
211
|
+
|
212
|
+
size_t kv_size() const {
|
213
|
+
size_t result = 2ull;
|
214
|
+
result *= (size_t) n_embd_gqa();
|
215
|
+
result *= (size_t) n_ctx;
|
216
|
+
result *= (size_t) n_layer;
|
217
|
+
result *= sizeof(ggml_fp16_t);
|
218
|
+
return result;
|
201
219
|
}
|
202
220
|
};
|
203
221
|
|
@@ -499,12 +517,16 @@ struct llama_file_loader {
|
|
499
517
|
}
|
500
518
|
void read_hparams() {
|
501
519
|
hparams.n_vocab = file.read_u32();
|
502
|
-
hparams.n_embd
|
503
|
-
hparams.n_mult
|
504
|
-
hparams.n_head
|
520
|
+
hparams.n_embd = file.read_u32();
|
521
|
+
hparams.n_mult = file.read_u32();
|
522
|
+
hparams.n_head = file.read_u32();
|
505
523
|
hparams.n_layer = file.read_u32();
|
506
|
-
hparams.n_rot
|
507
|
-
hparams.ftype
|
524
|
+
hparams.n_rot = file.read_u32();
|
525
|
+
hparams.ftype = (enum llama_ftype) file.read_u32();
|
526
|
+
|
527
|
+
// LLaMAv2
|
528
|
+
// TODO: read from header
|
529
|
+
hparams.n_head_kv = hparams.n_head;
|
508
530
|
}
|
509
531
|
void read_vocab() {
|
510
532
|
vocab.id_to_token.resize(hparams.n_vocab);
|
@@ -803,7 +825,7 @@ static bool kv_cache_init(
|
|
803
825
|
ggml_type wtype,
|
804
826
|
int n_ctx,
|
805
827
|
int n_gpu_layers) {
|
806
|
-
const int n_embd = hparams.
|
828
|
+
const int n_embd = hparams.n_embd_gqa();
|
807
829
|
const int n_layer = hparams.n_layer;
|
808
830
|
|
809
831
|
const int64_t n_mem = n_layer*n_ctx;
|
@@ -847,6 +869,8 @@ struct llama_context_params llama_context_default_params() {
|
|
847
869
|
/*.seed =*/ LLAMA_DEFAULT_SEED,
|
848
870
|
/*.n_ctx =*/ 512,
|
849
871
|
/*.n_batch =*/ 512,
|
872
|
+
/*.n_gqa =*/ 1,
|
873
|
+
/*.rms_norm_eps =*/ LLAMA_DEFAULT_RMS_EPS,
|
850
874
|
/*.gpu_layers =*/ 0,
|
851
875
|
/*.main_gpu =*/ 0,
|
852
876
|
/*.tensor_split =*/ nullptr,
|
@@ -966,6 +990,7 @@ static const char *llama_model_type_name(e_model type) {
|
|
966
990
|
case MODEL_13B: return "13B";
|
967
991
|
case MODEL_30B: return "30B";
|
968
992
|
case MODEL_65B: return "65B";
|
993
|
+
case MODEL_70B: return "70B";
|
969
994
|
default: LLAMA_ASSERT(false);
|
970
995
|
}
|
971
996
|
}
|
@@ -976,6 +1001,8 @@ static void llama_model_load_internal(
|
|
976
1001
|
llama_vocab & vocab,
|
977
1002
|
int n_ctx,
|
978
1003
|
int n_batch,
|
1004
|
+
int n_gqa,
|
1005
|
+
float rms_norm_eps,
|
979
1006
|
int n_gpu_layers,
|
980
1007
|
int main_gpu,
|
981
1008
|
const float * tensor_split,
|
@@ -997,8 +1024,12 @@ static void llama_model_load_internal(
|
|
997
1024
|
model.hparams = ml->file_loader->hparams;
|
998
1025
|
model.n_gpu_layers = n_gpu_layers;
|
999
1026
|
llama_file_version file_version = ml->file_loader->file_version;
|
1027
|
+
|
1000
1028
|
auto & hparams = model.hparams;
|
1001
1029
|
|
1030
|
+
// TODO: read from file
|
1031
|
+
hparams.f_rms_norm_eps = rms_norm_eps;
|
1032
|
+
|
1002
1033
|
{
|
1003
1034
|
switch (hparams.n_layer) {
|
1004
1035
|
case 26: model.type = e_model::MODEL_3B; break;
|
@@ -1016,11 +1047,25 @@ static void llama_model_load_internal(
|
|
1016
1047
|
|
1017
1048
|
hparams.n_ctx = n_ctx;
|
1018
1049
|
|
1050
|
+
// LLaMAv2
|
1051
|
+
// TODO: temporary until GGUF
|
1052
|
+
LLAMA_ASSERT(hparams.n_head % n_gqa == 0);
|
1053
|
+
hparams.n_head_kv = hparams.n_head / n_gqa;
|
1054
|
+
if (model.type == e_model::MODEL_65B && n_gqa == 8) {
|
1055
|
+
fprintf(stderr, "%s: warning: assuming 70B model based on GQA == %d\n", __func__, n_gqa);
|
1056
|
+
model.type = e_model::MODEL_70B;
|
1057
|
+
hparams.f_ffn_mult = 1.3f; // from the params.json of the 70B model
|
1058
|
+
}
|
1059
|
+
|
1019
1060
|
hparams.rope_freq_base = rope_freq_base;
|
1020
1061
|
hparams.rope_freq_scale = rope_freq_scale;
|
1021
1062
|
}
|
1022
1063
|
|
1023
|
-
|
1064
|
+
// ref: https://github.com/facebookresearch/llama/blob/6c7fe276574e78057f917549435a2554000a876d/llama/model.py#L194-L199
|
1065
|
+
const uint32_t n_ff_raw = 2*(4*hparams.n_embd)/3;
|
1066
|
+
const uint32_t n_ff_mult = hparams.f_ffn_mult*n_ff_raw;
|
1067
|
+
const uint32_t n_ff = ((n_ff_mult + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
|
1068
|
+
//const uint32_t n_ff = 28672;
|
1024
1069
|
|
1025
1070
|
{
|
1026
1071
|
fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version));
|
@@ -1029,12 +1074,15 @@ static void llama_model_load_internal(
|
|
1029
1074
|
fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd);
|
1030
1075
|
fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult);
|
1031
1076
|
fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head);
|
1077
|
+
fprintf(stderr, "%s: n_head_kv = %u\n", __func__, hparams.n_head_kv);
|
1032
1078
|
fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer);
|
1033
|
-
fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot);
|
1079
|
+
fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
|
1080
|
+
fprintf(stderr, "%s: n_gqa = %u\n", __func__, hparams.n_gqa());
|
1081
|
+
fprintf(stderr, "%s: rnorm_eps = %.1e\n", __func__, hparams.f_rms_norm_eps);
|
1082
|
+
fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
|
1034
1083
|
fprintf(stderr, "%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base);
|
1035
1084
|
fprintf(stderr, "%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale);
|
1036
1085
|
fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
|
1037
|
-
fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
|
1038
1086
|
fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
|
1039
1087
|
}
|
1040
1088
|
|
@@ -1069,7 +1117,7 @@ static void llama_model_load_internal(
|
|
1069
1117
|
{
|
1070
1118
|
model.buf.resize(ctx_size);
|
1071
1119
|
if (use_mlock) {
|
1072
|
-
model.mlock_buf.init(model.buf.addr);
|
1120
|
+
model.mlock_buf.init (model.buf.addr);
|
1073
1121
|
model.mlock_buf.grow_to(model.buf.size);
|
1074
1122
|
}
|
1075
1123
|
|
@@ -1104,9 +1152,10 @@ static void llama_model_load_internal(
|
|
1104
1152
|
size_t vram_weights = 0;
|
1105
1153
|
size_t vram_scratch = 0;
|
1106
1154
|
{
|
1107
|
-
const uint32_t n_embd
|
1108
|
-
const uint32_t
|
1109
|
-
const uint32_t
|
1155
|
+
const uint32_t n_embd = hparams.n_embd;
|
1156
|
+
const uint32_t n_embd_gqa = hparams.n_embd_gqa();
|
1157
|
+
const uint32_t n_layer = hparams.n_layer;
|
1158
|
+
const uint32_t n_vocab = hparams.n_vocab;
|
1110
1159
|
|
1111
1160
|
ml->ggml_ctx = ctx;
|
1112
1161
|
|
@@ -1154,16 +1203,16 @@ static void llama_model_load_internal(
|
|
1154
1203
|
|
1155
1204
|
layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);
|
1156
1205
|
|
1157
|
-
layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd},
|
1158
|
-
layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd,
|
1159
|
-
layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd,
|
1160
|
-
layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd},
|
1206
|
+
layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend_split);
|
1207
|
+
layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd_gqa}, backend_split);
|
1208
|
+
layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd_gqa}, backend_split);
|
1209
|
+
layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend_split);
|
1161
1210
|
|
1162
1211
|
layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);
|
1163
1212
|
|
1164
|
-
layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff},
|
1165
|
-
layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff,
|
1166
|
-
layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff},
|
1213
|
+
layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend_split);
|
1214
|
+
layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend_split);
|
1215
|
+
layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend_split);
|
1167
1216
|
|
1168
1217
|
if (backend == GGML_BACKEND_GPU) {
|
1169
1218
|
vram_weights +=
|
@@ -1186,11 +1235,11 @@ static void llama_model_load_internal(
|
|
1186
1235
|
mmapped_size - vram_weights + // weights in VRAM not in memory
|
1187
1236
|
MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) +
|
1188
1237
|
MEM_REQ_SCRATCH1().at(model.type) +
|
1189
|
-
MEM_REQ_EVAL(
|
1238
|
+
MEM_REQ_EVAL().at(model.type);
|
1190
1239
|
|
1191
1240
|
// this is the memory required by one llama_state
|
1192
1241
|
const size_t mem_required_state =
|
1193
|
-
scale*
|
1242
|
+
scale*hparams.kv_size();
|
1194
1243
|
|
1195
1244
|
fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
|
1196
1245
|
mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
|
@@ -1231,7 +1280,7 @@ static void llama_model_load_internal(
|
|
1231
1280
|
fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
|
1232
1281
|
} else {
|
1233
1282
|
fprintf(stderr, "%s: offloading v cache to GPU\n", __func__);
|
1234
|
-
vram_kv_cache +=
|
1283
|
+
vram_kv_cache += hparams.kv_size() / 2;
|
1235
1284
|
}
|
1236
1285
|
}
|
1237
1286
|
if (n_gpu_layers > (int) hparams.n_layer + 2) {
|
@@ -1239,7 +1288,7 @@ static void llama_model_load_internal(
|
|
1239
1288
|
fprintf(stderr, "%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
|
1240
1289
|
} else {
|
1241
1290
|
fprintf(stderr, "%s: offloading k cache to GPU\n", __func__);
|
1242
|
-
vram_kv_cache +=
|
1291
|
+
vram_kv_cache += hparams.kv_size() / 2;
|
1243
1292
|
}
|
1244
1293
|
}
|
1245
1294
|
#elif defined(GGML_USE_CLBLAST)
|
@@ -1287,6 +1336,8 @@ static bool llama_model_load(
|
|
1287
1336
|
llama_vocab & vocab,
|
1288
1337
|
int n_ctx,
|
1289
1338
|
int n_batch,
|
1339
|
+
int n_gqa,
|
1340
|
+
float rms_norm_eps,
|
1290
1341
|
int n_gpu_layers,
|
1291
1342
|
int main_gpu,
|
1292
1343
|
const float * tensor_split,
|
@@ -1300,7 +1351,7 @@ static bool llama_model_load(
|
|
1300
1351
|
llama_progress_callback progress_callback,
|
1301
1352
|
void *progress_callback_user_data) {
|
1302
1353
|
try {
|
1303
|
-
llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type,
|
1354
|
+
llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, rms_norm_eps, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type,
|
1304
1355
|
use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
|
1305
1356
|
return true;
|
1306
1357
|
} catch (const std::exception & err) {
|
@@ -1344,16 +1395,23 @@ static bool llama_eval_internal(
|
|
1344
1395
|
|
1345
1396
|
LLAMA_ASSERT(!!kv_self.ctx);
|
1346
1397
|
|
1347
|
-
const
|
1348
|
-
const
|
1349
|
-
const
|
1350
|
-
const
|
1351
|
-
const
|
1352
|
-
const
|
1353
|
-
const
|
1398
|
+
const int64_t n_embd = hparams.n_embd;
|
1399
|
+
const int64_t n_layer = hparams.n_layer;
|
1400
|
+
const int64_t n_ctx = hparams.n_ctx;
|
1401
|
+
const int64_t n_head = hparams.n_head;
|
1402
|
+
const int64_t n_head_kv = hparams.n_head_kv;
|
1403
|
+
const int64_t n_embd_head = hparams.n_embd_head();
|
1404
|
+
const int64_t n_vocab = hparams.n_vocab;
|
1405
|
+
const int64_t n_embd_gqa = hparams.n_embd_gqa();
|
1406
|
+
|
1407
|
+
|
1408
|
+
LLAMA_ASSERT(n_embd_head == hparams.n_rot);
|
1354
1409
|
|
1355
1410
|
const float freq_base = hparams.rope_freq_base;
|
1356
1411
|
const float freq_scale = hparams.rope_freq_scale;
|
1412
|
+
const float rms_norm_eps = hparams.f_rms_norm_eps;
|
1413
|
+
|
1414
|
+
const int n_gpu_layers = model.n_gpu_layers;
|
1357
1415
|
|
1358
1416
|
auto & mem_per_token = lctx.mem_per_token;
|
1359
1417
|
auto & buf_compute = lctx.buf_compute;
|
@@ -1366,7 +1424,7 @@ static bool llama_eval_internal(
|
|
1366
1424
|
|
1367
1425
|
struct ggml_context * ctx0 = ggml_init(params);
|
1368
1426
|
|
1369
|
-
ggml_cgraph gf =
|
1427
|
+
ggml_cgraph * gf = ggml_new_graph(ctx0);
|
1370
1428
|
|
1371
1429
|
// for big prompts, if BLAS is enabled, it is better to use only one thread
|
1372
1430
|
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
|
@@ -1431,7 +1489,7 @@ static bool llama_eval_internal(
|
|
1431
1489
|
|
1432
1490
|
// norm
|
1433
1491
|
{
|
1434
|
-
cur = ggml_rms_norm(ctx0, inpL);
|
1492
|
+
cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
|
1435
1493
|
offload_func(cur);
|
1436
1494
|
ggml_set_name(cur, "rms_norm_0");
|
1437
1495
|
|
@@ -1452,11 +1510,11 @@ static bool llama_eval_internal(
|
|
1452
1510
|
offload_func_kq(tmpq);
|
1453
1511
|
ggml_set_name(tmpq, "tmpq");
|
1454
1512
|
|
1455
|
-
struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk,
|
1513
|
+
struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
|
1456
1514
|
offload_func_kq(Kcur);
|
1457
1515
|
ggml_set_name(Kcur, "Kcur");
|
1458
1516
|
|
1459
|
-
struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq,
|
1517
|
+
struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
|
1460
1518
|
offload_func_kq(Qcur);
|
1461
1519
|
ggml_set_name(Qcur, "Qcur");
|
1462
1520
|
|
@@ -1468,23 +1526,23 @@ static bool llama_eval_internal(
|
|
1468
1526
|
offload_func_v(tmpv);
|
1469
1527
|
ggml_set_name(tmpv, "tmpv");
|
1470
1528
|
|
1471
|
-
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv,
|
1529
|
+
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, N));
|
1472
1530
|
offload_func_v(Vcur);
|
1473
1531
|
ggml_set_name(Vcur, "Vcur");
|
1474
1532
|
|
1475
|
-
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*
|
1533
|
+
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
|
1476
1534
|
offload_func_kq(k);
|
1477
1535
|
ggml_set_name(k, "k");
|
1478
1536
|
|
1479
|
-
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N,
|
1537
|
+
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
|
1480
1538
|
( n_ctx)*ggml_element_size(kv_self.v),
|
1481
|
-
(il*n_ctx)*ggml_element_size(kv_self.v)*
|
1539
|
+
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
|
1482
1540
|
offload_func_v(v);
|
1483
1541
|
ggml_set_name(v, "v");
|
1484
1542
|
|
1485
1543
|
// important: storing RoPE-ed version of K in the KV cache!
|
1486
|
-
ggml_build_forward_expand(
|
1487
|
-
ggml_build_forward_expand(
|
1544
|
+
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
|
1545
|
+
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
|
1488
1546
|
}
|
1489
1547
|
|
1490
1548
|
struct ggml_tensor * Q =
|
@@ -1497,8 +1555,8 @@ static bool llama_eval_internal(
|
|
1497
1555
|
struct ggml_tensor * K =
|
1498
1556
|
ggml_permute(ctx0,
|
1499
1557
|
ggml_reshape_3d(ctx0,
|
1500
|
-
ggml_view_1d(ctx0, kv_self.k, (n_past + N)*
|
1501
|
-
|
1558
|
+
ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd_gqa, il*n_ctx*ggml_element_size(kv_self.k)*n_embd_gqa),
|
1559
|
+
n_embd_head, n_head_kv, n_past + N),
|
1502
1560
|
0, 2, 1, 3);
|
1503
1561
|
offload_func_kq(K);
|
1504
1562
|
ggml_set_name(K, "K");
|
@@ -1508,9 +1566,9 @@ static bool llama_eval_internal(
|
|
1508
1566
|
offload_func_kq(KQ);
|
1509
1567
|
ggml_set_name(KQ, "KQ");
|
1510
1568
|
|
1511
|
-
// KQ_scaled = KQ / sqrt(
|
1569
|
+
// KQ_scaled = KQ / sqrt(n_embd_head)
|
1512
1570
|
struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head));
|
1513
|
-
ggml_set_name(KQ_scale, "1/sqrt(
|
1571
|
+
ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
|
1514
1572
|
|
1515
1573
|
// KQ_scaled shape [n_past + N, N, n_head, 1]
|
1516
1574
|
struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
|
@@ -1530,10 +1588,10 @@ static bool llama_eval_internal(
|
|
1530
1588
|
// split cached V into n_head heads
|
1531
1589
|
struct ggml_tensor * V =
|
1532
1590
|
ggml_view_3d(ctx0, kv_self.v,
|
1533
|
-
n_past + N,
|
1591
|
+
n_past + N, n_embd_head, n_head_kv,
|
1534
1592
|
n_ctx*ggml_element_size(kv_self.v),
|
1535
|
-
n_ctx*ggml_element_size(kv_self.v)*
|
1536
|
-
|
1593
|
+
n_ctx*ggml_element_size(kv_self.v)*n_embd_head,
|
1594
|
+
n_ctx*ggml_element_size(kv_self.v)*n_embd_gqa*il);
|
1537
1595
|
offload_func_v(V);
|
1538
1596
|
ggml_set_name(V, "V");
|
1539
1597
|
|
@@ -1545,7 +1603,7 @@ static bool llama_eval_internal(
|
|
1545
1603
|
// make V contiguous in memory to speed up the matmul, however we waste time on the copy
|
1546
1604
|
// on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
|
1547
1605
|
// is there a better way?
|
1548
|
-
struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N,
|
1606
|
+
struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head));
|
1549
1607
|
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
|
1550
1608
|
#endif
|
1551
1609
|
|
@@ -1579,7 +1637,7 @@ static bool llama_eval_internal(
|
|
1579
1637
|
{
|
1580
1638
|
// norm
|
1581
1639
|
{
|
1582
|
-
cur = ggml_rms_norm(ctx0, inpFF);
|
1640
|
+
cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
|
1583
1641
|
offload_func(cur);
|
1584
1642
|
ggml_set_name(cur, "rms_norm_1");
|
1585
1643
|
|
@@ -1632,7 +1690,7 @@ static bool llama_eval_internal(
|
|
1632
1690
|
|
1633
1691
|
// norm
|
1634
1692
|
{
|
1635
|
-
cur = ggml_rms_norm(ctx0, inpL);
|
1693
|
+
cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
|
1636
1694
|
offload_func_nr(cur);
|
1637
1695
|
ggml_set_name(cur, "rms_norm_2");
|
1638
1696
|
|
@@ -1654,16 +1712,22 @@ static bool llama_eval_internal(
|
|
1654
1712
|
//cur = ggml_soft_max_inplace(ctx0, cur);
|
1655
1713
|
|
1656
1714
|
// run the computation
|
1657
|
-
ggml_build_forward_expand(
|
1715
|
+
ggml_build_forward_expand(gf, cur);
|
1716
|
+
|
1717
|
+
// fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf.n_nodes, gf.n_leafs);
|
1658
1718
|
|
1659
1719
|
#if GGML_USE_MPI
|
1660
|
-
ggml_mpi_graph_compute_pre(lctx.ctx_mpi,
|
1720
|
+
ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
|
1661
1721
|
#endif
|
1662
1722
|
|
1663
1723
|
#ifdef GGML_USE_METAL
|
1664
1724
|
if (lctx.ctx_metal && N == 1) {
|
1725
|
+
// TODO: disabled until #2413 is resolved
|
1726
|
+
//if (!ggml_metal_if_optimized(lctx.ctx_metal)) {
|
1727
|
+
// ggml_metal_graph_find_concurrency(lctx.ctx_metal, gf);
|
1728
|
+
//}
|
1665
1729
|
ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
|
1666
|
-
ggml_metal_graph_compute(lctx.ctx_metal,
|
1730
|
+
ggml_metal_graph_compute(lctx.ctx_metal, gf);
|
1667
1731
|
ggml_metal_get_tensor (lctx.ctx_metal, cur);
|
1668
1732
|
} else {
|
1669
1733
|
// IMPORTANT:
|
@@ -1682,34 +1746,34 @@ static bool llama_eval_internal(
|
|
1682
1746
|
ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
|
1683
1747
|
}
|
1684
1748
|
|
1685
|
-
ggml_graph_compute_helper(lctx.work_buffer,
|
1749
|
+
ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
|
1686
1750
|
}
|
1687
1751
|
#else
|
1688
|
-
ggml_graph_compute_helper(lctx.work_buffer,
|
1752
|
+
ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
|
1689
1753
|
#endif
|
1690
1754
|
|
1691
1755
|
#if GGML_USE_MPI
|
1692
|
-
ggml_mpi_graph_compute_post(lctx.ctx_mpi,
|
1756
|
+
ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
|
1693
1757
|
#endif
|
1694
1758
|
|
1695
1759
|
// update kv token count
|
1696
1760
|
lctx.kv_self.n = n_past + N;
|
1697
1761
|
|
1698
|
-
struct ggml_tensor * res = gf
|
1762
|
+
struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
|
1699
1763
|
|
1700
1764
|
if (cgraph_fname) {
|
1701
|
-
ggml_graph_export(
|
1765
|
+
ggml_graph_export(gf, cgraph_fname);
|
1702
1766
|
}
|
1703
1767
|
|
1704
1768
|
#ifdef GGML_PERF
|
1705
1769
|
// print timing information per ggml operation (for debugging purposes)
|
1706
1770
|
// requires GGML_PERF to be defined
|
1707
|
-
ggml_graph_print(
|
1771
|
+
ggml_graph_print(gf);
|
1708
1772
|
#endif
|
1709
1773
|
|
1710
1774
|
// plot the computation graph in dot format (for debugging purposes)
|
1711
1775
|
//if (n_past%100 == 0) {
|
1712
|
-
// ggml_graph_dump_dot(
|
1776
|
+
// ggml_graph_dump_dot(gf, NULL, "llama.dot");
|
1713
1777
|
//}
|
1714
1778
|
|
1715
1779
|
// extract logits
|
@@ -1739,10 +1803,12 @@ static bool llama_eval_internal(
|
|
1739
1803
|
}
|
1740
1804
|
|
1741
1805
|
#if 0
|
1742
|
-
printf("\n%s: used_mem
|
1806
|
+
printf("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__,
|
1743
1807
|
ggml_used_mem(ctx0)/1024.0/1024.0,
|
1744
1808
|
lctx.get_buf_max_mem(0)/1024.0/1024.0,
|
1745
|
-
lctx.get_buf_max_mem(1)/1024.0/1024.0
|
1809
|
+
lctx.get_buf_max_mem(1)/1024.0/1024.0,
|
1810
|
+
lctx.work_buffer.size()/1024.0/1024.0,
|
1811
|
+
n_past, N);
|
1746
1812
|
#endif
|
1747
1813
|
|
1748
1814
|
ggml_free(ctx0);
|
@@ -1915,6 +1981,279 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
|
|
1915
1981
|
return output;
|
1916
1982
|
}
|
1917
1983
|
|
1984
|
+
//
|
1985
|
+
// grammar - internal
|
1986
|
+
//
|
1987
|
+
|
1988
|
+
struct llama_grammar {
|
1989
|
+
const std::vector<std::vector<llama_grammar_element>> rules;
|
1990
|
+
std::vector<std::vector<const llama_grammar_element *>> stacks;
|
1991
|
+
};
|
1992
|
+
|
1993
|
+
struct llama_grammar_candidate {
|
1994
|
+
size_t index;
|
1995
|
+
const uint32_t * code_points;
|
1996
|
+
};
|
1997
|
+
|
1998
|
+
// NOTE: assumes valid utf8 (but checks for overrun)
|
1999
|
+
// adds a terminating 0 for use as pointer
|
2000
|
+
std::vector<uint32_t> decode_utf8(const char * src) {
|
2001
|
+
static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
|
2002
|
+
const char * pos = src;
|
2003
|
+
std::vector<uint32_t> code_points;
|
2004
|
+
while (*pos != 0) {
|
2005
|
+
uint8_t first_byte = static_cast<uint8_t>(*pos);
|
2006
|
+
uint8_t highbits = first_byte >> 4;
|
2007
|
+
int len = lookup[highbits];
|
2008
|
+
uint8_t mask = (1 << (8 - len)) - 1;
|
2009
|
+
uint32_t value = first_byte & mask;
|
2010
|
+
const char * end = pos + len; // may overrun!
|
2011
|
+
++pos;
|
2012
|
+
for ( ; pos < end && *pos != 0; ++pos) {
|
2013
|
+
value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
|
2014
|
+
}
|
2015
|
+
code_points.push_back(value);
|
2016
|
+
}
|
2017
|
+
code_points.push_back(0);
|
2018
|
+
return code_points;
|
2019
|
+
}
|
2020
|
+
|
2021
|
+
// returns true iff pos points to the end of one of the definitions of a rule
|
2022
|
+
static bool llama_grammar_is_end_of_sequence(const llama_grammar_element * pos) {
|
2023
|
+
switch (pos->type) {
|
2024
|
+
case LLAMA_GRETYPE_END: return true;
|
2025
|
+
case LLAMA_GRETYPE_ALT: return true;
|
2026
|
+
default: return false;
|
2027
|
+
}
|
2028
|
+
}
|
2029
|
+
|
2030
|
+
// returns true iff chr satisfies the char range at pos (regular or inverse range)
|
2031
|
+
// asserts that pos is pointing to a char range element
|
2032
|
+
static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
|
2033
|
+
const llama_grammar_element * pos,
|
2034
|
+
const uint32_t chr) {
|
2035
|
+
|
2036
|
+
bool found = false;
|
2037
|
+
bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
|
2038
|
+
LLAMA_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT);
|
2039
|
+
|
2040
|
+
do {
|
2041
|
+
if (pos[1].type == LLAMA_GRETYPE_CHAR_RNG_UPPER) {
|
2042
|
+
// inclusive range, e.g. [a-z]
|
2043
|
+
found = found || (pos->value <= chr && chr <= pos[1].value);
|
2044
|
+
pos += 2;
|
2045
|
+
} else {
|
2046
|
+
// exact char match, e.g. [a] or "a"
|
2047
|
+
found = found || pos->value == chr;
|
2048
|
+
pos += 1;
|
2049
|
+
}
|
2050
|
+
} while (pos->type == LLAMA_GRETYPE_CHAR_ALT);
|
2051
|
+
|
2052
|
+
return std::make_pair(found == is_positive_char, pos);
|
2053
|
+
}
|
2054
|
+
|
2055
|
+
// transforms a grammar pushdown stack into N possible stacks, all ending
|
2056
|
+
// at a character range (terminal element)
|
2057
|
+
static void llama_grammar_advance_stack(
|
2058
|
+
const std::vector<std::vector<llama_grammar_element>> & rules,
|
2059
|
+
const std::vector<const llama_grammar_element *> & stack,
|
2060
|
+
std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {
|
2061
|
+
|
2062
|
+
if (stack.empty()) {
|
2063
|
+
new_stacks.push_back(stack);
|
2064
|
+
return;
|
2065
|
+
}
|
2066
|
+
|
2067
|
+
const llama_grammar_element * pos = stack.back();
|
2068
|
+
|
2069
|
+
switch (pos->type) {
|
2070
|
+
case LLAMA_GRETYPE_RULE_REF: {
|
2071
|
+
const size_t rule_id = static_cast<size_t>(pos->value);
|
2072
|
+
const llama_grammar_element * subpos = rules[rule_id].data();
|
2073
|
+
do {
|
2074
|
+
// init new stack without the top (pos)
|
2075
|
+
std::vector<const llama_grammar_element *> new_stack(stack.begin(), stack.end() - 1);
|
2076
|
+
if (!llama_grammar_is_end_of_sequence(pos + 1)) {
|
2077
|
+
// if this rule ref is followed by another element, add that to stack
|
2078
|
+
new_stack.push_back(pos + 1);
|
2079
|
+
}
|
2080
|
+
if (!llama_grammar_is_end_of_sequence(subpos)) {
|
2081
|
+
// if alternate is nonempty, add to stack
|
2082
|
+
new_stack.push_back(subpos);
|
2083
|
+
}
|
2084
|
+
llama_grammar_advance_stack(rules, new_stack, new_stacks);
|
2085
|
+
while (!llama_grammar_is_end_of_sequence(subpos)) {
|
2086
|
+
// scan to end of alternate def
|
2087
|
+
subpos++;
|
2088
|
+
}
|
2089
|
+
if (subpos->type == LLAMA_GRETYPE_ALT) {
|
2090
|
+
// there's another alternate def of this rule to process
|
2091
|
+
subpos++;
|
2092
|
+
} else {
|
2093
|
+
break;
|
2094
|
+
}
|
2095
|
+
} while (true);
|
2096
|
+
break;
|
2097
|
+
}
|
2098
|
+
case LLAMA_GRETYPE_CHAR:
|
2099
|
+
case LLAMA_GRETYPE_CHAR_NOT:
|
2100
|
+
new_stacks.push_back(stack);
|
2101
|
+
break;
|
2102
|
+
default:
|
2103
|
+
// end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
|
2104
|
+
// (LLAMA_GRETYPE_CHAR_ALT, LLAMA_GRETYPE_CHAR_RNG_UPPER); stack should never be left on
|
2105
|
+
// those
|
2106
|
+
LLAMA_ASSERT(false);
|
2107
|
+
}
|
2108
|
+
}
|
2109
|
+
|
2110
|
+
// takes a set of possible pushdown stacks on a grammar, which are required to
|
2111
|
+
// be positioned at a character range (see `llama_grammar_advance_stack`), and
|
2112
|
+
// produces the N possible stacks if the given char is accepted at those
|
2113
|
+
// positions
|
2114
|
+
static std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
|
2115
|
+
const std::vector<std::vector<llama_grammar_element>> & rules,
|
2116
|
+
const std::vector<std::vector<const llama_grammar_element *>> & stacks,
|
2117
|
+
const uint32_t chr) {
|
2118
|
+
|
2119
|
+
std::vector<std::vector<const llama_grammar_element *>> new_stacks;
|
2120
|
+
|
2121
|
+
for (const auto & stack : stacks) {
|
2122
|
+
if (stack.empty()) {
|
2123
|
+
continue;
|
2124
|
+
}
|
2125
|
+
|
2126
|
+
auto match = llama_grammar_match_char(stack.back(), chr);
|
2127
|
+
if (match.first) {
|
2128
|
+
const llama_grammar_element * pos = match.second;
|
2129
|
+
|
2130
|
+
// update top of stack to next element, if any
|
2131
|
+
std::vector<const llama_grammar_element *> new_stack(stack.begin(), stack.end() - 1);
|
2132
|
+
if (!llama_grammar_is_end_of_sequence(pos)) {
|
2133
|
+
new_stack.push_back(pos);
|
2134
|
+
}
|
2135
|
+
llama_grammar_advance_stack(rules, new_stack, new_stacks);
|
2136
|
+
}
|
2137
|
+
}
|
2138
|
+
|
2139
|
+
return new_stacks;
|
2140
|
+
}
|
2141
|
+
|
2142
|
+
static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
|
2143
|
+
const std::vector<std::vector<llama_grammar_element>> & rules,
|
2144
|
+
const std::vector<std::vector<const llama_grammar_element *>> & stacks,
|
2145
|
+
const std::vector<llama_grammar_candidate> & candidates);
|
2146
|
+
|
2147
|
+
static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_stack(
|
2148
|
+
const std::vector<std::vector<llama_grammar_element>> & rules,
|
2149
|
+
const std::vector<const llama_grammar_element *> & stack,
|
2150
|
+
const std::vector<llama_grammar_candidate> & candidates) {
|
2151
|
+
|
2152
|
+
std::vector<llama_grammar_candidate> rejects;
|
2153
|
+
|
2154
|
+
if (stack.empty()) {
|
2155
|
+
// accept nothing; EOS is handled elsewhere
|
2156
|
+
rejects.insert(rejects.end(), candidates.begin(), candidates.end());
|
2157
|
+
return rejects;
|
2158
|
+
}
|
2159
|
+
|
2160
|
+
const llama_grammar_element * stack_pos = stack.back();
|
2161
|
+
|
2162
|
+
std::vector<llama_grammar_candidate> next_candidates;
|
2163
|
+
for (auto tok : candidates) {
|
2164
|
+
if (llama_grammar_match_char(stack_pos, tok.code_points[0]).first) {
|
2165
|
+
if (tok.code_points[1] != 0) {
|
2166
|
+
next_candidates.push_back({ tok.index, tok.code_points + 1 });
|
2167
|
+
}
|
2168
|
+
} else {
|
2169
|
+
rejects.push_back(tok);
|
2170
|
+
}
|
2171
|
+
}
|
2172
|
+
|
2173
|
+
auto stack_pos_after = llama_grammar_match_char(stack_pos, 0).second;
|
2174
|
+
|
2175
|
+
// update top of stack to next element, if any
|
2176
|
+
std::vector<const llama_grammar_element *> stack_after(stack.begin(), stack.end() - 1);
|
2177
|
+
if (!llama_grammar_is_end_of_sequence(stack_pos_after)) {
|
2178
|
+
stack_after.push_back(stack_pos_after);
|
2179
|
+
}
|
2180
|
+
std::vector<std::vector<const llama_grammar_element *>> next_stacks;
|
2181
|
+
llama_grammar_advance_stack(rules, stack_after, next_stacks);
|
2182
|
+
|
2183
|
+
auto next_rejects = llama_grammar_reject_candidates(rules, next_stacks, next_candidates);
|
2184
|
+
for (auto tok : next_rejects) {
|
2185
|
+
rejects.push_back({ tok.index, tok.code_points - 1 });
|
2186
|
+
}
|
2187
|
+
|
2188
|
+
return rejects;
|
2189
|
+
}
|
2190
|
+
|
2191
|
+
static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
|
2192
|
+
const std::vector<std::vector<llama_grammar_element>> & rules,
|
2193
|
+
const std::vector<std::vector<const llama_grammar_element *>> & stacks,
|
2194
|
+
const std::vector<llama_grammar_candidate> & candidates) {
|
2195
|
+
LLAMA_ASSERT(!stacks.empty()); // REVIEW
|
2196
|
+
|
2197
|
+
if (candidates.empty()) {
|
2198
|
+
return std::vector<llama_grammar_candidate>();
|
2199
|
+
}
|
2200
|
+
|
2201
|
+
auto rejects = llama_grammar_reject_candidates_for_stack(rules, stacks.front(), candidates);
|
2202
|
+
|
2203
|
+
for (size_t i = 1, size = stacks.size(); i < size; ++i) {
|
2204
|
+
rejects = llama_grammar_reject_candidates_for_stack(rules, stacks[i], rejects);
|
2205
|
+
}
|
2206
|
+
return rejects;
|
2207
|
+
}
|
2208
|
+
|
2209
|
+
//
|
2210
|
+
// grammar - external
|
2211
|
+
//
|
2212
|
+
|
2213
|
+
struct llama_grammar * llama_grammar_init(
|
2214
|
+
const llama_grammar_element ** rules,
|
2215
|
+
size_t n_rules,
|
2216
|
+
size_t start_rule_index) {
|
2217
|
+
const llama_grammar_element * pos;
|
2218
|
+
|
2219
|
+
// copy rule definitions into vectors
|
2220
|
+
std::vector<std::vector<llama_grammar_element>> vec_rules(n_rules);
|
2221
|
+
for (size_t i = 0; i < n_rules; i++) {
|
2222
|
+
for (pos = rules[i]; pos->type != LLAMA_GRETYPE_END; pos++) {
|
2223
|
+
vec_rules[i].push_back(*pos);
|
2224
|
+
}
|
2225
|
+
vec_rules[i].push_back({LLAMA_GRETYPE_END, 0});
|
2226
|
+
}
|
2227
|
+
|
2228
|
+
// loop over alternates of start rule to build initial stacks
|
2229
|
+
std::vector<std::vector<const llama_grammar_element *>> stacks;
|
2230
|
+
pos = rules[start_rule_index];
|
2231
|
+
do {
|
2232
|
+
std::vector<const llama_grammar_element *> stack;
|
2233
|
+
if (!llama_grammar_is_end_of_sequence(pos)) {
|
2234
|
+
// if alternate is nonempty, add to stack
|
2235
|
+
stack.push_back(pos);
|
2236
|
+
}
|
2237
|
+
llama_grammar_advance_stack(vec_rules, stack, stacks);
|
2238
|
+
while (!llama_grammar_is_end_of_sequence(pos)) {
|
2239
|
+
// scan to end of alternate def
|
2240
|
+
pos++;
|
2241
|
+
}
|
2242
|
+
if (pos->type == LLAMA_GRETYPE_ALT) {
|
2243
|
+
// there's another alternate def of this rule to process
|
2244
|
+
pos++;
|
2245
|
+
} else {
|
2246
|
+
break;
|
2247
|
+
}
|
2248
|
+
} while (true);
|
2249
|
+
|
2250
|
+
return new llama_grammar{ std::move(vec_rules), std::move(stacks) };
|
2251
|
+
}
|
2252
|
+
|
2253
|
+
// releases a grammar previously allocated by llama_grammar_init
void llama_grammar_free(struct llama_grammar * grammar) {
    delete grammar;
}
|
2256
|
+
|
1918
2257
|
//
|
1919
2258
|
// sampling
|
1920
2259
|
//
|
@@ -2200,6 +2539,47 @@ void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, l
|
|
2200
2539
|
}
|
2201
2540
|
}
|
2202
2541
|
|
2542
|
+
void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar) {
|
2543
|
+
assert(ctx);
|
2544
|
+
const int64_t t_start_sample_us = ggml_time_us();
|
2545
|
+
|
2546
|
+
bool allow_eos = false;
|
2547
|
+
for (const auto & stack : grammar->stacks) {
|
2548
|
+
if (stack.empty()) {
|
2549
|
+
allow_eos = true;
|
2550
|
+
break;
|
2551
|
+
}
|
2552
|
+
}
|
2553
|
+
|
2554
|
+
const llama_token eos = llama_token_eos();
|
2555
|
+
|
2556
|
+
std::vector<std::vector<uint32_t>> candidates_decoded;
|
2557
|
+
std::vector<llama_grammar_candidate> candidates_grammar;
|
2558
|
+
|
2559
|
+
for (size_t i = 0; i < candidates->size; ++i) {
|
2560
|
+
const llama_token id = candidates->data[i].id;
|
2561
|
+
const char * str = llama_token_to_str(ctx, id);
|
2562
|
+
if (id == eos) {
|
2563
|
+
if (!allow_eos) {
|
2564
|
+
candidates->data[i].logit = -INFINITY;
|
2565
|
+
}
|
2566
|
+
} else if (*str == 0) {
|
2567
|
+
candidates->data[i].logit = -INFINITY;
|
2568
|
+
} else {
|
2569
|
+
candidates_decoded.push_back(decode_utf8(str));
|
2570
|
+
candidates_grammar.push_back({ i, candidates_decoded.back().data() });
|
2571
|
+
}
|
2572
|
+
}
|
2573
|
+
|
2574
|
+
const auto rejects =
|
2575
|
+
llama_grammar_reject_candidates(grammar->rules, grammar->stacks, candidates_grammar);
|
2576
|
+
for (auto & reject : rejects) {
|
2577
|
+
candidates->data[reject.index].logit = -INFINITY;
|
2578
|
+
}
|
2579
|
+
|
2580
|
+
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
2581
|
+
}
|
2582
|
+
|
2203
2583
|
static void llama_log_softmax(float * array, size_t size) {
|
2204
2584
|
float max_l = *std::max_element(array, array + size);
|
2205
2585
|
float sum = 0.f;
|
@@ -2375,6 +2755,29 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
|
|
2375
2755
|
return result;
|
2376
2756
|
}
|
2377
2757
|
|
2758
|
+
void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token) {
|
2759
|
+
const int64_t t_start_sample_us = ggml_time_us();
|
2760
|
+
|
2761
|
+
if (token == llama_token_eos()) {
|
2762
|
+
for (const auto & stack : grammar->stacks) {
|
2763
|
+
if (stack.empty()) {
|
2764
|
+
return;
|
2765
|
+
}
|
2766
|
+
}
|
2767
|
+
LLAMA_ASSERT(false);
|
2768
|
+
}
|
2769
|
+
|
2770
|
+
const char * str = llama_token_to_str(ctx, token);
|
2771
|
+
// Note terminating 0 in decoded string
|
2772
|
+
auto code_points = decode_utf8(str);
|
2773
|
+
for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
|
2774
|
+
grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
|
2775
|
+
}
|
2776
|
+
LLAMA_ASSERT(!grammar->stacks.empty());
|
2777
|
+
|
2778
|
+
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
2779
|
+
}
|
2780
|
+
|
2378
2781
|
//
|
2379
2782
|
// quantization
|
2380
2783
|
//
|
@@ -2448,8 +2851,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
2448
2851
|
case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
|
2449
2852
|
case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
|
2450
2853
|
case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
|
2451
|
-
case LLAMA_FTYPE_MOSTLY_F16:
|
2452
|
-
case LLAMA_FTYPE_ALL_F32:
|
2854
|
+
case LLAMA_FTYPE_MOSTLY_F16: quantized_type = GGML_TYPE_F16; break;
|
2855
|
+
case LLAMA_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break;
|
2453
2856
|
|
2454
2857
|
#ifdef GGML_USE_K_QUANTS
|
2455
2858
|
// K-quants
|
@@ -2533,16 +2936,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
2533
2936
|
} else {
|
2534
2937
|
new_type = quantized_type;
|
2535
2938
|
#ifdef GGML_USE_K_QUANTS
|
2536
|
-
bool convert_incompatible_tensor = false;
|
2537
|
-
if (quantized_type == GGML_TYPE_Q2_K || quantized_type == GGML_TYPE_Q3_K || quantized_type == GGML_TYPE_Q4_K ||
|
2538
|
-
quantized_type == GGML_TYPE_Q5_K || quantized_type == GGML_TYPE_Q6_K) {
|
2539
|
-
int nx = tensor.ne.at(0);
|
2540
|
-
int ny = tensor.ne.at(1);
|
2541
|
-
if (nx % QK_K != 0 || ny % QK_K != 0) {
|
2542
|
-
fprintf(stderr, "\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
|
2543
|
-
convert_incompatible_tensor = true;
|
2544
|
-
}
|
2545
|
-
}
|
2546
2939
|
if (tensor.name == "output.weight") {
|
2547
2940
|
int nx = tensor.ne.at(0);
|
2548
2941
|
int ny = tensor.ne.at(1);
|
@@ -2568,6 +2961,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
2568
2961
|
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
|
2569
2962
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
|
2570
2963
|
}
|
2964
|
+
bool convert_incompatible_tensor = false;
|
2965
|
+
if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
|
2966
|
+
new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
|
2967
|
+
int nx = tensor.ne.at(0);
|
2968
|
+
int ny = tensor.ne.at(1);
|
2969
|
+
if (nx % QK_K != 0 || ny % QK_K != 0) {
|
2970
|
+
fprintf(stderr, "\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
|
2971
|
+
convert_incompatible_tensor = true;
|
2972
|
+
}
|
2973
|
+
}
|
2571
2974
|
if (convert_incompatible_tensor) {
|
2572
2975
|
if (tensor.name == "output.weight") {
|
2573
2976
|
new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
|
@@ -2594,7 +2997,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
2594
2997
|
f32_data = (float *) f32_conv_buf.addr;
|
2595
2998
|
}
|
2596
2999
|
|
2597
|
-
printf("quantizing .. ");
|
3000
|
+
printf("quantizing to %s .. ", ggml_type_name(new_type));
|
2598
3001
|
fflush(stdout);
|
2599
3002
|
|
2600
3003
|
work.resize(nelements * 4); // upper bound on size
|
@@ -2697,7 +3100,7 @@ struct llama_model * llama_load_model_from_file(
|
|
2697
3100
|
|
2698
3101
|
ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
|
2699
3102
|
|
2700
|
-
if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
|
3103
|
+
if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gqa, params.rms_norm_eps, params.n_gpu_layers,
|
2701
3104
|
params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
|
2702
3105
|
memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback,
|
2703
3106
|
params.progress_callback_user_data)) {
|
@@ -2775,7 +3178,7 @@ struct llama_context * llama_new_context_with_model(
|
|
2775
3178
|
ctx->embedding.resize(hparams.n_embd);
|
2776
3179
|
}
|
2777
3180
|
|
2778
|
-
ctx->buf_compute.resize(MEM_REQ_EVAL(
|
3181
|
+
ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead());
|
2779
3182
|
|
2780
3183
|
ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type));
|
2781
3184
|
ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
|
@@ -2799,7 +3202,7 @@ struct llama_context * llama_new_context_with_model(
|
|
2799
3202
|
|
2800
3203
|
const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
|
2801
3204
|
|
2802
|
-
|
3205
|
+
fprintf(stderr, "%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
|
2803
3206
|
|
2804
3207
|
#define LLAMA_METAL_CHECK_BUF(result) \
|
2805
3208
|
if (!(result)) { \
|