llama_cpp 0.3.4 → 0.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/ext/llama_cpp/extconf.rb +1 -0
- data/ext/llama_cpp/llama_cpp.cpp +293 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +304 -99
- data/ext/llama_cpp/src/ggml-metal.h +7 -0
- data/ext/llama_cpp/src/ggml-metal.m +201 -71
- data/ext/llama_cpp/src/ggml-metal.metal +68 -54
- data/ext/llama_cpp/src/ggml.c +713 -978
- data/ext/llama_cpp/src/ggml.h +82 -17
- data/ext/llama_cpp/src/k_quants.c +327 -3
- data/ext/llama_cpp/src/llama.cpp +524 -121
- data/ext/llama_cpp/src/llama.h +60 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +24 -0
- metadata +2 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -67,6 +67,7 @@ enum e_model {
     MODEL_13B,
     MODEL_30B,
     MODEL_65B,
+    MODEL_70B,
 };

 static const size_t kB = 1024;
@@ -98,18 +99,18 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
 }

 //
-// memory sizes
+// memory sizes (calculated for n_batch == 512)
 //

 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0(int n_ctx)
 {
     static std::map<e_model, size_t> k_sizes = {
-
-        {
-        {
-        {
-        {
-        {
+        { MODEL_3B,  ((size_t) n_ctx / 16ull + 92ull) * MB },
+        { MODEL_7B,  ((size_t) n_ctx / 16ull + 100ull) * MB },
+        { MODEL_13B, ((size_t) n_ctx / 12ull + 120ull) * MB },
+        { MODEL_30B, ((size_t) n_ctx / 9ull + 160ull) * MB },
+        { MODEL_65B, ((size_t) n_ctx / 6ull + 256ull) * MB }, // guess
+        { MODEL_70B, ((size_t) n_ctx / 7ull + 164ull) * MB },
     };
     return k_sizes;
 }
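Reading the new formula directly, the scratch0 buffer now grows linearly with the context size. A small standalone check, with an illustrative n_ctx of 2048 (the MB constant mirrors the 1024*1024 definition used in llama.cpp):

    #include <cstdio>
    #include <cstddef>

    int main() {
        const size_t MB    = 1024*1024; // same meaning as the MB constant in llama.cpp
        const size_t n_ctx = 2048;      // illustrative context length
        // MEM_REQ_SCRATCH0 entry for MODEL_7B: n_ctx/16 + 100 MB
        const size_t scratch0_7b = (n_ctx / 16ull + 100ull) * MB;
        printf("scratch0(7B, n_ctx=2048) = %zu MB\n", scratch0_7b / MB); // prints 228 MB
        return 0;
    }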
@@ -117,38 +118,26 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0(int n_ctx)
 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
 {
     static std::map<e_model, size_t> k_sizes = {
-        { MODEL_3B,
-        { MODEL_7B,
-        { MODEL_13B,
-        { MODEL_30B,
-        { MODEL_65B,
+        { MODEL_3B, 128ull * MB },
+        { MODEL_7B, 160ull * MB },
+        { MODEL_13B, 192ull * MB },
+        { MODEL_30B, 256ull * MB },
+        { MODEL_65B, 384ull * MB }, // guess
+        { MODEL_70B, 304ull * MB },
     };
     return k_sizes;
 }

-//
-static const std::map<e_model, size_t> &
+// used to store the compute graph tensors + non-scratch data
+static const std::map<e_model, size_t> & MEM_REQ_EVAL()
 {
     static std::map<e_model, size_t> k_sizes = {
-        { MODEL_3B,
-        { MODEL_7B,
-        { MODEL_13B,
-        { MODEL_30B,
-        { MODEL_65B,
-
-    return k_sizes;
-}
-
-// this is mostly needed for temporary mul_mat buffers to dequantize the data
-// not actually needed if BLAS is disabled
-static const std::map<e_model, size_t> & MEM_REQ_EVAL(int n_ctx)
-{
-    static std::map<e_model, size_t> k_sizes = {
-        { MODEL_3B, ((size_t) n_ctx / 256ull + 512ull) * MB },
-        { MODEL_7B, ((size_t) n_ctx / 256ull + 768ull) * MB },
-        { MODEL_13B, ((size_t) n_ctx / 256ull + 1024ull) * MB },
-        { MODEL_30B, ((size_t) n_ctx / 256ull + 1280ull) * MB },
-        { MODEL_65B, ((size_t) n_ctx / 256ull + 1536ull) * MB },
+        { MODEL_3B, 8ull * MB },
+        { MODEL_7B, 10ull * MB },
+        { MODEL_13B, 12ull * MB },
+        { MODEL_30B, 16ull * MB },
+        { MODEL_65B, 24ull * MB }, // guess
+        { MODEL_70B, 24ull * MB },
     };
     return k_sizes;
 }
@@ -163,6 +152,7 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
         { MODEL_13B, 640ull * kB },
         { MODEL_30B, 768ull * kB },
         { MODEL_65B, 1536ull * kB },
+        { MODEL_70B, 1536ull * kB }, // TODO (likely can be reduced)
     };
     return k_sizes;
 }
@@ -177,19 +167,26 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
         { MODEL_13B, 160ull },
         { MODEL_30B, 208ull },
         { MODEL_65B, 416ull },
+        { MODEL_70B, 416ull }, // TODO (likely can be reduced)
     };
     return k_sizes;
 }

 // default hparams (LLaMA 7B)
 struct llama_hparams {
-    uint32_t n_vocab
-    uint32_t n_ctx
-    uint32_t n_embd
-    uint32_t n_mult
-    uint32_t n_head
-    uint32_t
-    uint32_t
+    uint32_t n_vocab = 32000;
+    uint32_t n_ctx = 512; // this is provided as user input?
+    uint32_t n_embd = 4096;
+    uint32_t n_mult = 256;
+    uint32_t n_head = 32;
+    uint32_t n_head_kv = 32;
+    uint32_t n_layer = 32;
+    uint32_t n_rot = 64;
+
+    // LLaMAv2
+    // TODO: load from model data hparams
+    float f_ffn_mult = 1.0f;
+    float f_rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;

     float rope_freq_base = 10000.0f;
     float rope_freq_scale = 1.0f;
@@ -197,7 +194,28 @@ struct llama_hparams {
     enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;

     bool operator!=(const llama_hparams & other) const {
-        return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams)));
+        return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams))); // NOLINT
+    }
+
+    uint32_t n_gqa() const {
+        return n_head/n_head_kv;
+    }
+
+    uint32_t n_embd_head() const {
+        return n_embd/n_head;
+    }
+
+    uint32_t n_embd_gqa() const {
+        return n_embd/n_gqa();
+    }
+
+    size_t kv_size() const {
+        size_t result = 2ull;
+        result *= (size_t) n_embd_gqa();
+        result *= (size_t) n_ctx;
+        result *= (size_t) n_layer;
+        result *= sizeof(ggml_fp16_t);
+        return result;
     }
 };

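The new kv_size() helper makes the KV-cache footprint easy to check by hand. The sketch below assumes the default 7B hparams shown above (n_embd = 4096, n_head = n_head_kv = 32, n_layer = 32, n_ctx = 512) and a 2-byte ggml_fp16_t; the figures are illustrative, not measured:

    #include <cstdio>
    #include <cstddef>
    #include <cstdint>

    int main() {
        // assumed default 7B hparams from the struct above
        const size_t n_embd = 4096, n_head = 32, n_head_kv = 32, n_layer = 32, n_ctx = 512;
        const size_t n_gqa      = n_head / n_head_kv; // 1 for 7B
        const size_t n_embd_gqa = n_embd / n_gqa;     // 4096
        // kv_size(): 2 (K and V) * n_embd_gqa * n_ctx * n_layer * sizeof(ggml_fp16_t)
        const size_t kv_bytes = 2ull * n_embd_gqa * n_ctx * n_layer * sizeof(uint16_t);
        printf("kv cache = %zu MB\n", kv_bytes / (1024*1024)); // prints 256 MB
        return 0;
    }

With grouped-query attention (for example n_gqa = 8 on the 70B model), n_embd_gqa shrinks by the same factor, and so does the per-context KV cache.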
@@ -499,12 +517,16 @@ struct llama_file_loader {
     }
     void read_hparams() {
         hparams.n_vocab = file.read_u32();
-        hparams.n_embd
-        hparams.n_mult
-        hparams.n_head
+        hparams.n_embd = file.read_u32();
+        hparams.n_mult = file.read_u32();
+        hparams.n_head = file.read_u32();
         hparams.n_layer = file.read_u32();
-        hparams.n_rot
-        hparams.ftype
+        hparams.n_rot = file.read_u32();
+        hparams.ftype = (enum llama_ftype) file.read_u32();
+
+        // LLaMAv2
+        // TODO: read from header
+        hparams.n_head_kv = hparams.n_head;
     }
     void read_vocab() {
         vocab.id_to_token.resize(hparams.n_vocab);
@@ -803,7 +825,7 @@ static bool kv_cache_init(
         ggml_type wtype,
         int n_ctx,
         int n_gpu_layers) {
-    const int n_embd = hparams.
+    const int n_embd = hparams.n_embd_gqa();
     const int n_layer = hparams.n_layer;

     const int64_t n_mem = n_layer*n_ctx;
@@ -847,6 +869,8 @@ struct llama_context_params llama_context_default_params() {
         /*.seed =*/ LLAMA_DEFAULT_SEED,
         /*.n_ctx =*/ 512,
         /*.n_batch =*/ 512,
+        /*.n_gqa =*/ 1,
+        /*.rms_norm_eps =*/ LLAMA_DEFAULT_RMS_EPS,
         /*.gpu_layers =*/ 0,
         /*.main_gpu =*/ 0,
         /*.tensor_split =*/ nullptr,
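Because the GGML v3 file format does not store the grouped-query-attention factor, a caller loading a LLaMA-2 70B model has to pass it explicitly through the new context parameters. A minimal C++ sketch against the llama.h API of this version; the model path is a placeholder and error handling is reduced to the essentials:

    #include "llama.h"
    #include <cstdio>

    int main() {
        llama_backend_init(false);

        struct llama_context_params params = llama_context_default_params();
        params.n_ctx = 2048;
        params.n_gqa = 8; // required for LLaMA-2 70B; leave at 1 for 7B/13B
        // params.rms_norm_eps defaults to LLAMA_DEFAULT_RMS_EPS and can be overridden here

        struct llama_model * model = llama_load_model_from_file("llama-2-70b.ggmlv3.q4_0.bin", params);
        if (model == NULL) {
            fprintf(stderr, "failed to load model\n");
            return 1;
        }

        struct llama_context * ctx = llama_new_context_with_model(model, params);
        // ... evaluate / sample ...
        llama_free(ctx);
        llama_free_model(model);
        llama_backend_free();
        return 0;
    }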
@@ -966,6 +990,7 @@ static const char *llama_model_type_name(e_model type) {
         case MODEL_13B: return "13B";
         case MODEL_30B: return "30B";
         case MODEL_65B: return "65B";
+        case MODEL_70B: return "70B";
         default: LLAMA_ASSERT(false);
     }
 }
@@ -976,6 +1001,8 @@ static void llama_model_load_internal(
         llama_vocab & vocab,
         int n_ctx,
         int n_batch,
+        int n_gqa,
+        float rms_norm_eps,
         int n_gpu_layers,
         int main_gpu,
         const float * tensor_split,
@@ -997,8 +1024,12 @@ static void llama_model_load_internal(
     model.hparams = ml->file_loader->hparams;
     model.n_gpu_layers = n_gpu_layers;
     llama_file_version file_version = ml->file_loader->file_version;
+
     auto & hparams = model.hparams;

+    // TODO: read from file
+    hparams.f_rms_norm_eps = rms_norm_eps;
+
     {
         switch (hparams.n_layer) {
             case 26: model.type = e_model::MODEL_3B; break;
@@ -1016,11 +1047,25 @@ static void llama_model_load_internal(

         hparams.n_ctx = n_ctx;

+        // LLaMAv2
+        // TODO: temporary until GGUF
+        LLAMA_ASSERT(hparams.n_head % n_gqa == 0);
+        hparams.n_head_kv = hparams.n_head / n_gqa;
+        if (model.type == e_model::MODEL_65B && n_gqa == 8) {
+            fprintf(stderr, "%s: warning: assuming 70B model based on GQA == %d\n", __func__, n_gqa);
+            model.type = e_model::MODEL_70B;
+            hparams.f_ffn_mult = 1.3f; // from the params.json of the 70B model
+        }
+
         hparams.rope_freq_base = rope_freq_base;
         hparams.rope_freq_scale = rope_freq_scale;
     }

-
+    // ref: https://github.com/facebookresearch/llama/blob/6c7fe276574e78057f917549435a2554000a876d/llama/model.py#L194-L199
+    const uint32_t n_ff_raw = 2*(4*hparams.n_embd)/3;
+    const uint32_t n_ff_mult = hparams.f_ffn_mult*n_ff_raw;
+    const uint32_t n_ff = ((n_ff_mult + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
+    //const uint32_t n_ff = 28672;

     {
         fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version));
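The rounded feed-forward width can be checked by hand. The figures below are illustrative inputs (n_embd = 8192 and n_mult = 4096 are assumptions for a 70B conversion, not values taken from this diff) combined with the f_ffn_mult = 1.3 set above; with them the formula lands exactly on the commented-out reference value 28672:

    #include <cstdint>
    #include <cstdio>

    int main() {
        const uint32_t n_embd     = 8192;  // assumed 70B value
        const uint32_t n_mult     = 4096;  // assumed 70B value
        const float    f_ffn_mult = 1.3f;  // set for MODEL_70B in the hunk above

        const uint32_t n_ff_raw  = 2*(4*n_embd)/3;                            // 21845
        const uint32_t n_ff_mult = (uint32_t)(f_ffn_mult*n_ff_raw);           // 28398
        const uint32_t n_ff      = ((n_ff_mult + n_mult - 1)/n_mult)*n_mult;  // 28672
        printf("n_ff = %u\n", n_ff);
        return 0;
    }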
@@ -1029,12 +1074,15 @@ static void llama_model_load_internal(
         fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd);
         fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult);
         fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head);
+        fprintf(stderr, "%s: n_head_kv = %u\n", __func__, hparams.n_head_kv);
         fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer);
-        fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot);
+        fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
+        fprintf(stderr, "%s: n_gqa = %u\n", __func__, hparams.n_gqa());
+        fprintf(stderr, "%s: rnorm_eps = %.1e\n", __func__, hparams.f_rms_norm_eps);
+        fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
         fprintf(stderr, "%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base);
         fprintf(stderr, "%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale);
         fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
-        fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
         fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
     }

@@ -1069,7 +1117,7 @@ static void llama_model_load_internal(
     {
         model.buf.resize(ctx_size);
         if (use_mlock) {
-            model.mlock_buf.init(model.buf.addr);
+            model.mlock_buf.init (model.buf.addr);
             model.mlock_buf.grow_to(model.buf.size);
         }

@@ -1104,9 +1152,10 @@ static void llama_model_load_internal(
     size_t vram_weights = 0;
     size_t vram_scratch = 0;
     {
-        const uint32_t n_embd
-        const uint32_t
-        const uint32_t
+        const uint32_t n_embd = hparams.n_embd;
+        const uint32_t n_embd_gqa = hparams.n_embd_gqa();
+        const uint32_t n_layer = hparams.n_layer;
+        const uint32_t n_vocab = hparams.n_vocab;

         ml->ggml_ctx = ctx;

@@ -1154,16 +1203,16 @@ static void llama_model_load_internal(

             layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);

-            layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd},
-            layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd,
-            layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd,
-            layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd},
+            layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend_split);
+            layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd_gqa}, backend_split);
+            layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd_gqa}, backend_split);
+            layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend_split);

             layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);

-            layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff},
-            layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff,
-            layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff},
+            layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend_split);
+            layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend_split);
+            layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend_split);

             if (backend == GGML_BACKEND_GPU) {
                 vram_weights +=
@@ -1186,11 +1235,11 @@ static void llama_model_load_internal(
            mmapped_size - vram_weights + // weights in VRAM not in memory
            MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) +
            MEM_REQ_SCRATCH1().at(model.type) +
-           MEM_REQ_EVAL(
+           MEM_REQ_EVAL().at(model.type);

        // this is the memory required by one llama_state
        const size_t mem_required_state =
-           scale*
+           scale*hparams.kv_size();

        fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
                mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
@@ -1231,7 +1280,7 @@ static void llama_model_load_internal(
                fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
            } else {
                fprintf(stderr, "%s: offloading v cache to GPU\n", __func__);
-               vram_kv_cache +=
+               vram_kv_cache += hparams.kv_size() / 2;
            }
        }
        if (n_gpu_layers > (int) hparams.n_layer + 2) {
@@ -1239,7 +1288,7 @@ static void llama_model_load_internal(
                fprintf(stderr, "%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
            } else {
                fprintf(stderr, "%s: offloading k cache to GPU\n", __func__);
-               vram_kv_cache +=
+               vram_kv_cache += hparams.kv_size() / 2;
            }
        }
 #elif defined(GGML_USE_CLBLAST)
@@ -1287,6 +1336,8 @@ static bool llama_model_load(
         llama_vocab & vocab,
         int n_ctx,
         int n_batch,
+        int n_gqa,
+        float rms_norm_eps,
         int n_gpu_layers,
         int main_gpu,
         const float * tensor_split,
@@ -1300,7 +1351,7 @@ static bool llama_model_load(
         llama_progress_callback progress_callback,
         void *progress_callback_user_data) {
     try {
-        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type,
+        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, rms_norm_eps, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type,
                                   use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
         return true;
     } catch (const std::exception & err) {
@@ -1344,16 +1395,23 @@ static bool llama_eval_internal(

     LLAMA_ASSERT(!!kv_self.ctx);

-    const
-    const
-    const
-    const
-    const
-    const
-    const
+    const int64_t n_embd = hparams.n_embd;
+    const int64_t n_layer = hparams.n_layer;
+    const int64_t n_ctx = hparams.n_ctx;
+    const int64_t n_head = hparams.n_head;
+    const int64_t n_head_kv = hparams.n_head_kv;
+    const int64_t n_embd_head = hparams.n_embd_head();
+    const int64_t n_vocab = hparams.n_vocab;
+    const int64_t n_embd_gqa = hparams.n_embd_gqa();
+
+
+    LLAMA_ASSERT(n_embd_head == hparams.n_rot);

     const float freq_base = hparams.rope_freq_base;
     const float freq_scale = hparams.rope_freq_scale;
+    const float rms_norm_eps = hparams.f_rms_norm_eps;
+
+    const int n_gpu_layers = model.n_gpu_layers;

     auto & mem_per_token = lctx.mem_per_token;
     auto & buf_compute = lctx.buf_compute;
@@ -1366,7 +1424,7 @@ static bool llama_eval_internal(

     struct ggml_context * ctx0 = ggml_init(params);

-    ggml_cgraph gf =
+    ggml_cgraph * gf = ggml_new_graph(ctx0);

     // for big prompts, if BLAS is enabled, it is better to use only one thread
     // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
|
|
1431
1489
|
|
1432
1490
|
// norm
|
1433
1491
|
{
|
1434
|
-
cur = ggml_rms_norm(ctx0, inpL);
|
1492
|
+
cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
|
1435
1493
|
offload_func(cur);
|
1436
1494
|
ggml_set_name(cur, "rms_norm_0");
|
1437
1495
|
|
@@ -1452,11 +1510,11 @@ static bool llama_eval_internal(
|
|
1452
1510
|
offload_func_kq(tmpq);
|
1453
1511
|
ggml_set_name(tmpq, "tmpq");
|
1454
1512
|
|
1455
|
-
struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk,
|
1513
|
+
struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
|
1456
1514
|
offload_func_kq(Kcur);
|
1457
1515
|
ggml_set_name(Kcur, "Kcur");
|
1458
1516
|
|
1459
|
-
struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq,
|
1517
|
+
struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
|
1460
1518
|
offload_func_kq(Qcur);
|
1461
1519
|
ggml_set_name(Qcur, "Qcur");
|
1462
1520
|
|
@@ -1468,23 +1526,23 @@ static bool llama_eval_internal(
|
|
1468
1526
|
offload_func_v(tmpv);
|
1469
1527
|
ggml_set_name(tmpv, "tmpv");
|
1470
1528
|
|
1471
|
-
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv,
|
1529
|
+
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, N));
|
1472
1530
|
offload_func_v(Vcur);
|
1473
1531
|
ggml_set_name(Vcur, "Vcur");
|
1474
1532
|
|
1475
|
-
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*
|
1533
|
+
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
|
1476
1534
|
offload_func_kq(k);
|
1477
1535
|
ggml_set_name(k, "k");
|
1478
1536
|
|
1479
|
-
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N,
|
1537
|
+
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
|
1480
1538
|
( n_ctx)*ggml_element_size(kv_self.v),
|
1481
|
-
(il*n_ctx)*ggml_element_size(kv_self.v)*
|
1539
|
+
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
|
1482
1540
|
offload_func_v(v);
|
1483
1541
|
ggml_set_name(v, "v");
|
1484
1542
|
|
1485
1543
|
// important: storing RoPE-ed version of K in the KV cache!
|
1486
|
-
ggml_build_forward_expand(
|
1487
|
-
ggml_build_forward_expand(
|
1544
|
+
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
|
1545
|
+
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
|
1488
1546
|
}
|
1489
1547
|
|
1490
1548
|
struct ggml_tensor * Q =
|
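With grouped-query attention only n_embd_gqa = n_embd / n_gqa values per token are written into each layer's K and V cache, and the view offsets above index a cache laid out per layer as n_embd_gqa rows by n_ctx positions. A small sketch that mirrors the same ggml_view_1d offset arithmetic with hypothetical sizes, just to make the indexing concrete:

    #include <cstdio>
    #include <cstddef>

    // byte offset of the first K slot this batch writes, mirroring the
    // ggml_view_1d() offset used above (element size 2 bytes for f16)
    static size_t k_cache_offset(size_t n_embd_gqa, size_t n_ctx, size_t il, size_t n_past, size_t elt_size) {
        return (elt_size*n_embd_gqa)*(il*n_ctx + n_past);
    }

    int main() {
        // hypothetical figures: n_embd = 8192, n_gqa = 8 -> n_embd_gqa = 1024
        const size_t n_embd_gqa = 1024, n_ctx = 2048, elt = 2;
        printf("layer 3, n_past 17 -> offset %zu bytes\n",
               k_cache_offset(n_embd_gqa, n_ctx, 3, 17, elt));
        return 0;
    }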
@@ -1497,8 +1555,8 @@ static bool llama_eval_internal(
         struct ggml_tensor * K =
             ggml_permute(ctx0,
                     ggml_reshape_3d(ctx0,
-                        ggml_view_1d(ctx0, kv_self.k, (n_past + N)*
-
+                        ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd_gqa, il*n_ctx*ggml_element_size(kv_self.k)*n_embd_gqa),
+                        n_embd_head, n_head_kv, n_past + N),
                     0, 2, 1, 3);
         offload_func_kq(K);
         ggml_set_name(K, "K");
@@ -1508,9 +1566,9 @@ static bool llama_eval_internal(
         offload_func_kq(KQ);
         ggml_set_name(KQ, "KQ");

-        // KQ_scaled = KQ / sqrt(
+        // KQ_scaled = KQ / sqrt(n_embd_head)
         struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head));
-        ggml_set_name(KQ_scale, "1/sqrt(
+        ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");

         // KQ_scaled shape [n_past + N, N, n_head, 1]
         struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
@@ -1530,10 +1588,10 @@ static bool llama_eval_internal(
         // split cached V into n_head heads
         struct ggml_tensor * V =
             ggml_view_3d(ctx0, kv_self.v,
-                    n_past + N,
+                    n_past + N, n_embd_head, n_head_kv,
                     n_ctx*ggml_element_size(kv_self.v),
-                    n_ctx*ggml_element_size(kv_self.v)*
-
+                    n_ctx*ggml_element_size(kv_self.v)*n_embd_head,
+                    n_ctx*ggml_element_size(kv_self.v)*n_embd_gqa*il);
         offload_func_v(V);
         ggml_set_name(V, "V");

@@ -1545,7 +1603,7 @@ static bool llama_eval_internal(
         // make V contiguous in memory to speed up the matmul, however we waste time on the copy
         // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
         // is there a better way?
-        struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N,
+        struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head));
         struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
 #endif

@@ -1579,7 +1637,7 @@ static bool llama_eval_internal(
         {
             // norm
             {
-                cur = ggml_rms_norm(ctx0, inpFF);
+                cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
                 offload_func(cur);
                 ggml_set_name(cur, "rms_norm_1");

@@ -1632,7 +1690,7 @@ static bool llama_eval_internal(

     // norm
     {
-        cur = ggml_rms_norm(ctx0, inpL);
+        cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
         offload_func_nr(cur);
         ggml_set_name(cur, "rms_norm_2");

@@ -1654,16 +1712,22 @@ static bool llama_eval_internal(
     //cur = ggml_soft_max_inplace(ctx0, cur);

     // run the computation
-    ggml_build_forward_expand(
+    ggml_build_forward_expand(gf, cur);
+
+    // fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf.n_nodes, gf.n_leafs);

 #if GGML_USE_MPI
-    ggml_mpi_graph_compute_pre(lctx.ctx_mpi,
+    ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
 #endif

 #ifdef GGML_USE_METAL
     if (lctx.ctx_metal && N == 1) {
+        // TODO: disabled until #2413 is resolved
+        //if (!ggml_metal_if_optimized(lctx.ctx_metal)) {
+        //    ggml_metal_graph_find_concurrency(lctx.ctx_metal, gf);
+        //}
         ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
-        ggml_metal_graph_compute(lctx.ctx_metal,
+        ggml_metal_graph_compute(lctx.ctx_metal, gf);
         ggml_metal_get_tensor (lctx.ctx_metal, cur);
     } else {
         // IMPORTANT:
@@ -1682,34 +1746,34 @@ static bool llama_eval_internal(
             ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
         }

-        ggml_graph_compute_helper(lctx.work_buffer,
+        ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
     }
 #else
-    ggml_graph_compute_helper(lctx.work_buffer,
+    ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
 #endif

 #if GGML_USE_MPI
-    ggml_mpi_graph_compute_post(lctx.ctx_mpi,
+    ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
 #endif

     // update kv token count
     lctx.kv_self.n = n_past + N;

-    struct ggml_tensor * res = gf
+    struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];

     if (cgraph_fname) {
-        ggml_graph_export(
+        ggml_graph_export(gf, cgraph_fname);
     }

 #ifdef GGML_PERF
     // print timing information per ggml operation (for debugging purposes)
     // requires GGML_PERF to be defined
-    ggml_graph_print(
+    ggml_graph_print(gf);
 #endif

     // plot the computation graph in dot format (for debugging purposes)
     //if (n_past%100 == 0) {
-    //    ggml_graph_dump_dot(
+    //    ggml_graph_dump_dot(gf, NULL, "llama.dot");
     //}

     // extract logits
@@ -1739,10 +1803,12 @@ static bool llama_eval_internal(
     }

 #if 0
-    printf("\n%s: used_mem
+    printf("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__,
             ggml_used_mem(ctx0)/1024.0/1024.0,
             lctx.get_buf_max_mem(0)/1024.0/1024.0,
-            lctx.get_buf_max_mem(1)/1024.0/1024.0
+            lctx.get_buf_max_mem(1)/1024.0/1024.0,
+            lctx.work_buffer.size()/1024.0/1024.0,
+            n_past, N);
 #endif

     ggml_free(ctx0);
@@ -1915,6 +1981,279 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
     return output;
 }

+//
+// grammar - internal
+//
+
+struct llama_grammar {
+    const std::vector<std::vector<llama_grammar_element>> rules;
+    std::vector<std::vector<const llama_grammar_element *>> stacks;
+};
+
+struct llama_grammar_candidate {
+    size_t index;
+    const uint32_t * code_points;
+};
+
+// NOTE: assumes valid utf8 (but checks for overrun)
+// adds a terminating 0 for use as pointer
+std::vector<uint32_t> decode_utf8(const char * src) {
+    static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
+    const char * pos = src;
+    std::vector<uint32_t> code_points;
+    while (*pos != 0) {
+        uint8_t first_byte = static_cast<uint8_t>(*pos);
+        uint8_t highbits = first_byte >> 4;
+        int len = lookup[highbits];
+        uint8_t mask = (1 << (8 - len)) - 1;
+        uint32_t value = first_byte & mask;
+        const char * end = pos + len; // may overrun!
+        ++pos;
+        for ( ; pos < end && *pos != 0; ++pos) {
+            value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
+        }
+        code_points.push_back(value);
+    }
+    code_points.push_back(0);
+    return code_points;
+}
+
+// returns true iff pos points to the end of one of the definitions of a rule
+static bool llama_grammar_is_end_of_sequence(const llama_grammar_element * pos) {
+    switch (pos->type) {
+        case LLAMA_GRETYPE_END: return true;
+        case LLAMA_GRETYPE_ALT: return true;
+        default: return false;
+    }
+}
+
+// returns true iff chr satisfies the char range at pos (regular or inverse range)
+// asserts that pos is pointing to a char range element
+static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
+        const llama_grammar_element * pos,
+        const uint32_t chr) {
+
+    bool found = false;
+    bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
+    LLAMA_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT);
+
+    do {
+        if (pos[1].type == LLAMA_GRETYPE_CHAR_RNG_UPPER) {
+            // inclusive range, e.g. [a-z]
+            found = found || (pos->value <= chr && chr <= pos[1].value);
+            pos += 2;
+        } else {
+            // exact char match, e.g. [a] or "a"
+            found = found || pos->value == chr;
+            pos += 1;
+        }
+    } while (pos->type == LLAMA_GRETYPE_CHAR_ALT);
+
+    return std::make_pair(found == is_positive_char, pos);
+}
+
+// transforms a grammar pushdown stack into N possible stacks, all ending
+// at a character range (terminal element)
+static void llama_grammar_advance_stack(
+        const std::vector<std::vector<llama_grammar_element>> & rules,
+        const std::vector<const llama_grammar_element *> & stack,
+        std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {
+
+    if (stack.empty()) {
+        new_stacks.push_back(stack);
+        return;
+    }
+
+    const llama_grammar_element * pos = stack.back();
+
+    switch (pos->type) {
+        case LLAMA_GRETYPE_RULE_REF: {
+            const size_t rule_id = static_cast<size_t>(pos->value);
+            const llama_grammar_element * subpos = rules[rule_id].data();
+            do {
+                // init new stack without the top (pos)
+                std::vector<const llama_grammar_element *> new_stack(stack.begin(), stack.end() - 1);
+                if (!llama_grammar_is_end_of_sequence(pos + 1)) {
+                    // if this rule ref is followed by another element, add that to stack
+                    new_stack.push_back(pos + 1);
+                }
+                if (!llama_grammar_is_end_of_sequence(subpos)) {
+                    // if alternate is nonempty, add to stack
+                    new_stack.push_back(subpos);
+                }
+                llama_grammar_advance_stack(rules, new_stack, new_stacks);
+                while (!llama_grammar_is_end_of_sequence(subpos)) {
+                    // scan to end of alternate def
+                    subpos++;
+                }
+                if (subpos->type == LLAMA_GRETYPE_ALT) {
+                    // there's another alternate def of this rule to process
+                    subpos++;
+                } else {
+                    break;
+                }
+            } while (true);
+            break;
+        }
+        case LLAMA_GRETYPE_CHAR:
+        case LLAMA_GRETYPE_CHAR_NOT:
+            new_stacks.push_back(stack);
+            break;
+        default:
+            // end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
+            // (LLAMA_GRETYPE_CHAR_ALT, LLAMA_GRETYPE_CHAR_RNG_UPPER); stack should never be left on
+            // those
+            LLAMA_ASSERT(false);
+    }
+}
+
+// takes a set of possible pushdown stacks on a grammar, which are required to
+// be positioned at a character range (see `llama_grammar_advance_stack`), and
+// produces the N possible stacks if the given char is accepted at those
+// positions
+static std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
+        const std::vector<std::vector<llama_grammar_element>> & rules,
+        const std::vector<std::vector<const llama_grammar_element *>> & stacks,
+        const uint32_t chr) {
+
+    std::vector<std::vector<const llama_grammar_element *>> new_stacks;
+
+    for (const auto & stack : stacks) {
+        if (stack.empty()) {
+            continue;
+        }
+
+        auto match = llama_grammar_match_char(stack.back(), chr);
+        if (match.first) {
+            const llama_grammar_element * pos = match.second;
+
+            // update top of stack to next element, if any
+            std::vector<const llama_grammar_element *> new_stack(stack.begin(), stack.end() - 1);
+            if (!llama_grammar_is_end_of_sequence(pos)) {
+                new_stack.push_back(pos);
+            }
+            llama_grammar_advance_stack(rules, new_stack, new_stacks);
+        }
+    }
+
+    return new_stacks;
+}
+
+static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
+        const std::vector<std::vector<llama_grammar_element>> & rules,
+        const std::vector<std::vector<const llama_grammar_element *>> & stacks,
+        const std::vector<llama_grammar_candidate> & candidates);
+
+static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_stack(
+        const std::vector<std::vector<llama_grammar_element>> & rules,
+        const std::vector<const llama_grammar_element *> & stack,
+        const std::vector<llama_grammar_candidate> & candidates) {
+
+    std::vector<llama_grammar_candidate> rejects;
+
+    if (stack.empty()) {
+        // accept nothing; EOS is handled elsewhere
+        rejects.insert(rejects.end(), candidates.begin(), candidates.end());
+        return rejects;
+    }
+
+    const llama_grammar_element * stack_pos = stack.back();
+
+    std::vector<llama_grammar_candidate> next_candidates;
+    for (auto tok : candidates) {
+        if (llama_grammar_match_char(stack_pos, tok.code_points[0]).first) {
+            if (tok.code_points[1] != 0) {
+                next_candidates.push_back({ tok.index, tok.code_points + 1 });
+            }
+        } else {
+            rejects.push_back(tok);
+        }
+    }
+
+    auto stack_pos_after = llama_grammar_match_char(stack_pos, 0).second;
+
+    // update top of stack to next element, if any
+    std::vector<const llama_grammar_element *> stack_after(stack.begin(), stack.end() - 1);
+    if (!llama_grammar_is_end_of_sequence(stack_pos_after)) {
+        stack_after.push_back(stack_pos_after);
+    }
+    std::vector<std::vector<const llama_grammar_element *>> next_stacks;
+    llama_grammar_advance_stack(rules, stack_after, next_stacks);
+
+    auto next_rejects = llama_grammar_reject_candidates(rules, next_stacks, next_candidates);
+    for (auto tok : next_rejects) {
+        rejects.push_back({ tok.index, tok.code_points - 1 });
+    }
+
+    return rejects;
+}
+
+static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
+        const std::vector<std::vector<llama_grammar_element>> & rules,
+        const std::vector<std::vector<const llama_grammar_element *>> & stacks,
+        const std::vector<llama_grammar_candidate> & candidates) {
+    LLAMA_ASSERT(!stacks.empty()); // REVIEW
+
+    if (candidates.empty()) {
+        return std::vector<llama_grammar_candidate>();
+    }
+
+    auto rejects = llama_grammar_reject_candidates_for_stack(rules, stacks.front(), candidates);
+
+    for (size_t i = 1, size = stacks.size(); i < size; ++i) {
+        rejects = llama_grammar_reject_candidates_for_stack(rules, stacks[i], rejects);
+    }
+    return rejects;
+}
+
+//
+// grammar - external
+//
+
+struct llama_grammar * llama_grammar_init(
+        const llama_grammar_element ** rules,
+        size_t n_rules,
+        size_t start_rule_index) {
+    const llama_grammar_element * pos;
+
+    // copy rule definitions into vectors
+    std::vector<std::vector<llama_grammar_element>> vec_rules(n_rules);
+    for (size_t i = 0; i < n_rules; i++) {
+        for (pos = rules[i]; pos->type != LLAMA_GRETYPE_END; pos++) {
+            vec_rules[i].push_back(*pos);
+        }
+        vec_rules[i].push_back({LLAMA_GRETYPE_END, 0});
+    }
+
+    // loop over alternates of start rule to build initial stacks
+    std::vector<std::vector<const llama_grammar_element *>> stacks;
+    pos = rules[start_rule_index];
+    do {
+        std::vector<const llama_grammar_element *> stack;
+        if (!llama_grammar_is_end_of_sequence(pos)) {
+            // if alternate is nonempty, add to stack
+            stack.push_back(pos);
+        }
+        llama_grammar_advance_stack(vec_rules, stack, stacks);
+        while (!llama_grammar_is_end_of_sequence(pos)) {
+            // scan to end of alternate def
+            pos++;
+        }
+        if (pos->type == LLAMA_GRETYPE_ALT) {
+            // there's another alternate def of this rule to process
+            pos++;
+        } else {
+            break;
+        }
+    } while (true);
+
+    return new llama_grammar{ std::move(vec_rules), std::move(stacks) };
+}
+
+void llama_grammar_free(struct llama_grammar * grammar) {
+    delete grammar;
+}
+
 //
 // sampling
 //
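The new grammar machinery is driven entirely through arrays of llama_grammar_element (the struct and the LLAMA_GRETYPE_* constants come from the llama.h changes in this release). A hand-built sketch of a one-rule grammar, root ::= "a" | "b", purely to show how the rule encoding feeds llama_grammar_init; real callers would normally generate these arrays from a GBNF parser such as the one in llama.cpp's examples:

    #include "llama.h"

    int main() {
        // rule 0 (the start rule): the character 'a', or alternatively 'b'
        const llama_grammar_element rule0[] = {
            { LLAMA_GRETYPE_CHAR, 'a' }, // terminal "a"
            { LLAMA_GRETYPE_ALT,  0   }, // "|" separating the two alternates
            { LLAMA_GRETYPE_CHAR, 'b' }, // terminal "b"
            { LLAMA_GRETYPE_END,  0   }, // end of rule
        };
        const llama_grammar_element * rules[] = { rule0 };

        // keep rule0 alive while the grammar is in use: llama_grammar_init builds
        // its initial stacks from pointers into the start rule array
        struct llama_grammar * grammar = llama_grammar_init(rules, /*n_rules=*/1, /*start_rule_index=*/0);
        // ... use with llama_sample_grammar() / llama_grammar_accept_token() ...
        llama_grammar_free(grammar);
        return 0;
    }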
@@ -2200,6 +2539,47 @@ void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, l
     }
 }

+void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar) {
+    assert(ctx);
+    const int64_t t_start_sample_us = ggml_time_us();
+
+    bool allow_eos = false;
+    for (const auto & stack : grammar->stacks) {
+        if (stack.empty()) {
+            allow_eos = true;
+            break;
+        }
+    }
+
+    const llama_token eos = llama_token_eos();
+
+    std::vector<std::vector<uint32_t>> candidates_decoded;
+    std::vector<llama_grammar_candidate> candidates_grammar;
+
+    for (size_t i = 0; i < candidates->size; ++i) {
+        const llama_token id = candidates->data[i].id;
+        const char * str = llama_token_to_str(ctx, id);
+        if (id == eos) {
+            if (!allow_eos) {
+                candidates->data[i].logit = -INFINITY;
+            }
+        } else if (*str == 0) {
+            candidates->data[i].logit = -INFINITY;
+        } else {
+            candidates_decoded.push_back(decode_utf8(str));
+            candidates_grammar.push_back({ i, candidates_decoded.back().data() });
+        }
+    }
+
+    const auto rejects =
+        llama_grammar_reject_candidates(grammar->rules, grammar->stacks, candidates_grammar);
+    for (auto & reject : rejects) {
+        candidates->data[reject.index].logit = -INFINITY;
+    }
+
+    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+}
+
 static void llama_log_softmax(float * array, size_t size) {
     float max_l = *std::max_element(array, array + size);
     float sum = 0.f;
@@ -2375,6 +2755,29 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
     return result;
 }

+void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token) {
+    const int64_t t_start_sample_us = ggml_time_us();
+
+    if (token == llama_token_eos()) {
+        for (const auto & stack : grammar->stacks) {
+            if (stack.empty()) {
+                return;
+            }
+        }
+        LLAMA_ASSERT(false);
+    }
+
+    const char * str = llama_token_to_str(ctx, token);
+    // Note terminating 0 in decoded string
+    auto code_points = decode_utf8(str);
+    for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
+        grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
+    }
+    LLAMA_ASSERT(!grammar->stacks.empty());
+
+    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+}
+
 //
 // quantization
 //
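Together the two new functions form a constrain-then-commit loop around the usual sampler: llama_sample_grammar masks out candidates the grammar cannot accept, and llama_grammar_accept_token advances the grammar stacks once a token has actually been chosen. A condensed, hypothetical sampling step (error handling and the other samplers omitted; ctx and grammar are assumed to be set up already):

    #include "llama.h"
    #include <vector>

    // one grammar-constrained sampling step
    llama_token sample_with_grammar(llama_context * ctx, llama_grammar * grammar) {
        const int     n_vocab = llama_n_vocab(ctx);
        const float * logits  = llama_get_logits(ctx);

        std::vector<llama_token_data> candidates;
        candidates.reserve(n_vocab);
        for (llama_token id = 0; id < n_vocab; ++id) {
            candidates.push_back({ id, logits[id], 0.0f });
        }
        llama_token_data_array cur = { candidates.data(), candidates.size(), false };

        llama_sample_grammar(ctx, &cur, grammar);        // mask tokens the grammar rejects
        llama_token tok = llama_sample_token(ctx, &cur); // then sample as usual

        llama_grammar_accept_token(ctx, grammar, tok);   // advance the grammar state
        return tok;
    }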
@@ -2448,8 +2851,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
         case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
         case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
-        case LLAMA_FTYPE_MOSTLY_F16:
-        case LLAMA_FTYPE_ALL_F32:
+        case LLAMA_FTYPE_MOSTLY_F16: quantized_type = GGML_TYPE_F16; break;
+        case LLAMA_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break;

 #ifdef GGML_USE_K_QUANTS
         // K-quants
@@ -2533,16 +2936,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         } else {
             new_type = quantized_type;
 #ifdef GGML_USE_K_QUANTS
-            bool convert_incompatible_tensor = false;
-            if (quantized_type == GGML_TYPE_Q2_K || quantized_type == GGML_TYPE_Q3_K || quantized_type == GGML_TYPE_Q4_K ||
-                quantized_type == GGML_TYPE_Q5_K || quantized_type == GGML_TYPE_Q6_K) {
-                int nx = tensor.ne.at(0);
-                int ny = tensor.ne.at(1);
-                if (nx % QK_K != 0 || ny % QK_K != 0) {
-                    fprintf(stderr, "\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
-                    convert_incompatible_tensor = true;
-                }
-            }
             if (tensor.name == "output.weight") {
                 int nx = tensor.ne.at(0);
                 int ny = tensor.ne.at(1);
@@ -2568,6 +2961,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
             }
+            bool convert_incompatible_tensor = false;
+            if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
+                new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
+                int nx = tensor.ne.at(0);
+                int ny = tensor.ne.at(1);
+                if (nx % QK_K != 0 || ny % QK_K != 0) {
+                    fprintf(stderr, "\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
+                    convert_incompatible_tensor = true;
+                }
+            }
             if (convert_incompatible_tensor) {
                 if (tensor.name == "output.weight") {
                     new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
@@ -2594,7 +2997,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             f32_data = (float *) f32_conv_buf.addr;
         }

-        printf("quantizing .. ");
+        printf("quantizing to %s .. ", ggml_type_name(new_type));
         fflush(stdout);

         work.resize(nelements * 4); // upper bound on size
@@ -2697,7 +3100,7 @@ struct llama_model * llama_load_model_from_file(

     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;

-    if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
+    if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gqa, params.rms_norm_eps, params.n_gpu_layers,
                 params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
                 memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback,
                 params.progress_callback_user_data)) {
@@ -2775,7 +3178,7 @@ struct llama_context * llama_new_context_with_model(
         ctx->embedding.resize(hparams.n_embd);
     }

-    ctx->buf_compute.resize(MEM_REQ_EVAL(
+    ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead());

     ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type));
     ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
@@ -2799,7 +3202,7 @@ struct llama_context * llama_new_context_with_model(

         const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);

-
+        fprintf(stderr, "%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);

 #define LLAMA_METAL_CHECK_BUF(result) \
     if (!(result)) { \