llama_cpp 0.3.4 → 0.3.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/README.md +18 -2
- data/ext/llama_cpp/extconf.rb +2 -1
- data/ext/llama_cpp/llama_cpp.cpp +315 -8
- data/ext/llama_cpp/src/ggml-alloc.c +541 -0
- data/ext/llama_cpp/src/ggml-alloc.h +22 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +2271 -414
- data/ext/llama_cpp/src/ggml-cuda.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.h +7 -0
- data/ext/llama_cpp/src/ggml-metal.m +218 -87
- data/ext/llama_cpp/src/ggml-metal.metal +72 -55
- data/ext/llama_cpp/src/ggml.c +754 -996
- data/ext/llama_cpp/src/ggml.h +94 -18
- data/ext/llama_cpp/src/k_quants.c +350 -24
- data/ext/llama_cpp/src/llama.cpp +713 -179
- data/ext/llama_cpp/src/llama.h +61 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +26 -0
- metadata +4 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -56,8 +56,14 @@
|
|
56
56
|
#pragma warning(disable: 4244 4267) // possible loss of data
|
57
57
|
#endif
|
58
58
|
|
59
|
+
#if !defined(GGML_USE_CUBLAS) && !defined(GGML_USE_METAL)
|
60
|
+
#include "ggml-alloc.h"
|
61
|
+
#define LLAMA_USE_ALLOCATOR
|
62
|
+
#else
|
59
63
|
#define LLAMA_USE_SCRATCH
|
60
64
|
#define LLAMA_MAX_SCRATCH_BUFFERS 16
|
65
|
+
#endif
|
66
|
+
|
61
67
|
|
62
68
|
// available llama models
|
63
69
|
enum e_model {
|
@@ -67,6 +73,7 @@ enum e_model {
|
|
67
73
|
MODEL_13B,
|
68
74
|
MODEL_30B,
|
69
75
|
MODEL_65B,
|
76
|
+
MODEL_70B,
|
70
77
|
};
|
71
78
|
|
72
79
|
static const size_t kB = 1024;
|
@@ -98,18 +105,18 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
|
|
98
105
|
}
|
99
106
|
|
100
107
|
//
|
101
|
-
// memory sizes
|
108
|
+
// memory sizes (calculated for n_batch == 512)
|
102
109
|
//
|
103
110
|
|
104
111
|
static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0(int n_ctx)
|
105
112
|
{
|
106
113
|
static std::map<e_model, size_t> k_sizes = {
|
107
|
-
|
108
|
-
{
|
109
|
-
{
|
110
|
-
{
|
111
|
-
{
|
112
|
-
{
|
114
|
+
{ MODEL_3B, ((size_t) n_ctx / 16ull + 92ull) * MB },
|
115
|
+
{ MODEL_7B, ((size_t) n_ctx / 16ull + 100ull) * MB },
|
116
|
+
{ MODEL_13B, ((size_t) n_ctx / 12ull + 120ull) * MB },
|
117
|
+
{ MODEL_30B, ((size_t) n_ctx / 9ull + 160ull) * MB },
|
118
|
+
{ MODEL_65B, ((size_t) n_ctx / 6ull + 256ull) * MB }, // guess
|
119
|
+
{ MODEL_70B, ((size_t) n_ctx / 7ull + 164ull) * MB },
|
113
120
|
};
|
114
121
|
return k_sizes;
|
115
122
|
}
|
@@ -117,38 +124,26 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0(int n_ctx)
|
|
117
124
|
static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
|
118
125
|
{
|
119
126
|
static std::map<e_model, size_t> k_sizes = {
|
120
|
-
{ MODEL_3B,
|
121
|
-
{ MODEL_7B,
|
122
|
-
{ MODEL_13B,
|
123
|
-
{ MODEL_30B,
|
124
|
-
{ MODEL_65B,
|
125
|
-
|
126
|
-
return k_sizes;
|
127
|
-
}
|
128
|
-
|
129
|
-
// 2*n_embd*n_ctx*n_layer*sizeof(float16)
|
130
|
-
static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
|
131
|
-
{
|
132
|
-
static std::map<e_model, size_t> k_sizes = {
|
133
|
-
{ MODEL_3B, 682ull * MB },
|
134
|
-
{ MODEL_7B, 1026ull * MB },
|
135
|
-
{ MODEL_13B, 1608ull * MB },
|
136
|
-
{ MODEL_30B, 3124ull * MB },
|
137
|
-
{ MODEL_65B, 5120ull * MB },
|
127
|
+
{ MODEL_3B, 128ull * MB },
|
128
|
+
{ MODEL_7B, 160ull * MB },
|
129
|
+
{ MODEL_13B, 192ull * MB },
|
130
|
+
{ MODEL_30B, 256ull * MB },
|
131
|
+
{ MODEL_65B, 384ull * MB }, // guess
|
132
|
+
{ MODEL_70B, 304ull * MB },
|
138
133
|
};
|
139
134
|
return k_sizes;
|
140
135
|
}
|
141
136
|
|
142
|
-
//
|
143
|
-
|
144
|
-
static const std::map<e_model, size_t> & MEM_REQ_EVAL(int n_ctx)
|
137
|
+
// used to store the compute graph tensors + non-scratch data
|
138
|
+
static const std::map<e_model, size_t> & MEM_REQ_EVAL()
|
145
139
|
{
|
146
140
|
static std::map<e_model, size_t> k_sizes = {
|
147
|
-
{ MODEL_3B,
|
148
|
-
{ MODEL_7B,
|
149
|
-
{ MODEL_13B,
|
150
|
-
{ MODEL_30B,
|
151
|
-
{ MODEL_65B,
|
141
|
+
{ MODEL_3B, 8ull * MB },
|
142
|
+
{ MODEL_7B, 10ull * MB },
|
143
|
+
{ MODEL_13B, 12ull * MB },
|
144
|
+
{ MODEL_30B, 16ull * MB },
|
145
|
+
{ MODEL_65B, 24ull * MB }, // guess
|
146
|
+
{ MODEL_70B, 24ull * MB },
|
152
147
|
};
|
153
148
|
return k_sizes;
|
154
149
|
}
|
@@ -163,6 +158,7 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
|
|
163
158
|
{ MODEL_13B, 640ull * kB },
|
164
159
|
{ MODEL_30B, 768ull * kB },
|
165
160
|
{ MODEL_65B, 1536ull * kB },
|
161
|
+
{ MODEL_70B, 1536ull * kB }, // TODO (likely can be reduced)
|
166
162
|
};
|
167
163
|
return k_sizes;
|
168
164
|
}
|
@@ -177,19 +173,26 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
|
|
177
173
|
{ MODEL_13B, 160ull },
|
178
174
|
{ MODEL_30B, 208ull },
|
179
175
|
{ MODEL_65B, 416ull },
|
176
|
+
{ MODEL_70B, 416ull }, // TODO (likely can be reduced)
|
180
177
|
};
|
181
178
|
return k_sizes;
|
182
179
|
}
|
183
180
|
|
184
181
|
// default hparams (LLaMA 7B)
|
185
182
|
struct llama_hparams {
|
186
|
-
uint32_t n_vocab
|
187
|
-
uint32_t n_ctx
|
188
|
-
uint32_t n_embd
|
189
|
-
uint32_t n_mult
|
190
|
-
uint32_t n_head
|
191
|
-
uint32_t
|
192
|
-
uint32_t
|
183
|
+
uint32_t n_vocab = 32000;
|
184
|
+
uint32_t n_ctx = 512; // this is provided as user input?
|
185
|
+
uint32_t n_embd = 4096;
|
186
|
+
uint32_t n_mult = 256;
|
187
|
+
uint32_t n_head = 32;
|
188
|
+
uint32_t n_head_kv = 32;
|
189
|
+
uint32_t n_layer = 32;
|
190
|
+
uint32_t n_rot = 64;
|
191
|
+
|
192
|
+
// LLaMAv2
|
193
|
+
// TODO: load from model data hparams
|
194
|
+
float f_ffn_mult = 1.0f;
|
195
|
+
float f_rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
|
193
196
|
|
194
197
|
float rope_freq_base = 10000.0f;
|
195
198
|
float rope_freq_scale = 1.0f;
|
@@ -197,7 +200,28 @@ struct llama_hparams {
|
|
197
200
|
enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
|
198
201
|
|
199
202
|
bool operator!=(const llama_hparams & other) const {
|
200
|
-
return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams)));
|
203
|
+
return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams))); // NOLINT
|
204
|
+
}
|
205
|
+
|
206
|
+
uint32_t n_gqa() const {
|
207
|
+
return n_head/n_head_kv;
|
208
|
+
}
|
209
|
+
|
210
|
+
uint32_t n_embd_head() const {
|
211
|
+
return n_embd/n_head;
|
212
|
+
}
|
213
|
+
|
214
|
+
uint32_t n_embd_gqa() const {
|
215
|
+
return n_embd/n_gqa();
|
216
|
+
}
|
217
|
+
|
218
|
+
size_t kv_size() const {
|
219
|
+
size_t result = 2ull;
|
220
|
+
result *= (size_t) n_embd_gqa();
|
221
|
+
result *= (size_t) n_ctx;
|
222
|
+
result *= (size_t) n_layer;
|
223
|
+
result *= sizeof(ggml_fp16_t);
|
224
|
+
return result;
|
201
225
|
}
|
202
226
|
};
|
203
227
|
|
@@ -309,13 +333,22 @@ struct llama_model {
|
|
309
333
|
|
310
334
|
struct llama_context {
|
311
335
|
llama_context(const llama_model & model) : model(model), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
|
312
|
-
#ifdef GGML_USE_METAL
|
313
336
|
~llama_context() {
|
337
|
+
if (model_owner) {
|
338
|
+
delete &model;
|
339
|
+
}
|
340
|
+
#ifdef GGML_USE_METAL
|
314
341
|
if (ctx_metal) {
|
315
342
|
ggml_metal_free(ctx_metal);
|
316
343
|
}
|
317
|
-
}
|
318
344
|
#endif
|
345
|
+
#ifdef LLAMA_USE_ALLOCATOR
|
346
|
+
if (alloc) {
|
347
|
+
ggml_allocr_free(alloc);
|
348
|
+
}
|
349
|
+
#endif
|
350
|
+
}
|
351
|
+
|
319
352
|
std::mt19937 rng;
|
320
353
|
|
321
354
|
bool has_evaluated_once = false;
|
@@ -353,7 +386,17 @@ struct llama_context {
|
|
353
386
|
// memory buffers used to evaluate the model
|
354
387
|
// TODO: move in llama_state
|
355
388
|
llama_ctx_buffer buf_compute;
|
389
|
+
|
390
|
+
#ifdef LLAMA_USE_ALLOCATOR
|
391
|
+
llama_ctx_buffer buf_alloc;
|
392
|
+
ggml_allocr * alloc = NULL;
|
393
|
+
#endif
|
394
|
+
|
395
|
+
#ifdef LLAMA_USE_SCRATCH
|
356
396
|
llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
|
397
|
+
int buf_last = 0;
|
398
|
+
size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
|
399
|
+
#endif
|
357
400
|
|
358
401
|
#ifdef GGML_USE_METAL
|
359
402
|
ggml_metal_context * ctx_metal = NULL;
|
@@ -363,9 +406,6 @@ struct llama_context {
|
|
363
406
|
ggml_mpi_context * ctx_mpi = NULL;
|
364
407
|
#endif
|
365
408
|
|
366
|
-
int buf_last = 0;
|
367
|
-
size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
|
368
|
-
|
369
409
|
void use_buf(struct ggml_context * ctx, int i) {
|
370
410
|
#if defined(LLAMA_USE_SCRATCH)
|
371
411
|
size_t last_size = 0;
|
@@ -499,12 +539,16 @@ struct llama_file_loader {
|
|
499
539
|
}
|
500
540
|
void read_hparams() {
|
501
541
|
hparams.n_vocab = file.read_u32();
|
502
|
-
hparams.n_embd
|
503
|
-
hparams.n_mult
|
504
|
-
hparams.n_head
|
542
|
+
hparams.n_embd = file.read_u32();
|
543
|
+
hparams.n_mult = file.read_u32();
|
544
|
+
hparams.n_head = file.read_u32();
|
505
545
|
hparams.n_layer = file.read_u32();
|
506
|
-
hparams.n_rot
|
507
|
-
hparams.ftype
|
546
|
+
hparams.n_rot = file.read_u32();
|
547
|
+
hparams.ftype = (enum llama_ftype) file.read_u32();
|
548
|
+
|
549
|
+
// LLaMAv2
|
550
|
+
// TODO: read from header
|
551
|
+
hparams.n_head_kv = hparams.n_head;
|
508
552
|
}
|
509
553
|
void read_vocab() {
|
510
554
|
vocab.id_to_token.resize(hparams.n_vocab);
|
@@ -803,7 +847,7 @@ static bool kv_cache_init(
|
|
803
847
|
ggml_type wtype,
|
804
848
|
int n_ctx,
|
805
849
|
int n_gpu_layers) {
|
806
|
-
const int n_embd = hparams.
|
850
|
+
const int n_embd = hparams.n_embd_gqa();
|
807
851
|
const int n_layer = hparams.n_layer;
|
808
852
|
|
809
853
|
const int64_t n_mem = n_layer*n_ctx;
|
@@ -847,6 +891,8 @@ struct llama_context_params llama_context_default_params() {
|
|
847
891
|
/*.seed =*/ LLAMA_DEFAULT_SEED,
|
848
892
|
/*.n_ctx =*/ 512,
|
849
893
|
/*.n_batch =*/ 512,
|
894
|
+
/*.n_gqa =*/ 1,
|
895
|
+
/*.rms_norm_eps =*/ LLAMA_DEFAULT_RMS_EPS,
|
850
896
|
/*.gpu_layers =*/ 0,
|
851
897
|
/*.main_gpu =*/ 0,
|
852
898
|
/*.tensor_split =*/ nullptr,
|
@@ -855,6 +901,7 @@ struct llama_context_params llama_context_default_params() {
|
|
855
901
|
/*.progress_callback =*/ nullptr,
|
856
902
|
/*.progress_callback_user_data =*/ nullptr,
|
857
903
|
/*.low_vram =*/ false,
|
904
|
+
/*.mul_mat_q =*/ false,
|
858
905
|
/*.f16_kv =*/ true,
|
859
906
|
/*.logits_all =*/ false,
|
860
907
|
/*.vocab_only =*/ false,
|
@@ -966,6 +1013,7 @@ static const char *llama_model_type_name(e_model type) {
|
|
966
1013
|
case MODEL_13B: return "13B";
|
967
1014
|
case MODEL_30B: return "30B";
|
968
1015
|
case MODEL_65B: return "65B";
|
1016
|
+
case MODEL_70B: return "70B";
|
969
1017
|
default: LLAMA_ASSERT(false);
|
970
1018
|
}
|
971
1019
|
}
|
@@ -976,9 +1024,12 @@ static void llama_model_load_internal(
|
|
976
1024
|
llama_vocab & vocab,
|
977
1025
|
int n_ctx,
|
978
1026
|
int n_batch,
|
1027
|
+
int n_gqa,
|
1028
|
+
float rms_norm_eps,
|
979
1029
|
int n_gpu_layers,
|
980
1030
|
int main_gpu,
|
981
1031
|
const float * tensor_split,
|
1032
|
+
const bool mul_mat_q,
|
982
1033
|
float rope_freq_base,
|
983
1034
|
float rope_freq_scale,
|
984
1035
|
bool low_vram,
|
@@ -997,8 +1048,12 @@ static void llama_model_load_internal(
|
|
997
1048
|
model.hparams = ml->file_loader->hparams;
|
998
1049
|
model.n_gpu_layers = n_gpu_layers;
|
999
1050
|
llama_file_version file_version = ml->file_loader->file_version;
|
1051
|
+
|
1000
1052
|
auto & hparams = model.hparams;
|
1001
1053
|
|
1054
|
+
// TODO: read from file
|
1055
|
+
hparams.f_rms_norm_eps = rms_norm_eps;
|
1056
|
+
|
1002
1057
|
{
|
1003
1058
|
switch (hparams.n_layer) {
|
1004
1059
|
case 26: model.type = e_model::MODEL_3B; break;
|
@@ -1016,11 +1071,25 @@ static void llama_model_load_internal(
|
|
1016
1071
|
|
1017
1072
|
hparams.n_ctx = n_ctx;
|
1018
1073
|
|
1074
|
+
// LLaMAv2
|
1075
|
+
// TODO: temporary until GGUF
|
1076
|
+
LLAMA_ASSERT(hparams.n_head % n_gqa == 0);
|
1077
|
+
hparams.n_head_kv = hparams.n_head / n_gqa;
|
1078
|
+
if (model.type == e_model::MODEL_65B && n_gqa == 8) {
|
1079
|
+
fprintf(stderr, "%s: warning: assuming 70B model based on GQA == %d\n", __func__, n_gqa);
|
1080
|
+
model.type = e_model::MODEL_70B;
|
1081
|
+
hparams.f_ffn_mult = 1.3f; // from the params.json of the 70B model
|
1082
|
+
}
|
1083
|
+
|
1019
1084
|
hparams.rope_freq_base = rope_freq_base;
|
1020
1085
|
hparams.rope_freq_scale = rope_freq_scale;
|
1021
1086
|
}
|
1022
1087
|
|
1023
|
-
|
1088
|
+
// ref: https://github.com/facebookresearch/llama/blob/6c7fe276574e78057f917549435a2554000a876d/llama/model.py#L194-L199
|
1089
|
+
const uint32_t n_ff_raw = 2*(4*hparams.n_embd)/3;
|
1090
|
+
const uint32_t n_ff_mult = hparams.f_ffn_mult*n_ff_raw;
|
1091
|
+
const uint32_t n_ff = ((n_ff_mult + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
|
1092
|
+
//const uint32_t n_ff = 28672;
|
1024
1093
|
|
1025
1094
|
{
|
1026
1095
|
fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version));
|
@@ -1029,12 +1098,15 @@ static void llama_model_load_internal(
|
|
1029
1098
|
fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd);
|
1030
1099
|
fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult);
|
1031
1100
|
fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head);
|
1101
|
+
fprintf(stderr, "%s: n_head_kv = %u\n", __func__, hparams.n_head_kv);
|
1032
1102
|
fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer);
|
1033
|
-
fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot);
|
1103
|
+
fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
|
1104
|
+
fprintf(stderr, "%s: n_gqa = %u\n", __func__, hparams.n_gqa());
|
1105
|
+
fprintf(stderr, "%s: rnorm_eps = %.1e\n", __func__, hparams.f_rms_norm_eps);
|
1106
|
+
fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
|
1034
1107
|
fprintf(stderr, "%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base);
|
1035
1108
|
fprintf(stderr, "%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale);
|
1036
1109
|
fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
|
1037
|
-
fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
|
1038
1110
|
fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
|
1039
1111
|
}
|
1040
1112
|
|
@@ -1069,7 +1141,7 @@ static void llama_model_load_internal(
|
|
1069
1141
|
{
|
1070
1142
|
model.buf.resize(ctx_size);
|
1071
1143
|
if (use_mlock) {
|
1072
|
-
model.mlock_buf.init(model.buf.addr);
|
1144
|
+
model.mlock_buf.init (model.buf.addr);
|
1073
1145
|
model.mlock_buf.grow_to(model.buf.size);
|
1074
1146
|
}
|
1075
1147
|
|
@@ -1086,9 +1158,11 @@ static void llama_model_load_internal(
|
|
1086
1158
|
}
|
1087
1159
|
|
1088
1160
|
(void) main_gpu;
|
1161
|
+
(void) mul_mat_q;
|
1089
1162
|
#if defined(GGML_USE_CUBLAS)
|
1090
1163
|
fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__);
|
1091
1164
|
ggml_cuda_set_main_device(main_gpu);
|
1165
|
+
ggml_cuda_set_mul_mat_q(mul_mat_q);
|
1092
1166
|
#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
|
1093
1167
|
#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
|
1094
1168
|
#elif defined(GGML_USE_CLBLAST)
|
@@ -1104,9 +1178,10 @@ static void llama_model_load_internal(
|
|
1104
1178
|
size_t vram_weights = 0;
|
1105
1179
|
size_t vram_scratch = 0;
|
1106
1180
|
{
|
1107
|
-
const uint32_t n_embd
|
1108
|
-
const uint32_t
|
1109
|
-
const uint32_t
|
1181
|
+
const uint32_t n_embd = hparams.n_embd;
|
1182
|
+
const uint32_t n_embd_gqa = hparams.n_embd_gqa();
|
1183
|
+
const uint32_t n_layer = hparams.n_layer;
|
1184
|
+
const uint32_t n_vocab = hparams.n_vocab;
|
1110
1185
|
|
1111
1186
|
ml->ggml_ctx = ctx;
|
1112
1187
|
|
@@ -1154,16 +1229,16 @@ static void llama_model_load_internal(
|
|
1154
1229
|
|
1155
1230
|
layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);
|
1156
1231
|
|
1157
|
-
layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd},
|
1158
|
-
layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd,
|
1159
|
-
layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd,
|
1160
|
-
layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd},
|
1232
|
+
layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend_split);
|
1233
|
+
layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd_gqa}, backend_split);
|
1234
|
+
layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd_gqa}, backend_split);
|
1235
|
+
layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend_split);
|
1161
1236
|
|
1162
1237
|
layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);
|
1163
1238
|
|
1164
|
-
layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff},
|
1165
|
-
layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff,
|
1166
|
-
layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff},
|
1239
|
+
layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend_split);
|
1240
|
+
layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend_split);
|
1241
|
+
layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend_split);
|
1167
1242
|
|
1168
1243
|
if (backend == GGML_BACKEND_GPU) {
|
1169
1244
|
vram_weights +=
|
@@ -1181,16 +1256,20 @@ static void llama_model_load_internal(
|
|
1181
1256
|
const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
|
1182
1257
|
|
1183
1258
|
// this is the total memory required to run the inference
|
1184
|
-
|
1259
|
+
size_t mem_required =
|
1185
1260
|
ctx_size +
|
1186
|
-
mmapped_size - vram_weights
|
1261
|
+
mmapped_size - vram_weights; // weights in VRAM not in memory
|
1262
|
+
|
1263
|
+
#ifndef LLAMA_USE_ALLOCATOR
|
1264
|
+
mem_required +=
|
1187
1265
|
MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) +
|
1188
1266
|
MEM_REQ_SCRATCH1().at(model.type) +
|
1189
|
-
MEM_REQ_EVAL(
|
1267
|
+
MEM_REQ_EVAL().at(model.type);
|
1268
|
+
#endif
|
1190
1269
|
|
1191
1270
|
// this is the memory required by one llama_state
|
1192
1271
|
const size_t mem_required_state =
|
1193
|
-
scale*
|
1272
|
+
scale*hparams.kv_size();
|
1194
1273
|
|
1195
1274
|
fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
|
1196
1275
|
mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
|
@@ -1231,7 +1310,7 @@ static void llama_model_load_internal(
|
|
1231
1310
|
fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
|
1232
1311
|
} else {
|
1233
1312
|
fprintf(stderr, "%s: offloading v cache to GPU\n", __func__);
|
1234
|
-
vram_kv_cache +=
|
1313
|
+
vram_kv_cache += hparams.kv_size() / 2;
|
1235
1314
|
}
|
1236
1315
|
}
|
1237
1316
|
if (n_gpu_layers > (int) hparams.n_layer + 2) {
|
@@ -1239,7 +1318,7 @@ static void llama_model_load_internal(
|
|
1239
1318
|
fprintf(stderr, "%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
|
1240
1319
|
} else {
|
1241
1320
|
fprintf(stderr, "%s: offloading k cache to GPU\n", __func__);
|
1242
|
-
vram_kv_cache +=
|
1321
|
+
vram_kv_cache += hparams.kv_size() / 2;
|
1243
1322
|
}
|
1244
1323
|
}
|
1245
1324
|
#elif defined(GGML_USE_CLBLAST)
|
@@ -1287,9 +1366,12 @@ static bool llama_model_load(
|
|
1287
1366
|
llama_vocab & vocab,
|
1288
1367
|
int n_ctx,
|
1289
1368
|
int n_batch,
|
1369
|
+
int n_gqa,
|
1370
|
+
float rms_norm_eps,
|
1290
1371
|
int n_gpu_layers,
|
1291
1372
|
int main_gpu,
|
1292
1373
|
const float * tensor_split,
|
1374
|
+
const bool mul_mat_q,
|
1293
1375
|
float rope_freq_base,
|
1294
1376
|
float rope_freq_scale,
|
1295
1377
|
bool low_vram,
|
@@ -1300,7 +1382,8 @@ static bool llama_model_load(
|
|
1300
1382
|
llama_progress_callback progress_callback,
|
1301
1383
|
void *progress_callback_user_data) {
|
1302
1384
|
try {
|
1303
|
-
llama_model_load_internal(fname, model, vocab, n_ctx, n_batch,
|
1385
|
+
llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, rms_norm_eps, n_gpu_layers,
|
1386
|
+
main_gpu, tensor_split, mul_mat_q, rope_freq_base, rope_freq_scale, low_vram, memory_type,
|
1304
1387
|
use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
|
1305
1388
|
return true;
|
1306
1389
|
} catch (const std::exception & err) {
|
@@ -1309,32 +1392,15 @@ static bool llama_model_load(
|
|
1309
1392
|
}
|
1310
1393
|
}
|
1311
1394
|
|
1312
|
-
|
1313
|
-
//
|
1314
|
-
// - lctx: llama context
|
1315
|
-
// - tokens: new batch of tokens to process
|
1316
|
-
// - embd embeddings input
|
1317
|
-
// - n_tokens number of tokens
|
1318
|
-
// - n_past: the context size so far
|
1319
|
-
// - n_threads: number of threads to use
|
1320
|
-
//
|
1321
|
-
static bool llama_eval_internal(
|
1395
|
+
static struct ggml_cgraph * llama_build_graph(
|
1322
1396
|
llama_context & lctx,
|
1323
1397
|
const llama_token * tokens,
|
1324
1398
|
const float * embd,
|
1325
1399
|
int n_tokens,
|
1326
|
-
int n_past
|
1327
|
-
int n_threads,
|
1328
|
-
const char * cgraph_fname) {
|
1400
|
+
int n_past) {
|
1329
1401
|
|
1330
1402
|
LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
|
1331
1403
|
|
1332
|
-
#ifdef GGML_USE_MPI
|
1333
|
-
ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
|
1334
|
-
#endif
|
1335
|
-
|
1336
|
-
const int64_t t_start_us = ggml_time_us();
|
1337
|
-
|
1338
1404
|
const int N = n_tokens;
|
1339
1405
|
|
1340
1406
|
const auto & model = lctx.model;
|
@@ -1344,40 +1410,54 @@ static bool llama_eval_internal(
|
|
1344
1410
|
|
1345
1411
|
LLAMA_ASSERT(!!kv_self.ctx);
|
1346
1412
|
|
1347
|
-
const
|
1348
|
-
const
|
1349
|
-
const
|
1350
|
-
const
|
1351
|
-
const
|
1352
|
-
const
|
1353
|
-
const
|
1413
|
+
const int64_t n_embd = hparams.n_embd;
|
1414
|
+
const int64_t n_layer = hparams.n_layer;
|
1415
|
+
const int64_t n_ctx = hparams.n_ctx;
|
1416
|
+
const int64_t n_head = hparams.n_head;
|
1417
|
+
const int64_t n_head_kv = hparams.n_head_kv;
|
1418
|
+
const int64_t n_embd_head = hparams.n_embd_head();
|
1419
|
+
const int64_t n_embd_gqa = hparams.n_embd_gqa();
|
1420
|
+
|
1421
|
+
LLAMA_ASSERT(n_embd_head == hparams.n_rot);
|
1354
1422
|
|
1355
1423
|
const float freq_base = hparams.rope_freq_base;
|
1356
1424
|
const float freq_scale = hparams.rope_freq_scale;
|
1425
|
+
const float rms_norm_eps = hparams.f_rms_norm_eps;
|
1426
|
+
|
1427
|
+
const int n_gpu_layers = model.n_gpu_layers;
|
1357
1428
|
|
1358
1429
|
auto & mem_per_token = lctx.mem_per_token;
|
1359
1430
|
auto & buf_compute = lctx.buf_compute;
|
1360
1431
|
|
1432
|
+
|
1361
1433
|
struct ggml_init_params params = {
|
1362
1434
|
/*.mem_size =*/ buf_compute.size,
|
1363
1435
|
/*.mem_buffer =*/ buf_compute.addr,
|
1364
1436
|
/*.no_alloc =*/ false,
|
1365
1437
|
};
|
1366
1438
|
|
1367
|
-
|
1439
|
+
#ifdef LLAMA_USE_ALLOCATOR
|
1440
|
+
params.no_alloc = true;
|
1441
|
+
#endif
|
1368
1442
|
|
1369
|
-
|
1443
|
+
struct ggml_context * ctx0 = ggml_init(params);
|
1370
1444
|
|
1371
|
-
|
1372
|
-
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
|
1373
|
-
n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
|
1445
|
+
ggml_cgraph * gf = ggml_new_graph(ctx0);
|
1374
1446
|
|
1375
1447
|
struct ggml_tensor * cur;
|
1376
1448
|
struct ggml_tensor * inpL;
|
1377
1449
|
|
1378
1450
|
if (tokens) {
|
1379
1451
|
struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
|
1452
|
+
|
1453
|
+
#ifdef LLAMA_USE_ALLOCATOR
|
1454
|
+
ggml_allocr_alloc(lctx.alloc, inp_tokens);
|
1455
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
1456
|
+
memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
|
1457
|
+
}
|
1458
|
+
#else
|
1380
1459
|
memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
|
1460
|
+
#endif
|
1381
1461
|
ggml_set_name(inp_tokens, "inp_tokens");
|
1382
1462
|
|
1383
1463
|
inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
|
@@ -1387,7 +1467,15 @@ static bool llama_eval_internal(
|
|
1387
1467
|
#endif
|
1388
1468
|
|
1389
1469
|
inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
|
1470
|
+
|
1471
|
+
#ifdef LLAMA_USE_ALLOCATOR
|
1472
|
+
ggml_allocr_alloc(lctx.alloc, inpL);
|
1473
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
1474
|
+
memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
|
1475
|
+
}
|
1476
|
+
#else
|
1390
1477
|
memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
|
1478
|
+
#endif
|
1391
1479
|
}
|
1392
1480
|
|
1393
1481
|
const int i_gpu_start = n_layer - n_gpu_layers;
|
@@ -1414,6 +1502,17 @@ static bool llama_eval_internal(
|
|
1414
1502
|
}
|
1415
1503
|
#endif // GGML_USE_CUBLAS
|
1416
1504
|
|
1505
|
+
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
1506
|
+
#ifdef LLAMA_USE_ALLOCATOR
|
1507
|
+
ggml_allocr_alloc(lctx.alloc, KQ_scale);
|
1508
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
1509
|
+
ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
|
1510
|
+
}
|
1511
|
+
#else
|
1512
|
+
ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
|
1513
|
+
#endif
|
1514
|
+
ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
|
1515
|
+
|
1417
1516
|
for (int il = 0; il < n_layer; ++il) {
|
1418
1517
|
ggml_format_name(inpL, "layer_inp_%d", il);
|
1419
1518
|
|
@@ -1431,7 +1530,7 @@ static bool llama_eval_internal(
|
|
1431
1530
|
|
1432
1531
|
// norm
|
1433
1532
|
{
|
1434
|
-
cur = ggml_rms_norm(ctx0, inpL);
|
1533
|
+
cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
|
1435
1534
|
offload_func(cur);
|
1436
1535
|
ggml_set_name(cur, "rms_norm_0");
|
1437
1536
|
|
@@ -1452,11 +1551,11 @@ static bool llama_eval_internal(
|
|
1452
1551
|
offload_func_kq(tmpq);
|
1453
1552
|
ggml_set_name(tmpq, "tmpq");
|
1454
1553
|
|
1455
|
-
struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk,
|
1554
|
+
struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
|
1456
1555
|
offload_func_kq(Kcur);
|
1457
1556
|
ggml_set_name(Kcur, "Kcur");
|
1458
1557
|
|
1459
|
-
struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq,
|
1558
|
+
struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
|
1460
1559
|
offload_func_kq(Qcur);
|
1461
1560
|
ggml_set_name(Qcur, "Qcur");
|
1462
1561
|
|
@@ -1468,23 +1567,23 @@ static bool llama_eval_internal(
|
|
1468
1567
|
offload_func_v(tmpv);
|
1469
1568
|
ggml_set_name(tmpv, "tmpv");
|
1470
1569
|
|
1471
|
-
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv,
|
1570
|
+
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, N));
|
1472
1571
|
offload_func_v(Vcur);
|
1473
1572
|
ggml_set_name(Vcur, "Vcur");
|
1474
1573
|
|
1475
|
-
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*
|
1574
|
+
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
|
1476
1575
|
offload_func_kq(k);
|
1477
1576
|
ggml_set_name(k, "k");
|
1478
1577
|
|
1479
|
-
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N,
|
1578
|
+
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
|
1480
1579
|
( n_ctx)*ggml_element_size(kv_self.v),
|
1481
|
-
(il*n_ctx)*ggml_element_size(kv_self.v)*
|
1580
|
+
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
|
1482
1581
|
offload_func_v(v);
|
1483
1582
|
ggml_set_name(v, "v");
|
1484
1583
|
|
1485
1584
|
// important: storing RoPE-ed version of K in the KV cache!
|
1486
|
-
ggml_build_forward_expand(
|
1487
|
-
ggml_build_forward_expand(
|
1585
|
+
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
|
1586
|
+
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
|
1488
1587
|
}
|
1489
1588
|
|
1490
1589
|
struct ggml_tensor * Q =
|
@@ -1497,8 +1596,8 @@ static bool llama_eval_internal(
|
|
1497
1596
|
struct ggml_tensor * K =
|
1498
1597
|
ggml_permute(ctx0,
|
1499
1598
|
ggml_reshape_3d(ctx0,
|
1500
|
-
ggml_view_1d(ctx0, kv_self.k, (n_past + N)*
|
1501
|
-
|
1599
|
+
ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd_gqa, il*n_ctx*ggml_element_size(kv_self.k)*n_embd_gqa),
|
1600
|
+
n_embd_head, n_head_kv, n_past + N),
|
1502
1601
|
0, 2, 1, 3);
|
1503
1602
|
offload_func_kq(K);
|
1504
1603
|
ggml_set_name(K, "K");
|
@@ -1508,10 +1607,7 @@ static bool llama_eval_internal(
|
|
1508
1607
|
offload_func_kq(KQ);
|
1509
1608
|
ggml_set_name(KQ, "KQ");
|
1510
1609
|
|
1511
|
-
// KQ_scaled = KQ / sqrt(
|
1512
|
-
struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head));
|
1513
|
-
ggml_set_name(KQ_scale, "1/sqrt(n_embd/n_head)");
|
1514
|
-
|
1610
|
+
// KQ_scaled = KQ / sqrt(n_embd_head)
|
1515
1611
|
// KQ_scaled shape [n_past + N, N, n_head, 1]
|
1516
1612
|
struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
|
1517
1613
|
offload_func_kq(KQ_scaled);
|
@@ -1530,10 +1626,10 @@ static bool llama_eval_internal(
|
|
1530
1626
|
// split cached V into n_head heads
|
1531
1627
|
struct ggml_tensor * V =
|
1532
1628
|
ggml_view_3d(ctx0, kv_self.v,
|
1533
|
-
n_past + N,
|
1629
|
+
n_past + N, n_embd_head, n_head_kv,
|
1534
1630
|
n_ctx*ggml_element_size(kv_self.v),
|
1535
|
-
n_ctx*ggml_element_size(kv_self.v)*
|
1536
|
-
|
1631
|
+
n_ctx*ggml_element_size(kv_self.v)*n_embd_head,
|
1632
|
+
n_ctx*ggml_element_size(kv_self.v)*n_embd_gqa*il);
|
1537
1633
|
offload_func_v(V);
|
1538
1634
|
ggml_set_name(V, "V");
|
1539
1635
|
|
@@ -1545,7 +1641,7 @@ static bool llama_eval_internal(
|
|
1545
1641
|
// make V contiguous in memory to speed up the matmul, however we waste time on the copy
|
1546
1642
|
// on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
|
1547
1643
|
// is there a better way?
|
1548
|
-
struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N,
|
1644
|
+
struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head));
|
1549
1645
|
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
|
1550
1646
|
#endif
|
1551
1647
|
|
@@ -1579,7 +1675,7 @@ static bool llama_eval_internal(
|
|
1579
1675
|
{
|
1580
1676
|
// norm
|
1581
1677
|
{
|
1582
|
-
cur = ggml_rms_norm(ctx0, inpFF);
|
1678
|
+
cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
|
1583
1679
|
offload_func(cur);
|
1584
1680
|
ggml_set_name(cur, "rms_norm_1");
|
1585
1681
|
|
@@ -1627,12 +1723,9 @@ static bool llama_eval_internal(
|
|
1627
1723
|
|
1628
1724
|
lctx.use_buf(ctx0, 0);
|
1629
1725
|
|
1630
|
-
// used at the end to optionally extract the embeddings
|
1631
|
-
struct ggml_tensor * embeddings = NULL;
|
1632
|
-
|
1633
1726
|
// norm
|
1634
1727
|
{
|
1635
|
-
cur = ggml_rms_norm(ctx0, inpL);
|
1728
|
+
cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
|
1636
1729
|
offload_func_nr(cur);
|
1637
1730
|
ggml_set_name(cur, "rms_norm_2");
|
1638
1731
|
|
@@ -1640,8 +1733,6 @@ static bool llama_eval_internal(
|
|
1640
1733
|
cur = ggml_mul(ctx0, cur, model.norm);
|
1641
1734
|
// offload_func_nr(cur); // TODO CPU + GPU mirrored backend
|
1642
1735
|
ggml_set_name(cur, "result_norm");
|
1643
|
-
|
1644
|
-
embeddings = cur;
|
1645
1736
|
}
|
1646
1737
|
|
1647
1738
|
// lm_head
|
@@ -1653,18 +1744,103 @@ static bool llama_eval_internal(
|
|
1653
1744
|
// logits -> probs
|
1654
1745
|
//cur = ggml_soft_max_inplace(ctx0, cur);
|
1655
1746
|
|
1656
|
-
|
1657
|
-
|
1747
|
+
ggml_build_forward_expand(gf, cur);
|
1748
|
+
|
1749
|
+
if (mem_per_token == 0) {
|
1750
|
+
mem_per_token = ggml_used_mem(ctx0)/N;
|
1751
|
+
}
|
1752
|
+
|
1753
|
+
#if 0
|
1754
|
+
printf("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__,
|
1755
|
+
ggml_used_mem(ctx0)/1024.0/1024.0,
|
1756
|
+
lctx.get_buf_max_mem(0)/1024.0/1024.0,
|
1757
|
+
lctx.get_buf_max_mem(1)/1024.0/1024.0,
|
1758
|
+
lctx.work_buffer.size()/1024.0/1024.0,
|
1759
|
+
n_past, N);
|
1760
|
+
#endif
|
1761
|
+
|
1762
|
+
ggml_free(ctx0);
|
1763
|
+
|
1764
|
+
return gf;
|
1765
|
+
}
|
1766
|
+
|
1767
|
+
// evaluate the transformer
|
1768
|
+
//
|
1769
|
+
// - lctx: llama context
|
1770
|
+
// - tokens: new batch of tokens to process
|
1771
|
+
// - embd: embeddings input
|
1772
|
+
// - n_tokens: number of tokens
|
1773
|
+
// - n_past: the context size so far
|
1774
|
+
// - n_threads: number of threads to use
|
1775
|
+
//
|
1776
|
+
static bool llama_eval_internal(
|
1777
|
+
llama_context & lctx,
|
1778
|
+
const llama_token * tokens,
|
1779
|
+
const float * embd,
|
1780
|
+
int n_tokens,
|
1781
|
+
int n_past,
|
1782
|
+
int n_threads,
|
1783
|
+
const char * cgraph_fname) {
|
1784
|
+
|
1785
|
+
LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
|
1786
|
+
|
1787
|
+
const int64_t t_start_us = ggml_time_us();
|
1788
|
+
|
1789
|
+
#ifdef GGML_USE_MPI
|
1790
|
+
ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
|
1791
|
+
#endif
|
1792
|
+
|
1793
|
+
const int N = n_tokens;
|
1794
|
+
|
1795
|
+
const auto & model = lctx.model;
|
1796
|
+
const auto & hparams = model.hparams;
|
1797
|
+
|
1798
|
+
const auto & kv_self = lctx.kv_self;
|
1799
|
+
|
1800
|
+
LLAMA_ASSERT(!!kv_self.ctx);
|
1801
|
+
|
1802
|
+
const int64_t n_embd = hparams.n_embd;
|
1803
|
+
const int64_t n_vocab = hparams.n_vocab;
|
1804
|
+
|
1805
|
+
#ifdef LLAMA_USE_ALLOCATOR
|
1806
|
+
ggml_allocr_reset(lctx.alloc);
|
1807
|
+
#endif
|
1808
|
+
|
1809
|
+
ggml_cgraph * gf = llama_build_graph(lctx, tokens, embd, n_tokens, n_past);
|
1810
|
+
|
1811
|
+
#ifdef LLAMA_USE_ALLOCATOR
|
1812
|
+
ggml_allocr_alloc_graph(lctx.alloc, gf);
|
1813
|
+
#endif
|
1814
|
+
|
1815
|
+
// fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
|
1816
|
+
|
1817
|
+
// for big prompts, if BLAS is enabled, it is better to use only one thread
|
1818
|
+
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
|
1819
|
+
n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
|
1820
|
+
|
1821
|
+
struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
|
1822
|
+
struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
|
1823
|
+
|
1824
|
+
LLAMA_ASSERT(strcmp(res->name, "result_output") == 0);
|
1825
|
+
LLAMA_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
|
1658
1826
|
|
1659
1827
|
#if GGML_USE_MPI
|
1660
|
-
|
1828
|
+
const int64_t n_layer = hparams.n_layer;
|
1829
|
+
ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
|
1661
1830
|
#endif
|
1662
1831
|
|
1663
1832
|
#ifdef GGML_USE_METAL
|
1664
1833
|
if (lctx.ctx_metal && N == 1) {
|
1834
|
+
// TODO: disabled until #2413 is resolved
|
1835
|
+
//if (!ggml_metal_if_optimized(lctx.ctx_metal)) {
|
1836
|
+
// ggml_metal_graph_find_concurrency(lctx.ctx_metal, gf);
|
1837
|
+
//}
|
1665
1838
|
ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
|
1666
|
-
ggml_metal_graph_compute(lctx.ctx_metal,
|
1667
|
-
ggml_metal_get_tensor (lctx.ctx_metal,
|
1839
|
+
ggml_metal_graph_compute(lctx.ctx_metal, gf);
|
1840
|
+
ggml_metal_get_tensor (lctx.ctx_metal, res);
|
1841
|
+
if (!lctx.embedding.empty()) {
|
1842
|
+
ggml_metal_get_tensor(lctx.ctx_metal, embeddings);
|
1843
|
+
}
|
1668
1844
|
} else {
|
1669
1845
|
// IMPORTANT:
|
1670
1846
|
// Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla
|
@@ -1682,34 +1858,32 @@ static bool llama_eval_internal(
|
|
1682
1858
|
ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
|
1683
1859
|
}
|
1684
1860
|
|
1685
|
-
ggml_graph_compute_helper(lctx.work_buffer,
|
1861
|
+
ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
|
1686
1862
|
}
|
1687
1863
|
#else
|
1688
|
-
ggml_graph_compute_helper(lctx.work_buffer,
|
1864
|
+
ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
|
1689
1865
|
#endif
|
1690
1866
|
|
1691
1867
|
#if GGML_USE_MPI
|
1692
|
-
ggml_mpi_graph_compute_post(lctx.ctx_mpi,
|
1868
|
+
ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
|
1693
1869
|
#endif
|
1694
1870
|
|
1695
1871
|
// update kv token count
|
1696
1872
|
lctx.kv_self.n = n_past + N;
|
1697
1873
|
|
1698
|
-
struct ggml_tensor * res = gf.nodes[gf.n_nodes - 1];
|
1699
|
-
|
1700
1874
|
if (cgraph_fname) {
|
1701
|
-
ggml_graph_export(
|
1875
|
+
ggml_graph_export(gf, cgraph_fname);
|
1702
1876
|
}
|
1703
1877
|
|
1704
1878
|
#ifdef GGML_PERF
|
1705
1879
|
// print timing information per ggml operation (for debugging purposes)
|
1706
1880
|
// requires GGML_PERF to be defined
|
1707
|
-
ggml_graph_print(
|
1881
|
+
ggml_graph_print(gf);
|
1708
1882
|
#endif
|
1709
1883
|
|
1710
1884
|
// plot the computation graph in dot format (for debugging purposes)
|
1711
1885
|
//if (n_past%100 == 0) {
|
1712
|
-
// ggml_graph_dump_dot(
|
1886
|
+
// ggml_graph_dump_dot(gf, NULL, "llama.dot");
|
1713
1887
|
//}
|
1714
1888
|
|
1715
1889
|
// extract logits
|
@@ -1734,19 +1908,6 @@ static bool llama_eval_internal(
|
|
1734
1908
|
memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd);
|
1735
1909
|
}
|
1736
1910
|
|
1737
|
-
if (mem_per_token == 0) {
|
1738
|
-
mem_per_token = ggml_used_mem(ctx0)/N;
|
1739
|
-
}
|
1740
|
-
|
1741
|
-
#if 0
|
1742
|
-
printf("\n%s: used_mem = %.3f MB, scratch -- %.3f MB %.3f MB\n", __func__,
|
1743
|
-
ggml_used_mem(ctx0)/1024.0/1024.0,
|
1744
|
-
lctx.get_buf_max_mem(0)/1024.0/1024.0,
|
1745
|
-
lctx.get_buf_max_mem(1)/1024.0/1024.0);
|
1746
|
-
#endif
|
1747
|
-
|
1748
|
-
ggml_free(ctx0);
|
1749
|
-
|
1750
1911
|
// measure the performance only for the single-token evals
|
1751
1912
|
if (N == 1) {
|
1752
1913
|
lctx.t_eval_us += ggml_time_us() - t_start_us;
|
@@ -1858,7 +2019,9 @@ struct llama_tokenizer {
|
|
1858
2019
|
if (token == vocab_.token_to_id.end()) {
|
1859
2020
|
// output any symbols that did not form tokens as bytes.
|
1860
2021
|
for (int j = 0; j < (int) symbol.n; ++j) {
|
1861
|
-
|
2022
|
+
// NOTE: old version, before #2420 - not sure what are the implications of this
|
2023
|
+
//llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
|
2024
|
+
llama_vocab::id token_id = vocab_.token_to_id.at(std::string(1, symbol.text[j]));
|
1862
2025
|
output.push_back(token_id);
|
1863
2026
|
}
|
1864
2027
|
} else {
|
@@ -1915,6 +2078,279 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
|
|
1915
2078
|
return output;
|
1916
2079
|
}
|
1917
2080
|
|
2081
|
+
//
|
2082
|
+
// grammar - internal
|
2083
|
+
//
|
2084
|
+
|
2085
|
+
struct llama_grammar {
|
2086
|
+
const std::vector<std::vector<llama_grammar_element>> rules;
|
2087
|
+
std::vector<std::vector<const llama_grammar_element *>> stacks;
|
2088
|
+
};
|
2089
|
+
|
2090
|
+
// A token that is still being checked against the grammar.
struct llama_grammar_candidate {
    size_t           index;       // position of the token in the candidate array it came from
    const uint32_t * code_points; // code points of the token text still to be matched, 0-terminated
};
|
2094
|
+
|
2095
|
+
// Decodes the NUL-terminated UTF-8 string `src` into a sequence of code points.
// A terminating 0 is appended so that the result's data() can be walked as a
// 0-terminated array.
//
// NOTE: the input is not validated. A multi-byte sequence simply ends at the first
//       byte that is not a continuation byte (10xxxxxx); that covers the terminating
//       NUL, so the scan never runs past the end of the string, and a truncated
//       sequence no longer swallows the ASCII/lead byte that follows it.
std::vector<uint32_t> decode_utf8(const char * src) {
    // expected sequence length, keyed by the high nibble of the lead byte
    static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };

    std::vector<uint32_t> code_points;

    const char * pos = src;
    while (*pos != 0) {
        const uint8_t first_byte = static_cast<uint8_t>(*pos);
        const int     len        = lookup[first_byte >> 4];
        const uint8_t mask       = static_cast<uint8_t>((1 << (8 - len)) - 1);

        uint32_t value = first_byte & mask;
        ++pos;

        // consume up to len - 1 continuation bytes, each contributing 6 payload bits
        for (int i = 1; i < len && (static_cast<uint8_t>(*pos) & 0xC0) == 0x80; ++i, ++pos) {
            value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
        }
        code_points.push_back(value);
    }

    code_points.push_back(0);
    return code_points;
}
|
2117
|
+
|
2118
|
+
// returns true iff pos points to the end of one of the definitions of a rule
|
2119
|
+
static bool llama_grammar_is_end_of_sequence(const llama_grammar_element * pos) {
|
2120
|
+
switch (pos->type) {
|
2121
|
+
case LLAMA_GRETYPE_END: return true;
|
2122
|
+
case LLAMA_GRETYPE_ALT: return true;
|
2123
|
+
default: return false;
|
2124
|
+
}
|
2125
|
+
}
|
2126
|
+
|
2127
|
+
// returns true iff chr satisfies the char range at pos (regular or inverse range)
|
2128
|
+
// asserts that pos is pointing to a char range element
|
2129
|
+
static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
|
2130
|
+
const llama_grammar_element * pos,
|
2131
|
+
const uint32_t chr) {
|
2132
|
+
|
2133
|
+
bool found = false;
|
2134
|
+
bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
|
2135
|
+
LLAMA_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT);
|
2136
|
+
|
2137
|
+
do {
|
2138
|
+
if (pos[1].type == LLAMA_GRETYPE_CHAR_RNG_UPPER) {
|
2139
|
+
// inclusive range, e.g. [a-z]
|
2140
|
+
found = found || (pos->value <= chr && chr <= pos[1].value);
|
2141
|
+
pos += 2;
|
2142
|
+
} else {
|
2143
|
+
// exact char match, e.g. [a] or "a"
|
2144
|
+
found = found || pos->value == chr;
|
2145
|
+
pos += 1;
|
2146
|
+
}
|
2147
|
+
} while (pos->type == LLAMA_GRETYPE_CHAR_ALT);
|
2148
|
+
|
2149
|
+
return std::make_pair(found == is_positive_char, pos);
|
2150
|
+
}
|
2151
|
+
|
2152
|
+
// transforms a grammar pushdown stack into N possible stacks, all ending
|
2153
|
+
// at a character range (terminal element)
|
2154
|
+
static void llama_grammar_advance_stack(
|
2155
|
+
const std::vector<std::vector<llama_grammar_element>> & rules,
|
2156
|
+
const std::vector<const llama_grammar_element *> & stack,
|
2157
|
+
std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {
|
2158
|
+
|
2159
|
+
if (stack.empty()) {
|
2160
|
+
new_stacks.push_back(stack);
|
2161
|
+
return;
|
2162
|
+
}
|
2163
|
+
|
2164
|
+
const llama_grammar_element * pos = stack.back();
|
2165
|
+
|
2166
|
+
switch (pos->type) {
|
2167
|
+
case LLAMA_GRETYPE_RULE_REF: {
|
2168
|
+
const size_t rule_id = static_cast<size_t>(pos->value);
|
2169
|
+
const llama_grammar_element * subpos = rules[rule_id].data();
|
2170
|
+
do {
|
2171
|
+
// init new stack without the top (pos)
|
2172
|
+
std::vector<const llama_grammar_element *> new_stack(stack.begin(), stack.end() - 1);
|
2173
|
+
if (!llama_grammar_is_end_of_sequence(pos + 1)) {
|
2174
|
+
// if this rule ref is followed by another element, add that to stack
|
2175
|
+
new_stack.push_back(pos + 1);
|
2176
|
+
}
|
2177
|
+
if (!llama_grammar_is_end_of_sequence(subpos)) {
|
2178
|
+
// if alternate is nonempty, add to stack
|
2179
|
+
new_stack.push_back(subpos);
|
2180
|
+
}
|
2181
|
+
llama_grammar_advance_stack(rules, new_stack, new_stacks);
|
2182
|
+
while (!llama_grammar_is_end_of_sequence(subpos)) {
|
2183
|
+
// scan to end of alternate def
|
2184
|
+
subpos++;
|
2185
|
+
}
|
2186
|
+
if (subpos->type == LLAMA_GRETYPE_ALT) {
|
2187
|
+
// there's another alternate def of this rule to process
|
2188
|
+
subpos++;
|
2189
|
+
} else {
|
2190
|
+
break;
|
2191
|
+
}
|
2192
|
+
} while (true);
|
2193
|
+
break;
|
2194
|
+
}
|
2195
|
+
case LLAMA_GRETYPE_CHAR:
|
2196
|
+
case LLAMA_GRETYPE_CHAR_NOT:
|
2197
|
+
new_stacks.push_back(stack);
|
2198
|
+
break;
|
2199
|
+
default:
|
2200
|
+
// end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
|
2201
|
+
// (LLAMA_GRETYPE_CHAR_ALT, LLAMA_GRETYPE_CHAR_RNG_UPPER); stack should never be left on
|
2202
|
+
// those
|
2203
|
+
LLAMA_ASSERT(false);
|
2204
|
+
}
|
2205
|
+
}
|
2206
|
+
|
2207
|
+
// takes a set of possible pushdown stacks on a grammar, which are required to
|
2208
|
+
// be positioned at a character range (see `llama_grammar_advance_stack`), and
|
2209
|
+
// produces the N possible stacks if the given char is accepted at those
|
2210
|
+
// positions
|
2211
|
+
static std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
|
2212
|
+
const std::vector<std::vector<llama_grammar_element>> & rules,
|
2213
|
+
const std::vector<std::vector<const llama_grammar_element *>> & stacks,
|
2214
|
+
const uint32_t chr) {
|
2215
|
+
|
2216
|
+
std::vector<std::vector<const llama_grammar_element *>> new_stacks;
|
2217
|
+
|
2218
|
+
for (const auto & stack : stacks) {
|
2219
|
+
if (stack.empty()) {
|
2220
|
+
continue;
|
2221
|
+
}
|
2222
|
+
|
2223
|
+
auto match = llama_grammar_match_char(stack.back(), chr);
|
2224
|
+
if (match.first) {
|
2225
|
+
const llama_grammar_element * pos = match.second;
|
2226
|
+
|
2227
|
+
// update top of stack to next element, if any
|
2228
|
+
std::vector<const llama_grammar_element *> new_stack(stack.begin(), stack.end() - 1);
|
2229
|
+
if (!llama_grammar_is_end_of_sequence(pos)) {
|
2230
|
+
new_stack.push_back(pos);
|
2231
|
+
}
|
2232
|
+
llama_grammar_advance_stack(rules, new_stack, new_stacks);
|
2233
|
+
}
|
2234
|
+
}
|
2235
|
+
|
2236
|
+
return new_stacks;
|
2237
|
+
}
|
2238
|
+
|
2239
|
+
static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
|
2240
|
+
const std::vector<std::vector<llama_grammar_element>> & rules,
|
2241
|
+
const std::vector<std::vector<const llama_grammar_element *>> & stacks,
|
2242
|
+
const std::vector<llama_grammar_candidate> & candidates);
|
2243
|
+
|
2244
|
+
static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_stack(
|
2245
|
+
const std::vector<std::vector<llama_grammar_element>> & rules,
|
2246
|
+
const std::vector<const llama_grammar_element *> & stack,
|
2247
|
+
const std::vector<llama_grammar_candidate> & candidates) {
|
2248
|
+
|
2249
|
+
std::vector<llama_grammar_candidate> rejects;
|
2250
|
+
|
2251
|
+
if (stack.empty()) {
|
2252
|
+
// accept nothing; EOS is handled elsewhere
|
2253
|
+
rejects.insert(rejects.end(), candidates.begin(), candidates.end());
|
2254
|
+
return rejects;
|
2255
|
+
}
|
2256
|
+
|
2257
|
+
const llama_grammar_element * stack_pos = stack.back();
|
2258
|
+
|
2259
|
+
std::vector<llama_grammar_candidate> next_candidates;
|
2260
|
+
for (auto tok : candidates) {
|
2261
|
+
if (llama_grammar_match_char(stack_pos, tok.code_points[0]).first) {
|
2262
|
+
if (tok.code_points[1] != 0) {
|
2263
|
+
next_candidates.push_back({ tok.index, tok.code_points + 1 });
|
2264
|
+
}
|
2265
|
+
} else {
|
2266
|
+
rejects.push_back(tok);
|
2267
|
+
}
|
2268
|
+
}
|
2269
|
+
|
2270
|
+
auto stack_pos_after = llama_grammar_match_char(stack_pos, 0).second;
|
2271
|
+
|
2272
|
+
// update top of stack to next element, if any
|
2273
|
+
std::vector<const llama_grammar_element *> stack_after(stack.begin(), stack.end() - 1);
|
2274
|
+
if (!llama_grammar_is_end_of_sequence(stack_pos_after)) {
|
2275
|
+
stack_after.push_back(stack_pos_after);
|
2276
|
+
}
|
2277
|
+
std::vector<std::vector<const llama_grammar_element *>> next_stacks;
|
2278
|
+
llama_grammar_advance_stack(rules, stack_after, next_stacks);
|
2279
|
+
|
2280
|
+
auto next_rejects = llama_grammar_reject_candidates(rules, next_stacks, next_candidates);
|
2281
|
+
for (auto tok : next_rejects) {
|
2282
|
+
rejects.push_back({ tok.index, tok.code_points - 1 });
|
2283
|
+
}
|
2284
|
+
|
2285
|
+
return rejects;
|
2286
|
+
}
|
2287
|
+
|
2288
|
+
static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
|
2289
|
+
const std::vector<std::vector<llama_grammar_element>> & rules,
|
2290
|
+
const std::vector<std::vector<const llama_grammar_element *>> & stacks,
|
2291
|
+
const std::vector<llama_grammar_candidate> & candidates) {
|
2292
|
+
LLAMA_ASSERT(!stacks.empty()); // REVIEW
|
2293
|
+
|
2294
|
+
if (candidates.empty()) {
|
2295
|
+
return std::vector<llama_grammar_candidate>();
|
2296
|
+
}
|
2297
|
+
|
2298
|
+
auto rejects = llama_grammar_reject_candidates_for_stack(rules, stacks.front(), candidates);
|
2299
|
+
|
2300
|
+
for (size_t i = 1, size = stacks.size(); i < size; ++i) {
|
2301
|
+
rejects = llama_grammar_reject_candidates_for_stack(rules, stacks[i], rejects);
|
2302
|
+
}
|
2303
|
+
return rejects;
|
2304
|
+
}
|
2305
|
+
|
2306
|
+
//
|
2307
|
+
// grammar - external
|
2308
|
+
//
|
2309
|
+
|
2310
|
+
struct llama_grammar * llama_grammar_init(
|
2311
|
+
const llama_grammar_element ** rules,
|
2312
|
+
size_t n_rules,
|
2313
|
+
size_t start_rule_index) {
|
2314
|
+
const llama_grammar_element * pos;
|
2315
|
+
|
2316
|
+
// copy rule definitions into vectors
|
2317
|
+
std::vector<std::vector<llama_grammar_element>> vec_rules(n_rules);
|
2318
|
+
for (size_t i = 0; i < n_rules; i++) {
|
2319
|
+
for (pos = rules[i]; pos->type != LLAMA_GRETYPE_END; pos++) {
|
2320
|
+
vec_rules[i].push_back(*pos);
|
2321
|
+
}
|
2322
|
+
vec_rules[i].push_back({LLAMA_GRETYPE_END, 0});
|
2323
|
+
}
|
2324
|
+
|
2325
|
+
// loop over alternates of start rule to build initial stacks
|
2326
|
+
std::vector<std::vector<const llama_grammar_element *>> stacks;
|
2327
|
+
pos = rules[start_rule_index];
|
2328
|
+
do {
|
2329
|
+
std::vector<const llama_grammar_element *> stack;
|
2330
|
+
if (!llama_grammar_is_end_of_sequence(pos)) {
|
2331
|
+
// if alternate is nonempty, add to stack
|
2332
|
+
stack.push_back(pos);
|
2333
|
+
}
|
2334
|
+
llama_grammar_advance_stack(vec_rules, stack, stacks);
|
2335
|
+
while (!llama_grammar_is_end_of_sequence(pos)) {
|
2336
|
+
// scan to end of alternate def
|
2337
|
+
pos++;
|
2338
|
+
}
|
2339
|
+
if (pos->type == LLAMA_GRETYPE_ALT) {
|
2340
|
+
// there's another alternate def of this rule to process
|
2341
|
+
pos++;
|
2342
|
+
} else {
|
2343
|
+
break;
|
2344
|
+
}
|
2345
|
+
} while (true);
|
2346
|
+
|
2347
|
+
return new llama_grammar{ std::move(vec_rules), std::move(stacks) };
|
2348
|
+
}
|
2349
|
+
|
2350
|
+
// Destroys a grammar object with `delete`; intended for grammars allocated with `new`.
void llama_grammar_free(struct llama_grammar * grammar) {
    delete grammar;
}
|
2353
|
+
|
1918
2354
|
//
|
1919
2355
|
// sampling
|
1920
2356
|
//
|
@@ -2200,6 +2636,47 @@ void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, l
|
|
2200
2636
|
}
|
2201
2637
|
}
|
2202
2638
|
|
2639
|
+
void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar) {
|
2640
|
+
assert(ctx);
|
2641
|
+
const int64_t t_start_sample_us = ggml_time_us();
|
2642
|
+
|
2643
|
+
bool allow_eos = false;
|
2644
|
+
for (const auto & stack : grammar->stacks) {
|
2645
|
+
if (stack.empty()) {
|
2646
|
+
allow_eos = true;
|
2647
|
+
break;
|
2648
|
+
}
|
2649
|
+
}
|
2650
|
+
|
2651
|
+
const llama_token eos = llama_token_eos();
|
2652
|
+
|
2653
|
+
std::vector<std::vector<uint32_t>> candidates_decoded;
|
2654
|
+
std::vector<llama_grammar_candidate> candidates_grammar;
|
2655
|
+
|
2656
|
+
for (size_t i = 0; i < candidates->size; ++i) {
|
2657
|
+
const llama_token id = candidates->data[i].id;
|
2658
|
+
const char * str = llama_token_to_str(ctx, id);
|
2659
|
+
if (id == eos) {
|
2660
|
+
if (!allow_eos) {
|
2661
|
+
candidates->data[i].logit = -INFINITY;
|
2662
|
+
}
|
2663
|
+
} else if (*str == 0) {
|
2664
|
+
candidates->data[i].logit = -INFINITY;
|
2665
|
+
} else {
|
2666
|
+
candidates_decoded.push_back(decode_utf8(str));
|
2667
|
+
candidates_grammar.push_back({ i, candidates_decoded.back().data() });
|
2668
|
+
}
|
2669
|
+
}
|
2670
|
+
|
2671
|
+
const auto rejects =
|
2672
|
+
llama_grammar_reject_candidates(grammar->rules, grammar->stacks, candidates_grammar);
|
2673
|
+
for (auto & reject : rejects) {
|
2674
|
+
candidates->data[reject.index].logit = -INFINITY;
|
2675
|
+
}
|
2676
|
+
|
2677
|
+
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
2678
|
+
}
|
2679
|
+
|
2203
2680
|
static void llama_log_softmax(float * array, size_t size) {
|
2204
2681
|
float max_l = *std::max_element(array, array + size);
|
2205
2682
|
float sum = 0.f;
|
@@ -2375,6 +2852,29 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
|
|
2375
2852
|
return result;
|
2376
2853
|
}
|
2377
2854
|
|
2855
|
+
void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token) {
|
2856
|
+
const int64_t t_start_sample_us = ggml_time_us();
|
2857
|
+
|
2858
|
+
if (token == llama_token_eos()) {
|
2859
|
+
for (const auto & stack : grammar->stacks) {
|
2860
|
+
if (stack.empty()) {
|
2861
|
+
return;
|
2862
|
+
}
|
2863
|
+
}
|
2864
|
+
LLAMA_ASSERT(false);
|
2865
|
+
}
|
2866
|
+
|
2867
|
+
const char * str = llama_token_to_str(ctx, token);
|
2868
|
+
// Note terminating 0 in decoded string
|
2869
|
+
auto code_points = decode_utf8(str);
|
2870
|
+
for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
|
2871
|
+
grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
|
2872
|
+
}
|
2873
|
+
LLAMA_ASSERT(!grammar->stacks.empty());
|
2874
|
+
|
2875
|
+
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
2876
|
+
}
|
2877
|
+
|
2378
2878
|
//
|
2379
2879
|
// quantization
|
2380
2880
|
//
|
@@ -2448,8 +2948,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
2448
2948
|
case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
|
2449
2949
|
case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
|
2450
2950
|
case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
|
2451
|
-
case LLAMA_FTYPE_MOSTLY_F16:
|
2452
|
-
case LLAMA_FTYPE_ALL_F32:
|
2951
|
+
case LLAMA_FTYPE_MOSTLY_F16: quantized_type = GGML_TYPE_F16; break;
|
2952
|
+
case LLAMA_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break;
|
2453
2953
|
|
2454
2954
|
#ifdef GGML_USE_K_QUANTS
|
2455
2955
|
// K-quants
|
@@ -2533,16 +3033,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
2533
3033
|
} else {
|
2534
3034
|
new_type = quantized_type;
|
2535
3035
|
#ifdef GGML_USE_K_QUANTS
|
2536
|
-
bool convert_incompatible_tensor = false;
|
2537
|
-
if (quantized_type == GGML_TYPE_Q2_K || quantized_type == GGML_TYPE_Q3_K || quantized_type == GGML_TYPE_Q4_K ||
|
2538
|
-
quantized_type == GGML_TYPE_Q5_K || quantized_type == GGML_TYPE_Q6_K) {
|
2539
|
-
int nx = tensor.ne.at(0);
|
2540
|
-
int ny = tensor.ne.at(1);
|
2541
|
-
if (nx % QK_K != 0 || ny % QK_K != 0) {
|
2542
|
-
fprintf(stderr, "\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
|
2543
|
-
convert_incompatible_tensor = true;
|
2544
|
-
}
|
2545
|
-
}
|
2546
3036
|
if (tensor.name == "output.weight") {
|
2547
3037
|
int nx = tensor.ne.at(0);
|
2548
3038
|
int ny = tensor.ne.at(1);
|
@@ -2568,6 +3058,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
2568
3058
|
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
|
2569
3059
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
|
2570
3060
|
}
|
3061
|
+
bool convert_incompatible_tensor = false;
|
3062
|
+
if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
|
3063
|
+
new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
|
3064
|
+
int nx = tensor.ne.at(0);
|
3065
|
+
int ny = tensor.ne.at(1);
|
3066
|
+
if (nx % QK_K != 0 || ny % QK_K != 0) {
|
3067
|
+
fprintf(stderr, "\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
|
3068
|
+
convert_incompatible_tensor = true;
|
3069
|
+
}
|
3070
|
+
}
|
2571
3071
|
if (convert_incompatible_tensor) {
|
2572
3072
|
if (tensor.name == "output.weight") {
|
2573
3073
|
new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
|
@@ -2594,7 +3094,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
2594
3094
|
f32_data = (float *) f32_conv_buf.addr;
|
2595
3095
|
}
|
2596
3096
|
|
2597
|
-
printf("quantizing .. ");
|
3097
|
+
printf("quantizing to %s .. ", ggml_type_name(new_type));
|
2598
3098
|
fflush(stdout);
|
2599
3099
|
|
2600
3100
|
work.resize(nelements * 4); // upper bound on size
|
@@ -2697,8 +3197,8 @@ struct llama_model * llama_load_model_from_file(
|
|
2697
3197
|
|
2698
3198
|
ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
|
2699
3199
|
|
2700
|
-
if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
|
2701
|
-
params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
|
3200
|
+
if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gqa, params.rms_norm_eps, params.n_gpu_layers,
|
3201
|
+
params.main_gpu, params.tensor_split, params.mul_mat_q, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
|
2702
3202
|
memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback,
|
2703
3203
|
params.progress_callback_user_data)) {
|
2704
3204
|
delete model;
|
@@ -2775,10 +3275,47 @@ struct llama_context * llama_new_context_with_model(
|
|
2775
3275
|
ctx->embedding.resize(hparams.n_embd);
|
2776
3276
|
}
|
2777
3277
|
|
2778
|
-
|
3278
|
+
#ifdef LLAMA_USE_ALLOCATOR
|
3279
|
+
{
|
3280
|
+
static const size_t tensor_alignment = 32;
|
3281
|
+
// the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data
|
3282
|
+
ctx->buf_compute.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead());
|
3283
|
+
|
3284
|
+
// create measure allocator
|
3285
|
+
ctx->alloc = ggml_allocr_new_measure(tensor_alignment);
|
3286
|
+
|
3287
|
+
// build worst-case graph
|
3288
|
+
int n_tokens = std::min((int)hparams.n_ctx, params.n_batch);
|
3289
|
+
int n_past = hparams.n_ctx - n_tokens;
|
3290
|
+
llama_token token = llama_token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
|
3291
|
+
ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past);
|
3292
|
+
|
3293
|
+
// measure memory requirements for the graph
|
3294
|
+
size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
|
2779
3295
|
|
3296
|
+
fprintf(stderr, "%s: compute buffer total size = %7.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
|
3297
|
+
|
3298
|
+
// debug - for comparison with scratch buffer
|
3299
|
+
//size_t prev_req =
|
3300
|
+
// MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type) +
|
3301
|
+
// MEM_REQ_SCRATCH1().at(ctx->model.type) +
|
3302
|
+
// MEM_REQ_EVAL().at(ctx->model.type);
|
3303
|
+
//fprintf(stderr, "%s: (debug) equivalent with scratch buffer = %7.2f MB\n", __func__, prev_req / 1024.0 / 1024.0);
|
3304
|
+
|
3305
|
+
// recreate allocator with exact memory requirements
|
3306
|
+
ggml_allocr_free(ctx->alloc);
|
3307
|
+
|
3308
|
+
ctx->buf_alloc.resize(alloc_size);
|
3309
|
+
ctx->alloc = ggml_allocr_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, tensor_alignment);
|
3310
|
+
}
|
3311
|
+
#else
|
3312
|
+
ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead());
|
3313
|
+
#endif
|
3314
|
+
|
3315
|
+
#ifdef LLAMA_USE_SCRATCH
|
2780
3316
|
ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type));
|
2781
3317
|
ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
|
3318
|
+
#endif
|
2782
3319
|
}
|
2783
3320
|
|
2784
3321
|
#ifdef GGML_USE_METAL
|
@@ -2799,7 +3336,7 @@ struct llama_context * llama_new_context_with_model(
|
|
2799
3336
|
|
2800
3337
|
const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
|
2801
3338
|
|
2802
|
-
|
3339
|
+
fprintf(stderr, "%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
|
2803
3340
|
|
2804
3341
|
#define LLAMA_METAL_CHECK_BUF(result) \
|
2805
3342
|
if (!(result)) { \
|
@@ -2848,9 +3385,6 @@ struct llama_context * llama_init_from_file(
|
|
2848
3385
|
}
|
2849
3386
|
|
2850
3387
|
void llama_free(struct llama_context * ctx) {
|
2851
|
-
if (ctx->model_owner) {
|
2852
|
-
delete &ctx->model;
|
2853
|
-
}
|
2854
3388
|
delete ctx;
|
2855
3389
|
}
|
2856
3390
|
|
@@ -3260,7 +3794,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
|
|
3260
3794
|
const auto & kv_self = ctx->kv_self;
|
3261
3795
|
const auto & hparams = ctx->model.hparams;
|
3262
3796
|
const int n_layer = hparams.n_layer;
|
3263
|
-
const int n_embd = hparams.
|
3797
|
+
const int n_embd = hparams.n_embd_gqa();
|
3264
3798
|
const int n_ctx = hparams.n_ctx;
|
3265
3799
|
|
3266
3800
|
const size_t kv_size = kv_self.buf.size;
|
@@ -3363,7 +3897,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
|
|
3363
3897
|
const auto & kv_self = ctx->kv_self;
|
3364
3898
|
const auto & hparams = ctx->model.hparams;
|
3365
3899
|
const int n_layer = hparams.n_layer;
|
3366
|
-
const int n_embd = hparams.
|
3900
|
+
const int n_embd = hparams.n_embd_gqa();
|
3367
3901
|
const int n_ctx = hparams.n_ctx;
|
3368
3902
|
|
3369
3903
|
size_t kv_size;
|