llama_cpp 0.3.4 → 0.3.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/README.md +18 -2
- data/ext/llama_cpp/extconf.rb +2 -1
- data/ext/llama_cpp/llama_cpp.cpp +315 -8
- data/ext/llama_cpp/src/ggml-alloc.c +541 -0
- data/ext/llama_cpp/src/ggml-alloc.h +22 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +2271 -414
- data/ext/llama_cpp/src/ggml-cuda.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.h +7 -0
- data/ext/llama_cpp/src/ggml-metal.m +218 -87
- data/ext/llama_cpp/src/ggml-metal.metal +72 -55
- data/ext/llama_cpp/src/ggml.c +754 -996
- data/ext/llama_cpp/src/ggml.h +94 -18
- data/ext/llama_cpp/src/k_quants.c +350 -24
- data/ext/llama_cpp/src/llama.cpp +713 -179
- data/ext/llama_cpp/src/llama.h +61 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +26 -0
- metadata +4 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -56,8 +56,14 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

+#if !defined(GGML_USE_CUBLAS) && !defined(GGML_USE_METAL)
+#include "ggml-alloc.h"
+#define LLAMA_USE_ALLOCATOR
+#else
 #define LLAMA_USE_SCRATCH
 #define LLAMA_MAX_SCRATCH_BUFFERS 16
+#endif
+

 // available llama models
 enum e_model {
@@ -67,6 +73,7 @@ enum e_model {
     MODEL_13B,
     MODEL_30B,
     MODEL_65B,
+    MODEL_70B,
 };

 static const size_t kB = 1024;
@@ -98,18 +105,18 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
 }

 //
-// memory sizes
+// memory sizes (calculated for n_batch == 512)
 //

 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0(int n_ctx)
 {
     static std::map<e_model, size_t> k_sizes = {
-
-        {
-        {
-        {
-        {
-        {
+        { MODEL_3B, ((size_t) n_ctx / 16ull + 92ull) * MB },
+        { MODEL_7B, ((size_t) n_ctx / 16ull + 100ull) * MB },
+        { MODEL_13B, ((size_t) n_ctx / 12ull + 120ull) * MB },
+        { MODEL_30B, ((size_t) n_ctx / 9ull + 160ull) * MB },
+        { MODEL_65B, ((size_t) n_ctx / 6ull + 256ull) * MB }, // guess
+        { MODEL_70B, ((size_t) n_ctx / 7ull + 164ull) * MB },
     };
     return k_sizes;
 }
@@ -117,38 +124,26 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0(int n_ctx)
 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
 {
     static std::map<e_model, size_t> k_sizes = {
-        { MODEL_3B,
-        { MODEL_7B,
-        { MODEL_13B,
-        { MODEL_30B,
-        { MODEL_65B,
-
-    return k_sizes;
-}
-
-// 2*n_embd*n_ctx*n_layer*sizeof(float16)
-static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
-{
-    static std::map<e_model, size_t> k_sizes = {
-        { MODEL_3B, 682ull * MB },
-        { MODEL_7B, 1026ull * MB },
-        { MODEL_13B, 1608ull * MB },
-        { MODEL_30B, 3124ull * MB },
-        { MODEL_65B, 5120ull * MB },
+        { MODEL_3B, 128ull * MB },
+        { MODEL_7B, 160ull * MB },
+        { MODEL_13B, 192ull * MB },
+        { MODEL_30B, 256ull * MB },
+        { MODEL_65B, 384ull * MB }, // guess
+        { MODEL_70B, 304ull * MB },
     };
     return k_sizes;
 }

-//
-
-static const std::map<e_model, size_t> & MEM_REQ_EVAL(int n_ctx)
+// used to store the compute graph tensors + non-scratch data
+static const std::map<e_model, size_t> & MEM_REQ_EVAL()
 {
     static std::map<e_model, size_t> k_sizes = {
-        { MODEL_3B,
-        { MODEL_7B,
-        { MODEL_13B,
-        { MODEL_30B,
-        { MODEL_65B,
+        { MODEL_3B, 8ull * MB },
+        { MODEL_7B, 10ull * MB },
+        { MODEL_13B, 12ull * MB },
+        { MODEL_30B, 16ull * MB },
+        { MODEL_65B, 24ull * MB }, // guess
+        { MODEL_70B, 24ull * MB },
     };
     return k_sizes;
 }
@@ -163,6 +158,7 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
         { MODEL_13B, 640ull * kB },
         { MODEL_30B, 768ull * kB },
         { MODEL_65B, 1536ull * kB },
+        { MODEL_70B, 1536ull * kB }, // TODO (likely can be reduced)
     };
     return k_sizes;
 }
@@ -177,19 +173,26 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
         { MODEL_13B, 160ull },
         { MODEL_30B, 208ull },
        { MODEL_65B, 416ull },
+        { MODEL_70B, 416ull }, // TODO (likely can be reduced)
     };
     return k_sizes;
 }

 // default hparams (LLaMA 7B)
 struct llama_hparams {
-    uint32_t n_vocab
-    uint32_t n_ctx
-    uint32_t n_embd
-    uint32_t n_mult
-    uint32_t n_head
-    uint32_t
-    uint32_t
+    uint32_t n_vocab = 32000;
+    uint32_t n_ctx = 512; // this is provided as user input?
+    uint32_t n_embd = 4096;
+    uint32_t n_mult = 256;
+    uint32_t n_head = 32;
+    uint32_t n_head_kv = 32;
+    uint32_t n_layer = 32;
+    uint32_t n_rot = 64;
+
+    // LLaMAv2
+    // TODO: load from model data hparams
+    float f_ffn_mult = 1.0f;
+    float f_rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;

     float rope_freq_base = 10000.0f;
     float rope_freq_scale = 1.0f;
@@ -197,7 +200,28 @@ struct llama_hparams {
     enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;

     bool operator!=(const llama_hparams & other) const {
-        return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams)));
+        return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams))); // NOLINT
+    }
+
+    uint32_t n_gqa() const {
+        return n_head/n_head_kv;
+    }
+
+    uint32_t n_embd_head() const {
+        return n_embd/n_head;
+    }
+
+    uint32_t n_embd_gqa() const {
+        return n_embd/n_gqa();
+    }
+
+    size_t kv_size() const {
+        size_t result = 2ull;
+        result *= (size_t) n_embd_gqa();
+        result *= (size_t) n_ctx;
+        result *= (size_t) n_layer;
+        result *= sizeof(ggml_fp16_t);
+        return result;
     }
 };

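The `n_gqa()`, `n_embd_head()`, `n_embd_gqa()` and `kv_size()` helpers added above are what shrink the KV cache under grouped-query attention. A minimal sketch of the arithmetic, assuming the published LLaMA-2 70B dimensions (n_embd = 8192, n_head = 64, n_layer = 80) and the values n_gqa = 8, n_ctx = 512; these numbers are illustrative and not taken from this diff:

```cpp
#include <cstddef>
#include <cstdint>
#include <cstdio>

int main() {
    // Illustrative dimensions only (LLaMA-2 70B as published); the pre-GGUF file
    // format in this release does not store n_head_kv, so n_gqa is user-supplied.
    const uint32_t n_embd = 8192, n_head = 64, n_layer = 80, n_ctx = 512, n_gqa = 8;

    const uint32_t n_head_kv   = n_head / n_gqa;  // 8   -> hparams.n_head_kv
    const uint32_t n_embd_head = n_embd / n_head; // 128 -> per-head dimension (n_rot)
    const uint32_t n_embd_gqa  = n_embd / n_gqa;  // 1024, vs. 8192 without GQA

    // kv_size(): 2 (K and V) * n_embd_gqa * n_ctx * n_layer * sizeof(fp16)
    const size_t kv_bytes = 2ull * n_embd_gqa * n_ctx * n_layer * 2;
    std::printf("n_head_kv=%u n_embd_head=%u n_embd_gqa=%u kv cache=%.1f MB\n",
                n_head_kv, n_embd_head, n_embd_gqa, kv_bytes / 1024.0 / 1024.0); // ~160 MB
    return 0;
}
```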
@@ -309,13 +333,22 @@ struct llama_model {

 struct llama_context {
     llama_context(const llama_model & model) : model(model), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
-#ifdef GGML_USE_METAL
     ~llama_context() {
+        if (model_owner) {
+            delete &model;
+        }
+#ifdef GGML_USE_METAL
         if (ctx_metal) {
             ggml_metal_free(ctx_metal);
         }
-    }
 #endif
+#ifdef LLAMA_USE_ALLOCATOR
+        if (alloc) {
+            ggml_allocr_free(alloc);
+        }
+#endif
+    }
+
     std::mt19937 rng;

     bool has_evaluated_once = false;
@@ -353,7 +386,17 @@ struct llama_context {
     // memory buffers used to evaluate the model
     // TODO: move in llama_state
     llama_ctx_buffer buf_compute;
+
+#ifdef LLAMA_USE_ALLOCATOR
+    llama_ctx_buffer buf_alloc;
+    ggml_allocr * alloc = NULL;
+#endif
+
+#ifdef LLAMA_USE_SCRATCH
     llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
+    int buf_last = 0;
+    size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
+#endif

 #ifdef GGML_USE_METAL
     ggml_metal_context * ctx_metal = NULL;
@@ -363,9 +406,6 @@ struct llama_context {
     ggml_mpi_context * ctx_mpi = NULL;
 #endif

-    int buf_last = 0;
-    size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
-
     void use_buf(struct ggml_context * ctx, int i) {
 #if defined(LLAMA_USE_SCRATCH)
         size_t last_size = 0;
@@ -499,12 +539,16 @@ struct llama_file_loader {
     }
     void read_hparams() {
         hparams.n_vocab = file.read_u32();
-        hparams.n_embd
-        hparams.n_mult
-        hparams.n_head
+        hparams.n_embd = file.read_u32();
+        hparams.n_mult = file.read_u32();
+        hparams.n_head = file.read_u32();
         hparams.n_layer = file.read_u32();
-        hparams.n_rot
-        hparams.ftype
+        hparams.n_rot = file.read_u32();
+        hparams.ftype = (enum llama_ftype) file.read_u32();
+
+        // LLaMAv2
+        // TODO: read from header
+        hparams.n_head_kv = hparams.n_head;
     }
     void read_vocab() {
         vocab.id_to_token.resize(hparams.n_vocab);
@@ -803,7 +847,7 @@ static bool kv_cache_init(
         ggml_type wtype,
         int n_ctx,
         int n_gpu_layers) {
-    const int n_embd = hparams.
+    const int n_embd = hparams.n_embd_gqa();
     const int n_layer = hparams.n_layer;

     const int64_t n_mem = n_layer*n_ctx;
@@ -847,6 +891,8 @@ struct llama_context_params llama_context_default_params() {
         /*.seed =*/ LLAMA_DEFAULT_SEED,
         /*.n_ctx =*/ 512,
         /*.n_batch =*/ 512,
+        /*.n_gqa =*/ 1,
+        /*.rms_norm_eps =*/ LLAMA_DEFAULT_RMS_EPS,
         /*.gpu_layers =*/ 0,
         /*.main_gpu =*/ 0,
         /*.tensor_split =*/ nullptr,
@@ -855,6 +901,7 @@ struct llama_context_params llama_context_default_params() {
         /*.progress_callback =*/ nullptr,
         /*.progress_callback_user_data =*/ nullptr,
         /*.low_vram =*/ false,
+        /*.mul_mat_q =*/ false,
         /*.f16_kv =*/ true,
         /*.logits_all =*/ false,
         /*.vocab_only =*/ false,
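The new fields surface through `llama_context_params`, so existing callers keep the defaults (n_gqa = 1) unless they opt in. A hedged sketch of a caller targeting a 70B GGML file using this C API — the file path is a placeholder and error handling is minimal:

```cpp
#include <cstdio>
#include "llama.h"

int main() {
    llama_context_params params = llama_context_default_params();
    params.n_ctx        = 2048;
    params.n_gqa        = 8;      // new: grouped-query attention factor (8 for the 70B model)
    params.rms_norm_eps = 1e-5f;  // new: RMSNorm epsilon (LLaMA-2 uses 1e-5)
    params.mul_mat_q    = true;   // new: prefer the quantized mat-mul CUDA kernels when built with cuBLAS

    // placeholder path; not a file shipped with this gem
    llama_model * model = llama_load_model_from_file("llama-2-70b.ggmlv3.q4_0.bin", params);
    if (model == nullptr) {
        std::fprintf(stderr, "failed to load model\n");
        return 1;
    }
    llama_context * ctx = llama_new_context_with_model(model, params);

    // ... tokenize, llama_eval, sample as in earlier versions ...

    llama_free(ctx);
    llama_free_model(model);
    return 0;
}
```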
@@ -966,6 +1013,7 @@ static const char *llama_model_type_name(e_model type) {
         case MODEL_13B: return "13B";
         case MODEL_30B: return "30B";
         case MODEL_65B: return "65B";
+        case MODEL_70B: return "70B";
         default: LLAMA_ASSERT(false);
     }
 }
@@ -976,9 +1024,12 @@ static void llama_model_load_internal(
         llama_vocab & vocab,
         int n_ctx,
         int n_batch,
+        int n_gqa,
+        float rms_norm_eps,
         int n_gpu_layers,
         int main_gpu,
         const float * tensor_split,
+        const bool mul_mat_q,
         float rope_freq_base,
         float rope_freq_scale,
         bool low_vram,
@@ -997,8 +1048,12 @@ static void llama_model_load_internal(
     model.hparams = ml->file_loader->hparams;
     model.n_gpu_layers = n_gpu_layers;
     llama_file_version file_version = ml->file_loader->file_version;
+
     auto & hparams = model.hparams;

+    // TODO: read from file
+    hparams.f_rms_norm_eps = rms_norm_eps;
+
     {
         switch (hparams.n_layer) {
             case 26: model.type = e_model::MODEL_3B; break;
@@ -1016,11 +1071,25 @@ static void llama_model_load_internal(

         hparams.n_ctx = n_ctx;

+        // LLaMAv2
+        // TODO: temporary until GGUF
+        LLAMA_ASSERT(hparams.n_head % n_gqa == 0);
+        hparams.n_head_kv = hparams.n_head / n_gqa;
+        if (model.type == e_model::MODEL_65B && n_gqa == 8) {
+            fprintf(stderr, "%s: warning: assuming 70B model based on GQA == %d\n", __func__, n_gqa);
+            model.type = e_model::MODEL_70B;
+            hparams.f_ffn_mult = 1.3f; // from the params.json of the 70B model
+        }
+
         hparams.rope_freq_base = rope_freq_base;
         hparams.rope_freq_scale = rope_freq_scale;
     }

-
+    // ref: https://github.com/facebookresearch/llama/blob/6c7fe276574e78057f917549435a2554000a876d/llama/model.py#L194-L199
+    const uint32_t n_ff_raw = 2*(4*hparams.n_embd)/3;
+    const uint32_t n_ff_mult = hparams.f_ffn_mult*n_ff_raw;
+    const uint32_t n_ff = ((n_ff_mult + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
+    //const uint32_t n_ff = 28672;

     {
         fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version));
@@ -1029,12 +1098,15 @@ static void llama_model_load_internal(
         fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd);
         fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult);
         fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head);
+        fprintf(stderr, "%s: n_head_kv = %u\n", __func__, hparams.n_head_kv);
         fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer);
-        fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot);
+        fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
+        fprintf(stderr, "%s: n_gqa = %u\n", __func__, hparams.n_gqa());
+        fprintf(stderr, "%s: rnorm_eps = %.1e\n", __func__, hparams.f_rms_norm_eps);
+        fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
         fprintf(stderr, "%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base);
         fprintf(stderr, "%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale);
         fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
-        fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
         fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
     }

@@ -1069,7 +1141,7 @@ static void llama_model_load_internal(
     {
         model.buf.resize(ctx_size);
         if (use_mlock) {
-            model.mlock_buf.init(model.buf.addr);
+            model.mlock_buf.init (model.buf.addr);
             model.mlock_buf.grow_to(model.buf.size);
         }

@@ -1086,9 +1158,11 @@ static void llama_model_load_internal(
     }

     (void) main_gpu;
+    (void) mul_mat_q;
 #if defined(GGML_USE_CUBLAS)
     fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__);
     ggml_cuda_set_main_device(main_gpu);
+    ggml_cuda_set_mul_mat_q(mul_mat_q);
 #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
 #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
 #elif defined(GGML_USE_CLBLAST)
@@ -1104,9 +1178,10 @@ static void llama_model_load_internal(
     size_t vram_weights = 0;
     size_t vram_scratch = 0;
     {
-        const uint32_t n_embd
-        const uint32_t
-        const uint32_t
+        const uint32_t n_embd = hparams.n_embd;
+        const uint32_t n_embd_gqa = hparams.n_embd_gqa();
+        const uint32_t n_layer = hparams.n_layer;
+        const uint32_t n_vocab = hparams.n_vocab;

         ml->ggml_ctx = ctx;

@@ -1154,16 +1229,16 @@ static void llama_model_load_internal(

             layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);

-            layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd},
-            layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd,
-            layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd,
-            layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd},
+            layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend_split);
+            layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd_gqa}, backend_split);
+            layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd_gqa}, backend_split);
+            layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend_split);

             layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);

-            layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff},
-            layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff,
-            layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff},
+            layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend_split);
+            layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend_split);
+            layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend_split);

             if (backend == GGML_BACKEND_GPU) {
                 vram_weights +=
@@ -1181,16 +1256,20 @@ static void llama_model_load_internal(
         const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;

         // this is the total memory required to run the inference
-
+        size_t mem_required =
             ctx_size +
-            mmapped_size - vram_weights
+            mmapped_size - vram_weights; // weights in VRAM not in memory
+
+#ifndef LLAMA_USE_ALLOCATOR
+        mem_required +=
             MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) +
             MEM_REQ_SCRATCH1().at(model.type) +
-            MEM_REQ_EVAL(
+            MEM_REQ_EVAL().at(model.type);
+#endif

         // this is the memory required by one llama_state
         const size_t mem_required_state =
-            scale*
+            scale*hparams.kv_size();

         fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
                 mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
@@ -1231,7 +1310,7 @@ static void llama_model_load_internal(
                 fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
             } else {
                 fprintf(stderr, "%s: offloading v cache to GPU\n", __func__);
-                vram_kv_cache +=
+                vram_kv_cache += hparams.kv_size() / 2;
             }
         }
         if (n_gpu_layers > (int) hparams.n_layer + 2) {
@@ -1239,7 +1318,7 @@ static void llama_model_load_internal(
                 fprintf(stderr, "%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
             } else {
                 fprintf(stderr, "%s: offloading k cache to GPU\n", __func__);
-                vram_kv_cache +=
+                vram_kv_cache += hparams.kv_size() / 2;
             }
         }
 #elif defined(GGML_USE_CLBLAST)
@@ -1287,9 +1366,12 @@ static bool llama_model_load(
         llama_vocab & vocab,
         int n_ctx,
         int n_batch,
+        int n_gqa,
+        float rms_norm_eps,
         int n_gpu_layers,
         int main_gpu,
         const float * tensor_split,
+        const bool mul_mat_q,
         float rope_freq_base,
         float rope_freq_scale,
         bool low_vram,
@@ -1300,7 +1382,8 @@ static bool llama_model_load(
         llama_progress_callback progress_callback,
         void *progress_callback_user_data) {
     try {
-        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch,
+        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, rms_norm_eps, n_gpu_layers,
+                main_gpu, tensor_split, mul_mat_q, rope_freq_base, rope_freq_scale, low_vram, memory_type,
                 use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
         return true;
     } catch (const std::exception & err) {
@@ -1309,32 +1392,15 @@ static bool llama_model_load(
     }
 }

-
-//
-// - lctx: llama context
-// - tokens: new batch of tokens to process
-// - embd embeddings input
-// - n_tokens number of tokens
-// - n_past: the context size so far
-// - n_threads: number of threads to use
-//
-static bool llama_eval_internal(
+static struct ggml_cgraph * llama_build_graph(
         llama_context & lctx,
         const llama_token * tokens,
         const float * embd,
         int n_tokens,
-        int n_past
-        int n_threads,
-        const char * cgraph_fname) {
+        int n_past) {

     LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));

-#ifdef GGML_USE_MPI
-    ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
-#endif
-
-    const int64_t t_start_us = ggml_time_us();
-
     const int N = n_tokens;

     const auto & model = lctx.model;
@@ -1344,40 +1410,54 @@ static bool llama_eval_internal(

     LLAMA_ASSERT(!!kv_self.ctx);

-    const
-    const
-    const
-    const
-    const
-    const
-    const
+    const int64_t n_embd = hparams.n_embd;
+    const int64_t n_layer = hparams.n_layer;
+    const int64_t n_ctx = hparams.n_ctx;
+    const int64_t n_head = hparams.n_head;
+    const int64_t n_head_kv = hparams.n_head_kv;
+    const int64_t n_embd_head = hparams.n_embd_head();
+    const int64_t n_embd_gqa = hparams.n_embd_gqa();
+
+    LLAMA_ASSERT(n_embd_head == hparams.n_rot);

     const float freq_base = hparams.rope_freq_base;
     const float freq_scale = hparams.rope_freq_scale;
+    const float rms_norm_eps = hparams.f_rms_norm_eps;
+
+    const int n_gpu_layers = model.n_gpu_layers;

     auto & mem_per_token = lctx.mem_per_token;
     auto & buf_compute = lctx.buf_compute;

+
     struct ggml_init_params params = {
         /*.mem_size =*/ buf_compute.size,
         /*.mem_buffer =*/ buf_compute.addr,
         /*.no_alloc =*/ false,
     };

-
+#ifdef LLAMA_USE_ALLOCATOR
+    params.no_alloc = true;
+#endif

-
+    struct ggml_context * ctx0 = ggml_init(params);

-
-    // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
-    n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
+    ggml_cgraph * gf = ggml_new_graph(ctx0);

     struct ggml_tensor * cur;
     struct ggml_tensor * inpL;

     if (tokens) {
         struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+
+#ifdef LLAMA_USE_ALLOCATOR
+        ggml_allocr_alloc(lctx.alloc, inp_tokens);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
+        }
+#else
         memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
+#endif
         ggml_set_name(inp_tokens, "inp_tokens");

         inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
@@ -1387,7 +1467,15 @@ static bool llama_eval_internal(
 #endif

         inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
+
+#ifdef LLAMA_USE_ALLOCATOR
+        ggml_allocr_alloc(lctx.alloc, inpL);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
+        }
+#else
         memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
+#endif
     }

     const int i_gpu_start = n_layer - n_gpu_layers;
@@ -1414,6 +1502,17 @@ static bool llama_eval_internal(
     }
 #endif // GGML_USE_CUBLAS

+    struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+#ifdef LLAMA_USE_ALLOCATOR
+    ggml_allocr_alloc(lctx.alloc, KQ_scale);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
+    }
+#else
+    ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
+#endif
+    ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
+
     for (int il = 0; il < n_layer; ++il) {
         ggml_format_name(inpL, "layer_inp_%d", il);

@@ -1431,7 +1530,7 @@ static bool llama_eval_internal(

         // norm
         {
-            cur = ggml_rms_norm(ctx0, inpL);
+            cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
             offload_func(cur);
             ggml_set_name(cur, "rms_norm_0");

@@ -1452,11 +1551,11 @@ static bool llama_eval_internal(
             offload_func_kq(tmpq);
             ggml_set_name(tmpq, "tmpq");

-            struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk,
+            struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
             offload_func_kq(Kcur);
             ggml_set_name(Kcur, "Kcur");

-            struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq,
+            struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
             offload_func_kq(Qcur);
             ggml_set_name(Qcur, "Qcur");

@@ -1468,23 +1567,23 @@ static bool llama_eval_internal(
                 offload_func_v(tmpv);
                 ggml_set_name(tmpv, "tmpv");

-                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv,
+                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, N));
                 offload_func_v(Vcur);
                 ggml_set_name(Vcur, "Vcur");

-                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*
+                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
                 offload_func_kq(k);
                 ggml_set_name(k, "k");

-                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N,
+                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
                         ( n_ctx)*ggml_element_size(kv_self.v),
-                        (il*n_ctx)*ggml_element_size(kv_self.v)*
+                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
                 offload_func_v(v);
                 ggml_set_name(v, "v");

                 // important: storing RoPE-ed version of K in the KV cache!
-                ggml_build_forward_expand(
-                ggml_build_forward_expand(
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
             }

             struct ggml_tensor * Q =
@@ -1497,8 +1596,8 @@ static bool llama_eval_internal(
             struct ggml_tensor * K =
                 ggml_permute(ctx0,
                         ggml_reshape_3d(ctx0,
-                            ggml_view_1d(ctx0, kv_self.k, (n_past + N)*
-
+                            ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd_gqa, il*n_ctx*ggml_element_size(kv_self.k)*n_embd_gqa),
+                            n_embd_head, n_head_kv, n_past + N),
                         0, 2, 1, 3);
             offload_func_kq(K);
             ggml_set_name(K, "K");
@@ -1508,10 +1607,7 @@ static bool llama_eval_internal(
             offload_func_kq(KQ);
             ggml_set_name(KQ, "KQ");

-            // KQ_scaled = KQ / sqrt(
-            struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head));
-            ggml_set_name(KQ_scale, "1/sqrt(n_embd/n_head)");
-
+            // KQ_scaled = KQ / sqrt(n_embd_head)
             // KQ_scaled shape [n_past + N, N, n_head, 1]
             struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
             offload_func_kq(KQ_scaled);
@@ -1530,10 +1626,10 @@ static bool llama_eval_internal(
             // split cached V into n_head heads
             struct ggml_tensor * V =
                 ggml_view_3d(ctx0, kv_self.v,
-                        n_past + N,
+                        n_past + N, n_embd_head, n_head_kv,
                         n_ctx*ggml_element_size(kv_self.v),
-                        n_ctx*ggml_element_size(kv_self.v)*
-
+                        n_ctx*ggml_element_size(kv_self.v)*n_embd_head,
+                        n_ctx*ggml_element_size(kv_self.v)*n_embd_gqa*il);
             offload_func_v(V);
             ggml_set_name(V, "V");

@@ -1545,7 +1641,7 @@ static bool llama_eval_internal(
             // make V contiguous in memory to speed up the matmul, however we waste time on the copy
             // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
             // is there a better way?
-            struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N,
+            struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head));
             struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
 #endif

@@ -1579,7 +1675,7 @@ static bool llama_eval_internal(
         {
             // norm
             {
-                cur = ggml_rms_norm(ctx0, inpFF);
+                cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
                 offload_func(cur);
                 ggml_set_name(cur, "rms_norm_1");

@@ -1627,12 +1723,9 @@ static bool llama_eval_internal(

     lctx.use_buf(ctx0, 0);

-    // used at the end to optionally extract the embeddings
-    struct ggml_tensor * embeddings = NULL;
-
     // norm
     {
-        cur = ggml_rms_norm(ctx0, inpL);
+        cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
         offload_func_nr(cur);
         ggml_set_name(cur, "rms_norm_2");

@@ -1640,8 +1733,6 @@ static bool llama_eval_internal(
         cur = ggml_mul(ctx0, cur, model.norm);
         // offload_func_nr(cur); // TODO CPU + GPU mirrored backend
         ggml_set_name(cur, "result_norm");
-
-        embeddings = cur;
     }

     // lm_head
@@ -1653,18 +1744,103 @@ static bool llama_eval_internal(
     // logits -> probs
     //cur = ggml_soft_max_inplace(ctx0, cur);

-
-
+    ggml_build_forward_expand(gf, cur);
+
+    if (mem_per_token == 0) {
+        mem_per_token = ggml_used_mem(ctx0)/N;
+    }
+
+#if 0
+    printf("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__,
+            ggml_used_mem(ctx0)/1024.0/1024.0,
+            lctx.get_buf_max_mem(0)/1024.0/1024.0,
+            lctx.get_buf_max_mem(1)/1024.0/1024.0,
+            lctx.work_buffer.size()/1024.0/1024.0,
+            n_past, N);
+#endif
+
+    ggml_free(ctx0);
+
+    return gf;
+}
+
+// evaluate the transformer
+//
+// - lctx: llama context
+// - tokens: new batch of tokens to process
+// - embd embeddings input
+// - n_tokens number of tokens
+// - n_past: the context size so far
+// - n_threads: number of threads to use
+//
+static bool llama_eval_internal(
+        llama_context & lctx,
+        const llama_token * tokens,
+        const float * embd,
+        int n_tokens,
+        int n_past,
+        int n_threads,
+        const char * cgraph_fname) {
+
+    LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
+
+    const int64_t t_start_us = ggml_time_us();
+
+#ifdef GGML_USE_MPI
+    ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
+#endif
+
+    const int N = n_tokens;
+
+    const auto & model = lctx.model;
+    const auto & hparams = model.hparams;
+
+    const auto & kv_self = lctx.kv_self;
+
+    LLAMA_ASSERT(!!kv_self.ctx);
+
+    const int64_t n_embd = hparams.n_embd;
+    const int64_t n_vocab = hparams.n_vocab;
+
+#ifdef LLAMA_USE_ALLOCATOR
+    ggml_allocr_reset(lctx.alloc);
+#endif
+
+    ggml_cgraph * gf = llama_build_graph(lctx, tokens, embd, n_tokens, n_past);
+
+#ifdef LLAMA_USE_ALLOCATOR
+    ggml_allocr_alloc_graph(lctx.alloc, gf);
+#endif
+
+    // fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
+
+    // for big prompts, if BLAS is enabled, it is better to use only one thread
+    // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
+    n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
+
+    struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
+    struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
+
+    LLAMA_ASSERT(strcmp(res->name, "result_output") == 0);
+    LLAMA_ASSERT(strcmp(embeddings->name, "result_norm") == 0);

 #if GGML_USE_MPI
-
+    const int64_t n_layer = hparams.n_layer;
+    ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
 #endif

 #ifdef GGML_USE_METAL
     if (lctx.ctx_metal && N == 1) {
+        // TODO: disabled until #2413 is resolved
+        //if (!ggml_metal_if_optimized(lctx.ctx_metal)) {
+        //    ggml_metal_graph_find_concurrency(lctx.ctx_metal, gf);
+        //}
         ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
-        ggml_metal_graph_compute(lctx.ctx_metal,
-        ggml_metal_get_tensor (lctx.ctx_metal,
+        ggml_metal_graph_compute(lctx.ctx_metal, gf);
+        ggml_metal_get_tensor (lctx.ctx_metal, res);
+        if (!lctx.embedding.empty()) {
+            ggml_metal_get_tensor(lctx.ctx_metal, embeddings);
+        }
     } else {
         // IMPORTANT:
         // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla
@@ -1682,34 +1858,32 @@ static bool llama_eval_internal(
             ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
         }

-        ggml_graph_compute_helper(lctx.work_buffer,
+        ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
     }
 #else
-    ggml_graph_compute_helper(lctx.work_buffer,
+    ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
 #endif

 #if GGML_USE_MPI
-    ggml_mpi_graph_compute_post(lctx.ctx_mpi,
+    ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
 #endif

     // update kv token count
     lctx.kv_self.n = n_past + N;

-    struct ggml_tensor * res = gf.nodes[gf.n_nodes - 1];
-
     if (cgraph_fname) {
-        ggml_graph_export(
+        ggml_graph_export(gf, cgraph_fname);
     }

 #ifdef GGML_PERF
     // print timing information per ggml operation (for debugging purposes)
     // requires GGML_PERF to be defined
-    ggml_graph_print(
+    ggml_graph_print(gf);
 #endif

     // plot the computation graph in dot format (for debugging purposes)
     //if (n_past%100 == 0) {
-    //    ggml_graph_dump_dot(
+    //    ggml_graph_dump_dot(gf, NULL, "llama.dot");
     //}

     // extract logits
@@ -1734,19 +1908,6 @@ static bool llama_eval_internal(
         memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd);
     }

-    if (mem_per_token == 0) {
-        mem_per_token = ggml_used_mem(ctx0)/N;
-    }
-
-#if 0
-    printf("\n%s: used_mem = %.3f MB, scratch -- %.3f MB %.3f MB\n", __func__,
-            ggml_used_mem(ctx0)/1024.0/1024.0,
-            lctx.get_buf_max_mem(0)/1024.0/1024.0,
-            lctx.get_buf_max_mem(1)/1024.0/1024.0);
-#endif
-
-    ggml_free(ctx0);
-
     // measure the performance only for the single-token evals
     if (N == 1) {
         lctx.t_eval_us += ggml_time_us() - t_start_us;
@@ -1858,7 +2019,9 @@ struct llama_tokenizer {
             if (token == vocab_.token_to_id.end()) {
                 // output any symbols that did not form tokens as bytes.
                 for (int j = 0; j < (int) symbol.n; ++j) {
-
+                    // NOTE: old version, before #2420 - not sure what are the implications of this
+                    //llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
+                    llama_vocab::id token_id = vocab_.token_to_id.at(std::string(1, symbol.text[j]));
                     output.push_back(token_id);
                 }
             } else {
@@ -1915,6 +2078,279 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
     return output;
 }

+//
+// grammar - internal
+//
+
+struct llama_grammar {
+    const std::vector<std::vector<llama_grammar_element>> rules;
+    std::vector<std::vector<const llama_grammar_element *>> stacks;
+};
+
+struct llama_grammar_candidate {
+    size_t index;
+    const uint32_t * code_points;
+};
+
+// NOTE: assumes valid utf8 (but checks for overrun)
+// adds a terminating 0 for use as pointer
+std::vector<uint32_t> decode_utf8(const char * src) {
+    static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
+    const char * pos = src;
+    std::vector<uint32_t> code_points;
+    while (*pos != 0) {
+        uint8_t first_byte = static_cast<uint8_t>(*pos);
+        uint8_t highbits = first_byte >> 4;
+        int len = lookup[highbits];
+        uint8_t mask = (1 << (8 - len)) - 1;
+        uint32_t value = first_byte & mask;
+        const char * end = pos + len; // may overrun!
+        ++pos;
+        for ( ; pos < end && *pos != 0; ++pos) {
+            value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
+        }
+        code_points.push_back(value);
+    }
+    code_points.push_back(0);
+    return code_points;
+}
+
+// returns true iff pos points to the end of one of the definitions of a rule
+static bool llama_grammar_is_end_of_sequence(const llama_grammar_element * pos) {
+    switch (pos->type) {
+        case LLAMA_GRETYPE_END: return true;
+        case LLAMA_GRETYPE_ALT: return true;
+        default: return false;
+    }
+}
+
+// returns true iff chr satisfies the char range at pos (regular or inverse range)
+// asserts that pos is pointing to a char range element
+static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
+        const llama_grammar_element * pos,
+        const uint32_t chr) {
+
+    bool found = false;
+    bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
+    LLAMA_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT);
+
+    do {
+        if (pos[1].type == LLAMA_GRETYPE_CHAR_RNG_UPPER) {
+            // inclusive range, e.g. [a-z]
+            found = found || (pos->value <= chr && chr <= pos[1].value);
+            pos += 2;
+        } else {
+            // exact char match, e.g. [a] or "a"
+            found = found || pos->value == chr;
+            pos += 1;
+        }
+    } while (pos->type == LLAMA_GRETYPE_CHAR_ALT);
+
+    return std::make_pair(found == is_positive_char, pos);
+}
+
+// transforms a grammar pushdown stack into N possible stacks, all ending
+// at a character range (terminal element)
+static void llama_grammar_advance_stack(
+        const std::vector<std::vector<llama_grammar_element>> & rules,
+        const std::vector<const llama_grammar_element *> & stack,
+        std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {
+
+    if (stack.empty()) {
+        new_stacks.push_back(stack);
+        return;
+    }
+
+    const llama_grammar_element * pos = stack.back();
+
+    switch (pos->type) {
+        case LLAMA_GRETYPE_RULE_REF: {
+            const size_t rule_id = static_cast<size_t>(pos->value);
+            const llama_grammar_element * subpos = rules[rule_id].data();
+            do {
+                // init new stack without the top (pos)
+                std::vector<const llama_grammar_element *> new_stack(stack.begin(), stack.end() - 1);
+                if (!llama_grammar_is_end_of_sequence(pos + 1)) {
+                    // if this rule ref is followed by another element, add that to stack
+                    new_stack.push_back(pos + 1);
+                }
+                if (!llama_grammar_is_end_of_sequence(subpos)) {
+                    // if alternate is nonempty, add to stack
+                    new_stack.push_back(subpos);
+                }
+                llama_grammar_advance_stack(rules, new_stack, new_stacks);
+                while (!llama_grammar_is_end_of_sequence(subpos)) {
+                    // scan to end of alternate def
+                    subpos++;
+                }
+                if (subpos->type == LLAMA_GRETYPE_ALT) {
+                    // there's another alternate def of this rule to process
+                    subpos++;
+                } else {
+                    break;
+                }
+            } while (true);
+            break;
+        }
+        case LLAMA_GRETYPE_CHAR:
+        case LLAMA_GRETYPE_CHAR_NOT:
+            new_stacks.push_back(stack);
+            break;
+        default:
+            // end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
+            // (LLAMA_GRETYPE_CHAR_ALT, LLAMA_GRETYPE_CHAR_RNG_UPPER); stack should never be left on
+            // those
+            LLAMA_ASSERT(false);
+    }
+}
+
+// takes a set of possible pushdown stacks on a grammar, which are required to
+// be positioned at a character range (see `llama_grammar_advance_stack`), and
+// produces the N possible stacks if the given char is accepted at those
+// positions
+static std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
+        const std::vector<std::vector<llama_grammar_element>> & rules,
+        const std::vector<std::vector<const llama_grammar_element *>> & stacks,
+        const uint32_t chr) {
+
+    std::vector<std::vector<const llama_grammar_element *>> new_stacks;
+
+    for (const auto & stack : stacks) {
+        if (stack.empty()) {
+            continue;
+        }
+
+        auto match = llama_grammar_match_char(stack.back(), chr);
+        if (match.first) {
+            const llama_grammar_element * pos = match.second;
+
+            // update top of stack to next element, if any
+            std::vector<const llama_grammar_element *> new_stack(stack.begin(), stack.end() - 1);
+            if (!llama_grammar_is_end_of_sequence(pos)) {
+                new_stack.push_back(pos);
+            }
+            llama_grammar_advance_stack(rules, new_stack, new_stacks);
+        }
+    }
+
+    return new_stacks;
+}
+
+static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
+        const std::vector<std::vector<llama_grammar_element>> & rules,
+        const std::vector<std::vector<const llama_grammar_element *>> & stacks,
+        const std::vector<llama_grammar_candidate> & candidates);
+
+static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_stack(
+        const std::vector<std::vector<llama_grammar_element>> & rules,
+        const std::vector<const llama_grammar_element *> & stack,
+        const std::vector<llama_grammar_candidate> & candidates) {
+
+    std::vector<llama_grammar_candidate> rejects;
+
+    if (stack.empty()) {
+        // accept nothing; EOS is handled elsewhere
+        rejects.insert(rejects.end(), candidates.begin(), candidates.end());
+        return rejects;
+    }
+
+    const llama_grammar_element * stack_pos = stack.back();
+
+    std::vector<llama_grammar_candidate> next_candidates;
+    for (auto tok : candidates) {
+        if (llama_grammar_match_char(stack_pos, tok.code_points[0]).first) {
+            if (tok.code_points[1] != 0) {
+                next_candidates.push_back({ tok.index, tok.code_points + 1 });
+            }
+        } else {
+            rejects.push_back(tok);
+        }
+    }
+
+    auto stack_pos_after = llama_grammar_match_char(stack_pos, 0).second;
+
+    // update top of stack to next element, if any
+    std::vector<const llama_grammar_element *> stack_after(stack.begin(), stack.end() - 1);
+    if (!llama_grammar_is_end_of_sequence(stack_pos_after)) {
+        stack_after.push_back(stack_pos_after);
+    }
+    std::vector<std::vector<const llama_grammar_element *>> next_stacks;
+    llama_grammar_advance_stack(rules, stack_after, next_stacks);
+
+    auto next_rejects = llama_grammar_reject_candidates(rules, next_stacks, next_candidates);
+    for (auto tok : next_rejects) {
+        rejects.push_back({ tok.index, tok.code_points - 1 });
+    }
+
+    return rejects;
+}
+
+static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
+        const std::vector<std::vector<llama_grammar_element>> & rules,
+        const std::vector<std::vector<const llama_grammar_element *>> & stacks,
+        const std::vector<llama_grammar_candidate> & candidates) {
+    LLAMA_ASSERT(!stacks.empty()); // REVIEW
+
+    if (candidates.empty()) {
+        return std::vector<llama_grammar_candidate>();
+    }
+
+    auto rejects = llama_grammar_reject_candidates_for_stack(rules, stacks.front(), candidates);
+
+    for (size_t i = 1, size = stacks.size(); i < size; ++i) {
+        rejects = llama_grammar_reject_candidates_for_stack(rules, stacks[i], rejects);
+    }
+    return rejects;
+}
+
+//
+// grammar - external
+//
+
+struct llama_grammar * llama_grammar_init(
+        const llama_grammar_element ** rules,
+        size_t n_rules,
+        size_t start_rule_index) {
+    const llama_grammar_element * pos;
+
+    // copy rule definitions into vectors
+    std::vector<std::vector<llama_grammar_element>> vec_rules(n_rules);
+    for (size_t i = 0; i < n_rules; i++) {
+        for (pos = rules[i]; pos->type != LLAMA_GRETYPE_END; pos++) {
+            vec_rules[i].push_back(*pos);
+        }
+        vec_rules[i].push_back({LLAMA_GRETYPE_END, 0});
+    }
+
+    // loop over alternates of start rule to build initial stacks
+    std::vector<std::vector<const llama_grammar_element *>> stacks;
+    pos = rules[start_rule_index];
+    do {
+        std::vector<const llama_grammar_element *> stack;
+        if (!llama_grammar_is_end_of_sequence(pos)) {
+            // if alternate is nonempty, add to stack
+            stack.push_back(pos);
+        }
+        llama_grammar_advance_stack(vec_rules, stack, stacks);
+        while (!llama_grammar_is_end_of_sequence(pos)) {
+            // scan to end of alternate def
+            pos++;
+        }
+        if (pos->type == LLAMA_GRETYPE_ALT) {
+            // there's another alternate def of this rule to process
+            pos++;
+        } else {
+            break;
+        }
+    } while (true);
+
+    return new llama_grammar{ std::move(vec_rules), std::move(stacks) };
+}
+
+void llama_grammar_free(struct llama_grammar * grammar) {
+    delete grammar;
+}
+
 //
 // sampling
 //
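`llama_grammar_init` takes rules that are already lowered to flat arrays of `llama_grammar_element` (the `LLAMA_GRETYPE_*` values used above are declared in the updated llama.h); parsing a text-format grammar is left to the caller. A minimal sketch, under that assumption, of a single-rule grammar that only accepts ASCII digits — `make_digit_grammar` is a hypothetical helper, not part of the library:

```cpp
#include "llama.h"

// Hypothetical helper: builds "root ::= [0-9]" directly as grammar elements.
// A CHAR element followed by CHAR_RNG_UPPER forms one inclusive range, per
// llama_grammar_match_char above; END closes the only alternate of the rule.
static llama_grammar * make_digit_grammar() {
    static const llama_grammar_element root_rule[] = {
        { LLAMA_GRETYPE_CHAR,           '0' },
        { LLAMA_GRETYPE_CHAR_RNG_UPPER, '9' },
        { LLAMA_GRETYPE_END,            0   },
    };
    const llama_grammar_element * rules[] = { root_rule };
    return llama_grammar_init(rules, /*n_rules =*/ 1, /*start_rule_index =*/ 0);
}
```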
@@ -2200,6 +2636,47 @@ void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, l
     }
 }

+void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar) {
+    assert(ctx);
+    const int64_t t_start_sample_us = ggml_time_us();
+
+    bool allow_eos = false;
+    for (const auto & stack : grammar->stacks) {
+        if (stack.empty()) {
+            allow_eos = true;
+            break;
+        }
+    }
+
+    const llama_token eos = llama_token_eos();
+
+    std::vector<std::vector<uint32_t>> candidates_decoded;
+    std::vector<llama_grammar_candidate> candidates_grammar;
+
+    for (size_t i = 0; i < candidates->size; ++i) {
+        const llama_token id = candidates->data[i].id;
+        const char * str = llama_token_to_str(ctx, id);
+        if (id == eos) {
+            if (!allow_eos) {
+                candidates->data[i].logit = -INFINITY;
+            }
+        } else if (*str == 0) {
+            candidates->data[i].logit = -INFINITY;
+        } else {
+            candidates_decoded.push_back(decode_utf8(str));
+            candidates_grammar.push_back({ i, candidates_decoded.back().data() });
+        }
+    }
+
+    const auto rejects =
+        llama_grammar_reject_candidates(grammar->rules, grammar->stacks, candidates_grammar);
+    for (auto & reject : rejects) {
+        candidates->data[reject.index].logit = -INFINITY;
+    }
+
+    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+}
+
 static void llama_log_softmax(float * array, size_t size) {
     float max_l = *std::max_element(array, array + size);
     float sum = 0.f;
@@ -2375,6 +2852,29 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
     return result;
 }

+void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token) {
+    const int64_t t_start_sample_us = ggml_time_us();
+
+    if (token == llama_token_eos()) {
+        for (const auto & stack : grammar->stacks) {
+            if (stack.empty()) {
+                return;
+            }
+        }
+        LLAMA_ASSERT(false);
+    }
+
+    const char * str = llama_token_to_str(ctx, token);
+    // Note terminating 0 in decoded string
+    auto code_points = decode_utf8(str);
+    for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
+        grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
+    }
+    LLAMA_ASSERT(!grammar->stacks.empty());
+
+    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+}
+
 //
 // quantization
 //
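In a generation loop the two entry points above bracket ordinary sampling: `llama_sample_grammar` masks candidates that cannot continue any grammar stack, and `llama_grammar_accept_token` advances the stacks once a token is committed. A hedged sketch of one step (greedy sampling for brevity; `ctx` is assumed to hold fresh logits and `grammar` to come from `llama_grammar_init`):

```cpp
#include <vector>
#include "llama.h"

// One grammar-constrained sampling step; a sketch, not library code.
static llama_token sample_with_grammar(llama_context * ctx, llama_grammar * grammar) {
    const int     n_vocab = llama_n_vocab(ctx);
    const float * logits  = llama_get_logits(ctx);

    std::vector<llama_token_data> cur;
    cur.reserve(n_vocab);
    for (llama_token id = 0; id < n_vocab; ++id) {
        cur.push_back({ id, logits[id], 0.0f });
    }
    llama_token_data_array candidates = { cur.data(), cur.size(), false };

    llama_sample_grammar(ctx, &candidates, grammar);  // sets rejected tokens' logits to -INFINITY
    const llama_token tok = llama_sample_token_greedy(ctx, &candidates);
    llama_grammar_accept_token(ctx, grammar, tok);    // keep the grammar state in sync
    return tok;
}
```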
@@ -2448,8 +2948,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
         case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
         case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
-        case LLAMA_FTYPE_MOSTLY_F16:
-        case LLAMA_FTYPE_ALL_F32:
+        case LLAMA_FTYPE_MOSTLY_F16: quantized_type = GGML_TYPE_F16; break;
+        case LLAMA_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break;

 #ifdef GGML_USE_K_QUANTS
         // K-quants
@@ -2533,16 +3033,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         } else {
             new_type = quantized_type;
 #ifdef GGML_USE_K_QUANTS
-            bool convert_incompatible_tensor = false;
-            if (quantized_type == GGML_TYPE_Q2_K || quantized_type == GGML_TYPE_Q3_K || quantized_type == GGML_TYPE_Q4_K ||
-                quantized_type == GGML_TYPE_Q5_K || quantized_type == GGML_TYPE_Q6_K) {
-                int nx = tensor.ne.at(0);
-                int ny = tensor.ne.at(1);
-                if (nx % QK_K != 0 || ny % QK_K != 0) {
-                    fprintf(stderr, "\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
-                    convert_incompatible_tensor = true;
-                }
-            }
             if (tensor.name == "output.weight") {
                 int nx = tensor.ne.at(0);
                 int ny = tensor.ne.at(1);
@@ -2568,6 +3058,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
             }
+            bool convert_incompatible_tensor = false;
+            if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
+                new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
+                int nx = tensor.ne.at(0);
+                int ny = tensor.ne.at(1);
+                if (nx % QK_K != 0 || ny % QK_K != 0) {
+                    fprintf(stderr, "\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
+                    convert_incompatible_tensor = true;
+                }
+            }
             if (convert_incompatible_tensor) {
                 if (tensor.name == "output.weight") {
                     new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
@@ -2594,7 +3094,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 f32_data = (float *) f32_conv_buf.addr;
             }

-            printf("quantizing .. ");
+            printf("quantizing to %s .. ", ggml_type_name(new_type));
             fflush(stdout);

             work.resize(nelements * 4); // upper bound on size
@@ -2697,8 +3197,8 @@ struct llama_model * llama_load_model_from_file(

     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;

-    if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
-                params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
+    if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gqa, params.rms_norm_eps, params.n_gpu_layers,
+                params.main_gpu, params.tensor_split, params.mul_mat_q, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
                 memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback,
                 params.progress_callback_user_data)) {
         delete model;
@@ -2775,10 +3275,47 @@ struct llama_context * llama_new_context_with_model(
             ctx->embedding.resize(hparams.n_embd);
         }

-
+#ifdef LLAMA_USE_ALLOCATOR
+        {
+            static const size_t tensor_alignment = 32;
+            // the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data
+            ctx->buf_compute.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead());
+
+            // create measure allocator
+            ctx->alloc = ggml_allocr_new_measure(tensor_alignment);
+
+            // build worst-case graph
+            int n_tokens = std::min((int)hparams.n_ctx, params.n_batch);
+            int n_past = hparams.n_ctx - n_tokens;
+            llama_token token = llama_token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
+            ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past);
+
+            // measure memory requirements for the graph
+            size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;

+            fprintf(stderr, "%s: compute buffer total size = %7.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
+
+            // debug - for comparison with scratch buffer
+            //size_t prev_req =
+            //    MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type) +
+            //    MEM_REQ_SCRATCH1().at(ctx->model.type) +
+            //    MEM_REQ_EVAL().at(ctx->model.type);
+            //fprintf(stderr, "%s: (debug) equivalent with scratch buffer = %7.2f MB\n", __func__, prev_req / 1024.0 / 1024.0);
+
+            // recreate allocator with exact memory requirements
+            ggml_allocr_free(ctx->alloc);
+
+            ctx->buf_alloc.resize(alloc_size);
+            ctx->alloc = ggml_allocr_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, tensor_alignment);
+        }
+#else
+        ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead());
+#endif
+
+#ifdef LLAMA_USE_SCRATCH
         ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type));
         ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
+#endif
     }

 #ifdef GGML_USE_METAL
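Stripped of the llama-specific details, the block above is the measure-then-commit pattern that the newly vendored ggml-alloc is built around: size the worst-case graph with a measuring allocator, then recreate the allocator over an exactly-sized buffer. A sketch under those assumptions — `build_worst_case_graph` is a hypothetical stand-in for `llama_build_graph` and must create its tensors with `no_alloc = true`:

```cpp
#include <cstdlib>
#include "ggml.h"
#include "ggml-alloc.h"

// Hypothetical graph builder supplied by the caller (tensors created with no_alloc = true).
extern struct ggml_cgraph * build_worst_case_graph(void);

static struct ggml_allocr * make_exact_allocator(void ** out_buf, size_t alignment) {
    // 1. dry run: a measure allocator records sizes without touching real memory
    struct ggml_allocr * measure = ggml_allocr_new_measure(alignment);
    const size_t needed = ggml_allocr_alloc_graph(measure, build_worst_case_graph()) + alignment;
    ggml_allocr_free(measure);

    // 2. real run: back a fresh allocator with an exactly-sized buffer for tensor data
    *out_buf = std::malloc(needed);
    return ggml_allocr_new(*out_buf, needed, alignment);
}
```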
@@ -2799,7 +3336,7 @@ struct llama_context * llama_new_context_with_model(

         const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);

-
+        fprintf(stderr, "%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);

 #define LLAMA_METAL_CHECK_BUF(result) \
     if (!(result)) { \
@@ -2848,9 +3385,6 @@ struct llama_context * llama_init_from_file(
 }

 void llama_free(struct llama_context * ctx) {
-    if (ctx->model_owner) {
-        delete &ctx->model;
-    }
     delete ctx;
 }

@@ -3260,7 +3794,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
     const auto & kv_self = ctx->kv_self;
     const auto & hparams = ctx->model.hparams;
     const int n_layer = hparams.n_layer;
-    const int n_embd = hparams.
+    const int n_embd = hparams.n_embd_gqa();
     const int n_ctx = hparams.n_ctx;

     const size_t kv_size = kv_self.buf.size;
@@ -3363,7 +3897,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
     const auto & kv_self = ctx->kv_self;
     const auto & hparams = ctx->model.hparams;
     const int n_layer = hparams.n_layer;
-    const int n_embd = hparams.
+    const int n_embd = hparams.n_embd_gqa();
     const int n_ctx = hparams.n_ctx;

     size_t kv_size;
|