llama_cpp 0.3.5 → 0.3.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/README.md +18 -2
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +22 -8
- data/ext/llama_cpp/src/ggml-alloc.c +549 -0
- data/ext/llama_cpp/src/ggml-alloc.h +22 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +2526 -430
- data/ext/llama_cpp/src/ggml-cuda.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.m +56 -34
- data/ext/llama_cpp/src/ggml-metal.metal +4 -1
- data/ext/llama_cpp/src/ggml.c +445 -176
- data/ext/llama_cpp/src/ggml.h +125 -33
- data/ext/llama_cpp/src/k_quants.c +32 -30
- data/ext/llama_cpp/src/llama-util.h +41 -1
- data/ext/llama_cpp/src/llama.cpp +409 -210
- data/ext/llama_cpp/src/llama.h +19 -1
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- metadata +4 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -56,8 +56,21 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
+static void llama_log_internal(llama_log_level level, const char* format, ...);
+static void llama_log_callback_default(llama_log_level level, const char * text, void * user_data);
+#define LLAMA_LOG_INFO(...) llama_log_internal(LLAMA_LOG_LEVEL_INFO , __VA_ARGS__)
+#define LLAMA_LOG_WARN(...) llama_log_internal(LLAMA_LOG_LEVEL_WARN , __VA_ARGS__)
+#define LLAMA_LOG_ERROR(...) llama_log_internal(LLAMA_LOG_LEVEL_ERROR, __VA_ARGS__)
+
+
+#if !defined(GGML_USE_CUBLAS) && !defined(GGML_USE_METAL)
+#include "ggml-alloc.h"
+#define LLAMA_USE_ALLOCATOR
+#else
 #define LLAMA_USE_SCRATCH
 #define LLAMA_MAX_SCRATCH_BUFFERS 16
+#endif
+
 
 // available llama models
 enum e_model {
@@ -143,7 +156,7 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
 }
 
 // amount of VRAM needed per batch size to hold temporary results
-// the values for 3b
+// the values for 3b are not derived from testing but instead chosen conservatively
 static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
 {
 static std::map<e_model, size_t> k_sizes = {
@@ -151,14 +164,14 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
 { MODEL_7B, 512ull * kB },
 { MODEL_13B, 640ull * kB },
 { MODEL_30B, 768ull * kB },
-{ MODEL_65B,
-{ MODEL_70B,
+{ MODEL_65B, 1280ull * kB },
+{ MODEL_70B, 1280ull * kB },
 };
 return k_sizes;
 }
 
 // amount of VRAM needed per batch size and context to hold temporary results
-// the values for 3b
+// the values for 3b are not derived from testing but instead chosen conservatively
 static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
 {
 static std::map<e_model, size_t> k_sizes = {
@@ -166,8 +179,8 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
 { MODEL_7B, 128ull },
 { MODEL_13B, 160ull },
 { MODEL_30B, 208ull },
-{ MODEL_65B,
-{ MODEL_70B,
+{ MODEL_65B, 256ull },
+{ MODEL_70B, 256ull },
 };
 return k_sizes;
 }
@@ -327,13 +340,22 @@ struct llama_model {
 
 struct llama_context {
 llama_context(const llama_model & model) : model(model), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
-#ifdef GGML_USE_METAL
 ~llama_context() {
+if (model_owner) {
+delete &model;
+}
+#ifdef GGML_USE_METAL
 if (ctx_metal) {
 ggml_metal_free(ctx_metal);
 }
-}
 #endif
+#ifdef LLAMA_USE_ALLOCATOR
+if (alloc) {
+ggml_allocr_free(alloc);
+}
+#endif
+}
+
 std::mt19937 rng;
 
 bool has_evaluated_once = false;
@@ -371,7 +393,17 @@ struct llama_context {
 // memory buffers used to evaluate the model
 // TODO: move in llama_state
 llama_ctx_buffer buf_compute;
+
+#ifdef LLAMA_USE_ALLOCATOR
+llama_ctx_buffer buf_alloc;
+ggml_allocr * alloc = NULL;
+#endif
+
+#ifdef LLAMA_USE_SCRATCH
 llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
+int buf_last = 0;
+size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
+#endif
 
 #ifdef GGML_USE_METAL
 ggml_metal_context * ctx_metal = NULL;
@@ -381,9 +413,6 @@ struct llama_context {
 ggml_mpi_context * ctx_mpi = NULL;
 #endif
 
-int buf_last = 0;
-size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
-
 void use_buf(struct ggml_context * ctx, int i) {
 #if defined(LLAMA_USE_SCRATCH)
 size_t last_size = 0;
@@ -416,6 +445,14 @@ struct llama_context {
 }
 };
 
+struct llama_state {
+// We save the log callback globally
+llama_log_callback log_callback = llama_log_callback_default;
+void * log_callback_user_data = nullptr;
+};
+// global state
+static llama_state g_state;
+
 template <typename T>
 static T checked_mul(T a, T b) {
 T ret = a * b;
@@ -482,7 +519,7 @@ struct llama_file_loader {
 
 llama_file_loader(const char * fname, llama_load_tensors_map & tensors_map)
 : file(fname, "rb") {
-
+LLAMA_LOG_INFO("llama.cpp: loading model from %s\n", fname);
 read_magic();
 read_hparams();
 read_vocab();
@@ -597,7 +634,7 @@ struct llama_file_saver {
 llama_file_loader * any_file_loader;
 llama_file_saver(const char * fname, llama_file_loader * any_file_loader, enum llama_ftype new_ftype)
 : file(fname, "wb"), any_file_loader(any_file_loader) {
-
+LLAMA_LOG_INFO("llama.cpp: saving model to %s\n", fname);
 write_magic();
 write_hparams(new_ftype);
 write_vocab();
@@ -618,7 +655,7 @@ struct llama_file_saver {
 }
 void write_vocab() {
 if (any_file_loader->file_version == LLAMA_FILE_VERSION_GGML) {
-
+LLAMA_LOG_WARN("llama.cpp: WARNING: input is an old file that doesn't have scores; will add dummy scores\n");
 }
 uint32_t n_vocab = any_file_loader->hparams.n_vocab;
 for (uint32_t i = 0; i < n_vocab; i++) {
@@ -725,12 +762,12 @@ struct llama_model_loader {
 
 void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
 size_t data_size = 0;
-size_t prefetch_size =
+size_t prefetch_size = file_loader->file.size;
 size_t lock_size = 0;
 for (const llama_load_tensor & lt : tensors_map.tensors) {
 data_size += lt.size;
-if (lt.ggml_tensor->backend
-prefetch_size
+if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) {
+prefetch_size -= lt.size;
 }
 }
 
@@ -809,7 +846,7 @@ struct llama_model_loader {
 uint8_t byte = lt.data[i];
 sum = byte + (sum << 6) + (sum << 16) - sum; // sdbm hash
 }
-
+LLAMA_LOG_INFO("%s checksum: %#08x (%s, size %zu)\n", lt.name.c_str(), sum,
 llama_format_tensor_shape(lt.ne).c_str(), lt.size);
 }
 
@@ -842,7 +879,7 @@ static bool kv_cache_init(
 cache.ctx = ggml_init(params);
 
 if (!cache.ctx) {
-
+LLAMA_LOG_ERROR("%s: failed to allocate memory for kv cache\n", __func__);
 return false;
 }
 
@@ -879,6 +916,7 @@ struct llama_context_params llama_context_default_params() {
 /*.progress_callback =*/ nullptr,
 /*.progress_callback_user_data =*/ nullptr,
 /*.low_vram =*/ false,
+/*.mul_mat_q =*/ false,
 /*.f16_kv =*/ true,
 /*.logits_all =*/ false,
 /*.vocab_only =*/ false,
@@ -1006,6 +1044,7 @@ static void llama_model_load_internal(
 int n_gpu_layers,
 int main_gpu,
 const float * tensor_split,
+const bool mul_mat_q,
 float rope_freq_base,
 float rope_freq_scale,
 bool low_vram,
@@ -1052,7 +1091,7 @@ static void llama_model_load_internal(
 LLAMA_ASSERT(hparams.n_head % n_gqa == 0);
 hparams.n_head_kv = hparams.n_head / n_gqa;
 if (model.type == e_model::MODEL_65B && n_gqa == 8) {
-
+LLAMA_LOG_WARN("%s: warning: assuming 70B model based on GQA == %d\n", __func__, n_gqa);
 model.type = e_model::MODEL_70B;
 hparams.f_ffn_mult = 1.3f; // from the params.json of the 70B model
 }
@@ -1068,22 +1107,22 @@ static void llama_model_load_internal(
 //const uint32_t n_ff = 28672;
 
 {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(file_version));
+LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab);
+LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, hparams.n_ctx);
+LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd);
+LLAMA_LOG_INFO("%s: n_mult = %u\n", __func__, hparams.n_mult);
+LLAMA_LOG_INFO("%s: n_head = %u\n", __func__, hparams.n_head);
+LLAMA_LOG_INFO("%s: n_head_kv = %u\n", __func__, hparams.n_head_kv);
+LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
+LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
+LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa());
+LLAMA_LOG_INFO("%s: rnorm_eps = %.1e\n", __func__, hparams.f_rms_norm_eps);
+LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, n_ff);
+LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base);
+LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale);
+LLAMA_LOG_INFO("%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
+LLAMA_LOG_INFO("%s: model size = %s\n", __func__, llama_model_type_name(model.type));
 }
 
 if (file_version < LLAMA_FILE_VERSION_GGJT_V2) {
@@ -1111,7 +1150,7 @@ static void llama_model_load_internal(
 size_t ctx_size;
 size_t mmapped_size;
 ml->calc_sizes(&ctx_size, &mmapped_size);
-
+LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0);
 
 // create the ggml context
 {
@@ -1134,13 +1173,15 @@ static void llama_model_load_internal(
 }
 
 (void) main_gpu;
+(void) mul_mat_q;
 #if defined(GGML_USE_CUBLAS)
-
+LLAMA_LOG_INFO("%s: using CUDA for GPU acceleration\n", __func__);
 ggml_cuda_set_main_device(main_gpu);
+ggml_cuda_set_mul_mat_q(mul_mat_q);
 #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
 #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
 #elif defined(GGML_USE_CLBLAST)
-
+LLAMA_LOG_INFO("%s: using OpenCL for GPU acceleration\n", __func__);
 #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
 #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU
 #else
@@ -1230,25 +1271,29 @@ static void llama_model_load_internal(
 const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
 
 // this is the total memory required to run the inference
-
+size_t mem_required =
 ctx_size +
-mmapped_size - vram_weights
+mmapped_size - vram_weights; // weights in VRAM not in memory
+
+#ifndef LLAMA_USE_ALLOCATOR
+mem_required +=
 MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) +
 MEM_REQ_SCRATCH1().at(model.type) +
 MEM_REQ_EVAL().at(model.type);
+#endif
 
 // this is the memory required by one llama_state
 const size_t mem_required_state =
 scale*hparams.kv_size();
 
-
+LLAMA_LOG_INFO("%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
 mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
 
 (void) vram_scratch;
 (void) n_batch;
 #ifdef GGML_USE_CUBLAS
 if (low_vram) {
-
+LLAMA_LOG_INFO("%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
 ggml_cuda_set_scratch_size(0); // disable scratch
 } else {
 const size_t vram_scratch_base = VRAM_REQ_SCRATCH_BASE().at(model.type);
@@ -1256,7 +1301,7 @@ static void llama_model_load_internal(
 vram_scratch = n_batch * (vram_scratch_base + n_ctx * vram_scratch_per_context);
 ggml_cuda_set_scratch_size(vram_scratch);
 if (n_gpu_layers > 0) {
-
+LLAMA_LOG_INFO("%s: allocating batch_size x (%zd kB + n_ctx x %zd B) = %zd MB VRAM for the scratch buffer\n",
 __func__, vram_scratch_base / kB, vram_scratch_per_context,
 (vram_scratch + MB - 1) / MB); // round up
 }
@@ -1266,9 +1311,9 @@ static void llama_model_load_internal(
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
 const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
 
-
+LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
 if (n_gpu_layers > (int) hparams.n_layer) {
-
+LLAMA_LOG_INFO("%s: offloading non-repeating layers to GPU\n", __func__);
 }
 size_t vram_kv_cache = 0;
 
@@ -1277,17 +1322,17 @@ static void llama_model_load_internal(
 const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
 if (n_gpu_layers > (int) hparams.n_layer + 1) {
 if (low_vram) {
-
+LLAMA_LOG_INFO("%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
 } else {
-
+LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
 vram_kv_cache += hparams.kv_size() / 2;
 }
 }
 if (n_gpu_layers > (int) hparams.n_layer + 2) {
 if (low_vram) {
-
+LLAMA_LOG_WARN("%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
 } else {
-
+LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
 vram_kv_cache += hparams.kv_size() / 2;
 }
 }
@@ -1296,9 +1341,9 @@ static void llama_model_load_internal(
 const int max_offloadable_layers = hparams.n_layer + 1;
 #endif // GGML_USE_CUBLAS
 
-
+LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n",
 __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
-
+LLAMA_LOG_INFO("%s: total VRAM used: %zu MB\n",
 __func__, (vram_weights + vram_scratch + vram_kv_cache + MB - 1) / MB); // round up
 #else
 (void) n_gpu_layers;
@@ -1341,6 +1386,7 @@ static bool llama_model_load(
 int n_gpu_layers,
 int main_gpu,
 const float * tensor_split,
+const bool mul_mat_q,
 float rope_freq_base,
 float rope_freq_scale,
 bool low_vram,
@@ -1351,41 +1397,25 @@ static bool llama_model_load(
 llama_progress_callback progress_callback,
 void *progress_callback_user_data) {
 try {
-llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, rms_norm_eps, n_gpu_layers,
+llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, rms_norm_eps, n_gpu_layers,
+main_gpu, tensor_split, mul_mat_q, rope_freq_base, rope_freq_scale, low_vram, memory_type,
 use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
 return true;
 } catch (const std::exception & err) {
-
+LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
 return false;
 }
 }
 
-
-//
-// - lctx: llama context
-// - tokens: new batch of tokens to process
-// - embd embeddings input
-// - n_tokens number of tokens
-// - n_past: the context size so far
-// - n_threads: number of threads to use
-//
-static bool llama_eval_internal(
+static struct ggml_cgraph * llama_build_graph(
 llama_context & lctx,
 const llama_token * tokens,
 const float * embd,
 int n_tokens,
-int n_past
-int n_threads,
-const char * cgraph_fname) {
+int n_past) {
 
 LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
 
-#ifdef GGML_USE_MPI
-ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
-#endif
-
-const int64_t t_start_us = ggml_time_us();
-
 const int N = n_tokens;
 
 const auto & model = lctx.model;
@@ -1401,10 +1431,8 @@ static bool llama_eval_internal(
 const int64_t n_head = hparams.n_head;
 const int64_t n_head_kv = hparams.n_head_kv;
 const int64_t n_embd_head = hparams.n_embd_head();
-const int64_t n_vocab = hparams.n_vocab;
 const int64_t n_embd_gqa = hparams.n_embd_gqa();
 
-
 LLAMA_ASSERT(n_embd_head == hparams.n_rot);
 
 const float freq_base = hparams.rope_freq_base;
@@ -1416,26 +1444,35 @@ static bool llama_eval_internal(
 auto & mem_per_token = lctx.mem_per_token;
 auto & buf_compute = lctx.buf_compute;
 
+
 struct ggml_init_params params = {
 /*.mem_size =*/ buf_compute.size,
 /*.mem_buffer =*/ buf_compute.addr,
 /*.no_alloc =*/ false,
 };
 
+#ifdef LLAMA_USE_ALLOCATOR
+params.no_alloc = true;
+#endif
+
 struct ggml_context * ctx0 = ggml_init(params);
 
 ggml_cgraph * gf = ggml_new_graph(ctx0);
 
-// for big prompts, if BLAS is enabled, it is better to use only one thread
-// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
-n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
-
 struct ggml_tensor * cur;
 struct ggml_tensor * inpL;
 
 if (tokens) {
 struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+
+#ifdef LLAMA_USE_ALLOCATOR
+ggml_allocr_alloc(lctx.alloc, inp_tokens);
+if (!ggml_allocr_is_measure(lctx.alloc)) {
+memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
+}
+#else
 memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
+#endif
 ggml_set_name(inp_tokens, "inp_tokens");
 
 inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
@@ -1445,7 +1482,15 @@ static bool llama_eval_internal(
 #endif
 
 inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
+
+#ifdef LLAMA_USE_ALLOCATOR
+ggml_allocr_alloc(lctx.alloc, inpL);
+if (!ggml_allocr_is_measure(lctx.alloc)) {
+memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
+}
+#else
 memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
+#endif
 }
 
 const int i_gpu_start = n_layer - n_gpu_layers;
@@ -1472,6 +1517,17 @@ static bool llama_eval_internal(
 }
 #endif // GGML_USE_CUBLAS
 
+struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+#ifdef LLAMA_USE_ALLOCATOR
+ggml_allocr_alloc(lctx.alloc, KQ_scale);
+if (!ggml_allocr_is_measure(lctx.alloc)) {
+ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
+}
+#else
+ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
+#endif
+ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
+
 for (int il = 0; il < n_layer; ++il) {
 ggml_format_name(inpL, "layer_inp_%d", il);
 
@@ -1567,9 +1623,6 @@ static bool llama_eval_internal(
 ggml_set_name(KQ, "KQ");
 
 // KQ_scaled = KQ / sqrt(n_embd_head)
-struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head));
-ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
-
 // KQ_scaled shape [n_past + N, N, n_head, 1]
 struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
 offload_func_kq(KQ_scaled);
@@ -1685,9 +1738,6 @@ static bool llama_eval_internal(
 
 lctx.use_buf(ctx0, 0);
 
-// used at the end to optionally extract the embeddings
-struct ggml_tensor * embeddings = NULL;
-
 // norm
 {
 cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
@@ -1698,8 +1748,6 @@ static bool llama_eval_internal(
 cur = ggml_mul(ctx0, cur, model.norm);
 // offload_func_nr(cur); // TODO CPU + GPU mirrored backend
 ggml_set_name(cur, "result_norm");
-
-embeddings = cur;
 }
 
 // lm_head
@@ -1711,12 +1759,88 @@ static bool llama_eval_internal(
 // logits -> probs
 //cur = ggml_soft_max_inplace(ctx0, cur);
 
-// run the computation
 ggml_build_forward_expand(gf, cur);
 
-
+if (mem_per_token == 0) {
+mem_per_token = ggml_used_mem(ctx0)/N;
+}
+
+#if 0
+LLAMA_LOG_INFO("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__,
+ggml_used_mem(ctx0)/1024.0/1024.0,
+lctx.get_buf_max_mem(0)/1024.0/1024.0,
+lctx.get_buf_max_mem(1)/1024.0/1024.0,
+lctx.work_buffer.size()/1024.0/1024.0,
+n_past, N);
+#endif
+
+ggml_free(ctx0);
+
+return gf;
+}
+
+// evaluate the transformer
+//
+// - lctx: llama context
+// - tokens: new batch of tokens to process
+// - embd embeddings input
+// - n_tokens number of tokens
+// - n_past: the context size so far
+// - n_threads: number of threads to use
+//
+static bool llama_eval_internal(
+llama_context & lctx,
+const llama_token * tokens,
+const float * embd,
+int n_tokens,
+int n_past,
+int n_threads,
+const char * cgraph_fname) {
+
+LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
+
+const int64_t t_start_us = ggml_time_us();
+
+#ifdef GGML_USE_MPI
+ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
+#endif
+
+const int N = n_tokens;
+
+const auto & model = lctx.model;
+const auto & hparams = model.hparams;
+
+const auto & kv_self = lctx.kv_self;
+
+LLAMA_ASSERT(!!kv_self.ctx);
+
+const int64_t n_embd = hparams.n_embd;
+const int64_t n_vocab = hparams.n_vocab;
+
+#ifdef LLAMA_USE_ALLOCATOR
+ggml_allocr_reset(lctx.alloc);
+#endif
+
+ggml_cgraph * gf = llama_build_graph(lctx, tokens, embd, n_tokens, n_past);
+
+#ifdef LLAMA_USE_ALLOCATOR
+ggml_allocr_alloc_graph(lctx.alloc, gf);
+#endif
+
+// LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
+
+// for big prompts, if BLAS is enabled, it is better to use only one thread
+// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
+n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
+
+struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
+struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
+
+LLAMA_ASSERT(strcmp(res->name, "result_output") == 0);
+LLAMA_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
 
 #if GGML_USE_MPI
+const int64_t n_layer = hparams.n_layer;
 ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
 #endif
 
@@ -1728,7 +1852,10 @@ static bool llama_eval_internal(
 //}
 ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
 ggml_metal_graph_compute(lctx.ctx_metal, gf);
-ggml_metal_get_tensor (lctx.ctx_metal,
+ggml_metal_get_tensor (lctx.ctx_metal, res);
+if (!lctx.embedding.empty()) {
+ggml_metal_get_tensor(lctx.ctx_metal, embeddings);
+}
 } else {
 // IMPORTANT:
 // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla
@@ -1759,8 +1886,6 @@ static bool llama_eval_internal(
 // update kv token count
 lctx.kv_self.n = n_past + N;
 
-struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
-
 if (cgraph_fname) {
 ggml_graph_export(gf, cgraph_fname);
 }
@@ -1798,21 +1923,6 @@ static bool llama_eval_internal(
 memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd);
 }
 
-if (mem_per_token == 0) {
-mem_per_token = ggml_used_mem(ctx0)/N;
-}
-
-#if 0
-printf("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__,
-ggml_used_mem(ctx0)/1024.0/1024.0,
-lctx.get_buf_max_mem(0)/1024.0/1024.0,
-lctx.get_buf_max_mem(1)/1024.0/1024.0,
-lctx.work_buffer.size()/1024.0/1024.0,
-n_past, N);
-#endif
-
-ggml_free(ctx0);
-
 // measure the performance only for the single-token evals
 if (N == 1) {
 lctx.t_eval_us += ggml_time_us() - t_start_us;
@@ -1904,7 +2014,7 @@ struct llama_tokenizer {
 left_sym.n += right_sym.n;
 right_sym.n = 0;
 
-//
+//LLAMA_LOG_INFO("left = '%*s' size = %zu\n", (int) left_sym.n, left_sym.text, bigram.size);
 
 // remove the right sym from the chain
 left_sym.next = right_sym.next;
@@ -1924,7 +2034,9 @@ struct llama_tokenizer {
 if (token == vocab_.token_to_id.end()) {
 // output any symbols that did not form tokens as bytes.
 for (int j = 0; j < (int) symbol.n; ++j) {
-
+// NOTE: old version, before #2420 - not sure what are the implications of this
+//llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
+llama_vocab::id token_id = vocab_.token_to_id.at(std::string(1, symbol.text[j]));
 output.push_back(token_id);
 }
 } else {
@@ -2910,7 +3022,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 tensor.data = read_data.addr;
 model_loader->load_data_for(tensor);
 
-
+LLAMA_LOG_INFO("[%4zu/%4zu] %36s - %16s, type = %6s, ",
 ++idx, model_loader->tensors_map.tensors.size(),
 tensor.name.c_str(), llama_format_tensor_shape(tensor.ne).c_str(),
 ggml_type_name(tensor.type));
@@ -2932,7 +3044,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 new_type = tensor.type;
 new_data = tensor.data;
 new_size = tensor.size;
-
+LLAMA_LOG_INFO("size = %8.3f MB\n", tensor.size/1024.0/1024.0);
 } else {
 new_type = quantized_type;
 #ifdef GGML_USE_K_QUANTS
@@ -2967,17 +3079,17 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 int nx = tensor.ne.at(0);
 int ny = tensor.ne.at(1);
 if (nx % QK_K != 0 || ny % QK_K != 0) {
-
+LLAMA_LOG_INFO("\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
 convert_incompatible_tensor = true;
 }
 }
 if (convert_incompatible_tensor) {
 if (tensor.name == "output.weight") {
 new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
-
+LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
 } else if (tensor.name == "tok_embeddings.weight") {
 new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
-
+LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
 } else {
 throw std::runtime_error("Unsupported tensor size encountered\n");
 }
@@ -2997,7 +3109,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 f32_data = (float *) f32_conv_buf.addr;
 }
 
-
+LLAMA_LOG_INFO("quantizing to %s .. ", ggml_type_name(new_type));
 fflush(stdout);
 
 work.resize(nelements * 4); // upper bound on size
@@ -3047,7 +3159,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 }
 }
 
-
+LLAMA_LOG_INFO("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
 int64_t tot_count = 0;
 for (size_t i = 0; i < hist_cur.size(); i++) {
 hist_all[i] += hist_cur[i];
@@ -3056,18 +3168,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
 if (tot_count > 0) {
 for (size_t i = 0; i < hist_cur.size(); i++) {
-
+LLAMA_LOG_INFO("%5.3f ", hist_cur[i] / float(nelements));
 }
 }
-
+LLAMA_LOG_INFO("\n");
 }
 total_size_org += tensor.size;
 total_size_new += new_size;
 file_saver.write_tensor(tensor, new_type, new_data, new_size);
 }
 
-
-
+LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
+LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
 
 {
 int64_t sum_all = 0;
@@ -3076,11 +3188,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 }
 
 if (sum_all > 0) {
-
+LLAMA_LOG_INFO("%s: hist: ", __func__);
 for (size_t i = 0; i < hist_all.size(); i++) {
-
+LLAMA_LOG_INFO("%5.3f ", hist_all[i] / float(sum_all));
 }
-
+LLAMA_LOG_INFO("\n");
 }
 }
 }
@@ -3101,11 +3213,11 @@ struct llama_model * llama_load_model_from_file(
 ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
 
 if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gqa, params.rms_norm_eps, params.n_gpu_layers,
-params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
+params.main_gpu, params.tensor_split, params.mul_mat_q, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
 memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback,
 params.progress_callback_user_data)) {
+LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
 delete model;
-fprintf(stderr, "%s: failed to load model\n", __func__);
 return nullptr;
 }
 
@@ -3138,10 +3250,9 @@ struct llama_context * llama_new_context_with_model(
 unsigned percentage = (unsigned) (100 * progress);
 while (percentage > *cur_percentage_p) {
 *cur_percentage_p = percentage;
-
-fflush(stderr);
+LLAMA_LOG_INFO(".");
 if (percentage >= 100) {
-
+LLAMA_LOG_INFO("\n");
 }
 }
 };
@@ -3155,14 +3266,14 @@ struct llama_context * llama_new_context_with_model(
 // reserve memory for context buffers
 if (!params.vocab_only) {
 if (!kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
-
+LLAMA_LOG_ERROR("%s: kv_cache_init() failed for self-attention cache\n", __func__);
 llama_free(ctx);
 return nullptr;
 }
 
 {
 const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
-
+LLAMA_LOG_INFO("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
 }
 
 const auto & hparams = ctx->model.hparams;
@@ -3178,10 +3289,47 @@ struct llama_context * llama_new_context_with_model(
 ctx->embedding.resize(hparams.n_embd);
 }
 
+#ifdef LLAMA_USE_ALLOCATOR
+{
+static const size_t tensor_alignment = 32;
+// the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data
+ctx->buf_compute.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead());
+
+// create measure allocator
+ctx->alloc = ggml_allocr_new_measure(tensor_alignment);
+
+// build worst-case graph
+int n_tokens = std::min((int)hparams.n_ctx, params.n_batch);
+int n_past = hparams.n_ctx - n_tokens;
+llama_token token = llama_token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
+ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past);
+
+// measure memory requirements for the graph
+size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
+
+LLAMA_LOG_INFO("%s: compute buffer total size = %7.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
+
+// debug - for comparison with scratch buffer
+//size_t prev_req =
+// MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type) +
+// MEM_REQ_SCRATCH1().at(ctx->model.type) +
+// MEM_REQ_EVAL().at(ctx->model.type);
+//LLAMA_LOG_INFO("%s: (debug) equivalent with scratch buffer = %7.2f MB\n", __func__, prev_req / 1024.0 / 1024.0);
+
+// recreate allocator with exact memory requirements
+ggml_allocr_free(ctx->alloc);
+
+ctx->buf_alloc.resize(alloc_size);
+ctx->alloc = ggml_allocr_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, tensor_alignment);
+}
+#else
 ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead());
+#endif
 
+#ifdef LLAMA_USE_SCRATCH
 ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type));
 ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
+#endif
 }
 
 #ifdef GGML_USE_METAL
@@ -3202,13 +3350,13 @@ struct llama_context * llama_new_context_with_model(
 
 const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
 
-
+LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
 
-#define LLAMA_METAL_CHECK_BUF(result)
-if (!(result)) {
-
-llama_free(ctx);
-return NULL;
+#define LLAMA_METAL_CHECK_BUF(result) \
+if (!(result)) { \
+LLAMA_LOG_ERROR("%s: failed to add buffer\n", __func__); \
+llama_free(ctx); \
+return NULL; \
 }
 
 LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
@@ -3251,9 +3399,6 @@ struct llama_context * llama_init_from_file(
 }
 
 void llama_free(struct llama_context * ctx) {
-if (ctx->model_owner) {
-delete &ctx->model;
-}
 delete ctx;
 }
 
@@ -3265,19 +3410,19 @@ int llama_model_quantize(
 llama_model_quantize_internal(fname_inp, fname_out, params);
 return 0;
 } catch (const std::exception & err) {
-
+LLAMA_LOG_ERROR("%s: failed to quantize: %s\n", __func__, err.what());
 return 1;
 }
 }
 
 int llama_apply_lora_from_file_internal(const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads) {
-
+LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
 
 const int64_t t_start_lora_us = ggml_time_us();
 
 auto fin = std::ifstream(path_lora, std::ios::binary);
 if (!fin) {
-
+LLAMA_LOG_ERROR("%s: failed to open '%s'\n", __func__, path_lora);
 return 1;
 }
 
@@ -3286,14 +3431,14 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 uint32_t magic;
 fin.read((char *) &magic, sizeof(magic));
 if (magic != LLAMA_FILE_MAGIC_GGLA) {
-
+LLAMA_LOG_ERROR("%s: bad file magic\n", __func__);
 return 1;
 }
 uint32_t format_version;
 fin.read((char *) &format_version, sizeof(format_version));
 
 if (format_version != 1) {
-
+LLAMA_LOG_ERROR("%s: unsupported file version\n", __func__ );
 return 1;
 }
 }
@@ -3304,7 +3449,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 fin.read((char *) &lora_alpha, sizeof(lora_alpha));
 float scaling = (float)lora_alpha / (float)lora_r;
 
-
+LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
 
 
 // create a temporary ggml context to store the lora tensors
@@ -3330,7 +3475,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 ggml_context * base_ctx = NULL;
 llama_buffer base_buf;
 if (path_base_model) {
-
+LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
 model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true));
 
 size_t ctx_size;
@@ -3387,17 +3532,17 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 const std::string lora_suffix = ".lora";
 size_t pos = name.rfind(lora_suffix);
 if (pos == std::string::npos) {
-
+LLAMA_LOG_ERROR("%s: error: '%s' is not a lora tensor\n", __func__, name.c_str());
 return 1;
 }
 
 std::string lora_type = name.substr(pos + lora_suffix.length());
 std::string base_name = name;
 base_name.erase(pos);
-//
+// LLAMA_LOG_INFO("%s: %s => %s (lora type %s) \n", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());
 
 if (model_tensors.find(base_name) == model_tensors.end()) {
-
+LLAMA_LOG_ERROR("%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
 return 1;
 }
 
@@ -3408,7 +3553,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 case 1: wtype = GGML_TYPE_F16; break;
 default:
 {
-
+LLAMA_LOG_ERROR("%s: invalid tensor data type '%d'\n",
 __func__, ftype);
 return false;
 }
@@ -3418,7 +3563,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]);
 }
 else {
-
+LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims);
 return 1;
 }
 ggml_set_name(lora_tensor, "lora_tensor");
@@ -3456,7 +3601,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 if (model_loader) {
 // load from base model
 if (model_loader->tensors_map.name_to_idx.find(base_name) == model_loader->tensors_map.name_to_idx.end()) {
-
+LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
 return 1;
 }
 size_t idx = model_loader->tensors_map.name_to_idx[base_name];
@@ -3472,8 +3617,8 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 
 if (ggml_is_quantized(base_t->type)) {
 if (!warned) {
-
-
+LLAMA_LOG_WARN("%s: warning: using a lora adapter with a quantized model may result in poor quality, "
+"use a f16 or f32 base model with --lora-base\n", __func__);
 warned = true;
 }
 }
@@ -3487,8 +3632,8 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 ggml_set_name(loraB, "loraB");
 
 if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
-
-
+LLAMA_LOG_ERROR("%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
+" are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]);
 return 1;
 }
 
@@ -3533,7 +3678,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 
 n_tensors++;
 if (n_tensors % 4 == 0) {
-
+LLAMA_LOG_INFO(".");
 }
 }
 }
@@ -3545,7 +3690,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 }
 
 const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
-
+LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0);
 
 return 0;
 }
@@ -3554,7 +3699,7 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
 try {
 return llama_apply_lora_from_file_internal(ctx->model, path_lora, path_base_model, n_threads);
 } catch (const std::exception & err) {
-
+LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
 return 1;
 }
 }
@@ -3563,7 +3708,7 @@ int llama_model_apply_lora_from_file(const struct llama_model * model, const cha
 try {
 return llama_apply_lora_from_file_internal(*model, path_lora, path_base_model, n_threads);
 } catch (const std::exception & err) {
-
+LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
 return 1;
 }
 }
@@ -3612,10 +3757,20 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
 return s_total;
 }
 
-
-
-
-
+/** copy state data into either a buffer or file depending on the passed in context
+*
+* file context:
+* llama_file file("/path", "wb");
+* llama_data_file_context data_ctx(&file);
+* llama_copy_state_data(ctx, &data_ctx);
+*
+* buffer context:
+* std::vector<uint8_t> buf(max_size, 0);
+* llama_data_buffer_context data_ctx(&buf.data());
+* llama_copy_state_data(ctx, &data_ctx);
+*
+*/
+void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
 // copy rng
 {
 std::stringstream rng_ss;
@@ -3627,8 +3782,8 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
 memset(&rng_buf[0], 0, LLAMA_MAX_RNG_STATE);
 memcpy(&rng_buf[0], rng_ss.str().data(), rng_ss.str().size());
 
-
-
+data_ctx->write(&rng_size, sizeof(rng_size));
+data_ctx->write(&rng_buf[0], LLAMA_MAX_RNG_STATE);
 }
 
 // copy logits
@@ -3636,25 +3791,29 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
 const size_t logits_cap = ctx->logits.capacity();
 const size_t logits_size = ctx->logits.size();
 
-
-
+data_ctx->write(&logits_cap, sizeof(logits_cap));
+data_ctx->write(&logits_size, sizeof(logits_size));
 
 if (logits_size) {
-
+data_ctx->write(ctx->logits.data(), logits_size * sizeof(float));
 }
 
-
+// If there is a gap between the size and the capacity, write padding
+size_t padding_size = (logits_cap - logits_size) * sizeof(float);
+if (padding_size > 0) {
+std::vector<uint8_t> padding(padding_size, 0); // Create a buffer filled with zeros
+data_ctx->write(padding.data(), padding_size);
+}
 }
 
 // copy embeddings
 {
 const size_t embedding_size = ctx->embedding.size();
 
-
+data_ctx->write(&embedding_size, sizeof(embedding_size));
 
 if (embedding_size) {
-
-out += embedding_size * sizeof(float);
+data_ctx->write(ctx->embedding.data(), embedding_size * sizeof(float));
 }
 }
 
@@ -3663,14 +3822,14 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
 const auto & kv_self = ctx->kv_self;
 const auto & hparams = ctx->model.hparams;
 const int n_layer = hparams.n_layer;
-const int n_embd = hparams.
+const int n_embd = hparams.n_embd_gqa();
 const int n_ctx = hparams.n_ctx;
 
 const size_t kv_size = kv_self.buf.size;
 const int kv_ntok = llama_get_kv_cache_token_count(ctx);
 
-
-
+data_ctx->write(&kv_size, sizeof(kv_size));
+data_ctx->write(&kv_ntok, sizeof(kv_ntok));
 
 if (kv_size) {
 const size_t elt_size = ggml_element_size(kv_self.k);
@@ -3679,12 +3838,12 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
 ggml_cgraph gf{};
 
 ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
-kout3d
-
+std::vector<uint8_t> kout3d_data(ggml_nbytes(kout3d), 0);
+kout3d->data = kout3d_data.data();
 
 ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
-vout3d
-
+std::vector<uint8_t> vout3d_data(ggml_nbytes(vout3d), 0);
+vout3d->data = vout3d_data.data();
 
 ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
 n_embd, kv_ntok, n_layer,
@@ -3699,15 +3858,20 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
 ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
 
 ggml_free(cpy_ctx);
+
+// our data is now in the kout3d_data and vout3d_data buffers
+// write them to file
+data_ctx->write(kout3d_data.data(), kout3d_data.size());
+data_ctx->write(vout3d_data.data(), vout3d_data.size());
 }
 }
+}
 
-
-
-
-LLAMA_ASSERT(written <= max_size);
+size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
+llama_data_buffer_context data_ctx(dst);
+llama_copy_state_data_internal(ctx, &data_ctx);
 
-return
+return data_ctx.get_size_written();
 }
 
 // Sets the state reading from the specified source address
@@ -3766,7 +3930,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
 const auto & kv_self = ctx->kv_self;
 const auto & hparams = ctx->model.hparams;
 const int n_layer = hparams.n_layer;
-const int n_embd = hparams.
+const int n_embd = hparams.n_embd_gqa();
 const int n_ctx = hparams.n_ctx;
 
 size_t kv_size;
@@ -3826,7 +3990,7 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c
 const uint32_t version = file.read_u32();
 
 if (magic != LLAMA_SESSION_MAGIC || version != LLAMA_SESSION_VERSION) {
-
+LLAMA_LOG_ERROR("%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
 return false;
 }
 
@@ -3834,7 +3998,7 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c
 file.read_raw(&session_hparams, sizeof(llama_hparams));
 
 if (session_hparams != ctx->model.hparams) {
-
+LLAMA_LOG_INFO("%s : model hparams didn't match from session file!\n", __func__);
 return false;
 }
 }
@@ -3844,7 +4008,7 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c
 const uint32_t n_token_count = file.read_u32();
 
 if (n_token_count > n_token_capacity) {
-
+LLAMA_LOG_ERROR("%s : token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
 return false;
 }
 
@@ -3858,7 +4022,7 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c
 const size_t n_state_size_max = llama_get_state_size(ctx);
 
 if (n_state_size_cur > n_state_size_max) {
-
+LLAMA_LOG_ERROR("%s : the state size in session file is too big! max %zu, got %zu\n", __func__, n_state_size_max, n_state_size_cur);
 return false;
 }
 
@@ -3875,7 +4039,7 @@ bool llama_load_session_file(struct llama_context * ctx, const char * path_sessi
 try {
 return llama_load_session_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
 } catch (const std::exception & err) {
-
+LLAMA_LOG_ERROR("error loading session file: %s\n", err.what());
 return false;
 }
 }
@@ -3892,15 +4056,9 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi
 file.write_u32((uint32_t) n_token_count);
 file.write_raw(tokens, sizeof(llama_token) * n_token_count);
 
-// save the context state
-
-
-
-std::vector<uint8_t> state_data(n_state_size_max);
-const size_t n_state_size_cur = llama_copy_state_data(ctx, state_data.data());
-
-file.write_raw(state_data.data(), n_state_size_cur);
-}
+// save the context state using stream saving
+llama_data_file_context data_ctx(&file);
+llama_copy_state_data_internal(ctx, &data_ctx);
 
 return true;
 }
@@ -3912,7 +4070,7 @@ int llama_eval(
 int n_past,
 int n_threads) {
 if (!llama_eval_internal(*ctx, tokens, nullptr, n_tokens, n_past, n_threads, nullptr)) {
-
+LLAMA_LOG_ERROR("%s: failed to eval\n", __func__);
 return 1;
 }
 
@@ -3934,7 +4092,7 @@ int llama_eval_embd(
 int n_past,
 int n_threads) {
 if (!llama_eval_internal(*ctx, nullptr, embd, n_tokens, n_past, n_threads, nullptr)) {
-
+LLAMA_LOG_ERROR("%s: failed to eval\n", __func__);
 return 1;
 }
 
@@ -3955,7 +4113,7 @@ int llama_eval_export(struct llama_context * ctx, const char * fname) {
 const std::vector<llama_token> tmp(n_batch, llama_token_bos());
 
 if (!llama_eval_internal(*ctx, tmp.data(), nullptr, tmp.size(), n_ctx, 1, fname)) {
-
+LLAMA_LOG_ERROR("%s: failed to eval\n", __func__);
 return 1;
 }
 
@@ -3971,7 +4129,7 @@ int llama_tokenize_with_model(
 auto res = llama_tokenize(model->vocab, text, add_bos);
 
 if (n_max_tokens < (int) res.size()) {
-
+LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
 return -((int) res.size());
 }
 
@@ -4088,15 +4246,15 @@ struct llama_timings llama_get_timings(struct llama_context * ctx) {
 void llama_print_timings(struct llama_context * ctx) {
 const llama_timings timings = llama_get_timings(ctx);
 
-
-
-
+LLAMA_LOG_INFO("\n");
+LLAMA_LOG_INFO("%s: load time = %8.2f ms\n", __func__, timings.t_load_ms);
+LLAMA_LOG_INFO("%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
 __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
-
+LLAMA_LOG_INFO("%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
 __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
-
+LLAMA_LOG_INFO("%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
 __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
-
+LLAMA_LOG_INFO("%s: total time = %8.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
 }
 
 void llama_reset_timings(struct llama_context * ctx) {
@@ -4132,3 +4290,44 @@ const char * llama_print_system_info(void) {
 const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
 return ctx->model.tensors_by_name;
 }
+
+
+void llama_log_set(llama_log_callback log_callback, void * user_data) {
+g_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
+g_state.log_callback_user_data = user_data;
+}
+
+#if defined(_MSC_VER) && !defined(vsnprintf)
+#define vsnprintf _vsnprintf
+#endif
+
+static void llama_log_internal_v(llama_log_level level, const char * format, va_list args) {
+va_list args_copy;
+va_copy(args_copy, args);
+char buffer[128];
+int len = vsnprintf(buffer, 128, format, args);
+if (len < 128) {
+g_state.log_callback(level, buffer, g_state.log_callback_user_data);
+} else {
+char* buffer2 = new char[len+1];
+vsnprintf(buffer2, len+1, format, args_copy);
+buffer2[len] = 0;
+g_state.log_callback(level, buffer2, g_state.log_callback_user_data);
+delete[] buffer2;
+}
+va_end(args_copy);
+}
+
+static void llama_log_internal(llama_log_level level, const char * format, ...) {
+va_list args;
+va_start(args, format);
+llama_log_internal_v(level, format, args);
+va_end(args);
+}
+
+static void llama_log_callback_default(llama_log_level level, const char * text, void * user_data) {
+(void) level;
+(void) user_data;
+fputs(text, stderr);
+fflush(stderr);
+}