llama_cpp 0.3.6 → 0.3.7
This diff compares publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/llama_cpp/src/ggml-alloc.c +8 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +1165 -721
- data/ext/llama_cpp/src/ggml-metal.m +39 -18
- data/ext/llama_cpp/src/ggml.c +396 -150
- data/ext/llama_cpp/src/ggml.h +113 -32
- data/ext/llama_cpp/src/llama-util.h +41 -1
- data/ext/llama_cpp/src/llama.cpp +214 -146
- data/ext/llama_cpp/src/llama.h +18 -1
- data/lib/llama_cpp/version.rb +2 -2
- metadata +2 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -56,6 +56,13 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

+static void llama_log_internal(llama_log_level level, const char* format, ...);
+static void llama_log_callback_default(llama_log_level level, const char * text, void * user_data);
+#define LLAMA_LOG_INFO(...) llama_log_internal(LLAMA_LOG_LEVEL_INFO , __VA_ARGS__)
+#define LLAMA_LOG_WARN(...) llama_log_internal(LLAMA_LOG_LEVEL_WARN , __VA_ARGS__)
+#define LLAMA_LOG_ERROR(...) llama_log_internal(LLAMA_LOG_LEVEL_ERROR, __VA_ARGS__)
+
+
 #if !defined(GGML_USE_CUBLAS) && !defined(GGML_USE_METAL)
 #include "ggml-alloc.h"
 #define LLAMA_USE_ALLOCATOR
@@ -149,7 +156,7 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
 }

 // amount of VRAM needed per batch size to hold temporary results
-// the values for 3b
+// the values for 3b are not derived from testing but instead chosen conservatively
 static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
 {
 static std::map<e_model, size_t> k_sizes = {
@@ -157,14 +164,14 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
 { MODEL_7B, 512ull * kB },
 { MODEL_13B, 640ull * kB },
 { MODEL_30B, 768ull * kB },
-{ MODEL_65B,
-{ MODEL_70B,
+{ MODEL_65B, 1280ull * kB },
+{ MODEL_70B, 1280ull * kB },
 };
 return k_sizes;
 }

 // amount of VRAM needed per batch size and context to hold temporary results
-// the values for 3b
+// the values for 3b are not derived from testing but instead chosen conservatively
 static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
 {
 static std::map<e_model, size_t> k_sizes = {
@@ -172,8 +179,8 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
 { MODEL_7B, 128ull },
 { MODEL_13B, 160ull },
 { MODEL_30B, 208ull },
-{ MODEL_65B,
-{ MODEL_70B,
+{ MODEL_65B, 256ull },
+{ MODEL_70B, 256ull },
 };
 return k_sizes;
 }
@@ -438,6 +445,14 @@ struct llama_context {
 }
 };

+struct llama_state {
+// We save the log callback globally
+llama_log_callback log_callback = llama_log_callback_default;
+void * log_callback_user_data = nullptr;
+};
+// global state
+static llama_state g_state;
+
 template <typename T>
 static T checked_mul(T a, T b) {
 T ret = a * b;
@@ -504,7 +519,7 @@ struct llama_file_loader {

 llama_file_loader(const char * fname, llama_load_tensors_map & tensors_map)
 : file(fname, "rb") {
-
+LLAMA_LOG_INFO("llama.cpp: loading model from %s\n", fname);
 read_magic();
 read_hparams();
 read_vocab();
@@ -619,7 +634,7 @@ struct llama_file_saver {
 llama_file_loader * any_file_loader;
 llama_file_saver(const char * fname, llama_file_loader * any_file_loader, enum llama_ftype new_ftype)
 : file(fname, "wb"), any_file_loader(any_file_loader) {
-
+LLAMA_LOG_INFO("llama.cpp: saving model to %s\n", fname);
 write_magic();
 write_hparams(new_ftype);
 write_vocab();
@@ -640,7 +655,7 @@ struct llama_file_saver {
 }
 void write_vocab() {
 if (any_file_loader->file_version == LLAMA_FILE_VERSION_GGML) {
-
+LLAMA_LOG_WARN("llama.cpp: WARNING: input is an old file that doesn't have scores; will add dummy scores\n");
 }
 uint32_t n_vocab = any_file_loader->hparams.n_vocab;
 for (uint32_t i = 0; i < n_vocab; i++) {
@@ -747,12 +762,12 @@ struct llama_model_loader {

 void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
 size_t data_size = 0;
-size_t prefetch_size =
+size_t prefetch_size = file_loader->file.size;
 size_t lock_size = 0;
 for (const llama_load_tensor & lt : tensors_map.tensors) {
 data_size += lt.size;
-if (lt.ggml_tensor->backend
-prefetch_size
+if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) {
+prefetch_size -= lt.size;
 }
 }

@@ -831,7 +846,7 @@ struct llama_model_loader {
 uint8_t byte = lt.data[i];
 sum = byte + (sum << 6) + (sum << 16) - sum; // sdbm hash
 }
-
+LLAMA_LOG_INFO("%s checksum: %#08x (%s, size %zu)\n", lt.name.c_str(), sum,
 llama_format_tensor_shape(lt.ne).c_str(), lt.size);
 }

@@ -864,7 +879,7 @@ static bool kv_cache_init(
 cache.ctx = ggml_init(params);

 if (!cache.ctx) {
-
+LLAMA_LOG_ERROR("%s: failed to allocate memory for kv cache\n", __func__);
 return false;
 }

@@ -1076,7 +1091,7 @@ static void llama_model_load_internal(
 LLAMA_ASSERT(hparams.n_head % n_gqa == 0);
 hparams.n_head_kv = hparams.n_head / n_gqa;
 if (model.type == e_model::MODEL_65B && n_gqa == 8) {
-
+LLAMA_LOG_WARN("%s: warning: assuming 70B model based on GQA == %d\n", __func__, n_gqa);
 model.type = e_model::MODEL_70B;
 hparams.f_ffn_mult = 1.3f; // from the params.json of the 70B model
 }
@@ -1092,22 +1107,22 @@ static void llama_model_load_internal(
 //const uint32_t n_ff = 28672;

 {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(file_version));
+LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab);
+LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, hparams.n_ctx);
+LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd);
+LLAMA_LOG_INFO("%s: n_mult = %u\n", __func__, hparams.n_mult);
+LLAMA_LOG_INFO("%s: n_head = %u\n", __func__, hparams.n_head);
+LLAMA_LOG_INFO("%s: n_head_kv = %u\n", __func__, hparams.n_head_kv);
+LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
+LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
+LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa());
+LLAMA_LOG_INFO("%s: rnorm_eps = %.1e\n", __func__, hparams.f_rms_norm_eps);
+LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, n_ff);
+LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base);
+LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale);
+LLAMA_LOG_INFO("%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
+LLAMA_LOG_INFO("%s: model size = %s\n", __func__, llama_model_type_name(model.type));
 }

 if (file_version < LLAMA_FILE_VERSION_GGJT_V2) {
@@ -1135,7 +1150,7 @@ static void llama_model_load_internal(
 size_t ctx_size;
 size_t mmapped_size;
 ml->calc_sizes(&ctx_size, &mmapped_size);
-
+LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0);

 // create the ggml context
 {
@@ -1160,13 +1175,13 @@ static void llama_model_load_internal(
 (void) main_gpu;
 (void) mul_mat_q;
 #if defined(GGML_USE_CUBLAS)
-
+LLAMA_LOG_INFO("%s: using CUDA for GPU acceleration\n", __func__);
 ggml_cuda_set_main_device(main_gpu);
 ggml_cuda_set_mul_mat_q(mul_mat_q);
 #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
 #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
 #elif defined(GGML_USE_CLBLAST)
-
+LLAMA_LOG_INFO("%s: using OpenCL for GPU acceleration\n", __func__);
 #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
 #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU
 #else
@@ -1271,14 +1286,14 @@ static void llama_model_load_internal(
 const size_t mem_required_state =
 scale*hparams.kv_size();

-
+LLAMA_LOG_INFO("%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
 mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);

 (void) vram_scratch;
 (void) n_batch;
 #ifdef GGML_USE_CUBLAS
 if (low_vram) {
-
+LLAMA_LOG_INFO("%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
 ggml_cuda_set_scratch_size(0); // disable scratch
 } else {
 const size_t vram_scratch_base = VRAM_REQ_SCRATCH_BASE().at(model.type);
@@ -1286,7 +1301,7 @@ static void llama_model_load_internal(
 vram_scratch = n_batch * (vram_scratch_base + n_ctx * vram_scratch_per_context);
 ggml_cuda_set_scratch_size(vram_scratch);
 if (n_gpu_layers > 0) {
-
+LLAMA_LOG_INFO("%s: allocating batch_size x (%zd kB + n_ctx x %zd B) = %zd MB VRAM for the scratch buffer\n",
 __func__, vram_scratch_base / kB, vram_scratch_per_context,
 (vram_scratch + MB - 1) / MB); // round up
 }
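The hunk above combines the new per-model scratch constants into a single size, so a small worked sketch of that arithmetic may help. Only the MODEL_70B constants (1280 kB base, 256 B per context token) come from the tables changed earlier in this diff; the batch size and context length below are illustrative choices, not values from the source.

    #include <cstdio>
    #include <cstddef>

    int main() {
        const size_t kB = 1024, MB = 1024 * 1024;

        // constants introduced above for MODEL_70B
        const size_t vram_scratch_base        = 1280 * kB; // per batch element
        const size_t vram_scratch_per_context = 256;       // bytes per batch element per context token

        // illustrative runtime settings (not from the diff)
        const size_t n_batch = 512;
        const size_t n_ctx   = 4096;

        // same formula as llama_model_load_internal() in the hunk above
        const size_t vram_scratch = n_batch * (vram_scratch_base + n_ctx * vram_scratch_per_context);
        std::printf("scratch buffer: %zu MB\n", (vram_scratch + MB - 1) / MB); // prints 1152, rounded up as in the log line
        return 0;
    }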
@@ -1296,9 +1311,9 @@ static void llama_model_load_internal(
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
 const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));

-
+LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
 if (n_gpu_layers > (int) hparams.n_layer) {
-
+LLAMA_LOG_INFO("%s: offloading non-repeating layers to GPU\n", __func__);
 }
 size_t vram_kv_cache = 0;

@@ -1307,17 +1322,17 @@ static void llama_model_load_internal(
 const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
 if (n_gpu_layers > (int) hparams.n_layer + 1) {
 if (low_vram) {
-
+LLAMA_LOG_INFO("%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
 } else {
-
+LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
 vram_kv_cache += hparams.kv_size() / 2;
 }
 }
 if (n_gpu_layers > (int) hparams.n_layer + 2) {
 if (low_vram) {
-
+LLAMA_LOG_WARN("%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
 } else {
-
+LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
 vram_kv_cache += hparams.kv_size() / 2;
 }
 }
@@ -1326,9 +1341,9 @@ static void llama_model_load_internal(
 const int max_offloadable_layers = hparams.n_layer + 1;
 #endif // GGML_USE_CUBLAS

-
+LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n",
 __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
-
+LLAMA_LOG_INFO("%s: total VRAM used: %zu MB\n",
 __func__, (vram_weights + vram_scratch + vram_kv_cache + MB - 1) / MB); // round up
 #else
 (void) n_gpu_layers;
@@ -1387,7 +1402,7 @@ static bool llama_model_load(
 use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
 return true;
 } catch (const std::exception & err) {
-
+LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
 return false;
 }
 }
@@ -1751,7 +1766,7 @@ static struct ggml_cgraph * llama_build_graph(
 }

 #if 0
-
+LLAMA_LOG_INFO("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__,
 ggml_used_mem(ctx0)/1024.0/1024.0,
 lctx.get_buf_max_mem(0)/1024.0/1024.0,
 lctx.get_buf_max_mem(1)/1024.0/1024.0,
@@ -1812,7 +1827,7 @@ static bool llama_eval_internal(
 ggml_allocr_alloc_graph(lctx.alloc, gf);
 #endif

-//
+// LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);

 // for big prompts, if BLAS is enabled, it is better to use only one thread
 // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
@@ -1999,7 +2014,7 @@ struct llama_tokenizer {
 left_sym.n += right_sym.n;
 right_sym.n = 0;

-//
+//LLAMA_LOG_INFO("left = '%*s' size = %zu\n", (int) left_sym.n, left_sym.text, bigram.size);

 // remove the right sym from the chain
 left_sym.next = right_sym.next;
@@ -3007,7 +3022,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 tensor.data = read_data.addr;
 model_loader->load_data_for(tensor);

-
+LLAMA_LOG_INFO("[%4zu/%4zu] %36s - %16s, type = %6s, ",
 ++idx, model_loader->tensors_map.tensors.size(),
 tensor.name.c_str(), llama_format_tensor_shape(tensor.ne).c_str(),
 ggml_type_name(tensor.type));
@@ -3029,7 +3044,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 new_type = tensor.type;
 new_data = tensor.data;
 new_size = tensor.size;
-
+LLAMA_LOG_INFO("size = %8.3f MB\n", tensor.size/1024.0/1024.0);
 } else {
 new_type = quantized_type;
 #ifdef GGML_USE_K_QUANTS
@@ -3064,17 +3079,17 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 int nx = tensor.ne.at(0);
 int ny = tensor.ne.at(1);
 if (nx % QK_K != 0 || ny % QK_K != 0) {
-
+LLAMA_LOG_INFO("\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
 convert_incompatible_tensor = true;
 }
 }
 if (convert_incompatible_tensor) {
 if (tensor.name == "output.weight") {
 new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
-
+LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
 } else if (tensor.name == "tok_embeddings.weight") {
 new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
-
+LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
 } else {
 throw std::runtime_error("Unsupported tensor size encountered\n");
 }
@@ -3094,7 +3109,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 f32_data = (float *) f32_conv_buf.addr;
 }

-
+LLAMA_LOG_INFO("quantizing to %s .. ", ggml_type_name(new_type));
 fflush(stdout);

 work.resize(nelements * 4); // upper bound on size
@@ -3144,7 +3159,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 }
 }

-
+LLAMA_LOG_INFO("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
 int64_t tot_count = 0;
 for (size_t i = 0; i < hist_cur.size(); i++) {
 hist_all[i] += hist_cur[i];
@@ -3153,18 +3168,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

 if (tot_count > 0) {
 for (size_t i = 0; i < hist_cur.size(); i++) {
-
+LLAMA_LOG_INFO("%5.3f ", hist_cur[i] / float(nelements));
 }
 }
-
+LLAMA_LOG_INFO("\n");
 }
 total_size_org += tensor.size;
 total_size_new += new_size;
 file_saver.write_tensor(tensor, new_type, new_data, new_size);
 }

-
-
+LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
+LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);

 {
 int64_t sum_all = 0;
@@ -3173,11 +3188,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 }

 if (sum_all > 0) {
-
+LLAMA_LOG_INFO("%s: hist: ", __func__);
 for (size_t i = 0; i < hist_all.size(); i++) {
-
+LLAMA_LOG_INFO("%5.3f ", hist_all[i] / float(sum_all));
 }
-
+LLAMA_LOG_INFO("\n");
 }
 }
 }
@@ -3201,8 +3216,8 @@ struct llama_model * llama_load_model_from_file(
 params.main_gpu, params.tensor_split, params.mul_mat_q, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
 memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback,
 params.progress_callback_user_data)) {
+LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
 delete model;
-fprintf(stderr, "%s: failed to load model\n", __func__);
 return nullptr;
 }

@@ -3235,10 +3250,9 @@ struct llama_context * llama_new_context_with_model(
 unsigned percentage = (unsigned) (100 * progress);
 while (percentage > *cur_percentage_p) {
 *cur_percentage_p = percentage;
-
-fflush(stderr);
+LLAMA_LOG_INFO(".");
 if (percentage >= 100) {
-
+LLAMA_LOG_INFO("\n");
 }
 }
 };
@@ -3252,14 +3266,14 @@ struct llama_context * llama_new_context_with_model(
 // reserve memory for context buffers
 if (!params.vocab_only) {
 if (!kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
-
+LLAMA_LOG_ERROR("%s: kv_cache_init() failed for self-attention cache\n", __func__);
 llama_free(ctx);
 return nullptr;
 }

 {
 const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
-
+LLAMA_LOG_INFO("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
 }

 const auto & hparams = ctx->model.hparams;
@@ -3293,14 +3307,14 @@ struct llama_context * llama_new_context_with_model(
 // measure memory requirements for the graph
 size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;

-
+LLAMA_LOG_INFO("%s: compute buffer total size = %7.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);

 // debug - for comparison with scratch buffer
 //size_t prev_req =
 // MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type) +
 // MEM_REQ_SCRATCH1().at(ctx->model.type) +
 // MEM_REQ_EVAL().at(ctx->model.type);
-//
+//LLAMA_LOG_INFO("%s: (debug) equivalent with scratch buffer = %7.2f MB\n", __func__, prev_req / 1024.0 / 1024.0);

 // recreate allocator with exact memory requirements
 ggml_allocr_free(ctx->alloc);
@@ -3336,13 +3350,13 @@ struct llama_context * llama_new_context_with_model(

 const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);

-
+LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);

-#define LLAMA_METAL_CHECK_BUF(result)
-if (!(result)) {
-
-llama_free(ctx);
-return NULL;
+#define LLAMA_METAL_CHECK_BUF(result) \
+if (!(result)) { \
+LLAMA_LOG_ERROR("%s: failed to add buffer\n", __func__); \
+llama_free(ctx); \
+return NULL; \
 }

 LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
@@ -3396,19 +3410,19 @@ int llama_model_quantize(
 llama_model_quantize_internal(fname_inp, fname_out, params);
 return 0;
 } catch (const std::exception & err) {
-
+LLAMA_LOG_ERROR("%s: failed to quantize: %s\n", __func__, err.what());
 return 1;
 }
 }

 int llama_apply_lora_from_file_internal(const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads) {
-
+LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);

 const int64_t t_start_lora_us = ggml_time_us();

 auto fin = std::ifstream(path_lora, std::ios::binary);
 if (!fin) {
-
+LLAMA_LOG_ERROR("%s: failed to open '%s'\n", __func__, path_lora);
 return 1;
 }

@@ -3417,14 +3431,14 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 uint32_t magic;
 fin.read((char *) &magic, sizeof(magic));
 if (magic != LLAMA_FILE_MAGIC_GGLA) {
-
+LLAMA_LOG_ERROR("%s: bad file magic\n", __func__);
 return 1;
 }
 uint32_t format_version;
 fin.read((char *) &format_version, sizeof(format_version));

 if (format_version != 1) {
-
+LLAMA_LOG_ERROR("%s: unsupported file version\n", __func__ );
 return 1;
 }
 }
@@ -3435,7 +3449,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 fin.read((char *) &lora_alpha, sizeof(lora_alpha));
 float scaling = (float)lora_alpha / (float)lora_r;

-
+LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);


 // create a temporary ggml context to store the lora tensors
@@ -3461,7 +3475,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 ggml_context * base_ctx = NULL;
 llama_buffer base_buf;
 if (path_base_model) {
-
+LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
 model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true));

 size_t ctx_size;
@@ -3518,17 +3532,17 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 const std::string lora_suffix = ".lora";
 size_t pos = name.rfind(lora_suffix);
 if (pos == std::string::npos) {
-
+LLAMA_LOG_ERROR("%s: error: '%s' is not a lora tensor\n", __func__, name.c_str());
 return 1;
 }

 std::string lora_type = name.substr(pos + lora_suffix.length());
 std::string base_name = name;
 base_name.erase(pos);
-//
+// LLAMA_LOG_INFO("%s: %s => %s (lora type %s) \n", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());

 if (model_tensors.find(base_name) == model_tensors.end()) {
-
+LLAMA_LOG_ERROR("%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
 return 1;
 }

@@ -3539,7 +3553,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 case 1: wtype = GGML_TYPE_F16; break;
 default:
 {
-
+LLAMA_LOG_ERROR("%s: invalid tensor data type '%d'\n",
 __func__, ftype);
 return false;
 }
@@ -3549,7 +3563,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]);
 }
 else {
-
+LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims);
 return 1;
 }
 ggml_set_name(lora_tensor, "lora_tensor");
@@ -3587,7 +3601,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 if (model_loader) {
 // load from base model
 if (model_loader->tensors_map.name_to_idx.find(base_name) == model_loader->tensors_map.name_to_idx.end()) {
-
+LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
 return 1;
 }
 size_t idx = model_loader->tensors_map.name_to_idx[base_name];
@@ -3603,8 +3617,8 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const

 if (ggml_is_quantized(base_t->type)) {
 if (!warned) {
-
-
+LLAMA_LOG_WARN("%s: warning: using a lora adapter with a quantized model may result in poor quality, "
+"use a f16 or f32 base model with --lora-base\n", __func__);
 warned = true;
 }
 }
@@ -3618,8 +3632,8 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 ggml_set_name(loraB, "loraB");

 if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
-
-
+LLAMA_LOG_ERROR("%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
+" are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]);
 return 1;
 }

@@ -3664,7 +3678,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const

 n_tensors++;
 if (n_tensors % 4 == 0) {
-
+LLAMA_LOG_INFO(".");
 }
 }
 }
@@ -3676,7 +3690,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 }

 const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
-
+LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0);

 return 0;
 }
@@ -3685,7 +3699,7 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
 try {
 return llama_apply_lora_from_file_internal(ctx->model, path_lora, path_base_model, n_threads);
 } catch (const std::exception & err) {
-
+LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
 return 1;
 }
 }
@@ -3694,7 +3708,7 @@ int llama_model_apply_lora_from_file(const struct llama_model * model, const cha
 try {
 return llama_apply_lora_from_file_internal(*model, path_lora, path_base_model, n_threads);
 } catch (const std::exception & err) {
-
+LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
 return 1;
 }
 }
@@ -3743,10 +3757,20 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
 return s_total;
 }

-
-
-
-
+/** copy state data into either a buffer or file depending on the passed in context
+ *
+ * file context:
+ * llama_file file("/path", "wb");
+ * llama_data_file_context data_ctx(&file);
+ * llama_copy_state_data(ctx, &data_ctx);
+ *
+ * buffer context:
+ * std::vector<uint8_t> buf(max_size, 0);
+ * llama_data_buffer_context data_ctx(&buf.data());
+ * llama_copy_state_data(ctx, &data_ctx);
+ *
+ */
+void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
 // copy rng
 {
 std::stringstream rng_ss;
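The doc comment added in the hunk above describes two ways to drive the new data-context writer. A minimal sketch of both call patterns follows; llama_file, llama_data_file_context and llama_data_buffer_context are internal helpers (llama-util.h also changes in this release), so this only compiles inside llama.cpp itself, ctx is assumed to be a valid llama_context pointer, and the file path is illustrative.

    // 1) stream the state directly into a file, as llama_save_session_file() now does
    {
        llama_file file("/tmp/state.bin", "wb");        // illustrative path
        llama_data_file_context data_ctx(&file);
        llama_copy_state_data_internal(ctx, &data_ctx); // every write() lands in the file
    }

    // 2) serialize into a caller-provided buffer through the unchanged public API
    {
        std::vector<uint8_t> buf(llama_get_state_size(ctx), 0);
        const size_t written = llama_copy_state_data(ctx, buf.data());
        // internally this wraps buf.data() in a llama_data_buffer_context and
        // returns data_ctx.get_size_written(), as the hunks below show
    }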
@@ -3758,8 +3782,8 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
 memset(&rng_buf[0], 0, LLAMA_MAX_RNG_STATE);
 memcpy(&rng_buf[0], rng_ss.str().data(), rng_ss.str().size());

-
-
+data_ctx->write(&rng_size, sizeof(rng_size));
+data_ctx->write(&rng_buf[0], LLAMA_MAX_RNG_STATE);
 }

 // copy logits
@@ -3767,25 +3791,29 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
 const size_t logits_cap = ctx->logits.capacity();
 const size_t logits_size = ctx->logits.size();

-
-
+data_ctx->write(&logits_cap, sizeof(logits_cap));
+data_ctx->write(&logits_size, sizeof(logits_size));

 if (logits_size) {
-
+data_ctx->write(ctx->logits.data(), logits_size * sizeof(float));
 }

-
+// If there is a gap between the size and the capacity, write padding
+size_t padding_size = (logits_cap - logits_size) * sizeof(float);
+if (padding_size > 0) {
+std::vector<uint8_t> padding(padding_size, 0); // Create a buffer filled with zeros
+data_ctx->write(padding.data(), padding_size);
+}
 }

 // copy embeddings
 {
 const size_t embedding_size = ctx->embedding.size();

-
+data_ctx->write(&embedding_size, sizeof(embedding_size));

 if (embedding_size) {
-
-out += embedding_size * sizeof(float);
+data_ctx->write(ctx->embedding.data(), embedding_size * sizeof(float));
 }
 }

@@ -3800,8 +3828,8 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
 const size_t kv_size = kv_self.buf.size;
 const int kv_ntok = llama_get_kv_cache_token_count(ctx);

-
-
+data_ctx->write(&kv_size, sizeof(kv_size));
+data_ctx->write(&kv_ntok, sizeof(kv_ntok));

 if (kv_size) {
 const size_t elt_size = ggml_element_size(kv_self.k);
@@ -3810,12 +3838,12 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
 ggml_cgraph gf{};

 ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
-kout3d
-
+std::vector<uint8_t> kout3d_data(ggml_nbytes(kout3d), 0);
+kout3d->data = kout3d_data.data();

 ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
-vout3d
-
+std::vector<uint8_t> vout3d_data(ggml_nbytes(vout3d), 0);
+vout3d->data = vout3d_data.data();

 ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
 n_embd, kv_ntok, n_layer,
@@ -3830,15 +3858,20 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
 ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);

 ggml_free(cpy_ctx);
+
+// our data is now in the kout3d_data and vout3d_data buffers
+// write them to file
+data_ctx->write(kout3d_data.data(), kout3d_data.size());
+data_ctx->write(vout3d_data.data(), vout3d_data.size());
 }
 }
+}

-
-
-
-LLAMA_ASSERT(written <= max_size);
+size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
+llama_data_buffer_context data_ctx(dst);
+llama_copy_state_data_internal(ctx, &data_ctx);

-return
+return data_ctx.get_size_written();
 }

 // Sets the state reading from the specified source address
@@ -3957,7 +3990,7 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c
 const uint32_t version = file.read_u32();

 if (magic != LLAMA_SESSION_MAGIC || version != LLAMA_SESSION_VERSION) {
-
+LLAMA_LOG_ERROR("%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
 return false;
 }

@@ -3965,7 +3998,7 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c
 file.read_raw(&session_hparams, sizeof(llama_hparams));

 if (session_hparams != ctx->model.hparams) {
-
+LLAMA_LOG_INFO("%s : model hparams didn't match from session file!\n", __func__);
 return false;
 }
 }
@@ -3975,7 +4008,7 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c
 const uint32_t n_token_count = file.read_u32();

 if (n_token_count > n_token_capacity) {
-
+LLAMA_LOG_ERROR("%s : token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
 return false;
 }

@@ -3989,7 +4022,7 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c
 const size_t n_state_size_max = llama_get_state_size(ctx);

 if (n_state_size_cur > n_state_size_max) {
-
+LLAMA_LOG_ERROR("%s : the state size in session file is too big! max %zu, got %zu\n", __func__, n_state_size_max, n_state_size_cur);
 return false;
 }

@@ -4006,7 +4039,7 @@ bool llama_load_session_file(struct llama_context * ctx, const char * path_sessi
 try {
 return llama_load_session_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
 } catch (const std::exception & err) {
-
+LLAMA_LOG_ERROR("error loading session file: %s\n", err.what());
 return false;
 }
 }
@@ -4023,15 +4056,9 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi
 file.write_u32((uint32_t) n_token_count);
 file.write_raw(tokens, sizeof(llama_token) * n_token_count);

-// save the context state
-
-
-
-std::vector<uint8_t> state_data(n_state_size_max);
-const size_t n_state_size_cur = llama_copy_state_data(ctx, state_data.data());
-
-file.write_raw(state_data.data(), n_state_size_cur);
-}
+// save the context state using stream saving
+llama_data_file_context data_ctx(&file);
+llama_copy_state_data_internal(ctx, &data_ctx);

 return true;
 }
@@ -4043,7 +4070,7 @@ int llama_eval(
 int n_past,
 int n_threads) {
 if (!llama_eval_internal(*ctx, tokens, nullptr, n_tokens, n_past, n_threads, nullptr)) {
-
+LLAMA_LOG_ERROR("%s: failed to eval\n", __func__);
 return 1;
 }

@@ -4065,7 +4092,7 @@ int llama_eval_embd(
 int n_past,
 int n_threads) {
 if (!llama_eval_internal(*ctx, nullptr, embd, n_tokens, n_past, n_threads, nullptr)) {
-
+LLAMA_LOG_ERROR("%s: failed to eval\n", __func__);
 return 1;
 }

@@ -4086,7 +4113,7 @@ int llama_eval_export(struct llama_context * ctx, const char * fname) {
 const std::vector<llama_token> tmp(n_batch, llama_token_bos());

 if (!llama_eval_internal(*ctx, tmp.data(), nullptr, tmp.size(), n_ctx, 1, fname)) {
-
+LLAMA_LOG_ERROR("%s: failed to eval\n", __func__);
 return 1;
 }

@@ -4102,7 +4129,7 @@ int llama_tokenize_with_model(
 auto res = llama_tokenize(model->vocab, text, add_bos);

 if (n_max_tokens < (int) res.size()) {
-
+LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
 return -((int) res.size());
 }

@@ -4219,15 +4246,15 @@ struct llama_timings llama_get_timings(struct llama_context * ctx) {
 void llama_print_timings(struct llama_context * ctx) {
 const llama_timings timings = llama_get_timings(ctx);

-
-
-
+LLAMA_LOG_INFO("\n");
+LLAMA_LOG_INFO("%s: load time = %8.2f ms\n", __func__, timings.t_load_ms);
+LLAMA_LOG_INFO("%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
 __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
-
+LLAMA_LOG_INFO("%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
 __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
-
+LLAMA_LOG_INFO("%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
 __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
-
+LLAMA_LOG_INFO("%s: total time = %8.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
 }

 void llama_reset_timings(struct llama_context * ctx) {
@@ -4263,3 +4290,44 @@ const char * llama_print_system_info(void) {
 const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
 return ctx->model.tensors_by_name;
 }
+
+
+void llama_log_set(llama_log_callback log_callback, void * user_data) {
+g_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
+g_state.log_callback_user_data = user_data;
+}
+
+#if defined(_MSC_VER) && !defined(vsnprintf)
+#define vsnprintf _vsnprintf
+#endif
+
+static void llama_log_internal_v(llama_log_level level, const char * format, va_list args) {
+va_list args_copy;
+va_copy(args_copy, args);
+char buffer[128];
+int len = vsnprintf(buffer, 128, format, args);
+if (len < 128) {
+g_state.log_callback(level, buffer, g_state.log_callback_user_data);
+} else {
+char* buffer2 = new char[len+1];
+vsnprintf(buffer2, len+1, format, args_copy);
+buffer2[len] = 0;
+g_state.log_callback(level, buffer2, g_state.log_callback_user_data);
+delete[] buffer2;
+}
+va_end(args_copy);
+}
+
+static void llama_log_internal(llama_log_level level, const char * format, ...) {
+va_list args;
+va_start(args, format);
+llama_log_internal_v(level, format, args);
+va_end(args);
+}
+
+static void llama_log_callback_default(llama_log_level level, const char * text, void * user_data) {
+(void) level;
+(void) user_data;
+fputs(text, stderr);
+fflush(stderr);
+}
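With the logging plumbing above and the llama_log_set() entry point it exposes, an application can redirect everything that previously went straight to stderr. A minimal sketch, assuming only the llama_log_level enum, the llama_log_callback typedef and llama_log_set() declared in the updated llama.h that ships in this release; the log file name is illustrative.

    #include <cstdio>
    #include "llama.h"

    // Same shape as llama_log_callback_default() above.
    static void my_log(llama_log_level level, const char * text, void * user_data) {
        FILE * logfile = (FILE *) user_data;
        if (level == LLAMA_LOG_LEVEL_ERROR) {
            std::fputs(text, stderr);      // keep errors visible on stderr as well
        }
        std::fputs(text, logfile);         // mirror every message into the log file
        std::fflush(logfile);
    }

    int main() {
        FILE * logfile = std::fopen("llama.log", "w");
        llama_log_set(my_log, logfile);    // all LLAMA_LOG_* output now goes through my_log
        // ... load a model, evaluate, quantize: progress dots and warnings use the callback
        llama_log_set(nullptr, nullptr);   // a null callback restores llama_log_callback_default
        std::fclose(logfile);
        return 0;
    }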