llama_cpp 0.3.6 → 0.3.8
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -0
- data/ext/llama_cpp/extconf.rb +2 -2
- data/ext/llama_cpp/llama_cpp.cpp +8 -0
- data/ext/llama_cpp/src/ggml-alloc.c +44 -6
- data/ext/llama_cpp/src/ggml-alloc.h +4 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +1398 -702
- data/ext/llama_cpp/src/ggml-cuda.h +19 -23
- data/ext/llama_cpp/src/ggml-metal.h +6 -3
- data/ext/llama_cpp/src/ggml-metal.m +112 -146
- data/ext/llama_cpp/src/ggml-metal.metal +471 -498
- data/ext/llama_cpp/src/ggml.c +396 -150
- data/ext/llama_cpp/src/ggml.h +113 -32
- data/ext/llama_cpp/src/llama-util.h +51 -9
- data/ext/llama_cpp/src/llama.cpp +390 -210
- data/ext/llama_cpp/src/llama.h +20 -1
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +1 -0
- metadata +2 -2
data/ext/llama_cpp/src/llama.cpp CHANGED

@@ -56,7 +56,14 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
-
+static void llama_log_internal(llama_log_level level, const char* format, ...);
+static void llama_log_callback_default(llama_log_level level, const char * text, void * user_data);
+#define LLAMA_LOG_INFO(...)  llama_log_internal(LLAMA_LOG_LEVEL_INFO , __VA_ARGS__)
+#define LLAMA_LOG_WARN(...)  llama_log_internal(LLAMA_LOG_LEVEL_WARN , __VA_ARGS__)
+#define LLAMA_LOG_ERROR(...) llama_log_internal(LLAMA_LOG_LEVEL_ERROR, __VA_ARGS__)
+
+
+#if !defined(GGML_USE_CUBLAS)
 #include "ggml-alloc.h"
 #define LLAMA_USE_ALLOCATOR
 #else
@@ -108,9 +115,9 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
 // memory sizes (calculated for n_batch == 512)
 //
 
-static
+static std::map<e_model, size_t> MEM_REQ_SCRATCH0(int n_ctx)
 {
-
+    std::map<e_model, size_t> k_sizes = {
         { MODEL_3B,  ((size_t) n_ctx / 16ull +  92ull) * MB },
         { MODEL_7B,  ((size_t) n_ctx / 16ull + 100ull) * MB },
         { MODEL_13B, ((size_t) n_ctx / 12ull + 120ull) * MB },
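For a sense of scale, the scratch-0 requirement now grows with the context length instead of being a fixed per-model constant: plugging n_ctx = 2048 into the 7B entry above gives 2048 / 16 + 100 = 228 MB, while n_ctx = 512 gives 512 / 16 + 100 = 132 MB (a worked example, not part of the diff).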
@@ -149,7 +156,7 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
 }
 
 // amount of VRAM needed per batch size to hold temporary results
-// the values for 3b
+// the values for 3b are not derived from testing but instead chosen conservatively
 static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
 {
     static std::map<e_model, size_t> k_sizes = {
@@ -157,14 +164,14 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
         { MODEL_7B,   512ull * kB },
         { MODEL_13B,  640ull * kB },
         { MODEL_30B,  768ull * kB },
-        { MODEL_65B,
-        { MODEL_70B,
+        { MODEL_65B, 1280ull * kB },
+        { MODEL_70B, 1280ull * kB },
     };
     return k_sizes;
 }
 
 // amount of VRAM needed per batch size and context to hold temporary results
-// the values for 3b
+// the values for 3b are not derived from testing but instead chosen conservatively
 static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
 {
     static std::map<e_model, size_t> k_sizes = {
@@ -172,8 +179,8 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
         { MODEL_7B,  128ull },
         { MODEL_13B, 160ull },
         { MODEL_30B, 208ull },
-        { MODEL_65B,
-        { MODEL_70B,
+        { MODEL_65B, 256ull },
+        { MODEL_70B, 256ull },
     };
     return k_sizes;
 }
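These tables feed the scratch-buffer sizing that appears further down in this diff, vram_scratch = n_batch * (vram_scratch_base + n_ctx * vram_scratch_per_context). As a worked example, a 7B model with n_batch = 512 and n_ctx = 2048 comes to 512 * (512 KiB + 2048 * 128 B) = 512 * 786432 B = 384 MB of VRAM scratch, taking kB and MB as the binary constants used elsewhere in this file.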
@@ -438,6 +445,14 @@ struct llama_context {
     }
 };
 
+struct llama_state {
+    // We save the log callback globally
+    llama_log_callback log_callback = llama_log_callback_default;
+    void * log_callback_user_data = nullptr;
+};
+// global state
+static llama_state g_state;
+
 template <typename T>
 static T checked_mul(T a, T b) {
     T ret = a * b;
@@ -504,7 +519,7 @@ struct llama_file_loader {
|
|
504
519
|
|
505
520
|
llama_file_loader(const char * fname, llama_load_tensors_map & tensors_map)
|
506
521
|
: file(fname, "rb") {
|
507
|
-
|
522
|
+
LLAMA_LOG_INFO("llama.cpp: loading model from %s\n", fname);
|
508
523
|
read_magic();
|
509
524
|
read_hparams();
|
510
525
|
read_vocab();
|
@@ -619,7 +634,7 @@ struct llama_file_saver {
|
|
619
634
|
llama_file_loader * any_file_loader;
|
620
635
|
llama_file_saver(const char * fname, llama_file_loader * any_file_loader, enum llama_ftype new_ftype)
|
621
636
|
: file(fname, "wb"), any_file_loader(any_file_loader) {
|
622
|
-
|
637
|
+
LLAMA_LOG_INFO("llama.cpp: saving model to %s\n", fname);
|
623
638
|
write_magic();
|
624
639
|
write_hparams(new_ftype);
|
625
640
|
write_vocab();
|
@@ -640,7 +655,7 @@ struct llama_file_saver {
|
|
640
655
|
}
|
641
656
|
void write_vocab() {
|
642
657
|
if (any_file_loader->file_version == LLAMA_FILE_VERSION_GGML) {
|
643
|
-
|
658
|
+
LLAMA_LOG_WARN("llama.cpp: WARNING: input is an old file that doesn't have scores; will add dummy scores\n");
|
644
659
|
}
|
645
660
|
uint32_t n_vocab = any_file_loader->hparams.n_vocab;
|
646
661
|
for (uint32_t i = 0; i < n_vocab; i++) {
|
@@ -747,12 +762,12 @@ struct llama_model_loader {
|
|
747
762
|
|
748
763
|
void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
|
749
764
|
size_t data_size = 0;
|
750
|
-
size_t prefetch_size =
|
765
|
+
size_t prefetch_size = file_loader->file.size;
|
751
766
|
size_t lock_size = 0;
|
752
767
|
for (const llama_load_tensor & lt : tensors_map.tensors) {
|
753
768
|
data_size += lt.size;
|
754
|
-
if (lt.ggml_tensor->backend
|
755
|
-
prefetch_size
|
769
|
+
if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) {
|
770
|
+
prefetch_size -= lt.size;
|
756
771
|
}
|
757
772
|
}
|
758
773
|
|
@@ -831,7 +846,7 @@ struct llama_model_loader {
|
|
831
846
|
uint8_t byte = lt.data[i];
|
832
847
|
sum = byte + (sum << 6) + (sum << 16) - sum; // sdbm hash
|
833
848
|
}
|
834
|
-
|
849
|
+
LLAMA_LOG_INFO("%s checksum: %#08x (%s, size %zu)\n", lt.name.c_str(), sum,
|
835
850
|
llama_format_tensor_shape(lt.ne).c_str(), lt.size);
|
836
851
|
}
|
837
852
|
|
@@ -864,7 +879,7 @@ static bool kv_cache_init(
|
|
864
879
|
cache.ctx = ggml_init(params);
|
865
880
|
|
866
881
|
if (!cache.ctx) {
|
867
|
-
|
882
|
+
LLAMA_LOG_ERROR("%s: failed to allocate memory for kv cache\n", __func__);
|
868
883
|
return false;
|
869
884
|
}
|
870
885
|
|
@@ -969,7 +984,7 @@ int64_t llama_time_us() {
|
|
969
984
|
// model loading
|
970
985
|
//
|
971
986
|
|
972
|
-
static const char *llama_file_version_name(llama_file_version version) {
|
987
|
+
static const char * llama_file_version_name(llama_file_version version) {
|
973
988
|
switch (version) {
|
974
989
|
case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)";
|
975
990
|
case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
|
@@ -981,7 +996,7 @@ static const char *llama_file_version_name(llama_file_version version) {
|
|
981
996
|
return "unknown";
|
982
997
|
}
|
983
998
|
|
984
|
-
|
999
|
+
const char * llama_ftype_name(enum llama_ftype ftype) {
|
985
1000
|
switch (ftype) {
|
986
1001
|
case LLAMA_FTYPE_ALL_F32: return "all F32";
|
987
1002
|
case LLAMA_FTYPE_MOSTLY_F16: return "mostly F16";
|
@@ -1006,7 +1021,7 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
|
|
1006
1021
|
}
|
1007
1022
|
}
|
1008
1023
|
|
1009
|
-
static const char *llama_model_type_name(e_model type) {
|
1024
|
+
static const char * llama_model_type_name(e_model type) {
|
1010
1025
|
switch (type) {
|
1011
1026
|
case MODEL_3B: return "3B";
|
1012
1027
|
case MODEL_7B: return "7B";
|
@@ -1076,7 +1091,7 @@ static void llama_model_load_internal(
|
|
1076
1091
|
LLAMA_ASSERT(hparams.n_head % n_gqa == 0);
|
1077
1092
|
hparams.n_head_kv = hparams.n_head / n_gqa;
|
1078
1093
|
if (model.type == e_model::MODEL_65B && n_gqa == 8) {
|
1079
|
-
|
1094
|
+
LLAMA_LOG_WARN("%s: warning: assuming 70B model based on GQA == %d\n", __func__, n_gqa);
|
1080
1095
|
model.type = e_model::MODEL_70B;
|
1081
1096
|
hparams.f_ffn_mult = 1.3f; // from the params.json of the 70B model
|
1082
1097
|
}
|
@@ -1092,22 +1107,22 @@ static void llama_model_load_internal(
     //const uint32_t n_ff = 28672;
 
     {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(file_version));
+        LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab);
+        LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, hparams.n_ctx);
+        LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd);
+        LLAMA_LOG_INFO("%s: n_mult = %u\n", __func__, hparams.n_mult);
+        LLAMA_LOG_INFO("%s: n_head = %u\n", __func__, hparams.n_head);
+        LLAMA_LOG_INFO("%s: n_head_kv = %u\n", __func__, hparams.n_head_kv);
+        LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
+        LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
+        LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa());
+        LLAMA_LOG_INFO("%s: rnorm_eps = %.1e\n", __func__, hparams.f_rms_norm_eps);
+        LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, n_ff);
+        LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base);
+        LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale);
+        LLAMA_LOG_INFO("%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
+        LLAMA_LOG_INFO("%s: model size = %s\n", __func__, llama_model_type_name(model.type));
     }
 
     if (file_version < LLAMA_FILE_VERSION_GGJT_V2) {
@@ -1135,7 +1150,7 @@ static void llama_model_load_internal(
|
|
1135
1150
|
size_t ctx_size;
|
1136
1151
|
size_t mmapped_size;
|
1137
1152
|
ml->calc_sizes(&ctx_size, &mmapped_size);
|
1138
|
-
|
1153
|
+
LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0);
|
1139
1154
|
|
1140
1155
|
// create the ggml context
|
1141
1156
|
{
|
@@ -1160,13 +1175,13 @@ static void llama_model_load_internal(
|
|
1160
1175
|
(void) main_gpu;
|
1161
1176
|
(void) mul_mat_q;
|
1162
1177
|
#if defined(GGML_USE_CUBLAS)
|
1163
|
-
|
1178
|
+
LLAMA_LOG_INFO("%s: using CUDA for GPU acceleration\n", __func__);
|
1164
1179
|
ggml_cuda_set_main_device(main_gpu);
|
1165
1180
|
ggml_cuda_set_mul_mat_q(mul_mat_q);
|
1166
1181
|
#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
|
1167
1182
|
#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
|
1168
1183
|
#elif defined(GGML_USE_CLBLAST)
|
1169
|
-
|
1184
|
+
LLAMA_LOG_INFO("%s: using OpenCL for GPU acceleration\n", __func__);
|
1170
1185
|
#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
|
1171
1186
|
#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU
|
1172
1187
|
#else
|
@@ -1271,14 +1286,14 @@ static void llama_model_load_internal(
|
|
1271
1286
|
const size_t mem_required_state =
|
1272
1287
|
scale*hparams.kv_size();
|
1273
1288
|
|
1274
|
-
|
1289
|
+
LLAMA_LOG_INFO("%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
|
1275
1290
|
mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
|
1276
1291
|
|
1277
1292
|
(void) vram_scratch;
|
1278
1293
|
(void) n_batch;
|
1279
1294
|
#ifdef GGML_USE_CUBLAS
|
1280
1295
|
if (low_vram) {
|
1281
|
-
|
1296
|
+
LLAMA_LOG_INFO("%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
|
1282
1297
|
ggml_cuda_set_scratch_size(0); // disable scratch
|
1283
1298
|
} else {
|
1284
1299
|
const size_t vram_scratch_base = VRAM_REQ_SCRATCH_BASE().at(model.type);
|
@@ -1286,7 +1301,7 @@ static void llama_model_load_internal(
|
|
1286
1301
|
vram_scratch = n_batch * (vram_scratch_base + n_ctx * vram_scratch_per_context);
|
1287
1302
|
ggml_cuda_set_scratch_size(vram_scratch);
|
1288
1303
|
if (n_gpu_layers > 0) {
|
1289
|
-
|
1304
|
+
LLAMA_LOG_INFO("%s: allocating batch_size x (%zd kB + n_ctx x %zd B) = %zd MB VRAM for the scratch buffer\n",
|
1290
1305
|
__func__, vram_scratch_base / kB, vram_scratch_per_context,
|
1291
1306
|
(vram_scratch + MB - 1) / MB); // round up
|
1292
1307
|
}
|
@@ -1296,9 +1311,9 @@ static void llama_model_load_internal(
|
|
1296
1311
|
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
|
1297
1312
|
const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
|
1298
1313
|
|
1299
|
-
|
1314
|
+
LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
|
1300
1315
|
if (n_gpu_layers > (int) hparams.n_layer) {
|
1301
|
-
|
1316
|
+
LLAMA_LOG_INFO("%s: offloading non-repeating layers to GPU\n", __func__);
|
1302
1317
|
}
|
1303
1318
|
size_t vram_kv_cache = 0;
|
1304
1319
|
|
@@ -1307,17 +1322,17 @@ static void llama_model_load_internal(
|
|
1307
1322
|
const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
|
1308
1323
|
if (n_gpu_layers > (int) hparams.n_layer + 1) {
|
1309
1324
|
if (low_vram) {
|
1310
|
-
|
1325
|
+
LLAMA_LOG_INFO("%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
|
1311
1326
|
} else {
|
1312
|
-
|
1327
|
+
LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
|
1313
1328
|
vram_kv_cache += hparams.kv_size() / 2;
|
1314
1329
|
}
|
1315
1330
|
}
|
1316
1331
|
if (n_gpu_layers > (int) hparams.n_layer + 2) {
|
1317
1332
|
if (low_vram) {
|
1318
|
-
|
1333
|
+
LLAMA_LOG_WARN("%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
|
1319
1334
|
} else {
|
1320
|
-
|
1335
|
+
LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
|
1321
1336
|
vram_kv_cache += hparams.kv_size() / 2;
|
1322
1337
|
}
|
1323
1338
|
}
|
@@ -1326,9 +1341,9 @@ static void llama_model_load_internal(
|
|
1326
1341
|
const int max_offloadable_layers = hparams.n_layer + 1;
|
1327
1342
|
#endif // GGML_USE_CUBLAS
|
1328
1343
|
|
1329
|
-
|
1344
|
+
LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n",
|
1330
1345
|
__func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
|
1331
|
-
|
1346
|
+
LLAMA_LOG_INFO("%s: total VRAM used: %zu MB\n",
|
1332
1347
|
__func__, (vram_weights + vram_scratch + vram_kv_cache + MB - 1) / MB); // round up
|
1333
1348
|
#else
|
1334
1349
|
(void) n_gpu_layers;
|
@@ -1387,7 +1402,7 @@ static bool llama_model_load(
|
|
1387
1402
|
use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
|
1388
1403
|
return true;
|
1389
1404
|
} catch (const std::exception & err) {
|
1390
|
-
|
1405
|
+
LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
|
1391
1406
|
return false;
|
1392
1407
|
}
|
1393
1408
|
}
|
@@ -1594,11 +1609,11 @@ static struct ggml_cgraph * llama_build_graph(
                 ggml_set_name(Q, "Q");
 
                 struct ggml_tensor * K =
-
-
-
-
-
+                    ggml_view_3d(ctx0, kv_self.k,
+                            n_embd_head, n_past + N, n_head_kv,
+                            ggml_element_size(kv_self.k)*n_embd_gqa,
+                            ggml_element_size(kv_self.k)*n_embd_head,
+                            ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
                 offload_func_kq(K);
                 ggml_set_name(K, "K");
 
@@ -1627,9 +1642,9 @@ static struct ggml_cgraph * llama_build_graph(
                 struct ggml_tensor * V =
                     ggml_view_3d(ctx0, kv_self.v,
                             n_past + N, n_embd_head, n_head_kv,
-
-
-
+                            ggml_element_size(kv_self.v)*n_ctx,
+                            ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
+                            ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
                 offload_func_v(V);
                 ggml_set_name(V, "V");
 
@@ -1751,7 +1766,7 @@ static struct ggml_cgraph * llama_build_graph(
|
|
1751
1766
|
}
|
1752
1767
|
|
1753
1768
|
#if 0
|
1754
|
-
|
1769
|
+
LLAMA_LOG_INFO("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__,
|
1755
1770
|
ggml_used_mem(ctx0)/1024.0/1024.0,
|
1756
1771
|
lctx.get_buf_max_mem(0)/1024.0/1024.0,
|
1757
1772
|
lctx.get_buf_max_mem(1)/1024.0/1024.0,
|
@@ -1784,6 +1799,13 @@ static bool llama_eval_internal(
 
     LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
 
+    LLAMA_ASSERT(n_tokens > 0);
+    LLAMA_ASSERT(n_past >= 0);
+    LLAMA_ASSERT(n_threads > 0);
+    // TODO: keep the values of n_batch and n_ctx
+    // LLAMA_ASSERT(n_tokens <= n_batch);
+    // LLAMA_ASSERT(n_past + n_tokens <= n_ctx);
+
     const int64_t t_start_us = ggml_time_us();
 
 #ifdef GGML_USE_MPI
@@ -1812,7 +1834,7 @@ static bool llama_eval_internal(
|
|
1812
1834
|
ggml_allocr_alloc_graph(lctx.alloc, gf);
|
1813
1835
|
#endif
|
1814
1836
|
|
1815
|
-
//
|
1837
|
+
// LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
|
1816
1838
|
|
1817
1839
|
// for big prompts, if BLAS is enabled, it is better to use only one thread
|
1818
1840
|
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
|
@@ -1830,11 +1852,7 @@ static bool llama_eval_internal(
 #endif
 
 #ifdef GGML_USE_METAL
-    if (lctx.ctx_metal
-        // TODO: disabled until #2413 is resolved
-        //if (!ggml_metal_if_optimized(lctx.ctx_metal)) {
-        //    ggml_metal_graph_find_concurrency(lctx.ctx_metal, gf);
-        //}
+    if (lctx.ctx_metal) {
         ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
         ggml_metal_graph_compute(lctx.ctx_metal, gf);
         ggml_metal_get_tensor (lctx.ctx_metal, res);
@@ -1842,22 +1860,6 @@ static bool llama_eval_internal(
             ggml_metal_get_tensor(lctx.ctx_metal, embeddings);
         }
     } else {
-        // IMPORTANT:
-        // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla
-        // ggml_graph_compute(). It uses Apple's Accelerate CBLAS API which takes advantage of the ANE or the AMX
-        // coprocessor.
-        //
-        // When we implement Matrix x Matrix Metal multiplication, we can avoid this branch.
-        // But for now, we have focused only on Matrix x Vector Metal multiplication.
-        //
-        // TODO: avoid these syncs via shared memory (ref #1696)
-        //
-        if (lctx.ctx_metal) {
-            // We need to sync the GPU KV cache with the CPU KV cache
-            ggml_metal_get_tensor(lctx.ctx_metal, kv_self.k);
-            ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
-        }
-
         ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
     }
 #else
@@ -1999,7 +2001,7 @@ struct llama_tokenizer {
|
|
1999
2001
|
left_sym.n += right_sym.n;
|
2000
2002
|
right_sym.n = 0;
|
2001
2003
|
|
2002
|
-
//
|
2004
|
+
//LLAMA_LOG_INFO("left = '%*s' size = %zu\n", (int) left_sym.n, left_sym.text, bigram.size);
|
2003
2005
|
|
2004
2006
|
// remove the right sym from the chain
|
2005
2007
|
left_sym.next = right_sym.next;
|
@@ -2082,37 +2084,81 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
 // grammar - internal
 //
 
+struct llama_partial_utf8 {
+    uint32_t value;    // bit value so far (unshifted)
+    int      n_remain; // num bytes remaining; -1 indicates invalid sequence
+};
+
 struct llama_grammar {
     const std::vector<std::vector<llama_grammar_element>> rules;
     std::vector<std::vector<const llama_grammar_element *>> stacks;
+
+    // buffer for partially generated UTF-8 sequence from accepted tokens
+    llama_partial_utf8 partial_utf8;
 };
 
 struct llama_grammar_candidate {
-    size_t
-    const uint32_t
+    size_t             index;
+    const uint32_t   * code_points;
+    llama_partial_utf8 partial_utf8;
 };
 
-//
-//
-std::vector<uint32_t> decode_utf8(
-
+// Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as
+// pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
+std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
+        const char * src,
+        llama_partial_utf8 partial_start) {
+    static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
     const char * pos = src;
     std::vector<uint32_t> code_points;
+    uint32_t value = partial_start.value;
+    int n_remain = partial_start.n_remain;
+
+    // continue previous decode, if applicable
+    while (*pos != 0 && n_remain > 0) {
+        uint8_t next_byte = static_cast<uint8_t>(*pos);
+        if ((next_byte >> 6) != 2) {
+            // invalid sequence, abort
+            code_points.push_back(0);
+            return std::make_pair(std::move(code_points), llama_partial_utf8{ 0, -1 });
+        }
+        value = (value << 6) + (next_byte & 0x3F);
+        ++pos;
+        --n_remain;
+    }
+
+    if (partial_start.n_remain > 0 && n_remain == 0) {
+        code_points.push_back(value);
+    }
+
+    // decode any subsequent utf-8 sequences, which may end in an incomplete one
     while (*pos != 0) {
         uint8_t first_byte = static_cast<uint8_t>(*pos);
         uint8_t highbits = first_byte >> 4;
-
-
-
-
+        n_remain = lookup[highbits] - 1;
+
+        if (n_remain < 0) {
+            // invalid sequence, abort
+            code_points.clear();
+            code_points.push_back(0);
+            return std::make_pair(std::move(code_points), llama_partial_utf8{ 0, n_remain });
+        }
+
+        uint8_t mask = (1 << (7 - n_remain)) - 1;
+        value = first_byte & mask;
         ++pos;
-
+        while (*pos != 0 && n_remain > 0) {
             value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
+            ++pos;
+            --n_remain;
+        }
+        if (n_remain == 0) {
+            code_points.push_back(value);
         }
-        code_points.push_back(value);
     }
     code_points.push_back(0);
-
+
+    return std::make_pair(std::move(code_points), llama_partial_utf8{ value, n_remain });
 }
 
 // returns true iff pos points to the end of one of the definitions of a rule
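Because decode_utf8 is a file-static helper, the sketch below is illustrative only; it assumes being compiled in the same translation unit (for example, a test built into llama.cpp). It walks U+20AC ("€", bytes E2 82 AC) split across two token strings and shows how llama_partial_utf8 carries the half-decoded character from one call to the next; the expected values in the comments follow directly from the code above.

```cpp
// Illustrative sketch, not part of the diff: decode_utf8() is static inside llama.cpp.
llama_partial_utf8 state = { 0, 0 };

// First chunk ends in the middle of a character: only the lead byte 0xE2 arrives.
auto first = decode_utf8("\xE2", state);
// first.first  == { 0 }          -- no complete code point yet, just the 0 terminator
// first.second == { 0x02, 2 }    -- lead-byte payload, 2 continuation bytes still owed

// The next chunk supplies the two continuation bytes and completes U+20AC.
auto second = decode_utf8("\x82\xAC", first.second);
// second.first  == { 0x20AC, 0 } -- the finished code point, then the terminator
// second.second == { 0x20AC, 0 } -- n_remain == 0: nothing left pending
```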
@@ -2149,6 +2195,56 @@ static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
     return std::make_pair(found == is_positive_char, pos);
 }
 
+// returns true iff some continuation of the given partial UTF-8 sequence could satisfy the char
+// range at pos (regular or inverse range)
+// asserts that pos is pointing to a char range element
+static bool llama_grammar_match_partial_char(
+        const llama_grammar_element * pos,
+        const llama_partial_utf8      partial_utf8) {
+
+    bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
+    LLAMA_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT);
+
+    uint32_t partial_value = partial_utf8.value;
+    int      n_remain      = partial_utf8.n_remain;
+
+    // invalid sequence or 7-bit char split across 2 bytes (overlong)
+    if (n_remain < 0 || (n_remain == 1 && partial_value < 2)) {
+        return false;
+    }
+
+    // range of possible code points this partial UTF-8 sequence could complete to
+    uint32_t low  = partial_value << (n_remain * 6);
+    uint32_t high = low | ((1 << (n_remain * 6)) - 1);
+
+    if (low == 0) {
+        if (n_remain == 2) {
+            low = 1 << 11;
+        } else if (n_remain == 3) {
+            low = 1 << 16;
+        }
+    }
+
+    do {
+        if (pos[1].type == LLAMA_GRETYPE_CHAR_RNG_UPPER) {
+            // inclusive range, e.g. [a-z]
+            if (pos->value <= high && low <= pos[1].value) {
+                return is_positive_char;
+            }
+            pos += 2;
+        } else {
+            // exact char match, e.g. [a] or "a"
+            if (low <= pos->value && pos->value <= high) {
+                return is_positive_char;
+            }
+            pos += 1;
+        }
+    } while (pos->type == LLAMA_GRETYPE_CHAR_ALT);
+
+    return !is_positive_char;
+}
+
+
 // transforms a grammar pushdown stack into N possible stacks, all ending
 // at a character range (terminal element)
 static void llama_grammar_advance_stack(
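As a worked example of the range computation above, continuing the split-"€" case: if a token ends right after the 0xE2 lead byte, the grammar sees partial_utf8 = { 0x02, 2 }, so low = 0x02 << 12 = 0x2000 and high = 0x2FFF. A positive range such as [a-z] (U+0061..U+007A) cannot intersect [0x2000, 0x2FFF], so the candidate is rejected, while a range that covers U+20AC does intersect it and the candidate is kept until the remaining bytes arrive.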
@@ -2249,8 +2345,11 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
|
|
2249
2345
|
std::vector<llama_grammar_candidate> rejects;
|
2250
2346
|
|
2251
2347
|
if (stack.empty()) {
|
2252
|
-
|
2253
|
-
|
2348
|
+
for (auto tok : candidates) {
|
2349
|
+
if (*tok.code_points != 0 || tok.partial_utf8.n_remain != 0) {
|
2350
|
+
rejects.push_back(tok);
|
2351
|
+
}
|
2352
|
+
}
|
2254
2353
|
return rejects;
|
2255
2354
|
}
|
2256
2355
|
|
@@ -2258,10 +2357,15 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
|
|
2258
2357
|
|
2259
2358
|
std::vector<llama_grammar_candidate> next_candidates;
|
2260
2359
|
for (auto tok : candidates) {
|
2261
|
-
if (
|
2262
|
-
|
2263
|
-
|
2360
|
+
if (*tok.code_points == 0) {
|
2361
|
+
// reached end of full codepoints in token, reject iff it ended in a partial sequence
|
2362
|
+
// that cannot satisfy this position in grammar
|
2363
|
+
if (tok.partial_utf8.n_remain != 0 &&
|
2364
|
+
!llama_grammar_match_partial_char(stack_pos, tok.partial_utf8)) {
|
2365
|
+
rejects.push_back(tok);
|
2264
2366
|
}
|
2367
|
+
} else if (llama_grammar_match_char(stack_pos, *tok.code_points).first) {
|
2368
|
+
next_candidates.push_back({ tok.index, tok.code_points + 1, tok.partial_utf8 });
|
2265
2369
|
} else {
|
2266
2370
|
rejects.push_back(tok);
|
2267
2371
|
}
|
@@ -2279,7 +2383,7 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
|
|
2279
2383
|
|
2280
2384
|
auto next_rejects = llama_grammar_reject_candidates(rules, next_stacks, next_candidates);
|
2281
2385
|
for (auto tok : next_rejects) {
|
2282
|
-
rejects.push_back({ tok.index, tok.code_points - 1 });
|
2386
|
+
rejects.push_back({ tok.index, tok.code_points - 1, tok.partial_utf8 });
|
2283
2387
|
}
|
2284
2388
|
|
2285
2389
|
return rejects;
|
@@ -2344,7 +2448,7 @@ struct llama_grammar * llama_grammar_init(
|
|
2344
2448
|
}
|
2345
2449
|
} while (true);
|
2346
2450
|
|
2347
|
-
return new llama_grammar{ std::move(vec_rules), std::move(stacks) };
|
2451
|
+
return new llama_grammar{ std::move(vec_rules), std::move(stacks), {} };
|
2348
2452
|
}
|
2349
2453
|
|
2350
2454
|
void llama_grammar_free(struct llama_grammar * grammar) {
|
@@ -2650,8 +2754,8 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
|
|
2650
2754
|
|
2651
2755
|
const llama_token eos = llama_token_eos();
|
2652
2756
|
|
2653
|
-
std::vector<std::vector<uint32_t>>
|
2654
|
-
std::vector<llama_grammar_candidate>
|
2757
|
+
std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
|
2758
|
+
std::vector<llama_grammar_candidate> candidates_grammar;
|
2655
2759
|
|
2656
2760
|
for (size_t i = 0; i < candidates->size; ++i) {
|
2657
2761
|
const llama_token id = candidates->data[i].id;
|
@@ -2663,8 +2767,10 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
|
|
2663
2767
|
} else if (*str == 0) {
|
2664
2768
|
candidates->data[i].logit = -INFINITY;
|
2665
2769
|
} else {
|
2666
|
-
candidates_decoded.push_back(decode_utf8(str));
|
2667
|
-
candidates_grammar.push_back({
|
2770
|
+
candidates_decoded.push_back(decode_utf8(str, grammar->partial_utf8));
|
2771
|
+
candidates_grammar.push_back({
|
2772
|
+
i, candidates_decoded.back().first.data(), candidates_decoded.back().second
|
2773
|
+
});
|
2668
2774
|
}
|
2669
2775
|
}
|
2670
2776
|
|
@@ -2865,11 +2971,14 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
|
|
2865
2971
|
}
|
2866
2972
|
|
2867
2973
|
const char * str = llama_token_to_str(ctx, token);
|
2974
|
+
|
2868
2975
|
// Note terminating 0 in decoded string
|
2869
|
-
auto
|
2976
|
+
const auto decoded = decode_utf8(str, grammar->partial_utf8);
|
2977
|
+
const auto & code_points = decoded.first;
|
2870
2978
|
for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
|
2871
2979
|
grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
|
2872
2980
|
}
|
2981
|
+
grammar->partial_utf8 = decoded.second;
|
2873
2982
|
LLAMA_ASSERT(!grammar->stacks.empty());
|
2874
2983
|
|
2875
2984
|
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
@@ -3007,7 +3116,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
3007
3116
|
tensor.data = read_data.addr;
|
3008
3117
|
model_loader->load_data_for(tensor);
|
3009
3118
|
|
3010
|
-
|
3119
|
+
LLAMA_LOG_INFO("[%4zu/%4zu] %36s - %16s, type = %6s, ",
|
3011
3120
|
++idx, model_loader->tensors_map.tensors.size(),
|
3012
3121
|
tensor.name.c_str(), llama_format_tensor_shape(tensor.ne).c_str(),
|
3013
3122
|
ggml_type_name(tensor.type));
|
@@ -3029,7 +3138,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
3029
3138
|
new_type = tensor.type;
|
3030
3139
|
new_data = tensor.data;
|
3031
3140
|
new_size = tensor.size;
|
3032
|
-
|
3141
|
+
LLAMA_LOG_INFO("size = %8.3f MB\n", tensor.size/1024.0/1024.0);
|
3033
3142
|
} else {
|
3034
3143
|
new_type = quantized_type;
|
3035
3144
|
#ifdef GGML_USE_K_QUANTS
|
@@ -3064,17 +3173,17 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
3064
3173
|
int nx = tensor.ne.at(0);
|
3065
3174
|
int ny = tensor.ne.at(1);
|
3066
3175
|
if (nx % QK_K != 0 || ny % QK_K != 0) {
|
3067
|
-
|
3176
|
+
LLAMA_LOG_INFO("\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
|
3068
3177
|
convert_incompatible_tensor = true;
|
3069
3178
|
}
|
3070
3179
|
}
|
3071
3180
|
if (convert_incompatible_tensor) {
|
3072
3181
|
if (tensor.name == "output.weight") {
|
3073
3182
|
new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
|
3074
|
-
|
3183
|
+
LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
|
3075
3184
|
} else if (tensor.name == "tok_embeddings.weight") {
|
3076
3185
|
new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
|
3077
|
-
|
3186
|
+
LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
|
3078
3187
|
} else {
|
3079
3188
|
throw std::runtime_error("Unsupported tensor size encountered\n");
|
3080
3189
|
}
|
@@ -3094,7 +3203,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
3094
3203
|
f32_data = (float *) f32_conv_buf.addr;
|
3095
3204
|
}
|
3096
3205
|
|
3097
|
-
|
3206
|
+
LLAMA_LOG_INFO("quantizing to %s .. ", ggml_type_name(new_type));
|
3098
3207
|
fflush(stdout);
|
3099
3208
|
|
3100
3209
|
work.resize(nelements * 4); // upper bound on size
|
@@ -3144,7 +3253,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
3144
3253
|
}
|
3145
3254
|
}
|
3146
3255
|
|
3147
|
-
|
3256
|
+
LLAMA_LOG_INFO("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
|
3148
3257
|
int64_t tot_count = 0;
|
3149
3258
|
for (size_t i = 0; i < hist_cur.size(); i++) {
|
3150
3259
|
hist_all[i] += hist_cur[i];
|
@@ -3153,18 +3262,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
3153
3262
|
|
3154
3263
|
if (tot_count > 0) {
|
3155
3264
|
for (size_t i = 0; i < hist_cur.size(); i++) {
|
3156
|
-
|
3265
|
+
LLAMA_LOG_INFO("%5.3f ", hist_cur[i] / float(nelements));
|
3157
3266
|
}
|
3158
3267
|
}
|
3159
|
-
|
3268
|
+
LLAMA_LOG_INFO("\n");
|
3160
3269
|
}
|
3161
3270
|
total_size_org += tensor.size;
|
3162
3271
|
total_size_new += new_size;
|
3163
3272
|
file_saver.write_tensor(tensor, new_type, new_data, new_size);
|
3164
3273
|
}
|
3165
3274
|
|
3166
|
-
|
3167
|
-
|
3275
|
+
LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
|
3276
|
+
LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
|
3168
3277
|
|
3169
3278
|
{
|
3170
3279
|
int64_t sum_all = 0;
|
@@ -3173,11 +3282,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
3173
3282
|
}
|
3174
3283
|
|
3175
3284
|
if (sum_all > 0) {
|
3176
|
-
|
3285
|
+
LLAMA_LOG_INFO("%s: hist: ", __func__);
|
3177
3286
|
for (size_t i = 0; i < hist_all.size(); i++) {
|
3178
|
-
|
3287
|
+
LLAMA_LOG_INFO("%5.3f ", hist_all[i] / float(sum_all));
|
3179
3288
|
}
|
3180
|
-
|
3289
|
+
LLAMA_LOG_INFO("\n");
|
3181
3290
|
}
|
3182
3291
|
}
|
3183
3292
|
}
|
@@ -3201,8 +3310,8 @@ struct llama_model * llama_load_model_from_file(
|
|
3201
3310
|
params.main_gpu, params.tensor_split, params.mul_mat_q, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
|
3202
3311
|
memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback,
|
3203
3312
|
params.progress_callback_user_data)) {
|
3313
|
+
LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
|
3204
3314
|
delete model;
|
3205
|
-
fprintf(stderr, "%s: failed to load model\n", __func__);
|
3206
3315
|
return nullptr;
|
3207
3316
|
}
|
3208
3317
|
|
@@ -3235,10 +3344,9 @@ struct llama_context * llama_new_context_with_model(
|
|
3235
3344
|
unsigned percentage = (unsigned) (100 * progress);
|
3236
3345
|
while (percentage > *cur_percentage_p) {
|
3237
3346
|
*cur_percentage_p = percentage;
|
3238
|
-
|
3239
|
-
fflush(stderr);
|
3347
|
+
LLAMA_LOG_INFO(".");
|
3240
3348
|
if (percentage >= 100) {
|
3241
|
-
|
3349
|
+
LLAMA_LOG_INFO("\n");
|
3242
3350
|
}
|
3243
3351
|
}
|
3244
3352
|
};
|
@@ -3252,14 +3360,14 @@ struct llama_context * llama_new_context_with_model(
|
|
3252
3360
|
// reserve memory for context buffers
|
3253
3361
|
if (!params.vocab_only) {
|
3254
3362
|
if (!kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
|
3255
|
-
|
3363
|
+
LLAMA_LOG_ERROR("%s: kv_cache_init() failed for self-attention cache\n", __func__);
|
3256
3364
|
llama_free(ctx);
|
3257
3365
|
return nullptr;
|
3258
3366
|
}
|
3259
3367
|
|
3260
3368
|
{
|
3261
3369
|
const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
|
3262
|
-
|
3370
|
+
LLAMA_LOG_INFO("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
|
3263
3371
|
}
|
3264
3372
|
|
3265
3373
|
const auto & hparams = ctx->model.hparams;
|
@@ -3289,24 +3397,40 @@ struct llama_context * llama_new_context_with_model(
|
|
3289
3397
|
int n_past = hparams.n_ctx - n_tokens;
|
3290
3398
|
llama_token token = llama_token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
|
3291
3399
|
ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past);
|
3292
|
-
|
3400
|
+
#ifdef GGML_USE_METAL
|
3401
|
+
if (params.n_gpu_layers > 0) {
|
3402
|
+
ctx->ctx_metal = ggml_metal_init(1);
|
3403
|
+
if (!ctx->ctx_metal) {
|
3404
|
+
LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
|
3405
|
+
llama_free(ctx);
|
3406
|
+
return NULL;
|
3407
|
+
}
|
3408
|
+
ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false);
|
3409
|
+
ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
|
3410
|
+
}
|
3411
|
+
#endif
|
3293
3412
|
// measure memory requirements for the graph
|
3294
3413
|
size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
|
3295
3414
|
|
3296
|
-
|
3415
|
+
LLAMA_LOG_INFO("%s: compute buffer total size = %7.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
|
3297
3416
|
|
3298
3417
|
// debug - for comparison with scratch buffer
|
3299
3418
|
//size_t prev_req =
|
3300
3419
|
// MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type) +
|
3301
3420
|
// MEM_REQ_SCRATCH1().at(ctx->model.type) +
|
3302
3421
|
// MEM_REQ_EVAL().at(ctx->model.type);
|
3303
|
-
//
|
3422
|
+
//LLAMA_LOG_INFO("%s: (debug) equivalent with scratch buffer = %7.2f MB\n", __func__, prev_req / 1024.0 / 1024.0);
|
3304
3423
|
|
3305
3424
|
// recreate allocator with exact memory requirements
|
3306
3425
|
ggml_allocr_free(ctx->alloc);
|
3307
3426
|
|
3308
3427
|
ctx->buf_alloc.resize(alloc_size);
|
3309
3428
|
ctx->alloc = ggml_allocr_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, tensor_alignment);
|
3429
|
+
#ifdef GGML_USE_METAL
|
3430
|
+
if (ctx->ctx_metal) {
|
3431
|
+
ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
|
3432
|
+
}
|
3433
|
+
#endif
|
3310
3434
|
}
|
3311
3435
|
#else
|
3312
3436
|
ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead());
|
@@ -3321,7 +3445,6 @@ struct llama_context * llama_new_context_with_model(
|
|
3321
3445
|
#ifdef GGML_USE_METAL
|
3322
3446
|
if (params.n_gpu_layers > 0) {
|
3323
3447
|
// this allocates all Metal resources and memory buffers
|
3324
|
-
ctx->ctx_metal = ggml_metal_init(1);
|
3325
3448
|
|
3326
3449
|
void * data_ptr = NULL;
|
3327
3450
|
size_t data_size = 0;
|
@@ -3336,13 +3459,13 @@ struct llama_context * llama_new_context_with_model(
|
|
3336
3459
|
|
3337
3460
|
const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
|
3338
3461
|
|
3339
|
-
|
3462
|
+
LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
|
3340
3463
|
|
3341
|
-
#define LLAMA_METAL_CHECK_BUF(result)
|
3342
|
-
if (!(result)) {
|
3343
|
-
|
3344
|
-
llama_free(ctx);
|
3345
|
-
return NULL;
|
3464
|
+
#define LLAMA_METAL_CHECK_BUF(result) \
|
3465
|
+
if (!(result)) { \
|
3466
|
+
LLAMA_LOG_ERROR("%s: failed to add buffer\n", __func__); \
|
3467
|
+
llama_free(ctx); \
|
3468
|
+
return NULL; \
|
3346
3469
|
}
|
3347
3470
|
|
3348
3471
|
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
|
@@ -3350,8 +3473,7 @@ struct llama_context * llama_new_context_with_model(
|
|
3350
3473
|
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0));
|
3351
3474
|
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.addr, ctx->kv_self.buf.size, 0));
|
3352
3475
|
|
3353
|
-
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "
|
3354
|
-
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0));
|
3476
|
+
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.addr, ctx->buf_alloc.size, 0));
|
3355
3477
|
#undef LLAMA_METAL_CHECK_BUF
|
3356
3478
|
}
|
3357
3479
|
#endif
|
@@ -3396,19 +3518,19 @@ int llama_model_quantize(
|
|
3396
3518
|
llama_model_quantize_internal(fname_inp, fname_out, params);
|
3397
3519
|
return 0;
|
3398
3520
|
} catch (const std::exception & err) {
|
3399
|
-
|
3521
|
+
LLAMA_LOG_ERROR("%s: failed to quantize: %s\n", __func__, err.what());
|
3400
3522
|
return 1;
|
3401
3523
|
}
|
3402
3524
|
}
|
3403
3525
|
|
3404
3526
|
int llama_apply_lora_from_file_internal(const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads) {
|
3405
|
-
|
3527
|
+
LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
|
3406
3528
|
|
3407
3529
|
const int64_t t_start_lora_us = ggml_time_us();
|
3408
3530
|
|
3409
3531
|
auto fin = std::ifstream(path_lora, std::ios::binary);
|
3410
3532
|
if (!fin) {
|
3411
|
-
|
3533
|
+
LLAMA_LOG_ERROR("%s: failed to open '%s'\n", __func__, path_lora);
|
3412
3534
|
return 1;
|
3413
3535
|
}
|
3414
3536
|
|
@@ -3417,14 +3539,14 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
|
|
3417
3539
|
uint32_t magic;
|
3418
3540
|
fin.read((char *) &magic, sizeof(magic));
|
3419
3541
|
if (magic != LLAMA_FILE_MAGIC_GGLA) {
|
3420
|
-
|
3542
|
+
LLAMA_LOG_ERROR("%s: bad file magic\n", __func__);
|
3421
3543
|
return 1;
|
3422
3544
|
}
|
3423
3545
|
uint32_t format_version;
|
3424
3546
|
fin.read((char *) &format_version, sizeof(format_version));
|
3425
3547
|
|
3426
3548
|
if (format_version != 1) {
|
3427
|
-
|
3549
|
+
LLAMA_LOG_ERROR("%s: unsupported file version\n", __func__ );
|
3428
3550
|
return 1;
|
3429
3551
|
}
|
3430
3552
|
}
|
@@ -3435,7 +3557,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
|
|
3435
3557
|
fin.read((char *) &lora_alpha, sizeof(lora_alpha));
|
3436
3558
|
float scaling = (float)lora_alpha / (float)lora_r;
|
3437
3559
|
|
3438
|
-
|
3560
|
+
LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
|
3439
3561
|
|
3440
3562
|
|
3441
3563
|
// create a temporary ggml context to store the lora tensors
|
@@ -3461,7 +3583,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
|
|
3461
3583
|
ggml_context * base_ctx = NULL;
|
3462
3584
|
llama_buffer base_buf;
|
3463
3585
|
if (path_base_model) {
|
3464
|
-
|
3586
|
+
LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
|
3465
3587
|
model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true));
|
3466
3588
|
|
3467
3589
|
size_t ctx_size;
|
@@ -3518,17 +3640,17 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
|
|
3518
3640
|
const std::string lora_suffix = ".lora";
|
3519
3641
|
size_t pos = name.rfind(lora_suffix);
|
3520
3642
|
if (pos == std::string::npos) {
|
3521
|
-
|
3643
|
+
LLAMA_LOG_ERROR("%s: error: '%s' is not a lora tensor\n", __func__, name.c_str());
|
3522
3644
|
return 1;
|
3523
3645
|
}
|
3524
3646
|
|
3525
3647
|
std::string lora_type = name.substr(pos + lora_suffix.length());
|
3526
3648
|
std::string base_name = name;
|
3527
3649
|
base_name.erase(pos);
|
3528
|
-
//
|
3650
|
+
// LLAMA_LOG_INFO("%s: %s => %s (lora type %s) \n", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());
|
3529
3651
|
|
3530
3652
|
if (model_tensors.find(base_name) == model_tensors.end()) {
|
3531
|
-
|
3653
|
+
LLAMA_LOG_ERROR("%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
|
3532
3654
|
return 1;
|
3533
3655
|
}
|
3534
3656
|
|
@@ -3539,7 +3661,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
|
|
3539
3661
|
case 1: wtype = GGML_TYPE_F16; break;
|
3540
3662
|
default:
|
3541
3663
|
{
|
3542
|
-
|
3664
|
+
LLAMA_LOG_ERROR("%s: invalid tensor data type '%d'\n",
|
3543
3665
|
__func__, ftype);
|
3544
3666
|
return false;
|
3545
3667
|
}
|
@@ -3549,7 +3671,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
|
|
3549
3671
|
lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]);
|
3550
3672
|
}
|
3551
3673
|
else {
|
3552
|
-
|
3674
|
+
LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims);
|
3553
3675
|
return 1;
|
3554
3676
|
}
|
3555
3677
|
ggml_set_name(lora_tensor, "lora_tensor");
|
@@ -3587,7 +3709,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
|
|
3587
3709
|
if (model_loader) {
|
3588
3710
|
// load from base model
|
3589
3711
|
if (model_loader->tensors_map.name_to_idx.find(base_name) == model_loader->tensors_map.name_to_idx.end()) {
|
3590
|
-
|
3712
|
+
LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
|
3591
3713
|
return 1;
|
3592
3714
|
}
|
3593
3715
|
size_t idx = model_loader->tensors_map.name_to_idx[base_name];
|
@@ -3603,8 +3725,8 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
|
|
3603
3725
|
|
3604
3726
|
if (ggml_is_quantized(base_t->type)) {
|
3605
3727
|
if (!warned) {
|
3606
|
-
|
3607
|
-
|
3728
|
+
LLAMA_LOG_WARN("%s: warning: using a lora adapter with a quantized model may result in poor quality, "
|
3729
|
+
"use a f16 or f32 base model with --lora-base\n", __func__);
|
3608
3730
|
warned = true;
|
3609
3731
|
}
|
3610
3732
|
}
|
@@ -3618,8 +3740,8 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
|
|
3618
3740
|
ggml_set_name(loraB, "loraB");
|
3619
3741
|
|
3620
3742
|
if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
|
3621
|
-
|
3622
|
-
|
3743
|
+
LLAMA_LOG_ERROR("%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
|
3744
|
+
" are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]);
|
3623
3745
|
return 1;
|
3624
3746
|
}
|
3625
3747
|
|
@@ -3664,7 +3786,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
|
|
3664
3786
|
|
3665
3787
|
n_tensors++;
|
3666
3788
|
if (n_tensors % 4 == 0) {
|
3667
|
-
|
3789
|
+
LLAMA_LOG_INFO(".");
|
3668
3790
|
}
|
3669
3791
|
}
|
3670
3792
|
}
|
@@ -3676,7 +3798,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
|
|
3676
3798
|
}
|
3677
3799
|
|
3678
3800
|
const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
|
3679
|
-
|
3801
|
+
LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0);
|
3680
3802
|
|
3681
3803
|
return 0;
|
3682
3804
|
}
|
@@ -3685,7 +3807,7 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
|
|
3685
3807
|
try {
|
3686
3808
|
return llama_apply_lora_from_file_internal(ctx->model, path_lora, path_base_model, n_threads);
|
3687
3809
|
} catch (const std::exception & err) {
|
3688
|
-
|
3810
|
+
LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
|
3689
3811
|
return 1;
|
3690
3812
|
}
|
3691
3813
|
}
|
@@ -3694,7 +3816,7 @@ int llama_model_apply_lora_from_file(const struct llama_model * model, const cha
|
|
3694
3816
|
try {
|
3695
3817
|
return llama_apply_lora_from_file_internal(*model, path_lora, path_base_model, n_threads);
|
3696
3818
|
} catch (const std::exception & err) {
|
3697
|
-
|
3819
|
+
LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
|
3698
3820
|
return 1;
|
3699
3821
|
}
|
3700
3822
|
}
|
@@ -3743,10 +3865,20 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
     return s_total;
 }
 
-
-
-
-
+/** copy state data into either a buffer or file depending on the passed in context
+ *
+ * file context:
+ * llama_file file("/path", "wb");
+ * llama_data_file_context data_ctx(&file);
+ * llama_copy_state_data(ctx, &data_ctx);
+ *
+ * buffer context:
+ * std::vector<uint8_t> buf(max_size, 0);
+ * llama_data_buffer_context data_ctx(&buf.data());
+ * llama_copy_state_data(ctx, &data_ctx);
+ *
+*/
+void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
     // copy rng
     {
         std::stringstream rng_ss;
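The comment block above documents the two llama_data_context flavours used internally. From the caller's side, the buffer-based public API is unchanged; a minimal sketch of saving and restoring state through it, assuming an already initialized llama_context (llama_set_state_data is the counterpart declared alongside llama_copy_state_data in llama.h):

```cpp
// Minimal sketch of the buffer path, assuming an initialized llama_context * ctx.
#include <vector>
#include "llama.h"

std::vector<uint8_t> save_state(llama_context * ctx) {
    std::vector<uint8_t> buf(llama_get_state_size(ctx)); // upper bound on the state size
    const size_t written = llama_copy_state_data(ctx, buf.data());
    buf.resize(written);                                  // keep only what was actually written
    return buf;
}

void restore_state(llama_context * ctx, std::vector<uint8_t> & buf) {
    llama_set_state_data(ctx, buf.data());
}
```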
@@ -3758,8 +3890,8 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
|
|
3758
3890
|
memset(&rng_buf[0], 0, LLAMA_MAX_RNG_STATE);
|
3759
3891
|
memcpy(&rng_buf[0], rng_ss.str().data(), rng_ss.str().size());
|
3760
3892
|
|
3761
|
-
|
3762
|
-
|
3893
|
+
data_ctx->write(&rng_size, sizeof(rng_size));
|
3894
|
+
data_ctx->write(&rng_buf[0], LLAMA_MAX_RNG_STATE);
|
3763
3895
|
}
|
3764
3896
|
|
3765
3897
|
// copy logits
|
@@ -3767,25 +3899,29 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
|
|
3767
3899
|
const size_t logits_cap = ctx->logits.capacity();
|
3768
3900
|
const size_t logits_size = ctx->logits.size();
|
3769
3901
|
|
3770
|
-
|
3771
|
-
|
3902
|
+
data_ctx->write(&logits_cap, sizeof(logits_cap));
|
3903
|
+
data_ctx->write(&logits_size, sizeof(logits_size));
|
3772
3904
|
|
3773
3905
|
if (logits_size) {
|
3774
|
-
|
3906
|
+
data_ctx->write(ctx->logits.data(), logits_size * sizeof(float));
|
3775
3907
|
}
|
3776
3908
|
|
3777
|
-
|
3909
|
+
// If there is a gap between the size and the capacity, write padding
|
3910
|
+
size_t padding_size = (logits_cap - logits_size) * sizeof(float);
|
3911
|
+
if (padding_size > 0) {
|
3912
|
+
std::vector<uint8_t> padding(padding_size, 0); // Create a buffer filled with zeros
|
3913
|
+
data_ctx->write(padding.data(), padding_size);
|
3914
|
+
}
|
3778
3915
|
}
|
3779
3916
|
|
3780
3917
|
// copy embeddings
|
3781
3918
|
{
|
3782
3919
|
const size_t embedding_size = ctx->embedding.size();
|
3783
3920
|
|
3784
|
-
|
3921
|
+
data_ctx->write(&embedding_size, sizeof(embedding_size));
|
3785
3922
|
|
3786
3923
|
if (embedding_size) {
|
3787
|
-
|
3788
|
-
out += embedding_size * sizeof(float);
|
3924
|
+
data_ctx->write(ctx->embedding.data(), embedding_size * sizeof(float));
|
3789
3925
|
}
|
3790
3926
|
}
|
3791
3927
|
|
@@ -3800,8 +3936,8 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
|
|
3800
3936
|
const size_t kv_size = kv_self.buf.size;
|
3801
3937
|
const int kv_ntok = llama_get_kv_cache_token_count(ctx);
|
3802
3938
|
|
3803
|
-
|
3804
|
-
|
3939
|
+
data_ctx->write(&kv_size, sizeof(kv_size));
|
3940
|
+
data_ctx->write(&kv_ntok, sizeof(kv_ntok));
|
3805
3941
|
|
3806
3942
|
if (kv_size) {
|
3807
3943
|
const size_t elt_size = ggml_element_size(kv_self.k);
|
@@ -3810,12 +3946,12 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
|
|
3810
3946
|
ggml_cgraph gf{};
|
3811
3947
|
|
3812
3948
|
ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
|
3813
|
-
kout3d
|
3814
|
-
|
3949
|
+
std::vector<uint8_t> kout3d_data(ggml_nbytes(kout3d), 0);
|
3950
|
+
kout3d->data = kout3d_data.data();
|
3815
3951
|
|
3816
3952
|
ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
|
3817
|
-
vout3d
|
3818
|
-
|
3953
|
+
std::vector<uint8_t> vout3d_data(ggml_nbytes(vout3d), 0);
|
3954
|
+
vout3d->data = vout3d_data.data();
|
3819
3955
|
|
3820
3956
|
ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
|
3821
3957
|
n_embd, kv_ntok, n_layer,
|
@@ -3830,15 +3966,20 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
|
|
3830
3966
|
ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
|
3831
3967
|
|
3832
3968
|
ggml_free(cpy_ctx);
|
3969
|
+
|
3970
|
+
// our data is now in the kout3d_data and vout3d_data buffers
|
3971
|
+
// write them to file
|
3972
|
+
data_ctx->write(kout3d_data.data(), kout3d_data.size());
|
3973
|
+
data_ctx->write(vout3d_data.data(), vout3d_data.size());
|
3833
3974
|
}
|
3834
3975
|
}
|
3976
|
+
}
|
3835
3977
|
|
3836
|
-
|
3837
|
-
|
3838
|
-
|
3839
|
-
LLAMA_ASSERT(written <= max_size);
|
3978
|
+
size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
|
3979
|
+
llama_data_buffer_context data_ctx(dst);
|
3980
|
+
llama_copy_state_data_internal(ctx, &data_ctx);
|
3840
3981
|
|
3841
|
-
return
|
3982
|
+
return data_ctx.get_size_written();
|
3842
3983
|
}
|
3843
3984
|
|
3844
3985
|
// Sets the state reading from the specified source address
|
@@ -3957,7 +4098,7 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c
|
|
3957
4098
|
const uint32_t version = file.read_u32();
|
3958
4099
|
|
3959
4100
|
if (magic != LLAMA_SESSION_MAGIC || version != LLAMA_SESSION_VERSION) {
|
3960
|
-
|
4101
|
+
LLAMA_LOG_ERROR("%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
|
3961
4102
|
return false;
|
3962
4103
|
}
|
3963
4104
|
|
@@ -3965,7 +4106,7 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c
|
|
3965
4106
|
file.read_raw(&session_hparams, sizeof(llama_hparams));
|
3966
4107
|
|
3967
4108
|
if (session_hparams != ctx->model.hparams) {
|
3968
|
-
|
4109
|
+
LLAMA_LOG_INFO("%s : model hparams didn't match from session file!\n", __func__);
|
3969
4110
|
return false;
|
3970
4111
|
}
|
3971
4112
|
}
|
@@ -3975,7 +4116,7 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c
|
|
3975
4116
|
const uint32_t n_token_count = file.read_u32();
|
3976
4117
|
|
3977
4118
|
if (n_token_count > n_token_capacity) {
|
3978
|
-
|
4119
|
+
LLAMA_LOG_ERROR("%s : token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
|
3979
4120
|
return false;
|
3980
4121
|
}
|
3981
4122
|
|
@@ -3989,7 +4130,7 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c
|
|
3989
4130
|
const size_t n_state_size_max = llama_get_state_size(ctx);
|
3990
4131
|
|
3991
4132
|
if (n_state_size_cur > n_state_size_max) {
|
3992
|
-
|
4133
|
+
LLAMA_LOG_ERROR("%s : the state size in session file is too big! max %zu, got %zu\n", __func__, n_state_size_max, n_state_size_cur);
|
3993
4134
|
return false;
|
3994
4135
|
}
|
3995
4136
|
|
@@ -4006,7 +4147,7 @@ bool llama_load_session_file(struct llama_context * ctx, const char * path_sessi
|
|
4006
4147
|
try {
|
4007
4148
|
return llama_load_session_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
|
4008
4149
|
} catch (const std::exception & err) {
|
4009
|
-
|
4150
|
+
LLAMA_LOG_ERROR("error loading session file: %s\n", err.what());
|
4010
4151
|
return false;
|
4011
4152
|
}
|
4012
4153
|
}
|
@@ -4023,15 +4164,9 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi
     file.write_u32((uint32_t) n_token_count);
     file.write_raw(tokens, sizeof(llama_token) * n_token_count);
 
-    // save the context state
-
-
-
-        std::vector<uint8_t> state_data(n_state_size_max);
-        const size_t n_state_size_cur = llama_copy_state_data(ctx, state_data.data());
-
-        file.write_raw(state_data.data(), n_state_size_cur);
-    }
+    // save the context state using stream saving
+    llama_data_file_context data_ctx(&file);
+    llama_copy_state_data_internal(ctx, &data_ctx);
 
     return true;
 }
@@ -4043,7 +4178,7 @@ int llama_eval(
|
|
4043
4178
|
int n_past,
|
4044
4179
|
int n_threads) {
|
4045
4180
|
if (!llama_eval_internal(*ctx, tokens, nullptr, n_tokens, n_past, n_threads, nullptr)) {
|
4046
|
-
|
4181
|
+
LLAMA_LOG_ERROR("%s: failed to eval\n", __func__);
|
4047
4182
|
return 1;
|
4048
4183
|
}
|
4049
4184
|
|
@@ -4065,7 +4200,7 @@ int llama_eval_embd(
|
|
4065
4200
|
int n_past,
|
4066
4201
|
int n_threads) {
|
4067
4202
|
if (!llama_eval_internal(*ctx, nullptr, embd, n_tokens, n_past, n_threads, nullptr)) {
|
4068
|
-
|
4203
|
+
LLAMA_LOG_ERROR("%s: failed to eval\n", __func__);
|
4069
4204
|
return 1;
|
4070
4205
|
}
|
4071
4206
|
|
@@ -4086,7 +4221,7 @@ int llama_eval_export(struct llama_context * ctx, const char * fname) {
|
|
4086
4221
|
const std::vector<llama_token> tmp(n_batch, llama_token_bos());
|
4087
4222
|
|
4088
4223
|
if (!llama_eval_internal(*ctx, tmp.data(), nullptr, tmp.size(), n_ctx, 1, fname)) {
|
4089
|
-
|
4224
|
+
LLAMA_LOG_ERROR("%s: failed to eval\n", __func__);
|
4090
4225
|
return 1;
|
4091
4226
|
}
|
4092
4227
|
|
@@ -4102,7 +4237,7 @@ int llama_tokenize_with_model(
|
|
4102
4237
|
auto res = llama_tokenize(model->vocab, text, add_bos);
|
4103
4238
|
|
4104
4239
|
if (n_max_tokens < (int) res.size()) {
|
4105
|
-
|
4240
|
+
LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
|
4106
4241
|
return -((int) res.size());
|
4107
4242
|
}
|
4108
4243
|
|
@@ -4146,6 +4281,10 @@ int llama_n_embd(const struct llama_context * ctx) {
    return ctx->model.hparams.n_embd;
 }
 
+int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size) {
+    return snprintf(buf, buf_size, "LLaMA %s %s", llama_model_type_name(model->type), llama_ftype_name(model->hparams.ftype));
+}
+
 int llama_get_vocab_from_model(
         const struct llama_model * model,
         const char * * strings,
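llama_model_type is a new non-static entry point, so a host application can query a short human-readable model description. A minimal caller-side sketch, assuming a llama_model previously obtained from llama_load_model_from_file (which also appears in this diff):

```cpp
// Caller-side sketch, assuming a loaded llama_model *.
#include <cstdio>
#include "llama.h"

void print_model_description(const llama_model * model) {
    char desc[128];
    llama_model_type(model, desc, sizeof(desc)); // e.g. "LLaMA 7B mostly Q4_0"
    printf("%s\n", desc);
}
```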
@@ -4219,15 +4358,15 @@ struct llama_timings llama_get_timings(struct llama_context * ctx) {
|
|
4219
4358
|
void llama_print_timings(struct llama_context * ctx) {
|
4220
4359
|
const llama_timings timings = llama_get_timings(ctx);
|
4221
4360
|
|
4222
|
-
|
4223
|
-
|
4224
|
-
|
4361
|
+
LLAMA_LOG_INFO("\n");
|
4362
|
+
LLAMA_LOG_INFO("%s: load time = %8.2f ms\n", __func__, timings.t_load_ms);
|
4363
|
+
LLAMA_LOG_INFO("%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
|
4225
4364
|
__func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
|
4226
|
-
|
4365
|
+
LLAMA_LOG_INFO("%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
|
4227
4366
|
__func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
|
4228
|
-
|
4367
|
+
LLAMA_LOG_INFO("%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
|
4229
4368
|
__func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
|
4230
|
-
|
4369
|
+
LLAMA_LOG_INFO("%s: total time = %8.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
|
4231
4370
|
}
|
4232
4371
|
|
4233
4372
|
void llama_reset_timings(struct llama_context * ctx) {
|
@@ -4263,3 +4402,44 @@ const char * llama_print_system_info(void) {
 const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
     return ctx->model.tensors_by_name;
 }
+
+
+void llama_log_set(llama_log_callback log_callback, void * user_data) {
+    g_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
+    g_state.log_callback_user_data = user_data;
+}
+
+#if defined(_MSC_VER) && !defined(vsnprintf)
+#define vsnprintf _vsnprintf
+#endif
+
+static void llama_log_internal_v(llama_log_level level, const char * format, va_list args) {
+    va_list args_copy;
+    va_copy(args_copy, args);
+    char buffer[128];
+    int len = vsnprintf(buffer, 128, format, args);
+    if (len < 128) {
+        g_state.log_callback(level, buffer, g_state.log_callback_user_data);
+    } else {
+        char* buffer2 = new char[len+1];
+        vsnprintf(buffer2, len+1, format, args_copy);
+        buffer2[len] = 0;
+        g_state.log_callback(level, buffer2, g_state.log_callback_user_data);
+        delete[] buffer2;
+    }
+    va_end(args_copy);
+}
+
+static void llama_log_internal(llama_log_level level, const char * format, ...) {
+    va_list args;
+    va_start(args, format);
+    llama_log_internal_v(level, format, args);
+    va_end(args);
+}
+
+static void llama_log_callback_default(llama_log_level level, const char * text, void * user_data) {
+    (void) level;
+    (void) user_data;
+    fputs(text, stderr);
+    fflush(stderr);
+}