llama_cpp 0.2.1 → 0.3.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +32 -0
- data/README.md +39 -6
- data/examples/README.md +32 -0
- data/examples/chat.rb +2 -1
- data/examples/embedding.rb +38 -0
- data/ext/llama_cpp/extconf.rb +13 -0
- data/ext/llama_cpp/llama_cpp.cpp +231 -132
- data/ext/llama_cpp/src/ggml-cuda.cu +844 -337
- data/ext/llama_cpp/src/ggml-metal.h +4 -1
- data/ext/llama_cpp/src/ggml-metal.m +193 -49
- data/ext/llama_cpp/src/ggml-metal.metal +477 -84
- data/ext/llama_cpp/src/ggml-opencl.cpp +493 -4
- data/ext/llama_cpp/src/ggml.c +1565 -430
- data/ext/llama_cpp/src/ggml.h +208 -14
- data/ext/llama_cpp/src/k_quants.c +1712 -56
- data/ext/llama_cpp/src/k_quants.h +41 -6
- data/ext/llama_cpp/src/llama-util.h +19 -5
- data/ext/llama_cpp/src/llama.cpp +194 -101
- data/ext/llama_cpp/src/llama.h +41 -14
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +0 -2
- data/sig/llama_cpp.rbs +12 -17
- metadata +3 -3
- data/lib/llama_cpp/client.rb +0 -172
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -19,6 +19,15 @@
 #ifdef GGML_USE_METAL
 #include "ggml-metal.h"
 #endif
+#ifdef GGML_USE_K_QUANTS
+#ifndef QK_K
+#ifdef GGML_QKK_64
+#define QK_K 64
+#else
+#define QK_K 256
+#endif
+#endif
+#endif
 
 #include <array>
 #include <ctime>
@@ -40,6 +49,10 @@
 #include <sstream>
 #include <numeric>
 
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
 #define LLAMA_USE_SCRATCH
 #define LLAMA_MAX_SCRATCH_BUFFERS 16
 
@@ -173,6 +186,19 @@ struct llama_kv_cache {
     }
 };
 
+struct llama_vocab {
+    using id    = int32_t;
+    using token = std::string;
+
+    struct token_score {
+        token tok;
+        float score;
+    };
+
+    std::unordered_map<token, id> token_to_id;
+    std::vector<token_score> id_to_token;
+};
+
 struct llama_model {
     e_model type = MODEL_UNKNOWN;
 
@@ -189,10 +215,6 @@ struct llama_model {
     // context
     struct ggml_context * ctx = NULL;
 
-    // key + value cache for the self attention
-    // TODO: move to llama_state
-    struct llama_kv_cache kv_self;
-
     // the model memory buffer
     llama_ctx_buffer buf;
 
@@ -206,6 +228,11 @@ struct llama_model {
     // for quantize-stats only
     std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
 
+    int64_t t_load_us = 0;
+    int64_t t_start_us = 0;
+
+    llama_vocab vocab;
+
     ~llama_model() {
         if (ctx) {
             ggml_free(ctx);
@@ -224,24 +251,11 @@ struct llama_model {
     }
 };
 
-struct llama_vocab {
-    using id    = int32_t;
-    using token = std::string;
-
-    struct token_score {
-        token tok;
-        float score;
-    };
-
-    std::unordered_map<token, id> token_to_id;
-    std::vector<token_score> id_to_token;
-};
-
 struct llama_context {
+    llama_context(const llama_model & model, const llama_vocab & vocab) : model(model), vocab(vocab), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
+
     std::mt19937 rng;
 
-    int64_t t_load_us = 0;
-    int64_t t_start_us = 0;
     bool has_evaluated_once = false;
 
     int64_t t_sample_us = 0;
@@ -252,8 +266,16 @@ struct llama_context {
     int32_t n_eval   = 0; // number of eval calls
     int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
 
-    llama_model model;
-    llama_vocab vocab;
+    const llama_model & model;
+    const llama_vocab & vocab;
+
+    bool model_owner = false;
+
+    int64_t t_load_us;
+    int64_t t_start_us;
+
+    // key + value cache for the self attention
+    struct llama_kv_cache kv_self;
 
     size_t mem_per_token = 0;
 
@@ -752,7 +774,7 @@ struct llama_model_loader {
     }
 
     if (use_mmap) {
-        mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
+        mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size, ggml_is_numa()));
         if (lmlock) {
             lmlock->init(mapping->addr);
         }
@@ -882,6 +904,7 @@ static bool kv_cache_init(
     const int64_t n_elements = n_embd*n_mem;
 
     cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
+    cache.n = 0;
 
     struct ggml_init_params params;
     params.mem_size = cache.buf.size;
@@ -900,6 +923,7 @@ static bool kv_cache_init(
     ggml_set_name(cache.k, "cache_k");
     ggml_set_name(cache.v, "cache_v");
 
+    (void) n_gpu_layers;
 #ifdef GGML_USE_CUBLAS
     if (n_gpu_layers > n_layer + 1) {
         ggml_cuda_assign_buffers_no_scratch(cache.v);
@@ -914,21 +938,21 @@
 
 struct llama_context_params llama_context_default_params() {
     struct llama_context_params result = {
+        /*.seed                        =*/ -1,
         /*.n_ctx                       =*/ 512,
         /*.n_batch                     =*/ 512,
         /*.gpu_layers                  =*/ 0,
         /*.main_gpu                    =*/ 0,
         /*.tensor_split                =*/ {0},
+        /*.progress_callback           =*/ nullptr,
+        /*.progress_callback_user_data =*/ nullptr,
         /*.low_vram                    =*/ false,
-        /*.seed                        =*/ -1,
         /*.f16_kv                      =*/ true,
         /*.logits_all                  =*/ false,
         /*.vocab_only                  =*/ false,
         /*.use_mmap                    =*/ true,
         /*.use_mlock                   =*/ false,
         /*.embedding                   =*/ false,
-        /*.progress_callback           =*/ nullptr,
-        /*.progress_callback_user_data =*/ nullptr,
     };
 
     return result;
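Note that `llama_context_default_params` now puts `seed` first and moves the progress-callback fields ahead of `low_vram`, so any caller that filled the struct positionally against 0.2.x will silently misassign fields. A minimal usage sketch (the overridden values are illustrative):

    #include "llama.h"

    int main() {
        // Start from the defaults and override fields by name; the field
        // order of llama_context_params changed in this release.
        struct llama_context_params params = llama_context_default_params();
        params.seed  = 1234;
        params.n_ctx = 2048;
        (void) params; // pass to llama_load_model_from_file(...) as usual
        return 0;
    }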
@@ -953,7 +977,7 @@ bool llama_mlock_supported() {
     return llama_mlock::SUPPORTED;
 }
 
-void llama_init_backend() {
+void llama_init_backend(bool numa) {
    ggml_time_init();
 
    // needed to initialize f16 tables
@@ -962,6 +986,10 @@ void llama_init_backend() {
        struct ggml_context * ctx = ggml_init(params);
        ggml_free(ctx);
    }
+
+    if (numa) {
+        ggml_numa_init();
+    }
 }
 
 int64_t llama_time_us() {
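`llama_init_backend` now takes a `numa` flag: when it is true the backend also calls `ggml_numa_init()`, which in turn feeds the `ggml_is_numa()` checks used by the mmap changes elsewhere in this diff. A minimal sketch of the updated call:

    #include "llama.h"

    int main() {
        // One-time process setup; pass true only on multi-socket NUMA
        // machines where cross-node prefetching should be avoided.
        llama_init_backend(/* numa = */ false);
        return 0;
    }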
@@ -1022,7 +1050,8 @@ static const char *llama_model_type_name(e_model type) {
 
 static void llama_model_load_internal(
         const std::string & fname,
-        llama_context & lctx,
+        llama_model & model,
+        llama_vocab & vocab,
         int n_ctx,
         int n_batch,
         int n_gpu_layers,
@@ -1036,12 +1065,11 @@ static void llama_model_load_internal(
         llama_progress_callback progress_callback,
         void * progress_callback_user_data) {
 
-    lctx.t_start_us = ggml_time_us();
+    model.t_start_us = ggml_time_us();
 
     std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap, vocab_only));
 
-    lctx.vocab = std::move(ml->file_loaders.at(0)->vocab);
-    auto & model = lctx.model;
+    vocab = std::move(ml->file_loaders.at(0)->vocab);
     model.hparams = ml->file_loaders.at(0)->hparams;
     model.n_gpu_layers = n_gpu_layers;
     llama_file_version file_version = ml->file_loaders.at(0)->file_version;
@@ -1111,15 +1139,15 @@
 
     // create the ggml context
     {
-        lctx.model.buf.resize(ctx_size);
+        model.buf.resize(ctx_size);
         if (use_mlock) {
-            lctx.model.mlock_buf.init(lctx.model.buf.addr);
-            lctx.model.mlock_buf.grow_to(lctx.model.buf.size);
+            model.mlock_buf.init(model.buf.addr);
+            model.mlock_buf.grow_to(model.buf.size);
         }
 
         struct ggml_init_params params = {
-            /*.mem_size   =*/ lctx.model.buf.size,
-            /*.mem_buffer =*/ lctx.model.buf.addr,
+            /*.mem_size   =*/ model.buf.size,
+            /*.mem_buffer =*/ model.buf.addr,
             /*.no_alloc   =*/ ml->use_mmap,
         };
 
@@ -1249,7 +1277,7 @@
         vram_scratch = n_batch * MB;
         ggml_cuda_set_scratch_size(vram_scratch);
         if (n_gpu_layers > 0) {
-            fprintf(stderr, "%s: allocating batch_size x 1 MB = %ld MB VRAM for the scratch buffer\n",
+            fprintf(stderr, "%s: allocating batch_size x 1 MB = %zd MB VRAM for the scratch buffer\n",
                     __func__, vram_scratch / MB);
         }
     }
@@ -1300,7 +1328,7 @@
     }
 #endif
 
-    ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
+    ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL);
 
     if (progress_callback) {
         progress_callback(1.0f, progress_callback_user_data);
|
|
1310
1338
|
|
1311
1339
|
// loading time will be recalculate after the first eval, so
|
1312
1340
|
// we take page faults deferred by mmap() into consideration
|
1313
|
-
|
1341
|
+
model.t_load_us = ggml_time_us() - model.t_start_us;
|
1314
1342
|
}
|
1315
1343
|
|
1316
1344
|
static bool llama_model_load(
|
1317
1345
|
const std::string & fname,
|
1318
|
-
|
1346
|
+
llama_model & model,
|
1347
|
+
llama_vocab & vocab,
|
1319
1348
|
int n_ctx,
|
1320
1349
|
int n_batch,
|
1321
1350
|
int n_gpu_layers,
|
@@ -1329,7 +1358,7 @@ static bool llama_model_load(
         llama_progress_callback progress_callback,
         void *progress_callback_user_data) {
     try {
-        llama_model_load_internal(fname, lctx, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
+        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
                 use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
         return true;
     } catch (const std::exception & err) {
@@ -1367,7 +1396,7 @@ static bool llama_eval_internal(
     const auto & model   = lctx.model;
     const auto & hparams = model.hparams;
 
-    const auto & kv_self = model.kv_self;
+    const auto & kv_self = lctx.kv_self;
 
     LLAMA_ASSERT(!!kv_self.ctx);
 
@@ -1462,11 +1491,11 @@ static bool llama_eval_internal(
             offload_func_kq(tmpq);
             ggml_set_name(tmpq, "tmpq");
 
-            struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0);
+            struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
             offload_func_kq(Kcur);
             ggml_set_name(Kcur, "Kcur");
 
-            struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0);
+            struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
             offload_func_kq(Qcur);
             ggml_set_name(Qcur, "Qcur");
 
@@ -1609,7 +1638,7 @@
                 model.layers[il].w1,
                 cur);
             offload_func(cur);
-            ggml_set_name(cur, "
+            ggml_set_name(cur, "result_w1");
 
             // SILU activation
             cur = ggml_silu(ctx0, cur);
@@ -1646,15 +1675,11 @@
     {
         cur = ggml_rms_norm(ctx0, inpL);
         offload_func_nr(cur);
-        ggml_set_name(cur, "
-
-        cur = ggml_rms_norm(ctx0, cur);
-        offload_func_nr(cur);
-        ggml_set_name(cur, "rms_norm_after");
+        ggml_set_name(cur, "rms_norm_2");
 
         // cur = cur*norm(broadcasted)
         cur = ggml_mul(ctx0, cur, model.norm);
-        offload_func_nr(cur);
+        // offload_func_nr(cur); // TODO CPU + GPU mirrored backend
         ggml_set_name(cur, "result_norm");
 
         embeddings = cur;
@@ -1719,7 +1744,7 @@
     //memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N);
 
     // update kv token count
-    lctx.model.kv_self.n = n_past + N;
+    lctx.kv_self.n = n_past + N;
 
     // extract logits
     {
@@ -1998,9 +2023,10 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can
     for (size_t i = 0; i < candidates->size; ++i) {
         cum_sum += candidates->data[i].p;
 
-        // Check if the running sum is greater than p or if we have kept at least min_keep tokens
-        if (cum_sum > p && i >= min_keep) {
-            last_idx = i;
+        // Check if the running sum is at least p or if we have kept at least min_keep tokens
+        // we set the last index to i+1 to indicate that the current iterate should be included in the set
+        if (cum_sum >= p && i + 1 >= min_keep) {
+            last_idx = i + 1;
             break;
         }
     }
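The old loop truncated at `last_idx = i`, which dropped the very token that pushed the cumulative probability past `p`. For example, with sorted probabilities 0.5, 0.3, 0.2 and p = 0.7, the running sum reaches 0.8 at i = 1, and the fix keeps both tokens (`last_idx = 2`). A self-contained sketch of the corrected loop (the values are illustrative):

    #include <stdio.h>

    int main() {
        const float  probs[]  = {0.5f, 0.3f, 0.2f}; // sorted candidate probabilities
        const float  p        = 0.7f;
        const size_t min_keep = 1;

        float  cum_sum  = 0.0f;
        size_t last_idx = 3;
        for (size_t i = 0; i < 3; ++i) {
            cum_sum += probs[i];
            // i + 1 includes the token that carried the sum past p
            if (cum_sum >= p && i + 1 >= min_keep) {
                last_idx = i + 1;
                break;
            }
        }
        printf("keep %zu tokens\n", last_idx); // prints: keep 2 tokens
        return 0;
    }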
@@ -2452,6 +2478,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     std::vector<std::thread> workers;
     std::mutex mutex;
 
+    auto use_more_bits = [] (int i_layer, int num_layers) -> bool {
+        return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
+    };
+
     size_t idx = 0;
     for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) {
         llama_buffer read_data;
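`use_more_bits` marks the first eighth of the layers, the last eighth, and every third layer in between for a higher-precision quant type. For num_layers = 32 that is i_layer 0-3 and 28-31, plus 6, 9, 12, 15, 18, 21, 24, 27. A standalone check of that claim:

    #include <stdio.h>

    // same predicate as the lambda in the quantization loop
    static bool use_more_bits(int i_layer, int num_layers) {
        return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
    }

    int main() {
        // prints: 0 1 2 3 6 9 12 15 18 21 24 27 28 29 30 31
        for (int i = 0; i < 32; ++i) {
            if (use_more_bits(i, 32)) printf("%d ", i);
        }
        printf("\n");
        return 0;
    }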
@@ -2485,21 +2515,37 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         } else {
             new_type = quantized_type;
 #ifdef GGML_USE_K_QUANTS
+            if (quantized_type == GGML_TYPE_Q2_K || quantized_type == GGML_TYPE_Q3_K || quantized_type == GGML_TYPE_Q4_K ||
+                quantized_type == GGML_TYPE_Q5_K || quantized_type == GGML_TYPE_Q6_K) {
+                int nx = tensor.ne.at(0);
+                int ny = tensor.ne.at(1);
+                if (nx % QK_K != 0 || ny % QK_K != 0) {
+                    fprintf(stderr, "\n\n========================= Tensor sizes %d x %d are not divisible by %d\n",nx,ny,QK_K);
+                    fprintf(stderr, "This is required to be able to use k-quants for now!\n");
+                    fprintf(stderr, "========================================================================================\n\n");
+                    throw std::runtime_error("Unsupported tensor size encountered\n");
+                }
+            }
             if (tensor.name == "output.weight") {
-                new_type = GGML_TYPE_Q6_K;
+                int nx = tensor.ne.at(0);
+                int ny = tensor.ne.at(1);
+                if (nx % QK_K == 0 && ny % QK_K == 0) {
+                    new_type = GGML_TYPE_Q6_K;
+                }
             } else if (tensor.name.find("attention.wv.weight") != std::string::npos) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
                 else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
-                        (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8 ||
-                         (i_attention_wv - n_attention_wv/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
+                        use_more_bits(i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
+                else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
+                        (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
                 ++i_attention_wv;
             } else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
                 else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
-                        (i_feed_forward_w2 < n_feed_forward_w2/8 || i_feed_forward_w2 >= 7*n_feed_forward_w2/8 ||
-                         (i_feed_forward_w2 - n_feed_forward_w2/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
+                        use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+                //else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_feed_forward_w2 < n_feed_forward_w2/8) new_type = GGML_TYPE_Q6_K;
                 ++i_feed_forward_w2;
             } else if (tensor.name.find("attention.wo.weight") != std::string::npos) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
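The new guard requires both leading tensor dimensions to be multiples of QK_K (256 by default, 64 under GGML_QKK_64) before a k-quant type is used; stock LLaMA shapes pass, e.g. a 7B output matrix of 4096 x 32000 satisfies 4096 % 256 == 0 and 32000 % 256 == 0. A quick sketch of the same divisibility test:

    #include <stdio.h>

    #ifndef QK_K
    #define QK_K 256 // k-quant super-block size; 64 when built with GGML_QKK_64
    #endif

    int main() {
        const int nx = 4096;  // e.g. n_embd of a 7B model
        const int ny = 32000; // e.g. vocabulary size
        if (nx % QK_K != 0 || ny % QK_K != 0) {
            printf("%d x %d cannot use k-quants\n", nx, ny);
        } else {
            printf("%d x %d is k-quant friendly\n", nx, ny);
        }
        return 0;
    }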
@@ -2612,12 +2658,39 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 // interface implementation
 //
 
-struct llama_context * llama_init_from_file(
+struct llama_model * llama_load_model_from_file(
         const char * path_model,
         struct llama_context_params params) {
     ggml_time_init();
 
-    llama_context * ctx = new llama_context;
+    llama_model * model = new llama_model;
+
+    ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
+
+    if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
+                params.main_gpu, params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
+                params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
+        delete model;
+        fprintf(stderr, "%s: failed to load model\n", __func__);
+        return nullptr;
+    }
+
+    return model;
+}
+
+void llama_free_model(struct llama_model * model) {
+    delete model;
+}
+
+struct llama_context * llama_new_context_with_model(
+        struct llama_model * model,
+        struct llama_context_params params) {
+
+    if (!model) {
+        return nullptr;
+    }
+
+    llama_context * ctx = new llama_context(*model, model->vocab);
 
     if (params.seed < 0) {
         params.seed = time(NULL);
@@ -2645,24 +2718,16 @@ struct llama_context * llama_init_from_file(
 
     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
 
-    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers, params.main_gpu,
-            params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
-            params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
-        fprintf(stderr, "%s: failed to load model\n", __func__);
-        llama_free(ctx);
-        return nullptr;
-    }
-
     // reserve memory for context buffers
     if (!params.vocab_only) {
-        if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
+        if (!kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
            fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
            llama_free(ctx);
            return nullptr;
        }
 
        {
-            const size_t memory_size = ggml_nbytes(ctx->model.kv_self.k) + ggml_nbytes(ctx->model.kv_self.v);
+            const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
            fprintf(stderr, "%s: kv self size  = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
        }
 
@@ -2690,16 +2755,21 @@ struct llama_context * llama_init_from_file(
         // this allocates all Metal resources and memory buffers
         ctx->ctx_metal = ggml_metal_init();
 
-        void *data_ptr = NULL;
+        void * data_ptr = NULL;
         size_t data_size = 0;
+
         if (params.use_mmap) {
-            data_ptr = ctx->model.mapping->addr;
-            data_size= ctx->model.mapping->size;
+            data_ptr  = ctx->model.mapping->addr;
+            data_size = ctx->model.mapping->size;
         } else {
-            data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
-            data_size= ggml_get_mem_size(ctx->model.ctx);
+            data_ptr  = ggml_get_mem_buffer(ctx->model.ctx);
+            data_size = ggml_get_mem_size (ctx->model.ctx);
         }
 
+        const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
+
+        printf("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
+
 #define LLAMA_METAL_CHECK_BUF(result) \
     if (!(result)) { \
         fprintf(stderr, "%s: failed to add buffer\n", __func__); \
@@ -2707,12 +2777,13 @@ struct llama_context * llama_init_from_file(
         return NULL; \
     }
 
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size));
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
 
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv",   ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size));
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size));
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv",   ctx->kv_self.buf.addr, ctx->kv_self.buf.size, 0));
+
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0));
 #undef LLAMA_METAL_CHECK_BUF
     }
 #endif
@@ -2720,7 +2791,23 @@ struct llama_context * llama_init_from_file(
     return ctx;
 }
 
+struct llama_context * llama_init_from_file(
+        const char * path_model,
+        struct llama_context_params params) {
+
+    struct llama_model * model = llama_load_model_from_file(path_model, params);
+    if (!model) {
+        return nullptr;
+    }
+    struct llama_context * ctx = llama_new_context_with_model(model, params);
+    ctx->model_owner = true;
+    return ctx;
+}
+
 void llama_free(struct llama_context * ctx) {
+    if (ctx->model_owner) {
+        delete &ctx->model;
+    }
     delete ctx;
 }
 
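Taken together, these hunks split loading from context creation: the weights are loaded once into a llama_model that any number of llama_context instances can share, each with its own kv_self, while the old llama_init_from_file survives as a wrapper that sets model_owner so llama_free still tears down both. A minimal sketch of the new two-step API (the model path is a placeholder):

    #include "llama.h"

    int main() {
        llama_init_backend(false);

        struct llama_context_params params = llama_context_default_params();

        // load the weights once; "/path/to/model.bin" is a placeholder
        struct llama_model * model = llama_load_model_from_file("/path/to/model.bin", params);
        if (!model) return 1;

        // each context gets its own KV cache but shares the same weights
        struct llama_context * ctx_a = llama_new_context_with_model(model, params);
        struct llama_context * ctx_b = llama_new_context_with_model(model, params);

        llama_free(ctx_a); // model_owner is false here, so the model survives
        llama_free(ctx_b);
        llama_free_model(model); // the caller owns the model in this style
        return 0;
    }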
@@ -2737,11 +2824,9 @@ int llama_model_quantize(
     }
 }
 
-int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
+int llama_apply_lora_from_file_internal(const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads) {
     fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
 
-    auto & model = ctx->model;
-
     const int64_t t_start_lora_us = ggml_time_us();
 
     auto fin = std::ifstream(path_lora, std::ios::binary);
@@ -2818,7 +2903,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
 
     // maybe this should in llama_model_loader
     if (model_loader->use_mmap) {
-        model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0));
+        model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0, ggml_is_numa()));
     }
 }
 
@@ -2984,7 +3069,16 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
 
 int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
     try {
-        return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads);
+        return llama_apply_lora_from_file_internal(ctx->model, path_lora, path_base_model, n_threads);
+    } catch (const std::exception & err) {
+        fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
+        return 1;
+    }
+}
+
+int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, const char * path_base_model, int n_threads) {
+    try {
+        return llama_apply_lora_from_file_internal(*model, path_lora, path_base_model, n_threads);
     } catch (const std::exception & err) {
         fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
         return 1;
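With the model-level entry point an adapter can now be applied once, before any context exists, instead of per context. A hedged sketch of that flow (the paths and thread count are placeholders):

    #include "llama.h"

    int main() {
        struct llama_context_params params = llama_context_default_params();
        struct llama_model * model = llama_load_model_from_file("/path/to/model.bin", params);
        if (!model) return 1;

        // apply the adapter to the shared weights; pass a base model path
        // instead of NULL when the adapter needs unquantized base tensors
        if (llama_model_apply_lora_from_file(model, "/path/to/lora.bin", NULL, /*n_threads=*/4) != 0) {
            llama_free_model(model);
            return 1;
        }

        struct llama_context * ctx = llama_new_context_with_model(model, params);
        // ... evaluate with ctx ...
        llama_free(ctx);
        llama_free_model(model);
        return 0;
    }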
@@ -2992,7 +3086,7 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
 }
 
 int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
-    return ctx->model.kv_self.n;
+    return ctx->kv_self.n;
 }
 
 #define LLAMA_MAX_RNG_STATE (64*1024)
@@ -3017,7 +3111,7 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
     const size_t s_embedding = ctx->embedding.size() * sizeof(float);
     const size_t s_kv_size = sizeof(size_t);
     const size_t s_kv_ntok = sizeof(int);
-    const size_t s_kv = ctx->model.kv_self.buf.size;
+    const size_t s_kv = ctx->kv_self.buf.size;
 
     const size_t s_total = (
         + s_rng_size
@@ -3083,7 +3177,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
 
     // copy kv cache
     {
-        const auto & kv_self = ctx->model.kv_self;
+        const auto & kv_self = ctx->kv_self;
         const auto & hparams = ctx->model.hparams;
         const int n_layer = hparams.n_layer;
         const int n_embd = hparams.n_embd;
@@ -3098,9 +3192,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
         if (kv_size) {
             const size_t elt_size = ggml_element_size(kv_self.k);
 
-            char buffer[4096];
-
-            ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
+            ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
             ggml_cgraph gf{};
             gf.n_threads = 1;
 
@@ -3189,7 +3281,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
 
     // set kv cache
     {
-        const auto & kv_self = ctx->model.kv_self;
+        const auto & kv_self = ctx->kv_self;
         const auto & hparams = ctx->model.hparams;
         const int n_layer = hparams.n_layer;
         const int n_embd = hparams.n_embd;
@@ -3206,9 +3298,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
 
         const size_t elt_size = ggml_element_size(kv_self.k);
 
-        char buffer[4096];
-
-        ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
+        ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
         ggml_cgraph gf{};
         gf.n_threads = 1;
 
@@ -3235,7 +3325,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
         ggml_free(cpy_ctx);
     }
 
-    ctx->model.kv_self.n = kv_ntok;
+    ctx->kv_self.n = kv_ntok;
 }
 
 const size_t nread = inp - src;
@@ -3443,9 +3533,12 @@ void llama_print_timings(struct llama_context * ctx) {
 
     fprintf(stderr, "\n");
     fprintf(stderr, "%s:        load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
-    fprintf(stderr, "%s:      sample time = %8.2f ms / %5d runs   (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample);
-    fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval);
-    fprintf(stderr, "%s:        eval time = %8.2f ms / %5d runs   (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval);
+    fprintf(stderr, "%s:      sample time = %8.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
+            __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample, 1e6 / ctx->t_sample_us * n_sample);
+    fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
+            __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval, 1e6 / ctx->t_p_eval_us * n_p_eval);
+    fprintf(stderr, "%s:        eval time = %8.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
+            __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval, 1e6 / ctx->t_eval_us * n_eval);
     fprintf(stderr, "%s:       total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0);
 }
 
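The timers are microsecond counters, so milliseconds per token is 1e-3 * t_us / n and tokens per second is 1e6 / t_us * n: for instance, 2,000,000 us over 100 eval runs is 20 ms per token, i.e. 50 tokens per second. A tiny check of that arithmetic:

    #include <stdio.h>

    int main() {
        const double t_eval_us = 2000000.0; // 2 s of eval time, in microseconds
        const int    n_eval    = 100;       // number of eval runs
        printf("%.2f ms per token\n", 1e-3 * t_eval_us / n_eval);     // 20.00
        printf("%.2f tokens per second\n", 1e6 / t_eval_us * n_eval); // 50.00
        return 0;
    }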
@@ -3479,6 +3572,6 @@ const char * llama_print_system_info(void) {
 }
 
 // For internal test use
-std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
+const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
     return ctx->model.tensors_by_name;
 }
|