llama_cpp 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +32 -0
- data/README.md +39 -6
- data/examples/README.md +32 -0
- data/examples/chat.rb +2 -1
- data/examples/embedding.rb +38 -0
- data/ext/llama_cpp/extconf.rb +13 -0
- data/ext/llama_cpp/llama_cpp.cpp +231 -132
- data/ext/llama_cpp/src/ggml-cuda.cu +844 -337
- data/ext/llama_cpp/src/ggml-metal.h +4 -1
- data/ext/llama_cpp/src/ggml-metal.m +193 -49
- data/ext/llama_cpp/src/ggml-metal.metal +477 -84
- data/ext/llama_cpp/src/ggml-opencl.cpp +493 -4
- data/ext/llama_cpp/src/ggml.c +1565 -430
- data/ext/llama_cpp/src/ggml.h +208 -14
- data/ext/llama_cpp/src/k_quants.c +1712 -56
- data/ext/llama_cpp/src/k_quants.h +41 -6
- data/ext/llama_cpp/src/llama-util.h +19 -5
- data/ext/llama_cpp/src/llama.cpp +194 -101
- data/ext/llama_cpp/src/llama.h +41 -14
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +0 -2
- data/sig/llama_cpp.rbs +12 -17
- metadata +3 -3
- data/lib/llama_cpp/client.rb +0 -172
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -19,6 +19,15 @@
|
|
19
19
|
#ifdef GGML_USE_METAL
|
20
20
|
#include "ggml-metal.h"
|
21
21
|
#endif
|
22
|
+
#ifdef GGML_USE_K_QUANTS
|
23
|
+
#ifndef QK_K
|
24
|
+
#ifdef GGML_QKK_64
|
25
|
+
#define QK_K 64
|
26
|
+
#else
|
27
|
+
#define QK_K 256
|
28
|
+
#endif
|
29
|
+
#endif
|
30
|
+
#endif
|
22
31
|
|
23
32
|
#include <array>
|
24
33
|
#include <ctime>
|
@@ -40,6 +49,10 @@
|
|
40
49
|
#include <sstream>
|
41
50
|
#include <numeric>
|
42
51
|
|
52
|
+
#if defined(_MSC_VER)
|
53
|
+
#pragma warning(disable: 4244 4267) // possible loss of data
|
54
|
+
#endif
|
55
|
+
|
43
56
|
#define LLAMA_USE_SCRATCH
|
44
57
|
#define LLAMA_MAX_SCRATCH_BUFFERS 16
|
45
58
|
|
@@ -173,6 +186,19 @@ struct llama_kv_cache {
|
|
173
186
|
}
|
174
187
|
};
|
175
188
|
|
189
|
+
// Vocabulary tables: a lookup from token text to token id, plus a list of
// (text, score) entries for the reverse direction.
struct llama_vocab {
    using id    = int32_t;      // numeric token identifier
    using token = std::string;  // token text

    // One entry of the reverse table: the token text together with its score.
    struct token_score {
        token tok;
        float score;
    };

    // token text -> id
    std::unordered_map<token, id> token_to_id;

    // presumably indexed by token id -- confirm against the file loader
    std::vector<token_score> id_to_token;
};
|
201
|
+
|
176
202
|
struct llama_model {
|
177
203
|
e_model type = MODEL_UNKNOWN;
|
178
204
|
|
@@ -189,10 +215,6 @@ struct llama_model {
|
|
189
215
|
// context
|
190
216
|
struct ggml_context * ctx = NULL;
|
191
217
|
|
192
|
-
// key + value cache for the self attention
|
193
|
-
// TODO: move to llama_state
|
194
|
-
struct llama_kv_cache kv_self;
|
195
|
-
|
196
218
|
// the model memory buffer
|
197
219
|
llama_ctx_buffer buf;
|
198
220
|
|
@@ -206,6 +228,11 @@ struct llama_model {
|
|
206
228
|
// for quantize-stats only
|
207
229
|
std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
|
208
230
|
|
231
|
+
int64_t t_load_us = 0;
|
232
|
+
int64_t t_start_us = 0;
|
233
|
+
|
234
|
+
llama_vocab vocab;
|
235
|
+
|
209
236
|
~llama_model() {
|
210
237
|
if (ctx) {
|
211
238
|
ggml_free(ctx);
|
@@ -224,24 +251,11 @@ struct llama_model {
|
|
224
251
|
}
|
225
252
|
};
|
226
253
|
|
227
|
-
struct llama_vocab {
|
228
|
-
using id = int32_t;
|
229
|
-
using token = std::string;
|
230
|
-
|
231
|
-
struct token_score {
|
232
|
-
token tok;
|
233
|
-
float score;
|
234
|
-
};
|
235
|
-
|
236
|
-
std::unordered_map<token, id> token_to_id;
|
237
|
-
std::vector<token_score> id_to_token;
|
238
|
-
};
|
239
|
-
|
240
254
|
struct llama_context {
|
255
|
+
llama_context(const llama_model & model, const llama_vocab & vocab) : model(model), vocab(vocab), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
|
256
|
+
|
241
257
|
std::mt19937 rng;
|
242
258
|
|
243
|
-
int64_t t_load_us = 0;
|
244
|
-
int64_t t_start_us = 0;
|
245
259
|
bool has_evaluated_once = false;
|
246
260
|
|
247
261
|
int64_t t_sample_us = 0;
|
@@ -252,8 +266,16 @@ struct llama_context {
|
|
252
266
|
int32_t n_eval = 0; // number of eval calls
|
253
267
|
int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
|
254
268
|
|
255
|
-
llama_model model;
|
256
|
-
llama_vocab vocab;
|
269
|
+
const llama_model & model;
|
270
|
+
const llama_vocab & vocab;
|
271
|
+
|
272
|
+
bool model_owner = false;
|
273
|
+
|
274
|
+
int64_t t_load_us;
|
275
|
+
int64_t t_start_us;
|
276
|
+
|
277
|
+
// key + value cache for the self attention
|
278
|
+
struct llama_kv_cache kv_self;
|
257
279
|
|
258
280
|
size_t mem_per_token = 0;
|
259
281
|
|
@@ -752,7 +774,7 @@ struct llama_model_loader {
|
|
752
774
|
}
|
753
775
|
|
754
776
|
if (use_mmap) {
|
755
|
-
mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
|
777
|
+
mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size, ggml_is_numa()));
|
756
778
|
if (lmlock) {
|
757
779
|
lmlock->init(mapping->addr);
|
758
780
|
}
|
@@ -882,6 +904,7 @@ static bool kv_cache_init(
|
|
882
904
|
const int64_t n_elements = n_embd*n_mem;
|
883
905
|
|
884
906
|
cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
|
907
|
+
cache.n = 0;
|
885
908
|
|
886
909
|
struct ggml_init_params params;
|
887
910
|
params.mem_size = cache.buf.size;
|
@@ -900,6 +923,7 @@ static bool kv_cache_init(
|
|
900
923
|
ggml_set_name(cache.k, "cache_k");
|
901
924
|
ggml_set_name(cache.v, "cache_v");
|
902
925
|
|
926
|
+
(void) n_gpu_layers;
|
903
927
|
#ifdef GGML_USE_CUBLAS
|
904
928
|
if (n_gpu_layers > n_layer + 1) {
|
905
929
|
ggml_cuda_assign_buffers_no_scratch(cache.v);
|
@@ -914,21 +938,21 @@ static bool kv_cache_init(
|
|
914
938
|
|
915
939
|
struct llama_context_params llama_context_default_params() {
|
916
940
|
struct llama_context_params result = {
|
941
|
+
/*.seed =*/ -1,
|
917
942
|
/*.n_ctx =*/ 512,
|
918
943
|
/*.n_batch =*/ 512,
|
919
944
|
/*.gpu_layers =*/ 0,
|
920
945
|
/*.main_gpu =*/ 0,
|
921
946
|
/*.tensor_split =*/ {0},
|
947
|
+
/*.progress_callback =*/ nullptr,
|
948
|
+
/*.progress_callback_user_data =*/ nullptr,
|
922
949
|
/*.low_vram =*/ false,
|
923
|
-
/*.seed =*/ -1,
|
924
950
|
/*.f16_kv =*/ true,
|
925
951
|
/*.logits_all =*/ false,
|
926
952
|
/*.vocab_only =*/ false,
|
927
953
|
/*.use_mmap =*/ true,
|
928
954
|
/*.use_mlock =*/ false,
|
929
955
|
/*.embedding =*/ false,
|
930
|
-
/*.progress_callback =*/ nullptr,
|
931
|
-
/*.progress_callback_user_data =*/ nullptr,
|
932
956
|
};
|
933
957
|
|
934
958
|
return result;
|
@@ -953,7 +977,7 @@ bool llama_mlock_supported() {
|
|
953
977
|
return llama_mlock::SUPPORTED;
|
954
978
|
}
|
955
979
|
|
956
|
-
void llama_init_backend() {
|
980
|
+
void llama_init_backend(bool numa) {
|
957
981
|
ggml_time_init();
|
958
982
|
|
959
983
|
// needed to initialize f16 tables
|
@@ -962,6 +986,10 @@ void llama_init_backend() {
|
|
962
986
|
struct ggml_context * ctx = ggml_init(params);
|
963
987
|
ggml_free(ctx);
|
964
988
|
}
|
989
|
+
|
990
|
+
if (numa) {
|
991
|
+
ggml_numa_init();
|
992
|
+
}
|
965
993
|
}
|
966
994
|
|
967
995
|
int64_t llama_time_us() {
|
@@ -1022,7 +1050,8 @@ static const char *llama_model_type_name(e_model type) {
|
|
1022
1050
|
|
1023
1051
|
static void llama_model_load_internal(
|
1024
1052
|
const std::string & fname,
|
1025
|
-
|
1053
|
+
llama_model & model,
|
1054
|
+
llama_vocab & vocab,
|
1026
1055
|
int n_ctx,
|
1027
1056
|
int n_batch,
|
1028
1057
|
int n_gpu_layers,
|
@@ -1036,12 +1065,11 @@ static void llama_model_load_internal(
|
|
1036
1065
|
llama_progress_callback progress_callback,
|
1037
1066
|
void * progress_callback_user_data) {
|
1038
1067
|
|
1039
|
-
|
1068
|
+
model.t_start_us = ggml_time_us();
|
1040
1069
|
|
1041
1070
|
std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap, vocab_only));
|
1042
1071
|
|
1043
|
-
|
1044
|
-
auto & model = lctx.model;
|
1072
|
+
vocab = std::move(ml->file_loaders.at(0)->vocab);
|
1045
1073
|
model.hparams = ml->file_loaders.at(0)->hparams;
|
1046
1074
|
model.n_gpu_layers = n_gpu_layers;
|
1047
1075
|
llama_file_version file_version = ml->file_loaders.at(0)->file_version;
|
@@ -1111,15 +1139,15 @@ static void llama_model_load_internal(
|
|
1111
1139
|
|
1112
1140
|
// create the ggml context
|
1113
1141
|
{
|
1114
|
-
|
1142
|
+
model.buf.resize(ctx_size);
|
1115
1143
|
if (use_mlock) {
|
1116
|
-
|
1117
|
-
|
1144
|
+
model.mlock_buf.init(model.buf.addr);
|
1145
|
+
model.mlock_buf.grow_to(model.buf.size);
|
1118
1146
|
}
|
1119
1147
|
|
1120
1148
|
struct ggml_init_params params = {
|
1121
|
-
/*.mem_size =*/
|
1122
|
-
/*.mem_buffer =*/
|
1149
|
+
/*.mem_size =*/ model.buf.size,
|
1150
|
+
/*.mem_buffer =*/ model.buf.addr,
|
1123
1151
|
/*.no_alloc =*/ ml->use_mmap,
|
1124
1152
|
};
|
1125
1153
|
|
@@ -1249,7 +1277,7 @@ static void llama_model_load_internal(
|
|
1249
1277
|
vram_scratch = n_batch * MB;
|
1250
1278
|
ggml_cuda_set_scratch_size(vram_scratch);
|
1251
1279
|
if (n_gpu_layers > 0) {
|
1252
|
-
fprintf(stderr, "%s: allocating batch_size x 1 MB = %
|
1280
|
+
fprintf(stderr, "%s: allocating batch_size x 1 MB = %zd MB VRAM for the scratch buffer\n",
|
1253
1281
|
__func__, vram_scratch / MB);
|
1254
1282
|
}
|
1255
1283
|
}
|
@@ -1300,7 +1328,7 @@ static void llama_model_load_internal(
|
|
1300
1328
|
}
|
1301
1329
|
#endif
|
1302
1330
|
|
1303
|
-
ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &
|
1331
|
+
ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL);
|
1304
1332
|
|
1305
1333
|
if (progress_callback) {
|
1306
1334
|
progress_callback(1.0f, progress_callback_user_data);
|
@@ -1310,12 +1338,13 @@ static void llama_model_load_internal(
|
|
1310
1338
|
|
1311
1339
|
// loading time will be recalculated after the first eval, so
|
1312
1340
|
// we take page faults deferred by mmap() into consideration
|
1313
|
-
|
1341
|
+
model.t_load_us = ggml_time_us() - model.t_start_us;
|
1314
1342
|
}
|
1315
1343
|
|
1316
1344
|
static bool llama_model_load(
|
1317
1345
|
const std::string & fname,
|
1318
|
-
|
1346
|
+
llama_model & model,
|
1347
|
+
llama_vocab & vocab,
|
1319
1348
|
int n_ctx,
|
1320
1349
|
int n_batch,
|
1321
1350
|
int n_gpu_layers,
|
@@ -1329,7 +1358,7 @@ static bool llama_model_load(
|
|
1329
1358
|
llama_progress_callback progress_callback,
|
1330
1359
|
void *progress_callback_user_data) {
|
1331
1360
|
try {
|
1332
|
-
llama_model_load_internal(fname,
|
1361
|
+
llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
|
1333
1362
|
use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
|
1334
1363
|
return true;
|
1335
1364
|
} catch (const std::exception & err) {
|
@@ -1367,7 +1396,7 @@ static bool llama_eval_internal(
|
|
1367
1396
|
const auto & model = lctx.model;
|
1368
1397
|
const auto & hparams = model.hparams;
|
1369
1398
|
|
1370
|
-
const auto & kv_self =
|
1399
|
+
const auto & kv_self = lctx.kv_self;
|
1371
1400
|
|
1372
1401
|
LLAMA_ASSERT(!!kv_self.ctx);
|
1373
1402
|
|
@@ -1462,11 +1491,11 @@ static bool llama_eval_internal(
|
|
1462
1491
|
offload_func_kq(tmpq);
|
1463
1492
|
ggml_set_name(tmpq, "tmpq");
|
1464
1493
|
|
1465
|
-
struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0);
|
1494
|
+
struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
|
1466
1495
|
offload_func_kq(Kcur);
|
1467
1496
|
ggml_set_name(Kcur, "Kcur");
|
1468
1497
|
|
1469
|
-
struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0);
|
1498
|
+
struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
|
1470
1499
|
offload_func_kq(Qcur);
|
1471
1500
|
ggml_set_name(Qcur, "Qcur");
|
1472
1501
|
|
@@ -1609,7 +1638,7 @@ static bool llama_eval_internal(
|
|
1609
1638
|
model.layers[il].w1,
|
1610
1639
|
cur);
|
1611
1640
|
offload_func(cur);
|
1612
|
-
ggml_set_name(cur, "
|
1641
|
+
ggml_set_name(cur, "result_w1");
|
1613
1642
|
|
1614
1643
|
// SILU activation
|
1615
1644
|
cur = ggml_silu(ctx0, cur);
|
@@ -1646,15 +1675,11 @@ static bool llama_eval_internal(
|
|
1646
1675
|
{
|
1647
1676
|
cur = ggml_rms_norm(ctx0, inpL);
|
1648
1677
|
offload_func_nr(cur);
|
1649
|
-
ggml_set_name(cur, "
|
1650
|
-
|
1651
|
-
cur = ggml_rms_norm(ctx0, cur);
|
1652
|
-
offload_func_nr(cur);
|
1653
|
-
ggml_set_name(cur, "rms_norm_after");
|
1678
|
+
ggml_set_name(cur, "rms_norm_2");
|
1654
1679
|
|
1655
1680
|
// cur = cur*norm(broadcasted)
|
1656
1681
|
cur = ggml_mul(ctx0, cur, model.norm);
|
1657
|
-
offload_func_nr(cur);
|
1682
|
+
// offload_func_nr(cur); // TODO CPU + GPU mirrored backend
|
1658
1683
|
ggml_set_name(cur, "result_norm");
|
1659
1684
|
|
1660
1685
|
embeddings = cur;
|
@@ -1719,7 +1744,7 @@ static bool llama_eval_internal(
|
|
1719
1744
|
//memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N);
|
1720
1745
|
|
1721
1746
|
// update kv token count
|
1722
|
-
lctx.
|
1747
|
+
lctx.kv_self.n = n_past + N;
|
1723
1748
|
|
1724
1749
|
// extract logits
|
1725
1750
|
{
|
@@ -1998,9 +2023,10 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can
|
|
1998
2023
|
for (size_t i = 0; i < candidates->size; ++i) {
|
1999
2024
|
cum_sum += candidates->data[i].p;
|
2000
2025
|
|
2001
|
-
// Check if the running sum is
|
2002
|
-
|
2003
|
-
|
2026
|
+
// Check if the running sum is at least p or if we have kept at least min_keep tokens
|
2027
|
+
// we set the last index to i+1 to indicate that the current iterate should be included in the set
|
2028
|
+
if (cum_sum >= p && i + 1 >= min_keep) {
|
2029
|
+
last_idx = i + 1;
|
2004
2030
|
break;
|
2005
2031
|
}
|
2006
2032
|
}
|
@@ -2452,6 +2478,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
2452
2478
|
std::vector<std::thread> workers;
|
2453
2479
|
std::mutex mutex;
|
2454
2480
|
|
2481
|
+
auto use_more_bits = [] (int i_layer, int num_layers) -> bool {
|
2482
|
+
return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
|
2483
|
+
};
|
2484
|
+
|
2455
2485
|
size_t idx = 0;
|
2456
2486
|
for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) {
|
2457
2487
|
llama_buffer read_data;
|
@@ -2485,21 +2515,37 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
2485
2515
|
} else {
|
2486
2516
|
new_type = quantized_type;
|
2487
2517
|
#ifdef GGML_USE_K_QUANTS
|
2518
|
+
if (quantized_type == GGML_TYPE_Q2_K || quantized_type == GGML_TYPE_Q3_K || quantized_type == GGML_TYPE_Q4_K ||
|
2519
|
+
quantized_type == GGML_TYPE_Q5_K || quantized_type == GGML_TYPE_Q6_K) {
|
2520
|
+
int nx = tensor.ne.at(0);
|
2521
|
+
int ny = tensor.ne.at(1);
|
2522
|
+
if (nx % QK_K != 0 || ny % QK_K != 0) {
|
2523
|
+
fprintf(stderr, "\n\n========================= Tensor sizes %d x %d are not divisible by %d\n",nx,ny,QK_K);
|
2524
|
+
fprintf(stderr, "This is required to be able to use k-quants for now!\n");
|
2525
|
+
fprintf(stderr, "========================================================================================\n\n");
|
2526
|
+
throw std::runtime_error("Unsupported tensor size encountered\n");
|
2527
|
+
}
|
2528
|
+
}
|
2488
2529
|
if (tensor.name == "output.weight") {
|
2489
|
-
|
2530
|
+
int nx = tensor.ne.at(0);
|
2531
|
+
int ny = tensor.ne.at(1);
|
2532
|
+
if (nx % QK_K == 0 && ny % QK_K == 0) {
|
2533
|
+
new_type = GGML_TYPE_Q6_K;
|
2534
|
+
}
|
2490
2535
|
} else if (tensor.name.find("attention.wv.weight") != std::string::npos) {
|
2491
2536
|
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
|
2492
2537
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
|
2493
2538
|
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
|
2494
|
-
|
2495
|
-
|
2539
|
+
use_more_bits(i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
|
2540
|
+
else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
|
2541
|
+
(i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
|
2496
2542
|
++i_attention_wv;
|
2497
2543
|
} else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
|
2498
2544
|
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
|
2499
2545
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
|
2500
2546
|
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
|
2501
|
-
(i_feed_forward_w2
|
2502
|
-
|
2547
|
+
use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
|
2548
|
+
//else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_feed_forward_w2 < n_feed_forward_w2/8) new_type = GGML_TYPE_Q6_K;
|
2503
2549
|
++i_feed_forward_w2;
|
2504
2550
|
} else if (tensor.name.find("attention.wo.weight") != std::string::npos) {
|
2505
2551
|
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
|
@@ -2612,12 +2658,39 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
2612
2658
|
// interface implementation
|
2613
2659
|
//
|
2614
2660
|
|
2615
|
-
struct
|
2661
|
+
struct llama_model * llama_load_model_from_file(
|
2616
2662
|
const char * path_model,
|
2617
2663
|
struct llama_context_params params) {
|
2618
2664
|
ggml_time_init();
|
2619
2665
|
|
2620
|
-
|
2666
|
+
llama_model * model = new llama_model;
|
2667
|
+
|
2668
|
+
ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
|
2669
|
+
|
2670
|
+
if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
|
2671
|
+
params.main_gpu, params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
|
2672
|
+
params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
|
2673
|
+
delete model;
|
2674
|
+
fprintf(stderr, "%s: failed to load model\n", __func__);
|
2675
|
+
return nullptr;
|
2676
|
+
}
|
2677
|
+
|
2678
|
+
return model;
|
2679
|
+
}
|
2680
|
+
|
2681
|
+
// Destroys a model object, e.g. one returned by llama_load_model_from_file().
// A null pointer is accepted: `delete` on null does nothing.
// NOTE(review): contexts hold a reference to their model, so presumably every
// context created from it must be freed first -- confirm against callers.
void llama_free_model(struct llama_model * model)
{
    delete model;
}
|
2684
|
+
|
2685
|
+
struct llama_context * llama_new_context_with_model(
|
2686
|
+
struct llama_model * model,
|
2687
|
+
struct llama_context_params params) {
|
2688
|
+
|
2689
|
+
if (!model) {
|
2690
|
+
return nullptr;
|
2691
|
+
}
|
2692
|
+
|
2693
|
+
llama_context * ctx = new llama_context(*model, model->vocab);
|
2621
2694
|
|
2622
2695
|
if (params.seed < 0) {
|
2623
2696
|
params.seed = time(NULL);
|
@@ -2645,24 +2718,16 @@ struct llama_context * llama_init_from_file(
|
|
2645
2718
|
|
2646
2719
|
ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
|
2647
2720
|
|
2648
|
-
if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers, params.main_gpu,
|
2649
|
-
params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
|
2650
|
-
params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
|
2651
|
-
fprintf(stderr, "%s: failed to load model\n", __func__);
|
2652
|
-
llama_free(ctx);
|
2653
|
-
return nullptr;
|
2654
|
-
}
|
2655
|
-
|
2656
2721
|
// reserve memory for context buffers
|
2657
2722
|
if (!params.vocab_only) {
|
2658
|
-
if (!kv_cache_init(ctx->model.hparams, ctx->
|
2723
|
+
if (!kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
|
2659
2724
|
fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
|
2660
2725
|
llama_free(ctx);
|
2661
2726
|
return nullptr;
|
2662
2727
|
}
|
2663
2728
|
|
2664
2729
|
{
|
2665
|
-
const size_t memory_size = ggml_nbytes(ctx->
|
2730
|
+
const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
|
2666
2731
|
fprintf(stderr, "%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
|
2667
2732
|
}
|
2668
2733
|
|
@@ -2690,16 +2755,21 @@ struct llama_context * llama_init_from_file(
|
|
2690
2755
|
// this allocates all Metal resources and memory buffers
|
2691
2756
|
ctx->ctx_metal = ggml_metal_init();
|
2692
2757
|
|
2693
|
-
void *data_ptr
|
2758
|
+
void * data_ptr = NULL;
|
2694
2759
|
size_t data_size = 0;
|
2760
|
+
|
2695
2761
|
if (params.use_mmap) {
|
2696
|
-
data_ptr
|
2697
|
-
data_size= ctx->model.mapping->size;
|
2762
|
+
data_ptr = ctx->model.mapping->addr;
|
2763
|
+
data_size = ctx->model.mapping->size;
|
2698
2764
|
} else {
|
2699
|
-
data_ptr
|
2700
|
-
data_size= ggml_get_mem_size(ctx->model.ctx);
|
2765
|
+
data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
|
2766
|
+
data_size = ggml_get_mem_size (ctx->model.ctx);
|
2701
2767
|
}
|
2702
2768
|
|
2769
|
+
const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
|
2770
|
+
|
2771
|
+
printf("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
|
2772
|
+
|
2703
2773
|
#define LLAMA_METAL_CHECK_BUF(result) \
|
2704
2774
|
if (!(result)) { \
|
2705
2775
|
fprintf(stderr, "%s: failed to add buffer\n", __func__); \
|
@@ -2707,12 +2777,13 @@ struct llama_context * llama_init_from_file(
|
|
2707
2777
|
return NULL; \
|
2708
2778
|
}
|
2709
2779
|
|
2710
|
-
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size));
|
2711
|
-
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size));
|
2780
|
+
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
|
2712
2781
|
|
2713
|
-
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "
|
2714
|
-
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "
|
2715
|
-
|
2782
|
+
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0));
|
2783
|
+
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.addr, ctx->kv_self.buf.size, 0));
|
2784
|
+
|
2785
|
+
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0));
|
2786
|
+
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0));
|
2716
2787
|
#undef LLAMA_METAL_CHECK_BUF
|
2717
2788
|
}
|
2718
2789
|
#endif
|
@@ -2720,7 +2791,23 @@ struct llama_context * llama_init_from_file(
|
|
2720
2791
|
return ctx;
|
2721
2792
|
}
|
2722
2793
|
|
2794
|
+
struct llama_context * llama_init_from_file(
|
2795
|
+
const char * path_model,
|
2796
|
+
struct llama_context_params params) {
|
2797
|
+
|
2798
|
+
struct llama_model * model = llama_load_model_from_file(path_model, params);
|
2799
|
+
if (!model) {
|
2800
|
+
return nullptr;
|
2801
|
+
}
|
2802
|
+
struct llama_context * ctx = llama_new_context_with_model(model, params);
|
2803
|
+
ctx->model_owner = true;
|
2804
|
+
return ctx;
|
2805
|
+
}
|
2806
|
+
|
2723
2807
|
void llama_free(struct llama_context * ctx) {
|
2808
|
+
if (ctx->model_owner) {
|
2809
|
+
delete &ctx->model;
|
2810
|
+
}
|
2724
2811
|
delete ctx;
|
2725
2812
|
}
|
2726
2813
|
|
@@ -2737,11 +2824,9 @@ int llama_model_quantize(
|
|
2737
2824
|
}
|
2738
2825
|
}
|
2739
2826
|
|
2740
|
-
int llama_apply_lora_from_file_internal(struct
|
2827
|
+
int llama_apply_lora_from_file_internal(const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads) {
|
2741
2828
|
fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
|
2742
2829
|
|
2743
|
-
auto & model = ctx->model;
|
2744
|
-
|
2745
2830
|
const int64_t t_start_lora_us = ggml_time_us();
|
2746
2831
|
|
2747
2832
|
auto fin = std::ifstream(path_lora, std::ios::binary);
|
@@ -2818,7 +2903,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
|
|
2818
2903
|
|
2819
2904
|
// maybe this should be in llama_model_loader
|
2820
2905
|
if (model_loader->use_mmap) {
|
2821
|
-
model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0));
|
2906
|
+
model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0, ggml_is_numa()));
|
2822
2907
|
}
|
2823
2908
|
}
|
2824
2909
|
|
@@ -2984,7 +3069,16 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
|
|
2984
3069
|
|
2985
3070
|
int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
|
2986
3071
|
try {
|
2987
|
-
return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads);
|
3072
|
+
return llama_apply_lora_from_file_internal(ctx->model, path_lora, path_base_model, n_threads);
|
3073
|
+
} catch (const std::exception & err) {
|
3074
|
+
fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
|
3075
|
+
return 1;
|
3076
|
+
}
|
3077
|
+
}
|
3078
|
+
|
3079
|
+
int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, const char * path_base_model, int n_threads) {
|
3080
|
+
try {
|
3081
|
+
return llama_apply_lora_from_file_internal(*model, path_lora, path_base_model, n_threads);
|
2988
3082
|
} catch (const std::exception & err) {
|
2989
3083
|
fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
|
2990
3084
|
return 1;
|
@@ -2992,7 +3086,7 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
|
|
2992
3086
|
}
|
2993
3087
|
|
2994
3088
|
int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
|
2995
|
-
return ctx->
|
3089
|
+
return ctx->kv_self.n;
|
2996
3090
|
}
|
2997
3091
|
|
2998
3092
|
#define LLAMA_MAX_RNG_STATE (64*1024)
|
@@ -3017,7 +3111,7 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
|
|
3017
3111
|
const size_t s_embedding = ctx->embedding.size() * sizeof(float);
|
3018
3112
|
const size_t s_kv_size = sizeof(size_t);
|
3019
3113
|
const size_t s_kv_ntok = sizeof(int);
|
3020
|
-
const size_t s_kv = ctx->
|
3114
|
+
const size_t s_kv = ctx->kv_self.buf.size;
|
3021
3115
|
|
3022
3116
|
const size_t s_total = (
|
3023
3117
|
+ s_rng_size
|
@@ -3083,7 +3177,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
|
|
3083
3177
|
|
3084
3178
|
// copy kv cache
|
3085
3179
|
{
|
3086
|
-
const auto & kv_self = ctx->
|
3180
|
+
const auto & kv_self = ctx->kv_self;
|
3087
3181
|
const auto & hparams = ctx->model.hparams;
|
3088
3182
|
const int n_layer = hparams.n_layer;
|
3089
3183
|
const int n_embd = hparams.n_embd;
|
@@ -3098,9 +3192,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
|
|
3098
3192
|
if (kv_size) {
|
3099
3193
|
const size_t elt_size = ggml_element_size(kv_self.k);
|
3100
3194
|
|
3101
|
-
|
3102
|
-
|
3103
|
-
ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
|
3195
|
+
ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
|
3104
3196
|
ggml_cgraph gf{};
|
3105
3197
|
gf.n_threads = 1;
|
3106
3198
|
|
@@ -3189,7 +3281,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
|
|
3189
3281
|
|
3190
3282
|
// set kv cache
|
3191
3283
|
{
|
3192
|
-
const auto & kv_self = ctx->
|
3284
|
+
const auto & kv_self = ctx->kv_self;
|
3193
3285
|
const auto & hparams = ctx->model.hparams;
|
3194
3286
|
const int n_layer = hparams.n_layer;
|
3195
3287
|
const int n_embd = hparams.n_embd;
|
@@ -3206,9 +3298,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
|
|
3206
3298
|
|
3207
3299
|
const size_t elt_size = ggml_element_size(kv_self.k);
|
3208
3300
|
|
3209
|
-
|
3210
|
-
|
3211
|
-
ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
|
3301
|
+
ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
|
3212
3302
|
ggml_cgraph gf{};
|
3213
3303
|
gf.n_threads = 1;
|
3214
3304
|
|
@@ -3235,7 +3325,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
|
|
3235
3325
|
ggml_free(cpy_ctx);
|
3236
3326
|
}
|
3237
3327
|
|
3238
|
-
ctx->
|
3328
|
+
ctx->kv_self.n = kv_ntok;
|
3239
3329
|
}
|
3240
3330
|
|
3241
3331
|
const size_t nread = inp - src;
|
@@ -3443,9 +3533,12 @@ void llama_print_timings(struct llama_context * ctx) {
|
|
3443
3533
|
|
3444
3534
|
fprintf(stderr, "\n");
|
3445
3535
|
fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
|
3446
|
-
fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token
|
3447
|
-
|
3448
|
-
fprintf(stderr, "%s:
|
3536
|
+
fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
|
3537
|
+
__func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample, 1e6 / ctx->t_sample_us * n_sample);
|
3538
|
+
fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
|
3539
|
+
__func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval, 1e6 / ctx->t_p_eval_us * n_p_eval);
|
3540
|
+
fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
|
3541
|
+
__func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval, 1e6 / ctx->t_eval_us * n_eval);
|
3449
3542
|
fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0);
|
3450
3543
|
}
|
3451
3544
|
|
@@ -3479,6 +3572,6 @@ const char * llama_print_system_info(void) {
|
|
3479
3572
|
}
|
3480
3573
|
|
3481
3574
|
// For internal test use
|
3482
|
-
std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
|
3575
|
+
const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
|
3483
3576
|
return ctx->model.tensors_by_name;
|
3484
3577
|
}
|