llama_cpp 0.2.2 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +28 -0
- data/README.md +39 -6
- data/examples/chat.rb +2 -1
- data/examples/embedding.rb +3 -2
- data/ext/llama_cpp/extconf.rb +13 -0
- data/ext/llama_cpp/llama_cpp.cpp +231 -132
- data/ext/llama_cpp/src/ggml-cuda.cu +319 -52
- data/ext/llama_cpp/src/ggml-metal.m +36 -30
- data/ext/llama_cpp/src/ggml-metal.metal +328 -84
- data/ext/llama_cpp/src/ggml.c +800 -303
- data/ext/llama_cpp/src/ggml.h +68 -5
- data/ext/llama_cpp/src/k_quants.c +1712 -56
- data/ext/llama_cpp/src/k_quants.h +41 -6
- data/ext/llama_cpp/src/llama-util.h +19 -5
- data/ext/llama_cpp/src/llama.cpp +138 -72
- data/ext/llama_cpp/src/llama.h +33 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +0 -2
- data/sig/llama_cpp.rbs +12 -17
- metadata +2 -3
- data/lib/llama_cpp/client.rb +0 -172
data/ext/llama_cpp/src/k_quants.h
CHANGED
@@ -7,7 +7,13 @@
 #include <stddef.h>
 
 // Super-block size
+#ifdef GGML_QKK_64
+#define QK_K 64
+#define K_SCALE_SIZE 4
+#else
 #define QK_K 256
+#define K_SCALE_SIZE 12
+#endif
 
 //
 // Super-block quantization structures
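The QK_K / K_SCALE_SIZE choice above is resolved entirely by the preprocessor, so which super-block size the gem uses is fixed when the native extension is compiled (the define would typically arrive via a compiler flag such as -DGGML_QKK_64). A small standalone check, assuming this k_quants.h and its ggml.h dependency are on the include path; the program below is illustrative only and not part of the gem:

    #include <stdio.h>
    #include "k_quants.h"

    int main(void) {
        /* Prints 256 / 12 for the default build, or 64 / 4 when GGML_QKK_64 is defined. */
        printf("QK_K = %d, K_SCALE_SIZE = %d\n", QK_K, K_SCALE_SIZE);
        return 0;
    }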
@@ -29,38 +35,67 @@ static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "w
 // weight is represented as x = a * q
 // 16 blocks of 16 elemenets each
 // Effectively 3.4375 bits per weight
+#ifdef GGML_QKK_64
 typedef struct {
     uint8_t hmask[QK_K/8];     // quants - high bit
     uint8_t qs[QK_K/4];        // quants - low 2 bits
-    uint8_t scales[
+    uint8_t scales[2];
     ggml_fp16_t d;             // super-block scale
 } block_q3_K;
-static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 +
+static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding");
+#else
+typedef struct {
+    uint8_t hmask[QK_K/8];     // quants - high bit
+    uint8_t qs[QK_K/4];        // quants - low 2 bits
+    uint8_t scales[12];        // scales, quantized with 6 bits
+    ggml_fp16_t d;             // super-block scale
+} block_q3_K;
+static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding");
+#endif
 
 // 4-bit quantization
 // 16 blocks of 32 elements each
 // weight is represented as x = a * q + b
 // Effectively 4.5 bits per weight
+#ifdef GGML_QKK_64
+typedef struct {
+    ggml_fp16_t d[2];          // super-block scales/mins
+    uint8_t scales[2];         // 4-bit block scales/mins
+    uint8_t qs[QK_K/2];        // 4--bit quants
+} block_q4_K;
+static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding");
+#else
 typedef struct {
     ggml_fp16_t d;             // super-block scale for quantized scales
     ggml_fp16_t dmin;          // super-block scale for quantized mins
-    uint8_t scales[
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
     uint8_t qs[QK_K/2];        // 4--bit quants
 } block_q4_K;
-static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) +
+static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
+#endif
 
 // 5-bit quantization
 // 16 blocks of 32 elements each
 // weight is represented as x = a * q + b
 // Effectively 5.5 bits per weight
+#ifdef GGML_QKK_64
+typedef struct {
+    ggml_fp16_t d;             // super-block scale
+    int8_t scales[QK_K/16];    // 8-bit block scales
+    uint8_t qh[QK_K/8];        // quants, high bit
+    uint8_t qs[QK_K/2];        // quants, low 4 bits
+} block_q5_K;
+static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
+#else
 typedef struct {
     ggml_fp16_t d;             // super-block scale for quantized scales
     ggml_fp16_t dmin;          // super-block scale for quantized mins
-    uint8_t scales[
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
     uint8_t qh[QK_K/8];        // quants, high bit
     uint8_t qs[QK_K/2];        // quants, low 4 bits
 } block_q5_K;
-static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) +
+static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
+#endif
 
 // 6-bit quantization
 // weight is represented as x = a * q
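As a sanity check on the "bits per weight" comments quoted above, the super-block byte counts of the default QK_K == 256 layouts work out exactly to the advertised rates. The standalone sketch below (not part of the gem) just redoes the arithmetic with a 2-byte ggml_fp16_t:

    #include <assert.h>

    int main(void) {
        const int QK_K = 256;
        const int q3_bytes = QK_K/8 + QK_K/4 + 12 + 2;     /* hmask + qs + scales + d      = 110 */
        const int q4_bytes = 2 + 2 + 12 + QK_K/2;          /* d + dmin + scales + qs       = 144 */
        const int q5_bytes = 2 + 2 + 12 + QK_K/8 + QK_K/2; /* d + dmin + scales + qh + qs  = 176 */

        assert(q3_bytes * 8.0 / QK_K == 3.4375); /* "Effectively 3.4375 bits per weight" */
        assert(q4_bytes * 8.0 / QK_K == 4.5);    /* "Effectively 4.5 bits per weight"    */
        assert(q5_bytes * 8.0 / QK_K == 5.5);    /* "Effectively 5.5 bits per weight"    */
        return 0;
    }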
data/ext/llama_cpp/src/llama-util.h
CHANGED
@@ -172,12 +172,14 @@ struct llama_mmap {
 #ifdef _POSIX_MAPPED_FILES
     static constexpr bool SUPPORTED = true;
 
-    llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value
+    llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) {
         size = file->size;
         int fd = fileno(file->fp);
         int flags = MAP_SHARED;
+        // prefetch/readahead impairs performance on NUMA systems
+        if (numa) { prefetch = 0; }
 #ifdef __linux__
-        flags |= MAP_POPULATE;
+        if (prefetch) { flags |= MAP_POPULATE; }
 #endif
         addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
         if (addr == MAP_FAILED) {
@@ -191,6 +193,14 @@ struct llama_mmap {
                         strerror(errno));
             }
         }
+        if (numa) {
+            // advise the kernel not to use readahead
+            // (because the next page might not belong on the same node)
+            if (madvise(addr, file->size, MADV_RANDOM)) {
+                fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n",
+                        strerror(errno));
+            }
+        }
     }
 
     ~llama_mmap() {
@@ -199,7 +209,9 @@ struct llama_mmap {
 #elif defined(_WIN32)
     static constexpr bool SUPPORTED = true;
 
-    llama_mmap(struct llama_file * file, bool prefetch = true) {
+    llama_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) {
+        (void) numa;
+
         size = file->size;
 
         HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
@@ -244,8 +256,10 @@ struct llama_mmap {
 #else
     static constexpr bool SUPPORTED = false;
 
-    llama_mmap(struct llama_file *, bool prefetch = true) {
-        (void)prefetch;
+    llama_mmap(struct llama_file *, bool prefetch = true, bool numa = false) {
+        (void) prefetch;
+        (void) numa;
+
         throw std::runtime_error(std::string("mmap not supported"));
     }
 #endif
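The numa flag threaded through llama_mmap above has two effects on Linux: MAP_POPULATE prefetching is skipped, and readahead is disabled with madvise(MADV_RANDOM), since pages pulled in by readahead may land on the wrong NUMA node. A minimal standalone sketch of the same POSIX pattern (hypothetical helper, not the gem's code):

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/mman.h>
    #include <sys/stat.h>
    #include <unistd.h>

    /* Map a file read-only the way the patched llama_mmap does when numa == true:
       no MAP_POPULATE prefetch, and MADV_RANDOM to suppress kernel readahead. */
    static void * map_readonly_numa(const char * path, size_t * out_size) {
        int fd = open(path, O_RDONLY);
        if (fd < 0) { perror("open"); return NULL; }

        struct stat st;
        if (fstat(fd, &st) != 0) { perror("fstat"); close(fd); return NULL; }

        void * addr = mmap(NULL, (size_t) st.st_size, PROT_READ, MAP_SHARED, fd, 0);
        close(fd); /* the mapping remains valid after the descriptor is closed */
        if (addr == MAP_FAILED) { perror("mmap"); return NULL; }

        if (madvise(addr, (size_t) st.st_size, MADV_RANDOM) != 0) {
            perror("madvise"); /* non-fatal, mirroring the warning in the diff */
        }

        *out_size = (size_t) st.st_size;
        return addr;
    }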
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -21,9 +21,13 @@
 #endif
 #ifdef GGML_USE_K_QUANTS
 #ifndef QK_K
+#ifdef GGML_QKK_64
+#define QK_K 64
+#else
 #define QK_K 256
 #endif
 #endif
+#endif
 
 #include <array>
 #include <ctime>
@@ -182,6 +186,19 @@ struct llama_kv_cache {
     }
 };
 
+struct llama_vocab {
+    using id = int32_t;
+    using token = std::string;
+
+    struct token_score {
+        token tok;
+        float score;
+    };
+
+    std::unordered_map<token, id> token_to_id;
+    std::vector<token_score> id_to_token;
+};
+
 struct llama_model {
     e_model type = MODEL_UNKNOWN;
 
@@ -198,10 +215,6 @@ struct llama_model {
     // context
     struct ggml_context * ctx = NULL;
 
-    // key + value cache for the self attention
-    // TODO: move to llama_state
-    struct llama_kv_cache kv_self;
-
     // the model memory buffer
     llama_ctx_buffer buf;
 
@@ -215,6 +228,11 @@
     // for quantize-stats only
     std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
 
+    int64_t t_load_us = 0;
+    int64_t t_start_us = 0;
+
+    llama_vocab vocab;
+
     ~llama_model() {
         if (ctx) {
             ggml_free(ctx);
@@ -233,24 +251,11 @@ struct llama_model {
     }
 };
 
-struct llama_vocab {
-    using id = int32_t;
-    using token = std::string;
-
-    struct token_score {
-        token tok;
-        float score;
-    };
-
-    std::unordered_map<token, id> token_to_id;
-    std::vector<token_score> id_to_token;
-};
-
 struct llama_context {
+    llama_context(const llama_model & model, const llama_vocab & vocab) : model(model), vocab(vocab), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
+
     std::mt19937 rng;
 
-    int64_t t_load_us = 0;
-    int64_t t_start_us = 0;
     bool has_evaluated_once = false;
 
     int64_t t_sample_us = 0;
@@ -261,8 +266,16 @@ struct llama_context {
     int32_t n_eval = 0; // number of eval calls
     int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
 
-    llama_model model;
-    llama_vocab vocab;
+    const llama_model & model;
+    const llama_vocab & vocab;
+
+    bool model_owner = false;
+
+    int64_t t_load_us;
+    int64_t t_start_us;
+
+    // key + value cache for the self attention
+    struct llama_kv_cache kv_self;
 
     size_t mem_per_token = 0;
 
@@ -761,7 +774,7 @@ struct llama_model_loader {
     }
 
     if (use_mmap) {
-        mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
+        mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size, ggml_is_numa()));
         if (lmlock) {
             lmlock->init(mapping->addr);
         }
@@ -964,7 +977,7 @@ bool llama_mlock_supported() {
     return llama_mlock::SUPPORTED;
 }
 
-void llama_init_backend() {
+void llama_init_backend(bool numa) {
     ggml_time_init();
 
     // needed to initialize f16 tables
@@ -973,6 +986,10 @@ void llama_init_backend() {
         struct ggml_context * ctx = ggml_init(params);
         ggml_free(ctx);
     }
+
+    if (numa) {
+        ggml_numa_init();
+    }
 }
 
 int64_t llama_time_us() {
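For C callers this is a breaking signature change: llama_init_backend now takes the NUMA switch explicitly and only calls ggml_numa_init() when asked. A hedged usage sketch:

    #include "llama.h"

    int main(void) {
        // One-time process-wide initialization. Pass true only on NUMA machines;
        // that is what triggers ggml_numa_init() in the hunk above.
        llama_init_backend(/*numa=*/false);

        // ... load a model and run inference ...

        return 0;
    }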
@@ -1033,7 +1050,8 @@ static const char *llama_model_type_name(e_model type) {
 
 static void llama_model_load_internal(
         const std::string & fname,
-
+        llama_model & model,
+        llama_vocab & vocab,
         int n_ctx,
         int n_batch,
         int n_gpu_layers,
@@ -1047,12 +1065,11 @@ static void llama_model_load_internal(
         llama_progress_callback progress_callback,
         void * progress_callback_user_data) {
 
-
+    model.t_start_us = ggml_time_us();
 
     std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap, vocab_only));
 
-
-    auto & model = lctx.model;
+    vocab = std::move(ml->file_loaders.at(0)->vocab);
     model.hparams = ml->file_loaders.at(0)->hparams;
     model.n_gpu_layers = n_gpu_layers;
     llama_file_version file_version = ml->file_loaders.at(0)->file_version;
@@ -1122,15 +1139,15 @@ static void llama_model_load_internal(
 
     // create the ggml context
     {
-
+        model.buf.resize(ctx_size);
         if (use_mlock) {
-
-
+            model.mlock_buf.init(model.buf.addr);
+            model.mlock_buf.grow_to(model.buf.size);
         }
 
         struct ggml_init_params params = {
-            /*.mem_size =*/
-            /*.mem_buffer =*/
+            /*.mem_size =*/ model.buf.size,
+            /*.mem_buffer =*/ model.buf.addr,
             /*.no_alloc =*/ ml->use_mmap,
         };
 
@@ -1311,7 +1328,7 @@ static void llama_model_load_internal(
     }
 #endif
 
-    ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &
+    ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL);
 
     if (progress_callback) {
         progress_callback(1.0f, progress_callback_user_data);
@@ -1321,12 +1338,13 @@ static void llama_model_load_internal(
 
     // loading time will be recalculate after the first eval, so
     // we take page faults deferred by mmap() into consideration
-
+    model.t_load_us = ggml_time_us() - model.t_start_us;
 }
 
 static bool llama_model_load(
         const std::string & fname,
-
+        llama_model & model,
+        llama_vocab & vocab,
         int n_ctx,
         int n_batch,
         int n_gpu_layers,
@@ -1340,7 +1358,7 @@ static bool llama_model_load(
         llama_progress_callback progress_callback,
         void *progress_callback_user_data) {
     try {
-        llama_model_load_internal(fname,
+        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
                                   use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
        return true;
     } catch (const std::exception & err) {
@@ -1378,7 +1396,7 @@ static bool llama_eval_internal(
     const auto & model = lctx.model;
     const auto & hparams = model.hparams;
 
-    const auto & kv_self =
+    const auto & kv_self = lctx.kv_self;
 
     LLAMA_ASSERT(!!kv_self.ctx);
 
@@ -1473,11 +1491,11 @@ static bool llama_eval_internal(
            offload_func_kq(tmpq);
            ggml_set_name(tmpq, "tmpq");
 
-           struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0);
+           struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
            offload_func_kq(Kcur);
            ggml_set_name(Kcur, "Kcur");
 
-           struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0);
+           struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
            offload_func_kq(Qcur);
            ggml_set_name(Qcur, "Qcur");
 
@@ -1726,7 +1744,7 @@ static bool llama_eval_internal(
     //memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N);
 
     // update kv token count
-    lctx.
+    lctx.kv_self.n = n_past + N;
 
     // extract logits
     {
@@ -2005,9 +2023,10 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can
     for (size_t i = 0; i < candidates->size; ++i) {
         cum_sum += candidates->data[i].p;
 
-        // Check if the running sum is
-
-
+        // Check if the running sum is at least p or if we have kept at least min_keep tokens
+        // we set the last index to i+1 to indicate that the current iterate should be included in the set
+        if (cum_sum >= p && i + 1 >= min_keep) {
+            last_idx = i + 1;
            break;
         }
     }
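The rewritten top-p cutoff keeps the token that pushes the running sum past p, and never keeps fewer than min_keep tokens. For example, with sorted probabilities {0.5, 0.3, 0.2}, p = 0.7 and min_keep = 1 the loop stops at i = 1 with last_idx = 2, so two tokens survive. A standalone restatement of the rule (hypothetical helper, not the library function):

    #include <stdio.h>

    /* Mirror of the cutoff rule above: keep token i as soon as the running sum
       reaches p, but never keep fewer than min_keep tokens. */
    static size_t top_p_cutoff(const float * probs, size_t n, float p, size_t min_keep) {
        float cum_sum = 0.0f;
        size_t last_idx = n;
        for (size_t i = 0; i < n; ++i) {
            cum_sum += probs[i];
            if (cum_sum >= p && i + 1 >= min_keep) {
                last_idx = i + 1;
                break;
            }
        }
        return last_idx;
    }

    int main(void) {
        const float probs[] = { 0.5f, 0.3f, 0.2f };
        /* Prints 2: 0.5 + 0.3 >= 0.7, and 2 >= min_keep. */
        printf("%zu\n", top_p_cutoff(probs, 3, 0.7f, 1));
        return 0;
    }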
@@ -2459,6 +2478,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     std::vector<std::thread> workers;
     std::mutex mutex;
 
+    auto use_more_bits = [] (int i_layer, int num_layers) -> bool {
+        return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
+    };
+
     size_t idx = 0;
     for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) {
         llama_buffer read_data;
@@ -2513,15 +2536,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
             else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
             else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
-
-
+                use_more_bits(i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
+            else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
+                (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
             ++i_attention_wv;
         } else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
             if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
             else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
             else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
-                (i_feed_forward_w2
-
+                use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+            //else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_feed_forward_w2 < n_feed_forward_w2/8) new_type = GGML_TYPE_Q6_K;
             ++i_feed_forward_w2;
         } else if (tensor.name.find("attention.wo.weight") != std::string::npos) {
             if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
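The use_more_bits heuristic promotes the first eighth of the layers, the last eighth, and every third layer in between to Q6_K for the attention.wv and feed_forward.w2 tensors. A standalone illustration (hypothetical driver; the lambda's logic is copied verbatim):

    #include <stdio.h>

    /* Same rule as the use_more_bits lambda in the diff above. */
    static int use_more_bits(int i_layer, int num_layers) {
        return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 ||
               (i_layer - num_layers/8) % 3 == 2;
    }

    int main(void) {
        // For a hypothetical 32-layer model this prints 0..3, 6, 9, ..., 27, 28..31.
        for (int i = 0; i < 32; ++i) {
            if (use_more_bits(i, 32)) printf("%d ", i);
        }
        printf("\n");
        return 0;
    }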
@@ -2634,12 +2658,39 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 // interface implementation
 //
 
-struct
+struct llama_model * llama_load_model_from_file(
         const char * path_model,
         struct llama_context_params params) {
     ggml_time_init();
 
-
+    llama_model * model = new llama_model;
+
+    ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
+
+    if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
+                params.main_gpu, params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
+                params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
+        delete model;
+        fprintf(stderr, "%s: failed to load model\n", __func__);
+        return nullptr;
+    }
+
+    return model;
+}
+
+void llama_free_model(struct llama_model * model) {
+    delete model;
+}
+
+struct llama_context * llama_new_context_with_model(
+        struct llama_model * model,
+        struct llama_context_params params) {
+
+    if (!model) {
+        return nullptr;
+    }
+
+    llama_context * ctx = new llama_context(*model, model->vocab);
 
     if (params.seed < 0) {
         params.seed = time(NULL);
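The practical upshot of splitting llama_init_from_file into llama_load_model_from_file plus llama_new_context_with_model is that one set of weights can now back several contexts, each with its own KV cache. A hedged C sketch of the new flow; the model path and parameter choices are placeholders:

    #include <stdio.h>
    #include "llama.h"

    int main(void) {
        llama_init_backend(/*numa=*/false);

        struct llama_context_params params = llama_context_default_params();

        // Load the weights once...
        struct llama_model * model = llama_load_model_from_file("ggml-model-q4_K.bin", params);
        if (!model) {
            fprintf(stderr, "failed to load model\n");
            return 1;
        }

        // ...and create as many contexts on top of them as needed,
        // each with its own KV cache (now owned by llama_context, not llama_model).
        struct llama_context * ctx_a = llama_new_context_with_model(model, params);
        struct llama_context * ctx_b = llama_new_context_with_model(model, params);

        // ... tokenize / eval / sample ...

        llama_free(ctx_a);
        llama_free(ctx_b);
        llama_free_model(model); // contexts created this way do not own the model
        return 0;
    }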
@@ -2667,24 +2718,16 @@ struct llama_context * llama_init_from_file(
 
     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
 
-    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers, params.main_gpu,
-                params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
-                params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
-        fprintf(stderr, "%s: failed to load model\n", __func__);
-        llama_free(ctx);
-        return nullptr;
-    }
-
     // reserve memory for context buffers
     if (!params.vocab_only) {
-        if (!kv_cache_init(ctx->model.hparams, ctx->
+        if (!kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
            fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
            llama_free(ctx);
            return nullptr;
        }
 
        {
-            const size_t memory_size = ggml_nbytes(ctx->
+            const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
            fprintf(stderr, "%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
        }
 
@@ -2736,8 +2779,8 @@ struct llama_context * llama_init_from_file(
 
     LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
 
-    LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr,
-    LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->
+    LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0));
+    LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.addr, ctx->kv_self.buf.size, 0));
 
     LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0));
     LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0));
@@ -2748,7 +2791,23 @@ struct llama_context * llama_init_from_file(
     return ctx;
 }
 
+struct llama_context * llama_init_from_file(
+        const char * path_model,
+        struct llama_context_params params) {
+
+    struct llama_model * model = llama_load_model_from_file(path_model, params);
+    if (!model) {
+        return nullptr;
+    }
+    struct llama_context * ctx = llama_new_context_with_model(model, params);
+    ctx->model_owner = true;
+    return ctx;
+}
+
 void llama_free(struct llama_context * ctx) {
+    if (ctx->model_owner) {
+        delete &ctx->model;
+    }
     delete ctx;
 }
 
@@ -2765,11 +2824,9 @@ int llama_model_quantize(
     }
 }
 
-int llama_apply_lora_from_file_internal(struct
+int llama_apply_lora_from_file_internal(const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads) {
     fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
 
-    auto & model = ctx->model;
-
     const int64_t t_start_lora_us = ggml_time_us();
 
     auto fin = std::ifstream(path_lora, std::ios::binary);
@@ -2846,7 +2903,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
 
     // maybe this should in llama_model_loader
     if (model_loader->use_mmap) {
-        model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0));
+        model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0, ggml_is_numa()));
     }
 }
 
@@ -3012,7 +3069,16 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
 
 int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
     try {
-        return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads);
+        return llama_apply_lora_from_file_internal(ctx->model, path_lora, path_base_model, n_threads);
+    } catch (const std::exception & err) {
+        fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
+        return 1;
+    }
+}
+
+int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, const char * path_base_model, int n_threads) {
+    try {
+        return llama_apply_lora_from_file_internal(*model, path_lora, path_base_model, n_threads);
     } catch (const std::exception & err) {
         fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
         return 1;
@@ -3020,7 +3086,7 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
     }
 }
 int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
-    return ctx->
+    return ctx->kv_self.n;
 }
 
 #define LLAMA_MAX_RNG_STATE (64*1024)
@@ -3045,7 +3111,7 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
     const size_t s_embedding = ctx->embedding.size() * sizeof(float);
     const size_t s_kv_size = sizeof(size_t);
     const size_t s_kv_ntok = sizeof(int);
-    const size_t s_kv = ctx->
+    const size_t s_kv = ctx->kv_self.buf.size;
 
     const size_t s_total = (
         + s_rng_size
@@ -3111,7 +3177,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
 
     // copy kv cache
     {
-        const auto & kv_self = ctx->
+        const auto & kv_self = ctx->kv_self;
         const auto & hparams = ctx->model.hparams;
         const int n_layer = hparams.n_layer;
         const int n_embd = hparams.n_embd;
@@ -3215,7 +3281,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
 
     // set kv cache
     {
-        const auto & kv_self = ctx->
+        const auto & kv_self = ctx->kv_self;
         const auto & hparams = ctx->model.hparams;
         const int n_layer = hparams.n_layer;
         const int n_embd = hparams.n_embd;
@@ -3259,7 +3325,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
         ggml_free(cpy_ctx);
     }
 
-    ctx->
+    ctx->kv_self.n = kv_ntok;
     }
 
     const size_t nread = inp - src;
@@ -3506,6 +3572,6 @@ const char * llama_print_system_info(void) {
 }
 
 // For internal test use
-std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
+const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
     return ctx->model.tensors_by_name;
 }