llama_cpp 0.2.2 → 0.3.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +28 -0
- data/README.md +39 -6
- data/examples/chat.rb +2 -1
- data/examples/embedding.rb +3 -2
- data/ext/llama_cpp/extconf.rb +13 -0
- data/ext/llama_cpp/llama_cpp.cpp +231 -132
- data/ext/llama_cpp/src/ggml-cuda.cu +319 -52
- data/ext/llama_cpp/src/ggml-metal.m +36 -30
- data/ext/llama_cpp/src/ggml-metal.metal +328 -84
- data/ext/llama_cpp/src/ggml.c +800 -303
- data/ext/llama_cpp/src/ggml.h +68 -5
- data/ext/llama_cpp/src/k_quants.c +1712 -56
- data/ext/llama_cpp/src/k_quants.h +41 -6
- data/ext/llama_cpp/src/llama-util.h +19 -5
- data/ext/llama_cpp/src/llama.cpp +138 -72
- data/ext/llama_cpp/src/llama.h +33 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +0 -2
- data/sig/llama_cpp.rbs +12 -17
- metadata +2 -3
- data/lib/llama_cpp/client.rb +0 -172
data/ext/llama_cpp/src/k_quants.h
CHANGED
@@ -7,7 +7,13 @@
 #include <stddef.h>
 
 // Super-block size
+#ifdef GGML_QKK_64
+#define QK_K 64
+#define K_SCALE_SIZE 4
+#else
 #define QK_K 256
+#define K_SCALE_SIZE 12
+#endif
 
 //
 // Super-block quantization structures
@@ -29,38 +35,67 @@ static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "w
 // weight is represented as x = a * q
 // 16 blocks of 16 elemenets each
 // Effectively 3.4375 bits per weight
+#ifdef GGML_QKK_64
 typedef struct {
     uint8_t hmask[QK_K/8];     // quants - high bit
     uint8_t qs[QK_K/4];        // quants - low 2 bits
-    uint8_t scales[
+    uint8_t scales[2];
     ggml_fp16_t d;             // super-block scale
 } block_q3_K;
-static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 +
+static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding");
+#else
+typedef struct {
+    uint8_t hmask[QK_K/8];     // quants - high bit
+    uint8_t qs[QK_K/4];        // quants - low 2 bits
+    uint8_t scales[12];        // scales, quantized with 6 bits
+    ggml_fp16_t d;             // super-block scale
+} block_q3_K;
+static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding");
+#endif
 
 // 4-bit quantization
 // 16 blocks of 32 elements each
 // weight is represented as x = a * q + b
 // Effectively 4.5 bits per weight
+#ifdef GGML_QKK_64
+typedef struct {
+    ggml_fp16_t d[2];          // super-block scales/mins
+    uint8_t scales[2];         // 4-bit block scales/mins
+    uint8_t qs[QK_K/2];        // 4--bit quants
+} block_q4_K;
+static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding");
+#else
 typedef struct {
     ggml_fp16_t d;             // super-block scale for quantized scales
     ggml_fp16_t dmin;          // super-block scale for quantized mins
-    uint8_t scales[
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
     uint8_t qs[QK_K/2];        // 4--bit quants
 } block_q4_K;
-static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) +
+static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
+#endif
 
 // 5-bit quantization
 // 16 blocks of 32 elements each
 // weight is represented as x = a * q + b
 // Effectively 5.5 bits per weight
+#ifdef GGML_QKK_64
+typedef struct {
+    ggml_fp16_t d;             // super-block scale
+    int8_t scales[QK_K/16];    // 8-bit block scales
+    uint8_t qh[QK_K/8];        // quants, high bit
+    uint8_t qs[QK_K/2];        // quants, low 4 bits
+} block_q5_K;
+static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
+#else
 typedef struct {
     ggml_fp16_t d;             // super-block scale for quantized scales
     ggml_fp16_t dmin;          // super-block scale for quantized mins
-    uint8_t scales[
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
     uint8_t qh[QK_K/8];        // quants, high bit
     uint8_t qs[QK_K/2];        // quants, low 4 bits
 } block_q5_K;
-static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) +
+static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
+#endif
 
 // 6-bit quantization
 // weight is represented as x = a * q
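The "effectively N bits per weight" comments above follow directly from the block layouts. A minimal stand-alone sketch (not part of the gem, default QK_K = 256 assumed) that reproduces the q3_K figure:

// q3_K with QK_K == 256: 32 bytes hmask + 64 bytes qs + 12 bytes scales + 2 bytes fp16 super-block scale
#include <cstdio>

int main() {
    const int QK_K        = 256;
    const int block_bytes = QK_K/8 + QK_K/4 + 12 + 2;                    // 110 bytes per 256 weights
    std::printf("q3_K: %.4f bits per weight\n", 8.0 * block_bytes / QK_K); // prints 3.4375
    return 0;
}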
data/ext/llama_cpp/src/llama-util.h
CHANGED
@@ -172,12 +172,14 @@ struct llama_mmap {
 #ifdef _POSIX_MAPPED_FILES
     static constexpr bool SUPPORTED = true;
 
-    llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value
+    llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) {
         size = file->size;
         int fd = fileno(file->fp);
         int flags = MAP_SHARED;
+        // prefetch/readahead impairs performance on NUMA systems
+        if (numa) { prefetch = 0; }
 #ifdef __linux__
-        flags |= MAP_POPULATE;
+        if (prefetch) { flags |= MAP_POPULATE; }
 #endif
         addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
         if (addr == MAP_FAILED) {
@@ -191,6 +193,14 @@ struct llama_mmap {
                         strerror(errno));
             }
         }
+        if (numa) {
+            // advise the kernel not to use readahead
+            // (because the next page might not belong on the same node)
+            if (madvise(addr, file->size, MADV_RANDOM)) {
+                fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n",
+                        strerror(errno));
+            }
+        }
     }
 
     ~llama_mmap() {
@@ -199,7 +209,9 @@ struct llama_mmap {
 #elif defined(_WIN32)
     static constexpr bool SUPPORTED = true;
 
-    llama_mmap(struct llama_file * file, bool prefetch = true) {
+    llama_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) {
+        (void) numa;
+
         size = file->size;
 
         HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
@@ -244,8 +256,10 @@ struct llama_mmap {
 #else
     static constexpr bool SUPPORTED = false;
 
-    llama_mmap(struct llama_file *, bool prefetch = true) {
-        (void)prefetch;
+    llama_mmap(struct llama_file *, bool prefetch = true, bool numa = false) {
+        (void) prefetch;
+        (void) numa;
+
         throw std::runtime_error(std::string("mmap not supported"));
    }
 #endif
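The constructor changes above boil down to: on NUMA systems, skip MAP_POPULATE and ask the kernel not to read ahead, since the next page may live on a different node. A self-contained sketch of that logic (Linux/POSIX only; file handling and error checks are illustrative, not the gem's code):

#include <cstdio>
#include <cstring>
#include <cerrno>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

void * map_model(const char * path, bool numa) {
    int fd = open(path, O_RDONLY);
    if (fd < 0) { return nullptr; }
    struct stat st;
    fstat(fd, &st);
    size_t prefetch = numa ? 0 : (size_t) -1;   // NUMA: do not eagerly prefetch
    int flags = MAP_SHARED;
#ifdef __linux__
    if (prefetch) { flags |= MAP_POPULATE; }    // otherwise fault all pages in up front
#endif
    void * addr = mmap(NULL, st.st_size, PROT_READ, flags, fd, 0);
    close(fd);
    if (addr == MAP_FAILED) { return nullptr; }
    if (numa && madvise(addr, st.st_size, MADV_RANDOM)) {
        // readahead disabled: neighbouring pages may belong to another node
        std::fprintf(stderr, "madvise(MADV_RANDOM) failed: %s\n", std::strerror(errno));
    }
    return addr;
}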
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -21,9 +21,13 @@
 #endif
 #ifdef GGML_USE_K_QUANTS
 #ifndef QK_K
+#ifdef GGML_QKK_64
+#define QK_K 64
+#else
 #define QK_K 256
 #endif
 #endif
+#endif
 
 #include <array>
 #include <ctime>
@@ -182,6 +186,19 @@ struct llama_kv_cache {
     }
 };
 
+struct llama_vocab {
+    using id    = int32_t;
+    using token = std::string;
+
+    struct token_score {
+        token tok;
+        float score;
+    };
+
+    std::unordered_map<token, id> token_to_id;
+    std::vector<token_score> id_to_token;
+};
+
 struct llama_model {
     e_model type = MODEL_UNKNOWN;
 
@@ -198,10 +215,6 @@ struct llama_model {
     // context
     struct ggml_context * ctx = NULL;
 
-    // key + value cache for the self attention
-    // TODO: move to llama_state
-    struct llama_kv_cache kv_self;
-
     // the model memory buffer
     llama_ctx_buffer buf;
 
@@ -215,6 +228,11 @@ struct llama_model {
     // for quantize-stats only
     std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
 
+    int64_t t_load_us = 0;
+    int64_t t_start_us = 0;
+
+    llama_vocab vocab;
+
     ~llama_model() {
         if (ctx) {
             ggml_free(ctx);
@@ -233,24 +251,11 @@ struct llama_model {
     }
 };
 
-struct llama_vocab {
-    using id    = int32_t;
-    using token = std::string;
-
-    struct token_score {
-        token tok;
-        float score;
-    };
-
-    std::unordered_map<token, id> token_to_id;
-    std::vector<token_score> id_to_token;
-};
-
 struct llama_context {
+    llama_context(const llama_model & model, const llama_vocab & vocab) : model(model), vocab(vocab), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
+
     std::mt19937 rng;
 
-    int64_t t_load_us = 0;
-    int64_t t_start_us = 0;
     bool has_evaluated_once = false;
 
     int64_t t_sample_us = 0;
@@ -261,8 +266,16 @@ struct llama_context {
     int32_t n_eval   = 0; // number of eval calls
     int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
 
-    llama_model model;
-    llama_vocab vocab;
+    const llama_model & model;
+    const llama_vocab & vocab;
+
+    bool model_owner = false;
+
+    int64_t t_load_us;
+    int64_t t_start_us;
+
+    // key + value cache for the self attention
+    struct llama_kv_cache kv_self;
 
     size_t mem_per_token = 0;
 
@@ -761,7 +774,7 @@ struct llama_model_loader {
     }
 
         if (use_mmap) {
-            mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
+            mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size, ggml_is_numa()));
            if (lmlock) {
                lmlock->init(mapping->addr);
            }
@@ -964,7 +977,7 @@ bool llama_mlock_supported() {
     return llama_mlock::SUPPORTED;
 }
 
-void llama_init_backend() {
+void llama_init_backend(bool numa) {
     ggml_time_init();
 
     // needed to initialize f16 tables
@@ -973,6 +986,10 @@ void llama_init_backend() {
         struct ggml_context * ctx = ggml_init(params);
         ggml_free(ctx);
     }
+
+    if (numa) {
+        ggml_numa_init();
+    }
 }
 
 int64_t llama_time_us() {
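For callers this is a one-argument signature change: llama_init_backend() now takes a flag that triggers ggml_numa_init(). A sketch of the updated call from application code:

#include "llama.h"

int main() {
    // before 0.3.0: llama_init_backend();
    llama_init_backend(/* numa = */ false);   // pass true to enable ggml's NUMA handling
    return 0;
}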
@@ -1033,7 +1050,8 @@ static const char *llama_model_type_name(e_model type) {
 
 static void llama_model_load_internal(
         const std::string & fname,
-
+        llama_model & model,
+        llama_vocab & vocab,
         int n_ctx,
         int n_batch,
         int n_gpu_layers,
@@ -1047,12 +1065,11 @@ static void llama_model_load_internal(
         llama_progress_callback progress_callback,
         void * progress_callback_user_data) {
 
-
+    model.t_start_us = ggml_time_us();
 
     std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap, vocab_only));
 
-
-    auto & model = lctx.model;
+    vocab = std::move(ml->file_loaders.at(0)->vocab);
     model.hparams = ml->file_loaders.at(0)->hparams;
     model.n_gpu_layers = n_gpu_layers;
     llama_file_version file_version = ml->file_loaders.at(0)->file_version;
@@ -1122,15 +1139,15 @@
 
     // create the ggml context
     {
-
+        model.buf.resize(ctx_size);
         if (use_mlock) {
-
-
+            model.mlock_buf.init(model.buf.addr);
+            model.mlock_buf.grow_to(model.buf.size);
         }
 
         struct ggml_init_params params = {
-            /*.mem_size   =*/
-            /*.mem_buffer =*/
+            /*.mem_size   =*/ model.buf.size,
+            /*.mem_buffer =*/ model.buf.addr,
             /*.no_alloc   =*/ ml->use_mmap,
         };
 
@@ -1311,7 +1328,7 @@
     }
 #endif
 
-    ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &
+    ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL);
 
     if (progress_callback) {
         progress_callback(1.0f, progress_callback_user_data);
@@ -1321,12 +1338,13 @@
 
     // loading time will be recalculate after the first eval, so
     // we take page faults deferred by mmap() into consideration
-
+    model.t_load_us = ggml_time_us() - model.t_start_us;
 }
 
 static bool llama_model_load(
         const std::string & fname,
-
+        llama_model & model,
+        llama_vocab & vocab,
         int n_ctx,
         int n_batch,
         int n_gpu_layers,
@@ -1340,7 +1358,7 @@
         llama_progress_callback progress_callback,
         void *progress_callback_user_data) {
     try {
-        llama_model_load_internal(fname,
+        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
                                   use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
         return true;
     } catch (const std::exception & err) {
@@ -1378,7 +1396,7 @@ static bool llama_eval_internal(
     const auto & model   = lctx.model;
     const auto & hparams = model.hparams;
 
-    const auto & kv_self =
+    const auto & kv_self = lctx.kv_self;
 
     LLAMA_ASSERT(!!kv_self.ctx);
 
@@ -1473,11 +1491,11 @@
         offload_func_kq(tmpq);
         ggml_set_name(tmpq, "tmpq");
 
-        struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0);
+        struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
         offload_func_kq(Kcur);
         ggml_set_name(Kcur, "Kcur");
 
-        struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0);
+        struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
         offload_func_kq(Qcur);
         ggml_set_name(Qcur, "Qcur");
 
@@ -1726,7 +1744,7 @@
     //memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N);
 
     // update kv token count
-    lctx.
+    lctx.kv_self.n = n_past + N;
 
     // extract logits
     {
@@ -2005,9 +2023,10 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can
     for (size_t i = 0; i < candidates->size; ++i) {
         cum_sum += candidates->data[i].p;
 
-        // Check if the running sum is
-
-
+        // Check if the running sum is at least p or if we have kept at least min_keep tokens
+        // we set the last index to i+1 to indicate that the current iterate should be included in the set
+        if (cum_sum >= p && i + 1 >= min_keep) {
+            last_idx = i + 1;
             break;
         }
     }
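The top-p change above restores the cutoff condition: keep candidates until their cumulative probability reaches p, but never fewer than min_keep of them. A minimal sketch of that cutoff in isolation, operating on probabilities already sorted in descending order (not the gem's sampler itself):

#include <cstddef>
#include <vector>

size_t top_p_cutoff(const std::vector<float> & sorted_probs, float p, size_t min_keep) {
    size_t last_idx = sorted_probs.size();  // default: keep everything
    float cum_sum = 0.0f;
    for (size_t i = 0; i < sorted_probs.size(); ++i) {
        cum_sum += sorted_probs[i];
        if (cum_sum >= p && i + 1 >= min_keep) {
            last_idx = i + 1;               // include the current token in the kept set
            break;
        }
    }
    return last_idx;
}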
@@ -2459,6 +2478,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     std::vector<std::thread> workers;
     std::mutex mutex;
 
+    auto use_more_bits = [] (int i_layer, int num_layers) -> bool {
+        return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
+    };
+
     size_t idx = 0;
     for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) {
         llama_buffer read_data;
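use_more_bits() selects the first eighth of the layers, the last eighth, and every third layer in between for the higher-precision Q6_K type. A quick stand-alone check for a hypothetical 32-layer model:

#include <cstdio>

int main() {
    auto use_more_bits = [] (int i_layer, int num_layers) -> bool {
        return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
    };
    for (int i = 0; i < 32; ++i) {
        if (use_more_bits(i, 32)) { std::printf("%d ", i); }
    }
    std::printf("\n");  // prints: 0 1 2 3 6 9 12 15 18 21 24 27 28 29 30 31
    return 0;
}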
@@ -2513,15 +2536,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
             else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
             else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
-
-
+                    use_more_bits(i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
+            else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
+                    (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
             ++i_attention_wv;
         } else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
             if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
             else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
             else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
-                     (i_feed_forward_w2
-
+                    use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+            //else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_feed_forward_w2 < n_feed_forward_w2/8) new_type = GGML_TYPE_Q6_K;
             ++i_feed_forward_w2;
         } else if (tensor.name.find("attention.wo.weight") != std::string::npos) {
             if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
@@ -2634,12 +2658,39 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 // interface implementation
 //
 
-struct
+struct llama_model * llama_load_model_from_file(
         const char * path_model,
         struct llama_context_params params) {
     ggml_time_init();
 
-
+    llama_model * model = new llama_model;
+
+    ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
+
+    if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
+                params.main_gpu, params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
+                params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
+        delete model;
+        fprintf(stderr, "%s: failed to load model\n", __func__);
+        return nullptr;
+    }
+
+    return model;
+}
+
+void llama_free_model(struct llama_model * model) {
+    delete model;
+}
+
+struct llama_context * llama_new_context_with_model(
+        struct llama_model * model,
+        struct llama_context_params params) {
+
+    if (!model) {
+        return nullptr;
+    }
+
+    llama_context * ctx = new llama_context(*model, model->vocab);
 
     if (params.seed < 0) {
         params.seed = time(NULL);
@@ -2667,24 +2718,16 @@ struct llama_context * llama_init_from_file(
 
     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
 
-    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers, params.main_gpu,
-            params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
-            params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
-        fprintf(stderr, "%s: failed to load model\n", __func__);
-        llama_free(ctx);
-        return nullptr;
-    }
-
     // reserve memory for context buffers
     if (!params.vocab_only) {
-        if (!kv_cache_init(ctx->model.hparams, ctx->
+        if (!kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
             fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
             llama_free(ctx);
             return nullptr;
         }
 
         {
-            const size_t memory_size = ggml_nbytes(ctx->
+            const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
             fprintf(stderr, "%s: kv self size  = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
         }
 
@@ -2736,8 +2779,8 @@ struct llama_context * llama_init_from_file(
 
         LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
 
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr,
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv",   ctx->
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv",   ctx->kv_self.buf.addr, ctx->kv_self.buf.size, 0));
 
         LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0));
         LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0));
@@ -2748,7 +2791,23 @@ struct llama_context * llama_init_from_file(
     return ctx;
 }
 
+struct llama_context * llama_init_from_file(
+        const char * path_model,
+        struct llama_context_params params) {
+
+    struct llama_model * model = llama_load_model_from_file(path_model, params);
+    if (!model) {
+        return nullptr;
+    }
+    struct llama_context * ctx = llama_new_context_with_model(model, params);
+    ctx->model_owner = true;
+    return ctx;
+}
+
 void llama_free(struct llama_context * ctx) {
+    if (ctx->model_owner) {
+        delete &ctx->model;
+    }
     delete ctx;
 }
 
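The net effect of this refactor on the public API: model weights and contexts are now separate objects, so one loaded model can back several contexts, and llama_init_from_file() becomes a thin wrapper over the two new calls. A usage sketch (the model path is a placeholder):

#include "llama.h"

int main() {
    llama_init_backend(/* numa = */ false);

    llama_context_params params = llama_context_default_params();

    llama_model * model = llama_load_model_from_file("models/7B/ggml-model.bin", params);
    if (model == NULL) {
        return 1;
    }

    llama_context * ctx = llama_new_context_with_model(model, params);
    // ... tokenize, llama_eval(), sampling, etc. ...
    llama_free(ctx);          // model_owner is false here, so the weights survive

    llama_free_model(model);  // the caller releases the weights explicitly
    return 0;
}

The old single-call path still works: llama_init_from_file() loads the model, creates the context, and sets model_owner so that llama_free() also releases the weights.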
@@ -2765,11 +2824,9 @@ int llama_model_quantize(
     }
 }
 
-int llama_apply_lora_from_file_internal(struct
+int llama_apply_lora_from_file_internal(const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads) {
     fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
 
-    auto & model = ctx->model;
-
     const int64_t t_start_lora_us = ggml_time_us();
 
     auto fin = std::ifstream(path_lora, std::ios::binary);
@@ -2846,7 +2903,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
 
         // maybe this should in llama_model_loader
         if (model_loader->use_mmap) {
-            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0));
+            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0, ggml_is_numa()));
         }
     }
 
@@ -3012,7 +3069,16 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
 
 int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
     try {
-        return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads);
+        return llama_apply_lora_from_file_internal(ctx->model, path_lora, path_base_model, n_threads);
+    } catch (const std::exception & err) {
+        fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
+        return 1;
+    }
+}
+
+int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, const char * path_base_model, int n_threads) {
+    try {
+        return llama_apply_lora_from_file_internal(*model, path_lora, path_base_model, n_threads);
     } catch (const std::exception & err) {
         fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
         return 1;
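Because the LoRA loader now takes a llama_model rather than a llama_context, an adapter can be applied to the shared weights before any context exists. A small sketch, given a llama_model * model obtained as in the earlier example (file paths are placeholders):

// Pass a base model path only when patching quantized weights; NULL otherwise.
int rc = llama_model_apply_lora_from_file(model, "loras/adapter.bin", /* path_base_model = */ NULL, /* n_threads = */ 4);
if (rc != 0) {
    // the adapter could not be applied
}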
@@ -3020,7 +3086,7 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
 }
 
 int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
-    return ctx->
+    return ctx->kv_self.n;
 }
 
 #define LLAMA_MAX_RNG_STATE (64*1024)
@@ -3045,7 +3111,7 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
     const size_t s_embedding     = ctx->embedding.size() * sizeof(float);
     const size_t s_kv_size       = sizeof(size_t);
     const size_t s_kv_ntok       = sizeof(int);
-    const size_t s_kv            = ctx->
+    const size_t s_kv            = ctx->kv_self.buf.size;
 
     const size_t s_total = (
         + s_rng_size
@@ -3111,7 +3177,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
 
     // copy kv cache
     {
-        const auto & kv_self = ctx->
+        const auto & kv_self = ctx->kv_self;
         const auto & hparams = ctx->model.hparams;
         const int    n_layer = hparams.n_layer;
         const int    n_embd  = hparams.n_embd;
@@ -3215,7 +3281,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
 
     // set kv cache
     {
-        const auto & kv_self = ctx->
+        const auto & kv_self = ctx->kv_self;
         const auto & hparams = ctx->model.hparams;
         const int    n_layer = hparams.n_layer;
         const int    n_embd  = hparams.n_embd;
@@ -3259,7 +3325,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
            ggml_free(cpy_ctx);
        }
 
-        ctx->
+        ctx->kv_self.n = kv_ntok;
    }
 
     const size_t nread = inp - src;
@@ -3506,6 +3572,6 @@ const char * llama_print_system_info(void) {
 }
 
 // For internal test use
-std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
+const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
     return ctx->model.tensors_by_name;
 }