llama_cpp 0.0.3 → 0.0.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +36 -0
- data/README.md +5 -4
- data/ext/llama_cpp/extconf.rb +38 -0
- data/ext/llama_cpp/llama_cpp.cpp +118 -2
- data/ext/llama_cpp/src/ggml.c +1740 -658
- data/ext/llama_cpp/src/ggml.h +84 -16
- data/ext/llama_cpp/src/llama.cpp +1108 -756
- data/ext/llama_cpp/src/llama.h +37 -1
- data/ext/llama_cpp/src/llama_util.h +396 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +3 -3
- data/sig/llama_cpp.rbs +6 -0
- metadata +3 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -1,49 +1,33 @@
|
|
1
|
+
// Defines fileno on msys:
|
2
|
+
#ifndef _GNU_SOURCE
|
3
|
+
#define _GNU_SOURCE
|
4
|
+
#include <cstdint>
|
5
|
+
#include <cstdio>
|
6
|
+
#endif
|
7
|
+
|
8
|
+
#include "llama_util.h"
|
1
9
|
#include "llama.h"
|
2
10
|
|
3
11
|
#include "ggml.h"
|
4
12
|
|
13
|
+
#include <array>
|
14
|
+
#include <ctime>
|
5
15
|
#include <cinttypes>
|
6
16
|
#include <fstream>
|
7
17
|
#include <random>
|
8
18
|
#include <map>
|
9
19
|
#include <unordered_map>
|
10
20
|
#include <queue>
|
11
|
-
#include <regex>
|
12
21
|
#include <cassert>
|
13
22
|
#include <cstring>
|
14
|
-
|
15
|
-
#
|
16
|
-
#
|
17
|
-
#include <
|
18
|
-
#else
|
19
|
-
#include <sys/types.h>
|
20
|
-
#include <sys/mman.h>
|
21
|
-
#include <unistd.h>
|
22
|
-
#include <fcntl.h>
|
23
|
-
#endif
|
24
|
-
|
25
|
-
#define Min(X, Y) ((Y) > (X) ? (X) : (Y))
|
26
|
-
#define Max(X, Y) ((Y) < (X) ? (X) : (Y))
|
23
|
+
#include <climits>
|
24
|
+
#include <memory>
|
25
|
+
#include <algorithm>
|
26
|
+
#include <initializer_list>
|
27
27
|
|
28
28
|
#define LLAMA_USE_SCRATCH
|
29
29
|
#define LLAMA_MAX_SCRATCH_BUFFERS 16
|
30
30
|
|
31
|
-
#define LLAMA_ASSERT(x) \
|
32
|
-
do { \
|
33
|
-
if (!(x)) { \
|
34
|
-
fprintf(stderr, "LLAMA_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
|
35
|
-
abort(); \
|
36
|
-
} \
|
37
|
-
} while (0)
|
38
|
-
|
39
|
-
|
40
|
-
// determine number of model parts based on the dimension
|
41
|
-
static const std::unordered_map<int, int> LLAMA_N_PARTS = {
|
42
|
-
{ 4096, 1 },
|
43
|
-
{ 5120, 2 },
|
44
|
-
{ 6656, 4 },
|
45
|
-
{ 8192, 8 },
|
46
|
-
};
|
47
31
|
|
48
32
|
// available llama models
|
49
33
|
enum e_model {
|
@@ -60,47 +44,67 @@ static const size_t MB = 1024*1024;
|
|
60
44
|
// TODO: dynamically determine these sizes
|
61
45
|
// needs modifications in ggml
|
62
46
|
|
63
|
-
static const std::map<e_model, size_t> MEM_REQ_SCRATCH0
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
}
|
47
|
+
static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
|
48
|
+
{
|
49
|
+
static std::map<e_model, size_t> _MEM_REQ_SCRATCH0 = {
|
50
|
+
{ MODEL_7B, 512ull * MB },
|
51
|
+
{ MODEL_13B, 512ull * MB },
|
52
|
+
{ MODEL_30B, 512ull * MB },
|
53
|
+
{ MODEL_65B, 512ull * MB },
|
54
|
+
};
|
55
|
+
return _MEM_REQ_SCRATCH0;
|
56
|
+
}
|
69
57
|
|
70
|
-
static const std::map<e_model, size_t> MEM_REQ_SCRATCH1
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
58
|
+
static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
|
59
|
+
{
|
60
|
+
static std::map<e_model, size_t> _MEM_REQ_SCRATCH1 = {
|
61
|
+
{ MODEL_7B, 512ull * MB },
|
62
|
+
{ MODEL_13B, 512ull * MB },
|
63
|
+
{ MODEL_30B, 512ull * MB },
|
64
|
+
{ MODEL_65B, 512ull * MB },
|
65
|
+
};
|
66
|
+
return _MEM_REQ_SCRATCH1;
|
75
67
|
};
|
76
68
|
|
77
69
|
// 2*n_embd*n_ctx*n_layer*sizeof(float16)
|
78
|
-
static const std::map<e_model, size_t> MEM_REQ_KV_SELF
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
70
|
+
static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
|
71
|
+
{
|
72
|
+
static std::map<e_model, size_t> _MEM_REQ_KV_SELF = {
|
73
|
+
{ MODEL_7B, 1026ull * MB },
|
74
|
+
{ MODEL_13B, 1608ull * MB },
|
75
|
+
{ MODEL_30B, 3124ull * MB },
|
76
|
+
{ MODEL_65B, 5120ull * MB },
|
77
|
+
};
|
78
|
+
return _MEM_REQ_KV_SELF;
|
83
79
|
};
|
84
80
|
|
85
81
|
// this is mostly needed for temporary mul_mat buffers to dequantize the data
|
86
82
|
// not actually needed if BLAS is disabled
|
87
|
-
static const std::map<e_model, size_t> MEM_REQ_EVAL
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
83
|
+
static const std::map<e_model, size_t> & MEM_REQ_EVAL()
|
84
|
+
{
|
85
|
+
static std::map<e_model, size_t> _MEM_REQ_EVAL = {
|
86
|
+
{ MODEL_7B, 768ull * MB },
|
87
|
+
{ MODEL_13B, 1024ull * MB },
|
88
|
+
{ MODEL_30B, 1280ull * MB },
|
89
|
+
{ MODEL_65B, 1536ull * MB },
|
90
|
+
};
|
91
|
+
return _MEM_REQ_EVAL;
|
92
92
|
};
|
93
93
|
|
94
94
|
// default hparams (LLaMA 7B)
|
95
95
|
struct llama_hparams {
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
96
|
+
uint32_t n_vocab = 32000;
|
97
|
+
uint32_t n_ctx = 512; // this is provided as user input?
|
98
|
+
uint32_t n_embd = 4096;
|
99
|
+
uint32_t n_mult = 256;
|
100
|
+
uint32_t n_head = 32;
|
101
|
+
uint32_t n_layer = 32;
|
102
|
+
uint32_t n_rot = 64;
|
103
|
+
enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
|
104
|
+
|
105
|
+
bool operator!=(const llama_hparams & other) const {
|
106
|
+
return memcmp(this, &other, sizeof(llama_hparams));
|
107
|
+
}
|
104
108
|
};
|
105
109
|
|
106
110
|
struct llama_layer {
|
@@ -126,11 +130,17 @@ struct llama_kv_cache {
|
|
126
130
|
struct ggml_tensor * k;
|
127
131
|
struct ggml_tensor * v;
|
128
132
|
|
129
|
-
struct ggml_context * ctx;
|
133
|
+
struct ggml_context * ctx = NULL;
|
130
134
|
|
131
|
-
|
135
|
+
llama_buffer buf;
|
132
136
|
|
133
137
|
int n; // number of tokens currently in the cache
|
138
|
+
|
139
|
+
~llama_kv_cache() {
|
140
|
+
if (ctx) {
|
141
|
+
ggml_free(ctx);
|
142
|
+
}
|
143
|
+
}
|
134
144
|
};
|
135
145
|
|
136
146
|
struct llama_model {
|
@@ -146,22 +156,30 @@ struct llama_model {
|
|
146
156
|
std::vector<llama_layer> layers;
|
147
157
|
|
148
158
|
// context
|
149
|
-
struct ggml_context * ctx;
|
159
|
+
struct ggml_context * ctx = NULL;
|
150
160
|
|
151
161
|
// key + value cache for the self attention
|
152
162
|
// TODO: move to llama_state
|
153
163
|
struct llama_kv_cache kv_self;
|
154
164
|
|
155
165
|
// the model memory buffer
|
156
|
-
|
166
|
+
llama_buffer buf;
|
157
167
|
|
158
168
|
// model memory mapped file
|
159
|
-
|
160
|
-
|
169
|
+
std::unique_ptr<llama_mmap> mapping;
|
170
|
+
|
171
|
+
// objects representing data potentially being locked in memory
|
172
|
+
llama_mlock mlock_buf;
|
173
|
+
llama_mlock mlock_mmap;
|
174
|
+
|
175
|
+
// for quantize-stats only
|
176
|
+
std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
|
161
177
|
|
162
|
-
|
163
|
-
|
164
|
-
|
178
|
+
~llama_model() {
|
179
|
+
if (ctx) {
|
180
|
+
ggml_free(ctx);
|
181
|
+
}
|
182
|
+
}
|
165
183
|
};
|
166
184
|
|
167
185
|
struct llama_vocab {
|
@@ -206,8 +224,8 @@ struct llama_context {
|
|
206
224
|
|
207
225
|
// memory buffers used to evaluate the model
|
208
226
|
// TODO: move in llama_state
|
209
|
-
|
210
|
-
|
227
|
+
llama_buffer buf_compute;
|
228
|
+
llama_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
|
211
229
|
|
212
230
|
int buf_last = 0;
|
213
231
|
size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
|
@@ -220,11 +238,11 @@ struct llama_context {
|
|
220
238
|
last_size = ggml_set_scratch(ctx, { 0, 0, nullptr, });
|
221
239
|
} else {
|
222
240
|
auto & buf = buf_scratch[i];
|
223
|
-
last_size = ggml_set_scratch(ctx, { 0, buf.size
|
241
|
+
last_size = ggml_set_scratch(ctx, { 0, buf.size, buf.addr, });
|
224
242
|
}
|
225
243
|
|
226
244
|
if (buf_last >= 0) {
|
227
|
-
buf_max_size[buf_last] =
|
245
|
+
buf_max_size[buf_last] = std::max(buf_max_size[buf_last], last_size);
|
228
246
|
}
|
229
247
|
|
230
248
|
buf_last = i;
|
@@ -244,6 +262,500 @@ struct llama_context {
|
|
244
262
|
}
|
245
263
|
};
|
246
264
|
|
265
|
+
template <typename T>
|
266
|
+
static T checked_mul(T a, T b) {
|
267
|
+
T ret = a * b;
|
268
|
+
if (a != 0 && ret / a != b) {
|
269
|
+
throw format("overflow multiplying %llu * %llu",
|
270
|
+
(unsigned long long) a, (unsigned long long) b);
|
271
|
+
}
|
272
|
+
return ret;
|
273
|
+
}
|
274
|
+
|
275
|
+
static size_t checked_div(size_t a, size_t b) {
|
276
|
+
if (b == 0 || a % b != 0) {
|
277
|
+
throw format("error dividing %zu / %zu", a, b);
|
278
|
+
}
|
279
|
+
return a / b;
|
280
|
+
}
|
281
|
+
|
282
|
+
static std::string llama_format_tensor_shape(const std::vector<uint32_t> & ne) {
|
283
|
+
char buf[256];
|
284
|
+
snprintf(buf, sizeof(buf), "%5u", ne.at(0));
|
285
|
+
for (size_t i = 1; i < ne.size(); i++) {
|
286
|
+
snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), " x %5u", ne.at(i));
|
287
|
+
}
|
288
|
+
return buf;
|
289
|
+
}
|
290
|
+
|
291
|
+
static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml_type type) {
|
292
|
+
size_t size = ggml_type_size(type);
|
293
|
+
for (uint32_t dim : ne) {
|
294
|
+
size = checked_mul<size_t>(size, dim);
|
295
|
+
}
|
296
|
+
return size / ggml_blck_size(type);
|
297
|
+
}
|
298
|
+
|
299
|
+
struct llama_load_tensor_shard {
|
300
|
+
std::vector<uint32_t> ne;
|
301
|
+
size_t size;
|
302
|
+
enum ggml_type type;
|
303
|
+
size_t file_idx;
|
304
|
+
size_t file_off;
|
305
|
+
|
306
|
+
void calc_size() {
|
307
|
+
size = llama_calc_tensor_size(ne, type);
|
308
|
+
}
|
309
|
+
};
|
310
|
+
|
311
|
+
enum llama_split_type {
|
312
|
+
SPLIT_NONE,
|
313
|
+
SPLIT_BY_COLUMNS,
|
314
|
+
SPLIT_BY_ROWS
|
315
|
+
};
|
316
|
+
|
317
|
+
struct llama_load_tensor {
|
318
|
+
std::vector<llama_load_tensor_shard> shards;
|
319
|
+
|
320
|
+
std::string name;
|
321
|
+
enum ggml_type type = GGML_TYPE_F32;
|
322
|
+
llama_split_type split_type = SPLIT_NONE;
|
323
|
+
std::vector<uint32_t> ne;
|
324
|
+
size_t size;
|
325
|
+
struct ggml_tensor * ggml_tensor = NULL;
|
326
|
+
uint8_t * data;
|
327
|
+
|
328
|
+
llama_load_tensor(const std::string & name) : name(name) {}
|
329
|
+
|
330
|
+
void calc_all() {
|
331
|
+
calc_type();
|
332
|
+
calc_split_type();
|
333
|
+
calc_ne();
|
334
|
+
calc_size();
|
335
|
+
}
|
336
|
+
|
337
|
+
void calc_type() {
|
338
|
+
const auto & first_shard = shards.at(0);
|
339
|
+
for (const auto & shard : shards) {
|
340
|
+
if (shard.type != first_shard.type) {
|
341
|
+
throw format("inconsistent tensor shard type in '%s'", name.c_str());
|
342
|
+
}
|
343
|
+
}
|
344
|
+
type = first_shard.type;
|
345
|
+
}
|
346
|
+
|
347
|
+
void calc_split_type() {
|
348
|
+
if (shards.at(0).ne.size() == 1 || // 1D tensors are just duplicated in every file
|
349
|
+
shards.size() == 1) { // only one file?
|
350
|
+
split_type = SPLIT_NONE;
|
351
|
+
} else if (name.find("tok_embeddings.") == 0 ||
|
352
|
+
name.find(".attention.wo.weight") != std::string::npos ||
|
353
|
+
name.find(".feed_forward.w2.weight") != std::string::npos) {
|
354
|
+
split_type = SPLIT_BY_COLUMNS;
|
355
|
+
} else {
|
356
|
+
split_type = SPLIT_BY_ROWS;
|
357
|
+
}
|
358
|
+
}
|
359
|
+
|
360
|
+
void calc_ne() {
|
361
|
+
const auto & first_shard = shards.at(0);
|
362
|
+
for (const auto & shard : shards) {
|
363
|
+
if (shard.ne != first_shard.ne) {
|
364
|
+
throw format("inconsistent tensor shard shape in '%s': first was %s, other was %s",
|
365
|
+
name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str());
|
366
|
+
}
|
367
|
+
}
|
368
|
+
ne = first_shard.ne;
|
369
|
+
LLAMA_ASSERT(shards.size() <= UINT32_MAX);
|
370
|
+
uint32_t n_shards = (uint32_t) shards.size();
|
371
|
+
switch (split_type) {
|
372
|
+
case SPLIT_NONE:
|
373
|
+
ne = first_shard.ne;
|
374
|
+
break;
|
375
|
+
case SPLIT_BY_COLUMNS:
|
376
|
+
ne = {checked_mul<uint32_t>(first_shard.ne[0], n_shards),
|
377
|
+
first_shard.ne[1]};
|
378
|
+
break;
|
379
|
+
case SPLIT_BY_ROWS:
|
380
|
+
ne = {first_shard.ne[0],
|
381
|
+
checked_mul<uint32_t>(first_shard.ne[1], n_shards)};
|
382
|
+
break;
|
383
|
+
}
|
384
|
+
}
|
385
|
+
|
386
|
+
void calc_size() {
|
387
|
+
size = llama_calc_tensor_size(ne, type);
|
388
|
+
}
|
389
|
+
};
|
390
|
+
|
391
|
+
struct llama_load_tensors_map {
|
392
|
+
// tensors is kept in a separate vector to preserve file order
|
393
|
+
std::vector<llama_load_tensor> tensors;
|
394
|
+
std::unordered_map<std::string, size_t> name_to_idx;
|
395
|
+
};
|
396
|
+
|
397
|
+
enum llama_file_version {
|
398
|
+
LLAMA_FILE_VERSION_GGML,
|
399
|
+
LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab
|
400
|
+
LLAMA_FILE_VERSION_GGJT_V1, // added padding
|
401
|
+
};
|
402
|
+
|
403
|
+
struct llama_file_loader {
|
404
|
+
llama_file file;
|
405
|
+
llama_file_version file_version;
|
406
|
+
llama_hparams hparams;
|
407
|
+
llama_vocab vocab;
|
408
|
+
|
409
|
+
llama_file_loader(const char * fname, size_t file_idx, llama_load_tensors_map & tensors_map)
|
410
|
+
: file(fname, "rb") {
|
411
|
+
fprintf(stderr, "llama.cpp: loading model from %s\n", fname);
|
412
|
+
read_magic();
|
413
|
+
read_hparams();
|
414
|
+
read_vocab();
|
415
|
+
read_tensor_metadata(file_idx, tensors_map);
|
416
|
+
}
|
417
|
+
void read_magic() {
|
418
|
+
uint32_t magic = file.read_u32();
|
419
|
+
uint32_t version = 0;
|
420
|
+
|
421
|
+
if (magic != 'ggml') {
|
422
|
+
version = file.read_u32();
|
423
|
+
}
|
424
|
+
|
425
|
+
if (magic == 'ggml' && version == 0) {
|
426
|
+
file_version = LLAMA_FILE_VERSION_GGML;
|
427
|
+
} else if (magic == 'ggmf' && version == 1) {
|
428
|
+
file_version = LLAMA_FILE_VERSION_GGMF_V1;
|
429
|
+
} else if (magic == 'ggjt' && version == 1) {
|
430
|
+
file_version = LLAMA_FILE_VERSION_GGJT_V1;
|
431
|
+
} else {
|
432
|
+
throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
|
433
|
+
magic, version);
|
434
|
+
}
|
435
|
+
}
|
436
|
+
void read_hparams() {
|
437
|
+
hparams.n_vocab = file.read_u32();
|
438
|
+
hparams.n_embd = file.read_u32();
|
439
|
+
hparams.n_mult = file.read_u32();
|
440
|
+
hparams.n_head = file.read_u32();
|
441
|
+
hparams.n_layer = file.read_u32();
|
442
|
+
hparams.n_rot = file.read_u32();
|
443
|
+
hparams.ftype = (enum llama_ftype) file.read_u32();
|
444
|
+
}
|
445
|
+
void read_vocab() {
|
446
|
+
vocab.id_to_token.resize(hparams.n_vocab);
|
447
|
+
|
448
|
+
for (uint32_t i = 0; i < hparams.n_vocab; i++) {
|
449
|
+
uint32_t len = file.read_u32();
|
450
|
+
std::string word = file.read_string(len);
|
451
|
+
|
452
|
+
float score = 0.0f;
|
453
|
+
if (file_version >= LLAMA_FILE_VERSION_GGMF_V1) {
|
454
|
+
file.read_raw(&score, sizeof(score));
|
455
|
+
}
|
456
|
+
|
457
|
+
vocab.token_to_id[word] = i;
|
458
|
+
|
459
|
+
auto & tok_score = vocab.id_to_token[i];
|
460
|
+
tok_score.tok = std::move(word);
|
461
|
+
tok_score.score = score;
|
462
|
+
}
|
463
|
+
}
|
464
|
+
void read_tensor_metadata(size_t file_idx, llama_load_tensors_map & tensors_map) {
|
465
|
+
while (file.tell() < file.size) {
|
466
|
+
llama_load_tensor_shard shard;
|
467
|
+
uint32_t n_dims = file.read_u32();
|
468
|
+
uint32_t name_len = file.read_u32();
|
469
|
+
shard.type = (enum ggml_type) file.read_u32();
|
470
|
+
shard.ne.resize(n_dims);
|
471
|
+
file.read_raw(shard.ne.data(), sizeof(shard.ne[0]) * n_dims);
|
472
|
+
std::string name = file.read_string(name_len);
|
473
|
+
if (n_dims < 1 || n_dims > 2) {
|
474
|
+
throw format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims);
|
475
|
+
}
|
476
|
+
switch (shard.type) {
|
477
|
+
case GGML_TYPE_F32:
|
478
|
+
case GGML_TYPE_F16:
|
479
|
+
case GGML_TYPE_Q4_0:
|
480
|
+
case GGML_TYPE_Q4_1:
|
481
|
+
break;
|
482
|
+
default: {
|
483
|
+
throw format("unrecognized tensor type %u\n", shard.type);
|
484
|
+
}
|
485
|
+
}
|
486
|
+
|
487
|
+
if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
|
488
|
+
// skip to the next multiple of 32 bytes
|
489
|
+
file.seek(-file.tell() & 31, SEEK_CUR);
|
490
|
+
}
|
491
|
+
shard.file_idx = file_idx;
|
492
|
+
shard.file_off = file.tell();
|
493
|
+
|
494
|
+
shard.calc_size();
|
495
|
+
file.seek(shard.size, SEEK_CUR);
|
496
|
+
|
497
|
+
auto it = tensors_map.name_to_idx.find(name);
|
498
|
+
size_t idx;
|
499
|
+
if (it != tensors_map.name_to_idx.end()) {
|
500
|
+
idx = it->second;
|
501
|
+
} else {
|
502
|
+
tensors_map.tensors.emplace_back(name);
|
503
|
+
idx = tensors_map.tensors.size() - 1;
|
504
|
+
tensors_map.name_to_idx.emplace(name, idx);
|
505
|
+
}
|
506
|
+
tensors_map.tensors.at(idx).shards.push_back(shard);
|
507
|
+
}
|
508
|
+
}
|
509
|
+
};
|
510
|
+
|
511
|
+
struct llama_file_saver {
|
512
|
+
llama_file file;
|
513
|
+
llama_file_loader * any_file_loader;
|
514
|
+
llama_file_saver(const char * fname, llama_file_loader * any_file_loader, enum llama_ftype new_ftype)
|
515
|
+
: file(fname, "wb"), any_file_loader(any_file_loader) {
|
516
|
+
fprintf(stderr, "llama.cpp: saving model to %s\n", fname);
|
517
|
+
write_magic();
|
518
|
+
write_hparams(new_ftype);
|
519
|
+
write_vocab();
|
520
|
+
}
|
521
|
+
void write_magic() {
|
522
|
+
file.write_u32('ggjt'); // magic
|
523
|
+
file.write_u32(1); // version
|
524
|
+
}
|
525
|
+
void write_hparams(enum llama_ftype new_ftype) {
|
526
|
+
const llama_hparams & hparams = any_file_loader->hparams;
|
527
|
+
file.write_u32(hparams.n_vocab);
|
528
|
+
file.write_u32(hparams.n_embd);
|
529
|
+
file.write_u32(hparams.n_mult);
|
530
|
+
file.write_u32(hparams.n_head);
|
531
|
+
file.write_u32(hparams.n_layer);
|
532
|
+
file.write_u32(hparams.n_rot);
|
533
|
+
file.write_u32(new_ftype);
|
534
|
+
}
|
535
|
+
void write_vocab() {
|
536
|
+
if (any_file_loader->file_version == LLAMA_FILE_VERSION_GGML) {
|
537
|
+
fprintf(stderr, "llama.cpp: WARNING: input is an old file that doesn't have scores; will add dummy scores\n");
|
538
|
+
}
|
539
|
+
uint32_t n_vocab = any_file_loader->hparams.n_vocab;
|
540
|
+
for (uint32_t i = 0; i < n_vocab; i++) {
|
541
|
+
const auto & token_score = any_file_loader->vocab.id_to_token.at(i);
|
542
|
+
file.write_u32((uint32_t) token_score.tok.size());
|
543
|
+
file.write_raw(token_score.tok.data(), token_score.tok.size());
|
544
|
+
file.write_raw(&token_score.score, sizeof(token_score.score));
|
545
|
+
}
|
546
|
+
}
|
547
|
+
void write_tensor(llama_load_tensor & tensor, enum ggml_type new_type, const void * new_data, size_t new_size) {
|
548
|
+
switch (new_type) {
|
549
|
+
case GGML_TYPE_F32:
|
550
|
+
case GGML_TYPE_F16:
|
551
|
+
case GGML_TYPE_Q4_0:
|
552
|
+
case GGML_TYPE_Q4_1:
|
553
|
+
break;
|
554
|
+
default: LLAMA_ASSERT(false);
|
555
|
+
}
|
556
|
+
file.write_u32((uint32_t) tensor.ne.size());
|
557
|
+
file.write_u32((uint32_t) tensor.name.size());
|
558
|
+
file.write_u32(new_type);
|
559
|
+
file.write_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * tensor.ne.size());
|
560
|
+
file.write_raw(tensor.name.data(), tensor.name.size());
|
561
|
+
file.seek(-file.tell() & 31, SEEK_CUR);
|
562
|
+
LLAMA_ASSERT(new_size == llama_calc_tensor_size(tensor.ne, new_type));
|
563
|
+
file.write_raw(new_data, new_size);
|
564
|
+
}
|
565
|
+
};
|
566
|
+
|
567
|
+
struct llama_model_loader {
|
568
|
+
std::vector<std::unique_ptr<llama_file_loader>> file_loaders;
|
569
|
+
llama_load_tensors_map tensors_map;
|
570
|
+
bool use_mmap;
|
571
|
+
size_t num_ggml_tensors_created = 0;
|
572
|
+
struct ggml_context * ggml_ctx = NULL;
|
573
|
+
std::unique_ptr<llama_mmap> mapping;
|
574
|
+
|
575
|
+
llama_model_loader(const std::string & fname_base, bool use_mmap, bool vocab_only) {
|
576
|
+
auto first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
|
577
|
+
file_loaders.emplace_back(first_file);
|
578
|
+
uint32_t n_parts = vocab_only ? 1 : guess_n_parts();
|
579
|
+
for (uint32_t i = 1; i < n_parts; i++) {
|
580
|
+
std::string fname = fname_base + "." + std::to_string(i);
|
581
|
+
auto ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
|
582
|
+
file_loaders.emplace_back(ith_file);
|
583
|
+
if (ith_file->hparams != first_file->hparams) {
|
584
|
+
throw format("llama.cpp: hparams inconsistent between files");
|
585
|
+
}
|
586
|
+
}
|
587
|
+
if (!llama_mmap::SUPPORTED) {
|
588
|
+
use_mmap = false;
|
589
|
+
}
|
590
|
+
if (use_mmap && alignment_prevents_mmap()) {
|
591
|
+
fprintf(stderr, "llama.cpp: can't use mmap because tensors are not aligned; convert to new format to avoid this\n");
|
592
|
+
use_mmap = false;
|
593
|
+
}
|
594
|
+
this->use_mmap = use_mmap;
|
595
|
+
for (llama_load_tensor & lt : tensors_map.tensors) {
|
596
|
+
lt.calc_all();
|
597
|
+
}
|
598
|
+
}
|
599
|
+
|
600
|
+
bool alignment_prevents_mmap() {
|
601
|
+
for (const llama_load_tensor & lt : tensors_map.tensors) {
|
602
|
+
for (const llama_load_tensor_shard & shard : lt.shards) {
|
603
|
+
if (shard.file_off & 3) {
|
604
|
+
return true;
|
605
|
+
}
|
606
|
+
}
|
607
|
+
}
|
608
|
+
return false;
|
609
|
+
}
|
610
|
+
|
611
|
+
uint32_t guess_n_parts() const {
|
612
|
+
auto it = tensors_map.name_to_idx.find("tok_embeddings.weight");
|
613
|
+
if (it == tensors_map.name_to_idx.end()) {
|
614
|
+
throw std::string("missing tok_embeddings.weight");
|
615
|
+
}
|
616
|
+
const llama_load_tensor & lt = tensors_map.tensors.at(it->second);
|
617
|
+
return file_loaders.at(0)->hparams.n_embd / lt.shards.at(0).ne.at(0);
|
618
|
+
}
|
619
|
+
|
620
|
+
void calc_sizes(size_t * ctx_size_p, size_t * mmapped_size_p) const {
|
621
|
+
*ctx_size_p = *mmapped_size_p = 0;
|
622
|
+
for (const llama_load_tensor & lt : tensors_map.tensors) {
|
623
|
+
*ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE;
|
624
|
+
*(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size;
|
625
|
+
}
|
626
|
+
}
|
627
|
+
|
628
|
+
struct ggml_tensor * get_tensor(const std::string & name, std::vector<uint32_t> ne) {
|
629
|
+
auto it = tensors_map.name_to_idx.find(name);
|
630
|
+
if (it == tensors_map.name_to_idx.end()) {
|
631
|
+
throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
|
632
|
+
}
|
633
|
+
llama_load_tensor & lt = tensors_map.tensors.at(it->second);
|
634
|
+
if (lt.ne != ne) {
|
635
|
+
throw format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
|
636
|
+
name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
|
637
|
+
}
|
638
|
+
|
639
|
+
return get_tensor_for(lt);
|
640
|
+
}
|
641
|
+
|
642
|
+
struct ggml_tensor * get_tensor_for(llama_load_tensor & lt) {
|
643
|
+
struct ggml_tensor * tensor;
|
644
|
+
if (lt.ne.size() == 2) {
|
645
|
+
tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
|
646
|
+
} else {
|
647
|
+
LLAMA_ASSERT(lt.ne.size() == 1);
|
648
|
+
tensor = ggml_new_tensor_1d(ggml_ctx, lt.type, lt.ne.at(0));
|
649
|
+
}
|
650
|
+
LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
|
651
|
+
lt.ggml_tensor = tensor;
|
652
|
+
num_ggml_tensors_created++;
|
653
|
+
return tensor;
|
654
|
+
}
|
655
|
+
|
656
|
+
void done_getting_tensors() {
|
657
|
+
if (num_ggml_tensors_created != tensors_map.tensors.size()) {
|
658
|
+
throw std::string("llama.cpp: file contained more tensors than expected");
|
659
|
+
}
|
660
|
+
}
|
661
|
+
|
662
|
+
void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
|
663
|
+
size_t data_size = 0;
|
664
|
+
for (const llama_load_tensor & lt : tensors_map.tensors) {
|
665
|
+
data_size += lt.size;
|
666
|
+
}
|
667
|
+
|
668
|
+
if (use_mmap) {
|
669
|
+
mapping.reset(new llama_mmap(&file_loaders.at(0)->file));
|
670
|
+
if (!lmlock) {
|
671
|
+
// Don't call the callback since the actual loading will be lazy
|
672
|
+
// and we can't measure it.
|
673
|
+
progress_callback = NULL;
|
674
|
+
}
|
675
|
+
if (lmlock) {
|
676
|
+
lmlock->init(mapping->addr);
|
677
|
+
}
|
678
|
+
}
|
679
|
+
|
680
|
+
size_t done_size = 0;
|
681
|
+
for (llama_load_tensor & lt : tensors_map.tensors) {
|
682
|
+
if (progress_callback) {
|
683
|
+
progress_callback((float) done_size / data_size, progress_callback_user_data);
|
684
|
+
}
|
685
|
+
LLAMA_ASSERT(lt.ggml_tensor); // unused tensors should have been caught by load_data already
|
686
|
+
lt.data = (uint8_t *) lt.ggml_tensor->data;
|
687
|
+
load_data_for(lt);
|
688
|
+
lt.ggml_tensor->data = lt.data;
|
689
|
+
done_size += lt.size;
|
690
|
+
if (use_mmap && lmlock) {
|
691
|
+
lmlock->grow_to(done_size);
|
692
|
+
}
|
693
|
+
}
|
694
|
+
if (progress_callback) {
|
695
|
+
progress_callback(1.0f, progress_callback_user_data);
|
696
|
+
}
|
697
|
+
}
|
698
|
+
|
699
|
+
void load_data_for(llama_load_tensor & lt) {
|
700
|
+
if (use_mmap) {
|
701
|
+
LLAMA_ASSERT(lt.shards.size() == 1);
|
702
|
+
lt.data = (uint8_t *) mapping->addr + lt.shards.at(0).file_off;
|
703
|
+
} else if (lt.split_type == SPLIT_NONE) {
|
704
|
+
llama_file & file = file_loaders.at(lt.shards.at(0).file_idx)->file;
|
705
|
+
file.seek(lt.shards.at(0).file_off, SEEK_SET);
|
706
|
+
file.read_raw(lt.data, lt.size);
|
707
|
+
} else if (lt.split_type == SPLIT_BY_ROWS) {
|
708
|
+
size_t offset = 0;
|
709
|
+
for (llama_load_tensor_shard & shard : lt.shards) {
|
710
|
+
llama_file & file = file_loaders.at(shard.file_idx)->file;
|
711
|
+
file.seek(shard.file_off, SEEK_SET);
|
712
|
+
file.read_raw(lt.data + offset, shard.size);
|
713
|
+
offset += shard.size;
|
714
|
+
}
|
715
|
+
LLAMA_ASSERT(offset == lt.size);
|
716
|
+
} else if (lt.split_type == SPLIT_BY_COLUMNS) {
|
717
|
+
// Let's load the data into temporary buffers to ensure the OS performs large loads.
|
718
|
+
std::vector<llama_buffer> tmp_bufs;
|
719
|
+
tmp_bufs.resize(lt.shards.size());
|
720
|
+
for (size_t i = 0; i < lt.shards.size(); i++) {
|
721
|
+
llama_load_tensor_shard & shard = lt.shards.at(i);
|
722
|
+
llama_file & file = file_loaders.at(shard.file_idx)->file;
|
723
|
+
file.seek(shard.file_off, SEEK_SET);
|
724
|
+
tmp_bufs.at(i).resize(shard.size);
|
725
|
+
file.read_raw(tmp_bufs.at(i).addr, shard.size);
|
726
|
+
}
|
727
|
+
// Then reshape.
|
728
|
+
size_t num_rows = lt.ne.at(1);
|
729
|
+
size_t per_shard_row_size = lt.shards.at(0).size / num_rows;
|
730
|
+
size_t out_offset = 0;
|
731
|
+
for (size_t row = 0; row < num_rows; row++) {
|
732
|
+
for (llama_buffer & tmp_buf : tmp_bufs) {
|
733
|
+
memcpy(lt.data + out_offset,
|
734
|
+
tmp_buf.addr + row * per_shard_row_size,
|
735
|
+
per_shard_row_size);
|
736
|
+
out_offset += per_shard_row_size;
|
737
|
+
}
|
738
|
+
}
|
739
|
+
LLAMA_ASSERT(out_offset == lt.size);
|
740
|
+
}
|
741
|
+
if (0) {
|
742
|
+
print_checksum(lt);
|
743
|
+
}
|
744
|
+
}
|
745
|
+
|
746
|
+
static void print_checksum(llama_load_tensor & lt) {
|
747
|
+
uint32_t sum = 0;
|
748
|
+
for (size_t i = 0; i < lt.size; i++) {
|
749
|
+
uint8_t byte = lt.data[i];
|
750
|
+
sum = byte + (sum << 6) + (sum << 16) - sum; // sdbm hash
|
751
|
+
}
|
752
|
+
fprintf(stderr, "%s checksum: %#08x (%s, size %zu)\n", lt.name.c_str(), sum,
|
753
|
+
llama_format_tensor_shape(lt.ne).c_str(), lt.size);
|
754
|
+
}
|
755
|
+
|
756
|
+
};
|
757
|
+
|
758
|
+
|
247
759
|
//
|
248
760
|
// kv cache
|
249
761
|
//
|
@@ -262,8 +774,8 @@ static bool kv_cache_init(
|
|
262
774
|
cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
|
263
775
|
|
264
776
|
struct ggml_init_params params;
|
265
|
-
params.mem_size = cache.buf.size
|
266
|
-
params.mem_buffer = cache.buf.
|
777
|
+
params.mem_size = cache.buf.size;
|
778
|
+
params.mem_buffer = cache.buf.addr;
|
267
779
|
params.no_alloc = false;
|
268
780
|
|
269
781
|
cache.ctx = ggml_init(params);
|
@@ -279,13 +791,6 @@ static bool kv_cache_init(
|
|
279
791
|
return true;
|
280
792
|
}
|
281
793
|
|
282
|
-
static void kv_cache_free(struct llama_kv_cache & cache) {
|
283
|
-
if (cache.ctx) {
|
284
|
-
ggml_free(cache.ctx);
|
285
|
-
cache.ctx = nullptr;
|
286
|
-
}
|
287
|
-
}
|
288
|
-
|
289
794
|
struct llama_context_params llama_context_default_params() {
|
290
795
|
struct llama_context_params result = {
|
291
796
|
/*.n_ctx =*/ 512,
|
@@ -294,6 +799,7 @@ struct llama_context_params llama_context_default_params() {
|
|
294
799
|
/*.f16_kv =*/ false,
|
295
800
|
/*.logits_all =*/ false,
|
296
801
|
/*.vocab_only =*/ false,
|
802
|
+
/*.use_mmap =*/ true,
|
297
803
|
/*.use_mlock =*/ false,
|
298
804
|
/*.embedding =*/ false,
|
299
805
|
/*.progress_callback =*/ nullptr,
|
@@ -303,243 +809,106 @@ struct llama_context_params llama_context_default_params() {
|
|
303
809
|
return result;
|
304
810
|
}
|
305
811
|
|
812
|
+
bool llama_mmap_supported() {
|
813
|
+
return llama_mmap::SUPPORTED;
|
814
|
+
}
|
815
|
+
|
816
|
+
bool llama_mlock_supported() {
|
817
|
+
return llama_mlock::SUPPORTED;
|
818
|
+
}
|
819
|
+
|
306
820
|
//
|
307
821
|
// model loading
|
308
822
|
//
|
309
823
|
|
310
|
-
static
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
FILE_ATTRIBUTE_NORMAL | FILE_ATTRIBUTE_NOT_CONTENT_INDEXED,
|
318
|
-
NULL);
|
319
|
-
if (hFile == INVALID_HANDLE_VALUE) return 0;
|
320
|
-
LARGE_INTEGER fileSize;
|
321
|
-
fileSize.QuadPart = -1;
|
322
|
-
GetFileSizeEx(hFile, &fileSize);
|
323
|
-
int64_t length = fileSize.QuadPart;
|
324
|
-
HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
|
325
|
-
CloseHandle(hFile);
|
326
|
-
if (!hMapping) return 0;
|
327
|
-
void *addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
|
328
|
-
CloseHandle(hMapping);
|
329
|
-
if (!addr) return 0;
|
330
|
-
#else
|
331
|
-
int fd = open(fname, O_RDONLY);
|
332
|
-
if (fd == -1) return 0;
|
333
|
-
int64_t length = lseek(fd, 0, SEEK_END);
|
334
|
-
void *addr = mmap(NULL, length, PROT_READ, MAP_SHARED, fd, 0);
|
335
|
-
close(fd);
|
336
|
-
if (addr == MAP_FAILED) return 0;
|
337
|
-
#endif
|
338
|
-
*mm_length = length;
|
339
|
-
return addr;
|
824
|
+
static const char *llama_file_version_name(llama_file_version version) {
|
825
|
+
switch (version) {
|
826
|
+
case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)";
|
827
|
+
case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
|
828
|
+
case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (latest)";
|
829
|
+
default: LLAMA_ASSERT(false);
|
830
|
+
}
|
340
831
|
}
|
341
832
|
|
342
|
-
static
|
343
|
-
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
|
833
|
+
static const char *llama_ftype_name(enum llama_ftype ftype) {
|
834
|
+
switch (ftype) {
|
835
|
+
case LLAMA_FTYPE_ALL_F32: return "all F32";
|
836
|
+
case LLAMA_FTYPE_MOSTLY_F16: return "mostly F16";
|
837
|
+
case LLAMA_FTYPE_MOSTLY_Q4_0: return "mostly Q4_0";
|
838
|
+
case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
|
839
|
+
case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
|
840
|
+
return "mostly Q4_1, some F16";
|
841
|
+
default: return "unknown, may not work";
|
842
|
+
}
|
348
843
|
}
|
349
844
|
|
350
|
-
static
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
path, got, want);
|
359
|
-
return false;
|
845
|
+
static const char *llama_model_type_name(e_model type) {
|
846
|
+
switch (type) {
|
847
|
+
case MODEL_7B: return "7B";
|
848
|
+
case MODEL_13B: return "13B";
|
849
|
+
case MODEL_30B: return "30B";
|
850
|
+
case MODEL_65B: return "65B";
|
851
|
+
default: LLAMA_ASSERT(false);
|
852
|
+
}
|
360
853
|
}
|
361
854
|
|
362
|
-
static
|
855
|
+
static void llama_model_load_internal(
|
363
856
|
const std::string & fname,
|
364
857
|
llama_context & lctx,
|
365
858
|
int n_ctx,
|
366
|
-
int n_parts,
|
367
859
|
ggml_type memory_type,
|
860
|
+
bool use_mmap,
|
861
|
+
bool use_mlock,
|
368
862
|
bool vocab_only,
|
369
863
|
llama_progress_callback progress_callback,
|
370
|
-
void *progress_callback_user_data) {
|
371
|
-
fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
|
864
|
+
void * progress_callback_user_data) {
|
372
865
|
|
373
866
|
lctx.t_start_us = ggml_time_us();
|
374
867
|
|
375
|
-
|
376
|
-
auto & vocab = lctx.vocab;
|
868
|
+
std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap, vocab_only));
|
377
869
|
|
378
|
-
|
379
|
-
|
380
|
-
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
std::vector<char> f_buf(1024*1024);
|
385
|
-
fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
|
386
|
-
|
387
|
-
fin.seekg(0, fin.end);
|
388
|
-
const size_t file_size = fin.tellg();
|
389
|
-
fin.seekg(0);
|
870
|
+
lctx.vocab = std::move(ml->file_loaders.at(0)->vocab);
|
871
|
+
auto & model = lctx.model;
|
872
|
+
model.hparams = ml->file_loaders.at(0)->hparams;
|
873
|
+
llama_file_version file_version = ml->file_loaders.at(0)->file_version;
|
874
|
+
auto & hparams = model.hparams;
|
875
|
+
uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
|
390
876
|
|
391
|
-
// verify magic
|
392
877
|
{
|
393
|
-
|
394
|
-
|
395
|
-
|
396
|
-
|
397
|
-
|
398
|
-
return false;
|
878
|
+
switch (hparams.n_layer) {
|
879
|
+
case 32: model.type = e_model::MODEL_7B; break;
|
880
|
+
case 40: model.type = e_model::MODEL_13B; break;
|
881
|
+
case 60: model.type = e_model::MODEL_30B; break;
|
882
|
+
case 80: model.type = e_model::MODEL_65B; break;
|
399
883
|
}
|
400
|
-
if (magic != LLAMA_FILE_MAGIC) {
|
401
|
-
return report_bad_magic(fname.c_str(), magic, LLAMA_FILE_MAGIC);
|
402
|
-
}
|
403
|
-
|
404
|
-
uint32_t format_version;
|
405
|
-
fin.read((char *) &format_version, sizeof(format_version));
|
406
|
-
|
407
|
-
if (format_version != LLAMA_FILE_VERSION) {
|
408
|
-
fprintf(stderr, "%s: invalid model file '%s' (unsupported format version %" PRIu32 ", expected %d)\n",
|
409
|
-
__func__, fname.c_str(), format_version, LLAMA_FILE_VERSION);
|
410
|
-
return false;
|
411
|
-
}
|
412
|
-
}
|
413
|
-
|
414
|
-
int n_ff = 0;
|
415
|
-
|
416
|
-
// load hparams
|
417
|
-
{
|
418
|
-
auto & hparams = model.hparams;
|
419
|
-
|
420
|
-
fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
|
421
|
-
//fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
|
422
|
-
fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
|
423
|
-
fin.read((char *) &hparams.n_mult, sizeof(hparams.n_mult));
|
424
|
-
fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
|
425
|
-
fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
|
426
|
-
fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
|
427
|
-
fin.read((char *) &hparams.f16, sizeof(hparams.f16));
|
428
884
|
|
429
885
|
hparams.n_ctx = n_ctx;
|
430
|
-
|
431
|
-
n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
|
432
|
-
|
433
|
-
if (n_parts < 1) {
|
434
|
-
n_parts = LLAMA_N_PARTS.at(hparams.n_embd);
|
435
|
-
}
|
436
|
-
|
437
|
-
// temp warning to tell the user to use "--n_parts"
|
438
|
-
if (hparams.f16 == 4 && n_parts != 1) {
|
439
|
-
fprintf(stderr, "%s: GPTQ model detected - are you sure n_parts should be %d? we normally expect it to be 1\n", __func__, n_parts);
|
440
|
-
fprintf(stderr, "%s: use '--n_parts 1' if necessary\n", __func__);
|
441
|
-
}
|
442
|
-
|
443
|
-
if (hparams.n_layer == 32) {
|
444
|
-
model.type = e_model::MODEL_7B;
|
445
|
-
}
|
446
|
-
|
447
|
-
if (hparams.n_layer == 40) {
|
448
|
-
model.type = e_model::MODEL_13B;
|
449
|
-
}
|
450
|
-
|
451
|
-
if (hparams.n_layer == 60) {
|
452
|
-
model.type = e_model::MODEL_30B;
|
453
|
-
}
|
454
|
-
|
455
|
-
if (hparams.n_layer == 80) {
|
456
|
-
model.type = e_model::MODEL_65B;
|
457
|
-
}
|
458
|
-
|
459
|
-
fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab);
|
460
|
-
fprintf(stderr, "%s: n_ctx = %d\n", __func__, hparams.n_ctx);
|
461
|
-
fprintf(stderr, "%s: n_embd = %d\n", __func__, hparams.n_embd);
|
462
|
-
fprintf(stderr, "%s: n_mult = %d\n", __func__, hparams.n_mult);
|
463
|
-
fprintf(stderr, "%s: n_head = %d\n", __func__, hparams.n_head);
|
464
|
-
fprintf(stderr, "%s: n_layer = %d\n", __func__, hparams.n_layer);
|
465
|
-
fprintf(stderr, "%s: n_rot = %d\n", __func__, hparams.n_rot);
|
466
|
-
fprintf(stderr, "%s: f16 = %d\n", __func__, hparams.f16);
|
467
|
-
fprintf(stderr, "%s: n_ff = %d\n", __func__, n_ff);
|
468
|
-
fprintf(stderr, "%s: n_parts = %d\n", __func__, n_parts);
|
469
|
-
fprintf(stderr, "%s: type = %d\n", __func__, model.type);
|
470
886
|
}
|
471
887
|
|
472
|
-
// load vocab
|
473
888
|
{
|
474
|
-
|
475
|
-
|
476
|
-
|
477
|
-
|
478
|
-
|
479
|
-
|
480
|
-
|
481
|
-
|
482
|
-
|
483
|
-
|
484
|
-
|
485
|
-
|
486
|
-
word.assign(tmp.data(), len);
|
487
|
-
} else {
|
488
|
-
word.clear();
|
489
|
-
}
|
490
|
-
|
491
|
-
float score;
|
492
|
-
fin.read((char *) &score, sizeof(score));
|
493
|
-
|
494
|
-
vocab.token_to_id[word] = i;
|
495
|
-
|
496
|
-
auto &tok_score = vocab.id_to_token[i];
|
497
|
-
tok_score.tok = word;
|
498
|
-
tok_score.score = score;
|
499
|
-
}
|
889
|
+
fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version));
|
890
|
+
fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab);
|
891
|
+
fprintf(stderr, "%s: n_ctx = %u\n", __func__, hparams.n_ctx);
|
892
|
+
fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd);
|
893
|
+
fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult);
|
894
|
+
fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head);
|
895
|
+
fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer);
|
896
|
+
fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot);
|
897
|
+
fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
|
898
|
+
fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
|
899
|
+
fprintf(stderr, "%s: n_parts = %zu\n", __func__, ml->file_loaders.size());
|
900
|
+
fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
|
500
901
|
}
|
501
902
|
|
502
903
|
if (vocab_only) {
|
503
|
-
return
|
504
|
-
}
|
505
|
-
|
506
|
-
// for the big tensors, we have the option to store the data in 16-bit floats or quantized
|
507
|
-
// in order to save memory and also to speed up the computation
|
508
|
-
// wtype is for per-layer weights, while vtype is for other weights
|
509
|
-
ggml_type wtype, vtype;
|
510
|
-
switch (model.hparams.f16) {
|
511
|
-
case 0: wtype = vtype = GGML_TYPE_F32; break;
|
512
|
-
case 1: wtype = vtype = GGML_TYPE_F16; break;
|
513
|
-
case 2: wtype = vtype = GGML_TYPE_Q4_0; break;
|
514
|
-
case 3: wtype = vtype = GGML_TYPE_Q4_1; break;
|
515
|
-
case 4: wtype = GGML_TYPE_Q4_1; vtype = GGML_TYPE_F16; break;
|
516
|
-
default:
|
517
|
-
{
|
518
|
-
fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n",
|
519
|
-
__func__, fname.c_str(), model.hparams.f16);
|
520
|
-
return false;
|
521
|
-
}
|
904
|
+
return;
|
522
905
|
}
|
523
906
|
|
524
|
-
// map model into memory
|
525
|
-
char *mm_addr = NULL;
|
526
|
-
model.mm_addr = mmap_file(fname.c_str(), &model.mm_length);
|
527
|
-
if (model.mm_addr == NULL) {
|
528
|
-
fprintf(stderr, "%s: failed to mmap '%s'\n", __func__, fname.c_str());
|
529
|
-
return false;
|
530
|
-
}
|
531
|
-
mm_addr = (char *)model.mm_addr;
|
532
|
-
fprintf(stderr, "%s: ggml map size = %6.2f MB\n", __func__, model.mm_length/(1024.0*1024.0));
|
533
|
-
|
534
907
|
auto & ctx = model.ctx;
|
535
908
|
|
536
|
-
size_t ctx_size
|
537
|
-
|
538
|
-
|
539
|
-
const int n_layer = hparams.n_layer;
|
540
|
-
ctx_size += (5 + 10*n_layer)*256; // object overhead
|
541
|
-
fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0);
|
542
|
-
}
|
909
|
+
size_t ctx_size, mmapped_size;
|
910
|
+
ml->calc_sizes(&ctx_size, &mmapped_size);
|
911
|
+
fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0);
|
543
912
|
|
544
913
|
// print memory requirements
|
545
914
|
{
|
@@ -548,14 +917,14 @@ static bool llama_model_load(
|
|
548
917
|
// this is the total memory required to run the inference
|
549
918
|
const size_t mem_required =
|
550
919
|
ctx_size +
|
551
|
-
|
552
|
-
MEM_REQ_SCRATCH0.at(model.type) +
|
553
|
-
MEM_REQ_SCRATCH1.at(model.type) +
|
554
|
-
MEM_REQ_EVAL.at
|
920
|
+
mmapped_size +
|
921
|
+
MEM_REQ_SCRATCH0().at(model.type) +
|
922
|
+
MEM_REQ_SCRATCH1().at(model.type) +
|
923
|
+
MEM_REQ_EVAL().at(model.type);
|
555
924
|
|
556
925
|
// this is the memory required by one llama_state
|
557
926
|
const size_t mem_required_state =
|
558
|
-
scale*MEM_REQ_KV_SELF.at(model.type);
|
927
|
+
scale*MEM_REQ_KV_SELF().at(model.type);
|
559
928
|
|
560
929
|
fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
|
561
930
|
mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
|
@@ -564,17 +933,20 @@ static bool llama_model_load(
|
|
564
933
|
// create the ggml context
|
565
934
|
{
|
566
935
|
lctx.model.buf.resize(ctx_size);
|
936
|
+
if (use_mlock) {
|
937
|
+
lctx.model.mlock_buf.init(lctx.model.buf.addr);
|
938
|
+
lctx.model.mlock_buf.grow_to(lctx.model.buf.size);
|
939
|
+
}
|
567
940
|
|
568
941
|
struct ggml_init_params params = {
|
569
|
-
/*.mem_size =*/ lctx.model.buf.size
|
570
|
-
/*.mem_buffer =*/ lctx.model.buf.
|
571
|
-
/*.no_alloc =*/
|
942
|
+
/*.mem_size =*/ lctx.model.buf.size,
|
943
|
+
/*.mem_buffer =*/ lctx.model.buf.addr,
|
944
|
+
/*.no_alloc =*/ ml->use_mmap,
|
572
945
|
};
|
573
946
|
|
574
947
|
model.ctx = ggml_init(params);
|
575
948
|
if (!model.ctx) {
|
576
|
-
|
577
|
-
return false;
|
949
|
+
throw format("ggml_init() failed");
|
578
950
|
}
|
579
951
|
}
|
580
952
|
|
@@ -582,161 +954,71 @@ static bool llama_model_load(
|
|
582
954
|
{
|
583
955
|
const auto & hparams = model.hparams;
|
584
956
|
|
585
|
-
const
|
586
|
-
const
|
587
|
-
const
|
588
|
-
|
589
|
-
model.layers.resize(n_layer);
|
590
|
-
|
591
|
-
model.tok_embeddings = ggml_new_tensor_2d(ctx, vtype, n_embd, n_vocab);
|
957
|
+
const uint32_t n_embd = hparams.n_embd;
|
958
|
+
const uint32_t n_layer = hparams.n_layer;
|
959
|
+
const uint32_t n_vocab = hparams.n_vocab;
|
592
960
|
|
593
|
-
|
594
|
-
model.output = ggml_new_tensor_2d(ctx, vtype, n_embd, n_vocab);
|
961
|
+
ml->ggml_ctx = ctx;
|
595
962
|
|
596
|
-
|
597
|
-
model.
|
963
|
+
model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab});
|
964
|
+
model.norm = ml->get_tensor("norm.weight", {n_embd});
|
965
|
+
model.output = ml->get_tensor("output.weight", {n_embd, n_vocab});
|
598
966
|
|
599
|
-
model.
|
600
|
-
|
601
|
-
|
602
|
-
for (int i = 0; i < n_layer; ++i) {
|
967
|
+
model.layers.resize(n_layer);
|
968
|
+
for (uint32_t i = 0; i < n_layer; ++i) {
|
603
969
|
auto & layer = model.layers[i];
|
604
970
|
|
605
|
-
|
606
|
-
|
607
|
-
layer.wq = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
|
608
|
-
layer.wk = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
|
609
|
-
layer.wv = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
|
610
|
-
layer.wo = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
|
611
|
-
|
612
|
-
layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
971
|
+
std::string layers_i = "layers." + std::to_string(i);
|
613
972
|
|
614
|
-
layer.
|
615
|
-
layer.w2 = ggml_new_tensor_2d(ctx, wtype, n_ff, n_embd);
|
616
|
-
layer.w3 = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ff);
|
973
|
+
layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd});
|
617
974
|
|
618
|
-
|
619
|
-
|
975
|
+
layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd});
|
976
|
+
layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd});
|
977
|
+
layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd});
|
978
|
+
layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd});
|
620
979
|
|
621
|
-
|
622
|
-
model.tensors["layers." + std::to_string(i) + ".attention.wk.weight"] = layer.wk;
|
623
|
-
model.tensors["layers." + std::to_string(i) + ".attention.wv.weight"] = layer.wv;
|
624
|
-
model.tensors["layers." + std::to_string(i) + ".attention.wo.weight"] = layer.wo;
|
980
|
+
layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd});
|
625
981
|
|
626
|
-
|
627
|
-
|
628
|
-
|
629
|
-
model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight"] = layer.w2;
|
630
|
-
model.tensors["layers." + std::to_string(i) + ".feed_forward.w3.weight"] = layer.w3;
|
982
|
+
layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff});
|
983
|
+
layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd});
|
984
|
+
layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff});
|
631
985
|
}
|
632
986
|
}
|
633
987
|
|
634
|
-
|
988
|
+
ml->done_getting_tensors();
|
635
989
|
|
636
|
-
|
637
|
-
|
990
|
+
// populate `tensors_by_name`
|
991
|
+
for (llama_load_tensor & lt : ml->tensors_map.tensors) {
|
992
|
+
model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
|
638
993
|
}
|
639
994
|
|
640
|
-
|
641
|
-
|
642
|
-
// load weights
|
643
|
-
{
|
644
|
-
size_t total_size = 0;
|
645
|
-
model.n_loaded = 0;
|
646
|
-
|
647
|
-
while (true) {
|
648
|
-
int32_t n_dims;
|
649
|
-
int32_t length;
|
650
|
-
int32_t ftype;
|
651
|
-
|
652
|
-
fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
|
653
|
-
fin.read(reinterpret_cast<char *>(&length), sizeof(length));
|
654
|
-
fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
|
655
|
-
|
656
|
-
if (fin.eof()) {
|
657
|
-
break;
|
658
|
-
}
|
659
|
-
|
660
|
-
int32_t nelements = 1;
|
661
|
-
int32_t ne[2] = { 1, 1 };
|
662
|
-
for (int i = 0; i < n_dims; ++i) {
|
663
|
-
fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
|
664
|
-
nelements *= ne[i];
|
665
|
-
}
|
666
|
-
|
667
|
-
std::string name(length, 0);
|
668
|
-
fin.read(&name[0], length);
|
669
|
-
|
670
|
-
if (model.tensors.find(name.data()) == model.tensors.end()) {
|
671
|
-
fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
|
672
|
-
return false;
|
673
|
-
}
|
674
|
-
|
675
|
-
auto tensor = model.tensors[name.data()];
|
676
|
-
|
677
|
-
if (ggml_nelements(tensor) != nelements) {
|
678
|
-
fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
|
679
|
-
return false;
|
680
|
-
}
|
681
|
-
if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
|
682
|
-
fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%" PRId64 ", %" PRId64 "], expected [%d, %d]\n",
|
683
|
-
__func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
|
684
|
-
return false;
|
685
|
-
}
|
686
|
-
if (0) {
|
687
|
-
static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
|
688
|
-
fprintf(stderr, "%24s - [%5d, %5d], type = %6s\n", name.data(), ne[0], ne[1], ftype_str[ftype]);
|
689
|
-
}
|
690
|
-
|
691
|
-
switch (ftype) {
|
692
|
-
case 0: // f32
|
693
|
-
case 1: // f16
|
694
|
-
break;
|
695
|
-
case 2: // q4_0
|
696
|
-
case 3: // q4_1
|
697
|
-
assert(ne[0] % 64 == 0);
|
698
|
-
break;
|
699
|
-
default:
|
700
|
-
fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
|
701
|
-
return false;
|
702
|
-
};
|
703
|
-
|
704
|
-
// load the tensor data into memory without copying or reading it
|
705
|
-
size_t offset = fin.tellg();
|
706
|
-
size_t tensor_data_size = ggml_nbytes(tensor);
|
707
|
-
offset = (offset + 31) & -32;
|
708
|
-
tensor->data = mm_addr + offset;
|
709
|
-
fin.seekg(offset + tensor_data_size);
|
710
|
-
total_size += tensor_data_size;
|
711
|
-
model.n_loaded++;
|
712
|
-
|
713
|
-
// progress
|
714
|
-
if (progress_callback) {
|
715
|
-
double current_progress = size_t(fin.tellg()) / double(file_size);
|
716
|
-
progress_callback(current_progress, progress_callback_user_data);
|
717
|
-
}
|
718
|
-
}
|
719
|
-
|
720
|
-
fin.close();
|
995
|
+
ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
|
721
996
|
|
722
|
-
|
723
|
-
if (model.n_loaded == 0) {
|
724
|
-
fprintf(stderr, "%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__);
|
725
|
-
} else if (model.n_loaded != (int) model.tensors.size()) {
|
726
|
-
fprintf(stderr, "%s: ERROR not all tensors loaded from model file - expected %zu, got %d\n", __func__, model.tensors.size(), model.n_loaded);
|
727
|
-
return false;
|
728
|
-
}
|
729
|
-
}
|
997
|
+
model.mapping = std::move(ml->mapping);
|
730
998
|
|
731
999
|
// loading time will be recalculated after the first eval, so
|
732
1000
|
// we take page faults deferred by mmap() into consideration
|
733
1001
|
lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
|
1002
|
+
}
|
734
1003
|
|
735
|
-
|
736
|
-
|
1004
|
+
static bool llama_model_load(
|
1005
|
+
const std::string & fname,
|
1006
|
+
llama_context & lctx,
|
1007
|
+
int n_ctx,
|
1008
|
+
ggml_type memory_type,
|
1009
|
+
bool use_mmap,
|
1010
|
+
bool use_mlock,
|
1011
|
+
bool vocab_only,
|
1012
|
+
llama_progress_callback progress_callback,
|
1013
|
+
void *progress_callback_user_data) {
|
1014
|
+
try {
|
1015
|
+
llama_model_load_internal(fname, lctx, n_ctx, memory_type, use_mmap, use_mlock,
|
1016
|
+
vocab_only, progress_callback, progress_callback_user_data);
|
1017
|
+
return true;
|
1018
|
+
} catch (const std::string & err) {
|
1019
|
+
fprintf(stderr, "error loading model: %s\n", err.c_str());
|
1020
|
+
return false;
|
737
1021
|
}
|
738
|
-
|
739
|
-
return true;
|
740
1022
|
}
|
741
1023
|
|
742
1024
|
// evaluate the transformer
|
@@ -774,8 +1056,8 @@ static bool llama_eval_internal(
|
|
774
1056
|
auto & buf_compute = lctx.buf_compute;
|
775
1057
|
|
776
1058
|
struct ggml_init_params params = {
|
777
|
-
/*.mem_size =*/ buf_compute.size
|
778
|
-
/*.mem_buffer =*/ buf_compute.
|
1059
|
+
/*.mem_size =*/ buf_compute.size,
|
1060
|
+
/*.mem_buffer =*/ buf_compute.addr,
|
779
1061
|
/*.no_alloc =*/ false,
|
780
1062
|
};
|
781
1063
|
|
@@ -1061,7 +1343,7 @@ struct llama_tokenizer {
|
|
1061
1343
|
size_t offs = 0;
|
1062
1344
|
while (offs < text.size()) {
|
1063
1345
|
llama_sp_symbol sym;
|
1064
|
-
size_t char_len =
|
1346
|
+
size_t char_len = std::min(text.size() - offs, utf8_len(text[offs]));
|
1065
1347
|
sym.text = text.c_str() + offs;
|
1066
1348
|
sym.n = char_len;
|
1067
1349
|
offs += char_len;
|
@@ -1236,7 +1518,7 @@ static llama_vocab::id llama_sample_top_p_top_k(
|
|
1236
1518
|
}
|
1237
1519
|
}
|
1238
1520
|
|
1239
|
-
sample_top_k(logits_id, top_k > 0 ?
|
1521
|
+
sample_top_k(logits_id, top_k > 0 ? std::min(top_k, n_logits) : n_logits);
|
1240
1522
|
|
1241
1523
|
// compute probs for the top k tokens
|
1242
1524
|
std::vector<float> probs;
|
@@ -1284,298 +1566,118 @@ static llama_vocab::id llama_sample_top_p_top_k(
|
|
1284
1566
|
// quantization
|
1285
1567
|
//
|
1286
1568
|
|
1287
|
-
|
1288
|
-
|
1289
|
-
|
1290
|
-
|
1291
|
-
|
1292
|
-
|
1293
|
-
case 3: type = GGML_TYPE_Q4_1; break;
|
1294
|
-
default: fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype); return 1;
|
1569
|
+
static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype) {
|
1570
|
+
ggml_type quantized_type;
|
1571
|
+
switch (ftype) {
|
1572
|
+
case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
|
1573
|
+
case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
|
1574
|
+
default: throw format("invalid output file type %d\n", ftype);
|
1295
1575
|
};
|
1296
1576
|
|
1297
|
-
|
1298
|
-
|
1299
|
-
|
1300
|
-
|
1301
|
-
|
1302
|
-
|
1303
|
-
|
1304
|
-
|
1305
|
-
|
1306
|
-
|
1307
|
-
|
1308
|
-
|
1309
|
-
|
1310
|
-
|
1311
|
-
|
1312
|
-
|
1313
|
-
|
1314
|
-
|
1315
|
-
|
1316
|
-
|
1317
|
-
|
1318
|
-
|
1319
|
-
|
1320
|
-
|
1321
|
-
|
1322
|
-
|
1323
|
-
|
1324
|
-
|
1325
|
-
|
1326
|
-
|
1327
|
-
|
1328
|
-
|
1329
|
-
|
1330
|
-
|
1331
|
-
|
1332
|
-
|
1333
|
-
|
1334
|
-
|
1335
|
-
|
1336
|
-
|
1337
|
-
|
1338
|
-
|
1339
|
-
|
1340
|
-
|
1341
|
-
|
1342
|
-
|
1343
|
-
|
1344
|
-
|
1345
|
-
|
1346
|
-
|
1347
|
-
// load hparams
|
1348
|
-
{
|
1349
|
-
finp.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
|
1350
|
-
//finp.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
|
1351
|
-
finp.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
|
1352
|
-
finp.read((char *) &hparams.n_mult, sizeof(hparams.n_mult));
|
1353
|
-
finp.read((char *) &hparams.n_head, sizeof(hparams.n_head));
|
1354
|
-
finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
|
1355
|
-
finp.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
|
1356
|
-
finp.read((char *) &hparams.f16, sizeof(hparams.f16));
|
1357
|
-
|
1358
|
-
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
|
1359
|
-
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
|
1360
|
-
printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
|
1361
|
-
printf("%s: n_mult = %d\n", __func__, hparams.n_mult);
|
1362
|
-
printf("%s: n_head = %d\n", __func__, hparams.n_head);
|
1363
|
-
printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
|
1364
|
-
printf("%s: f16 = %d\n", __func__, hparams.f16);
|
1365
|
-
|
1366
|
-
fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
|
1367
|
-
//fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
|
1368
|
-
fout.write((char *) &hparams.n_embd, sizeof(hparams.n_embd));
|
1369
|
-
fout.write((char *) &hparams.n_mult, sizeof(hparams.n_mult));
|
1370
|
-
fout.write((char *) &hparams.n_head, sizeof(hparams.n_head));
|
1371
|
-
fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
|
1372
|
-
fout.write((char *) &hparams.n_rot, sizeof(hparams.n_rot));
|
1373
|
-
fout.write((char *) &itype, sizeof(hparams.f16));
|
1374
|
-
}
|
1375
|
-
|
1376
|
-
// load vocab
|
1377
|
-
{
|
1378
|
-
const int32_t n_vocab = hparams.n_vocab;
|
1379
|
-
|
1380
|
-
if (n_vocab != hparams.n_vocab) {
|
1381
|
-
fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
|
1382
|
-
__func__, fname_inp.c_str(), n_vocab, hparams.n_vocab);
|
1383
|
-
return false;
|
1384
|
-
}
|
1385
|
-
|
1386
|
-
std::vector<char> word(32);
|
1387
|
-
vocab.id_to_token.resize(n_vocab);
|
1388
|
-
for (int i = 0; i < n_vocab; i++) {
|
1389
|
-
uint32_t len;
|
1390
|
-
finp.read ((char *) &len, sizeof(len));
|
1391
|
-
fout.write((char *) &len, sizeof(len));
|
1392
|
-
|
1393
|
-
word.resize(len);
|
1394
|
-
finp.read ((char *) &word[0], len);
|
1395
|
-
fout.write((char *) &word[0], len);
|
1396
|
-
|
1397
|
-
float score;
|
1398
|
-
finp.read ((char *) &score, sizeof(score));
|
1399
|
-
fout.write((char *) &score, sizeof(score));
|
1400
|
-
|
1401
|
-
vocab.token_to_id[word.data()] = i;
|
1402
|
-
|
1403
|
-
auto &tok_score = vocab.id_to_token[i];
|
1404
|
-
tok_score.tok = word.data();
|
1405
|
-
tok_score.score = score;
|
1406
|
-
}
|
1407
|
-
}
|
1408
|
-
|
1409
|
-
// load weights
|
1410
|
-
{
|
1411
|
-
size_t total_size_org = 0;
|
1412
|
-
size_t total_size_new = 0;
|
1413
|
-
|
1414
|
-
std::vector<float> work;
|
1415
|
-
|
1416
|
-
std::vector<uint8_t> data_u8;
|
1417
|
-
std::vector<ggml_fp16_t> data_f16;
|
1418
|
-
std::vector<float> data_f32;
|
1419
|
-
|
1420
|
-
std::vector<int64_t> hist_all(1 << 4, 0);
|
1421
|
-
|
1422
|
-
while (true) {
|
1423
|
-
int32_t n_dims;
|
1424
|
-
int32_t length;
|
1425
|
-
int32_t ftype;
|
1426
|
-
|
1427
|
-
finp.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
|
1428
|
-
finp.read(reinterpret_cast<char *>(&length), sizeof(length));
|
1429
|
-
finp.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
|
1430
|
-
|
1431
|
-
if (finp.eof()) {
|
1432
|
-
break;
|
1433
|
-
}
|
1434
|
-
|
1435
|
-
int32_t nelements = 1;
|
1436
|
-
int32_t ne[2] = { 1, 1 };
|
1437
|
-
for (int i = 0; i < n_dims; ++i) {
|
1438
|
-
finp.read (reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
|
1439
|
-
nelements *= ne[i];
|
1440
|
-
}
|
1441
|
-
|
1442
|
-
std::string name(length, 0);
|
1443
|
-
finp.read (&name[0], length);
|
1444
|
-
|
1445
|
-
{
|
1446
|
-
// ensure tensor data is aligned
|
1447
|
-
uint64_t offset = finp.tellg();
|
1448
|
-
offset = (offset + 31) & -32;
|
1449
|
-
finp.seekg(offset);
|
1450
|
-
}
|
1451
|
-
|
1452
|
-
{
|
1453
|
-
static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
|
1454
|
-
printf("%48s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ftype_str[ftype]);
|
1455
|
-
}
|
1456
|
-
|
1457
|
-
// regexes of tensor names to be quantized
|
1458
|
-
const std::vector<std::string> k_names = {
|
1459
|
-
".*weight",
|
1460
|
-
};
|
1461
|
-
|
1462
|
-
bool quantize = false;
|
1463
|
-
for (const auto & s : k_names) {
|
1464
|
-
if (std::regex_match(name, std::regex(s))) {
|
1465
|
-
quantize = true;
|
1466
|
-
break;
|
1467
|
-
}
|
1468
|
-
}
|
1469
|
-
|
1470
|
-
// quantize only 2D tensors
|
1471
|
-
quantize &= (n_dims == 2);
|
1472
|
-
|
1473
|
-
if (quantize) {
|
1474
|
-
if (ftype != 0 && ftype != 1) {
|
1475
|
-
fprintf(stderr, "%s: unsupported ftype %d for integer quantization\n", __func__, ftype);
|
1476
|
-
return false;
|
1477
|
-
}
|
1478
|
-
|
1479
|
-
if (ftype == 1) {
|
1480
|
-
data_f16.resize(nelements);
|
1481
|
-
finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
|
1482
|
-
data_f32.resize(nelements);
|
1483
|
-
for (int i = 0; i < nelements; ++i) {
|
1484
|
-
data_f32[i] = ggml_fp16_to_fp32(data_f16[i]);
|
1485
|
-
}
|
1486
|
-
} else {
|
1487
|
-
data_f32.resize(nelements);
|
1488
|
-
finp.read(reinterpret_cast<char *>(data_f32.data()), nelements * sizeof(float));
|
1577
|
+
std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false,
|
1578
|
+
/*vocab_only*/ false));
|
1579
|
+
llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
|
1580
|
+
|
1581
|
+
size_t total_size_org = 0;
|
1582
|
+
size_t total_size_new = 0;
|
1583
|
+
std::vector<int64_t> hist_all(1 << 4, 0);
|
1584
|
+
|
1585
|
+
size_t idx = 0;
|
1586
|
+
for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) {
|
1587
|
+
llama_buffer read_data;
|
1588
|
+
read_data.resize(tensor.size);
|
1589
|
+
tensor.data = read_data.addr;
|
1590
|
+
model_loader->load_data_for(tensor);
|
1591
|
+
|
1592
|
+
printf("[%4zu/%4zu] %36s - %16s, type = %6s, ",
|
1593
|
+
++idx, model_loader->tensors_map.tensors.size(),
|
1594
|
+
tensor.name.c_str(), llama_format_tensor_shape(tensor.ne).c_str(),
|
1595
|
+
ggml_type_name(tensor.type));
|
1596
|
+
|
1597
|
+
// This used to be a regex, but <regex> has an extreme cost to compile times.
|
1598
|
+
bool quantize = tensor.name.rfind("weight") == tensor.name.size() - 6; // ends with 'weight'?
|
1599
|
+
|
1600
|
+
// quantize only 2D tensors
|
1601
|
+
quantize &= (tensor.ne.size() == 2);
|
1602
|
+
|
1603
|
+
enum ggml_type new_type;
|
1604
|
+
void * new_data;
|
1605
|
+
size_t new_size;
|
1606
|
+
llama_buffer work;
|
1607
|
+
|
1608
|
+
if (!quantize) {
|
1609
|
+
new_type = tensor.type;
|
1610
|
+
new_data = tensor.data;
|
1611
|
+
new_size = tensor.size;
|
1612
|
+
printf("size = %8.3f MB\n", tensor.size/1024.0/1024.0);
|
1613
|
+
} else {
|
1614
|
+
new_type = quantized_type;
|
1615
|
+
float * f32_data;
|
1616
|
+
size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
|
1617
|
+
llama_buffer f32_conv_buf;
|
1618
|
+
if (tensor.type == GGML_TYPE_F32) {
|
1619
|
+
f32_data = (float *) tensor.data;
|
1620
|
+
} else if (tensor.type == GGML_TYPE_F16) {
|
1621
|
+
f32_conv_buf.resize(nelements * sizeof(float));
|
1622
|
+
f32_data = (float *) f32_conv_buf.addr;
|
1623
|
+
auto f16_data = (const ggml_fp16_t *) tensor.data;
|
1624
|
+
for (size_t i = 0; i < nelements; i++) {
|
1625
|
+
f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
|
1489
1626
|
}
|
1490
|
-
|
1491
|
-
ftype = itype;
|
1492
1627
|
} else {
|
1493
|
-
|
1494
|
-
|
1495
|
-
data_u8.resize(nelements*bpe);
|
1496
|
-
finp.read(reinterpret_cast<char *>(data_u8.data()), nelements * bpe);
|
1628
|
+
throw format("type %s unsupported for integer quantization", ggml_type_name(tensor.type));
|
1497
1629
|
}
|
1498
1630
|
|
1499
|
-
|
1500
|
-
|
1501
|
-
|
1502
|
-
|
1503
|
-
|
1631
|
+
printf("quantizing .. ");
|
1632
|
+
fflush(stdout);
|
1633
|
+
|
1634
|
+
work.resize(nelements * 4); // upper bound on size
|
1635
|
+
new_data = work.addr;
|
1636
|
+
std::vector<int64_t> hist_cur(1 << 4, 0);
|
1637
|
+
|
1638
|
+
switch (new_type) {
|
1639
|
+
case GGML_TYPE_Q4_0:
|
1640
|
+
{
|
1641
|
+
new_size = ggml_quantize_q4_0(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
|
1642
|
+
} break;
|
1643
|
+
case GGML_TYPE_Q4_1:
|
1644
|
+
{
|
1645
|
+
new_size = ggml_quantize_q4_1(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
|
1646
|
+
} break;
|
1647
|
+
default:
|
1648
|
+
LLAMA_ASSERT(false);
|
1504
1649
|
}
|
1505
|
-
fout.write(&name[0], length);
|
1506
1650
|
|
1507
|
-
|
1508
|
-
|
1509
|
-
|
1510
|
-
offset = (offset + 31) & -32;
|
1511
|
-
fout.seekp(offset);
|
1651
|
+
printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
|
1652
|
+
for (size_t i = 0; i < hist_cur.size(); i++) {
|
1653
|
+
hist_all[i] += hist_cur[i];
|
1512
1654
|
}
|
1513
1655
|
|
1514
|
-
|
1515
|
-
printf("
|
1516
|
-
work.resize(nelements); // for quantization
|
1517
|
-
|
1518
|
-
size_t cur_size = 0;
|
1519
|
-
std::vector<int64_t> hist_cur(1 << 4, 0);
|
1520
|
-
|
1521
|
-
switch (type) {
|
1522
|
-
case GGML_TYPE_Q4_0:
|
1523
|
-
{
|
1524
|
-
cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
|
1525
|
-
} break;
|
1526
|
-
case GGML_TYPE_Q4_1:
|
1527
|
-
{
|
1528
|
-
cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
|
1529
|
-
} break;
|
1530
|
-
default:
|
1531
|
-
{
|
1532
|
-
fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, type);
|
1533
|
-
return false;
|
1534
|
-
}
|
1535
|
-
}
|
1536
|
-
|
1537
|
-
fout.write(reinterpret_cast<char *>(work.data()), cur_size);
|
1538
|
-
total_size_new += cur_size;
|
1539
|
-
|
1540
|
-
printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
|
1541
|
-
for (int i = 0; i < (int) hist_cur.size(); ++i) {
|
1542
|
-
hist_all[i] += hist_cur[i];
|
1543
|
-
}
|
1544
|
-
|
1545
|
-
for (int i = 0; i < (int) hist_cur.size(); ++i) {
|
1546
|
-
printf("%5.3f ", hist_cur[i] / float(nelements));
|
1547
|
-
}
|
1548
|
-
printf("\n");
|
1549
|
-
} else {
|
1550
|
-
printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0);
|
1551
|
-
fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size());
|
1552
|
-
total_size_new += data_u8.size();
|
1656
|
+
for (size_t i = 0; i < hist_cur.size(); i++) {
|
1657
|
+
printf("%5.3f ", hist_cur[i] / float(nelements));
|
1553
1658
|
}
|
1554
|
-
|
1555
|
-
total_size_org += nelements * sizeof(float);
|
1659
|
+
printf("\n");
|
1556
1660
|
}
|
1661
|
+
total_size_org += tensor.size;
|
1662
|
+
total_size_new += new_size;
|
1663
|
+
file_saver.write_tensor(tensor, new_type, new_data, new_size);
|
1664
|
+
}
|
1557
1665
|
|
1558
|
-
|
1559
|
-
|
1666
|
+
printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
|
1667
|
+
printf("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
|
1560
1668
|
|
1561
|
-
|
1562
|
-
|
1563
|
-
|
1564
|
-
|
1565
|
-
|
1669
|
+
{
|
1670
|
+
int64_t sum_all = 0;
|
1671
|
+
for (size_t i = 0; i < hist_all.size(); i++) {
|
1672
|
+
sum_all += hist_all[i];
|
1673
|
+
}
|
1566
1674
|
|
1567
|
-
|
1568
|
-
|
1569
|
-
|
1570
|
-
}
|
1571
|
-
printf("\n");
|
1675
|
+
printf("%s: hist: ", __func__);
|
1676
|
+
for (size_t i = 0; i < hist_all.size(); i++) {
|
1677
|
+
printf("%5.3f ", hist_all[i] / float(sum_all));
|
1572
1678
|
}
|
1679
|
+
printf("\n");
|
1573
1680
|
}
|
1574
|
-
|
1575
|
-
finp.close();
|
1576
|
-
fout.close();
|
1577
|
-
|
1578
|
-
return true;
|
1579
1681
|
}
|
1580
1682
|
|
1581
1683
|
//
|
@@ -1593,32 +1695,36 @@ struct llama_context * llama_init_from_file(
|
|
1593
1695
|
params.seed = time(NULL);
|
1594
1696
|
}
|
1595
1697
|
|
1698
|
+
unsigned cur_percentage = 0;
|
1699
|
+
if (params.progress_callback == NULL) {
|
1700
|
+
params.progress_callback_user_data = &cur_percentage;
|
1701
|
+
params.progress_callback = [](float progress, void * ctx) {
|
1702
|
+
unsigned * cur_percentage_p = (unsigned *) ctx;
|
1703
|
+
unsigned percentage = (unsigned) (100 * progress);
|
1704
|
+
while (percentage > *cur_percentage_p) {
|
1705
|
+
++*cur_percentage_p;
|
1706
|
+
fprintf(stderr, ".");
|
1707
|
+
fflush(stderr);
|
1708
|
+
if (percentage >= 100) {
|
1709
|
+
fprintf(stderr, "\n");
|
1710
|
+
}
|
1711
|
+
}
|
1712
|
+
};
|
1713
|
+
}
|
1714
|
+
|
1596
1715
|
ctx->rng = std::mt19937(params.seed);
|
1597
1716
|
ctx->logits_all = params.logits_all;
|
1598
1717
|
|
1599
1718
|
ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
|
1600
1719
|
|
1601
|
-
if (!llama_model_load(path_model, *ctx, params.n_ctx,
|
1602
|
-
params.
|
1603
|
-
params.progress_callback_user_data)) {
|
1720
|
+
if (!llama_model_load(path_model, *ctx, params.n_ctx, memory_type,
|
1721
|
+
params.use_mmap, params.use_mlock, params.vocab_only,
|
1722
|
+
params.progress_callback, params.progress_callback_user_data)) {
|
1604
1723
|
fprintf(stderr, "%s: failed to load model\n", __func__);
|
1605
1724
|
llama_free(ctx);
|
1606
1725
|
return nullptr;
|
1607
1726
|
}
|
1608
1727
|
|
1609
|
-
if (params.use_mlock) {
|
1610
|
-
char *err;
|
1611
|
-
if (!ggml_mlock(ctx->model.ctx,
|
1612
|
-
ctx->model.mm_addr,
|
1613
|
-
ctx->model.mm_length,
|
1614
|
-
&err)) {
|
1615
|
-
fprintf(stderr, "%s\n", err);
|
1616
|
-
free(err);
|
1617
|
-
llama_free(ctx);
|
1618
|
-
return nullptr;
|
1619
|
-
}
|
1620
|
-
}
|
1621
|
-
|
1622
1728
|
// reserve memory for context buffers
|
1623
1729
|
if (!params.vocab_only) {
|
1624
1730
|
if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx)) {
|
@@ -1645,50 +1751,289 @@ struct llama_context * llama_init_from_file(
|
|
1645
1751
|
ctx->embedding.resize(hparams.n_embd);
|
1646
1752
|
}
|
1647
1753
|
|
1648
|
-
ctx->buf_compute.resize(MEM_REQ_EVAL.at(ctx->model.type));
|
1754
|
+
ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));
|
1649
1755
|
|
1650
|
-
ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0.at(ctx->model.type));
|
1651
|
-
ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1.at(ctx->model.type));
|
1756
|
+
ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type));
|
1757
|
+
ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
|
1652
1758
|
}
|
1653
1759
|
|
1654
1760
|
return ctx;
|
1655
1761
|
}
|
1656
1762
|
|
1657
1763
|
// Destroys a context object.
//
// The only action taken here is `delete ctx`. Releasing the model buffers,
// file mappings and KV cache is presumably handled by the destructors of the
// context's members (not visible in this function) -- TODO confirm.
void llama_free(struct llama_context * ctx) {
    delete ctx;
}
|
1670
1766
|
|
1671
1767
|
int llama_model_quantize(
|
1672
1768
|
const char * fname_inp,
|
1673
1769
|
const char * fname_out,
|
1674
|
-
|
1675
|
-
|
1676
|
-
|
1770
|
+
enum llama_ftype ftype) {
|
1771
|
+
try {
|
1772
|
+
llama_model_quantize_internal(fname_inp, fname_out, ftype);
|
1773
|
+
return 0;
|
1774
|
+
} catch (const std::string & err) {
|
1775
|
+
fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str());
|
1776
|
+
return 1;
|
1777
|
+
}
|
1778
|
+
}
|
1779
|
+
|
1780
|
+
int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
|
1781
|
+
fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
|
1782
|
+
|
1783
|
+
auto & model = ctx->model;
|
1784
|
+
|
1785
|
+
const int64_t t_start_lora_us = ggml_time_us();
|
1786
|
+
|
1787
|
+
auto fin = std::ifstream(path_lora, std::ios::binary);
|
1788
|
+
if (!fin) {
|
1789
|
+
fprintf(stderr, "%s: failed to open '%s'\n", __func__, path_lora);
|
1677
1790
|
return 1;
|
1678
1791
|
}
|
1679
1792
|
|
1793
|
+
// verify magic and version
|
1794
|
+
{
|
1795
|
+
uint32_t magic;
|
1796
|
+
fin.read((char *) &magic, sizeof(magic));
|
1797
|
+
if (magic != 'ggla') {
|
1798
|
+
fprintf(stderr, "%s: bad file magic\n", __func__);
|
1799
|
+
return 1;
|
1800
|
+
}
|
1801
|
+
uint32_t format_version;
|
1802
|
+
fin.read((char *) &format_version, sizeof(format_version));
|
1803
|
+
|
1804
|
+
if (format_version != 1) {
|
1805
|
+
fprintf(stderr, "%s: unsupported file version\n", __func__ );
|
1806
|
+
return 1;
|
1807
|
+
}
|
1808
|
+
}
|
1809
|
+
|
1810
|
+
int32_t lora_r;
|
1811
|
+
int32_t lora_alpha;
|
1812
|
+
fin.read((char *) &lora_r, sizeof(lora_r));
|
1813
|
+
fin.read((char *) &lora_alpha, sizeof(lora_alpha));
|
1814
|
+
float scaling = (float)lora_alpha / (float)lora_r;
|
1815
|
+
|
1816
|
+
fprintf(stderr, "%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
|
1817
|
+
|
1818
|
+
|
1819
|
+
// create a temporary ggml context to store the lora tensors
|
1820
|
+
// todo: calculate size from biggest possible tensor
|
1821
|
+
std::vector<uint8_t> lora_buf(1024ull * 1024ull * 1024ull);
|
1822
|
+
struct ggml_init_params params;
|
1823
|
+
params.mem_size = lora_buf.size();
|
1824
|
+
params.mem_buffer = lora_buf.data();
|
1825
|
+
params.no_alloc = false;
|
1826
|
+
|
1827
|
+
ggml_context * lora_ctx = ggml_init(params);
|
1828
|
+
std::unordered_map<std::string, struct ggml_tensor *> lora_tensors;
|
1829
|
+
|
1830
|
+
// create a name -> tensor map of the model to accelerate lookups
|
1831
|
+
std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
|
1832
|
+
for (auto & kv: model.tensors_by_name) {
|
1833
|
+
model_tensors.insert(kv);
|
1834
|
+
}
|
1835
|
+
|
1836
|
+
|
1837
|
+
// load base model
|
1838
|
+
std::unique_ptr<llama_model_loader> model_loader;
|
1839
|
+
ggml_context * base_ctx = NULL;
|
1840
|
+
llama_buffer base_buf;
|
1841
|
+
if (path_base_model) {
|
1842
|
+
fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
|
1843
|
+
model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false));
|
1844
|
+
|
1845
|
+
size_t ctx_size, mmapped_size;
|
1846
|
+
model_loader->calc_sizes(&ctx_size, &mmapped_size);
|
1847
|
+
base_buf.resize(ctx_size);
|
1848
|
+
|
1849
|
+
ggml_init_params base_params;
|
1850
|
+
base_params.mem_size = base_buf.size;
|
1851
|
+
base_params.mem_buffer = base_buf.addr;
|
1852
|
+
base_params.no_alloc = model_loader->use_mmap;
|
1853
|
+
|
1854
|
+
base_ctx = ggml_init(base_params);
|
1855
|
+
|
1856
|
+
model_loader->ggml_ctx = base_ctx;
|
1857
|
+
|
1858
|
+
// maybe this should in llama_model_loader
|
1859
|
+
if (model_loader->use_mmap) {
|
1860
|
+
model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ false));
|
1861
|
+
}
|
1862
|
+
}
|
1863
|
+
|
1864
|
+
// read tensors and apply
|
1865
|
+
bool warned = false;
|
1866
|
+
int n_tensors = 0;
|
1867
|
+
while (true) {
|
1868
|
+
int32_t n_dims;
|
1869
|
+
int32_t length;
|
1870
|
+
int32_t ftype;
|
1871
|
+
|
1872
|
+
fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
|
1873
|
+
fin.read(reinterpret_cast<char *>(&length), sizeof(length));
|
1874
|
+
fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
|
1875
|
+
if (fin.eof()) {
|
1876
|
+
break;
|
1877
|
+
}
|
1878
|
+
|
1879
|
+
int32_t ne[2] = { 1, 1 };
|
1880
|
+
for (int i = 0; i < n_dims; ++i) {
|
1881
|
+
fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
|
1882
|
+
}
|
1883
|
+
|
1884
|
+
std::string name(length, 0);
|
1885
|
+
fin.read(&name[0], length);
|
1886
|
+
|
1887
|
+
// check for lora suffix and get the type of tensor
|
1888
|
+
const std::string lora_suffix = ".lora";
|
1889
|
+
size_t pos = name.rfind(lora_suffix);
|
1890
|
+
if (pos == std::string::npos) {
|
1891
|
+
fprintf(stderr, "%s: error: '%s' is not a lora tensor\n", __func__, name.c_str());
|
1892
|
+
return 1;
|
1893
|
+
}
|
1894
|
+
|
1895
|
+
std::string lora_type = name.substr(pos + lora_suffix.length());
|
1896
|
+
std::string base_name = name;
|
1897
|
+
base_name.erase(pos);
|
1898
|
+
// fprintf(stderr, "%s: %s => %s (lora type %s) ", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());
|
1899
|
+
|
1900
|
+
if (model_tensors.find(base_name.data()) == model_tensors.end()) {
|
1901
|
+
fprintf(stderr, "%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
|
1902
|
+
return 1;
|
1903
|
+
}
|
1904
|
+
|
1905
|
+
// create ggml tensor
|
1906
|
+
ggml_type wtype;
|
1907
|
+
switch (ftype) {
|
1908
|
+
case 0: wtype = GGML_TYPE_F32; break;
|
1909
|
+
case 1: wtype = GGML_TYPE_F16; break;
|
1910
|
+
default:
|
1911
|
+
{
|
1912
|
+
fprintf(stderr, "%s: invalid tensor data type '%d'\n",
|
1913
|
+
__func__, ftype);
|
1914
|
+
return false;
|
1915
|
+
}
|
1916
|
+
}
|
1917
|
+
ggml_tensor* lora_tensor;
|
1918
|
+
if (n_dims == 2) {
|
1919
|
+
lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]);
|
1920
|
+
}
|
1921
|
+
else {
|
1922
|
+
fprintf(stderr, "%s: unsupported tensor dimension %d\n", __func__, n_dims);
|
1923
|
+
return 1;
|
1924
|
+
}
|
1925
|
+
|
1926
|
+
// load tensor data
|
1927
|
+
size_t offset = fin.tellg();
|
1928
|
+
size_t tensor_data_size = ggml_nbytes(lora_tensor);
|
1929
|
+
offset = (offset + 31) & -32;
|
1930
|
+
fin.seekg(offset);
|
1931
|
+
fin.read((char*)lora_tensor->data, tensor_data_size);
|
1932
|
+
|
1933
|
+
lora_tensors[name] = lora_tensor;
|
1934
|
+
|
1935
|
+
// check if we have both A and B tensors and apply
|
1936
|
+
if (lora_tensors.find(base_name + ".loraA") != lora_tensors.end() &&
|
1937
|
+
lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) {
|
1938
|
+
|
1939
|
+
ggml_tensor * dest_t = model_tensors[base_name];
|
1940
|
+
ggml_tensor * base_t;
|
1941
|
+
if (model_loader) {
|
1942
|
+
// load from base model
|
1943
|
+
if (model_loader->tensors_map.name_to_idx.find(base_name) == model_loader->tensors_map.name_to_idx.end()) {
|
1944
|
+
fprintf(stderr, "%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
|
1945
|
+
return 1;
|
1946
|
+
}
|
1947
|
+
size_t idx = model_loader->tensors_map.name_to_idx[base_name];
|
1948
|
+
llama_load_tensor & lt = model_loader->tensors_map.tensors[idx];
|
1949
|
+
base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] });
|
1950
|
+
lt.data = (uint8_t *) lt.ggml_tensor->data;
|
1951
|
+
model_loader->load_data_for(lt);
|
1952
|
+
lt.ggml_tensor->data = lt.data;
|
1953
|
+
}
|
1954
|
+
else {
|
1955
|
+
base_t = dest_t;
|
1956
|
+
}
|
1957
|
+
|
1958
|
+
if (base_t->type == GGML_TYPE_Q4_0 || base_t->type == GGML_TYPE_Q4_1) {
|
1959
|
+
if (!warned) {
|
1960
|
+
fprintf(stderr, "%s: warning: using a lora adapter with a quantized model may result in poor quality, "
|
1961
|
+
"use a f16 or f32 base model with --lora-base\n", __func__);
|
1962
|
+
warned = true;
|
1963
|
+
}
|
1964
|
+
}
|
1965
|
+
|
1966
|
+
ggml_tensor * loraA = lora_tensors[base_name + ".loraA"];
|
1967
|
+
ggml_tensor * loraB = lora_tensors[base_name + ".loraB"];
|
1968
|
+
|
1969
|
+
if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
|
1970
|
+
fprintf(stderr, "%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
|
1971
|
+
" are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]);
|
1972
|
+
return 1;
|
1973
|
+
}
|
1974
|
+
|
1975
|
+
// w = w + BA*s
|
1976
|
+
ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
|
1977
|
+
|
1978
|
+
if (scaling != 1.0f) {
|
1979
|
+
ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
|
1980
|
+
BA = ggml_scale(lora_ctx, BA, scale_tensor);
|
1981
|
+
}
|
1982
|
+
|
1983
|
+
ggml_tensor * r;
|
1984
|
+
if (base_t == dest_t) {
|
1985
|
+
r = ggml_add_inplace(lora_ctx, dest_t, BA);
|
1986
|
+
}
|
1987
|
+
else {
|
1988
|
+
r = ggml_add(lora_ctx, base_t, BA);
|
1989
|
+
r = ggml_cpy(lora_ctx, r, dest_t);
|
1990
|
+
}
|
1991
|
+
|
1992
|
+
struct ggml_cgraph gf = ggml_build_forward(r);
|
1993
|
+
gf.n_threads = n_threads;
|
1994
|
+
ggml_graph_compute(lora_ctx, &gf);
|
1995
|
+
|
1996
|
+
// we won't need these tensors again, reset the context to save memory
|
1997
|
+
ggml_free(lora_ctx);
|
1998
|
+
lora_ctx = ggml_init(params);
|
1999
|
+
lora_tensors.clear();
|
2000
|
+
|
2001
|
+
n_tensors++;
|
2002
|
+
if (n_tensors % 4 == 0)
|
2003
|
+
fprintf(stderr, ".");
|
2004
|
+
}
|
2005
|
+
}
|
2006
|
+
|
2007
|
+
// TODO: this should be in a destructor, it will leak on failure
|
2008
|
+
ggml_free(lora_ctx);
|
2009
|
+
if (base_ctx) {
|
2010
|
+
ggml_free(base_ctx);
|
2011
|
+
}
|
2012
|
+
|
2013
|
+
const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
|
2014
|
+
fprintf(stderr, " done (%.2f ms)\n", t_lora_us / 1000.0);
|
2015
|
+
|
1680
2016
|
return 0;
|
1681
2017
|
}
|
1682
2018
|
|
2019
|
+
int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
|
2020
|
+
try {
|
2021
|
+
return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads);
|
2022
|
+
} catch (const std::string & err) {
|
2023
|
+
fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.c_str());
|
2024
|
+
return 1;
|
2025
|
+
}
|
2026
|
+
}
|
2027
|
+
|
1683
2028
|
// Returns the KV cache that will contain the context for the
|
1684
2029
|
// ongoing prediction with the model.
|
1685
2030
|
const uint8_t * llama_get_kv_cache(struct llama_context * ctx) {
|
1686
|
-
return ctx->model.kv_self.buf.
|
2031
|
+
return ctx->model.kv_self.buf.addr;
|
1687
2032
|
}
|
1688
2033
|
|
1689
2034
|
// Returns the size of the KV cache
|
1690
2035
|
size_t llama_get_kv_cache_size(struct llama_context * ctx) {
|
1691
|
-
return ctx->model.kv_self.buf.size
|
2036
|
+
return ctx->model.kv_self.buf.size;
|
1692
2037
|
}
|
1693
2038
|
|
1694
2039
|
int llama_get_kv_cache_token_count(struct llama_context * ctx) {
|
@@ -1702,8 +2047,8 @@ void llama_set_kv_cache(
|
|
1702
2047
|
size_t n_size,
|
1703
2048
|
int n_token_count) {
|
1704
2049
|
// Make sure we have the same kv cache setup
|
1705
|
-
LLAMA_ASSERT(ctx->model.kv_self.buf.size
|
1706
|
-
memcpy(ctx->model.kv_self.buf.
|
2050
|
+
LLAMA_ASSERT(ctx->model.kv_self.buf.size == n_size);
|
2051
|
+
memcpy(ctx->model.kv_self.buf.addr, kv_cache, n_size);
|
1707
2052
|
ctx->model.kv_self.n = n_token_count;
|
1708
2053
|
}
|
1709
2054
|
|
@@ -1814,9 +2159,9 @@ llama_token llama_sample_top_p_top_k(
|
|
1814
2159
|
void llama_print_timings(struct llama_context * ctx) {
|
1815
2160
|
const int64_t t_end_us = ggml_time_us();
|
1816
2161
|
|
1817
|
-
const int32_t n_sample =
|
1818
|
-
const int32_t n_eval =
|
1819
|
-
const int32_t n_p_eval =
|
2162
|
+
const int32_t n_sample = std::max(1, ctx->n_sample);
|
2163
|
+
const int32_t n_eval = std::max(1, ctx->n_eval);
|
2164
|
+
const int32_t n_p_eval = std::max(1, ctx->n_p_eval);
|
1820
2165
|
|
1821
2166
|
fprintf(stderr, "\n");
|
1822
2167
|
fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
|
@@ -1837,18 +2182,25 @@ const char * llama_print_system_info(void) {
|
|
1837
2182
|
static std::string s;
|
1838
2183
|
|
1839
2184
|
s = "";
|
1840
|
-
s += "AVX = "
|
1841
|
-
s += "AVX2 = "
|
1842
|
-
s += "AVX512 = "
|
1843
|
-
s += "
|
1844
|
-
s += "
|
1845
|
-
s += "
|
1846
|
-
s += "
|
1847
|
-
s += "
|
1848
|
-
s += "
|
1849
|
-
s += "
|
1850
|
-
s += "
|
1851
|
-
s += "
|
2185
|
+
s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
|
2186
|
+
s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
|
2187
|
+
s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
|
2188
|
+
s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
|
2189
|
+
s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | ";
|
2190
|
+
s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
|
2191
|
+
s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
|
2192
|
+
s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
|
2193
|
+
s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
|
2194
|
+
s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
|
2195
|
+
s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
|
2196
|
+
s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
|
2197
|
+
s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
|
2198
|
+
s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
|
1852
2199
|
|
1853
2200
|
return s.c_str();
|
1854
2201
|
}
|
2202
|
+
|
2203
|
+
// For internal test use
|
2204
|
+
std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
|
2205
|
+
return ctx->model.tensors_by_name;
|
2206
|
+
}
|