llama_cpp 0.0.3 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +36 -0
- data/README.md +5 -4
- data/ext/llama_cpp/extconf.rb +38 -0
- data/ext/llama_cpp/llama_cpp.cpp +118 -2
- data/ext/llama_cpp/src/ggml.c +1740 -658
- data/ext/llama_cpp/src/ggml.h +84 -16
- data/ext/llama_cpp/src/llama.cpp +1108 -756
- data/ext/llama_cpp/src/llama.h +37 -1
- data/ext/llama_cpp/src/llama_util.h +396 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +3 -3
- data/sig/llama_cpp.rbs +6 -0
- metadata +3 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -1,49 +1,33 @@
+// Defines fileno on msys:
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#include <cstdint>
+#include <cstdio>
+#endif
+
+#include "llama_util.h"
 #include "llama.h"

 #include "ggml.h"

+#include <array>
+#include <ctime>
 #include <cinttypes>
 #include <fstream>
 #include <random>
 #include <map>
 #include <unordered_map>
 #include <queue>
-#include <regex>
 #include <cassert>
 #include <cstring>
-
-#
-#
-#include <
-#else
-#include <sys/types.h>
-#include <sys/mman.h>
-#include <unistd.h>
-#include <fcntl.h>
-#endif
-
-#define Min(X, Y) ((Y) > (X) ? (X) : (Y))
-#define Max(X, Y) ((Y) < (X) ? (X) : (Y))
+#include <climits>
+#include <memory>
+#include <algorithm>
+#include <initializer_list>

 #define LLAMA_USE_SCRATCH
 #define LLAMA_MAX_SCRATCH_BUFFERS 16

-#define LLAMA_ASSERT(x) \
-    do { \
-        if (!(x)) { \
-            fprintf(stderr, "LLAMA_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
-            abort(); \
-        } \
-    } while (0)
-
-
-// determine number of model parts based on the dimension
-static const std::unordered_map<int, int> LLAMA_N_PARTS = {
-    { 4096, 1 },
-    { 5120, 2 },
-    { 6656, 4 },
-    { 8192, 8 },
-};

 // available llama models
 enum e_model {
@@ -60,47 +44,67 @@ static const size_t MB = 1024*1024;
 // TODO: dynamically determine these sizes
 // needs modifications in ggml

-static const std::map<e_model, size_t> MEM_REQ_SCRATCH0
-
-
-
-
-}
+static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
+{
+    static std::map<e_model, size_t> _MEM_REQ_SCRATCH0 = {
+        { MODEL_7B, 512ull * MB },
+        { MODEL_13B, 512ull * MB },
+        { MODEL_30B, 512ull * MB },
+        { MODEL_65B, 512ull * MB },
+    };
+    return _MEM_REQ_SCRATCH0;
+}

-static const std::map<e_model, size_t> MEM_REQ_SCRATCH1
-
-
-
-
+static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
+{
+    static std::map<e_model, size_t> _MEM_REQ_SCRATCH1 = {
+        { MODEL_7B, 512ull * MB },
+        { MODEL_13B, 512ull * MB },
+        { MODEL_30B, 512ull * MB },
+        { MODEL_65B, 512ull * MB },
+    };
+    return _MEM_REQ_SCRATCH1;
 };

 // 2*n_embd*n_ctx*n_layer*sizeof(float16)
-static const std::map<e_model, size_t> MEM_REQ_KV_SELF
-
-
-
-
+static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
+{
+    static std::map<e_model, size_t> _MEM_REQ_KV_SELF = {
+        { MODEL_7B, 1026ull * MB },
+        { MODEL_13B, 1608ull * MB },
+        { MODEL_30B, 3124ull * MB },
+        { MODEL_65B, 5120ull * MB },
+    };
+    return _MEM_REQ_KV_SELF;
 };

 // this is mostly needed for temporary mul_mat buffers to dequantize the data
 // not actually needed if BLAS is disabled
-static const std::map<e_model, size_t> MEM_REQ_EVAL
-
-
-
-
+static const std::map<e_model, size_t> & MEM_REQ_EVAL()
+{
+    static std::map<e_model, size_t> _MEM_REQ_EVAL = {
+        { MODEL_7B, 768ull * MB },
+        { MODEL_13B, 1024ull * MB },
+        { MODEL_30B, 1280ull * MB },
+        { MODEL_65B, 1536ull * MB },
+    };
+    return _MEM_REQ_EVAL;
 };

 // default hparams (LLaMA 7B)
 struct llama_hparams {
-
-
-
-
-
-
-
-
+    uint32_t n_vocab = 32000;
+    uint32_t n_ctx = 512; // this is provided as user input?
+    uint32_t n_embd = 4096;
+    uint32_t n_mult = 256;
+    uint32_t n_head = 32;
+    uint32_t n_layer = 32;
+    uint32_t n_rot = 64;
+    enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
+
+    bool operator!=(const llama_hparams & other) const {
+        return memcmp(this, &other, sizeof(llama_hparams));
+    }
 };

 struct llama_layer {
@@ -126,11 +130,17 @@ struct llama_kv_cache {
     struct ggml_tensor * k;
     struct ggml_tensor * v;

-    struct ggml_context * ctx;
+    struct ggml_context * ctx = NULL;

-
+    llama_buffer buf;

     int n; // number of tokens currently in the cache
+
+    ~llama_kv_cache() {
+        if (ctx) {
+            ggml_free(ctx);
+        }
+    }
 };

 struct llama_model {
@@ -146,22 +156,30 @@ struct llama_model {
     std::vector<llama_layer> layers;

     // context
-    struct ggml_context * ctx;
+    struct ggml_context * ctx = NULL;

     // key + value cache for the self attention
     // TODO: move to llama_state
     struct llama_kv_cache kv_self;

     // the model memory buffer
-
+    llama_buffer buf;

     // model memory mapped file
-
-
+    std::unique_ptr<llama_mmap> mapping;
+
+    // objects representing data potentially being locked in memory
+    llama_mlock mlock_buf;
+    llama_mlock mlock_mmap;
+
+    // for quantize-stats only
+    std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;

-
-
-
+    ~llama_model() {
+        if (ctx) {
+            ggml_free(ctx);
+        }
+    }
 };

 struct llama_vocab {
@@ -206,8 +224,8 @@ struct llama_context {

     // memory buffers used to evaluate the model
     // TODO: move in llama_state
-
-
+    llama_buffer buf_compute;
+    llama_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];

     int buf_last = 0;
     size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
@@ -220,11 +238,11 @@ struct llama_context {
             last_size = ggml_set_scratch(ctx, { 0, 0, nullptr, });
         } else {
             auto & buf = buf_scratch[i];
-            last_size = ggml_set_scratch(ctx, { 0, buf.size
+            last_size = ggml_set_scratch(ctx, { 0, buf.size, buf.addr, });
         }

         if (buf_last >= 0) {
-            buf_max_size[buf_last] =
+            buf_max_size[buf_last] = std::max(buf_max_size[buf_last], last_size);
         }

         buf_last = i;
@@ -244,6 +262,500 @@ struct llama_context {
     }
 };

+template <typename T>
+static T checked_mul(T a, T b) {
+    T ret = a * b;
+    if (a != 0 && ret / a != b) {
+        throw format("overflow multiplying %llu * %llu",
+                     (unsigned long long) a, (unsigned long long) b);
+    }
+    return ret;
+}
+
+static size_t checked_div(size_t a, size_t b) {
+    if (b == 0 || a % b != 0) {
+        throw format("error dividing %zu / %zu", a, b);
+    }
+    return a / b;
+}
+
+static std::string llama_format_tensor_shape(const std::vector<uint32_t> & ne) {
+    char buf[256];
+    snprintf(buf, sizeof(buf), "%5u", ne.at(0));
+    for (size_t i = 1; i < ne.size(); i++) {
+        snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), " x %5u", ne.at(i));
+    }
+    return buf;
+}
+
+static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml_type type) {
+    size_t size = ggml_type_size(type);
+    for (uint32_t dim : ne) {
+        size = checked_mul<size_t>(size, dim);
+    }
+    return size / ggml_blck_size(type);
+}
+
+struct llama_load_tensor_shard {
+    std::vector<uint32_t> ne;
+    size_t size;
+    enum ggml_type type;
+    size_t file_idx;
+    size_t file_off;
+
+    void calc_size() {
+        size = llama_calc_tensor_size(ne, type);
+    }
+};
+
+enum llama_split_type {
+    SPLIT_NONE,
+    SPLIT_BY_COLUMNS,
+    SPLIT_BY_ROWS
+};
+
+struct llama_load_tensor {
+    std::vector<llama_load_tensor_shard> shards;
+
+    std::string name;
+    enum ggml_type type = GGML_TYPE_F32;
+    llama_split_type split_type = SPLIT_NONE;
+    std::vector<uint32_t> ne;
+    size_t size;
+    struct ggml_tensor * ggml_tensor = NULL;
+    uint8_t * data;
+
+    llama_load_tensor(const std::string & name) : name(name) {}
+
+    void calc_all() {
+        calc_type();
+        calc_split_type();
+        calc_ne();
+        calc_size();
+    }
+
+    void calc_type() {
+        const auto & first_shard = shards.at(0);
+        for (const auto & shard : shards) {
+            if (shard.type != first_shard.type) {
+                throw format("inconsistent tensor shard type in '%s'", name.c_str());
+            }
+        }
+        type = first_shard.type;
+    }
+
+    void calc_split_type() {
+        if (shards.at(0).ne.size() == 1 || // 1D tensors are just duplicated in every file
+            shards.size() == 1) { // only one file?
+            split_type = SPLIT_NONE;
+        } else if (name.find("tok_embeddings.") == 0 ||
+            name.find(".attention.wo.weight") != std::string::npos ||
+            name.find(".feed_forward.w2.weight") != std::string::npos) {
+            split_type = SPLIT_BY_COLUMNS;
+        } else {
+            split_type = SPLIT_BY_ROWS;
+        }
+    }
+
+    void calc_ne() {
+        const auto & first_shard = shards.at(0);
+        for (const auto & shard : shards) {
+            if (shard.ne != first_shard.ne) {
+                throw format("inconsistent tensor shard shape in '%s': first was %s, other was %s",
+                             name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str());
+            }
+        }
+        ne = first_shard.ne;
+        LLAMA_ASSERT(shards.size() <= UINT32_MAX);
+        uint32_t n_shards = (uint32_t) shards.size();
+        switch (split_type) {
+            case SPLIT_NONE:
+                ne = first_shard.ne;
+                break;
+            case SPLIT_BY_COLUMNS:
+                ne = {checked_mul<uint32_t>(first_shard.ne[0], n_shards),
+                      first_shard.ne[1]};
+                break;
+            case SPLIT_BY_ROWS:
+                ne = {first_shard.ne[0],
+                      checked_mul<uint32_t>(first_shard.ne[1], n_shards)};
+                break;
+        }
+    }
+
+    void calc_size() {
+        size = llama_calc_tensor_size(ne, type);
+    }
+};
+
+struct llama_load_tensors_map {
+    // tensors is kept in a separate vector to preserve file order
+    std::vector<llama_load_tensor> tensors;
+    std::unordered_map<std::string, size_t> name_to_idx;
+};
+
+enum llama_file_version {
+    LLAMA_FILE_VERSION_GGML,
+    LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab
+    LLAMA_FILE_VERSION_GGJT_V1, // added padding
+};
+
+struct llama_file_loader {
+    llama_file file;
+    llama_file_version file_version;
+    llama_hparams hparams;
+    llama_vocab vocab;
+
+    llama_file_loader(const char * fname, size_t file_idx, llama_load_tensors_map & tensors_map)
+        : file(fname, "rb") {
+        fprintf(stderr, "llama.cpp: loading model from %s\n", fname);
+        read_magic();
+        read_hparams();
+        read_vocab();
+        read_tensor_metadata(file_idx, tensors_map);
+    }
+    void read_magic() {
+        uint32_t magic = file.read_u32();
+        uint32_t version = 0;
+
+        if (magic != 'ggml') {
+            version = file.read_u32();
+        }
+
+        if (magic == 'ggml' && version == 0) {
+            file_version = LLAMA_FILE_VERSION_GGML;
+        } else if (magic == 'ggmf' && version == 1) {
+            file_version = LLAMA_FILE_VERSION_GGMF_V1;
+        } else if (magic == 'ggjt' && version == 1) {
+            file_version = LLAMA_FILE_VERSION_GGJT_V1;
+        } else {
+            throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
+                         magic, version);
+        }
+    }
+    void read_hparams() {
+        hparams.n_vocab = file.read_u32();
+        hparams.n_embd = file.read_u32();
+        hparams.n_mult = file.read_u32();
+        hparams.n_head = file.read_u32();
+        hparams.n_layer = file.read_u32();
+        hparams.n_rot = file.read_u32();
+        hparams.ftype = (enum llama_ftype) file.read_u32();
+    }
+    void read_vocab() {
+        vocab.id_to_token.resize(hparams.n_vocab);
+
+        for (uint32_t i = 0; i < hparams.n_vocab; i++) {
+            uint32_t len = file.read_u32();
+            std::string word = file.read_string(len);
+
+            float score = 0.0f;
+            if (file_version >= LLAMA_FILE_VERSION_GGMF_V1) {
+                file.read_raw(&score, sizeof(score));
+            }
+
+            vocab.token_to_id[word] = i;
+
+            auto & tok_score = vocab.id_to_token[i];
+            tok_score.tok = std::move(word);
+            tok_score.score = score;
+        }
+    }
+    void read_tensor_metadata(size_t file_idx, llama_load_tensors_map & tensors_map) {
+        while (file.tell() < file.size) {
+            llama_load_tensor_shard shard;
+            uint32_t n_dims = file.read_u32();
+            uint32_t name_len = file.read_u32();
+            shard.type = (enum ggml_type) file.read_u32();
+            shard.ne.resize(n_dims);
+            file.read_raw(shard.ne.data(), sizeof(shard.ne[0]) * n_dims);
+            std::string name = file.read_string(name_len);
+            if (n_dims < 1 || n_dims > 2) {
+                throw format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims);
+            }
+            switch (shard.type) {
+                case GGML_TYPE_F32:
+                case GGML_TYPE_F16:
+                case GGML_TYPE_Q4_0:
+                case GGML_TYPE_Q4_1:
+                    break;
+                default: {
+                    throw format("unrecognized tensor type %u\n", shard.type);
+                }
+            }
+
+            if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
+                // skip to the next multiple of 32 bytes
+                file.seek(-file.tell() & 31, SEEK_CUR);
+            }
+            shard.file_idx = file_idx;
+            shard.file_off = file.tell();
+
+            shard.calc_size();
+            file.seek(shard.size, SEEK_CUR);
+
+            auto it = tensors_map.name_to_idx.find(name);
+            size_t idx;
+            if (it != tensors_map.name_to_idx.end()) {
+                idx = it->second;
+            } else {
+                tensors_map.tensors.emplace_back(name);
+                idx = tensors_map.tensors.size() - 1;
+                tensors_map.name_to_idx.emplace(name, idx);
+            }
+            tensors_map.tensors.at(idx).shards.push_back(shard);
+        }
+    }
+};
+
+struct llama_file_saver {
+    llama_file file;
+    llama_file_loader * any_file_loader;
+    llama_file_saver(const char * fname, llama_file_loader * any_file_loader, enum llama_ftype new_ftype)
+        : file(fname, "wb"), any_file_loader(any_file_loader) {
+        fprintf(stderr, "llama.cpp: saving model to %s\n", fname);
+        write_magic();
+        write_hparams(new_ftype);
+        write_vocab();
+    }
+    void write_magic() {
+        file.write_u32('ggjt'); // magic
+        file.write_u32(1); // version
+    }
+    void write_hparams(enum llama_ftype new_ftype) {
+        const llama_hparams & hparams = any_file_loader->hparams;
+        file.write_u32(hparams.n_vocab);
+        file.write_u32(hparams.n_embd);
+        file.write_u32(hparams.n_mult);
+        file.write_u32(hparams.n_head);
+        file.write_u32(hparams.n_layer);
+        file.write_u32(hparams.n_rot);
+        file.write_u32(new_ftype);
+    }
+    void write_vocab() {
+        if (any_file_loader->file_version == LLAMA_FILE_VERSION_GGML) {
+            fprintf(stderr, "llama.cpp: WARNING: input is an old file that doesn't have scores; will add dummy scores\n");
+        }
+        uint32_t n_vocab = any_file_loader->hparams.n_vocab;
+        for (uint32_t i = 0; i < n_vocab; i++) {
+            const auto & token_score = any_file_loader->vocab.id_to_token.at(i);
+            file.write_u32((uint32_t) token_score.tok.size());
+            file.write_raw(token_score.tok.data(), token_score.tok.size());
+            file.write_raw(&token_score.score, sizeof(token_score.score));
+        }
+    }
+    void write_tensor(llama_load_tensor & tensor, enum ggml_type new_type, const void * new_data, size_t new_size) {
+        switch (new_type) {
+            case GGML_TYPE_F32:
+            case GGML_TYPE_F16:
+            case GGML_TYPE_Q4_0:
+            case GGML_TYPE_Q4_1:
+                break;
+            default: LLAMA_ASSERT(false);
+        }
+        file.write_u32((uint32_t) tensor.ne.size());
+        file.write_u32((uint32_t) tensor.name.size());
+        file.write_u32(new_type);
+        file.write_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * tensor.ne.size());
+        file.write_raw(tensor.name.data(), tensor.name.size());
+        file.seek(-file.tell() & 31, SEEK_CUR);
+        LLAMA_ASSERT(new_size == llama_calc_tensor_size(tensor.ne, new_type));
+        file.write_raw(new_data, new_size);
+    }
+};
+
+struct llama_model_loader {
+    std::vector<std::unique_ptr<llama_file_loader>> file_loaders;
+    llama_load_tensors_map tensors_map;
+    bool use_mmap;
+    size_t num_ggml_tensors_created = 0;
+    struct ggml_context * ggml_ctx = NULL;
+    std::unique_ptr<llama_mmap> mapping;
+
+    llama_model_loader(const std::string & fname_base, bool use_mmap, bool vocab_only) {
+        auto first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
+        file_loaders.emplace_back(first_file);
+        uint32_t n_parts = vocab_only ? 1 : guess_n_parts();
+        for (uint32_t i = 1; i < n_parts; i++) {
+            std::string fname = fname_base + "." + std::to_string(i);
+            auto ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
+            file_loaders.emplace_back(ith_file);
+            if (ith_file->hparams != first_file->hparams) {
+                throw format("llama.cpp: hparams inconsistent between files");
+            }
+        }
+        if (!llama_mmap::SUPPORTED) {
+            use_mmap = false;
+        }
+        if (use_mmap && alignment_prevents_mmap()) {
+            fprintf(stderr, "llama.cpp: can't use mmap because tensors are not aligned; convert to new format to avoid this\n");
+            use_mmap = false;
+        }
+        this->use_mmap = use_mmap;
+        for (llama_load_tensor & lt : tensors_map.tensors) {
+            lt.calc_all();
+        }
+    }
+
+    bool alignment_prevents_mmap() {
+        for (const llama_load_tensor & lt : tensors_map.tensors) {
+            for (const llama_load_tensor_shard & shard : lt.shards) {
+                if (shard.file_off & 3) {
+                    return true;
+                }
+            }
+        }
+        return false;
+    }
+
+    uint32_t guess_n_parts() const {
+        auto it = tensors_map.name_to_idx.find("tok_embeddings.weight");
+        if (it == tensors_map.name_to_idx.end()) {
+            throw std::string("missing tok_embeddings.weight");
+        }
+        const llama_load_tensor & lt = tensors_map.tensors.at(it->second);
+        return file_loaders.at(0)->hparams.n_embd / lt.shards.at(0).ne.at(0);
+    }
+
+    void calc_sizes(size_t * ctx_size_p, size_t * mmapped_size_p) const {
+        *ctx_size_p = *mmapped_size_p = 0;
+        for (const llama_load_tensor & lt : tensors_map.tensors) {
+            *ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE;
+            *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size;
+        }
+    }
+
+    struct ggml_tensor * get_tensor(const std::string & name, std::vector<uint32_t> ne) {
+        auto it = tensors_map.name_to_idx.find(name);
+        if (it == tensors_map.name_to_idx.end()) {
+            throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
+        }
+        llama_load_tensor & lt = tensors_map.tensors.at(it->second);
+        if (lt.ne != ne) {
+            throw format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
+                         name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
+        }
+
+        return get_tensor_for(lt);
+    }
+
+    struct ggml_tensor * get_tensor_for(llama_load_tensor & lt) {
+        struct ggml_tensor * tensor;
+        if (lt.ne.size() == 2) {
+            tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
+        } else {
+            LLAMA_ASSERT(lt.ne.size() == 1);
+            tensor = ggml_new_tensor_1d(ggml_ctx, lt.type, lt.ne.at(0));
+        }
+        LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
+        lt.ggml_tensor = tensor;
+        num_ggml_tensors_created++;
+        return tensor;
+    }
+
+    void done_getting_tensors() {
+        if (num_ggml_tensors_created != tensors_map.tensors.size()) {
+            throw std::string("llama.cpp: file contained more tensors than expected");
+        }
+    }
+
+    void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
+        size_t data_size = 0;
+        for (const llama_load_tensor & lt : tensors_map.tensors) {
+            data_size += lt.size;
+        }
+
+        if (use_mmap) {
+            mapping.reset(new llama_mmap(&file_loaders.at(0)->file));
+            if (!lmlock) {
+                // Don't call the callback since the actual loading will be lazy
+                // and we can't measure it.
+                progress_callback = NULL;
+            }
+            if (lmlock) {
+                lmlock->init(mapping->addr);
+            }
+        }
+
+        size_t done_size = 0;
+        for (llama_load_tensor & lt : tensors_map.tensors) {
+            if (progress_callback) {
+                progress_callback((float) done_size / data_size, progress_callback_user_data);
+            }
+            LLAMA_ASSERT(lt.ggml_tensor); // unused tensors should have been caught by load_data already
+            lt.data = (uint8_t *) lt.ggml_tensor->data;
+            load_data_for(lt);
+            lt.ggml_tensor->data = lt.data;
+            done_size += lt.size;
+            if (use_mmap && lmlock) {
+                lmlock->grow_to(done_size);
+            }
+        }
+        if (progress_callback) {
+            progress_callback(1.0f, progress_callback_user_data);
+        }
+    }
+
+    void load_data_for(llama_load_tensor & lt) {
+        if (use_mmap) {
+            LLAMA_ASSERT(lt.shards.size() == 1);
+            lt.data = (uint8_t *) mapping->addr + lt.shards.at(0).file_off;
+        } else if (lt.split_type == SPLIT_NONE) {
+            llama_file & file = file_loaders.at(lt.shards.at(0).file_idx)->file;
+            file.seek(lt.shards.at(0).file_off, SEEK_SET);
+            file.read_raw(lt.data, lt.size);
+        } else if (lt.split_type == SPLIT_BY_ROWS) {
+            size_t offset = 0;
+            for (llama_load_tensor_shard & shard : lt.shards) {
+                llama_file & file = file_loaders.at(shard.file_idx)->file;
+                file.seek(shard.file_off, SEEK_SET);
+                file.read_raw(lt.data + offset, shard.size);
+                offset += shard.size;
+            }
+            LLAMA_ASSERT(offset == lt.size);
+        } else if (lt.split_type == SPLIT_BY_COLUMNS) {
+            // Let's load the data into temporary buffers to ensure the OS performs large loads.
+            std::vector<llama_buffer> tmp_bufs;
+            tmp_bufs.resize(lt.shards.size());
+            for (size_t i = 0; i < lt.shards.size(); i++) {
+                llama_load_tensor_shard & shard = lt.shards.at(i);
+                llama_file & file = file_loaders.at(shard.file_idx)->file;
+                file.seek(shard.file_off, SEEK_SET);
+                tmp_bufs.at(i).resize(shard.size);
+                file.read_raw(tmp_bufs.at(i).addr, shard.size);
+            }
+            // Then reshape.
+            size_t num_rows = lt.ne.at(1);
+            size_t per_shard_row_size = lt.shards.at(0).size / num_rows;
+            size_t out_offset = 0;
+            for (size_t row = 0; row < num_rows; row++) {
+                for (llama_buffer & tmp_buf : tmp_bufs) {
+                    memcpy(lt.data + out_offset,
+                           tmp_buf.addr + row * per_shard_row_size,
+                           per_shard_row_size);
+                    out_offset += per_shard_row_size;
+                }
+            }
+            LLAMA_ASSERT(out_offset == lt.size);
+        }
+        if (0) {
+            print_checksum(lt);
+        }
+    }
+
+    static void print_checksum(llama_load_tensor & lt) {
+        uint32_t sum = 0;
+        for (size_t i = 0; i < lt.size; i++) {
+            uint8_t byte = lt.data[i];
+            sum = byte + (sum << 6) + (sum << 16) - sum; // sdbm hash
+        }
+        fprintf(stderr, "%s checksum: %#08x (%s, size %zu)\n", lt.name.c_str(), sum,
+                llama_format_tensor_shape(lt.ne).c_str(), lt.size);
+    }
+
+};
+
+
 //
 // kv cache
 //
@@ -262,8 +774,8 @@ static bool kv_cache_init(
     cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);

     struct ggml_init_params params;
-    params.mem_size = cache.buf.size
-    params.mem_buffer = cache.buf.
+    params.mem_size = cache.buf.size;
+    params.mem_buffer = cache.buf.addr;
    params.no_alloc = false;

     cache.ctx = ggml_init(params);
@@ -279,13 +791,6 @@ static bool kv_cache_init(
     return true;
 }

-static void kv_cache_free(struct llama_kv_cache & cache) {
-    if (cache.ctx) {
-        ggml_free(cache.ctx);
-        cache.ctx = nullptr;
-    }
-}
-
 struct llama_context_params llama_context_default_params() {
     struct llama_context_params result = {
         /*.n_ctx =*/ 512,
@@ -294,6 +799,7 @@ struct llama_context_params llama_context_default_params() {
         /*.f16_kv =*/ false,
         /*.logits_all =*/ false,
         /*.vocab_only =*/ false,
+        /*.use_mmap =*/ true,
         /*.use_mlock =*/ false,
         /*.embedding =*/ false,
         /*.progress_callback =*/ nullptr,
@@ -303,243 +809,106 @@ struct llama_context_params llama_context_default_params() {
     return result;
 }

+bool llama_mmap_supported() {
+    return llama_mmap::SUPPORTED;
+}
+
+bool llama_mlock_supported() {
+    return llama_mlock::SUPPORTED;
+}
+
 //
 // model loading
 //

-static
-
-
-
-
-
-
-                          FILE_ATTRIBUTE_NORMAL | FILE_ATTRIBUTE_NOT_CONTENT_INDEXED,
-                          NULL);
-    if (hFile == INVALID_HANDLE_VALUE) return 0;
-    LARGE_INTEGER fileSize;
-    fileSize.QuadPart = -1;
-    GetFileSizeEx(hFile, &fileSize);
-    int64_t length = fileSize.QuadPart;
-    HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
-    CloseHandle(hFile);
-    if (!hMapping) return 0;
-    void *addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
-    CloseHandle(hMapping);
-    if (!addr) return 0;
-#else
-    int fd = open(fname, O_RDONLY);
-    if (fd == -1) return 0;
-    int64_t length = lseek(fd, 0, SEEK_END);
-    void *addr = mmap(NULL, length, PROT_READ, MAP_SHARED, fd, 0);
-    close(fd);
-    if (addr == MAP_FAILED) return 0;
-#endif
-    *mm_length = length;
-    return addr;
+static const char *llama_file_version_name(llama_file_version version) {
+    switch (version) {
+        case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)";
+        case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
+        case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (latest)";
+        default: LLAMA_ASSERT(false);
+    }
 }

-static
-
-
-
-
-
+static const char *llama_ftype_name(enum llama_ftype ftype) {
+    switch (ftype) {
+        case LLAMA_FTYPE_ALL_F32: return "all F32";
+        case LLAMA_FTYPE_MOSTLY_F16: return "mostly F16";
+        case LLAMA_FTYPE_MOSTLY_Q4_0: return "mostly Q4_0";
+        case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
+        case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
+            return "mostly Q4_1, some F16";
+        default: return "unknown, may not work";
+    }
 }

-static
-
-
-
-
-
-
-
-        path, got, want);
-    return false;
+static const char *llama_model_type_name(e_model type) {
+    switch (type) {
+        case MODEL_7B: return "7B";
+        case MODEL_13B: return "13B";
+        case MODEL_30B: return "30B";
+        case MODEL_65B: return "65B";
+        default: LLAMA_ASSERT(false);
+    }
 }

-static
+static void llama_model_load_internal(
         const std::string & fname,
         llama_context & lctx,
         int n_ctx,
-        int n_parts,
         ggml_type memory_type,
+        bool use_mmap,
+        bool use_mlock,
         bool vocab_only,
         llama_progress_callback progress_callback,
-        void *progress_callback_user_data) {
-    fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
+        void * progress_callback_user_data) {

     lctx.t_start_us = ggml_time_us();

-
-    auto & vocab = lctx.vocab;
+    std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap, vocab_only));

-
-
-
-
-
-
-    std::vector<char> f_buf(1024*1024);
-    fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
-
-    fin.seekg(0, fin.end);
-    const size_t file_size = fin.tellg();
-    fin.seekg(0);
+    lctx.vocab = std::move(ml->file_loaders.at(0)->vocab);
+    auto & model = lctx.model;
+    model.hparams = ml->file_loaders.at(0)->hparams;
+    llama_file_version file_version = ml->file_loaders.at(0)->file_version;
+    auto & hparams = model.hparams;
+    uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;

-    // verify magic
     {
-
-
-
-
-
-            return false;
+        switch (hparams.n_layer) {
+            case 32: model.type = e_model::MODEL_7B; break;
+            case 40: model.type = e_model::MODEL_13B; break;
+            case 60: model.type = e_model::MODEL_30B; break;
+            case 80: model.type = e_model::MODEL_65B; break;
         }
-        if (magic != LLAMA_FILE_MAGIC) {
-            return report_bad_magic(fname.c_str(), magic, LLAMA_FILE_MAGIC);
-        }
-
-        uint32_t format_version;
-        fin.read((char *) &format_version, sizeof(format_version));
-
-        if (format_version != LLAMA_FILE_VERSION) {
-            fprintf(stderr, "%s: invalid model file '%s' (unsupported format version %" PRIu32 ", expected %d)\n",
-                    __func__, fname.c_str(), format_version, LLAMA_FILE_VERSION);
-            return false;
-        }
-    }
-
-    int n_ff = 0;
-
-    // load hparams
-    {
-        auto & hparams = model.hparams;
-
-        fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
-        //fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
-        fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
-        fin.read((char *) &hparams.n_mult, sizeof(hparams.n_mult));
-        fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
-        fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-        fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
-        fin.read((char *) &hparams.f16, sizeof(hparams.f16));

         hparams.n_ctx = n_ctx;
-
-        n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
-
-        if (n_parts < 1) {
-            n_parts = LLAMA_N_PARTS.at(hparams.n_embd);
-        }
-
-        // temp warning to tell the user to use "--n_parts"
-        if (hparams.f16 == 4 && n_parts != 1) {
-            fprintf(stderr, "%s: GPTQ model detected - are you sure n_parts should be %d? we normally expect it to be 1\n", __func__, n_parts);
-            fprintf(stderr, "%s: use '--n_parts 1' if necessary\n", __func__);
-        }
-
-        if (hparams.n_layer == 32) {
-            model.type = e_model::MODEL_7B;
-        }
-
-        if (hparams.n_layer == 40) {
-            model.type = e_model::MODEL_13B;
-        }
-
-        if (hparams.n_layer == 60) {
-            model.type = e_model::MODEL_30B;
-        }
-
-        if (hparams.n_layer == 80) {
-            model.type = e_model::MODEL_65B;
-        }
-
-        fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab);
-        fprintf(stderr, "%s: n_ctx = %d\n", __func__, hparams.n_ctx);
-        fprintf(stderr, "%s: n_embd = %d\n", __func__, hparams.n_embd);
-        fprintf(stderr, "%s: n_mult = %d\n", __func__, hparams.n_mult);
-        fprintf(stderr, "%s: n_head = %d\n", __func__, hparams.n_head);
-        fprintf(stderr, "%s: n_layer = %d\n", __func__, hparams.n_layer);
-        fprintf(stderr, "%s: n_rot = %d\n", __func__, hparams.n_rot);
-        fprintf(stderr, "%s: f16 = %d\n", __func__, hparams.f16);
-        fprintf(stderr, "%s: n_ff = %d\n", __func__, n_ff);
-        fprintf(stderr, "%s: n_parts = %d\n", __func__, n_parts);
-        fprintf(stderr, "%s: type = %d\n", __func__, model.type);
     }

-    // load vocab
     {
-
-
-
-
-
-
-
-
-
-
-
-
-                word.assign(tmp.data(), len);
-            } else {
-                word.clear();
-            }
-
-            float score;
-            fin.read((char *) &score, sizeof(score));
-
-            vocab.token_to_id[word] = i;
-
-            auto &tok_score = vocab.id_to_token[i];
-            tok_score.tok = word;
-            tok_score.score = score;
-        }
+        fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version));
+        fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab);
+        fprintf(stderr, "%s: n_ctx = %u\n", __func__, hparams.n_ctx);
+        fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd);
+        fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult);
+        fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head);
+        fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer);
+        fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot);
+        fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
+        fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
+        fprintf(stderr, "%s: n_parts = %zu\n", __func__, ml->file_loaders.size());
+        fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
     }

     if (vocab_only) {
-        return
-    }
-
-    // for the big tensors, we have the option to store the data in 16-bit floats or quantized
-    // in order to save memory and also to speed up the computation
-    // wtype is for per-layer weights, while vtype is for other weights
-    ggml_type wtype, vtype;
-    switch (model.hparams.f16) {
-        case 0: wtype = vtype = GGML_TYPE_F32; break;
-        case 1: wtype = vtype = GGML_TYPE_F16; break;
-        case 2: wtype = vtype = GGML_TYPE_Q4_0; break;
-        case 3: wtype = vtype = GGML_TYPE_Q4_1; break;
-        case 4: wtype = GGML_TYPE_Q4_1; vtype = GGML_TYPE_F16; break;
-        default:
-        {
-            fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n",
-                    __func__, fname.c_str(), model.hparams.f16);
-            return false;
-        }
+        return;
     }

-    // map model into memory
-    char *mm_addr = NULL;
-    model.mm_addr = mmap_file(fname.c_str(), &model.mm_length);
-    if (model.mm_addr == NULL) {
-        fprintf(stderr, "%s: failed to mmap '%s'\n", __func__, fname.c_str());
-        return false;
-    }
-    mm_addr = (char *)model.mm_addr;
-    fprintf(stderr, "%s: ggml map size = %6.2f MB\n", __func__, model.mm_length/(1024.0*1024.0));
-
     auto & ctx = model.ctx;

-    size_t ctx_size
-
-
-        const int n_layer = hparams.n_layer;
-        ctx_size += (5 + 10*n_layer)*256; // object overhead
-        fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0);
-    }
+    size_t ctx_size, mmapped_size;
+    ml->calc_sizes(&ctx_size, &mmapped_size);
+    fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0);

     // print memory requirements
     {
@@ -548,14 +917,14 @@ static bool llama_model_load(
         // this is the total memory required to run the inference
         const size_t mem_required =
             ctx_size +
-
-            MEM_REQ_SCRATCH0.at(model.type) +
-            MEM_REQ_SCRATCH1.at(model.type) +
-            MEM_REQ_EVAL.at
+            mmapped_size +
+            MEM_REQ_SCRATCH0().at(model.type) +
+            MEM_REQ_SCRATCH1().at(model.type) +
+            MEM_REQ_EVAL().at(model.type);

         // this is the memory required by one llama_state
         const size_t mem_required_state =
-            scale*MEM_REQ_KV_SELF.at(model.type);
+            scale*MEM_REQ_KV_SELF().at(model.type);

         fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
                 mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
@@ -564,17 +933,20 @@ static bool llama_model_load(
     // create the ggml context
     {
         lctx.model.buf.resize(ctx_size);
+        if (use_mlock) {
+            lctx.model.mlock_buf.init(lctx.model.buf.addr);
+            lctx.model.mlock_buf.grow_to(lctx.model.buf.size);
+        }

         struct ggml_init_params params = {
-            /*.mem_size =*/ lctx.model.buf.size
-            /*.mem_buffer =*/ lctx.model.buf.
-            /*.no_alloc =*/
+            /*.mem_size =*/ lctx.model.buf.size,
+            /*.mem_buffer =*/ lctx.model.buf.addr,
+            /*.no_alloc =*/ ml->use_mmap,
         };

         model.ctx = ggml_init(params);
         if (!model.ctx) {
-
-            return false;
+            throw format("ggml_init() failed");
         }
     }

@@ -582,161 +954,71 @@ static bool llama_model_load(
     {
         const auto & hparams = model.hparams;

-        const
-        const
-        const
-
-        model.layers.resize(n_layer);
-
-        model.tok_embeddings = ggml_new_tensor_2d(ctx, vtype, n_embd, n_vocab);
+        const uint32_t n_embd = hparams.n_embd;
+        const uint32_t n_layer = hparams.n_layer;
+        const uint32_t n_vocab = hparams.n_vocab;

-
-        model.output = ggml_new_tensor_2d(ctx, vtype, n_embd, n_vocab);
+        ml->ggml_ctx = ctx;

-
-        model.
+        model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab});
+        model.norm = ml->get_tensor("norm.weight", {n_embd});
+        model.output = ml->get_tensor("output.weight", {n_embd, n_vocab});

-        model.
-
-
-        for (int i = 0; i < n_layer; ++i) {
+        model.layers.resize(n_layer);
+        for (uint32_t i = 0; i < n_layer; ++i) {
             auto & layer = model.layers[i];

-
-
-            layer.wq = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
-            layer.wk = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
-            layer.wv = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
-            layer.wo = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
-
-            layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
+            std::string layers_i = "layers." + std::to_string(i);

-            layer.
-            layer.w2 = ggml_new_tensor_2d(ctx, wtype, n_ff, n_embd);
-            layer.w3 = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ff);
+            layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd});

-
-
+            layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd});
+            layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd});
+            layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd});
+            layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd});

-
-            model.tensors["layers." + std::to_string(i) + ".attention.wk.weight"] = layer.wk;
-            model.tensors["layers." + std::to_string(i) + ".attention.wv.weight"] = layer.wv;
-            model.tensors["layers." + std::to_string(i) + ".attention.wo.weight"] = layer.wo;
+            layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd});

-
-
-
-            model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight"] = layer.w2;
-            model.tensors["layers." + std::to_string(i) + ".feed_forward.w3.weight"] = layer.w3;
+            layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff});
+            layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd});
+            layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff});
         }
     }

-
+    ml->done_getting_tensors();

-
-
+    // populate `tensors_by_name`
+    for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+        model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
     }

-
-
-    // load weights
-    {
-        size_t total_size = 0;
-        model.n_loaded = 0;
-
-        while (true) {
-            int32_t n_dims;
-            int32_t length;
-            int32_t ftype;
-
-            fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
-            fin.read(reinterpret_cast<char *>(&length), sizeof(length));
-            fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
-
-            if (fin.eof()) {
-                break;
-            }
-
-            int32_t nelements = 1;
-            int32_t ne[2] = { 1, 1 };
-            for (int i = 0; i < n_dims; ++i) {
-                fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
-                nelements *= ne[i];
-            }
-
-            std::string name(length, 0);
-            fin.read(&name[0], length);
-
-            if (model.tensors.find(name.data()) == model.tensors.end()) {
-                fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
-                return false;
-            }
-
-            auto tensor = model.tensors[name.data()];
-
-            if (ggml_nelements(tensor) != nelements) {
-                fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
-                return false;
-            }
-            if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
-                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%" PRId64 ", %" PRId64 "], expected [%d, %d]\n",
-                        __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
-                return false;
-            }
-            if (0) {
-                static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
-                fprintf(stderr, "%24s - [%5d, %5d], type = %6s\n", name.data(), ne[0], ne[1], ftype_str[ftype]);
-            }
-
-            switch (ftype) {
-                case 0: // f32
-                case 1: // f16
-                    break;
-                case 2: // q4_0
-                case 3: // q4_1
-                    assert(ne[0] % 64 == 0);
-                    break;
-                default:
-                    fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
-                    return false;
-            };
-
-            // load the tensor data into memory without copying or reading it
-            size_t offset = fin.tellg();
-            size_t tensor_data_size = ggml_nbytes(tensor);
-            offset = (offset + 31) & -32;
-            tensor->data = mm_addr + offset;
-            fin.seekg(offset + tensor_data_size);
-            total_size += tensor_data_size;
-            model.n_loaded++;
-
-            // progress
-            if (progress_callback) {
-                double current_progress = size_t(fin.tellg()) / double(file_size);
-                progress_callback(current_progress, progress_callback_user_data);
-            }
-        }
-
-        fin.close();
+    ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);

-
-    if (model.n_loaded == 0) {
-        fprintf(stderr, "%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__);
-    } else if (model.n_loaded != (int) model.tensors.size()) {
-        fprintf(stderr, "%s: ERROR not all tensors loaded from model file - expected %zu, got %d\n", __func__, model.tensors.size(), model.n_loaded);
-        return false;
-    }
-    }
+    model.mapping = std::move(ml->mapping);

     // loading time will be recalculate after the first eval, so
     // we take page faults deferred by mmap() into consideration
     lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
+}

-
-
+static bool llama_model_load(
+        const std::string & fname,
+        llama_context & lctx,
+        int n_ctx,
+        ggml_type memory_type,
+        bool use_mmap,
+        bool use_mlock,
+        bool vocab_only,
+        llama_progress_callback progress_callback,
+        void *progress_callback_user_data) {
+    try {
+        llama_model_load_internal(fname, lctx, n_ctx, memory_type, use_mmap, use_mlock,
+                                  vocab_only, progress_callback, progress_callback_user_data);
+        return true;
+    } catch (const std::string & err) {
+        fprintf(stderr, "error loading model: %s\n", err.c_str());
+        return false;
     }
-
-    return true;
 }

 // evaluate the transformer
@@ -774,8 +1056,8 @@ static bool llama_eval_internal(
     auto & buf_compute = lctx.buf_compute;

     struct ggml_init_params params = {
-        /*.mem_size =*/ buf_compute.size
-        /*.mem_buffer =*/ buf_compute.
+        /*.mem_size =*/ buf_compute.size,
+        /*.mem_buffer =*/ buf_compute.addr,
         /*.no_alloc =*/ false,
     };

@@ -1061,7 +1343,7 @@ struct llama_tokenizer {
         size_t offs = 0;
         while (offs < text.size()) {
             llama_sp_symbol sym;
-            size_t char_len =
+            size_t char_len = std::min(text.size() - offs, utf8_len(text[offs]));
             sym.text = text.c_str() + offs;
             sym.n = char_len;
             offs += char_len;
@@ -1236,7 +1518,7 @@ static llama_vocab::id llama_sample_top_p_top_k(
         }
     }

-    sample_top_k(logits_id, top_k > 0 ?
+    sample_top_k(logits_id, top_k > 0 ? std::min(top_k, n_logits) : n_logits);

     // compute probs for the top k tokens
     std::vector<float> probs;
@@ -1284,298 +1566,118 @@ static llama_vocab::id llama_sample_top_p_top_k(
|
|
1284
1566
|
// quantization
|
1285
1567
|
//
|
1286
1568
|
|
1287
|
-
|
1288
|
-
|
1289
|
-
|
1290
|
-
|
1291
|
-
|
1292
|
-
|
1293
|
-
case 3: type = GGML_TYPE_Q4_1; break;
|
1294
|
-
default: fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype); return 1;
|
1569
|
+
static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype) {
|
1570
|
+
ggml_type quantized_type;
|
1571
|
+
switch (ftype) {
|
1572
|
+
case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
|
1573
|
+
case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
|
1574
|
+
default: throw format("invalid output file type %d\n", ftype);
|
1295
1575
|
};
|
1296
1576
|
|
1297
|
-
|
1298
|
-
|
1299
|
-
|
1300
|
-
|
1301
|
-
|
1302
|
-
|
1303
|
-
|
1304
|
-
|
1305
|
-
|
1306
|
-
|
1307
|
-
|
1308
|
-
|
1309
|
-
|
1310
|
-
|
1311
|
-
|
1312
|
-
|
1313
|
-
|
1314
|
-
|
1315
|
-
|
1316
|
-
|
1317
|
-
|
1318
|
-
|
1319
|
-
|
1320
|
-
|
1321
|
-
|
1322
|
-
|
1323
|
-
|
1324
|
-
|
1325
|
-
|
1326
|
-
|
1327
|
-
|
1328
|
-
|
1329
|
-
|
1330
|
-
|
1331
|
-
|
1332
|
-
|
1333
|
-
|
1334
|
-
|
1335
|
-
|
1336
|
-
|
1337
|
-
|
1338
|
-
|
1339
|
-
|
1340
|
-
|
1341
|
-
|
1342
|
-
|
1343
|
-
|
1344
|
-
|
1345
|
-
|
1346
|
-
|
1347
|
-
// load hparams
|
1348
|
-
{
|
1349
|
-
finp.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
|
1350
|
-
//finp.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
|
1351
|
-
finp.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
|
1352
|
-
finp.read((char *) &hparams.n_mult, sizeof(hparams.n_mult));
|
1353
|
-
finp.read((char *) &hparams.n_head, sizeof(hparams.n_head));
|
1354
|
-
finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
|
1355
|
-
finp.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
|
1356
|
-
finp.read((char *) &hparams.f16, sizeof(hparams.f16));
|
1357
|
-
|
1358
|
-
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
|
1359
|
-
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
|
1360
|
-
printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
|
1361
|
-
printf("%s: n_mult = %d\n", __func__, hparams.n_mult);
|
1362
|
-
printf("%s: n_head = %d\n", __func__, hparams.n_head);
|
1363
|
-
printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
|
1364
|
-
printf("%s: f16 = %d\n", __func__, hparams.f16);
|
1365
|
-
|
1366
|
-
fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
|
1367
|
-
//fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
|
1368
|
-
fout.write((char *) &hparams.n_embd, sizeof(hparams.n_embd));
|
1369
|
-
fout.write((char *) &hparams.n_mult, sizeof(hparams.n_mult));
|
1370
|
-
fout.write((char *) &hparams.n_head, sizeof(hparams.n_head));
|
1371
|
-
fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
|
1372
|
-
fout.write((char *) &hparams.n_rot, sizeof(hparams.n_rot));
|
1373
|
-
fout.write((char *) &itype, sizeof(hparams.f16));
|
1374
|
-
}
|
1375
|
-
|
1376
|
-
// load vocab
|
1377
|
-
{
|
1378
|
-
const int32_t n_vocab = hparams.n_vocab;
|
1379
|
-
|
1380
|
-
if (n_vocab != hparams.n_vocab) {
|
1381
|
-
fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
|
1382
|
-
__func__, fname_inp.c_str(), n_vocab, hparams.n_vocab);
|
1383
|
-
return false;
|
1384
|
-
}
|
1385
|
-
|
1386
|
-
std::vector<char> word(32);
|
1387
|
-
vocab.id_to_token.resize(n_vocab);
|
1388
|
-
for (int i = 0; i < n_vocab; i++) {
|
1389
|
-
uint32_t len;
|
1390
|
-
finp.read ((char *) &len, sizeof(len));
|
1391
|
-
fout.write((char *) &len, sizeof(len));
|
1392
|
-
|
1393
|
-
word.resize(len);
|
1394
|
-
finp.read ((char *) &word[0], len);
|
1395
|
-
fout.write((char *) &word[0], len);
|
1396
|
-
|
1397
|
-
float score;
|
1398
|
-
finp.read ((char *) &score, sizeof(score));
|
1399
|
-
fout.write((char *) &score, sizeof(score));
|
1400
|
-
|
1401
|
-
vocab.token_to_id[word.data()] = i;
|
1402
|
-
|
1403
|
-
auto &tok_score = vocab.id_to_token[i];
|
1404
|
-
tok_score.tok = word.data();
|
1405
|
-
tok_score.score = score;
|
1406
|
-
}
|
1407
|
-
}
|
1408
|
-
|
1409
|
-
// load weights
|
1410
|
-
{
|
1411
|
-
size_t total_size_org = 0;
|
1412
|
-
size_t total_size_new = 0;
|
1413
|
-
|
1414
|
-
std::vector<float> work;
|
1415
|
-
|
1416
|
-
std::vector<uint8_t> data_u8;
|
1417
|
-
std::vector<ggml_fp16_t> data_f16;
|
1418
|
-
std::vector<float> data_f32;
|
1419
|
-
|
1420
|
-
std::vector<int64_t> hist_all(1 << 4, 0);
|
1421
|
-
|
1422
|
-
while (true) {
|
1423
|
-
int32_t n_dims;
|
1424
|
-
int32_t length;
|
1425
|
-
int32_t ftype;
|
1426
|
-
|
1427
|
-
finp.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
|
1428
|
-
finp.read(reinterpret_cast<char *>(&length), sizeof(length));
|
1429
|
-
finp.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
|
1430
|
-
|
1431
|
-
if (finp.eof()) {
|
1432
|
-
break;
|
1433
|
-
}
|
1434
|
-
|
1435
|
-
int32_t nelements = 1;
|
1436
|
-
int32_t ne[2] = { 1, 1 };
|
1437
|
-
for (int i = 0; i < n_dims; ++i) {
|
1438
|
-
finp.read (reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
|
1439
|
-
nelements *= ne[i];
|
1440
|
-
}
|
1441
|
-
|
1442
|
-
std::string name(length, 0);
|
1443
|
-
finp.read (&name[0], length);
|
1444
|
-
|
1445
|
-
{
|
1446
|
-
// ensure tensor data is aligned
|
1447
|
-
uint64_t offset = finp.tellg();
|
1448
|
-
offset = (offset + 31) & -32;
|
1449
|
-
finp.seekg(offset);
|
1450
|
-
}
|
1451
|
-
|
1452
|
-
{
|
1453
|
-
static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
|
1454
|
-
printf("%48s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ftype_str[ftype]);
|
1455
|
-
}
|
1456
|
-
|
1457
|
-
// regexes of tensor names to be quantized
|
1458
|
-
const std::vector<std::string> k_names = {
|
1459
|
-
".*weight",
|
1460
|
-
};
|
1461
|
-
|
1462
|
-
bool quantize = false;
|
1463
|
-
for (const auto & s : k_names) {
|
1464
|
-
if (std::regex_match(name, std::regex(s))) {
|
1465
|
-
quantize = true;
|
1466
|
-
break;
|
1467
|
-
}
|
1468
|
-
}
|
1469
|
-
|
1470
|
-
// quantize only 2D tensors
|
1471
|
-
quantize &= (n_dims == 2);
|
1472
|
-
|
1473
|
-
if (quantize) {
|
1474
|
-
if (ftype != 0 && ftype != 1) {
|
1475
|
-
fprintf(stderr, "%s: unsupported ftype %d for integer quantization\n", __func__, ftype);
|
1476
|
-
return false;
|
1477
|
-
}
|
1478
|
-
|
1479
|
-
if (ftype == 1) {
|
1480
|
-
data_f16.resize(nelements);
|
1481
|
-
finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
|
1482
|
-
data_f32.resize(nelements);
|
1483
|
-
for (int i = 0; i < nelements; ++i) {
|
1484
|
-
data_f32[i] = ggml_fp16_to_fp32(data_f16[i]);
|
1485
|
-
}
|
1486
|
-
} else {
|
1487
|
-
data_f32.resize(nelements);
|
1488
|
-
finp.read(reinterpret_cast<char *>(data_f32.data()), nelements * sizeof(float));
|
1577
|
+
std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false,
|
1578
|
+
/*vocab_only*/ false));
|
1579
|
+
llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
|
1580
|
+
|
1581
|
+
size_t total_size_org = 0;
|
1582
|
+
size_t total_size_new = 0;
|
1583
|
+
std::vector<int64_t> hist_all(1 << 4, 0);
|
1584
|
+
|
1585
|
+
size_t idx = 0;
|
1586
|
+
for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) {
|
1587
|
+
llama_buffer read_data;
|
1588
|
+
read_data.resize(tensor.size);
|
1589
|
+
tensor.data = read_data.addr;
|
1590
|
+
model_loader->load_data_for(tensor);
|
1591
|
+
|
1592
|
+
printf("[%4zu/%4zu] %36s - %16s, type = %6s, ",
|
1593
|
+
++idx, model_loader->tensors_map.tensors.size(),
|
1594
|
+
tensor.name.c_str(), llama_format_tensor_shape(tensor.ne).c_str(),
|
1595
|
+
ggml_type_name(tensor.type));
|
1596
|
+
|
1597
|
+
// This used to be a regex, but <regex> has an extreme cost to compile times.
|
1598
|
+
bool quantize = tensor.name.rfind("weight") == tensor.name.size() - 6; // ends with 'weight'?
|
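The rfind comparison above replaces the old ".*weight" regex: it simply asks whether the tensor name ends with "weight". The same test written as a reusable helper, with a hypothetical name and illustrative inputs:

    #include <string>

    // Hypothetical helper equivalent to the inline rfind test
    // (std::string::ends_with only arrives in C++20).
    static bool ends_with(const std::string & s, const std::string & suffix) {
        return s.size() >= suffix.size() &&
               s.compare(s.size() - suffix.size(), suffix.size(), suffix) == 0;
    }

    // ends_with("layers.0.attention.wq.weight", "weight") -> true
    // ends_with("tok_embeddings.bias", "weight")          -> false (illustrative name)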
1599
|
+
|
1600
|
+
// quantize only 2D tensors
|
1601
|
+
quantize &= (tensor.ne.size() == 2);
|
1602
|
+
|
1603
|
+
enum ggml_type new_type;
|
1604
|
+
void * new_data;
|
1605
|
+
size_t new_size;
|
1606
|
+
llama_buffer work;
|
1607
|
+
|
1608
|
+
if (!quantize) {
|
1609
|
+
new_type = tensor.type;
|
1610
|
+
new_data = tensor.data;
|
1611
|
+
new_size = tensor.size;
|
1612
|
+
printf("size = %8.3f MB\n", tensor.size/1024.0/1024.0);
|
1613
|
+
} else {
|
1614
|
+
new_type = quantized_type;
|
1615
|
+
float * f32_data;
|
1616
|
+
size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
|
1617
|
+
llama_buffer f32_conv_buf;
|
1618
|
+
if (tensor.type == GGML_TYPE_F32) {
|
1619
|
+
f32_data = (float *) tensor.data;
|
1620
|
+
} else if (tensor.type == GGML_TYPE_F16) {
|
1621
|
+
f32_conv_buf.resize(nelements * sizeof(float));
|
1622
|
+
f32_data = (float *) f32_conv_buf.addr;
|
1623
|
+
auto f16_data = (const ggml_fp16_t *) tensor.data;
|
1624
|
+
for (size_t i = 0; i < nelements; i++) {
|
1625
|
+
f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
|
1489
1626
|
}
|
1490
|
-
|
1491
|
-
ftype = itype;
|
1492
1627
|
} else {
|
1493
|
-
|
1494
|
-
|
1495
|
-
data_u8.resize(nelements*bpe);
|
1496
|
-
finp.read(reinterpret_cast<char *>(data_u8.data()), nelements * bpe);
|
1628
|
+
throw format("type %s unsupported for integer quantization", ggml_type_name(tensor.type));
|
1497
1629
|
}
|
1498
1630
|
|
1499
|
-
|
1500
|
-
|
1501
|
-
|
1502
|
-
|
1503
|
-
|
1631
|
+
printf("quantizing .. ");
|
1632
|
+
fflush(stdout);
|
1633
|
+
|
1634
|
+
work.resize(nelements * 4); // upper bound on size
|
1635
|
+
new_data = work.addr;
|
1636
|
+
std::vector<int64_t> hist_cur(1 << 4, 0);
|
1637
|
+
|
1638
|
+
switch (new_type) {
|
1639
|
+
case GGML_TYPE_Q4_0:
|
1640
|
+
{
|
1641
|
+
new_size = ggml_quantize_q4_0(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
|
1642
|
+
} break;
|
1643
|
+
case GGML_TYPE_Q4_1:
|
1644
|
+
{
|
1645
|
+
new_size = ggml_quantize_q4_1(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
|
1646
|
+
} break;
|
1647
|
+
default:
|
1648
|
+
LLAMA_ASSERT(false);
|
1504
1649
|
}
|
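Two details of the loop above worth spelling out: work.resize(nelements * 4) reserves the f32 size of the tensor, a safe upper bound because 4-bit output plus per-block scales is always smaller than 4 bytes per element; and hist_cur is a 16-slot histogram, one bin per possible 4-bit code, filled in by the ggml_quantize_* calls. A small hedged sketch of turning such a histogram into the fractions printed below (normalize_hist is a hypothetical helper):

    #include <cstdint>
    #include <vector>

    // Converts raw bin counts into fractions of all quantized values.
    static std::vector<double> normalize_hist(const std::vector<int64_t> & hist,
                                              size_t nelements) {
        std::vector<double> out(hist.size(), 0.0);
        if (nelements == 0) {
            return out;
        }
        for (size_t i = 0; i < hist.size(); ++i) {
            out[i] = double(hist[i]) / double(nelements);
        }
        return out;
    }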
1505
|
-
fout.write(&name[0], length);
|
1506
1650
|
|
1507
|
-
|
1508
|
-
|
1509
|
-
|
1510
|
-
offset = (offset + 31) & -32;
|
1511
|
-
fout.seekp(offset);
|
1651
|
+
printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
|
1652
|
+
for (size_t i = 0; i < hist_cur.size(); i++) {
|
1653
|
+
hist_all[i] += hist_cur[i];
|
1512
1654
|
}
|
1513
1655
|
|
1514
|
-
|
1515
|
-
printf("
|
1516
|
-
work.resize(nelements); // for quantization
|
1517
|
-
|
1518
|
-
size_t cur_size = 0;
|
1519
|
-
std::vector<int64_t> hist_cur(1 << 4, 0);
|
1520
|
-
|
1521
|
-
switch (type) {
|
1522
|
-
case GGML_TYPE_Q4_0:
|
1523
|
-
{
|
1524
|
-
cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
|
1525
|
-
} break;
|
1526
|
-
case GGML_TYPE_Q4_1:
|
1527
|
-
{
|
1528
|
-
cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
|
1529
|
-
} break;
|
1530
|
-
default:
|
1531
|
-
{
|
1532
|
-
fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, type);
|
1533
|
-
return false;
|
1534
|
-
}
|
1535
|
-
}
|
1536
|
-
|
1537
|
-
fout.write(reinterpret_cast<char *>(work.data()), cur_size);
|
1538
|
-
total_size_new += cur_size;
|
1539
|
-
|
1540
|
-
printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
|
1541
|
-
for (int i = 0; i < (int) hist_cur.size(); ++i) {
|
1542
|
-
hist_all[i] += hist_cur[i];
|
1543
|
-
}
|
1544
|
-
|
1545
|
-
for (int i = 0; i < (int) hist_cur.size(); ++i) {
|
1546
|
-
printf("%5.3f ", hist_cur[i] / float(nelements));
|
1547
|
-
}
|
1548
|
-
printf("\n");
|
1549
|
-
} else {
|
1550
|
-
printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0);
|
1551
|
-
fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size());
|
1552
|
-
total_size_new += data_u8.size();
|
1656
|
+
for (size_t i = 0; i < hist_cur.size(); i++) {
|
1657
|
+
printf("%5.3f ", hist_cur[i] / float(nelements));
|
1553
1658
|
}
|
1554
|
-
|
1555
|
-
total_size_org += nelements * sizeof(float);
|
1659
|
+
printf("\n");
|
1556
1660
|
}
|
1661
|
+
total_size_org += tensor.size;
|
1662
|
+
total_size_new += new_size;
|
1663
|
+
file_saver.write_tensor(tensor, new_type, new_data, new_size);
|
1664
|
+
}
|
1557
1665
|
|
1558
|
-
|
1559
|
-
|
1666
|
+
printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
|
1667
|
+
printf("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
|
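The two totals printed above already determine the overall compression factor; as a rough illustrative figure, a 16-bit model quantized to Q4_0 shrinks by roughly 3x, since each block of 32 weights goes from 64 bytes to about 20. A trivial hedged helper, not part of the package:

    #include <cstddef>

    // Hypothetical helper: how many times smaller the quantized file is.
    static double compression_ratio(size_t total_size_org, size_t total_size_new) {
        return total_size_new == 0 ? 0.0
                                   : double(total_size_org) / double(total_size_new);
    }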
1560
1668
|
|
1561
|
-
|
1562
|
-
|
1563
|
-
|
1564
|
-
|
1565
|
-
|
1669
|
+
{
|
1670
|
+
int64_t sum_all = 0;
|
1671
|
+
for (size_t i = 0; i < hist_all.size(); i++) {
|
1672
|
+
sum_all += hist_all[i];
|
1673
|
+
}
|
1566
1674
|
|
1567
|
-
|
1568
|
-
|
1569
|
-
|
1570
|
-
}
|
1571
|
-
printf("\n");
|
1675
|
+
printf("%s: hist: ", __func__);
|
1676
|
+
for (size_t i = 0; i < hist_all.size(); i++) {
|
1677
|
+
printf("%5.3f ", hist_all[i] / float(sum_all));
|
1572
1678
|
}
|
1679
|
+
printf("\n");
|
1573
1680
|
}
|
1574
|
-
|
1575
|
-
finp.close();
|
1576
|
-
fout.close();
|
1577
|
-
|
1578
|
-
return true;
|
1579
1681
|
}
|
1580
1682
|
|
1581
1683
|
//
|
@@ -1593,32 +1695,36 @@ struct llama_context * llama_init_from_file(
|
|
1593
1695
|
params.seed = time(NULL);
|
1594
1696
|
}
|
1595
1697
|
|
1698
|
+
unsigned cur_percentage = 0;
|
1699
|
+
if (params.progress_callback == NULL) {
|
1700
|
+
params.progress_callback_user_data = &cur_percentage;
|
1701
|
+
params.progress_callback = [](float progress, void * ctx) {
|
1702
|
+
unsigned * cur_percentage_p = (unsigned *) ctx;
|
1703
|
+
unsigned percentage = (unsigned) (100 * progress);
|
1704
|
+
while (percentage > *cur_percentage_p) {
|
1705
|
+
++*cur_percentage_p;
|
1706
|
+
fprintf(stderr, ".");
|
1707
|
+
fflush(stderr);
|
1708
|
+
if (percentage >= 100) {
|
1709
|
+
fprintf(stderr, "\n");
|
1710
|
+
}
|
1711
|
+
}
|
1712
|
+
};
|
1713
|
+
}
|
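The block above installs a default dot-printing progress reporter only when the caller has not supplied one; callers can hook loading progress themselves through the same two fields. A sketch under that assumption (the wrapper name and model path are placeholders):

    #include <cstdio>
    #include "llama.h"

    static llama_context * load_with_progress(const char * path_model) {
        llama_context_params params = llama_context_default_params();
        params.progress_callback = [](float progress, void * /*user_data*/) {
            fprintf(stderr, "\rloading: %3d%%", (int) (100 * progress));
        };
        params.progress_callback_user_data = nullptr;
        return llama_init_from_file(path_model, params);
    }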
1714
|
+
|
1596
1715
|
ctx->rng = std::mt19937(params.seed);
|
1597
1716
|
ctx->logits_all = params.logits_all;
|
1598
1717
|
|
1599
1718
|
ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
|
1600
1719
|
|
1601
|
-
if (!llama_model_load(path_model, *ctx, params.n_ctx,
|
1602
|
-
params.
|
1603
|
-
params.progress_callback_user_data)) {
|
1720
|
+
if (!llama_model_load(path_model, *ctx, params.n_ctx, memory_type,
|
1721
|
+
params.use_mmap, params.use_mlock, params.vocab_only,
|
1722
|
+
params.progress_callback, params.progress_callback_user_data)) {
|
1604
1723
|
fprintf(stderr, "%s: failed to load model\n", __func__);
|
1605
1724
|
llama_free(ctx);
|
1606
1725
|
return nullptr;
|
1607
1726
|
}
|
1608
1727
|
|
1609
|
-
if (params.use_mlock) {
|
1610
|
-
char *err;
|
1611
|
-
if (!ggml_mlock(ctx->model.ctx,
|
1612
|
-
ctx->model.mm_addr,
|
1613
|
-
ctx->model.mm_length,
|
1614
|
-
&err)) {
|
1615
|
-
fprintf(stderr, "%s\n", err);
|
1616
|
-
free(err);
|
1617
|
-
llama_free(ctx);
|
1618
|
-
return nullptr;
|
1619
|
-
}
|
1620
|
-
}
|
1621
|
-
|
1622
1728
|
// reserve memory for context buffers
|
1623
1729
|
if (!params.vocab_only) {
|
1624
1730
|
if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx)) {
|
@@ -1645,50 +1751,289 @@ struct llama_context * llama_init_from_file(
|
|
1645
1751
|
ctx->embedding.resize(hparams.n_embd);
|
1646
1752
|
}
|
1647
1753
|
|
1648
|
-
ctx->buf_compute.resize(MEM_REQ_EVAL.at(ctx->model.type));
|
1754
|
+
ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));
|
1649
1755
|
|
1650
|
-
ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0.at(ctx->model.type));
|
1651
|
-
ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1.at(ctx->model.type));
|
1756
|
+
ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type));
|
1757
|
+
ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
|
1652
1758
|
}
|
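The buffer sizes are now fetched through functions (MEM_REQ_EVAL(), MEM_REQ_SCRATCH0(), MEM_REQ_SCRATCH1()) rather than namespace-scope constants; wrapping the tables in function-local statics is the usual construct-on-first-use pattern, presumably chosen to sidestep static initialization order issues. A generic hedged sketch of the pattern with hypothetical names:

    #include <cstddef>
    #include <map>

    enum class model_kind { small, large };

    static const std::map<model_kind, size_t> & buffer_sizes() {
        // Built on first call, so it is safe to use even while other
        // translation units are still running their static initializers.
        static const std::map<model_kind, size_t> sizes = {
            { model_kind::small,  512u * 1024u * 1024u },
            { model_kind::large, 1024u * 1024u * 1024u },
        };
        return sizes;
    }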
1653
1759
|
|
1654
1760
|
return ctx;
|
1655
1761
|
}
|
1656
1762
|
|
1657
1763
|
void llama_free(struct llama_context * ctx) {
|
1658
|
-
kv_cache_free(ctx->model.kv_self);
|
1659
|
-
|
1660
|
-
if (ctx->model.ctx) {
|
1661
|
-
ggml_free(ctx->model.ctx);
|
1662
|
-
}
|
1663
|
-
|
1664
|
-
if (ctx->model.mm_addr) {
|
1665
|
-
munmap_file(ctx->model.mm_addr, ctx->model.mm_length);
|
1666
|
-
}
|
1667
|
-
|
1668
1764
|
delete ctx;
|
1669
1765
|
}
|
1670
1766
|
|
1671
1767
|
int llama_model_quantize(
|
1672
1768
|
const char * fname_inp,
|
1673
1769
|
const char * fname_out,
|
1674
|
-
|
1675
|
-
|
1676
|
-
|
1770
|
+
enum llama_ftype ftype) {
|
1771
|
+
try {
|
1772
|
+
llama_model_quantize_internal(fname_inp, fname_out, ftype);
|
1773
|
+
return 0;
|
1774
|
+
} catch (const std::string & err) {
|
1775
|
+
fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str());
|
1776
|
+
return 1;
|
1777
|
+
}
|
1778
|
+
}
|
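The public entry point above now takes an enum llama_ftype and reports failures through the catch block instead of aborting. A hedged usage sketch (file names are placeholders; LLAMA_FTYPE_MOSTLY_Q4_0 is assumed to be the Q4_0 value this version of llama.h declares):

    #include <cstdio>
    #include "llama.h"

    int main() {
        const int rc = llama_model_quantize("ggml-model-f16.bin",
                                            "ggml-model-q4_0.bin",
                                            LLAMA_FTYPE_MOSTLY_Q4_0);
        if (rc != 0) {
            fprintf(stderr, "quantization failed\n");
        }
        return rc;
    }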
1779
|
+
|
1780
|
+
int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
|
1781
|
+
fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
|
1782
|
+
|
1783
|
+
auto & model = ctx->model;
|
1784
|
+
|
1785
|
+
const int64_t t_start_lora_us = ggml_time_us();
|
1786
|
+
|
1787
|
+
auto fin = std::ifstream(path_lora, std::ios::binary);
|
1788
|
+
if (!fin) {
|
1789
|
+
fprintf(stderr, "%s: failed to open '%s'\n", __func__, path_lora);
|
1677
1790
|
return 1;
|
1678
1791
|
}
|
1679
1792
|
|
1793
|
+
// verify magic and version
|
1794
|
+
{
|
1795
|
+
uint32_t magic;
|
1796
|
+
fin.read((char *) &magic, sizeof(magic));
|
1797
|
+
if (magic != 'ggla') {
|
1798
|
+
fprintf(stderr, "%s: bad file magic\n", __func__);
|
1799
|
+
return 1;
|
1800
|
+
}
|
1801
|
+
uint32_t format_version;
|
1802
|
+
fin.read((char *) &format_version, sizeof(format_version));
|
1803
|
+
|
1804
|
+
if (format_version != 1) {
|
1805
|
+
fprintf(stderr, "%s: unsupported file version\n", __func__ );
|
1806
|
+
return 1;
|
1807
|
+
}
|
1808
|
+
}
|
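One note on the check above: 'ggla' is a multi-character literal, implementation-defined in general but evaluating to 0x67676c61 on the common toolchains (GCC, Clang) this code targets. An equivalent spelling with an explicit constant, using a hypothetical name:

    #include <cstdint>

    constexpr uint32_t LORA_FILE_MAGIC = 0x67676c61u; // same value as 'ggla' there

    static bool is_lora_magic(uint32_t magic) {
        return magic == LORA_FILE_MAGIC;
    }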
1809
|
+
|
1810
|
+
int32_t lora_r;
|
1811
|
+
int32_t lora_alpha;
|
1812
|
+
fin.read((char *) &lora_r, sizeof(lora_r));
|
1813
|
+
fin.read((char *) &lora_alpha, sizeof(lora_alpha));
|
1814
|
+
float scaling = (float)lora_alpha / (float)lora_r;
|
1815
|
+
|
1816
|
+
fprintf(stderr, "%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
|
1817
|
+
|
1818
|
+
|
1819
|
+
// create a temporary ggml context to store the lora tensors
|
1820
|
+
// todo: calculate size from biggest possible tensor
|
1821
|
+
std::vector<uint8_t> lora_buf(1024ull * 1024ull * 1024ull);
|
1822
|
+
struct ggml_init_params params;
|
1823
|
+
params.mem_size = lora_buf.size();
|
1824
|
+
params.mem_buffer = lora_buf.data();
|
1825
|
+
params.no_alloc = false;
|
1826
|
+
|
1827
|
+
ggml_context * lora_ctx = ggml_init(params);
|
1828
|
+
std::unordered_map<std::string, struct ggml_tensor *> lora_tensors;
|
1829
|
+
|
1830
|
+
// create a name -> tensor map of the model to accelerate lookups
|
1831
|
+
std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
|
1832
|
+
for (auto & kv: model.tensors_by_name) {
|
1833
|
+
model_tensors.insert(kv);
|
1834
|
+
}
|
1835
|
+
|
1836
|
+
|
1837
|
+
// load base model
|
1838
|
+
std::unique_ptr<llama_model_loader> model_loader;
|
1839
|
+
ggml_context * base_ctx = NULL;
|
1840
|
+
llama_buffer base_buf;
|
1841
|
+
if (path_base_model) {
|
1842
|
+
fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
|
1843
|
+
model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false));
|
1844
|
+
|
1845
|
+
size_t ctx_size, mmapped_size;
|
1846
|
+
model_loader->calc_sizes(&ctx_size, &mmapped_size);
|
1847
|
+
base_buf.resize(ctx_size);
|
1848
|
+
|
1849
|
+
ggml_init_params base_params;
|
1850
|
+
base_params.mem_size = base_buf.size;
|
1851
|
+
base_params.mem_buffer = base_buf.addr;
|
1852
|
+
base_params.no_alloc = model_loader->use_mmap;
|
1853
|
+
|
1854
|
+
base_ctx = ggml_init(base_params);
|
1855
|
+
|
1856
|
+
model_loader->ggml_ctx = base_ctx;
|
1857
|
+
|
1858
|
+
// maybe this should be in llama_model_loader
// maybe this should be in llama_model_loader
|
1859
|
+
if (model_loader->use_mmap) {
|
1860
|
+
model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ false));
|
1861
|
+
}
|
1862
|
+
}
|
1863
|
+
|
1864
|
+
// read tensors and apply
|
1865
|
+
bool warned = false;
|
1866
|
+
int n_tensors = 0;
|
1867
|
+
while (true) {
|
1868
|
+
int32_t n_dims;
|
1869
|
+
int32_t length;
|
1870
|
+
int32_t ftype;
|
1871
|
+
|
1872
|
+
fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
|
1873
|
+
fin.read(reinterpret_cast<char *>(&length), sizeof(length));
|
1874
|
+
fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
|
1875
|
+
if (fin.eof()) {
|
1876
|
+
break;
|
1877
|
+
}
|
1878
|
+
|
1879
|
+
int32_t ne[2] = { 1, 1 };
|
1880
|
+
for (int i = 0; i < n_dims; ++i) {
|
1881
|
+
fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
|
1882
|
+
}
|
1883
|
+
|
1884
|
+
std::string name(length, 0);
|
1885
|
+
fin.read(&name[0], length);
|
1886
|
+
|
1887
|
+
// check for lora suffix and get the type of tensor
|
1888
|
+
const std::string lora_suffix = ".lora";
|
1889
|
+
size_t pos = name.rfind(lora_suffix);
|
1890
|
+
if (pos == std::string::npos) {
|
1891
|
+
fprintf(stderr, "%s: error: '%s' is not a lora tensor\n", __func__, name.c_str());
|
1892
|
+
return 1;
|
1893
|
+
}
|
1894
|
+
|
1895
|
+
std::string lora_type = name.substr(pos + lora_suffix.length());
|
1896
|
+
std::string base_name = name;
|
1897
|
+
base_name.erase(pos);
|
1898
|
+
// fprintf(stderr, "%s: %s => %s (lora type %s) ", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());
|
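To make the string handling above concrete: a tensor named like "layers.0.attention.wq.weight.loraA" (illustrative) is split at the last ".lora" into base_name = "layers.0.attention.wq.weight" and lora_type = "A". The same split as a standalone hedged helper:

    #include <string>
    #include <utility>

    // Returns { base_name, lora_type }; lora_type is empty for non-lora names.
    static std::pair<std::string, std::string> split_lora_name(const std::string & name) {
        const std::string lora_suffix = ".lora";
        const size_t pos = name.rfind(lora_suffix);
        if (pos == std::string::npos) {
            return { name, "" };
        }
        return { name.substr(0, pos), name.substr(pos + lora_suffix.size()) };
    }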
1899
|
+
|
1900
|
+
if (model_tensors.find(base_name.data()) == model_tensors.end()) {
|
1901
|
+
fprintf(stderr, "%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
|
1902
|
+
return 1;
|
1903
|
+
}
|
1904
|
+
|
1905
|
+
// create ggml tensor
|
1906
|
+
ggml_type wtype;
|
1907
|
+
switch (ftype) {
|
1908
|
+
case 0: wtype = GGML_TYPE_F32; break;
|
1909
|
+
case 1: wtype = GGML_TYPE_F16; break;
|
1910
|
+
default:
|
1911
|
+
{
|
1912
|
+
fprintf(stderr, "%s: invalid tensor data type '%d'\n",
|
1913
|
+
__func__, ftype);
|
1914
|
+
return 1;
|
1915
|
+
}
|
1916
|
+
}
|
1917
|
+
ggml_tensor* lora_tensor;
|
1918
|
+
if (n_dims == 2) {
|
1919
|
+
lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]);
|
1920
|
+
}
|
1921
|
+
else {
|
1922
|
+
fprintf(stderr, "%s: unsupported tensor dimension %d\n", __func__, n_dims);
|
1923
|
+
return 1;
|
1924
|
+
}
|
1925
|
+
|
1926
|
+
// load tensor data
|
1927
|
+
size_t offset = fin.tellg();
|
1928
|
+
size_t tensor_data_size = ggml_nbytes(lora_tensor);
|
1929
|
+
offset = (offset + 31) & -32;
|
1930
|
+
fin.seekg(offset);
|
1931
|
+
fin.read((char*)lora_tensor->data, tensor_data_size);
|
1932
|
+
|
1933
|
+
lora_tensors[name] = lora_tensor;
|
1934
|
+
|
1935
|
+
// check if we have both A and B tensors and apply
|
1936
|
+
if (lora_tensors.find(base_name + ".loraA") != lora_tensors.end() &&
|
1937
|
+
lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) {
|
1938
|
+
|
1939
|
+
ggml_tensor * dest_t = model_tensors[base_name];
|
1940
|
+
ggml_tensor * base_t;
|
1941
|
+
if (model_loader) {
|
1942
|
+
// load from base model
|
1943
|
+
if (model_loader->tensors_map.name_to_idx.find(base_name) == model_loader->tensors_map.name_to_idx.end()) {
|
1944
|
+
fprintf(stderr, "%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
|
1945
|
+
return 1;
|
1946
|
+
}
|
1947
|
+
size_t idx = model_loader->tensors_map.name_to_idx[base_name];
|
1948
|
+
llama_load_tensor & lt = model_loader->tensors_map.tensors[idx];
|
1949
|
+
base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] });
|
1950
|
+
lt.data = (uint8_t *) lt.ggml_tensor->data;
|
1951
|
+
model_loader->load_data_for(lt);
|
1952
|
+
lt.ggml_tensor->data = lt.data;
|
1953
|
+
}
|
1954
|
+
else {
|
1955
|
+
base_t = dest_t;
|
1956
|
+
}
|
1957
|
+
|
1958
|
+
if (base_t->type == GGML_TYPE_Q4_0 || base_t->type == GGML_TYPE_Q4_1) {
|
1959
|
+
if (!warned) {
|
1960
|
+
fprintf(stderr, "%s: warning: using a lora adapter with a quantized model may result in poor quality, "
|
1961
|
+
"use a f16 or f32 base model with --lora-base\n", __func__);
|
1962
|
+
warned = true;
|
1963
|
+
}
|
1964
|
+
}
|
1965
|
+
|
1966
|
+
ggml_tensor * loraA = lora_tensors[base_name + ".loraA"];
|
1967
|
+
ggml_tensor * loraB = lora_tensors[base_name + ".loraB"];
|
1968
|
+
|
1969
|
+
if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
|
1970
|
+
fprintf(stderr, "%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
|
1971
|
+
" are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]);
|
1972
|
+
return 1;
|
1973
|
+
}
|
1974
|
+
|
1975
|
+
// w = w + BA*s
|
1976
|
+
ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
|
1977
|
+
|
1978
|
+
if (scaling != 1.0f) {
|
1979
|
+
ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
|
1980
|
+
BA = ggml_scale(lora_ctx, BA, scale_tensor);
|
1981
|
+
}
|
1982
|
+
|
1983
|
+
ggml_tensor * r;
|
1984
|
+
if (base_t == dest_t) {
|
1985
|
+
r = ggml_add_inplace(lora_ctx, dest_t, BA);
|
1986
|
+
}
|
1987
|
+
else {
|
1988
|
+
r = ggml_add(lora_ctx, base_t, BA);
|
1989
|
+
r = ggml_cpy(lora_ctx, r, dest_t);
|
1990
|
+
}
|
1991
|
+
|
1992
|
+
struct ggml_cgraph gf = ggml_build_forward(r);
|
1993
|
+
gf.n_threads = n_threads;
|
1994
|
+
ggml_graph_compute(lora_ctx, &gf);
|
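Restating the math this graph evaluates: a rank-r LoRA adapter stores two small matrices A and B for a base weight W, and the merged weight is W + (alpha / r) * B*A, with the scaling computed earlier from the header's alpha and r. Ignoring ggml's transposed mul_mat convention, a plain row-major sketch of the same update (hypothetical helper, dense float only):

    // W: [n_out x n_in], A: [r x n_in], B: [n_out x r], scaling = alpha / r
    static void apply_lora_dense(float * W, const float * A, const float * B,
                                 int n_out, int n_in, int r, float scaling) {
        for (int i = 0; i < n_out; ++i) {
            for (int j = 0; j < n_in; ++j) {
                float acc = 0.0f;
                for (int k = 0; k < r; ++k) {
                    acc += B[i * r + k] * A[k * n_in + j];   // (B*A)[i][j]
                }
                W[i * n_in + j] += scaling * acc;            // W += s * B*A
            }
        }
    }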
1995
|
+
|
1996
|
+
// we won't need these tensors again, reset the context to save memory
|
1997
|
+
ggml_free(lora_ctx);
|
1998
|
+
lora_ctx = ggml_init(params);
|
1999
|
+
lora_tensors.clear();
|
2000
|
+
|
2001
|
+
n_tensors++;
|
2002
|
+
if (n_tensors % 4 == 0)
|
2003
|
+
fprintf(stderr, ".");
|
2004
|
+
}
|
2005
|
+
}
|
2006
|
+
|
2007
|
+
// TODO: this should be in a destructor, it will leak on failure
|
2008
|
+
ggml_free(lora_ctx);
|
2009
|
+
if (base_ctx) {
|
2010
|
+
ggml_free(base_ctx);
|
2011
|
+
}
|
2012
|
+
|
2013
|
+
const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
|
2014
|
+
fprintf(stderr, " done (%.2f ms)\n", t_lora_us / 1000.0);
|
2015
|
+
|
1680
2016
|
return 0;
|
1681
2017
|
}
|
1682
2018
|
|
2019
|
+
int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
|
2020
|
+
try {
|
2021
|
+
return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads);
|
2022
|
+
} catch (const std::string & err) {
|
2023
|
+
fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.c_str());
|
2024
|
+
return 1;
|
2025
|
+
}
|
2026
|
+
}
|
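A hedged call-site sketch for the wrapper above (paths are placeholders; passing NULL for path_base_model patches the loaded weights in place, as the internal function allows):

    #include <cstdio>
    #include "llama.h"

    static int add_adapter(llama_context * lctx) {
        const int rc = llama_apply_lora_from_file(lctx,
                                                  "adapter-ggml.bin",
                                                  "ggml-model-f16.bin", // or NULL
                                                  /*n_threads*/ 4);
        if (rc != 0) {
            fprintf(stderr, "failed to apply lora adapter\n");
        }
        return rc;
    }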
2027
|
+
|
1683
2028
|
// Returns the KV cache that will contain the context for the
|
1684
2029
|
// ongoing prediction with the model.
|
1685
2030
|
const uint8_t * llama_get_kv_cache(struct llama_context * ctx) {
|
1686
|
-
return ctx->model.kv_self.buf.
|
2031
|
+
return ctx->model.kv_self.buf.addr;
|
1687
2032
|
}
|
1688
2033
|
|
1689
2034
|
// Returns the size of the KV cache
|
1690
2035
|
size_t llama_get_kv_cache_size(struct llama_context * ctx) {
|
1691
|
-
return ctx->model.kv_self.buf.size
|
2036
|
+
return ctx->model.kv_self.buf.size;
|
1692
2037
|
}
|
1693
2038
|
|
1694
2039
|
int llama_get_kv_cache_token_count(struct llama_context * ctx) {
|
@@ -1702,8 +2047,8 @@ void llama_set_kv_cache(
|
|
1702
2047
|
size_t n_size,
|
1703
2048
|
int n_token_count) {
|
1704
2049
|
// Make sure we have the same kv cache setup
|
1705
|
-
LLAMA_ASSERT(ctx->model.kv_self.buf.size
|
1706
|
-
memcpy(ctx->model.kv_self.buf.
|
2050
|
+
LLAMA_ASSERT(ctx->model.kv_self.buf.size == n_size);
|
2051
|
+
memcpy(ctx->model.kv_self.buf.addr, kv_cache, n_size);
|
1707
2052
|
ctx->model.kv_self.n = n_token_count;
|
1708
2053
|
}
|
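Taken together, the four KV-cache calls in this hunk support a simple snapshot/rollback of generation state. A hedged sketch, assuming lctx is a previously initialized context and the const uint8_t * parameter type that llama.h declares for llama_set_kv_cache:

    #include <cstdint>
    #include <vector>
    #include "llama.h"

    struct kv_snapshot {
        std::vector<uint8_t> bytes;
        int                  n_tokens = 0;
    };

    static kv_snapshot kv_save(llama_context * lctx) {
        const uint8_t * src = llama_get_kv_cache(lctx);
        const size_t    n   = llama_get_kv_cache_size(lctx);
        return { std::vector<uint8_t>(src, src + n),
                 llama_get_kv_cache_token_count(lctx) };
    }

    static void kv_restore(llama_context * lctx, const kv_snapshot & snap) {
        llama_set_kv_cache(lctx, snap.bytes.data(), snap.bytes.size(), snap.n_tokens);
    }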
1709
2054
|
|
@@ -1814,9 +2159,9 @@ llama_token llama_sample_top_p_top_k(
|
|
1814
2159
|
void llama_print_timings(struct llama_context * ctx) {
|
1815
2160
|
const int64_t t_end_us = ggml_time_us();
|
1816
2161
|
|
1817
|
-
const int32_t n_sample =
|
1818
|
-
const int32_t n_eval =
|
1819
|
-
const int32_t n_p_eval =
|
2162
|
+
const int32_t n_sample = std::max(1, ctx->n_sample);
|
2163
|
+
const int32_t n_eval = std::max(1, ctx->n_eval);
|
2164
|
+
const int32_t n_p_eval = std::max(1, ctx->n_p_eval);
|
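The std::max(1, ...) guards above keep the per-call averages reported by this function from dividing by zero when a context has not yet sampled or evaluated anything. The shape of the computation they protect, as a hedged one-liner:

    #include <algorithm>
    #include <cstdint>

    static double ms_per_call(int64_t t_us, int32_t n_calls) {
        return (t_us / 1000.0) / std::max(1, n_calls);
    }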
1820
2165
|
|
1821
2166
|
fprintf(stderr, "\n");
|
1822
2167
|
fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
|
@@ -1837,18 +2182,25 @@ const char * llama_print_system_info(void) {
|
|
1837
2182
|
static std::string s;
|
1838
2183
|
|
1839
2184
|
s = "";
|
1840
|
-
s += "AVX = "
|
1841
|
-
s += "AVX2 = "
|
1842
|
-
s += "AVX512 = "
|
1843
|
-
s += "
|
1844
|
-
s += "
|
1845
|
-
s += "
|
1846
|
-
s += "
|
1847
|
-
s += "
|
1848
|
-
s += "
|
1849
|
-
s += "
|
1850
|
-
s += "
|
1851
|
-
s += "
|
2185
|
+
s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
|
2186
|
+
s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
|
2187
|
+
s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
|
2188
|
+
s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
|
2189
|
+
s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | ";
|
2190
|
+
s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
|
2191
|
+
s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
|
2192
|
+
s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
|
2193
|
+
s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
|
2194
|
+
s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
|
2195
|
+
s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
|
2196
|
+
s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
|
2197
|
+
s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
|
2198
|
+
s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
|
1852
2199
|
|
1853
2200
|
return s.c_str();
|
1854
2201
|
}
|
2202
|
+
|
2203
|
+
// For internal test use
|
2204
|
+
std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
|
2205
|
+
return ctx->model.tensors_by_name;
|
2206
|
+
}
|