llama_cpp 0.0.3 → 0.0.4
- checksums.yaml +4 -4
- data/CHANGELOG.md +18 -0
- data/README.md +2 -2
- data/ext/llama_cpp/extconf.rb +26 -0
- data/ext/llama_cpp/llama_cpp.cpp +58 -2
- data/ext/llama_cpp/src/ggml.c +735 -253
- data/ext/llama_cpp/src/ggml.h +74 -16
- data/ext/llama_cpp/src/llama.cpp +800 -718
- data/ext/llama_cpp/src/llama.h +25 -1
- data/ext/llama_cpp/src/llama_util.h +389 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +3 -0
- metadata +3 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -1,49 +1,30 @@
+// Defines fileno on msys:
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include "llama_util.h"
 #include "llama.h"
 
 #include "ggml.h"
 
+#include <array>
 #include <cinttypes>
 #include <fstream>
 #include <random>
 #include <map>
 #include <unordered_map>
 #include <queue>
-#include <regex>
 #include <cassert>
 #include <cstring>
-
-#
-#
-#include <
-#else
-#include <sys/types.h>
-#include <sys/mman.h>
-#include <unistd.h>
-#include <fcntl.h>
-#endif
-
-#define Min(X, Y) ((Y) > (X) ? (X) : (Y))
-#define Max(X, Y) ((Y) < (X) ? (X) : (Y))
+#include <climits>
+#include <memory>
+#include <algorithm>
+#include <initializer_list>
 
 #define LLAMA_USE_SCRATCH
 #define LLAMA_MAX_SCRATCH_BUFFERS 16
 
-#define LLAMA_ASSERT(x) \
-do { \
-if (!(x)) { \
-fprintf(stderr, "LLAMA_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
-abort(); \
-} \
-} while (0)
-
-
-// determine number of model parts based on the dimension
-static const std::unordered_map<int, int> LLAMA_N_PARTS = {
-{ 4096, 1 },
-{ 5120, 2 },
-{ 6656, 4 },
-{ 8192, 8 },
-};
 
 // available llama models
 enum e_model {
@@ -93,14 +74,18 @@ static const std::map<e_model, size_t> MEM_REQ_EVAL = {
 
 // default hparams (LLaMA 7B)
 struct llama_hparams {
-
-
-
-
-
-
-
-
+uint32_t n_vocab = 32000;
+uint32_t n_ctx = 512; // this is provided as user input?
+uint32_t n_embd = 4096;
+uint32_t n_mult = 256;
+uint32_t n_head = 32;
+uint32_t n_layer = 32;
+uint32_t n_rot = 64;
+enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
+
+bool operator!=(const llama_hparams & other) const {
+return memcmp(this, &other, sizeof(llama_hparams));
+}
 };
 
 struct llama_layer {
@@ -126,11 +111,17 @@ struct llama_kv_cache {
 struct ggml_tensor * k;
 struct ggml_tensor * v;
 
-struct ggml_context * ctx;
+struct ggml_context * ctx = NULL;
 
-
+llama_buffer buf;
 
 int n; // number of tokens currently in the cache
+
+~llama_kv_cache() {
+if (ctx) {
+ggml_free(ctx);
+}
+}
 };
 
 struct llama_model {
@@ -146,22 +137,30 @@ struct llama_model {
 std::vector<llama_layer> layers;
 
 // context
-struct ggml_context * ctx;
+struct ggml_context * ctx = NULL;
 
 // key + value cache for the self attention
 // TODO: move to llama_state
 struct llama_kv_cache kv_self;
 
 // the model memory buffer
-
+llama_buffer buf;
 
 // model memory mapped file
-
-
+std::unique_ptr<llama_mmap> mapping;
+
+// objects representing data potentially being locked in memory
+llama_mlock mlock_buf;
+llama_mlock mlock_mmap;
+
+// for quantize-stats only
+std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
 
-
-
-
+~llama_model() {
+if (ctx) {
+ggml_free(ctx);
+}
+}
 };
 
 struct llama_vocab {
@@ -206,8 +205,8 @@ struct llama_context {
 
 // memory buffers used to evaluate the model
 // TODO: move in llama_state
-
-
+llama_buffer buf_compute;
+llama_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
 
 int buf_last = 0;
 size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
@@ -220,11 +219,11 @@ struct llama_context {
 last_size = ggml_set_scratch(ctx, { 0, 0, nullptr, });
 } else {
 auto & buf = buf_scratch[i];
-last_size = ggml_set_scratch(ctx, { 0, buf.size
+last_size = ggml_set_scratch(ctx, { 0, buf.size, buf.addr, });
 }
 
 if (buf_last >= 0) {
-buf_max_size[buf_last] =
+buf_max_size[buf_last] = std::max(buf_max_size[buf_last], last_size);
 }
 
 buf_last = i;
@@ -244,6 +243,499 @@ struct llama_context {
 }
 };
 
+template <typename T>
+static T checked_mul(T a, T b) {
+T ret = a * b;
+if (a != 0 && ret / a != b) {
+throw format("overflow multiplying %llu * %llu",
+(unsigned long long) a, (unsigned long long) b);
+}
+return ret;
+}
+
+static size_t checked_div(size_t a, size_t b) {
+if (b == 0 || a % b != 0) {
+throw format("error dividing %zu / %zu", a, b);
+}
+return a / b;
+}
+
+static std::string llama_format_tensor_shape(const std::vector<uint32_t> & ne) {
+std::string ret = "[" + std::to_string(ne.at(0));
+for (size_t i = 1; i < ne.size(); i++) {
+ret += " x " + std::to_string(ne.at(i));
+}
+ret += "]";
+return ret;
+}
+
+static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml_type type) {
+size_t size = ggml_type_size(type);
+for (uint32_t dim : ne) {
+size = checked_mul<size_t>(size, dim);
+}
+return size / ggml_blck_size(type);
+}
+
+struct llama_load_tensor_shard {
+std::vector<uint32_t> ne;
+size_t size;
+enum ggml_type type;
+size_t file_idx;
+size_t file_off;
+
+void calc_size() {
+size = llama_calc_tensor_size(ne, type);
+}
+};
+
+enum llama_split_type {
+SPLIT_NONE,
+SPLIT_BY_COLUMNS,
+SPLIT_BY_ROWS
+};
+
+struct llama_load_tensor {
+std::vector<llama_load_tensor_shard> shards;
+
+std::string name;
+enum ggml_type type = GGML_TYPE_F32;
+llama_split_type split_type = SPLIT_NONE;
+std::vector<uint32_t> ne;
+size_t size;
+struct ggml_tensor * ggml_tensor = NULL;
+uint8_t * data;
+
+llama_load_tensor(const std::string & name) : name(name) {}
+
+void calc_all() {
+calc_type();
+calc_split_type();
+calc_ne();
+calc_size();
+}
+
+void calc_type() {
+const auto & first_shard = shards.at(0);
+for (const auto & shard : shards) {
+if (shard.type != first_shard.type) {
+throw format("inconsistent tensor shard type in '%s'", name.c_str());
+}
+}
+type = first_shard.type;
+}
+
+void calc_split_type() {
+if (shards.at(0).ne.size() == 1 || // 1D tensors are just duplicated in every file
+shards.size() == 1) { // only one file?
+split_type = SPLIT_NONE;
+} else if (name.find("tok_embeddings.") == 0 ||
+name.find(".attention.wo.weight") != std::string::npos ||
+name.find(".feed_forward.w2.weight") != std::string::npos) {
+split_type = SPLIT_BY_COLUMNS;
+} else {
+split_type = SPLIT_BY_ROWS;
+}
+}
+
+void calc_ne() {
+const auto & first_shard = shards.at(0);
+for (const auto & shard : shards) {
+if (shard.ne != first_shard.ne) {
+throw format("inconsistent tensor shard shape in '%s': first was %s, other was %s",
+name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str());
+}
+}
+ne = first_shard.ne;
+LLAMA_ASSERT(shards.size() <= UINT32_MAX);
+uint32_t n_shards = (uint32_t) shards.size();
+switch (split_type) {
+case SPLIT_NONE:
+ne = first_shard.ne;
+break;
+case SPLIT_BY_COLUMNS:
+ne = {checked_mul<uint32_t>(first_shard.ne[0], n_shards),
+first_shard.ne[1]};
+break;
+case SPLIT_BY_ROWS:
+ne = {first_shard.ne[0],
+checked_mul<uint32_t>(first_shard.ne[1], n_shards)};
+break;
+}
+}
+
+void calc_size() {
+size = llama_calc_tensor_size(ne, type);
+}
+};
+
+struct llama_load_tensors_map {
+// tensors is kept in a separate vector to preserve file order
+std::vector<llama_load_tensor> tensors;
+std::unordered_map<std::string, size_t> name_to_idx;
+};
+
+enum llama_file_version {
+LLAMA_FILE_VERSION_GGML,
+LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab
+LLAMA_FILE_VERSION_GGJT_V1, // added padding
+};
+
+struct llama_file_loader {
+llama_file file;
+llama_file_version file_version;
+llama_hparams hparams;
+llama_vocab vocab;
+
+llama_file_loader(const char * fname, size_t file_idx, llama_load_tensors_map & tensors_map)
+: file(fname, "rb") {
+fprintf(stderr, "llama.cpp: loading model from %s\n", fname);
+read_magic();
+read_hparams();
+read_vocab();
+read_tensor_metadata(file_idx, tensors_map);
+}
+void read_magic() {
+uint32_t magic = file.read_u32();
+uint32_t version = 0;
+
+if (magic != 'ggml') {
+version = file.read_u32();
+}
+
+if (magic == 'ggml' && version == 0) {
+file_version = LLAMA_FILE_VERSION_GGML;
+} else if (magic == 'ggmf' && version == 1) {
+file_version = LLAMA_FILE_VERSION_GGMF_V1;
+} else if (magic == 'ggjt' && version == 1) {
+file_version = LLAMA_FILE_VERSION_GGJT_V1;
+} else {
+throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
+magic, version);
+}
+}
+void read_hparams() {
+hparams.n_vocab = file.read_u32();
+hparams.n_embd = file.read_u32();
+hparams.n_mult = file.read_u32();
+hparams.n_head = file.read_u32();
+hparams.n_layer = file.read_u32();
+hparams.n_rot = file.read_u32();
+hparams.ftype = (enum llama_ftype) file.read_u32();
+}
+void read_vocab() {
+vocab.id_to_token.resize(hparams.n_vocab);
+
+for (uint32_t i = 0; i < hparams.n_vocab; i++) {
+uint32_t len = file.read_u32();
+std::string word = file.read_string(len);
+
+float score = 0.0f;
+if (file_version >= LLAMA_FILE_VERSION_GGMF_V1) {
+file.read_raw(&score, sizeof(score));
+}
+
+vocab.token_to_id[word] = i;
+
+auto & tok_score = vocab.id_to_token[i];
+tok_score.tok = std::move(word);
+tok_score.score = score;
+}
+}
+void read_tensor_metadata(size_t file_idx, llama_load_tensors_map & tensors_map) {
+while (file.tell() < file.size) {
+llama_load_tensor_shard shard;
+uint32_t n_dims = file.read_u32();
+uint32_t name_len = file.read_u32();
+shard.type = (enum ggml_type) file.read_u32();
+shard.ne.resize(n_dims);
+file.read_raw(shard.ne.data(), sizeof(shard.ne[0]) * n_dims);
+std::string name = file.read_string(name_len);
+if (n_dims < 1 || n_dims > 2) {
+throw format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims);
+}
+switch (shard.type) {
+case GGML_TYPE_F32:
+case GGML_TYPE_F16:
+case GGML_TYPE_Q4_0:
+case GGML_TYPE_Q4_1:
+break;
+default: {
+throw format("unrecognized tensor type %u\n", shard.type);
+}
+}
+
+if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
+// skip to the next multiple of 32 bytes
+file.seek(-file.tell() & 31, SEEK_CUR);
+}
+shard.file_idx = file_idx;
+shard.file_off = file.tell();
+
+shard.calc_size();
+file.seek(shard.size, SEEK_CUR);
+
+auto it = tensors_map.name_to_idx.find(name);
+size_t idx;
+if (it != tensors_map.name_to_idx.end()) {
+idx = it->second;
+} else {
+tensors_map.tensors.emplace_back(name);
+idx = tensors_map.tensors.size() - 1;
+tensors_map.name_to_idx.emplace(name, idx);
+}
+tensors_map.tensors.at(idx).shards.push_back(shard);
+}
+}
+};
+
+struct llama_file_saver {
+llama_file file;
+llama_file_loader * any_file_loader;
+llama_file_saver(const char * fname, llama_file_loader * any_file_loader, enum llama_ftype new_ftype)
+: file(fname, "wb"), any_file_loader(any_file_loader) {
+fprintf(stderr, "llama.cpp: saving model to %s\n", fname);
+write_magic();
+write_hparams(new_ftype);
+write_vocab();
+}
+void write_magic() {
+file.write_u32('ggjt'); // magic
+file.write_u32(1); // version
+}
+void write_hparams(enum llama_ftype new_ftype) {
+const llama_hparams & hparams = any_file_loader->hparams;
+file.write_u32(hparams.n_vocab);
+file.write_u32(hparams.n_embd);
+file.write_u32(hparams.n_mult);
+file.write_u32(hparams.n_head);
+file.write_u32(hparams.n_layer);
+file.write_u32(hparams.n_rot);
+file.write_u32(new_ftype);
+}
+void write_vocab() {
+if (any_file_loader->file_version == LLAMA_FILE_VERSION_GGML) {
+fprintf(stderr, "llama.cpp: WARNING: input is an old file that doesn't have scores; will add dummy scores\n");
+}
+uint32_t n_vocab = any_file_loader->hparams.n_vocab;
+for (uint32_t i = 0; i < n_vocab; i++) {
+const auto & token_score = any_file_loader->vocab.id_to_token.at(i);
+file.write_u32((uint32_t) token_score.tok.size());
+file.write_raw(token_score.tok.data(), token_score.tok.size());
+file.write_raw(&token_score.score, sizeof(token_score.score));
+}
+}
+void write_tensor(llama_load_tensor & tensor, enum ggml_type new_type, const void * new_data, size_t new_size) {
+switch (new_type) {
+case GGML_TYPE_F32:
+case GGML_TYPE_F16:
+case GGML_TYPE_Q4_0:
+case GGML_TYPE_Q4_1:
+break;
+default: LLAMA_ASSERT(false);
+}
+file.write_u32((uint32_t) tensor.ne.size());
+file.write_u32((uint32_t) tensor.name.size());
+file.write_u32(new_type);
+file.write_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * tensor.ne.size());
+file.write_raw(tensor.name.data(), tensor.name.size());
+file.seek(-file.tell() & 31, SEEK_CUR);
+LLAMA_ASSERT(new_size == llama_calc_tensor_size(tensor.ne, new_type));
+file.write_raw(new_data, new_size);
+}
+};
+
+struct llama_model_loader {
+std::vector<std::unique_ptr<llama_file_loader>> file_loaders;
+llama_load_tensors_map tensors_map;
+bool use_mmap;
+size_t num_ggml_tensors_created = 0;
+struct ggml_context * ggml_ctx = NULL;
+std::unique_ptr<llama_mmap> mapping;
+
+llama_model_loader(const std::string & fname_base, bool use_mmap, bool vocab_only) {
+auto first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
+file_loaders.emplace_back(first_file);
+uint32_t n_parts = vocab_only ? 1 : guess_n_parts();
+for (uint32_t i = 1; i < n_parts; i++) {
+std::string fname = fname_base + "." + std::to_string(i);
+auto ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
+file_loaders.emplace_back(ith_file);
+if (ith_file->hparams != first_file->hparams) {
+throw format("llama.cpp: hparams inconsistent between files");
+}
+}
+if (!llama_mmap::SUPPORTED) {
+use_mmap = false;
+}
+if (use_mmap && alignment_prevents_mmap()) {
+fprintf(stderr, "llama.cpp: can't use mmap because tensors are not aligned; convert to new format to avoid this\n");
+use_mmap = false;
+}
+this->use_mmap = use_mmap;
+for (llama_load_tensor & lt : tensors_map.tensors) {
+lt.calc_all();
+}
+}
+
+bool alignment_prevents_mmap() {
+for (const llama_load_tensor & lt : tensors_map.tensors) {
+for (const llama_load_tensor_shard & shard : lt.shards) {
+if (shard.file_off & 3) {
+return true;
+}
+}
+}
+return false;
+}
+
+uint32_t guess_n_parts() const {
+auto it = tensors_map.name_to_idx.find("tok_embeddings.weight");
+if (it == tensors_map.name_to_idx.end()) {
+throw std::string("missing tok_embeddings.weight");
+}
+const llama_load_tensor & lt = tensors_map.tensors.at(it->second);
+return file_loaders.at(0)->hparams.n_embd / lt.shards.at(0).ne.at(0);
+}
+
+void calc_sizes(size_t * ctx_size_p, size_t * mmapped_size_p) const {
+*ctx_size_p = *mmapped_size_p = 0;
+for (const llama_load_tensor & lt : tensors_map.tensors) {
+*ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE;
+*(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size;
+}
+}
+
+struct ggml_tensor * get_tensor(const std::string & name, std::vector<uint32_t> ne) {
+auto it = tensors_map.name_to_idx.find(name);
+if (it == tensors_map.name_to_idx.end()) {
+throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
+}
+llama_load_tensor & lt = tensors_map.tensors.at(it->second);
+if (lt.ne != ne) {
+throw format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
+name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
+}
+return get_tensor_for(lt);
+}
+
+struct ggml_tensor * get_tensor_for(llama_load_tensor & lt) {
+struct ggml_tensor * tensor;
+if (lt.ne.size() == 2) {
+tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
+} else {
+LLAMA_ASSERT(lt.ne.size() == 1);
+tensor = ggml_new_tensor_1d(ggml_ctx, lt.type, lt.ne.at(0));
+}
+LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
+lt.ggml_tensor = tensor;
+num_ggml_tensors_created++;
+return tensor;
+}
+
+void done_getting_tensors() {
+if (num_ggml_tensors_created != tensors_map.tensors.size()) {
+throw std::string("llama.cpp: file contained more tensors than expected");
+}
+}
+
+void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
+size_t data_size = 0;
+for (const llama_load_tensor & lt : tensors_map.tensors) {
+data_size += lt.size;
+}
+
+if (use_mmap) {
+mapping.reset(new llama_mmap(&file_loaders.at(0)->file));
+if (!lmlock) {
+// Don't call the callback since the actual loading will be lazy
+// and we can't measure it.
+progress_callback = NULL;
+}
+if (lmlock) {
+lmlock->init(mapping->addr);
+}
+}
+
+size_t done_size = 0;
+for (llama_load_tensor & lt : tensors_map.tensors) {
+if (progress_callback) {
+progress_callback((float) done_size / data_size, progress_callback_user_data);
+}
+LLAMA_ASSERT(lt.ggml_tensor); // unused tensors should have been caught by load_data already
+lt.data = (uint8_t *) lt.ggml_tensor->data;
+load_data_for(lt);
+lt.ggml_tensor->data = lt.data;
+done_size += lt.size;
+if (use_mmap && lmlock) {
+lmlock->grow_to(done_size);
+}
+}
+if (progress_callback) {
+progress_callback(1.0f, progress_callback_user_data);
+}
+}
+
+void load_data_for(llama_load_tensor & lt) {
+if (use_mmap) {
+LLAMA_ASSERT(lt.shards.size() == 1);
+lt.data = (uint8_t *) mapping->addr + lt.shards.at(0).file_off;
+} else if (lt.split_type == SPLIT_NONE) {
+llama_file & file = file_loaders.at(lt.shards.at(0).file_idx)->file;
+file.seek(lt.shards.at(0).file_off, SEEK_SET);
+file.read_raw(lt.data, lt.size);
+} else if (lt.split_type == SPLIT_BY_ROWS) {
+size_t offset = 0;
+for (llama_load_tensor_shard & shard : lt.shards) {
+llama_file & file = file_loaders.at(shard.file_idx)->file;
+file.seek(shard.file_off, SEEK_SET);
+file.read_raw(lt.data + offset, shard.size);
+offset += shard.size;
+}
+LLAMA_ASSERT(offset == lt.size);
+} else if (lt.split_type == SPLIT_BY_COLUMNS) {
+// Let's load the data into temporary buffers to ensure the OS performs large loads.
+std::vector<llama_buffer> tmp_bufs;
+tmp_bufs.resize(lt.shards.size());
+for (size_t i = 0; i < lt.shards.size(); i++) {
+llama_load_tensor_shard & shard = lt.shards.at(i);
+llama_file & file = file_loaders.at(shard.file_idx)->file;
+file.seek(shard.file_off, SEEK_SET);
+tmp_bufs.at(i).resize(shard.size);
+file.read_raw(tmp_bufs.at(i).addr, shard.size);
+}
+// Then reshape.
+size_t num_rows = lt.ne.at(1);
+size_t per_shard_row_size = lt.shards.at(0).size / num_rows;
+size_t out_offset = 0;
+for (size_t row = 0; row < num_rows; row++) {
+for (llama_buffer & tmp_buf : tmp_bufs) {
+memcpy(lt.data + out_offset,
+tmp_buf.addr + row * per_shard_row_size,
+per_shard_row_size);
+out_offset += per_shard_row_size;
+}
+}
+LLAMA_ASSERT(out_offset == lt.size);
+}
+if (0) {
+print_checksum(lt);
+}
+}
+
+static void print_checksum(llama_load_tensor & lt) {
+uint32_t sum = 0;
+for (size_t i = 0; i < lt.size; i++) {
+uint8_t byte = lt.data[i];
+sum = byte + (sum << 6) + (sum << 16) - sum; // sdbm hash
+}
+fprintf(stderr, "%s checksum: %#08x (%s, size %zu)\n", lt.name.c_str(), sum,
+llama_format_tensor_shape(lt.ne).c_str(), lt.size);
+}
+
+};
+
+
 //
 // kv cache
 //
@@ -262,8 +754,8 @@ static bool kv_cache_init(
 cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
 
 struct ggml_init_params params;
-params.mem_size = cache.buf.size
-params.mem_buffer = cache.buf.
+params.mem_size = cache.buf.size;
+params.mem_buffer = cache.buf.addr;
 params.no_alloc = false;
 
 cache.ctx = ggml_init(params);
@@ -279,13 +771,6 @@ static bool kv_cache_init(
 return true;
 }
 
-static void kv_cache_free(struct llama_kv_cache & cache) {
-if (cache.ctx) {
-ggml_free(cache.ctx);
-cache.ctx = nullptr;
-}
-}
-
 struct llama_context_params llama_context_default_params() {
 struct llama_context_params result = {
 /*.n_ctx =*/ 512,
@@ -294,6 +779,7 @@ struct llama_context_params llama_context_default_params() {
 /*.f16_kv =*/ false,
 /*.logits_all =*/ false,
 /*.vocab_only =*/ false,
+/*.use_mmap =*/ true,
 /*.use_mlock =*/ false,
 /*.embedding =*/ false,
 /*.progress_callback =*/ nullptr,
@@ -303,243 +789,106 @@ struct llama_context_params llama_context_default_params() {
 return result;
 }
 
+bool llama_mmap_supported() {
+return llama_mmap::SUPPORTED;
+}
+
+bool llama_mlock_supported() {
+return llama_mlock::SUPPORTED;
+}
+
 //
 // model loading
 //
 
-static
-
-
-
-
-
-
-FILE_ATTRIBUTE_NORMAL | FILE_ATTRIBUTE_NOT_CONTENT_INDEXED,
-NULL);
-if (hFile == INVALID_HANDLE_VALUE) return 0;
-LARGE_INTEGER fileSize;
-fileSize.QuadPart = -1;
-GetFileSizeEx(hFile, &fileSize);
-int64_t length = fileSize.QuadPart;
-HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
-CloseHandle(hFile);
-if (!hMapping) return 0;
-void *addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
-CloseHandle(hMapping);
-if (!addr) return 0;
-#else
-int fd = open(fname, O_RDONLY);
-if (fd == -1) return 0;
-int64_t length = lseek(fd, 0, SEEK_END);
-void *addr = mmap(NULL, length, PROT_READ, MAP_SHARED, fd, 0);
-close(fd);
-if (addr == MAP_FAILED) return 0;
-#endif
-*mm_length = length;
-return addr;
+static const char *llama_file_version_name(llama_file_version version) {
+switch (version) {
+case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)";
+case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
+case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (latest)";
+default: LLAMA_ASSERT(false);
+}
 }
 
-static
-
-
-
-
-
+static const char *llama_ftype_name(enum llama_ftype ftype) {
+switch (ftype) {
+case LLAMA_FTYPE_ALL_F32: return "all F32";
+case LLAMA_FTYPE_MOSTLY_F16: return "mostly F16";
+case LLAMA_FTYPE_MOSTLY_Q4_0: return "mostly Q4_0";
+case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
+case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
+return "mostly Q4_1, some F16";
+default: return "unknown, may not work";
+}
 }
 
-static
-
-
-
-
-
-
-
-path, got, want);
-return false;
+static const char *llama_model_type_name(e_model type) {
+switch (type) {
+case MODEL_7B: return "7B";
+case MODEL_13B: return "13B";
+case MODEL_30B: return "30B";
+case MODEL_65B: return "65B";
+default: LLAMA_ASSERT(false);
+}
 }
 
-static
+static void llama_model_load_internal(
 const std::string & fname,
 llama_context & lctx,
 int n_ctx,
-int n_parts,
 ggml_type memory_type,
+bool use_mmap,
+bool use_mlock,
 bool vocab_only,
 llama_progress_callback progress_callback,
-void *progress_callback_user_data) {
-fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
+void * progress_callback_user_data) {
 
 lctx.t_start_us = ggml_time_us();
 
-
-auto & vocab = lctx.vocab;
-
-auto fin = std::ifstream(fname, std::ios::binary);
-if (!fin) {
-fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
-return false;
-}
-
-std::vector<char> f_buf(1024*1024);
-fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
+std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap, vocab_only));
 
-
-
-
+lctx.vocab = std::move(ml->file_loaders.at(0)->vocab);
+auto & model = lctx.model;
+model.hparams = ml->file_loaders.at(0)->hparams;
+llama_file_version file_version = ml->file_loaders.at(0)->file_version;
+auto & hparams = model.hparams;
+uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
 
-// verify magic
 {
-
-
-
-
-
-return false;
+switch (hparams.n_layer) {
+case 32: model.type = e_model::MODEL_7B; break;
+case 40: model.type = e_model::MODEL_13B; break;
+case 60: model.type = e_model::MODEL_30B; break;
+case 80: model.type = e_model::MODEL_65B; break;
 }
-if (magic != LLAMA_FILE_MAGIC) {
-return report_bad_magic(fname.c_str(), magic, LLAMA_FILE_MAGIC);
-}
-
-uint32_t format_version;
-fin.read((char *) &format_version, sizeof(format_version));
-
-if (format_version != LLAMA_FILE_VERSION) {
-fprintf(stderr, "%s: invalid model file '%s' (unsupported format version %" PRIu32 ", expected %d)\n",
-__func__, fname.c_str(), format_version, LLAMA_FILE_VERSION);
-return false;
-}
-}
-
-int n_ff = 0;
-
-// load hparams
-{
-auto & hparams = model.hparams;
-
-fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
-//fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
-fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
-fin.read((char *) &hparams.n_mult, sizeof(hparams.n_mult));
-fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
-fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
-fin.read((char *) &hparams.f16, sizeof(hparams.f16));
 
 hparams.n_ctx = n_ctx;
-
-n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
-
-if (n_parts < 1) {
-n_parts = LLAMA_N_PARTS.at(hparams.n_embd);
-}
-
-// temp warning to tell the user to use "--n_parts"
-if (hparams.f16 == 4 && n_parts != 1) {
-fprintf(stderr, "%s: GPTQ model detected - are you sure n_parts should be %d? we normally expect it to be 1\n", __func__, n_parts);
-fprintf(stderr, "%s: use '--n_parts 1' if necessary\n", __func__);
-}
-
-if (hparams.n_layer == 32) {
-model.type = e_model::MODEL_7B;
-}
-
-if (hparams.n_layer == 40) {
-model.type = e_model::MODEL_13B;
-}
-
-if (hparams.n_layer == 60) {
-model.type = e_model::MODEL_30B;
-}
-
-if (hparams.n_layer == 80) {
-model.type = e_model::MODEL_65B;
-}
-
-fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab);
-fprintf(stderr, "%s: n_ctx = %d\n", __func__, hparams.n_ctx);
-fprintf(stderr, "%s: n_embd = %d\n", __func__, hparams.n_embd);
-fprintf(stderr, "%s: n_mult = %d\n", __func__, hparams.n_mult);
-fprintf(stderr, "%s: n_head = %d\n", __func__, hparams.n_head);
-fprintf(stderr, "%s: n_layer = %d\n", __func__, hparams.n_layer);
-fprintf(stderr, "%s: n_rot = %d\n", __func__, hparams.n_rot);
-fprintf(stderr, "%s: f16 = %d\n", __func__, hparams.f16);
-fprintf(stderr, "%s: n_ff = %d\n", __func__, n_ff);
-fprintf(stderr, "%s: n_parts = %d\n", __func__, n_parts);
-fprintf(stderr, "%s: type = %d\n", __func__, model.type);
 }
 
-// load vocab
 {
-
-
-
-
-
-
-
-
-
-
-
-
-word.assign(tmp.data(), len);
-} else {
-word.clear();
-}
-
-float score;
-fin.read((char *) &score, sizeof(score));
-
-vocab.token_to_id[word] = i;
-
-auto &tok_score = vocab.id_to_token[i];
-tok_score.tok = word;
-tok_score.score = score;
-}
+fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version));
+fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab);
+fprintf(stderr, "%s: n_ctx = %u\n", __func__, hparams.n_ctx);
+fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd);
+fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult);
+fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head);
+fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer);
+fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot);
+fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
+fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
+fprintf(stderr, "%s: n_parts = %zu\n", __func__, ml->file_loaders.size());
+fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
 }
 
 if (vocab_only) {
-return
-}
-
-// for the big tensors, we have the option to store the data in 16-bit floats or quantized
-// in order to save memory and also to speed up the computation
-// wtype is for per-layer weights, while vtype is for other weights
-ggml_type wtype, vtype;
-switch (model.hparams.f16) {
-case 0: wtype = vtype = GGML_TYPE_F32; break;
-case 1: wtype = vtype = GGML_TYPE_F16; break;
-case 2: wtype = vtype = GGML_TYPE_Q4_0; break;
-case 3: wtype = vtype = GGML_TYPE_Q4_1; break;
-case 4: wtype = GGML_TYPE_Q4_1; vtype = GGML_TYPE_F16; break;
-default:
-{
-fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n",
-__func__, fname.c_str(), model.hparams.f16);
-return false;
-}
-}
-
-// map model into memory
-char *mm_addr = NULL;
-model.mm_addr = mmap_file(fname.c_str(), &model.mm_length);
-if (model.mm_addr == NULL) {
-fprintf(stderr, "%s: failed to mmap '%s'\n", __func__, fname.c_str());
-return false;
+return;
 }
-mm_addr = (char *)model.mm_addr;
-fprintf(stderr, "%s: ggml map size = %6.2f MB\n", __func__, model.mm_length/(1024.0*1024.0));
 
 auto & ctx = model.ctx;
 
-size_t ctx_size
-
-
-const int n_layer = hparams.n_layer;
-ctx_size += (5 + 10*n_layer)*256; // object overhead
-fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0);
-}
+size_t ctx_size, mmapped_size;
+ml->calc_sizes(&ctx_size, &mmapped_size);
+fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0);
 
 // print memory requirements
 {
@@ -548,7 +897,7 @@ static bool llama_model_load(
 // this is the total memory required to run the inference
 const size_t mem_required =
 ctx_size +
-
+mmapped_size +
 MEM_REQ_SCRATCH0.at(model.type) +
 MEM_REQ_SCRATCH1.at(model.type) +
 MEM_REQ_EVAL.at (model.type);
@@ -564,17 +913,20 @@ static bool llama_model_load(
 // create the ggml context
 {
 lctx.model.buf.resize(ctx_size);
+if (use_mlock) {
+lctx.model.mlock_buf.init(lctx.model.buf.addr);
+lctx.model.mlock_buf.grow_to(lctx.model.buf.size);
+}
 
 struct ggml_init_params params = {
-/*.mem_size =*/ lctx.model.buf.size
-/*.mem_buffer =*/ lctx.model.buf.
-/*.no_alloc =*/
+/*.mem_size =*/ lctx.model.buf.size,
+/*.mem_buffer =*/ lctx.model.buf.addr,
+/*.no_alloc =*/ ml->use_mmap,
 };
 
 model.ctx = ggml_init(params);
 if (!model.ctx) {
-
-return false;
+throw format("ggml_init() failed");
 }
 }
 
@@ -582,161 +934,71 @@ static bool llama_model_load(
 {
 const auto & hparams = model.hparams;
 
-const
-const
-const
-
-model.layers.resize(n_layer);
-
-model.tok_embeddings = ggml_new_tensor_2d(ctx, vtype, n_embd, n_vocab);
-
-model.norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-model.output = ggml_new_tensor_2d(ctx, vtype, n_embd, n_vocab);
+const uint32_t n_embd = hparams.n_embd;
+const uint32_t n_layer = hparams.n_layer;
+const uint32_t n_vocab = hparams.n_vocab;
 
-
-model.tensors["tok_embeddings.weight"] = model.tok_embeddings;
+ml->ggml_ctx = ctx;
 
-model.
-model.
+model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab});
+model.norm = ml->get_tensor("norm.weight", {n_embd});
+model.output = ml->get_tensor("output.weight", {n_embd, n_vocab});
 
-
+model.layers.resize(n_layer);
+for (uint32_t i = 0; i < n_layer; ++i) {
 auto & layer = model.layers[i];
 
-
-
-layer.wq = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
-layer.wk = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
-layer.wv = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
-layer.wo = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
-
-layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-
-layer.w1 = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ff);
-layer.w2 = ggml_new_tensor_2d(ctx, wtype, n_ff, n_embd);
-layer.w3 = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ff);
+std::string layers_i = "layers." + std::to_string(i);
 
-
-model.tensors["layers." + std::to_string(i) + ".attention_norm.weight"] = layer.attention_norm;
+layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd});
 
-
-
-
-
+layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd});
+layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd});
+layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd});
+layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd});
 
-
+layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd});
 
-
-
-
+layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff});
+layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd});
+layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff});
 }
 }
 
-
+ml->done_getting_tensors();
 
-
-
+// populate `tensors_by_name`
+for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
 }
 
-
+ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
 
-
-{
-size_t total_size = 0;
-model.n_loaded = 0;
-
-while (true) {
-int32_t n_dims;
-int32_t length;
-int32_t ftype;
-
-fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
-fin.read(reinterpret_cast<char *>(&length), sizeof(length));
-fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
-
-if (fin.eof()) {
-break;
-}
-
-int32_t nelements = 1;
-int32_t ne[2] = { 1, 1 };
-for (int i = 0; i < n_dims; ++i) {
-fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
-nelements *= ne[i];
-}
-
-std::string name(length, 0);
-fin.read(&name[0], length);
-
-if (model.tensors.find(name.data()) == model.tensors.end()) {
-fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
-return false;
-}
-
-auto tensor = model.tensors[name.data()];
-
-if (ggml_nelements(tensor) != nelements) {
-fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
-return false;
-}
-if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
-fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%" PRId64 ", %" PRId64 "], expected [%d, %d]\n",
-__func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
-return false;
-}
-if (0) {
-static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
-fprintf(stderr, "%24s - [%5d, %5d], type = %6s\n", name.data(), ne[0], ne[1], ftype_str[ftype]);
-}
-
-switch (ftype) {
-case 0: // f32
-case 1: // f16
-break;
-case 2: // q4_0
-case 3: // q4_1
-assert(ne[0] % 64 == 0);
-break;
-default:
-fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
-return false;
-};
-
-// load the tensor data into memory without copying or reading it
-size_t offset = fin.tellg();
-size_t tensor_data_size = ggml_nbytes(tensor);
-offset = (offset + 31) & -32;
-tensor->data = mm_addr + offset;
-fin.seekg(offset + tensor_data_size);
-total_size += tensor_data_size;
-model.n_loaded++;
-
-// progress
-if (progress_callback) {
-double current_progress = size_t(fin.tellg()) / double(file_size);
-progress_callback(current_progress, progress_callback_user_data);
-}
-}
-
-fin.close();
-
-fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, model.n_loaded);
-if (model.n_loaded == 0) {
-fprintf(stderr, "%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__);
-} else if (model.n_loaded != (int) model.tensors.size()) {
-fprintf(stderr, "%s: ERROR not all tensors loaded from model file - expected %zu, got %d\n", __func__, model.tensors.size(), model.n_loaded);
-return false;
-}
-}
+model.mapping = std::move(ml->mapping);
 
 // loading time will be recalculate after the first eval, so
 // we take page faults deferred by mmap() into consideration
 lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
+}
 
-
-
+static bool llama_model_load(
+const std::string & fname,
+llama_context & lctx,
+int n_ctx,
+ggml_type memory_type,
+bool use_mmap,
+bool use_mlock,
+bool vocab_only,
+llama_progress_callback progress_callback,
+void *progress_callback_user_data) {
+try {
+llama_model_load_internal(fname, lctx, n_ctx, memory_type, use_mmap, use_mlock,
+vocab_only, progress_callback, progress_callback_user_data);
+return true;
+} catch (const std::string & err) {
+fprintf(stderr, "error loading model: %s\n", err.c_str());
+return false;
 }
-
-return true;
 }
 
 // evaluate the transformer
@@ -774,8 +1036,8 @@ static bool llama_eval_internal(
 auto & buf_compute = lctx.buf_compute;
 
 struct ggml_init_params params = {
-/*.mem_size =*/ buf_compute.size
-/*.mem_buffer =*/ buf_compute.
+/*.mem_size =*/ buf_compute.size,
+/*.mem_buffer =*/ buf_compute.addr,
 /*.no_alloc =*/ false,
 };
 
@@ -1061,7 +1323,7 @@ struct llama_tokenizer {
 size_t offs = 0;
 while (offs < text.size()) {
 llama_sp_symbol sym;
-size_t char_len =
+size_t char_len = std::min(text.size() - offs, utf8_len(text[offs]));
 sym.text = text.c_str() + offs;
 sym.n = char_len;
 offs += char_len;
@@ -1236,7 +1498,7 @@ static llama_vocab::id llama_sample_top_p_top_k(
 }
 }
 
-sample_top_k(logits_id, top_k > 0 ?
+sample_top_k(logits_id, top_k > 0 ? std::min(top_k, n_logits) : n_logits);
 
 // compute probs for the top k tokens
 std::vector<float> probs;
@@ -1284,298 +1546,118 @@ static llama_vocab::id llama_sample_top_p_top_k(
|
|
1284
1546
|
// quantization
|
1285
1547
|
//
|
1286
1548
|
|
1287
|
-
|
1288
|
-
|
1289
|
-
|
1290
|
-
|
1291
|
-
|
1292
|
-
|
1293
|
-
case 3: type = GGML_TYPE_Q4_1; break;
|
1294
|
-
default: fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype); return 1;
|
1549
|
+
static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype) {
|
1550
|
+
ggml_type quantized_type;
|
1551
|
+
switch (ftype) {
|
1552
|
+
case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
|
1553
|
+
case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
|
1554
|
+
default: throw format("invalid output file type %d\n", ftype);
|
1295
1555
|
};
|
1296
1556
|
|
1297
|
-
|
1298
|
-
|
1299
|
-
|
1300
|
-
|
1301
|
-
|
1302
|
-
|
1303
|
-
|
1304
|
-
|
1305
|
-
|
1306
|
-
|
1307
|
-
|
1308
|
-
|
1309
|
-
|
1310
|
-
|
1311
|
-
|
1312
|
-
|
1313
|
-
|
1314
|
-
|
1315
|
-
|
1316
|
-
|
1317
|
-
|
1318
|
-
|
1319
|
-
|
1320
|
-
|
1321
|
-
|
1322
|
-
|
1323
|
-
|
1324
|
-
|
1325
|
-
|
1326
|
-
|
1327
|
-
|
1328
|
-
|
1329
|
-
|
1330
|
-
|
1331
|
-
|
1332
|
-
|
1333
|
-
|
1334
|
-
|
1335
|
-
|
1336
|
-
|
1337
|
-
|
1338
|
-
|
1339
|
-
|
1340
|
-
|
1341
|
-
|
1342
|
-
|
1343
|
-
|
1344
|
-
|
1345
|
-
|
1346
|
-
|
1347
|
-
// load hparams
|
1348
|
-
{
|
1349
|
-
finp.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
|
1350
|
-
//finp.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
|
1351
|
-
finp.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
|
1352
|
-
finp.read((char *) &hparams.n_mult, sizeof(hparams.n_mult));
|
1353
|
-
finp.read((char *) &hparams.n_head, sizeof(hparams.n_head));
|
1354
|
-
finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
|
1355
|
-
finp.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
|
1356
|
-
finp.read((char *) &hparams.f16, sizeof(hparams.f16));
|
1357
|
-
|
1358
|
-
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
|
1359
|
-
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
|
1360
|
-
printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
|
1361
|
-
printf("%s: n_mult = %d\n", __func__, hparams.n_mult);
|
1362
|
-
printf("%s: n_head = %d\n", __func__, hparams.n_head);
|
1363
|
-
printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
|
1364
|
-
printf("%s: f16 = %d\n", __func__, hparams.f16);
|
1365
|
-
|
1366
|
-
fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
|
1367
|
-
//fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
|
1368
|
-
fout.write((char *) &hparams.n_embd, sizeof(hparams.n_embd));
|
1369
|
-
fout.write((char *) &hparams.n_mult, sizeof(hparams.n_mult));
|
1370
|
-
fout.write((char *) &hparams.n_head, sizeof(hparams.n_head));
|
1371
|
-
fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
|
1372
|
-
fout.write((char *) &hparams.n_rot, sizeof(hparams.n_rot));
|
1373
|
-
fout.write((char *) &itype, sizeof(hparams.f16));
|
1374
|
-
}
|
1375
|
-
|
1376
|
-
// load vocab
|
1377
|
-
{
|
1378
|
-
const int32_t n_vocab = hparams.n_vocab;
|
1379
|
-
|
1380
|
-
if (n_vocab != hparams.n_vocab) {
|
1381
|
-
fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
|
1382
|
-
__func__, fname_inp.c_str(), n_vocab, hparams.n_vocab);
|
1383
|
-
return false;
|
1384
|
-
}
|
1385
|
-
|
1386
|
-
std::vector<char> word(32);
|
1387
|
-
vocab.id_to_token.resize(n_vocab);
|
1388
|
-
for (int i = 0; i < n_vocab; i++) {
|
1389
|
-
uint32_t len;
|
1390
|
-
finp.read ((char *) &len, sizeof(len));
|
1391
|
-
fout.write((char *) &len, sizeof(len));
|
1392
|
-
|
1393
|
-
word.resize(len);
|
1394
|
-
finp.read ((char *) &word[0], len);
|
1395
|
-
fout.write((char *) &word[0], len);
|
1396
|
-
|
1397
|
-
float score;
|
1398
|
-
finp.read ((char *) &score, sizeof(score));
|
1399
|
-
fout.write((char *) &score, sizeof(score));
|
1400
|
-
|
1401
|
-
vocab.token_to_id[word.data()] = i;
|
1402
|
-
|
1403
|
-
auto &tok_score = vocab.id_to_token[i];
|
1404
|
-
tok_score.tok = word.data();
|
1405
|
-
tok_score.score = score;
|
1406
|
-
}
|
1407
|
-
}
|
1408
|
-
|
1409
|
-
// load weights
|
1410
|
-
{
|
1411
|
-
size_t total_size_org = 0;
|
1412
|
-
size_t total_size_new = 0;
|
1413
|
-
|
1414
|
-
std::vector<float> work;
|
1415
|
-
|
1416
|
-
std::vector<uint8_t> data_u8;
|
1417
|
-
std::vector<ggml_fp16_t> data_f16;
|
1418
|
-
std::vector<float> data_f32;
|
1419
|
-
|
1420
|
-
std::vector<int64_t> hist_all(1 << 4, 0);
|
1421
|
-
|
1422
|
-
while (true) {
|
1423
|
-
int32_t n_dims;
|
1424
|
-
int32_t length;
|
1425
|
-
int32_t ftype;
|
1426
|
-
|
1427
|
-
finp.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
|
1428
|
-
finp.read(reinterpret_cast<char *>(&length), sizeof(length));
|
1429
|
-
finp.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
|
1430
|
-
|
1431
|
-
if (finp.eof()) {
|
1432
|
-
break;
|
1433
|
-
}
|
1434
|
-
|
1435
|
-
int32_t nelements = 1;
|
1436
|
-
int32_t ne[2] = { 1, 1 };
|
1437
|
-
for (int i = 0; i < n_dims; ++i) {
|
1438
|
-
finp.read (reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
|
1439
|
-
nelements *= ne[i];
|
1440
|
-
}
|
1441
|
-
|
1442
|
-
std::string name(length, 0);
|
1443
|
-
finp.read (&name[0], length);
|
1444
|
-
|
1445
|
-
{
|
1446
|
-
// ensure tensor data is aligned
|
1447
|
-
uint64_t offset = finp.tellg();
|
1448
|
-
offset = (offset + 31) & -32;
|
1449
|
-
finp.seekg(offset);
|
1450
|
-
}
|
1451
|
-
|
1452
|
-
{
|
1453
|
-
static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
|
1454
|
-
printf("%48s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ftype_str[ftype]);
|
1455
|
-
}
|
1456
|
-
|
1457
|
-
// regexes of tensor names to be quantized
|
1458
|
-
const std::vector<std::string> k_names = {
|
1459
|
-
".*weight",
|
1460
|
-
};
|
1461
|
-
|
1462
|
-
bool quantize = false;
|
1463
|
-
for (const auto & s : k_names) {
|
1464
|
-
if (std::regex_match(name, std::regex(s))) {
|
1465
|
-
quantize = true;
|
1466
|
-
break;
|
1467
|
-
}
|
1468
|
-
}
|
1469
|
-
|
1470
|
-
// quantize only 2D tensors
|
1471
|
-
quantize &= (n_dims == 2);
|
1472
|
-
|
1473
|
-
if (quantize) {
|
1474
|
-
if (ftype != 0 && ftype != 1) {
|
1475
|
-
fprintf(stderr, "%s: unsupported ftype %d for integer quantization\n", __func__, ftype);
|
1476
|
-
return false;
|
1477
|
-
}
|
1478
|
-
|
1479
|
-
if (ftype == 1) {
|
1480
|
-
data_f16.resize(nelements);
|
1481
|
-
finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
|
1482
|
-
data_f32.resize(nelements);
|
1483
|
-
for (int i = 0; i < nelements; ++i) {
|
1484
|
-
data_f32[i] = ggml_fp16_to_fp32(data_f16[i]);
|
1485
|
-
}
|
1486
|
-
} else {
|
1487
|
-
data_f32.resize(nelements);
|
1488
|
-
finp.read(reinterpret_cast<char *>(data_f32.data()), nelements * sizeof(float));
|
1557
|
+
std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false,
|
1558
|
+
/*vocab_only*/ false));
|
1559
|
+
llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
|
1560
|
+
|
1561
|
+
size_t total_size_org = 0;
|
1562
|
+
size_t total_size_new = 0;
|
1563
|
+
std::vector<int64_t> hist_all(1 << 4, 0);
|
1564
|
+
|
1565
|
+
size_t idx = 0;
|
1566
|
+
for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) {
|
1567
|
+
llama_buffer read_data;
|
1568
|
+
read_data.resize(tensor.size);
|
1569
|
+
tensor.data = read_data.addr;
|
1570
|
+
model_loader->load_data_for(tensor);
|
1571
|
+
|
1572
|
+
printf("[%zu/%zu] %36s - %s, type = %6s, ",
|
1573
|
+
++idx, model_loader->tensors_map.tensors.size(),
|
1574
|
+
tensor.name.c_str(), llama_format_tensor_shape(tensor.ne).c_str(),
|
1575
|
+
ggml_type_name(tensor.type));
|
1576
|
+
|
1577
|
+
// This used to be a regex, but <regex> has an extreme cost to compile times.
|
1578
|
+
bool quantize = tensor.name.rfind("weight") == tensor.name.size() - 6; // ends with 'weight'?
|
1579
|
+
|
1580
|
+
// quantize only 2D tensors
|
1581
|
+
quantize &= (tensor.ne.size() == 2);
|
1582
|
+
|
1583
|
+
enum ggml_type new_type;
|
1584
|
+
void * new_data;
|
1585
|
+
size_t new_size;
|
1586
|
+
llama_buffer work;
|
1587
|
+
|
1588
|
+
if (!quantize) {
|
1589
|
+
new_type = tensor.type;
|
1590
|
+
new_data = tensor.data;
|
1591
|
+
new_size = tensor.size;
|
1592
|
+
printf("size = %8.3f MB\n", tensor.size/1024.0/1024.0);
|
1593
|
+
} else {
|
1594
|
+
new_type = quantized_type;
|
1595
|
+
float * f32_data;
|
1596
|
+
size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
|
1597
|
+
llama_buffer f32_conv_buf;
|
1598
|
+
if (tensor.type == GGML_TYPE_F32) {
|
1599
|
+
f32_data = (float *) tensor.data;
|
1600
|
+
} else if (tensor.type == GGML_TYPE_F16) {
|
1601
|
+
f32_conv_buf.resize(nelements * sizeof(float));
|
1602
|
+
f32_data = (float *) f32_conv_buf.addr;
|
1603
|
+
auto f16_data = (const ggml_fp16_t *) tensor.data;
|
1604
|
+
for (size_t i = 0; i < nelements; i++) {
|
1605
|
+
f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
|
1489
1606
|
}
|
1490
|
-
|
1491
|
-
ftype = itype;
|
1492
1607
|
} else {
|
1493
|
-
|
1494
|
-
|
1495
|
-
data_u8.resize(nelements*bpe);
|
1496
|
-
finp.read(reinterpret_cast<char *>(data_u8.data()), nelements * bpe);
|
1608
|
+
throw format("type %s unsupported for integer quantization", ggml_type_name(tensor.type));
|
1497
1609
|
}
|
1498
1610
|
|
1499
|
-
|
1500
|
-
|
1501
|
-
|
1502
|
-
|
1503
|
-
|
1611
|
+
printf("quantizing .. ");
|
1612
|
+
fflush(stdout);
|
1613
|
+
|
1614
|
+
work.resize(nelements * 4); // upper bound on size
|
1615
|
+
new_data = work.addr;
|
1616
|
+
std::vector<int64_t> hist_cur(1 << 4, 0);
|
1617
|
+
|
1618
|
+
switch (new_type) {
|
1619
|
+
case GGML_TYPE_Q4_0:
|
1620
|
+
{
|
1621
|
+
new_size = ggml_quantize_q4_0(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
|
1622
|
+
} break;
|
1623
|
+
case GGML_TYPE_Q4_1:
|
1624
|
+
{
|
1625
|
+
new_size = ggml_quantize_q4_1(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
|
1626
|
+
} break;
|
1627
|
+
default:
|
1628
|
+
LLAMA_ASSERT(false);
|
1504
1629
|
}
|
1505
|
-
fout.write(&name[0], length);
|
1506
1630
|
|
1507
|
-
|
1508
|
-
|
1509
|
-
|
1510
|
-
offset = (offset + 31) & -32;
|
1511
|
-
fout.seekp(offset);
|
1631
|
+
printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
|
1632
|
+
for (size_t i = 0; i < hist_cur.size(); i++) {
|
1633
|
+
hist_all[i] += hist_cur[i];
|
1512
1634
|
}
|
1513
1635
|
|
1514
|
-
|
1515
|
-
printf("
|
1516
|
-
work.resize(nelements); // for quantization
|
1517
|
-
|
1518
|
-
size_t cur_size = 0;
|
1519
|
-
std::vector<int64_t> hist_cur(1 << 4, 0);
|
1520
|
-
|
1521
|
-
switch (type) {
|
1522
|
-
case GGML_TYPE_Q4_0:
|
1523
|
-
{
|
1524
|
-
cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
|
1525
|
-
} break;
|
1526
|
-
case GGML_TYPE_Q4_1:
|
1527
|
-
{
|
1528
|
-
cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
|
1529
|
-
} break;
|
1530
|
-
default:
|
1531
|
-
{
|
1532
|
-
fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, type);
|
1533
|
-
return false;
|
1534
|
-
}
|
1535
|
-
}
|
1536
|
-
|
1537
|
-
fout.write(reinterpret_cast<char *>(work.data()), cur_size);
|
1538
|
-
total_size_new += cur_size;
|
1539
|
-
|
1540
|
-
printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
|
1541
|
-
for (int i = 0; i < (int) hist_cur.size(); ++i) {
|
1542
|
-
hist_all[i] += hist_cur[i];
|
1543
|
-
}
|
1544
|
-
|
1545
|
-
for (int i = 0; i < (int) hist_cur.size(); ++i) {
|
1546
|
-
printf("%5.3f ", hist_cur[i] / float(nelements));
|
1547
|
-
}
|
1548
|
-
printf("\n");
|
1549
|
-
} else {
|
1550
|
-
printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0);
|
1551
|
-
fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size());
|
1552
|
-
total_size_new += data_u8.size();
|
1636
|
+
for (size_t i = 0; i < hist_cur.size(); i++) {
|
1637
|
+
printf("%5.3f ", hist_cur[i] / float(nelements));
|
1553
1638
|
}
|
1554
|
-
|
1555
|
-
total_size_org += nelements * sizeof(float);
|
1639
|
+
printf("\n");
|
1556
1640
|
}
|
1641
|
+
total_size_org += tensor.size;
|
1642
|
+
total_size_new += new_size;
|
1643
|
+
file_saver.write_tensor(tensor, new_type, new_data, new_size);
|
1644
|
+
}
|
1557
1645
|
|
1558
|
-
|
1559
|
-
|
1646
|
+
printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
|
1647
|
+
printf("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
|
1560
1648
|
|
1561
|
-
|
1562
|
-
|
1563
|
-
|
1564
|
-
|
1565
|
-
|
1649
|
+
{
|
1650
|
+
int64_t sum_all = 0;
|
1651
|
+
for (size_t i = 0; i < hist_all.size(); i++) {
|
1652
|
+
sum_all += hist_all[i];
|
1653
|
+
}
|
1566
1654
|
|
1567
|
-
|
1568
|
-
|
1569
|
-
|
1570
|
-
}
|
1571
|
-
printf("\n");
|
1655
|
+
printf("%s: hist: ", __func__);
|
1656
|
+
for (size_t i = 0; i < hist_all.size(); i++) {
|
1657
|
+
printf("%5.3f ", hist_all[i] / float(sum_all));
|
1572
1658
|
}
|
1659
|
+
printf("\n");
|
1573
1660
|
}
|
1574
|
-
|
1575
|
-
finp.close();
|
1576
|
-
fout.close();
|
1577
|
-
|
1578
|
-
return true;
|
1579
1661
|
}
|
1580
1662
|
|
1581
1663
|
//
|
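The rewritten quantization loop above drops the regex-based tensor filter in favour of a plain suffix check (tensor.name.rfind("weight") == tensor.name.size() - 6), since pulling in <regex> was costly for compile times; the hard-coded 6 is simply the length of "weight". A minimal standalone sketch of an equivalent check, assuming nothing beyond the standard library; the ends_with helper and the sample tensor name are illustrative, not part of the diff:

    #include <string>

    // Illustrative helper: true when `name` ends with `suffix`.
    // Mirrors the rfind("weight") == size() - 6 test used above, but works for any suffix length.
    static bool ends_with(const std::string & name, const std::string & suffix) {
        return name.size() >= suffix.size() &&
               name.compare(name.size() - suffix.size(), suffix.size(), suffix) == 0;
    }

    // e.g. ends_with("layers.0.attention.wq.weight", "weight") -> true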
@@ -1593,32 +1675,36 @@ struct llama_context * llama_init_from_file(
|
|
1593
1675
|
params.seed = time(NULL);
|
1594
1676
|
}
|
1595
1677
|
|
1678
|
+
unsigned cur_percentage = 0;
|
1679
|
+
if (params.progress_callback == NULL) {
|
1680
|
+
params.progress_callback_user_data = &cur_percentage;
|
1681
|
+
params.progress_callback = [](float progress, void * ctx) {
|
1682
|
+
unsigned * cur_percentage_p = (unsigned *) ctx;
|
1683
|
+
unsigned percentage = (unsigned) (100 * progress);
|
1684
|
+
while (percentage > *cur_percentage_p) {
|
1685
|
+
++*cur_percentage_p;
|
1686
|
+
fprintf(stderr, ".");
|
1687
|
+
fflush(stderr);
|
1688
|
+
if (percentage >= 100) {
|
1689
|
+
fprintf(stderr, "\n");
|
1690
|
+
}
|
1691
|
+
}
|
1692
|
+
};
|
1693
|
+
}
|
1694
|
+
|
1596
1695
|
ctx->rng = std::mt19937(params.seed);
|
1597
1696
|
ctx->logits_all = params.logits_all;
|
1598
1697
|
|
1599
1698
|
ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
|
1600
1699
|
|
1601
|
-
if (!llama_model_load(path_model, *ctx, params.n_ctx,
|
1602
|
-
params.
|
1603
|
-
params.progress_callback_user_data)) {
|
1700
|
+
if (!llama_model_load(path_model, *ctx, params.n_ctx, memory_type,
|
1701
|
+
params.use_mmap, params.use_mlock, params.vocab_only,
|
1702
|
+
params.progress_callback, params.progress_callback_user_data)) {
|
1604
1703
|
fprintf(stderr, "%s: failed to load model\n", __func__);
|
1605
1704
|
llama_free(ctx);
|
1606
1705
|
return nullptr;
|
1607
1706
|
}
|
1608
1707
|
|
1609
|
-
if (params.use_mlock) {
|
1610
|
-
char *err;
|
1611
|
-
if (!ggml_mlock(ctx->model.ctx,
|
1612
|
-
ctx->model.mm_addr,
|
1613
|
-
ctx->model.mm_length,
|
1614
|
-
&err)) {
|
1615
|
-
fprintf(stderr, "%s\n", err);
|
1616
|
-
free(err);
|
1617
|
-
llama_free(ctx);
|
1618
|
-
return nullptr;
|
1619
|
-
}
|
1620
|
-
}
|
1621
|
-
|
1622
1708
|
// reserve memory for context buffers
|
1623
1709
|
if (!params.vocab_only) {
|
1624
1710
|
if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx)) {
|
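When no progress_callback is supplied, llama_init_from_file now installs the default shown above, which prints one dot per percentage point to stderr. A hedged sketch of passing a custom callback instead, using llama_context_default_params() from llama.h; the model path and the callback body are illustrative:

    #include "llama.h"
    #include <cstdio>

    int main() {
        llama_context_params params = llama_context_default_params();
        // a capture-less lambda converts to the llama_progress_callback function pointer
        params.progress_callback = [](float progress, void * /*user_data*/) {
            std::fprintf(stderr, "\rloading: %3d%%", (int)(100 * progress));
        };
        params.progress_callback_user_data = nullptr;

        struct llama_context * ctx = llama_init_from_file("models/7B/ggml-model-q4_0.bin", params);
        if (ctx == nullptr) {
            return 1;
        }
        llama_free(ctx);
        return 0;
    }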
@@ -1655,40 +1741,31 @@ struct llama_context * llama_init_from_file(
|
|
1655
1741
|
}
|
1656
1742
|
|
1657
1743
|
void llama_free(struct llama_context * ctx) {
|
1658
|
-
kv_cache_free(ctx->model.kv_self);
|
1659
|
-
|
1660
|
-
if (ctx->model.ctx) {
|
1661
|
-
ggml_free(ctx->model.ctx);
|
1662
|
-
}
|
1663
|
-
|
1664
|
-
if (ctx->model.mm_addr) {
|
1665
|
-
munmap_file(ctx->model.mm_addr, ctx->model.mm_length);
|
1666
|
-
}
|
1667
|
-
|
1668
1744
|
delete ctx;
|
1669
1745
|
}
|
1670
1746
|
|
1671
1747
|
int llama_model_quantize(
|
1672
1748
|
const char * fname_inp,
|
1673
1749
|
const char * fname_out,
|
1674
|
-
|
1675
|
-
|
1676
|
-
|
1750
|
+
enum llama_ftype ftype) {
|
1751
|
+
try {
|
1752
|
+
llama_model_quantize_internal(fname_inp, fname_out, ftype);
|
1753
|
+
return 0;
|
1754
|
+
} catch (const std::string & err) {
|
1755
|
+
fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str());
|
1677
1756
|
return 1;
|
1678
1757
|
}
|
1679
|
-
|
1680
|
-
return 0;
|
1681
1758
|
}
|
1682
1759
|
|
1683
1760
|
// Returns the KV cache that will contain the context for the
|
1684
1761
|
// ongoing prediction with the model.
|
1685
1762
|
const uint8_t * llama_get_kv_cache(struct llama_context * ctx) {
|
1686
|
-
return ctx->model.kv_self.buf.data();
|
1763
|
+
return ctx->model.kv_self.buf.addr;
|
1687
1764
|
}
|
1688
1765
|
|
1689
1766
|
// Returns the size of the KV cache
|
1690
1767
|
size_t llama_get_kv_cache_size(struct llama_context * ctx) {
|
1691
|
-
return ctx->model.kv_self.buf.size();
|
1768
|
+
return ctx->model.kv_self.buf.size;
|
1692
1769
|
}
|
1693
1770
|
|
1694
1771
|
int llama_get_kv_cache_token_count(struct llama_context * ctx) {
|
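llama_model_quantize now takes an enum llama_ftype and signals failure through its return value (errors thrown inside llama_model_quantize_internal are caught and reported to stderr). A minimal usage sketch; the file paths are placeholders and LLAMA_FTYPE_MOSTLY_Q4_0 is the Q4_0 constant from the llama_ftype enum:

    #include "llama.h"
    #include <cstdio>

    int main() {
        // returns 0 on success, 1 on failure (the reason is printed to stderr)
        const int rc = llama_model_quantize("models/7B/ggml-model-f16.bin",
                                            "models/7B/ggml-model-q4_0.bin",
                                            LLAMA_FTYPE_MOSTLY_Q4_0);
        if (rc != 0) {
            std::fprintf(stderr, "quantization failed\n");
        }
        return rc;
    }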
@@ -1702,8 +1779,8 @@ void llama_set_kv_cache(
|
|
1702
1779
|
size_t n_size,
|
1703
1780
|
int n_token_count) {
|
1704
1781
|
// Make sure we have the same kv cache setup
|
1705
|
-
LLAMA_ASSERT(ctx->model.kv_self.buf.size() == n_size);
|
1706
|
-
memcpy(ctx->model.kv_self.buf.data(), kv_cache, n_size);
|
1782
|
+
LLAMA_ASSERT(ctx->model.kv_self.buf.size == n_size);
|
1783
|
+
memcpy(ctx->model.kv_self.buf.addr, kv_cache, n_size);
|
1707
1784
|
ctx->model.kv_self.n = n_token_count;
|
1708
1785
|
}
|
1709
1786
|
|
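The KV-cache get/set accessors in the hunks above now read straight from the llama_buffer that backs the cache (buf.addr / buf.size). A hedged sketch of snapshotting and restoring the cache with these calls; the helper names are illustrative:

    #include "llama.h"
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Illustrative helpers around the accessors shown above.
    static void snapshot_kv(struct llama_context * ctx, std::vector<uint8_t> & out, int & n_tokens) {
        const uint8_t * src  = llama_get_kv_cache(ctx);
        const size_t    size = llama_get_kv_cache_size(ctx);
        out.assign(src, src + size);
        n_tokens = llama_get_kv_cache_token_count(ctx);
    }

    static void restore_kv(struct llama_context * ctx, const std::vector<uint8_t> & in, int n_tokens) {
        // the size must match the context's own KV buffer; the new code asserts this
        llama_set_kv_cache(ctx, in.data(), in.size(), n_tokens);
    }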
@@ -1814,9 +1891,9 @@ llama_token llama_sample_top_p_top_k(
|
|
1814
1891
|
void llama_print_timings(struct llama_context * ctx) {
|
1815
1892
|
const int64_t t_end_us = ggml_time_us();
|
1816
1893
|
|
1817
|
-
const int32_t n_sample = Max(1, ctx->n_sample);
|
1818
|
-
const int32_t n_eval = Max(1, ctx->n_eval);
|
1819
|
-
const int32_t n_p_eval = Max(1, ctx->n_p_eval);
|
1894
|
+
const int32_t n_sample = std::max(1, ctx->n_sample);
|
1895
|
+
const int32_t n_eval = std::max(1, ctx->n_eval);
|
1896
|
+
const int32_t n_p_eval = std::max(1, ctx->n_p_eval);
|
1820
1897
|
|
1821
1898
|
fprintf(stderr, "\n");
|
1822
1899
|
fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
|
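Clamping n_sample, n_eval and n_p_eval to at least 1 keeps the per-token averages in llama_print_timings from dividing by zero on a context that has not sampled or evaluated anything yet. A small illustration of the pattern, with generic names since the real timing fields live inside llama_context:

    #include <algorithm>
    #include <cstdint>

    // Clamp-then-divide, as the new timing code does.
    static double ms_per_token(int64_t t_us, int32_t n) {
        const int32_t n_safe = std::max(1, n);   // never divide by zero when n == 0
        return (t_us / 1000.0) / n_safe;
    }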
@@ -1852,3 +1929,8 @@ const char * llama_print_system_info(void) {
|
|
1852
1929
|
|
1853
1930
|
return s.c_str();
|
1854
1931
|
}
|
1932
|
+
|
1933
|
+
// For internal test use
|
1934
|
+
std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
|
1935
|
+
return ctx->model.tensors_by_name;
|
1936
|
+
}
|
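The new llama_internal_get_tensor_map hook returns the tensors_by_name vector kept on llama_model, so tests can inspect every tensor by name. A hedged sketch of walking it; the formatting is illustrative and the function's exact visibility in llama.h is assumed:

    #include "ggml.h"
    #include "llama.h"
    #include <cstdio>

    static void dump_tensor_map(struct llama_context * ctx) {
        // one entry per model tensor: {name, ggml_tensor *}
        for (const auto & kv : llama_internal_get_tensor_map(ctx)) {
            const struct ggml_tensor * t = kv.second;
            std::printf("%-40s n_dims = %d, type = %s\n",
                        kv.first.c_str(), t->n_dims, ggml_type_name(t->type));
        }
    }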