llama_cpp 0.0.3 → 0.0.4
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +18 -0
- data/README.md +2 -2
- data/ext/llama_cpp/extconf.rb +26 -0
- data/ext/llama_cpp/llama_cpp.cpp +58 -2
- data/ext/llama_cpp/src/ggml.c +735 -253
- data/ext/llama_cpp/src/ggml.h +74 -16
- data/ext/llama_cpp/src/llama.cpp +800 -718
- data/ext/llama_cpp/src/llama.h +25 -1
- data/ext/llama_cpp/src/llama_util.h +389 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +3 -0
- metadata +3 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -1,49 +1,30 @@
+// Defines fileno on msys:
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include "llama_util.h"
 #include "llama.h"

 #include "ggml.h"

+#include <array>
 #include <cinttypes>
 #include <fstream>
 #include <random>
 #include <map>
 #include <unordered_map>
 #include <queue>
-#include <regex>
 #include <cassert>
 #include <cstring>
-
-#if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
-#define WIN32_LEAN_AND_MEAN
-#include <Windows.h>
-#else
-#include <sys/types.h>
-#include <sys/mman.h>
-#include <unistd.h>
-#include <fcntl.h>
-#endif
-
-#define Min(X, Y) ((Y) > (X) ? (X) : (Y))
-#define Max(X, Y) ((Y) < (X) ? (X) : (Y))
+#include <climits>
+#include <memory>
+#include <algorithm>
+#include <initializer_list>

 #define LLAMA_USE_SCRATCH
 #define LLAMA_MAX_SCRATCH_BUFFERS 16

-#define LLAMA_ASSERT(x) \
-    do { \
-        if (!(x)) { \
-            fprintf(stderr, "LLAMA_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
-            abort(); \
-        } \
-    } while (0)
-
-
-// determine number of model parts based on the dimension
-static const std::unordered_map<int, int> LLAMA_N_PARTS = {
-    { 4096, 1 },
-    { 5120, 2 },
-    { 6656, 4 },
-    { 8192, 8 },
-};

 // available llama models
 enum e_model {
@@ -93,14 +74,18 @@ static const std::map<e_model, size_t> MEM_REQ_EVAL = {

 // default hparams (LLaMA 7B)
 struct llama_hparams {
-    int32_t n_vocab = 32000;
-    int32_t n_ctx   = 512;   // this is provided as user input?
-    int32_t n_embd  = 4096;
-    int32_t n_mult  = 256;
-    int32_t n_head  = 32;
-    int32_t n_layer = 32;
-    int32_t n_rot   = 64;
-    int32_t f16     = 1;
+    uint32_t n_vocab = 32000;
+    uint32_t n_ctx   = 512;   // this is provided as user input?
+    uint32_t n_embd  = 4096;
+    uint32_t n_mult  = 256;
+    uint32_t n_head  = 32;
+    uint32_t n_layer = 32;
+    uint32_t n_rot   = 64;
+    enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
+
+    bool operator!=(const llama_hparams & other) const {
+        return memcmp(this, &other, sizeof(llama_hparams));
+    }
 };

 struct llama_layer {
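The new `llama_hparams` gains an `operator!=` that compares the two structs byte-for-byte with `memcmp` (the nonzero result converts to `true`). A minimal standalone sketch of the same pattern, using a hypothetical `params` struct rather than the gem's own types:

```cpp
#include <cstdint>
#include <cstring>

// Hypothetical plain-scalar settings struct, analogous to llama_hparams.
struct params {
    uint32_t n_vocab = 32000;
    uint32_t n_embd  = 4096;

    // memcmp returns 0 only when every byte matches; the diff relies on
    // the implicit int -> bool conversion, made explicit here.
    bool operator!=(const params & other) const {
        return memcmp(this, &other, sizeof(params)) != 0;
    }
};

int main() {
    params a, b;
    b.n_embd = 5120;
    return (a != b) ? 0 : 1; // structs differ, so exit code 0
}
```

This is only safe because the struct holds scalar fields that the loader always writes; padding bytes with indeterminate values would make `memcmp` unreliable in the general case.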
@@ -126,11 +111,17 @@ struct llama_kv_cache {
     struct ggml_tensor * k;
     struct ggml_tensor * v;

-    struct ggml_context * ctx;
+    struct ggml_context * ctx = NULL;

-    std::vector<uint8_t> buf;
+    llama_buffer buf;

     int n; // number of tokens currently in the cache
+
+    ~llama_kv_cache() {
+        if (ctx) {
+            ggml_free(ctx);
+        }
+    }
 };

 struct llama_model {
@@ -146,22 +137,30 @@ struct llama_model {
     std::vector<llama_layer> layers;

     // context
-    struct ggml_context * ctx;
+    struct ggml_context * ctx = NULL;

     // key + value cache for the self attention
     // TODO: move to llama_state
     struct llama_kv_cache kv_self;

     // the model memory buffer
-    std::vector<uint8_t> buf;
+    llama_buffer buf;

     // model memory mapped file
-    void * mm_addr = NULL;
-    uint64_t mm_length = 0;
+    std::unique_ptr<llama_mmap> mapping;
+
+    // objects representing data potentially being locked in memory
+    llama_mlock mlock_buf;
+    llama_mlock mlock_mmap;
+
+    // for quantize-stats only
+    std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;

-    // tensors
-    int n_loaded;
-    std::unordered_map<std::string, struct ggml_tensor *> tensors;
+    ~llama_model() {
+        if (ctx) {
+            ggml_free(ctx);
+        }
+    }
 };

 struct llama_vocab {
@@ -206,8 +205,8 @@ struct llama_context {

     // memory buffers used to evaluate the model
     // TODO: move in llama_state
-    std::vector<uint8_t> buf_compute;
-    std::vector<uint8_t> buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
+    llama_buffer buf_compute;
+    llama_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];

     int    buf_last = 0;
     size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
@@ -220,11 +219,11 @@ struct llama_context {
             last_size = ggml_set_scratch(ctx, { 0, 0, nullptr, });
         } else {
             auto & buf = buf_scratch[i];
-            last_size = ggml_set_scratch(ctx, { 0, buf.size(), buf.data(), });
+            last_size = ggml_set_scratch(ctx, { 0, buf.size, buf.addr, });
         }

         if (buf_last >= 0) {
-            buf_max_size[buf_last] = Max(buf_max_size[buf_last], last_size);
+            buf_max_size[buf_last] = std::max(buf_max_size[buf_last], last_size);
         }

         buf_last = i;
@@ -244,6 +243,499 @@ struct llama_context {
     }
 };

+template <typename T>
+static T checked_mul(T a, T b) {
+    T ret = a * b;
+    if (a != 0 && ret / a != b) {
+        throw format("overflow multiplying %llu * %llu",
+                     (unsigned long long) a, (unsigned long long) b);
+    }
+    return ret;
+}
+
+static size_t checked_div(size_t a, size_t b) {
+    if (b == 0 || a % b != 0) {
+        throw format("error dividing %zu / %zu", a, b);
+    }
+    return a / b;
+}
+
+static std::string llama_format_tensor_shape(const std::vector<uint32_t> & ne) {
+    std::string ret = "[" + std::to_string(ne.at(0));
+    for (size_t i = 1; i < ne.size(); i++) {
+        ret += " x " + std::to_string(ne.at(i));
+    }
+    ret += "]";
+    return ret;
+}
+
+static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml_type type) {
+    size_t size = ggml_type_size(type);
+    for (uint32_t dim : ne) {
+        size = checked_mul<size_t>(size, dim);
+    }
+    return size / ggml_blck_size(type);
+}
+
+struct llama_load_tensor_shard {
+    std::vector<uint32_t> ne;
+    size_t size;
+    enum ggml_type type;
+    size_t file_idx;
+    size_t file_off;
+
+    void calc_size() {
+        size = llama_calc_tensor_size(ne, type);
+    }
+};
+
+enum llama_split_type {
+    SPLIT_NONE,
+    SPLIT_BY_COLUMNS,
+    SPLIT_BY_ROWS
+};
+
+struct llama_load_tensor {
+    std::vector<llama_load_tensor_shard> shards;
+
+    std::string name;
+    enum ggml_type type = GGML_TYPE_F32;
+    llama_split_type split_type = SPLIT_NONE;
+    std::vector<uint32_t> ne;
+    size_t size;
+    struct ggml_tensor * ggml_tensor = NULL;
+    uint8_t * data;
+
+    llama_load_tensor(const std::string & name) : name(name) {}
+
+    void calc_all() {
+        calc_type();
+        calc_split_type();
+        calc_ne();
+        calc_size();
+    }
+
+    void calc_type() {
+        const auto & first_shard = shards.at(0);
+        for (const auto & shard : shards) {
+            if (shard.type != first_shard.type) {
+                throw format("inconsistent tensor shard type in '%s'", name.c_str());
+            }
+        }
+        type = first_shard.type;
+    }
+
+    void calc_split_type() {
+        if (shards.at(0).ne.size() == 1 || // 1D tensors are just duplicated in every file
+            shards.size() == 1) { // only one file?
+            split_type = SPLIT_NONE;
+        } else if (name.find("tok_embeddings.") == 0 ||
+            name.find(".attention.wo.weight") != std::string::npos ||
+            name.find(".feed_forward.w2.weight") != std::string::npos) {
+            split_type = SPLIT_BY_COLUMNS;
+        } else {
+            split_type = SPLIT_BY_ROWS;
+        }
+    }
+
+    void calc_ne() {
+        const auto & first_shard = shards.at(0);
+        for (const auto & shard : shards) {
+            if (shard.ne != first_shard.ne) {
+                throw format("inconsistent tensor shard shape in '%s': first was %s, other was %s",
+                             name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str());
+            }
+        }
+        ne = first_shard.ne;
+        LLAMA_ASSERT(shards.size() <= UINT32_MAX);
+        uint32_t n_shards = (uint32_t) shards.size();
+        switch (split_type) {
+            case SPLIT_NONE:
+                ne = first_shard.ne;
+                break;
+            case SPLIT_BY_COLUMNS:
+                ne = {checked_mul<uint32_t>(first_shard.ne[0], n_shards),
+                      first_shard.ne[1]};
+                break;
+            case SPLIT_BY_ROWS:
+                ne = {first_shard.ne[0],
+                      checked_mul<uint32_t>(first_shard.ne[1], n_shards)};
+                break;
+        }
+    }
+
+    void calc_size() {
+        size = llama_calc_tensor_size(ne, type);
+    }
+};
+
+struct llama_load_tensors_map {
+    // tensors is kept in a separate vector to preserve file order
+    std::vector<llama_load_tensor> tensors;
+    std::unordered_map<std::string, size_t> name_to_idx;
+};
+
+enum llama_file_version {
+    LLAMA_FILE_VERSION_GGML,
+    LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab
+    LLAMA_FILE_VERSION_GGJT_V1, // added padding
+};
+
+struct llama_file_loader {
+    llama_file file;
+    llama_file_version file_version;
+    llama_hparams hparams;
+    llama_vocab vocab;
+
+    llama_file_loader(const char * fname, size_t file_idx, llama_load_tensors_map & tensors_map)
+        : file(fname, "rb") {
+        fprintf(stderr, "llama.cpp: loading model from %s\n", fname);
+        read_magic();
+        read_hparams();
+        read_vocab();
+        read_tensor_metadata(file_idx, tensors_map);
+    }
+    void read_magic() {
+        uint32_t magic = file.read_u32();
+        uint32_t version = 0;
+
+        if (magic != 'ggml') {
+            version = file.read_u32();
+        }
+
+        if (magic == 'ggml' && version == 0) {
+            file_version = LLAMA_FILE_VERSION_GGML;
+        } else if (magic == 'ggmf' && version == 1) {
+            file_version = LLAMA_FILE_VERSION_GGMF_V1;
+        } else if (magic == 'ggjt' && version == 1) {
+            file_version = LLAMA_FILE_VERSION_GGJT_V1;
+        } else {
+            throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
+                         magic, version);
+        }
+    }
+    void read_hparams() {
+        hparams.n_vocab = file.read_u32();
+        hparams.n_embd = file.read_u32();
+        hparams.n_mult = file.read_u32();
+        hparams.n_head = file.read_u32();
+        hparams.n_layer = file.read_u32();
+        hparams.n_rot = file.read_u32();
+        hparams.ftype = (enum llama_ftype) file.read_u32();
+    }
+    void read_vocab() {
+        vocab.id_to_token.resize(hparams.n_vocab);
+
+        for (uint32_t i = 0; i < hparams.n_vocab; i++) {
+            uint32_t len = file.read_u32();
+            std::string word = file.read_string(len);
+
+            float score = 0.0f;
+            if (file_version >= LLAMA_FILE_VERSION_GGMF_V1) {
+                file.read_raw(&score, sizeof(score));
+            }
+
+            vocab.token_to_id[word] = i;
+
+            auto & tok_score = vocab.id_to_token[i];
+            tok_score.tok = std::move(word);
+            tok_score.score = score;
+        }
+    }
+    void read_tensor_metadata(size_t file_idx, llama_load_tensors_map & tensors_map) {
+        while (file.tell() < file.size) {
+            llama_load_tensor_shard shard;
+            uint32_t n_dims = file.read_u32();
+            uint32_t name_len = file.read_u32();
+            shard.type = (enum ggml_type) file.read_u32();
+            shard.ne.resize(n_dims);
+            file.read_raw(shard.ne.data(), sizeof(shard.ne[0]) * n_dims);
+            std::string name = file.read_string(name_len);
+            if (n_dims < 1 || n_dims > 2) {
+                throw format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims);
+            }
+            switch (shard.type) {
+                case GGML_TYPE_F32:
+                case GGML_TYPE_F16:
+                case GGML_TYPE_Q4_0:
+                case GGML_TYPE_Q4_1:
+                    break;
+                default: {
+                    throw format("unrecognized tensor type %u\n", shard.type);
+                }
+            }
+
+            if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
+                // skip to the next multiple of 32 bytes
+                file.seek(-file.tell() & 31, SEEK_CUR);
+            }
+            shard.file_idx = file_idx;
+            shard.file_off = file.tell();
+
+            shard.calc_size();
+            file.seek(shard.size, SEEK_CUR);
+
+            auto it = tensors_map.name_to_idx.find(name);
+            size_t idx;
+            if (it != tensors_map.name_to_idx.end()) {
+                idx = it->second;
+            } else {
+                tensors_map.tensors.emplace_back(name);
+                idx = tensors_map.tensors.size() - 1;
+                tensors_map.name_to_idx.emplace(name, idx);
+            }
+            tensors_map.tensors.at(idx).shards.push_back(shard);
+        }
+    }
+};
+
+struct llama_file_saver {
+    llama_file file;
+    llama_file_loader * any_file_loader;
+    llama_file_saver(const char * fname, llama_file_loader * any_file_loader, enum llama_ftype new_ftype)
+        : file(fname, "wb"), any_file_loader(any_file_loader) {
+        fprintf(stderr, "llama.cpp: saving model to %s\n", fname);
+        write_magic();
+        write_hparams(new_ftype);
+        write_vocab();
+    }
+    void write_magic() {
+        file.write_u32('ggjt'); // magic
+        file.write_u32(1); // version
+    }
+    void write_hparams(enum llama_ftype new_ftype) {
+        const llama_hparams & hparams = any_file_loader->hparams;
+        file.write_u32(hparams.n_vocab);
+        file.write_u32(hparams.n_embd);
+        file.write_u32(hparams.n_mult);
+        file.write_u32(hparams.n_head);
+        file.write_u32(hparams.n_layer);
+        file.write_u32(hparams.n_rot);
+        file.write_u32(new_ftype);
+    }
+    void write_vocab() {
+        if (any_file_loader->file_version == LLAMA_FILE_VERSION_GGML) {
+            fprintf(stderr, "llama.cpp: WARNING: input is an old file that doesn't have scores; will add dummy scores\n");
+        }
+        uint32_t n_vocab = any_file_loader->hparams.n_vocab;
+        for (uint32_t i = 0; i < n_vocab; i++) {
+            const auto & token_score = any_file_loader->vocab.id_to_token.at(i);
+            file.write_u32((uint32_t) token_score.tok.size());
+            file.write_raw(token_score.tok.data(), token_score.tok.size());
+            file.write_raw(&token_score.score, sizeof(token_score.score));
+        }
+    }
+    void write_tensor(llama_load_tensor & tensor, enum ggml_type new_type, const void * new_data, size_t new_size) {
+        switch (new_type) {
+            case GGML_TYPE_F32:
+            case GGML_TYPE_F16:
+            case GGML_TYPE_Q4_0:
+            case GGML_TYPE_Q4_1:
+                break;
+            default: LLAMA_ASSERT(false);
+        }
+        file.write_u32((uint32_t) tensor.ne.size());
+        file.write_u32((uint32_t) tensor.name.size());
+        file.write_u32(new_type);
+        file.write_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * tensor.ne.size());
+        file.write_raw(tensor.name.data(), tensor.name.size());
+        file.seek(-file.tell() & 31, SEEK_CUR);
+        LLAMA_ASSERT(new_size == llama_calc_tensor_size(tensor.ne, new_type));
+        file.write_raw(new_data, new_size);
+    }
+};
+
+struct llama_model_loader {
+    std::vector<std::unique_ptr<llama_file_loader>> file_loaders;
+    llama_load_tensors_map tensors_map;
+    bool use_mmap;
+    size_t num_ggml_tensors_created = 0;
+    struct ggml_context * ggml_ctx = NULL;
+    std::unique_ptr<llama_mmap> mapping;
+
+    llama_model_loader(const std::string & fname_base, bool use_mmap, bool vocab_only) {
+        auto first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
+        file_loaders.emplace_back(first_file);
+        uint32_t n_parts = vocab_only ? 1 : guess_n_parts();
+        for (uint32_t i = 1; i < n_parts; i++) {
+            std::string fname = fname_base + "." + std::to_string(i);
+            auto ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
+            file_loaders.emplace_back(ith_file);
+            if (ith_file->hparams != first_file->hparams) {
+                throw format("llama.cpp: hparams inconsistent between files");
+            }
+        }
+        if (!llama_mmap::SUPPORTED) {
+            use_mmap = false;
+        }
+        if (use_mmap && alignment_prevents_mmap()) {
+            fprintf(stderr, "llama.cpp: can't use mmap because tensors are not aligned; convert to new format to avoid this\n");
+            use_mmap = false;
+        }
+        this->use_mmap = use_mmap;
+        for (llama_load_tensor & lt : tensors_map.tensors) {
+            lt.calc_all();
+        }
+    }
+
+    bool alignment_prevents_mmap() {
+        for (const llama_load_tensor & lt : tensors_map.tensors) {
+            for (const llama_load_tensor_shard & shard : lt.shards) {
+                if (shard.file_off & 3) {
+                    return true;
+                }
+            }
+        }
+        return false;
+    }
+
+    uint32_t guess_n_parts() const {
+        auto it = tensors_map.name_to_idx.find("tok_embeddings.weight");
+        if (it == tensors_map.name_to_idx.end()) {
+            throw std::string("missing tok_embeddings.weight");
+        }
+        const llama_load_tensor & lt = tensors_map.tensors.at(it->second);
+        return file_loaders.at(0)->hparams.n_embd / lt.shards.at(0).ne.at(0);
+    }
+
+    void calc_sizes(size_t * ctx_size_p, size_t * mmapped_size_p) const {
+        *ctx_size_p = *mmapped_size_p = 0;
+        for (const llama_load_tensor & lt : tensors_map.tensors) {
+            *ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE;
+            *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size;
+        }
+    }
+
+    struct ggml_tensor * get_tensor(const std::string & name, std::vector<uint32_t> ne) {
+        auto it = tensors_map.name_to_idx.find(name);
+        if (it == tensors_map.name_to_idx.end()) {
+            throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
+        }
+        llama_load_tensor & lt = tensors_map.tensors.at(it->second);
+        if (lt.ne != ne) {
+            throw format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
+                         name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
+        }
+        return get_tensor_for(lt);
+    }
+
+    struct ggml_tensor * get_tensor_for(llama_load_tensor & lt) {
+        struct ggml_tensor * tensor;
+        if (lt.ne.size() == 2) {
+            tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
+        } else {
+            LLAMA_ASSERT(lt.ne.size() == 1);
+            tensor = ggml_new_tensor_1d(ggml_ctx, lt.type, lt.ne.at(0));
+        }
+        LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
+        lt.ggml_tensor = tensor;
+        num_ggml_tensors_created++;
+        return tensor;
+    }
+
+    void done_getting_tensors() {
+        if (num_ggml_tensors_created != tensors_map.tensors.size()) {
+            throw std::string("llama.cpp: file contained more tensors than expected");
+        }
+    }
+
+    void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
+        size_t data_size = 0;
+        for (const llama_load_tensor & lt : tensors_map.tensors) {
+            data_size += lt.size;
+        }
+
+        if (use_mmap) {
+            mapping.reset(new llama_mmap(&file_loaders.at(0)->file));
+            if (!lmlock) {
+                // Don't call the callback since the actual loading will be lazy
+                // and we can't measure it.
+                progress_callback = NULL;
+            }
+            if (lmlock) {
+                lmlock->init(mapping->addr);
+            }
+        }
+
+        size_t done_size = 0;
+        for (llama_load_tensor & lt : tensors_map.tensors) {
+            if (progress_callback) {
+                progress_callback((float) done_size / data_size, progress_callback_user_data);
+            }
+            LLAMA_ASSERT(lt.ggml_tensor); // unused tensors should have been caught by load_data already
+            lt.data = (uint8_t *) lt.ggml_tensor->data;
+            load_data_for(lt);
+            lt.ggml_tensor->data = lt.data;
+            done_size += lt.size;
+            if (use_mmap && lmlock) {
+                lmlock->grow_to(done_size);
+            }
+        }
+        if (progress_callback) {
+            progress_callback(1.0f, progress_callback_user_data);
+        }
+    }
+
+    void load_data_for(llama_load_tensor & lt) {
+        if (use_mmap) {
+            LLAMA_ASSERT(lt.shards.size() == 1);
+            lt.data = (uint8_t *) mapping->addr + lt.shards.at(0).file_off;
+        } else if (lt.split_type == SPLIT_NONE) {
+            llama_file & file = file_loaders.at(lt.shards.at(0).file_idx)->file;
+            file.seek(lt.shards.at(0).file_off, SEEK_SET);
+            file.read_raw(lt.data, lt.size);
+        } else if (lt.split_type == SPLIT_BY_ROWS) {
+            size_t offset = 0;
+            for (llama_load_tensor_shard & shard : lt.shards) {
+                llama_file & file = file_loaders.at(shard.file_idx)->file;
+                file.seek(shard.file_off, SEEK_SET);
+                file.read_raw(lt.data + offset, shard.size);
+                offset += shard.size;
+            }
+            LLAMA_ASSERT(offset == lt.size);
+        } else if (lt.split_type == SPLIT_BY_COLUMNS) {
+            // Let's load the data into temporary buffers to ensure the OS performs large loads.
+            std::vector<llama_buffer> tmp_bufs;
+            tmp_bufs.resize(lt.shards.size());
+            for (size_t i = 0; i < lt.shards.size(); i++) {
+                llama_load_tensor_shard & shard = lt.shards.at(i);
+                llama_file & file = file_loaders.at(shard.file_idx)->file;
+                file.seek(shard.file_off, SEEK_SET);
+                tmp_bufs.at(i).resize(shard.size);
+                file.read_raw(tmp_bufs.at(i).addr, shard.size);
+            }
+            // Then reshape.
+            size_t num_rows = lt.ne.at(1);
+            size_t per_shard_row_size = lt.shards.at(0).size / num_rows;
+            size_t out_offset = 0;
+            for (size_t row = 0; row < num_rows; row++) {
+                for (llama_buffer & tmp_buf : tmp_bufs) {
+                    memcpy(lt.data + out_offset,
+                           tmp_buf.addr + row * per_shard_row_size,
+                           per_shard_row_size);
+                    out_offset += per_shard_row_size;
+                }
+            }
+            LLAMA_ASSERT(out_offset == lt.size);
+        }
+        if (0) {
+            print_checksum(lt);
+        }
+    }
+
+    static void print_checksum(llama_load_tensor & lt) {
+        uint32_t sum = 0;
+        for (size_t i = 0; i < lt.size; i++) {
+            uint8_t byte = lt.data[i];
+            sum = byte + (sum << 6) + (sum << 16) - sum; // sdbm hash
+        }
+        fprintf(stderr, "%s checksum: %#08x (%s, size %zu)\n", lt.name.c_str(), sum,
+                llama_format_tensor_shape(lt.ne).c_str(), lt.size);
+    }
+
+};
+
+
 //
 // kv cache
 //
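`checked_mul` above detects wraparound by dividing the product back: if `a != 0` and `ret / a != b`, the multiplication overflowed. A related bit trick appears in the 32-byte padding seek, `file.seek(-file.tell() & 31, SEEK_CUR)`, where `-x & 31` yields exactly the byte count to the next multiple of 32. A self-contained sketch of the overflow check (plain C++, independent of the loader's `format()` helper):

```cpp
#include <cstdint>
#include <cstdio>
#include <stdexcept>
#include <string>

// Same test as checked_mul above: if a != 0 and ret / a != b,
// the unsigned multiplication wrapped around.
template <typename T>
static T checked_mul(T a, T b) {
    T ret = a * b;
    if (a != 0 && ret / a != b) {
        throw std::runtime_error("overflow multiplying " +
            std::to_string((unsigned long long) a) + " * " +
            std::to_string((unsigned long long) b));
    }
    return ret;
}

int main() {
    try {
        printf("ok = %u\n", checked_mul<uint32_t>(4096u, 8u)); // fits in 32 bits
        checked_mul<uint32_t>(1u << 20, 1u << 20);             // 2^40 wraps; throws
    } catch (const std::runtime_error & e) {
        printf("caught: %s\n", e.what());
    }
    return 0;
}
```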
@@ -262,8 +754,8 @@ static bool kv_cache_init(
     cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);

     struct ggml_init_params params;
-    params.mem_size   = cache.buf.size();
-    params.mem_buffer = cache.buf.data();
+    params.mem_size   = cache.buf.size;
+    params.mem_buffer = cache.buf.addr;
     params.no_alloc   = false;

     cache.ctx = ggml_init(params);
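For scale, the buffer resized above works out as follows, assuming the element count `n_elements = n_embd * n_ctx * n_layer` that llama.cpp computes just before this call (not shown in this hunk); the factor of 2 covers the K and V tensors, and the numbers are the 7B defaults from the hparams earlier in this diff:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    // 7B defaults from llama_hparams above; n_ctx = 512 is the default user input.
    const uint64_t n_embd = 4096, n_ctx = 512, n_layer = 32;
    const uint64_t n_elements = n_embd * n_ctx * n_layer; // assumed formula, not shown in this hunk
    const uint64_t f16_size = 2;                          // bytes per element for an F16 cache
    const uint64_t MB = 1024 * 1024;
    const uint64_t bytes = 2 * n_elements * f16_size + 2 * MB;
    printf("kv cache buffer: %llu MiB\n", (unsigned long long) (bytes / MB)); // ~258 MiB
    return 0;
}
```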
@@ -279,13 +771,6 @@ static bool kv_cache_init(
     return true;
 }

-static void kv_cache_free(struct llama_kv_cache & cache) {
-    if (cache.ctx) {
-        ggml_free(cache.ctx);
-        cache.ctx = nullptr;
-    }
-}
-
 struct llama_context_params llama_context_default_params() {
     struct llama_context_params result = {
         /*.n_ctx                       =*/ 512,
@@ -294,6 +779,7 @@ struct llama_context_params llama_context_default_params() {
         /*.f16_kv                      =*/ false,
         /*.logits_all                  =*/ false,
         /*.vocab_only                  =*/ false,
+        /*.use_mmap                    =*/ true,
         /*.use_mlock                   =*/ false,
         /*.embedding                   =*/ false,
         /*.progress_callback           =*/ nullptr,
@@ -303,243 +789,106 @@ struct llama_context_params llama_context_default_params() {
     return result;
 }

+bool llama_mmap_supported() {
+    return llama_mmap::SUPPORTED;
+}
+
+bool llama_mlock_supported() {
+    return llama_mlock::SUPPORTED;
+}
+
 //
 // model loading
 //

-static void * mmap_file(const char * fname, uint64_t * mm_length) {
-#if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
-    HANDLE hFile = CreateFileA(fname,
-                               GENERIC_READ,
-                               FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
-                               NULL,
-                               OPEN_EXISTING,
-                               FILE_ATTRIBUTE_NORMAL | FILE_ATTRIBUTE_NOT_CONTENT_INDEXED,
-                               NULL);
-    if (hFile == INVALID_HANDLE_VALUE) return 0;
-    LARGE_INTEGER fileSize;
-    fileSize.QuadPart = -1;
-    GetFileSizeEx(hFile, &fileSize);
-    int64_t length = fileSize.QuadPart;
-    HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
-    CloseHandle(hFile);
-    if (!hMapping) return 0;
-    void *addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
-    CloseHandle(hMapping);
-    if (!addr) return 0;
-#else
-    int fd = open(fname, O_RDONLY);
-    if (fd == -1) return 0;
-    int64_t length = lseek(fd, 0, SEEK_END);
-    void *addr = mmap(NULL, length, PROT_READ, MAP_SHARED, fd, 0);
-    close(fd);
-    if (addr == MAP_FAILED) return 0;
-#endif
-    *mm_length = length;
-    return addr;
+static const char *llama_file_version_name(llama_file_version version) {
+    switch (version) {
+        case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)";
+        case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
+        case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (latest)";
+        default: LLAMA_ASSERT(false);
+    }
 }

-static void munmap_file(void * addr, size_t length) {
-#if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
-    UnmapViewOfFile(addr);
-#else
-    munmap(addr, length);
-#endif
+static const char *llama_ftype_name(enum llama_ftype ftype) {
+    switch (ftype) {
+        case LLAMA_FTYPE_ALL_F32:     return "all F32";
+        case LLAMA_FTYPE_MOSTLY_F16:  return "mostly F16";
+        case LLAMA_FTYPE_MOSTLY_Q4_0: return "mostly Q4_0";
+        case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
+        case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
+                                      return "mostly Q4_1, some F16";
+        default:                      return "unknown, may not work";
+    }
 }

-static bool report_bad_magic(const char *path, uint32_t got, uint32_t want) {
-    fprintf(stderr,
-            "%s: invalid model file (bad magic [got %#x want %#x])\n"
-            "\tyou most likely need to regenerate your ggml files\n"
-            "\tthe benefit is you'll get 10-100x faster load times\n"
-            "\tsee https://github.com/ggerganov/llama.cpp/issues/91\n"
-            "\tuse convert-pth-to-ggml.py to regenerate from original pth\n"
-            "\tuse migrate-ggml-2023-03-30-pr613.py if you deleted originals\n",
-            path, got, want);
-    return false;
+static const char *llama_model_type_name(e_model type) {
+    switch (type) {
+        case MODEL_7B: return "7B";
+        case MODEL_13B: return "13B";
+        case MODEL_30B: return "30B";
+        case MODEL_65B: return "65B";
+        default: LLAMA_ASSERT(false);
+    }
 }

-static bool llama_model_load(
+static void llama_model_load_internal(
         const std::string & fname,
         llama_context & lctx,
         int n_ctx,
-        int n_parts,
         ggml_type memory_type,
+        bool use_mmap,
+        bool use_mlock,
         bool vocab_only,
         llama_progress_callback progress_callback,
-        void *progress_callback_user_data) {
-    fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
+        void * progress_callback_user_data) {

     lctx.t_start_us = ggml_time_us();

-    auto & model = lctx.model;
-    auto & vocab = lctx.vocab;
-
-    auto fin = std::ifstream(fname, std::ios::binary);
-    if (!fin) {
-        fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
-        return false;
-    }
-
-    std::vector<char> f_buf(1024*1024);
-    fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
+    std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap, vocab_only));

-    fin.seekg(0, fin.end);
-    const size_t file_size = fin.tellg();
-    fin.seekg(0);
+    lctx.vocab = std::move(ml->file_loaders.at(0)->vocab);
+    auto & model = lctx.model;
+    model.hparams = ml->file_loaders.at(0)->hparams;
+    llama_file_version file_version = ml->file_loaders.at(0)->file_version;
+    auto & hparams = model.hparams;
+    uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;

-    // verify magic
     {
-        uint32_t magic;
-        fin.read((char *) &magic, sizeof(magic));
-        if (magic == LLAMA_FILE_MAGIC_UNVERSIONED) {
-            fprintf(stderr, "%s: invalid model file '%s' (too old, regenerate your model files or convert them with convert-unversioned-ggml-to-ggml.py!)\n",
-                    __func__, fname.c_str());
-            return false;
+        switch (hparams.n_layer) {
+            case 32: model.type = e_model::MODEL_7B; break;
+            case 40: model.type = e_model::MODEL_13B; break;
+            case 60: model.type = e_model::MODEL_30B; break;
+            case 80: model.type = e_model::MODEL_65B; break;
         }
-        if (magic != LLAMA_FILE_MAGIC) {
-            return report_bad_magic(fname.c_str(), magic, LLAMA_FILE_MAGIC);
-        }
-
-        uint32_t format_version;
-        fin.read((char *) &format_version, sizeof(format_version));
-
-        if (format_version != LLAMA_FILE_VERSION) {
-            fprintf(stderr, "%s: invalid model file '%s' (unsupported format version %" PRIu32 ", expected %d)\n",
-                    __func__, fname.c_str(), format_version, LLAMA_FILE_VERSION);
-            return false;
-        }
-    }
-
-    int n_ff = 0;
-
-    // load hparams
-    {
-        auto & hparams = model.hparams;
-
-        fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
-        //fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
-        fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
-        fin.read((char *) &hparams.n_mult, sizeof(hparams.n_mult));
-        fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
-        fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-        fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
-        fin.read((char *) &hparams.f16, sizeof(hparams.f16));

         hparams.n_ctx = n_ctx;
-
-        n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
-
-        if (n_parts < 1) {
-            n_parts = LLAMA_N_PARTS.at(hparams.n_embd);
-        }
-
-        // temp warning to tell the user to use "--n_parts"
-        if (hparams.f16 == 4 && n_parts != 1) {
-            fprintf(stderr, "%s: GPTQ model detected - are you sure n_parts should be %d? we normally expect it to be 1\n", __func__, n_parts);
-            fprintf(stderr, "%s: use '--n_parts 1' if necessary\n", __func__);
-        }
-
-        if (hparams.n_layer == 32) {
-            model.type = e_model::MODEL_7B;
-        }
-
-        if (hparams.n_layer == 40) {
-            model.type = e_model::MODEL_13B;
-        }
-
-        if (hparams.n_layer == 60) {
-            model.type = e_model::MODEL_30B;
-        }
-
-        if (hparams.n_layer == 80) {
-            model.type = e_model::MODEL_65B;
-        }
-
-        fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab);
-        fprintf(stderr, "%s: n_ctx   = %d\n", __func__, hparams.n_ctx);
-        fprintf(stderr, "%s: n_embd  = %d\n", __func__, hparams.n_embd);
-        fprintf(stderr, "%s: n_mult  = %d\n", __func__, hparams.n_mult);
-        fprintf(stderr, "%s: n_head  = %d\n", __func__, hparams.n_head);
-        fprintf(stderr, "%s: n_layer = %d\n", __func__, hparams.n_layer);
-        fprintf(stderr, "%s: n_rot   = %d\n", __func__, hparams.n_rot);
-        fprintf(stderr, "%s: f16     = %d\n", __func__, hparams.f16);
-        fprintf(stderr, "%s: n_ff    = %d\n", __func__, n_ff);
-        fprintf(stderr, "%s: n_parts = %d\n", __func__, n_parts);
-        fprintf(stderr, "%s: type    = %d\n", __func__, model.type);
     }

-    // load vocab
     {
-        std::string word;
-        vocab.id_to_token.resize(model.hparams.n_vocab);
-        std::vector<char> tmp(64);
-
-        for (int i = 0; i < model.hparams.n_vocab; i++) {
-            uint32_t len;
-            fin.read((char *) &len, sizeof(len));
-
-            word.resize(len);
-            if (len > 0) {
-                tmp.resize(len);
-                fin.read(tmp.data(), len);
-                word.assign(tmp.data(), len);
-            } else {
-                word.clear();
-            }
-
-            float score;
-            fin.read((char *) &score, sizeof(score));
-
-            vocab.token_to_id[word] = i;
-
-            auto &tok_score = vocab.id_to_token[i];
-            tok_score.tok = word;
-            tok_score.score = score;
-        }
+        fprintf(stderr, "%s: format     = %s\n",  __func__, llama_file_version_name(file_version));
+        fprintf(stderr, "%s: n_vocab    = %u\n",  __func__, hparams.n_vocab);
+        fprintf(stderr, "%s: n_ctx      = %u\n",  __func__, hparams.n_ctx);
+        fprintf(stderr, "%s: n_embd     = %u\n",  __func__, hparams.n_embd);
+        fprintf(stderr, "%s: n_mult     = %u\n",  __func__, hparams.n_mult);
+        fprintf(stderr, "%s: n_head     = %u\n",  __func__, hparams.n_head);
+        fprintf(stderr, "%s: n_layer    = %u\n",  __func__, hparams.n_layer);
+        fprintf(stderr, "%s: n_rot      = %u\n",  __func__, hparams.n_rot);
+        fprintf(stderr, "%s: ftype      = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
+        fprintf(stderr, "%s: n_ff       = %u\n",  __func__, n_ff);
+        fprintf(stderr, "%s: n_parts    = %zu\n", __func__, ml->file_loaders.size());
+        fprintf(stderr, "%s: model size = %s\n",  __func__, llama_model_type_name(model.type));
     }

     if (vocab_only) {
-        return true;
-    }
-
-    // for the big tensors, we have the option to store the data in 16-bit floats or quantized
-    // in order to save memory and also to speed up the computation
-    // wtype is for per-layer weights, while vtype is for other weights
-    ggml_type wtype, vtype;
-    switch (model.hparams.f16) {
-        case 0: wtype = vtype = GGML_TYPE_F32;  break;
-        case 1: wtype = vtype = GGML_TYPE_F16;  break;
-        case 2: wtype = vtype = GGML_TYPE_Q4_0; break;
-        case 3: wtype = vtype = GGML_TYPE_Q4_1; break;
-        case 4: wtype = GGML_TYPE_Q4_1; vtype = GGML_TYPE_F16; break;
-        default:
-                {
-                    fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n",
-                            __func__, fname.c_str(), model.hparams.f16);
-                    return false;
-                }
-    }
-
-    // map model into memory
-    char *mm_addr = NULL;
-    model.mm_addr = mmap_file(fname.c_str(), &model.mm_length);
-    if (model.mm_addr == NULL) {
-        fprintf(stderr, "%s: failed to mmap '%s'\n", __func__, fname.c_str());
-        return false;
+        return;
     }
-    mm_addr = (char *)model.mm_addr;
-    fprintf(stderr, "%s: ggml map size = %6.2f MB\n", __func__, model.mm_length/(1024.0*1024.0));

     auto & ctx = model.ctx;

-    size_t ctx_size = 0;
-    {
-        const auto & hparams = model.hparams;
-        const int n_layer = hparams.n_layer;
-        ctx_size += (5 + 10*n_layer)*256; // object overhead
-        fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0);
-    }
+    size_t ctx_size, mmapped_size;
+    ml->calc_sizes(&ctx_size, &mmapped_size);
+    fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0);

     // print memory requirements
     {
@@ -548,7 +897,7 @@ static bool llama_model_load(
         // this is the total memory required to run the inference
         const size_t mem_required =
             ctx_size +
-            model.mm_length +
+            mmapped_size +
             MEM_REQ_SCRATCH0.at(model.type) +
             MEM_REQ_SCRATCH1.at(model.type) +
             MEM_REQ_EVAL.at    (model.type);
@@ -564,17 +913,20 @@ static bool llama_model_load(
     // create the ggml context
     {
         lctx.model.buf.resize(ctx_size);
+        if (use_mlock) {
+            lctx.model.mlock_buf.init(lctx.model.buf.addr);
+            lctx.model.mlock_buf.grow_to(lctx.model.buf.size);
+        }

         struct ggml_init_params params = {
-            /*.mem_size   =*/ lctx.model.buf.size(),
-            /*.mem_buffer =*/ lctx.model.buf.data(),
-            /*.no_alloc   =*/ true,
+            /*.mem_size   =*/ lctx.model.buf.size,
+            /*.mem_buffer =*/ lctx.model.buf.addr,
+            /*.no_alloc   =*/ ml->use_mmap,
         };

         model.ctx = ggml_init(params);
         if (!model.ctx) {
-            fprintf(stderr, "%s: ggml_init() failed\n", __func__);
-            return false;
+            throw format("ggml_init() failed");
         }
     }

@@ -582,161 +934,71 @@ static bool llama_model_load(
     {
         const auto & hparams = model.hparams;

-        const int n_embd  = hparams.n_embd;
-        const int n_layer = hparams.n_layer;
-        const int n_vocab = hparams.n_vocab;
-
-        model.layers.resize(n_layer);
-
-        model.tok_embeddings = ggml_new_tensor_2d(ctx, vtype, n_embd, n_vocab);
-
-        model.norm   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-        model.output = ggml_new_tensor_2d(ctx, vtype,         n_embd, n_vocab);
+        const uint32_t n_embd  = hparams.n_embd;
+        const uint32_t n_layer = hparams.n_layer;
+        const uint32_t n_vocab = hparams.n_vocab;

-        // map by name
-        model.tensors["tok_embeddings.weight"] = model.tok_embeddings;
+        ml->ggml_ctx = ctx;

-        model.tensors["norm.weight"]   = model.norm;
-        model.tensors["output.weight"] = model.output;
+        model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab});
+        model.norm           = ml->get_tensor("norm.weight",           {n_embd});
+        model.output         = ml->get_tensor("output.weight",         {n_embd, n_vocab});

-        for (int i = 0; i < n_layer; ++i) {
+        model.layers.resize(n_layer);
+        for (uint32_t i = 0; i < n_layer; ++i) {
             auto & layer = model.layers[i];

-            layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-
-            layer.wq = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
-            layer.wk = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
-            layer.wv = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
-            layer.wo = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
-
-            layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-
-            layer.w1 = ggml_new_tensor_2d(ctx, wtype, n_embd,   n_ff);
-            layer.w2 = ggml_new_tensor_2d(ctx, wtype,   n_ff, n_embd);
-            layer.w3 = ggml_new_tensor_2d(ctx, wtype, n_embd,   n_ff);
+            std::string layers_i = "layers." + std::to_string(i);

-            // map by name
-            model.tensors["layers." + std::to_string(i) + ".attention_norm.weight"] = layer.attention_norm;
+            layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd});

-            model.tensors["layers." + std::to_string(i) + ".attention.wq.weight"] = layer.wq;
-            model.tensors["layers." + std::to_string(i) + ".attention.wk.weight"] = layer.wk;
-            model.tensors["layers." + std::to_string(i) + ".attention.wv.weight"] = layer.wv;
-            model.tensors["layers." + std::to_string(i) + ".attention.wo.weight"] = layer.wo;
+            layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd});
+            layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd});
+            layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd});
+            layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd});

-            model.tensors["layers." + std::to_string(i) + ".ffn_norm.weight"] = layer.ffn_norm;
+            layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd});

-            model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.weight"] = layer.w1;
-            model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight"] = layer.w2;
-            model.tensors["layers." + std::to_string(i) + ".feed_forward.w3.weight"] = layer.w3;
+            layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd,   n_ff});
+            layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", {  n_ff, n_embd});
+            layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd,   n_ff});
         }
     }

-    std::vector<uint8_t> tmp;
+    ml->done_getting_tensors();

-    if (progress_callback) {
-        progress_callback(0.0, progress_callback_user_data);
+    // populate `tensors_by_name`
+    for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+        model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
     }

-    fprintf(stderr, "%s: loading tensors from '%s'\n", __func__, fname.c_str());
+    ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);

-
-    {
-        size_t total_size = 0;
-        model.n_loaded = 0;
-
-        while (true) {
-            int32_t n_dims;
-            int32_t length;
-            int32_t ftype;
-
-            fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
-            fin.read(reinterpret_cast<char *>(&length), sizeof(length));
-            fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
-
-            if (fin.eof()) {
-                break;
-            }
-
-            int32_t nelements = 1;
-            int32_t ne[2] = { 1, 1 };
-            for (int i = 0; i < n_dims; ++i) {
-                fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
-                nelements *= ne[i];
-            }
-
-            std::string name(length, 0);
-            fin.read(&name[0], length);
-
-            if (model.tensors.find(name.data()) == model.tensors.end()) {
-                fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
-                return false;
-            }
-
-            auto tensor = model.tensors[name.data()];
-
-            if (ggml_nelements(tensor) != nelements) {
-                fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
-                return false;
-            }
-            if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
-                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%" PRId64 ", %" PRId64 "], expected [%d, %d]\n",
-                        __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
-                return false;
-            }
-            if (0) {
-                static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
-                fprintf(stderr, "%24s - [%5d, %5d], type = %6s\n", name.data(), ne[0], ne[1], ftype_str[ftype]);
-            }
-
-            switch (ftype) {
-                case 0: // f32
-                case 1: // f16
-                    break;
-                case 2: // q4_0
-                case 3: // q4_1
-                    assert(ne[0] % 64 == 0);
-                    break;
-                default:
-                    fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
-                    return false;
-            };
-
-            // load the tensor data into memory without copying or reading it
-            size_t offset = fin.tellg();
-            size_t tensor_data_size = ggml_nbytes(tensor);
-            offset = (offset + 31) & -32;
-            tensor->data = mm_addr + offset;
-            fin.seekg(offset + tensor_data_size);
-            total_size += tensor_data_size;
-            model.n_loaded++;
-
-            // progress
-            if (progress_callback) {
-                double current_progress = size_t(fin.tellg()) / double(file_size);
-                progress_callback(current_progress, progress_callback_user_data);
-            }
-        }
-
-        fin.close();
-
-        fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, model.n_loaded);
-        if (model.n_loaded == 0) {
-            fprintf(stderr, "%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__);
-        } else if (model.n_loaded != (int) model.tensors.size()) {
-            fprintf(stderr, "%s: ERROR not all tensors loaded from model file - expected %zu, got %d\n", __func__, model.tensors.size(), model.n_loaded);
-            return false;
-        }
-    }
+    model.mapping = std::move(ml->mapping);

     // loading time will be recalculate after the first eval, so
     // we take page faults deferred by mmap() into consideration
     lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
+}

-    if (progress_callback) {
-        progress_callback(1.0, progress_callback_user_data);
+static bool llama_model_load(
+        const std::string & fname,
+        llama_context & lctx,
+        int n_ctx,
+        ggml_type memory_type,
+        bool use_mmap,
+        bool use_mlock,
+        bool vocab_only,
+        llama_progress_callback progress_callback,
+        void *progress_callback_user_data) {
+    try {
+        llama_model_load_internal(fname, lctx, n_ctx, memory_type, use_mmap, use_mlock,
+                                  vocab_only, progress_callback, progress_callback_user_data);
+        return true;
+    } catch (const std::string & err) {
+        fprintf(stderr, "error loading model: %s\n", err.c_str());
+        return false;
     }
-
-    return true;
 }

 // evaluate the transformer
@@ -774,8 +1036,8 @@ static bool llama_eval_internal(
     auto & buf_compute = lctx.buf_compute;

     struct ggml_init_params params = {
-        /*.mem_size   =*/ buf_compute.size(),
-        /*.mem_buffer =*/ buf_compute.data(),
+        /*.mem_size   =*/ buf_compute.size,
+        /*.mem_buffer =*/ buf_compute.addr,
         /*.no_alloc   =*/ false,
     };

@@ -1061,7 +1323,7 @@ struct llama_tokenizer {
         size_t offs = 0;
         while (offs < text.size()) {
             llama_sp_symbol sym;
-            size_t char_len = Min(text.size() - offs, utf8_len(text[offs]));
+            size_t char_len = std::min(text.size() - offs, utf8_len(text[offs]));
             sym.text = text.c_str() + offs;
             sym.n = char_len;
             offs += char_len;
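The tokenizer change clamps the decoded sequence length so a truncated multi-byte UTF-8 character at the end of the input cannot run past the buffer. `utf8_len` is defined elsewhere in llama.cpp; a sketch of the clamp, assuming the usual high-nibble lookup implementation:

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <string>

// Sequence length from the first byte's high nibble
// (1 for ASCII and continuation bytes, 2-4 for multi-byte leads).
static size_t utf8_len(char src) {
    const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
    uint8_t highbits = static_cast<uint8_t>(src) >> 4;
    return lookup[highbits];
}

int main() {
    std::string text = "caf\xC3"; // 0xC3 opens a 2-byte sequence, but the text ends here
    size_t offs = 3;
    // Without std::min, char_len would be 2 and index past the end.
    size_t char_len = std::min(text.size() - offs, utf8_len(text[offs]));
    printf("char_len = %zu\n", char_len); // prints 1
    return 0;
}
```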
@@ -1236,7 +1498,7 @@ static llama_vocab::id llama_sample_top_p_top_k(
         }
     }

-    sample_top_k(logits_id, top_k > 0 ? Min(top_k, n_logits) : n_logits);
+    sample_top_k(logits_id, top_k > 0 ? std::min(top_k, n_logits) : n_logits);

     // compute probs for the top k tokens
     std::vector<float> probs;
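Clamping `top_k` to `n_logits` matters because `sample_top_k` sorts the k best candidates in place; a k larger than the candidate count would hand the sort an out-of-range middle iterator. A sketch of the idea with a hypothetical `top_k` helper (llama.cpp's `sample_top_k` uses `std::partial_sort` similarly):

```cpp
#include <algorithm>
#include <cstdio>
#include <functional>
#include <vector>

// Hypothetical top-k selection: keep the k largest values at the front.
static void top_k(std::vector<float> & v, int k) {
    // Clamp exactly as the diff does; k beyond v.size() would make
    // v.begin() + k an invalid middle iterator for partial_sort.
    k = std::min(k, (int) v.size());
    std::partial_sort(v.begin(), v.begin() + k, v.end(), std::greater<float>());
}

int main() {
    std::vector<float> logits = { 0.1f, 2.5f, -1.0f, 0.7f };
    top_k(logits, 40); // requested k exceeds the candidate count; the clamp keeps this safe
    printf("best = %.1f\n", logits[0]); // prints 2.5
    return 0;
}
```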
@@ -1284,298 +1546,118 @@ static llama_vocab::id llama_sample_top_p_top_k(
 // quantization
 //

-// TODO: reuse code from the llama_model_load() somehow
-static bool llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, int itype) {
-    ggml_type type = GGML_TYPE_Q4_1;
-
-    switch (itype) {
-        case 2: type = GGML_TYPE_Q4_0; break;
-        case 3: type = GGML_TYPE_Q4_1; break;
-        default: fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype); return 1;
+static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype) {
+    ggml_type quantized_type;
+    switch (ftype) {
+        case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
+        case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
+        default: throw format("invalid output file type %d\n", ftype);
     };

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    // load hparams
-    {
-        finp.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
-        //finp.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
-        finp.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
-        finp.read((char *) &hparams.n_mult, sizeof(hparams.n_mult));
-        finp.read((char *) &hparams.n_head, sizeof(hparams.n_head));
-        finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-        finp.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
-        finp.read((char *) &hparams.f16, sizeof(hparams.f16));
-
-        printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
-        printf("%s: n_ctx   = %d\n", __func__, hparams.n_ctx);
-        printf("%s: n_embd  = %d\n", __func__, hparams.n_embd);
-        printf("%s: n_mult  = %d\n", __func__, hparams.n_mult);
-        printf("%s: n_head  = %d\n", __func__, hparams.n_head);
-        printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
-        printf("%s: f16     = %d\n", __func__, hparams.f16);
-
-        fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
-        //fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
-        fout.write((char *) &hparams.n_embd, sizeof(hparams.n_embd));
-        fout.write((char *) &hparams.n_mult, sizeof(hparams.n_mult));
-        fout.write((char *) &hparams.n_head, sizeof(hparams.n_head));
-        fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-        fout.write((char *) &hparams.n_rot, sizeof(hparams.n_rot));
-        fout.write((char *) &itype, sizeof(hparams.f16));
-    }
-
-    // load vocab
-    {
-        const int32_t n_vocab = hparams.n_vocab;
-
-        if (n_vocab != hparams.n_vocab) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
-                    __func__, fname_inp.c_str(), n_vocab, hparams.n_vocab);
-            return false;
-        }
-
-        std::vector<char> word(32);
-        vocab.id_to_token.resize(n_vocab);
-        for (int i = 0; i < n_vocab; i++) {
-            uint32_t len;
-            finp.read ((char *) &len, sizeof(len));
-            fout.write((char *) &len, sizeof(len));
-
-            word.resize(len);
-            finp.read ((char *) &word[0], len);
-            fout.write((char *) &word[0], len);
-
-            float score;
-            finp.read ((char *) &score, sizeof(score));
-            fout.write((char *) &score, sizeof(score));
-
-            vocab.token_to_id[word.data()] = i;
-
-            auto &tok_score = vocab.id_to_token[i];
-            tok_score.tok = word.data();
-            tok_score.score = score;
-        }
-    }
-
-    // load weights
-    {
-        size_t total_size_org = 0;
-        size_t total_size_new = 0;
-
-        std::vector<float> work;
-
-        std::vector<uint8_t>     data_u8;
-        std::vector<ggml_fp16_t> data_f16;
-        std::vector<float>       data_f32;
-
-        std::vector<int64_t> hist_all(1 << 4, 0);
-
-        while (true) {
-            int32_t n_dims;
-            int32_t length;
-            int32_t ftype;
-
-            finp.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
-            finp.read(reinterpret_cast<char *>(&length), sizeof(length));
-            finp.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
-
-            if (finp.eof()) {
-                break;
-            }
-
-            int32_t nelements = 1;
-            int32_t ne[2] = { 1, 1 };
-            for (int i = 0; i < n_dims; ++i) {
-                finp.read (reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
-                nelements *= ne[i];
-            }
-
-            std::string name(length, 0);
-            finp.read (&name[0], length);
-
-            {
-                // ensure tensor data is aligned
-                uint64_t offset = finp.tellg();
-                offset = (offset + 31) & -32;
-                finp.seekg(offset);
-            }
-
-            {
-                static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
-                printf("%48s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ftype_str[ftype]);
-            }
-
-            // regexes of tensor names to be quantized
-            const std::vector<std::string> k_names = {
-                ".*weight",
-            };
-
-            bool quantize = false;
-            for (const auto & s : k_names) {
-                if (std::regex_match(name, std::regex(s))) {
-                    quantize = true;
-                    break;
-                }
-            }
-
-            // quantize only 2D tensors
-            quantize &= (n_dims == 2);
-
-            if (quantize) {
-                if (ftype != 0 && ftype != 1) {
-                    fprintf(stderr, "%s: unsupported ftype %d for integer quantization\n", __func__, ftype);
-                    return false;
-                }
-
-                if (ftype == 1) {
-                    data_f16.resize(nelements);
-                    finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
-                    data_f32.resize(nelements);
-                    for (int i = 0; i < nelements; ++i) {
-                        data_f32[i] = ggml_fp16_to_fp32(data_f16[i]);
-                    }
-                } else {
-                    data_f32.resize(nelements);
-                    finp.read(reinterpret_cast<char *>(data_f32.data()), nelements * sizeof(float));
+    std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false,
+                                                                            /*vocab_only*/ false));
+    llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
+
+    size_t total_size_org = 0;
+    size_t total_size_new = 0;
+    std::vector<int64_t> hist_all(1 << 4, 0);
+
+    size_t idx = 0;
+    for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) {
+        llama_buffer read_data;
+        read_data.resize(tensor.size);
+        tensor.data = read_data.addr;
+        model_loader->load_data_for(tensor);
+
+        printf("[%zu/%zu] %36s - %s, type = %6s, ",
+               ++idx, model_loader->tensors_map.tensors.size(),
+               tensor.name.c_str(), llama_format_tensor_shape(tensor.ne).c_str(),
+               ggml_type_name(tensor.type));
+
+        // This used to be a regex, but <regex> has an extreme cost to compile times.
+        bool quantize = tensor.name.rfind("weight") == tensor.name.size() - 6; // ends with 'weight'?
+
+        // quantize only 2D tensors
+        quantize &= (tensor.ne.size() == 2);
+
+        enum ggml_type new_type;
+        void * new_data;
+        size_t new_size;
+        llama_buffer work;
+
+        if (!quantize) {
+            new_type = tensor.type;
+            new_data = tensor.data;
+            new_size = tensor.size;
+            printf("size = %8.3f MB\n", tensor.size/1024.0/1024.0);
+        } else {
+            new_type = quantized_type;
+            float * f32_data;
+            size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
+            llama_buffer f32_conv_buf;
+            if (tensor.type == GGML_TYPE_F32) {
+                f32_data = (float *) tensor.data;
+            } else if (tensor.type == GGML_TYPE_F16) {
+                f32_conv_buf.resize(nelements * sizeof(float));
+                f32_data = (float *) f32_conv_buf.addr;
+                auto f16_data = (const ggml_fp16_t *) tensor.data;
+                for (size_t i = 0; i < nelements; i++) {
+                    f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
                 }
-
-                ftype = itype;
             } else {
-                const int bpe = (ftype == 0) ? sizeof(float) : sizeof(uint16_t);
-
-                data_u8.resize(nelements*bpe);
-                finp.read(reinterpret_cast<char *>(data_u8.data()), nelements * bpe);
+                throw format("type %s unsupported for integer quantization", ggml_type_name(tensor.type));
            }

-            fout.write(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
-            fout.write(reinterpret_cast<char *>(&length), sizeof(length));
-            fout.write(reinterpret_cast<char *>(&ftype),  sizeof(ftype));
-            for (int i = 0; i < n_dims; ++i) {
-                fout.write(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
+            printf("quantizing .. ");
+            fflush(stdout);
+
+            work.resize(nelements * 4); // upper bound on size
+            new_data = work.addr;
+            std::vector<int64_t> hist_cur(1 << 4, 0);
+
+            switch (new_type) {
+                case GGML_TYPE_Q4_0:
+                    {
+                        new_size = ggml_quantize_q4_0(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
+                    } break;
+                case GGML_TYPE_Q4_1:
+                    {
+                        new_size = ggml_quantize_q4_1(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
+                    } break;
+                default:
+                    LLAMA_ASSERT(false);
            }
-            fout.write(&name[0], length);

-            {
-                // ensure tensor data is aligned
-                uint64_t offset = fout.tellp();
-                offset = (offset + 31) & -32;
-                fout.seekp(offset);
+            printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
+            for (size_t i = 0; i < hist_cur.size(); i++) {
+                hist_all[i] += hist_cur[i];
            }

-            if (quantize) {
-                printf("quantizing .. ");
-                work.resize(nelements); // for quantization
-
-                size_t cur_size = 0;
-                std::vector<int64_t> hist_cur(1 << 4, 0);
-
-                switch (type) {
-                    case GGML_TYPE_Q4_0:
-                        {
-                            cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
-                        } break;
-                    case GGML_TYPE_Q4_1:
-                        {
-                            cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
-                        } break;
-                    default:
-                        {
-                            fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, type);
-                            return false;
-                        }
-                }
-
-                fout.write(reinterpret_cast<char *>(work.data()), cur_size);
-                total_size_new += cur_size;
-
-                printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
-                for (int i = 0; i < (int) hist_cur.size(); ++i) {
-                    hist_all[i] += hist_cur[i];
-                }
-
-                for (int i = 0; i < (int) hist_cur.size(); ++i) {
-                    printf("%5.3f ", hist_cur[i] / float(nelements));
-                }
-                printf("\n");
-            } else {
-                printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0);
-                fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size());
-                total_size_new += data_u8.size();
+            for (size_t i = 0; i < hist_cur.size(); i++) {
+                printf("%5.3f ", hist_cur[i] / float(nelements));
            }
-
-            total_size_org += nelements * sizeof(float);
+            printf("\n");
        }
+        total_size_org += tensor.size;
+        total_size_new += new_size;
+        file_saver.write_tensor(tensor, new_type, new_data, new_size);
+    }

-
-
+    printf("%s: model size  = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
|
1647
|
+
printf("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
|
1560
1648
|
|
1561
|
-
|
1562
|
-
|
1563
|
-
|
1564
|
-
|
1565
|
-
|
1649
|
+
{
|
1650
|
+
int64_t sum_all = 0;
|
1651
|
+
for (size_t i = 0; i < hist_all.size(); i++) {
|
1652
|
+
sum_all += hist_all[i];
|
1653
|
+
}
|
1566
1654
|
|
1567
|
-
|
1568
|
-
|
1569
|
-
|
1570
|
-
}
|
1571
|
-
printf("\n");
|
1655
|
+
printf("%s: hist: ", __func__);
|
1656
|
+
for (size_t i = 0; i < hist_all.size(); i++) {
|
1657
|
+
printf("%5.3f ", hist_all[i] / float(sum_all));
|
1572
1658
|
}
|
1659
|
+
printf("\n");
|
1573
1660
|
}
|
1574
|
-
|
1575
|
-
finp.close();
|
1576
|
-
fout.close();
|
1577
|
-
|
1578
|
-
return true;
|
1579
1661
|
}
|
1580
1662
|
|
1581
1663
|
//
|
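Note: the hunk above replaces the old stream-based quantizer with the new llama_model_loader/llama_file_saver pair: each tensor is read into a llama_buffer, converted to F32 if needed, quantized via ggml_quantize_q4_0/q4_1, and re-emitted with file_saver.write_tensor. As a rough illustration of what the Q4_0 call does to each row, here is a minimal standalone sketch of 4-bit block quantization; BlockQ4 and quantize_block are invented names for this note, and real ggml additionally packs two 4-bit quants per byte:

// Minimal sketch of 4-bit block quantization in the spirit of ggml's Q4_0
// (illustrative only; not the library code).
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

constexpr int kBlock = 32; // ggml's QK block size

struct BlockQ4 {
    float  d;          // one fp32 scale per block
    int8_t q[kBlock];  // quantized values, here kept in [-7, 7]
};

static BlockQ4 quantize_block(const float * x) {
    float amax = 0.0f; // largest magnitude in the block
    for (int i = 0; i < kBlock; i++) {
        amax = std::max(amax, std::fabs(x[i]));
    }
    BlockQ4 b;
    b.d = amax / 7.0f; // map the extreme value onto the 4-bit range
    const float id = b.d != 0.0f ? 1.0f / b.d : 0.0f;
    for (int i = 0; i < kBlock; i++) {
        b.q[i] = (int8_t) std::lround(x[i] * id);
    }
    return b;
}

static void dequantize_block(const BlockQ4 & b, float * y) {
    for (int i = 0; i < kBlock; i++) {
        y[i] = b.q[i] * b.d; // round-trip error is at most ~d/2 per value
    }
}

int main() {
    std::vector<float> x(kBlock), y(kBlock);
    for (int i = 0; i < kBlock; i++) x[i] = 0.01f * (i - 16);

    const BlockQ4 b = quantize_block(x.data());
    dequantize_block(b, y.data());

    for (int i = 0; i < kBlock; i++) {
        printf("%8.4f -> %8.4f\n", x[i], y[i]);
    }
    return 0;
}

The 1 << 4 histograms in the hunk simply count how often each of the 16 quant values occurs per tensor and in total, which is what the "hist:" lines print as frequencies.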
@@ -1593,32 +1675,36 @@ struct llama_context * llama_init_from_file(
         params.seed = time(NULL);
     }
 
+    unsigned cur_percentage = 0;
+    if (params.progress_callback == NULL) {
+        params.progress_callback_user_data = &cur_percentage;
+        params.progress_callback = [](float progress, void * ctx) {
+            unsigned * cur_percentage_p = (unsigned *) ctx;
+            unsigned percentage = (unsigned) (100 * progress);
+            while (percentage > *cur_percentage_p) {
+                ++*cur_percentage_p;
+                fprintf(stderr, ".");
+                fflush(stderr);
+                if (percentage >= 100) {
+                    fprintf(stderr, "\n");
+                }
+            }
+        };
+    }
+
     ctx->rng = std::mt19937(params.seed);
     ctx->logits_all = params.logits_all;
 
     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
 
-    if (!llama_model_load(path_model, *ctx, params.n_ctx,
-                          params.
-                          params.progress_callback_user_data)) {
+    if (!llama_model_load(path_model, *ctx, params.n_ctx, memory_type,
+                          params.use_mmap, params.use_mlock, params.vocab_only,
+                          params.progress_callback, params.progress_callback_user_data)) {
         fprintf(stderr, "%s: failed to load model\n", __func__);
         llama_free(ctx);
         return nullptr;
     }
 
-    if (params.use_mlock) {
-        char *err;
-        if (!ggml_mlock(ctx->model.ctx,
-                        ctx->model.mm_addr,
-                        ctx->model.mm_length,
-                        &err)) {
-            fprintf(stderr, "%s\n", err);
-            free(err);
-            llama_free(ctx);
-            return nullptr;
-        }
-    }
-
     // reserve memory for context buffers
     if (!params.vocab_only) {
         if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx)) {
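Note: when the caller leaves progress_callback unset, llama_init_from_file now installs the dot-printing lambda shown above. A caller that wants its own reporting fills in the two fields before initialization; a minimal sketch (llama_context_default_params is assumed from this release's llama.h):

// Sketch: supplying a custom progress callback instead of the default dot-printer.
#include <cstdio>
#include "llama.h"

int main(int argc, char ** argv) {
    if (argc < 2) {
        return 1;
    }
    llama_context_params params = llama_context_default_params();

    // A captureless lambda converts to the llama_progress_callback function pointer.
    params.progress_callback = [](float progress, void * /*user_data*/) {
        fprintf(stderr, "\rloading: %3.0f%%", progress * 100.0f);
    };
    params.progress_callback_user_data = nullptr;

    struct llama_context * ctx = llama_init_from_file(argv[1], params);
    if (ctx == nullptr) {
        return 1;
    }
    llama_free(ctx);
    return 0;
}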
@@ -1655,40 +1741,31 @@ struct llama_context * llama_init_from_file(
     }
 
 void llama_free(struct llama_context * ctx) {
-    kv_cache_free(ctx->model.kv_self);
-
-    if (ctx->model.ctx) {
-        ggml_free(ctx->model.ctx);
-    }
-
-    if (ctx->model.mm_addr) {
-        munmap_file(ctx->model.mm_addr, ctx->model.mm_length);
-    }
-
     delete ctx;
 }
 
 int llama_model_quantize(
         const char * fname_inp,
         const char * fname_out,
-
-
-
+        enum llama_ftype ftype) {
+    try {
+        llama_model_quantize_internal(fname_inp, fname_out, ftype);
+        return 0;
+    } catch (const std::string & err) {
+        fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str());
         return 1;
     }
-
-    return 0;
 }
 
 // Returns the KV cache that will contain the context for the
 // ongoing prediction with the model.
 const uint8_t * llama_get_kv_cache(struct llama_context * ctx) {
-    return ctx->model.kv_self.buf.data();
+    return ctx->model.kv_self.buf.addr;
 }
 
 // Returns the size of the KV cache
 size_t llama_get_kv_cache_size(struct llama_context * ctx) {
-    return ctx->model.kv_self.buf.size();
+    return ctx->model.kv_self.buf.size;
 }
 
 int llama_get_kv_cache_token_count(struct llama_context * ctx) {
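Note: llama_model_quantize now takes an enum llama_ftype instead of a bare int and reports failures through the try/catch above instead of returning mid-write. A minimal caller sketch (the file paths are placeholders, and the LLAMA_FTYPE_MOSTLY_Q4_0 constant is assumed from this release's llama.h):

// Sketch: calling the updated quantization entry point from C++.
#include <cstdio>
#include "llama.h"

int main() {
    const int rc = llama_model_quantize("models/7B/ggml-model-f16.bin",
                                        "models/7B/ggml-model-q4_0.bin",
                                        LLAMA_FTYPE_MOSTLY_Q4_0);
    if (rc != 0) {
        fprintf(stderr, "quantization failed\n"); // errors surface via the catch above
        return 1;
    }
    return 0;
}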
@@ -1702,8 +1779,8 @@ void llama_set_kv_cache(
         size_t n_size,
         int n_token_count) {
     // Make sure we have the same kv cache setup
-    LLAMA_ASSERT(ctx->model.kv_self.buf.size() == n_size);
-    memcpy(ctx->model.kv_self.buf.data(), kv_cache, n_size);
+    LLAMA_ASSERT(ctx->model.kv_self.buf.size == n_size);
+    memcpy(ctx->model.kv_self.buf.addr, kv_cache, n_size);
     ctx->model.kv_self.n = n_token_count;
 }
 
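Note: with buf now a llama_buffer, the KV-cache accessors expose .addr/.size directly instead of std::vector's .data()/.size(). One way a caller might snapshot and restore the cache using the four functions defined in this file (a sketch; the restore target must have an identical cache size, as the LLAMA_ASSERT above enforces):

// Sketch: saving and restoring the KV cache as raw bytes.
#include <cstdint>
#include <vector>
#include "llama.h"

struct kv_snapshot {
    std::vector<uint8_t> bytes;
    int n_tokens = 0;
};

static kv_snapshot save_kv(struct llama_context * ctx) {
    kv_snapshot snap;
    const uint8_t * src = llama_get_kv_cache(ctx);
    snap.bytes.assign(src, src + llama_get_kv_cache_size(ctx));
    snap.n_tokens = llama_get_kv_cache_token_count(ctx);
    return snap;
}

static void restore_kv(struct llama_context * ctx, const kv_snapshot & snap) {
    // sizes must match the context's cache setup
    llama_set_kv_cache(ctx, snap.bytes.data(), snap.bytes.size(), snap.n_tokens);
}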
@@ -1814,9 +1891,9 @@ llama_token llama_sample_top_p_top_k(
 void llama_print_timings(struct llama_context * ctx) {
     const int64_t t_end_us = ggml_time_us();
 
-    const int32_t n_sample = Max(1, ctx->n_sample);
-    const int32_t n_eval = Max(1, ctx->n_eval);
-    const int32_t n_p_eval = Max(1, ctx->n_p_eval);
+    const int32_t n_sample = std::max(1, ctx->n_sample);
+    const int32_t n_eval = std::max(1, ctx->n_eval);
+    const int32_t n_p_eval = std::max(1, ctx->n_p_eval);
 
     fprintf(stderr, "\n");
     fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
@@ -1852,3 +1929,8 @@ const char * llama_print_system_info(void) {
 
     return s.c_str();
 }
+
+// For internal test use
+std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
+    return ctx->model.tensors_by_name;
+}
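Note: the new llama_internal_get_tensor_map hook exposes the model's named tensors for tests. A sketch of how a test might enumerate them (assuming the matching declaration this release adds to llama.h; ggml_nelements and ggml_type_name come from ggml.h):

// Sketch: listing model tensors through the internal test hook.
#include <cstdio>
#include "ggml.h"
#include "llama.h"

static void dump_tensors(struct llama_context * ctx) {
    for (const auto & named : llama_internal_get_tensor_map(ctx)) {
        printf("%-40s %10lld elements, type %s\n",
               named.first.c_str(),
               (long long) ggml_nelements(named.second),
               ggml_type_name(named.second->type));
    }
}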