llama_cpp 0.0.3 → 0.0.4

@@ -1,49 +1,30 @@
1
+ // Defines fileno on msys:
2
+ #ifndef _GNU_SOURCE
3
+ #define _GNU_SOURCE
4
+ #endif
5
+
6
+ #include "llama_util.h"
1
7
  #include "llama.h"
2
8
 
3
9
  #include "ggml.h"
4
10
 
11
+ #include <array>
5
12
  #include <cinttypes>
6
13
  #include <fstream>
7
14
  #include <random>
8
15
  #include <map>
9
16
  #include <unordered_map>
10
17
  #include <queue>
11
- #include <regex>
12
18
  #include <cassert>
13
19
  #include <cstring>
14
-
15
- #if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
16
- #define WIN32_LEAN_AND_MEAN
17
- #include <Windows.h>
18
- #else
19
- #include <sys/types.h>
20
- #include <sys/mman.h>
21
- #include <unistd.h>
22
- #include <fcntl.h>
23
- #endif
24
-
25
- #define Min(X, Y) ((Y) > (X) ? (X) : (Y))
26
- #define Max(X, Y) ((Y) < (X) ? (X) : (Y))
20
+ #include <climits>
21
+ #include <memory>
22
+ #include <algorithm>
23
+ #include <initializer_list>
27
24
 
28
25
  #define LLAMA_USE_SCRATCH
29
26
  #define LLAMA_MAX_SCRATCH_BUFFERS 16
30
27
 
31
- #define LLAMA_ASSERT(x) \
32
- do { \
33
- if (!(x)) { \
34
- fprintf(stderr, "LLAMA_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
35
- abort(); \
36
- } \
37
- } while (0)
38
-
39
-
40
- // determine number of model parts based on the dimension
41
- static const std::unordered_map<int, int> LLAMA_N_PARTS = {
42
- { 4096, 1 },
43
- { 5120, 2 },
44
- { 6656, 4 },
45
- { 8192, 8 },
46
- };
47
28
 
48
29
  // available llama models
49
30
  enum e_model {
@@ -93,14 +74,18 @@ static const std::map<e_model, size_t> MEM_REQ_EVAL = {
93
74
 
94
75
  // default hparams (LLaMA 7B)
95
76
  struct llama_hparams {
96
- int32_t n_vocab = 32000;
97
- int32_t n_ctx = 512; // this is provided as user input?
98
- int32_t n_embd = 4096;
99
- int32_t n_mult = 256;
100
- int32_t n_head = 32;
101
- int32_t n_layer = 32;
102
- int32_t n_rot = 64;
103
- int32_t f16 = 1;
77
+ uint32_t n_vocab = 32000;
78
+ uint32_t n_ctx = 512; // this is provided as user input?
79
+ uint32_t n_embd = 4096;
80
+ uint32_t n_mult = 256;
81
+ uint32_t n_head = 32;
82
+ uint32_t n_layer = 32;
83
+ uint32_t n_rot = 64;
84
+ enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
85
+
86
+ bool operator!=(const llama_hparams & other) const {
87
+ return memcmp(this, &other, sizeof(llama_hparams));
88
+ }
104
89
  };
105
90
 
106
91
  struct llama_layer {
@@ -126,11 +111,17 @@ struct llama_kv_cache {
126
111
  struct ggml_tensor * k;
127
112
  struct ggml_tensor * v;
128
113
 
129
- struct ggml_context * ctx;
114
+ struct ggml_context * ctx = NULL;
130
115
 
131
- std::vector<uint8_t> buf;
116
+ llama_buffer buf;
132
117
 
133
118
  int n; // number of tokens currently in the cache
119
+
120
+ ~llama_kv_cache() {
121
+ if (ctx) {
122
+ ggml_free(ctx);
123
+ }
124
+ }
134
125
  };
135
126
 
136
127
  struct llama_model {
@@ -146,22 +137,30 @@ struct llama_model {
146
137
  std::vector<llama_layer> layers;
147
138
 
148
139
  // context
149
- struct ggml_context * ctx;
140
+ struct ggml_context * ctx = NULL;
150
141
 
151
142
  // key + value cache for the self attention
152
143
  // TODO: move to llama_state
153
144
  struct llama_kv_cache kv_self;
154
145
 
155
146
  // the model memory buffer
156
- std::vector<uint8_t> buf;
147
+ llama_buffer buf;
157
148
 
158
149
  // model memory mapped file
159
- void * mm_addr = NULL;
160
- uint64_t mm_length = 0;
150
+ std::unique_ptr<llama_mmap> mapping;
151
+
152
+ // objects representing data potentially being locked in memory
153
+ llama_mlock mlock_buf;
154
+ llama_mlock mlock_mmap;
155
+
156
+ // for quantize-stats only
157
+ std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
161
158
 
162
- // tensors
163
- int n_loaded;
164
- std::unordered_map<std::string, struct ggml_tensor *> tensors;
159
+ ~llama_model() {
160
+ if (ctx) {
161
+ ggml_free(ctx);
162
+ }
163
+ }
165
164
  };
166
165
 
167
166
  struct llama_vocab {
@@ -206,8 +205,8 @@ struct llama_context {
206
205
 
207
206
  // memory buffers used to evaluate the model
208
207
  // TODO: move in llama_state
209
- std::vector<uint8_t> buf_compute;
210
- std::vector<uint8_t> buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
208
+ llama_buffer buf_compute;
209
+ llama_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
211
210
 
212
211
  int buf_last = 0;
213
212
  size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
@@ -220,11 +219,11 @@ struct llama_context {
220
219
  last_size = ggml_set_scratch(ctx, { 0, 0, nullptr, });
221
220
  } else {
222
221
  auto & buf = buf_scratch[i];
223
- last_size = ggml_set_scratch(ctx, { 0, buf.size(), buf.data(), });
222
+ last_size = ggml_set_scratch(ctx, { 0, buf.size, buf.addr, });
224
223
  }
225
224
 
226
225
  if (buf_last >= 0) {
227
- buf_max_size[buf_last] = Max(buf_max_size[buf_last], last_size);
226
+ buf_max_size[buf_last] = std::max(buf_max_size[buf_last], last_size);
228
227
  }
229
228
 
230
229
  buf_last = i;
@@ -244,6 +243,499 @@ struct llama_context {
244
243
  }
245
244
  };
246
245
 
246
+ template <typename T>
247
+ static T checked_mul(T a, T b) {
248
+ T ret = a * b;
249
+ if (a != 0 && ret / a != b) {
250
+ throw format("overflow multiplying %llu * %llu",
251
+ (unsigned long long) a, (unsigned long long) b);
252
+ }
253
+ return ret;
254
+ }
255
+
256
+ static size_t checked_div(size_t a, size_t b) {
257
+ if (b == 0 || a % b != 0) {
258
+ throw format("error dividing %zu / %zu", a, b);
259
+ }
260
+ return a / b;
261
+ }
262
+
263
+ static std::string llama_format_tensor_shape(const std::vector<uint32_t> & ne) {
264
+ std::string ret = "[" + std::to_string(ne.at(0));
265
+ for (size_t i = 1; i < ne.size(); i++) {
266
+ ret += " x " + std::to_string(ne.at(i));
267
+ }
268
+ ret += "]";
269
+ return ret;
270
+ }
271
+
272
+ static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml_type type) {
273
+ size_t size = ggml_type_size(type);
274
+ for (uint32_t dim : ne) {
275
+ size = checked_mul<size_t>(size, dim);
276
+ }
277
+ return size / ggml_blck_size(type);
278
+ }
279
+
280
+ struct llama_load_tensor_shard {
281
+ std::vector<uint32_t> ne;
282
+ size_t size;
283
+ enum ggml_type type;
284
+ size_t file_idx;
285
+ size_t file_off;
286
+
287
+ void calc_size() {
288
+ size = llama_calc_tensor_size(ne, type);
289
+ }
290
+ };
291
+
292
+ enum llama_split_type {
293
+ SPLIT_NONE,
294
+ SPLIT_BY_COLUMNS,
295
+ SPLIT_BY_ROWS
296
+ };
297
+
298
+ struct llama_load_tensor {
299
+ std::vector<llama_load_tensor_shard> shards;
300
+
301
+ std::string name;
302
+ enum ggml_type type = GGML_TYPE_F32;
303
+ llama_split_type split_type = SPLIT_NONE;
304
+ std::vector<uint32_t> ne;
305
+ size_t size;
306
+ struct ggml_tensor * ggml_tensor = NULL;
307
+ uint8_t * data;
308
+
309
+ llama_load_tensor(const std::string & name) : name(name) {}
310
+
311
+ void calc_all() {
312
+ calc_type();
313
+ calc_split_type();
314
+ calc_ne();
315
+ calc_size();
316
+ }
317
+
318
+ void calc_type() {
319
+ const auto & first_shard = shards.at(0);
320
+ for (const auto & shard : shards) {
321
+ if (shard.type != first_shard.type) {
322
+ throw format("inconsistent tensor shard type in '%s'", name.c_str());
323
+ }
324
+ }
325
+ type = first_shard.type;
326
+ }
327
+
328
+ void calc_split_type() {
329
+ if (shards.at(0).ne.size() == 1 || // 1D tensors are just duplicated in every file
330
+ shards.size() == 1) { // only one file?
331
+ split_type = SPLIT_NONE;
332
+ } else if (name.find("tok_embeddings.") == 0 ||
333
+ name.find(".attention.wo.weight") != std::string::npos ||
334
+ name.find(".feed_forward.w2.weight") != std::string::npos) {
335
+ split_type = SPLIT_BY_COLUMNS;
336
+ } else {
337
+ split_type = SPLIT_BY_ROWS;
338
+ }
339
+ }
340
+
341
+ void calc_ne() {
342
+ const auto & first_shard = shards.at(0);
343
+ for (const auto & shard : shards) {
344
+ if (shard.ne != first_shard.ne) {
345
+ throw format("inconsistent tensor shard shape in '%s': first was %s, other was %s",
346
+ name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str());
347
+ }
348
+ }
349
+ ne = first_shard.ne;
350
+ LLAMA_ASSERT(shards.size() <= UINT32_MAX);
351
+ uint32_t n_shards = (uint32_t) shards.size();
352
+ switch (split_type) {
353
+ case SPLIT_NONE:
354
+ ne = first_shard.ne;
355
+ break;
356
+ case SPLIT_BY_COLUMNS:
357
+ ne = {checked_mul<uint32_t>(first_shard.ne[0], n_shards),
358
+ first_shard.ne[1]};
359
+ break;
360
+ case SPLIT_BY_ROWS:
361
+ ne = {first_shard.ne[0],
362
+ checked_mul<uint32_t>(first_shard.ne[1], n_shards)};
363
+ break;
364
+ }
365
+ }
366
+
367
+ void calc_size() {
368
+ size = llama_calc_tensor_size(ne, type);
369
+ }
370
+ };
371
+
372
+ struct llama_load_tensors_map {
373
+ // tensors is kept in a separate vector to preserve file order
374
+ std::vector<llama_load_tensor> tensors;
375
+ std::unordered_map<std::string, size_t> name_to_idx;
376
+ };
377
+
378
+ enum llama_file_version {
379
+ LLAMA_FILE_VERSION_GGML,
380
+ LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab
381
+ LLAMA_FILE_VERSION_GGJT_V1, // added padding
382
+ };
383
+
384
+ struct llama_file_loader {
385
+ llama_file file;
386
+ llama_file_version file_version;
387
+ llama_hparams hparams;
388
+ llama_vocab vocab;
389
+
390
+ llama_file_loader(const char * fname, size_t file_idx, llama_load_tensors_map & tensors_map)
391
+ : file(fname, "rb") {
392
+ fprintf(stderr, "llama.cpp: loading model from %s\n", fname);
393
+ read_magic();
394
+ read_hparams();
395
+ read_vocab();
396
+ read_tensor_metadata(file_idx, tensors_map);
397
+ }
398
+ void read_magic() {
399
+ uint32_t magic = file.read_u32();
400
+ uint32_t version = 0;
401
+
402
+ if (magic != 'ggml') {
403
+ version = file.read_u32();
404
+ }
405
+
406
+ if (magic == 'ggml' && version == 0) {
407
+ file_version = LLAMA_FILE_VERSION_GGML;
408
+ } else if (magic == 'ggmf' && version == 1) {
409
+ file_version = LLAMA_FILE_VERSION_GGMF_V1;
410
+ } else if (magic == 'ggjt' && version == 1) {
411
+ file_version = LLAMA_FILE_VERSION_GGJT_V1;
412
+ } else {
413
+ throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
414
+ magic, version);
415
+ }
416
+ }
417
+ void read_hparams() {
418
+ hparams.n_vocab = file.read_u32();
419
+ hparams.n_embd = file.read_u32();
420
+ hparams.n_mult = file.read_u32();
421
+ hparams.n_head = file.read_u32();
422
+ hparams.n_layer = file.read_u32();
423
+ hparams.n_rot = file.read_u32();
424
+ hparams.ftype = (enum llama_ftype) file.read_u32();
425
+ }
426
+ void read_vocab() {
427
+ vocab.id_to_token.resize(hparams.n_vocab);
428
+
429
+ for (uint32_t i = 0; i < hparams.n_vocab; i++) {
430
+ uint32_t len = file.read_u32();
431
+ std::string word = file.read_string(len);
432
+
433
+ float score = 0.0f;
434
+ if (file_version >= LLAMA_FILE_VERSION_GGMF_V1) {
435
+ file.read_raw(&score, sizeof(score));
436
+ }
437
+
438
+ vocab.token_to_id[word] = i;
439
+
440
+ auto & tok_score = vocab.id_to_token[i];
441
+ tok_score.tok = std::move(word);
442
+ tok_score.score = score;
443
+ }
444
+ }
445
+ void read_tensor_metadata(size_t file_idx, llama_load_tensors_map & tensors_map) {
446
+ while (file.tell() < file.size) {
447
+ llama_load_tensor_shard shard;
448
+ uint32_t n_dims = file.read_u32();
449
+ uint32_t name_len = file.read_u32();
450
+ shard.type = (enum ggml_type) file.read_u32();
451
+ shard.ne.resize(n_dims);
452
+ file.read_raw(shard.ne.data(), sizeof(shard.ne[0]) * n_dims);
453
+ std::string name = file.read_string(name_len);
454
+ if (n_dims < 1 || n_dims > 2) {
455
+ throw format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims);
456
+ }
457
+ switch (shard.type) {
458
+ case GGML_TYPE_F32:
459
+ case GGML_TYPE_F16:
460
+ case GGML_TYPE_Q4_0:
461
+ case GGML_TYPE_Q4_1:
462
+ break;
463
+ default: {
464
+ throw format("unrecognized tensor type %u\n", shard.type);
465
+ }
466
+ }
467
+
468
+ if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
469
+ // skip to the next multiple of 32 bytes
470
+ file.seek(-file.tell() & 31, SEEK_CUR);
471
+ }
472
+ shard.file_idx = file_idx;
473
+ shard.file_off = file.tell();
474
+
475
+ shard.calc_size();
476
+ file.seek(shard.size, SEEK_CUR);
477
+
478
+ auto it = tensors_map.name_to_idx.find(name);
479
+ size_t idx;
480
+ if (it != tensors_map.name_to_idx.end()) {
481
+ idx = it->second;
482
+ } else {
483
+ tensors_map.tensors.emplace_back(name);
484
+ idx = tensors_map.tensors.size() - 1;
485
+ tensors_map.name_to_idx.emplace(name, idx);
486
+ }
487
+ tensors_map.tensors.at(idx).shards.push_back(shard);
488
+ }
489
+ }
490
+ };
491
+
492
+ struct llama_file_saver {
493
+ llama_file file;
494
+ llama_file_loader * any_file_loader;
495
+ llama_file_saver(const char * fname, llama_file_loader * any_file_loader, enum llama_ftype new_ftype)
496
+ : file(fname, "wb"), any_file_loader(any_file_loader) {
497
+ fprintf(stderr, "llama.cpp: saving model to %s\n", fname);
498
+ write_magic();
499
+ write_hparams(new_ftype);
500
+ write_vocab();
501
+ }
502
+ void write_magic() {
503
+ file.write_u32('ggjt'); // magic
504
+ file.write_u32(1); // version
505
+ }
506
+ void write_hparams(enum llama_ftype new_ftype) {
507
+ const llama_hparams & hparams = any_file_loader->hparams;
508
+ file.write_u32(hparams.n_vocab);
509
+ file.write_u32(hparams.n_embd);
510
+ file.write_u32(hparams.n_mult);
511
+ file.write_u32(hparams.n_head);
512
+ file.write_u32(hparams.n_layer);
513
+ file.write_u32(hparams.n_rot);
514
+ file.write_u32(new_ftype);
515
+ }
516
+ void write_vocab() {
517
+ if (any_file_loader->file_version == LLAMA_FILE_VERSION_GGML) {
518
+ fprintf(stderr, "llama.cpp: WARNING: input is an old file that doesn't have scores; will add dummy scores\n");
519
+ }
520
+ uint32_t n_vocab = any_file_loader->hparams.n_vocab;
521
+ for (uint32_t i = 0; i < n_vocab; i++) {
522
+ const auto & token_score = any_file_loader->vocab.id_to_token.at(i);
523
+ file.write_u32((uint32_t) token_score.tok.size());
524
+ file.write_raw(token_score.tok.data(), token_score.tok.size());
525
+ file.write_raw(&token_score.score, sizeof(token_score.score));
526
+ }
527
+ }
528
+ void write_tensor(llama_load_tensor & tensor, enum ggml_type new_type, const void * new_data, size_t new_size) {
529
+ switch (new_type) {
530
+ case GGML_TYPE_F32:
531
+ case GGML_TYPE_F16:
532
+ case GGML_TYPE_Q4_0:
533
+ case GGML_TYPE_Q4_1:
534
+ break;
535
+ default: LLAMA_ASSERT(false);
536
+ }
537
+ file.write_u32((uint32_t) tensor.ne.size());
538
+ file.write_u32((uint32_t) tensor.name.size());
539
+ file.write_u32(new_type);
540
+ file.write_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * tensor.ne.size());
541
+ file.write_raw(tensor.name.data(), tensor.name.size());
542
+ file.seek(-file.tell() & 31, SEEK_CUR);
543
+ LLAMA_ASSERT(new_size == llama_calc_tensor_size(tensor.ne, new_type));
544
+ file.write_raw(new_data, new_size);
545
+ }
546
+ };
547
+
548
+ struct llama_model_loader {
549
+ std::vector<std::unique_ptr<llama_file_loader>> file_loaders;
550
+ llama_load_tensors_map tensors_map;
551
+ bool use_mmap;
552
+ size_t num_ggml_tensors_created = 0;
553
+ struct ggml_context * ggml_ctx = NULL;
554
+ std::unique_ptr<llama_mmap> mapping;
555
+
556
+ llama_model_loader(const std::string & fname_base, bool use_mmap, bool vocab_only) {
557
+ auto first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
558
+ file_loaders.emplace_back(first_file);
559
+ uint32_t n_parts = vocab_only ? 1 : guess_n_parts();
560
+ for (uint32_t i = 1; i < n_parts; i++) {
561
+ std::string fname = fname_base + "." + std::to_string(i);
562
+ auto ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
563
+ file_loaders.emplace_back(ith_file);
564
+ if (ith_file->hparams != first_file->hparams) {
565
+ throw format("llama.cpp: hparams inconsistent between files");
566
+ }
567
+ }
568
+ if (!llama_mmap::SUPPORTED) {
569
+ use_mmap = false;
570
+ }
571
+ if (use_mmap && alignment_prevents_mmap()) {
572
+ fprintf(stderr, "llama.cpp: can't use mmap because tensors are not aligned; convert to new format to avoid this\n");
573
+ use_mmap = false;
574
+ }
575
+ this->use_mmap = use_mmap;
576
+ for (llama_load_tensor & lt : tensors_map.tensors) {
577
+ lt.calc_all();
578
+ }
579
+ }
580
+
581
+ bool alignment_prevents_mmap() {
582
+ for (const llama_load_tensor & lt : tensors_map.tensors) {
583
+ for (const llama_load_tensor_shard & shard : lt.shards) {
584
+ if (shard.file_off & 3) {
585
+ return true;
586
+ }
587
+ }
588
+ }
589
+ return false;
590
+ }
591
+
592
+ uint32_t guess_n_parts() const {
593
+ auto it = tensors_map.name_to_idx.find("tok_embeddings.weight");
594
+ if (it == tensors_map.name_to_idx.end()) {
595
+ throw std::string("missing tok_embeddings.weight");
596
+ }
597
+ const llama_load_tensor & lt = tensors_map.tensors.at(it->second);
598
+ return file_loaders.at(0)->hparams.n_embd / lt.shards.at(0).ne.at(0);
599
+ }
600
+
601
+ void calc_sizes(size_t * ctx_size_p, size_t * mmapped_size_p) const {
602
+ *ctx_size_p = *mmapped_size_p = 0;
603
+ for (const llama_load_tensor & lt : tensors_map.tensors) {
604
+ *ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE;
605
+ *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size;
606
+ }
607
+ }
608
+
609
+ struct ggml_tensor * get_tensor(const std::string & name, std::vector<uint32_t> ne) {
610
+ auto it = tensors_map.name_to_idx.find(name);
611
+ if (it == tensors_map.name_to_idx.end()) {
612
+ throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
613
+ }
614
+ llama_load_tensor & lt = tensors_map.tensors.at(it->second);
615
+ if (lt.ne != ne) {
616
+ throw format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
617
+ name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
618
+ }
619
+ return get_tensor_for(lt);
620
+ }
621
+
622
+ struct ggml_tensor * get_tensor_for(llama_load_tensor & lt) {
623
+ struct ggml_tensor * tensor;
624
+ if (lt.ne.size() == 2) {
625
+ tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
626
+ } else {
627
+ LLAMA_ASSERT(lt.ne.size() == 1);
628
+ tensor = ggml_new_tensor_1d(ggml_ctx, lt.type, lt.ne.at(0));
629
+ }
630
+ LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
631
+ lt.ggml_tensor = tensor;
632
+ num_ggml_tensors_created++;
633
+ return tensor;
634
+ }
635
+
636
+ void done_getting_tensors() {
637
+ if (num_ggml_tensors_created != tensors_map.tensors.size()) {
638
+ throw std::string("llama.cpp: file contained more tensors than expected");
639
+ }
640
+ }
641
+
642
+ void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
643
+ size_t data_size = 0;
644
+ for (const llama_load_tensor & lt : tensors_map.tensors) {
645
+ data_size += lt.size;
646
+ }
647
+
648
+ if (use_mmap) {
649
+ mapping.reset(new llama_mmap(&file_loaders.at(0)->file));
650
+ if (!lmlock) {
651
+ // Don't call the callback since the actual loading will be lazy
652
+ // and we can't measure it.
653
+ progress_callback = NULL;
654
+ }
655
+ if (lmlock) {
656
+ lmlock->init(mapping->addr);
657
+ }
658
+ }
659
+
660
+ size_t done_size = 0;
661
+ for (llama_load_tensor & lt : tensors_map.tensors) {
662
+ if (progress_callback) {
663
+ progress_callback((float) done_size / data_size, progress_callback_user_data);
664
+ }
665
+ LLAMA_ASSERT(lt.ggml_tensor); // unused tensors should have been caught by load_data already
666
+ lt.data = (uint8_t *) lt.ggml_tensor->data;
667
+ load_data_for(lt);
668
+ lt.ggml_tensor->data = lt.data;
669
+ done_size += lt.size;
670
+ if (use_mmap && lmlock) {
671
+ lmlock->grow_to(done_size);
672
+ }
673
+ }
674
+ if (progress_callback) {
675
+ progress_callback(1.0f, progress_callback_user_data);
676
+ }
677
+ }
678
+
679
+ void load_data_for(llama_load_tensor & lt) {
680
+ if (use_mmap) {
681
+ LLAMA_ASSERT(lt.shards.size() == 1);
682
+ lt.data = (uint8_t *) mapping->addr + lt.shards.at(0).file_off;
683
+ } else if (lt.split_type == SPLIT_NONE) {
684
+ llama_file & file = file_loaders.at(lt.shards.at(0).file_idx)->file;
685
+ file.seek(lt.shards.at(0).file_off, SEEK_SET);
686
+ file.read_raw(lt.data, lt.size);
687
+ } else if (lt.split_type == SPLIT_BY_ROWS) {
688
+ size_t offset = 0;
689
+ for (llama_load_tensor_shard & shard : lt.shards) {
690
+ llama_file & file = file_loaders.at(shard.file_idx)->file;
691
+ file.seek(shard.file_off, SEEK_SET);
692
+ file.read_raw(lt.data + offset, shard.size);
693
+ offset += shard.size;
694
+ }
695
+ LLAMA_ASSERT(offset == lt.size);
696
+ } else if (lt.split_type == SPLIT_BY_COLUMNS) {
697
+ // Let's load the data into temporary buffers to ensure the OS performs large loads.
698
+ std::vector<llama_buffer> tmp_bufs;
699
+ tmp_bufs.resize(lt.shards.size());
700
+ for (size_t i = 0; i < lt.shards.size(); i++) {
701
+ llama_load_tensor_shard & shard = lt.shards.at(i);
702
+ llama_file & file = file_loaders.at(shard.file_idx)->file;
703
+ file.seek(shard.file_off, SEEK_SET);
704
+ tmp_bufs.at(i).resize(shard.size);
705
+ file.read_raw(tmp_bufs.at(i).addr, shard.size);
706
+ }
707
+ // Then reshape.
708
+ size_t num_rows = lt.ne.at(1);
709
+ size_t per_shard_row_size = lt.shards.at(0).size / num_rows;
710
+ size_t out_offset = 0;
711
+ for (size_t row = 0; row < num_rows; row++) {
712
+ for (llama_buffer & tmp_buf : tmp_bufs) {
713
+ memcpy(lt.data + out_offset,
714
+ tmp_buf.addr + row * per_shard_row_size,
715
+ per_shard_row_size);
716
+ out_offset += per_shard_row_size;
717
+ }
718
+ }
719
+ LLAMA_ASSERT(out_offset == lt.size);
720
+ }
721
+ if (0) {
722
+ print_checksum(lt);
723
+ }
724
+ }
725
+
726
+ static void print_checksum(llama_load_tensor & lt) {
727
+ uint32_t sum = 0;
728
+ for (size_t i = 0; i < lt.size; i++) {
729
+ uint8_t byte = lt.data[i];
730
+ sum = byte + (sum << 6) + (sum << 16) - sum; // sdbm hash
731
+ }
732
+ fprintf(stderr, "%s checksum: %#08x (%s, size %zu)\n", lt.name.c_str(), sum,
733
+ llama_format_tensor_shape(lt.ne).c_str(), lt.size);
734
+ }
735
+
736
+ };
737
+
738
+
247
739
  //
248
740
  // kv cache
249
741
  //
@@ -262,8 +754,8 @@ static bool kv_cache_init(
262
754
  cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
263
755
 
264
756
  struct ggml_init_params params;
265
- params.mem_size = cache.buf.size();
266
- params.mem_buffer = cache.buf.data();
757
+ params.mem_size = cache.buf.size;
758
+ params.mem_buffer = cache.buf.addr;
267
759
  params.no_alloc = false;
268
760
 
269
761
  cache.ctx = ggml_init(params);
@@ -279,13 +771,6 @@ static bool kv_cache_init(
279
771
  return true;
280
772
  }
281
773
 
282
- static void kv_cache_free(struct llama_kv_cache & cache) {
283
- if (cache.ctx) {
284
- ggml_free(cache.ctx);
285
- cache.ctx = nullptr;
286
- }
287
- }
288
-
289
774
  struct llama_context_params llama_context_default_params() {
290
775
  struct llama_context_params result = {
291
776
  /*.n_ctx =*/ 512,
@@ -294,6 +779,7 @@ struct llama_context_params llama_context_default_params() {
294
779
  /*.f16_kv =*/ false,
295
780
  /*.logits_all =*/ false,
296
781
  /*.vocab_only =*/ false,
782
+ /*.use_mmap =*/ true,
297
783
  /*.use_mlock =*/ false,
298
784
  /*.embedding =*/ false,
299
785
  /*.progress_callback =*/ nullptr,
@@ -303,243 +789,106 @@ struct llama_context_params llama_context_default_params() {
303
789
  return result;
304
790
  }
305
791
 
792
+ bool llama_mmap_supported() {
793
+ return llama_mmap::SUPPORTED;
794
+ }
795
+
796
+ bool llama_mlock_supported() {
797
+ return llama_mlock::SUPPORTED;
798
+ }
799
+
306
800
  //
307
801
  // model loading
308
802
  //
309
803
 
310
- static void *mmap_file(const char *fname, uint64_t *mm_length) {
311
- #if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
312
- HANDLE hFile = CreateFileA(fname,
313
- GENERIC_READ,
314
- FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
315
- NULL,
316
- OPEN_EXISTING,
317
- FILE_ATTRIBUTE_NORMAL | FILE_ATTRIBUTE_NOT_CONTENT_INDEXED,
318
- NULL);
319
- if (hFile == INVALID_HANDLE_VALUE) return 0;
320
- LARGE_INTEGER fileSize;
321
- fileSize.QuadPart = -1;
322
- GetFileSizeEx(hFile, &fileSize);
323
- int64_t length = fileSize.QuadPart;
324
- HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
325
- CloseHandle(hFile);
326
- if (!hMapping) return 0;
327
- void *addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
328
- CloseHandle(hMapping);
329
- if (!addr) return 0;
330
- #else
331
- int fd = open(fname, O_RDONLY);
332
- if (fd == -1) return 0;
333
- int64_t length = lseek(fd, 0, SEEK_END);
334
- void *addr = mmap(NULL, length, PROT_READ, MAP_SHARED, fd, 0);
335
- close(fd);
336
- if (addr == MAP_FAILED) return 0;
337
- #endif
338
- *mm_length = length;
339
- return addr;
804
+ static const char *llama_file_version_name(llama_file_version version) {
805
+ switch (version) {
806
+ case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)";
807
+ case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
808
+ case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (latest)";
809
+ default: LLAMA_ASSERT(false);
810
+ }
340
811
  }
341
812
 
342
- static void munmap_file(void * addr, size_t length) {
343
- #if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
344
- UnmapViewOfFile(addr);
345
- #else
346
- munmap(addr, length);
347
- #endif
813
+ static const char *llama_ftype_name(enum llama_ftype ftype) {
814
+ switch (ftype) {
815
+ case LLAMA_FTYPE_ALL_F32: return "all F32";
816
+ case LLAMA_FTYPE_MOSTLY_F16: return "mostly F16";
817
+ case LLAMA_FTYPE_MOSTLY_Q4_0: return "mostly Q4_0";
818
+ case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
819
+ case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
820
+ return "mostly Q4_1, some F16";
821
+ default: return "unknown, may not work";
822
+ }
348
823
  }
349
824
 
350
- static bool report_bad_magic(const char *path, uint32_t got, uint32_t want) {
351
- fprintf(stderr,
352
- "%s: invalid model file (bad magic [got %#x want %#x])\n"
353
- "\tyou most likely need to regenerate your ggml files\n"
354
- "\tthe benefit is you'll get 10-100x faster load times\n"
355
- "\tsee https://github.com/ggerganov/llama.cpp/issues/91\n"
356
- "\tuse convert-pth-to-ggml.py to regenerate from original pth\n"
357
- "\tuse migrate-ggml-2023-03-30-pr613.py if you deleted originals\n",
358
- path, got, want);
359
- return false;
825
+ static const char *llama_model_type_name(e_model type) {
826
+ switch (type) {
827
+ case MODEL_7B: return "7B";
828
+ case MODEL_13B: return "13B";
829
+ case MODEL_30B: return "30B";
830
+ case MODEL_65B: return "65B";
831
+ default: LLAMA_ASSERT(false);
832
+ }
360
833
  }
361
834
 
362
- static bool llama_model_load(
835
+ static void llama_model_load_internal(
363
836
  const std::string & fname,
364
837
  llama_context & lctx,
365
838
  int n_ctx,
366
- int n_parts,
367
839
  ggml_type memory_type,
840
+ bool use_mmap,
841
+ bool use_mlock,
368
842
  bool vocab_only,
369
843
  llama_progress_callback progress_callback,
370
- void *progress_callback_user_data) {
371
- fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
844
+ void * progress_callback_user_data) {
372
845
 
373
846
  lctx.t_start_us = ggml_time_us();
374
847
 
375
- auto & model = lctx.model;
376
- auto & vocab = lctx.vocab;
377
-
378
- auto fin = std::ifstream(fname, std::ios::binary);
379
- if (!fin) {
380
- fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
381
- return false;
382
- }
383
-
384
- std::vector<char> f_buf(1024*1024);
385
- fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
848
+ std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap, vocab_only));
386
849
 
387
- fin.seekg(0, fin.end);
388
- const size_t file_size = fin.tellg();
389
- fin.seekg(0);
850
+ lctx.vocab = std::move(ml->file_loaders.at(0)->vocab);
851
+ auto & model = lctx.model;
852
+ model.hparams = ml->file_loaders.at(0)->hparams;
853
+ llama_file_version file_version = ml->file_loaders.at(0)->file_version;
854
+ auto & hparams = model.hparams;
855
+ uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
390
856
 
391
- // verify magic
392
857
  {
393
- uint32_t magic;
394
- fin.read((char *) &magic, sizeof(magic));
395
- if (magic == LLAMA_FILE_MAGIC_UNVERSIONED) {
396
- fprintf(stderr, "%s: invalid model file '%s' (too old, regenerate your model files or convert them with convert-unversioned-ggml-to-ggml.py!)\n",
397
- __func__, fname.c_str());
398
- return false;
858
+ switch (hparams.n_layer) {
859
+ case 32: model.type = e_model::MODEL_7B; break;
860
+ case 40: model.type = e_model::MODEL_13B; break;
861
+ case 60: model.type = e_model::MODEL_30B; break;
862
+ case 80: model.type = e_model::MODEL_65B; break;
399
863
  }
400
- if (magic != LLAMA_FILE_MAGIC) {
401
- return report_bad_magic(fname.c_str(), magic, LLAMA_FILE_MAGIC);
402
- }
403
-
404
- uint32_t format_version;
405
- fin.read((char *) &format_version, sizeof(format_version));
406
-
407
- if (format_version != LLAMA_FILE_VERSION) {
408
- fprintf(stderr, "%s: invalid model file '%s' (unsupported format version %" PRIu32 ", expected %d)\n",
409
- __func__, fname.c_str(), format_version, LLAMA_FILE_VERSION);
410
- return false;
411
- }
412
- }
413
-
414
- int n_ff = 0;
415
-
416
- // load hparams
417
- {
418
- auto & hparams = model.hparams;
419
-
420
- fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
421
- //fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
422
- fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
423
- fin.read((char *) &hparams.n_mult, sizeof(hparams.n_mult));
424
- fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
425
- fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
426
- fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
427
- fin.read((char *) &hparams.f16, sizeof(hparams.f16));
428
864
 
429
865
  hparams.n_ctx = n_ctx;
430
-
431
- n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
432
-
433
- if (n_parts < 1) {
434
- n_parts = LLAMA_N_PARTS.at(hparams.n_embd);
435
- }
436
-
437
- // temp warning to tell the user to use "--n_parts"
438
- if (hparams.f16 == 4 && n_parts != 1) {
439
- fprintf(stderr, "%s: GPTQ model detected - are you sure n_parts should be %d? we normally expect it to be 1\n", __func__, n_parts);
440
- fprintf(stderr, "%s: use '--n_parts 1' if necessary\n", __func__);
441
- }
442
-
443
- if (hparams.n_layer == 32) {
444
- model.type = e_model::MODEL_7B;
445
- }
446
-
447
- if (hparams.n_layer == 40) {
448
- model.type = e_model::MODEL_13B;
449
- }
450
-
451
- if (hparams.n_layer == 60) {
452
- model.type = e_model::MODEL_30B;
453
- }
454
-
455
- if (hparams.n_layer == 80) {
456
- model.type = e_model::MODEL_65B;
457
- }
458
-
459
- fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab);
460
- fprintf(stderr, "%s: n_ctx = %d\n", __func__, hparams.n_ctx);
461
- fprintf(stderr, "%s: n_embd = %d\n", __func__, hparams.n_embd);
462
- fprintf(stderr, "%s: n_mult = %d\n", __func__, hparams.n_mult);
463
- fprintf(stderr, "%s: n_head = %d\n", __func__, hparams.n_head);
464
- fprintf(stderr, "%s: n_layer = %d\n", __func__, hparams.n_layer);
465
- fprintf(stderr, "%s: n_rot = %d\n", __func__, hparams.n_rot);
466
- fprintf(stderr, "%s: f16 = %d\n", __func__, hparams.f16);
467
- fprintf(stderr, "%s: n_ff = %d\n", __func__, n_ff);
468
- fprintf(stderr, "%s: n_parts = %d\n", __func__, n_parts);
469
- fprintf(stderr, "%s: type = %d\n", __func__, model.type);
470
866
  }
471
867
 
472
- // load vocab
473
868
  {
474
- std::string word;
475
- vocab.id_to_token.resize(model.hparams.n_vocab);
476
- std::vector<char> tmp(64);
477
-
478
- for (int i = 0; i < model.hparams.n_vocab; i++) {
479
- uint32_t len;
480
- fin.read((char *) &len, sizeof(len));
481
-
482
- word.resize(len);
483
- if (len > 0) {
484
- tmp.resize(len);
485
- fin.read(tmp.data(), len);
486
- word.assign(tmp.data(), len);
487
- } else {
488
- word.clear();
489
- }
490
-
491
- float score;
492
- fin.read((char *) &score, sizeof(score));
493
-
494
- vocab.token_to_id[word] = i;
495
-
496
- auto &tok_score = vocab.id_to_token[i];
497
- tok_score.tok = word;
498
- tok_score.score = score;
499
- }
869
+ fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version));
870
+ fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab);
871
+ fprintf(stderr, "%s: n_ctx = %u\n", __func__, hparams.n_ctx);
872
+ fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd);
873
+ fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult);
874
+ fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head);
875
+ fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer);
876
+ fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot);
877
+ fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
878
+ fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
879
+ fprintf(stderr, "%s: n_parts = %zu\n", __func__, ml->file_loaders.size());
880
+ fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
500
881
  }
501
882
 
502
883
  if (vocab_only) {
503
- return true;
504
- }
505
-
506
- // for the big tensors, we have the option to store the data in 16-bit floats or quantized
507
- // in order to save memory and also to speed up the computation
508
- // wtype is for per-layer weights, while vtype is for other weights
509
- ggml_type wtype, vtype;
510
- switch (model.hparams.f16) {
511
- case 0: wtype = vtype = GGML_TYPE_F32; break;
512
- case 1: wtype = vtype = GGML_TYPE_F16; break;
513
- case 2: wtype = vtype = GGML_TYPE_Q4_0; break;
514
- case 3: wtype = vtype = GGML_TYPE_Q4_1; break;
515
- case 4: wtype = GGML_TYPE_Q4_1; vtype = GGML_TYPE_F16; break;
516
- default:
517
- {
518
- fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n",
519
- __func__, fname.c_str(), model.hparams.f16);
520
- return false;
521
- }
522
- }
523
-
524
- // map model into memory
525
- char *mm_addr = NULL;
526
- model.mm_addr = mmap_file(fname.c_str(), &model.mm_length);
527
- if (model.mm_addr == NULL) {
528
- fprintf(stderr, "%s: failed to mmap '%s'\n", __func__, fname.c_str());
529
- return false;
884
+ return;
530
885
  }
531
- mm_addr = (char *)model.mm_addr;
532
- fprintf(stderr, "%s: ggml map size = %6.2f MB\n", __func__, model.mm_length/(1024.0*1024.0));
533
886
 
534
887
  auto & ctx = model.ctx;
535
888
 
536
- size_t ctx_size = 0;
537
- {
538
- const auto &hparams = model.hparams;
539
- const int n_layer = hparams.n_layer;
540
- ctx_size += (5 + 10*n_layer)*256; // object overhead
541
- fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0);
542
- }
889
+ size_t ctx_size, mmapped_size;
890
+ ml->calc_sizes(&ctx_size, &mmapped_size);
891
+ fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0);
543
892
 
544
893
  // print memory requirements
545
894
  {
@@ -548,7 +897,7 @@ static bool llama_model_load(
548
897
  // this is the total memory required to run the inference
549
898
  const size_t mem_required =
550
899
  ctx_size +
551
- model.mm_length +
900
+ mmapped_size +
552
901
  MEM_REQ_SCRATCH0.at(model.type) +
553
902
  MEM_REQ_SCRATCH1.at(model.type) +
554
903
  MEM_REQ_EVAL.at (model.type);
@@ -564,17 +913,20 @@ static bool llama_model_load(
564
913
  // create the ggml context
565
914
  {
566
915
  lctx.model.buf.resize(ctx_size);
916
+ if (use_mlock) {
917
+ lctx.model.mlock_buf.init(lctx.model.buf.addr);
918
+ lctx.model.mlock_buf.grow_to(lctx.model.buf.size);
919
+ }
567
920
 
568
921
  struct ggml_init_params params = {
569
- /*.mem_size =*/ lctx.model.buf.size(),
570
- /*.mem_buffer =*/ lctx.model.buf.data(),
571
- /*.no_alloc =*/ true,
922
+ /*.mem_size =*/ lctx.model.buf.size,
923
+ /*.mem_buffer =*/ lctx.model.buf.addr,
924
+ /*.no_alloc =*/ ml->use_mmap,
572
925
  };
573
926
 
574
927
  model.ctx = ggml_init(params);
575
928
  if (!model.ctx) {
576
- fprintf(stderr, "%s: ggml_init() failed\n", __func__);
577
- return false;
929
+ throw format("ggml_init() failed");
578
930
  }
579
931
  }
580
932
 
@@ -582,161 +934,71 @@ static bool llama_model_load(
582
934
  {
583
935
  const auto & hparams = model.hparams;
584
936
 
585
- const int n_embd = hparams.n_embd;
586
- const int n_layer = hparams.n_layer;
587
- const int n_vocab = hparams.n_vocab;
588
-
589
- model.layers.resize(n_layer);
590
-
591
- model.tok_embeddings = ggml_new_tensor_2d(ctx, vtype, n_embd, n_vocab);
592
-
593
- model.norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
594
- model.output = ggml_new_tensor_2d(ctx, vtype, n_embd, n_vocab);
937
+ const uint32_t n_embd = hparams.n_embd;
938
+ const uint32_t n_layer = hparams.n_layer;
939
+ const uint32_t n_vocab = hparams.n_vocab;
595
940
 
596
- // map by name
597
- model.tensors["tok_embeddings.weight"] = model.tok_embeddings;
941
+ ml->ggml_ctx = ctx;
598
942
 
599
- model.tensors["norm.weight"] = model.norm;
600
- model.tensors["output.weight"] = model.output;
943
+ model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab});
944
+ model.norm = ml->get_tensor("norm.weight", {n_embd});
945
+ model.output = ml->get_tensor("output.weight", {n_embd, n_vocab});
601
946
 
602
- for (int i = 0; i < n_layer; ++i) {
947
+ model.layers.resize(n_layer);
948
+ for (uint32_t i = 0; i < n_layer; ++i) {
603
949
  auto & layer = model.layers[i];
604
950
 
605
- layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
606
-
607
- layer.wq = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
608
- layer.wk = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
609
- layer.wv = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
610
- layer.wo = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
611
-
612
- layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
613
-
614
- layer.w1 = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ff);
615
- layer.w2 = ggml_new_tensor_2d(ctx, wtype, n_ff, n_embd);
616
- layer.w3 = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ff);
951
+ std::string layers_i = "layers." + std::to_string(i);
617
952
 
618
- // map by name
619
- model.tensors["layers." + std::to_string(i) + ".attention_norm.weight"] = layer.attention_norm;
953
+ layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd});
620
954
 
621
- model.tensors["layers." + std::to_string(i) + ".attention.wq.weight"] = layer.wq;
622
- model.tensors["layers." + std::to_string(i) + ".attention.wk.weight"] = layer.wk;
623
- model.tensors["layers." + std::to_string(i) + ".attention.wv.weight"] = layer.wv;
624
- model.tensors["layers." + std::to_string(i) + ".attention.wo.weight"] = layer.wo;
955
+ layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd});
956
+ layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd});
957
+ layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd});
958
+ layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd});
625
959
 
626
- model.tensors["layers." + std::to_string(i) + ".ffn_norm.weight"] = layer.ffn_norm;
960
+ layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd});
627
961
 
628
- model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.weight"] = layer.w1;
629
- model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight"] = layer.w2;
630
- model.tensors["layers." + std::to_string(i) + ".feed_forward.w3.weight"] = layer.w3;
962
+ layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff});
963
+ layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd});
964
+ layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff});
631
965
  }
632
966
  }
633
967
 
634
- std::vector<uint8_t> tmp;
968
+ ml->done_getting_tensors();
635
969
 
636
- if (progress_callback) {
637
- progress_callback(0.0, progress_callback_user_data);
970
+ // populate `tensors_by_name`
971
+ for (llama_load_tensor & lt : ml->tensors_map.tensors) {
972
+ model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
638
973
  }
639
974
 
640
- fprintf(stderr, "%s: loading tensors from '%s'\n", __func__, fname.c_str());
975
+ ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
641
976
 
642
- // load weights
643
- {
644
- size_t total_size = 0;
645
- model.n_loaded = 0;
646
-
647
- while (true) {
648
- int32_t n_dims;
649
- int32_t length;
650
- int32_t ftype;
651
-
652
- fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
653
- fin.read(reinterpret_cast<char *>(&length), sizeof(length));
654
- fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
655
-
656
- if (fin.eof()) {
657
- break;
658
- }
659
-
660
- int32_t nelements = 1;
661
- int32_t ne[2] = { 1, 1 };
662
- for (int i = 0; i < n_dims; ++i) {
663
- fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
664
- nelements *= ne[i];
665
- }
666
-
667
- std::string name(length, 0);
668
- fin.read(&name[0], length);
669
-
670
- if (model.tensors.find(name.data()) == model.tensors.end()) {
671
- fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
672
- return false;
673
- }
674
-
675
- auto tensor = model.tensors[name.data()];
676
-
677
- if (ggml_nelements(tensor) != nelements) {
678
- fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
679
- return false;
680
- }
681
- if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
682
- fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%" PRId64 ", %" PRId64 "], expected [%d, %d]\n",
683
- __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
684
- return false;
685
- }
686
- if (0) {
687
- static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
688
- fprintf(stderr, "%24s - [%5d, %5d], type = %6s\n", name.data(), ne[0], ne[1], ftype_str[ftype]);
689
- }
690
-
691
- switch (ftype) {
692
- case 0: // f32
693
- case 1: // f16
694
- break;
695
- case 2: // q4_0
696
- case 3: // q4_1
697
- assert(ne[0] % 64 == 0);
698
- break;
699
- default:
700
- fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
701
- return false;
702
- };
703
-
704
- // load the tensor data into memory without copying or reading it
705
- size_t offset = fin.tellg();
706
- size_t tensor_data_size = ggml_nbytes(tensor);
707
- offset = (offset + 31) & -32;
708
- tensor->data = mm_addr + offset;
709
- fin.seekg(offset + tensor_data_size);
710
- total_size += tensor_data_size;
711
- model.n_loaded++;
712
-
713
- // progress
714
- if (progress_callback) {
715
- double current_progress = size_t(fin.tellg()) / double(file_size);
716
- progress_callback(current_progress, progress_callback_user_data);
717
- }
718
- }
719
-
720
- fin.close();
721
-
722
- fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, model.n_loaded);
723
- if (model.n_loaded == 0) {
724
- fprintf(stderr, "%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__);
725
- } else if (model.n_loaded != (int) model.tensors.size()) {
726
- fprintf(stderr, "%s: ERROR not all tensors loaded from model file - expected %zu, got %d\n", __func__, model.tensors.size(), model.n_loaded);
727
- return false;
728
- }
729
- }
977
+ model.mapping = std::move(ml->mapping);
730
978
 
731
979
  // loading time will be recalculated after the first eval, so
732
980
  // we take page faults deferred by mmap() into consideration
733
981
  lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
982
+ }
734
983
 
735
- if (progress_callback) {
736
- progress_callback(1.0, progress_callback_user_data);
984
+ static bool llama_model_load(
985
+ const std::string & fname,
986
+ llama_context & lctx,
987
+ int n_ctx,
988
+ ggml_type memory_type,
989
+ bool use_mmap,
990
+ bool use_mlock,
991
+ bool vocab_only,
992
+ llama_progress_callback progress_callback,
993
+ void *progress_callback_user_data) {
994
+ try {
995
+ llama_model_load_internal(fname, lctx, n_ctx, memory_type, use_mmap, use_mlock,
996
+ vocab_only, progress_callback, progress_callback_user_data);
997
+ return true;
998
+ } catch (const std::string & err) {
999
+ fprintf(stderr, "error loading model: %s\n", err.c_str());
1000
+ return false;
737
1001
  }
738
-
739
- return true;
740
1002
  }
741
1003
 
742
1004
  // evaluate the transformer
@@ -774,8 +1036,8 @@ static bool llama_eval_internal(
774
1036
  auto & buf_compute = lctx.buf_compute;
775
1037
 
776
1038
  struct ggml_init_params params = {
777
- /*.mem_size =*/ buf_compute.size(),
778
- /*.mem_buffer =*/ buf_compute.data(),
1039
+ /*.mem_size =*/ buf_compute.size,
1040
+ /*.mem_buffer =*/ buf_compute.addr,
779
1041
  /*.no_alloc =*/ false,
780
1042
  };
781
1043
 
@@ -1061,7 +1323,7 @@ struct llama_tokenizer {
1061
1323
  size_t offs = 0;
1062
1324
  while (offs < text.size()) {
1063
1325
  llama_sp_symbol sym;
1064
- size_t char_len = Min(text.size() - offs, utf8_len(text[offs]));
1326
+ size_t char_len = std::min(text.size() - offs, utf8_len(text[offs]));
1065
1327
  sym.text = text.c_str() + offs;
1066
1328
  sym.n = char_len;
1067
1329
  offs += char_len;
@@ -1236,7 +1498,7 @@ static llama_vocab::id llama_sample_top_p_top_k(
1236
1498
  }
1237
1499
  }
1238
1500
 
1239
- sample_top_k(logits_id, top_k > 0 ? Min(top_k, n_logits) : n_logits);
1501
+ sample_top_k(logits_id, top_k > 0 ? std::min(top_k, n_logits) : n_logits);
1240
1502
 
1241
1503
  // compute probs for the top k tokens
1242
1504
  std::vector<float> probs;
@@ -1284,298 +1546,118 @@ static llama_vocab::id llama_sample_top_p_top_k(
1284
1546
  // quantization
1285
1547
  //
1286
1548
 
1287
- // TODO: reuse code from the llama_model_load() somehow
1288
- static bool llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, int itype) {
1289
- ggml_type type = GGML_TYPE_Q4_1;
1290
-
1291
- switch (itype) {
1292
- case 2: type = GGML_TYPE_Q4_0; break;
1293
- case 3: type = GGML_TYPE_Q4_1; break;
1294
- default: fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype); return 1;
1549
+ static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype) {
1550
+ ggml_type quantized_type;
1551
+ switch (ftype) {
1552
+ case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
1553
+ case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
1554
+ default: throw format("invalid output file type %d\n", ftype);
1295
1555
  };
1296
1556
 
1297
- if (type != GGML_TYPE_Q4_0 && type != GGML_TYPE_Q4_1) {
1298
- fprintf(stderr, "%s: invalid quantization type %d\n", __func__, type);
1299
- return false;
1300
- }
1301
-
1302
- llama_vocab vocab;
1303
-
1304
- printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
1305
-
1306
- auto finp = std::ifstream(fname_inp, std::ios::binary);
1307
- if (!finp) {
1308
- fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__, fname_inp.c_str());
1309
- return false;
1310
- }
1311
-
1312
- auto fout = std::ofstream(fname_out, std::ios::binary);
1313
- if (!fout) {
1314
- fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname_out.c_str());
1315
- return false;
1316
- }
1317
-
1318
- // verify magic
1319
- {
1320
- uint32_t magic;
1321
- finp.read((char *) &magic, sizeof(magic));
1322
- if (magic == LLAMA_FILE_MAGIC_UNVERSIONED) {
1323
- fprintf(stderr, "%s: invalid model file '%s' (too old, regenerate your model files!)\n",
1324
- __func__, fname_inp.c_str());
1325
- return false;
1326
- }
1327
- if (magic != LLAMA_FILE_MAGIC) {
1328
- return report_bad_magic(fname_inp.c_str(), magic, LLAMA_FILE_MAGIC);
1329
- }
1330
-
1331
- fout.write((char *) &magic, sizeof(magic));
1332
-
1333
- uint32_t format_version;
1334
- finp.read((char *) &format_version, sizeof(format_version));
1335
-
1336
- if (format_version != LLAMA_FILE_VERSION) {
1337
- fprintf(stderr, "%s: invalid model file '%s' (unsupported format version %" PRIu32 ", expected %d)\n",
1338
- __func__, fname_inp.c_str(), format_version, LLAMA_FILE_VERSION);
1339
- return false;
1340
- }
1341
-
1342
- fout.write((char *) &format_version, sizeof(format_version));
1343
- }
1344
-
1345
- llama_hparams hparams;
1346
-
1347
- // load hparams
1348
- {
1349
- finp.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
1350
- //finp.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
1351
- finp.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
1352
- finp.read((char *) &hparams.n_mult, sizeof(hparams.n_mult));
1353
- finp.read((char *) &hparams.n_head, sizeof(hparams.n_head));
1354
- finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
1355
- finp.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
1356
- finp.read((char *) &hparams.f16, sizeof(hparams.f16));
1357
-
1358
- printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
1359
- printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
1360
- printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
1361
- printf("%s: n_mult = %d\n", __func__, hparams.n_mult);
1362
- printf("%s: n_head = %d\n", __func__, hparams.n_head);
1363
- printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
1364
- printf("%s: f16 = %d\n", __func__, hparams.f16);
1365
-
1366
- fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
1367
- //fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
1368
- fout.write((char *) &hparams.n_embd, sizeof(hparams.n_embd));
1369
- fout.write((char *) &hparams.n_mult, sizeof(hparams.n_mult));
1370
- fout.write((char *) &hparams.n_head, sizeof(hparams.n_head));
1371
- fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
1372
- fout.write((char *) &hparams.n_rot, sizeof(hparams.n_rot));
1373
- fout.write((char *) &itype, sizeof(hparams.f16));
1374
- }
1375
-
1376
- // load vocab
1377
- {
1378
- const int32_t n_vocab = hparams.n_vocab;
1379
-
1380
- if (n_vocab != hparams.n_vocab) {
1381
- fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
1382
- __func__, fname_inp.c_str(), n_vocab, hparams.n_vocab);
1383
- return false;
1384
- }
1385
-
1386
- std::vector<char> word(32);
1387
- vocab.id_to_token.resize(n_vocab);
1388
- for (int i = 0; i < n_vocab; i++) {
1389
- uint32_t len;
1390
- finp.read ((char *) &len, sizeof(len));
1391
- fout.write((char *) &len, sizeof(len));
1392
-
1393
- word.resize(len);
1394
- finp.read ((char *) &word[0], len);
1395
- fout.write((char *) &word[0], len);
1396
-
1397
- float score;
1398
- finp.read ((char *) &score, sizeof(score));
1399
- fout.write((char *) &score, sizeof(score));
1400
-
1401
- vocab.token_to_id[word.data()] = i;
1402
-
1403
- auto &tok_score = vocab.id_to_token[i];
1404
- tok_score.tok = word.data();
1405
- tok_score.score = score;
1406
- }
1407
- }
1408
-
1409
- // load weights
1410
- {
1411
- size_t total_size_org = 0;
1412
- size_t total_size_new = 0;
1413
-
1414
- std::vector<float> work;
1415
-
1416
- std::vector<uint8_t> data_u8;
1417
- std::vector<ggml_fp16_t> data_f16;
1418
- std::vector<float> data_f32;
1419
-
1420
- std::vector<int64_t> hist_all(1 << 4, 0);
1421
-
1422
- while (true) {
1423
- int32_t n_dims;
1424
- int32_t length;
1425
- int32_t ftype;
1426
-
1427
- finp.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
1428
- finp.read(reinterpret_cast<char *>(&length), sizeof(length));
1429
- finp.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
1430
-
1431
- if (finp.eof()) {
1432
- break;
1433
- }
1434
-
1435
- int32_t nelements = 1;
1436
- int32_t ne[2] = { 1, 1 };
1437
- for (int i = 0; i < n_dims; ++i) {
1438
- finp.read (reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
1439
- nelements *= ne[i];
1440
- }
1441
-
1442
- std::string name(length, 0);
1443
- finp.read (&name[0], length);
1444
-
1445
- {
1446
- // ensure tensor data is aligned
1447
- uint64_t offset = finp.tellg();
1448
- offset = (offset + 31) & -32;
1449
- finp.seekg(offset);
1450
- }
1451
-
1452
- {
1453
- static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
1454
- printf("%48s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ftype_str[ftype]);
1455
- }
1456
-
1457
- // regexes of tensor names to be quantized
1458
- const std::vector<std::string> k_names = {
1459
- ".*weight",
1460
- };
1461
-
1462
- bool quantize = false;
1463
- for (const auto & s : k_names) {
1464
- if (std::regex_match(name, std::regex(s))) {
1465
- quantize = true;
1466
- break;
1467
- }
1468
- }
1469
-
1470
- // quantize only 2D tensors
1471
- quantize &= (n_dims == 2);
1472
-
1473
- if (quantize) {
1474
- if (ftype != 0 && ftype != 1) {
1475
- fprintf(stderr, "%s: unsupported ftype %d for integer quantization\n", __func__, ftype);
1476
- return false;
1477
- }
1478
-
1479
- if (ftype == 1) {
1480
- data_f16.resize(nelements);
1481
- finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
1482
- data_f32.resize(nelements);
1483
- for (int i = 0; i < nelements; ++i) {
1484
- data_f32[i] = ggml_fp16_to_fp32(data_f16[i]);
1485
- }
1486
- } else {
1487
- data_f32.resize(nelements);
1488
- finp.read(reinterpret_cast<char *>(data_f32.data()), nelements * sizeof(float));
1557
+ std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false,
1558
+ /*vocab_only*/ false));
1559
+ llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
1560
+
1561
+ size_t total_size_org = 0;
1562
+ size_t total_size_new = 0;
1563
+ std::vector<int64_t> hist_all(1 << 4, 0);
1564
+
1565
+ size_t idx = 0;
1566
+ for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) {
1567
+ llama_buffer read_data;
1568
+ read_data.resize(tensor.size);
1569
+ tensor.data = read_data.addr;
1570
+ model_loader->load_data_for(tensor);
1571
+
1572
+ printf("[%zu/%zu] %36s - %s, type = %6s, ",
1573
+ ++idx, model_loader->tensors_map.tensors.size(),
1574
+ tensor.name.c_str(), llama_format_tensor_shape(tensor.ne).c_str(),
1575
+ ggml_type_name(tensor.type));
1576
+
1577
+ // This used to be a regex, but <regex> has an extreme cost to compile times.
1578
+ bool quantize = tensor.name.rfind("weight") == tensor.name.size() - 6; // ends with 'weight'?
1579
+
1580
+ // quantize only 2D tensors
1581
+ quantize &= (tensor.ne.size() == 2);
1582
+
1583
+ enum ggml_type new_type;
1584
+ void * new_data;
1585
+ size_t new_size;
1586
+ llama_buffer work;
1587
+
1588
+ if (!quantize) {
1589
+ new_type = tensor.type;
1590
+ new_data = tensor.data;
1591
+ new_size = tensor.size;
1592
+ printf("size = %8.3f MB\n", tensor.size/1024.0/1024.0);
1593
+ } else {
1594
+ new_type = quantized_type;
1595
+ float * f32_data;
1596
+ size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
1597
+ llama_buffer f32_conv_buf;
1598
+ if (tensor.type == GGML_TYPE_F32) {
1599
+ f32_data = (float *) tensor.data;
1600
+ } else if (tensor.type == GGML_TYPE_F16) {
1601
+ f32_conv_buf.resize(nelements * sizeof(float));
1602
+ f32_data = (float *) f32_conv_buf.addr;
1603
+ auto f16_data = (const ggml_fp16_t *) tensor.data;
1604
+ for (size_t i = 0; i < nelements; i++) {
1605
+ f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
1489
1606
  }
1490
-
1491
- ftype = itype;
1492
1607
  } else {
1493
- const int bpe = (ftype == 0) ? sizeof(float) : sizeof(uint16_t);
1494
-
1495
- data_u8.resize(nelements*bpe);
1496
- finp.read(reinterpret_cast<char *>(data_u8.data()), nelements * bpe);
1608
+ throw format("type %s unsupported for integer quantization", ggml_type_name(tensor.type));
1497
1609
  }
1498
1610
 
1499
- fout.write(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
1500
- fout.write(reinterpret_cast<char *>(&length), sizeof(length));
1501
- fout.write(reinterpret_cast<char *>(&ftype), sizeof(ftype));
1502
- for (int i = 0; i < n_dims; ++i) {
1503
- fout.write(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
1611
+ printf("quantizing .. ");
1612
+ fflush(stdout);
1613
+
1614
+ work.resize(nelements * 4); // upper bound on size
1615
+ new_data = work.addr;
1616
+ std::vector<int64_t> hist_cur(1 << 4, 0);
1617
+
1618
+ switch (new_type) {
1619
+ case GGML_TYPE_Q4_0:
1620
+ {
1621
+ new_size = ggml_quantize_q4_0(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
1622
+ } break;
1623
+ case GGML_TYPE_Q4_1:
1624
+ {
1625
+ new_size = ggml_quantize_q4_1(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
1626
+ } break;
1627
+ default:
1628
+ LLAMA_ASSERT(false);
1504
1629
  }
1505
- fout.write(&name[0], length);
1506
1630
 
1507
- {
1508
- // ensure tensor data is aligned
1509
- uint64_t offset = fout.tellp();
1510
- offset = (offset + 31) & -32;
1511
- fout.seekp(offset);
1631
+ printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
1632
+ for (size_t i = 0; i < hist_cur.size(); i++) {
1633
+ hist_all[i] += hist_cur[i];
1512
1634
  }
1513
1635
 
1514
- if (quantize) {
1515
- printf("quantizing .. ");
1516
- work.resize(nelements); // for quantization
1517
-
1518
- size_t cur_size = 0;
1519
- std::vector<int64_t> hist_cur(1 << 4, 0);
1520
-
1521
- switch (type) {
1522
- case GGML_TYPE_Q4_0:
1523
- {
1524
- cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
1525
- } break;
1526
- case GGML_TYPE_Q4_1:
1527
- {
1528
- cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
1529
- } break;
1530
- default:
1531
- {
1532
- fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, type);
1533
- return false;
1534
- }
1535
- }
1536
-
1537
- fout.write(reinterpret_cast<char *>(work.data()), cur_size);
1538
- total_size_new += cur_size;
1539
-
1540
- printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
1541
- for (int i = 0; i < (int) hist_cur.size(); ++i) {
1542
- hist_all[i] += hist_cur[i];
1543
- }
1544
-
1545
- for (int i = 0; i < (int) hist_cur.size(); ++i) {
1546
- printf("%5.3f ", hist_cur[i] / float(nelements));
1547
- }
1548
- printf("\n");
1549
- } else {
1550
- printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0);
1551
- fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size());
1552
- total_size_new += data_u8.size();
1636
+ for (size_t i = 0; i < hist_cur.size(); i++) {
1637
+ printf("%5.3f ", hist_cur[i] / float(nelements));
1553
1638
  }
1554
-
1555
- total_size_org += nelements * sizeof(float);
1639
+ printf("\n");
1556
1640
  }
1641
+ total_size_org += tensor.size;
1642
+ total_size_new += new_size;
1643
+ file_saver.write_tensor(tensor, new_type, new_data, new_size);
1644
+ }
1557
1645
 
1558
- printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
1559
- printf("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
1646
+ printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
1647
+ printf("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
1560
1648
 
1561
- {
1562
- int64_t sum_all = 0;
1563
- for (int i = 0; i < (int) hist_all.size(); ++i) {
1564
- sum_all += hist_all[i];
1565
- }
1649
+ {
1650
+ int64_t sum_all = 0;
1651
+ for (size_t i = 0; i < hist_all.size(); i++) {
1652
+ sum_all += hist_all[i];
1653
+ }
1566
1654
 
1567
- printf("%s: hist: ", __func__);
1568
- for (int i = 0; i < (int) hist_all.size(); ++i) {
1569
- printf("%5.3f ", hist_all[i] / float(sum_all));
1570
- }
1571
- printf("\n");
1655
+ printf("%s: hist: ", __func__);
1656
+ for (size_t i = 0; i < hist_all.size(); i++) {
1657
+ printf("%5.3f ", hist_all[i] / float(sum_all));
1572
1658
  }
1659
+ printf("\n");
1573
1660
  }
1574
-
1575
- finp.close();
1576
- fout.close();
1577
-
1578
- return true;
1579
1661
  }
1580
1662
 
1581
1663
  //
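Note on the new quantization path above: llama_model_quantize_internal now reads each tensor through llama_model_loader (with mmap disabled), converts F16 data to F32 where needed, quantizes it with ggml_quantize_q4_0 or ggml_quantize_q4_1 while filling a 16-bucket histogram, and writes the result through llama_file_saver. The standalone sketch below condenses the per-tensor step using only the ggml calls that appear in the diff; the helper name, the rows/cols parameters, and the 4-bytes-per-element upper bound are illustrative assumptions, not part of the library API.

#include <cstdint>
#include <cstdio>
#include <vector>

#include "ggml.h"

// Hedged sketch: quantize one row-major F32 matrix to Q4_0, mirroring the
// per-tensor step in llama_model_quantize_internal. quantize_q4_0_sketch,
// rows/cols and the 4-byte upper bound are illustrative assumptions only.
static size_t quantize_q4_0_sketch(const float * f32_data, size_t rows, size_t cols,
                                   std::vector<uint8_t> & out, std::vector<int64_t> & hist) {
    const size_t nelements = rows * cols;
    out.resize(nelements * 4);                 // upper bound on the quantized size
    hist.assign(1 << 4, 0);                    // 16 buckets, one per 4-bit value
    const size_t new_size = ggml_quantize_q4_0(f32_data, out.data(),
                                               (int) nelements, (int) cols, hist.data());
    printf("size = %8.2f MB -> %8.2f MB\n",
           nelements * sizeof(float) / 1024.0 / 1024.0,
           new_size / 1024.0 / 1024.0);
    return new_size;
}

For an F16 tensor, the diff first expands the data into a temporary F32 buffer with ggml_fp16_to_fp32 and then feeds it through the same quantization call.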
@@ -1593,32 +1675,36 @@ struct llama_context * llama_init_from_file(
1593
1675
  params.seed = time(NULL);
1594
1676
  }
1595
1677
 
1678
+ unsigned cur_percentage = 0;
1679
+ if (params.progress_callback == NULL) {
1680
+ params.progress_callback_user_data = &cur_percentage;
1681
+ params.progress_callback = [](float progress, void * ctx) {
1682
+ unsigned * cur_percentage_p = (unsigned *) ctx;
1683
+ unsigned percentage = (unsigned) (100 * progress);
1684
+ while (percentage > *cur_percentage_p) {
1685
+ ++*cur_percentage_p;
1686
+ fprintf(stderr, ".");
1687
+ fflush(stderr);
1688
+ if (percentage >= 100) {
1689
+ fprintf(stderr, "\n");
1690
+ }
1691
+ }
1692
+ };
1693
+ }
1694
+
1596
1695
  ctx->rng = std::mt19937(params.seed);
1597
1696
  ctx->logits_all = params.logits_all;
1598
1697
 
1599
1698
  ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
1600
1699
 
1601
- if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_parts, memory_type,
1602
- params.vocab_only, params.progress_callback,
1603
- params.progress_callback_user_data)) {
1700
+ if (!llama_model_load(path_model, *ctx, params.n_ctx, memory_type,
1701
+ params.use_mmap, params.use_mlock, params.vocab_only,
1702
+ params.progress_callback, params.progress_callback_user_data)) {
1604
1703
  fprintf(stderr, "%s: failed to load model\n", __func__);
1605
1704
  llama_free(ctx);
1606
1705
  return nullptr;
1607
1706
  }
1608
1707
 
1609
- if (params.use_mlock) {
1610
- char *err;
1611
- if (!ggml_mlock(ctx->model.ctx,
1612
- ctx->model.mm_addr,
1613
- ctx->model.mm_length,
1614
- &err)) {
1615
- fprintf(stderr, "%s\n", err);
1616
- free(err);
1617
- llama_free(ctx);
1618
- return nullptr;
1619
- }
1620
- }
1621
-
1622
1708
  // reserve memory for context buffers
1623
1709
  if (!params.vocab_only) {
1624
1710
  if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx)) {
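Note on the hunk above: when the caller leaves params.progress_callback unset, llama_init_from_file now installs a default callback that prints one dot per percentage point of loading progress. A caller can supply its own callback instead; the sketch below is a hedged example that assumes the llama_context_params fields used in this diff (progress_callback, progress_callback_user_data) and the public llama_context_default_params() / llama_init_from_file() / llama_free() entry points.

#include <cstdio>

#include "llama.h"

// Hedged sketch: install a custom progress callback instead of the default
// dot-printer added in this version.
int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s <model file>\n", argv[0]);
        return 1;
    }

    llama_context_params params = llama_context_default_params();
    params.progress_callback = [](float progress, void * /*user_data*/) {
        // progress is reported in [0, 1]
        fprintf(stderr, "\rloading: %3d%%", (int) (100 * progress));
        if (progress >= 1.0f) {
            fprintf(stderr, "\n");
        }
    };

    llama_context * ctx = llama_init_from_file(argv[1], params);
    if (ctx == nullptr) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }
    llama_free(ctx);
    return 0;
}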
@@ -1655,40 +1741,31 @@ struct llama_context * llama_init_from_file(
1655
1741
  }
1656
1742
 
1657
1743
  void llama_free(struct llama_context * ctx) {
1658
- kv_cache_free(ctx->model.kv_self);
1659
-
1660
- if (ctx->model.ctx) {
1661
- ggml_free(ctx->model.ctx);
1662
- }
1663
-
1664
- if (ctx->model.mm_addr) {
1665
- munmap_file(ctx->model.mm_addr, ctx->model.mm_length);
1666
- }
1667
-
1668
1744
  delete ctx;
1669
1745
  }
1670
1746
 
1671
1747
  int llama_model_quantize(
1672
1748
  const char * fname_inp,
1673
1749
  const char * fname_out,
1674
- int itype) {
1675
- if (!llama_model_quantize_internal(fname_inp, fname_out, itype)) {
1676
- fprintf(stderr, "%s: failed to quantize\n", __func__);
1750
+ enum llama_ftype ftype) {
1751
+ try {
1752
+ llama_model_quantize_internal(fname_inp, fname_out, ftype);
1753
+ return 0;
1754
+ } catch (const std::string & err) {
1755
+ fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str());
1677
1756
  return 1;
1678
1757
  }
1679
-
1680
- return 0;
1681
1758
  }
1682
1759
 
1683
1760
  // Returns the KV cache that will contain the context for the
1684
1761
  // ongoing prediction with the model.
1685
1762
  const uint8_t * llama_get_kv_cache(struct llama_context * ctx) {
1686
- return ctx->model.kv_self.buf.data();
1763
+ return ctx->model.kv_self.buf.addr;
1687
1764
  }
1688
1765
 
1689
1766
  // Returns the size of the KV cache
1690
1767
  size_t llama_get_kv_cache_size(struct llama_context * ctx) {
1691
- return ctx->model.kv_self.buf.size();
1768
+ return ctx->model.kv_self.buf.size;
1692
1769
  }
1693
1770
 
1694
1771
  int llama_get_kv_cache_token_count(struct llama_context * ctx) {
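Note on the error handling above: llama_model_quantize now wraps the throwing internal implementation in a try/catch and converts a thrown std::string (built with the format() helper seen earlier in the diff) into a nonzero C-style return code. A minimal, generic sketch of the same idiom, with placeholder function names:

#include <cstdio>
#include <string>

// Hedged sketch of the idiom: the internal helper throws a std::string and the
// C-style entry point converts it to a return code. do_quantize_internal and
// its message are placeholders, not the real internals.
static void do_quantize_internal(const std::string & fname_inp) {
    if (fname_inp.empty()) {
        throw std::string("no input file given");
    }
    // ... load, quantize and save tensors here ...
}

int do_quantize(const char * fname_inp) {
    try {
        do_quantize_internal(fname_inp ? fname_inp : "");
        return 0;
    } catch (const std::string & err) {
        fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str());
        return 1;
    }
}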
@@ -1702,8 +1779,8 @@ void llama_set_kv_cache(
1702
1779
  size_t n_size,
1703
1780
  int n_token_count) {
1704
1781
  // Make sure we have the same kv cache setup
1705
- LLAMA_ASSERT(ctx->model.kv_self.buf.size() == n_size);
1706
- memcpy(ctx->model.kv_self.buf.data(), kv_cache, n_size);
1782
+ LLAMA_ASSERT(ctx->model.kv_self.buf.size == n_size);
1783
+ memcpy(ctx->model.kv_self.buf.addr, kv_cache, n_size);
1707
1784
  ctx->model.kv_self.n = n_token_count;
1708
1785
  }
1709
1786
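Note on the KV-cache accessors above: llama_get_kv_cache, llama_get_kv_cache_size and llama_get_kv_cache_token_count expose the raw cache buffer, and llama_set_kv_cache copies a previously saved buffer back, asserting that the size matches. A hedged snapshot/restore sketch built only on those four calls, assuming the signatures implied by this diff:

#include <cstdint>
#include <vector>

#include "llama.h"

// Hedged sketch: snapshot and restore the KV cache with the accessors above.
struct kv_snapshot {
    std::vector<uint8_t> data;
    int n_tokens = 0;
};

static kv_snapshot save_kv(struct llama_context * ctx) {
    kv_snapshot snap;
    const uint8_t * src  = llama_get_kv_cache(ctx);
    const size_t    size = llama_get_kv_cache_size(ctx);
    snap.data.assign(src, src + size);
    snap.n_tokens = llama_get_kv_cache_token_count(ctx);
    return snap;
}

static void restore_kv(struct llama_context * ctx, const kv_snapshot & snap) {
    // llama_set_kv_cache asserts that the buffer size matches the current cache
    llama_set_kv_cache(ctx, snap.data.data(), snap.data.size(), snap.n_tokens);
}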
 
@@ -1814,9 +1891,9 @@ llama_token llama_sample_top_p_top_k(
1814
1891
  void llama_print_timings(struct llama_context * ctx) {
1815
1892
  const int64_t t_end_us = ggml_time_us();
1816
1893
 
1817
- const int32_t n_sample = Max(1, ctx->n_sample);
1818
- const int32_t n_eval = Max(1, ctx->n_eval);
1819
- const int32_t n_p_eval = Max(1, ctx->n_p_eval);
1894
+ const int32_t n_sample = std::max(1, ctx->n_sample);
1895
+ const int32_t n_eval = std::max(1, ctx->n_eval);
1896
+ const int32_t n_p_eval = std::max(1, ctx->n_p_eval);
1820
1897
 
1821
1898
  fprintf(stderr, "\n");
1822
1899
  fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
@@ -1852,3 +1929,8 @@ const char * llama_print_system_info(void) {
1852
1929
 
1853
1930
  return s.c_str();
1854
1931
  }
1932
+
1933
+ // For internal test use
1934
+ std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
1935
+ return ctx->model.tensors_by_name;
1936
+ }
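Note on the final hunk: llama_internal_get_tensor_map exposes the model's tensors_by_name list for internal test use only. The sketch below shows how a test might inspect the loaded tensors through it; the redundant declaration keeps the sketch self-contained in case the accessor is not visible through llama.h, and the dump logic is only an illustration.

#include <cstdio>
#include <string>
#include <utility>
#include <vector>

#include "ggml.h"
#include "llama.h"

// Declared here for self-containment; a matching redeclaration is harmless if
// the accessor is already visible through a header.
std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(struct llama_context * ctx);

// Hedged sketch: dump the name, element count and type of every loaded tensor.
static void dump_tensor_map(struct llama_context * ctx) {
    for (const auto & it : llama_internal_get_tensor_map(ctx)) {
        printf("%-40s %12lld elements, type = %s\n",
               it.first.c_str(),
               (long long) ggml_nelements(it.second),
               ggml_type_name(it.second->type));
    }
}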