llama_cpp 0.0.3 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,49 +1,30 @@
1
+ // Defines fileno on msys:
2
+ #ifndef _GNU_SOURCE
3
+ #define _GNU_SOURCE
4
+ #endif
5
+
6
+ #include "llama_util.h"
1
7
  #include "llama.h"
2
8
 
3
9
  #include "ggml.h"
4
10
 
11
+ #include <array>
5
12
  #include <cinttypes>
6
13
  #include <fstream>
7
14
  #include <random>
8
15
  #include <map>
9
16
  #include <unordered_map>
10
17
  #include <queue>
11
- #include <regex>
12
18
  #include <cassert>
13
19
  #include <cstring>
14
-
15
- #if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
16
- #define WIN32_LEAN_AND_MEAN
17
- #include <Windows.h>
18
- #else
19
- #include <sys/types.h>
20
- #include <sys/mman.h>
21
- #include <unistd.h>
22
- #include <fcntl.h>
23
- #endif
24
-
25
- #define Min(X, Y) ((Y) > (X) ? (X) : (Y))
26
- #define Max(X, Y) ((Y) < (X) ? (X) : (Y))
20
+ #include <climits>
21
+ #include <memory>
22
+ #include <algorithm>
23
+ #include <initializer_list>
27
24
 
28
25
  #define LLAMA_USE_SCRATCH
29
26
  #define LLAMA_MAX_SCRATCH_BUFFERS 16
30
27
 
31
- #define LLAMA_ASSERT(x) \
32
- do { \
33
- if (!(x)) { \
34
- fprintf(stderr, "LLAMA_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
35
- abort(); \
36
- } \
37
- } while (0)
38
-
39
-
40
- // determine number of model parts based on the dimension
41
- static const std::unordered_map<int, int> LLAMA_N_PARTS = {
42
- { 4096, 1 },
43
- { 5120, 2 },
44
- { 6656, 4 },
45
- { 8192, 8 },
46
- };
47
28
 
48
29
  // available llama models
49
30
  enum e_model {
@@ -93,14 +74,18 @@ static const std::map<e_model, size_t> MEM_REQ_EVAL = {
93
74
 
94
75
  // default hparams (LLaMA 7B)
95
76
  struct llama_hparams {
96
- int32_t n_vocab = 32000;
97
- int32_t n_ctx = 512; // this is provided as user input?
98
- int32_t n_embd = 4096;
99
- int32_t n_mult = 256;
100
- int32_t n_head = 32;
101
- int32_t n_layer = 32;
102
- int32_t n_rot = 64;
103
- int32_t f16 = 1;
77
+ uint32_t n_vocab = 32000;
78
+ uint32_t n_ctx = 512; // this is provided as user input?
79
+ uint32_t n_embd = 4096;
80
+ uint32_t n_mult = 256;
81
+ uint32_t n_head = 32;
82
+ uint32_t n_layer = 32;
83
+ uint32_t n_rot = 64;
84
+ enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
85
+
86
+ bool operator!=(const llama_hparams & other) const {
87
+ return memcmp(this, &other, sizeof(llama_hparams));
88
+ }
104
89
  };
105
90
 
106
91
  struct llama_layer {
@@ -126,11 +111,17 @@ struct llama_kv_cache {
126
111
  struct ggml_tensor * k;
127
112
  struct ggml_tensor * v;
128
113
 
129
- struct ggml_context * ctx;
114
+ struct ggml_context * ctx = NULL;
130
115
 
131
- std::vector<uint8_t> buf;
116
+ llama_buffer buf;
132
117
 
133
118
  int n; // number of tokens currently in the cache
119
+
120
+ ~llama_kv_cache() {
121
+ if (ctx) {
122
+ ggml_free(ctx);
123
+ }
124
+ }
134
125
  };
135
126
 
136
127
  struct llama_model {
@@ -146,22 +137,30 @@ struct llama_model {
146
137
  std::vector<llama_layer> layers;
147
138
 
148
139
  // context
149
- struct ggml_context * ctx;
140
+ struct ggml_context * ctx = NULL;
150
141
 
151
142
  // key + value cache for the self attention
152
143
  // TODO: move to llama_state
153
144
  struct llama_kv_cache kv_self;
154
145
 
155
146
  // the model memory buffer
156
- std::vector<uint8_t> buf;
147
+ llama_buffer buf;
157
148
 
158
149
  // model memory mapped file
159
- void * mm_addr = NULL;
160
- uint64_t mm_length = 0;
150
+ std::unique_ptr<llama_mmap> mapping;
151
+
152
+ // objects representing data potentially being locked in memory
153
+ llama_mlock mlock_buf;
154
+ llama_mlock mlock_mmap;
155
+
156
+ // for quantize-stats only
157
+ std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
161
158
 
162
- // tensors
163
- int n_loaded;
164
- std::unordered_map<std::string, struct ggml_tensor *> tensors;
159
+ ~llama_model() {
160
+ if (ctx) {
161
+ ggml_free(ctx);
162
+ }
163
+ }
165
164
  };
166
165
 
167
166
  struct llama_vocab {
@@ -206,8 +205,8 @@ struct llama_context {
206
205
 
207
206
  // memory buffers used to evaluate the model
208
207
  // TODO: move in llama_state
209
- std::vector<uint8_t> buf_compute;
210
- std::vector<uint8_t> buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
208
+ llama_buffer buf_compute;
209
+ llama_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
211
210
 
212
211
  int buf_last = 0;
213
212
  size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
@@ -220,11 +219,11 @@ struct llama_context {
220
219
  last_size = ggml_set_scratch(ctx, { 0, 0, nullptr, });
221
220
  } else {
222
221
  auto & buf = buf_scratch[i];
223
- last_size = ggml_set_scratch(ctx, { 0, buf.size(), buf.data(), });
222
+ last_size = ggml_set_scratch(ctx, { 0, buf.size, buf.addr, });
224
223
  }
225
224
 
226
225
  if (buf_last >= 0) {
227
- buf_max_size[buf_last] = Max(buf_max_size[buf_last], last_size);
226
+ buf_max_size[buf_last] = std::max(buf_max_size[buf_last], last_size);
228
227
  }
229
228
 
230
229
  buf_last = i;
@@ -244,6 +243,499 @@ struct llama_context {
244
243
  }
245
244
  };
246
245
 
246
+ template <typename T>
247
+ static T checked_mul(T a, T b) {
248
+ T ret = a * b;
249
+ if (a != 0 && ret / a != b) {
250
+ throw format("overflow multiplying %llu * %llu",
251
+ (unsigned long long) a, (unsigned long long) b);
252
+ }
253
+ return ret;
254
+ }
255
+
256
+ static size_t checked_div(size_t a, size_t b) {
257
+ if (b == 0 || a % b != 0) {
258
+ throw format("error dividing %zu / %zu", a, b);
259
+ }
260
+ return a / b;
261
+ }
262
+
263
// Renders a tensor shape for log and error messages, e.g. {4096, 32000}
// becomes "[4096 x 32000]".
// An empty shape is rendered as "[]" instead of throwing std::out_of_range:
// this function is called while building error messages, so it must not
// raise a different exception of its own.
static std::string llama_format_tensor_shape(const std::vector<uint32_t> & ne) {
    std::string ret = "[";
    for (size_t i = 0; i < ne.size(); i++) {
        if (i > 0) {
            ret += " x ";
        }
        ret += std::to_string(ne[i]);
    }
    ret += "]";
    return ret;
}
271
+
272
+ static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml_type type) {
273
+ size_t size = ggml_type_size(type);
274
+ for (uint32_t dim : ne) {
275
+ size = checked_mul<size_t>(size, dim);
276
+ }
277
+ return size / ggml_blck_size(type);
278
+ }
279
+
280
+ struct llama_load_tensor_shard {
281
+ std::vector<uint32_t> ne;
282
+ size_t size;
283
+ enum ggml_type type;
284
+ size_t file_idx;
285
+ size_t file_off;
286
+
287
+ void calc_size() {
288
+ size = llama_calc_tensor_size(ne, type);
289
+ }
290
+ };
291
+
292
// How a tensor is distributed over the files of a multi-part model.
enum llama_split_type {
    // not split: the full shape is that of the first shard
    SPLIT_NONE,
    // shards are joined along ne[0]
    SPLIT_BY_COLUMNS,
    // shards are joined along ne[1]
    SPLIT_BY_ROWS
};
297
+
298
+ struct llama_load_tensor {
299
+ std::vector<llama_load_tensor_shard> shards;
300
+
301
+ std::string name;
302
+ enum ggml_type type = GGML_TYPE_F32;
303
+ llama_split_type split_type = SPLIT_NONE;
304
+ std::vector<uint32_t> ne;
305
+ size_t size;
306
+ struct ggml_tensor * ggml_tensor = NULL;
307
+ uint8_t * data;
308
+
309
+ llama_load_tensor(const std::string & name) : name(name) {}
310
+
311
+ void calc_all() {
312
+ calc_type();
313
+ calc_split_type();
314
+ calc_ne();
315
+ calc_size();
316
+ }
317
+
318
+ void calc_type() {
319
+ const auto & first_shard = shards.at(0);
320
+ for (const auto & shard : shards) {
321
+ if (shard.type != first_shard.type) {
322
+ throw format("inconsistent tensor shard type in '%s'", name.c_str());
323
+ }
324
+ }
325
+ type = first_shard.type;
326
+ }
327
+
328
+ void calc_split_type() {
329
+ if (shards.at(0).ne.size() == 1 || // 1D tensors are just duplicated in every file
330
+ shards.size() == 1) { // only one file?
331
+ split_type = SPLIT_NONE;
332
+ } else if (name.find("tok_embeddings.") == 0 ||
333
+ name.find(".attention.wo.weight") != std::string::npos ||
334
+ name.find(".feed_forward.w2.weight") != std::string::npos) {
335
+ split_type = SPLIT_BY_COLUMNS;
336
+ } else {
337
+ split_type = SPLIT_BY_ROWS;
338
+ }
339
+ }
340
+
341
+ void calc_ne() {
342
+ const auto & first_shard = shards.at(0);
343
+ for (const auto & shard : shards) {
344
+ if (shard.ne != first_shard.ne) {
345
+ throw format("inconsistent tensor shard shape in '%s': first was %s, other was %s",
346
+ name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str());
347
+ }
348
+ }
349
+ ne = first_shard.ne;
350
+ LLAMA_ASSERT(shards.size() <= UINT32_MAX);
351
+ uint32_t n_shards = (uint32_t) shards.size();
352
+ switch (split_type) {
353
+ case SPLIT_NONE:
354
+ ne = first_shard.ne;
355
+ break;
356
+ case SPLIT_BY_COLUMNS:
357
+ ne = {checked_mul<uint32_t>(first_shard.ne[0], n_shards),
358
+ first_shard.ne[1]};
359
+ break;
360
+ case SPLIT_BY_ROWS:
361
+ ne = {first_shard.ne[0],
362
+ checked_mul<uint32_t>(first_shard.ne[1], n_shards)};
363
+ break;
364
+ }
365
+ }
366
+
367
+ void calc_size() {
368
+ size = llama_calc_tensor_size(ne, type);
369
+ }
370
+ };
371
+
372
+ struct llama_load_tensors_map {
373
+ // tensors is kept in a separate vector to preserve file order
374
+ std::vector<llama_load_tensor> tensors;
375
+ std::unordered_map<std::string, size_t> name_to_idx;
376
+ };
377
+
378
// On-disk container formats, oldest first. The loader relies on this ordering
// (it tests `file_version >= ...` to decide which features a file has).
enum llama_file_version {
    LLAMA_FILE_VERSION_GGML,
    // added version field and scores in vocab
    LLAMA_FILE_VERSION_GGMF_V1,
    // added padding
    LLAMA_FILE_VERSION_GGJT_V1,
};
383
+
384
+ struct llama_file_loader {
385
+ llama_file file;
386
+ llama_file_version file_version;
387
+ llama_hparams hparams;
388
+ llama_vocab vocab;
389
+
390
+ llama_file_loader(const char * fname, size_t file_idx, llama_load_tensors_map & tensors_map)
391
+ : file(fname, "rb") {
392
+ fprintf(stderr, "llama.cpp: loading model from %s\n", fname);
393
+ read_magic();
394
+ read_hparams();
395
+ read_vocab();
396
+ read_tensor_metadata(file_idx, tensors_map);
397
+ }
398
+ void read_magic() {
399
+ uint32_t magic = file.read_u32();
400
+ uint32_t version = 0;
401
+
402
+ if (magic != 'ggml') {
403
+ version = file.read_u32();
404
+ }
405
+
406
+ if (magic == 'ggml' && version == 0) {
407
+ file_version = LLAMA_FILE_VERSION_GGML;
408
+ } else if (magic == 'ggmf' && version == 1) {
409
+ file_version = LLAMA_FILE_VERSION_GGMF_V1;
410
+ } else if (magic == 'ggjt' && version == 1) {
411
+ file_version = LLAMA_FILE_VERSION_GGJT_V1;
412
+ } else {
413
+ throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
414
+ magic, version);
415
+ }
416
+ }
417
+ void read_hparams() {
418
+ hparams.n_vocab = file.read_u32();
419
+ hparams.n_embd = file.read_u32();
420
+ hparams.n_mult = file.read_u32();
421
+ hparams.n_head = file.read_u32();
422
+ hparams.n_layer = file.read_u32();
423
+ hparams.n_rot = file.read_u32();
424
+ hparams.ftype = (enum llama_ftype) file.read_u32();
425
+ }
426
+ void read_vocab() {
427
+ vocab.id_to_token.resize(hparams.n_vocab);
428
+
429
+ for (uint32_t i = 0; i < hparams.n_vocab; i++) {
430
+ uint32_t len = file.read_u32();
431
+ std::string word = file.read_string(len);
432
+
433
+ float score = 0.0f;
434
+ if (file_version >= LLAMA_FILE_VERSION_GGMF_V1) {
435
+ file.read_raw(&score, sizeof(score));
436
+ }
437
+
438
+ vocab.token_to_id[word] = i;
439
+
440
+ auto & tok_score = vocab.id_to_token[i];
441
+ tok_score.tok = std::move(word);
442
+ tok_score.score = score;
443
+ }
444
+ }
445
+ void read_tensor_metadata(size_t file_idx, llama_load_tensors_map & tensors_map) {
446
+ while (file.tell() < file.size) {
447
+ llama_load_tensor_shard shard;
448
+ uint32_t n_dims = file.read_u32();
449
+ uint32_t name_len = file.read_u32();
450
+ shard.type = (enum ggml_type) file.read_u32();
451
+ shard.ne.resize(n_dims);
452
+ file.read_raw(shard.ne.data(), sizeof(shard.ne[0]) * n_dims);
453
+ std::string name = file.read_string(name_len);
454
+ if (n_dims < 1 || n_dims > 2) {
455
+ throw format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims);
456
+ }
457
+ switch (shard.type) {
458
+ case GGML_TYPE_F32:
459
+ case GGML_TYPE_F16:
460
+ case GGML_TYPE_Q4_0:
461
+ case GGML_TYPE_Q4_1:
462
+ break;
463
+ default: {
464
+ throw format("unrecognized tensor type %u\n", shard.type);
465
+ }
466
+ }
467
+
468
+ if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
469
+ // skip to the next multiple of 32 bytes
470
+ file.seek(-file.tell() & 31, SEEK_CUR);
471
+ }
472
+ shard.file_idx = file_idx;
473
+ shard.file_off = file.tell();
474
+
475
+ shard.calc_size();
476
+ file.seek(shard.size, SEEK_CUR);
477
+
478
+ auto it = tensors_map.name_to_idx.find(name);
479
+ size_t idx;
480
+ if (it != tensors_map.name_to_idx.end()) {
481
+ idx = it->second;
482
+ } else {
483
+ tensors_map.tensors.emplace_back(name);
484
+ idx = tensors_map.tensors.size() - 1;
485
+ tensors_map.name_to_idx.emplace(name, idx);
486
+ }
487
+ tensors_map.tensors.at(idx).shards.push_back(shard);
488
+ }
489
+ }
490
+ };
491
+
492
+ struct llama_file_saver {
493
+ llama_file file;
494
+ llama_file_loader * any_file_loader;
495
+ llama_file_saver(const char * fname, llama_file_loader * any_file_loader, enum llama_ftype new_ftype)
496
+ : file(fname, "wb"), any_file_loader(any_file_loader) {
497
+ fprintf(stderr, "llama.cpp: saving model to %s\n", fname);
498
+ write_magic();
499
+ write_hparams(new_ftype);
500
+ write_vocab();
501
+ }
502
+ void write_magic() {
503
+ file.write_u32('ggjt'); // magic
504
+ file.write_u32(1); // version
505
+ }
506
+ void write_hparams(enum llama_ftype new_ftype) {
507
+ const llama_hparams & hparams = any_file_loader->hparams;
508
+ file.write_u32(hparams.n_vocab);
509
+ file.write_u32(hparams.n_embd);
510
+ file.write_u32(hparams.n_mult);
511
+ file.write_u32(hparams.n_head);
512
+ file.write_u32(hparams.n_layer);
513
+ file.write_u32(hparams.n_rot);
514
+ file.write_u32(new_ftype);
515
+ }
516
+ void write_vocab() {
517
+ if (any_file_loader->file_version == LLAMA_FILE_VERSION_GGML) {
518
+ fprintf(stderr, "llama.cpp: WARNING: input is an old file that doesn't have scores; will add dummy scores\n");
519
+ }
520
+ uint32_t n_vocab = any_file_loader->hparams.n_vocab;
521
+ for (uint32_t i = 0; i < n_vocab; i++) {
522
+ const auto & token_score = any_file_loader->vocab.id_to_token.at(i);
523
+ file.write_u32((uint32_t) token_score.tok.size());
524
+ file.write_raw(token_score.tok.data(), token_score.tok.size());
525
+ file.write_raw(&token_score.score, sizeof(token_score.score));
526
+ }
527
+ }
528
+ void write_tensor(llama_load_tensor & tensor, enum ggml_type new_type, const void * new_data, size_t new_size) {
529
+ switch (new_type) {
530
+ case GGML_TYPE_F32:
531
+ case GGML_TYPE_F16:
532
+ case GGML_TYPE_Q4_0:
533
+ case GGML_TYPE_Q4_1:
534
+ break;
535
+ default: LLAMA_ASSERT(false);
536
+ }
537
+ file.write_u32((uint32_t) tensor.ne.size());
538
+ file.write_u32((uint32_t) tensor.name.size());
539
+ file.write_u32(new_type);
540
+ file.write_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * tensor.ne.size());
541
+ file.write_raw(tensor.name.data(), tensor.name.size());
542
+ file.seek(-file.tell() & 31, SEEK_CUR);
543
+ LLAMA_ASSERT(new_size == llama_calc_tensor_size(tensor.ne, new_type));
544
+ file.write_raw(new_data, new_size);
545
+ }
546
+ };
547
+
548
+ struct llama_model_loader {
549
+ std::vector<std::unique_ptr<llama_file_loader>> file_loaders;
550
+ llama_load_tensors_map tensors_map;
551
+ bool use_mmap;
552
+ size_t num_ggml_tensors_created = 0;
553
+ struct ggml_context * ggml_ctx = NULL;
554
+ std::unique_ptr<llama_mmap> mapping;
555
+
556
+ llama_model_loader(const std::string & fname_base, bool use_mmap, bool vocab_only) {
557
+ auto first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
558
+ file_loaders.emplace_back(first_file);
559
+ uint32_t n_parts = vocab_only ? 1 : guess_n_parts();
560
+ for (uint32_t i = 1; i < n_parts; i++) {
561
+ std::string fname = fname_base + "." + std::to_string(i);
562
+ auto ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
563
+ file_loaders.emplace_back(ith_file);
564
+ if (ith_file->hparams != first_file->hparams) {
565
+ throw format("llama.cpp: hparams inconsistent between files");
566
+ }
567
+ }
568
+ if (!llama_mmap::SUPPORTED) {
569
+ use_mmap = false;
570
+ }
571
+ if (use_mmap && alignment_prevents_mmap()) {
572
+ fprintf(stderr, "llama.cpp: can't use mmap because tensors are not aligned; convert to new format to avoid this\n");
573
+ use_mmap = false;
574
+ }
575
+ this->use_mmap = use_mmap;
576
+ for (llama_load_tensor & lt : tensors_map.tensors) {
577
+ lt.calc_all();
578
+ }
579
+ }
580
+
581
+ bool alignment_prevents_mmap() {
582
+ for (const llama_load_tensor & lt : tensors_map.tensors) {
583
+ for (const llama_load_tensor_shard & shard : lt.shards) {
584
+ if (shard.file_off & 3) {
585
+ return true;
586
+ }
587
+ }
588
+ }
589
+ return false;
590
+ }
591
+
592
+ uint32_t guess_n_parts() const {
593
+ auto it = tensors_map.name_to_idx.find("tok_embeddings.weight");
594
+ if (it == tensors_map.name_to_idx.end()) {
595
+ throw std::string("missing tok_embeddings.weight");
596
+ }
597
+ const llama_load_tensor & lt = tensors_map.tensors.at(it->second);
598
+ return file_loaders.at(0)->hparams.n_embd / lt.shards.at(0).ne.at(0);
599
+ }
600
+
601
+ void calc_sizes(size_t * ctx_size_p, size_t * mmapped_size_p) const {
602
+ *ctx_size_p = *mmapped_size_p = 0;
603
+ for (const llama_load_tensor & lt : tensors_map.tensors) {
604
+ *ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE;
605
+ *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size;
606
+ }
607
+ }
608
+
609
+ struct ggml_tensor * get_tensor(const std::string & name, std::vector<uint32_t> ne) {
610
+ auto it = tensors_map.name_to_idx.find(name);
611
+ if (it == tensors_map.name_to_idx.end()) {
612
+ throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
613
+ }
614
+ llama_load_tensor & lt = tensors_map.tensors.at(it->second);
615
+ if (lt.ne != ne) {
616
+ throw format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
617
+ name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
618
+ }
619
+ return get_tensor_for(lt);
620
+ }
621
+
622
+ struct ggml_tensor * get_tensor_for(llama_load_tensor & lt) {
623
+ struct ggml_tensor * tensor;
624
+ if (lt.ne.size() == 2) {
625
+ tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
626
+ } else {
627
+ LLAMA_ASSERT(lt.ne.size() == 1);
628
+ tensor = ggml_new_tensor_1d(ggml_ctx, lt.type, lt.ne.at(0));
629
+ }
630
+ LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
631
+ lt.ggml_tensor = tensor;
632
+ num_ggml_tensors_created++;
633
+ return tensor;
634
+ }
635
+
636
+ void done_getting_tensors() {
637
+ if (num_ggml_tensors_created != tensors_map.tensors.size()) {
638
+ throw std::string("llama.cpp: file contained more tensors than expected");
639
+ }
640
+ }
641
+
642
+ void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
643
+ size_t data_size = 0;
644
+ for (const llama_load_tensor & lt : tensors_map.tensors) {
645
+ data_size += lt.size;
646
+ }
647
+
648
+ if (use_mmap) {
649
+ mapping.reset(new llama_mmap(&file_loaders.at(0)->file));
650
+ if (!lmlock) {
651
+ // Don't call the callback since the actual loading will be lazy
652
+ // and we can't measure it.
653
+ progress_callback = NULL;
654
+ }
655
+ if (lmlock) {
656
+ lmlock->init(mapping->addr);
657
+ }
658
+ }
659
+
660
+ size_t done_size = 0;
661
+ for (llama_load_tensor & lt : tensors_map.tensors) {
662
+ if (progress_callback) {
663
+ progress_callback((float) done_size / data_size, progress_callback_user_data);
664
+ }
665
+ LLAMA_ASSERT(lt.ggml_tensor); // unused tensors should have been caught by load_data already
666
+ lt.data = (uint8_t *) lt.ggml_tensor->data;
667
+ load_data_for(lt);
668
+ lt.ggml_tensor->data = lt.data;
669
+ done_size += lt.size;
670
+ if (use_mmap && lmlock) {
671
+ lmlock->grow_to(done_size);
672
+ }
673
+ }
674
+ if (progress_callback) {
675
+ progress_callback(1.0f, progress_callback_user_data);
676
+ }
677
+ }
678
+
679
+ void load_data_for(llama_load_tensor & lt) {
680
+ if (use_mmap) {
681
+ LLAMA_ASSERT(lt.shards.size() == 1);
682
+ lt.data = (uint8_t *) mapping->addr + lt.shards.at(0).file_off;
683
+ } else if (lt.split_type == SPLIT_NONE) {
684
+ llama_file & file = file_loaders.at(lt.shards.at(0).file_idx)->file;
685
+ file.seek(lt.shards.at(0).file_off, SEEK_SET);
686
+ file.read_raw(lt.data, lt.size);
687
+ } else if (lt.split_type == SPLIT_BY_ROWS) {
688
+ size_t offset = 0;
689
+ for (llama_load_tensor_shard & shard : lt.shards) {
690
+ llama_file & file = file_loaders.at(shard.file_idx)->file;
691
+ file.seek(shard.file_off, SEEK_SET);
692
+ file.read_raw(lt.data + offset, shard.size);
693
+ offset += shard.size;
694
+ }
695
+ LLAMA_ASSERT(offset == lt.size);
696
+ } else if (lt.split_type == SPLIT_BY_COLUMNS) {
697
+ // Let's load the data into temporary buffers to ensure the OS performs large loads.
698
+ std::vector<llama_buffer> tmp_bufs;
699
+ tmp_bufs.resize(lt.shards.size());
700
+ for (size_t i = 0; i < lt.shards.size(); i++) {
701
+ llama_load_tensor_shard & shard = lt.shards.at(i);
702
+ llama_file & file = file_loaders.at(shard.file_idx)->file;
703
+ file.seek(shard.file_off, SEEK_SET);
704
+ tmp_bufs.at(i).resize(shard.size);
705
+ file.read_raw(tmp_bufs.at(i).addr, shard.size);
706
+ }
707
+ // Then reshape.
708
+ size_t num_rows = lt.ne.at(1);
709
+ size_t per_shard_row_size = lt.shards.at(0).size / num_rows;
710
+ size_t out_offset = 0;
711
+ for (size_t row = 0; row < num_rows; row++) {
712
+ for (llama_buffer & tmp_buf : tmp_bufs) {
713
+ memcpy(lt.data + out_offset,
714
+ tmp_buf.addr + row * per_shard_row_size,
715
+ per_shard_row_size);
716
+ out_offset += per_shard_row_size;
717
+ }
718
+ }
719
+ LLAMA_ASSERT(out_offset == lt.size);
720
+ }
721
+ if (0) {
722
+ print_checksum(lt);
723
+ }
724
+ }
725
+
726
+ static void print_checksum(llama_load_tensor & lt) {
727
+ uint32_t sum = 0;
728
+ for (size_t i = 0; i < lt.size; i++) {
729
+ uint8_t byte = lt.data[i];
730
+ sum = byte + (sum << 6) + (sum << 16) - sum; // sdbm hash
731
+ }
732
+ fprintf(stderr, "%s checksum: %#08x (%s, size %zu)\n", lt.name.c_str(), sum,
733
+ llama_format_tensor_shape(lt.ne).c_str(), lt.size);
734
+ }
735
+
736
+ };
737
+
738
+
247
739
  //
248
740
  // kv cache
249
741
  //
@@ -262,8 +754,8 @@ static bool kv_cache_init(
262
754
  cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
263
755
 
264
756
  struct ggml_init_params params;
265
- params.mem_size = cache.buf.size();
266
- params.mem_buffer = cache.buf.data();
757
+ params.mem_size = cache.buf.size;
758
+ params.mem_buffer = cache.buf.addr;
267
759
  params.no_alloc = false;
268
760
 
269
761
  cache.ctx = ggml_init(params);
@@ -279,13 +771,6 @@ static bool kv_cache_init(
279
771
  return true;
280
772
  }
281
773
 
282
- static void kv_cache_free(struct llama_kv_cache & cache) {
283
- if (cache.ctx) {
284
- ggml_free(cache.ctx);
285
- cache.ctx = nullptr;
286
- }
287
- }
288
-
289
774
  struct llama_context_params llama_context_default_params() {
290
775
  struct llama_context_params result = {
291
776
  /*.n_ctx =*/ 512,
@@ -294,6 +779,7 @@ struct llama_context_params llama_context_default_params() {
294
779
  /*.f16_kv =*/ false,
295
780
  /*.logits_all =*/ false,
296
781
  /*.vocab_only =*/ false,
782
+ /*.use_mmap =*/ true,
297
783
  /*.use_mlock =*/ false,
298
784
  /*.embedding =*/ false,
299
785
  /*.progress_callback =*/ nullptr,
@@ -303,243 +789,106 @@ struct llama_context_params llama_context_default_params() {
303
789
  return result;
304
790
  }
305
791
 
792
+ bool llama_mmap_supported() {
793
+ return llama_mmap::SUPPORTED;
794
+ }
795
+
796
+ bool llama_mlock_supported() {
797
+ return llama_mlock::SUPPORTED;
798
+ }
799
+
306
800
  //
307
801
  // model loading
308
802
  //
309
803
 
310
- static void *mmap_file(const char *fname, uint64_t *mm_length) {
311
- #if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
312
- HANDLE hFile = CreateFileA(fname,
313
- GENERIC_READ,
314
- FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
315
- NULL,
316
- OPEN_EXISTING,
317
- FILE_ATTRIBUTE_NORMAL | FILE_ATTRIBUTE_NOT_CONTENT_INDEXED,
318
- NULL);
319
- if (hFile == INVALID_HANDLE_VALUE) return 0;
320
- LARGE_INTEGER fileSize;
321
- fileSize.QuadPart = -1;
322
- GetFileSizeEx(hFile, &fileSize);
323
- int64_t length = fileSize.QuadPart;
324
- HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
325
- CloseHandle(hFile);
326
- if (!hMapping) return 0;
327
- void *addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
328
- CloseHandle(hMapping);
329
- if (!addr) return 0;
330
- #else
331
- int fd = open(fname, O_RDONLY);
332
- if (fd == -1) return 0;
333
- int64_t length = lseek(fd, 0, SEEK_END);
334
- void *addr = mmap(NULL, length, PROT_READ, MAP_SHARED, fd, 0);
335
- close(fd);
336
- if (addr == MAP_FAILED) return 0;
337
- #endif
338
- *mm_length = length;
339
- return addr;
804
+ static const char *llama_file_version_name(llama_file_version version) {
805
+ switch (version) {
806
+ case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)";
807
+ case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
808
+ case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (latest)";
809
+ default: LLAMA_ASSERT(false);
810
+ }
340
811
  }
341
812
 
342
- static void munmap_file(void * addr, size_t length) {
343
- #if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
344
- UnmapViewOfFile(addr);
345
- #else
346
- munmap(addr, length);
347
- #endif
813
+ static const char *llama_ftype_name(enum llama_ftype ftype) {
814
+ switch (ftype) {
815
+ case LLAMA_FTYPE_ALL_F32: return "all F32";
816
+ case LLAMA_FTYPE_MOSTLY_F16: return "mostly F16";
817
+ case LLAMA_FTYPE_MOSTLY_Q4_0: return "mostly Q4_0";
818
+ case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
819
+ case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
820
+ return "mostly Q4_1, some F16";
821
+ default: return "unknown, may not work";
822
+ }
348
823
  }
349
824
 
350
- static bool report_bad_magic(const char *path, uint32_t got, uint32_t want) {
351
- fprintf(stderr,
352
- "%s: invalid model file (bad magic [got %#x want %#x])\n"
353
- "\tyou most likely need to regenerate your ggml files\n"
354
- "\tthe benefit is you'll get 10-100x faster load times\n"
355
- "\tsee https://github.com/ggerganov/llama.cpp/issues/91\n"
356
- "\tuse convert-pth-to-ggml.py to regenerate from original pth\n"
357
- "\tuse migrate-ggml-2023-03-30-pr613.py if you deleted originals\n",
358
- path, got, want);
359
- return false;
825
+ static const char *llama_model_type_name(e_model type) {
826
+ switch (type) {
827
+ case MODEL_7B: return "7B";
828
+ case MODEL_13B: return "13B";
829
+ case MODEL_30B: return "30B";
830
+ case MODEL_65B: return "65B";
831
+ default: LLAMA_ASSERT(false);
832
+ }
360
833
  }
361
834
 
362
- static bool llama_model_load(
835
+ static void llama_model_load_internal(
363
836
  const std::string & fname,
364
837
  llama_context & lctx,
365
838
  int n_ctx,
366
- int n_parts,
367
839
  ggml_type memory_type,
840
+ bool use_mmap,
841
+ bool use_mlock,
368
842
  bool vocab_only,
369
843
  llama_progress_callback progress_callback,
370
- void *progress_callback_user_data) {
371
- fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
844
+ void * progress_callback_user_data) {
372
845
 
373
846
  lctx.t_start_us = ggml_time_us();
374
847
 
375
- auto & model = lctx.model;
376
- auto & vocab = lctx.vocab;
377
-
378
- auto fin = std::ifstream(fname, std::ios::binary);
379
- if (!fin) {
380
- fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
381
- return false;
382
- }
383
-
384
- std::vector<char> f_buf(1024*1024);
385
- fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
848
+ std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap, vocab_only));
386
849
 
387
- fin.seekg(0, fin.end);
388
- const size_t file_size = fin.tellg();
389
- fin.seekg(0);
850
+ lctx.vocab = std::move(ml->file_loaders.at(0)->vocab);
851
+ auto & model = lctx.model;
852
+ model.hparams = ml->file_loaders.at(0)->hparams;
853
+ llama_file_version file_version = ml->file_loaders.at(0)->file_version;
854
+ auto & hparams = model.hparams;
855
+ uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
390
856
 
391
- // verify magic
392
857
  {
393
- uint32_t magic;
394
- fin.read((char *) &magic, sizeof(magic));
395
- if (magic == LLAMA_FILE_MAGIC_UNVERSIONED) {
396
- fprintf(stderr, "%s: invalid model file '%s' (too old, regenerate your model files or convert them with convert-unversioned-ggml-to-ggml.py!)\n",
397
- __func__, fname.c_str());
398
- return false;
858
+ switch (hparams.n_layer) {
859
+ case 32: model.type = e_model::MODEL_7B; break;
860
+ case 40: model.type = e_model::MODEL_13B; break;
861
+ case 60: model.type = e_model::MODEL_30B; break;
862
+ case 80: model.type = e_model::MODEL_65B; break;
399
863
  }
400
- if (magic != LLAMA_FILE_MAGIC) {
401
- return report_bad_magic(fname.c_str(), magic, LLAMA_FILE_MAGIC);
402
- }
403
-
404
- uint32_t format_version;
405
- fin.read((char *) &format_version, sizeof(format_version));
406
-
407
- if (format_version != LLAMA_FILE_VERSION) {
408
- fprintf(stderr, "%s: invalid model file '%s' (unsupported format version %" PRIu32 ", expected %d)\n",
409
- __func__, fname.c_str(), format_version, LLAMA_FILE_VERSION);
410
- return false;
411
- }
412
- }
413
-
414
- int n_ff = 0;
415
-
416
- // load hparams
417
- {
418
- auto & hparams = model.hparams;
419
-
420
- fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
421
- //fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
422
- fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
423
- fin.read((char *) &hparams.n_mult, sizeof(hparams.n_mult));
424
- fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
425
- fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
426
- fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
427
- fin.read((char *) &hparams.f16, sizeof(hparams.f16));
428
864
 
429
865
  hparams.n_ctx = n_ctx;
430
-
431
- n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
432
-
433
- if (n_parts < 1) {
434
- n_parts = LLAMA_N_PARTS.at(hparams.n_embd);
435
- }
436
-
437
- // temp warning to tell the user to use "--n_parts"
438
- if (hparams.f16 == 4 && n_parts != 1) {
439
- fprintf(stderr, "%s: GPTQ model detected - are you sure n_parts should be %d? we normally expect it to be 1\n", __func__, n_parts);
440
- fprintf(stderr, "%s: use '--n_parts 1' if necessary\n", __func__);
441
- }
442
-
443
- if (hparams.n_layer == 32) {
444
- model.type = e_model::MODEL_7B;
445
- }
446
-
447
- if (hparams.n_layer == 40) {
448
- model.type = e_model::MODEL_13B;
449
- }
450
-
451
- if (hparams.n_layer == 60) {
452
- model.type = e_model::MODEL_30B;
453
- }
454
-
455
- if (hparams.n_layer == 80) {
456
- model.type = e_model::MODEL_65B;
457
- }
458
-
459
- fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab);
460
- fprintf(stderr, "%s: n_ctx = %d\n", __func__, hparams.n_ctx);
461
- fprintf(stderr, "%s: n_embd = %d\n", __func__, hparams.n_embd);
462
- fprintf(stderr, "%s: n_mult = %d\n", __func__, hparams.n_mult);
463
- fprintf(stderr, "%s: n_head = %d\n", __func__, hparams.n_head);
464
- fprintf(stderr, "%s: n_layer = %d\n", __func__, hparams.n_layer);
465
- fprintf(stderr, "%s: n_rot = %d\n", __func__, hparams.n_rot);
466
- fprintf(stderr, "%s: f16 = %d\n", __func__, hparams.f16);
467
- fprintf(stderr, "%s: n_ff = %d\n", __func__, n_ff);
468
- fprintf(stderr, "%s: n_parts = %d\n", __func__, n_parts);
469
- fprintf(stderr, "%s: type = %d\n", __func__, model.type);
470
866
  }
471
867
 
472
- // load vocab
473
868
  {
474
- std::string word;
475
- vocab.id_to_token.resize(model.hparams.n_vocab);
476
- std::vector<char> tmp(64);
477
-
478
- for (int i = 0; i < model.hparams.n_vocab; i++) {
479
- uint32_t len;
480
- fin.read((char *) &len, sizeof(len));
481
-
482
- word.resize(len);
483
- if (len > 0) {
484
- tmp.resize(len);
485
- fin.read(tmp.data(), len);
486
- word.assign(tmp.data(), len);
487
- } else {
488
- word.clear();
489
- }
490
-
491
- float score;
492
- fin.read((char *) &score, sizeof(score));
493
-
494
- vocab.token_to_id[word] = i;
495
-
496
- auto &tok_score = vocab.id_to_token[i];
497
- tok_score.tok = word;
498
- tok_score.score = score;
499
- }
869
+ fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version));
870
+ fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab);
871
+ fprintf(stderr, "%s: n_ctx = %u\n", __func__, hparams.n_ctx);
872
+ fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd);
873
+ fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult);
874
+ fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head);
875
+ fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer);
876
+ fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot);
877
+ fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
878
+ fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
879
+ fprintf(stderr, "%s: n_parts = %zu\n", __func__, ml->file_loaders.size());
880
+ fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
500
881
  }
501
882
 
502
883
  if (vocab_only) {
503
- return true;
504
- }
505
-
506
- // for the big tensors, we have the option to store the data in 16-bit floats or quantized
507
- // in order to save memory and also to speed up the computation
508
- // wtype is for per-layer weights, while vtype is for other weights
509
- ggml_type wtype, vtype;
510
- switch (model.hparams.f16) {
511
- case 0: wtype = vtype = GGML_TYPE_F32; break;
512
- case 1: wtype = vtype = GGML_TYPE_F16; break;
513
- case 2: wtype = vtype = GGML_TYPE_Q4_0; break;
514
- case 3: wtype = vtype = GGML_TYPE_Q4_1; break;
515
- case 4: wtype = GGML_TYPE_Q4_1; vtype = GGML_TYPE_F16; break;
516
- default:
517
- {
518
- fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n",
519
- __func__, fname.c_str(), model.hparams.f16);
520
- return false;
521
- }
522
- }
523
-
524
- // map model into memory
525
- char *mm_addr = NULL;
526
- model.mm_addr = mmap_file(fname.c_str(), &model.mm_length);
527
- if (model.mm_addr == NULL) {
528
- fprintf(stderr, "%s: failed to mmap '%s'\n", __func__, fname.c_str());
529
- return false;
884
+ return;
530
885
  }
531
- mm_addr = (char *)model.mm_addr;
532
- fprintf(stderr, "%s: ggml map size = %6.2f MB\n", __func__, model.mm_length/(1024.0*1024.0));
533
886
 
534
887
  auto & ctx = model.ctx;
535
888
 
536
- size_t ctx_size = 0;
537
- {
538
- const auto &hparams = model.hparams;
539
- const int n_layer = hparams.n_layer;
540
- ctx_size += (5 + 10*n_layer)*256; // object overhead
541
- fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0);
542
- }
889
+ size_t ctx_size, mmapped_size;
890
+ ml->calc_sizes(&ctx_size, &mmapped_size);
891
+ fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0);
543
892
 
544
893
  // print memory requirements
545
894
  {
@@ -548,7 +897,7 @@ static bool llama_model_load(
548
897
  // this is the total memory required to run the inference
549
898
  const size_t mem_required =
550
899
  ctx_size +
551
- model.mm_length +
900
+ mmapped_size +
552
901
  MEM_REQ_SCRATCH0.at(model.type) +
553
902
  MEM_REQ_SCRATCH1.at(model.type) +
554
903
  MEM_REQ_EVAL.at (model.type);
@@ -564,17 +913,20 @@ static bool llama_model_load(
564
913
  // create the ggml context
565
914
  {
566
915
  lctx.model.buf.resize(ctx_size);
916
+ if (use_mlock) {
917
+ lctx.model.mlock_buf.init(lctx.model.buf.addr);
918
+ lctx.model.mlock_buf.grow_to(lctx.model.buf.size);
919
+ }
567
920
 
568
921
  struct ggml_init_params params = {
569
- /*.mem_size =*/ lctx.model.buf.size(),
570
- /*.mem_buffer =*/ lctx.model.buf.data(),
571
- /*.no_alloc =*/ true,
922
+ /*.mem_size =*/ lctx.model.buf.size,
923
+ /*.mem_buffer =*/ lctx.model.buf.addr,
924
+ /*.no_alloc =*/ ml->use_mmap,
572
925
  };
573
926
 
574
927
  model.ctx = ggml_init(params);
575
928
  if (!model.ctx) {
576
- fprintf(stderr, "%s: ggml_init() failed\n", __func__);
577
- return false;
929
+ throw format("ggml_init() failed");
578
930
  }
579
931
  }
580
932
 
@@ -582,161 +934,71 @@ static bool llama_model_load(
582
934
  {
583
935
  const auto & hparams = model.hparams;
584
936
 
585
- const int n_embd = hparams.n_embd;
586
- const int n_layer = hparams.n_layer;
587
- const int n_vocab = hparams.n_vocab;
588
-
589
- model.layers.resize(n_layer);
590
-
591
- model.tok_embeddings = ggml_new_tensor_2d(ctx, vtype, n_embd, n_vocab);
592
-
593
- model.norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
594
- model.output = ggml_new_tensor_2d(ctx, vtype, n_embd, n_vocab);
937
+ const uint32_t n_embd = hparams.n_embd;
938
+ const uint32_t n_layer = hparams.n_layer;
939
+ const uint32_t n_vocab = hparams.n_vocab;
595
940
 
596
- // map by name
597
- model.tensors["tok_embeddings.weight"] = model.tok_embeddings;
941
+ ml->ggml_ctx = ctx;
598
942
 
599
- model.tensors["norm.weight"] = model.norm;
600
- model.tensors["output.weight"] = model.output;
943
+ model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab});
944
+ model.norm = ml->get_tensor("norm.weight", {n_embd});
945
+ model.output = ml->get_tensor("output.weight", {n_embd, n_vocab});
601
946
 
602
- for (int i = 0; i < n_layer; ++i) {
947
+ model.layers.resize(n_layer);
948
+ for (uint32_t i = 0; i < n_layer; ++i) {
603
949
  auto & layer = model.layers[i];
604
950
 
605
- layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
606
-
607
- layer.wq = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
608
- layer.wk = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
609
- layer.wv = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
610
- layer.wo = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
611
-
612
- layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
613
-
614
- layer.w1 = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ff);
615
- layer.w2 = ggml_new_tensor_2d(ctx, wtype, n_ff, n_embd);
616
- layer.w3 = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ff);
951
+ std::string layers_i = "layers." + std::to_string(i);
617
952
 
618
- // map by name
619
- model.tensors["layers." + std::to_string(i) + ".attention_norm.weight"] = layer.attention_norm;
953
+ layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd});
620
954
 
621
- model.tensors["layers." + std::to_string(i) + ".attention.wq.weight"] = layer.wq;
622
- model.tensors["layers." + std::to_string(i) + ".attention.wk.weight"] = layer.wk;
623
- model.tensors["layers." + std::to_string(i) + ".attention.wv.weight"] = layer.wv;
624
- model.tensors["layers." + std::to_string(i) + ".attention.wo.weight"] = layer.wo;
955
+ layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd});
956
+ layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd});
957
+ layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd});
958
+ layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd});
625
959
 
626
- model.tensors["layers." + std::to_string(i) + ".ffn_norm.weight"] = layer.ffn_norm;
960
+ layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd});
627
961
 
628
- model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.weight"] = layer.w1;
629
- model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight"] = layer.w2;
630
- model.tensors["layers." + std::to_string(i) + ".feed_forward.w3.weight"] = layer.w3;
962
+ layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff});
963
+ layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd});
964
+ layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff});
631
965
  }
632
966
  }
633
967
 
634
- std::vector<uint8_t> tmp;
968
+ ml->done_getting_tensors();
635
969
 
636
- if (progress_callback) {
637
- progress_callback(0.0, progress_callback_user_data);
970
+ // populate `tensors_by_name`
971
+ for (llama_load_tensor & lt : ml->tensors_map.tensors) {
972
+ model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
638
973
  }
639
974
 
640
- fprintf(stderr, "%s: loading tensors from '%s'\n", __func__, fname.c_str());
975
+ ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
641
976
 
642
- // load weights
643
- {
644
- size_t total_size = 0;
645
- model.n_loaded = 0;
646
-
647
- while (true) {
648
- int32_t n_dims;
649
- int32_t length;
650
- int32_t ftype;
651
-
652
- fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
653
- fin.read(reinterpret_cast<char *>(&length), sizeof(length));
654
- fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
655
-
656
- if (fin.eof()) {
657
- break;
658
- }
659
-
660
- int32_t nelements = 1;
661
- int32_t ne[2] = { 1, 1 };
662
- for (int i = 0; i < n_dims; ++i) {
663
- fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
664
- nelements *= ne[i];
665
- }
666
-
667
- std::string name(length, 0);
668
- fin.read(&name[0], length);
669
-
670
- if (model.tensors.find(name.data()) == model.tensors.end()) {
671
- fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
672
- return false;
673
- }
674
-
675
- auto tensor = model.tensors[name.data()];
676
-
677
- if (ggml_nelements(tensor) != nelements) {
678
- fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
679
- return false;
680
- }
681
- if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
682
- fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%" PRId64 ", %" PRId64 "], expected [%d, %d]\n",
683
- __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
684
- return false;
685
- }
686
- if (0) {
687
- static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
688
- fprintf(stderr, "%24s - [%5d, %5d], type = %6s\n", name.data(), ne[0], ne[1], ftype_str[ftype]);
689
- }
690
-
691
- switch (ftype) {
692
- case 0: // f32
693
- case 1: // f16
694
- break;
695
- case 2: // q4_0
696
- case 3: // q4_1
697
- assert(ne[0] % 64 == 0);
698
- break;
699
- default:
700
- fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
701
- return false;
702
- };
703
-
704
- // load the tensor data into memory without copying or reading it
705
- size_t offset = fin.tellg();
706
- size_t tensor_data_size = ggml_nbytes(tensor);
707
- offset = (offset + 31) & -32;
708
- tensor->data = mm_addr + offset;
709
- fin.seekg(offset + tensor_data_size);
710
- total_size += tensor_data_size;
711
- model.n_loaded++;
712
-
713
- // progress
714
- if (progress_callback) {
715
- double current_progress = size_t(fin.tellg()) / double(file_size);
716
- progress_callback(current_progress, progress_callback_user_data);
717
- }
718
- }
719
-
720
- fin.close();
721
-
722
- fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, model.n_loaded);
723
- if (model.n_loaded == 0) {
724
- fprintf(stderr, "%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__);
725
- } else if (model.n_loaded != (int) model.tensors.size()) {
726
- fprintf(stderr, "%s: ERROR not all tensors loaded from model file - expected %zu, got %d\n", __func__, model.tensors.size(), model.n_loaded);
727
- return false;
728
- }
729
- }
977
+ model.mapping = std::move(ml->mapping);
730
978
 
731
979
  // loading time will be recalculate after the first eval, so
732
980
  // we take page faults deferred by mmap() into consideration
733
981
  lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
982
+ }
734
983
 
735
- if (progress_callback) {
736
- progress_callback(1.0, progress_callback_user_data);
984
+ static bool llama_model_load(
985
+ const std::string & fname,
986
+ llama_context & lctx,
987
+ int n_ctx,
988
+ ggml_type memory_type,
989
+ bool use_mmap,
990
+ bool use_mlock,
991
+ bool vocab_only,
992
+ llama_progress_callback progress_callback,
993
+ void *progress_callback_user_data) {
994
+ try {
995
+ llama_model_load_internal(fname, lctx, n_ctx, memory_type, use_mmap, use_mlock,
996
+ vocab_only, progress_callback, progress_callback_user_data);
997
+ return true;
998
+ } catch (const std::string & err) {
999
+ fprintf(stderr, "error loading model: %s\n", err.c_str());
1000
+ return false;
737
1001
  }
738
-
739
- return true;
740
1002
  }
741
1003
 
742
1004
  // evaluate the transformer
@@ -774,8 +1036,8 @@ static bool llama_eval_internal(
774
1036
  auto & buf_compute = lctx.buf_compute;
775
1037
 
776
1038
  struct ggml_init_params params = {
777
- /*.mem_size =*/ buf_compute.size(),
778
- /*.mem_buffer =*/ buf_compute.data(),
1039
+ /*.mem_size =*/ buf_compute.size,
1040
+ /*.mem_buffer =*/ buf_compute.addr,
779
1041
  /*.no_alloc =*/ false,
780
1042
  };
781
1043
 
@@ -1061,7 +1323,7 @@ struct llama_tokenizer {
1061
1323
  size_t offs = 0;
1062
1324
  while (offs < text.size()) {
1063
1325
  llama_sp_symbol sym;
1064
- size_t char_len = Min(text.size() - offs, utf8_len(text[offs]));
1326
+ size_t char_len = std::min(text.size() - offs, utf8_len(text[offs]));
1065
1327
  sym.text = text.c_str() + offs;
1066
1328
  sym.n = char_len;
1067
1329
  offs += char_len;
@@ -1236,7 +1498,7 @@ static llama_vocab::id llama_sample_top_p_top_k(
1236
1498
  }
1237
1499
  }
1238
1500
 
1239
- sample_top_k(logits_id, top_k > 0 ? Min(top_k, n_logits) : n_logits);
1501
+ sample_top_k(logits_id, top_k > 0 ? std::min(top_k, n_logits) : n_logits);
1240
1502
 
1241
1503
  // compute probs for the top k tokens
1242
1504
  std::vector<float> probs;
@@ -1284,298 +1546,118 @@ static llama_vocab::id llama_sample_top_p_top_k(
1284
1546
  // quantization
1285
1547
  //
1286
1548
 
1287
- // TODO: reuse code from the llama_model_load() somehow
1288
- static bool llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, int itype) {
1289
- ggml_type type = GGML_TYPE_Q4_1;
1290
-
1291
- switch (itype) {
1292
- case 2: type = GGML_TYPE_Q4_0; break;
1293
- case 3: type = GGML_TYPE_Q4_1; break;
1294
- default: fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype); return 1;
1549
+ static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype) {
1550
+ ggml_type quantized_type;
1551
+ switch (ftype) {
1552
+ case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
1553
+ case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
1554
+ default: throw format("invalid output file type %d\n", ftype);
1295
1555
  };
1296
1556
 
1297
- if (type != GGML_TYPE_Q4_0 && type != GGML_TYPE_Q4_1) {
1298
- fprintf(stderr, "%s: invalid quantization type %d\n", __func__, type);
1299
- return false;
1300
- }
1301
-
1302
- llama_vocab vocab;
1303
-
1304
- printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
1305
-
1306
- auto finp = std::ifstream(fname_inp, std::ios::binary);
1307
- if (!finp) {
1308
- fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__, fname_inp.c_str());
1309
- return false;
1310
- }
1311
-
1312
- auto fout = std::ofstream(fname_out, std::ios::binary);
1313
- if (!fout) {
1314
- fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname_out.c_str());
1315
- return false;
1316
- }
1317
-
1318
- // verify magic
1319
- {
1320
- uint32_t magic;
1321
- finp.read((char *) &magic, sizeof(magic));
1322
- if (magic == LLAMA_FILE_MAGIC_UNVERSIONED) {
1323
- fprintf(stderr, "%s: invalid model file '%s' (too old, regenerate your model files!)\n",
1324
- __func__, fname_inp.c_str());
1325
- return false;
1326
- }
1327
- if (magic != LLAMA_FILE_MAGIC) {
1328
- return report_bad_magic(fname_inp.c_str(), magic, LLAMA_FILE_MAGIC);
1329
- }
1330
-
1331
- fout.write((char *) &magic, sizeof(magic));
1332
-
1333
- uint32_t format_version;
1334
- finp.read((char *) &format_version, sizeof(format_version));
1335
-
1336
- if (format_version != LLAMA_FILE_VERSION) {
1337
- fprintf(stderr, "%s: invalid model file '%s' (unsupported format version %" PRIu32 ", expected %d)\n",
1338
- __func__, fname_inp.c_str(), format_version, LLAMA_FILE_VERSION);
1339
- return false;
1340
- }
1341
-
1342
- fout.write((char *) &format_version, sizeof(format_version));
1343
- }
1344
-
1345
- llama_hparams hparams;
1346
-
1347
- // load hparams
1348
- {
1349
- finp.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
1350
- //finp.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
1351
- finp.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
1352
- finp.read((char *) &hparams.n_mult, sizeof(hparams.n_mult));
1353
- finp.read((char *) &hparams.n_head, sizeof(hparams.n_head));
1354
- finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
1355
- finp.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
1356
- finp.read((char *) &hparams.f16, sizeof(hparams.f16));
1357
-
1358
- printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
1359
- printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
1360
- printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
1361
- printf("%s: n_mult = %d\n", __func__, hparams.n_mult);
1362
- printf("%s: n_head = %d\n", __func__, hparams.n_head);
1363
- printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
1364
- printf("%s: f16 = %d\n", __func__, hparams.f16);
1365
-
1366
- fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
1367
- //fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
1368
- fout.write((char *) &hparams.n_embd, sizeof(hparams.n_embd));
1369
- fout.write((char *) &hparams.n_mult, sizeof(hparams.n_mult));
1370
- fout.write((char *) &hparams.n_head, sizeof(hparams.n_head));
1371
- fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
1372
- fout.write((char *) &hparams.n_rot, sizeof(hparams.n_rot));
1373
- fout.write((char *) &itype, sizeof(hparams.f16));
1374
- }
1375
-
1376
- // load vocab
1377
- {
1378
- const int32_t n_vocab = hparams.n_vocab;
1379
-
1380
- if (n_vocab != hparams.n_vocab) {
1381
- fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
1382
- __func__, fname_inp.c_str(), n_vocab, hparams.n_vocab);
1383
- return false;
1384
- }
1385
-
1386
- std::vector<char> word(32);
1387
- vocab.id_to_token.resize(n_vocab);
1388
- for (int i = 0; i < n_vocab; i++) {
1389
- uint32_t len;
1390
- finp.read ((char *) &len, sizeof(len));
1391
- fout.write((char *) &len, sizeof(len));
1392
-
1393
- word.resize(len);
1394
- finp.read ((char *) &word[0], len);
1395
- fout.write((char *) &word[0], len);
1396
-
1397
- float score;
1398
- finp.read ((char *) &score, sizeof(score));
1399
- fout.write((char *) &score, sizeof(score));
1400
-
1401
- vocab.token_to_id[word.data()] = i;
1402
-
1403
- auto &tok_score = vocab.id_to_token[i];
1404
- tok_score.tok = word.data();
1405
- tok_score.score = score;
1406
- }
1407
- }
1408
-
1409
- // load weights
1410
- {
1411
- size_t total_size_org = 0;
1412
- size_t total_size_new = 0;
1413
-
1414
- std::vector<float> work;
1415
-
1416
- std::vector<uint8_t> data_u8;
1417
- std::vector<ggml_fp16_t> data_f16;
1418
- std::vector<float> data_f32;
1419
-
1420
- std::vector<int64_t> hist_all(1 << 4, 0);
1421
-
1422
- while (true) {
1423
- int32_t n_dims;
1424
- int32_t length;
1425
- int32_t ftype;
1426
-
1427
- finp.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
1428
- finp.read(reinterpret_cast<char *>(&length), sizeof(length));
1429
- finp.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
1430
-
1431
- if (finp.eof()) {
1432
- break;
1433
- }
1434
-
1435
- int32_t nelements = 1;
1436
- int32_t ne[2] = { 1, 1 };
1437
- for (int i = 0; i < n_dims; ++i) {
1438
- finp.read (reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
1439
- nelements *= ne[i];
1440
- }
1441
-
1442
- std::string name(length, 0);
1443
- finp.read (&name[0], length);
1444
-
1445
- {
1446
- // ensure tensor data is aligned
1447
- uint64_t offset = finp.tellg();
1448
- offset = (offset + 31) & -32;
1449
- finp.seekg(offset);
1450
- }
1451
-
1452
- {
1453
- static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
1454
- printf("%48s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ftype_str[ftype]);
1455
- }
1456
-
1457
- // regexes of tensor names to be quantized
1458
- const std::vector<std::string> k_names = {
1459
- ".*weight",
1460
- };
1461
-
1462
- bool quantize = false;
1463
- for (const auto & s : k_names) {
1464
- if (std::regex_match(name, std::regex(s))) {
1465
- quantize = true;
1466
- break;
1467
- }
1468
- }
1469
-
1470
- // quantize only 2D tensors
1471
- quantize &= (n_dims == 2);
1472
-
1473
- if (quantize) {
1474
- if (ftype != 0 && ftype != 1) {
1475
- fprintf(stderr, "%s: unsupported ftype %d for integer quantization\n", __func__, ftype);
1476
- return false;
1477
- }
1478
-
1479
- if (ftype == 1) {
1480
- data_f16.resize(nelements);
1481
- finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
1482
- data_f32.resize(nelements);
1483
- for (int i = 0; i < nelements; ++i) {
1484
- data_f32[i] = ggml_fp16_to_fp32(data_f16[i]);
1485
- }
1486
- } else {
1487
- data_f32.resize(nelements);
1488
- finp.read(reinterpret_cast<char *>(data_f32.data()), nelements * sizeof(float));
1557
+ std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false,
1558
+ /*vocab_only*/ false));
1559
+ llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
1560
+
1561
+ size_t total_size_org = 0;
1562
+ size_t total_size_new = 0;
1563
+ std::vector<int64_t> hist_all(1 << 4, 0);
1564
+
1565
+ size_t idx = 0;
1566
+ for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) {
1567
+ llama_buffer read_data;
1568
+ read_data.resize(tensor.size);
1569
+ tensor.data = read_data.addr;
1570
+ model_loader->load_data_for(tensor);
1571
+
1572
+ printf("[%zu/%zu] %36s - %s, type = %6s, ",
1573
+ ++idx, model_loader->tensors_map.tensors.size(),
1574
+ tensor.name.c_str(), llama_format_tensor_shape(tensor.ne).c_str(),
1575
+ ggml_type_name(tensor.type));
1576
+
1577
+ // This used to be a regex, but <regex> has an extreme cost to compile times.
1578
+ bool quantize = tensor.name.rfind("weight") == tensor.name.size() - 6; // ends with 'weight'?
1579
+
1580
+ // quantize only 2D tensors
1581
+ quantize &= (tensor.ne.size() == 2);
1582
+
1583
+ enum ggml_type new_type;
1584
+ void * new_data;
1585
+ size_t new_size;
1586
+ llama_buffer work;
1587
+
1588
+ if (!quantize) {
1589
+ new_type = tensor.type;
1590
+ new_data = tensor.data;
1591
+ new_size = tensor.size;
1592
+ printf("size = %8.3f MB\n", tensor.size/1024.0/1024.0);
1593
+ } else {
1594
+ new_type = quantized_type;
1595
+ float * f32_data;
1596
+ size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
1597
+ llama_buffer f32_conv_buf;
1598
+ if (tensor.type == GGML_TYPE_F32) {
1599
+ f32_data = (float *) tensor.data;
1600
+ } else if (tensor.type == GGML_TYPE_F16) {
1601
+ f32_conv_buf.resize(nelements * sizeof(float));
1602
+ f32_data = (float *) f32_conv_buf.addr;
1603
+ auto f16_data = (const ggml_fp16_t *) tensor.data;
1604
+ for (size_t i = 0; i < nelements; i++) {
1605
+ f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
1489
1606
  }
1490
-
1491
- ftype = itype;
1492
1607
  } else {
1493
- const int bpe = (ftype == 0) ? sizeof(float) : sizeof(uint16_t);
1494
-
1495
- data_u8.resize(nelements*bpe);
1496
- finp.read(reinterpret_cast<char *>(data_u8.data()), nelements * bpe);
1608
+ throw format("type %s unsupported for integer quantization", ggml_type_name(tensor.type));
1497
1609
  }
1498
1610
 
1499
- fout.write(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
1500
- fout.write(reinterpret_cast<char *>(&length), sizeof(length));
1501
- fout.write(reinterpret_cast<char *>(&ftype), sizeof(ftype));
1502
- for (int i = 0; i < n_dims; ++i) {
1503
- fout.write(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
1611
+ printf("quantizing .. ");
1612
+ fflush(stdout);
1613
+
1614
+ work.resize(nelements * 4); // upper bound on size
1615
+ new_data = work.addr;
1616
+ std::vector<int64_t> hist_cur(1 << 4, 0);
1617
+
1618
+ switch (new_type) {
1619
+ case GGML_TYPE_Q4_0:
1620
+ {
1621
+ new_size = ggml_quantize_q4_0(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
1622
+ } break;
1623
+ case GGML_TYPE_Q4_1:
1624
+ {
1625
+ new_size = ggml_quantize_q4_1(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
1626
+ } break;
1627
+ default:
1628
+ LLAMA_ASSERT(false);
1504
1629
  }
1505
- fout.write(&name[0], length);
1506
1630
 
1507
- {
1508
- // ensure tensor data is aligned
1509
- uint64_t offset = fout.tellp();
1510
- offset = (offset + 31) & -32;
1511
- fout.seekp(offset);
1631
+ printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
1632
+ for (size_t i = 0; i < hist_cur.size(); i++) {
1633
+ hist_all[i] += hist_cur[i];
1512
1634
  }
1513
1635
 
1514
- if (quantize) {
1515
- printf("quantizing .. ");
1516
- work.resize(nelements); // for quantization
1517
-
1518
- size_t cur_size = 0;
1519
- std::vector<int64_t> hist_cur(1 << 4, 0);
1520
-
1521
- switch (type) {
1522
- case GGML_TYPE_Q4_0:
1523
- {
1524
- cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
1525
- } break;
1526
- case GGML_TYPE_Q4_1:
1527
- {
1528
- cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
1529
- } break;
1530
- default:
1531
- {
1532
- fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, type);
1533
- return false;
1534
- }
1535
- }
1536
-
1537
- fout.write(reinterpret_cast<char *>(work.data()), cur_size);
1538
- total_size_new += cur_size;
1539
-
1540
- printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
1541
- for (int i = 0; i < (int) hist_cur.size(); ++i) {
1542
- hist_all[i] += hist_cur[i];
1543
- }
1544
-
1545
- for (int i = 0; i < (int) hist_cur.size(); ++i) {
1546
- printf("%5.3f ", hist_cur[i] / float(nelements));
1547
- }
1548
- printf("\n");
1549
- } else {
1550
- printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0);
1551
- fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size());
1552
- total_size_new += data_u8.size();
1636
+ for (size_t i = 0; i < hist_cur.size(); i++) {
1637
+ printf("%5.3f ", hist_cur[i] / float(nelements));
1553
1638
  }
1554
-
1555
- total_size_org += nelements * sizeof(float);
1639
+ printf("\n");
1556
1640
  }
1641
+ total_size_org += tensor.size;
1642
+ total_size_new += new_size;
1643
+ file_saver.write_tensor(tensor, new_type, new_data, new_size);
1644
+ }
1557
1645
 
1558
- printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
1559
- printf("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
1646
+ printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
1647
+ printf("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
1560
1648
 
1561
- {
1562
- int64_t sum_all = 0;
1563
- for (int i = 0; i < (int) hist_all.size(); ++i) {
1564
- sum_all += hist_all[i];
1565
- }
1649
+ {
1650
+ int64_t sum_all = 0;
1651
+ for (size_t i = 0; i < hist_all.size(); i++) {
1652
+ sum_all += hist_all[i];
1653
+ }
1566
1654
 
1567
- printf("%s: hist: ", __func__);
1568
- for (int i = 0; i < (int) hist_all.size(); ++i) {
1569
- printf("%5.3f ", hist_all[i] / float(sum_all));
1570
- }
1571
- printf("\n");
1655
+ printf("%s: hist: ", __func__);
1656
+ for (size_t i = 0; i < hist_all.size(); i++) {
1657
+ printf("%5.3f ", hist_all[i] / float(sum_all));
1572
1658
  }
1659
+ printf("\n");
1573
1660
  }
1574
-
1575
- finp.close();
1576
- fout.close();
1577
-
1578
- return true;
1579
1661
  }
1580
1662
 
1581
1663
  //
@@ -1593,32 +1675,36 @@ struct llama_context * llama_init_from_file(
1593
1675
  params.seed = time(NULL);
1594
1676
  }
1595
1677
 
1678
+ unsigned cur_percentage = 0;
1679
+ if (params.progress_callback == NULL) {
1680
+ params.progress_callback_user_data = &cur_percentage;
1681
+ params.progress_callback = [](float progress, void * ctx) {
1682
+ unsigned * cur_percentage_p = (unsigned *) ctx;
1683
+ unsigned percentage = (unsigned) (100 * progress);
1684
+ while (percentage > *cur_percentage_p) {
1685
+ ++*cur_percentage_p;
1686
+ fprintf(stderr, ".");
1687
+ fflush(stderr);
1688
+ if (percentage >= 100) {
1689
+ fprintf(stderr, "\n");
1690
+ }
1691
+ }
1692
+ };
1693
+ }
1694
+
1596
1695
  ctx->rng = std::mt19937(params.seed);
1597
1696
  ctx->logits_all = params.logits_all;
1598
1697
 
1599
1698
  ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
1600
1699
 
1601
- if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_parts, memory_type,
1602
- params.vocab_only, params.progress_callback,
1603
- params.progress_callback_user_data)) {
1700
+ if (!llama_model_load(path_model, *ctx, params.n_ctx, memory_type,
1701
+ params.use_mmap, params.use_mlock, params.vocab_only,
1702
+ params.progress_callback, params.progress_callback_user_data)) {
1604
1703
  fprintf(stderr, "%s: failed to load model\n", __func__);
1605
1704
  llama_free(ctx);
1606
1705
  return nullptr;
1607
1706
  }
1608
1707
 
1609
- if (params.use_mlock) {
1610
- char *err;
1611
- if (!ggml_mlock(ctx->model.ctx,
1612
- ctx->model.mm_addr,
1613
- ctx->model.mm_length,
1614
- &err)) {
1615
- fprintf(stderr, "%s\n", err);
1616
- free(err);
1617
- llama_free(ctx);
1618
- return nullptr;
1619
- }
1620
- }
1621
-
1622
1708
  // reserve memory for context buffers
1623
1709
  if (!params.vocab_only) {
1624
1710
  if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx)) {
@@ -1655,40 +1741,31 @@ struct llama_context * llama_init_from_file(
1655
1741
  }
1656
1742
 
1657
1743
  void llama_free(struct llama_context * ctx) {
1658
- kv_cache_free(ctx->model.kv_self);
1659
-
1660
- if (ctx->model.ctx) {
1661
- ggml_free(ctx->model.ctx);
1662
- }
1663
-
1664
- if (ctx->model.mm_addr) {
1665
- munmap_file(ctx->model.mm_addr, ctx->model.mm_length);
1666
- }
1667
-
1668
1744
  delete ctx;
1669
1745
  }
1670
1746
 
1671
1747
  int llama_model_quantize(
1672
1748
  const char * fname_inp,
1673
1749
  const char * fname_out,
1674
- int itype) {
1675
- if (!llama_model_quantize_internal(fname_inp, fname_out, itype)) {
1676
- fprintf(stderr, "%s: failed to quantize\n", __func__);
1750
+ enum llama_ftype ftype) {
1751
+ try {
1752
+ llama_model_quantize_internal(fname_inp, fname_out, ftype);
1753
+ return 0;
1754
+ } catch (const std::string & err) {
1755
+ fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str());
1677
1756
  return 1;
1678
1757
  }
1679
-
1680
- return 0;
1681
1758
  }
1682
1759
 
1683
1760
  // Returns the KV cache that will contain the context for the
1684
1761
  // ongoing prediction with the model.
1685
1762
  const uint8_t * llama_get_kv_cache(struct llama_context * ctx) {
1686
- return ctx->model.kv_self.buf.data();
1763
+ return ctx->model.kv_self.buf.addr;
1687
1764
  }
1688
1765
 
1689
1766
  // Returns the size of the KV cache
1690
1767
  size_t llama_get_kv_cache_size(struct llama_context * ctx) {
1691
- return ctx->model.kv_self.buf.size();
1768
+ return ctx->model.kv_self.buf.size;
1692
1769
  }
1693
1770
 
1694
1771
  int llama_get_kv_cache_token_count(struct llama_context * ctx) {
@@ -1702,8 +1779,8 @@ void llama_set_kv_cache(
1702
1779
  size_t n_size,
1703
1780
  int n_token_count) {
1704
1781
  // Make sure we have the same kv cache setup
1705
- LLAMA_ASSERT(ctx->model.kv_self.buf.size() == n_size);
1706
- memcpy(ctx->model.kv_self.buf.data(), kv_cache, n_size);
1782
+ LLAMA_ASSERT(ctx->model.kv_self.buf.size == n_size);
1783
+ memcpy(ctx->model.kv_self.buf.addr, kv_cache, n_size);
1707
1784
  ctx->model.kv_self.n = n_token_count;
1708
1785
  }
1709
1786
 
@@ -1814,9 +1891,9 @@ llama_token llama_sample_top_p_top_k(
1814
1891
  void llama_print_timings(struct llama_context * ctx) {
1815
1892
  const int64_t t_end_us = ggml_time_us();
1816
1893
 
1817
- const int32_t n_sample = Max(1, ctx->n_sample);
1818
- const int32_t n_eval = Max(1, ctx->n_eval);
1819
- const int32_t n_p_eval = Max(1, ctx->n_p_eval);
1894
+ const int32_t n_sample = std::max(1, ctx->n_sample);
1895
+ const int32_t n_eval = std::max(1, ctx->n_eval);
1896
+ const int32_t n_p_eval = std::max(1, ctx->n_p_eval);
1820
1897
 
1821
1898
  fprintf(stderr, "\n");
1822
1899
  fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
@@ -1852,3 +1929,8 @@ const char * llama_print_system_info(void) {
1852
1929
 
1853
1930
  return s.c_str();
1854
1931
  }
1932
+
1933
+ // For internal test use
1934
+ std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
1935
+ return ctx->model.tensors_by_name;
1936
+ }