llama_cpp 0.0.2 → 0.0.4

This diff compares the contents of two publicly released versions of the package as published to its registry. It is provided for informational purposes only and reflects the package files exactly as they appear in the public registry.
@@ -1,49 +1,30 @@
+ // Defines fileno on msys:
+ #ifndef _GNU_SOURCE
+ #define _GNU_SOURCE
+ #endif
+
+ #include "llama_util.h"
  #include "llama.h"

  #include "ggml.h"

+ #include <array>
  #include <cinttypes>
  #include <fstream>
  #include <random>
  #include <map>
  #include <unordered_map>
  #include <queue>
- #include <regex>
  #include <cassert>
  #include <cstring>
-
- #if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
- #define WIN32_LEAN_AND_MEAN
- #include <Windows.h>
- #else
- #include <sys/types.h>
- #include <sys/mman.h>
- #include <unistd.h>
- #include <fcntl.h>
- #endif
-
- #define Min(X, Y) ((Y) > (X) ? (X) : (Y))
- #define Max(X, Y) ((Y) < (X) ? (X) : (Y))
+ #include <climits>
+ #include <memory>
+ #include <algorithm>
+ #include <initializer_list>

  #define LLAMA_USE_SCRATCH
  #define LLAMA_MAX_SCRATCH_BUFFERS 16

- #define LLAMA_ASSERT(x) \
-     do { \
-         if (!(x)) { \
-             fprintf(stderr, "LLAMA_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
-             abort(); \
-         } \
-     } while (0)
-
-
- // determine number of model parts based on the dimension
- static const std::unordered_map<int, int> LLAMA_N_PARTS = {
-     { 4096, 1 },
-     { 5120, 2 },
-     { 6656, 4 },
-     { 8192, 8 },
- };

  // available llama models
  enum e_model {
@@ -93,14 +74,18 @@ static const std::map<e_model, size_t> MEM_REQ_EVAL = {

  // default hparams (LLaMA 7B)
  struct llama_hparams {
-     int32_t n_vocab = 32000;
-     int32_t n_ctx = 512; // this is provided as user input?
-     int32_t n_embd = 4096;
-     int32_t n_mult = 256;
-     int32_t n_head = 32;
-     int32_t n_layer = 32;
-     int32_t n_rot = 64;
-     int32_t f16 = 1;
+     uint32_t n_vocab = 32000;
+     uint32_t n_ctx = 512; // this is provided as user input?
+     uint32_t n_embd = 4096;
+     uint32_t n_mult = 256;
+     uint32_t n_head = 32;
+     uint32_t n_layer = 32;
+     uint32_t n_rot = 64;
+     enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
+
+     bool operator!=(const llama_hparams & other) const {
+         return memcmp(this, &other, sizeof(llama_hparams));
+     }
  };

  struct llama_layer {
@@ -126,11 +111,17 @@ struct llama_kv_cache {
      struct ggml_tensor * k;
      struct ggml_tensor * v;

-     struct ggml_context * ctx;
+     struct ggml_context * ctx = NULL;

-     std::vector<uint8_t> buf;
+     llama_buffer buf;

      int n; // number of tokens currently in the cache
+
+     ~llama_kv_cache() {
+         if (ctx) {
+             ggml_free(ctx);
+         }
+     }
  };

  struct llama_model {
@@ -146,22 +137,30 @@ struct llama_model {
      std::vector<llama_layer> layers;

      // context
-     struct ggml_context * ctx;
+     struct ggml_context * ctx = NULL;

      // key + value cache for the self attention
      // TODO: move to llama_state
      struct llama_kv_cache kv_self;

      // the model memory buffer
-     std::vector<uint8_t> buf;
+     llama_buffer buf;

      // model memory mapped file
-     void * mm_addr = NULL;
-     uint64_t mm_length = 0;
+     std::unique_ptr<llama_mmap> mapping;
+
+     // objects representing data potentially being locked in memory
+     llama_mlock mlock_buf;
+     llama_mlock mlock_mmap;
+
+     // for quantize-stats only
+     std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;

-     // tensors
-     int n_loaded;
-     std::unordered_map<std::string, struct ggml_tensor *> tensors;
+     ~llama_model() {
+         if (ctx) {
+             ggml_free(ctx);
+         }
+     }
  };

  struct llama_vocab {
@@ -206,8 +205,8 @@ struct llama_context {

      // memory buffers used to evaluate the model
      // TODO: move in llama_state
-     std::vector<uint8_t> buf_compute;
-     std::vector<uint8_t> buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
+     llama_buffer buf_compute;
+     llama_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];

      int buf_last = 0;
      size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
@@ -220,11 +219,11 @@ struct llama_context {
              last_size = ggml_set_scratch(ctx, { 0, 0, nullptr, });
          } else {
              auto & buf = buf_scratch[i];
-             last_size = ggml_set_scratch(ctx, { 0, buf.size(), buf.data(), });
+             last_size = ggml_set_scratch(ctx, { 0, buf.size, buf.addr, });
          }

          if (buf_last >= 0) {
-             buf_max_size[buf_last] = Max(buf_max_size[buf_last], last_size);
+             buf_max_size[buf_last] = std::max(buf_max_size[buf_last], last_size);
          }

          buf_last = i;
@@ -244,6 +243,499 @@ struct llama_context {
244
243
  }
245
244
  };
246
245
 
246
+ template <typename T>
247
+ static T checked_mul(T a, T b) {
248
+ T ret = a * b;
249
+ if (a != 0 && ret / a != b) {
250
+ throw format("overflow multiplying %llu * %llu",
251
+ (unsigned long long) a, (unsigned long long) b);
252
+ }
253
+ return ret;
254
+ }
255
+
256
+ static size_t checked_div(size_t a, size_t b) {
257
+ if (b == 0 || a % b != 0) {
258
+ throw format("error dividing %zu / %zu", a, b);
259
+ }
260
+ return a / b;
261
+ }
262
+
263
+ static std::string llama_format_tensor_shape(const std::vector<uint32_t> & ne) {
264
+ std::string ret = "[" + std::to_string(ne.at(0));
265
+ for (size_t i = 1; i < ne.size(); i++) {
266
+ ret += " x " + std::to_string(ne.at(i));
267
+ }
268
+ ret += "]";
269
+ return ret;
270
+ }
271
+
272
+ static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml_type type) {
273
+ size_t size = ggml_type_size(type);
274
+ for (uint32_t dim : ne) {
275
+ size = checked_mul<size_t>(size, dim);
276
+ }
277
+ return size / ggml_blck_size(type);
278
+ }
279
+
280
+ struct llama_load_tensor_shard {
281
+ std::vector<uint32_t> ne;
282
+ size_t size;
283
+ enum ggml_type type;
284
+ size_t file_idx;
285
+ size_t file_off;
286
+
287
+ void calc_size() {
288
+ size = llama_calc_tensor_size(ne, type);
289
+ }
290
+ };
291
+
292
+ enum llama_split_type {
293
+ SPLIT_NONE,
294
+ SPLIT_BY_COLUMNS,
295
+ SPLIT_BY_ROWS
296
+ };
297
+
298
+ struct llama_load_tensor {
299
+ std::vector<llama_load_tensor_shard> shards;
300
+
301
+ std::string name;
302
+ enum ggml_type type = GGML_TYPE_F32;
303
+ llama_split_type split_type = SPLIT_NONE;
304
+ std::vector<uint32_t> ne;
305
+ size_t size;
306
+ struct ggml_tensor * ggml_tensor = NULL;
307
+ uint8_t * data;
308
+
309
+ llama_load_tensor(const std::string & name) : name(name) {}
310
+
311
+ void calc_all() {
312
+ calc_type();
313
+ calc_split_type();
314
+ calc_ne();
315
+ calc_size();
316
+ }
317
+
318
+ void calc_type() {
319
+ const auto & first_shard = shards.at(0);
320
+ for (const auto & shard : shards) {
321
+ if (shard.type != first_shard.type) {
322
+ throw format("inconsistent tensor shard type in '%s'", name.c_str());
323
+ }
324
+ }
325
+ type = first_shard.type;
326
+ }
327
+
328
+ void calc_split_type() {
329
+ if (shards.at(0).ne.size() == 1 || // 1D tensors are just duplicated in every file
330
+ shards.size() == 1) { // only one file?
331
+ split_type = SPLIT_NONE;
332
+ } else if (name.find("tok_embeddings.") == 0 ||
333
+ name.find(".attention.wo.weight") != std::string::npos ||
334
+ name.find(".feed_forward.w2.weight") != std::string::npos) {
335
+ split_type = SPLIT_BY_COLUMNS;
336
+ } else {
337
+ split_type = SPLIT_BY_ROWS;
338
+ }
339
+ }
340
+
341
+ void calc_ne() {
342
+ const auto & first_shard = shards.at(0);
343
+ for (const auto & shard : shards) {
344
+ if (shard.ne != first_shard.ne) {
345
+ throw format("inconsistent tensor shard shape in '%s': first was %s, other was %s",
346
+ name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str());
347
+ }
348
+ }
349
+ ne = first_shard.ne;
350
+ LLAMA_ASSERT(shards.size() <= UINT32_MAX);
351
+ uint32_t n_shards = (uint32_t) shards.size();
352
+ switch (split_type) {
353
+ case SPLIT_NONE:
354
+ ne = first_shard.ne;
355
+ break;
356
+ case SPLIT_BY_COLUMNS:
357
+ ne = {checked_mul<uint32_t>(first_shard.ne[0], n_shards),
358
+ first_shard.ne[1]};
359
+ break;
360
+ case SPLIT_BY_ROWS:
361
+ ne = {first_shard.ne[0],
362
+ checked_mul<uint32_t>(first_shard.ne[1], n_shards)};
363
+ break;
364
+ }
365
+ }
366
+
367
+ void calc_size() {
368
+ size = llama_calc_tensor_size(ne, type);
369
+ }
370
+ };
371
+
372
+ struct llama_load_tensors_map {
373
+ // tensors is kept in a separate vector to preserve file order
374
+ std::vector<llama_load_tensor> tensors;
375
+ std::unordered_map<std::string, size_t> name_to_idx;
376
+ };
377
+
378
+ enum llama_file_version {
379
+ LLAMA_FILE_VERSION_GGML,
380
+ LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab
381
+ LLAMA_FILE_VERSION_GGJT_V1, // added padding
382
+ };
383
+
384
+ struct llama_file_loader {
385
+ llama_file file;
386
+ llama_file_version file_version;
387
+ llama_hparams hparams;
388
+ llama_vocab vocab;
389
+
390
+ llama_file_loader(const char * fname, size_t file_idx, llama_load_tensors_map & tensors_map)
391
+ : file(fname, "rb") {
392
+ fprintf(stderr, "llama.cpp: loading model from %s\n", fname);
393
+ read_magic();
394
+ read_hparams();
395
+ read_vocab();
396
+ read_tensor_metadata(file_idx, tensors_map);
397
+ }
398
+ void read_magic() {
399
+ uint32_t magic = file.read_u32();
400
+ uint32_t version = 0;
401
+
402
+ if (magic != 'ggml') {
403
+ version = file.read_u32();
404
+ }
405
+
406
+ if (magic == 'ggml' && version == 0) {
407
+ file_version = LLAMA_FILE_VERSION_GGML;
408
+ } else if (magic == 'ggmf' && version == 1) {
409
+ file_version = LLAMA_FILE_VERSION_GGMF_V1;
410
+ } else if (magic == 'ggjt' && version == 1) {
411
+ file_version = LLAMA_FILE_VERSION_GGJT_V1;
412
+ } else {
413
+ throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
414
+ magic, version);
415
+ }
416
+ }
417
+ void read_hparams() {
418
+ hparams.n_vocab = file.read_u32();
419
+ hparams.n_embd = file.read_u32();
420
+ hparams.n_mult = file.read_u32();
421
+ hparams.n_head = file.read_u32();
422
+ hparams.n_layer = file.read_u32();
423
+ hparams.n_rot = file.read_u32();
424
+ hparams.ftype = (enum llama_ftype) file.read_u32();
425
+ }
426
+ void read_vocab() {
427
+ vocab.id_to_token.resize(hparams.n_vocab);
428
+
429
+ for (uint32_t i = 0; i < hparams.n_vocab; i++) {
430
+ uint32_t len = file.read_u32();
431
+ std::string word = file.read_string(len);
432
+
433
+ float score = 0.0f;
434
+ if (file_version >= LLAMA_FILE_VERSION_GGMF_V1) {
435
+ file.read_raw(&score, sizeof(score));
436
+ }
437
+
438
+ vocab.token_to_id[word] = i;
439
+
440
+ auto & tok_score = vocab.id_to_token[i];
441
+ tok_score.tok = std::move(word);
442
+ tok_score.score = score;
443
+ }
444
+ }
445
+ void read_tensor_metadata(size_t file_idx, llama_load_tensors_map & tensors_map) {
446
+ while (file.tell() < file.size) {
447
+ llama_load_tensor_shard shard;
448
+ uint32_t n_dims = file.read_u32();
449
+ uint32_t name_len = file.read_u32();
450
+ shard.type = (enum ggml_type) file.read_u32();
451
+ shard.ne.resize(n_dims);
452
+ file.read_raw(shard.ne.data(), sizeof(shard.ne[0]) * n_dims);
453
+ std::string name = file.read_string(name_len);
454
+ if (n_dims < 1 || n_dims > 2) {
455
+ throw format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims);
456
+ }
457
+ switch (shard.type) {
458
+ case GGML_TYPE_F32:
459
+ case GGML_TYPE_F16:
460
+ case GGML_TYPE_Q4_0:
461
+ case GGML_TYPE_Q4_1:
462
+ break;
463
+ default: {
464
+ throw format("unrecognized tensor type %u\n", shard.type);
465
+ }
466
+ }
467
+
468
+ if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
469
+ // skip to the next multiple of 32 bytes
470
+ file.seek(-file.tell() & 31, SEEK_CUR);
471
+ }
472
+ shard.file_idx = file_idx;
473
+ shard.file_off = file.tell();
474
+
475
+ shard.calc_size();
476
+ file.seek(shard.size, SEEK_CUR);
477
+
478
+ auto it = tensors_map.name_to_idx.find(name);
479
+ size_t idx;
480
+ if (it != tensors_map.name_to_idx.end()) {
481
+ idx = it->second;
482
+ } else {
483
+ tensors_map.tensors.emplace_back(name);
484
+ idx = tensors_map.tensors.size() - 1;
485
+ tensors_map.name_to_idx.emplace(name, idx);
486
+ }
487
+ tensors_map.tensors.at(idx).shards.push_back(shard);
488
+ }
489
+ }
490
+ };
491
+
492
+ struct llama_file_saver {
493
+ llama_file file;
494
+ llama_file_loader * any_file_loader;
495
+ llama_file_saver(const char * fname, llama_file_loader * any_file_loader, enum llama_ftype new_ftype)
496
+ : file(fname, "wb"), any_file_loader(any_file_loader) {
497
+ fprintf(stderr, "llama.cpp: saving model to %s\n", fname);
498
+ write_magic();
499
+ write_hparams(new_ftype);
500
+ write_vocab();
501
+ }
502
+ void write_magic() {
503
+ file.write_u32('ggjt'); // magic
504
+ file.write_u32(1); // version
505
+ }
506
+ void write_hparams(enum llama_ftype new_ftype) {
507
+ const llama_hparams & hparams = any_file_loader->hparams;
508
+ file.write_u32(hparams.n_vocab);
509
+ file.write_u32(hparams.n_embd);
510
+ file.write_u32(hparams.n_mult);
511
+ file.write_u32(hparams.n_head);
512
+ file.write_u32(hparams.n_layer);
513
+ file.write_u32(hparams.n_rot);
514
+ file.write_u32(new_ftype);
515
+ }
516
+ void write_vocab() {
517
+ if (any_file_loader->file_version == LLAMA_FILE_VERSION_GGML) {
518
+ fprintf(stderr, "llama.cpp: WARNING: input is an old file that doesn't have scores; will add dummy scores\n");
519
+ }
520
+ uint32_t n_vocab = any_file_loader->hparams.n_vocab;
521
+ for (uint32_t i = 0; i < n_vocab; i++) {
522
+ const auto & token_score = any_file_loader->vocab.id_to_token.at(i);
523
+ file.write_u32((uint32_t) token_score.tok.size());
524
+ file.write_raw(token_score.tok.data(), token_score.tok.size());
525
+ file.write_raw(&token_score.score, sizeof(token_score.score));
526
+ }
527
+ }
528
+ void write_tensor(llama_load_tensor & tensor, enum ggml_type new_type, const void * new_data, size_t new_size) {
529
+ switch (new_type) {
530
+ case GGML_TYPE_F32:
531
+ case GGML_TYPE_F16:
532
+ case GGML_TYPE_Q4_0:
533
+ case GGML_TYPE_Q4_1:
534
+ break;
535
+ default: LLAMA_ASSERT(false);
536
+ }
537
+ file.write_u32((uint32_t) tensor.ne.size());
538
+ file.write_u32((uint32_t) tensor.name.size());
539
+ file.write_u32(new_type);
540
+ file.write_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * tensor.ne.size());
541
+ file.write_raw(tensor.name.data(), tensor.name.size());
542
+ file.seek(-file.tell() & 31, SEEK_CUR);
543
+ LLAMA_ASSERT(new_size == llama_calc_tensor_size(tensor.ne, new_type));
544
+ file.write_raw(new_data, new_size);
545
+ }
546
+ };
547
+
548
+ struct llama_model_loader {
549
+ std::vector<std::unique_ptr<llama_file_loader>> file_loaders;
550
+ llama_load_tensors_map tensors_map;
551
+ bool use_mmap;
552
+ size_t num_ggml_tensors_created = 0;
553
+ struct ggml_context * ggml_ctx = NULL;
554
+ std::unique_ptr<llama_mmap> mapping;
555
+
556
+ llama_model_loader(const std::string & fname_base, bool use_mmap, bool vocab_only) {
557
+ auto first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
558
+ file_loaders.emplace_back(first_file);
559
+ uint32_t n_parts = vocab_only ? 1 : guess_n_parts();
560
+ for (uint32_t i = 1; i < n_parts; i++) {
561
+ std::string fname = fname_base + "." + std::to_string(i);
562
+ auto ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
563
+ file_loaders.emplace_back(ith_file);
564
+ if (ith_file->hparams != first_file->hparams) {
565
+ throw format("llama.cpp: hparams inconsistent between files");
566
+ }
567
+ }
568
+ if (!llama_mmap::SUPPORTED) {
569
+ use_mmap = false;
570
+ }
571
+ if (use_mmap && alignment_prevents_mmap()) {
572
+ fprintf(stderr, "llama.cpp: can't use mmap because tensors are not aligned; convert to new format to avoid this\n");
573
+ use_mmap = false;
574
+ }
575
+ this->use_mmap = use_mmap;
576
+ for (llama_load_tensor & lt : tensors_map.tensors) {
577
+ lt.calc_all();
578
+ }
579
+ }
580
+
581
+ bool alignment_prevents_mmap() {
582
+ for (const llama_load_tensor & lt : tensors_map.tensors) {
583
+ for (const llama_load_tensor_shard & shard : lt.shards) {
584
+ if (shard.file_off & 3) {
585
+ return true;
586
+ }
587
+ }
588
+ }
589
+ return false;
590
+ }
591
+
592
+ uint32_t guess_n_parts() const {
593
+ auto it = tensors_map.name_to_idx.find("tok_embeddings.weight");
594
+ if (it == tensors_map.name_to_idx.end()) {
595
+ throw std::string("missing tok_embeddings.weight");
596
+ }
597
+ const llama_load_tensor & lt = tensors_map.tensors.at(it->second);
598
+ return file_loaders.at(0)->hparams.n_embd / lt.shards.at(0).ne.at(0);
599
+ }
600
+
601
+ void calc_sizes(size_t * ctx_size_p, size_t * mmapped_size_p) const {
602
+ *ctx_size_p = *mmapped_size_p = 0;
603
+ for (const llama_load_tensor & lt : tensors_map.tensors) {
604
+ *ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE;
605
+ *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size;
606
+ }
607
+ }
608
+
609
+ struct ggml_tensor * get_tensor(const std::string & name, std::vector<uint32_t> ne) {
610
+ auto it = tensors_map.name_to_idx.find(name);
611
+ if (it == tensors_map.name_to_idx.end()) {
612
+ throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
613
+ }
614
+ llama_load_tensor & lt = tensors_map.tensors.at(it->second);
615
+ if (lt.ne != ne) {
616
+ throw format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
617
+ name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
618
+ }
619
+ return get_tensor_for(lt);
620
+ }
621
+
622
+ struct ggml_tensor * get_tensor_for(llama_load_tensor & lt) {
623
+ struct ggml_tensor * tensor;
624
+ if (lt.ne.size() == 2) {
625
+ tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
626
+ } else {
627
+ LLAMA_ASSERT(lt.ne.size() == 1);
628
+ tensor = ggml_new_tensor_1d(ggml_ctx, lt.type, lt.ne.at(0));
629
+ }
630
+ LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
631
+ lt.ggml_tensor = tensor;
632
+ num_ggml_tensors_created++;
633
+ return tensor;
634
+ }
635
+
636
+ void done_getting_tensors() {
637
+ if (num_ggml_tensors_created != tensors_map.tensors.size()) {
638
+ throw std::string("llama.cpp: file contained more tensors than expected");
639
+ }
640
+ }
641
+
642
+ void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
643
+ size_t data_size = 0;
644
+ for (const llama_load_tensor & lt : tensors_map.tensors) {
645
+ data_size += lt.size;
646
+ }
647
+
648
+ if (use_mmap) {
649
+ mapping.reset(new llama_mmap(&file_loaders.at(0)->file));
650
+ if (!lmlock) {
651
+ // Don't call the callback since the actual loading will be lazy
652
+ // and we can't measure it.
653
+ progress_callback = NULL;
654
+ }
655
+ if (lmlock) {
656
+ lmlock->init(mapping->addr);
657
+ }
658
+ }
659
+
660
+ size_t done_size = 0;
661
+ for (llama_load_tensor & lt : tensors_map.tensors) {
662
+ if (progress_callback) {
663
+ progress_callback((float) done_size / data_size, progress_callback_user_data);
664
+ }
665
+ LLAMA_ASSERT(lt.ggml_tensor); // unused tensors should have been caught by load_data already
666
+ lt.data = (uint8_t *) lt.ggml_tensor->data;
667
+ load_data_for(lt);
668
+ lt.ggml_tensor->data = lt.data;
669
+ done_size += lt.size;
670
+ if (use_mmap && lmlock) {
671
+ lmlock->grow_to(done_size);
672
+ }
673
+ }
674
+ if (progress_callback) {
675
+ progress_callback(1.0f, progress_callback_user_data);
676
+ }
677
+ }
678
+
679
+ void load_data_for(llama_load_tensor & lt) {
680
+ if (use_mmap) {
681
+ LLAMA_ASSERT(lt.shards.size() == 1);
682
+ lt.data = (uint8_t *) mapping->addr + lt.shards.at(0).file_off;
683
+ } else if (lt.split_type == SPLIT_NONE) {
684
+ llama_file & file = file_loaders.at(lt.shards.at(0).file_idx)->file;
685
+ file.seek(lt.shards.at(0).file_off, SEEK_SET);
686
+ file.read_raw(lt.data, lt.size);
687
+ } else if (lt.split_type == SPLIT_BY_ROWS) {
688
+ size_t offset = 0;
689
+ for (llama_load_tensor_shard & shard : lt.shards) {
690
+ llama_file & file = file_loaders.at(shard.file_idx)->file;
691
+ file.seek(shard.file_off, SEEK_SET);
692
+ file.read_raw(lt.data + offset, shard.size);
693
+ offset += shard.size;
694
+ }
695
+ LLAMA_ASSERT(offset == lt.size);
696
+ } else if (lt.split_type == SPLIT_BY_COLUMNS) {
697
+ // Let's load the data into temporary buffers to ensure the OS performs large loads.
698
+ std::vector<llama_buffer> tmp_bufs;
699
+ tmp_bufs.resize(lt.shards.size());
700
+ for (size_t i = 0; i < lt.shards.size(); i++) {
701
+ llama_load_tensor_shard & shard = lt.shards.at(i);
702
+ llama_file & file = file_loaders.at(shard.file_idx)->file;
703
+ file.seek(shard.file_off, SEEK_SET);
704
+ tmp_bufs.at(i).resize(shard.size);
705
+ file.read_raw(tmp_bufs.at(i).addr, shard.size);
706
+ }
707
+ // Then reshape.
708
+ size_t num_rows = lt.ne.at(1);
709
+ size_t per_shard_row_size = lt.shards.at(0).size / num_rows;
710
+ size_t out_offset = 0;
711
+ for (size_t row = 0; row < num_rows; row++) {
712
+ for (llama_buffer & tmp_buf : tmp_bufs) {
713
+ memcpy(lt.data + out_offset,
714
+ tmp_buf.addr + row * per_shard_row_size,
715
+ per_shard_row_size);
716
+ out_offset += per_shard_row_size;
717
+ }
718
+ }
719
+ LLAMA_ASSERT(out_offset == lt.size);
720
+ }
721
+ if (0) {
722
+ print_checksum(lt);
723
+ }
724
+ }
725
+
726
+ static void print_checksum(llama_load_tensor & lt) {
727
+ uint32_t sum = 0;
728
+ for (size_t i = 0; i < lt.size; i++) {
729
+ uint8_t byte = lt.data[i];
730
+ sum = byte + (sum << 6) + (sum << 16) - sum; // sdbm hash
731
+ }
732
+ fprintf(stderr, "%s checksum: %#08x (%s, size %zu)\n", lt.name.c_str(), sum,
733
+ llama_format_tensor_shape(lt.ne).c_str(), lt.size);
734
+ }
735
+
736
+ };
737
+
738
+
247
739
  //
248
740
  // kv cache
249
741
  //
@@ -256,14 +748,14 @@ static bool kv_cache_init(
      const int n_embd = hparams.n_embd;
      const int n_layer = hparams.n_layer;

-     const int n_mem = n_layer*n_ctx;
-     const int n_elements = n_embd*n_mem;
+     const int64_t n_mem = (int64_t)n_layer*n_ctx;
+     const int64_t n_elements = n_embd*n_mem;

      cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);

      struct ggml_init_params params;
-     params.mem_size = cache.buf.size();
-     params.mem_buffer = cache.buf.data();
+     params.mem_size = cache.buf.size;
+     params.mem_buffer = cache.buf.addr;
      params.no_alloc = false;

      cache.ctx = ggml_init(params);
@@ -279,13 +771,6 @@ static bool kv_cache_init(
      return true;
  }

- static void kv_cache_free(struct llama_kv_cache & cache) {
-     if (cache.ctx) {
-         ggml_free(cache.ctx);
-         cache.ctx = nullptr;
-     }
- }
-
  struct llama_context_params llama_context_default_params() {
      struct llama_context_params result = {
          /*.n_ctx =*/ 512,
@@ -294,6 +779,7 @@ struct llama_context_params llama_context_default_params() {
          /*.f16_kv =*/ false,
          /*.logits_all =*/ false,
          /*.vocab_only =*/ false,
+         /*.use_mmap =*/ true,
          /*.use_mlock =*/ false,
          /*.embedding =*/ false,
          /*.progress_callback =*/ nullptr,
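The hunk above adds a use_mmap flag (default true) alongside the existing use_mlock flag in llama_context_default_params(), and the diff also introduces llama_mmap_supported() / llama_mlock_supported() so callers can query platform support. A minimal sketch of how a consumer of this version might drive the new flags is shown below; it assumes the llama_init_from_file() and llama_free() entry points declared in llama.h of this release.

    #include "llama.h"
    #include <cstdio>

    int main(int argc, char ** argv) {
        if (argc < 2) {
            fprintf(stderr, "usage: %s <model.bin>\n", argv[0]);
            return 1;
        }

        llama_context_params params = llama_context_default_params();
        // Prefer lazy mmap-based loading where available; the loader falls back to buffered reads otherwise.
        params.use_mmap  = llama_mmap_supported();
        // Pinning the weights with mlock is opt-in and can fail on low ulimits, so leave it off here.
        params.use_mlock = false;

        llama_context * ctx = llama_init_from_file(argv[1], params);
        if (ctx == NULL) {
            fprintf(stderr, "failed to load model '%s'\n", argv[1]);
            return 1;
        }

        llama_free(ctx);
        return 0;
    }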
@@ -303,243 +789,106 @@ struct llama_context_params llama_context_default_params() {
303
789
  return result;
304
790
  }
305
791
 
792
+ bool llama_mmap_supported() {
793
+ return llama_mmap::SUPPORTED;
794
+ }
795
+
796
+ bool llama_mlock_supported() {
797
+ return llama_mlock::SUPPORTED;
798
+ }
799
+
306
800
  //
307
801
  // model loading
308
802
  //
309
803
 
310
- static void *mmap_file(const char *fname, uint64_t *mm_length) {
311
- #if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
312
- HANDLE hFile = CreateFileA(fname,
313
- GENERIC_READ,
314
- FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
315
- NULL,
316
- OPEN_EXISTING,
317
- FILE_ATTRIBUTE_NORMAL | FILE_ATTRIBUTE_NOT_CONTENT_INDEXED,
318
- NULL);
319
- if (hFile == INVALID_HANDLE_VALUE) return 0;
320
- LARGE_INTEGER fileSize;
321
- fileSize.QuadPart = -1;
322
- GetFileSizeEx(hFile, &fileSize);
323
- int64_t length = fileSize.QuadPart;
324
- HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
325
- CloseHandle(hFile);
326
- if (!hMapping) return 0;
327
- void *addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
328
- CloseHandle(hMapping);
329
- if (!addr) return 0;
330
- #else
331
- int fd = open(fname, O_RDONLY);
332
- if (fd == -1) return 0;
333
- int64_t length = lseek(fd, 0, SEEK_END);
334
- void *addr = mmap(NULL, length, PROT_READ, MAP_SHARED, fd, 0);
335
- close(fd);
336
- if (addr == MAP_FAILED) return 0;
337
- #endif
338
- *mm_length = length;
339
- return addr;
804
+ static const char *llama_file_version_name(llama_file_version version) {
805
+ switch (version) {
806
+ case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)";
807
+ case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
808
+ case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (latest)";
809
+ default: LLAMA_ASSERT(false);
810
+ }
340
811
  }
341
812
 
342
- static void munmap_file(void * addr, size_t length) {
343
- #if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
344
- UnmapViewOfFile(addr);
345
- #else
346
- munmap(addr, length);
347
- #endif
813
+ static const char *llama_ftype_name(enum llama_ftype ftype) {
814
+ switch (ftype) {
815
+ case LLAMA_FTYPE_ALL_F32: return "all F32";
816
+ case LLAMA_FTYPE_MOSTLY_F16: return "mostly F16";
817
+ case LLAMA_FTYPE_MOSTLY_Q4_0: return "mostly Q4_0";
818
+ case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
819
+ case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
820
+ return "mostly Q4_1, some F16";
821
+ default: return "unknown, may not work";
822
+ }
348
823
  }
349
824
 
350
- static bool report_bad_magic(const char *path, uint32_t got, uint32_t want) {
351
- fprintf(stderr,
352
- "%s: invalid model file (bad magic [got %#x want %#x])\n"
353
- "\tyou most likely need to regenerate your ggml files\n"
354
- "\tthe benefit is you'll get 10-100x faster load times\n"
355
- "\tsee https://github.com/ggerganov/llama.cpp/issues/91\n"
356
- "\tuse convert-pth-to-ggml.py to regenerate from original pth\n"
357
- "\tuse migrate-ggml-2023-03-30-pr613.py if you deleted originals\n",
358
- path, got, want);
359
- return false;
825
+ static const char *llama_model_type_name(e_model type) {
826
+ switch (type) {
827
+ case MODEL_7B: return "7B";
828
+ case MODEL_13B: return "13B";
829
+ case MODEL_30B: return "30B";
830
+ case MODEL_65B: return "65B";
831
+ default: LLAMA_ASSERT(false);
832
+ }
360
833
  }
361
834
 
362
- static bool llama_model_load(
835
+ static void llama_model_load_internal(
363
836
  const std::string & fname,
364
837
  llama_context & lctx,
365
838
  int n_ctx,
366
- int n_parts,
367
839
  ggml_type memory_type,
840
+ bool use_mmap,
841
+ bool use_mlock,
368
842
  bool vocab_only,
369
843
  llama_progress_callback progress_callback,
370
- void *progress_callback_user_data) {
371
- fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
844
+ void * progress_callback_user_data) {
372
845
 
373
846
  lctx.t_start_us = ggml_time_us();
374
847
 
375
- auto & model = lctx.model;
376
- auto & vocab = lctx.vocab;
848
+ std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap, vocab_only));
377
849
 
378
- auto fin = std::ifstream(fname, std::ios::binary);
379
- if (!fin) {
380
- fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
381
- return false;
382
- }
383
-
384
- std::vector<char> f_buf(1024*1024);
385
- fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
386
-
387
- fin.seekg(0, fin.end);
388
- const size_t file_size = fin.tellg();
389
- fin.seekg(0);
850
+ lctx.vocab = std::move(ml->file_loaders.at(0)->vocab);
851
+ auto & model = lctx.model;
852
+ model.hparams = ml->file_loaders.at(0)->hparams;
853
+ llama_file_version file_version = ml->file_loaders.at(0)->file_version;
854
+ auto & hparams = model.hparams;
855
+ uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
390
856
 
391
- // verify magic
392
857
  {
393
- uint32_t magic;
394
- fin.read((char *) &magic, sizeof(magic));
395
- if (magic == LLAMA_FILE_MAGIC_UNVERSIONED) {
396
- fprintf(stderr, "%s: invalid model file '%s' (too old, regenerate your model files or convert them with convert-unversioned-ggml-to-ggml.py!)\n",
397
- __func__, fname.c_str());
398
- return false;
858
+ switch (hparams.n_layer) {
859
+ case 32: model.type = e_model::MODEL_7B; break;
860
+ case 40: model.type = e_model::MODEL_13B; break;
861
+ case 60: model.type = e_model::MODEL_30B; break;
862
+ case 80: model.type = e_model::MODEL_65B; break;
399
863
  }
400
- if (magic != LLAMA_FILE_MAGIC) {
401
- return report_bad_magic(fname.c_str(), magic, LLAMA_FILE_MAGIC);
402
- }
403
-
404
- uint32_t format_version;
405
- fin.read((char *) &format_version, sizeof(format_version));
406
-
407
- if (format_version != LLAMA_FILE_VERSION) {
408
- fprintf(stderr, "%s: invalid model file '%s' (unsupported format version %" PRIu32 ", expected %d)\n",
409
- __func__, fname.c_str(), format_version, LLAMA_FILE_VERSION);
410
- return false;
411
- }
412
- }
413
-
414
- int n_ff = 0;
415
-
416
- // load hparams
417
- {
418
- auto & hparams = model.hparams;
419
-
420
- fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
421
- //fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
422
- fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
423
- fin.read((char *) &hparams.n_mult, sizeof(hparams.n_mult));
424
- fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
425
- fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
426
- fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
427
- fin.read((char *) &hparams.f16, sizeof(hparams.f16));
428
864
 
429
865
  hparams.n_ctx = n_ctx;
430
-
431
- n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
432
-
433
- if (n_parts < 1) {
434
- n_parts = LLAMA_N_PARTS.at(hparams.n_embd);
435
- }
436
-
437
- // temp warning to tell the user to use "--n_parts"
438
- if (hparams.f16 == 4 && n_parts != 1) {
439
- fprintf(stderr, "%s: GPTQ model detected - are you sure n_parts should be %d? we normally expect it to be 1\n", __func__, n_parts);
440
- fprintf(stderr, "%s: use '--n_parts 1' if necessary\n", __func__);
441
- }
442
-
443
- if (hparams.n_layer == 32) {
444
- model.type = e_model::MODEL_7B;
445
- }
446
-
447
- if (hparams.n_layer == 40) {
448
- model.type = e_model::MODEL_13B;
449
- }
450
-
451
- if (hparams.n_layer == 60) {
452
- model.type = e_model::MODEL_30B;
453
- }
454
-
455
- if (hparams.n_layer == 80) {
456
- model.type = e_model::MODEL_65B;
457
- }
458
-
459
- fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab);
460
- fprintf(stderr, "%s: n_ctx = %d\n", __func__, hparams.n_ctx);
461
- fprintf(stderr, "%s: n_embd = %d\n", __func__, hparams.n_embd);
462
- fprintf(stderr, "%s: n_mult = %d\n", __func__, hparams.n_mult);
463
- fprintf(stderr, "%s: n_head = %d\n", __func__, hparams.n_head);
464
- fprintf(stderr, "%s: n_layer = %d\n", __func__, hparams.n_layer);
465
- fprintf(stderr, "%s: n_rot = %d\n", __func__, hparams.n_rot);
466
- fprintf(stderr, "%s: f16 = %d\n", __func__, hparams.f16);
467
- fprintf(stderr, "%s: n_ff = %d\n", __func__, n_ff);
468
- fprintf(stderr, "%s: n_parts = %d\n", __func__, n_parts);
469
- fprintf(stderr, "%s: type = %d\n", __func__, model.type);
470
866
  }
471
867
 
472
- // load vocab
473
868
  {
474
- std::string word;
475
- vocab.id_to_token.resize(model.hparams.n_vocab);
476
- std::vector<char> tmp(64);
477
-
478
- for (int i = 0; i < model.hparams.n_vocab; i++) {
479
- uint32_t len;
480
- fin.read((char *) &len, sizeof(len));
481
-
482
- word.resize(len);
483
- if (len > 0) {
484
- tmp.resize(len);
485
- fin.read(tmp.data(), len);
486
- word.assign(tmp.data(), len);
487
- } else {
488
- word.clear();
489
- }
490
-
491
- float score;
492
- fin.read((char *) &score, sizeof(score));
493
-
494
- vocab.token_to_id[word] = i;
495
-
496
- auto &tok_score = vocab.id_to_token[i];
497
- tok_score.tok = word;
498
- tok_score.score = score;
499
- }
869
+ fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version));
870
+ fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab);
871
+ fprintf(stderr, "%s: n_ctx = %u\n", __func__, hparams.n_ctx);
872
+ fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd);
873
+ fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult);
874
+ fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head);
875
+ fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer);
876
+ fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot);
877
+ fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
878
+ fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
879
+ fprintf(stderr, "%s: n_parts = %zu\n", __func__, ml->file_loaders.size());
880
+ fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
500
881
  }
501
882
 
502
883
  if (vocab_only) {
503
- return true;
504
- }
505
-
506
- // for the big tensors, we have the option to store the data in 16-bit floats or quantized
507
- // in order to save memory and also to speed up the computation
508
- // wtype is for per-layer weights, while vtype is for other weights
509
- ggml_type wtype, vtype;
510
- switch (model.hparams.f16) {
511
- case 0: wtype = vtype = GGML_TYPE_F32; break;
512
- case 1: wtype = vtype = GGML_TYPE_F16; break;
513
- case 2: wtype = vtype = GGML_TYPE_Q4_0; break;
514
- case 3: wtype = vtype = GGML_TYPE_Q4_1; break;
515
- case 4: wtype = GGML_TYPE_Q4_1; vtype = GGML_TYPE_F16; break;
516
- default:
517
- {
518
- fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n",
519
- __func__, fname.c_str(), model.hparams.f16);
520
- return false;
521
- }
522
- }
523
-
524
- // map model into memory
525
- char *mm_addr = NULL;
526
- model.mm_addr = mmap_file(fname.c_str(), &model.mm_length);
527
- if (model.mm_addr == NULL) {
528
- fprintf(stderr, "%s: failed to mmap '%s'\n", __func__, fname.c_str());
529
- return false;
884
+ return;
530
885
  }
531
- mm_addr = (char *)model.mm_addr;
532
- fprintf(stderr, "%s: ggml map size = %6.2f MB\n", __func__, model.mm_length/(1024.0*1024.0));
533
886
 
534
887
  auto & ctx = model.ctx;
535
888
 
536
- size_t ctx_size = 0;
537
- {
538
- const auto &hparams = model.hparams;
539
- const int n_layer = hparams.n_layer;
540
- ctx_size += (5 + 10*n_layer)*256; // object overhead
541
- fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0);
542
- }
889
+ size_t ctx_size, mmapped_size;
890
+ ml->calc_sizes(&ctx_size, &mmapped_size);
891
+ fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0);
543
892
 
544
893
  // print memory requirements
545
894
  {
@@ -548,7 +897,7 @@ static bool llama_model_load(
548
897
  // this is the total memory required to run the inference
549
898
  const size_t mem_required =
550
899
  ctx_size +
551
- model.mm_length +
900
+ mmapped_size +
552
901
  MEM_REQ_SCRATCH0.at(model.type) +
553
902
  MEM_REQ_SCRATCH1.at(model.type) +
554
903
  MEM_REQ_EVAL.at (model.type);
@@ -564,17 +913,20 @@ static bool llama_model_load(
564
913
  // create the ggml context
565
914
  {
566
915
  lctx.model.buf.resize(ctx_size);
916
+ if (use_mlock) {
917
+ lctx.model.mlock_buf.init(lctx.model.buf.addr);
918
+ lctx.model.mlock_buf.grow_to(lctx.model.buf.size);
919
+ }
567
920
 
568
921
  struct ggml_init_params params = {
569
- /*.mem_size =*/ lctx.model.buf.size(),
570
- /*.mem_buffer =*/ lctx.model.buf.data(),
571
- /*.no_alloc =*/ true,
922
+ /*.mem_size =*/ lctx.model.buf.size,
923
+ /*.mem_buffer =*/ lctx.model.buf.addr,
924
+ /*.no_alloc =*/ ml->use_mmap,
572
925
  };
573
926
 
574
927
  model.ctx = ggml_init(params);
575
928
  if (!model.ctx) {
576
- fprintf(stderr, "%s: ggml_init() failed\n", __func__);
577
- return false;
929
+ throw format("ggml_init() failed");
578
930
  }
579
931
  }
580
932
 
@@ -582,161 +934,71 @@ static bool llama_model_load(
582
934
  {
583
935
  const auto & hparams = model.hparams;
584
936
 
585
- const int n_embd = hparams.n_embd;
586
- const int n_layer = hparams.n_layer;
587
- const int n_vocab = hparams.n_vocab;
588
-
589
- model.layers.resize(n_layer);
590
-
591
- model.tok_embeddings = ggml_new_tensor_2d(ctx, vtype, n_embd, n_vocab);
937
+ const uint32_t n_embd = hparams.n_embd;
938
+ const uint32_t n_layer = hparams.n_layer;
939
+ const uint32_t n_vocab = hparams.n_vocab;
592
940
 
593
- model.norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
594
- model.output = ggml_new_tensor_2d(ctx, vtype, n_embd, n_vocab);
941
+ ml->ggml_ctx = ctx;
595
942
 
596
- // map by name
597
- model.tensors["tok_embeddings.weight"] = model.tok_embeddings;
943
+ model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab});
944
+ model.norm = ml->get_tensor("norm.weight", {n_embd});
945
+ model.output = ml->get_tensor("output.weight", {n_embd, n_vocab});
598
946
 
599
- model.tensors["norm.weight"] = model.norm;
600
- model.tensors["output.weight"] = model.output;
601
-
602
- for (int i = 0; i < n_layer; ++i) {
947
+ model.layers.resize(n_layer);
948
+ for (uint32_t i = 0; i < n_layer; ++i) {
603
949
  auto & layer = model.layers[i];
604
950
 
605
- layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
606
-
607
- layer.wq = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
608
- layer.wk = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
609
- layer.wv = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
610
- layer.wo = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
611
-
612
- layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
951
+ std::string layers_i = "layers." + std::to_string(i);
613
952
 
614
- layer.w1 = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ff);
615
- layer.w2 = ggml_new_tensor_2d(ctx, wtype, n_ff, n_embd);
616
- layer.w3 = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ff);
953
+ layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd});
617
954
 
618
- // map by name
619
- model.tensors["layers." + std::to_string(i) + ".attention_norm.weight"] = layer.attention_norm;
955
+ layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd});
956
+ layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd});
957
+ layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd});
958
+ layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd});
620
959
 
621
- model.tensors["layers." + std::to_string(i) + ".attention.wq.weight"] = layer.wq;
622
- model.tensors["layers." + std::to_string(i) + ".attention.wk.weight"] = layer.wk;
623
- model.tensors["layers." + std::to_string(i) + ".attention.wv.weight"] = layer.wv;
624
- model.tensors["layers." + std::to_string(i) + ".attention.wo.weight"] = layer.wo;
960
+ layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd});
625
961
 
626
- model.tensors["layers." + std::to_string(i) + ".ffn_norm.weight"] = layer.ffn_norm;
627
-
628
- model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.weight"] = layer.w1;
629
- model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight"] = layer.w2;
630
- model.tensors["layers." + std::to_string(i) + ".feed_forward.w3.weight"] = layer.w3;
962
+ layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff});
963
+ layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd});
964
+ layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff});
631
965
  }
632
966
  }
633
967
 
634
- std::vector<uint8_t> tmp;
968
+ ml->done_getting_tensors();
635
969
 
636
- if (progress_callback) {
637
- progress_callback(0.0, progress_callback_user_data);
970
+ // populate `tensors_by_name`
971
+ for (llama_load_tensor & lt : ml->tensors_map.tensors) {
972
+ model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
638
973
  }
639
974
 
640
- fprintf(stderr, "%s: loading tensors from '%s'\n", __func__, fname.c_str());
641
-
642
- // load weights
643
- {
644
- size_t total_size = 0;
645
- model.n_loaded = 0;
646
-
647
- while (true) {
648
- int32_t n_dims;
649
- int32_t length;
650
- int32_t ftype;
651
-
652
- fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
653
- fin.read(reinterpret_cast<char *>(&length), sizeof(length));
654
- fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
655
-
656
- if (fin.eof()) {
657
- break;
658
- }
659
-
660
- int32_t nelements = 1;
661
- int32_t ne[2] = { 1, 1 };
662
- for (int i = 0; i < n_dims; ++i) {
663
- fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
664
- nelements *= ne[i];
665
- }
666
-
667
- std::string name(length, 0);
668
- fin.read(&name[0], length);
669
-
670
- if (model.tensors.find(name.data()) == model.tensors.end()) {
671
- fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
672
- return false;
673
- }
975
+ ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
674
976
 
675
- auto tensor = model.tensors[name.data()];
676
-
677
- if (ggml_nelements(tensor) != nelements) {
678
- fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
679
- return false;
680
- }
681
- if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
682
- fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
683
- __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
684
- return false;
685
- }
686
- if (0) {
687
- static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
688
- fprintf(stderr, "%24s - [%5d, %5d], type = %6s\n", name.data(), ne[0], ne[1], ftype_str[ftype]);
689
- }
690
-
691
- switch (ftype) {
692
- case 0: // f32
693
- case 1: // f16
694
- break;
695
- case 2: // q4_0
696
- case 3: // q4_1
697
- assert(ne[0] % 64 == 0);
698
- break;
699
- default:
700
- fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
701
- return false;
702
- };
703
-
704
- // load the tensor data into memory without copying or reading it
705
- size_t offset = fin.tellg();
706
- size_t tensor_data_size = ggml_nbytes(tensor);
707
- offset = (offset + 31) & -32;
708
- tensor->data = mm_addr + offset;
709
- fin.seekg(offset + tensor_data_size);
710
- total_size += tensor_data_size;
711
- model.n_loaded++;
712
-
713
- // progress
714
- if (progress_callback) {
715
- double current_progress = size_t(fin.tellg()) / double(file_size);
716
- progress_callback(current_progress, progress_callback_user_data);
717
- }
718
- }
719
-
720
- fin.close();
721
-
722
- fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, model.n_loaded);
723
- if (model.n_loaded == 0) {
724
- fprintf(stderr, "%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__);
725
- } else if (model.n_loaded != (int) model.tensors.size()) {
726
- fprintf(stderr, "%s: ERROR not all tensors loaded from model file - expected %zu, got %d\n", __func__, model.tensors.size(), model.n_loaded);
727
- return false;
728
- }
729
- }
977
+ model.mapping = std::move(ml->mapping);
730
978
 
731
979
  // loading time will be recalculate after the first eval, so
732
980
  // we take page faults deferred by mmap() into consideration
733
981
  lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
982
+ }
734
983
 
735
- if (progress_callback) {
736
- progress_callback(1.0, progress_callback_user_data);
984
+ static bool llama_model_load(
985
+ const std::string & fname,
986
+ llama_context & lctx,
987
+ int n_ctx,
988
+ ggml_type memory_type,
989
+ bool use_mmap,
990
+ bool use_mlock,
991
+ bool vocab_only,
992
+ llama_progress_callback progress_callback,
993
+ void *progress_callback_user_data) {
994
+ try {
995
+ llama_model_load_internal(fname, lctx, n_ctx, memory_type, use_mmap, use_mlock,
996
+ vocab_only, progress_callback, progress_callback_user_data);
997
+ return true;
998
+ } catch (const std::string & err) {
999
+ fprintf(stderr, "error loading model: %s\n", err.c_str());
1000
+ return false;
737
1001
  }
738
-
739
- return true;
740
1002
  }
741
1003
 
742
1004
  // evaluate the transformer
@@ -774,8 +1036,8 @@ static bool llama_eval_internal(
      auto & buf_compute = lctx.buf_compute;

      struct ggml_init_params params = {
-         /*.mem_size =*/ buf_compute.size(),
-         /*.mem_buffer =*/ buf_compute.data(),
+         /*.mem_size =*/ buf_compute.size,
+         /*.mem_buffer =*/ buf_compute.addr,
          /*.no_alloc =*/ false,
      };

@@ -810,37 +1072,35 @@ static bool llama_eval_internal(

        // self-attention
        {
-           struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
-           struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
-           struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+           // compute Q and K and RoPE them
+           struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
+           struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);

            // store key and value to memory
-           if (N >= 1) {
+           {
+               // compute the transposed [N, n_embd] V matrix
+               struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), n_embd, N));
+
                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
-               struct ggml_tensor * v = ggml_view_1d(ctx0, kv_self.v, N*n_embd, (ggml_element_size(kv_self.v)*n_embd)*(il*n_ctx + n_past));
+               struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
+                       ( n_ctx)*ggml_element_size(kv_self.v),
+                       (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));

+               // important: storing RoPE-ed version of K in the KV cache!
                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
            }

-           // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
            struct ggml_tensor * Q =
                ggml_permute(ctx0,
-                   ggml_rope(ctx0,
-                       ggml_cpy(ctx0,
-                           Qcur,
-                           ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
-                       n_past, n_rot, 0),
+                   Qcur,
                    0, 2, 1, 3);

-           // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
            struct ggml_tensor * K =
                ggml_permute(ctx0,
-                   ggml_rope(ctx0,
-                       ggml_reshape_3d(ctx0,
-                           ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
-                           n_embd/n_head, n_head, n_past + N),
-                       n_past, n_rot, 1),
+                   ggml_reshape_3d(ctx0,
+                       ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
+                       n_embd/n_head, n_head, n_past + N),
                    0, 2, 1, 3);

            // K * Q
@@ -858,18 +1118,23 @@ static bool llama_eval_internal(
            // KQ = soft_max(KQ_masked)
            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);

-           // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
-           struct ggml_tensor * V_trans =
-               ggml_cpy(ctx0,
-                   ggml_permute(ctx0,
-                       ggml_reshape_3d(ctx0,
-                           ggml_view_1d(ctx0, kv_self.v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.v)*n_embd),
-                           n_embd/n_head, n_head, n_past + N),
-                       1, 2, 0, 3),
-                   ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd/n_head, n_head));
+           // split cached V into n_head heads
+           struct ggml_tensor * V =
+               ggml_view_3d(ctx0, kv_self.v,
+                   n_past + N, n_embd/n_head, n_head,
+                   n_ctx*ggml_element_size(kv_self.v),
+                   n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head,
+                   il*n_ctx*ggml_element_size(kv_self.v)*n_embd);

-           // KQV = transpose(V) * KQ_soft_max
-           struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
+ #if 1
+           struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+ #else
+           // make V contiguous in memory to speed up the matmul, however we waste time on the copy
+           // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
+           // is there a better way?
+           struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd/n_head, n_head));
+           struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
+ #endif

            // KQV_merged = KQV.permute(0, 2, 1, 3)
            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
@@ -955,9 +1220,13 @@ static bool llama_eval_internal(
      ggml_build_forward_expand(&gf, inpL);
      ggml_graph_compute (ctx0, &gf);

+     // print timing information per ggml operation (for debugging purposes)
+     // requires GGML_PERF to be defined
+     //ggml_graph_print(&gf);
+
+     // plot the computation graph in dot format (for debugging purposes)
      //if (n_past%100 == 0) {
-     //    ggml_graph_print (&gf);
-     //    ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
+     //    ggml_graph_dump_dot(&gf, NULL, "llama.dot");
      //}

      //embd_w.resize(n_vocab*N);
@@ -1054,7 +1323,7 @@ struct llama_tokenizer {
          size_t offs = 0;
          while (offs < text.size()) {
              llama_sp_symbol sym;
-             size_t char_len = Min(text.size() - offs, utf8_len(text[offs]));
+             size_t char_len = std::min(text.size() - offs, utf8_len(text[offs]));
              sym.text = text.c_str() + offs;
              sym.n = char_len;
              offs += char_len;
@@ -1194,6 +1463,20 @@ static llama_vocab::id llama_sample_top_p_top_k(
      const auto & logits = lctx.logits;
      const auto * plogits = logits.data() + logits.size() - n_logits;

+     if (temp <= 0) {
+         // select the token with the highest logit directly
+         float max_logit = plogits[0];
+         llama_vocab::id max_id = 0;
+
+         for (int i = 1; i < n_logits; ++i) {
+             if (plogits[i] > max_logit) {
+                 max_logit = plogits[i];
+                 max_id = i;
+             }
+         }
+         return max_id;
+     }
+
      std::vector<std::pair<float, llama_vocab::id>> logits_id;
      logits_id.reserve(n_logits);

@@ -1215,17 +1498,13 @@
          }
      }

-     sample_top_k(logits_id, top_k);
-
-     float maxl = -std::numeric_limits<float>::infinity();
-     for (const auto & kv : logits_id) {
-         maxl = Max(maxl, kv.first);
-     }
+     sample_top_k(logits_id, top_k > 0 ? std::min(top_k, n_logits) : n_logits);

      // compute probs for the top k tokens
      std::vector<float> probs;
      probs.reserve(logits_id.size());

+     float maxl = logits_id[0].first;
      double sum = 0.0;
      for (const auto & kv : logits_id) {
          const float p = expf(kv.first - maxl);
@@ -1248,16 +1527,11 @@
                  break;
              }
          }
-
-         cumsum = 1.0/cumsum;
-         for (int i = 0; i < (int) probs.size(); i++) {
-             probs[i] *= cumsum;
-         }
      }

      //printf("\n");
      //for (int i = 0; i < (int) 10; i++) {
-     //    printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]);
+     //    printf("%d: '%s' %f\n", i, lctx.vocab.id_to_token.at(logits_id[i].second).tok.c_str(), probs[i]);
      //}
      //printf("\n\n");
      //exit(0);
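The sampling hunks above change llama_sample_top_p_top_k() in two ways: a temperature of zero or below now short-circuits into a plain argmax over the logits, and a non-positive top_k is now treated as "consider every logit". A brief sketch of driving both modes from caller code follows; it assumes the llama_sample_top_p_top_k() declaration exported by llama.h in this release (the exact parameter types may differ), and the helper names pick_greedy/pick_sampled are purely illustrative.

    #include "llama.h"
    #include <vector>

    // Deterministic, greedy decoding: temp <= 0 makes the call return the highest-logit token.
    llama_token pick_greedy(llama_context * ctx, const std::vector<llama_token> & last_n) {
        return llama_sample_top_p_top_k(ctx, last_n.data(), (int) last_n.size(),
                                        /*top_k=*/0, /*top_p=*/1.0f, /*temp=*/0.0f,
                                        /*repeat_penalty=*/1.0f);
    }

    // Stochastic decoding: top_k <= 0 now means "no top-k cutoff", so pass an explicit value here.
    llama_token pick_sampled(llama_context * ctx, const std::vector<llama_token> & last_n) {
        return llama_sample_top_p_top_k(ctx, last_n.data(), (int) last_n.size(),
                                        /*top_k=*/40, /*top_p=*/0.95f, /*temp=*/0.8f,
                                        /*repeat_penalty=*/1.1f);
    }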
@@ -1272,298 +1546,118 @@ static llama_vocab::id llama_sample_top_p_top_k(
1272
1546
  // quantization
1273
1547
  //
1274
1548
 
1275
- // TODO: reuse code from the llama_model_load() somehow
1276
- static bool llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, int itype) {
1277
- ggml_type type = GGML_TYPE_Q4_1;
1278
-
1279
- switch (itype) {
1280
- case 2: type = GGML_TYPE_Q4_0; break;
1281
- case 3: type = GGML_TYPE_Q4_1; break;
1282
- default: fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype); return 1;
1549
+ static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype) {
1550
+ ggml_type quantized_type;
1551
+ switch (ftype) {
1552
+ case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
1553
+ case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
1554
+ default: throw format("invalid output file type %d\n", ftype);
1283
1555
  };
1284
1556
 
1285
- if (type != GGML_TYPE_Q4_0 && type != GGML_TYPE_Q4_1) {
1286
- fprintf(stderr, "%s: invalid quantization type %d\n", __func__, type);
1287
- return false;
1288
- }
1289
-
1290
- llama_vocab vocab;
1291
-
1292
- printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
1293
-
1294
- auto finp = std::ifstream(fname_inp, std::ios::binary);
1295
- if (!finp) {
1296
- fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__, fname_inp.c_str());
1297
- return false;
1298
- }
1299
-
1300
- auto fout = std::ofstream(fname_out, std::ios::binary);
1301
- if (!fout) {
1302
- fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname_out.c_str());
1303
- return false;
1304
- }
1305
-
1306
- // verify magic
1307
- {
1308
- uint32_t magic;
1309
- finp.read((char *) &magic, sizeof(magic));
1310
- if (magic == LLAMA_FILE_MAGIC_UNVERSIONED) {
1311
- fprintf(stderr, "%s: invalid model file '%s' (too old, regenerate your model files!)\n",
1312
- __func__, fname_inp.c_str());
1313
- return false;
1314
- }
1315
- if (magic != LLAMA_FILE_MAGIC) {
1316
- return report_bad_magic(fname_inp.c_str(), magic, LLAMA_FILE_MAGIC);
1317
- }
1318
-
1319
- fout.write((char *) &magic, sizeof(magic));
1320
-
1321
- uint32_t format_version;
1322
- finp.read((char *) &format_version, sizeof(format_version));
1323
-
1324
- if (format_version != LLAMA_FILE_VERSION) {
1325
- fprintf(stderr, "%s: invalid model file '%s' (unsupported format version %" PRIu32 ", expected %d)\n",
1326
- __func__, fname_inp.c_str(), format_version, LLAMA_FILE_VERSION);
1327
- return false;
1328
- }
1329
-
1330
- fout.write((char *) &format_version, sizeof(format_version));
1331
- }
1332
-
1333
- llama_hparams hparams;
1334
-
1335
- // load hparams
1336
- {
1337
- finp.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
1338
- //finp.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
1339
- finp.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
1340
- finp.read((char *) &hparams.n_mult, sizeof(hparams.n_mult));
1341
- finp.read((char *) &hparams.n_head, sizeof(hparams.n_head));
1342
- finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
1343
- finp.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
1344
- finp.read((char *) &hparams.f16, sizeof(hparams.f16));
1345
-
1346
- printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
1347
- printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
1348
- printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
1349
- printf("%s: n_mult = %d\n", __func__, hparams.n_mult);
1350
- printf("%s: n_head = %d\n", __func__, hparams.n_head);
1351
- printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
1352
- printf("%s: f16 = %d\n", __func__, hparams.f16);
1353
-
1354
- fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
1355
- //fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
1356
- fout.write((char *) &hparams.n_embd, sizeof(hparams.n_embd));
1357
- fout.write((char *) &hparams.n_mult, sizeof(hparams.n_mult));
1358
- fout.write((char *) &hparams.n_head, sizeof(hparams.n_head));
1359
- fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
1360
- fout.write((char *) &hparams.n_rot, sizeof(hparams.n_rot));
1361
- fout.write((char *) &itype, sizeof(hparams.f16));
1362
- }
1363
-
1364
- // load vocab
1365
- {
1366
- const int32_t n_vocab = hparams.n_vocab;
1367
-
1368
- if (n_vocab != hparams.n_vocab) {
1369
- fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
1370
- __func__, fname_inp.c_str(), n_vocab, hparams.n_vocab);
1371
- return false;
1372
- }
1373
-
1374
- std::vector<char> word(32);
1375
- vocab.id_to_token.resize(n_vocab);
1376
- for (int i = 0; i < n_vocab; i++) {
1377
- uint32_t len;
1378
- finp.read ((char *) &len, sizeof(len));
1379
- fout.write((char *) &len, sizeof(len));
1380
-
1381
- word.resize(len);
1382
- finp.read ((char *) &word[0], len);
1383
- fout.write((char *) &word[0], len);
1384
-
1385
- float score;
1386
- finp.read ((char *) &score, sizeof(score));
1387
- fout.write((char *) &score, sizeof(score));
1388
-
1389
- vocab.token_to_id[word.data()] = i;
1390
-
1391
- auto &tok_score = vocab.id_to_token[i];
1392
- tok_score.tok = word.data();
1393
- tok_score.score = score;
1394
- }
1395
- }
1396
-
1397
- // load weights
1398
- {
1399
- size_t total_size_org = 0;
1400
- size_t total_size_new = 0;
1401
-
1402
- std::vector<float> work;
1403
-
1404
- std::vector<uint8_t> data_u8;
1405
- std::vector<ggml_fp16_t> data_f16;
1406
- std::vector<float> data_f32;
1407
-
1408
- std::vector<int64_t> hist_all(1 << 4, 0);
1409
-
1410
- while (true) {
1411
- int32_t n_dims;
1412
- int32_t length;
1413
- int32_t ftype;
1414
-
1415
- finp.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
1416
- finp.read(reinterpret_cast<char *>(&length), sizeof(length));
1417
- finp.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
1418
-
1419
- if (finp.eof()) {
1420
- break;
1421
- }
1422
-
1423
- int32_t nelements = 1;
1424
- int32_t ne[2] = { 1, 1 };
1425
- for (int i = 0; i < n_dims; ++i) {
1426
- finp.read (reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
1427
- nelements *= ne[i];
1428
- }
1429
-
1430
- std::string name(length, 0);
1431
- finp.read (&name[0], length);
1432
-
1433
- {
1434
- // ensure tensor data is aligned
1435
- uint64_t offset = finp.tellg();
1436
- offset = (offset + 31) & -32;
1437
- finp.seekg(offset);
1438
- }
1439
-
1440
- {
1441
- static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
1442
- printf("%48s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ftype_str[ftype]);
1443
- }
1444
-
1445
- // regexes of tensor names to be quantized
1446
- const std::vector<std::string> k_names = {
1447
- ".*weight",
1448
- };
1449
-
1450
- bool quantize = false;
1451
- for (const auto & s : k_names) {
1452
- if (std::regex_match(name, std::regex(s))) {
1453
- quantize = true;
1454
- break;
1455
- }
1456
- }
1457
-
1458
- // quantize only 2D tensors
1459
- quantize &= (n_dims == 2);
1460
-
1461
- if (quantize) {
1462
- if (ftype != 0 && ftype != 1) {
1463
- fprintf(stderr, "%s: unsupported ftype %d for integer quantization\n", __func__, ftype);
1464
- return false;
1465
- }
1466
-
1467
- if (ftype == 1) {
1468
- data_f16.resize(nelements);
1469
- finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
1470
- data_f32.resize(nelements);
1471
- for (int i = 0; i < nelements; ++i) {
1472
- data_f32[i] = ggml_fp16_to_fp32(data_f16[i]);
1473
- }
1474
- } else {
1475
- data_f32.resize(nelements);
1476
- finp.read(reinterpret_cast<char *>(data_f32.data()), nelements * sizeof(float));
1557
+ std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false,
1558
+ /*vocab_only*/ false));
1559
+ llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
1560
+
1561
+ size_t total_size_org = 0;
1562
+ size_t total_size_new = 0;
1563
+ std::vector<int64_t> hist_all(1 << 4, 0);
1564
+
1565
+ size_t idx = 0;
1566
+ for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) {
1567
+ llama_buffer read_data;
1568
+ read_data.resize(tensor.size);
1569
+ tensor.data = read_data.addr;
1570
+ model_loader->load_data_for(tensor);
1571
+
1572
+ printf("[%zu/%zu] %36s - %s, type = %6s, ",
1573
+ ++idx, model_loader->tensors_map.tensors.size(),
1574
+ tensor.name.c_str(), llama_format_tensor_shape(tensor.ne).c_str(),
1575
+ ggml_type_name(tensor.type));
1576
+
1577
+ // This used to be a regex, but <regex> has an extreme cost to compile times.
1578
+ bool quantize = tensor.name.rfind("weight") == tensor.name.size() - 6; // ends with 'weight'?
1579
+
1580
+ // quantize only 2D tensors
1581
+ quantize &= (tensor.ne.size() == 2);
1582
+
1583
+ enum ggml_type new_type;
1584
+ void * new_data;
1585
+ size_t new_size;
1586
+ llama_buffer work;
1587
+
1588
+ if (!quantize) {
1589
+ new_type = tensor.type;
1590
+ new_data = tensor.data;
1591
+ new_size = tensor.size;
1592
+ printf("size = %8.3f MB\n", tensor.size/1024.0/1024.0);
1593
+ } else {
1594
+ new_type = quantized_type;
1595
+ float * f32_data;
1596
+ size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
1597
+ llama_buffer f32_conv_buf;
1598
+ if (tensor.type == GGML_TYPE_F32) {
1599
+ f32_data = (float *) tensor.data;
1600
+ } else if (tensor.type == GGML_TYPE_F16) {
1601
+ f32_conv_buf.resize(nelements * sizeof(float));
1602
+ f32_data = (float *) f32_conv_buf.addr;
1603
+ auto f16_data = (const ggml_fp16_t *) tensor.data;
1604
+ for (size_t i = 0; i < nelements; i++) {
1605
+ f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
1477
1606
  }
1478
-
1479
- ftype = itype;
1480
1607
  } else {
1481
- const int bpe = (ftype == 0) ? sizeof(float) : sizeof(uint16_t);
1482
-
1483
- data_u8.resize(nelements*bpe);
1484
- finp.read(reinterpret_cast<char *>(data_u8.data()), nelements * bpe);
1608
+ throw format("type %s unsupported for integer quantization", ggml_type_name(tensor.type));
1485
1609
  }
1486
1610
 
1487
- fout.write(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
1488
- fout.write(reinterpret_cast<char *>(&length), sizeof(length));
1489
- fout.write(reinterpret_cast<char *>(&ftype), sizeof(ftype));
1490
- for (int i = 0; i < n_dims; ++i) {
1491
- fout.write(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
1611
+ printf("quantizing .. ");
1612
+ fflush(stdout);
1613
+
1614
+ work.resize(nelements * 4); // upper bound on size
1615
+ new_data = work.addr;
1616
+ std::vector<int64_t> hist_cur(1 << 4, 0);
1617
+
1618
+ switch (new_type) {
1619
+ case GGML_TYPE_Q4_0:
1620
+ {
1621
+ new_size = ggml_quantize_q4_0(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
1622
+ } break;
1623
+ case GGML_TYPE_Q4_1:
1624
+ {
1625
+ new_size = ggml_quantize_q4_1(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
1626
+ } break;
1627
+ default:
1628
+ LLAMA_ASSERT(false);
1492
1629
  }
1493
- fout.write(&name[0], length);
1494
1630
 
1495
- {
1496
- // ensure tensor data is aligned
1497
- uint64_t offset = fout.tellp();
1498
- offset = (offset + 31) & -32;
1499
- fout.seekp(offset);
1631
+ printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
1632
+ for (size_t i = 0; i < hist_cur.size(); i++) {
1633
+ hist_all[i] += hist_cur[i];
1500
1634
  }
1501
1635
 
1502
- if (quantize) {
1503
- printf("quantizing .. ");
1504
- work.resize(nelements); // for quantization
1505
-
1506
- size_t cur_size = 0;
1507
- std::vector<int64_t> hist_cur(1 << 4, 0);
1508
-
1509
- switch (type) {
1510
- case GGML_TYPE_Q4_0:
1511
- {
1512
- cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
1513
- } break;
1514
- case GGML_TYPE_Q4_1:
1515
- {
1516
- cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
1517
- } break;
1518
- default:
1519
- {
1520
- fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, type);
1521
- return false;
1522
- }
1523
- }
1524
-
1525
- fout.write(reinterpret_cast<char *>(work.data()), cur_size);
1526
- total_size_new += cur_size;
1527
-
1528
- printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
1529
- for (int i = 0; i < (int) hist_cur.size(); ++i) {
1530
- hist_all[i] += hist_cur[i];
1531
- }
1532
-
1533
- for (int i = 0; i < (int) hist_cur.size(); ++i) {
1534
- printf("%5.3f ", hist_cur[i] / float(nelements));
1535
- }
1536
- printf("\n");
1537
- } else {
1538
- printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0);
1539
- fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size());
1540
- total_size_new += data_u8.size();
1636
+ for (size_t i = 0; i < hist_cur.size(); i++) {
1637
+ printf("%5.3f ", hist_cur[i] / float(nelements));
1541
1638
  }
1542
-
1543
- total_size_org += nelements * sizeof(float);
1639
+ printf("\n");
1544
1640
  }
1641
+ total_size_org += tensor.size;
1642
+ total_size_new += new_size;
1643
+ file_saver.write_tensor(tensor, new_type, new_data, new_size);
1644
+ }
1545
1645
 
1546
- printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
1547
- printf("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
1646
+ printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
1647
+ printf("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
1548
1648
 
1549
- {
1550
- int64_t sum_all = 0;
1551
- for (int i = 0; i < (int) hist_all.size(); ++i) {
1552
- sum_all += hist_all[i];
1553
- }
1649
+ {
1650
+ int64_t sum_all = 0;
1651
+ for (size_t i = 0; i < hist_all.size(); i++) {
1652
+ sum_all += hist_all[i];
1653
+ }
1554
1654
 
1555
- printf("%s: hist: ", __func__);
1556
- for (int i = 0; i < (int) hist_all.size(); ++i) {
1557
- printf("%5.3f ", hist_all[i] / float(sum_all));
1558
- }
1559
- printf("\n");
1655
+ printf("%s: hist: ", __func__);
1656
+ for (size_t i = 0; i < hist_all.size(); i++) {
1657
+ printf("%5.3f ", hist_all[i] / float(sum_all));
1560
1658
  }
1659
+ printf("\n");
1561
1660
  }
1562
-
1563
- finp.close();
1564
- fout.close();
1565
-
1566
- return true;
1567
1661
  }
1568
1662
 
1569
1663
  //
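The rewritten quantization loop above decides per tensor whether to quantize: the tensor name must end in "weight" (a plain suffix check replacing the earlier std::regex match, as the in-code comment notes) and the tensor must be two-dimensional; F16 data is widened to F32 before ggml_quantize_q4_0 / ggml_quantize_q4_1 run, with the output buffer sized at 4 bytes per element as an upper bound. A minimal standalone sketch of that filter, using a hypothetical TensorInfo struct purely for illustration:

#include <string>

// Hypothetical stand-in for the loader's tensor metadata, for illustration only.
struct TensorInfo {
    std::string name;
    int n_dims;
};

static bool should_quantize(const TensorInfo & t) {
    // Suffix check instead of std::regex_match(name, std::regex(".*weight")),
    // which was dropped because <regex> is expensive to compile.
    const std::string suffix = "weight";
    const bool ends_with_weight =
        t.name.size() >= suffix.size() &&
        t.name.compare(t.name.size() - suffix.size(), suffix.size(), suffix) == 0;
    // Only 2-D tensors (the large weight matrices) are quantized; 1-D tensors
    // such as the norm weights keep their original type.
    return ends_with_weight && t.n_dims == 2;
}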
@@ -1581,34 +1675,38 @@ struct llama_context * llama_init_from_file(
1581
1675
  params.seed = time(NULL);
1582
1676
  }
1583
1677
 
1678
+ unsigned cur_percentage = 0;
1679
+ if (params.progress_callback == NULL) {
1680
+ params.progress_callback_user_data = &cur_percentage;
1681
+ params.progress_callback = [](float progress, void * ctx) {
1682
+ unsigned * cur_percentage_p = (unsigned *) ctx;
1683
+ unsigned percentage = (unsigned) (100 * progress);
1684
+ while (percentage > *cur_percentage_p) {
1685
+ ++*cur_percentage_p;
1686
+ fprintf(stderr, ".");
1687
+ fflush(stderr);
1688
+ if (percentage >= 100) {
1689
+ fprintf(stderr, "\n");
1690
+ }
1691
+ }
1692
+ };
1693
+ }
1694
+
1584
1695
  ctx->rng = std::mt19937(params.seed);
1585
1696
  ctx->logits_all = params.logits_all;
1586
1697
 
1587
1698
  ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
1588
1699
 
1589
- if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_parts, memory_type,
1590
- params.vocab_only, params.progress_callback,
1591
- params.progress_callback_user_data)) {
1700
+ if (!llama_model_load(path_model, *ctx, params.n_ctx, memory_type,
1701
+ params.use_mmap, params.use_mlock, params.vocab_only,
1702
+ params.progress_callback, params.progress_callback_user_data)) {
1592
1703
  fprintf(stderr, "%s: failed to load model\n", __func__);
1593
1704
  llama_free(ctx);
1594
1705
  return nullptr;
1595
1706
  }
1596
1707
 
1597
- if (params.use_mlock) {
1598
- char *err;
1599
- if (!ggml_mlock(ctx->model.ctx,
1600
- ctx->model.mm_addr,
1601
- ctx->model.mm_length,
1602
- &err)) {
1603
- fprintf(stderr, "%s\n", err);
1604
- free(err);
1605
- llama_free(ctx);
1606
- return nullptr;
1607
- }
1608
- }
1609
-
1610
1708
  // reserve memory for context buffers
1611
- {
1709
+ if (!params.vocab_only) {
1612
1710
  if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx)) {
1613
1711
  fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
1614
1712
  llama_free(ctx);
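The hunk above also installs a default load-progress callback when the caller leaves params.progress_callback unset: it prints one dot per percentage point to stderr and a newline once 100% is reached. A caller can supply its own callback instead; a minimal sketch, assuming the llama_context_default_params() helper from llama.h and an example model path:

#include <cstdio>
#include "llama.h"

int main() {
    llama_context_params params = llama_context_default_params();
    // A capture-less lambda converts to the plain function pointer the field expects.
    params.progress_callback = [](float progress, void * /*user_data*/) {
        fprintf(stderr, "\rloading: %3.0f%%", 100.0f * progress);
    };
    params.progress_callback_user_data = nullptr;

    // Example path only; substitute a real converted model file.
    llama_context * ctx = llama_init_from_file("models/7B/ggml-model-q4_0.bin", params);
    if (ctx == nullptr) {
        return 1;
    }
    llama_free(ctx);
    return 0;
}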
@@ -1643,29 +1741,47 @@ struct llama_context * llama_init_from_file(
1643
1741
  }
1644
1742
 
1645
1743
  void llama_free(struct llama_context * ctx) {
1646
- kv_cache_free(ctx->model.kv_self);
1647
-
1648
- if (ctx->model.ctx) {
1649
- ggml_free(ctx->model.ctx);
1650
- }
1651
-
1652
- if (ctx->model.mm_addr) {
1653
- munmap_file(ctx->model.mm_addr, ctx->model.mm_length);
1654
- }
1655
-
1656
1744
  delete ctx;
1657
1745
  }
1658
1746
 
1659
1747
  int llama_model_quantize(
1660
1748
  const char * fname_inp,
1661
1749
  const char * fname_out,
1662
- int itype) {
1663
- if (!llama_model_quantize_internal(fname_inp, fname_out, itype)) {
1664
- fprintf(stderr, "%s: failed to quantize\n", __func__);
1750
+ enum llama_ftype ftype) {
1751
+ try {
1752
+ llama_model_quantize_internal(fname_inp, fname_out, ftype);
1753
+ return 0;
1754
+ } catch (const std::string & err) {
1755
+ fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str());
1665
1756
  return 1;
1666
1757
  }
1758
+ }
1667
1759
 
1668
- return 0;
1760
+ // Returns the KV cache that will contain the context for the
1761
+ // ongoing prediction with the model.
1762
+ const uint8_t * llama_get_kv_cache(struct llama_context * ctx) {
1763
+ return ctx->model.kv_self.buf.addr;
1764
+ }
1765
+
1766
+ // Returns the size of the KV cache
1767
+ size_t llama_get_kv_cache_size(struct llama_context * ctx) {
1768
+ return ctx->model.kv_self.buf.size;
1769
+ }
1770
+
1771
+ int llama_get_kv_cache_token_count(struct llama_context * ctx) {
1772
+ return ctx->model.kv_self.n;
1773
+ }
1774
+
1775
+ // Sets the KV cache containing the current context for the model
1776
+ void llama_set_kv_cache(
1777
+ struct llama_context * ctx,
1778
+ const uint8_t * kv_cache,
1779
+ size_t n_size,
1780
+ int n_token_count) {
1781
+ // Make sure we have the same kv cache setup
1782
+ LLAMA_ASSERT(ctx->model.kv_self.buf.size == n_size);
1783
+ memcpy(ctx->model.kv_self.buf.addr, kv_cache, n_size);
1784
+ ctx->model.kv_self.n = n_token_count;
1669
1785
  }
1670
1786
 
1671
1787
  int llama_eval(
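The hunk above adds raw accessors for the self-attention KV cache alongside the enum-typed quantization entry point. A minimal sketch of snapshotting and restoring the cache between generations; `ctx` is assumed to come from llama_init_from_file(), and the kv_snapshot struct and helper names are illustrative only:

#include <cstdint>
#include <vector>
#include "llama.h"

struct kv_snapshot {
    std::vector<uint8_t> data;
    int n_tokens;
};

static kv_snapshot save_kv(struct llama_context * ctx) {
    kv_snapshot snap;
    const uint8_t * src = llama_get_kv_cache(ctx);
    snap.data.assign(src, src + llama_get_kv_cache_size(ctx));
    snap.n_tokens = llama_get_kv_cache_token_count(ctx);
    return snap;
}

static void restore_kv(struct llama_context * ctx, const kv_snapshot & snap) {
    // llama_set_kv_cache asserts that the size matches the context's own buffer,
    // so the snapshot must come from a context with identical parameters.
    llama_set_kv_cache(ctx, snap.data.data(), snap.data.size(), snap.n_tokens);
}

For the quantization side, llama_model_quantize now takes the llama_ftype enum directly and returns non-zero on failure, e.g. llama_model_quantize("ggml-model-f16.bin", "ggml-model-q4_0.bin", LLAMA_FTYPE_MOSTLY_Q4_0) with hypothetical file names.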
@@ -1775,9 +1891,9 @@ llama_token llama_sample_top_p_top_k(
1775
1891
  void llama_print_timings(struct llama_context * ctx) {
1776
1892
  const int64_t t_end_us = ggml_time_us();
1777
1893
 
1778
- const int32_t n_sample = Max(1, ctx->n_sample);
1779
- const int32_t n_eval = Max(1, ctx->n_eval);
1780
- const int32_t n_p_eval = Max(1, ctx->n_p_eval);
1894
+ const int32_t n_sample = std::max(1, ctx->n_sample);
1895
+ const int32_t n_eval = std::max(1, ctx->n_eval);
1896
+ const int32_t n_p_eval = std::max(1, ctx->n_p_eval);
1781
1897
 
1782
1898
  fprintf(stderr, "\n");
1783
1899
  fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
@@ -1813,3 +1929,8 @@ const char * llama_print_system_info(void) {
1813
1929
 
1814
1930
  return s.c_str();
1815
1931
  }
1932
+
1933
+ // For internal test use
1934
+ std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
1935
+ return ctx->model.tensors_by_name;
1936
+ }
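The test-only accessor above exposes the model's tensors_by_name list. A minimal sketch of how a test might walk it, assuming ggml_nbytes() and ggml_type_name() from ggml.h (the latter is already used in the quantization loop above) and that llama_internal_get_tensor_map() is visible to the test translation unit:

#include <cstdio>
#include <string>
#include "ggml.h"
#include "llama.h"

// `ctx` is assumed to come from llama_init_from_file(); prints one line per tensor.
static void dump_tensor_map(struct llama_context * ctx) {
    for (const auto & kv : llama_internal_get_tensor_map(ctx)) {
        const std::string & name = kv.first;
        const struct ggml_tensor * t = kv.second;
        printf("%-36s type = %6s, size = %8.3f MB\n",
               name.c_str(), ggml_type_name(t->type), ggml_nbytes(t) / 1024.0 / 1024.0);
    }
}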