llama_cpp 0.0.3 → 0.0.5

This diff shows the changes between publicly released versions of the package as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registries.
@@ -1,49 +1,33 @@
1
+ // Defines fileno on msys:
2
+ #ifndef _GNU_SOURCE
3
+ #define _GNU_SOURCE
4
+ #include <cstdint>
5
+ #include <cstdio>
6
+ #endif
7
+
8
+ #include "llama_util.h"
1
9
  #include "llama.h"
2
10
 
3
11
  #include "ggml.h"
4
12
 
13
+ #include <array>
14
+ #include <ctime>
5
15
  #include <cinttypes>
6
16
  #include <fstream>
7
17
  #include <random>
8
18
  #include <map>
9
19
  #include <unordered_map>
10
20
  #include <queue>
11
- #include <regex>
12
21
  #include <cassert>
13
22
  #include <cstring>
14
-
15
- #if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
16
- #define WIN32_LEAN_AND_MEAN
17
- #include <Windows.h>
18
- #else
19
- #include <sys/types.h>
20
- #include <sys/mman.h>
21
- #include <unistd.h>
22
- #include <fcntl.h>
23
- #endif
24
-
25
- #define Min(X, Y) ((Y) > (X) ? (X) : (Y))
26
- #define Max(X, Y) ((Y) < (X) ? (X) : (Y))
23
+ #include <climits>
24
+ #include <memory>
25
+ #include <algorithm>
26
+ #include <initializer_list>
27
27
 
28
28
  #define LLAMA_USE_SCRATCH
29
29
  #define LLAMA_MAX_SCRATCH_BUFFERS 16
30
30
 
31
- #define LLAMA_ASSERT(x) \
32
- do { \
33
- if (!(x)) { \
34
- fprintf(stderr, "LLAMA_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
35
- abort(); \
36
- } \
37
- } while (0)
38
-
39
-
40
- // determine number of model parts based on the dimension
41
- static const std::unordered_map<int, int> LLAMA_N_PARTS = {
42
- { 4096, 1 },
43
- { 5120, 2 },
44
- { 6656, 4 },
45
- { 8192, 8 },
46
- };
47
31
 
48
32
  // available llama models
49
33
  enum e_model {
@@ -60,47 +44,67 @@ static const size_t MB = 1024*1024;
60
44
  // TODO: dynamically determine these sizes
61
45
  // needs modifications in ggml
62
46
 
63
- static const std::map<e_model, size_t> MEM_REQ_SCRATCH0 = {
64
- { MODEL_7B, 512ull*MB },
65
- { MODEL_13B, 512ull*MB },
66
- { MODEL_30B, 512ull*MB },
67
- { MODEL_65B, 512ull*MB },
68
- };
47
+ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
48
+ {
49
+ static std::map<e_model, size_t> _MEM_REQ_SCRATCH0 = {
50
+ { MODEL_7B, 512ull * MB },
51
+ { MODEL_13B, 512ull * MB },
52
+ { MODEL_30B, 512ull * MB },
53
+ { MODEL_65B, 512ull * MB },
54
+ };
55
+ return _MEM_REQ_SCRATCH0;
56
+ }
69
57
 
70
- static const std::map<e_model, size_t> MEM_REQ_SCRATCH1 = {
71
- { MODEL_7B, 512ull*MB },
72
- { MODEL_13B, 512ull*MB },
73
- { MODEL_30B, 512ull*MB },
74
- { MODEL_65B, 512ull*MB },
58
+ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
59
+ {
60
+ static std::map<e_model, size_t> _MEM_REQ_SCRATCH1 = {
61
+ { MODEL_7B, 512ull * MB },
62
+ { MODEL_13B, 512ull * MB },
63
+ { MODEL_30B, 512ull * MB },
64
+ { MODEL_65B, 512ull * MB },
65
+ };
66
+ return _MEM_REQ_SCRATCH1;
75
67
  };
76
68
 
77
69
  // 2*n_embd*n_ctx*n_layer*sizeof(float16)
78
- static const std::map<e_model, size_t> MEM_REQ_KV_SELF = {
79
- { MODEL_7B, 1026ull*MB },
80
- { MODEL_13B, 1608ull*MB },
81
- { MODEL_30B, 3124ull*MB },
82
- { MODEL_65B, 5120ull*MB },
70
+ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
71
+ {
72
+ static std::map<e_model, size_t> _MEM_REQ_KV_SELF = {
73
+ { MODEL_7B, 1026ull * MB },
74
+ { MODEL_13B, 1608ull * MB },
75
+ { MODEL_30B, 3124ull * MB },
76
+ { MODEL_65B, 5120ull * MB },
77
+ };
78
+ return _MEM_REQ_KV_SELF;
83
79
  };
84
80
 
85
81
  // this is mostly needed for temporary mul_mat buffers to dequantize the data
86
82
  // not actually needed if BLAS is disabled
87
- static const std::map<e_model, size_t> MEM_REQ_EVAL = {
88
- { MODEL_7B, 768ull*MB },
89
- { MODEL_13B, 1024ull*MB },
90
- { MODEL_30B, 1280ull*MB },
91
- { MODEL_65B, 1536ull*MB },
83
+ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
84
+ {
85
+ static std::map<e_model, size_t> _MEM_REQ_EVAL = {
86
+ { MODEL_7B, 768ull * MB },
87
+ { MODEL_13B, 1024ull * MB },
88
+ { MODEL_30B, 1280ull * MB },
89
+ { MODEL_65B, 1536ull * MB },
90
+ };
91
+ return _MEM_REQ_EVAL;
92
92
  };
93
93
 
94
94
  // default hparams (LLaMA 7B)
95
95
  struct llama_hparams {
96
- int32_t n_vocab = 32000;
97
- int32_t n_ctx = 512; // this is provided as user input?
98
- int32_t n_embd = 4096;
99
- int32_t n_mult = 256;
100
- int32_t n_head = 32;
101
- int32_t n_layer = 32;
102
- int32_t n_rot = 64;
103
- int32_t f16 = 1;
96
+ uint32_t n_vocab = 32000;
97
+ uint32_t n_ctx = 512; // this is provided as user input?
98
+ uint32_t n_embd = 4096;
99
+ uint32_t n_mult = 256;
100
+ uint32_t n_head = 32;
101
+ uint32_t n_layer = 32;
102
+ uint32_t n_rot = 64;
103
+ enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
104
+
105
+ bool operator!=(const llama_hparams & other) const {
106
+ return memcmp(this, &other, sizeof(llama_hparams));
107
+ }
104
108
  };
105
109
 
106
110
  struct llama_layer {
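
The hunk above replaces the file-scope MEM_REQ_* map constants with accessor functions that build a function-local static on first use, so the maps are constructed the first time they are queried rather than during static initialization. A minimal sketch of the same construct-on-first-use pattern (the names here are illustrative, not taken from llama.cpp):

    #include <cstddef>
    #include <map>

    enum sketch_model { SKETCH_7B, SKETCH_13B };

    // The map is built the first time the accessor runs, not at program startup.
    static const std::map<sketch_model, size_t> & sketch_scratch_req() {
        static const std::map<sketch_model, size_t> req = {
            { SKETCH_7B,  512u * 1024u * 1024u },
            { SKETCH_13B, 512u * 1024u * 1024u },
        };
        return req;
    }

    // usage: const size_t bytes = sketch_scratch_req().at(SKETCH_7B);
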
@@ -126,11 +130,17 @@ struct llama_kv_cache {
126
130
  struct ggml_tensor * k;
127
131
  struct ggml_tensor * v;
128
132
 
129
- struct ggml_context * ctx;
133
+ struct ggml_context * ctx = NULL;
130
134
 
131
- std::vector<uint8_t> buf;
135
+ llama_buffer buf;
132
136
 
133
137
  int n; // number of tokens currently in the cache
138
+
139
+ ~llama_kv_cache() {
140
+ if (ctx) {
141
+ ggml_free(ctx);
142
+ }
143
+ }
134
144
  };
135
145
 
136
146
  struct llama_model {
@@ -146,22 +156,30 @@ struct llama_model {
146
156
  std::vector<llama_layer> layers;
147
157
 
148
158
  // context
149
- struct ggml_context * ctx;
159
+ struct ggml_context * ctx = NULL;
150
160
 
151
161
  // key + value cache for the self attention
152
162
  // TODO: move to llama_state
153
163
  struct llama_kv_cache kv_self;
154
164
 
155
165
  // the model memory buffer
156
- std::vector<uint8_t> buf;
166
+ llama_buffer buf;
157
167
 
158
168
  // model memory mapped file
159
- void * mm_addr = NULL;
160
- uint64_t mm_length = 0;
169
+ std::unique_ptr<llama_mmap> mapping;
170
+
171
+ // objects representing data potentially being locked in memory
172
+ llama_mlock mlock_buf;
173
+ llama_mlock mlock_mmap;
174
+
175
+ // for quantize-stats only
176
+ std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
161
177
 
162
- // tensors
163
- int n_loaded;
164
- std::unordered_map<std::string, struct ggml_tensor *> tensors;
178
+ ~llama_model() {
179
+ if (ctx) {
180
+ ggml_free(ctx);
181
+ }
182
+ }
165
183
  };
166
184
 
167
185
  struct llama_vocab {
@@ -206,8 +224,8 @@ struct llama_context {
206
224
 
207
225
  // memory buffers used to evaluate the model
208
226
  // TODO: move in llama_state
209
- std::vector<uint8_t> buf_compute;
210
- std::vector<uint8_t> buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
227
+ llama_buffer buf_compute;
228
+ llama_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
211
229
 
212
230
  int buf_last = 0;
213
231
  size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
@@ -220,11 +238,11 @@ struct llama_context {
220
238
  last_size = ggml_set_scratch(ctx, { 0, 0, nullptr, });
221
239
  } else {
222
240
  auto & buf = buf_scratch[i];
223
- last_size = ggml_set_scratch(ctx, { 0, buf.size(), buf.data(), });
241
+ last_size = ggml_set_scratch(ctx, { 0, buf.size, buf.addr, });
224
242
  }
225
243
 
226
244
  if (buf_last >= 0) {
227
- buf_max_size[buf_last] = Max(buf_max_size[buf_last], last_size);
245
+ buf_max_size[buf_last] = std::max(buf_max_size[buf_last], last_size);
228
246
  }
229
247
 
230
248
  buf_last = i;
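
The compute and scratch buffers above switch from std::vector<uint8_t> (accessed via .size()/.data()) to llama_buffer (accessed via .size/.addr). llama_buffer is not defined in this file; it comes from the newly included llama_util.h, which is not part of this diff. The sketch below is only a guess at the smallest type that would satisfy the addr/size/resize uses shown here; a production version would also have to manage copies and moves:

    #include <cstddef>
    #include <cstdint>

    // Minimal owning byte buffer exposing the members used above.
    struct sketch_buffer {
        uint8_t * addr = nullptr;
        size_t    size = 0;

        void resize(size_t new_size) {
            delete[] addr;                // drop any previous allocation
            addr = new uint8_t[new_size];
            size = new_size;
        }

        ~sketch_buffer() {
            delete[] addr;
        }
    };
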
@@ -244,6 +262,500 @@ struct llama_context {
244
262
  }
245
263
  };
246
264
 
265
+ template <typename T>
266
+ static T checked_mul(T a, T b) {
267
+ T ret = a * b;
268
+ if (a != 0 && ret / a != b) {
269
+ throw format("overflow multiplying %llu * %llu",
270
+ (unsigned long long) a, (unsigned long long) b);
271
+ }
272
+ return ret;
273
+ }
274
+
275
+ static size_t checked_div(size_t a, size_t b) {
276
+ if (b == 0 || a % b != 0) {
277
+ throw format("error dividing %zu / %zu", a, b);
278
+ }
279
+ return a / b;
280
+ }
281
+
282
+ static std::string llama_format_tensor_shape(const std::vector<uint32_t> & ne) {
283
+ char buf[256];
284
+ snprintf(buf, sizeof(buf), "%5u", ne.at(0));
285
+ for (size_t i = 1; i < ne.size(); i++) {
286
+ snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), " x %5u", ne.at(i));
287
+ }
288
+ return buf;
289
+ }
290
+
291
+ static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml_type type) {
292
+ size_t size = ggml_type_size(type);
293
+ for (uint32_t dim : ne) {
294
+ size = checked_mul<size_t>(size, dim);
295
+ }
296
+ return size / ggml_blck_size(type);
297
+ }
298
+
299
+ struct llama_load_tensor_shard {
300
+ std::vector<uint32_t> ne;
301
+ size_t size;
302
+ enum ggml_type type;
303
+ size_t file_idx;
304
+ size_t file_off;
305
+
306
+ void calc_size() {
307
+ size = llama_calc_tensor_size(ne, type);
308
+ }
309
+ };
310
+
311
+ enum llama_split_type {
312
+ SPLIT_NONE,
313
+ SPLIT_BY_COLUMNS,
314
+ SPLIT_BY_ROWS
315
+ };
316
+
317
+ struct llama_load_tensor {
318
+ std::vector<llama_load_tensor_shard> shards;
319
+
320
+ std::string name;
321
+ enum ggml_type type = GGML_TYPE_F32;
322
+ llama_split_type split_type = SPLIT_NONE;
323
+ std::vector<uint32_t> ne;
324
+ size_t size;
325
+ struct ggml_tensor * ggml_tensor = NULL;
326
+ uint8_t * data;
327
+
328
+ llama_load_tensor(const std::string & name) : name(name) {}
329
+
330
+ void calc_all() {
331
+ calc_type();
332
+ calc_split_type();
333
+ calc_ne();
334
+ calc_size();
335
+ }
336
+
337
+ void calc_type() {
338
+ const auto & first_shard = shards.at(0);
339
+ for (const auto & shard : shards) {
340
+ if (shard.type != first_shard.type) {
341
+ throw format("inconsistent tensor shard type in '%s'", name.c_str());
342
+ }
343
+ }
344
+ type = first_shard.type;
345
+ }
346
+
347
+ void calc_split_type() {
348
+ if (shards.at(0).ne.size() == 1 || // 1D tensors are just duplicated in every file
349
+ shards.size() == 1) { // only one file?
350
+ split_type = SPLIT_NONE;
351
+ } else if (name.find("tok_embeddings.") == 0 ||
352
+ name.find(".attention.wo.weight") != std::string::npos ||
353
+ name.find(".feed_forward.w2.weight") != std::string::npos) {
354
+ split_type = SPLIT_BY_COLUMNS;
355
+ } else {
356
+ split_type = SPLIT_BY_ROWS;
357
+ }
358
+ }
359
+
360
+ void calc_ne() {
361
+ const auto & first_shard = shards.at(0);
362
+ for (const auto & shard : shards) {
363
+ if (shard.ne != first_shard.ne) {
364
+ throw format("inconsistent tensor shard shape in '%s': first was %s, other was %s",
365
+ name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str());
366
+ }
367
+ }
368
+ ne = first_shard.ne;
369
+ LLAMA_ASSERT(shards.size() <= UINT32_MAX);
370
+ uint32_t n_shards = (uint32_t) shards.size();
371
+ switch (split_type) {
372
+ case SPLIT_NONE:
373
+ ne = first_shard.ne;
374
+ break;
375
+ case SPLIT_BY_COLUMNS:
376
+ ne = {checked_mul<uint32_t>(first_shard.ne[0], n_shards),
377
+ first_shard.ne[1]};
378
+ break;
379
+ case SPLIT_BY_ROWS:
380
+ ne = {first_shard.ne[0],
381
+ checked_mul<uint32_t>(first_shard.ne[1], n_shards)};
382
+ break;
383
+ }
384
+ }
385
+
386
+ void calc_size() {
387
+ size = llama_calc_tensor_size(ne, type);
388
+ }
389
+ };
390
+
391
+ struct llama_load_tensors_map {
392
+ // tensors is kept in a separate vector to preserve file order
393
+ std::vector<llama_load_tensor> tensors;
394
+ std::unordered_map<std::string, size_t> name_to_idx;
395
+ };
396
+
397
+ enum llama_file_version {
398
+ LLAMA_FILE_VERSION_GGML,
399
+ LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab
400
+ LLAMA_FILE_VERSION_GGJT_V1, // added padding
401
+ };
402
+
403
+ struct llama_file_loader {
404
+ llama_file file;
405
+ llama_file_version file_version;
406
+ llama_hparams hparams;
407
+ llama_vocab vocab;
408
+
409
+ llama_file_loader(const char * fname, size_t file_idx, llama_load_tensors_map & tensors_map)
410
+ : file(fname, "rb") {
411
+ fprintf(stderr, "llama.cpp: loading model from %s\n", fname);
412
+ read_magic();
413
+ read_hparams();
414
+ read_vocab();
415
+ read_tensor_metadata(file_idx, tensors_map);
416
+ }
417
+ void read_magic() {
418
+ uint32_t magic = file.read_u32();
419
+ uint32_t version = 0;
420
+
421
+ if (magic != 'ggml') {
422
+ version = file.read_u32();
423
+ }
424
+
425
+ if (magic == 'ggml' && version == 0) {
426
+ file_version = LLAMA_FILE_VERSION_GGML;
427
+ } else if (magic == 'ggmf' && version == 1) {
428
+ file_version = LLAMA_FILE_VERSION_GGMF_V1;
429
+ } else if (magic == 'ggjt' && version == 1) {
430
+ file_version = LLAMA_FILE_VERSION_GGJT_V1;
431
+ } else {
432
+ throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
433
+ magic, version);
434
+ }
435
+ }
436
+ void read_hparams() {
437
+ hparams.n_vocab = file.read_u32();
438
+ hparams.n_embd = file.read_u32();
439
+ hparams.n_mult = file.read_u32();
440
+ hparams.n_head = file.read_u32();
441
+ hparams.n_layer = file.read_u32();
442
+ hparams.n_rot = file.read_u32();
443
+ hparams.ftype = (enum llama_ftype) file.read_u32();
444
+ }
445
+ void read_vocab() {
446
+ vocab.id_to_token.resize(hparams.n_vocab);
447
+
448
+ for (uint32_t i = 0; i < hparams.n_vocab; i++) {
449
+ uint32_t len = file.read_u32();
450
+ std::string word = file.read_string(len);
451
+
452
+ float score = 0.0f;
453
+ if (file_version >= LLAMA_FILE_VERSION_GGMF_V1) {
454
+ file.read_raw(&score, sizeof(score));
455
+ }
456
+
457
+ vocab.token_to_id[word] = i;
458
+
459
+ auto & tok_score = vocab.id_to_token[i];
460
+ tok_score.tok = std::move(word);
461
+ tok_score.score = score;
462
+ }
463
+ }
464
+ void read_tensor_metadata(size_t file_idx, llama_load_tensors_map & tensors_map) {
465
+ while (file.tell() < file.size) {
466
+ llama_load_tensor_shard shard;
467
+ uint32_t n_dims = file.read_u32();
468
+ uint32_t name_len = file.read_u32();
469
+ shard.type = (enum ggml_type) file.read_u32();
470
+ shard.ne.resize(n_dims);
471
+ file.read_raw(shard.ne.data(), sizeof(shard.ne[0]) * n_dims);
472
+ std::string name = file.read_string(name_len);
473
+ if (n_dims < 1 || n_dims > 2) {
474
+ throw format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims);
475
+ }
476
+ switch (shard.type) {
477
+ case GGML_TYPE_F32:
478
+ case GGML_TYPE_F16:
479
+ case GGML_TYPE_Q4_0:
480
+ case GGML_TYPE_Q4_1:
481
+ break;
482
+ default: {
483
+ throw format("unrecognized tensor type %u\n", shard.type);
484
+ }
485
+ }
486
+
487
+ if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
488
+ // skip to the next multiple of 32 bytes
489
+ file.seek(-file.tell() & 31, SEEK_CUR);
490
+ }
491
+ shard.file_idx = file_idx;
492
+ shard.file_off = file.tell();
493
+
494
+ shard.calc_size();
495
+ file.seek(shard.size, SEEK_CUR);
496
+
497
+ auto it = tensors_map.name_to_idx.find(name);
498
+ size_t idx;
499
+ if (it != tensors_map.name_to_idx.end()) {
500
+ idx = it->second;
501
+ } else {
502
+ tensors_map.tensors.emplace_back(name);
503
+ idx = tensors_map.tensors.size() - 1;
504
+ tensors_map.name_to_idx.emplace(name, idx);
505
+ }
506
+ tensors_map.tensors.at(idx).shards.push_back(shard);
507
+ }
508
+ }
509
+ };
510
+
511
+ struct llama_file_saver {
512
+ llama_file file;
513
+ llama_file_loader * any_file_loader;
514
+ llama_file_saver(const char * fname, llama_file_loader * any_file_loader, enum llama_ftype new_ftype)
515
+ : file(fname, "wb"), any_file_loader(any_file_loader) {
516
+ fprintf(stderr, "llama.cpp: saving model to %s\n", fname);
517
+ write_magic();
518
+ write_hparams(new_ftype);
519
+ write_vocab();
520
+ }
521
+ void write_magic() {
522
+ file.write_u32('ggjt'); // magic
523
+ file.write_u32(1); // version
524
+ }
525
+ void write_hparams(enum llama_ftype new_ftype) {
526
+ const llama_hparams & hparams = any_file_loader->hparams;
527
+ file.write_u32(hparams.n_vocab);
528
+ file.write_u32(hparams.n_embd);
529
+ file.write_u32(hparams.n_mult);
530
+ file.write_u32(hparams.n_head);
531
+ file.write_u32(hparams.n_layer);
532
+ file.write_u32(hparams.n_rot);
533
+ file.write_u32(new_ftype);
534
+ }
535
+ void write_vocab() {
536
+ if (any_file_loader->file_version == LLAMA_FILE_VERSION_GGML) {
537
+ fprintf(stderr, "llama.cpp: WARNING: input is an old file that doesn't have scores; will add dummy scores\n");
538
+ }
539
+ uint32_t n_vocab = any_file_loader->hparams.n_vocab;
540
+ for (uint32_t i = 0; i < n_vocab; i++) {
541
+ const auto & token_score = any_file_loader->vocab.id_to_token.at(i);
542
+ file.write_u32((uint32_t) token_score.tok.size());
543
+ file.write_raw(token_score.tok.data(), token_score.tok.size());
544
+ file.write_raw(&token_score.score, sizeof(token_score.score));
545
+ }
546
+ }
547
+ void write_tensor(llama_load_tensor & tensor, enum ggml_type new_type, const void * new_data, size_t new_size) {
548
+ switch (new_type) {
549
+ case GGML_TYPE_F32:
550
+ case GGML_TYPE_F16:
551
+ case GGML_TYPE_Q4_0:
552
+ case GGML_TYPE_Q4_1:
553
+ break;
554
+ default: LLAMA_ASSERT(false);
555
+ }
556
+ file.write_u32((uint32_t) tensor.ne.size());
557
+ file.write_u32((uint32_t) tensor.name.size());
558
+ file.write_u32(new_type);
559
+ file.write_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * tensor.ne.size());
560
+ file.write_raw(tensor.name.data(), tensor.name.size());
561
+ file.seek(-file.tell() & 31, SEEK_CUR);
562
+ LLAMA_ASSERT(new_size == llama_calc_tensor_size(tensor.ne, new_type));
563
+ file.write_raw(new_data, new_size);
564
+ }
565
+ };
566
+
567
+ struct llama_model_loader {
568
+ std::vector<std::unique_ptr<llama_file_loader>> file_loaders;
569
+ llama_load_tensors_map tensors_map;
570
+ bool use_mmap;
571
+ size_t num_ggml_tensors_created = 0;
572
+ struct ggml_context * ggml_ctx = NULL;
573
+ std::unique_ptr<llama_mmap> mapping;
574
+
575
+ llama_model_loader(const std::string & fname_base, bool use_mmap, bool vocab_only) {
576
+ auto first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
577
+ file_loaders.emplace_back(first_file);
578
+ uint32_t n_parts = vocab_only ? 1 : guess_n_parts();
579
+ for (uint32_t i = 1; i < n_parts; i++) {
580
+ std::string fname = fname_base + "." + std::to_string(i);
581
+ auto ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
582
+ file_loaders.emplace_back(ith_file);
583
+ if (ith_file->hparams != first_file->hparams) {
584
+ throw format("llama.cpp: hparams inconsistent between files");
585
+ }
586
+ }
587
+ if (!llama_mmap::SUPPORTED) {
588
+ use_mmap = false;
589
+ }
590
+ if (use_mmap && alignment_prevents_mmap()) {
591
+ fprintf(stderr, "llama.cpp: can't use mmap because tensors are not aligned; convert to new format to avoid this\n");
592
+ use_mmap = false;
593
+ }
594
+ this->use_mmap = use_mmap;
595
+ for (llama_load_tensor & lt : tensors_map.tensors) {
596
+ lt.calc_all();
597
+ }
598
+ }
599
+
600
+ bool alignment_prevents_mmap() {
601
+ for (const llama_load_tensor & lt : tensors_map.tensors) {
602
+ for (const llama_load_tensor_shard & shard : lt.shards) {
603
+ if (shard.file_off & 3) {
604
+ return true;
605
+ }
606
+ }
607
+ }
608
+ return false;
609
+ }
610
+
611
+ uint32_t guess_n_parts() const {
612
+ auto it = tensors_map.name_to_idx.find("tok_embeddings.weight");
613
+ if (it == tensors_map.name_to_idx.end()) {
614
+ throw std::string("missing tok_embeddings.weight");
615
+ }
616
+ const llama_load_tensor & lt = tensors_map.tensors.at(it->second);
617
+ return file_loaders.at(0)->hparams.n_embd / lt.shards.at(0).ne.at(0);
618
+ }
619
+
620
+ void calc_sizes(size_t * ctx_size_p, size_t * mmapped_size_p) const {
621
+ *ctx_size_p = *mmapped_size_p = 0;
622
+ for (const llama_load_tensor & lt : tensors_map.tensors) {
623
+ *ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE;
624
+ *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size;
625
+ }
626
+ }
627
+
628
+ struct ggml_tensor * get_tensor(const std::string & name, std::vector<uint32_t> ne) {
629
+ auto it = tensors_map.name_to_idx.find(name);
630
+ if (it == tensors_map.name_to_idx.end()) {
631
+ throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
632
+ }
633
+ llama_load_tensor & lt = tensors_map.tensors.at(it->second);
634
+ if (lt.ne != ne) {
635
+ throw format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
636
+ name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
637
+ }
638
+
639
+ return get_tensor_for(lt);
640
+ }
641
+
642
+ struct ggml_tensor * get_tensor_for(llama_load_tensor & lt) {
643
+ struct ggml_tensor * tensor;
644
+ if (lt.ne.size() == 2) {
645
+ tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
646
+ } else {
647
+ LLAMA_ASSERT(lt.ne.size() == 1);
648
+ tensor = ggml_new_tensor_1d(ggml_ctx, lt.type, lt.ne.at(0));
649
+ }
650
+ LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
651
+ lt.ggml_tensor = tensor;
652
+ num_ggml_tensors_created++;
653
+ return tensor;
654
+ }
655
+
656
+ void done_getting_tensors() {
657
+ if (num_ggml_tensors_created != tensors_map.tensors.size()) {
658
+ throw std::string("llama.cpp: file contained more tensors than expected");
659
+ }
660
+ }
661
+
662
+ void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
663
+ size_t data_size = 0;
664
+ for (const llama_load_tensor & lt : tensors_map.tensors) {
665
+ data_size += lt.size;
666
+ }
667
+
668
+ if (use_mmap) {
669
+ mapping.reset(new llama_mmap(&file_loaders.at(0)->file));
670
+ if (!lmlock) {
671
+ // Don't call the callback since the actual loading will be lazy
672
+ // and we can't measure it.
673
+ progress_callback = NULL;
674
+ }
675
+ if (lmlock) {
676
+ lmlock->init(mapping->addr);
677
+ }
678
+ }
679
+
680
+ size_t done_size = 0;
681
+ for (llama_load_tensor & lt : tensors_map.tensors) {
682
+ if (progress_callback) {
683
+ progress_callback((float) done_size / data_size, progress_callback_user_data);
684
+ }
685
+ LLAMA_ASSERT(lt.ggml_tensor); // unused tensors should have been caught by load_data already
686
+ lt.data = (uint8_t *) lt.ggml_tensor->data;
687
+ load_data_for(lt);
688
+ lt.ggml_tensor->data = lt.data;
689
+ done_size += lt.size;
690
+ if (use_mmap && lmlock) {
691
+ lmlock->grow_to(done_size);
692
+ }
693
+ }
694
+ if (progress_callback) {
695
+ progress_callback(1.0f, progress_callback_user_data);
696
+ }
697
+ }
698
+
699
+ void load_data_for(llama_load_tensor & lt) {
700
+ if (use_mmap) {
701
+ LLAMA_ASSERT(lt.shards.size() == 1);
702
+ lt.data = (uint8_t *) mapping->addr + lt.shards.at(0).file_off;
703
+ } else if (lt.split_type == SPLIT_NONE) {
704
+ llama_file & file = file_loaders.at(lt.shards.at(0).file_idx)->file;
705
+ file.seek(lt.shards.at(0).file_off, SEEK_SET);
706
+ file.read_raw(lt.data, lt.size);
707
+ } else if (lt.split_type == SPLIT_BY_ROWS) {
708
+ size_t offset = 0;
709
+ for (llama_load_tensor_shard & shard : lt.shards) {
710
+ llama_file & file = file_loaders.at(shard.file_idx)->file;
711
+ file.seek(shard.file_off, SEEK_SET);
712
+ file.read_raw(lt.data + offset, shard.size);
713
+ offset += shard.size;
714
+ }
715
+ LLAMA_ASSERT(offset == lt.size);
716
+ } else if (lt.split_type == SPLIT_BY_COLUMNS) {
717
+ // Let's load the data into temporary buffers to ensure the OS performs large loads.
718
+ std::vector<llama_buffer> tmp_bufs;
719
+ tmp_bufs.resize(lt.shards.size());
720
+ for (size_t i = 0; i < lt.shards.size(); i++) {
721
+ llama_load_tensor_shard & shard = lt.shards.at(i);
722
+ llama_file & file = file_loaders.at(shard.file_idx)->file;
723
+ file.seek(shard.file_off, SEEK_SET);
724
+ tmp_bufs.at(i).resize(shard.size);
725
+ file.read_raw(tmp_bufs.at(i).addr, shard.size);
726
+ }
727
+ // Then reshape.
728
+ size_t num_rows = lt.ne.at(1);
729
+ size_t per_shard_row_size = lt.shards.at(0).size / num_rows;
730
+ size_t out_offset = 0;
731
+ for (size_t row = 0; row < num_rows; row++) {
732
+ for (llama_buffer & tmp_buf : tmp_bufs) {
733
+ memcpy(lt.data + out_offset,
734
+ tmp_buf.addr + row * per_shard_row_size,
735
+ per_shard_row_size);
736
+ out_offset += per_shard_row_size;
737
+ }
738
+ }
739
+ LLAMA_ASSERT(out_offset == lt.size);
740
+ }
741
+ if (0) {
742
+ print_checksum(lt);
743
+ }
744
+ }
745
+
746
+ static void print_checksum(llama_load_tensor & lt) {
747
+ uint32_t sum = 0;
748
+ for (size_t i = 0; i < lt.size; i++) {
749
+ uint8_t byte = lt.data[i];
750
+ sum = byte + (sum << 6) + (sum << 16) - sum; // sdbm hash
751
+ }
752
+ fprintf(stderr, "%s checksum: %#08x (%s, size %zu)\n", lt.name.c_str(), sum,
753
+ llama_format_tensor_shape(lt.ne).c_str(), lt.size);
754
+ }
755
+
756
+ };
757
+
758
+
247
759
  //
248
760
  // kv cache
249
761
  //
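
Both read_tensor_metadata() and write_tensor() above align tensor data to 32 bytes with file.seek(-file.tell() & 31, SEEK_CUR): for a non-negative position, -pos & 31 is exactly the padding needed to reach the next multiple of 32. A small standalone check of that arithmetic (not part of the package):

    #include <cassert>
    #include <cstdint>

    // Padding that rounds pos up to the next multiple of 32.
    // Equals (32 - pos % 32) % 32; unsigned wrap-around makes 0 - pos well defined.
    static uint64_t pad_to_32(uint64_t pos) {
        return (0 - pos) & 31u;
    }

    int main() {
        assert(pad_to_32(0)  == 0);   // already aligned
        assert(pad_to_32(1)  == 31);  // 1 + 31 = 32
        assert(pad_to_32(32) == 0);
        assert(pad_to_32(45) == 19);  // 45 + 19 = 64
        return 0;
    }
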
@@ -262,8 +774,8 @@ static bool kv_cache_init(
262
774
  cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
263
775
 
264
776
  struct ggml_init_params params;
265
- params.mem_size = cache.buf.size();
266
- params.mem_buffer = cache.buf.data();
777
+ params.mem_size = cache.buf.size;
778
+ params.mem_buffer = cache.buf.addr;
267
779
  params.no_alloc = false;
268
780
 
269
781
  cache.ctx = ggml_init(params);
@@ -279,13 +791,6 @@ static bool kv_cache_init(
279
791
  return true;
280
792
  }
281
793
 
282
- static void kv_cache_free(struct llama_kv_cache & cache) {
283
- if (cache.ctx) {
284
- ggml_free(cache.ctx);
285
- cache.ctx = nullptr;
286
- }
287
- }
288
-
289
794
  struct llama_context_params llama_context_default_params() {
290
795
  struct llama_context_params result = {
291
796
  /*.n_ctx =*/ 512,
@@ -294,6 +799,7 @@ struct llama_context_params llama_context_default_params() {
294
799
  /*.f16_kv =*/ false,
295
800
  /*.logits_all =*/ false,
296
801
  /*.vocab_only =*/ false,
802
+ /*.use_mmap =*/ true,
297
803
  /*.use_mlock =*/ false,
298
804
  /*.embedding =*/ false,
299
805
  /*.progress_callback =*/ nullptr,
@@ -303,243 +809,106 @@ struct llama_context_params llama_context_default_params() {
303
809
  return result;
304
810
  }
305
811
 
812
+ bool llama_mmap_supported() {
813
+ return llama_mmap::SUPPORTED;
814
+ }
815
+
816
+ bool llama_mlock_supported() {
817
+ return llama_mlock::SUPPORTED;
818
+ }
819
+
306
820
  //
307
821
  // model loading
308
822
  //
309
823
 
310
- static void *mmap_file(const char *fname, uint64_t *mm_length) {
311
- #if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
312
- HANDLE hFile = CreateFileA(fname,
313
- GENERIC_READ,
314
- FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
315
- NULL,
316
- OPEN_EXISTING,
317
- FILE_ATTRIBUTE_NORMAL | FILE_ATTRIBUTE_NOT_CONTENT_INDEXED,
318
- NULL);
319
- if (hFile == INVALID_HANDLE_VALUE) return 0;
320
- LARGE_INTEGER fileSize;
321
- fileSize.QuadPart = -1;
322
- GetFileSizeEx(hFile, &fileSize);
323
- int64_t length = fileSize.QuadPart;
324
- HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
325
- CloseHandle(hFile);
326
- if (!hMapping) return 0;
327
- void *addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
328
- CloseHandle(hMapping);
329
- if (!addr) return 0;
330
- #else
331
- int fd = open(fname, O_RDONLY);
332
- if (fd == -1) return 0;
333
- int64_t length = lseek(fd, 0, SEEK_END);
334
- void *addr = mmap(NULL, length, PROT_READ, MAP_SHARED, fd, 0);
335
- close(fd);
336
- if (addr == MAP_FAILED) return 0;
337
- #endif
338
- *mm_length = length;
339
- return addr;
824
+ static const char *llama_file_version_name(llama_file_version version) {
825
+ switch (version) {
826
+ case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)";
827
+ case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
828
+ case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (latest)";
829
+ default: LLAMA_ASSERT(false);
830
+ }
340
831
  }
341
832
 
342
- static void munmap_file(void * addr, size_t length) {
343
- #if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
344
- UnmapViewOfFile(addr);
345
- #else
346
- munmap(addr, length);
347
- #endif
833
+ static const char *llama_ftype_name(enum llama_ftype ftype) {
834
+ switch (ftype) {
835
+ case LLAMA_FTYPE_ALL_F32: return "all F32";
836
+ case LLAMA_FTYPE_MOSTLY_F16: return "mostly F16";
837
+ case LLAMA_FTYPE_MOSTLY_Q4_0: return "mostly Q4_0";
838
+ case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
839
+ case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
840
+ return "mostly Q4_1, some F16";
841
+ default: return "unknown, may not work";
842
+ }
348
843
  }
349
844
 
350
- static bool report_bad_magic(const char *path, uint32_t got, uint32_t want) {
351
- fprintf(stderr,
352
- "%s: invalid model file (bad magic [got %#x want %#x])\n"
353
- "\tyou most likely need to regenerate your ggml files\n"
354
- "\tthe benefit is you'll get 10-100x faster load times\n"
355
- "\tsee https://github.com/ggerganov/llama.cpp/issues/91\n"
356
- "\tuse convert-pth-to-ggml.py to regenerate from original pth\n"
357
- "\tuse migrate-ggml-2023-03-30-pr613.py if you deleted originals\n",
358
- path, got, want);
359
- return false;
845
+ static const char *llama_model_type_name(e_model type) {
846
+ switch (type) {
847
+ case MODEL_7B: return "7B";
848
+ case MODEL_13B: return "13B";
849
+ case MODEL_30B: return "30B";
850
+ case MODEL_65B: return "65B";
851
+ default: LLAMA_ASSERT(false);
852
+ }
360
853
  }
361
854
 
362
- static bool llama_model_load(
855
+ static void llama_model_load_internal(
363
856
  const std::string & fname,
364
857
  llama_context & lctx,
365
858
  int n_ctx,
366
- int n_parts,
367
859
  ggml_type memory_type,
860
+ bool use_mmap,
861
+ bool use_mlock,
368
862
  bool vocab_only,
369
863
  llama_progress_callback progress_callback,
370
- void *progress_callback_user_data) {
371
- fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
864
+ void * progress_callback_user_data) {
372
865
 
373
866
  lctx.t_start_us = ggml_time_us();
374
867
 
375
- auto & model = lctx.model;
376
- auto & vocab = lctx.vocab;
868
+ std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap, vocab_only));
377
869
 
378
- auto fin = std::ifstream(fname, std::ios::binary);
379
- if (!fin) {
380
- fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
381
- return false;
382
- }
383
-
384
- std::vector<char> f_buf(1024*1024);
385
- fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
386
-
387
- fin.seekg(0, fin.end);
388
- const size_t file_size = fin.tellg();
389
- fin.seekg(0);
870
+ lctx.vocab = std::move(ml->file_loaders.at(0)->vocab);
871
+ auto & model = lctx.model;
872
+ model.hparams = ml->file_loaders.at(0)->hparams;
873
+ llama_file_version file_version = ml->file_loaders.at(0)->file_version;
874
+ auto & hparams = model.hparams;
875
+ uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
390
876
 
391
- // verify magic
392
877
  {
393
- uint32_t magic;
394
- fin.read((char *) &magic, sizeof(magic));
395
- if (magic == LLAMA_FILE_MAGIC_UNVERSIONED) {
396
- fprintf(stderr, "%s: invalid model file '%s' (too old, regenerate your model files or convert them with convert-unversioned-ggml-to-ggml.py!)\n",
397
- __func__, fname.c_str());
398
- return false;
878
+ switch (hparams.n_layer) {
879
+ case 32: model.type = e_model::MODEL_7B; break;
880
+ case 40: model.type = e_model::MODEL_13B; break;
881
+ case 60: model.type = e_model::MODEL_30B; break;
882
+ case 80: model.type = e_model::MODEL_65B; break;
399
883
  }
400
- if (magic != LLAMA_FILE_MAGIC) {
401
- return report_bad_magic(fname.c_str(), magic, LLAMA_FILE_MAGIC);
402
- }
403
-
404
- uint32_t format_version;
405
- fin.read((char *) &format_version, sizeof(format_version));
406
-
407
- if (format_version != LLAMA_FILE_VERSION) {
408
- fprintf(stderr, "%s: invalid model file '%s' (unsupported format version %" PRIu32 ", expected %d)\n",
409
- __func__, fname.c_str(), format_version, LLAMA_FILE_VERSION);
410
- return false;
411
- }
412
- }
413
-
414
- int n_ff = 0;
415
-
416
- // load hparams
417
- {
418
- auto & hparams = model.hparams;
419
-
420
- fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
421
- //fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
422
- fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
423
- fin.read((char *) &hparams.n_mult, sizeof(hparams.n_mult));
424
- fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
425
- fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
426
- fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
427
- fin.read((char *) &hparams.f16, sizeof(hparams.f16));
428
884
 
429
885
  hparams.n_ctx = n_ctx;
430
-
431
- n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
432
-
433
- if (n_parts < 1) {
434
- n_parts = LLAMA_N_PARTS.at(hparams.n_embd);
435
- }
436
-
437
- // temp warning to tell the user to use "--n_parts"
438
- if (hparams.f16 == 4 && n_parts != 1) {
439
- fprintf(stderr, "%s: GPTQ model detected - are you sure n_parts should be %d? we normally expect it to be 1\n", __func__, n_parts);
440
- fprintf(stderr, "%s: use '--n_parts 1' if necessary\n", __func__);
441
- }
442
-
443
- if (hparams.n_layer == 32) {
444
- model.type = e_model::MODEL_7B;
445
- }
446
-
447
- if (hparams.n_layer == 40) {
448
- model.type = e_model::MODEL_13B;
449
- }
450
-
451
- if (hparams.n_layer == 60) {
452
- model.type = e_model::MODEL_30B;
453
- }
454
-
455
- if (hparams.n_layer == 80) {
456
- model.type = e_model::MODEL_65B;
457
- }
458
-
459
- fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab);
460
- fprintf(stderr, "%s: n_ctx = %d\n", __func__, hparams.n_ctx);
461
- fprintf(stderr, "%s: n_embd = %d\n", __func__, hparams.n_embd);
462
- fprintf(stderr, "%s: n_mult = %d\n", __func__, hparams.n_mult);
463
- fprintf(stderr, "%s: n_head = %d\n", __func__, hparams.n_head);
464
- fprintf(stderr, "%s: n_layer = %d\n", __func__, hparams.n_layer);
465
- fprintf(stderr, "%s: n_rot = %d\n", __func__, hparams.n_rot);
466
- fprintf(stderr, "%s: f16 = %d\n", __func__, hparams.f16);
467
- fprintf(stderr, "%s: n_ff = %d\n", __func__, n_ff);
468
- fprintf(stderr, "%s: n_parts = %d\n", __func__, n_parts);
469
- fprintf(stderr, "%s: type = %d\n", __func__, model.type);
470
886
  }
471
887
 
472
- // load vocab
473
888
  {
474
- std::string word;
475
- vocab.id_to_token.resize(model.hparams.n_vocab);
476
- std::vector<char> tmp(64);
477
-
478
- for (int i = 0; i < model.hparams.n_vocab; i++) {
479
- uint32_t len;
480
- fin.read((char *) &len, sizeof(len));
481
-
482
- word.resize(len);
483
- if (len > 0) {
484
- tmp.resize(len);
485
- fin.read(tmp.data(), len);
486
- word.assign(tmp.data(), len);
487
- } else {
488
- word.clear();
489
- }
490
-
491
- float score;
492
- fin.read((char *) &score, sizeof(score));
493
-
494
- vocab.token_to_id[word] = i;
495
-
496
- auto &tok_score = vocab.id_to_token[i];
497
- tok_score.tok = word;
498
- tok_score.score = score;
499
- }
889
+ fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version));
890
+ fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab);
891
+ fprintf(stderr, "%s: n_ctx = %u\n", __func__, hparams.n_ctx);
892
+ fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd);
893
+ fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult);
894
+ fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head);
895
+ fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer);
896
+ fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot);
897
+ fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
898
+ fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
899
+ fprintf(stderr, "%s: n_parts = %zu\n", __func__, ml->file_loaders.size());
900
+ fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
500
901
  }
501
902
 
502
903
  if (vocab_only) {
503
- return true;
504
- }
505
-
506
- // for the big tensors, we have the option to store the data in 16-bit floats or quantized
507
- // in order to save memory and also to speed up the computation
508
- // wtype is for per-layer weights, while vtype is for other weights
509
- ggml_type wtype, vtype;
510
- switch (model.hparams.f16) {
511
- case 0: wtype = vtype = GGML_TYPE_F32; break;
512
- case 1: wtype = vtype = GGML_TYPE_F16; break;
513
- case 2: wtype = vtype = GGML_TYPE_Q4_0; break;
514
- case 3: wtype = vtype = GGML_TYPE_Q4_1; break;
515
- case 4: wtype = GGML_TYPE_Q4_1; vtype = GGML_TYPE_F16; break;
516
- default:
517
- {
518
- fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n",
519
- __func__, fname.c_str(), model.hparams.f16);
520
- return false;
521
- }
904
+ return;
522
905
  }
523
906
 
524
- // map model into memory
525
- char *mm_addr = NULL;
526
- model.mm_addr = mmap_file(fname.c_str(), &model.mm_length);
527
- if (model.mm_addr == NULL) {
528
- fprintf(stderr, "%s: failed to mmap '%s'\n", __func__, fname.c_str());
529
- return false;
530
- }
531
- mm_addr = (char *)model.mm_addr;
532
- fprintf(stderr, "%s: ggml map size = %6.2f MB\n", __func__, model.mm_length/(1024.0*1024.0));
533
-
534
907
  auto & ctx = model.ctx;
535
908
 
536
- size_t ctx_size = 0;
537
- {
538
- const auto &hparams = model.hparams;
539
- const int n_layer = hparams.n_layer;
540
- ctx_size += (5 + 10*n_layer)*256; // object overhead
541
- fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0);
542
- }
909
+ size_t ctx_size, mmapped_size;
910
+ ml->calc_sizes(&ctx_size, &mmapped_size);
911
+ fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0);
543
912
 
544
913
  // print memory requirements
545
914
  {
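
llama_model_load_internal() above now derives the feed-forward width directly from the header: n_ff = ((2*(4*n_embd)/3 + n_mult - 1)/n_mult)*n_mult, i.e. roughly 8/3 of n_embd rounded up to a multiple of n_mult. Plugging in the 7B defaults from llama_hparams (n_embd = 4096, n_mult = 256) gives the familiar 11008; a quick standalone check:

    #include <cstdint>
    #include <cstdio>

    int main() {
        const uint32_t n_embd = 4096, n_mult = 256;  // LLaMA-7B defaults from llama_hparams
        const uint32_t n_ff = ((2*(4*n_embd)/3 + n_mult - 1)/n_mult)*n_mult;
        // 2*16384/3 = 10922, rounded up to the next multiple of 256 -> 11008
        printf("n_ff = %u\n", n_ff);
        return 0;
    }
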
@@ -548,14 +917,14 @@ static bool llama_model_load(
548
917
  // this is the total memory required to run the inference
549
918
  const size_t mem_required =
550
919
  ctx_size +
551
- model.mm_length +
552
- MEM_REQ_SCRATCH0.at(model.type) +
553
- MEM_REQ_SCRATCH1.at(model.type) +
554
- MEM_REQ_EVAL.at (model.type);
920
+ mmapped_size +
921
+ MEM_REQ_SCRATCH0().at(model.type) +
922
+ MEM_REQ_SCRATCH1().at(model.type) +
923
+ MEM_REQ_EVAL().at(model.type);
555
924
 
556
925
  // this is the memory required by one llama_state
557
926
  const size_t mem_required_state =
558
- scale*MEM_REQ_KV_SELF.at(model.type);
927
+ scale*MEM_REQ_KV_SELF().at(model.type);
559
928
 
560
929
  fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
561
930
  mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
@@ -564,17 +933,20 @@ static bool llama_model_load(
564
933
  // create the ggml context
565
934
  {
566
935
  lctx.model.buf.resize(ctx_size);
936
+ if (use_mlock) {
937
+ lctx.model.mlock_buf.init(lctx.model.buf.addr);
938
+ lctx.model.mlock_buf.grow_to(lctx.model.buf.size);
939
+ }
567
940
 
568
941
  struct ggml_init_params params = {
569
- /*.mem_size =*/ lctx.model.buf.size(),
570
- /*.mem_buffer =*/ lctx.model.buf.data(),
571
- /*.no_alloc =*/ true,
942
+ /*.mem_size =*/ lctx.model.buf.size,
943
+ /*.mem_buffer =*/ lctx.model.buf.addr,
944
+ /*.no_alloc =*/ ml->use_mmap,
572
945
  };
573
946
 
574
947
  model.ctx = ggml_init(params);
575
948
  if (!model.ctx) {
576
- fprintf(stderr, "%s: ggml_init() failed\n", __func__);
577
- return false;
949
+ throw format("ggml_init() failed");
578
950
  }
579
951
  }
580
952
 
@@ -582,161 +954,71 @@ static bool llama_model_load(
582
954
  {
583
955
  const auto & hparams = model.hparams;
584
956
 
585
- const int n_embd = hparams.n_embd;
586
- const int n_layer = hparams.n_layer;
587
- const int n_vocab = hparams.n_vocab;
588
-
589
- model.layers.resize(n_layer);
590
-
591
- model.tok_embeddings = ggml_new_tensor_2d(ctx, vtype, n_embd, n_vocab);
957
+ const uint32_t n_embd = hparams.n_embd;
958
+ const uint32_t n_layer = hparams.n_layer;
959
+ const uint32_t n_vocab = hparams.n_vocab;
592
960
 
593
- model.norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
594
- model.output = ggml_new_tensor_2d(ctx, vtype, n_embd, n_vocab);
961
+ ml->ggml_ctx = ctx;
595
962
 
596
- // map by name
597
- model.tensors["tok_embeddings.weight"] = model.tok_embeddings;
963
+ model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab});
964
+ model.norm = ml->get_tensor("norm.weight", {n_embd});
965
+ model.output = ml->get_tensor("output.weight", {n_embd, n_vocab});
598
966
 
599
- model.tensors["norm.weight"] = model.norm;
600
- model.tensors["output.weight"] = model.output;
601
-
602
- for (int i = 0; i < n_layer; ++i) {
967
+ model.layers.resize(n_layer);
968
+ for (uint32_t i = 0; i < n_layer; ++i) {
603
969
  auto & layer = model.layers[i];
604
970
 
605
- layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
606
-
607
- layer.wq = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
608
- layer.wk = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
609
- layer.wv = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
610
- layer.wo = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
611
-
612
- layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
971
+ std::string layers_i = "layers." + std::to_string(i);
613
972
 
614
- layer.w1 = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ff);
615
- layer.w2 = ggml_new_tensor_2d(ctx, wtype, n_ff, n_embd);
616
- layer.w3 = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ff);
973
+ layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd});
617
974
 
618
- // map by name
619
- model.tensors["layers." + std::to_string(i) + ".attention_norm.weight"] = layer.attention_norm;
975
+ layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd});
976
+ layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd});
977
+ layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd});
978
+ layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd});
620
979
 
621
- model.tensors["layers." + std::to_string(i) + ".attention.wq.weight"] = layer.wq;
622
- model.tensors["layers." + std::to_string(i) + ".attention.wk.weight"] = layer.wk;
623
- model.tensors["layers." + std::to_string(i) + ".attention.wv.weight"] = layer.wv;
624
- model.tensors["layers." + std::to_string(i) + ".attention.wo.weight"] = layer.wo;
980
+ layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd});
625
981
 
626
- model.tensors["layers." + std::to_string(i) + ".ffn_norm.weight"] = layer.ffn_norm;
627
-
628
- model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.weight"] = layer.w1;
629
- model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight"] = layer.w2;
630
- model.tensors["layers." + std::to_string(i) + ".feed_forward.w3.weight"] = layer.w3;
982
+ layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff});
983
+ layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd});
984
+ layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff});
631
985
  }
632
986
  }
633
987
 
634
- std::vector<uint8_t> tmp;
988
+ ml->done_getting_tensors();
635
989
 
636
- if (progress_callback) {
637
- progress_callback(0.0, progress_callback_user_data);
990
+ // populate `tensors_by_name`
991
+ for (llama_load_tensor & lt : ml->tensors_map.tensors) {
992
+ model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
638
993
  }
639
994
 
640
- fprintf(stderr, "%s: loading tensors from '%s'\n", __func__, fname.c_str());
641
-
642
- // load weights
643
- {
644
- size_t total_size = 0;
645
- model.n_loaded = 0;
646
-
647
- while (true) {
648
- int32_t n_dims;
649
- int32_t length;
650
- int32_t ftype;
651
-
652
- fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
653
- fin.read(reinterpret_cast<char *>(&length), sizeof(length));
654
- fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
655
-
656
- if (fin.eof()) {
657
- break;
658
- }
659
-
660
- int32_t nelements = 1;
661
- int32_t ne[2] = { 1, 1 };
662
- for (int i = 0; i < n_dims; ++i) {
663
- fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
664
- nelements *= ne[i];
665
- }
666
-
667
- std::string name(length, 0);
668
- fin.read(&name[0], length);
669
-
670
- if (model.tensors.find(name.data()) == model.tensors.end()) {
671
- fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
672
- return false;
673
- }
674
-
675
- auto tensor = model.tensors[name.data()];
676
-
677
- if (ggml_nelements(tensor) != nelements) {
678
- fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
679
- return false;
680
- }
681
- if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
682
- fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%" PRId64 ", %" PRId64 "], expected [%d, %d]\n",
683
- __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
684
- return false;
685
- }
686
- if (0) {
687
- static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
688
- fprintf(stderr, "%24s - [%5d, %5d], type = %6s\n", name.data(), ne[0], ne[1], ftype_str[ftype]);
689
- }
690
-
691
- switch (ftype) {
692
- case 0: // f32
693
- case 1: // f16
694
- break;
695
- case 2: // q4_0
696
- case 3: // q4_1
697
- assert(ne[0] % 64 == 0);
698
- break;
699
- default:
700
- fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
701
- return false;
702
- };
703
-
704
- // load the tensor data into memory without copying or reading it
705
- size_t offset = fin.tellg();
706
- size_t tensor_data_size = ggml_nbytes(tensor);
707
- offset = (offset + 31) & -32;
708
- tensor->data = mm_addr + offset;
709
- fin.seekg(offset + tensor_data_size);
710
- total_size += tensor_data_size;
711
- model.n_loaded++;
712
-
713
- // progress
714
- if (progress_callback) {
715
- double current_progress = size_t(fin.tellg()) / double(file_size);
716
- progress_callback(current_progress, progress_callback_user_data);
717
- }
718
- }
719
-
720
- fin.close();
995
+ ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
721
996
 
722
- fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, model.n_loaded);
723
- if (model.n_loaded == 0) {
724
- fprintf(stderr, "%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__);
725
- } else if (model.n_loaded != (int) model.tensors.size()) {
726
- fprintf(stderr, "%s: ERROR not all tensors loaded from model file - expected %zu, got %d\n", __func__, model.tensors.size(), model.n_loaded);
727
- return false;
728
- }
729
- }
997
+ model.mapping = std::move(ml->mapping);
730
998
 
731
999
  // loading time will be recalculate after the first eval, so
732
1000
  // we take page faults deferred by mmap() into consideration
733
1001
  lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
1002
+ }
734
1003
 
735
- if (progress_callback) {
736
- progress_callback(1.0, progress_callback_user_data);
1004
+ static bool llama_model_load(
1005
+ const std::string & fname,
1006
+ llama_context & lctx,
1007
+ int n_ctx,
1008
+ ggml_type memory_type,
1009
+ bool use_mmap,
1010
+ bool use_mlock,
1011
+ bool vocab_only,
1012
+ llama_progress_callback progress_callback,
1013
+ void *progress_callback_user_data) {
1014
+ try {
1015
+ llama_model_load_internal(fname, lctx, n_ctx, memory_type, use_mmap, use_mlock,
1016
+ vocab_only, progress_callback, progress_callback_user_data);
1017
+ return true;
1018
+ } catch (const std::string & err) {
1019
+ fprintf(stderr, "error loading model: %s\n", err.c_str());
1020
+ return false;
737
1021
  }
738
-
739
- return true;
740
1022
  }
741
1023
 
742
1024
  // evaluate the transformer
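
The new loader code above reports failures by throwing (format() results or std::string), and llama_model_load() converts any such exception back into the old bool return at the public boundary. A minimal sketch of that exception-to-status wrapper shape, with hypothetical names:

    #include <cstdio>
    #include <string>

    // Internal worker: signals failure by throwing, as the new loader does.
    static void demo_load_internal(const std::string & path) {
        if (path.empty()) {
            throw std::string("empty model path");
        }
        // ... real loading work would happen here ...
    }

    // Public entry point keeps a bool contract for callers.
    static bool demo_load(const std::string & path) {
        try {
            demo_load_internal(path);
            return true;
        } catch (const std::string & err) {
            fprintf(stderr, "error loading model: %s\n", err.c_str());
            return false;
        }
    }

    int main() {
        return demo_load("model.bin") ? 0 : 1;  // "model.bin" is a placeholder path
    }
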
@@ -774,8 +1056,8 @@ static bool llama_eval_internal(
774
1056
  auto & buf_compute = lctx.buf_compute;
775
1057
 
776
1058
  struct ggml_init_params params = {
777
- /*.mem_size =*/ buf_compute.size(),
778
- /*.mem_buffer =*/ buf_compute.data(),
1059
+ /*.mem_size =*/ buf_compute.size,
1060
+ /*.mem_buffer =*/ buf_compute.addr,
779
1061
  /*.no_alloc =*/ false,
780
1062
  };
781
1063
 
@@ -1061,7 +1343,7 @@ struct llama_tokenizer {
1061
1343
  size_t offs = 0;
1062
1344
  while (offs < text.size()) {
1063
1345
  llama_sp_symbol sym;
1064
- size_t char_len = Min(text.size() - offs, utf8_len(text[offs]));
1346
+ size_t char_len = std::min(text.size() - offs, utf8_len(text[offs]));
1065
1347
  sym.text = text.c_str() + offs;
1066
1348
  sym.n = char_len;
1067
1349
  offs += char_len;
@@ -1236,7 +1518,7 @@ static llama_vocab::id llama_sample_top_p_top_k(
1236
1518
  }
1237
1519
  }
1238
1520
 
1239
- sample_top_k(logits_id, top_k > 0 ? Min(top_k, n_logits) : n_logits);
1521
+ sample_top_k(logits_id, top_k > 0 ? std::min(top_k, n_logits) : n_logits);
1240
1522
 
1241
1523
  // compute probs for the top k tokens
1242
1524
  std::vector<float> probs;
@@ -1284,298 +1566,118 @@ static llama_vocab::id llama_sample_top_p_top_k(
1284
1566
  // quantization
1285
1567
  //
1286
1568
 
1287
- // TODO: reuse code from the llama_model_load() somehow
1288
- static bool llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, int itype) {
1289
- ggml_type type = GGML_TYPE_Q4_1;
1290
-
1291
- switch (itype) {
1292
- case 2: type = GGML_TYPE_Q4_0; break;
1293
- case 3: type = GGML_TYPE_Q4_1; break;
1294
- default: fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype); return 1;
1569
+ static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype) {
1570
+ ggml_type quantized_type;
1571
+ switch (ftype) {
1572
+ case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
1573
+ case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
1574
+ default: throw format("invalid output file type %d\n", ftype);
1295
1575
  };
1296
1576
 
1297
- if (type != GGML_TYPE_Q4_0 && type != GGML_TYPE_Q4_1) {
1298
- fprintf(stderr, "%s: invalid quantization type %d\n", __func__, type);
1299
- return false;
1300
- }
1301
-
1302
- llama_vocab vocab;
1303
-
1304
- printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
1305
-
1306
- auto finp = std::ifstream(fname_inp, std::ios::binary);
1307
- if (!finp) {
1308
- fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__, fname_inp.c_str());
1309
- return false;
1310
- }
1311
-
1312
- auto fout = std::ofstream(fname_out, std::ios::binary);
1313
- if (!fout) {
1314
- fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname_out.c_str());
1315
- return false;
1316
- }
1317
-
1318
- // verify magic
1319
- {
1320
- uint32_t magic;
1321
- finp.read((char *) &magic, sizeof(magic));
1322
- if (magic == LLAMA_FILE_MAGIC_UNVERSIONED) {
1323
- fprintf(stderr, "%s: invalid model file '%s' (too old, regenerate your model files!)\n",
1324
- __func__, fname_inp.c_str());
1325
- return false;
1326
- }
1327
- if (magic != LLAMA_FILE_MAGIC) {
1328
- return report_bad_magic(fname_inp.c_str(), magic, LLAMA_FILE_MAGIC);
1329
- }
1330
-
1331
- fout.write((char *) &magic, sizeof(magic));
1332
-
1333
- uint32_t format_version;
1334
- finp.read((char *) &format_version, sizeof(format_version));
1335
-
1336
- if (format_version != LLAMA_FILE_VERSION) {
1337
- fprintf(stderr, "%s: invalid model file '%s' (unsupported format version %" PRIu32 ", expected %d)\n",
1338
- __func__, fname_inp.c_str(), format_version, LLAMA_FILE_VERSION);
1339
- return false;
1340
- }
1341
-
1342
- fout.write((char *) &format_version, sizeof(format_version));
1343
- }
1344
-
1345
- llama_hparams hparams;
1346
-
1347
- // load hparams
1348
- {
1349
- finp.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
1350
- //finp.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
1351
- finp.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
1352
- finp.read((char *) &hparams.n_mult, sizeof(hparams.n_mult));
1353
- finp.read((char *) &hparams.n_head, sizeof(hparams.n_head));
1354
- finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
1355
- finp.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
1356
- finp.read((char *) &hparams.f16, sizeof(hparams.f16));
1357
-
1358
- printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
1359
- printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
1360
- printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
1361
- printf("%s: n_mult = %d\n", __func__, hparams.n_mult);
1362
- printf("%s: n_head = %d\n", __func__, hparams.n_head);
1363
- printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
1364
- printf("%s: f16 = %d\n", __func__, hparams.f16);
1365
-
1366
- fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
1367
- //fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
1368
- fout.write((char *) &hparams.n_embd, sizeof(hparams.n_embd));
1369
- fout.write((char *) &hparams.n_mult, sizeof(hparams.n_mult));
1370
- fout.write((char *) &hparams.n_head, sizeof(hparams.n_head));
1371
- fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
1372
- fout.write((char *) &hparams.n_rot, sizeof(hparams.n_rot));
1373
- fout.write((char *) &itype, sizeof(hparams.f16));
1374
- }
1375
-
1376
- // load vocab
1377
- {
1378
- const int32_t n_vocab = hparams.n_vocab;
1379
-
1380
- if (n_vocab != hparams.n_vocab) {
1381
- fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
1382
- __func__, fname_inp.c_str(), n_vocab, hparams.n_vocab);
1383
- return false;
1384
- }
1385
-
1386
- std::vector<char> word(32);
1387
- vocab.id_to_token.resize(n_vocab);
1388
- for (int i = 0; i < n_vocab; i++) {
1389
- uint32_t len;
1390
- finp.read ((char *) &len, sizeof(len));
1391
- fout.write((char *) &len, sizeof(len));
1392
-
1393
- word.resize(len);
1394
- finp.read ((char *) &word[0], len);
1395
- fout.write((char *) &word[0], len);
1396
-
1397
- float score;
1398
- finp.read ((char *) &score, sizeof(score));
1399
- fout.write((char *) &score, sizeof(score));
1400
-
1401
- vocab.token_to_id[word.data()] = i;
1402
-
1403
- auto &tok_score = vocab.id_to_token[i];
1404
- tok_score.tok = word.data();
1405
- tok_score.score = score;
1406
- }
1407
- }
1408
-
1409
- // load weights
1410
- {
1411
- size_t total_size_org = 0;
1412
- size_t total_size_new = 0;
1413
-
1414
- std::vector<float> work;
1415
-
1416
- std::vector<uint8_t> data_u8;
1417
- std::vector<ggml_fp16_t> data_f16;
1418
- std::vector<float> data_f32;
1419
-
1420
- std::vector<int64_t> hist_all(1 << 4, 0);
1421
-
1422
- while (true) {
1423
- int32_t n_dims;
1424
- int32_t length;
1425
- int32_t ftype;
1426
-
1427
- finp.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
1428
- finp.read(reinterpret_cast<char *>(&length), sizeof(length));
1429
- finp.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
1430
-
1431
- if (finp.eof()) {
1432
- break;
1433
- }
1434
-
1435
- int32_t nelements = 1;
1436
- int32_t ne[2] = { 1, 1 };
1437
- for (int i = 0; i < n_dims; ++i) {
1438
- finp.read (reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
1439
- nelements *= ne[i];
1440
- }
1441
-
1442
- std::string name(length, 0);
1443
- finp.read (&name[0], length);
1444
-
1445
- {
1446
- // ensure tensor data is aligned
1447
- uint64_t offset = finp.tellg();
1448
- offset = (offset + 31) & -32;
1449
- finp.seekg(offset);
1450
- }
1451
-
1452
- {
1453
- static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
1454
- printf("%48s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ftype_str[ftype]);
1455
- }
1456
-
1457
- // regexes of tensor names to be quantized
1458
- const std::vector<std::string> k_names = {
1459
- ".*weight",
1460
- };
1461
-
1462
- bool quantize = false;
1463
- for (const auto & s : k_names) {
1464
- if (std::regex_match(name, std::regex(s))) {
1465
- quantize = true;
1466
- break;
1467
- }
1468
- }
1469
-
1470
- // quantize only 2D tensors
1471
- quantize &= (n_dims == 2);
1472
-
1473
- if (quantize) {
1474
- if (ftype != 0 && ftype != 1) {
1475
- fprintf(stderr, "%s: unsupported ftype %d for integer quantization\n", __func__, ftype);
1476
- return false;
1477
- }
1478
-
1479
- if (ftype == 1) {
1480
- data_f16.resize(nelements);
1481
- finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
1482
- data_f32.resize(nelements);
1483
- for (int i = 0; i < nelements; ++i) {
1484
- data_f32[i] = ggml_fp16_to_fp32(data_f16[i]);
1485
- }
1486
- } else {
1487
- data_f32.resize(nelements);
1488
- finp.read(reinterpret_cast<char *>(data_f32.data()), nelements * sizeof(float));
1577
+ std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false,
1578
+ /*vocab_only*/ false));
1579
+ llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
1580
+
1581
+ size_t total_size_org = 0;
1582
+ size_t total_size_new = 0;
1583
+ std::vector<int64_t> hist_all(1 << 4, 0);
1584
+
1585
+ size_t idx = 0;
1586
+ for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) {
1587
+ llama_buffer read_data;
1588
+ read_data.resize(tensor.size);
1589
+ tensor.data = read_data.addr;
1590
+ model_loader->load_data_for(tensor);
1591
+
1592
+ printf("[%4zu/%4zu] %36s - %16s, type = %6s, ",
1593
+ ++idx, model_loader->tensors_map.tensors.size(),
1594
+ tensor.name.c_str(), llama_format_tensor_shape(tensor.ne).c_str(),
1595
+ ggml_type_name(tensor.type));
1596
+
1597
+ // This used to be a regex, but <regex> has an extreme cost to compile times.
1598
+ bool quantize = tensor.name.rfind("weight") == tensor.name.size() - 6; // ends with 'weight'?
1599
+
1600
+ // quantize only 2D tensors
1601
+ quantize &= (tensor.ne.size() == 2);
1602
+
1603
+ enum ggml_type new_type;
1604
+ void * new_data;
1605
+ size_t new_size;
1606
+ llama_buffer work;
1607
+
1608
+ if (!quantize) {
1609
+ new_type = tensor.type;
1610
+ new_data = tensor.data;
1611
+ new_size = tensor.size;
1612
+ printf("size = %8.3f MB\n", tensor.size/1024.0/1024.0);
1613
+ } else {
1614
+ new_type = quantized_type;
1615
+ float * f32_data;
1616
+ size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
1617
+ llama_buffer f32_conv_buf;
1618
+ if (tensor.type == GGML_TYPE_F32) {
1619
+ f32_data = (float *) tensor.data;
1620
+ } else if (tensor.type == GGML_TYPE_F16) {
1621
+ f32_conv_buf.resize(nelements * sizeof(float));
1622
+ f32_data = (float *) f32_conv_buf.addr;
1623
+ auto f16_data = (const ggml_fp16_t *) tensor.data;
1624
+ for (size_t i = 0; i < nelements; i++) {
1625
+ f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
1489
1626
  }
1490
-
1491
- ftype = itype;
1492
1627
  } else {
1493
- const int bpe = (ftype == 0) ? sizeof(float) : sizeof(uint16_t);
1494
-
1495
- data_u8.resize(nelements*bpe);
1496
- finp.read(reinterpret_cast<char *>(data_u8.data()), nelements * bpe);
1628
+ throw format("type %s unsupported for integer quantization", ggml_type_name(tensor.type));
1497
1629
  }
1498
1630
 
1499
- fout.write(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
1500
- fout.write(reinterpret_cast<char *>(&length), sizeof(length));
1501
- fout.write(reinterpret_cast<char *>(&ftype), sizeof(ftype));
1502
- for (int i = 0; i < n_dims; ++i) {
1503
- fout.write(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
1631
+ printf("quantizing .. ");
1632
+ fflush(stdout);
1633
+
1634
+ work.resize(nelements * 4); // upper bound on size
1635
+ new_data = work.addr;
1636
+ std::vector<int64_t> hist_cur(1 << 4, 0);
1637
+
1638
+ switch (new_type) {
1639
+ case GGML_TYPE_Q4_0:
1640
+ {
1641
+ new_size = ggml_quantize_q4_0(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
1642
+ } break;
1643
+ case GGML_TYPE_Q4_1:
1644
+ {
1645
+ new_size = ggml_quantize_q4_1(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
1646
+ } break;
1647
+ default:
1648
+ LLAMA_ASSERT(false);
1504
1649
  }
1505
- fout.write(&name[0], length);
1506
1650
 
1507
- {
1508
- // ensure tensor data is aligned
1509
- uint64_t offset = fout.tellp();
1510
- offset = (offset + 31) & -32;
1511
- fout.seekp(offset);
1651
+ printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
1652
+ for (size_t i = 0; i < hist_cur.size(); i++) {
1653
+ hist_all[i] += hist_cur[i];
1512
1654
  }
1513
1655
 
1514
- if (quantize) {
1515
- printf("quantizing .. ");
1516
- work.resize(nelements); // for quantization
1517
-
1518
- size_t cur_size = 0;
1519
- std::vector<int64_t> hist_cur(1 << 4, 0);
1520
-
1521
- switch (type) {
1522
- case GGML_TYPE_Q4_0:
1523
- {
1524
- cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
1525
- } break;
1526
- case GGML_TYPE_Q4_1:
1527
- {
1528
- cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
1529
- } break;
1530
- default:
1531
- {
1532
- fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, type);
1533
- return false;
1534
- }
1535
- }
1536
-
1537
- fout.write(reinterpret_cast<char *>(work.data()), cur_size);
1538
- total_size_new += cur_size;
1539
-
1540
- printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
1541
- for (int i = 0; i < (int) hist_cur.size(); ++i) {
1542
- hist_all[i] += hist_cur[i];
1543
- }
1544
-
1545
- for (int i = 0; i < (int) hist_cur.size(); ++i) {
1546
- printf("%5.3f ", hist_cur[i] / float(nelements));
1547
- }
1548
- printf("\n");
1549
- } else {
1550
- printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0);
1551
- fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size());
1552
- total_size_new += data_u8.size();
1656
+ for (size_t i = 0; i < hist_cur.size(); i++) {
1657
+ printf("%5.3f ", hist_cur[i] / float(nelements));
1553
1658
  }
1554
-
1555
- total_size_org += nelements * sizeof(float);
1659
+ printf("\n");
1556
1660
  }
1661
+ total_size_org += tensor.size;
1662
+ total_size_new += new_size;
1663
+ file_saver.write_tensor(tensor, new_type, new_data, new_size);
1664
+ }
1557
1665
 
1558
- printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
1559
- printf("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
1666
+ printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
1667
+ printf("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
1560
1668
 
1561
- {
1562
- int64_t sum_all = 0;
1563
- for (int i = 0; i < (int) hist_all.size(); ++i) {
1564
- sum_all += hist_all[i];
1565
- }
1669
+ {
1670
+ int64_t sum_all = 0;
1671
+ for (size_t i = 0; i < hist_all.size(); i++) {
1672
+ sum_all += hist_all[i];
1673
+ }
1566
1674
 
1567
- printf("%s: hist: ", __func__);
1568
- for (int i = 0; i < (int) hist_all.size(); ++i) {
1569
- printf("%5.3f ", hist_all[i] / float(sum_all));
1570
- }
1571
- printf("\n");
1675
+ printf("%s: hist: ", __func__);
1676
+ for (size_t i = 0; i < hist_all.size(); i++) {
1677
+ printf("%5.3f ", hist_all[i] / float(sum_all));
1572
1678
  }
1679
+ printf("\n");
1573
1680
  }
1574
-
1575
- finp.close();
1576
- fout.close();
1577
-
1578
- return true;
1579
1681
  }
1580
1682
 
1581
1683
  //
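
A note on the quantization loop above: the old regex match on tensor names was replaced by a plain suffix comparison to keep <regex> out of the build. A minimal sketch of an equivalent check, assuming nothing beyond std::string (the helper name ends_with is illustrative and not part of the package):

    #include <string>

    // true when `name` ends with `suffix`; mirrors the
    // rfind("weight") == name.size() - 6 test used in the loop above
    static bool ends_with(const std::string & name, const std::string & suffix) {
        return name.size() >= suffix.size() &&
               name.compare(name.size() - suffix.size(), suffix.size(), suffix) == 0;
    }

    // usage: only 2D tensors whose name ends in "weight" are quantized
    // bool quantize = ends_with(tensor.name, "weight") && tensor.ne.size() == 2;
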
@@ -1593,32 +1695,36 @@ struct llama_context * llama_init_from_file(
1593
1695
  params.seed = time(NULL);
1594
1696
  }
1595
1697
 
1698
+ unsigned cur_percentage = 0;
1699
+ if (params.progress_callback == NULL) {
1700
+ params.progress_callback_user_data = &cur_percentage;
1701
+ params.progress_callback = [](float progress, void * ctx) {
1702
+ unsigned * cur_percentage_p = (unsigned *) ctx;
1703
+ unsigned percentage = (unsigned) (100 * progress);
1704
+ while (percentage > *cur_percentage_p) {
1705
+ ++*cur_percentage_p;
1706
+ fprintf(stderr, ".");
1707
+ fflush(stderr);
1708
+ if (percentage >= 100) {
1709
+ fprintf(stderr, "\n");
1710
+ }
1711
+ }
1712
+ };
1713
+ }
1714
+
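
For reference, the default callback installed above maps the loader's 0..1 progress value to dots on stderr. A user-supplied callback only needs the same (float, void *) shape; the sketch below is illustrative and the struct and function names are not part of the package:

    #include <cstdio>

    struct progress_state {
        int last_pct = -1;
    };

    // prints a single updating percentage line instead of dots
    static void my_progress_cb(float progress, void * user_data) {
        progress_state * st = (progress_state *) user_data;
        const int pct = (int) (100 * progress);
        if (pct != st->last_pct) {
            st->last_pct = pct;
            fprintf(stderr, "\rloading: %3d%%", pct);
            if (pct >= 100) {
                fprintf(stderr, "\n");
            }
            fflush(stderr);
        }
    }

    // hypothetical usage:
    //   progress_state st;
    //   params.progress_callback           = my_progress_cb;
    //   params.progress_callback_user_data = &st;
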
1596
1715
  ctx->rng = std::mt19937(params.seed);
1597
1716
  ctx->logits_all = params.logits_all;
1598
1717
 
1599
1718
  ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
1600
1719
 
1601
- if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_parts, memory_type,
1602
- params.vocab_only, params.progress_callback,
1603
- params.progress_callback_user_data)) {
1720
+ if (!llama_model_load(path_model, *ctx, params.n_ctx, memory_type,
1721
+ params.use_mmap, params.use_mlock, params.vocab_only,
1722
+ params.progress_callback, params.progress_callback_user_data)) {
1604
1723
  fprintf(stderr, "%s: failed to load model\n", __func__);
1605
1724
  llama_free(ctx);
1606
1725
  return nullptr;
1607
1726
  }
1608
1727
 
1609
- if (params.use_mlock) {
1610
- char *err;
1611
- if (!ggml_mlock(ctx->model.ctx,
1612
- ctx->model.mm_addr,
1613
- ctx->model.mm_length,
1614
- &err)) {
1615
- fprintf(stderr, "%s\n", err);
1616
- free(err);
1617
- llama_free(ctx);
1618
- return nullptr;
1619
- }
1620
- }
1621
-
1622
1728
  // reserve memory for context buffers
1623
1729
  if (!params.vocab_only) {
1624
1730
  if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx)) {
@@ -1645,50 +1751,289 @@ struct llama_context * llama_init_from_file(
1645
1751
  ctx->embedding.resize(hparams.n_embd);
1646
1752
  }
1647
1753
 
1648
- ctx->buf_compute.resize(MEM_REQ_EVAL.at(ctx->model.type));
1754
+ ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));
1649
1755
 
1650
- ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0.at(ctx->model.type));
1651
- ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1.at(ctx->model.type));
1756
+ ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type));
1757
+ ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
1652
1758
  }
1653
1759
 
1654
1760
  return ctx;
1655
1761
  }
1656
1762
 
1657
1763
  void llama_free(struct llama_context * ctx) {
1658
- kv_cache_free(ctx->model.kv_self);
1659
-
1660
- if (ctx->model.ctx) {
1661
- ggml_free(ctx->model.ctx);
1662
- }
1663
-
1664
- if (ctx->model.mm_addr) {
1665
- munmap_file(ctx->model.mm_addr, ctx->model.mm_length);
1666
- }
1667
-
1668
1764
  delete ctx;
1669
1765
  }
1670
1766
 
1671
1767
  int llama_model_quantize(
1672
1768
  const char * fname_inp,
1673
1769
  const char * fname_out,
1674
- int itype) {
1675
- if (!llama_model_quantize_internal(fname_inp, fname_out, itype)) {
1676
- fprintf(stderr, "%s: failed to quantize\n", __func__);
1770
+ enum llama_ftype ftype) {
1771
+ try {
1772
+ llama_model_quantize_internal(fname_inp, fname_out, ftype);
1773
+ return 0;
1774
+ } catch (const std::string & err) {
1775
+ fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str());
1776
+ return 1;
1777
+ }
1778
+ }
1779
+
1780
+ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
1781
+ fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
1782
+
1783
+ auto & model = ctx->model;
1784
+
1785
+ const int64_t t_start_lora_us = ggml_time_us();
1786
+
1787
+ auto fin = std::ifstream(path_lora, std::ios::binary);
1788
+ if (!fin) {
1789
+ fprintf(stderr, "%s: failed to open '%s'\n", __func__, path_lora);
1677
1790
  return 1;
1678
1791
  }
1679
1792
 
1793
+ // verify magic and version
1794
+ {
1795
+ uint32_t magic;
1796
+ fin.read((char *) &magic, sizeof(magic));
1797
+ if (magic != 'ggla') {
1798
+ fprintf(stderr, "%s: bad file magic\n", __func__);
1799
+ return 1;
1800
+ }
1801
+ uint32_t format_version;
1802
+ fin.read((char *) &format_version, sizeof(format_version));
1803
+
1804
+ if (format_version != 1) {
1805
+ fprintf(stderr, "%s: unsupported file version\n", __func__ );
1806
+ return 1;
1807
+ }
1808
+ }
1809
+
1810
+ int32_t lora_r;
1811
+ int32_t lora_alpha;
1812
+ fin.read((char *) &lora_r, sizeof(lora_r));
1813
+ fin.read((char *) &lora_alpha, sizeof(lora_alpha));
1814
+ float scaling = (float)lora_alpha / (float)lora_r;
1815
+
1816
+ fprintf(stderr, "%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
1817
+
1818
+
1819
+ // create a temporary ggml context to store the lora tensors
1820
+ // todo: calculate size from biggest possible tensor
1821
+ std::vector<uint8_t> lora_buf(1024ull * 1024ull * 1024ull);
1822
+ struct ggml_init_params params;
1823
+ params.mem_size = lora_buf.size();
1824
+ params.mem_buffer = lora_buf.data();
1825
+ params.no_alloc = false;
1826
+
1827
+ ggml_context * lora_ctx = ggml_init(params);
1828
+ std::unordered_map<std::string, struct ggml_tensor *> lora_tensors;
1829
+
1830
+ // create a name -> tensor map of the model to accelerate lookups
1831
+ std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
1832
+ for (auto & kv: model.tensors_by_name) {
1833
+ model_tensors.insert(kv);
1834
+ }
1835
+
1836
+
1837
+ // load base model
1838
+ std::unique_ptr<llama_model_loader> model_loader;
1839
+ ggml_context * base_ctx = NULL;
1840
+ llama_buffer base_buf;
1841
+ if (path_base_model) {
1842
+ fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
1843
+ model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false));
1844
+
1845
+ size_t ctx_size, mmapped_size;
1846
+ model_loader->calc_sizes(&ctx_size, &mmapped_size);
1847
+ base_buf.resize(ctx_size);
1848
+
1849
+ ggml_init_params base_params;
1850
+ base_params.mem_size = base_buf.size;
1851
+ base_params.mem_buffer = base_buf.addr;
1852
+ base_params.no_alloc = model_loader->use_mmap;
1853
+
1854
+ base_ctx = ggml_init(base_params);
1855
+
1856
+ model_loader->ggml_ctx = base_ctx;
1857
+
1858
+ // maybe this should be in llama_model_loader
1859
+ if (model_loader->use_mmap) {
1860
+ model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ false));
1861
+ }
1862
+ }
1863
+
1864
+ // read tensors and apply
1865
+ bool warned = false;
1866
+ int n_tensors = 0;
1867
+ while (true) {
1868
+ int32_t n_dims;
1869
+ int32_t length;
1870
+ int32_t ftype;
1871
+
1872
+ fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
1873
+ fin.read(reinterpret_cast<char *>(&length), sizeof(length));
1874
+ fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
1875
+ if (fin.eof()) {
1876
+ break;
1877
+ }
1878
+
1879
+ int32_t ne[2] = { 1, 1 };
1880
+ for (int i = 0; i < n_dims; ++i) {
1881
+ fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
1882
+ }
1883
+
1884
+ std::string name(length, 0);
1885
+ fin.read(&name[0], length);
1886
+
1887
+ // check for lora suffix and get the type of tensor
1888
+ const std::string lora_suffix = ".lora";
1889
+ size_t pos = name.rfind(lora_suffix);
1890
+ if (pos == std::string::npos) {
1891
+ fprintf(stderr, "%s: error: '%s' is not a lora tensor\n", __func__, name.c_str());
1892
+ return 1;
1893
+ }
1894
+
1895
+ std::string lora_type = name.substr(pos + lora_suffix.length());
1896
+ std::string base_name = name;
1897
+ base_name.erase(pos);
1898
+ // fprintf(stderr, "%s: %s => %s (lora type %s) ", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());
1899
+
1900
+ if (model_tensors.find(base_name.data()) == model_tensors.end()) {
1901
+ fprintf(stderr, "%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
1902
+ return 1;
1903
+ }
1904
+
1905
+ // create ggml tensor
1906
+ ggml_type wtype;
1907
+ switch (ftype) {
1908
+ case 0: wtype = GGML_TYPE_F32; break;
1909
+ case 1: wtype = GGML_TYPE_F16; break;
1910
+ default:
1911
+ {
1912
+ fprintf(stderr, "%s: invalid tensor data type '%d'\n",
1913
+ __func__, ftype);
1914
+ return 1;
1915
+ }
1916
+ }
1917
+ ggml_tensor* lora_tensor;
1918
+ if (n_dims == 2) {
1919
+ lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]);
1920
+ }
1921
+ else {
1922
+ fprintf(stderr, "%s: unsupported tensor dimension %d\n", __func__, n_dims);
1923
+ return 1;
1924
+ }
1925
+
1926
+ // load tensor data
1927
+ size_t offset = fin.tellg();
1928
+ size_t tensor_data_size = ggml_nbytes(lora_tensor);
1929
+ offset = (offset + 31) & -32;
1930
+ fin.seekg(offset);
1931
+ fin.read((char*)lora_tensor->data, tensor_data_size);
1932
+
1933
+ lora_tensors[name] = lora_tensor;
1934
+
1935
+ // check if we have both A and B tensors and apply
1936
+ if (lora_tensors.find(base_name + ".loraA") != lora_tensors.end() &&
1937
+ lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) {
1938
+
1939
+ ggml_tensor * dest_t = model_tensors[base_name];
1940
+ ggml_tensor * base_t;
1941
+ if (model_loader) {
1942
+ // load from base model
1943
+ if (model_loader->tensors_map.name_to_idx.find(base_name) == model_loader->tensors_map.name_to_idx.end()) {
1944
+ fprintf(stderr, "%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
1945
+ return 1;
1946
+ }
1947
+ size_t idx = model_loader->tensors_map.name_to_idx[base_name];
1948
+ llama_load_tensor & lt = model_loader->tensors_map.tensors[idx];
1949
+ base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] });
1950
+ lt.data = (uint8_t *) lt.ggml_tensor->data;
1951
+ model_loader->load_data_for(lt);
1952
+ lt.ggml_tensor->data = lt.data;
1953
+ }
1954
+ else {
1955
+ base_t = dest_t;
1956
+ }
1957
+
1958
+ if (base_t->type == GGML_TYPE_Q4_0 || base_t->type == GGML_TYPE_Q4_1) {
1959
+ if (!warned) {
1960
+ fprintf(stderr, "%s: warning: using a lora adapter with a quantized model may result in poor quality, "
1961
+ "use a f16 or f32 base model with --lora-base\n", __func__);
1962
+ warned = true;
1963
+ }
1964
+ }
1965
+
1966
+ ggml_tensor * loraA = lora_tensors[base_name + ".loraA"];
1967
+ ggml_tensor * loraB = lora_tensors[base_name + ".loraB"];
1968
+
1969
+ if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
1970
+ fprintf(stderr, "%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
1971
+ " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]);
1972
+ return 1;
1973
+ }
1974
+
1975
+ // w = w + BA*s
1976
+ ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
1977
+
1978
+ if (scaling != 1.0f) {
1979
+ ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
1980
+ BA = ggml_scale(lora_ctx, BA, scale_tensor);
1981
+ }
1982
+
1983
+ ggml_tensor * r;
1984
+ if (base_t == dest_t) {
1985
+ r = ggml_add_inplace(lora_ctx, dest_t, BA);
1986
+ }
1987
+ else {
1988
+ r = ggml_add(lora_ctx, base_t, BA);
1989
+ r = ggml_cpy(lora_ctx, r, dest_t);
1990
+ }
1991
+
1992
+ struct ggml_cgraph gf = ggml_build_forward(r);
1993
+ gf.n_threads = n_threads;
1994
+ ggml_graph_compute(lora_ctx, &gf);
1995
+
1996
+ // we won't need these tensors again, reset the context to save memory
1997
+ ggml_free(lora_ctx);
1998
+ lora_ctx = ggml_init(params);
1999
+ lora_tensors.clear();
2000
+
2001
+ n_tensors++;
2002
+ if (n_tensors % 4 == 0)
2003
+ fprintf(stderr, ".");
2004
+ }
2005
+ }
2006
+
2007
+ // TODO: this should be in a destructor; it will leak on failure
2008
+ ggml_free(lora_ctx);
2009
+ if (base_ctx) {
2010
+ ggml_free(base_ctx);
2011
+ }
2012
+
2013
+ const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
2014
+ fprintf(stderr, " done (%.2f ms)\n", t_lora_us / 1000.0);
2015
+
1680
2016
  return 0;
1681
2017
  }
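
The graph built in the loop above computes the standard LoRA update w = w + (B·A)*s, with s = alpha / r read from the adapter header. The reference loop below spells out that arithmetic on plain row-major float buffers; the function name and layouts are illustrative only, since the real code stays inside ggml (ggml_mul_mat, ggml_scale, ggml_add/ggml_cpy):

    // W is n_out x n_in, A is r x n_in, B is n_out x r, all row-major
    static void lora_merge_reference(float * W, const float * A, const float * B,
                                     int n_out, int n_in, int r, float scaling) {
        for (int i = 0; i < n_out; i++) {
            for (int j = 0; j < n_in; j++) {
                float acc = 0.0f;
                for (int k = 0; k < r; k++) {
                    acc += B[i*r + k] * A[k*n_in + j];   // (B·A)[i][j]
                }
                W[i*n_in + j] += scaling * acc;          // w = w + BA*s
            }
        }
    }
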
1682
2018
 
2019
+ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
2020
+ try {
2021
+ return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads);
2022
+ } catch (const std::string & err) {
2023
+ fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.c_str());
2024
+ return 1;
2025
+ }
2026
+ }
2027
+
1683
2028
  // Returns the KV cache that will contain the context for the
1684
2029
  // ongoing prediction with the model.
1685
2030
  const uint8_t * llama_get_kv_cache(struct llama_context * ctx) {
1686
- return ctx->model.kv_self.buf.data();
2031
+ return ctx->model.kv_self.buf.addr;
1687
2032
  }
1688
2033
 
1689
2034
  // Returns the size of the KV cache
1690
2035
  size_t llama_get_kv_cache_size(struct llama_context * ctx) {
1691
- return ctx->model.kv_self.buf.size();
2036
+ return ctx->model.kv_self.buf.size;
1692
2037
  }
1693
2038
 
1694
2039
  int llama_get_kv_cache_token_count(struct llama_context * ctx) {
@@ -1702,8 +2047,8 @@ void llama_set_kv_cache(
1702
2047
  size_t n_size,
1703
2048
  int n_token_count) {
1704
2049
  // Make sure we have the same kv cache setup
1705
- LLAMA_ASSERT(ctx->model.kv_self.buf.size() == n_size);
1706
- memcpy(ctx->model.kv_self.buf.data(), kv_cache, n_size);
2050
+ LLAMA_ASSERT(ctx->model.kv_self.buf.size == n_size);
2051
+ memcpy(ctx->model.kv_self.buf.addr, kv_cache, n_size);
1707
2052
  ctx->model.kv_self.n = n_token_count;
1708
2053
  }
1709
2054
 
@@ -1814,9 +2159,9 @@ llama_token llama_sample_top_p_top_k(
1814
2159
  void llama_print_timings(struct llama_context * ctx) {
1815
2160
  const int64_t t_end_us = ggml_time_us();
1816
2161
 
1817
- const int32_t n_sample = Max(1, ctx->n_sample);
1818
- const int32_t n_eval = Max(1, ctx->n_eval);
1819
- const int32_t n_p_eval = Max(1, ctx->n_p_eval);
2162
+ const int32_t n_sample = std::max(1, ctx->n_sample);
2163
+ const int32_t n_eval = std::max(1, ctx->n_eval);
2164
+ const int32_t n_p_eval = std::max(1, ctx->n_p_eval);
1820
2165
 
1821
2166
  fprintf(stderr, "\n");
1822
2167
  fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
@@ -1837,18 +2182,25 @@ const char * llama_print_system_info(void) {
1837
2182
  static std::string s;
1838
2183
 
1839
2184
  s = "";
1840
- s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
1841
- s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
1842
- s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
1843
- s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
1844
- s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
1845
- s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
1846
- s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
1847
- s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
1848
- s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
1849
- s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
1850
- s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
1851
- s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
2185
+ s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
2186
+ s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
2187
+ s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
2188
+ s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
2189
+ s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | ";
2190
+ s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
2191
+ s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
2192
+ s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
2193
+ s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
2194
+ s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
2195
+ s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
2196
+ s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
2197
+ s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
2198
+ s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
1852
2199
 
1853
2200
  return s.c_str();
1854
2201
  }
2202
+
2203
+ // For internal test use
2204
+ std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
2205
+ return ctx->model.tensors_by_name;
2206
+ }