llama_cpp 0.0.3 → 0.0.5

@@ -1,49 +1,33 @@
+ // Defines fileno on msys:
+ #ifndef _GNU_SOURCE
+ #define _GNU_SOURCE
+ #include <cstdint>
+ #include <cstdio>
+ #endif
+
+ #include "llama_util.h"
  #include "llama.h"

  #include "ggml.h"

+ #include <array>
+ #include <ctime>
  #include <cinttypes>
  #include <fstream>
  #include <random>
  #include <map>
  #include <unordered_map>
  #include <queue>
- #include <regex>
  #include <cassert>
  #include <cstring>
-
- #if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
- #define WIN32_LEAN_AND_MEAN
- #include <Windows.h>
- #else
- #include <sys/types.h>
- #include <sys/mman.h>
- #include <unistd.h>
- #include <fcntl.h>
- #endif
-
- #define Min(X, Y) ((Y) > (X) ? (X) : (Y))
- #define Max(X, Y) ((Y) < (X) ? (X) : (Y))
+ #include <climits>
+ #include <memory>
+ #include <algorithm>
+ #include <initializer_list>

  #define LLAMA_USE_SCRATCH
  #define LLAMA_MAX_SCRATCH_BUFFERS 16

- #define LLAMA_ASSERT(x) \
- do { \
- if (!(x)) { \
- fprintf(stderr, "LLAMA_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
- abort(); \
- } \
- } while (0)
-
-
- // determine number of model parts based on the dimension
- static const std::unordered_map<int, int> LLAMA_N_PARTS = {
- { 4096, 1 },
- { 5120, 2 },
- { 6656, 4 },
- { 8192, 8 },
- };

  // available llama models
  enum e_model {
@@ -60,47 +44,67 @@ static const size_t MB = 1024*1024;
  // TODO: dynamically determine these sizes
  // needs modifications in ggml

- static const std::map<e_model, size_t> MEM_REQ_SCRATCH0 = {
- { MODEL_7B, 512ull*MB },
- { MODEL_13B, 512ull*MB },
- { MODEL_30B, 512ull*MB },
- { MODEL_65B, 512ull*MB },
- };
+ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
+ {
+ static std::map<e_model, size_t> _MEM_REQ_SCRATCH0 = {
+ { MODEL_7B, 512ull * MB },
+ { MODEL_13B, 512ull * MB },
+ { MODEL_30B, 512ull * MB },
+ { MODEL_65B, 512ull * MB },
+ };
+ return _MEM_REQ_SCRATCH0;
+ }

- static const std::map<e_model, size_t> MEM_REQ_SCRATCH1 = {
- { MODEL_7B, 512ull*MB },
- { MODEL_13B, 512ull*MB },
- { MODEL_30B, 512ull*MB },
- { MODEL_65B, 512ull*MB },
+ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
+ {
+ static std::map<e_model, size_t> _MEM_REQ_SCRATCH1 = {
+ { MODEL_7B, 512ull * MB },
+ { MODEL_13B, 512ull * MB },
+ { MODEL_30B, 512ull * MB },
+ { MODEL_65B, 512ull * MB },
+ };
+ return _MEM_REQ_SCRATCH1;
  };

  // 2*n_embd*n_ctx*n_layer*sizeof(float16)
- static const std::map<e_model, size_t> MEM_REQ_KV_SELF = {
- { MODEL_7B, 1026ull*MB },
- { MODEL_13B, 1608ull*MB },
- { MODEL_30B, 3124ull*MB },
- { MODEL_65B, 5120ull*MB },
+ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
+ {
+ static std::map<e_model, size_t> _MEM_REQ_KV_SELF = {
+ { MODEL_7B, 1026ull * MB },
+ { MODEL_13B, 1608ull * MB },
+ { MODEL_30B, 3124ull * MB },
+ { MODEL_65B, 5120ull * MB },
+ };
+ return _MEM_REQ_KV_SELF;
  };

  // this is mostly needed for temporary mul_mat buffers to dequantize the data
  // not actually needed if BLAS is disabled
- static const std::map<e_model, size_t> MEM_REQ_EVAL = {
- { MODEL_7B, 768ull*MB },
- { MODEL_13B, 1024ull*MB },
- { MODEL_30B, 1280ull*MB },
- { MODEL_65B, 1536ull*MB },
+ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
+ {
+ static std::map<e_model, size_t> _MEM_REQ_EVAL = {
+ { MODEL_7B, 768ull * MB },
+ { MODEL_13B, 1024ull * MB },
+ { MODEL_30B, 1280ull * MB },
+ { MODEL_65B, 1536ull * MB },
+ };
+ return _MEM_REQ_EVAL;
  };

  // default hparams (LLaMA 7B)
  struct llama_hparams {
- int32_t n_vocab = 32000;
- int32_t n_ctx = 512; // this is provided as user input?
- int32_t n_embd = 4096;
- int32_t n_mult = 256;
- int32_t n_head = 32;
- int32_t n_layer = 32;
- int32_t n_rot = 64;
- int32_t f16 = 1;
+ uint32_t n_vocab = 32000;
+ uint32_t n_ctx = 512; // this is provided as user input?
+ uint32_t n_embd = 4096;
+ uint32_t n_mult = 256;
+ uint32_t n_head = 32;
+ uint32_t n_layer = 32;
+ uint32_t n_rot = 64;
+ enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
+
+ bool operator!=(const llama_hparams & other) const {
+ return memcmp(this, &other, sizeof(llama_hparams));
+ }
  };

  struct llama_layer {
@@ -126,11 +130,17 @@ struct llama_kv_cache {
  struct ggml_tensor * k;
  struct ggml_tensor * v;

- struct ggml_context * ctx;
+ struct ggml_context * ctx = NULL;

- std::vector<uint8_t> buf;
+ llama_buffer buf;

  int n; // number of tokens currently in the cache
+
+ ~llama_kv_cache() {
+ if (ctx) {
+ ggml_free(ctx);
+ }
+ }
  };

  struct llama_model {
@@ -146,22 +156,30 @@ struct llama_model {
  std::vector<llama_layer> layers;

  // context
- struct ggml_context * ctx;
+ struct ggml_context * ctx = NULL;

  // key + value cache for the self attention
  // TODO: move to llama_state
  struct llama_kv_cache kv_self;

  // the model memory buffer
- std::vector<uint8_t> buf;
+ llama_buffer buf;

  // model memory mapped file
- void * mm_addr = NULL;
- uint64_t mm_length = 0;
+ std::unique_ptr<llama_mmap> mapping;
+
+ // objects representing data potentially being locked in memory
+ llama_mlock mlock_buf;
+ llama_mlock mlock_mmap;
+
+ // for quantize-stats only
+ std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;

- // tensors
- int n_loaded;
- std::unordered_map<std::string, struct ggml_tensor *> tensors;
+ ~llama_model() {
+ if (ctx) {
+ ggml_free(ctx);
+ }
+ }
  };

  struct llama_vocab {
@@ -206,8 +224,8 @@ struct llama_context {

  // memory buffers used to evaluate the model
  // TODO: move in llama_state
- std::vector<uint8_t> buf_compute;
- std::vector<uint8_t> buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
+ llama_buffer buf_compute;
+ llama_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];

  int buf_last = 0;
  size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
@@ -220,11 +238,11 @@ struct llama_context {
  last_size = ggml_set_scratch(ctx, { 0, 0, nullptr, });
  } else {
  auto & buf = buf_scratch[i];
- last_size = ggml_set_scratch(ctx, { 0, buf.size(), buf.data(), });
+ last_size = ggml_set_scratch(ctx, { 0, buf.size, buf.addr, });
  }

  if (buf_last >= 0) {
- buf_max_size[buf_last] = Max(buf_max_size[buf_last], last_size);
+ buf_max_size[buf_last] = std::max(buf_max_size[buf_last], last_size);
  }

  buf_last = i;
@@ -244,6 +262,500 @@ struct llama_context {
244
262
  }
245
263
  };
246
264
 
265
+ template <typename T>
266
+ static T checked_mul(T a, T b) {
267
+ T ret = a * b;
268
+ if (a != 0 && ret / a != b) {
269
+ throw format("overflow multiplying %llu * %llu",
270
+ (unsigned long long) a, (unsigned long long) b);
271
+ }
272
+ return ret;
273
+ }
274
+
275
+ static size_t checked_div(size_t a, size_t b) {
276
+ if (b == 0 || a % b != 0) {
277
+ throw format("error dividing %zu / %zu", a, b);
278
+ }
279
+ return a / b;
280
+ }
281
+
282
+ static std::string llama_format_tensor_shape(const std::vector<uint32_t> & ne) {
283
+ char buf[256];
284
+ snprintf(buf, sizeof(buf), "%5u", ne.at(0));
285
+ for (size_t i = 1; i < ne.size(); i++) {
286
+ snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), " x %5u", ne.at(i));
287
+ }
288
+ return buf;
289
+ }
290
+
291
+ static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml_type type) {
292
+ size_t size = ggml_type_size(type);
293
+ for (uint32_t dim : ne) {
294
+ size = checked_mul<size_t>(size, dim);
295
+ }
296
+ return size / ggml_blck_size(type);
297
+ }
298
+
299
+ struct llama_load_tensor_shard {
300
+ std::vector<uint32_t> ne;
301
+ size_t size;
302
+ enum ggml_type type;
303
+ size_t file_idx;
304
+ size_t file_off;
305
+
306
+ void calc_size() {
307
+ size = llama_calc_tensor_size(ne, type);
308
+ }
309
+ };
310
+
311
+ enum llama_split_type {
312
+ SPLIT_NONE,
313
+ SPLIT_BY_COLUMNS,
314
+ SPLIT_BY_ROWS
315
+ };
316
+
317
+ struct llama_load_tensor {
318
+ std::vector<llama_load_tensor_shard> shards;
319
+
320
+ std::string name;
321
+ enum ggml_type type = GGML_TYPE_F32;
322
+ llama_split_type split_type = SPLIT_NONE;
323
+ std::vector<uint32_t> ne;
324
+ size_t size;
325
+ struct ggml_tensor * ggml_tensor = NULL;
326
+ uint8_t * data;
327
+
328
+ llama_load_tensor(const std::string & name) : name(name) {}
329
+
330
+ void calc_all() {
331
+ calc_type();
332
+ calc_split_type();
333
+ calc_ne();
334
+ calc_size();
335
+ }
336
+
337
+ void calc_type() {
338
+ const auto & first_shard = shards.at(0);
339
+ for (const auto & shard : shards) {
340
+ if (shard.type != first_shard.type) {
341
+ throw format("inconsistent tensor shard type in '%s'", name.c_str());
342
+ }
343
+ }
344
+ type = first_shard.type;
345
+ }
346
+
347
+ void calc_split_type() {
348
+ if (shards.at(0).ne.size() == 1 || // 1D tensors are just duplicated in every file
349
+ shards.size() == 1) { // only one file?
350
+ split_type = SPLIT_NONE;
351
+ } else if (name.find("tok_embeddings.") == 0 ||
352
+ name.find(".attention.wo.weight") != std::string::npos ||
353
+ name.find(".feed_forward.w2.weight") != std::string::npos) {
354
+ split_type = SPLIT_BY_COLUMNS;
355
+ } else {
356
+ split_type = SPLIT_BY_ROWS;
357
+ }
358
+ }
359
+
360
+ void calc_ne() {
361
+ const auto & first_shard = shards.at(0);
362
+ for (const auto & shard : shards) {
363
+ if (shard.ne != first_shard.ne) {
364
+ throw format("inconsistent tensor shard shape in '%s': first was %s, other was %s",
365
+ name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str());
366
+ }
367
+ }
368
+ ne = first_shard.ne;
369
+ LLAMA_ASSERT(shards.size() <= UINT32_MAX);
370
+ uint32_t n_shards = (uint32_t) shards.size();
371
+ switch (split_type) {
372
+ case SPLIT_NONE:
373
+ ne = first_shard.ne;
374
+ break;
375
+ case SPLIT_BY_COLUMNS:
376
+ ne = {checked_mul<uint32_t>(first_shard.ne[0], n_shards),
377
+ first_shard.ne[1]};
378
+ break;
379
+ case SPLIT_BY_ROWS:
380
+ ne = {first_shard.ne[0],
381
+ checked_mul<uint32_t>(first_shard.ne[1], n_shards)};
382
+ break;
383
+ }
384
+ }
385
+
386
+ void calc_size() {
387
+ size = llama_calc_tensor_size(ne, type);
388
+ }
389
+ };
390
+
391
+ struct llama_load_tensors_map {
392
+ // tensors is kept in a separate vector to preserve file order
393
+ std::vector<llama_load_tensor> tensors;
394
+ std::unordered_map<std::string, size_t> name_to_idx;
395
+ };
396
+
397
+ enum llama_file_version {
398
+ LLAMA_FILE_VERSION_GGML,
399
+ LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab
400
+ LLAMA_FILE_VERSION_GGJT_V1, // added padding
401
+ };
402
+
403
+ struct llama_file_loader {
404
+ llama_file file;
405
+ llama_file_version file_version;
406
+ llama_hparams hparams;
407
+ llama_vocab vocab;
408
+
409
+ llama_file_loader(const char * fname, size_t file_idx, llama_load_tensors_map & tensors_map)
410
+ : file(fname, "rb") {
411
+ fprintf(stderr, "llama.cpp: loading model from %s\n", fname);
412
+ read_magic();
413
+ read_hparams();
414
+ read_vocab();
415
+ read_tensor_metadata(file_idx, tensors_map);
416
+ }
417
+ void read_magic() {
418
+ uint32_t magic = file.read_u32();
419
+ uint32_t version = 0;
420
+
421
+ if (magic != 'ggml') {
422
+ version = file.read_u32();
423
+ }
424
+
425
+ if (magic == 'ggml' && version == 0) {
426
+ file_version = LLAMA_FILE_VERSION_GGML;
427
+ } else if (magic == 'ggmf' && version == 1) {
428
+ file_version = LLAMA_FILE_VERSION_GGMF_V1;
429
+ } else if (magic == 'ggjt' && version == 1) {
430
+ file_version = LLAMA_FILE_VERSION_GGJT_V1;
431
+ } else {
432
+ throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
433
+ magic, version);
434
+ }
435
+ }
436
+ void read_hparams() {
437
+ hparams.n_vocab = file.read_u32();
438
+ hparams.n_embd = file.read_u32();
439
+ hparams.n_mult = file.read_u32();
440
+ hparams.n_head = file.read_u32();
441
+ hparams.n_layer = file.read_u32();
442
+ hparams.n_rot = file.read_u32();
443
+ hparams.ftype = (enum llama_ftype) file.read_u32();
444
+ }
445
+ void read_vocab() {
446
+ vocab.id_to_token.resize(hparams.n_vocab);
447
+
448
+ for (uint32_t i = 0; i < hparams.n_vocab; i++) {
449
+ uint32_t len = file.read_u32();
450
+ std::string word = file.read_string(len);
451
+
452
+ float score = 0.0f;
453
+ if (file_version >= LLAMA_FILE_VERSION_GGMF_V1) {
454
+ file.read_raw(&score, sizeof(score));
455
+ }
456
+
457
+ vocab.token_to_id[word] = i;
458
+
459
+ auto & tok_score = vocab.id_to_token[i];
460
+ tok_score.tok = std::move(word);
461
+ tok_score.score = score;
462
+ }
463
+ }
464
+ void read_tensor_metadata(size_t file_idx, llama_load_tensors_map & tensors_map) {
465
+ while (file.tell() < file.size) {
466
+ llama_load_tensor_shard shard;
467
+ uint32_t n_dims = file.read_u32();
468
+ uint32_t name_len = file.read_u32();
469
+ shard.type = (enum ggml_type) file.read_u32();
470
+ shard.ne.resize(n_dims);
471
+ file.read_raw(shard.ne.data(), sizeof(shard.ne[0]) * n_dims);
472
+ std::string name = file.read_string(name_len);
473
+ if (n_dims < 1 || n_dims > 2) {
474
+ throw format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims);
475
+ }
476
+ switch (shard.type) {
477
+ case GGML_TYPE_F32:
478
+ case GGML_TYPE_F16:
479
+ case GGML_TYPE_Q4_0:
480
+ case GGML_TYPE_Q4_1:
481
+ break;
482
+ default: {
483
+ throw format("unrecognized tensor type %u\n", shard.type);
484
+ }
485
+ }
486
+
487
+ if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
488
+ // skip to the next multiple of 32 bytes
489
+ file.seek(-file.tell() & 31, SEEK_CUR);
490
+ }
491
+ shard.file_idx = file_idx;
492
+ shard.file_off = file.tell();
493
+
494
+ shard.calc_size();
495
+ file.seek(shard.size, SEEK_CUR);
496
+
497
+ auto it = tensors_map.name_to_idx.find(name);
498
+ size_t idx;
499
+ if (it != tensors_map.name_to_idx.end()) {
500
+ idx = it->second;
501
+ } else {
502
+ tensors_map.tensors.emplace_back(name);
503
+ idx = tensors_map.tensors.size() - 1;
504
+ tensors_map.name_to_idx.emplace(name, idx);
505
+ }
506
+ tensors_map.tensors.at(idx).shards.push_back(shard);
507
+ }
508
+ }
509
+ };
510
+
511
+ struct llama_file_saver {
512
+ llama_file file;
513
+ llama_file_loader * any_file_loader;
514
+ llama_file_saver(const char * fname, llama_file_loader * any_file_loader, enum llama_ftype new_ftype)
515
+ : file(fname, "wb"), any_file_loader(any_file_loader) {
516
+ fprintf(stderr, "llama.cpp: saving model to %s\n", fname);
517
+ write_magic();
518
+ write_hparams(new_ftype);
519
+ write_vocab();
520
+ }
521
+ void write_magic() {
522
+ file.write_u32('ggjt'); // magic
523
+ file.write_u32(1); // version
524
+ }
525
+ void write_hparams(enum llama_ftype new_ftype) {
526
+ const llama_hparams & hparams = any_file_loader->hparams;
527
+ file.write_u32(hparams.n_vocab);
528
+ file.write_u32(hparams.n_embd);
529
+ file.write_u32(hparams.n_mult);
530
+ file.write_u32(hparams.n_head);
531
+ file.write_u32(hparams.n_layer);
532
+ file.write_u32(hparams.n_rot);
533
+ file.write_u32(new_ftype);
534
+ }
535
+ void write_vocab() {
536
+ if (any_file_loader->file_version == LLAMA_FILE_VERSION_GGML) {
537
+ fprintf(stderr, "llama.cpp: WARNING: input is an old file that doesn't have scores; will add dummy scores\n");
538
+ }
539
+ uint32_t n_vocab = any_file_loader->hparams.n_vocab;
540
+ for (uint32_t i = 0; i < n_vocab; i++) {
541
+ const auto & token_score = any_file_loader->vocab.id_to_token.at(i);
542
+ file.write_u32((uint32_t) token_score.tok.size());
543
+ file.write_raw(token_score.tok.data(), token_score.tok.size());
544
+ file.write_raw(&token_score.score, sizeof(token_score.score));
545
+ }
546
+ }
547
+ void write_tensor(llama_load_tensor & tensor, enum ggml_type new_type, const void * new_data, size_t new_size) {
548
+ switch (new_type) {
549
+ case GGML_TYPE_F32:
550
+ case GGML_TYPE_F16:
551
+ case GGML_TYPE_Q4_0:
552
+ case GGML_TYPE_Q4_1:
553
+ break;
554
+ default: LLAMA_ASSERT(false);
555
+ }
556
+ file.write_u32((uint32_t) tensor.ne.size());
557
+ file.write_u32((uint32_t) tensor.name.size());
558
+ file.write_u32(new_type);
559
+ file.write_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * tensor.ne.size());
560
+ file.write_raw(tensor.name.data(), tensor.name.size());
561
+ file.seek(-file.tell() & 31, SEEK_CUR);
562
+ LLAMA_ASSERT(new_size == llama_calc_tensor_size(tensor.ne, new_type));
563
+ file.write_raw(new_data, new_size);
564
+ }
565
+ };
566
+
567
+ struct llama_model_loader {
568
+ std::vector<std::unique_ptr<llama_file_loader>> file_loaders;
569
+ llama_load_tensors_map tensors_map;
570
+ bool use_mmap;
571
+ size_t num_ggml_tensors_created = 0;
572
+ struct ggml_context * ggml_ctx = NULL;
573
+ std::unique_ptr<llama_mmap> mapping;
574
+
575
+ llama_model_loader(const std::string & fname_base, bool use_mmap, bool vocab_only) {
576
+ auto first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
577
+ file_loaders.emplace_back(first_file);
578
+ uint32_t n_parts = vocab_only ? 1 : guess_n_parts();
579
+ for (uint32_t i = 1; i < n_parts; i++) {
580
+ std::string fname = fname_base + "." + std::to_string(i);
581
+ auto ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
582
+ file_loaders.emplace_back(ith_file);
583
+ if (ith_file->hparams != first_file->hparams) {
584
+ throw format("llama.cpp: hparams inconsistent between files");
585
+ }
586
+ }
587
+ if (!llama_mmap::SUPPORTED) {
588
+ use_mmap = false;
589
+ }
590
+ if (use_mmap && alignment_prevents_mmap()) {
591
+ fprintf(stderr, "llama.cpp: can't use mmap because tensors are not aligned; convert to new format to avoid this\n");
592
+ use_mmap = false;
593
+ }
594
+ this->use_mmap = use_mmap;
595
+ for (llama_load_tensor & lt : tensors_map.tensors) {
596
+ lt.calc_all();
597
+ }
598
+ }
599
+
600
+ bool alignment_prevents_mmap() {
601
+ for (const llama_load_tensor & lt : tensors_map.tensors) {
602
+ for (const llama_load_tensor_shard & shard : lt.shards) {
603
+ if (shard.file_off & 3) {
604
+ return true;
605
+ }
606
+ }
607
+ }
608
+ return false;
609
+ }
610
+
611
+ uint32_t guess_n_parts() const {
612
+ auto it = tensors_map.name_to_idx.find("tok_embeddings.weight");
613
+ if (it == tensors_map.name_to_idx.end()) {
614
+ throw std::string("missing tok_embeddings.weight");
615
+ }
616
+ const llama_load_tensor & lt = tensors_map.tensors.at(it->second);
617
+ return file_loaders.at(0)->hparams.n_embd / lt.shards.at(0).ne.at(0);
618
+ }
619
+
620
+ void calc_sizes(size_t * ctx_size_p, size_t * mmapped_size_p) const {
621
+ *ctx_size_p = *mmapped_size_p = 0;
622
+ for (const llama_load_tensor & lt : tensors_map.tensors) {
623
+ *ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE;
624
+ *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size;
625
+ }
626
+ }
627
+
628
+ struct ggml_tensor * get_tensor(const std::string & name, std::vector<uint32_t> ne) {
629
+ auto it = tensors_map.name_to_idx.find(name);
630
+ if (it == tensors_map.name_to_idx.end()) {
631
+ throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
632
+ }
633
+ llama_load_tensor & lt = tensors_map.tensors.at(it->second);
634
+ if (lt.ne != ne) {
635
+ throw format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
636
+ name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
637
+ }
638
+
639
+ return get_tensor_for(lt);
640
+ }
641
+
642
+ struct ggml_tensor * get_tensor_for(llama_load_tensor & lt) {
643
+ struct ggml_tensor * tensor;
644
+ if (lt.ne.size() == 2) {
645
+ tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
646
+ } else {
647
+ LLAMA_ASSERT(lt.ne.size() == 1);
648
+ tensor = ggml_new_tensor_1d(ggml_ctx, lt.type, lt.ne.at(0));
649
+ }
650
+ LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
651
+ lt.ggml_tensor = tensor;
652
+ num_ggml_tensors_created++;
653
+ return tensor;
654
+ }
655
+
656
+ void done_getting_tensors() {
657
+ if (num_ggml_tensors_created != tensors_map.tensors.size()) {
658
+ throw std::string("llama.cpp: file contained more tensors than expected");
659
+ }
660
+ }
661
+
662
+ void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
663
+ size_t data_size = 0;
664
+ for (const llama_load_tensor & lt : tensors_map.tensors) {
665
+ data_size += lt.size;
666
+ }
667
+
668
+ if (use_mmap) {
669
+ mapping.reset(new llama_mmap(&file_loaders.at(0)->file));
670
+ if (!lmlock) {
671
+ // Don't call the callback since the actual loading will be lazy
672
+ // and we can't measure it.
673
+ progress_callback = NULL;
674
+ }
675
+ if (lmlock) {
676
+ lmlock->init(mapping->addr);
677
+ }
678
+ }
679
+
680
+ size_t done_size = 0;
681
+ for (llama_load_tensor & lt : tensors_map.tensors) {
682
+ if (progress_callback) {
683
+ progress_callback((float) done_size / data_size, progress_callback_user_data);
684
+ }
685
+ LLAMA_ASSERT(lt.ggml_tensor); // unused tensors should have been caught by load_data already
686
+ lt.data = (uint8_t *) lt.ggml_tensor->data;
687
+ load_data_for(lt);
688
+ lt.ggml_tensor->data = lt.data;
689
+ done_size += lt.size;
690
+ if (use_mmap && lmlock) {
691
+ lmlock->grow_to(done_size);
692
+ }
693
+ }
694
+ if (progress_callback) {
695
+ progress_callback(1.0f, progress_callback_user_data);
696
+ }
697
+ }
698
+
699
+ void load_data_for(llama_load_tensor & lt) {
700
+ if (use_mmap) {
701
+ LLAMA_ASSERT(lt.shards.size() == 1);
702
+ lt.data = (uint8_t *) mapping->addr + lt.shards.at(0).file_off;
703
+ } else if (lt.split_type == SPLIT_NONE) {
704
+ llama_file & file = file_loaders.at(lt.shards.at(0).file_idx)->file;
705
+ file.seek(lt.shards.at(0).file_off, SEEK_SET);
706
+ file.read_raw(lt.data, lt.size);
707
+ } else if (lt.split_type == SPLIT_BY_ROWS) {
708
+ size_t offset = 0;
709
+ for (llama_load_tensor_shard & shard : lt.shards) {
710
+ llama_file & file = file_loaders.at(shard.file_idx)->file;
711
+ file.seek(shard.file_off, SEEK_SET);
712
+ file.read_raw(lt.data + offset, shard.size);
713
+ offset += shard.size;
714
+ }
715
+ LLAMA_ASSERT(offset == lt.size);
716
+ } else if (lt.split_type == SPLIT_BY_COLUMNS) {
717
+ // Let's load the data into temporary buffers to ensure the OS performs large loads.
718
+ std::vector<llama_buffer> tmp_bufs;
719
+ tmp_bufs.resize(lt.shards.size());
720
+ for (size_t i = 0; i < lt.shards.size(); i++) {
721
+ llama_load_tensor_shard & shard = lt.shards.at(i);
722
+ llama_file & file = file_loaders.at(shard.file_idx)->file;
723
+ file.seek(shard.file_off, SEEK_SET);
724
+ tmp_bufs.at(i).resize(shard.size);
725
+ file.read_raw(tmp_bufs.at(i).addr, shard.size);
726
+ }
727
+ // Then reshape.
728
+ size_t num_rows = lt.ne.at(1);
729
+ size_t per_shard_row_size = lt.shards.at(0).size / num_rows;
730
+ size_t out_offset = 0;
731
+ for (size_t row = 0; row < num_rows; row++) {
732
+ for (llama_buffer & tmp_buf : tmp_bufs) {
733
+ memcpy(lt.data + out_offset,
734
+ tmp_buf.addr + row * per_shard_row_size,
735
+ per_shard_row_size);
736
+ out_offset += per_shard_row_size;
737
+ }
738
+ }
739
+ LLAMA_ASSERT(out_offset == lt.size);
740
+ }
741
+ if (0) {
742
+ print_checksum(lt);
743
+ }
744
+ }
745
+
746
+ static void print_checksum(llama_load_tensor & lt) {
747
+ uint32_t sum = 0;
748
+ for (size_t i = 0; i < lt.size; i++) {
749
+ uint8_t byte = lt.data[i];
750
+ sum = byte + (sum << 6) + (sum << 16) - sum; // sdbm hash
751
+ }
752
+ fprintf(stderr, "%s checksum: %#08x (%s, size %zu)\n", lt.name.c_str(), sum,
753
+ llama_format_tensor_shape(lt.ne).c_str(), lt.size);
754
+ }
755
+
756
+ };
757
+
758
+
247
759
  //
248
760
  // kv cache
249
761
  //
@@ -262,8 +774,8 @@ static bool kv_cache_init(
  cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);

  struct ggml_init_params params;
- params.mem_size = cache.buf.size();
- params.mem_buffer = cache.buf.data();
+ params.mem_size = cache.buf.size;
+ params.mem_buffer = cache.buf.addr;
  params.no_alloc = false;

  cache.ctx = ggml_init(params);
@@ -279,13 +791,6 @@ static bool kv_cache_init(
  return true;
  }

- static void kv_cache_free(struct llama_kv_cache & cache) {
- if (cache.ctx) {
- ggml_free(cache.ctx);
- cache.ctx = nullptr;
- }
- }
-
  struct llama_context_params llama_context_default_params() {
  struct llama_context_params result = {
  /*.n_ctx =*/ 512,
@@ -294,6 +799,7 @@ struct llama_context_params llama_context_default_params() {
  /*.f16_kv =*/ false,
  /*.logits_all =*/ false,
  /*.vocab_only =*/ false,
+ /*.use_mmap =*/ true,
  /*.use_mlock =*/ false,
  /*.embedding =*/ false,
  /*.progress_callback =*/ nullptr,
@@ -303,243 +809,106 @@ struct llama_context_params llama_context_default_params() {
303
809
  return result;
304
810
  }
305
811
 
812
+ bool llama_mmap_supported() {
813
+ return llama_mmap::SUPPORTED;
814
+ }
815
+
816
+ bool llama_mlock_supported() {
817
+ return llama_mlock::SUPPORTED;
818
+ }
819
+
306
820
  //
307
821
  // model loading
308
822
  //
309
823
 
310
- static void *mmap_file(const char *fname, uint64_t *mm_length) {
311
- #if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
312
- HANDLE hFile = CreateFileA(fname,
313
- GENERIC_READ,
314
- FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
315
- NULL,
316
- OPEN_EXISTING,
317
- FILE_ATTRIBUTE_NORMAL | FILE_ATTRIBUTE_NOT_CONTENT_INDEXED,
318
- NULL);
319
- if (hFile == INVALID_HANDLE_VALUE) return 0;
320
- LARGE_INTEGER fileSize;
321
- fileSize.QuadPart = -1;
322
- GetFileSizeEx(hFile, &fileSize);
323
- int64_t length = fileSize.QuadPart;
324
- HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
325
- CloseHandle(hFile);
326
- if (!hMapping) return 0;
327
- void *addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
328
- CloseHandle(hMapping);
329
- if (!addr) return 0;
330
- #else
331
- int fd = open(fname, O_RDONLY);
332
- if (fd == -1) return 0;
333
- int64_t length = lseek(fd, 0, SEEK_END);
334
- void *addr = mmap(NULL, length, PROT_READ, MAP_SHARED, fd, 0);
335
- close(fd);
336
- if (addr == MAP_FAILED) return 0;
337
- #endif
338
- *mm_length = length;
339
- return addr;
824
+ static const char *llama_file_version_name(llama_file_version version) {
825
+ switch (version) {
826
+ case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)";
827
+ case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
828
+ case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (latest)";
829
+ default: LLAMA_ASSERT(false);
830
+ }
340
831
  }
341
832
 
342
- static void munmap_file(void * addr, size_t length) {
343
- #if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
344
- UnmapViewOfFile(addr);
345
- #else
346
- munmap(addr, length);
347
- #endif
833
+ static const char *llama_ftype_name(enum llama_ftype ftype) {
834
+ switch (ftype) {
835
+ case LLAMA_FTYPE_ALL_F32: return "all F32";
836
+ case LLAMA_FTYPE_MOSTLY_F16: return "mostly F16";
837
+ case LLAMA_FTYPE_MOSTLY_Q4_0: return "mostly Q4_0";
838
+ case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
839
+ case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
840
+ return "mostly Q4_1, some F16";
841
+ default: return "unknown, may not work";
842
+ }
348
843
  }
349
844
 
350
- static bool report_bad_magic(const char *path, uint32_t got, uint32_t want) {
351
- fprintf(stderr,
352
- "%s: invalid model file (bad magic [got %#x want %#x])\n"
353
- "\tyou most likely need to regenerate your ggml files\n"
354
- "\tthe benefit is you'll get 10-100x faster load times\n"
355
- "\tsee https://github.com/ggerganov/llama.cpp/issues/91\n"
356
- "\tuse convert-pth-to-ggml.py to regenerate from original pth\n"
357
- "\tuse migrate-ggml-2023-03-30-pr613.py if you deleted originals\n",
358
- path, got, want);
359
- return false;
845
+ static const char *llama_model_type_name(e_model type) {
846
+ switch (type) {
847
+ case MODEL_7B: return "7B";
848
+ case MODEL_13B: return "13B";
849
+ case MODEL_30B: return "30B";
850
+ case MODEL_65B: return "65B";
851
+ default: LLAMA_ASSERT(false);
852
+ }
360
853
  }
361
854
 
362
- static bool llama_model_load(
855
+ static void llama_model_load_internal(
363
856
  const std::string & fname,
364
857
  llama_context & lctx,
365
858
  int n_ctx,
366
- int n_parts,
367
859
  ggml_type memory_type,
860
+ bool use_mmap,
861
+ bool use_mlock,
368
862
  bool vocab_only,
369
863
  llama_progress_callback progress_callback,
370
- void *progress_callback_user_data) {
371
- fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
864
+ void * progress_callback_user_data) {
372
865
 
373
866
  lctx.t_start_us = ggml_time_us();
374
867
 
375
- auto & model = lctx.model;
376
- auto & vocab = lctx.vocab;
868
+ std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap, vocab_only));
377
869
 
378
- auto fin = std::ifstream(fname, std::ios::binary);
379
- if (!fin) {
380
- fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
381
- return false;
382
- }
383
-
384
- std::vector<char> f_buf(1024*1024);
385
- fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
386
-
387
- fin.seekg(0, fin.end);
388
- const size_t file_size = fin.tellg();
389
- fin.seekg(0);
870
+ lctx.vocab = std::move(ml->file_loaders.at(0)->vocab);
871
+ auto & model = lctx.model;
872
+ model.hparams = ml->file_loaders.at(0)->hparams;
873
+ llama_file_version file_version = ml->file_loaders.at(0)->file_version;
874
+ auto & hparams = model.hparams;
875
+ uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
390
876
 
391
- // verify magic
392
877
  {
393
- uint32_t magic;
394
- fin.read((char *) &magic, sizeof(magic));
395
- if (magic == LLAMA_FILE_MAGIC_UNVERSIONED) {
396
- fprintf(stderr, "%s: invalid model file '%s' (too old, regenerate your model files or convert them with convert-unversioned-ggml-to-ggml.py!)\n",
397
- __func__, fname.c_str());
398
- return false;
878
+ switch (hparams.n_layer) {
879
+ case 32: model.type = e_model::MODEL_7B; break;
880
+ case 40: model.type = e_model::MODEL_13B; break;
881
+ case 60: model.type = e_model::MODEL_30B; break;
882
+ case 80: model.type = e_model::MODEL_65B; break;
399
883
  }
400
- if (magic != LLAMA_FILE_MAGIC) {
401
- return report_bad_magic(fname.c_str(), magic, LLAMA_FILE_MAGIC);
402
- }
403
-
404
- uint32_t format_version;
405
- fin.read((char *) &format_version, sizeof(format_version));
406
-
407
- if (format_version != LLAMA_FILE_VERSION) {
408
- fprintf(stderr, "%s: invalid model file '%s' (unsupported format version %" PRIu32 ", expected %d)\n",
409
- __func__, fname.c_str(), format_version, LLAMA_FILE_VERSION);
410
- return false;
411
- }
412
- }
413
-
414
- int n_ff = 0;
415
-
416
- // load hparams
417
- {
418
- auto & hparams = model.hparams;
419
-
420
- fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
421
- //fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
422
- fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
423
- fin.read((char *) &hparams.n_mult, sizeof(hparams.n_mult));
424
- fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
425
- fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
426
- fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
427
- fin.read((char *) &hparams.f16, sizeof(hparams.f16));
428
884
 
429
885
  hparams.n_ctx = n_ctx;
430
-
431
- n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
432
-
433
- if (n_parts < 1) {
434
- n_parts = LLAMA_N_PARTS.at(hparams.n_embd);
435
- }
436
-
437
- // temp warning to tell the user to use "--n_parts"
438
- if (hparams.f16 == 4 && n_parts != 1) {
439
- fprintf(stderr, "%s: GPTQ model detected - are you sure n_parts should be %d? we normally expect it to be 1\n", __func__, n_parts);
440
- fprintf(stderr, "%s: use '--n_parts 1' if necessary\n", __func__);
441
- }
442
-
443
- if (hparams.n_layer == 32) {
444
- model.type = e_model::MODEL_7B;
445
- }
446
-
447
- if (hparams.n_layer == 40) {
448
- model.type = e_model::MODEL_13B;
449
- }
450
-
451
- if (hparams.n_layer == 60) {
452
- model.type = e_model::MODEL_30B;
453
- }
454
-
455
- if (hparams.n_layer == 80) {
456
- model.type = e_model::MODEL_65B;
457
- }
458
-
459
- fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab);
460
- fprintf(stderr, "%s: n_ctx = %d\n", __func__, hparams.n_ctx);
461
- fprintf(stderr, "%s: n_embd = %d\n", __func__, hparams.n_embd);
462
- fprintf(stderr, "%s: n_mult = %d\n", __func__, hparams.n_mult);
463
- fprintf(stderr, "%s: n_head = %d\n", __func__, hparams.n_head);
464
- fprintf(stderr, "%s: n_layer = %d\n", __func__, hparams.n_layer);
465
- fprintf(stderr, "%s: n_rot = %d\n", __func__, hparams.n_rot);
466
- fprintf(stderr, "%s: f16 = %d\n", __func__, hparams.f16);
467
- fprintf(stderr, "%s: n_ff = %d\n", __func__, n_ff);
468
- fprintf(stderr, "%s: n_parts = %d\n", __func__, n_parts);
469
- fprintf(stderr, "%s: type = %d\n", __func__, model.type);
470
886
  }
471
887
 
472
- // load vocab
473
888
  {
474
- std::string word;
475
- vocab.id_to_token.resize(model.hparams.n_vocab);
476
- std::vector<char> tmp(64);
477
-
478
- for (int i = 0; i < model.hparams.n_vocab; i++) {
479
- uint32_t len;
480
- fin.read((char *) &len, sizeof(len));
481
-
482
- word.resize(len);
483
- if (len > 0) {
484
- tmp.resize(len);
485
- fin.read(tmp.data(), len);
486
- word.assign(tmp.data(), len);
487
- } else {
488
- word.clear();
489
- }
490
-
491
- float score;
492
- fin.read((char *) &score, sizeof(score));
493
-
494
- vocab.token_to_id[word] = i;
495
-
496
- auto &tok_score = vocab.id_to_token[i];
497
- tok_score.tok = word;
498
- tok_score.score = score;
499
- }
889
+ fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version));
890
+ fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab);
891
+ fprintf(stderr, "%s: n_ctx = %u\n", __func__, hparams.n_ctx);
892
+ fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd);
893
+ fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult);
894
+ fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head);
895
+ fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer);
896
+ fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot);
897
+ fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
898
+ fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
899
+ fprintf(stderr, "%s: n_parts = %zu\n", __func__, ml->file_loaders.size());
900
+ fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
500
901
  }
501
902
 
502
903
  if (vocab_only) {
503
- return true;
504
- }
505
-
506
- // for the big tensors, we have the option to store the data in 16-bit floats or quantized
507
- // in order to save memory and also to speed up the computation
508
- // wtype is for per-layer weights, while vtype is for other weights
509
- ggml_type wtype, vtype;
510
- switch (model.hparams.f16) {
511
- case 0: wtype = vtype = GGML_TYPE_F32; break;
512
- case 1: wtype = vtype = GGML_TYPE_F16; break;
513
- case 2: wtype = vtype = GGML_TYPE_Q4_0; break;
514
- case 3: wtype = vtype = GGML_TYPE_Q4_1; break;
515
- case 4: wtype = GGML_TYPE_Q4_1; vtype = GGML_TYPE_F16; break;
516
- default:
517
- {
518
- fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n",
519
- __func__, fname.c_str(), model.hparams.f16);
520
- return false;
521
- }
904
+ return;
522
905
  }
523
906
 
524
- // map model into memory
525
- char *mm_addr = NULL;
526
- model.mm_addr = mmap_file(fname.c_str(), &model.mm_length);
527
- if (model.mm_addr == NULL) {
528
- fprintf(stderr, "%s: failed to mmap '%s'\n", __func__, fname.c_str());
529
- return false;
530
- }
531
- mm_addr = (char *)model.mm_addr;
532
- fprintf(stderr, "%s: ggml map size = %6.2f MB\n", __func__, model.mm_length/(1024.0*1024.0));
533
-
534
907
  auto & ctx = model.ctx;
535
908
 
536
- size_t ctx_size = 0;
537
- {
538
- const auto &hparams = model.hparams;
539
- const int n_layer = hparams.n_layer;
540
- ctx_size += (5 + 10*n_layer)*256; // object overhead
541
- fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0);
542
- }
909
+ size_t ctx_size, mmapped_size;
910
+ ml->calc_sizes(&ctx_size, &mmapped_size);
911
+ fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0);
543
912
 
544
913
  // print memory requirements
545
914
  {
@@ -548,14 +917,14 @@ static bool llama_model_load(
548
917
  // this is the total memory required to run the inference
549
918
  const size_t mem_required =
550
919
  ctx_size +
551
- model.mm_length +
552
- MEM_REQ_SCRATCH0.at(model.type) +
553
- MEM_REQ_SCRATCH1.at(model.type) +
554
- MEM_REQ_EVAL.at (model.type);
920
+ mmapped_size +
921
+ MEM_REQ_SCRATCH0().at(model.type) +
922
+ MEM_REQ_SCRATCH1().at(model.type) +
923
+ MEM_REQ_EVAL().at(model.type);
555
924
 
556
925
  // this is the memory required by one llama_state
557
926
  const size_t mem_required_state =
558
- scale*MEM_REQ_KV_SELF.at(model.type);
927
+ scale*MEM_REQ_KV_SELF().at(model.type);
559
928
 
560
929
  fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
561
930
  mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
@@ -564,17 +933,20 @@ static bool llama_model_load(
564
933
  // create the ggml context
565
934
  {
566
935
  lctx.model.buf.resize(ctx_size);
936
+ if (use_mlock) {
937
+ lctx.model.mlock_buf.init(lctx.model.buf.addr);
938
+ lctx.model.mlock_buf.grow_to(lctx.model.buf.size);
939
+ }
567
940
 
568
941
  struct ggml_init_params params = {
569
- /*.mem_size =*/ lctx.model.buf.size(),
570
- /*.mem_buffer =*/ lctx.model.buf.data(),
571
- /*.no_alloc =*/ true,
942
+ /*.mem_size =*/ lctx.model.buf.size,
943
+ /*.mem_buffer =*/ lctx.model.buf.addr,
944
+ /*.no_alloc =*/ ml->use_mmap,
572
945
  };
573
946
 
574
947
  model.ctx = ggml_init(params);
575
948
  if (!model.ctx) {
576
- fprintf(stderr, "%s: ggml_init() failed\n", __func__);
577
- return false;
949
+ throw format("ggml_init() failed");
578
950
  }
579
951
  }
580
952
 
@@ -582,161 +954,71 @@ static bool llama_model_load(
582
954
  {
583
955
  const auto & hparams = model.hparams;
584
956
 
585
- const int n_embd = hparams.n_embd;
586
- const int n_layer = hparams.n_layer;
587
- const int n_vocab = hparams.n_vocab;
588
-
589
- model.layers.resize(n_layer);
590
-
591
- model.tok_embeddings = ggml_new_tensor_2d(ctx, vtype, n_embd, n_vocab);
957
+ const uint32_t n_embd = hparams.n_embd;
958
+ const uint32_t n_layer = hparams.n_layer;
959
+ const uint32_t n_vocab = hparams.n_vocab;
592
960
 
593
- model.norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
594
- model.output = ggml_new_tensor_2d(ctx, vtype, n_embd, n_vocab);
961
+ ml->ggml_ctx = ctx;
595
962
 
596
- // map by name
597
- model.tensors["tok_embeddings.weight"] = model.tok_embeddings;
963
+ model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab});
964
+ model.norm = ml->get_tensor("norm.weight", {n_embd});
965
+ model.output = ml->get_tensor("output.weight", {n_embd, n_vocab});
598
966
 
599
- model.tensors["norm.weight"] = model.norm;
600
- model.tensors["output.weight"] = model.output;
601
-
602
- for (int i = 0; i < n_layer; ++i) {
967
+ model.layers.resize(n_layer);
968
+ for (uint32_t i = 0; i < n_layer; ++i) {
603
969
  auto & layer = model.layers[i];
604
970
 
605
- layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
606
-
607
- layer.wq = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
608
- layer.wk = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
609
- layer.wv = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
610
- layer.wo = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
611
-
612
- layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
971
+ std::string layers_i = "layers." + std::to_string(i);
613
972
 
614
- layer.w1 = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ff);
615
- layer.w2 = ggml_new_tensor_2d(ctx, wtype, n_ff, n_embd);
616
- layer.w3 = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ff);
973
+ layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd});
617
974
 
618
- // map by name
619
- model.tensors["layers." + std::to_string(i) + ".attention_norm.weight"] = layer.attention_norm;
975
+ layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd});
976
+ layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd});
977
+ layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd});
978
+ layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd});
620
979
 
621
- model.tensors["layers." + std::to_string(i) + ".attention.wq.weight"] = layer.wq;
622
- model.tensors["layers." + std::to_string(i) + ".attention.wk.weight"] = layer.wk;
623
- model.tensors["layers." + std::to_string(i) + ".attention.wv.weight"] = layer.wv;
624
- model.tensors["layers." + std::to_string(i) + ".attention.wo.weight"] = layer.wo;
980
+ layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd});
625
981
 
626
- model.tensors["layers." + std::to_string(i) + ".ffn_norm.weight"] = layer.ffn_norm;
627
-
628
- model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.weight"] = layer.w1;
629
- model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight"] = layer.w2;
630
- model.tensors["layers." + std::to_string(i) + ".feed_forward.w3.weight"] = layer.w3;
982
+ layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff});
983
+ layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd});
984
+ layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff});
631
985
  }
632
986
  }
633
987
 
634
- std::vector<uint8_t> tmp;
988
+ ml->done_getting_tensors();
635
989
 
636
- if (progress_callback) {
637
- progress_callback(0.0, progress_callback_user_data);
990
+ // populate `tensors_by_name`
991
+ for (llama_load_tensor & lt : ml->tensors_map.tensors) {
992
+ model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
638
993
  }
639
994
 
640
- fprintf(stderr, "%s: loading tensors from '%s'\n", __func__, fname.c_str());
641
-
642
- // load weights
643
- {
644
- size_t total_size = 0;
645
- model.n_loaded = 0;
646
-
647
- while (true) {
648
- int32_t n_dims;
649
- int32_t length;
650
- int32_t ftype;
651
-
652
- fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
653
- fin.read(reinterpret_cast<char *>(&length), sizeof(length));
654
- fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
655
-
656
- if (fin.eof()) {
657
- break;
658
- }
659
-
660
- int32_t nelements = 1;
661
- int32_t ne[2] = { 1, 1 };
662
- for (int i = 0; i < n_dims; ++i) {
663
- fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
664
- nelements *= ne[i];
665
- }
666
-
667
- std::string name(length, 0);
668
- fin.read(&name[0], length);
669
-
670
- if (model.tensors.find(name.data()) == model.tensors.end()) {
671
- fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
672
- return false;
673
- }
674
-
675
- auto tensor = model.tensors[name.data()];
676
-
677
- if (ggml_nelements(tensor) != nelements) {
678
- fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
679
- return false;
680
- }
681
- if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
682
- fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%" PRId64 ", %" PRId64 "], expected [%d, %d]\n",
683
- __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
684
- return false;
685
- }
686
- if (0) {
687
- static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
688
- fprintf(stderr, "%24s - [%5d, %5d], type = %6s\n", name.data(), ne[0], ne[1], ftype_str[ftype]);
689
- }
690
-
691
- switch (ftype) {
692
- case 0: // f32
693
- case 1: // f16
694
- break;
695
- case 2: // q4_0
696
- case 3: // q4_1
697
- assert(ne[0] % 64 == 0);
698
- break;
699
- default:
700
- fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
701
- return false;
702
- };
703
-
704
- // load the tensor data into memory without copying or reading it
705
- size_t offset = fin.tellg();
706
- size_t tensor_data_size = ggml_nbytes(tensor);
707
- offset = (offset + 31) & -32;
708
- tensor->data = mm_addr + offset;
709
- fin.seekg(offset + tensor_data_size);
710
- total_size += tensor_data_size;
711
- model.n_loaded++;
712
-
713
- // progress
714
- if (progress_callback) {
715
- double current_progress = size_t(fin.tellg()) / double(file_size);
716
- progress_callback(current_progress, progress_callback_user_data);
717
- }
718
- }
719
-
720
- fin.close();
995
+ ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
721
996
 
722
- fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, model.n_loaded);
723
- if (model.n_loaded == 0) {
724
- fprintf(stderr, "%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__);
725
- } else if (model.n_loaded != (int) model.tensors.size()) {
726
- fprintf(stderr, "%s: ERROR not all tensors loaded from model file - expected %zu, got %d\n", __func__, model.tensors.size(), model.n_loaded);
727
- return false;
728
- }
729
- }
997
+ model.mapping = std::move(ml->mapping);
730
998
 
731
999
  // loading time will be recalculate after the first eval, so
732
1000
  // we take page faults deferred by mmap() into consideration
733
1001
  lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
1002
+ }
734
1003
 
735
- if (progress_callback) {
736
- progress_callback(1.0, progress_callback_user_data);
1004
+ static bool llama_model_load(
1005
+ const std::string & fname,
1006
+ llama_context & lctx,
1007
+ int n_ctx,
1008
+ ggml_type memory_type,
1009
+ bool use_mmap,
1010
+ bool use_mlock,
1011
+ bool vocab_only,
1012
+ llama_progress_callback progress_callback,
1013
+ void *progress_callback_user_data) {
1014
+ try {
1015
+ llama_model_load_internal(fname, lctx, n_ctx, memory_type, use_mmap, use_mlock,
1016
+ vocab_only, progress_callback, progress_callback_user_data);
1017
+ return true;
1018
+ } catch (const std::string & err) {
1019
+ fprintf(stderr, "error loading model: %s\n", err.c_str());
1020
+ return false;
737
1021
  }
738
-
739
- return true;
740
1022
  }
741
1023
 
742
1024
  // evaluate the transformer
@@ -774,8 +1056,8 @@ static bool llama_eval_internal(
  auto & buf_compute = lctx.buf_compute;

  struct ggml_init_params params = {
- /*.mem_size =*/ buf_compute.size(),
- /*.mem_buffer =*/ buf_compute.data(),
+ /*.mem_size =*/ buf_compute.size,
+ /*.mem_buffer =*/ buf_compute.addr,
  /*.no_alloc =*/ false,
  };

@@ -1061,7 +1343,7 @@ struct llama_tokenizer {
  size_t offs = 0;
  while (offs < text.size()) {
  llama_sp_symbol sym;
- size_t char_len = Min(text.size() - offs, utf8_len(text[offs]));
+ size_t char_len = std::min(text.size() - offs, utf8_len(text[offs]));
  sym.text = text.c_str() + offs;
  sym.n = char_len;
  offs += char_len;
@@ -1236,7 +1518,7 @@ static llama_vocab::id llama_sample_top_p_top_k(
  }
  }

- sample_top_k(logits_id, top_k > 0 ? Min(top_k, n_logits) : n_logits);
+ sample_top_k(logits_id, top_k > 0 ? std::min(top_k, n_logits) : n_logits);

  // compute probs for the top k tokens
  std::vector<float> probs;
@@ -1284,298 +1566,118 @@ static llama_vocab::id llama_sample_top_p_top_k(
1284
1566
  // quantization
1285
1567
  //
1286
1568
 
1287
- // TODO: reuse code from the llama_model_load() somehow
1288
- static bool llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, int itype) {
1289
- ggml_type type = GGML_TYPE_Q4_1;
1290
-
1291
- switch (itype) {
1292
- case 2: type = GGML_TYPE_Q4_0; break;
1293
- case 3: type = GGML_TYPE_Q4_1; break;
1294
- default: fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype); return 1;
1569
+ static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype) {
1570
+ ggml_type quantized_type;
1571
+ switch (ftype) {
1572
+ case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
1573
+ case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
1574
+ default: throw format("invalid output file type %d\n", ftype);
1295
1575
  };
1296
1576
 
1297
- if (type != GGML_TYPE_Q4_0 && type != GGML_TYPE_Q4_1) {
1298
- fprintf(stderr, "%s: invalid quantization type %d\n", __func__, type);
1299
- return false;
1300
- }
1301
-
1302
- llama_vocab vocab;
1303
-
1304
- printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
1305
-
1306
- auto finp = std::ifstream(fname_inp, std::ios::binary);
1307
- if (!finp) {
1308
- fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__, fname_inp.c_str());
1309
- return false;
1310
- }
1311
-
1312
- auto fout = std::ofstream(fname_out, std::ios::binary);
1313
- if (!fout) {
1314
- fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname_out.c_str());
1315
- return false;
1316
- }
1317
-
1318
- // verify magic
1319
- {
1320
- uint32_t magic;
1321
- finp.read((char *) &magic, sizeof(magic));
1322
- if (magic == LLAMA_FILE_MAGIC_UNVERSIONED) {
1323
- fprintf(stderr, "%s: invalid model file '%s' (too old, regenerate your model files!)\n",
1324
- __func__, fname_inp.c_str());
1325
- return false;
1326
- }
1327
- if (magic != LLAMA_FILE_MAGIC) {
1328
- return report_bad_magic(fname_inp.c_str(), magic, LLAMA_FILE_MAGIC);
1329
- }
1330
-
1331
- fout.write((char *) &magic, sizeof(magic));
1332
-
1333
- uint32_t format_version;
1334
- finp.read((char *) &format_version, sizeof(format_version));
1335
-
1336
- if (format_version != LLAMA_FILE_VERSION) {
1337
- fprintf(stderr, "%s: invalid model file '%s' (unsupported format version %" PRIu32 ", expected %d)\n",
1338
- __func__, fname_inp.c_str(), format_version, LLAMA_FILE_VERSION);
1339
- return false;
1340
- }
1341
-
1342
- fout.write((char *) &format_version, sizeof(format_version));
1343
- }
1344
-
1345
- llama_hparams hparams;
1346
-
1347
- // load hparams
1348
- {
1349
- finp.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
1350
- //finp.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
1351
- finp.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
1352
- finp.read((char *) &hparams.n_mult, sizeof(hparams.n_mult));
1353
- finp.read((char *) &hparams.n_head, sizeof(hparams.n_head));
1354
- finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
1355
- finp.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
1356
- finp.read((char *) &hparams.f16, sizeof(hparams.f16));
1357
-
1358
- printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
1359
- printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
1360
- printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
1361
- printf("%s: n_mult = %d\n", __func__, hparams.n_mult);
1362
- printf("%s: n_head = %d\n", __func__, hparams.n_head);
1363
- printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
1364
- printf("%s: f16 = %d\n", __func__, hparams.f16);
1365
-
1366
- fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
1367
- //fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
1368
- fout.write((char *) &hparams.n_embd, sizeof(hparams.n_embd));
1369
- fout.write((char *) &hparams.n_mult, sizeof(hparams.n_mult));
1370
- fout.write((char *) &hparams.n_head, sizeof(hparams.n_head));
1371
- fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
1372
- fout.write((char *) &hparams.n_rot, sizeof(hparams.n_rot));
1373
- fout.write((char *) &itype, sizeof(hparams.f16));
1374
- }
1375
-
1376
- // load vocab
1377
- {
1378
- const int32_t n_vocab = hparams.n_vocab;
1379
-
1380
- if (n_vocab != hparams.n_vocab) {
1381
- fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
1382
- __func__, fname_inp.c_str(), n_vocab, hparams.n_vocab);
1383
- return false;
1384
- }
1385
-
1386
- std::vector<char> word(32);
1387
- vocab.id_to_token.resize(n_vocab);
1388
- for (int i = 0; i < n_vocab; i++) {
1389
- uint32_t len;
1390
- finp.read ((char *) &len, sizeof(len));
1391
- fout.write((char *) &len, sizeof(len));
1392
-
1393
- word.resize(len);
1394
- finp.read ((char *) &word[0], len);
1395
- fout.write((char *) &word[0], len);
1396
-
1397
- float score;
1398
- finp.read ((char *) &score, sizeof(score));
1399
- fout.write((char *) &score, sizeof(score));
1400
-
1401
- vocab.token_to_id[word.data()] = i;
1402
-
1403
- auto &tok_score = vocab.id_to_token[i];
1404
- tok_score.tok = word.data();
1405
- tok_score.score = score;
1406
- }
1407
- }
1408
-
1409
- // load weights
1410
- {
1411
- size_t total_size_org = 0;
1412
- size_t total_size_new = 0;
1413
-
1414
- std::vector<float> work;
1415
-
1416
- std::vector<uint8_t> data_u8;
1417
- std::vector<ggml_fp16_t> data_f16;
1418
- std::vector<float> data_f32;
1419
-
1420
- std::vector<int64_t> hist_all(1 << 4, 0);
1421
-
1422
- while (true) {
1423
- int32_t n_dims;
1424
- int32_t length;
1425
- int32_t ftype;
1426
-
1427
- finp.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
1428
- finp.read(reinterpret_cast<char *>(&length), sizeof(length));
1429
- finp.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
1430
-
1431
- if (finp.eof()) {
1432
- break;
1433
- }
1434
-
1435
- int32_t nelements = 1;
1436
- int32_t ne[2] = { 1, 1 };
1437
- for (int i = 0; i < n_dims; ++i) {
1438
- finp.read (reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
1439
- nelements *= ne[i];
1440
- }
1441
-
1442
- std::string name(length, 0);
1443
- finp.read (&name[0], length);
1444
-
1445
- {
1446
- // ensure tensor data is aligned
1447
- uint64_t offset = finp.tellg();
1448
- offset = (offset + 31) & -32;
1449
- finp.seekg(offset);
1450
- }
1451
-
1452
- {
1453
- static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
1454
- printf("%48s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ftype_str[ftype]);
1455
- }
1456
-
1457
- // regexes of tensor names to be quantized
1458
- const std::vector<std::string> k_names = {
1459
- ".*weight",
1460
- };
1461
-
1462
- bool quantize = false;
1463
- for (const auto & s : k_names) {
1464
- if (std::regex_match(name, std::regex(s))) {
1465
- quantize = true;
1466
- break;
1467
- }
1468
- }
1469
-
1470
- // quantize only 2D tensors
1471
- quantize &= (n_dims == 2);
1472
-
1473
- if (quantize) {
1474
- if (ftype != 0 && ftype != 1) {
1475
- fprintf(stderr, "%s: unsupported ftype %d for integer quantization\n", __func__, ftype);
1476
- return false;
1477
- }
1478
-
1479
- if (ftype == 1) {
1480
- data_f16.resize(nelements);
1481
- finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
1482
- data_f32.resize(nelements);
1483
- for (int i = 0; i < nelements; ++i) {
1484
- data_f32[i] = ggml_fp16_to_fp32(data_f16[i]);
1485
- }
1486
- } else {
1487
- data_f32.resize(nelements);
1488
- finp.read(reinterpret_cast<char *>(data_f32.data()), nelements * sizeof(float));
1577
+ std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false,
1578
+ /*vocab_only*/ false));
1579
+ llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
1580
+
1581
+ size_t total_size_org = 0;
1582
+ size_t total_size_new = 0;
1583
+ std::vector<int64_t> hist_all(1 << 4, 0);
1584
+
1585
+ size_t idx = 0;
1586
+ for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) {
1587
+ llama_buffer read_data;
1588
+ read_data.resize(tensor.size);
1589
+ tensor.data = read_data.addr;
1590
+ model_loader->load_data_for(tensor);
1591
+
1592
+ printf("[%4zu/%4zu] %36s - %16s, type = %6s, ",
1593
+ ++idx, model_loader->tensors_map.tensors.size(),
1594
+ tensor.name.c_str(), llama_format_tensor_shape(tensor.ne).c_str(),
1595
+ ggml_type_name(tensor.type));
1596
+
1597
+ // This used to be a regex, but <regex> has an extreme cost to compile times.
1598
+ bool quantize = tensor.name.rfind("weight") == tensor.name.size() - 6; // ends with 'weight'?
1599
+
1600
+ // quantize only 2D tensors
1601
+ quantize &= (tensor.ne.size() == 2);
1602
+
1603
+ enum ggml_type new_type;
1604
+ void * new_data;
1605
+ size_t new_size;
1606
+ llama_buffer work;
1607
+
1608
+ if (!quantize) {
1609
+ new_type = tensor.type;
1610
+ new_data = tensor.data;
1611
+ new_size = tensor.size;
1612
+ printf("size = %8.3f MB\n", tensor.size/1024.0/1024.0);
1613
+ } else {
1614
+ new_type = quantized_type;
1615
+ float * f32_data;
1616
+ size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
1617
+ llama_buffer f32_conv_buf;
1618
+ if (tensor.type == GGML_TYPE_F32) {
1619
+ f32_data = (float *) tensor.data;
1620
+ } else if (tensor.type == GGML_TYPE_F16) {
1621
+ f32_conv_buf.resize(nelements * sizeof(float));
1622
+ f32_data = (float *) f32_conv_buf.addr;
1623
+ auto f16_data = (const ggml_fp16_t *) tensor.data;
1624
+ for (size_t i = 0; i < nelements; i++) {
1625
+ f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
1489
1626
  }
1490
-
1491
- ftype = itype;
1492
1627
  } else {
1493
- const int bpe = (ftype == 0) ? sizeof(float) : sizeof(uint16_t);
1494
-
1495
- data_u8.resize(nelements*bpe);
1496
- finp.read(reinterpret_cast<char *>(data_u8.data()), nelements * bpe);
1628
+ throw format("type %s unsupported for integer quantization", ggml_type_name(tensor.type));
1497
1629
  }
1498
1630
 
1499
- fout.write(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
1500
- fout.write(reinterpret_cast<char *>(&length), sizeof(length));
1501
- fout.write(reinterpret_cast<char *>(&ftype), sizeof(ftype));
1502
- for (int i = 0; i < n_dims; ++i) {
1503
- fout.write(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
1631
+ printf("quantizing .. ");
1632
+ fflush(stdout);
1633
+
1634
+ work.resize(nelements * 4); // upper bound on size
1635
+ new_data = work.addr;
1636
+ std::vector<int64_t> hist_cur(1 << 4, 0);
1637
+
1638
+ switch (new_type) {
1639
+ case GGML_TYPE_Q4_0:
1640
+ {
1641
+ new_size = ggml_quantize_q4_0(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
1642
+ } break;
1643
+ case GGML_TYPE_Q4_1:
1644
+ {
1645
+ new_size = ggml_quantize_q4_1(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
1646
+ } break;
1647
+ default:
1648
+ LLAMA_ASSERT(false);
1504
1649
  }
1505
- fout.write(&name[0], length);
1506
1650
 
1507
- {
1508
- // ensure tensor data is aligned
1509
- uint64_t offset = fout.tellp();
1510
- offset = (offset + 31) & -32;
1511
- fout.seekp(offset);
1651
+ printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
1652
+ for (size_t i = 0; i < hist_cur.size(); i++) {
1653
+ hist_all[i] += hist_cur[i];
1512
1654
  }
1513
1655
 
1514
- if (quantize) {
1515
- printf("quantizing .. ");
1516
- work.resize(nelements); // for quantization
1517
-
1518
- size_t cur_size = 0;
1519
- std::vector<int64_t> hist_cur(1 << 4, 0);
1520
-
1521
- switch (type) {
1522
- case GGML_TYPE_Q4_0:
1523
- {
1524
- cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
1525
- } break;
1526
- case GGML_TYPE_Q4_1:
1527
- {
1528
- cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
1529
- } break;
1530
- default:
1531
- {
1532
- fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, type);
1533
- return false;
1534
- }
1535
- }
1536
-
1537
- fout.write(reinterpret_cast<char *>(work.data()), cur_size);
1538
- total_size_new += cur_size;
1539
-
1540
- printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
1541
- for (int i = 0; i < (int) hist_cur.size(); ++i) {
1542
- hist_all[i] += hist_cur[i];
1543
- }
1544
-
1545
- for (int i = 0; i < (int) hist_cur.size(); ++i) {
1546
- printf("%5.3f ", hist_cur[i] / float(nelements));
1547
- }
1548
- printf("\n");
1549
- } else {
1550
- printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0);
1551
- fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size());
1552
- total_size_new += data_u8.size();
1656
+ for (size_t i = 0; i < hist_cur.size(); i++) {
1657
+ printf("%5.3f ", hist_cur[i] / float(nelements));
1553
1658
  }
1554
-
1555
- total_size_org += nelements * sizeof(float);
1659
+ printf("\n");
1556
1660
  }
1661
+ total_size_org += tensor.size;
1662
+ total_size_new += new_size;
1663
+ file_saver.write_tensor(tensor, new_type, new_data, new_size);
1664
+ }
1557
1665
 
1558
- printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
1559
- printf("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
1666
+ printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
1667
+ printf("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
1560
1668
 
1561
- {
1562
- int64_t sum_all = 0;
1563
- for (int i = 0; i < (int) hist_all.size(); ++i) {
1564
- sum_all += hist_all[i];
1565
- }
1669
+ {
1670
+ int64_t sum_all = 0;
1671
+ for (size_t i = 0; i < hist_all.size(); i++) {
1672
+ sum_all += hist_all[i];
1673
+ }
1566
1674
 
1567
- printf("%s: hist: ", __func__);
1568
- for (int i = 0; i < (int) hist_all.size(); ++i) {
1569
- printf("%5.3f ", hist_all[i] / float(sum_all));
1570
- }
1571
- printf("\n");
1675
+ printf("%s: hist: ", __func__);
1676
+ for (size_t i = 0; i < hist_all.size(); i++) {
1677
+ printf("%5.3f ", hist_all[i] / float(sum_all));
1572
1678
  }
1679
+ printf("\n");
1573
1680
  }
1574
-
1575
- finp.close();
1576
- fout.close();
1577
-
1578
- return true;
1579
1681
  }
1580
1682
 
1581
1683
  //
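The tensor-name filter in the new quantization loop above replaces the old ".*weight" regex with a plain suffix comparison, keeping <regex> out of the build. A minimal sketch of an equivalent check, with ends_with as a hypothetical helper that is not part of this diff:

    #include <string>

    // True when `name` ends with `suffix`; no std::regex involved.
    static bool ends_with(const std::string & name, const std::string & suffix) {
        return name.size() >= suffix.size() &&
               name.compare(name.size() - suffix.size(), suffix.size(), suffix) == 0;
    }

    // e.g. quantize only 2-D tensors whose name ends with "weight":
    // bool quantize = ends_with(tensor.name, "weight") && tensor.ne.size() == 2;

Unlike the rfind one-liner in the new code, this helper also handles names shorter than the suffix explicitly.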
@@ -1593,32 +1695,36 @@ struct llama_context * llama_init_from_file(
1593
1695
  params.seed = time(NULL);
1594
1696
  }
1595
1697
 
1698
+ unsigned cur_percentage = 0;
1699
+ if (params.progress_callback == NULL) {
1700
+ params.progress_callback_user_data = &cur_percentage;
1701
+ params.progress_callback = [](float progress, void * ctx) {
1702
+ unsigned * cur_percentage_p = (unsigned *) ctx;
1703
+ unsigned percentage = (unsigned) (100 * progress);
1704
+ while (percentage > *cur_percentage_p) {
1705
+ ++*cur_percentage_p;
1706
+ fprintf(stderr, ".");
1707
+ fflush(stderr);
1708
+ if (percentage >= 100) {
1709
+ fprintf(stderr, "\n");
1710
+ }
1711
+ }
1712
+ };
1713
+ }
1714
+
1596
1715
  ctx->rng = std::mt19937(params.seed);
1597
1716
  ctx->logits_all = params.logits_all;
1598
1717
 
1599
1718
  ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
1600
1719
 
1601
- if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_parts, memory_type,
1602
- params.vocab_only, params.progress_callback,
1603
- params.progress_callback_user_data)) {
1720
+ if (!llama_model_load(path_model, *ctx, params.n_ctx, memory_type,
1721
+ params.use_mmap, params.use_mlock, params.vocab_only,
1722
+ params.progress_callback, params.progress_callback_user_data)) {
1604
1723
  fprintf(stderr, "%s: failed to load model\n", __func__);
1605
1724
  llama_free(ctx);
1606
1725
  return nullptr;
1607
1726
  }
1608
1727
 
1609
- if (params.use_mlock) {
1610
- char *err;
1611
- if (!ggml_mlock(ctx->model.ctx,
1612
- ctx->model.mm_addr,
1613
- ctx->model.mm_length,
1614
- &err)) {
1615
- fprintf(stderr, "%s\n", err);
1616
- free(err);
1617
- llama_free(ctx);
1618
- return nullptr;
1619
- }
1620
- }
1621
-
1622
1728
  // reserve memory for context buffers
1623
1729
  if (!params.vocab_only) {
1624
1730
  if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx)) {
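The new default progress callback above is installed only when params.progress_callback is NULL; it prints one dot per whole percentage point loaded and a final newline at 100%. The same logic, restated as a free function purely for illustration:

    #include <cstdio>

    // Mirrors the default lambda: `ctx` points at the last percentage already reported.
    static void print_progress_dots(float progress, void * ctx) {
        unsigned * cur_percentage_p = (unsigned *) ctx;
        unsigned percentage = (unsigned) (100 * progress);   // 0..100
        while (percentage > *cur_percentage_p) {
            ++*cur_percentage_p;
            fprintf(stderr, ".");
            fflush(stderr);
            if (percentage >= 100) {
                fprintf(stderr, "\n");
            }
        }
    }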
@@ -1645,50 +1751,289 @@ struct llama_context * llama_init_from_file(
1645
1751
  ctx->embedding.resize(hparams.n_embd);
1646
1752
  }
1647
1753
 
1648
- ctx->buf_compute.resize(MEM_REQ_EVAL.at(ctx->model.type));
1754
+ ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));
1649
1755
 
1650
- ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0.at(ctx->model.type));
1651
- ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1.at(ctx->model.type));
1756
+ ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type));
1757
+ ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
1652
1758
  }
1653
1759
 
1654
1760
  return ctx;
1655
1761
  }
1656
1762
 
1657
1763
  void llama_free(struct llama_context * ctx) {
1658
- kv_cache_free(ctx->model.kv_self);
1659
-
1660
- if (ctx->model.ctx) {
1661
- ggml_free(ctx->model.ctx);
1662
- }
1663
-
1664
- if (ctx->model.mm_addr) {
1665
- munmap_file(ctx->model.mm_addr, ctx->model.mm_length);
1666
- }
1667
-
1668
1764
  delete ctx;
1669
1765
  }
1670
1766
 
1671
1767
  int llama_model_quantize(
1672
1768
  const char * fname_inp,
1673
1769
  const char * fname_out,
1674
- int itype) {
1675
- if (!llama_model_quantize_internal(fname_inp, fname_out, itype)) {
1676
- fprintf(stderr, "%s: failed to quantize\n", __func__);
1770
+ enum llama_ftype ftype) {
1771
+ try {
1772
+ llama_model_quantize_internal(fname_inp, fname_out, ftype);
1773
+ return 0;
1774
+ } catch (const std::string & err) {
1775
+ fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str());
1776
+ return 1;
1777
+ }
1778
+ }
1779
+
1780
+ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
1781
+ fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
1782
+
1783
+ auto & model = ctx->model;
1784
+
1785
+ const int64_t t_start_lora_us = ggml_time_us();
1786
+
1787
+ auto fin = std::ifstream(path_lora, std::ios::binary);
1788
+ if (!fin) {
1789
+ fprintf(stderr, "%s: failed to open '%s'\n", __func__, path_lora);
1677
1790
  return 1;
1678
1791
  }
1679
1792
 
1793
+ // verify magic and version
1794
+ {
1795
+ uint32_t magic;
1796
+ fin.read((char *) &magic, sizeof(magic));
1797
+ if (magic != 'ggla') {
1798
+ fprintf(stderr, "%s: bad file magic\n", __func__);
1799
+ return 1;
1800
+ }
1801
+ uint32_t format_version;
1802
+ fin.read((char *) &format_version, sizeof(format_version));
1803
+
1804
+ if (format_version != 1) {
1805
+ fprintf(stderr, "%s: unsupported file version\n", __func__ );
1806
+ return 1;
1807
+ }
1808
+ }
1809
+
1810
+ int32_t lora_r;
1811
+ int32_t lora_alpha;
1812
+ fin.read((char *) &lora_r, sizeof(lora_r));
1813
+ fin.read((char *) &lora_alpha, sizeof(lora_alpha));
1814
+ float scaling = (float)lora_alpha / (float)lora_r;
1815
+
1816
+ fprintf(stderr, "%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
1817
+
1818
+
1819
+ // create a temporary ggml context to store the lora tensors
1820
+ // todo: calculate size from biggest possible tensor
1821
+ std::vector<uint8_t> lora_buf(1024ull * 1024ull * 1024ull);
1822
+ struct ggml_init_params params;
1823
+ params.mem_size = lora_buf.size();
1824
+ params.mem_buffer = lora_buf.data();
1825
+ params.no_alloc = false;
1826
+
1827
+ ggml_context * lora_ctx = ggml_init(params);
1828
+ std::unordered_map<std::string, struct ggml_tensor *> lora_tensors;
1829
+
1830
+ // create a name -> tensor map of the model to accelerate lookups
1831
+ std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
1832
+ for (auto & kv: model.tensors_by_name) {
1833
+ model_tensors.insert(kv);
1834
+ }
1835
+
1836
+
1837
+ // load base model
1838
+ std::unique_ptr<llama_model_loader> model_loader;
1839
+ ggml_context * base_ctx = NULL;
1840
+ llama_buffer base_buf;
1841
+ if (path_base_model) {
1842
+ fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
1843
+ model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false));
1844
+
1845
+ size_t ctx_size, mmapped_size;
1846
+ model_loader->calc_sizes(&ctx_size, &mmapped_size);
1847
+ base_buf.resize(ctx_size);
1848
+
1849
+ ggml_init_params base_params;
1850
+ base_params.mem_size = base_buf.size;
1851
+ base_params.mem_buffer = base_buf.addr;
1852
+ base_params.no_alloc = model_loader->use_mmap;
1853
+
1854
+ base_ctx = ggml_init(base_params);
1855
+
1856
+ model_loader->ggml_ctx = base_ctx;
1857
+
1858
+ // maybe this should be in llama_model_loader
1859
+ if (model_loader->use_mmap) {
1860
+ model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ false));
1861
+ }
1862
+ }
1863
+
1864
+ // read tensors and apply
1865
+ bool warned = false;
1866
+ int n_tensors = 0;
1867
+ while (true) {
1868
+ int32_t n_dims;
1869
+ int32_t length;
1870
+ int32_t ftype;
1871
+
1872
+ fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
1873
+ fin.read(reinterpret_cast<char *>(&length), sizeof(length));
1874
+ fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
1875
+ if (fin.eof()) {
1876
+ break;
1877
+ }
1878
+
1879
+ int32_t ne[2] = { 1, 1 };
1880
+ for (int i = 0; i < n_dims; ++i) {
1881
+ fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
1882
+ }
1883
+
1884
+ std::string name(length, 0);
1885
+ fin.read(&name[0], length);
1886
+
1887
+ // check for lora suffix and get the type of tensor
1888
+ const std::string lora_suffix = ".lora";
1889
+ size_t pos = name.rfind(lora_suffix);
1890
+ if (pos == std::string::npos) {
1891
+ fprintf(stderr, "%s: error: '%s' is not a lora tensor\n", __func__, name.c_str());
1892
+ return 1;
1893
+ }
1894
+
1895
+ std::string lora_type = name.substr(pos + lora_suffix.length());
1896
+ std::string base_name = name;
1897
+ base_name.erase(pos);
1898
+ // fprintf(stderr, "%s: %s => %s (lora type %s) ", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());
1899
+
1900
+ if (model_tensors.find(base_name.data()) == model_tensors.end()) {
1901
+ fprintf(stderr, "%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
1902
+ return 1;
1903
+ }
1904
+
1905
+ // create ggml tensor
1906
+ ggml_type wtype;
1907
+ switch (ftype) {
1908
+ case 0: wtype = GGML_TYPE_F32; break;
1909
+ case 1: wtype = GGML_TYPE_F16; break;
1910
+ default:
1911
+ {
1912
+ fprintf(stderr, "%s: invalid tensor data type '%d'\n",
1913
+ __func__, ftype);
1914
+ return false;
1915
+ }
1916
+ }
1917
+ ggml_tensor* lora_tensor;
1918
+ if (n_dims == 2) {
1919
+ lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]);
1920
+ }
1921
+ else {
1922
+ fprintf(stderr, "%s: unsupported tensor dimension %d\n", __func__, n_dims);
1923
+ return 1;
1924
+ }
1925
+
1926
+ // load tensor data
1927
+ size_t offset = fin.tellg();
1928
+ size_t tensor_data_size = ggml_nbytes(lora_tensor);
1929
+ offset = (offset + 31) & -32;
1930
+ fin.seekg(offset);
1931
+ fin.read((char*)lora_tensor->data, tensor_data_size);
1932
+
1933
+ lora_tensors[name] = lora_tensor;
1934
+
1935
+ // check if we have both A and B tensors and apply
1936
+ if (lora_tensors.find(base_name + ".loraA") != lora_tensors.end() &&
1937
+ lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) {
1938
+
1939
+ ggml_tensor * dest_t = model_tensors[base_name];
1940
+ ggml_tensor * base_t;
1941
+ if (model_loader) {
1942
+ // load from base model
1943
+ if (model_loader->tensors_map.name_to_idx.find(base_name) == model_loader->tensors_map.name_to_idx.end()) {
1944
+ fprintf(stderr, "%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
1945
+ return 1;
1946
+ }
1947
+ size_t idx = model_loader->tensors_map.name_to_idx[base_name];
1948
+ llama_load_tensor & lt = model_loader->tensors_map.tensors[idx];
1949
+ base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] });
1950
+ lt.data = (uint8_t *) lt.ggml_tensor->data;
1951
+ model_loader->load_data_for(lt);
1952
+ lt.ggml_tensor->data = lt.data;
1953
+ }
1954
+ else {
1955
+ base_t = dest_t;
1956
+ }
1957
+
1958
+ if (base_t->type == GGML_TYPE_Q4_0 || base_t->type == GGML_TYPE_Q4_1) {
1959
+ if (!warned) {
1960
+ fprintf(stderr, "%s: warning: using a lora adapter with a quantized model may result in poor quality, "
1961
+ "use a f16 or f32 base model with --lora-base\n", __func__);
1962
+ warned = true;
1963
+ }
1964
+ }
1965
+
1966
+ ggml_tensor * loraA = lora_tensors[base_name + ".loraA"];
1967
+ ggml_tensor * loraB = lora_tensors[base_name + ".loraB"];
1968
+
1969
+ if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
1970
+ fprintf(stderr, "%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
1971
+ " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]);
1972
+ return 1;
1973
+ }
1974
+
1975
+ // w = w + BA*s
1976
+ ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
1977
+
1978
+ if (scaling != 1.0f) {
1979
+ ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
1980
+ BA = ggml_scale(lora_ctx, BA, scale_tensor);
1981
+ }
1982
+
1983
+ ggml_tensor * r;
1984
+ if (base_t == dest_t) {
1985
+ r = ggml_add_inplace(lora_ctx, dest_t, BA);
1986
+ }
1987
+ else {
1988
+ r = ggml_add(lora_ctx, base_t, BA);
1989
+ r = ggml_cpy(lora_ctx, r, dest_t);
1990
+ }
1991
+
1992
+ struct ggml_cgraph gf = ggml_build_forward(r);
1993
+ gf.n_threads = n_threads;
1994
+ ggml_graph_compute(lora_ctx, &gf);
1995
+
1996
+ // we won't need these tensors again, reset the context to save memory
1997
+ ggml_free(lora_ctx);
1998
+ lora_ctx = ggml_init(params);
1999
+ lora_tensors.clear();
2000
+
2001
+ n_tensors++;
2002
+ if (n_tensors % 4 == 0)
2003
+ fprintf(stderr, ".");
2004
+ }
2005
+ }
2006
+
2007
+ // TODO: this should be in a destructor, it will leak on failure
2008
+ ggml_free(lora_ctx);
2009
+ if (base_ctx) {
2010
+ ggml_free(base_ctx);
2011
+ }
2012
+
2013
+ const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
2014
+ fprintf(stderr, " done (%.2f ms)\n", t_lora_us / 1000.0);
2015
+
1680
2016
  return 0;
1681
2017
  }
1682
2018
 
2019
+ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
2020
+ try {
2021
+ return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads);
2022
+ } catch (const std::string & err) {
2023
+ fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.c_str());
2024
+ return 1;
2025
+ }
2026
+ }
2027
+
1683
2028
  // Returns the KV cache that will contain the context for the
1684
2029
  // ongoing prediction with the model.
1685
2030
  const uint8_t * llama_get_kv_cache(struct llama_context * ctx) {
1686
- return ctx->model.kv_self.buf.data();
2031
+ return ctx->model.kv_self.buf.addr;
1687
2032
  }
1688
2033
 
1689
2034
  // Returns the size of the KV cache
1690
2035
  size_t llama_get_kv_cache_size(struct llama_context * ctx) {
1691
- return ctx->model.kv_self.buf.size();
2036
+ return ctx->model.kv_self.buf.size;
1692
2037
  }
1693
2038
 
1694
2039
  int llama_get_kv_cache_token_count(struct llama_context * ctx) {
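The LoRA application above builds, per target tensor, the update W' = W + (alpha / r) * B*A out of ggml_mul_mat, ggml_scale and ggml_add (in place when no separate base model is given). A naive reference of the same arithmetic, for illustration only; plain row-major matrices are assumed here, which is not how ggml lays out its tensors:

    #include <cstddef>
    #include <vector>

    // W (n_out x n_in) += (alpha / r) * B (n_out x r) * A (r x n_in), row-major.
    static void apply_lora_reference(std::vector<float> & W,
                                     const std::vector<float> & A,
                                     const std::vector<float> & B,
                                     size_t n_out, size_t n_in, size_t r, float alpha) {
        const float scaling = alpha / (float) r;
        for (size_t i = 0; i < n_out; ++i) {
            for (size_t j = 0; j < n_in; ++j) {
                float acc = 0.0f;
                for (size_t k = 0; k < r; ++k) {
                    acc += B[i*r + k] * A[k*n_in + j];
                }
                W[i*n_in + j] += scaling * acc;
            }
        }
    }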
@@ -1702,8 +2047,8 @@ void llama_set_kv_cache(
1702
2047
  size_t n_size,
1703
2048
  int n_token_count) {
1704
2049
  // Make sure we have the same kv cache setup
1705
- LLAMA_ASSERT(ctx->model.kv_self.buf.size() == n_size);
1706
- memcpy(ctx->model.kv_self.buf.data(), kv_cache, n_size);
2050
+ LLAMA_ASSERT(ctx->model.kv_self.buf.size == n_size);
2051
+ memcpy(ctx->model.kv_self.buf.addr, kv_cache, n_size);
1707
2052
  ctx->model.kv_self.n = n_token_count;
1708
2053
  }
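The KV-cache accessors in this release make it possible to snapshot the cache and restore it later, provided the restored buffer is exactly the size the LLAMA_ASSERT above checks. A usage sketch, assuming llama_set_kv_cache accepts the same byte buffer that llama_get_kv_cache returns:

    #include <cstdint>
    #include <vector>
    #include "llama.h"

    struct kv_snapshot {
        std::vector<uint8_t> data;
        int n_tokens = 0;
    };

    static kv_snapshot save_kv(struct llama_context * ctx) {
        kv_snapshot snap;
        const uint8_t * buf = llama_get_kv_cache(ctx);
        snap.data.assign(buf, buf + llama_get_kv_cache_size(ctx));
        snap.n_tokens = llama_get_kv_cache_token_count(ctx);
        return snap;
    }

    static void restore_kv(struct llama_context * ctx, kv_snapshot & snap) {
        // the buffer size must match exactly, otherwise the assert above fires
        llama_set_kv_cache(ctx, snap.data.data(), snap.data.size(), snap.n_tokens);
    }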
1709
2054
 
@@ -1814,9 +2159,9 @@ llama_token llama_sample_top_p_top_k(
1814
2159
  void llama_print_timings(struct llama_context * ctx) {
1815
2160
  const int64_t t_end_us = ggml_time_us();
1816
2161
 
1817
- const int32_t n_sample = Max(1, ctx->n_sample);
1818
- const int32_t n_eval = Max(1, ctx->n_eval);
1819
- const int32_t n_p_eval = Max(1, ctx->n_p_eval);
2162
+ const int32_t n_sample = std::max(1, ctx->n_sample);
2163
+ const int32_t n_eval = std::max(1, ctx->n_eval);
2164
+ const int32_t n_p_eval = std::max(1, ctx->n_p_eval);
1820
2165
 
1821
2166
  fprintf(stderr, "\n");
1822
2167
  fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
@@ -1837,18 +2182,25 @@ const char * llama_print_system_info(void) {
1837
2182
  static std::string s;
1838
2183
 
1839
2184
  s = "";
1840
- s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
1841
- s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
1842
- s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
1843
- s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
1844
- s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
1845
- s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
1846
- s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
1847
- s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
1848
- s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
1849
- s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
1850
- s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
1851
- s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
2185
+ s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
2186
+ s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
2187
+ s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
2188
+ s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
2189
+ s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | ";
2190
+ s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
2191
+ s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
2192
+ s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
2193
+ s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
2194
+ s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
2195
+ s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
2196
+ s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
2197
+ s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
2198
+ s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
1852
2199
 
1853
2200
  return s.c_str();
1854
2201
  }
2202
+
2203
+ // For internal test use
2204
+ std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
2205
+ return ctx->model.tensors_by_name;
2206
+ }
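Finally, a usage sketch for the llama_apply_lora_from_file entry point added in this release; both paths below are placeholders, and an f16 or f32 base model is passed so the quantized-model warning above is avoided:

    #include "llama.h"

    static int attach_adapter(struct llama_context * ctx, int n_threads) {
        // returns 0 on success, 1 on failure (errors are printed to stderr)
        return llama_apply_lora_from_file(ctx,
                                          "lora/ggml-adapter-model.bin",  // path_lora (placeholder)
                                          "models/7B/ggml-model-f16.bin", // path_base_model (placeholder)
                                          n_threads);
    }

path_base_model may be NULL, in which case the adapter is applied directly onto the weights already loaded in the context; the quality warning above then applies when those weights are quantized.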