llama_cpp 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/README.md +2 -0
- data/ext/llama_cpp/src/ggml.c +354 -51
- data/ext/llama_cpp/src/ggml.h +6 -1
- data/ext/llama_cpp/src/llama.cpp +210 -259
- data/ext/llama_cpp/src/llama.h +2 -2
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +3 -2
- metadata +1 -1
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -12,6 +12,19 @@
|
|
12
12
|
#include <cassert>
|
13
13
|
#include <cstring>
|
14
14
|
|
15
|
+
#if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
|
16
|
+
#define WIN32_LEAN_AND_MEAN
|
17
|
+
#include <Windows.h>
|
18
|
+
#else
|
19
|
+
#include <sys/types.h>
|
20
|
+
#include <sys/mman.h>
|
21
|
+
#include <unistd.h>
|
22
|
+
#include <fcntl.h>
|
23
|
+
#endif
|
24
|
+
|
25
|
+
#define Min(X, Y) ((Y) > (X) ? (X) : (Y))
|
26
|
+
#define Max(X, Y) ((Y) < (X) ? (X) : (Y))
|
27
|
+
|
15
28
|
#define LLAMA_USE_SCRATCH
|
16
29
|
#define LLAMA_MAX_SCRATCH_BUFFERS 16
|
17
30
|
|
@@ -142,6 +155,10 @@ struct llama_model {
|
|
142
155
|
// the model memory buffer
|
143
156
|
std::vector<uint8_t> buf;
|
144
157
|
|
158
|
+
// model memory mapped file
|
159
|
+
void * mm_addr = NULL;
|
160
|
+
uint64_t mm_length = 0;
|
161
|
+
|
145
162
|
// tensors
|
146
163
|
int n_loaded;
|
147
164
|
std::unordered_map<std::string, struct ggml_tensor *> tensors;
|
@@ -165,6 +182,7 @@ struct llama_context {
|
|
165
182
|
|
166
183
|
int64_t t_load_us = 0;
|
167
184
|
int64_t t_start_us = 0;
|
185
|
+
bool has_evaluated_once = false;
|
168
186
|
|
169
187
|
int64_t t_sample_us = 0;
|
170
188
|
int64_t t_eval_us = 0;
|
@@ -206,7 +224,7 @@ struct llama_context {
|
|
206
224
|
}
|
207
225
|
|
208
226
|
if (buf_last >= 0) {
|
209
|
-
buf_max_size[buf_last] =
|
227
|
+
buf_max_size[buf_last] = Max(buf_max_size[buf_last], last_size);
|
210
228
|
}
|
211
229
|
|
212
230
|
buf_last = i;
|
@@ -246,6 +264,7 @@ static bool kv_cache_init(
|
|
246
264
|
struct ggml_init_params params;
|
247
265
|
params.mem_size = cache.buf.size();
|
248
266
|
params.mem_buffer = cache.buf.data();
|
267
|
+
params.no_alloc = false;
|
249
268
|
|
250
269
|
cache.ctx = ggml_init(params);
|
251
270
|
|
@@ -288,6 +307,58 @@ struct llama_context_params llama_context_default_params() {
|
|
288
307
|
// model loading
|
289
308
|
//
|
290
309
|
|
310
|
+
static void *mmap_file(const char *fname, uint64_t *mm_length) {
|
311
|
+
#if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
|
312
|
+
HANDLE hFile = CreateFileA(fname,
|
313
|
+
GENERIC_READ,
|
314
|
+
FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
|
315
|
+
NULL,
|
316
|
+
OPEN_EXISTING,
|
317
|
+
FILE_ATTRIBUTE_NORMAL | FILE_ATTRIBUTE_NOT_CONTENT_INDEXED,
|
318
|
+
NULL);
|
319
|
+
if (hFile == INVALID_HANDLE_VALUE) return 0;
|
320
|
+
LARGE_INTEGER fileSize;
|
321
|
+
fileSize.QuadPart = -1;
|
322
|
+
GetFileSizeEx(hFile, &fileSize);
|
323
|
+
int64_t length = fileSize.QuadPart;
|
324
|
+
HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
|
325
|
+
CloseHandle(hFile);
|
326
|
+
if (!hMapping) return 0;
|
327
|
+
void *addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
|
328
|
+
CloseHandle(hMapping);
|
329
|
+
if (!addr) return 0;
|
330
|
+
#else
|
331
|
+
int fd = open(fname, O_RDONLY);
|
332
|
+
if (fd == -1) return 0;
|
333
|
+
int64_t length = lseek(fd, 0, SEEK_END);
|
334
|
+
void *addr = mmap(NULL, length, PROT_READ, MAP_SHARED, fd, 0);
|
335
|
+
close(fd);
|
336
|
+
if (addr == MAP_FAILED) return 0;
|
337
|
+
#endif
|
338
|
+
*mm_length = length;
|
339
|
+
return addr;
|
340
|
+
}
|
341
|
+
|
342
|
+
static void munmap_file(void * addr, size_t length) {
|
343
|
+
#if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
|
344
|
+
UnmapViewOfFile(addr);
|
345
|
+
#else
|
346
|
+
munmap(addr, length);
|
347
|
+
#endif
|
348
|
+
}
|
349
|
+
|
350
|
+
static bool report_bad_magic(const char *path, uint32_t got, uint32_t want) {
|
351
|
+
fprintf(stderr,
|
352
|
+
"%s: invalid model file (bad magic [got %#x want %#x])\n"
|
353
|
+
"\tyou most likely need to regenerate your ggml files\n"
|
354
|
+
"\tthe benefit is you'll get 10-100x faster load times\n"
|
355
|
+
"\tsee https://github.com/ggerganov/llama.cpp/issues/91\n"
|
356
|
+
"\tuse convert-pth-to-ggml.py to regenerate from original pth\n"
|
357
|
+
"\tuse migrate-ggml-2023-03-30-pr613.py if you deleted originals\n",
|
358
|
+
path, got, want);
|
359
|
+
return false;
|
360
|
+
}
|
361
|
+
|
291
362
|
static bool llama_model_load(
|
292
363
|
const std::string & fname,
|
293
364
|
llama_context & lctx,
|
@@ -299,22 +370,24 @@ static bool llama_model_load(
|
|
299
370
|
void *progress_callback_user_data) {
|
300
371
|
fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
|
301
372
|
|
302
|
-
|
303
|
-
|
304
|
-
lctx.t_start_us = t_start_us;
|
305
|
-
|
306
|
-
std::vector<char> f_buf(1024*1024);
|
373
|
+
lctx.t_start_us = ggml_time_us();
|
307
374
|
|
308
375
|
auto & model = lctx.model;
|
309
376
|
auto & vocab = lctx.vocab;
|
310
377
|
|
311
378
|
auto fin = std::ifstream(fname, std::ios::binary);
|
312
|
-
fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
|
313
379
|
if (!fin) {
|
314
380
|
fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
|
315
381
|
return false;
|
316
382
|
}
|
317
383
|
|
384
|
+
std::vector<char> f_buf(1024*1024);
|
385
|
+
fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
|
386
|
+
|
387
|
+
fin.seekg(0, fin.end);
|
388
|
+
const size_t file_size = fin.tellg();
|
389
|
+
fin.seekg(0);
|
390
|
+
|
318
391
|
// verify magic
|
319
392
|
{
|
320
393
|
uint32_t magic;
|
@@ -325,8 +398,7 @@ static bool llama_model_load(
|
|
325
398
|
return false;
|
326
399
|
}
|
327
400
|
if (magic != LLAMA_FILE_MAGIC) {
|
328
|
-
|
329
|
-
return false;
|
401
|
+
return report_bad_magic(fname.c_str(), magic, LLAMA_FILE_MAGIC);
|
330
402
|
}
|
331
403
|
|
332
404
|
uint32_t format_version;
|
@@ -449,43 +521,24 @@ static bool llama_model_load(
|
|
449
521
|
}
|
450
522
|
}
|
451
523
|
|
524
|
+
// map model into memory
|
525
|
+
char *mm_addr = NULL;
|
526
|
+
model.mm_addr = mmap_file(fname.c_str(), &model.mm_length);
|
527
|
+
if (model.mm_addr == NULL) {
|
528
|
+
fprintf(stderr, "%s: failed to mmap '%s'\n", __func__, fname.c_str());
|
529
|
+
return false;
|
530
|
+
}
|
531
|
+
mm_addr = (char *)model.mm_addr;
|
532
|
+
fprintf(stderr, "%s: ggml map size = %6.2f MB\n", __func__, model.mm_length/(1024.0*1024.0));
|
533
|
+
|
452
534
|
auto & ctx = model.ctx;
|
453
535
|
|
454
536
|
size_t ctx_size = 0;
|
455
|
-
|
456
537
|
{
|
457
|
-
const auto &
|
458
|
-
|
459
|
-
const int n_embd = hparams.n_embd;
|
538
|
+
const auto &hparams = model.hparams;
|
460
539
|
const int n_layer = hparams.n_layer;
|
461
|
-
const int n_ctx = hparams.n_ctx;
|
462
|
-
const int n_vocab = hparams.n_vocab;
|
463
|
-
|
464
|
-
ctx_size += n_embd*n_vocab*ggml_type_sizef(vtype); // tok_embeddings
|
465
|
-
|
466
|
-
ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // norm
|
467
|
-
|
468
|
-
ctx_size += n_embd*n_vocab*ggml_type_sizef(vtype); // output
|
469
|
-
|
470
|
-
ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // attention_norm
|
471
|
-
|
472
|
-
ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wq
|
473
|
-
ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wk
|
474
|
-
ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wv
|
475
|
-
ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wo
|
476
|
-
|
477
|
-
ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ffn_norm
|
478
|
-
|
479
|
-
ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w1
|
480
|
-
ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w2
|
481
|
-
ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w3
|
482
|
-
|
483
|
-
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_k
|
484
|
-
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_v
|
485
|
-
|
486
540
|
ctx_size += (5 + 10*n_layer)*256; // object overhead
|
487
|
-
|
488
|
-
fprintf(stderr, "%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
|
541
|
+
fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0);
|
489
542
|
}
|
490
543
|
|
491
544
|
// print memory requirements
|
@@ -495,6 +548,7 @@ static bool llama_model_load(
|
|
495
548
|
// this is the total memory required to run the inference
|
496
549
|
const size_t mem_required =
|
497
550
|
ctx_size +
|
551
|
+
model.mm_length +
|
498
552
|
MEM_REQ_SCRATCH0.at(model.type) +
|
499
553
|
MEM_REQ_SCRATCH1.at(model.type) +
|
500
554
|
MEM_REQ_EVAL.at (model.type);
|
@@ -514,6 +568,7 @@ static bool llama_model_load(
|
|
514
568
|
struct ggml_init_params params = {
|
515
569
|
/*.mem_size =*/ lctx.model.buf.size(),
|
516
570
|
/*.mem_buffer =*/ lctx.model.buf.data(),
|
571
|
+
/*.no_alloc =*/ true,
|
517
572
|
};
|
518
573
|
|
519
574
|
model.ctx = ggml_init(params);
|
@@ -576,234 +631,106 @@ static bool llama_model_load(
|
|
576
631
|
}
|
577
632
|
}
|
578
633
|
|
579
|
-
const size_t file_offset = fin.tellg();
|
580
|
-
|
581
|
-
fin.close();
|
582
|
-
|
583
634
|
std::vector<uint8_t> tmp;
|
584
635
|
|
585
636
|
if (progress_callback) {
|
586
637
|
progress_callback(0.0, progress_callback_user_data);
|
587
638
|
}
|
588
639
|
|
589
|
-
|
590
|
-
const int part_id = i;
|
591
|
-
//const int part_id = n_parts - i - 1;
|
640
|
+
fprintf(stderr, "%s: loading tensors from '%s'\n", __func__, fname.c_str());
|
592
641
|
|
593
|
-
|
594
|
-
|
595
|
-
|
596
|
-
|
597
|
-
|
598
|
-
fprintf(stderr, "%s: loading model part %d/%d from '%s'\n", __func__, i+1, n_parts, fname_part.c_str());
|
642
|
+
// load weights
|
643
|
+
{
|
644
|
+
size_t total_size = 0;
|
645
|
+
model.n_loaded = 0;
|
599
646
|
|
600
|
-
|
601
|
-
|
647
|
+
while (true) {
|
648
|
+
int32_t n_dims;
|
649
|
+
int32_t length;
|
650
|
+
int32_t ftype;
|
602
651
|
|
603
|
-
|
604
|
-
|
652
|
+
fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
|
653
|
+
fin.read(reinterpret_cast<char *>(&length), sizeof(length));
|
654
|
+
fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
|
605
655
|
|
606
|
-
|
656
|
+
if (fin.eof()) {
|
657
|
+
break;
|
658
|
+
}
|
607
659
|
|
608
|
-
|
609
|
-
|
610
|
-
|
660
|
+
int32_t nelements = 1;
|
661
|
+
int32_t ne[2] = { 1, 1 };
|
662
|
+
for (int i = 0; i < n_dims; ++i) {
|
663
|
+
fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
|
664
|
+
nelements *= ne[i];
|
665
|
+
}
|
611
666
|
|
612
|
-
|
667
|
+
std::string name(length, 0);
|
668
|
+
fin.read(&name[0], length);
|
613
669
|
|
614
|
-
|
670
|
+
if (model.tensors.find(name.data()) == model.tensors.end()) {
|
671
|
+
fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
|
672
|
+
return false;
|
673
|
+
}
|
615
674
|
|
616
|
-
|
617
|
-
int32_t n_dims;
|
618
|
-
int32_t length;
|
619
|
-
int32_t ftype;
|
675
|
+
auto tensor = model.tensors[name.data()];
|
620
676
|
|
621
|
-
|
622
|
-
|
623
|
-
|
677
|
+
if (ggml_nelements(tensor) != nelements) {
|
678
|
+
fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
|
679
|
+
return false;
|
680
|
+
}
|
681
|
+
if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
|
682
|
+
fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
|
683
|
+
__func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
|
684
|
+
return false;
|
685
|
+
}
|
686
|
+
if (0) {
|
687
|
+
static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
|
688
|
+
fprintf(stderr, "%24s - [%5d, %5d], type = %6s\n", name.data(), ne[0], ne[1], ftype_str[ftype]);
|
689
|
+
}
|
624
690
|
|
625
|
-
|
691
|
+
switch (ftype) {
|
692
|
+
case 0: // f32
|
693
|
+
case 1: // f16
|
626
694
|
break;
|
627
|
-
|
628
|
-
|
629
|
-
|
630
|
-
|
631
|
-
|
632
|
-
|
633
|
-
nelements *= ne[i];
|
634
|
-
}
|
635
|
-
|
636
|
-
std::string name(length, 0);
|
637
|
-
fin.read(&name[0], length);
|
638
|
-
|
639
|
-
if (model.tensors.find(name.data()) == model.tensors.end()) {
|
640
|
-
fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
|
695
|
+
case 2: // q4_0
|
696
|
+
case 3: // q4_1
|
697
|
+
assert(ne[0] % 64 == 0);
|
698
|
+
break;
|
699
|
+
default:
|
700
|
+
fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
|
641
701
|
return false;
|
642
|
-
|
643
|
-
|
644
|
-
// split_type = 0: split by columns
|
645
|
-
// split_type = 1: split by rows
|
646
|
-
int split_type = 0;
|
647
|
-
|
648
|
-
// split_type = 0:
|
649
|
-
// regex:
|
650
|
-
// - tok_embeddings.*
|
651
|
-
// - layers.*.attention.wo.weight
|
652
|
-
// - layers.*.feed_forward.w2.weight
|
653
|
-
|
654
|
-
// split_type = 1:
|
655
|
-
// regex:
|
656
|
-
// - output.*
|
657
|
-
// - layers.*.attention.wq.weight
|
658
|
-
// - layers.*.attention.wk.weight
|
659
|
-
// - layers.*.attention.wv.weight
|
660
|
-
// - layers.*.feed_forward.w1.weight
|
661
|
-
// - layers.*.feed_forward.w3.weight
|
662
|
-
if (name.find("tok_embeddings") != std::string::npos) {
|
663
|
-
split_type = 0;
|
664
|
-
} else if (name.find("layers") != std::string::npos) {
|
665
|
-
if (name.find("attention.wo.weight") != std::string::npos) {
|
666
|
-
split_type = 0;
|
667
|
-
} else if (name.find("feed_forward.w2.weight") != std::string::npos) {
|
668
|
-
split_type = 0;
|
669
|
-
} else {
|
670
|
-
split_type = 1;
|
671
|
-
}
|
672
|
-
} else if (name.find("output") != std::string::npos) {
|
673
|
-
split_type = 1;
|
674
|
-
}
|
675
|
-
|
676
|
-
auto tensor = model.tensors[name.data()];
|
677
|
-
|
678
|
-
if (n_dims == 1) {
|
679
|
-
if (ggml_nelements(tensor) != nelements) {
|
680
|
-
fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
|
681
|
-
return false;
|
682
|
-
}
|
683
|
-
} else {
|
684
|
-
if (ggml_nelements(tensor)/n_parts != nelements) {
|
685
|
-
fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
|
686
|
-
return false;
|
687
|
-
}
|
688
|
-
}
|
689
|
-
|
690
|
-
if (n_dims == 1) {
|
691
|
-
if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
|
692
|
-
fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
|
693
|
-
__func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
|
694
|
-
return false;
|
695
|
-
}
|
696
|
-
} else {
|
697
|
-
if (split_type == 0) {
|
698
|
-
if (tensor->ne[0]/n_parts != ne[0] || tensor->ne[1] != ne[1]) {
|
699
|
-
fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
|
700
|
-
__func__, name.data(), tensor->ne[0]/n_parts, tensor->ne[1], ne[0], ne[1]);
|
701
|
-
return false;
|
702
|
-
}
|
703
|
-
} else {
|
704
|
-
if (tensor->ne[0] != ne[0] || tensor->ne[1]/n_parts != ne[1]) {
|
705
|
-
fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
|
706
|
-
__func__, name.data(), tensor->ne[0], tensor->ne[1]/n_parts, ne[0], ne[1]);
|
707
|
-
return false;
|
708
|
-
}
|
709
|
-
}
|
710
|
-
}
|
711
|
-
|
712
|
-
if (0) {
|
713
|
-
static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
|
714
|
-
fprintf(stderr, "%24s - [%5d, %5d], type = %6s, split = %d\n", name.data(), ne[0], ne[1], ftype_str[ftype], split_type);
|
715
|
-
}
|
716
|
-
|
717
|
-
size_t bpe = 0;
|
718
|
-
|
719
|
-
switch (ftype) {
|
720
|
-
case 0: bpe = ggml_type_size(GGML_TYPE_F32); break;
|
721
|
-
case 1: bpe = ggml_type_size(GGML_TYPE_F16); break;
|
722
|
-
case 2: bpe = ggml_type_size(GGML_TYPE_Q4_0); assert(ne[0] % 64 == 0); break;
|
723
|
-
case 3: bpe = ggml_type_size(GGML_TYPE_Q4_1); assert(ne[0] % 64 == 0); break;
|
724
|
-
default:
|
725
|
-
{
|
726
|
-
fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
|
727
|
-
return false;
|
728
|
-
}
|
729
|
-
};
|
730
|
-
|
731
|
-
if (n_dims == 1 || n_parts == 1) {
|
732
|
-
if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
|
733
|
-
fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
|
734
|
-
__func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
|
735
|
-
return false;
|
736
|
-
}
|
737
|
-
|
738
|
-
if (part_id == 0) {
|
739
|
-
fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
|
740
|
-
} else {
|
741
|
-
fin.seekg(ggml_nbytes(tensor), std::ios::cur);
|
742
|
-
}
|
743
|
-
|
744
|
-
total_size += ggml_nbytes(tensor);
|
745
|
-
} else {
|
746
|
-
if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)/n_parts) {
|
747
|
-
fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
|
748
|
-
__func__, name.data(), ggml_nbytes(tensor)/n_parts, nelements*bpe);
|
749
|
-
return false;
|
750
|
-
}
|
751
|
-
|
752
|
-
if (split_type == 0) {
|
753
|
-
const int np0 = ne[0];
|
754
|
-
|
755
|
-
const size_t row_size = (tensor->ne[0]/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);
|
756
|
-
assert(row_size == tensor->nb[1]);
|
757
|
-
|
758
|
-
for (int i1 = 0; i1 < ne[1]; ++i1) {
|
759
|
-
const size_t offset_row = i1*row_size;
|
760
|
-
const size_t offset = offset_row + ((part_id*np0)/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);
|
761
|
-
fin.read(reinterpret_cast<char *>(tensor->data) + offset, row_size/n_parts);
|
762
|
-
}
|
763
|
-
} else {
|
764
|
-
const int np1 = ne[1];
|
765
|
-
|
766
|
-
const size_t row_size = (tensor->ne[0]/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);
|
767
|
-
|
768
|
-
for (int i1 = 0; i1 < ne[1]; ++i1) {
|
769
|
-
const size_t offset_row = (i1 + part_id*np1)*row_size;
|
770
|
-
fin.read(reinterpret_cast<char *>(tensor->data) + offset_row, row_size);
|
771
|
-
}
|
772
|
-
}
|
773
|
-
|
774
|
-
total_size += ggml_nbytes(tensor)/n_parts;
|
775
|
-
}
|
776
|
-
|
777
|
-
//fprintf(stderr, "%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
|
778
|
-
model.n_loaded++;
|
779
|
-
|
780
|
-
// progress
|
781
|
-
if (progress_callback) {
|
782
|
-
float current_file_progress = float(size_t(fin.tellg()) - file_offset) / float(file_size - file_offset);
|
783
|
-
float current_progress = (float(i) + current_file_progress) / float(n_parts);
|
784
|
-
progress_callback(current_progress, progress_callback_user_data);
|
785
|
-
}
|
786
|
-
if (model.n_loaded % 8 == 0) {
|
787
|
-
fprintf(stderr, ".");
|
788
|
-
fflush(stderr);
|
789
|
-
}
|
790
|
-
}
|
791
|
-
|
792
|
-
fprintf(stderr, " done\n");
|
702
|
+
};
|
793
703
|
|
794
|
-
|
795
|
-
|
796
|
-
|
797
|
-
|
798
|
-
|
799
|
-
|
704
|
+
// load the tensor data into memory without copying or reading it
|
705
|
+
size_t offset = fin.tellg();
|
706
|
+
size_t tensor_data_size = ggml_nbytes(tensor);
|
707
|
+
offset = (offset + 31) & -32;
|
708
|
+
tensor->data = mm_addr + offset;
|
709
|
+
fin.seekg(offset + tensor_data_size);
|
710
|
+
total_size += tensor_data_size;
|
711
|
+
model.n_loaded++;
|
712
|
+
|
713
|
+
// progress
|
714
|
+
if (progress_callback) {
|
715
|
+
double current_progress = size_t(fin.tellg()) / double(file_size);
|
716
|
+
progress_callback(current_progress, progress_callback_user_data);
|
800
717
|
}
|
801
718
|
}
|
802
719
|
|
803
720
|
fin.close();
|
721
|
+
|
722
|
+
fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, model.n_loaded);
|
723
|
+
if (model.n_loaded == 0) {
|
724
|
+
fprintf(stderr, "%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__);
|
725
|
+
} else if (model.n_loaded != (int) model.tensors.size()) {
|
726
|
+
fprintf(stderr, "%s: ERROR not all tensors loaded from model file - expected %zu, got %d\n", __func__, model.tensors.size(), model.n_loaded);
|
727
|
+
return false;
|
728
|
+
}
|
804
729
|
}
|
805
730
|
|
806
|
-
|
731
|
+
// loading time will be recalculate after the first eval, so
|
732
|
+
// we take page faults deferred by mmap() into consideration
|
733
|
+
lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
|
807
734
|
|
808
735
|
if (progress_callback) {
|
809
736
|
progress_callback(1.0, progress_callback_user_data);
|
@@ -849,6 +776,7 @@ static bool llama_eval_internal(
|
|
849
776
|
struct ggml_init_params params = {
|
850
777
|
/*.mem_size =*/ buf_compute.size(),
|
851
778
|
/*.mem_buffer =*/ buf_compute.data(),
|
779
|
+
/*.no_alloc =*/ false,
|
852
780
|
};
|
853
781
|
|
854
782
|
struct ggml_context * ctx0 = ggml_init(params);
|
@@ -856,7 +784,7 @@ static bool llama_eval_internal(
|
|
856
784
|
// for big prompts, if BLAS is enabled, it is better to use only one thread
|
857
785
|
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
|
858
786
|
ggml_cgraph gf = {};
|
859
|
-
gf.n_threads = N
|
787
|
+
gf.n_threads = N >= 32 && ggml_cpu_has_blas() ? 1 : n_threads;
|
860
788
|
|
861
789
|
struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
|
862
790
|
memcpy(embd->data, tokens, N*ggml_element_size(embd));
|
@@ -1126,7 +1054,7 @@ struct llama_tokenizer {
|
|
1126
1054
|
size_t offs = 0;
|
1127
1055
|
while (offs < text.size()) {
|
1128
1056
|
llama_sp_symbol sym;
|
1129
|
-
size_t char_len =
|
1057
|
+
size_t char_len = Min(text.size() - offs, utf8_len(text[offs]));
|
1130
1058
|
sym.text = text.c_str() + offs;
|
1131
1059
|
sym.n = char_len;
|
1132
1060
|
offs += char_len;
|
@@ -1291,7 +1219,7 @@ static llama_vocab::id llama_sample_top_p_top_k(
|
|
1291
1219
|
|
1292
1220
|
float maxl = -std::numeric_limits<float>::infinity();
|
1293
1221
|
for (const auto & kv : logits_id) {
|
1294
|
-
maxl =
|
1222
|
+
maxl = Max(maxl, kv.first);
|
1295
1223
|
}
|
1296
1224
|
|
1297
1225
|
// compute probs for the top k tokens
|
@@ -1385,8 +1313,7 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
1385
1313
|
return false;
|
1386
1314
|
}
|
1387
1315
|
if (magic != LLAMA_FILE_MAGIC) {
|
1388
|
-
|
1389
|
-
return false;
|
1316
|
+
return report_bad_magic(fname_inp.c_str(), magic, LLAMA_FILE_MAGIC);
|
1390
1317
|
}
|
1391
1318
|
|
1392
1319
|
fout.write((char *) &magic, sizeof(magic));
|
@@ -1444,7 +1371,7 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
1444
1371
|
return false;
|
1445
1372
|
}
|
1446
1373
|
|
1447
|
-
std::
|
1374
|
+
std::vector<char> word(32);
|
1448
1375
|
vocab.id_to_token.resize(n_vocab);
|
1449
1376
|
for (int i = 0; i < n_vocab; i++) {
|
1450
1377
|
uint32_t len;
|
@@ -1452,17 +1379,17 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
1452
1379
|
fout.write((char *) &len, sizeof(len));
|
1453
1380
|
|
1454
1381
|
word.resize(len);
|
1455
|
-
finp.read ((char *) word
|
1456
|
-
fout.write((char *) word
|
1382
|
+
finp.read ((char *) &word[0], len);
|
1383
|
+
fout.write((char *) &word[0], len);
|
1457
1384
|
|
1458
1385
|
float score;
|
1459
1386
|
finp.read ((char *) &score, sizeof(score));
|
1460
1387
|
fout.write((char *) &score, sizeof(score));
|
1461
1388
|
|
1462
|
-
vocab.token_to_id[word] = i;
|
1389
|
+
vocab.token_to_id[word.data()] = i;
|
1463
1390
|
|
1464
1391
|
auto &tok_score = vocab.id_to_token[i];
|
1465
|
-
tok_score.tok = word;
|
1392
|
+
tok_score.tok = word.data();
|
1466
1393
|
tok_score.score = score;
|
1467
1394
|
}
|
1468
1395
|
}
|
@@ -1503,6 +1430,13 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
1503
1430
|
std::string name(length, 0);
|
1504
1431
|
finp.read (&name[0], length);
|
1505
1432
|
|
1433
|
+
{
|
1434
|
+
// ensure tensor data is aligned
|
1435
|
+
uint64_t offset = finp.tellg();
|
1436
|
+
offset = (offset + 31) & -32;
|
1437
|
+
finp.seekg(offset);
|
1438
|
+
}
|
1439
|
+
|
1506
1440
|
{
|
1507
1441
|
static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
|
1508
1442
|
printf("%48s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ftype_str[ftype]);
|
@@ -1558,6 +1492,13 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
1558
1492
|
}
|
1559
1493
|
fout.write(&name[0], length);
|
1560
1494
|
|
1495
|
+
{
|
1496
|
+
// ensure tensor data is aligned
|
1497
|
+
uint64_t offset = fout.tellp();
|
1498
|
+
offset = (offset + 31) & -32;
|
1499
|
+
fout.seekp(offset);
|
1500
|
+
}
|
1501
|
+
|
1561
1502
|
if (quantize) {
|
1562
1503
|
printf("quantizing .. ");
|
1563
1504
|
work.resize(nelements); // for quantization
|
@@ -1655,7 +1596,10 @@ struct llama_context * llama_init_from_file(
|
|
1655
1596
|
|
1656
1597
|
if (params.use_mlock) {
|
1657
1598
|
char *err;
|
1658
|
-
if (!ggml_mlock(ctx->model.ctx,
|
1599
|
+
if (!ggml_mlock(ctx->model.ctx,
|
1600
|
+
ctx->model.mm_addr,
|
1601
|
+
ctx->model.mm_length,
|
1602
|
+
&err)) {
|
1659
1603
|
fprintf(stderr, "%s\n", err);
|
1660
1604
|
free(err);
|
1661
1605
|
llama_free(ctx);
|
@@ -1705,6 +1649,10 @@ void llama_free(struct llama_context * ctx) {
|
|
1705
1649
|
ggml_free(ctx->model.ctx);
|
1706
1650
|
}
|
1707
1651
|
|
1652
|
+
if (ctx->model.mm_addr) {
|
1653
|
+
munmap_file(ctx->model.mm_addr, ctx->model.mm_length);
|
1654
|
+
}
|
1655
|
+
|
1708
1656
|
delete ctx;
|
1709
1657
|
}
|
1710
1658
|
|
@@ -1730,7 +1678,11 @@ int llama_eval(
|
|
1730
1678
|
fprintf(stderr, "%s: failed to eval\n", __func__);
|
1731
1679
|
return 1;
|
1732
1680
|
}
|
1733
|
-
|
1681
|
+
// get a more accurate load time, upon first eval
|
1682
|
+
if (!ctx->has_evaluated_once) {
|
1683
|
+
ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
|
1684
|
+
ctx->has_evaluated_once = true;
|
1685
|
+
}
|
1734
1686
|
return 0;
|
1735
1687
|
}
|
1736
1688
|
|
@@ -1823,9 +1775,9 @@ llama_token llama_sample_top_p_top_k(
|
|
1823
1775
|
void llama_print_timings(struct llama_context * ctx) {
|
1824
1776
|
const int64_t t_end_us = ggml_time_us();
|
1825
1777
|
|
1826
|
-
const int32_t n_sample =
|
1827
|
-
const int32_t n_eval =
|
1828
|
-
const int32_t n_p_eval =
|
1778
|
+
const int32_t n_sample = Max(1, ctx->n_sample);
|
1779
|
+
const int32_t n_eval = Max(1, ctx->n_eval);
|
1780
|
+
const int32_t n_p_eval = Max(1, ctx->n_p_eval);
|
1829
1781
|
|
1830
1782
|
fprintf(stderr, "\n");
|
1831
1783
|
fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
|
@@ -1837,7 +1789,6 @@ void llama_print_timings(struct llama_context * ctx) {
|
|
1837
1789
|
|
1838
1790
|
void llama_reset_timings(struct llama_context * ctx) {
|
1839
1791
|
ctx->t_start_us = ggml_time_us();
|
1840
|
-
|
1841
1792
|
ctx->t_sample_us = ctx->n_sample = 0;
|
1842
1793
|
ctx->t_eval_us = ctx->n_eval = 0;
|
1843
1794
|
ctx->t_p_eval_us = ctx->n_p_eval = 0;
|