llama_cpp 0.1.0 → 0.1.2
- checksums.yaml +4 -4
- data/CHANGELOG.md +17 -0
- data/ext/llama_cpp/extconf.rb +7 -0
- data/ext/llama_cpp/llama_cpp.cpp +153 -21
- data/ext/llama_cpp/src/ggml-cuda.h +4 -0
- data/ext/llama_cpp/src/ggml-opencl.c +291 -215
- data/ext/llama_cpp/src/ggml.c +4428 -2143
- data/ext/llama_cpp/src/ggml.h +216 -13
- data/ext/llama_cpp/src/llama-util.h +23 -23
- data/ext/llama_cpp/src/llama.cpp +300 -149
- data/ext/llama_cpp/src/llama.h +38 -25
- data/lib/llama_cpp/client.rb +1 -3
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +2 -0
- data/sig/llama_cpp.rbs +4 -4
- metadata +2 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -1,6 +1,7 @@
 // Defines fileno on msys:
 #ifndef _GNU_SOURCE
 #define _GNU_SOURCE
+#include <cstddef>
 #include <cstdint>
 #include <cstdio>
 #endif
@@ -9,6 +10,9 @@
 #include "llama.h"

 #include "ggml.h"
+#ifdef GGML_USE_CUBLAS
+#include "ggml-cuda.h"
+#endif

 #include <array>
 #include <ctime>
@@ -42,6 +46,7 @@ enum e_model {
     MODEL_65B,
 };

+
 static const size_t MB = 1024*1024;

 // computed for n_ctx == 2048
@@ -50,49 +55,49 @@ static const size_t MB = 1024*1024;

 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
 {
-    static std::map<e_model, size_t> _MEM_REQ_SCRATCH0 = {
+    static std::map<e_model, size_t> k_sizes = {
         { MODEL_7B,    512ull * MB },
         { MODEL_13B,   512ull * MB },
         { MODEL_30B,   512ull * MB },
         { MODEL_65B,  1024ull * MB },
     };
-    return _MEM_REQ_SCRATCH0;
+    return k_sizes;
 }

 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
 {
-    static std::map<e_model, size_t> _MEM_REQ_SCRATCH1 = {
+    static std::map<e_model, size_t> k_sizes = {
         { MODEL_7B,    512ull * MB },
         { MODEL_13B,   512ull * MB },
         { MODEL_30B,   512ull * MB },
         { MODEL_65B,  1024ull * MB },
     };
-    return _MEM_REQ_SCRATCH1;
+    return k_sizes;
 }

 // 2*n_embd*n_ctx*n_layer*sizeof(float16)
 static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
 {
-    static std::map<e_model, size_t> _MEM_REQ_KV_SELF = {
+    static std::map<e_model, size_t> k_sizes = {
         { MODEL_7B,   1026ull * MB },
         { MODEL_13B,  1608ull * MB },
         { MODEL_30B,  3124ull * MB },
         { MODEL_65B,  5120ull * MB },
     };
-    return _MEM_REQ_KV_SELF;
+    return k_sizes;
 }

 // this is mostly needed for temporary mul_mat buffers to dequantize the data
 // not actually needed if BLAS is disabled
 static const std::map<e_model, size_t> & MEM_REQ_EVAL()
 {
-    static std::map<e_model, size_t> _MEM_REQ_EVAL = {
+    static std::map<e_model, size_t> k_sizes = {
         { MODEL_7B,   768ull * MB },
         { MODEL_13B, 1024ull * MB },
         { MODEL_30B, 1280ull * MB },
         { MODEL_65B, 1536ull * MB },
     };
-    return _MEM_REQ_EVAL;
+    return k_sizes;
 }

 // default hparams (LLaMA 7B)
@@ -107,7 +112,7 @@ struct llama_hparams {
     enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;

     bool operator!=(const llama_hparams & other) const {
-        return memcmp(this, &other, sizeof(llama_hparams));
+        return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams)));
     }
 };

@@ -402,6 +407,8 @@ enum llama_file_version {
     LLAMA_FILE_VERSION_GGML,
     LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab
     LLAMA_FILE_VERSION_GGJT_V1, // added padding
+    LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format
+    LLAMA_FILE_VERSION_GGJT_V3, // changed Q4 and Q8 quantization format
 };

 struct llama_file_loader {
@@ -420,22 +427,30 @@ struct llama_file_loader {
     }
     void read_magic() {
         uint32_t magic = file.read_u32();
-        uint32_t version = 0;

-        if (magic != 'ggml') {
-            version = file.read_u32();
+        if (magic == LLAMA_FILE_MAGIC_GGML) {
+            file_version = LLAMA_FILE_VERSION_GGML;
+            return;
         }

-        if (magic == 'ggml' && version == 0) {
-            file_version = LLAMA_FILE_VERSION_GGML;
-        } else if (magic == 'ggmf' && version == 1) {
-            file_version = LLAMA_FILE_VERSION_GGMF_V1;
-        } else if (magic == 'ggjt' && version == 1) {
-            file_version = LLAMA_FILE_VERSION_GGJT_V1;
-        } else {
-            throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
-                         magic, version);
+        uint32_t version = file.read_u32();
+
+        switch (magic) {
+            case LLAMA_FILE_MAGIC_GGMF:
+                switch (version) {
+                    case 1: file_version = LLAMA_FILE_VERSION_GGMF_V1; return;
+                }
+                break;
+            case LLAMA_FILE_MAGIC_GGJT:
+                switch (version) {
+                    case 1: file_version = LLAMA_FILE_VERSION_GGJT_V1; return;
+                    case 2: file_version = LLAMA_FILE_VERSION_GGJT_V2; return;
+                    case 3: file_version = LLAMA_FILE_VERSION_GGJT_V3; return;
+                }
         }
+
+        throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
+                     magic, version);
     }
     void read_hparams() {
         hparams.n_vocab = file.read_u32();
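
Note on the new read_magic(): the old if/else chain over raw multi-character literals is replaced by a switch over named LLAMA_FILE_MAGIC_* constants from llama.h, with GGJT now accepting versions 1 through 3. A self-contained sketch of the same dispatch follows; the hex values are assumptions derived from the four ASCII bytes of each tag ('ggml', 'ggmf', 'ggjt'), not copied from llama.h:

    #include <cstdint>
    #include <cstdio>

    // Assumed values: each magic is the four ASCII bytes of its tag
    // read as a 32-bit multi-character constant.
    constexpr uint32_t MAGIC_GGML = 0x67676d6c;
    constexpr uint32_t MAGIC_GGMF = 0x67676d66;
    constexpr uint32_t MAGIC_GGJT = 0x67676a74;

    const char * classify(uint32_t magic, uint32_t version) {
        if (magic == MAGIC_GGML) {
            return "ggml (unversioned)"; // no version field follows the magic
        }
        switch (magic) {
            case MAGIC_GGMF:
                if (version == 1) { return "ggmf v1"; }
                break;
            case MAGIC_GGJT:
                if (version == 1) { return "ggjt v1"; }
                if (version == 2) { return "ggjt v2"; }
                if (version == 3) { return "ggjt v3"; }
                break;
        }
        return "unknown";
    }

    int main() {
        printf("%s\n", classify(MAGIC_GGJT, 3)); // prints: ggjt v3
        return 0;
    }
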
@@ -482,7 +497,6 @@ struct llama_file_loader {
             case GGML_TYPE_F16:
             case GGML_TYPE_Q4_0:
             case GGML_TYPE_Q4_1:
-            case GGML_TYPE_Q4_2:
             case GGML_TYPE_Q5_0:
             case GGML_TYPE_Q5_1:
             case GGML_TYPE_Q8_0:
@@ -494,7 +508,7 @@ struct llama_file_loader {

         if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
             // skip to the next multiple of 32 bytes
-            file.seek(-file.tell() & 31, SEEK_CUR);
+            file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
         }
         shard.file_idx = file_idx;
         shard.file_off = file.tell();
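
The seek change above only adds a cast (negating the unsigned return of file.tell() triggered sign-conversion warnings); the alignment arithmetic itself is unchanged. For a power-of-two alignment A, -offset & (A-1) yields the padding needed to reach the next multiple of A. A minimal standalone illustration:

    #include <cstddef>
    #include <cstdio>

    // Padding needed to round `off` up to the next multiple of 32.
    // -off & 31 equals (32 - off % 32) % 32 on two's-complement targets.
    ptrdiff_t pad_to_32(ptrdiff_t off) {
        return -off & 31;
    }

    int main() {
        printf("%td\n", pad_to_32(0));  // 0  (already aligned)
        printf("%td\n", pad_to_32(1));  // 31
        printf("%td\n", pad_to_32(40)); // 24 (40 + 24 == 64)
        return 0;
    }
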
@@ -527,8 +541,8 @@ struct llama_file_saver {
         write_vocab();
     }
     void write_magic() {
-        file.write_u32('ggjt'); // magic
-        file.write_u32(1); // version
+        file.write_u32(LLAMA_FILE_MAGIC);   // magic
+        file.write_u32(LLAMA_FILE_VERSION); // version
     }
     void write_hparams(enum llama_ftype new_ftype) {
         const llama_hparams & hparams = any_file_loader->hparams;
@@ -558,7 +572,6 @@ struct llama_file_saver {
             case GGML_TYPE_F16:
             case GGML_TYPE_Q4_0:
             case GGML_TYPE_Q4_1:
-            case GGML_TYPE_Q4_2:
             case GGML_TYPE_Q5_0:
             case GGML_TYPE_Q5_1:
             case GGML_TYPE_Q8_0:
@@ -570,7 +583,7 @@ struct llama_file_saver {
         file.write_u32(new_type);
         file.write_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * tensor.ne.size());
         file.write_raw(tensor.name.data(), tensor.name.size());
-        file.seek(-file.tell() & 31, SEEK_CUR);
+        file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
         LLAMA_ASSERT(new_size == llama_calc_tensor_size(tensor.ne, new_type));
         file.write_raw(new_data, new_size);
     }
@@ -585,12 +598,12 @@ struct llama_model_loader {
     std::unique_ptr<llama_mmap> mapping;

     llama_model_loader(const std::string & fname_base, bool use_mmap, bool vocab_only) {
-        auto first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
+        auto * first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
         file_loaders.emplace_back(first_file);
         uint32_t n_parts = vocab_only ? 1 : guess_n_parts();
         for (uint32_t i = 1; i < n_parts; i++) {
             std::string fname = fname_base + "." + std::to_string(i);
-            auto ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
+            auto * ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
             file_loaders.emplace_back(ith_file);
             if (ith_file->hparams != first_file->hparams) {
                 throw format("llama.cpp: hparams inconsistent between files");
@@ -637,7 +650,7 @@ struct llama_model_loader {
         }
     }

-    struct ggml_tensor * get_tensor(const std::string & name, std::vector<uint32_t> ne) {
+    struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
         auto it = tensors_map.name_to_idx.find(name);
         if (it == tensors_map.name_to_idx.end()) {
             throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
@@ -648,10 +661,10 @@ struct llama_model_loader {
                          name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
         }

-        return get_tensor_for(lt);
+        return get_tensor_for(lt, backend);
     }

-    struct ggml_tensor * get_tensor_for(llama_load_tensor & lt) {
+    struct ggml_tensor * get_tensor_for(llama_load_tensor & lt, ggml_backend backend) {
         struct ggml_tensor * tensor;
         if (lt.ne.size() == 2) {
             tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
@@ -661,12 +674,13 @@ struct llama_model_loader {
         }
         ggml_set_name(tensor, lt.name.c_str());
         LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
+        tensor->backend = backend;
         lt.ggml_tensor = tensor;
         num_ggml_tensors_created++;
         return tensor;
     }

-    void done_getting_tensors() {
+    void done_getting_tensors() const {
         if (num_ggml_tensors_created != tensors_map.tensors.size()) {
             throw std::string("llama.cpp: file contained more tensors than expected");
         }
@@ -674,12 +688,16 @@ struct llama_model_loader {

     void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
         size_t data_size = 0;
+        size_t prefetch_size = 0;
         for (const llama_load_tensor & lt : tensors_map.tensors) {
             data_size += lt.size;
+            if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
+                prefetch_size += lt.size;
+            }
         }

         if (use_mmap) {
-            mapping.reset(new llama_mmap(&file_loaders.at(0)->file));
+            mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
             if (!lmlock) {
                 // Don't call the callback since the actual loading will be lazy
                 // and we can't measure it.
@@ -692,6 +710,9 @@ struct llama_model_loader {

         size_t done_size = 0;
         for (llama_load_tensor & lt : tensors_map.tensors) {
+            if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) {
+                continue;
+            }
             if (progress_callback) {
                 progress_callback((float) done_size / data_size, progress_callback_user_data);
             }
@@ -704,9 +725,6 @@ struct llama_model_loader {
                 lmlock->grow_to(done_size);
             }
         }
-        if (progress_callback) {
-            progress_callback(1.0f, progress_callback_user_data);
-        }
     }

     void load_data_for(llama_load_tensor & lt) {
@@ -808,9 +826,9 @@ static bool kv_cache_init(
 struct llama_context_params llama_context_default_params() {
     struct llama_context_params result = {
         /*.n_ctx                       =*/ 512,
-        /*.n_parts                     =*/ -1,
+        /*.gpu_layers                  =*/ 0,
         /*.seed                        =*/ -1,
-        /*.f16_kv                      =*/ false,
+        /*.f16_kv                      =*/ true,
         /*.logits_all                  =*/ false,
         /*.vocab_only                  =*/ false,
         /*.use_mmap                    =*/ true,
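
llama_context_default_params() now carries the new gpu_layers field (0 by default), and f16_kv defaults to true. A hedged usage sketch against the llama.h API as bundled in this release; the model path is a placeholder:

    #include "llama.h"
    #include <cstdio>

    int main() {
        llama_context_params params = llama_context_default_params();
        params.n_ctx        = 512;
        params.n_gpu_layers = 32; // only takes effect in cuBLAS builds
        // params.f16_kv is already true by default in this release

        llama_context * ctx = llama_init_from_file("models/7B/ggml-model-q4_0.bin", params);
        if (ctx == NULL) {
            fprintf(stderr, "failed to load model\n");
            return 1;
        }

        // ... tokenize, eval, sample ...

        llama_free(ctx);
        return 0;
    }
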
@@ -831,6 +849,21 @@ bool llama_mlock_supported() {
     return llama_mlock::SUPPORTED;
 }

+void llama_init_backend() {
+    ggml_time_init();
+
+    // needed to initialize f16 tables
+    {
+        struct ggml_init_params params = { 0, NULL, false };
+        struct ggml_context * ctx = ggml_init(params);
+        ggml_free(ctx);
+    }
+}
+
+int64_t llama_time_us() {
+    return ggml_time_us();
+}
+
 //
 // model loading
 //
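
llama_init_backend() and llama_time_us() are new public entry points. The intent is a one-time, process-wide setup of ggml's timers and F16 lookup tables before any other call; a minimal sketch, assuming it is invoked once at startup as the upstream examples do:

    #include "llama.h"
    #include <cstdint>
    #include <cstdio>

    int main() {
        // One-time process-wide setup: ggml timers and F16 tables.
        llama_init_backend();

        const int64_t t0 = llama_time_us();
        // ... load a model and run evaluations here ...
        const int64_t t1 = llama_time_us();

        printf("elapsed: %.3f ms\n", (t1 - t0) / 1000.0);
        return 0;
    }
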
@@ -839,9 +872,12 @@ static const char *llama_file_version_name(llama_file_version version) {
     switch (version) {
         case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)";
         case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
-        case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (latest)";
-        default: LLAMA_ASSERT(false);
+        case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1405)";
+        case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (pre #1508)";
+        case LLAMA_FILE_VERSION_GGJT_V3: return "ggjt v3 (latest)";
     }
+
+    return "unknown";
 }

 static const char *llama_ftype_name(enum llama_ftype ftype) {
@@ -852,7 +888,6 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
         case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
                                       return "mostly Q4_1, some F16";
-        case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2";
         case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
         case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
         case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
@@ -874,6 +909,7 @@ static void llama_model_load_internal(
         const std::string & fname,
         llama_context & lctx,
         int n_ctx,
+        int n_gpu_layers,
         ggml_type memory_type,
         bool use_mmap,
         bool use_mlock,
@@ -918,35 +954,32 @@ static void llama_model_load_internal(
         fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
     }

+    if (file_version < LLAMA_FILE_VERSION_GGJT_V2) {
+        if (hparams.ftype != LLAMA_FTYPE_ALL_F32     &&
+            hparams.ftype != LLAMA_FTYPE_MOSTLY_F16  &&
+            hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) {
+            throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)");
+        }
+    }
+
+    if (file_version < LLAMA_FILE_VERSION_GGJT_V3) {
+        if (hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
+            hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_1 ||
+            hparams.ftype == LLAMA_FTYPE_MOSTLY_Q8_0) {
+            throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)");
+        }
+    }
+
     if (vocab_only) {
         return;
     }

     auto & ctx = model.ctx;

-    size_t ctx_size, mmapped_size;
+    size_t ctx_size;
+    size_t mmapped_size;
     ml->calc_sizes(&ctx_size, &mmapped_size);
-    fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0);
-
-    // print memory requirements
-    {
-        const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
-
-        // this is the total memory required to run the inference
-        const size_t mem_required =
-            ctx_size +
-            mmapped_size +
-            MEM_REQ_SCRATCH0().at(model.type) +
-            MEM_REQ_SCRATCH1().at(model.type) +
-            MEM_REQ_EVAL().at(model.type);
-
-        // this is the memory required by one llama_state
-        const size_t mem_required_state =
-            scale*MEM_REQ_KV_SELF().at(model.type);
-
-        fprintf(stderr, "%s: mem required  = %7.2f MB (+ %7.2f MB per state)\n", __func__,
-                mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
-    }
+    fprintf(stderr, "%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0);

     // create the ggml context
     {
@@ -968,43 +1001,102 @@ static void llama_model_load_internal(
         }
     }

+#ifdef GGML_USE_CUBLAS
+#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CUDA
+#else
+#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
+#endif
+
     // prepare memory for the weights
+    size_t vram_total = 0;
     {
-        const auto & hparams = model.hparams;
-
         const uint32_t n_embd  = hparams.n_embd;
         const uint32_t n_layer = hparams.n_layer;
         const uint32_t n_vocab = hparams.n_vocab;

         ml->ggml_ctx = ctx;

-        model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab});
-        model.norm           = ml->get_tensor("norm.weight",           {n_embd});
-        model.output         = ml->get_tensor("output.weight",         {n_embd, n_vocab});
+        model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU);
+        model.norm           = ml->get_tensor("norm.weight",           {n_embd},          GGML_BACKEND_CPU);
+
+        // "output" tensor
+        {
+            ggml_backend backend_output;
+            if (n_gpu_layers > int(n_layer)) { // NOLINT
+                backend_output = LLAMA_BACKEND_OFFLOAD;
+            } else {
+                backend_output = GGML_BACKEND_CPU;
+            }
+
+            model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, backend_output);
+        }
+
+        const int i_gpu_start = n_layer - n_gpu_layers;

         model.layers.resize(n_layer);
         for (uint32_t i = 0; i < n_layer; ++i) {
+            const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+
             auto & layer = model.layers[i];

             std::string layers_i = "layers." + std::to_string(i);

-            layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd});
+            layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);
+
+            layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend);
+            layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend);
+            layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend);
+            layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend);

-            layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd});
-            layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd});
-            layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd});
-            layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd});
+            layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);

-            layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd});
+            layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd,   n_ff}, backend);
+            layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", {  n_ff, n_embd}, backend);
+            layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd,   n_ff}, backend);

-            layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd,   n_ff});
-            layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", {  n_ff, n_embd});
-            layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd,   n_ff});
+            if (backend == GGML_BACKEND_CUDA) {
+                vram_total +=
+                    ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk)             +
+                    ggml_nbytes(layer.wv)             + ggml_nbytes(layer.wo) + ggml_nbytes(layer.attention_norm) +
+                    ggml_nbytes(layer.w1)             + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
+            }
         }
     }

     ml->done_getting_tensors();

+    // print memory requirements
+    {
+        const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
+
+        // this is the total memory required to run the inference
+        const size_t mem_required =
+            ctx_size +
+            mmapped_size - vram_total + // weights in VRAM not in memory
+            MEM_REQ_SCRATCH0().at(model.type) +
+            MEM_REQ_SCRATCH1().at(model.type) +
+            MEM_REQ_EVAL().at(model.type);
+
+        // this is the memory required by one llama_state
+        const size_t mem_required_state =
+            scale*MEM_REQ_KV_SELF().at(model.type);
+
+        fprintf(stderr, "%s: mem required  = %7.2f MB (+ %7.2f MB per state)\n", __func__,
+                mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
+
+#ifdef GGML_USE_CUBLAS
+        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
+
+        fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
+        if (n_gpu_layers > (int) hparams.n_layer) {
+            fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
+        }
+        fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+#else
+        (void) n_gpu_layers;
+#endif
+    }
+
     // populate `tensors_by_name`
     for (llama_load_tensor & lt : ml->tensors_map.tensors) {
         model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
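
The placement rule above offloads the last n_gpu_layers transformer layers (everything from i_gpu_start onward), and the output tensor only when n_gpu_layers exceeds n_layer. Note that the vram_total sum counts layer.attention_norm twice and never adds layer.ffn_norm, exactly as written in the diff. A small standalone sketch of the split rule (not llama.cpp code):

    #include <algorithm>
    #include <cstdio>

    // Mirrors the placement rule: layers [n_layer - n_gpu_layers, n_layer)
    // go to the offload backend; the rest stay on the CPU.
    void print_split(int n_layer, int n_gpu_layers) {
        const int i_gpu_start = n_layer - n_gpu_layers; // may go negative
        const int n_gpu       = std::min(n_gpu_layers, n_layer);
        printf("n_layer=%d n_gpu_layers=%d -> CPU layers: %d, GPU layers: %d, output tensor: %s\n",
               n_layer, n_gpu_layers, std::max(i_gpu_start, 0), n_gpu,
               n_gpu_layers > n_layer ? "GPU" : "CPU");
    }

    int main() {
        print_split(32, 0);  // everything on the CPU
        print_split(32, 20); // layers 12..31 offloaded
        print_split(32, 33); // all layers plus the output tensor offloaded
        return 0;
    }
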
@@ -1012,6 +1104,33 @@ static void llama_model_load_internal(

     ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);

+#ifdef GGML_USE_CUBLAS
+    {
+        size_t done_size = 0;
+        size_t data_size = 0;
+        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+            data_size += lt.size;
+            if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
+                done_size += lt.size;
+            }
+        }
+        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+            if (lt.ggml_tensor->backend != GGML_BACKEND_CUDA) {
+                continue;
+            }
+            if (progress_callback) {
+                progress_callback((float) done_size / data_size, progress_callback_user_data);
+            }
+            ggml_cuda_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
+            done_size += lt.size;
+        }
+    }
+#endif // GGML_USE_CUBLAS
+
+    if (progress_callback) {
+        progress_callback(1.0f, progress_callback_user_data);
+    }
+
     model.mapping = std::move(ml->mapping);

     // loading time will be recalculate after the first eval, so
@@ -1023,6 +1142,7 @@ static bool llama_model_load(
         const std::string & fname,
         llama_context & lctx,
         int n_ctx,
+        int n_gpu_layers,
         ggml_type memory_type,
         bool use_mmap,
         bool use_mlock,
@@ -1030,7 +1150,7 @@ static bool llama_model_load(
         llama_progress_callback progress_callback,
         void *progress_callback_user_data) {
     try {
-        llama_model_load_internal(fname, lctx, n_ctx, memory_type, use_mmap, use_mlock,
+        llama_model_load_internal(fname, lctx, n_ctx, n_gpu_layers, memory_type, use_mmap, use_mlock,
                                   vocab_only, progress_callback, progress_callback_user_data);
         return true;
     } catch (const std::string & err) {
@@ -1052,6 +1172,13 @@ static bool llama_eval_internal(
         const int   n_tokens,
         const int   n_past,
         const int   n_threads) {
+
+    // enforce that the first token is BOS
+    if (n_past == 0 && tokens[0] != llama_token_bos()) {
+        fprintf(stderr, "%s: first token must be BOS\n", __func__);
+        return false;
+    }
+
     const int64_t t_start_us = ggml_time_us();

     const int N = n_tokens;
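
Because llama_eval_internal() now rejects a first batch that does not start with BOS, callers should tokenize with add_bos enabled. A hedged sketch against the public API; the negative-return convention of llama_tokenize on overflow is an assumption from the bundled headers:

    #include "llama.h"
    #include <cstdio>
    #include <vector>

    // Evaluate a fresh prompt (n_past == 0). The first token must now be BOS,
    // which llama_tokenize(..., add_bos = true) prepends.
    bool eval_prompt(llama_context * ctx, const char * prompt, int n_threads) {
        std::vector<llama_token> tokens(512);
        const int n = llama_tokenize(ctx, prompt, tokens.data(), (int) tokens.size(), true);
        if (n < 0) {
            return false; // assumed convention: negative means the buffer was too small
        }
        tokens.resize(n);
        return llama_eval(ctx, tokens.data(), (int) tokens.size(), /*n_past =*/ 0, n_threads) == 0;
    }
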
@@ -1059,7 +1186,7 @@ static bool llama_eval_internal(
     const auto & model   = lctx.model;
     const auto & hparams = model.hparams;

-    auto & kv_self = model.kv_self;
+    const auto & kv_self = model.kv_self;

     LLAMA_ASSERT(!!kv_self.ctx);

@@ -1103,17 +1230,15 @@ static bool llama_eval_internal(
         {
             cur = ggml_rms_norm(ctx0, inpL);

-            // cur = attention_norm*cur
-            cur = ggml_mul(ctx0,
-                        ggml_repeat(ctx0, model.layers[il].attention_norm, cur),
-                        cur);
+            // cur = cur*attention_norm(broadcasted)
+            cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm);
         }

         // self-attention
         {
             // compute Q and K and RoPE them
-            struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
-            struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
+            struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
+            struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
             ggml_set_name(Qcur, "Qcur");
             ggml_set_name(Kcur, "Kcur");

@@ -1154,17 +1279,19 @@ static bool llama_eval_internal(
             struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head));
             ggml_set_name(KQ_scale, "1/sqrt(n_embd/n_head)");

-            struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
+            // KQ_scaled shape [n_past + N, N, n_head, 1]
+            struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
             ggml_set_name(KQ_scaled, "KQ_scaled");

             // KQ_masked = mask_past(KQ_scaled)
-            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
+            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
             ggml_set_name(KQ_masked, "KQ_masked");

             // KQ = soft_max(KQ_masked)
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
             ggml_set_name(KQ_soft_max, "KQ_soft_max");

+
             // split cached V into n_head heads
             struct ggml_tensor * V =
                 ggml_view_3d(ctx0, kv_self.v,
@@ -1211,10 +1338,8 @@ static bool llama_eval_internal(
         {
             cur = ggml_rms_norm(ctx0, inpFF);

-            // cur = ffn_norm*cur
-            cur = ggml_mul(ctx0,
-                    ggml_repeat(ctx0, model.layers[il].ffn_norm, cur),
-                    cur);
+            // cur = cur*ffn_norm(broadcasted)
+            cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
         }

         struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
@@ -1251,10 +1376,8 @@ static bool llama_eval_internal(

         inpL = ggml_rms_norm(ctx0, inpL);

-        // inpL = norm*inpL
-        inpL = ggml_mul(ctx0,
-                    ggml_repeat(ctx0, model.norm, inpL),
-                    inpL);
+        // inpL = inpL*norm(broadcasted)
+        inpL = ggml_mul(ctx0, inpL, model.norm);

         embeddings = inpL;
     }
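
The ggml_repeat calls disappear because ggml_mul in this ggml revision broadcasts its smaller second operand across the rows of the first, so the 1-D norm weights multiply every row of the activations directly. A plain C++ sketch of the semantics (this mirrors the effect, not ggml's implementation):

    #include <cstdio>
    #include <vector>

    // y[r][c] = x[r][c] * w[c]: a 1-D weight broadcast across every row,
    // which is what cur = ggml_mul(ctx0, cur, norm) now computes directly.
    void mul_broadcast(std::vector<std::vector<float>> & x, const std::vector<float> & w) {
        for (auto & row : x) {
            for (size_t c = 0; c < row.size(); ++c) {
                row[c] *= w[c];
            }
        }
    }

    int main() {
        std::vector<std::vector<float>> x = {{1, 2, 3}, {4, 5, 6}};
        mul_broadcast(x, {10, 100, 1000});
        printf("%g %g %g\n", x[1][0], x[1][1], x[1][2]); // 40 500 6000
        return 0;
    }
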
@@ -1265,7 +1388,7 @@ static bool llama_eval_internal(
     lctx.use_buf(ctx0, -1);

     // logits -> probs
-    //inpL = ggml_soft_max(ctx0, inpL);
+    //inpL = ggml_soft_max_inplace(ctx0, inpL);

     // run the computation
     ggml_build_forward_expand(&gf, inpL);
@@ -1303,7 +1426,7 @@ static bool llama_eval_internal(
     }

     // extract embeddings
-    if (lctx.embedding.size()) {
+    if (!lctx.embedding.empty()) {
         auto & embedding_out = lctx.embedding;

         embedding_out.resize(n_embd);
@@ -1354,6 +1477,8 @@ struct llama_sp_symbol {
     size_t n;
 };

+static_assert(std::is_trivially_copyable<llama_sp_symbol>::value, "llama_sp_symbol is not trivially copyable");
+
 struct llama_sp_bigram {
     struct comparator {
         bool operator()(llama_sp_bigram & l, llama_sp_bigram & r) {
@@ -1386,7 +1511,7 @@ struct llama_tokenizer {
             sym.prev = index - 1;
             sym.next = offs == text.size() ? -1 : index + 1;
             index++;
-            symbols_.emplace_back(std::move(sym));
+            symbols_.emplace_back(sym);
         }

         // seed the work queue with all possible 2-character tokens.
@@ -1477,12 +1602,12 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
     llama_tokenizer tokenizer(vocab);
     std::vector<llama_vocab::id> output;

-    if (text.size() == 0) {
+    if (text.empty()) {
         return output;
     }

     if (bos) {
-        output.push_back(1);
+        output.push_back(llama_token_bos());
     }

     tokenizer.tokenize(text, output);
@@ -1713,7 +1838,7 @@ void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_dat
     const int64_t t_start_sample_us = ggml_time_us();

     for (size_t i = 0; i < candidates->size; ++i) {
-        auto token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id);
+        const auto * token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id);
         if (token_iter == last_tokens + last_tokens_size) {
             continue;
         }
@@ -1791,7 +1916,7 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_
     float k = powf((epsilon_hat * powf(2, *mu)) / (1 - powf(N, -epsilon_hat)), 1 / s_hat);

     // Sample the next word X using top-k sampling
-    llama_sample_top_k(nullptr, candidates, int(k));
+    llama_sample_top_k(nullptr, candidates, int(k), 1);
     if (ctx) {
         ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
     }
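
llama_sample_top_k() grew a fourth argument, and the mirostat sampler now passes 1 for it. The parameter is a min_keep floor: when the computed k collapses to 0, at least one candidate survives. A standalone sketch of that guard, assuming min_keep clamps k from below as in upstream llama.cpp:

    #include <algorithm>
    #include <cstdio>
    #include <functional>
    #include <vector>

    // Keep the k largest logits, but never fewer than min_keep, the role
    // of the new fourth argument (guards against int(k) collapsing to 0).
    std::vector<float> top_k(std::vector<float> logits, int k, int min_keep) {
        k = std::max(k, min_keep);
        k = std::min(k, (int) logits.size());
        std::partial_sort(logits.begin(), logits.begin() + k, logits.end(), std::greater<float>());
        logits.resize(k);
        return logits;
    }

    int main() {
        const auto kept = top_k({0.1f, 2.5f, 1.7f, -3.0f}, /*k =*/ 0, /*min_keep =*/ 1);
        printf("kept %zu candidate(s), best = %g\n", kept.size(), kept[0]);
        return 0;
    }
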
@@ -1857,7 +1982,7 @@ llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_da
     const int64_t t_start_sample_us = ggml_time_us();

     // Find max element
-    auto max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
+    auto * max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
         return a.logit < b.logit;
     });

@@ -1900,7 +2025,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     switch (ftype) {
         case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
         case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
-        case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
         case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
         case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
         case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
@@ -1911,7 +2035,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         nthread = std::thread::hardware_concurrency();
     }

-    std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false,
+    std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false,
                                                                             /*vocab_only*/ false));
     llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);

@@ -1965,7 +2089,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         } else if (tensor.type == GGML_TYPE_F16) {
             f32_conv_buf.resize(nelements * sizeof(float));
             f32_data = (float *) f32_conv_buf.addr;
-            auto f16_data = (const ggml_fp16_t *) tensor.data;
+            const auto * f16_data = (const ggml_fp16_t *) tensor.data;
             for (size_t i = 0; i < nelements; i++) {
                 f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
             }
@@ -1996,21 +2120,31 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 size_t first = counter; counter += chunk_size;
                 if (first >= nelements) {
                     if (!local_hist.empty()) {
-                        for (int j=0; j<int(local_hist.size()); ++j) hist_cur[j] += local_hist[j];
+                        for (int j=0; j<int(local_hist.size()); ++j) {
+                            hist_cur[j] += local_hist[j];
+                        }
                         new_size += local_size;
                     }
                     break;
                 }
                 lock.unlock();
                 size_t last = std::min(nelements, first + chunk_size);
-                if (local_hist.empty()) local_hist.resize(hist_cur.size(), 0);
+                if (local_hist.empty()) {
+                    local_hist.resize(hist_cur.size(), 0);
+                }
                 local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
             }
         };
-        if (int(workers.size()) < nthread_use - 1) workers.resize(nthread_use - 1);
-        for (int it = 0; it < nthread_use - 1; ++it) workers[it] = std::thread(compute);
+        if ((int) workers.size() < nthread_use - 1) {
+            workers.resize(nthread_use - 1);
+        }
+        for (int it = 0; it < nthread_use - 1; ++it) {
+            workers[it] = std::thread(compute);
+        }
         compute();
-        for (int it = 0; it < nthread_use - 1; ++it) workers[it].join();
+        for (int it = 0; it < nthread_use - 1; ++it) {
+            workers[it].join();
+        }
     }

     printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
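
The quantization loop above hands out fixed-size chunks from a shared counter to nthread_use - 1 worker threads plus the calling thread, each accumulating thread-local results before merging under the lock. A stripped-down, runnable sketch of the same pattern with the quantization call stubbed out:

    #include <algorithm>
    #include <cstdio>
    #include <mutex>
    #include <thread>
    #include <vector>

    int main() {
        const size_t nelements = 1000000;
        const size_t chunk     = 32 * 1024;
        const int    nthreads  = 4;

        std::mutex mtx;
        size_t counter = 0; // next chunk start, shared by all workers
        size_t total   = 0;

        auto compute = [&]() {
            size_t local = 0; // thread-local accumulation, merged once at the end
            for (;;) {
                size_t first;
                {
                    std::lock_guard<std::mutex> lock(mtx);
                    first = counter;
                    counter += chunk;
                }
                if (first >= nelements) {
                    break;
                }
                // stand-in for ggml_quantize_chunk(new_type, ..., first, last - first, ...)
                local += std::min(nelements, first + chunk) - first;
            }
            std::lock_guard<std::mutex> lock(mtx);
            total += local;
        };

        std::vector<std::thread> workers;
        for (int it = 0; it < nthreads - 1; ++it) {
            workers.emplace_back(compute);
        }
        compute(); // the calling thread participates too
        for (auto & w : workers) {
            w.join();
        }

        printf("processed %zu elements\n", total); // 1000000
        return 0;
    }
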
@@ -2067,7 +2201,7 @@ struct llama_context * llama_init_from_file(
             unsigned * cur_percentage_p = (unsigned *) ctx;
             unsigned percentage = (unsigned) (100 * progress);
             while (percentage > *cur_percentage_p) {
-                ++*cur_percentage_p;
+                *cur_percentage_p = percentage;
                 fprintf(stderr, ".");
                 fflush(stderr);
                 if (percentage >= 100) {
@@ -2082,7 +2216,7 @@ struct llama_context * llama_init_from_file(

     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;

-    if (!llama_model_load(path_model, *ctx, params.n_ctx, memory_type,
+    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_gpu_layers, memory_type,
                           params.use_mmap, params.use_mlock, params.vocab_only,
                           params.progress_callback, params.progress_callback_user_data)) {
         fprintf(stderr, "%s: failed to load model\n", __func__);
@@ -2160,7 +2294,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
     {
         uint32_t magic;
         fin.read((char *) &magic, sizeof(magic));
-        if (magic != 'ggla') {
+        if (magic != LLAMA_FILE_MAGIC_GGLA) {
             fprintf(stderr, "%s: bad file magic\n", __func__);
             return 1;
         }
@@ -2208,7 +2342,8 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
         fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
         model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false));

-        size_t ctx_size, mmapped_size;
+        size_t ctx_size;
+        size_t mmapped_size;
         model_loader->calc_sizes(&ctx_size, &mmapped_size);
         base_buf.resize(ctx_size);

@@ -2223,7 +2358,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *

         // maybe this should in llama_model_loader
         if (model_loader->use_mmap) {
-            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ false));
+            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0));
         }
     }

@@ -2247,8 +2382,12 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
             fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
         }

-        std::string name(length, 0);
-        fin.read(&name[0], length);
+        std::string name;
+        {
+            char buf[1024];
+            fin.read(buf, length);
+            name = std::string(buf, length);
+        }

         // check for lora suffix and get the type of tensor
         const std::string lora_suffix = ".lora";
@@ -2263,7 +2402,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
         base_name.erase(pos);
         // fprintf(stderr, "%s: %s => %s (lora type %s) ", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());

-        if (model_tensors.find(base_name.data()) == model_tensors.end()) {
+        if (model_tensors.find(base_name) == model_tensors.end()) {
             fprintf(stderr, "%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
             return 1;
         }
@@ -2312,7 +2451,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
             }
             size_t idx = model_loader->tensors_map.name_to_idx[base_name];
             llama_load_tensor & lt = model_loader->tensors_map.tensors[idx];
-            base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] });
+            base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU);
             lt.data = (uint8_t *) lt.ggml_tensor->data;
             model_loader->load_data_for(lt);
             lt.ggml_tensor->data = lt.data;
@@ -2343,7 +2482,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *

             if (scaling != 1.0f) {
                 ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
-                BA = ggml_scale(lora_ctx, BA, scale_tensor);
+                BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor);
             }

             ggml_tensor * r;
@@ -2365,8 +2504,9 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
             lora_tensors.clear();

             n_tensors++;
-            if (n_tensors % 4 == 0)
+            if (n_tensors % 4 == 0) {
                 fprintf(stderr, ".");
+            }
         }
     }

@@ -2395,7 +2535,7 @@ int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
     return ctx->model.kv_self.n;
 }

-#define LLAMA_MAX_RNG_STATE 64*1024
+#define LLAMA_MAX_RNG_STATE (64*1024)

 void llama_set_rng_seed(struct llama_context * ctx, int seed) {
     if (seed < 0) {
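
Parenthesizing the macro body is a classic precedence fix: unparenthesized, any neighboring operator binds into the expansion. A two-line illustration of the failure mode (the macro names here are made up for the example):

    #include <cstdio>

    #define RNG_STATE_BAD  64*1024   // hypothetical, mirrors the old macro
    #define RNG_STATE_GOOD (64*1024) // hypothetical, mirrors the new macro

    int main() {
        // 1024 / RNG_STATE_BAD expands to 1024 / 64 * 1024,
        // which evaluates left to right to 16 * 1024:
        printf("%d\n", 1024 / RNG_STATE_BAD);  // 16384
        printf("%d\n", 1024 / RNG_STATE_GOOD); // 0, as intended
        return 0;
    }
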
@@ -2436,8 +2576,8 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
 }

 // Copies the state to the specified destination address
-size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
-    uint8_t * out = dest;
+size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
+    uint8_t * out = dst;

     // copy rng
     {
@@ -2497,7 +2637,9 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {

     if (kv_size) {
         const size_t elt_size = ggml_element_size(kv_self.k);
+
         char buffer[4096];
+
         ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
         ggml_cgraph gf{};
         gf.n_threads = 1;
@@ -2521,10 +2663,12 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
         ggml_graph_compute(cpy_ctx, &gf);
+
+        ggml_free(cpy_ctx);
     }
     }

-    const size_t written = out - dest;
+    const size_t written = out - dst;
     const size_t max_size = llama_get_state_size(ctx);

     LLAMA_ASSERT(written <= max_size);
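
llama_copy_state_data() and llama_set_state_data() serialize the RNG, logits, embeddings, and KV cache, and llama_get_state_size() gives an upper bound for the buffer. A hedged round-trip sketch (note that in this release llama_set_state_data takes a non-const pointer):

    #include "llama.h"
    #include <cstdint>
    #include <vector>

    // Snapshot the full context state (RNG, logits, embeddings, KV cache)
    // so a conversation can be rewound or resumed later.
    std::vector<uint8_t> save_state(llama_context * ctx) {
        std::vector<uint8_t> buf(llama_get_state_size(ctx)); // upper bound
        const size_t written = llama_copy_state_data(ctx, buf.data());
        buf.resize(written); // actual size may be smaller
        return buf;
    }

    void restore_state(llama_context * ctx, std::vector<uint8_t> & buf) {
        llama_set_state_data(ctx, buf.data()); // non-const in this release
    }
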
@@ -2533,16 +2677,16 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
 }

 // Sets the state reading from the specified source address
-size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
-    const uint8_t * in = src;
+size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
+    uint8_t * inp = src;

     // set rng
     {
         size_t rng_size;
         char   rng_buf[LLAMA_MAX_RNG_STATE];

-        memcpy(&rng_size,   in, sizeof(rng_size));    in += sizeof(rng_size);
-        memcpy(&rng_buf[0], in, LLAMA_MAX_RNG_STATE); in += LLAMA_MAX_RNG_STATE;
+        memcpy(&rng_size,   inp, sizeof(rng_size));    inp += sizeof(rng_size);
+        memcpy(&rng_buf[0], inp, LLAMA_MAX_RNG_STATE); inp += LLAMA_MAX_RNG_STATE;

         std::stringstream rng_ss;
         rng_ss.str(std::string(&rng_buf[0], rng_size));
@@ -2556,30 +2700,30 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
         size_t logits_cap;
         size_t logits_size;

-        memcpy(&logits_cap,  in, sizeof(logits_cap));  in += sizeof(logits_cap);
-        memcpy(&logits_size, in, sizeof(logits_size)); in += sizeof(logits_size);
+        memcpy(&logits_cap,  inp, sizeof(logits_cap));  inp += sizeof(logits_cap);
+        memcpy(&logits_size, inp, sizeof(logits_size)); inp += sizeof(logits_size);

         LLAMA_ASSERT(ctx->logits.capacity() == logits_cap);

         if (logits_size) {
             ctx->logits.resize(logits_size);
-            memcpy(ctx->logits.data(), in, logits_size * sizeof(float));
+            memcpy(ctx->logits.data(), inp, logits_size * sizeof(float));
         }

-        in += logits_cap * sizeof(float);
+        inp += logits_cap * sizeof(float);
     }

     // set embeddings
     {
         size_t embedding_size;

-        memcpy(&embedding_size, in, sizeof(embedding_size)); in += sizeof(embedding_size);
+        memcpy(&embedding_size, inp, sizeof(embedding_size)); inp += sizeof(embedding_size);

         LLAMA_ASSERT(ctx->embedding.capacity() == embedding_size);

         if (embedding_size) {
-            memcpy(ctx->embedding.data(), in, embedding_size * sizeof(float));
-            in += embedding_size * sizeof(float);
+            memcpy(ctx->embedding.data(), inp, embedding_size * sizeof(float));
+            inp += embedding_size * sizeof(float);
         }
     }

@@ -2594,25 +2738,27 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
         size_t kv_size;
         int    kv_ntok;

-        memcpy(&kv_size, in, sizeof(kv_size)); in += sizeof(kv_size);
-        memcpy(&kv_ntok, in, sizeof(kv_ntok)); in += sizeof(kv_ntok);
+        memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
+        memcpy(&kv_ntok, inp, sizeof(kv_ntok)); inp += sizeof(kv_ntok);

         if (kv_size) {
             LLAMA_ASSERT(kv_self.buf.size == kv_size);

             const size_t elt_size = ggml_element_size(kv_self.k);
+
             char buffer[4096];
+
             ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
             ggml_cgraph gf{};
             gf.n_threads = 1;

             ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
-            kin3d->data = (void *) in;
-            in += ggml_nbytes(kin3d);
+            kin3d->data = (void *) inp;
+            inp += ggml_nbytes(kin3d);

             ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
-            vin3d->data = (void *) in;
-            in += ggml_nbytes(vin3d);
+            vin3d->data = (void *) inp;
+            inp += ggml_nbytes(vin3d);

             ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
                 n_embd, kv_ntok, n_layer,
@@ -2625,12 +2771,14 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
             ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
             ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
             ggml_graph_compute(cpy_ctx, &gf);
+
+            ggml_free(cpy_ctx);
         }

         ctx->model.kv_self.n = kv_ntok;
     }

-    const size_t nread    = in - src;
+    const size_t nread    = inp - src;
     const size_t max_size = llama_get_state_size(ctx);

     LLAMA_ASSERT(nread <= max_size);
@@ -2646,7 +2794,7 @@ bool llama_load_session_file(struct llama_context * ctx, const char * path_sessi
         const uint32_t magic   = file.read_u32();
         const uint32_t version = file.read_u32();

-        if (!(magic == LLAMA_SESSION_MAGIC && version == LLAMA_SESSION_VERSION)) {
+        if (magic != LLAMA_SESSION_MAGIC || version != LLAMA_SESSION_VERSION) {
             fprintf(stderr, "%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
             return false;
         }
@@ -2727,11 +2875,14 @@ int llama_eval(
         fprintf(stderr, "%s: failed to eval\n", __func__);
         return 1;
     }
+
     // get a more accurate load time, upon first eval
+    // TODO: fix this
     if (!ctx->has_evaluated_once) {
         ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
         ctx->has_evaluated_once = true;
     }
+
     return 0;
 }

@@ -2805,9 +2956,9 @@ void llama_print_timings(struct llama_context * ctx) {

     fprintf(stderr, "\n");
     fprintf(stderr, "%s:        load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
-    fprintf(stderr, "%s:      sample time = %8.2f ms / %5d runs   (%8.2f ms per run)\n",   __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample);
+    fprintf(stderr, "%s:      sample time = %8.2f ms / %5d runs   (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample);
     fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval);
-    fprintf(stderr, "%s:        eval time = %8.2f ms / %5d runs   (%8.2f ms per run)\n",   __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval);
+    fprintf(stderr, "%s:        eval time = %8.2f ms / %5d runs   (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval);
     fprintf(stderr, "%s:       total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0);
 }
