llama_cpp 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +17 -0
- data/ext/llama_cpp/extconf.rb +7 -0
- data/ext/llama_cpp/llama_cpp.cpp +153 -21
- data/ext/llama_cpp/src/ggml-cuda.h +4 -0
- data/ext/llama_cpp/src/ggml-opencl.c +291 -215
- data/ext/llama_cpp/src/ggml.c +4428 -2143
- data/ext/llama_cpp/src/ggml.h +216 -13
- data/ext/llama_cpp/src/llama-util.h +23 -23
- data/ext/llama_cpp/src/llama.cpp +300 -149
- data/ext/llama_cpp/src/llama.h +38 -25
- data/lib/llama_cpp/client.rb +1 -3
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +2 -0
- data/sig/llama_cpp.rbs +4 -4
- metadata +2 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -1,6 +1,7 @@
 // Defines fileno on msys:
 #ifndef _GNU_SOURCE
 #define _GNU_SOURCE
+#include <cstddef>
 #include <cstdint>
 #include <cstdio>
 #endif
@@ -9,6 +10,9 @@
 #include "llama.h"

 #include "ggml.h"
+#ifdef GGML_USE_CUBLAS
+#include "ggml-cuda.h"
+#endif

 #include <array>
 #include <ctime>
@@ -42,6 +46,7 @@ enum e_model {
 MODEL_65B,
 };

+
 static const size_t MB = 1024*1024;

 // computed for n_ctx == 2048
@@ -50,49 +55,49 @@ static const size_t MB = 1024*1024;

 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
 {
-static std::map<e_model, size_t>
+static std::map<e_model, size_t> k_sizes = {
 { MODEL_7B, 512ull * MB },
 { MODEL_13B, 512ull * MB },
 { MODEL_30B, 512ull * MB },
 { MODEL_65B, 1024ull * MB },
 };
-return
+return k_sizes;
 }

 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
 {
-static std::map<e_model, size_t>
+static std::map<e_model, size_t> k_sizes = {
 { MODEL_7B, 512ull * MB },
 { MODEL_13B, 512ull * MB },
 { MODEL_30B, 512ull * MB },
 { MODEL_65B, 1024ull * MB },
 };
-return
+return k_sizes;
 }

 // 2*n_embd*n_ctx*n_layer*sizeof(float16)
 static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
 {
-static std::map<e_model, size_t>
+static std::map<e_model, size_t> k_sizes = {
 { MODEL_7B, 1026ull * MB },
 { MODEL_13B, 1608ull * MB },
 { MODEL_30B, 3124ull * MB },
 { MODEL_65B, 5120ull * MB },
 };
-return
+return k_sizes;
 }

 // this is mostly needed for temporary mul_mat buffers to dequantize the data
 // not actually needed if BLAS is disabled
 static const std::map<e_model, size_t> & MEM_REQ_EVAL()
 {
-static std::map<e_model, size_t>
+static std::map<e_model, size_t> k_sizes = {
 { MODEL_7B, 768ull * MB },
 { MODEL_13B, 1024ull * MB },
 { MODEL_30B, 1280ull * MB },
 { MODEL_65B, 1536ull * MB },
 };
-return
+return k_sizes;
 }

 // default hparams (LLaMA 7B)
@@ -107,7 +112,7 @@ struct llama_hparams {
 enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;

 bool operator!=(const llama_hparams & other) const {
-return memcmp(this, &other, sizeof(llama_hparams));
+return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams)));
 }
 };

@@ -402,6 +407,8 @@ enum llama_file_version {
 LLAMA_FILE_VERSION_GGML,
 LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab
 LLAMA_FILE_VERSION_GGJT_V1, // added padding
+LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format
+LLAMA_FILE_VERSION_GGJT_V3, // changed Q4 and Q8 quantization format
 };

 struct llama_file_loader {
@@ -420,22 +427,30 @@ struct llama_file_loader {
 }
 void read_magic() {
 uint32_t magic = file.read_u32();
-uint32_t version = 0;

-if (magic
-
+if (magic == LLAMA_FILE_MAGIC_GGML) {
+file_version = LLAMA_FILE_VERSION_GGML;
+return;
 }

-
-
-
-
-
-
-
-
-
+uint32_t version = file.read_u32();
+
+switch (magic) {
+case LLAMA_FILE_MAGIC_GGMF:
+switch (version) {
+case 1: file_version = LLAMA_FILE_VERSION_GGMF_V1; return;
+}
+break;
+case LLAMA_FILE_MAGIC_GGJT:
+switch (version) {
+case 1: file_version = LLAMA_FILE_VERSION_GGJT_V1; return;
+case 2: file_version = LLAMA_FILE_VERSION_GGJT_V2; return;
+case 3: file_version = LLAMA_FILE_VERSION_GGJT_V3; return;
+}
 }
+
+throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
+magic, version);
 }
 void read_hparams() {
 hparams.n_vocab = file.read_u32();
@@ -482,7 +497,6 @@ struct llama_file_loader {
 case GGML_TYPE_F16:
 case GGML_TYPE_Q4_0:
 case GGML_TYPE_Q4_1:
-case GGML_TYPE_Q4_2:
 case GGML_TYPE_Q5_0:
 case GGML_TYPE_Q5_1:
 case GGML_TYPE_Q8_0:
@@ -494,7 +508,7 @@ struct llama_file_loader {

 if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
 // skip to the next multiple of 32 bytes
-file.seek(-file.tell() & 31, SEEK_CUR);
+file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
 }
 shard.file_idx = file_idx;
 shard.file_off = file.tell();
@@ -527,8 +541,8 @@ struct llama_file_saver {
 write_vocab();
 }
 void write_magic() {
-file.write_u32(
-file.write_u32(
+file.write_u32(LLAMA_FILE_MAGIC); // magic
+file.write_u32(LLAMA_FILE_VERSION); // version
 }
 void write_hparams(enum llama_ftype new_ftype) {
 const llama_hparams & hparams = any_file_loader->hparams;
@@ -558,7 +572,6 @@ struct llama_file_saver {
 case GGML_TYPE_F16:
 case GGML_TYPE_Q4_0:
 case GGML_TYPE_Q4_1:
-case GGML_TYPE_Q4_2:
 case GGML_TYPE_Q5_0:
 case GGML_TYPE_Q5_1:
 case GGML_TYPE_Q8_0:
@@ -570,7 +583,7 @@ struct llama_file_saver {
 file.write_u32(new_type);
 file.write_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * tensor.ne.size());
 file.write_raw(tensor.name.data(), tensor.name.size());
-file.seek(-file.tell() & 31, SEEK_CUR);
+file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
 LLAMA_ASSERT(new_size == llama_calc_tensor_size(tensor.ne, new_type));
 file.write_raw(new_data, new_size);
 }
@@ -585,12 +598,12 @@ struct llama_model_loader {
 std::unique_ptr<llama_mmap> mapping;

 llama_model_loader(const std::string & fname_base, bool use_mmap, bool vocab_only) {
-auto first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
+auto * first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
 file_loaders.emplace_back(first_file);
 uint32_t n_parts = vocab_only ? 1 : guess_n_parts();
 for (uint32_t i = 1; i < n_parts; i++) {
 std::string fname = fname_base + "." + std::to_string(i);
-auto ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
+auto * ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
 file_loaders.emplace_back(ith_file);
 if (ith_file->hparams != first_file->hparams) {
 throw format("llama.cpp: hparams inconsistent between files");
@@ -637,7 +650,7 @@ struct llama_model_loader {
 }
 }

-struct ggml_tensor * get_tensor(const std::string & name, std::vector<uint32_t> ne) {
+struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
 auto it = tensors_map.name_to_idx.find(name);
 if (it == tensors_map.name_to_idx.end()) {
 throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
@@ -648,10 +661,10 @@ struct llama_model_loader {
 name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
 }

-return get_tensor_for(lt);
+return get_tensor_for(lt, backend);
 }

-struct ggml_tensor * get_tensor_for(llama_load_tensor & lt) {
+struct ggml_tensor * get_tensor_for(llama_load_tensor & lt, ggml_backend backend) {
 struct ggml_tensor * tensor;
 if (lt.ne.size() == 2) {
 tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
@@ -661,12 +674,13 @@ struct llama_model_loader {
 }
 ggml_set_name(tensor, lt.name.c_str());
 LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
+tensor->backend = backend;
 lt.ggml_tensor = tensor;
 num_ggml_tensors_created++;
 return tensor;
 }

-void done_getting_tensors() {
+void done_getting_tensors() const {
 if (num_ggml_tensors_created != tensors_map.tensors.size()) {
 throw std::string("llama.cpp: file contained more tensors than expected");
 }
@@ -674,12 +688,16 @@ struct llama_model_loader {

 void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
 size_t data_size = 0;
+size_t prefetch_size = 0;
 for (const llama_load_tensor & lt : tensors_map.tensors) {
 data_size += lt.size;
+if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
+prefetch_size += lt.size;
+}
 }

 if (use_mmap) {
-mapping.reset(new llama_mmap(&file_loaders.at(0)->file));
+mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
 if (!lmlock) {
 // Don't call the callback since the actual loading will be lazy
 // and we can't measure it.
@@ -692,6 +710,9 @@ struct llama_model_loader {

 size_t done_size = 0;
 for (llama_load_tensor & lt : tensors_map.tensors) {
+if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) {
+continue;
+}
 if (progress_callback) {
 progress_callback((float) done_size / data_size, progress_callback_user_data);
 }
@@ -704,9 +725,6 @@ struct llama_model_loader {
 lmlock->grow_to(done_size);
 }
 }
-if (progress_callback) {
-progress_callback(1.0f, progress_callback_user_data);
-}
 }

 void load_data_for(llama_load_tensor & lt) {
@@ -808,9 +826,9 @@ static bool kv_cache_init(
 struct llama_context_params llama_context_default_params() {
 struct llama_context_params result = {
 /*.n_ctx =*/ 512,
-/*.
+/*.gpu_layers =*/ 0,
 /*.seed =*/ -1,
-/*.f16_kv =*/
+/*.f16_kv =*/ true,
 /*.logits_all =*/ false,
 /*.vocab_only =*/ false,
 /*.use_mmap =*/ true,
@@ -831,6 +849,21 @@ bool llama_mlock_supported() {
 return llama_mlock::SUPPORTED;
 }

+void llama_init_backend() {
+ggml_time_init();
+
+// needed to initialize f16 tables
+{
+struct ggml_init_params params = { 0, NULL, false };
+struct ggml_context * ctx = ggml_init(params);
+ggml_free(ctx);
+}
+}
+
+int64_t llama_time_us() {
+return ggml_time_us();
+}
+
 //
 // model loading
 //
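The hunk above adds two public entry points, `llama_init_backend()` and `llama_time_us()`, alongside the new GPU-layer count in `llama_context_params`. A minimal host-side sketch of how a caller might combine them — the model path and layer count are illustrative, and the `n_gpu_layers` field name is an assumption taken from the accompanying `llama.h` changes:

```cpp
#include <cstdio>
#include "llama.h"

int main() {
    // One-time, process-wide init added in this version (ggml timer + F16 tables).
    llama_init_backend();

    struct llama_context_params params = llama_context_default_params();
    params.n_ctx        = 512;
    params.n_gpu_layers = 32;   // illustrative: offload 32 layers when built with cuBLAS

    // "model.bin" is a placeholder path, not something referenced by the diff.
    struct llama_context * ctx = llama_init_from_file("model.bin", params);
    if (ctx == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    fprintf(stderr, "ready at t = %lld us\n", (long long) llama_time_us());

    llama_free(ctx);
    return 0;
}
```

Calling `llama_init_backend()` once per process front-loads the one-time ggml initialization shown in the hunk, instead of paying for it lazily on the first context.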
@@ -839,9 +872,12 @@ static const char *llama_file_version_name(llama_file_version version) {
 switch (version) {
 case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)";
 case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
-case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (
-
+case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1405)";
+case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (pre #1508)";
+case LLAMA_FILE_VERSION_GGJT_V3: return "ggjt v3 (latest)";
 }
+
+return "unknown";
 }

 static const char *llama_ftype_name(enum llama_ftype ftype) {
@@ -852,7 +888,6 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
 case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
 case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
 return "mostly Q4_1, some F16";
-case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2";
 case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
 case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
 case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
@@ -874,6 +909,7 @@ static void llama_model_load_internal(
 const std::string & fname,
 llama_context & lctx,
 int n_ctx,
+int n_gpu_layers,
 ggml_type memory_type,
 bool use_mmap,
 bool use_mlock,
@@ -918,35 +954,32 @@ static void llama_model_load_internal(
 fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
 }

+if (file_version < LLAMA_FILE_VERSION_GGJT_V2) {
+if (hparams.ftype != LLAMA_FTYPE_ALL_F32 &&
+hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 &&
+hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) {
+throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)");
+}
+}
+
+if (file_version < LLAMA_FILE_VERSION_GGJT_V3) {
+if (hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
+hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_1 ||
+hparams.ftype == LLAMA_FTYPE_MOSTLY_Q8_0) {
+throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)");
+}
+}
+
 if (vocab_only) {
 return;
 }

 auto & ctx = model.ctx;

-size_t ctx_size
+size_t ctx_size;
+size_t mmapped_size;
 ml->calc_sizes(&ctx_size, &mmapped_size);
-fprintf(stderr, "%s: ggml ctx size = %
-
-// print memory requirements
-{
-const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
-
-// this is the total memory required to run the inference
-const size_t mem_required =
-ctx_size +
-mmapped_size +
-MEM_REQ_SCRATCH0().at(model.type) +
-MEM_REQ_SCRATCH1().at(model.type) +
-MEM_REQ_EVAL().at(model.type);
-
-// this is the memory required by one llama_state
-const size_t mem_required_state =
-scale*MEM_REQ_KV_SELF().at(model.type);
-
-fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
-mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
-}
+fprintf(stderr, "%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0);

 // create the ggml context
 {
@@ -968,43 +1001,102 @@ static void llama_model_load_internal(
 }
 }

+#ifdef GGML_USE_CUBLAS
+#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CUDA
+#else
+#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
+#endif
+
 // prepare memory for the weights
+size_t vram_total = 0;
 {
-const auto & hparams = model.hparams;
-
 const uint32_t n_embd = hparams.n_embd;
 const uint32_t n_layer = hparams.n_layer;
 const uint32_t n_vocab = hparams.n_vocab;

 ml->ggml_ctx = ctx;

-model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab});
-model.norm = ml->get_tensor("norm.weight", {n_embd});
-
+model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU);
+model.norm = ml->get_tensor("norm.weight", {n_embd}, GGML_BACKEND_CPU);
+
+// "output" tensor
+{
+ggml_backend backend_output;
+if (n_gpu_layers > int(n_layer)) { // NOLINT
+backend_output = LLAMA_BACKEND_OFFLOAD;
+} else {
+backend_output = GGML_BACKEND_CPU;
+}
+
+model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, backend_output);
+}
+
+const int i_gpu_start = n_layer - n_gpu_layers;

 model.layers.resize(n_layer);
 for (uint32_t i = 0; i < n_layer; ++i) {
+const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+
 auto & layer = model.layers[i];

 std::string layers_i = "layers." + std::to_string(i);

-layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd});
+layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);
+
+layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend);
+layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend);
+layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend);
+layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend);

-layer.
-layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd});
-layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd});
-layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd});
+layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);

-layer.
+layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend);
+layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend);
+layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend);

-
-
-
+if (backend == GGML_BACKEND_CUDA) {
+vram_total +=
+ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
+ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.attention_norm) +
+ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
+}
 }
 }

 ml->done_getting_tensors();

+// print memory requirements
+{
+const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
+
+// this is the total memory required to run the inference
+const size_t mem_required =
+ctx_size +
+mmapped_size - vram_total + // weights in VRAM not in memory
+MEM_REQ_SCRATCH0().at(model.type) +
+MEM_REQ_SCRATCH1().at(model.type) +
+MEM_REQ_EVAL().at(model.type);
+
+// this is the memory required by one llama_state
+const size_t mem_required_state =
+scale*MEM_REQ_KV_SELF().at(model.type);
+
+fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
+mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
+
+#ifdef GGML_USE_CUBLAS
+const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
+
+fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
+if (n_gpu_layers > (int) hparams.n_layer) {
+fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
+}
+fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+#else
+(void) n_gpu_layers;
+#endif
+}
+
 // populate `tensors_by_name`
 for (llama_load_tensor & lt : ml->tensors_map.tensors) {
 model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
@@ -1012,6 +1104,33 @@ static void llama_model_load_internal(

 ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);

+#ifdef GGML_USE_CUBLAS
+{
+size_t done_size = 0;
+size_t data_size = 0;
+for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+data_size += lt.size;
+if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
+done_size += lt.size;
+}
+}
+for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+if (lt.ggml_tensor->backend != GGML_BACKEND_CUDA) {
+continue;
+}
+if (progress_callback) {
+progress_callback((float) done_size / data_size, progress_callback_user_data);
+}
+ggml_cuda_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
+done_size += lt.size;
+}
+}
+#endif // GGML_USE_CUBLAS
+
+if (progress_callback) {
+progress_callback(1.0f, progress_callback_user_data);
+}
+
 model.mapping = std::move(ml->mapping);

 // loading time will be recalculate after the first eval, so
@@ -1023,6 +1142,7 @@ static bool llama_model_load(
 const std::string & fname,
 llama_context & lctx,
 int n_ctx,
+int n_gpu_layers,
 ggml_type memory_type,
 bool use_mmap,
 bool use_mlock,
@@ -1030,7 +1150,7 @@ static bool llama_model_load(
 llama_progress_callback progress_callback,
 void *progress_callback_user_data) {
 try {
-llama_model_load_internal(fname, lctx, n_ctx, memory_type, use_mmap, use_mlock,
+llama_model_load_internal(fname, lctx, n_ctx, n_gpu_layers, memory_type, use_mmap, use_mlock,
 vocab_only, progress_callback, progress_callback_user_data);
 return true;
 } catch (const std::string & err) {
@@ -1052,6 +1172,13 @@ static bool llama_eval_internal(
 const int n_tokens,
 const int n_past,
 const int n_threads) {
+
+// enforce that the first token is BOS
+if (n_past == 0 && tokens[0] != llama_token_bos()) {
+fprintf(stderr, "%s: first token must be BOS\n", __func__);
+return false;
+}
+
 const int64_t t_start_us = ggml_time_us();

 const int N = n_tokens;
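The last hunk above makes `llama_eval_internal()` reject a first batch whose initial token is not BOS. A hedged caller-side sketch — the helper name and buffer sizing are illustrative, not part of the diff — is simply to tokenize with `add_bos = true` so the new check passes:

```cpp
#include <cstdio>
#include <string>
#include <vector>
#include "llama.h"

// Assumes ctx came from llama_init_from_file(); prompt is arbitrary user text.
static bool eval_prompt(llama_context * ctx, const std::string & prompt, int n_threads) {
    // Room for one token per byte of the prompt plus the BOS token.
    std::vector<llama_token> tokens(prompt.size() + 1);
    const int n = llama_tokenize(ctx, prompt.c_str(), tokens.data(), (int) tokens.size(), /*add_bos*/ true);
    if (n < 0) {
        return false; // buffer was too small
    }
    tokens.resize(n);

    // tokens[0] is now BOS, which the new check in llama_eval_internal() requires.
    return llama_eval(ctx, tokens.data(), (int) tokens.size(), /*n_past*/ 0, n_threads) == 0;
}
```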
@@ -1059,7 +1186,7 @@
 const auto & model = lctx.model;
 const auto & hparams = model.hparams;

-auto & kv_self = model.kv_self;
+const auto & kv_self = model.kv_self;

 LLAMA_ASSERT(!!kv_self.ctx);

@@ -1103,17 +1230,15 @@
 {
 cur = ggml_rms_norm(ctx0, inpL);

-// cur = attention_norm
-cur = ggml_mul(ctx0,
-ggml_repeat(ctx0, model.layers[il].attention_norm, cur),
-cur);
+// cur = cur*attention_norm(broadcasted)
+cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm);
 }

 // self-attention
 {
 // compute Q and K and RoPE them
-struct ggml_tensor * Qcur =
-struct ggml_tensor * Kcur =
+struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
+struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
 ggml_set_name(Qcur, "Qcur");
 ggml_set_name(Kcur, "Kcur");

@@ -1154,17 +1279,19 @@
 struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head));
 ggml_set_name(KQ_scale, "1/sqrt(n_embd/n_head)");

-
+// KQ_scaled shape [n_past + N, N, n_head, 1]
+struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
 ggml_set_name(KQ_scaled, "KQ_scaled");

 // KQ_masked = mask_past(KQ_scaled)
-struct ggml_tensor * KQ_masked =
+struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
 ggml_set_name(KQ_masked, "KQ_masked");

 // KQ = soft_max(KQ_masked)
-struct ggml_tensor * KQ_soft_max =
+struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
 ggml_set_name(KQ_soft_max, "KQ_soft_max");

+
 // split cached V into n_head heads
 struct ggml_tensor * V =
 ggml_view_3d(ctx0, kv_self.v,
@@ -1211,10 +1338,8 @@
 {
 cur = ggml_rms_norm(ctx0, inpFF);

-// cur = ffn_norm
-cur = ggml_mul(ctx0,
-ggml_repeat(ctx0, model.layers[il].ffn_norm, cur),
-cur);
+// cur = cur*ffn_norm(broadcasted)
+cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
 }

 struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
@@ -1251,10 +1376,8 @@

 inpL = ggml_rms_norm(ctx0, inpL);

-// inpL = norm
-inpL = ggml_mul(ctx0,
-ggml_repeat(ctx0, model.norm, inpL),
-inpL);
+// inpL = inpL*norm(broadcasted)
+inpL = ggml_mul(ctx0, inpL, model.norm);

 embeddings = inpL;
 }
@@ -1265,7 +1388,7 @@
 lctx.use_buf(ctx0, -1);

 // logits -> probs
-//inpL =
+//inpL = ggml_soft_max_inplace(ctx0, inpL);

 // run the computation
 ggml_build_forward_expand(&gf, inpL);
@@ -1303,7 +1426,7 @@
 }

 // extract embeddings
-if (lctx.embedding.
+if (!lctx.embedding.empty()) {
 auto & embedding_out = lctx.embedding;

 embedding_out.resize(n_embd);
@@ -1354,6 +1477,8 @@ struct llama_sp_symbol {
 size_t n;
 };

+static_assert(std::is_trivially_copyable<llama_sp_symbol>::value, "llama_sp_symbol is not trivially copyable");
+
 struct llama_sp_bigram {
 struct comparator {
 bool operator()(llama_sp_bigram & l, llama_sp_bigram & r) {
@@ -1386,7 +1511,7 @@ struct llama_tokenizer {
 sym.prev = index - 1;
 sym.next = offs == text.size() ? -1 : index + 1;
 index++;
-symbols_.emplace_back(
+symbols_.emplace_back(sym);
 }

 // seed the work queue with all possible 2-character tokens.
@@ -1477,12 +1602,12 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
 llama_tokenizer tokenizer(vocab);
 std::vector<llama_vocab::id> output;

-if (text.
+if (text.empty()) {
 return output;
 }

 if (bos) {
-output.push_back(
+output.push_back(llama_token_bos());
 }

 tokenizer.tokenize(text, output);
@@ -1713,7 +1838,7 @@ void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_dat
 const int64_t t_start_sample_us = ggml_time_us();

 for (size_t i = 0; i < candidates->size; ++i) {
-auto token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id);
+const auto * token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id);
 if (token_iter == last_tokens + last_tokens_size) {
 continue;
 }
@@ -1791,7 +1916,7 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_
 float k = powf((epsilon_hat * powf(2, *mu)) / (1 - powf(N, -epsilon_hat)), 1 / s_hat);

 // Sample the next word X using top-k sampling
-llama_sample_top_k(nullptr, candidates, int(k));
+llama_sample_top_k(nullptr, candidates, int(k), 1);
 if (ctx) {
 ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
 }
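The mirostat change above now passes an explicit `min_keep` of 1 to `llama_sample_top_k`. A sketch of a matching caller-side sampling step, assuming the public sampling API of this release (`llama_get_logits`, `llama_sample_temperature`, `llama_sample_token`); the helper name and parameter values are illustrative:

```cpp
#include <vector>
#include "llama.h"

// Picks the next token with plain top-k + temperature sampling, right after llama_eval().
static llama_token pick_next_token(llama_context * ctx, int top_k, float temp) {
    const int n_vocab = llama_n_vocab(ctx);
    float * logits    = llama_get_logits(ctx);

    std::vector<llama_token_data> candidates;
    candidates.reserve(n_vocab);
    for (llama_token id = 0; id < n_vocab; ++id) {
        candidates.push_back({ id, logits[id], 0.0f });
    }
    llama_token_data_array arr = { candidates.data(), candidates.size(), false };

    // min_keep is passed explicitly, mirroring the internal call in the hunk above.
    llama_sample_top_k(ctx, &arr, top_k, /*min_keep*/ 1);
    llama_sample_temperature(ctx, &arr, temp);
    return llama_sample_token(ctx, &arr);
}
```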
@@ -1857,7 +1982,7 @@ llama_token llama_sample_token_greedy(llama_token_da
 const int64_t t_start_sample_us = ggml_time_us();

 // Find max element
-auto max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
+auto * max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
 return a.logit < b.logit;
 });

@@ -1900,7 +2025,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 switch (ftype) {
 case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
 case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
-case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
 case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
 case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
 case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
@@ -1911,7 +2035,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 nthread = std::thread::hardware_concurrency();
 }

-std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp
+std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false,
 /*vocab_only*/ false));
 llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);

@@ -1965,7 +2089,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 } else if (tensor.type == GGML_TYPE_F16) {
 f32_conv_buf.resize(nelements * sizeof(float));
 f32_data = (float *) f32_conv_buf.addr;
-auto f16_data = (const ggml_fp16_t *) tensor.data;
+const auto * f16_data = (const ggml_fp16_t *) tensor.data;
 for (size_t i = 0; i < nelements; i++) {
 f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
 }
@@ -1996,21 +2120,31 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 size_t first = counter; counter += chunk_size;
 if (first >= nelements) {
 if (!local_hist.empty()) {
-for (int j=0; j<int(local_hist.size()); ++j)
+for (int j=0; j<int(local_hist.size()); ++j) {
+hist_cur[j] += local_hist[j];
+}
 new_size += local_size;
 }
 break;
 }
 lock.unlock();
 size_t last = std::min(nelements, first + chunk_size);
-if (local_hist.empty())
+if (local_hist.empty()) {
+local_hist.resize(hist_cur.size(), 0);
+}
 local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
 }
 };
-if (int
-
+if ((int) workers.size() < nthread_use - 1) {
+workers.resize(nthread_use - 1);
+}
+for (int it = 0; it < nthread_use - 1; ++it) {
+workers[it] = std::thread(compute);
+}
 compute();
-for (int it = 0; it < nthread_use - 1; ++it)
+for (int it = 0; it < nthread_use - 1; ++it) {
+workers[it].join();
+}
 }

 printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
@@ -2067,7 +2201,7 @@ struct llama_context * llama_init_from_file(
 unsigned * cur_percentage_p = (unsigned *) ctx;
 unsigned percentage = (unsigned) (100 * progress);
 while (percentage > *cur_percentage_p) {
-
+*cur_percentage_p = percentage;
 fprintf(stderr, ".");
 fflush(stderr);
 if (percentage >= 100) {
@@ -2082,7 +2216,7 @@ struct llama_context * llama_init_from_file(

 ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;

-if (!llama_model_load(path_model, *ctx, params.n_ctx, memory_type,
+if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_gpu_layers, memory_type,
 params.use_mmap, params.use_mlock, params.vocab_only,
 params.progress_callback, params.progress_callback_user_data)) {
 fprintf(stderr, "%s: failed to load model\n", __func__);
@@ -2160,7 +2294,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
 {
 uint32_t magic;
 fin.read((char *) &magic, sizeof(magic));
-if (magic !=
+if (magic != LLAMA_FILE_MAGIC_GGLA) {
 fprintf(stderr, "%s: bad file magic\n", __func__);
 return 1;
 }
@@ -2208,7 +2342,8 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
 fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
 model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false));

-size_t ctx_size
+size_t ctx_size;
+size_t mmapped_size;
 model_loader->calc_sizes(&ctx_size, &mmapped_size);
 base_buf.resize(ctx_size);

@@ -2223,7 +2358,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *

 // maybe this should in llama_model_loader
 if (model_loader->use_mmap) {
-model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */
+model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0));
 }
 }

@@ -2247,8 +2382,12 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
 fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
 }

-std::string name
-
+std::string name;
+{
+char buf[1024];
+fin.read(buf, length);
+name = std::string(buf, length);
+}

 // check for lora suffix and get the type of tensor
 const std::string lora_suffix = ".lora";
@@ -2263,7 +2402,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
 base_name.erase(pos);
 // fprintf(stderr, "%s: %s => %s (lora type %s) ", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());

-if (model_tensors.find(base_name
+if (model_tensors.find(base_name) == model_tensors.end()) {
 fprintf(stderr, "%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
 return 1;
 }
@@ -2312,7 +2451,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
 }
 size_t idx = model_loader->tensors_map.name_to_idx[base_name];
 llama_load_tensor & lt = model_loader->tensors_map.tensors[idx];
-base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] });
+base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU);
 lt.data = (uint8_t *) lt.ggml_tensor->data;
 model_loader->load_data_for(lt);
 lt.ggml_tensor->data = lt.data;
@@ -2343,7 +2482,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *

 if (scaling != 1.0f) {
 ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
-BA =
+BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor);
 }

 ggml_tensor * r;
@@ -2365,8 +2504,9 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
 lora_tensors.clear();

 n_tensors++;
-if (n_tensors % 4 == 0)
+if (n_tensors % 4 == 0) {
 fprintf(stderr, ".");
+}
 }
 }

@@ -2395,7 +2535,7 @@ int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
 return ctx->model.kv_self.n;
 }

-#define LLAMA_MAX_RNG_STATE 64*1024
+#define LLAMA_MAX_RNG_STATE (64*1024)

 void llama_set_rng_seed(struct llama_context * ctx, int seed) {
 if (seed < 0) {
@@ -2436,8 +2576,8 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
 }

 // Copies the state to the specified destination address
-size_t llama_copy_state_data(struct llama_context * ctx, uint8_t *
-uint8_t * out =
+size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
+uint8_t * out = dst;

 // copy rng
 {
@@ -2497,7 +2637,9 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {

 if (kv_size) {
 const size_t elt_size = ggml_element_size(kv_self.k);
+
 char buffer[4096];
+
 ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
 ggml_cgraph gf{};
 gf.n_threads = 1;
@@ -2521,10 +2663,12 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
 ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
 ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
 ggml_graph_compute(cpy_ctx, &gf);
+
+ggml_free(cpy_ctx);
 }
 }

-const size_t written = out -
+const size_t written = out - dst;
 const size_t max_size = llama_get_state_size(ctx);

 LLAMA_ASSERT(written <= max_size);
@@ -2533,16 +2677,16 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
 }

 // Sets the state reading from the specified source address
-size_t llama_set_state_data(struct llama_context * ctx,
-
+size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
+uint8_t * inp = src;

 // set rng
 {
 size_t rng_size;
 char rng_buf[LLAMA_MAX_RNG_STATE];

-memcpy(&rng_size,
-memcpy(&rng_buf[0],
+memcpy(&rng_size, inp, sizeof(rng_size)); inp += sizeof(rng_size);
+memcpy(&rng_buf[0], inp, LLAMA_MAX_RNG_STATE); inp += LLAMA_MAX_RNG_STATE;

 std::stringstream rng_ss;
 rng_ss.str(std::string(&rng_buf[0], rng_size));
@@ -2556,30 +2700,30 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
 size_t logits_cap;
 size_t logits_size;

-memcpy(&logits_cap,
-memcpy(&logits_size,
+memcpy(&logits_cap, inp, sizeof(logits_cap)); inp += sizeof(logits_cap);
+memcpy(&logits_size, inp, sizeof(logits_size)); inp += sizeof(logits_size);

 LLAMA_ASSERT(ctx->logits.capacity() == logits_cap);

 if (logits_size) {
 ctx->logits.resize(logits_size);
-memcpy(ctx->logits.data(),
+memcpy(ctx->logits.data(), inp, logits_size * sizeof(float));
 }

-
+inp += logits_cap * sizeof(float);
 }

 // set embeddings
 {
 size_t embedding_size;

-memcpy(&embedding_size,
+memcpy(&embedding_size, inp, sizeof(embedding_size)); inp += sizeof(embedding_size);

 LLAMA_ASSERT(ctx->embedding.capacity() == embedding_size);

 if (embedding_size) {
-memcpy(ctx->embedding.data(),
-
+memcpy(ctx->embedding.data(), inp, embedding_size * sizeof(float));
+inp += embedding_size * sizeof(float);
 }
 }

@@ -2594,25 +2738,27 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
 size_t kv_size;
 int kv_ntok;

-memcpy(&kv_size,
-memcpy(&kv_ntok,
+memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
+memcpy(&kv_ntok, inp, sizeof(kv_ntok)); inp += sizeof(kv_ntok);

 if (kv_size) {
 LLAMA_ASSERT(kv_self.buf.size == kv_size);

 const size_t elt_size = ggml_element_size(kv_self.k);
+
 char buffer[4096];
+
 ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
 ggml_cgraph gf{};
 gf.n_threads = 1;

 ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
-kin3d->data = (void *)
-
+kin3d->data = (void *) inp;
+inp += ggml_nbytes(kin3d);

 ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
-vin3d->data = (void *)
-
+vin3d->data = (void *) inp;
+inp += ggml_nbytes(vin3d);

 ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
 n_embd, kv_ntok, n_layer,
@@ -2625,12 +2771,14 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
 ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
 ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
 ggml_graph_compute(cpy_ctx, &gf);
+
+ggml_free(cpy_ctx);
 }

 ctx->model.kv_self.n = kv_ntok;
 }

-const size_t nread =
+const size_t nread = inp - src;
 const size_t max_size = llama_get_state_size(ctx);

 LLAMA_ASSERT(nread <= max_size);
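The state (de)serialization hunks above name the buffer parameters `dst`/`src` and walk an `inp` cursor through the serialized RNG state, logits, embeddings and KV cache. A hedged sketch of the round trip from the caller's side — the helper names are illustrative, while `llama_get_state_size`, `llama_copy_state_data` and `llama_set_state_data` are the public API touched by this diff:

```cpp
#include <cstdint>
#include <vector>
#include "llama.h"

// Snapshot the full context state (RNG, logits, embeddings, KV cache) into a buffer.
static std::vector<uint8_t> save_state(llama_context * ctx) {
    std::vector<uint8_t> buf(llama_get_state_size(ctx));
    const size_t written = llama_copy_state_data(ctx, buf.data());
    buf.resize(written);
    return buf;
}

// Restore a previously saved snapshot. Note the source pointer is uint8_t * (non-const)
// after this change, so the buffer must stay mutable.
static void restore_state(llama_context * ctx, std::vector<uint8_t> & buf) {
    llama_set_state_data(ctx, buf.data());
}
```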
@@ -2646,7 +2794,7 @@ bool llama_load_session_file(struct llama_context * ctx, const char * path_sessi
 const uint32_t magic = file.read_u32();
 const uint32_t version = file.read_u32();

-if (
+if (magic != LLAMA_SESSION_MAGIC || version != LLAMA_SESSION_VERSION) {
 fprintf(stderr, "%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
 return false;
 }
@@ -2727,11 +2875,14 @@ int llama_eval(
 fprintf(stderr, "%s: failed to eval\n", __func__);
 return 1;
 }
+
 // get a more accurate load time, upon first eval
+// TODO: fix this
 if (!ctx->has_evaluated_once) {
 ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
 ctx->has_evaluated_once = true;
 }
+
 return 0;
 }

@@ -2805,9 +2956,9 @@ void llama_print_timings(struct llama_context * ctx) {

 fprintf(stderr, "\n");
 fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
-fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per
+fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample);
 fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval);
-fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per
+fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval);
 fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0);
 }