llama_cpp 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/ext/llama_cpp/llama_cpp.cpp +93 -15
- data/ext/llama_cpp/src/ggml-cuda.h +2 -0
- data/ext/llama_cpp/src/ggml-opencl.c +85 -122
- data/ext/llama_cpp/src/ggml.c +6268 -4208
- data/ext/llama_cpp/src/ggml.h +205 -12
- data/ext/llama_cpp/src/llama.cpp +159 -79
- data/ext/llama_cpp/src/llama.h +10 -10
- data/lib/llama_cpp/client.rb +1 -3
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +3 -4
- metadata +2 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -9,6 +9,9 @@
 #include "llama.h"
 
 #include "ggml.h"
+#ifdef GGML_USE_CUBLAS
+#include "ggml-cuda.h"
+#endif
 
 #include <array>
 #include <ctime>
@@ -50,49 +53,49 @@ static const size_t MB = 1024*1024;
 
 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
 {
-    static std::map<e_model, size_t>
+    static std::map<e_model, size_t> k_sizes = {
         { MODEL_7B,   512ull * MB },
         { MODEL_13B,  512ull * MB },
         { MODEL_30B,  512ull * MB },
         { MODEL_65B, 1024ull * MB },
     };
-    return
+    return k_sizes;
 }
 
 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
 {
-    static std::map<e_model, size_t>
+    static std::map<e_model, size_t> k_sizes = {
         { MODEL_7B,   512ull * MB },
         { MODEL_13B,  512ull * MB },
         { MODEL_30B,  512ull * MB },
         { MODEL_65B, 1024ull * MB },
     };
-    return
+    return k_sizes;
 }
 
 // 2*n_embd*n_ctx*n_layer*sizeof(float16)
 static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
 {
-    static std::map<e_model, size_t>
+    static std::map<e_model, size_t> k_sizes = {
         { MODEL_7B,  1026ull * MB },
         { MODEL_13B, 1608ull * MB },
         { MODEL_30B, 3124ull * MB },
         { MODEL_65B, 5120ull * MB },
     };
-    return
+    return k_sizes;
 }
 
 // this is mostly needed for temporary mul_mat buffers to dequantize the data
 // not actually needed if BLAS is disabled
 static const std::map<e_model, size_t> & MEM_REQ_EVAL()
 {
-    static std::map<e_model, size_t>
+    static std::map<e_model, size_t> k_sizes = {
        { MODEL_7B,   768ull * MB },
        { MODEL_13B, 1024ull * MB },
        { MODEL_30B, 1280ull * MB },
        { MODEL_65B, 1536ull * MB },
     };
-    return
+    return k_sizes;
 }
 
 // default hparams (LLaMA 7B)
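The four MEM_REQ_* helpers now build and return a named function-local static map (k_sizes). A minimal sketch of that pattern, with illustrative names and values rather than the gem's own:

#include <cstdio>
#include <map>
#include <string>

// Returning a reference to a function-local static gives lazy, one-time
// construction and a stable object that is safe to hand out by reference.
static const std::map<std::string, size_t> & example_sizes() {
    static std::map<std::string, size_t> k_sizes = {
        { "7B",  512 },
        { "13B", 512 },
    };
    return k_sizes; // a named object, so the returned reference stays valid
}

int main() {
    std::printf("7B scratch: %zu MB\n", example_sizes().at("7B"));
    return 0;
}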
@@ -402,6 +405,7 @@ enum llama_file_version {
     LLAMA_FILE_VERSION_GGML,
     LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab
     LLAMA_FILE_VERSION_GGJT_V1, // added padding
+    LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format
 };
 
 struct llama_file_loader {
@@ -432,6 +436,8 @@ struct llama_file_loader {
             file_version = LLAMA_FILE_VERSION_GGMF_V1;
         } else if (magic == 'ggjt' && version == 1) {
             file_version = LLAMA_FILE_VERSION_GGJT_V1;
+        } else if (magic == 'ggjt' && version == 2) {
+            file_version = LLAMA_FILE_VERSION_GGJT_V2;
         } else {
             throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
                          magic, version);
@@ -482,7 +488,6 @@ struct llama_file_loader {
             case GGML_TYPE_F16:
             case GGML_TYPE_Q4_0:
             case GGML_TYPE_Q4_1:
-            case GGML_TYPE_Q4_2:
             case GGML_TYPE_Q5_0:
             case GGML_TYPE_Q5_1:
             case GGML_TYPE_Q8_0:
@@ -527,8 +532,8 @@ struct llama_file_saver {
         write_vocab();
     }
     void write_magic() {
-        file.write_u32(
-        file.write_u32(
+        file.write_u32(LLAMA_FILE_MAGIC);   // magic
+        file.write_u32(LLAMA_FILE_VERSION); // version
     }
     void write_hparams(enum llama_ftype new_ftype) {
         const llama_hparams & hparams = any_file_loader->hparams;
@@ -558,7 +563,6 @@ struct llama_file_saver {
             case GGML_TYPE_F16:
             case GGML_TYPE_Q4_0:
             case GGML_TYPE_Q4_1:
-            case GGML_TYPE_Q4_2:
             case GGML_TYPE_Q5_0:
             case GGML_TYPE_Q5_1:
             case GGML_TYPE_Q8_0:
@@ -585,12 +589,12 @@ struct llama_model_loader {
     std::unique_ptr<llama_mmap> mapping;
 
     llama_model_loader(const std::string & fname_base, bool use_mmap, bool vocab_only) {
-        auto first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
+        auto * first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
         file_loaders.emplace_back(first_file);
         uint32_t n_parts = vocab_only ? 1 : guess_n_parts();
         for (uint32_t i = 1; i < n_parts; i++) {
             std::string fname = fname_base + "." + std::to_string(i);
-            auto ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
+            auto * ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
             file_loaders.emplace_back(ith_file);
             if (ith_file->hparams != first_file->hparams) {
                 throw format("llama.cpp: hparams inconsistent between files");
@@ -637,7 +641,7 @@ struct llama_model_loader {
         }
     }
 
-    struct ggml_tensor * get_tensor(const std::string & name, std::vector<uint32_t> ne) {
+    struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne) {
        auto it = tensors_map.name_to_idx.find(name);
        if (it == tensors_map.name_to_idx.end()) {
            throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
@@ -666,7 +670,7 @@ struct llama_model_loader {
         return tensor;
     }
 
-    void done_getting_tensors() {
+    void done_getting_tensors() const {
        if (num_ggml_tensors_created != tensors_map.tensors.size()) {
            throw std::string("llama.cpp: file contained more tensors than expected");
        }
@@ -808,9 +812,9 @@ static bool kv_cache_init(
 struct llama_context_params llama_context_default_params() {
     struct llama_context_params result = {
         /*.n_ctx =*/ 512,
-        /*.
+        /*.gpu_layers =*/ 0,
         /*.seed =*/ -1,
-        /*.f16_kv =*/
+        /*.f16_kv =*/ true,
         /*.logits_all =*/ false,
         /*.vocab_only =*/ false,
         /*.use_mmap =*/ true,
@@ -839,9 +843,11 @@ static const char *llama_file_version_name(llama_file_version version) {
     switch (version) {
         case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)";
         case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
-        case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (
-
+        case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1405)";
+        case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (latest)";
     }
+
+    return "unknown";
 }
 
 static const char *llama_ftype_name(enum llama_ftype ftype) {
@@ -852,7 +858,6 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
         case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
                                       return "mostly Q4_1, some F16";
-        case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2";
         case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
         case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
         case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
@@ -874,6 +879,7 @@ static void llama_model_load_internal(
         const std::string & fname,
         llama_context & lctx,
         int n_ctx,
+        int n_gpu_layers,
         ggml_type memory_type,
         bool use_mmap,
         bool use_mlock,
@@ -918,15 +924,24 @@ static void llama_model_load_internal(
         fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
     }
 
+    if (file_version != LLAMA_FILE_VERSION_GGJT_V2) {
+        if (hparams.ftype != LLAMA_FTYPE_ALL_F32 &&
+            hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 &&
+            hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) {
+            throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1305)");
+        }
+    }
+
     if (vocab_only) {
         return;
     }
 
     auto & ctx = model.ctx;
 
-    size_t ctx_size
+    size_t ctx_size;
+    size_t mmapped_size;
     ml->calc_sizes(&ctx_size, &mmapped_size);
-    fprintf(stderr, "%s: ggml ctx size = %6.2f
+    fprintf(stderr, "%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/1024.0/1024.0);
 
     // print memory requirements
     {
@@ -970,8 +985,6 @@ static void llama_model_load_internal(
 
     // prepare memory for the weights
     {
-        const auto & hparams = model.hparams;
-
         const uint32_t n_embd = hparams.n_embd;
         const uint32_t n_layer = hparams.n_layer;
         const uint32_t n_vocab = hparams.n_vocab;
@@ -1013,6 +1026,35 @@ static void llama_model_load_internal(
     ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
 
     model.mapping = std::move(ml->mapping);
+#ifdef GGML_USE_CUBLAS
+    {
+        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
+
+        fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
+
+        size_t vram_total = 0;
+
+        for (int i = 0; i < n_gpu; ++i) {
+            const auto & layer = model.layers[i];
+
+            ggml_cuda_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
+            ggml_cuda_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
+            ggml_cuda_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
+            ggml_cuda_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
+            ggml_cuda_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
+            ggml_cuda_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
+            ggml_cuda_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
+        }
+        if (n_gpu_layers > (int) hparams.n_layer) {
+            fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
+            ggml_cuda_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
+        }
+
+        fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+    }
+#else
+    (void) n_gpu_layers;
+#endif
 
     // loading time will be recalculate after the first eval, so
     // we take page faults deferred by mmap() into consideration
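With cuBLAS enabled, the loader now copies up to n_gpu_layers transformer layers (and optionally the output layer) into VRAM via ggml_cuda_transform_tensor. A hedged usage sketch of the new context parameter, based on the llama.h API visible in this diff; the model path and layer count are placeholders:

#include <cstdio>
#include "llama.h"

int main() {
    llama_context_params params = llama_context_default_params();
    params.n_ctx        = 512;
    params.n_gpu_layers = 20; // ignored ((void) n_gpu_layers) when built without cuBLAS

    // placeholder path; any GGJT v2 model file would do
    llama_context * ctx = llama_init_from_file("models/7B/ggml-model-q4_0.bin", params);
    if (ctx == NULL) {
        std::fprintf(stderr, "failed to load model\n");
        return 1;
    }

    llama_free(ctx);
    return 0;
}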
@@ -1023,6 +1065,7 @@ static bool llama_model_load(
         const std::string & fname,
         llama_context & lctx,
         int n_ctx,
+        int n_gpu_layers,
         ggml_type memory_type,
         bool use_mmap,
         bool use_mlock,
@@ -1030,7 +1073,7 @@ static bool llama_model_load(
         llama_progress_callback progress_callback,
         void *progress_callback_user_data) {
     try {
-        llama_model_load_internal(fname, lctx, n_ctx, memory_type, use_mmap, use_mlock,
+        llama_model_load_internal(fname, lctx, n_ctx, n_gpu_layers, memory_type, use_mmap, use_mlock,
                                   vocab_only, progress_callback, progress_callback_user_data);
         return true;
     } catch (const std::string & err) {
@@ -1052,6 +1095,13 @@ static bool llama_eval_internal(
         const int n_tokens,
         const int n_past,
         const int n_threads) {
+
+    // enforce that the first token is BOS
+    if (n_past == 0 && tokens[0] != llama_token_bos()) {
+        fprintf(stderr, "%s: first token must be BOS\n", __func__);
+        return false;
+    }
+
     const int64_t t_start_us = ggml_time_us();
 
     const int N = n_tokens;
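llama_eval_internal now rejects a first batch whose leading token is not BOS. A small sketch of how a caller can satisfy that check by tokenizing with add_bos = true before the first eval; eval_prompt and its buffer size are illustrative, not part of the library:

#include <cstdio>
#include <vector>
#include "llama.h"

// ctx is assumed to be a context created with llama_init_from_file.
static bool eval_prompt(llama_context * ctx, const char * prompt, int n_threads) {
    std::vector<llama_token> tokens(512); // illustrative upper bound
    const int n = llama_tokenize(ctx, prompt, tokens.data(), (int) tokens.size(), /*add_bos*/ true);
    if (n < 0) {
        std::fprintf(stderr, "prompt does not fit in the token buffer\n");
        return false;
    }
    tokens.resize(n); // tokens[0] is now llama_token_bos()

    // n_past == 0: the very first batch, which is exactly the case the new check guards.
    return llama_eval(ctx, tokens.data(), (int) tokens.size(), /*n_past*/ 0, n_threads) == 0;
}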
@@ -1059,7 +1109,7 @@ static bool llama_eval_internal(
     const auto & model = lctx.model;
     const auto & hparams = model.hparams;
 
-    auto & kv_self = model.kv_self;
+    const auto & kv_self = model.kv_self;
 
     LLAMA_ASSERT(!!kv_self.ctx);
 
@@ -1112,8 +1162,8 @@ static bool llama_eval_internal(
         // self-attention
         {
             // compute Q and K and RoPE them
-            struct ggml_tensor * Qcur =
-            struct ggml_tensor * Kcur =
+            struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
+            struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
             ggml_set_name(Qcur, "Qcur");
             ggml_set_name(Kcur, "Kcur");
 
@@ -1154,17 +1204,19 @@ static bool llama_eval_internal(
             struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head));
             ggml_set_name(KQ_scale, "1/sqrt(n_embd/n_head)");
 
-
+            // KQ_scaled shape [n_past + N, N, n_head, 1]
+            struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
             ggml_set_name(KQ_scaled, "KQ_scaled");
 
             // KQ_masked = mask_past(KQ_scaled)
-            struct ggml_tensor * KQ_masked =
+            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
             ggml_set_name(KQ_masked, "KQ_masked");
 
             // KQ = soft_max(KQ_masked)
-            struct ggml_tensor * KQ_soft_max =
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
             ggml_set_name(KQ_soft_max, "KQ_soft_max");
 
+
             // split cached V into n_head heads
             struct ggml_tensor * V =
                 ggml_view_3d(ctx0, kv_self.v,
@@ -1265,7 +1317,7 @@ static bool llama_eval_internal(
     lctx.use_buf(ctx0, -1);
 
     // logits -> probs
-    //inpL =
+    //inpL = ggml_soft_max_inplace(ctx0, inpL);
 
     // run the computation
     ggml_build_forward_expand(&gf, inpL);
@@ -1303,7 +1355,7 @@ static bool llama_eval_internal(
     }
 
     // extract embeddings
-    if (lctx.embedding.
+    if (!lctx.embedding.empty()) {
         auto & embedding_out = lctx.embedding;
 
         embedding_out.resize(n_embd);
@@ -1354,6 +1406,8 @@ struct llama_sp_symbol {
     size_t n;
 };
 
+static_assert(std::is_trivially_copyable<llama_sp_symbol>::value, "llama_sp_symbol is not trivially copyable");
+
 struct llama_sp_bigram {
     struct comparator {
         bool operator()(llama_sp_bigram & l, llama_sp_bigram & r) {
@@ -1386,7 +1440,7 @@ struct llama_tokenizer {
             sym.prev = index - 1;
             sym.next = offs == text.size() ? -1 : index + 1;
             index++;
-            symbols_.emplace_back(
+            symbols_.emplace_back(sym);
         }
 
         // seed the work queue with all possible 2-character tokens.
@@ -1477,12 +1531,12 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
     llama_tokenizer tokenizer(vocab);
     std::vector<llama_vocab::id> output;
 
-    if (text.
+    if (text.empty()) {
         return output;
     }
 
     if (bos) {
-        output.push_back(
+        output.push_back(llama_token_bos());
     }
 
     tokenizer.tokenize(text, output);
@@ -1713,7 +1767,7 @@ void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_dat
     const int64_t t_start_sample_us = ggml_time_us();
 
     for (size_t i = 0; i < candidates->size; ++i) {
-        auto token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id);
+        const auto * token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id);
         if (token_iter == last_tokens + last_tokens_size) {
             continue;
         }
@@ -1791,7 +1845,7 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_
     float k = powf((epsilon_hat * powf(2, *mu)) / (1 - powf(N, -epsilon_hat)), 1 / s_hat);
 
     // Sample the next word X using top-k sampling
-    llama_sample_top_k(nullptr, candidates, int(k));
+    llama_sample_top_k(nullptr, candidates, int(k), 1);
     if (ctx) {
         ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
     }
@@ -1857,7 +1911,7 @@ llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_da
     const int64_t t_start_sample_us = ggml_time_us();
 
     // Find max element
-    auto max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
+    auto * max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
         return a.logit < b.logit;
     });
 
@@ -1900,7 +1954,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     switch (ftype) {
         case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
         case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
-        case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
         case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
         case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
         case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
@@ -1911,7 +1964,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         nthread = std::thread::hardware_concurrency();
     }
 
-    std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp
+    std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false,
                                                                             /*vocab_only*/ false));
     llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
 
@@ -1965,7 +2018,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         } else if (tensor.type == GGML_TYPE_F16) {
             f32_conv_buf.resize(nelements * sizeof(float));
             f32_data = (float *) f32_conv_buf.addr;
-            auto f16_data = (const ggml_fp16_t *) tensor.data;
+            const auto * f16_data = (const ggml_fp16_t *) tensor.data;
             for (size_t i = 0; i < nelements; i++) {
                 f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
             }
@@ -1996,21 +2049,31 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 size_t first = counter; counter += chunk_size;
                 if (first >= nelements) {
                     if (!local_hist.empty()) {
-                        for (int j=0; j<int(local_hist.size()); ++j)
+                        for (int j=0; j<int(local_hist.size()); ++j) {
+                            hist_cur[j] += local_hist[j];
+                        }
                         new_size += local_size;
                     }
                     break;
                 }
                 lock.unlock();
                 size_t last = std::min(nelements, first + chunk_size);
-                if (local_hist.empty())
+                if (local_hist.empty()) {
+                    local_hist.resize(hist_cur.size(), 0);
+                }
                 local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
             }
         };
-        if (int
-
+        if ((int) workers.size() < nthread_use - 1) {
+            workers.resize(nthread_use - 1);
+        }
+        for (int it = 0; it < nthread_use - 1; ++it) {
+            workers[it] = std::thread(compute);
+        }
         compute();
-        for (int it = 0; it < nthread_use - 1; ++it)
+        for (int it = 0; it < nthread_use - 1; ++it) {
+            workers[it].join();
+        }
     }
 
     printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
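Quantization work is handed out in fixed-size chunks to nthread_use - 1 worker threads plus the calling thread, which then joins the workers. A simplified, self-contained sketch of that pattern (it sums integers instead of calling ggml_quantize_chunk, and all names are made up):

#include <algorithm>
#include <cstdio>
#include <mutex>
#include <thread>
#include <vector>

int main() {
    const size_t nelements  = 1000000;
    const size_t chunk_size = 32 * 1024;
    const int    nthread    = (int) std::max(1u, std::thread::hardware_concurrency());

    std::vector<int> data(nelements, 1);
    std::mutex mtx;
    size_t counter = 0;  // next chunk start, guarded by mtx
    long long total = 0; // result, guarded by mtx

    auto compute = [&]() {
        long long local = 0;
        while (true) {
            size_t first;
            {
                std::lock_guard<std::mutex> lock(mtx);
                first = counter;
                counter += chunk_size;
            }
            if (first >= nelements) {
                std::lock_guard<std::mutex> lock(mtx);
                total += local;
                break;
            }
            const size_t last = std::min(nelements, first + chunk_size);
            for (size_t i = first; i < last; ++i) {
                local += data[i];
            }
        }
    };

    std::vector<std::thread> workers(nthread - 1);
    for (int it = 0; it < nthread - 1; ++it) {
        workers[it] = std::thread(compute);
    }
    compute(); // the calling thread takes chunks too
    for (int it = 0; it < nthread - 1; ++it) {
        workers[it].join();
    }

    std::printf("sum = %lld\n", total);
    return 0;
}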
@@ -2082,7 +2145,7 @@ struct llama_context * llama_init_from_file(
 
     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
 
-    if (!llama_model_load(path_model, *ctx, params.n_ctx, memory_type,
+    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_gpu_layers, memory_type,
                           params.use_mmap, params.use_mlock, params.vocab_only,
                           params.progress_callback, params.progress_callback_user_data)) {
         fprintf(stderr, "%s: failed to load model\n", __func__);
@@ -2208,7 +2271,8 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
         fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
         model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false));
 
-        size_t ctx_size
+        size_t ctx_size;
+        size_t mmapped_size;
         model_loader->calc_sizes(&ctx_size, &mmapped_size);
         base_buf.resize(ctx_size);
 
@@ -2247,8 +2311,12 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
             fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
         }
 
-        std::string name
-
+        std::string name;
+        {
+            char buf[1024];
+            fin.read(buf, length);
+            name = std::string(buf, length);
+        }
 
         // check for lora suffix and get the type of tensor
         const std::string lora_suffix = ".lora";
@@ -2263,7 +2331,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
             base_name.erase(pos);
             // fprintf(stderr, "%s: %s => %s (lora type %s) ", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());
 
-            if (model_tensors.find(base_name
+            if (model_tensors.find(base_name) == model_tensors.end()) {
                 fprintf(stderr, "%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
                 return 1;
             }
@@ -2343,7 +2411,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
 
             if (scaling != 1.0f) {
                 ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
-                BA =
+                BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor);
             }
 
             ggml_tensor * r;
@@ -2365,8 +2433,9 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
             lora_tensors.clear();
 
             n_tensors++;
-            if (n_tensors % 4 == 0)
+            if (n_tensors % 4 == 0) {
                 fprintf(stderr, ".");
+            }
         }
     }
 
@@ -2395,7 +2464,7 @@ int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
     return ctx->model.kv_self.n;
 }
 
-#define LLAMA_MAX_RNG_STATE 64*1024
+#define LLAMA_MAX_RNG_STATE (64*1024)
 
 void llama_set_rng_seed(struct llama_context * ctx, int seed) {
     if (seed < 0) {
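Parenthesizing the LLAMA_MAX_RNG_STATE body keeps the expansion from binding to neighbouring operators. A tiny illustration with made-up macro names:

#include <cstdio>

#define STATE_SIZE_UNPAREN 64*1024    // expands textually into the surrounding expression
#define STATE_SIZE_PAREN   (64*1024)  // expansion behaves as a single value

int main() {
    std::printf("%d\n", 1024 / STATE_SIZE_UNPAREN); // 1024 / 64 * 1024 == 16384
    std::printf("%d\n", 1024 / STATE_SIZE_PAREN);   // 1024 / (64 * 1024) == 0
    return 0;
}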
@@ -2436,8 +2505,8 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
 }
 
 // Copies the state to the specified destination address
-size_t llama_copy_state_data(struct llama_context * ctx, uint8_t *
-    uint8_t * out =
+size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
+    uint8_t * out = dst;
 
     // copy rng
     {
@@ -2497,7 +2566,9 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
 
         if (kv_size) {
             const size_t elt_size = ggml_element_size(kv_self.k);
+
             char buffer[4096];
+
             ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
             ggml_cgraph gf{};
             gf.n_threads = 1;
@@ -2521,10 +2592,12 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
             ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
             ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
             ggml_graph_compute(cpy_ctx, &gf);
+
+            ggml_free(cpy_ctx);
         }
     }
 
-    const size_t written = out -
+    const size_t written = out - dst;
     const size_t max_size = llama_get_state_size(ctx);
 
     LLAMA_ASSERT(written <= max_size);
@@ -2534,15 +2607,15 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
 
 // Sets the state reading from the specified source address
 size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
-    const uint8_t *
+    const uint8_t * inp = src;
 
     // set rng
     {
         size_t rng_size;
         char rng_buf[LLAMA_MAX_RNG_STATE];
 
-        memcpy(&rng_size,
-        memcpy(&rng_buf[0],
+        memcpy(&rng_size, inp, sizeof(rng_size)); inp += sizeof(rng_size);
+        memcpy(&rng_buf[0], inp, LLAMA_MAX_RNG_STATE); inp += LLAMA_MAX_RNG_STATE;
 
         std::stringstream rng_ss;
         rng_ss.str(std::string(&rng_buf[0], rng_size));
@@ -2556,30 +2629,30 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
         size_t logits_cap;
         size_t logits_size;
 
-        memcpy(&logits_cap,
-        memcpy(&logits_size,
+        memcpy(&logits_cap, inp, sizeof(logits_cap)); inp += sizeof(logits_cap);
+        memcpy(&logits_size, inp, sizeof(logits_size)); inp += sizeof(logits_size);
 
         LLAMA_ASSERT(ctx->logits.capacity() == logits_cap);
 
         if (logits_size) {
             ctx->logits.resize(logits_size);
-            memcpy(ctx->logits.data(),
+            memcpy(ctx->logits.data(), inp, logits_size * sizeof(float));
         }
 
-
+        inp += logits_cap * sizeof(float);
     }
 
     // set embeddings
     {
         size_t embedding_size;
 
-        memcpy(&embedding_size,
+        memcpy(&embedding_size, inp, sizeof(embedding_size)); inp += sizeof(embedding_size);
 
         LLAMA_ASSERT(ctx->embedding.capacity() == embedding_size);
 
         if (embedding_size) {
-            memcpy(ctx->embedding.data(),
-
+            memcpy(ctx->embedding.data(), inp, embedding_size * sizeof(float));
+            inp += embedding_size * sizeof(float);
         }
     }
 
@@ -2594,25 +2667,27 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
         size_t kv_size;
         int kv_ntok;
 
-        memcpy(&kv_size,
-        memcpy(&kv_ntok,
+        memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
+        memcpy(&kv_ntok, inp, sizeof(kv_ntok)); inp += sizeof(kv_ntok);
 
         if (kv_size) {
             LLAMA_ASSERT(kv_self.buf.size == kv_size);
 
             const size_t elt_size = ggml_element_size(kv_self.k);
+
             char buffer[4096];
+
             ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
             ggml_cgraph gf{};
             gf.n_threads = 1;
 
             ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
-            kin3d->data = (void *)
-
+            kin3d->data = (void *) inp;
+            inp += ggml_nbytes(kin3d);
 
             ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
-            vin3d->data = (void *)
-
+            vin3d->data = (void *) inp;
+            inp += ggml_nbytes(vin3d);
 
             ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
                 n_embd, kv_ntok, n_layer,
@@ -2625,12 +2700,14 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
             ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
             ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
             ggml_graph_compute(cpy_ctx, &gf);
+
+            ggml_free(cpy_ctx);
         }
 
         ctx->model.kv_self.n = kv_ntok;
     }
 
-    const size_t nread =
+    const size_t nread = inp - src;
     const size_t max_size = llama_get_state_size(ctx);
 
     LLAMA_ASSERT(nread <= max_size);
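llama_copy_state_data and llama_set_state_data now walk an explicit dst/inp cursor over the serialized rng, logits, embedding, and KV-cache sections. A hedged round-trip sketch using the public API as declared in this version's llama.h; save_state and restore_state are illustrative helpers, not library functions:

#include <cstdint>
#include <vector>
#include "llama.h"

// Snapshot the full context state (rng, logits, embedding, KV cache) so a
// generation can be rewound later without re-evaluating the prompt.
static std::vector<uint8_t> save_state(llama_context * ctx) {
    std::vector<uint8_t> buf(llama_get_state_size(ctx));
    const size_t written = llama_copy_state_data(ctx, buf.data());
    buf.resize(written); // written is asserted to be <= llama_get_state_size(ctx)
    return buf;
}

static void restore_state(llama_context * ctx, const std::vector<uint8_t> & buf) {
    llama_set_state_data(ctx, buf.data());
}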
@@ -2646,7 +2723,7 @@ bool llama_load_session_file(struct llama_context * ctx, const char * path_sessi
         const uint32_t magic = file.read_u32();
         const uint32_t version = file.read_u32();
 
-        if (
+        if (magic != LLAMA_SESSION_MAGIC || version != LLAMA_SESSION_VERSION) {
             fprintf(stderr, "%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
             return false;
         }
@@ -2727,11 +2804,14 @@ int llama_eval(
         fprintf(stderr, "%s: failed to eval\n", __func__);
         return 1;
     }
+
     // get a more accurate load time, upon first eval
+    // TODO: fix this
     if (!ctx->has_evaluated_once) {
         ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
         ctx->has_evaluated_once = true;
     }
+
     return 0;
 }
 
@@ -2805,9 +2885,9 @@ void llama_print_timings(struct llama_context * ctx) {
 
     fprintf(stderr, "\n");
     fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
-    fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per
+    fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample);
     fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval);
-    fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per
+    fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval);
     fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0);
 }
 