llama_cpp 0.1.0 → 0.1.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/ext/llama_cpp/llama_cpp.cpp +93 -15
- data/ext/llama_cpp/src/ggml-cuda.h +2 -0
- data/ext/llama_cpp/src/ggml-opencl.c +85 -122
- data/ext/llama_cpp/src/ggml.c +6268 -4208
- data/ext/llama_cpp/src/ggml.h +205 -12
- data/ext/llama_cpp/src/llama.cpp +159 -79
- data/ext/llama_cpp/src/llama.h +10 -10
- data/lib/llama_cpp/client.rb +1 -3
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +3 -4
- metadata +2 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
```diff
@@ -9,6 +9,9 @@
 #include "llama.h"
 
 #include "ggml.h"
+#ifdef GGML_USE_CUBLAS
+#include "ggml-cuda.h"
+#endif
 
 #include <array>
 #include <ctime>
@@ -50,49 +53,49 @@ static const size_t MB = 1024*1024;
 
 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
 {
-    static std::map<e_model, size_t>
+    static std::map<e_model, size_t> k_sizes = {
         { MODEL_7B, 512ull * MB },
         { MODEL_13B, 512ull * MB },
         { MODEL_30B, 512ull * MB },
         { MODEL_65B, 1024ull * MB },
     };
-    return
+    return k_sizes;
 }
 
 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
 {
-    static std::map<e_model, size_t>
+    static std::map<e_model, size_t> k_sizes = {
         { MODEL_7B, 512ull * MB },
         { MODEL_13B, 512ull * MB },
         { MODEL_30B, 512ull * MB },
         { MODEL_65B, 1024ull * MB },
     };
-    return
+    return k_sizes;
 }
 
 // 2*n_embd*n_ctx*n_layer*sizeof(float16)
 static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
 {
-    static std::map<e_model, size_t>
+    static std::map<e_model, size_t> k_sizes = {
         { MODEL_7B, 1026ull * MB },
         { MODEL_13B, 1608ull * MB },
         { MODEL_30B, 3124ull * MB },
         { MODEL_65B, 5120ull * MB },
     };
-    return
+    return k_sizes;
 }
 
 // this is mostly needed for temporary mul_mat buffers to dequantize the data
 // not actually needed if BLAS is disabled
 static const std::map<e_model, size_t> & MEM_REQ_EVAL()
 {
-    static std::map<e_model, size_t>
+    static std::map<e_model, size_t> k_sizes = {
         { MODEL_7B, 768ull * MB },
         { MODEL_13B, 1024ull * MB },
         { MODEL_30B, 1280ull * MB },
         { MODEL_65B, 1536ull * MB },
     };
-    return
+    return k_sizes;
 }
 
 // default hparams (LLaMA 7B)
@@ -402,6 +405,7 @@ enum llama_file_version {
     LLAMA_FILE_VERSION_GGML,
     LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab
     LLAMA_FILE_VERSION_GGJT_V1, // added padding
+    LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format
 };
 
 struct llama_file_loader {
@@ -432,6 +436,8 @@ struct llama_file_loader {
             file_version = LLAMA_FILE_VERSION_GGMF_V1;
         } else if (magic == 'ggjt' && version == 1) {
             file_version = LLAMA_FILE_VERSION_GGJT_V1;
+        } else if (magic == 'ggjt' && version == 2) {
+            file_version = LLAMA_FILE_VERSION_GGJT_V2;
         } else {
             throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
                          magic, version);
@@ -482,7 +488,6 @@ struct llama_file_loader {
             case GGML_TYPE_F16:
             case GGML_TYPE_Q4_0:
             case GGML_TYPE_Q4_1:
-            case GGML_TYPE_Q4_2:
             case GGML_TYPE_Q5_0:
             case GGML_TYPE_Q5_1:
             case GGML_TYPE_Q8_0:
@@ -527,8 +532,8 @@ struct llama_file_saver {
         write_vocab();
     }
     void write_magic() {
-        file.write_u32(
-        file.write_u32(
+        file.write_u32(LLAMA_FILE_MAGIC); // magic
+        file.write_u32(LLAMA_FILE_VERSION); // version
    }
    void write_hparams(enum llama_ftype new_ftype) {
        const llama_hparams & hparams = any_file_loader->hparams;
@@ -558,7 +563,6 @@ struct llama_file_saver {
             case GGML_TYPE_F16:
             case GGML_TYPE_Q4_0:
             case GGML_TYPE_Q4_1:
-            case GGML_TYPE_Q4_2:
             case GGML_TYPE_Q5_0:
             case GGML_TYPE_Q5_1:
             case GGML_TYPE_Q8_0:
@@ -585,12 +589,12 @@ struct llama_model_loader {
     std::unique_ptr<llama_mmap> mapping;
 
     llama_model_loader(const std::string & fname_base, bool use_mmap, bool vocab_only) {
-        auto first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
+        auto * first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
         file_loaders.emplace_back(first_file);
         uint32_t n_parts = vocab_only ? 1 : guess_n_parts();
         for (uint32_t i = 1; i < n_parts; i++) {
             std::string fname = fname_base + "." + std::to_string(i);
-            auto ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
+            auto * ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
             file_loaders.emplace_back(ith_file);
             if (ith_file->hparams != first_file->hparams) {
                 throw format("llama.cpp: hparams inconsistent between files");
@@ -637,7 +641,7 @@ struct llama_model_loader {
         }
     }
 
-    struct ggml_tensor * get_tensor(const std::string & name, std::vector<uint32_t> ne) {
+    struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne) {
         auto it = tensors_map.name_to_idx.find(name);
         if (it == tensors_map.name_to_idx.end()) {
             throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
@@ -666,7 +670,7 @@ struct llama_model_loader {
         return tensor;
     }
 
-    void done_getting_tensors() {
+    void done_getting_tensors() const {
         if (num_ggml_tensors_created != tensors_map.tensors.size()) {
             throw std::string("llama.cpp: file contained more tensors than expected");
         }
@@ -808,9 +812,9 @@ static bool kv_cache_init(
 struct llama_context_params llama_context_default_params() {
     struct llama_context_params result = {
         /*.n_ctx =*/ 512,
-        /*.
+        /*.gpu_layers =*/ 0,
         /*.seed =*/ -1,
-        /*.f16_kv =*/
+        /*.f16_kv =*/ true,
         /*.logits_all =*/ false,
         /*.vocab_only =*/ false,
         /*.use_mmap =*/ true,
```
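The two hunks above thread a new `n_gpu_layers` argument into model loading and add a matching `gpu_layers` default to `llama_context_default_params()`. A minimal sketch (not from the gem) of how a C++ caller might opt into GPU offloading, assuming the public `llama.h` field is named `n_gpu_layers`, matching the loader signatures in this diff; the model path is illustrative:

```cpp
// Sketch: request GPU offload of the first 32 layers via the new context parameter.
#include "llama.h"
#include <cstdio>

int main() {
    llama_context_params params = llama_context_default_params();
    params.n_ctx        = 512;
    params.n_gpu_layers = 32;   // only takes effect when built with GGML_USE_CUBLAS

    llama_context * ctx = llama_init_from_file("models/7B/ggml-model-q4_0.bin", params);
    if (ctx == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }
    llama_free(ctx);
    return 0;
}
```

When the library is built without cuBLAS, the value is simply ignored, matching the `(void) n_gpu_layers;` fallback later in this diff.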
```diff
@@ -839,9 +843,11 @@ static const char *llama_file_version_name(llama_file_version version) {
     switch (version) {
         case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)";
         case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
-        case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (
-
+        case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1405)";
+        case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (latest)";
     }
+
+    return "unknown";
 }
@@ -852,7 +858,6 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
         case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
             return "mostly Q4_1, some F16";
-        case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2";
         case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
         case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
         case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
@@ -874,6 +879,7 @@ static void llama_model_load_internal(
         const std::string & fname,
         llama_context & lctx,
         int n_ctx,
+        int n_gpu_layers,
         ggml_type memory_type,
         bool use_mmap,
         bool use_mlock,
@@ -918,15 +924,24 @@ static void llama_model_load_internal(
         fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
     }
 
+    if (file_version != LLAMA_FILE_VERSION_GGJT_V2) {
+        if (hparams.ftype != LLAMA_FTYPE_ALL_F32 &&
+            hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 &&
+            hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) {
+            throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1305)");
+        }
+    }
+
     if (vocab_only) {
         return;
     }
 
     auto & ctx = model.ctx;
 
-    size_t ctx_size
+    size_t ctx_size;
+    size_t mmapped_size;
     ml->calc_sizes(&ctx_size, &mmapped_size);
-    fprintf(stderr, "%s: ggml ctx size = %6.2f
+    fprintf(stderr, "%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/1024.0/1024.0);
 
     // print memory requirements
     {
@@ -970,8 +985,6 @@ static void llama_model_load_internal(
 
     // prepare memory for the weights
     {
-        const auto & hparams = model.hparams;
-
         const uint32_t n_embd = hparams.n_embd;
         const uint32_t n_layer = hparams.n_layer;
         const uint32_t n_vocab = hparams.n_vocab;
@@ -1013,6 +1026,35 @@ static void llama_model_load_internal(
     ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
 
     model.mapping = std::move(ml->mapping);
+#ifdef GGML_USE_CUBLAS
+    {
+        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
+
+        fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
+
+        size_t vram_total = 0;
+
+        for (int i = 0; i < n_gpu; ++i) {
+            const auto & layer = model.layers[i];
+
+            ggml_cuda_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
+            ggml_cuda_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
+            ggml_cuda_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
+            ggml_cuda_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
+            ggml_cuda_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
+            ggml_cuda_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
+            ggml_cuda_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
+        }
+        if (n_gpu_layers > (int) hparams.n_layer) {
+            fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
+            ggml_cuda_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
+        }
+
+        fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+    }
+#else
+    (void) n_gpu_layers;
+#endif
 
     // loading time will be recalculate after the first eval, so
     // we take page faults deferred by mmap() into consideration
@@ -1023,6 +1065,7 @@ static bool llama_model_load(
         const std::string & fname,
         llama_context & lctx,
         int n_ctx,
+        int n_gpu_layers,
         ggml_type memory_type,
         bool use_mmap,
         bool use_mlock,
@@ -1030,7 +1073,7 @@ static bool llama_model_load(
         llama_progress_callback progress_callback,
         void *progress_callback_user_data) {
     try {
-        llama_model_load_internal(fname, lctx, n_ctx, memory_type, use_mmap, use_mlock,
+        llama_model_load_internal(fname, lctx, n_ctx, n_gpu_layers, memory_type, use_mmap, use_mlock,
                                   vocab_only, progress_callback, progress_callback_user_data);
         return true;
     } catch (const std::string & err) {
@@ -1052,6 +1095,13 @@ static bool llama_eval_internal(
         const int n_tokens,
         const int n_past,
         const int n_threads) {
+
+    // enforce that the first token is BOS
+    if (n_past == 0 && tokens[0] != llama_token_bos()) {
+        fprintf(stderr, "%s: first token must be BOS\n", __func__);
+        return false;
+    }
+
     const int64_t t_start_us = ggml_time_us();
 
     const int N = n_tokens;
```
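`llama_eval_internal` now rejects a first batch whose leading token is not BOS. A minimal sketch (not part of the diff) of a caller that tokenizes with `add_bos = true` so the new check passes; the 512-token buffer size is an illustrative assumption:

```cpp
// Sketch: tokenize a prompt with a leading BOS token, then evaluate it.
#include "llama.h"
#include <vector>

bool eval_prompt(llama_context * ctx, const char * prompt, int n_threads) {
    std::vector<llama_token> tokens(512);
    // add_bos = true prepends llama_token_bos(), which satisfies the new check at n_past == 0
    const int n = llama_tokenize(ctx, prompt, tokens.data(), (int) tokens.size(), /*add_bos*/ true);
    if (n < 0) {
        return false; // buffer too small
    }
    tokens.resize(n);
    return llama_eval(ctx, tokens.data(), (int) tokens.size(), /*n_past*/ 0, n_threads) == 0;
}
```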
```diff
@@ -1059,7 +1109,7 @@ static bool llama_eval_internal(
     const auto & model = lctx.model;
     const auto & hparams = model.hparams;
 
-    auto & kv_self = model.kv_self;
+    const auto & kv_self = model.kv_self;
 
     LLAMA_ASSERT(!!kv_self.ctx);
 
@@ -1112,8 +1162,8 @@ static bool llama_eval_internal(
         // self-attention
         {
             // compute Q and K and RoPE them
-            struct ggml_tensor * Qcur =
-            struct ggml_tensor * Kcur =
+            struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
+            struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
             ggml_set_name(Qcur, "Qcur");
             ggml_set_name(Kcur, "Kcur");
 
@@ -1154,17 +1204,19 @@ static bool llama_eval_internal(
             struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head));
             ggml_set_name(KQ_scale, "1/sqrt(n_embd/n_head)");
 
-
+            // KQ_scaled shape [n_past + N, N, n_head, 1]
+            struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
             ggml_set_name(KQ_scaled, "KQ_scaled");
 
             // KQ_masked = mask_past(KQ_scaled)
-            struct ggml_tensor * KQ_masked =
+            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
             ggml_set_name(KQ_masked, "KQ_masked");
 
             // KQ = soft_max(KQ_masked)
-            struct ggml_tensor * KQ_soft_max =
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
             ggml_set_name(KQ_soft_max, "KQ_soft_max");
 
+
             // split cached V into n_head heads
             struct ggml_tensor * V =
                 ggml_view_3d(ctx0, kv_self.v,
@@ -1265,7 +1317,7 @@ static bool llama_eval_internal(
     lctx.use_buf(ctx0, -1);
 
     // logits -> probs
-    //inpL =
+    //inpL = ggml_soft_max_inplace(ctx0, inpL);
 
     // run the computation
     ggml_build_forward_expand(&gf, inpL);
@@ -1303,7 +1355,7 @@ static bool llama_eval_internal(
     }
 
     // extract embeddings
-    if (lctx.embedding.
+    if (!lctx.embedding.empty()) {
         auto & embedding_out = lctx.embedding;
 
         embedding_out.resize(n_embd);
@@ -1354,6 +1406,8 @@ struct llama_sp_symbol {
     size_t n;
 };
 
+static_assert(std::is_trivially_copyable<llama_sp_symbol>::value, "llama_sp_symbol is not trivially copyable");
+
 struct llama_sp_bigram {
     struct comparator {
         bool operator()(llama_sp_bigram & l, llama_sp_bigram & r) {
@@ -1386,7 +1440,7 @@ struct llama_tokenizer {
             sym.prev = index - 1;
             sym.next = offs == text.size() ? -1 : index + 1;
             index++;
-            symbols_.emplace_back(
+            symbols_.emplace_back(sym);
         }
 
         // seed the work queue with all possible 2-character tokens.
@@ -1477,12 +1531,12 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
     llama_tokenizer tokenizer(vocab);
     std::vector<llama_vocab::id> output;
 
-    if (text.
+    if (text.empty()) {
         return output;
     }
 
     if (bos) {
-        output.push_back(
+        output.push_back(llama_token_bos());
     }
 
     tokenizer.tokenize(text, output);
@@ -1713,7 +1767,7 @@ void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_dat
     const int64_t t_start_sample_us = ggml_time_us();
 
     for (size_t i = 0; i < candidates->size; ++i) {
-        auto token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id);
+        const auto * token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id);
        if (token_iter == last_tokens + last_tokens_size) {
            continue;
        }
@@ -1791,7 +1845,7 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_
     float k = powf((epsilon_hat * powf(2, *mu)) / (1 - powf(N, -epsilon_hat)), 1 / s_hat);
 
     // Sample the next word X using top-k sampling
-    llama_sample_top_k(nullptr, candidates, int(k));
+    llama_sample_top_k(nullptr, candidates, int(k), 1);
     if (ctx) {
         ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
     }
```
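The mirostat hunk passes an extra `1` to `llama_sample_top_k`, which suggests the sampler now takes a minimum-keep argument. A hedged sketch of a plain top-k / temperature sampling helper under that assumption; the other `llama_sample_*`, `llama_n_vocab`, and `llama_get_logits` calls are taken from the public llama.h of this period, not from this diff:

```cpp
// Sketch: build a candidate array from the last logits and sample one token.
#include "llama.h"
#include <vector>

llama_token sample_top_k(llama_context * ctx, int top_k, float temp) {
    const int n_vocab = llama_n_vocab(ctx);
    float * logits = llama_get_logits(ctx);

    std::vector<llama_token_data> candidates;
    candidates.reserve(n_vocab);
    for (llama_token id = 0; id < n_vocab; ++id) {
        candidates.push_back(llama_token_data{ id, logits[id], 0.0f });
    }
    llama_token_data_array arr = { candidates.data(), candidates.size(), false };

    llama_sample_top_k(ctx, &arr, top_k, /*min_keep*/ 1); // keep at least one candidate
    llama_sample_temperature(ctx, &arr, temp);
    return llama_sample_token(ctx, &arr);
}
```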
```diff
@@ -1857,7 +1911,7 @@ llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_da
     const int64_t t_start_sample_us = ggml_time_us();
 
     // Find max element
-    auto max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
+    auto * max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
         return a.logit < b.logit;
     });
 
@@ -1900,7 +1954,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     switch (ftype) {
         case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
         case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
-        case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
         case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
         case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
         case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
@@ -1911,7 +1964,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         nthread = std::thread::hardware_concurrency();
     }
 
-    std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp
+    std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false,
                                                                             /*vocab_only*/ false));
     llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
 
@@ -1965,7 +2018,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         } else if (tensor.type == GGML_TYPE_F16) {
             f32_conv_buf.resize(nelements * sizeof(float));
             f32_data = (float *) f32_conv_buf.addr;
-            auto f16_data = (const ggml_fp16_t *) tensor.data;
+            const auto * f16_data = (const ggml_fp16_t *) tensor.data;
             for (size_t i = 0; i < nelements; i++) {
                 f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
             }
@@ -1996,21 +2049,31 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                     size_t first = counter; counter += chunk_size;
                     if (first >= nelements) {
                         if (!local_hist.empty()) {
-                            for (int j=0; j<int(local_hist.size()); ++j)
+                            for (int j=0; j<int(local_hist.size()); ++j) {
+                                hist_cur[j] += local_hist[j];
+                            }
                             new_size += local_size;
                         }
                         break;
                     }
                     lock.unlock();
                     size_t last = std::min(nelements, first + chunk_size);
-                    if (local_hist.empty())
+                    if (local_hist.empty()) {
+                        local_hist.resize(hist_cur.size(), 0);
+                    }
                     local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
                 }
             };
-            if (int
-
+            if ((int) workers.size() < nthread_use - 1) {
+                workers.resize(nthread_use - 1);
+            }
+            for (int it = 0; it < nthread_use - 1; ++it) {
+                workers[it] = std::thread(compute);
+            }
             compute();
-            for (int it = 0; it < nthread_use - 1; ++it)
+            for (int it = 0; it < nthread_use - 1; ++it) {
+                workers[it].join();
+            }
         }
 
         printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
```
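The quantization hunk above fills in the worker-thread bodies: each thread grabs `chunk_size` elements from a shared counter under a mutex, processes them, and folds its local results back in before exiting. A standalone sketch of the same work-splitting pattern, with placeholder per-element work instead of `ggml_quantize_chunk` (not gem code):

```cpp
// Sketch: N-1 worker threads plus the calling thread pull fixed-size chunks
// from a shared counter until the data is exhausted, then everyone is joined.
#include <algorithm>
#include <cstddef>
#include <mutex>
#include <thread>
#include <vector>

void process_chunked(std::vector<float> & data, int nthread) {
    const size_t chunk_size = 32 * 512;
    size_t counter = 0;
    std::mutex mutex;

    auto compute = [&]() {
        while (true) {
            size_t first;
            {
                std::lock_guard<std::mutex> lock(mutex);
                first = counter;
                counter += chunk_size;
            }
            if (first >= data.size()) {
                break;
            }
            const size_t last = std::min(data.size(), first + chunk_size);
            for (size_t i = first; i < last; ++i) {
                data[i] *= 0.5f; // placeholder per-element work
            }
        }
    };

    std::vector<std::thread> workers;
    for (int it = 0; it < nthread - 1; ++it) {
        workers.emplace_back(compute);
    }
    compute(); // the calling thread participates as well
    for (auto & w : workers) {
        w.join();
    }
}
```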
```diff
@@ -2082,7 +2145,7 @@ struct llama_context * llama_init_from_file(
 
     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
 
-    if (!llama_model_load(path_model, *ctx, params.n_ctx, memory_type,
+    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_gpu_layers, memory_type,
                           params.use_mmap, params.use_mlock, params.vocab_only,
                           params.progress_callback, params.progress_callback_user_data)) {
         fprintf(stderr, "%s: failed to load model\n", __func__);
@@ -2208,7 +2271,8 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
         fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
         model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false));
 
-        size_t ctx_size
+        size_t ctx_size;
+        size_t mmapped_size;
         model_loader->calc_sizes(&ctx_size, &mmapped_size);
         base_buf.resize(ctx_size);
 
@@ -2247,8 +2311,12 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
             fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
         }
 
-        std::string name
-
+        std::string name;
+        {
+            char buf[1024];
+            fin.read(buf, length);
+            name = std::string(buf, length);
+        }
 
         // check for lora suffix and get the type of tensor
         const std::string lora_suffix = ".lora";
@@ -2263,7 +2331,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
         base_name.erase(pos);
         // fprintf(stderr, "%s: %s => %s (lora type %s) ", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());
 
-        if (model_tensors.find(base_name
+        if (model_tensors.find(base_name) == model_tensors.end()) {
             fprintf(stderr, "%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
             return 1;
         }
@@ -2343,7 +2411,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
 
         if (scaling != 1.0f) {
             ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
-            BA =
+            BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor);
         }
 
         ggml_tensor * r;
@@ -2365,8 +2433,9 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
             lora_tensors.clear();
 
             n_tensors++;
-            if (n_tensors % 4 == 0)
+            if (n_tensors % 4 == 0) {
                 fprintf(stderr, ".");
+            }
         }
     }
 
@@ -2395,7 +2464,7 @@ int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
     return ctx->model.kv_self.n;
 }
 
-#define LLAMA_MAX_RNG_STATE 64*1024
+#define LLAMA_MAX_RNG_STATE (64*1024)
 
 void llama_set_rng_seed(struct llama_context * ctx, int seed) {
     if (seed < 0) {
@@ -2436,8 +2505,8 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
 }
 
 // Copies the state to the specified destination address
-size_t llama_copy_state_data(struct llama_context * ctx, uint8_t *
-    uint8_t * out =
+size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
+    uint8_t * out = dst;
 
     // copy rng
     {
@@ -2497,7 +2566,9 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
 
         if (kv_size) {
             const size_t elt_size = ggml_element_size(kv_self.k);
+
             char buffer[4096];
+
             ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
             ggml_cgraph gf{};
             gf.n_threads = 1;
@@ -2521,10 +2592,12 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
             ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
             ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
             ggml_graph_compute(cpy_ctx, &gf);
+
+            ggml_free(cpy_ctx);
         }
     }
 
-    const size_t written = out -
+    const size_t written = out - dst;
     const size_t max_size = llama_get_state_size(ctx);
 
     LLAMA_ASSERT(written <= max_size);
@@ -2534,15 +2607,15 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
 
 // Sets the state reading from the specified source address
 size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
-    const uint8_t *
+    const uint8_t * inp = src;
 
     // set rng
     {
         size_t rng_size;
         char rng_buf[LLAMA_MAX_RNG_STATE];
 
-        memcpy(&rng_size,
-        memcpy(&rng_buf[0],
+        memcpy(&rng_size, inp, sizeof(rng_size)); inp += sizeof(rng_size);
+        memcpy(&rng_buf[0], inp, LLAMA_MAX_RNG_STATE); inp += LLAMA_MAX_RNG_STATE;
 
         std::stringstream rng_ss;
         rng_ss.str(std::string(&rng_buf[0], rng_size));
@@ -2556,30 +2629,30 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
         size_t logits_cap;
         size_t logits_size;
 
-        memcpy(&logits_cap,
-        memcpy(&logits_size,
+        memcpy(&logits_cap, inp, sizeof(logits_cap)); inp += sizeof(logits_cap);
+        memcpy(&logits_size, inp, sizeof(logits_size)); inp += sizeof(logits_size);
 
         LLAMA_ASSERT(ctx->logits.capacity() == logits_cap);
 
         if (logits_size) {
             ctx->logits.resize(logits_size);
-            memcpy(ctx->logits.data(),
+            memcpy(ctx->logits.data(), inp, logits_size * sizeof(float));
         }
 
-
+        inp += logits_cap * sizeof(float);
     }
 
     // set embeddings
     {
         size_t embedding_size;
 
-        memcpy(&embedding_size,
+        memcpy(&embedding_size, inp, sizeof(embedding_size)); inp += sizeof(embedding_size);
 
         LLAMA_ASSERT(ctx->embedding.capacity() == embedding_size);
 
         if (embedding_size) {
-            memcpy(ctx->embedding.data(),
-
+            memcpy(ctx->embedding.data(), inp, embedding_size * sizeof(float));
+            inp += embedding_size * sizeof(float);
         }
     }
 
@@ -2594,25 +2667,27 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
         size_t kv_size;
         int kv_ntok;
 
-        memcpy(&kv_size,
-        memcpy(&kv_ntok,
+        memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
+        memcpy(&kv_ntok, inp, sizeof(kv_ntok)); inp += sizeof(kv_ntok);
 
         if (kv_size) {
             LLAMA_ASSERT(kv_self.buf.size == kv_size);
 
             const size_t elt_size = ggml_element_size(kv_self.k);
+
             char buffer[4096];
+
             ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
             ggml_cgraph gf{};
             gf.n_threads = 1;
 
             ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
-            kin3d->data = (void *)
-
+            kin3d->data = (void *) inp;
+            inp += ggml_nbytes(kin3d);
 
             ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
-            vin3d->data = (void *)
-
+            vin3d->data = (void *) inp;
+            inp += ggml_nbytes(vin3d);
 
             ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
                 n_embd, kv_ntok, n_layer,
@@ -2625,12 +2700,14 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
             ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
             ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
             ggml_graph_compute(cpy_ctx, &gf);
+
+            ggml_free(cpy_ctx);
         }
 
         ctx->model.kv_self.n = kv_ntok;
     }
 
-    const size_t nread =
+    const size_t nread = inp - src;
     const size_t max_size = llama_get_state_size(ctx);
 
     LLAMA_ASSERT(nread <= max_size);
```
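The state hunks rename the raw pointers to `dst`/`inp` and free the temporary ggml context after the copy graphs run, but the public surface remains `llama_get_state_size`, `llama_copy_state_data`, and `llama_set_state_data`, all visible in this diff. A minimal snapshot/restore sketch built only on those three calls (not gem code):

```cpp
// Sketch: snapshot a context into a byte buffer and restore it later,
// e.g. to rewind generation to an earlier point.
#include "llama.h"
#include <cstdint>
#include <vector>

std::vector<uint8_t> save_state(llama_context * ctx) {
    std::vector<uint8_t> buf(llama_get_state_size(ctx));
    const size_t written = llama_copy_state_data(ctx, buf.data());
    buf.resize(written); // written <= llama_get_state_size(ctx)
    return buf;
}

void load_state(llama_context * ctx, const std::vector<uint8_t> & buf) {
    llama_set_state_data(ctx, buf.data());
}
```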
```diff
@@ -2646,7 +2723,7 @@ bool llama_load_session_file(struct llama_context * ctx, const char * path_sessi
     const uint32_t magic = file.read_u32();
     const uint32_t version = file.read_u32();
 
-    if (
+    if (magic != LLAMA_SESSION_MAGIC || version != LLAMA_SESSION_VERSION) {
         fprintf(stderr, "%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
         return false;
     }
```
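The session hunk folds the magic and version checks into a single condition; on a mismatch, `llama_load_session_file` reports the pair and returns `false`. A hedged usage sketch, assuming the session-file prototype of this llama.cpp vintage (`llama_load_session_file` taking a token buffer, its capacity, and an out-count):

```cpp
// Sketch: try to reuse a cached session; fall back to a cold start on failure.
#include "llama.h"
#include <cstdio>
#include <vector>

std::vector<llama_token> try_load_session(llama_context * ctx, const char * path, size_t capacity) {
    std::vector<llama_token> tokens(capacity);
    size_t n_loaded = 0;
    if (!llama_load_session_file(ctx, path, tokens.data(), tokens.size(), &n_loaded)) {
        fprintf(stderr, "no usable session at %s, starting fresh\n", path);
        return {};
    }
    tokens.resize(n_loaded);
    return tokens;
}
```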
```diff
@@ -2727,11 +2804,14 @@ int llama_eval(
         fprintf(stderr, "%s: failed to eval\n", __func__);
         return 1;
     }
+
     // get a more accurate load time, upon first eval
+    // TODO: fix this
     if (!ctx->has_evaluated_once) {
         ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
         ctx->has_evaluated_once = true;
     }
+
     return 0;
 }
 
@@ -2805,9 +2885,9 @@ void llama_print_timings(struct llama_context * ctx) {
 
     fprintf(stderr, "\n");
     fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
-    fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per
+    fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample);
     fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval);
-    fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per
+    fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval);
     fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0);
 }
 
```