llama_cpp 0.1.3 → 0.2.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +39 -8
- data/ext/llama_cpp/extconf.rb +26 -1
- data/ext/llama_cpp/llama_cpp.cpp +210 -13
- data/ext/llama_cpp/src/ggml-cuda.cu +1916 -0
- data/ext/llama_cpp/src/ggml-cuda.h +15 -2
- data/ext/llama_cpp/src/ggml-metal.h +63 -0
- data/ext/llama_cpp/src/ggml-metal.m +783 -0
- data/ext/llama_cpp/src/ggml-metal.metal +1133 -0
- data/ext/llama_cpp/src/ggml-opencl.cpp +242 -52
- data/ext/llama_cpp/src/ggml-opencl.h +4 -0
- data/ext/llama_cpp/src/ggml.c +835 -82
- data/ext/llama_cpp/src/ggml.h +64 -8
- data/ext/llama_cpp/src/k_quants.c +2244 -0
- data/ext/llama_cpp/src/k_quants.h +122 -0
- data/ext/llama_cpp/src/llama-util.h +16 -0
- data/ext/llama_cpp/src/llama.cpp +489 -134
- data/ext/llama_cpp/src/llama.h +43 -7
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +33 -1
- metadata +8 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -16,6 +16,10 @@
 #include "ggml-opencl.h"
 #endif
 
+#ifdef GGML_USE_METAL
+#include "ggml-metal.h"
+#endif
+
 #include <array>
 #include <ctime>
 #include <cinttypes>
@@ -42,22 +46,29 @@
 // available llama models
 enum e_model {
     MODEL_UNKNOWN,
+    MODEL_3B,
     MODEL_7B,
     MODEL_13B,
     MODEL_30B,
     MODEL_65B,
 };
 
-
 static const size_t MB = 1024*1024;
 
 // computed for n_ctx == 2048
 // TODO: dynamically determine these sizes
 // needs modifications in ggml
 
+typedef void (*offload_func_t)(struct ggml_tensor * tensor);
+
+void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
+    (void) tensor;
+}
+
 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
 {
     static std::map<e_model, size_t> k_sizes = {
+        { MODEL_3B,  256ull * MB },
         { MODEL_7B,  512ull * MB },
         { MODEL_13B, 512ull * MB },
         { MODEL_30B, 512ull * MB },
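Editor's note: the `offload_func_t` typedef and `llama_nop` helper added above are the hook the graph-building code uses later in this diff to mark per-layer tensors for GPU offload (on CUDA builds the hook becomes `ggml_cuda_assign_buffers`). The following is a minimal, self-contained sketch of that dispatch pattern only; the `tensor`, `nop_offload`, and `cuda_offload` names are illustrative stand-ins, not llama.cpp symbols.

```cpp
#include <cstdio>

// Illustrative stand-ins for ggml_tensor and the CUDA offload hook.
struct tensor { const char * name; };
static void nop_offload(tensor *)    { /* CPU build: do nothing, like llama_nop */ }
static void cuda_offload(tensor * t) { std::printf("offloading %s to GPU\n", t->name); }

typedef void (*offload_func_t)(tensor *);

int main() {
    const int n_layer = 4, n_gpu_layers = 2;
    const int i_gpu_start = n_layer - n_gpu_layers;
    for (int il = 0; il < n_layer; ++il) {
        offload_func_t offload_func = nop_offload;          // default, as llama_nop is
        if (il >= i_gpu_start) offload_func = cuda_offload; // only the last n_gpu_layers layers
        tensor cur = { "layer_tensor" };
        offload_func(&cur); // in the real graph builder this is called on each intermediate tensor
    }
}
```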
@@ -69,6 +80,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
 {
     static std::map<e_model, size_t> k_sizes = {
+        { MODEL_3B,  256ull * MB },
         { MODEL_7B,  512ull * MB },
         { MODEL_13B, 512ull * MB },
         { MODEL_30B, 512ull * MB },
@@ -81,6 +93,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
 static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
 {
     static std::map<e_model, size_t> k_sizes = {
+        { MODEL_3B,   682ull * MB },
         { MODEL_7B,  1026ull * MB },
         { MODEL_13B, 1608ull * MB },
         { MODEL_30B, 3124ull * MB },
@@ -94,6 +107,7 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
 static const std::map<e_model, size_t> & MEM_REQ_EVAL()
 {
     static std::map<e_model, size_t> k_sizes = {
+        { MODEL_3B,   512ull * MB },
         { MODEL_7B,   768ull * MB },
         { MODEL_13B, 1024ull * MB },
         { MODEL_30B, 1280ull * MB },
@@ -165,6 +179,7 @@ struct llama_model {
     struct ggml_tensor * output;
 
     std::vector<llama_layer> layers;
+    int n_gpu_layers;
 
     // context
     struct ggml_context * ctx = NULL;
@@ -190,6 +205,16 @@ struct llama_model {
         if (ctx) {
             ggml_free(ctx);
         }
+
+#ifdef GGML_USE_CUBLAS
+        for (size_t i = 0; i < tensors_by_name.size(); ++i) {
+            ggml_cuda_free_data(tensors_by_name[i].second);
+        }
+#elif defined(GGML_USE_CLBLAST)
+        for (size_t i = 0; i < tensors_by_name.size(); ++i) {
+            ggml_cl_free_data(tensors_by_name[i].second);
+        }
+#endif
     }
 };
 
@@ -238,6 +263,10 @@ struct llama_context {
     llama_ctx_buffer buf_compute;
     llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
 
+#ifdef GGML_USE_METAL
+    ggml_metal_context * ctx_metal = NULL;
+#endif
+
     int    buf_last = 0;
     size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
 
@@ -277,15 +306,15 @@ template <typename T>
 static T checked_mul(T a, T b) {
     T ret = a * b;
     if (a != 0 && ret / a != b) {
-        throw format("overflow multiplying %llu * %llu",
-                     (unsigned long long) a, (unsigned long long) b);
+        throw std::runtime_error(format("overflow multiplying %llu * %llu",
+                     (unsigned long long) a, (unsigned long long) b));
     }
     return ret;
 }
 
 static size_t checked_div(size_t a, size_t b) {
     if (b == 0 || a % b != 0) {
-        throw format("error dividing %zu / %zu", a, b);
+        throw std::runtime_error(format("error dividing %zu / %zu", a, b));
     }
     return a / b;
 }
@@ -349,7 +378,7 @@ struct llama_load_tensor {
         const auto & first_shard = shards.at(0);
         for (const auto & shard : shards) {
             if (shard.type != first_shard.type) {
-                throw format("inconsistent tensor shard type in '%s'", name.c_str());
+                throw std::runtime_error(format("inconsistent tensor shard type in '%s'", name.c_str()));
             }
         }
         type = first_shard.type;
@@ -372,8 +401,8 @@
         const auto & first_shard = shards.at(0);
         for (const auto & shard : shards) {
             if (shard.ne != first_shard.ne) {
-                throw format("inconsistent tensor shard shape in '%s': first was %s, other was %s",
-                             name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str());
+                throw std::runtime_error(format("inconsistent tensor shard shape in '%s': first was %s, other was %s",
+                             name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str()));
             }
         }
         ne = first_shard.ne;
@@ -451,8 +480,8 @@ struct llama_file_loader {
             }
         }
 
-        throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
-                     magic, version);
+        throw std::runtime_error(format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
+                     magic, version));
     }
     void read_hparams() {
         hparams.n_vocab = file.read_u32();
@@ -492,7 +521,7 @@
             file.read_raw(shard.ne.data(), sizeof(shard.ne[0]) * n_dims);
             std::string name = file.read_string(name_len);
             if (n_dims < 1 || n_dims > 2) {
-                throw format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims);
+                throw std::runtime_error(format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims));
             }
             switch (shard.type) {
                 case GGML_TYPE_F32:
@@ -502,9 +531,14 @@
                 case GGML_TYPE_Q5_0:
                 case GGML_TYPE_Q5_1:
                 case GGML_TYPE_Q8_0:
+                case GGML_TYPE_Q2_K:
+                case GGML_TYPE_Q3_K:
+                case GGML_TYPE_Q4_K:
+                case GGML_TYPE_Q5_K:
+                case GGML_TYPE_Q6_K:
                     break;
                 default: {
-                    throw format("unrecognized tensor type %u\n", shard.type);
+                    throw std::runtime_error(format("unrecognized tensor type %u\n", shard.type));
                 }
             }
 
@@ -577,6 +611,11 @@ struct llama_file_saver {
             case GGML_TYPE_Q5_0:
             case GGML_TYPE_Q5_1:
             case GGML_TYPE_Q8_0:
+            case GGML_TYPE_Q2_K:
+            case GGML_TYPE_Q3_K:
+            case GGML_TYPE_Q4_K:
+            case GGML_TYPE_Q5_K:
+            case GGML_TYPE_Q6_K:
                 break;
             default: LLAMA_ASSERT(false);
         }
@@ -608,7 +647,7 @@ struct llama_model_loader {
             auto * ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
             file_loaders.emplace_back(ith_file);
             if (ith_file->hparams != first_file->hparams) {
-                throw format("llama.cpp: hparams inconsistent between files");
+                throw std::runtime_error(format("llama.cpp: hparams inconsistent between files"));
             }
         }
         if (!llama_mmap::SUPPORTED) {
@@ -638,7 +677,7 @@
     uint32_t guess_n_parts() const {
         auto it = tensors_map.name_to_idx.find("tok_embeddings.weight");
         if (it == tensors_map.name_to_idx.end()) {
-            throw std::string("missing tok_embeddings.weight");
+            throw std::runtime_error(std::string("missing tok_embeddings.weight"));
        }
        const llama_load_tensor & lt = tensors_map.tensors.at(it->second);
        return file_loaders.at(0)->hparams.n_embd / lt.shards.at(0).ne.at(0);
@@ -655,12 +694,12 @@
     struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
         auto it = tensors_map.name_to_idx.find(name);
         if (it == tensors_map.name_to_idx.end()) {
-            throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
+            throw std::runtime_error(std::runtime_error(format("llama.cpp: tensor '%s' is missing from model", name.c_str())));
        }
        llama_load_tensor & lt = tensors_map.tensors.at(it->second);
        if (lt.ne != ne) {
-            throw format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
-                         name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
+            throw std::runtime_error(format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
+                         name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str()));
        }
 
        return get_tensor_for(lt, backend);
@@ -676,6 +715,7 @@ struct llama_model_loader {
         }
         ggml_set_name(tensor, lt.name.c_str());
         LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
+
         tensor->backend = backend;
         lt.ggml_tensor = tensor;
         num_ggml_tensors_created++;
@@ -684,7 +724,7 @@
 
     void done_getting_tensors() const {
         if (num_ggml_tensors_created != tensors_map.tensors.size()) {
-            throw std::string("llama.cpp: file contained more tensors than expected");
+            throw std::runtime_error(std::string("llama.cpp: file contained more tensors than expected"));
         }
     }
 
@@ -828,7 +868,10 @@ static bool kv_cache_init(
 struct llama_context_params llama_context_default_params() {
     struct llama_context_params result = {
         /*.n_ctx        =*/ 512,
+        /*.n_batch      =*/ 512,
         /*.gpu_layers   =*/ 0,
+        /*.main_gpu     =*/ 0,
+        /*.tensor_split =*/ {0},
         /*.seed         =*/ -1,
         /*.f16_kv       =*/ true,
         /*.logits_all   =*/ false,
@@ -843,6 +886,17 @@ struct llama_context_params llama_context_default_params() {
     return result;
 }
 
+struct llama_model_quantize_params llama_model_quantize_default_params() {
+    struct llama_model_quantize_params result = {
+        /*.nthread                =*/ 0,
+        /*.ftype                  =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
+        /*.allow_requantize       =*/ false,
+        /*.quantize_output_tensor =*/ true,
+    };
+
+    return result;
+}
+
 bool llama_mmap_supported() {
     return llama_mmap::SUPPORTED;
 }
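Editor's note: the new `llama_context_params` fields (`n_batch`, `main_gpu`, `tensor_split`) and the new `llama_model_quantize_params` struct above change how callers configure the C API; `llama_model_quantize` now takes a params pointer instead of an ftype/thread pair (see the hunk near the end of this diff). A minimal usage sketch, assuming hypothetical model file names; only the API shapes are taken from this diff:

```cpp
#include "llama.h"

// Sketch only: "model-f16.bin" and "model-q4_k_m.bin" are illustrative paths.
int quantize_and_load_example() {
    llama_model_quantize_params qparams = llama_model_quantize_default_params();
    qparams.ftype   = LLAMA_FTYPE_MOSTLY_Q4_K_M; // one of the new k-quant file types
    qparams.nthread = 4;
    if (llama_model_quantize("model-f16.bin", "model-q4_k_m.bin", &qparams) != 0) {
        return 1; // quantization failed
    }

    llama_context_params cparams = llama_context_default_params();
    cparams.n_batch      = 512; // new field in 0.2.0
    cparams.n_gpu_layers = 32;  // layers to offload when built with CUDA/OpenCL/Metal
    cparams.main_gpu     = 0;   // new: device that holds the scratch buffers
    llama_context * ctx = llama_init_from_file("model-q4_k_m.bin", cparams);
    return ctx == nullptr ? 1 : 0;
}
```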
@@ -893,12 +947,23 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
         case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
         case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
+        // K-quants
+        case LLAMA_FTYPE_MOSTLY_Q2_K:   return "mostly Q2_K";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "mostly Q3_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "mostly Q3_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "mostly Q3_K - Large";
+        case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "mostly Q4_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "mostly Q4_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "mostly Q5_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "mostly Q5_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q6_K:   return "mostly Q6_K";
         default: return "unknown, may not work";
     }
 }
 
 static const char *llama_model_type_name(e_model type) {
     switch (type) {
+        case MODEL_3B: return "3B";
         case MODEL_7B: return "7B";
         case MODEL_13B: return "13B";
         case MODEL_30B: return "30B";
@@ -911,7 +976,10 @@ static void llama_model_load_internal(
         const std::string & fname,
         llama_context & lctx,
         int n_ctx,
+        int n_batch,
         int n_gpu_layers,
+        int main_gpu,
+        const float * tensor_split,
         ggml_type memory_type,
         bool use_mmap,
         bool use_mlock,
@@ -926,12 +994,13 @@
     lctx.vocab = std::move(ml->file_loaders.at(0)->vocab);
     auto & model = lctx.model;
     model.hparams = ml->file_loaders.at(0)->hparams;
+    model.n_gpu_layers = n_gpu_layers;
     llama_file_version file_version = ml->file_loaders.at(0)->file_version;
     auto & hparams = model.hparams;
-    uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
 
     {
         switch (hparams.n_layer) {
+            case 26: model.type = e_model::MODEL_3B; break;
             case 32: model.type = e_model::MODEL_7B; break;
             case 40: model.type = e_model::MODEL_13B; break;
             case 60: model.type = e_model::MODEL_30B; break;
@@ -941,6 +1010,8 @@
         hparams.n_ctx = n_ctx;
     }
 
+    const uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
+
     {
         fprintf(stderr, "%s: format  = %s\n", __func__, llama_file_version_name(file_version));
         fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab);
@@ -960,7 +1031,7 @@
         if (hparams.ftype != LLAMA_FTYPE_ALL_F32 &&
             hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 &&
             hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) {
-            throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)");
+            throw std::runtime_error(format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)"));
        }
    }
 
@@ -968,7 +1039,7 @@
         if (hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
             hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_1 ||
             hparams.ftype == LLAMA_FTYPE_MOSTLY_Q8_0) {
-            throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)");
+            throw std::runtime_error(format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)"));
        }
    }
 
@@ -999,18 +1070,28 @@ static void llama_model_load_internal(
 
         model.ctx = ggml_init(params);
         if (!model.ctx) {
-            throw format("ggml_init() failed");
+            throw std::runtime_error(format("ggml_init() failed"));
         }
     }
 
-
-#
+    (void) main_gpu;
+#if defined(GGML_USE_CUBLAS)
+    fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__);
+    ggml_cuda_set_main_device(main_gpu);
+#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
+#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
+#elif defined(GGML_USE_CLBLAST)
+    fprintf(stderr, "%s: using OpenCL for GPU acceleration\n", __func__);
+#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
+#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU
 #else
-#define LLAMA_BACKEND_OFFLOAD
+#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
+#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_CPU
 #endif
 
     // prepare memory for the weights
-    size_t
+    size_t vram_weights = 0;
+    size_t vram_scratch = 0;
     {
         const uint32_t n_embd  = hparams.n_embd;
         const uint32_t n_layer = hparams.n_layer;
@@ -1025,7 +1106,7 @@ static void llama_model_load_internal(
         {
             ggml_backend backend_output;
             if (n_gpu_layers > int(n_layer)) { // NOLINT
-                backend_output =
+                backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
             } else {
                 backend_output = GGML_BACKEND_CPU;
             }
@@ -1037,7 +1118,8 @@ static void llama_model_load_internal(
 
         model.layers.resize(n_layer);
         for (uint32_t i = 0; i < n_layer; ++i) {
-            const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+            const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+            const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
 
             auto & layer = model.layers[i];
 
@@ -1045,19 +1127,19 @@
 
             layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);
 
-            layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd},
-            layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd},
-            layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd},
-            layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd},
+            layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend_split);
+            layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend_split);
+            layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend_split);
+            layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend_split);
 
             layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);
 
-            layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd,   n_ff},
-            layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", {  n_ff, n_embd},
-            layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd,   n_ff},
+            layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd,   n_ff}, backend_split);
+            layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", {  n_ff, n_embd}, backend_split);
+            layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd,   n_ff}, backend_split);
 
-            if (backend ==
-
+            if (backend == GGML_BACKEND_GPU) {
+                vram_weights +=
                     ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
                     ggml_nbytes(layer.wv)             + ggml_nbytes(layer.wo) + ggml_nbytes(layer.attention_norm) +
                     ggml_nbytes(layer.w1)             + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
@@ -1074,10 +1156,10 @@ static void llama_model_load_internal(
         // this is the total memory required to run the inference
         const size_t mem_required =
             ctx_size +
-            mmapped_size -
+            mmapped_size - vram_weights + // weights in VRAM not in memory
             MEM_REQ_SCRATCH0().at(model.type) +
             MEM_REQ_SCRATCH1().at(model.type) +
-            MEM_REQ_EVAL().at(model.type);
+            MEM_REQ_EVAL().at    (model.type);
 
         // this is the memory required by one llama_state
         const size_t mem_required_state =
@@ -1086,15 +1168,25 @@
         fprintf(stderr, "%s: mem required  = %7.2f MB (+ %7.2f MB per state)\n", __func__,
                 mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
 
+        (void) vram_scratch;
 #ifdef GGML_USE_CUBLAS
+        vram_scratch = n_batch * MB;
+        ggml_cuda_set_scratch_size(vram_scratch);
+        if (n_gpu_layers > 0) {
+            fprintf(stderr, "%s: allocating batch_size x 1 MB = %ld MB VRAM for the scratch buffer\n",
+                    __func__, vram_scratch / MB);
+        }
+#endif // GGML_USE_CUBLAS
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
         const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
 
-        fprintf(stderr, "%s:
+        fprintf(stderr, "%s: offloading %d layers to GPU\n", __func__, n_gpu);
         if (n_gpu_layers > (int) hparams.n_layer) {
-            fprintf(stderr, "%s:
+            fprintf(stderr, "%s: offloading output layer to GPU\n", __func__);
         }
-        fprintf(stderr, "%s:
-
+        fprintf(stderr, "%s: total VRAM used: %zu MB\n",
+                __func__, (vram_weights + vram_scratch + MB - 1) / MB); // round up
+#else
         (void) n_gpu_layers;
 #endif
     }
@@ -1106,8 +1198,10 @@ static void llama_model_load_internal(
 
     ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
 
-#
+#if defined(GGML_USE_CUBLAS)
     {
+        ggml_cuda_set_tensor_split(tensor_split);
+
         size_t done_size = 0;
         size_t data_size = 0;
         for (llama_load_tensor & lt : ml->tensors_map.tensors) {
@@ -1117,7 +1211,8 @@
             }
         }
         for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-
+            ggml_backend backend = lt.ggml_tensor->backend;
+            if (backend != GGML_BACKEND_GPU && backend != GGML_BACKEND_GPU_SPLIT) {
                 continue;
             }
             if (progress_callback) {
@@ -1129,30 +1224,28 @@
     }
 #elif defined(GGML_USE_CLBLAST)
     {
-
-
-
-
-
-
-            const auto & layer = model.layers[i];
-
-            ggml_cl_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
-            ggml_cl_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
-            ggml_cl_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
-            ggml_cl_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
-            ggml_cl_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
-            ggml_cl_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
-            ggml_cl_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
+        size_t done_size = 0;
+        size_t data_size = 0;
+        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+            data_size += lt.size;
+            if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
+                done_size += lt.size;
+            }
         }
-
-
-
+        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+            if (lt.ggml_tensor->backend != GGML_BACKEND_GPU) {
+                continue;
+            }
+            if (progress_callback) {
+                progress_callback((float) done_size / data_size, progress_callback_user_data);
+            }
+            ggml_cl_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
+            done_size += lt.size;
         }
-
-        fprintf(stderr, "ggml_opencl: total VRAM used: %zu MB\n", vram_total / 1024 / 1024);
     }
+#else
+    (void) n_batch;
+    (void) tensor_split;
 #endif
 
     if (progress_callback) {
@@ -1170,7 +1263,10 @@ static bool llama_model_load(
         const std::string & fname,
         llama_context & lctx,
         int n_ctx,
+        int n_batch,
         int n_gpu_layers,
+        int main_gpu,
+        float * tensor_split,
         ggml_type memory_type,
         bool use_mmap,
         bool use_mlock,
@@ -1178,28 +1274,30 @@
         llama_progress_callback progress_callback,
         void *progress_callback_user_data) {
     try {
-        llama_model_load_internal(fname, lctx, n_ctx, n_gpu_layers,
-                                  vocab_only, progress_callback, progress_callback_user_data);
+        llama_model_load_internal(fname, lctx, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, memory_type,
+                                  use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
         return true;
-    } catch (const std::
-        fprintf(stderr, "error loading model: %s\n", err.
+    } catch (const std::exception & err) {
+        fprintf(stderr, "error loading model: %s\n", err.what());
         return false;
     }
 }
 
 // evaluate the transformer
 //
-//   - lctx:
-//   - tokens:
-//   - n_past:
-//   - n_threads:
+//   - lctx:         llama context
+//   - tokens:       new batch of tokens to process
+//   - n_past:       the context size so far
+//   - n_threads:    number of threads to use
+//   - cgraph_fname: filename of the exported computation graph
 //
 static bool llama_eval_internal(
-        llama_context &
-        const llama_token *
-        const int
-        const int
-        const int
+        llama_context &     lctx,
+        const llama_token * tokens,
+        const int           n_tokens,
+        const int           n_past,
+        const int           n_threads,
+        const char *        cgraph_fname) {
 
     // enforce that the first token is BOS
     if (n_past == 0 && tokens[0] != llama_token_bos()) {
@@ -1218,12 +1316,13 @@ static bool llama_eval_internal(
 
     LLAMA_ASSERT(!!kv_self.ctx);
 
-    const int n_embd
-    const int n_layer
-    const int n_ctx
-    const int n_head
-    const int n_vocab
-    const int n_rot
+    const int n_embd       = hparams.n_embd;
+    const int n_layer      = hparams.n_layer;
+    const int n_ctx        = hparams.n_ctx;
+    const int n_head       = hparams.n_head;
+    const int n_vocab      = hparams.n_vocab;
+    const int n_rot        = hparams.n_embd/hparams.n_head;
+    const int n_gpu_layers = model.n_gpu_layers;
 
     auto & mem_per_token = lctx.mem_per_token;
     auto & buf_compute   = lctx.buf_compute;
@@ -1245,40 +1344,66 @@ static bool llama_eval_internal(
     ggml_set_name(embd, "embd");
     memcpy(embd->data, tokens, N*ggml_element_size(embd));
 
+    struct ggml_tensor * cur;
     struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
 
+    const int i_gpu_start = n_layer - n_gpu_layers;
+    (void) i_gpu_start;
+
     for (int il = 0; il < n_layer; ++il) {
-
+        offload_func_t offload_func = llama_nop;
 
-
+#ifdef GGML_USE_CUBLAS
+        if (il >= i_gpu_start) {
+            offload_func = ggml_cuda_assign_buffers; // sets the output backend to GPU
+        }
+#endif // GGML_USE_CUBLAS
+
+        struct ggml_tensor * inpSA = inpL;
 
         lctx.use_buf(ctx0, 0);
 
         // norm
         {
             cur = ggml_rms_norm(ctx0, inpL);
+            offload_func(cur);
+            ggml_set_name(cur, "rms_norm_0");
 
             // cur = cur*attention_norm(broadcasted)
             cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm);
+            offload_func(cur);
+            ggml_set_name(cur, "attention_norm_0");
         }
 
         // self-attention
         {
             // compute Q and K and RoPE them
-            struct ggml_tensor *
-
-            ggml_set_name(
+            struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+            // offload_func(tmpq);
+            ggml_set_name(tmpq, "tmpq");
+
+            struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+            // offload_func(tmpk);
+            ggml_set_name(tmpk, "tmpk");
+
+            struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0);
             ggml_set_name(Kcur, "Kcur");
 
+            struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0);
+            ggml_set_name(Qcur, "Qcur");
+
             // store key and value to memory
             {
                 // compute the transposed [N, n_embd] V matrix
                 struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), n_embd, N));
+                ggml_set_name(Vcur, "Vcur");
 
                 struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
+                ggml_set_name(k, "k");
                 struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
                         (   n_ctx)*ggml_element_size(kv_self.v),
                         (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
+                ggml_set_name(v, "v");
 
                 // important: storing RoPE-ed version of K in the KV cache!
                 ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
@@ -1319,7 +1444,6 @@ static bool llama_eval_internal(
             struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
             ggml_set_name(KQ_soft_max, "KQ_soft_max");
 
-
             // split cached V into n_head heads
             struct ggml_tensor * V =
                 ggml_view_3d(ctx0, kv_self.v,
@@ -1354,73 +1478,143 @@ static bool llama_eval_internal(
             cur = ggml_mul_mat(ctx0,
                     model.layers[il].wo,
                     cur);
+            offload_func(cur);
+            ggml_set_name(cur, "result_wo");
         }
 
         lctx.use_buf(ctx0, 1);
+        //ggml_cuda_set_scratch(1);
 
         struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
+        offload_func(inpFF);
+        ggml_set_name(inpFF, "inpFF");
 
         // feed-forward network
         {
             // norm
             {
                 cur = ggml_rms_norm(ctx0, inpFF);
+                offload_func(cur);
+                ggml_set_name(cur, "rms_norm_1");
 
                 // cur = cur*ffn_norm(broadcasted)
                 cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
+                offload_func(cur);
+                ggml_set_name(cur, "ffn_norm");
             }
 
             struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
                     model.layers[il].w3,
                     cur);
+            offload_func(tmp);
+            ggml_set_name(tmp, "result_w3");
 
             cur = ggml_mul_mat(ctx0,
                     model.layers[il].w1,
                     cur);
+            offload_func(cur);
+            ggml_set_name(cur, "result_w2");
 
             // SILU activation
             cur = ggml_silu(ctx0, cur);
+            offload_func(cur);
+            ggml_set_name(cur, "silu");
 
             cur = ggml_mul(ctx0, cur, tmp);
+            offload_func(cur);
+            ggml_set_name(cur, "silu_x_result_w3");
 
             cur = ggml_mul_mat(ctx0,
                     model.layers[il].w2,
                     cur);
+            offload_func(cur);
+            ggml_set_name(cur, "result_w2");
         }
 
         cur = ggml_add(ctx0, cur, inpFF);
+        offload_func(cur);
+        ggml_set_name(cur, "inpFF_+_result_w2");
 
         // input for next layer
         inpL = cur;
+
     }
 
     lctx.use_buf(ctx0, 0);
+    //ggml_cuda_set_scratch(0);
 
     // used at the end to optionally extract the embeddings
     struct ggml_tensor * embeddings = NULL;
 
+    offload_func_t offload_func = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+    if (n_gpu_layers > n_layer) {
+        offload_func = ggml_cuda_assign_buffers; // sets the output backend to GPU
+    }
+#endif // GGML_USE_CUBLAS
+
     // norm
     {
+        cur = ggml_rms_norm(ctx0, inpL);
+        offload_func(cur);
+        ggml_set_name(cur, "rms_norm_inpL");
 
-
+        cur = ggml_rms_norm(ctx0, cur);
+        offload_func(cur);
+        ggml_set_name(cur, "rms_norm_after");
 
-        //
-
+        // cur = cur*norm(broadcasted)
+        cur = ggml_mul(ctx0, cur, model.norm);
+        offload_func(cur);
+        ggml_set_name(cur, "result_norm");
 
-        embeddings =
+        embeddings = cur;
     }
 
+
     // lm_head
-
+    cur = ggml_mul_mat(ctx0, model.output, cur);
+    ggml_set_name(cur, "result_output");
 
     lctx.use_buf(ctx0, -1);
 
     // logits -> probs
-    //
+    //cur = ggml_soft_max_inplace(ctx0, cur);
 
     // run the computation
-    ggml_build_forward_expand(&gf,
-
+    ggml_build_forward_expand(&gf, cur);
+
+#ifdef GGML_USE_METAL
+    if (lctx.ctx_metal && N == 1) {
+        ggml_metal_graph_compute(lctx.ctx_metal, &gf);
+        ggml_metal_get_tensor   (lctx.ctx_metal, cur);
+    } else {
+        // IMPORTANT:
+        // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla
+        // ggml_graph_compute(). It uses Apple's Accelerate CBLAS API which takes advantage of the ANE or the AMX
+        // coprocessor.
+        //
+        // When we implement Matrix x Matrix Metal multiplication, we can avoid this branch.
+        // But for now, we have focused only on Matrix x Vector Metal multiplication.
+        //
+        // TODO: avoid these syncs via shared memory (ref #1696)
+        //
+        if (lctx.ctx_metal) {
+            // We need to sync the GPU KV cache with the CPU KV cache
+            ggml_metal_get_tensor(lctx.ctx_metal, kv_self.k);
+            ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
+        }
+
+        ggml_graph_compute(ctx0, &gf);
+    }
+#else
+    ggml_graph_compute(ctx0, &gf);
+#endif
+
+    if (cgraph_fname) {
+        ggml_graph_export(&gf, cgraph_fname);
+    }
 
 #ifdef GGML_PERF
     // print timing information per ggml operation (for debugging purposes)
@@ -1434,7 +1628,7 @@ static bool llama_eval_internal(
     //}
 
     //embd_w.resize(n_vocab*N);
-    //memcpy(embd_w.data(), ggml_get_data(
+    //memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N);
 
     // update kv token count
     lctx.model.kv_self.n = n_past + N;
@@ -1445,11 +1639,11 @@
 
         if (lctx.logits_all) {
             logits_out.resize(n_vocab * N);
-            memcpy(logits_out.data(), (float *) ggml_get_data(
+            memcpy(logits_out.data(), (float *) ggml_get_data(cur), sizeof(float)*n_vocab*N);
         } else {
             // return result for just the last token
             logits_out.resize(n_vocab);
-            memcpy(logits_out.data(), (float *) ggml_get_data(
+            memcpy(logits_out.data(), (float *) ggml_get_data(cur) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
         }
     }
 
@@ -2048,16 +2242,88 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array
 // quantization
 //
 
-static void
+static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llama_buffer & output, const int nelements, const int nthread) {
+    if (output.size < nelements * sizeof(float)) {
+        output.resize(nelements * sizeof(float));
+    }
+    float * f32_output = (float *) output.addr;
+
+    quantize_fns_t qtype;
+    if (ggml_is_quantized(tensor.type)) {
+        qtype = ggml_internal_get_quantize_fn(tensor.type);
+        if (qtype.dequantize_row_q == NULL) {
+            throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor.type)));
+        }
+    } else if (tensor.type != GGML_TYPE_F16) {
+        throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor.type)));
+    }
+
+    if (nthread < 2) {
+        if (tensor.type == GGML_TYPE_F16) {
+            ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor.data, f32_output, nelements);
+        } else if (ggml_is_quantized(tensor.type)) {
+            qtype.dequantize_row_q(tensor.data, f32_output, nelements);
+        } else {
+            LLAMA_ASSERT(false); // unreachable
+        }
+        return;
+    }
+
+    auto block_size = tensor.type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor.type);
+    auto block_size_bytes = ggml_type_size(tensor.type);
+
+    LLAMA_ASSERT(nelements % block_size == 0);
+    auto nblocks = nelements / block_size;
+    auto blocks_per_thread = nblocks / nthread;
+    auto spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count
+
+    std::vector<std::thread> workers;
+    for (auto tnum = 0, in_buff_offs = 0, out_buff_offs = 0; tnum < nthread; tnum++) {
+        auto thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
+        auto thr_elems = thr_blocks * block_size; // number of elements for this thread
+        auto thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread
+
+        auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
+            if (typ == GGML_TYPE_F16) {
+                ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
+            } else {
+                qtype.dequantize_row_q(inbuf, outbuf, nels);
+            }
+        };
+        workers.push_back(std::thread(compute, tensor.type, tensor.data + in_buff_offs, f32_output + out_buff_offs, thr_elems));
+        in_buff_offs += thr_block_bytes;
+        out_buff_offs += thr_elems;
+    }
+    for (auto & worker : workers) {
+        worker.join();
+    }
+
+}
+
+static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
     ggml_type quantized_type;
-
+    llama_ftype ftype = params->ftype;
+    int nthread = params->nthread;
+
+    switch (params->ftype) {
         case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
         case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
         case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
         case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
         case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
-
-
+
+        // K-quants
+        case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
+        case LLAMA_FTYPE_MOSTLY_Q3_K_S:
+        case LLAMA_FTYPE_MOSTLY_Q3_K_M:
+        case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
+        case LLAMA_FTYPE_MOSTLY_Q4_K_S:
+        case LLAMA_FTYPE_MOSTLY_Q4_K_M: quantized_type = GGML_TYPE_Q4_K; break;
+        case LLAMA_FTYPE_MOSTLY_Q5_K_S:
+        case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
+        case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
+        default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
+    }
 
     if (nthread <= 0) {
         nthread = std::thread::hardware_concurrency();
@@ -2065,7 +2331,21 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params)
 
     std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false,
                                                                             /*vocab_only*/ false));
-    llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
+    llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), params->ftype);
+
+    int n_attention_wv    = 0;
+    int n_feed_forward_w2 = 0;
+    for (auto& tensor : model_loader->tensors_map.tensors) {
+        if (tensor.name.find("attention.wv.weight") != std::string::npos) {
+            ++n_attention_wv;
+        }
+        else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
+            ++n_feed_forward_w2;
+        }
+    }
+
+    int i_attention_wv = 0;
+    int i_feed_forward_w2 = 0;
 
     size_t total_size_org = 0;
     size_t total_size_new = 0;
@@ -2093,9 +2373,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params)
         quantize &= (tensor.ne.size() == 2);
 
         // uncomment this to keep the output layer in FP16
-
-
-
+        if (!params->quantize_output_tensor && tensor.name == "output.weight") {
+            quantize = false;
+        }
+        quantize = quantize && quantized_type != tensor.type;
 
         enum ggml_type new_type;
         void * new_data;
@@ -2109,20 +2390,43 @@
             printf("size = %8.3f MB\n", tensor.size/1024.0/1024.0);
         } else {
             new_type = quantized_type;
+            // TODO: temporary disabled until Metal / OpenCL support is available
+            // ref: https://github.com/ggerganov/llama.cpp/issues/1711
+            //if (tensor.name == "output.weight") {
+            //    new_type = GGML_TYPE_Q6_K;
+            //}
+            if (tensor.name.find("attention.wv.weight") != std::string::npos) {
+                if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+                else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
                        (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8 ||
                        (i_attention_wv - n_attention_wv/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
+                ++i_attention_wv;
+            }
+            if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
+                if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+                else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
                        (i_feed_forward_w2 < n_feed_forward_w2/8 || i_feed_forward_w2 >= 7*n_feed_forward_w2/8 ||
                        (i_feed_forward_w2 - n_feed_forward_w2/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
+                ++i_feed_forward_w2;
+            }
+            if (tensor.name.find("attention.wo.weight") != std::string::npos) {
+                if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+            }
+
             float * f32_data;
             size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
             llama_buffer f32_conv_buf;
+
             if (tensor.type == GGML_TYPE_F32) {
                 f32_data = (float *) tensor.data;
-            } else if (tensor.type
-
-                f32_data = (float *) f32_conv_buf.addr;
-                const auto * f16_data = (const ggml_fp16_t *) tensor.data;
-                for (size_t i = 0; i < nelements; i++) {
-                    f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
-                }
+            } else if (ggml_is_quantized(tensor.type) && !params->allow_requantize) {
+                throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor.type)));
             } else {
-
+                llama_convert_tensor_internal(tensor, f32_conv_buf, nelements, nthread);
+                f32_data = (float *) f32_conv_buf.addr;
             }
 
             printf("quantizing .. ");
@@ -2176,12 +2480,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params)
             }
 
             printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
+            int64_t tot_count = 0;
             for (size_t i = 0; i < hist_cur.size(); i++) {
                 hist_all[i] += hist_cur[i];
+                tot_count += hist_cur[i];
            }
 
-
-
+            if (tot_count > 0) {
+                for (size_t i = 0; i < hist_cur.size(); i++) {
+                    printf("%5.3f ", hist_cur[i] / float(nelements));
+                }
            }
            printf("\n");
        }
@@ -2199,11 +2507,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params)
             sum_all += hist_all[i];
         }
 
-
-
-
+        if (sum_all > 0) {
+            printf("%s: hist: ", __func__);
+            for (size_t i = 0; i < hist_all.size(); i++) {
+                printf("%5.3f ", hist_all[i] / float(sum_all));
+            }
+            printf("\n");
        }
-        printf("\n");
    }
 }
 
@@ -2244,9 +2554,9 @@ struct llama_context * llama_init_from_file(
 
     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
 
-    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.
-
-
+    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers,
+                params.main_gpu, params.tensor_split, memory_type, params.use_mmap, params.use_mlock,
+                params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
         fprintf(stderr, "%s: failed to load model\n", __func__);
         llama_free(ctx);
         return nullptr;
@@ -2284,6 +2594,38 @@ struct llama_context * llama_init_from_file(
         ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
     }
 
+#ifdef GGML_USE_METAL
+    if (params.n_gpu_layers > 0) {
+        // this allocates all Metal resources and memory buffers
+        ctx->ctx_metal = ggml_metal_init();
+
+        void * data_ptr  = NULL;
+        size_t data_size = 0;
+        if (params.use_mmap) {
+            data_ptr  = ctx->model.mapping->addr;
+            data_size = ctx->model.mapping->size;
+        } else {
+            data_ptr  = ggml_get_mem_buffer(ctx->model.ctx);
+            data_size = ggml_get_mem_size(ctx->model.ctx);
+        }
+
+#define LLAMA_METAL_CHECK_BUF(result) \
+    if (!(result)) { \
+        fprintf(stderr, "%s: failed to add buffer\n", __func__); \
+        llama_free(ctx); \
+        return NULL; \
+    }
+
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size));
+
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv",   ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size));
+#undef LLAMA_METAL_CHECK_BUF
+    }
+#endif
+
     return ctx;
 }
 
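Editor's note: as the hunk above shows, the Metal path is only set up when `n_gpu_layers` is greater than zero, and it registers the model weights, eval buffer, KV cache, and both scratch buffers with the Metal context. A minimal caller-side sketch, assuming a build with `GGML_USE_METAL` defined and an illustrative model path:

```cpp
#include "llama.h"

// Sketch only; "model-q4_0.bin" is a placeholder path.
llama_context * init_with_metal() {
    llama_context_params params = llama_context_default_params();
    params.n_gpu_layers = 1;    // any value > 0 makes llama_init_from_file call ggml_metal_init()
    params.use_mmap     = true; // the mapped weights are registered with Metal as the "data" buffer
    return llama_init_from_file("model-q4_0.bin", params);
}
```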
@@ -2294,13 +2636,12 @@ void llama_free(struct llama_context * ctx) {
 int llama_model_quantize(
         const char * fname_inp,
         const char * fname_out,
-
-        int nthread) {
+        const llama_model_quantize_params *params) {
     try {
-        llama_model_quantize_internal(fname_inp, fname_out,
+        llama_model_quantize_internal(fname_inp, fname_out, params);
         return 0;
-    } catch (const std::
-        fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.
+    } catch (const std::exception & err) {
+        fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.what());
         return 1;
     }
 }
@@ -2553,8 +2894,8 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
 int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
     try {
         return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads);
-    } catch (const std::
-        fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.
+    } catch (const std::exception & err) {
+        fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
         return 1;
     }
 }
@@ -2899,7 +3240,7 @@ int llama_eval(
         int n_tokens,
         int n_past,
         int n_threads) {
-    if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads)) {
+    if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads, nullptr)) {
         fprintf(stderr, "%s: failed to eval\n", __func__);
         return 1;
     }
@@ -2914,6 +3255,20 @@ int llama_eval(
     return 0;
 }
 
+int llama_eval_export(struct llama_context * ctx, const char * fname) {
+    const int n_batch = 1;
+    const int n_ctx   = 512 - n_batch;
+
+    const std::vector<llama_token> tmp(n_batch, llama_token_bos());
+
+    if (!llama_eval_internal(*ctx, tmp.data(), tmp.size(), n_ctx, 1, fname)) {
+        fprintf(stderr, "%s: failed to eval\n", __func__);
+        return 1;
+    }
+
+    return 0;
+}
+
 int llama_tokenize(
         struct llama_context * ctx,
         const char * text,