llama_cpp 0.1.3 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +39 -8
- data/ext/llama_cpp/extconf.rb +26 -1
- data/ext/llama_cpp/llama_cpp.cpp +210 -13
- data/ext/llama_cpp/src/ggml-cuda.cu +1916 -0
- data/ext/llama_cpp/src/ggml-cuda.h +15 -2
- data/ext/llama_cpp/src/ggml-metal.h +63 -0
- data/ext/llama_cpp/src/ggml-metal.m +783 -0
- data/ext/llama_cpp/src/ggml-metal.metal +1133 -0
- data/ext/llama_cpp/src/ggml-opencl.cpp +242 -52
- data/ext/llama_cpp/src/ggml-opencl.h +4 -0
- data/ext/llama_cpp/src/ggml.c +835 -82
- data/ext/llama_cpp/src/ggml.h +64 -8
- data/ext/llama_cpp/src/k_quants.c +2244 -0
- data/ext/llama_cpp/src/k_quants.h +122 -0
- data/ext/llama_cpp/src/llama-util.h +16 -0
- data/ext/llama_cpp/src/llama.cpp +489 -134
- data/ext/llama_cpp/src/llama.h +43 -7
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +33 -1
- metadata +8 -2
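
The largest change in this release is the vendored llama.cpp source, diffed below. It adds the K-quant file types, GPU-offload fields on `llama_context_params` (`n_batch`, `main_gpu`, `tensor_split` alongside `n_gpu_layers`), and replaces the old `(ftype, nthread)` arguments of `llama_model_quantize` with a `llama_model_quantize_params` struct. The following is a minimal, hypothetical C++ sketch of how a caller might drive the updated C API based on the signatures visible in the diff; the model file names are placeholders and error handling is reduced to the essentials.

```c++
// Sketch against the 0.2.0-era llama.h API shown in the diff below (not part of the package).
// "model-f16.ggml" / "model-q4_k_m.ggml" are placeholder paths.
#include <cstdio>
#include "llama.h"

int main() {
    // New quantize entry point: a params struct instead of separate ftype/nthread arguments.
    llama_model_quantize_params qparams = llama_model_quantize_default_params();
    qparams.ftype   = LLAMA_FTYPE_MOSTLY_Q4_K_M; // one of the new K-quant file types
    qparams.nthread = 4;
    if (llama_model_quantize("model-f16.ggml", "model-q4_k_m.ggml", &qparams) != 0) {
        fprintf(stderr, "quantization failed\n");
        return 1;
    }

    // Context params gained n_batch and main_gpu alongside n_gpu_layers.
    llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx        = 512;
    cparams.n_batch      = 512;
    cparams.n_gpu_layers = 32; // offloaded when built with CUDA/OpenCL/Metal support
    cparams.main_gpu     = 0;

    llama_context * ctx = llama_init_from_file("model-q4_k_m.ggml", cparams);
    if (ctx == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }
    llama_free(ctx);
    return 0;
}
```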
data/ext/llama_cpp/src/llama.cpp
CHANGED
```diff
@@ -16,6 +16,10 @@
 #include "ggml-opencl.h"
 #endif

+#ifdef GGML_USE_METAL
+#include "ggml-metal.h"
+#endif
+
 #include <array>
 #include <ctime>
 #include <cinttypes>
@@ -42,22 +46,29 @@
 // available llama models
 enum e_model {
     MODEL_UNKNOWN,
+    MODEL_3B,
     MODEL_7B,
     MODEL_13B,
     MODEL_30B,
     MODEL_65B,
 };

-
 static const size_t MB = 1024*1024;

 // computed for n_ctx == 2048
 // TODO: dynamically determine these sizes
 //       needs modifications in ggml

+typedef void (*offload_func_t)(struct ggml_tensor * tensor);
+
+void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
+    (void) tensor;
+}
+
 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
 {
     static std::map<e_model, size_t> k_sizes = {
+        { MODEL_3B,  256ull * MB },
         { MODEL_7B,  512ull * MB },
         { MODEL_13B, 512ull * MB },
         { MODEL_30B, 512ull * MB },
@@ -69,6 +80,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
 {
     static std::map<e_model, size_t> k_sizes = {
+        { MODEL_3B,  256ull * MB },
         { MODEL_7B,  512ull * MB },
         { MODEL_13B, 512ull * MB },
         { MODEL_30B, 512ull * MB },
@@ -81,6 +93,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
 static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
 {
     static std::map<e_model, size_t> k_sizes = {
+        { MODEL_3B,  682ull * MB },
         { MODEL_7B,  1026ull * MB },
         { MODEL_13B, 1608ull * MB },
         { MODEL_30B, 3124ull * MB },
@@ -94,6 +107,7 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
 static const std::map<e_model, size_t> & MEM_REQ_EVAL()
 {
     static std::map<e_model, size_t> k_sizes = {
+        { MODEL_3B,  512ull * MB },
         { MODEL_7B,  768ull * MB },
         { MODEL_13B, 1024ull * MB },
         { MODEL_30B, 1280ull * MB },
@@ -165,6 +179,7 @@ struct llama_model {
     struct ggml_tensor * output;

     std::vector<llama_layer> layers;
+    int n_gpu_layers;

     // context
     struct ggml_context * ctx = NULL;
@@ -190,6 +205,16 @@ struct llama_model {
         if (ctx) {
             ggml_free(ctx);
         }
+
+#ifdef GGML_USE_CUBLAS
+        for (size_t i = 0; i < tensors_by_name.size(); ++i) {
+            ggml_cuda_free_data(tensors_by_name[i].second);
+        }
+#elif defined(GGML_USE_CLBLAST)
+        for (size_t i = 0; i < tensors_by_name.size(); ++i) {
+            ggml_cl_free_data(tensors_by_name[i].second);
+        }
+#endif
     }
 };

@@ -238,6 +263,10 @@ struct llama_context {
     llama_ctx_buffer buf_compute;
     llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];

+#ifdef GGML_USE_METAL
+    ggml_metal_context * ctx_metal = NULL;
+#endif
+
     int buf_last = 0;
     size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };

@@ -277,15 +306,15 @@ template <typename T>
 static T checked_mul(T a, T b) {
     T ret = a * b;
     if (a != 0 && ret / a != b) {
-        throw format("overflow multiplying %llu * %llu",
-                     (unsigned long long) a, (unsigned long long) b);
+        throw std::runtime_error(format("overflow multiplying %llu * %llu",
+                                        (unsigned long long) a, (unsigned long long) b));
     }
     return ret;
 }

 static size_t checked_div(size_t a, size_t b) {
     if (b == 0 || a % b != 0) {
-        throw format("error dividing %zu / %zu", a, b);
+        throw std::runtime_error(format("error dividing %zu / %zu", a, b));
     }
     return a / b;
 }
@@ -349,7 +378,7 @@ struct llama_load_tensor {
         const auto & first_shard = shards.at(0);
         for (const auto & shard : shards) {
             if (shard.type != first_shard.type) {
-                throw format("inconsistent tensor shard type in '%s'", name.c_str());
+                throw std::runtime_error(format("inconsistent tensor shard type in '%s'", name.c_str()));
             }
         }
         type = first_shard.type;
@@ -372,8 +401,8 @@ struct llama_load_tensor {
         const auto & first_shard = shards.at(0);
         for (const auto & shard : shards) {
             if (shard.ne != first_shard.ne) {
-                throw format("inconsistent tensor shard shape in '%s': first was %s, other was %s",
-                             name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str());
+                throw std::runtime_error(format("inconsistent tensor shard shape in '%s': first was %s, other was %s",
+                                                name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str()));
             }
         }
         ne = first_shard.ne;
@@ -451,8 +480,8 @@ struct llama_file_loader {
             }
         }

-        throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
-                     magic, version);
+        throw std::runtime_error(format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
+                                        magic, version));
     }
     void read_hparams() {
         hparams.n_vocab = file.read_u32();
@@ -492,7 +521,7 @@ struct llama_file_loader {
             file.read_raw(shard.ne.data(), sizeof(shard.ne[0]) * n_dims);
             std::string name = file.read_string(name_len);
             if (n_dims < 1 || n_dims > 2) {
-                throw format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims);
+                throw std::runtime_error(format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims));
             }
             switch (shard.type) {
                 case GGML_TYPE_F32:
@@ -502,9 +531,14 @@ struct llama_file_loader {
                 case GGML_TYPE_Q5_0:
                 case GGML_TYPE_Q5_1:
                 case GGML_TYPE_Q8_0:
+                case GGML_TYPE_Q2_K:
+                case GGML_TYPE_Q3_K:
+                case GGML_TYPE_Q4_K:
+                case GGML_TYPE_Q5_K:
+                case GGML_TYPE_Q6_K:
                     break;
                 default: {
-                    throw format("unrecognized tensor type %u\n", shard.type);
+                    throw std::runtime_error(format("unrecognized tensor type %u\n", shard.type));
                 }
             }

@@ -577,6 +611,11 @@ struct llama_file_saver {
             case GGML_TYPE_Q5_0:
             case GGML_TYPE_Q5_1:
             case GGML_TYPE_Q8_0:
+            case GGML_TYPE_Q2_K:
+            case GGML_TYPE_Q3_K:
+            case GGML_TYPE_Q4_K:
+            case GGML_TYPE_Q5_K:
+            case GGML_TYPE_Q6_K:
                 break;
             default: LLAMA_ASSERT(false);
         }
@@ -608,7 +647,7 @@ struct llama_model_loader {
             auto * ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
             file_loaders.emplace_back(ith_file);
             if (ith_file->hparams != first_file->hparams) {
-                throw format("llama.cpp: hparams inconsistent between files");
+                throw std::runtime_error(format("llama.cpp: hparams inconsistent between files"));
             }
         }
         if (!llama_mmap::SUPPORTED) {
@@ -638,7 +677,7 @@ struct llama_model_loader {
     uint32_t guess_n_parts() const {
         auto it = tensors_map.name_to_idx.find("tok_embeddings.weight");
         if (it == tensors_map.name_to_idx.end()) {
-            throw std::string("missing tok_embeddings.weight");
+            throw std::runtime_error(std::string("missing tok_embeddings.weight"));
         }
         const llama_load_tensor & lt = tensors_map.tensors.at(it->second);
         return file_loaders.at(0)->hparams.n_embd / lt.shards.at(0).ne.at(0);
@@ -655,12 +694,12 @@ struct llama_model_loader {
     struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
         auto it = tensors_map.name_to_idx.find(name);
         if (it == tensors_map.name_to_idx.end()) {
-            throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
+            throw std::runtime_error(std::runtime_error(format("llama.cpp: tensor '%s' is missing from model", name.c_str())));
         }
         llama_load_tensor & lt = tensors_map.tensors.at(it->second);
         if (lt.ne != ne) {
-            throw format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
-                         name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
+            throw std::runtime_error(format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
+                                            name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str()));
         }

         return get_tensor_for(lt, backend);
@@ -676,6 +715,7 @@ struct llama_model_loader {
         }
         ggml_set_name(tensor, lt.name.c_str());
         LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
+
         tensor->backend = backend;
         lt.ggml_tensor = tensor;
         num_ggml_tensors_created++;
@@ -684,7 +724,7 @@ struct llama_model_loader {

     void done_getting_tensors() const {
         if (num_ggml_tensors_created != tensors_map.tensors.size()) {
-            throw std::string("llama.cpp: file contained more tensors than expected");
+            throw std::runtime_error(std::string("llama.cpp: file contained more tensors than expected"));
         }
     }

@@ -828,7 +868,10 @@ static bool kv_cache_init(
 struct llama_context_params llama_context_default_params() {
     struct llama_context_params result = {
         /*.n_ctx =*/ 512,
+        /*.n_batch =*/ 512,
         /*.gpu_layers =*/ 0,
+        /*.main_gpu =*/ 0,
+        /*.tensor_split =*/ {0},
         /*.seed =*/ -1,
         /*.f16_kv =*/ true,
         /*.logits_all =*/ false,
@@ -843,6 +886,17 @@ struct llama_context_params llama_context_default_params() {
     return result;
 }

+struct llama_model_quantize_params llama_model_quantize_default_params() {
+    struct llama_model_quantize_params result = {
+        /*.nthread =*/ 0,
+        /*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
+        /*.allow_requantize =*/ false,
+        /*.quantize_output_tensor =*/ true,
+    };
+
+    return result;
+}
+
 bool llama_mmap_supported() {
     return llama_mmap::SUPPORTED;
 }
@@ -893,12 +947,23 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
         case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
         case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
+        // K-quants
+        case LLAMA_FTYPE_MOSTLY_Q2_K: return "mostly Q2_K";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "mostly Q3_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "mostly Q3_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "mostly Q3_K - Large";
+        case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "mostly Q4_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "mostly Q4_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "mostly Q5_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "mostly Q5_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q6_K: return "mostly Q6_K";
         default: return "unknown, may not work";
     }
 }

 static const char *llama_model_type_name(e_model type) {
     switch (type) {
+        case MODEL_3B: return "3B";
         case MODEL_7B: return "7B";
         case MODEL_13B: return "13B";
         case MODEL_30B: return "30B";
@@ -911,7 +976,10 @@ static void llama_model_load_internal(
         const std::string & fname,
         llama_context & lctx,
         int n_ctx,
+        int n_batch,
         int n_gpu_layers,
+        int main_gpu,
+        const float * tensor_split,
         ggml_type memory_type,
         bool use_mmap,
         bool use_mlock,
@@ -926,12 +994,13 @@ static void llama_model_load_internal(
     lctx.vocab = std::move(ml->file_loaders.at(0)->vocab);
     auto & model = lctx.model;
     model.hparams = ml->file_loaders.at(0)->hparams;
+    model.n_gpu_layers = n_gpu_layers;
     llama_file_version file_version = ml->file_loaders.at(0)->file_version;
     auto & hparams = model.hparams;
-    uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;

     {
         switch (hparams.n_layer) {
+            case 26: model.type = e_model::MODEL_3B; break;
             case 32: model.type = e_model::MODEL_7B; break;
             case 40: model.type = e_model::MODEL_13B; break;
             case 60: model.type = e_model::MODEL_30B; break;
@@ -941,6 +1010,8 @@ static void llama_model_load_internal(
         hparams.n_ctx = n_ctx;
     }

+    const uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
+
     {
         fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version));
         fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab);
@@ -960,7 +1031,7 @@ static void llama_model_load_internal(
         if (hparams.ftype != LLAMA_FTYPE_ALL_F32 &&
             hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 &&
             hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) {
-            throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)");
+            throw std::runtime_error(format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)"));
         }
     }

@@ -968,7 +1039,7 @@ static void llama_model_load_internal(
         if (hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
             hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_1 ||
             hparams.ftype == LLAMA_FTYPE_MOSTLY_Q8_0) {
-            throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)");
+            throw std::runtime_error(format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)"));
         }
     }

@@ -999,18 +1070,28 @@ static void llama_model_load_internal(

         model.ctx = ggml_init(params);
         if (!model.ctx) {
-            throw format("ggml_init() failed");
+            throw std::runtime_error(format("ggml_init() failed"));
         }
     }

-
-#
+    (void) main_gpu;
+#if defined(GGML_USE_CUBLAS)
+    fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__);
+    ggml_cuda_set_main_device(main_gpu);
+#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
+#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
+#elif defined(GGML_USE_CLBLAST)
+    fprintf(stderr, "%s: using OpenCL for GPU acceleration\n", __func__);
+#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
+#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU
 #else
-#define LLAMA_BACKEND_OFFLOAD
+#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
+#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_CPU
 #endif

     // prepare memory for the weights
-    size_t
+    size_t vram_weights = 0;
+    size_t vram_scratch = 0;
     {
         const uint32_t n_embd = hparams.n_embd;
         const uint32_t n_layer = hparams.n_layer;
@@ -1025,7 +1106,7 @@ static void llama_model_load_internal(
         {
             ggml_backend backend_output;
             if (n_gpu_layers > int(n_layer)) { // NOLINT
-                backend_output =
+                backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
             } else {
                 backend_output = GGML_BACKEND_CPU;
             }
@@ -1037,7 +1118,8 @@ static void llama_model_load_internal(

         model.layers.resize(n_layer);
         for (uint32_t i = 0; i < n_layer; ++i) {
-            const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+            const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+            const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT

             auto & layer = model.layers[i];

@@ -1045,19 +1127,19 @@ static void llama_model_load_internal(

             layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);

-            layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd},
-            layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd},
-            layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd},
-            layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd},
+            layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend_split);
+            layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend_split);
+            layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend_split);
+            layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend_split);

             layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);

-            layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff},
-            layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd},
-            layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff},
+            layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend_split);
+            layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend_split);
+            layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend_split);

-            if (backend ==
-
+            if (backend == GGML_BACKEND_GPU) {
+                vram_weights +=
                     ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
                     ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.attention_norm) +
                     ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
@@ -1074,10 +1156,10 @@ static void llama_model_load_internal(
         // this is the total memory required to run the inference
         const size_t mem_required =
             ctx_size +
-            mmapped_size -
+            mmapped_size - vram_weights + // weights in VRAM not in memory
             MEM_REQ_SCRATCH0().at(model.type) +
             MEM_REQ_SCRATCH1().at(model.type) +
-            MEM_REQ_EVAL().at(model.type);
+            MEM_REQ_EVAL().at (model.type);

         // this is the memory required by one llama_state
         const size_t mem_required_state =
@@ -1086,15 +1168,25 @@ static void llama_model_load_internal(
         fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
                 mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);

+        (void) vram_scratch;
 #ifdef GGML_USE_CUBLAS
+        vram_scratch = n_batch * MB;
+        ggml_cuda_set_scratch_size(vram_scratch);
+        if (n_gpu_layers > 0) {
+            fprintf(stderr, "%s: allocating batch_size x 1 MB = %ld MB VRAM for the scratch buffer\n",
+                    __func__, vram_scratch / MB);
+        }
+#endif // GGML_USE_CUBLAS
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
         const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));

-        fprintf(stderr, "%s:
+        fprintf(stderr, "%s: offloading %d layers to GPU\n", __func__, n_gpu);
         if (n_gpu_layers > (int) hparams.n_layer) {
-            fprintf(stderr, "%s:
+            fprintf(stderr, "%s: offloading output layer to GPU\n", __func__);
         }
-        fprintf(stderr, "%s:
-
+        fprintf(stderr, "%s: total VRAM used: %zu MB\n",
+                __func__, (vram_weights + vram_scratch + MB - 1) / MB); // round up
+#else
         (void) n_gpu_layers;
 #endif
     }
@@ -1106,8 +1198,10 @@ static void llama_model_load_internal(

     ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);

-#
+#if defined(GGML_USE_CUBLAS)
     {
+        ggml_cuda_set_tensor_split(tensor_split);
+
         size_t done_size = 0;
         size_t data_size = 0;
         for (llama_load_tensor & lt : ml->tensors_map.tensors) {
@@ -1117,7 +1211,8 @@ static void llama_model_load_internal(
             }
         }
         for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-
+            ggml_backend backend = lt.ggml_tensor->backend;
+            if (backend != GGML_BACKEND_GPU && backend != GGML_BACKEND_GPU_SPLIT) {
                 continue;
             }
             if (progress_callback) {
@@ -1129,30 +1224,28 @@ static void llama_model_load_internal(
     }
 #elif defined(GGML_USE_CLBLAST)
     {
-
-
-
-
-
-
-
-            const auto & layer = model.layers[i];
-
-            ggml_cl_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
-            ggml_cl_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
-            ggml_cl_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
-            ggml_cl_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
-            ggml_cl_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
-            ggml_cl_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
-            ggml_cl_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
+        size_t done_size = 0;
+        size_t data_size = 0;
+        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+            data_size += lt.size;
+            if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
+                done_size += lt.size;
+            }
         }
-
-
-
+        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+            if (lt.ggml_tensor->backend != GGML_BACKEND_GPU) {
+                continue;
+            }
+            if (progress_callback) {
+                progress_callback((float) done_size / data_size, progress_callback_user_data);
+            }
+            ggml_cl_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
+            done_size += lt.size;
         }
-
-        fprintf(stderr, "ggml_opencl: total VRAM used: %zu MB\n", vram_total / 1024 / 1024);
     }
+#else
+    (void) n_batch;
+    (void) tensor_split;
 #endif

     if (progress_callback) {
@@ -1170,7 +1263,10 @@ static bool llama_model_load(
         const std::string & fname,
         llama_context & lctx,
         int n_ctx,
+        int n_batch,
         int n_gpu_layers,
+        int main_gpu,
+        float * tensor_split,
         ggml_type memory_type,
         bool use_mmap,
         bool use_mlock,
@@ -1178,28 +1274,30 @@ static bool llama_model_load(
         llama_progress_callback progress_callback,
         void *progress_callback_user_data) {
     try {
-        llama_model_load_internal(fname, lctx, n_ctx, n_gpu_layers,
-                                  vocab_only, progress_callback, progress_callback_user_data);
+        llama_model_load_internal(fname, lctx, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, memory_type,
+                                  use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
         return true;
-    } catch (const std::
-        fprintf(stderr, "error loading model: %s\n", err.
+    } catch (const std::exception & err) {
+        fprintf(stderr, "error loading model: %s\n", err.what());
         return false;
     }
 }

 // evaluate the transformer
 //
-// - lctx:
-// - tokens:
-// - n_past:
-// - n_threads:
+// - lctx: llama context
+// - tokens: new batch of tokens to process
+// - n_past: the context size so far
+// - n_threads: number of threads to use
+// - cgraph_fname: filename of the exported computation graph
 //
 static bool llama_eval_internal(
-        llama_context &
-        const llama_token *
-        const int
-        const int
-        const int
+        llama_context & lctx,
+        const llama_token * tokens,
+        const int n_tokens,
+        const int n_past,
+        const int n_threads,
+        const char * cgraph_fname) {

     // enforce that the first token is BOS
     if (n_past == 0 && tokens[0] != llama_token_bos()) {
@@ -1218,12 +1316,13 @@ static bool llama_eval_internal(

     LLAMA_ASSERT(!!kv_self.ctx);

-    const int n_embd
-    const int n_layer
-    const int n_ctx
-    const int n_head
-    const int n_vocab
-    const int n_rot
+    const int n_embd = hparams.n_embd;
+    const int n_layer = hparams.n_layer;
+    const int n_ctx = hparams.n_ctx;
+    const int n_head = hparams.n_head;
+    const int n_vocab = hparams.n_vocab;
+    const int n_rot = hparams.n_embd/hparams.n_head;
+    const int n_gpu_layers = model.n_gpu_layers;

     auto & mem_per_token = lctx.mem_per_token;
     auto & buf_compute = lctx.buf_compute;
@@ -1245,40 +1344,66 @@ static bool llama_eval_internal(
     ggml_set_name(embd, "embd");
     memcpy(embd->data, tokens, N*ggml_element_size(embd));

+    struct ggml_tensor * cur;
     struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);

+    const int i_gpu_start = n_layer - n_gpu_layers;
+    (void) i_gpu_start;
+
     for (int il = 0; il < n_layer; ++il) {
-
+        offload_func_t offload_func = llama_nop;

-
+#ifdef GGML_USE_CUBLAS
+        if (il >= i_gpu_start) {
+            offload_func = ggml_cuda_assign_buffers; // sets the output backend to GPU
+        }
+#endif // GGML_USE_CUBLAS
+
+        struct ggml_tensor * inpSA = inpL;

         lctx.use_buf(ctx0, 0);

         // norm
         {
             cur = ggml_rms_norm(ctx0, inpL);
+            offload_func(cur);
+            ggml_set_name(cur, "rms_norm_0");

             // cur = cur*attention_norm(broadcasted)
             cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm);
+            offload_func(cur);
+            ggml_set_name(cur, "attention_norm_0");
         }

         // self-attention
         {
             // compute Q and K and RoPE them
-            struct ggml_tensor *
-
-            ggml_set_name(
+            struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+            // offload_func(tmpq);
+            ggml_set_name(tmpq, "tmpq");
+
+            struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+            // offload_func(tmpk);
+            ggml_set_name(tmpk, "tmpk");
+
+            struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0);
             ggml_set_name(Kcur, "Kcur");

+            struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0);
+            ggml_set_name(Qcur, "Qcur");
+
             // store key and value to memory
             {
                 // compute the transposed [N, n_embd] V matrix
                 struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), n_embd, N));
+                ggml_set_name(Vcur, "Vcur");

                 struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
+                ggml_set_name(k, "k");
                 struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
                         ( n_ctx)*ggml_element_size(kv_self.v),
                         (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
+                ggml_set_name(v, "v");

                 // important: storing RoPE-ed version of K in the KV cache!
                 ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
@@ -1319,7 +1444,6 @@ static bool llama_eval_internal(
             struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
             ggml_set_name(KQ_soft_max, "KQ_soft_max");

-
             // split cached V into n_head heads
             struct ggml_tensor * V =
                 ggml_view_3d(ctx0, kv_self.v,
@@ -1354,73 +1478,143 @@ static bool llama_eval_internal(
             cur = ggml_mul_mat(ctx0,
                     model.layers[il].wo,
                     cur);
+            offload_func(cur);
+            ggml_set_name(cur, "result_wo");
         }

         lctx.use_buf(ctx0, 1);
+        //ggml_cuda_set_scratch(1);

         struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
+        offload_func(inpFF);
+        ggml_set_name(inpFF, "inpFF");

         // feed-forward network
         {
             // norm
             {
                 cur = ggml_rms_norm(ctx0, inpFF);
+                offload_func(cur);
+                ggml_set_name(cur, "rms_norm_1");

                 // cur = cur*ffn_norm(broadcasted)
                 cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
+                offload_func(cur);
+                ggml_set_name(cur, "ffn_norm");
             }

             struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
                     model.layers[il].w3,
                     cur);
+            offload_func(tmp);
+            ggml_set_name(tmp, "result_w3");

             cur = ggml_mul_mat(ctx0,
                     model.layers[il].w1,
                     cur);
+            offload_func(cur);
+            ggml_set_name(cur, "result_w2");

             // SILU activation
             cur = ggml_silu(ctx0, cur);
+            offload_func(cur);
+            ggml_set_name(cur, "silu");

             cur = ggml_mul(ctx0, cur, tmp);
+            offload_func(cur);
+            ggml_set_name(cur, "silu_x_result_w3");

             cur = ggml_mul_mat(ctx0,
                     model.layers[il].w2,
                     cur);
+            offload_func(cur);
+            ggml_set_name(cur, "result_w2");
         }

         cur = ggml_add(ctx0, cur, inpFF);
+        offload_func(cur);
+        ggml_set_name(cur, "inpFF_+_result_w2");

         // input for next layer
         inpL = cur;
+
     }

     lctx.use_buf(ctx0, 0);
+    //ggml_cuda_set_scratch(0);

     // used at the end to optionally extract the embeddings
     struct ggml_tensor * embeddings = NULL;

+    offload_func_t offload_func = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+    if (n_gpu_layers > n_layer) {
+        offload_func = ggml_cuda_assign_buffers; // sets the output backend to GPU
+    }
+#endif // GGML_USE_CUBLAS
+
     // norm
     {
+        cur = ggml_rms_norm(ctx0, inpL);
+        offload_func(cur);
+        ggml_set_name(cur, "rms_norm_inpL");

-
+        cur = ggml_rms_norm(ctx0, cur);
+        offload_func(cur);
+        ggml_set_name(cur, "rms_norm_after");

-        //
-
+        // cur = cur*norm(broadcasted)
+        cur = ggml_mul(ctx0, cur, model.norm);
+        offload_func(cur);
+        ggml_set_name(cur, "result_norm");

-        embeddings =
+        embeddings = cur;
     }

+
     // lm_head
-
+    cur = ggml_mul_mat(ctx0, model.output, cur);
+    ggml_set_name(cur, "result_output");

     lctx.use_buf(ctx0, -1);

     // logits -> probs
-    //
+    //cur = ggml_soft_max_inplace(ctx0, cur);

     // run the computation
-    ggml_build_forward_expand(&gf,
-
+    ggml_build_forward_expand(&gf, cur);
+
+#ifdef GGML_USE_METAL
+    if (lctx.ctx_metal && N == 1) {
+        ggml_metal_graph_compute(lctx.ctx_metal, &gf);
+        ggml_metal_get_tensor (lctx.ctx_metal, cur);
+    } else {
+        // IMPORTANT:
+        // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla
+        // ggml_graph_compute(). It uses Apple's Accelerate CBLAS API which takes advantage of the ANE or the AMX
+        // coprocessor.
+        //
+        // When we implement Matrix x Matrix Metal multiplication, we can avoid this branch.
+        // But for now, we have focused only on Matrix x Vector Metal multiplication.
+        //
+        // TODO: avoid these syncs via shared memory (ref #1696)
+        //
+        if (lctx.ctx_metal) {
+            // We need to sync the GPU KV cache with the CPU KV cache
+            ggml_metal_get_tensor(lctx.ctx_metal, kv_self.k);
+            ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
+        }
+
+        ggml_graph_compute(ctx0, &gf);
+    }
+#else
+    ggml_graph_compute(ctx0, &gf);
+#endif
+
+    if (cgraph_fname) {
+        ggml_graph_export(&gf, cgraph_fname);
+    }

 #ifdef GGML_PERF
     // print timing information per ggml operation (for debugging purposes)
@@ -1434,7 +1628,7 @@ static bool llama_eval_internal(
     //}

     //embd_w.resize(n_vocab*N);
-    //memcpy(embd_w.data(), ggml_get_data(
+    //memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N);

     // update kv token count
     lctx.model.kv_self.n = n_past + N;
@@ -1445,11 +1639,11 @@ static bool llama_eval_internal(

         if (lctx.logits_all) {
             logits_out.resize(n_vocab * N);
-            memcpy(logits_out.data(), (float *) ggml_get_data(
+            memcpy(logits_out.data(), (float *) ggml_get_data(cur), sizeof(float)*n_vocab*N);
         } else {
             // return result for just the last token
             logits_out.resize(n_vocab);
-            memcpy(logits_out.data(), (float *) ggml_get_data(
+            memcpy(logits_out.data(), (float *) ggml_get_data(cur) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
         }
     }

@@ -2048,16 +2242,88 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
 // quantization
 //

-static void
+static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llama_buffer & output, const int nelements, const int nthread) {
+    if (output.size < nelements * sizeof(float)) {
+        output.resize(nelements * sizeof(float));
+    }
+    float * f32_output = (float *) output.addr;
+
+    quantize_fns_t qtype;
+    if (ggml_is_quantized(tensor.type)) {
+        qtype = ggml_internal_get_quantize_fn(tensor.type);
+        if (qtype.dequantize_row_q == NULL) {
+            throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor.type)));
+        }
+    } else if (tensor.type != GGML_TYPE_F16) {
+        throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor.type)));
+    }
+
+    if (nthread < 2) {
+        if (tensor.type == GGML_TYPE_F16) {
+            ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor.data, f32_output, nelements);
+        } else if (ggml_is_quantized(tensor.type)) {
+            qtype.dequantize_row_q(tensor.data, f32_output, nelements);
+        } else {
+            LLAMA_ASSERT(false); // unreachable
+        }
+        return;
+    }
+
+    auto block_size = tensor.type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor.type);
+    auto block_size_bytes = ggml_type_size(tensor.type);
+
+    LLAMA_ASSERT(nelements % block_size == 0);
+    auto nblocks = nelements / block_size;
+    auto blocks_per_thread = nblocks / nthread;
+    auto spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count
+
+    std::vector<std::thread> workers;
+    for (auto tnum = 0, in_buff_offs = 0, out_buff_offs = 0; tnum < nthread; tnum++) {
+        auto thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
+        auto thr_elems = thr_blocks * block_size; // number of elements for this thread
+        auto thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread
+
+        auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
+            if (typ == GGML_TYPE_F16) {
+                ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
+            } else {
+                qtype.dequantize_row_q(inbuf, outbuf, nels);
+            }
+        };
+        workers.push_back(std::thread(compute, tensor.type, tensor.data + in_buff_offs, f32_output + out_buff_offs, thr_elems));
+        in_buff_offs += thr_block_bytes;
+        out_buff_offs += thr_elems;
+    }
+    for (auto & worker : workers) {
+        worker.join();
+    }
+
+}
+
+static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
     ggml_type quantized_type;
-
+    llama_ftype ftype = params->ftype;
+    int nthread = params->nthread;
+
+    switch (params->ftype) {
         case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
         case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
         case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
         case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
         case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
-
-
+
+        // K-quants
+        case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
+        case LLAMA_FTYPE_MOSTLY_Q3_K_S:
+        case LLAMA_FTYPE_MOSTLY_Q3_K_M:
+        case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
+        case LLAMA_FTYPE_MOSTLY_Q4_K_S:
+        case LLAMA_FTYPE_MOSTLY_Q4_K_M: quantized_type = GGML_TYPE_Q4_K; break;
+        case LLAMA_FTYPE_MOSTLY_Q5_K_S:
+        case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
+        case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
+        default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
+    }

     if (nthread <= 0) {
         nthread = std::thread::hardware_concurrency();
@@ -2065,7 +2331,21 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

     std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false,
                                                                             /*vocab_only*/ false));
-    llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
+    llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), params->ftype);
+
+    int n_attention_wv = 0;
+    int n_feed_forward_w2 = 0;
+    for (auto& tensor : model_loader->tensors_map.tensors) {
+        if (tensor.name.find("attention.wv.weight") != std::string::npos) {
+            ++n_attention_wv;
+        }
+        else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
+            ++n_feed_forward_w2;
+        }
+    }
+
+    int i_attention_wv = 0;
+    int i_feed_forward_w2 = 0;

     size_t total_size_org = 0;
     size_t total_size_new = 0;
@@ -2093,9 +2373,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         quantize &= (tensor.ne.size() == 2);

         // uncomment this to keep the output layer in FP16
-
-
-
+        if (!params->quantize_output_tensor && tensor.name == "output.weight") {
+            quantize = false;
+        }
+        quantize = quantize && quantized_type != tensor.type;

         enum ggml_type new_type;
         void * new_data;
@@ -2109,20 +2390,43 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             printf("size = %8.3f MB\n", tensor.size/1024.0/1024.0);
         } else {
             new_type = quantized_type;
+            // TODO: temporary disabled until Metal / OpenCL support is available
+            // ref: https://github.com/ggerganov/llama.cpp/issues/1711
+            //if (tensor.name == "output.weight") {
+            //    new_type = GGML_TYPE_Q6_K;
+            //}
+            if (tensor.name.find("attention.wv.weight") != std::string::npos) {
+                if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+                else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
+                        (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8 ||
+                        (i_attention_wv - n_attention_wv/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
+                ++i_attention_wv;
+            }
+            if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
+                if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+                else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
+                        (i_feed_forward_w2 < n_feed_forward_w2/8 || i_feed_forward_w2 >= 7*n_feed_forward_w2/8 ||
+                        (i_feed_forward_w2 - n_feed_forward_w2/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
+                ++i_feed_forward_w2;
+            }
+            if (tensor.name.find("attention.wo.weight") != std::string::npos) {
+                if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+            }
+
             float * f32_data;
             size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
             llama_buffer f32_conv_buf;
+
             if (tensor.type == GGML_TYPE_F32) {
                 f32_data = (float *) tensor.data;
-            } else if (tensor.type
-
-                f32_data = (float *) f32_conv_buf.addr;
-                const auto * f16_data = (const ggml_fp16_t *) tensor.data;
-                for (size_t i = 0; i < nelements; i++) {
-                    f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
-                }
+            } else if (ggml_is_quantized(tensor.type) && !params->allow_requantize) {
+                throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor.type)));
             } else {
-
+                llama_convert_tensor_internal(tensor, f32_conv_buf, nelements, nthread);
+                f32_data = (float *) f32_conv_buf.addr;
             }

             printf("quantizing .. ");
@@ -2176,12 +2480,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             }

             printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
+            int64_t tot_count = 0;
             for (size_t i = 0; i < hist_cur.size(); i++) {
                 hist_all[i] += hist_cur[i];
+                tot_count += hist_cur[i];
             }

-
-
+            if (tot_count > 0) {
+                for (size_t i = 0; i < hist_cur.size(); i++) {
+                    printf("%5.3f ", hist_cur[i] / float(nelements));
+                }
             }
             printf("\n");
         }
@@ -2199,11 +2507,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             sum_all += hist_all[i];
         }

-
-
-
+        if (sum_all > 0) {
+            printf("%s: hist: ", __func__);
+            for (size_t i = 0; i < hist_all.size(); i++) {
+                printf("%5.3f ", hist_all[i] / float(sum_all));
+            }
+            printf("\n");
         }
-        printf("\n");
     }
 }

@@ -2244,9 +2554,9 @@ struct llama_context * llama_init_from_file(

     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;

-    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.
-
-
+    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers,
+                params.main_gpu, params.tensor_split, memory_type, params.use_mmap, params.use_mlock,
+                params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
         fprintf(stderr, "%s: failed to load model\n", __func__);
         llama_free(ctx);
         return nullptr;
@@ -2284,6 +2594,38 @@ struct llama_context * llama_init_from_file(
         ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
     }

+#ifdef GGML_USE_METAL
+    if (params.n_gpu_layers > 0) {
+        // this allocates all Metal resources and memory buffers
+        ctx->ctx_metal = ggml_metal_init();
+
+        void *data_ptr = NULL;
+        size_t data_size = 0;
+        if (params.use_mmap) {
+            data_ptr = ctx->model.mapping->addr;
+            data_size= ctx->model.mapping->size;
+        } else {
+            data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
+            data_size= ggml_get_mem_size(ctx->model.ctx);
+        }
+
+#define LLAMA_METAL_CHECK_BUF(result) \
+    if (!(result)) { \
+        fprintf(stderr, "%s: failed to add buffer\n", __func__); \
+        llama_free(ctx); \
+        return NULL; \
+    }
+
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size));
+
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size));
+#undef LLAMA_METAL_CHECK_BUF
+    }
+#endif
+
     return ctx;
 }

@@ -2294,13 +2636,12 @@ void llama_free(struct llama_context * ctx) {
 int llama_model_quantize(
         const char * fname_inp,
         const char * fname_out,
-
-        int nthread) {
+        const llama_model_quantize_params *params) {
     try {
-        llama_model_quantize_internal(fname_inp, fname_out,
+        llama_model_quantize_internal(fname_inp, fname_out, params);
         return 0;
-    } catch (const std::
-        fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.
+    } catch (const std::exception & err) {
+        fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.what());
         return 1;
     }
 }
@@ -2553,8 +2894,8 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
 int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
     try {
         return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads);
-    } catch (const std::
-        fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.
+    } catch (const std::exception & err) {
+        fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
         return 1;
     }
 }
@@ -2899,7 +3240,7 @@ int llama_eval(
         int n_tokens,
         int n_past,
         int n_threads) {
-    if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads)) {
+    if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads, nullptr)) {
         fprintf(stderr, "%s: failed to eval\n", __func__);
         return 1;
     }
@@ -2914,6 +3255,20 @@ int llama_eval(
     return 0;
 }

+int llama_eval_export(struct llama_context * ctx, const char * fname) {
+    const int n_batch = 1;
+    const int n_ctx = 512 - n_batch;
+
+    const std::vector<llama_token> tmp(n_batch, llama_token_bos());
+
+    if (!llama_eval_internal(*ctx, tmp.data(), tmp.size(), n_ctx, 1, fname)) {
+        fprintf(stderr, "%s: failed to eval\n", __func__);
+        return 1;
+    }
+
+    return 0;
+}
+
 int llama_tokenize(
         struct llama_context * ctx,
         const char * text,
```