llama_cpp 0.1.4 → 0.2.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +29 -0
- data/ext/llama_cpp/extconf.rb +26 -1
- data/ext/llama_cpp/llama_cpp.cpp +210 -13
- data/ext/llama_cpp/src/ggml-cuda.cu +1916 -0
- data/ext/llama_cpp/src/ggml-cuda.h +15 -2
- data/ext/llama_cpp/src/ggml-metal.h +63 -0
- data/ext/llama_cpp/src/ggml-metal.m +783 -0
- data/ext/llama_cpp/src/ggml-metal.metal +1133 -0
- data/ext/llama_cpp/src/ggml-opencl.cpp +235 -39
- data/ext/llama_cpp/src/ggml-opencl.h +4 -0
- data/ext/llama_cpp/src/ggml.c +340 -109
- data/ext/llama_cpp/src/ggml.h +44 -6
- data/ext/llama_cpp/src/k_quants.c +2244 -0
- data/ext/llama_cpp/src/k_quants.h +122 -0
- data/ext/llama_cpp/src/llama-util.h +16 -0
- data/ext/llama_cpp/src/llama.cpp +484 -136
- data/ext/llama_cpp/src/llama.h +39 -8
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +33 -1
- metadata +8 -2
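The largest user-facing API change in the llama.cpp sources vendored below is the new llama_model_quantize_params struct (with the added k-quant file types), which replaces the old (ftype, nthread) arguments of llama_model_quantize. A minimal C++ sketch of how a caller might drive the new entry point — only the llama_* names are taken from the diff below; the surrounding main() and error handling are illustrative:

// Minimal sketch (illustrative): quantize a GGML model file with the new params struct.
// Only the llama_* identifiers appear in the vendored diff; the rest is assumed scaffolding.
#include "llama.h"
#include <cstdio>

int main(int argc, char ** argv) {
    if (argc < 3) {
        fprintf(stderr, "usage: %s <input-f16.bin> <output-quantized.bin>\n", argv[0]);
        return 1;
    }

    // start from the defaults introduced in this release, then override what we need
    llama_model_quantize_params qparams = llama_model_quantize_default_params();
    qparams.ftype   = LLAMA_FTYPE_MOSTLY_Q4_K_M; // one of the new k-quant file types
    qparams.nthread = 4;                         // 0 lets llama.cpp pick hardware_concurrency()

    // returns 0 on success, non-zero on failure (see llama_model_quantize in the diff)
    if (llama_model_quantize(argv[1], argv[2], &qparams) != 0) {
        fprintf(stderr, "quantization failed\n");
        return 1;
    }

    return 0;
}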
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -16,6 +16,10 @@
 #include "ggml-opencl.h"
 #endif
 
+#ifdef GGML_USE_METAL
+#include "ggml-metal.h"
+#endif
+
 #include <array>
 #include <ctime>
 #include <cinttypes>
@@ -49,17 +53,22 @@ enum e_model {
     MODEL_65B,
 };
 
-
 static const size_t MB = 1024*1024;
 
 // computed for n_ctx == 2048
 // TODO: dynamically determine these sizes
 // needs modifications in ggml
 
+typedef void (*offload_func_t)(struct ggml_tensor * tensor);
+
+void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
+    (void) tensor;
+}
+
 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
 {
     static std::map<e_model, size_t> k_sizes = {
-        { MODEL_3B,
+        { MODEL_3B,    256ull * MB },
         { MODEL_7B,    512ull * MB },
         { MODEL_13B,   512ull * MB },
         { MODEL_30B,   512ull * MB },
@@ -71,7 +80,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
 {
     static std::map<e_model, size_t> k_sizes = {
-        { MODEL_3B,
+        { MODEL_3B,    256ull * MB },
         { MODEL_7B,    512ull * MB },
         { MODEL_13B,   512ull * MB },
         { MODEL_30B,   512ull * MB },
@@ -170,6 +179,7 @@ struct llama_model {
     struct ggml_tensor * output;
 
     std::vector<llama_layer> layers;
+    int n_gpu_layers;
 
     // context
     struct ggml_context * ctx = NULL;
@@ -195,6 +205,16 @@ struct llama_model {
         if (ctx) {
             ggml_free(ctx);
         }
+
+#ifdef GGML_USE_CUBLAS
+        for (size_t i = 0; i < tensors_by_name.size(); ++i) {
+            ggml_cuda_free_data(tensors_by_name[i].second);
+        }
+#elif defined(GGML_USE_CLBLAST)
+        for (size_t i = 0; i < tensors_by_name.size(); ++i) {
+            ggml_cl_free_data(tensors_by_name[i].second);
+        }
+#endif
     }
 };
 
@@ -243,6 +263,10 @@ struct llama_context {
     llama_ctx_buffer buf_compute;
     llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
 
+#ifdef GGML_USE_METAL
+    ggml_metal_context * ctx_metal = NULL;
+#endif
+
     int buf_last = 0;
     size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
 
@@ -282,15 +306,15 @@ template <typename T>
 static T checked_mul(T a, T b) {
     T ret = a * b;
     if (a != 0 && ret / a != b) {
-        throw format("overflow multiplying %llu * %llu",
-                     (unsigned long long) a, (unsigned long long) b);
+        throw std::runtime_error(format("overflow multiplying %llu * %llu",
+                                        (unsigned long long) a, (unsigned long long) b));
     }
     return ret;
 }
 
 static size_t checked_div(size_t a, size_t b) {
     if (b == 0 || a % b != 0) {
-        throw format("error dividing %zu / %zu", a, b);
+        throw std::runtime_error(format("error dividing %zu / %zu", a, b));
     }
     return a / b;
 }
@@ -354,7 +378,7 @@ struct llama_load_tensor {
         const auto & first_shard = shards.at(0);
         for (const auto & shard : shards) {
             if (shard.type != first_shard.type) {
-                throw format("inconsistent tensor shard type in '%s'", name.c_str());
+                throw std::runtime_error(format("inconsistent tensor shard type in '%s'", name.c_str()));
             }
         }
         type = first_shard.type;
@@ -377,8 +401,8 @@ struct llama_load_tensor {
         const auto & first_shard = shards.at(0);
         for (const auto & shard : shards) {
             if (shard.ne != first_shard.ne) {
-                throw format("inconsistent tensor shard shape in '%s': first was %s, other was %s",
-                             name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str());
+                throw std::runtime_error(format("inconsistent tensor shard shape in '%s': first was %s, other was %s",
+                                                name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str()));
             }
         }
         ne = first_shard.ne;
@@ -456,8 +480,8 @@ struct llama_file_loader {
             }
         }
 
-        throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
-                     magic, version);
+        throw std::runtime_error(format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
+                                        magic, version));
     }
     void read_hparams() {
         hparams.n_vocab = file.read_u32();
@@ -497,7 +521,7 @@ struct llama_file_loader {
             file.read_raw(shard.ne.data(), sizeof(shard.ne[0]) * n_dims);
             std::string name = file.read_string(name_len);
             if (n_dims < 1 || n_dims > 2) {
-                throw format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims);
+                throw std::runtime_error(format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims));
             }
             switch (shard.type) {
                 case GGML_TYPE_F32:
@@ -507,9 +531,14 @@ struct llama_file_loader {
                 case GGML_TYPE_Q5_0:
                 case GGML_TYPE_Q5_1:
                 case GGML_TYPE_Q8_0:
+                case GGML_TYPE_Q2_K:
+                case GGML_TYPE_Q3_K:
+                case GGML_TYPE_Q4_K:
+                case GGML_TYPE_Q5_K:
+                case GGML_TYPE_Q6_K:
                     break;
                 default: {
-                    throw format("unrecognized tensor type %u\n", shard.type);
+                    throw std::runtime_error(format("unrecognized tensor type %u\n", shard.type));
                 }
             }
 
@@ -582,6 +611,11 @@ struct llama_file_saver {
             case GGML_TYPE_Q5_0:
             case GGML_TYPE_Q5_1:
             case GGML_TYPE_Q8_0:
+            case GGML_TYPE_Q2_K:
+            case GGML_TYPE_Q3_K:
+            case GGML_TYPE_Q4_K:
+            case GGML_TYPE_Q5_K:
+            case GGML_TYPE_Q6_K:
                 break;
             default: LLAMA_ASSERT(false);
         }
@@ -613,7 +647,7 @@ struct llama_model_loader {
             auto * ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
             file_loaders.emplace_back(ith_file);
             if (ith_file->hparams != first_file->hparams) {
-                throw format("llama.cpp: hparams inconsistent between files");
+                throw std::runtime_error(format("llama.cpp: hparams inconsistent between files"));
             }
         }
         if (!llama_mmap::SUPPORTED) {
@@ -643,7 +677,7 @@ struct llama_model_loader {
     uint32_t guess_n_parts() const {
         auto it = tensors_map.name_to_idx.find("tok_embeddings.weight");
         if (it == tensors_map.name_to_idx.end()) {
-            throw std::string("missing tok_embeddings.weight");
+            throw std::runtime_error(std::string("missing tok_embeddings.weight"));
         }
         const llama_load_tensor & lt = tensors_map.tensors.at(it->second);
         return file_loaders.at(0)->hparams.n_embd / lt.shards.at(0).ne.at(0);
@@ -660,12 +694,12 @@ struct llama_model_loader {
     struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
         auto it = tensors_map.name_to_idx.find(name);
         if (it == tensors_map.name_to_idx.end()) {
-            throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
+            throw std::runtime_error(std::runtime_error(format("llama.cpp: tensor '%s' is missing from model", name.c_str())));
         }
         llama_load_tensor & lt = tensors_map.tensors.at(it->second);
         if (lt.ne != ne) {
-            throw format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
-                         name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
+            throw std::runtime_error(format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
+                                            name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str()));
         }
 
         return get_tensor_for(lt, backend);
@@ -681,6 +715,7 @@ struct llama_model_loader {
         }
         ggml_set_name(tensor, lt.name.c_str());
         LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
+
         tensor->backend = backend;
         lt.ggml_tensor = tensor;
         num_ggml_tensors_created++;
@@ -689,7 +724,7 @@ struct llama_model_loader {
 
     void done_getting_tensors() const {
         if (num_ggml_tensors_created != tensors_map.tensors.size()) {
-            throw std::string("llama.cpp: file contained more tensors than expected");
+            throw std::runtime_error(std::string("llama.cpp: file contained more tensors than expected"));
         }
     }
 
@@ -833,7 +868,10 @@ static bool kv_cache_init(
 struct llama_context_params llama_context_default_params() {
     struct llama_context_params result = {
         /*.n_ctx        =*/ 512,
+        /*.n_batch      =*/ 512,
         /*.gpu_layers   =*/ 0,
+        /*.main_gpu     =*/ 0,
+        /*.tensor_split =*/ {0},
         /*.seed         =*/ -1,
         /*.f16_kv       =*/ true,
         /*.logits_all   =*/ false,
@@ -848,6 +886,17 @@ struct llama_context_params llama_context_default_params() {
     return result;
 }
 
+struct llama_model_quantize_params llama_model_quantize_default_params() {
+    struct llama_model_quantize_params result = {
+        /*.nthread                =*/ 0,
+        /*.ftype                  =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
+        /*.allow_requantize       =*/ false,
+        /*.quantize_output_tensor =*/ true,
+    };
+
+    return result;
+}
+
 bool llama_mmap_supported() {
     return llama_mmap::SUPPORTED;
 }
@@ -898,6 +947,16 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
         case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
         case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
+        // K-quants
+        case LLAMA_FTYPE_MOSTLY_Q2_K:   return "mostly Q2_K";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "mostly Q3_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "mostly Q3_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "mostly Q3_K - Large";
+        case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "mostly Q4_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "mostly Q4_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "mostly Q5_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "mostly Q5_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q6_K:   return "mostly Q6_K";
         default: return "unknown, may not work";
     }
 }
@@ -917,7 +976,10 @@ static void llama_model_load_internal(
         const std::string & fname,
         llama_context & lctx,
         int n_ctx,
+        int n_batch,
         int n_gpu_layers,
+        int main_gpu,
+        const float * tensor_split,
         ggml_type memory_type,
         bool use_mmap,
         bool use_mlock,
@@ -932,9 +994,9 @@ static void llama_model_load_internal(
     lctx.vocab = std::move(ml->file_loaders.at(0)->vocab);
     auto & model = lctx.model;
     model.hparams = ml->file_loaders.at(0)->hparams;
+    model.n_gpu_layers = n_gpu_layers;
     llama_file_version file_version = ml->file_loaders.at(0)->file_version;
     auto & hparams = model.hparams;
-    uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
 
     {
         switch (hparams.n_layer) {
@@ -948,6 +1010,8 @@ static void llama_model_load_internal(
         hparams.n_ctx = n_ctx;
     }
 
+    const uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
+
     {
         fprintf(stderr, "%s: format  = %s\n", __func__, llama_file_version_name(file_version));
         fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab);
@@ -967,7 +1031,7 @@ static void llama_model_load_internal(
         if (hparams.ftype != LLAMA_FTYPE_ALL_F32 &&
             hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 &&
             hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) {
-            throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)");
+            throw std::runtime_error(format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)"));
         }
     }
 
@@ -975,7 +1039,7 @@ static void llama_model_load_internal(
         if (hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
             hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_1 ||
             hparams.ftype == LLAMA_FTYPE_MOSTLY_Q8_0) {
-            throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)");
+            throw std::runtime_error(format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)"));
         }
     }
 
@@ -1006,18 +1070,28 @@ static void llama_model_load_internal(
 
         model.ctx = ggml_init(params);
         if (!model.ctx) {
-            throw format("ggml_init() failed");
+            throw std::runtime_error(format("ggml_init() failed"));
        }
     }
 
-
-#
+    (void) main_gpu;
+#if defined(GGML_USE_CUBLAS)
+    fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__);
+    ggml_cuda_set_main_device(main_gpu);
+#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
+#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
+#elif defined(GGML_USE_CLBLAST)
+    fprintf(stderr, "%s: using OpenCL for GPU acceleration\n", __func__);
+#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
+#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU
 #else
-#define LLAMA_BACKEND_OFFLOAD
+#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
+#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_CPU
 #endif
 
     // prepare memory for the weights
-    size_t
+    size_t vram_weights = 0;
+    size_t vram_scratch = 0;
     {
         const uint32_t n_embd  = hparams.n_embd;
         const uint32_t n_layer = hparams.n_layer;
@@ -1032,7 +1106,7 @@ static void llama_model_load_internal(
         {
             ggml_backend backend_output;
             if (n_gpu_layers > int(n_layer)) { // NOLINT
-                backend_output =
+                backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
             } else {
                 backend_output = GGML_BACKEND_CPU;
             }
@@ -1044,7 +1118,8 @@ static void llama_model_load_internal(
 
         model.layers.resize(n_layer);
         for (uint32_t i = 0; i < n_layer; ++i) {
-            const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+            const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+            const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
 
             auto & layer = model.layers[i];
 
@@ -1052,19 +1127,19 @@ static void llama_model_load_internal(
 
             layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);
 
-            layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd},
-            layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd},
-            layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd},
-            layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd},
+            layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend_split);
+            layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend_split);
+            layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend_split);
+            layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend_split);
 
             layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);
 
-            layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff},
-            layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd},
-            layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff},
+            layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend_split);
+            layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend_split);
+            layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend_split);
 
-            if (backend ==
-
+            if (backend == GGML_BACKEND_GPU) {
+                vram_weights +=
                     ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
                     ggml_nbytes(layer.wv)             + ggml_nbytes(layer.wo) + ggml_nbytes(layer.attention_norm) +
                     ggml_nbytes(layer.w1)             + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
@@ -1081,10 +1156,10 @@ static void llama_model_load_internal(
         // this is the total memory required to run the inference
         const size_t mem_required =
             ctx_size +
-            mmapped_size -
+            mmapped_size - vram_weights + // weights in VRAM not in memory
             MEM_REQ_SCRATCH0().at(model.type) +
             MEM_REQ_SCRATCH1().at(model.type) +
-            MEM_REQ_EVAL().at(model.type);
+            MEM_REQ_EVAL().at    (model.type);
 
         // this is the memory required by one llama_state
         const size_t mem_required_state =
@@ -1093,15 +1168,25 @@ static void llama_model_load_internal(
         fprintf(stderr, "%s: mem required  = %7.2f MB (+ %7.2f MB per state)\n", __func__,
                 mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
 
+    (void) vram_scratch;
 #ifdef GGML_USE_CUBLAS
+    vram_scratch = n_batch * MB;
+    ggml_cuda_set_scratch_size(vram_scratch);
+    if (n_gpu_layers > 0) {
+        fprintf(stderr, "%s: allocating batch_size x 1 MB = %ld MB VRAM for the scratch buffer\n",
+                __func__, vram_scratch / MB);
+    }
+#endif // GGML_USE_CUBLAS
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
     const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
 
-    fprintf(stderr, "%s:
+    fprintf(stderr, "%s: offloading %d layers to GPU\n", __func__, n_gpu);
     if (n_gpu_layers > (int) hparams.n_layer) {
-        fprintf(stderr, "%s:
+        fprintf(stderr, "%s: offloading output layer to GPU\n", __func__);
     }
-    fprintf(stderr, "%s:
-
+    fprintf(stderr, "%s: total VRAM used: %zu MB\n",
+            __func__, (vram_weights + vram_scratch + MB - 1) / MB); // round up
+#else
     (void) n_gpu_layers;
 #endif
     }
@@ -1113,8 +1198,10 @@ static void llama_model_load_internal(
 
     ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
 
-#
+#if defined(GGML_USE_CUBLAS)
     {
+        ggml_cuda_set_tensor_split(tensor_split);
+
         size_t done_size = 0;
         size_t data_size = 0;
         for (llama_load_tensor & lt : ml->tensors_map.tensors) {
@@ -1124,7 +1211,8 @@ static void llama_model_load_internal(
             }
         }
         for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-
+            ggml_backend backend = lt.ggml_tensor->backend;
+            if (backend != GGML_BACKEND_GPU && backend != GGML_BACKEND_GPU_SPLIT) {
                 continue;
             }
             if (progress_callback) {
@@ -1136,30 +1224,28 @@ static void llama_model_load_internal(
         }
 #elif defined(GGML_USE_CLBLAST)
     {
-
-
-
-
-
-
-
-            const auto & layer = model.layers[i];
-
-            ggml_cl_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
-            ggml_cl_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
-            ggml_cl_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
-            ggml_cl_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
-            ggml_cl_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
-            ggml_cl_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
-            ggml_cl_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
+        size_t done_size = 0;
+        size_t data_size = 0;
+        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+            data_size += lt.size;
+            if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
+                done_size += lt.size;
+            }
         }
-
-
-
+        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+            if (lt.ggml_tensor->backend != GGML_BACKEND_GPU) {
+                continue;
+            }
+            if (progress_callback) {
+                progress_callback((float) done_size / data_size, progress_callback_user_data);
+            }
+            ggml_cl_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
+            done_size += lt.size;
         }
-
-        fprintf(stderr, "ggml_opencl: total VRAM used: %zu MB\n", vram_total / 1024 / 1024);
     }
+#else
+    (void) n_batch;
+    (void) tensor_split;
 #endif
 
     if (progress_callback) {
@@ -1177,7 +1263,10 @@ static bool llama_model_load(
         const std::string & fname,
         llama_context & lctx,
         int n_ctx,
+        int n_batch,
         int n_gpu_layers,
+        int main_gpu,
+        float * tensor_split,
         ggml_type memory_type,
         bool use_mmap,
         bool use_mlock,
@@ -1185,28 +1274,30 @@ static bool llama_model_load(
         llama_progress_callback progress_callback,
         void *progress_callback_user_data) {
     try {
-        llama_model_load_internal(fname, lctx, n_ctx, n_gpu_layers,
-                                  vocab_only, progress_callback, progress_callback_user_data);
+        llama_model_load_internal(fname, lctx, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, memory_type,
+                                  use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
         return true;
-    } catch (const std::
-        fprintf(stderr, "error loading model: %s\n", err.
+    } catch (const std::exception & err) {
+        fprintf(stderr, "error loading model: %s\n", err.what());
         return false;
     }
 }
 
 // evaluate the transformer
 //
-//   - lctx:
-//   - tokens:
-//   - n_past:
-//   - n_threads:
+//   - lctx:         llama context
+//   - tokens:       new batch of tokens to process
+//   - n_past:       the context size so far
+//   - n_threads:    number of threads to use
+//   - cgraph_fname: filename of the exported computation graph
 //
 static bool llama_eval_internal(
-        llama_context &
-        const llama_token *
-        const int
-        const int
-        const int
+        llama_context &     lctx,
+        const llama_token * tokens,
+        const int           n_tokens,
+        const int           n_past,
+        const int           n_threads,
+        const char *        cgraph_fname) {
 
     // enforce that the first token is BOS
     if (n_past == 0 && tokens[0] != llama_token_bos()) {
@@ -1225,12 +1316,13 @@ static bool llama_eval_internal(
 
     LLAMA_ASSERT(!!kv_self.ctx);
 
-    const int n_embd
-    const int n_layer
-    const int n_ctx
-    const int n_head
-    const int n_vocab
-    const int n_rot
+    const int n_embd       = hparams.n_embd;
+    const int n_layer      = hparams.n_layer;
+    const int n_ctx        = hparams.n_ctx;
+    const int n_head       = hparams.n_head;
+    const int n_vocab      = hparams.n_vocab;
+    const int n_rot        = hparams.n_embd/hparams.n_head;
+    const int n_gpu_layers = model.n_gpu_layers;
 
     auto & mem_per_token = lctx.mem_per_token;
     auto & buf_compute   = lctx.buf_compute;
@@ -1252,40 +1344,66 @@ static bool llama_eval_internal(
     ggml_set_name(embd, "embd");
     memcpy(embd->data, tokens, N*ggml_element_size(embd));
 
+    struct ggml_tensor * cur;
     struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
 
+    const int i_gpu_start = n_layer - n_gpu_layers;
+    (void) i_gpu_start;
+
     for (int il = 0; il < n_layer; ++il) {
-
+        offload_func_t offload_func = llama_nop;
 
-
+#ifdef GGML_USE_CUBLAS
+        if (il >= i_gpu_start) {
+            offload_func = ggml_cuda_assign_buffers; // sets the output backend to GPU
+        }
+#endif // GGML_USE_CUBLAS
+
+        struct ggml_tensor * inpSA = inpL;
 
         lctx.use_buf(ctx0, 0);
 
         // norm
         {
             cur = ggml_rms_norm(ctx0, inpL);
+            offload_func(cur);
+            ggml_set_name(cur, "rms_norm_0");
 
             // cur = cur*attention_norm(broadcasted)
             cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm);
+            offload_func(cur);
+            ggml_set_name(cur, "attention_norm_0");
         }
 
         // self-attention
        {
             // compute Q and K and RoPE them
-            struct ggml_tensor *
-
-            ggml_set_name(
+            struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+            // offload_func(tmpq);
+            ggml_set_name(tmpq, "tmpq");
+
+            struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+            // offload_func(tmpk);
+            ggml_set_name(tmpk, "tmpk");
+
+            struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0);
             ggml_set_name(Kcur, "Kcur");
 
+            struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0);
+            ggml_set_name(Qcur, "Qcur");
+
             // store key and value to memory
             {
                 // compute the transposed [N, n_embd] V matrix
                 struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), n_embd, N));
+                ggml_set_name(Vcur, "Vcur");
 
                 struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
+                ggml_set_name(k, "k");
                 struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
                         (   n_ctx)*ggml_element_size(kv_self.v),
                         (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
+                ggml_set_name(v, "v");
 
                 // important: storing RoPE-ed version of K in the KV cache!
                 ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
@@ -1326,7 +1444,6 @@ static bool llama_eval_internal(
             struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
             ggml_set_name(KQ_soft_max, "KQ_soft_max");
 
-
             // split cached V into n_head heads
             struct ggml_tensor * V =
                 ggml_view_3d(ctx0, kv_self.v,
@@ -1361,73 +1478,143 @@ static bool llama_eval_internal(
             cur = ggml_mul_mat(ctx0,
                     model.layers[il].wo,
                     cur);
+            offload_func(cur);
+            ggml_set_name(cur, "result_wo");
         }
 
         lctx.use_buf(ctx0, 1);
+        //ggml_cuda_set_scratch(1);
 
         struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
+        offload_func(inpFF);
+        ggml_set_name(inpFF, "inpFF");
 
         // feed-forward network
         {
             // norm
             {
                 cur = ggml_rms_norm(ctx0, inpFF);
+                offload_func(cur);
+                ggml_set_name(cur, "rms_norm_1");
 
                 // cur = cur*ffn_norm(broadcasted)
                 cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
+                offload_func(cur);
+                ggml_set_name(cur, "ffn_norm");
             }
 
             struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
                     model.layers[il].w3,
                     cur);
+            offload_func(tmp);
+            ggml_set_name(tmp, "result_w3");
 
             cur = ggml_mul_mat(ctx0,
                     model.layers[il].w1,
                     cur);
+            offload_func(cur);
+            ggml_set_name(cur, "result_w2");
 
             // SILU activation
             cur = ggml_silu(ctx0, cur);
+            offload_func(cur);
+            ggml_set_name(cur, "silu");
 
             cur = ggml_mul(ctx0, cur, tmp);
+            offload_func(cur);
+            ggml_set_name(cur, "silu_x_result_w3");
 
             cur = ggml_mul_mat(ctx0,
                     model.layers[il].w2,
                     cur);
+            offload_func(cur);
+            ggml_set_name(cur, "result_w2");
         }
 
         cur = ggml_add(ctx0, cur, inpFF);
+        offload_func(cur);
+        ggml_set_name(cur, "inpFF_+_result_w2");
 
         // input for next layer
         inpL = cur;
+
     }
 
     lctx.use_buf(ctx0, 0);
+    //ggml_cuda_set_scratch(0);
 
     // used at the end to optionally extract the embeddings
     struct ggml_tensor * embeddings = NULL;
 
+    offload_func_t offload_func = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+    if (n_gpu_layers > n_layer) {
+        offload_func = ggml_cuda_assign_buffers; // sets the output backend to GPU
+    }
+#endif // GGML_USE_CUBLAS
+
     // norm
     {
+        cur = ggml_rms_norm(ctx0, inpL);
+        offload_func(cur);
+        ggml_set_name(cur, "rms_norm_inpL");
 
-
+        cur = ggml_rms_norm(ctx0, cur);
+        offload_func(cur);
+        ggml_set_name(cur, "rms_norm_after");
 
-        //
-
+        // cur = cur*norm(broadcasted)
+        cur = ggml_mul(ctx0, cur, model.norm);
+        offload_func(cur);
+        ggml_set_name(cur, "result_norm");
 
-        embeddings =
+        embeddings = cur;
     }
 
+
     // lm_head
-
+    cur = ggml_mul_mat(ctx0, model.output, cur);
+    ggml_set_name(cur, "result_output");
 
     lctx.use_buf(ctx0, -1);
 
     // logits -> probs
-    //
+    //cur = ggml_soft_max_inplace(ctx0, cur);
 
     // run the computation
-    ggml_build_forward_expand(&gf,
-
+    ggml_build_forward_expand(&gf, cur);
+
+#ifdef GGML_USE_METAL
+    if (lctx.ctx_metal && N == 1) {
+        ggml_metal_graph_compute(lctx.ctx_metal, &gf);
+        ggml_metal_get_tensor   (lctx.ctx_metal, cur);
+    } else {
+        // IMPORTANT:
+        // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla
+        // ggml_graph_compute(). It uses Apple's Accelerate CBLAS API which takes advantage of the ANE or the AMX
+        // coprocessor.
+        //
+        // When we implement Matrix x Matrix Metal multiplication, we can avoid this branch.
+        // But for now, we have focused only on Matrix x Vector Metal multiplication.
+        //
+        // TODO: avoid these syncs via shared memory (ref #1696)
+        //
+        if (lctx.ctx_metal) {
+            // We need to sync the GPU KV cache with the CPU KV cache
+            ggml_metal_get_tensor(lctx.ctx_metal, kv_self.k);
+            ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
+        }
+
+        ggml_graph_compute(ctx0, &gf);
+    }
+#else
+    ggml_graph_compute(ctx0, &gf);
+#endif
+
+    if (cgraph_fname) {
+        ggml_graph_export(&gf, cgraph_fname);
+    }
 
 #ifdef GGML_PERF
     // print timing information per ggml operation (for debugging purposes)
@@ -1441,7 +1628,7 @@ static bool llama_eval_internal(
     //}
 
     //embd_w.resize(n_vocab*N);
-    //memcpy(embd_w.data(), ggml_get_data(
+    //memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N);
 
     // update kv token count
     lctx.model.kv_self.n = n_past + N;
@@ -1452,11 +1639,11 @@ static bool llama_eval_internal(
 
         if (lctx.logits_all) {
             logits_out.resize(n_vocab * N);
-            memcpy(logits_out.data(), (float *) ggml_get_data(
+            memcpy(logits_out.data(), (float *) ggml_get_data(cur), sizeof(float)*n_vocab*N);
         } else {
             // return result for just the last token
             logits_out.resize(n_vocab);
-            memcpy(logits_out.data(), (float *) ggml_get_data(
+            memcpy(logits_out.data(), (float *) ggml_get_data(cur) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
         }
     }
 
@@ -2055,16 +2242,88 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array
 // quantization
 //
 
-static void
+static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llama_buffer & output, const int nelements, const int nthread) {
+    if (output.size < nelements * sizeof(float)) {
+        output.resize(nelements * sizeof(float));
+    }
+    float * f32_output = (float *) output.addr;
+
+    quantize_fns_t qtype;
+    if (ggml_is_quantized(tensor.type)) {
+        qtype = ggml_internal_get_quantize_fn(tensor.type);
+        if (qtype.dequantize_row_q == NULL) {
+            throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor.type)));
+        }
+    } else if (tensor.type != GGML_TYPE_F16) {
+        throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor.type)));
+    }
+
+    if (nthread < 2) {
+        if (tensor.type == GGML_TYPE_F16) {
+            ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor.data, f32_output, nelements);
+        } else if (ggml_is_quantized(tensor.type)) {
+            qtype.dequantize_row_q(tensor.data, f32_output, nelements);
+        } else {
+            LLAMA_ASSERT(false); // unreachable
+        }
+        return;
+    }
+
+    auto block_size = tensor.type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor.type);
+    auto block_size_bytes = ggml_type_size(tensor.type);
+
+    LLAMA_ASSERT(nelements % block_size == 0);
+    auto nblocks = nelements / block_size;
+    auto blocks_per_thread = nblocks / nthread;
+    auto spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count
+
+    std::vector<std::thread> workers;
+    for (auto tnum = 0, in_buff_offs = 0, out_buff_offs = 0; tnum < nthread; tnum++) {
+        auto thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
+        auto thr_elems = thr_blocks * block_size; // number of elements for this thread
+        auto thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread
+
+        auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
+            if (typ == GGML_TYPE_F16) {
+                ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
+            } else {
+                qtype.dequantize_row_q(inbuf, outbuf, nels);
+            }
+        };
+        workers.push_back(std::thread(compute, tensor.type, tensor.data + in_buff_offs, f32_output + out_buff_offs, thr_elems));
+        in_buff_offs += thr_block_bytes;
+        out_buff_offs += thr_elems;
+    }
+    for (auto & worker : workers) {
+        worker.join();
+    }
+
+}
+
+static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
     ggml_type quantized_type;
-
+    llama_ftype ftype = params->ftype;
+    int nthread = params->nthread;
+
+    switch (params->ftype) {
         case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
         case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
         case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
         case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
         case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
-
-
+
+        // K-quants
+        case LLAMA_FTYPE_MOSTLY_Q2_K:   quantized_type = GGML_TYPE_Q2_K; break;
+        case LLAMA_FTYPE_MOSTLY_Q3_K_S:
+        case LLAMA_FTYPE_MOSTLY_Q3_K_M:
+        case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
+        case LLAMA_FTYPE_MOSTLY_Q4_K_S:
+        case LLAMA_FTYPE_MOSTLY_Q4_K_M: quantized_type = GGML_TYPE_Q4_K; break;
+        case LLAMA_FTYPE_MOSTLY_Q5_K_S:
+        case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
+        case LLAMA_FTYPE_MOSTLY_Q6_K:   quantized_type = GGML_TYPE_Q6_K; break;
+        default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
+    }
 
     if (nthread <= 0) {
         nthread = std::thread::hardware_concurrency();
@@ -2072,7 +2331,21 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
     std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false,
                                                                             /*vocab_only*/ false));
-    llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
+    llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), params->ftype);
+
+    int n_attention_wv    = 0;
+    int n_feed_forward_w2 = 0;
+    for (auto& tensor : model_loader->tensors_map.tensors) {
+        if (tensor.name.find("attention.wv.weight") != std::string::npos) {
+            ++n_attention_wv;
+        }
+        else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
+            ++n_feed_forward_w2;
+        }
+    }
+
+    int i_attention_wv = 0;
+    int i_feed_forward_w2 = 0;
 
     size_t total_size_org = 0;
     size_t total_size_new = 0;
@@ -2100,9 +2373,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         quantize &= (tensor.ne.size() == 2);
 
         // uncomment this to keep the output layer in FP16
-
-
-
+        if (!params->quantize_output_tensor && tensor.name == "output.weight") {
+            quantize = false;
+        }
+        quantize = quantize && quantized_type != tensor.type;
 
         enum ggml_type new_type;
         void * new_data;
@@ -2116,20 +2390,43 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             printf("size = %8.3f MB\n", tensor.size/1024.0/1024.0);
         } else {
             new_type = quantized_type;
+            // TODO: temporary disabled until Metal / OpenCL support is available
+            // ref: https://github.com/ggerganov/llama.cpp/issues/1711
+            //if (tensor.name == "output.weight") {
+            //    new_type = GGML_TYPE_Q6_K;
+            //}
+            if (tensor.name.find("attention.wv.weight") != std::string::npos) {
+                if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+                else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
+                        (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8 ||
+                        (i_attention_wv - n_attention_wv/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
+                ++i_attention_wv;
+            }
+            if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
+                if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+                else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
+                        (i_feed_forward_w2 < n_feed_forward_w2/8 || i_feed_forward_w2 >= 7*n_feed_forward_w2/8 ||
+                        (i_feed_forward_w2 - n_feed_forward_w2/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
+                ++i_feed_forward_w2;
+            }
+            if (tensor.name.find("attention.wo.weight") != std::string::npos) {
+                if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+            }
+
             float * f32_data;
             size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
             llama_buffer f32_conv_buf;
+
             if (tensor.type == GGML_TYPE_F32) {
                 f32_data = (float *) tensor.data;
-            } else if (tensor.type
-
-                f32_data = (float *) f32_conv_buf.addr;
-                const auto * f16_data = (const ggml_fp16_t *) tensor.data;
-                for (size_t i = 0; i < nelements; i++) {
-                    f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
-                }
+            } else if (ggml_is_quantized(tensor.type) && !params->allow_requantize) {
+                throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor.type)));
             } else {
-
+                llama_convert_tensor_internal(tensor, f32_conv_buf, nelements, nthread);
+                f32_data = (float *) f32_conv_buf.addr;
             }
 
             printf("quantizing .. ");
@@ -2183,12 +2480,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             }
 
             printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
+            int64_t tot_count = 0;
             for (size_t i = 0; i < hist_cur.size(); i++) {
                 hist_all[i] += hist_cur[i];
+                tot_count += hist_cur[i];
             }
 
-
-
+            if (tot_count > 0) {
+                for (size_t i = 0; i < hist_cur.size(); i++) {
+                    printf("%5.3f ", hist_cur[i] / float(nelements));
+                }
             }
             printf("\n");
         }
@@ -2206,11 +2507,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             sum_all += hist_all[i];
         }
 
-
-
-
+        if (sum_all > 0) {
+            printf("%s: hist: ", __func__);
+            for (size_t i = 0; i < hist_all.size(); i++) {
+                printf("%5.3f ", hist_all[i] / float(sum_all));
+            }
+            printf("\n");
         }
-        printf("\n");
     }
 }
 
@@ -2251,9 +2554,9 @@ struct llama_context * llama_init_from_file(
 
     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
 
-    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.
-
-
+    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers,
+                          params.main_gpu, params.tensor_split, memory_type, params.use_mmap, params.use_mlock,
+                          params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
         fprintf(stderr, "%s: failed to load model\n", __func__);
         llama_free(ctx);
         return nullptr;
@@ -2291,6 +2594,38 @@ struct llama_context * llama_init_from_file(
         ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
     }
 
+#ifdef GGML_USE_METAL
+    if (params.n_gpu_layers > 0) {
+        // this allocates all Metal resources and memory buffers
+        ctx->ctx_metal = ggml_metal_init();
+
+        void *data_ptr = NULL;
+        size_t data_size = 0;
+        if (params.use_mmap) {
+            data_ptr = ctx->model.mapping->addr;
+            data_size= ctx->model.mapping->size;
+        } else {
+            data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
+            data_size= ggml_get_mem_size(ctx->model.ctx);
+        }
+
+#define LLAMA_METAL_CHECK_BUF(result) \
+    if (!(result)) { \
+        fprintf(stderr, "%s: failed to add buffer\n", __func__); \
+        llama_free(ctx); \
+        return NULL; \
+    }
+
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size));
+
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv",   ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size));
+#undef LLAMA_METAL_CHECK_BUF
+    }
+#endif
+
     return ctx;
 }
 
@@ -2301,13 +2636,12 @@ void llama_free(struct llama_context * ctx) {
 int llama_model_quantize(
         const char * fname_inp,
         const char * fname_out,
-
-        int nthread) {
+        const llama_model_quantize_params *params) {
     try {
-        llama_model_quantize_internal(fname_inp, fname_out,
+        llama_model_quantize_internal(fname_inp, fname_out, params);
         return 0;
-    } catch (const std::
-        fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.
+    } catch (const std::exception & err) {
+        fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.what());
         return 1;
     }
 }
@@ -2560,8 +2894,8 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
 int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
     try {
         return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads);
-    } catch (const std::
-        fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.
+    } catch (const std::exception & err) {
+        fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
        return 1;
     }
 }
@@ -2906,7 +3240,7 @@ int llama_eval(
         int n_tokens,
         int n_past,
         int n_threads) {
-    if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads)) {
+    if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads, nullptr)) {
         fprintf(stderr, "%s: failed to eval\n", __func__);
         return 1;
     }
@@ -2921,6 +3255,20 @@ int llama_eval(
     return 0;
 }
 
+int llama_eval_export(struct llama_context * ctx, const char * fname) {
+    const int n_batch = 1;
+    const int n_ctx   = 512 - n_batch;
+
+    const std::vector<llama_token> tmp(n_batch, llama_token_bos());
+
+    if (!llama_eval_internal(*ctx, tmp.data(), tmp.size(), n_ctx, 1, fname)) {
+        fprintf(stderr, "%s: failed to eval\n", __func__);
+        return 1;
+    }
+
+    return 0;
+}
+
 int llama_tokenize(
         struct llama_context * ctx,
         const char * text,