llama_cpp 0.1.4 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +29 -0
- data/ext/llama_cpp/extconf.rb +26 -1
- data/ext/llama_cpp/llama_cpp.cpp +210 -13
- data/ext/llama_cpp/src/ggml-cuda.cu +1916 -0
- data/ext/llama_cpp/src/ggml-cuda.h +15 -2
- data/ext/llama_cpp/src/ggml-metal.h +63 -0
- data/ext/llama_cpp/src/ggml-metal.m +783 -0
- data/ext/llama_cpp/src/ggml-metal.metal +1133 -0
- data/ext/llama_cpp/src/ggml-opencl.cpp +235 -39
- data/ext/llama_cpp/src/ggml-opencl.h +4 -0
- data/ext/llama_cpp/src/ggml.c +340 -109
- data/ext/llama_cpp/src/ggml.h +44 -6
- data/ext/llama_cpp/src/k_quants.c +2244 -0
- data/ext/llama_cpp/src/k_quants.h +122 -0
- data/ext/llama_cpp/src/llama-util.h +16 -0
- data/ext/llama_cpp/src/llama.cpp +484 -136
- data/ext/llama_cpp/src/llama.h +39 -8
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +33 -1
- metadata +8 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -16,6 +16,10 @@
 #include "ggml-opencl.h"
 #endif
 
+#ifdef GGML_USE_METAL
+#include "ggml-metal.h"
+#endif
+
 #include <array>
 #include <ctime>
 #include <cinttypes>
@@ -49,17 +53,22 @@ enum e_model {
     MODEL_65B,
 };
 
-
 static const size_t MB = 1024*1024;
 
 // computed for n_ctx == 2048
 // TODO: dynamically determine these sizes
 //       needs modifications in ggml
 
+typedef void (*offload_func_t)(struct ggml_tensor * tensor);
+
+void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
+    (void) tensor;
+}
+
 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
 {
     static std::map<e_model, size_t> k_sizes = {
-        { MODEL_3B,
+        { MODEL_3B,    256ull * MB },
         { MODEL_7B,    512ull * MB },
         { MODEL_13B,   512ull * MB },
         { MODEL_30B,   512ull * MB },
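The hunk above introduces the `offload_func_t` callback and its CPU no-op `llama_nop`; later hunks swap in `ggml_cuda_assign_buffers` for layers at or past `i_gpu_start`. A minimal sketch of that callback pattern, using hypothetical stand-in types (`my_tensor`, `mark_for_gpu`) instead of the real ggml structs:

```cpp
#include <cstdio>

// Hypothetical stand-ins for ggml_tensor and the CUDA buffer-assignment hook.
struct my_tensor { bool on_gpu = false; };

typedef void (*offload_func_t)(my_tensor * tensor);

void my_nop(my_tensor * tensor)       { (void) tensor; }         // CPU build: don't offload
void mark_for_gpu(my_tensor * tensor) { tensor->on_gpu = true; } // GPU build: tag the tensor

int main() {
    const int n_layer = 4, n_gpu_layers = 2;
    const int i_gpu_start = n_layer - n_gpu_layers;

    for (int il = 0; il < n_layer; ++il) {
        offload_func_t offload_func = my_nop;
        if (il >= i_gpu_start) {
            offload_func = mark_for_gpu; // only the last n_gpu_layers layers are offloaded
        }
        my_tensor cur;
        offload_func(&cur);
        std::printf("layer %d -> %s\n", il, cur.on_gpu ? "GPU" : "CPU");
    }
    return 0;
}
```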
@@ -71,7 +80,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
 {
     static std::map<e_model, size_t> k_sizes = {
-        { MODEL_3B,
+        { MODEL_3B,    256ull * MB },
         { MODEL_7B,    512ull * MB },
         { MODEL_13B,   512ull * MB },
         { MODEL_30B,   512ull * MB },
@@ -170,6 +179,7 @@ struct llama_model {
     struct ggml_tensor * output;
 
     std::vector<llama_layer> layers;
+    int n_gpu_layers;
 
     // context
     struct ggml_context * ctx = NULL;
@@ -195,6 +205,16 @@ struct llama_model {
         if (ctx) {
             ggml_free(ctx);
         }
+
+#ifdef GGML_USE_CUBLAS
+        for (size_t i = 0; i < tensors_by_name.size(); ++i) {
+            ggml_cuda_free_data(tensors_by_name[i].second);
+        }
+#elif defined(GGML_USE_CLBLAST)
+        for (size_t i = 0; i < tensors_by_name.size(); ++i) {
+            ggml_cl_free_data(tensors_by_name[i].second);
+        }
+#endif
     }
 };
 
@@ -243,6 +263,10 @@ struct llama_context {
     llama_ctx_buffer buf_compute;
     llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
 
+#ifdef GGML_USE_METAL
+    ggml_metal_context * ctx_metal = NULL;
+#endif
+
     int buf_last = 0;
     size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
 
@@ -282,15 +306,15 @@ template <typename T>
 static T checked_mul(T a, T b) {
     T ret = a * b;
     if (a != 0 && ret / a != b) {
-        throw format("overflow multiplying %llu * %llu",
-                     (unsigned long long) a, (unsigned long long) b);
+        throw std::runtime_error(format("overflow multiplying %llu * %llu",
+                                        (unsigned long long) a, (unsigned long long) b));
     }
     return ret;
 }
 
 static size_t checked_div(size_t a, size_t b) {
     if (b == 0 || a % b != 0) {
-        throw format("error dividing %zu / %zu", a, b);
+        throw std::runtime_error(format("error dividing %zu / %zu", a, b));
     }
     return a / b;
 }
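The `checked_mul`/`checked_div` hunk above is part of a file-wide change: bare `throw format(...)` and `throw std::string(...)` statements become `throw std::runtime_error(...)`, so every catch site can uniformly take `const std::exception &` and log `err.what()` (see the updated `llama_model_load`, `llama_model_quantize` and LoRA loader further down). A small self-contained sketch of that pattern, with a simplified `format` helper standing in for the one in llama.cpp:

```cpp
#include <cstdio>
#include <stdexcept>
#include <string>

// Simplified stand-in for llama.cpp's printf-style format() helper.
static std::string format(const char * fmt, size_t a, size_t b) {
    char buf[128];
    std::snprintf(buf, sizeof(buf), fmt, a, b);
    return std::string(buf);
}

static size_t checked_div(size_t a, size_t b) {
    if (b == 0 || a % b != 0) {
        // new style: wrap the message in std::runtime_error instead of throwing a bare string
        throw std::runtime_error(format("error dividing %zu / %zu", a, b));
    }
    return a / b;
}

int main() {
    try {
        checked_div(7, 2);
    } catch (const std::exception & err) {
        std::fprintf(stderr, "error: %s\n", err.what()); // what() works for any std::exception
        return 1;
    }
    return 0;
}
```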
@@ -354,7 +378,7 @@ struct llama_load_tensor {
         const auto & first_shard = shards.at(0);
         for (const auto & shard : shards) {
             if (shard.type != first_shard.type) {
-                throw format("inconsistent tensor shard type in '%s'", name.c_str());
+                throw std::runtime_error(format("inconsistent tensor shard type in '%s'", name.c_str()));
             }
         }
         type = first_shard.type;
@@ -377,8 +401,8 @@ struct llama_load_tensor {
         const auto & first_shard = shards.at(0);
         for (const auto & shard : shards) {
             if (shard.ne != first_shard.ne) {
-                throw format("inconsistent tensor shard shape in '%s': first was %s, other was %s",
-                             name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str());
+                throw std::runtime_error(format("inconsistent tensor shard shape in '%s': first was %s, other was %s",
+                                                name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str()));
             }
         }
         ne = first_shard.ne;
@@ -456,8 +480,8 @@ struct llama_file_loader {
             }
         }
 
-        throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
-                     magic, version);
+        throw std::runtime_error(format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
+                                        magic, version));
     }
     void read_hparams() {
         hparams.n_vocab = file.read_u32();
@@ -497,7 +521,7 @@ struct llama_file_loader {
             file.read_raw(shard.ne.data(), sizeof(shard.ne[0]) * n_dims);
             std::string name = file.read_string(name_len);
             if (n_dims < 1 || n_dims > 2) {
-                throw format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims);
+                throw std::runtime_error(format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims));
             }
             switch (shard.type) {
                 case GGML_TYPE_F32:
@@ -507,9 +531,14 @@
                 case GGML_TYPE_Q5_0:
                 case GGML_TYPE_Q5_1:
                 case GGML_TYPE_Q8_0:
+                case GGML_TYPE_Q2_K:
+                case GGML_TYPE_Q3_K:
+                case GGML_TYPE_Q4_K:
+                case GGML_TYPE_Q5_K:
+                case GGML_TYPE_Q6_K:
                     break;
                 default: {
-                    throw format("unrecognized tensor type %u\n", shard.type);
+                    throw std::runtime_error(format("unrecognized tensor type %u\n", shard.type));
                 }
             }
 
@@ -582,6 +611,11 @@ struct llama_file_saver {
             case GGML_TYPE_Q5_0:
             case GGML_TYPE_Q5_1:
             case GGML_TYPE_Q8_0:
+            case GGML_TYPE_Q2_K:
+            case GGML_TYPE_Q3_K:
+            case GGML_TYPE_Q4_K:
+            case GGML_TYPE_Q5_K:
+            case GGML_TYPE_Q6_K:
                 break;
             default: LLAMA_ASSERT(false);
         }
@@ -613,7 +647,7 @@ struct llama_model_loader {
             auto * ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
             file_loaders.emplace_back(ith_file);
             if (ith_file->hparams != first_file->hparams) {
-                throw format("llama.cpp: hparams inconsistent between files");
+                throw std::runtime_error(format("llama.cpp: hparams inconsistent between files"));
             }
         }
         if (!llama_mmap::SUPPORTED) {
@@ -643,7 +677,7 @@
     uint32_t guess_n_parts() const {
         auto it = tensors_map.name_to_idx.find("tok_embeddings.weight");
         if (it == tensors_map.name_to_idx.end()) {
-            throw std::string("missing tok_embeddings.weight");
+            throw std::runtime_error(std::string("missing tok_embeddings.weight"));
         }
         const llama_load_tensor & lt = tensors_map.tensors.at(it->second);
         return file_loaders.at(0)->hparams.n_embd / lt.shards.at(0).ne.at(0);
@@ -660,12 +694,12 @@
     struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
         auto it = tensors_map.name_to_idx.find(name);
         if (it == tensors_map.name_to_idx.end()) {
-            throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
+            throw std::runtime_error(std::runtime_error(format("llama.cpp: tensor '%s' is missing from model", name.c_str())));
         }
         llama_load_tensor & lt = tensors_map.tensors.at(it->second);
         if (lt.ne != ne) {
-            throw format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
-                         name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
+            throw std::runtime_error(format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
+                                            name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str()));
         }
 
         return get_tensor_for(lt, backend);
@@ -681,6 +715,7 @@ struct llama_model_loader {
         }
         ggml_set_name(tensor, lt.name.c_str());
         LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
+
         tensor->backend = backend;
         lt.ggml_tensor = tensor;
         num_ggml_tensors_created++;
@@ -689,7 +724,7 @@
 
     void done_getting_tensors() const {
         if (num_ggml_tensors_created != tensors_map.tensors.size()) {
-            throw std::string("llama.cpp: file contained more tensors than expected");
+            throw std::runtime_error(std::string("llama.cpp: file contained more tensors than expected"));
         }
     }
 
@@ -833,7 +868,10 @@ static bool kv_cache_init(
 struct llama_context_params llama_context_default_params() {
     struct llama_context_params result = {
         /*.n_ctx                       =*/ 512,
+        /*.n_batch                     =*/ 512,
         /*.gpu_layers                  =*/ 0,
+        /*.main_gpu                    =*/ 0,
+        /*.tensor_split                =*/ {0},
         /*.seed                        =*/ -1,
        /*.f16_kv                      =*/ true,
         /*.logits_all                  =*/ false,
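The new `n_batch`, `main_gpu` and `tensor_split` defaults above correspond to fields a caller can now set before `llama_init_from_file`. A hedged sketch of how that might look (the values are illustrative, not taken from the diff):

```cpp
// Sketch only: assumes the llama.h C API bundled with this gem version; values are illustrative.
#include "llama.h"

struct llama_context * init_with_gpu(const char * model_path) {
    struct llama_context_params params = llama_context_default_params();
    params.n_ctx        = 2048;
    params.n_batch      = 512; // also sizes the CUDA scratch buffer (n_batch * 1 MB, see below)
    params.n_gpu_layers = 32;  // offload the last 32 layers; > n_layer also offloads the output layer
    params.main_gpu     = 0;   // device that runs the non-split work
    // params.tensor_split controls how split tensors are divided across devices on CUDA builds.
    return llama_init_from_file(model_path, params);
}
```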
@@ -848,6 +886,17 @@ struct llama_context_params llama_context_default_params() {
     return result;
 }
 
+struct llama_model_quantize_params llama_model_quantize_default_params() {
+    struct llama_model_quantize_params result = {
+        /*.nthread                     =*/ 0,
+        /*.ftype                       =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
+        /*.allow_requantize            =*/ false,
+        /*.quantize_output_tensor      =*/ true,
+    };
+
+    return result;
+}
+
 bool llama_mmap_supported() {
     return llama_mmap::SUPPORTED;
 }
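`llama_model_quantize_default_params` pairs with the reworked `llama_model_quantize(fname_inp, fname_out, const llama_model_quantize_params *)` signature shown near the end of this diff. A minimal usage sketch; the K-quant target and thread count are illustrative:

```cpp
// Sketch only: uses the llama.h API as changed in this diff; file paths come from the caller.
#include "llama.h"

int quantize_to_q4_k_m(const char * fname_inp, const char * fname_out) {
    llama_model_quantize_params qparams = llama_model_quantize_default_params();
    qparams.ftype                  = LLAMA_FTYPE_MOSTLY_Q4_K_M; // one of the new K-quant file types
    qparams.nthread                = 8;     // <= 0 falls back to std::thread::hardware_concurrency()
    qparams.allow_requantize       = false; // refuse to requantize an already-quantized model
    qparams.quantize_output_tensor = true;
    return llama_model_quantize(fname_inp, fname_out, &qparams); // 0 on success, 1 on failure
}
```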
@@ -898,6 +947,16 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
         case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
         case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
+        // K-quants
+        case LLAMA_FTYPE_MOSTLY_Q2_K:   return "mostly Q2_K";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "mostly Q3_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "mostly Q3_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "mostly Q3_K - Large";
+        case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "mostly Q4_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "mostly Q4_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "mostly Q5_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "mostly Q5_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q6_K:   return "mostly Q6_K";
         default: return "unknown, may not work";
     }
 }
@@ -917,7 +976,10 @@ static void llama_model_load_internal(
         const std::string & fname,
         llama_context & lctx,
         int n_ctx,
+        int n_batch,
         int n_gpu_layers,
+        int main_gpu,
+        const float * tensor_split,
         ggml_type memory_type,
         bool use_mmap,
         bool use_mlock,
@@ -932,9 +994,9 @@ static void llama_model_load_internal(
     lctx.vocab = std::move(ml->file_loaders.at(0)->vocab);
     auto & model = lctx.model;
     model.hparams = ml->file_loaders.at(0)->hparams;
+    model.n_gpu_layers = n_gpu_layers;
     llama_file_version file_version = ml->file_loaders.at(0)->file_version;
     auto & hparams = model.hparams;
-    uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
 
     {
         switch (hparams.n_layer) {
@@ -948,6 +1010,8 @@ static void llama_model_load_internal(
         hparams.n_ctx = n_ctx;
     }
 
+    const uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
+
     {
         fprintf(stderr, "%s: format     = %s\n", __func__, llama_file_version_name(file_version));
         fprintf(stderr, "%s: n_vocab    = %u\n", __func__, hparams.n_vocab);
@@ -967,7 +1031,7 @@ static void llama_model_load_internal(
         if (hparams.ftype != LLAMA_FTYPE_ALL_F32 &&
             hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 &&
             hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) {
-            throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)");
+            throw std::runtime_error(format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)"));
         }
     }
 
@@ -975,7 +1039,7 @@ static void llama_model_load_internal(
         if (hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
            hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_1 ||
            hparams.ftype == LLAMA_FTYPE_MOSTLY_Q8_0) {
-            throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)");
+            throw std::runtime_error(format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)"));
         }
     }
 
@@ -1006,18 +1070,28 @@ static void llama_model_load_internal(
 
         model.ctx = ggml_init(params);
         if (!model.ctx) {
-            throw format("ggml_init() failed");
+            throw std::runtime_error(format("ggml_init() failed"));
         }
     }
 
-
-#
+    (void) main_gpu;
+#if defined(GGML_USE_CUBLAS)
+    fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__);
+    ggml_cuda_set_main_device(main_gpu);
+#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
+#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
+#elif defined(GGML_USE_CLBLAST)
+    fprintf(stderr, "%s: using OpenCL for GPU acceleration\n", __func__);
+#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
+#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU
 #else
-#define LLAMA_BACKEND_OFFLOAD
+#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
+#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_CPU
 #endif
 
     // prepare memory for the weights
-    size_t
+    size_t vram_weights = 0;
+    size_t vram_scratch = 0;
     {
         const uint32_t n_embd  = hparams.n_embd;
         const uint32_t n_layer = hparams.n_layer;
@@ -1032,7 +1106,7 @@ static void llama_model_load_internal(
         {
             ggml_backend backend_output;
             if (n_gpu_layers > int(n_layer)) { // NOLINT
-                backend_output =
+                backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
             } else {
                 backend_output = GGML_BACKEND_CPU;
             }
@@ -1044,7 +1118,8 @@ static void llama_model_load_internal(
 
         model.layers.resize(n_layer);
         for (uint32_t i = 0; i < n_layer; ++i) {
-            const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+            const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+            const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
 
             auto & layer = model.layers[i];
 
@@ -1052,19 +1127,19 @@ static void llama_model_load_internal(
 
             layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);
 
-            layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd},
-            layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd},
-            layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd},
-            layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd},
+            layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend_split);
+            layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend_split);
+            layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend_split);
+            layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend_split);
 
             layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);
 
-            layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd,   n_ff},
-            layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", {  n_ff, n_embd},
-            layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd,   n_ff},
+            layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd,   n_ff}, backend_split);
+            layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", {  n_ff, n_embd}, backend_split);
+            layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd,   n_ff}, backend_split);
 
-            if (backend ==
-
+            if (backend == GGML_BACKEND_GPU) {
+                vram_weights +=
                     ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk)             +
                     ggml_nbytes(layer.wv)             + ggml_nbytes(layer.wo) + ggml_nbytes(layer.attention_norm) +
                     ggml_nbytes(layer.w1)             + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
@@ -1081,10 +1156,10 @@ static void llama_model_load_internal(
         // this is the total memory required to run the inference
         const size_t mem_required =
             ctx_size +
-            mmapped_size -
+            mmapped_size - vram_weights + // weights in VRAM not in memory
            MEM_REQ_SCRATCH0().at(model.type) +
            MEM_REQ_SCRATCH1().at(model.type) +
-            MEM_REQ_EVAL().at(model.type);
+            MEM_REQ_EVAL().at (model.type);
 
         // this is the memory required by one llama_state
         const size_t mem_required_state =
@@ -1093,15 +1168,25 @@ static void llama_model_load_internal(
         fprintf(stderr, "%s: mem required  = %7.2f MB (+ %7.2f MB per state)\n", __func__,
                 mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
 
+        (void) vram_scratch;
 #ifdef GGML_USE_CUBLAS
+        vram_scratch = n_batch * MB;
+        ggml_cuda_set_scratch_size(vram_scratch);
+        if (n_gpu_layers > 0) {
+            fprintf(stderr, "%s: allocating batch_size x 1 MB = %ld MB VRAM for the scratch buffer\n",
+                    __func__, vram_scratch / MB);
+        }
+#endif // GGML_USE_CUBLAS
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
         const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
 
-        fprintf(stderr, "%s:
+        fprintf(stderr, "%s: offloading %d layers to GPU\n", __func__, n_gpu);
         if (n_gpu_layers > (int) hparams.n_layer) {
-            fprintf(stderr, "%s:
+            fprintf(stderr, "%s: offloading output layer to GPU\n", __func__);
         }
-        fprintf(stderr, "%s:
-
+        fprintf(stderr, "%s: total VRAM used: %zu MB\n",
+                __func__, (vram_weights + vram_scratch + MB - 1) / MB); // round up
+#else
         (void) n_gpu_layers;
 #endif
     }
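The CUDA scratch buffer added above is sized at `n_batch * MB`, i.e. one megabyte per batch slot, and the reported total is `vram_weights + vram_scratch` rounded up to whole megabytes; with the default `n_batch = 512` the scratch buffer alone reserves 512 MB of VRAM. A tiny standalone check of that arithmetic (the weight size is illustrative):

```cpp
#include <cstdio>
#include <cstddef>

int main() {
    const size_t MB           = 1024*1024;
    const size_t n_batch      = 512;           // default from llama_context_default_params
    const size_t vram_scratch = n_batch * MB;  // scratch buffer: 1 MB per batch slot
    const size_t vram_weights = 3500 * MB;     // illustrative weight size, not taken from the diff

    std::printf("scratch: %zu MB\n", vram_scratch / MB);                            // 512 MB
    std::printf("total  : %zu MB\n", (vram_weights + vram_scratch + MB - 1) / MB);  // rounded up
    return 0;
}
```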
@@ -1113,8 +1198,10 @@ static void llama_model_load_internal(
 
     ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
 
-#
+#if defined(GGML_USE_CUBLAS)
     {
+        ggml_cuda_set_tensor_split(tensor_split);
+
         size_t done_size = 0;
         size_t data_size = 0;
         for (llama_load_tensor & lt : ml->tensors_map.tensors) {
@@ -1124,7 +1211,8 @@ static void llama_model_load_internal(
             }
         }
         for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-
+            ggml_backend backend = lt.ggml_tensor->backend;
+            if (backend != GGML_BACKEND_GPU && backend != GGML_BACKEND_GPU_SPLIT) {
                 continue;
             }
             if (progress_callback) {
@@ -1136,30 +1224,28 @@
         }
 #elif defined(GGML_USE_CLBLAST)
     {
-
-
-
-
-
-
-
-            const auto & layer = model.layers[i];
-
-            ggml_cl_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
-            ggml_cl_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
-            ggml_cl_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
-            ggml_cl_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
-            ggml_cl_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
-            ggml_cl_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
-            ggml_cl_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
+        size_t done_size = 0;
+        size_t data_size = 0;
+        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+            data_size += lt.size;
+            if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
+                done_size += lt.size;
+            }
         }
-
-
-
+        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+            if (lt.ggml_tensor->backend != GGML_BACKEND_GPU) {
+                continue;
+            }
+            if (progress_callback) {
+                progress_callback((float) done_size / data_size, progress_callback_user_data);
+            }
+            ggml_cl_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
+            done_size += lt.size;
         }
-
-        fprintf(stderr, "ggml_opencl: total VRAM used: %zu MB\n", vram_total / 1024 / 1024);
     }
+#else
+    (void) n_batch;
+    (void) tensor_split;
 #endif
 
     if (progress_callback) {
@@ -1177,7 +1263,10 @@ static bool llama_model_load(
         const std::string & fname,
         llama_context & lctx,
         int n_ctx,
+        int n_batch,
         int n_gpu_layers,
+        int main_gpu,
+        float * tensor_split,
         ggml_type memory_type,
         bool use_mmap,
         bool use_mlock,
@@ -1185,28 +1274,30 @@ static bool llama_model_load(
         llama_progress_callback progress_callback,
         void *progress_callback_user_data) {
     try {
-        llama_model_load_internal(fname, lctx, n_ctx, n_gpu_layers,
-                                  vocab_only, progress_callback, progress_callback_user_data);
+        llama_model_load_internal(fname, lctx, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, memory_type,
+                                  use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
         return true;
-    } catch (const std::
-        fprintf(stderr, "error loading model: %s\n", err.
+    } catch (const std::exception & err) {
+        fprintf(stderr, "error loading model: %s\n", err.what());
         return false;
     }
 }
 
 // evaluate the transformer
 //
-//   - lctx:
-//   - tokens:
-//   - n_past:
-//   - n_threads:
+//   - lctx:         llama context
+//   - tokens:       new batch of tokens to process
+//   - n_past:       the context size so far
+//   - n_threads:    number of threads to use
+//   - cgraph_fname: filename of the exported computation graph
 //
 static bool llama_eval_internal(
-        llama_context &
-    const llama_token *
-            const int
-            const int
-            const int
+        llama_context & lctx,
+    const llama_token * tokens,
+            const int   n_tokens,
+            const int   n_past,
+            const int   n_threads,
+           const char * cgraph_fname) {
 
     // enforce that the first token is BOS
     if (n_past == 0 && tokens[0] != llama_token_bos()) {
@@ -1225,12 +1316,13 @@ static bool llama_eval_internal(
 
     LLAMA_ASSERT(!!kv_self.ctx);
 
-    const int n_embd
-    const int n_layer
-    const int n_ctx
-    const int n_head
-    const int n_vocab
-    const int n_rot
+    const int n_embd       = hparams.n_embd;
+    const int n_layer      = hparams.n_layer;
+    const int n_ctx        = hparams.n_ctx;
+    const int n_head       = hparams.n_head;
+    const int n_vocab      = hparams.n_vocab;
+    const int n_rot        = hparams.n_embd/hparams.n_head;
+    const int n_gpu_layers = model.n_gpu_layers;
 
     auto & mem_per_token = lctx.mem_per_token;
     auto & buf_compute   = lctx.buf_compute;
@@ -1252,40 +1344,66 @@ static bool llama_eval_internal(
     ggml_set_name(embd, "embd");
     memcpy(embd->data, tokens, N*ggml_element_size(embd));
 
+    struct ggml_tensor * cur;
     struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
 
+    const int i_gpu_start = n_layer - n_gpu_layers;
+    (void) i_gpu_start;
+
     for (int il = 0; il < n_layer; ++il) {
-
+        offload_func_t offload_func = llama_nop;
 
-
+#ifdef GGML_USE_CUBLAS
+        if (il >= i_gpu_start) {
+            offload_func = ggml_cuda_assign_buffers; // sets the output backend to GPU
+        }
+#endif // GGML_USE_CUBLAS
+
+        struct ggml_tensor * inpSA = inpL;
 
         lctx.use_buf(ctx0, 0);
 
         // norm
         {
             cur = ggml_rms_norm(ctx0, inpL);
+            offload_func(cur);
+            ggml_set_name(cur, "rms_norm_0");
 
             // cur = cur*attention_norm(broadcasted)
             cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm);
+            offload_func(cur);
+            ggml_set_name(cur, "attention_norm_0");
         }
 
         // self-attention
         {
             // compute Q and K and RoPE them
-            struct ggml_tensor *
-
-            ggml_set_name(
+            struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+            // offload_func(tmpq);
+            ggml_set_name(tmpq, "tmpq");
+
+            struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+            // offload_func(tmpk);
+            ggml_set_name(tmpk, "tmpk");
+
+            struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0);
             ggml_set_name(Kcur, "Kcur");
 
+            struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0);
+            ggml_set_name(Qcur, "Qcur");
+
             // store key and value to memory
             {
                 // compute the transposed [N, n_embd] V matrix
                 struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), n_embd, N));
+                ggml_set_name(Vcur, "Vcur");
 
                 struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
+                ggml_set_name(k, "k");
                 struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
                         (   n_ctx)*ggml_element_size(kv_self.v),
                         (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
+                ggml_set_name(v, "v");
 
                 // important: storing RoPE-ed version of K in the KV cache!
                 ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
@@ -1326,7 +1444,6 @@ static bool llama_eval_internal(
         struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
         ggml_set_name(KQ_soft_max, "KQ_soft_max");
 
-
         // split cached V into n_head heads
         struct ggml_tensor * V =
             ggml_view_3d(ctx0, kv_self.v,
@@ -1361,73 +1478,143 @@ static bool llama_eval_internal(
             cur = ggml_mul_mat(ctx0,
                     model.layers[il].wo,
                     cur);
+            offload_func(cur);
+            ggml_set_name(cur, "result_wo");
         }
 
         lctx.use_buf(ctx0, 1);
+        //ggml_cuda_set_scratch(1);
 
         struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
+        offload_func(inpFF);
+        ggml_set_name(inpFF, "inpFF");
 
         // feed-forward network
         {
            // norm
            {
                cur = ggml_rms_norm(ctx0, inpFF);
+                offload_func(cur);
+                ggml_set_name(cur, "rms_norm_1");
 
                // cur = cur*ffn_norm(broadcasted)
                cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
+                offload_func(cur);
+                ggml_set_name(cur, "ffn_norm");
            }
 
            struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
                    model.layers[il].w3,
                    cur);
+            offload_func(tmp);
+            ggml_set_name(tmp, "result_w3");
 
            cur = ggml_mul_mat(ctx0,
                    model.layers[il].w1,
                    cur);
+            offload_func(cur);
+            ggml_set_name(cur, "result_w2");
 
            // SILU activation
            cur = ggml_silu(ctx0, cur);
+            offload_func(cur);
+            ggml_set_name(cur, "silu");
 
            cur = ggml_mul(ctx0, cur, tmp);
+            offload_func(cur);
+            ggml_set_name(cur, "silu_x_result_w3");
 
            cur = ggml_mul_mat(ctx0,
                    model.layers[il].w2,
                    cur);
+            offload_func(cur);
+            ggml_set_name(cur, "result_w2");
        }
 
        cur = ggml_add(ctx0, cur, inpFF);
+        offload_func(cur);
+        ggml_set_name(cur, "inpFF_+_result_w2");
 
        // input for next layer
        inpL = cur;
+
    }
 
    lctx.use_buf(ctx0, 0);
+    //ggml_cuda_set_scratch(0);
 
    // used at the end to optionally extract the embeddings
    struct ggml_tensor * embeddings = NULL;
 
+    offload_func_t offload_func = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+    if (n_gpu_layers > n_layer) {
+        offload_func = ggml_cuda_assign_buffers; // sets the output backend to GPU
+    }
+#endif // GGML_USE_CUBLAS
+
    // norm
    {
+        cur = ggml_rms_norm(ctx0, inpL);
+        offload_func(cur);
+        ggml_set_name(cur, "rms_norm_inpL");
 
-
+        cur = ggml_rms_norm(ctx0, cur);
+        offload_func(cur);
+        ggml_set_name(cur, "rms_norm_after");
 
-        //
-
+        // cur = cur*norm(broadcasted)
+        cur = ggml_mul(ctx0, cur, model.norm);
+        offload_func(cur);
+        ggml_set_name(cur, "result_norm");
 
-        embeddings =
+        embeddings = cur;
    }
 
+
    // lm_head
-
+    cur = ggml_mul_mat(ctx0, model.output, cur);
+    ggml_set_name(cur, "result_output");
 
    lctx.use_buf(ctx0, -1);
 
    // logits -> probs
-    //
+    //cur = ggml_soft_max_inplace(ctx0, cur);
 
    // run the computation
-    ggml_build_forward_expand(&gf,
-
+    ggml_build_forward_expand(&gf, cur);
+
+#ifdef GGML_USE_METAL
+    if (lctx.ctx_metal && N == 1) {
+        ggml_metal_graph_compute(lctx.ctx_metal, &gf);
+        ggml_metal_get_tensor   (lctx.ctx_metal, cur);
+    } else {
+        // IMPORTANT:
+        // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla
+        // ggml_graph_compute(). It uses Apple's Accelerate CBLAS API which takes advantage of the ANE or the AMX
+        // coprocessor.
+        //
+        // When we implement Matrix x Matrix Metal multiplication, we can avoid this branch.
+        // But for now, we have focused only on Matrix x Vector Metal multiplication.
+        //
+        // TODO: avoid these syncs via shared memory (ref #1696)
+        //
+        if (lctx.ctx_metal) {
+            // We need to sync the GPU KV cache with the CPU KV cache
+            ggml_metal_get_tensor(lctx.ctx_metal, kv_self.k);
+            ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
+        }
+
+        ggml_graph_compute(ctx0, &gf);
+    }
+#else
+    ggml_graph_compute(ctx0, &gf);
+#endif
+
+    if (cgraph_fname) {
+        ggml_graph_export(&gf, cgraph_fname);
+    }
 
 #ifdef GGML_PERF
    // print timing information per ggml operation (for debugging purposes)
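The Metal path above only hands the graph to `ggml_metal_graph_compute` when a single token is evaluated (`N == 1`); batched prompt evaluation falls back to `ggml_graph_compute`, first pulling the KV cache tensors back from the Metal buffers so the CPU copy stays coherent. A stripped-down sketch of that dispatch decision, with hypothetical helpers in place of the real ggml calls:

```cpp
#include <cstdio>

// Hypothetical helpers standing in for ggml_metal_graph_compute / ggml_graph_compute.
static void compute_on_metal()   { std::puts("Metal: matrix x vector kernels"); }
static void compute_on_cpu()     { std::puts("CPU/Accelerate: matrix x matrix path"); }
static void sync_kv_from_metal() { std::puts("sync KV cache GPU -> CPU"); }

static void eval_graph(bool have_metal_ctx, int N) {
    if (have_metal_ctx && N == 1) {
        compute_on_metal();       // token-by-token generation stays on the GPU
    } else {
        if (have_metal_ctx) {
            sync_kv_from_metal(); // keep the CPU copy of the KV cache up to date
        }
        compute_on_cpu();         // batched prompt evaluation
    }
}

int main() {
    eval_graph(true, 8); // prompt processing -> CPU path
    eval_graph(true, 1); // generation       -> Metal path
    return 0;
}
```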
@@ -1441,7 +1628,7 @@ static bool llama_eval_internal(
    //}
 
    //embd_w.resize(n_vocab*N);
-    //memcpy(embd_w.data(), ggml_get_data(
+    //memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N);
 
    // update kv token count
    lctx.model.kv_self.n = n_past + N;
@@ -1452,11 +1639,11 @@ static bool llama_eval_internal(
 
        if (lctx.logits_all) {
            logits_out.resize(n_vocab * N);
-            memcpy(logits_out.data(), (float *) ggml_get_data(
+            memcpy(logits_out.data(), (float *) ggml_get_data(cur), sizeof(float)*n_vocab*N);
        } else {
            // return result for just the last token
            logits_out.resize(n_vocab);
-            memcpy(logits_out.data(), (float *) ggml_get_data(
+            memcpy(logits_out.data(), (float *) ggml_get_data(cur) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
        }
    }
 
@@ -2055,16 +2242,88 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
 // quantization
 //
 
-static void
+static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llama_buffer & output, const int nelements, const int nthread) {
+    if (output.size < nelements * sizeof(float)) {
+        output.resize(nelements * sizeof(float));
+    }
+    float * f32_output = (float *) output.addr;
+
+    quantize_fns_t qtype;
+    if (ggml_is_quantized(tensor.type)) {
+        qtype = ggml_internal_get_quantize_fn(tensor.type);
+        if (qtype.dequantize_row_q == NULL) {
+            throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor.type)));
+        }
+    } else if (tensor.type != GGML_TYPE_F16) {
+        throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor.type)));
+    }
+
+    if (nthread < 2) {
+        if (tensor.type == GGML_TYPE_F16) {
+            ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor.data, f32_output, nelements);
+        } else if (ggml_is_quantized(tensor.type)) {
+            qtype.dequantize_row_q(tensor.data, f32_output, nelements);
+        } else {
+            LLAMA_ASSERT(false); // unreachable
+        }
+        return;
+    }
+
+    auto block_size = tensor.type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor.type);
+    auto block_size_bytes = ggml_type_size(tensor.type);
+
+    LLAMA_ASSERT(nelements % block_size == 0);
+    auto nblocks = nelements / block_size;
+    auto blocks_per_thread = nblocks / nthread;
+    auto spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count
+
+    std::vector<std::thread> workers;
+    for (auto tnum = 0, in_buff_offs = 0, out_buff_offs = 0; tnum < nthread; tnum++) {
+        auto thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
+        auto thr_elems = thr_blocks * block_size; // number of elements for this thread
+        auto thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread
+
+        auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
+            if (typ == GGML_TYPE_F16) {
+                ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
+            } else {
+                qtype.dequantize_row_q(inbuf, outbuf, nels);
+            }
+        };
+        workers.push_back(std::thread(compute, tensor.type, tensor.data + in_buff_offs, f32_output + out_buff_offs, thr_elems));
+        in_buff_offs += thr_block_bytes;
+        out_buff_offs += thr_elems;
+    }
+    for (auto & worker : workers) {
+        worker.join();
+    }
+
+}
+
+static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
    ggml_type quantized_type;
-
+    llama_ftype ftype = params->ftype;
+    int nthread = params->nthread;
+
+    switch (params->ftype) {
        case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
        case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
        case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
        case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
        case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
-
-
+
+        // K-quants
+        case LLAMA_FTYPE_MOSTLY_Q2_K:   quantized_type = GGML_TYPE_Q2_K; break;
+        case LLAMA_FTYPE_MOSTLY_Q3_K_S:
+        case LLAMA_FTYPE_MOSTLY_Q3_K_M:
+        case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
+        case LLAMA_FTYPE_MOSTLY_Q4_K_S:
+        case LLAMA_FTYPE_MOSTLY_Q4_K_M: quantized_type = GGML_TYPE_Q4_K; break;
+        case LLAMA_FTYPE_MOSTLY_Q5_K_S:
+        case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
+        case LLAMA_FTYPE_MOSTLY_Q6_K:   quantized_type = GGML_TYPE_Q6_K; break;
+        default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
+    }
 
    if (nthread <= 0) {
        nthread = std::thread::hardware_concurrency();
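The new `llama_convert_tensor_internal` splits a tensor into `nblocks = nelements / block_size` quantization blocks, gives each worker thread `nblocks / nthread` of them, and hands any remainder to the last worker. A small standalone check of that partitioning arithmetic (the sizes are illustrative):

```cpp
#include <cstdio>

int main() {
    const long nelements  = 4096 * 4096; // illustrative tensor size
    const long block_size = 256;         // e.g. one K-quant super-block
    const int  nthread    = 6;

    const long nblocks           = nelements / block_size;
    const long blocks_per_thread = nblocks / nthread;
    const long spare_blocks      = nblocks - blocks_per_thread * nthread; // remainder

    long total = 0;
    for (int tnum = 0; tnum < nthread; tnum++) {
        // the last thread picks up the spare blocks, exactly as in the diff
        const long thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0);
        total += thr_blocks;
        std::printf("thread %d: %ld blocks\n", tnum, thr_blocks);
    }
    std::printf("covered %ld of %ld blocks\n", total, nblocks); // always equal
    return 0;
}
```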
@@ -2072,7 +2331,21 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
    std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false,
                                                                            /*vocab_only*/ false));
-    llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
+    llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), params->ftype);
+
+    int n_attention_wv    = 0;
+    int n_feed_forward_w2 = 0;
+    for (auto& tensor : model_loader->tensors_map.tensors) {
+        if (tensor.name.find("attention.wv.weight") != std::string::npos) {
+            ++n_attention_wv;
+        }
+        else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
+            ++n_feed_forward_w2;
+        }
+    }
+
+    int i_attention_wv    = 0;
+    int i_feed_forward_w2 = 0;
 
    size_t total_size_org = 0;
    size_t total_size_new = 0;
@@ -2100,9 +2373,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        quantize &= (tensor.ne.size() == 2);
 
        // uncomment this to keep the output layer in FP16
-
-
-
+        if (!params->quantize_output_tensor && tensor.name == "output.weight") {
+            quantize = false;
+        }
+        quantize = quantize && quantized_type != tensor.type;
 
        enum ggml_type new_type;
        void * new_data;
@@ -2116,20 +2390,43 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
            printf("size = %8.3f MB\n", tensor.size/1024.0/1024.0);
        } else {
            new_type = quantized_type;
+            // TODO: temporary disabled until Metal / OpenCL support is available
+            // ref: https://github.com/ggerganov/llama.cpp/issues/1711
+            //if (tensor.name == "output.weight") {
+            //    new_type = GGML_TYPE_Q6_K;
+            //}
+            if (tensor.name.find("attention.wv.weight") != std::string::npos) {
+                if      (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+                else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
+                        (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8 ||
+                         (i_attention_wv - n_attention_wv/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
+                ++i_attention_wv;
+            }
+            if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
+                if      (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+                else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
+                        (i_feed_forward_w2 < n_feed_forward_w2/8 || i_feed_forward_w2 >= 7*n_feed_forward_w2/8 ||
+                         (i_feed_forward_w2 - n_feed_forward_w2/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
+                ++i_feed_forward_w2;
+            }
+            if (tensor.name.find("attention.wo.weight") != std::string::npos) {
+                if      (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+            }
+
            float * f32_data;
            size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
            llama_buffer f32_conv_buf;
+
            if (tensor.type == GGML_TYPE_F32) {
                f32_data = (float *) tensor.data;
-            } else if (tensor.type
-
-                f32_data = (float *) f32_conv_buf.addr;
-                const auto * f16_data = (const ggml_fp16_t *) tensor.data;
-                for (size_t i = 0; i < nelements; i++) {
-                    f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
-                }
+            } else if (ggml_is_quantized(tensor.type) && !params->allow_requantize) {
+                throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor.type)));
            } else {
-
+                llama_convert_tensor_internal(tensor, f32_conv_buf, nelements, nthread);
+                f32_data = (float *) f32_conv_buf.addr;
            }
 
            printf("quantizing .. ");
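In the Q4_K_M and Q5_K_M mixes above, an `attention.wv.weight` or `feed_forward.w2.weight` tensor is promoted to Q6_K when its index `i` satisfies `i < n/8`, `i >= 7*n/8`, or `(i - n/8) % 3 == 2`: the first and last eighth of those tensors plus every third one in between. A quick check of which indices that selects for n = 32 (one such tensor per layer of a 7B model):

```cpp
#include <cstdio>

int main() {
    const int n = 32; // e.g. one attention.wv.weight tensor per layer
    for (int i = 0; i < n; i++) {
        const bool use_q6_k = (i < n/8) || (i >= 7*n/8) || ((i - n/8) % 3 == 2);
        if (use_q6_k) {
            std::printf("%d ", i); // prints 0-3, 6, 9, ..., 27, then 28-31
        }
    }
    std::printf("\n");
    return 0;
}
```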
@@ -2183,12 +2480,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
            }
 
            printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
+            int64_t tot_count = 0;
            for (size_t i = 0; i < hist_cur.size(); i++) {
                hist_all[i] += hist_cur[i];
+                tot_count += hist_cur[i];
            }
 
-
-
+            if (tot_count > 0) {
+                for (size_t i = 0; i < hist_cur.size(); i++) {
+                    printf("%5.3f ", hist_cur[i] / float(nelements));
+                }
            }
            printf("\n");
        }
@@ -2206,11 +2507,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
            sum_all += hist_all[i];
        }
 
-
-
-
+        if (sum_all > 0) {
+            printf("%s: hist: ", __func__);
+            for (size_t i = 0; i < hist_all.size(); i++) {
+                printf("%5.3f ", hist_all[i] / float(sum_all));
+            }
+            printf("\n");
        }
-        printf("\n");
    }
 }
 
@@ -2251,9 +2554,9 @@ struct llama_context * llama_init_from_file(
 
    ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
 
-    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.
-
-
+    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers,
+                params.main_gpu, params.tensor_split, memory_type, params.use_mmap, params.use_mlock,
+                params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
        fprintf(stderr, "%s: failed to load model\n", __func__);
        llama_free(ctx);
        return nullptr;
@@ -2291,6 +2594,38 @@ struct llama_context * llama_init_from_file(
        ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
    }
 
+#ifdef GGML_USE_METAL
+    if (params.n_gpu_layers > 0) {
+        // this allocates all Metal resources and memory buffers
+        ctx->ctx_metal = ggml_metal_init();
+
+        void *data_ptr   = NULL;
+        size_t data_size = 0;
+        if (params.use_mmap) {
+            data_ptr  = ctx->model.mapping->addr;
+            data_size = ctx->model.mapping->size;
+        } else {
+            data_ptr  = ggml_get_mem_buffer(ctx->model.ctx);
+            data_size = ggml_get_mem_size(ctx->model.ctx);
+        }
+
+#define LLAMA_METAL_CHECK_BUF(result)                            \
+    if (!(result)) {                                             \
+        fprintf(stderr, "%s: failed to add buffer\n", __func__); \
+        llama_free(ctx);                                         \
+        return NULL;                                             \
+    }
+
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size));
+
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv",   ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size));
+#undef LLAMA_METAL_CHECK_BUF
+    }
+#endif
+
    return ctx;
 }
 
@@ -2301,13 +2636,12 @@ void llama_free(struct llama_context * ctx) {
 int llama_model_quantize(
        const char * fname_inp,
        const char * fname_out,
-
-        int nthread) {
+        const llama_model_quantize_params *params) {
    try {
-        llama_model_quantize_internal(fname_inp, fname_out,
+        llama_model_quantize_internal(fname_inp, fname_out, params);
        return 0;
-    } catch (const std::
-        fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.
+    } catch (const std::exception & err) {
+        fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.what());
        return 1;
    }
 }
@@ -2560,8 +2894,8 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
 int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
    try {
        return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads);
-    } catch (const std::
-        fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.
+    } catch (const std::exception & err) {
+        fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
        return 1;
    }
 }
@@ -2906,7 +3240,7 @@ int llama_eval(
        int n_tokens,
        int n_past,
        int n_threads) {
-    if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads)) {
+    if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads, nullptr)) {
        fprintf(stderr, "%s: failed to eval\n", __func__);
        return 1;
    }
@@ -2921,6 +3255,20 @@ int llama_eval(
    return 0;
 }
 
+int llama_eval_export(struct llama_context * ctx, const char * fname) {
+    const int n_batch = 1;
+    const int n_ctx   = 512 - n_batch;
+
+    const std::vector<llama_token> tmp(n_batch, llama_token_bos());
+
+    if (!llama_eval_internal(*ctx, tmp.data(), tmp.size(), n_ctx, 1, fname)) {
+        fprintf(stderr, "%s: failed to eval\n", __func__);
+        return 1;
+    }
+
+    return 0;
+}
+
 int llama_tokenize(
        struct llama_context * ctx,
        const char * text,